From 2e0f9304a715a11c328e1130de8d256c5eefbc2e Mon Sep 17 00:00:00 2001
From: Qianfeng Zhang <Qianfeng.Zhang@amd.com>
Date: Mon, 17 Jul 2023 13:59:29 +0000
Subject: [PATCH 001/641] Fix importing of utils under benchmarks/

---
 xformers/benchmarks/benchmark_indexing.py          | 2 +-
 xformers/benchmarks/benchmark_mem_eff_attention.py | 2 +-
 xformers/benchmarks/benchmark_swiglu.py            | 2 +-
 xformers/benchmarks/benchmark_transformer.py       | 2 +-
 4 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/xformers/benchmarks/benchmark_indexing.py b/xformers/benchmarks/benchmark_indexing.py
index cc23901d9..d2416cc8b 100644
--- a/xformers/benchmarks/benchmark_indexing.py
+++ b/xformers/benchmarks/benchmark_indexing.py
@@ -9,7 +9,7 @@
 
 import torch
 from torch.utils import benchmark
-from utils import benchmark_main_helper
+from xformers.benchmarks.utils import benchmark_main_helper
 
 import xformers.ops as xops
 
diff --git a/xformers/benchmarks/benchmark_mem_eff_attention.py b/xformers/benchmarks/benchmark_mem_eff_attention.py
index 8e532adf0..9eda00c31 100644
--- a/xformers/benchmarks/benchmark_mem_eff_attention.py
+++ b/xformers/benchmarks/benchmark_mem_eff_attention.py
@@ -10,7 +10,7 @@
 
 import torch
 from torch.utils import benchmark
-from utils import benchmark_main_helper
+from xformers.benchmarks.utils import benchmark_main_helper
 
 import xformers.ops
 import xformers.ops.fmha as fmha
diff --git a/xformers/benchmarks/benchmark_swiglu.py b/xformers/benchmarks/benchmark_swiglu.py
index ffa413a95..fc59ac45d 100644
--- a/xformers/benchmarks/benchmark_swiglu.py
+++ b/xformers/benchmarks/benchmark_swiglu.py
@@ -11,7 +11,7 @@
 
 import torch
 from torch.utils import benchmark
-from utils import benchmark_main_helper
+from xformers.benchmarks.utils import benchmark_main_helper
 
 import xformers.ops.swiglu_op as xsw
 
diff --git a/xformers/benchmarks/benchmark_transformer.py b/xformers/benchmarks/benchmark_transformer.py
index a8c077b0d..5260f3f58 100644
--- a/xformers/benchmarks/benchmark_transformer.py
+++ b/xformers/benchmarks/benchmark_transformer.py
@@ -14,7 +14,7 @@
 from timm.models.vision_transformer import Attention as TimmAttention
 from timm.models.vision_transformer import Block as TimmBlock
 from torch.utils import benchmark
-from utils import benchmark_main_helper
+from xformers.benchmarks.utils import benchmark_main_helper
 
 import xformers.ops as xops
 

From f35bb4ef0fc3bc9ceb3d4fe5d4849bf68454fb80 Mon Sep 17 00:00:00 2001
From: Qianfeng Zhang <Qianfeng.Zhang@amd.com>
Date: Tue, 25 Jul 2023 15:14:42 +0000
Subject: [PATCH 002/641] Add composable_kernel as submodule

---
 .gitmodules                   | 4 ++++
 third_party/composable_kernel | 1 +
 2 files changed, 5 insertions(+)
 create mode 160000 third_party/composable_kernel

diff --git a/.gitmodules b/.gitmodules
index ab23324ae..5634c1e2e 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -4,3 +4,7 @@
 [submodule "third_party/cutlass"]
 	path = third_party/cutlass
 	url = https://github.com/NVIDIA/cutlass.git
+[submodule "third_party/composable_kernel"]
+	path = third_party/composable_kernel
+	url = https://github.com/ROCmSoftwarePlatform/composable_kernel.git
+	branch = mha-train-develop
diff --git a/third_party/composable_kernel b/third_party/composable_kernel
new file mode 160000
index 000000000..34b1c3208
--- /dev/null
+++ b/third_party/composable_kernel
@@ -0,0 +1 @@
+Subproject commit 34b1c32087cd29f856a6d62bb33ba64df36e46a6

From 5fd747085a981df97d4b06c3cbe01f26723494f3 Mon Sep 17 00:00:00 2001
From: Qianfeng Zhang <Qianfeng.Zhang@amd.com>
Date: Wed, 26 Jul 2023 12:38:52 +0000
Subject: [PATCH 003/641] Update to get_extensions in setup.py to add support
 for integrating rocm C++ codes

---
 setup.py                                      |  44 ++++-
 .../hip_fmha/attention_forward_generic.cpp    | 150 ++++++++++++++++++
 2 files changed, 191 insertions(+), 3 deletions(-)
 create mode 100644 xformers/csrc/attention/hip_fmha/attention_forward_generic.cpp

diff --git a/setup.py b/setup.py
index b5741bb1a..9cf6d61f1 100644
--- a/setup.py
+++ b/setup.py
@@ -183,12 +183,22 @@ def get_flash_attention_extensions(cuda_version: int, extra_compile_args):
         )
     ]
 
+def rename_cpp_cu(cpp_files):
+    for entry in cpp_files:
+        shutil.copy(entry, os.path.splitext(entry)[0] + '.cu')
 
 def get_extensions():
     extensions_dir = os.path.join("xformers", "csrc")
 
-    sources = glob.glob(os.path.join(extensions_dir, "**", "*.cpp"), recursive=True)
-    source_cuda = glob.glob(os.path.join(extensions_dir, "**", "*.cu"), recursive=True)
+    sources = glob.glob(os.path.join(extensions_dir, "attention", "*.cpp"), recursive=False)
+    sources += glob.glob(os.path.join(extensions_dir, "attention", "autograd", "*.cpp"), recursive=True)
+    sources += glob.glob(os.path.join(extensions_dir, "attention", "cpu", "*.cpp"), recursive=True)
+    sources += glob.glob(os.path.join(extensions_dir, "indexing", "*.cpp"), recursive=True)
+    sources += glob.glob(os.path.join(extensions_dir, "swiglu", "*.cpp"), recursive=True)
+    
+    source_cuda = glob.glob(os.path.join(extensions_dir, "*.cu"), recursive=False)
+    source_cuda += glob.glob(os.path.join(extensions_dir, "attention", "cuda", "*.cu"), recursive=True)
+    source_hip = glob.glob(os.path.join(extensions_dir, "attention", "hip_fmha", "*.cpp"), recursive=True) 
 
     sputnik_dir = os.path.join(this_dir, "third_party", "sputnik")
     cutlass_dir = os.path.join(this_dir, "third_party", "cutlass", "include")
@@ -258,6 +268,35 @@ def get_extensions():
         ext_modules += get_flash_attention_extensions(
             cuda_version=cuda_version, extra_compile_args=extra_compile_args
         )
+    elif torch.cuda.is_available() and torch.version.hip: 
+       rename_cpp_cu(source_hip)
+       source_hip_cu = glob.glob(os.path.join(extensions_dir, "attention", "hip_fmha", "*.cu"), recursive=True) 
+       extension = CUDAExtension
+       sources += source_hip_cu
+       include_dirs += [ Path(this_dir) / 'xformers' / 'csrc' / 'attention' / 'hip_fmha', 
+                         Path(this_dir) / 'third_party' / 'composable_kernel' / 'include',
+                         Path(this_dir) / 'third_party' / 'composable_kernel' / 'include' / 'ck' ,
+                         Path(this_dir) / 'third_party' / 'composable_kernel' / 'include' / 'ck' / 'tensor_operation' / 'gpu' / 'device',
+                         Path(this_dir) / 'third_party' / 'composable_kernel' / 'include' / 'ck' / 'tensor_operation' / 'gpu' / 'device' / 'impl',
+                         Path(this_dir) / 'third_party' / 'composable_kernel' / 'include' / 'ck' / 'tensor_operation' / 'gpu' / 'element',
+                         Path(this_dir) / 'third_party' / 'composable_kernel' / 'library' / 'include' / 'ck' / 'libary' / 'utility', 
+                       ] 
+       generator_flag = []
+       cc_flag = ["-DBUILD_PYTHON_PACKAGE"]
+       extra_compile_args={
+            "cxx": ["-O3", "-std=c++17"] + generator_flag,
+            "nvcc":
+                [
+                    "-O3",
+                    "-std=c++17",
+                    "--offload-arch=gfx90a",
+                    "-U__CUDA_NO_HALF_OPERATORS__",
+                    "-U__CUDA_NO_HALF_CONVERSIONS__",
+                ]
+                + generator_flag
+                + cc_flag
+            ,
+       } 
 
     ext_modules.append(
         extension(
@@ -287,7 +326,6 @@ def get_extensions():
         },
     }
 
-
 class clean(distutils.command.clean.clean):  # type: ignore
     def run(self):
         if os.path.exists(".gitignore"):
diff --git a/xformers/csrc/attention/hip_fmha/attention_forward_generic.cpp b/xformers/csrc/attention/hip_fmha/attention_forward_generic.cpp
new file mode 100644
index 000000000..388340c10
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/attention_forward_generic.cpp
@@ -0,0 +1,150 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <cmath>
+#include <mutex>
+
+#include <ATen/Context.h>
+#include <ATen/ScalarOps.h>
+#include <ATen/Tensor.h>
+#include <ATen/core/Generator.h>
+#include <ATen/cuda/CUDAContext.h>
+#include <ATen/cuda/CUDAGeneratorImpl.h>
+#include <c10/cuda/CUDAGuard.h>
+#include <c10/util/Optional.h>
+#include <torch/library.h>
+
+namespace {
+/*
+  There are 2 modes for using this function.
+  (Mode BMHK) With all the heads having the same seqlen
+  (Mode 1MHK) `batch=1` with all tokens across batches concatenated
+*/
+std::tuple<at::Tensor, at::Tensor, int64_t, int64_t>
+efficient_attention_forward_hip(
+    const at::Tensor& query, // [b, seqlen, num_heads, K]
+    const at::Tensor& key, // [b, seqlen, num_heads, K]
+    const at::Tensor& value, // [b, seqlen, num_heads, Kv]
+    const c10::optional<at::Tensor>& bias, // [b, num_heads, seqlen, seqlen]
+    // (Mode 1MHK only) [b+1]: cu_seqlens_q[b] contains the
+    // position of the first query token for batch $b
+    const c10::optional<at::Tensor>& seqstart_q,
+    // (Mode 1MHK only) [b+1]: cu_seqlen_k[b] contains the
+    // position of the first key token for batch $b
+    const c10::optional<at::Tensor>& seqstart_k,
+    // (Mode 1MHK only) Maximum sequence length across batches
+    const c10::optional<int64_t> max_seqlen_q_,
+    double dropout_p, // attention matrix dropout probability
+    bool compute_logsumexp,
+    int64_t custom_mask_type,
+    c10::optional<double> scale,
+    const c10::optional<at::Tensor>& seqlen_k) {
+#ifdef XFORMERS_MEM_EFF_ATTENTION_DISABLE_FORWARD
+  TORCH_CHECK(
+      false,
+      "MemoryEfficient build has been disabled at build time with -DXFORMERS_MEM_EFF_ATTENTION_DISABLE_FORWARD");
+#else
+
+  TORCH_CHECK(query.dim() == 4);
+  TORCH_CHECK(key.dim() == 4);
+  TORCH_CHECK(value.dim() == 4);
+
+  // Batch sizes
+  TORCH_CHECK(query.size(0) == key.size(0));
+  TORCH_CHECK(query.size(0) == value.size(0));
+
+  // Sequence length
+  TORCH_CHECK(key.size(1) == value.size(1));
+
+  // Num heads
+  TORCH_CHECK(query.size(2) == key.size(2));
+  TORCH_CHECK(query.size(2) == value.size(2));
+
+  // Embedding per head
+  TORCH_CHECK(query.size(3) == key.size(3));
+
+  int64_t max_seqlen_q, max_seqlen_k;
+  TORCH_CHECK(seqstart_q.has_value() == seqstart_k.has_value());
+  if (seqstart_q.has_value()) {
+    TORCH_CHECK(seqstart_q->scalar_type() == at::ScalarType::Int);
+    TORCH_CHECK(seqstart_k->scalar_type() == at::ScalarType::Int);
+    TORCH_CHECK(seqstart_q->dim() == 1 && seqstart_k->dim() == 1);
+    //CHECK_NOSPARSE_CONTIGUOUS_CUDA((*seqstart_q));
+    //CHECK_NOSPARSE_CONTIGUOUS_CUDA((*seqstart_k));
+    TORCH_CHECK(seqstart_q->size(0) == seqstart_k->size(0));
+    TORCH_CHECK(query.size(0) == 1, "cu_seqlen only supports batch_size=1");
+    TORCH_CHECK(max_seqlen_q_.has_value());
+    max_seqlen_q = *max_seqlen_q_;
+    max_seqlen_k = 0; // Will be set inside the kernel
+  } else {
+    max_seqlen_q = query.size(1);
+    max_seqlen_k = key.size(1);
+  }
+
+  //CHECK_NOSPARSE_LASTCONTIGUOUS_CUDA(query);
+  //CHECK_NOSPARSE_LASTCONTIGUOUS_CUDA(key);
+  //CHECK_NOSPARSE_LASTCONTIGUOUS_CUDA(value);
+
+  //at::cuda::CUDAGuard device_guard(query.device());
+  //cudaStream_t stream = at::cuda::getCurrentCUDAStream();
+
+  int64_t B = query.size(0);
+  int64_t M = query.size(1);
+  int64_t N = key.size(1);
+  int64_t num_heads = query.size(-2);
+  int64_t K = query.size(-1);
+  int64_t Kv = value.size(-1);
+
+  at::Tensor res;
+  at::Tensor logsumexp;
+
+  const bool use_dropout = std::fpclassify(dropout_p) != FP_ZERO;
+  at::PhiloxCudaState rng_engine_inputs;
+  if (use_dropout) {
+    at::CUDAGeneratorImpl* gen =
+        at::get_generator_or_default<at::CUDAGeneratorImpl>(
+            c10::nullopt, at::cuda::detail::getDefaultCUDAGenerator());
+
+    std::lock_guard<std::mutex> lock(gen->mutex_);
+    // if using dropout, we produce 1 random number for each element of the
+    // attention tensor
+    rng_engine_inputs = gen->philox_cuda_state(B * num_heads * M * N);
+  }
+
+  // uint64_t -> int64_t bitwise casting as PyTorch don't support uint64_t
+  // so just fake it as a int64_t
+  int64_t seed, offset;
+  if (use_dropout) {
+    std::memcpy(&seed, &rng_engine_inputs.seed_, sizeof(seed));
+    std::memcpy(&offset, &rng_engine_inputs.offset_.val, sizeof(offset));
+  }
+
+  return std::make_tuple(res, logsumexp, seed, offset);
+#endif
+}
+
+// For testing in xFormers
+bool is_ck_fmha_available()
+{
+    std::cout << "ck fmha is really here!" << std::endl;
+    return(true); 
+}; 
+
+} // namespace
+
+TORCH_LIBRARY_IMPL(xformers, CUDA, m) {
+  m.impl(
+      TORCH_SELECTIVE_NAME("xformers::efficient_attention_forward_hip"),
+      TORCH_FN(efficient_attention_forward_hip));
+}
+
+TORCH_LIBRARY_FRAGMENT(xformers, m) {
+  m.def(TORCH_SELECTIVE_SCHEMA("xformers::is_ck_fmha_available() -> bool"));
+  m.impl(
+      TORCH_SELECTIVE_NAME("xformers::is_ck_fmha_available"),
+      TORCH_FN(is_ck_fmha_available));
+}

From f4079329380433388706d97bc6bb5447b4831be5 Mon Sep 17 00:00:00 2001
From: Qianfeng Zhang <Qianfeng.Zhang@amd.com>
Date: Wed, 26 Jul 2023 09:02:11 +0000
Subject: [PATCH 004/641] Fix the source collection in setup.py

---
 setup.py | 13 ++++++++-----
 1 file changed, 8 insertions(+), 5 deletions(-)

diff --git a/setup.py b/setup.py
index 9cf6d61f1..76c0d274e 100644
--- a/setup.py
+++ b/setup.py
@@ -191,13 +191,16 @@ def get_extensions():
     extensions_dir = os.path.join("xformers", "csrc")
 
     sources = glob.glob(os.path.join(extensions_dir, "attention", "*.cpp"), recursive=False)
-    sources += glob.glob(os.path.join(extensions_dir, "attention", "autograd", "*.cpp"), recursive=True)
-    sources += glob.glob(os.path.join(extensions_dir, "attention", "cpu", "*.cpp"), recursive=True)
-    sources += glob.glob(os.path.join(extensions_dir, "indexing", "*.cpp"), recursive=True)
-    sources += glob.glob(os.path.join(extensions_dir, "swiglu", "*.cpp"), recursive=True)
+    sources += glob.glob(os.path.join(extensions_dir, "attention", "autograd", "**", "*.cpp"), recursive=True)
+    sources += glob.glob(os.path.join(extensions_dir, "attention", "cpu", "**", "*.cpp"), recursive=True)
+    sources += glob.glob(os.path.join(extensions_dir, "indexing", "**", "*.cpp"), recursive=True)
+    sources += glob.glob(os.path.join(extensions_dir, "swiglu", "**", "*.cpp"), recursive=True)
     
+    ## avoid the temporary .cu file under xformers/csrc/attention/hip_fmha are included
     source_cuda = glob.glob(os.path.join(extensions_dir, "*.cu"), recursive=False)
-    source_cuda += glob.glob(os.path.join(extensions_dir, "attention", "cuda", "*.cu"), recursive=True)
+    source_cuda += glob.glob(os.path.join(extensions_dir, "attention", "cuda", "**", "*.cu"), recursive=True)
+    source_cuda += glob.glob(os.path.join(extensions_dir, "indexing", "**", "*.cu"), recursive=True)
+    source_cuda += glob.glob(os.path.join(extensions_dir, "swiglu", "**", "*.cu"), recursive=True)
     source_hip = glob.glob(os.path.join(extensions_dir, "attention", "hip_fmha", "*.cpp"), recursive=True) 
 
     sputnik_dir = os.path.join(this_dir, "third_party", "sputnik")

From 6303e2a37b69be81ce8c19e7a7dfd39b2fb561ed Mon Sep 17 00:00:00 2001
From: Qianfeng Zhang <Qianfeng.Zhang@amd.com>
Date: Tue, 1 Aug 2023 23:01:27 +0000
Subject: [PATCH 005/641] First C++ addings for ck flash attention successfully
 compiled through CUDAExtentsion

---
 .../hip_fmha/attention_backward_generic.cpp   | 373 ++++++++++++++++
 .../hip_fmha/attention_backward_generic.cu    | 371 ++++++++++++++++
 .../hip_fmha/attention_forward_generic.cpp    | 302 +++++++++++--
 .../hip_fmha/attention_forward_generic.cu     | 400 ++++++++++++++++++
 .../hip_fmha/ck_fmha_batched_backward.h       | 245 +++++++++++
 .../hip_fmha/ck_fmha_batched_forward.h        | 260 ++++++++++++
 .../hip_fmha/ck_fmha_batched_infer.h          | 224 ++++++++++
 .../hip_fmha/ck_fmha_grouped_backward.h       | 246 +++++++++++
 .../hip_fmha/ck_fmha_grouped_forward.h        | 255 +++++++++++
 .../hip_fmha/ck_fmha_grouped_infer.h          | 223 ++++++++++
 .../csrc/attention/hip_fmha/ck_fmha_util.h    | 369 ++++++++++++++++
 11 files changed, 3242 insertions(+), 26 deletions(-)
 create mode 100644 xformers/csrc/attention/hip_fmha/attention_backward_generic.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/attention_backward_generic.cu
 create mode 100644 xformers/csrc/attention/hip_fmha/attention_forward_generic.cu
 create mode 100644 xformers/csrc/attention/hip_fmha/ck_fmha_batched_backward.h
 create mode 100644 xformers/csrc/attention/hip_fmha/ck_fmha_batched_forward.h
 create mode 100644 xformers/csrc/attention/hip_fmha/ck_fmha_batched_infer.h
 create mode 100644 xformers/csrc/attention/hip_fmha/ck_fmha_grouped_backward.h
 create mode 100644 xformers/csrc/attention/hip_fmha/ck_fmha_grouped_forward.h
 create mode 100644 xformers/csrc/attention/hip_fmha/ck_fmha_grouped_infer.h
 create mode 100644 xformers/csrc/attention/hip_fmha/ck_fmha_util.h

diff --git a/xformers/csrc/attention/hip_fmha/attention_backward_generic.cpp b/xformers/csrc/attention/hip_fmha/attention_backward_generic.cpp
new file mode 100644
index 000000000..9abfe09e8
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/attention_backward_generic.cpp
@@ -0,0 +1,373 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <cmath>
+
+#include <ATen/Context.h>
+#include <ATen/ScalarOps.h>
+#include <ATen/Tensor.h>
+#include <ATen/TensorOperators.h>
+#include <ATen/cuda/CUDAContext.h>
+#include <ATen/cuda/CUDAGeneratorImpl.h>
+#include <c10/cuda/CUDAGuard.h>
+#include <torch/library.h>
+
+#include "ck_fmha_batched_backward.h"
+#include "ck_fmha_grouped_backward.h"
+#include "ck_fmha_util.h"
+
+namespace {
+std::tuple<at::Tensor, at::Tensor, at::Tensor, at::Tensor>
+mem_efficient_attention_backward_hip(
+    const at::Tensor& grad_out,
+    const at::Tensor& query,
+    const at::Tensor& key,
+    const at::Tensor& value,
+    const c10::optional<at::Tensor>& bias, // additive attention bias
+    // (Mode 1MHK only) [b+1]: cu_seqlens_q[b] contains the
+    // position of the first query token for batch $b
+    const c10::optional<at::Tensor>& seqstart_q,
+    // (Mode 1MHK only) [b+1]: cu_seqlens_k[b] contains the
+    // position of the first key token for batch $b
+    const c10::optional<at::Tensor>& seqstart_k,
+    const c10::optional<at::Tensor>& seqlen_k,
+    const at::Tensor& logsumexp,
+    const at::Tensor& out,
+    double dropout_p, // dropout probability
+    int64_t rng_seed, // seed using for generating random numbers for dropout
+    int64_t rng_offset, // offset into random number sequence
+    int64_t custom_mask_type,
+    const c10::optional<double> scale) {
+#ifdef XFORMERS_MEM_EFF_ATTENTION_DISABLE_BACKWARD
+  TORCH_CHECK(
+      false,
+      "MemoryEfficient build has been disabled at build time with -DXFORMERS_MEM_EFF_ATTENTION_DISABLE_BACKWARD");
+#else
+  at::globalContext().alertNotDeterministic(
+      "mem_efficient_attention_backward_cutlass");
+
+  // ndim
+  TORCH_CHECK(query.dim() == grad_out.dim());
+  TORCH_CHECK(query.dim() == key.dim());
+  TORCH_CHECK(query.dim() == value.dim());
+  TORCH_CHECK(query.dim() == 4);
+
+  // batch size
+  TORCH_CHECK(query.size(0) == grad_out.size(0));
+  TORCH_CHECK(query.size(0) == key.size(0));
+  TORCH_CHECK(query.size(0) == value.size(0));
+
+  // seqlen
+  TORCH_CHECK(key.size(1) == value.size(1));
+  TORCH_CHECK(query.size(1) == grad_out.size(1));
+
+  // Num heads
+  TORCH_CHECK(query.size(2) == key.size(2));
+  TORCH_CHECK(query.size(2) == value.size(2));
+  TORCH_CHECK(query.size(2) == grad_out.size(2));
+
+  // Embedding per head
+  TORCH_CHECK(query.size(3) == key.size(3));
+  TORCH_CHECK(value.size(3) == grad_out.size(3));
+
+  // handle potentially non-contiguous grad_out through a copy
+  CHECK_NOSPARSE_CONTIGUOUS_CUDA(grad_out);
+
+  CHECK_NOSPARSE_LASTCONTIGUOUS_CUDA(query);
+  CHECK_NOSPARSE_LASTCONTIGUOUS_CUDA(key);
+  CHECK_NOSPARSE_LASTCONTIGUOUS_CUDA(value);
+
+  TORCH_CHECK(seqstart_q.has_value() == seqstart_k.has_value());
+  TORCH_CHECK(
+      !(seqstart_q.has_value() && bias.has_value()),
+      "seqstart_q + bias not supported");
+
+  if (seqstart_q.has_value()) {
+    TORCH_CHECK(seqstart_q->scalar_type() == at::ScalarType::Int);
+    TORCH_CHECK(seqstart_k->scalar_type() == at::ScalarType::Int);
+    TORCH_CHECK(seqstart_q->dim() == 1 && seqstart_k->dim() == 1);
+    CHECK_NOSPARSE_CONTIGUOUS_CUDA((*seqstart_q));
+    CHECK_NOSPARSE_CONTIGUOUS_CUDA((*seqstart_k));
+    TORCH_CHECK(seqstart_q->size(0) == seqstart_k->size(0));
+    TORCH_CHECK(query.size(0) == 1, "seqstart_q only supports batch_size=1");
+  }
+
+  at::cuda::CUDAGuard device_guard(query.device());
+  cudaStream_t stream = at::cuda::getCurrentCUDAStream();
+
+  int64_t B = query.size(0);
+  int64_t M = query.size(1);
+  int64_t N = key.size(1);
+  int64_t num_heads = query.size(2);
+  int64_t K = query.size(3);
+  int64_t Kv = value.size(3);
+
+  at::Tensor grad_q, grad_k, grad_v, grad_bias;
+
+  grad_q = at::empty(query.sizes(), query.options());
+  grad_k = at::empty(key.sizes(), key.options());
+  grad_v = at::empty(value.sizes(), value.options());
+
+  at::Tensor randvals;
+
+  at::PhiloxCudaState rng_engine_inputs(rng_seed, rng_offset);
+
+  auto set_batched_backward_params = [&](BatchedBackwardParams& p) {
+    p.B = B;
+    p.M = M;
+    p.N = N;
+    p.num_heads = num_heads;
+    p.K = K;
+    p.Kv = Kv;
+
+    if (scale.has_value()) {
+      p.scale = float(*scale);
+    } else {
+      p.scale = float(1.0 / std::sqrt(float(K)));
+    }
+
+    p.q_ptr = query.data_ptr();
+    p.k_ptr = key.data_ptr();
+    p.v_ptr = value.data_ptr();
+    p.grad_out_ptr = grad_out.data_ptr();
+    p.grad_q_ptr = grad_q.data_ptr();
+    p.grad_k_ptr = grad_k.data_ptr();
+    p.grad_v_ptr = grad_v.data_ptr();
+
+    p.q_strides = {
+        static_cast<int>(query.stride(0)),
+        static_cast<int>(query.stride(1)),
+        static_cast<int>(query.stride(2)),
+        static_cast<int>(query.stride(3))};
+    p.k_strides = {
+        static_cast<int>(key.stride(0)),
+        static_cast<int>(key.stride(1)),
+        static_cast<int>(key.stride(2)),
+        static_cast<int>(key.stride(3))};
+    p.v_strides = {
+        static_cast<int>(value.stride(0)),
+        static_cast<int>(value.stride(1)),
+        static_cast<int>(value.stride(2)),
+        static_cast<int>(value.stride(3))};
+    p.grad_out_strides = {
+        static_cast<int>(grad_out.stride(0)),
+        static_cast<int>(grad_out.stride(1)),
+        static_cast<int>(grad_out.stride(2)),
+        static_cast<int>(grad_out.stride(3))};
+
+    if (bias.has_value()) {
+      p.attn_bias_ptr = bias->data_ptr();
+
+      const at::Tensor bias_4d_view =
+          get_bias_4d_view(*bias, B, num_heads, M, N);
+
+      p.attn_bias_strides = {
+          static_cast<int>(bias_4d_view.stride(0)),
+          static_cast<int>(bias_4d_view.stride(1)),
+          static_cast<int>(bias_4d_view.stride(2)),
+          static_cast<int>(bias_4d_view.stride(3))};
+    } else
+      p.attn_bias_ptr = nullptr;
+
+    p.custom_mask_type = custom_mask_type;
+
+    p.dropout_prob = static_cast<float>(dropout_p);
+    p.rng_engine_inputs = rng_engine_inputs;
+
+    randvals = at::empty(
+        {B, num_heads, M, N}, query.options().dtype(at::ScalarType::Short));
+    p.randvals_strides = {
+        static_cast<int>(randvals.stride(0)),
+        static_cast<int>(randvals.stride(1)),
+        static_cast<int>(randvals.stride(2)),
+        static_cast<int>(randvals.stride(3))};
+    p.randvals_ptr = randvals.data_ptr();
+
+    p.logsumexp_ptr = logsumexp.data_ptr();
+
+    p.rng_seed = rng_seed;
+    p.rng_offset = rng_offset;
+  };
+
+  auto set_grouped_backward_params = [&](GroupedBackwardParams& p) {
+    p.num_batches = seqstart_q->size(0) - 1;
+    p.M = M;
+    p.N = N;
+    p.num_heads = num_heads;
+    p.K = K;
+    p.Kv = Kv;
+
+    if (scale.has_value()) {
+      p.scale = float(*scale);
+    } else {
+      p.scale = float(1.0 / std::sqrt(float(K)));
+    }
+
+    p.q_strides = {
+        static_cast<int>(query.stride(1)),
+        static_cast<int>(query.stride(2)),
+        static_cast<int>(query.stride(3))};
+    p.k_strides = {
+        static_cast<int>(key.stride(1)),
+        static_cast<int>(key.stride(2)),
+        static_cast<int>(key.stride(3))};
+    p.v_strides = {
+        static_cast<int>(value.stride(1)),
+        static_cast<int>(value.stride(2)),
+        static_cast<int>(value.stride(3))};
+    p.out_strides = {
+        static_cast<int>(out.stride(1)),
+        static_cast<int>(out.stride(2)),
+        static_cast<int>(out.stride(3))};
+
+    p.grad_out_strides = {
+        static_cast<int>(grad_out.stride(1)),
+        static_cast<int>(grad_out.stride(2)),
+        static_cast<int>(grad_out.stride(3))};
+
+    if (bias.has_value()) {
+      const at::Tensor bias_4d_view =
+          get_bias_4d_view(*bias, B, num_heads, M, N);
+      p.attn_bias_strides = {
+          static_cast<int>(bias_4d_view.stride(0)),
+          static_cast<int>(bias_4d_view.stride(1)),
+          static_cast<int>(bias_4d_view.stride(2)),
+          static_cast<int>(bias_4d_view.stride(3))};
+    };
+
+    p.dropout_prob = static_cast<float>(dropout_p);
+    p.rng_engine_inputs = rng_engine_inputs;
+
+    randvals = at::empty(
+        {num_heads, M, N}, query.options().dtype(at::ScalarType::Short));
+    p.randvals_strides = {
+        static_cast<int>(randvals.stride(0)),
+        static_cast<int>(randvals.stride(1)),
+        static_cast<int>(randvals.stride(2))};
+
+    p.custom_mask_type = custom_mask_type;
+
+    p.host_seqstart_q.resize(p.num_batches + 1);
+    p.host_seqstart_k.resize(p.num_batches + 1);
+
+    if (seqlen_k.has_value())
+      p.host_seqlen_k.resize(p.num_batches);
+
+    FMHA_HIP_CHECK(hipMemcpy(
+        p.host_seqstart_q.data(),
+        seqstart_q->data_ptr(),
+        (p.num_batches + 1) * sizeof(int),
+        hipMemcpyDeviceToHost));
+    FMHA_HIP_CHECK(hipMemcpy(
+        p.host_seqstart_k.data(),
+        seqstart_k->data_ptr(),
+        (p.num_batches + 1) * sizeof(int),
+        hipMemcpyDeviceToHost));
+    if (seqlen_k.has_value())
+      FMHA_HIP_CHECK(hipMemcpy(
+          p.host_seqlen_k.data(),
+          seqlen_k->data_ptr(),
+          p.num_batches * sizeof(int),
+          hipMemcpyDeviceToHost));
+
+    char* q_ptr = reinterpret_cast<char*>(query.data_ptr());
+    char* k_ptr = reinterpret_cast<char*>(key.data_ptr());
+    char* v_ptr = reinterpret_cast<char*>(value.data_ptr());
+
+    char* out_ptr = reinterpret_cast<char*>(out.data_ptr());
+    char* grad_out_ptr = reinterpret_cast<char*>(grad_out.data_ptr());
+    char* attn_bias_ptr = reinterpret_cast<char*>(bias->data_ptr());
+
+    char* logsumexp_ptr = reinterpret_cast<char*>(logsumexp.data_ptr());
+    char* randvals_ptr = reinterpret_cast<char*>(randvals.data_ptr());
+
+    char* grad_q_ptr = reinterpret_cast<char*>(grad_q.data_ptr());
+    char* grad_k_ptr = reinterpret_cast<char*>(grad_k.data_ptr());
+    char* grad_v_ptr = reinterpret_cast<char*>(grad_v.data_ptr());
+
+    for (int i = 0; i < p.num_batches; i++) {
+      int32_t tmp_q_stride = get_size_in_bytes(
+          p.host_seqstart_q[i] * p.q_strides[0], query.scalar_type());
+      int32_t tmp_k_stride = get_size_in_bytes(
+          p.host_seqstart_k[i] * p.k_strides[0], key.scalar_type());
+      int32_t tmp_v_stride = get_size_in_bytes(
+          p.host_seqstart_k[i] * p.v_strides[0], value.scalar_type());
+      int32_t tmp_o_stride = get_size_in_bytes(
+          p.host_seqstart_q[i] * p.out_strides[0], out.scalar_type());
+      int32_t tmp_grad_o_stride = get_size_in_bytes(
+          p.host_seqstart_q[i] * p.grad_out_strides[0], grad_out.scalar_type());
+      int32_t tmp_logsumexp_stride =
+          get_size_in_bytes(p.host_seqstart_q[i], logsumexp.scalar_type());
+      int32_t tmp_randvals_stride = get_size_in_bytes(
+          p.host_seqstart_q[i] * p.randvals_strides[1] +
+              p.host_seqstart_k[i] * p.randvals_strides[2],
+          randvals.scalar_type());
+
+      p.q_ptrs.push_back(reinterpret_cast<void*>(q_ptr));
+      p.grad_q_ptrs.push_back(reinterpret_cast<void*>(grad_q_ptr));
+
+      q_ptr = q_ptr + tmp_q_stride;
+      grad_q_ptr = grad_q_ptr + tmp_q_stride;
+
+      p.k_ptrs.push_back(reinterpret_cast<void*>(k_ptr));
+      p.grad_k_ptrs.push_back(reinterpret_cast<void*>(grad_k_ptr));
+      k_ptr = k_ptr + tmp_k_stride;
+      grad_k_ptr = grad_k_ptr + tmp_k_stride;
+
+      p.v_ptrs.push_back(reinterpret_cast<void*>(v_ptr));
+      p.grad_v_ptrs.push_back(reinterpret_cast<void*>(grad_v_ptr));
+      v_ptr = v_ptr + tmp_k_stride;
+      grad_v_ptr = grad_v_ptr + tmp_k_stride;
+
+      p.out_ptrs.push_back(reinterpret_cast<void*>(out_ptr));
+      p.grad_out_ptrs.push_back(reinterpret_cast<void*>(grad_out_ptr));
+      out_ptr = out_ptr + tmp_o_stride;
+      grad_out_ptr = grad_out_ptr + tmp_o_stride;
+
+      if (bias.has_value()) {
+        int32_t tmp_bias_stride = get_size_in_bytes(
+            p.host_seqstart_q[i] * p.attn_bias_strides[2] +
+                p.host_seqstart_k[i] * p.attn_bias_strides[3],
+            bias->scalar_type());
+
+        p.attn_bias_ptrs.push_back(reinterpret_cast<void*>(attn_bias_ptr));
+        attn_bias_ptr = attn_bias_ptr + tmp_bias_stride;
+      };
+
+      p.logsumexp_ptrs.push_back(reinterpret_cast<void*>(logsumexp_ptr));
+      logsumexp_ptr = logsumexp_ptr + tmp_logsumexp_stride;
+
+      p.randvals_ptrs.push_back(reinterpret_cast<void*>(randvals_ptr));
+      randvals_ptr = randvals_ptr + tmp_randvals_stride;
+    }
+  };
+
+  DISPATCH_TYPES(query.scalar_type(), [&]() {
+    if (!seqstart_q.has_value()) { // input is batched
+      BatchedBackwardParams batched_backward_params;
+
+      set_batched_backward_params(batched_backward_params);
+      batched_backward<scalar_t>(batched_backward_params, stream);
+    } else { // input is grouped
+      GroupedBackwardParams grouped_backward_params;
+
+      set_grouped_backward_params(grouped_backward_params);
+      grouped_backward<scalar_t>(grouped_backward_params, stream);
+    }
+  });
+
+  return std::make_tuple(grad_q, grad_k, grad_v, grad_bias);
+#endif
+} // namespace
+
+} // namespace
+
+TORCH_LIBRARY_IMPL(xformers, CUDA, m) {
+  m.impl(
+      TORCH_SELECTIVE_NAME("xformers::efficient_attention_backward_hip"),
+      TORCH_FN(mem_efficient_attention_backward_hip));
+}
diff --git a/xformers/csrc/attention/hip_fmha/attention_backward_generic.cu b/xformers/csrc/attention/hip_fmha/attention_backward_generic.cu
new file mode 100644
index 000000000..2756763ce
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/attention_backward_generic.cu
@@ -0,0 +1,371 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <cmath>
+
+#include <ATen/Context.h>
+#include <ATen/ScalarOps.h>
+#include <ATen/Tensor.h>
+#include <ATen/TensorOperators.h>
+#include <ATen/cuda/CUDAContext.h>
+#include <ATen/cuda/CUDAGeneratorImpl.h>
+#include <c10/cuda/CUDAGuard.h>
+#include <torch/library.h>
+
+#include "ck_fmha_util.h"
+#include "ck_fmha_batched_backward.h"
+#include "ck_fmha_grouped_backward.h"
+
+namespace {
+std::tuple<at::Tensor, at::Tensor, at::Tensor, at::Tensor>
+mem_efficient_attention_backward_hip(
+    const at::Tensor& grad_out,
+    const at::Tensor& query,
+    const at::Tensor& key,
+    const at::Tensor& value,
+    const c10::optional<at::Tensor>& bias, // additive attention bias
+    // (Mode 1MHK only) [b+1]: cu_seqlens_q[b] contains the
+    // position of the first query token for batch $b
+    const c10::optional<at::Tensor>& seqstart_q,
+    // (Mode 1MHK only) [b+1]: cu_seqlens_k[b] contains the
+    // position of the first key token for batch $b
+    const c10::optional<at::Tensor>& seqstart_k,
+    const c10::optional<at::Tensor>& seqlen_k,
+    const at::Tensor& logsumexp,
+    const at::Tensor& out,
+    double dropout_p, // dropout probability
+    int64_t rng_seed, // seed using for generating random numbers for dropout
+    int64_t rng_offset, // offset into random number sequence
+    int64_t custom_mask_type,
+    const c10::optional<double> scale) {
+#ifdef XFORMERS_MEM_EFF_ATTENTION_DISABLE_BACKWARD
+  TORCH_CHECK(
+      false,
+      "MemoryEfficient build has been disabled at build time with -DXFORMERS_MEM_EFF_ATTENTION_DISABLE_BACKWARD");
+#else
+  at::globalContext().alertNotDeterministic(
+      "mem_efficient_attention_backward_cutlass");
+
+  // ndim
+  TORCH_CHECK(query.dim() == grad_out.dim());
+  TORCH_CHECK(query.dim() == key.dim());
+  TORCH_CHECK(query.dim() == value.dim());
+  TORCH_CHECK(query.dim() == 4);
+
+  // batch size
+  TORCH_CHECK(query.size(0) == grad_out.size(0));
+  TORCH_CHECK(query.size(0) == key.size(0));
+  TORCH_CHECK(query.size(0) == value.size(0));
+
+  // seqlen
+  TORCH_CHECK(key.size(1) == value.size(1));
+  TORCH_CHECK(query.size(1) == grad_out.size(1));
+
+  // Num heads
+  TORCH_CHECK(query.size(2) == key.size(2));
+  TORCH_CHECK(query.size(2) == value.size(2));
+  TORCH_CHECK(query.size(2) == grad_out.size(2));
+
+  // Embedding per head
+  TORCH_CHECK(query.size(3) == key.size(3));
+  TORCH_CHECK(value.size(3) == grad_out.size(3));
+
+  // handle potentially non-contiguous grad_out through a copy
+  CHECK_NOSPARSE_CONTIGUOUS_CUDA(grad_out);
+
+  CHECK_NOSPARSE_LASTCONTIGUOUS_CUDA(query);
+  CHECK_NOSPARSE_LASTCONTIGUOUS_CUDA(key);
+  CHECK_NOSPARSE_LASTCONTIGUOUS_CUDA(value);
+
+  TORCH_CHECK(seqstart_q.has_value() == seqstart_k.has_value());
+  TORCH_CHECK(
+      !(seqstart_q.has_value() && bias.has_value()),
+      "seqstart_q + bias not supported");
+
+  if (seqstart_q.has_value()) {
+    TORCH_CHECK(seqstart_q->scalar_type() == at::ScalarType::Int);
+    TORCH_CHECK(seqstart_k->scalar_type() == at::ScalarType::Int);
+    TORCH_CHECK(seqstart_q->dim() == 1 && seqstart_k->dim() == 1);
+    CHECK_NOSPARSE_CONTIGUOUS_CUDA((*seqstart_q));
+    CHECK_NOSPARSE_CONTIGUOUS_CUDA((*seqstart_k));
+    TORCH_CHECK(seqstart_q->size(0) == seqstart_k->size(0));
+    TORCH_CHECK(query.size(0) == 1, "seqstart_q only supports batch_size=1");
+  }
+
+  at::cuda::CUDAGuard device_guard(query.device());
+  cudaStream_t stream = at::cuda::getCurrentCUDAStream();
+
+  int64_t B = query.size(0);
+  int64_t M = query.size(1);
+  int64_t N = key.size(1);
+  int64_t num_heads = query.size(2);
+  int64_t K = query.size(3);
+  int64_t Kv = value.size(3);
+
+  at::Tensor grad_q, grad_k, grad_v, grad_bias;
+
+  grad_q = at::empty(query.sizes(), query.options());
+  grad_k = at::empty(key.sizes(), key.options());
+  grad_v = at::empty(value.sizes(), value.options());
+
+  at::Tensor randvals;
+
+  at::PhiloxCudaState rng_engine_inputs(rng_seed, rng_offset);
+
+  auto set_batched_backward_params = [&](BatchedBackwardParams& p) {
+    p.B = B;
+    p.M = M;
+    p.N = N;
+    p.num_heads = num_heads;
+    p.K = K;
+    p.Kv = Kv;
+
+    if (scale.has_value()) {
+      p.scale = float(*scale);
+    } else {
+      p.scale = float(1.0 / std::sqrt(float(K)));
+    }
+
+    p.q_ptr = query.data_ptr();
+    p.k_ptr = key.data_ptr();
+    p.v_ptr = value.data_ptr();
+    p.grad_out_ptr = grad_out.data_ptr();
+    p.grad_q_ptr = grad_q.data_ptr();
+    p.grad_k_ptr = grad_k.data_ptr();
+    p.grad_v_ptr = grad_v.data_ptr();
+
+    p.q_strides = {
+        static_cast<int>(query.stride(0)),
+        static_cast<int>(query.stride(1)),
+        static_cast<int>(query.stride(2)),
+        static_cast<int>(query.stride(3))};
+    p.k_strides = {
+        static_cast<int>(key.stride(0)),
+        static_cast<int>(key.stride(1)),
+        static_cast<int>(key.stride(2)),
+        static_cast<int>(key.stride(3))};
+    p.v_strides = {
+        static_cast<int>(value.stride(0)),
+        static_cast<int>(value.stride(1)),
+        static_cast<int>(value.stride(2)),
+        static_cast<int>(value.stride(3))};
+    p.grad_out_strides = {
+        static_cast<int>(grad_out.stride(0)),
+        static_cast<int>(grad_out.stride(1)),
+        static_cast<int>(grad_out.stride(2)),
+        static_cast<int>(grad_out.stride(3))};
+
+    if (bias.has_value()) {
+      p.attn_bias_ptr = bias->data_ptr();
+
+      const at::Tensor bias_4d_view =
+          get_bias_4d_view(*bias, B, num_heads, M, N);
+
+      p.attn_bias_strides = {
+          static_cast<int>(bias_4d_view.stride(0)),
+          static_cast<int>(bias_4d_view.stride(1)),
+          static_cast<int>(bias_4d_view.stride(2)),
+          static_cast<int>(bias_4d_view.stride(3))};
+    } else
+      p.attn_bias_ptr = nullptr;
+
+    p.custom_mask_type = custom_mask_type;
+
+    p.dropout_prob = static_cast<float>(dropout_p);
+    p.rng_engine_inputs = rng_engine_inputs;
+
+    randvals = at::empty(
+        {B, num_heads, M, N}, query.options().dtype(at::ScalarType::Short));
+    p.randvals_strides = {
+        static_cast<int>(randvals.stride(0)),
+        static_cast<int>(randvals.stride(1)),
+        static_cast<int>(randvals.stride(2)),
+        static_cast<int>(randvals.stride(3))};
+    p.randvals_ptr = randvals.data_ptr();
+
+    p.logsumexp_ptr = logsumexp.data_ptr();
+  };
+
+  auto set_grouped_backward_params = [&](GroupedBackwardParams& p) {
+    p.num_batches = seqstart_q->size(0) - 1;
+    p.M = M;
+    p.N = N;
+    p.num_heads = num_heads;
+    p.K = K;
+    p.Kv = Kv;
+
+    if (scale.has_value()) {
+      p.scale = float(*scale);
+    } else {
+      p.scale = float(1.0 / std::sqrt(float(K)));
+    }
+
+    p.q_strides = {
+        static_cast<int>(query.stride(1)),
+        static_cast<int>(query.stride(2)),
+        static_cast<int>(query.stride(3))};
+    p.k_strides = {
+        static_cast<int>(key.stride(1)),
+        static_cast<int>(key.stride(2)),
+        static_cast<int>(key.stride(3))};
+    p.v_strides = {
+        static_cast<int>(value.stride(1)),
+        static_cast<int>(value.stride(2)),
+        static_cast<int>(value.stride(3))};
+    p.out_strides = {
+        static_cast<int>(out.stride(1)),
+        static_cast<int>(out.stride(2)),
+        static_cast<int>(out.stride(3))};
+
+    p.grad_out_strides = {
+        static_cast<int>(grad_out.stride(1)),
+        static_cast<int>(grad_out.stride(2)),
+        static_cast<int>(grad_out.stride(3))};
+
+    if (bias.has_value()) {
+      const at::Tensor bias_4d_view =
+          get_bias_4d_view(*bias, B, num_heads, M, N);
+      p.attn_bias_strides = {
+          static_cast<int>(bias_4d_view.stride(0)),
+          static_cast<int>(bias_4d_view.stride(1)),
+          static_cast<int>(bias_4d_view.stride(2)),
+          static_cast<int>(bias_4d_view.stride(3))};
+    };
+
+    p.dropout_prob = static_cast<float>(dropout_p);
+    p.rng_engine_inputs = rng_engine_inputs;
+
+    randvals = at::empty(
+        {num_heads, M, N}, query.options().dtype(at::ScalarType::Short));
+    p.randvals_strides = {
+        static_cast<int>(randvals.stride(0)),
+        static_cast<int>(randvals.stride(1)),
+        static_cast<int>(randvals.stride(2))};
+
+    p.custom_mask_type = custom_mask_type;
+
+    p.host_seqstart_q.resize(p.num_batches + 1);
+    p.host_seqstart_k.resize(p.num_batches + 1);
+
+    if (seqlen_k.has_value())
+      p.host_seqlen_k.resize(p.num_batches);
+
+    FMHA_HIP_CHECK(hipMemcpy(
+        p.host_seqstart_q.data(),
+        seqstart_q->data_ptr(),
+        (p.num_batches + 1) * sizeof(int),
+        hipMemcpyDeviceToHost));
+    FMHA_HIP_CHECK(hipMemcpy(
+        p.host_seqstart_k.data(),
+        seqstart_k->data_ptr(),
+        (p.num_batches + 1) * sizeof(int),
+        hipMemcpyDeviceToHost));
+    if (seqlen_k.has_value())
+      FMHA_HIP_CHECK(hipMemcpy(
+          p.host_seqlen_k.data(),
+          seqlen_k->data_ptr(),
+          p.num_batches * sizeof(int),
+          hipMemcpyDeviceToHost));
+
+    char* q_ptr = reinterpret_cast<char*>(query.data_ptr());
+    char* k_ptr = reinterpret_cast<char*>(key.data_ptr());
+    char* v_ptr = reinterpret_cast<char*>(value.data_ptr());
+
+    char* out_ptr = reinterpret_cast<char*>(out.data_ptr());
+    char* grad_out_ptr = reinterpret_cast<char*>(grad_out.data_ptr());
+    char* attn_bias_ptr = reinterpret_cast<char*>(bias->data_ptr());
+
+    char* logsumexp_ptr = reinterpret_cast<char*>(logsumexp.data_ptr());
+    char* randvals_ptr = reinterpret_cast<char*>(randvals.data_ptr());
+
+    char* grad_q_ptr = reinterpret_cast<char*>(grad_q.data_ptr());
+    char* grad_k_ptr = reinterpret_cast<char*>(grad_k.data_ptr());
+    char* grad_v_ptr = reinterpret_cast<char*>(grad_v.data_ptr());
+
+    for (int i = 0; i < p.num_batches; i++) {
+      int32_t tmp_q_stride = get_size_in_bytes(
+          p.host_seqstart_q[i] * p.q_strides[0], query.scalar_type());
+      int32_t tmp_k_stride = get_size_in_bytes(
+          p.host_seqstart_k[i] * p.k_strides[0], key.scalar_type());
+      int32_t tmp_v_stride = get_size_in_bytes(
+          p.host_seqstart_k[i] * p.v_strides[0], value.scalar_type());
+      int32_t tmp_o_stride = get_size_in_bytes(
+          p.host_seqstart_q[i] * p.out_strides[0], out.scalar_type());
+      int32_t tmp_grad_o_stride = get_size_in_bytes(
+          p.host_seqstart_q[i] * p.grad_out_strides[0],
+          grad_out_.scalar_type());
+      int32_t tmp_logsumexp_stride =
+          get_size_in_bytes(p.host_seqstart_q[i], logsumexp.scalar_type());
+      int32_t tmp_randvals_stride = get_size_in_bytes(
+          p.host_seqstart_q[i] * p.randvals_strides[1] +
+              p.host_seqstart_k[i] * p.randvals_strides[2],
+          randvals.scalar_type());
+
+      p.q_ptrs.push_back(reinterpret_cast<void*>(q_ptr));
+      p.grad_q_ptrs.push_back(reinterpret_cast<void*>(grad_q_ptr));
+
+      q_ptr = q_ptr + tmp_q_stride;
+      grad_q_ptr = grad_q_ptr + tmp_q_stride;
+
+      p.k_ptrs.push_back(reinterpret_cast<void*>(k_ptr));
+      p.grad_k_ptrs.push_back(reinterpret_cast<void*>(grad_k_ptr));
+      k_ptr = k_ptr + tmp_k_stride;
+      grad_k_ptr = grad_k_ptr + tmp_k_stride;
+
+      p.v_ptrs.push_back(reinterpret_cast<void*>(v_ptr));
+      p.grad_v_ptrs.push_back(reinterpret_cast<void*>(grad_v_ptr));
+      v_ptr = v_ptr + tmp_k_stride;
+      grad_v_ptr = grad_v_ptr + tmp_k_stride;
+
+      p.out_ptrs.push_back(reinterpret_cast<void*>(out_ptr));
+      p.grad_out_ptrs.push_back(reinterpret_cast<void*>(grad_out_ptr));
+      out_ptr = out_ptr + tmp_o_stride;
+      grad_out_ptr = grad_out_ptr + tmp_o_stride;
+
+      if (bias.has_value()) {
+        int32_t tmp_bias_stride = get_size_in_bytes(
+            p.host_seqstart_q[i] * p.attn_bias_strides[2] +
+                p.host_seqstart_k[i] * p.attn_bias_strides[3],
+            bias->scalar_type());
+
+        p.attn_bias_ptrs.push_back(reinterpret_cast<void*>(attn_bias_ptr));
+        attn_bias_ptr = attn_bias_ptr + tmp_bias_stride;
+      };
+
+      p.logsumexp_ptrs.push_back(reinterpret_cast<void*>(logsumexp_ptr));
+      logsumexp_ptr = logsumexp_ptr + tmp_logsumexp_stride;
+
+      p.randvals_ptrs.push_back(reinterpret_cast<void*>(randvals_ptr));
+      randvals_ptr = randvals_ptr + tmp_randvals_stride;
+    }
+  };
+
+  DISPATCH_TYPES(query.scalar_type(), [&]() {
+    if (!seqstart_q.has_value()) { // input is batched
+      BatchedBackwardParams batched_backward_params;
+
+      set_batched_backward_params(batched_backward_params);
+      batched_backward<scalar_t>(batched_backward_params, stream)
+    } else { // input is grouped
+      GroupedBackwardParams grouped_backward_params;
+
+      set_grouped_backward_params(grouped_backward_params);
+      grouped_backward<scalar_t>(grouped_backward_params, stream);
+    }
+  });
+
+  return std::make_tuple(grad_q, grad_k, grad_v, grad_bias);
+#endif
+} // namespace
+
+} // namespace
+
+TORCH_LIBRARY_IMPL(xformers, CUDA, m) {
+  m.impl(
+      TORCH_SELECTIVE_NAME("xformers::efficient_attention_backward_hip"),
+      TORCH_FN(mem_efficient_attention_backward_hip));
+}
diff --git a/xformers/csrc/attention/hip_fmha/attention_forward_generic.cpp b/xformers/csrc/attention/hip_fmha/attention_forward_generic.cpp
index 388340c10..667d63370 100644
--- a/xformers/csrc/attention/hip_fmha/attention_forward_generic.cpp
+++ b/xformers/csrc/attention/hip_fmha/attention_forward_generic.cpp
@@ -18,7 +18,14 @@
 #include <c10/util/Optional.h>
 #include <torch/library.h>
 
+#include "ck_fmha_batched_forward.h"
+#include "ck_fmha_batched_infer.h"
+#include "ck_fmha_grouped_forward.h"
+#include "ck_fmha_grouped_infer.h"
+#include "ck_fmha_util.h"
+
 namespace {
+
 /*
   There are 2 modes for using this function.
   (Mode BMHK) With all the heads having the same seqlen
@@ -67,30 +74,23 @@ efficient_attention_forward_hip(
   // Embedding per head
   TORCH_CHECK(query.size(3) == key.size(3));
 
-  int64_t max_seqlen_q, max_seqlen_k;
   TORCH_CHECK(seqstart_q.has_value() == seqstart_k.has_value());
   if (seqstart_q.has_value()) {
     TORCH_CHECK(seqstart_q->scalar_type() == at::ScalarType::Int);
     TORCH_CHECK(seqstart_k->scalar_type() == at::ScalarType::Int);
     TORCH_CHECK(seqstart_q->dim() == 1 && seqstart_k->dim() == 1);
-    //CHECK_NOSPARSE_CONTIGUOUS_CUDA((*seqstart_q));
-    //CHECK_NOSPARSE_CONTIGUOUS_CUDA((*seqstart_k));
+    CHECK_NOSPARSE_CONTIGUOUS_CUDA((*seqstart_q));
+    CHECK_NOSPARSE_CONTIGUOUS_CUDA((*seqstart_k));
     TORCH_CHECK(seqstart_q->size(0) == seqstart_k->size(0));
     TORCH_CHECK(query.size(0) == 1, "cu_seqlen only supports batch_size=1");
-    TORCH_CHECK(max_seqlen_q_.has_value());
-    max_seqlen_q = *max_seqlen_q_;
-    max_seqlen_k = 0; // Will be set inside the kernel
-  } else {
-    max_seqlen_q = query.size(1);
-    max_seqlen_k = key.size(1);
-  }
+  };
 
-  //CHECK_NOSPARSE_LASTCONTIGUOUS_CUDA(query);
-  //CHECK_NOSPARSE_LASTCONTIGUOUS_CUDA(key);
-  //CHECK_NOSPARSE_LASTCONTIGUOUS_CUDA(value);
+  CHECK_NOSPARSE_LASTCONTIGUOUS_CUDA(query);
+  CHECK_NOSPARSE_LASTCONTIGUOUS_CUDA(key);
+  CHECK_NOSPARSE_LASTCONTIGUOUS_CUDA(value);
 
-  //at::cuda::CUDAGuard device_guard(query.device());
-  //cudaStream_t stream = at::cuda::getCurrentCUDAStream();
+  // at::cuda::CUDAGuard device_guard(query.device());
+  hipStream_t stream = at::cuda::getCurrentHIPStream().stream();
 
   int64_t B = query.size(0);
   int64_t M = query.size(1);
@@ -99,8 +99,9 @@ efficient_attention_forward_hip(
   int64_t K = query.size(-1);
   int64_t Kv = value.size(-1);
 
-  at::Tensor res;
+  at::Tensor out;
   at::Tensor logsumexp;
+  at::Tensor randvals;
 
   const bool use_dropout = std::fpclassify(dropout_p) != FP_ZERO;
   at::PhiloxCudaState rng_engine_inputs;
@@ -115,24 +116,273 @@ efficient_attention_forward_hip(
     rng_engine_inputs = gen->philox_cuda_state(B * num_heads * M * N);
   }
 
+  auto set_batched_infer_params = [&](BatchedInferParams& p) {
+    p.B = B;
+    p.M = M;
+    p.N = N;
+    p.num_heads = num_heads;
+    p.K = K;
+    p.Kv = Kv;
+
+    if (scale.has_value()) {
+      p.scale = float(*scale);
+    } else {
+      p.scale = float(1.0 / std::sqrt(float(K)));
+    }
+
+    p.q_ptr = query.data_ptr();
+    p.k_ptr = key.data_ptr();
+    p.v_ptr = value.data_ptr();
+    p.out_ptr = out.data_ptr();
+
+    p.q_strides = {
+        static_cast<int>(query.stride(0)),
+        static_cast<int>(query.stride(1)),
+        static_cast<int>(query.stride(2)),
+        static_cast<int>(query.stride(3))};
+    p.k_strides = {
+        static_cast<int>(key.stride(0)),
+        static_cast<int>(key.stride(1)),
+        static_cast<int>(key.stride(2)),
+        static_cast<int>(key.stride(3))};
+    p.v_strides = {
+        static_cast<int>(value.stride(0)),
+        static_cast<int>(value.stride(1)),
+        static_cast<int>(value.stride(2)),
+        static_cast<int>(value.stride(3))};
+    p.out_strides = {
+        static_cast<int>(out.stride(0)),
+        static_cast<int>(out.stride(1)),
+        static_cast<int>(out.stride(2)),
+        static_cast<int>(out.stride(3))};
+
+    if (bias.has_value()) {
+      p.attn_bias_ptr = bias->data_ptr();
+
+      const at::Tensor bias_4d_view =
+          get_bias_4d_view(*bias, B, num_heads, M, N);
+      p.attn_bias_strides = {
+          static_cast<int>(bias_4d_view.stride(0)),
+          static_cast<int>(bias_4d_view.stride(1)),
+          static_cast<int>(bias_4d_view.stride(2)),
+          static_cast<int>(bias_4d_view.stride(3))};
+    } else
+      p.attn_bias_ptr = nullptr;
+
+    p.custom_mask_type = custom_mask_type;
+  };
+
+  auto set_batched_forward_params = [&](BatchedForwardParams& p) {
+    set_batched_infer_params(p);
+
+    p.dropout_prob = static_cast<float>(dropout_p);
+
+    p.rng_engine_inputs = rng_engine_inputs;
+
+    randvals = at::empty(
+        {B, num_heads, M, N}, query.options().dtype(at::ScalarType::Short));
+    p.randvals_strides = {
+        static_cast<int>(randvals.stride(0)),
+        static_cast<int>(randvals.stride(1)),
+        static_cast<int>(randvals.stride(2)),
+        static_cast<int>(randvals.stride(3))};
+    p.randvals_ptr = randvals.data_ptr();
+
+    logsumexp = at::empty(
+        {B, num_heads, M}, query.options().dtype(at::ScalarType::Float));
+    p.logsumexp_ptr = logsumexp.data_ptr();
+  };
+
+  auto set_grouped_infer_params = [&](GroupedInferParams& p) {
+    p.num_batches = seqstart_q->size(0) - 1;
+    p.M = M;
+    p.N = N;
+    p.num_heads = num_heads;
+    p.K = K;
+    p.Kv = Kv;
+
+    if (scale.has_value()) {
+      p.scale = float(*scale);
+    } else {
+      p.scale = float(1.0 / std::sqrt(float(K)));
+    }
+
+    p.q_strides = {
+        static_cast<int>(query.stride(1)),
+        static_cast<int>(query.stride(2)),
+        static_cast<int>(query.stride(3))};
+    p.k_strides = {
+        static_cast<int>(key.stride(1)),
+        static_cast<int>(key.stride(2)),
+        static_cast<int>(key.stride(3))};
+    p.v_strides = {
+        static_cast<int>(value.stride(1)),
+        static_cast<int>(value.stride(2)),
+        static_cast<int>(value.stride(3))};
+    p.out_strides = {
+        static_cast<int>(out.stride(1)),
+        static_cast<int>(out.stride(2)),
+        static_cast<int>(out.stride(3))};
+
+    if (bias.has_value()) {
+      const at::Tensor bias_4d_view =
+          get_bias_4d_view(*bias, B, num_heads, M, N);
+      p.attn_bias_strides = {
+          static_cast<int>(bias_4d_view.stride(0)),
+          static_cast<int>(bias_4d_view.stride(1)),
+          static_cast<int>(bias_4d_view.stride(2)),
+          static_cast<int>(bias_4d_view.stride(3))};
+    };
+
+    p.custom_mask_type = custom_mask_type;
+
+    p.host_seqstart_q.resize(p.num_batches + 1);
+    p.host_seqstart_k.resize(p.num_batches + 1);
+
+    if (seqlen_k.has_value())
+      p.host_seqlen_k.resize(p.num_batches);
+
+    FMHA_HIP_CHECK(hipMemcpy(
+        p.host_seqstart_q.data(),
+        seqstart_q->data_ptr(),
+        (p.num_batches + 1) * sizeof(int32_t),
+        hipMemcpyDeviceToHost));
+    FMHA_HIP_CHECK(hipMemcpy(
+        p.host_seqstart_k.data(),
+        seqstart_k->data_ptr(),
+        (p.num_batches + 1) * sizeof(int32_t),
+        hipMemcpyDeviceToHost));
+    if (seqlen_k.has_value())
+      FMHA_HIP_CHECK(hipMemcpy(
+          p.host_seqlen_k.data(),
+          seqlen_k->data_ptr(),
+          p.num_batches * sizeof(int32_t),
+          hipMemcpyDeviceToHost));
+
+    char* q_ptr = reinterpret_cast<char*>(query.data_ptr());
+    char* k_ptr = reinterpret_cast<char*>(key.data_ptr());
+    char* v_ptr = reinterpret_cast<char*>(value.data_ptr());
+
+    char* out_ptr = reinterpret_cast<char*>(out.data_ptr());
+    char* attn_bias_ptr = reinterpret_cast<char*>(bias->data_ptr());
+
+    for (int i = 0; i < p.num_batches; i++) {
+      int32_t tmp_q_stride = get_size_in_bytes(
+          p.host_seqstart_q[i] * p.q_strides[0], query.scalar_type());
+      int32_t tmp_k_stride = get_size_in_bytes(
+          p.host_seqstart_k[i] * p.k_strides[0], key.scalar_type());
+      int32_t tmp_v_stride = get_size_in_bytes(
+          p.host_seqstart_k[i] * p.v_strides[0], value.scalar_type());
+      int32_t tmp_o_stride = get_size_in_bytes(
+          p.host_seqstart_q[i] * p.out_strides[0], out.scalar_type());
+
+      p.q_ptrs.push_back(reinterpret_cast<void*>(q_ptr));
+      q_ptr = q_ptr + tmp_q_stride;
+
+      p.k_ptrs.push_back(reinterpret_cast<void*>(k_ptr));
+      k_ptr = k_ptr + tmp_k_stride;
+
+      p.v_ptrs.push_back(reinterpret_cast<void*>(v_ptr));
+      v_ptr = v_ptr + tmp_k_stride;
+
+      p.out_ptrs.push_back(reinterpret_cast<void*>(out_ptr));
+      out_ptr = out_ptr + tmp_o_stride;
+
+      if (bias.has_value()) {
+        int32_t tmp_bias_stride = get_size_in_bytes(
+            p.host_seqstart_q[i] * p.attn_bias_strides[2] +
+                p.host_seqstart_k[i] * p.attn_bias_strides[3],
+            bias->scalar_type());
+
+        p.attn_bias_ptrs.push_back(reinterpret_cast<void*>(attn_bias_ptr));
+        attn_bias_ptr = attn_bias_ptr + tmp_bias_stride;
+      };
+    }
+  };
+
+  auto set_grouped_forward_params = [&](GroupedForwardParams& p) {
+    set_grouped_infer_params(p);
+
+    p.dropout_prob = static_cast<float>(dropout_p);
+    p.rng_engine_inputs = rng_engine_inputs;
+
+    logsumexp =
+        at::empty({num_heads, M}, query.options().dtype(at::ScalarType::Float));
+
+    randvals = at::empty(
+        {num_heads, M, N}, query.options().dtype(at::ScalarType::Short));
+    p.randvals_strides = {
+        static_cast<int>(randvals.stride(0)),
+        static_cast<int>(randvals.stride(1)),
+        static_cast<int>(randvals.stride(2))};
+
+    char* logsumexp_ptr = reinterpret_cast<char*>(logsumexp.data_ptr());
+    char* randvals_ptr = reinterpret_cast<char*>(randvals.data_ptr());
+
+    for (int i = 0; i < p.num_batches; i++) {
+      int32_t tmp_logsumexp_stride =
+          get_size_in_bytes(p.host_seqstart_q[i], logsumexp.scalar_type());
+      int32_t tmp_randvals_stride = get_size_in_bytes(
+          p.host_seqstart_q[i] * p.randvals_strides[1] +
+              p.host_seqstart_k[i] * p.randvals_strides[2],
+          randvals.scalar_type());
+
+      p.logsumexp_ptrs.push_back(reinterpret_cast<void*>(logsumexp_ptr));
+      logsumexp_ptr = logsumexp_ptr + tmp_logsumexp_stride;
+
+      p.randvals_ptrs.push_back(reinterpret_cast<void*>(randvals_ptr));
+      randvals_ptr = randvals_ptr + tmp_randvals_stride;
+    };
+  };
+
   // uint64_t -> int64_t bitwise casting as PyTorch don't support uint64_t
   // so just fake it as a int64_t
   int64_t seed, offset;
-  if (use_dropout) {
-    std::memcpy(&seed, &rng_engine_inputs.seed_, sizeof(seed));
-    std::memcpy(&offset, &rng_engine_inputs.offset_.val, sizeof(offset));
-  }
 
-  return std::make_tuple(res, logsumexp, seed, offset);
+  DISPATCH_TYPES(query.scalar_type(), [&]() {
+    out = at::empty(
+        {B, M, num_heads, Kv},
+        query.options().dtype(CkToAtenDtype<scalar_t>::atScalarType()));
+
+    if (!use_dropout && !compute_logsumexp) { // work is inference
+      if (!seqstart_q.has_value()) { // input is batched
+        BatchedInferParams batched_infer_params;
+
+        set_batched_infer_params(batched_infer_params);
+        batched_infer<scalar_t>(batched_infer_params, stream);
+      } else { // input is grouped
+        GroupedInferParams grouped_infer_params;
+
+        set_grouped_infer_params(grouped_infer_params);
+        grouped_infer<scalar_t>(grouped_infer_params, stream);
+      }
+    } else { // work is training forward
+      if (!seqstart_q.has_value()) { // input is batched
+        BatchedForwardParams batched_forward_params;
+
+        set_batched_forward_params(batched_forward_params);
+        batched_forward<scalar_t>(batched_forward_params, stream);
+      } else { // input is grouped
+        GroupedForwardParams grouped_forward_params;
+
+        set_grouped_forward_params(grouped_forward_params);
+        grouped_forward<scalar_t>(grouped_forward_params, stream);
+      }
+
+      std::memcpy(&seed, &rng_engine_inputs.seed_, sizeof(seed));
+      std::memcpy(&offset, &rng_engine_inputs.offset_.val, sizeof(offset));
+    }
+  });
+
+  return std::make_tuple(out, logsumexp, seed, offset);
 #endif
 }
 
 // For testing in xFormers
-bool is_ck_fmha_available()
-{
-    std::cout << "ck fmha is really here!" << std::endl;
-    return(true); 
-}; 
+bool is_ck_fmha_available() {
+  std::cout << "ck fmha is really here!" << std::endl;
+  return (true);
+};
 
 } // namespace
 
diff --git a/xformers/csrc/attention/hip_fmha/attention_forward_generic.cu b/xformers/csrc/attention/hip_fmha/attention_forward_generic.cu
new file mode 100644
index 000000000..d951dbcbf
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/attention_forward_generic.cu
@@ -0,0 +1,400 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <cmath>
+#include <mutex>
+
+#include <ATen/Context.h>
+#include <ATen/ScalarOps.h>
+#include <ATen/Tensor.h>
+#include <ATen/core/Generator.h>
+#include <ATen/cuda/CUDAContext.h>
+#include <ATen/cuda/CUDAGeneratorImpl.h>
+#include <c10/cuda/CUDAGuard.h>
+#include <c10/util/Optional.h>
+#include <torch/library.h>
+
+#include "ck_fmha_util.h"
+#include "ck_fmha_batched_infer.h"
+#include "ck_fmha_batched_forward.h"
+#include "ck_fmha_grouped_infer.h"
+#include "ck_fmha_grouped_forward.h"
+
+namespace {
+
+/*
+  There are 2 modes for using this function.
+  (Mode BMHK) With all the heads having the same seqlen
+  (Mode 1MHK) `batch=1` with all tokens across batches concatenated
+*/
+std::tuple<at::Tensor, at::Tensor, int64_t, int64_t>
+efficient_attention_forward_hip(
+    const at::Tensor& query, // [b, seqlen, num_heads, K]
+    const at::Tensor& key, // [b, seqlen, num_heads, K]
+    const at::Tensor& value, // [b, seqlen, num_heads, Kv]
+    const c10::optional<at::Tensor>& bias, // [b, num_heads, seqlen, seqlen]
+    // (Mode 1MHK only) [b+1]: cu_seqlens_q[b] contains the
+    // position of the first query token for batch $b
+    const c10::optional<at::Tensor>& seqstart_q,
+    // (Mode 1MHK only) [b+1]: cu_seqlen_k[b] contains the
+    // position of the first key token for batch $b
+    const c10::optional<at::Tensor>& seqstart_k,
+    // (Mode 1MHK only) Maximum sequence length across batches
+    const c10::optional<int64_t> max_seqlen_q_,
+    double dropout_p, // attention matrix dropout probability
+    bool compute_logsumexp,
+    int64_t custom_mask_type,
+    c10::optional<double> scale,
+    const c10::optional<at::Tensor>& seqlen_k) {
+#ifdef XFORMERS_MEM_EFF_ATTENTION_DISABLE_FORWARD
+  TORCH_CHECK(
+      false,
+      "MemoryEfficient build has been disabled at build time with -DXFORMERS_MEM_EFF_ATTENTION_DISABLE_FORWARD");
+#else
+
+  TORCH_CHECK(query.dim() == 4);
+  TORCH_CHECK(key.dim() == 4);
+  TORCH_CHECK(value.dim() == 4);
+
+  // Batch sizes
+  TORCH_CHECK(query.size(0) == key.size(0));
+  TORCH_CHECK(query.size(0) == value.size(0));
+
+  // Sequence length
+  TORCH_CHECK(key.size(1) == value.size(1));
+
+  // Num heads
+  TORCH_CHECK(query.size(2) == key.size(2));
+  TORCH_CHECK(query.size(2) == value.size(2));
+
+  // Embedding per head
+  TORCH_CHECK(query.size(3) == key.size(3));
+
+  TORCH_CHECK(seqstart_q.has_value() == seqstart_k.has_value());
+  if (seqstart_q.has_value()) {
+    TORCH_CHECK(seqstart_q->scalar_type() == at::ScalarType::Int);
+    TORCH_CHECK(seqstart_k->scalar_type() == at::ScalarType::Int);
+    TORCH_CHECK(seqstart_q->dim() == 1 && seqstart_k->dim() == 1);
+    CHECK_NOSPARSE_CONTIGUOUS_CUDA((*seqstart_q));
+    CHECK_NOSPARSE_CONTIGUOUS_CUDA((*seqstart_k));
+    TORCH_CHECK(seqstart_q->size(0) == seqstart_k->size(0));
+    TORCH_CHECK(query.size(0) == 1, "cu_seqlen only supports batch_size=1");
+  };
+
+  CHECK_NOSPARSE_LASTCONTIGUOUS_CUDA(query);
+  CHECK_NOSPARSE_LASTCONTIGUOUS_CUDA(key);
+  CHECK_NOSPARSE_LASTCONTIGUOUS_CUDA(value);
+
+  // at::cuda::CUDAGuard device_guard(query.device());
+  hipStream_t stream = at::cuda::getCurrentHIPStream().stream();
+
+  int64_t B = query.size(0);
+  int64_t M = query.size(1);
+  int64_t N = key.size(1);
+  int64_t num_heads = query.size(-2);
+  int64_t K = query.size(-1);
+  int64_t Kv = value.size(-1);
+
+  at::Tensor out;
+  at::Tensor logsumexp;
+  at::Tensor randvals;
+
+  const bool use_dropout = std::fpclassify(dropout_p) != FP_ZERO;
+  at::PhiloxCudaState rng_engine_inputs;
+  if (use_dropout) {
+    at::CUDAGeneratorImpl* gen =
+        at::get_generator_or_default<at::CUDAGeneratorImpl>(
+            c10::nullopt, at::cuda::detail::getDefaultCUDAGenerator());
+
+    std::lock_guard<std::mutex> lock(gen->mutex_);
+    // if using dropout, we produce 1 random number for each element of the
+    // attention tensor
+    rng_engine_inputs = gen->philox_cuda_state(B * num_heads * M * N);
+  }
+
+  auto set_batched_infer_params = [&](BatchedInferParams& p) {
+    p.B = B;
+    p.M = M;
+    p.N = N;
+    p.num_heads = num_heads;
+    p.K = K;
+    p.Kv = Kv;
+
+    if (scale.has_value()) {
+      p.scale = float(*scale);
+    } else {
+      p.scale = float(1.0 / std::sqrt(float(K)));
+    }
+
+    p.q_ptr = query.data_ptr();
+    p.k_ptr = key.data_ptr();
+    p.v_ptr = value.data_ptr();
+    p.out_ptr = out.data_ptr();
+
+    p.q_strides = {
+        static_cast<int>(query.stride(0)),
+        static_cast<int>(query.stride(1)),
+        static_cast<int>(query.stride(2)),
+        static_cast<int>(query.stride(3))};
+    p.k_strides = {
+        static_cast<int>(key.stride(0)),
+        static_cast<int>(key.stride(1)),
+        static_cast<int>(key.stride(2)),
+        static_cast<int>(key.stride(3))};
+    p.v_strides = {
+        static_cast<int>(value.stride(0)),
+        static_cast<int>(value.stride(1)),
+        static_cast<int>(value.stride(2)),
+        static_cast<int>(value.stride(3))};
+    p.out_strides = {
+        static_cast<int>(out.stride(0)),
+        static_cast<int>(out.stride(1)),
+        static_cast<int>(out.stride(2)),
+        static_cast<int>(out.stride(3))};
+
+    if (bias.has_value()) {
+      p.attn_bias_ptr = bias->data_ptr();
+
+      const at::Tensor bias_4d_view =
+          get_bias_4d_view(*bias, B, num_heads, M, N);
+      p.attn_bias_strides = {
+          static_cast<int>(bias_4d_view.stride(0)),
+          static_cast<int>(bias_4d_view.stride(1)),
+          static_cast<int>(bias_4d_view.stride(2)),
+          static_cast<int>(bias_4d_view.stride(3))};
+    } else
+      p.attn_bias_ptr = nullptr;
+
+    p.custom_mask_type = custom_mask_type;
+  };
+
+  auto set_batched_forward_params = [&](BatchedForwardParams& p) {
+    set_batched_infer_params(p);
+
+    p.dropout_prob = static_cast<float>(dropout_p);
+
+    p.rng_engine_inputs = rng_engine_inputs;
+
+    randvals = at::empty(
+        {B, num_heads, M, N}, query.options().dtype(at::ScalarType::Short));
+    p.randvals_strides = {
+        static_cast<int>(randvals.stride(0)),
+        static_cast<int>(randvals.stride(1)),
+        static_cast<int>(randvals.stride(2)),
+        static_cast<int>(randvals.stride(3))};
+    p.randvals_ptr = randvals.data_ptr();
+
+    logsumexp = at::empty(
+        {B, num_heads, M}, query.options().dtype(at::ScalarType::Float));
+    p.logsumexp_ptr = logsumexp.data_ptr();
+  };
+
+  auto set_grouped_infer_params = [&](GroupedInferParams& p) {
+    p.num_batches = seqstart_q->size(0) - 1;
+    p.M = M;
+    p.N = N;
+    p.num_heads = num_heads;
+    p.K = K;
+    p.Kv = Kv;
+
+    if (scale.has_value()) {
+      p.scale = float(*scale);
+    } else {
+      p.scale = float(1.0 / std::sqrt(float(K)));
+    }
+
+    p.q_strides = {
+        static_cast<int>(query.stride(1)),
+        static_cast<int>(query.stride(2)),
+        static_cast<int>(query.stride(3))};
+    p.k_strides = {
+        static_cast<int>(key.stride(1)),
+        static_cast<int>(key.stride(2)),
+        static_cast<int>(key.stride(3))};
+    p.v_strides = {
+        static_cast<int>(value.stride(1)),
+        static_cast<int>(value.stride(2)),
+        static_cast<int>(value.stride(3))};
+    p.out_strides = {
+        static_cast<int>(out.stride(1)),
+        static_cast<int>(out.stride(2)),
+        static_cast<int>(out.stride(3))};
+
+    if (bias.has_value()) {
+      const at::Tensor bias_4d_view =
+          get_bias_4d_view(*bias, B, num_heads, M, N);
+      p.attn_bias_strides = {
+          static_cast<int>(bias_4d_view.stride(0)),
+          static_cast<int>(bias_4d_view.stride(1)),
+          static_cast<int>(bias_4d_view.stride(2)),
+          static_cast<int>(bias_4d_view.stride(3))};
+    };
+
+    p.custom_mask_type = custom_mask_type;
+
+    p.host_seqstart_q.resize(p.num_batches + 1);
+    p.host_seqstart_k.resize(p.num_batches + 1);
+
+    if (seqlen_k.has_value())
+      p.host_seqlen_k.resize(p.num_batches);
+
+    FMHA_HIP_CHECK(hipMemcpy(
+        p.host_seqstart_q.data(),
+        seqstart_q->data_ptr(),
+        (p.num_batches + 1) * sizeof(int32_t),
+        hipMemcpyDeviceToHost));
+    FMHA_HIP_CHECK(hipMemcpy(
+        p.host_seqstart_k.data(),
+        seqstart_k->data_ptr(),
+        (p.num_batches + 1) * sizeof(int32_t),
+        hipMemcpyDeviceToHost));
+    if (seqlen_k.has_value())
+      FMHA_HIP_CHECK(hipMemcpy(
+          p.host_seqlen_k.data(),
+          seqlen_k->data_ptr(),
+          p.num_batches * sizeof(int32_t),
+          hipMemcpyDeviceToHost));
+
+    char* q_ptr = reinterpret_cast<char*>(query.data_ptr());
+    char* k_ptr = reinterpret_cast<char*>(key.data_ptr());
+    char* v_ptr = reinterpret_cast<char*>(value.data_ptr());
+
+    char* out_ptr = reinterpret_cast<char*>(out.data_ptr());
+    char* attn_bias_ptr = reinterpret_cast<char*>(bias->data_ptr());
+
+    for (int i = 0; i < p.num_batches; i++) {
+      int32_t tmp_q_stride = get_size_in_bytes(
+          p.host_seqstart_q[i] * p.q_strides[0], query.scalar_type());
+      int32_t tmp_k_stride = get_size_in_bytes(
+          p.host_seqstart_k[i] * p.k_strides[0], key.scalar_type());
+      int32_t tmp_v_stride = get_size_in_bytes(
+          p.host_seqstart_k[i] * p.v_strides[0], value.scalar_type());
+      int32_t tmp_o_stride = get_size_in_bytes(
+          p.host_seqstart_q[i] * p.out_strides[0], out.scalar_type());
+
+      p.q_ptrs.push_back(reinterpret_cast<void*>(q_ptr));
+      q_ptr = q_ptr + tmp_q_stride;
+
+      p.k_ptrs.push_back(reinterpret_cast<void*>(k_ptr));
+      k_ptr = k_ptr + tmp_k_stride;
+
+      p.v_ptrs.push_back(reinterpret_cast<void*>(v_ptr));
+      v_ptr = v_ptr + tmp_k_stride;
+
+      p.out_ptrs.push_back(reinterpret_cast<void*>(out_ptr));
+      out_ptr = out_ptr + tmp_o_stride;
+
+      if (bias.has_value()) {
+        int32_t tmp_bias_stride = get_size_in_bytes(
+            p.host_seqstart_q[i] * p.attn_bias_strides[2] +
+                p.host_seqstart_k[i] * p.attn_bias_strides[3],
+            bias->scalar_type());
+
+        p.attn_bias_ptrs.push_back(reinterpret_cast<void*>(attn_bias_ptr));
+        attn_bias_ptr = attn_bias_ptr + tmp_bias_stride;
+      };
+    }
+  };
+
+  auto set_grouped_forward_params = [&](GroupedForwardParams& p) {
+    set_grouped_infer_params(p);
+
+    p.dropout_prob = static_cast<float>(dropout_p);
+    p.rng_engine_inputs = rng_engine_inputs;
+
+    logsumexp =
+        at::empty({num_heads, M}, query.options().dtype(at::ScalarType::Float));
+
+    randvals = at::empty(
+        {num_heads, M, N}, query.options().dtype(at::ScalarType::Short));
+    p.randvals_strides = {
+        static_cast<int>(randvals.stride(0)),
+        static_cast<int>(randvals.stride(1)),
+        static_cast<int>(randvals.stride(2))};
+
+    char* logsumexp_ptr = reinterpret_cast<char*>(logsumexp.data_ptr());
+    char* randvals_ptr = reinterpret_cast<char*>(randvals.data_ptr());
+
+    for (int i = 0; i < p.num_batches; i++) {
+      int32_t tmp_logsumexp_stride =
+          get_size_in_bytes(p.host_seqstart_q[i], logsumexp.scalar_type());
+      int32_t tmp_randvals_stride = get_size_in_bytes(
+          p.host_seqstart_q[i] * p.randvals_strides[1] +
+              p.host_seqstart_k[i] * p.randvals_strides[2],
+          randvals.scalar_type());
+
+      p.logsumexp_ptrs.push_back(reinterpret_cast<void*>(logsumexp_ptr));
+      logsumexp_ptr = logsumexp_ptr + tmp_logsumexp_stride;
+
+      p.randvals_ptrs.push_back(reinterpret_cast<void*>(randvals_ptr));
+      randvals_ptr = randvals_ptr + tmp_randvals_stride;
+    };
+  };
+
+  // uint64_t -> int64_t bitwise casting as PyTorch don't support uint64_t
+  // so just fake it as a int64_t
+  int64_t seed, offset;
+
+  DISPATCH_TYPES(query.scalar_type(),  [&]() {
+    out = at::empty(
+        {B, M, num_heads, Kv},
+        query.options().dtype(CkToAtenDtype<scalar_t>::atScalarType()));
+
+    if (!use_dropout && !compute_logsumexp) { // work is inference
+      if (!seqstart_q.has_value()) { // input is batched
+        BatchedInferParams batched_infer_params;
+
+        set_batched_infer_params(batched_infer_params);
+        batched_infer<scalar_t>(batched_infer_params, stream);
+      } else { // input is grouped
+        GroupedInferParams grouped_infer_params;
+
+        set_grouped_infer_params(grouped_infer_params);
+        grouped_infer<scalar_t>(grouped_infer_params, stream);
+      }
+    } else { // work is training forward
+      if (!seqstart_q.has_value()) { // input is batched
+        BatchedForwardParams batched_forward_params;
+
+        set_batched_forward_params(batched_forward_params);
+        batched_forward<scalar_t>(batched_forward_params, stream)
+      } else { // input is grouped
+        GroupedForwardParams grouped_forward_params;
+
+        set_grouped_forward_params(grouped_forward_params);
+        grouped_forward<scalar_t>(grouped_forward_params, stream);
+      }
+
+      std::memcpy(&seed, &rng_engine_inputs.seed_, sizeof(seed));
+      std::memcpy(&offset, &rng_engine_inputs.offset_.val, sizeof(offset));
+    }
+  });
+
+  return std::make_tuple(out, logsumexp, seed, offset);
+#endif
+}
+
+// For testing in xFormers
+bool is_ck_fmha_available() {
+  std::cout << "ck fmha is really here!" << std::endl;
+  return (true);
+};
+
+} // namespace
+
+TORCH_LIBRARY_IMPL(xformers, CUDA, m) {
+  m.impl(
+      TORCH_SELECTIVE_NAME("xformers::efficient_attention_forward_hip"),
+      TORCH_FN(efficient_attention_forward_hip));
+}
+
+TORCH_LIBRARY_FRAGMENT(xformers, m) {
+  m.def(TORCH_SELECTIVE_SCHEMA("xformers::is_ck_fmha_available() -> bool"));
+  m.impl(
+      TORCH_SELECTIVE_NAME("xformers::is_ck_fmha_available"),
+      TORCH_FN(is_ck_fmha_available));
+}
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_backward.h b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_backward.h
new file mode 100644
index 000000000..34969a513
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_backward.h
@@ -0,0 +1,245 @@
+#pragma once
+
+#include <sstream>
+#include <stdexcept>
+
+#include <ck.hpp>
+#include <ck/tensor_operation/gpu/device/gemm_specialization.hpp>
+#include <ck/tensor_operation/gpu/device/impl/device_batched_mha_bwd_xdl_cshuffle_kloop_v1.hpp>
+#include <ck/tensor_operation/gpu/device/tensor_specialization.hpp>
+#include <ck/tensor_operation/gpu/element/element_wise_operation.hpp>
+
+#include "ck_fmha_util.h"
+
+template <typename scalar_t, int32_t custom_mask_type>
+void batched_backward_mask_type_dispatched(
+    BatchedBackwardParams& param,
+    hipStream_t stream);
+
+template <typename scalar_t>
+void batched_backward(BatchedBackwardParams& param, hipStream_t stream) {
+  if (param.custom_mask_type == 0)
+    batched_backward_mask_type_dispatched<scalar_t, 0>(param, stream);
+  else if (param.custom_mask_type == 1)
+    batched_backward_mask_type_dispatched<scalar_t, 1>(param, stream);
+  else if (param.custom_mask_type == 2)
+    batched_backward_mask_type_dispatched<scalar_t, 2>(param, stream);
+  else
+    throw std::runtime_error("Invalid custom_mask_type value");
+};
+
+template <typename scalar_t, int32_t custom_mask_type = 0>
+void batched_backward_mask_type_dispatched(
+    BatchedBackwardParams& param,
+    hipStream_t stream) {
+  using PassThrough = ck::tensor_operation::element_wise::PassThrough;
+  using Scale = ck::tensor_operation::element_wise::Scale;
+
+  using QKVElementOp = PassThrough;
+  using YElementOp = PassThrough;
+
+  using InputDataType = scalar_t;
+  using OutputDataType = scalar_t;
+  using GemmDataType = scalar_t;
+  using AccDataType = F32;
+  using ShuffleDataType = F32;
+  using LSEDataType = F32;
+  using ZDataType = unsigned short;
+  using Acc0BiasDataType = ck::Tuple<>;
+  using Acc1BiasDataType = ck::Tuple<>;
+
+  static constexpr ck::index_t NumDimG = 2;
+  static constexpr ck::index_t NumDimM = 1;
+  static constexpr ck::index_t NumDimN = 1;
+  static constexpr ck::index_t NumDimK = 1;
+  static constexpr ck::index_t NumDimO = 1;
+
+  static constexpr ck::index_t CShuffleBlockTransferScalarPerVector_NPerBlock =
+      MaxVectorSizeForType<scalar_t>::value;
+
+  static constexpr auto GemmSpec =
+      ck::tensor_operation::device::GemmSpecialization::MNKOPadding;
+  static constexpr auto MaskingSpec =
+      static_cast<ck::tensor_operation::device::MaskingSpecialization>(
+          custom_mask_type);
+
+  static constexpr auto TensorSpecQ =
+      ck::tensor_operation::device::TensorSpecialization::Default;
+  static constexpr auto TensorSpecK =
+      ck::tensor_operation::device::TensorSpecialization::Default;
+  static constexpr auto TensorSpecV =
+      ck::tensor_operation::device::TensorSpecialization::Default;
+  static constexpr auto TensorSpecY =
+      ck::tensor_operation::device::TensorSpecialization::Default;
+  static constexpr bool Deterministic = false;
+
+  using DeviceOpInstance = ck::tensor_operation::device::
+      DeviceBatchedMultiheadAttentionBackward_Kloop_Xdl_CShuffle_V1<
+          NumDimG,
+          NumDimM,
+          NumDimN,
+          NumDimK,
+          NumDimO,
+          InputDataType,
+          OutputDataType,
+          GemmDataType,
+          ZDataType,
+          LSEDataType,
+          Acc0BiasDataType,
+          Acc1BiasDataType,
+          AccDataType,
+          ShuffleDataType,
+          QKVElementOp,
+          QKVElementOp,
+          Scale,
+          QKVElementOp,
+          YElementOp,
+          GemmSpec,
+          TensorSpecQ,
+          TensorSpecK,
+          TensorSpecV,
+          TensorSpecY,
+          1,
+          256,
+          128, // MPerBlock
+          128, // NPerBlock
+          64, // KPerBlock
+          64, // Gemm1NPerBlock
+          32, // Gemm1KPerBlock
+          8, // AK1
+          8, // BK1
+          2, // B1K1
+          32, // MPerXDL
+          32, // NPerXDL
+          1, // MXdlPerWave
+          4, // NXdlPerWave
+          2, // Gemm1NXdlPerWave
+          2, // Gemm2NXdlPerWave
+          S<4, 64, 1>, // ABlockTransfer
+          S<1, 0, 2>,
+          S<1, 0, 2>,
+          2,
+          8,
+          8,
+          true,
+          S<4, 64, 1>, // BBlockTransfer
+          S<1, 0, 2>,
+          S<1, 0, 2>,
+          2,
+          8,
+          8,
+          true,
+          1, // CShuffleMXdlPerWavePerShuffle
+          2, // CShuffleNXdlPerWavePerShuffle
+          S<1,
+            32,
+            1,
+            8>, // CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock
+          CShuffleBlockTransferScalarPerVector_NPerBlock, // CShuffleBlockTransferScalarPerVector_NPerBlock
+          MaskingSpec, // MaskingSpecialization
+          Deterministic>;
+
+  std::vector<ck::index_t> q_gs_ms_ks_lengths{
+      param.B, param.num_heads, param.M, param.K};
+  std::vector<ck::index_t> q_gs_ms_ks_strides{
+      param.q_strides[0],
+      param.q_strides[2],
+      param.q_strides[1],
+      param.q_strides[3]};
+
+  std::vector<ck::index_t> k_gs_ns_ks_lengths{
+      param.B, param.num_heads, param.N, param.K};
+  std::vector<ck::index_t> k_gs_ns_ks_strides{
+      param.k_strides[0],
+      param.k_strides[2],
+      param.k_strides[1],
+      param.k_strides[3]};
+
+  std::vector<ck::index_t> v_gs_os_ns_lengths{
+      param.B, param.num_heads, param.Kv, param.N};
+  std::vector<ck::index_t> v_gs_os_ns_strides{
+      param.v_strides[0],
+      param.v_strides[2],
+      param.v_strides[3],
+      param.v_strides[1]};
+
+  std::vector<ck::index_t> y_gs_ms_os_lengths{
+      param.B, param.num_heads, param.M, param.Kv};
+  std::vector<ck::index_t> y_gs_ms_os_strides{
+      param.out_strides[0],
+      param.out_strides[2],
+      param.out_strides[1],
+      param.out_strides[3]};
+
+  std::vector<ck::index_t> ygrad_gs_ms_os_lengths{
+      param.B, param.num_heads, param.M, param.Kv};
+
+  std::vector<ck::index_t> z_gs_ms_ns_lengths{
+      param.B, param.num_heads, param.M, param.N};
+  std::vector<ck::index_t> z_gs_ms_ns_strides{
+      param.randvals_strides[0],
+      param.randvals_strides[1],
+      param.randvals_strides[2],
+      param.randvals_strides[3]};
+
+  std::vector<ck::index_t> lse_gs_ms_lengths{param.B, param.num_heads, param.M};
+
+  float alpha = 1.f / std::sqrt(param.K);
+
+  auto op = DeviceOpInstance{};
+  auto invoker = op.MakeInvoker();
+
+  auto arg_ptr = op.MakeArgumentPointer(
+      param.q_ptr,
+      param.k_ptr,
+      param.randvals_ptr,
+      param.v_ptr,
+      param.out_ptr,
+      param.logsumexp_ptr,
+      param.grad_out_ptr,
+      param.grad_q_ptr,
+      param.grad_k_ptr,
+      param.grad_v_ptr,
+      {}, // std::array<void*, 1> p_acc0_biases;
+      {}, // std::array<void*, 1> p_acc1_biases;
+      q_gs_ms_ks_lengths,
+      q_gs_ms_ks_strides,
+      k_gs_ns_ks_lengths,
+      k_gs_ns_ks_strides,
+      z_gs_ms_ns_lengths,
+      z_gs_ms_ns_strides,
+      v_gs_os_ns_lengths,
+      v_gs_os_ns_strides,
+      y_gs_ms_os_lengths,
+      y_gs_ms_os_strides,
+      lse_gs_ms_lengths,
+      {}, // std::array<std::vector<ck::index_t>,
+          // 1>{acc0_biases_gs_ms_ns_lengths},
+      {}, // std::array<std::vector<ck::index_t>,
+          // 1>{acc0_biases_gs_ms_ns_strides},
+      {}, // std::array<std::vector<ck::index_t>,
+          // 1>{acc1_biases_gs_ms_os_lengths},
+      {}, // std::array<std::vector<ck::index_t>,
+          // 1>{acc1_biases_gs_ms_os_strides},
+      QKVElementOp{},
+      QKVElementOp{},
+      Scale{alpha},
+      QKVElementOp{},
+      YElementOp{},
+      param.dropout_prob,
+      std::tuple<unsigned long long, unsigned long long>(
+          param.rng_seed, param.rng_offset));
+
+  SimpleDeviceMem workspace(op.GetWorkSpaceSize(arg_ptr.get()));
+
+  op.SetWorkSpacePointer(arg_ptr.get(), workspace.GetDeviceBuffer());
+
+  if (!op.IsSupportedArgument(arg_ptr.get())) {
+    std::ostringstream ostr;
+    ostr << op.GetTypeString() << " does not support this problem";
+
+    throw std::runtime_error(ostr.str());
+  }
+
+  (void)invoker.Run(arg_ptr.get(), StreamConfig{stream, false});
+};
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_forward.h b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_forward.h
new file mode 100644
index 000000000..f2f551ac7
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_forward.h
@@ -0,0 +1,260 @@
+#pragma once
+
+#include <sstream>
+#include <stdexcept>
+
+#include <ck/ck.hpp>
+#include <ck/tensor_operation/gpu/device/gemm_specialization.hpp>
+#include <ck/tensor_operation/gpu/device/impl/device_batched_mha_fwd_xdl_cshuffle_v1.hpp>
+#include <ck/tensor_operation/gpu/device/tensor_specialization.hpp>
+#include <ck/tensor_operation/gpu/element/element_wise_operation.hpp>
+
+#include "ck_fmha_util.h"
+
+template <typename scalar_t, int32_t custom_mask_type>
+void batched_forward_mask_type_dispatched(
+    BatchedForwardParams& param,
+    hipStream_t stream);
+
+template <typename scalar_t>
+void batched_forward(BatchedForwardParams& param, hipStream_t stream) {
+  if (param.custom_mask_type == 0)
+    batched_forward_mask_type_dispatched<scalar_t, 0>(param, stream);
+  else if (param.custom_mask_type == 1)
+    batched_forward_mask_type_dispatched<scalar_t, 1>(param, stream);
+  else if (param.custom_mask_type == 2)
+    batched_forward_mask_type_dispatched<scalar_t, 2>(param, stream);
+  else
+    throw std::runtime_error("Invalid custom_mask_type value");
+};
+
+template <typename scalar_t, int32_t custom_mask_type = 0>
+void batched_forward_mask_type_dispatched(
+    BatchedForwardParams& param,
+    hipStream_t stream) {
+  using PassThrough = ck::tensor_operation::element_wise::PassThrough;
+
+  using GemmDataType = scalar_t;
+  using ADataType = scalar_t;
+  using B0DataType = scalar_t;
+  using B1DataType = scalar_t;
+  using AccDataType = F32;
+  using CShuffleDataType = F32;
+  using CDataType = scalar_t;
+  using ZDataType = unsigned short;
+  using LSEDataType = F32;
+  using Acc0BiasDataType = ck::Tuple<>;
+  using Acc1BiasDataType = ck::Tuple<>;
+
+  static constexpr ck::index_t NumDimG = 2;
+  static constexpr ck::index_t NumDimM = 1;
+  static constexpr ck::index_t NumDimN = 1;
+  static constexpr ck::index_t NumDimK = 1;
+  static constexpr ck::index_t NumDimO = 1;
+
+  using AElementOp = PassThrough;
+  using B0ElementOp = PassThrough;
+  using Acc0ElementOp = ck::tensor_operation::element_wise::Scale;
+  using B1ElementOp = PassThrough;
+  using CElementOp = PassThrough;
+
+  static constexpr auto GemmSpec =
+      ck::tensor_operation::device::GemmSpecialization::MNKOPadding;
+  static constexpr auto MaskingSpec =
+      static_cast<ck::tensor_operation::device::MaskingSpecialization>(
+          custom_mask_type);
+
+  static constexpr auto TensorSpecA =
+      ck::tensor_operation::device::TensorSpecialization::Default;
+  static constexpr auto TensorSpecB0 =
+      ck::tensor_operation::device::TensorSpecialization::Default;
+  static constexpr auto TensorSpecB1 =
+      ck::tensor_operation::device::TensorSpecialization::Default;
+  static constexpr auto TensorSpecC =
+      ck::tensor_operation::device::TensorSpecialization::Default;
+  static constexpr bool Deterministic = false;
+
+  using DeviceOpInstance = ck::tensor_operation::device::
+      DeviceBatchedMultiheadAttentionForward_Xdl_CShuffle_V1<
+          NumDimG,
+          NumDimM,
+          NumDimN,
+          NumDimK,
+          NumDimO,
+          ADataType,
+          B0DataType,
+          B1DataType,
+          CDataType,
+          GemmDataType,
+          ZDataType,
+          LSEDataType,
+          Acc0BiasDataType,
+          Acc1BiasDataType,
+          AccDataType,
+          CShuffleDataType,
+          AElementOp,
+          B0ElementOp,
+          Acc0ElementOp,
+          B1ElementOp,
+          CElementOp,
+          GemmSpec,
+          TensorSpecA,
+          TensorSpecB0,
+          TensorSpecB1,
+          TensorSpecC,
+          1,
+          256,
+          128, // MPerBlock
+          128, // NPerBlock
+          32, // KPerBlock
+          32, // Gemm1NPerBlock
+          32, // Gemm1KPerBlock
+          8, // AK1
+          8, // BK1
+          2, // B1K1
+          32, // MPerXDL
+          32, // NPerXDL
+          1, // MXdlPerWave
+          4, // NXdlPerWave
+          1, // Gemm1NXdlPerWave
+          S<4, 64, 1>, // ABlockTransfer
+          S<1, 0, 2>,
+          S<1, 0, 2>,
+          2,
+          8,
+          8,
+          true,
+          S<4, 64, 1>, // BBlockTransfer
+          S<1, 0, 2>,
+          S<1, 0, 2>,
+          2,
+          8,
+          8,
+          true,
+          S<16, 16, 1>, // B1BlockTransfer
+          S<0, 2, 1>,
+          S<0, 2, 1>,
+          1,
+          2,
+          2,
+          false,
+          1, // CShuffleMXdlPerWavePerShuffle
+          1, // CShuffleNXdlPerWavePerShuffle
+          S<1,
+            64,
+            1,
+            4>, // CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock
+          8, // CShuffleBlockTransferScalarPerVector_NPerBlock
+          MaskingSpec, // MaskingSpecialization
+          Deterministic>;
+
+  float p_dropout = 1 - param.dropout_prob;
+  ZDataType p_dropout_in_16bits = ZDataType(std::floor(p_dropout * 65535.0));
+  float rp_dropout = 1.0 / p_dropout;
+  float alpha = 1.f / std::sqrt(param.K);
+
+  std::vector<ck::index_t> a_gs_ms_ks_lengths{
+      param.B, param.num_heads, param.M, param.K};
+  std::vector<ck::index_t> a_gs_ms_ks_strides{
+      param.q_strides[0],
+      param.q_strides[2],
+      param.q_strides[1],
+      param.q_strides[3]};
+
+  std::vector<ck::index_t> b0_gs_ns_ks_lengths{
+      param.B, param.num_heads, param.N, param.K};
+  std::vector<ck::index_t> b0_gs_ns_ks_strides{
+      param.k_strides[0],
+      param.k_strides[2],
+      param.k_strides[1],
+      param.k_strides[3]};
+
+  // to be changed to b1_gs_ns_os_lengths
+  std::vector<ck::index_t> b1_gs_os_ns_lengths{
+      param.B, param.num_heads, param.N, param.Kv};
+  std::vector<ck::index_t> b1_gs_os_ns_strides{
+      param.v_strides[0],
+      param.v_strides[2],
+      param.v_strides[3],
+      param.v_strides[1]};
+
+  std::vector<ck::index_t> c_gs_ms_os_lengths{
+      param.B, param.num_heads, param.M, param.Kv};
+  std::vector<ck::index_t> c_gs_ms_os_strides{
+      param.out_strides[0],
+      param.out_strides[2],
+      param.out_strides[1],
+      param.out_strides[3]};
+
+  std::vector<ck::index_t> z_gs_ms_ns_lengths{
+      param.B, param.num_heads, param.M, param.N};
+  std::vector<ck::index_t> z_gs_ms_ns_strides{
+      param.randvals_strides[0],
+      param.randvals_strides[1],
+      param.randvals_strides[2],
+      param.randvals_strides[3]};
+
+  std::vector<ck::index_t> lse_gs_ms_lengths{param.B, param.num_heads, param.M};
+
+  auto a_element_op = AElementOp{};
+  auto b0_element_op = B0ElementOp{};
+  auto acc0_element_op = Acc0ElementOp{alpha};
+  auto b1_element_op = B1ElementOp{};
+  auto c_element_op = CElementOp{};
+
+  // TODO, how to initialize seed, offset
+  const uint64_t seed = 1;
+  const uint64_t offset = 0;
+
+  auto op = DeviceOpInstance{};
+  auto invoker = op.MakeInvoker();
+  auto arg_ptr = op.MakeArgumentPointer(
+      param.q_ptr,
+      param.k_ptr,
+      param.v_ptr,
+      param.out_ptr,
+      param.randvals_ptr,
+      param.logsumexp_ptr,
+      {}, // std::array<void*, 1> p_acc0_biases;
+      {}, // std::array<void*, 1> p_acc1_biases;
+      a_gs_ms_ks_lengths,
+      a_gs_ms_ks_strides,
+      b0_gs_ns_ks_lengths,
+      b0_gs_ns_ks_strides,
+      b1_gs_os_ns_lengths,
+      b1_gs_os_ns_strides,
+      c_gs_ms_os_lengths,
+      c_gs_ms_os_strides,
+      z_gs_ms_ns_lengths,
+      z_gs_ms_ns_strides,
+      lse_gs_ms_lengths,
+      {}, // std::array<std::vector<ck::index_t>,
+          // 1>{acc0_biases_gs_ms_ns_lengths},
+      {}, // std::array<std::vector<ck::index_t>,
+          // 1>{acc0_biases_gs_ms_ns_strides},
+      {}, // std::array<std::vector<ck::index_t>,
+          // 1>{acc1_biases_gs_ms_os_lengths},
+      {}, // std::array<std::vector<ck::index_t>,
+          // 1>{acc1_biases_gs_ms_os_strides},
+      a_element_op,
+      b0_element_op,
+      acc0_element_op,
+      b1_element_op,
+      c_element_op,
+      param.dropout_prob, // dropout ratio
+      {seed, offset}); // dropout random seed and offset, offset should be at
+                       // least the number of elements on a thread
+  SimpleDeviceMem workspace(op.GetWorkSpaceSize(arg_ptr.get()));
+
+  op.SetWorkSpacePointer(arg_ptr.get(), workspace.GetDeviceBuffer());
+
+  if (!op.IsSupportedArgument(arg_ptr.get())) {
+    std::ostringstream ostr;
+
+    ostr << op.GetTypeString() << " does not support this problem";
+
+    throw std::runtime_error(ostr.str());
+  }
+
+  invoker.Run(arg_ptr.get(), StreamConfig{stream, false});
+};
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_infer.h b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_infer.h
new file mode 100644
index 000000000..cc8129a80
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_infer.h
@@ -0,0 +1,224 @@
+#pragma once
+
+#include <sstream>
+#include <stdexcept>
+
+#include <ck/ck.hpp>
+#include <ck/tensor_operation/gpu/device/gemm_specialization.hpp>
+#include <ck/tensor_operation/gpu/device/impl/device_batched_gemm_softmax_gemm_permute_xdl_cshuffle.hpp>
+#include <ck/tensor_operation/gpu/device/tensor_specialization.hpp>
+#include <ck/tensor_operation/gpu/element/element_wise_operation.hpp>
+
+#include "ck_fmha_util.h"
+
+template <typename scalar_t, int32_t custom_mask_type>
+void batched_infer_mask_type_dispatched(
+    BatchedInferParams& param,
+    hipStream_t stream);
+
+template <typename scalar_t>
+void batched_infer(BatchedInferParams& param, hipStream_t stream) {
+  if (param.custom_mask_type == 0)
+    batched_infer_mask_type_dispatched<scalar_t, 0>(param, stream);
+  else if (param.custom_mask_type == 1)
+    batched_infer_mask_type_dispatched<scalar_t, 1>(param, stream);
+  else if (param.custom_mask_type == 2)
+    batched_infer_mask_type_dispatched<scalar_t, 2>(param, stream);
+  else
+    throw std::runtime_error("Invalid custom_mask_type value");
+};
+
+template <typename scalar_t, int32_t custom_mask_type = 0>
+void batched_infer_mask_type_dispatched(
+    BatchedInferParams& param,
+    hipStream_t stream) {
+  using PassThrough = ck::tensor_operation::element_wise::PassThrough;
+
+  using ADataType = scalar_t;
+  using B0DataType = scalar_t;
+  using B1DataType = scalar_t;
+  using AccDataType = F32;
+  using CShuffleDataType = F32;
+  using CDataType = scalar_t;
+  using Acc0BiasDataType = ck::Tuple<>;
+  using Acc1BiasDataType = ck::Tuple<>;
+
+  static constexpr ck::index_t NumDimG = 2;
+  static constexpr ck::index_t NumDimM = 1;
+  static constexpr ck::index_t NumDimN = 1;
+  static constexpr ck::index_t NumDimK = 1;
+  static constexpr ck::index_t NumDimO = 1;
+
+  using AElementOp = PassThrough;
+  using B0ElementOp = PassThrough;
+  using Acc0ElementOp = ck::tensor_operation::element_wise::Scale;
+  using B1ElementOp = PassThrough;
+  using CElementOp = PassThrough;
+
+  static constexpr auto GemmSpec =
+      ck::tensor_operation::device::GemmSpecialization::MNKOPadding;
+  static constexpr auto MaskingSpec =
+      static_cast<ck::tensor_operation::device::MaskingSpecialization>(
+          custom_mask_type);
+
+  static constexpr auto TensorSpecA =
+      ck::tensor_operation::device::TensorSpecialization::Default;
+  static constexpr auto TensorSpecB0 =
+      ck::tensor_operation::device::TensorSpecialization::Default;
+  static constexpr auto TensorSpecB1 =
+      ck::tensor_operation::device::TensorSpecialization::Default;
+  static constexpr auto TensorSpecC =
+      ck::tensor_operation::device::TensorSpecialization::Default;
+
+  using DeviceOpInstance = ck::tensor_operation::device::
+      DeviceBatchedGemmSoftmaxGemmPermute_Xdl_CShuffle<
+          NumDimG,
+          NumDimM,
+          NumDimN,
+          NumDimK,
+          NumDimO,
+          ADataType,
+          B0DataType,
+          B1DataType,
+          CDataType,
+          Acc0BiasDataType,
+          Acc1BiasDataType,
+          AccDataType,
+          CShuffleDataType,
+          AElementOp,
+          B0ElementOp,
+          Acc0ElementOp,
+          B1ElementOp,
+          CElementOp,
+          GemmSpec,
+          TensorSpecA,
+          TensorSpecB0,
+          TensorSpecB1,
+          TensorSpecC,
+          1,
+          256,
+          128, // MPerBlock
+          128, // NPerBlock
+          32, // KPerBlock
+          64, // Gemm1NPerBlock
+          32, // Gemm1KPerBlock
+          8, // AK1
+          8, // BK1
+          2, // B1K1
+          32, // MPerXDL
+          32, // NPerXDL
+          1, // MXdlPerWave
+          4, // NXdlPerWave
+          2, // Gemm1NXdlPerWave
+          S<4, 64, 1>, // ABlockTransfer
+          S<1, 0, 2>,
+          S<1, 0, 2>,
+          2,
+          8,
+          8,
+          true,
+          S<4, 64, 1>, // BBlockTransfer
+          S<1, 0, 2>,
+          S<1, 0, 2>,
+          2,
+          8,
+          8,
+          true,
+          S<16, 16, 1>, // B1BlockTransfer
+          S<0, 2, 1>,
+          S<0, 2, 1>,
+          1,
+          4,
+          2,
+          false,
+          1, // CShuffleMXdlPerWavePerShuffle
+          2, // CShuffleNXdlPerWavePerShuffle
+          S<1,
+            32,
+            1,
+            8>, // CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock
+          8, // CShuffleBlockTransferScalarPerVector_NPerBlock
+          MaskingSpec>; // MaskingSpecialization
+
+  std::vector<ck::index_t> a_gs_ms_ks_lengths{
+      param.B, param.num_heads, param.M, param.K};
+  std::vector<ck::index_t> a_gs_ms_ks_strides{
+      param.q_strides[0],
+      param.q_strides[2],
+      param.q_strides[1],
+      param.q_strides[3]};
+  std::vector<ck::index_t> b0_gs_ns_ks_lengths{
+      param.B, param.num_heads, param.N, param.K};
+  std::vector<ck::index_t> b0_gs_ns_ks_strides{
+      param.k_strides[0],
+      param.k_strides[2],
+      param.k_strides[1],
+      param.k_strides[3]};
+
+  // to be changed to b1_gs_ns_os_lengths
+  std::vector<ck::index_t> b1_gs_os_ns_lengths{
+      param.B, param.num_heads, param.N, param.Kv};
+  std::vector<ck::index_t> b1_gs_os_ns_strides{
+      param.v_strides[0],
+      param.v_strides[2],
+      param.v_strides[3],
+      param.v_strides[1]};
+
+  std::vector<ck::index_t> c_gs_ms_os_lengths{
+      param.B, param.num_heads, param.M, param.Kv};
+  std::vector<ck::index_t> c_gs_ms_os_strides{
+      param.out_strides[0],
+      param.out_strides[2],
+      param.out_strides[1],
+      param.out_strides[3]};
+
+  auto a_element_op = AElementOp{};
+  auto b0_element_op = B0ElementOp{};
+  auto acc0_element_op = Acc0ElementOp{1.0f};
+  auto b1_element_op = B1ElementOp{};
+  auto c_element_op = CElementOp{};
+
+  auto op = DeviceOpInstance{};
+  auto invoker = op.MakeInvoker();
+  auto arg_ptr = op.MakeArgumentPointer(
+      param.q_ptr,
+      param.k_ptr,
+      param.v_ptr,
+      param.out_ptr,
+      {}, // std::array<void*, 1> p_acc0_biases;
+      {}, // std::array<void*, 1> p_acc1_biases;
+      a_gs_ms_ks_lengths,
+      a_gs_ms_ks_strides,
+      b0_gs_ns_ks_lengths,
+      b0_gs_ns_ks_strides,
+      b1_gs_os_ns_lengths,
+      b1_gs_os_ns_strides,
+      c_gs_ms_os_lengths,
+      c_gs_ms_os_strides,
+      {}, // std::array<std::vector<ck::index_t>,
+          // 1>{acc0_biases_gs_ms_ns_lengths},
+      {}, // std::array<std::vector<ck::index_t>,
+          // 1>{acc0_biases_gs_ms_ns_strides},
+      {}, // std::array<std::vector<ck::index_t>,
+          // 1>{acc1_biases_gs_ms_os_lengths},
+      {}, // std::array<std::vector<ck::index_t>,
+          // 1>{acc1_biases_gs_ms_os_strides},
+      a_element_op,
+      b0_element_op,
+      acc0_element_op,
+      b1_element_op,
+      c_element_op);
+
+  SimpleDeviceMem workspace(op.GetWorkSpaceSize(arg_ptr.get()));
+
+  op.SetWorkSpacePointer(arg_ptr.get(), workspace.GetDeviceBuffer());
+
+  if (!op.IsSupportedArgument(arg_ptr.get())) {
+    std::ostringstream ostr;
+    ostr << op.GetTypeString() << " does not support this problem";
+
+    throw std::runtime_error(ostr.str());
+  }
+
+  (void)invoker.Run(arg_ptr.get(), StreamConfig{stream, false});
+};
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_backward.h b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_backward.h
new file mode 100644
index 000000000..fb4879fc0
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_backward.h
@@ -0,0 +1,246 @@
+#pragma once
+
+#include <sstream>
+#include <stdexcept>
+
+#include <ck/ck.hpp>
+#include <ck/tensor_operation/gpu/device/gemm_specialization.hpp>
+#include <ck/tensor_operation/gpu/device/impl/device_grouped_mha_bwd_xdl_cshuffle_kloop_v1.hpp>
+#include <ck/tensor_operation/gpu/device/impl/device_grouped_mha_bwd_xdl_cshuffle_kloop_v2.hpp>
+#include <ck/tensor_operation/gpu/device/tensor_specialization.hpp>
+#include <ck/tensor_operation/gpu/element/element_wise_operation.hpp>
+#include <ck/utility/sequence.hpp>
+
+#include "ck_fmha_util.h"
+
+template <typename scalar_t, int32_t custom_mask_type>
+void grouped_backward_mask_type_dispatched(
+    GroupedBackwardParams& param,
+    hipStream_t stream);
+
+template <typename scalar_t>
+void grouped_backward(GroupedBackwardParams& param, hipStream_t stream) {
+  if (param.custom_mask_type == 0)
+    grouped_backward_mask_type_dispatched<scalar_t, 0>(param, stream);
+  else if (param.custom_mask_type == 1)
+    grouped_backward_mask_type_dispatched<scalar_t, 1>(param, stream);
+  else if (param.custom_mask_type == 2)
+    grouped_backward_mask_type_dispatched<scalar_t, 2>(param, stream);
+  else
+    throw std::runtime_error("Invalid custom_mask_type value");
+};
+
+template <typename scalar_t, int32_t custom_mask_type = 0>
+void grouped_backward_mask_type_dispatched(
+    GroupedBackwardParams& param,
+    hipStream_t stream) {
+  using PassThrough = ck::tensor_operation::element_wise::PassThrough;
+  using Scale = ck::tensor_operation::element_wise::Scale;
+
+  using QKVElementOp = PassThrough;
+  using YElementOp = PassThrough;
+
+  using InputDataType = scalar_t;
+  using OutputDataType = scalar_t;
+  using GemmDataType = scalar_t;
+  using AccDataType = F32;
+  using ShuffleDataType = F32;
+  using LSEDataType = F32;
+  using ZDataType = unsigned short;
+  using Acc0BiasDataType = ck::Tuple<>;
+  using Acc1BiasDataType = ck::Tuple<>;
+
+  static constexpr ck::index_t NumDimG = 2;
+  static constexpr ck::index_t NumDimM = 1;
+  static constexpr ck::index_t NumDimN = 1;
+  static constexpr ck::index_t NumDimK = 1;
+  static constexpr ck::index_t NumDimO = 1;
+
+  static constexpr ck::index_t CShuffleBlockTransferScalarPerVector_NPerBlock =
+      MaxVectorSizeForType<scalar_t>::value;
+
+  static constexpr auto GemmSpec =
+      ck::tensor_operation::device::GemmSpecialization::MNKOPadding;
+  static constexpr auto MaskingSpec =
+      static_cast<ck::tensor_operation::device::MaskingSpecialization>(
+          custom_mask_type);
+
+  static constexpr auto TensorSpecQ =
+      ck::tensor_operation::device::TensorSpecialization::Default;
+  static constexpr auto TensorSpecK =
+      ck::tensor_operation::device::TensorSpecialization::Default;
+  static constexpr auto TensorSpecV =
+      ck::tensor_operation::device::TensorSpecialization::Default;
+  static constexpr auto TensorSpecY =
+      ck::tensor_operation::device::TensorSpecialization::Default;
+  static constexpr bool Deterministic = false;
+
+  using DeviceOpInstance = ck::tensor_operation::device::
+      DeviceGroupedMultiheadAttentionBackward_Kloop_Xdl_CShuffle_V1<
+          NumDimG,
+          NumDimM,
+          NumDimN,
+          NumDimK,
+          NumDimO,
+          InputDataType,
+          OutputDataType,
+          GemmDataType,
+          ZDataType,
+          LSEDataType,
+          Acc0BiasDataType,
+          Acc1BiasDataType,
+          AccDataType,
+          ShuffleDataType,
+          QKVElementOp,
+          QKVElementOp,
+          Scale,
+          QKVElementOp,
+          YElementOp,
+          GemmSpec,
+          TensorSpecQ,
+          TensorSpecK,
+          TensorSpecV,
+          TensorSpecY,
+          1,
+          256,
+          128, // MPerBlock
+          128, // NPerBlock
+          64, // KPerBlock
+          64, // Gemm1NPerBlock
+          32, // Gemm1KPerBlock
+          8, // AK1
+          8, // BK1
+          2, // B1K1
+          32, // MPerXDL
+          32, // NPerXDL
+          1, // MXdlPerWave
+          4, // NXdlPerWave
+          2, // Gemm1NXdlPerWave
+          2, // Gemm2NXdlPerWave
+          S<4, 64, 1>, // ABlockTransfer
+          S<1, 0, 2>,
+          S<1, 0, 2>,
+          2,
+          8,
+          8,
+          true,
+          S<4, 64, 1>, // BBlockTransfer
+          S<1, 0, 2>,
+          S<1, 0, 2>,
+          2,
+          8,
+          8,
+          true,
+          1, // CShuffleMXdlPerWavePerShuffle
+          2, // CShuffleNXdlPerWavePerShuffle
+          S<1,
+            32,
+            1,
+            8>, // CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock
+          CShuffleBlockTransferScalarPerVector_NPerBlock, // CShuffleBlockTransferScalarPerVector_NPerBlock
+          MaskingSpec, // MaskingSpecialization
+          Deterministic>;
+
+  std::vector<typename DeviceOpInstance::ProblemDesc> problem_descs;
+
+  for (std::size_t i = 0; i < param.num_batches; i++) {
+    int M = param.host_seqstart_q[i + 1] - param.host_seqstart_q[i]; // seqlen Q
+    int N = param.host_seqstart_k.empty()
+        ? param.host_seqstart_k[i + 1] - param.host_seqstart_k[i]
+        : param.host_seqlen_k[i];
+    int K = param.K;
+    int Kv = param.Kv;
+    int G1 = param.num_heads;
+
+    std::vector<ck::index_t> q_gs_ms_ks_lengths{1, G1, M, K};
+    std::vector<ck::index_t> q_gs_ms_ks_strides{
+        0, param.q_strides[0], param.q_strides[1], param.q_strides[2]};
+
+    std::vector<ck::index_t> k_gs_ns_ks_lengths{1, G1, N, K};
+    std::vector<ck::index_t> k_gs_ns_ks_strides{
+        0, param.k_strides[0], param.k_strides[1], param.k_strides[2]};
+
+    // to be changed to v_gs_ns_os_lengths
+    std::vector<ck::index_t> v_gs_os_ns_lengths{1, G1, Kv, N};
+    std::vector<ck::index_t> v_gs_os_ns_strides{
+        0, param.v_strides[0], param.v_strides[2], param.v_strides[1]};
+
+    std::vector<ck::index_t> y_gs_ms_os_lengths{1, G1, M, Kv};
+    std::vector<ck::index_t> y_gs_ms_os_strides{
+        0, param.out_strides[0], param.out_strides[1], param.out_strides[2]};
+
+    std::vector<ck::index_t> z_gs_ms_ns_lengths{1, G1, M, N};
+    std::vector<ck::index_t> z_gs_ms_ns_strides{
+        0,
+        param.randvals_strides[0],
+        param.randvals_strides[1],
+        param.randvals_strides[2]};
+
+    std::vector<ck::index_t> lse_gs_ms_lengths{1, G1, M};
+    std::vector<ck::index_t> lse_gs_ms_strides{0, param.M, 1};
+
+    problem_descs.push_back({
+        q_gs_ms_ks_lengths,
+        q_gs_ms_ks_strides,
+        k_gs_ns_ks_lengths,
+        k_gs_ns_ks_strides,
+        z_gs_ms_ns_lengths,
+        z_gs_ms_ns_strides,
+        v_gs_os_ns_lengths,
+        v_gs_os_ns_strides,
+        y_gs_ms_os_lengths,
+        y_gs_ms_os_strides,
+        lse_gs_ms_lengths,
+        lse_gs_ms_strides,
+        {}, // std::array<std::vector<ck::index_t>,
+            // 1>{acc0_biases_gs_ms_ns_lengths},
+        {}, // std::array<std::vector<ck::index_t>,
+            // 1>{acc0_biases_gs_ms_ns_strides},
+        {}, // std::array<std::vector<ck::index_t>,
+            // 1>{acc1_biases_gs_ms_os_lengths},
+        {}, // std::array<std::vector<ck::index_t>,
+            // 1>{acc1_biases_gs_ms_os_strides},
+    });
+  }
+
+  float alpha = 1.0f / std::sqrt(param.K);
+
+  auto op = DeviceOpInstance{};
+  auto invoker = op.MakeInvoker();
+
+  auto arg_ptr = op.MakeArgumentPointer(
+      param.q_ptrs,
+      param.k_ptrs,
+      param.randvals_ptrs,
+      param.v_ptrs,
+      param.out_ptrs,
+      param.logsumexp_ptrs,
+      param.grad_out_ptrs,
+      param.grad_q_ptrs,
+      param.grad_k_ptrs,
+      param.grad_v_ptrs,
+      {}, // std::array<void*, 1> p_acc0_biases;
+      {}, // std::array<void*, 1> p_acc1_biases;
+      problem_descs,
+      QKVElementOp{},
+      QKVElementOp{},
+      Scale{alpha},
+      QKVElementOp{},
+      YElementOp{},
+      param.dropout_prob,
+      std::tuple<unsigned long long, unsigned long long>(
+          param.rng_seed, param.rng_offset));
+
+  SimpleDeviceMem workspace(op.GetWorkSpaceSize(arg_ptr.get()));
+
+  op.SetWorkSpacePointer(arg_ptr.get(), workspace.GetDeviceBuffer());
+
+  if (!op.IsSupportedArgument(arg_ptr.get())) {
+    std::ostringstream ostr;
+    ostr << op.GetTypeString() << " does not support this problem";
+
+    throw std::runtime_error(ostr.str());
+  }
+
+  (void)invoker.Run(arg_ptr.get(), StreamConfig{stream, false});
+};
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_forward.h b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_forward.h
new file mode 100644
index 000000000..d7b980f00
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_forward.h
@@ -0,0 +1,255 @@
+#pragma once
+
+#include <sstream>
+#include <stdexcept>
+
+#include <ck/ck.hpp>
+#include <ck/tensor_operation/gpu/device/gemm_specialization.hpp>
+#include <ck/tensor_operation/gpu/device/impl/device_grouped_mha_fwd_xdl_cshuffle_v1.hpp>
+#include <ck/tensor_operation/gpu/device/tensor_specialization.hpp>
+#include <ck/tensor_operation/gpu/element/element_wise_operation.hpp>
+#include <ck/utility/sequence.hpp>
+
+#include "ck_fmha_util.h"
+
+template <typename scalar_t, int32_t custom_mask_type>
+void grouped_forward_mask_type_dispatched(
+    GroupedForwardParams& param,
+    hipStream_t stream);
+
+template <typename scalar_t>
+void grouped_forward(GroupedForwardParams& param, hipStream_t stream) {
+  if (param.custom_mask_type == 0)
+    grouped_forward_mask_type_dispatched<scalar_t, 0>(param, stream);
+  else if (param.custom_mask_type == 1)
+    grouped_forward_mask_type_dispatched<scalar_t, 1>(param, stream);
+  else if (param.custom_mask_type == 2)
+    grouped_forward_mask_type_dispatched<scalar_t, 2>(param, stream);
+  else
+    throw std::runtime_error("Invalid custom_mask_type value");
+};
+
+template <typename scalar_t, int32_t custom_mask_type = 0>
+void grouped_forward_mask_type_dispatched(
+    GroupedForwardParams& param,
+    hipStream_t stream) {
+  using PassThrough = ck::tensor_operation::element_wise::PassThrough;
+
+  using GemmDataType = scalar_t;
+  using ADataType = scalar_t;
+  using B0DataType = scalar_t;
+  using B1DataType = scalar_t;
+  using AccDataType = F32;
+  using CShuffleDataType = F32;
+  using CDataType = scalar_t;
+  using ZDataType = unsigned short;
+  using LSEDataType = F32;
+  using Acc0BiasDataType = ck::Tuple<>;
+  using Acc1BiasDataType = ck::Tuple<>;
+
+  static constexpr ck::index_t NumDimG = 2;
+  static constexpr ck::index_t NumDimM = 1;
+  static constexpr ck::index_t NumDimN = 1;
+  static constexpr ck::index_t NumDimK = 1;
+  static constexpr ck::index_t NumDimO = 1;
+
+  using AElementOp = PassThrough;
+  using B0ElementOp = PassThrough;
+  using Acc0ElementOp = ck::tensor_operation::element_wise::Scale;
+  using B1ElementOp = PassThrough;
+  using CElementOp = PassThrough;
+
+  static constexpr auto GemmSpec =
+      ck::tensor_operation::device::GemmSpecialization::MNKOPadding;
+  static constexpr auto MaskingSpec =
+      static_cast<ck::tensor_operation::device::MaskingSpecialization>(
+          custom_mask_type);
+
+  static constexpr auto TensorSpecA =
+      ck::tensor_operation::device::TensorSpecialization::Default;
+  static constexpr auto TensorSpecB0 =
+      ck::tensor_operation::device::TensorSpecialization::Default;
+  static constexpr auto TensorSpecB1 =
+      ck::tensor_operation::device::TensorSpecialization::Default;
+  static constexpr auto TensorSpecC =
+      ck::tensor_operation::device::TensorSpecialization::Default;
+  static constexpr bool Deterministic = true;
+
+  using DeviceOpInstance = ck::tensor_operation::device::
+      DeviceGroupedMultiheadAttentionForward_Xdl_CShuffle_V1<
+          NumDimG,
+          NumDimM,
+          NumDimN,
+          NumDimK,
+          NumDimO,
+          ADataType,
+          B0DataType,
+          B1DataType,
+          CDataType,
+          GemmDataType,
+          ZDataType,
+          LSEDataType,
+          Acc0BiasDataType,
+          Acc1BiasDataType,
+          AccDataType,
+          CShuffleDataType,
+          AElementOp,
+          B0ElementOp,
+          Acc0ElementOp,
+          B1ElementOp,
+          CElementOp,
+          GemmSpec,
+          TensorSpecA,
+          TensorSpecB0,
+          TensorSpecB1,
+          TensorSpecC,
+          1,
+          256,
+          128, // MPerBlock
+          128, // NPerBlock
+          32, // KPerBlock
+          32, // Gemm1NPerBlock
+          32, // Gemm1KPerBlock
+          8, // AK1
+          8, // BK1
+          2, // B1K1
+          32, // MPerXDL
+          32, // NPerXDL
+          1, // MXdlPerWave
+          4, // NXdlPerWave
+          1, // Gemm1NXdlPerWave
+          S<4, 64, 1>, // ABlockTransfer
+          S<1, 0, 2>,
+          S<1, 0, 2>,
+          2,
+          8,
+          8,
+          true,
+          S<4, 64, 1>, // BBlockTransfer
+          S<1, 0, 2>,
+          S<1, 0, 2>,
+          2,
+          8,
+          8,
+          true,
+          S<16, 16, 1>, // B1BlockTransfer
+          S<0, 2, 1>,
+          S<0, 2, 1>,
+          1,
+          2,
+          2,
+          false,
+          1, // CShuffleMXdlPerWavePerShuffle
+          1, // CShuffleNXdlPerWavePerShuffle
+          S<1,
+            64,
+            1,
+            4>, // CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock
+          8, // CShuffleBlockTransferScalarPerVector_NPerBlock
+          MaskingSpec, // MaskingSpecialization
+          Deterministic>;
+
+  std::vector<typename DeviceOpInstance::ProblemDesc> problem_descs;
+
+  for (std::size_t i = 0; i < param.num_batches; i++) {
+    int M = param.host_seqstart_q[i + 1] - param.host_seqstart_q[i];
+    int N = param.host_seqlen_k.empty()
+        ? param.host_seqstart_k[i + 1] - param.host_seqstart_k[i]
+        : param.host_seqlen_k[i];
+    int K = param.K;
+    int Kv = param.Kv;
+    int G1 = param.num_heads;
+
+    std::vector<ck::index_t> a_gs_ms_ks_lengths{1, G1, M, K};
+    std::vector<ck::index_t> a_gs_ms_ks_strides{
+        0, param.q_strides[1], param.q_strides[0], param.q_strides[2]};
+
+    std::vector<ck::index_t> b0_gs_ns_ks_lengths{1, G1, N, K};
+    std::vector<ck::index_t> b0_gs_ns_ks_strides{
+        0, param.k_strides[1], param.k_strides[0], param.k_strides[2]};
+
+    // to be changed to b1_gs_ns_os_lengths
+    std::vector<ck::index_t> b1_gs_os_ns_lengths{1, G1, Kv, N};
+    std::vector<ck::index_t> b1_gs_os_ns_strides{
+        0, param.v_strides[1], param.v_strides[2], param.v_strides[0]};
+
+    std::vector<ck::index_t> c_gs_ms_os_lengths{1, G1, M, Kv};
+    std::vector<ck::index_t> c_gs_ms_os_strides{
+        0, param.out_strides[1], param.out_strides[0], param.out_strides[2]};
+
+    std::vector<ck::index_t> z_gs_ms_ns_lengths{1, G1, M, N};
+    std::vector<ck::index_t> z_gs_ms_ns_strides{
+        0,
+        param.randvals_strides[0],
+        param.randvals_strides[1],
+        param.randvals_strides[2]};
+
+    std::vector<ck::index_t> lse_gs_ms_lengths{1, G1, M};
+    std::vector<ck::index_t> lse_gs_ms_strides{0, param.M, 1};
+
+    problem_descs.push_back(
+        {a_gs_ms_ks_lengths,
+         a_gs_ms_ks_strides,
+         b0_gs_ns_ks_lengths,
+         b0_gs_ns_ks_strides,
+         b1_gs_os_ns_lengths,
+         b1_gs_os_ns_strides,
+         c_gs_ms_os_lengths,
+         c_gs_ms_os_strides,
+         z_gs_ms_ns_lengths,
+         z_gs_ms_ns_strides,
+         lse_gs_ms_lengths,
+         lse_gs_ms_strides,
+         {}, // acc0_biases_gs_ms_ns_lengths
+         {}, // acc0_biases_gs_ms_ns_strides
+         {}, // acc1_biases_gs_ms_os_lengths
+         {}}); // acc1_biases_gs_ms_os_strides
+  }
+
+  // TODO, how to initialize seed, offset
+  const uint64_t seed = 1;
+  const uint64_t offset = 0;
+
+  float alpha = 1.0f;
+
+  auto a_element_op = AElementOp{};
+  auto b0_element_op = B0ElementOp{};
+  auto acc0_element_op = Acc0ElementOp{alpha};
+  auto b1_element_op = B1ElementOp{};
+  auto c_element_op = CElementOp{};
+
+  auto op = DeviceOpInstance{};
+  auto invoker = op.MakeInvoker();
+
+  auto arg_ptr = op.MakeArgumentPointer(
+      param.q_ptrs,
+      param.k_ptrs,
+      param.v_ptrs,
+      param.out_ptrs,
+      param.randvals_ptrs,
+      param.logsumexp_ptrs,
+      {}, // p_acc0_biases
+      {}, // p_acc1_biases
+      problem_descs,
+      a_element_op,
+      b0_element_op,
+      acc0_element_op,
+      b1_element_op,
+      c_element_op,
+      param.dropout_prob, // dropout ratio
+      {seed, offset});
+
+  SimpleDeviceMem workspace(op.GetWorkSpaceSize(arg_ptr.get()));
+
+  op.SetWorkSpacePointer(arg_ptr.get(), workspace.GetDeviceBuffer());
+
+  if (!op.IsSupportedArgument(arg_ptr.get())) {
+    std::ostringstream ostr;
+
+    ostr << op.GetTypeString() << " does not support this problem";
+
+    throw std::runtime_error(ostr.str());
+  }
+
+  (void)invoker.Run(arg_ptr.get(), StreamConfig{stream, false});
+};
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_infer.h b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_infer.h
new file mode 100644
index 000000000..741d6656c
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_infer.h
@@ -0,0 +1,223 @@
+#pragma once
+
+#include <sstream>
+#include <stdexcept>
+
+#include <ck/ck.hpp>
+#include <ck/tensor_operation/gpu/device/gemm_specialization.hpp>
+#include <ck/tensor_operation/gpu/device/impl/device_grouped_gemm_softmax_gemm_permute_xdl_cshuffle.hpp>
+#include <ck/tensor_operation/gpu/device/tensor_specialization.hpp>
+#include <ck/tensor_operation/gpu/element/element_wise_operation.hpp>
+#include <ck/utility/sequence.hpp>
+
+#include "ck_fmha_util.h"
+
+template <typename scalar_t, int32_t custom_mask_typ>
+void grouped_infer_mask_type_dispatched(
+    GroupedInferParams& param,
+    hipStream_t stream);
+
+template <typename scalar_t>
+void grouped_infer(GroupedInferParams& param, hipStream_t stream) {
+  if (param.custom_mask_type == 0)
+    grouped_infer_mask_type_dispatched<scalar_t, 0>(param, stream);
+  else if (param.custom_mask_type == 1)
+    grouped_infer_mask_type_dispatched<scalar_t, 1>(param, stream);
+  else if (param.custom_mask_type == 2)
+    grouped_infer_mask_type_dispatched<scalar_t, 2>(param, stream);
+  else
+    throw std::runtime_error("Invalid custom_mask_type value");
+};
+
+template <typename scalar_t, int32_t custom_mask_type = 0>
+void grouped_infer_mask_type_dispatched(
+    GroupedInferParams& param,
+    hipStream_t stream) {
+  using PassThrough = ck::tensor_operation::element_wise::PassThrough;
+
+  using ADataType = scalar_t;
+  using B0DataType = scalar_t;
+  using B1DataType = scalar_t;
+  using AccDataType = F32;
+  using CShuffleDataType = F32;
+  using CDataType = scalar_t;
+  using Acc0BiasDataType = ck::Tuple<>;
+  using Acc1BiasDataType = ck::Tuple<>;
+
+  static constexpr ck::index_t NumDimG = 2;
+  static constexpr ck::index_t NumDimM = 1;
+  static constexpr ck::index_t NumDimN = 1;
+  static constexpr ck::index_t NumDimK = 1;
+  static constexpr ck::index_t NumDimO = 1;
+
+  using AElementOp = PassThrough;
+  using B0ElementOp = PassThrough;
+  using Acc0ElementOp = ck::tensor_operation::element_wise::Scale;
+  using B1ElementOp = PassThrough;
+  using CElementOp = PassThrough;
+
+  static constexpr auto GemmSpec =
+      ck::tensor_operation::device::GemmSpecialization::MNKOPadding;
+  static constexpr auto MaskingSpec =
+      static_cast<ck::tensor_operation::device::MaskingSpecialization>(
+          custom_mask_type);
+
+  static constexpr auto TensorSpecA =
+      ck::tensor_operation::device::TensorSpecialization::Default;
+  static constexpr auto TensorSpecB0 =
+      ck::tensor_operation::device::TensorSpecialization::Default;
+  static constexpr auto TensorSpecB1 =
+      ck::tensor_operation::device::TensorSpecialization::Default;
+  static constexpr auto TensorSpecC =
+      ck::tensor_operation::device::TensorSpecialization::Default;
+
+  using DeviceOpInstance = ck::tensor_operation::device::
+      DeviceGroupedGemmSoftmaxGemmPermute_Xdl_CShuffle<
+          NumDimG,
+          NumDimM,
+          NumDimN,
+          NumDimK,
+          NumDimO,
+          ADataType,
+          B0DataType,
+          B1DataType,
+          CDataType,
+          Acc0BiasDataType,
+          Acc1BiasDataType,
+          AccDataType,
+          CShuffleDataType,
+          AElementOp,
+          B0ElementOp,
+          Acc0ElementOp,
+          B1ElementOp,
+          CElementOp,
+          GemmSpec,
+          TensorSpecA,
+          TensorSpecB0,
+          TensorSpecB1,
+          TensorSpecC,
+          1,
+          256,
+          128, // MPerBlock
+          128, // NPerBlock
+          32, // KPerBlock
+          64, // Gemm1NPerBlock
+          32, // Gemm1KPerBlock
+          8, // AK1
+          8, // BK1
+          2, // B1K1
+          32, // MPerXDL
+          32, // NPerXDL
+          1, // MXdlPerWave
+          4, // NXdlPerWave
+          2, // Gemm1NXdlPerWave
+          S<4, 64, 1>, // ABlockTransfer
+          S<1, 0, 2>,
+          S<1, 0, 2>,
+          2,
+          8,
+          8,
+          true,
+          S<4, 64, 1>, // BBlockTransfer
+          S<1, 0, 2>,
+          S<1, 0, 2>,
+          2,
+          8,
+          8,
+          true,
+          S<16, 16, 1>, // B1BlockTransfer
+          S<0, 2, 1>,
+          S<0, 2, 1>,
+          1,
+          4,
+          2,
+          false,
+          1, // CShuffleMXdlPerWavePerShuffle
+          2, // CShuffleNXdlPerWavePerShuffle
+          S<1,
+            32,
+            1,
+            8>, // CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock
+          8, // CShuffleBlockTransferScalarPerVector_NPerBlock
+          MaskingSpec>; // MaskingSpecialization
+
+  std::vector<typename DeviceOpInstance::ProblemDesc> problem_descs;
+
+  for (std::size_t i = 0; i < param.num_batches; i++) {
+    int M = param.host_seqstart_q[i + 1] - param.host_seqstart_q[i];
+    int N = param.host_seqlen_k.empty()
+        ? param.host_seqstart_k[i + 1] - param.host_seqstart_k[i]
+        : param.host_seqlen_k[i];
+    int K = param.K;
+    int Kv = param.Kv;
+    int G0 = 1;
+    int G1 = param.num_heads;
+
+    std::vector<ck::index_t> a_gs_ms_ks_lengths{G0, G1, M, K};
+    std::vector<ck::index_t> a_gs_ms_ks_strides{
+        0, param.q_strides[1], param.q_strides[0], param.q_strides[2]};
+
+    std::vector<ck::index_t> b0_gs_ns_ks_lengths{G0, G1, N, K};
+    std::vector<ck::index_t> b0_gs_ns_ks_strides{
+        0, param.k_strides[1], param.k_strides[0], param.k_strides[2]};
+
+    // to be changed to b1_gs_ns_os_lengths
+    std::vector<ck::index_t> b1_gs_os_ns_lengths{G0, G1, Kv, N};
+    std::vector<ck::index_t> b1_gs_os_ns_strides = {
+        0, param.v_strides[1], param.v_strides[2], param.v_strides[0]};
+
+    std::vector<ck::index_t> c_gs_ms_os_lengths{G0, G1, M, Kv};
+    std::vector<ck::index_t> c_gs_ms_os_strides = {
+        0, param.out_strides[1], param.out_strides[0], param.out_strides[2]};
+
+    problem_descs.push_back(
+        {a_gs_ms_ks_lengths,
+         a_gs_ms_ks_strides,
+         b0_gs_ns_ks_lengths,
+         b0_gs_ns_ks_strides,
+         b1_gs_os_ns_lengths,
+         b1_gs_os_ns_strides,
+         c_gs_ms_os_lengths,
+         c_gs_ms_os_strides,
+         {}, // acc0_biases_gs_ms_ns_lengths
+         {}, // acc0_biases_gs_ms_ns_strides
+         {}, // acc1_biases_gs_ms_os_lengths
+         {}}); // acc1_biases_gs_ms_os_strides
+  }
+
+  auto a_element_op = AElementOp{};
+  auto b0_element_op = B0ElementOp{};
+  auto acc0_element_op = Acc0ElementOp{1.0f};
+  auto b1_element_op = B1ElementOp{};
+  auto c_element_op = CElementOp{};
+
+  auto op = DeviceOpInstance{};
+  auto invoker = op.MakeInvoker();
+  auto arg_ptr = op.MakeArgumentPointer(
+      param.q_ptrs,
+      param.k_ptrs,
+      param.v_ptrs,
+      param.out_ptrs,
+      {}, // p_acc0_biases
+      {}, // p_acc1_biases
+      problem_descs,
+      a_element_op,
+      b0_element_op,
+      acc0_element_op,
+      b1_element_op,
+      c_element_op);
+
+  SimpleDeviceMem workspace(op.GetWorkSpaceSize(arg_ptr.get()));
+
+  op.SetWorkSpacePointer(arg_ptr.get(), workspace.GetDeviceBuffer());
+
+  if (!op.IsSupportedArgument(arg_ptr.get())) {
+    std::ostringstream ostr;
+
+    ostr << op.GetTypeString() << " does not support this problem";
+
+    throw std::runtime_error(ostr.str());
+  }
+
+  (void)invoker.Run(arg_ptr.get(), StreamConfig{stream, false});
+};
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_util.h b/xformers/csrc/attention/hip_fmha/ck_fmha_util.h
new file mode 100644
index 000000000..8606e6a93
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_util.h
@@ -0,0 +1,369 @@
+#pragma once
+
+#include <array>
+#include <cstdint>
+#include <iostream>
+#include <sstream>
+#include <stdexcept>
+
+#include <torch/torch.h>
+
+#include <ck/ck.hpp>
+#include <ck/utility/data_type.hpp>
+#include <ck/utility/sequence.hpp>
+
+// Here flag can be a constant, variable or function call
+#define FMHA_HIP_CHECK(ret_or_call)                                          \
+  do {                                                                       \
+    hipError_t _tmpVal;                                                      \
+    if ((_tmpVal = ret_or_call) != hipSuccess) {                             \
+      std::ostringstream ostr;                                               \
+      ostr << "HIP Function Failed (" << __FILE__ << "," << __LINE__ << ") " \
+           << hipGetErrorString(_tmpVal);                                    \
+      throw std::runtime_error(ostr.str());                                  \
+    }                                                                        \
+  } while (0)
+
+#define XFORMERS_CHECK(COND, ERR)          \
+  if (!(COND)) {                           \
+    std::ostringstream ostr;               \
+    ostr << "'" #COND "' failed: " << ERR; \
+    throw std::runtime_error(ostr.str());  \
+  }
+
+#define DISPATCH_TYPES(InDataType, func)                                 \
+  {                                                                      \
+    if (InDataType == at::ScalarType::Half) {                            \
+      using scalar_t = ck::half_t;                                       \
+      func();                                                            \
+    } else if (InDataType == at::ScalarType::BFloat16) {                 \
+      using scalar_t = ck::bhalf_t;                                      \
+      func();                                                            \
+    } else {                                                             \
+      XFORMERS_CHECK(                                                    \
+          false, "Only half & bf16 input type supported at the moment"); \
+    }                                                                    \
+  }
+
+template <typename scalar_t>
+struct CkToAtenDtype;
+
+template <>
+struct CkToAtenDtype<ck::half_t> {
+  using scalar_t = ck::half_t;
+
+  static constexpr __host__ at::ScalarType atScalarType() {
+    return at::ScalarType::Half;
+  }
+};
+
+template <>
+struct CkToAtenDtype<ck::bhalf_t> {
+  using scalar_t = ck::bhalf_t;
+
+  static constexpr __host__ at::ScalarType atScalarType() {
+    return at::ScalarType::BFloat16;
+  }
+};
+
+template <>
+struct CkToAtenDtype<float> {
+  using scalar_t = float;
+
+  static constexpr __host__ at::ScalarType atScalarType() {
+    return at::ScalarType::Float;
+  }
+};
+
+#define CHECK_NOSPARSE_CONTIGUOUS_CUDA(TENSOR)                            \
+  XFORMERS_CHECK(TENSOR.is_cuda(), #TENSOR " must be a CUDA tensor");     \
+  XFORMERS_CHECK(!TENSOR.is_sparse(), #TENSOR " must be a dense tensor"); \
+  XFORMERS_CHECK(TENSOR.is_contiguous(), #TENSOR " must be contiguous");
+
+#define CHECK_NOSPARSE_LASTCONTIGUOUS_CUDA(TENSOR)                        \
+  XFORMERS_CHECK(TENSOR.is_cuda(), #TENSOR " must be a CUDA tensor");     \
+  XFORMERS_CHECK(!TENSOR.is_sparse(), #TENSOR " must be a dense tensor"); \
+  XFORMERS_CHECK(                                                         \
+      TENSOR.stride(-1) == 1, #TENSOR ": last dimension must be contiguous");
+
+static inline size_t get_size_in_bytes(size_t n, at::ScalarType dtype) {
+  if (dtype == at::ScalarType::Float) {
+    return n * 4;
+  } else if (dtype == at::ScalarType::Half) {
+    return n * 2;
+  } else if (dtype == at::ScalarType::BFloat16) {
+    return n * 2;
+  } else if (dtype == at::ScalarType::Short) {
+    return n * 2;
+  } else if (dtype == at::ScalarType::Int) {
+    return n * 4;
+  } else if (dtype == at::ScalarType::Byte) {
+    return n;
+  }
+  return 0;
+}
+
+/**
+ * kernels expect 4D bias/bias.grad with shape
+ * (batch_sz, n_heads, n_queries, n_keys). common bias shapes users may pass
+ * are:
+ * - (n_queries, n_keys)
+ * - (batch_sz * n_heads, n_queries, n_keys)
+ * - (batch_sz, n_heads, n_queries, n_keys)
+ *
+ * expand the bias as needed - be careful to only create a view with different
+ * shape/strides, no copies allowed.
+ */
+inline at::Tensor get_bias_4d_view(
+    const at::Tensor& bias,
+    int batch_sz,
+    int n_heads,
+    int n_queries,
+    int n_keys) {
+  TORCH_CHECK(
+      bias.size(-2) == n_queries,
+      "bias.size(-2) != n_queries: ",
+      bias.size(-2),
+      " != ",
+      n_queries);
+  TORCH_CHECK(
+      bias.size(-1) == n_keys,
+      "bias.size(-1) != n_keys: ",
+      bias.size(-1),
+      " != ",
+      n_keys);
+  switch (bias.dim()) {
+    case 2: // (n_queries, n_keys) - broadcast across all batches and heads
+      return bias.unsqueeze(0).unsqueeze(0).expand(
+          {batch_sz, n_heads, n_queries, n_keys});
+    case 3: // (batch_sz * n_heads, n_queries, n_keys) - just reshape
+      TORCH_CHECK(bias.size(0) == batch_sz * n_heads);
+      return bias.view({batch_sz, n_heads, n_queries, n_keys});
+    case 4: // (batch_sz, n_heads, n_queries, n_keys) - do nothing
+      TORCH_CHECK(bias.size(0) == batch_sz);
+      TORCH_CHECK(bias.size(1) == n_heads)
+      return bias;
+    default:
+      TORCH_CHECK(false, "bias can only have ndims in {2, 3, 4}");
+  }
+}
+
+template <typename scalar_t>
+struct MaxVectorSizeForType {
+  static constexpr int value = 4;
+};
+
+template <>
+struct MaxVectorSizeForType<ck::half_t> {
+  static constexpr int value = 8;
+};
+
+template <>
+struct MaxVectorSizeForType<ck::bhalf_t> {
+  static constexpr int value = 8;
+};
+
+struct SimpleDeviceMem {
+  SimpleDeviceMem() = delete;
+  SimpleDeviceMem(std::size_t mem_size) : p_mem_{} {
+    FMHA_HIP_CHECK(hipMalloc(static_cast<void**>(&p_mem_), mem_size));
+  }
+  void* GetDeviceBuffer() {
+    return p_mem_;
+  }
+  ~SimpleDeviceMem() {
+    (void)hipFree(p_mem_);
+  }
+
+  void* p_mem_;
+};
+
+struct BatchedInferParams {
+  int B; // batch size
+  int M; // seq_len for Query
+  int N; // seq_len for Key and Value
+  int num_heads; //
+  int K; // embed_dim for Query and Key
+  int Kv; // embed_dim for Value
+
+  float scale;
+
+  // BMHK mode strides
+  std::array<int, 4> q_strides;
+  std::array<int, 4> k_strides;
+  std::array<int, 4> v_strides;
+  std::array<int, 4> out_strides;
+  std::array<int, 4> attn_bias_strides; // 4d tensor_view [B, H, M, N]
+
+  const void* q_ptr;
+  const void* k_ptr;
+  const void* v_ptr;
+  const void* attn_bias_ptr;
+
+  uint8_t custom_mask_type;
+
+  void* out_ptr;
+};
+
+struct BatchedForwardParams : public BatchedInferParams {
+  float dropout_prob;
+  at::PhiloxCudaState rng_engine_inputs;
+
+  // completely contiguous
+  void* logsumexp_ptr;
+
+  // BHMN mode strides, completely contiguous
+  std::array<int32_t, 4> randvals_strides;
+  void* randvals_ptr;
+};
+
+struct GroupedInferParams {
+  int num_batches;
+  int M; // total seq_len for all queries in the batch
+  int N; // total seq_len for all keys/values in the batch
+  int num_heads; //
+  int K; // embed_dim for Query and Key
+  int Kv; // embed_dim for Value
+
+  std::vector<int> host_seqstart_q;
+  std::vector<int> host_seqstart_k;
+  std::vector<int> host_seqlen_k;
+
+  float scale;
+
+  // MHK mode strides, last-dim contiguous
+  std::array<int, 3> q_strides;
+  std::array<int, 3> k_strides;
+  std::array<int, 3> v_strides;
+  std::array<int, 3> out_strides;
+
+  // 4d tensor view [B, H, M, N]
+  std::array<int, 4> attn_bias_strides;
+
+  std::vector<const void*> q_ptrs;
+  std::vector<const void*> k_ptrs;
+  std::vector<const void*> v_ptrs;
+  std::vector<const void*> attn_bias_ptrs;
+  std::vector<void*> out_ptrs;
+
+  uint8_t custom_mask_type;
+};
+
+struct GroupedForwardParams : public GroupedInferParams {
+  float dropout_prob;
+  at::PhiloxCudaState rng_engine_inputs;
+
+  // completely contiguous
+  std::vector<void*> logsumexp_ptrs;
+
+  // HMN mode strides, completely contiguous
+  std::array<int, 3> randvals_strides;
+  std::vector<void*> randvals_ptrs;
+};
+
+struct BatchedBackwardParams {
+  int B; // batch size
+  int M; // seq_len for Query
+  int N; // seq_len for Key and Value
+  int num_heads; //
+  int K; // embed_dim for Query and Key
+  int Kv; // embed_dim for Value
+
+  float scale;
+
+  // BMHK mode strides, last-dim contiguous
+  std::array<int, 4> q_strides;
+  std::array<int, 4> k_strides;
+  std::array<int, 4> v_strides;
+  std::array<int, 4> attn_bias_strides; // 4d tensor_view [B, H, M, N]
+  std::array<int, 4> out_strides;
+
+  const void* q_ptr;
+  const void* k_ptr;
+  const void* v_ptr;
+  const void* attn_bias_ptr;
+  const void* out_ptr;
+
+  uint8_t custom_mask_type;
+
+  std::array<int, 4> grad_out_strides;
+
+  const void* grad_out_ptr;
+
+  void* grad_q_ptr;
+  void* grad_k_ptr;
+  void* grad_v_ptr;
+  // void* grad_bias_ptr;
+
+  float dropout_prob;
+  at::PhiloxCudaState rng_engine_inputs;
+
+  // completely contiguous
+  const void* logsumexp_ptr;
+
+  // BHMN mode strides, completely contiguous
+  std::array<int, 4> randvals_strides;
+  void* randvals_ptr;
+
+  int64_t rng_seed;
+  int64_t rng_offset;
+};
+
+struct GroupedBackwardParams {
+  int num_batches;
+  int M; // total seq_len for all queries in the batch
+  int N; // total seq_len for all keys/values in the batch
+  int num_heads; //
+  int K; // embed_dim for Query and Key
+  int Kv; // embed_dim for Value
+
+  std::vector<int> host_seqstart_q;
+  std::vector<int> host_seqstart_k;
+  std::vector<int> host_seqlen_k;
+
+  float scale;
+
+  // MHK mode strides, last-dim contiguous
+  std::array<int, 3> q_strides;
+  std::array<int, 3> k_strides;
+  std::array<int, 3> v_strides;
+  std::array<int, 3> out_strides;
+  // 4d tensor view [B, H, M, N]
+  std::array<int, 4> attn_bias_strides;
+
+  std::vector<const void*> q_ptrs;
+  std::vector<const void*> k_ptrs;
+  std::vector<const void*> v_ptrs;
+  std::vector<const void*> attn_bias_ptrs;
+  std::vector<const void*> out_ptrs;
+
+  uint8_t custom_mask_type;
+
+  std::array<int, 3> grad_out_strides;
+
+  std::vector<const void*> grad_out_ptrs;
+
+  std::vector<void*> grad_q_ptrs;
+  std::vector<void*> grad_k_ptrs;
+  std::vector<void*> grad_v_ptrs;
+  // std::vector<void *> grad_bias_ptrs;
+
+  float dropout_prob;
+  at::PhiloxCudaState rng_engine_inputs;
+
+  // HM mode strides, completely contiguous
+  std::vector<const void*> logsumexp_ptrs;
+
+  // HMN mode strides, completely contiguous
+  std::array<int, 3> randvals_strides;
+  std::vector<void*> randvals_ptrs;
+
+  int64_t rng_seed;
+  int64_t rng_offset;
+};
+
+// useful aliasing for making the codes easy
+template <ck::index_t... Is>
+using S = ck::Sequence<Is...>;
+
+using F32 = float;

From 88a0451806e6298987e9c82d3206fc9749f5833c Mon Sep 17 00:00:00 2001
From: Qianfeng Zhang <Qianfeng.Zhang@amd.com>
Date: Tue, 8 Aug 2023 23:10:20 +0000
Subject: [PATCH 006/641] Tiny change in setup.py

---
 setup.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/setup.py b/setup.py
index 76c0d274e..4b1cb3c3f 100644
--- a/setup.py
+++ b/setup.py
@@ -282,7 +282,6 @@ def get_extensions():
                          Path(this_dir) / 'third_party' / 'composable_kernel' / 'include' / 'ck' / 'tensor_operation' / 'gpu' / 'device',
                          Path(this_dir) / 'third_party' / 'composable_kernel' / 'include' / 'ck' / 'tensor_operation' / 'gpu' / 'device' / 'impl',
                          Path(this_dir) / 'third_party' / 'composable_kernel' / 'include' / 'ck' / 'tensor_operation' / 'gpu' / 'element',
-                         Path(this_dir) / 'third_party' / 'composable_kernel' / 'library' / 'include' / 'ck' / 'libary' / 'utility', 
                        ] 
        generator_flag = []
        cc_flag = ["-DBUILD_PYTHON_PACKAGE"]

From 4449da03a6f195304d4e106a9f4001e25e2202ff Mon Sep 17 00:00:00 2001
From: Qianfeng Zhang <Qianfeng.Zhang@amd.com>
Date: Wed, 9 Aug 2023 20:36:53 +0000
Subject: [PATCH 007/641] Modification related to the using of alpha

---
 .../hip_fmha/attention_backward_generic.cu    | 371 ----------------
 .../hip_fmha/attention_forward_generic.cpp    |   1 -
 .../hip_fmha/attention_forward_generic.cu     | 400 ------------------
 .../hip_fmha/ck_fmha_batched_backward.h       |   2 +-
 .../hip_fmha/ck_fmha_batched_forward.h        |   5 +-
 .../hip_fmha/ck_fmha_batched_infer.h          |   4 +-
 .../hip_fmha/ck_fmha_grouped_backward.h       |   2 +-
 .../hip_fmha/ck_fmha_grouped_forward.h        |   2 +-
 .../hip_fmha/ck_fmha_grouped_infer.h          |   4 +-
 9 files changed, 11 insertions(+), 780 deletions(-)
 delete mode 100644 xformers/csrc/attention/hip_fmha/attention_backward_generic.cu
 delete mode 100644 xformers/csrc/attention/hip_fmha/attention_forward_generic.cu

diff --git a/xformers/csrc/attention/hip_fmha/attention_backward_generic.cu b/xformers/csrc/attention/hip_fmha/attention_backward_generic.cu
deleted file mode 100644
index 2756763ce..000000000
--- a/xformers/csrc/attention/hip_fmha/attention_backward_generic.cu
+++ /dev/null
@@ -1,371 +0,0 @@
-/*
- * Copyright (c) Meta Platforms, Inc. and affiliates.
- * All rights reserved.
- *
- * This source code is licensed under the BSD-style license found in the
- * LICENSE file in the root directory of this source tree.
- */
-#include <cmath>
-
-#include <ATen/Context.h>
-#include <ATen/ScalarOps.h>
-#include <ATen/Tensor.h>
-#include <ATen/TensorOperators.h>
-#include <ATen/cuda/CUDAContext.h>
-#include <ATen/cuda/CUDAGeneratorImpl.h>
-#include <c10/cuda/CUDAGuard.h>
-#include <torch/library.h>
-
-#include "ck_fmha_util.h"
-#include "ck_fmha_batched_backward.h"
-#include "ck_fmha_grouped_backward.h"
-
-namespace {
-std::tuple<at::Tensor, at::Tensor, at::Tensor, at::Tensor>
-mem_efficient_attention_backward_hip(
-    const at::Tensor& grad_out,
-    const at::Tensor& query,
-    const at::Tensor& key,
-    const at::Tensor& value,
-    const c10::optional<at::Tensor>& bias, // additive attention bias
-    // (Mode 1MHK only) [b+1]: cu_seqlens_q[b] contains the
-    // position of the first query token for batch $b
-    const c10::optional<at::Tensor>& seqstart_q,
-    // (Mode 1MHK only) [b+1]: cu_seqlens_k[b] contains the
-    // position of the first key token for batch $b
-    const c10::optional<at::Tensor>& seqstart_k,
-    const c10::optional<at::Tensor>& seqlen_k,
-    const at::Tensor& logsumexp,
-    const at::Tensor& out,
-    double dropout_p, // dropout probability
-    int64_t rng_seed, // seed using for generating random numbers for dropout
-    int64_t rng_offset, // offset into random number sequence
-    int64_t custom_mask_type,
-    const c10::optional<double> scale) {
-#ifdef XFORMERS_MEM_EFF_ATTENTION_DISABLE_BACKWARD
-  TORCH_CHECK(
-      false,
-      "MemoryEfficient build has been disabled at build time with -DXFORMERS_MEM_EFF_ATTENTION_DISABLE_BACKWARD");
-#else
-  at::globalContext().alertNotDeterministic(
-      "mem_efficient_attention_backward_cutlass");
-
-  // ndim
-  TORCH_CHECK(query.dim() == grad_out.dim());
-  TORCH_CHECK(query.dim() == key.dim());
-  TORCH_CHECK(query.dim() == value.dim());
-  TORCH_CHECK(query.dim() == 4);
-
-  // batch size
-  TORCH_CHECK(query.size(0) == grad_out.size(0));
-  TORCH_CHECK(query.size(0) == key.size(0));
-  TORCH_CHECK(query.size(0) == value.size(0));
-
-  // seqlen
-  TORCH_CHECK(key.size(1) == value.size(1));
-  TORCH_CHECK(query.size(1) == grad_out.size(1));
-
-  // Num heads
-  TORCH_CHECK(query.size(2) == key.size(2));
-  TORCH_CHECK(query.size(2) == value.size(2));
-  TORCH_CHECK(query.size(2) == grad_out.size(2));
-
-  // Embedding per head
-  TORCH_CHECK(query.size(3) == key.size(3));
-  TORCH_CHECK(value.size(3) == grad_out.size(3));
-
-  // handle potentially non-contiguous grad_out through a copy
-  CHECK_NOSPARSE_CONTIGUOUS_CUDA(grad_out);
-
-  CHECK_NOSPARSE_LASTCONTIGUOUS_CUDA(query);
-  CHECK_NOSPARSE_LASTCONTIGUOUS_CUDA(key);
-  CHECK_NOSPARSE_LASTCONTIGUOUS_CUDA(value);
-
-  TORCH_CHECK(seqstart_q.has_value() == seqstart_k.has_value());
-  TORCH_CHECK(
-      !(seqstart_q.has_value() && bias.has_value()),
-      "seqstart_q + bias not supported");
-
-  if (seqstart_q.has_value()) {
-    TORCH_CHECK(seqstart_q->scalar_type() == at::ScalarType::Int);
-    TORCH_CHECK(seqstart_k->scalar_type() == at::ScalarType::Int);
-    TORCH_CHECK(seqstart_q->dim() == 1 && seqstart_k->dim() == 1);
-    CHECK_NOSPARSE_CONTIGUOUS_CUDA((*seqstart_q));
-    CHECK_NOSPARSE_CONTIGUOUS_CUDA((*seqstart_k));
-    TORCH_CHECK(seqstart_q->size(0) == seqstart_k->size(0));
-    TORCH_CHECK(query.size(0) == 1, "seqstart_q only supports batch_size=1");
-  }
-
-  at::cuda::CUDAGuard device_guard(query.device());
-  cudaStream_t stream = at::cuda::getCurrentCUDAStream();
-
-  int64_t B = query.size(0);
-  int64_t M = query.size(1);
-  int64_t N = key.size(1);
-  int64_t num_heads = query.size(2);
-  int64_t K = query.size(3);
-  int64_t Kv = value.size(3);
-
-  at::Tensor grad_q, grad_k, grad_v, grad_bias;
-
-  grad_q = at::empty(query.sizes(), query.options());
-  grad_k = at::empty(key.sizes(), key.options());
-  grad_v = at::empty(value.sizes(), value.options());
-
-  at::Tensor randvals;
-
-  at::PhiloxCudaState rng_engine_inputs(rng_seed, rng_offset);
-
-  auto set_batched_backward_params = [&](BatchedBackwardParams& p) {
-    p.B = B;
-    p.M = M;
-    p.N = N;
-    p.num_heads = num_heads;
-    p.K = K;
-    p.Kv = Kv;
-
-    if (scale.has_value()) {
-      p.scale = float(*scale);
-    } else {
-      p.scale = float(1.0 / std::sqrt(float(K)));
-    }
-
-    p.q_ptr = query.data_ptr();
-    p.k_ptr = key.data_ptr();
-    p.v_ptr = value.data_ptr();
-    p.grad_out_ptr = grad_out.data_ptr();
-    p.grad_q_ptr = grad_q.data_ptr();
-    p.grad_k_ptr = grad_k.data_ptr();
-    p.grad_v_ptr = grad_v.data_ptr();
-
-    p.q_strides = {
-        static_cast<int>(query.stride(0)),
-        static_cast<int>(query.stride(1)),
-        static_cast<int>(query.stride(2)),
-        static_cast<int>(query.stride(3))};
-    p.k_strides = {
-        static_cast<int>(key.stride(0)),
-        static_cast<int>(key.stride(1)),
-        static_cast<int>(key.stride(2)),
-        static_cast<int>(key.stride(3))};
-    p.v_strides = {
-        static_cast<int>(value.stride(0)),
-        static_cast<int>(value.stride(1)),
-        static_cast<int>(value.stride(2)),
-        static_cast<int>(value.stride(3))};
-    p.grad_out_strides = {
-        static_cast<int>(grad_out.stride(0)),
-        static_cast<int>(grad_out.stride(1)),
-        static_cast<int>(grad_out.stride(2)),
-        static_cast<int>(grad_out.stride(3))};
-
-    if (bias.has_value()) {
-      p.attn_bias_ptr = bias->data_ptr();
-
-      const at::Tensor bias_4d_view =
-          get_bias_4d_view(*bias, B, num_heads, M, N);
-
-      p.attn_bias_strides = {
-          static_cast<int>(bias_4d_view.stride(0)),
-          static_cast<int>(bias_4d_view.stride(1)),
-          static_cast<int>(bias_4d_view.stride(2)),
-          static_cast<int>(bias_4d_view.stride(3))};
-    } else
-      p.attn_bias_ptr = nullptr;
-
-    p.custom_mask_type = custom_mask_type;
-
-    p.dropout_prob = static_cast<float>(dropout_p);
-    p.rng_engine_inputs = rng_engine_inputs;
-
-    randvals = at::empty(
-        {B, num_heads, M, N}, query.options().dtype(at::ScalarType::Short));
-    p.randvals_strides = {
-        static_cast<int>(randvals.stride(0)),
-        static_cast<int>(randvals.stride(1)),
-        static_cast<int>(randvals.stride(2)),
-        static_cast<int>(randvals.stride(3))};
-    p.randvals_ptr = randvals.data_ptr();
-
-    p.logsumexp_ptr = logsumexp.data_ptr();
-  };
-
-  auto set_grouped_backward_params = [&](GroupedBackwardParams& p) {
-    p.num_batches = seqstart_q->size(0) - 1;
-    p.M = M;
-    p.N = N;
-    p.num_heads = num_heads;
-    p.K = K;
-    p.Kv = Kv;
-
-    if (scale.has_value()) {
-      p.scale = float(*scale);
-    } else {
-      p.scale = float(1.0 / std::sqrt(float(K)));
-    }
-
-    p.q_strides = {
-        static_cast<int>(query.stride(1)),
-        static_cast<int>(query.stride(2)),
-        static_cast<int>(query.stride(3))};
-    p.k_strides = {
-        static_cast<int>(key.stride(1)),
-        static_cast<int>(key.stride(2)),
-        static_cast<int>(key.stride(3))};
-    p.v_strides = {
-        static_cast<int>(value.stride(1)),
-        static_cast<int>(value.stride(2)),
-        static_cast<int>(value.stride(3))};
-    p.out_strides = {
-        static_cast<int>(out.stride(1)),
-        static_cast<int>(out.stride(2)),
-        static_cast<int>(out.stride(3))};
-
-    p.grad_out_strides = {
-        static_cast<int>(grad_out.stride(1)),
-        static_cast<int>(grad_out.stride(2)),
-        static_cast<int>(grad_out.stride(3))};
-
-    if (bias.has_value()) {
-      const at::Tensor bias_4d_view =
-          get_bias_4d_view(*bias, B, num_heads, M, N);
-      p.attn_bias_strides = {
-          static_cast<int>(bias_4d_view.stride(0)),
-          static_cast<int>(bias_4d_view.stride(1)),
-          static_cast<int>(bias_4d_view.stride(2)),
-          static_cast<int>(bias_4d_view.stride(3))};
-    };
-
-    p.dropout_prob = static_cast<float>(dropout_p);
-    p.rng_engine_inputs = rng_engine_inputs;
-
-    randvals = at::empty(
-        {num_heads, M, N}, query.options().dtype(at::ScalarType::Short));
-    p.randvals_strides = {
-        static_cast<int>(randvals.stride(0)),
-        static_cast<int>(randvals.stride(1)),
-        static_cast<int>(randvals.stride(2))};
-
-    p.custom_mask_type = custom_mask_type;
-
-    p.host_seqstart_q.resize(p.num_batches + 1);
-    p.host_seqstart_k.resize(p.num_batches + 1);
-
-    if (seqlen_k.has_value())
-      p.host_seqlen_k.resize(p.num_batches);
-
-    FMHA_HIP_CHECK(hipMemcpy(
-        p.host_seqstart_q.data(),
-        seqstart_q->data_ptr(),
-        (p.num_batches + 1) * sizeof(int),
-        hipMemcpyDeviceToHost));
-    FMHA_HIP_CHECK(hipMemcpy(
-        p.host_seqstart_k.data(),
-        seqstart_k->data_ptr(),
-        (p.num_batches + 1) * sizeof(int),
-        hipMemcpyDeviceToHost));
-    if (seqlen_k.has_value())
-      FMHA_HIP_CHECK(hipMemcpy(
-          p.host_seqlen_k.data(),
-          seqlen_k->data_ptr(),
-          p.num_batches * sizeof(int),
-          hipMemcpyDeviceToHost));
-
-    char* q_ptr = reinterpret_cast<char*>(query.data_ptr());
-    char* k_ptr = reinterpret_cast<char*>(key.data_ptr());
-    char* v_ptr = reinterpret_cast<char*>(value.data_ptr());
-
-    char* out_ptr = reinterpret_cast<char*>(out.data_ptr());
-    char* grad_out_ptr = reinterpret_cast<char*>(grad_out.data_ptr());
-    char* attn_bias_ptr = reinterpret_cast<char*>(bias->data_ptr());
-
-    char* logsumexp_ptr = reinterpret_cast<char*>(logsumexp.data_ptr());
-    char* randvals_ptr = reinterpret_cast<char*>(randvals.data_ptr());
-
-    char* grad_q_ptr = reinterpret_cast<char*>(grad_q.data_ptr());
-    char* grad_k_ptr = reinterpret_cast<char*>(grad_k.data_ptr());
-    char* grad_v_ptr = reinterpret_cast<char*>(grad_v.data_ptr());
-
-    for (int i = 0; i < p.num_batches; i++) {
-      int32_t tmp_q_stride = get_size_in_bytes(
-          p.host_seqstart_q[i] * p.q_strides[0], query.scalar_type());
-      int32_t tmp_k_stride = get_size_in_bytes(
-          p.host_seqstart_k[i] * p.k_strides[0], key.scalar_type());
-      int32_t tmp_v_stride = get_size_in_bytes(
-          p.host_seqstart_k[i] * p.v_strides[0], value.scalar_type());
-      int32_t tmp_o_stride = get_size_in_bytes(
-          p.host_seqstart_q[i] * p.out_strides[0], out.scalar_type());
-      int32_t tmp_grad_o_stride = get_size_in_bytes(
-          p.host_seqstart_q[i] * p.grad_out_strides[0],
-          grad_out_.scalar_type());
-      int32_t tmp_logsumexp_stride =
-          get_size_in_bytes(p.host_seqstart_q[i], logsumexp.scalar_type());
-      int32_t tmp_randvals_stride = get_size_in_bytes(
-          p.host_seqstart_q[i] * p.randvals_strides[1] +
-              p.host_seqstart_k[i] * p.randvals_strides[2],
-          randvals.scalar_type());
-
-      p.q_ptrs.push_back(reinterpret_cast<void*>(q_ptr));
-      p.grad_q_ptrs.push_back(reinterpret_cast<void*>(grad_q_ptr));
-
-      q_ptr = q_ptr + tmp_q_stride;
-      grad_q_ptr = grad_q_ptr + tmp_q_stride;
-
-      p.k_ptrs.push_back(reinterpret_cast<void*>(k_ptr));
-      p.grad_k_ptrs.push_back(reinterpret_cast<void*>(grad_k_ptr));
-      k_ptr = k_ptr + tmp_k_stride;
-      grad_k_ptr = grad_k_ptr + tmp_k_stride;
-
-      p.v_ptrs.push_back(reinterpret_cast<void*>(v_ptr));
-      p.grad_v_ptrs.push_back(reinterpret_cast<void*>(grad_v_ptr));
-      v_ptr = v_ptr + tmp_k_stride;
-      grad_v_ptr = grad_v_ptr + tmp_k_stride;
-
-      p.out_ptrs.push_back(reinterpret_cast<void*>(out_ptr));
-      p.grad_out_ptrs.push_back(reinterpret_cast<void*>(grad_out_ptr));
-      out_ptr = out_ptr + tmp_o_stride;
-      grad_out_ptr = grad_out_ptr + tmp_o_stride;
-
-      if (bias.has_value()) {
-        int32_t tmp_bias_stride = get_size_in_bytes(
-            p.host_seqstart_q[i] * p.attn_bias_strides[2] +
-                p.host_seqstart_k[i] * p.attn_bias_strides[3],
-            bias->scalar_type());
-
-        p.attn_bias_ptrs.push_back(reinterpret_cast<void*>(attn_bias_ptr));
-        attn_bias_ptr = attn_bias_ptr + tmp_bias_stride;
-      };
-
-      p.logsumexp_ptrs.push_back(reinterpret_cast<void*>(logsumexp_ptr));
-      logsumexp_ptr = logsumexp_ptr + tmp_logsumexp_stride;
-
-      p.randvals_ptrs.push_back(reinterpret_cast<void*>(randvals_ptr));
-      randvals_ptr = randvals_ptr + tmp_randvals_stride;
-    }
-  };
-
-  DISPATCH_TYPES(query.scalar_type(), [&]() {
-    if (!seqstart_q.has_value()) { // input is batched
-      BatchedBackwardParams batched_backward_params;
-
-      set_batched_backward_params(batched_backward_params);
-      batched_backward<scalar_t>(batched_backward_params, stream)
-    } else { // input is grouped
-      GroupedBackwardParams grouped_backward_params;
-
-      set_grouped_backward_params(grouped_backward_params);
-      grouped_backward<scalar_t>(grouped_backward_params, stream);
-    }
-  });
-
-  return std::make_tuple(grad_q, grad_k, grad_v, grad_bias);
-#endif
-} // namespace
-
-} // namespace
-
-TORCH_LIBRARY_IMPL(xformers, CUDA, m) {
-  m.impl(
-      TORCH_SELECTIVE_NAME("xformers::efficient_attention_backward_hip"),
-      TORCH_FN(mem_efficient_attention_backward_hip));
-}
diff --git a/xformers/csrc/attention/hip_fmha/attention_forward_generic.cpp b/xformers/csrc/attention/hip_fmha/attention_forward_generic.cpp
index 667d63370..e37e858cc 100644
--- a/xformers/csrc/attention/hip_fmha/attention_forward_generic.cpp
+++ b/xformers/csrc/attention/hip_fmha/attention_forward_generic.cpp
@@ -44,7 +44,6 @@ efficient_attention_forward_hip(
     // position of the first key token for batch $b
     const c10::optional<at::Tensor>& seqstart_k,
     // (Mode 1MHK only) Maximum sequence length across batches
-    const c10::optional<int64_t> max_seqlen_q_,
     double dropout_p, // attention matrix dropout probability
     bool compute_logsumexp,
     int64_t custom_mask_type,
diff --git a/xformers/csrc/attention/hip_fmha/attention_forward_generic.cu b/xformers/csrc/attention/hip_fmha/attention_forward_generic.cu
deleted file mode 100644
index d951dbcbf..000000000
--- a/xformers/csrc/attention/hip_fmha/attention_forward_generic.cu
+++ /dev/null
@@ -1,400 +0,0 @@
-/*
- * Copyright (c) Meta Platforms, Inc. and affiliates.
- * All rights reserved.
- *
- * This source code is licensed under the BSD-style license found in the
- * LICENSE file in the root directory of this source tree.
- */
-#include <cmath>
-#include <mutex>
-
-#include <ATen/Context.h>
-#include <ATen/ScalarOps.h>
-#include <ATen/Tensor.h>
-#include <ATen/core/Generator.h>
-#include <ATen/cuda/CUDAContext.h>
-#include <ATen/cuda/CUDAGeneratorImpl.h>
-#include <c10/cuda/CUDAGuard.h>
-#include <c10/util/Optional.h>
-#include <torch/library.h>
-
-#include "ck_fmha_util.h"
-#include "ck_fmha_batched_infer.h"
-#include "ck_fmha_batched_forward.h"
-#include "ck_fmha_grouped_infer.h"
-#include "ck_fmha_grouped_forward.h"
-
-namespace {
-
-/*
-  There are 2 modes for using this function.
-  (Mode BMHK) With all the heads having the same seqlen
-  (Mode 1MHK) `batch=1` with all tokens across batches concatenated
-*/
-std::tuple<at::Tensor, at::Tensor, int64_t, int64_t>
-efficient_attention_forward_hip(
-    const at::Tensor& query, // [b, seqlen, num_heads, K]
-    const at::Tensor& key, // [b, seqlen, num_heads, K]
-    const at::Tensor& value, // [b, seqlen, num_heads, Kv]
-    const c10::optional<at::Tensor>& bias, // [b, num_heads, seqlen, seqlen]
-    // (Mode 1MHK only) [b+1]: cu_seqlens_q[b] contains the
-    // position of the first query token for batch $b
-    const c10::optional<at::Tensor>& seqstart_q,
-    // (Mode 1MHK only) [b+1]: cu_seqlen_k[b] contains the
-    // position of the first key token for batch $b
-    const c10::optional<at::Tensor>& seqstart_k,
-    // (Mode 1MHK only) Maximum sequence length across batches
-    const c10::optional<int64_t> max_seqlen_q_,
-    double dropout_p, // attention matrix dropout probability
-    bool compute_logsumexp,
-    int64_t custom_mask_type,
-    c10::optional<double> scale,
-    const c10::optional<at::Tensor>& seqlen_k) {
-#ifdef XFORMERS_MEM_EFF_ATTENTION_DISABLE_FORWARD
-  TORCH_CHECK(
-      false,
-      "MemoryEfficient build has been disabled at build time with -DXFORMERS_MEM_EFF_ATTENTION_DISABLE_FORWARD");
-#else
-
-  TORCH_CHECK(query.dim() == 4);
-  TORCH_CHECK(key.dim() == 4);
-  TORCH_CHECK(value.dim() == 4);
-
-  // Batch sizes
-  TORCH_CHECK(query.size(0) == key.size(0));
-  TORCH_CHECK(query.size(0) == value.size(0));
-
-  // Sequence length
-  TORCH_CHECK(key.size(1) == value.size(1));
-
-  // Num heads
-  TORCH_CHECK(query.size(2) == key.size(2));
-  TORCH_CHECK(query.size(2) == value.size(2));
-
-  // Embedding per head
-  TORCH_CHECK(query.size(3) == key.size(3));
-
-  TORCH_CHECK(seqstart_q.has_value() == seqstart_k.has_value());
-  if (seqstart_q.has_value()) {
-    TORCH_CHECK(seqstart_q->scalar_type() == at::ScalarType::Int);
-    TORCH_CHECK(seqstart_k->scalar_type() == at::ScalarType::Int);
-    TORCH_CHECK(seqstart_q->dim() == 1 && seqstart_k->dim() == 1);
-    CHECK_NOSPARSE_CONTIGUOUS_CUDA((*seqstart_q));
-    CHECK_NOSPARSE_CONTIGUOUS_CUDA((*seqstart_k));
-    TORCH_CHECK(seqstart_q->size(0) == seqstart_k->size(0));
-    TORCH_CHECK(query.size(0) == 1, "cu_seqlen only supports batch_size=1");
-  };
-
-  CHECK_NOSPARSE_LASTCONTIGUOUS_CUDA(query);
-  CHECK_NOSPARSE_LASTCONTIGUOUS_CUDA(key);
-  CHECK_NOSPARSE_LASTCONTIGUOUS_CUDA(value);
-
-  // at::cuda::CUDAGuard device_guard(query.device());
-  hipStream_t stream = at::cuda::getCurrentHIPStream().stream();
-
-  int64_t B = query.size(0);
-  int64_t M = query.size(1);
-  int64_t N = key.size(1);
-  int64_t num_heads = query.size(-2);
-  int64_t K = query.size(-1);
-  int64_t Kv = value.size(-1);
-
-  at::Tensor out;
-  at::Tensor logsumexp;
-  at::Tensor randvals;
-
-  const bool use_dropout = std::fpclassify(dropout_p) != FP_ZERO;
-  at::PhiloxCudaState rng_engine_inputs;
-  if (use_dropout) {
-    at::CUDAGeneratorImpl* gen =
-        at::get_generator_or_default<at::CUDAGeneratorImpl>(
-            c10::nullopt, at::cuda::detail::getDefaultCUDAGenerator());
-
-    std::lock_guard<std::mutex> lock(gen->mutex_);
-    // if using dropout, we produce 1 random number for each element of the
-    // attention tensor
-    rng_engine_inputs = gen->philox_cuda_state(B * num_heads * M * N);
-  }
-
-  auto set_batched_infer_params = [&](BatchedInferParams& p) {
-    p.B = B;
-    p.M = M;
-    p.N = N;
-    p.num_heads = num_heads;
-    p.K = K;
-    p.Kv = Kv;
-
-    if (scale.has_value()) {
-      p.scale = float(*scale);
-    } else {
-      p.scale = float(1.0 / std::sqrt(float(K)));
-    }
-
-    p.q_ptr = query.data_ptr();
-    p.k_ptr = key.data_ptr();
-    p.v_ptr = value.data_ptr();
-    p.out_ptr = out.data_ptr();
-
-    p.q_strides = {
-        static_cast<int>(query.stride(0)),
-        static_cast<int>(query.stride(1)),
-        static_cast<int>(query.stride(2)),
-        static_cast<int>(query.stride(3))};
-    p.k_strides = {
-        static_cast<int>(key.stride(0)),
-        static_cast<int>(key.stride(1)),
-        static_cast<int>(key.stride(2)),
-        static_cast<int>(key.stride(3))};
-    p.v_strides = {
-        static_cast<int>(value.stride(0)),
-        static_cast<int>(value.stride(1)),
-        static_cast<int>(value.stride(2)),
-        static_cast<int>(value.stride(3))};
-    p.out_strides = {
-        static_cast<int>(out.stride(0)),
-        static_cast<int>(out.stride(1)),
-        static_cast<int>(out.stride(2)),
-        static_cast<int>(out.stride(3))};
-
-    if (bias.has_value()) {
-      p.attn_bias_ptr = bias->data_ptr();
-
-      const at::Tensor bias_4d_view =
-          get_bias_4d_view(*bias, B, num_heads, M, N);
-      p.attn_bias_strides = {
-          static_cast<int>(bias_4d_view.stride(0)),
-          static_cast<int>(bias_4d_view.stride(1)),
-          static_cast<int>(bias_4d_view.stride(2)),
-          static_cast<int>(bias_4d_view.stride(3))};
-    } else
-      p.attn_bias_ptr = nullptr;
-
-    p.custom_mask_type = custom_mask_type;
-  };
-
-  auto set_batched_forward_params = [&](BatchedForwardParams& p) {
-    set_batched_infer_params(p);
-
-    p.dropout_prob = static_cast<float>(dropout_p);
-
-    p.rng_engine_inputs = rng_engine_inputs;
-
-    randvals = at::empty(
-        {B, num_heads, M, N}, query.options().dtype(at::ScalarType::Short));
-    p.randvals_strides = {
-        static_cast<int>(randvals.stride(0)),
-        static_cast<int>(randvals.stride(1)),
-        static_cast<int>(randvals.stride(2)),
-        static_cast<int>(randvals.stride(3))};
-    p.randvals_ptr = randvals.data_ptr();
-
-    logsumexp = at::empty(
-        {B, num_heads, M}, query.options().dtype(at::ScalarType::Float));
-    p.logsumexp_ptr = logsumexp.data_ptr();
-  };
-
-  auto set_grouped_infer_params = [&](GroupedInferParams& p) {
-    p.num_batches = seqstart_q->size(0) - 1;
-    p.M = M;
-    p.N = N;
-    p.num_heads = num_heads;
-    p.K = K;
-    p.Kv = Kv;
-
-    if (scale.has_value()) {
-      p.scale = float(*scale);
-    } else {
-      p.scale = float(1.0 / std::sqrt(float(K)));
-    }
-
-    p.q_strides = {
-        static_cast<int>(query.stride(1)),
-        static_cast<int>(query.stride(2)),
-        static_cast<int>(query.stride(3))};
-    p.k_strides = {
-        static_cast<int>(key.stride(1)),
-        static_cast<int>(key.stride(2)),
-        static_cast<int>(key.stride(3))};
-    p.v_strides = {
-        static_cast<int>(value.stride(1)),
-        static_cast<int>(value.stride(2)),
-        static_cast<int>(value.stride(3))};
-    p.out_strides = {
-        static_cast<int>(out.stride(1)),
-        static_cast<int>(out.stride(2)),
-        static_cast<int>(out.stride(3))};
-
-    if (bias.has_value()) {
-      const at::Tensor bias_4d_view =
-          get_bias_4d_view(*bias, B, num_heads, M, N);
-      p.attn_bias_strides = {
-          static_cast<int>(bias_4d_view.stride(0)),
-          static_cast<int>(bias_4d_view.stride(1)),
-          static_cast<int>(bias_4d_view.stride(2)),
-          static_cast<int>(bias_4d_view.stride(3))};
-    };
-
-    p.custom_mask_type = custom_mask_type;
-
-    p.host_seqstart_q.resize(p.num_batches + 1);
-    p.host_seqstart_k.resize(p.num_batches + 1);
-
-    if (seqlen_k.has_value())
-      p.host_seqlen_k.resize(p.num_batches);
-
-    FMHA_HIP_CHECK(hipMemcpy(
-        p.host_seqstart_q.data(),
-        seqstart_q->data_ptr(),
-        (p.num_batches + 1) * sizeof(int32_t),
-        hipMemcpyDeviceToHost));
-    FMHA_HIP_CHECK(hipMemcpy(
-        p.host_seqstart_k.data(),
-        seqstart_k->data_ptr(),
-        (p.num_batches + 1) * sizeof(int32_t),
-        hipMemcpyDeviceToHost));
-    if (seqlen_k.has_value())
-      FMHA_HIP_CHECK(hipMemcpy(
-          p.host_seqlen_k.data(),
-          seqlen_k->data_ptr(),
-          p.num_batches * sizeof(int32_t),
-          hipMemcpyDeviceToHost));
-
-    char* q_ptr = reinterpret_cast<char*>(query.data_ptr());
-    char* k_ptr = reinterpret_cast<char*>(key.data_ptr());
-    char* v_ptr = reinterpret_cast<char*>(value.data_ptr());
-
-    char* out_ptr = reinterpret_cast<char*>(out.data_ptr());
-    char* attn_bias_ptr = reinterpret_cast<char*>(bias->data_ptr());
-
-    for (int i = 0; i < p.num_batches; i++) {
-      int32_t tmp_q_stride = get_size_in_bytes(
-          p.host_seqstart_q[i] * p.q_strides[0], query.scalar_type());
-      int32_t tmp_k_stride = get_size_in_bytes(
-          p.host_seqstart_k[i] * p.k_strides[0], key.scalar_type());
-      int32_t tmp_v_stride = get_size_in_bytes(
-          p.host_seqstart_k[i] * p.v_strides[0], value.scalar_type());
-      int32_t tmp_o_stride = get_size_in_bytes(
-          p.host_seqstart_q[i] * p.out_strides[0], out.scalar_type());
-
-      p.q_ptrs.push_back(reinterpret_cast<void*>(q_ptr));
-      q_ptr = q_ptr + tmp_q_stride;
-
-      p.k_ptrs.push_back(reinterpret_cast<void*>(k_ptr));
-      k_ptr = k_ptr + tmp_k_stride;
-
-      p.v_ptrs.push_back(reinterpret_cast<void*>(v_ptr));
-      v_ptr = v_ptr + tmp_k_stride;
-
-      p.out_ptrs.push_back(reinterpret_cast<void*>(out_ptr));
-      out_ptr = out_ptr + tmp_o_stride;
-
-      if (bias.has_value()) {
-        int32_t tmp_bias_stride = get_size_in_bytes(
-            p.host_seqstart_q[i] * p.attn_bias_strides[2] +
-                p.host_seqstart_k[i] * p.attn_bias_strides[3],
-            bias->scalar_type());
-
-        p.attn_bias_ptrs.push_back(reinterpret_cast<void*>(attn_bias_ptr));
-        attn_bias_ptr = attn_bias_ptr + tmp_bias_stride;
-      };
-    }
-  };
-
-  auto set_grouped_forward_params = [&](GroupedForwardParams& p) {
-    set_grouped_infer_params(p);
-
-    p.dropout_prob = static_cast<float>(dropout_p);
-    p.rng_engine_inputs = rng_engine_inputs;
-
-    logsumexp =
-        at::empty({num_heads, M}, query.options().dtype(at::ScalarType::Float));
-
-    randvals = at::empty(
-        {num_heads, M, N}, query.options().dtype(at::ScalarType::Short));
-    p.randvals_strides = {
-        static_cast<int>(randvals.stride(0)),
-        static_cast<int>(randvals.stride(1)),
-        static_cast<int>(randvals.stride(2))};
-
-    char* logsumexp_ptr = reinterpret_cast<char*>(logsumexp.data_ptr());
-    char* randvals_ptr = reinterpret_cast<char*>(randvals.data_ptr());
-
-    for (int i = 0; i < p.num_batches; i++) {
-      int32_t tmp_logsumexp_stride =
-          get_size_in_bytes(p.host_seqstart_q[i], logsumexp.scalar_type());
-      int32_t tmp_randvals_stride = get_size_in_bytes(
-          p.host_seqstart_q[i] * p.randvals_strides[1] +
-              p.host_seqstart_k[i] * p.randvals_strides[2],
-          randvals.scalar_type());
-
-      p.logsumexp_ptrs.push_back(reinterpret_cast<void*>(logsumexp_ptr));
-      logsumexp_ptr = logsumexp_ptr + tmp_logsumexp_stride;
-
-      p.randvals_ptrs.push_back(reinterpret_cast<void*>(randvals_ptr));
-      randvals_ptr = randvals_ptr + tmp_randvals_stride;
-    };
-  };
-
-  // uint64_t -> int64_t bitwise casting as PyTorch don't support uint64_t
-  // so just fake it as a int64_t
-  int64_t seed, offset;
-
-  DISPATCH_TYPES(query.scalar_type(),  [&]() {
-    out = at::empty(
-        {B, M, num_heads, Kv},
-        query.options().dtype(CkToAtenDtype<scalar_t>::atScalarType()));
-
-    if (!use_dropout && !compute_logsumexp) { // work is inference
-      if (!seqstart_q.has_value()) { // input is batched
-        BatchedInferParams batched_infer_params;
-
-        set_batched_infer_params(batched_infer_params);
-        batched_infer<scalar_t>(batched_infer_params, stream);
-      } else { // input is grouped
-        GroupedInferParams grouped_infer_params;
-
-        set_grouped_infer_params(grouped_infer_params);
-        grouped_infer<scalar_t>(grouped_infer_params, stream);
-      }
-    } else { // work is training forward
-      if (!seqstart_q.has_value()) { // input is batched
-        BatchedForwardParams batched_forward_params;
-
-        set_batched_forward_params(batched_forward_params);
-        batched_forward<scalar_t>(batched_forward_params, stream)
-      } else { // input is grouped
-        GroupedForwardParams grouped_forward_params;
-
-        set_grouped_forward_params(grouped_forward_params);
-        grouped_forward<scalar_t>(grouped_forward_params, stream);
-      }
-
-      std::memcpy(&seed, &rng_engine_inputs.seed_, sizeof(seed));
-      std::memcpy(&offset, &rng_engine_inputs.offset_.val, sizeof(offset));
-    }
-  });
-
-  return std::make_tuple(out, logsumexp, seed, offset);
-#endif
-}
-
-// For testing in xFormers
-bool is_ck_fmha_available() {
-  std::cout << "ck fmha is really here!" << std::endl;
-  return (true);
-};
-
-} // namespace
-
-TORCH_LIBRARY_IMPL(xformers, CUDA, m) {
-  m.impl(
-      TORCH_SELECTIVE_NAME("xformers::efficient_attention_forward_hip"),
-      TORCH_FN(efficient_attention_forward_hip));
-}
-
-TORCH_LIBRARY_FRAGMENT(xformers, m) {
-  m.def(TORCH_SELECTIVE_SCHEMA("xformers::is_ck_fmha_available() -> bool"));
-  m.impl(
-      TORCH_SELECTIVE_NAME("xformers::is_ck_fmha_available"),
-      TORCH_FN(is_ck_fmha_available));
-}
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_backward.h b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_backward.h
index 34969a513..b267b8590 100644
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_backward.h
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_backward.h
@@ -184,7 +184,7 @@ void batched_backward_mask_type_dispatched(
 
   std::vector<ck::index_t> lse_gs_ms_lengths{param.B, param.num_heads, param.M};
 
-  float alpha = 1.f / std::sqrt(param.K);
+  float alpha = param.scale;
 
   auto op = DeviceOpInstance{};
   auto invoker = op.MakeInvoker();
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_forward.h b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_forward.h
index f2f551ac7..1086e44cd 100644
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_forward.h
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_forward.h
@@ -148,9 +148,6 @@ void batched_forward_mask_type_dispatched(
           MaskingSpec, // MaskingSpecialization
           Deterministic>;
 
-  float p_dropout = 1 - param.dropout_prob;
-  ZDataType p_dropout_in_16bits = ZDataType(std::floor(p_dropout * 65535.0));
-  float rp_dropout = 1.0 / p_dropout;
   float alpha = 1.f / std::sqrt(param.K);
 
   std::vector<ck::index_t> a_gs_ms_ks_lengths{
@@ -196,6 +193,8 @@ void batched_forward_mask_type_dispatched(
 
   std::vector<ck::index_t> lse_gs_ms_lengths{param.B, param.num_heads, param.M};
 
+  float alpha = param.scale;
+
   auto a_element_op = AElementOp{};
   auto b0_element_op = B0ElementOp{};
   auto acc0_element_op = Acc0ElementOp{alpha};
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_infer.h b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_infer.h
index cc8129a80..58867e602 100644
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_infer.h
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_infer.h
@@ -172,9 +172,11 @@ void batched_infer_mask_type_dispatched(
       param.out_strides[1],
       param.out_strides[3]};
 
+  float alpha = param.scale;
+
   auto a_element_op = AElementOp{};
   auto b0_element_op = B0ElementOp{};
-  auto acc0_element_op = Acc0ElementOp{1.0f};
+  auto acc0_element_op = Acc0ElementOp{alpha};
   auto b1_element_op = B1ElementOp{};
   auto c_element_op = CElementOp{};
 
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_backward.h b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_backward.h
index fb4879fc0..62ce0df01 100644
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_backward.h
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_backward.h
@@ -203,7 +203,7 @@ void grouped_backward_mask_type_dispatched(
     });
   }
 
-  float alpha = 1.0f / std::sqrt(param.K);
+  float alpha = param.scale;
 
   auto op = DeviceOpInstance{};
   auto invoker = op.MakeInvoker();
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_forward.h b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_forward.h
index d7b980f00..9ba0d07a3 100644
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_forward.h
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_forward.h
@@ -210,7 +210,7 @@ void grouped_forward_mask_type_dispatched(
   const uint64_t seed = 1;
   const uint64_t offset = 0;
 
-  float alpha = 1.0f;
+  float alpha = param.scale;
 
   auto a_element_op = AElementOp{};
   auto b0_element_op = B0ElementOp{};
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_infer.h b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_infer.h
index 741d6656c..46bc95ece 100644
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_infer.h
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_infer.h
@@ -185,9 +185,11 @@ void grouped_infer_mask_type_dispatched(
          {}}); // acc1_biases_gs_ms_os_strides
   }
 
+  float alpha = param.scale;
+
   auto a_element_op = AElementOp{};
   auto b0_element_op = B0ElementOp{};
-  auto acc0_element_op = Acc0ElementOp{1.0f};
+  auto acc0_element_op = Acc0ElementOp{alpha};
   auto b1_element_op = B1ElementOp{};
   auto c_element_op = CElementOp{};
 

From 2245107202225e2d241107af417635b93f967b66 Mon Sep 17 00:00:00 2001
From: Qianfeng Zhang <Qianfeng.Zhang@amd.com>
Date: Wed, 9 Aug 2023 20:50:15 +0000
Subject: [PATCH 008/641] Tiny fix in ck_fmha_batched_forward.h

---
 xformers/csrc/attention/hip_fmha/ck_fmha_batched_forward.h | 2 --
 1 file changed, 2 deletions(-)

diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_forward.h b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_forward.h
index 1086e44cd..c5384e25b 100644
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_forward.h
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_forward.h
@@ -148,8 +148,6 @@ void batched_forward_mask_type_dispatched(
           MaskingSpec, // MaskingSpecialization
           Deterministic>;
 
-  float alpha = 1.f / std::sqrt(param.K);
-
   std::vector<ck::index_t> a_gs_ms_ks_lengths{
       param.B, param.num_heads, param.M, param.K};
   std::vector<ck::index_t> a_gs_ms_ks_strides{

From 52dff20bec89cbf05c0a6d8dd592efc1a2daeff6 Mon Sep 17 00:00:00 2001
From: Qianfeng Zhang <Qianfeng.Zhang@amd.com>
Date: Thu, 10 Aug 2023 14:46:57 +0000
Subject: [PATCH 009/641] Synchronize update in third_party/composable_kernel

---
 third_party/composable_kernel | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/third_party/composable_kernel b/third_party/composable_kernel
index 34b1c3208..d20c472f8 160000
--- a/third_party/composable_kernel
+++ b/third_party/composable_kernel
@@ -1 +1 @@
-Subproject commit 34b1c32087cd29f856a6d62bb33ba64df36e46a6
+Subproject commit d20c472f8d5a00da0934e91f3ddc16f7dd3e3ecb

From 1eb10a3861e3d7a01b1ce8f60e6f2c650233e6c7 Mon Sep 17 00:00:00 2001
From: Qianfeng Zhang <Qianfeng.Zhang@amd.com>
Date: Fri, 11 Aug 2023 21:55:39 +0000
Subject: [PATCH 010/641] Update to synchronize with the change in CK
 FlashAttentin forward to add support attention-bias

---
 .../hip_fmha/attention_backward_generic.cpp   |   2 +-
 .../hip_fmha/attention_forward_generic.cpp    | 159 ++++++------
 .../hip_fmha/ck_fmha_batched_forward.h        | 110 ++++++---
 .../hip_fmha/ck_fmha_batched_infer.h          | 226 ------------------
 .../hip_fmha/ck_fmha_grouped_forward.h        |  87 +++++--
 .../hip_fmha/ck_fmha_grouped_infer.h          | 225 -----------------
 .../csrc/attention/hip_fmha/ck_fmha_util.h    |  20 +-
 7 files changed, 237 insertions(+), 592 deletions(-)
 delete mode 100644 xformers/csrc/attention/hip_fmha/ck_fmha_batched_infer.h
 delete mode 100644 xformers/csrc/attention/hip_fmha/ck_fmha_grouped_infer.h

diff --git a/xformers/csrc/attention/hip_fmha/attention_backward_generic.cpp b/xformers/csrc/attention/hip_fmha/attention_backward_generic.cpp
index 9abfe09e8..04a1ccf2b 100644
--- a/xformers/csrc/attention/hip_fmha/attention_backward_generic.cpp
+++ b/xformers/csrc/attention/hip_fmha/attention_backward_generic.cpp
@@ -362,7 +362,7 @@ mem_efficient_attention_backward_hip(
 
   return std::make_tuple(grad_q, grad_k, grad_v, grad_bias);
 #endif
-} // namespace
+}
 
 } // namespace
 
diff --git a/xformers/csrc/attention/hip_fmha/attention_forward_generic.cpp b/xformers/csrc/attention/hip_fmha/attention_forward_generic.cpp
index e37e858cc..f8baf6a8f 100644
--- a/xformers/csrc/attention/hip_fmha/attention_forward_generic.cpp
+++ b/xformers/csrc/attention/hip_fmha/attention_forward_generic.cpp
@@ -19,9 +19,7 @@
 #include <torch/library.h>
 
 #include "ck_fmha_batched_forward.h"
-#include "ck_fmha_batched_infer.h"
 #include "ck_fmha_grouped_forward.h"
-#include "ck_fmha_grouped_infer.h"
 #include "ck_fmha_util.h"
 
 namespace {
@@ -115,7 +113,7 @@ efficient_attention_forward_hip(
     rng_engine_inputs = gen->philox_cuda_state(B * num_heads * M * N);
   }
 
-  auto set_batched_infer_params = [&](BatchedInferParams& p) {
+  auto set_batched_forward_params = [&](BatchedForwardParams& p) {
     p.B = B;
     p.M = M;
     p.N = N;
@@ -156,6 +154,7 @@ efficient_attention_forward_hip(
         static_cast<int>(out.stride(3))};
 
     if (bias.has_value()) {
+      p.has_attn_bias = true;
       p.attn_bias_ptr = bias->data_ptr();
 
       const at::Tensor bias_4d_view =
@@ -166,33 +165,41 @@ efficient_attention_forward_hip(
           static_cast<int>(bias_4d_view.stride(2)),
           static_cast<int>(bias_4d_view.stride(3))};
     } else
-      p.attn_bias_ptr = nullptr;
+      p.has_attn_bias = false;
 
     p.custom_mask_type = custom_mask_type;
-  };
 
-  auto set_batched_forward_params = [&](BatchedForwardParams& p) {
-    set_batched_infer_params(p);
+    p.use_dropout = use_dropout;
+    p.compute_logsumexp = compute_logsumexp;
 
-    p.dropout_prob = static_cast<float>(dropout_p);
+    // the following parameters are only used by training forward
+    if (p.use_dropout) {
+      p.dropout_prob = static_cast<float>(dropout_p);
 
-    p.rng_engine_inputs = rng_engine_inputs;
+      p.rng_engine_inputs = rng_engine_inputs;
 
-    randvals = at::empty(
-        {B, num_heads, M, N}, query.options().dtype(at::ScalarType::Short));
-    p.randvals_strides = {
-        static_cast<int>(randvals.stride(0)),
-        static_cast<int>(randvals.stride(1)),
-        static_cast<int>(randvals.stride(2)),
-        static_cast<int>(randvals.stride(3))};
-    p.randvals_ptr = randvals.data_ptr();
+      randvals = at::empty(
+          {B, num_heads, M, N}, query.options().dtype(at::ScalarType::Short));
+      p.randvals_strides = {
+          static_cast<int>(randvals.stride(0)),
+          static_cast<int>(randvals.stride(1)),
+          static_cast<int>(randvals.stride(2)),
+          static_cast<int>(randvals.stride(3))};
+      p.randvals_ptr = randvals.data_ptr();
+    } else {
+      p.dropout_prob = 0.0f;
+      p.randvals_ptr = nullptr;
+    };
 
-    logsumexp = at::empty(
-        {B, num_heads, M}, query.options().dtype(at::ScalarType::Float));
-    p.logsumexp_ptr = logsumexp.data_ptr();
+    if (p.compute_logsumexp) {
+      logsumexp = at::empty(
+          {B, num_heads, M}, query.options().dtype(at::ScalarType::Float));
+      p.logsumexp_ptr = logsumexp.data_ptr();
+    } else
+      p.logsumexp_ptr = nullptr;
   };
 
-  auto set_grouped_infer_params = [&](GroupedInferParams& p) {
+  auto set_grouped_forward_params = [&](GroupedForwardParams& p) {
     p.num_batches = seqstart_q->size(0) - 1;
     p.M = M;
     p.N = N;
@@ -288,6 +295,7 @@ efficient_attention_forward_hip(
       out_ptr = out_ptr + tmp_o_stride;
 
       if (bias.has_value()) {
+        p.has_attn_bias = true;
         int32_t tmp_bias_stride = get_size_in_bytes(
             p.host_seqstart_q[i] * p.attn_bias_strides[2] +
                 p.host_seqstart_k[i] * p.attn_bias_strides[3],
@@ -295,42 +303,49 @@ efficient_attention_forward_hip(
 
         p.attn_bias_ptrs.push_back(reinterpret_cast<void*>(attn_bias_ptr));
         attn_bias_ptr = attn_bias_ptr + tmp_bias_stride;
-      };
+      } else
+        p.has_attn_bias = false;
     }
-  };
 
-  auto set_grouped_forward_params = [&](GroupedForwardParams& p) {
-    set_grouped_infer_params(p);
-
-    p.dropout_prob = static_cast<float>(dropout_p);
-    p.rng_engine_inputs = rng_engine_inputs;
-
-    logsumexp =
-        at::empty({num_heads, M}, query.options().dtype(at::ScalarType::Float));
+    p.use_dropout = use_dropout;
+    p.compute_logsumexp = compute_logsumexp;
+
+    // the following parameters are only used by training forward
+    if (p.use_dropout) {
+      p.dropout_prob = static_cast<float>(dropout_p);
+      p.rng_engine_inputs = rng_engine_inputs;
+
+      randvals = at::empty(
+          {num_heads, M, N}, query.options().dtype(at::ScalarType::Short));
+      p.randvals_strides = {
+          static_cast<int>(randvals.stride(0)),
+          static_cast<int>(randvals.stride(1)),
+          static_cast<int>(randvals.stride(2))};
+      char* randvals_ptr = reinterpret_cast<char*>(randvals.data_ptr());
+
+      for (int i = 0; i < p.num_batches; i++) {
+        int32_t tmp_randvals_stride = get_size_in_bytes(
+            p.host_seqstart_q[i] * p.randvals_strides[1] +
+                p.host_seqstart_k[i] * p.randvals_strides[2],
+            randvals.scalar_type());
+
+        p.randvals_ptrs.push_back(reinterpret_cast<void*>(randvals_ptr));
+        randvals_ptr = randvals_ptr + tmp_randvals_stride;
+      };
+    };
 
-    randvals = at::empty(
-        {num_heads, M, N}, query.options().dtype(at::ScalarType::Short));
-    p.randvals_strides = {
-        static_cast<int>(randvals.stride(0)),
-        static_cast<int>(randvals.stride(1)),
-        static_cast<int>(randvals.stride(2))};
+    if (p.compute_logsumexp) {
+      logsumexp = at::empty(
+          {num_heads, M}, query.options().dtype(at::ScalarType::Float));
+      char* logsumexp_ptr = reinterpret_cast<char*>(logsumexp.data_ptr());
 
-    char* logsumexp_ptr = reinterpret_cast<char*>(logsumexp.data_ptr());
-    char* randvals_ptr = reinterpret_cast<char*>(randvals.data_ptr());
+      for (int i = 0; i < p.num_batches; i++) {
+        int32_t tmp_logsumexp_stride =
+            get_size_in_bytes(p.host_seqstart_q[i], logsumexp.scalar_type());
 
-    for (int i = 0; i < p.num_batches; i++) {
-      int32_t tmp_logsumexp_stride =
-          get_size_in_bytes(p.host_seqstart_q[i], logsumexp.scalar_type());
-      int32_t tmp_randvals_stride = get_size_in_bytes(
-          p.host_seqstart_q[i] * p.randvals_strides[1] +
-              p.host_seqstart_k[i] * p.randvals_strides[2],
-          randvals.scalar_type());
-
-      p.logsumexp_ptrs.push_back(reinterpret_cast<void*>(logsumexp_ptr));
-      logsumexp_ptr = logsumexp_ptr + tmp_logsumexp_stride;
-
-      p.randvals_ptrs.push_back(reinterpret_cast<void*>(randvals_ptr));
-      randvals_ptr = randvals_ptr + tmp_randvals_stride;
+        p.logsumexp_ptrs.push_back(reinterpret_cast<void*>(logsumexp_ptr));
+        logsumexp_ptr = logsumexp_ptr + tmp_logsumexp_stride;
+      };
     };
   };
 
@@ -343,36 +358,22 @@ efficient_attention_forward_hip(
         {B, M, num_heads, Kv},
         query.options().dtype(CkToAtenDtype<scalar_t>::atScalarType()));
 
-    if (!use_dropout && !compute_logsumexp) { // work is inference
-      if (!seqstart_q.has_value()) { // input is batched
-        BatchedInferParams batched_infer_params;
-
-        set_batched_infer_params(batched_infer_params);
-        batched_infer<scalar_t>(batched_infer_params, stream);
-      } else { // input is grouped
-        GroupedInferParams grouped_infer_params;
-
-        set_grouped_infer_params(grouped_infer_params);
-        grouped_infer<scalar_t>(grouped_infer_params, stream);
-      }
-    } else { // work is training forward
-      if (!seqstart_q.has_value()) { // input is batched
-        BatchedForwardParams batched_forward_params;
-
-        set_batched_forward_params(batched_forward_params);
-        batched_forward<scalar_t>(batched_forward_params, stream);
-      } else { // input is grouped
-        GroupedForwardParams grouped_forward_params;
-
-        set_grouped_forward_params(grouped_forward_params);
-        grouped_forward<scalar_t>(grouped_forward_params, stream);
-      }
-
-      std::memcpy(&seed, &rng_engine_inputs.seed_, sizeof(seed));
-      std::memcpy(&offset, &rng_engine_inputs.offset_.val, sizeof(offset));
+    if (!seqstart_q.has_value()) { // input is batched
+      BatchedForwardParams batched_forward_params;
+
+      set_batched_forward_params(batched_forward_params);
+      batched_forward<scalar_t>(batched_forward_params, stream);
+    } else { // input is grouped
+      GroupedForwardParams grouped_forward_params;
+
+      set_grouped_forward_params(grouped_forward_params);
+      grouped_forward<scalar_t>(grouped_forward_params, stream);
     }
   });
 
+  std::memcpy(&seed, &rng_engine_inputs.seed_, sizeof(seed));
+  std::memcpy(&offset, &rng_engine_inputs.offset_.val, sizeof(offset));
+
   return std::make_tuple(out, logsumexp, seed, offset);
 #endif
 }
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_forward.h b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_forward.h
index c5384e25b..f2fb0a69d 100644
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_forward.h
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_forward.h
@@ -5,31 +5,46 @@
 
 #include <ck/ck.hpp>
 #include <ck/tensor_operation/gpu/device/gemm_specialization.hpp>
-#include <ck/tensor_operation/gpu/device/impl/device_batched_mha_fwd_xdl_cshuffle_v1.hpp>
 #include <ck/tensor_operation/gpu/device/tensor_specialization.hpp>
 #include <ck/tensor_operation/gpu/element/element_wise_operation.hpp>
+#include "ck/tensor_operation/gpu/device/impl/device_batched_mha_fwd_xdl_cshuffle_v2.hpp"
 
 #include "ck_fmha_util.h"
 
-template <typename scalar_t, int32_t custom_mask_type>
-void batched_forward_mask_type_dispatched(
+template <typename scalar_t, int32_t custom_mask_type, bool has_attn_bias>
+void batched_forward_masktype_attnbias_dispatched(
     BatchedForwardParams& param,
     hipStream_t stream);
 
 template <typename scalar_t>
 void batched_forward(BatchedForwardParams& param, hipStream_t stream) {
-  if (param.custom_mask_type == 0)
-    batched_forward_mask_type_dispatched<scalar_t, 0>(param, stream);
-  else if (param.custom_mask_type == 1)
-    batched_forward_mask_type_dispatched<scalar_t, 1>(param, stream);
-  else if (param.custom_mask_type == 2)
-    batched_forward_mask_type_dispatched<scalar_t, 2>(param, stream);
-  else
+  if (param.custom_mask_type == 0) {
+    if (param.has_attn_bias)
+      batched_forward_masktype_attnbias_dispatched<scalar_t, 0, true>(
+          param, stream);
+    else
+      batched_forward_masktype_attnbias_dispatched<scalar_t, 0, false>(
+          param, stream);
+  } else if (param.custom_mask_type == 1) {
+    if (param.has_attn_bias)
+      batched_forward_masktype_attnbias_dispatched<scalar_t, 1, true>(
+          param, stream);
+    else
+      batched_forward_masktype_attnbias_dispatched<scalar_t, 1, false>(
+          param, stream);
+  } else if (param.custom_mask_type == 2) {
+    if (param.has_attn_bias)
+      batched_forward_masktype_attnbias_dispatched<scalar_t, 2, true>(
+          param, stream);
+    else
+      batched_forward_masktype_attnbias_dispatched<scalar_t, 2, false>(
+          param, stream);
+  } else
     throw std::runtime_error("Invalid custom_mask_type value");
 };
 
-template <typename scalar_t, int32_t custom_mask_type = 0>
-void batched_forward_mask_type_dispatched(
+template <typename scalar_t, int32_t custom_mask_type = 0, bool has_attn_bias>
+void batched_forward_masktype_attnbias_dispatched(
     BatchedForwardParams& param,
     hipStream_t stream) {
   using PassThrough = ck::tensor_operation::element_wise::PassThrough;
@@ -43,7 +58,8 @@ void batched_forward_mask_type_dispatched(
   using CDataType = scalar_t;
   using ZDataType = unsigned short;
   using LSEDataType = F32;
-  using Acc0BiasDataType = ck::Tuple<>;
+  using Acc0BiasDataType = typename std::
+      conditional<has_attn_bias, ck::Tuple<scalar_t>, ck::Tuple<>>::type;
   using Acc1BiasDataType = ck::Tuple<>;
 
   static constexpr ck::index_t NumDimG = 2;
@@ -75,7 +91,7 @@ void batched_forward_mask_type_dispatched(
   static constexpr bool Deterministic = false;
 
   using DeviceOpInstance = ck::tensor_operation::device::
-      DeviceBatchedMultiheadAttentionForward_Xdl_CShuffle_V1<
+      DeviceBatchedMultiheadAttentionForward_Xdl_CShuffle_V2<
           NumDimG,
           NumDimM,
           NumDimN,
@@ -107,7 +123,7 @@ void batched_forward_mask_type_dispatched(
           128, // MPerBlock
           128, // NPerBlock
           32, // KPerBlock
-          32, // Gemm1NPerBlock
+          64, // Gemm1NPerBlock
           32, // Gemm1KPerBlock
           8, // AK1
           8, // BK1
@@ -116,7 +132,8 @@ void batched_forward_mask_type_dispatched(
           32, // NPerXDL
           1, // MXdlPerWave
           4, // NXdlPerWave
-          1, // Gemm1NXdlPerWave
+          2, // Gemm1NXdlPerWave
+          1, // DropoutStep
           S<4, 64, 1>, // ABlockTransfer
           S<1, 0, 2>,
           S<1, 0, 2>,
@@ -131,20 +148,22 @@ void batched_forward_mask_type_dispatched(
           8,
           8,
           true,
+          4,
           S<16, 16, 1>, // B1BlockTransfer
           S<0, 2, 1>,
           S<0, 2, 1>,
           1,
-          2,
+          4,
           2,
           false,
           1, // CShuffleMXdlPerWavePerShuffle
-          1, // CShuffleNXdlPerWavePerShuffle
+          2, // CShuffleNXdlPerWavePerShuffle
           S<1,
-            64,
+            32,
             1,
-            4>, // CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock
+            8>, // CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock
           8, // CShuffleBlockTransferScalarPerVector_NPerBlock
+          4,
           MaskingSpec, // MaskingSpecialization
           Deterministic>;
 
@@ -181,16 +200,43 @@ void batched_forward_mask_type_dispatched(
       param.out_strides[1],
       param.out_strides[3]};
 
-  std::vector<ck::index_t> z_gs_ms_ns_lengths{
-      param.B, param.num_heads, param.M, param.N};
-  std::vector<ck::index_t> z_gs_ms_ns_strides{
-      param.randvals_strides[0],
-      param.randvals_strides[1],
-      param.randvals_strides[2],
-      param.randvals_strides[3]};
+  std::vector<ck::index_t> z_gs_ms_ns_lengths;
+  std::vector<ck::index_t> z_gs_ms_ns_strides;
+
+  if (param.use_dropout) {
+    z_gs_ms_ns_lengths = {param.B, param.num_heads, param.M, param.N};
+    z_gs_ms_ns_strides = {
+        param.randvals_strides[0],
+        param.randvals_strides[1],
+        param.randvals_strides[2],
+        param.randvals_strides[3]};
+  };
 
   std::vector<ck::index_t> lse_gs_ms_lengths{param.B, param.num_heads, param.M};
 
+  auto bias_ptr_lengths_strides = [&]() {
+    if constexpr (has_attn_bias) {
+      auto bias_ptr_arr =
+          std::array<void*, 1>{const_cast<void*>(param.attn_bias_ptr)};
+      std::vector<ck::index_t> d_gs_ms_ns_lengths{
+          param.B, param.num_heads, param.M, param.N};
+      std::vector<ck::index_t> d_gs_ms_ns_strides{
+          param.attn_bias_strides[0],
+          param.attn_bias_strides[1],
+          param.attn_bias_strides[2],
+          param.attn_bias_strides[3]};
+      auto bias_lengths_arr =
+          std::array<std::vector<ck::index_t>, 1>{d_gs_ms_ns_lengths};
+      auto bias_strides_arr =
+          std::array<std::vector<ck::index_t>, 1>{d_gs_ms_ns_strides};
+      return std::make_tuple(bias_ptr_arr, bias_lengths_arr, bias_strides_arr);
+    } else
+      return std::make_tuple(
+          std::array<void*, 0>{},
+          std::array<std::vector<ck::index_t>, 0>{},
+          std::array<std::vector<ck::index_t>, 0>{});
+  }();
+
   float alpha = param.scale;
 
   auto a_element_op = AElementOp{};
@@ -205,6 +251,7 @@ void batched_forward_mask_type_dispatched(
 
   auto op = DeviceOpInstance{};
   auto invoker = op.MakeInvoker();
+
   auto arg_ptr = op.MakeArgumentPointer(
       param.q_ptr,
       param.k_ptr,
@@ -212,7 +259,7 @@ void batched_forward_mask_type_dispatched(
       param.out_ptr,
       param.randvals_ptr,
       param.logsumexp_ptr,
-      {}, // std::array<void*, 1> p_acc0_biases;
+      std::get<0>(bias_ptr_lengths_strides),
       {}, // std::array<void*, 1> p_acc1_biases;
       a_gs_ms_ks_lengths,
       a_gs_ms_ks_strides,
@@ -225,10 +272,8 @@ void batched_forward_mask_type_dispatched(
       z_gs_ms_ns_lengths,
       z_gs_ms_ns_strides,
       lse_gs_ms_lengths,
-      {}, // std::array<std::vector<ck::index_t>,
-          // 1>{acc0_biases_gs_ms_ns_lengths},
-      {}, // std::array<std::vector<ck::index_t>,
-          // 1>{acc0_biases_gs_ms_ns_strides},
+      std::get<1>(bias_ptr_lengths_strides),
+      std::get<2>(bias_ptr_lengths_strides),
       {}, // std::array<std::vector<ck::index_t>,
           // 1>{acc1_biases_gs_ms_os_lengths},
       {}, // std::array<std::vector<ck::index_t>,
@@ -241,6 +286,7 @@ void batched_forward_mask_type_dispatched(
       param.dropout_prob, // dropout ratio
       {seed, offset}); // dropout random seed and offset, offset should be at
                        // least the number of elements on a thread
+
   SimpleDeviceMem workspace(op.GetWorkSpaceSize(arg_ptr.get()));
 
   op.SetWorkSpacePointer(arg_ptr.get(), workspace.GetDeviceBuffer());
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_infer.h b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_infer.h
deleted file mode 100644
index 58867e602..000000000
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_infer.h
+++ /dev/null
@@ -1,226 +0,0 @@
-#pragma once
-
-#include <sstream>
-#include <stdexcept>
-
-#include <ck/ck.hpp>
-#include <ck/tensor_operation/gpu/device/gemm_specialization.hpp>
-#include <ck/tensor_operation/gpu/device/impl/device_batched_gemm_softmax_gemm_permute_xdl_cshuffle.hpp>
-#include <ck/tensor_operation/gpu/device/tensor_specialization.hpp>
-#include <ck/tensor_operation/gpu/element/element_wise_operation.hpp>
-
-#include "ck_fmha_util.h"
-
-template <typename scalar_t, int32_t custom_mask_type>
-void batched_infer_mask_type_dispatched(
-    BatchedInferParams& param,
-    hipStream_t stream);
-
-template <typename scalar_t>
-void batched_infer(BatchedInferParams& param, hipStream_t stream) {
-  if (param.custom_mask_type == 0)
-    batched_infer_mask_type_dispatched<scalar_t, 0>(param, stream);
-  else if (param.custom_mask_type == 1)
-    batched_infer_mask_type_dispatched<scalar_t, 1>(param, stream);
-  else if (param.custom_mask_type == 2)
-    batched_infer_mask_type_dispatched<scalar_t, 2>(param, stream);
-  else
-    throw std::runtime_error("Invalid custom_mask_type value");
-};
-
-template <typename scalar_t, int32_t custom_mask_type = 0>
-void batched_infer_mask_type_dispatched(
-    BatchedInferParams& param,
-    hipStream_t stream) {
-  using PassThrough = ck::tensor_operation::element_wise::PassThrough;
-
-  using ADataType = scalar_t;
-  using B0DataType = scalar_t;
-  using B1DataType = scalar_t;
-  using AccDataType = F32;
-  using CShuffleDataType = F32;
-  using CDataType = scalar_t;
-  using Acc0BiasDataType = ck::Tuple<>;
-  using Acc1BiasDataType = ck::Tuple<>;
-
-  static constexpr ck::index_t NumDimG = 2;
-  static constexpr ck::index_t NumDimM = 1;
-  static constexpr ck::index_t NumDimN = 1;
-  static constexpr ck::index_t NumDimK = 1;
-  static constexpr ck::index_t NumDimO = 1;
-
-  using AElementOp = PassThrough;
-  using B0ElementOp = PassThrough;
-  using Acc0ElementOp = ck::tensor_operation::element_wise::Scale;
-  using B1ElementOp = PassThrough;
-  using CElementOp = PassThrough;
-
-  static constexpr auto GemmSpec =
-      ck::tensor_operation::device::GemmSpecialization::MNKOPadding;
-  static constexpr auto MaskingSpec =
-      static_cast<ck::tensor_operation::device::MaskingSpecialization>(
-          custom_mask_type);
-
-  static constexpr auto TensorSpecA =
-      ck::tensor_operation::device::TensorSpecialization::Default;
-  static constexpr auto TensorSpecB0 =
-      ck::tensor_operation::device::TensorSpecialization::Default;
-  static constexpr auto TensorSpecB1 =
-      ck::tensor_operation::device::TensorSpecialization::Default;
-  static constexpr auto TensorSpecC =
-      ck::tensor_operation::device::TensorSpecialization::Default;
-
-  using DeviceOpInstance = ck::tensor_operation::device::
-      DeviceBatchedGemmSoftmaxGemmPermute_Xdl_CShuffle<
-          NumDimG,
-          NumDimM,
-          NumDimN,
-          NumDimK,
-          NumDimO,
-          ADataType,
-          B0DataType,
-          B1DataType,
-          CDataType,
-          Acc0BiasDataType,
-          Acc1BiasDataType,
-          AccDataType,
-          CShuffleDataType,
-          AElementOp,
-          B0ElementOp,
-          Acc0ElementOp,
-          B1ElementOp,
-          CElementOp,
-          GemmSpec,
-          TensorSpecA,
-          TensorSpecB0,
-          TensorSpecB1,
-          TensorSpecC,
-          1,
-          256,
-          128, // MPerBlock
-          128, // NPerBlock
-          32, // KPerBlock
-          64, // Gemm1NPerBlock
-          32, // Gemm1KPerBlock
-          8, // AK1
-          8, // BK1
-          2, // B1K1
-          32, // MPerXDL
-          32, // NPerXDL
-          1, // MXdlPerWave
-          4, // NXdlPerWave
-          2, // Gemm1NXdlPerWave
-          S<4, 64, 1>, // ABlockTransfer
-          S<1, 0, 2>,
-          S<1, 0, 2>,
-          2,
-          8,
-          8,
-          true,
-          S<4, 64, 1>, // BBlockTransfer
-          S<1, 0, 2>,
-          S<1, 0, 2>,
-          2,
-          8,
-          8,
-          true,
-          S<16, 16, 1>, // B1BlockTransfer
-          S<0, 2, 1>,
-          S<0, 2, 1>,
-          1,
-          4,
-          2,
-          false,
-          1, // CShuffleMXdlPerWavePerShuffle
-          2, // CShuffleNXdlPerWavePerShuffle
-          S<1,
-            32,
-            1,
-            8>, // CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock
-          8, // CShuffleBlockTransferScalarPerVector_NPerBlock
-          MaskingSpec>; // MaskingSpecialization
-
-  std::vector<ck::index_t> a_gs_ms_ks_lengths{
-      param.B, param.num_heads, param.M, param.K};
-  std::vector<ck::index_t> a_gs_ms_ks_strides{
-      param.q_strides[0],
-      param.q_strides[2],
-      param.q_strides[1],
-      param.q_strides[3]};
-  std::vector<ck::index_t> b0_gs_ns_ks_lengths{
-      param.B, param.num_heads, param.N, param.K};
-  std::vector<ck::index_t> b0_gs_ns_ks_strides{
-      param.k_strides[0],
-      param.k_strides[2],
-      param.k_strides[1],
-      param.k_strides[3]};
-
-  // to be changed to b1_gs_ns_os_lengths
-  std::vector<ck::index_t> b1_gs_os_ns_lengths{
-      param.B, param.num_heads, param.N, param.Kv};
-  std::vector<ck::index_t> b1_gs_os_ns_strides{
-      param.v_strides[0],
-      param.v_strides[2],
-      param.v_strides[3],
-      param.v_strides[1]};
-
-  std::vector<ck::index_t> c_gs_ms_os_lengths{
-      param.B, param.num_heads, param.M, param.Kv};
-  std::vector<ck::index_t> c_gs_ms_os_strides{
-      param.out_strides[0],
-      param.out_strides[2],
-      param.out_strides[1],
-      param.out_strides[3]};
-
-  float alpha = param.scale;
-
-  auto a_element_op = AElementOp{};
-  auto b0_element_op = B0ElementOp{};
-  auto acc0_element_op = Acc0ElementOp{alpha};
-  auto b1_element_op = B1ElementOp{};
-  auto c_element_op = CElementOp{};
-
-  auto op = DeviceOpInstance{};
-  auto invoker = op.MakeInvoker();
-  auto arg_ptr = op.MakeArgumentPointer(
-      param.q_ptr,
-      param.k_ptr,
-      param.v_ptr,
-      param.out_ptr,
-      {}, // std::array<void*, 1> p_acc0_biases;
-      {}, // std::array<void*, 1> p_acc1_biases;
-      a_gs_ms_ks_lengths,
-      a_gs_ms_ks_strides,
-      b0_gs_ns_ks_lengths,
-      b0_gs_ns_ks_strides,
-      b1_gs_os_ns_lengths,
-      b1_gs_os_ns_strides,
-      c_gs_ms_os_lengths,
-      c_gs_ms_os_strides,
-      {}, // std::array<std::vector<ck::index_t>,
-          // 1>{acc0_biases_gs_ms_ns_lengths},
-      {}, // std::array<std::vector<ck::index_t>,
-          // 1>{acc0_biases_gs_ms_ns_strides},
-      {}, // std::array<std::vector<ck::index_t>,
-          // 1>{acc1_biases_gs_ms_os_lengths},
-      {}, // std::array<std::vector<ck::index_t>,
-          // 1>{acc1_biases_gs_ms_os_strides},
-      a_element_op,
-      b0_element_op,
-      acc0_element_op,
-      b1_element_op,
-      c_element_op);
-
-  SimpleDeviceMem workspace(op.GetWorkSpaceSize(arg_ptr.get()));
-
-  op.SetWorkSpacePointer(arg_ptr.get(), workspace.GetDeviceBuffer());
-
-  if (!op.IsSupportedArgument(arg_ptr.get())) {
-    std::ostringstream ostr;
-    ostr << op.GetTypeString() << " does not support this problem";
-
-    throw std::runtime_error(ostr.str());
-  }
-
-  (void)invoker.Run(arg_ptr.get(), StreamConfig{stream, false});
-};
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_forward.h b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_forward.h
index 9ba0d07a3..80f5f8aa5 100644
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_forward.h
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_forward.h
@@ -5,32 +5,47 @@
 
 #include <ck/ck.hpp>
 #include <ck/tensor_operation/gpu/device/gemm_specialization.hpp>
-#include <ck/tensor_operation/gpu/device/impl/device_grouped_mha_fwd_xdl_cshuffle_v1.hpp>
+#include <ck/tensor_operation/gpu/device/impl/device_grouped_mha_fwd_xdl_cshuffle_v2.hpp>
 #include <ck/tensor_operation/gpu/device/tensor_specialization.hpp>
 #include <ck/tensor_operation/gpu/element/element_wise_operation.hpp>
 #include <ck/utility/sequence.hpp>
 
 #include "ck_fmha_util.h"
 
-template <typename scalar_t, int32_t custom_mask_type>
-void grouped_forward_mask_type_dispatched(
+template <typename scalar_t, int32_t custom_mask_type, bool has_attn_bias>
+void grouped_forward_masktype_attnbias_dispatched(
     GroupedForwardParams& param,
     hipStream_t stream);
 
 template <typename scalar_t>
 void grouped_forward(GroupedForwardParams& param, hipStream_t stream) {
-  if (param.custom_mask_type == 0)
-    grouped_forward_mask_type_dispatched<scalar_t, 0>(param, stream);
-  else if (param.custom_mask_type == 1)
-    grouped_forward_mask_type_dispatched<scalar_t, 1>(param, stream);
-  else if (param.custom_mask_type == 2)
-    grouped_forward_mask_type_dispatched<scalar_t, 2>(param, stream);
-  else
+  if (param.custom_mask_type == 0) {
+    if (param.has_attn_bias)
+      grouped_forward_masktype_attnbias_dispatched<scalar_t, 0, true>(
+          param, stream);
+    else
+      grouped_forward_masktype_attnbias_dispatched<scalar_t, 0, false>(
+          param, stream);
+  } else if (param.custom_mask_type == 1) {
+    if (param.has_attn_bias)
+      grouped_forward_masktype_attnbias_dispatched<scalar_t, 1, true>(
+          param, stream);
+    else
+      grouped_forward_masktype_attnbias_dispatched<scalar_t, 1, false>(
+          param, stream);
+  } else if (param.custom_mask_type == 2) {
+    if (param.has_attn_bias)
+      grouped_forward_masktype_attnbias_dispatched<scalar_t, 2, true>(
+          param, stream);
+    else
+      grouped_forward_masktype_attnbias_dispatched<scalar_t, 2, false>(
+          param, stream);
+  } else
     throw std::runtime_error("Invalid custom_mask_type value");
 };
 
-template <typename scalar_t, int32_t custom_mask_type = 0>
-void grouped_forward_mask_type_dispatched(
+template <typename scalar_t, int32_t custom_mask_type = 0, bool has_attn_bias>
+void grouped_forward_masktype_attnbias_dispatched(
     GroupedForwardParams& param,
     hipStream_t stream) {
   using PassThrough = ck::tensor_operation::element_wise::PassThrough;
@@ -44,7 +59,8 @@ void grouped_forward_mask_type_dispatched(
   using CDataType = scalar_t;
   using ZDataType = unsigned short;
   using LSEDataType = F32;
-  using Acc0BiasDataType = ck::Tuple<>;
+  using Acc0BiasDataType = typename std::
+      conditional<has_attn_bias, ck::Tuple<scalar_t>, ck::Tuple<>>::type;
   using Acc1BiasDataType = ck::Tuple<>;
 
   static constexpr ck::index_t NumDimG = 2;
@@ -76,7 +92,7 @@ void grouped_forward_mask_type_dispatched(
   static constexpr bool Deterministic = true;
 
   using DeviceOpInstance = ck::tensor_operation::device::
-      DeviceGroupedMultiheadAttentionForward_Xdl_CShuffle_V1<
+      DeviceGroupedMultiheadAttentionForward_Xdl_CShuffle_V2<
           NumDimG,
           NumDimM,
           NumDimN,
@@ -108,7 +124,7 @@ void grouped_forward_mask_type_dispatched(
           128, // MPerBlock
           128, // NPerBlock
           32, // KPerBlock
-          32, // Gemm1NPerBlock
+          64, // Gemm1NPerBlock
           32, // Gemm1KPerBlock
           8, // AK1
           8, // BK1
@@ -117,7 +133,8 @@ void grouped_forward_mask_type_dispatched(
           32, // NPerXDL
           1, // MXdlPerWave
           4, // NXdlPerWave
-          1, // Gemm1NXdlPerWave
+          2, // Gemm1NXdlPerWave
+          1, // DropoutStep
           S<4, 64, 1>, // ABlockTransfer
           S<1, 0, 2>,
           S<1, 0, 2>,
@@ -132,25 +149,47 @@ void grouped_forward_mask_type_dispatched(
           8,
           8,
           true,
+          1,
           S<16, 16, 1>, // B1BlockTransfer
           S<0, 2, 1>,
           S<0, 2, 1>,
           1,
-          2,
+          4,
           2,
           false,
           1, // CShuffleMXdlPerWavePerShuffle
-          1, // CShuffleNXdlPerWavePerShuffle
+          2, // CShuffleNXdlPerWavePerShuffle
           S<1,
-            64,
+            32,
             1,
-            4>, // CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock
+            8>, // CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock
           8, // CShuffleBlockTransferScalarPerVector_NPerBlock
+          1,
           MaskingSpec, // MaskingSpecialization
           Deterministic>;
 
   std::vector<typename DeviceOpInstance::ProblemDesc> problem_descs;
 
+  auto func_bias_lengths_strides = [&](int G1, int M, int N) {
+    if constexpr (has_attn_bias) {
+      std::vector<ck::index_t> d_gs_ms_ns_lengths{1, G1, M, N};
+      std::vector<ck::index_t> d_gs_ms_ns_strides{
+          0,
+          param.attn_bias_strides[0],
+          param.attn_bias_strides[1],
+          param.attn_bias_strides[2]};
+
+      auto bias_lengths_arr =
+          std::vector<std::vector<ck::index_t>>{d_gs_ms_ns_lengths};
+      auto bias_strides_arr =
+          std::vector<std::vector<ck::index_t>>{d_gs_ms_ns_strides};
+      return std::make_tuple(bias_lengths_arr, bias_strides_arr);
+    } else
+      return std::make_tuple(
+          std::vector<std::vector<ck::index_t>>{},
+          std::vector<std::vector<ck::index_t>>{});
+  };
+
   for (std::size_t i = 0; i < param.num_batches; i++) {
     int M = param.host_seqstart_q[i + 1] - param.host_seqstart_q[i];
     int N = param.host_seqlen_k.empty()
@@ -187,6 +226,8 @@ void grouped_forward_mask_type_dispatched(
     std::vector<ck::index_t> lse_gs_ms_lengths{1, G1, M};
     std::vector<ck::index_t> lse_gs_ms_strides{0, param.M, 1};
 
+    auto bias_lengths_strides = func_bias_lengths_strides(G1, M, N);
+
     problem_descs.push_back(
         {a_gs_ms_ks_lengths,
          a_gs_ms_ks_strides,
@@ -200,8 +241,8 @@ void grouped_forward_mask_type_dispatched(
          z_gs_ms_ns_strides,
          lse_gs_ms_lengths,
          lse_gs_ms_strides,
-         {}, // acc0_biases_gs_ms_ns_lengths
-         {}, // acc0_biases_gs_ms_ns_strides
+         std::get<0>(bias_lengths_strides),
+         std::get<1>(bias_lengths_strides),
          {}, // acc1_biases_gs_ms_os_lengths
          {}}); // acc1_biases_gs_ms_os_strides
   }
@@ -228,7 +269,7 @@ void grouped_forward_mask_type_dispatched(
       param.out_ptrs,
       param.randvals_ptrs,
       param.logsumexp_ptrs,
-      {}, // p_acc0_biases
+      std::vector<std::vector<const void*>>{param.attn_bias_ptrs},
       {}, // p_acc1_biases
       problem_descs,
       a_element_op,
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_infer.h b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_infer.h
deleted file mode 100644
index 46bc95ece..000000000
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_infer.h
+++ /dev/null
@@ -1,225 +0,0 @@
-#pragma once
-
-#include <sstream>
-#include <stdexcept>
-
-#include <ck/ck.hpp>
-#include <ck/tensor_operation/gpu/device/gemm_specialization.hpp>
-#include <ck/tensor_operation/gpu/device/impl/device_grouped_gemm_softmax_gemm_permute_xdl_cshuffle.hpp>
-#include <ck/tensor_operation/gpu/device/tensor_specialization.hpp>
-#include <ck/tensor_operation/gpu/element/element_wise_operation.hpp>
-#include <ck/utility/sequence.hpp>
-
-#include "ck_fmha_util.h"
-
-template <typename scalar_t, int32_t custom_mask_typ>
-void grouped_infer_mask_type_dispatched(
-    GroupedInferParams& param,
-    hipStream_t stream);
-
-template <typename scalar_t>
-void grouped_infer(GroupedInferParams& param, hipStream_t stream) {
-  if (param.custom_mask_type == 0)
-    grouped_infer_mask_type_dispatched<scalar_t, 0>(param, stream);
-  else if (param.custom_mask_type == 1)
-    grouped_infer_mask_type_dispatched<scalar_t, 1>(param, stream);
-  else if (param.custom_mask_type == 2)
-    grouped_infer_mask_type_dispatched<scalar_t, 2>(param, stream);
-  else
-    throw std::runtime_error("Invalid custom_mask_type value");
-};
-
-template <typename scalar_t, int32_t custom_mask_type = 0>
-void grouped_infer_mask_type_dispatched(
-    GroupedInferParams& param,
-    hipStream_t stream) {
-  using PassThrough = ck::tensor_operation::element_wise::PassThrough;
-
-  using ADataType = scalar_t;
-  using B0DataType = scalar_t;
-  using B1DataType = scalar_t;
-  using AccDataType = F32;
-  using CShuffleDataType = F32;
-  using CDataType = scalar_t;
-  using Acc0BiasDataType = ck::Tuple<>;
-  using Acc1BiasDataType = ck::Tuple<>;
-
-  static constexpr ck::index_t NumDimG = 2;
-  static constexpr ck::index_t NumDimM = 1;
-  static constexpr ck::index_t NumDimN = 1;
-  static constexpr ck::index_t NumDimK = 1;
-  static constexpr ck::index_t NumDimO = 1;
-
-  using AElementOp = PassThrough;
-  using B0ElementOp = PassThrough;
-  using Acc0ElementOp = ck::tensor_operation::element_wise::Scale;
-  using B1ElementOp = PassThrough;
-  using CElementOp = PassThrough;
-
-  static constexpr auto GemmSpec =
-      ck::tensor_operation::device::GemmSpecialization::MNKOPadding;
-  static constexpr auto MaskingSpec =
-      static_cast<ck::tensor_operation::device::MaskingSpecialization>(
-          custom_mask_type);
-
-  static constexpr auto TensorSpecA =
-      ck::tensor_operation::device::TensorSpecialization::Default;
-  static constexpr auto TensorSpecB0 =
-      ck::tensor_operation::device::TensorSpecialization::Default;
-  static constexpr auto TensorSpecB1 =
-      ck::tensor_operation::device::TensorSpecialization::Default;
-  static constexpr auto TensorSpecC =
-      ck::tensor_operation::device::TensorSpecialization::Default;
-
-  using DeviceOpInstance = ck::tensor_operation::device::
-      DeviceGroupedGemmSoftmaxGemmPermute_Xdl_CShuffle<
-          NumDimG,
-          NumDimM,
-          NumDimN,
-          NumDimK,
-          NumDimO,
-          ADataType,
-          B0DataType,
-          B1DataType,
-          CDataType,
-          Acc0BiasDataType,
-          Acc1BiasDataType,
-          AccDataType,
-          CShuffleDataType,
-          AElementOp,
-          B0ElementOp,
-          Acc0ElementOp,
-          B1ElementOp,
-          CElementOp,
-          GemmSpec,
-          TensorSpecA,
-          TensorSpecB0,
-          TensorSpecB1,
-          TensorSpecC,
-          1,
-          256,
-          128, // MPerBlock
-          128, // NPerBlock
-          32, // KPerBlock
-          64, // Gemm1NPerBlock
-          32, // Gemm1KPerBlock
-          8, // AK1
-          8, // BK1
-          2, // B1K1
-          32, // MPerXDL
-          32, // NPerXDL
-          1, // MXdlPerWave
-          4, // NXdlPerWave
-          2, // Gemm1NXdlPerWave
-          S<4, 64, 1>, // ABlockTransfer
-          S<1, 0, 2>,
-          S<1, 0, 2>,
-          2,
-          8,
-          8,
-          true,
-          S<4, 64, 1>, // BBlockTransfer
-          S<1, 0, 2>,
-          S<1, 0, 2>,
-          2,
-          8,
-          8,
-          true,
-          S<16, 16, 1>, // B1BlockTransfer
-          S<0, 2, 1>,
-          S<0, 2, 1>,
-          1,
-          4,
-          2,
-          false,
-          1, // CShuffleMXdlPerWavePerShuffle
-          2, // CShuffleNXdlPerWavePerShuffle
-          S<1,
-            32,
-            1,
-            8>, // CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock
-          8, // CShuffleBlockTransferScalarPerVector_NPerBlock
-          MaskingSpec>; // MaskingSpecialization
-
-  std::vector<typename DeviceOpInstance::ProblemDesc> problem_descs;
-
-  for (std::size_t i = 0; i < param.num_batches; i++) {
-    int M = param.host_seqstart_q[i + 1] - param.host_seqstart_q[i];
-    int N = param.host_seqlen_k.empty()
-        ? param.host_seqstart_k[i + 1] - param.host_seqstart_k[i]
-        : param.host_seqlen_k[i];
-    int K = param.K;
-    int Kv = param.Kv;
-    int G0 = 1;
-    int G1 = param.num_heads;
-
-    std::vector<ck::index_t> a_gs_ms_ks_lengths{G0, G1, M, K};
-    std::vector<ck::index_t> a_gs_ms_ks_strides{
-        0, param.q_strides[1], param.q_strides[0], param.q_strides[2]};
-
-    std::vector<ck::index_t> b0_gs_ns_ks_lengths{G0, G1, N, K};
-    std::vector<ck::index_t> b0_gs_ns_ks_strides{
-        0, param.k_strides[1], param.k_strides[0], param.k_strides[2]};
-
-    // to be changed to b1_gs_ns_os_lengths
-    std::vector<ck::index_t> b1_gs_os_ns_lengths{G0, G1, Kv, N};
-    std::vector<ck::index_t> b1_gs_os_ns_strides = {
-        0, param.v_strides[1], param.v_strides[2], param.v_strides[0]};
-
-    std::vector<ck::index_t> c_gs_ms_os_lengths{G0, G1, M, Kv};
-    std::vector<ck::index_t> c_gs_ms_os_strides = {
-        0, param.out_strides[1], param.out_strides[0], param.out_strides[2]};
-
-    problem_descs.push_back(
-        {a_gs_ms_ks_lengths,
-         a_gs_ms_ks_strides,
-         b0_gs_ns_ks_lengths,
-         b0_gs_ns_ks_strides,
-         b1_gs_os_ns_lengths,
-         b1_gs_os_ns_strides,
-         c_gs_ms_os_lengths,
-         c_gs_ms_os_strides,
-         {}, // acc0_biases_gs_ms_ns_lengths
-         {}, // acc0_biases_gs_ms_ns_strides
-         {}, // acc1_biases_gs_ms_os_lengths
-         {}}); // acc1_biases_gs_ms_os_strides
-  }
-
-  float alpha = param.scale;
-
-  auto a_element_op = AElementOp{};
-  auto b0_element_op = B0ElementOp{};
-  auto acc0_element_op = Acc0ElementOp{alpha};
-  auto b1_element_op = B1ElementOp{};
-  auto c_element_op = CElementOp{};
-
-  auto op = DeviceOpInstance{};
-  auto invoker = op.MakeInvoker();
-  auto arg_ptr = op.MakeArgumentPointer(
-      param.q_ptrs,
-      param.k_ptrs,
-      param.v_ptrs,
-      param.out_ptrs,
-      {}, // p_acc0_biases
-      {}, // p_acc1_biases
-      problem_descs,
-      a_element_op,
-      b0_element_op,
-      acc0_element_op,
-      b1_element_op,
-      c_element_op);
-
-  SimpleDeviceMem workspace(op.GetWorkSpaceSize(arg_ptr.get()));
-
-  op.SetWorkSpacePointer(arg_ptr.get(), workspace.GetDeviceBuffer());
-
-  if (!op.IsSupportedArgument(arg_ptr.get())) {
-    std::ostringstream ostr;
-
-    ostr << op.GetTypeString() << " does not support this problem";
-
-    throw std::runtime_error(ostr.str());
-  }
-
-  (void)invoker.Run(arg_ptr.get(), StreamConfig{stream, false});
-};
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_util.h b/xformers/csrc/attention/hip_fmha/ck_fmha_util.h
index 8606e6a93..32e3d0a7e 100644
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_util.h
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_util.h
@@ -187,6 +187,7 @@ struct BatchedInferParams {
   int Kv; // embed_dim for Value
 
   float scale;
+  bool has_attn_bias;
 
   // BMHK mode strides
   std::array<int, 4> q_strides;
@@ -206,15 +207,18 @@ struct BatchedInferParams {
 };
 
 struct BatchedForwardParams : public BatchedInferParams {
+  bool use_dropout;
+  bool compute_logsumexp;
+
   float dropout_prob;
   at::PhiloxCudaState rng_engine_inputs;
 
-  // completely contiguous
-  void* logsumexp_ptr;
-
   // BHMN mode strides, completely contiguous
   std::array<int32_t, 4> randvals_strides;
   void* randvals_ptr;
+
+  // completely contiguous
+  void* logsumexp_ptr;
 };
 
 struct GroupedInferParams {
@@ -230,6 +234,7 @@ struct GroupedInferParams {
   std::vector<int> host_seqlen_k;
 
   float scale;
+  bool has_attn_bias;
 
   // MHK mode strides, last-dim contiguous
   std::array<int, 3> q_strides;
@@ -250,15 +255,18 @@ struct GroupedInferParams {
 };
 
 struct GroupedForwardParams : public GroupedInferParams {
+  bool use_dropout;
+  bool compute_logsumexp;
+
   float dropout_prob;
   at::PhiloxCudaState rng_engine_inputs;
 
-  // completely contiguous
-  std::vector<void*> logsumexp_ptrs;
-
   // HMN mode strides, completely contiguous
   std::array<int, 3> randvals_strides;
   std::vector<void*> randvals_ptrs;
+
+  // completely contiguous
+  std::vector<void*> logsumexp_ptrs;
 };
 
 struct BatchedBackwardParams {

From b0398a170534d49040d0adefb32278078cb8b164 Mon Sep 17 00:00:00 2001
From: Qianfeng Zhang <Qianfeng.Zhang@amd.com>
Date: Sun, 13 Aug 2023 17:12:01 +0000
Subject: [PATCH 011/641] Renaming the binding interfaces

---
 xformers/csrc/attention/attention.cpp                       | 4 ++++
 .../csrc/attention/hip_fmha/attention_backward_generic.cpp  | 6 +++---
 .../csrc/attention/hip_fmha/attention_forward_generic.cpp   | 6 +++---
 3 files changed, 10 insertions(+), 6 deletions(-)

diff --git a/xformers/csrc/attention/attention.cpp b/xformers/csrc/attention/attention.cpp
index f51c8f00e..ee0e07cc2 100644
--- a/xformers/csrc/attention/attention.cpp
+++ b/xformers/csrc/attention/attention.cpp
@@ -33,4 +33,8 @@ TORCH_LIBRARY_FRAGMENT(xformers, m) {
       "xformers::_temp_dropout(Tensor out, float p) -> Tensor"));
   m.def(TORCH_SELECTIVE_SCHEMA(
       "xformers::_cutlass_rand_uniform(float p, Tensor out) -> Tensor"));
+  m.def(TORCH_SELECTIVE_SCHEMA(
+      "xformers::efficient_attention_forward_ck(Tensor query, Tensor key, Tensor value, Tensor? attn_bias, Tensor? seqstart_q, Tensor? seqstart_k, float dropout_p, bool compute_logsumexp, int custom_mask_type, float? scale, Tensor? seqlen_k) -> (Tensor, Tensor, int, int)"));
+  m.def(TORCH_SELECTIVE_SCHEMA(
+      "xformers::efficient_attention_backward_ck(Tensor grad_out, Tensor query, Tensor key, Tensor value, Tensor? bias, Tensor? seqstart_q, Tensor? seqstart_k, Tensor? seqlen_k, Tensor logsumexp, Tensor output, float dropout_p, int rng_seed, int rng_offset, int custom_mask_type, float? scale) -> (Tensor, Tensor, Tensor, Tensor)"));
 }
diff --git a/xformers/csrc/attention/hip_fmha/attention_backward_generic.cpp b/xformers/csrc/attention/hip_fmha/attention_backward_generic.cpp
index 04a1ccf2b..2abd35b44 100644
--- a/xformers/csrc/attention/hip_fmha/attention_backward_generic.cpp
+++ b/xformers/csrc/attention/hip_fmha/attention_backward_generic.cpp
@@ -22,7 +22,7 @@
 
 namespace {
 std::tuple<at::Tensor, at::Tensor, at::Tensor, at::Tensor>
-mem_efficient_attention_backward_hip(
+efficient_attention_backward_ck(
     const at::Tensor& grad_out,
     const at::Tensor& query,
     const at::Tensor& key,
@@ -368,6 +368,6 @@ mem_efficient_attention_backward_hip(
 
 TORCH_LIBRARY_IMPL(xformers, CUDA, m) {
   m.impl(
-      TORCH_SELECTIVE_NAME("xformers::efficient_attention_backward_hip"),
-      TORCH_FN(mem_efficient_attention_backward_hip));
+      TORCH_SELECTIVE_NAME("xformers::efficient_attention_backward_ck"),
+      TORCH_FN(efficient_attention_backward_ck));
 }
diff --git a/xformers/csrc/attention/hip_fmha/attention_forward_generic.cpp b/xformers/csrc/attention/hip_fmha/attention_forward_generic.cpp
index f8baf6a8f..fc300e47d 100644
--- a/xformers/csrc/attention/hip_fmha/attention_forward_generic.cpp
+++ b/xformers/csrc/attention/hip_fmha/attention_forward_generic.cpp
@@ -30,7 +30,7 @@ namespace {
   (Mode 1MHK) `batch=1` with all tokens across batches concatenated
 */
 std::tuple<at::Tensor, at::Tensor, int64_t, int64_t>
-efficient_attention_forward_hip(
+efficient_attention_forward_ck(
     const at::Tensor& query, // [b, seqlen, num_heads, K]
     const at::Tensor& key, // [b, seqlen, num_heads, K]
     const at::Tensor& value, // [b, seqlen, num_heads, Kv]
@@ -388,8 +388,8 @@ bool is_ck_fmha_available() {
 
 TORCH_LIBRARY_IMPL(xformers, CUDA, m) {
   m.impl(
-      TORCH_SELECTIVE_NAME("xformers::efficient_attention_forward_hip"),
-      TORCH_FN(efficient_attention_forward_hip));
+      TORCH_SELECTIVE_NAME("xformers::efficient_attention_forward_ck"),
+      TORCH_FN(efficient_attention_forward_ck));
 }
 
 TORCH_LIBRARY_FRAGMENT(xformers, m) {

From 710b14a5e08f2681ed3ac510cad4fdde02ed460b Mon Sep 17 00:00:00 2001
From: Qianfeng Zhang <Qianfeng.Zhang@amd.com>
Date: Mon, 14 Aug 2023 19:18:00 +0000
Subject: [PATCH 012/641] Some fix in ck_fmha_batched_forward.h

---
 .../hip_fmha/attention_backward_generic.cpp   |  7 -----
 .../hip_fmha/attention_forward_generic.cpp    | 27 -------------------
 .../hip_fmha/ck_fmha_batched_forward.h        |  7 +++--
 3 files changed, 5 insertions(+), 36 deletions(-)

diff --git a/xformers/csrc/attention/hip_fmha/attention_backward_generic.cpp b/xformers/csrc/attention/hip_fmha/attention_backward_generic.cpp
index 2abd35b44..c4eb660de 100644
--- a/xformers/csrc/attention/hip_fmha/attention_backward_generic.cpp
+++ b/xformers/csrc/attention/hip_fmha/attention_backward_generic.cpp
@@ -1,10 +1,3 @@
-/*
- * Copyright (c) Meta Platforms, Inc. and affiliates.
- * All rights reserved.
- *
- * This source code is licensed under the BSD-style license found in the
- * LICENSE file in the root directory of this source tree.
- */
 #include <cmath>
 
 #include <ATen/Context.h>
diff --git a/xformers/csrc/attention/hip_fmha/attention_forward_generic.cpp b/xformers/csrc/attention/hip_fmha/attention_forward_generic.cpp
index fc300e47d..25afc5b07 100644
--- a/xformers/csrc/attention/hip_fmha/attention_forward_generic.cpp
+++ b/xformers/csrc/attention/hip_fmha/attention_forward_generic.cpp
@@ -1,10 +1,3 @@
-/*
- * Copyright (c) Meta Platforms, Inc. and affiliates.
- * All rights reserved.
- *
- * This source code is licensed under the BSD-style license found in the
- * LICENSE file in the root directory of this source tree.
- */
 #include <cmath>
 #include <mutex>
 
@@ -47,12 +40,6 @@ efficient_attention_forward_ck(
     int64_t custom_mask_type,
     c10::optional<double> scale,
     const c10::optional<at::Tensor>& seqlen_k) {
-#ifdef XFORMERS_MEM_EFF_ATTENTION_DISABLE_FORWARD
-  TORCH_CHECK(
-      false,
-      "MemoryEfficient build has been disabled at build time with -DXFORMERS_MEM_EFF_ATTENTION_DISABLE_FORWARD");
-#else
-
   TORCH_CHECK(query.dim() == 4);
   TORCH_CHECK(key.dim() == 4);
   TORCH_CHECK(value.dim() == 4);
@@ -375,15 +362,8 @@ efficient_attention_forward_ck(
   std::memcpy(&offset, &rng_engine_inputs.offset_.val, sizeof(offset));
 
   return std::make_tuple(out, logsumexp, seed, offset);
-#endif
 }
 
-// For testing in xFormers
-bool is_ck_fmha_available() {
-  std::cout << "ck fmha is really here!" << std::endl;
-  return (true);
-};
-
 } // namespace
 
 TORCH_LIBRARY_IMPL(xformers, CUDA, m) {
@@ -391,10 +371,3 @@ TORCH_LIBRARY_IMPL(xformers, CUDA, m) {
       TORCH_SELECTIVE_NAME("xformers::efficient_attention_forward_ck"),
       TORCH_FN(efficient_attention_forward_ck));
 }
-
-TORCH_LIBRARY_FRAGMENT(xformers, m) {
-  m.def(TORCH_SELECTIVE_SCHEMA("xformers::is_ck_fmha_available() -> bool"));
-  m.impl(
-      TORCH_SELECTIVE_NAME("xformers::is_ck_fmha_available"),
-      TORCH_FN(is_ck_fmha_available));
-}
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_forward.h b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_forward.h
index f2fb0a69d..8c2c8f046 100644
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_forward.h
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_forward.h
@@ -185,7 +185,7 @@ void batched_forward_masktype_attnbias_dispatched(
 
   // to be changed to b1_gs_ns_os_lengths
   std::vector<ck::index_t> b1_gs_os_ns_lengths{
-      param.B, param.num_heads, param.N, param.Kv};
+      param.B, param.num_heads, param.Kv, param.N};
   std::vector<ck::index_t> b1_gs_os_ns_strides{
       param.v_strides[0],
       param.v_strides[2],
@@ -210,6 +210,9 @@ void batched_forward_masktype_attnbias_dispatched(
         param.randvals_strides[1],
         param.randvals_strides[2],
         param.randvals_strides[3]};
+  } else {
+    z_gs_ms_ns_lengths = {1, 1, 1, 1};
+    z_gs_ms_ns_strides = {0, 0, 0, 0};
   };
 
   std::vector<ck::index_t> lse_gs_ms_lengths{param.B, param.num_heads, param.M};
@@ -283,7 +286,7 @@ void batched_forward_masktype_attnbias_dispatched(
       acc0_element_op,
       b1_element_op,
       c_element_op,
-      param.dropout_prob, // dropout ratio
+      param.use_dropout ? param.dropout_prob : 0.0f, // dropout ratio
       {seed, offset}); // dropout random seed and offset, offset should be at
                        // least the number of elements on a thread
 

From c3d0fdf2d718a0efc65de0c0e9d15207fb6934b8 Mon Sep 17 00:00:00 2001
From: Qianfeng Zhang <Qianfeng.Zhang@amd.com>
Date: Mon, 14 Aug 2023 19:20:26 +0000
Subject: [PATCH 013/641] xforemer fmha ops for ck

---
 xformers/ops/__init__.py      |   2 +
 xformers/ops/fmha/__init__.py |   5 +-
 xformers/ops/fmha/ck.py       | 383 ++++++++++++++++++++++++++++++++++
 xformers/ops/fmha/common.py   |   2 +-
 4 files changed, 389 insertions(+), 3 deletions(-)
 create mode 100644 xformers/ops/fmha/ck.py

diff --git a/xformers/ops/__init__.py b/xformers/ops/__init__.py
index e2ddbfb8d..d14468c2b 100644
--- a/xformers/ops/__init__.py
+++ b/xformers/ops/__init__.py
@@ -17,6 +17,7 @@
     MemoryEfficientAttentionOp,
     MemoryEfficientAttentionTritonFwdFlashBwOp,
     TritonFlashAttentionOp,
+    MemoryEfficientAttentionCkOp,
     memory_efficient_attention,
     memory_efficient_attention_backward,
     memory_efficient_attention_forward,
@@ -73,6 +74,7 @@ def masked_matmul(a, b, mask=None):
     "MemoryEfficientAttentionFlashAttentionOp",
     "MemoryEfficientAttentionOp",
     "MemoryEfficientAttentionTritonFwdFlashBwOp",
+    "MemoryEfficientAttentionCkOp",
     "memory_efficient_attention_backward",
     "memory_efficient_attention_forward",
     "memory_efficient_attention_forward_requires_grad",
diff --git a/xformers/ops/fmha/__init__.py b/xformers/ops/fmha/__init__.py
index 2101eaa6b..5d672ef6f 100644
--- a/xformers/ops/fmha/__init__.py
+++ b/xformers/ops/fmha/__init__.py
@@ -7,7 +7,7 @@
 
 import torch
 
-from . import cutlass, flash, small_k, triton
+from . import cutlass, flash, small_k, triton, ck
 from .attn_bias import AttentionBias, BlockDiagonalMask, LowerTriangularMask
 from .common import (
     AttentionBwOpBase,
@@ -28,7 +28,7 @@
 MemoryEfficientAttentionFlashAttentionOp = (flash.FwOp, flash.BwOp)
 MemoryEfficientAttentionOp = (small_k.FwOp, small_k.BwOp)
 TritonFlashAttentionOp = (triton.FwOp, triton.BwOp)
-
+MemoryEfficientAttentionCkOp = (ck.FwOp, ck.BwOp) 
 
 class _fMHA(torch.autograd.Function):
     @staticmethod
@@ -396,4 +396,5 @@ def _memory_efficient_attention_backward(
     "MemoryEfficientAttentionOp",
     "TritonFlashAttentionOp",
     "memory_efficient_attention",
+    "MemoryEfficientAttentionCkOp",
 ]
diff --git a/xformers/ops/fmha/ck.py b/xformers/ops/fmha/ck.py
new file mode 100644
index 000000000..9cac79d76
--- /dev/null
+++ b/xformers/ops/fmha/ck.py
@@ -0,0 +1,383 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
+#
+# This source code is licensed under the BSD license found in the
+# LICENSE file in the root directory of this source tree.
+
+
+from enum import Enum
+from typing import Any, List, Mapping, Optional, Set, Tuple, Union
+
+import torch
+
+from ..common import get_xformers_operator, register_operator
+from . import attn_bias
+from .attn_bias import (
+    AttentionBias,
+    BlockDiagonalCausalMask,
+    BlockDiagonalCausalWithOffsetPaddedKeysMask,
+    BlockDiagonalMask,
+    LowerTriangularMask,
+    LowerTriangularMaskWithTensorBias,
+)
+from .common import (
+    AttentionBwOpBase,
+    AttentionFwOpBase,
+    Context,
+    Gradients,
+    Inputs,
+    check_lastdim_alignment_stride1,
+)
+
+def _minimum_gemm_alignment(inp: Inputs) -> int:
+    if inp.device.type != "cuda":
+        return 1
+    bits_per_scalar = {torch.float: 32, torch.half: 16, torch.bfloat16: 16}[
+        inp.query.dtype
+    ]
+    ## for MI200/MI300 only
+    uses_tensorcores = True 
+    matmul_alignment_mn = 4 
+    if uses_tensorcores:
+        matmul_alignment_mn = max(matmul_alignment_mn, 128 // bits_per_scalar)
+    return matmul_alignment_mn
+
+
+def _get_seqlen_info(
+    inp: Inputs,
+) -> Tuple[Optional[torch.Tensor], Optional[torch.Tensor]]:
+    attn_bias = inp.attn_bias
+    if isinstance(
+        attn_bias, (BlockDiagonalMask, BlockDiagonalCausalWithOffsetPaddedKeysMask)
+    ):
+        attn_bias.k_seqinfo.to(inp.query.device)
+        attn_bias.q_seqinfo.to(inp.query.device)
+        seqstart_k = attn_bias.k_seqinfo.seqstart
+        seqstart_q = attn_bias.q_seqinfo.seqstart
+        ##max_seqlen_q = attn_bias.q_seqinfo.max_seqlen
+        ##max_seqlen_k = attn_bias.k_seqinfo.max_seqlen
+    else:
+        seqstart_k = None
+        seqstart_q = None
+        ##max_seqlen_q = -1
+        ##max_seqlen_k = -1
+
+    return seqstart_k, seqstart_q
+
+
+def _get_tensor_bias(
+    attn_bias: Optional[Union[torch.Tensor, AttentionBias]]
+) -> Optional[torch.Tensor]:
+    if isinstance(attn_bias, torch.Tensor):
+        return attn_bias
+    elif isinstance(attn_bias, LowerTriangularMaskWithTensorBias):
+        return attn_bias._bias
+    return None
+
+
+def _check_bias_alignment(
+    reasons: List[str], attn_bias: Optional[Union[torch.Tensor, AttentionBias]]
+) -> None:
+    attn_bias_tensor = _get_tensor_bias(attn_bias)
+    if attn_bias_tensor is not None:
+        alignment = 128 // torch.finfo(attn_bias_tensor.dtype).bits
+        show_padding_hint = False
+        for d in range(attn_bias_tensor.ndim - 1):
+            if attn_bias_tensor.stride(d) % alignment != 0:
+                reasons.append(
+                    f"attn_bias.stride(-2) % {alignment} != 0 (attn_bias.stride() = {attn_bias_tensor.stride()})"
+                )
+                show_padding_hint = True
+        if show_padding_hint:
+            reasons.append(
+                """\
+HINT: To use an `attn_bias` with a sequence length that is not a multiple of 8, \
+you need to ensure memory is aligned by slicing a bigger tensor. \
+Example: use `attn_bias = torch.zeros([1, 1, 5, 8])[:,:,:,:5]` instead of `torch.zeros([1, 1, 5, 5])`"""
+            )
+        # We can have stride=0 sometimes if dimension=1
+        if attn_bias_tensor.stride(-1) > 1:
+            reasons.append(
+                f"attn_bias.stride(-1) > 1 (attn_bias.stride() = {attn_bias_tensor.stride()}) - "
+                "you should call `.contiguous()` on the bias"
+            )
+
+
+class _CustomMaskType(int, Enum):
+    """
+    (Matches CustomMaskType in C++.)
+    """
+
+    NoCustomMask = 0
+    CausalFromTopLeft = 1
+    CausalFromBottomRight = 2
+
+
+def _custom_mask_type(bias: Optional[Union[torch.Tensor, AttentionBias]]) -> int:
+    if isinstance(
+        bias,
+        (
+            LowerTriangularMask,
+            BlockDiagonalCausalMask,
+        ),
+    ):
+        return int(_CustomMaskType.CausalFromTopLeft)
+    if isinstance(
+        bias,
+        (
+            attn_bias.BlockDiagonalCausalFromBottomRightMask,
+            BlockDiagonalCausalWithOffsetPaddedKeysMask,
+        ),
+    ):
+        return int(_CustomMaskType.CausalFromBottomRight)
+    return int(_CustomMaskType.NoCustomMask)
+
+
+@register_operator
+class FwOp(AttentionFwOpBase):
+    """xFormers' MHA kernel based on Composable Kernel.
+    Supports AMD MI 200 and MI 300 GPUs 
+    """
+
+    OPERATOR = get_xformers_operator("efficient_attention_forward_ck")
+    SUPPORTED_DEVICES: Set[str] = {"cuda"}
+    SUPPORTED_DTYPES: Set[torch.dtype] = {torch.half, torch.bfloat16}
+    SUPPORTED_MAX_K = 65536
+    SUPPORTED_ATTN_BIAS_TYPES: Set[Any] = {
+        type(None),
+        torch.Tensor,
+        LowerTriangularMask,
+        LowerTriangularMaskWithTensorBias,
+        BlockDiagonalMask,
+        BlockDiagonalCausalMask,
+        BlockDiagonalCausalWithOffsetPaddedKeysMask,
+        attn_bias.BlockDiagonalCausalFromBottomRightMask,
+    }
+    SUPPORTS_DROPOUT = True
+    SUPPORTS_CUSTOM_SCALE = True
+    SUPPORTS_DIFFERENT_VALUE_EMBED = True
+    NAME = "ckF"
+
+    _TEST_K: List[int] = [
+        32,  # 64x64 kernel
+        128,  # 64x128 kernel
+        256,  # 64x128 with accumulation in gmem
+    ]
+
+    @classmethod
+    def apply(
+        cls, inp: Inputs, needs_gradient: bool
+    ) -> Tuple[torch.Tensor, Optional[Context]]:
+        if type(inp.attn_bias) not in FwOp.SUPPORTED_ATTN_BIAS_TYPES:
+            raise NotImplementedError("Unsupported attn_bias type")
+        seqstart_k, seqstart_q  = _get_seqlen_info(inp)
+        out, lse, rng_seed, rng_offset = cls.OPERATOR(
+            query=inp.query,
+            key=inp.key,
+            value=inp.value,
+            attn_bias=_get_tensor_bias(inp.attn_bias),
+            seqstart_q=seqstart_q,
+            seqstart_k=seqstart_k,
+            dropout_p=inp.p,
+            compute_logsumexp=needs_gradient,
+            custom_mask_type=_custom_mask_type(inp.attn_bias),
+            scale=inp.scale,
+            seqlen_k=inp.attn_bias.k_seqinfo.seqlen
+            if isinstance(inp.attn_bias, BlockDiagonalCausalWithOffsetPaddedKeysMask)
+            else None,
+        )
+        ctx: Optional[Context] = None
+        if needs_gradient:
+            ctx = Context(
+                out=out,
+                lse=lse,
+                # cutlass forward is only compatible with cutlass backward if
+                # dropout is used (because of the way RNG states are passed and the
+                # way random numbers are generated during backward)
+                op_bw=BwOp if inp.p != 0 else None,
+            )
+            if inp.p != 0:
+                ctx.rng_state = torch.tensor(
+                    [rng_seed, rng_offset], dtype=torch.int64, device="cpu"
+                )
+        return out, ctx
+
+    @classmethod
+    def not_supported_reasons(cls, d: Inputs) -> List[str]:
+        reasons = super(FwOp, cls).not_supported_reasons(d)
+        matmul_alignment_mn = _minimum_gemm_alignment(d)
+        check_lastdim_alignment_stride1(reasons, "query", d.query, matmul_alignment_mn)
+        check_lastdim_alignment_stride1(reasons, "value", d.value, matmul_alignment_mn)
+        _check_bias_alignment(reasons, d.attn_bias)
+        return reasons
+
+    @classmethod
+    # type: ignore
+    def operator_flop(
+        cls,
+        q,
+        k,
+        v,
+        b,
+        seqstart_q,
+        seqstart_k,
+        compute_lse,
+        custom_mask_type,
+        *a,
+    ) -> int:
+        return cls.attn_operator_flop(
+            q,
+            k,
+            v,
+            causal=custom_mask_type > 0,
+            seqstart_k=seqstart_k,
+            seqstart_q=seqstart_q,
+        )
+
+
+@register_operator
+class BwOp(AttentionBwOpBase):
+    __doc__ = FwOp.__doc__
+
+    OPERATOR = get_xformers_operator("efficient_attention_backward_ck")
+    SUPPORTED_DEVICES = FwOp.SUPPORTED_DEVICES
+    SUPPORTED_DTYPES = FwOp.SUPPORTED_DTYPES
+    SUPPORTED_MAX_K = FwOp.SUPPORTED_MAX_K
+    SUPPORTED_ATTN_BIAS_TYPES: Set[Any] = {
+        type(None),
+        torch.Tensor,
+        LowerTriangularMask,
+        # TODO: Fix handling of gradient through the fMHA autograd function
+        # LowerTriangularMaskWithTensorBias,
+        BlockDiagonalMask,
+        BlockDiagonalCausalMask,
+        attn_bias.BlockDiagonalCausalFromBottomRightMask,
+    }
+    SUPPORTS_ATTN_BIAS_GRAD = True
+    SUPPORTS_DROPOUT = FwOp.SUPPORTS_DROPOUT
+    SUPPORTS_CUSTOM_SCALE = FwOp.SUPPORTS_CUSTOM_SCALE
+    SUPPORTS_DIFFERENT_VALUE_EMBED = FwOp.SUPPORTS_DIFFERENT_VALUE_EMBED
+    NAME = "ckB"
+
+    ERROR_ATOL: Mapping[torch.dtype, float] = {
+        torch.float: 5e-4,
+        # increased from 9e-2, more opportunities for numerical errors when bias is
+        # used, noticed in gK on SM80
+        torch.half: 1e-1,
+        torch.bfloat16: 7e-1,
+    }
+
+    _TEST_K: List[int] = [
+        32,  # 64x64 kernel
+        128,  # 64x128/128x128 kernel
+        256,  # 64x128 with accumulation in gmem
+    ]
+
+    @classmethod
+    def not_supported_reasons(cls, d: Inputs) -> List[str]:
+        reasons = super(BwOp, cls).not_supported_reasons(d)
+        matmul_alignment_mn = _minimum_gemm_alignment(d)
+
+        check_lastdim_alignment_stride1(reasons, "query", d.query, matmul_alignment_mn)
+        check_lastdim_alignment_stride1(reasons, "key", d.key, matmul_alignment_mn)
+        check_lastdim_alignment_stride1(reasons, "value", d.value, matmul_alignment_mn)
+        _check_bias_alignment(reasons, d.attn_bias)
+        attn_bias_tensor = _get_tensor_bias(d.attn_bias)
+
+        # Backprop of gradient through broadcasted bias is not supported
+        if attn_bias_tensor is not None and attn_bias_tensor.requires_grad:
+            # Don't forget that inputs are either in BMK or BMHK!
+            if d.query.ndim == 3 and attn_bias_tensor.ndim == 3:
+                expected_bias_shape = (*d.query.shape[:2], d.key.shape[1])
+            else:
+                # bias is B H Mq Mk
+                expected_bias_shape = (
+                    d.query.shape[0],
+                    d.query.shape[2] if d.query.ndim == 4 else 1,
+                    d.query.shape[1],
+                    d.key.shape[1],
+                )
+            if tuple(attn_bias_tensor.shape) != expected_bias_shape:
+                reasons.append(
+                    "Broadcasting the `attn_bias` tensor is not supported "
+                    f"(shape: {tuple(attn_bias_tensor.shape)}"
+                    f"/ expected: {expected_bias_shape})"
+                )
+        return reasons
+
+    @classmethod
+    def apply(cls, ctx: Context, inp: Inputs, grad: torch.Tensor) -> Gradients:
+        if type(inp.attn_bias) not in BwOp.SUPPORTED_ATTN_BIAS_TYPES:
+            raise NotImplementedError("Unsupported attn_bias type")
+
+        seqstart_k, seqstart_q = _get_seqlen_info(inp)
+        dtype = inp.query.dtype
+
+        rng_seed = rng_offset = 0
+        if inp.p != 0.0:
+            if (
+                ctx.rng_state is None
+                or ctx.rng_state.dtype != torch.int64
+                or ctx.rng_state.device.type != "cpu"
+                or ctx.rng_state.shape != (2,)
+            ):
+                raise NotImplementedError(f"Invalid rng_state: {ctx.rng_state}")
+            rng_seed, rng_offset = ctx.rng_state.tolist()
+
+        force_pad_inf = torch.cuda.get_device_capability(inp.query.device) == (7, 5)
+        (grad_q, grad_k, grad_v, grad_bias) = cls.OPERATOR(
+            grad.to(dtype),
+            inp.query,
+            inp.key,
+            inp.value,
+            _get_tensor_bias(inp.attn_bias),
+            cu_seqlens_q=seqstart_q,
+            cu_seqlens_k=seqstart_k,
+            logsumexp=ctx.get_padded_lse(32, force_pad_inf=force_pad_inf),
+            output=ctx.out.to(dtype),
+            dropout_p=inp.p,
+            # if not using dropout, seed and offset are irrelevant but still expected
+            # in function signature so just pass 0
+            # seed and offset could be None if a different FW op other than cutlass
+            # was used.
+            rng_seed=rng_seed,
+            rng_offset=rng_offset,
+            custom_mask_type=_custom_mask_type(inp.attn_bias),
+            scale=inp.scale,
+        )
+
+        # c++/CUDA implementation returns an uninitialized tensor if bias doesn't
+        # require grad
+        if not (
+            isinstance(inp.attn_bias, torch.Tensor) and inp.attn_bias.requires_grad
+        ):
+            grad_bias = None
+
+        return Gradients(dq=grad_q, dk=grad_k, dv=grad_v, db=grad_bias)
+
+    @classmethod
+    # type: ignore
+    def operator_flop(
+        cls,
+        dO,
+        q,
+        k,
+        v,
+        b,
+        cu_seqlens_q,
+        cu_seqlens_k,
+        logsumexp,
+        output,
+        dropout_p,
+        rng_seed,
+        rng_offset,
+        custom_mask_type,
+        scale,
+    ) -> int:
+        return cls.attn_operator_flop(
+            q,
+            k,
+            v,
+            seqstart_q=cu_seqlens_q,
+            seqstart_k=cu_seqlens_k,
+            causal=custom_mask_type > 0,
+        )
diff --git a/xformers/ops/fmha/common.py b/xformers/ops/fmha/common.py
index c9c599da6..d537d71e4 100644
--- a/xformers/ops/fmha/common.py
+++ b/xformers/ops/fmha/common.py
@@ -172,7 +172,7 @@ def not_supported_reasons(cls, d: Inputs) -> List[str]:
         dtype = d.query.dtype
         if device_type not in cls.SUPPORTED_DEVICES:
             reasons.append(f"device={device_type} (supported: {cls.SUPPORTED_DEVICES})")
-        if device_type == "cuda" and not _built_with_cuda:
+        if device_type == "cuda" and not _built_with_cuda and (torch.version.hip is None):
             reasons.append("xFormers wasn't build with CUDA support")
         if dtype not in cls.SUPPORTED_DTYPES:
             reasons.append(f"dtype={dtype} (supported: {cls.SUPPORTED_DTYPES})")

From 2cde6d2290f6641e22a5f915e813f682c65b743c Mon Sep 17 00:00:00 2001
From: Qianfeng Zhang <Qianfeng.Zhang@amd.com>
Date: Mon, 14 Aug 2023 19:22:09 +0000
Subject: [PATCH 014/641] Add several very simple testing for ck flashAttention

---
 tests/test_ck_1.py |  33 +++
 tests/test_ck_2.py | 558 ++++++++++++++++++++++++++++++++++++++++++++
 tests/test_ck_3.py | 562 +++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 1153 insertions(+)
 create mode 100644 tests/test_ck_1.py
 create mode 100644 tests/test_ck_2.py
 create mode 100644 tests/test_ck_3.py

diff --git a/tests/test_ck_1.py b/tests/test_ck_1.py
new file mode 100644
index 000000000..b5dba2d21
--- /dev/null
+++ b/tests/test_ck_1.py
@@ -0,0 +1,33 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
+#
+# This source code is licensed under the BSD license found in the
+# LICENSE file in the root directory of this source tree.
+
+import random
+
+import pytest
+import torch
+
+from xformers.ops.common import get_xformers_operator
+
+B = 7
+M = 1000
+N = 1000
+H = 13
+K = 64
+Kv = 64
+
+_types = [torch.float16, torch.bfloat16]
+
+@pytest.mark.parametrize("test_type", _types)
+def test_types(test_type):
+    query = torch.rand((B, M, H, K), device=torch.device("cuda"), dtype=test_type)
+    key = torch.rand((B, N, H, K), device=torch.device("cuda"), dtype=test_type)
+    val = torch.rand((B, N, H, Kv), device=torch.device("cuda"), dtype=test_type)
+
+    Operator=get_xformers_operator("efficient_attention_forward_ck")
+
+    out, lse, rng_seed, rng_offset = Operator(query=query, key=key, value=val, attn_bias=None, seqstart_q=None, seqstart_k=None, dropout_p=0.0, compute_logsumexp=False, custom_mask_type=0, scale=None, seqlen_k=None)
+
+    print(rng_seed)
+
diff --git a/tests/test_ck_2.py b/tests/test_ck_2.py
new file mode 100644
index 000000000..5382ba5bf
--- /dev/null
+++ b/tests/test_ck_2.py
@@ -0,0 +1,558 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
+#
+# This source code is licensed under the BSD license found in the
+# LICENSE file in the root directory of this source tree.
+
+import math
+import random
+from typing import List, Optional, Sequence, Tuple, Type, TypeVar
+
+import pytest
+import torch
+
+## need to FIX
+##from scipy.stats import binomtest
+from torch.utils.checkpoint import checkpoint
+
+import xformers.ops
+from xformers.ops import fmha
+from xformers.ops.fmha.common import AttentionOpBase
+
+from .utils import assert_allclose
+
+torch.backends.cuda.matmul.allow_tf32 = False
+cuda_only = pytest.mark.skipif(not torch.cuda.is_available(), reason="requires CUDA")
+_devices = ["cuda"] if torch.cuda.is_available() else ["cpu"]
+_types = [torch.float16, torch.bfloat16]
+
+ALL_FW_OPS: Sequence[Type[fmha.common.AttentionFwOpBase]] = [
+    fmha.ck.FwOp,
+]
+
+ALL_BW_OPS: Sequence[Type[fmha.common.AttentionBwOpBase]] = [
+    fmha.ck.BwOp,
+]
+
+T = TypeVar(
+    "T", Type[fmha.common.AttentionFwOpBase], Type[fmha.common.AttentionBwOpBase]
+)
+
+def sample_random_supported_fw(
+    inp: fmha.Inputs, seed: int
+) -> Type[fmha.common.AttentionFwOpBase]:
+    r = random.Random(seed)
+    fw_ops = list(ALL_FW_OPS)
+    r.shuffle(fw_ops)
+    for op in fw_ops:
+        if op.supports(inp):
+            return op
+    raise NotImplementedError(f"Could not find a FW operator for: {inp}")
+
+
+def generate_test_shapes_B_Mq_Mkv_H_K_Kv(op):
+    shapes = []
+    for B in op._TEST_BATCH_SIZES:
+        for Mq in [32, 256]:
+            for Mkv in [32, 64, 256]:
+                for K in op._TEST_K:
+                    shapes.append((B, Mq, Mkv, 1, K, K))
+        Mq = 256
+        Mkv = 128
+        K = 32
+        H = 1
+        # Weird values of parameters
+        for M in [2, 3, 15, 31, 32, 34, 68, 72, 90, 132, 136]:
+            shapes.append((B, M, Mkv, H, K, K))
+            shapes.append((B, Mq, M, H, K, K))
+        for _K in [1, 2, 3, 31, 34, 36, 38, 40, 64, 256 + 2, 256 + 8, 512]:
+            if _K <= op.SUPPORTED_MAX_K:
+                shapes.append((B, Mq, Mkv, H, _K, _K))
+        # Different value for K / Kv
+        if op.SUPPORTS_DIFFERENT_VALUE_EMBED:
+            for _K in [32, 36, 64, 256 + 8]:
+                shapes.append((B, Mq, Mkv, H, K, _K))
+                shapes.append((B, Mq, Mkv, H, _K, K))
+        # Exotic sizes
+        for K in op._TEST_K:
+            shapes.append((B, 16, 1024, H, K, K))
+            shapes.append((B, 1024, 16, H, K, K))
+        # Some number of heads
+        for H in [3, 5, 12]:
+            shapes.append((max(1, B // H), Mq, Mkv, H, K, K))
+    # Add some random shapes
+    if op in [
+        fmha.ck.FwOp,
+        fmha.ck.BwOp,
+    ]:
+        K_CHOICES = [8 * i for i in range(1, 256 // 8)]
+        r = random.Random(0)
+        for _ in range(20):
+            B = r.randint(1, 400)
+            Mq = r.randint(1, 500)
+            Mkv = r.randint(1, 500)
+            H = r.randint(2, 11)
+            B = max(B // H, 1)
+            K = r.choice(K_CHOICES)
+            Kv = r.choice(K_CHOICES)
+            if not op.SUPPORTS_DIFFERENT_VALUE_EMBED:
+                Kv = K
+            shapes.append((B, Mq, Mkv, H, K, Kv))
+    return shapes
+
+
+def _generate_op_device_dtype_biasT_B_Mq_Mkv_H_K_Kv(
+    ops_list: Sequence[Type[fmha.AttentionOpBase]], max_shapes_per_op: int = 65000
+):
+    r = random.Random(0)
+    combination = []
+    ids = []
+    for op in ops_list:
+        op_count = 0
+        # Sort list of masks, so it's deterministic across runs
+        LIST_MASKS = list(
+            sorted(list(op.SUPPORTED_ATTN_BIAS_TYPES), key=lambda x: str(x))
+        )
+        for shape in generate_test_shapes_B_Mq_Mkv_H_K_Kv(op):
+            has_one = False
+            for device in _devices:
+                if device not in op.SUPPORTED_DEVICES:
+                    continue
+                for dtype in op.SUPPORTED_DTYPES:
+                    bias_type = r.choice(LIST_MASKS)
+                    # Avoid using too much memory
+                    if bias_type not in [
+                        type(None),
+                        fmha.attn_bias.LowerTriangularMask,
+                    ]:
+                        B, Mq, Mkv, H, K, Kv = shape
+                        B = min(B, 12)
+
+                        if (
+                            bias_type
+                            is fmha.attn_bias.BlockDiagonalCausalFromBottomRightMask
+                        ):
+                            Mq, Mkv = min(Mkv, Mq), max(Mkv, Mq) + 2
+                        elif (
+                            bias_type
+                            is fmha.attn_bias.BlockDiagonalCausalWithOffsetPaddedKeysMask
+                        ):
+                            Mq, Mkv = min(Mkv, Mq), max(Mkv, Mq)
+                        shape = (B, Mq, Mkv, H, K, Kv)
+                    combination.append((op, device, dtype, bias_type, *shape))
+                    ids.append(
+                        f"{op.NAME}-{device}-{str(dtype)}-{bias_type.__name__}"
+                        f"-{'-'.join([str(s) for s in shape])}"
+                    )
+                    has_one = True
+            if has_one:
+                op_count += 1
+            if op_count > max_shapes_per_op:
+                break
+        # Some specific shapes for which we want to run without any mask
+        bias_type = type(None)
+        for shape in (
+            # Some strides/dims don't fit on an uint16
+            (1, 128, 128, 300, 128, 128),
+            (13, 1, 67, 200, 8, 8),
+            (1, 1 + 2**16, 4, 1, 8, 8),
+            (1, 4, 1 + 2**16, 1, 8, 8),
+            # TODO: Some strides don't fit on an uint32
+            # Crashes on Flash, Errors on Cutlass
+            # (1, 1, 64000, 300, 128, 128)
+        ):
+            for device in _devices:
+                if device not in op.SUPPORTED_DEVICES:
+                    continue
+                for dtype in op.SUPPORTED_DTYPES:
+                    combination.append((op, device, dtype, bias_type, *shape))
+                    ids.append(
+                        f"{op.NAME}-{device}-{str(dtype)}-{bias_type.__name__}"
+                        f"-{'-'.join([str(s) for s in shape])}"
+                    )
+    return {
+        "argvalues": combination,
+        "ids": ids,
+    }
+
+
+parametrize_opFW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv = pytest.mark.parametrize(
+    "opFW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv",
+    **_generate_op_device_dtype_biasT_B_Mq_Mkv_H_K_Kv(ALL_FW_OPS),
+)
+parametrize_opFW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv__xs = pytest.mark.parametrize(
+    "opFW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv",
+    **_generate_op_device_dtype_biasT_B_Mq_Mkv_H_K_Kv(ALL_FW_OPS, max_shapes_per_op=1),
+)
+parametrize_opBW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv = pytest.mark.parametrize(
+    "opBW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv",
+    **_generate_op_device_dtype_biasT_B_Mq_Mkv_H_K_Kv(ALL_BW_OPS),
+)
+parametrize_opBW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv__xs = pytest.mark.parametrize(
+    "opBW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv",
+    **_generate_op_device_dtype_biasT_B_Mq_Mkv_H_K_Kv(ALL_BW_OPS, max_shapes_per_op=1),
+)
+
+
+def ref_attention(q, k, v, attn_bias=None, drop_mask=None, p=0.0, scale=None):
+    if q.ndim == 4:
+        assert p == 0.0
+        return ref_attention_bmhk(q, k, v, attn_bias=attn_bias)
+    q = q.float()
+    k = k.float()
+    v = v.float()
+
+    scale = scale if scale is not None else (1 / q.shape[-1] ** 0.5)
+    q = q * scale
+
+    attn = q @ k.transpose(-2, -1)
+    if attn_bias is not None:
+        if isinstance(attn_bias, xformers.ops.AttentionBias):
+            # Always create in B,H,Mq,Mk format
+            attn_bias_tensor = attn_bias.materialize(
+                (q.shape[0], 1, q.shape[1], k.shape[1]),
+                device=q.device,
+                dtype=torch.float32,
+            )
+        else:
+            attn_bias_tensor = attn_bias
+        if attn_bias_tensor.ndim == 4:
+            assert q.shape[0] == attn_bias_tensor.shape[0] * attn_bias_tensor.shape[1]
+            attn_bias_tensor = attn_bias_tensor.reshape(
+                [-1, *attn_bias_tensor.shape[2:]]
+            )
+        attn = attn + attn_bias_tensor.float()
+    attn = attn.softmax(-1)
+    if drop_mask is not None:
+        attn = attn * (drop_mask / (1 - p))
+    return attn @ v
+
+
+def ref_attention_bmhk(q, k, v, attn_bias, scale=None) -> torch.Tensor:
+    assert q.ndim == 4
+
+    def T(t):
+        return t.permute((0, 2, 1, 3)).reshape(
+            [t.shape[0] * t.shape[2], t.shape[1], t.shape[3]]
+        )
+
+    if isinstance(attn_bias, xformers.ops.AttentionBias):
+        attn_bias = attn_bias.materialize(
+            (q.shape[0], q.shape[2], q.shape[1], k.shape[1]),
+            device=q.device,
+            dtype=torch.float32,
+        ).reshape([q.shape[0] * q.shape[2], q.shape[1], k.shape[1]])
+    out = ref_attention(T(q), T(k), T(v), attn_bias, scale=scale)
+    out = out.reshape([q.shape[0], q.shape[2], q.shape[1], v.shape[3]])
+    return out.permute((0, 2, 1, 3))
+
+
+def _rand_seqlens(
+    r: random.Random,
+    bs: int,
+    q_len: int,
+    kv_len: int,
+    more_keys_than_queries_per_block: bool,
+) -> Tuple[Sequence[int], Sequence[int]]:
+    """
+    Generates lists of lengths of query blocks and corresponding key blocks.
+    The total number of queries will be bs * q_len and the
+    total number of keys will be bs * kv_len.
+    """
+    if more_keys_than_queries_per_block:
+        assert kv_len >= q_len
+    q_len *= bs
+    kv_len *= bs
+    seqlens_q: List[int] = []
+    seqlens_k: List[int] = []
+
+    step_q = [max(1, q_len // 10), max(2, q_len // 2)]
+    step_k = [max(1, kv_len // 10), max(2, kv_len // 2)]
+    while sum(seqlens_q) < q_len and sum(seqlens_k) < kv_len:
+        num_queries = r.randrange(*step_q)
+        seqlens_q.append(num_queries)
+
+        if more_keys_than_queries_per_block:
+            # Must select at least `num_queries` keys
+            # But also leave enough keys for later
+            keys_left = kv_len - sum(seqlens_k, 0)
+            queries_left = q_len - sum(seqlens_q[:-1], 0)
+            assert keys_left >= queries_left
+            seqlens_k.append(num_queries + r.randrange(0, keys_left - queries_left))
+        else:
+            seqlens_k.append(r.randrange(*step_k))
+    seqlens_q[-1] = q_len - sum(seqlens_q[:-1])
+    seqlens_k[-1] = kv_len - sum(seqlens_k[:-1])
+    return seqlens_q, seqlens_k
+
+
+def _rand_partition(r: random.Random, total: int, n: int) -> List[int]:
+    # returns list of n nonnegative integers summing to total
+    idx = {0, total}
+    while len(idx) < n + 1:
+        idx.add(r.randint(1, total - 1))
+    s = sorted(idx)
+    return [e - b for b, e in zip(s[:-1], s[1:])]
+
+
+def _rand_maxed_partition(
+    r: random.Random, total: int, n: int, mx: int, positive: bool = True
+) -> List[int]:
+    # returns list of n nonnegative integers less than mx summing to total
+    # NB: This is unfortunately biased towards evenly-split bins.
+    # If `positive`, outputs are positive
+    if positive:
+        total -= n
+        mx -= 1
+    idxs = r.sample(range(n * mx), total)
+    y = torch.zeros(n, mx, dtype=torch.int32)
+    y.flatten()[idxs] = 1
+    z = y.sum(1)
+    if positive:
+        z += 1
+    return z.tolist()
+
+
+def _rand_seqlens_padded_k(
+    r: random.Random, bs: int, q_len: int, kv_len: int
+) -> Tuple[Sequence[int], Sequence[int]]:
+    # This is for BlockDiagonalCausalWithOffsetPaddedKeysMask.
+    # we need q_seqlens and k_seqlens to be of len bsz.
+    # For each "batch element" there must be more keys than queries
+    # because this bias type is "bottom right" and so any extra queries
+    # will attend to nothing and have undefined result.
+    # In addition every element of k_seqlens must be <= kv_len
+    if q_len > kv_len:
+        raise ValueError("need more keys than values")
+    if q_len == kv_len:
+        # all key slots are needed so we cannot have padding
+        q_seqlens = k_seqlens = [kv_len] * bs
+    else:
+        q_seqlens = _rand_maxed_partition(r, q_len * bs, bs, kv_len)
+        k_seqlens = [r.randint(i, kv_len) for i in q_seqlens]
+    return q_seqlens, k_seqlens
+
+
+def _create_aligned_bias(B: int, H: int, Mq: int, Mkv: int, **kwargs) -> torch.Tensor:
+    align_to = 8
+    return (
+        torch.randn(
+            (
+                B,
+                H,
+                Mq,
+                align_to * ((Mkv + align_to - 1) // align_to),
+            ),
+            **kwargs,
+        )
+        * 3
+    )[:, :, :, :Mkv]
+
+
+def create_attn_bias(
+    bias_type,
+    batch_size: int,
+    num_heads: int,
+    q_len: int,
+    kv_len: int,
+    device,
+    dtype,
+    requires_grad: bool,
+    fmt: str,
+    op: Type[AttentionOpBase],
+):
+    if bias_type is None or isinstance(None, bias_type):
+        return None
+    r = random.Random("-".join(map(str, [batch_size, q_len, kv_len, dtype, fmt])))
+    if bias_type is torch.Tensor:
+        if fmt == "BMK":
+            batch_size *= num_heads
+            num_heads = 1
+        # `small_k` only supports an expanded 1d bias
+        if op in [fmha.small_k.FwOp, fmha.small_k.BwOp]:
+            attn_bias = (
+                torch.randn(
+                    (batch_size, num_heads, 1, kv_len), device=device, dtype=dtype
+                )
+                * 3
+            )
+            attn_bias = attn_bias.expand(batch_size, num_heads, q_len, kv_len)
+        else:
+            attn_bias = _create_aligned_bias(
+                batch_size,
+                num_heads,
+                q_len,
+                kv_len,
+                device=device,
+                dtype=dtype,
+            )
+
+            # make sure it also works if the first columns are partially masked out
+            attn_bias[0, 0, q_len - 1 :, : num_heads - 2] = -math.inf
+
+        if requires_grad:
+            attn_bias.requires_grad_(True)
+        return attn_bias
+    if bias_type is fmha.attn_bias.LowerTriangularMask:
+        return fmha.attn_bias.LowerTriangularMask()
+    if bias_type is fmha.attn_bias.LowerTriangularMaskWithTensorBias:
+        attn_bias = _create_aligned_bias(
+            batch_size,
+            num_heads,
+            q_len,
+            kv_len,
+            device=device,
+            dtype=dtype,
+        )
+        if requires_grad:
+            attn_bias.requires_grad_(True)
+        return fmha.attn_bias.LowerTriangularMaskWithTensorBias(attn_bias)
+    if bias_type in [
+        fmha.attn_bias.BlockDiagonalMask,
+        fmha.attn_bias.BlockDiagonalCausalMask,
+        fmha.attn_bias.BlockDiagonalCausalFromBottomRightMask,
+    ]:
+        # This bias is not supported in BMK format
+        assert fmt == "BMHK"
+        block_diag = fmha.attn_bias.BlockDiagonalMask.from_seqlens(
+            *_rand_seqlens(
+                r,
+                batch_size,
+                q_len,
+                kv_len,
+                more_keys_than_queries_per_block=bias_type
+                is fmha.attn_bias.BlockDiagonalCausalFromBottomRightMask,
+            )
+        )
+        if bias_type is fmha.attn_bias.BlockDiagonalCausalMask:
+            block_diag = block_diag.make_causal()
+        if bias_type is fmha.attn_bias.BlockDiagonalCausalFromBottomRightMask:
+            block_diag = block_diag.make_causal_from_bottomright()
+        return block_diag
+    if bias_type == fmha.attn_bias.BlockDiagonalCausalWithOffsetPaddedKeysMask:
+        assert fmt == "BMHK"
+        q, k = _rand_seqlens_padded_k(r, batch_size, q_len, kv_len)
+        g_block_diag = (
+            fmha.attn_bias.BlockDiagonalCausalWithOffsetPaddedKeysMask.from_seqlens(
+                q_seqlen=q,
+                kv_padding=kv_len,
+                kv_seqlen=k,
+            )
+        )
+        return g_block_diag
+
+    assert False, f"Unsupported bias type: {bias_type}"
+
+'''
+def get_bias_grad(attn_bias, clear: bool = False) -> Optional[torch.Tensor]:
+    tensor_with_grad: Optional[torch.Tensor] = None
+    if isinstance(attn_bias, torch.Tensor):
+        tensor_with_grad = attn_bias
+    if isinstance(attn_bias, fmha.attn_bias.LowerTriangularMaskWithTensorBias):
+        tensor_with_grad = attn_bias._bias
+    if tensor_with_grad is not None:
+        grad = tensor_with_grad.grad
+        if clear:
+            tensor_with_grad.grad = None
+        return grad
+    return None
+'''
+
+def create_tensors(
+    op: Type[AttentionOpBase],
+    device,
+    dtype,
+    attn_bias_type,
+    B,
+    q_len,
+    kv_len,
+    h,
+    k,
+    kv,
+    *,
+    attn_bias_requires_grad: bool = False,
+    fmt: str = "BMK",
+):
+    torch.manual_seed(B * q_len + kv_len * k + kv)
+    scale = 3
+    if fmt == "BMK":
+        query = torch.randn((B * h, q_len, k), device=device, dtype=dtype).mul_(scale)
+        key = torch.randn((B * h, kv_len, k), device=device, dtype=dtype).mul_(scale)
+        value = torch.randn((B * h, kv_len, kv), device=device, dtype=dtype).mul_(scale)
+    else:
+        assert fmt == "BMHK"
+        query = torch.randn((B, q_len, h, k), device=device, dtype=dtype).mul_(scale)
+        key = torch.randn((B, kv_len, h, k), device=device, dtype=dtype).mul_(scale)
+        value = torch.randn((B, kv_len, h, kv), device=device, dtype=dtype).mul_(scale)
+
+    if fmt == "BMK" and not fmha.common._is_bias_type_supported_in_BMK(attn_bias_type):
+        attn_bias_type = None
+    attn_bias = None
+    if attn_bias_type is not None:
+        attn_bias = create_attn_bias(
+            attn_bias_type,
+            batch_size=B,
+            num_heads=h,
+            q_len=q_len,
+            kv_len=kv_len,
+            dtype=dtype,
+            device=device,
+            requires_grad=attn_bias_requires_grad,
+            fmt=fmt,
+            op=op,
+        )
+        if isinstance(
+            attn_bias,
+            (
+                fmha.attn_bias.BlockDiagonalMask,
+                fmha.attn_bias.BlockDiagonalCausalWithOffsetPaddedKeysMask,
+            ),
+        ):
+            query, key, value = [
+                x.reshape([1, -1, *x.shape[2:]]) for x in [query, key, value]
+            ]
+
+    inputs = fmha.Inputs(query=query, key=key, value=value, attn_bias=attn_bias)
+    reasons = op.not_supported_reasons(inputs)
+    if reasons:
+        err_msg = f"{op.NAME}: unsupported ({'/'.join(reasons)})"
+        # Ensure we free memory to avoid OOMs
+        del query, key, value, attn_bias, inputs
+        pytest.skip(err_msg)
+    return query, key, value, attn_bias
+
+
+def bmhk2bmk(tensor) -> torch.Tensor:
+    return (
+        tensor.permute((0, 2, 1, 3))
+        .contiguous()
+        .view([tensor.shape[0] * tensor.shape[2], tensor.shape[1], tensor.shape[3]])
+    )
+
+
+def bmk2bmhk(tensor, num_heads: int) -> torch.Tensor:
+    return tensor.reshape([-1, num_heads, tensor.shape[1], tensor.shape[2]]).permute(
+        (0, 2, 1, 3)
+    )
+
+@pytest.mark.parametrize("k_len", [32, 64])
+@pytest.mark.parametrize("batch_size", [4])
+@pytest.mark.parametrize("kv_len", [128, 512])
+@pytest.mark.parametrize("q_len", [128, 512])
+@pytest.mark.parametrize("device", _devices)
+@pytest.mark.parametrize("test_type", _types)
+def test_key_query_all_ones(test_type, device, q_len, kv_len, batch_size, k_len):
+    scale = 3
+    query = torch.ones((batch_size, q_len, k_len), device=device, dtype=test_type)
+    key = torch.ones((batch_size, kv_len, k_len), device=device, dtype=test_type)
+    value = torch.randn((batch_size, kv_len, k_len), device=device, dtype=test_type) * scale
+
+    out = xformers.ops.memory_efficient_attention(query, key, value, op=(fmha.ck.FwOp, None))
+    # this should be equivalent to the average over value
+    ref = value.mean(1, keepdim=True).expand_as(query)
+
+    if test_type is torch.float16:
+        assert_allclose(out, ref, atol=1e-5)
+    else:
+        assert_allclose(out, ref, atol=1e-2)
+
+
diff --git a/tests/test_ck_3.py b/tests/test_ck_3.py
new file mode 100644
index 000000000..9b790c743
--- /dev/null
+++ b/tests/test_ck_3.py
@@ -0,0 +1,562 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
+#
+# This source code is licensed under the BSD license found in the
+# LICENSE file in the root directory of this source tree.
+
+import math
+import random
+from typing import List, Optional, Sequence, Tuple, Type, TypeVar
+
+import pytest
+import torch
+
+## need to FIX
+##from scipy.stats import binomtest
+from torch.utils.checkpoint import checkpoint
+
+import xformers.ops
+from xformers.ops import fmha
+from xformers.ops.fmha.common import AttentionOpBase
+
+from tests.utils import assert_allclose
+
+torch.backends.cuda.matmul.allow_tf32 = False
+cuda_only = pytest.mark.skipif(not torch.cuda.is_available(), reason="requires CUDA")
+_devices = ["cpu", "cuda"] if torch.cuda.is_available() else ["cpu"]
+
+ALL_FW_OPS: Sequence[Type[fmha.common.AttentionFwOpBase]] = [
+    fmha.ck.FwOp,
+]
+
+T = TypeVar(
+    "T", Type[fmha.common.AttentionFwOpBase], Type[fmha.common.AttentionBwOpBase]
+)
+
+def sample_random_supported_fw(
+    inp: fmha.Inputs, seed: int
+) -> Type[fmha.common.AttentionFwOpBase]:
+    r = random.Random(seed)
+    fw_ops = list(ALL_FW_OPS)
+    r.shuffle(fw_ops)
+    for op in fw_ops:
+        if op.supports(inp):
+            return op
+    raise NotImplementedError(f"Could not find a FW operator for: {inp}")
+
+
+def generate_test_shapes_B_Mq_Mkv_H_K_Kv(op):
+    shapes = []
+
+    # Add some random shapes
+    if op in [
+        fmha.ck.FwOp,
+        fmha.ck.BwOp,
+    ]:
+        K_CHOICES = [8 * i for i in range(1, 256 // 8)]
+        r = random.Random(0)
+        for _ in range(20):
+            B = r.randint(1, 400)
+            Mq = r.randint(1, 500)
+            Mkv = r.randint(1, 500)
+            H = r.randint(2, 11)
+            B = max(B // H, 1)
+            K = r.choice(K_CHOICES)
+            Kv = r.choice(K_CHOICES)
+            if not op.SUPPORTS_DIFFERENT_VALUE_EMBED:
+                Kv = K
+            shapes.append((B, Mq, Mkv, H, K, Kv))
+    return shapes
+
+
+def _generate_op_device_dtype_biasT_B_Mq_Mkv_H_K_Kv(
+    ops_list: Sequence[Type[fmha.AttentionOpBase]], max_shapes_per_op: int = 65000
+):
+    r = random.Random(0)
+    combination = []
+    ids = []
+    for op in ops_list:
+        op_count = 0
+        # Sort list of masks, so it's deterministic across runs
+        LIST_MASKS = list(
+            sorted(list(op.SUPPORTED_ATTN_BIAS_TYPES), key=lambda x: str(x))
+        )
+        for shape in generate_test_shapes_B_Mq_Mkv_H_K_Kv(op):
+            has_one = False
+            for device in _devices:
+                if device not in op.SUPPORTED_DEVICES:
+                    continue
+                for dtype in op.SUPPORTED_DTYPES:
+                    bias_type = r.choice(LIST_MASKS)
+                    # Avoid using too much memory
+                    if bias_type not in [
+                        type(None),
+                        fmha.attn_bias.LowerTriangularMask,
+                    ]:
+                        B, Mq, Mkv, H, K, Kv = shape
+                        B = min(B, 12)
+
+                        if (
+                            bias_type
+                            is fmha.attn_bias.BlockDiagonalCausalFromBottomRightMask
+                        ):
+                            Mq, Mkv = min(Mkv, Mq), max(Mkv, Mq) + 2
+                        elif (
+                            bias_type
+                            is fmha.attn_bias.BlockDiagonalCausalWithOffsetPaddedKeysMask
+                        ):
+                            Mq, Mkv = min(Mkv, Mq), max(Mkv, Mq)
+                        shape = (B, Mq, Mkv, H, K, Kv)
+                    combination.append((op, device, dtype, bias_type, *shape))
+                    ids.append(
+                        f"{op.NAME}-{device}-{str(dtype)}-{bias_type.__name__}"
+                        f"-{'-'.join([str(s) for s in shape])}"
+                    )
+                    has_one = True
+            if has_one:
+                op_count += 1
+            if op_count > max_shapes_per_op:
+                break
+        # Some specific shapes for which we want to run without any mask
+        bias_type = type(None)
+        for shape in (
+            # Some strides/dims don't fit on an uint16
+            (1, 128, 128, 300, 128, 128),
+            (13, 1, 67, 200, 8, 8),
+            (1, 1 + 2**16, 4, 1, 8, 8),
+            (1, 4, 1 + 2**16, 1, 8, 8),
+            # TODO: Some strides don't fit on an uint32
+            # Crashes on Flash, Errors on Cutlass
+            # (1, 1, 64000, 300, 128, 128)
+        ):
+            for device in _devices:
+                if device not in op.SUPPORTED_DEVICES:
+                    continue
+                for dtype in op.SUPPORTED_DTYPES:
+                    combination.append((op, device, dtype, bias_type, *shape))
+                    ids.append(
+                        f"{op.NAME}-{device}-{str(dtype)}-{bias_type.__name__}"
+                        f"-{'-'.join([str(s) for s in shape])}"
+                    )
+    return {
+        "argvalues": combination,
+        "ids": ids,
+    }
+
+
+parametrize_opFW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv = pytest.mark.parametrize(
+    "opFW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv",
+    **_generate_op_device_dtype_biasT_B_Mq_Mkv_H_K_Kv(ALL_FW_OPS),
+)
+parametrize_opFW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv__xs = pytest.mark.parametrize(
+    "opFW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv",
+    **_generate_op_device_dtype_biasT_B_Mq_Mkv_H_K_Kv(ALL_FW_OPS, max_shapes_per_op=1),
+)
+
+def ref_attention(q, k, v, attn_bias=None, drop_mask=None, p=0.0, scale=None):
+    if q.ndim == 4:
+        assert p == 0.0
+        return ref_attention_bmhk(q, k, v, attn_bias=attn_bias)
+    q = q.float()
+    k = k.float()
+    v = v.float()
+
+    scale = scale if scale is not None else (1 / q.shape[-1] ** 0.5)
+    q = q * scale
+
+    attn = q @ k.transpose(-2, -1)
+    if attn_bias is not None:
+        if isinstance(attn_bias, xformers.ops.AttentionBias):
+            # Always create in B,H,Mq,Mk format
+            attn_bias_tensor = attn_bias.materialize(
+                (q.shape[0], 1, q.shape[1], k.shape[1]),
+                device=q.device,
+                dtype=torch.float32,
+            )
+        else:
+            attn_bias_tensor = attn_bias
+        if attn_bias_tensor.ndim == 4:
+            assert q.shape[0] == attn_bias_tensor.shape[0] * attn_bias_tensor.shape[1]
+            attn_bias_tensor = attn_bias_tensor.reshape(
+                [-1, *attn_bias_tensor.shape[2:]]
+            )
+        attn = attn + attn_bias_tensor.float()
+    attn = attn.softmax(-1)
+    if drop_mask is not None:
+        attn = attn * (drop_mask / (1 - p))
+    return attn @ v
+
+
+def ref_attention_bmhk(q, k, v, attn_bias, scale=None) -> torch.Tensor:
+    assert q.ndim == 4
+
+    def T(t):
+        return t.permute((0, 2, 1, 3)).reshape(
+            [t.shape[0] * t.shape[2], t.shape[1], t.shape[3]]
+        )
+
+    if isinstance(attn_bias, xformers.ops.AttentionBias):
+        attn_bias = attn_bias.materialize(
+            (q.shape[0], q.shape[2], q.shape[1], k.shape[1]),
+            device=q.device,
+            dtype=torch.float32,
+        ).reshape([q.shape[0] * q.shape[2], q.shape[1], k.shape[1]])
+    out = ref_attention(T(q), T(k), T(v), attn_bias, scale=scale)
+    out = out.reshape([q.shape[0], q.shape[2], q.shape[1], v.shape[3]])
+    return out.permute((0, 2, 1, 3))
+
+
+def _rand_seqlens(
+    r: random.Random,
+    bs: int,
+    q_len: int,
+    kv_len: int,
+    more_keys_than_queries_per_block: bool,
+) -> Tuple[Sequence[int], Sequence[int]]:
+    """
+    Generates lists of lengths of query blocks and corresponding key blocks.
+    The total number of queries will be bs * q_len and the
+    total number of keys will be bs * kv_len.
+    """
+    if more_keys_than_queries_per_block:
+        assert kv_len >= q_len
+    q_len *= bs
+    kv_len *= bs
+    seqlens_q: List[int] = []
+    seqlens_k: List[int] = []
+
+    step_q = [max(1, q_len // 10), max(2, q_len // 2)]
+    step_k = [max(1, kv_len // 10), max(2, kv_len // 2)]
+    while sum(seqlens_q) < q_len and sum(seqlens_k) < kv_len:
+        num_queries = r.randrange(*step_q)
+        seqlens_q.append(num_queries)
+
+        if more_keys_than_queries_per_block:
+            # Must select at least `num_queries` keys
+            # But also leave enough keys for later
+            keys_left = kv_len - sum(seqlens_k, 0)
+            queries_left = q_len - sum(seqlens_q[:-1], 0)
+            assert keys_left >= queries_left
+            seqlens_k.append(num_queries + r.randrange(0, keys_left - queries_left))
+        else:
+            seqlens_k.append(r.randrange(*step_k))
+    seqlens_q[-1] = q_len - sum(seqlens_q[:-1])
+    seqlens_k[-1] = kv_len - sum(seqlens_k[:-1])
+    return seqlens_q, seqlens_k
+
+
+def _rand_partition(r: random.Random, total: int, n: int) -> List[int]:
+    # returns list of n nonnegative integers summing to total
+    idx = {0, total}
+    while len(idx) < n + 1:
+        idx.add(r.randint(1, total - 1))
+    s = sorted(idx)
+    return [e - b for b, e in zip(s[:-1], s[1:])]
+
+
+def _rand_maxed_partition(
+    r: random.Random, total: int, n: int, mx: int, positive: bool = True
+) -> List[int]:
+    # returns list of n nonnegative integers less than mx summing to total
+    # NB: This is unfortunately biased towards evenly-split bins.
+    # If `positive`, outputs are positive
+    if positive:
+        total -= n
+        mx -= 1
+    idxs = r.sample(range(n * mx), total)
+    y = torch.zeros(n, mx, dtype=torch.int32)
+    y.flatten()[idxs] = 1
+    z = y.sum(1)
+    if positive:
+        z += 1
+    return z.tolist()
+
+
+def _rand_seqlens_padded_k(
+    r: random.Random, bs: int, q_len: int, kv_len: int
+) -> Tuple[Sequence[int], Sequence[int]]:
+    # This is for BlockDiagonalCausalWithOffsetPaddedKeysMask.
+    # we need q_seqlens and k_seqlens to be of len bsz.
+    # For each "batch element" there must be more keys than queries
+    # because this bias type is "bottom right" and so any extra queries
+    # will attend to nothing and have undefined result.
+    # In addition every element of k_seqlens must be <= kv_len
+    if q_len > kv_len:
+        raise ValueError("need more keys than values")
+    if q_len == kv_len:
+        # all key slots are needed so we cannot have padding
+        q_seqlens = k_seqlens = [kv_len] * bs
+    else:
+        q_seqlens = _rand_maxed_partition(r, q_len * bs, bs, kv_len)
+        k_seqlens = [r.randint(i, kv_len) for i in q_seqlens]
+    return q_seqlens, k_seqlens
+
+
+def _create_aligned_bias(B: int, H: int, Mq: int, Mkv: int, **kwargs) -> torch.Tensor:
+    align_to = 8
+    return (
+        torch.randn(
+            (
+                B,
+                H,
+                Mq,
+                align_to * ((Mkv + align_to - 1) // align_to),
+            ),
+            **kwargs,
+        )
+        * 3
+    )[:, :, :, :Mkv]
+
+
+def create_attn_bias(
+    bias_type,
+    batch_size: int,
+    num_heads: int,
+    q_len: int,
+    kv_len: int,
+    device,
+    dtype,
+    requires_grad: bool,
+    fmt: str,
+    op: Type[AttentionOpBase],
+):
+    if bias_type is None or isinstance(None, bias_type):
+        return None
+    r = random.Random("-".join(map(str, [batch_size, q_len, kv_len, dtype, fmt])))
+    if bias_type is torch.Tensor:
+        if fmt == "BMK":
+            batch_size *= num_heads
+            num_heads = 1
+        # `small_k` only supports an expanded 1d bias
+        if op in [fmha.small_k.FwOp, fmha.small_k.BwOp]:
+            attn_bias = (
+                torch.randn(
+                    (batch_size, num_heads, 1, kv_len), device=device, dtype=dtype
+                )
+                * 3
+            )
+            attn_bias = attn_bias.expand(batch_size, num_heads, q_len, kv_len)
+        else:
+            attn_bias = _create_aligned_bias(
+                batch_size,
+                num_heads,
+                q_len,
+                kv_len,
+                device=device,
+                dtype=dtype,
+            )
+
+            # make sure it also works if the first columns are partially masked out
+            attn_bias[0, 0, q_len - 1 :, : num_heads - 2] = -math.inf
+
+        if requires_grad:
+            attn_bias.requires_grad_(True)
+        return attn_bias
+    if bias_type is fmha.attn_bias.LowerTriangularMask:
+        return fmha.attn_bias.LowerTriangularMask()
+    if bias_type is fmha.attn_bias.LowerTriangularMaskWithTensorBias:
+        attn_bias = _create_aligned_bias(
+            batch_size,
+            num_heads,
+            q_len,
+            kv_len,
+            device=device,
+            dtype=dtype,
+        )
+        if requires_grad:
+            attn_bias.requires_grad_(True)
+        return fmha.attn_bias.LowerTriangularMaskWithTensorBias(attn_bias)
+    if bias_type in [
+        fmha.attn_bias.BlockDiagonalMask,
+        fmha.attn_bias.BlockDiagonalCausalMask,
+        fmha.attn_bias.BlockDiagonalCausalFromBottomRightMask,
+    ]:
+        # This bias is not supported in BMK format
+        assert fmt == "BMHK"
+        block_diag = fmha.attn_bias.BlockDiagonalMask.from_seqlens(
+            *_rand_seqlens(
+                r,
+                batch_size,
+                q_len,
+                kv_len,
+                more_keys_than_queries_per_block=bias_type
+                is fmha.attn_bias.BlockDiagonalCausalFromBottomRightMask,
+            )
+        )
+        if bias_type is fmha.attn_bias.BlockDiagonalCausalMask:
+            block_diag = block_diag.make_causal()
+        if bias_type is fmha.attn_bias.BlockDiagonalCausalFromBottomRightMask:
+            block_diag = block_diag.make_causal_from_bottomright()
+        return block_diag
+    if bias_type == fmha.attn_bias.BlockDiagonalCausalWithOffsetPaddedKeysMask:
+        assert fmt == "BMHK"
+        q, k = _rand_seqlens_padded_k(r, batch_size, q_len, kv_len)
+        g_block_diag = (
+            fmha.attn_bias.BlockDiagonalCausalWithOffsetPaddedKeysMask.from_seqlens(
+                q_seqlen=q,
+                kv_padding=kv_len,
+                kv_seqlen=k,
+            )
+        )
+        return g_block_diag
+
+    assert False, f"Unsupported bias type: {bias_type}"
+
+def create_tensors(
+    op: Type[AttentionOpBase],
+    device,
+    dtype,
+    attn_bias_type,
+    B,
+    q_len,
+    kv_len,
+    h,
+    k,
+    kv,
+    *,
+    attn_bias_requires_grad: bool = False,
+    fmt: str = "BMK",
+):
+    torch.manual_seed(B * q_len + kv_len * k + kv)
+    scale = 3
+    if fmt == "BMK":
+        query = torch.randn((B * h, q_len, k), device=device, dtype=dtype).mul_(scale)
+        key = torch.randn((B * h, kv_len, k), device=device, dtype=dtype).mul_(scale)
+        value = torch.randn((B * h, kv_len, kv), device=device, dtype=dtype).mul_(scale)
+    else:
+        assert fmt == "BMHK"
+        query = torch.randn((B, q_len, h, k), device=device, dtype=dtype).mul_(scale)
+        key = torch.randn((B, kv_len, h, k), device=device, dtype=dtype).mul_(scale)
+        value = torch.randn((B, kv_len, h, kv), device=device, dtype=dtype).mul_(scale)
+
+    if fmt == "BMK" and not fmha.common._is_bias_type_supported_in_BMK(attn_bias_type):
+        attn_bias_type = None
+    attn_bias = None
+    if attn_bias_type is not None:
+        attn_bias = create_attn_bias(
+            attn_bias_type,
+            batch_size=B,
+            num_heads=h,
+            q_len=q_len,
+            kv_len=kv_len,
+            dtype=dtype,
+            device=device,
+            requires_grad=attn_bias_requires_grad,
+            fmt=fmt,
+            op=op,
+        )
+        if isinstance(
+            attn_bias,
+            (
+                fmha.attn_bias.BlockDiagonalMask,
+                fmha.attn_bias.BlockDiagonalCausalWithOffsetPaddedKeysMask,
+            ),
+        ):
+            query, key, value = [
+                x.reshape([1, -1, *x.shape[2:]]) for x in [query, key, value]
+            ]
+
+    inputs = fmha.Inputs(query=query, key=key, value=value, attn_bias=attn_bias)
+    reasons = op.not_supported_reasons(inputs)
+    if reasons:
+        err_msg = f"{op.NAME}: unsupported ({'/'.join(reasons)})"
+        # Ensure we free memory to avoid OOMs
+        del query, key, value, attn_bias, inputs
+        pytest.skip(err_msg)
+    return query, key, value, attn_bias
+
+
+def bmhk2bmk(tensor) -> torch.Tensor:
+    return (
+        tensor.permute((0, 2, 1, 3))
+        .contiguous()
+        .view([tensor.shape[0] * tensor.shape[2], tensor.shape[1], tensor.shape[3]])
+    )
+
+
+def bmk2bmhk(tensor, num_heads: int) -> torch.Tensor:
+    return tensor.reshape([-1, num_heads, tensor.shape[1], tensor.shape[2]]).permute(
+        (0, 2, 1, 3)
+    )
+
+'''
+SUPPORTED_ATTN_BIAS_TYPES: Set[Any] = {
+        type(None),
+        torch.Tensor,
+        LowerTriangularMask,
+        LowerTriangularMaskWithTensorBias,
+        BlockDiagonalMask,
+        BlockDiagonalCausalMask,
+        BlockDiagonalCausalWithOffsetPaddedKeysMask,
+        attn_bias.BlockDiagonalCausalFromBottomRightMask,
+'''
+
+@pytest.mark.parametrize("packed", [False, True])
+@pytest.mark.parametrize("fmt", ["BMHK"])
+def test_forward(fmt, packed):
+    op = fmha.ck.FwOp
+    device = torch.device("cuda")
+    dtype = torch.float16 
+    bias_type = fmha.attn_bias.LowerTriangularMask
+    batch_size = 7 
+    q_len = 1000
+    kv_len = 1000
+    h = 3
+    k = 64
+    kv = 64
+
+    if packed and not (k == kv and q_len == kv_len):
+        pytest.skip(
+            f"packed incompatible with `k ({k}) != kv ({kv})` or `q_len ({q_len}) != kv_len ({kv_len})`"
+        )
+    if fmt == "BMK" and not fmha.common._is_bias_type_supported_in_BMK(bias_type):
+        pytest.skip("BMK incompatible with this bias")
+
+    query, key, value, attn_bias = create_tensors(
+        op, device, dtype, bias_type, batch_size, q_len, kv_len, h, k, kv, fmt="BMHK" if packed else fmt
+    )
+
+    if packed:
+        c = torch.stack([query, key, value], 2)
+        if fmt == "BMK":
+            # bm3hk -> 3bhmk -> 3Bmk
+            c = c.permute(2, 0, 3, 1, 4).view([3, -1, q_len, k])
+            query, key, value = c[0], c[1], c[2]
+            # Re-create bias in the right format
+            attn_bias = create_attn_bias(
+                bias_type=bias_type,
+                batch_size=batch_size,
+                num_heads=h,
+                q_len=q_len,
+                kv_len=kv_len,
+                device=device,
+                dtype=dtype,
+                requires_grad=False,
+                fmt=fmt,
+                op=op,
+            )
+        else:
+            # bm3hk -> 3 x bmhk
+            query, key, value = xformers.ops.unbind(c, 2)
+        assert not query.is_contiguous()
+
+    out = xformers.ops.memory_efficient_attention_forward(
+        query, key, value, attn_bias, op=op
+    )
+    assert not out.isnan().any(), ("Output has NaNs", attn_bias)
+    out2 = xformers.ops.memory_efficient_attention_forward(
+        query, key, value, attn_bias, op=op
+    )
+    assert torch.allclose(out, out2, atol=0.0, rtol=0.0), (
+        "Non-deterministic behavior",
+        attn_bias,
+    )
+
+    ref = ref_attention(query, key, value, attn_bias)
+    assert out.shape == ref.shape, out.shape
+    assert_allclose(
+        out.float(),
+        ref,
+        atol=op.ERROR_ATOL[dtype],
+        rtol=op.ERROR_RTOL.get(dtype, 1e-5),
+    )
+

From 674b4574d5dce4ea4f8624a96aba51a947401658 Mon Sep 17 00:00:00 2001
From: Qianfeng Zhang <Qianfeng.Zhang@amd.com>
Date: Tue, 15 Aug 2023 12:20:07 +0000
Subject: [PATCH 015/641] Update to synchronize with the change in CK
 FlashAttentin forward for simplifying the interfaces

---
 .../hip_fmha/ck_fmha_batched_forward.h        | 56 ++++++++-----------
 .../hip_fmha/ck_fmha_grouped_forward.h        | 54 ++++++++----------
 .../csrc/attention/hip_fmha/ck_fmha_test.cpp  | 21 +++++++
 3 files changed, 68 insertions(+), 63 deletions(-)
 create mode 100644 xformers/csrc/attention/hip_fmha/ck_fmha_test.cpp

diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_forward.h b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_forward.h
index 8c2c8f046..eb7c85bb1 100644
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_forward.h
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_forward.h
@@ -58,9 +58,9 @@ void batched_forward_masktype_attnbias_dispatched(
   using CDataType = scalar_t;
   using ZDataType = unsigned short;
   using LSEDataType = F32;
-  using Acc0BiasDataType = typename std::
-      conditional<has_attn_bias, ck::Tuple<scalar_t>, ck::Tuple<>>::type;
-  using Acc1BiasDataType = ck::Tuple<>;
+  using Acc0BiasDataType =
+      typename std::conditional<has_attn_bias, scalar_t, void>::type;
+  using Acc1BiasDataType = void;
 
   static constexpr ck::index_t NumDimG = 2;
   static constexpr ck::index_t NumDimM = 1;
@@ -217,28 +217,20 @@ void batched_forward_masktype_attnbias_dispatched(
 
   std::vector<ck::index_t> lse_gs_ms_lengths{param.B, param.num_heads, param.M};
 
-  auto bias_ptr_lengths_strides = [&]() {
-    if constexpr (has_attn_bias) {
-      auto bias_ptr_arr =
-          std::array<void*, 1>{const_cast<void*>(param.attn_bias_ptr)};
-      std::vector<ck::index_t> d_gs_ms_ns_lengths{
-          param.B, param.num_heads, param.M, param.N};
-      std::vector<ck::index_t> d_gs_ms_ns_strides{
-          param.attn_bias_strides[0],
-          param.attn_bias_strides[1],
-          param.attn_bias_strides[2],
-          param.attn_bias_strides[3]};
-      auto bias_lengths_arr =
-          std::array<std::vector<ck::index_t>, 1>{d_gs_ms_ns_lengths};
-      auto bias_strides_arr =
-          std::array<std::vector<ck::index_t>, 1>{d_gs_ms_ns_strides};
-      return std::make_tuple(bias_ptr_arr, bias_lengths_arr, bias_strides_arr);
-    } else
-      return std::make_tuple(
-          std::array<void*, 0>{},
-          std::array<std::vector<ck::index_t>, 0>{},
-          std::array<std::vector<ck::index_t>, 0>{});
-  }();
+  std::vector<ck::index_t> d_gs_ms_ns_lengths;
+  std::vector<ck::index_t> d_gs_ms_ns_strides;
+
+  if constexpr (has_attn_bias) {
+    d_gs_ms_ns_lengths = {param.B, param.num_heads, param.M, param.N};
+    d_gs_ms_ns_strides = {
+        param.attn_bias_strides[0],
+        param.attn_bias_strides[1],
+        param.attn_bias_strides[2],
+        param.attn_bias_strides[3]};
+  } else {
+    d_gs_ms_ns_lengths = {1, 1, 1, 1};
+    d_gs_ms_ns_strides = {0, 0, 0, 0};
+  };
 
   float alpha = param.scale;
 
@@ -262,8 +254,8 @@ void batched_forward_masktype_attnbias_dispatched(
       param.out_ptr,
       param.randvals_ptr,
       param.logsumexp_ptr,
-      std::get<0>(bias_ptr_lengths_strides),
-      {}, // std::array<void*, 1> p_acc1_biases;
+      param.has_attn_bias ? param.attn_bias_ptr : nullptr,
+      {}, // p_acc1_biases;
       a_gs_ms_ks_lengths,
       a_gs_ms_ks_strides,
       b0_gs_ns_ks_lengths,
@@ -275,12 +267,10 @@ void batched_forward_masktype_attnbias_dispatched(
       z_gs_ms_ns_lengths,
       z_gs_ms_ns_strides,
       lse_gs_ms_lengths,
-      std::get<1>(bias_ptr_lengths_strides),
-      std::get<2>(bias_ptr_lengths_strides),
-      {}, // std::array<std::vector<ck::index_t>,
-          // 1>{acc1_biases_gs_ms_os_lengths},
-      {}, // std::array<std::vector<ck::index_t>,
-          // 1>{acc1_biases_gs_ms_os_strides},
+      d_gs_ms_ns_lengths,
+      d_gs_ms_ns_strides,
+      {}, // acc1_biases_gs_ms_os_lengths
+      {}, // acc1_biases_gs_ms_os_strides,
       a_element_op,
       b0_element_op,
       acc0_element_op,
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_forward.h b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_forward.h
index 80f5f8aa5..3e9fc813f 100644
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_forward.h
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_forward.h
@@ -59,9 +59,9 @@ void grouped_forward_masktype_attnbias_dispatched(
   using CDataType = scalar_t;
   using ZDataType = unsigned short;
   using LSEDataType = F32;
-  using Acc0BiasDataType = typename std::
-      conditional<has_attn_bias, ck::Tuple<scalar_t>, ck::Tuple<>>::type;
-  using Acc1BiasDataType = ck::Tuple<>;
+  using Acc0BiasDataType =
+      typename std::conditional<has_attn_bias, scalar_t, void>::type;
+  using Acc1BiasDataType = void;
 
   static constexpr ck::index_t NumDimG = 2;
   static constexpr ck::index_t NumDimM = 1;
@@ -170,26 +170,6 @@ void grouped_forward_masktype_attnbias_dispatched(
 
   std::vector<typename DeviceOpInstance::ProblemDesc> problem_descs;
 
-  auto func_bias_lengths_strides = [&](int G1, int M, int N) {
-    if constexpr (has_attn_bias) {
-      std::vector<ck::index_t> d_gs_ms_ns_lengths{1, G1, M, N};
-      std::vector<ck::index_t> d_gs_ms_ns_strides{
-          0,
-          param.attn_bias_strides[0],
-          param.attn_bias_strides[1],
-          param.attn_bias_strides[2]};
-
-      auto bias_lengths_arr =
-          std::vector<std::vector<ck::index_t>>{d_gs_ms_ns_lengths};
-      auto bias_strides_arr =
-          std::vector<std::vector<ck::index_t>>{d_gs_ms_ns_strides};
-      return std::make_tuple(bias_lengths_arr, bias_strides_arr);
-    } else
-      return std::make_tuple(
-          std::vector<std::vector<ck::index_t>>{},
-          std::vector<std::vector<ck::index_t>>{});
-  };
-
   for (std::size_t i = 0; i < param.num_batches; i++) {
     int M = param.host_seqstart_q[i + 1] - param.host_seqstart_q[i];
     int N = param.host_seqlen_k.empty()
@@ -226,7 +206,21 @@ void grouped_forward_masktype_attnbias_dispatched(
     std::vector<ck::index_t> lse_gs_ms_lengths{1, G1, M};
     std::vector<ck::index_t> lse_gs_ms_strides{0, param.M, 1};
 
-    auto bias_lengths_strides = func_bias_lengths_strides(G1, M, N);
+    std::vector<ck::index_t> d_gs_ms_ns_lengths;
+    std::vector<ck::index_t> d_gs_ms_ns_strides;
+
+    if constexpr (has_attn_bias) {
+      d_gs_ms_ns_lengths = {1, G1, M, N};
+      d_gs_ms_ns_strides = {
+          0,
+          param.attn_bias_strides[0],
+          param.attn_bias_strides[1],
+          param.attn_bias_strides[2]};
+
+    } else {
+      d_gs_ms_ns_lengths = {1, 1, 1, 1};
+      d_gs_ms_ns_strides = {0, 0, 0, 0};
+    };
 
     problem_descs.push_back(
         {a_gs_ms_ks_lengths,
@@ -241,10 +235,10 @@ void grouped_forward_masktype_attnbias_dispatched(
          z_gs_ms_ns_strides,
          lse_gs_ms_lengths,
          lse_gs_ms_strides,
-         std::get<0>(bias_lengths_strides),
-         std::get<1>(bias_lengths_strides),
-         {}, // acc1_biases_gs_ms_os_lengths
-         {}}); // acc1_biases_gs_ms_os_strides
+         d_gs_ms_ns_lengths,
+         d_gs_ms_ns_strides,
+         {}, // acc1_bias_gs_ms_os_lengths
+         {}}); // acc1_bias_gs_ms_os_strides
   }
 
   // TODO, how to initialize seed, offset
@@ -269,7 +263,7 @@ void grouped_forward_masktype_attnbias_dispatched(
       param.out_ptrs,
       param.randvals_ptrs,
       param.logsumexp_ptrs,
-      std::vector<std::vector<const void*>>{param.attn_bias_ptrs},
+      param.attn_bias_ptrs,
       {}, // p_acc1_biases
       problem_descs,
       a_element_op,
@@ -277,7 +271,7 @@ void grouped_forward_masktype_attnbias_dispatched(
       acc0_element_op,
       b1_element_op,
       c_element_op,
-      param.dropout_prob, // dropout ratio
+      param.use_dropout ? param.dropout_prob : 0.0f, // dropout ratio
       {seed, offset});
 
   SimpleDeviceMem workspace(op.GetWorkSpaceSize(arg_ptr.get()));
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_test.cpp b/xformers/csrc/attention/hip_fmha/ck_fmha_test.cpp
new file mode 100644
index 000000000..1b451b5f9
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_test.cpp
@@ -0,0 +1,21 @@
+#include <iostream>
+
+#include <torch/library.h>
+
+namespace {
+
+// For testing xFormers building and binding
+bool is_ck_fmha_available(double val) {
+  std::cout << "ck fmha is really here, val=" << val << std::endl;
+  return (true);
+};
+
+} // namespace
+
+TORCH_LIBRARY_FRAGMENT(xformers, m) {
+  m.def(TORCH_SELECTIVE_SCHEMA(
+      "xformers::is_ck_fmha_available(float val) -> bool"));
+  m.impl(
+      TORCH_SELECTIVE_NAME("xformers::is_ck_fmha_available"),
+      TORCH_FN(is_ck_fmha_available));
+}

From 121f4a2e0c8dda0eda086c081878d63bd45aa805 Mon Sep 17 00:00:00 2001
From: Qianfeng Zhang <Qianfeng.Zhang@amd.com>
Date: Tue, 15 Aug 2023 21:27:47 +0000
Subject: [PATCH 016/641] Update to use vector size 1 to enable all A/B/B1/C
 sizes for testing

---
 .../hip_fmha/attention_forward_generic.cpp        |  2 ++
 .../attention/hip_fmha/ck_fmha_batched_forward.h  | 15 ++++++++++-----
 .../attention/hip_fmha/ck_fmha_grouped_forward.h  | 15 ++++++++++-----
 3 files changed, 22 insertions(+), 10 deletions(-)

diff --git a/xformers/csrc/attention/hip_fmha/attention_forward_generic.cpp b/xformers/csrc/attention/hip_fmha/attention_forward_generic.cpp
index 25afc5b07..920ec43aa 100644
--- a/xformers/csrc/attention/hip_fmha/attention_forward_generic.cpp
+++ b/xformers/csrc/attention/hip_fmha/attention_forward_generic.cpp
@@ -348,11 +348,13 @@ efficient_attention_forward_ck(
     if (!seqstart_q.has_value()) { // input is batched
       BatchedForwardParams batched_forward_params;
 
+      std::cout << " -------- call batched_forward ---------" << std::endl; 
       set_batched_forward_params(batched_forward_params);
       batched_forward<scalar_t>(batched_forward_params, stream);
     } else { // input is grouped
       GroupedForwardParams grouped_forward_params;
 
+      std::cout << " -------- call grouped_forward ---------" << std::endl; 
       set_grouped_forward_params(grouped_forward_params);
       grouped_forward<scalar_t>(grouped_forward_params, stream);
     }
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_forward.h b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_forward.h
index eb7c85bb1..5cb94229d 100644
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_forward.h
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_forward.h
@@ -90,6 +90,11 @@ void batched_forward_masktype_attnbias_dispatched(
       ck::tensor_operation::device::TensorSpecialization::Default;
   static constexpr bool Deterministic = false;
 
+  // Tunables
+  static constexpr ck::index_t ABBlockTransferSrcScalarPerVector = 1;
+  static constexpr ck::index_t B1CShuffleBlockTransferScalarPerVector = 1;
+  static constexpr ck::index_t Acc0BiasTransferSrcScalarPerVector = 1;
+
   using DeviceOpInstance = ck::tensor_operation::device::
       DeviceBatchedMultiheadAttentionForward_Xdl_CShuffle_V2<
           NumDimG,
@@ -138,22 +143,22 @@ void batched_forward_masktype_attnbias_dispatched(
           S<1, 0, 2>,
           S<1, 0, 2>,
           2,
-          8,
+          ABBlockTransferSrcScalarPerVector, // TUNABLE
           8,
           true,
           S<4, 64, 1>, // BBlockTransfer
           S<1, 0, 2>,
           S<1, 0, 2>,
           2,
-          8,
+          ABBlockTransferSrcScalarPerVector, // TUNABLE
           8,
           true,
-          4,
+          Acc0BiasTransferSrcScalarPerVector, // TUNABLE
           S<16, 16, 1>, // B1BlockTransfer
           S<0, 2, 1>,
           S<0, 2, 1>,
           1,
-          4,
+          B1CShuffleBlockTransferScalarPerVector, // TUNABLE
           2,
           false,
           1, // CShuffleMXdlPerWavePerShuffle
@@ -162,7 +167,7 @@ void batched_forward_masktype_attnbias_dispatched(
             32,
             1,
             8>, // CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock
-          8, // CShuffleBlockTransferScalarPerVector_NPerBlock
+          B1CShuffleBlockTransferScalarPerVector, // TUNABLE
           4,
           MaskingSpec, // MaskingSpecialization
           Deterministic>;
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_forward.h b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_forward.h
index 3e9fc813f..97efabfe5 100644
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_forward.h
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_forward.h
@@ -91,6 +91,11 @@ void grouped_forward_masktype_attnbias_dispatched(
       ck::tensor_operation::device::TensorSpecialization::Default;
   static constexpr bool Deterministic = true;
 
+  // Tunables
+  static constexpr ck::index_t ABBlockTransferSrcScalarPerVector = 1;
+  static constexpr ck::index_t B1CShuffleBlockTransferScalarPerVector = 1;
+  static constexpr ck::index_t Acc0BiasTransferSrcScalarPerVector = 1;
+
   using DeviceOpInstance = ck::tensor_operation::device::
       DeviceGroupedMultiheadAttentionForward_Xdl_CShuffle_V2<
           NumDimG,
@@ -139,22 +144,22 @@ void grouped_forward_masktype_attnbias_dispatched(
           S<1, 0, 2>,
           S<1, 0, 2>,
           2,
-          8,
+          ABBlockTransferSrcScalarPerVector, // TUNABLE
           8,
           true,
           S<4, 64, 1>, // BBlockTransfer
           S<1, 0, 2>,
           S<1, 0, 2>,
           2,
-          8,
+          ABBlockTransferSrcScalarPerVector, // TUNABLE
           8,
           true,
-          1,
+          Acc0BiasTransferSrcScalarPerVector,
           S<16, 16, 1>, // B1BlockTransfer
           S<0, 2, 1>,
           S<0, 2, 1>,
           1,
-          4,
+          B1CShuffleBlockTransferScalarPerVector, // TUNABLE
           2,
           false,
           1, // CShuffleMXdlPerWavePerShuffle
@@ -163,7 +168,7 @@ void grouped_forward_masktype_attnbias_dispatched(
             32,
             1,
             8>, // CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock
-          8, // CShuffleBlockTransferScalarPerVector_NPerBlock
+          B1CShuffleBlockTransferScalarPerVector, // TUNABLE
           1,
           MaskingSpec, // MaskingSpecialization
           Deterministic>;

From 091e73960125ad2f98184b7783607fda2edecef6 Mon Sep 17 00:00:00 2001
From: Qianfeng Zhang <Qianfeng.Zhang@amd.com>
Date: Tue, 15 Aug 2023 21:29:22 +0000
Subject: [PATCH 017/641] Add test_ck_4.py which passed the BMHK for four mask
 situations(none,Biastensor,LowerTriangular,LowerTriangularWithBiasTensor)

---
 tests/test_ck_4.py | 581 +++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 581 insertions(+)
 create mode 100644 tests/test_ck_4.py

diff --git a/tests/test_ck_4.py b/tests/test_ck_4.py
new file mode 100644
index 000000000..ed58804c2
--- /dev/null
+++ b/tests/test_ck_4.py
@@ -0,0 +1,581 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
+#
+# This source code is licensed under the BSD license found in the
+# LICENSE file in the root directory of this source tree.
+
+import math
+import random
+from typing import List, Optional, Sequence, Tuple, Type, TypeVar, Any, Set
+
+import pytest
+import torch
+
+## need to FIX
+##from scipy.stats import binomtest
+from torch.utils.checkpoint import checkpoint
+
+import xformers.ops
+from xformers.ops import fmha
+from xformers.ops.fmha.common import AttentionOpBase
+
+from .utils import assert_allclose
+
+from xformers.ops.fmha.attn_bias import (
+    AttentionBias,
+    BlockDiagonalCausalMask,
+    BlockDiagonalCausalWithOffsetPaddedKeysMask,
+    BlockDiagonalMask,
+    BlockDiagonalCausalFromBottomRightMask,
+    LowerTriangularMask,
+    LowerTriangularMaskWithTensorBias,
+)
+
+torch.backends.cuda.matmul.allow_tf32 = False
+cuda_only = pytest.mark.skipif(not torch.cuda.is_available(), reason="requires CUDA")
+_devices = ["cuda"] if torch.cuda.is_available() else ["cpu"]
+
+ALL_FW_OPS: Sequence[Type[fmha.common.AttentionFwOpBase]] = [
+    fmha.ck.FwOp,
+]
+
+T = TypeVar(
+    "T", Type[fmha.common.AttentionFwOpBase], Type[fmha.common.AttentionBwOpBase]
+)
+
+def sample_random_supported_fw(
+    inp: fmha.Inputs, seed: int
+) -> Type[fmha.common.AttentionFwOpBase]:
+    r = random.Random(seed)
+    fw_ops = list(ALL_FW_OPS)
+    r.shuffle(fw_ops)
+    for op in fw_ops:
+        if op.supports(inp):
+            return op
+    raise NotImplementedError(f"Could not find a FW operator for: {inp}")
+
+
+def generate_test_shapes_B_Mq_Mkv_H_K_Kv(op):
+    shapes = []
+    # Add some random shapes
+    if op in [
+        fmha.ck.FwOp,
+        fmha.ck.BwOp,
+    ]:
+        K_CHOICES = [8 * i for i in range(1, 256 // 8)]
+        r = random.Random(0)
+        for _ in range(20):
+            B = r.randint(4, 400)
+            Mq = r.randint(4, 500)
+            Mkv = r.randint(4, 500)
+            H = r.randint(2, 11)
+            B = max(B // H, 4)
+            K = r.choice(K_CHOICES)
+            Kv = r.choice(K_CHOICES)
+            if not op.SUPPORTS_DIFFERENT_VALUE_EMBED:
+                Kv = K
+            shapes.append((B, Mq, Mkv, H, K, Kv))
+    return shapes
+
+
+SUPPORTED_ATTN_BIAS_TYPES: Set[Any] = {
+        type(None),
+        #torch.Tensor,
+        #LowerTriangularMask,
+        #LowerTriangularMaskWithTensorBias,
+        ##BlockDiagonalMask,
+        ##BlockDiagonalCausalMask,
+        ##BlockDiagonalCausalWithOffsetPaddedKeysMask,
+        ##BlockDiagonalCausalFromBottomRightMask,
+    }
+
+SUPPORTED_DTYPES: Set[torch.dtype] = {torch.half}
+
+def _generate_op_device_dtype_biasT_B_Mq_Mkv_H_K_Kv(
+    ops_list: Sequence[Type[fmha.AttentionOpBase]], max_shapes_per_op: int = 65000
+):
+    r = random.Random(0)
+    combination = []
+    ids = []
+    for op in ops_list:
+        op_count = 0
+        # Sort list of masks, so it's deterministic across runs
+        LIST_MASKS = list(
+            sorted(list(SUPPORTED_ATTN_BIAS_TYPES), key=lambda x: str(x))
+        )
+        for shape in generate_test_shapes_B_Mq_Mkv_H_K_Kv(op):
+            has_one = False
+            for device in _devices:
+                if device not in op.SUPPORTED_DEVICES:
+                    continue
+                ##for dtype in op.SUPPORTED_DTYPES:
+                for dtype in SUPPORTED_DTYPES:
+                    bias_type = r.choice(LIST_MASKS)
+                    # Avoid using too much memory
+                    if bias_type not in [
+                        type(None),
+                        fmha.attn_bias.LowerTriangularMask,
+                    ]:
+                        B, Mq, Mkv, H, K, Kv = shape
+                        B = min(B, 12)
+
+                        if (
+                            bias_type
+                            is fmha.attn_bias.BlockDiagonalCausalFromBottomRightMask
+                        ):
+                            Mq, Mkv = min(Mkv, Mq), max(Mkv, Mq) + 2
+                        elif (
+                            bias_type
+                            is fmha.attn_bias.BlockDiagonalCausalWithOffsetPaddedKeysMask
+                        ):
+                            Mq, Mkv = min(Mkv, Mq), max(Mkv, Mq)
+                        shape = (B, Mq, Mkv, H, K, Kv)
+                    combination.append((op, device, dtype, bias_type, *shape))
+                    ids.append(
+                        f"{op.NAME}-{device}-{str(dtype)}-{bias_type.__name__}"
+                        f"-{'-'.join([str(s) for s in shape])}"
+                    )
+                    has_one = True
+            if has_one:
+                op_count += 1
+            if op_count > max_shapes_per_op:
+                break
+        # Some specific shapes for which we want to run without any mask
+        bias_type = type(None)
+        for shape in (
+            # Some strides/dims don't fit on an uint16
+            (4, 128, 128, 300, 128, 128),
+            (13, 1, 67, 200, 8, 8),
+            (4, 1 + 2**16, 4, 1, 8, 8),
+            (4, 4, 1 + 2**16, 1, 8, 8),
+            # TODO: Some strides don't fit on an uint32
+            # Crashes on Flash, Errors on Cutlass
+            # (1, 1, 64000, 300, 128, 128)
+        ):
+            for device in _devices:
+                if device not in op.SUPPORTED_DEVICES:
+                    continue
+                for dtype in SUPPORTED_DTYPES:
+                    combination.append((op, device, dtype, bias_type, *shape))
+                    ids.append(
+                        f"{op.NAME}-{device}-{str(dtype)}-{bias_type.__name__}"
+                        f"-{'-'.join([str(s) for s in shape])}"
+                    )
+    return {
+        "argvalues": combination,
+        "ids": ids,
+    }
+
+
+parametrize_opFW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv = pytest.mark.parametrize(
+    "opFW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv",
+    **_generate_op_device_dtype_biasT_B_Mq_Mkv_H_K_Kv(ALL_FW_OPS),
+)
+parametrize_opFW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv__xs = pytest.mark.parametrize(
+    "opFW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv",
+    **_generate_op_device_dtype_biasT_B_Mq_Mkv_H_K_Kv(ALL_FW_OPS, max_shapes_per_op=1),
+)
+
+def ref_attention(q, k, v, attn_bias=None, drop_mask=None, p=0.0, scale=None):
+    if q.ndim == 4:
+        assert p == 0.0
+        return ref_attention_bmhk(q, k, v, attn_bias=attn_bias)
+    q = q.float()
+    k = k.float()
+    v = v.float()
+
+    scale = scale if scale is not None else (1 / q.shape[-1] ** 0.5)
+    q = q * scale
+
+    attn = q @ k.transpose(-2, -1)
+    if attn_bias is not None:
+        if isinstance(attn_bias, xformers.ops.AttentionBias):
+            # Always create in B,H,Mq,Mk format
+            attn_bias_tensor = attn_bias.materialize(
+                (q.shape[0], 1, q.shape[1], k.shape[1]),
+                device=q.device,
+                dtype=torch.float32,
+            )
+        else:
+            attn_bias_tensor = attn_bias
+        if attn_bias_tensor.ndim == 4:
+            assert q.shape[0] == attn_bias_tensor.shape[0] * attn_bias_tensor.shape[1]
+            attn_bias_tensor = attn_bias_tensor.reshape(
+                [-1, *attn_bias_tensor.shape[2:]]
+            )
+        attn = attn + attn_bias_tensor.float()
+    attn = attn.softmax(-1)
+    if drop_mask is not None:
+        attn = attn * (drop_mask / (1 - p))
+    return attn @ v
+
+
+def ref_attention_bmhk(q, k, v, attn_bias, scale=None) -> torch.Tensor:
+    assert q.ndim == 4
+
+    def T(t):
+        return t.permute((0, 2, 1, 3)).reshape(
+            [t.shape[0] * t.shape[2], t.shape[1], t.shape[3]]
+        )
+
+    if isinstance(attn_bias, xformers.ops.AttentionBias):
+        attn_bias = attn_bias.materialize(
+            (q.shape[0], q.shape[2], q.shape[1], k.shape[1]),
+            device=q.device,
+            dtype=torch.float32,
+        ).reshape([q.shape[0] * q.shape[2], q.shape[1], k.shape[1]])
+    out = ref_attention(T(q), T(k), T(v), attn_bias, scale=scale)
+    out = out.reshape([q.shape[0], q.shape[2], q.shape[1], v.shape[3]])
+    return out.permute((0, 2, 1, 3))
+
+
+def _rand_seqlens(
+    r: random.Random,
+    bs: int,
+    q_len: int,
+    kv_len: int,
+    more_keys_than_queries_per_block: bool,
+) -> Tuple[Sequence[int], Sequence[int]]:
+    """
+    Generates lists of lengths of query blocks and corresponding key blocks.
+    The total number of queries will be bs * q_len and the
+    total number of keys will be bs * kv_len.
+    """
+    if more_keys_than_queries_per_block:
+        assert kv_len >= q_len
+    q_len *= bs
+    kv_len *= bs
+    seqlens_q: List[int] = []
+    seqlens_k: List[int] = []
+
+    step_q = [max(1, q_len // 10), max(2, q_len // 2)]
+    step_k = [max(1, kv_len // 10), max(2, kv_len // 2)]
+    while sum(seqlens_q) < q_len and sum(seqlens_k) < kv_len:
+        num_queries = r.randrange(*step_q)
+        seqlens_q.append(num_queries)
+
+        if more_keys_than_queries_per_block:
+            # Must select at least `num_queries` keys
+            # But also leave enough keys for later
+            keys_left = kv_len - sum(seqlens_k, 0)
+            queries_left = q_len - sum(seqlens_q[:-1], 0)
+            assert keys_left >= queries_left
+            seqlens_k.append(num_queries + r.randrange(0, keys_left - queries_left))
+        else:
+            seqlens_k.append(r.randrange(*step_k))
+    seqlens_q[-1] = q_len - sum(seqlens_q[:-1])
+    seqlens_k[-1] = kv_len - sum(seqlens_k[:-1])
+    return seqlens_q, seqlens_k
+
+
+def _rand_partition(r: random.Random, total: int, n: int) -> List[int]:
+    # returns list of n nonnegative integers summing to total
+    idx = {0, total}
+    while len(idx) < n + 1:
+        idx.add(r.randint(1, total - 1))
+    s = sorted(idx)
+    return [e - b for b, e in zip(s[:-1], s[1:])]
+
+
+def _rand_maxed_partition(
+    r: random.Random, total: int, n: int, mx: int, positive: bool = True
+) -> List[int]:
+    # returns list of n nonnegative integers less than mx summing to total
+    # NB: This is unfortunately biased towards evenly-split bins.
+    # If `positive`, outputs are positive
+    if positive:
+        total -= n
+        mx -= 1
+    idxs = r.sample(range(n * mx), total)
+    y = torch.zeros(n, mx, dtype=torch.int32)
+    y.flatten()[idxs] = 1
+    z = y.sum(1)
+    if positive:
+        z += 1
+    return z.tolist()
+
+
+def _rand_seqlens_padded_k(
+    r: random.Random, bs: int, q_len: int, kv_len: int
+) -> Tuple[Sequence[int], Sequence[int]]:
+    # This is for BlockDiagonalCausalWithOffsetPaddedKeysMask.
+    # we need q_seqlens and k_seqlens to be of len bsz.
+    # For each "batch element" there must be more keys than queries
+    # because this bias type is "bottom right" and so any extra queries
+    # will attend to nothing and have undefined result.
+    # In addition every element of k_seqlens must be <= kv_len
+    if q_len > kv_len:
+        raise ValueError("need more keys than values")
+    if q_len == kv_len:
+        # all key slots are needed so we cannot have padding
+        q_seqlens = k_seqlens = [kv_len] * bs
+    else:
+        q_seqlens = _rand_maxed_partition(r, q_len * bs, bs, kv_len)
+        k_seqlens = [r.randint(i, kv_len) for i in q_seqlens]
+    return q_seqlens, k_seqlens
+
+
+def _create_aligned_bias(B: int, H: int, Mq: int, Mkv: int, **kwargs) -> torch.Tensor:
+    align_to = 8
+    return (
+        torch.randn(
+            (
+                B,
+                H,
+                Mq,
+                align_to * ((Mkv + align_to - 1) // align_to),
+            ),
+            **kwargs,
+        )
+        * 3
+    )[:, :, :, :Mkv]
+
+
+def create_attn_bias(
+    bias_type,
+    batch_size: int,
+    num_heads: int,
+    q_len: int,
+    kv_len: int,
+    device,
+    dtype,
+    requires_grad: bool,
+    fmt: str,
+    op: Type[AttentionOpBase],
+):
+    if bias_type is None or isinstance(None, bias_type):
+        return None
+    r = random.Random("-".join(map(str, [batch_size, q_len, kv_len, dtype, fmt])))
+    if bias_type is torch.Tensor:
+        if fmt == "BMK":
+            batch_size *= num_heads
+            num_heads = 1
+        # `small_k` only supports an expanded 1d bias
+        if op in [fmha.small_k.FwOp, fmha.small_k.BwOp]:
+            attn_bias = (
+                torch.randn(
+                    (batch_size, num_heads, 1, kv_len), device=device, dtype=dtype
+                )
+                * 3
+            )
+            attn_bias = attn_bias.expand(batch_size, num_heads, q_len, kv_len)
+        else:
+            attn_bias = _create_aligned_bias(
+                batch_size,
+                num_heads,
+                q_len,
+                kv_len,
+                device=device,
+                dtype=dtype,
+            )
+
+            # make sure it also works if the first columns are partially masked out
+            attn_bias[0, 0, q_len - 1 :, : num_heads - 2] = -math.inf
+
+        if requires_grad:
+            attn_bias.requires_grad_(True)
+        return attn_bias
+    if bias_type is fmha.attn_bias.LowerTriangularMask:
+        return fmha.attn_bias.LowerTriangularMask()
+    if bias_type is fmha.attn_bias.LowerTriangularMaskWithTensorBias:
+        attn_bias = _create_aligned_bias(
+            batch_size,
+            num_heads,
+            q_len,
+            kv_len,
+            device=device,
+            dtype=dtype,
+        )
+        if requires_grad:
+            attn_bias.requires_grad_(True)
+        return fmha.attn_bias.LowerTriangularMaskWithTensorBias(attn_bias)
+    if bias_type in [
+        fmha.attn_bias.BlockDiagonalMask,
+        fmha.attn_bias.BlockDiagonalCausalMask,
+        fmha.attn_bias.BlockDiagonalCausalFromBottomRightMask,
+    ]:
+        # This bias is not supported in BMK format
+        assert fmt == "BMHK"
+        block_diag = fmha.attn_bias.BlockDiagonalMask.from_seqlens(
+            *_rand_seqlens(
+                r,
+                batch_size,
+                q_len,
+                kv_len,
+                more_keys_than_queries_per_block=bias_type
+                is fmha.attn_bias.BlockDiagonalCausalFromBottomRightMask,
+            )
+        )
+        if bias_type is fmha.attn_bias.BlockDiagonalCausalMask:
+            block_diag = block_diag.make_causal()
+        if bias_type is fmha.attn_bias.BlockDiagonalCausalFromBottomRightMask:
+            block_diag = block_diag.make_causal_from_bottomright()
+        return block_diag
+    if bias_type == fmha.attn_bias.BlockDiagonalCausalWithOffsetPaddedKeysMask:
+        assert fmt == "BMHK"
+        q, k = _rand_seqlens_padded_k(r, batch_size, q_len, kv_len)
+        g_block_diag = (
+            fmha.attn_bias.BlockDiagonalCausalWithOffsetPaddedKeysMask.from_seqlens(
+                q_seqlen=q,
+                kv_padding=kv_len,
+                kv_seqlen=k,
+            )
+        )
+        return g_block_diag
+
+    assert False, f"Unsupported bias type: {bias_type}"
+
+def create_tensors(
+    op: Type[AttentionOpBase],
+    device,
+    dtype,
+    attn_bias_type,
+    B,
+    q_len,
+    kv_len,
+    h,
+    k,
+    kv,
+    *,
+    attn_bias_requires_grad: bool = False,
+    fmt: str = "BMK",
+):
+    torch.manual_seed(B * q_len + kv_len * k + kv)
+    scale = 3
+    if fmt == "BMK":
+        query = torch.randn((B * h, q_len, k), device=device, dtype=dtype).mul_(scale)
+        key = torch.randn((B * h, kv_len, k), device=device, dtype=dtype).mul_(scale)
+        value = torch.randn((B * h, kv_len, kv), device=device, dtype=dtype).mul_(scale)
+    else:
+        assert fmt == "BMHK"
+        query = torch.randn((B, q_len, h, k), device=device, dtype=dtype).mul_(scale)
+        key = torch.randn((B, kv_len, h, k), device=device, dtype=dtype).mul_(scale)
+        value = torch.randn((B, kv_len, h, kv), device=device, dtype=dtype).mul_(scale)
+
+    if fmt == "BMK" and not fmha.common._is_bias_type_supported_in_BMK(attn_bias_type):
+        attn_bias_type = None
+    attn_bias = None
+    if attn_bias_type is not None:
+        attn_bias = create_attn_bias(
+            attn_bias_type,
+            batch_size=B,
+            num_heads=h,
+            q_len=q_len,
+            kv_len=kv_len,
+            dtype=dtype,
+            device=device,
+            requires_grad=attn_bias_requires_grad,
+            fmt=fmt,
+            op=op,
+        )
+        if isinstance(
+            attn_bias,
+            (
+                fmha.attn_bias.BlockDiagonalMask,
+                fmha.attn_bias.BlockDiagonalCausalWithOffsetPaddedKeysMask,
+            ),
+        ):
+            query, key, value = [
+                x.reshape([1, -1, *x.shape[2:]]) for x in [query, key, value]
+            ]
+
+    inputs = fmha.Inputs(query=query, key=key, value=value, attn_bias=attn_bias)
+    reasons = op.not_supported_reasons(inputs)
+    if reasons:
+        err_msg = f"{op.NAME}: unsupported ({'/'.join(reasons)})"
+        # Ensure we free memory to avoid OOMs
+        del query, key, value, attn_bias, inputs
+        pytest.skip(err_msg)
+    return query, key, value, attn_bias
+
+
+def bmhk2bmk(tensor) -> torch.Tensor:
+    return (
+        tensor.permute((0, 2, 1, 3))
+        .contiguous()
+        .view([tensor.shape[0] * tensor.shape[2], tensor.shape[1], tensor.shape[3]])
+    )
+
+
+def bmk2bmhk(tensor, num_heads: int) -> torch.Tensor:
+    return tensor.reshape([-1, num_heads, tensor.shape[1], tensor.shape[2]]).permute(
+        (0, 2, 1, 3)
+    )
+
+
+@pytest.mark.parametrize("fmt", ["BMHK"])
+@pytest.mark.parametrize("packed", [False])
+@parametrize_opFW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv
+def test_forward(
+    opFW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv,
+    packed,
+    fmt,
+):
+    (
+        op,
+        device,
+        dtype,
+        bias_type,
+        batch_size,
+        q_len,
+        kv_len,
+        h,
+        k,
+        kv,
+    ) = opFW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv
+
+    if packed and not (k == kv and q_len == kv_len):
+        pytest.skip(
+            f"packed incompatible with `k ({k}) != kv ({kv})` or `q_len ({q_len}) != kv_len ({kv_len})`"
+        )
+    if fmt == "BMK" and not fmha.common._is_bias_type_supported_in_BMK(bias_type):
+        pytest.skip("BMK incompatible with this bias")
+
+    query, key, value, attn_bias = create_tensors(
+        *opFW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv, fmt="BMHK" if packed else fmt
+    )
+
+    if packed:
+        c = torch.stack([query, key, value], 2)
+        if fmt == "BMK":
+            # bm3hk -> 3bhmk -> 3Bmk
+            c = c.permute(2, 0, 3, 1, 4).view([3, -1, q_len, k])
+            query, key, value = c[0], c[1], c[2]
+            # Re-create bias in the right format
+            attn_bias = create_attn_bias(
+                bias_type=bias_type,
+                batch_size=batch_size,
+                num_heads=h,
+                q_len=q_len,
+                kv_len=kv_len,
+                device=device,
+                dtype=dtype,
+                requires_grad=False,
+                fmt=fmt,
+                op=op,
+            )
+        else:
+            # bm3hk -> 3 x bmhk
+            query, key, value = xformers.ops.unbind(c, 2)
+        assert not query.is_contiguous()
+
+    out = xformers.ops.memory_efficient_attention_forward(
+        query, key, value, attn_bias, op=op
+    )
+    assert not out.isnan().any(), ("Output has NaNs", attn_bias)
+    out2 = xformers.ops.memory_efficient_attention_forward(
+        query, key, value, attn_bias, op=op
+    )
+    assert torch.allclose(out, out2, atol=0.0, rtol=0.0), (
+        "Non-deterministic behavior",
+        attn_bias,
+    )
+
+    ref = ref_attention(query, key, value, attn_bias)
+    assert out.shape == ref.shape, out.shape
+    assert_allclose(
+        out.float(),
+        ref,
+        atol=op.ERROR_ATOL[dtype],
+        rtol=op.ERROR_RTOL.get(dtype, 1e-5),
+    )
+

From fc446a6f812117b62a2c06c045b712f4e26f10d7 Mon Sep 17 00:00:00 2001
From: Qianfeng Zhang <Qianfeng.Zhang@amd.com>
Date: Wed, 16 Aug 2023 16:43:25 +0000
Subject: [PATCH 018/641] Update to the tolerance value for bfloat16 in
 test_ck_4.py and ck.py

---
 tests/test_ck_4.py      | 20 ++++++++++----------
 xformers/ops/fmha/ck.py | 11 +++++++++++
 2 files changed, 21 insertions(+), 10 deletions(-)

diff --git a/tests/test_ck_4.py b/tests/test_ck_4.py
index ed58804c2..f04d4b328 100644
--- a/tests/test_ck_4.py
+++ b/tests/test_ck_4.py
@@ -78,17 +78,17 @@ def generate_test_shapes_B_Mq_Mkv_H_K_Kv(op):
 
 
 SUPPORTED_ATTN_BIAS_TYPES: Set[Any] = {
-        type(None),
-        #torch.Tensor,
-        #LowerTriangularMask,
-        #LowerTriangularMaskWithTensorBias,
+        ##type(None),
+        torch.Tensor,
+        ##LowerTriangularMask,
+        ##LowerTriangularMaskWithTensorBias,
         ##BlockDiagonalMask,
         ##BlockDiagonalCausalMask,
         ##BlockDiagonalCausalWithOffsetPaddedKeysMask,
         ##BlockDiagonalCausalFromBottomRightMask,
     }
 
-SUPPORTED_DTYPES: Set[torch.dtype] = {torch.half}
+SUPPORTED_DTYPES: Set[torch.dtype] = {torch.bfloat16}
 
 def _generate_op_device_dtype_biasT_B_Mq_Mkv_H_K_Kv(
     ops_list: Sequence[Type[fmha.AttentionOpBase]], max_shapes_per_op: int = 65000
@@ -143,10 +143,10 @@ def _generate_op_device_dtype_biasT_B_Mq_Mkv_H_K_Kv(
         bias_type = type(None)
         for shape in (
             # Some strides/dims don't fit on an uint16
-            (4, 128, 128, 300, 128, 128),
-            (13, 1, 67, 200, 8, 8),
-            (4, 1 + 2**16, 4, 1, 8, 8),
-            (4, 4, 1 + 2**16, 1, 8, 8),
+            (4, 128, 128, 8, 128, 128),
+            (13, 1, 67, 16, 8, 8),
+            (4, 320, 4, 1, 8, 8),
+            (4, 4, 320, 1, 8, 8),
             # TODO: Some strides don't fit on an uint32
             # Crashes on Flash, Errors on Cutlass
             # (1, 1, 64000, 300, 128, 128)
@@ -576,6 +576,6 @@ def test_forward(
         out.float(),
         ref,
         atol=op.ERROR_ATOL[dtype],
-        rtol=op.ERROR_RTOL.get(dtype, 1e-5),
+        rtol=op.ERROR_RTOL[dtype],
     )
 
diff --git a/xformers/ops/fmha/ck.py b/xformers/ops/fmha/ck.py
index 9cac79d76..4bc21251d 100644
--- a/xformers/ops/fmha/ck.py
+++ b/xformers/ops/fmha/ck.py
@@ -157,6 +157,17 @@ class FwOp(AttentionFwOpBase):
     SUPPORTS_DIFFERENT_VALUE_EMBED = True
     NAME = "ckF"
 
+    ERROR_ATOL: Mapping[torch.dtype, float] = {
+        torch.float: 3e-4,
+        torch.half: 4e-3,
+        torch.bfloat16: 2e-2,
+    }
+    ERROR_RTOL: Mapping[torch.dtype, float] = {
+        torch.float: 2e-5,
+        torch.half: 4e-4,
+        torch.bfloat16: 2e-2,
+    }
+
     _TEST_K: List[int] = [
         32,  # 64x64 kernel
         128,  # 64x128 kernel

From 19b626713ccec45a272d867d3633fd6cfe487966 Mon Sep 17 00:00:00 2001
From: Qianfeng Zhang <Qianfeng.Zhang@amd.com>
Date: Thu, 17 Aug 2023 14:18:09 +0000
Subject: [PATCH 019/641] Update composable_kernel to latest commit

---
 third_party/composable_kernel | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/third_party/composable_kernel b/third_party/composable_kernel
index d20c472f8..e296ee56b 160000
--- a/third_party/composable_kernel
+++ b/third_party/composable_kernel
@@ -1 +1 @@
-Subproject commit d20c472f8d5a00da0934e91f3ddc16f7dd3e3ecb
+Subproject commit e296ee56b35207af047ef3a5cb0f00788c9f2cf0

From 321445dd9581626a4a5e9193ddd78ac9828252a6 Mon Sep 17 00:00:00 2001
From: Qianfeng Zhang <Qianfeng.Zhang@amd.com>
Date: Sun, 20 Aug 2023 23:53:53 +0000
Subject: [PATCH 020/641] Fix in hip_fmha C++ codes to make 3 of 4
 BlockDiagonal attn_bias types passed for test_ck_3.py

---
 tests/test_ck_3.py                            |  5 +-
 .../hip_fmha/attention_forward_generic.cpp    | 49 +++++++++----------
 .../hip_fmha/ck_fmha_grouped_forward.h        |  3 ++
 3 files changed, 28 insertions(+), 29 deletions(-)

diff --git a/tests/test_ck_3.py b/tests/test_ck_3.py
index 9b790c743..21bd67586 100644
--- a/tests/test_ck_3.py
+++ b/tests/test_ck_3.py
@@ -491,12 +491,13 @@ def bmk2bmhk(tensor, num_heads: int) -> torch.Tensor:
 '''
 
 @pytest.mark.parametrize("packed", [False, True])
-@pytest.mark.parametrize("fmt", ["BMHK"])
+@pytest.mark.parametrize("fmt", ["BMK", "BMHK"])
 def test_forward(fmt, packed):
     op = fmha.ck.FwOp
     device = torch.device("cuda")
     dtype = torch.float16 
-    bias_type = fmha.attn_bias.LowerTriangularMask
+    ##bias_type = fmha.attn_bias.LowerTriangularMask
+    bias_type = fmha.attn_bias.BlockDiagonalCausalMask
     batch_size = 7 
     q_len = 1000
     kv_len = 1000
diff --git a/xformers/csrc/attention/hip_fmha/attention_forward_generic.cpp b/xformers/csrc/attention/hip_fmha/attention_forward_generic.cpp
index 920ec43aa..785f275e0 100644
--- a/xformers/csrc/attention/hip_fmha/attention_forward_generic.cpp
+++ b/xformers/csrc/attention/hip_fmha/attention_forward_generic.cpp
@@ -218,6 +218,7 @@ efficient_attention_forward_ck(
         static_cast<int>(out.stride(3))};
 
     if (bias.has_value()) {
+      p.has_attn_bias = true;
       const at::Tensor bias_4d_view =
           get_bias_4d_view(*bias, B, num_heads, M, N);
       p.attn_bias_strides = {
@@ -225,7 +226,8 @@ efficient_attention_forward_ck(
           static_cast<int>(bias_4d_view.stride(1)),
           static_cast<int>(bias_4d_view.stride(2)),
           static_cast<int>(bias_4d_view.stride(3))};
-    };
+    } else
+      p.has_attn_bias = false;
 
     p.custom_mask_type = custom_mask_type;
 
@@ -245,6 +247,7 @@ efficient_attention_forward_ck(
         seqstart_k->data_ptr(),
         (p.num_batches + 1) * sizeof(int32_t),
         hipMemcpyDeviceToHost));
+
     if (seqlen_k.has_value())
       FMHA_HIP_CHECK(hipMemcpy(
           p.host_seqlen_k.data(),
@@ -257,41 +260,33 @@ efficient_attention_forward_ck(
     char* v_ptr = reinterpret_cast<char*>(value.data_ptr());
 
     char* out_ptr = reinterpret_cast<char*>(out.data_ptr());
-    char* attn_bias_ptr = reinterpret_cast<char*>(bias->data_ptr());
+    char* attn_bias_ptr =
+        bias.has_value() ? reinterpret_cast<char*>(bias->data_ptr()) : nullptr;
 
     for (int i = 0; i < p.num_batches; i++) {
-      int32_t tmp_q_stride = get_size_in_bytes(
+      int32_t tmp_q_offset = get_size_in_bytes(
           p.host_seqstart_q[i] * p.q_strides[0], query.scalar_type());
-      int32_t tmp_k_stride = get_size_in_bytes(
+      int32_t tmp_k_offset = get_size_in_bytes(
           p.host_seqstart_k[i] * p.k_strides[0], key.scalar_type());
-      int32_t tmp_v_stride = get_size_in_bytes(
+      int32_t tmp_v_offset = get_size_in_bytes(
           p.host_seqstart_k[i] * p.v_strides[0], value.scalar_type());
-      int32_t tmp_o_stride = get_size_in_bytes(
+      int32_t tmp_o_offset = get_size_in_bytes(
           p.host_seqstart_q[i] * p.out_strides[0], out.scalar_type());
 
-      p.q_ptrs.push_back(reinterpret_cast<void*>(q_ptr));
-      q_ptr = q_ptr + tmp_q_stride;
-
-      p.k_ptrs.push_back(reinterpret_cast<void*>(k_ptr));
-      k_ptr = k_ptr + tmp_k_stride;
-
-      p.v_ptrs.push_back(reinterpret_cast<void*>(v_ptr));
-      v_ptr = v_ptr + tmp_k_stride;
-
-      p.out_ptrs.push_back(reinterpret_cast<void*>(out_ptr));
-      out_ptr = out_ptr + tmp_o_stride;
+      p.q_ptrs.push_back(reinterpret_cast<void*>(&q_ptr[tmp_q_offset]));
+      p.k_ptrs.push_back(reinterpret_cast<void*>(&k_ptr[tmp_k_offset]));
+      p.v_ptrs.push_back(reinterpret_cast<void*>(&v_ptr[tmp_v_offset]));
+      p.out_ptrs.push_back(reinterpret_cast<void*>(&out_ptr[tmp_o_offset]));
 
       if (bias.has_value()) {
-        p.has_attn_bias = true;
-        int32_t tmp_bias_stride = get_size_in_bytes(
+        int32_t tmp_bias_offset = get_size_in_bytes(
             p.host_seqstart_q[i] * p.attn_bias_strides[2] +
                 p.host_seqstart_k[i] * p.attn_bias_strides[3],
             bias->scalar_type());
 
-        p.attn_bias_ptrs.push_back(reinterpret_cast<void*>(attn_bias_ptr));
-        attn_bias_ptr = attn_bias_ptr + tmp_bias_stride;
-      } else
-        p.has_attn_bias = false;
+        p.attn_bias_ptrs.push_back(
+            reinterpret_cast<void*>(&attn_bias_ptr[tmp_bias_offset]));
+      };
     }
 
     p.use_dropout = use_dropout;
@@ -319,7 +314,8 @@ efficient_attention_forward_ck(
         p.randvals_ptrs.push_back(reinterpret_cast<void*>(randvals_ptr));
         randvals_ptr = randvals_ptr + tmp_randvals_stride;
       };
-    };
+    } else
+      p.dropout_prob = 0.0f;
 
     if (p.compute_logsumexp) {
       logsumexp = at::empty(
@@ -341,21 +337,20 @@ efficient_attention_forward_ck(
   int64_t seed, offset;
 
   DISPATCH_TYPES(query.scalar_type(), [&]() {
-    out = at::empty(
+    out = at::zeros(
         {B, M, num_heads, Kv},
         query.options().dtype(CkToAtenDtype<scalar_t>::atScalarType()));
 
     if (!seqstart_q.has_value()) { // input is batched
       BatchedForwardParams batched_forward_params;
 
-      std::cout << " -------- call batched_forward ---------" << std::endl; 
       set_batched_forward_params(batched_forward_params);
       batched_forward<scalar_t>(batched_forward_params, stream);
     } else { // input is grouped
       GroupedForwardParams grouped_forward_params;
 
-      std::cout << " -------- call grouped_forward ---------" << std::endl; 
       set_grouped_forward_params(grouped_forward_params);
+      std::cout << " -------- call grouped_forward ---------" << std::endl;
       grouped_forward<scalar_t>(grouped_forward_params, stream);
     }
   });
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_forward.h b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_forward.h
index 97efabfe5..7ee73f54b 100644
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_forward.h
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_forward.h
@@ -184,6 +184,9 @@ void grouped_forward_masktype_attnbias_dispatched(
     int Kv = param.Kv;
     int G1 = param.num_heads;
 
+    std::cout << "M, N, G1, K, Kv: " << M << " " << N << " " << G1 << " " << K
+              << " " << Kv << std::endl;
+
     std::vector<ck::index_t> a_gs_ms_ks_lengths{1, G1, M, K};
     std::vector<ck::index_t> a_gs_ms_ks_strides{
         0, param.q_strides[1], param.q_strides[0], param.q_strides[2]};

From 0f491fc0a2926fcaf241c1ce74739c59ad4ef263 Mon Sep 17 00:00:00 2001
From: Qianfeng Zhang <Qianfeng.Zhang@amd.com>
Date: Mon, 21 Aug 2023 16:24:29 +0000
Subject: [PATCH 021/641] Update and make all 8 attn_bias types passed for
 test_ck_3.py

---
 tests/test_ck_3.py                            | 33 ++++++++++---------
 tests/test_ck_4.py                            | 12 +++----
 .../hip_fmha/attention_forward_generic.cpp    |  1 -
 .../hip_fmha/ck_fmha_grouped_forward.h        |  3 --
 4 files changed, 23 insertions(+), 26 deletions(-)

diff --git a/tests/test_ck_3.py b/tests/test_ck_3.py
index 21bd67586..14834c0d1 100644
--- a/tests/test_ck_3.py
+++ b/tests/test_ck_3.py
@@ -5,7 +5,7 @@
 
 import math
 import random
-from typing import List, Optional, Sequence, Tuple, Type, TypeVar
+from typing import List, Optional, Sequence, Tuple, Type, TypeVar, Set, Any
 
 import pytest
 import torch
@@ -478,29 +478,30 @@ def bmk2bmhk(tensor, num_heads: int) -> torch.Tensor:
         (0, 2, 1, 3)
     )
 
-'''
+## The same set of supported attn_bias types as defined by ck.FwOp
 SUPPORTED_ATTN_BIAS_TYPES: Set[Any] = {
         type(None),
         torch.Tensor,
-        LowerTriangularMask,
-        LowerTriangularMaskWithTensorBias,
-        BlockDiagonalMask,
-        BlockDiagonalCausalMask,
-        BlockDiagonalCausalWithOffsetPaddedKeysMask,
-        attn_bias.BlockDiagonalCausalFromBottomRightMask,
-'''
+        fmha.attn_bias.LowerTriangularMask,
+        fmha.attn_bias.LowerTriangularMaskWithTensorBias,
+        fmha.attn_bias.BlockDiagonalMask,
+        fmha.attn_bias.BlockDiagonalCausalMask,
+        fmha.attn_bias.BlockDiagonalCausalWithOffsetPaddedKeysMask,
+        fmha.attn_bias.BlockDiagonalCausalFromBottomRightMask }
 
+@pytest.mark.parametrize("bias_type", SUPPORTED_ATTN_BIAS_TYPES)
 @pytest.mark.parametrize("packed", [False, True])
-@pytest.mark.parametrize("fmt", ["BMK", "BMHK"])
-def test_forward(fmt, packed):
+@pytest.mark.parametrize("fmt", ["BMHK"])
+@pytest.mark.parametrize("dtype", [torch.half, torch.bfloat16])
+def test_forward(dtype, fmt, packed, bias_type):
     op = fmha.ck.FwOp
     device = torch.device("cuda")
-    dtype = torch.float16 
-    ##bias_type = fmha.attn_bias.LowerTriangularMask
-    bias_type = fmha.attn_bias.BlockDiagonalCausalMask
     batch_size = 7 
-    q_len = 1000
-    kv_len = 1000
+    q_len = 200
+    if bias_type is fmha.attn_bias.BlockDiagonalCausalFromBottomRightMask:
+       kv_len = int(q_len * 1.2) 
+    else:
+       kv_len = q_len
     h = 3
     k = 64
     kv = 64
diff --git a/tests/test_ck_4.py b/tests/test_ck_4.py
index f04d4b328..e008514bb 100644
--- a/tests/test_ck_4.py
+++ b/tests/test_ck_4.py
@@ -79,16 +79,16 @@ def generate_test_shapes_B_Mq_Mkv_H_K_Kv(op):
 
 SUPPORTED_ATTN_BIAS_TYPES: Set[Any] = {
         ##type(None),
-        torch.Tensor,
+        ##torch.Tensor,
         ##LowerTriangularMask,
-        ##LowerTriangularMaskWithTensorBias,
+        LowerTriangularMaskWithTensorBias,
         ##BlockDiagonalMask,
         ##BlockDiagonalCausalMask,
         ##BlockDiagonalCausalWithOffsetPaddedKeysMask,
-        ##BlockDiagonalCausalFromBottomRightMask,
+        #3BlockDiagonalCausalFromBottomRightMask,
     }
 
-SUPPORTED_DTYPES: Set[torch.dtype] = {torch.bfloat16}
+SUPPORTED_DTYPES: Set[torch.dtype] = {torch.half}
 
 def _generate_op_device_dtype_biasT_B_Mq_Mkv_H_K_Kv(
     ops_list: Sequence[Type[fmha.AttentionOpBase]], max_shapes_per_op: int = 65000
@@ -502,8 +502,8 @@ def bmk2bmhk(tensor, num_heads: int) -> torch.Tensor:
     )
 
 
-@pytest.mark.parametrize("fmt", ["BMHK"])
-@pytest.mark.parametrize("packed", [False])
+@pytest.mark.parametrize("fmt", ["BMK", "BMHK"])
+@pytest.mark.parametrize("packed", [False, True])
 @parametrize_opFW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv
 def test_forward(
     opFW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv,
diff --git a/xformers/csrc/attention/hip_fmha/attention_forward_generic.cpp b/xformers/csrc/attention/hip_fmha/attention_forward_generic.cpp
index 785f275e0..2800029c6 100644
--- a/xformers/csrc/attention/hip_fmha/attention_forward_generic.cpp
+++ b/xformers/csrc/attention/hip_fmha/attention_forward_generic.cpp
@@ -350,7 +350,6 @@ efficient_attention_forward_ck(
       GroupedForwardParams grouped_forward_params;
 
       set_grouped_forward_params(grouped_forward_params);
-      std::cout << " -------- call grouped_forward ---------" << std::endl;
       grouped_forward<scalar_t>(grouped_forward_params, stream);
     }
   });
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_forward.h b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_forward.h
index 7ee73f54b..97efabfe5 100644
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_forward.h
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_forward.h
@@ -184,9 +184,6 @@ void grouped_forward_masktype_attnbias_dispatched(
     int Kv = param.Kv;
     int G1 = param.num_heads;
 
-    std::cout << "M, N, G1, K, Kv: " << M << " " << N << " " << G1 << " " << K
-              << " " << Kv << std::endl;
-
     std::vector<ck::index_t> a_gs_ms_ks_lengths{1, G1, M, K};
     std::vector<ck::index_t> a_gs_ms_ks_strides{
         0, param.q_strides[1], param.q_strides[0], param.q_strides[2]};

From c3b640cfebc5762ea3b033b84f18ce04fd84e952 Mon Sep 17 00:00:00 2001
From: Qianfeng Zhang <Qianfeng.Zhang@amd.com>
Date: Mon, 21 Aug 2023 20:03:04 +0000
Subject: [PATCH 022/641] Updates to test_ck_3.py and test_ck_4.py

---
 tests/test_ck_3.py | 171 +++++----------------------------------------
 tests/test_ck_4.py |  26 +++----
 2 files changed, 30 insertions(+), 167 deletions(-)

diff --git a/tests/test_ck_3.py b/tests/test_ck_3.py
index 14834c0d1..92456452f 100644
--- a/tests/test_ck_3.py
+++ b/tests/test_ck_3.py
@@ -32,126 +32,6 @@
     "T", Type[fmha.common.AttentionFwOpBase], Type[fmha.common.AttentionBwOpBase]
 )
 
-def sample_random_supported_fw(
-    inp: fmha.Inputs, seed: int
-) -> Type[fmha.common.AttentionFwOpBase]:
-    r = random.Random(seed)
-    fw_ops = list(ALL_FW_OPS)
-    r.shuffle(fw_ops)
-    for op in fw_ops:
-        if op.supports(inp):
-            return op
-    raise NotImplementedError(f"Could not find a FW operator for: {inp}")
-
-
-def generate_test_shapes_B_Mq_Mkv_H_K_Kv(op):
-    shapes = []
-
-    # Add some random shapes
-    if op in [
-        fmha.ck.FwOp,
-        fmha.ck.BwOp,
-    ]:
-        K_CHOICES = [8 * i for i in range(1, 256 // 8)]
-        r = random.Random(0)
-        for _ in range(20):
-            B = r.randint(1, 400)
-            Mq = r.randint(1, 500)
-            Mkv = r.randint(1, 500)
-            H = r.randint(2, 11)
-            B = max(B // H, 1)
-            K = r.choice(K_CHOICES)
-            Kv = r.choice(K_CHOICES)
-            if not op.SUPPORTS_DIFFERENT_VALUE_EMBED:
-                Kv = K
-            shapes.append((B, Mq, Mkv, H, K, Kv))
-    return shapes
-
-
-def _generate_op_device_dtype_biasT_B_Mq_Mkv_H_K_Kv(
-    ops_list: Sequence[Type[fmha.AttentionOpBase]], max_shapes_per_op: int = 65000
-):
-    r = random.Random(0)
-    combination = []
-    ids = []
-    for op in ops_list:
-        op_count = 0
-        # Sort list of masks, so it's deterministic across runs
-        LIST_MASKS = list(
-            sorted(list(op.SUPPORTED_ATTN_BIAS_TYPES), key=lambda x: str(x))
-        )
-        for shape in generate_test_shapes_B_Mq_Mkv_H_K_Kv(op):
-            has_one = False
-            for device in _devices:
-                if device not in op.SUPPORTED_DEVICES:
-                    continue
-                for dtype in op.SUPPORTED_DTYPES:
-                    bias_type = r.choice(LIST_MASKS)
-                    # Avoid using too much memory
-                    if bias_type not in [
-                        type(None),
-                        fmha.attn_bias.LowerTriangularMask,
-                    ]:
-                        B, Mq, Mkv, H, K, Kv = shape
-                        B = min(B, 12)
-
-                        if (
-                            bias_type
-                            is fmha.attn_bias.BlockDiagonalCausalFromBottomRightMask
-                        ):
-                            Mq, Mkv = min(Mkv, Mq), max(Mkv, Mq) + 2
-                        elif (
-                            bias_type
-                            is fmha.attn_bias.BlockDiagonalCausalWithOffsetPaddedKeysMask
-                        ):
-                            Mq, Mkv = min(Mkv, Mq), max(Mkv, Mq)
-                        shape = (B, Mq, Mkv, H, K, Kv)
-                    combination.append((op, device, dtype, bias_type, *shape))
-                    ids.append(
-                        f"{op.NAME}-{device}-{str(dtype)}-{bias_type.__name__}"
-                        f"-{'-'.join([str(s) for s in shape])}"
-                    )
-                    has_one = True
-            if has_one:
-                op_count += 1
-            if op_count > max_shapes_per_op:
-                break
-        # Some specific shapes for which we want to run without any mask
-        bias_type = type(None)
-        for shape in (
-            # Some strides/dims don't fit on an uint16
-            (1, 128, 128, 300, 128, 128),
-            (13, 1, 67, 200, 8, 8),
-            (1, 1 + 2**16, 4, 1, 8, 8),
-            (1, 4, 1 + 2**16, 1, 8, 8),
-            # TODO: Some strides don't fit on an uint32
-            # Crashes on Flash, Errors on Cutlass
-            # (1, 1, 64000, 300, 128, 128)
-        ):
-            for device in _devices:
-                if device not in op.SUPPORTED_DEVICES:
-                    continue
-                for dtype in op.SUPPORTED_DTYPES:
-                    combination.append((op, device, dtype, bias_type, *shape))
-                    ids.append(
-                        f"{op.NAME}-{device}-{str(dtype)}-{bias_type.__name__}"
-                        f"-{'-'.join([str(s) for s in shape])}"
-                    )
-    return {
-        "argvalues": combination,
-        "ids": ids,
-    }
-
-
-parametrize_opFW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv = pytest.mark.parametrize(
-    "opFW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv",
-    **_generate_op_device_dtype_biasT_B_Mq_Mkv_H_K_Kv(ALL_FW_OPS),
-)
-parametrize_opFW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv__xs = pytest.mark.parametrize(
-    "opFW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv",
-    **_generate_op_device_dtype_biasT_B_Mq_Mkv_H_K_Kv(ALL_FW_OPS, max_shapes_per_op=1),
-)
-
 def ref_attention(q, k, v, attn_bias=None, drop_mask=None, p=0.0, scale=None):
     if q.ndim == 4:
         assert p == 0.0
@@ -244,15 +124,6 @@ def _rand_seqlens(
     return seqlens_q, seqlens_k
 
 
-def _rand_partition(r: random.Random, total: int, n: int) -> List[int]:
-    # returns list of n nonnegative integers summing to total
-    idx = {0, total}
-    while len(idx) < n + 1:
-        idx.add(r.randint(1, total - 1))
-    s = sorted(idx)
-    return [e - b for b, e in zip(s[:-1], s[1:])]
-
-
 def _rand_maxed_partition(
     r: random.Random, total: int, n: int, mx: int, positive: bool = True
 ) -> List[int]:
@@ -326,7 +197,7 @@ def create_attn_bias(
         if fmt == "BMK":
             batch_size *= num_heads
             num_heads = 1
-        # `small_k` only supports an expanded 1d bias
+        ##`small_k` only supports an expanded 1d bias
         if op in [fmha.small_k.FwOp, fmha.small_k.BwOp]:
             attn_bias = (
                 torch.randn(
@@ -346,7 +217,7 @@ def create_attn_bias(
             )
 
             # make sure it also works if the first columns are partially masked out
-            attn_bias[0, 0, q_len - 1 :, : num_heads - 2] = -math.inf
+            # attn_bias[0, 0, q_len - 1 :, : num_heads - 2] = -math.inf
 
         if requires_grad:
             attn_bias.requires_grad_(True)
@@ -464,20 +335,6 @@ def create_tensors(
         pytest.skip(err_msg)
     return query, key, value, attn_bias
 
-
-def bmhk2bmk(tensor) -> torch.Tensor:
-    return (
-        tensor.permute((0, 2, 1, 3))
-        .contiguous()
-        .view([tensor.shape[0] * tensor.shape[2], tensor.shape[1], tensor.shape[3]])
-    )
-
-
-def bmk2bmhk(tensor, num_heads: int) -> torch.Tensor:
-    return tensor.reshape([-1, num_heads, tensor.shape[1], tensor.shape[2]]).permute(
-        (0, 2, 1, 3)
-    )
-
 ## The same set of supported attn_bias types as defined by ck.FwOp
 SUPPORTED_ATTN_BIAS_TYPES: Set[Any] = {
         type(None),
@@ -487,23 +344,24 @@ def bmk2bmhk(tensor, num_heads: int) -> torch.Tensor:
         fmha.attn_bias.BlockDiagonalMask,
         fmha.attn_bias.BlockDiagonalCausalMask,
         fmha.attn_bias.BlockDiagonalCausalWithOffsetPaddedKeysMask,
-        fmha.attn_bias.BlockDiagonalCausalFromBottomRightMask }
+        fmha.attn_bias.BlockDiagonalCausalFromBottomRightMask,
+        }
 
 @pytest.mark.parametrize("bias_type", SUPPORTED_ATTN_BIAS_TYPES)
 @pytest.mark.parametrize("packed", [False, True])
-@pytest.mark.parametrize("fmt", ["BMHK"])
+@pytest.mark.parametrize("fmt", ["BMK", "BMHK"])
 @pytest.mark.parametrize("dtype", [torch.half, torch.bfloat16])
 def test_forward(dtype, fmt, packed, bias_type):
     op = fmha.ck.FwOp
     device = torch.device("cuda")
-    batch_size = 7 
+    batch_size = 7
     q_len = 200
     if bias_type is fmha.attn_bias.BlockDiagonalCausalFromBottomRightMask:
-       kv_len = int(q_len * 1.2) 
+        kv_len = int(q_len * 1.2) 
     else:
-       kv_len = q_len
-    h = 3
-    k = 64
+        kv_len = q_len
+    h = 3 
+    k = 64 
     kv = 64
 
     if packed and not (k == kv and q_len == kv_len):
@@ -517,11 +375,16 @@ def test_forward(dtype, fmt, packed, bias_type):
         op, device, dtype, bias_type, batch_size, q_len, kv_len, h, k, kv, fmt="BMHK" if packed else fmt
     )
 
+    print("query shape: ", query.shape)
+    print("key shape: ", key.shape)
+    print("value shape: ", value.shape)
+
+    ## when packed, the query, key, value is in BMHK format
     if packed:
         c = torch.stack([query, key, value], 2)
         if fmt == "BMK":
             # bm3hk -> 3bhmk -> 3Bmk
-            c = c.permute(2, 0, 3, 1, 4).view([3, -1, q_len, k])
+            c = c.permute(2, 0, 3, 1, 4).reshape([3, batch_size*h, q_len, k])
             query, key, value = c[0], c[1], c[2]
             # Re-create bias in the right format
             attn_bias = create_attn_bias(
@@ -539,7 +402,7 @@ def test_forward(dtype, fmt, packed, bias_type):
         else:
             # bm3hk -> 3 x bmhk
             query, key, value = xformers.ops.unbind(c, 2)
-        assert not query.is_contiguous()
+        ##assert not query.is_contiguous()
 
     out = xformers.ops.memory_efficient_attention_forward(
         query, key, value, attn_bias, op=op
diff --git a/tests/test_ck_4.py b/tests/test_ck_4.py
index e008514bb..24f4dbe5c 100644
--- a/tests/test_ck_4.py
+++ b/tests/test_ck_4.py
@@ -78,17 +78,17 @@ def generate_test_shapes_B_Mq_Mkv_H_K_Kv(op):
 
 
 SUPPORTED_ATTN_BIAS_TYPES: Set[Any] = {
-        ##type(None),
-        ##torch.Tensor,
-        ##LowerTriangularMask,
+        type(None),
+        torch.Tensor,
+        LowerTriangularMask,
         LowerTriangularMaskWithTensorBias,
-        ##BlockDiagonalMask,
-        ##BlockDiagonalCausalMask,
-        ##BlockDiagonalCausalWithOffsetPaddedKeysMask,
-        #3BlockDiagonalCausalFromBottomRightMask,
+        BlockDiagonalMask,
+        BlockDiagonalCausalMask,
+        BlockDiagonalCausalWithOffsetPaddedKeysMask,
+        BlockDiagonalCausalFromBottomRightMask,
     }
 
-SUPPORTED_DTYPES: Set[torch.dtype] = {torch.half}
+SUPPORTED_DTYPES: Set[torch.dtype] = {torch.half, torch.bfloat16}
 
 def _generate_op_device_dtype_biasT_B_Mq_Mkv_H_K_Kv(
     ops_list: Sequence[Type[fmha.AttentionOpBase]], max_shapes_per_op: int = 65000
@@ -143,8 +143,8 @@ def _generate_op_device_dtype_biasT_B_Mq_Mkv_H_K_Kv(
         bias_type = type(None)
         for shape in (
             # Some strides/dims don't fit on an uint16
-            (4, 128, 128, 8, 128, 128),
-            (13, 1, 67, 16, 8, 8),
+            (4, 128, 128, 4, 128, 128),
+            (13, 4, 67, 16, 8, 8),
             (4, 320, 4, 1, 8, 8),
             (4, 4, 320, 1, 8, 8),
             # TODO: Some strides don't fit on an uint32
@@ -369,7 +369,7 @@ def create_attn_bias(
             )
 
             # make sure it also works if the first columns are partially masked out
-            attn_bias[0, 0, q_len - 1 :, : num_heads - 2] = -math.inf
+            #attn_bias[0, 0, q_len - 1 :, : num_heads - 2] = -math.inf
 
         if requires_grad:
             attn_bias.requires_grad_(True)
@@ -538,7 +538,7 @@ def test_forward(
         c = torch.stack([query, key, value], 2)
         if fmt == "BMK":
             # bm3hk -> 3bhmk -> 3Bmk
-            c = c.permute(2, 0, 3, 1, 4).view([3, -1, q_len, k])
+            c = c.permute(2, 0, 3, 1, 4).reshape([3, -1, q_len, k])
             query, key, value = c[0], c[1], c[2]
             # Re-create bias in the right format
             attn_bias = create_attn_bias(
@@ -556,7 +556,7 @@ def test_forward(
         else:
             # bm3hk -> 3 x bmhk
             query, key, value = xformers.ops.unbind(c, 2)
-        assert not query.is_contiguous()
+        ##assert not query.is_contiguous()
 
     out = xformers.ops.memory_efficient_attention_forward(
         query, key, value, attn_bias, op=op

From d8133ca8f584ad4af6bf8efb71216180c36d973c Mon Sep 17 00:00:00 2001
From: Qianfeng Zhang <Qianfeng.Zhang@amd.com>
Date: Mon, 21 Aug 2023 21:02:42 +0000
Subject: [PATCH 023/641] Add type checking in attention_forward_generic.cpp

---
 tests/test_ck_3.py                                         | 2 ++
 .../csrc/attention/hip_fmha/attention_forward_generic.cpp  | 7 +++++++
 2 files changed, 9 insertions(+)

diff --git a/tests/test_ck_3.py b/tests/test_ck_3.py
index 92456452f..0b5eed425 100644
--- a/tests/test_ck_3.py
+++ b/tests/test_ck_3.py
@@ -356,6 +356,8 @@ def test_forward(dtype, fmt, packed, bias_type):
     device = torch.device("cuda")
     batch_size = 7
     q_len = 200
+
+    ## BottomRightMask requires generate {m0,m1,...}, {n0,n1,...} where mi <= ni
     if bias_type is fmha.attn_bias.BlockDiagonalCausalFromBottomRightMask:
         kv_len = int(q_len * 1.2) 
     else:
diff --git a/xformers/csrc/attention/hip_fmha/attention_forward_generic.cpp b/xformers/csrc/attention/hip_fmha/attention_forward_generic.cpp
index 2800029c6..54e4ce5d8 100644
--- a/xformers/csrc/attention/hip_fmha/attention_forward_generic.cpp
+++ b/xformers/csrc/attention/hip_fmha/attention_forward_generic.cpp
@@ -58,6 +58,9 @@ efficient_attention_forward_ck(
   // Embedding per head
   TORCH_CHECK(query.size(3) == key.size(3));
 
+  TORCH_CHECK(query.scalar_type() == key.scalar_type());
+  TORCH_CHECK(query.scalar_type() == value.scalar_type());
+
   TORCH_CHECK(seqstart_q.has_value() == seqstart_k.has_value());
   if (seqstart_q.has_value()) {
     TORCH_CHECK(seqstart_q->scalar_type() == at::ScalarType::Int);
@@ -141,6 +144,8 @@ efficient_attention_forward_ck(
         static_cast<int>(out.stride(3))};
 
     if (bias.has_value()) {
+      TORCH_CHECK(bias->scalar_type() == query.scalar_type());
+
       p.has_attn_bias = true;
       p.attn_bias_ptr = bias->data_ptr();
 
@@ -218,6 +223,8 @@ efficient_attention_forward_ck(
         static_cast<int>(out.stride(3))};
 
     if (bias.has_value()) {
+      TORCH_CHECK(bias->scalar_type() == query.scalar_type());
+
       p.has_attn_bias = true;
       const at::Tensor bias_4d_view =
           get_bias_4d_view(*bias, B, num_heads, M, N);

From 2960ae7c4c5605d9ab53406e4dd1d9f7b85be442 Mon Sep 17 00:00:00 2001
From: Qianfeng Zhang <Qianfeng.Zhang@amd.com>
Date: Tue, 22 Aug 2023 19:15:19 +0000
Subject: [PATCH 024/641] Use a different grouped ck-flashAttention device
 operator instance to prevent some failed cases

---
 tests/test_ck_3.py                                         | 3 +++
 tests/test_ck_4.py                                         | 4 +++-
 xformers/csrc/attention/hip_fmha/ck_fmha_grouped_forward.h | 6 +++---
 3 files changed, 9 insertions(+), 4 deletions(-)

diff --git a/tests/test_ck_3.py b/tests/test_ck_3.py
index 0b5eed425..6c69f5fd6 100644
--- a/tests/test_ck_3.py
+++ b/tests/test_ck_3.py
@@ -366,6 +366,9 @@ def test_forward(dtype, fmt, packed, bias_type):
     k = 64 
     kv = 64
 
+    if kv > 128:
+        pytest.skip("kv > 128 is not supported by CK-FlashAttention-1")
+
     if packed and not (k == kv and q_len == kv_len):
         pytest.skip(
             f"packed incompatible with `k ({k}) != kv ({kv})` or `q_len ({q_len}) != kv_len ({kv_len})`"
diff --git a/tests/test_ck_4.py b/tests/test_ck_4.py
index 24f4dbe5c..7358b36c6 100644
--- a/tests/test_ck_4.py
+++ b/tests/test_ck_4.py
@@ -501,7 +501,6 @@ def bmk2bmhk(tensor, num_heads: int) -> torch.Tensor:
         (0, 2, 1, 3)
     )
 
-
 @pytest.mark.parametrize("fmt", ["BMK", "BMHK"])
 @pytest.mark.parametrize("packed", [False, True])
 @parametrize_opFW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv
@@ -523,6 +522,9 @@ def test_forward(
         kv,
     ) = opFW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv
 
+    if kv > 128:
+        pytest.skip("kv > 128 is not supported by CK-FlashAttention-1")
+
     if packed and not (k == kv and q_len == kv_len):
         pytest.skip(
             f"packed incompatible with `k ({k}) != kv ({kv})` or `q_len ({q_len}) != kv_len ({kv_len})`"
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_forward.h b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_forward.h
index 97efabfe5..b895d47f7 100644
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_forward.h
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_forward.h
@@ -129,7 +129,7 @@ void grouped_forward_masktype_attnbias_dispatched(
           128, // MPerBlock
           128, // NPerBlock
           32, // KPerBlock
-          64, // Gemm1NPerBlock
+          128, // Gemm1NPerBlock
           32, // Gemm1KPerBlock
           8, // AK1
           8, // BK1
@@ -138,7 +138,7 @@ void grouped_forward_masktype_attnbias_dispatched(
           32, // NPerXDL
           1, // MXdlPerWave
           4, // NXdlPerWave
-          2, // Gemm1NXdlPerWave
+          4, // Gemm1NXdlPerWave
           1, // DropoutStep
           S<4, 64, 1>, // ABlockTransfer
           S<1, 0, 2>,
@@ -155,7 +155,7 @@ void grouped_forward_masktype_attnbias_dispatched(
           8,
           true,
           Acc0BiasTransferSrcScalarPerVector,
-          S<16, 16, 1>, // B1BlockTransfer
+          S<8, 32, 1>, // B1BlockTransfer
           S<0, 2, 1>,
           S<0, 2, 1>,
           1,

From 0da9bf2b311949598be7639edd7c00fb7d9e75c4 Mon Sep 17 00:00:00 2001
From: Qianfeng Zhang <Qianfeng.Zhang@amd.com>
Date: Tue, 22 Aug 2023 20:19:58 +0000
Subject: [PATCH 025/641] Add checking for attn_bias and seqlen_k in
 attention_forward_generic.cpp

---
 .../hip_fmha/attention_forward_generic.cpp         | 14 ++++++++++----
 1 file changed, 10 insertions(+), 4 deletions(-)

diff --git a/xformers/csrc/attention/hip_fmha/attention_forward_generic.cpp b/xformers/csrc/attention/hip_fmha/attention_forward_generic.cpp
index 54e4ce5d8..24b9f6b3b 100644
--- a/xformers/csrc/attention/hip_fmha/attention_forward_generic.cpp
+++ b/xformers/csrc/attention/hip_fmha/attention_forward_generic.cpp
@@ -144,6 +144,7 @@ efficient_attention_forward_ck(
         static_cast<int>(out.stride(3))};
 
     if (bias.has_value()) {
+      CHECK_NOSPARSE_LASTCONTIGUOUS_CUDA((*bias));
       TORCH_CHECK(bias->scalar_type() == query.scalar_type());
 
       p.has_attn_bias = true;
@@ -241,9 +242,6 @@ efficient_attention_forward_ck(
     p.host_seqstart_q.resize(p.num_batches + 1);
     p.host_seqstart_k.resize(p.num_batches + 1);
 
-    if (seqlen_k.has_value())
-      p.host_seqlen_k.resize(p.num_batches);
-
     FMHA_HIP_CHECK(hipMemcpy(
         p.host_seqstart_q.data(),
         seqstart_q->data_ptr(),
@@ -255,12 +253,20 @@ efficient_attention_forward_ck(
         (p.num_batches + 1) * sizeof(int32_t),
         hipMemcpyDeviceToHost));
 
-    if (seqlen_k.has_value())
+    if (seqlen_k.has_value()) {
+      TORCH_CHECK(seqlen_k->scalar_type() == at::ScalarType::Int);
+      TORCH_CHECK(seqlen_k->dim() == 1);
+      TORCH_CHECK(seqlen_k->size(0) == p.num_batches)
+      CHECK_NOSPARSE_CONTIGUOUS_CUDA((*seqlen_k));
+
+      p.host_seqlen_k.resize(p.num_batches);
+
       FMHA_HIP_CHECK(hipMemcpy(
           p.host_seqlen_k.data(),
           seqlen_k->data_ptr(),
           p.num_batches * sizeof(int32_t),
           hipMemcpyDeviceToHost));
+    }
 
     char* q_ptr = reinterpret_cast<char*>(query.data_ptr());
     char* k_ptr = reinterpret_cast<char*>(key.data_ptr());

From 99da85c16752099074d4e13df56cd27b066f63dc Mon Sep 17 00:00:00 2001
From: Qianfeng Zhang <Qianfeng.Zhang@amd.com>
Date: Tue, 22 Aug 2023 21:18:33 +0000
Subject: [PATCH 026/641] Split the C++ codes called by
 attention_forward_generic.cpp into 4 cpp files to speed-up the compiling

---
 .../hip_fmha/attention_forward_generic.cpp    | 31 ++++++++++++++---
 .../hip_fmha/ck_fmha_batched_forward.h        | 32 ------------------
 .../hip_fmha/ck_fmha_batched_forward_bp16.cpp | 28 ++++++++++++++++
 .../hip_fmha/ck_fmha_batched_forward_fp16.cpp | 28 ++++++++++++++++
 .../hip_fmha/ck_fmha_grouped_forward.h        | 32 ------------------
 .../hip_fmha/ck_fmha_grouped_forward_bp16.cpp | 28 ++++++++++++++++
 .../hip_fmha/ck_fmha_grouped_forward_fp16.cpp | 33 +++++++++++++++++++
 .../csrc/attention/hip_fmha/ck_fmha_util.h    |  1 +
 8 files changed, 145 insertions(+), 68 deletions(-)
 create mode 100644 xformers/csrc/attention/hip_fmha/ck_fmha_batched_forward_bp16.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/ck_fmha_batched_forward_fp16.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/ck_fmha_grouped_forward_bp16.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/ck_fmha_grouped_forward_fp16.cpp

diff --git a/xformers/csrc/attention/hip_fmha/attention_forward_generic.cpp b/xformers/csrc/attention/hip_fmha/attention_forward_generic.cpp
index 24b9f6b3b..652ef8092 100644
--- a/xformers/csrc/attention/hip_fmha/attention_forward_generic.cpp
+++ b/xformers/csrc/attention/hip_fmha/attention_forward_generic.cpp
@@ -11,10 +11,21 @@
 #include <c10/util/Optional.h>
 #include <torch/library.h>
 
-#include "ck_fmha_batched_forward.h"
-#include "ck_fmha_grouped_forward.h"
 #include "ck_fmha_util.h"
 
+extern void batched_forward_fp16(
+    BatchedForwardParams& param,
+    hipStream_t stream);
+extern void batched_forward_bp16(
+    BatchedForwardParams& param,
+    hipStream_t stream);
+extern void grouped_forward_fp16(
+    GroupedForwardParams& param,
+    hipStream_t stream);
+extern void grouped_forward_bp16(
+    GroupedForwardParams& param,
+    hipStream_t stream);
+
 namespace {
 
 /*
@@ -358,12 +369,24 @@ efficient_attention_forward_ck(
       BatchedForwardParams batched_forward_params;
 
       set_batched_forward_params(batched_forward_params);
-      batched_forward<scalar_t>(batched_forward_params, stream);
+
+      if constexpr (std::is_same<scalar_t, ck::half_t>::value) {
+        batched_forward_fp16(batched_forward_params, stream);
+      } else if constexpr (std::is_same<scalar_t, ck::bhalf_t>::value) {
+        batched_forward_bp16(batched_forward_params, stream);
+      } else
+        throw std::runtime_error("input data-type is not supported");
     } else { // input is grouped
       GroupedForwardParams grouped_forward_params;
 
       set_grouped_forward_params(grouped_forward_params);
-      grouped_forward<scalar_t>(grouped_forward_params, stream);
+
+      if constexpr (std::is_same<scalar_t, ck::half_t>::value) {
+        grouped_forward_fp16(grouped_forward_params, stream);
+      } else if constexpr (std::is_same<scalar_t, ck::bhalf_t>::value) {
+        grouped_forward_bp16(grouped_forward_params, stream);
+      } else
+        throw std::runtime_error("input data-type is not supported");
     }
   });
 
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_forward.h b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_forward.h
index 5cb94229d..e8ce9302a 100644
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_forward.h
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_forward.h
@@ -11,38 +11,6 @@
 
 #include "ck_fmha_util.h"
 
-template <typename scalar_t, int32_t custom_mask_type, bool has_attn_bias>
-void batched_forward_masktype_attnbias_dispatched(
-    BatchedForwardParams& param,
-    hipStream_t stream);
-
-template <typename scalar_t>
-void batched_forward(BatchedForwardParams& param, hipStream_t stream) {
-  if (param.custom_mask_type == 0) {
-    if (param.has_attn_bias)
-      batched_forward_masktype_attnbias_dispatched<scalar_t, 0, true>(
-          param, stream);
-    else
-      batched_forward_masktype_attnbias_dispatched<scalar_t, 0, false>(
-          param, stream);
-  } else if (param.custom_mask_type == 1) {
-    if (param.has_attn_bias)
-      batched_forward_masktype_attnbias_dispatched<scalar_t, 1, true>(
-          param, stream);
-    else
-      batched_forward_masktype_attnbias_dispatched<scalar_t, 1, false>(
-          param, stream);
-  } else if (param.custom_mask_type == 2) {
-    if (param.has_attn_bias)
-      batched_forward_masktype_attnbias_dispatched<scalar_t, 2, true>(
-          param, stream);
-    else
-      batched_forward_masktype_attnbias_dispatched<scalar_t, 2, false>(
-          param, stream);
-  } else
-    throw std::runtime_error("Invalid custom_mask_type value");
-};
-
 template <typename scalar_t, int32_t custom_mask_type = 0, bool has_attn_bias>
 void batched_forward_masktype_attnbias_dispatched(
     BatchedForwardParams& param,
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_forward_bp16.cpp b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_forward_bp16.cpp
new file mode 100644
index 000000000..82f6373da
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_forward_bp16.cpp
@@ -0,0 +1,28 @@
+#include <ck/ck.hpp>
+#include "ck_fmha_batched_forward.h"
+
+void batched_forward_bp16(BatchedForwardParams& param, hipStream_t stream) {
+  if (param.custom_mask_type == 0) {
+    if (param.has_attn_bias)
+      batched_forward_masktype_attnbias_dispatched<ck::bhalf_t, 0, true>(
+          param, stream);
+    else
+      batched_forward_masktype_attnbias_dispatched<ck::bhalf_t, 0, false>(
+          param, stream);
+  } else if (param.custom_mask_type == 1) {
+    if (param.has_attn_bias)
+      batched_forward_masktype_attnbias_dispatched<ck::bhalf_t, 1, true>(
+          param, stream);
+    else
+      batched_forward_masktype_attnbias_dispatched<ck::bhalf_t, 1, false>(
+          param, stream);
+  } else if (param.custom_mask_type == 2) {
+    if (param.has_attn_bias)
+      batched_forward_masktype_attnbias_dispatched<ck::bhalf_t, 2, true>(
+          param, stream);
+    else
+      batched_forward_masktype_attnbias_dispatched<ck::bhalf_t, 2, false>(
+          param, stream);
+  } else
+    throw std::runtime_error("Invalid custom_mask_type value");
+};
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_forward_fp16.cpp b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_forward_fp16.cpp
new file mode 100644
index 000000000..d502ea8a4
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_forward_fp16.cpp
@@ -0,0 +1,28 @@
+#include <ck/ck.hpp>
+#include "ck_fmha_batched_forward.h"
+
+void batched_forward_fp16(BatchedForwardParams& param, hipStream_t stream) {
+  if (param.custom_mask_type == 0) {
+    if (param.has_attn_bias)
+      batched_forward_masktype_attnbias_dispatched<ck::half_t, 0, true>(
+          param, stream);
+    else
+      batched_forward_masktype_attnbias_dispatched<ck::half_t, 0, false>(
+          param, stream);
+  } else if (param.custom_mask_type == 1) {
+    if (param.has_attn_bias)
+      batched_forward_masktype_attnbias_dispatched<ck::half_t, 1, true>(
+          param, stream);
+    else
+      batched_forward_masktype_attnbias_dispatched<ck::half_t, 1, false>(
+          param, stream);
+  } else if (param.custom_mask_type == 2) {
+    if (param.has_attn_bias)
+      batched_forward_masktype_attnbias_dispatched<ck::half_t, 2, true>(
+          param, stream);
+    else
+      batched_forward_masktype_attnbias_dispatched<ck::half_t, 2, false>(
+          param, stream);
+  } else
+    throw std::runtime_error("Invalid custom_mask_type value");
+};
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_forward.h b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_forward.h
index b895d47f7..91e16df74 100644
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_forward.h
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_forward.h
@@ -12,38 +12,6 @@
 
 #include "ck_fmha_util.h"
 
-template <typename scalar_t, int32_t custom_mask_type, bool has_attn_bias>
-void grouped_forward_masktype_attnbias_dispatched(
-    GroupedForwardParams& param,
-    hipStream_t stream);
-
-template <typename scalar_t>
-void grouped_forward(GroupedForwardParams& param, hipStream_t stream) {
-  if (param.custom_mask_type == 0) {
-    if (param.has_attn_bias)
-      grouped_forward_masktype_attnbias_dispatched<scalar_t, 0, true>(
-          param, stream);
-    else
-      grouped_forward_masktype_attnbias_dispatched<scalar_t, 0, false>(
-          param, stream);
-  } else if (param.custom_mask_type == 1) {
-    if (param.has_attn_bias)
-      grouped_forward_masktype_attnbias_dispatched<scalar_t, 1, true>(
-          param, stream);
-    else
-      grouped_forward_masktype_attnbias_dispatched<scalar_t, 1, false>(
-          param, stream);
-  } else if (param.custom_mask_type == 2) {
-    if (param.has_attn_bias)
-      grouped_forward_masktype_attnbias_dispatched<scalar_t, 2, true>(
-          param, stream);
-    else
-      grouped_forward_masktype_attnbias_dispatched<scalar_t, 2, false>(
-          param, stream);
-  } else
-    throw std::runtime_error("Invalid custom_mask_type value");
-};
-
 template <typename scalar_t, int32_t custom_mask_type = 0, bool has_attn_bias>
 void grouped_forward_masktype_attnbias_dispatched(
     GroupedForwardParams& param,
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_forward_bp16.cpp b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_forward_bp16.cpp
new file mode 100644
index 000000000..9d0e48a28
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_forward_bp16.cpp
@@ -0,0 +1,28 @@
+#include <ck/ck.hpp>
+#include "ck_fmha_grouped_forward.h"
+
+void grouped_forward_bp16(GroupedForwardParams& param, hipStream_t stream) {
+  if (param.custom_mask_type == 0) {
+    if (param.has_attn_bias)
+      grouped_forward_masktype_attnbias_dispatched<ck::bhalf_t, 0, true>(
+          param, stream);
+    else
+      grouped_forward_masktype_attnbias_dispatched<ck::bhalf_t, 0, false>(
+          param, stream);
+  } else if (param.custom_mask_type == 1) {
+    if (param.has_attn_bias)
+      grouped_forward_masktype_attnbias_dispatched<ck::bhalf_t, 1, true>(
+          param, stream);
+    else
+      grouped_forward_masktype_attnbias_dispatched<ck::bhalf_t, 1, false>(
+          param, stream);
+  } else if (param.custom_mask_type == 2) {
+    if (param.has_attn_bias)
+      grouped_forward_masktype_attnbias_dispatched<ck::bhalf_t, 2, true>(
+          param, stream);
+    else
+      grouped_forward_masktype_attnbias_dispatched<ck::bhalf_t, 2, false>(
+          param, stream);
+  } else
+    throw std::runtime_error("Invalid custom_mask_type value");
+};
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_forward_fp16.cpp b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_forward_fp16.cpp
new file mode 100644
index 000000000..578197f83
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_forward_fp16.cpp
@@ -0,0 +1,33 @@
+#include <ck/ck.hpp>
+#include "ck_fmha_grouped_forward.h"
+
+template <typename scalar_t, int32_t custom_mask_type, bool has_attn_bias>
+void grouped_forward_masktype_attnbias_dispatched(
+    GroupedForwardParams& param,
+    hipStream_t stream);
+
+void grouped_forward_fp16(GroupedForwardParams& param, hipStream_t stream) {
+  if (param.custom_mask_type == 0) {
+    if (param.has_attn_bias)
+      grouped_forward_masktype_attnbias_dispatched<ck::half_t, 0, true>(
+          param, stream);
+    else
+      grouped_forward_masktype_attnbias_dispatched<ck::half_t, 0, false>(
+          param, stream);
+  } else if (param.custom_mask_type == 1) {
+    if (param.has_attn_bias)
+      grouped_forward_masktype_attnbias_dispatched<ck::half_t, 1, true>(
+          param, stream);
+    else
+      grouped_forward_masktype_attnbias_dispatched<ck::half_t, 1, false>(
+          param, stream);
+  } else if (param.custom_mask_type == 2) {
+    if (param.has_attn_bias)
+      grouped_forward_masktype_attnbias_dispatched<ck::half_t, 2, true>(
+          param, stream);
+    else
+      grouped_forward_masktype_attnbias_dispatched<ck::half_t, 2, false>(
+          param, stream);
+  } else
+    throw std::runtime_error("Invalid custom_mask_type value");
+};
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_util.h b/xformers/csrc/attention/hip_fmha/ck_fmha_util.h
index 32e3d0a7e..0aed26cf9 100644
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_util.h
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_util.h
@@ -11,6 +11,7 @@
 #include <ck/ck.hpp>
 #include <ck/utility/data_type.hpp>
 #include <ck/utility/sequence.hpp>
+#include <ATen/cuda/CUDAGeneratorImpl.h>
 
 // Here flag can be a constant, variable or function call
 #define FMHA_HIP_CHECK(ret_or_call)                                          \

From 3fc8e220798fb3af0ed55e4c770135b4552640dd Mon Sep 17 00:00:00 2001
From: Qianfeng Zhang <Qianfeng.Zhang@amd.com>
Date: Wed, 23 Aug 2023 10:19:03 +0000
Subject: [PATCH 027/641] Split the C++ codes called by
 attention_backward_generic.cpp into 4 cpp files to speed-up the compiling

---
 .../hip_fmha/attention_backward_generic.cpp   | 32 ++++++++++++++++---
 .../hip_fmha/ck_fmha_batched_backward.h       | 18 +----------
 .../ck_fmha_batched_backward_bp16.cpp         | 15 +++++++++
 .../ck_fmha_batched_backward_fp16.cpp         | 15 +++++++++
 .../hip_fmha/ck_fmha_batched_forward_bp16.cpp |  2 ++
 .../hip_fmha/ck_fmha_batched_forward_fp16.cpp |  2 ++
 .../hip_fmha/ck_fmha_grouped_backward.h       | 18 +----------
 .../ck_fmha_grouped_backward_bp16.cpp         | 15 +++++++++
 .../ck_fmha_grouped_backward_fp16.cpp         | 15 +++++++++
 .../hip_fmha/ck_fmha_grouped_forward_bp16.cpp |  2 ++
 .../hip_fmha/ck_fmha_grouped_forward_fp16.cpp |  7 ++--
 11 files changed, 98 insertions(+), 43 deletions(-)
 create mode 100644 xformers/csrc/attention/hip_fmha/ck_fmha_batched_backward_bp16.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/ck_fmha_batched_backward_fp16.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/ck_fmha_grouped_backward_bp16.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/ck_fmha_grouped_backward_fp16.cpp

diff --git a/xformers/csrc/attention/hip_fmha/attention_backward_generic.cpp b/xformers/csrc/attention/hip_fmha/attention_backward_generic.cpp
index c4eb660de..1e73be6e9 100644
--- a/xformers/csrc/attention/hip_fmha/attention_backward_generic.cpp
+++ b/xformers/csrc/attention/hip_fmha/attention_backward_generic.cpp
@@ -9,11 +9,23 @@
 #include <c10/cuda/CUDAGuard.h>
 #include <torch/library.h>
 
-#include "ck_fmha_batched_backward.h"
-#include "ck_fmha_grouped_backward.h"
 #include "ck_fmha_util.h"
 
+extern void batched_backward_fp16(
+    BatchedBackwardParams& param,
+    hipStream_t stream);
+extern void batched_backward_bp16(
+    BatchedBackwardParams& param,
+    hipStream_t stream);
+extern void grouped_backward_fp16(
+    GroupedBackwardParams& param,
+    hipStream_t stream);
+extern void grouped_backward_bp16(
+    GroupedBackwardParams& param,
+    hipStream_t stream);
+
 namespace {
+
 std::tuple<at::Tensor, at::Tensor, at::Tensor, at::Tensor>
 efficient_attention_backward_ck(
     const at::Tensor& grad_out,
@@ -344,12 +356,24 @@ efficient_attention_backward_ck(
       BatchedBackwardParams batched_backward_params;
 
       set_batched_backward_params(batched_backward_params);
-      batched_backward<scalar_t>(batched_backward_params, stream);
+
+      if constexpr (std::is_same<scalar_t, ck::half_t>::value) {
+        batched_backward_fp16(batched_backward_params, stream);
+      } else if constexpr (std::is_same<scalar_t, ck::bhalf_t>::value) {
+        batched_backward_bp16(batched_backward_params, stream);
+      } else
+        throw std::runtime_error("input data-type is not supported");
     } else { // input is grouped
       GroupedBackwardParams grouped_backward_params;
 
       set_grouped_backward_params(grouped_backward_params);
-      grouped_backward<scalar_t>(grouped_backward_params, stream);
+
+      if constexpr (std::is_same<scalar_t, ck::half_t>::value) {
+        grouped_backward_fp16(grouped_backward_params, stream);
+      } else if constexpr (std::is_same<scalar_t, ck::bhalf_t>::value) {
+        grouped_backward_bp16(grouped_backward_params, stream);
+      } else
+        throw std::runtime_error("input data-type is not supported");
     }
   });
 
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_backward.h b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_backward.h
index b267b8590..9ce99c264 100644
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_backward.h
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_backward.h
@@ -1,5 +1,6 @@
 #pragma once
 
+#include <numeric>
 #include <sstream>
 #include <stdexcept>
 
@@ -11,23 +12,6 @@
 
 #include "ck_fmha_util.h"
 
-template <typename scalar_t, int32_t custom_mask_type>
-void batched_backward_mask_type_dispatched(
-    BatchedBackwardParams& param,
-    hipStream_t stream);
-
-template <typename scalar_t>
-void batched_backward(BatchedBackwardParams& param, hipStream_t stream) {
-  if (param.custom_mask_type == 0)
-    batched_backward_mask_type_dispatched<scalar_t, 0>(param, stream);
-  else if (param.custom_mask_type == 1)
-    batched_backward_mask_type_dispatched<scalar_t, 1>(param, stream);
-  else if (param.custom_mask_type == 2)
-    batched_backward_mask_type_dispatched<scalar_t, 2>(param, stream);
-  else
-    throw std::runtime_error("Invalid custom_mask_type value");
-};
-
 template <typename scalar_t, int32_t custom_mask_type = 0>
 void batched_backward_mask_type_dispatched(
     BatchedBackwardParams& param,
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_backward_bp16.cpp b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_backward_bp16.cpp
new file mode 100644
index 000000000..69b1e5065
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_backward_bp16.cpp
@@ -0,0 +1,15 @@
+#include <ck.hpp>
+#include <stdexcept>
+
+#include "ck_fmha_batched_backward.h"
+
+void batched_backward_bp16(BatchedBackwardParams& param, hipStream_t stream) {
+  if (param.custom_mask_type == 0)
+    batched_backward_mask_type_dispatched<ck::bhalf_t, 0>(param, stream);
+  else if (param.custom_mask_type == 1)
+    batched_backward_mask_type_dispatched<ck::bhalf_t, 1>(param, stream);
+  else if (param.custom_mask_type == 2)
+    batched_backward_mask_type_dispatched<ck::bhalf_t, 2>(param, stream);
+  else
+    throw std::runtime_error("Invalid custom_mask_type value");
+};
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_backward_fp16.cpp b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_backward_fp16.cpp
new file mode 100644
index 000000000..273a2ee06
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_backward_fp16.cpp
@@ -0,0 +1,15 @@
+#include <ck.hpp>
+#include <stdexcept>
+
+#include "ck_fmha_batched_backward.h"
+
+void batched_backward_fp16(BatchedBackwardParams& param, hipStream_t stream) {
+  if (param.custom_mask_type == 0)
+    batched_backward_mask_type_dispatched<ck::half_t, 0>(param, stream);
+  else if (param.custom_mask_type == 1)
+    batched_backward_mask_type_dispatched<ck::half_t, 1>(param, stream);
+  else if (param.custom_mask_type == 2)
+    batched_backward_mask_type_dispatched<ck::half_t, 2>(param, stream);
+  else
+    throw std::runtime_error("Invalid custom_mask_type value");
+};
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_forward_bp16.cpp b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_forward_bp16.cpp
index 82f6373da..10bf8ee59 100644
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_forward_bp16.cpp
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_forward_bp16.cpp
@@ -1,4 +1,6 @@
 #include <ck/ck.hpp>
+#include <stdexcept>
+
 #include "ck_fmha_batched_forward.h"
 
 void batched_forward_bp16(BatchedForwardParams& param, hipStream_t stream) {
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_forward_fp16.cpp b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_forward_fp16.cpp
index d502ea8a4..ea11d170a 100644
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_forward_fp16.cpp
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_forward_fp16.cpp
@@ -1,4 +1,6 @@
 #include <ck/ck.hpp>
+#include <stdexcept>
+
 #include "ck_fmha_batched_forward.h"
 
 void batched_forward_fp16(BatchedForwardParams& param, hipStream_t stream) {
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_backward.h b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_backward.h
index 62ce0df01..eabbfa84a 100644
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_backward.h
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_backward.h
@@ -1,5 +1,6 @@
 #pragma once
 
+#include <numeric>
 #include <sstream>
 #include <stdexcept>
 
@@ -13,23 +14,6 @@
 
 #include "ck_fmha_util.h"
 
-template <typename scalar_t, int32_t custom_mask_type>
-void grouped_backward_mask_type_dispatched(
-    GroupedBackwardParams& param,
-    hipStream_t stream);
-
-template <typename scalar_t>
-void grouped_backward(GroupedBackwardParams& param, hipStream_t stream) {
-  if (param.custom_mask_type == 0)
-    grouped_backward_mask_type_dispatched<scalar_t, 0>(param, stream);
-  else if (param.custom_mask_type == 1)
-    grouped_backward_mask_type_dispatched<scalar_t, 1>(param, stream);
-  else if (param.custom_mask_type == 2)
-    grouped_backward_mask_type_dispatched<scalar_t, 2>(param, stream);
-  else
-    throw std::runtime_error("Invalid custom_mask_type value");
-};
-
 template <typename scalar_t, int32_t custom_mask_type = 0>
 void grouped_backward_mask_type_dispatched(
     GroupedBackwardParams& param,
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_backward_bp16.cpp b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_backward_bp16.cpp
new file mode 100644
index 000000000..3c76d137d
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_backward_bp16.cpp
@@ -0,0 +1,15 @@
+#include <ck/ck.hpp>
+#include <stdexcept>
+
+#include "ck_fmha_grouped_backward.h"
+
+void grouped_backward_bp16(GroupedBackwardParams& param, hipStream_t stream) {
+  if (param.custom_mask_type == 0)
+    grouped_backward_mask_type_dispatched<ck::bhalf_t, 0>(param, stream);
+  else if (param.custom_mask_type == 1)
+    grouped_backward_mask_type_dispatched<ck::bhalf_t, 1>(param, stream);
+  else if (param.custom_mask_type == 2)
+    grouped_backward_mask_type_dispatched<ck::bhalf_t, 2>(param, stream);
+  else
+    throw std::runtime_error("Invalid custom_mask_type value");
+};
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_backward_fp16.cpp b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_backward_fp16.cpp
new file mode 100644
index 000000000..912023ca4
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_backward_fp16.cpp
@@ -0,0 +1,15 @@
+#include <ck/ck.hpp>
+#include <stdexcept>
+
+#include "ck_fmha_grouped_backward.h"
+
+void grouped_backward_fp16(GroupedBackwardParams& param, hipStream_t stream) {
+  if (param.custom_mask_type == 0)
+    grouped_backward_mask_type_dispatched<ck::half_t, 0>(param, stream);
+  else if (param.custom_mask_type == 1)
+    grouped_backward_mask_type_dispatched<ck::half_t, 1>(param, stream);
+  else if (param.custom_mask_type == 2)
+    grouped_backward_mask_type_dispatched<ck::half_t, 2>(param, stream);
+  else
+    throw std::runtime_error("Invalid custom_mask_type value");
+};
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_forward_bp16.cpp b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_forward_bp16.cpp
index 9d0e48a28..161818a39 100644
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_forward_bp16.cpp
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_forward_bp16.cpp
@@ -1,4 +1,6 @@
 #include <ck/ck.hpp>
+#include <stdexcept>
+
 #include "ck_fmha_grouped_forward.h"
 
 void grouped_forward_bp16(GroupedForwardParams& param, hipStream_t stream) {
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_forward_fp16.cpp b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_forward_fp16.cpp
index 578197f83..592bc89e4 100644
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_forward_fp16.cpp
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_forward_fp16.cpp
@@ -1,10 +1,7 @@
 #include <ck/ck.hpp>
-#include "ck_fmha_grouped_forward.h"
+#include <stdexcept>
 
-template <typename scalar_t, int32_t custom_mask_type, bool has_attn_bias>
-void grouped_forward_masktype_attnbias_dispatched(
-    GroupedForwardParams& param,
-    hipStream_t stream);
+#include "ck_fmha_grouped_forward.h"
 
 void grouped_forward_fp16(GroupedForwardParams& param, hipStream_t stream) {
   if (param.custom_mask_type == 0) {

From 5575ba034dd94b238e49e0d793c1f6344162d4bf Mon Sep 17 00:00:00 2001
From: Qianfeng Zhang <Qianfeng.Zhang@amd.com>
Date: Wed, 23 Aug 2023 17:55:11 +0000
Subject: [PATCH 028/641] Add comments for the commented code-line in
 create_attn_bias

---
 tests/test_ck_3.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/tests/test_ck_3.py b/tests/test_ck_3.py
index 6c69f5fd6..3b4458dd8 100644
--- a/tests/test_ck_3.py
+++ b/tests/test_ck_3.py
@@ -216,6 +216,8 @@ def create_attn_bias(
                 dtype=dtype,
             )
 
+            # ToDo: need a fix in ck-flashAttn to avoid divided-by-zero when all-(-inf) occurred
+            #       with the data read by one-thread
             # make sure it also works if the first columns are partially masked out
             # attn_bias[0, 0, q_len - 1 :, : num_heads - 2] = -math.inf
 

From 478ec41206a8dcd7b611817bff57816ee56f17f0 Mon Sep 17 00:00:00 2001
From: Qianfeng Zhang <Qianfeng.Zhang@amd.com>
Date: Wed, 23 Aug 2023 18:17:10 +0000
Subject: [PATCH 029/641] Update to composable_kernel to latest commit and
 remove un-needed including

---
 third_party/composable_kernel                               | 2 +-
 xformers/csrc/attention/hip_fmha/ck_fmha_batched_backward.h | 1 -
 xformers/csrc/attention/hip_fmha/ck_fmha_grouped_backward.h | 1 -
 3 files changed, 1 insertion(+), 3 deletions(-)

diff --git a/third_party/composable_kernel b/third_party/composable_kernel
index e296ee56b..226355e7e 160000
--- a/third_party/composable_kernel
+++ b/third_party/composable_kernel
@@ -1 +1 @@
-Subproject commit e296ee56b35207af047ef3a5cb0f00788c9f2cf0
+Subproject commit 226355e7e885881cdd904aec4df872fedb5447cd
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_backward.h b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_backward.h
index 9ce99c264..1b14c772f 100644
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_backward.h
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_backward.h
@@ -1,6 +1,5 @@
 #pragma once
 
-#include <numeric>
 #include <sstream>
 #include <stdexcept>
 
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_backward.h b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_backward.h
index eabbfa84a..bd86d7c32 100644
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_backward.h
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_backward.h
@@ -1,6 +1,5 @@
 #pragma once
 
-#include <numeric>
 #include <sstream>
 #include <stdexcept>
 

From 161a7d5095b258cd4ab2fe9b309eebdbfeaf8451 Mon Sep 17 00:00:00 2001
From: Qianfeng Zhang <Qianfeng.Zhang@amd.com>
Date: Thu, 24 Aug 2023 16:43:02 +0000
Subject: [PATCH 030/641] Add test_mem_eff_attention_ck.py and
 tests/readme_test_on_rocm.txt

---
 tests/readme_test_on_rocm.txt      |    8 +
 tests/test_ck_1.py                 |   33 -
 tests/test_ck_2.py                 |  558 ---------
 tests/test_ck_3.py                 |  434 -------
 tests/test_ck_4.py                 |  583 ---------
 tests/test_mem_eff_attention_ck.py | 1783 ++++++++++++++++++++++++++++
 6 files changed, 1791 insertions(+), 1608 deletions(-)
 create mode 100644 tests/readme_test_on_rocm.txt
 delete mode 100644 tests/test_ck_1.py
 delete mode 100644 tests/test_ck_2.py
 delete mode 100644 tests/test_ck_3.py
 delete mode 100644 tests/test_ck_4.py
 create mode 100644 tests/test_mem_eff_attention_ck.py

diff --git a/tests/readme_test_on_rocm.txt b/tests/readme_test_on_rocm.txt
new file mode 100644
index 000000000..5b5ce25aa
--- /dev/null
+++ b/tests/readme_test_on_rocm.txt
@@ -0,0 +1,8 @@
+
+   1. pip install -e ./
+
+   2. verify testing for memory_efficient_attention inference 
+
+      pytest -k test_forward tests/test_mem_eff_attention_ck.py
+
+
diff --git a/tests/test_ck_1.py b/tests/test_ck_1.py
deleted file mode 100644
index b5dba2d21..000000000
--- a/tests/test_ck_1.py
+++ /dev/null
@@ -1,33 +0,0 @@
-# Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
-#
-# This source code is licensed under the BSD license found in the
-# LICENSE file in the root directory of this source tree.
-
-import random
-
-import pytest
-import torch
-
-from xformers.ops.common import get_xformers_operator
-
-B = 7
-M = 1000
-N = 1000
-H = 13
-K = 64
-Kv = 64
-
-_types = [torch.float16, torch.bfloat16]
-
-@pytest.mark.parametrize("test_type", _types)
-def test_types(test_type):
-    query = torch.rand((B, M, H, K), device=torch.device("cuda"), dtype=test_type)
-    key = torch.rand((B, N, H, K), device=torch.device("cuda"), dtype=test_type)
-    val = torch.rand((B, N, H, Kv), device=torch.device("cuda"), dtype=test_type)
-
-    Operator=get_xformers_operator("efficient_attention_forward_ck")
-
-    out, lse, rng_seed, rng_offset = Operator(query=query, key=key, value=val, attn_bias=None, seqstart_q=None, seqstart_k=None, dropout_p=0.0, compute_logsumexp=False, custom_mask_type=0, scale=None, seqlen_k=None)
-
-    print(rng_seed)
-
diff --git a/tests/test_ck_2.py b/tests/test_ck_2.py
deleted file mode 100644
index 5382ba5bf..000000000
--- a/tests/test_ck_2.py
+++ /dev/null
@@ -1,558 +0,0 @@
-# Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
-#
-# This source code is licensed under the BSD license found in the
-# LICENSE file in the root directory of this source tree.
-
-import math
-import random
-from typing import List, Optional, Sequence, Tuple, Type, TypeVar
-
-import pytest
-import torch
-
-## need to FIX
-##from scipy.stats import binomtest
-from torch.utils.checkpoint import checkpoint
-
-import xformers.ops
-from xformers.ops import fmha
-from xformers.ops.fmha.common import AttentionOpBase
-
-from .utils import assert_allclose
-
-torch.backends.cuda.matmul.allow_tf32 = False
-cuda_only = pytest.mark.skipif(not torch.cuda.is_available(), reason="requires CUDA")
-_devices = ["cuda"] if torch.cuda.is_available() else ["cpu"]
-_types = [torch.float16, torch.bfloat16]
-
-ALL_FW_OPS: Sequence[Type[fmha.common.AttentionFwOpBase]] = [
-    fmha.ck.FwOp,
-]
-
-ALL_BW_OPS: Sequence[Type[fmha.common.AttentionBwOpBase]] = [
-    fmha.ck.BwOp,
-]
-
-T = TypeVar(
-    "T", Type[fmha.common.AttentionFwOpBase], Type[fmha.common.AttentionBwOpBase]
-)
-
-def sample_random_supported_fw(
-    inp: fmha.Inputs, seed: int
-) -> Type[fmha.common.AttentionFwOpBase]:
-    r = random.Random(seed)
-    fw_ops = list(ALL_FW_OPS)
-    r.shuffle(fw_ops)
-    for op in fw_ops:
-        if op.supports(inp):
-            return op
-    raise NotImplementedError(f"Could not find a FW operator for: {inp}")
-
-
-def generate_test_shapes_B_Mq_Mkv_H_K_Kv(op):
-    shapes = []
-    for B in op._TEST_BATCH_SIZES:
-        for Mq in [32, 256]:
-            for Mkv in [32, 64, 256]:
-                for K in op._TEST_K:
-                    shapes.append((B, Mq, Mkv, 1, K, K))
-        Mq = 256
-        Mkv = 128
-        K = 32
-        H = 1
-        # Weird values of parameters
-        for M in [2, 3, 15, 31, 32, 34, 68, 72, 90, 132, 136]:
-            shapes.append((B, M, Mkv, H, K, K))
-            shapes.append((B, Mq, M, H, K, K))
-        for _K in [1, 2, 3, 31, 34, 36, 38, 40, 64, 256 + 2, 256 + 8, 512]:
-            if _K <= op.SUPPORTED_MAX_K:
-                shapes.append((B, Mq, Mkv, H, _K, _K))
-        # Different value for K / Kv
-        if op.SUPPORTS_DIFFERENT_VALUE_EMBED:
-            for _K in [32, 36, 64, 256 + 8]:
-                shapes.append((B, Mq, Mkv, H, K, _K))
-                shapes.append((B, Mq, Mkv, H, _K, K))
-        # Exotic sizes
-        for K in op._TEST_K:
-            shapes.append((B, 16, 1024, H, K, K))
-            shapes.append((B, 1024, 16, H, K, K))
-        # Some number of heads
-        for H in [3, 5, 12]:
-            shapes.append((max(1, B // H), Mq, Mkv, H, K, K))
-    # Add some random shapes
-    if op in [
-        fmha.ck.FwOp,
-        fmha.ck.BwOp,
-    ]:
-        K_CHOICES = [8 * i for i in range(1, 256 // 8)]
-        r = random.Random(0)
-        for _ in range(20):
-            B = r.randint(1, 400)
-            Mq = r.randint(1, 500)
-            Mkv = r.randint(1, 500)
-            H = r.randint(2, 11)
-            B = max(B // H, 1)
-            K = r.choice(K_CHOICES)
-            Kv = r.choice(K_CHOICES)
-            if not op.SUPPORTS_DIFFERENT_VALUE_EMBED:
-                Kv = K
-            shapes.append((B, Mq, Mkv, H, K, Kv))
-    return shapes
-
-
-def _generate_op_device_dtype_biasT_B_Mq_Mkv_H_K_Kv(
-    ops_list: Sequence[Type[fmha.AttentionOpBase]], max_shapes_per_op: int = 65000
-):
-    r = random.Random(0)
-    combination = []
-    ids = []
-    for op in ops_list:
-        op_count = 0
-        # Sort list of masks, so it's deterministic across runs
-        LIST_MASKS = list(
-            sorted(list(op.SUPPORTED_ATTN_BIAS_TYPES), key=lambda x: str(x))
-        )
-        for shape in generate_test_shapes_B_Mq_Mkv_H_K_Kv(op):
-            has_one = False
-            for device in _devices:
-                if device not in op.SUPPORTED_DEVICES:
-                    continue
-                for dtype in op.SUPPORTED_DTYPES:
-                    bias_type = r.choice(LIST_MASKS)
-                    # Avoid using too much memory
-                    if bias_type not in [
-                        type(None),
-                        fmha.attn_bias.LowerTriangularMask,
-                    ]:
-                        B, Mq, Mkv, H, K, Kv = shape
-                        B = min(B, 12)
-
-                        if (
-                            bias_type
-                            is fmha.attn_bias.BlockDiagonalCausalFromBottomRightMask
-                        ):
-                            Mq, Mkv = min(Mkv, Mq), max(Mkv, Mq) + 2
-                        elif (
-                            bias_type
-                            is fmha.attn_bias.BlockDiagonalCausalWithOffsetPaddedKeysMask
-                        ):
-                            Mq, Mkv = min(Mkv, Mq), max(Mkv, Mq)
-                        shape = (B, Mq, Mkv, H, K, Kv)
-                    combination.append((op, device, dtype, bias_type, *shape))
-                    ids.append(
-                        f"{op.NAME}-{device}-{str(dtype)}-{bias_type.__name__}"
-                        f"-{'-'.join([str(s) for s in shape])}"
-                    )
-                    has_one = True
-            if has_one:
-                op_count += 1
-            if op_count > max_shapes_per_op:
-                break
-        # Some specific shapes for which we want to run without any mask
-        bias_type = type(None)
-        for shape in (
-            # Some strides/dims don't fit on an uint16
-            (1, 128, 128, 300, 128, 128),
-            (13, 1, 67, 200, 8, 8),
-            (1, 1 + 2**16, 4, 1, 8, 8),
-            (1, 4, 1 + 2**16, 1, 8, 8),
-            # TODO: Some strides don't fit on an uint32
-            # Crashes on Flash, Errors on Cutlass
-            # (1, 1, 64000, 300, 128, 128)
-        ):
-            for device in _devices:
-                if device not in op.SUPPORTED_DEVICES:
-                    continue
-                for dtype in op.SUPPORTED_DTYPES:
-                    combination.append((op, device, dtype, bias_type, *shape))
-                    ids.append(
-                        f"{op.NAME}-{device}-{str(dtype)}-{bias_type.__name__}"
-                        f"-{'-'.join([str(s) for s in shape])}"
-                    )
-    return {
-        "argvalues": combination,
-        "ids": ids,
-    }
-
-
-parametrize_opFW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv = pytest.mark.parametrize(
-    "opFW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv",
-    **_generate_op_device_dtype_biasT_B_Mq_Mkv_H_K_Kv(ALL_FW_OPS),
-)
-parametrize_opFW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv__xs = pytest.mark.parametrize(
-    "opFW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv",
-    **_generate_op_device_dtype_biasT_B_Mq_Mkv_H_K_Kv(ALL_FW_OPS, max_shapes_per_op=1),
-)
-parametrize_opBW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv = pytest.mark.parametrize(
-    "opBW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv",
-    **_generate_op_device_dtype_biasT_B_Mq_Mkv_H_K_Kv(ALL_BW_OPS),
-)
-parametrize_opBW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv__xs = pytest.mark.parametrize(
-    "opBW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv",
-    **_generate_op_device_dtype_biasT_B_Mq_Mkv_H_K_Kv(ALL_BW_OPS, max_shapes_per_op=1),
-)
-
-
-def ref_attention(q, k, v, attn_bias=None, drop_mask=None, p=0.0, scale=None):
-    if q.ndim == 4:
-        assert p == 0.0
-        return ref_attention_bmhk(q, k, v, attn_bias=attn_bias)
-    q = q.float()
-    k = k.float()
-    v = v.float()
-
-    scale = scale if scale is not None else (1 / q.shape[-1] ** 0.5)
-    q = q * scale
-
-    attn = q @ k.transpose(-2, -1)
-    if attn_bias is not None:
-        if isinstance(attn_bias, xformers.ops.AttentionBias):
-            # Always create in B,H,Mq,Mk format
-            attn_bias_tensor = attn_bias.materialize(
-                (q.shape[0], 1, q.shape[1], k.shape[1]),
-                device=q.device,
-                dtype=torch.float32,
-            )
-        else:
-            attn_bias_tensor = attn_bias
-        if attn_bias_tensor.ndim == 4:
-            assert q.shape[0] == attn_bias_tensor.shape[0] * attn_bias_tensor.shape[1]
-            attn_bias_tensor = attn_bias_tensor.reshape(
-                [-1, *attn_bias_tensor.shape[2:]]
-            )
-        attn = attn + attn_bias_tensor.float()
-    attn = attn.softmax(-1)
-    if drop_mask is not None:
-        attn = attn * (drop_mask / (1 - p))
-    return attn @ v
-
-
-def ref_attention_bmhk(q, k, v, attn_bias, scale=None) -> torch.Tensor:
-    assert q.ndim == 4
-
-    def T(t):
-        return t.permute((0, 2, 1, 3)).reshape(
-            [t.shape[0] * t.shape[2], t.shape[1], t.shape[3]]
-        )
-
-    if isinstance(attn_bias, xformers.ops.AttentionBias):
-        attn_bias = attn_bias.materialize(
-            (q.shape[0], q.shape[2], q.shape[1], k.shape[1]),
-            device=q.device,
-            dtype=torch.float32,
-        ).reshape([q.shape[0] * q.shape[2], q.shape[1], k.shape[1]])
-    out = ref_attention(T(q), T(k), T(v), attn_bias, scale=scale)
-    out = out.reshape([q.shape[0], q.shape[2], q.shape[1], v.shape[3]])
-    return out.permute((0, 2, 1, 3))
-
-
-def _rand_seqlens(
-    r: random.Random,
-    bs: int,
-    q_len: int,
-    kv_len: int,
-    more_keys_than_queries_per_block: bool,
-) -> Tuple[Sequence[int], Sequence[int]]:
-    """
-    Generates lists of lengths of query blocks and corresponding key blocks.
-    The total number of queries will be bs * q_len and the
-    total number of keys will be bs * kv_len.
-    """
-    if more_keys_than_queries_per_block:
-        assert kv_len >= q_len
-    q_len *= bs
-    kv_len *= bs
-    seqlens_q: List[int] = []
-    seqlens_k: List[int] = []
-
-    step_q = [max(1, q_len // 10), max(2, q_len // 2)]
-    step_k = [max(1, kv_len // 10), max(2, kv_len // 2)]
-    while sum(seqlens_q) < q_len and sum(seqlens_k) < kv_len:
-        num_queries = r.randrange(*step_q)
-        seqlens_q.append(num_queries)
-
-        if more_keys_than_queries_per_block:
-            # Must select at least `num_queries` keys
-            # But also leave enough keys for later
-            keys_left = kv_len - sum(seqlens_k, 0)
-            queries_left = q_len - sum(seqlens_q[:-1], 0)
-            assert keys_left >= queries_left
-            seqlens_k.append(num_queries + r.randrange(0, keys_left - queries_left))
-        else:
-            seqlens_k.append(r.randrange(*step_k))
-    seqlens_q[-1] = q_len - sum(seqlens_q[:-1])
-    seqlens_k[-1] = kv_len - sum(seqlens_k[:-1])
-    return seqlens_q, seqlens_k
-
-
-def _rand_partition(r: random.Random, total: int, n: int) -> List[int]:
-    # returns list of n nonnegative integers summing to total
-    idx = {0, total}
-    while len(idx) < n + 1:
-        idx.add(r.randint(1, total - 1))
-    s = sorted(idx)
-    return [e - b for b, e in zip(s[:-1], s[1:])]
-
-
-def _rand_maxed_partition(
-    r: random.Random, total: int, n: int, mx: int, positive: bool = True
-) -> List[int]:
-    # returns list of n nonnegative integers less than mx summing to total
-    # NB: This is unfortunately biased towards evenly-split bins.
-    # If `positive`, outputs are positive
-    if positive:
-        total -= n
-        mx -= 1
-    idxs = r.sample(range(n * mx), total)
-    y = torch.zeros(n, mx, dtype=torch.int32)
-    y.flatten()[idxs] = 1
-    z = y.sum(1)
-    if positive:
-        z += 1
-    return z.tolist()
-
-
-def _rand_seqlens_padded_k(
-    r: random.Random, bs: int, q_len: int, kv_len: int
-) -> Tuple[Sequence[int], Sequence[int]]:
-    # This is for BlockDiagonalCausalWithOffsetPaddedKeysMask.
-    # we need q_seqlens and k_seqlens to be of len bsz.
-    # For each "batch element" there must be more keys than queries
-    # because this bias type is "bottom right" and so any extra queries
-    # will attend to nothing and have undefined result.
-    # In addition every element of k_seqlens must be <= kv_len
-    if q_len > kv_len:
-        raise ValueError("need more keys than values")
-    if q_len == kv_len:
-        # all key slots are needed so we cannot have padding
-        q_seqlens = k_seqlens = [kv_len] * bs
-    else:
-        q_seqlens = _rand_maxed_partition(r, q_len * bs, bs, kv_len)
-        k_seqlens = [r.randint(i, kv_len) for i in q_seqlens]
-    return q_seqlens, k_seqlens
-
-
-def _create_aligned_bias(B: int, H: int, Mq: int, Mkv: int, **kwargs) -> torch.Tensor:
-    align_to = 8
-    return (
-        torch.randn(
-            (
-                B,
-                H,
-                Mq,
-                align_to * ((Mkv + align_to - 1) // align_to),
-            ),
-            **kwargs,
-        )
-        * 3
-    )[:, :, :, :Mkv]
-
-
-def create_attn_bias(
-    bias_type,
-    batch_size: int,
-    num_heads: int,
-    q_len: int,
-    kv_len: int,
-    device,
-    dtype,
-    requires_grad: bool,
-    fmt: str,
-    op: Type[AttentionOpBase],
-):
-    if bias_type is None or isinstance(None, bias_type):
-        return None
-    r = random.Random("-".join(map(str, [batch_size, q_len, kv_len, dtype, fmt])))
-    if bias_type is torch.Tensor:
-        if fmt == "BMK":
-            batch_size *= num_heads
-            num_heads = 1
-        # `small_k` only supports an expanded 1d bias
-        if op in [fmha.small_k.FwOp, fmha.small_k.BwOp]:
-            attn_bias = (
-                torch.randn(
-                    (batch_size, num_heads, 1, kv_len), device=device, dtype=dtype
-                )
-                * 3
-            )
-            attn_bias = attn_bias.expand(batch_size, num_heads, q_len, kv_len)
-        else:
-            attn_bias = _create_aligned_bias(
-                batch_size,
-                num_heads,
-                q_len,
-                kv_len,
-                device=device,
-                dtype=dtype,
-            )
-
-            # make sure it also works if the first columns are partially masked out
-            attn_bias[0, 0, q_len - 1 :, : num_heads - 2] = -math.inf
-
-        if requires_grad:
-            attn_bias.requires_grad_(True)
-        return attn_bias
-    if bias_type is fmha.attn_bias.LowerTriangularMask:
-        return fmha.attn_bias.LowerTriangularMask()
-    if bias_type is fmha.attn_bias.LowerTriangularMaskWithTensorBias:
-        attn_bias = _create_aligned_bias(
-            batch_size,
-            num_heads,
-            q_len,
-            kv_len,
-            device=device,
-            dtype=dtype,
-        )
-        if requires_grad:
-            attn_bias.requires_grad_(True)
-        return fmha.attn_bias.LowerTriangularMaskWithTensorBias(attn_bias)
-    if bias_type in [
-        fmha.attn_bias.BlockDiagonalMask,
-        fmha.attn_bias.BlockDiagonalCausalMask,
-        fmha.attn_bias.BlockDiagonalCausalFromBottomRightMask,
-    ]:
-        # This bias is not supported in BMK format
-        assert fmt == "BMHK"
-        block_diag = fmha.attn_bias.BlockDiagonalMask.from_seqlens(
-            *_rand_seqlens(
-                r,
-                batch_size,
-                q_len,
-                kv_len,
-                more_keys_than_queries_per_block=bias_type
-                is fmha.attn_bias.BlockDiagonalCausalFromBottomRightMask,
-            )
-        )
-        if bias_type is fmha.attn_bias.BlockDiagonalCausalMask:
-            block_diag = block_diag.make_causal()
-        if bias_type is fmha.attn_bias.BlockDiagonalCausalFromBottomRightMask:
-            block_diag = block_diag.make_causal_from_bottomright()
-        return block_diag
-    if bias_type == fmha.attn_bias.BlockDiagonalCausalWithOffsetPaddedKeysMask:
-        assert fmt == "BMHK"
-        q, k = _rand_seqlens_padded_k(r, batch_size, q_len, kv_len)
-        g_block_diag = (
-            fmha.attn_bias.BlockDiagonalCausalWithOffsetPaddedKeysMask.from_seqlens(
-                q_seqlen=q,
-                kv_padding=kv_len,
-                kv_seqlen=k,
-            )
-        )
-        return g_block_diag
-
-    assert False, f"Unsupported bias type: {bias_type}"
-
-'''
-def get_bias_grad(attn_bias, clear: bool = False) -> Optional[torch.Tensor]:
-    tensor_with_grad: Optional[torch.Tensor] = None
-    if isinstance(attn_bias, torch.Tensor):
-        tensor_with_grad = attn_bias
-    if isinstance(attn_bias, fmha.attn_bias.LowerTriangularMaskWithTensorBias):
-        tensor_with_grad = attn_bias._bias
-    if tensor_with_grad is not None:
-        grad = tensor_with_grad.grad
-        if clear:
-            tensor_with_grad.grad = None
-        return grad
-    return None
-'''
-
-def create_tensors(
-    op: Type[AttentionOpBase],
-    device,
-    dtype,
-    attn_bias_type,
-    B,
-    q_len,
-    kv_len,
-    h,
-    k,
-    kv,
-    *,
-    attn_bias_requires_grad: bool = False,
-    fmt: str = "BMK",
-):
-    torch.manual_seed(B * q_len + kv_len * k + kv)
-    scale = 3
-    if fmt == "BMK":
-        query = torch.randn((B * h, q_len, k), device=device, dtype=dtype).mul_(scale)
-        key = torch.randn((B * h, kv_len, k), device=device, dtype=dtype).mul_(scale)
-        value = torch.randn((B * h, kv_len, kv), device=device, dtype=dtype).mul_(scale)
-    else:
-        assert fmt == "BMHK"
-        query = torch.randn((B, q_len, h, k), device=device, dtype=dtype).mul_(scale)
-        key = torch.randn((B, kv_len, h, k), device=device, dtype=dtype).mul_(scale)
-        value = torch.randn((B, kv_len, h, kv), device=device, dtype=dtype).mul_(scale)
-
-    if fmt == "BMK" and not fmha.common._is_bias_type_supported_in_BMK(attn_bias_type):
-        attn_bias_type = None
-    attn_bias = None
-    if attn_bias_type is not None:
-        attn_bias = create_attn_bias(
-            attn_bias_type,
-            batch_size=B,
-            num_heads=h,
-            q_len=q_len,
-            kv_len=kv_len,
-            dtype=dtype,
-            device=device,
-            requires_grad=attn_bias_requires_grad,
-            fmt=fmt,
-            op=op,
-        )
-        if isinstance(
-            attn_bias,
-            (
-                fmha.attn_bias.BlockDiagonalMask,
-                fmha.attn_bias.BlockDiagonalCausalWithOffsetPaddedKeysMask,
-            ),
-        ):
-            query, key, value = [
-                x.reshape([1, -1, *x.shape[2:]]) for x in [query, key, value]
-            ]
-
-    inputs = fmha.Inputs(query=query, key=key, value=value, attn_bias=attn_bias)
-    reasons = op.not_supported_reasons(inputs)
-    if reasons:
-        err_msg = f"{op.NAME}: unsupported ({'/'.join(reasons)})"
-        # Ensure we free memory to avoid OOMs
-        del query, key, value, attn_bias, inputs
-        pytest.skip(err_msg)
-    return query, key, value, attn_bias
-
-
-def bmhk2bmk(tensor) -> torch.Tensor:
-    return (
-        tensor.permute((0, 2, 1, 3))
-        .contiguous()
-        .view([tensor.shape[0] * tensor.shape[2], tensor.shape[1], tensor.shape[3]])
-    )
-
-
-def bmk2bmhk(tensor, num_heads: int) -> torch.Tensor:
-    return tensor.reshape([-1, num_heads, tensor.shape[1], tensor.shape[2]]).permute(
-        (0, 2, 1, 3)
-    )
-
-@pytest.mark.parametrize("k_len", [32, 64])
-@pytest.mark.parametrize("batch_size", [4])
-@pytest.mark.parametrize("kv_len", [128, 512])
-@pytest.mark.parametrize("q_len", [128, 512])
-@pytest.mark.parametrize("device", _devices)
-@pytest.mark.parametrize("test_type", _types)
-def test_key_query_all_ones(test_type, device, q_len, kv_len, batch_size, k_len):
-    scale = 3
-    query = torch.ones((batch_size, q_len, k_len), device=device, dtype=test_type)
-    key = torch.ones((batch_size, kv_len, k_len), device=device, dtype=test_type)
-    value = torch.randn((batch_size, kv_len, k_len), device=device, dtype=test_type) * scale
-
-    out = xformers.ops.memory_efficient_attention(query, key, value, op=(fmha.ck.FwOp, None))
-    # this should be equivalent to the average over value
-    ref = value.mean(1, keepdim=True).expand_as(query)
-
-    if test_type is torch.float16:
-        assert_allclose(out, ref, atol=1e-5)
-    else:
-        assert_allclose(out, ref, atol=1e-2)
-
-
diff --git a/tests/test_ck_3.py b/tests/test_ck_3.py
deleted file mode 100644
index 3b4458dd8..000000000
--- a/tests/test_ck_3.py
+++ /dev/null
@@ -1,434 +0,0 @@
-# Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
-#
-# This source code is licensed under the BSD license found in the
-# LICENSE file in the root directory of this source tree.
-
-import math
-import random
-from typing import List, Optional, Sequence, Tuple, Type, TypeVar, Set, Any
-
-import pytest
-import torch
-
-## need to FIX
-##from scipy.stats import binomtest
-from torch.utils.checkpoint import checkpoint
-
-import xformers.ops
-from xformers.ops import fmha
-from xformers.ops.fmha.common import AttentionOpBase
-
-from tests.utils import assert_allclose
-
-torch.backends.cuda.matmul.allow_tf32 = False
-cuda_only = pytest.mark.skipif(not torch.cuda.is_available(), reason="requires CUDA")
-_devices = ["cpu", "cuda"] if torch.cuda.is_available() else ["cpu"]
-
-ALL_FW_OPS: Sequence[Type[fmha.common.AttentionFwOpBase]] = [
-    fmha.ck.FwOp,
-]
-
-T = TypeVar(
-    "T", Type[fmha.common.AttentionFwOpBase], Type[fmha.common.AttentionBwOpBase]
-)
-
-def ref_attention(q, k, v, attn_bias=None, drop_mask=None, p=0.0, scale=None):
-    if q.ndim == 4:
-        assert p == 0.0
-        return ref_attention_bmhk(q, k, v, attn_bias=attn_bias)
-    q = q.float()
-    k = k.float()
-    v = v.float()
-
-    scale = scale if scale is not None else (1 / q.shape[-1] ** 0.5)
-    q = q * scale
-
-    attn = q @ k.transpose(-2, -1)
-    if attn_bias is not None:
-        if isinstance(attn_bias, xformers.ops.AttentionBias):
-            # Always create in B,H,Mq,Mk format
-            attn_bias_tensor = attn_bias.materialize(
-                (q.shape[0], 1, q.shape[1], k.shape[1]),
-                device=q.device,
-                dtype=torch.float32,
-            )
-        else:
-            attn_bias_tensor = attn_bias
-        if attn_bias_tensor.ndim == 4:
-            assert q.shape[0] == attn_bias_tensor.shape[0] * attn_bias_tensor.shape[1]
-            attn_bias_tensor = attn_bias_tensor.reshape(
-                [-1, *attn_bias_tensor.shape[2:]]
-            )
-        attn = attn + attn_bias_tensor.float()
-    attn = attn.softmax(-1)
-    if drop_mask is not None:
-        attn = attn * (drop_mask / (1 - p))
-    return attn @ v
-
-
-def ref_attention_bmhk(q, k, v, attn_bias, scale=None) -> torch.Tensor:
-    assert q.ndim == 4
-
-    def T(t):
-        return t.permute((0, 2, 1, 3)).reshape(
-            [t.shape[0] * t.shape[2], t.shape[1], t.shape[3]]
-        )
-
-    if isinstance(attn_bias, xformers.ops.AttentionBias):
-        attn_bias = attn_bias.materialize(
-            (q.shape[0], q.shape[2], q.shape[1], k.shape[1]),
-            device=q.device,
-            dtype=torch.float32,
-        ).reshape([q.shape[0] * q.shape[2], q.shape[1], k.shape[1]])
-    out = ref_attention(T(q), T(k), T(v), attn_bias, scale=scale)
-    out = out.reshape([q.shape[0], q.shape[2], q.shape[1], v.shape[3]])
-    return out.permute((0, 2, 1, 3))
-
-
-def _rand_seqlens(
-    r: random.Random,
-    bs: int,
-    q_len: int,
-    kv_len: int,
-    more_keys_than_queries_per_block: bool,
-) -> Tuple[Sequence[int], Sequence[int]]:
-    """
-    Generates lists of lengths of query blocks and corresponding key blocks.
-    The total number of queries will be bs * q_len and the
-    total number of keys will be bs * kv_len.
-    """
-    if more_keys_than_queries_per_block:
-        assert kv_len >= q_len
-    q_len *= bs
-    kv_len *= bs
-    seqlens_q: List[int] = []
-    seqlens_k: List[int] = []
-
-    step_q = [max(1, q_len // 10), max(2, q_len // 2)]
-    step_k = [max(1, kv_len // 10), max(2, kv_len // 2)]
-    while sum(seqlens_q) < q_len and sum(seqlens_k) < kv_len:
-        num_queries = r.randrange(*step_q)
-        seqlens_q.append(num_queries)
-
-        if more_keys_than_queries_per_block:
-            # Must select at least `num_queries` keys
-            # But also leave enough keys for later
-            keys_left = kv_len - sum(seqlens_k, 0)
-            queries_left = q_len - sum(seqlens_q[:-1], 0)
-            assert keys_left >= queries_left
-            seqlens_k.append(num_queries + r.randrange(0, keys_left - queries_left))
-        else:
-            seqlens_k.append(r.randrange(*step_k))
-    seqlens_q[-1] = q_len - sum(seqlens_q[:-1])
-    seqlens_k[-1] = kv_len - sum(seqlens_k[:-1])
-    return seqlens_q, seqlens_k
-
-
-def _rand_maxed_partition(
-    r: random.Random, total: int, n: int, mx: int, positive: bool = True
-) -> List[int]:
-    # returns list of n nonnegative integers less than mx summing to total
-    # NB: This is unfortunately biased towards evenly-split bins.
-    # If `positive`, outputs are positive
-    if positive:
-        total -= n
-        mx -= 1
-    idxs = r.sample(range(n * mx), total)
-    y = torch.zeros(n, mx, dtype=torch.int32)
-    y.flatten()[idxs] = 1
-    z = y.sum(1)
-    if positive:
-        z += 1
-    return z.tolist()
-
-
-def _rand_seqlens_padded_k(
-    r: random.Random, bs: int, q_len: int, kv_len: int
-) -> Tuple[Sequence[int], Sequence[int]]:
-    # This is for BlockDiagonalCausalWithOffsetPaddedKeysMask.
-    # we need q_seqlens and k_seqlens to be of len bsz.
-    # For each "batch element" there must be more keys than queries
-    # because this bias type is "bottom right" and so any extra queries
-    # will attend to nothing and have undefined result.
-    # In addition every element of k_seqlens must be <= kv_len
-    if q_len > kv_len:
-        raise ValueError("need more keys than values")
-    if q_len == kv_len:
-        # all key slots are needed so we cannot have padding
-        q_seqlens = k_seqlens = [kv_len] * bs
-    else:
-        q_seqlens = _rand_maxed_partition(r, q_len * bs, bs, kv_len)
-        k_seqlens = [r.randint(i, kv_len) for i in q_seqlens]
-    return q_seqlens, k_seqlens
-
-
-def _create_aligned_bias(B: int, H: int, Mq: int, Mkv: int, **kwargs) -> torch.Tensor:
-    align_to = 8
-    return (
-        torch.randn(
-            (
-                B,
-                H,
-                Mq,
-                align_to * ((Mkv + align_to - 1) // align_to),
-            ),
-            **kwargs,
-        )
-        * 3
-    )[:, :, :, :Mkv]
-
-
-def create_attn_bias(
-    bias_type,
-    batch_size: int,
-    num_heads: int,
-    q_len: int,
-    kv_len: int,
-    device,
-    dtype,
-    requires_grad: bool,
-    fmt: str,
-    op: Type[AttentionOpBase],
-):
-    if bias_type is None or isinstance(None, bias_type):
-        return None
-    r = random.Random("-".join(map(str, [batch_size, q_len, kv_len, dtype, fmt])))
-    if bias_type is torch.Tensor:
-        if fmt == "BMK":
-            batch_size *= num_heads
-            num_heads = 1
-        ##`small_k` only supports an expanded 1d bias
-        if op in [fmha.small_k.FwOp, fmha.small_k.BwOp]:
-            attn_bias = (
-                torch.randn(
-                    (batch_size, num_heads, 1, kv_len), device=device, dtype=dtype
-                )
-                * 3
-            )
-            attn_bias = attn_bias.expand(batch_size, num_heads, q_len, kv_len)
-        else:
-            attn_bias = _create_aligned_bias(
-                batch_size,
-                num_heads,
-                q_len,
-                kv_len,
-                device=device,
-                dtype=dtype,
-            )
-
-            # ToDo: need a fix in ck-flashAttn to avoid divided-by-zero when all-(-inf) occurred
-            #       with the data read by one-thread
-            # make sure it also works if the first columns are partially masked out
-            # attn_bias[0, 0, q_len - 1 :, : num_heads - 2] = -math.inf
-
-        if requires_grad:
-            attn_bias.requires_grad_(True)
-        return attn_bias
-    if bias_type is fmha.attn_bias.LowerTriangularMask:
-        return fmha.attn_bias.LowerTriangularMask()
-    if bias_type is fmha.attn_bias.LowerTriangularMaskWithTensorBias:
-        attn_bias = _create_aligned_bias(
-            batch_size,
-            num_heads,
-            q_len,
-            kv_len,
-            device=device,
-            dtype=dtype,
-        )
-        if requires_grad:
-            attn_bias.requires_grad_(True)
-        return fmha.attn_bias.LowerTriangularMaskWithTensorBias(attn_bias)
-    if bias_type in [
-        fmha.attn_bias.BlockDiagonalMask,
-        fmha.attn_bias.BlockDiagonalCausalMask,
-        fmha.attn_bias.BlockDiagonalCausalFromBottomRightMask,
-    ]:
-        # This bias is not supported in BMK format
-        assert fmt == "BMHK"
-        block_diag = fmha.attn_bias.BlockDiagonalMask.from_seqlens(
-            *_rand_seqlens(
-                r,
-                batch_size,
-                q_len,
-                kv_len,
-                more_keys_than_queries_per_block=bias_type
-                is fmha.attn_bias.BlockDiagonalCausalFromBottomRightMask,
-            )
-        )
-        if bias_type is fmha.attn_bias.BlockDiagonalCausalMask:
-            block_diag = block_diag.make_causal()
-        if bias_type is fmha.attn_bias.BlockDiagonalCausalFromBottomRightMask:
-            block_diag = block_diag.make_causal_from_bottomright()
-        return block_diag
-    if bias_type == fmha.attn_bias.BlockDiagonalCausalWithOffsetPaddedKeysMask:
-        assert fmt == "BMHK"
-        q, k = _rand_seqlens_padded_k(r, batch_size, q_len, kv_len)
-        g_block_diag = (
-            fmha.attn_bias.BlockDiagonalCausalWithOffsetPaddedKeysMask.from_seqlens(
-                q_seqlen=q,
-                kv_padding=kv_len,
-                kv_seqlen=k,
-            )
-        )
-        return g_block_diag
-
-    assert False, f"Unsupported bias type: {bias_type}"
-
-def create_tensors(
-    op: Type[AttentionOpBase],
-    device,
-    dtype,
-    attn_bias_type,
-    B,
-    q_len,
-    kv_len,
-    h,
-    k,
-    kv,
-    *,
-    attn_bias_requires_grad: bool = False,
-    fmt: str = "BMK",
-):
-    torch.manual_seed(B * q_len + kv_len * k + kv)
-    scale = 3
-    if fmt == "BMK":
-        query = torch.randn((B * h, q_len, k), device=device, dtype=dtype).mul_(scale)
-        key = torch.randn((B * h, kv_len, k), device=device, dtype=dtype).mul_(scale)
-        value = torch.randn((B * h, kv_len, kv), device=device, dtype=dtype).mul_(scale)
-    else:
-        assert fmt == "BMHK"
-        query = torch.randn((B, q_len, h, k), device=device, dtype=dtype).mul_(scale)
-        key = torch.randn((B, kv_len, h, k), device=device, dtype=dtype).mul_(scale)
-        value = torch.randn((B, kv_len, h, kv), device=device, dtype=dtype).mul_(scale)
-
-    if fmt == "BMK" and not fmha.common._is_bias_type_supported_in_BMK(attn_bias_type):
-        attn_bias_type = None
-    attn_bias = None
-    if attn_bias_type is not None:
-        attn_bias = create_attn_bias(
-            attn_bias_type,
-            batch_size=B,
-            num_heads=h,
-            q_len=q_len,
-            kv_len=kv_len,
-            dtype=dtype,
-            device=device,
-            requires_grad=attn_bias_requires_grad,
-            fmt=fmt,
-            op=op,
-        )
-        if isinstance(
-            attn_bias,
-            (
-                fmha.attn_bias.BlockDiagonalMask,
-                fmha.attn_bias.BlockDiagonalCausalWithOffsetPaddedKeysMask,
-            ),
-        ):
-            query, key, value = [
-                x.reshape([1, -1, *x.shape[2:]]) for x in [query, key, value]
-            ]
-
-    inputs = fmha.Inputs(query=query, key=key, value=value, attn_bias=attn_bias)
-    reasons = op.not_supported_reasons(inputs)
-    if reasons:
-        err_msg = f"{op.NAME}: unsupported ({'/'.join(reasons)})"
-        # Ensure we free memory to avoid OOMs
-        del query, key, value, attn_bias, inputs
-        pytest.skip(err_msg)
-    return query, key, value, attn_bias
-
-## The same set of supported attn_bias types as defined by ck.FwOp
-SUPPORTED_ATTN_BIAS_TYPES: Set[Any] = {
-        type(None),
-        torch.Tensor,
-        fmha.attn_bias.LowerTriangularMask,
-        fmha.attn_bias.LowerTriangularMaskWithTensorBias,
-        fmha.attn_bias.BlockDiagonalMask,
-        fmha.attn_bias.BlockDiagonalCausalMask,
-        fmha.attn_bias.BlockDiagonalCausalWithOffsetPaddedKeysMask,
-        fmha.attn_bias.BlockDiagonalCausalFromBottomRightMask,
-        }
-
-@pytest.mark.parametrize("bias_type", SUPPORTED_ATTN_BIAS_TYPES)
-@pytest.mark.parametrize("packed", [False, True])
-@pytest.mark.parametrize("fmt", ["BMK", "BMHK"])
-@pytest.mark.parametrize("dtype", [torch.half, torch.bfloat16])
-def test_forward(dtype, fmt, packed, bias_type):
-    op = fmha.ck.FwOp
-    device = torch.device("cuda")
-    batch_size = 7
-    q_len = 200
-
-    ## BottomRightMask requires generate {m0,m1,...}, {n0,n1,...} where mi <= ni
-    if bias_type is fmha.attn_bias.BlockDiagonalCausalFromBottomRightMask:
-        kv_len = int(q_len * 1.2) 
-    else:
-        kv_len = q_len
-    h = 3 
-    k = 64 
-    kv = 64
-
-    if kv > 128:
-        pytest.skip("kv > 128 is not supported by CK-FlashAttention-1")
-
-    if packed and not (k == kv and q_len == kv_len):
-        pytest.skip(
-            f"packed incompatible with `k ({k}) != kv ({kv})` or `q_len ({q_len}) != kv_len ({kv_len})`"
-        )
-    if fmt == "BMK" and not fmha.common._is_bias_type_supported_in_BMK(bias_type):
-        pytest.skip("BMK incompatible with this bias")
-
-    query, key, value, attn_bias = create_tensors(
-        op, device, dtype, bias_type, batch_size, q_len, kv_len, h, k, kv, fmt="BMHK" if packed else fmt
-    )
-
-    print("query shape: ", query.shape)
-    print("key shape: ", key.shape)
-    print("value shape: ", value.shape)
-
-    ## when packed, the query, key, value is in BMHK format
-    if packed:
-        c = torch.stack([query, key, value], 2)
-        if fmt == "BMK":
-            # bm3hk -> 3bhmk -> 3Bmk
-            c = c.permute(2, 0, 3, 1, 4).reshape([3, batch_size*h, q_len, k])
-            query, key, value = c[0], c[1], c[2]
-            # Re-create bias in the right format
-            attn_bias = create_attn_bias(
-                bias_type=bias_type,
-                batch_size=batch_size,
-                num_heads=h,
-                q_len=q_len,
-                kv_len=kv_len,
-                device=device,
-                dtype=dtype,
-                requires_grad=False,
-                fmt=fmt,
-                op=op,
-            )
-        else:
-            # bm3hk -> 3 x bmhk
-            query, key, value = xformers.ops.unbind(c, 2)
-        ##assert not query.is_contiguous()
-
-    out = xformers.ops.memory_efficient_attention_forward(
-        query, key, value, attn_bias, op=op
-    )
-    assert not out.isnan().any(), ("Output has NaNs", attn_bias)
-    out2 = xformers.ops.memory_efficient_attention_forward(
-        query, key, value, attn_bias, op=op
-    )
-    assert torch.allclose(out, out2, atol=0.0, rtol=0.0), (
-        "Non-deterministic behavior",
-        attn_bias,
-    )
-
-    ref = ref_attention(query, key, value, attn_bias)
-    assert out.shape == ref.shape, out.shape
-    assert_allclose(
-        out.float(),
-        ref,
-        atol=op.ERROR_ATOL[dtype],
-        rtol=op.ERROR_RTOL.get(dtype, 1e-5),
-    )
-
diff --git a/tests/test_ck_4.py b/tests/test_ck_4.py
deleted file mode 100644
index 7358b36c6..000000000
--- a/tests/test_ck_4.py
+++ /dev/null
@@ -1,583 +0,0 @@
-# Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
-#
-# This source code is licensed under the BSD license found in the
-# LICENSE file in the root directory of this source tree.
-
-import math
-import random
-from typing import List, Optional, Sequence, Tuple, Type, TypeVar, Any, Set
-
-import pytest
-import torch
-
-## need to FIX
-##from scipy.stats import binomtest
-from torch.utils.checkpoint import checkpoint
-
-import xformers.ops
-from xformers.ops import fmha
-from xformers.ops.fmha.common import AttentionOpBase
-
-from .utils import assert_allclose
-
-from xformers.ops.fmha.attn_bias import (
-    AttentionBias,
-    BlockDiagonalCausalMask,
-    BlockDiagonalCausalWithOffsetPaddedKeysMask,
-    BlockDiagonalMask,
-    BlockDiagonalCausalFromBottomRightMask,
-    LowerTriangularMask,
-    LowerTriangularMaskWithTensorBias,
-)
-
-torch.backends.cuda.matmul.allow_tf32 = False
-cuda_only = pytest.mark.skipif(not torch.cuda.is_available(), reason="requires CUDA")
-_devices = ["cuda"] if torch.cuda.is_available() else ["cpu"]
-
-ALL_FW_OPS: Sequence[Type[fmha.common.AttentionFwOpBase]] = [
-    fmha.ck.FwOp,
-]
-
-T = TypeVar(
-    "T", Type[fmha.common.AttentionFwOpBase], Type[fmha.common.AttentionBwOpBase]
-)
-
-def sample_random_supported_fw(
-    inp: fmha.Inputs, seed: int
-) -> Type[fmha.common.AttentionFwOpBase]:
-    r = random.Random(seed)
-    fw_ops = list(ALL_FW_OPS)
-    r.shuffle(fw_ops)
-    for op in fw_ops:
-        if op.supports(inp):
-            return op
-    raise NotImplementedError(f"Could not find a FW operator for: {inp}")
-
-
-def generate_test_shapes_B_Mq_Mkv_H_K_Kv(op):
-    shapes = []
-    # Add some random shapes
-    if op in [
-        fmha.ck.FwOp,
-        fmha.ck.BwOp,
-    ]:
-        K_CHOICES = [8 * i for i in range(1, 256 // 8)]
-        r = random.Random(0)
-        for _ in range(20):
-            B = r.randint(4, 400)
-            Mq = r.randint(4, 500)
-            Mkv = r.randint(4, 500)
-            H = r.randint(2, 11)
-            B = max(B // H, 4)
-            K = r.choice(K_CHOICES)
-            Kv = r.choice(K_CHOICES)
-            if not op.SUPPORTS_DIFFERENT_VALUE_EMBED:
-                Kv = K
-            shapes.append((B, Mq, Mkv, H, K, Kv))
-    return shapes
-
-
-SUPPORTED_ATTN_BIAS_TYPES: Set[Any] = {
-        type(None),
-        torch.Tensor,
-        LowerTriangularMask,
-        LowerTriangularMaskWithTensorBias,
-        BlockDiagonalMask,
-        BlockDiagonalCausalMask,
-        BlockDiagonalCausalWithOffsetPaddedKeysMask,
-        BlockDiagonalCausalFromBottomRightMask,
-    }
-
-SUPPORTED_DTYPES: Set[torch.dtype] = {torch.half, torch.bfloat16}
-
-def _generate_op_device_dtype_biasT_B_Mq_Mkv_H_K_Kv(
-    ops_list: Sequence[Type[fmha.AttentionOpBase]], max_shapes_per_op: int = 65000
-):
-    r = random.Random(0)
-    combination = []
-    ids = []
-    for op in ops_list:
-        op_count = 0
-        # Sort list of masks, so it's deterministic across runs
-        LIST_MASKS = list(
-            sorted(list(SUPPORTED_ATTN_BIAS_TYPES), key=lambda x: str(x))
-        )
-        for shape in generate_test_shapes_B_Mq_Mkv_H_K_Kv(op):
-            has_one = False
-            for device in _devices:
-                if device not in op.SUPPORTED_DEVICES:
-                    continue
-                ##for dtype in op.SUPPORTED_DTYPES:
-                for dtype in SUPPORTED_DTYPES:
-                    bias_type = r.choice(LIST_MASKS)
-                    # Avoid using too much memory
-                    if bias_type not in [
-                        type(None),
-                        fmha.attn_bias.LowerTriangularMask,
-                    ]:
-                        B, Mq, Mkv, H, K, Kv = shape
-                        B = min(B, 12)
-
-                        if (
-                            bias_type
-                            is fmha.attn_bias.BlockDiagonalCausalFromBottomRightMask
-                        ):
-                            Mq, Mkv = min(Mkv, Mq), max(Mkv, Mq) + 2
-                        elif (
-                            bias_type
-                            is fmha.attn_bias.BlockDiagonalCausalWithOffsetPaddedKeysMask
-                        ):
-                            Mq, Mkv = min(Mkv, Mq), max(Mkv, Mq)
-                        shape = (B, Mq, Mkv, H, K, Kv)
-                    combination.append((op, device, dtype, bias_type, *shape))
-                    ids.append(
-                        f"{op.NAME}-{device}-{str(dtype)}-{bias_type.__name__}"
-                        f"-{'-'.join([str(s) for s in shape])}"
-                    )
-                    has_one = True
-            if has_one:
-                op_count += 1
-            if op_count > max_shapes_per_op:
-                break
-        # Some specific shapes for which we want to run without any mask
-        bias_type = type(None)
-        for shape in (
-            # Some strides/dims don't fit on an uint16
-            (4, 128, 128, 4, 128, 128),
-            (13, 4, 67, 16, 8, 8),
-            (4, 320, 4, 1, 8, 8),
-            (4, 4, 320, 1, 8, 8),
-            # TODO: Some strides don't fit on an uint32
-            # Crashes on Flash, Errors on Cutlass
-            # (1, 1, 64000, 300, 128, 128)
-        ):
-            for device in _devices:
-                if device not in op.SUPPORTED_DEVICES:
-                    continue
-                for dtype in SUPPORTED_DTYPES:
-                    combination.append((op, device, dtype, bias_type, *shape))
-                    ids.append(
-                        f"{op.NAME}-{device}-{str(dtype)}-{bias_type.__name__}"
-                        f"-{'-'.join([str(s) for s in shape])}"
-                    )
-    return {
-        "argvalues": combination,
-        "ids": ids,
-    }
-
-
-parametrize_opFW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv = pytest.mark.parametrize(
-    "opFW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv",
-    **_generate_op_device_dtype_biasT_B_Mq_Mkv_H_K_Kv(ALL_FW_OPS),
-)
-parametrize_opFW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv__xs = pytest.mark.parametrize(
-    "opFW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv",
-    **_generate_op_device_dtype_biasT_B_Mq_Mkv_H_K_Kv(ALL_FW_OPS, max_shapes_per_op=1),
-)
-
-def ref_attention(q, k, v, attn_bias=None, drop_mask=None, p=0.0, scale=None):
-    if q.ndim == 4:
-        assert p == 0.0
-        return ref_attention_bmhk(q, k, v, attn_bias=attn_bias)
-    q = q.float()
-    k = k.float()
-    v = v.float()
-
-    scale = scale if scale is not None else (1 / q.shape[-1] ** 0.5)
-    q = q * scale
-
-    attn = q @ k.transpose(-2, -1)
-    if attn_bias is not None:
-        if isinstance(attn_bias, xformers.ops.AttentionBias):
-            # Always create in B,H,Mq,Mk format
-            attn_bias_tensor = attn_bias.materialize(
-                (q.shape[0], 1, q.shape[1], k.shape[1]),
-                device=q.device,
-                dtype=torch.float32,
-            )
-        else:
-            attn_bias_tensor = attn_bias
-        if attn_bias_tensor.ndim == 4:
-            assert q.shape[0] == attn_bias_tensor.shape[0] * attn_bias_tensor.shape[1]
-            attn_bias_tensor = attn_bias_tensor.reshape(
-                [-1, *attn_bias_tensor.shape[2:]]
-            )
-        attn = attn + attn_bias_tensor.float()
-    attn = attn.softmax(-1)
-    if drop_mask is not None:
-        attn = attn * (drop_mask / (1 - p))
-    return attn @ v
-
-
-def ref_attention_bmhk(q, k, v, attn_bias, scale=None) -> torch.Tensor:
-    assert q.ndim == 4
-
-    def T(t):
-        return t.permute((0, 2, 1, 3)).reshape(
-            [t.shape[0] * t.shape[2], t.shape[1], t.shape[3]]
-        )
-
-    if isinstance(attn_bias, xformers.ops.AttentionBias):
-        attn_bias = attn_bias.materialize(
-            (q.shape[0], q.shape[2], q.shape[1], k.shape[1]),
-            device=q.device,
-            dtype=torch.float32,
-        ).reshape([q.shape[0] * q.shape[2], q.shape[1], k.shape[1]])
-    out = ref_attention(T(q), T(k), T(v), attn_bias, scale=scale)
-    out = out.reshape([q.shape[0], q.shape[2], q.shape[1], v.shape[3]])
-    return out.permute((0, 2, 1, 3))
-
-
-def _rand_seqlens(
-    r: random.Random,
-    bs: int,
-    q_len: int,
-    kv_len: int,
-    more_keys_than_queries_per_block: bool,
-) -> Tuple[Sequence[int], Sequence[int]]:
-    """
-    Generates lists of lengths of query blocks and corresponding key blocks.
-    The total number of queries will be bs * q_len and the
-    total number of keys will be bs * kv_len.
-    """
-    if more_keys_than_queries_per_block:
-        assert kv_len >= q_len
-    q_len *= bs
-    kv_len *= bs
-    seqlens_q: List[int] = []
-    seqlens_k: List[int] = []
-
-    step_q = [max(1, q_len // 10), max(2, q_len // 2)]
-    step_k = [max(1, kv_len // 10), max(2, kv_len // 2)]
-    while sum(seqlens_q) < q_len and sum(seqlens_k) < kv_len:
-        num_queries = r.randrange(*step_q)
-        seqlens_q.append(num_queries)
-
-        if more_keys_than_queries_per_block:
-            # Must select at least `num_queries` keys
-            # But also leave enough keys for later
-            keys_left = kv_len - sum(seqlens_k, 0)
-            queries_left = q_len - sum(seqlens_q[:-1], 0)
-            assert keys_left >= queries_left
-            seqlens_k.append(num_queries + r.randrange(0, keys_left - queries_left))
-        else:
-            seqlens_k.append(r.randrange(*step_k))
-    seqlens_q[-1] = q_len - sum(seqlens_q[:-1])
-    seqlens_k[-1] = kv_len - sum(seqlens_k[:-1])
-    return seqlens_q, seqlens_k
-
-
-def _rand_partition(r: random.Random, total: int, n: int) -> List[int]:
-    # returns list of n nonnegative integers summing to total
-    idx = {0, total}
-    while len(idx) < n + 1:
-        idx.add(r.randint(1, total - 1))
-    s = sorted(idx)
-    return [e - b for b, e in zip(s[:-1], s[1:])]
-
-
-def _rand_maxed_partition(
-    r: random.Random, total: int, n: int, mx: int, positive: bool = True
-) -> List[int]:
-    # returns list of n nonnegative integers less than mx summing to total
-    # NB: This is unfortunately biased towards evenly-split bins.
-    # If `positive`, outputs are positive
-    if positive:
-        total -= n
-        mx -= 1
-    idxs = r.sample(range(n * mx), total)
-    y = torch.zeros(n, mx, dtype=torch.int32)
-    y.flatten()[idxs] = 1
-    z = y.sum(1)
-    if positive:
-        z += 1
-    return z.tolist()
-
-
-def _rand_seqlens_padded_k(
-    r: random.Random, bs: int, q_len: int, kv_len: int
-) -> Tuple[Sequence[int], Sequence[int]]:
-    # This is for BlockDiagonalCausalWithOffsetPaddedKeysMask.
-    # we need q_seqlens and k_seqlens to be of len bsz.
-    # For each "batch element" there must be more keys than queries
-    # because this bias type is "bottom right" and so any extra queries
-    # will attend to nothing and have undefined result.
-    # In addition every element of k_seqlens must be <= kv_len
-    if q_len > kv_len:
-        raise ValueError("need more keys than values")
-    if q_len == kv_len:
-        # all key slots are needed so we cannot have padding
-        q_seqlens = k_seqlens = [kv_len] * bs
-    else:
-        q_seqlens = _rand_maxed_partition(r, q_len * bs, bs, kv_len)
-        k_seqlens = [r.randint(i, kv_len) for i in q_seqlens]
-    return q_seqlens, k_seqlens
-
-
-def _create_aligned_bias(B: int, H: int, Mq: int, Mkv: int, **kwargs) -> torch.Tensor:
-    align_to = 8
-    return (
-        torch.randn(
-            (
-                B,
-                H,
-                Mq,
-                align_to * ((Mkv + align_to - 1) // align_to),
-            ),
-            **kwargs,
-        )
-        * 3
-    )[:, :, :, :Mkv]
-
-
-def create_attn_bias(
-    bias_type,
-    batch_size: int,
-    num_heads: int,
-    q_len: int,
-    kv_len: int,
-    device,
-    dtype,
-    requires_grad: bool,
-    fmt: str,
-    op: Type[AttentionOpBase],
-):
-    if bias_type is None or isinstance(None, bias_type):
-        return None
-    r = random.Random("-".join(map(str, [batch_size, q_len, kv_len, dtype, fmt])))
-    if bias_type is torch.Tensor:
-        if fmt == "BMK":
-            batch_size *= num_heads
-            num_heads = 1
-        # `small_k` only supports an expanded 1d bias
-        if op in [fmha.small_k.FwOp, fmha.small_k.BwOp]:
-            attn_bias = (
-                torch.randn(
-                    (batch_size, num_heads, 1, kv_len), device=device, dtype=dtype
-                )
-                * 3
-            )
-            attn_bias = attn_bias.expand(batch_size, num_heads, q_len, kv_len)
-        else:
-            attn_bias = _create_aligned_bias(
-                batch_size,
-                num_heads,
-                q_len,
-                kv_len,
-                device=device,
-                dtype=dtype,
-            )
-
-            # make sure it also works if the first columns are partially masked out
-            #attn_bias[0, 0, q_len - 1 :, : num_heads - 2] = -math.inf
-
-        if requires_grad:
-            attn_bias.requires_grad_(True)
-        return attn_bias
-    if bias_type is fmha.attn_bias.LowerTriangularMask:
-        return fmha.attn_bias.LowerTriangularMask()
-    if bias_type is fmha.attn_bias.LowerTriangularMaskWithTensorBias:
-        attn_bias = _create_aligned_bias(
-            batch_size,
-            num_heads,
-            q_len,
-            kv_len,
-            device=device,
-            dtype=dtype,
-        )
-        if requires_grad:
-            attn_bias.requires_grad_(True)
-        return fmha.attn_bias.LowerTriangularMaskWithTensorBias(attn_bias)
-    if bias_type in [
-        fmha.attn_bias.BlockDiagonalMask,
-        fmha.attn_bias.BlockDiagonalCausalMask,
-        fmha.attn_bias.BlockDiagonalCausalFromBottomRightMask,
-    ]:
-        # This bias is not supported in BMK format
-        assert fmt == "BMHK"
-        block_diag = fmha.attn_bias.BlockDiagonalMask.from_seqlens(
-            *_rand_seqlens(
-                r,
-                batch_size,
-                q_len,
-                kv_len,
-                more_keys_than_queries_per_block=bias_type
-                is fmha.attn_bias.BlockDiagonalCausalFromBottomRightMask,
-            )
-        )
-        if bias_type is fmha.attn_bias.BlockDiagonalCausalMask:
-            block_diag = block_diag.make_causal()
-        if bias_type is fmha.attn_bias.BlockDiagonalCausalFromBottomRightMask:
-            block_diag = block_diag.make_causal_from_bottomright()
-        return block_diag
-    if bias_type == fmha.attn_bias.BlockDiagonalCausalWithOffsetPaddedKeysMask:
-        assert fmt == "BMHK"
-        q, k = _rand_seqlens_padded_k(r, batch_size, q_len, kv_len)
-        g_block_diag = (
-            fmha.attn_bias.BlockDiagonalCausalWithOffsetPaddedKeysMask.from_seqlens(
-                q_seqlen=q,
-                kv_padding=kv_len,
-                kv_seqlen=k,
-            )
-        )
-        return g_block_diag
-
-    assert False, f"Unsupported bias type: {bias_type}"
-
-def create_tensors(
-    op: Type[AttentionOpBase],
-    device,
-    dtype,
-    attn_bias_type,
-    B,
-    q_len,
-    kv_len,
-    h,
-    k,
-    kv,
-    *,
-    attn_bias_requires_grad: bool = False,
-    fmt: str = "BMK",
-):
-    torch.manual_seed(B * q_len + kv_len * k + kv)
-    scale = 3
-    if fmt == "BMK":
-        query = torch.randn((B * h, q_len, k), device=device, dtype=dtype).mul_(scale)
-        key = torch.randn((B * h, kv_len, k), device=device, dtype=dtype).mul_(scale)
-        value = torch.randn((B * h, kv_len, kv), device=device, dtype=dtype).mul_(scale)
-    else:
-        assert fmt == "BMHK"
-        query = torch.randn((B, q_len, h, k), device=device, dtype=dtype).mul_(scale)
-        key = torch.randn((B, kv_len, h, k), device=device, dtype=dtype).mul_(scale)
-        value = torch.randn((B, kv_len, h, kv), device=device, dtype=dtype).mul_(scale)
-
-    if fmt == "BMK" and not fmha.common._is_bias_type_supported_in_BMK(attn_bias_type):
-        attn_bias_type = None
-    attn_bias = None
-    if attn_bias_type is not None:
-        attn_bias = create_attn_bias(
-            attn_bias_type,
-            batch_size=B,
-            num_heads=h,
-            q_len=q_len,
-            kv_len=kv_len,
-            dtype=dtype,
-            device=device,
-            requires_grad=attn_bias_requires_grad,
-            fmt=fmt,
-            op=op,
-        )
-        if isinstance(
-            attn_bias,
-            (
-                fmha.attn_bias.BlockDiagonalMask,
-                fmha.attn_bias.BlockDiagonalCausalWithOffsetPaddedKeysMask,
-            ),
-        ):
-            query, key, value = [
-                x.reshape([1, -1, *x.shape[2:]]) for x in [query, key, value]
-            ]
-
-    inputs = fmha.Inputs(query=query, key=key, value=value, attn_bias=attn_bias)
-    reasons = op.not_supported_reasons(inputs)
-    if reasons:
-        err_msg = f"{op.NAME}: unsupported ({'/'.join(reasons)})"
-        # Ensure we free memory to avoid OOMs
-        del query, key, value, attn_bias, inputs
-        pytest.skip(err_msg)
-    return query, key, value, attn_bias
-
-
-def bmhk2bmk(tensor) -> torch.Tensor:
-    return (
-        tensor.permute((0, 2, 1, 3))
-        .contiguous()
-        .view([tensor.shape[0] * tensor.shape[2], tensor.shape[1], tensor.shape[3]])
-    )
-
-
-def bmk2bmhk(tensor, num_heads: int) -> torch.Tensor:
-    return tensor.reshape([-1, num_heads, tensor.shape[1], tensor.shape[2]]).permute(
-        (0, 2, 1, 3)
-    )
-
-@pytest.mark.parametrize("fmt", ["BMK", "BMHK"])
-@pytest.mark.parametrize("packed", [False, True])
-@parametrize_opFW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv
-def test_forward(
-    opFW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv,
-    packed,
-    fmt,
-):
-    (
-        op,
-        device,
-        dtype,
-        bias_type,
-        batch_size,
-        q_len,
-        kv_len,
-        h,
-        k,
-        kv,
-    ) = opFW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv
-
-    if kv > 128:
-        pytest.skip("kv > 128 is not supported by CK-FlashAttention-1")
-
-    if packed and not (k == kv and q_len == kv_len):
-        pytest.skip(
-            f"packed incompatible with `k ({k}) != kv ({kv})` or `q_len ({q_len}) != kv_len ({kv_len})`"
-        )
-    if fmt == "BMK" and not fmha.common._is_bias_type_supported_in_BMK(bias_type):
-        pytest.skip("BMK incompatible with this bias")
-
-    query, key, value, attn_bias = create_tensors(
-        *opFW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv, fmt="BMHK" if packed else fmt
-    )
-
-    if packed:
-        c = torch.stack([query, key, value], 2)
-        if fmt == "BMK":
-            # bm3hk -> 3bhmk -> 3Bmk
-            c = c.permute(2, 0, 3, 1, 4).reshape([3, -1, q_len, k])
-            query, key, value = c[0], c[1], c[2]
-            # Re-create bias in the right format
-            attn_bias = create_attn_bias(
-                bias_type=bias_type,
-                batch_size=batch_size,
-                num_heads=h,
-                q_len=q_len,
-                kv_len=kv_len,
-                device=device,
-                dtype=dtype,
-                requires_grad=False,
-                fmt=fmt,
-                op=op,
-            )
-        else:
-            # bm3hk -> 3 x bmhk
-            query, key, value = xformers.ops.unbind(c, 2)
-        ##assert not query.is_contiguous()
-
-    out = xformers.ops.memory_efficient_attention_forward(
-        query, key, value, attn_bias, op=op
-    )
-    assert not out.isnan().any(), ("Output has NaNs", attn_bias)
-    out2 = xformers.ops.memory_efficient_attention_forward(
-        query, key, value, attn_bias, op=op
-    )
-    assert torch.allclose(out, out2, atol=0.0, rtol=0.0), (
-        "Non-deterministic behavior",
-        attn_bias,
-    )
-
-    ref = ref_attention(query, key, value, attn_bias)
-    assert out.shape == ref.shape, out.shape
-    assert_allclose(
-        out.float(),
-        ref,
-        atol=op.ERROR_ATOL[dtype],
-        rtol=op.ERROR_RTOL[dtype],
-    )
-
diff --git a/tests/test_mem_eff_attention_ck.py b/tests/test_mem_eff_attention_ck.py
new file mode 100644
index 000000000..bd083cdb8
--- /dev/null
+++ b/tests/test_mem_eff_attention_ck.py
@@ -0,0 +1,1783 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
+#
+# This source code is licensed under the BSD license found in the
+# LICENSE file in the root directory of this source tree.
+
+import math
+import random
+from typing import List, Optional, Sequence, Tuple, Type, TypeVar
+
+import pytest
+import torch
+##from scipy.stats import binomtest
+from torch.utils.checkpoint import checkpoint
+
+import xformers.ops
+from xformers.ops import fmha
+from xformers.ops.fmha.common import AttentionOpBase
+
+from .utils import assert_allclose
+
+torch.backends.cuda.matmul.allow_tf32 = False
+cuda_only = pytest.mark.skipif(not torch.cuda.is_available(), reason="requires CUDA")
+compute_capability = (0, 0)
+if torch.cuda.is_available():
+    compute_capability = torch.cuda.get_device_capability("cuda")
+sm75_or_better_only = pytest.mark.skipif(
+    compute_capability < (7, 5), reason="requires sm75+"
+)
+_devices = ["cpu", "cuda"] if torch.cuda.is_available() else ["cpu"]
+
+ALL_FW_OPS: Sequence[Type[fmha.common.AttentionFwOpBase]] = [
+    fmha.ck.FwOp,
+]
+
+ALL_BW_OPS: Sequence[Type[fmha.common.AttentionBwOpBase]] = [
+    fmha.ck.BwOp,
+]
+
+T = TypeVar(
+    "T", Type[fmha.common.AttentionFwOpBase], Type[fmha.common.AttentionBwOpBase]
+)
+
+
+def _filter_unsupported_ops(ops: Sequence[T]) -> Sequence[T]:
+    return [
+        op
+        for op in ops
+        if (
+            "cpu" in op.SUPPORTED_DEVICES
+            or op.CUDA_MINIMUM_COMPUTE_CAPABILITY <= compute_capability
+        )
+        and op.is_available()
+    ]
+
+
+ALL_FW_OPS = _filter_unsupported_ops(ALL_FW_OPS)
+ALL_BW_OPS = _filter_unsupported_ops(ALL_BW_OPS)
+
+
+def sample_random_supported_fw(
+    inp: fmha.Inputs, seed: int
+) -> Type[fmha.common.AttentionFwOpBase]:
+    r = random.Random(seed)
+    fw_ops = list(ALL_FW_OPS)
+    r.shuffle(fw_ops)
+    for op in fw_ops:
+        if op.supports(inp):
+            return op
+    raise NotImplementedError(f"Could not find a FW operator for: {inp}")
+
+
+def generate_test_shapes_B_Mq_Mkv_H_K_Kv(op):
+    shapes = []
+    for B in op._TEST_BATCH_SIZES:
+        for Mq in [32, 256]:
+            for Mkv in [32, 64, 256]:
+                for K in op._TEST_K:
+                    shapes.append((B, Mq, Mkv, 1, K, K))
+        Mq = 256
+        Mkv = 128
+        K = 32
+        H = 1
+        # Weird values of parameters
+        for M in [2, 3, 15, 31, 32, 34, 68, 72, 90, 132, 136]:
+            shapes.append((B, M, Mkv, H, K, K))
+            shapes.append((B, Mq, M, H, K, K))
+        for _K in [1, 2, 3, 31, 34, 36, 38, 40, 64, 256 + 2, 256 + 8, 512]:
+            if _K <= op.SUPPORTED_MAX_K:
+                shapes.append((B, Mq, Mkv, H, _K, _K))
+        # Different value for K / Kv
+        if op.SUPPORTS_DIFFERENT_VALUE_EMBED:
+            for _K in [32, 36, 64, 256 + 8]:
+                shapes.append((B, Mq, Mkv, H, K, _K))
+                shapes.append((B, Mq, Mkv, H, _K, K))
+        # Exotic sizes
+        for K in op._TEST_K:
+            shapes.append((B, 16, 1024, H, K, K))
+            shapes.append((B, 1024, 16, H, K, K))
+        # Some number of heads
+        for H in [3, 5, 12]:
+            shapes.append((max(1, B // H), Mq, Mkv, H, K, K))
+    # Add some random shapes
+    if op in [
+        fmha.cutlass.FwOp,
+        fmha.cutlass.BwOp,
+        fmha.flash.BwOp,
+    ]:
+        K_CHOICES = [8 * i for i in range(1, 256 // 8)]
+        r = random.Random(0)
+        for _ in range(20):
+            B = r.randint(1, 400)
+            Mq = r.randint(1, 500)
+            Mkv = r.randint(1, 500)
+            H = r.randint(2, 11)
+            B = max(B // H, 1)
+            K = r.choice(K_CHOICES)
+            Kv = r.choice(K_CHOICES)
+            if not op.SUPPORTS_DIFFERENT_VALUE_EMBED:
+                Kv = K
+            shapes.append((B, Mq, Mkv, H, K, Kv))
+    return shapes
+
+
+def _generate_op_device_dtype_biasT_B_Mq_Mkv_H_K_Kv(
+    ops_list: Sequence[Type[fmha.AttentionOpBase]], max_shapes_per_op: int = 65000
+):
+    r = random.Random(0)
+    combination = []
+    ids = []
+    for op in ops_list:
+        op_count = 0
+        # Sort list of masks, so it's deterministic across runs
+        LIST_MASKS = list(
+            sorted(list(op.SUPPORTED_ATTN_BIAS_TYPES), key=lambda x: str(x))
+        )
+        for shape in generate_test_shapes_B_Mq_Mkv_H_K_Kv(op):
+            has_one = False
+            for device in _devices:
+                if device not in op.SUPPORTED_DEVICES:
+                    continue
+                for dtype in op.SUPPORTED_DTYPES:
+                    bias_type = r.choice(LIST_MASKS)
+                    # Avoid using too much memory
+                    if bias_type not in [
+                        type(None),
+                        fmha.attn_bias.LowerTriangularMask,
+                    ]:
+                        B, Mq, Mkv, H, K, Kv = shape
+                        B = min(B, 12)
+
+                        if (
+                            bias_type
+                            is fmha.attn_bias.BlockDiagonalCausalFromBottomRightMask
+                        ):
+                            Mq, Mkv = min(Mkv, Mq), max(Mkv, Mq) + 2
+                        elif (
+                            bias_type
+                            is fmha.attn_bias.BlockDiagonalCausalWithOffsetPaddedKeysMask
+                        ):
+                            Mq, Mkv = min(Mkv, Mq), max(Mkv, Mq)
+                        shape = (B, Mq, Mkv, H, K, Kv)
+                    combination.append((op, device, dtype, bias_type, *shape))
+                    ids.append(
+                        f"{op.NAME}-{device}-{str(dtype)}-{bias_type.__name__}"
+                        f"-{'-'.join([str(s) for s in shape])}"
+                    )
+                    has_one = True
+            if has_one:
+                op_count += 1
+            if op_count > max_shapes_per_op:
+                break
+        # Some specific shapes for which we want to run without any mask
+        bias_type = type(None)
+        for shape in (
+            # Some strides/dims don't fit on an uint16
+            (1, 128, 128, 300, 128, 128),
+            (13, 1, 67, 200, 8, 8),
+            (1, 1 + 2**16, 4, 1, 8, 8),
+            (1, 4, 1 + 2**16, 1, 8, 8),
+            # TODO: Some strides don't fit on an uint32
+            # Crashes on Flash, Errors on Cutlass
+            # (1, 1, 64000, 300, 128, 128)
+        ):
+            for device in _devices:
+                if device not in op.SUPPORTED_DEVICES:
+                    continue
+                for dtype in op.SUPPORTED_DTYPES:
+                    combination.append((op, device, dtype, bias_type, *shape))
+                    ids.append(
+                        f"{op.NAME}-{device}-{str(dtype)}-{bias_type.__name__}"
+                        f"-{'-'.join([str(s) for s in shape])}"
+                    )
+    return {
+        "argvalues": combination,
+        "ids": ids,
+    }
+
+
+parametrize_opFW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv = pytest.mark.parametrize(
+    "opFW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv",
+    **_generate_op_device_dtype_biasT_B_Mq_Mkv_H_K_Kv(ALL_FW_OPS),
+)
+parametrize_opFW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv__xs = pytest.mark.parametrize(
+    "opFW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv",
+    **_generate_op_device_dtype_biasT_B_Mq_Mkv_H_K_Kv(ALL_FW_OPS, max_shapes_per_op=1),
+)
+parametrize_opBW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv = pytest.mark.parametrize(
+    "opBW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv",
+    **_generate_op_device_dtype_biasT_B_Mq_Mkv_H_K_Kv(ALL_BW_OPS),
+)
+parametrize_opBW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv__xs = pytest.mark.parametrize(
+    "opBW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv",
+    **_generate_op_device_dtype_biasT_B_Mq_Mkv_H_K_Kv(ALL_BW_OPS, max_shapes_per_op=1),
+)
+
+
+def ref_attention(q, k, v, attn_bias=None, drop_mask=None, p=0.0, scale=None):
+    if q.ndim == 4:
+        assert p == 0.0
+        return ref_attention_bmhk(q, k, v, attn_bias=attn_bias)
+    q = q.float()
+    k = k.float()
+    v = v.float()
+
+    scale = scale if scale is not None else (1 / q.shape[-1] ** 0.5)
+    q = q * scale
+
+    attn = q @ k.transpose(-2, -1)
+    if attn_bias is not None:
+        if isinstance(attn_bias, xformers.ops.AttentionBias):
+            # Always create in B,H,Mq,Mk format
+            attn_bias_tensor = attn_bias.materialize(
+                (q.shape[0], 1, q.shape[1], k.shape[1]),
+                device=q.device,
+                dtype=torch.float32,
+            )
+        else:
+            attn_bias_tensor = attn_bias
+        if attn_bias_tensor.ndim == 4:
+            assert q.shape[0] == attn_bias_tensor.shape[0] * attn_bias_tensor.shape[1]
+            attn_bias_tensor = attn_bias_tensor.reshape(
+                [-1, *attn_bias_tensor.shape[2:]]
+            )
+        attn = attn + attn_bias_tensor.float()
+    attn = attn.softmax(-1)
+    if drop_mask is not None:
+        attn = attn * (drop_mask / (1 - p))
+    return attn @ v
+
+
+def ref_attention_bmhk(q, k, v, attn_bias, scale=None) -> torch.Tensor:
+    assert q.ndim == 4
+
+    def T(t):
+        return t.permute((0, 2, 1, 3)).reshape(
+            [t.shape[0] * t.shape[2], t.shape[1], t.shape[3]]
+        )
+
+    if isinstance(attn_bias, xformers.ops.AttentionBias):
+        attn_bias = attn_bias.materialize(
+            (q.shape[0], q.shape[2], q.shape[1], k.shape[1]),
+            device=q.device,
+            dtype=torch.float32,
+        ).reshape([q.shape[0] * q.shape[2], q.shape[1], k.shape[1]])
+    out = ref_attention(T(q), T(k), T(v), attn_bias, scale=scale)
+    out = out.reshape([q.shape[0], q.shape[2], q.shape[1], v.shape[3]])
+    return out.permute((0, 2, 1, 3))
+
+
+def _rand_seqlens(
+    r: random.Random,
+    bs: int,
+    q_len: int,
+    kv_len: int,
+    more_keys_than_queries_per_block: bool,
+) -> Tuple[Sequence[int], Sequence[int]]:
+    """
+    Generates lists of lengths of query blocks and corresponding key blocks.
+    The total number of queries will be bs * q_len and the
+    total number of keys will be bs * kv_len.
+    """
+    if more_keys_than_queries_per_block:
+        assert kv_len >= q_len
+    q_len *= bs
+    kv_len *= bs
+    seqlens_q: List[int] = []
+    seqlens_k: List[int] = []
+
+    step_q = [max(1, q_len // 10), max(2, q_len // 2)]
+    step_k = [max(1, kv_len // 10), max(2, kv_len // 2)]
+    while sum(seqlens_q) < q_len and sum(seqlens_k) < kv_len:
+        num_queries = r.randrange(*step_q)
+        seqlens_q.append(num_queries)
+
+        if more_keys_than_queries_per_block:
+            # Must select at least `num_queries` keys
+            # But also leave enough keys for later
+            keys_left = kv_len - sum(seqlens_k, 0)
+            queries_left = q_len - sum(seqlens_q[:-1], 0)
+            assert keys_left >= queries_left
+            seqlens_k.append(num_queries + r.randrange(0, keys_left - queries_left))
+        else:
+            seqlens_k.append(r.randrange(*step_k))
+    seqlens_q[-1] = q_len - sum(seqlens_q[:-1])
+    seqlens_k[-1] = kv_len - sum(seqlens_k[:-1])
+    return seqlens_q, seqlens_k
+
+
+def _rand_partition(r: random.Random, total: int, n: int) -> List[int]:
+    # returns list of n nonnegative integers summing to total
+    idx = {0, total}
+    while len(idx) < n + 1:
+        idx.add(r.randint(1, total - 1))
+    s = sorted(idx)
+    return [e - b for b, e in zip(s[:-1], s[1:])]
+
+
+def _rand_maxed_partition(
+    r: random.Random, total: int, n: int, mx: int, positive: bool = True
+) -> List[int]:
+    # returns list of n nonnegative integers less than mx summing to total
+    # NB: This is unfortunately biased towards evenly-split bins.
+    # If `positive`, outputs are positive
+    if positive:
+        total -= n
+        mx -= 1
+    idxs = r.sample(range(n * mx), total)
+    y = torch.zeros(n, mx, dtype=torch.int32)
+    y.flatten()[idxs] = 1
+    z = y.sum(1)
+    if positive:
+        z += 1
+    return z.tolist()
+
+
+def _rand_seqlens_padded_k(
+    r: random.Random, bs: int, q_len: int, kv_len: int
+) -> Tuple[Sequence[int], Sequence[int]]:
+    # This is for BlockDiagonalCausalWithOffsetPaddedKeysMask.
+    # we need q_seqlens and k_seqlens to be of len bsz.
+    # For each "batch element" there must be more keys than queries
+    # because this bias type is "bottom right" and so any extra queries
+    # will attend to nothing and have undefined result.
+    # In addition every element of k_seqlens must be <= kv_len
+    if q_len > kv_len:
+        raise ValueError("need more keys than values")
+    if q_len == kv_len:
+        # all key slots are needed so we cannot have padding
+        q_seqlens = k_seqlens = [kv_len] * bs
+    else:
+        q_seqlens = _rand_maxed_partition(r, q_len * bs, bs, kv_len)
+        k_seqlens = [r.randint(i, kv_len) for i in q_seqlens]
+    return q_seqlens, k_seqlens
+
+
+def _create_aligned_bias(B: int, H: int, Mq: int, Mkv: int, **kwargs) -> torch.Tensor:
+    align_to = 8
+    return (
+        torch.randn(
+            (
+                B,
+                H,
+                Mq,
+                align_to * ((Mkv + align_to - 1) // align_to),
+            ),
+            **kwargs,
+        )
+        * 3
+    )[:, :, :, :Mkv]
+
+
+def create_attn_bias(
+    bias_type,
+    batch_size: int,
+    num_heads: int,
+    q_len: int,
+    kv_len: int,
+    device,
+    dtype,
+    requires_grad: bool,
+    fmt: str,
+    op: Type[AttentionOpBase],
+):
+    if bias_type is None or isinstance(None, bias_type):
+        return None
+    r = random.Random("-".join(map(str, [batch_size, q_len, kv_len, dtype, fmt])))
+    if bias_type is torch.Tensor:
+        if fmt == "BMK":
+            batch_size *= num_heads
+            num_heads = 1
+        # `small_k` only supports an expanded 1d bias
+        if op in [fmha.small_k.FwOp, fmha.small_k.BwOp]:
+            attn_bias = (
+                torch.randn(
+                    (batch_size, num_heads, 1, kv_len), device=device, dtype=dtype
+                )
+                * 3
+            )
+            attn_bias = attn_bias.expand(batch_size, num_heads, q_len, kv_len)
+        else:
+            attn_bias = _create_aligned_bias(
+                batch_size,
+                num_heads,
+                q_len,
+                kv_len,
+                device=device,
+                dtype=dtype,
+            )
+
+            # ToDo: need a fix in ck-flashAttn to avoid divided-by-zero when all-(-inf) occurred
+            #       with the data read by one-thread
+            # make sure it also works if the first columns are partially masked out
+            ## attn_bias[0, 0, q_len - 1 :, : num_heads - 2] = -math.inf
+
+        if requires_grad:
+            attn_bias.requires_grad_(True)
+        return attn_bias
+    if bias_type is fmha.attn_bias.LowerTriangularMask:
+        return fmha.attn_bias.LowerTriangularMask()
+    if bias_type is fmha.attn_bias.LowerTriangularMaskWithTensorBias:
+        attn_bias = _create_aligned_bias(
+            batch_size,
+            num_heads,
+            q_len,
+            kv_len,
+            device=device,
+            dtype=dtype,
+        )
+        if requires_grad:
+            attn_bias.requires_grad_(True)
+        return fmha.attn_bias.LowerTriangularMaskWithTensorBias(attn_bias)
+    if bias_type in [
+        fmha.attn_bias.BlockDiagonalMask,
+        fmha.attn_bias.BlockDiagonalCausalMask,
+        fmha.attn_bias.BlockDiagonalCausalFromBottomRightMask,
+    ]:
+        # This bias is not supported in BMK format
+        assert fmt == "BMHK"
+        block_diag = fmha.attn_bias.BlockDiagonalMask.from_seqlens(
+            *_rand_seqlens(
+                r,
+                batch_size,
+                q_len,
+                kv_len,
+                more_keys_than_queries_per_block=bias_type
+                is fmha.attn_bias.BlockDiagonalCausalFromBottomRightMask,
+            )
+        )
+        if bias_type is fmha.attn_bias.BlockDiagonalCausalMask:
+            block_diag = block_diag.make_causal()
+        if bias_type is fmha.attn_bias.BlockDiagonalCausalFromBottomRightMask:
+            block_diag = block_diag.make_causal_from_bottomright()
+        return block_diag
+    if bias_type == fmha.attn_bias.BlockDiagonalCausalWithOffsetPaddedKeysMask:
+        assert fmt == "BMHK"
+        q, k = _rand_seqlens_padded_k(r, batch_size, q_len, kv_len)
+        g_block_diag = (
+            fmha.attn_bias.BlockDiagonalCausalWithOffsetPaddedKeysMask.from_seqlens(
+                q_seqlen=q,
+                kv_padding=kv_len,
+                kv_seqlen=k,
+            )
+        )
+        return g_block_diag
+
+    assert False, f"Unsupported bias type: {bias_type}"
+
+
+def get_bias_grad(attn_bias, clear: bool = False) -> Optional[torch.Tensor]:
+    tensor_with_grad: Optional[torch.Tensor] = None
+    if isinstance(attn_bias, torch.Tensor):
+        tensor_with_grad = attn_bias
+    if isinstance(attn_bias, fmha.attn_bias.LowerTriangularMaskWithTensorBias):
+        tensor_with_grad = attn_bias._bias
+    if tensor_with_grad is not None:
+        grad = tensor_with_grad.grad
+        if clear:
+            tensor_with_grad.grad = None
+        return grad
+    return None
+
+
+def create_tensors(
+    op: Type[AttentionOpBase],
+    device,
+    dtype,
+    attn_bias_type,
+    B,
+    q_len,
+    kv_len,
+    h,
+    k,
+    kv,
+    *,
+    attn_bias_requires_grad: bool = False,
+    fmt: str = "BMK",
+):
+    torch.manual_seed(B * q_len + kv_len * k + kv)
+    scale = 3
+    if fmt == "BMK":
+        query = torch.randn((B * h, q_len, k), device=device, dtype=dtype).mul_(scale)
+        key = torch.randn((B * h, kv_len, k), device=device, dtype=dtype).mul_(scale)
+        value = torch.randn((B * h, kv_len, kv), device=device, dtype=dtype).mul_(scale)
+    else:
+        assert fmt == "BMHK"
+        query = torch.randn((B, q_len, h, k), device=device, dtype=dtype).mul_(scale)
+        key = torch.randn((B, kv_len, h, k), device=device, dtype=dtype).mul_(scale)
+        value = torch.randn((B, kv_len, h, kv), device=device, dtype=dtype).mul_(scale)
+
+    if fmt == "BMK" and not fmha.common._is_bias_type_supported_in_BMK(attn_bias_type):
+        attn_bias_type = None
+    attn_bias = None
+    if attn_bias_type is not None:
+        attn_bias = create_attn_bias(
+            attn_bias_type,
+            batch_size=B,
+            num_heads=h,
+            q_len=q_len,
+            kv_len=kv_len,
+            dtype=dtype,
+            device=device,
+            requires_grad=attn_bias_requires_grad,
+            fmt=fmt,
+            op=op,
+        )
+        if isinstance(
+            attn_bias,
+            (
+                fmha.attn_bias.BlockDiagonalMask,
+                fmha.attn_bias.BlockDiagonalCausalWithOffsetPaddedKeysMask,
+            ),
+        ):
+            query, key, value = [
+                x.reshape([1, -1, *x.shape[2:]]) for x in [query, key, value]
+            ]
+
+    inputs = fmha.Inputs(query=query, key=key, value=value, attn_bias=attn_bias)
+    reasons = op.not_supported_reasons(inputs)
+    if reasons:
+        err_msg = f"{op.NAME}: unsupported ({'/'.join(reasons)})"
+        # Ensure we free memory to avoid OOMs
+        del query, key, value, attn_bias, inputs
+        pytest.skip(err_msg)
+    return query, key, value, attn_bias
+
+
+def bmhk2bmk(tensor) -> torch.Tensor:
+    return (
+        tensor.permute((0, 2, 1, 3))
+        .contiguous()
+        .view([tensor.shape[0] * tensor.shape[2], tensor.shape[1], tensor.shape[3]])
+    )
+
+
+def bmk2bmhk(tensor, num_heads: int) -> torch.Tensor:
+    return tensor.reshape([-1, num_heads, tensor.shape[1], tensor.shape[2]]).permute(
+        (0, 2, 1, 3)
+    )
+
+
+@pytest.mark.parametrize("fmt", ["BMK", "BMHK"])
+@pytest.mark.parametrize("packed", [False, True])
+@parametrize_opFW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv
+def test_forward(
+    opFW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv,
+    packed,
+    fmt,
+):
+    (
+        op,
+        device,
+        dtype,
+        bias_type,
+        batch_size,
+        q_len,
+        kv_len,
+        h,
+        k,
+        kv,
+    ) = opFW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv
+   
+    if kv > 128:
+        pytest.skip("kv > 128 is not supported by CK-FlashAttention-1")
+
+    if packed and not (k == kv and q_len == kv_len):
+        pytest.skip(
+            f"packed incompatible with `k ({k}) != kv ({kv})` or `q_len ({q_len}) != kv_len ({kv_len})`"
+        )
+    if fmt == "BMK" and not fmha.common._is_bias_type_supported_in_BMK(bias_type):
+        pytest.skip("BMK incompatible with this bias")
+
+    query, key, value, attn_bias = create_tensors(
+        *opFW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv, fmt="BMHK" if packed else fmt
+    )
+
+    if packed:
+        c = torch.stack([query, key, value], 2)
+        if fmt == "BMK":
+            # bm3hk -> 3bhmk -> 3Bmk
+            c = c.permute(2, 0, 3, 1, 4).view([3, -1, q_len, k])
+            query, key, value = c[0], c[1], c[2]
+            # Re-create bias in the right format
+            attn_bias = create_attn_bias(
+                bias_type=bias_type,
+                batch_size=batch_size,
+                num_heads=h,
+                q_len=q_len,
+                kv_len=kv_len,
+                device=device,
+                dtype=dtype,
+                requires_grad=False,
+                fmt=fmt,
+                op=op,
+            )
+        else:
+            # bm3hk -> 3 x bmhk
+            query, key, value = xformers.ops.unbind(c, 2)
+        assert not query.is_contiguous()
+
+    out = xformers.ops.memory_efficient_attention_forward(
+        query, key, value, attn_bias, op=op
+    )
+    assert not out.isnan().any(), ("Output has NaNs", attn_bias)
+    out2 = xformers.ops.memory_efficient_attention_forward(
+        query, key, value, attn_bias, op=op
+    )
+    assert torch.allclose(out, out2, atol=0.0, rtol=0.0), (
+        "Non-deterministic behavior",
+        attn_bias,
+    )
+
+    ref = ref_attention(query, key, value, attn_bias)
+    assert out.shape == ref.shape, out.shape
+    if dtype is torch.bfloat16:
+        assert_allclose(
+             out.float(),
+             ref,
+             atol=2.5e-2,
+             rtol=1e-2,
+        )
+    else:
+        assert_allclose(
+             out.float(),
+             ref,
+             atol=op.ERROR_ATOL[dtype],
+             rtol=op.ERROR_RTOL.get(dtype, 1e-5),
+        )
+
+
+@pytest.mark.parametrize("k_len", [5, 6, 32])
+@pytest.mark.parametrize("batch_size", [1, 4])
+@pytest.mark.parametrize("kv_len", [128, 512])
+@pytest.mark.parametrize("q_len", [128, 512])
+@pytest.mark.parametrize("device", _devices)
+def test_key_query_all_ones(device, q_len, kv_len, batch_size, k_len):
+    scale = 3
+    query = torch.ones((batch_size, q_len, k_len), device=device)
+    key = torch.ones((batch_size, kv_len, k_len), device=device)
+    value = torch.randn((batch_size, kv_len, k_len), device=device) * scale
+
+    out = xformers.ops.memory_efficient_attention(query, key, value)
+    # this should be equivalent to the average over value
+    ref = value.mean(1, keepdim=True).expand_as(query)
+
+    assert_allclose(out, ref, atol=1e-5)
+
+
+def _block_diag_reshape_lse(
+    lse: torch.Tensor, q_seqinfo: fmha.attn_bias._SeqLenInfo
+) -> torch.Tensor:
+    """LSE can be padded, let's remove the padding"""
+    parts = []
+    for slice, (start, end) in zip(lse.unbind(0), q_seqinfo.intervals()):
+        parts.append(slice[:, : end - start])
+    return torch.cat(parts, dim=1).unsqueeze(1)
+
+
+@parametrize_opFW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv
+def test_logsumexp(opFW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv):
+    (
+        op,
+        device,
+        dtype,
+        bias_type,
+        batch_size,
+        q_len,
+        kv_len,
+        h,
+        k,
+        kv,
+    ) = opFW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv
+    query, key, value, attn_bias = create_tensors(
+        *opFW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv, fmt="BMK"
+    )
+
+    _out, lse = xformers.ops.memory_efficient_attention_forward_requires_grad(
+        query,
+        key,
+        value,
+        op=op,
+        attn_bias=attn_bias,
+    )
+    attn = (query.float() / k**0.5) @ key.float().transpose(-2, -1)
+    if attn_bias is not None:
+        if isinstance(attn_bias, xformers.ops.AttentionBias):
+            tensor_bias = attn_bias.materialize(
+                (query.shape[0], 1, query.shape[1], key.shape[1]),
+                device=query.device,
+                dtype=torch.float32,
+            )
+        else:
+            assert isinstance(attn_bias, torch.Tensor)
+            tensor_bias = attn_bias
+        if tensor_bias.ndim == 4:
+            tensor_bias = tensor_bias.reshape([-1, *tensor_bias.shape[2:]])
+        attn = attn + tensor_bias.float()
+    ref_lse = attn.logsumexp(-1)
+    if isinstance(attn_bias, fmha.attn_bias.BlockDiagonalMask):
+        lse = _block_diag_reshape_lse(lse, attn_bias.q_seqinfo)
+    assert_allclose(lse[:, 0, : ref_lse.shape[1]], ref_lse, atol=2e-4)
+
+
+@pytest.mark.parametrize("fmt", ["BMK", "BMHK"])
+@pytest.mark.parametrize("grad_out_contiguous", [False, True])
+@parametrize_opBW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv
+def test_backward(
+    opBW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv,
+    grad_out_contiguous,
+    fmt,
+):
+    (
+        op_bw,
+        device,
+        dtype,
+        bias_type,
+        batch_size,
+        q_len,
+        kv_len,
+        h,
+        k,
+        kv,
+    ) = opBW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv
+    attn_bias_requires_grad = (
+        random.Random(q_len + kv_len * batch_size).randint(0, 1) > 0
+    )
+    query, key, value, attn_bias = create_tensors(
+        *opBW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv,
+        attn_bias_requires_grad=attn_bias_requires_grad,
+        fmt=fmt,
+    )
+    op_fw = (
+        sample_random_supported_fw(
+            fmha.Inputs(query=query, key=key, value=value, attn_bias=attn_bias),
+            seed=q_len * kv + kv_len * k,
+        )
+        if op_bw != fmha.cutlass.BwOp
+        else fmha.cutlass.FwOp
+    )
+    qkv = None
+
+    if (
+        fmt == "BMHK"
+        and query.shape[3] == value.shape[3]
+        and query.shape[1] == value.shape[1]
+    ):
+        qkv = torch.stack([query, key, value], 2)
+        qkv.requires_grad_(True)
+        # bm3hk -> 3 x bmhk
+        query, key, value = xformers.ops.unbind(qkv, 2)
+        assert not query.is_contiguous()
+
+    query.requires_grad_(True)
+    key.requires_grad_(True)
+    value.requires_grad_(True)
+
+    if not op_bw.supports(fmha.Inputs(query, key, value, attn_bias)):
+        pytest.skip("inputs not supported")
+
+    out = xformers.ops.memory_efficient_attention(
+        query, key, value, attn_bias, op=(op_fw, op_bw)
+    )
+
+    grad_out = torch.ones_like(out)
+    if grad_out_contiguous is False:
+        grad_out = torch.tensor([1.0], dtype=query.dtype, device=device)[
+            None, None, :
+        ].expand_as(out)
+
+    out.backward(grad_out)
+
+    if qkv is None and op_bw == fmha.cutlass.BwOp:
+        assert query.stride() == query.grad.stride()
+
+    grads = []
+    if qkv is None:
+        grads = [query.grad, key.grad, value.grad]
+        query.grad = None
+        key.grad = None
+        value.grad = None
+    else:
+        grads = [qkv.grad]
+        qkv.grad = None
+    if attn_bias_requires_grad:
+        attn_bias_grad = get_bias_grad(attn_bias, clear=True)
+        if attn_bias_grad is not None:
+            grads.append(attn_bias_grad)
+
+    ref = ref_attention(query, key, value, attn_bias)
+    ref.backward(grad_out)
+
+    assert_allclose(
+        out.float(),
+        ref.float(),
+        "fw pass",
+        atol=op_fw.ERROR_ATOL[dtype],
+        rtol=op_fw.ERROR_RTOL.get(dtype, 1e-5),
+    )
+
+    del out
+    del grad_out
+    del ref
+
+    atol = op_bw.ERROR_ATOL[dtype]
+    rtol = op_bw.ERROR_RTOL[dtype]
+
+    grads_ref = []
+    grads_name = []
+    if qkv is None:
+        assert isinstance(query.grad, torch.Tensor)
+        assert isinstance(key.grad, torch.Tensor)
+        assert isinstance(value.grad, torch.Tensor)
+        grads_ref = [query.grad, key.grad, value.grad]
+        grads_name = ["query", "key", "value"]
+    else:
+        assert isinstance(qkv.grad, torch.Tensor)
+        grads_ref = [qkv.grad]
+        grads_name = ["qkv"]
+
+    if attn_bias_requires_grad:
+        attn_bias_grad = get_bias_grad(attn_bias)
+        if attn_bias_grad is not None:
+            grads_ref.append(attn_bias.grad)
+            grads_name.append("bias")
+
+    del query
+    del key
+    del value
+    del qkv
+
+    assert len(grads_ref) == len(
+        grads
+    ), "Wrong number of gradients (maybe bias grad didn't backprop?)"
+    for name, calc_grad, ref_grad in zip(grads_name, grads, grads_ref):
+        assert_allclose(
+            calc_grad,
+            ref_grad,
+            msg=f"{op_fw.NAME}+{op_bw.NAME}:{name}",
+            atol=atol,
+            rtol=rtol,
+        )
+
+
+def _vec_binom_test(x, n, p):
+    """
+    vectorized implementation of scipy.stats.binom_test
+    this makes our tests much faster
+    reference: https://github.com/scipy/scipy/blob/v1.8.0/scipy/stats/_morestats.py#L2609-L2702
+    """
+    import numpy as np
+    from scipy.stats import distributions
+
+    x = np.atleast_1d(x)
+    d = distributions.binom.pmf(x, n, p)[:, None]
+    rerr = 1 + 1e-7
+    # x < p * n case
+    i = np.arange(np.ceil(p * n), n + 1)
+    y = np.sum(distributions.binom.pmf(i, n, p) <= d * rerr, axis=1)
+    pval1 = distributions.binom.cdf(x, n, p) + distributions.binom.sf(n - y, n, p)
+
+    # other case
+    i = np.arange(np.floor(p * n) + 1)
+    y = np.sum(distributions.binom.pmf(i, n, p) <= d * rerr, axis=1)
+    pval2 = distributions.binom.cdf(y - 1, n, p) + distributions.binom.sf(x - 1, n, p)
+
+    pval = np.where(x < p * n, pval1, pval2)
+    pval = np.minimum(1.0, pval)
+    return pval
+
+
+def _get_drop_mask(op, batch_size, q_len, kv_len, p, device):
+    if op == fmha.cutlass.FwOp:
+        mask = torch.empty((batch_size, 1, q_len, kv_len), device=device)
+        rand_uniform = torch.ops.xformers._cutlass_rand_uniform(p, mask)
+        mask = (rand_uniform > p).to(torch.float32)
+        mask = mask.reshape(batch_size, q_len, kv_len)
+    else:
+        mask = torch.empty((batch_size, q_len, kv_len), device=device)
+        mask = torch.ops.xformers._temp_dropout(mask, p)
+
+    return mask
+
+
+### disable this test due to the un-availability of binomtest
+'''
+@cuda_only
+@pytest.mark.parametrize("attn_bias", [None, fmha.attn_bias.LowerTriangularMask()])
+@pytest.mark.parametrize("seed", [42, 124])
+@pytest.mark.parametrize("p", [0.3, 0.7])
+@pytest.mark.parametrize("k_len", [32])
+@pytest.mark.parametrize("batch_size", [1, 2])
+@pytest.mark.parametrize("kv_len", [3, 15, 32, 33, 65])
+@pytest.mark.parametrize("q_len", [2, 33])
+@pytest.mark.parametrize("op", ALL_FW_OPS, ids=list(map(lambda t: t.NAME, ALL_FW_OPS)))
+def test_dropout(op, q_len, kv_len, batch_size, k_len, p, seed, attn_bias):
+    device = "cuda"
+    scale = 3
+    query = torch.randn((batch_size, q_len, k_len), device=device) * scale
+    key = torch.randn((batch_size, kv_len, k_len), device=device) * scale
+    value = torch.randn((batch_size, kv_len, k_len), device=device) * scale
+
+    inputs_for_support_check = fmha.Inputs(query, key, value, attn_bias, p, None)
+    if not op.supports(inputs_for_support_check):
+        del query, key, value, attn_bias
+        pytest.skip(f"{op.NAME}: unsupported input")
+
+    torch.manual_seed(seed)
+    out = xformers.ops.memory_efficient_attention(
+        query, key, value, attn_bias, p, op=(op, None)
+    )
+
+    torch.manual_seed(seed)
+    out2 = xformers.ops.memory_efficient_attention(
+        query, key, value, attn_bias, p, op=(op, None)
+    )
+
+    assert_allclose(out, out2, "dropout reproducibility")
+
+    torch.manual_seed(seed)
+    mask = _get_drop_mask(op, batch_size, q_len, kv_len, p, device)
+    ref = ref_attention(query, key, value, attn_bias, mask, p)
+    assert_allclose(out, ref, atol=2e-4), f"{(out - ref).abs().max()}"
+
+    num_trials = 1000
+    p_val_tol = 1e-6
+    keep_prob = 1 - p
+    masks = []
+    for i in range(num_trials):
+        mask = _get_drop_mask(op, batch_size, q_len, kv_len, p, device)
+        masks.append(mask.clone().cpu())
+    masks = torch.stack(masks, dim=0)
+    p_value = binomtest(int(masks.sum()), masks.numel(), p=keep_prob).pvalue
+    assert p_value > p_val_tol, p_value
+    masks = masks.sum(0).flatten()
+    p_values = _vec_binom_test(masks, num_trials, p=keep_prob)
+    assert all(p_values > p_val_tol)
+'''
+
+def _test_dropout_backward(q_len, kv_len, batch_size, k, p, op, dtype):
+    if dtype is torch.bfloat16 and compute_capability < (8, 0):
+        pytest.skip("bf16 requires Sm80")
+    if not op.is_available():
+        pytest.skip()
+
+    scale = 3
+    device = "cuda"
+    query = torch.randn((batch_size, q_len, k), device=device, dtype=dtype) * scale
+    key = torch.randn((batch_size, kv_len, k), device=device, dtype=dtype) * scale
+    value = torch.randn((batch_size, kv_len, k), device=device, dtype=dtype) * scale
+
+    query.requires_grad_(True)
+    key.requires_grad_(True)
+    value.requires_grad_(True)
+
+    grad_out = torch.ones_like(query)
+
+    assert op.supports(fmha.Inputs(query=query, key=key, value=value, p=p))
+
+    seed = 42
+    torch.manual_seed(seed)
+    out = xformers.ops.memory_efficient_attention(query, key, value, p=p, op=(op, None))
+
+    out.backward(grad_out)
+
+    grad_q = query.grad
+    grad_k = key.grad
+    grad_v = value.grad
+
+    query.grad = None
+    key.grad = None
+    value.grad = None
+
+    torch.manual_seed(seed)
+    mask = _get_drop_mask(op, batch_size, q_len, kv_len, p, device)
+
+    ref = ref_attention(query, key, value, None, mask, p)
+    ref.backward(grad_out)
+
+    atol, rtol = (
+        fmha.AttentionBwOpBase.ERROR_ATOL[dtype],
+        fmha.AttentionBwOpBase.ERROR_RTOL[dtype],
+    )
+    assert_allclose(
+        grad_v,
+        value.grad,
+        "grad_v",
+        atol=atol,
+        rtol=rtol,
+    )
+    # TODO: Investigate why precision is worse
+    if dtype in [torch.float16, torch.bfloat16]:
+        atol = atol * 2 + 0.15
+        rtol = rtol * 2
+    assert_allclose(
+        grad_q,
+        query.grad,
+        "grad_q",
+        atol=atol,
+        rtol=rtol,
+    )
+    assert_allclose(
+        grad_k,
+        key.grad,
+        "grad_k",
+        atol=atol,
+        rtol=rtol,
+    )
+
+
+@cuda_only
+@pytest.mark.parametrize("p", [0.3, 0.7])
+@pytest.mark.parametrize("k", [5, 6, 32])
+@pytest.mark.parametrize("batch_size", [1, 2])
+@pytest.mark.parametrize("kv_len", [3, 15, 32, 33])
+@pytest.mark.parametrize("q_len", [2, 33])
+def test_dropout_backward_small_k(q_len, kv_len, batch_size, k, p):
+    _test_dropout_backward(
+        q_len, kv_len, batch_size, k, p, op=fmha.small_k.FwOp, dtype=torch.float32
+    )
+
+
+@cuda_only
+@pytest.mark.parametrize("p", [0.000001, 0.3, 0.7])
+@pytest.mark.parametrize("k", [16, 128, 256])
+@pytest.mark.parametrize("batch_size", [1, 2])
+@pytest.mark.parametrize("kv_len", [3, 248, 256])
+@pytest.mark.parametrize("q_len", [3, 248, 256])
+@pytest.mark.parametrize("dt", ["f16", "bf16", "f32"])
+def test_dropout_backward_cutlass(dt, q_len, kv_len, batch_size, k, p):
+    _test_dropout_backward(
+        q_len,
+        kv_len,
+        batch_size,
+        k,
+        p,
+        op=fmha.cutlass.FwOp,
+        dtype={"f16": torch.float16, "bf16": torch.bfloat16, "f32": torch.float32}[dt],
+    )
+
+
+@pytest.mark.parametrize("k_len", [32])
+@pytest.mark.parametrize("batch_size", [1])
+@pytest.mark.parametrize("kv_len", [3 * 32])
+@pytest.mark.parametrize("q_len", [3 * 32])
+@pytest.mark.parametrize("device", _devices)
+def test_memory_efficient_attention_full_block_masked(
+    device, q_len, kv_len, batch_size, k_len
+):
+    op_fw = fmha.small_k.FwOp
+    op_bw = fmha.small_k.BwOp
+
+    scale = 3
+    query = torch.randn((batch_size, q_len, k_len), device=device) * scale
+    key = torch.randn((batch_size, kv_len, k_len), device=device) * scale
+    value = torch.randn((batch_size, kv_len, k_len), device=device) * scale
+
+    # in this case, most of the blocks in a row get masked
+    attn_bias = torch.full((3, 32), float("-inf"), device=device)
+    attn_bias[:2, :4] = 0
+    attn_bias = attn_bias.flatten()[None, None, :].expand(1, q_len, -1)
+
+    out = xformers.ops.memory_efficient_attention(
+        query, key, value, attn_bias, op=(op_fw, op_bw)
+    )
+    ref = ref_attention(query, key, value, attn_bias)
+
+    assert_allclose(
+        out, ref, atol=op_fw.ERROR_ATOL[query.dtype], rtol=op_fw.ERROR_RTOL[query.dtype]
+    )
+
+    query.requires_grad_(True)
+    key.requires_grad_(True)
+    value.requires_grad_(True)
+
+    grad_out = torch.ones_like(query)
+
+    out = xformers.ops.memory_efficient_attention(query, key, value, attn_bias)
+    out.backward(grad_out)
+
+    grad_q = query.grad
+    grad_k = key.grad
+    grad_v = value.grad
+
+    query.grad = None
+    key.grad = None
+    value.grad = None
+
+    ref = ref_attention(query, key, value, attn_bias)
+    ref.backward(grad_out)
+
+    atol = op_bw.ERROR_ATOL[query.dtype]
+    rtol = op_bw.ERROR_RTOL[query.dtype]
+    assert_allclose(grad_q, query.grad, "grad_q", atol=atol, rtol=rtol)
+    assert_allclose(grad_k, key.grad, "grad_k", atol=atol, rtol=rtol)
+    assert_allclose(grad_v, value.grad, "grad_v", atol=atol, rtol=rtol)
+
+
+@pytest.mark.parametrize("fmt", ["BMK", "BMHK"])
+@parametrize_opBW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv__xs
+def test_lowlevel_api_shapes(opBW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv, fmt):
+    query, key, value, attn_bias = create_tensors(
+        *opBW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv, fmt=fmt
+    )
+    grad_out = torch.ones_like(query)
+    query.requires_grad_(True)
+    key.requires_grad_(True)
+    value.requires_grad_(True)
+
+    out, lse = xformers.ops.memory_efficient_attention_forward_requires_grad(
+        query, key, value, attn_bias
+    )
+    assert out.ndim == query.ndim
+    dq, dk, dv = xformers.ops.memory_efficient_attention_backward(
+        grad_out, out, lse, query, key, value, attn_bias
+    )
+    assert dq.shape == query.shape
+    assert dk.shape == key.shape
+    assert dv.shape == value.shape
+
+
+@parametrize_opFW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv__xs
+def test_cuda_streams(
+    opFW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv,
+):
+    (
+        op,
+        device,
+        dtype,
+        bias_type,
+        batch_size,
+        q_len,
+        kv_len,
+        h,
+        k,
+        kv,
+    ) = opFW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv
+    if device != "cuda":
+        pytest.skip("Not CUDA")
+    bias_type = None
+    opFW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv = [
+        op,
+        device,
+        dtype,
+        bias_type,
+        batch_size,
+        q_len,
+        kv_len,
+        h,
+        k,
+        kv,
+    ]
+    s_hipri = torch.cuda.Stream(priority=-1)
+    s_lopri = torch.cuda.Stream(priority=0)
+    query, key, value, attn_bias = create_tensors(
+        *opFW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv, fmt="BMHK"
+    )
+    torch.cuda.synchronize()
+    with torch.cuda.stream(s_lopri):
+        torch.cuda._sleep(100_000_000)  # wait 100m cycles
+        query *= 2
+    s_hipri.wait_stream(s_lopri)
+    with torch.cuda.stream(s_hipri):
+        # If the kernel is scheduled in the main stream
+        # `query * 2` has not been executed yet
+        out = xformers.ops.memory_efficient_attention(query, key, value, op=(op, None))
+    # Test that `s_lopri` is still sleeping
+    # and that `query *= 2` has not been executed yet
+    query2_main_stream = query * 2
+    torch.cuda.synchronize()
+    # TODO: Figure out why this is failing sometimes
+    # The sleep timer seems to be high enough already ...
+    # assert torch.allclose(query2_main_stream, query), "Need to increase sleep time"
+    del query2_main_stream
+
+    ref = ref_attention(query, key, value)
+    assert out.shape == ref.shape, out.shape
+
+    assert_allclose(
+        out.float(),
+        ref.float(),
+        atol=op.ERROR_ATOL[dtype],
+        rtol=op.ERROR_RTOL.get(dtype, 1e-5),
+    )
+
+
+@parametrize_opBW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv__xs
+def test_custom_scale(opBW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv):
+    p = 0.0
+    scale = 1.0
+
+    (
+        op_bw,
+        device,
+        dtype,
+        _,
+        _,
+        q_len,
+        kv_len,
+        _,
+        k,
+        _,
+    ) = opBW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv
+    torch.manual_seed(q_len + kv_len + k)
+    if device != "cuda":
+        pytest.skip("Not CUDA")
+
+    query, key, value, attn_bias = create_tensors(
+        *opBW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv, fmt="BMK"
+    )
+    inputs = fmha.Inputs(
+        query=query, key=key, value=value, attn_bias=attn_bias, scale=scale
+    )
+    op_fw = sample_random_supported_fw(inputs, seed=q_len * k + kv_len * k)
+    grad_out = torch.ones_like(query)
+    query.requires_grad_(True)
+    key.requires_grad_(True)
+    value.requires_grad_(True)
+
+    reasons = op_fw.not_supported_reasons(inputs)
+    if reasons:
+        pytest.skip(f"{op_fw.NAME}: unsupported ({'/'.join(reasons)})")
+    reasons = op_bw.not_supported_reasons(inputs)
+    if reasons:
+        pytest.skip(f"{op_bw.NAME}: unsupported ({'/'.join(reasons)})")
+
+    # NOTE: we still need to scale the inputs to not blowup
+    # the pre-softmax values (numerical stability)
+    s = k**-0.5
+    out = xformers.ops.memory_efficient_attention(
+        query * s, key, value, attn_bias, p, scale, op=(op_fw, op_bw)
+    )
+    out.backward(grad_out)
+    grad_q, grad_k, grad_v = query.grad, key.grad, value.grad
+    query.grad = key.grad = value.grad = None
+
+    ref = ref_attention(query * s, key, value, attn_bias, None, p, scale)
+    ref.backward(grad_out)
+    ref_grad_q, ref_grad_k, ref_grad_v = query.grad, key.grad, value.grad
+    query.grad = key.grad = value.grad = None
+
+    atol = op_fw.ERROR_ATOL[dtype]
+    rtol = op_fw.ERROR_RTOL[dtype]
+    assert_allclose(out.float(), ref.float(), "out", atol=atol, rtol=rtol)
+    atol = op_bw.ERROR_ATOL[dtype]
+    rtol = op_bw.ERROR_RTOL[dtype]
+    assert_allclose(grad_q, ref_grad_q, "grad_q", atol=atol, rtol=rtol)
+    assert_allclose(grad_k, ref_grad_k, "grad_k", atol=atol, rtol=rtol)
+    assert_allclose(grad_v, ref_grad_v, "grad_v", atol=atol, rtol=rtol)
+
+
+def apply_attention(query, key, value, attn_bias, op_fw, proj):
+    x = xformers.ops.memory_efficient_attention(
+        query, key, value, attn_bias=attn_bias, op=(op_fw, None)
+    )
+    x = proj(x)
+    return x
+
+
+@pytest.mark.parametrize("use_reentrant", [False, True])
+@parametrize_opFW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv__xs
+def test_grad_checkpointing(
+    opFW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv,
+    use_reentrant,
+):
+    fmt = "BMHK"
+    (
+        op,
+        device,
+        dtype,
+        bias_type,
+        batch_size,
+        q_len,
+        kv_len,
+        h,
+        k,
+        kv,
+    ) = opFW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv
+    bias_type = None
+    opFW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv = (
+        op,
+        device,
+        dtype,
+        bias_type,
+        batch_size,
+        q_len,
+        kv_len,
+        h,
+        k,
+        kv,
+    )
+    query, key, value, attn_bias = create_tensors(
+        *opFW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv,
+        fmt=fmt,
+    )
+    qkv = None
+
+    if (
+        fmt == "BMHK"
+        and query.shape[3] == value.shape[3]
+        and query.shape[1] == value.shape[1]
+    ):
+        qkv = torch.stack([query, key, value], 2)
+        qkv.requires_grad_(True)
+        # bm3hk -> 3 x bmhk
+        query, key, value = xformers.ops.unbind(qkv, 2)
+        assert not query.is_contiguous()
+
+    query.requires_grad_(True)
+    key.requires_grad_(True)
+    value.requires_grad_(True)
+
+    proj = torch.nn.Linear(kv, k, device=device, dtype=dtype)
+
+    x = query
+    for _ in range(5):
+        x = checkpoint(
+            apply_attention,
+            x,
+            key,
+            value,
+            attn_bias,
+            op,
+            proj,
+            use_reentrant=use_reentrant,
+        )
+    x.mean().backward()
+
+
+ALL_FW_OPS_NO_SMALLK = [op for op in ALL_FW_OPS if op is not fmha.small_k.FwOp]
+
+
+@pytest.mark.parametrize(
+    "op", ALL_FW_OPS_NO_SMALLK, ids=[op.NAME for op in ALL_FW_OPS_NO_SMALLK]
+)
+def test_unsupported_cpu(op: Type[fmha.AttentionFwOpBase]):
+    q = torch.empty([1, 1, 1, 32])
+    with pytest.raises(ValueError):
+        fmha.memory_efficient_attention(q, q, q, op=(op, None))
+
+
+@cuda_only
+@pytest.mark.parametrize(
+    "op", ALL_FW_OPS_NO_SMALLK, ids=[op.NAME for op in ALL_FW_OPS_NO_SMALLK]
+)
+def test_unsupported_stride_lastdim(op: Type[fmha.AttentionFwOpBase]):
+    q = torch.empty([1, 1, 32, 4], device="cuda", dtype=torch.float16).permute(
+        0, 1, 3, 2
+    )
+    try:
+        fmha.memory_efficient_attention(q, q, q, op=(op, None))
+    except ValueError as e:
+        if "Only work on pre-MLIR triton for now" in str(e):
+            pytest.skip("Only work on pre-MLIR triton for now")
+        q = q.contiguous()
+        fmha.memory_efficient_attention(q, q, q, op=(op, None))
+
+
+@cuda_only
+@pytest.mark.parametrize(
+    "op", ALL_FW_OPS_NO_SMALLK, ids=[op.NAME for op in ALL_FW_OPS_NO_SMALLK]
+)
+def test_unsupported_stride_alignment(op: Type[fmha.AttentionFwOpBase]):
+    q = torch.empty([1, 2, 2, 33], device="cuda", dtype=torch.float16)[:, :, :, :32]
+    try:
+        fmha.memory_efficient_attention(q, q, q, op=(op, None))
+    except ValueError as e:
+        if "Only work on pre-MLIR triton for now" in str(e):
+            pytest.skip("Only work on pre-MLIR triton for now")
+        q = q.contiguous()
+        fmha.memory_efficient_attention(q, q, q, op=(op, None))
+
+
+@sm75_or_better_only
+def test_unsupported_dropout_combine_flash_cutlass() -> None:
+    q = torch.empty(
+        [1, 4, 1, 16], device="cuda", dtype=torch.float16, requires_grad=True
+    )
+    with pytest.raises(ValueError):
+        out = fmha.memory_efficient_attention(
+            q, q, q, p=0.1, op=(fmha.cutlass.FwOp, fmha.flash.BwOp)
+        )
+        out.backward(out)
+    with pytest.raises(ValueError):
+        out = fmha.memory_efficient_attention(
+            q, q, q, p=0.1, op=(fmha.flash.FwOp, fmha.cutlass.BwOp)
+        )
+        out.backward(out)
+
+
+def test_attn_bias_causal() -> None:
+    m = -math.inf
+    causal_mask = torch.tensor([[0, m], [0, 0], [0, 0]])
+    tensor_bias = torch.tensor([[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]])
+
+    attn_bias = fmha.attn_bias.LowerTriangularMask()
+    assert_allclose(attn_bias.materialize(causal_mask.shape), causal_mask, "causal")
+    attn_bias = attn_bias.add_bias(tensor_bias)
+    assert_allclose(
+        attn_bias.materialize(causal_mask.shape),
+        tensor_bias + causal_mask,
+        "causal+tensor_bias",
+    )
+
+
+def test_attn_bias_torch_tensor() -> None:
+    tensor_bias = torch.tensor([[1.0, 2.0, 3.0], [3.0, 4.0, 5.0]])
+    attn_bias = fmha.attn_bias.LowerTriangularMaskWithTensorBias(tensor_bias)
+    m = -math.inf
+    causal_bias = torch.tensor([[0, m, m], [0, 0, m]])
+    assert_allclose(
+        attn_bias.materialize((2, 3)), causal_bias + tensor_bias, "tensor_bias+causal"
+    )
+
+
+def test_attn_bias_blockdiag() -> None:
+    queries = [
+        torch.randn([1, 3, 1, 8]),
+        torch.randn([1, 2, 1, 8]),
+        torch.randn([1, 5, 1, 8]),
+    ]
+    attn_bias, q = fmha.BlockDiagonalMask.from_tensor_list(queries)
+
+    # Verify mask
+    as_tensor = attn_bias.materialize((10, 10))
+    assert int((as_tensor != -math.inf).sum().item()) == 3 * 3 + 2 * 2 + 5 * 5
+    assert_allclose(as_tensor[0:3, 0:3], torch.zeros([3, 3]), "batch0")
+    assert_allclose(as_tensor[3:5, 3:5], torch.zeros([2, 2]), "batch1")
+    assert_allclose(as_tensor[5:, 5:], torch.zeros([5, 5]), "batch2")
+
+    # Verify we can split it back
+    queries2 = attn_bias.split(q)
+    assert len(queries) == len(queries2)
+    for q1, q2 in zip(queries, queries2):
+        assert_allclose(q1, q2)
+
+
+def test_attn_bias_blockdiag_batched() -> None:
+    queries = [
+        torch.randn([1, 3, 1, 8]),
+        torch.randn([3, 2, 1, 8]),
+        torch.randn([1, 5, 1, 8]),
+    ]
+    attn_bias, q = fmha.BlockDiagonalMask.from_tensor_list(queries)
+
+    # Verify mask
+    as_tensor = attn_bias.materialize((14, 14))
+    assert int((as_tensor != -math.inf).sum().item()) == 3 * 3 + 3 * 2 * 2 + 5 * 5
+    assert_allclose(as_tensor[0:3, 0:3], torch.zeros([3, 3]), "batch0")
+    assert_allclose(as_tensor[3:5, 3:5], torch.zeros([2, 2]), "batch1.0")
+    assert_allclose(as_tensor[5:7, 5:7], torch.zeros([2, 2]), "batch1.1")
+    assert_allclose(as_tensor[7:9, 7:9], torch.zeros([2, 2]), "batch1.2")
+    assert_allclose(as_tensor[9:, 9:], torch.zeros([5, 5]), "batch2")
+
+    # Verify we can split it back
+    queries2 = attn_bias.split(q)
+    assert len(queries) == len(queries2)
+    for q1, q2 in zip(queries, queries2):
+        assert_allclose(q1, q2)
+
+
+def test_attn_bias_blockdiag_crossattn_causal() -> None:
+    # Q / KV have different seqlen
+    list_q = [
+        torch.randn([1, 3, 1, 8]),
+        torch.randn([2, 1, 1, 8]),
+    ]
+    list_k = [
+        torch.randn([1, 2, 1, 8]),
+        torch.randn([2, 3, 1, 8]),
+    ]
+
+    attn_bias, q, k, _ = fmha.attn_bias.BlockDiagonalMask.from_tensor_lists_qkv(
+        list_q, list_k
+    )
+
+    # Verify mask
+    as_tensor = attn_bias.materialize((q.shape[1], k.shape[1]))
+    assert int((as_tensor != -math.inf).sum().item()) == 3 * 2 + 2 * 3 * 1
+    assert_allclose(as_tensor[0:3, 0:2], torch.zeros([3, 2]), "batch0")
+    assert_allclose(as_tensor[3:4, 2:5], torch.zeros([1, 3]), "batch1.0")
+    assert_allclose(as_tensor[4:, 5:], torch.zeros([1, 3]), "batch1.1")
+
+    # Also test causal version
+    as_tensor = attn_bias.make_causal().materialize((q.shape[1], k.shape[1]))
+    assert_allclose(
+        as_tensor[3:4, 2:5],
+        fmha.attn_bias.LowerTriangularMask().materialize((1, 3)),
+        "batch1.0[causal]",
+    )
+
+    # Verify we can split it back
+    list_q2 = attn_bias.split_queries(q)
+    assert len(list_q) == len(list_q2)
+    for q1, q2 in zip(list_q, list_q2):
+        assert_allclose(q1, q2)
+    with pytest.raises(ValueError):
+        attn_bias.split_queries(k)
+    list_k2 = attn_bias.split_kv(k)
+    assert len(list_k) == len(list_k2)
+    for k1, k2 in zip(list_k, list_k2):
+        assert_allclose(k1, k2)
+
+
+def test_attn_bias_blockdiag_crossattn_causal_with_prefix_qk_cond() -> None:
+    list_q = [
+        torch.randn([1, 3, 1, 8]),
+    ]
+    list_k = [
+        torch.randn([1, 2, 1, 8]),
+    ]
+    attn_bias, q, k, _ = fmha.attn_bias.BlockDiagonalMask.from_tensor_lists_qkv(
+        list_q, list_k
+    )
+    with pytest.raises(ValueError):
+        attn_bias.make_causal_from_bottomright()
+
+
+def test_attn_bias_blockdiag_crossattn_causal_with_prefix() -> None:
+    # Q / KV have different seqlen
+    list_q = [
+        torch.randn([1, 2, 1, 8]),
+        torch.randn([2, 2, 1, 8]),
+    ]
+    list_k = [
+        torch.randn([1, 2, 1, 8]),
+        torch.randn([2, 5, 1, 8]),
+    ]
+
+    attn_bias, q, k, _ = fmha.attn_bias.BlockDiagonalMask.from_tensor_lists_qkv(
+        list_q, list_k
+    )
+    as_tensor = attn_bias.make_causal_from_bottomright().materialize(
+        (q.shape[1], k.shape[1])
+    )
+    m = -math.inf
+    assert_allclose(
+        as_tensor[0:2, 0:2],
+        torch.tensor([[0, m], [0, 0]], dtype=torch.float32),
+        "batch1.1[causal_with_prefix]",
+    )
+    assert_allclose(
+        as_tensor[2:4, 2:7],
+        torch.tensor([[0, 0, 0, 0, m], [0, 0, 0, 0, 0]], dtype=torch.float32),
+        "batch2.1[causal_with_prefix]",
+    )
+    assert_allclose(
+        as_tensor[4:6, 7:12],
+        torch.tensor([[0, 0, 0, 0, m], [0, 0, 0, 0, 0]], dtype=torch.float32),
+        "batch2.2[causal_with_prefix]",
+    )
+
+
+@cuda_only
+def test_attn_bias_padded() -> None:
+    bsize, n_heads, d, padding = 8, 3, 8, 32
+
+    # Q / KV have different seqlen
+    k = torch.randn((bsize, padding, n_heads, d)).cuda().half()
+    k_seqlen = [5, 8, 7, 1, 9, 3, 12, 32]
+    other = bsize - 1
+    v = torch.randn((bsize, padding, n_heads, d)).cuda().half()
+    n_q_first = 4
+    q = [
+        torch.randn((1, n_q_first, n_heads, d)).cuda().half(),
+        torch.randn((1, other, n_heads, d)).cuda().half(),
+    ]
+    q_cat = torch.cat([x.view(1, -1, n_heads, d) for x in q], dim=1)
+    # causal_diagonal = torch.tensor(
+    #     [0] + [i - 1 for i in k_seqlen[1:]], dtype=torch.int32
+    # ).cuda()
+
+    q_seqlen = [n_q_first] + [1] * other
+
+    attn_bias = fmha.attn_bias.BlockDiagonalCausalWithOffsetPaddedKeysMask.from_seqlens(
+        q_seqlen=q_seqlen,
+        kv_seqlen=k_seqlen,
+        kv_padding=padding,
+    )
+
+    v = v.view(1, -1, n_heads, d)
+    k = k.view(1, -1, n_heads, d)
+
+    scores = (q_cat.transpose(1, 2) @ k.transpose(1, 2).transpose(2, 3)).float()
+    assert not scores.isnan().any()
+    mask = torch.full_like(scores, -float("inf"))
+    for i, (slen, qlen) in enumerate(zip(k_seqlen, q_seqlen)):
+        kseq_start = i * padding
+        qstart = sum(q_seqlen[:i])
+        mask[:, :, qstart : qstart + qlen, kseq_start : kseq_start + slen] = torch.triu(
+            mask[:, :, qstart : qstart + qlen, kseq_start : kseq_start + slen].float(),
+            diagonal=1 + slen - qlen,
+        ).float()
+
+    scores += mask
+    assert not scores.isnan().any()
+    # 1,3,10,8 @ 1,3,8,256 -> 1,3,10,256
+    scores = torch.nn.functional.softmax(scores, -1).half()
+    # torch.Size([1, 3, 3, 32]) @ torch.Size([1, 3, 32, 8])
+    output = scores @ v.transpose(1, 2)  # 1,3,10,256 @ 1,3,256, 8 -> 1,3,10,8
+    output = output.transpose(1, 2).contiguous()
+
+    fmha_output = fmha.memory_efficient_attention_forward(
+        q_cat, k, v, attn_bias, scale=1.0
+    )
+
+    # assert torch.allclose(output, fmha_output)
+    assert_allclose(
+        output,
+        fmha_output,
+        atol=fmha.cutlass.FwOp.ERROR_ATOL[torch.float16],
+        rtol=fmha.cutlass.FwOp.ERROR_RTOL[torch.float16],
+    )
+
+
+def test_attn_bias_from_seqlens() -> None:
+    bias = fmha.attn_bias.BlockDiagonalMask.from_seqlens([3, 5, 1])
+    out = bias.split(torch.randn([1, 3 + 5 + 1, 16]))
+    assert len(out) == 3
+    assert tuple(out[0].shape) == (1, 3, 16)
+
+
+@cuda_only
+def test_attn_bias_blockdiag_doc() -> None:
+    """IMPORTANT:
+    This is the example in the doc for `BlockDiagonalMask`.
+    If this example needs to be updated, please also update the doc
+    """
+    import torch
+
+    from xformers.ops import fmha
+
+    K = 16
+    dtype = torch.float16
+    device = "cuda"
+    list_x = [
+        torch.randn([1, 3, 1, K], dtype=dtype, device=device),
+        torch.randn([1, 6, 1, K], dtype=dtype, device=device),
+        torch.randn([1, 2, 1, K], dtype=dtype, device=device),
+    ]
+    attn_bias, x = fmha.BlockDiagonalMask.from_tensor_list(list_x)
+
+    linear = torch.nn.Linear(K, K * 3).to(device=device, dtype=dtype)  # type: ignore
+
+    q, k, v = linear(x).reshape([1, -1, 1, 3, K]).unbind(-2)
+    out = fmha.memory_efficient_attention(q, k, v, attn_bias=attn_bias)
+    list_out = attn_bias.split(out)
+    print(list_out[0].shape)  # [1, 3, 1, K]
+    assert tuple(list_out[0].shape) == (1, 3, 1, K)
+
+
+@cuda_only
+class TestAttnBias:
+    @staticmethod
+    def create_tensors(
+        dtype,
+        B: int = 2,
+        Mq: int = 32,
+        Mkv: int = 32,
+        H: int = 3,
+        K: int = 16,
+        Kv: int = 16,
+    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
+        return (
+            torch.randn([B, Mq, H, K], device="cuda", dtype=dtype) * 3,
+            torch.randn([B, Mkv, H, K], device="cuda", dtype=dtype) * 3,
+            torch.randn([B, Mkv, H, Kv], device="cuda", dtype=dtype) * 3,
+            torch.randn([B, H, Mq, Mkv], device="cuda", dtype=dtype) * 3,
+        )
+
+    @staticmethod
+    def pad_bias(bias: torch.Tensor) -> torch.Tensor:
+        align_to = 16
+        if (bias.shape[-1] % align_to) == 0:
+            return bias
+        pad_count = align_to - (bias.shape[-1] % align_to)
+        return torch.nn.functional.pad(bias, [0, pad_count])[:, :, :, : bias.shape[-1]]
+
+    def test_f16_biasf32(self) -> None:
+        q, k, v, bias = self.create_tensors(torch.float16)
+        fmha.memory_efficient_attention(q, k, v, attn_bias=bias)
+        bias = bias.to(torch.float32)
+        with pytest.raises((ValueError, RuntimeError)):
+            fmha.memory_efficient_attention(q, k, v, attn_bias=bias)
+
+    def test_f32_biasf16(self) -> None:
+        q, k, v, bias = self.create_tensors(torch.float32)
+        fmha.memory_efficient_attention(q, k, v, attn_bias=bias)
+        bias = bias.to(torch.float16)
+        with pytest.raises((ValueError, RuntimeError)):
+            fmha.memory_efficient_attention(q, k, v, attn_bias=bias)
+
+    @pytest.mark.parametrize("dtype", [torch.float32, torch.float16])
+    def test_wrong_alignment(self, dtype) -> None:
+        op = fmha.cutlass.FwOp
+        q, k, v, bias = self.create_tensors(dtype, Mq=7, Mkv=5)
+        try:
+            fmha.memory_efficient_attention(q, k, v, attn_bias=bias, op=(op, None))
+            return
+        except (ValueError, RuntimeError):
+            pass
+        # This case is not supported, likely due to padding issues
+        # Let's make sure it works with padding
+        assert bias.ndim == 4, bias.shape
+        bias_padded = self.pad_bias(bias)
+        out = fmha.memory_efficient_attention(
+            q, k, v, attn_bias=bias_padded, op=(op, None)
+        ).float()
+        ref_out = ref_attention_bmhk(q, k, v, bias)
+        assert_allclose(
+            out, ref_out, atol=op.ERROR_ATOL[dtype], rtol=op.ERROR_RTOL[dtype]
+        )
+
+    def test_permuted_attn_bias(self) -> None:
+        op = fmha.cutlass.FwOp
+        dtype = torch.float16
+        q, k, v, bias = self.create_tensors(dtype, Mq=7, Mkv=7)
+        bias = bias.transpose(-1, -2)  # now `stride(-1) != 1`
+        # Either it works, or it raises an exception
+        # but we should never get a CUDA error
+        try:
+            out = fmha.memory_efficient_attention(
+                q, k, v, attn_bias=bias, op=(op, None)
+            ).float()
+            ref_out = ref_attention_bmhk(q, k, v, bias)
+            assert_allclose(
+                out, ref_out, atol=op.ERROR_ATOL[dtype], rtol=op.ERROR_RTOL[dtype]
+            )
+        except (ValueError, RuntimeError):
+            pass
+
+
+SM_AND_SHMEM_KBYTES = [
+    # https://docs.nvidia.com/cuda/cuda-c-programming-guide/#features-and-technical-specifications-technical-specifications-per-compute-capability
+    (50, 64),
+    (60, 64),
+    (70, 96),
+    (75, 64),
+    (80, 163),
+    (86, 99),
+    (89, 99),
+    # (90, 227),
+]
+
+
+@cuda_only
+@pytest.mark.parametrize("dtype_str", ["f32", "f16", "bf16"])
+@pytest.mark.parametrize(
+    "sm_shmem",
+    SM_AND_SHMEM_KBYTES,
+    ids=[f"cc{sm}_shmem{shmem}kb" for sm, shmem in SM_AND_SHMEM_KBYTES],
+)
+def test_has_kernel_for(sm_shmem: Tuple[int, int], dtype_str: str) -> None:
+    dtype = {"f32": torch.float, "f16": torch.half, "bf16": torch.bfloat16}[dtype_str]
+    sm, shmem_kbytes = sm_shmem
+    if sm < 80 and dtype_str == "bf16":
+        return
+
+    for k in [16, 32, 64, 128, 256]:
+        assert torch.ops.xformers._has_cutlassF_kernel_for(
+            dtype, sm, shmem_kbytes * 1024, k
+        ), f"k={k}"
+        assert torch.ops.xformers._has_cutlassB_kernel_for(
+            dtype, sm, shmem_kbytes * 1024, k
+        ), f"k={k}"

From efecc7d3675ca213d215ffa4604cfe7f2eca7db2 Mon Sep 17 00:00:00 2001
From: Qianfeng Zhang <Qianfeng.Zhang@amd.com>
Date: Thu, 24 Aug 2023 20:53:36 +0000
Subject: [PATCH 031/641] Update in test_mem_eff_attention_ck.py to make
 test_forward passed all suitable cases

---
 tests/test_ck_3.py                 | 437 +++++++++++++++++++++++++++++
 tests/test_mem_eff_attention_ck.py |  73 ++---
 xformers/ops/fmha/ck.py            |  12 +-
 3 files changed, 456 insertions(+), 66 deletions(-)
 create mode 100644 tests/test_ck_3.py

diff --git a/tests/test_ck_3.py b/tests/test_ck_3.py
new file mode 100644
index 000000000..2c6e42860
--- /dev/null
+++ b/tests/test_ck_3.py
@@ -0,0 +1,437 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
+#
+# This source code is licensed under the BSD license found in the
+# LICENSE file in the root directory of this source tree.
+
+import math
+import random
+from typing import List, Optional, Sequence, Tuple, Type, TypeVar, Set, Any
+
+import pytest
+import torch
+
+## need to FIX
+##from scipy.stats import binomtest
+from torch.utils.checkpoint import checkpoint
+
+import xformers.ops
+from xformers.ops import fmha
+from xformers.ops.fmha.common import AttentionOpBase
+
+from tests.utils import assert_allclose
+
+torch.backends.cuda.matmul.allow_tf32 = False
+cuda_only = pytest.mark.skipif(not torch.cuda.is_available(), reason="requires CUDA")
+_devices = ["cpu", "cuda"] if torch.cuda.is_available() else ["cpu"]
+
+ALL_FW_OPS: Sequence[Type[fmha.common.AttentionFwOpBase]] = [
+    fmha.ck.FwOp,
+]
+
+T = TypeVar(
+    "T", Type[fmha.common.AttentionFwOpBase], Type[fmha.common.AttentionBwOpBase]
+)
+
+def ref_attention(q, k, v, attn_bias=None, drop_mask=None, p=0.0, scale=None):
+    if q.ndim == 4:
+        assert p == 0.0
+        return ref_attention_bmhk(q, k, v, attn_bias=attn_bias)
+    q = q.float()
+    k = k.float()
+    v = v.float()
+
+    scale = scale if scale is not None else (1 / q.shape[-1] ** 0.5)
+    q = q * scale
+
+    attn = q @ k.transpose(-2, -1)
+    if attn_bias is not None:
+        if isinstance(attn_bias, xformers.ops.AttentionBias):
+            # Always create in B,H,Mq,Mk format
+            attn_bias_tensor = attn_bias.materialize(
+                (q.shape[0], 1, q.shape[1], k.shape[1]),
+                device=q.device,
+                dtype=torch.float32,
+            )
+        else:
+            attn_bias_tensor = attn_bias
+        if attn_bias_tensor.ndim == 4:
+            assert q.shape[0] == attn_bias_tensor.shape[0] * attn_bias_tensor.shape[1]
+            attn_bias_tensor = attn_bias_tensor.reshape(
+                [-1, *attn_bias_tensor.shape[2:]]
+            )
+        attn = attn + attn_bias_tensor.float()
+    attn = attn.softmax(-1)
+    if drop_mask is not None:
+        attn = attn * (drop_mask / (1 - p))
+    return attn @ v
+
+
+def ref_attention_bmhk(q, k, v, attn_bias, scale=None) -> torch.Tensor:
+    assert q.ndim == 4
+
+    def T(t):
+        return t.permute((0, 2, 1, 3)).reshape(
+            [t.shape[0] * t.shape[2], t.shape[1], t.shape[3]]
+        )
+
+    if isinstance(attn_bias, xformers.ops.AttentionBias):
+        attn_bias = attn_bias.materialize(
+            (q.shape[0], q.shape[2], q.shape[1], k.shape[1]),
+            device=q.device,
+            dtype=torch.float32,
+        ).reshape([q.shape[0] * q.shape[2], q.shape[1], k.shape[1]])
+    out = ref_attention(T(q), T(k), T(v), attn_bias, scale=scale)
+    out = out.reshape([q.shape[0], q.shape[2], q.shape[1], v.shape[3]])
+    return out.permute((0, 2, 1, 3))
+
+
+def _rand_seqlens(
+    r: random.Random,
+    bs: int,
+    q_len: int,
+    kv_len: int,
+    more_keys_than_queries_per_block: bool,
+) -> Tuple[Sequence[int], Sequence[int]]:
+    """
+    Generates lists of lengths of query blocks and corresponding key blocks.
+    The total number of queries will be bs * q_len and the
+    total number of keys will be bs * kv_len.
+    """
+    if more_keys_than_queries_per_block:
+        assert kv_len >= q_len
+    q_len *= bs
+    kv_len *= bs
+    seqlens_q: List[int] = []
+    seqlens_k: List[int] = []
+
+    step_q = [max(1, q_len // 10), max(2, q_len // 2)]
+    step_k = [max(1, kv_len // 10), max(2, kv_len // 2)]
+    while sum(seqlens_q) < q_len and sum(seqlens_k) < kv_len:
+        num_queries = r.randrange(*step_q)
+        seqlens_q.append(num_queries)
+
+        if more_keys_than_queries_per_block:
+            # Must select at least `num_queries` keys
+            # But also leave enough keys for later
+            keys_left = kv_len - sum(seqlens_k, 0)
+            queries_left = q_len - sum(seqlens_q[:-1], 0)
+            assert keys_left >= queries_left
+            seqlens_k.append(num_queries + r.randrange(0, keys_left - queries_left))
+        else:
+            seqlens_k.append(r.randrange(*step_k))
+    seqlens_q[-1] = q_len - sum(seqlens_q[:-1])
+    seqlens_k[-1] = kv_len - sum(seqlens_k[:-1])
+    return seqlens_q, seqlens_k
+
+
+def _rand_maxed_partition(
+    r: random.Random, total: int, n: int, mx: int, positive: bool = True
+) -> List[int]:
+    # returns list of n nonnegative integers less than mx summing to total
+    # NB: This is unfortunately biased towards evenly-split bins.
+    # If `positive`, outputs are positive
+    if positive:
+        total -= n
+        mx -= 1
+    idxs = r.sample(range(n * mx), total)
+    y = torch.zeros(n, mx, dtype=torch.int32)
+    y.flatten()[idxs] = 1
+    z = y.sum(1)
+    if positive:
+        z += 1
+    return z.tolist()
+
+
+def _rand_seqlens_padded_k(
+    r: random.Random, bs: int, q_len: int, kv_len: int
+) -> Tuple[Sequence[int], Sequence[int]]:
+    # This is for BlockDiagonalCausalWithOffsetPaddedKeysMask.
+    # we need q_seqlens and k_seqlens to be of len bsz.
+    # For each "batch element" there must be more keys than queries
+    # because this bias type is "bottom right" and so any extra queries
+    # will attend to nothing and have undefined result.
+    # In addition every element of k_seqlens must be <= kv_len
+    if q_len > kv_len:
+        raise ValueError("need more keys than values")
+    if q_len == kv_len:
+        # all key slots are needed so we cannot have padding
+        q_seqlens = k_seqlens = [kv_len] * bs
+    else:
+        q_seqlens = _rand_maxed_partition(r, q_len * bs, bs, kv_len)
+        k_seqlens = [r.randint(i, kv_len) for i in q_seqlens]
+    return q_seqlens, k_seqlens
+
+
+def _create_aligned_bias(B: int, H: int, Mq: int, Mkv: int, **kwargs) -> torch.Tensor:
+    align_to = 8
+    return (
+        torch.randn(
+            (
+                B,
+                H,
+                Mq,
+                align_to * ((Mkv + align_to - 1) // align_to),
+            ),
+            **kwargs,
+        )
+        * 3
+    )[:, :, :, :Mkv]
+
+
+def create_attn_bias(
+    bias_type,
+    batch_size: int,
+    num_heads: int,
+    q_len: int,
+    kv_len: int,
+    device,
+    dtype,
+    requires_grad: bool,
+    fmt: str,
+    op: Type[AttentionOpBase],
+):
+    if bias_type is None or isinstance(None, bias_type):
+        return None
+    r = random.Random("-".join(map(str, [batch_size, q_len, kv_len, dtype, fmt])))
+    if bias_type is torch.Tensor:
+        if fmt == "BMK":
+            batch_size *= num_heads
+            num_heads = 1
+        ##`small_k` only supports an expanded 1d bias
+        if op in [fmha.small_k.FwOp, fmha.small_k.BwOp]:
+            attn_bias = (
+                torch.randn(
+                    (batch_size, num_heads, 1, kv_len), device=device, dtype=dtype
+                )
+                * 3
+            )
+            attn_bias = attn_bias.expand(batch_size, num_heads, q_len, kv_len)
+        else:
+            attn_bias = _create_aligned_bias(
+                batch_size,
+                num_heads,
+                q_len,
+                kv_len,
+                device=device,
+                dtype=dtype,
+            )
+
+            # ToDo: need a fix in ck-flashAttn to avoid divided-by-zero when all-(-inf) occurred
+            #       with the data read by one-thread
+            # make sure it also works if the first columns are partially masked out
+            # attn_bias[0, 0, q_len - 1 :, : num_heads - 2] = -math.inf
+
+        if requires_grad:
+            attn_bias.requires_grad_(True)
+        return attn_bias
+    if bias_type is fmha.attn_bias.LowerTriangularMask:
+        return fmha.attn_bias.LowerTriangularMask()
+    if bias_type is fmha.attn_bias.LowerTriangularMaskWithTensorBias:
+        attn_bias = _create_aligned_bias(
+            batch_size,
+            num_heads,
+            q_len,
+            kv_len,
+            device=device,
+            dtype=dtype,
+        )
+        if requires_grad:
+            attn_bias.requires_grad_(True)
+        return fmha.attn_bias.LowerTriangularMaskWithTensorBias(attn_bias)
+    if bias_type in [
+        fmha.attn_bias.BlockDiagonalMask,
+        fmha.attn_bias.BlockDiagonalCausalMask,
+        fmha.attn_bias.BlockDiagonalCausalFromBottomRightMask,
+    ]:
+        # This bias is not supported in BMK format
+        assert fmt == "BMHK"
+        block_diag = fmha.attn_bias.BlockDiagonalMask.from_seqlens(
+            *_rand_seqlens(
+                r,
+                batch_size,
+                q_len,
+                kv_len,
+                more_keys_than_queries_per_block=bias_type
+                is fmha.attn_bias.BlockDiagonalCausalFromBottomRightMask,
+            )
+        )
+        if bias_type is fmha.attn_bias.BlockDiagonalCausalMask:
+            block_diag = block_diag.make_causal()
+        if bias_type is fmha.attn_bias.BlockDiagonalCausalFromBottomRightMask:
+            block_diag = block_diag.make_causal_from_bottomright()
+        return block_diag
+    if bias_type == fmha.attn_bias.BlockDiagonalCausalWithOffsetPaddedKeysMask:
+        assert fmt == "BMHK"
+        q, k = _rand_seqlens_padded_k(r, batch_size, q_len, kv_len)
+        g_block_diag = (
+            fmha.attn_bias.BlockDiagonalCausalWithOffsetPaddedKeysMask.from_seqlens(
+                q_seqlen=q,
+                kv_padding=kv_len,
+                kv_seqlen=k,
+            )
+        )
+        return g_block_diag
+
+    assert False, f"Unsupported bias type: {bias_type}"
+
+def create_tensors(
+    op: Type[AttentionOpBase],
+    device,
+    dtype,
+    attn_bias_type,
+    B,
+    q_len,
+    kv_len,
+    h,
+    k,
+    kv,
+    *,
+    attn_bias_requires_grad: bool = False,
+    fmt: str = "BMK",
+):
+    torch.manual_seed(B * q_len + kv_len * k + kv)
+    scale = 3
+    if fmt == "BMK":
+        query = torch.randn((B * h, q_len, k), device=device, dtype=dtype).mul_(scale)
+        key = torch.randn((B * h, kv_len, k), device=device, dtype=dtype).mul_(scale)
+        value = torch.randn((B * h, kv_len, kv), device=device, dtype=dtype).mul_(scale)
+    else:
+        assert fmt == "BMHK"
+        query = torch.randn((B, q_len, h, k), device=device, dtype=dtype).mul_(scale)
+        key = torch.randn((B, kv_len, h, k), device=device, dtype=dtype).mul_(scale)
+        value = torch.randn((B, kv_len, h, kv), device=device, dtype=dtype).mul_(scale)
+
+    if fmt == "BMK" and not fmha.common._is_bias_type_supported_in_BMK(attn_bias_type):
+        attn_bias_type = None
+    attn_bias = None
+    if attn_bias_type is not None:
+        attn_bias = create_attn_bias(
+            attn_bias_type,
+            batch_size=B,
+            num_heads=h,
+            q_len=q_len,
+            kv_len=kv_len,
+            dtype=dtype,
+            device=device,
+            requires_grad=attn_bias_requires_grad,
+            fmt=fmt,
+            op=op,
+        )
+        if isinstance(
+            attn_bias,
+            (
+                fmha.attn_bias.BlockDiagonalMask,
+                fmha.attn_bias.BlockDiagonalCausalWithOffsetPaddedKeysMask,
+            ),
+        ):
+            query, key, value = [
+                x.reshape([1, -1, *x.shape[2:]]) for x in [query, key, value]
+            ]
+
+    inputs = fmha.Inputs(query=query, key=key, value=value, attn_bias=attn_bias)
+    reasons = op.not_supported_reasons(inputs)
+    if reasons:
+        err_msg = f"{op.NAME}: unsupported ({'/'.join(reasons)})"
+        # Ensure we free memory to avoid OOMs
+        del query, key, value, attn_bias, inputs
+        pytest.skip(err_msg)
+    return query, key, value, attn_bias
+
+## The same set of supported attn_bias types as defined by ck.FwOp
+SUPPORTED_ATTN_BIAS_TYPES: Set[Any] = {
+        type(None),
+        torch.Tensor,
+        fmha.attn_bias.LowerTriangularMask,
+        fmha.attn_bias.LowerTriangularMaskWithTensorBias,
+        fmha.attn_bias.BlockDiagonalMask,
+        fmha.attn_bias.BlockDiagonalCausalMask,
+        fmha.attn_bias.BlockDiagonalCausalWithOffsetPaddedKeysMask,
+        fmha.attn_bias.BlockDiagonalCausalFromBottomRightMask,
+        }
+
+@pytest.mark.parametrize("bias_type", SUPPORTED_ATTN_BIAS_TYPES)
+@pytest.mark.parametrize("packed", [False, True])
+@pytest.mark.parametrize("fmt", ["BMK", "BMHK"])
+@pytest.mark.parametrize("dtype", [torch.half, torch.bfloat16])
+def test_forward(dtype, fmt, packed, bias_type):
+    op = fmha.ck.FwOp
+    device = torch.device("cuda")
+    batch_size = 7
+    q_len = 200
+
+    ## BottomRightMask requires generate {m0,m1,...}, {n0,n1,...} where mi <= ni
+    if bias_type is fmha.attn_bias.BlockDiagonalCausalFromBottomRightMask:
+        kv_len = int(q_len * 1.2) 
+    else:
+        kv_len = q_len
+    h = 3 
+    k = 64 
+    kv = 64
+
+    if kv > 128:
+        pytest.skip("kv > 128 is not supported by CK-FlashAttention-1")
+
+    if packed and not (k == kv and q_len == kv_len):
+        pytest.skip(
+            f"packed incompatible with `k ({k}) != kv ({kv})` or `q_len ({q_len}) != kv_len ({kv_len})`"
+        )
+    if fmt == "BMK" and not fmha.common._is_bias_type_supported_in_BMK(bias_type):
+        pytest.skip("BMK incompatible with this bias")
+
+    ## packed type always creates the tensors in "BMHK" even the fmt is "BMK", so for packed type, one
+    ## should always assume h is already merged in B, and set h to be 1
+    if packed and fmt is "BMK" and batch_size > 1 and h > 1:
+        pytest.skip("Shape of this is type is skipped")
+
+    query, key, value, attn_bias = create_tensors(
+        op, device, dtype, bias_type, batch_size, q_len, kv_len, h, k, kv, fmt="BMHK" if packed else fmt
+    )
+
+    ## when packed, the query, key, value is in BMHK format
+    if packed:
+        c = torch.stack([query, key, value], 2)
+        if fmt == "BMK":
+            # bm3hk -> 3bhmk -> 3Bmk
+            c = c.permute(2, 0, 3, 1, 4).view([3, -1, q_len, k])
+            query, key, value = c[0], c[1], c[2]
+            # Re-create bias in the right format
+            attn_bias = create_attn_bias(
+                bias_type=bias_type,
+                batch_size=batch_size,
+                num_heads=h,
+                q_len=q_len,
+                kv_len=kv_len,
+                device=device,
+                dtype=dtype,
+                requires_grad=False,
+                fmt=fmt,
+                op=op,
+            )
+        else:
+            # bm3hk -> 3 x bmhk
+            query, key, value = xformers.ops.unbind(c, 2)
+
+        print("The query shaped for packed: ", query.size())
+        assert not query.is_contiguous()
+
+    out = xformers.ops.memory_efficient_attention_forward(
+        query, key, value, attn_bias, op=op
+    )
+    assert not out.isnan().any(), ("Output has NaNs", attn_bias)
+    out2 = xformers.ops.memory_efficient_attention_forward(
+        query, key, value, attn_bias, op=op
+    )
+    assert torch.allclose(out, out2, atol=0.0, rtol=0.0), (
+        "Non-deterministic behavior",
+        attn_bias,
+    )
+
+    ref = ref_attention(query, key, value, attn_bias)
+    assert out.shape == ref.shape, out.shape
+    assert_allclose(
+        out.float(),
+        ref,
+        atol=op.ERROR_ATOL[dtype],
+        rtol=op.ERROR_RTOL.get(dtype, 1e-5),
+    )
+
diff --git a/tests/test_mem_eff_attention_ck.py b/tests/test_mem_eff_attention_ck.py
index bd083cdb8..be0c355a3 100644
--- a/tests/test_mem_eff_attention_ck.py
+++ b/tests/test_mem_eff_attention_ck.py
@@ -20,13 +20,8 @@
 
 torch.backends.cuda.matmul.allow_tf32 = False
 cuda_only = pytest.mark.skipif(not torch.cuda.is_available(), reason="requires CUDA")
-compute_capability = (0, 0)
-if torch.cuda.is_available():
-    compute_capability = torch.cuda.get_device_capability("cuda")
-sm75_or_better_only = pytest.mark.skipif(
-    compute_capability < (7, 5), reason="requires sm75+"
-)
 _devices = ["cpu", "cuda"] if torch.cuda.is_available() else ["cpu"]
+_types = [torch.float16, torch.bfloat16]
 
 ALL_FW_OPS: Sequence[Type[fmha.common.AttentionFwOpBase]] = [
     fmha.ck.FwOp,
@@ -45,11 +40,7 @@ def _filter_unsupported_ops(ops: Sequence[T]) -> Sequence[T]:
     return [
         op
         for op in ops
-        if (
-            "cpu" in op.SUPPORTED_DEVICES
-            or op.CUDA_MINIMUM_COMPUTE_CAPABILITY <= compute_capability
-        )
-        and op.is_available()
+        if op.is_available()
     ]
 
 
@@ -101,9 +92,8 @@ def generate_test_shapes_B_Mq_Mkv_H_K_Kv(op):
             shapes.append((max(1, B // H), Mq, Mkv, H, K, K))
     # Add some random shapes
     if op in [
-        fmha.cutlass.FwOp,
-        fmha.cutlass.BwOp,
-        fmha.flash.BwOp,
+        fmha.ck.FwOp,
+        fmha.ck.BwOp,
     ]:
         K_CHOICES = [8 * i for i in range(1, 256 // 8)]
         r = random.Random(0)
@@ -557,7 +547,6 @@ def bmk2bmhk(tensor, num_heads: int) -> torch.Tensor:
         (0, 2, 1, 3)
     )
 
-
 @pytest.mark.parametrize("fmt", ["BMK", "BMHK"])
 @pytest.mark.parametrize("packed", [False, True])
 @parametrize_opFW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv
@@ -635,7 +624,7 @@ def test_forward(
         assert_allclose(
              out.float(),
              ref,
-             atol=2.5e-2,
+             atol=2.8e-2,
              rtol=1e-2,
         )
     else:
@@ -651,18 +640,22 @@ def test_forward(
 @pytest.mark.parametrize("batch_size", [1, 4])
 @pytest.mark.parametrize("kv_len", [128, 512])
 @pytest.mark.parametrize("q_len", [128, 512])
-@pytest.mark.parametrize("device", _devices)
-def test_key_query_all_ones(device, q_len, kv_len, batch_size, k_len):
+@pytest.mark.parametrize("device", [torch.device("cuda")])
+@pytest.mark.parametrize("test_type", _types)
+def test_key_query_all_ones(test_type, device, q_len, kv_len, batch_size, k_len):
     scale = 3
-    query = torch.ones((batch_size, q_len, k_len), device=device)
-    key = torch.ones((batch_size, kv_len, k_len), device=device)
-    value = torch.randn((batch_size, kv_len, k_len), device=device) * scale
+    query = torch.ones((batch_size, q_len, k_len), device=device, dtype=test_type)
+    key = torch.ones((batch_size, kv_len, k_len), device=device, dtype=test_type)
+    value = torch.randn((batch_size, kv_len, k_len), device=device, dtype=test_type) * scale
 
-    out = xformers.ops.memory_efficient_attention(query, key, value)
+    out = xformers.ops.memory_efficient_attention(query, key, value, op=(fmha.ck.FwOp, None))
     # this should be equivalent to the average over value
     ref = value.mean(1, keepdim=True).expand_as(query)
 
-    assert_allclose(out, ref, atol=1e-5)
+    if test_type is torch.float16:
+        assert_allclose(out, ref, atol=1e-5)
+    else:
+        assert_allclose(out, ref, atol=1e-2)
 
 
 def _block_diag_reshape_lse(
@@ -1026,18 +1019,6 @@ def _test_dropout_backward(q_len, kv_len, batch_size, k, p, op, dtype):
     )
 
 
-@cuda_only
-@pytest.mark.parametrize("p", [0.3, 0.7])
-@pytest.mark.parametrize("k", [5, 6, 32])
-@pytest.mark.parametrize("batch_size", [1, 2])
-@pytest.mark.parametrize("kv_len", [3, 15, 32, 33])
-@pytest.mark.parametrize("q_len", [2, 33])
-def test_dropout_backward_small_k(q_len, kv_len, batch_size, k, p):
-    _test_dropout_backward(
-        q_len, kv_len, batch_size, k, p, op=fmha.small_k.FwOp, dtype=torch.float32
-    )
-
-
 @cuda_only
 @pytest.mark.parametrize("p", [0.000001, 0.3, 0.7])
 @pytest.mark.parametrize("k", [16, 128, 256])
@@ -1045,14 +1026,14 @@ def test_dropout_backward_small_k(q_len, kv_len, batch_size, k, p):
 @pytest.mark.parametrize("kv_len", [3, 248, 256])
 @pytest.mark.parametrize("q_len", [3, 248, 256])
 @pytest.mark.parametrize("dt", ["f16", "bf16", "f32"])
-def test_dropout_backward_cutlass(dt, q_len, kv_len, batch_size, k, p):
+def test_dropout_backward_ck(dt, q_len, kv_len, batch_size, k, p):
     _test_dropout_backward(
         q_len,
         kv_len,
         batch_size,
         k,
         p,
-        op=fmha.cutlass.FwOp,
+        op=fmha.ck.FwOp,
         dtype={"f16": torch.float16, "bf16": torch.bfloat16, "f32": torch.float32}[dt],
     )
 
@@ -1388,24 +1369,6 @@ def test_unsupported_stride_alignment(op: Type[fmha.AttentionFwOpBase]):
         q = q.contiguous()
         fmha.memory_efficient_attention(q, q, q, op=(op, None))
 
-
-@sm75_or_better_only
-def test_unsupported_dropout_combine_flash_cutlass() -> None:
-    q = torch.empty(
-        [1, 4, 1, 16], device="cuda", dtype=torch.float16, requires_grad=True
-    )
-    with pytest.raises(ValueError):
-        out = fmha.memory_efficient_attention(
-            q, q, q, p=0.1, op=(fmha.cutlass.FwOp, fmha.flash.BwOp)
-        )
-        out.backward(out)
-    with pytest.raises(ValueError):
-        out = fmha.memory_efficient_attention(
-            q, q, q, p=0.1, op=(fmha.flash.FwOp, fmha.cutlass.BwOp)
-        )
-        out.backward(out)
-
-
 def test_attn_bias_causal() -> None:
     m = -math.inf
     causal_mask = torch.tensor([[0, m], [0, 0], [0, 0]])
diff --git a/xformers/ops/fmha/ck.py b/xformers/ops/fmha/ck.py
index 4bc21251d..f339b31e8 100644
--- a/xformers/ops/fmha/ck.py
+++ b/xformers/ops/fmha/ck.py
@@ -29,17 +29,7 @@
 )
 
 def _minimum_gemm_alignment(inp: Inputs) -> int:
-    if inp.device.type != "cuda":
-        return 1
-    bits_per_scalar = {torch.float: 32, torch.half: 16, torch.bfloat16: 16}[
-        inp.query.dtype
-    ]
-    ## for MI200/MI300 only
-    uses_tensorcores = True 
-    matmul_alignment_mn = 4 
-    if uses_tensorcores:
-        matmul_alignment_mn = max(matmul_alignment_mn, 128 // bits_per_scalar)
-    return matmul_alignment_mn
+    return 1 
 
 
 def _get_seqlen_info(

From da83285b53d325ca678d2af5e47174f8fa6083c6 Mon Sep 17 00:00:00 2001
From: Qianfeng Zhang <Qianfeng.Zhang@amd.com>
Date: Fri, 25 Aug 2023 16:57:35 +0000
Subject: [PATCH 032/641] Update test_mem_eff_attention_ck.py to make
 TestAttnBias/test_attn_bias_*/test_unsupported_xxx pass

---
 tests/test_mem_eff_attention_ck.py | 49 +++++-------------------------
 1 file changed, 8 insertions(+), 41 deletions(-)

diff --git a/tests/test_mem_eff_attention_ck.py b/tests/test_mem_eff_attention_ck.py
index be0c355a3..58f4c8696 100644
--- a/tests/test_mem_eff_attention_ck.py
+++ b/tests/test_mem_eff_attention_ck.py
@@ -1582,7 +1582,7 @@ def test_attn_bias_padded() -> None:
     output = output.transpose(1, 2).contiguous()
 
     fmha_output = fmha.memory_efficient_attention_forward(
-        q_cat, k, v, attn_bias, scale=1.0
+        q_cat, k, v, attn_bias, scale=1.0, op=fmha.ck.FwOp
     )
 
     # assert torch.allclose(output, fmha_output)
@@ -1624,7 +1624,7 @@ def test_attn_bias_blockdiag_doc() -> None:
     linear = torch.nn.Linear(K, K * 3).to(device=device, dtype=dtype)  # type: ignore
 
     q, k, v = linear(x).reshape([1, -1, 1, 3, K]).unbind(-2)
-    out = fmha.memory_efficient_attention(q, k, v, attn_bias=attn_bias)
+    out = fmha.memory_efficient_attention(q, k, v, attn_bias=attn_bias, op=(fmha.ck.FwOp, None))
     list_out = attn_bias.split(out)
     print(list_out[0].shape)  # [1, 3, 1, K]
     assert tuple(list_out[0].shape) == (1, 3, 1, K)
@@ -1659,21 +1659,22 @@ def pad_bias(bias: torch.Tensor) -> torch.Tensor:
 
     def test_f16_biasf32(self) -> None:
         q, k, v, bias = self.create_tensors(torch.float16)
-        fmha.memory_efficient_attention(q, k, v, attn_bias=bias)
+        fmha.memory_efficient_attention(q, k, v, attn_bias=bias, op=(fmha.ck.FwOp, None))
         bias = bias.to(torch.float32)
         with pytest.raises((ValueError, RuntimeError)):
-            fmha.memory_efficient_attention(q, k, v, attn_bias=bias)
+            fmha.memory_efficient_attention(q, k, v, attn_bias=bias, op=(fmha.ck.FwOp, None))
 
     def test_f32_biasf16(self) -> None:
+        pytest.skip("float32 is not supported currently by CK-FlashAttention-1")
         q, k, v, bias = self.create_tensors(torch.float32)
         fmha.memory_efficient_attention(q, k, v, attn_bias=bias)
         bias = bias.to(torch.float16)
         with pytest.raises((ValueError, RuntimeError)):
             fmha.memory_efficient_attention(q, k, v, attn_bias=bias)
 
-    @pytest.mark.parametrize("dtype", [torch.float32, torch.float16])
+    @pytest.mark.parametrize("dtype", [torch.float16])
     def test_wrong_alignment(self, dtype) -> None:
-        op = fmha.cutlass.FwOp
+        op = fmha.ck.FwOp
         q, k, v, bias = self.create_tensors(dtype, Mq=7, Mkv=5)
         try:
             fmha.memory_efficient_attention(q, k, v, attn_bias=bias, op=(op, None))
@@ -1693,7 +1694,7 @@ def test_wrong_alignment(self, dtype) -> None:
         )
 
     def test_permuted_attn_bias(self) -> None:
-        op = fmha.cutlass.FwOp
+        op = fmha.ck.FwOp
         dtype = torch.float16
         q, k, v, bias = self.create_tensors(dtype, Mq=7, Mkv=7)
         bias = bias.transpose(-1, -2)  # now `stride(-1) != 1`
@@ -1710,37 +1711,3 @@ def test_permuted_attn_bias(self) -> None:
         except (ValueError, RuntimeError):
             pass
 
-
-SM_AND_SHMEM_KBYTES = [
-    # https://docs.nvidia.com/cuda/cuda-c-programming-guide/#features-and-technical-specifications-technical-specifications-per-compute-capability
-    (50, 64),
-    (60, 64),
-    (70, 96),
-    (75, 64),
-    (80, 163),
-    (86, 99),
-    (89, 99),
-    # (90, 227),
-]
-
-
-@cuda_only
-@pytest.mark.parametrize("dtype_str", ["f32", "f16", "bf16"])
-@pytest.mark.parametrize(
-    "sm_shmem",
-    SM_AND_SHMEM_KBYTES,
-    ids=[f"cc{sm}_shmem{shmem}kb" for sm, shmem in SM_AND_SHMEM_KBYTES],
-)
-def test_has_kernel_for(sm_shmem: Tuple[int, int], dtype_str: str) -> None:
-    dtype = {"f32": torch.float, "f16": torch.half, "bf16": torch.bfloat16}[dtype_str]
-    sm, shmem_kbytes = sm_shmem
-    if sm < 80 and dtype_str == "bf16":
-        return
-
-    for k in [16, 32, 64, 128, 256]:
-        assert torch.ops.xformers._has_cutlassF_kernel_for(
-            dtype, sm, shmem_kbytes * 1024, k
-        ), f"k={k}"
-        assert torch.ops.xformers._has_cutlassB_kernel_for(
-            dtype, sm, shmem_kbytes * 1024, k
-        ), f"k={k}"

From 97bc5516788b6f8966c716502739655527e4a0c6 Mon Sep 17 00:00:00 2001
From: Qianfeng Zhang <Qianfeng.Zhang@amd.com>
Date: Fri, 25 Aug 2023 18:41:03 +0000
Subject: [PATCH 033/641] Update to backward related C++ codes

---
 xformers/csrc/attention/attention.cpp         |  2 +-
 .../hip_fmha/attention_backward_generic.cpp   | 68 ++++++++++---------
 .../hip_fmha/attention_forward_generic.cpp    |  1 +
 .../hip_fmha/ck_fmha_batched_backward.h       |  4 +-
 .../ck_fmha_batched_backward_bp16.cpp         | 29 ++++++--
 .../ck_fmha_batched_backward_fp16.cpp         | 29 ++++++--
 .../hip_fmha/ck_fmha_batched_forward.h        |  2 +-
 .../hip_fmha/ck_fmha_grouped_backward.h       |  4 +-
 .../ck_fmha_grouped_backward_bp16.cpp         | 30 ++++++--
 .../ck_fmha_grouped_backward_fp16.cpp         | 29 ++++++--
 .../hip_fmha/ck_fmha_grouped_forward.h        |  2 +-
 .../csrc/attention/hip_fmha/ck_fmha_util.h    |  2 +
 12 files changed, 136 insertions(+), 66 deletions(-)

diff --git a/xformers/csrc/attention/attention.cpp b/xformers/csrc/attention/attention.cpp
index ee0e07cc2..2bb528d11 100644
--- a/xformers/csrc/attention/attention.cpp
+++ b/xformers/csrc/attention/attention.cpp
@@ -36,5 +36,5 @@ TORCH_LIBRARY_FRAGMENT(xformers, m) {
   m.def(TORCH_SELECTIVE_SCHEMA(
       "xformers::efficient_attention_forward_ck(Tensor query, Tensor key, Tensor value, Tensor? attn_bias, Tensor? seqstart_q, Tensor? seqstart_k, float dropout_p, bool compute_logsumexp, int custom_mask_type, float? scale, Tensor? seqlen_k) -> (Tensor, Tensor, int, int)"));
   m.def(TORCH_SELECTIVE_SCHEMA(
-      "xformers::efficient_attention_backward_ck(Tensor grad_out, Tensor query, Tensor key, Tensor value, Tensor? bias, Tensor? seqstart_q, Tensor? seqstart_k, Tensor? seqlen_k, Tensor logsumexp, Tensor output, float dropout_p, int rng_seed, int rng_offset, int custom_mask_type, float? scale) -> (Tensor, Tensor, Tensor, Tensor)"));
+      "xformers::efficient_attention_backward_ck(Tensor grad_out, Tensor query, Tensor key, Tensor value, Tensor? attn_bias, Tensor? seqstart_q, Tensor? seqstart_k, Tensor? seqlen_k, Tensor logsumexp, Tensor output, float dropout_p, int rng_seed, int rng_offset, int custom_mask_type, float? scale) -> (Tensor, Tensor, Tensor, Tensor)"));
 }
diff --git a/xformers/csrc/attention/hip_fmha/attention_backward_generic.cpp b/xformers/csrc/attention/hip_fmha/attention_backward_generic.cpp
index 1e73be6e9..ce9ce08ce 100644
--- a/xformers/csrc/attention/hip_fmha/attention_backward_generic.cpp
+++ b/xformers/csrc/attention/hip_fmha/attention_backward_generic.cpp
@@ -165,6 +165,10 @@ efficient_attention_backward_ck(
         static_cast<int>(grad_out.stride(3))};
 
     if (bias.has_value()) {
+      CHECK_NOSPARSE_LASTCONTIGUOUS_CUDA((*bias));
+      TORCH_CHECK(bias->scalar_type() == query.scalar_type());
+
+      p.has_attn_bias = true;
       p.attn_bias_ptr = bias->data_ptr();
 
       const at::Tensor bias_4d_view =
@@ -235,6 +239,10 @@ efficient_attention_backward_ck(
         static_cast<int>(grad_out.stride(3))};
 
     if (bias.has_value()) {
+      CHECK_NOSPARSE_LASTCONTIGUOUS_CUDA((*bias));
+      TORCH_CHECK(bias->scalar_type() == query.scalar_type());
+
+      p.has_attn_bias = true; 
       const at::Tensor bias_4d_view =
           get_bias_4d_view(*bias, B, num_heads, M, N);
       p.attn_bias_strides = {
@@ -242,7 +250,9 @@ efficient_attention_backward_ck(
           static_cast<int>(bias_4d_view.stride(1)),
           static_cast<int>(bias_4d_view.stride(2)),
           static_cast<int>(bias_4d_view.stride(3))};
-    };
+    }
+    else 
+	p.has_attn_bias = false; 
 
     p.dropout_prob = static_cast<float>(dropout_p);
     p.rng_engine_inputs = rng_engine_inputs;
@@ -259,9 +269,6 @@ efficient_attention_backward_ck(
     p.host_seqstart_q.resize(p.num_batches + 1);
     p.host_seqstart_k.resize(p.num_batches + 1);
 
-    if (seqlen_k.has_value())
-      p.host_seqlen_k.resize(p.num_batches);
-
     FMHA_HIP_CHECK(hipMemcpy(
         p.host_seqstart_q.data(),
         seqstart_q->data_ptr(),
@@ -279,6 +286,21 @@ efficient_attention_backward_ck(
           p.num_batches * sizeof(int),
           hipMemcpyDeviceToHost));
 
+    if (seqlen_k.has_value()) {
+      TORCH_CHECK(seqlen_k->scalar_type() == at::ScalarType::Int);
+      TORCH_CHECK(seqlen_k->dim() == 1);
+      TORCH_CHECK(seqlen_k->size(0) == p.num_batches)
+      CHECK_NOSPARSE_CONTIGUOUS_CUDA((*seqlen_k));
+
+      p.host_seqlen_k.resize(p.num_batches);
+
+      FMHA_HIP_CHECK(hipMemcpy(
+          p.host_seqlen_k.data(),
+          seqlen_k->data_ptr(),
+          p.num_batches * sizeof(int32_t),
+          hipMemcpyDeviceToHost));
+    }
+
     char* q_ptr = reinterpret_cast<char*>(query.data_ptr());
     char* k_ptr = reinterpret_cast<char*>(key.data_ptr());
     char* v_ptr = reinterpret_cast<char*>(value.data_ptr());
@@ -312,26 +334,14 @@ efficient_attention_backward_ck(
               p.host_seqstart_k[i] * p.randvals_strides[2],
           randvals.scalar_type());
 
-      p.q_ptrs.push_back(reinterpret_cast<void*>(q_ptr));
-      p.grad_q_ptrs.push_back(reinterpret_cast<void*>(grad_q_ptr));
-
-      q_ptr = q_ptr + tmp_q_stride;
-      grad_q_ptr = grad_q_ptr + tmp_q_stride;
-
-      p.k_ptrs.push_back(reinterpret_cast<void*>(k_ptr));
-      p.grad_k_ptrs.push_back(reinterpret_cast<void*>(grad_k_ptr));
-      k_ptr = k_ptr + tmp_k_stride;
-      grad_k_ptr = grad_k_ptr + tmp_k_stride;
-
-      p.v_ptrs.push_back(reinterpret_cast<void*>(v_ptr));
-      p.grad_v_ptrs.push_back(reinterpret_cast<void*>(grad_v_ptr));
-      v_ptr = v_ptr + tmp_k_stride;
-      grad_v_ptr = grad_v_ptr + tmp_k_stride;
-
-      p.out_ptrs.push_back(reinterpret_cast<void*>(out_ptr));
-      p.grad_out_ptrs.push_back(reinterpret_cast<void*>(grad_out_ptr));
-      out_ptr = out_ptr + tmp_o_stride;
-      grad_out_ptr = grad_out_ptr + tmp_o_stride;
+      p.q_ptrs.push_back(reinterpret_cast<void*>(&q_ptr[tmp_q_stride]));
+      p.grad_q_ptrs.push_back(reinterpret_cast<void*>(&grad_q_ptr[tmp_q_stride]));
+      p.k_ptrs.push_back(reinterpret_cast<void*>(&k_ptr[tmp_k_stride]));
+      p.grad_k_ptrs.push_back(reinterpret_cast<void*>(&grad_k_ptr[tmp_k_stride]));
+      p.v_ptrs.push_back(reinterpret_cast<void*>(&v_ptr[tmp_v_stride]));
+      p.grad_v_ptrs.push_back(reinterpret_cast<void*>(&grad_v_ptr[tmp_v_stride]));
+      p.out_ptrs.push_back(reinterpret_cast<void*>(&out_ptr[tmp_o_stride]));
+      p.grad_out_ptrs.push_back(reinterpret_cast<void*>(&grad_out_ptr[tmp_grad_o_stride]));
 
       if (bias.has_value()) {
         int32_t tmp_bias_stride = get_size_in_bytes(
@@ -339,15 +349,11 @@ efficient_attention_backward_ck(
                 p.host_seqstart_k[i] * p.attn_bias_strides[3],
             bias->scalar_type());
 
-        p.attn_bias_ptrs.push_back(reinterpret_cast<void*>(attn_bias_ptr));
-        attn_bias_ptr = attn_bias_ptr + tmp_bias_stride;
+        p.attn_bias_ptrs.push_back(reinterpret_cast<void*>(&attn_bias_ptr[tmp_bias_stride]));
       };
 
-      p.logsumexp_ptrs.push_back(reinterpret_cast<void*>(logsumexp_ptr));
-      logsumexp_ptr = logsumexp_ptr + tmp_logsumexp_stride;
-
-      p.randvals_ptrs.push_back(reinterpret_cast<void*>(randvals_ptr));
-      randvals_ptr = randvals_ptr + tmp_randvals_stride;
+      p.logsumexp_ptrs.push_back(reinterpret_cast<void*>(&logsumexp_ptr[tmp_logsumexp_stride]));
+      p.randvals_ptrs.push_back(reinterpret_cast<void*>(&randvals_ptr[tmp_randvals_stride]));
     }
   };
 
diff --git a/xformers/csrc/attention/hip_fmha/attention_forward_generic.cpp b/xformers/csrc/attention/hip_fmha/attention_forward_generic.cpp
index 652ef8092..6367cb517 100644
--- a/xformers/csrc/attention/hip_fmha/attention_forward_generic.cpp
+++ b/xformers/csrc/attention/hip_fmha/attention_forward_generic.cpp
@@ -235,6 +235,7 @@ efficient_attention_forward_ck(
         static_cast<int>(out.stride(3))};
 
     if (bias.has_value()) {
+      CHECK_NOSPARSE_LASTCONTIGUOUS_CUDA((*bias));
       TORCH_CHECK(bias->scalar_type() == query.scalar_type());
 
       p.has_attn_bias = true;
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_backward.h b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_backward.h
index 1b14c772f..4ab846563 100644
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_backward.h
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_backward.h
@@ -11,8 +11,8 @@
 
 #include "ck_fmha_util.h"
 
-template <typename scalar_t, int32_t custom_mask_type = 0>
-void batched_backward_mask_type_dispatched(
+template <typename scalar_t, int32_t custom_mask_type, bool has_attn_bias>
+void batched_backward_masktype_attnbias_dispatched(
     BatchedBackwardParams& param,
     hipStream_t stream) {
   using PassThrough = ck::tensor_operation::element_wise::PassThrough;
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_backward_bp16.cpp b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_backward_bp16.cpp
index 69b1e5065..9d55a2d6e 100644
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_backward_bp16.cpp
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_backward_bp16.cpp
@@ -4,12 +4,27 @@
 #include "ck_fmha_batched_backward.h"
 
 void batched_backward_bp16(BatchedBackwardParams& param, hipStream_t stream) {
-  if (param.custom_mask_type == 0)
-    batched_backward_mask_type_dispatched<ck::bhalf_t, 0>(param, stream);
-  else if (param.custom_mask_type == 1)
-    batched_backward_mask_type_dispatched<ck::bhalf_t, 1>(param, stream);
-  else if (param.custom_mask_type == 2)
-    batched_backward_mask_type_dispatched<ck::bhalf_t, 2>(param, stream);
-  else
+  if (param.custom_mask_type == 0) {
+    if (param.has_attn_bias)
+      batched_backward_masktype_attnbias_dispatched<ck::bhalf_t, 0, true>(
+          param, stream);
+    else
+      batched_backward_masktype_attnbias_dispatched<ck::bhalf_t, 0, false>(
+          param, stream);
+  } else if (param.custom_mask_type == 1) {
+    if (param.has_attn_bias)
+      batched_backward_masktype_attnbias_dispatched<ck::bhalf_t, 1, true>(
+          param, stream);
+    else
+      batched_backward_masktype_attnbias_dispatched<ck::bhalf_t, 1, false>(
+          param, stream);
+  } else if (param.custom_mask_type == 2) {
+    if (param.has_attn_bias)
+      batched_backward_masktype_attnbias_dispatched<ck::bhalf_t, 2, true>(
+          param, stream);
+    else
+      batched_backward_masktype_attnbias_dispatched<ck::bhalf_t, 2, false>(
+          param, stream);
+  } else
     throw std::runtime_error("Invalid custom_mask_type value");
 };
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_backward_fp16.cpp b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_backward_fp16.cpp
index 273a2ee06..77dd96de4 100644
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_backward_fp16.cpp
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_backward_fp16.cpp
@@ -4,12 +4,27 @@
 #include "ck_fmha_batched_backward.h"
 
 void batched_backward_fp16(BatchedBackwardParams& param, hipStream_t stream) {
-  if (param.custom_mask_type == 0)
-    batched_backward_mask_type_dispatched<ck::half_t, 0>(param, stream);
-  else if (param.custom_mask_type == 1)
-    batched_backward_mask_type_dispatched<ck::half_t, 1>(param, stream);
-  else if (param.custom_mask_type == 2)
-    batched_backward_mask_type_dispatched<ck::half_t, 2>(param, stream);
-  else
+  if (param.custom_mask_type == 0) {
+    if (param.has_attn_bias)
+      batched_backward_masktype_attnbias_dispatched<ck::half_t, 0, true>(
+          param, stream);
+    else
+      batched_backward_masktype_attnbias_dispatched<ck::half_t, 0, false>(
+          param, stream);
+  } else if (param.custom_mask_type == 1) {
+    if (param.has_attn_bias)
+      batched_backward_masktype_attnbias_dispatched<ck::half_t, 1, true>(
+          param, stream);
+    else
+      batched_backward_masktype_attnbias_dispatched<ck::half_t, 1, false>(
+          param, stream);
+  } else if (param.custom_mask_type == 2) {
+    if (param.has_attn_bias)
+      batched_backward_masktype_attnbias_dispatched<ck::half_t, 2, true>(
+          param, stream);
+    else
+      batched_backward_masktype_attnbias_dispatched<ck::half_t, 2, false>(
+          param, stream);
+  } else
     throw std::runtime_error("Invalid custom_mask_type value");
 };
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_forward.h b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_forward.h
index e8ce9302a..b2daa90c2 100644
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_forward.h
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_forward.h
@@ -11,7 +11,7 @@
 
 #include "ck_fmha_util.h"
 
-template <typename scalar_t, int32_t custom_mask_type = 0, bool has_attn_bias>
+template <typename scalar_t, int32_t custom_mask_type, bool has_attn_bias>
 void batched_forward_masktype_attnbias_dispatched(
     BatchedForwardParams& param,
     hipStream_t stream) {
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_backward.h b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_backward.h
index bd86d7c32..1bba8b678 100644
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_backward.h
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_backward.h
@@ -13,8 +13,8 @@
 
 #include "ck_fmha_util.h"
 
-template <typename scalar_t, int32_t custom_mask_type = 0>
-void grouped_backward_mask_type_dispatched(
+template <typename scalar_t, int32_t custom_mask_type, bool has_attn_bias>
+void grouped_backward_masktype_attnbias_dispatched(
     GroupedBackwardParams& param,
     hipStream_t stream) {
   using PassThrough = ck::tensor_operation::element_wise::PassThrough;
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_backward_bp16.cpp b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_backward_bp16.cpp
index 3c76d137d..dbee4f9e0 100644
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_backward_bp16.cpp
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_backward_bp16.cpp
@@ -4,12 +4,28 @@
 #include "ck_fmha_grouped_backward.h"
 
 void grouped_backward_bp16(GroupedBackwardParams& param, hipStream_t stream) {
-  if (param.custom_mask_type == 0)
-    grouped_backward_mask_type_dispatched<ck::bhalf_t, 0>(param, stream);
-  else if (param.custom_mask_type == 1)
-    grouped_backward_mask_type_dispatched<ck::bhalf_t, 1>(param, stream);
-  else if (param.custom_mask_type == 2)
-    grouped_backward_mask_type_dispatched<ck::bhalf_t, 2>(param, stream);
-  else
+  if (param.custom_mask_type == 0) {
+    if (param.has_attn_bias)
+      grouped_backward_masktype_attnbias_dispatched<ck::bhalf_t, 0, true>(
+          param, stream);
+    else
+      grouped_backward_masktype_attnbias_dispatched<ck::bhalf_t, 0, false>(
+          param, stream);
+  } else if (param.custom_mask_type == 1) {
+    if (param.has_attn_bias)
+      grouped_backward_masktype_attnbias_dispatched<ck::bhalf_t, 1, true>(
+          param, stream);
+    else
+      grouped_backward_masktype_attnbias_dispatched<ck::bhalf_t, 1, false>(
+          param, stream);
+
+  } else if (param.custom_mask_type == 2) {
+    if (param.has_attn_bias)
+      grouped_backward_masktype_attnbias_dispatched<ck::bhalf_t, 2, true>(
+          param, stream);
+    else
+      grouped_backward_masktype_attnbias_dispatched<ck::bhalf_t, 2, true>(
+          param, stream);
+  } else
     throw std::runtime_error("Invalid custom_mask_type value");
 };
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_backward_fp16.cpp b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_backward_fp16.cpp
index 912023ca4..dd0c0f1b8 100644
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_backward_fp16.cpp
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_backward_fp16.cpp
@@ -4,12 +4,27 @@
 #include "ck_fmha_grouped_backward.h"
 
 void grouped_backward_fp16(GroupedBackwardParams& param, hipStream_t stream) {
-  if (param.custom_mask_type == 0)
-    grouped_backward_mask_type_dispatched<ck::half_t, 0>(param, stream);
-  else if (param.custom_mask_type == 1)
-    grouped_backward_mask_type_dispatched<ck::half_t, 1>(param, stream);
-  else if (param.custom_mask_type == 2)
-    grouped_backward_mask_type_dispatched<ck::half_t, 2>(param, stream);
-  else
+  if (param.custom_mask_type == 0) {
+    if (param.has_attn_bias)
+      grouped_backward_masktype_attnbias_dispatched<ck::half_t, 0, true>(
+          param, stream);
+    else
+      grouped_backward_masktype_attnbias_dispatched<ck::half_t, 0, false>(
+          param, stream);
+  } else if (param.custom_mask_type == 1) {
+    if (param.has_attn_bias)
+      grouped_backward_masktype_attnbias_dispatched<ck::half_t, 1, true>(
+          param, stream);
+    else
+      grouped_backward_masktype_attnbias_dispatched<ck::half_t, 1, false>(
+          param, stream);
+  } else if (param.custom_mask_type == 2) {
+    if (param.has_attn_bias)
+      grouped_backward_masktype_attnbias_dispatched<ck::half_t, 2, true>(
+          param, stream);
+    else
+      grouped_backward_masktype_attnbias_dispatched<ck::half_t, 2, false>(
+          param, stream);
+  } else
     throw std::runtime_error("Invalid custom_mask_type value");
 };
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_forward.h b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_forward.h
index 91e16df74..4f3d9a985 100644
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_forward.h
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_forward.h
@@ -12,7 +12,7 @@
 
 #include "ck_fmha_util.h"
 
-template <typename scalar_t, int32_t custom_mask_type = 0, bool has_attn_bias>
+template <typename scalar_t, int32_t custom_mask_type, bool has_attn_bias>
 void grouped_forward_masktype_attnbias_dispatched(
     GroupedForwardParams& param,
     hipStream_t stream) {
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_util.h b/xformers/csrc/attention/hip_fmha/ck_fmha_util.h
index 0aed26cf9..9ce11c399 100644
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_util.h
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_util.h
@@ -279,6 +279,7 @@ struct BatchedBackwardParams {
   int Kv; // embed_dim for Value
 
   float scale;
+  bool has_attn_bias; 
 
   // BMHK mode strides, last-dim contiguous
   std::array<int, 4> q_strides;
@@ -331,6 +332,7 @@ struct GroupedBackwardParams {
   std::vector<int> host_seqlen_k;
 
   float scale;
+  bool has_attn_bias; 
 
   // MHK mode strides, last-dim contiguous
   std::array<int, 3> q_strides;

From 8d4024ce67786eebe39ac5136d77c35af38f3feb Mon Sep 17 00:00:00 2001
From: Qianfeng Zhang <Qianfeng.Zhang@amd.com>
Date: Tue, 29 Aug 2023 09:08:09 +0000
Subject: [PATCH 034/641] Update to test_mem_eff_attention_ck.py

---
 tests/test_mem_eff_attention_ck.py | 4 +---
 xformers/ops/fmha/ck.py            | 7 ++++---
 2 files changed, 5 insertions(+), 6 deletions(-)

diff --git a/tests/test_mem_eff_attention_ck.py b/tests/test_mem_eff_attention_ck.py
index 58f4c8696..ab3e2826a 100644
--- a/tests/test_mem_eff_attention_ck.py
+++ b/tests/test_mem_eff_attention_ck.py
@@ -9,7 +9,7 @@
 
 import pytest
 import torch
-##from scipy.stats import binomtest
+from scipy.stats import binomtest
 from torch.utils.checkpoint import checkpoint
 
 import xformers.ops
@@ -894,7 +894,6 @@ def _get_drop_mask(op, batch_size, q_len, kv_len, p, device):
 
 
 ### disable this test due to the un-availability of binomtest
-'''
 @cuda_only
 @pytest.mark.parametrize("attn_bias", [None, fmha.attn_bias.LowerTriangularMask()])
 @pytest.mark.parametrize("seed", [42, 124])
@@ -946,7 +945,6 @@ def test_dropout(op, q_len, kv_len, batch_size, k_len, p, seed, attn_bias):
     masks = masks.sum(0).flatten()
     p_values = _vec_binom_test(masks, num_trials, p=keep_prob)
     assert all(p_values > p_val_tol)
-'''
 
 def _test_dropout_backward(q_len, kv_len, batch_size, k, p, op, dtype):
     if dtype is torch.bfloat16 and compute_capability < (8, 0):
diff --git a/xformers/ops/fmha/ck.py b/xformers/ops/fmha/ck.py
index f339b31e8..d4e03238e 100644
--- a/xformers/ops/fmha/ck.py
+++ b/xformers/ops/fmha/ck.py
@@ -330,9 +330,10 @@ def apply(cls, ctx: Context, inp: Inputs, grad: torch.Tensor) -> Gradients:
             inp.query,
             inp.key,
             inp.value,
-            _get_tensor_bias(inp.attn_bias),
-            cu_seqlens_q=seqstart_q,
-            cu_seqlens_k=seqstart_k,
+            attn_bias=_get_tensor_bias(inp.attn_bias),
+            seqstart_q=seqstart_q,
+            seqstart_k=seqstart_k,
+            seqlen_k=None,
             logsumexp=ctx.get_padded_lse(32, force_pad_inf=force_pad_inf),
             output=ctx.out.to(dtype),
             dropout_p=inp.p,

From 1fd480a499f446ec477939b5a169a6ca3a82f74f Mon Sep 17 00:00:00 2001
From: Qianfeng Zhang <Qianfeng.Zhang@amd.com>
Date: Tue, 29 Aug 2023 12:40:11 +0000
Subject: [PATCH 035/641] Move the change in test_forward testing threshold to
 xformers/ops/fmha/ck.py

---
 tests/test_mem_eff_attention_ck.py | 10 +---------
 xformers/ops/fmha/ck.py            |  2 +-
 2 files changed, 2 insertions(+), 10 deletions(-)

diff --git a/tests/test_mem_eff_attention_ck.py b/tests/test_mem_eff_attention_ck.py
index ab3e2826a..a7bddf41b 100644
--- a/tests/test_mem_eff_attention_ck.py
+++ b/tests/test_mem_eff_attention_ck.py
@@ -620,15 +620,7 @@ def test_forward(
 
     ref = ref_attention(query, key, value, attn_bias)
     assert out.shape == ref.shape, out.shape
-    if dtype is torch.bfloat16:
-        assert_allclose(
-             out.float(),
-             ref,
-             atol=2.8e-2,
-             rtol=1e-2,
-        )
-    else:
-        assert_allclose(
+    assert_allclose(
              out.float(),
              ref,
              atol=op.ERROR_ATOL[dtype],
diff --git a/xformers/ops/fmha/ck.py b/xformers/ops/fmha/ck.py
index d4e03238e..f11762422 100644
--- a/xformers/ops/fmha/ck.py
+++ b/xformers/ops/fmha/ck.py
@@ -150,7 +150,7 @@ class FwOp(AttentionFwOpBase):
     ERROR_ATOL: Mapping[torch.dtype, float] = {
         torch.float: 3e-4,
         torch.half: 4e-3,
-        torch.bfloat16: 2e-2,
+        torch.bfloat16: 2.8e-2,
     }
     ERROR_RTOL: Mapping[torch.dtype, float] = {
         torch.float: 2e-5,

From e12ebafb4968b34ce0f11e89399b3f518f281558 Mon Sep 17 00:00:00 2001
From: Qianfeng Zhang <Qianfeng.Zhang@amd.com>
Date: Wed, 30 Aug 2023 12:32:01 +0000
Subject: [PATCH 036/641] Update to test_mem_eff_attention_ck and
 readme_test_on_rocm.txt

---
 tests/readme_test_on_rocm.txt      | 21 +++++++++++++++++++++
 tests/test_mem_eff_attention_ck.py | 16 ++++++++--------
 2 files changed, 29 insertions(+), 8 deletions(-)

diff --git a/tests/readme_test_on_rocm.txt b/tests/readme_test_on_rocm.txt
index 5b5ce25aa..392a2a427 100644
--- a/tests/readme_test_on_rocm.txt
+++ b/tests/readme_test_on_rocm.txt
@@ -5,4 +5,25 @@
 
       pytest -k test_forward tests/test_mem_eff_attention_ck.py
 
+   3. The following tests in tests/memory_eff_attention_ck.py have passed
+
+      * test_forward
+      * test_key_query_all_ones
+      * test_logsumexp
+      * test_attn_bias 
+        - test_attn_bias_causal
+        - test_attn_bias_torch_tensor 
+        - test_attn_bias_blockdiag
+        - test_attn_bias_blockdiag_batched
+        - test_attn_bias_blockdiag_crossattn_causal
+        - test_attn_bias_blockdiag_crossattn_causal_with_prefix_qk_cond
+        - test_attn_bias_blockdiag_crossattn_causal_with_prefix()
+        - test_attn_bias_padded
+        - test_attn_bias_from_seqlens
+        - test_attn_bias_blockdiag_doc
+      * test_unsupported_cpu
+      * test_unsupported_stride_lastdim
+      * test_unsupported_stride_alignment
+      * test_cuda_streams
+
 
diff --git a/tests/test_mem_eff_attention_ck.py b/tests/test_mem_eff_attention_ck.py
index a7bddf41b..228ab0971 100644
--- a/tests/test_mem_eff_attention_ck.py
+++ b/tests/test_mem_eff_attention_ck.py
@@ -738,8 +738,8 @@ def test_backward(
             fmha.Inputs(query=query, key=key, value=value, attn_bias=attn_bias),
             seed=q_len * kv + kv_len * k,
         )
-        if op_bw != fmha.cutlass.BwOp
-        else fmha.cutlass.FwOp
+        if op_bw != fmha.ck.BwOp
+        else fmha.ck.FwOp
     )
     qkv = None
 
@@ -773,7 +773,7 @@ def test_backward(
 
     out.backward(grad_out)
 
-    if qkv is None and op_bw == fmha.cutlass.BwOp:
+    if qkv is None and op_bw == fmha.ck.BwOp:
         assert query.stride() == query.grad.stride()
 
     grads = []
@@ -873,7 +873,7 @@ def _vec_binom_test(x, n, p):
 
 
 def _get_drop_mask(op, batch_size, q_len, kv_len, p, device):
-    if op == fmha.cutlass.FwOp:
+    if op == fmha.ck.FwOp:
         mask = torch.empty((batch_size, 1, q_len, kv_len), device=device)
         rand_uniform = torch.ops.xformers._cutlass_rand_uniform(p, mask)
         mask = (rand_uniform > p).to(torch.float32)
@@ -1097,11 +1097,11 @@ def test_lowlevel_api_shapes(opBW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv, fmt):
     value.requires_grad_(True)
 
     out, lse = xformers.ops.memory_efficient_attention_forward_requires_grad(
-        query, key, value, attn_bias
+        query, key, value, attn_bias, op=fmha.ck.FwOp
     )
     assert out.ndim == query.ndim
     dq, dk, dv = xformers.ops.memory_efficient_attention_backward(
-        grad_out, out, lse, query, key, value, attn_bias
+        grad_out, out, lse, query, key, value, attn_bias, op=fmha.ck.BwOp
     )
     assert dq.shape == query.shape
     assert dk.shape == key.shape
@@ -1579,8 +1579,8 @@ def test_attn_bias_padded() -> None:
     assert_allclose(
         output,
         fmha_output,
-        atol=fmha.cutlass.FwOp.ERROR_ATOL[torch.float16],
-        rtol=fmha.cutlass.FwOp.ERROR_RTOL[torch.float16],
+        atol=fmha.ck.FwOp.ERROR_ATOL[torch.float16],
+        rtol=fmha.ck.FwOp.ERROR_RTOL[torch.float16],
     )
 
 
From 9907061d588078622bb194d1e2e56a0a0ab1eec7 Mon Sep 17 00:00:00 2001
From: Qianfeng Zhang <Qianfeng.Zhang@amd.com>
Date: Wed, 30 Aug 2023 16:24:26 +0000
Subject: [PATCH 037/641] Update C++ extension to add bias support for backward
 due to enabled by ck-flashAttn

---
 third_party/composable_kernel                 |  2 +-
 .../hip_fmha/ck_fmha_batched_backward.h       | 86 ++++++++++++-------
 .../hip_fmha/ck_fmha_grouped_backward.h       | 86 ++++++++++++-------
 3 files changed, 109 insertions(+), 65 deletions(-)

diff --git a/third_party/composable_kernel b/third_party/composable_kernel
index 226355e7e..4c8b47c04 160000
--- a/third_party/composable_kernel
+++ b/third_party/composable_kernel
@@ -1 +1 @@
-Subproject commit 226355e7e885881cdd904aec4df872fedb5447cd
+Subproject commit 4c8b47c04d8fe9d3e7074bf207590eee833fa51f
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_backward.h b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_backward.h
index 4ab846563..9c2466214 100644
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_backward.h
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_backward.h
@@ -5,9 +5,9 @@
 
 #include <ck.hpp>
 #include <ck/tensor_operation/gpu/device/gemm_specialization.hpp>
-#include <ck/tensor_operation/gpu/device/impl/device_batched_mha_bwd_xdl_cshuffle_kloop_v1.hpp>
 #include <ck/tensor_operation/gpu/device/tensor_specialization.hpp>
 #include <ck/tensor_operation/gpu/element/element_wise_operation.hpp>
+#include "ck/tensor_operation/gpu/device/impl/device_batched_mha_bwd_xdl_cshuffle_qloop_v2.hpp"
 
 #include "ck_fmha_util.h"
 
@@ -28,8 +28,9 @@ void batched_backward_masktype_attnbias_dispatched(
   using ShuffleDataType = F32;
   using LSEDataType = F32;
   using ZDataType = unsigned short;
-  using Acc0BiasDataType = ck::Tuple<>;
-  using Acc1BiasDataType = ck::Tuple<>;
+  using Acc0BiasDataType =
+      typename std::conditional<has_attn_bias, scalar_t, void>::type;
+  using Acc1BiasDataType = void;
 
   static constexpr ck::index_t NumDimG = 2;
   static constexpr ck::index_t NumDimM = 1;
@@ -56,8 +57,13 @@ void batched_backward_masktype_attnbias_dispatched(
       ck::tensor_operation::device::TensorSpecialization::Default;
   static constexpr bool Deterministic = false;
 
+  // Tunables
+  static constexpr ck::index_t ABBlockTransferSrcScalarPerVector = 1;
+  static constexpr ck::index_t B1CShuffleBlockTransferScalarPerVector = 1;
+  static constexpr ck::index_t Acc0BiasTransferSrcScalarPerVector = 1;
+
   using DeviceOpInstance = ck::tensor_operation::device::
-      DeviceBatchedMultiheadAttentionBackward_Kloop_Xdl_CShuffle_V1<
+      DeviceBatchedMultiheadAttentionBackward_Qloop_Xdl_CShuffle_V2<
           NumDimG,
           NumDimM,
           NumDimN,
@@ -84,42 +90,47 @@ void batched_backward_masktype_attnbias_dispatched(
           TensorSpecY,
           1,
           256,
-          128, // MPerBlock
+          64, // MPerBlock
           128, // NPerBlock
-          64, // KPerBlock
-          64, // Gemm1NPerBlock
+          128, // KPerBlock
+          128, // Gemm1NPerBlock
           32, // Gemm1KPerBlock
           8, // AK1
           8, // BK1
-          2, // B1K1
+          2, // A1K1
           32, // MPerXDL
           32, // NPerXDL
-          1, // MXdlPerWave
-          4, // NXdlPerWave
-          2, // Gemm1NXdlPerWave
-          2, // Gemm2NXdlPerWave
+          2, // MXdlPerWave
+          1, // NXdlPerWave
+          4, // Gemm1NXdlPerWave
+          1, // Gemm2NXdlPerWave
           S<4, 64, 1>, // ABlockTransfer
           S<1, 0, 2>,
           S<1, 0, 2>,
           2,
-          8,
+          ABBlockTransferSrcScalarPerVector, // TUNABLE
           8,
           true,
-          S<4, 64, 1>, // BBlockTransfer
+          S<4, 64, 1>, // B0BlockTransfer
           S<1, 0, 2>,
           S<1, 0, 2>,
           2,
-          8,
+          ABBlockTransferSrcScalarPerVector, // TUNABLE
           8,
           true,
+          Acc0BiasTransferSrcScalarPerVector, // TUNABLE
+          S<8, 32, 1>, // B1BlockTransfer
+          S<0, 2, 1>,
+          S<0, 2, 1>,
+          1,
+          B1CShuffleBlockTransferScalarPerVector, // TUNABLE
+          2,
+          false,
           1, // CShuffleMXdlPerWavePerShuffle
-          2, // CShuffleNXdlPerWavePerShuffle
-          S<1,
-            32,
-            1,
-            8>, // CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock
-          CShuffleBlockTransferScalarPerVector_NPerBlock, // CShuffleBlockTransferScalarPerVector_NPerBlock
-          MaskingSpec, // MaskingSpecialization
+          4, // CShuffleNXdlPerWavePerShuffle
+          S<1, 32, 1, 8>,
+          B1CShuffleBlockTransferScalarPerVector, // TUNABLE
+          MaskingSpec,
           Deterministic>;
 
   std::vector<ck::index_t> q_gs_ms_ks_lengths{
@@ -167,6 +178,21 @@ void batched_backward_masktype_attnbias_dispatched(
 
   std::vector<ck::index_t> lse_gs_ms_lengths{param.B, param.num_heads, param.M};
 
+  std::vector<ck::index_t> d_gs_ms_ns_lengths;
+  std::vector<ck::index_t> d_gs_ms_ns_strides;
+
+  if constexpr (has_attn_bias) {
+    d_gs_ms_ns_lengths = {param.B, param.num_heads, param.M, param.N};
+    d_gs_ms_ns_strides = {
+        param.attn_bias_strides[0],
+        param.attn_bias_strides[1],
+        param.attn_bias_strides[2],
+        param.attn_bias_strides[3]};
+  } else {
+    d_gs_ms_ns_lengths = {1, 1, 1, 1};
+    d_gs_ms_ns_strides = {0, 0, 0, 0};
+  };
+
   float alpha = param.scale;
 
   auto op = DeviceOpInstance{};
@@ -183,8 +209,8 @@ void batched_backward_masktype_attnbias_dispatched(
       param.grad_q_ptr,
       param.grad_k_ptr,
       param.grad_v_ptr,
-      {}, // std::array<void*, 1> p_acc0_biases;
-      {}, // std::array<void*, 1> p_acc1_biases;
+      param.has_attn_bias ? param.attn_bias_ptr : nullptr,
+      nullptr, // p_acc1_bias
       q_gs_ms_ks_lengths,
       q_gs_ms_ks_strides,
       k_gs_ns_ks_lengths,
@@ -196,14 +222,10 @@ void batched_backward_masktype_attnbias_dispatched(
       y_gs_ms_os_lengths,
       y_gs_ms_os_strides,
       lse_gs_ms_lengths,
-      {}, // std::array<std::vector<ck::index_t>,
-          // 1>{acc0_biases_gs_ms_ns_lengths},
-      {}, // std::array<std::vector<ck::index_t>,
-          // 1>{acc0_biases_gs_ms_ns_strides},
-      {}, // std::array<std::vector<ck::index_t>,
-          // 1>{acc1_biases_gs_ms_os_lengths},
-      {}, // std::array<std::vector<ck::index_t>,
-          // 1>{acc1_biases_gs_ms_os_strides},
+      d_gs_ms_ns_lengths,
+      d_gs_ms_ns_strides,
+      {}, // acc1_biases_gs_ms_os_lengths
+      {}, // acc1_biases_gs_ms_os_strides
       QKVElementOp{},
       QKVElementOp{},
       Scale{alpha},
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_backward.h b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_backward.h
index 1bba8b678..620ebf26c 100644
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_backward.h
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_backward.h
@@ -5,11 +5,10 @@
 
 #include <ck/ck.hpp>
 #include <ck/tensor_operation/gpu/device/gemm_specialization.hpp>
-#include <ck/tensor_operation/gpu/device/impl/device_grouped_mha_bwd_xdl_cshuffle_kloop_v1.hpp>
-#include <ck/tensor_operation/gpu/device/impl/device_grouped_mha_bwd_xdl_cshuffle_kloop_v2.hpp>
 #include <ck/tensor_operation/gpu/device/tensor_specialization.hpp>
 #include <ck/tensor_operation/gpu/element/element_wise_operation.hpp>
 #include <ck/utility/sequence.hpp>
+#include "ck/tensor_operation/gpu/device/impl/device_grouped_mha_bwd_xdl_cshuffle_qloop_v2.hpp"
 
 #include "ck_fmha_util.h"
 
@@ -30,8 +29,9 @@ void grouped_backward_masktype_attnbias_dispatched(
   using ShuffleDataType = F32;
   using LSEDataType = F32;
   using ZDataType = unsigned short;
-  using Acc0BiasDataType = ck::Tuple<>;
-  using Acc1BiasDataType = ck::Tuple<>;
+  using Acc0BiasDataType =
+      typename std::conditional<has_attn_bias, scalar_t, void>::type;
+  using Acc1BiasDataType = void;
 
   static constexpr ck::index_t NumDimG = 2;
   static constexpr ck::index_t NumDimM = 1;
@@ -58,8 +58,13 @@ void grouped_backward_masktype_attnbias_dispatched(
       ck::tensor_operation::device::TensorSpecialization::Default;
   static constexpr bool Deterministic = false;
 
+  // Tunables
+  static constexpr ck::index_t ABBlockTransferSrcScalarPerVector = 1;
+  static constexpr ck::index_t B1CShuffleBlockTransferScalarPerVector = 1;
+  static constexpr ck::index_t Acc0BiasTransferSrcScalarPerVector = 1;
+
   using DeviceOpInstance = ck::tensor_operation::device::
-      DeviceGroupedMultiheadAttentionBackward_Kloop_Xdl_CShuffle_V1<
+      DeviceGroupedMultiheadAttentionBackward_Qloop_Xdl_CShuffle_V2<
           NumDimG,
           NumDimM,
           NumDimN,
@@ -86,42 +91,47 @@ void grouped_backward_masktype_attnbias_dispatched(
           TensorSpecY,
           1,
           256,
-          128, // MPerBlock
+          64, // MPerBlock
           128, // NPerBlock
-          64, // KPerBlock
-          64, // Gemm1NPerBlock
+          128, // KPerBlock
+          128, // Gemm1NPerBlock
           32, // Gemm1KPerBlock
           8, // AK1
           8, // BK1
           2, // B1K1
           32, // MPerXDL
           32, // NPerXDL
-          1, // MXdlPerWave
-          4, // NXdlPerWave
-          2, // Gemm1NXdlPerWave
-          2, // Gemm2NXdlPerWave
+          2, // MXdlPerWave
+          1, // NXdlPerWave
+          4, // Gemm1NXdlPerWave
+          1, // Gemm2NXdlPerWave
           S<4, 64, 1>, // ABlockTransfer
           S<1, 0, 2>,
           S<1, 0, 2>,
           2,
-          8,
+          ABBlockTransferSrcScalarPerVector, // TUNABLE
           8,
           true,
-          S<4, 64, 1>, // BBlockTransfer
+          S<4, 64, 1>, // B0BlockTransfer
           S<1, 0, 2>,
           S<1, 0, 2>,
           2,
-          8,
+          ABBlockTransferSrcScalarPerVector, // TUNABLE
           8,
           true,
+          Acc0BiasTransferSrcScalarPerVector, // TUNABLE
+          S<8, 32, 1>, // B1BlockTransfer
+          S<0, 2, 1>,
+          S<0, 2, 1>,
+          1,
+          B1CShuffleBlockTransferScalarPerVector, // TUNABLE
+          2,
+          false,
           1, // CShuffleMXdlPerWavePerShuffle
-          2, // CShuffleNXdlPerWavePerShuffle
-          S<1,
-            32,
-            1,
-            8>, // CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock
-          CShuffleBlockTransferScalarPerVector_NPerBlock, // CShuffleBlockTransferScalarPerVector_NPerBlock
-          MaskingSpec, // MaskingSpecialization
+          4, // CShuffleNXdlPerWavePerShuffle
+          S<1, 32, 1, 8>,
+          B1CShuffleBlockTransferScalarPerVector, // TUNABLE
+          MaskingSpec,
           Deterministic>;
 
   std::vector<typename DeviceOpInstance::ProblemDesc> problem_descs;
@@ -162,6 +172,22 @@ void grouped_backward_masktype_attnbias_dispatched(
     std::vector<ck::index_t> lse_gs_ms_lengths{1, G1, M};
     std::vector<ck::index_t> lse_gs_ms_strides{0, param.M, 1};
 
+    std::vector<ck::index_t> d_gs_ms_ns_lengths;
+    std::vector<ck::index_t> d_gs_ms_ns_strides;
+
+    if constexpr (has_attn_bias) {
+      d_gs_ms_ns_lengths = {1, G1, M, N};
+      d_gs_ms_ns_strides = {
+          0,
+          param.attn_bias_strides[0],
+          param.attn_bias_strides[1],
+          param.attn_bias_strides[2]};
+
+    } else {
+      d_gs_ms_ns_lengths = {1, 1, 1, 1};
+      d_gs_ms_ns_strides = {0, 0, 0, 0};
+    };
+
     problem_descs.push_back({
         q_gs_ms_ks_lengths,
         q_gs_ms_ks_strides,
@@ -175,14 +201,10 @@ void grouped_backward_masktype_attnbias_dispatched(
         y_gs_ms_os_strides,
         lse_gs_ms_lengths,
         lse_gs_ms_strides,
-        {}, // std::array<std::vector<ck::index_t>,
-            // 1>{acc0_biases_gs_ms_ns_lengths},
-        {}, // std::array<std::vector<ck::index_t>,
-            // 1>{acc0_biases_gs_ms_ns_strides},
-        {}, // std::array<std::vector<ck::index_t>,
-            // 1>{acc1_biases_gs_ms_os_lengths},
-        {}, // std::array<std::vector<ck::index_t>,
-            // 1>{acc1_biases_gs_ms_os_strides},
+        d_gs_ms_ns_lengths,
+        d_gs_ms_ns_strides,
+        {}, // acc1_biases_gs_ms_os_lengths
+        {}, // acc1_biases_gs_ms_os_strides
     });
   }
 
@@ -202,8 +224,8 @@ void grouped_backward_masktype_attnbias_dispatched(
       param.grad_q_ptrs,
       param.grad_k_ptrs,
       param.grad_v_ptrs,
-      {}, // std::array<void*, 1> p_acc0_biases;
-      {}, // std::array<void*, 1> p_acc1_biases;
+      param.attn_bias_ptrs,
+      {}, // p_acc1_bias_vec;
       problem_descs,
       QKVElementOp{},
       QKVElementOp{},

From 94be1647fdd54b53fd656b2ad95b6fc216c755e9 Mon Sep 17 00:00:00 2001
From: Qianfeng Zhang <Qianfeng.Zhang@amd.com>
Date: Fri, 1 Sep 2023 15:44:42 +0000
Subject: [PATCH 038/641] Synchronize the updates in test_mem_eff_attention.py
 to test_mem_eff_attention_ck.py

---
 tests/test_mem_eff_attention_ck.py | 161 ++++++++++++++++++++---------
 1 file changed, 115 insertions(+), 46 deletions(-)

diff --git a/tests/test_mem_eff_attention_ck.py b/tests/test_mem_eff_attention_ck.py
index 228ab0971..0d20a1092 100644
--- a/tests/test_mem_eff_attention_ck.py
+++ b/tests/test_mem_eff_attention_ck.py
@@ -20,9 +20,14 @@
 
 torch.backends.cuda.matmul.allow_tf32 = False
 cuda_only = pytest.mark.skipif(not torch.cuda.is_available(), reason="requires CUDA")
+
 _devices = ["cpu", "cuda"] if torch.cuda.is_available() else ["cpu"]
 _types = [torch.float16, torch.bfloat16]
 
+T = TypeVar(
+    "T", Type[fmha.common.AttentionFwOpBase], Type[fmha.common.AttentionBwOpBase]
+)
+
 ALL_FW_OPS: Sequence[Type[fmha.common.AttentionFwOpBase]] = [
     fmha.ck.FwOp,
 ]
@@ -31,23 +36,6 @@
     fmha.ck.BwOp,
 ]
 
-T = TypeVar(
-    "T", Type[fmha.common.AttentionFwOpBase], Type[fmha.common.AttentionBwOpBase]
-)
-
-
-def _filter_unsupported_ops(ops: Sequence[T]) -> Sequence[T]:
-    return [
-        op
-        for op in ops
-        if op.is_available()
-    ]
-
-
-ALL_FW_OPS = _filter_unsupported_ops(ALL_FW_OPS)
-ALL_BW_OPS = _filter_unsupported_ops(ALL_BW_OPS)
-
-
 def sample_random_supported_fw(
     inp: fmha.Inputs, seed: int
 ) -> Type[fmha.common.AttentionFwOpBase]:
@@ -64,7 +52,7 @@ def generate_test_shapes_B_Mq_Mkv_H_K_Kv(op):
     shapes = []
     for B in op._TEST_BATCH_SIZES:
         for Mq in [32, 256]:
-            for Mkv in [32, 64, 256]:
+            for Mkv in [32, 64, 256, 1024]:
                 for K in op._TEST_K:
                     shapes.append((B, Mq, Mkv, 1, K, K))
         Mq = 256
@@ -75,7 +63,7 @@ def generate_test_shapes_B_Mq_Mkv_H_K_Kv(op):
         for M in [2, 3, 15, 31, 32, 34, 68, 72, 90, 132, 136]:
             shapes.append((B, M, Mkv, H, K, K))
             shapes.append((B, Mq, M, H, K, K))
-        for _K in [1, 2, 3, 31, 34, 36, 38, 40, 64, 256 + 2, 256 + 8, 512]:
+        for _K in [1, 2, 3, 31, 34, 36, 38, 40, 64, 80, 160, 256 + 2, 256 + 8, 512]:
             if _K <= op.SUPPORTED_MAX_K:
                 shapes.append((B, Mq, Mkv, H, _K, _K))
         # Different value for K / Kv
@@ -90,6 +78,17 @@ def generate_test_shapes_B_Mq_Mkv_H_K_Kv(op):
         # Some number of heads
         for H in [3, 5, 12]:
             shapes.append((max(1, B // H), Mq, Mkv, H, K, K))
+    # Filter-out not supported shapes
+    shapes = [
+        shape
+        for shape in shapes
+        if len(
+            op.shape_not_supported_reasons(
+                Mq=shape[1], Mkv=shape[2], K=shape[4], Kv=shape[5]
+            )
+        )
+        == 0
+    ]
     # Add some random shapes
     if op in [
         fmha.ck.FwOp,
@@ -97,7 +96,8 @@ def generate_test_shapes_B_Mq_Mkv_H_K_Kv(op):
     ]:
         K_CHOICES = [8 * i for i in range(1, 256 // 8)]
         r = random.Random(0)
-        for _ in range(20):
+        found_count = 0
+        while found_count < 20:
             B = r.randint(1, 400)
             Mq = r.randint(1, 500)
             Mkv = r.randint(1, 500)
@@ -107,10 +107,20 @@ def generate_test_shapes_B_Mq_Mkv_H_K_Kv(op):
             Kv = r.choice(K_CHOICES)
             if not op.SUPPORTS_DIFFERENT_VALUE_EMBED:
                 Kv = K
+            if len(op.shape_not_supported_reasons(Mq, Mkv, K, Kv)):
+                continue
+            found_count += 1
             shapes.append((B, Mq, Mkv, H, K, Kv))
     return shapes
 
 
+def make_id(op, device, dtype, bias_type, *shape):
+    return (
+        f"{op.NAME}-{device}-{str(dtype)}-{bias_type.__name__}"
+        f"-{'-'.join([str(s) for s in shape])}"
+    )
+
+
 def _generate_op_device_dtype_biasT_B_Mq_Mkv_H_K_Kv(
     ops_list: Sequence[Type[fmha.AttentionOpBase]], max_shapes_per_op: int = 65000
 ):
@@ -120,9 +130,7 @@ def _generate_op_device_dtype_biasT_B_Mq_Mkv_H_K_Kv(
     for op in ops_list:
         op_count = 0
         # Sort list of masks, so it's deterministic across runs
-        LIST_MASKS = list(
-            sorted(list(op.SUPPORTED_ATTN_BIAS_TYPES), key=lambda x: str(x))
-        )
+        LIST_MASKS = list(sorted(op.SUPPORTED_ATTN_BIAS_TYPES, key=lambda x: str(x)))
         for shape in generate_test_shapes_B_Mq_Mkv_H_K_Kv(op):
             has_one = False
             for device in _devices:
@@ -176,13 +184,9 @@ def _generate_op_device_dtype_biasT_B_Mq_Mkv_H_K_Kv(
                     continue
                 for dtype in op.SUPPORTED_DTYPES:
                     combination.append((op, device, dtype, bias_type, *shape))
-                    ids.append(
-                        f"{op.NAME}-{device}-{str(dtype)}-{bias_type.__name__}"
-                        f"-{'-'.join([str(s) for s in shape])}"
-                    )
     return {
         "argvalues": combination,
-        "ids": ids,
+        "ids": [make_id(*c) for c in combination],
     }
 
 
@@ -396,7 +400,6 @@ def create_attn_bias(
                 device=device,
                 dtype=dtype,
             )
-
             # ToDo: need a fix in ck-flashAttn to avoid divided-by-zero when all-(-inf) occurred
             #       with the data read by one-thread
             # make sure it also works if the first columns are partially masked out
@@ -404,6 +407,8 @@ def create_attn_bias(
 
         if requires_grad:
             attn_bias.requires_grad_(True)
+        if fmt == "BMK":
+            attn_bias = attn_bias[:, 0]
         return attn_bias
     if bias_type is fmha.attn_bias.LowerTriangularMask:
         return fmha.attn_bias.LowerTriangularMask()
@@ -547,6 +552,7 @@ def bmk2bmhk(tensor, num_heads: int) -> torch.Tensor:
         (0, 2, 1, 3)
     )
 
+
 @pytest.mark.parametrize("fmt", ["BMK", "BMHK"])
 @pytest.mark.parametrize("packed", [False, True])
 @parametrize_opFW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv
@@ -567,7 +573,7 @@ def test_forward(
         k,
         kv,
     ) = opFW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv
-   
+
     if kv > 128:
         pytest.skip("kv > 128 is not supported by CK-FlashAttention-1")
 
@@ -621,11 +627,11 @@ def test_forward(
     ref = ref_attention(query, key, value, attn_bias)
     assert out.shape == ref.shape, out.shape
     assert_allclose(
-             out.float(),
-             ref,
-             atol=op.ERROR_ATOL[dtype],
-             rtol=op.ERROR_RTOL.get(dtype, 1e-5),
-        )
+        out.float(),
+        ref,
+        atol=op.ERROR_ATOL[dtype],
+        rtol=op.ERROR_RTOL.get(dtype, 1e-5),
+    )
 
 
 @pytest.mark.parametrize("k_len", [5, 6, 32])
@@ -633,23 +639,22 @@ def test_forward(
 @pytest.mark.parametrize("kv_len", [128, 512])
 @pytest.mark.parametrize("q_len", [128, 512])
 @pytest.mark.parametrize("device", [torch.device("cuda")])
-@pytest.mark.parametrize("test_type", _types)
-def test_key_query_all_ones(test_type, device, q_len, kv_len, batch_size, k_len):
+@pytest.mark.parametrize("dtype", _types)
+def test_key_query_all_ones(dtype, device, q_len, kv_len, batch_size, k_len):
     scale = 3
-    query = torch.ones((batch_size, q_len, k_len), device=device, dtype=test_type)
-    key = torch.ones((batch_size, kv_len, k_len), device=device, dtype=test_type)
-    value = torch.randn((batch_size, kv_len, k_len), device=device, dtype=test_type) * scale
+    query = torch.ones((batch_size, q_len, k_len), device=device, dtype=dtype)
+    key = torch.ones((batch_size, kv_len, k_len), device=device, dtype=dtype)
+    value = torch.randn((batch_size, kv_len, k_len), device=device, dtype=dtype) * scale
 
     out = xformers.ops.memory_efficient_attention(query, key, value, op=(fmha.ck.FwOp, None))
     # this should be equivalent to the average over value
     ref = value.mean(1, keepdim=True).expand_as(query)
 
-    if test_type is torch.float16:
+    if dtype is torch.float16:
         assert_allclose(out, ref, atol=1e-5)
     else:
         assert_allclose(out, ref, atol=1e-2)
 
-
 def _block_diag_reshape_lse(
     lse: torch.Tensor, q_seqinfo: fmha.attn_bias._SeqLenInfo
 ) -> torch.Tensor:
@@ -875,7 +880,7 @@ def _vec_binom_test(x, n, p):
 def _get_drop_mask(op, batch_size, q_len, kv_len, p, device):
     if op == fmha.ck.FwOp:
         mask = torch.empty((batch_size, 1, q_len, kv_len), device=device)
-        rand_uniform = torch.ops.xformers._cutlass_rand_uniform(p, mask)
+        rand_uniform = torch.ops.xformers._ck_rand_uniform(p, mask)
         mask = (rand_uniform > p).to(torch.float32)
         mask = mask.reshape(batch_size, q_len, kv_len)
     else:
@@ -885,7 +890,6 @@ def _get_drop_mask(op, batch_size, q_len, kv_len, p, device):
     return mask
 
 
-### disable this test due to the un-availability of binomtest
 @cuda_only
 @pytest.mark.parametrize("attn_bias", [None, fmha.attn_bias.LowerTriangularMask()])
 @pytest.mark.parametrize("seed", [42, 124])
@@ -938,6 +942,7 @@ def test_dropout(op, q_len, kv_len, batch_size, k_len, p, seed, attn_bias):
     p_values = _vec_binom_test(masks, num_trials, p=keep_prob)
     assert all(p_values > p_val_tol)
 
+
 def _test_dropout_backward(q_len, kv_len, batch_size, k, p, op, dtype):
     if dtype is torch.bfloat16 and compute_capability < (8, 0):
         pytest.skip("bf16 requires Sm80")
@@ -1009,6 +1014,18 @@ def _test_dropout_backward(q_len, kv_len, batch_size, k, p, op, dtype):
     )
 
 
+@cuda_only
+@pytest.mark.parametrize("p", [0.3, 0.7])
+@pytest.mark.parametrize("k", [5, 6, 32])
+@pytest.mark.parametrize("batch_size", [1, 2])
+@pytest.mark.parametrize("kv_len", [3, 15, 32, 33])
+@pytest.mark.parametrize("q_len", [2, 33])
+def test_dropout_backward_small_k(q_len, kv_len, batch_size, k, p):
+    _test_dropout_backward(
+        q_len, kv_len, batch_size, k, p, op=fmha.ck.FwOp, dtype=torch.float16
+    )
+
+
 @cuda_only
 @pytest.mark.parametrize("p", [0.000001, 0.3, 0.7])
 @pytest.mark.parametrize("k", [16, 128, 256])
@@ -1334,7 +1351,7 @@ def test_unsupported_cpu(op: Type[fmha.AttentionFwOpBase]):
 )
 def test_unsupported_stride_lastdim(op: Type[fmha.AttentionFwOpBase]):
     q = torch.empty([1, 1, 32, 4], device="cuda", dtype=torch.float16).permute(
-        0, 1, 3, 2
+        0, 3, 1, 2
     )
     try:
         fmha.memory_efficient_attention(q, q, q, op=(op, None))
@@ -1350,7 +1367,7 @@ def test_unsupported_stride_lastdim(op: Type[fmha.AttentionFwOpBase]):
     "op", ALL_FW_OPS_NO_SMALLK, ids=[op.NAME for op in ALL_FW_OPS_NO_SMALLK]
 )
 def test_unsupported_stride_alignment(op: Type[fmha.AttentionFwOpBase]):
-    q = torch.empty([1, 2, 2, 33], device="cuda", dtype=torch.float16)[:, :, :, :32]
+    q = torch.empty([1, 2, 1, 33], device="cuda", dtype=torch.float16)[:, :, :, :32]
     try:
         fmha.memory_efficient_attention(q, q, q, op=(op, None))
     except ValueError as e:
@@ -1584,6 +1601,57 @@ def test_attn_bias_padded() -> None:
     )
 
 
+@pytest.mark.parametrize("op", [fmha.decoder.FwOp])
+@pytest.mark.parametrize("multiquery", [True, False], ids=lambda x: "mq" if x else "")
+@pytest.mark.parametrize("n_heads", [1, 16, 32])
+@pytest.mark.parametrize("padding", [32, 4096])
+@pytest.mark.parametrize("bsz", [1, 8])
+@pytest.mark.parametrize("dtype", ["f16", "bf16", "f32"])
+def test_decoder(
+    op, multiquery: bool, n_heads: int, padding: int, bsz: int, dtype: str
+) -> None:
+    dtype_ = {"f16": torch.float16, "bf16": torch.bfloat16, "f32": torch.float32}[dtype]
+    torch.manual_seed(1)
+    d = 128
+    k_shape = (1, bsz * padding, n_heads, d)
+    # TODO: support 2 kv heads etc.
+    k = torch.randn(k_shape, dtype=dtype_).cuda()
+    k_seqlen = torch.randint(1, padding + 1, (bsz,)).tolist()
+    v = torch.randn(k_shape, dtype=dtype_).cuda()
+    q = torch.randn((1, bsz, n_heads, d), dtype=dtype_).cuda()
+    causal_diagonal = torch.tensor(  # TODO: make unnecessary
+        [i - 1 for i in k_seqlen], dtype=torch.int32
+    ).cuda()
+
+    if multiquery:
+        k = k[:, :, :1].expand(k_shape)
+        v = v[:, :, :1].expand(k_shape)
+
+    attn_bias = fmha.attn_bias.BlockDiagonalCausalWithOffsetPaddedKeysMask.from_seqlens(
+        q_seqlen=[1] * bsz,
+        kv_seqlen=k_seqlen,
+        causal_diagonal=causal_diagonal,
+        kv_padding=padding,
+    )
+    inp = fmha.Inputs(q, k, v, attn_bias=attn_bias)
+    if not op.supports(inp):
+        pytest.skip("not supported")
+
+    decoder_output = fmha.memory_efficient_attention_forward(
+        q, k, v, attn_bias, op=fmha.decoder.FwOp
+    )
+
+    ck_output = fmha.memory_efficient_attention_forward(
+        q, k, v, attn_bias, op=fmha.ck.FwOp
+    )
+    assert_allclose(
+        decoder_output,
+        ck_output,
+        atol=fmha.ck.FwOp.ERROR_ATOL[dtype_] * 4,
+        rtol=fmha.ck.FwOp.ERROR_RTOL[dtype_],
+    )
+
+
 def test_attn_bias_from_seqlens() -> None:
     bias = fmha.attn_bias.BlockDiagonalMask.from_seqlens([3, 5, 1])
     out = bias.split(torch.randn([1, 3 + 5 + 1, 16]))
@@ -1701,3 +1769,4 @@ def test_permuted_attn_bias(self) -> None:
         except (ValueError, RuntimeError):
             pass
 
+# end of file

From ee90d6b4d8c899ed89cf338011364fa946b4b2ca Mon Sep 17 00:00:00 2001
From: Qianfeng Zhang <Qianfeng.Zhang@amd.com>
Date: Sun, 3 Sep 2023 23:54:33 +0000
Subject: [PATCH 039/641] Add _ck_rand_uniform() interface to c++ extension

---
 tests/test_mem_eff_attention_ck.py            |   4 +-
 xformers/csrc/attention/attention.cpp         |   2 +
 .../hip_fmha/attention_ck_rand_uniform.cpp    | 104 ++++++++++++++++++
 3 files changed, 109 insertions(+), 1 deletion(-)
 create mode 100644 xformers/csrc/attention/hip_fmha/attention_ck_rand_uniform.cpp

diff --git a/tests/test_mem_eff_attention_ck.py b/tests/test_mem_eff_attention_ck.py
index 0d20a1092..8a44de2d8 100644
--- a/tests/test_mem_eff_attention_ck.py
+++ b/tests/test_mem_eff_attention_ck.py
@@ -880,8 +880,10 @@ def _vec_binom_test(x, n, p):
 def _get_drop_mask(op, batch_size, q_len, kv_len, p, device):
     if op == fmha.ck.FwOp:
         mask = torch.empty((batch_size, 1, q_len, kv_len), device=device)
+        ## rand_uniform is an int32 tensor
         rand_uniform = torch.ops.xformers._ck_rand_uniform(p, mask)
-        mask = (rand_uniform > p).to(torch.float32)
+        mask = (rand_uniform > int(p*65535)).to(torch.float32)
+        print("call _ck_rand_uniform passed")
         mask = mask.reshape(batch_size, q_len, kv_len)
     else:
         mask = torch.empty((batch_size, q_len, kv_len), device=device)
diff --git a/xformers/csrc/attention/attention.cpp b/xformers/csrc/attention/attention.cpp
index 40922e241..a837d1c19 100644
--- a/xformers/csrc/attention/attention.cpp
+++ b/xformers/csrc/attention/attention.cpp
@@ -39,4 +39,6 @@ TORCH_LIBRARY_FRAGMENT(xformers, m) {
       "xformers::efficient_attention_forward_ck(Tensor query, Tensor key, Tensor value, Tensor? attn_bias, Tensor? seqstart_q, Tensor? seqstart_k, float dropout_p, bool compute_logsumexp, int custom_mask_type, float? scale, Tensor? seqlen_k) -> (Tensor, Tensor, int, int)"));
   m.def(TORCH_SELECTIVE_SCHEMA(
       "xformers::efficient_attention_backward_ck(Tensor grad_out, Tensor query, Tensor key, Tensor value, Tensor? attn_bias, Tensor? seqstart_q, Tensor? seqstart_k, Tensor? seqlen_k, Tensor logsumexp, Tensor output, float dropout_p, int rng_seed, int rng_offset, int custom_mask_type, float? scale) -> (Tensor, Tensor, Tensor, Tensor)"));
+  m.def(TORCH_SELECTIVE_SCHEMA(
+      "xformers::_ck_rand_uniform(float p, Tensor out) -> Tensor"));
 }
diff --git a/xformers/csrc/attention/hip_fmha/attention_ck_rand_uniform.cpp b/xformers/csrc/attention/hip_fmha/attention_ck_rand_uniform.cpp
new file mode 100644
index 000000000..b786b0837
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/attention_ck_rand_uniform.cpp
@@ -0,0 +1,104 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <ATen/ATen.h>
+#include <c10/core/TensorOptions.h>
+#include <torch/library.h>
+#include <torch/types.h>
+
+#include <sstream>
+#include <stdexcept>
+
+#include <ck/ck.hpp>
+#include <ck/tensor_operation/gpu/device/gemm_specialization.hpp>
+#include <ck/tensor_operation/gpu/device/tensor_specialization.hpp>
+#include "ck/tensor_operation/gpu/device/impl/device_batched_dropout.hpp"
+
+namespace {
+
+/**
+ * generate a tensor with random uniform values. only used for testing, not much
+ * attention is paid to performance
+ */
+at::Tensor rand_uniform_int(
+    double dropout_prob,
+    const at::Tensor& out_pattern) // [Batches, num_head, query_len, key_len]
+{
+  int B = out_pattern.size(0);
+  int num_heads = out_pattern.size(1);
+  int M = out_pattern.size(2);
+  int N = out_pattern.size(3);
+
+  at::Tensor randvals;
+
+  randvals = at::empty(
+      {B, num_heads, M, N}, out_pattern.options().dtype(at::ScalarType::Int));
+
+  static constexpr auto GemmSpec =
+      ck::tensor_operation::device::GemmSpecialization::MNKOPadding;
+
+  static constexpr auto TensorSpecA =
+      ck::tensor_operation::device::TensorSpecialization::Default;
+  static constexpr auto TensorSpecB0 =
+      ck::tensor_operation::device::TensorSpecialization::Default;
+  static constexpr auto TensorSpecB1 =
+      ck::tensor_operation::device::TensorSpecialization::Default;
+  static constexpr auto TensorSpecC =
+      ck::tensor_operation::device::TensorSpecialization::Default;
+
+  using DeviceOpInstance = ck::tensor_operation::device::DeviceBatchedDropout<
+      2, // NumDimG
+      ck::half_t,
+      int,
+      ck::half_t,
+      GemmSpec,
+      TensorSpecA,
+      TensorSpecB0,
+      TensorSpecB1,
+      TensorSpecC,
+      256, // BlockSize
+      64, // MPerBlock
+      128, // NPerBlock
+      32, // KPerBlock
+      8, // AK1
+      8, // BK1
+      32, // MPerXDL
+      32, // NPerXDL
+      2, // MXdlPerWave
+      1>; // NXdlPerWave
+
+  const uint64_t seed = 1;
+  const uint64_t offset = 0;
+
+  std::vector<ck::index_t> z_gs_ms_ns_lengths = {B, num_heads, M, N};
+  std::vector<ck::index_t> z_gs_ms_ns_strides = {
+      static_cast<int>(randvals.stride(0)),
+      static_cast<int>(randvals.stride(1)),
+      static_cast<int>(randvals.stride(2)),
+      static_cast<int>(randvals.stride(3))};
+
+  auto dropout_op = DeviceOpInstance();
+  auto dropout_invoker = dropout_op.MakeInvoker();
+
+  auto dropout_arg = dropout_op.MakeArgument(
+      static_cast<int*>(randvals.data_ptr()),
+      z_gs_ms_ns_lengths,
+      z_gs_ms_ns_strides,
+      {seed, offset});
+
+  dropout_invoker.Run(dropout_arg, StreamConfig{nullptr, false});
+
+  return randvals;
+} // namespace
+
+} // namespace
+
+TORCH_LIBRARY_IMPL(xformers, CUDA, m) {
+  m.impl(
+      TORCH_SELECTIVE_NAME("xformers::_ck_rand_uniform"),
+      TORCH_FN(rand_uniform_int));
+}

From bf7401c9266886e8351085e2b3f8b74e67508eba Mon Sep 17 00:00:00 2001
From: Qianfeng Zhang <Qianfeng.Zhang@amd.com>
Date: Mon, 4 Sep 2023 12:41:25 +0000
Subject: [PATCH 040/641] Use hipMemcpyAsync() to replace hipMemcpy() to avoid
 some failure while running benchmark_mem_eff_attn_decoder.py

---
 .../hip_fmha/attention_forward_generic.cpp        | 15 +++++++++------
 1 file changed, 9 insertions(+), 6 deletions(-)

diff --git a/xformers/csrc/attention/hip_fmha/attention_forward_generic.cpp b/xformers/csrc/attention/hip_fmha/attention_forward_generic.cpp
index 6367cb517..f6dd8e3d8 100644
--- a/xformers/csrc/attention/hip_fmha/attention_forward_generic.cpp
+++ b/xformers/csrc/attention/hip_fmha/attention_forward_generic.cpp
@@ -254,16 +254,18 @@ efficient_attention_forward_ck(
     p.host_seqstart_q.resize(p.num_batches + 1);
     p.host_seqstart_k.resize(p.num_batches + 1);
 
-    FMHA_HIP_CHECK(hipMemcpy(
+    FMHA_HIP_CHECK(hipMemcpyAsync(
         p.host_seqstart_q.data(),
         seqstart_q->data_ptr(),
         (p.num_batches + 1) * sizeof(int32_t),
-        hipMemcpyDeviceToHost));
-    FMHA_HIP_CHECK(hipMemcpy(
+        hipMemcpyDeviceToHost,
+        stream));
+    FMHA_HIP_CHECK(hipMemcpyAsync(
         p.host_seqstart_k.data(),
         seqstart_k->data_ptr(),
         (p.num_batches + 1) * sizeof(int32_t),
-        hipMemcpyDeviceToHost));
+        hipMemcpyDeviceToHost,
+        stream));
 
     if (seqlen_k.has_value()) {
       TORCH_CHECK(seqlen_k->scalar_type() == at::ScalarType::Int);
@@ -273,11 +275,12 @@ efficient_attention_forward_ck(
 
       p.host_seqlen_k.resize(p.num_batches);
 
-      FMHA_HIP_CHECK(hipMemcpy(
+      FMHA_HIP_CHECK(hipMemcpyAsync(
           p.host_seqlen_k.data(),
           seqlen_k->data_ptr(),
           p.num_batches * sizeof(int32_t),
-          hipMemcpyDeviceToHost));
+          hipMemcpyDeviceToHost,
+          stream));
     }
 
     char* q_ptr = reinterpret_cast<char*>(query.data_ptr());

From 973d5f44e4ca722303d104ba97328b4ed3dc43a6 Mon Sep 17 00:00:00 2001
From: Qianfeng Zhang <Qianfeng.Zhang@amd.com>
Date: Mon, 4 Sep 2023 18:33:45 +0000
Subject: [PATCH 041/641] Update in SimpleDeviceMem

---
 .../csrc/attention/hip_fmha/ck_fmha_util.h    | 20 +++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_util.h b/xformers/csrc/attention/hip_fmha/ck_fmha_util.h
index 9ce11c399..851c8dbda 100644
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_util.h
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_util.h
@@ -8,10 +8,10 @@
 
 #include <torch/torch.h>
 
+#include <ATen/cuda/CUDAGeneratorImpl.h>
 #include <ck/ck.hpp>
 #include <ck/utility/data_type.hpp>
 #include <ck/utility/sequence.hpp>
-#include <ATen/cuda/CUDAGeneratorImpl.h>
 
 // Here flag can be a constant, variable or function call
 #define FMHA_HIP_CHECK(ret_or_call)                                          \
@@ -166,17 +166,17 @@ struct MaxVectorSizeForType<ck::bhalf_t> {
 
 struct SimpleDeviceMem {
   SimpleDeviceMem() = delete;
-  SimpleDeviceMem(std::size_t mem_size) : p_mem_{} {
-    FMHA_HIP_CHECK(hipMalloc(static_cast<void**>(&p_mem_), mem_size));
+  SimpleDeviceMem(std::size_t mem_size) {
+    auto options = torch::TensorOptions();
+    mem = at::empty(
+        mem_size, options.dtype(at::ScalarType::Byte).device(torch::kCUDA));
   }
   void* GetDeviceBuffer() {
-    return p_mem_;
-  }
-  ~SimpleDeviceMem() {
-    (void)hipFree(p_mem_);
+    return mem.data_ptr();
   }
+  ~SimpleDeviceMem() {}
 
-  void* p_mem_;
+  at::Tensor mem;
 };
 
 struct BatchedInferParams {
@@ -279,7 +279,7 @@ struct BatchedBackwardParams {
   int Kv; // embed_dim for Value
 
   float scale;
-  bool has_attn_bias; 
+  bool has_attn_bias;
 
   // BMHK mode strides, last-dim contiguous
   std::array<int, 4> q_strides;
@@ -332,7 +332,7 @@ struct GroupedBackwardParams {
   std::vector<int> host_seqlen_k;
 
   float scale;
-  bool has_attn_bias; 
+  bool has_attn_bias;
 
   // MHK mode strides, last-dim contiguous
   std::array<int, 3> q_strides;

From 6b6e3705cf01e838d4353cc87febafead3e0a239 Mon Sep 17 00:00:00 2001
From: Qianfeng Zhang <Qianfeng.Zhang@amd.com>
Date: Mon, 4 Sep 2023 19:29:42 +0000
Subject: [PATCH 042/641] Misc updates in
 attention_forward/backward_generic.cpp

---
 .../hip_fmha/attention_backward_generic.cpp   | 56 +++++++++++--------
 .../hip_fmha/attention_forward_generic.cpp    | 11 +++-
 2 files changed, 40 insertions(+), 27 deletions(-)

diff --git a/xformers/csrc/attention/hip_fmha/attention_backward_generic.cpp b/xformers/csrc/attention/hip_fmha/attention_backward_generic.cpp
index ce9ce08ce..0faf23be9 100644
--- a/xformers/csrc/attention/hip_fmha/attention_backward_generic.cpp
+++ b/xformers/csrc/attention/hip_fmha/attention_backward_generic.cpp
@@ -79,6 +79,11 @@ efficient_attention_backward_ck(
   TORCH_CHECK(query.size(3) == key.size(3));
   TORCH_CHECK(value.size(3) == grad_out.size(3));
 
+  // Query, Key, Value must use the same CUDA device
+  TORCH_CHECK(query.device() == key.device());
+  TORCH_CHECK(query.device() == value.device());
+  TORCH_CHECK(query.device().type() == torch::kCUDA)
+
   // handle potentially non-contiguous grad_out through a copy
   CHECK_NOSPARSE_CONTIGUOUS_CUDA(grad_out);
 
@@ -242,7 +247,7 @@ efficient_attention_backward_ck(
       CHECK_NOSPARSE_LASTCONTIGUOUS_CUDA((*bias));
       TORCH_CHECK(bias->scalar_type() == query.scalar_type());
 
-      p.has_attn_bias = true; 
+      p.has_attn_bias = true;
       const at::Tensor bias_4d_view =
           get_bias_4d_view(*bias, B, num_heads, M, N);
       p.attn_bias_strides = {
@@ -250,9 +255,8 @@ efficient_attention_backward_ck(
           static_cast<int>(bias_4d_view.stride(1)),
           static_cast<int>(bias_4d_view.stride(2)),
           static_cast<int>(bias_4d_view.stride(3))};
-    }
-    else 
-	p.has_attn_bias = false; 
+    } else
+      p.has_attn_bias = false;
 
     p.dropout_prob = static_cast<float>(dropout_p);
     p.rng_engine_inputs = rng_engine_inputs;
@@ -269,22 +273,18 @@ efficient_attention_backward_ck(
     p.host_seqstart_q.resize(p.num_batches + 1);
     p.host_seqstart_k.resize(p.num_batches + 1);
 
-    FMHA_HIP_CHECK(hipMemcpy(
+    FMHA_HIP_CHECK(hipMemcpyAsync(
         p.host_seqstart_q.data(),
         seqstart_q->data_ptr(),
         (p.num_batches + 1) * sizeof(int),
-        hipMemcpyDeviceToHost));
-    FMHA_HIP_CHECK(hipMemcpy(
+        hipMemcpyDeviceToHost,
+        stream));
+    FMHA_HIP_CHECK(hipMemcpyAsync(
         p.host_seqstart_k.data(),
         seqstart_k->data_ptr(),
         (p.num_batches + 1) * sizeof(int),
-        hipMemcpyDeviceToHost));
-    if (seqlen_k.has_value())
-      FMHA_HIP_CHECK(hipMemcpy(
-          p.host_seqlen_k.data(),
-          seqlen_k->data_ptr(),
-          p.num_batches * sizeof(int),
-          hipMemcpyDeviceToHost));
+        hipMemcpyDeviceToHost,
+        stream));
 
     if (seqlen_k.has_value()) {
       TORCH_CHECK(seqlen_k->scalar_type() == at::ScalarType::Int);
@@ -294,11 +294,12 @@ efficient_attention_backward_ck(
 
       p.host_seqlen_k.resize(p.num_batches);
 
-      FMHA_HIP_CHECK(hipMemcpy(
+      FMHA_HIP_CHECK(hipMemcpyAsync(
           p.host_seqlen_k.data(),
           seqlen_k->data_ptr(),
           p.num_batches * sizeof(int32_t),
-          hipMemcpyDeviceToHost));
+          hipMemcpyDeviceToHost,
+          stream));
     }
 
     char* q_ptr = reinterpret_cast<char*>(query.data_ptr());
@@ -335,13 +336,17 @@ efficient_attention_backward_ck(
           randvals.scalar_type());
 
       p.q_ptrs.push_back(reinterpret_cast<void*>(&q_ptr[tmp_q_stride]));
-      p.grad_q_ptrs.push_back(reinterpret_cast<void*>(&grad_q_ptr[tmp_q_stride]));
+      p.grad_q_ptrs.push_back(
+          reinterpret_cast<void*>(&grad_q_ptr[tmp_q_stride]));
       p.k_ptrs.push_back(reinterpret_cast<void*>(&k_ptr[tmp_k_stride]));
-      p.grad_k_ptrs.push_back(reinterpret_cast<void*>(&grad_k_ptr[tmp_k_stride]));
+      p.grad_k_ptrs.push_back(
+          reinterpret_cast<void*>(&grad_k_ptr[tmp_k_stride]));
       p.v_ptrs.push_back(reinterpret_cast<void*>(&v_ptr[tmp_v_stride]));
-      p.grad_v_ptrs.push_back(reinterpret_cast<void*>(&grad_v_ptr[tmp_v_stride]));
+      p.grad_v_ptrs.push_back(
+          reinterpret_cast<void*>(&grad_v_ptr[tmp_v_stride]));
       p.out_ptrs.push_back(reinterpret_cast<void*>(&out_ptr[tmp_o_stride]));
-      p.grad_out_ptrs.push_back(reinterpret_cast<void*>(&grad_out_ptr[tmp_grad_o_stride]));
+      p.grad_out_ptrs.push_back(
+          reinterpret_cast<void*>(&grad_out_ptr[tmp_grad_o_stride]));
 
       if (bias.has_value()) {
         int32_t tmp_bias_stride = get_size_in_bytes(
@@ -349,11 +354,14 @@ efficient_attention_backward_ck(
                 p.host_seqstart_k[i] * p.attn_bias_strides[3],
             bias->scalar_type());
 
-        p.attn_bias_ptrs.push_back(reinterpret_cast<void*>(&attn_bias_ptr[tmp_bias_stride]));
+        p.attn_bias_ptrs.push_back(
+            reinterpret_cast<void*>(&attn_bias_ptr[tmp_bias_stride]));
       };
 
-      p.logsumexp_ptrs.push_back(reinterpret_cast<void*>(&logsumexp_ptr[tmp_logsumexp_stride]));
-      p.randvals_ptrs.push_back(reinterpret_cast<void*>(&randvals_ptr[tmp_randvals_stride]));
+      p.logsumexp_ptrs.push_back(
+          reinterpret_cast<void*>(&logsumexp_ptr[tmp_logsumexp_stride]));
+      p.randvals_ptrs.push_back(
+          reinterpret_cast<void*>(&randvals_ptr[tmp_randvals_stride]));
     }
   };
 
@@ -385,7 +393,7 @@ efficient_attention_backward_ck(
 
   return std::make_tuple(grad_q, grad_k, grad_v, grad_bias);
 #endif
-}
+} // namespace
 
 } // namespace
 
diff --git a/xformers/csrc/attention/hip_fmha/attention_forward_generic.cpp b/xformers/csrc/attention/hip_fmha/attention_forward_generic.cpp
index f6dd8e3d8..89786cccd 100644
--- a/xformers/csrc/attention/hip_fmha/attention_forward_generic.cpp
+++ b/xformers/csrc/attention/hip_fmha/attention_forward_generic.cpp
@@ -72,6 +72,11 @@ efficient_attention_forward_ck(
   TORCH_CHECK(query.scalar_type() == key.scalar_type());
   TORCH_CHECK(query.scalar_type() == value.scalar_type());
 
+  // Query, Key, Value must use the same CUDA device
+  TORCH_CHECK(query.device() == key.device());
+  TORCH_CHECK(query.device() == value.device());
+  TORCH_CHECK(query.device().type() == torch::kCUDA)
+
   TORCH_CHECK(seqstart_q.has_value() == seqstart_k.has_value());
   if (seqstart_q.has_value()) {
     TORCH_CHECK(seqstart_q->scalar_type() == at::ScalarType::Int);
@@ -87,7 +92,7 @@ efficient_attention_forward_ck(
   CHECK_NOSPARSE_LASTCONTIGUOUS_CUDA(key);
   CHECK_NOSPARSE_LASTCONTIGUOUS_CUDA(value);
 
-  // at::cuda::CUDAGuard device_guard(query.device());
+  at::cuda::CUDAGuard device_guard(query.device());
   hipStream_t stream = at::cuda::getCurrentHIPStream().stream();
 
   int64_t B = query.size(0);
@@ -379,7 +384,7 @@ efficient_attention_forward_ck(
       } else if constexpr (std::is_same<scalar_t, ck::bhalf_t>::value) {
         batched_forward_bp16(batched_forward_params, stream);
       } else
-        throw std::runtime_error("input data-type is not supported");
+        throw std::runtime_error("input data-type is not supported!");
     } else { // input is grouped
       GroupedForwardParams grouped_forward_params;
 
@@ -390,7 +395,7 @@ efficient_attention_forward_ck(
       } else if constexpr (std::is_same<scalar_t, ck::bhalf_t>::value) {
         grouped_forward_bp16(grouped_forward_params, stream);
       } else
-        throw std::runtime_error("input data-type is not supported");
+        throw std::runtime_error("input data-type is not supported!");
     }
   });
 

From 82c365117d2792b5133f98355e6e34ab07b08f74 Mon Sep 17 00:00:00 2001
From: Qianfeng Zhang <Qianfeng.Zhang@amd.com>
Date: Mon, 4 Sep 2023 23:36:16 +0000
Subject: [PATCH 043/641] Split file ck_fmha_util.h

---
 .../hip_fmha/attention_backward_generic.cpp   |   1 +
 .../hip_fmha/attention_ck_rand_uniform.cpp    |   5 +-
 .../hip_fmha/attention_forward_generic.cpp    |   1 +
 .../hip_fmha/ck_fmha_batched_backward.h       |   3 +-
 .../hip_fmha/ck_fmha_batched_forward.h        |   3 +-
 .../hip_fmha/ck_fmha_grouped_backward.h       |   3 +-
 .../hip_fmha/ck_fmha_grouped_forward.h        |   3 +-
 .../attention/hip_fmha/ck_fmha_op_helper.h    |  41 ++++
 .../csrc/attention/hip_fmha/ck_fmha_params.h  | 200 +++++++++++++++++
 .../csrc/attention/hip_fmha/ck_fmha_util.h    | 202 ------------------
 10 files changed, 253 insertions(+), 209 deletions(-)
 create mode 100644 xformers/csrc/attention/hip_fmha/ck_fmha_op_helper.h
 create mode 100644 xformers/csrc/attention/hip_fmha/ck_fmha_params.h

diff --git a/xformers/csrc/attention/hip_fmha/attention_backward_generic.cpp b/xformers/csrc/attention/hip_fmha/attention_backward_generic.cpp
index 0faf23be9..e82f0ef80 100644
--- a/xformers/csrc/attention/hip_fmha/attention_backward_generic.cpp
+++ b/xformers/csrc/attention/hip_fmha/attention_backward_generic.cpp
@@ -9,6 +9,7 @@
 #include <c10/cuda/CUDAGuard.h>
 #include <torch/library.h>
 
+#include "ck_fmha_params.h"
 #include "ck_fmha_util.h"
 
 extern void batched_backward_fp16(
diff --git a/xformers/csrc/attention/hip_fmha/attention_ck_rand_uniform.cpp b/xformers/csrc/attention/hip_fmha/attention_ck_rand_uniform.cpp
index b786b0837..5aab03568 100644
--- a/xformers/csrc/attention/hip_fmha/attention_ck_rand_uniform.cpp
+++ b/xformers/csrc/attention/hip_fmha/attention_ck_rand_uniform.cpp
@@ -10,14 +10,13 @@
 #include <torch/library.h>
 #include <torch/types.h>
 
-#include <sstream>
-#include <stdexcept>
-
 #include <ck/ck.hpp>
 #include <ck/tensor_operation/gpu/device/gemm_specialization.hpp>
 #include <ck/tensor_operation/gpu/device/tensor_specialization.hpp>
 #include "ck/tensor_operation/gpu/device/impl/device_batched_dropout.hpp"
 
+#include "ck_fmha_util.h"
+
 namespace {
 
 /**
diff --git a/xformers/csrc/attention/hip_fmha/attention_forward_generic.cpp b/xformers/csrc/attention/hip_fmha/attention_forward_generic.cpp
index 89786cccd..87a45e158 100644
--- a/xformers/csrc/attention/hip_fmha/attention_forward_generic.cpp
+++ b/xformers/csrc/attention/hip_fmha/attention_forward_generic.cpp
@@ -11,6 +11,7 @@
 #include <c10/util/Optional.h>
 #include <torch/library.h>
 
+#include "ck_fmha_params.h"
 #include "ck_fmha_util.h"
 
 extern void batched_forward_fp16(
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_backward.h b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_backward.h
index 9c2466214..136a6b0aa 100644
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_backward.h
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_backward.h
@@ -9,7 +9,8 @@
 #include <ck/tensor_operation/gpu/element/element_wise_operation.hpp>
 #include "ck/tensor_operation/gpu/device/impl/device_batched_mha_bwd_xdl_cshuffle_qloop_v2.hpp"
 
-#include "ck_fmha_util.h"
+#include "ck_fmha_op_helper.h"
+#include "ck_fmha_params.h"
 
 template <typename scalar_t, int32_t custom_mask_type, bool has_attn_bias>
 void batched_backward_masktype_attnbias_dispatched(
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_forward.h b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_forward.h
index b2daa90c2..f63c70dd5 100644
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_forward.h
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_forward.h
@@ -9,7 +9,8 @@
 #include <ck/tensor_operation/gpu/element/element_wise_operation.hpp>
 #include "ck/tensor_operation/gpu/device/impl/device_batched_mha_fwd_xdl_cshuffle_v2.hpp"
 
-#include "ck_fmha_util.h"
+#include "ck_fmha_op_helper.h"
+#include "ck_fmha_params.h"
 
 template <typename scalar_t, int32_t custom_mask_type, bool has_attn_bias>
 void batched_forward_masktype_attnbias_dispatched(
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_backward.h b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_backward.h
index 620ebf26c..161067616 100644
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_backward.h
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_backward.h
@@ -10,7 +10,8 @@
 #include <ck/utility/sequence.hpp>
 #include "ck/tensor_operation/gpu/device/impl/device_grouped_mha_bwd_xdl_cshuffle_qloop_v2.hpp"
 
-#include "ck_fmha_util.h"
+#include "ck_fmha_op_helper.h"
+#include "ck_fmha_params.h"
 
 template <typename scalar_t, int32_t custom_mask_type, bool has_attn_bias>
 void grouped_backward_masktype_attnbias_dispatched(
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_forward.h b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_forward.h
index 4f3d9a985..9c23e1676 100644
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_forward.h
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_forward.h
@@ -10,7 +10,8 @@
 #include <ck/tensor_operation/gpu/element/element_wise_operation.hpp>
 #include <ck/utility/sequence.hpp>
 
-#include "ck_fmha_util.h"
+#include "ck_fmha_op_helper.h"
+#include "ck_fmha_params.h"
 
 template <typename scalar_t, int32_t custom_mask_type, bool has_attn_bias>
 void grouped_forward_masktype_attnbias_dispatched(
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_op_helper.h b/xformers/csrc/attention/hip_fmha/ck_fmha_op_helper.h
new file mode 100644
index 000000000..ffc53514b
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_op_helper.h
@@ -0,0 +1,41 @@
+#pragma once
+
+#include <torch/torch.h>
+
+#include <ck/utility/data_type.hpp>
+
+template <typename scalar_t>
+struct MaxVectorSizeForType {
+  static constexpr int value = 4;
+};
+
+template <>
+struct MaxVectorSizeForType<ck::half_t> {
+  static constexpr int value = 8;
+};
+
+template <>
+struct MaxVectorSizeForType<ck::bhalf_t> {
+  static constexpr int value = 8;
+};
+
+struct SimpleDeviceMem {
+  SimpleDeviceMem() = delete;
+  SimpleDeviceMem(std::size_t mem_size) {
+    auto options = torch::TensorOptions();
+    mem = at::empty(
+        mem_size, options.dtype(at::ScalarType::Byte).device(torch::kCUDA));
+  }
+  void* GetDeviceBuffer() {
+    return mem.data_ptr();
+  }
+  ~SimpleDeviceMem() {}
+
+  at::Tensor mem;
+};
+
+// useful aliasing for making the codes easy
+template <ck::index_t... Is>
+using S = ck::Sequence<Is...>;
+
+using F32 = float;
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_params.h b/xformers/csrc/attention/hip_fmha/ck_fmha_params.h
new file mode 100644
index 000000000..50c478c33
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_params.h
@@ -0,0 +1,200 @@
+#pragma once
+
+#include <array>
+#include <cstdint>
+
+#include <ATen/cuda/CUDAGeneratorImpl.h>
+
+struct BatchedInferParams {
+  int B; // batch size
+  int M; // seq_len for Query
+  int N; // seq_len for Key and Value
+  int num_heads; //
+  int K; // embed_dim for Query and Key
+  int Kv; // embed_dim for Value
+
+  float scale;
+  bool has_attn_bias;
+
+  // BMHK mode strides
+  std::array<int, 4> q_strides;
+  std::array<int, 4> k_strides;
+  std::array<int, 4> v_strides;
+  std::array<int, 4> out_strides;
+  std::array<int, 4> attn_bias_strides; // 4d tensor_view [B, H, M, N]
+
+  const void* q_ptr;
+  const void* k_ptr;
+  const void* v_ptr;
+  const void* attn_bias_ptr;
+
+  uint8_t custom_mask_type;
+
+  void* out_ptr;
+};
+
+struct BatchedForwardParams : public BatchedInferParams {
+  bool use_dropout;
+  bool compute_logsumexp;
+
+  float dropout_prob;
+  at::PhiloxCudaState rng_engine_inputs;
+
+  // BHMN mode strides, completely contiguous
+  std::array<int32_t, 4> randvals_strides;
+  void* randvals_ptr;
+
+  // completely contiguous
+  void* logsumexp_ptr;
+};
+
+struct GroupedInferParams {
+  int num_batches;
+  int M; // total seq_len for all queries in the batch
+  int N; // total seq_len for all keys/values in the batch
+  int num_heads; //
+  int K; // embed_dim for Query and Key
+  int Kv; // embed_dim for Value
+
+  std::vector<int> host_seqstart_q;
+  std::vector<int> host_seqstart_k;
+  std::vector<int> host_seqlen_k;
+
+  float scale;
+  bool has_attn_bias;
+
+  // MHK mode strides, last-dim contiguous
+  std::array<int, 3> q_strides;
+  std::array<int, 3> k_strides;
+  std::array<int, 3> v_strides;
+  std::array<int, 3> out_strides;
+
+  // 4d tensor view [B, H, M, N]
+  std::array<int, 4> attn_bias_strides;
+
+  std::vector<const void*> q_ptrs;
+  std::vector<const void*> k_ptrs;
+  std::vector<const void*> v_ptrs;
+  std::vector<const void*> attn_bias_ptrs;
+  std::vector<void*> out_ptrs;
+
+  uint8_t custom_mask_type;
+};
+
+struct GroupedForwardParams : public GroupedInferParams {
+  bool use_dropout;
+  bool compute_logsumexp;
+
+  float dropout_prob;
+  at::PhiloxCudaState rng_engine_inputs;
+
+  // HMN mode strides, completely contiguous
+  std::array<int, 3> randvals_strides;
+  std::vector<void*> randvals_ptrs;
+
+  // completely contiguous
+  std::vector<void*> logsumexp_ptrs;
+};
+
+struct BatchedBackwardParams {
+  int B; // batch size
+  int M; // seq_len for Query
+  int N; // seq_len for Key and Value
+  int num_heads; //
+  int K; // embed_dim for Query and Key
+  int Kv; // embed_dim for Value
+
+  float scale;
+  bool has_attn_bias;
+
+  // BMHK mode strides, last-dim contiguous
+  std::array<int, 4> q_strides;
+  std::array<int, 4> k_strides;
+  std::array<int, 4> v_strides;
+  std::array<int, 4> attn_bias_strides; // 4d tensor_view [B, H, M, N]
+  std::array<int, 4> out_strides;
+
+  const void* q_ptr;
+  const void* k_ptr;
+  const void* v_ptr;
+  const void* attn_bias_ptr;
+  const void* out_ptr;
+
+  uint8_t custom_mask_type;
+
+  std::array<int, 4> grad_out_strides;
+
+  const void* grad_out_ptr;
+
+  void* grad_q_ptr;
+  void* grad_k_ptr;
+  void* grad_v_ptr;
+  // void* grad_bias_ptr;
+
+  float dropout_prob;
+  at::PhiloxCudaState rng_engine_inputs;
+
+  // completely contiguous
+  const void* logsumexp_ptr;
+
+  // BHMN mode strides, completely contiguous
+  std::array<int, 4> randvals_strides;
+  void* randvals_ptr;
+
+  int64_t rng_seed;
+  int64_t rng_offset;
+};
+
+struct GroupedBackwardParams {
+  int num_batches;
+  int M; // total seq_len for all queries in the batch
+  int N; // total seq_len for all keys/values in the batch
+  int num_heads; //
+  int K; // embed_dim for Query and Key
+  int Kv; // embed_dim for Value
+
+  std::vector<int> host_seqstart_q;
+  std::vector<int> host_seqstart_k;
+  std::vector<int> host_seqlen_k;
+
+  float scale;
+  bool has_attn_bias;
+
+  // MHK mode strides, last-dim contiguous
+  std::array<int, 3> q_strides;
+  std::array<int, 3> k_strides;
+  std::array<int, 3> v_strides;
+  std::array<int, 3> out_strides;
+  // 4d tensor view [B, H, M, N]
+  std::array<int, 4> attn_bias_strides;
+
+  std::vector<const void*> q_ptrs;
+  std::vector<const void*> k_ptrs;
+  std::vector<const void*> v_ptrs;
+  std::vector<const void*> attn_bias_ptrs;
+  std::vector<const void*> out_ptrs;
+
+  uint8_t custom_mask_type;
+
+  std::array<int, 3> grad_out_strides;
+
+  std::vector<const void*> grad_out_ptrs;
+
+  std::vector<void*> grad_q_ptrs;
+  std::vector<void*> grad_k_ptrs;
+  std::vector<void*> grad_v_ptrs;
+  // std::vector<void *> grad_bias_ptrs;
+
+  float dropout_prob;
+  at::PhiloxCudaState rng_engine_inputs;
+
+  // HM mode strides, completely contiguous
+  std::vector<const void*> logsumexp_ptrs;
+
+  // HMN mode strides, completely contiguous
+  std::array<int, 3> randvals_strides;
+  std::vector<void*> randvals_ptrs;
+
+  int64_t rng_seed;
+  int64_t rng_offset;
+};
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_util.h b/xformers/csrc/attention/hip_fmha/ck_fmha_util.h
index 851c8dbda..9e4d0e5fa 100644
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_util.h
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_util.h
@@ -1,6 +1,5 @@
 #pragma once
 
-#include <array>
 #include <cstdint>
 #include <iostream>
 #include <sstream>
@@ -8,7 +7,6 @@
 
 #include <torch/torch.h>
 
-#include <ATen/cuda/CUDAGeneratorImpl.h>
 #include <ck/ck.hpp>
 #include <ck/utility/data_type.hpp>
 #include <ck/utility/sequence.hpp>
@@ -178,203 +176,3 @@ struct SimpleDeviceMem {
 
   at::Tensor mem;
 };
-
-struct BatchedInferParams {
-  int B; // batch size
-  int M; // seq_len for Query
-  int N; // seq_len for Key and Value
-  int num_heads; //
-  int K; // embed_dim for Query and Key
-  int Kv; // embed_dim for Value
-
-  float scale;
-  bool has_attn_bias;
-
-  // BMHK mode strides
-  std::array<int, 4> q_strides;
-  std::array<int, 4> k_strides;
-  std::array<int, 4> v_strides;
-  std::array<int, 4> out_strides;
-  std::array<int, 4> attn_bias_strides; // 4d tensor_view [B, H, M, N]
-
-  const void* q_ptr;
-  const void* k_ptr;
-  const void* v_ptr;
-  const void* attn_bias_ptr;
-
-  uint8_t custom_mask_type;
-
-  void* out_ptr;
-};
-
-struct BatchedForwardParams : public BatchedInferParams {
-  bool use_dropout;
-  bool compute_logsumexp;
-
-  float dropout_prob;
-  at::PhiloxCudaState rng_engine_inputs;
-
-  // BHMN mode strides, completely contiguous
-  std::array<int32_t, 4> randvals_strides;
-  void* randvals_ptr;
-
-  // completely contiguous
-  void* logsumexp_ptr;
-};
-
-struct GroupedInferParams {
-  int num_batches;
-  int M; // total seq_len for all queries in the batch
-  int N; // total seq_len for all keys/values in the batch
-  int num_heads; //
-  int K; // embed_dim for Query and Key
-  int Kv; // embed_dim for Value
-
-  std::vector<int> host_seqstart_q;
-  std::vector<int> host_seqstart_k;
-  std::vector<int> host_seqlen_k;
-
-  float scale;
-  bool has_attn_bias;
-
-  // MHK mode strides, last-dim contiguous
-  std::array<int, 3> q_strides;
-  std::array<int, 3> k_strides;
-  std::array<int, 3> v_strides;
-  std::array<int, 3> out_strides;
-
-  // 4d tensor view [B, H, M, N]
-  std::array<int, 4> attn_bias_strides;
-
-  std::vector<const void*> q_ptrs;
-  std::vector<const void*> k_ptrs;
-  std::vector<const void*> v_ptrs;
-  std::vector<const void*> attn_bias_ptrs;
-  std::vector<void*> out_ptrs;
-
-  uint8_t custom_mask_type;
-};
-
-struct GroupedForwardParams : public GroupedInferParams {
-  bool use_dropout;
-  bool compute_logsumexp;
-
-  float dropout_prob;
-  at::PhiloxCudaState rng_engine_inputs;
-
-  // HMN mode strides, completely contiguous
-  std::array<int, 3> randvals_strides;
-  std::vector<void*> randvals_ptrs;
-
-  // completely contiguous
-  std::vector<void*> logsumexp_ptrs;
-};
-
-struct BatchedBackwardParams {
-  int B; // batch size
-  int M; // seq_len for Query
-  int N; // seq_len for Key and Value
-  int num_heads; //
-  int K; // embed_dim for Query and Key
-  int Kv; // embed_dim for Value
-
-  float scale;
-  bool has_attn_bias;
-
-  // BMHK mode strides, last-dim contiguous
-  std::array<int, 4> q_strides;
-  std::array<int, 4> k_strides;
-  std::array<int, 4> v_strides;
-  std::array<int, 4> attn_bias_strides; // 4d tensor_view [B, H, M, N]
-  std::array<int, 4> out_strides;
-
-  const void* q_ptr;
-  const void* k_ptr;
-  const void* v_ptr;
-  const void* attn_bias_ptr;
-  const void* out_ptr;
-
-  uint8_t custom_mask_type;
-
-  std::array<int, 4> grad_out_strides;
-
-  const void* grad_out_ptr;
-
-  void* grad_q_ptr;
-  void* grad_k_ptr;
-  void* grad_v_ptr;
-  // void* grad_bias_ptr;
-
-  float dropout_prob;
-  at::PhiloxCudaState rng_engine_inputs;
-
-  // completely contiguous
-  const void* logsumexp_ptr;
-
-  // BHMN mode strides, completely contiguous
-  std::array<int, 4> randvals_strides;
-  void* randvals_ptr;
-
-  int64_t rng_seed;
-  int64_t rng_offset;
-};
-
-struct GroupedBackwardParams {
-  int num_batches;
-  int M; // total seq_len for all queries in the batch
-  int N; // total seq_len for all keys/values in the batch
-  int num_heads; //
-  int K; // embed_dim for Query and Key
-  int Kv; // embed_dim for Value
-
-  std::vector<int> host_seqstart_q;
-  std::vector<int> host_seqstart_k;
-  std::vector<int> host_seqlen_k;
-
-  float scale;
-  bool has_attn_bias;
-
-  // MHK mode strides, last-dim contiguous
-  std::array<int, 3> q_strides;
-  std::array<int, 3> k_strides;
-  std::array<int, 3> v_strides;
-  std::array<int, 3> out_strides;
-  // 4d tensor view [B, H, M, N]
-  std::array<int, 4> attn_bias_strides;
-
-  std::vector<const void*> q_ptrs;
-  std::vector<const void*> k_ptrs;
-  std::vector<const void*> v_ptrs;
-  std::vector<const void*> attn_bias_ptrs;
-  std::vector<const void*> out_ptrs;
-
-  uint8_t custom_mask_type;
-
-  std::array<int, 3> grad_out_strides;
-
-  std::vector<const void*> grad_out_ptrs;
-
-  std::vector<void*> grad_q_ptrs;
-  std::vector<void*> grad_k_ptrs;
-  std::vector<void*> grad_v_ptrs;
-  // std::vector<void *> grad_bias_ptrs;
-
-  float dropout_prob;
-  at::PhiloxCudaState rng_engine_inputs;
-
-  // HM mode strides, completely contiguous
-  std::vector<const void*> logsumexp_ptrs;
-
-  // HMN mode strides, completely contiguous
-  std::array<int, 3> randvals_strides;
-  std::vector<void*> randvals_ptrs;
-
-  int64_t rng_seed;
-  int64_t rng_offset;
-};
-
-// useful aliasing for making the codes easy
-template <ck::index_t... Is>
-using S = ck::Sequence<Is...>;
-
-using F32 = float;

From 1299d4d63418d65407b057b9af2870e9fd8c53f3 Mon Sep 17 00:00:00 2001
From: Qianfeng Zhang <Qianfeng.Zhang@amd.com>
Date: Tue, 5 Sep 2023 15:31:48 +0000
Subject: [PATCH 044/641] Update and get the test_dropout passes all comparison
 tests

---
 .gitignore                                    |  7 +++++
 tests/test_mem_eff_attention_ck.py            | 24 ++++++++-------
 .../hip_fmha/attention_forward_generic.cpp    |  2 ++
 .../csrc/attention/hip_fmha/ck_fmha_util.h    | 29 -------------------
 4 files changed, 22 insertions(+), 40 deletions(-)

diff --git a/.gitignore b/.gitignore
index 38b453363..56869b496 100644
--- a/.gitignore
+++ b/.gitignore
@@ -60,3 +60,10 @@ outputs
 xformers/_flash_attn
 xformers/version.py
 xformers/cpp_lib.json
+
+## temporary files
+xformers/csrc/attention/hip_fmha/*.cu
+xformers/csrc/attention/hip_fmha/*.hip
+xformers/csrc/attention/hip_fmha/*_hip.h
+
+
diff --git a/tests/test_mem_eff_attention_ck.py b/tests/test_mem_eff_attention_ck.py
index 8a44de2d8..bbede9f2b 100644
--- a/tests/test_mem_eff_attention_ck.py
+++ b/tests/test_mem_eff_attention_ck.py
@@ -882,8 +882,7 @@ def _get_drop_mask(op, batch_size, q_len, kv_len, p, device):
         mask = torch.empty((batch_size, 1, q_len, kv_len), device=device)
         ## rand_uniform is an int32 tensor
         rand_uniform = torch.ops.xformers._ck_rand_uniform(p, mask)
-        mask = (rand_uniform > int(p*65535)).to(torch.float32)
-        print("call _ck_rand_uniform passed")
+        mask = (rand_uniform <= int((1.0-p)*65535.0)).to(torch.float32)
         mask = mask.reshape(batch_size, q_len, kv_len)
     else:
         mask = torch.empty((batch_size, q_len, kv_len), device=device)
@@ -900,14 +899,15 @@ def _get_drop_mask(op, batch_size, q_len, kv_len, p, device):
 @pytest.mark.parametrize("batch_size", [1, 2])
 @pytest.mark.parametrize("kv_len", [3, 15, 32, 33, 65])
 @pytest.mark.parametrize("q_len", [2, 33])
-@pytest.mark.parametrize("op", ALL_FW_OPS, ids=list(map(lambda t: t.NAME, ALL_FW_OPS)))
-def test_dropout(op, q_len, kv_len, batch_size, k_len, p, seed, attn_bias):
+@pytest.mark.parametrize("dtype", [torch.half, torch.bfloat16])
+def test_dropout(dtype, q_len, kv_len, batch_size, k_len, p, seed, attn_bias):
     device = "cuda"
-    scale = 3
-    query = torch.randn((batch_size, q_len, k_len), device=device) * scale
-    key = torch.randn((batch_size, kv_len, k_len), device=device) * scale
-    value = torch.randn((batch_size, kv_len, k_len), device=device) * scale
-
+    scale = 0.05 
+    query = torch.randn((batch_size, q_len, k_len), device=device, dtype=dtype) * scale
+    key = torch.randn((batch_size, kv_len, k_len), device=device, dtype=dtype) * scale
+    value = torch.randn((batch_size, kv_len, k_len), device=device, dtype=dtype) * scale
+    op = fmha.ck.FwOp
+  
     inputs_for_support_check = fmha.Inputs(query, key, value, attn_bias, p, None)
     if not op.supports(inputs_for_support_check):
         del query, key, value, attn_bias
@@ -928,8 +928,10 @@ def test_dropout(op, q_len, kv_len, batch_size, k_len, p, seed, attn_bias):
     torch.manual_seed(seed)
     mask = _get_drop_mask(op, batch_size, q_len, kv_len, p, device)
     ref = ref_attention(query, key, value, attn_bias, mask, p)
-    assert_allclose(out, ref, atol=2e-4), f"{(out - ref).abs().max()}"
+    assert_allclose(out.float(), ref, atol=3e-3, rtol=5e-4), f"{(out - ref).abs().max()}"
 
+    ## CK generated random numbers failed with the binomtest
+    '''
     num_trials = 1000
     p_val_tol = 1e-6
     keep_prob = 1 - p
@@ -943,7 +945,7 @@ def test_dropout(op, q_len, kv_len, batch_size, k_len, p, seed, attn_bias):
     masks = masks.sum(0).flatten()
     p_values = _vec_binom_test(masks, num_trials, p=keep_prob)
     assert all(p_values > p_val_tol)
-
+    '''
 
 def _test_dropout_backward(q_len, kv_len, batch_size, k, p, op, dtype):
     if dtype is torch.bfloat16 and compute_capability < (8, 0):
diff --git a/xformers/csrc/attention/hip_fmha/attention_forward_generic.cpp b/xformers/csrc/attention/hip_fmha/attention_forward_generic.cpp
index 87a45e158..1653c9a3f 100644
--- a/xformers/csrc/attention/hip_fmha/attention_forward_generic.cpp
+++ b/xformers/csrc/attention/hip_fmha/attention_forward_generic.cpp
@@ -400,6 +400,8 @@ efficient_attention_forward_ck(
     }
   });
 
+  // torch::save(randvals, "randvals_dev.zip"); 
+
   std::memcpy(&seed, &rng_engine_inputs.seed_, sizeof(seed));
   std::memcpy(&offset, &rng_engine_inputs.offset_.val, sizeof(offset));
 
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_util.h b/xformers/csrc/attention/hip_fmha/ck_fmha_util.h
index 9e4d0e5fa..345914716 100644
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_util.h
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_util.h
@@ -147,32 +147,3 @@ inline at::Tensor get_bias_4d_view(
   }
 }
 
-template <typename scalar_t>
-struct MaxVectorSizeForType {
-  static constexpr int value = 4;
-};
-
-template <>
-struct MaxVectorSizeForType<ck::half_t> {
-  static constexpr int value = 8;
-};
-
-template <>
-struct MaxVectorSizeForType<ck::bhalf_t> {
-  static constexpr int value = 8;
-};
-
-struct SimpleDeviceMem {
-  SimpleDeviceMem() = delete;
-  SimpleDeviceMem(std::size_t mem_size) {
-    auto options = torch::TensorOptions();
-    mem = at::empty(
-        mem_size, options.dtype(at::ScalarType::Byte).device(torch::kCUDA));
-  }
-  void* GetDeviceBuffer() {
-    return mem.data_ptr();
-  }
-  ~SimpleDeviceMem() {}
-
-  at::Tensor mem;
-};

From 6af177bd9249832510366f8f222b32c970151846 Mon Sep 17 00:00:00 2001
From: Qianfeng Zhang <Qianfeng.Zhang@amd.com>
Date: Tue, 5 Sep 2023 17:24:52 +0000
Subject: [PATCH 045/641] Use CUDAGenerator to get PhiloxCudaState for {seed,
 offset}

---
 .../benchmark_mem_eff_attn_decoder_ck.py      | 186 ++++++++++++++++++
 .../hip_fmha/attention_backward_generic.cpp   |  13 +-
 .../hip_fmha/attention_ck_rand_uniform.cpp    |  20 +-
 .../hip_fmha/attention_forward_generic.cpp    |  29 +--
 .../hip_fmha/ck_fmha_batched_backward.h       |   3 +-
 .../hip_fmha/ck_fmha_batched_forward.h        |   9 +-
 .../hip_fmha/ck_fmha_grouped_backward.h       |   3 +-
 .../hip_fmha/ck_fmha_grouped_forward.h        |   6 +-
 .../csrc/attention/hip_fmha/ck_fmha_params.h  |  20 +-
 9 files changed, 239 insertions(+), 50 deletions(-)
 create mode 100644 xformers/benchmarks/benchmark_mem_eff_attn_decoder_ck.py

diff --git a/xformers/benchmarks/benchmark_mem_eff_attn_decoder_ck.py b/xformers/benchmarks/benchmark_mem_eff_attn_decoder_ck.py
new file mode 100644
index 000000000..0e81d2a7a
--- /dev/null
+++ b/xformers/benchmarks/benchmark_mem_eff_attn_decoder_ck.py
@@ -0,0 +1,186 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
+#
+# This source code is licensed under the BSD license found in the
+# LICENSE file in the root directory of this source tree.
+
+
+import itertools
+from functools import partial
+
+import torch
+from torch.utils import benchmark
+from utils import benchmark_main_helper
+
+import xformers.ops
+import xformers.ops.fmha as fmha
+
+torch.backends.cuda.matmul.allow_tf32 = False
+
+# Run with
+#  python xformers/benchmarks/benchmark_mem_eff_attn_decoder.py --omit-baselines --quiet
+# The baselines for these benchmarks are really slow because there is
+# so much padding in the inputs, so there is no point running them.
+
+
+def ref_attention_bmk(q, k, v, attn_bias=None):
+    if isinstance(attn_bias, xformers.ops.AttentionMask):
+        attn_bias = (
+            attn_bias.materialize((q.shape[0], 1, q.shape[1], k.shape[1]))
+            .to(q)
+            .squeeze()
+        )
+    q = q * (1.0 / q.shape[-1] ** 0.5)
+    if attn_bias is None:
+        attn = q @ k.transpose(-2, -1)
+    else:
+        # equivalent to (q @ k.transpose(-2, -1) + m).softmax(-1) @ v
+        # but faster, and is what is used in PyTorch now
+        attn = torch.baddbmm(attn_bias, q, k.transpose(-2, -1))
+    attn = attn.softmax(-1)
+    return attn @ v
+
+
+def ref_attention(q, k, v, attn_bias):
+    assert q.ndim == 4
+
+    def T(t):
+        return t.permute((0, 2, 1, 3)).reshape(
+            [t.shape[0] * t.shape[2], t.shape[1], t.shape[3]]
+        )
+
+    out = ref_attention_bmk(T(q), T(k), T(v), attn_bias)
+    out = out.reshape([q.shape[0], q.shape[2], q.shape[1], v.shape[3]])
+    return out.permute((0, 2, 1, 3))
+
+
+min_run_time = 0.5
+device = torch.device("cuda")
+
+NUM_THREADS = [1] if device.type == "cuda" else [1, 40]
+
+OPS = [
+    xformers.ops.fmha.ck.FwOp,
+]
+
+KV_SHAPES = [
+    # list of n_keys, padding_length, batchsize
+    (2, 64, 3),
+    (32, 1024, 500),
+    (1000, 1024, 2),
+    (8000, 8192, 1),
+    (240, 256, 32),
+    (2048, 2 * 1024, 4),
+    (4096 * 2, 8 * 1024, 1),
+]
+
+N_HEADS = [8, 16, 64]
+
+
+def product_dict(**kwargs):
+    keys = kwargs.keys()
+    vals = kwargs.values()
+    for instance in itertools.product(*vals):
+        yield dict(zip(keys, instance))
+
+
+CASES = list(
+    product_dict(
+        kv_shape=KV_SHAPES,
+        n_heads=N_HEADS,
+        num_threads=NUM_THREADS,
+        multiquery=[True, False],
+    )
+)
+
+
+def mem_eff_attention_decoder(
+    kv_shape, n_heads: int, num_threads: int, multiquery: bool
+):
+    n_keys, padding, B = kv_shape
+    torch.manual_seed(42)
+    k_seqlen = torch.randint(1, n_keys + 1, (B,)).tolist()
+    K = 128
+
+    q = torch.rand(1, B, n_heads, K, device=device, dtype=torch.bfloat16)
+    if multiquery:
+        k = torch.rand(
+            1, B * padding, 1, K, device=device, dtype=torch.bfloat16
+        ).expand(1, B * padding, n_heads, K)
+        v = torch.rand(
+            1, B * padding, 1, K, device=device, dtype=torch.bfloat16
+        ).expand(1, B * padding, n_heads, K)
+    else:
+        k = torch.rand(1, B * padding, n_heads, K, device=device, dtype=torch.bfloat16)
+        v = torch.rand(1, B * padding, n_heads, K, device=device, dtype=torch.bfloat16)
+
+    bias = fmha.attn_bias.BlockDiagonalCausalWithOffsetPaddedKeysMask.from_seqlens(
+        q_seqlen=[1] * B,
+        kv_seqlen=k_seqlen,
+        kv_padding=padding,
+    )
+
+    sub_label = f"{B}batch-{k_seqlen[0]}keys-{n_heads}heads"
+    if multiquery:
+        sub_label += "-mq"
+
+    has_run = False
+    for fw_op in OPS:
+        inp = fmha.Inputs(q, k, v, attn_bias=bias)
+        if not fw_op.supports(inp):
+            continue
+
+        fn = partial(xformers.ops.memory_efficient_attention_forward, op=fw_op)
+
+        yield benchmark.Timer(
+            stmt="fn(q, k, v, attn_bias)",
+            globals={
+                "q": q,
+                "k": k,
+                "v": v,
+                "attn_bias": bias,
+                "fn": fn,
+            },
+            label="attention",
+            description=fw_op.NAME,
+            sub_label=sub_label,
+            num_threads=num_threads,
+        )
+
+        graph = torch.cuda.CUDAGraph()
+        with torch.cuda.graph(graph):
+            fn(q, k, v, bias)
+        yield benchmark.Timer(
+            stmt="graph.replay()",
+            globals={
+                "graph": graph,
+            },
+            label="cuda graphed attention",
+            description=fw_op.NAME,
+            sub_label=sub_label,
+            num_threads=num_threads,
+        )
+
+        has_run = True
+
+    if not has_run:
+        return
+
+    RUN_BASELINES = False
+    if RUN_BASELINES:
+        yield benchmark.Timer(
+            stmt="fn(q, k, v, attn_bias)",
+            globals={
+                "q": q,
+                "k": k,
+                "v": v,
+                "attn_bias": bias,
+                "fn": ref_attention,
+            },
+            label="attention",
+            description="eager",
+            sub_label=sub_label,
+            num_threads=num_threads,
+        )
+
+
+benchmark_main_helper(mem_eff_attention_decoder, CASES, min_run_time=min_run_time)
diff --git a/xformers/csrc/attention/hip_fmha/attention_backward_generic.cpp b/xformers/csrc/attention/hip_fmha/attention_backward_generic.cpp
index e82f0ef80..c16e7725d 100644
--- a/xformers/csrc/attention/hip_fmha/attention_backward_generic.cpp
+++ b/xformers/csrc/attention/hip_fmha/attention_backward_generic.cpp
@@ -4,8 +4,6 @@
 #include <ATen/ScalarOps.h>
 #include <ATen/Tensor.h>
 #include <ATen/TensorOperators.h>
-#include <ATen/cuda/CUDAContext.h>
-#include <ATen/cuda/CUDAGeneratorImpl.h>
 #include <c10/cuda/CUDAGuard.h>
 #include <torch/library.h>
 
@@ -125,8 +123,6 @@ efficient_attention_backward_ck(
 
   at::Tensor randvals;
 
-  at::PhiloxCudaState rng_engine_inputs(rng_seed, rng_offset);
-
   auto set_batched_backward_params = [&](BatchedBackwardParams& p) {
     p.B = B;
     p.M = M;
@@ -191,7 +187,8 @@ efficient_attention_backward_ck(
     p.custom_mask_type = custom_mask_type;
 
     p.dropout_prob = static_cast<float>(dropout_p);
-    p.rng_engine_inputs = rng_engine_inputs;
+    p.philox_seed = rng_seed;
+    p.philox_offset = rng_offset;
 
     randvals = at::empty(
         {B, num_heads, M, N}, query.options().dtype(at::ScalarType::Short));
@@ -203,9 +200,6 @@ efficient_attention_backward_ck(
     p.randvals_ptr = randvals.data_ptr();
 
     p.logsumexp_ptr = logsumexp.data_ptr();
-
-    p.rng_seed = rng_seed;
-    p.rng_offset = rng_offset;
   };
 
   auto set_grouped_backward_params = [&](GroupedBackwardParams& p) {
@@ -260,7 +254,8 @@ efficient_attention_backward_ck(
       p.has_attn_bias = false;
 
     p.dropout_prob = static_cast<float>(dropout_p);
-    p.rng_engine_inputs = rng_engine_inputs;
+    p.philox_seed = rng_seed;
+    p.philox_offset = rng_offset;
 
     randvals = at::empty(
         {num_heads, M, N}, query.options().dtype(at::ScalarType::Short));
diff --git a/xformers/csrc/attention/hip_fmha/attention_ck_rand_uniform.cpp b/xformers/csrc/attention/hip_fmha/attention_ck_rand_uniform.cpp
index 5aab03568..bf45f579a 100644
--- a/xformers/csrc/attention/hip_fmha/attention_ck_rand_uniform.cpp
+++ b/xformers/csrc/attention/hip_fmha/attention_ck_rand_uniform.cpp
@@ -6,9 +6,12 @@
  * LICENSE file in the root directory of this source tree.
  */
 #include <ATen/ATen.h>
+#include <ATen/cuda/CUDAContext.h>
+#include <ATen/cuda/CUDAGeneratorImpl.h>
 #include <c10/core/TensorOptions.h>
 #include <torch/library.h>
 #include <torch/types.h>
+#include <ATen/cuda/CUDAGraphsUtils.cuh>
 
 #include <ck/ck.hpp>
 #include <ck/tensor_operation/gpu/device/gemm_specialization.hpp>
@@ -32,6 +35,21 @@ at::Tensor rand_uniform_int(
   int M = out_pattern.size(2);
   int N = out_pattern.size(3);
 
+  at::CUDAGeneratorImpl* gen =
+      at::get_generator_or_default<at::CUDAGeneratorImpl>(
+          c10::nullopt, at::cuda::detail::getDefaultCUDAGenerator());
+
+  at::PhiloxCudaState rng_engine_inputs;
+  {
+    std::lock_guard<std::mutex> lock(gen->mutex_);
+    rng_engine_inputs = gen->philox_cuda_state(B * num_heads * M * N);
+  }
+
+  const auto seeds = at::cuda::philox::unpack(rng_engine_inputs);
+
+  int64_t philox_seed = std::get<0>(seeds);
+  int64_t philox_offset = std::get<1>(seeds);
+
   at::Tensor randvals;
 
   randvals = at::empty(
@@ -87,7 +105,7 @@ at::Tensor rand_uniform_int(
       static_cast<int*>(randvals.data_ptr()),
       z_gs_ms_ns_lengths,
       z_gs_ms_ns_strides,
-      {seed, offset});
+      {philox_seed, philox_offset});
 
   dropout_invoker.Run(dropout_arg, StreamConfig{nullptr, false});
 
diff --git a/xformers/csrc/attention/hip_fmha/attention_forward_generic.cpp b/xformers/csrc/attention/hip_fmha/attention_forward_generic.cpp
index 1653c9a3f..665eb44f4 100644
--- a/xformers/csrc/attention/hip_fmha/attention_forward_generic.cpp
+++ b/xformers/csrc/attention/hip_fmha/attention_forward_generic.cpp
@@ -10,6 +10,7 @@
 #include <c10/cuda/CUDAGuard.h>
 #include <c10/util/Optional.h>
 #include <torch/library.h>
+#include <ATen/cuda/CUDAGraphsUtils.cuh>
 
 #include "ck_fmha_params.h"
 #include "ck_fmha_util.h"
@@ -108,8 +109,11 @@ efficient_attention_forward_ck(
   at::Tensor randvals;
 
   const bool use_dropout = std::fpclassify(dropout_p) != FP_ZERO;
-  at::PhiloxCudaState rng_engine_inputs;
+  int64_t philox_seed;
+  int64_t philox_offset;
+
   if (use_dropout) {
+    at::PhiloxCudaState rng_engine_inputs;
     at::CUDAGeneratorImpl* gen =
         at::get_generator_or_default<at::CUDAGeneratorImpl>(
             c10::nullopt, at::cuda::detail::getDefaultCUDAGenerator());
@@ -118,6 +122,11 @@ efficient_attention_forward_ck(
     // if using dropout, we produce 1 random number for each element of the
     // attention tensor
     rng_engine_inputs = gen->philox_cuda_state(B * num_heads * M * N);
+
+    const auto seeds = at::cuda::philox::unpack(rng_engine_inputs);
+
+    philox_seed = std::get<0>(seeds);
+    philox_offset = std::get<1>(seeds);
   }
 
   auto set_batched_forward_params = [&](BatchedForwardParams& p) {
@@ -180,14 +189,14 @@ efficient_attention_forward_ck(
     p.custom_mask_type = custom_mask_type;
 
     p.use_dropout = use_dropout;
+    p.philox_seed = philox_seed;
+    p.philox_offset = philox_offset;
     p.compute_logsumexp = compute_logsumexp;
 
     // the following parameters are only used by training forward
     if (p.use_dropout) {
       p.dropout_prob = static_cast<float>(dropout_p);
 
-      p.rng_engine_inputs = rng_engine_inputs;
-
       randvals = at::empty(
           {B, num_heads, M, N}, query.options().dtype(at::ScalarType::Short));
       p.randvals_strides = {
@@ -324,12 +333,13 @@ efficient_attention_forward_ck(
     }
 
     p.use_dropout = use_dropout;
+    p.philox_seed = philox_seed;
+    p.philox_offset = philox_offset;
     p.compute_logsumexp = compute_logsumexp;
 
     // the following parameters are only used by training forward
     if (p.use_dropout) {
       p.dropout_prob = static_cast<float>(dropout_p);
-      p.rng_engine_inputs = rng_engine_inputs;
 
       randvals = at::empty(
           {num_heads, M, N}, query.options().dtype(at::ScalarType::Short));
@@ -366,10 +376,6 @@ efficient_attention_forward_ck(
     };
   };
 
-  // uint64_t -> int64_t bitwise casting as PyTorch don't support uint64_t
-  // so just fake it as a int64_t
-  int64_t seed, offset;
-
   DISPATCH_TYPES(query.scalar_type(), [&]() {
     out = at::zeros(
         {B, M, num_heads, Kv},
@@ -400,12 +406,9 @@ efficient_attention_forward_ck(
     }
   });
 
-  // torch::save(randvals, "randvals_dev.zip"); 
-
-  std::memcpy(&seed, &rng_engine_inputs.seed_, sizeof(seed));
-  std::memcpy(&offset, &rng_engine_inputs.offset_.val, sizeof(offset));
+  // torch::save(randvals, "randvals_dev.zip");
 
-  return std::make_tuple(out, logsumexp, seed, offset);
+  return std::make_tuple(out, logsumexp, philox_seed, philox_offset);
 }
 
 } // namespace
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_backward.h b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_backward.h
index 136a6b0aa..0a7d1fcfe 100644
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_backward.h
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_backward.h
@@ -233,8 +233,7 @@ void batched_backward_masktype_attnbias_dispatched(
       QKVElementOp{},
       YElementOp{},
       param.dropout_prob,
-      std::tuple<unsigned long long, unsigned long long>(
-          param.rng_seed, param.rng_offset));
+      std::tuple<int64_t, int64_t>(param.philox_seed, param.philox_offset));
 
   SimpleDeviceMem workspace(op.GetWorkSpaceSize(arg_ptr.get()));
 
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_forward.h b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_forward.h
index f63c70dd5..f5b5dd8d9 100644
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_forward.h
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_forward.h
@@ -214,10 +214,6 @@ void batched_forward_masktype_attnbias_dispatched(
   auto b1_element_op = B1ElementOp{};
   auto c_element_op = CElementOp{};
 
-  // TODO, how to initialize seed, offset
-  const uint64_t seed = 1;
-  const uint64_t offset = 0;
-
   auto op = DeviceOpInstance{};
   auto invoker = op.MakeInvoker();
 
@@ -251,8 +247,9 @@ void batched_forward_masktype_attnbias_dispatched(
       b1_element_op,
       c_element_op,
       param.use_dropout ? param.dropout_prob : 0.0f, // dropout ratio
-      {seed, offset}); // dropout random seed and offset, offset should be at
-                       // least the number of elements on a thread
+      std::tuple<int64_t, int64_t>(
+          param.philox_seed,
+          param.philox_offset)); // dropout random seed and offset
 
   SimpleDeviceMem workspace(op.GetWorkSpaceSize(arg_ptr.get()));
 
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_backward.h b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_backward.h
index 161067616..c7c1602ae 100644
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_backward.h
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_backward.h
@@ -234,8 +234,7 @@ void grouped_backward_masktype_attnbias_dispatched(
       QKVElementOp{},
       YElementOp{},
       param.dropout_prob,
-      std::tuple<unsigned long long, unsigned long long>(
-          param.rng_seed, param.rng_offset));
+      std::tuple<int64_t, int64_t>(param.philox_seed, param.philox_offset));
 
   SimpleDeviceMem workspace(op.GetWorkSpaceSize(arg_ptr.get()));
 
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_forward.h b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_forward.h
index 9c23e1676..4a29ad39b 100644
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_forward.h
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_forward.h
@@ -215,10 +215,6 @@ void grouped_forward_masktype_attnbias_dispatched(
          {}}); // acc1_bias_gs_ms_os_strides
   }
 
-  // TODO, how to initialize seed, offset
-  const uint64_t seed = 1;
-  const uint64_t offset = 0;
-
   float alpha = param.scale;
 
   auto a_element_op = AElementOp{};
@@ -246,7 +242,7 @@ void grouped_forward_masktype_attnbias_dispatched(
       b1_element_op,
       c_element_op,
       param.use_dropout ? param.dropout_prob : 0.0f, // dropout ratio
-      {seed, offset});
+      std::tuple<int64_t, int64_t>(param.philox_seed, param.philox_offset));
 
   SimpleDeviceMem workspace(op.GetWorkSpaceSize(arg_ptr.get()));
 
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_params.h b/xformers/csrc/attention/hip_fmha/ck_fmha_params.h
index 50c478c33..b48f6fa8f 100644
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_params.h
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_params.h
@@ -3,8 +3,6 @@
 #include <array>
 #include <cstdint>
 
-#include <ATen/cuda/CUDAGeneratorImpl.h>
-
 struct BatchedInferParams {
   int B; // batch size
   int M; // seq_len for Query
@@ -38,7 +36,8 @@ struct BatchedForwardParams : public BatchedInferParams {
   bool compute_logsumexp;
 
   float dropout_prob;
-  at::PhiloxCudaState rng_engine_inputs;
+  int64_t philox_seed;
+  int64_t philox_offset;
 
   // BHMN mode strides, completely contiguous
   std::array<int32_t, 4> randvals_strides;
@@ -86,7 +85,8 @@ struct GroupedForwardParams : public GroupedInferParams {
   bool compute_logsumexp;
 
   float dropout_prob;
-  at::PhiloxCudaState rng_engine_inputs;
+  int64_t philox_seed;
+  int64_t philox_offset;
 
   // HMN mode strides, completely contiguous
   std::array<int, 3> randvals_strides;
@@ -132,7 +132,8 @@ struct BatchedBackwardParams {
   // void* grad_bias_ptr;
 
   float dropout_prob;
-  at::PhiloxCudaState rng_engine_inputs;
+  int64_t philox_seed;
+  int64_t philox_offset;
 
   // completely contiguous
   const void* logsumexp_ptr;
@@ -140,9 +141,6 @@ struct BatchedBackwardParams {
   // BHMN mode strides, completely contiguous
   std::array<int, 4> randvals_strides;
   void* randvals_ptr;
-
-  int64_t rng_seed;
-  int64_t rng_offset;
 };
 
 struct GroupedBackwardParams {
@@ -186,7 +184,8 @@ struct GroupedBackwardParams {
   // std::vector<void *> grad_bias_ptrs;
 
   float dropout_prob;
-  at::PhiloxCudaState rng_engine_inputs;
+  int64_t philox_seed;
+  int64_t philox_offset;
 
   // HM mode strides, completely contiguous
   std::vector<const void*> logsumexp_ptrs;
@@ -194,7 +193,4 @@ struct GroupedBackwardParams {
   // HMN mode strides, completely contiguous
   std::array<int, 3> randvals_strides;
   std::vector<void*> randvals_ptrs;
-
-  int64_t rng_seed;
-  int64_t rng_offset;
 };

From e72bf95d3bcc8d4d429ee2f6cd7da7e31bb71049 Mon Sep 17 00:00:00 2001
From: Qianfeng Zhang <Qianfeng.Zhang@amd.com>
Date: Tue, 5 Sep 2023 17:55:45 +0000
Subject: [PATCH 046/641] Update to test_mem_eff_attention_ck.py and
 readme_test_on_rocm.txt with test_dropout completely passed

---
 tests/readme_test_on_rocm.txt      | 6 +++++-
 tests/test_mem_eff_attention_ck.py | 7 ++-----
 2 files changed, 7 insertions(+), 6 deletions(-)

diff --git a/tests/readme_test_on_rocm.txt b/tests/readme_test_on_rocm.txt
index 392a2a427..16e283ccb 100644
--- a/tests/readme_test_on_rocm.txt
+++ b/tests/readme_test_on_rocm.txt
@@ -3,7 +3,7 @@
 
    2. verify testing for memory_efficient_attention inference 
 
-      pytest -k test_forward tests/test_mem_eff_attention_ck.py
+      pytest tests/test_mem_eff_attention_ck.py::test_forward
 
    3. The following tests in tests/memory_eff_attention_ck.py have passed
 
@@ -25,5 +25,9 @@
       * test_unsupported_stride_lastdim
       * test_unsupported_stride_alignment
       * test_cuda_streams
+      * test_dropout
 
+   4. verify testing for memory_efficient_attention forward (with dropout)
+
+      pytest tests/test_mem_eff_attention_ck.py::test_dropout
 
diff --git a/tests/test_mem_eff_attention_ck.py b/tests/test_mem_eff_attention_ck.py
index bbede9f2b..e655e3a84 100644
--- a/tests/test_mem_eff_attention_ck.py
+++ b/tests/test_mem_eff_attention_ck.py
@@ -899,14 +899,14 @@ def _get_drop_mask(op, batch_size, q_len, kv_len, p, device):
 @pytest.mark.parametrize("batch_size", [1, 2])
 @pytest.mark.parametrize("kv_len", [3, 15, 32, 33, 65])
 @pytest.mark.parametrize("q_len", [2, 33])
+@pytest.mark.parametrize("op", ALL_FW_OPS, ids=list(map(lambda t: t.NAME, ALL_FW_OPS)))
 @pytest.mark.parametrize("dtype", [torch.half, torch.bfloat16])
-def test_dropout(dtype, q_len, kv_len, batch_size, k_len, p, seed, attn_bias):
+def test_dropout(dtype, op, q_len, kv_len, batch_size, k_len, p, seed, attn_bias):
     device = "cuda"
     scale = 0.05 
     query = torch.randn((batch_size, q_len, k_len), device=device, dtype=dtype) * scale
     key = torch.randn((batch_size, kv_len, k_len), device=device, dtype=dtype) * scale
     value = torch.randn((batch_size, kv_len, k_len), device=device, dtype=dtype) * scale
-    op = fmha.ck.FwOp
   
     inputs_for_support_check = fmha.Inputs(query, key, value, attn_bias, p, None)
     if not op.supports(inputs_for_support_check):
@@ -930,8 +930,6 @@ def test_dropout(dtype, q_len, kv_len, batch_size, k_len, p, seed, attn_bias):
     ref = ref_attention(query, key, value, attn_bias, mask, p)
     assert_allclose(out.float(), ref, atol=3e-3, rtol=5e-4), f"{(out - ref).abs().max()}"
 
-    ## CK generated random numbers failed with the binomtest
-    '''
     num_trials = 1000
     p_val_tol = 1e-6
     keep_prob = 1 - p
@@ -945,7 +943,6 @@ def test_dropout(dtype, q_len, kv_len, batch_size, k_len, p, seed, attn_bias):
     masks = masks.sum(0).flatten()
     p_values = _vec_binom_test(masks, num_trials, p=keep_prob)
     assert all(p_values > p_val_tol)
-    '''
 
 def _test_dropout_backward(q_len, kv_len, batch_size, k, p, op, dtype):
     if dtype is torch.bfloat16 and compute_capability < (8, 0):

From cf04a8adf0a455b28503350754d62686ac85efa7 Mon Sep 17 00:00:00 2001
From: Qianfeng Zhang <Qianfeng.Zhang@amd.com>
Date: Tue, 5 Sep 2023 19:45:20 +0000
Subject: [PATCH 047/641] Fix in xformers/benchmarks/utils.py for file naming
 in ROCM

---
 xformers/benchmarks/utils.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/xformers/benchmarks/utils.py b/xformers/benchmarks/utils.py
index a3d10d63d..0a722846b 100644
--- a/xformers/benchmarks/utils.py
+++ b/xformers/benchmarks/utils.py
@@ -470,6 +470,7 @@ def benchmark_run_and_compare(
             .replace(" ", "_")
             .replace("-", "_")
             .replace(".", "_")
+            .replace("/", "_")
         )
     except (RuntimeError, AssertionError):  # No GPU
         env = "cpu"

From 7a3d169649332c6cede691ce818074e33840abe1 Mon Sep 17 00:00:00 2001
From: Qianfeng Zhang <Qianfeng.Zhang@amd.com>
Date: Wed, 6 Sep 2023 16:59:39 +0000
Subject: [PATCH 048/641] Remove one shape case from
 benchmark_mem_eff_attn_decoder_ck.py due to too big memory requirement

---
 xformers/benchmarks/benchmark_mem_eff_attn_decoder_ck.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/xformers/benchmarks/benchmark_mem_eff_attn_decoder_ck.py b/xformers/benchmarks/benchmark_mem_eff_attn_decoder_ck.py
index 0e81d2a7a..c700109e9 100644
--- a/xformers/benchmarks/benchmark_mem_eff_attn_decoder_ck.py
+++ b/xformers/benchmarks/benchmark_mem_eff_attn_decoder_ck.py
@@ -65,7 +65,7 @@ def T(t):
 KV_SHAPES = [
     # list of n_keys, padding_length, batchsize
     (2, 64, 3),
-    (32, 1024, 500),
+    ##(32, 1024, 500),  // this one fails due to consuming too much GPU memory
     (1000, 1024, 2),
     (8000, 8192, 1),
     (240, 256, 32),

From 59ae73fe1f45a967fe2dce36150c2f8f78a47f6c Mon Sep 17 00:00:00 2001
From: Qianfeng Zhang <Qianfeng.Zhang@amd.com>
Date: Sun, 10 Sep 2023 11:52:43 +0000
Subject: [PATCH 049/641] Remove the using of hipMemcpyAsync in C++ extension

---
 .../hip_fmha/attention_backward_generic.cpp   | 39 ++++++++-----------
 .../hip_fmha/attention_forward_generic.cpp    | 39 ++++++++-----------
 .../csrc/attention/hip_fmha/ck_fmha_util.h    | 13 -------
 3 files changed, 32 insertions(+), 59 deletions(-)

diff --git a/xformers/csrc/attention/hip_fmha/attention_backward_generic.cpp b/xformers/csrc/attention/hip_fmha/attention_backward_generic.cpp
index c16e7725d..a86b68330 100644
--- a/xformers/csrc/attention/hip_fmha/attention_backward_generic.cpp
+++ b/xformers/csrc/attention/hip_fmha/attention_backward_generic.cpp
@@ -78,14 +78,10 @@ efficient_attention_backward_ck(
   TORCH_CHECK(query.size(3) == key.size(3));
   TORCH_CHECK(value.size(3) == grad_out.size(3));
 
-  // Query, Key, Value must use the same CUDA device
-  TORCH_CHECK(query.device() == key.device());
-  TORCH_CHECK(query.device() == value.device());
-  TORCH_CHECK(query.device().type() == torch::kCUDA)
-
   // handle potentially non-contiguous grad_out through a copy
   CHECK_NOSPARSE_CONTIGUOUS_CUDA(grad_out);
 
+  // last dim is contiguous, device is CUDA
   CHECK_NOSPARSE_LASTCONTIGUOUS_CUDA(query);
   CHECK_NOSPARSE_LASTCONTIGUOUS_CUDA(key);
   CHECK_NOSPARSE_LASTCONTIGUOUS_CUDA(value);
@@ -269,18 +265,16 @@ efficient_attention_backward_ck(
     p.host_seqstart_q.resize(p.num_batches + 1);
     p.host_seqstart_k.resize(p.num_batches + 1);
 
-    FMHA_HIP_CHECK(hipMemcpyAsync(
-        p.host_seqstart_q.data(),
-        seqstart_q->data_ptr(),
-        (p.num_batches + 1) * sizeof(int),
-        hipMemcpyDeviceToHost,
-        stream));
-    FMHA_HIP_CHECK(hipMemcpyAsync(
-        p.host_seqstart_k.data(),
-        seqstart_k->data_ptr(),
-        (p.num_batches + 1) * sizeof(int),
-        hipMemcpyDeviceToHost,
-        stream));
+    auto seqstart_q_cpu = seqstart_q->to(at::kCPU);
+    auto seqstart_k_cpu = seqstart_k->to(at::kCPU);
+
+    for (int i = 0; i < p.host_seqstart_q.size(); i++)
+      p.host_seqstart_q[i] =
+          *(reinterpret_cast<int*>(seqstart_q_cpu.data_ptr()) + i);
+
+    for (int i = 0; i < p.host_seqstart_k.size(); i++)
+      p.host_seqstart_k[i] =
+          *(reinterpret_cast<int*>(seqstart_k_cpu.data_ptr()) + i);
 
     if (seqlen_k.has_value()) {
       TORCH_CHECK(seqlen_k->scalar_type() == at::ScalarType::Int);
@@ -290,12 +284,11 @@ efficient_attention_backward_ck(
 
       p.host_seqlen_k.resize(p.num_batches);
 
-      FMHA_HIP_CHECK(hipMemcpyAsync(
-          p.host_seqlen_k.data(),
-          seqlen_k->data_ptr(),
-          p.num_batches * sizeof(int32_t),
-          hipMemcpyDeviceToHost,
-          stream));
+      auto seqlen_k_cpu = seqlen_k->to(at::kCPU);
+
+      for (int i = 0; i < p.host_seqlen_k.size(); i++)
+        p.host_seqlen_k[i] =
+            *(reinterpret_cast<int*>(seqlen_k_cpu.data_ptr()) + i);
     }
 
     char* q_ptr = reinterpret_cast<char*>(query.data_ptr());
diff --git a/xformers/csrc/attention/hip_fmha/attention_forward_generic.cpp b/xformers/csrc/attention/hip_fmha/attention_forward_generic.cpp
index 665eb44f4..15cd39672 100644
--- a/xformers/csrc/attention/hip_fmha/attention_forward_generic.cpp
+++ b/xformers/csrc/attention/hip_fmha/attention_forward_generic.cpp
@@ -74,11 +74,6 @@ efficient_attention_forward_ck(
   TORCH_CHECK(query.scalar_type() == key.scalar_type());
   TORCH_CHECK(query.scalar_type() == value.scalar_type());
 
-  // Query, Key, Value must use the same CUDA device
-  TORCH_CHECK(query.device() == key.device());
-  TORCH_CHECK(query.device() == value.device());
-  TORCH_CHECK(query.device().type() == torch::kCUDA)
-
   TORCH_CHECK(seqstart_q.has_value() == seqstart_k.has_value());
   if (seqstart_q.has_value()) {
     TORCH_CHECK(seqstart_q->scalar_type() == at::ScalarType::Int);
@@ -90,6 +85,7 @@ efficient_attention_forward_ck(
     TORCH_CHECK(query.size(0) == 1, "cu_seqlen only supports batch_size=1");
   };
 
+  // last dim is contiguous, device is kCUDA
   CHECK_NOSPARSE_LASTCONTIGUOUS_CUDA(query);
   CHECK_NOSPARSE_LASTCONTIGUOUS_CUDA(key);
   CHECK_NOSPARSE_LASTCONTIGUOUS_CUDA(value);
@@ -269,18 +265,16 @@ efficient_attention_forward_ck(
     p.host_seqstart_q.resize(p.num_batches + 1);
     p.host_seqstart_k.resize(p.num_batches + 1);
 
-    FMHA_HIP_CHECK(hipMemcpyAsync(
-        p.host_seqstart_q.data(),
-        seqstart_q->data_ptr(),
-        (p.num_batches + 1) * sizeof(int32_t),
-        hipMemcpyDeviceToHost,
-        stream));
-    FMHA_HIP_CHECK(hipMemcpyAsync(
-        p.host_seqstart_k.data(),
-        seqstart_k->data_ptr(),
-        (p.num_batches + 1) * sizeof(int32_t),
-        hipMemcpyDeviceToHost,
-        stream));
+    auto seqstart_q_cpu = seqstart_q->to(at::kCPU);
+    auto seqstart_k_cpu = seqstart_k->to(at::kCPU);
+
+    for (int i = 0; i < p.host_seqstart_q.size(); i++)
+      p.host_seqstart_q[i] =
+          *(reinterpret_cast<int*>(seqstart_q_cpu.data_ptr()) + i);
+
+    for (int i = 0; i < p.host_seqstart_k.size(); i++)
+      p.host_seqstart_k[i] =
+          *(reinterpret_cast<int*>(seqstart_k_cpu.data_ptr()) + i);
 
     if (seqlen_k.has_value()) {
       TORCH_CHECK(seqlen_k->scalar_type() == at::ScalarType::Int);
@@ -290,12 +284,11 @@ efficient_attention_forward_ck(
 
       p.host_seqlen_k.resize(p.num_batches);
 
-      FMHA_HIP_CHECK(hipMemcpyAsync(
-          p.host_seqlen_k.data(),
-          seqlen_k->data_ptr(),
-          p.num_batches * sizeof(int32_t),
-          hipMemcpyDeviceToHost,
-          stream));
+      auto seqlen_k_cpu = seqlen_k->to(at::kCPU);
+
+      for (int i = 0; i < p.host_seqlen_k.size(); i++)
+        p.host_seqlen_k[i] =
+            *(reinterpret_cast<int*>(seqlen_k_cpu.data_ptr()) + i);
     }
 
     char* q_ptr = reinterpret_cast<char*>(query.data_ptr());
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_util.h b/xformers/csrc/attention/hip_fmha/ck_fmha_util.h
index 345914716..36465e34c 100644
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_util.h
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_util.h
@@ -11,18 +11,6 @@
 #include <ck/utility/data_type.hpp>
 #include <ck/utility/sequence.hpp>
 
-// Here flag can be a constant, variable or function call
-#define FMHA_HIP_CHECK(ret_or_call)                                          \
-  do {                                                                       \
-    hipError_t _tmpVal;                                                      \
-    if ((_tmpVal = ret_or_call) != hipSuccess) {                             \
-      std::ostringstream ostr;                                               \
-      ostr << "HIP Function Failed (" << __FILE__ << "," << __LINE__ << ") " \
-           << hipGetErrorString(_tmpVal);                                    \
-      throw std::runtime_error(ostr.str());                                  \
-    }                                                                        \
-  } while (0)
-
 #define XFORMERS_CHECK(COND, ERR)          \
   if (!(COND)) {                           \
     std::ostringstream ostr;               \
@@ -146,4 +134,3 @@ inline at::Tensor get_bias_4d_view(
       TORCH_CHECK(false, "bias can only have ndims in {2, 3, 4}");
   }
 }
-

From c59c10d852664b645c2129267b7fbda99f9dbcb6 Mon Sep 17 00:00:00 2001
From: Qianfeng Zhang <Qianfeng.Zhang@amd.com>
Date: Mon, 11 Sep 2023 15:36:37 +0000
Subject: [PATCH 050/641] Add hipStreamSynchronize

---
 .../csrc/attention/hip_fmha/attention_backward_generic.cpp  | 2 +-
 .../csrc/attention/hip_fmha/attention_ck_rand_uniform.cpp   | 6 +++++-
 xformers/csrc/attention/hip_fmha/ck_fmha_batched_backward.h | 1 +
 xformers/csrc/attention/hip_fmha/ck_fmha_batched_forward.h  | 2 ++
 xformers/csrc/attention/hip_fmha/ck_fmha_grouped_backward.h | 1 +
 xformers/csrc/attention/hip_fmha/ck_fmha_grouped_forward.h  | 1 +
 6 files changed, 11 insertions(+), 2 deletions(-)

diff --git a/xformers/csrc/attention/hip_fmha/attention_backward_generic.cpp b/xformers/csrc/attention/hip_fmha/attention_backward_generic.cpp
index a86b68330..ab8114e29 100644
--- a/xformers/csrc/attention/hip_fmha/attention_backward_generic.cpp
+++ b/xformers/csrc/attention/hip_fmha/attention_backward_generic.cpp
@@ -102,7 +102,7 @@ efficient_attention_backward_ck(
   }
 
   at::cuda::CUDAGuard device_guard(query.device());
-  cudaStream_t stream = at::cuda::getCurrentCUDAStream();
+  hipStream_t stream = at::cuda::getCurrentHIPStream().stream();
 
   int64_t B = query.size(0);
   int64_t M = query.size(1);
diff --git a/xformers/csrc/attention/hip_fmha/attention_ck_rand_uniform.cpp b/xformers/csrc/attention/hip_fmha/attention_ck_rand_uniform.cpp
index bf45f579a..17aed503e 100644
--- a/xformers/csrc/attention/hip_fmha/attention_ck_rand_uniform.cpp
+++ b/xformers/csrc/attention/hip_fmha/attention_ck_rand_uniform.cpp
@@ -35,6 +35,9 @@ at::Tensor rand_uniform_int(
   int M = out_pattern.size(2);
   int N = out_pattern.size(3);
 
+  at::cuda::CUDAGuard device_guard(out_pattern.device());
+  hipStream_t stream = at::cuda::getCurrentHIPStream().stream();
+
   at::CUDAGeneratorImpl* gen =
       at::get_generator_or_default<at::CUDAGeneratorImpl>(
           c10::nullopt, at::cuda::detail::getDefaultCUDAGenerator());
@@ -107,7 +110,8 @@ at::Tensor rand_uniform_int(
       z_gs_ms_ns_strides,
       {philox_seed, philox_offset});
 
-  dropout_invoker.Run(dropout_arg, StreamConfig{nullptr, false});
+  dropout_invoker.Run(dropout_arg, StreamConfig{stream, false});
+  (void)hipStreamSynchronize(stream);
 
   return randvals;
 } // namespace
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_backward.h b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_backward.h
index 0a7d1fcfe..bf9303f75 100644
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_backward.h
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_backward.h
@@ -247,4 +247,5 @@ void batched_backward_masktype_attnbias_dispatched(
   }
 
   (void)invoker.Run(arg_ptr.get(), StreamConfig{stream, false});
+  (void)hipStreamSynchronize(stream);
 };
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_forward.h b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_forward.h
index f5b5dd8d9..154e2027b 100644
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_forward.h
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_forward.h
@@ -264,4 +264,6 @@ void batched_forward_masktype_attnbias_dispatched(
   }
 
   invoker.Run(arg_ptr.get(), StreamConfig{stream, false});
+
+  (void)hipStreamSynchronize(stream);
 };
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_backward.h b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_backward.h
index c7c1602ae..d0b10c80b 100644
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_backward.h
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_backward.h
@@ -248,4 +248,5 @@ void grouped_backward_masktype_attnbias_dispatched(
   }
 
   (void)invoker.Run(arg_ptr.get(), StreamConfig{stream, false});
+  (void)hipStreamSynchronize(stream);
 };
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_forward.h b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_forward.h
index 4a29ad39b..6c96673e5 100644
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_forward.h
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_forward.h
@@ -257,4 +257,5 @@ void grouped_forward_masktype_attnbias_dispatched(
   }
 
   (void)invoker.Run(arg_ptr.get(), StreamConfig{stream, false});
+  (void)hipStreamSynchronize(stream);
 };

From 71d3dc4aba5238eb9071a3b8782f64d5e60b97d4 Mon Sep 17 00:00:00 2001
From: Qianfeng Zhang <Qianfeng.Zhang@amd.com>
Date: Mon, 11 Sep 2023 15:45:53 +0000
Subject: [PATCH 051/641] Update in C++ backward extension due to the change in
 CK FlashAttention backward

---
 third_party/composable_kernel                               | 2 +-
 third_party/flash-attention                                 | 2 +-
 xformers/csrc/attention/hip_fmha/ck_fmha_batched_backward.h | 1 +
 xformers/csrc/attention/hip_fmha/ck_fmha_grouped_backward.h | 1 +
 4 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/third_party/composable_kernel b/third_party/composable_kernel
index 4c8b47c04..172835a5f 160000
--- a/third_party/composable_kernel
+++ b/third_party/composable_kernel
@@ -1 +1 @@
-Subproject commit 4c8b47c04d8fe9d3e7074bf207590eee833fa51f
+Subproject commit 172835a5f75ca5be7d0630fea7290e52b5f106a2
diff --git a/third_party/flash-attention b/third_party/flash-attention
index 9e5e8bc91..eff9fe6b8 160000
--- a/third_party/flash-attention
+++ b/third_party/flash-attention
@@ -1 +1 @@
-Subproject commit 9e5e8bc91e30af5cdc321362b553f6c0da332e30
+Subproject commit eff9fe6b8076df59d64d7a3f464696738a3c7c24
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_backward.h b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_backward.h
index bf9303f75..04cce9ddb 100644
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_backward.h
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_backward.h
@@ -96,6 +96,7 @@ void batched_backward_masktype_attnbias_dispatched(
           128, // KPerBlock
           128, // Gemm1NPerBlock
           32, // Gemm1KPerBlock
+          64, // Gemm2KPerBlock
           8, // AK1
           8, // BK1
           2, // A1K1
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_backward.h b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_backward.h
index d0b10c80b..a7c268ceb 100644
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_backward.h
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_backward.h
@@ -97,6 +97,7 @@ void grouped_backward_masktype_attnbias_dispatched(
           128, // KPerBlock
           128, // Gemm1NPerBlock
           32, // Gemm1KPerBlock
+          64, // Gemm2KPerBlock
           8, // AK1
           8, // BK1
           2, // B1K1

From 0bb2dd929552bc9e71456bc76a88721dc742c48d Mon Sep 17 00:00:00 2001
From: Qianfeng Zhang <Qianfeng.Zhang@amd.com>
Date: Mon, 11 Sep 2023 18:27:00 +0000
Subject: [PATCH 052/641] Update to use uint8 random number generating in
 CK-FlashAttn

---
 tests/test_mem_eff_attention_ck.py | 3 ++-
 third_party/composable_kernel      | 2 +-
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/tests/test_mem_eff_attention_ck.py b/tests/test_mem_eff_attention_ck.py
index e655e3a84..49ab783c0 100644
--- a/tests/test_mem_eff_attention_ck.py
+++ b/tests/test_mem_eff_attention_ck.py
@@ -882,7 +882,8 @@ def _get_drop_mask(op, batch_size, q_len, kv_len, p, device):
         mask = torch.empty((batch_size, 1, q_len, kv_len), device=device)
         ## rand_uniform is an int32 tensor
         rand_uniform = torch.ops.xformers._ck_rand_uniform(p, mask)
-        mask = (rand_uniform <= int((1.0-p)*65535.0)).to(torch.float32)
+        ##mask = (rand_uniform <= int((1.0-p)*65535.0)).to(torch.float32)
+        mask = (rand_uniform <= int((1.0-p)*255.0)).to(torch.float32)
         mask = mask.reshape(batch_size, q_len, kv_len)
     else:
         mask = torch.empty((batch_size, q_len, kv_len), device=device)
diff --git a/third_party/composable_kernel b/third_party/composable_kernel
index 172835a5f..12dcba200 160000
--- a/third_party/composable_kernel
+++ b/third_party/composable_kernel
@@ -1 +1 @@
-Subproject commit 172835a5f75ca5be7d0630fea7290e52b5f106a2
+Subproject commit 12dcba200a082ae40a0fb5aca3f093f1cc3470c7

From d16fd612274ceda387590cbd1ce6acdafaeaa196 Mon Sep 17 00:00:00 2001
From: fsx950223 <fsx950223@outlook.com>
Date: Tue, 12 Sep 2023 22:47:07 +0800
Subject: [PATCH 053/641] add ck into dispatch

---
 xformers/ops/fmha/dispatch.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/xformers/ops/fmha/dispatch.py b/xformers/ops/fmha/dispatch.py
index 3ed6dd1cb..7bcdcbabb 100644
--- a/xformers/ops/fmha/dispatch.py
+++ b/xformers/ops/fmha/dispatch.py
@@ -8,7 +8,7 @@
 from collections import deque
 from typing import List, Sequence, Type, TypeVar
 
-from . import cutlass, decoder, flash, small_k, triton
+from . import cutlass, decoder, flash, small_k, triton, ck
 from .common import AttentionBwOpBase, AttentionFwOpBase, Inputs
 
 
@@ -78,6 +78,7 @@ def _dispatch_fw(inp: Inputs, needs_gradient: bool) -> Type[AttentionFwOpBase]:
             flash.FwOp,
             triton.FwOp,
             cutlass.FwOp,
+            ck.FwOp,
             small_k.FwOp,
         ]
     )

From 07b889c4161f3b3ff0c26a8cb123b7d5135df36f Mon Sep 17 00:00:00 2001
From: fsx950223 <fsx950223@outlook.com>
Date: Wed, 13 Sep 2023 00:13:51 +0800
Subject: [PATCH 054/641] add available condition

---
 xformers/ops/fmha/dispatch.py | 13 +++++++------
 1 file changed, 7 insertions(+), 6 deletions(-)

diff --git a/xformers/ops/fmha/dispatch.py b/xformers/ops/fmha/dispatch.py
index 7bcdcbabb..1376e6766 100644
--- a/xformers/ops/fmha/dispatch.py
+++ b/xformers/ops/fmha/dispatch.py
@@ -3,7 +3,7 @@
 # This source code is licensed under the BSD license found in the
 # LICENSE file in the root directory of this source tree.
 
-
+import torch
 import textwrap
 from collections import deque
 from typing import List, Sequence, Type, TypeVar
@@ -74,13 +74,13 @@ def _dispatch_fw(inp: Inputs, needs_gradient: bool) -> Type[AttentionFwOpBase]:
     """
 
     priority_list_ops = deque(
-        [
+        [op for op in [
             flash.FwOp,
             triton.FwOp,
-            cutlass.FwOp,
             ck.FwOp,
+            cutlass.FwOp,
             small_k.FwOp,
-        ]
+        ] if op.is_available()]
     )
     if _is_cutlass_fwd_faster_than_flash(inp):
         priority_list_ops.remove(cutlass.FwOp)
@@ -104,14 +104,15 @@ def _is_cutlassB_faster_than_flash(inp: Inputs) -> bool:
 
 
 def _dispatch_bw(inp: Inputs) -> Type[AttentionBwOpBase]:
-    priority_list_ops: List[Type[AttentionBwOpBase]] = [
+    priority_list_ops: List[Type[AttentionBwOpBase]] = [op for op in [
         flash.BwOp,
+        ck.BwOp,
         cutlass.BwOp,
         # CUDA illegal memory issues, race conditions etc..
         # triton.BwOp,
         # Deprecated
         small_k.BwOp,
-    ]
+    ] if op.is_available()]
     if _is_cutlassB_faster_than_flash(inp):
         priority_list_ops.remove(cutlass.BwOp)
         priority_list_ops.insert(0, cutlass.BwOp)

From 6f54413d948f6975ed492b4aa71b74928b06a482 Mon Sep 17 00:00:00 2001
From: Qianfeng Zhang <Qianfeng.Zhang@amd.com>
Date: Wed, 13 Sep 2023 00:02:50 +0000
Subject: [PATCH 055/641] Add global workspace allocator to enable persistent
 workspace across CUDAGraph capturing

---
 .../hip_fmha/attention_backward_generic.cpp   |  2 +-
 .../hip_fmha/attention_ck_rand_uniform.cpp    |  2 +-
 .../hip_fmha/attention_forward_generic.cpp    | 24 +++++++++-
 .../ck_fmha_global_workspace_allocator.cpp    | 44 +++++++++++++++++++
 .../ck_fmha_global_workspace_allocator.h      | 31 +++++++++++++
 .../hip_fmha/ck_fmha_grouped_forward.h        | 10 ++++-
 .../attention/hip_fmha/ck_fmha_op_helper.h    | 20 +++++----
 7 files changed, 119 insertions(+), 14 deletions(-)
 create mode 100644 xformers/csrc/attention/hip_fmha/ck_fmha_global_workspace_allocator.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/ck_fmha_global_workspace_allocator.h

diff --git a/xformers/csrc/attention/hip_fmha/attention_backward_generic.cpp b/xformers/csrc/attention/hip_fmha/attention_backward_generic.cpp
index ab8114e29..c75027705 100644
--- a/xformers/csrc/attention/hip_fmha/attention_backward_generic.cpp
+++ b/xformers/csrc/attention/hip_fmha/attention_backward_generic.cpp
@@ -101,7 +101,7 @@ efficient_attention_backward_ck(
     TORCH_CHECK(query.size(0) == 1, "seqstart_q only supports batch_size=1");
   }
 
-  at::cuda::CUDAGuard device_guard(query.device());
+  // at::cuda::CUDAGuard device_guard(query.device());
   hipStream_t stream = at::cuda::getCurrentHIPStream().stream();
 
   int64_t B = query.size(0);
diff --git a/xformers/csrc/attention/hip_fmha/attention_ck_rand_uniform.cpp b/xformers/csrc/attention/hip_fmha/attention_ck_rand_uniform.cpp
index 17aed503e..ecf73c09b 100644
--- a/xformers/csrc/attention/hip_fmha/attention_ck_rand_uniform.cpp
+++ b/xformers/csrc/attention/hip_fmha/attention_ck_rand_uniform.cpp
@@ -35,7 +35,7 @@ at::Tensor rand_uniform_int(
   int M = out_pattern.size(2);
   int N = out_pattern.size(3);
 
-  at::cuda::CUDAGuard device_guard(out_pattern.device());
+  // at::cuda::CUDAGuard device_guard(out_pattern.device());
   hipStream_t stream = at::cuda::getCurrentHIPStream().stream();
 
   at::CUDAGeneratorImpl* gen =
diff --git a/xformers/csrc/attention/hip_fmha/attention_forward_generic.cpp b/xformers/csrc/attention/hip_fmha/attention_forward_generic.cpp
index 15cd39672..eb4263536 100644
--- a/xformers/csrc/attention/hip_fmha/attention_forward_generic.cpp
+++ b/xformers/csrc/attention/hip_fmha/attention_forward_generic.cpp
@@ -90,7 +90,7 @@ efficient_attention_forward_ck(
   CHECK_NOSPARSE_LASTCONTIGUOUS_CUDA(key);
   CHECK_NOSPARSE_LASTCONTIGUOUS_CUDA(value);
 
-  at::cuda::CUDAGuard device_guard(query.device());
+  // at::cuda::CUDAGuard device_guard(query.device());
   hipStream_t stream = at::cuda::getCurrentHIPStream().stream();
 
   int64_t B = query.size(0);
@@ -100,6 +100,22 @@ efficient_attention_forward_ck(
   int64_t K = query.size(-1);
   int64_t Kv = value.size(-1);
 
+  fprintf(
+      stdout,
+      "query data pointer %p, size %lx\n",
+      query.data_ptr(),
+      at::numel(query));
+  fprintf(
+      stdout,
+      "key data pointer %p, size %lx\n",
+      key.data_ptr(),
+      at::numel(key));
+  fprintf(
+      stdout,
+      "value data pointer %p, size %lx\n",
+      value.data_ptr(),
+      at::numel(value));
+
   at::Tensor out;
   at::Tensor logsumexp;
   at::Tensor randvals;
@@ -169,6 +185,8 @@ efficient_attention_forward_ck(
       CHECK_NOSPARSE_LASTCONTIGUOUS_CUDA((*bias));
       TORCH_CHECK(bias->scalar_type() == query.scalar_type());
 
+      fprintf(stdout, "bias is not empty!\n");
+
       p.has_attn_bias = true;
       p.attn_bias_ptr = bias->data_ptr();
 
@@ -249,6 +267,8 @@ efficient_attention_forward_ck(
       CHECK_NOSPARSE_LASTCONTIGUOUS_CUDA((*bias));
       TORCH_CHECK(bias->scalar_type() == query.scalar_type());
 
+      fprintf(stdout, "bias is not empty!\n");
+
       p.has_attn_bias = true;
       const at::Tensor bias_4d_view =
           get_bias_4d_view(*bias, B, num_heads, M, N);
@@ -370,7 +390,7 @@ efficient_attention_forward_ck(
   };
 
   DISPATCH_TYPES(query.scalar_type(), [&]() {
-    out = at::zeros(
+    out = at::empty(
         {B, M, num_heads, Kv},
         query.options().dtype(CkToAtenDtype<scalar_t>::atScalarType()));
 
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_global_workspace_allocator.cpp b/xformers/csrc/attention/hip_fmha/ck_fmha_global_workspace_allocator.cpp
new file mode 100644
index 000000000..0382aa24b
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_global_workspace_allocator.cpp
@@ -0,0 +1,44 @@
+#include "ck_fmha_global_workspace_allocator.h"
+
+GlobalWorkspace::GlobalWorkspace(){};
+
+void* GlobalWorkspace::allocate(size_t sizeInBytes, hipStream_t stream) {
+  std::lock_guard<std::mutex> lck(mtx_);
+
+  auto it = buffers_.find(stream);
+
+  if (it != buffers_.end()) {
+    size_t curr_size = it->second.first;
+
+    // if requested size is bigger than existing buffer, allocate a bigger
+    // buffer; else re-use the existing buffer
+    if (curr_size < sizeInBytes) {
+      c10::cuda::HIPCachingAllocator::raw_delete(it->second.second);
+
+      void* new_buf = c10::hip::HIPCachingAllocator::raw_alloc(sizeInBytes);
+      it->second.first = sizeInBytes;
+      it->second.second = new_buf;
+
+      return new_buf;
+    } else
+      return it->second.second;
+  } else {
+    // allocate a buffer and keep it for the stream
+    void* new_buf = c10::hip::HIPCachingAllocator::raw_alloc(sizeInBytes);
+
+    auto size_buf = std::make_pair(sizeInBytes, new_buf);
+
+    buffers_.insert(std::make_pair(stream, size_buf));
+
+    return new_buf;
+  };
+};
+
+GlobalWorkspace* GlobalWorkspace::getGlobalWorkspacePtr() {
+  if (singleton_ == nullptr)
+    singleton_ = new GlobalWorkspace();
+
+  return singleton_;
+};
+
+GlobalWorkspace* GlobalWorkspace::singleton_ = nullptr;
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_global_workspace_allocator.h b/xformers/csrc/attention/hip_fmha/ck_fmha_global_workspace_allocator.h
new file mode 100644
index 000000000..9b1322f0e
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_global_workspace_allocator.h
@@ -0,0 +1,31 @@
+#pragma once
+
+#include <map>
+#include <mutex>
+
+#include <c10/hip/HIPCachingAllocator.h>
+#include <ck/ck.hpp>
+
+class GlobalWorkspace {
+ private:
+  static GlobalWorkspace* singleton_;
+
+  std::map<hipStream_t, std::pair<size_t, void*>> buffers_;
+  std::mutex mtx_;
+
+ protected:
+  GlobalWorkspace();
+
+ public:
+  // for each stream, we assume only one workspace buffer is needed, so
+  // next allocation will implicitly de-allocate or reuse previous allocation
+  // for this stream
+  void* allocate(size_t sizeInBytes, hipStream_t stream);
+
+  static GlobalWorkspace* getGlobalWorkspacePtr();
+
+  GlobalWorkspace(const GlobalWorkspace&) = delete;
+  GlobalWorkspace(GlobalWorkspace&&) = delete;
+  GlobalWorkspace& operator=(const GlobalWorkspace&) = delete;
+  GlobalWorkspace& operator=(GlobalWorkspace&&) = delete;
+};
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_forward.h b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_forward.h
index 6c96673e5..1cc4d358a 100644
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_forward.h
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_forward.h
@@ -244,9 +244,15 @@ void grouped_forward_masktype_attnbias_dispatched(
       param.use_dropout ? param.dropout_prob : 0.0f, // dropout ratio
       std::tuple<int64_t, int64_t>(param.philox_seed, param.philox_offset));
 
-  SimpleDeviceMem workspace(op.GetWorkSpaceSize(arg_ptr.get()));
+  auto sizeInBytes = op.GetWorkSpaceSize(arg_ptr.get());
 
-  op.SetWorkSpacePointer(arg_ptr.get(), workspace.GetDeviceBuffer());
+  void* workspace =
+      GlobalWorkspace::getGlobalWorkspacePtr()->allocate(sizeInBytes, stream);
+
+  fprintf(stdout, "\n[host]output pointer: %p\n", param.out_ptrs[0]);
+  fprintf(stdout, "\n[host]workspace pointer: %p\n", workspace);
+
+  op.SetWorkSpacePointer(arg_ptr.get(), workspace);
 
   if (!op.IsSupportedArgument(arg_ptr.get())) {
     std::ostringstream ostr;
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_op_helper.h b/xformers/csrc/attention/hip_fmha/ck_fmha_op_helper.h
index ffc53514b..3ca1f1325 100644
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_op_helper.h
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_op_helper.h
@@ -1,9 +1,13 @@
 #pragma once
 
-#include <torch/torch.h>
+#include <map>
+#include <mutex>
 
+#include <c10/hip/HIPCachingAllocator.h>
 #include <ck/utility/data_type.hpp>
 
+#include "ck_fmha_global_workspace_allocator.h"
+
 template <typename scalar_t>
 struct MaxVectorSizeForType {
   static constexpr int value = 4;
@@ -21,17 +25,17 @@ struct MaxVectorSizeForType<ck::bhalf_t> {
 
 struct SimpleDeviceMem {
   SimpleDeviceMem() = delete;
-  SimpleDeviceMem(std::size_t mem_size) {
-    auto options = torch::TensorOptions();
-    mem = at::empty(
-        mem_size, options.dtype(at::ScalarType::Byte).device(torch::kCUDA));
+  SimpleDeviceMem(size_t sizeInBytes) {
+    pData_ = c10::hip::HIPCachingAllocator::raw_alloc(sizeInBytes);
   }
   void* GetDeviceBuffer() {
-    return mem.data_ptr();
+    return pData_;
+  }
+  ~SimpleDeviceMem() {
+    c10::cuda::HIPCachingAllocator::raw_delete(pData_);
   }
-  ~SimpleDeviceMem() {}
 
-  at::Tensor mem;
+  void* pData_;
 };
 
 // useful aliasing for making the codes easy

From 57126e6fb953e2cc567dc56b2a30d1426e7cdc45 Mon Sep 17 00:00:00 2001
From: Qianfeng Zhang <Qianfeng.Zhang@amd.com>
Date: Wed, 13 Sep 2023 00:03:22 +0000
Subject: [PATCH 056/641] Add tests/test_ck_3.py for temporary CUDAGraph
 hacking

---
 tests/test_ck_3.py | 38 ++++++++++++++++++++++++++------------
 1 file changed, 26 insertions(+), 12 deletions(-)

diff --git a/tests/test_ck_3.py b/tests/test_ck_3.py
index 2c6e42860..31f096615 100644
--- a/tests/test_ck_3.py
+++ b/tests/test_ck_3.py
@@ -10,6 +10,8 @@
 import pytest
 import torch
 
+from functools import partial
+
 ## need to FIX
 ##from scipy.stats import binomtest
 from torch.utils.checkpoint import checkpoint
@@ -339,25 +341,25 @@ def create_tensors(
 
 ## The same set of supported attn_bias types as defined by ck.FwOp
 SUPPORTED_ATTN_BIAS_TYPES: Set[Any] = {
-        type(None),
-        torch.Tensor,
-        fmha.attn_bias.LowerTriangularMask,
-        fmha.attn_bias.LowerTriangularMaskWithTensorBias,
+        ##type(None),
+        ##torch.Tensor,
+        ##fmha.attn_bias.LowerTriangularMask,
+        #fmha.attn_bias.LowerTriangularMaskWithTensorBias,
         fmha.attn_bias.BlockDiagonalMask,
-        fmha.attn_bias.BlockDiagonalCausalMask,
-        fmha.attn_bias.BlockDiagonalCausalWithOffsetPaddedKeysMask,
-        fmha.attn_bias.BlockDiagonalCausalFromBottomRightMask,
+        ##fmha.attn_bias.BlockDiagonalCausalMask,
+        ##fmha.attn_bias.BlockDiagonalCausalWithOffsetPaddedKeysMask,
+        ##fmha.attn_bias.BlockDiagonalCausalFromBottomRightMask,
         }
 
 @pytest.mark.parametrize("bias_type", SUPPORTED_ATTN_BIAS_TYPES)
-@pytest.mark.parametrize("packed", [False, True])
-@pytest.mark.parametrize("fmt", ["BMK", "BMHK"])
-@pytest.mark.parametrize("dtype", [torch.half, torch.bfloat16])
+@pytest.mark.parametrize("packed", [True])
+@pytest.mark.parametrize("fmt", ["BMHK"])
+@pytest.mark.parametrize("dtype", [torch.half])
 def test_forward(dtype, fmt, packed, bias_type):
     op = fmha.ck.FwOp
     device = torch.device("cuda")
     batch_size = 7
-    q_len = 200
+    q_len = 100
 
     ## BottomRightMask requires generate {m0,m1,...}, {n0,n1,...} where mi <= ni
     if bias_type is fmha.attn_bias.BlockDiagonalCausalFromBottomRightMask:
@@ -411,13 +413,14 @@ def test_forward(dtype, fmt, packed, bias_type):
             # bm3hk -> 3 x bmhk
             query, key, value = xformers.ops.unbind(c, 2)
 
-        print("The query shaped for packed: ", query.size())
         assert not query.is_contiguous()
 
+    '''
     out = xformers.ops.memory_efficient_attention_forward(
         query, key, value, attn_bias, op=op
     )
     assert not out.isnan().any(), ("Output has NaNs", attn_bias)
+
     out2 = xformers.ops.memory_efficient_attention_forward(
         query, key, value, attn_bias, op=op
     )
@@ -434,4 +437,15 @@ def test_forward(dtype, fmt, packed, bias_type):
         atol=op.ERROR_ATOL[dtype],
         rtol=op.ERROR_RTOL.get(dtype, 1e-5),
     )
+    '''
+
+    fn = partial(xformers.ops.memory_efficient_attention_forward, op=op)
+
+    graph = torch.cuda.CUDAGraph()
+    with torch.cuda.graph(graph):
+        fn(query, key, value, attn_bias)
+
+    print("\nExecuting the replaying...\n")
+
+    graph.replay()
 

From a457412e0301e42cd106a3a2c43b62b47581bc4c Mon Sep 17 00:00:00 2001
From: Qianfeng Zhang <Qianfeng.Zhang@amd.com>
Date: Wed, 13 Sep 2023 00:38:36 +0000
Subject: [PATCH 057/641] Revert "add available condition"

This reverts commit 07b889c4161f3b3ff0c26a8cb123b7d5135df36f.
---
 xformers/ops/fmha/dispatch.py | 13 ++++++-------
 1 file changed, 6 insertions(+), 7 deletions(-)

diff --git a/xformers/ops/fmha/dispatch.py b/xformers/ops/fmha/dispatch.py
index 1376e6766..7bcdcbabb 100644
--- a/xformers/ops/fmha/dispatch.py
+++ b/xformers/ops/fmha/dispatch.py
@@ -3,7 +3,7 @@
 # This source code is licensed under the BSD license found in the
 # LICENSE file in the root directory of this source tree.
 
-import torch
+
 import textwrap
 from collections import deque
 from typing import List, Sequence, Type, TypeVar
@@ -74,13 +74,13 @@ def _dispatch_fw(inp: Inputs, needs_gradient: bool) -> Type[AttentionFwOpBase]:
     """
 
     priority_list_ops = deque(
-        [op for op in [
+        [
             flash.FwOp,
             triton.FwOp,
-            ck.FwOp,
             cutlass.FwOp,
+            ck.FwOp,
             small_k.FwOp,
-        ] if op.is_available()]
+        ]
     )
     if _is_cutlass_fwd_faster_than_flash(inp):
         priority_list_ops.remove(cutlass.FwOp)
@@ -104,15 +104,14 @@ def _is_cutlassB_faster_than_flash(inp: Inputs) -> bool:
 
 
 def _dispatch_bw(inp: Inputs) -> Type[AttentionBwOpBase]:
-    priority_list_ops: List[Type[AttentionBwOpBase]] = [op for op in [
+    priority_list_ops: List[Type[AttentionBwOpBase]] = [
         flash.BwOp,
-        ck.BwOp,
         cutlass.BwOp,
         # CUDA illegal memory issues, race conditions etc..
         # triton.BwOp,
         # Deprecated
         small_k.BwOp,
-    ] if op.is_available()]
+    ]
     if _is_cutlassB_faster_than_flash(inp):
         priority_list_ops.remove(cutlass.BwOp)
         priority_list_ops.insert(0, cutlass.BwOp)

From 85f0ea8bed0067a956a993fe5754b357760ee0fd Mon Sep 17 00:00:00 2001
From: Qianfeng Zhang <Qianfeng.Zhang@amd.com>
Date: Wed, 13 Sep 2023 00:39:40 +0000
Subject: [PATCH 058/641] Revert "add ck into dispatch"

This reverts commit d16fd612274ceda387590cbd1ce6acdafaeaa196.
---
 xformers/ops/fmha/dispatch.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/xformers/ops/fmha/dispatch.py b/xformers/ops/fmha/dispatch.py
index 7bcdcbabb..3ed6dd1cb 100644
--- a/xformers/ops/fmha/dispatch.py
+++ b/xformers/ops/fmha/dispatch.py
@@ -8,7 +8,7 @@
 from collections import deque
 from typing import List, Sequence, Type, TypeVar
 
-from . import cutlass, decoder, flash, small_k, triton, ck
+from . import cutlass, decoder, flash, small_k, triton
 from .common import AttentionBwOpBase, AttentionFwOpBase, Inputs
 
 
@@ -78,7 +78,6 @@ def _dispatch_fw(inp: Inputs, needs_gradient: bool) -> Type[AttentionFwOpBase]:
             flash.FwOp,
             triton.FwOp,
             cutlass.FwOp,
-            ck.FwOp,
             small_k.FwOp,
         ]
     )

From 2b499519315b19296b5f1393997a210db8c561bc Mon Sep 17 00:00:00 2001
From: Qianfeng Zhang <Qianfeng.Zhang@amd.com>
Date: Wed, 13 Sep 2023 18:03:45 +0000
Subject: [PATCH 059/641] Remove debugging info and useless script

---
 tests/test_ck_3.py                            | 451 ------------------
 .../hip_fmha/attention_forward_generic.cpp    |  20 -
 .../hip_fmha/ck_fmha_grouped_forward.h        |   3 -
 3 files changed, 474 deletions(-)
 delete mode 100644 tests/test_ck_3.py

diff --git a/tests/test_ck_3.py b/tests/test_ck_3.py
deleted file mode 100644
index 31f096615..000000000
--- a/tests/test_ck_3.py
+++ /dev/null
@@ -1,451 +0,0 @@
-# Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
-#
-# This source code is licensed under the BSD license found in the
-# LICENSE file in the root directory of this source tree.
-
-import math
-import random
-from typing import List, Optional, Sequence, Tuple, Type, TypeVar, Set, Any
-
-import pytest
-import torch
-
-from functools import partial
-
-## need to FIX
-##from scipy.stats import binomtest
-from torch.utils.checkpoint import checkpoint
-
-import xformers.ops
-from xformers.ops import fmha
-from xformers.ops.fmha.common import AttentionOpBase
-
-from tests.utils import assert_allclose
-
-torch.backends.cuda.matmul.allow_tf32 = False
-cuda_only = pytest.mark.skipif(not torch.cuda.is_available(), reason="requires CUDA")
-_devices = ["cpu", "cuda"] if torch.cuda.is_available() else ["cpu"]
-
-ALL_FW_OPS: Sequence[Type[fmha.common.AttentionFwOpBase]] = [
-    fmha.ck.FwOp,
-]
-
-T = TypeVar(
-    "T", Type[fmha.common.AttentionFwOpBase], Type[fmha.common.AttentionBwOpBase]
-)
-
-def ref_attention(q, k, v, attn_bias=None, drop_mask=None, p=0.0, scale=None):
-    if q.ndim == 4:
-        assert p == 0.0
-        return ref_attention_bmhk(q, k, v, attn_bias=attn_bias)
-    q = q.float()
-    k = k.float()
-    v = v.float()
-
-    scale = scale if scale is not None else (1 / q.shape[-1] ** 0.5)
-    q = q * scale
-
-    attn = q @ k.transpose(-2, -1)
-    if attn_bias is not None:
-        if isinstance(attn_bias, xformers.ops.AttentionBias):
-            # Always create in B,H,Mq,Mk format
-            attn_bias_tensor = attn_bias.materialize(
-                (q.shape[0], 1, q.shape[1], k.shape[1]),
-                device=q.device,
-                dtype=torch.float32,
-            )
-        else:
-            attn_bias_tensor = attn_bias
-        if attn_bias_tensor.ndim == 4:
-            assert q.shape[0] == attn_bias_tensor.shape[0] * attn_bias_tensor.shape[1]
-            attn_bias_tensor = attn_bias_tensor.reshape(
-                [-1, *attn_bias_tensor.shape[2:]]
-            )
-        attn = attn + attn_bias_tensor.float()
-    attn = attn.softmax(-1)
-    if drop_mask is not None:
-        attn = attn * (drop_mask / (1 - p))
-    return attn @ v
-
-
-def ref_attention_bmhk(q, k, v, attn_bias, scale=None) -> torch.Tensor:
-    assert q.ndim == 4
-
-    def T(t):
-        return t.permute((0, 2, 1, 3)).reshape(
-            [t.shape[0] * t.shape[2], t.shape[1], t.shape[3]]
-        )
-
-    if isinstance(attn_bias, xformers.ops.AttentionBias):
-        attn_bias = attn_bias.materialize(
-            (q.shape[0], q.shape[2], q.shape[1], k.shape[1]),
-            device=q.device,
-            dtype=torch.float32,
-        ).reshape([q.shape[0] * q.shape[2], q.shape[1], k.shape[1]])
-    out = ref_attention(T(q), T(k), T(v), attn_bias, scale=scale)
-    out = out.reshape([q.shape[0], q.shape[2], q.shape[1], v.shape[3]])
-    return out.permute((0, 2, 1, 3))
-
-
-def _rand_seqlens(
-    r: random.Random,
-    bs: int,
-    q_len: int,
-    kv_len: int,
-    more_keys_than_queries_per_block: bool,
-) -> Tuple[Sequence[int], Sequence[int]]:
-    """
-    Generates lists of lengths of query blocks and corresponding key blocks.
-    The total number of queries will be bs * q_len and the
-    total number of keys will be bs * kv_len.
-    """
-    if more_keys_than_queries_per_block:
-        assert kv_len >= q_len
-    q_len *= bs
-    kv_len *= bs
-    seqlens_q: List[int] = []
-    seqlens_k: List[int] = []
-
-    step_q = [max(1, q_len // 10), max(2, q_len // 2)]
-    step_k = [max(1, kv_len // 10), max(2, kv_len // 2)]
-    while sum(seqlens_q) < q_len and sum(seqlens_k) < kv_len:
-        num_queries = r.randrange(*step_q)
-        seqlens_q.append(num_queries)
-
-        if more_keys_than_queries_per_block:
-            # Must select at least `num_queries` keys
-            # But also leave enough keys for later
-            keys_left = kv_len - sum(seqlens_k, 0)
-            queries_left = q_len - sum(seqlens_q[:-1], 0)
-            assert keys_left >= queries_left
-            seqlens_k.append(num_queries + r.randrange(0, keys_left - queries_left))
-        else:
-            seqlens_k.append(r.randrange(*step_k))
-    seqlens_q[-1] = q_len - sum(seqlens_q[:-1])
-    seqlens_k[-1] = kv_len - sum(seqlens_k[:-1])
-    return seqlens_q, seqlens_k
-
-
-def _rand_maxed_partition(
-    r: random.Random, total: int, n: int, mx: int, positive: bool = True
-) -> List[int]:
-    # returns list of n nonnegative integers less than mx summing to total
-    # NB: This is unfortunately biased towards evenly-split bins.
-    # If `positive`, outputs are positive
-    if positive:
-        total -= n
-        mx -= 1
-    idxs = r.sample(range(n * mx), total)
-    y = torch.zeros(n, mx, dtype=torch.int32)
-    y.flatten()[idxs] = 1
-    z = y.sum(1)
-    if positive:
-        z += 1
-    return z.tolist()
-
-
-def _rand_seqlens_padded_k(
-    r: random.Random, bs: int, q_len: int, kv_len: int
-) -> Tuple[Sequence[int], Sequence[int]]:
-    # This is for BlockDiagonalCausalWithOffsetPaddedKeysMask.
-    # we need q_seqlens and k_seqlens to be of len bsz.
-    # For each "batch element" there must be more keys than queries
-    # because this bias type is "bottom right" and so any extra queries
-    # will attend to nothing and have undefined result.
-    # In addition every element of k_seqlens must be <= kv_len
-    if q_len > kv_len:
-        raise ValueError("need more keys than values")
-    if q_len == kv_len:
-        # all key slots are needed so we cannot have padding
-        q_seqlens = k_seqlens = [kv_len] * bs
-    else:
-        q_seqlens = _rand_maxed_partition(r, q_len * bs, bs, kv_len)
-        k_seqlens = [r.randint(i, kv_len) for i in q_seqlens]
-    return q_seqlens, k_seqlens
-
-
-def _create_aligned_bias(B: int, H: int, Mq: int, Mkv: int, **kwargs) -> torch.Tensor:
-    align_to = 8
-    return (
-        torch.randn(
-            (
-                B,
-                H,
-                Mq,
-                align_to * ((Mkv + align_to - 1) // align_to),
-            ),
-            **kwargs,
-        )
-        * 3
-    )[:, :, :, :Mkv]
-
-
-def create_attn_bias(
-    bias_type,
-    batch_size: int,
-    num_heads: int,
-    q_len: int,
-    kv_len: int,
-    device,
-    dtype,
-    requires_grad: bool,
-    fmt: str,
-    op: Type[AttentionOpBase],
-):
-    if bias_type is None or isinstance(None, bias_type):
-        return None
-    r = random.Random("-".join(map(str, [batch_size, q_len, kv_len, dtype, fmt])))
-    if bias_type is torch.Tensor:
-        if fmt == "BMK":
-            batch_size *= num_heads
-            num_heads = 1
-        ##`small_k` only supports an expanded 1d bias
-        if op in [fmha.small_k.FwOp, fmha.small_k.BwOp]:
-            attn_bias = (
-                torch.randn(
-                    (batch_size, num_heads, 1, kv_len), device=device, dtype=dtype
-                )
-                * 3
-            )
-            attn_bias = attn_bias.expand(batch_size, num_heads, q_len, kv_len)
-        else:
-            attn_bias = _create_aligned_bias(
-                batch_size,
-                num_heads,
-                q_len,
-                kv_len,
-                device=device,
-                dtype=dtype,
-            )
-
-            # ToDo: need a fix in ck-flashAttn to avoid divided-by-zero when all-(-inf) occurred
-            #       with the data read by one-thread
-            # make sure it also works if the first columns are partially masked out
-            # attn_bias[0, 0, q_len - 1 :, : num_heads - 2] = -math.inf
-
-        if requires_grad:
-            attn_bias.requires_grad_(True)
-        return attn_bias
-    if bias_type is fmha.attn_bias.LowerTriangularMask:
-        return fmha.attn_bias.LowerTriangularMask()
-    if bias_type is fmha.attn_bias.LowerTriangularMaskWithTensorBias:
-        attn_bias = _create_aligned_bias(
-            batch_size,
-            num_heads,
-            q_len,
-            kv_len,
-            device=device,
-            dtype=dtype,
-        )
-        if requires_grad:
-            attn_bias.requires_grad_(True)
-        return fmha.attn_bias.LowerTriangularMaskWithTensorBias(attn_bias)
-    if bias_type in [
-        fmha.attn_bias.BlockDiagonalMask,
-        fmha.attn_bias.BlockDiagonalCausalMask,
-        fmha.attn_bias.BlockDiagonalCausalFromBottomRightMask,
-    ]:
-        # This bias is not supported in BMK format
-        assert fmt == "BMHK"
-        block_diag = fmha.attn_bias.BlockDiagonalMask.from_seqlens(
-            *_rand_seqlens(
-                r,
-                batch_size,
-                q_len,
-                kv_len,
-                more_keys_than_queries_per_block=bias_type
-                is fmha.attn_bias.BlockDiagonalCausalFromBottomRightMask,
-            )
-        )
-        if bias_type is fmha.attn_bias.BlockDiagonalCausalMask:
-            block_diag = block_diag.make_causal()
-        if bias_type is fmha.attn_bias.BlockDiagonalCausalFromBottomRightMask:
-            block_diag = block_diag.make_causal_from_bottomright()
-        return block_diag
-    if bias_type == fmha.attn_bias.BlockDiagonalCausalWithOffsetPaddedKeysMask:
-        assert fmt == "BMHK"
-        q, k = _rand_seqlens_padded_k(r, batch_size, q_len, kv_len)
-        g_block_diag = (
-            fmha.attn_bias.BlockDiagonalCausalWithOffsetPaddedKeysMask.from_seqlens(
-                q_seqlen=q,
-                kv_padding=kv_len,
-                kv_seqlen=k,
-            )
-        )
-        return g_block_diag
-
-    assert False, f"Unsupported bias type: {bias_type}"
-
-def create_tensors(
-    op: Type[AttentionOpBase],
-    device,
-    dtype,
-    attn_bias_type,
-    B,
-    q_len,
-    kv_len,
-    h,
-    k,
-    kv,
-    *,
-    attn_bias_requires_grad: bool = False,
-    fmt: str = "BMK",
-):
-    torch.manual_seed(B * q_len + kv_len * k + kv)
-    scale = 3
-    if fmt == "BMK":
-        query = torch.randn((B * h, q_len, k), device=device, dtype=dtype).mul_(scale)
-        key = torch.randn((B * h, kv_len, k), device=device, dtype=dtype).mul_(scale)
-        value = torch.randn((B * h, kv_len, kv), device=device, dtype=dtype).mul_(scale)
-    else:
-        assert fmt == "BMHK"
-        query = torch.randn((B, q_len, h, k), device=device, dtype=dtype).mul_(scale)
-        key = torch.randn((B, kv_len, h, k), device=device, dtype=dtype).mul_(scale)
-        value = torch.randn((B, kv_len, h, kv), device=device, dtype=dtype).mul_(scale)
-
-    if fmt == "BMK" and not fmha.common._is_bias_type_supported_in_BMK(attn_bias_type):
-        attn_bias_type = None
-    attn_bias = None
-    if attn_bias_type is not None:
-        attn_bias = create_attn_bias(
-            attn_bias_type,
-            batch_size=B,
-            num_heads=h,
-            q_len=q_len,
-            kv_len=kv_len,
-            dtype=dtype,
-            device=device,
-            requires_grad=attn_bias_requires_grad,
-            fmt=fmt,
-            op=op,
-        )
-        if isinstance(
-            attn_bias,
-            (
-                fmha.attn_bias.BlockDiagonalMask,
-                fmha.attn_bias.BlockDiagonalCausalWithOffsetPaddedKeysMask,
-            ),
-        ):
-            query, key, value = [
-                x.reshape([1, -1, *x.shape[2:]]) for x in [query, key, value]
-            ]
-
-    inputs = fmha.Inputs(query=query, key=key, value=value, attn_bias=attn_bias)
-    reasons = op.not_supported_reasons(inputs)
-    if reasons:
-        err_msg = f"{op.NAME}: unsupported ({'/'.join(reasons)})"
-        # Ensure we free memory to avoid OOMs
-        del query, key, value, attn_bias, inputs
-        pytest.skip(err_msg)
-    return query, key, value, attn_bias
-
-## The same set of supported attn_bias types as defined by ck.FwOp
-SUPPORTED_ATTN_BIAS_TYPES: Set[Any] = {
-        ##type(None),
-        ##torch.Tensor,
-        ##fmha.attn_bias.LowerTriangularMask,
-        #fmha.attn_bias.LowerTriangularMaskWithTensorBias,
-        fmha.attn_bias.BlockDiagonalMask,
-        ##fmha.attn_bias.BlockDiagonalCausalMask,
-        ##fmha.attn_bias.BlockDiagonalCausalWithOffsetPaddedKeysMask,
-        ##fmha.attn_bias.BlockDiagonalCausalFromBottomRightMask,
-        }
-
-@pytest.mark.parametrize("bias_type", SUPPORTED_ATTN_BIAS_TYPES)
-@pytest.mark.parametrize("packed", [True])
-@pytest.mark.parametrize("fmt", ["BMHK"])
-@pytest.mark.parametrize("dtype", [torch.half])
-def test_forward(dtype, fmt, packed, bias_type):
-    op = fmha.ck.FwOp
-    device = torch.device("cuda")
-    batch_size = 7
-    q_len = 100
-
-    ## BottomRightMask requires generate {m0,m1,...}, {n0,n1,...} where mi <= ni
-    if bias_type is fmha.attn_bias.BlockDiagonalCausalFromBottomRightMask:
-        kv_len = int(q_len * 1.2) 
-    else:
-        kv_len = q_len
-    h = 3 
-    k = 64 
-    kv = 64
-
-    if kv > 128:
-        pytest.skip("kv > 128 is not supported by CK-FlashAttention-1")
-
-    if packed and not (k == kv and q_len == kv_len):
-        pytest.skip(
-            f"packed incompatible with `k ({k}) != kv ({kv})` or `q_len ({q_len}) != kv_len ({kv_len})`"
-        )
-    if fmt == "BMK" and not fmha.common._is_bias_type_supported_in_BMK(bias_type):
-        pytest.skip("BMK incompatible with this bias")
-
-    ## packed type always creates the tensors in "BMHK" even the fmt is "BMK", so for packed type, one
-    ## should always assume h is already merged in B, and set h to be 1
-    if packed and fmt is "BMK" and batch_size > 1 and h > 1:
-        pytest.skip("Shape of this is type is skipped")
-
-    query, key, value, attn_bias = create_tensors(
-        op, device, dtype, bias_type, batch_size, q_len, kv_len, h, k, kv, fmt="BMHK" if packed else fmt
-    )
-
-    ## when packed, the query, key, value is in BMHK format
-    if packed:
-        c = torch.stack([query, key, value], 2)
-        if fmt == "BMK":
-            # bm3hk -> 3bhmk -> 3Bmk
-            c = c.permute(2, 0, 3, 1, 4).view([3, -1, q_len, k])
-            query, key, value = c[0], c[1], c[2]
-            # Re-create bias in the right format
-            attn_bias = create_attn_bias(
-                bias_type=bias_type,
-                batch_size=batch_size,
-                num_heads=h,
-                q_len=q_len,
-                kv_len=kv_len,
-                device=device,
-                dtype=dtype,
-                requires_grad=False,
-                fmt=fmt,
-                op=op,
-            )
-        else:
-            # bm3hk -> 3 x bmhk
-            query, key, value = xformers.ops.unbind(c, 2)
-
-        assert not query.is_contiguous()
-
-    '''
-    out = xformers.ops.memory_efficient_attention_forward(
-        query, key, value, attn_bias, op=op
-    )
-    assert not out.isnan().any(), ("Output has NaNs", attn_bias)
-
-    out2 = xformers.ops.memory_efficient_attention_forward(
-        query, key, value, attn_bias, op=op
-    )
-    assert torch.allclose(out, out2, atol=0.0, rtol=0.0), (
-        "Non-deterministic behavior",
-        attn_bias,
-    )
-
-    ref = ref_attention(query, key, value, attn_bias)
-    assert out.shape == ref.shape, out.shape
-    assert_allclose(
-        out.float(),
-        ref,
-        atol=op.ERROR_ATOL[dtype],
-        rtol=op.ERROR_RTOL.get(dtype, 1e-5),
-    )
-    '''
-
-    fn = partial(xformers.ops.memory_efficient_attention_forward, op=op)
-
-    graph = torch.cuda.CUDAGraph()
-    with torch.cuda.graph(graph):
-        fn(query, key, value, attn_bias)
-
-    print("\nExecuting the replaying...\n")
-
-    graph.replay()
-
diff --git a/xformers/csrc/attention/hip_fmha/attention_forward_generic.cpp b/xformers/csrc/attention/hip_fmha/attention_forward_generic.cpp
index eb4263536..d3e740f98 100644
--- a/xformers/csrc/attention/hip_fmha/attention_forward_generic.cpp
+++ b/xformers/csrc/attention/hip_fmha/attention_forward_generic.cpp
@@ -100,22 +100,6 @@ efficient_attention_forward_ck(
   int64_t K = query.size(-1);
   int64_t Kv = value.size(-1);
 
-  fprintf(
-      stdout,
-      "query data pointer %p, size %lx\n",
-      query.data_ptr(),
-      at::numel(query));
-  fprintf(
-      stdout,
-      "key data pointer %p, size %lx\n",
-      key.data_ptr(),
-      at::numel(key));
-  fprintf(
-      stdout,
-      "value data pointer %p, size %lx\n",
-      value.data_ptr(),
-      at::numel(value));
-
   at::Tensor out;
   at::Tensor logsumexp;
   at::Tensor randvals;
@@ -185,8 +169,6 @@ efficient_attention_forward_ck(
       CHECK_NOSPARSE_LASTCONTIGUOUS_CUDA((*bias));
       TORCH_CHECK(bias->scalar_type() == query.scalar_type());
 
-      fprintf(stdout, "bias is not empty!\n");
-
       p.has_attn_bias = true;
       p.attn_bias_ptr = bias->data_ptr();
 
@@ -267,8 +249,6 @@ efficient_attention_forward_ck(
       CHECK_NOSPARSE_LASTCONTIGUOUS_CUDA((*bias));
       TORCH_CHECK(bias->scalar_type() == query.scalar_type());
 
-      fprintf(stdout, "bias is not empty!\n");
-
       p.has_attn_bias = true;
       const at::Tensor bias_4d_view =
           get_bias_4d_view(*bias, B, num_heads, M, N);
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_forward.h b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_forward.h
index 1cc4d358a..213de60ed 100644
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_forward.h
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_forward.h
@@ -249,9 +249,6 @@ void grouped_forward_masktype_attnbias_dispatched(
   void* workspace =
       GlobalWorkspace::getGlobalWorkspacePtr()->allocate(sizeInBytes, stream);
 
-  fprintf(stdout, "\n[host]output pointer: %p\n", param.out_ptrs[0]);
-  fprintf(stdout, "\n[host]workspace pointer: %p\n", workspace);
-
   op.SetWorkSpacePointer(arg_ptr.get(), workspace);
 
   if (!op.IsSupportedArgument(arg_ptr.get())) {

From ea2398741be3ea25024b77732a9d6942fcad33d3 Mon Sep 17 00:00:00 2001
From: Qianfeng Zhang <Qianfeng.Zhang@amd.com>
Date: Thu, 14 Sep 2023 18:52:53 +0000
Subject: [PATCH 060/641] Restrict the registeration of the attention operators
 according to their required cuda/rocm platform

---
 xformers/csrc/attention/attention.cpp | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/xformers/csrc/attention/attention.cpp b/xformers/csrc/attention/attention.cpp
index a837d1c19..d60114aa3 100644
--- a/xformers/csrc/attention/attention.cpp
+++ b/xformers/csrc/attention/attention.cpp
@@ -21,6 +21,7 @@ PyMODINIT_FUNC PyInit__C(void) {
 #endif // defined(_WIN32)
 
 TORCH_LIBRARY_FRAGMENT(xformers, m) {
+#if !defined(USE_ROCM)
   m.def(TORCH_SELECTIVE_SCHEMA(
       "xformers::efficient_attention_forward_small_k(Tensor query, Tensor key, Tensor value, bool compute_logsumexp, Tensor? attn_bias, float p) -> (Tensor, Tensor, int, int)"));
   m.def(TORCH_SELECTIVE_SCHEMA(
@@ -35,10 +36,13 @@ TORCH_LIBRARY_FRAGMENT(xformers, m) {
       "xformers::_temp_dropout(Tensor out, float p) -> Tensor"));
   m.def(TORCH_SELECTIVE_SCHEMA(
       "xformers::_cutlass_rand_uniform(float p, Tensor out) -> Tensor"));
+#endif
+#if defined(USE_ROCM)
   m.def(TORCH_SELECTIVE_SCHEMA(
       "xformers::efficient_attention_forward_ck(Tensor query, Tensor key, Tensor value, Tensor? attn_bias, Tensor? seqstart_q, Tensor? seqstart_k, float dropout_p, bool compute_logsumexp, int custom_mask_type, float? scale, Tensor? seqlen_k) -> (Tensor, Tensor, int, int)"));
   m.def(TORCH_SELECTIVE_SCHEMA(
       "xformers::efficient_attention_backward_ck(Tensor grad_out, Tensor query, Tensor key, Tensor value, Tensor? attn_bias, Tensor? seqstart_q, Tensor? seqstart_k, Tensor? seqlen_k, Tensor logsumexp, Tensor output, float dropout_p, int rng_seed, int rng_offset, int custom_mask_type, float? scale) -> (Tensor, Tensor, Tensor, Tensor)"));
   m.def(TORCH_SELECTIVE_SCHEMA(
       "xformers::_ck_rand_uniform(float p, Tensor out) -> Tensor"));
+#endif
 }

From d3f90630765de9404201dedcd14baed90d6f963c Mon Sep 17 00:00:00 2001
From: Qianfeng Zhang <Qianfeng.Zhang@amd.com>
Date: Mon, 18 Sep 2023 16:43:58 +0000
Subject: [PATCH 061/641] Update to make seqstart_q/seqstart_k/seqlen_k inputs
 of efficient_attention_forward_ck CPU tensor

---
 .../hip_fmha/attention_forward_generic.cpp    | 78 +++++++++----------
 .../hip_fmha/ck_fmha_grouped_forward.h        |  1 -
 .../csrc/attention/hip_fmha/ck_fmha_util.h    |  5 ++
 xformers/ops/fmha/attn_bias.py                | 32 +++++---
 xformers/ops/fmha/ck.py                       |  6 +-
 5 files changed, 67 insertions(+), 55 deletions(-)

diff --git a/xformers/csrc/attention/hip_fmha/attention_forward_generic.cpp b/xformers/csrc/attention/hip_fmha/attention_forward_generic.cpp
index d3e740f98..f7a786359 100644
--- a/xformers/csrc/attention/hip_fmha/attention_forward_generic.cpp
+++ b/xformers/csrc/attention/hip_fmha/attention_forward_generic.cpp
@@ -12,6 +12,8 @@
 #include <torch/library.h>
 #include <ATen/cuda/CUDAGraphsUtils.cuh>
 
+#include <ck/host_utility/hip_check_error.hpp>
+
 #include "ck_fmha_params.h"
 #include "ck_fmha_util.h"
 
@@ -79,8 +81,8 @@ efficient_attention_forward_ck(
     TORCH_CHECK(seqstart_q->scalar_type() == at::ScalarType::Int);
     TORCH_CHECK(seqstart_k->scalar_type() == at::ScalarType::Int);
     TORCH_CHECK(seqstart_q->dim() == 1 && seqstart_k->dim() == 1);
-    CHECK_NOSPARSE_CONTIGUOUS_CUDA((*seqstart_q));
-    CHECK_NOSPARSE_CONTIGUOUS_CUDA((*seqstart_k));
+    CHECK_NOSPARSE_CONTIGUOUS_CPU((*seqstart_q));
+    CHECK_NOSPARSE_CONTIGUOUS_CPU((*seqstart_k));
     TORCH_CHECK(seqstart_q->size(0) == seqstart_k->size(0));
     TORCH_CHECK(query.size(0) == 1, "cu_seqlen only supports batch_size=1");
   };
@@ -91,7 +93,7 @@ efficient_attention_forward_ck(
   CHECK_NOSPARSE_LASTCONTIGUOUS_CUDA(value);
 
   // at::cuda::CUDAGuard device_guard(query.device());
-  hipStream_t stream = at::cuda::getCurrentHIPStream().stream();
+  hipStream_t stream2 = at::cuda::getCurrentHIPStream().stream();
 
   int64_t B = query.size(0);
   int64_t M = query.size(1);
@@ -100,10 +102,11 @@ efficient_attention_forward_ck(
   int64_t K = query.size(-1);
   int64_t Kv = value.size(-1);
 
-  at::Tensor out;
   at::Tensor logsumexp;
   at::Tensor randvals;
 
+  at::Tensor out = at::empty({B, M, num_heads, Kv}, query.options());
+
   const bool use_dropout = std::fpclassify(dropout_p) != FP_ZERO;
   int64_t philox_seed;
   int64_t philox_offset;
@@ -265,30 +268,25 @@ efficient_attention_forward_ck(
     p.host_seqstart_q.resize(p.num_batches + 1);
     p.host_seqstart_k.resize(p.num_batches + 1);
 
-    auto seqstart_q_cpu = seqstart_q->to(at::kCPU);
-    auto seqstart_k_cpu = seqstart_k->to(at::kCPU);
-
     for (int i = 0; i < p.host_seqstart_q.size(); i++)
       p.host_seqstart_q[i] =
-          *(reinterpret_cast<int*>(seqstart_q_cpu.data_ptr()) + i);
+          *(reinterpret_cast<int*>(seqstart_q->data_ptr()) + i);
 
     for (int i = 0; i < p.host_seqstart_k.size(); i++)
       p.host_seqstart_k[i] =
-          *(reinterpret_cast<int*>(seqstart_k_cpu.data_ptr()) + i);
+          *(reinterpret_cast<int*>(seqstart_k->data_ptr()) + i);
 
     if (seqlen_k.has_value()) {
       TORCH_CHECK(seqlen_k->scalar_type() == at::ScalarType::Int);
       TORCH_CHECK(seqlen_k->dim() == 1);
       TORCH_CHECK(seqlen_k->size(0) == p.num_batches)
-      CHECK_NOSPARSE_CONTIGUOUS_CUDA((*seqlen_k));
+      CHECK_NOSPARSE_CONTIGUOUS_CPU((*seqlen_k));
 
       p.host_seqlen_k.resize(p.num_batches);
 
-      auto seqlen_k_cpu = seqlen_k->to(at::kCPU);
-
       for (int i = 0; i < p.host_seqlen_k.size(); i++)
         p.host_seqlen_k[i] =
-            *(reinterpret_cast<int*>(seqlen_k_cpu.data_ptr()) + i);
+            *(reinterpret_cast<int*>(seqlen_k->data_ptr()) + i);
     }
 
     char* q_ptr = reinterpret_cast<char*>(query.data_ptr());
@@ -369,35 +367,31 @@ efficient_attention_forward_ck(
     };
   };
 
-  DISPATCH_TYPES(query.scalar_type(), [&]() {
-    out = at::empty(
-        {B, M, num_heads, Kv},
-        query.options().dtype(CkToAtenDtype<scalar_t>::atScalarType()));
-
-    if (!seqstart_q.has_value()) { // input is batched
-      BatchedForwardParams batched_forward_params;
-
-      set_batched_forward_params(batched_forward_params);
-
-      if constexpr (std::is_same<scalar_t, ck::half_t>::value) {
-        batched_forward_fp16(batched_forward_params, stream);
-      } else if constexpr (std::is_same<scalar_t, ck::bhalf_t>::value) {
-        batched_forward_bp16(batched_forward_params, stream);
-      } else
-        throw std::runtime_error("input data-type is not supported!");
-    } else { // input is grouped
-      GroupedForwardParams grouped_forward_params;
-
-      set_grouped_forward_params(grouped_forward_params);
-
-      if constexpr (std::is_same<scalar_t, ck::half_t>::value) {
-        grouped_forward_fp16(grouped_forward_params, stream);
-      } else if constexpr (std::is_same<scalar_t, ck::bhalf_t>::value) {
-        grouped_forward_bp16(grouped_forward_params, stream);
-      } else
-        throw std::runtime_error("input data-type is not supported!");
-    }
-  });
+  auto inDataType = query.scalar_type();
+
+  if (!seqstart_q.has_value()) { // input is batched
+    BatchedForwardParams batched_forward_params;
+
+    set_batched_forward_params(batched_forward_params);
+
+    if (inDataType == at::ScalarType::Half) {
+      batched_forward_fp16(batched_forward_params, stream2);
+    } else if (inDataType == at::ScalarType::BFloat16) {
+      batched_forward_bp16(batched_forward_params, stream2);
+    } else
+      throw std::runtime_error("input data-type is not supported!");
+  } else { // input is grouped
+    GroupedForwardParams grouped_forward_params;
+
+    set_grouped_forward_params(grouped_forward_params);
+
+    if (inDataType == at::ScalarType::Half) {
+      grouped_forward_fp16(grouped_forward_params, stream2);
+    } else if (inDataType == at::ScalarType::BFloat16) {
+      grouped_forward_bp16(grouped_forward_params, stream2);
+    } else
+      throw std::runtime_error("input data-type is not supported!");
+  };
 
   // torch::save(randvals, "randvals_dev.zip");
 
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_forward.h b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_forward.h
index 213de60ed..2aa554bab 100644
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_forward.h
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_forward.h
@@ -260,5 +260,4 @@ void grouped_forward_masktype_attnbias_dispatched(
   }
 
   (void)invoker.Run(arg_ptr.get(), StreamConfig{stream, false});
-  (void)hipStreamSynchronize(stream);
 };
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_util.h b/xformers/csrc/attention/hip_fmha/ck_fmha_util.h
index 36465e34c..84e185967 100644
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_util.h
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_util.h
@@ -67,6 +67,11 @@ struct CkToAtenDtype<float> {
   XFORMERS_CHECK(!TENSOR.is_sparse(), #TENSOR " must be a dense tensor"); \
   XFORMERS_CHECK(TENSOR.is_contiguous(), #TENSOR " must be contiguous");
 
+#define CHECK_NOSPARSE_CONTIGUOUS_CPU(TENSOR)                             \
+  XFORMERS_CHECK(TENSOR.is_cpu(), #TENSOR " must be a CPU tensor");       \
+  XFORMERS_CHECK(!TENSOR.is_sparse(), #TENSOR " must be a dense tensor"); \
+  XFORMERS_CHECK(TENSOR.is_contiguous(), #TENSOR " must be contiguous");
+
 #define CHECK_NOSPARSE_LASTCONTIGUOUS_CUDA(TENSOR)                        \
   XFORMERS_CHECK(TENSOR.is_cuda(), #TENSOR " must be a CUDA tensor");     \
   XFORMERS_CHECK(!TENSOR.is_sparse(), #TENSOR " must be a dense tensor"); \
diff --git a/xformers/ops/fmha/attn_bias.py b/xformers/ops/fmha/attn_bias.py
index 8e419c830..80fbea6a0 100644
--- a/xformers/ops/fmha/attn_bias.py
+++ b/xformers/ops/fmha/attn_bias.py
@@ -212,6 +212,7 @@ class _PaddedSeqLenInfo(_SeqLenInfo):
     """
 
     seqlen: torch.Tensor
+    seqlen_cpu: torch.Tensor
     seqlen_py: Sequence[int]
     padding: int
     # From parent: seqstart[i] contains the start position
@@ -246,15 +247,28 @@ def from_seqlens_padded(
         assert not isinstance(seqlens, torch.Tensor)
         assert all(seqlen <= padding for seqlen in seqlens)
         seqstart_py = list(range(0, len(seqlens) * padding + 1, padding))
-        return cls(
-            seqlen=torch.tensor(seqlens, dtype=torch.int32),
-            seqlen_py=seqlens,
-            max_seqlen=max(seqlens),
-            min_seqlen=min(seqlens),
-            seqstart=torch.tensor(seqstart_py, dtype=torch.int32),
-            seqstart_py=seqstart_py,
-            padding=padding,
-        )
+        seqlen = torch.tensor(seqlens, dtype=torch.int32)
+        if torch.cuda.is_available() and torch.version.hip:
+            return cls(
+                seqlen=seqlen,
+                seqlen_cpu=seqlen.to(device=torch.device("cpu")),
+                seqlen_py=seqlens,
+                max_seqlen=max(seqlens),
+                min_seqlen=min(seqlens),
+                seqstart=torch.tensor(seqstart_py, dtype=torch.int32),
+                seqstart_py=seqstart_py,
+                padding=padding,
+            )
+        else:
+            return cls(
+                seqlen=seqlen,
+                seqlen_py=seqlens,
+                max_seqlen=max(seqlens),
+                min_seqlen=min(seqlens),
+                seqstart=torch.tensor(seqstart_py, dtype=torch.int32),
+                seqstart_py=seqstart_py,
+                padding=padding,
+            )
 
     def split(
         self, x: torch.Tensor, batch_sizes: Optional[Sequence[int]] = None
diff --git a/xformers/ops/fmha/ck.py b/xformers/ops/fmha/ck.py
index f11762422..ad5575f57 100644
--- a/xformers/ops/fmha/ck.py
+++ b/xformers/ops/fmha/ck.py
@@ -39,8 +39,8 @@ def _get_seqlen_info(
     if isinstance(
         attn_bias, (BlockDiagonalMask, BlockDiagonalCausalWithOffsetPaddedKeysMask)
     ):
-        attn_bias.k_seqinfo.to(inp.query.device)
-        attn_bias.q_seqinfo.to(inp.query.device)
+        ##attn_bias.k_seqinfo.to(inp.query.device)
+        ##attn_bias.q_seqinfo.to(inp.query.device)
         seqstart_k = attn_bias.k_seqinfo.seqstart
         seqstart_q = attn_bias.q_seqinfo.seqstart
         ##max_seqlen_q = attn_bias.q_seqinfo.max_seqlen
@@ -182,7 +182,7 @@ def apply(
             compute_logsumexp=needs_gradient,
             custom_mask_type=_custom_mask_type(inp.attn_bias),
             scale=inp.scale,
-            seqlen_k=inp.attn_bias.k_seqinfo.seqlen
+            seqlen_k=inp.attn_bias.k_seqinfo.seqlen_cpu
             if isinstance(inp.attn_bias, BlockDiagonalCausalWithOffsetPaddedKeysMask)
             else None,
         )

From d8b5076fd5d975774a537dcec58201e1cbb393e2 Mon Sep 17 00:00:00 2001
From: Qianfeng Zhang <Qianfeng.Zhang@amd.com>
Date: Mon, 18 Sep 2023 17:00:36 +0000
Subject: [PATCH 062/641] Update to efficient_attention_backward_ck

---
 .../hip_fmha/attention_backward_generic.cpp   | 67 +++++++++----------
 .../hip_fmha/ck_fmha_batched_backward.h       |  1 -
 .../hip_fmha/ck_fmha_grouped_backward.h       |  1 -
 3 files changed, 31 insertions(+), 38 deletions(-)

diff --git a/xformers/csrc/attention/hip_fmha/attention_backward_generic.cpp b/xformers/csrc/attention/hip_fmha/attention_backward_generic.cpp
index c75027705..3db5acc3f 100644
--- a/xformers/csrc/attention/hip_fmha/attention_backward_generic.cpp
+++ b/xformers/csrc/attention/hip_fmha/attention_backward_generic.cpp
@@ -95,8 +95,8 @@ efficient_attention_backward_ck(
     TORCH_CHECK(seqstart_q->scalar_type() == at::ScalarType::Int);
     TORCH_CHECK(seqstart_k->scalar_type() == at::ScalarType::Int);
     TORCH_CHECK(seqstart_q->dim() == 1 && seqstart_k->dim() == 1);
-    CHECK_NOSPARSE_CONTIGUOUS_CUDA((*seqstart_q));
-    CHECK_NOSPARSE_CONTIGUOUS_CUDA((*seqstart_k));
+    CHECK_NOSPARSE_CONTIGUOUS_CPU((*seqstart_q));
+    CHECK_NOSPARSE_CONTIGUOUS_CPU((*seqstart_k));
     TORCH_CHECK(seqstart_q->size(0) == seqstart_k->size(0));
     TORCH_CHECK(query.size(0) == 1, "seqstart_q only supports batch_size=1");
   }
@@ -265,30 +265,25 @@ efficient_attention_backward_ck(
     p.host_seqstart_q.resize(p.num_batches + 1);
     p.host_seqstart_k.resize(p.num_batches + 1);
 
-    auto seqstart_q_cpu = seqstart_q->to(at::kCPU);
-    auto seqstart_k_cpu = seqstart_k->to(at::kCPU);
-
     for (int i = 0; i < p.host_seqstart_q.size(); i++)
       p.host_seqstart_q[i] =
-          *(reinterpret_cast<int*>(seqstart_q_cpu.data_ptr()) + i);
+          *(reinterpret_cast<int*>(seqstart_q->data_ptr()) + i);
 
     for (int i = 0; i < p.host_seqstart_k.size(); i++)
       p.host_seqstart_k[i] =
-          *(reinterpret_cast<int*>(seqstart_k_cpu.data_ptr()) + i);
+          *(reinterpret_cast<int*>(seqstart_k->data_ptr()) + i);
 
     if (seqlen_k.has_value()) {
       TORCH_CHECK(seqlen_k->scalar_type() == at::ScalarType::Int);
       TORCH_CHECK(seqlen_k->dim() == 1);
       TORCH_CHECK(seqlen_k->size(0) == p.num_batches)
-      CHECK_NOSPARSE_CONTIGUOUS_CUDA((*seqlen_k));
+      CHECK_NOSPARSE_CONTIGUOUS_CPU((*seqlen_k));
 
       p.host_seqlen_k.resize(p.num_batches);
 
-      auto seqlen_k_cpu = seqlen_k->to(at::kCPU);
-
       for (int i = 0; i < p.host_seqlen_k.size(); i++)
         p.host_seqlen_k[i] =
-            *(reinterpret_cast<int*>(seqlen_k_cpu.data_ptr()) + i);
+            *(reinterpret_cast<int*>(seqlen_k->data_ptr()) + i);
     }
 
     char* q_ptr = reinterpret_cast<char*>(query.data_ptr());
@@ -354,31 +349,31 @@ efficient_attention_backward_ck(
     }
   };
 
-  DISPATCH_TYPES(query.scalar_type(), [&]() {
-    if (!seqstart_q.has_value()) { // input is batched
-      BatchedBackwardParams batched_backward_params;
-
-      set_batched_backward_params(batched_backward_params);
-
-      if constexpr (std::is_same<scalar_t, ck::half_t>::value) {
-        batched_backward_fp16(batched_backward_params, stream);
-      } else if constexpr (std::is_same<scalar_t, ck::bhalf_t>::value) {
-        batched_backward_bp16(batched_backward_params, stream);
-      } else
-        throw std::runtime_error("input data-type is not supported");
-    } else { // input is grouped
-      GroupedBackwardParams grouped_backward_params;
-
-      set_grouped_backward_params(grouped_backward_params);
-
-      if constexpr (std::is_same<scalar_t, ck::half_t>::value) {
-        grouped_backward_fp16(grouped_backward_params, stream);
-      } else if constexpr (std::is_same<scalar_t, ck::bhalf_t>::value) {
-        grouped_backward_bp16(grouped_backward_params, stream);
-      } else
-        throw std::runtime_error("input data-type is not supported");
-    }
-  });
+  auto inDataType = query.scalar_type();
+
+  if (!seqstart_q.has_value()) { // input is batched
+    BatchedBackwardParams batched_backward_params;
+
+    set_batched_backward_params(batched_backward_params);
+
+    if (inDataType == at::ScalarType::Half) {
+      batched_backward_fp16(batched_backward_params, stream);
+    } else if (inDataType == at::ScalarType::BFloat16) {
+      batched_backward_bp16(batched_backward_params, stream);
+    } else
+      throw std::runtime_error("input data-type is not supported");
+  } else { // input is grouped
+    GroupedBackwardParams grouped_backward_params;
+
+    set_grouped_backward_params(grouped_backward_params);
+
+    if (inDataType == at::ScalarType::Half) {
+      grouped_backward_fp16(grouped_backward_params, stream);
+    } else if (inDataType == at::ScalarType::BFloat16) {
+      grouped_backward_bp16(grouped_backward_params, stream);
+    } else
+      throw std::runtime_error("input data-type is not supported");
+  }
 
   return std::make_tuple(grad_q, grad_k, grad_v, grad_bias);
 #endif
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_backward.h b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_backward.h
index 04cce9ddb..18a070acf 100644
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_backward.h
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_backward.h
@@ -248,5 +248,4 @@ void batched_backward_masktype_attnbias_dispatched(
   }
 
   (void)invoker.Run(arg_ptr.get(), StreamConfig{stream, false});
-  (void)hipStreamSynchronize(stream);
 };
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_backward.h b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_backward.h
index a7c268ceb..e215d98aa 100644
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_backward.h
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_backward.h
@@ -249,5 +249,4 @@ void grouped_backward_masktype_attnbias_dispatched(
   }
 
   (void)invoker.Run(arg_ptr.get(), StreamConfig{stream, false});
-  (void)hipStreamSynchronize(stream);
 };

From 150e181d71fdec0766f8befd0c8c80da1134687f Mon Sep 17 00:00:00 2001
From: Qianfeng Zhang <Qianfeng.Zhang@amd.com>
Date: Mon, 18 Sep 2023 17:02:52 +0000
Subject: [PATCH 063/641] Renaming in efficient_attention_forward_ck

---
 .../attention/hip_fmha/attention_forward_generic.cpp   | 10 +++++-----
 .../csrc/attention/hip_fmha/ck_fmha_batched_forward.h  |  2 --
 2 files changed, 5 insertions(+), 7 deletions(-)

diff --git a/xformers/csrc/attention/hip_fmha/attention_forward_generic.cpp b/xformers/csrc/attention/hip_fmha/attention_forward_generic.cpp
index f7a786359..dab15209e 100644
--- a/xformers/csrc/attention/hip_fmha/attention_forward_generic.cpp
+++ b/xformers/csrc/attention/hip_fmha/attention_forward_generic.cpp
@@ -93,7 +93,7 @@ efficient_attention_forward_ck(
   CHECK_NOSPARSE_LASTCONTIGUOUS_CUDA(value);
 
   // at::cuda::CUDAGuard device_guard(query.device());
-  hipStream_t stream2 = at::cuda::getCurrentHIPStream().stream();
+  hipStream_t stream = at::cuda::getCurrentHIPStream().stream();
 
   int64_t B = query.size(0);
   int64_t M = query.size(1);
@@ -375,9 +375,9 @@ efficient_attention_forward_ck(
     set_batched_forward_params(batched_forward_params);
 
     if (inDataType == at::ScalarType::Half) {
-      batched_forward_fp16(batched_forward_params, stream2);
+      batched_forward_fp16(batched_forward_params, stream);
     } else if (inDataType == at::ScalarType::BFloat16) {
-      batched_forward_bp16(batched_forward_params, stream2);
+      batched_forward_bp16(batched_forward_params, stream);
     } else
       throw std::runtime_error("input data-type is not supported!");
   } else { // input is grouped
@@ -386,9 +386,9 @@ efficient_attention_forward_ck(
     set_grouped_forward_params(grouped_forward_params);
 
     if (inDataType == at::ScalarType::Half) {
-      grouped_forward_fp16(grouped_forward_params, stream2);
+      grouped_forward_fp16(grouped_forward_params, stream);
     } else if (inDataType == at::ScalarType::BFloat16) {
-      grouped_forward_bp16(grouped_forward_params, stream2);
+      grouped_forward_bp16(grouped_forward_params, stream);
     } else
       throw std::runtime_error("input data-type is not supported!");
   };
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_forward.h b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_forward.h
index 154e2027b..f5b5dd8d9 100644
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_forward.h
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_forward.h
@@ -264,6 +264,4 @@ void batched_forward_masktype_attnbias_dispatched(
   }
 
   invoker.Run(arg_ptr.get(), StreamConfig{stream, false});
-
-  (void)hipStreamSynchronize(stream);
 };

From 975434226127be2e485045701a9288d3d156a7b5 Mon Sep 17 00:00:00 2001
From: Qianfeng Zhang <Qianfeng.Zhang@amd.com>
Date: Mon, 18 Sep 2023 17:13:41 +0000
Subject: [PATCH 064/641] Simplification in attn_bias.py

---
 xformers/ops/fmha/attn_bias.py | 31 ++++++++++---------------------
 1 file changed, 10 insertions(+), 21 deletions(-)

diff --git a/xformers/ops/fmha/attn_bias.py b/xformers/ops/fmha/attn_bias.py
index 80fbea6a0..584b09cb9 100644
--- a/xformers/ops/fmha/attn_bias.py
+++ b/xformers/ops/fmha/attn_bias.py
@@ -248,27 +248,16 @@ def from_seqlens_padded(
         assert all(seqlen <= padding for seqlen in seqlens)
         seqstart_py = list(range(0, len(seqlens) * padding + 1, padding))
         seqlen = torch.tensor(seqlens, dtype=torch.int32)
-        if torch.cuda.is_available() and torch.version.hip:
-            return cls(
-                seqlen=seqlen,
-                seqlen_cpu=seqlen.to(device=torch.device("cpu")),
-                seqlen_py=seqlens,
-                max_seqlen=max(seqlens),
-                min_seqlen=min(seqlens),
-                seqstart=torch.tensor(seqstart_py, dtype=torch.int32),
-                seqstart_py=seqstart_py,
-                padding=padding,
-            )
-        else:
-            return cls(
-                seqlen=seqlen,
-                seqlen_py=seqlens,
-                max_seqlen=max(seqlens),
-                min_seqlen=min(seqlens),
-                seqstart=torch.tensor(seqstart_py, dtype=torch.int32),
-                seqstart_py=seqstart_py,
-                padding=padding,
-            )
+        return cls(
+            seqlen=seqlen,
+            seqlen_cpu=seqlen.to(device=torch.device("cpu")) if torch.cuda.is_available() and torch.version.hip else None,
+            seqlen_py=seqlens,
+            max_seqlen=max(seqlens),
+            min_seqlen=min(seqlens),
+            seqstart=torch.tensor(seqstart_py, dtype=torch.int32),
+            seqstart_py=seqstart_py,
+            padding=padding,
+        )
 
     def split(
         self, x: torch.Tensor, batch_sizes: Optional[Sequence[int]] = None

From 8c0492a10af0827ab088337c19c6efb3b5b7e23e Mon Sep 17 00:00:00 2001
From: Qianfeng Zhang <Qianfeng.Zhang@amd.com>
Date: Mon, 18 Sep 2023 19:38:04 +0000
Subject: [PATCH 065/641] Fix the offset type in
 efficient_attention_forward_ck() and efficient_attention_backward_ck()

---
 .../benchmark_mem_eff_attn_decoder_ck.py      |  2 +-
 .../hip_fmha/attention_backward_generic.cpp   | 62 ++++++++++---------
 .../hip_fmha/attention_forward_generic.cpp    | 40 +++++++-----
 3 files changed, 58 insertions(+), 46 deletions(-)

diff --git a/xformers/benchmarks/benchmark_mem_eff_attn_decoder_ck.py b/xformers/benchmarks/benchmark_mem_eff_attn_decoder_ck.py
index c700109e9..a44c81891 100644
--- a/xformers/benchmarks/benchmark_mem_eff_attn_decoder_ck.py
+++ b/xformers/benchmarks/benchmark_mem_eff_attn_decoder_ck.py
@@ -65,7 +65,7 @@ def T(t):
 KV_SHAPES = [
     # list of n_keys, padding_length, batchsize
     (2, 64, 3),
-    ##(32, 1024, 500),  // this one fails due to consuming too much GPU memory
+    (32, 1024, 500),  
     (1000, 1024, 2),
     (8000, 8192, 1),
     (240, 256, 32),
diff --git a/xformers/csrc/attention/hip_fmha/attention_backward_generic.cpp b/xformers/csrc/attention/hip_fmha/attention_backward_generic.cpp
index 3db5acc3f..bae86c6fe 100644
--- a/xformers/csrc/attention/hip_fmha/attention_backward_generic.cpp
+++ b/xformers/csrc/attention/hip_fmha/attention_backward_generic.cpp
@@ -302,50 +302,56 @@ efficient_attention_backward_ck(
     char* grad_v_ptr = reinterpret_cast<char*>(grad_v.data_ptr());
 
     for (int i = 0; i < p.num_batches; i++) {
-      int32_t tmp_q_stride = get_size_in_bytes(
-          p.host_seqstart_q[i] * p.q_strides[0], query.scalar_type());
-      int32_t tmp_k_stride = get_size_in_bytes(
-          p.host_seqstart_k[i] * p.k_strides[0], key.scalar_type());
-      int32_t tmp_v_stride = get_size_in_bytes(
-          p.host_seqstart_k[i] * p.v_strides[0], value.scalar_type());
-      int32_t tmp_o_stride = get_size_in_bytes(
-          p.host_seqstart_q[i] * p.out_strides[0], out.scalar_type());
-      int32_t tmp_grad_o_stride = get_size_in_bytes(
-          p.host_seqstart_q[i] * p.grad_out_strides[0], grad_out.scalar_type());
-      int32_t tmp_logsumexp_stride =
+      size_t tmp_q_offset = get_size_in_bytes(
+          static_cast<size_t>(p.host_seqstart_q[i]) * p.q_strides[0],
+          query.scalar_type());
+      size_t tmp_k_offset = get_size_in_bytes(
+          static_cast<size_t>(p.host_seqstart_k[i]) * p.k_strides[0],
+          key.scalar_type());
+      size_t tmp_v_offset = get_size_in_bytes(
+          static_cast<size_t>(p.host_seqstart_k[i]) * p.v_strides[0],
+          value.scalar_type());
+      size_t tmp_o_offset = get_size_in_bytes(
+          static_cast<size_t>(p.host_seqstart_q[i]) * p.out_strides[0],
+          out.scalar_type());
+      size_t tmp_grad_o_offset = get_size_in_bytes(
+          static_cast<size_t>(p.host_seqstart_q[i]) * p.grad_out_strides[0],
+          grad_out.scalar_type());
+      size_t tmp_logsumexp_offset =
           get_size_in_bytes(p.host_seqstart_q[i], logsumexp.scalar_type());
-      int32_t tmp_randvals_stride = get_size_in_bytes(
-          p.host_seqstart_q[i] * p.randvals_strides[1] +
-              p.host_seqstart_k[i] * p.randvals_strides[2],
+      size_t tmp_randvals_offset = get_size_in_bytes(
+          static_cast<size_t>(p.host_seqstart_q[i]) * p.randvals_strides[1] +
+              static_cast<size_t>(p.host_seqstart_k[i]) * p.randvals_strides[2],
           randvals.scalar_type());
 
-      p.q_ptrs.push_back(reinterpret_cast<void*>(&q_ptr[tmp_q_stride]));
+      p.q_ptrs.push_back(reinterpret_cast<void*>(&q_ptr[tmp_q_offset]));
       p.grad_q_ptrs.push_back(
-          reinterpret_cast<void*>(&grad_q_ptr[tmp_q_stride]));
-      p.k_ptrs.push_back(reinterpret_cast<void*>(&k_ptr[tmp_k_stride]));
+          reinterpret_cast<void*>(&grad_q_ptr[tmp_q_offset]));
+      p.k_ptrs.push_back(reinterpret_cast<void*>(&k_ptr[tmp_k_offset]));
       p.grad_k_ptrs.push_back(
-          reinterpret_cast<void*>(&grad_k_ptr[tmp_k_stride]));
-      p.v_ptrs.push_back(reinterpret_cast<void*>(&v_ptr[tmp_v_stride]));
+          reinterpret_cast<void*>(&grad_k_ptr[tmp_k_offset]));
+      p.v_ptrs.push_back(reinterpret_cast<void*>(&v_ptr[tmp_v_offset]));
       p.grad_v_ptrs.push_back(
-          reinterpret_cast<void*>(&grad_v_ptr[tmp_v_stride]));
-      p.out_ptrs.push_back(reinterpret_cast<void*>(&out_ptr[tmp_o_stride]));
+          reinterpret_cast<void*>(&grad_v_ptr[tmp_v_offset]));
+      p.out_ptrs.push_back(reinterpret_cast<void*>(&out_ptr[tmp_o_offset]));
       p.grad_out_ptrs.push_back(
-          reinterpret_cast<void*>(&grad_out_ptr[tmp_grad_o_stride]));
+          reinterpret_cast<void*>(&grad_out_ptr[tmp_grad_o_offset]));
 
       if (bias.has_value()) {
-        int32_t tmp_bias_stride = get_size_in_bytes(
-            p.host_seqstart_q[i] * p.attn_bias_strides[2] +
-                p.host_seqstart_k[i] * p.attn_bias_strides[3],
+        size_t tmp_bias_offset = get_size_in_bytes(
+            static_cast<size_t>(p.host_seqstart_q[i]) * p.attn_bias_strides[2] +
+                static_cast<size_t>(p.host_seqstart_k[i]) *
+                    p.attn_bias_strides[3],
             bias->scalar_type());
 
         p.attn_bias_ptrs.push_back(
-            reinterpret_cast<void*>(&attn_bias_ptr[tmp_bias_stride]));
+            reinterpret_cast<void*>(&attn_bias_ptr[tmp_bias_offset]));
       };
 
       p.logsumexp_ptrs.push_back(
-          reinterpret_cast<void*>(&logsumexp_ptr[tmp_logsumexp_stride]));
+          reinterpret_cast<void*>(&logsumexp_ptr[tmp_logsumexp_offset]));
       p.randvals_ptrs.push_back(
-          reinterpret_cast<void*>(&randvals_ptr[tmp_randvals_stride]));
+          reinterpret_cast<void*>(&randvals_ptr[tmp_randvals_offset]));
     }
   };
 
diff --git a/xformers/csrc/attention/hip_fmha/attention_forward_generic.cpp b/xformers/csrc/attention/hip_fmha/attention_forward_generic.cpp
index dab15209e..470a253ca 100644
--- a/xformers/csrc/attention/hip_fmha/attention_forward_generic.cpp
+++ b/xformers/csrc/attention/hip_fmha/attention_forward_generic.cpp
@@ -298,14 +298,18 @@ efficient_attention_forward_ck(
         bias.has_value() ? reinterpret_cast<char*>(bias->data_ptr()) : nullptr;
 
     for (int i = 0; i < p.num_batches; i++) {
-      int32_t tmp_q_offset = get_size_in_bytes(
-          p.host_seqstart_q[i] * p.q_strides[0], query.scalar_type());
-      int32_t tmp_k_offset = get_size_in_bytes(
-          p.host_seqstart_k[i] * p.k_strides[0], key.scalar_type());
-      int32_t tmp_v_offset = get_size_in_bytes(
-          p.host_seqstart_k[i] * p.v_strides[0], value.scalar_type());
-      int32_t tmp_o_offset = get_size_in_bytes(
-          p.host_seqstart_q[i] * p.out_strides[0], out.scalar_type());
+      size_t tmp_q_offset = get_size_in_bytes(
+          static_cast<size_t>(p.host_seqstart_q[i]) * p.q_strides[0],
+          query.scalar_type());
+      size_t tmp_k_offset = get_size_in_bytes(
+          static_cast<size_t>(p.host_seqstart_k[i]) * p.k_strides[0],
+          key.scalar_type());
+      size_t tmp_v_offset = get_size_in_bytes(
+          static_cast<size_t>(p.host_seqstart_k[i]) * p.v_strides[0],
+          value.scalar_type());
+      size_t tmp_o_offset = get_size_in_bytes(
+          static_cast<size_t>(p.host_seqstart_q[i]) * p.out_strides[0],
+          out.scalar_type());
 
       p.q_ptrs.push_back(reinterpret_cast<void*>(&q_ptr[tmp_q_offset]));
       p.k_ptrs.push_back(reinterpret_cast<void*>(&k_ptr[tmp_k_offset]));
@@ -313,9 +317,10 @@ efficient_attention_forward_ck(
       p.out_ptrs.push_back(reinterpret_cast<void*>(&out_ptr[tmp_o_offset]));
 
       if (bias.has_value()) {
-        int32_t tmp_bias_offset = get_size_in_bytes(
-            p.host_seqstart_q[i] * p.attn_bias_strides[2] +
-                p.host_seqstart_k[i] * p.attn_bias_strides[3],
+        size_t tmp_bias_offset = get_size_in_bytes(
+            static_cast<size_t>(p.host_seqstart_q[i]) * p.attn_bias_strides[2] +
+                static_cast<size_t>(p.host_seqstart_k[i]) *
+                    p.attn_bias_strides[3],
             bias->scalar_type());
 
         p.attn_bias_ptrs.push_back(
@@ -341,13 +346,14 @@ efficient_attention_forward_ck(
       char* randvals_ptr = reinterpret_cast<char*>(randvals.data_ptr());
 
       for (int i = 0; i < p.num_batches; i++) {
-        int32_t tmp_randvals_stride = get_size_in_bytes(
-            p.host_seqstart_q[i] * p.randvals_strides[1] +
-                p.host_seqstart_k[i] * p.randvals_strides[2],
+        size_t tmp_randvals_offset = get_size_in_bytes(
+            static_cast<size_t>(p.host_seqstart_q[i]) * p.randvals_strides[1] +
+                static_cast<size_t>(p.host_seqstart_k[i]) *
+                    p.randvals_strides[2],
             randvals.scalar_type());
 
         p.randvals_ptrs.push_back(reinterpret_cast<void*>(randvals_ptr));
-        randvals_ptr = randvals_ptr + tmp_randvals_stride;
+        randvals_ptr = randvals_ptr + tmp_randvals_offset;
       };
     } else
       p.dropout_prob = 0.0f;
@@ -358,11 +364,11 @@ efficient_attention_forward_ck(
       char* logsumexp_ptr = reinterpret_cast<char*>(logsumexp.data_ptr());
 
       for (int i = 0; i < p.num_batches; i++) {
-        int32_t tmp_logsumexp_stride =
+        size_t tmp_logsumexp_offset =
             get_size_in_bytes(p.host_seqstart_q[i], logsumexp.scalar_type());
 
         p.logsumexp_ptrs.push_back(reinterpret_cast<void*>(logsumexp_ptr));
-        logsumexp_ptr = logsumexp_ptr + tmp_logsumexp_stride;
+        logsumexp_ptr = logsumexp_ptr + tmp_logsumexp_offset;
       };
     };
   };

From fb0e501ea35cf41a9932c615c11154ead2d89983 Mon Sep 17 00:00:00 2001
From: Qianfeng Zhang <Qianfeng.Zhang@amd.com>
Date: Tue, 19 Sep 2023 00:17:04 +0000
Subject: [PATCH 066/641] Remove the using of global workspace allocator

---
 .../ck_fmha_global_workspace_allocator.cpp    | 44 -------------------
 .../ck_fmha_global_workspace_allocator.h      | 31 -------------
 .../hip_fmha/ck_fmha_grouped_forward.h        |  5 +--
 .../attention/hip_fmha/ck_fmha_op_helper.h    |  2 -
 4 files changed, 2 insertions(+), 80 deletions(-)
 delete mode 100644 xformers/csrc/attention/hip_fmha/ck_fmha_global_workspace_allocator.cpp
 delete mode 100644 xformers/csrc/attention/hip_fmha/ck_fmha_global_workspace_allocator.h

diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_global_workspace_allocator.cpp b/xformers/csrc/attention/hip_fmha/ck_fmha_global_workspace_allocator.cpp
deleted file mode 100644
index 0382aa24b..000000000
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_global_workspace_allocator.cpp
+++ /dev/null
@@ -1,44 +0,0 @@
-#include "ck_fmha_global_workspace_allocator.h"
-
-GlobalWorkspace::GlobalWorkspace(){};
-
-void* GlobalWorkspace::allocate(size_t sizeInBytes, hipStream_t stream) {
-  std::lock_guard<std::mutex> lck(mtx_);
-
-  auto it = buffers_.find(stream);
-
-  if (it != buffers_.end()) {
-    size_t curr_size = it->second.first;
-
-    // if requested size is bigger than existing buffer, allocate a bigger
-    // buffer; else re-use the existing buffer
-    if (curr_size < sizeInBytes) {
-      c10::cuda::HIPCachingAllocator::raw_delete(it->second.second);
-
-      void* new_buf = c10::hip::HIPCachingAllocator::raw_alloc(sizeInBytes);
-      it->second.first = sizeInBytes;
-      it->second.second = new_buf;
-
-      return new_buf;
-    } else
-      return it->second.second;
-  } else {
-    // allocate a buffer and keep it for the stream
-    void* new_buf = c10::hip::HIPCachingAllocator::raw_alloc(sizeInBytes);
-
-    auto size_buf = std::make_pair(sizeInBytes, new_buf);
-
-    buffers_.insert(std::make_pair(stream, size_buf));
-
-    return new_buf;
-  };
-};
-
-GlobalWorkspace* GlobalWorkspace::getGlobalWorkspacePtr() {
-  if (singleton_ == nullptr)
-    singleton_ = new GlobalWorkspace();
-
-  return singleton_;
-};
-
-GlobalWorkspace* GlobalWorkspace::singleton_ = nullptr;
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_global_workspace_allocator.h b/xformers/csrc/attention/hip_fmha/ck_fmha_global_workspace_allocator.h
deleted file mode 100644
index 9b1322f0e..000000000
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_global_workspace_allocator.h
+++ /dev/null
@@ -1,31 +0,0 @@
-#pragma once
-
-#include <map>
-#include <mutex>
-
-#include <c10/hip/HIPCachingAllocator.h>
-#include <ck/ck.hpp>
-
-class GlobalWorkspace {
- private:
-  static GlobalWorkspace* singleton_;
-
-  std::map<hipStream_t, std::pair<size_t, void*>> buffers_;
-  std::mutex mtx_;
-
- protected:
-  GlobalWorkspace();
-
- public:
-  // for each stream, we assume only one workspace buffer is needed, so
-  // next allocation will implicitly de-allocate or reuse previous allocation
-  // for this stream
-  void* allocate(size_t sizeInBytes, hipStream_t stream);
-
-  static GlobalWorkspace* getGlobalWorkspacePtr();
-
-  GlobalWorkspace(const GlobalWorkspace&) = delete;
-  GlobalWorkspace(GlobalWorkspace&&) = delete;
-  GlobalWorkspace& operator=(const GlobalWorkspace&) = delete;
-  GlobalWorkspace& operator=(GlobalWorkspace&&) = delete;
-};
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_forward.h b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_forward.h
index 2aa554bab..0d902ebf6 100644
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_forward.h
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_forward.h
@@ -246,10 +246,9 @@ void grouped_forward_masktype_attnbias_dispatched(
 
   auto sizeInBytes = op.GetWorkSpaceSize(arg_ptr.get());
 
-  void* workspace =
-      GlobalWorkspace::getGlobalWorkspacePtr()->allocate(sizeInBytes, stream);
+  SimpleDeviceMem workspace(sizeInBytes);
 
-  op.SetWorkSpacePointer(arg_ptr.get(), workspace);
+  op.SetWorkSpacePointer(arg_ptr.get(), workspace.GetDeviceBuffer());
 
   if (!op.IsSupportedArgument(arg_ptr.get())) {
     std::ostringstream ostr;
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_op_helper.h b/xformers/csrc/attention/hip_fmha/ck_fmha_op_helper.h
index 3ca1f1325..84d585a29 100644
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_op_helper.h
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_op_helper.h
@@ -6,8 +6,6 @@
 #include <c10/hip/HIPCachingAllocator.h>
 #include <ck/utility/data_type.hpp>
 
-#include "ck_fmha_global_workspace_allocator.h"
-
 template <typename scalar_t>
 struct MaxVectorSizeForType {
   static constexpr int value = 4;

From adf3d1cf2521f3d061eb9edbbd77e9f876d67e85 Mon Sep 17 00:00:00 2001
From: Qianfeng Zhang <Qianfeng.Zhang@amd.com>
Date: Tue, 19 Sep 2023 17:54:22 +0000
Subject: [PATCH 067/641] Update to composable_kernel latest mha-train-develop

---
 third_party/composable_kernel | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/third_party/composable_kernel b/third_party/composable_kernel
index 12dcba200..f04ec5749 160000
--- a/third_party/composable_kernel
+++ b/third_party/composable_kernel
@@ -1 +1 @@
-Subproject commit 12dcba200a082ae40a0fb5aca3f093f1cc3470c7
+Subproject commit f04ec5749ef7db484032d0e4b6ce5135bb824ac5

From ae516c785cf2ab25928e04204c3d6cfaa52b0ba5 Mon Sep 17 00:00:00 2001
From: Qianfeng Zhang <Qianfeng.Zhang@amd.com>
Date: Sat, 23 Sep 2023 01:05:57 +0000
Subject: [PATCH 068/641] Synchronize attention_backward_generic.cpp to latest
 CK commits with grad_bias support added

---
 third_party/composable_kernel                 |  2 +-
 .../hip_fmha/attention_backward_generic.cpp   | 23 ++++++++++++++++++-
 .../hip_fmha/ck_fmha_batched_backward.h       |  2 ++
 .../hip_fmha/ck_fmha_grouped_backward.h       |  2 ++
 .../csrc/attention/hip_fmha/ck_fmha_params.h  |  6 +++--
 5 files changed, 31 insertions(+), 4 deletions(-)

diff --git a/third_party/composable_kernel b/third_party/composable_kernel
index f04ec5749..c0c522688 160000
--- a/third_party/composable_kernel
+++ b/third_party/composable_kernel
@@ -1 +1 @@
-Subproject commit f04ec5749ef7db484032d0e4b6ce5135bb824ac5
+Subproject commit c0c522688d6d7e292faa62a0a5326204d2c7a168
diff --git a/xformers/csrc/attention/hip_fmha/attention_backward_generic.cpp b/xformers/csrc/attention/hip_fmha/attention_backward_generic.cpp
index bae86c6fe..c4b821a9e 100644
--- a/xformers/csrc/attention/hip_fmha/attention_backward_generic.cpp
+++ b/xformers/csrc/attention/hip_fmha/attention_backward_generic.cpp
@@ -117,6 +117,11 @@ efficient_attention_backward_ck(
   grad_k = at::empty(key.sizes(), key.options());
   grad_v = at::empty(value.sizes(), value.options());
 
+  const bool bias_requires_grad = bias.has_value() && bias->requires_grad();
+
+  if (bias_requires_grad)
+    grad_bias = at::empty(value.sizes(), value.options());
+
   at::Tensor randvals;
 
   auto set_batched_backward_params = [&](BatchedBackwardParams& p) {
@@ -177,8 +182,16 @@ efficient_attention_backward_ck(
           static_cast<int>(bias_4d_view.stride(1)),
           static_cast<int>(bias_4d_view.stride(2)),
           static_cast<int>(bias_4d_view.stride(3))};
-    } else
+
+      if (bias_requires_grad)
+        p.grad_bias_ptr = grad_bias.data_ptr();
+    } else {
+      p.has_attn_bias = true;
       p.attn_bias_ptr = nullptr;
+      p.grad_bias_ptr = nullptr;
+    }
+
+    p.bias_has_grad = bias_requires_grad;
 
     p.custom_mask_type = custom_mask_type;
 
@@ -249,6 +262,8 @@ efficient_attention_backward_ck(
     } else
       p.has_attn_bias = false;
 
+    p.bias_has_grad = bias_requires_grad;
+
     p.dropout_prob = static_cast<float>(dropout_p);
     p.philox_seed = rng_seed;
     p.philox_offset = rng_offset;
@@ -300,6 +315,7 @@ efficient_attention_backward_ck(
     char* grad_q_ptr = reinterpret_cast<char*>(grad_q.data_ptr());
     char* grad_k_ptr = reinterpret_cast<char*>(grad_k.data_ptr());
     char* grad_v_ptr = reinterpret_cast<char*>(grad_v.data_ptr());
+    char* grad_bias_ptr = reinterpret_cast<char*>(grad_bias.data_ptr());
 
     for (int i = 0; i < p.num_batches; i++) {
       size_t tmp_q_offset = get_size_in_bytes(
@@ -346,6 +362,11 @@ efficient_attention_backward_ck(
 
         p.attn_bias_ptrs.push_back(
             reinterpret_cast<void*>(&attn_bias_ptr[tmp_bias_offset]));
+
+        if (bias_requires_grad) {
+          p.grad_bias_ptrs.push_back(
+              reinterpret_cast<void*>(&grad_bias_ptr[tmp_bias_offset]));
+        };
       };
 
       p.logsumexp_ptrs.push_back(
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_backward.h b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_backward.h
index 18a070acf..79e160646 100644
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_backward.h
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_backward.h
@@ -213,6 +213,8 @@ void batched_backward_masktype_attnbias_dispatched(
       param.grad_v_ptr,
       param.has_attn_bias ? param.attn_bias_ptr : nullptr,
       nullptr, // p_acc1_bias
+      param.bias_has_grad ? param.grad_bias_ptr : nullptr,
+      nullptr,
       q_gs_ms_ks_lengths,
       q_gs_ms_ks_strides,
       k_gs_ns_ks_lengths,
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_backward.h b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_backward.h
index e215d98aa..d31256467 100644
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_backward.h
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_backward.h
@@ -228,6 +228,8 @@ void grouped_backward_masktype_attnbias_dispatched(
       param.grad_v_ptrs,
       param.attn_bias_ptrs,
       {}, // p_acc1_bias_vec;
+      param.grad_bias_ptrs,
+      {},
       problem_descs,
       QKVElementOp{},
       QKVElementOp{},
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_params.h b/xformers/csrc/attention/hip_fmha/ck_fmha_params.h
index b48f6fa8f..609f774ff 100644
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_params.h
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_params.h
@@ -106,6 +106,7 @@ struct BatchedBackwardParams {
 
   float scale;
   bool has_attn_bias;
+  bool bias_has_grad;
 
   // BMHK mode strides, last-dim contiguous
   std::array<int, 4> q_strides;
@@ -129,7 +130,7 @@ struct BatchedBackwardParams {
   void* grad_q_ptr;
   void* grad_k_ptr;
   void* grad_v_ptr;
-  // void* grad_bias_ptr;
+  void* grad_bias_ptr;
 
   float dropout_prob;
   int64_t philox_seed;
@@ -157,6 +158,7 @@ struct GroupedBackwardParams {
 
   float scale;
   bool has_attn_bias;
+  bool bias_has_grad;
 
   // MHK mode strides, last-dim contiguous
   std::array<int, 3> q_strides;
@@ -181,7 +183,7 @@ struct GroupedBackwardParams {
   std::vector<void*> grad_q_ptrs;
   std::vector<void*> grad_k_ptrs;
   std::vector<void*> grad_v_ptrs;
-  // std::vector<void *> grad_bias_ptrs;
+  std::vector<void*> grad_bias_ptrs;
 
   float dropout_prob;
   int64_t philox_seed;

From e00c33fc0a5ccfeb89bff1d329157855b3bfa50e Mon Sep 17 00:00:00 2001
From: Qianfeng Zhang <Qianfeng.Zhang@amd.com>
Date: Tue, 26 Sep 2023 20:05:55 +0000
Subject: [PATCH 069/641] Add max_seqlen_q parameter to
 efficient_attention_forward_ck()

---
 third_party/composable_kernel                 |  2 +-
 xformers/csrc/attention/attention.cpp         |  2 +-
 .../hip_fmha/attention_forward_generic.cpp    | 16 +++++++++----
 .../csrc/attention/hip_fmha/ck_fmha_params.h  |  2 ++
 xformers/ops/fmha/ck.py                       | 24 ++++++++-----------
 5 files changed, 26 insertions(+), 20 deletions(-)

diff --git a/third_party/composable_kernel b/third_party/composable_kernel
index c0c522688..04c206da8 160000
--- a/third_party/composable_kernel
+++ b/third_party/composable_kernel
@@ -1 +1 @@
-Subproject commit c0c522688d6d7e292faa62a0a5326204d2c7a168
+Subproject commit 04c206da8afe745e1b33197234155e703aadd715
diff --git a/xformers/csrc/attention/attention.cpp b/xformers/csrc/attention/attention.cpp
index d60114aa3..b136f2141 100644
--- a/xformers/csrc/attention/attention.cpp
+++ b/xformers/csrc/attention/attention.cpp
@@ -39,7 +39,7 @@ TORCH_LIBRARY_FRAGMENT(xformers, m) {
 #endif
 #if defined(USE_ROCM)
   m.def(TORCH_SELECTIVE_SCHEMA(
-      "xformers::efficient_attention_forward_ck(Tensor query, Tensor key, Tensor value, Tensor? attn_bias, Tensor? seqstart_q, Tensor? seqstart_k, float dropout_p, bool compute_logsumexp, int custom_mask_type, float? scale, Tensor? seqlen_k) -> (Tensor, Tensor, int, int)"));
+      "xformers::efficient_attention_forward_ck(Tensor query, Tensor key, Tensor value, Tensor? attn_bias, Tensor? seqstart_q, Tensor? seqstart_k, int? max_seqlen_q, float dropout_p, bool compute_logsumexp, int custom_mask_type, float? scale, Tensor? seqlen_k) -> (Tensor, Tensor, int, int)"));
   m.def(TORCH_SELECTIVE_SCHEMA(
       "xformers::efficient_attention_backward_ck(Tensor grad_out, Tensor query, Tensor key, Tensor value, Tensor? attn_bias, Tensor? seqstart_q, Tensor? seqstart_k, Tensor? seqlen_k, Tensor logsumexp, Tensor output, float dropout_p, int rng_seed, int rng_offset, int custom_mask_type, float? scale) -> (Tensor, Tensor, Tensor, Tensor)"));
   m.def(TORCH_SELECTIVE_SCHEMA(
diff --git a/xformers/csrc/attention/hip_fmha/attention_forward_generic.cpp b/xformers/csrc/attention/hip_fmha/attention_forward_generic.cpp
index 470a253ca..90370f2d2 100644
--- a/xformers/csrc/attention/hip_fmha/attention_forward_generic.cpp
+++ b/xformers/csrc/attention/hip_fmha/attention_forward_generic.cpp
@@ -50,6 +50,7 @@ efficient_attention_forward_ck(
     // position of the first key token for batch $b
     const c10::optional<at::Tensor>& seqstart_k,
     // (Mode 1MHK only) Maximum sequence length across batches
+    const c10::optional<int64_t> max_seqlen_q_,
     double dropout_p, // attention matrix dropout probability
     bool compute_logsumexp,
     int64_t custom_mask_type,
@@ -85,6 +86,7 @@ efficient_attention_forward_ck(
     CHECK_NOSPARSE_CONTIGUOUS_CPU((*seqstart_k));
     TORCH_CHECK(seqstart_q->size(0) == seqstart_k->size(0));
     TORCH_CHECK(query.size(0) == 1, "cu_seqlen only supports batch_size=1");
+    TORCH_CHECK(max_seqlen_q_.has_value());
   };
 
   // last dim is contiguous, device is kCUDA
@@ -211,7 +213,8 @@ efficient_attention_forward_ck(
 
     if (p.compute_logsumexp) {
       logsumexp = at::empty(
-          {B, num_heads, M}, query.options().dtype(at::ScalarType::Float));
+          {B, num_heads, M},
+          at::TensorOptions().dtype(at::kFloat).device(at::kCUDA));
       p.logsumexp_ptr = logsumexp.data_ptr();
     } else
       p.logsumexp_ptr = nullptr;
@@ -265,6 +268,9 @@ efficient_attention_forward_ck(
 
     p.custom_mask_type = custom_mask_type;
 
+    // max_seqlen_q is used to create logsumexp tensor
+    p.max_seqlen_q = *max_seqlen_q_;
+
     p.host_seqstart_q.resize(p.num_batches + 1);
     p.host_seqstart_k.resize(p.num_batches + 1);
 
@@ -360,12 +366,14 @@ efficient_attention_forward_ck(
 
     if (p.compute_logsumexp) {
       logsumexp = at::empty(
-          {num_heads, M}, query.options().dtype(at::ScalarType::Float));
+          {p.num_batches, num_heads, p.max_seqlen_q},
+          at::TensorOptions().dtype(at::kFloat).device(at::kCUDA));
       char* logsumexp_ptr = reinterpret_cast<char*>(logsumexp.data_ptr());
 
       for (int i = 0; i < p.num_batches; i++) {
-        size_t tmp_logsumexp_offset =
-            get_size_in_bytes(p.host_seqstart_q[i], logsumexp.scalar_type());
+        size_t tmp_logsumexp_offset = get_size_in_bytes(
+            static_cast<size_t>(i) * num_heads * p.max_seqlen_q,
+            logsumexp.scalar_type());
 
         p.logsumexp_ptrs.push_back(reinterpret_cast<void*>(logsumexp_ptr));
         logsumexp_ptr = logsumexp_ptr + tmp_logsumexp_offset;
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_params.h b/xformers/csrc/attention/hip_fmha/ck_fmha_params.h
index 609f774ff..4b782cc00 100644
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_params.h
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_params.h
@@ -55,6 +55,8 @@ struct GroupedInferParams {
   int K; // embed_dim for Query and Key
   int Kv; // embed_dim for Value
 
+  int max_seqlen_q;
+
   std::vector<int> host_seqstart_q;
   std::vector<int> host_seqstart_k;
   std::vector<int> host_seqlen_k;
diff --git a/xformers/ops/fmha/ck.py b/xformers/ops/fmha/ck.py
index ad5575f57..a6c76f996 100644
--- a/xformers/ops/fmha/ck.py
+++ b/xformers/ops/fmha/ck.py
@@ -39,20 +39,15 @@ def _get_seqlen_info(
     if isinstance(
         attn_bias, (BlockDiagonalMask, BlockDiagonalCausalWithOffsetPaddedKeysMask)
     ):
-        ##attn_bias.k_seqinfo.to(inp.query.device)
-        ##attn_bias.q_seqinfo.to(inp.query.device)
         seqstart_k = attn_bias.k_seqinfo.seqstart
         seqstart_q = attn_bias.q_seqinfo.seqstart
-        ##max_seqlen_q = attn_bias.q_seqinfo.max_seqlen
-        ##max_seqlen_k = attn_bias.k_seqinfo.max_seqlen
+        max_seqlen_q = attn_bias.q_seqinfo.max_seqlen
     else:
         seqstart_k = None
         seqstart_q = None
-        ##max_seqlen_q = -1
-        ##max_seqlen_k = -1
-
-    return seqstart_k, seqstart_q
+        max_seqlen_q = -1
 
+    return seqstart_k, seqstart_q, max_seqlen_q
 
 def _get_tensor_bias(
     attn_bias: Optional[Union[torch.Tensor, AttentionBias]]
@@ -170,7 +165,7 @@ def apply(
     ) -> Tuple[torch.Tensor, Optional[Context]]:
         if type(inp.attn_bias) not in FwOp.SUPPORTED_ATTN_BIAS_TYPES:
             raise NotImplementedError("Unsupported attn_bias type")
-        seqstart_k, seqstart_q  = _get_seqlen_info(inp)
+        seqstart_k, seqstart_q, max_seqlen_q  = _get_seqlen_info(inp)
         out, lse, rng_seed, rng_offset = cls.OPERATOR(
             query=inp.query,
             key=inp.key,
@@ -178,6 +173,7 @@ def apply(
             attn_bias=_get_tensor_bias(inp.attn_bias),
             seqstart_q=seqstart_q,
             seqstart_k=seqstart_k,
+            max_seqlen_q=max_seqlen_q,
             dropout_p=inp.p,
             compute_logsumexp=needs_gradient,
             custom_mask_type=_custom_mask_type(inp.attn_bias),
@@ -247,8 +243,7 @@ class BwOp(AttentionBwOpBase):
         type(None),
         torch.Tensor,
         LowerTriangularMask,
-        # TODO: Fix handling of gradient through the fMHA autograd function
-        # LowerTriangularMaskWithTensorBias,
+        LowerTriangularMaskWithTensorBias,
         BlockDiagonalMask,
         BlockDiagonalCausalMask,
         attn_bias.BlockDiagonalCausalFromBottomRightMask,
@@ -324,7 +319,6 @@ def apply(cls, ctx: Context, inp: Inputs, grad: torch.Tensor) -> Gradients:
                 raise NotImplementedError(f"Invalid rng_state: {ctx.rng_state}")
             rng_seed, rng_offset = ctx.rng_state.tolist()
 
-        force_pad_inf = torch.cuda.get_device_capability(inp.query.device) == (7, 5)
         (grad_q, grad_k, grad_v, grad_bias) = cls.OPERATOR(
             grad.to(dtype),
             inp.query,
@@ -333,8 +327,10 @@ def apply(cls, ctx: Context, inp: Inputs, grad: torch.Tensor) -> Gradients:
             attn_bias=_get_tensor_bias(inp.attn_bias),
             seqstart_q=seqstart_q,
             seqstart_k=seqstart_k,
-            seqlen_k=None,
-            logsumexp=ctx.get_padded_lse(32, force_pad_inf=force_pad_inf),
+            seqlen_k=inp.attn_bias.k_seqinfo.seqlen_cpu
+            if isinstance(inp.attn_bias, BlockDiagonalCausalWithOffsetPaddedKeysMask)
+            else None,
+            logsumexp=ctx.lse,
             output=ctx.out.to(dtype),
             dropout_p=inp.p,
             # if not using dropout, seed and offset are irrelevant but still expected

From bf5f193b68b3d774a83d00ca4f19a82fe378e85f Mon Sep 17 00:00:00 2001
From: Qianfeng Zhang <Qianfeng.Zhang@amd.com>
Date: Tue, 26 Sep 2023 21:13:56 +0000
Subject: [PATCH 070/641] Remove the randvals ptr/ptrs from
 efficient_attention_forward/backward since they are not used

---
 .../hip_fmha/attention_backward_generic.cpp   | 28 ++----------
 .../hip_fmha/attention_forward_generic.cpp    | 44 +++----------------
 .../hip_fmha/ck_fmha_batched_backward.h       | 14 ++----
 .../hip_fmha/ck_fmha_batched_forward.h        | 21 ++-------
 .../hip_fmha/ck_fmha_grouped_backward.h       | 11 +----
 .../hip_fmha/ck_fmha_grouped_forward.h        | 11 +----
 .../csrc/attention/hip_fmha/ck_fmha_params.h  | 18 ++------
 7 files changed, 24 insertions(+), 123 deletions(-)

diff --git a/xformers/csrc/attention/hip_fmha/attention_backward_generic.cpp b/xformers/csrc/attention/hip_fmha/attention_backward_generic.cpp
index c4b821a9e..89016ef02 100644
--- a/xformers/csrc/attention/hip_fmha/attention_backward_generic.cpp
+++ b/xformers/csrc/attention/hip_fmha/attention_backward_generic.cpp
@@ -122,8 +122,6 @@ efficient_attention_backward_ck(
   if (bias_requires_grad)
     grad_bias = at::empty(value.sizes(), value.options());
 
-  at::Tensor randvals;
-
   auto set_batched_backward_params = [&](BatchedBackwardParams& p) {
     p.B = B;
     p.M = M;
@@ -199,15 +197,6 @@ efficient_attention_backward_ck(
     p.philox_seed = rng_seed;
     p.philox_offset = rng_offset;
 
-    randvals = at::empty(
-        {B, num_heads, M, N}, query.options().dtype(at::ScalarType::Short));
-    p.randvals_strides = {
-        static_cast<int>(randvals.stride(0)),
-        static_cast<int>(randvals.stride(1)),
-        static_cast<int>(randvals.stride(2)),
-        static_cast<int>(randvals.stride(3))};
-    p.randvals_ptr = randvals.data_ptr();
-
     p.logsumexp_ptr = logsumexp.data_ptr();
   };
 
@@ -268,13 +257,6 @@ efficient_attention_backward_ck(
     p.philox_seed = rng_seed;
     p.philox_offset = rng_offset;
 
-    randvals = at::empty(
-        {num_heads, M, N}, query.options().dtype(at::ScalarType::Short));
-    p.randvals_strides = {
-        static_cast<int>(randvals.stride(0)),
-        static_cast<int>(randvals.stride(1)),
-        static_cast<int>(randvals.stride(2))};
-
     p.custom_mask_type = custom_mask_type;
 
     p.host_seqstart_q.resize(p.num_batches + 1);
@@ -310,7 +292,6 @@ efficient_attention_backward_ck(
     char* attn_bias_ptr = reinterpret_cast<char*>(bias->data_ptr());
 
     char* logsumexp_ptr = reinterpret_cast<char*>(logsumexp.data_ptr());
-    char* randvals_ptr = reinterpret_cast<char*>(randvals.data_ptr());
 
     char* grad_q_ptr = reinterpret_cast<char*>(grad_q.data_ptr());
     char* grad_k_ptr = reinterpret_cast<char*>(grad_k.data_ptr());
@@ -335,10 +316,6 @@ efficient_attention_backward_ck(
           grad_out.scalar_type());
       size_t tmp_logsumexp_offset =
           get_size_in_bytes(p.host_seqstart_q[i], logsumexp.scalar_type());
-      size_t tmp_randvals_offset = get_size_in_bytes(
-          static_cast<size_t>(p.host_seqstart_q[i]) * p.randvals_strides[1] +
-              static_cast<size_t>(p.host_seqstart_k[i]) * p.randvals_strides[2],
-          randvals.scalar_type());
 
       p.q_ptrs.push_back(reinterpret_cast<void*>(&q_ptr[tmp_q_offset]));
       p.grad_q_ptrs.push_back(
@@ -371,8 +348,9 @@ efficient_attention_backward_ck(
 
       p.logsumexp_ptrs.push_back(
           reinterpret_cast<void*>(&logsumexp_ptr[tmp_logsumexp_offset]));
-      p.randvals_ptrs.push_back(
-          reinterpret_cast<void*>(&randvals_ptr[tmp_randvals_offset]));
+
+      // ToDO: remove this after dev-op fix
+      p.randvals_ptrs.push_back(nullptr);
     }
   };
 
diff --git a/xformers/csrc/attention/hip_fmha/attention_forward_generic.cpp b/xformers/csrc/attention/hip_fmha/attention_forward_generic.cpp
index 90370f2d2..2490ac839 100644
--- a/xformers/csrc/attention/hip_fmha/attention_forward_generic.cpp
+++ b/xformers/csrc/attention/hip_fmha/attention_forward_generic.cpp
@@ -105,7 +105,6 @@ efficient_attention_forward_ck(
   int64_t Kv = value.size(-1);
 
   at::Tensor logsumexp;
-  at::Tensor randvals;
 
   at::Tensor out = at::empty({B, M, num_heads, Kv}, query.options());
 
@@ -195,21 +194,10 @@ efficient_attention_forward_ck(
     p.compute_logsumexp = compute_logsumexp;
 
     // the following parameters are only used by training forward
-    if (p.use_dropout) {
+    if (p.use_dropout)
       p.dropout_prob = static_cast<float>(dropout_p);
-
-      randvals = at::empty(
-          {B, num_heads, M, N}, query.options().dtype(at::ScalarType::Short));
-      p.randvals_strides = {
-          static_cast<int>(randvals.stride(0)),
-          static_cast<int>(randvals.stride(1)),
-          static_cast<int>(randvals.stride(2)),
-          static_cast<int>(randvals.stride(3))};
-      p.randvals_ptr = randvals.data_ptr();
-    } else {
+    else
       p.dropout_prob = 0.0f;
-      p.randvals_ptr = nullptr;
-    };
 
     if (p.compute_logsumexp) {
       logsumexp = at::empty(
@@ -332,6 +320,9 @@ efficient_attention_forward_ck(
         p.attn_bias_ptrs.push_back(
             reinterpret_cast<void*>(&attn_bias_ptr[tmp_bias_offset]));
       };
+
+      // ToDO: remove this after dev-op fix
+      p.randvals_ptrs.push_back(nullptr);
     }
 
     p.use_dropout = use_dropout;
@@ -340,28 +331,9 @@ efficient_attention_forward_ck(
     p.compute_logsumexp = compute_logsumexp;
 
     // the following parameters are only used by training forward
-    if (p.use_dropout) {
+    if (p.use_dropout)
       p.dropout_prob = static_cast<float>(dropout_p);
-
-      randvals = at::empty(
-          {num_heads, M, N}, query.options().dtype(at::ScalarType::Short));
-      p.randvals_strides = {
-          static_cast<int>(randvals.stride(0)),
-          static_cast<int>(randvals.stride(1)),
-          static_cast<int>(randvals.stride(2))};
-      char* randvals_ptr = reinterpret_cast<char*>(randvals.data_ptr());
-
-      for (int i = 0; i < p.num_batches; i++) {
-        size_t tmp_randvals_offset = get_size_in_bytes(
-            static_cast<size_t>(p.host_seqstart_q[i]) * p.randvals_strides[1] +
-                static_cast<size_t>(p.host_seqstart_k[i]) *
-                    p.randvals_strides[2],
-            randvals.scalar_type());
-
-        p.randvals_ptrs.push_back(reinterpret_cast<void*>(randvals_ptr));
-        randvals_ptr = randvals_ptr + tmp_randvals_offset;
-      };
-    } else
+    else
       p.dropout_prob = 0.0f;
 
     if (p.compute_logsumexp) {
@@ -407,8 +379,6 @@ efficient_attention_forward_ck(
       throw std::runtime_error("input data-type is not supported!");
   };
 
-  // torch::save(randvals, "randvals_dev.zip");
-
   return std::make_tuple(out, logsumexp, philox_seed, philox_offset);
 }
 
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_backward.h b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_backward.h
index 79e160646..c9a44499f 100644
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_backward.h
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_backward.h
@@ -170,14 +170,6 @@ void batched_backward_masktype_attnbias_dispatched(
   std::vector<ck::index_t> ygrad_gs_ms_os_lengths{
       param.B, param.num_heads, param.M, param.Kv};
 
-  std::vector<ck::index_t> z_gs_ms_ns_lengths{
-      param.B, param.num_heads, param.M, param.N};
-  std::vector<ck::index_t> z_gs_ms_ns_strides{
-      param.randvals_strides[0],
-      param.randvals_strides[1],
-      param.randvals_strides[2],
-      param.randvals_strides[3]};
-
   std::vector<ck::index_t> lse_gs_ms_lengths{param.B, param.num_heads, param.M};
 
   std::vector<ck::index_t> d_gs_ms_ns_lengths;
@@ -203,7 +195,7 @@ void batched_backward_masktype_attnbias_dispatched(
   auto arg_ptr = op.MakeArgumentPointer(
       param.q_ptr,
       param.k_ptr,
-      param.randvals_ptr,
+      nullptr,
       param.v_ptr,
       param.out_ptr,
       param.logsumexp_ptr,
@@ -219,8 +211,8 @@ void batched_backward_masktype_attnbias_dispatched(
       q_gs_ms_ks_strides,
       k_gs_ns_ks_lengths,
       k_gs_ns_ks_strides,
-      z_gs_ms_ns_lengths,
-      z_gs_ms_ns_strides,
+      {1, 1, 1, 1},
+      {0, 0, 0, 0},
       v_gs_os_ns_lengths,
       v_gs_os_ns_strides,
       y_gs_ms_os_lengths,
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_forward.h b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_forward.h
index f5b5dd8d9..e6015c6bc 100644
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_forward.h
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_forward.h
@@ -174,21 +174,6 @@ void batched_forward_masktype_attnbias_dispatched(
       param.out_strides[1],
       param.out_strides[3]};
 
-  std::vector<ck::index_t> z_gs_ms_ns_lengths;
-  std::vector<ck::index_t> z_gs_ms_ns_strides;
-
-  if (param.use_dropout) {
-    z_gs_ms_ns_lengths = {param.B, param.num_heads, param.M, param.N};
-    z_gs_ms_ns_strides = {
-        param.randvals_strides[0],
-        param.randvals_strides[1],
-        param.randvals_strides[2],
-        param.randvals_strides[3]};
-  } else {
-    z_gs_ms_ns_lengths = {1, 1, 1, 1};
-    z_gs_ms_ns_strides = {0, 0, 0, 0};
-  };
-
   std::vector<ck::index_t> lse_gs_ms_lengths{param.B, param.num_heads, param.M};
 
   std::vector<ck::index_t> d_gs_ms_ns_lengths;
@@ -222,7 +207,7 @@ void batched_forward_masktype_attnbias_dispatched(
       param.k_ptr,
       param.v_ptr,
       param.out_ptr,
-      param.randvals_ptr,
+      nullptr,
       param.logsumexp_ptr,
       param.has_attn_bias ? param.attn_bias_ptr : nullptr,
       {}, // p_acc1_biases;
@@ -234,8 +219,8 @@ void batched_forward_masktype_attnbias_dispatched(
       b1_gs_os_ns_strides,
       c_gs_ms_os_lengths,
       c_gs_ms_os_strides,
-      z_gs_ms_ns_lengths,
-      z_gs_ms_ns_strides,
+      {1, 1, 1, 1},
+      {0, 0, 0, 0},
       lse_gs_ms_lengths,
       d_gs_ms_ns_lengths,
       d_gs_ms_ns_strides,
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_backward.h b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_backward.h
index d31256467..ba7fbe71e 100644
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_backward.h
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_backward.h
@@ -164,13 +164,6 @@ void grouped_backward_masktype_attnbias_dispatched(
     std::vector<ck::index_t> y_gs_ms_os_strides{
         0, param.out_strides[0], param.out_strides[1], param.out_strides[2]};
 
-    std::vector<ck::index_t> z_gs_ms_ns_lengths{1, G1, M, N};
-    std::vector<ck::index_t> z_gs_ms_ns_strides{
-        0,
-        param.randvals_strides[0],
-        param.randvals_strides[1],
-        param.randvals_strides[2]};
-
     std::vector<ck::index_t> lse_gs_ms_lengths{1, G1, M};
     std::vector<ck::index_t> lse_gs_ms_strides{0, param.M, 1};
 
@@ -195,8 +188,8 @@ void grouped_backward_masktype_attnbias_dispatched(
         q_gs_ms_ks_strides,
         k_gs_ns_ks_lengths,
         k_gs_ns_ks_strides,
-        z_gs_ms_ns_lengths,
-        z_gs_ms_ns_strides,
+        {1, 1, 1, 1},
+        {0, 0, 0, 0},
         v_gs_os_ns_lengths,
         v_gs_os_ns_strides,
         y_gs_ms_os_lengths,
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_forward.h b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_forward.h
index 0d902ebf6..49f3c47e5 100644
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_forward.h
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_forward.h
@@ -170,13 +170,6 @@ void grouped_forward_masktype_attnbias_dispatched(
     std::vector<ck::index_t> c_gs_ms_os_strides{
         0, param.out_strides[1], param.out_strides[0], param.out_strides[2]};
 
-    std::vector<ck::index_t> z_gs_ms_ns_lengths{1, G1, M, N};
-    std::vector<ck::index_t> z_gs_ms_ns_strides{
-        0,
-        param.randvals_strides[0],
-        param.randvals_strides[1],
-        param.randvals_strides[2]};
-
     std::vector<ck::index_t> lse_gs_ms_lengths{1, G1, M};
     std::vector<ck::index_t> lse_gs_ms_strides{0, param.M, 1};
 
@@ -205,8 +198,8 @@ void grouped_forward_masktype_attnbias_dispatched(
          b1_gs_os_ns_strides,
          c_gs_ms_os_lengths,
          c_gs_ms_os_strides,
-         z_gs_ms_ns_lengths,
-         z_gs_ms_ns_strides,
+         {1, 1, 1, 1},
+         {0, 0, 0, 0},
          lse_gs_ms_lengths,
          lse_gs_ms_strides,
          d_gs_ms_ns_lengths,
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_params.h b/xformers/csrc/attention/hip_fmha/ck_fmha_params.h
index 4b782cc00..ccea06a1c 100644
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_params.h
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_params.h
@@ -39,10 +39,6 @@ struct BatchedForwardParams : public BatchedInferParams {
   int64_t philox_seed;
   int64_t philox_offset;
 
-  // BHMN mode strides, completely contiguous
-  std::array<int32_t, 4> randvals_strides;
-  void* randvals_ptr;
-
   // completely contiguous
   void* logsumexp_ptr;
 };
@@ -90,12 +86,11 @@ struct GroupedForwardParams : public GroupedInferParams {
   int64_t philox_seed;
   int64_t philox_offset;
 
-  // HMN mode strides, completely contiguous
-  std::array<int, 3> randvals_strides;
-  std::vector<void*> randvals_ptrs;
-
   // completely contiguous
   std::vector<void*> logsumexp_ptrs;
+
+  // TODO: need remove this after dev-op fix
+  std::vector<void*> randvals_ptrs;
 };
 
 struct BatchedBackwardParams {
@@ -140,10 +135,6 @@ struct BatchedBackwardParams {
 
   // completely contiguous
   const void* logsumexp_ptr;
-
-  // BHMN mode strides, completely contiguous
-  std::array<int, 4> randvals_strides;
-  void* randvals_ptr;
 };
 
 struct GroupedBackwardParams {
@@ -194,7 +185,6 @@ struct GroupedBackwardParams {
   // HM mode strides, completely contiguous
   std::vector<const void*> logsumexp_ptrs;
 
-  // HMN mode strides, completely contiguous
-  std::array<int, 3> randvals_strides;
+  // TODO: need remove this after dev-op fix
   std::vector<void*> randvals_ptrs;
 };

From dc71d806930ccf9671e3d8687081174c2bb087f3 Mon Sep 17 00:00:00 2001
From: Qianfeng Zhang <Qianfeng.Zhang@amd.com>
Date: Wed, 27 Sep 2023 20:05:53 +0000
Subject: [PATCH 071/641] Updates and have some batched backward testing cases
 passed

---
 third_party/composable_kernel                 |  2 +-
 xformers/csrc/attention/attention.cpp         |  2 +-
 .../hip_fmha/attention_backward_generic.cpp   | 53 ++++++++++++-------
 .../hip_fmha/attention_forward_generic.cpp    |  5 +-
 .../hip_fmha/ck_fmha_batched_backward.h       | 19 +++----
 .../hip_fmha/ck_fmha_grouped_backward.h       |  4 +-
 .../hip_fmha/ck_fmha_grouped_forward.h        |  4 +-
 .../csrc/attention/hip_fmha/ck_fmha_params.h  | 16 +++---
 xformers/ops/fmha/ck.py                       |  5 +-
 9 files changed, 58 insertions(+), 52 deletions(-)

diff --git a/third_party/composable_kernel b/third_party/composable_kernel
index 04c206da8..b23b3d717 160000
--- a/third_party/composable_kernel
+++ b/third_party/composable_kernel
@@ -1 +1 @@
-Subproject commit 04c206da8afe745e1b33197234155e703aadd715
+Subproject commit b23b3d717ab17a06c490b70508d18ef7773849a4
diff --git a/xformers/csrc/attention/attention.cpp b/xformers/csrc/attention/attention.cpp
index b136f2141..18ddcdcfc 100644
--- a/xformers/csrc/attention/attention.cpp
+++ b/xformers/csrc/attention/attention.cpp
@@ -41,7 +41,7 @@ TORCH_LIBRARY_FRAGMENT(xformers, m) {
   m.def(TORCH_SELECTIVE_SCHEMA(
       "xformers::efficient_attention_forward_ck(Tensor query, Tensor key, Tensor value, Tensor? attn_bias, Tensor? seqstart_q, Tensor? seqstart_k, int? max_seqlen_q, float dropout_p, bool compute_logsumexp, int custom_mask_type, float? scale, Tensor? seqlen_k) -> (Tensor, Tensor, int, int)"));
   m.def(TORCH_SELECTIVE_SCHEMA(
-      "xformers::efficient_attention_backward_ck(Tensor grad_out, Tensor query, Tensor key, Tensor value, Tensor? attn_bias, Tensor? seqstart_q, Tensor? seqstart_k, Tensor? seqlen_k, Tensor logsumexp, Tensor output, float dropout_p, int rng_seed, int rng_offset, int custom_mask_type, float? scale) -> (Tensor, Tensor, Tensor, Tensor)"));
+      "xformers::efficient_attention_backward_ck(Tensor grad_out, Tensor query, Tensor key, Tensor value, Tensor? attn_bias, Tensor? seqstart_q, Tensor? seqstart_k, int? max_seqlen_q, Tensor? seqlen_k, Tensor logsumexp, Tensor output, float dropout_p, int rng_seed, int rng_offset, int custom_mask_type, float? scale) -> (Tensor, Tensor, Tensor, Tensor)"));
   m.def(TORCH_SELECTIVE_SCHEMA(
       "xformers::_ck_rand_uniform(float p, Tensor out) -> Tensor"));
 #endif
diff --git a/xformers/csrc/attention/hip_fmha/attention_backward_generic.cpp b/xformers/csrc/attention/hip_fmha/attention_backward_generic.cpp
index 89016ef02..3808ae35e 100644
--- a/xformers/csrc/attention/hip_fmha/attention_backward_generic.cpp
+++ b/xformers/csrc/attention/hip_fmha/attention_backward_generic.cpp
@@ -38,6 +38,8 @@ efficient_attention_backward_ck(
     // (Mode 1MHK only) [b+1]: cu_seqlens_k[b] contains the
     // position of the first key token for batch $b
     const c10::optional<at::Tensor>& seqstart_k,
+    // (Mode 1MHK only) Maximum sequence length across batches
+    const c10::optional<int64_t> max_seqlen_q_,
     const c10::optional<at::Tensor>& seqlen_k,
     const at::Tensor& logsumexp,
     const at::Tensor& out,
@@ -78,14 +80,19 @@ efficient_attention_backward_ck(
   TORCH_CHECK(query.size(3) == key.size(3));
   TORCH_CHECK(value.size(3) == grad_out.size(3));
 
-  // handle potentially non-contiguous grad_out through a copy
-  CHECK_NOSPARSE_CONTIGUOUS_CUDA(grad_out);
+  // CK-FlashAttn requires out, grad_out to have same shapes
+  TORCH_CHECK(out.sizes() == grad_out.sizes());
+  TORCH_CHECK(out.strides() == grad_out.strides());
 
   // last dim is contiguous, device is CUDA
+  CHECK_NOSPARSE_LASTCONTIGUOUS_CUDA(grad_out);
   CHECK_NOSPARSE_LASTCONTIGUOUS_CUDA(query);
   CHECK_NOSPARSE_LASTCONTIGUOUS_CUDA(key);
   CHECK_NOSPARSE_LASTCONTIGUOUS_CUDA(value);
 
+  // logsumexp should be completely contiguous
+  CHECK_NOSPARSE_CONTIGUOUS_CUDA(logsumexp);
+
   TORCH_CHECK(seqstart_q.has_value() == seqstart_k.has_value());
   TORCH_CHECK(
       !(seqstart_q.has_value() && bias.has_value()),
@@ -99,6 +106,7 @@ efficient_attention_backward_ck(
     CHECK_NOSPARSE_CONTIGUOUS_CPU((*seqstart_k));
     TORCH_CHECK(seqstart_q->size(0) == seqstart_k->size(0));
     TORCH_CHECK(query.size(0) == 1, "seqstart_q only supports batch_size=1");
+    TORCH_CHECK(max_seqlen_q_.has_value());
   }
 
   // at::cuda::CUDAGuard device_guard(query.device());
@@ -113,14 +121,14 @@ efficient_attention_backward_ck(
 
   at::Tensor grad_q, grad_k, grad_v, grad_bias;
 
-  grad_q = at::empty(query.sizes(), query.options());
+  grad_q = at::zeros(query.sizes(), query.options());
   grad_k = at::empty(key.sizes(), key.options());
   grad_v = at::empty(value.sizes(), value.options());
 
   const bool bias_requires_grad = bias.has_value() && bias->requires_grad();
 
   if (bias_requires_grad)
-    grad_bias = at::empty(value.sizes(), value.options());
+    grad_bias = at::empty(bias->sizes(), bias->options());
 
   auto set_batched_backward_params = [&](BatchedBackwardParams& p) {
     p.B = B;
@@ -130,6 +138,10 @@ efficient_attention_backward_ck(
     p.K = K;
     p.Kv = Kv;
 
+    TORCH_CHECK(p.B == logsumexp.size(0));
+    TORCH_CHECK(p.num_heads == logsumexp.size(1));
+    TORCH_CHECK(p.M == logsumexp.size(2));
+
     if (scale.has_value()) {
       p.scale = float(*scale);
     } else {
@@ -140,6 +152,8 @@ efficient_attention_backward_ck(
     p.k_ptr = key.data_ptr();
     p.v_ptr = value.data_ptr();
     p.grad_out_ptr = grad_out.data_ptr();
+    p.out_ptr = out.data_ptr();
+
     p.grad_q_ptr = grad_q.data_ptr();
     p.grad_k_ptr = grad_k.data_ptr();
     p.grad_v_ptr = grad_v.data_ptr();
@@ -159,11 +173,11 @@ efficient_attention_backward_ck(
         static_cast<int>(value.stride(1)),
         static_cast<int>(value.stride(2)),
         static_cast<int>(value.stride(3))};
-    p.grad_out_strides = {
-        static_cast<int>(grad_out.stride(0)),
-        static_cast<int>(grad_out.stride(1)),
-        static_cast<int>(grad_out.stride(2)),
-        static_cast<int>(grad_out.stride(3))};
+    p.out_strides = {
+        static_cast<int>(out.stride(0)),
+        static_cast<int>(out.stride(1)),
+        static_cast<int>(out.stride(2)),
+        static_cast<int>(out.stride(3))};
 
     if (bias.has_value()) {
       CHECK_NOSPARSE_LASTCONTIGUOUS_CUDA((*bias));
@@ -208,6 +222,12 @@ efficient_attention_backward_ck(
     p.K = K;
     p.Kv = Kv;
 
+    p.max_seqlen_q = *max_seqlen_q_;
+
+    TORCH_CHECK(p.num_batches == logsumexp.size(0));
+    TORCH_CHECK(p.num_heads == logsumexp.size(1));
+    TORCH_CHECK(p.max_seqlen_q == logsumexp.size(2));
+
     if (scale.has_value()) {
       p.scale = float(*scale);
     } else {
@@ -231,11 +251,6 @@ efficient_attention_backward_ck(
         static_cast<int>(out.stride(2)),
         static_cast<int>(out.stride(3))};
 
-    p.grad_out_strides = {
-        static_cast<int>(grad_out.stride(1)),
-        static_cast<int>(grad_out.stride(2)),
-        static_cast<int>(grad_out.stride(3))};
-
     if (bias.has_value()) {
       CHECK_NOSPARSE_LASTCONTIGUOUS_CUDA((*bias));
       TORCH_CHECK(bias->scalar_type() == query.scalar_type());
@@ -311,11 +326,9 @@ efficient_attention_backward_ck(
       size_t tmp_o_offset = get_size_in_bytes(
           static_cast<size_t>(p.host_seqstart_q[i]) * p.out_strides[0],
           out.scalar_type());
-      size_t tmp_grad_o_offset = get_size_in_bytes(
-          static_cast<size_t>(p.host_seqstart_q[i]) * p.grad_out_strides[0],
-          grad_out.scalar_type());
-      size_t tmp_logsumexp_offset =
-          get_size_in_bytes(p.host_seqstart_q[i], logsumexp.scalar_type());
+      size_t tmp_logsumexp_offset = get_size_in_bytes(
+          static_cast<size_t>(i) * p.num_heads * p.max_seqlen_q,
+          logsumexp.scalar_type());
 
       p.q_ptrs.push_back(reinterpret_cast<void*>(&q_ptr[tmp_q_offset]));
       p.grad_q_ptrs.push_back(
@@ -328,7 +341,7 @@ efficient_attention_backward_ck(
           reinterpret_cast<void*>(&grad_v_ptr[tmp_v_offset]));
       p.out_ptrs.push_back(reinterpret_cast<void*>(&out_ptr[tmp_o_offset]));
       p.grad_out_ptrs.push_back(
-          reinterpret_cast<void*>(&grad_out_ptr[tmp_grad_o_offset]));
+          reinterpret_cast<void*>(&grad_out_ptr[tmp_o_offset]));
 
       if (bias.has_value()) {
         size_t tmp_bias_offset = get_size_in_bytes(
diff --git a/xformers/csrc/attention/hip_fmha/attention_forward_generic.cpp b/xformers/csrc/attention/hip_fmha/attention_forward_generic.cpp
index 2490ac839..1c7035cc0 100644
--- a/xformers/csrc/attention/hip_fmha/attention_forward_generic.cpp
+++ b/xformers/csrc/attention/hip_fmha/attention_forward_generic.cpp
@@ -346,9 +346,8 @@ efficient_attention_forward_ck(
         size_t tmp_logsumexp_offset = get_size_in_bytes(
             static_cast<size_t>(i) * num_heads * p.max_seqlen_q,
             logsumexp.scalar_type());
-
-        p.logsumexp_ptrs.push_back(reinterpret_cast<void*>(logsumexp_ptr));
-        logsumexp_ptr = logsumexp_ptr + tmp_logsumexp_offset;
+        p.logsumexp_ptrs.push_back(
+            reinterpret_cast<void*>(&logsumexp_ptr[tmp_logsumexp_offset]));
       };
     };
   };
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_backward.h b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_backward.h
index c9a44499f..360c87651 100644
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_backward.h
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_backward.h
@@ -56,7 +56,7 @@ void batched_backward_masktype_attnbias_dispatched(
       ck::tensor_operation::device::TensorSpecialization::Default;
   static constexpr auto TensorSpecY =
       ck::tensor_operation::device::TensorSpecialization::Default;
-  static constexpr bool Deterministic = false;
+  static constexpr bool Deterministic = true;
 
   // Tunables
   static constexpr ck::index_t ABBlockTransferSrcScalarPerVector = 1;
@@ -167,9 +167,6 @@ void batched_backward_masktype_attnbias_dispatched(
       param.out_strides[1],
       param.out_strides[3]};
 
-  std::vector<ck::index_t> ygrad_gs_ms_os_lengths{
-      param.B, param.num_heads, param.M, param.Kv};
-
   std::vector<ck::index_t> lse_gs_ms_lengths{param.B, param.num_heads, param.M};
 
   std::vector<ck::index_t> d_gs_ms_ns_lengths;
@@ -195,7 +192,7 @@ void batched_backward_masktype_attnbias_dispatched(
   auto arg_ptr = op.MakeArgumentPointer(
       param.q_ptr,
       param.k_ptr,
-      nullptr,
+      nullptr, // p_z_grid
       param.v_ptr,
       param.out_ptr,
       param.logsumexp_ptr,
@@ -207,15 +204,15 @@ void batched_backward_masktype_attnbias_dispatched(
       nullptr, // p_acc1_bias
       param.bias_has_grad ? param.grad_bias_ptr : nullptr,
       nullptr,
-      q_gs_ms_ks_lengths,
+      q_gs_ms_ks_lengths, // q, dQ should have same shape
       q_gs_ms_ks_strides,
-      k_gs_ns_ks_lengths,
+      k_gs_ns_ks_lengths, // k, dK should have same shape
       k_gs_ns_ks_strides,
-      {1, 1, 1, 1},
-      {0, 0, 0, 0},
-      v_gs_os_ns_lengths,
+      {1, 1, 1, 1}, // z_gs_ms_ns_lengths
+      {0, 0, 0, 0}, // z_gs_ms_ns_strides
+      v_gs_os_ns_lengths, // v, dV should have same shape
       v_gs_os_ns_strides,
-      y_gs_ms_os_lengths,
+      y_gs_ms_os_lengths, // y, dY should have same shape
       y_gs_ms_os_strides,
       lse_gs_ms_lengths,
       d_gs_ms_ns_lengths,
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_backward.h b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_backward.h
index ba7fbe71e..fd86be85b 100644
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_backward.h
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_backward.h
@@ -164,8 +164,8 @@ void grouped_backward_masktype_attnbias_dispatched(
     std::vector<ck::index_t> y_gs_ms_os_strides{
         0, param.out_strides[0], param.out_strides[1], param.out_strides[2]};
 
-    std::vector<ck::index_t> lse_gs_ms_lengths{1, G1, M};
-    std::vector<ck::index_t> lse_gs_ms_strides{0, param.M, 1};
+    std::vector<ck::index_t> lse_gs_ms_lengths{1, G1, param.max_seqlen_q};
+    std::vector<ck::index_t> lse_gs_ms_strides{0, param.max_seqlen_q, 1};
 
     std::vector<ck::index_t> d_gs_ms_ns_lengths;
     std::vector<ck::index_t> d_gs_ms_ns_strides;
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_forward.h b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_forward.h
index 49f3c47e5..dd2204ac0 100644
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_forward.h
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_forward.h
@@ -170,8 +170,8 @@ void grouped_forward_masktype_attnbias_dispatched(
     std::vector<ck::index_t> c_gs_ms_os_strides{
         0, param.out_strides[1], param.out_strides[0], param.out_strides[2]};
 
-    std::vector<ck::index_t> lse_gs_ms_lengths{1, G1, M};
-    std::vector<ck::index_t> lse_gs_ms_strides{0, param.M, 1};
+    std::vector<ck::index_t> lse_gs_ms_lengths{1, G1, param.max_seqlen_q};
+    std::vector<ck::index_t> lse_gs_ms_strides{0, param.max_seqlen_q, 1};
 
     std::vector<ck::index_t> d_gs_ms_ns_lengths;
     std::vector<ck::index_t> d_gs_ms_ns_strides;
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_params.h b/xformers/csrc/attention/hip_fmha/ck_fmha_params.h
index ccea06a1c..2186c7601 100644
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_params.h
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_params.h
@@ -116,14 +116,11 @@ struct BatchedBackwardParams {
   const void* k_ptr;
   const void* v_ptr;
   const void* attn_bias_ptr;
+  const void* grad_out_ptr;
   const void* out_ptr;
 
   uint8_t custom_mask_type;
 
-  std::array<int, 4> grad_out_strides;
-
-  const void* grad_out_ptr;
-
   void* grad_q_ptr;
   void* grad_k_ptr;
   void* grad_v_ptr;
@@ -133,7 +130,7 @@ struct BatchedBackwardParams {
   int64_t philox_seed;
   int64_t philox_offset;
 
-  // completely contiguous
+  // BHM mode lengths, completely contiguous
   const void* logsumexp_ptr;
 };
 
@@ -145,6 +142,8 @@ struct GroupedBackwardParams {
   int K; // embed_dim for Query and Key
   int Kv; // embed_dim for Value
 
+  int max_seqlen_q;
+
   std::vector<int> host_seqstart_q;
   std::vector<int> host_seqstart_k;
   std::vector<int> host_seqlen_k;
@@ -165,14 +164,11 @@ struct GroupedBackwardParams {
   std::vector<const void*> k_ptrs;
   std::vector<const void*> v_ptrs;
   std::vector<const void*> attn_bias_ptrs;
+  std::vector<const void*> grad_out_ptrs;
   std::vector<const void*> out_ptrs;
 
   uint8_t custom_mask_type;
 
-  std::array<int, 3> grad_out_strides;
-
-  std::vector<const void*> grad_out_ptrs;
-
   std::vector<void*> grad_q_ptrs;
   std::vector<void*> grad_k_ptrs;
   std::vector<void*> grad_v_ptrs;
@@ -182,7 +178,7 @@ struct GroupedBackwardParams {
   int64_t philox_seed;
   int64_t philox_offset;
 
-  // HM mode strides, completely contiguous
+  // BHM mode lengths, completely contiguous
   std::vector<const void*> logsumexp_ptrs;
 
   // TODO: need remove this after dev-op fix
diff --git a/xformers/ops/fmha/ck.py b/xformers/ops/fmha/ck.py
index a6c76f996..5f201f603 100644
--- a/xformers/ops/fmha/ck.py
+++ b/xformers/ops/fmha/ck.py
@@ -165,7 +165,7 @@ def apply(
     ) -> Tuple[torch.Tensor, Optional[Context]]:
         if type(inp.attn_bias) not in FwOp.SUPPORTED_ATTN_BIAS_TYPES:
             raise NotImplementedError("Unsupported attn_bias type")
-        seqstart_k, seqstart_q, max_seqlen_q  = _get_seqlen_info(inp)
+        seqstart_k, seqstart_q, max_seqlen_q = _get_seqlen_info(inp)
         out, lse, rng_seed, rng_offset = cls.OPERATOR(
             query=inp.query,
             key=inp.key,
@@ -305,7 +305,7 @@ def apply(cls, ctx: Context, inp: Inputs, grad: torch.Tensor) -> Gradients:
         if type(inp.attn_bias) not in BwOp.SUPPORTED_ATTN_BIAS_TYPES:
             raise NotImplementedError("Unsupported attn_bias type")
 
-        seqstart_k, seqstart_q = _get_seqlen_info(inp)
+        seqstart_k, seqstart_q, max_seqlen_q = _get_seqlen_info(inp)
         dtype = inp.query.dtype
 
         rng_seed = rng_offset = 0
@@ -327,6 +327,7 @@ def apply(cls, ctx: Context, inp: Inputs, grad: torch.Tensor) -> Gradients:
             attn_bias=_get_tensor_bias(inp.attn_bias),
             seqstart_q=seqstart_q,
             seqstart_k=seqstart_k,
+            max_seqlen_q=max_seqlen_q, 
             seqlen_k=inp.attn_bias.k_seqinfo.seqlen_cpu
             if isinstance(inp.attn_bias, BlockDiagonalCausalWithOffsetPaddedKeysMask)
             else None,

From b42396133d95e038b4281d48020b5a37ebc49999 Mon Sep 17 00:00:00 2001
From: Qianfeng Zhang <Qianfeng.Zhang@amd.com>
Date: Thu, 5 Oct 2023 12:07:17 +0000
Subject: [PATCH 072/641] Tiny change in fmha_grouped_forward

---
 xformers/csrc/attention/hip_fmha/ck_fmha_grouped_forward.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_forward.h b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_forward.h
index dd2204ac0..4ce28c964 100644
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_forward.h
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_forward.h
@@ -58,7 +58,7 @@ void grouped_forward_masktype_attnbias_dispatched(
       ck::tensor_operation::device::TensorSpecialization::Default;
   static constexpr auto TensorSpecC =
       ck::tensor_operation::device::TensorSpecialization::Default;
-  static constexpr bool Deterministic = true;
+  static constexpr bool Deterministic = false;
 
   // Tunables
   static constexpr ck::index_t ABBlockTransferSrcScalarPerVector = 1;
@@ -170,7 +170,7 @@ void grouped_forward_masktype_attnbias_dispatched(
     std::vector<ck::index_t> c_gs_ms_os_strides{
         0, param.out_strides[1], param.out_strides[0], param.out_strides[2]};
 
-    std::vector<ck::index_t> lse_gs_ms_lengths{1, G1, param.max_seqlen_q};
+    std::vector<ck::index_t> lse_gs_ms_lengths{1, G1, M};
     std::vector<ck::index_t> lse_gs_ms_strides{0, param.max_seqlen_q, 1};
 
     std::vector<ck::index_t> d_gs_ms_ns_lengths;

From 26c653c6a34db24d1d01ae93954ecade722e8680 Mon Sep 17 00:00:00 2001
From: Qianfeng Zhang <Qianfeng.Zhang@amd.com>
Date: Sat, 7 Oct 2023 17:38:37 +0000
Subject: [PATCH 073/641] Add comments in batched backward

---
 xformers/csrc/attention/hip_fmha/ck_fmha_batched_backward.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_backward.h b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_backward.h
index 360c87651..98faf4967 100644
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_backward.h
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_backward.h
@@ -215,7 +215,7 @@ void batched_backward_masktype_attnbias_dispatched(
       y_gs_ms_os_lengths, // y, dY should have same shape
       y_gs_ms_os_strides,
       lse_gs_ms_lengths,
-      d_gs_ms_ns_lengths,
+      d_gs_ms_ns_lengths, // bias, grad_bias should have same shape
       d_gs_ms_ns_strides,
       {}, // acc1_biases_gs_ms_os_lengths
       {}, // acc1_biases_gs_ms_os_strides

From 90a2c4282818f8cb61923456e7b8e1c543d24375 Mon Sep 17 00:00:00 2001
From: Qianfeng Zhang <Qianfeng.Zhang@amd.com>
Date: Sun, 8 Oct 2023 15:54:29 +0000
Subject: [PATCH 074/641] Update and changes which make simple grouped backward
 tests passed

---
 .../hip_fmha/attention_backward_generic.cpp   | 20 ++++++++-----
 .../hip_fmha/attention_forward_generic.cpp    | 11 ++++---
 .../hip_fmha/ck_fmha_grouped_backward.h       | 30 +++++++++----------
 .../csrc/attention/hip_fmha/ck_fmha_params.h  |  4 +++
 4 files changed, 37 insertions(+), 28 deletions(-)

diff --git a/xformers/csrc/attention/hip_fmha/attention_backward_generic.cpp b/xformers/csrc/attention/hip_fmha/attention_backward_generic.cpp
index 3808ae35e..da9e9db34 100644
--- a/xformers/csrc/attention/hip_fmha/attention_backward_generic.cpp
+++ b/xformers/csrc/attention/hip_fmha/attention_backward_generic.cpp
@@ -304,14 +304,17 @@ efficient_attention_backward_ck(
 
     char* out_ptr = reinterpret_cast<char*>(out.data_ptr());
     char* grad_out_ptr = reinterpret_cast<char*>(grad_out.data_ptr());
-    char* attn_bias_ptr = reinterpret_cast<char*>(bias->data_ptr());
+    char* attn_bias_ptr =
+        bias.has_value() ? reinterpret_cast<char*>(bias->data_ptr()) : nullptr;
 
     char* logsumexp_ptr = reinterpret_cast<char*>(logsumexp.data_ptr());
 
     char* grad_q_ptr = reinterpret_cast<char*>(grad_q.data_ptr());
     char* grad_k_ptr = reinterpret_cast<char*>(grad_k.data_ptr());
     char* grad_v_ptr = reinterpret_cast<char*>(grad_v.data_ptr());
-    char* grad_bias_ptr = reinterpret_cast<char*>(grad_bias.data_ptr());
+    char* grad_bias_ptr = bias_requires_grad
+        ? reinterpret_cast<char*>(grad_bias.data_ptr())
+        : nullptr;
 
     for (int i = 0; i < p.num_batches; i++) {
       size_t tmp_q_offset = get_size_in_bytes(
@@ -333,16 +336,22 @@ efficient_attention_backward_ck(
       p.q_ptrs.push_back(reinterpret_cast<void*>(&q_ptr[tmp_q_offset]));
       p.grad_q_ptrs.push_back(
           reinterpret_cast<void*>(&grad_q_ptr[tmp_q_offset]));
+
       p.k_ptrs.push_back(reinterpret_cast<void*>(&k_ptr[tmp_k_offset]));
       p.grad_k_ptrs.push_back(
           reinterpret_cast<void*>(&grad_k_ptr[tmp_k_offset]));
+
       p.v_ptrs.push_back(reinterpret_cast<void*>(&v_ptr[tmp_v_offset]));
       p.grad_v_ptrs.push_back(
           reinterpret_cast<void*>(&grad_v_ptr[tmp_v_offset]));
+
       p.out_ptrs.push_back(reinterpret_cast<void*>(&out_ptr[tmp_o_offset]));
       p.grad_out_ptrs.push_back(
           reinterpret_cast<void*>(&grad_out_ptr[tmp_o_offset]));
 
+      p.logsumexp_ptrs.push_back(
+          reinterpret_cast<void*>(&logsumexp_ptr[tmp_logsumexp_offset]));
+
       if (bias.has_value()) {
         size_t tmp_bias_offset = get_size_in_bytes(
             static_cast<size_t>(p.host_seqstart_q[i]) * p.attn_bias_strides[2] +
@@ -356,11 +365,8 @@ efficient_attention_backward_ck(
         if (bias_requires_grad) {
           p.grad_bias_ptrs.push_back(
               reinterpret_cast<void*>(&grad_bias_ptr[tmp_bias_offset]));
-        };
-      };
-
-      p.logsumexp_ptrs.push_back(
-          reinterpret_cast<void*>(&logsumexp_ptr[tmp_logsumexp_offset]));
+        }
+      }
 
       // ToDO: remove this after dev-op fix
       p.randvals_ptrs.push_back(nullptr);
diff --git a/xformers/csrc/attention/hip_fmha/attention_forward_generic.cpp b/xformers/csrc/attention/hip_fmha/attention_forward_generic.cpp
index 1c7035cc0..166c9806a 100644
--- a/xformers/csrc/attention/hip_fmha/attention_forward_generic.cpp
+++ b/xformers/csrc/attention/hip_fmha/attention_forward_generic.cpp
@@ -104,9 +104,11 @@ efficient_attention_forward_ck(
   int64_t K = query.size(-1);
   int64_t Kv = value.size(-1);
 
+  auto opts = query.options();
+
   at::Tensor logsumexp;
 
-  at::Tensor out = at::empty({B, M, num_heads, Kv}, query.options());
+  at::Tensor out = at::empty({B, M, num_heads, Kv}, opts);
 
   const bool use_dropout = std::fpclassify(dropout_p) != FP_ZERO;
   int64_t philox_seed;
@@ -200,9 +202,7 @@ efficient_attention_forward_ck(
       p.dropout_prob = 0.0f;
 
     if (p.compute_logsumexp) {
-      logsumexp = at::empty(
-          {B, num_heads, M},
-          at::TensorOptions().dtype(at::kFloat).device(at::kCUDA));
+      logsumexp = at::empty({B, num_heads, M}, opts.dtype(at::kFloat));
       p.logsumexp_ptr = logsumexp.data_ptr();
     } else
       p.logsumexp_ptr = nullptr;
@@ -338,8 +338,7 @@ efficient_attention_forward_ck(
 
     if (p.compute_logsumexp) {
       logsumexp = at::empty(
-          {p.num_batches, num_heads, p.max_seqlen_q},
-          at::TensorOptions().dtype(at::kFloat).device(at::kCUDA));
+          {p.num_batches, num_heads, p.max_seqlen_q}, opts.dtype(at::kFloat));
       char* logsumexp_ptr = reinterpret_cast<char*>(logsumexp.data_ptr());
 
       for (int i = 0; i < p.num_batches; i++) {
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_backward.h b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_backward.h
index fd86be85b..5371126d3 100644
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_backward.h
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_backward.h
@@ -60,9 +60,9 @@ void grouped_backward_masktype_attnbias_dispatched(
   static constexpr bool Deterministic = false;
 
   // Tunables
-  static constexpr ck::index_t ABBlockTransferSrcScalarPerVector = 1;
-  static constexpr ck::index_t B1CShuffleBlockTransferScalarPerVector = 1;
-  static constexpr ck::index_t Acc0BiasTransferSrcScalarPerVector = 1;
+  static constexpr ck::index_t ABBlockTransferSrcScalarPerVector = 1; // 8
+  static constexpr ck::index_t B1CShuffleBlockTransferScalarPerVector = 1; // 4
+  static constexpr ck::index_t Acc0BiasTransferSrcScalarPerVector = 1; // 4
 
   using DeviceOpInstance = ck::tensor_operation::device::
       DeviceGroupedMultiheadAttentionBackward_Qloop_Xdl_CShuffle_V2<
@@ -94,7 +94,7 @@ void grouped_backward_masktype_attnbias_dispatched(
           256,
           64, // MPerBlock
           128, // NPerBlock
-          128, // KPerBlock
+          64, // KPerBlock
           128, // Gemm1NPerBlock
           32, // Gemm1KPerBlock
           64, // Gemm2KPerBlock
@@ -140,7 +140,7 @@ void grouped_backward_masktype_attnbias_dispatched(
 
   for (std::size_t i = 0; i < param.num_batches; i++) {
     int M = param.host_seqstart_q[i + 1] - param.host_seqstart_q[i]; // seqlen Q
-    int N = param.host_seqstart_k.empty()
+    int N = param.host_seqlen_k.empty()
         ? param.host_seqstart_k[i + 1] - param.host_seqstart_k[i]
         : param.host_seqlen_k[i];
     int K = param.K;
@@ -149,22 +149,22 @@ void grouped_backward_masktype_attnbias_dispatched(
 
     std::vector<ck::index_t> q_gs_ms_ks_lengths{1, G1, M, K};
     std::vector<ck::index_t> q_gs_ms_ks_strides{
-        0, param.q_strides[0], param.q_strides[1], param.q_strides[2]};
+        0, param.q_strides[1], param.q_strides[0], param.q_strides[2]};
 
     std::vector<ck::index_t> k_gs_ns_ks_lengths{1, G1, N, K};
     std::vector<ck::index_t> k_gs_ns_ks_strides{
-        0, param.k_strides[0], param.k_strides[1], param.k_strides[2]};
+        0, param.k_strides[1], param.k_strides[0], param.k_strides[2]};
 
     // to be changed to v_gs_ns_os_lengths
     std::vector<ck::index_t> v_gs_os_ns_lengths{1, G1, Kv, N};
     std::vector<ck::index_t> v_gs_os_ns_strides{
-        0, param.v_strides[0], param.v_strides[2], param.v_strides[1]};
+        0, param.v_strides[1], param.v_strides[2], param.v_strides[0]};
 
     std::vector<ck::index_t> y_gs_ms_os_lengths{1, G1, M, Kv};
     std::vector<ck::index_t> y_gs_ms_os_strides{
-        0, param.out_strides[0], param.out_strides[1], param.out_strides[2]};
+        0, param.out_strides[1], param.out_strides[0], param.out_strides[2]};
 
-    std::vector<ck::index_t> lse_gs_ms_lengths{1, G1, param.max_seqlen_q};
+    std::vector<ck::index_t> lse_gs_ms_lengths{1, G1, M};
     std::vector<ck::index_t> lse_gs_ms_strides{0, param.max_seqlen_q, 1};
 
     std::vector<ck::index_t> d_gs_ms_ns_lengths;
@@ -184,19 +184,19 @@ void grouped_backward_masktype_attnbias_dispatched(
     };
 
     problem_descs.push_back({
-        q_gs_ms_ks_lengths,
+        q_gs_ms_ks_lengths, // q, dQ should have same shape
         q_gs_ms_ks_strides,
-        k_gs_ns_ks_lengths,
+        k_gs_ns_ks_lengths, // k, dK should have same shape
         k_gs_ns_ks_strides,
         {1, 1, 1, 1},
         {0, 0, 0, 0},
-        v_gs_os_ns_lengths,
+        v_gs_os_ns_lengths, // v, dV should have same shape
         v_gs_os_ns_strides,
-        y_gs_ms_os_lengths,
+        y_gs_ms_os_lengths, // y, dY should have same shape
         y_gs_ms_os_strides,
         lse_gs_ms_lengths,
         lse_gs_ms_strides,
-        d_gs_ms_ns_lengths,
+        d_gs_ms_ns_lengths, // bias, grad_bias should have same shape
         d_gs_ms_ns_strides,
         {}, // acc1_biases_gs_ms_os_lengths
         {}, // acc1_biases_gs_ms_os_strides
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_params.h b/xformers/csrc/attention/hip_fmha/ck_fmha_params.h
index 2186c7601..73961d0a8 100644
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_params.h
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_params.h
@@ -167,6 +167,10 @@ struct GroupedBackwardParams {
   std::vector<const void*> grad_out_ptrs;
   std::vector<const void*> out_ptrs;
 
+  // used by the light_v2 kernel
+  // TODO use these as workspace
+  std::vector<void*> ydotdy_ptrs;
+
   uint8_t custom_mask_type;
 
   std::vector<void*> grad_q_ptrs;

From 05c367e7bf1a5c030825512bed0a261ff0dab0a4 Mon Sep 17 00:00:00 2001
From: Qianfeng Zhang <Qianfeng.Zhang@amd.com>
Date: Sun, 8 Oct 2023 20:30:34 +0000
Subject: [PATCH 075/641] Tiny update to make some test cases pass

---
 xformers/csrc/attention/hip_fmha/ck_fmha_batched_backward.h | 2 +-
 xformers/csrc/attention/hip_fmha/ck_fmha_grouped_backward.h | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_backward.h b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_backward.h
index 98faf4967..601fdbff2 100644
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_backward.h
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_backward.h
@@ -56,7 +56,7 @@ void batched_backward_masktype_attnbias_dispatched(
       ck::tensor_operation::device::TensorSpecialization::Default;
   static constexpr auto TensorSpecY =
       ck::tensor_operation::device::TensorSpecialization::Default;
-  static constexpr bool Deterministic = true;
+  static constexpr bool Deterministic = false;
 
   // Tunables
   static constexpr ck::index_t ABBlockTransferSrcScalarPerVector = 1;
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_backward.h b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_backward.h
index 5371126d3..e442ae8c1 100644
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_backward.h
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_backward.h
@@ -94,7 +94,7 @@ void grouped_backward_masktype_attnbias_dispatched(
           256,
           64, // MPerBlock
           128, // NPerBlock
-          64, // KPerBlock
+          128, // KPerBlock
           128, // Gemm1NPerBlock
           32, // Gemm1KPerBlock
           64, // Gemm2KPerBlock

From 9a04ba76aed68ddb95359166a715e800e45f36f6 Mon Sep 17 00:00:00 2001
From: Qianfeng Zhang <Qianfeng.Zhang@amd.com>
Date: Mon, 9 Oct 2023 17:02:38 +0000
Subject: [PATCH 076/641] Update to align the allocation of
 grad_q/grad_k/grad_v with that of q/k/v

---
 .../hip_fmha/attention_backward_generic.cpp   | 40 +++++++++++++++++--
 1 file changed, 36 insertions(+), 4 deletions(-)

diff --git a/xformers/csrc/attention/hip_fmha/attention_backward_generic.cpp b/xformers/csrc/attention/hip_fmha/attention_backward_generic.cpp
index da9e9db34..da1a082b2 100644
--- a/xformers/csrc/attention/hip_fmha/attention_backward_generic.cpp
+++ b/xformers/csrc/attention/hip_fmha/attention_backward_generic.cpp
@@ -119,16 +119,48 @@ efficient_attention_backward_ck(
   int64_t K = query.size(3);
   int64_t Kv = value.size(3);
 
+  auto opts = query.options();
+
   at::Tensor grad_q, grad_k, grad_v, grad_bias;
 
-  grad_q = at::zeros(query.sizes(), query.options());
-  grad_k = at::empty(key.sizes(), key.options());
-  grad_v = at::empty(value.sizes(), value.options());
+  if (query.size(1) == key.size(1) && query.size(3) == value.size(3) &&
+      query.storage().is_alias_of(key.storage()) &&
+      query.storage().is_alias_of(value.storage())) {
+    // Create one big contiguous chunk for grad_q, grad_k, grad_v
+    // This is because q, k and v usually come from a single
+    // output of a linear layer that is chunked.
+    // Creating the gradients with the right layout saves us
+    // a `torch.cat` call in the backward pass
+    at::Tensor chunk = at::empty({B, M, 3, num_heads, K}, opts);
+    grad_q = chunk.select(2, 0);
+    grad_k = chunk.select(2, 1);
+    grad_v = chunk.select(2, 2);
+  } else if (
+      key.size(3) == value.size(3) &&
+      key.storage().is_alias_of(value.storage())) {
+    // Create one big contiguous chunk for grad_k, grad_v
+    // This is because k and v usually come from a single
+    // output of a linear layer that is chunked.
+    // Creating the gradients with the right layout saves us
+    // a `torch.cat` call in the backward pass
+    at::Tensor chunk = at::empty({B, N, 2, num_heads, Kv}, opts);
+    grad_k = chunk.select(2, 0);
+    grad_v = chunk.select(2, 1);
+
+    grad_q = at::empty_strided(query.sizes(), query.strides(), query.options());
+    grad_q.fill_(0);
+  } else {
+    grad_q = at::empty_strided(query.sizes(), query.strides(), query.options());
+    grad_k = at::empty_strided(key.sizes(), key.strides(), key.options());
+    grad_v = at::empty_strided(value.sizes(), key.strides(), value.options());
+    grad_q.fill_(0);
+  }
 
   const bool bias_requires_grad = bias.has_value() && bias->requires_grad();
 
   if (bias_requires_grad)
-    grad_bias = at::empty(bias->sizes(), bias->options());
+    grad_bias =
+        at::empty_strided(bias->sizes(), bias->strides(), bias->options());
 
   auto set_batched_backward_params = [&](BatchedBackwardParams& p) {
     p.B = B;

From e7b7916db90457f68fa62e67a56bca4625dd7ae8 Mon Sep 17 00:00:00 2001
From: Qianfeng Zhang <Qianfeng.Zhang@amd.com>
Date: Mon, 9 Oct 2023 17:03:49 +0000
Subject: [PATCH 077/641] Add benchmark_mem_eff_attention_ck.py for
 forward/backward benchmarking on CK

---
 .../benchmark_mem_eff_attention_ck.py         | 324 ++++++++++++++++++
 1 file changed, 324 insertions(+)
 create mode 100644 xformers/benchmarks/benchmark_mem_eff_attention_ck.py

diff --git a/xformers/benchmarks/benchmark_mem_eff_attention_ck.py b/xformers/benchmarks/benchmark_mem_eff_attention_ck.py
new file mode 100644
index 000000000..bd700518d
--- /dev/null
+++ b/xformers/benchmarks/benchmark_mem_eff_attention_ck.py
@@ -0,0 +1,324 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
+#
+# This source code is licensed under the BSD license found in the
+# LICENSE file in the root directory of this source tree.
+
+
+import itertools
+import random
+from functools import partial
+
+import torch
+from torch.utils import benchmark
+from xformers.benchmarks.utils import benchmark_main_helper
+
+import xformers.ops
+import xformers.ops.fmha as fmha
+
+torch.backends.cuda.matmul.allow_tf32 = False
+
+
+def create_attn_bias(
+    bias_type,
+    batch_size: int,
+    num_heads: int,
+    q_len: int,
+    kv_len: int,
+    device,
+    dtype,
+    bias_requires_grad: bool = False,
+):
+    NoneType = type(None)
+    if bias_type is NoneType:
+        return None
+    if bias_type is torch.Tensor:
+        attn_bias = torch.randn((1, 1, q_len, kv_len), device=device, dtype=dtype)
+        return attn_bias.expand(batch_size, num_heads, q_len, kv_len)
+    if bias_type is xformers.ops.LowerTriangularMask:
+        return bias_type()
+    assert False, f"Unsupported bias type: {bias_type}"
+
+
+def ref_attention_bmk(q, k, v, attn_bias=None, p=0.0):
+    if isinstance(attn_bias, xformers.ops.AttentionMask):
+        attn_bias = (
+            attn_bias.materialize((q.shape[0], 1, q.shape[1], k.shape[1]))
+            .to(q)
+            .squeeze()
+        )
+    q = q * (1.0 / q.shape[-1] ** 0.5)
+    if attn_bias is None:
+        attn = q @ k.transpose(-2, -1)
+    else:
+        # equivalent to (q @ k.transpose(-2, -1) + m).softmax(-1) @ v
+        # but faster, and is what is used in PyTorch now
+        attn = torch.baddbmm(attn_bias, q, k.transpose(-2, -1))
+    attn = attn.softmax(-1)
+    if p > 0:
+        attn = torch.nn.functional.dropout(attn, p=p)
+    return attn @ v
+
+
+def ref_attention(q, k, v, attn_bias, p=0.0):
+    assert q.ndim == 4
+    B, M, H, K = q.shape
+
+    def T(t):
+        return t.permute((0, 2, 1, 3)).reshape(
+            [t.shape[0] * t.shape[2], t.shape[1], t.shape[3]]
+        )
+
+    if isinstance(attn_bias, torch.Tensor):
+        attn_bias = attn_bias.reshape(B * H, M, M)
+    out = ref_attention_bmk(T(q), T(k), T(v), attn_bias, p)
+    out = out.reshape([q.shape[0], q.shape[2], q.shape[1], v.shape[3]])
+    return out.permute((0, 2, 1, 3))
+
+
+min_run_time = 0.5
+device = torch.device("cuda")
+
+NUM_THREADS = [1] if device.type == "cuda" else [1, 40]
+SHAPES = [
+    # ViT
+    (384, 197, 1, 88),
+    (384, 197, 1, 80),
+    (384, 197, 1, 64),
+    (1024, 197, 1, 88),
+    (1024, 197, 1, 80),
+    (1024, 197, 1, 64),
+    # ViT-Huge
+    (32 * 16, 197, 1, 80),
+    (32, 197, 16, 80),
+    (32, 197, 16, 64),
+    (32, 197, 16, 128),
+    # ViT-Giant
+    (16 * 16, 197, 1, 88),
+    (16, 197, 16, 88),
+    (16, 197, 16, 64),
+    (16, 197, 16, 128),
+    # FB models
+    (1024, 82, 8, 64),
+    (150, 256, 16, 64),
+    (64, 256, 12, 64),
+    # Stable diffusion (https://github.com/huggingface/diffusers/pull/532)
+    (1, 4096, 16, 40),  # 512x512
+    (1, 16384, 16, 40),  # 1024x1024
+    (1, 4096, 16, 80),
+    #(1, 16384, 16, 80),  // disabled on MI250 due to big memory requirement
+    # + bs4
+    (4, 4096, 16, 40),
+    #(4, 16384, 16, 40),  // disabled on MI250 due to big memory requirement
+    (4, 4096, 16, 80),
+    #(4, 16384, 16, 80),  // disabled on MI250 due to big memory requirement
+    # ParlAI model
+    #(256, 4096, 16, 64), // disabled on MI250 due to big memory requirement
+    # Zetta B M H K
+    (8, 2048, 20, 128),
+    # LLaMa 70b - mp=8/16
+    *sorted(itertools.product([1, 2], [2048, 4096, 8192], [4, 8], [128])),
+    *sorted(
+        ##itertools.product([16], [128, 512, 1024], [16], [16, 32, 64, 128, 160, 256])
+        ## disabled K/Kv bigger than 128
+        itertools.product([16], [128, 512, 1024], [16], [16, 32, 64, 128])
+    ),
+]
+
+OPS = [
+    (xformers.ops.fmha.ck.FwOp, xformers.ops.fmha.ck.BwOp),
+    #(xformers.ops.fmha.flash.FwOp, xformers.ops.fmha.flash.BwOp),
+    # TODO: Triton is not stable: it can trigger Illegal Memory Accesses
+    # and its performance varies a lot between runs.
+    # (xformers.ops.fmha.triton.FwOp, xformers.ops.fmha.triton.BwOp),
+]
+
+
+def product_dict(**kwargs):
+    keys = kwargs.keys()
+    vals = kwargs.values()
+    for instance in itertools.product(*vals):
+        yield dict(zip(keys, instance))
+
+
+CASES = list(
+    product_dict(
+        shape=SHAPES,
+        num_threads=NUM_THREADS,
+        dropout_p=[0.0],
+        attn_bias_cfg=[(type(None), False)],
+        dtype=[torch.half],
+    )
+)
+
+# Add more cases with some variations
+for c in CASES.copy():
+    c = c.copy()
+    c.update(
+        random.Random(str(c["shape"])).choice(
+            [
+                {"dropout_p": 0.3},
+                {"attn_bias_cfg": (torch.Tensor, False)},
+                {"attn_bias_cfg": (torch.Tensor, True)},
+                {"attn_bias_cfg": (xformers.ops.LowerTriangularMask, False)},
+                {"dtype": torch.bfloat16},
+                ##{"dtype": torch.float},
+            ]
+        )
+    )
+    CASES.append(c)
+
+
+def create_tensors(shape, dtype, requires_grad=False):
+    B, M, H, K = shape
+    qkv = torch.rand(
+        [B, M, 3, H, K], device=device, dtype=dtype, requires_grad=requires_grad
+    )
+    q, k, v = xformers.ops.unbind(qkv, 2)
+    return qkv, q, k, v
+
+def create_discrete_tensors(shape, dtype, requires_grad=False):
+    B, M, H, K = shape
+    q = torch.rand([B, M, H, K], device=device, dtype=dtype, requires_grad=requires_grad)
+    k = torch.rand([B, M, H, K], device=device, dtype=dtype, requires_grad=requires_grad)
+    v = torch.rand([B, M, H, K], device=device, dtype=dtype, requires_grad=requires_grad)
+
+    return q, k, v
+
+def mem_eff_attention_fw(shape, num_threads: int, attn_bias_cfg, dropout_p, dtype):
+    B, M, H, K = shape
+    _, q, k, v = create_tensors(shape, dtype)
+    attn_bias_type, attn_bias_requires_grad = attn_bias_cfg
+    if attn_bias_requires_grad:
+        return
+    bias = create_attn_bias(
+        attn_bias_type,
+        batch_size=B,
+        num_heads=H,
+        q_len=M,
+        kv_len=M,
+        device=device,
+        dtype=dtype,
+        bias_requires_grad=attn_bias_requires_grad,
+    )
+    inp = fmha.Inputs(query=q, key=k, value=v, attn_bias=bias, p=dropout_p)
+
+    dtype_str = {
+        torch.bfloat16: "b16",
+        torch.half: "f16",
+        torch.float: "f32",
+    }[dtype]
+    sub_label = (
+        f"{dtype_str} {B}-{M}-{H}-{K}, p={dropout_p}, "
+        f"BiasT={attn_bias_type.__name__}"
+    )
+
+    has_run = False
+    for fw_op, bw_op in OPS:
+        if not fw_op.supports(inp):
+            continue
+
+        yield benchmark.Timer(
+            stmt="fn(q, k, v, attn_bias, p)",
+            globals={
+                "q": q,
+                "k": k,
+                "v": v,
+                "attn_bias": inp.attn_bias,
+                "p": dropout_p,
+                "fn": partial(
+                    xformers.ops.memory_efficient_attention, op=(fw_op, bw_op)
+                ),
+            },
+            label=f"attention (attn_bias={attn_bias_type})",
+            description=fw_op.NAME,
+            sub_label=sub_label,
+            num_threads=num_threads,
+        )
+        has_run = True
+
+    if not has_run:
+        return
+
+    yield benchmark.Timer(
+        stmt="fn(q, k, v, attn_bias, p)",
+        globals={
+            "q": q,
+            "k": k,
+            "v": v,
+            "attn_bias": inp.attn_bias,
+            "p": dropout_p,
+            "fn": ref_attention,
+        },
+        label=f"attention (attn_bias={attn_bias_type})",
+        description="eager",
+        sub_label=sub_label,
+        num_threads=num_threads,
+    )
+
+
+def mem_eff_attention_bw(shape, num_threads: int, attn_bias_cfg, dropout_p, dtype):
+    B, M, H, K = shape
+    _, q, k, v = create_tensors(shape, dtype, requires_grad=True)
+
+    attn_bias_type, attn_bias_requires_grad = attn_bias_cfg
+    bias = create_attn_bias(
+        attn_bias_type,
+        batch_size=B,
+        num_heads=H,
+        q_len=M,
+        kv_len=M,
+        device=device,
+        dtype=dtype,
+        bias_requires_grad=attn_bias_requires_grad,
+    )
+    inp = fmha.Inputs(query=q, key=k, value=v, attn_bias=bias, p=dropout_p)
+
+    dtype_str = {
+        torch.bfloat16: "b16",
+        torch.half: "f16",
+        torch.float: "f32",
+    }[dtype]
+    sub_label = (
+        f"{dtype_str} {B}-{M}-{H}-{K}, p={dropout_p}, "
+        f"BiasT={attn_bias_type.__name__}, BiasGrad={attn_bias_requires_grad}"
+    )
+
+    has_run = False
+    for fw_op, bw_op in OPS:
+        if not fw_op.supports(inp) or not bw_op.supports(inp):
+            continue
+        has_run = True
+        out = xformers.ops.memory_efficient_attention(
+            inp.query, inp.key, inp.value, inp.attn_bias, inp.p, op=(fw_op, bw_op)
+        )
+        grad_benchmark = torch.ones_like(q)
+
+        yield benchmark.Timer(
+            stmt="out.backward(grad, retain_graph=True)",
+            globals={
+                "out": out,
+                "grad": grad_benchmark,
+            },
+            label=f"attention backward (attn_bias={attn_bias_type})",
+            description=bw_op.NAME,
+            sub_label=sub_label,
+            num_threads=num_threads,
+        )
+        del out
+
+    if not has_run:
+        return
+    yield benchmark.Timer(
+        stmt="out.backward(grad, retain_graph=True)",
+        globals={
+            "out": ref_attention(q, k, v, inp.attn_bias, dropout_p),
+            "grad": grad_benchmark,
+        },
+        label=f"attention backward (attn_bias={attn_bias_type})",
+        description="vanilla",
+        sub_label=sub_label,
+        num_threads=num_threads,
+    )
+
+benchmark_main_helper(mem_eff_attention_fw, CASES, min_run_time=min_run_time)
+benchmark_main_helper(mem_eff_attention_bw, CASES, min_run_time=min_run_time)

From 56e936f38dc1cfdcc0f3a8439db2aac4370e941d Mon Sep 17 00:00:00 2001
From: Qianfeng Zhang <Qianfeng.Zhang@amd.com>
Date: Mon, 9 Oct 2023 18:09:58 +0000
Subject: [PATCH 078/641] Using classes for dispatched execution

---
 .../hip_fmha/ck_fmha_batched_backward.h       | 367 ++++++++---------
 .../ck_fmha_batched_backward_bp16.cpp         |  12 +-
 .../ck_fmha_batched_backward_fp16.cpp         |  12 +-
 .../hip_fmha/ck_fmha_batched_forward.h        | 383 ++++++++---------
 .../hip_fmha/ck_fmha_batched_forward_bp16.cpp |  12 +-
 .../hip_fmha/ck_fmha_batched_forward_fp16.cpp |  12 +-
 .../hip_fmha/ck_fmha_grouped_backward.h       | 377 ++++++++---------
 .../ck_fmha_grouped_backward_bp16.cpp         |  12 +-
 .../ck_fmha_grouped_backward_fp16.cpp         |  12 +-
 .../hip_fmha/ck_fmha_grouped_forward.h        | 388 +++++++++---------
 .../hip_fmha/ck_fmha_grouped_forward_bp16.cpp |  12 +-
 .../hip_fmha/ck_fmha_grouped_forward_fp16.cpp |  12 +-
 12 files changed, 807 insertions(+), 804 deletions(-)

diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_backward.h b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_backward.h
index 601fdbff2..f87e3fda3 100644
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_backward.h
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_backward.h
@@ -13,9 +13,7 @@
 #include "ck_fmha_params.h"
 
 template <typename scalar_t, int32_t custom_mask_type, bool has_attn_bias>
-void batched_backward_masktype_attnbias_dispatched(
-    BatchedBackwardParams& param,
-    hipStream_t stream) {
+struct batched_backward_masktype_attnbias_dispatched {
   using PassThrough = ck::tensor_operation::element_wise::PassThrough;
   using Scale = ck::tensor_operation::element_wise::Scale;
 
@@ -58,185 +56,188 @@ void batched_backward_masktype_attnbias_dispatched(
       ck::tensor_operation::device::TensorSpecialization::Default;
   static constexpr bool Deterministic = false;
 
-  // Tunables
-  static constexpr ck::index_t ABBlockTransferSrcScalarPerVector = 1;
-  static constexpr ck::index_t B1CShuffleBlockTransferScalarPerVector = 1;
-  static constexpr ck::index_t Acc0BiasTransferSrcScalarPerVector = 1;
-
-  using DeviceOpInstance = ck::tensor_operation::device::
-      DeviceBatchedMultiheadAttentionBackward_Qloop_Xdl_CShuffle_V2<
-          NumDimG,
-          NumDimM,
-          NumDimN,
-          NumDimK,
-          NumDimO,
-          InputDataType,
-          OutputDataType,
-          GemmDataType,
-          ZDataType,
-          LSEDataType,
-          Acc0BiasDataType,
-          Acc1BiasDataType,
-          AccDataType,
-          ShuffleDataType,
-          QKVElementOp,
-          QKVElementOp,
-          Scale,
-          QKVElementOp,
-          YElementOp,
-          GemmSpec,
-          TensorSpecQ,
-          TensorSpecK,
-          TensorSpecV,
-          TensorSpecY,
-          1,
-          256,
-          64, // MPerBlock
-          128, // NPerBlock
-          128, // KPerBlock
-          128, // Gemm1NPerBlock
-          32, // Gemm1KPerBlock
-          64, // Gemm2KPerBlock
-          8, // AK1
-          8, // BK1
-          2, // A1K1
-          32, // MPerXDL
-          32, // NPerXDL
-          2, // MXdlPerWave
-          1, // NXdlPerWave
-          4, // Gemm1NXdlPerWave
-          1, // Gemm2NXdlPerWave
-          S<4, 64, 1>, // ABlockTransfer
-          S<1, 0, 2>,
-          S<1, 0, 2>,
-          2,
-          ABBlockTransferSrcScalarPerVector, // TUNABLE
-          8,
-          true,
-          S<4, 64, 1>, // B0BlockTransfer
-          S<1, 0, 2>,
-          S<1, 0, 2>,
-          2,
-          ABBlockTransferSrcScalarPerVector, // TUNABLE
-          8,
-          true,
-          Acc0BiasTransferSrcScalarPerVector, // TUNABLE
-          S<8, 32, 1>, // B1BlockTransfer
-          S<0, 2, 1>,
-          S<0, 2, 1>,
-          1,
-          B1CShuffleBlockTransferScalarPerVector, // TUNABLE
-          2,
-          false,
-          1, // CShuffleMXdlPerWavePerShuffle
-          4, // CShuffleNXdlPerWavePerShuffle
-          S<1, 32, 1, 8>,
-          B1CShuffleBlockTransferScalarPerVector, // TUNABLE
-          MaskingSpec,
-          Deterministic>;
-
-  std::vector<ck::index_t> q_gs_ms_ks_lengths{
-      param.B, param.num_heads, param.M, param.K};
-  std::vector<ck::index_t> q_gs_ms_ks_strides{
-      param.q_strides[0],
-      param.q_strides[2],
-      param.q_strides[1],
-      param.q_strides[3]};
-
-  std::vector<ck::index_t> k_gs_ns_ks_lengths{
-      param.B, param.num_heads, param.N, param.K};
-  std::vector<ck::index_t> k_gs_ns_ks_strides{
-      param.k_strides[0],
-      param.k_strides[2],
-      param.k_strides[1],
-      param.k_strides[3]};
-
-  std::vector<ck::index_t> v_gs_os_ns_lengths{
-      param.B, param.num_heads, param.Kv, param.N};
-  std::vector<ck::index_t> v_gs_os_ns_strides{
-      param.v_strides[0],
-      param.v_strides[2],
-      param.v_strides[3],
-      param.v_strides[1]};
-
-  std::vector<ck::index_t> y_gs_ms_os_lengths{
-      param.B, param.num_heads, param.M, param.Kv};
-  std::vector<ck::index_t> y_gs_ms_os_strides{
-      param.out_strides[0],
-      param.out_strides[2],
-      param.out_strides[1],
-      param.out_strides[3]};
-
-  std::vector<ck::index_t> lse_gs_ms_lengths{param.B, param.num_heads, param.M};
-
-  std::vector<ck::index_t> d_gs_ms_ns_lengths;
-  std::vector<ck::index_t> d_gs_ms_ns_strides;
-
-  if constexpr (has_attn_bias) {
-    d_gs_ms_ns_lengths = {param.B, param.num_heads, param.M, param.N};
-    d_gs_ms_ns_strides = {
-        param.attn_bias_strides[0],
-        param.attn_bias_strides[1],
-        param.attn_bias_strides[2],
-        param.attn_bias_strides[3]};
-  } else {
-    d_gs_ms_ns_lengths = {1, 1, 1, 1};
-    d_gs_ms_ns_strides = {0, 0, 0, 0};
+  static void Run(BatchedBackwardParams& param, hipStream_t stream) {
+    // Tunables
+    constexpr ck::index_t ABBlockTransferSrcScalarPerVector = 1;
+    constexpr ck::index_t B1CShuffleBlockTransferScalarPerVector = 1;
+    constexpr ck::index_t Acc0BiasTransferSrcScalarPerVector = 1;
+
+    using DeviceOpInstance = ck::tensor_operation::device::
+        DeviceBatchedMultiheadAttentionBackward_Qloop_Xdl_CShuffle_V2<
+            NumDimG,
+            NumDimM,
+            NumDimN,
+            NumDimK,
+            NumDimO,
+            InputDataType,
+            OutputDataType,
+            GemmDataType,
+            ZDataType,
+            LSEDataType,
+            Acc0BiasDataType,
+            Acc1BiasDataType,
+            AccDataType,
+            ShuffleDataType,
+            QKVElementOp,
+            QKVElementOp,
+            Scale,
+            QKVElementOp,
+            YElementOp,
+            GemmSpec,
+            TensorSpecQ,
+            TensorSpecK,
+            TensorSpecV,
+            TensorSpecY,
+            1,
+            256,
+            64, // MPerBlock
+            128, // NPerBlock
+            128, // KPerBlock
+            128, // Gemm1NPerBlock
+            32, // Gemm1KPerBlock
+            64, // Gemm2KPerBlock
+            8, // AK1
+            8, // BK1
+            2, // A1K1
+            32, // MPerXDL
+            32, // NPerXDL
+            2, // MXdlPerWave
+            1, // NXdlPerWave
+            4, // Gemm1NXdlPerWave
+            1, // Gemm2NXdlPerWave
+            S<4, 64, 1>, // ABlockTransfer
+            S<1, 0, 2>,
+            S<1, 0, 2>,
+            2,
+            ABBlockTransferSrcScalarPerVector, // TUNABLE
+            8,
+            true,
+            S<4, 64, 1>, // B0BlockTransfer
+            S<1, 0, 2>,
+            S<1, 0, 2>,
+            2,
+            ABBlockTransferSrcScalarPerVector, // TUNABLE
+            8,
+            true,
+            Acc0BiasTransferSrcScalarPerVector, // TUNABLE
+            S<8, 32, 1>, // B1BlockTransfer
+            S<0, 2, 1>,
+            S<0, 2, 1>,
+            1,
+            B1CShuffleBlockTransferScalarPerVector, // TUNABLE
+            2,
+            false,
+            1, // CShuffleMXdlPerWavePerShuffle
+            4, // CShuffleNXdlPerWavePerShuffle
+            S<1, 32, 1, 8>,
+            B1CShuffleBlockTransferScalarPerVector, // TUNABLE
+            MaskingSpec,
+            Deterministic>;
+
+    std::vector<ck::index_t> q_gs_ms_ks_lengths{
+        param.B, param.num_heads, param.M, param.K};
+    std::vector<ck::index_t> q_gs_ms_ks_strides{
+        param.q_strides[0],
+        param.q_strides[2],
+        param.q_strides[1],
+        param.q_strides[3]};
+
+    std::vector<ck::index_t> k_gs_ns_ks_lengths{
+        param.B, param.num_heads, param.N, param.K};
+    std::vector<ck::index_t> k_gs_ns_ks_strides{
+        param.k_strides[0],
+        param.k_strides[2],
+        param.k_strides[1],
+        param.k_strides[3]};
+
+    std::vector<ck::index_t> v_gs_os_ns_lengths{
+        param.B, param.num_heads, param.Kv, param.N};
+    std::vector<ck::index_t> v_gs_os_ns_strides{
+        param.v_strides[0],
+        param.v_strides[2],
+        param.v_strides[3],
+        param.v_strides[1]};
+
+    std::vector<ck::index_t> y_gs_ms_os_lengths{
+        param.B, param.num_heads, param.M, param.Kv};
+    std::vector<ck::index_t> y_gs_ms_os_strides{
+        param.out_strides[0],
+        param.out_strides[2],
+        param.out_strides[1],
+        param.out_strides[3]};
+
+    std::vector<ck::index_t> lse_gs_ms_lengths{
+        param.B, param.num_heads, param.M};
+
+    std::vector<ck::index_t> d_gs_ms_ns_lengths;
+    std::vector<ck::index_t> d_gs_ms_ns_strides;
+
+    if constexpr (has_attn_bias) {
+      d_gs_ms_ns_lengths = {param.B, param.num_heads, param.M, param.N};
+      d_gs_ms_ns_strides = {
+          param.attn_bias_strides[0],
+          param.attn_bias_strides[1],
+          param.attn_bias_strides[2],
+          param.attn_bias_strides[3]};
+    } else {
+      d_gs_ms_ns_lengths = {1, 1, 1, 1};
+      d_gs_ms_ns_strides = {0, 0, 0, 0};
+    };
+
+    float alpha = param.scale;
+
+    auto op = DeviceOpInstance{};
+    auto invoker = op.MakeInvoker();
+
+    auto arg_ptr = op.MakeArgumentPointer(
+        param.q_ptr,
+        param.k_ptr,
+        nullptr, // p_z_grid
+        param.v_ptr,
+        param.out_ptr,
+        param.logsumexp_ptr,
+        param.grad_out_ptr,
+        param.grad_q_ptr,
+        param.grad_k_ptr,
+        param.grad_v_ptr,
+        param.has_attn_bias ? param.attn_bias_ptr : nullptr,
+        nullptr, // p_acc1_bias
+        param.bias_has_grad ? param.grad_bias_ptr : nullptr,
+        nullptr,
+        q_gs_ms_ks_lengths, // q, dQ should have same shape
+        q_gs_ms_ks_strides,
+        k_gs_ns_ks_lengths, // k, dK should have same shape
+        k_gs_ns_ks_strides,
+        {1, 1, 1, 1}, // z_gs_ms_ns_lengths
+        {0, 0, 0, 0}, // z_gs_ms_ns_strides
+        v_gs_os_ns_lengths, // v, dV should have same shape
+        v_gs_os_ns_strides,
+        y_gs_ms_os_lengths, // y, dY should have same shape
+        y_gs_ms_os_strides,
+        lse_gs_ms_lengths,
+        d_gs_ms_ns_lengths, // bias, grad_bias should have same shape
+        d_gs_ms_ns_strides,
+        {}, // acc1_biases_gs_ms_os_lengths
+        {}, // acc1_biases_gs_ms_os_strides
+        QKVElementOp{},
+        QKVElementOp{},
+        Scale{alpha},
+        QKVElementOp{},
+        YElementOp{},
+        param.dropout_prob,
+        std::tuple<int64_t, int64_t>(param.philox_seed, param.philox_offset));
+
+    SimpleDeviceMem workspace(op.GetWorkSpaceSize(arg_ptr.get()));
+
+    op.SetWorkSpacePointer(arg_ptr.get(), workspace.GetDeviceBuffer());
+
+    if (!op.IsSupportedArgument(arg_ptr.get())) {
+      std::ostringstream ostr;
+      ostr << op.GetTypeString() << " does not support this problem";
+
+      throw std::runtime_error(ostr.str());
+    }
+
+    (void)invoker.Run(arg_ptr.get(), StreamConfig{stream, false});
   };
-
-  float alpha = param.scale;
-
-  auto op = DeviceOpInstance{};
-  auto invoker = op.MakeInvoker();
-
-  auto arg_ptr = op.MakeArgumentPointer(
-      param.q_ptr,
-      param.k_ptr,
-      nullptr, // p_z_grid
-      param.v_ptr,
-      param.out_ptr,
-      param.logsumexp_ptr,
-      param.grad_out_ptr,
-      param.grad_q_ptr,
-      param.grad_k_ptr,
-      param.grad_v_ptr,
-      param.has_attn_bias ? param.attn_bias_ptr : nullptr,
-      nullptr, // p_acc1_bias
-      param.bias_has_grad ? param.grad_bias_ptr : nullptr,
-      nullptr,
-      q_gs_ms_ks_lengths, // q, dQ should have same shape
-      q_gs_ms_ks_strides,
-      k_gs_ns_ks_lengths, // k, dK should have same shape
-      k_gs_ns_ks_strides,
-      {1, 1, 1, 1}, // z_gs_ms_ns_lengths
-      {0, 0, 0, 0}, // z_gs_ms_ns_strides
-      v_gs_os_ns_lengths, // v, dV should have same shape
-      v_gs_os_ns_strides,
-      y_gs_ms_os_lengths, // y, dY should have same shape
-      y_gs_ms_os_strides,
-      lse_gs_ms_lengths,
-      d_gs_ms_ns_lengths, // bias, grad_bias should have same shape
-      d_gs_ms_ns_strides,
-      {}, // acc1_biases_gs_ms_os_lengths
-      {}, // acc1_biases_gs_ms_os_strides
-      QKVElementOp{},
-      QKVElementOp{},
-      Scale{alpha},
-      QKVElementOp{},
-      YElementOp{},
-      param.dropout_prob,
-      std::tuple<int64_t, int64_t>(param.philox_seed, param.philox_offset));
-
-  SimpleDeviceMem workspace(op.GetWorkSpaceSize(arg_ptr.get()));
-
-  op.SetWorkSpacePointer(arg_ptr.get(), workspace.GetDeviceBuffer());
-
-  if (!op.IsSupportedArgument(arg_ptr.get())) {
-    std::ostringstream ostr;
-    ostr << op.GetTypeString() << " does not support this problem";
-
-    throw std::runtime_error(ostr.str());
-  }
-
-  (void)invoker.Run(arg_ptr.get(), StreamConfig{stream, false});
 };
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_backward_bp16.cpp b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_backward_bp16.cpp
index 9d55a2d6e..8f23dc9b3 100644
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_backward_bp16.cpp
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_backward_bp16.cpp
@@ -6,24 +6,24 @@
 void batched_backward_bp16(BatchedBackwardParams& param, hipStream_t stream) {
   if (param.custom_mask_type == 0) {
     if (param.has_attn_bias)
-      batched_backward_masktype_attnbias_dispatched<ck::bhalf_t, 0, true>(
+      batched_backward_masktype_attnbias_dispatched<ck::bhalf_t, 0, true>::Run(
           param, stream);
     else
-      batched_backward_masktype_attnbias_dispatched<ck::bhalf_t, 0, false>(
+      batched_backward_masktype_attnbias_dispatched<ck::bhalf_t, 0, false>::Run(
           param, stream);
   } else if (param.custom_mask_type == 1) {
     if (param.has_attn_bias)
-      batched_backward_masktype_attnbias_dispatched<ck::bhalf_t, 1, true>(
+      batched_backward_masktype_attnbias_dispatched<ck::bhalf_t, 1, true>::Run(
           param, stream);
     else
-      batched_backward_masktype_attnbias_dispatched<ck::bhalf_t, 1, false>(
+      batched_backward_masktype_attnbias_dispatched<ck::bhalf_t, 1, false>::Run(
           param, stream);
   } else if (param.custom_mask_type == 2) {
     if (param.has_attn_bias)
-      batched_backward_masktype_attnbias_dispatched<ck::bhalf_t, 2, true>(
+      batched_backward_masktype_attnbias_dispatched<ck::bhalf_t, 2, true>::Run(
           param, stream);
     else
-      batched_backward_masktype_attnbias_dispatched<ck::bhalf_t, 2, false>(
+      batched_backward_masktype_attnbias_dispatched<ck::bhalf_t, 2, false>::Run(
           param, stream);
   } else
     throw std::runtime_error("Invalid custom_mask_type value");
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_backward_fp16.cpp b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_backward_fp16.cpp
index 77dd96de4..dd77a559a 100644
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_backward_fp16.cpp
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_backward_fp16.cpp
@@ -6,24 +6,24 @@
 void batched_backward_fp16(BatchedBackwardParams& param, hipStream_t stream) {
   if (param.custom_mask_type == 0) {
     if (param.has_attn_bias)
-      batched_backward_masktype_attnbias_dispatched<ck::half_t, 0, true>(
+      batched_backward_masktype_attnbias_dispatched<ck::half_t, 0, true>::Run(
           param, stream);
     else
-      batched_backward_masktype_attnbias_dispatched<ck::half_t, 0, false>(
+      batched_backward_masktype_attnbias_dispatched<ck::half_t, 0, false>::Run(
           param, stream);
   } else if (param.custom_mask_type == 1) {
     if (param.has_attn_bias)
-      batched_backward_masktype_attnbias_dispatched<ck::half_t, 1, true>(
+      batched_backward_masktype_attnbias_dispatched<ck::half_t, 1, true>::Run(
           param, stream);
     else
-      batched_backward_masktype_attnbias_dispatched<ck::half_t, 1, false>(
+      batched_backward_masktype_attnbias_dispatched<ck::half_t, 1, false>::Run(
           param, stream);
   } else if (param.custom_mask_type == 2) {
     if (param.has_attn_bias)
-      batched_backward_masktype_attnbias_dispatched<ck::half_t, 2, true>(
+      batched_backward_masktype_attnbias_dispatched<ck::half_t, 2, true>::Run(
           param, stream);
     else
-      batched_backward_masktype_attnbias_dispatched<ck::half_t, 2, false>(
+      batched_backward_masktype_attnbias_dispatched<ck::half_t, 2, false>::Run(
           param, stream);
   } else
     throw std::runtime_error("Invalid custom_mask_type value");
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_forward.h b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_forward.h
index e6015c6bc..b58e1443b 100644
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_forward.h
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_forward.h
@@ -13,9 +13,7 @@
 #include "ck_fmha_params.h"
 
 template <typename scalar_t, int32_t custom_mask_type, bool has_attn_bias>
-void batched_forward_masktype_attnbias_dispatched(
-    BatchedForwardParams& param,
-    hipStream_t stream) {
+struct batched_forward_masktype_attnbias_dispatched {
   using PassThrough = ck::tensor_operation::element_wise::PassThrough;
 
   using GemmDataType = scalar_t;
@@ -59,194 +57,197 @@ void batched_forward_masktype_attnbias_dispatched(
       ck::tensor_operation::device::TensorSpecialization::Default;
   static constexpr bool Deterministic = false;
 
-  // Tunables
-  static constexpr ck::index_t ABBlockTransferSrcScalarPerVector = 1;
-  static constexpr ck::index_t B1CShuffleBlockTransferScalarPerVector = 1;
-  static constexpr ck::index_t Acc0BiasTransferSrcScalarPerVector = 1;
-
-  using DeviceOpInstance = ck::tensor_operation::device::
-      DeviceBatchedMultiheadAttentionForward_Xdl_CShuffle_V2<
-          NumDimG,
-          NumDimM,
-          NumDimN,
-          NumDimK,
-          NumDimO,
-          ADataType,
-          B0DataType,
-          B1DataType,
-          CDataType,
-          GemmDataType,
-          ZDataType,
-          LSEDataType,
-          Acc0BiasDataType,
-          Acc1BiasDataType,
-          AccDataType,
-          CShuffleDataType,
-          AElementOp,
-          B0ElementOp,
-          Acc0ElementOp,
-          B1ElementOp,
-          CElementOp,
-          GemmSpec,
-          TensorSpecA,
-          TensorSpecB0,
-          TensorSpecB1,
-          TensorSpecC,
-          1,
-          256,
-          128, // MPerBlock
-          128, // NPerBlock
-          32, // KPerBlock
-          64, // Gemm1NPerBlock
-          32, // Gemm1KPerBlock
-          8, // AK1
-          8, // BK1
-          2, // B1K1
-          32, // MPerXDL
-          32, // NPerXDL
-          1, // MXdlPerWave
-          4, // NXdlPerWave
-          2, // Gemm1NXdlPerWave
-          1, // DropoutStep
-          S<4, 64, 1>, // ABlockTransfer
-          S<1, 0, 2>,
-          S<1, 0, 2>,
-          2,
-          ABBlockTransferSrcScalarPerVector, // TUNABLE
-          8,
-          true,
-          S<4, 64, 1>, // BBlockTransfer
-          S<1, 0, 2>,
-          S<1, 0, 2>,
-          2,
-          ABBlockTransferSrcScalarPerVector, // TUNABLE
-          8,
-          true,
-          Acc0BiasTransferSrcScalarPerVector, // TUNABLE
-          S<16, 16, 1>, // B1BlockTransfer
-          S<0, 2, 1>,
-          S<0, 2, 1>,
-          1,
-          B1CShuffleBlockTransferScalarPerVector, // TUNABLE
-          2,
-          false,
-          1, // CShuffleMXdlPerWavePerShuffle
-          2, // CShuffleNXdlPerWavePerShuffle
-          S<1,
-            32,
+  static void Run(BatchedForwardParams& param, hipStream_t stream) {
+    // Tunables
+    constexpr ck::index_t ABBlockTransferSrcScalarPerVector = 1;
+    constexpr ck::index_t B1CShuffleBlockTransferScalarPerVector = 1;
+    constexpr ck::index_t Acc0BiasTransferSrcScalarPerVector = 1;
+
+    using DeviceOpInstance = ck::tensor_operation::device::
+        DeviceBatchedMultiheadAttentionForward_Xdl_CShuffle_V2<
+            NumDimG,
+            NumDimM,
+            NumDimN,
+            NumDimK,
+            NumDimO,
+            ADataType,
+            B0DataType,
+            B1DataType,
+            CDataType,
+            GemmDataType,
+            ZDataType,
+            LSEDataType,
+            Acc0BiasDataType,
+            Acc1BiasDataType,
+            AccDataType,
+            CShuffleDataType,
+            AElementOp,
+            B0ElementOp,
+            Acc0ElementOp,
+            B1ElementOp,
+            CElementOp,
+            GemmSpec,
+            TensorSpecA,
+            TensorSpecB0,
+            TensorSpecB1,
+            TensorSpecC,
             1,
-            8>, // CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock
-          B1CShuffleBlockTransferScalarPerVector, // TUNABLE
-          4,
-          MaskingSpec, // MaskingSpecialization
-          Deterministic>;
-
-  std::vector<ck::index_t> a_gs_ms_ks_lengths{
-      param.B, param.num_heads, param.M, param.K};
-  std::vector<ck::index_t> a_gs_ms_ks_strides{
-      param.q_strides[0],
-      param.q_strides[2],
-      param.q_strides[1],
-      param.q_strides[3]};
-
-  std::vector<ck::index_t> b0_gs_ns_ks_lengths{
-      param.B, param.num_heads, param.N, param.K};
-  std::vector<ck::index_t> b0_gs_ns_ks_strides{
-      param.k_strides[0],
-      param.k_strides[2],
-      param.k_strides[1],
-      param.k_strides[3]};
-
-  // to be changed to b1_gs_ns_os_lengths
-  std::vector<ck::index_t> b1_gs_os_ns_lengths{
-      param.B, param.num_heads, param.Kv, param.N};
-  std::vector<ck::index_t> b1_gs_os_ns_strides{
-      param.v_strides[0],
-      param.v_strides[2],
-      param.v_strides[3],
-      param.v_strides[1]};
-
-  std::vector<ck::index_t> c_gs_ms_os_lengths{
-      param.B, param.num_heads, param.M, param.Kv};
-  std::vector<ck::index_t> c_gs_ms_os_strides{
-      param.out_strides[0],
-      param.out_strides[2],
-      param.out_strides[1],
-      param.out_strides[3]};
-
-  std::vector<ck::index_t> lse_gs_ms_lengths{param.B, param.num_heads, param.M};
-
-  std::vector<ck::index_t> d_gs_ms_ns_lengths;
-  std::vector<ck::index_t> d_gs_ms_ns_strides;
-
-  if constexpr (has_attn_bias) {
-    d_gs_ms_ns_lengths = {param.B, param.num_heads, param.M, param.N};
-    d_gs_ms_ns_strides = {
-        param.attn_bias_strides[0],
-        param.attn_bias_strides[1],
-        param.attn_bias_strides[2],
-        param.attn_bias_strides[3]};
-  } else {
-    d_gs_ms_ns_lengths = {1, 1, 1, 1};
-    d_gs_ms_ns_strides = {0, 0, 0, 0};
+            256,
+            128, // MPerBlock
+            128, // NPerBlock
+            32, // KPerBlock
+            64, // Gemm1NPerBlock
+            32, // Gemm1KPerBlock
+            8, // AK1
+            8, // BK1
+            2, // B1K1
+            32, // MPerXDL
+            32, // NPerXDL
+            1, // MXdlPerWave
+            4, // NXdlPerWave
+            2, // Gemm1NXdlPerWave
+            1, // DropoutStep
+            S<4, 64, 1>, // ABlockTransfer
+            S<1, 0, 2>,
+            S<1, 0, 2>,
+            2,
+            ABBlockTransferSrcScalarPerVector, // TUNABLE
+            8,
+            true,
+            S<4, 64, 1>, // BBlockTransfer
+            S<1, 0, 2>,
+            S<1, 0, 2>,
+            2,
+            ABBlockTransferSrcScalarPerVector, // TUNABLE
+            8,
+            true,
+            Acc0BiasTransferSrcScalarPerVector, // TUNABLE
+            S<16, 16, 1>, // B1BlockTransfer
+            S<0, 2, 1>,
+            S<0, 2, 1>,
+            1,
+            B1CShuffleBlockTransferScalarPerVector, // TUNABLE
+            2,
+            false,
+            1, // CShuffleMXdlPerWavePerShuffle
+            2, // CShuffleNXdlPerWavePerShuffle
+            S<1,
+              32,
+              1,
+              8>, // CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock
+            B1CShuffleBlockTransferScalarPerVector, // TUNABLE
+            4,
+            MaskingSpec, // MaskingSpecialization
+            Deterministic>;
+
+    std::vector<ck::index_t> a_gs_ms_ks_lengths{
+        param.B, param.num_heads, param.M, param.K};
+    std::vector<ck::index_t> a_gs_ms_ks_strides{
+        param.q_strides[0],
+        param.q_strides[2],
+        param.q_strides[1],
+        param.q_strides[3]};
+
+    std::vector<ck::index_t> b0_gs_ns_ks_lengths{
+        param.B, param.num_heads, param.N, param.K};
+    std::vector<ck::index_t> b0_gs_ns_ks_strides{
+        param.k_strides[0],
+        param.k_strides[2],
+        param.k_strides[1],
+        param.k_strides[3]};
+
+    // to be changed to b1_gs_ns_os_lengths
+    std::vector<ck::index_t> b1_gs_os_ns_lengths{
+        param.B, param.num_heads, param.Kv, param.N};
+    std::vector<ck::index_t> b1_gs_os_ns_strides{
+        param.v_strides[0],
+        param.v_strides[2],
+        param.v_strides[3],
+        param.v_strides[1]};
+
+    std::vector<ck::index_t> c_gs_ms_os_lengths{
+        param.B, param.num_heads, param.M, param.Kv};
+    std::vector<ck::index_t> c_gs_ms_os_strides{
+        param.out_strides[0],
+        param.out_strides[2],
+        param.out_strides[1],
+        param.out_strides[3]};
+
+    std::vector<ck::index_t> lse_gs_ms_lengths{
+        param.B, param.num_heads, param.M};
+
+    std::vector<ck::index_t> d_gs_ms_ns_lengths;
+    std::vector<ck::index_t> d_gs_ms_ns_strides;
+
+    if constexpr (has_attn_bias) {
+      d_gs_ms_ns_lengths = {param.B, param.num_heads, param.M, param.N};
+      d_gs_ms_ns_strides = {
+          param.attn_bias_strides[0],
+          param.attn_bias_strides[1],
+          param.attn_bias_strides[2],
+          param.attn_bias_strides[3]};
+    } else {
+      d_gs_ms_ns_lengths = {1, 1, 1, 1};
+      d_gs_ms_ns_strides = {0, 0, 0, 0};
+    };
+
+    float alpha = param.scale;
+
+    auto a_element_op = AElementOp{};
+    auto b0_element_op = B0ElementOp{};
+    auto acc0_element_op = Acc0ElementOp{alpha};
+    auto b1_element_op = B1ElementOp{};
+    auto c_element_op = CElementOp{};
+
+    auto op = DeviceOpInstance{};
+    auto invoker = op.MakeInvoker();
+
+    auto arg_ptr = op.MakeArgumentPointer(
+        param.q_ptr,
+        param.k_ptr,
+        param.v_ptr,
+        param.out_ptr,
+        nullptr,
+        param.logsumexp_ptr,
+        param.has_attn_bias ? param.attn_bias_ptr : nullptr,
+        {}, // p_acc1_biases;
+        a_gs_ms_ks_lengths,
+        a_gs_ms_ks_strides,
+        b0_gs_ns_ks_lengths,
+        b0_gs_ns_ks_strides,
+        b1_gs_os_ns_lengths,
+        b1_gs_os_ns_strides,
+        c_gs_ms_os_lengths,
+        c_gs_ms_os_strides,
+        {1, 1, 1, 1},
+        {0, 0, 0, 0},
+        lse_gs_ms_lengths,
+        d_gs_ms_ns_lengths,
+        d_gs_ms_ns_strides,
+        {}, // acc1_biases_gs_ms_os_lengths
+        {}, // acc1_biases_gs_ms_os_strides,
+        a_element_op,
+        b0_element_op,
+        acc0_element_op,
+        b1_element_op,
+        c_element_op,
+        param.use_dropout ? param.dropout_prob : 0.0f, // dropout ratio
+        std::tuple<int64_t, int64_t>(
+            param.philox_seed,
+            param.philox_offset)); // dropout random seed and offset
+
+    SimpleDeviceMem workspace(op.GetWorkSpaceSize(arg_ptr.get()));
+
+    op.SetWorkSpacePointer(arg_ptr.get(), workspace.GetDeviceBuffer());
+
+    if (!op.IsSupportedArgument(arg_ptr.get())) {
+      std::ostringstream ostr;
+
+      ostr << op.GetTypeString() << " does not support this problem";
+
+      throw std::runtime_error(ostr.str());
+    }
+
+    invoker.Run(arg_ptr.get(), StreamConfig{stream, false});
   };
-
-  float alpha = param.scale;
-
-  auto a_element_op = AElementOp{};
-  auto b0_element_op = B0ElementOp{};
-  auto acc0_element_op = Acc0ElementOp{alpha};
-  auto b1_element_op = B1ElementOp{};
-  auto c_element_op = CElementOp{};
-
-  auto op = DeviceOpInstance{};
-  auto invoker = op.MakeInvoker();
-
-  auto arg_ptr = op.MakeArgumentPointer(
-      param.q_ptr,
-      param.k_ptr,
-      param.v_ptr,
-      param.out_ptr,
-      nullptr,
-      param.logsumexp_ptr,
-      param.has_attn_bias ? param.attn_bias_ptr : nullptr,
-      {}, // p_acc1_biases;
-      a_gs_ms_ks_lengths,
-      a_gs_ms_ks_strides,
-      b0_gs_ns_ks_lengths,
-      b0_gs_ns_ks_strides,
-      b1_gs_os_ns_lengths,
-      b1_gs_os_ns_strides,
-      c_gs_ms_os_lengths,
-      c_gs_ms_os_strides,
-      {1, 1, 1, 1},
-      {0, 0, 0, 0},
-      lse_gs_ms_lengths,
-      d_gs_ms_ns_lengths,
-      d_gs_ms_ns_strides,
-      {}, // acc1_biases_gs_ms_os_lengths
-      {}, // acc1_biases_gs_ms_os_strides,
-      a_element_op,
-      b0_element_op,
-      acc0_element_op,
-      b1_element_op,
-      c_element_op,
-      param.use_dropout ? param.dropout_prob : 0.0f, // dropout ratio
-      std::tuple<int64_t, int64_t>(
-          param.philox_seed,
-          param.philox_offset)); // dropout random seed and offset
-
-  SimpleDeviceMem workspace(op.GetWorkSpaceSize(arg_ptr.get()));
-
-  op.SetWorkSpacePointer(arg_ptr.get(), workspace.GetDeviceBuffer());
-
-  if (!op.IsSupportedArgument(arg_ptr.get())) {
-    std::ostringstream ostr;
-
-    ostr << op.GetTypeString() << " does not support this problem";
-
-    throw std::runtime_error(ostr.str());
-  }
-
-  invoker.Run(arg_ptr.get(), StreamConfig{stream, false});
 };
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_forward_bp16.cpp b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_forward_bp16.cpp
index 10bf8ee59..7be431c38 100644
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_forward_bp16.cpp
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_forward_bp16.cpp
@@ -6,24 +6,24 @@
 void batched_forward_bp16(BatchedForwardParams& param, hipStream_t stream) {
   if (param.custom_mask_type == 0) {
     if (param.has_attn_bias)
-      batched_forward_masktype_attnbias_dispatched<ck::bhalf_t, 0, true>(
+      batched_forward_masktype_attnbias_dispatched<ck::bhalf_t, 0, true>::Run(
           param, stream);
     else
-      batched_forward_masktype_attnbias_dispatched<ck::bhalf_t, 0, false>(
+      batched_forward_masktype_attnbias_dispatched<ck::bhalf_t, 0, false>::Run(
           param, stream);
   } else if (param.custom_mask_type == 1) {
     if (param.has_attn_bias)
-      batched_forward_masktype_attnbias_dispatched<ck::bhalf_t, 1, true>(
+      batched_forward_masktype_attnbias_dispatched<ck::bhalf_t, 1, true>::Run(
           param, stream);
     else
-      batched_forward_masktype_attnbias_dispatched<ck::bhalf_t, 1, false>(
+      batched_forward_masktype_attnbias_dispatched<ck::bhalf_t, 1, false>::Run(
           param, stream);
   } else if (param.custom_mask_type == 2) {
     if (param.has_attn_bias)
-      batched_forward_masktype_attnbias_dispatched<ck::bhalf_t, 2, true>(
+      batched_forward_masktype_attnbias_dispatched<ck::bhalf_t, 2, true>::Run(
           param, stream);
     else
-      batched_forward_masktype_attnbias_dispatched<ck::bhalf_t, 2, false>(
+      batched_forward_masktype_attnbias_dispatched<ck::bhalf_t, 2, false>::Run(
           param, stream);
   } else
     throw std::runtime_error("Invalid custom_mask_type value");
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_forward_fp16.cpp b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_forward_fp16.cpp
index ea11d170a..543a2c253 100644
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_forward_fp16.cpp
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_forward_fp16.cpp
@@ -6,24 +6,24 @@
 void batched_forward_fp16(BatchedForwardParams& param, hipStream_t stream) {
   if (param.custom_mask_type == 0) {
     if (param.has_attn_bias)
-      batched_forward_masktype_attnbias_dispatched<ck::half_t, 0, true>(
+      batched_forward_masktype_attnbias_dispatched<ck::half_t, 0, true>::Run(
           param, stream);
     else
-      batched_forward_masktype_attnbias_dispatched<ck::half_t, 0, false>(
+      batched_forward_masktype_attnbias_dispatched<ck::half_t, 0, false>::Run(
           param, stream);
   } else if (param.custom_mask_type == 1) {
     if (param.has_attn_bias)
-      batched_forward_masktype_attnbias_dispatched<ck::half_t, 1, true>(
+      batched_forward_masktype_attnbias_dispatched<ck::half_t, 1, true>::Run(
           param, stream);
     else
-      batched_forward_masktype_attnbias_dispatched<ck::half_t, 1, false>(
+      batched_forward_masktype_attnbias_dispatched<ck::half_t, 1, false>::Run(
           param, stream);
   } else if (param.custom_mask_type == 2) {
     if (param.has_attn_bias)
-      batched_forward_masktype_attnbias_dispatched<ck::half_t, 2, true>(
+      batched_forward_masktype_attnbias_dispatched<ck::half_t, 2, true>::Run(
           param, stream);
     else
-      batched_forward_masktype_attnbias_dispatched<ck::half_t, 2, false>(
+      batched_forward_masktype_attnbias_dispatched<ck::half_t, 2, false>::Run(
           param, stream);
   } else
     throw std::runtime_error("Invalid custom_mask_type value");
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_backward.h b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_backward.h
index e442ae8c1..74e0a8a49 100644
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_backward.h
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_backward.h
@@ -14,9 +14,7 @@
 #include "ck_fmha_params.h"
 
 template <typename scalar_t, int32_t custom_mask_type, bool has_attn_bias>
-void grouped_backward_masktype_attnbias_dispatched(
-    GroupedBackwardParams& param,
-    hipStream_t stream) {
+struct grouped_backward_masktype_attnbias_dispatched {
   using PassThrough = ck::tensor_operation::element_wise::PassThrough;
   using Scale = ck::tensor_operation::element_wise::Scale;
 
@@ -59,189 +57,192 @@ void grouped_backward_masktype_attnbias_dispatched(
       ck::tensor_operation::device::TensorSpecialization::Default;
   static constexpr bool Deterministic = false;
 
-  // Tunables
-  static constexpr ck::index_t ABBlockTransferSrcScalarPerVector = 1; // 8
-  static constexpr ck::index_t B1CShuffleBlockTransferScalarPerVector = 1; // 4
-  static constexpr ck::index_t Acc0BiasTransferSrcScalarPerVector = 1; // 4
-
-  using DeviceOpInstance = ck::tensor_operation::device::
-      DeviceGroupedMultiheadAttentionBackward_Qloop_Xdl_CShuffle_V2<
-          NumDimG,
-          NumDimM,
-          NumDimN,
-          NumDimK,
-          NumDimO,
-          InputDataType,
-          OutputDataType,
-          GemmDataType,
-          ZDataType,
-          LSEDataType,
-          Acc0BiasDataType,
-          Acc1BiasDataType,
-          AccDataType,
-          ShuffleDataType,
-          QKVElementOp,
-          QKVElementOp,
-          Scale,
-          QKVElementOp,
-          YElementOp,
-          GemmSpec,
-          TensorSpecQ,
-          TensorSpecK,
-          TensorSpecV,
-          TensorSpecY,
-          1,
-          256,
-          64, // MPerBlock
-          128, // NPerBlock
-          128, // KPerBlock
-          128, // Gemm1NPerBlock
-          32, // Gemm1KPerBlock
-          64, // Gemm2KPerBlock
-          8, // AK1
-          8, // BK1
-          2, // B1K1
-          32, // MPerXDL
-          32, // NPerXDL
-          2, // MXdlPerWave
-          1, // NXdlPerWave
-          4, // Gemm1NXdlPerWave
-          1, // Gemm2NXdlPerWave
-          S<4, 64, 1>, // ABlockTransfer
-          S<1, 0, 2>,
-          S<1, 0, 2>,
-          2,
-          ABBlockTransferSrcScalarPerVector, // TUNABLE
-          8,
-          true,
-          S<4, 64, 1>, // B0BlockTransfer
-          S<1, 0, 2>,
-          S<1, 0, 2>,
-          2,
-          ABBlockTransferSrcScalarPerVector, // TUNABLE
-          8,
-          true,
-          Acc0BiasTransferSrcScalarPerVector, // TUNABLE
-          S<8, 32, 1>, // B1BlockTransfer
-          S<0, 2, 1>,
-          S<0, 2, 1>,
-          1,
-          B1CShuffleBlockTransferScalarPerVector, // TUNABLE
-          2,
-          false,
-          1, // CShuffleMXdlPerWavePerShuffle
-          4, // CShuffleNXdlPerWavePerShuffle
-          S<1, 32, 1, 8>,
-          B1CShuffleBlockTransferScalarPerVector, // TUNABLE
-          MaskingSpec,
-          Deterministic>;
-
-  std::vector<typename DeviceOpInstance::ProblemDesc> problem_descs;
-
-  for (std::size_t i = 0; i < param.num_batches; i++) {
-    int M = param.host_seqstart_q[i + 1] - param.host_seqstart_q[i]; // seqlen Q
-    int N = param.host_seqlen_k.empty()
-        ? param.host_seqstart_k[i + 1] - param.host_seqstart_k[i]
-        : param.host_seqlen_k[i];
-    int K = param.K;
-    int Kv = param.Kv;
-    int G1 = param.num_heads;
-
-    std::vector<ck::index_t> q_gs_ms_ks_lengths{1, G1, M, K};
-    std::vector<ck::index_t> q_gs_ms_ks_strides{
-        0, param.q_strides[1], param.q_strides[0], param.q_strides[2]};
-
-    std::vector<ck::index_t> k_gs_ns_ks_lengths{1, G1, N, K};
-    std::vector<ck::index_t> k_gs_ns_ks_strides{
-        0, param.k_strides[1], param.k_strides[0], param.k_strides[2]};
-
-    // to be changed to v_gs_ns_os_lengths
-    std::vector<ck::index_t> v_gs_os_ns_lengths{1, G1, Kv, N};
-    std::vector<ck::index_t> v_gs_os_ns_strides{
-        0, param.v_strides[1], param.v_strides[2], param.v_strides[0]};
-
-    std::vector<ck::index_t> y_gs_ms_os_lengths{1, G1, M, Kv};
-    std::vector<ck::index_t> y_gs_ms_os_strides{
-        0, param.out_strides[1], param.out_strides[0], param.out_strides[2]};
-
-    std::vector<ck::index_t> lse_gs_ms_lengths{1, G1, M};
-    std::vector<ck::index_t> lse_gs_ms_strides{0, param.max_seqlen_q, 1};
-
-    std::vector<ck::index_t> d_gs_ms_ns_lengths;
-    std::vector<ck::index_t> d_gs_ms_ns_strides;
-
-    if constexpr (has_attn_bias) {
-      d_gs_ms_ns_lengths = {1, G1, M, N};
-      d_gs_ms_ns_strides = {
-          0,
-          param.attn_bias_strides[0],
-          param.attn_bias_strides[1],
-          param.attn_bias_strides[2]};
-
-    } else {
-      d_gs_ms_ns_lengths = {1, 1, 1, 1};
-      d_gs_ms_ns_strides = {0, 0, 0, 0};
-    };
-
-    problem_descs.push_back({
-        q_gs_ms_ks_lengths, // q, dQ should have same shape
-        q_gs_ms_ks_strides,
-        k_gs_ns_ks_lengths, // k, dK should have same shape
-        k_gs_ns_ks_strides,
-        {1, 1, 1, 1},
-        {0, 0, 0, 0},
-        v_gs_os_ns_lengths, // v, dV should have same shape
-        v_gs_os_ns_strides,
-        y_gs_ms_os_lengths, // y, dY should have same shape
-        y_gs_ms_os_strides,
-        lse_gs_ms_lengths,
-        lse_gs_ms_strides,
-        d_gs_ms_ns_lengths, // bias, grad_bias should have same shape
-        d_gs_ms_ns_strides,
-        {}, // acc1_biases_gs_ms_os_lengths
-        {}, // acc1_biases_gs_ms_os_strides
-    });
-  }
-
-  float alpha = param.scale;
-
-  auto op = DeviceOpInstance{};
-  auto invoker = op.MakeInvoker();
-
-  auto arg_ptr = op.MakeArgumentPointer(
-      param.q_ptrs,
-      param.k_ptrs,
-      param.randvals_ptrs,
-      param.v_ptrs,
-      param.out_ptrs,
-      param.logsumexp_ptrs,
-      param.grad_out_ptrs,
-      param.grad_q_ptrs,
-      param.grad_k_ptrs,
-      param.grad_v_ptrs,
-      param.attn_bias_ptrs,
-      {}, // p_acc1_bias_vec;
-      param.grad_bias_ptrs,
-      {},
-      problem_descs,
-      QKVElementOp{},
-      QKVElementOp{},
-      Scale{alpha},
-      QKVElementOp{},
-      YElementOp{},
-      param.dropout_prob,
-      std::tuple<int64_t, int64_t>(param.philox_seed, param.philox_offset));
-
-  SimpleDeviceMem workspace(op.GetWorkSpaceSize(arg_ptr.get()));
-
-  op.SetWorkSpacePointer(arg_ptr.get(), workspace.GetDeviceBuffer());
-
-  if (!op.IsSupportedArgument(arg_ptr.get())) {
-    std::ostringstream ostr;
-    ostr << op.GetTypeString() << " does not support this problem";
-
-    throw std::runtime_error(ostr.str());
-  }
-
-  (void)invoker.Run(arg_ptr.get(), StreamConfig{stream, false});
+  static void Run(GroupedBackwardParams& param, hipStream_t stream) {
+    // Tunables
+    constexpr ck::index_t ABBlockTransferSrcScalarPerVector = 1; // 8
+    constexpr ck::index_t B1CShuffleBlockTransferScalarPerVector = 1; // 4
+    constexpr ck::index_t Acc0BiasTransferSrcScalarPerVector = 1; // 4
+
+    using DeviceOpInstance = ck::tensor_operation::device::
+        DeviceGroupedMultiheadAttentionBackward_Qloop_Xdl_CShuffle_V2<
+            NumDimG,
+            NumDimM,
+            NumDimN,
+            NumDimK,
+            NumDimO,
+            InputDataType,
+            OutputDataType,
+            GemmDataType,
+            ZDataType,
+            LSEDataType,
+            Acc0BiasDataType,
+            Acc1BiasDataType,
+            AccDataType,
+            ShuffleDataType,
+            QKVElementOp,
+            QKVElementOp,
+            Scale,
+            QKVElementOp,
+            YElementOp,
+            GemmSpec,
+            TensorSpecQ,
+            TensorSpecK,
+            TensorSpecV,
+            TensorSpecY,
+            1,
+            256,
+            64, // MPerBlock
+            128, // NPerBlock
+            128, // KPerBlock
+            128, // Gemm1NPerBlock
+            32, // Gemm1KPerBlock
+            64, // Gemm2KPerBlock
+            8, // AK1
+            8, // BK1
+            2, // B1K1
+            32, // MPerXDL
+            32, // NPerXDL
+            2, // MXdlPerWave
+            1, // NXdlPerWave
+            4, // Gemm1NXdlPerWave
+            1, // Gemm2NXdlPerWave
+            S<4, 64, 1>, // ABlockTransfer
+            S<1, 0, 2>,
+            S<1, 0, 2>,
+            2,
+            ABBlockTransferSrcScalarPerVector, // TUNABLE
+            8,
+            true,
+            S<4, 64, 1>, // B0BlockTransfer
+            S<1, 0, 2>,
+            S<1, 0, 2>,
+            2,
+            ABBlockTransferSrcScalarPerVector, // TUNABLE
+            8,
+            true,
+            Acc0BiasTransferSrcScalarPerVector, // TUNABLE
+            S<8, 32, 1>, // B1BlockTransfer
+            S<0, 2, 1>,
+            S<0, 2, 1>,
+            1,
+            B1CShuffleBlockTransferScalarPerVector, // TUNABLE
+            2,
+            false,
+            1, // CShuffleMXdlPerWavePerShuffle
+            4, // CShuffleNXdlPerWavePerShuffle
+            S<1, 32, 1, 8>,
+            B1CShuffleBlockTransferScalarPerVector, // TUNABLE
+            MaskingSpec,
+            Deterministic>;
+
+    std::vector<typename DeviceOpInstance::ProblemDesc> problem_descs;
+
+    for (std::size_t i = 0; i < param.num_batches; i++) {
+      int M =
+          param.host_seqstart_q[i + 1] - param.host_seqstart_q[i]; // seqlen Q
+      int N = param.host_seqlen_k.empty()
+          ? param.host_seqstart_k[i + 1] - param.host_seqstart_k[i]
+          : param.host_seqlen_k[i];
+      int K = param.K;
+      int Kv = param.Kv;
+      int G1 = param.num_heads;
+
+      std::vector<ck::index_t> q_gs_ms_ks_lengths{1, G1, M, K};
+      std::vector<ck::index_t> q_gs_ms_ks_strides{
+          0, param.q_strides[1], param.q_strides[0], param.q_strides[2]};
+
+      std::vector<ck::index_t> k_gs_ns_ks_lengths{1, G1, N, K};
+      std::vector<ck::index_t> k_gs_ns_ks_strides{
+          0, param.k_strides[1], param.k_strides[0], param.k_strides[2]};
+
+      // to be changed to v_gs_ns_os_lengths
+      std::vector<ck::index_t> v_gs_os_ns_lengths{1, G1, Kv, N};
+      std::vector<ck::index_t> v_gs_os_ns_strides{
+          0, param.v_strides[1], param.v_strides[2], param.v_strides[0]};
+
+      std::vector<ck::index_t> y_gs_ms_os_lengths{1, G1, M, Kv};
+      std::vector<ck::index_t> y_gs_ms_os_strides{
+          0, param.out_strides[1], param.out_strides[0], param.out_strides[2]};
+
+      std::vector<ck::index_t> lse_gs_ms_lengths{1, G1, M};
+      std::vector<ck::index_t> lse_gs_ms_strides{0, param.max_seqlen_q, 1};
+
+      std::vector<ck::index_t> d_gs_ms_ns_lengths;
+      std::vector<ck::index_t> d_gs_ms_ns_strides;
+
+      if constexpr (has_attn_bias) {
+        d_gs_ms_ns_lengths = {1, G1, M, N};
+        d_gs_ms_ns_strides = {
+            0,
+            param.attn_bias_strides[0],
+            param.attn_bias_strides[1],
+            param.attn_bias_strides[2]};
+
+      } else {
+        d_gs_ms_ns_lengths = {1, 1, 1, 1};
+        d_gs_ms_ns_strides = {0, 0, 0, 0};
+      };
+
+      problem_descs.push_back({
+          q_gs_ms_ks_lengths, // q, dQ should have same shape
+          q_gs_ms_ks_strides,
+          k_gs_ns_ks_lengths, // k, dK should have same shape
+          k_gs_ns_ks_strides,
+          {1, 1, 1, 1},
+          {0, 0, 0, 0},
+          v_gs_os_ns_lengths, // v, dV should have same shape
+          v_gs_os_ns_strides,
+          y_gs_ms_os_lengths, // y, dY should have same shape
+          y_gs_ms_os_strides,
+          lse_gs_ms_lengths,
+          lse_gs_ms_strides,
+          d_gs_ms_ns_lengths, // bias, grad_bias should have same shape
+          d_gs_ms_ns_strides,
+          {}, // acc1_biases_gs_ms_os_lengths
+          {}, // acc1_biases_gs_ms_os_strides
+      });
+    }
+
+    float alpha = param.scale;
+
+    auto op = DeviceOpInstance{};
+    auto invoker = op.MakeInvoker();
+
+    auto arg_ptr = op.MakeArgumentPointer(
+        param.q_ptrs,
+        param.k_ptrs,
+        param.randvals_ptrs,
+        param.v_ptrs,
+        param.out_ptrs,
+        param.logsumexp_ptrs,
+        param.grad_out_ptrs,
+        param.grad_q_ptrs,
+        param.grad_k_ptrs,
+        param.grad_v_ptrs,
+        param.attn_bias_ptrs,
+        {}, // p_acc1_bias_vec;
+        param.grad_bias_ptrs,
+        {},
+        problem_descs,
+        QKVElementOp{},
+        QKVElementOp{},
+        Scale{alpha},
+        QKVElementOp{},
+        YElementOp{},
+        param.dropout_prob,
+        std::tuple<int64_t, int64_t>(param.philox_seed, param.philox_offset));
+
+    SimpleDeviceMem workspace(op.GetWorkSpaceSize(arg_ptr.get()));
+
+    op.SetWorkSpacePointer(arg_ptr.get(), workspace.GetDeviceBuffer());
+
+    if (!op.IsSupportedArgument(arg_ptr.get())) {
+      std::ostringstream ostr;
+      ostr << op.GetTypeString() << " does not support this problem";
+
+      throw std::runtime_error(ostr.str());
+    }
+
+    (void)invoker.Run(arg_ptr.get(), StreamConfig{stream, false});
+  };
 };
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_backward_bp16.cpp b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_backward_bp16.cpp
index dbee4f9e0..5a9c50ba5 100644
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_backward_bp16.cpp
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_backward_bp16.cpp
@@ -6,25 +6,25 @@
 void grouped_backward_bp16(GroupedBackwardParams& param, hipStream_t stream) {
   if (param.custom_mask_type == 0) {
     if (param.has_attn_bias)
-      grouped_backward_masktype_attnbias_dispatched<ck::bhalf_t, 0, true>(
+      grouped_backward_masktype_attnbias_dispatched<ck::bhalf_t, 0, true>::Run(
           param, stream);
     else
-      grouped_backward_masktype_attnbias_dispatched<ck::bhalf_t, 0, false>(
+      grouped_backward_masktype_attnbias_dispatched<ck::bhalf_t, 0, false>::Run(
           param, stream);
   } else if (param.custom_mask_type == 1) {
     if (param.has_attn_bias)
-      grouped_backward_masktype_attnbias_dispatched<ck::bhalf_t, 1, true>(
+      grouped_backward_masktype_attnbias_dispatched<ck::bhalf_t, 1, true>::Run(
           param, stream);
     else
-      grouped_backward_masktype_attnbias_dispatched<ck::bhalf_t, 1, false>(
+      grouped_backward_masktype_attnbias_dispatched<ck::bhalf_t, 1, false>::Run(
           param, stream);
 
   } else if (param.custom_mask_type == 2) {
     if (param.has_attn_bias)
-      grouped_backward_masktype_attnbias_dispatched<ck::bhalf_t, 2, true>(
+      grouped_backward_masktype_attnbias_dispatched<ck::bhalf_t, 2, true>::Run(
           param, stream);
     else
-      grouped_backward_masktype_attnbias_dispatched<ck::bhalf_t, 2, true>(
+      grouped_backward_masktype_attnbias_dispatched<ck::bhalf_t, 2, true>::Run(
           param, stream);
   } else
     throw std::runtime_error("Invalid custom_mask_type value");
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_backward_fp16.cpp b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_backward_fp16.cpp
index dd0c0f1b8..450632bd3 100644
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_backward_fp16.cpp
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_backward_fp16.cpp
@@ -6,24 +6,24 @@
 void grouped_backward_fp16(GroupedBackwardParams& param, hipStream_t stream) {
   if (param.custom_mask_type == 0) {
     if (param.has_attn_bias)
-      grouped_backward_masktype_attnbias_dispatched<ck::half_t, 0, true>(
+      grouped_backward_masktype_attnbias_dispatched<ck::half_t, 0, true>::Run(
           param, stream);
     else
-      grouped_backward_masktype_attnbias_dispatched<ck::half_t, 0, false>(
+      grouped_backward_masktype_attnbias_dispatched<ck::half_t, 0, false>::Run(
           param, stream);
   } else if (param.custom_mask_type == 1) {
     if (param.has_attn_bias)
-      grouped_backward_masktype_attnbias_dispatched<ck::half_t, 1, true>(
+      grouped_backward_masktype_attnbias_dispatched<ck::half_t, 1, true>::Run(
           param, stream);
     else
-      grouped_backward_masktype_attnbias_dispatched<ck::half_t, 1, false>(
+      grouped_backward_masktype_attnbias_dispatched<ck::half_t, 1, false>::Run(
           param, stream);
   } else if (param.custom_mask_type == 2) {
     if (param.has_attn_bias)
-      grouped_backward_masktype_attnbias_dispatched<ck::half_t, 2, true>(
+      grouped_backward_masktype_attnbias_dispatched<ck::half_t, 2, true>::Run(
           param, stream);
     else
-      grouped_backward_masktype_attnbias_dispatched<ck::half_t, 2, false>(
+      grouped_backward_masktype_attnbias_dispatched<ck::half_t, 2, false>::Run(
           param, stream);
   } else
     throw std::runtime_error("Invalid custom_mask_type value");
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_forward.h b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_forward.h
index 4ce28c964..999664727 100644
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_forward.h
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_forward.h
@@ -14,9 +14,7 @@
 #include "ck_fmha_params.h"
 
 template <typename scalar_t, int32_t custom_mask_type, bool has_attn_bias>
-void grouped_forward_masktype_attnbias_dispatched(
-    GroupedForwardParams& param,
-    hipStream_t stream) {
+struct grouped_forward_masktype_attnbias_dispatched {
   using PassThrough = ck::tensor_operation::element_wise::PassThrough;
 
   using GemmDataType = scalar_t;
@@ -60,196 +58,198 @@ void grouped_forward_masktype_attnbias_dispatched(
       ck::tensor_operation::device::TensorSpecialization::Default;
   static constexpr bool Deterministic = false;
 
-  // Tunables
-  static constexpr ck::index_t ABBlockTransferSrcScalarPerVector = 1;
-  static constexpr ck::index_t B1CShuffleBlockTransferScalarPerVector = 1;
-  static constexpr ck::index_t Acc0BiasTransferSrcScalarPerVector = 1;
-
-  using DeviceOpInstance = ck::tensor_operation::device::
-      DeviceGroupedMultiheadAttentionForward_Xdl_CShuffle_V2<
-          NumDimG,
-          NumDimM,
-          NumDimN,
-          NumDimK,
-          NumDimO,
-          ADataType,
-          B0DataType,
-          B1DataType,
-          CDataType,
-          GemmDataType,
-          ZDataType,
-          LSEDataType,
-          Acc0BiasDataType,
-          Acc1BiasDataType,
-          AccDataType,
-          CShuffleDataType,
-          AElementOp,
-          B0ElementOp,
-          Acc0ElementOp,
-          B1ElementOp,
-          CElementOp,
-          GemmSpec,
-          TensorSpecA,
-          TensorSpecB0,
-          TensorSpecB1,
-          TensorSpecC,
-          1,
-          256,
-          128, // MPerBlock
-          128, // NPerBlock
-          32, // KPerBlock
-          128, // Gemm1NPerBlock
-          32, // Gemm1KPerBlock
-          8, // AK1
-          8, // BK1
-          2, // B1K1
-          32, // MPerXDL
-          32, // NPerXDL
-          1, // MXdlPerWave
-          4, // NXdlPerWave
-          4, // Gemm1NXdlPerWave
-          1, // DropoutStep
-          S<4, 64, 1>, // ABlockTransfer
-          S<1, 0, 2>,
-          S<1, 0, 2>,
-          2,
-          ABBlockTransferSrcScalarPerVector, // TUNABLE
-          8,
-          true,
-          S<4, 64, 1>, // BBlockTransfer
-          S<1, 0, 2>,
-          S<1, 0, 2>,
-          2,
-          ABBlockTransferSrcScalarPerVector, // TUNABLE
-          8,
-          true,
-          Acc0BiasTransferSrcScalarPerVector,
-          S<8, 32, 1>, // B1BlockTransfer
-          S<0, 2, 1>,
-          S<0, 2, 1>,
-          1,
-          B1CShuffleBlockTransferScalarPerVector, // TUNABLE
-          2,
-          false,
-          1, // CShuffleMXdlPerWavePerShuffle
-          2, // CShuffleNXdlPerWavePerShuffle
-          S<1,
-            32,
+  static void Run(GroupedForwardParams& param, hipStream_t stream) {
+    // Tunables
+    constexpr ck::index_t ABBlockTransferSrcScalarPerVector = 1;
+    constexpr ck::index_t B1CShuffleBlockTransferScalarPerVector = 1;
+    constexpr ck::index_t Acc0BiasTransferSrcScalarPerVector = 1;
+
+    using DeviceOpInstance = ck::tensor_operation::device::
+        DeviceGroupedMultiheadAttentionForward_Xdl_CShuffle_V2<
+            NumDimG,
+            NumDimM,
+            NumDimN,
+            NumDimK,
+            NumDimO,
+            ADataType,
+            B0DataType,
+            B1DataType,
+            CDataType,
+            GemmDataType,
+            ZDataType,
+            LSEDataType,
+            Acc0BiasDataType,
+            Acc1BiasDataType,
+            AccDataType,
+            CShuffleDataType,
+            AElementOp,
+            B0ElementOp,
+            Acc0ElementOp,
+            B1ElementOp,
+            CElementOp,
+            GemmSpec,
+            TensorSpecA,
+            TensorSpecB0,
+            TensorSpecB1,
+            TensorSpecC,
             1,
-            8>, // CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock
-          B1CShuffleBlockTransferScalarPerVector, // TUNABLE
-          1,
-          MaskingSpec, // MaskingSpecialization
-          Deterministic>;
-
-  std::vector<typename DeviceOpInstance::ProblemDesc> problem_descs;
-
-  for (std::size_t i = 0; i < param.num_batches; i++) {
-    int M = param.host_seqstart_q[i + 1] - param.host_seqstart_q[i];
-    int N = param.host_seqlen_k.empty()
-        ? param.host_seqstart_k[i + 1] - param.host_seqstart_k[i]
-        : param.host_seqlen_k[i];
-    int K = param.K;
-    int Kv = param.Kv;
-    int G1 = param.num_heads;
-
-    std::vector<ck::index_t> a_gs_ms_ks_lengths{1, G1, M, K};
-    std::vector<ck::index_t> a_gs_ms_ks_strides{
-        0, param.q_strides[1], param.q_strides[0], param.q_strides[2]};
-
-    std::vector<ck::index_t> b0_gs_ns_ks_lengths{1, G1, N, K};
-    std::vector<ck::index_t> b0_gs_ns_ks_strides{
-        0, param.k_strides[1], param.k_strides[0], param.k_strides[2]};
-
-    // to be changed to b1_gs_ns_os_lengths
-    std::vector<ck::index_t> b1_gs_os_ns_lengths{1, G1, Kv, N};
-    std::vector<ck::index_t> b1_gs_os_ns_strides{
-        0, param.v_strides[1], param.v_strides[2], param.v_strides[0]};
-
-    std::vector<ck::index_t> c_gs_ms_os_lengths{1, G1, M, Kv};
-    std::vector<ck::index_t> c_gs_ms_os_strides{
-        0, param.out_strides[1], param.out_strides[0], param.out_strides[2]};
-
-    std::vector<ck::index_t> lse_gs_ms_lengths{1, G1, M};
-    std::vector<ck::index_t> lse_gs_ms_strides{0, param.max_seqlen_q, 1};
-
-    std::vector<ck::index_t> d_gs_ms_ns_lengths;
-    std::vector<ck::index_t> d_gs_ms_ns_strides;
-
-    if constexpr (has_attn_bias) {
-      d_gs_ms_ns_lengths = {1, G1, M, N};
-      d_gs_ms_ns_strides = {
-          0,
-          param.attn_bias_strides[0],
-          param.attn_bias_strides[1],
-          param.attn_bias_strides[2]};
-
-    } else {
-      d_gs_ms_ns_lengths = {1, 1, 1, 1};
-      d_gs_ms_ns_strides = {0, 0, 0, 0};
-    };
-
-    problem_descs.push_back(
-        {a_gs_ms_ks_lengths,
-         a_gs_ms_ks_strides,
-         b0_gs_ns_ks_lengths,
-         b0_gs_ns_ks_strides,
-         b1_gs_os_ns_lengths,
-         b1_gs_os_ns_strides,
-         c_gs_ms_os_lengths,
-         c_gs_ms_os_strides,
-         {1, 1, 1, 1},
-         {0, 0, 0, 0},
-         lse_gs_ms_lengths,
-         lse_gs_ms_strides,
-         d_gs_ms_ns_lengths,
-         d_gs_ms_ns_strides,
-         {}, // acc1_bias_gs_ms_os_lengths
-         {}}); // acc1_bias_gs_ms_os_strides
-  }
-
-  float alpha = param.scale;
-
-  auto a_element_op = AElementOp{};
-  auto b0_element_op = B0ElementOp{};
-  auto acc0_element_op = Acc0ElementOp{alpha};
-  auto b1_element_op = B1ElementOp{};
-  auto c_element_op = CElementOp{};
-
-  auto op = DeviceOpInstance{};
-  auto invoker = op.MakeInvoker();
-
-  auto arg_ptr = op.MakeArgumentPointer(
-      param.q_ptrs,
-      param.k_ptrs,
-      param.v_ptrs,
-      param.out_ptrs,
-      param.randvals_ptrs,
-      param.logsumexp_ptrs,
-      param.attn_bias_ptrs,
-      {}, // p_acc1_biases
-      problem_descs,
-      a_element_op,
-      b0_element_op,
-      acc0_element_op,
-      b1_element_op,
-      c_element_op,
-      param.use_dropout ? param.dropout_prob : 0.0f, // dropout ratio
-      std::tuple<int64_t, int64_t>(param.philox_seed, param.philox_offset));
-
-  auto sizeInBytes = op.GetWorkSpaceSize(arg_ptr.get());
-
-  SimpleDeviceMem workspace(sizeInBytes);
-
-  op.SetWorkSpacePointer(arg_ptr.get(), workspace.GetDeviceBuffer());
-
-  if (!op.IsSupportedArgument(arg_ptr.get())) {
-    std::ostringstream ostr;
-
-    ostr << op.GetTypeString() << " does not support this problem";
-
-    throw std::runtime_error(ostr.str());
-  }
-
-  (void)invoker.Run(arg_ptr.get(), StreamConfig{stream, false});
+            256,
+            128, // MPerBlock
+            128, // NPerBlock
+            32, // KPerBlock
+            128, // Gemm1NPerBlock
+            32, // Gemm1KPerBlock
+            8, // AK1
+            8, // BK1
+            2, // B1K1
+            32, // MPerXDL
+            32, // NPerXDL
+            1, // MXdlPerWave
+            4, // NXdlPerWave
+            4, // Gemm1NXdlPerWave
+            1, // DropoutStep
+            S<4, 64, 1>, // ABlockTransfer
+            S<1, 0, 2>,
+            S<1, 0, 2>,
+            2,
+            ABBlockTransferSrcScalarPerVector, // TUNABLE
+            8,
+            true,
+            S<4, 64, 1>, // BBlockTransfer
+            S<1, 0, 2>,
+            S<1, 0, 2>,
+            2,
+            ABBlockTransferSrcScalarPerVector, // TUNABLE
+            8,
+            true,
+            Acc0BiasTransferSrcScalarPerVector,
+            S<8, 32, 1>, // B1BlockTransfer
+            S<0, 2, 1>,
+            S<0, 2, 1>,
+            1,
+            B1CShuffleBlockTransferScalarPerVector, // TUNABLE
+            2,
+            false,
+            1, // CShuffleMXdlPerWavePerShuffle
+            2, // CShuffleNXdlPerWavePerShuffle
+            S<1,
+              32,
+              1,
+              8>, // CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock
+            B1CShuffleBlockTransferScalarPerVector, // TUNABLE
+            1,
+            MaskingSpec, // MaskingSpecialization
+            Deterministic>;
+
+    std::vector<typename DeviceOpInstance::ProblemDesc> problem_descs;
+
+    for (std::size_t i = 0; i < param.num_batches; i++) {
+      int M = param.host_seqstart_q[i + 1] - param.host_seqstart_q[i];
+      int N = param.host_seqlen_k.empty()
+          ? param.host_seqstart_k[i + 1] - param.host_seqstart_k[i]
+          : param.host_seqlen_k[i];
+      int K = param.K;
+      int Kv = param.Kv;
+      int G1 = param.num_heads;
+
+      std::vector<ck::index_t> a_gs_ms_ks_lengths{1, G1, M, K};
+      std::vector<ck::index_t> a_gs_ms_ks_strides{
+          0, param.q_strides[1], param.q_strides[0], param.q_strides[2]};
+
+      std::vector<ck::index_t> b0_gs_ns_ks_lengths{1, G1, N, K};
+      std::vector<ck::index_t> b0_gs_ns_ks_strides{
+          0, param.k_strides[1], param.k_strides[0], param.k_strides[2]};
+
+      // to be changed to b1_gs_ns_os_lengths
+      std::vector<ck::index_t> b1_gs_os_ns_lengths{1, G1, Kv, N};
+      std::vector<ck::index_t> b1_gs_os_ns_strides{
+          0, param.v_strides[1], param.v_strides[2], param.v_strides[0]};
+
+      std::vector<ck::index_t> c_gs_ms_os_lengths{1, G1, M, Kv};
+      std::vector<ck::index_t> c_gs_ms_os_strides{
+          0, param.out_strides[1], param.out_strides[0], param.out_strides[2]};
+
+      std::vector<ck::index_t> lse_gs_ms_lengths{1, G1, M};
+      std::vector<ck::index_t> lse_gs_ms_strides{0, param.max_seqlen_q, 1};
+
+      std::vector<ck::index_t> d_gs_ms_ns_lengths;
+      std::vector<ck::index_t> d_gs_ms_ns_strides;
+
+      if constexpr (has_attn_bias) {
+        d_gs_ms_ns_lengths = {1, G1, M, N};
+        d_gs_ms_ns_strides = {
+            0,
+            param.attn_bias_strides[0],
+            param.attn_bias_strides[1],
+            param.attn_bias_strides[2]};
+
+      } else {
+        d_gs_ms_ns_lengths = {1, 1, 1, 1};
+        d_gs_ms_ns_strides = {0, 0, 0, 0};
+      };
+
+      problem_descs.push_back(
+          {a_gs_ms_ks_lengths,
+           a_gs_ms_ks_strides,
+           b0_gs_ns_ks_lengths,
+           b0_gs_ns_ks_strides,
+           b1_gs_os_ns_lengths,
+           b1_gs_os_ns_strides,
+           c_gs_ms_os_lengths,
+           c_gs_ms_os_strides,
+           {1, 1, 1, 1},
+           {0, 0, 0, 0},
+           lse_gs_ms_lengths,
+           lse_gs_ms_strides,
+           d_gs_ms_ns_lengths,
+           d_gs_ms_ns_strides,
+           {}, // acc1_bias_gs_ms_os_lengths
+           {}}); // acc1_bias_gs_ms_os_strides
+    }
+
+    float alpha = param.scale;
+
+    auto a_element_op = AElementOp{};
+    auto b0_element_op = B0ElementOp{};
+    auto acc0_element_op = Acc0ElementOp{alpha};
+    auto b1_element_op = B1ElementOp{};
+    auto c_element_op = CElementOp{};
+
+    auto op = DeviceOpInstance{};
+    auto invoker = op.MakeInvoker();
+
+    auto arg_ptr = op.MakeArgumentPointer(
+        param.q_ptrs,
+        param.k_ptrs,
+        param.v_ptrs,
+        param.out_ptrs,
+        param.randvals_ptrs,
+        param.logsumexp_ptrs,
+        param.attn_bias_ptrs,
+        {}, // p_acc1_biases
+        problem_descs,
+        a_element_op,
+        b0_element_op,
+        acc0_element_op,
+        b1_element_op,
+        c_element_op,
+        param.use_dropout ? param.dropout_prob : 0.0f, // dropout ratio
+        std::tuple<int64_t, int64_t>(param.philox_seed, param.philox_offset));
+
+    auto sizeInBytes = op.GetWorkSpaceSize(arg_ptr.get());
+
+    SimpleDeviceMem workspace(sizeInBytes);
+
+    op.SetWorkSpacePointer(arg_ptr.get(), workspace.GetDeviceBuffer());
+
+    if (!op.IsSupportedArgument(arg_ptr.get())) {
+      std::ostringstream ostr;
+
+      ostr << op.GetTypeString() << " does not support this problem";
+
+      throw std::runtime_error(ostr.str());
+    }
+
+    (void)invoker.Run(arg_ptr.get(), StreamConfig{stream, false});
+  };
 };
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_forward_bp16.cpp b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_forward_bp16.cpp
index 161818a39..e459d16d9 100644
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_forward_bp16.cpp
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_forward_bp16.cpp
@@ -6,24 +6,24 @@
 void grouped_forward_bp16(GroupedForwardParams& param, hipStream_t stream) {
   if (param.custom_mask_type == 0) {
     if (param.has_attn_bias)
-      grouped_forward_masktype_attnbias_dispatched<ck::bhalf_t, 0, true>(
+      grouped_forward_masktype_attnbias_dispatched<ck::bhalf_t, 0, true>::Run(
           param, stream);
     else
-      grouped_forward_masktype_attnbias_dispatched<ck::bhalf_t, 0, false>(
+      grouped_forward_masktype_attnbias_dispatched<ck::bhalf_t, 0, false>::Run(
           param, stream);
   } else if (param.custom_mask_type == 1) {
     if (param.has_attn_bias)
-      grouped_forward_masktype_attnbias_dispatched<ck::bhalf_t, 1, true>(
+      grouped_forward_masktype_attnbias_dispatched<ck::bhalf_t, 1, true>::Run(
           param, stream);
     else
-      grouped_forward_masktype_attnbias_dispatched<ck::bhalf_t, 1, false>(
+      grouped_forward_masktype_attnbias_dispatched<ck::bhalf_t, 1, false>::Run(
           param, stream);
   } else if (param.custom_mask_type == 2) {
     if (param.has_attn_bias)
-      grouped_forward_masktype_attnbias_dispatched<ck::bhalf_t, 2, true>(
+      grouped_forward_masktype_attnbias_dispatched<ck::bhalf_t, 2, true>::Run(
           param, stream);
     else
-      grouped_forward_masktype_attnbias_dispatched<ck::bhalf_t, 2, false>(
+      grouped_forward_masktype_attnbias_dispatched<ck::bhalf_t, 2, false>::Run(
           param, stream);
   } else
     throw std::runtime_error("Invalid custom_mask_type value");
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_forward_fp16.cpp b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_forward_fp16.cpp
index 592bc89e4..cadc30b4b 100644
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_forward_fp16.cpp
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_forward_fp16.cpp
@@ -6,24 +6,24 @@
 void grouped_forward_fp16(GroupedForwardParams& param, hipStream_t stream) {
   if (param.custom_mask_type == 0) {
     if (param.has_attn_bias)
-      grouped_forward_masktype_attnbias_dispatched<ck::half_t, 0, true>(
+      grouped_forward_masktype_attnbias_dispatched<ck::half_t, 0, true>::Run(
           param, stream);
     else
-      grouped_forward_masktype_attnbias_dispatched<ck::half_t, 0, false>(
+      grouped_forward_masktype_attnbias_dispatched<ck::half_t, 0, false>::Run(
           param, stream);
   } else if (param.custom_mask_type == 1) {
     if (param.has_attn_bias)
-      grouped_forward_masktype_attnbias_dispatched<ck::half_t, 1, true>(
+      grouped_forward_masktype_attnbias_dispatched<ck::half_t, 1, true>::Run(
           param, stream);
     else
-      grouped_forward_masktype_attnbias_dispatched<ck::half_t, 1, false>(
+      grouped_forward_masktype_attnbias_dispatched<ck::half_t, 1, false>::Run(
           param, stream);
   } else if (param.custom_mask_type == 2) {
     if (param.has_attn_bias)
-      grouped_forward_masktype_attnbias_dispatched<ck::half_t, 2, true>(
+      grouped_forward_masktype_attnbias_dispatched<ck::half_t, 2, true>::Run(
           param, stream);
     else
-      grouped_forward_masktype_attnbias_dispatched<ck::half_t, 2, false>(
+      grouped_forward_masktype_attnbias_dispatched<ck::half_t, 2, false>::Run(
           param, stream);
   } else
     throw std::runtime_error("Invalid custom_mask_type value");

From cbb4705daf3e9b2389e7a0c2a658dc45d1dd56fe Mon Sep 17 00:00:00 2001
From: Qianfeng Zhang <Qianfeng.Zhang@amd.com>
Date: Mon, 9 Oct 2023 18:56:28 +0000
Subject: [PATCH 079/641] Change to codes structure for selecting device-op
 instances according to run-time parameters

---
 .../csrc/attention/hip_fmha/ck_fmha_batched_backward.h    | 7 +++++++
 .../csrc/attention/hip_fmha/ck_fmha_batched_forward.h     | 5 +++++
 .../csrc/attention/hip_fmha/ck_fmha_grouped_backward.h    | 8 ++++++++
 .../csrc/attention/hip_fmha/ck_fmha_grouped_forward.h     | 5 +++++
 4 files changed, 25 insertions(+)

diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_backward.h b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_backward.h
index f87e3fda3..581c8264e 100644
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_backward.h
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_backward.h
@@ -134,6 +134,13 @@ struct batched_backward_masktype_attnbias_dispatched {
             MaskingSpec,
             Deterministic>;
 
+    RunWithDeviceOp<DeviceOpInstance>(param, stream);
+  };
+
+  template <typename DeviceOpInstance>
+  static void RunWithDeviceOp(
+      BatchedBackwardParams& param,
+      hipStream_t stream) {
     std::vector<ck::index_t> q_gs_ms_ks_lengths{
         param.B, param.num_heads, param.M, param.K};
     std::vector<ck::index_t> q_gs_ms_ks_strides{
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_forward.h b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_forward.h
index b58e1443b..16d972f91 100644
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_forward.h
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_forward.h
@@ -140,6 +140,11 @@ struct batched_forward_masktype_attnbias_dispatched {
             MaskingSpec, // MaskingSpecialization
             Deterministic>;
 
+    RunWithDeviceOp<DeviceOpInstance>(param, stream);
+  };
+
+  template <typename DeviceOpInstance>
+  static void RunWithDeviceOp(BatchedForwardParams& param, hipStream_t stream) {
     std::vector<ck::index_t> a_gs_ms_ks_lengths{
         param.B, param.num_heads, param.M, param.K};
     std::vector<ck::index_t> a_gs_ms_ks_strides{
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_backward.h b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_backward.h
index 74e0a8a49..5f62593f4 100644
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_backward.h
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_backward.h
@@ -135,6 +135,14 @@ struct grouped_backward_masktype_attnbias_dispatched {
             MaskingSpec,
             Deterministic>;
 
+    RunWithDeviceOp<DeviceOpInstance>(param, stream);
+  };
+
+  template <typename DeviceOpInstance>
+  static void RunWithDeviceOp(
+      GroupedBackwardParams& param,
+      hipStream_t stream) {
+    // Tunables
     std::vector<typename DeviceOpInstance::ProblemDesc> problem_descs;
 
     for (std::size_t i = 0; i < param.num_batches; i++) {
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_forward.h b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_forward.h
index 999664727..8849de82d 100644
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_forward.h
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_forward.h
@@ -141,6 +141,11 @@ struct grouped_forward_masktype_attnbias_dispatched {
             MaskingSpec, // MaskingSpecialization
             Deterministic>;
 
+    RunWithDeviceOp<DeviceOpInstance>(param, stream);
+  };
+
+  template <typename DeviceOpInstance>
+  static void RunWithDeviceOp(GroupedForwardParams& param, hipStream_t stream) {
     std::vector<typename DeviceOpInstance::ProblemDesc> problem_descs;
 
     for (std::size_t i = 0; i < param.num_batches; i++) {

From e26535f24cf4572bf61ff26fc17ffad7ff4b7387 Mon Sep 17 00:00:00 2001
From: Qianfeng Zhang <Qianfeng.Zhang@amd.com>
Date: Mon, 9 Oct 2023 23:14:48 +0000
Subject: [PATCH 080/641] Use different instances according to the head-dim
 sizes in batched backward

---
 .../hip_fmha/ck_fmha_batched_backward.h       | 287 +++++++++++++-----
 1 file changed, 210 insertions(+), 77 deletions(-)

diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_backward.h b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_backward.h
index 581c8264e..f339691a7 100644
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_backward.h
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_backward.h
@@ -7,6 +7,7 @@
 #include <ck/tensor_operation/gpu/device/gemm_specialization.hpp>
 #include <ck/tensor_operation/gpu/device/tensor_specialization.hpp>
 #include <ck/tensor_operation/gpu/element/element_wise_operation.hpp>
+#include "ck/tensor_operation/gpu/device/impl/device_batched_mha_bwd_xdl_cshuffle_qloop_v1.hpp"
 #include "ck/tensor_operation/gpu/device/impl/device_batched_mha_bwd_xdl_cshuffle_qloop_v2.hpp"
 
 #include "ck_fmha_op_helper.h"
@@ -62,79 +63,215 @@ struct batched_backward_masktype_attnbias_dispatched {
     constexpr ck::index_t B1CShuffleBlockTransferScalarPerVector = 1;
     constexpr ck::index_t Acc0BiasTransferSrcScalarPerVector = 1;
 
-    using DeviceOpInstance = ck::tensor_operation::device::
-        DeviceBatchedMultiheadAttentionBackward_Qloop_Xdl_CShuffle_V2<
-            NumDimG,
-            NumDimM,
-            NumDimN,
-            NumDimK,
-            NumDimO,
-            InputDataType,
-            OutputDataType,
-            GemmDataType,
-            ZDataType,
-            LSEDataType,
-            Acc0BiasDataType,
-            Acc1BiasDataType,
-            AccDataType,
-            ShuffleDataType,
-            QKVElementOp,
-            QKVElementOp,
-            Scale,
-            QKVElementOp,
-            YElementOp,
-            GemmSpec,
-            TensorSpecQ,
-            TensorSpecK,
-            TensorSpecV,
-            TensorSpecY,
-            1,
-            256,
-            64, // MPerBlock
-            128, // NPerBlock
-            128, // KPerBlock
-            128, // Gemm1NPerBlock
-            32, // Gemm1KPerBlock
-            64, // Gemm2KPerBlock
-            8, // AK1
-            8, // BK1
-            2, // A1K1
-            32, // MPerXDL
-            32, // NPerXDL
-            2, // MXdlPerWave
-            1, // NXdlPerWave
-            4, // Gemm1NXdlPerWave
-            1, // Gemm2NXdlPerWave
-            S<4, 64, 1>, // ABlockTransfer
-            S<1, 0, 2>,
-            S<1, 0, 2>,
-            2,
-            ABBlockTransferSrcScalarPerVector, // TUNABLE
-            8,
-            true,
-            S<4, 64, 1>, // B0BlockTransfer
-            S<1, 0, 2>,
-            S<1, 0, 2>,
-            2,
-            ABBlockTransferSrcScalarPerVector, // TUNABLE
-            8,
-            true,
-            Acc0BiasTransferSrcScalarPerVector, // TUNABLE
-            S<8, 32, 1>, // B1BlockTransfer
-            S<0, 2, 1>,
-            S<0, 2, 1>,
-            1,
-            B1CShuffleBlockTransferScalarPerVector, // TUNABLE
-            2,
-            false,
-            1, // CShuffleMXdlPerWavePerShuffle
-            4, // CShuffleNXdlPerWavePerShuffle
-            S<1, 32, 1, 8>,
-            B1CShuffleBlockTransferScalarPerVector, // TUNABLE
-            MaskingSpec,
-            Deterministic>;
-
-    RunWithDeviceOp<DeviceOpInstance>(param, stream);
+    if (param.K <= 32 && param.Kv <= 32) {
+      using DeviceOpInstance = ck::tensor_operation::device::
+          DeviceBatchedMultiheadAttentionBackward_Qloop_Xdl_CShuffle_V1<
+              NumDimG,
+              NumDimM,
+              NumDimN,
+              NumDimK,
+              NumDimO,
+              InputDataType,
+              OutputDataType,
+              GemmDataType,
+              ZDataType,
+              LSEDataType,
+              Acc0BiasDataType,
+              Acc1BiasDataType,
+              AccDataType,
+              ShuffleDataType,
+              QKVElementOp,
+              QKVElementOp,
+              Scale,
+              QKVElementOp,
+              YElementOp,
+              GemmSpec,
+              TensorSpecQ,
+              TensorSpecK,
+              TensorSpecV,
+              TensorSpecY,
+              1,
+              256,
+              128, // MPerBlock
+              128, // NPerBlock
+              32, // KPerBlock
+              32, // Gemm1NPerBlock
+              32, // Gemm1KperBlock
+              64, // Gemm2KPerBlock
+              8, // AK1
+              8, // BK1
+              2, // B1K1
+              32, // MPerXDL
+              32, // NPerXDL
+              4, // MXdlPerWave
+              1, // NXdlPerWave
+              1, // Gemm1NXdlPerWave
+              1, // Gemm2NXdlPerWave
+              S<4, 64, 1>, // ABlockTransfer
+              S<1, 0, 2>,
+              S<1, 0, 2>,
+              2,
+              ABBlockTransferSrcScalarPerVector, // TUNABLE
+              8,
+              true,
+              S<4, 64, 1>, // BBlockTransfer
+              S<1, 0, 2>,
+              S<1, 0, 2>,
+              2,
+              ABBlockTransferSrcScalarPerVector, // TUNABLE
+              8,
+              true,
+              Acc0BiasTransferSrcScalarPerVector, // TUNABLE
+              1,
+              1,
+              S<1, 64, 1, 4>,
+              B1CShuffleBlockTransferScalarPerVector, // TUNABLE
+              MaskingSpec,
+              Deterministic>;
+
+      RunWithDeviceOp<DeviceOpInstance>(param, stream);
+    } else if (param.K <= 64 && param.Kv <= 64) {
+      using DeviceOpInstance = ck::tensor_operation::device::
+          DeviceBatchedMultiheadAttentionBackward_Qloop_Xdl_CShuffle_V1<
+              NumDimG,
+              NumDimM,
+              NumDimN,
+              NumDimK,
+              NumDimO,
+              InputDataType,
+              OutputDataType,
+              GemmDataType,
+              ZDataType,
+              LSEDataType,
+              Acc0BiasDataType,
+              Acc1BiasDataType,
+              AccDataType,
+              ShuffleDataType,
+              QKVElementOp,
+              QKVElementOp,
+              Scale,
+              QKVElementOp,
+              YElementOp,
+              GemmSpec,
+              TensorSpecQ,
+              TensorSpecK,
+              TensorSpecV,
+              TensorSpecY,
+              1,
+              256,
+              128, // MPerBlock
+              128, // NPerBlock
+              64, // KPerBlock
+              64, // Gemm1NPerBlock
+              32, // Gemm1KPerBlock
+              32, // Gemm2KPerBlock
+              8, // AK1
+              8, // BK1
+              2, // B1K1
+              32, // MPerXDL
+              32, // NPerXDL
+              4, // MXdlPerWave
+              1, // NXdlPerWave
+              2, // Gemm1NXdlPerWave
+              1, // Gemm2NXdlPerWave
+              S<4, 64, 1>, // ABlockTransfer
+              S<1, 0, 2>,
+              S<1, 0, 2>,
+              2,
+              ABBlockTransferSrcScalarPerVector, // TUNABLE
+              8,
+              true,
+              S<4, 64, 1>, // BBlockTransfer
+              S<1, 0, 2>,
+              S<1, 0, 2>,
+              2,
+              ABBlockTransferSrcScalarPerVector, // TUNABLE
+              8,
+              true,
+              Acc0BiasTransferSrcScalarPerVector, // TUNABLE
+              1,
+              2,
+              S<1, 32, 1, 8>,
+              B1CShuffleBlockTransferScalarPerVector, // TUNABLE
+              MaskingSpec,
+              Deterministic>;
+
+      RunWithDeviceOp<DeviceOpInstance>(param, stream);
+    } else {
+      using DeviceOpInstance = ck::tensor_operation::device::
+          DeviceBatchedMultiheadAttentionBackward_Qloop_Xdl_CShuffle_V2<
+              NumDimG,
+              NumDimM,
+              NumDimN,
+              NumDimK,
+              NumDimO,
+              InputDataType,
+              OutputDataType,
+              GemmDataType,
+              ZDataType,
+              LSEDataType,
+              Acc0BiasDataType,
+              Acc1BiasDataType,
+              AccDataType,
+              ShuffleDataType,
+              QKVElementOp,
+              QKVElementOp,
+              Scale,
+              QKVElementOp,
+              YElementOp,
+              GemmSpec,
+              TensorSpecQ,
+              TensorSpecK,
+              TensorSpecV,
+              TensorSpecY,
+              1,
+              256,
+              64, // MPerBlock
+              128, // NPerBlock
+              128, // KPerBlock
+              128, // Gemm1NPerBlock
+              32, // Gemm1KPerBlock
+              64, // Gemm2KPerBlock
+              8, // AK1
+              8, // BK1
+              2, // A1K1
+              32, // MPerXDL
+              32, // NPerXDL
+              2, // MXdlPerWave
+              1, // NXdlPerWave
+              4, // Gemm1NXdlPerWave
+              1, // Gemm2NXdlPerWave
+              S<4, 64, 1>, // ABlockTransfer
+              S<1, 0, 2>,
+              S<1, 0, 2>,
+              2,
+              ABBlockTransferSrcScalarPerVector, // TUNABLE
+              8,
+              true,
+              S<4, 64, 1>, // B0BlockTransfer
+              S<1, 0, 2>,
+              S<1, 0, 2>,
+              2,
+              ABBlockTransferSrcScalarPerVector, // TUNABLE
+              8,
+              true,
+              Acc0BiasTransferSrcScalarPerVector, // TUNABLE
+              S<8, 32, 1>, // B1BlockTransfer
+              S<0, 2, 1>,
+              S<0, 2, 1>,
+              1,
+              B1CShuffleBlockTransferScalarPerVector, // TUNABLE
+              2,
+              false,
+              1, // CShuffleMXdlPerWavePerShuffle
+              4, // CShuffleNXdlPerWavePerShuffle
+              S<1, 32, 1, 8>,
+              B1CShuffleBlockTransferScalarPerVector, // TUNABLE
+              MaskingSpec,
+              Deterministic>;
+
+      RunWithDeviceOp<DeviceOpInstance>(param, stream);
+    };
   };
 
   template <typename DeviceOpInstance>
@@ -234,10 +371,6 @@ struct batched_backward_masktype_attnbias_dispatched {
         param.dropout_prob,
         std::tuple<int64_t, int64_t>(param.philox_seed, param.philox_offset));
 
-    SimpleDeviceMem workspace(op.GetWorkSpaceSize(arg_ptr.get()));
-
-    op.SetWorkSpacePointer(arg_ptr.get(), workspace.GetDeviceBuffer());
-
     if (!op.IsSupportedArgument(arg_ptr.get())) {
       std::ostringstream ostr;
       ostr << op.GetTypeString() << " does not support this problem";

From 8836ab059da4b9300a0856986a7c7090f2c07b02 Mon Sep 17 00:00:00 2001
From: Qianfeng Zhang <Qianfeng.Zhang@amd.com>
Date: Mon, 9 Oct 2023 23:15:57 +0000
Subject: [PATCH 081/641] Update to test_mem_eff_attention_ck.py for
 test_dropout_backward_ck

---
 tests/test_mem_eff_attention_ck.py | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/tests/test_mem_eff_attention_ck.py b/tests/test_mem_eff_attention_ck.py
index 49ab783c0..fdfeb40e9 100644
--- a/tests/test_mem_eff_attention_ck.py
+++ b/tests/test_mem_eff_attention_ck.py
@@ -946,8 +946,6 @@ def test_dropout(dtype, op, q_len, kv_len, batch_size, k_len, p, seed, attn_bias
     assert all(p_values > p_val_tol)
 
 def _test_dropout_backward(q_len, kv_len, batch_size, k, p, op, dtype):
-    if dtype is torch.bfloat16 and compute_capability < (8, 0):
-        pytest.skip("bf16 requires Sm80")
     if not op.is_available():
         pytest.skip()
 
@@ -1034,8 +1032,11 @@ def test_dropout_backward_small_k(q_len, kv_len, batch_size, k, p):
 @pytest.mark.parametrize("batch_size", [1, 2])
 @pytest.mark.parametrize("kv_len", [3, 248, 256])
 @pytest.mark.parametrize("q_len", [3, 248, 256])
-@pytest.mark.parametrize("dt", ["f16", "bf16", "f32"])
+@pytest.mark.parametrize("dt", ["f16", "bf16"])
 def test_dropout_backward_ck(dt, q_len, kv_len, batch_size, k, p):
+    if k > 128:
+        pytest.skip("head-dim size bigger than 128 is not supported by CK-FlashAttention")
+
     _test_dropout_backward(
         q_len,
         kv_len,

From 4470458fc8779877911364af22983ede358224d2 Mon Sep 17 00:00:00 2001
From: Qianfeng Zhang <Qianfeng.Zhang@amd.com>
Date: Mon, 9 Oct 2023 23:16:56 +0000
Subject: [PATCH 082/641] Add test_ck_7.py for temperary debugging of
 test_backward

---
 tests/test_ck_7.py | 868 +++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 868 insertions(+)
 create mode 100644 tests/test_ck_7.py

diff --git a/tests/test_ck_7.py b/tests/test_ck_7.py
new file mode 100644
index 000000000..00a42ead0
--- /dev/null
+++ b/tests/test_ck_7.py
@@ -0,0 +1,868 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
+#
+# This source code is licensed under the BSD license found in the
+# LICENSE file in the root directory of this source tree.
+
+import math
+import random
+from typing import List, Optional, Sequence, Tuple, Type, TypeVar
+
+import pytest
+import torch
+from scipy.stats import binomtest
+from torch.utils.checkpoint import checkpoint
+
+import xformers.ops
+from xformers.ops import fmha
+from xformers.ops.fmha.common import AttentionOpBase
+
+from .utils import assert_allclose
+
+torch.backends.cuda.matmul.allow_tf32 = False
+cuda_only = pytest.mark.skipif(not torch.cuda.is_available(), reason="requires CUDA")
+
+_devices = ["cpu", "cuda"] if torch.cuda.is_available() else ["cpu"]
+_types = [torch.float16, torch.bfloat16]
+
+T = TypeVar(
+    "T", Type[fmha.common.AttentionFwOpBase], Type[fmha.common.AttentionBwOpBase]
+)
+
+ALL_FW_OPS: Sequence[Type[fmha.common.AttentionFwOpBase]] = [
+    fmha.ck.FwOp,
+]
+
+ALL_BW_OPS: Sequence[Type[fmha.common.AttentionBwOpBase]] = [
+    fmha.ck.BwOp,
+]
+
+def sample_random_supported_fw(
+    inp: fmha.Inputs, seed: int
+) -> Type[fmha.common.AttentionFwOpBase]:
+    r = random.Random(seed)
+    fw_ops = list(ALL_FW_OPS)
+    r.shuffle(fw_ops)
+    for op in fw_ops:
+        if op.supports(inp):
+            return op
+    raise NotImplementedError(f"Could not find a FW operator for: {inp}")
+
+
+def generate_test_shapes_B_Mq_Mkv_H_K_Kv(op):
+    shapes = []
+    for B in op._TEST_BATCH_SIZES:
+        for Mq in [32, 256]:
+            for Mkv in [32, 64, 256, 1024]:
+                for K in op._TEST_K:
+                    shapes.append((B, Mq, Mkv, 1, K, K))
+        Mq = 256
+        Mkv = 128
+        K = 32
+        H = 1
+        # Weird values of parameters
+        for M in [2, 3, 15, 31, 32, 34, 68, 72, 90, 132, 136]:
+            shapes.append((B, M, Mkv, H, K, K))
+            shapes.append((B, Mq, M, H, K, K))
+        for _K in [1, 2, 3, 31, 34, 36, 38, 40, 64, 80, 160, 256 + 2, 256 + 8, 512]:
+            if _K <= op.SUPPORTED_MAX_K:
+                shapes.append((B, Mq, Mkv, H, _K, _K))
+        # Different value for K / Kv
+        if op.SUPPORTS_DIFFERENT_VALUE_EMBED:
+            for _K in [32, 36, 64, 256 + 8]:
+                shapes.append((B, Mq, Mkv, H, K, _K))
+                shapes.append((B, Mq, Mkv, H, _K, K))
+        # Exotic sizes
+        for K in op._TEST_K:
+            shapes.append((B, 16, 1024, H, K, K))
+            shapes.append((B, 1024, 16, H, K, K))
+        # Some number of heads
+        for H in [3, 5, 12]:
+            shapes.append((max(1, B // H), Mq, Mkv, H, K, K))
+    # Filter-out not supported shapes
+    shapes = [
+        shape
+        for shape in shapes
+        if len(
+            op.shape_not_supported_reasons(
+                Mq=shape[1], Mkv=shape[2], K=shape[4], Kv=shape[5]
+            )
+        )
+        == 0
+    ]
+    # Add some random shapes
+    if op in [
+        fmha.ck.FwOp,
+        fmha.ck.BwOp,
+    ]:
+        K_CHOICES = [8 * i for i in range(1, 256 // 8)]
+        r = random.Random(0)
+        found_count = 0
+        while found_count < 20:
+            B = r.randint(1, 400)
+            Mq = r.randint(1, 500)
+            Mkv = r.randint(1, 500)
+            H = r.randint(2, 11)
+            B = max(B // H, 1)
+            K = r.choice(K_CHOICES)
+            Kv = r.choice(K_CHOICES)
+            if not op.SUPPORTS_DIFFERENT_VALUE_EMBED:
+                Kv = K
+            if len(op.shape_not_supported_reasons(Mq, Mkv, K, Kv)):
+                continue
+            found_count += 1
+            shapes.append((B, Mq, Mkv, H, K, Kv))
+    return shapes
+
+
+def make_id(op, device, dtype, bias_type, *shape):
+    return (
+        f"{op.NAME}-{device}-{str(dtype)}-{bias_type.__name__}"
+        f"-{'-'.join([str(s) for s in shape])}"
+    )
+
+
+def _generate_op_device_dtype_biasT_B_Mq_Mkv_H_K_Kv(
+    ops_list: Sequence[Type[fmha.AttentionOpBase]], max_shapes_per_op: int = 65000
+):
+    r = random.Random(0)
+    combination = []
+    ids = []
+    for op in ops_list:
+        op_count = 0
+        # Sort list of masks, so it's deterministic across runs
+        LIST_MASKS = list(sorted(op.SUPPORTED_ATTN_BIAS_TYPES, key=lambda x: str(x)))
+        for shape in generate_test_shapes_B_Mq_Mkv_H_K_Kv(op):
+            has_one = False
+            for device in _devices:
+                if device not in op.SUPPORTED_DEVICES:
+                    continue
+                for dtype in op.SUPPORTED_DTYPES:
+                    bias_type = r.choice(LIST_MASKS)
+                    # Avoid using too much memory
+                    if bias_type not in [
+                        type(None),
+                        fmha.attn_bias.LowerTriangularMask,
+                    ]:
+                        B, Mq, Mkv, H, K, Kv = shape
+                        B = min(B, 12)
+
+                        if (
+                            bias_type
+                            is fmha.attn_bias.BlockDiagonalCausalFromBottomRightMask
+                        ):
+                            Mq, Mkv = min(Mkv, Mq), max(Mkv, Mq) + 2
+                        elif (
+                            bias_type
+                            is fmha.attn_bias.BlockDiagonalCausalWithOffsetPaddedKeysMask
+                        ):
+                            Mq, Mkv = min(Mkv, Mq), max(Mkv, Mq)
+                        shape = (B, Mq, Mkv, H, K, Kv)
+                    combination.append((op, device, dtype, bias_type, *shape))
+                    ids.append(
+                        f"{op.NAME}-{device}-{str(dtype)}-{bias_type.__name__}"
+                        f"-{'-'.join([str(s) for s in shape])}"
+                    )
+                    has_one = True
+            if has_one:
+                op_count += 1
+            if op_count > max_shapes_per_op:
+                break
+        # Some specific shapes for which we want to run without any mask
+        bias_type = type(None)
+        for shape in (
+            # Some strides/dims don't fit on an uint16
+            (1, 128, 128, 300, 128, 128),
+            (13, 1, 67, 200, 8, 8),
+            (1, 1 + 2**16, 4, 1, 8, 8),
+            (1, 4, 1 + 2**16, 1, 8, 8),
+            # TODO: Some strides don't fit on an uint32
+            # Crashes on Flash, Errors on Cutlass
+            # (1, 1, 64000, 300, 128, 128)
+        ):
+            for device in _devices:
+                if device not in op.SUPPORTED_DEVICES:
+                    continue
+                for dtype in op.SUPPORTED_DTYPES:
+                    combination.append((op, device, dtype, bias_type, *shape))
+    return {
+        "argvalues": combination,
+        "ids": [make_id(*c) for c in combination],
+    }
+
+
+parametrize_opFW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv = pytest.mark.parametrize(
+    "opFW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv",
+    **_generate_op_device_dtype_biasT_B_Mq_Mkv_H_K_Kv(ALL_FW_OPS),
+)
+parametrize_opFW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv__xs = pytest.mark.parametrize(
+    "opFW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv",
+    **_generate_op_device_dtype_biasT_B_Mq_Mkv_H_K_Kv(ALL_FW_OPS, max_shapes_per_op=1),
+)
+parametrize_opBW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv = pytest.mark.parametrize(
+    "opBW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv",
+    **_generate_op_device_dtype_biasT_B_Mq_Mkv_H_K_Kv(ALL_BW_OPS),
+)
+parametrize_opBW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv__xs = pytest.mark.parametrize(
+    "opBW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv",
+    **_generate_op_device_dtype_biasT_B_Mq_Mkv_H_K_Kv(ALL_BW_OPS, max_shapes_per_op=1),
+)
+
+
+def ref_attention(q, k, v, attn_bias=None, drop_mask=None, p=0.0, scale=None):
+    if q.ndim == 4:
+        assert p == 0.0
+        return ref_attention_bmhk(q, k, v, attn_bias=attn_bias)
+    q = q.float()
+    k = k.float()
+    v = v.float()
+
+    scale = scale if scale is not None else (1 / q.shape[-1] ** 0.5)
+    q = q * scale
+
+    attn = q @ k.transpose(-2, -1)
+    if attn_bias is not None:
+        if isinstance(attn_bias, xformers.ops.AttentionBias):
+            # Always create in B,H,Mq,Mk format
+            attn_bias_tensor = attn_bias.materialize(
+                (q.shape[0], 1, q.shape[1], k.shape[1]),
+                device=q.device,
+                dtype=torch.float32,
+            )
+        else:
+            attn_bias_tensor = attn_bias
+        if attn_bias_tensor.ndim == 4:
+            assert q.shape[0] == attn_bias_tensor.shape[0] * attn_bias_tensor.shape[1]
+            attn_bias_tensor = attn_bias_tensor.reshape(
+                [-1, *attn_bias_tensor.shape[2:]]
+            )
+        attn = attn + attn_bias_tensor.float()
+    attn = attn.softmax(-1)
+    if drop_mask is not None:
+        attn = attn * (drop_mask / (1 - p))
+    return attn @ v
+
+
+def ref_attention_bmhk(q, k, v, attn_bias, scale=None) -> torch.Tensor:
+    assert q.ndim == 4
+
+    def T(t):
+        return t.permute((0, 2, 1, 3)).reshape(
+            [t.shape[0] * t.shape[2], t.shape[1], t.shape[3]]
+        )
+
+    if isinstance(attn_bias, xformers.ops.AttentionBias):
+        attn_bias = attn_bias.materialize(
+            (q.shape[0], q.shape[2], q.shape[1], k.shape[1]),
+            device=q.device,
+            dtype=torch.float32,
+        ).reshape([q.shape[0] * q.shape[2], q.shape[1], k.shape[1]])
+    out = ref_attention(T(q), T(k), T(v), attn_bias, scale=scale)
+    out = out.reshape([q.shape[0], q.shape[2], q.shape[1], v.shape[3]])
+    return out.permute((0, 2, 1, 3))
+
+
+def _rand_seqlens(
+    r: random.Random,
+    bs: int,
+    q_len: int,
+    kv_len: int,
+    more_keys_than_queries_per_block: bool,
+) -> Tuple[Sequence[int], Sequence[int]]:
+    """
+    Generates lists of lengths of query blocks and corresponding key blocks.
+    The total number of queries will be bs * q_len and the
+    total number of keys will be bs * kv_len.
+    """
+    if more_keys_than_queries_per_block:
+        assert kv_len >= q_len
+    q_len *= bs
+    kv_len *= bs
+    seqlens_q: List[int] = []
+    seqlens_k: List[int] = []
+
+    step_q = [max(1, q_len // 10), max(2, q_len // 2)]
+    step_k = [max(1, kv_len // 10), max(2, kv_len // 2)]
+    while sum(seqlens_q) < q_len and sum(seqlens_k) < kv_len:
+        num_queries = r.randrange(*step_q)
+        seqlens_q.append(num_queries)
+
+        if more_keys_than_queries_per_block:
+            # Must select at least `num_queries` keys
+            # But also leave enough keys for later
+            keys_left = kv_len - sum(seqlens_k, 0)
+            queries_left = q_len - sum(seqlens_q[:-1], 0)
+            assert keys_left >= queries_left
+            seqlens_k.append(num_queries + r.randrange(0, keys_left - queries_left))
+        else:
+            seqlens_k.append(r.randrange(*step_k))
+    seqlens_q[-1] = q_len - sum(seqlens_q[:-1])
+    seqlens_k[-1] = kv_len - sum(seqlens_k[:-1])
+    return seqlens_q, seqlens_k
+
+
+def _rand_partition(r: random.Random, total: int, n: int) -> List[int]:
+    # returns list of n nonnegative integers summing to total
+    idx = {0, total}
+    while len(idx) < n + 1:
+        idx.add(r.randint(1, total - 1))
+    s = sorted(idx)
+    return [e - b for b, e in zip(s[:-1], s[1:])]
+
+
+def _rand_maxed_partition(
+    r: random.Random, total: int, n: int, mx: int, positive: bool = True
+) -> List[int]:
+    # returns list of n nonnegative integers less than mx summing to total
+    # NB: This is unfortunately biased towards evenly-split bins.
+    # If `positive`, outputs are positive
+    if positive:
+        total -= n
+        mx -= 1
+    idxs = r.sample(range(n * mx), total)
+    y = torch.zeros(n, mx, dtype=torch.int32)
+    y.flatten()[idxs] = 1
+    z = y.sum(1)
+    if positive:
+        z += 1
+    return z.tolist()
+
+
+def _rand_seqlens_padded_k(
+    r: random.Random, bs: int, q_len: int, kv_len: int
+) -> Tuple[Sequence[int], Sequence[int]]:
+    # This is for BlockDiagonalCausalWithOffsetPaddedKeysMask.
+    # we need q_seqlens and k_seqlens to be of len bsz.
+    # For each "batch element" there must be more keys than queries
+    # because this bias type is "bottom right" and so any extra queries
+    # will attend to nothing and have undefined result.
+    # In addition every element of k_seqlens must be <= kv_len
+    if q_len > kv_len:
+        raise ValueError("need more keys than values")
+    if q_len == kv_len:
+        # all key slots are needed so we cannot have padding
+        q_seqlens = k_seqlens = [kv_len] * bs
+    else:
+        q_seqlens = _rand_maxed_partition(r, q_len * bs, bs, kv_len)
+        k_seqlens = [r.randint(i, kv_len) for i in q_seqlens]
+    return q_seqlens, k_seqlens
+
+
+def _create_aligned_bias(B: int, H: int, Mq: int, Mkv: int, **kwargs) -> torch.Tensor:
+    align_to = 8
+    return (
+        torch.randn(
+            (
+                B,
+                H,
+                Mq,
+                align_to * ((Mkv + align_to - 1) // align_to),
+            ),
+            **kwargs,
+        )
+        * 3
+    )[:, :, :, :Mkv]
+
+
+def create_attn_bias(
+    bias_type,
+    batch_size: int,
+    num_heads: int,
+    q_len: int,
+    kv_len: int,
+    device,
+    dtype,
+    requires_grad: bool,
+    fmt: str,
+    op: Type[AttentionOpBase],
+):
+    if bias_type is None or isinstance(None, bias_type):
+        return None
+    r = random.Random("-".join(map(str, [batch_size, q_len, kv_len, dtype, fmt])))
+    if bias_type is torch.Tensor:
+        if fmt == "BMK":
+            batch_size *= num_heads
+            num_heads = 1
+        # `small_k` only supports an expanded 1d bias
+        if op in [fmha.small_k.FwOp, fmha.small_k.BwOp]:
+            attn_bias = (
+                torch.randn(
+                    (batch_size, num_heads, 1, kv_len), device=device, dtype=dtype
+                )
+                * 3
+            )
+            attn_bias = attn_bias.expand(batch_size, num_heads, q_len, kv_len)
+        else:
+            attn_bias = _create_aligned_bias(
+                batch_size,
+                num_heads,
+                q_len,
+                kv_len,
+                device=device,
+                dtype=dtype,
+            )
+            # ToDo: need a fix in ck-flashAttn to avoid divided-by-zero when all-(-inf) occurred
+            #       with the data read by one-thread
+            # make sure it also works if the first columns are partially masked out
+            ## attn_bias[0, 0, q_len - 1 :, : num_heads - 2] = -math.inf
+
+        if requires_grad:
+            attn_bias.requires_grad_(True)
+        if fmt == "BMK":
+            attn_bias = attn_bias[:, 0]
+        return attn_bias
+    if bias_type is fmha.attn_bias.LowerTriangularMask:
+        return fmha.attn_bias.LowerTriangularMask()
+    if bias_type is fmha.attn_bias.LowerTriangularMaskWithTensorBias:
+        attn_bias = _create_aligned_bias(
+            batch_size,
+            num_heads,
+            q_len,
+            kv_len,
+            device=device,
+            dtype=dtype,
+        )
+        if requires_grad:
+            attn_bias.requires_grad_(True)
+        return fmha.attn_bias.LowerTriangularMaskWithTensorBias(attn_bias)
+    if bias_type in [
+        fmha.attn_bias.BlockDiagonalMask,
+        fmha.attn_bias.BlockDiagonalCausalMask,
+        fmha.attn_bias.BlockDiagonalCausalFromBottomRightMask,
+    ]:
+        # This bias is not supported in BMK format
+        assert fmt == "BMHK"
+        block_diag = fmha.attn_bias.BlockDiagonalMask.from_seqlens(
+            *_rand_seqlens(
+                r,
+                batch_size,
+                q_len,
+                kv_len,
+                more_keys_than_queries_per_block=bias_type
+                is fmha.attn_bias.BlockDiagonalCausalFromBottomRightMask,
+            )
+        )
+        if bias_type is fmha.attn_bias.BlockDiagonalCausalMask:
+            block_diag = block_diag.make_causal()
+        if bias_type is fmha.attn_bias.BlockDiagonalCausalFromBottomRightMask:
+            block_diag = block_diag.make_causal_from_bottomright()
+        return block_diag
+    if bias_type == fmha.attn_bias.BlockDiagonalCausalWithOffsetPaddedKeysMask:
+        assert fmt == "BMHK"
+        q, k = _rand_seqlens_padded_k(r, batch_size, q_len, kv_len)
+        g_block_diag = (
+            fmha.attn_bias.BlockDiagonalCausalWithOffsetPaddedKeysMask.from_seqlens(
+                q_seqlen=q,
+                kv_padding=kv_len,
+                kv_seqlen=k,
+            )
+        )
+        return g_block_diag
+
+    assert False, f"Unsupported bias type: {bias_type}"
+
+
+def get_bias_grad(attn_bias, clear: bool = False) -> Optional[torch.Tensor]:
+    tensor_with_grad: Optional[torch.Tensor] = None
+    if isinstance(attn_bias, torch.Tensor):
+        tensor_with_grad = attn_bias
+    if isinstance(attn_bias, fmha.attn_bias.LowerTriangularMaskWithTensorBias):
+        tensor_with_grad = attn_bias._bias
+    if tensor_with_grad is not None:
+        grad = tensor_with_grad.grad
+        if clear:
+            tensor_with_grad.grad = None
+        return grad
+    return None
+
+
+def create_tensors(
+    op: Type[AttentionOpBase],
+    device,
+    dtype,
+    attn_bias_type,
+    B,
+    q_len,
+    kv_len,
+    h,
+    k,
+    kv,
+    *,
+    attn_bias_requires_grad: bool = False,
+    fmt: str = "BMK",
+):
+    torch.manual_seed(B * q_len + kv_len * k + kv)
+    scale = 3
+    if fmt == "BMK":
+        query = torch.randn((B * h, q_len, k), device=device, dtype=dtype).mul_(scale)
+        key = torch.randn((B * h, kv_len, k), device=device, dtype=dtype).mul_(scale)
+        value = torch.randn((B * h, kv_len, kv), device=device, dtype=dtype).mul_(scale)
+    else:
+        assert fmt == "BMHK"
+        query = torch.randn((B, q_len, h, k), device=device, dtype=dtype).mul_(scale)
+        key = torch.randn((B, kv_len, h, k), device=device, dtype=dtype).mul_(scale)
+        value = torch.randn((B, kv_len, h, kv), device=device, dtype=dtype).mul_(scale)
+
+    if fmt == "BMK" and not fmha.common._is_bias_type_supported_in_BMK(attn_bias_type):
+        attn_bias_type = None
+    attn_bias = None
+    if attn_bias_type is not None:
+        attn_bias = create_attn_bias(
+            attn_bias_type,
+            batch_size=B,
+            num_heads=h,
+            q_len=q_len,
+            kv_len=kv_len,
+            dtype=dtype,
+            device=device,
+            requires_grad=attn_bias_requires_grad,
+            fmt=fmt,
+            op=op,
+        )
+        if isinstance(
+            attn_bias,
+            (
+                fmha.attn_bias.BlockDiagonalMask,
+                fmha.attn_bias.BlockDiagonalCausalWithOffsetPaddedKeysMask,
+            ),
+        ):
+            query, key, value = [
+                x.reshape([1, -1, *x.shape[2:]]) for x in [query, key, value]
+            ]
+
+    inputs = fmha.Inputs(query=query, key=key, value=value, attn_bias=attn_bias)
+    reasons = op.not_supported_reasons(inputs)
+    if reasons:
+        err_msg = f"{op.NAME}: unsupported ({'/'.join(reasons)})"
+        # Ensure we free memory to avoid OOMs
+        del query, key, value, attn_bias, inputs
+        pytest.skip(err_msg)
+    return query, key, value, attn_bias
+
+
+def bmhk2bmk(tensor) -> torch.Tensor:
+    return (
+        tensor.permute((0, 2, 1, 3))
+        .contiguous()
+        .view([tensor.shape[0] * tensor.shape[2], tensor.shape[1], tensor.shape[3]])
+    )
+
+
+def bmk2bmhk(tensor, num_heads: int) -> torch.Tensor:
+    return tensor.reshape([-1, num_heads, tensor.shape[1], tensor.shape[2]]).permute(
+        (0, 2, 1, 3)
+    )
+
+
+@pytest.mark.parametrize("fmt", ["BMK", "BMHK"])
+@pytest.mark.parametrize("packed", [False, True])
+@parametrize_opFW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv
+def test_forward(
+    opFW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv,
+    packed,
+    fmt,
+):
+    (
+        op,
+        device,
+        dtype,
+        bias_type,
+        batch_size,
+        q_len,
+        kv_len,
+        h,
+        k,
+        kv,
+    ) = opFW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv
+
+    if kv > 128:
+        pytest.skip("kv > 128 is not supported by CK-FlashAttention-1")
+
+    if packed and not (k == kv and q_len == kv_len):
+        pytest.skip(
+            f"packed incompatible with `k ({k}) != kv ({kv})` or `q_len ({q_len}) != kv_len ({kv_len})`"
+        )
+    if fmt == "BMK" and not fmha.common._is_bias_type_supported_in_BMK(bias_type):
+        pytest.skip("BMK incompatible with this bias")
+
+    query, key, value, attn_bias = create_tensors(
+        *opFW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv, fmt="BMHK" if packed else fmt
+    )
+
+    if packed:
+        c = torch.stack([query, key, value], 2)
+        if fmt == "BMK":
+            # bm3hk -> 3bhmk -> 3Bmk
+            c = c.permute(2, 0, 3, 1, 4).view([3, -1, q_len, k])
+            query, key, value = c[0], c[1], c[2]
+            # Re-create bias in the right format
+            attn_bias = create_attn_bias(
+                bias_type=bias_type,
+                batch_size=batch_size,
+                num_heads=h,
+                q_len=q_len,
+                kv_len=kv_len,
+                device=device,
+                dtype=dtype,
+                requires_grad=False,
+                fmt=fmt,
+                op=op,
+            )
+        else:
+            # bm3hk -> 3 x bmhk
+            query, key, value = xformers.ops.unbind(c, 2)
+        assert not query.is_contiguous()
+
+    out = xformers.ops.memory_efficient_attention_forward(
+        query, key, value, attn_bias, op=op
+    )
+    assert not out.isnan().any(), ("Output has NaNs", attn_bias)
+    out2 = xformers.ops.memory_efficient_attention_forward(
+        query, key, value, attn_bias, op=op
+    )
+    assert torch.allclose(out, out2, atol=0.0, rtol=0.0), (
+        "Non-deterministic behavior",
+        attn_bias,
+    )
+
+    ref = ref_attention(query, key, value, attn_bias)
+    assert out.shape == ref.shape, out.shape
+    assert_allclose(
+        out.float(),
+        ref,
+        atol=op.ERROR_ATOL[dtype],
+        rtol=op.ERROR_RTOL.get(dtype, 1e-5),
+    )
+
+
+@pytest.mark.parametrize("k_len", [5, 6, 32])
+@pytest.mark.parametrize("batch_size", [1, 4])
+@pytest.mark.parametrize("kv_len", [128, 512])
+@pytest.mark.parametrize("q_len", [128, 512])
+@pytest.mark.parametrize("device", [torch.device("cuda")])
+@pytest.mark.parametrize("dtype", _types)
+def test_key_query_all_ones(dtype, device, q_len, kv_len, batch_size, k_len):
+    scale = 3
+    query = torch.ones((batch_size, q_len, k_len), device=device, dtype=dtype)
+    key = torch.ones((batch_size, kv_len, k_len), device=device, dtype=dtype)
+    value = torch.randn((batch_size, kv_len, k_len), device=device, dtype=dtype) * scale
+
+    out = xformers.ops.memory_efficient_attention(query, key, value, op=(fmha.ck.FwOp, None))
+    # this should be equivalent to the average over value
+    ref = value.mean(1, keepdim=True).expand_as(query)
+
+    if dtype is torch.float16:
+        assert_allclose(out, ref, atol=1e-5)
+    else:
+        assert_allclose(out, ref, atol=1e-2)
+
+def _block_diag_reshape_lse(
+    lse: torch.Tensor, q_seqinfo: fmha.attn_bias._SeqLenInfo
+) -> torch.Tensor:
+    """LSE can be padded, let's remove the padding"""
+    parts = []
+    for slice, (start, end) in zip(lse.unbind(0), q_seqinfo.intervals()):
+        parts.append(slice[:, : end - start])
+    return torch.cat(parts, dim=1).unsqueeze(1)
+
+
+@parametrize_opFW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv
+def test_logsumexp(opFW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv):
+    (
+        op,
+        device,
+        dtype,
+        bias_type,
+        batch_size,
+        q_len,
+        kv_len,
+        h,
+        k,
+        kv,
+    ) = opFW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv
+    query, key, value, attn_bias = create_tensors(
+        *opFW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv, fmt="BMK"
+    )
+
+    _out, lse = xformers.ops.memory_efficient_attention_forward_requires_grad(
+        query,
+        key,
+        value,
+        op=op,
+        attn_bias=attn_bias,
+    )
+    attn = (query.float() / k**0.5) @ key.float().transpose(-2, -1)
+    if attn_bias is not None:
+        if isinstance(attn_bias, xformers.ops.AttentionBias):
+            tensor_bias = attn_bias.materialize(
+                (query.shape[0], 1, query.shape[1], key.shape[1]),
+                device=query.device,
+                dtype=torch.float32,
+            )
+        else:
+            assert isinstance(attn_bias, torch.Tensor)
+            tensor_bias = attn_bias
+        if tensor_bias.ndim == 4:
+            tensor_bias = tensor_bias.reshape([-1, *tensor_bias.shape[2:]])
+        attn = attn + tensor_bias.float()
+    ref_lse = attn.logsumexp(-1)
+    if isinstance(attn_bias, fmha.attn_bias.BlockDiagonalMask):
+        lse = _block_diag_reshape_lse(lse, attn_bias.q_seqinfo)
+    assert_allclose(lse[:, 0, : ref_lse.shape[1]], ref_lse, atol=2e-4)
+
+
+@pytest.mark.parametrize("fmt", ["BMK", "BMHK"])
+@pytest.mark.parametrize("grad_out_contiguous", [True])
+@parametrize_opBW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv
+def test_backward(
+    opBW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv,
+    grad_out_contiguous,
+    fmt,
+):
+    (
+        op_bw,
+        device,
+        dtype,
+        bias_type,
+        batch_size,
+        q_len,
+        kv_len,
+        h,
+        k,
+        kv,
+    ) = opBW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv
+
+    if k > 128 or kv > 128:
+        pytest.skip("head-dim length bigger than 128 is not supported by CK-FlashAttention-1")
+
+    if k % 8 != 0 or kv % 8 != 0:
+        pytest.skip("head-dim length must be an even value for CK-FlashAttention-1")
+
+    ## BottomRightMask requires generate {m0,m1,...}, {n0,n1,...} where mi <= ni
+    if bias_type is fmha.attn_bias.BlockDiagonalCausalFromBottomRightMask and q_len <= kv_len:
+        pytest.skip("BlockDiagonalCausalFromBottomRightMask requires kv_len bigger than q_len")
+
+    if k != kv:
+        pytest.skip("k same as kv is not well tested by CK-FlashAttention-1")
+
+    ## attn_bias_requires_grad = (
+    ##    random.Random(q_len + kv_len * batch_size).randint(0, 1) > 0
+    ##)
+    attn_bias_requires_grad = False
+
+    query, key, value, attn_bias = create_tensors(
+        *opBW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv,
+        attn_bias_requires_grad=attn_bias_requires_grad,
+        fmt=fmt,
+    )
+    op_fw = (
+        sample_random_supported_fw(
+            fmha.Inputs(query=query, key=key, value=value, attn_bias=attn_bias),
+            seed=q_len * kv + kv_len * k,
+        )
+        if op_bw != fmha.ck.BwOp
+        else fmha.ck.FwOp
+    )
+    qkv = None
+
+    if (
+        fmt == "BMHK"
+        and query.shape[3] == value.shape[3]
+        and query.shape[1] == value.shape[1]
+    ):
+        qkv = torch.stack([query, key, value], 2)
+        qkv.requires_grad_(True)
+        # bm3hk -> 3 x bmhk
+        query, key, value = xformers.ops.unbind(qkv, 2)
+        assert not query.is_contiguous()
+
+    query.requires_grad_(True)
+    key.requires_grad_(True)
+    value.requires_grad_(True)
+
+    if not op_bw.supports(fmha.Inputs(query, key, value, attn_bias)):
+        pytest.skip("inputs not supported")
+
+    out = xformers.ops.memory_efficient_attention(
+        query, key, value, attn_bias, op=(op_fw, op_bw)
+    )
+
+    grad_out = torch.ones_like(out)
+    ##if grad_out_contiguous is False:
+    ##    grad_out = torch.tensor([1.0], dtype=query.dtype, device=device)[
+    ##        None, None, :
+    ##    ].expand_as(out)
+
+    out.backward(grad_out)
+
+    if qkv is None and op_bw == fmha.ck.BwOp:
+        assert query.stride() == query.grad.stride()
+
+    grads = []
+    if qkv is None:
+        grads = [query.grad, key.grad, value.grad]
+        query.grad = None
+        key.grad = None
+        value.grad = None
+    else:
+        grads = [qkv.grad]
+        qkv.grad = None
+    if attn_bias_requires_grad:
+        attn_bias_grad = get_bias_grad(attn_bias, clear=True)
+        if attn_bias_grad is not None:
+            grads.append(attn_bias_grad)
+
+    ref = ref_attention(query, key, value, attn_bias)
+    ref.backward(grad_out)
+
+    assert_allclose(
+        out.float(),
+        ref.float(),
+        "fw pass",
+        atol=op_fw.ERROR_ATOL[dtype],
+        rtol=op_fw.ERROR_RTOL.get(dtype, 1e-5),
+    )
+
+    del out
+    del grad_out
+    del ref
+
+    atol = op_bw.ERROR_ATOL[dtype]
+    rtol = op_bw.ERROR_RTOL[dtype]
+
+    grads_ref = []
+    grads_name = []
+    if qkv is None:
+        assert isinstance(query.grad, torch.Tensor)
+        assert isinstance(key.grad, torch.Tensor)
+        assert isinstance(value.grad, torch.Tensor)
+        grads_ref = [query.grad, key.grad, value.grad]
+        grads_name = ["query", "key", "value"]
+    else:
+        assert isinstance(qkv.grad, torch.Tensor)
+        grads_ref = [qkv.grad]
+        grads_name = ["qkv"]
+
+    if attn_bias_requires_grad:
+        attn_bias_grad = get_bias_grad(attn_bias)
+        if attn_bias_grad is not None:
+            grads_ref.append(attn_bias.grad)
+            grads_name.append("bias")
+
+    del query
+    del key
+    del value
+    del qkv
+
+    assert len(grads_ref) == len(
+        grads
+    ), "Wrong number of gradients (maybe bias grad didn't backprop?)"
+    for name, calc_grad, ref_grad in zip(grads_name, grads, grads_ref):
+        assert_allclose(
+            calc_grad,
+            ref_grad,
+            msg=f"{op_fw.NAME}+{op_bw.NAME}:{name}",
+            atol=atol,
+            rtol=rtol,
+        )
+
+

From 0c4d4794a481b253c2e4b816bc8084f5cb4014b5 Mon Sep 17 00:00:00 2001
From: Qianfeng Zhang <Qianfeng.Zhang@amd.com>
Date: Tue, 10 Oct 2023 16:34:40 +0000
Subject: [PATCH 083/641] Use different instances according to the head-dim
 sizes in grouped backward

---
 .../hip_fmha/ck_fmha_grouped_backward.h       | 283 +++++++++++++-----
 1 file changed, 210 insertions(+), 73 deletions(-)

diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_backward.h b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_backward.h
index 5f62593f4..a93e67082 100644
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_backward.h
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_backward.h
@@ -8,6 +8,7 @@
 #include <ck/tensor_operation/gpu/device/tensor_specialization.hpp>
 #include <ck/tensor_operation/gpu/element/element_wise_operation.hpp>
 #include <ck/utility/sequence.hpp>
+#include "ck/tensor_operation/gpu/device/impl/device_grouped_mha_bwd_xdl_cshuffle_qloop_v1.hpp"
 #include "ck/tensor_operation/gpu/device/impl/device_grouped_mha_bwd_xdl_cshuffle_qloop_v2.hpp"
 
 #include "ck_fmha_op_helper.h"
@@ -63,79 +64,215 @@ struct grouped_backward_masktype_attnbias_dispatched {
     constexpr ck::index_t B1CShuffleBlockTransferScalarPerVector = 1; // 4
     constexpr ck::index_t Acc0BiasTransferSrcScalarPerVector = 1; // 4
 
-    using DeviceOpInstance = ck::tensor_operation::device::
-        DeviceGroupedMultiheadAttentionBackward_Qloop_Xdl_CShuffle_V2<
-            NumDimG,
-            NumDimM,
-            NumDimN,
-            NumDimK,
-            NumDimO,
-            InputDataType,
-            OutputDataType,
-            GemmDataType,
-            ZDataType,
-            LSEDataType,
-            Acc0BiasDataType,
-            Acc1BiasDataType,
-            AccDataType,
-            ShuffleDataType,
-            QKVElementOp,
-            QKVElementOp,
-            Scale,
-            QKVElementOp,
-            YElementOp,
-            GemmSpec,
-            TensorSpecQ,
-            TensorSpecK,
-            TensorSpecV,
-            TensorSpecY,
-            1,
-            256,
-            64, // MPerBlock
-            128, // NPerBlock
-            128, // KPerBlock
-            128, // Gemm1NPerBlock
-            32, // Gemm1KPerBlock
-            64, // Gemm2KPerBlock
-            8, // AK1
-            8, // BK1
-            2, // B1K1
-            32, // MPerXDL
-            32, // NPerXDL
-            2, // MXdlPerWave
-            1, // NXdlPerWave
-            4, // Gemm1NXdlPerWave
-            1, // Gemm2NXdlPerWave
-            S<4, 64, 1>, // ABlockTransfer
-            S<1, 0, 2>,
-            S<1, 0, 2>,
-            2,
-            ABBlockTransferSrcScalarPerVector, // TUNABLE
-            8,
-            true,
-            S<4, 64, 1>, // B0BlockTransfer
-            S<1, 0, 2>,
-            S<1, 0, 2>,
-            2,
-            ABBlockTransferSrcScalarPerVector, // TUNABLE
-            8,
-            true,
-            Acc0BiasTransferSrcScalarPerVector, // TUNABLE
-            S<8, 32, 1>, // B1BlockTransfer
-            S<0, 2, 1>,
-            S<0, 2, 1>,
-            1,
-            B1CShuffleBlockTransferScalarPerVector, // TUNABLE
-            2,
-            false,
-            1, // CShuffleMXdlPerWavePerShuffle
-            4, // CShuffleNXdlPerWavePerShuffle
-            S<1, 32, 1, 8>,
-            B1CShuffleBlockTransferScalarPerVector, // TUNABLE
-            MaskingSpec,
-            Deterministic>;
-
-    RunWithDeviceOp<DeviceOpInstance>(param, stream);
+    if (param.K <= 32 && param.Kv <= 32) {
+      using DeviceOpInstance = ck::tensor_operation::device::
+          DeviceGroupedMultiheadAttentionBackward_Qloop_Xdl_CShuffle_V1<
+              NumDimG,
+              NumDimM,
+              NumDimN,
+              NumDimK,
+              NumDimO,
+              InputDataType,
+              OutputDataType,
+              GemmDataType,
+              ZDataType,
+              LSEDataType,
+              Acc0BiasDataType,
+              Acc1BiasDataType,
+              AccDataType,
+              ShuffleDataType,
+              QKVElementOp,
+              QKVElementOp,
+              Scale,
+              QKVElementOp,
+              YElementOp,
+              GemmSpec,
+              TensorSpecQ,
+              TensorSpecK,
+              TensorSpecV,
+              TensorSpecY,
+              1,
+              256,
+              128, // MPerBlock
+              128, // NPerBlock
+              32, // KPerBlock
+              32, // Gemm1NPerBlock
+              32, // Gemm1KPerBlock
+              64, // Gemm2KPerBlock
+              8, // AK1
+              8, // BK1
+              2, // B1K1
+              32, // MPerXDL
+              32, // NPerXDL
+              4, // MXdlPerWave
+              1, // NXdlPerWave
+              1, // Gemm1NXdlPerWave
+              1, // Gemm2NXdlPerWave
+              S<4, 64, 1>, // ABlockTransfer
+              S<1, 0, 2>,
+              S<1, 0, 2>,
+              2,
+              ABBlockTransferSrcScalarPerVector, // TUNABLE
+              8,
+              true,
+              S<4, 64, 1>, // BBlockTransfer
+              S<1, 0, 2>,
+              S<1, 0, 2>,
+              2,
+              ABBlockTransferSrcScalarPerVector, // TUNABLE
+              8,
+              true,
+              Acc0BiasTransferSrcScalarPerVector, // TUNABLE
+              1,
+              1,
+              S<1, 64, 1, 4>,
+              B1CShuffleBlockTransferScalarPerVector, // TUNABLE
+              MaskingSpec,
+              Deterministic>;
+
+      RunWithDeviceOp<DeviceOpInstance>(param, stream);
+    } else if (param.K <= 64 && param.Kv <= 64) {
+      using DeviceOpInstance = ck::tensor_operation::device::
+          DeviceGroupedMultiheadAttentionBackward_Qloop_Xdl_CShuffle_V1<
+              NumDimG,
+              NumDimM,
+              NumDimN,
+              NumDimK,
+              NumDimO,
+              InputDataType,
+              OutputDataType,
+              GemmDataType,
+              ZDataType,
+              LSEDataType,
+              Acc0BiasDataType,
+              Acc1BiasDataType,
+              AccDataType,
+              ShuffleDataType,
+              QKVElementOp,
+              QKVElementOp,
+              Scale,
+              QKVElementOp,
+              YElementOp,
+              GemmSpec,
+              TensorSpecQ,
+              TensorSpecK,
+              TensorSpecV,
+              TensorSpecY,
+              1,
+              256,
+              128, // MPerBlock
+              128, // NPerBlock
+              64, // KPerBlock
+              64, // Gemm1NPerBlock
+              32, // Gemm1KPerBlock
+              32, // Gemm2KPerBlock
+              8, // AK1
+              8, // BK1
+              2, // B1K1
+              32, // MPerXDL
+              32, // NPerXDL
+              4, // MXdlPerWave
+              1, // NXdlPerWave
+              2, // Gemm1NXdlPerWave
+              1, // Gemm2NXdlPerWave
+              S<4, 64, 1>, // ABlockTransfer
+              S<1, 0, 2>,
+              S<1, 0, 2>,
+              2,
+              ABBlockTransferSrcScalarPerVector, // TUNABLE
+              8,
+              true,
+              S<4, 64, 1>, // BBlockTransfer
+              S<1, 0, 2>,
+              S<1, 0, 2>,
+              2,
+              ABBlockTransferSrcScalarPerVector, // TUNABLE
+              8,
+              true,
+              Acc0BiasTransferSrcScalarPerVector, // TUNABLE
+              1,
+              2,
+              S<1, 32, 1, 8>,
+              B1CShuffleBlockTransferScalarPerVector, // TUNABLE
+              MaskingSpec,
+              Deterministic>;
+
+      RunWithDeviceOp<DeviceOpInstance>(param, stream);
+    } else {
+      using DeviceOpInstance = ck::tensor_operation::device::
+          DeviceGroupedMultiheadAttentionBackward_Qloop_Xdl_CShuffle_V2<
+              NumDimG,
+              NumDimM,
+              NumDimN,
+              NumDimK,
+              NumDimO,
+              InputDataType,
+              OutputDataType,
+              GemmDataType,
+              ZDataType,
+              LSEDataType,
+              Acc0BiasDataType,
+              Acc1BiasDataType,
+              AccDataType,
+              ShuffleDataType,
+              QKVElementOp,
+              QKVElementOp,
+              Scale,
+              QKVElementOp,
+              YElementOp,
+              GemmSpec,
+              TensorSpecQ,
+              TensorSpecK,
+              TensorSpecV,
+              TensorSpecY,
+              1,
+              256,
+              64, // MPerBlock
+              128, // NPerBlock
+              128, // KPerBlock
+              128, // Gemm1NPerBlock
+              32, // Gemm1KPerBlock
+              64, // Gemm2KPerBlock
+              8, // AK1
+              8, // BK1
+              2, // B1K1
+              32, // MPerXDL
+              32, // NPerXDL
+              2, // MXdlPerWave
+              1, // NXdlPerWave
+              4, // Gemm1NXdlPerWave
+              1, // Gemm2NXdlPerWave
+              S<4, 64, 1>, // ABlockTransfer
+              S<1, 0, 2>,
+              S<1, 0, 2>,
+              2,
+              ABBlockTransferSrcScalarPerVector, // TUNABLE
+              8,
+              true,
+              S<4, 64, 1>, // B0BlockTransfer
+              S<1, 0, 2>,
+              S<1, 0, 2>,
+              2,
+              ABBlockTransferSrcScalarPerVector, // TUNABLE
+              8,
+              true,
+              Acc0BiasTransferSrcScalarPerVector, // TUNABLE
+              S<8, 32, 1>, // B1BlockTransfer
+              S<0, 2, 1>,
+              S<0, 2, 1>,
+              1,
+              B1CShuffleBlockTransferScalarPerVector, // TUNABLE
+              2,
+              false,
+              1, // CShuffleMXdlPerWavePerShuffle
+              4, // CShuffleNXdlPerWavePerShuffle
+              S<1, 32, 1, 8>,
+              B1CShuffleBlockTransferScalarPerVector, // TUNABLE
+              MaskingSpec,
+              Deterministic>;
+
+      RunWithDeviceOp<DeviceOpInstance>(param, stream);
+    };
   };
 
   template <typename DeviceOpInstance>

From 20a2535b70990da6d40bede32e13a9c25b8dd403 Mon Sep 17 00:00:00 2001
From: Qianfeng Zhang <Qianfeng.Zhang@amd.com>
Date: Tue, 10 Oct 2023 17:02:42 +0000
Subject: [PATCH 084/641] Separate the forward codes into forward and infer in
 C++ extension

---
 .../hip_fmha/attention_forward_generic.cpp    |  49 +++-
 .../hip_fmha/ck_fmha_batched_infer.h          | 258 +++++++++++++++++
 .../hip_fmha/ck_fmha_batched_infer_bp16.cpp   |  30 ++
 .../hip_fmha/ck_fmha_batched_infer_fp16.cpp   |  30 ++
 .../hip_fmha/ck_fmha_grouped_infer.h          | 260 ++++++++++++++++++
 .../hip_fmha/ck_fmha_grouped_infer_bp16.cpp   |  30 ++
 .../hip_fmha/ck_fmha_grouped_infer_fp16.cpp   |  30 ++
 7 files changed, 675 insertions(+), 12 deletions(-)
 create mode 100644 xformers/csrc/attention/hip_fmha/ck_fmha_batched_infer.h
 create mode 100644 xformers/csrc/attention/hip_fmha/ck_fmha_batched_infer_bp16.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/ck_fmha_batched_infer_fp16.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/ck_fmha_grouped_infer.h
 create mode 100644 xformers/csrc/attention/hip_fmha/ck_fmha_grouped_infer_bp16.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/ck_fmha_grouped_infer_fp16.cpp

diff --git a/xformers/csrc/attention/hip_fmha/attention_forward_generic.cpp b/xformers/csrc/attention/hip_fmha/attention_forward_generic.cpp
index 166c9806a..ecd50db2e 100644
--- a/xformers/csrc/attention/hip_fmha/attention_forward_generic.cpp
+++ b/xformers/csrc/attention/hip_fmha/attention_forward_generic.cpp
@@ -30,6 +30,11 @@ extern void grouped_forward_bp16(
     GroupedForwardParams& param,
     hipStream_t stream);
 
+extern void batched_infer_fp16(BatchedForwardParams& param, hipStream_t stream);
+extern void batched_infer_bp16(BatchedForwardParams& param, hipStream_t stream);
+extern void grouped_infer_fp16(GroupedForwardParams& param, hipStream_t stream);
+extern void grouped_infer_bp16(GroupedForwardParams& param, hipStream_t stream);
+
 namespace {
 
 /*
@@ -358,23 +363,43 @@ efficient_attention_forward_ck(
 
     set_batched_forward_params(batched_forward_params);
 
-    if (inDataType == at::ScalarType::Half) {
-      batched_forward_fp16(batched_forward_params, stream);
-    } else if (inDataType == at::ScalarType::BFloat16) {
-      batched_forward_bp16(batched_forward_params, stream);
-    } else
-      throw std::runtime_error("input data-type is not supported!");
+    if (!batched_forward_params.use_dropout &&
+        !batched_forward_params.compute_logsumexp) {
+      if (inDataType == at::ScalarType::Half) {
+        batched_infer_fp16(batched_forward_params, stream);
+      } else if (inDataType == at::ScalarType::BFloat16) {
+        batched_infer_bp16(batched_forward_params, stream);
+      } else
+        throw std::runtime_error("input data-type is not supported!");
+    } else {
+      if (inDataType == at::ScalarType::Half) {
+        batched_forward_fp16(batched_forward_params, stream);
+      } else if (inDataType == at::ScalarType::BFloat16) {
+        batched_forward_bp16(batched_forward_params, stream);
+      } else
+        throw std::runtime_error("input data-type is not supported!");
+    };
   } else { // input is grouped
     GroupedForwardParams grouped_forward_params;
 
     set_grouped_forward_params(grouped_forward_params);
 
-    if (inDataType == at::ScalarType::Half) {
-      grouped_forward_fp16(grouped_forward_params, stream);
-    } else if (inDataType == at::ScalarType::BFloat16) {
-      grouped_forward_bp16(grouped_forward_params, stream);
-    } else
-      throw std::runtime_error("input data-type is not supported!");
+    if (!grouped_forward_params.use_dropout &&
+        !grouped_forward_params.compute_logsumexp) {
+      if (inDataType == at::ScalarType::Half) {
+        grouped_infer_fp16(grouped_forward_params, stream);
+      } else if (inDataType == at::ScalarType::BFloat16) {
+        grouped_infer_bp16(grouped_forward_params, stream);
+      } else
+        throw std::runtime_error("input data-type is not supported!");
+    } else {
+      if (inDataType == at::ScalarType::Half) {
+        grouped_forward_fp16(grouped_forward_params, stream);
+      } else if (inDataType == at::ScalarType::BFloat16) {
+        grouped_forward_bp16(grouped_forward_params, stream);
+      } else
+        throw std::runtime_error("input data-type is not supported!");
+    };
   };
 
   return std::make_tuple(out, logsumexp, philox_seed, philox_offset);
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_infer.h b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_infer.h
new file mode 100644
index 000000000..c32734a50
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_infer.h
@@ -0,0 +1,258 @@
+#pragma once
+
+#include <sstream>
+#include <stdexcept>
+
+#include <ck/ck.hpp>
+#include <ck/tensor_operation/gpu/device/gemm_specialization.hpp>
+#include <ck/tensor_operation/gpu/device/tensor_specialization.hpp>
+#include <ck/tensor_operation/gpu/element/element_wise_operation.hpp>
+#include "ck/tensor_operation/gpu/device/impl/device_batched_mha_fwd_xdl_cshuffle_v2.hpp"
+
+#include "ck_fmha_op_helper.h"
+#include "ck_fmha_params.h"
+
+template <typename scalar_t, int32_t custom_mask_type, bool has_attn_bias>
+struct batched_infer_masktype_attnbias_dispatched {
+  using PassThrough = ck::tensor_operation::element_wise::PassThrough;
+
+  using GemmDataType = scalar_t;
+  using ADataType = scalar_t;
+  using B0DataType = scalar_t;
+  using B1DataType = scalar_t;
+  using AccDataType = F32;
+  using CShuffleDataType = F32;
+  using CDataType = scalar_t;
+  using ZDataType = unsigned short;
+  using LSEDataType = F32;
+  using Acc0BiasDataType =
+      typename std::conditional<has_attn_bias, scalar_t, void>::type;
+  using Acc1BiasDataType = void;
+
+  static constexpr ck::index_t NumDimG = 2;
+  static constexpr ck::index_t NumDimM = 1;
+  static constexpr ck::index_t NumDimN = 1;
+  static constexpr ck::index_t NumDimK = 1;
+  static constexpr ck::index_t NumDimO = 1;
+
+  using AElementOp = PassThrough;
+  using B0ElementOp = PassThrough;
+  using Acc0ElementOp = ck::tensor_operation::element_wise::Scale;
+  using B1ElementOp = PassThrough;
+  using CElementOp = PassThrough;
+
+  static constexpr auto GemmSpec =
+      ck::tensor_operation::device::GemmSpecialization::MNKOPadding;
+  static constexpr auto MaskingSpec =
+      static_cast<ck::tensor_operation::device::MaskingSpecialization>(
+          custom_mask_type);
+
+  static constexpr auto TensorSpecA =
+      ck::tensor_operation::device::TensorSpecialization::Default;
+  static constexpr auto TensorSpecB0 =
+      ck::tensor_operation::device::TensorSpecialization::Default;
+  static constexpr auto TensorSpecB1 =
+      ck::tensor_operation::device::TensorSpecialization::Default;
+  static constexpr auto TensorSpecC =
+      ck::tensor_operation::device::TensorSpecialization::Default;
+  static constexpr bool Deterministic = false;
+
+  static void Run(BatchedForwardParams& param, hipStream_t stream) {
+    // Tunables
+    constexpr ck::index_t ABBlockTransferSrcScalarPerVector = 1;
+    constexpr ck::index_t B1CShuffleBlockTransferScalarPerVector = 1;
+    constexpr ck::index_t Acc0BiasTransferSrcScalarPerVector = 1;
+
+    using DeviceOpInstance = ck::tensor_operation::device::
+        DeviceBatchedMultiheadAttentionForward_Xdl_CShuffle_V2<
+            NumDimG,
+            NumDimM,
+            NumDimN,
+            NumDimK,
+            NumDimO,
+            ADataType,
+            B0DataType,
+            B1DataType,
+            CDataType,
+            GemmDataType,
+            ZDataType,
+            LSEDataType,
+            Acc0BiasDataType,
+            Acc1BiasDataType,
+            AccDataType,
+            CShuffleDataType,
+            AElementOp,
+            B0ElementOp,
+            Acc0ElementOp,
+            B1ElementOp,
+            CElementOp,
+            GemmSpec,
+            TensorSpecA,
+            TensorSpecB0,
+            TensorSpecB1,
+            TensorSpecC,
+            1,
+            256,
+            128, // MPerBlock
+            128, // NPerBlock
+            32, // KPerBlock
+            64, // Gemm1NPerBlock
+            32, // Gemm1KPerBlock
+            8, // AK1
+            8, // BK1
+            2, // B1K1
+            32, // MPerXDL
+            32, // NPerXDL
+            1, // MXdlPerWave
+            4, // NXdlPerWave
+            2, // Gemm1NXdlPerWave
+            1, // DropoutStep
+            S<4, 64, 1>, // ABlockTransfer
+            S<1, 0, 2>,
+            S<1, 0, 2>,
+            2,
+            ABBlockTransferSrcScalarPerVector, // TUNABLE
+            8,
+            true,
+            S<4, 64, 1>, // BBlockTransfer
+            S<1, 0, 2>,
+            S<1, 0, 2>,
+            2,
+            ABBlockTransferSrcScalarPerVector, // TUNABLE
+            8,
+            true,
+            Acc0BiasTransferSrcScalarPerVector, // TUNABLE
+            S<16, 16, 1>, // B1BlockTransfer
+            S<0, 2, 1>,
+            S<0, 2, 1>,
+            1,
+            B1CShuffleBlockTransferScalarPerVector, // TUNABLE
+            2,
+            false,
+            1, // CShuffleMXdlPerWavePerShuffle
+            2, // CShuffleNXdlPerWavePerShuffle
+            S<1,
+              32,
+              1,
+              8>, // CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock
+            B1CShuffleBlockTransferScalarPerVector, // TUNABLE
+            4,
+            MaskingSpec, // MaskingSpecialization
+            Deterministic>;
+
+    RunWithDeviceOp<DeviceOpInstance>(param, stream);
+  };
+
+  template <typename DeviceOpInstance>
+  static void RunWithDeviceOp(BatchedForwardParams& param, hipStream_t stream) {
+    std::vector<ck::index_t> a_gs_ms_ks_lengths{
+        param.B, param.num_heads, param.M, param.K};
+    std::vector<ck::index_t> a_gs_ms_ks_strides{
+        param.q_strides[0],
+        param.q_strides[2],
+        param.q_strides[1],
+        param.q_strides[3]};
+
+    std::vector<ck::index_t> b0_gs_ns_ks_lengths{
+        param.B, param.num_heads, param.N, param.K};
+    std::vector<ck::index_t> b0_gs_ns_ks_strides{
+        param.k_strides[0],
+        param.k_strides[2],
+        param.k_strides[1],
+        param.k_strides[3]};
+
+    // to be changed to b1_gs_ns_os_lengths
+    std::vector<ck::index_t> b1_gs_os_ns_lengths{
+        param.B, param.num_heads, param.Kv, param.N};
+    std::vector<ck::index_t> b1_gs_os_ns_strides{
+        param.v_strides[0],
+        param.v_strides[2],
+        param.v_strides[3],
+        param.v_strides[1]};
+
+    std::vector<ck::index_t> c_gs_ms_os_lengths{
+        param.B, param.num_heads, param.M, param.Kv};
+    std::vector<ck::index_t> c_gs_ms_os_strides{
+        param.out_strides[0],
+        param.out_strides[2],
+        param.out_strides[1],
+        param.out_strides[3]};
+
+    std::vector<ck::index_t> lse_gs_ms_lengths{
+        param.B, param.num_heads, param.M};
+
+    std::vector<ck::index_t> d_gs_ms_ns_lengths;
+    std::vector<ck::index_t> d_gs_ms_ns_strides;
+
+    if constexpr (has_attn_bias) {
+      d_gs_ms_ns_lengths = {param.B, param.num_heads, param.M, param.N};
+      d_gs_ms_ns_strides = {
+          param.attn_bias_strides[0],
+          param.attn_bias_strides[1],
+          param.attn_bias_strides[2],
+          param.attn_bias_strides[3]};
+    } else {
+      d_gs_ms_ns_lengths = {1, 1, 1, 1};
+      d_gs_ms_ns_strides = {0, 0, 0, 0};
+    };
+
+    float alpha = param.scale;
+
+    auto a_element_op = AElementOp{};
+    auto b0_element_op = B0ElementOp{};
+    auto acc0_element_op = Acc0ElementOp{alpha};
+    auto b1_element_op = B1ElementOp{};
+    auto c_element_op = CElementOp{};
+
+    auto op = DeviceOpInstance{};
+    auto invoker = op.MakeInvoker();
+
+    auto arg_ptr = op.MakeArgumentPointer(
+        param.q_ptr,
+        param.k_ptr,
+        param.v_ptr,
+        param.out_ptr,
+        nullptr,
+        param.logsumexp_ptr,
+        param.has_attn_bias ? param.attn_bias_ptr : nullptr,
+        {}, // p_acc1_biases;
+        a_gs_ms_ks_lengths,
+        a_gs_ms_ks_strides,
+        b0_gs_ns_ks_lengths,
+        b0_gs_ns_ks_strides,
+        b1_gs_os_ns_lengths,
+        b1_gs_os_ns_strides,
+        c_gs_ms_os_lengths,
+        c_gs_ms_os_strides,
+        {1, 1, 1, 1},
+        {0, 0, 0, 0},
+        lse_gs_ms_lengths,
+        d_gs_ms_ns_lengths,
+        d_gs_ms_ns_strides,
+        {}, // acc1_biases_gs_ms_os_lengths
+        {}, // acc1_biases_gs_ms_os_strides,
+        a_element_op,
+        b0_element_op,
+        acc0_element_op,
+        b1_element_op,
+        c_element_op,
+        param.use_dropout ? param.dropout_prob : 0.0f, // dropout ratio
+        std::tuple<int64_t, int64_t>(
+            param.philox_seed,
+            param.philox_offset)); // dropout random seed and offset
+
+    SimpleDeviceMem workspace(op.GetWorkSpaceSize(arg_ptr.get()));
+
+    op.SetWorkSpacePointer(arg_ptr.get(), workspace.GetDeviceBuffer());
+
+    if (!op.IsSupportedArgument(arg_ptr.get())) {
+      std::ostringstream ostr;
+
+      ostr << op.GetTypeString() << " does not support this problem";
+
+      throw std::runtime_error(ostr.str());
+    }
+
+    invoker.Run(arg_ptr.get(), StreamConfig{stream, false});
+  };
+};
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_infer_bp16.cpp b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_infer_bp16.cpp
new file mode 100644
index 000000000..bd62aebe2
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_infer_bp16.cpp
@@ -0,0 +1,30 @@
+#include <ck/ck.hpp>
+#include <stdexcept>
+
+#include "ck_fmha_batched_infer.h"
+
+void batched_infer_bp16(BatchedForwardParams& param, hipStream_t stream) {
+  if (param.custom_mask_type == 0) {
+    if (param.has_attn_bias)
+      batched_infer_masktype_attnbias_dispatched<ck::bhalf_t, 0, true>::Run(
+          param, stream);
+    else
+      batched_infer_masktype_attnbias_dispatched<ck::bhalf_t, 0, false>::Run(
+          param, stream);
+  } else if (param.custom_mask_type == 1) {
+    if (param.has_attn_bias)
+      batched_infer_masktype_attnbias_dispatched<ck::bhalf_t, 1, true>::Run(
+          param, stream);
+    else
+      batched_infer_masktype_attnbias_dispatched<ck::bhalf_t, 1, false>::Run(
+          param, stream);
+  } else if (param.custom_mask_type == 2) {
+    if (param.has_attn_bias)
+      batched_infer_masktype_attnbias_dispatched<ck::bhalf_t, 2, true>::Run(
+          param, stream);
+    else
+      batched_infer_masktype_attnbias_dispatched<ck::bhalf_t, 2, false>::Run(
+          param, stream);
+  } else
+    throw std::runtime_error("Invalid custom_mask_type value");
+};
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_infer_fp16.cpp b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_infer_fp16.cpp
new file mode 100644
index 000000000..3429c088e
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_infer_fp16.cpp
@@ -0,0 +1,30 @@
+#include <ck/ck.hpp>
+#include <stdexcept>
+
+#include "ck_fmha_batched_infer.h"
+
+void batched_infer_fp16(BatchedForwardParams& param, hipStream_t stream) {
+  if (param.custom_mask_type == 0) {
+    if (param.has_attn_bias)
+      batched_infer_masktype_attnbias_dispatched<ck::half_t, 0, true>::Run(
+          param, stream);
+    else
+      batched_infer_masktype_attnbias_dispatched<ck::half_t, 0, false>::Run(
+          param, stream);
+  } else if (param.custom_mask_type == 1) {
+    if (param.has_attn_bias)
+      batched_infer_masktype_attnbias_dispatched<ck::half_t, 1, true>::Run(
+          param, stream);
+    else
+      batched_infer_masktype_attnbias_dispatched<ck::half_t, 1, false>::Run(
+          param, stream);
+  } else if (param.custom_mask_type == 2) {
+    if (param.has_attn_bias)
+      batched_infer_masktype_attnbias_dispatched<ck::half_t, 2, true>::Run(
+          param, stream);
+    else
+      batched_infer_masktype_attnbias_dispatched<ck::half_t, 2, false>::Run(
+          param, stream);
+  } else
+    throw std::runtime_error("Invalid custom_mask_type value");
+};
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_infer.h b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_infer.h
new file mode 100644
index 000000000..9246a2549
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_infer.h
@@ -0,0 +1,260 @@
+#pragma once
+
+#include <sstream>
+#include <stdexcept>
+
+#include <ck/ck.hpp>
+#include <ck/tensor_operation/gpu/device/gemm_specialization.hpp>
+#include <ck/tensor_operation/gpu/device/impl/device_grouped_mha_fwd_xdl_cshuffle_v2.hpp>
+#include <ck/tensor_operation/gpu/device/tensor_specialization.hpp>
+#include <ck/tensor_operation/gpu/element/element_wise_operation.hpp>
+#include <ck/utility/sequence.hpp>
+
+#include "ck_fmha_op_helper.h"
+#include "ck_fmha_params.h"
+
+template <typename scalar_t, int32_t custom_mask_type, bool has_attn_bias>
+struct grouped_infer_masktype_attnbias_dispatched {
+  using PassThrough = ck::tensor_operation::element_wise::PassThrough;
+
+  using GemmDataType = scalar_t;
+  using ADataType = scalar_t;
+  using B0DataType = scalar_t;
+  using B1DataType = scalar_t;
+  using AccDataType = F32;
+  using CShuffleDataType = F32;
+  using CDataType = scalar_t;
+  using ZDataType = unsigned short;
+  using LSEDataType = F32;
+  using Acc0BiasDataType =
+      typename std::conditional<has_attn_bias, scalar_t, void>::type;
+  using Acc1BiasDataType = void;
+
+  static constexpr ck::index_t NumDimG = 2;
+  static constexpr ck::index_t NumDimM = 1;
+  static constexpr ck::index_t NumDimN = 1;
+  static constexpr ck::index_t NumDimK = 1;
+  static constexpr ck::index_t NumDimO = 1;
+
+  using AElementOp = PassThrough;
+  using B0ElementOp = PassThrough;
+  using Acc0ElementOp = ck::tensor_operation::element_wise::Scale;
+  using B1ElementOp = PassThrough;
+  using CElementOp = PassThrough;
+
+  static constexpr auto GemmSpec =
+      ck::tensor_operation::device::GemmSpecialization::MNKOPadding;
+  static constexpr auto MaskingSpec =
+      static_cast<ck::tensor_operation::device::MaskingSpecialization>(
+          custom_mask_type);
+
+  static constexpr auto TensorSpecA =
+      ck::tensor_operation::device::TensorSpecialization::Default;
+  static constexpr auto TensorSpecB0 =
+      ck::tensor_operation::device::TensorSpecialization::Default;
+  static constexpr auto TensorSpecB1 =
+      ck::tensor_operation::device::TensorSpecialization::Default;
+  static constexpr auto TensorSpecC =
+      ck::tensor_operation::device::TensorSpecialization::Default;
+  static constexpr bool Deterministic = false;
+
+  static void Run(GroupedForwardParams& param, hipStream_t stream) {
+    // Tunables
+    constexpr ck::index_t ABBlockTransferSrcScalarPerVector = 1;
+    constexpr ck::index_t B1CShuffleBlockTransferScalarPerVector = 1;
+    constexpr ck::index_t Acc0BiasTransferSrcScalarPerVector = 1;
+
+    using DeviceOpInstance = ck::tensor_operation::device::
+        DeviceGroupedMultiheadAttentionForward_Xdl_CShuffle_V2<
+            NumDimG,
+            NumDimM,
+            NumDimN,
+            NumDimK,
+            NumDimO,
+            ADataType,
+            B0DataType,
+            B1DataType,
+            CDataType,
+            GemmDataType,
+            ZDataType,
+            LSEDataType,
+            Acc0BiasDataType,
+            Acc1BiasDataType,
+            AccDataType,
+            CShuffleDataType,
+            AElementOp,
+            B0ElementOp,
+            Acc0ElementOp,
+            B1ElementOp,
+            CElementOp,
+            GemmSpec,
+            TensorSpecA,
+            TensorSpecB0,
+            TensorSpecB1,
+            TensorSpecC,
+            1,
+            256,
+            128, // MPerBlock
+            128, // NPerBlock
+            32, // KPerBlock
+            128, // Gemm1NPerBlock
+            32, // Gemm1KPerBlock
+            8, // AK1
+            8, // BK1
+            2, // B1K1
+            32, // MPerXDL
+            32, // NPerXDL
+            1, // MXdlPerWave
+            4, // NXdlPerWave
+            4, // Gemm1NXdlPerWave
+            1, // DropoutStep
+            S<4, 64, 1>, // ABlockTransfer
+            S<1, 0, 2>,
+            S<1, 0, 2>,
+            2,
+            ABBlockTransferSrcScalarPerVector, // TUNABLE
+            8,
+            true,
+            S<4, 64, 1>, // BBlockTransfer
+            S<1, 0, 2>,
+            S<1, 0, 2>,
+            2,
+            ABBlockTransferSrcScalarPerVector, // TUNABLE
+            8,
+            true,
+            Acc0BiasTransferSrcScalarPerVector,
+            S<8, 32, 1>, // B1BlockTransfer
+            S<0, 2, 1>,
+            S<0, 2, 1>,
+            1,
+            B1CShuffleBlockTransferScalarPerVector, // TUNABLE
+            2,
+            false,
+            1, // CShuffleMXdlPerWavePerShuffle
+            2, // CShuffleNXdlPerWavePerShuffle
+            S<1,
+              32,
+              1,
+              8>, // CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock
+            B1CShuffleBlockTransferScalarPerVector, // TUNABLE
+            1,
+            MaskingSpec, // MaskingSpecialization
+            Deterministic>;
+
+    RunWithDeviceOp<DeviceOpInstance>(param, stream);
+  };
+
+  template <typename DeviceOpInstance>
+  static void RunWithDeviceOp(GroupedForwardParams& param, hipStream_t stream) {
+    std::vector<typename DeviceOpInstance::ProblemDesc> problem_descs;
+
+    for (std::size_t i = 0; i < param.num_batches; i++) {
+      int M = param.host_seqstart_q[i + 1] - param.host_seqstart_q[i];
+      int N = param.host_seqlen_k.empty()
+          ? param.host_seqstart_k[i + 1] - param.host_seqstart_k[i]
+          : param.host_seqlen_k[i];
+      int K = param.K;
+      int Kv = param.Kv;
+      int G1 = param.num_heads;
+
+      std::vector<ck::index_t> a_gs_ms_ks_lengths{1, G1, M, K};
+      std::vector<ck::index_t> a_gs_ms_ks_strides{
+          0, param.q_strides[1], param.q_strides[0], param.q_strides[2]};
+
+      std::vector<ck::index_t> b0_gs_ns_ks_lengths{1, G1, N, K};
+      std::vector<ck::index_t> b0_gs_ns_ks_strides{
+          0, param.k_strides[1], param.k_strides[0], param.k_strides[2]};
+
+      // to be changed to b1_gs_ns_os_lengths
+      std::vector<ck::index_t> b1_gs_os_ns_lengths{1, G1, Kv, N};
+      std::vector<ck::index_t> b1_gs_os_ns_strides{
+          0, param.v_strides[1], param.v_strides[2], param.v_strides[0]};
+
+      std::vector<ck::index_t> c_gs_ms_os_lengths{1, G1, M, Kv};
+      std::vector<ck::index_t> c_gs_ms_os_strides{
+          0, param.out_strides[1], param.out_strides[0], param.out_strides[2]};
+
+      std::vector<ck::index_t> lse_gs_ms_lengths{1, G1, M};
+      std::vector<ck::index_t> lse_gs_ms_strides{0, param.max_seqlen_q, 1};
+
+      std::vector<ck::index_t> d_gs_ms_ns_lengths;
+      std::vector<ck::index_t> d_gs_ms_ns_strides;
+
+      if constexpr (has_attn_bias) {
+        d_gs_ms_ns_lengths = {1, G1, M, N};
+        d_gs_ms_ns_strides = {
+            0,
+            param.attn_bias_strides[0],
+            param.attn_bias_strides[1],
+            param.attn_bias_strides[2]};
+
+      } else {
+        d_gs_ms_ns_lengths = {1, 1, 1, 1};
+        d_gs_ms_ns_strides = {0, 0, 0, 0};
+      };
+
+      problem_descs.push_back(
+          {a_gs_ms_ks_lengths,
+           a_gs_ms_ks_strides,
+           b0_gs_ns_ks_lengths,
+           b0_gs_ns_ks_strides,
+           b1_gs_os_ns_lengths,
+           b1_gs_os_ns_strides,
+           c_gs_ms_os_lengths,
+           c_gs_ms_os_strides,
+           {1, 1, 1, 1},
+           {0, 0, 0, 0},
+           lse_gs_ms_lengths,
+           lse_gs_ms_strides,
+           d_gs_ms_ns_lengths,
+           d_gs_ms_ns_strides,
+           {}, // acc1_bias_gs_ms_os_lengths
+           {}}); // acc1_bias_gs_ms_os_strides
+    }
+
+    float alpha = param.scale;
+
+    auto a_element_op = AElementOp{};
+    auto b0_element_op = B0ElementOp{};
+    auto acc0_element_op = Acc0ElementOp{alpha};
+    auto b1_element_op = B1ElementOp{};
+    auto c_element_op = CElementOp{};
+
+    auto op = DeviceOpInstance{};
+    auto invoker = op.MakeInvoker();
+
+    auto arg_ptr = op.MakeArgumentPointer(
+        param.q_ptrs,
+        param.k_ptrs,
+        param.v_ptrs,
+        param.out_ptrs,
+        param.randvals_ptrs,
+        param.logsumexp_ptrs,
+        param.attn_bias_ptrs,
+        {}, // p_acc1_biases
+        problem_descs,
+        a_element_op,
+        b0_element_op,
+        acc0_element_op,
+        b1_element_op,
+        c_element_op,
+        param.use_dropout ? param.dropout_prob : 0.0f, // dropout ratio
+        std::tuple<int64_t, int64_t>(param.philox_seed, param.philox_offset));
+
+    auto sizeInBytes = op.GetWorkSpaceSize(arg_ptr.get());
+
+    SimpleDeviceMem workspace(sizeInBytes);
+
+    op.SetWorkSpacePointer(arg_ptr.get(), workspace.GetDeviceBuffer());
+
+    if (!op.IsSupportedArgument(arg_ptr.get())) {
+      std::ostringstream ostr;
+
+      ostr << op.GetTypeString() << " does not support this problem";
+
+      throw std::runtime_error(ostr.str());
+    }
+
+    (void)invoker.Run(arg_ptr.get(), StreamConfig{stream, false});
+  };
+};
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_infer_bp16.cpp b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_infer_bp16.cpp
new file mode 100644
index 000000000..d3accc720
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_infer_bp16.cpp
@@ -0,0 +1,30 @@
+#include <ck/ck.hpp>
+#include <stdexcept>
+
+#include "ck_fmha_grouped_infer.h"
+
+void grouped_infer_bp16(GroupedForwardParams& param, hipStream_t stream) {
+  if (param.custom_mask_type == 0) {
+    if (param.has_attn_bias)
+      grouped_infer_masktype_attnbias_dispatched<ck::bhalf_t, 0, true>::Run(
+          param, stream);
+    else
+      grouped_infer_masktype_attnbias_dispatched<ck::bhalf_t, 0, false>::Run(
+          param, stream);
+  } else if (param.custom_mask_type == 1) {
+    if (param.has_attn_bias)
+      grouped_infer_masktype_attnbias_dispatched<ck::bhalf_t, 1, true>::Run(
+          param, stream);
+    else
+      grouped_infer_masktype_attnbias_dispatched<ck::bhalf_t, 1, false>::Run(
+          param, stream);
+  } else if (param.custom_mask_type == 2) {
+    if (param.has_attn_bias)
+      grouped_infer_masktype_attnbias_dispatched<ck::bhalf_t, 2, true>::Run(
+          param, stream);
+    else
+      grouped_infer_masktype_attnbias_dispatched<ck::bhalf_t, 2, false>::Run(
+          param, stream);
+  } else
+    throw std::runtime_error("Invalid custom_mask_type value");
+};
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_infer_fp16.cpp b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_infer_fp16.cpp
new file mode 100644
index 000000000..d2e846683
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_infer_fp16.cpp
@@ -0,0 +1,30 @@
+#include <ck/ck.hpp>
+#include <stdexcept>
+
+#include "ck_fmha_grouped_infer.h"
+
+void grouped_infer_fp16(GroupedForwardParams& param, hipStream_t stream) {
+  if (param.custom_mask_type == 0) {
+    if (param.has_attn_bias)
+      grouped_infer_masktype_attnbias_dispatched<ck::half_t, 0, true>::Run(
+          param, stream);
+    else
+      grouped_infer_masktype_attnbias_dispatched<ck::half_t, 0, false>::Run(
+          param, stream);
+  } else if (param.custom_mask_type == 1) {
+    if (param.has_attn_bias)
+      grouped_infer_masktype_attnbias_dispatched<ck::half_t, 1, true>::Run(
+          param, stream);
+    else
+      grouped_infer_masktype_attnbias_dispatched<ck::half_t, 1, false>::Run(
+          param, stream);
+  } else if (param.custom_mask_type == 2) {
+    if (param.has_attn_bias)
+      grouped_infer_masktype_attnbias_dispatched<ck::half_t, 2, true>::Run(
+          param, stream);
+    else
+      grouped_infer_masktype_attnbias_dispatched<ck::half_t, 2, false>::Run(
+          param, stream);
+  } else
+    throw std::runtime_error("Invalid custom_mask_type value");
+};

From 768f8782f679d0ff28de3da9154975cfb2b9f3e1 Mon Sep 17 00:00:00 2001
From: Qianfeng Zhang <Qianfeng.Zhang@amd.com>
Date: Tue, 10 Oct 2023 17:25:30 +0000
Subject: [PATCH 085/641] Synchronize with latest CK flashAttention which
 removed <Deterministic> in forward kernel

---
 third_party/composable_kernel                              | 2 +-
 xformers/csrc/attention/hip_fmha/ck_fmha_batched_forward.h | 4 +---
 xformers/csrc/attention/hip_fmha/ck_fmha_batched_infer.h   | 4 +---
 xformers/csrc/attention/hip_fmha/ck_fmha_grouped_forward.h | 4 +---
 xformers/csrc/attention/hip_fmha/ck_fmha_grouped_infer.h   | 4 +---
 5 files changed, 5 insertions(+), 13 deletions(-)

diff --git a/third_party/composable_kernel b/third_party/composable_kernel
index b23b3d717..3f4eae1db 160000
--- a/third_party/composable_kernel
+++ b/third_party/composable_kernel
@@ -1 +1 @@
-Subproject commit b23b3d717ab17a06c490b70508d18ef7773849a4
+Subproject commit 3f4eae1db4d73cf1692b204425591660cfd421be
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_forward.h b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_forward.h
index 16d972f91..c144cc5f5 100644
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_forward.h
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_forward.h
@@ -55,7 +55,6 @@ struct batched_forward_masktype_attnbias_dispatched {
       ck::tensor_operation::device::TensorSpecialization::Default;
   static constexpr auto TensorSpecC =
       ck::tensor_operation::device::TensorSpecialization::Default;
-  static constexpr bool Deterministic = false;
 
   static void Run(BatchedForwardParams& param, hipStream_t stream) {
     // Tunables
@@ -137,8 +136,7 @@ struct batched_forward_masktype_attnbias_dispatched {
               8>, // CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock
             B1CShuffleBlockTransferScalarPerVector, // TUNABLE
             4,
-            MaskingSpec, // MaskingSpecialization
-            Deterministic>;
+            MaskingSpec>; // MaskingSpecialization
 
     RunWithDeviceOp<DeviceOpInstance>(param, stream);
   };
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_infer.h b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_infer.h
index c32734a50..549fa3898 100644
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_infer.h
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_infer.h
@@ -55,7 +55,6 @@ struct batched_infer_masktype_attnbias_dispatched {
       ck::tensor_operation::device::TensorSpecialization::Default;
   static constexpr auto TensorSpecC =
       ck::tensor_operation::device::TensorSpecialization::Default;
-  static constexpr bool Deterministic = false;
 
   static void Run(BatchedForwardParams& param, hipStream_t stream) {
     // Tunables
@@ -137,8 +136,7 @@ struct batched_infer_masktype_attnbias_dispatched {
               8>, // CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock
             B1CShuffleBlockTransferScalarPerVector, // TUNABLE
             4,
-            MaskingSpec, // MaskingSpecialization
-            Deterministic>;
+            MaskingSpec>; // MaskingSpecialization
 
     RunWithDeviceOp<DeviceOpInstance>(param, stream);
   };
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_forward.h b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_forward.h
index 8849de82d..74ebfc5a9 100644
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_forward.h
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_forward.h
@@ -56,7 +56,6 @@ struct grouped_forward_masktype_attnbias_dispatched {
       ck::tensor_operation::device::TensorSpecialization::Default;
   static constexpr auto TensorSpecC =
       ck::tensor_operation::device::TensorSpecialization::Default;
-  static constexpr bool Deterministic = false;
 
   static void Run(GroupedForwardParams& param, hipStream_t stream) {
     // Tunables
@@ -138,8 +137,7 @@ struct grouped_forward_masktype_attnbias_dispatched {
               8>, // CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock
             B1CShuffleBlockTransferScalarPerVector, // TUNABLE
             1,
-            MaskingSpec, // MaskingSpecialization
-            Deterministic>;
+            MaskingSpec>; // MaskingSpecialization
 
     RunWithDeviceOp<DeviceOpInstance>(param, stream);
   };
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_infer.h b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_infer.h
index 9246a2549..a8f6ef2c1 100644
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_infer.h
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_infer.h
@@ -56,7 +56,6 @@ struct grouped_infer_masktype_attnbias_dispatched {
       ck::tensor_operation::device::TensorSpecialization::Default;
   static constexpr auto TensorSpecC =
       ck::tensor_operation::device::TensorSpecialization::Default;
-  static constexpr bool Deterministic = false;
 
   static void Run(GroupedForwardParams& param, hipStream_t stream) {
     // Tunables
@@ -138,8 +137,7 @@ struct grouped_infer_masktype_attnbias_dispatched {
               8>, // CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock
             B1CShuffleBlockTransferScalarPerVector, // TUNABLE
             1,
-            MaskingSpec, // MaskingSpecialization
-            Deterministic>;
+            MaskingSpec>; // MaskingSpecialization
 
     RunWithDeviceOp<DeviceOpInstance>(param, stream);
   };

From 27f54bf5234366041e03b9ded23c85cb8ef0ec30 Mon Sep 17 00:00:00 2001
From: Qianfeng Zhang <Qianfeng.Zhang@amd.com>
Date: Tue, 10 Oct 2023 17:47:31 +0000
Subject: [PATCH 086/641] Add torch_check in attention_backward_generic.cpp to
 ensure q/k/v and dq/dk/dv have same sizes/strides

---
 .../attention/hip_fmha/attention_backward_generic.cpp     | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/xformers/csrc/attention/hip_fmha/attention_backward_generic.cpp b/xformers/csrc/attention/hip_fmha/attention_backward_generic.cpp
index da1a082b2..d21b8b526 100644
--- a/xformers/csrc/attention/hip_fmha/attention_backward_generic.cpp
+++ b/xformers/csrc/attention/hip_fmha/attention_backward_generic.cpp
@@ -156,6 +156,14 @@ efficient_attention_backward_ck(
     grad_q.fill_(0);
   }
 
+  // CK-FlashAttn requires q/k/v to have same shapes with dQ/dK/dV respectively
+  TORCH_CHECK(query.sizes() == grad_q.sizes());
+  TORCH_CHECK(query.strides() == grad_q.strides());
+  TORCH_CHECK(key.sizes() == grad_k.sizes());
+  TORCH_CHECK(key.strides() == grad_k.strides());
+  TORCH_CHECK(value.sizes() == grad_v.sizes());
+  TORCH_CHECK(value.strides() == grad_v.strides());
+
   const bool bias_requires_grad = bias.has_value() && bias->requires_grad();
 
   if (bias_requires_grad)

From 0946c58c5c03cb1be247fdc6fe6f8243301ff766 Mon Sep 17 00:00:00 2001
From: Qianfeng Zhang <Qianfeng.Zhang@amd.com>
Date: Wed, 11 Oct 2023 23:51:06 +0000
Subject: [PATCH 087/641] Tiny fix in attention_backward_generic.cpp

---
 xformers/csrc/attention/hip_fmha/attention_backward_generic.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/xformers/csrc/attention/hip_fmha/attention_backward_generic.cpp b/xformers/csrc/attention/hip_fmha/attention_backward_generic.cpp
index d21b8b526..1a3b16b1e 100644
--- a/xformers/csrc/attention/hip_fmha/attention_backward_generic.cpp
+++ b/xformers/csrc/attention/hip_fmha/attention_backward_generic.cpp
@@ -152,7 +152,7 @@ efficient_attention_backward_ck(
   } else {
     grad_q = at::empty_strided(query.sizes(), query.strides(), query.options());
     grad_k = at::empty_strided(key.sizes(), key.strides(), key.options());
-    grad_v = at::empty_strided(value.sizes(), key.strides(), value.options());
+    grad_v = at::empty_strided(value.sizes(), value.strides(), value.options());
     grad_q.fill_(0);
   }
 

From fb3485d3447e1bb8ffc71fb25911a156a73bcf1a Mon Sep 17 00:00:00 2001
From: Qianfeng Zhang <Qianfeng.Zhang@amd.com>
Date: Fri, 13 Oct 2023 15:27:51 +0000
Subject: [PATCH 088/641] Use ck infer-only device-op to do hip_fmha inference

---
 .../hip_fmha/ck_fmha_batched_infer.h          | 20 +++--------------
 .../hip_fmha/ck_fmha_grouped_infer.h          | 22 +++----------------
 2 files changed, 6 insertions(+), 36 deletions(-)

diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_infer.h b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_infer.h
index 549fa3898..870d1394e 100644
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_infer.h
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_infer.h
@@ -7,7 +7,7 @@
 #include <ck/tensor_operation/gpu/device/gemm_specialization.hpp>
 #include <ck/tensor_operation/gpu/device/tensor_specialization.hpp>
 #include <ck/tensor_operation/gpu/element/element_wise_operation.hpp>
-#include "ck/tensor_operation/gpu/device/impl/device_batched_mha_fwd_xdl_cshuffle_v2.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_batched_mha_infer_xdl_cshuffle.hpp"
 
 #include "ck_fmha_op_helper.h"
 #include "ck_fmha_params.h"
@@ -63,7 +63,7 @@ struct batched_infer_masktype_attnbias_dispatched {
     constexpr ck::index_t Acc0BiasTransferSrcScalarPerVector = 1;
 
     using DeviceOpInstance = ck::tensor_operation::device::
-        DeviceBatchedMultiheadAttentionForward_Xdl_CShuffle_V2<
+        DeviceBatchedMultiheadAttentionInfer_Xdl_CShuffle<
             NumDimG,
             NumDimM,
             NumDimN,
@@ -73,9 +73,6 @@ struct batched_infer_masktype_attnbias_dispatched {
             B0DataType,
             B1DataType,
             CDataType,
-            GemmDataType,
-            ZDataType,
-            LSEDataType,
             Acc0BiasDataType,
             Acc1BiasDataType,
             AccDataType,
@@ -105,7 +102,6 @@ struct batched_infer_masktype_attnbias_dispatched {
             1, // MXdlPerWave
             4, // NXdlPerWave
             2, // Gemm1NXdlPerWave
-            1, // DropoutStep
             S<4, 64, 1>, // ABlockTransfer
             S<1, 0, 2>,
             S<1, 0, 2>,
@@ -135,7 +131,6 @@ struct batched_infer_masktype_attnbias_dispatched {
               1,
               8>, // CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock
             B1CShuffleBlockTransferScalarPerVector, // TUNABLE
-            4,
             MaskingSpec>; // MaskingSpecialization
 
     RunWithDeviceOp<DeviceOpInstance>(param, stream);
@@ -210,8 +205,6 @@ struct batched_infer_masktype_attnbias_dispatched {
         param.k_ptr,
         param.v_ptr,
         param.out_ptr,
-        nullptr,
-        param.logsumexp_ptr,
         param.has_attn_bias ? param.attn_bias_ptr : nullptr,
         {}, // p_acc1_biases;
         a_gs_ms_ks_lengths,
@@ -222,9 +215,6 @@ struct batched_infer_masktype_attnbias_dispatched {
         b1_gs_os_ns_strides,
         c_gs_ms_os_lengths,
         c_gs_ms_os_strides,
-        {1, 1, 1, 1},
-        {0, 0, 0, 0},
-        lse_gs_ms_lengths,
         d_gs_ms_ns_lengths,
         d_gs_ms_ns_strides,
         {}, // acc1_biases_gs_ms_os_lengths
@@ -233,11 +223,7 @@ struct batched_infer_masktype_attnbias_dispatched {
         b0_element_op,
         acc0_element_op,
         b1_element_op,
-        c_element_op,
-        param.use_dropout ? param.dropout_prob : 0.0f, // dropout ratio
-        std::tuple<int64_t, int64_t>(
-            param.philox_seed,
-            param.philox_offset)); // dropout random seed and offset
+        c_element_op);
 
     SimpleDeviceMem workspace(op.GetWorkSpaceSize(arg_ptr.get()));
 
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_infer.h b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_infer.h
index a8f6ef2c1..321b17cdd 100644
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_infer.h
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_infer.h
@@ -5,10 +5,10 @@
 
 #include <ck/ck.hpp>
 #include <ck/tensor_operation/gpu/device/gemm_specialization.hpp>
-#include <ck/tensor_operation/gpu/device/impl/device_grouped_mha_fwd_xdl_cshuffle_v2.hpp>
 #include <ck/tensor_operation/gpu/device/tensor_specialization.hpp>
 #include <ck/tensor_operation/gpu/element/element_wise_operation.hpp>
 #include <ck/utility/sequence.hpp>
+#include "ck/tensor_operation/gpu/device/impl/device_grouped_mha_infer_xdl_cshuffle.hpp"
 
 #include "ck_fmha_op_helper.h"
 #include "ck_fmha_params.h"
@@ -64,7 +64,7 @@ struct grouped_infer_masktype_attnbias_dispatched {
     constexpr ck::index_t Acc0BiasTransferSrcScalarPerVector = 1;
 
     using DeviceOpInstance = ck::tensor_operation::device::
-        DeviceGroupedMultiheadAttentionForward_Xdl_CShuffle_V2<
+        DeviceGroupedMultiheadAttentionInfer_Xdl_CShuffle<
             NumDimG,
             NumDimM,
             NumDimN,
@@ -74,9 +74,6 @@ struct grouped_infer_masktype_attnbias_dispatched {
             B0DataType,
             B1DataType,
             CDataType,
-            GemmDataType,
-            ZDataType,
-            LSEDataType,
             Acc0BiasDataType,
             Acc1BiasDataType,
             AccDataType,
@@ -106,7 +103,6 @@ struct grouped_infer_masktype_attnbias_dispatched {
             1, // MXdlPerWave
             4, // NXdlPerWave
             4, // Gemm1NXdlPerWave
-            1, // DropoutStep
             S<4, 64, 1>, // ABlockTransfer
             S<1, 0, 2>,
             S<1, 0, 2>,
@@ -136,7 +132,6 @@ struct grouped_infer_masktype_attnbias_dispatched {
               1,
               8>, // CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock
             B1CShuffleBlockTransferScalarPerVector, // TUNABLE
-            1,
             MaskingSpec>; // MaskingSpecialization
 
     RunWithDeviceOp<DeviceOpInstance>(param, stream);
@@ -172,9 +167,6 @@ struct grouped_infer_masktype_attnbias_dispatched {
       std::vector<ck::index_t> c_gs_ms_os_strides{
           0, param.out_strides[1], param.out_strides[0], param.out_strides[2]};
 
-      std::vector<ck::index_t> lse_gs_ms_lengths{1, G1, M};
-      std::vector<ck::index_t> lse_gs_ms_strides{0, param.max_seqlen_q, 1};
-
       std::vector<ck::index_t> d_gs_ms_ns_lengths;
       std::vector<ck::index_t> d_gs_ms_ns_strides;
 
@@ -200,10 +192,6 @@ struct grouped_infer_masktype_attnbias_dispatched {
            b1_gs_os_ns_strides,
            c_gs_ms_os_lengths,
            c_gs_ms_os_strides,
-           {1, 1, 1, 1},
-           {0, 0, 0, 0},
-           lse_gs_ms_lengths,
-           lse_gs_ms_strides,
            d_gs_ms_ns_lengths,
            d_gs_ms_ns_strides,
            {}, // acc1_bias_gs_ms_os_lengths
@@ -226,8 +214,6 @@ struct grouped_infer_masktype_attnbias_dispatched {
         param.k_ptrs,
         param.v_ptrs,
         param.out_ptrs,
-        param.randvals_ptrs,
-        param.logsumexp_ptrs,
         param.attn_bias_ptrs,
         {}, // p_acc1_biases
         problem_descs,
@@ -235,9 +221,7 @@ struct grouped_infer_masktype_attnbias_dispatched {
         b0_element_op,
         acc0_element_op,
         b1_element_op,
-        c_element_op,
-        param.use_dropout ? param.dropout_prob : 0.0f, // dropout ratio
-        std::tuple<int64_t, int64_t>(param.philox_seed, param.philox_offset));
+        c_element_op);
 
     auto sizeInBytes = op.GetWorkSpaceSize(arg_ptr.get());
 

From c30eb90c0d7a93c5926f89d7b6645ebabf9d30ee Mon Sep 17 00:00:00 2001
From: Qianfeng Zhang <Qianfeng.Zhang@amd.com>
Date: Fri, 13 Oct 2023 16:02:03 +0000
Subject: [PATCH 089/641] Synchronize with latest CK flashAttention

---
 third_party/composable_kernel | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/third_party/composable_kernel b/third_party/composable_kernel
index 3f4eae1db..ca9b152df 160000
--- a/third_party/composable_kernel
+++ b/third_party/composable_kernel
@@ -1 +1 @@
-Subproject commit 3f4eae1db4d73cf1692b204425591660cfd421be
+Subproject commit ca9b152df45b394590d4348f41365b775a72ba2c

From ab9a9b052206421beca7c582daf39fe5ea0e0873 Mon Sep 17 00:00:00 2001
From: Qianfeng Zhang <Qianfeng.Zhang@amd.com>
Date: Sun, 15 Oct 2023 23:24:44 +0000
Subject: [PATCH 090/641] Use different instances according to the head-dim
 sizes in batched infer

---
 .../hip_fmha/ck_fmha_batched_infer.h          | 288 +++++++++++++-----
 1 file changed, 218 insertions(+), 70 deletions(-)

diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_infer.h b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_infer.h
index 870d1394e..e72b6b773 100644
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_infer.h
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_infer.h
@@ -62,78 +62,226 @@ struct batched_infer_masktype_attnbias_dispatched {
     constexpr ck::index_t B1CShuffleBlockTransferScalarPerVector = 1;
     constexpr ck::index_t Acc0BiasTransferSrcScalarPerVector = 1;
 
-    using DeviceOpInstance = ck::tensor_operation::device::
-        DeviceBatchedMultiheadAttentionInfer_Xdl_CShuffle<
-            NumDimG,
-            NumDimM,
-            NumDimN,
-            NumDimK,
-            NumDimO,
-            ADataType,
-            B0DataType,
-            B1DataType,
-            CDataType,
-            Acc0BiasDataType,
-            Acc1BiasDataType,
-            AccDataType,
-            CShuffleDataType,
-            AElementOp,
-            B0ElementOp,
-            Acc0ElementOp,
-            B1ElementOp,
-            CElementOp,
-            GemmSpec,
-            TensorSpecA,
-            TensorSpecB0,
-            TensorSpecB1,
-            TensorSpecC,
-            1,
-            256,
-            128, // MPerBlock
-            128, // NPerBlock
-            32, // KPerBlock
-            64, // Gemm1NPerBlock
-            32, // Gemm1KPerBlock
-            8, // AK1
-            8, // BK1
-            2, // B1K1
-            32, // MPerXDL
-            32, // NPerXDL
-            1, // MXdlPerWave
-            4, // NXdlPerWave
-            2, // Gemm1NXdlPerWave
-            S<4, 64, 1>, // ABlockTransfer
-            S<1, 0, 2>,
-            S<1, 0, 2>,
-            2,
-            ABBlockTransferSrcScalarPerVector, // TUNABLE
-            8,
-            true,
-            S<4, 64, 1>, // BBlockTransfer
-            S<1, 0, 2>,
-            S<1, 0, 2>,
-            2,
-            ABBlockTransferSrcScalarPerVector, // TUNABLE
-            8,
-            true,
-            Acc0BiasTransferSrcScalarPerVector, // TUNABLE
-            S<16, 16, 1>, // B1BlockTransfer
-            S<0, 2, 1>,
-            S<0, 2, 1>,
-            1,
-            B1CShuffleBlockTransferScalarPerVector, // TUNABLE
-            2,
-            false,
-            1, // CShuffleMXdlPerWavePerShuffle
-            2, // CShuffleNXdlPerWavePerShuffle
-            S<1,
-              32,
+    if (param.K < 32 && param.Kv < 32) {
+      using DeviceOpInstance = ck::tensor_operation::device::
+          DeviceBatchedMultiheadAttentionInfer_Xdl_CShuffle<
+              NumDimG,
+              NumDimM,
+              NumDimN,
+              NumDimK,
+              NumDimO,
+              ADataType,
+              B0DataType,
+              B1DataType,
+              CDataType,
+              Acc0BiasDataType,
+              Acc1BiasDataType,
+              AccDataType,
+              CShuffleDataType,
+              AElementOp,
+              B0ElementOp,
+              Acc0ElementOp,
+              B1ElementOp,
+              CElementOp,
+              GemmSpec,
+              TensorSpecA,
+              TensorSpecB0,
+              TensorSpecB1,
+              TensorSpecC,
               1,
-              8>, // CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock
-            B1CShuffleBlockTransferScalarPerVector, // TUNABLE
-            MaskingSpec>; // MaskingSpecialization
+              256,
+              128, // MPerBlock
+              128, // NPerBlock
+              32, // KPerBlock
+              32, // Gemm1NPerBlock
+              32, // Gemm1KPerBlock
+              8, // AK1
+              8, // BK1
+              2, // B1K1
+              32, // MPerXDL
+              32, // NPerXDL
+              1, // MXdlPerWave
+              4, // NXdlPerWave
+              1, // Gemm1NXdlPerWave
+              S<4, 64, 1>, // ABlockTransfer
+              S<1, 0, 2>,
+              S<1, 0, 2>,
+              2,
+              ABBlockTransferSrcScalarPerVector, // TUNABLE
+              8,
+              true,
+              S<4, 64, 1>, // BBlockTransfer
+              S<1, 0, 2>,
+              S<1, 0, 2>,
+              2,
+              ABBlockTransferSrcScalarPerVector, // TUNABLE
+              8,
+              true,
+              Acc0BiasTransferSrcScalarPerVector, // TUNABLE
+              S<16, 16, 1>, // B1BlockTransfer
+              S<0, 2, 1>,
+              S<0, 2, 1>,
+              1,
+              B1CShuffleBlockTransferScalarPerVector, // TUNABLE
+              2,
+              false,
+              1, // CShuffleMXdlPerWavePerShuffle
+              1, // CShuffleNXdlPerWavePerShuffle
+              S<1,
+                32,
+                1,
+                8>, // CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock
+              B1CShuffleBlockTransferScalarPerVector, // TUNABLE
+              MaskingSpec>; // MaskingSpecialization
 
-    RunWithDeviceOp<DeviceOpInstance>(param, stream);
+      RunWithDeviceOp<DeviceOpInstance>(param, stream);
+    } else if (param.K < 64 && param.Kv < 64) {
+      using DeviceOpInstance = ck::tensor_operation::device::
+          DeviceBatchedMultiheadAttentionInfer_Xdl_CShuffle<
+              NumDimG,
+              NumDimM,
+              NumDimN,
+              NumDimK,
+              NumDimO,
+              ADataType,
+              B0DataType,
+              B1DataType,
+              CDataType,
+              Acc0BiasDataType,
+              Acc1BiasDataType,
+              AccDataType,
+              CShuffleDataType,
+              AElementOp,
+              B0ElementOp,
+              Acc0ElementOp,
+              B1ElementOp,
+              CElementOp,
+              GemmSpec,
+              TensorSpecA,
+              TensorSpecB0,
+              TensorSpecB1,
+              TensorSpecC,
+              1,
+              256,
+              128, // MPerBlock
+              128, // NPerBlock
+              32, // KPerBlock
+              64, // Gemm1NPerBlock
+              32, // Gemm1KPerBlock
+              8, // AK1
+              8, // BK1
+              2, // B1K1
+              32, // MPerXDL
+              32, // NPerXDL
+              1, // MXdlPerWave
+              4, // NXdlPerWave
+              2, // Gemm1NXdlPerWave
+              S<4, 64, 1>, // ABlockTransfer
+              S<1, 0, 2>,
+              S<1, 0, 2>,
+              2,
+              ABBlockTransferSrcScalarPerVector, // TUNABLE
+              8,
+              true,
+              S<4, 64, 1>, // BBlockTransfer
+              S<1, 0, 2>,
+              S<1, 0, 2>,
+              2,
+              ABBlockTransferSrcScalarPerVector, // TUNABLE
+              8,
+              true,
+              Acc0BiasTransferSrcScalarPerVector, // TUNABLE
+              S<16, 16, 1>, // B1BlockTransfer
+              S<0, 2, 1>,
+              S<0, 2, 1>,
+              1,
+              B1CShuffleBlockTransferScalarPerVector, // TUNABLE
+              2,
+              false,
+              1, // CShuffleMXdlPerWavePerShuffle
+              2, // CShuffleNXdlPerWavePerShuffle
+              S<1,
+                32,
+                1,
+                8>, // CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock
+              B1CShuffleBlockTransferScalarPerVector, // TUNABLE
+              MaskingSpec>; // MaskingSpecialization
+
+      RunWithDeviceOp<DeviceOpInstance>(param, stream);
+    } else {
+      using DeviceOpInstance = ck::tensor_operation::device::
+          DeviceBatchedMultiheadAttentionInfer_Xdl_CShuffle<
+              NumDimG,
+              NumDimM,
+              NumDimN,
+              NumDimK,
+              NumDimO,
+              ADataType,
+              B0DataType,
+              B1DataType,
+              CDataType,
+              Acc0BiasDataType,
+              Acc1BiasDataType,
+              AccDataType,
+              CShuffleDataType,
+              AElementOp,
+              B0ElementOp,
+              Acc0ElementOp,
+              B1ElementOp,
+              CElementOp,
+              GemmSpec,
+              TensorSpecA,
+              TensorSpecB0,
+              TensorSpecB1,
+              TensorSpecC,
+              1,
+              256,
+              128, // MPerBlock
+              128, // NPerBlock
+              32, // KPerBlock
+              128, // Gemm1NPerBlock
+              32, // Gemm1KPerBlock
+              8, // AK1
+              8, // BK1
+              2, // B1K1
+              32, // MPerXDL
+              32, // NPerXDL
+              1, // MXdlPerWave
+              4, // NXdlPerWave
+              4, // Gemm1NXdlPerWave
+              S<4, 64, 1>, // ABlockTransfer
+              S<1, 0, 2>,
+              S<1, 0, 2>,
+              2,
+              ABBlockTransferSrcScalarPerVector, // TUNABLE
+              8,
+              true,
+              S<4, 64, 1>, // BBlockTransfer
+              S<1, 0, 2>,
+              S<1, 0, 2>,
+              2,
+              ABBlockTransferSrcScalarPerVector, // TUNABLE
+              8,
+              true,
+              Acc0BiasTransferSrcScalarPerVector, // TUNABLE
+              S<16, 16, 1>, // B1BlockTransfer
+              S<0, 2, 1>,
+              S<0, 2, 1>,
+              1,
+              B1CShuffleBlockTransferScalarPerVector, // TUNABLE
+              2,
+              false,
+              1, // CShuffleMXdlPerWavePerShuffle
+              4, // CShuffleNXdlPerWavePerShuffle
+              S<1,
+                32,
+                1,
+                8>, // CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock
+              B1CShuffleBlockTransferScalarPerVector, // TUNABLE
+              MaskingSpec>; // MaskingSpecialization
+
+      RunWithDeviceOp<DeviceOpInstance>(param, stream);
+    };
   };
 
   template <typename DeviceOpInstance>

From a47d2229d23e740033ecbf58417210512e41a08b Mon Sep 17 00:00:00 2001
From: Qianfeng Zhang <Qianfeng.Zhang@amd.com>
Date: Sun, 15 Oct 2023 23:52:01 +0000
Subject: [PATCH 091/641] Use different instances according to the head-dim
 sizes in grouped infer

---
 .../hip_fmha/ck_fmha_grouped_infer.h          | 290 +++++++++++++-----
 1 file changed, 219 insertions(+), 71 deletions(-)

diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_infer.h b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_infer.h
index 321b17cdd..2a6faf540 100644
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_infer.h
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_infer.h
@@ -63,78 +63,226 @@ struct grouped_infer_masktype_attnbias_dispatched {
     constexpr ck::index_t B1CShuffleBlockTransferScalarPerVector = 1;
     constexpr ck::index_t Acc0BiasTransferSrcScalarPerVector = 1;
 
-    using DeviceOpInstance = ck::tensor_operation::device::
-        DeviceGroupedMultiheadAttentionInfer_Xdl_CShuffle<
-            NumDimG,
-            NumDimM,
-            NumDimN,
-            NumDimK,
-            NumDimO,
-            ADataType,
-            B0DataType,
-            B1DataType,
-            CDataType,
-            Acc0BiasDataType,
-            Acc1BiasDataType,
-            AccDataType,
-            CShuffleDataType,
-            AElementOp,
-            B0ElementOp,
-            Acc0ElementOp,
-            B1ElementOp,
-            CElementOp,
-            GemmSpec,
-            TensorSpecA,
-            TensorSpecB0,
-            TensorSpecB1,
-            TensorSpecC,
-            1,
-            256,
-            128, // MPerBlock
-            128, // NPerBlock
-            32, // KPerBlock
-            128, // Gemm1NPerBlock
-            32, // Gemm1KPerBlock
-            8, // AK1
-            8, // BK1
-            2, // B1K1
-            32, // MPerXDL
-            32, // NPerXDL
-            1, // MXdlPerWave
-            4, // NXdlPerWave
-            4, // Gemm1NXdlPerWave
-            S<4, 64, 1>, // ABlockTransfer
-            S<1, 0, 2>,
-            S<1, 0, 2>,
-            2,
-            ABBlockTransferSrcScalarPerVector, // TUNABLE
-            8,
-            true,
-            S<4, 64, 1>, // BBlockTransfer
-            S<1, 0, 2>,
-            S<1, 0, 2>,
-            2,
-            ABBlockTransferSrcScalarPerVector, // TUNABLE
-            8,
-            true,
-            Acc0BiasTransferSrcScalarPerVector,
-            S<8, 32, 1>, // B1BlockTransfer
-            S<0, 2, 1>,
-            S<0, 2, 1>,
-            1,
-            B1CShuffleBlockTransferScalarPerVector, // TUNABLE
-            2,
-            false,
-            1, // CShuffleMXdlPerWavePerShuffle
-            2, // CShuffleNXdlPerWavePerShuffle
-            S<1,
-              32,
+    if (param.K < 32 && param.Kv < 32) {
+      using DeviceOpInstance = ck::tensor_operation::device::
+          DeviceGroupedMultiheadAttentionInfer_Xdl_CShuffle<
+              NumDimG,
+              NumDimM,
+              NumDimN,
+              NumDimK,
+              NumDimO,
+              ADataType,
+              B0DataType,
+              B1DataType,
+              CDataType,
+              Acc0BiasDataType,
+              Acc1BiasDataType,
+              AccDataType,
+              CShuffleDataType,
+              AElementOp,
+              B0ElementOp,
+              Acc0ElementOp,
+              B1ElementOp,
+              CElementOp,
+              GemmSpec,
+              TensorSpecA,
+              TensorSpecB0,
+              TensorSpecB1,
+              TensorSpecC,
               1,
-              8>, // CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock
-            B1CShuffleBlockTransferScalarPerVector, // TUNABLE
-            MaskingSpec>; // MaskingSpecialization
-
-    RunWithDeviceOp<DeviceOpInstance>(param, stream);
+              256,
+              128, // MPerBlock
+              128, // NPerBlock
+              32, // KPerBlock
+              32, // Gemm1NPerBlock
+              32, // Gemm1KPerBlock
+              8, // AK1
+              8, // BK1
+              2, // B1K1
+              32, // MPerXDL
+              32, // NPerXDL
+              1, // MXdlPerWave
+              4, // NXdlPerWave
+              1, // Gemm1NXdlPerWave
+              S<4, 64, 1>, // ABlockTransfer
+              S<1, 0, 2>,
+              S<1, 0, 2>,
+              2,
+              ABBlockTransferSrcScalarPerVector, // TUNABLE
+              8,
+              true,
+              S<4, 64, 1>, // BBlockTransfer
+              S<1, 0, 2>,
+              S<1, 0, 2>,
+              2,
+              ABBlockTransferSrcScalarPerVector, // TUNABLE
+              8,
+              true,
+              Acc0BiasTransferSrcScalarPerVector,
+              S<8, 32, 1>, // B1BlockTransfer
+              S<0, 2, 1>,
+              S<0, 2, 1>,
+              1,
+              B1CShuffleBlockTransferScalarPerVector, // TUNABLE
+              2,
+              false,
+              1, // CShuffleMXdlPerWavePerShuffle
+              1, // CShuffleNXdlPerWavePerShuffle
+              S<1,
+                32,
+                1,
+                8>, // CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock
+              B1CShuffleBlockTransferScalarPerVector, // TUNABLE
+              MaskingSpec>; // MaskingSpecialization
+
+      RunWithDeviceOp<DeviceOpInstance>(param, stream);
+    } else if (param.K < 64 && param.Kv < 64) {
+      using DeviceOpInstance = ck::tensor_operation::device::
+          DeviceGroupedMultiheadAttentionInfer_Xdl_CShuffle<
+              NumDimG,
+              NumDimM,
+              NumDimN,
+              NumDimK,
+              NumDimO,
+              ADataType,
+              B0DataType,
+              B1DataType,
+              CDataType,
+              Acc0BiasDataType,
+              Acc1BiasDataType,
+              AccDataType,
+              CShuffleDataType,
+              AElementOp,
+              B0ElementOp,
+              Acc0ElementOp,
+              B1ElementOp,
+              CElementOp,
+              GemmSpec,
+              TensorSpecA,
+              TensorSpecB0,
+              TensorSpecB1,
+              TensorSpecC,
+              1,
+              256,
+              128, // MPerBlock
+              128, // NPerBlock
+              32, // KPerBlock
+              64, // Gemm1NPerBlock
+              32, // Gemm1KPerBlock
+              8, // AK1
+              8, // BK1
+              2, // B1K1
+              32, // MPerXDL
+              32, // NPerXDL
+              1, // MXdlPerWave
+              4, // NXdlPerWave
+              2, // Gemm1NXdlPerWave
+              S<4, 64, 1>, // ABlockTransfer
+              S<1, 0, 2>,
+              S<1, 0, 2>,
+              2,
+              ABBlockTransferSrcScalarPerVector, // TUNABLE
+              8,
+              true,
+              S<4, 64, 1>, // BBlockTransfer
+              S<1, 0, 2>,
+              S<1, 0, 2>,
+              2,
+              ABBlockTransferSrcScalarPerVector, // TUNABLE
+              8,
+              true,
+              Acc0BiasTransferSrcScalarPerVector,
+              S<8, 32, 1>, // B1BlockTransfer
+              S<0, 2, 1>,
+              S<0, 2, 1>,
+              1,
+              B1CShuffleBlockTransferScalarPerVector, // TUNABLE
+              2,
+              false,
+              1, // CShuffleMXdlPerWavePerShuffle
+              2, // CShuffleNXdlPerWavePerShuffle
+              S<1,
+                32,
+                1,
+                8>, // CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock
+              B1CShuffleBlockTransferScalarPerVector, // TUNABLE
+              MaskingSpec>; // MaskingSpecialization
+
+      RunWithDeviceOp<DeviceOpInstance>(param, stream);
+    } else {
+      using DeviceOpInstance = ck::tensor_operation::device::
+          DeviceGroupedMultiheadAttentionInfer_Xdl_CShuffle<
+              NumDimG,
+              NumDimM,
+              NumDimN,
+              NumDimK,
+              NumDimO,
+              ADataType,
+              B0DataType,
+              B1DataType,
+              CDataType,
+              Acc0BiasDataType,
+              Acc1BiasDataType,
+              AccDataType,
+              CShuffleDataType,
+              AElementOp,
+              B0ElementOp,
+              Acc0ElementOp,
+              B1ElementOp,
+              CElementOp,
+              GemmSpec,
+              TensorSpecA,
+              TensorSpecB0,
+              TensorSpecB1,
+              TensorSpecC,
+              1,
+              256,
+              128, // MPerBlock
+              128, // NPerBlock
+              32, // KPerBlock
+              128, // Gemm1NPerBlock
+              32, // Gemm1KPerBlock
+              8, // AK1
+              8, // BK1
+              2, // B1K1
+              32, // MPerXDL
+              32, // NPerXDL
+              1, // MXdlPerWave
+              4, // NXdlPerWave
+              4, // Gemm1NXdlPerWave
+              S<4, 64, 1>, // ABlockTransfer
+              S<1, 0, 2>,
+              S<1, 0, 2>,
+              2,
+              ABBlockTransferSrcScalarPerVector, // TUNABLE
+              8,
+              true,
+              S<4, 64, 1>, // BBlockTransfer
+              S<1, 0, 2>,
+              S<1, 0, 2>,
+              2,
+              ABBlockTransferSrcScalarPerVector, // TUNABLE
+              8,
+              true,
+              Acc0BiasTransferSrcScalarPerVector,
+              S<8, 32, 1>, // B1BlockTransfer
+              S<0, 2, 1>,
+              S<0, 2, 1>,
+              1,
+              B1CShuffleBlockTransferScalarPerVector, // TUNABLE
+              2,
+              false,
+              1, // CShuffleMXdlPerWavePerShuffle
+              4, // CShuffleNXdlPerWavePerShuffle
+              S<1,
+                32,
+                1,
+                8>, // CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock
+              B1CShuffleBlockTransferScalarPerVector, // TUNABLE
+              MaskingSpec>; // MaskingSpecialization
+
+      RunWithDeviceOp<DeviceOpInstance>(param, stream);
+    };
   };
 
   template <typename DeviceOpInstance>

From d408c83cbe30d6c529e24365b6b8eee139a37a9d Mon Sep 17 00:00:00 2001
From: Qianfeng Zhang <Qianfeng.Zhang@amd.com>
Date: Tue, 17 Oct 2023 22:19:34 +0000
Subject: [PATCH 092/641] Tiny fix for packed q/k/v allocation

---
 xformers/csrc/attention/hip_fmha/attention_backward_generic.cpp | 1 +
 1 file changed, 1 insertion(+)

diff --git a/xformers/csrc/attention/hip_fmha/attention_backward_generic.cpp b/xformers/csrc/attention/hip_fmha/attention_backward_generic.cpp
index 1a3b16b1e..a234df42a 100644
--- a/xformers/csrc/attention/hip_fmha/attention_backward_generic.cpp
+++ b/xformers/csrc/attention/hip_fmha/attention_backward_generic.cpp
@@ -135,6 +135,7 @@ efficient_attention_backward_ck(
     grad_q = chunk.select(2, 0);
     grad_k = chunk.select(2, 1);
     grad_v = chunk.select(2, 2);
+    grad_q.fill_(0);
   } else if (
       key.size(3) == value.size(3) &&
       key.storage().is_alias_of(value.storage())) {

From 93ef74e693979c437352c47c6d6d2a7be9a3e593 Mon Sep 17 00:00:00 2001
From: Qianfeng Zhang <Qianfeng.Zhang@amd.com>
Date: Wed, 18 Oct 2023 17:52:27 +0000
Subject: [PATCH 093/641] Reset the flash-attention submodule to a commit so
 that our branch can be build on Nvidia/A100

---
 third_party/flash-attention | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/third_party/flash-attention b/third_party/flash-attention
index eff9fe6b8..9e5e8bc91 160000
--- a/third_party/flash-attention
+++ b/third_party/flash-attention
@@ -1 +1 @@
-Subproject commit eff9fe6b8076df59d64d7a3f464696738a3c7c24
+Subproject commit 9e5e8bc91e30af5cdc321362b553f6c0da332e30

From 5214bf2421d32a5f86875e360c012758d0dcb995 Mon Sep 17 00:00:00 2001
From: Qianfeng Zhang <Qianfeng.Zhang@amd.com>
Date: Wed, 18 Oct 2023 18:15:02 +0000
Subject: [PATCH 094/641] Use the same tested attn_bias types as cutlass.py and
 have test_backward passed all fp16 cases

---
 tests/test_mem_eff_attention_ck.py | 18 ++++++++++++++++--
 xformers/ops/fmha/ck.py            |  3 ++-
 2 files changed, 18 insertions(+), 3 deletions(-)

diff --git a/tests/test_mem_eff_attention_ck.py b/tests/test_mem_eff_attention_ck.py
index fdfeb40e9..230477f09 100644
--- a/tests/test_mem_eff_attention_ck.py
+++ b/tests/test_mem_eff_attention_ck.py
@@ -575,7 +575,7 @@ def test_forward(
     ) = opFW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv
 
     if kv > 128:
-        pytest.skip("kv > 128 is not supported by CK-FlashAttention-1")
+        pytest.skip("kv > 128 is not supported by CK-FlashAttention")
 
     if packed and not (k == kv and q_len == kv_len):
         pytest.skip(
@@ -730,6 +730,20 @@ def test_backward(
         k,
         kv,
     ) = opBW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv
+
+    ## ToDo: reopen bfloat16 for testing
+    if dtype is torch.bfloat16:
+        pytest.skip("Temporarily disabled bfloat16 as we are still improving the accuracy of the results")
+
+    if k > 128 or kv > 128:
+        pytest.skip("head-dim length bigger than 128 is not supported by CK-FlashAttention")
+
+    if k % 2 != 0 or kv % 2 !=0:
+        pytest.skip("head-dim length must be an even value for CK-FlashAttention")
+
+    if grad_out_contiguous is False:
+        pytest.skip("CK-FlashAttention requires grad_out and out have same lengths/strides")
+
     attn_bias_requires_grad = (
         random.Random(q_len + kv_len * batch_size).randint(0, 1) > 0
     )
@@ -1726,7 +1740,7 @@ def test_f16_biasf32(self) -> None:
             fmha.memory_efficient_attention(q, k, v, attn_bias=bias, op=(fmha.ck.FwOp, None))
 
     def test_f32_biasf16(self) -> None:
-        pytest.skip("float32 is not supported currently by CK-FlashAttention-1")
+        pytest.skip("float32 is not supported currently by CK-FlashAttention")
         q, k, v, bias = self.create_tensors(torch.float32)
         fmha.memory_efficient_attention(q, k, v, attn_bias=bias)
         bias = bias.to(torch.float16)
diff --git a/xformers/ops/fmha/ck.py b/xformers/ops/fmha/ck.py
index 5f201f603..143c74f79 100644
--- a/xformers/ops/fmha/ck.py
+++ b/xformers/ops/fmha/ck.py
@@ -243,7 +243,8 @@ class BwOp(AttentionBwOpBase):
         type(None),
         torch.Tensor,
         LowerTriangularMask,
-        LowerTriangularMaskWithTensorBias,
+        # TODO: Fix handling of gradient through the fMHA autograd function
+        # LowerTriangularMaskWithTensorBias,
         BlockDiagonalMask,
         BlockDiagonalCausalMask,
         attn_bias.BlockDiagonalCausalFromBottomRightMask,

From fde7b42c4ff7e589e7aae6c4ee65f9e74da4aef2 Mon Sep 17 00:00:00 2001
From: Qianfeng Zhang <Qianfeng.Zhang@amd.com>
Date: Wed, 18 Oct 2023 19:41:14 +0000
Subject: [PATCH 095/641] Move to the latest composable_kernel submodule commit

---
 third_party/composable_kernel | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/third_party/composable_kernel b/third_party/composable_kernel
index ca9b152df..f27f91581 160000
--- a/third_party/composable_kernel
+++ b/third_party/composable_kernel
@@ -1 +1 @@
-Subproject commit ca9b152df45b394590d4348f41365b775a72ba2c
+Subproject commit f27f91581162c788f144f0f4f9aa68fa465a33fc

From 17635e0138d910f6ea3bd73a4f728920aea9a7c7 Mon Sep 17 00:00:00 2001
From: Qianfeng Zhang <Qianfeng.Zhang@amd.com>
Date: Wed, 18 Oct 2023 23:55:35 +0000
Subject: [PATCH 096/641] Simplify the head-dim based switch structure in
 batched/grouped infer

---
 .../hip_fmha/ck_fmha_batched_infer.h          | 318 ++++++------------
 .../hip_fmha/ck_fmha_grouped_infer.h          | 318 ++++++------------
 2 files changed, 206 insertions(+), 430 deletions(-)

diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_infer.h b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_infer.h
index e72b6b773..0f6e106cb 100644
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_infer.h
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_infer.h
@@ -56,229 +56,117 @@ struct batched_infer_masktype_attnbias_dispatched {
   static constexpr auto TensorSpecC =
       ck::tensor_operation::device::TensorSpecialization::Default;
 
-  static void Run(BatchedForwardParams& param, hipStream_t stream) {
-    // Tunables
-    constexpr ck::index_t ABBlockTransferSrcScalarPerVector = 1;
-    constexpr ck::index_t B1CShuffleBlockTransferScalarPerVector = 1;
-    constexpr ck::index_t Acc0BiasTransferSrcScalarPerVector = 1;
+  static constexpr ck::index_t kABBlockTransferSrcScalarPerVector = 1;
+  static constexpr ck::index_t kB1CShuffleBlockTransferScalarPerVector = 1;
+  static constexpr ck::index_t kAcc0BiasTransferSrcScalarPerVector = 1;
+
+  template <
+      ck::index_t kGemm1NPerBlock,
+      ck::index_t kGemm1NXdlPerWave,
+      ck::index_t kCShuffleNXdlPerWavePerShuffle>
+  using DeviceOpInstanceTemp = ck::tensor_operation::device::
+      DeviceBatchedMultiheadAttentionInfer_Xdl_CShuffle<
+          NumDimG,
+          NumDimM,
+          NumDimN,
+          NumDimK,
+          NumDimO,
+          ADataType,
+          B0DataType,
+          B1DataType,
+          CDataType,
+          Acc0BiasDataType,
+          Acc1BiasDataType,
+          AccDataType,
+          CShuffleDataType,
+          AElementOp,
+          B0ElementOp,
+          Acc0ElementOp,
+          B1ElementOp,
+          CElementOp,
+          GemmSpec,
+          TensorSpecA,
+          TensorSpecB0,
+          TensorSpecB1,
+          TensorSpecC,
+          1,
+          256,
+          128, // MPerBlock
+          128, // NPerBlock
+          32, // KPerBlock
+          kGemm1NPerBlock,
+          32,
+          8, // AK1
+          8, // BK1
+          2, // B1K1
+          32, // MPerXDL
+          32, // NPerXDL
+          1, // MXdlPerWave
+          4, // NXdlPerWave
+          kGemm1NXdlPerWave,
+          S<4, 64, 1>, // ABlockTransfer
+          S<1, 0, 2>,
+          S<1, 0, 2>,
+          2,
+          kABBlockTransferSrcScalarPerVector, // TUNABLE
+          8,
+          true,
+          S<4, 64, 1>, // BBlockTransfer
+          S<1, 0, 2>,
+          S<1, 0, 2>,
+          2,
+          kABBlockTransferSrcScalarPerVector, // TUNABLE
+          8,
+          true,
+          kAcc0BiasTransferSrcScalarPerVector, // TUNABLE
+          S<16, 16, 1>, // B1BlockTransfer
+          S<0, 2, 1>,
+          S<0, 2, 1>,
+          1,
+          kB1CShuffleBlockTransferScalarPerVector, // TUNABLE
+          2,
+          false,
+          1, // CShuffleMXdlPerWavePerShuffle
+          kCShuffleNXdlPerWavePerShuffle,
+          S<1,
+            32,
+            1,
+            8>, // CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock
+          kB1CShuffleBlockTransferScalarPerVector, // TUNABLE
+          MaskingSpec>; // MaskingSpecialization
 
+  static void Run(BatchedForwardParams& param, hipStream_t stream) {
     if (param.K < 32 && param.Kv < 32) {
-      using DeviceOpInstance = ck::tensor_operation::device::
-          DeviceBatchedMultiheadAttentionInfer_Xdl_CShuffle<
-              NumDimG,
-              NumDimM,
-              NumDimN,
-              NumDimK,
-              NumDimO,
-              ADataType,
-              B0DataType,
-              B1DataType,
-              CDataType,
-              Acc0BiasDataType,
-              Acc1BiasDataType,
-              AccDataType,
-              CShuffleDataType,
-              AElementOp,
-              B0ElementOp,
-              Acc0ElementOp,
-              B1ElementOp,
-              CElementOp,
-              GemmSpec,
-              TensorSpecA,
-              TensorSpecB0,
-              TensorSpecB1,
-              TensorSpecC,
-              1,
-              256,
-              128, // MPerBlock
-              128, // NPerBlock
-              32, // KPerBlock
-              32, // Gemm1NPerBlock
-              32, // Gemm1KPerBlock
-              8, // AK1
-              8, // BK1
-              2, // B1K1
-              32, // MPerXDL
-              32, // NPerXDL
-              1, // MXdlPerWave
-              4, // NXdlPerWave
-              1, // Gemm1NXdlPerWave
-              S<4, 64, 1>, // ABlockTransfer
-              S<1, 0, 2>,
-              S<1, 0, 2>,
-              2,
-              ABBlockTransferSrcScalarPerVector, // TUNABLE
-              8,
-              true,
-              S<4, 64, 1>, // BBlockTransfer
-              S<1, 0, 2>,
-              S<1, 0, 2>,
-              2,
-              ABBlockTransferSrcScalarPerVector, // TUNABLE
-              8,
-              true,
-              Acc0BiasTransferSrcScalarPerVector, // TUNABLE
-              S<16, 16, 1>, // B1BlockTransfer
-              S<0, 2, 1>,
-              S<0, 2, 1>,
-              1,
-              B1CShuffleBlockTransferScalarPerVector, // TUNABLE
-              2,
-              false,
-              1, // CShuffleMXdlPerWavePerShuffle
-              1, // CShuffleNXdlPerWavePerShuffle
-              S<1,
-                32,
-                1,
-                8>, // CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock
-              B1CShuffleBlockTransferScalarPerVector, // TUNABLE
-              MaskingSpec>; // MaskingSpecialization
+      constexpr ck::index_t kGemm1NPerBlock = 32;
+      constexpr ck::index_t kGemm1NXdlPerWave = 1;
+      constexpr ck::index_t kCShuffleNXdlPerWavePerShuffle = 1;
+
+      using DeviceOpInstance = DeviceOpInstanceTemp<
+          kGemm1NPerBlock,
+          kGemm1NXdlPerWave,
+          kCShuffleNXdlPerWavePerShuffle>;
 
       RunWithDeviceOp<DeviceOpInstance>(param, stream);
     } else if (param.K < 64 && param.Kv < 64) {
-      using DeviceOpInstance = ck::tensor_operation::device::
-          DeviceBatchedMultiheadAttentionInfer_Xdl_CShuffle<
-              NumDimG,
-              NumDimM,
-              NumDimN,
-              NumDimK,
-              NumDimO,
-              ADataType,
-              B0DataType,
-              B1DataType,
-              CDataType,
-              Acc0BiasDataType,
-              Acc1BiasDataType,
-              AccDataType,
-              CShuffleDataType,
-              AElementOp,
-              B0ElementOp,
-              Acc0ElementOp,
-              B1ElementOp,
-              CElementOp,
-              GemmSpec,
-              TensorSpecA,
-              TensorSpecB0,
-              TensorSpecB1,
-              TensorSpecC,
-              1,
-              256,
-              128, // MPerBlock
-              128, // NPerBlock
-              32, // KPerBlock
-              64, // Gemm1NPerBlock
-              32, // Gemm1KPerBlock
-              8, // AK1
-              8, // BK1
-              2, // B1K1
-              32, // MPerXDL
-              32, // NPerXDL
-              1, // MXdlPerWave
-              4, // NXdlPerWave
-              2, // Gemm1NXdlPerWave
-              S<4, 64, 1>, // ABlockTransfer
-              S<1, 0, 2>,
-              S<1, 0, 2>,
-              2,
-              ABBlockTransferSrcScalarPerVector, // TUNABLE
-              8,
-              true,
-              S<4, 64, 1>, // BBlockTransfer
-              S<1, 0, 2>,
-              S<1, 0, 2>,
-              2,
-              ABBlockTransferSrcScalarPerVector, // TUNABLE
-              8,
-              true,
-              Acc0BiasTransferSrcScalarPerVector, // TUNABLE
-              S<16, 16, 1>, // B1BlockTransfer
-              S<0, 2, 1>,
-              S<0, 2, 1>,
-              1,
-              B1CShuffleBlockTransferScalarPerVector, // TUNABLE
-              2,
-              false,
-              1, // CShuffleMXdlPerWavePerShuffle
-              2, // CShuffleNXdlPerWavePerShuffle
-              S<1,
-                32,
-                1,
-                8>, // CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock
-              B1CShuffleBlockTransferScalarPerVector, // TUNABLE
-              MaskingSpec>; // MaskingSpecialization
+      constexpr ck::index_t kGemm1NPerBlock = 64;
+      constexpr ck::index_t kGemm1NXdlPerWave = 2;
+      constexpr ck::index_t kCShuffleNXdlPerWavePerShuffle = 2;
+
+      using DeviceOpInstance = DeviceOpInstanceTemp<
+          kGemm1NPerBlock,
+          kGemm1NXdlPerWave,
+          kCShuffleNXdlPerWavePerShuffle>;
 
       RunWithDeviceOp<DeviceOpInstance>(param, stream);
     } else {
-      using DeviceOpInstance = ck::tensor_operation::device::
-          DeviceBatchedMultiheadAttentionInfer_Xdl_CShuffle<
-              NumDimG,
-              NumDimM,
-              NumDimN,
-              NumDimK,
-              NumDimO,
-              ADataType,
-              B0DataType,
-              B1DataType,
-              CDataType,
-              Acc0BiasDataType,
-              Acc1BiasDataType,
-              AccDataType,
-              CShuffleDataType,
-              AElementOp,
-              B0ElementOp,
-              Acc0ElementOp,
-              B1ElementOp,
-              CElementOp,
-              GemmSpec,
-              TensorSpecA,
-              TensorSpecB0,
-              TensorSpecB1,
-              TensorSpecC,
-              1,
-              256,
-              128, // MPerBlock
-              128, // NPerBlock
-              32, // KPerBlock
-              128, // Gemm1NPerBlock
-              32, // Gemm1KPerBlock
-              8, // AK1
-              8, // BK1
-              2, // B1K1
-              32, // MPerXDL
-              32, // NPerXDL
-              1, // MXdlPerWave
-              4, // NXdlPerWave
-              4, // Gemm1NXdlPerWave
-              S<4, 64, 1>, // ABlockTransfer
-              S<1, 0, 2>,
-              S<1, 0, 2>,
-              2,
-              ABBlockTransferSrcScalarPerVector, // TUNABLE
-              8,
-              true,
-              S<4, 64, 1>, // BBlockTransfer
-              S<1, 0, 2>,
-              S<1, 0, 2>,
-              2,
-              ABBlockTransferSrcScalarPerVector, // TUNABLE
-              8,
-              true,
-              Acc0BiasTransferSrcScalarPerVector, // TUNABLE
-              S<16, 16, 1>, // B1BlockTransfer
-              S<0, 2, 1>,
-              S<0, 2, 1>,
-              1,
-              B1CShuffleBlockTransferScalarPerVector, // TUNABLE
-              2,
-              false,
-              1, // CShuffleMXdlPerWavePerShuffle
-              4, // CShuffleNXdlPerWavePerShuffle
-              S<1,
-                32,
-                1,
-                8>, // CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock
-              B1CShuffleBlockTransferScalarPerVector, // TUNABLE
-              MaskingSpec>; // MaskingSpecialization
+      constexpr ck::index_t kGemm1NPerBlock = 128;
+      constexpr ck::index_t kGemm1NXdlPerWave = 4;
+      constexpr ck::index_t kCShuffleNXdlPerWavePerShuffle = 4;
+
+      using DeviceOpInstance = DeviceOpInstanceTemp<
+          kGemm1NPerBlock,
+          kGemm1NXdlPerWave,
+          kCShuffleNXdlPerWavePerShuffle>;
 
       RunWithDeviceOp<DeviceOpInstance>(param, stream);
     };
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_infer.h b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_infer.h
index 2a6faf540..918020eba 100644
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_infer.h
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_infer.h
@@ -57,229 +57,117 @@ struct grouped_infer_masktype_attnbias_dispatched {
   static constexpr auto TensorSpecC =
       ck::tensor_operation::device::TensorSpecialization::Default;
 
-  static void Run(GroupedForwardParams& param, hipStream_t stream) {
-    // Tunables
-    constexpr ck::index_t ABBlockTransferSrcScalarPerVector = 1;
-    constexpr ck::index_t B1CShuffleBlockTransferScalarPerVector = 1;
-    constexpr ck::index_t Acc0BiasTransferSrcScalarPerVector = 1;
+  static constexpr ck::index_t kABBlockTransferSrcScalarPerVector = 1;
+  static constexpr ck::index_t kB1CShuffleBlockTransferScalarPerVector = 1;
+  static constexpr ck::index_t kAcc0BiasTransferSrcScalarPerVector = 1;
+
+  template <
+      ck::index_t kGemm1NPerBlock,
+      ck::index_t kGemm1NXdlPerWave,
+      ck::index_t kCShuffleNXdlPerWavePerShuffle>
+  using DeviceOpInstanceTemp = ck::tensor_operation::device::
+      DeviceGroupedMultiheadAttentionInfer_Xdl_CShuffle<
+          NumDimG,
+          NumDimM,
+          NumDimN,
+          NumDimK,
+          NumDimO,
+          ADataType,
+          B0DataType,
+          B1DataType,
+          CDataType,
+          Acc0BiasDataType,
+          Acc1BiasDataType,
+          AccDataType,
+          CShuffleDataType,
+          AElementOp,
+          B0ElementOp,
+          Acc0ElementOp,
+          B1ElementOp,
+          CElementOp,
+          GemmSpec,
+          TensorSpecA,
+          TensorSpecB0,
+          TensorSpecB1,
+          TensorSpecC,
+          1,
+          256,
+          128, // MPerBlock
+          128, // NPerBlock
+          32, // KPerBlock
+          kGemm1NPerBlock,
+          32,
+          8, // AK1
+          8, // BK1
+          2, // B1K1
+          32, // MPerXDL
+          32, // NPerXDL
+          1, // MXdlPerWave
+          4, // NXdlPerWave
+          kGemm1NXdlPerWave,
+          S<4, 64, 1>, // ABlockTransfer
+          S<1, 0, 2>,
+          S<1, 0, 2>,
+          2,
+          kABBlockTransferSrcScalarPerVector, // TUNABLE
+          8,
+          true,
+          S<4, 64, 1>, // BBlockTransfer
+          S<1, 0, 2>,
+          S<1, 0, 2>,
+          2,
+          kABBlockTransferSrcScalarPerVector, // TUNABLE
+          8,
+          true,
+          kAcc0BiasTransferSrcScalarPerVector,
+          S<8, 32, 1>, // B1BlockTransfer
+          S<0, 2, 1>,
+          S<0, 2, 1>,
+          1,
+          kB1CShuffleBlockTransferScalarPerVector, // TUNABLE
+          2,
+          false,
+          1, // CShuffleMXdlPerWavePerShuffle
+          kCShuffleNXdlPerWavePerShuffle,
+          S<1,
+            32,
+            1,
+            8>, // CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock
+          kB1CShuffleBlockTransferScalarPerVector, // TUNABLE
+          MaskingSpec>; // MaskingSpecialization
 
+  static void Run(GroupedForwardParams& param, hipStream_t stream) {
     if (param.K < 32 && param.Kv < 32) {
-      using DeviceOpInstance = ck::tensor_operation::device::
-          DeviceGroupedMultiheadAttentionInfer_Xdl_CShuffle<
-              NumDimG,
-              NumDimM,
-              NumDimN,
-              NumDimK,
-              NumDimO,
-              ADataType,
-              B0DataType,
-              B1DataType,
-              CDataType,
-              Acc0BiasDataType,
-              Acc1BiasDataType,
-              AccDataType,
-              CShuffleDataType,
-              AElementOp,
-              B0ElementOp,
-              Acc0ElementOp,
-              B1ElementOp,
-              CElementOp,
-              GemmSpec,
-              TensorSpecA,
-              TensorSpecB0,
-              TensorSpecB1,
-              TensorSpecC,
-              1,
-              256,
-              128, // MPerBlock
-              128, // NPerBlock
-              32, // KPerBlock
-              32, // Gemm1NPerBlock
-              32, // Gemm1KPerBlock
-              8, // AK1
-              8, // BK1
-              2, // B1K1
-              32, // MPerXDL
-              32, // NPerXDL
-              1, // MXdlPerWave
-              4, // NXdlPerWave
-              1, // Gemm1NXdlPerWave
-              S<4, 64, 1>, // ABlockTransfer
-              S<1, 0, 2>,
-              S<1, 0, 2>,
-              2,
-              ABBlockTransferSrcScalarPerVector, // TUNABLE
-              8,
-              true,
-              S<4, 64, 1>, // BBlockTransfer
-              S<1, 0, 2>,
-              S<1, 0, 2>,
-              2,
-              ABBlockTransferSrcScalarPerVector, // TUNABLE
-              8,
-              true,
-              Acc0BiasTransferSrcScalarPerVector,
-              S<8, 32, 1>, // B1BlockTransfer
-              S<0, 2, 1>,
-              S<0, 2, 1>,
-              1,
-              B1CShuffleBlockTransferScalarPerVector, // TUNABLE
-              2,
-              false,
-              1, // CShuffleMXdlPerWavePerShuffle
-              1, // CShuffleNXdlPerWavePerShuffle
-              S<1,
-                32,
-                1,
-                8>, // CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock
-              B1CShuffleBlockTransferScalarPerVector, // TUNABLE
-              MaskingSpec>; // MaskingSpecialization
+      constexpr ck::index_t kGemm1NPerBlock = 32;
+      constexpr ck::index_t kGemm1NXdlPerWave = 1;
+      constexpr ck::index_t kCShuffleNXdlPerWavePerShuffle = 1;
+
+      using DeviceOpInstance = DeviceOpInstanceTemp<
+          kGemm1NPerBlock,
+          kGemm1NXdlPerWave,
+          kCShuffleNXdlPerWavePerShuffle>;
 
       RunWithDeviceOp<DeviceOpInstance>(param, stream);
     } else if (param.K < 64 && param.Kv < 64) {
-      using DeviceOpInstance = ck::tensor_operation::device::
-          DeviceGroupedMultiheadAttentionInfer_Xdl_CShuffle<
-              NumDimG,
-              NumDimM,
-              NumDimN,
-              NumDimK,
-              NumDimO,
-              ADataType,
-              B0DataType,
-              B1DataType,
-              CDataType,
-              Acc0BiasDataType,
-              Acc1BiasDataType,
-              AccDataType,
-              CShuffleDataType,
-              AElementOp,
-              B0ElementOp,
-              Acc0ElementOp,
-              B1ElementOp,
-              CElementOp,
-              GemmSpec,
-              TensorSpecA,
-              TensorSpecB0,
-              TensorSpecB1,
-              TensorSpecC,
-              1,
-              256,
-              128, // MPerBlock
-              128, // NPerBlock
-              32, // KPerBlock
-              64, // Gemm1NPerBlock
-              32, // Gemm1KPerBlock
-              8, // AK1
-              8, // BK1
-              2, // B1K1
-              32, // MPerXDL
-              32, // NPerXDL
-              1, // MXdlPerWave
-              4, // NXdlPerWave
-              2, // Gemm1NXdlPerWave
-              S<4, 64, 1>, // ABlockTransfer
-              S<1, 0, 2>,
-              S<1, 0, 2>,
-              2,
-              ABBlockTransferSrcScalarPerVector, // TUNABLE
-              8,
-              true,
-              S<4, 64, 1>, // BBlockTransfer
-              S<1, 0, 2>,
-              S<1, 0, 2>,
-              2,
-              ABBlockTransferSrcScalarPerVector, // TUNABLE
-              8,
-              true,
-              Acc0BiasTransferSrcScalarPerVector,
-              S<8, 32, 1>, // B1BlockTransfer
-              S<0, 2, 1>,
-              S<0, 2, 1>,
-              1,
-              B1CShuffleBlockTransferScalarPerVector, // TUNABLE
-              2,
-              false,
-              1, // CShuffleMXdlPerWavePerShuffle
-              2, // CShuffleNXdlPerWavePerShuffle
-              S<1,
-                32,
-                1,
-                8>, // CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock
-              B1CShuffleBlockTransferScalarPerVector, // TUNABLE
-              MaskingSpec>; // MaskingSpecialization
+      constexpr ck::index_t kGemm1NPerBlock = 64;
+      constexpr ck::index_t kGemm1NXdlPerWave = 2;
+      constexpr ck::index_t kCShuffleNXdlPerWavePerShuffle = 2;
+
+      using DeviceOpInstance = DeviceOpInstanceTemp<
+          kGemm1NPerBlock,
+          kGemm1NXdlPerWave,
+          kCShuffleNXdlPerWavePerShuffle>;
 
       RunWithDeviceOp<DeviceOpInstance>(param, stream);
     } else {
-      using DeviceOpInstance = ck::tensor_operation::device::
-          DeviceGroupedMultiheadAttentionInfer_Xdl_CShuffle<
-              NumDimG,
-              NumDimM,
-              NumDimN,
-              NumDimK,
-              NumDimO,
-              ADataType,
-              B0DataType,
-              B1DataType,
-              CDataType,
-              Acc0BiasDataType,
-              Acc1BiasDataType,
-              AccDataType,
-              CShuffleDataType,
-              AElementOp,
-              B0ElementOp,
-              Acc0ElementOp,
-              B1ElementOp,
-              CElementOp,
-              GemmSpec,
-              TensorSpecA,
-              TensorSpecB0,
-              TensorSpecB1,
-              TensorSpecC,
-              1,
-              256,
-              128, // MPerBlock
-              128, // NPerBlock
-              32, // KPerBlock
-              128, // Gemm1NPerBlock
-              32, // Gemm1KPerBlock
-              8, // AK1
-              8, // BK1
-              2, // B1K1
-              32, // MPerXDL
-              32, // NPerXDL
-              1, // MXdlPerWave
-              4, // NXdlPerWave
-              4, // Gemm1NXdlPerWave
-              S<4, 64, 1>, // ABlockTransfer
-              S<1, 0, 2>,
-              S<1, 0, 2>,
-              2,
-              ABBlockTransferSrcScalarPerVector, // TUNABLE
-              8,
-              true,
-              S<4, 64, 1>, // BBlockTransfer
-              S<1, 0, 2>,
-              S<1, 0, 2>,
-              2,
-              ABBlockTransferSrcScalarPerVector, // TUNABLE
-              8,
-              true,
-              Acc0BiasTransferSrcScalarPerVector,
-              S<8, 32, 1>, // B1BlockTransfer
-              S<0, 2, 1>,
-              S<0, 2, 1>,
-              1,
-              B1CShuffleBlockTransferScalarPerVector, // TUNABLE
-              2,
-              false,
-              1, // CShuffleMXdlPerWavePerShuffle
-              4, // CShuffleNXdlPerWavePerShuffle
-              S<1,
-                32,
-                1,
-                8>, // CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock
-              B1CShuffleBlockTransferScalarPerVector, // TUNABLE
-              MaskingSpec>; // MaskingSpecialization
+      constexpr ck::index_t kGemm1NPerBlock = 128;
+      constexpr ck::index_t kGemm1NXdlPerWave = 4;
+      constexpr ck::index_t kCShuffleNXdlPerWavePerShuffle = 4;
+
+      using DeviceOpInstance = DeviceOpInstanceTemp<
+          kGemm1NPerBlock,
+          kGemm1NXdlPerWave,
+          kCShuffleNXdlPerWavePerShuffle>;
 
       RunWithDeviceOp<DeviceOpInstance>(param, stream);
     };

From 9aa4ad31f17166eee3ce8cb4f1361dda3d822feb Mon Sep 17 00:00:00 2001
From: Qianfeng Zhang <Qianfeng.Zhang@amd.com>
Date: Thu, 19 Oct 2023 12:34:13 +0000
Subject: [PATCH 097/641] Tiny fix in inference instance dispatch

---
 tests/test_mem_eff_attention_ck.py                       | 2 +-
 xformers/csrc/attention/hip_fmha/ck_fmha_batched_infer.h | 4 ++--
 xformers/csrc/attention/hip_fmha/ck_fmha_grouped_infer.h | 4 ++--
 3 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/tests/test_mem_eff_attention_ck.py b/tests/test_mem_eff_attention_ck.py
index 230477f09..787c9b3f2 100644
--- a/tests/test_mem_eff_attention_ck.py
+++ b/tests/test_mem_eff_attention_ck.py
@@ -738,7 +738,7 @@ def test_backward(
     if k > 128 or kv > 128:
         pytest.skip("head-dim length bigger than 128 is not supported by CK-FlashAttention")
 
-    if k % 2 != 0 or kv % 2 !=0:
+    if k % 2 != 0:
         pytest.skip("head-dim length must be an even value for CK-FlashAttention")
 
     if grad_out_contiguous is False:
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_infer.h b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_infer.h
index 0f6e106cb..e5396d437 100644
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_infer.h
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_infer.h
@@ -136,7 +136,7 @@ struct batched_infer_masktype_attnbias_dispatched {
           MaskingSpec>; // MaskingSpecialization
 
   static void Run(BatchedForwardParams& param, hipStream_t stream) {
-    if (param.K < 32 && param.Kv < 32) {
+    if (param.K <= 32 && param.Kv <= 32) {
       constexpr ck::index_t kGemm1NPerBlock = 32;
       constexpr ck::index_t kGemm1NXdlPerWave = 1;
       constexpr ck::index_t kCShuffleNXdlPerWavePerShuffle = 1;
@@ -147,7 +147,7 @@ struct batched_infer_masktype_attnbias_dispatched {
           kCShuffleNXdlPerWavePerShuffle>;
 
       RunWithDeviceOp<DeviceOpInstance>(param, stream);
-    } else if (param.K < 64 && param.Kv < 64) {
+    } else if (param.K <= 64 && param.Kv <= 64) {
       constexpr ck::index_t kGemm1NPerBlock = 64;
       constexpr ck::index_t kGemm1NXdlPerWave = 2;
       constexpr ck::index_t kCShuffleNXdlPerWavePerShuffle = 2;
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_infer.h b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_infer.h
index 918020eba..22faf161b 100644
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_infer.h
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_infer.h
@@ -137,7 +137,7 @@ struct grouped_infer_masktype_attnbias_dispatched {
           MaskingSpec>; // MaskingSpecialization
 
   static void Run(GroupedForwardParams& param, hipStream_t stream) {
-    if (param.K < 32 && param.Kv < 32) {
+    if (param.K <= 32 && param.Kv <= 32) {
       constexpr ck::index_t kGemm1NPerBlock = 32;
       constexpr ck::index_t kGemm1NXdlPerWave = 1;
       constexpr ck::index_t kCShuffleNXdlPerWavePerShuffle = 1;
@@ -148,7 +148,7 @@ struct grouped_infer_masktype_attnbias_dispatched {
           kCShuffleNXdlPerWavePerShuffle>;
 
       RunWithDeviceOp<DeviceOpInstance>(param, stream);
-    } else if (param.K < 64 && param.Kv < 64) {
+    } else if (param.K <= 64 && param.Kv <= 64) {
       constexpr ck::index_t kGemm1NPerBlock = 64;
       constexpr ck::index_t kGemm1NXdlPerWave = 2;
       constexpr ck::index_t kCShuffleNXdlPerWavePerShuffle = 2;

From 7b41d9e604677bfda46670e85b5733292315923f Mon Sep 17 00:00:00 2001
From: Qianfeng Zhang <Qianfeng.Zhang@amd.com>
Date: Thu, 19 Oct 2023 16:38:01 +0000
Subject: [PATCH 098/641] Use Deterministic (true) in backward instances

---
 xformers/csrc/attention/hip_fmha/ck_fmha_batched_backward.h | 2 +-
 xformers/csrc/attention/hip_fmha/ck_fmha_grouped_backward.h | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_backward.h b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_backward.h
index f339691a7..be1a91b3d 100644
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_backward.h
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_backward.h
@@ -55,7 +55,7 @@ struct batched_backward_masktype_attnbias_dispatched {
       ck::tensor_operation::device::TensorSpecialization::Default;
   static constexpr auto TensorSpecY =
       ck::tensor_operation::device::TensorSpecialization::Default;
-  static constexpr bool Deterministic = false;
+  static constexpr bool Deterministic = true;
 
   static void Run(BatchedBackwardParams& param, hipStream_t stream) {
     // Tunables
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_backward.h b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_backward.h
index a93e67082..f6cd4d732 100644
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_backward.h
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_backward.h
@@ -56,7 +56,7 @@ struct grouped_backward_masktype_attnbias_dispatched {
       ck::tensor_operation::device::TensorSpecialization::Default;
   static constexpr auto TensorSpecY =
       ck::tensor_operation::device::TensorSpecialization::Default;
-  static constexpr bool Deterministic = false;
+  static constexpr bool Deterministic = true;
 
   static void Run(GroupedBackwardParams& param, hipStream_t stream) {
     // Tunables

From 329fee186c95d247403defde9a475881d9d01555 Mon Sep 17 00:00:00 2001
From: Qianfeng Zhang <Qianfeng.Zhang@amd.com>
Date: Thu, 19 Oct 2023 23:28:33 +0000
Subject: [PATCH 099/641] Add env-variable for enable/disable fp32 gradience
 output for q/k/v

---
 .../hip_fmha/attention_backward_generic.cpp   |  63 +++++++--
 .../hip_fmha/ck_fmha_batched_backward.h       |   9 +-
 .../ck_fmha_batched_backward_bp16.cpp         | 123 +++++++++++++----
 ..._fmha_batched_backward_bp16_masktype_0.cpp |  28 ++++
 ..._fmha_batched_backward_bp16_masktype_1.cpp |  28 ++++
 ..._fmha_batched_backward_bp16_masktype_2.cpp |  28 ++++
 .../ck_fmha_batched_backward_fp16.cpp         | 123 +++++++++++++----
 ..._fmha_batched_backward_fp16_masktype_0.cpp |  28 ++++
 ..._fmha_batched_backward_fp16_masktype_1.cpp |  28 ++++
 ..._fmha_batched_backward_fp16_masktype_2.cpp |  28 ++++
 .../hip_fmha/ck_fmha_grouped_backward.h       |   9 +-
 .../ck_fmha_grouped_backward_bp16.cpp         | 124 ++++++++++++++----
 ..._fmha_grouped_backward_bp16_masktype_0.cpp |  28 ++++
 ..._fmha_grouped_backward_bp16_masktype_1.cpp |  29 ++++
 ..._fmha_grouped_backward_bp16_masktype_2.cpp |  28 ++++
 .../ck_fmha_grouped_backward_fp16.cpp         | 123 +++++++++++++----
 ..._fmha_grouped_backward_fp16_masktype_0.cpp |  28 ++++
 ..._fmha_grouped_backward_fp16_masktype_1.cpp |  28 ++++
 ..._fmha_grouped_backward_fp16_masktype_2.cpp |  28 ++++
 .../csrc/attention/hip_fmha/ck_fmha_params.h  |   4 +
 .../attention/hip_fmha/ck_static_switch.h     |  23 ++++
 21 files changed, 832 insertions(+), 106 deletions(-)
 create mode 100644 xformers/csrc/attention/hip_fmha/ck_fmha_batched_backward_bp16_masktype_0.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/ck_fmha_batched_backward_bp16_masktype_1.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/ck_fmha_batched_backward_bp16_masktype_2.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/ck_fmha_batched_backward_fp16_masktype_0.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/ck_fmha_batched_backward_fp16_masktype_1.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/ck_fmha_batched_backward_fp16_masktype_2.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/ck_fmha_grouped_backward_bp16_masktype_0.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/ck_fmha_grouped_backward_bp16_masktype_1.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/ck_fmha_grouped_backward_bp16_masktype_2.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/ck_fmha_grouped_backward_fp16_masktype_0.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/ck_fmha_grouped_backward_fp16_masktype_1.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/ck_fmha_grouped_backward_fp16_masktype_2.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/ck_static_switch.h

diff --git a/xformers/csrc/attention/hip_fmha/attention_backward_generic.cpp b/xformers/csrc/attention/hip_fmha/attention_backward_generic.cpp
index a234df42a..c142352e0 100644
--- a/xformers/csrc/attention/hip_fmha/attention_backward_generic.cpp
+++ b/xformers/csrc/attention/hip_fmha/attention_backward_generic.cpp
@@ -1,4 +1,5 @@
 #include <cmath>
+#include <cstdlib>
 
 #include <ATen/Context.h>
 #include <ATen/ScalarOps.h>
@@ -109,6 +110,12 @@ efficient_attention_backward_ck(
     TORCH_CHECK(max_seqlen_q_.has_value());
   }
 
+  bool use_fp32_qkv_grad = false;
+
+  if (const char* env_str = std::getenv("USE_FP32_QKV_GRAD")) {
+    use_fp32_qkv_grad = (std::stoi(env_str) > 0) ? true : false;
+  };
+
   // at::cuda::CUDAGuard device_guard(query.device());
   hipStream_t stream = at::cuda::getCurrentHIPStream().stream();
 
@@ -131,7 +138,11 @@ efficient_attention_backward_ck(
     // output of a linear layer that is chunked.
     // Creating the gradients with the right layout saves us
     // a `torch.cat` call in the backward pass
-    at::Tensor chunk = at::empty({B, M, 3, num_heads, K}, opts);
+    at::Tensor chunk;
+    if (use_fp32_qkv_grad)
+      chunk = at::empty({B, M, 3, num_heads, K}, opts.dtype(at::kFloat));
+    else
+      chunk = at::empty({B, M, 3, num_heads, K}, opts);
     grad_q = chunk.select(2, 0);
     grad_k = chunk.select(2, 1);
     grad_v = chunk.select(2, 2);
@@ -144,16 +155,36 @@ efficient_attention_backward_ck(
     // output of a linear layer that is chunked.
     // Creating the gradients with the right layout saves us
     // a `torch.cat` call in the backward pass
-    at::Tensor chunk = at::empty({B, N, 2, num_heads, Kv}, opts);
+    at::Tensor chunk;
+    if (use_fp32_qkv_grad)
+      chunk = at::empty({B, N, 2, num_heads, Kv}, opts.dtype(at::kFloat));
+    else
+      chunk = at::empty({B, N, 2, num_heads, Kv}, opts);
     grad_k = chunk.select(2, 0);
     grad_v = chunk.select(2, 1);
 
-    grad_q = at::empty_strided(query.sizes(), query.strides(), query.options());
+    if (use_fp32_qkv_grad)
+      grad_q = at::empty_strided(
+          query.sizes(), query.strides(), query.options().dtype(at::kFloat));
+    else
+      grad_q =
+          at::empty_strided(query.sizes(), query.strides(), query.options());
     grad_q.fill_(0);
   } else {
-    grad_q = at::empty_strided(query.sizes(), query.strides(), query.options());
-    grad_k = at::empty_strided(key.sizes(), key.strides(), key.options());
-    grad_v = at::empty_strided(value.sizes(), value.strides(), value.options());
+    if (use_fp32_qkv_grad) {
+      grad_q = at::empty_strided(
+          query.sizes(), query.strides(), query.options().dtype(at::kFloat));
+      grad_k = at::empty_strided(
+          key.sizes(), key.strides(), key.options().dtype(at::kFloat));
+      grad_v = at::empty_strided(
+          value.sizes(), value.strides(), value.options().dtype(at::kFloat));
+    } else {
+      grad_q =
+          at::empty_strided(query.sizes(), query.strides(), query.options());
+      grad_k = at::empty_strided(key.sizes(), key.strides(), key.options());
+      grad_v =
+          at::empty_strided(value.sizes(), value.strides(), value.options());
+    }
     grad_q.fill_(0);
   }
 
@@ -167,6 +198,8 @@ efficient_attention_backward_ck(
 
   const bool bias_requires_grad = bias.has_value() && bias->requires_grad();
 
+  // even it is an output, the grad_bias is required to use the same data-type
+  // as bias in CK-FlashAttn
   if (bias_requires_grad)
     grad_bias =
         at::empty_strided(bias->sizes(), bias->strides(), bias->options());
@@ -179,6 +212,8 @@ efficient_attention_backward_ck(
     p.K = K;
     p.Kv = Kv;
 
+    p.use_fp32_qkv_grad = use_fp32_qkv_grad;
+
     TORCH_CHECK(p.B == logsumexp.size(0));
     TORCH_CHECK(p.num_heads == logsumexp.size(1));
     TORCH_CHECK(p.M == logsumexp.size(2));
@@ -263,6 +298,8 @@ efficient_attention_backward_ck(
     p.K = K;
     p.Kv = Kv;
 
+    p.use_fp32_qkv_grad = use_fp32_qkv_grad;
+
     p.max_seqlen_q = *max_seqlen_q_;
 
     TORCH_CHECK(p.num_batches == logsumexp.size(0));
@@ -357,6 +394,14 @@ efficient_attention_backward_ck(
         ? reinterpret_cast<char*>(grad_bias.data_ptr())
         : nullptr;
 
+    int multiplier = 1;
+
+    if (p.use_fp32_qkv_grad)
+      multiplier = get_size_in_bytes(1, at::ScalarType::Float) /
+          get_size_in_bytes(1, query.scalar_type());
+
+    std::cout << "qkv-grad precision multiplier is " << multiplier << std::endl;
+
     for (int i = 0; i < p.num_batches; i++) {
       size_t tmp_q_offset = get_size_in_bytes(
           static_cast<size_t>(p.host_seqstart_q[i]) * p.q_strides[0],
@@ -376,15 +421,15 @@ efficient_attention_backward_ck(
 
       p.q_ptrs.push_back(reinterpret_cast<void*>(&q_ptr[tmp_q_offset]));
       p.grad_q_ptrs.push_back(
-          reinterpret_cast<void*>(&grad_q_ptr[tmp_q_offset]));
+          reinterpret_cast<void*>(&grad_q_ptr[tmp_q_offset * multiplier]));
 
       p.k_ptrs.push_back(reinterpret_cast<void*>(&k_ptr[tmp_k_offset]));
       p.grad_k_ptrs.push_back(
-          reinterpret_cast<void*>(&grad_k_ptr[tmp_k_offset]));
+          reinterpret_cast<void*>(&grad_k_ptr[tmp_k_offset * multiplier]));
 
       p.v_ptrs.push_back(reinterpret_cast<void*>(&v_ptr[tmp_v_offset]));
       p.grad_v_ptrs.push_back(
-          reinterpret_cast<void*>(&grad_v_ptr[tmp_v_offset]));
+          reinterpret_cast<void*>(&grad_v_ptr[tmp_v_offset * multiplier]));
 
       p.out_ptrs.push_back(reinterpret_cast<void*>(&out_ptr[tmp_o_offset]));
       p.grad_out_ptrs.push_back(
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_backward.h b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_backward.h
index be1a91b3d..317e3b54c 100644
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_backward.h
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_backward.h
@@ -13,7 +13,11 @@
 #include "ck_fmha_op_helper.h"
 #include "ck_fmha_params.h"
 
-template <typename scalar_t, int32_t custom_mask_type, bool has_attn_bias>
+template <
+    typename scalar_t,
+    int32_t custom_mask_type,
+    bool has_attn_bias,
+    bool use_fp32_qkv_grad>
 struct batched_backward_masktype_attnbias_dispatched {
   using PassThrough = ck::tensor_operation::element_wise::PassThrough;
   using Scale = ck::tensor_operation::element_wise::Scale;
@@ -22,7 +26,8 @@ struct batched_backward_masktype_attnbias_dispatched {
   using YElementOp = PassThrough;
 
   using InputDataType = scalar_t;
-  using OutputDataType = scalar_t;
+  using OutputDataType =
+      typename std::conditional<use_fp32_qkv_grad, F32, scalar_t>::type;
   using GemmDataType = scalar_t;
   using AccDataType = F32;
   using ShuffleDataType = F32;
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_backward_bp16.cpp b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_backward_bp16.cpp
index 8f23dc9b3..5b6ec3c2b 100644
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_backward_bp16.cpp
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_backward_bp16.cpp
@@ -2,29 +2,106 @@
 #include <stdexcept>
 
 #include "ck_fmha_batched_backward.h"
+#include "ck_static_switch.h"
+
+extern template struct batched_backward_masktype_attnbias_dispatched<
+    ck::bhalf_t,
+    0,
+    true,
+    true>;
+
+extern template struct batched_backward_masktype_attnbias_dispatched<
+    ck::bhalf_t,
+    0,
+    true,
+    false>;
+
+extern template struct batched_backward_masktype_attnbias_dispatched<
+    ck::bhalf_t,
+    0,
+    false,
+    true>;
+
+extern template struct batched_backward_masktype_attnbias_dispatched<
+    ck::bhalf_t,
+    0,
+    false,
+    false>;
+
+extern template struct batched_backward_masktype_attnbias_dispatched<
+    ck::bhalf_t,
+    1,
+    true,
+    true>;
+
+extern template struct batched_backward_masktype_attnbias_dispatched<
+    ck::bhalf_t,
+    1,
+    true,
+    false>;
+
+extern template struct batched_backward_masktype_attnbias_dispatched<
+    ck::bhalf_t,
+    1,
+    false,
+    true>;
+
+extern template struct batched_backward_masktype_attnbias_dispatched<
+    ck::bhalf_t,
+    1,
+    false,
+    false>;
+
+extern template struct batched_backward_masktype_attnbias_dispatched<
+    ck::bhalf_t,
+    2,
+    true,
+    true>;
+
+extern template struct batched_backward_masktype_attnbias_dispatched<
+    ck::bhalf_t,
+    2,
+    true,
+    false>;
+
+extern template struct batched_backward_masktype_attnbias_dispatched<
+    ck::bhalf_t,
+    2,
+    false,
+    true>;
+
+extern template struct batched_backward_masktype_attnbias_dispatched<
+    ck::bhalf_t,
+    2,
+    false,
+    false>;
 
 void batched_backward_bp16(BatchedBackwardParams& param, hipStream_t stream) {
-  if (param.custom_mask_type == 0) {
-    if (param.has_attn_bias)
-      batched_backward_masktype_attnbias_dispatched<ck::bhalf_t, 0, true>::Run(
-          param, stream);
-    else
-      batched_backward_masktype_attnbias_dispatched<ck::bhalf_t, 0, false>::Run(
-          param, stream);
-  } else if (param.custom_mask_type == 1) {
-    if (param.has_attn_bias)
-      batched_backward_masktype_attnbias_dispatched<ck::bhalf_t, 1, true>::Run(
-          param, stream);
-    else
-      batched_backward_masktype_attnbias_dispatched<ck::bhalf_t, 1, false>::Run(
-          param, stream);
-  } else if (param.custom_mask_type == 2) {
-    if (param.has_attn_bias)
-      batched_backward_masktype_attnbias_dispatched<ck::bhalf_t, 2, true>::Run(
-          param, stream);
-    else
-      batched_backward_masktype_attnbias_dispatched<ck::bhalf_t, 2, false>::Run(
-          param, stream);
-  } else
-    throw std::runtime_error("Invalid custom_mask_type value");
+  BOOL_SWITCH_2(
+      param.has_attn_bias,
+      HAS_ATTN_BIAS,
+      param.use_fp32_qkv_grad,
+      USE_FP32_QKV_GRAD,
+      [&] {
+        if (param.custom_mask_type == 0)
+          batched_backward_masktype_attnbias_dispatched<
+              ck::bhalf_t,
+              0,
+              HAS_ATTN_BIAS,
+              USE_FP32_QKV_GRAD>::Run(param, stream);
+        else if (param.custom_mask_type == 1)
+          batched_backward_masktype_attnbias_dispatched<
+              ck::bhalf_t,
+              1,
+              HAS_ATTN_BIAS,
+              USE_FP32_QKV_GRAD>::Run(param, stream);
+        else if (param.custom_mask_type == 2)
+          batched_backward_masktype_attnbias_dispatched<
+              ck::bhalf_t,
+              2,
+              HAS_ATTN_BIAS,
+              USE_FP32_QKV_GRAD>::Run(param, stream);
+        else
+          throw std::runtime_error("Invalid custom_mask_type value");
+      });
 };
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_backward_bp16_masktype_0.cpp b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_backward_bp16_masktype_0.cpp
new file mode 100644
index 000000000..3b27b27f7
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_backward_bp16_masktype_0.cpp
@@ -0,0 +1,28 @@
+#include <ck.hpp>
+#include <stdexcept>
+
+#include "ck_fmha_batched_backward.h"
+
+template struct batched_backward_masktype_attnbias_dispatched<
+    ck::bhalf_t,
+    0,
+    true,
+    true>;
+
+template struct batched_backward_masktype_attnbias_dispatched<
+    ck::bhalf_t,
+    0,
+    true,
+    false>;
+
+template struct batched_backward_masktype_attnbias_dispatched<
+    ck::bhalf_t,
+    0,
+    false,
+    true>;
+
+template struct batched_backward_masktype_attnbias_dispatched<
+    ck::bhalf_t,
+    0,
+    false,
+    false>;
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_backward_bp16_masktype_1.cpp b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_backward_bp16_masktype_1.cpp
new file mode 100644
index 000000000..a59443dc0
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_backward_bp16_masktype_1.cpp
@@ -0,0 +1,28 @@
+#include <ck.hpp>
+#include <stdexcept>
+
+#include "ck_fmha_batched_backward.h"
+
+template struct batched_backward_masktype_attnbias_dispatched<
+    ck::bhalf_t,
+    1,
+    true,
+    true>;
+
+template struct batched_backward_masktype_attnbias_dispatched<
+    ck::bhalf_t,
+    1,
+    true,
+    false>;
+
+template struct batched_backward_masktype_attnbias_dispatched<
+    ck::bhalf_t,
+    1,
+    false,
+    true>;
+
+template struct batched_backward_masktype_attnbias_dispatched<
+    ck::bhalf_t,
+    1,
+    false,
+    false>;
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_backward_bp16_masktype_2.cpp b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_backward_bp16_masktype_2.cpp
new file mode 100644
index 000000000..28396507c
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_backward_bp16_masktype_2.cpp
@@ -0,0 +1,28 @@
+#include <ck.hpp>
+#include <stdexcept>
+
+#include "ck_fmha_batched_backward.h"
+
+template struct batched_backward_masktype_attnbias_dispatched<
+    ck::bhalf_t,
+    2,
+    true,
+    true>;
+
+template struct batched_backward_masktype_attnbias_dispatched<
+    ck::bhalf_t,
+    2,
+    true,
+    false>;
+
+template struct batched_backward_masktype_attnbias_dispatched<
+    ck::bhalf_t,
+    2,
+    false,
+    true>;
+
+template struct batched_backward_masktype_attnbias_dispatched<
+    ck::bhalf_t,
+    2,
+    false,
+    false>;
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_backward_fp16.cpp b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_backward_fp16.cpp
index dd77a559a..a6f09ea54 100644
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_backward_fp16.cpp
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_backward_fp16.cpp
@@ -2,29 +2,106 @@
 #include <stdexcept>
 
 #include "ck_fmha_batched_backward.h"
+#include "ck_static_switch.h"
+
+extern template struct batched_backward_masktype_attnbias_dispatched<
+    ck::half_t,
+    0,
+    true,
+    true>;
+
+extern template struct batched_backward_masktype_attnbias_dispatched<
+    ck::half_t,
+    0,
+    true,
+    false>;
+
+extern template struct batched_backward_masktype_attnbias_dispatched<
+    ck::half_t,
+    0,
+    false,
+    true>;
+
+extern template struct batched_backward_masktype_attnbias_dispatched<
+    ck::half_t,
+    0,
+    false,
+    false>;
+
+extern template struct batched_backward_masktype_attnbias_dispatched<
+    ck::half_t,
+    1,
+    true,
+    true>;
+
+extern template struct batched_backward_masktype_attnbias_dispatched<
+    ck::half_t,
+    1,
+    true,
+    false>;
+
+extern template struct batched_backward_masktype_attnbias_dispatched<
+    ck::half_t,
+    1,
+    false,
+    true>;
+
+extern template struct batched_backward_masktype_attnbias_dispatched<
+    ck::half_t,
+    1,
+    false,
+    false>;
+
+extern template struct batched_backward_masktype_attnbias_dispatched<
+    ck::half_t,
+    2,
+    true,
+    true>;
+
+extern template struct batched_backward_masktype_attnbias_dispatched<
+    ck::half_t,
+    2,
+    true,
+    false>;
+
+extern template struct batched_backward_masktype_attnbias_dispatched<
+    ck::half_t,
+    2,
+    false,
+    true>;
+
+extern template struct batched_backward_masktype_attnbias_dispatched<
+    ck::half_t,
+    2,
+    false,
+    false>;
 
 void batched_backward_fp16(BatchedBackwardParams& param, hipStream_t stream) {
-  if (param.custom_mask_type == 0) {
-    if (param.has_attn_bias)
-      batched_backward_masktype_attnbias_dispatched<ck::half_t, 0, true>::Run(
-          param, stream);
-    else
-      batched_backward_masktype_attnbias_dispatched<ck::half_t, 0, false>::Run(
-          param, stream);
-  } else if (param.custom_mask_type == 1) {
-    if (param.has_attn_bias)
-      batched_backward_masktype_attnbias_dispatched<ck::half_t, 1, true>::Run(
-          param, stream);
-    else
-      batched_backward_masktype_attnbias_dispatched<ck::half_t, 1, false>::Run(
-          param, stream);
-  } else if (param.custom_mask_type == 2) {
-    if (param.has_attn_bias)
-      batched_backward_masktype_attnbias_dispatched<ck::half_t, 2, true>::Run(
-          param, stream);
-    else
-      batched_backward_masktype_attnbias_dispatched<ck::half_t, 2, false>::Run(
-          param, stream);
-  } else
-    throw std::runtime_error("Invalid custom_mask_type value");
+  BOOL_SWITCH_2(
+      param.has_attn_bias,
+      HAS_ATTN_BIAS,
+      param.use_fp32_qkv_grad,
+      USE_FP32_QKV_GRAD,
+      [&] {
+        if (param.custom_mask_type == 0)
+          batched_backward_masktype_attnbias_dispatched<
+              ck::half_t,
+              0,
+              HAS_ATTN_BIAS,
+              USE_FP32_QKV_GRAD>::Run(param, stream);
+        else if (param.custom_mask_type == 1)
+          batched_backward_masktype_attnbias_dispatched<
+              ck::half_t,
+              1,
+              HAS_ATTN_BIAS,
+              USE_FP32_QKV_GRAD>::Run(param, stream);
+        else if (param.custom_mask_type == 2)
+          batched_backward_masktype_attnbias_dispatched<
+              ck::half_t,
+              2,
+              HAS_ATTN_BIAS,
+              USE_FP32_QKV_GRAD>::Run(param, stream);
+        else
+          throw std::runtime_error("Invalid custom_mask_type value");
+      });
 };
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_backward_fp16_masktype_0.cpp b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_backward_fp16_masktype_0.cpp
new file mode 100644
index 000000000..6b6d09949
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_backward_fp16_masktype_0.cpp
@@ -0,0 +1,28 @@
+#include <ck.hpp>
+#include <stdexcept>
+
+#include "ck_fmha_batched_backward.h"
+
+template struct batched_backward_masktype_attnbias_dispatched<
+    ck::half_t,
+    0,
+    true,
+    true>;
+
+template struct batched_backward_masktype_attnbias_dispatched<
+    ck::half_t,
+    0,
+    true,
+    false>;
+
+template struct batched_backward_masktype_attnbias_dispatched<
+    ck::half_t,
+    0,
+    false,
+    true>;
+
+template struct batched_backward_masktype_attnbias_dispatched<
+    ck::half_t,
+    0,
+    false,
+    false>;
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_backward_fp16_masktype_1.cpp b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_backward_fp16_masktype_1.cpp
new file mode 100644
index 000000000..c11fb2535
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_backward_fp16_masktype_1.cpp
@@ -0,0 +1,28 @@
+#include <ck.hpp>
+#include <stdexcept>
+
+#include "ck_fmha_batched_backward.h"
+
+template struct batched_backward_masktype_attnbias_dispatched<
+    ck::half_t,
+    1,
+    true,
+    true>;
+
+template struct batched_backward_masktype_attnbias_dispatched<
+    ck::half_t,
+    1,
+    true,
+    false>;
+
+template struct batched_backward_masktype_attnbias_dispatched<
+    ck::half_t,
+    1,
+    false,
+    true>;
+
+template struct batched_backward_masktype_attnbias_dispatched<
+    ck::half_t,
+    1,
+    false,
+    false>;
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_backward_fp16_masktype_2.cpp b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_backward_fp16_masktype_2.cpp
new file mode 100644
index 000000000..9dc0df5e9
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_backward_fp16_masktype_2.cpp
@@ -0,0 +1,28 @@
+#include <ck.hpp>
+#include <stdexcept>
+
+#include "ck_fmha_batched_backward.h"
+
+template struct batched_backward_masktype_attnbias_dispatched<
+    ck::half_t,
+    2,
+    true,
+    true>;
+
+template struct batched_backward_masktype_attnbias_dispatched<
+    ck::half_t,
+    2,
+    true,
+    false>;
+
+template struct batched_backward_masktype_attnbias_dispatched<
+    ck::half_t,
+    2,
+    false,
+    true>;
+
+template struct batched_backward_masktype_attnbias_dispatched<
+    ck::half_t,
+    2,
+    false,
+    false>;
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_backward.h b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_backward.h
index f6cd4d732..e0446bbcb 100644
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_backward.h
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_backward.h
@@ -14,7 +14,11 @@
 #include "ck_fmha_op_helper.h"
 #include "ck_fmha_params.h"
 
-template <typename scalar_t, int32_t custom_mask_type, bool has_attn_bias>
+template <
+    typename scalar_t,
+    int32_t custom_mask_type,
+    bool has_attn_bias,
+    bool use_fp32_qkv_grad>
 struct grouped_backward_masktype_attnbias_dispatched {
   using PassThrough = ck::tensor_operation::element_wise::PassThrough;
   using Scale = ck::tensor_operation::element_wise::Scale;
@@ -23,7 +27,8 @@ struct grouped_backward_masktype_attnbias_dispatched {
   using YElementOp = PassThrough;
 
   using InputDataType = scalar_t;
-  using OutputDataType = scalar_t;
+  using OutputDataType =
+      typename std::conditional<use_fp32_qkv_grad, F32, scalar_t>::type;
   using GemmDataType = scalar_t;
   using AccDataType = F32;
   using ShuffleDataType = F32;
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_backward_bp16.cpp b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_backward_bp16.cpp
index 5a9c50ba5..2d18eefe6 100644
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_backward_bp16.cpp
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_backward_bp16.cpp
@@ -2,30 +2,106 @@
 #include <stdexcept>
 
 #include "ck_fmha_grouped_backward.h"
+#include "ck_static_switch.h"
+
+extern template struct grouped_backward_masktype_attnbias_dispatched<
+    ck::bhalf_t,
+    0,
+    true,
+    true>;
+
+extern template struct grouped_backward_masktype_attnbias_dispatched<
+    ck::bhalf_t,
+    0,
+    true,
+    false>;
+
+extern template struct grouped_backward_masktype_attnbias_dispatched<
+    ck::bhalf_t,
+    0,
+    false,
+    true>;
+
+extern template struct grouped_backward_masktype_attnbias_dispatched<
+    ck::bhalf_t,
+    0,
+    false,
+    false>;
+
+extern template struct grouped_backward_masktype_attnbias_dispatched<
+    ck::bhalf_t,
+    1,
+    true,
+    true>;
+
+extern template struct grouped_backward_masktype_attnbias_dispatched<
+    ck::bhalf_t,
+    1,
+    true,
+    false>;
+
+extern template struct grouped_backward_masktype_attnbias_dispatched<
+    ck::bhalf_t,
+    1,
+    false,
+    true>;
+
+extern template struct grouped_backward_masktype_attnbias_dispatched<
+    ck::bhalf_t,
+    1,
+    false,
+    false>;
+
+extern template struct grouped_backward_masktype_attnbias_dispatched<
+    ck::bhalf_t,
+    2,
+    true,
+    true>;
+
+extern template struct grouped_backward_masktype_attnbias_dispatched<
+    ck::bhalf_t,
+    2,
+    true,
+    false>;
+
+extern template struct grouped_backward_masktype_attnbias_dispatched<
+    ck::bhalf_t,
+    2,
+    false,
+    true>;
+
+extern template struct grouped_backward_masktype_attnbias_dispatched<
+    ck::bhalf_t,
+    2,
+    false,
+    false>;
 
 void grouped_backward_bp16(GroupedBackwardParams& param, hipStream_t stream) {
-  if (param.custom_mask_type == 0) {
-    if (param.has_attn_bias)
-      grouped_backward_masktype_attnbias_dispatched<ck::bhalf_t, 0, true>::Run(
-          param, stream);
-    else
-      grouped_backward_masktype_attnbias_dispatched<ck::bhalf_t, 0, false>::Run(
-          param, stream);
-  } else if (param.custom_mask_type == 1) {
-    if (param.has_attn_bias)
-      grouped_backward_masktype_attnbias_dispatched<ck::bhalf_t, 1, true>::Run(
-          param, stream);
-    else
-      grouped_backward_masktype_attnbias_dispatched<ck::bhalf_t, 1, false>::Run(
-          param, stream);
-
-  } else if (param.custom_mask_type == 2) {
-    if (param.has_attn_bias)
-      grouped_backward_masktype_attnbias_dispatched<ck::bhalf_t, 2, true>::Run(
-          param, stream);
-    else
-      grouped_backward_masktype_attnbias_dispatched<ck::bhalf_t, 2, true>::Run(
-          param, stream);
-  } else
-    throw std::runtime_error("Invalid custom_mask_type value");
+  BOOL_SWITCH_2(
+      param.has_attn_bias,
+      HAS_ATTN_BIAS,
+      param.use_fp32_qkv_grad,
+      USE_FP32_QKV_GRAD,
+      [&] {
+        if (param.custom_mask_type == 0) {
+          grouped_backward_masktype_attnbias_dispatched<
+              ck::bhalf_t,
+              0,
+              HAS_ATTN_BIAS,
+              USE_FP32_QKV_GRAD>::Run(param, stream);
+        } else if (param.custom_mask_type == 1) {
+          grouped_backward_masktype_attnbias_dispatched<
+              ck::bhalf_t,
+              1,
+              HAS_ATTN_BIAS,
+              USE_FP32_QKV_GRAD>::Run(param, stream);
+        } else if (param.custom_mask_type == 2) {
+          grouped_backward_masktype_attnbias_dispatched<
+              ck::bhalf_t,
+              2,
+              HAS_ATTN_BIAS,
+              USE_FP32_QKV_GRAD>::Run(param, stream);
+        } else
+          throw std::runtime_error("Invalid custom_mask_type value");
+      });
 };
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_backward_bp16_masktype_0.cpp b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_backward_bp16_masktype_0.cpp
new file mode 100644
index 000000000..703176268
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_backward_bp16_masktype_0.cpp
@@ -0,0 +1,28 @@
+#include <ck/ck.hpp>
+#include <stdexcept>
+
+#include "ck_fmha_grouped_backward.h"
+
+template struct grouped_backward_masktype_attnbias_dispatched<
+    ck::bhalf_t,
+    0,
+    true,
+    true>;
+
+template struct grouped_backward_masktype_attnbias_dispatched<
+    ck::bhalf_t,
+    0,
+    true,
+    false>;
+
+template struct grouped_backward_masktype_attnbias_dispatched<
+    ck::bhalf_t,
+    0,
+    false,
+    true>;
+
+template struct grouped_backward_masktype_attnbias_dispatched<
+    ck::bhalf_t,
+    0,
+    false,
+    false>;
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_backward_bp16_masktype_1.cpp b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_backward_bp16_masktype_1.cpp
new file mode 100644
index 000000000..2892cd129
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_backward_bp16_masktype_1.cpp
@@ -0,0 +1,29 @@
+#include <ck/ck.hpp>
+#include <stdexcept>
+
+#include "ck_fmha_grouped_backward.h"
+#include "ck_static_switch.h"
+
+template struct grouped_backward_masktype_attnbias_dispatched<
+    ck::bhalf_t,
+    1,
+    true,
+    true>;
+
+template struct grouped_backward_masktype_attnbias_dispatched<
+    ck::bhalf_t,
+    1,
+    true,
+    false>;
+
+template struct grouped_backward_masktype_attnbias_dispatched<
+    ck::bhalf_t,
+    1,
+    false,
+    true>;
+
+template struct grouped_backward_masktype_attnbias_dispatched<
+    ck::bhalf_t,
+    1,
+    false,
+    false>;
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_backward_bp16_masktype_2.cpp b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_backward_bp16_masktype_2.cpp
new file mode 100644
index 000000000..535ea659d
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_backward_bp16_masktype_2.cpp
@@ -0,0 +1,28 @@
+#include <ck/ck.hpp>
+#include <stdexcept>
+
+#include "ck_fmha_grouped_backward.h"
+
+template struct grouped_backward_masktype_attnbias_dispatched<
+    ck::bhalf_t,
+    2,
+    true,
+    true>;
+
+template struct grouped_backward_masktype_attnbias_dispatched<
+    ck::bhalf_t,
+    2,
+    true,
+    false>;
+
+template struct grouped_backward_masktype_attnbias_dispatched<
+    ck::bhalf_t,
+    2,
+    false,
+    true>;
+
+template struct grouped_backward_masktype_attnbias_dispatched<
+    ck::bhalf_t,
+    2,
+    false,
+    false>;
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_backward_fp16.cpp b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_backward_fp16.cpp
index 450632bd3..e06a7dc58 100644
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_backward_fp16.cpp
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_backward_fp16.cpp
@@ -2,29 +2,106 @@
 #include <stdexcept>
 
 #include "ck_fmha_grouped_backward.h"
+#include "ck_static_switch.h"
+
+extern template struct grouped_backward_masktype_attnbias_dispatched<
+    ck::half_t,
+    0,
+    true,
+    true>;
+
+extern template struct grouped_backward_masktype_attnbias_dispatched<
+    ck::half_t,
+    0,
+    true,
+    false>;
+
+extern template struct grouped_backward_masktype_attnbias_dispatched<
+    ck::half_t,
+    0,
+    false,
+    true>;
+
+extern template struct grouped_backward_masktype_attnbias_dispatched<
+    ck::half_t,
+    0,
+    false,
+    false>;
+
+extern template struct grouped_backward_masktype_attnbias_dispatched<
+    ck::half_t,
+    1,
+    true,
+    true>;
+
+extern template struct grouped_backward_masktype_attnbias_dispatched<
+    ck::half_t,
+    1,
+    true,
+    false>;
+
+extern template struct grouped_backward_masktype_attnbias_dispatched<
+    ck::half_t,
+    1,
+    false,
+    true>;
+
+extern template struct grouped_backward_masktype_attnbias_dispatched<
+    ck::half_t,
+    1,
+    false,
+    false>;
+
+extern template struct grouped_backward_masktype_attnbias_dispatched<
+    ck::half_t,
+    2,
+    true,
+    true>;
+
+extern template struct grouped_backward_masktype_attnbias_dispatched<
+    ck::half_t,
+    2,
+    true,
+    false>;
+
+extern template struct grouped_backward_masktype_attnbias_dispatched<
+    ck::half_t,
+    2,
+    false,
+    true>;
+
+extern template struct grouped_backward_masktype_attnbias_dispatched<
+    ck::half_t,
+    2,
+    false,
+    false>;
 
 void grouped_backward_fp16(GroupedBackwardParams& param, hipStream_t stream) {
-  if (param.custom_mask_type == 0) {
-    if (param.has_attn_bias)
-      grouped_backward_masktype_attnbias_dispatched<ck::half_t, 0, true>::Run(
-          param, stream);
-    else
-      grouped_backward_masktype_attnbias_dispatched<ck::half_t, 0, false>::Run(
-          param, stream);
-  } else if (param.custom_mask_type == 1) {
-    if (param.has_attn_bias)
-      grouped_backward_masktype_attnbias_dispatched<ck::half_t, 1, true>::Run(
-          param, stream);
-    else
-      grouped_backward_masktype_attnbias_dispatched<ck::half_t, 1, false>::Run(
-          param, stream);
-  } else if (param.custom_mask_type == 2) {
-    if (param.has_attn_bias)
-      grouped_backward_masktype_attnbias_dispatched<ck::half_t, 2, true>::Run(
-          param, stream);
-    else
-      grouped_backward_masktype_attnbias_dispatched<ck::half_t, 2, false>::Run(
-          param, stream);
-  } else
-    throw std::runtime_error("Invalid custom_mask_type value");
+  BOOL_SWITCH_2(
+      param.has_attn_bias,
+      HAS_ATTN_BIAS,
+      param.use_fp32_qkv_grad,
+      USE_FP32_QKV_GRAD,
+      [&] {
+        if (param.custom_mask_type == 0) {
+          grouped_backward_masktype_attnbias_dispatched<
+              ck::half_t,
+              0,
+              HAS_ATTN_BIAS,
+              USE_FP32_QKV_GRAD>::Run(param, stream);
+        } else if (param.custom_mask_type == 1) {
+          grouped_backward_masktype_attnbias_dispatched<
+              ck::half_t,
+              1,
+              HAS_ATTN_BIAS,
+              USE_FP32_QKV_GRAD>::Run(param, stream);
+        } else if (param.custom_mask_type == 2) {
+          grouped_backward_masktype_attnbias_dispatched<
+              ck::half_t,
+              2,
+              HAS_ATTN_BIAS,
+              USE_FP32_QKV_GRAD>::Run(param, stream);
+        } else
+          throw std::runtime_error("Invalid custom_mask_type value");
+      });
 };
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_backward_fp16_masktype_0.cpp b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_backward_fp16_masktype_0.cpp
new file mode 100644
index 000000000..409c2d159
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_backward_fp16_masktype_0.cpp
@@ -0,0 +1,28 @@
+#include <ck/ck.hpp>
+#include <stdexcept>
+
+#include "ck_fmha_grouped_backward.h"
+
+template struct grouped_backward_masktype_attnbias_dispatched<
+    ck::half_t,
+    0,
+    true,
+    true>;
+
+template struct grouped_backward_masktype_attnbias_dispatched<
+    ck::half_t,
+    0,
+    true,
+    false>;
+
+template struct grouped_backward_masktype_attnbias_dispatched<
+    ck::half_t,
+    0,
+    false,
+    true>;
+
+template struct grouped_backward_masktype_attnbias_dispatched<
+    ck::half_t,
+    0,
+    false,
+    false>;
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_backward_fp16_masktype_1.cpp b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_backward_fp16_masktype_1.cpp
new file mode 100644
index 000000000..9662fe529
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_backward_fp16_masktype_1.cpp
@@ -0,0 +1,28 @@
+#include <ck/ck.hpp>
+#include <stdexcept>
+
+#include "ck_fmha_grouped_backward.h"
+
+template struct grouped_backward_masktype_attnbias_dispatched<
+    ck::half_t,
+    1,
+    true,
+    true>;
+
+template struct grouped_backward_masktype_attnbias_dispatched<
+    ck::half_t,
+    1,
+    true,
+    false>;
+
+template struct grouped_backward_masktype_attnbias_dispatched<
+    ck::half_t,
+    1,
+    false,
+    true>;
+
+template struct grouped_backward_masktype_attnbias_dispatched<
+    ck::half_t,
+    1,
+    false,
+    false>;
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_backward_fp16_masktype_2.cpp b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_backward_fp16_masktype_2.cpp
new file mode 100644
index 000000000..d13fd9b05
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_backward_fp16_masktype_2.cpp
@@ -0,0 +1,28 @@
+#include <ck/ck.hpp>
+#include <stdexcept>
+
+#include "ck_fmha_grouped_backward.h"
+
+template struct grouped_backward_masktype_attnbias_dispatched<
+    ck::half_t,
+    2,
+    true,
+    true>;
+
+template struct grouped_backward_masktype_attnbias_dispatched<
+    ck::half_t,
+    2,
+    true,
+    false>;
+
+template struct grouped_backward_masktype_attnbias_dispatched<
+    ck::half_t,
+    2,
+    false,
+    true>;
+
+template struct grouped_backward_masktype_attnbias_dispatched<
+    ck::half_t,
+    2,
+    false,
+    false>;
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_params.h b/xformers/csrc/attention/hip_fmha/ck_fmha_params.h
index 73961d0a8..2778da001 100644
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_params.h
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_params.h
@@ -105,6 +105,8 @@ struct BatchedBackwardParams {
   bool has_attn_bias;
   bool bias_has_grad;
 
+  bool use_fp32_qkv_grad;
+
   // BMHK mode strides, last-dim contiguous
   std::array<int, 4> q_strides;
   std::array<int, 4> k_strides;
@@ -152,6 +154,8 @@ struct GroupedBackwardParams {
   bool has_attn_bias;
   bool bias_has_grad;
 
+  bool use_fp32_qkv_grad;
+
   // MHK mode strides, last-dim contiguous
   std::array<int, 3> q_strides;
   std::array<int, 3> k_strides;
diff --git a/xformers/csrc/attention/hip_fmha/ck_static_switch.h b/xformers/csrc/attention/hip_fmha/ck_static_switch.h
new file mode 100644
index 000000000..4e447a143
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/ck_static_switch.h
@@ -0,0 +1,23 @@
+#pragma once
+
+#define BOOL_SWITCH_1(COND1, CONST_NAME1, ...) \
+  [&] {                                        \
+    if (COND1) {                               \
+      constexpr bool CONST_NAME1 = true;       \
+      __VA_ARGS__();                           \
+    } else {                                   \
+      constexpr bool CONST_NAME1 = false;      \
+      __VA_ARGS__();                           \
+    }                                          \
+  }()
+
+#define BOOL_SWITCH_2(COND1, CONST_NAME1, COND2, CONST_NAME2, ...) \
+  [&] {                                                            \
+    if (COND1) {                                                   \
+      constexpr bool CONST_NAME1 = true;                           \
+      BOOL_SWITCH_1(COND2, CONST_NAME2, ##__VA_ARGS__);             \
+    } else {                                                       \
+      constexpr bool CONST_NAME1 = false;                          \
+      BOOL_SWITCH_1(COND2, CONST_NAME2, ##__VA_ARGS__);             \
+    }                                                              \
+  }()

From a59e87c29133e9144e24505cecea7e817130e7ad Mon Sep 17 00:00:00 2001
From: Qianfeng Zhang <Qianfeng.Zhang@amd.com>
Date: Fri, 20 Oct 2023 00:38:52 +0000
Subject: [PATCH 100/641] Simplify dispatching using BOOL_SWITCH and accelerate
 compiling by splitting C++ files (forward)

---
 .../hip_fmha/ck_fmha_batched_forward_bp16.cpp | 71 +++++++++++++------
 ...k_fmha_batched_forward_bp16_masktype_0.cpp | 14 ++++
 ...k_fmha_batched_forward_bp16_masktype_1.cpp | 14 ++++
 ...k_fmha_batched_forward_bp16_masktype_2.cpp | 14 ++++
 .../hip_fmha/ck_fmha_batched_forward_fp16.cpp | 71 +++++++++++++------
 ...k_fmha_batched_forward_fp16_masktype_0.cpp | 14 ++++
 ...k_fmha_batched_forward_fp16_masktype_1.cpp | 14 ++++
 ...k_fmha_batched_forward_fp16_masktype_2.cpp | 14 ++++
 .../hip_fmha/ck_fmha_grouped_forward_bp16.cpp | 53 +++++++++-----
 ...k_fmha_grouped_forward_bp16_masktype_0.cpp | 14 ++++
 ...k_fmha_grouped_forward_bp16_masktype_1.cpp | 14 ++++
 ...k_fmha_grouped_forward_bp16_masktype_2.cpp | 14 ++++
 .../hip_fmha/ck_fmha_grouped_forward_fp16.cpp | 53 +++++++++-----
 ...k_fmha_grouped_forward_fp16_masktype_0.cpp | 14 ++++
 ...k_fmha_grouped_forward_fp16_masktype_1.cpp | 14 ++++
 ...k_fmha_grouped_forward_fp16_masktype_2.cpp | 14 ++++
 16 files changed, 340 insertions(+), 76 deletions(-)
 create mode 100644 xformers/csrc/attention/hip_fmha/ck_fmha_batched_forward_bp16_masktype_0.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/ck_fmha_batched_forward_bp16_masktype_1.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/ck_fmha_batched_forward_bp16_masktype_2.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/ck_fmha_batched_forward_fp16_masktype_0.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/ck_fmha_batched_forward_fp16_masktype_1.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/ck_fmha_batched_forward_fp16_masktype_2.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/ck_fmha_grouped_forward_bp16_masktype_0.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/ck_fmha_grouped_forward_bp16_masktype_1.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/ck_fmha_grouped_forward_bp16_masktype_2.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/ck_fmha_grouped_forward_fp16_masktype_0.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/ck_fmha_grouped_forward_fp16_masktype_1.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/ck_fmha_grouped_forward_fp16_masktype_2.cpp

diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_forward_bp16.cpp b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_forward_bp16.cpp
index 7be431c38..6deae7724 100644
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_forward_bp16.cpp
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_forward_bp16.cpp
@@ -2,29 +2,56 @@
 #include <stdexcept>
 
 #include "ck_fmha_batched_forward.h"
+#include "ck_static_switch.h"
+
+extern template struct batched_forward_masktype_attnbias_dispatched<
+    ck::bhalf_t,
+    0,
+    true>;
+
+extern template struct batched_forward_masktype_attnbias_dispatched<
+    ck::bhalf_t,
+    0,
+    false>;
+
+extern template struct batched_forward_masktype_attnbias_dispatched<
+    ck::bhalf_t,
+    1,
+    true>;
+
+extern template struct batched_forward_masktype_attnbias_dispatched<
+    ck::bhalf_t,
+    1,
+    false>;
+
+extern template struct batched_forward_masktype_attnbias_dispatched<
+    ck::bhalf_t,
+    2,
+    true>;
+
+extern template struct batched_forward_masktype_attnbias_dispatched<
+    ck::bhalf_t,
+    2,
+    false>;
 
 void batched_forward_bp16(BatchedForwardParams& param, hipStream_t stream) {
-  if (param.custom_mask_type == 0) {
-    if (param.has_attn_bias)
-      batched_forward_masktype_attnbias_dispatched<ck::bhalf_t, 0, true>::Run(
-          param, stream);
-    else
-      batched_forward_masktype_attnbias_dispatched<ck::bhalf_t, 0, false>::Run(
-          param, stream);
-  } else if (param.custom_mask_type == 1) {
-    if (param.has_attn_bias)
-      batched_forward_masktype_attnbias_dispatched<ck::bhalf_t, 1, true>::Run(
-          param, stream);
-    else
-      batched_forward_masktype_attnbias_dispatched<ck::bhalf_t, 1, false>::Run(
-          param, stream);
-  } else if (param.custom_mask_type == 2) {
-    if (param.has_attn_bias)
-      batched_forward_masktype_attnbias_dispatched<ck::bhalf_t, 2, true>::Run(
-          param, stream);
+  BOOL_SWITCH_1(param.has_attn_bias, HAS_ATTN_BIAS, [&] {
+    if (param.custom_mask_type == 0)
+      batched_forward_masktype_attnbias_dispatched<
+          ck::bhalf_t,
+          0,
+          HAS_ATTN_BIAS>::Run(param, stream);
+    else if (param.custom_mask_type == 1)
+      batched_forward_masktype_attnbias_dispatched<
+          ck::bhalf_t,
+          1,
+          HAS_ATTN_BIAS>::Run(param, stream);
+    else if (param.custom_mask_type == 2)
+      batched_forward_masktype_attnbias_dispatched<
+          ck::bhalf_t,
+          2,
+          HAS_ATTN_BIAS>::Run(param, stream);
     else
-      batched_forward_masktype_attnbias_dispatched<ck::bhalf_t, 2, false>::Run(
-          param, stream);
-  } else
-    throw std::runtime_error("Invalid custom_mask_type value");
+      throw std::runtime_error("Invalid custom_mask_type value");
+  });
 };
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_forward_bp16_masktype_0.cpp b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_forward_bp16_masktype_0.cpp
new file mode 100644
index 000000000..3813bfbe2
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_forward_bp16_masktype_0.cpp
@@ -0,0 +1,14 @@
+#include <ck/ck.hpp>
+#include <stdexcept>
+
+#include "ck_fmha_batched_forward.h"
+
+template struct batched_forward_masktype_attnbias_dispatched<
+    ck::bhalf_t,
+    0,
+    true>;
+
+template struct batched_forward_masktype_attnbias_dispatched<
+    ck::bhalf_t,
+    0,
+    false>;
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_forward_bp16_masktype_1.cpp b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_forward_bp16_masktype_1.cpp
new file mode 100644
index 000000000..7ea33a2a9
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_forward_bp16_masktype_1.cpp
@@ -0,0 +1,14 @@
+#include <ck/ck.hpp>
+#include <stdexcept>
+
+#include "ck_fmha_batched_forward.h"
+
+template struct batched_forward_masktype_attnbias_dispatched<
+    ck::bhalf_t,
+    1,
+    true>;
+
+template struct batched_forward_masktype_attnbias_dispatched<
+    ck::bhalf_t,
+    1,
+    false>;
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_forward_bp16_masktype_2.cpp b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_forward_bp16_masktype_2.cpp
new file mode 100644
index 000000000..732704f62
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_forward_bp16_masktype_2.cpp
@@ -0,0 +1,14 @@
+#include <ck/ck.hpp>
+#include <stdexcept>
+
+#include "ck_fmha_batched_forward.h"
+
+template struct batched_forward_masktype_attnbias_dispatched<
+    ck::bhalf_t,
+    2,
+    true>;
+
+template struct batched_forward_masktype_attnbias_dispatched<
+    ck::bhalf_t,
+    2,
+    false>;
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_forward_fp16.cpp b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_forward_fp16.cpp
index 543a2c253..7e4b9cb8c 100644
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_forward_fp16.cpp
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_forward_fp16.cpp
@@ -2,29 +2,56 @@
 #include <stdexcept>
 
 #include "ck_fmha_batched_forward.h"
+#include "ck_static_switch.h"
+
+extern template struct batched_forward_masktype_attnbias_dispatched<
+    ck::half_t,
+    0,
+    true>;
+
+extern template struct batched_forward_masktype_attnbias_dispatched<
+    ck::half_t,
+    0,
+    false>;
+
+extern template struct batched_forward_masktype_attnbias_dispatched<
+    ck::half_t,
+    1,
+    true>;
+
+extern template struct batched_forward_masktype_attnbias_dispatched<
+    ck::half_t,
+    1,
+    false>;
+
+extern template struct batched_forward_masktype_attnbias_dispatched<
+    ck::half_t,
+    2,
+    true>;
+
+extern template struct batched_forward_masktype_attnbias_dispatched<
+    ck::half_t,
+    2,
+    false>;
 
 void batched_forward_fp16(BatchedForwardParams& param, hipStream_t stream) {
-  if (param.custom_mask_type == 0) {
-    if (param.has_attn_bias)
-      batched_forward_masktype_attnbias_dispatched<ck::half_t, 0, true>::Run(
-          param, stream);
-    else
-      batched_forward_masktype_attnbias_dispatched<ck::half_t, 0, false>::Run(
-          param, stream);
-  } else if (param.custom_mask_type == 1) {
-    if (param.has_attn_bias)
-      batched_forward_masktype_attnbias_dispatched<ck::half_t, 1, true>::Run(
-          param, stream);
-    else
-      batched_forward_masktype_attnbias_dispatched<ck::half_t, 1, false>::Run(
-          param, stream);
-  } else if (param.custom_mask_type == 2) {
-    if (param.has_attn_bias)
-      batched_forward_masktype_attnbias_dispatched<ck::half_t, 2, true>::Run(
-          param, stream);
+  BOOL_SWITCH_1(param.has_attn_bias, HAS_ATTN_BIAS, [&] {
+    if (param.custom_mask_type == 0)
+      batched_forward_masktype_attnbias_dispatched<
+          ck::half_t,
+          0,
+          HAS_ATTN_BIAS>::Run(param, stream);
+    else if (param.custom_mask_type == 1)
+      batched_forward_masktype_attnbias_dispatched<
+          ck::half_t,
+          1,
+          HAS_ATTN_BIAS>::Run(param, stream);
+    else if (param.custom_mask_type == 2)
+      batched_forward_masktype_attnbias_dispatched<
+          ck::half_t,
+          2,
+          HAS_ATTN_BIAS>::Run(param, stream);
     else
-      batched_forward_masktype_attnbias_dispatched<ck::half_t, 2, false>::Run(
-          param, stream);
-  } else
-    throw std::runtime_error("Invalid custom_mask_type value");
+      throw std::runtime_error("Invalid custom_mask_type value");
+  });
 };
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_forward_fp16_masktype_0.cpp b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_forward_fp16_masktype_0.cpp
new file mode 100644
index 000000000..a9fbc47d7
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_forward_fp16_masktype_0.cpp
@@ -0,0 +1,14 @@
+#include <ck/ck.hpp>
+#include <stdexcept>
+
+#include "ck_fmha_batched_forward.h"
+
+template struct batched_forward_masktype_attnbias_dispatched<
+    ck::half_t,
+    0,
+    true>;
+
+template struct batched_forward_masktype_attnbias_dispatched<
+    ck::half_t,
+    0,
+    false>;
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_forward_fp16_masktype_1.cpp b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_forward_fp16_masktype_1.cpp
new file mode 100644
index 000000000..7712f091f
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_forward_fp16_masktype_1.cpp
@@ -0,0 +1,14 @@
+#include <ck/ck.hpp>
+#include <stdexcept>
+
+#include "ck_fmha_batched_forward.h"
+
+template struct batched_forward_masktype_attnbias_dispatched<
+    ck::half_t,
+    1,
+    true>;
+
+template struct batched_forward_masktype_attnbias_dispatched<
+    ck::half_t,
+    1,
+    false>;
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_forward_fp16_masktype_2.cpp b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_forward_fp16_masktype_2.cpp
new file mode 100644
index 000000000..45874124e
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_forward_fp16_masktype_2.cpp
@@ -0,0 +1,14 @@
+#include <ck/ck.hpp>
+#include <stdexcept>
+
+#include "ck_fmha_batched_forward.h"
+
+template struct batched_forward_masktype_attnbias_dispatched<
+    ck::half_t,
+    2,
+    true>;
+
+template struct batched_forward_masktype_attnbias_dispatched<
+    ck::half_t,
+    2,
+    false>;
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_forward_bp16.cpp b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_forward_bp16.cpp
index e459d16d9..00f92bdae 100644
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_forward_bp16.cpp
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_forward_bp16.cpp
@@ -2,29 +2,50 @@
 #include <stdexcept>
 
 #include "ck_fmha_grouped_forward.h"
+#include "ck_static_switch.h"
+
+extern template struct grouped_forward_masktype_attnbias_dispatched<
+    ck::bhalf_t,
+    0,
+    true>;
+
+extern template struct grouped_forward_masktype_attnbias_dispatched<
+    ck::bhalf_t,
+    0,
+    false>;
+
+extern template struct grouped_forward_masktype_attnbias_dispatched<
+    ck::bhalf_t,
+    1,
+    true>;
+
+extern template struct grouped_forward_masktype_attnbias_dispatched<
+    ck::bhalf_t,
+    1,
+    false>;
+
+extern template struct grouped_forward_masktype_attnbias_dispatched<
+    ck::bhalf_t,
+    2,
+    true>;
+
+extern template struct grouped_forward_masktype_attnbias_dispatched<
+    ck::bhalf_t,
+    2,
+    false>;
 
 void grouped_forward_bp16(GroupedForwardParams& param, hipStream_t stream) {
-  if (param.custom_mask_type == 0) {
-    if (param.has_attn_bias)
+  BOOL_SWITCH_1(param.has_attn_bias, HAS_ATTN_BIAS, [&] {
+    if (param.custom_mask_type == 0)
       grouped_forward_masktype_attnbias_dispatched<ck::bhalf_t, 0, true>::Run(
           param, stream);
-    else
-      grouped_forward_masktype_attnbias_dispatched<ck::bhalf_t, 0, false>::Run(
-          param, stream);
-  } else if (param.custom_mask_type == 1) {
-    if (param.has_attn_bias)
+    else if (param.custom_mask_type == 1)
       grouped_forward_masktype_attnbias_dispatched<ck::bhalf_t, 1, true>::Run(
           param, stream);
-    else
-      grouped_forward_masktype_attnbias_dispatched<ck::bhalf_t, 1, false>::Run(
-          param, stream);
-  } else if (param.custom_mask_type == 2) {
-    if (param.has_attn_bias)
+    else if (param.custom_mask_type == 2)
       grouped_forward_masktype_attnbias_dispatched<ck::bhalf_t, 2, true>::Run(
           param, stream);
     else
-      grouped_forward_masktype_attnbias_dispatched<ck::bhalf_t, 2, false>::Run(
-          param, stream);
-  } else
-    throw std::runtime_error("Invalid custom_mask_type value");
+      throw std::runtime_error("Invalid custom_mask_type value");
+  });
 };
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_forward_bp16_masktype_0.cpp b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_forward_bp16_masktype_0.cpp
new file mode 100644
index 000000000..55629443b
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_forward_bp16_masktype_0.cpp
@@ -0,0 +1,14 @@
+#include <ck/ck.hpp>
+#include <stdexcept>
+
+#include "ck_fmha_grouped_forward.h"
+
+template struct grouped_forward_masktype_attnbias_dispatched<
+    ck::bhalf_t,
+    0,
+    true>;
+
+template struct grouped_forward_masktype_attnbias_dispatched<
+    ck::bhalf_t,
+    0,
+    false>;
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_forward_bp16_masktype_1.cpp b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_forward_bp16_masktype_1.cpp
new file mode 100644
index 000000000..c1ed66880
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_forward_bp16_masktype_1.cpp
@@ -0,0 +1,14 @@
+#include <ck/ck.hpp>
+#include <stdexcept>
+
+#include "ck_fmha_grouped_forward.h"
+
+template struct grouped_forward_masktype_attnbias_dispatched<
+    ck::bhalf_t,
+    1,
+    true>;
+
+template struct grouped_forward_masktype_attnbias_dispatched<
+    ck::bhalf_t,
+    1,
+    false>;
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_forward_bp16_masktype_2.cpp b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_forward_bp16_masktype_2.cpp
new file mode 100644
index 000000000..e41a76278
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_forward_bp16_masktype_2.cpp
@@ -0,0 +1,14 @@
+#include <ck/ck.hpp>
+#include <stdexcept>
+
+#include "ck_fmha_grouped_forward.h"
+
+template struct grouped_forward_masktype_attnbias_dispatched<
+    ck::bhalf_t,
+    2,
+    true>;
+
+template struct grouped_forward_masktype_attnbias_dispatched<
+    ck::bhalf_t,
+    2,
+    false>;
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_forward_fp16.cpp b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_forward_fp16.cpp
index cadc30b4b..e3b0736b8 100644
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_forward_fp16.cpp
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_forward_fp16.cpp
@@ -2,29 +2,50 @@
 #include <stdexcept>
 
 #include "ck_fmha_grouped_forward.h"
+#include "ck_static_switch.h"
+
+extern template struct grouped_forward_masktype_attnbias_dispatched<
+    ck::half_t,
+    0,
+    true>;
+
+extern template struct grouped_forward_masktype_attnbias_dispatched<
+    ck::half_t,
+    0,
+    false>;
+
+extern template struct grouped_forward_masktype_attnbias_dispatched<
+    ck::half_t,
+    1,
+    true>;
+
+extern template struct grouped_forward_masktype_attnbias_dispatched<
+    ck::half_t,
+    1,
+    false>;
+
+extern template struct grouped_forward_masktype_attnbias_dispatched<
+    ck::half_t,
+    2,
+    true>;
+
+extern template struct grouped_forward_masktype_attnbias_dispatched<
+    ck::half_t,
+    2,
+    false>;
 
 void grouped_forward_fp16(GroupedForwardParams& param, hipStream_t stream) {
-  if (param.custom_mask_type == 0) {
-    if (param.has_attn_bias)
+  BOOL_SWITCH_1(param.has_attn_bias, HAS_ATTN_BIAS, [&] {
+    if (param.custom_mask_type == 0)
       grouped_forward_masktype_attnbias_dispatched<ck::half_t, 0, true>::Run(
           param, stream);
-    else
-      grouped_forward_masktype_attnbias_dispatched<ck::half_t, 0, false>::Run(
-          param, stream);
-  } else if (param.custom_mask_type == 1) {
-    if (param.has_attn_bias)
+    else if (param.custom_mask_type == 1)
       grouped_forward_masktype_attnbias_dispatched<ck::half_t, 1, true>::Run(
           param, stream);
-    else
-      grouped_forward_masktype_attnbias_dispatched<ck::half_t, 1, false>::Run(
-          param, stream);
-  } else if (param.custom_mask_type == 2) {
-    if (param.has_attn_bias)
+    else if (param.custom_mask_type == 2)
       grouped_forward_masktype_attnbias_dispatched<ck::half_t, 2, true>::Run(
           param, stream);
     else
-      grouped_forward_masktype_attnbias_dispatched<ck::half_t, 2, false>::Run(
-          param, stream);
-  } else
-    throw std::runtime_error("Invalid custom_mask_type value");
+      throw std::runtime_error("Invalid custom_mask_type value");
+  });
 };
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_forward_fp16_masktype_0.cpp b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_forward_fp16_masktype_0.cpp
new file mode 100644
index 000000000..3a2c45e6f
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_forward_fp16_masktype_0.cpp
@@ -0,0 +1,14 @@
+#include <ck/ck.hpp>
+#include <stdexcept>
+
+#include "ck_fmha_grouped_forward.h"
+
+template struct grouped_forward_masktype_attnbias_dispatched<
+    ck::half_t,
+    0,
+    true>;
+
+template struct grouped_forward_masktype_attnbias_dispatched<
+    ck::half_t,
+    0,
+    false>;
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_forward_fp16_masktype_1.cpp b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_forward_fp16_masktype_1.cpp
new file mode 100644
index 000000000..83b62defc
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_forward_fp16_masktype_1.cpp
@@ -0,0 +1,14 @@
+#include <ck/ck.hpp>
+#include <stdexcept>
+
+#include "ck_fmha_grouped_forward.h"
+
+template struct grouped_forward_masktype_attnbias_dispatched<
+    ck::half_t,
+    1,
+    true>;
+
+template struct grouped_forward_masktype_attnbias_dispatched<
+    ck::half_t,
+    1,
+    false>;
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_forward_fp16_masktype_2.cpp b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_forward_fp16_masktype_2.cpp
new file mode 100644
index 000000000..7ef8f40a2
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_forward_fp16_masktype_2.cpp
@@ -0,0 +1,14 @@
+#include <ck/ck.hpp>
+#include <stdexcept>
+
+#include "ck_fmha_grouped_forward.h"
+
+template struct grouped_forward_masktype_attnbias_dispatched<
+    ck::half_t,
+    2,
+    true>;
+
+template struct grouped_forward_masktype_attnbias_dispatched<
+    ck::half_t,
+    2,
+    false>;

From 30fc69f56720ed1b93a32b42d50d6f4cf8bdc5ce Mon Sep 17 00:00:00 2001
From: Qianfeng Zhang <Qianfeng.Zhang@amd.com>
Date: Fri, 20 Oct 2023 14:13:19 +0000
Subject: [PATCH 101/641] Simplify dispatching using BOOL_SWITCH and accelerate
 compiling by splitting C++ files (infer)

---
 .../hip_fmha/ck_fmha_batched_infer_bp16.cpp   | 71 +++++++++++++------
 .../ck_fmha_batched_infer_bp16_masktype_0.cpp | 14 ++++
 .../ck_fmha_batched_infer_bp16_masktype_1.cpp | 14 ++++
 .../ck_fmha_batched_infer_bp16_masktype_2.cpp | 14 ++++
 .../hip_fmha/ck_fmha_batched_infer_fp16.cpp   | 65 +++++++++++------
 .../ck_fmha_batched_infer_fp16_masktype_0.cpp | 11 +++
 .../ck_fmha_batched_infer_fp16_masktype_1.cpp | 11 +++
 .../ck_fmha_batched_infer_fp16_masktype_2.cpp | 11 +++
 .../hip_fmha/ck_fmha_grouped_infer_bp16.cpp   | 71 +++++++++++++------
 .../ck_fmha_grouped_infer_bp16_masktype_0.cpp | 14 ++++
 .../ck_fmha_grouped_infer_bp16_masktype_1.cpp | 14 ++++
 .../ck_fmha_grouped_infer_bp16_masktype_2.cpp | 14 ++++
 .../hip_fmha/ck_fmha_grouped_infer_fp16.cpp   | 65 +++++++++++------
 .../ck_fmha_grouped_infer_fp16_masktype_0.cpp | 11 +++
 .../ck_fmha_grouped_infer_fp16_masktype_1.cpp | 11 +++
 .../ck_fmha_grouped_infer_fp16_masktype_2.cpp | 11 +++
 16 files changed, 334 insertions(+), 88 deletions(-)
 create mode 100644 xformers/csrc/attention/hip_fmha/ck_fmha_batched_infer_bp16_masktype_0.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/ck_fmha_batched_infer_bp16_masktype_1.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/ck_fmha_batched_infer_bp16_masktype_2.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/ck_fmha_batched_infer_fp16_masktype_0.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/ck_fmha_batched_infer_fp16_masktype_1.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/ck_fmha_batched_infer_fp16_masktype_2.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/ck_fmha_grouped_infer_bp16_masktype_0.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/ck_fmha_grouped_infer_bp16_masktype_1.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/ck_fmha_grouped_infer_bp16_masktype_2.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/ck_fmha_grouped_infer_fp16_masktype_0.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/ck_fmha_grouped_infer_fp16_masktype_1.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/ck_fmha_grouped_infer_fp16_masktype_2.cpp

diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_infer_bp16.cpp b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_infer_bp16.cpp
index bd62aebe2..5d44a4e99 100644
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_infer_bp16.cpp
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_infer_bp16.cpp
@@ -2,29 +2,56 @@
 #include <stdexcept>
 
 #include "ck_fmha_batched_infer.h"
+#include "ck_static_switch.h"
+
+extern template struct batched_infer_masktype_attnbias_dispatched<
+    ck::bhalf_t,
+    0,
+    true>;
+
+extern template struct batched_infer_masktype_attnbias_dispatched<
+    ck::bhalf_t,
+    0,
+    false>;
+
+extern template struct batched_infer_masktype_attnbias_dispatched<
+    ck::bhalf_t,
+    1,
+    true>;
+
+extern template struct batched_infer_masktype_attnbias_dispatched<
+    ck::bhalf_t,
+    1,
+    false>;
+
+extern template struct batched_infer_masktype_attnbias_dispatched<
+    ck::bhalf_t,
+    2,
+    true>;
+
+extern template struct batched_infer_masktype_attnbias_dispatched<
+    ck::bhalf_t,
+    2,
+    false>;
 
 void batched_infer_bp16(BatchedForwardParams& param, hipStream_t stream) {
-  if (param.custom_mask_type == 0) {
-    if (param.has_attn_bias)
-      batched_infer_masktype_attnbias_dispatched<ck::bhalf_t, 0, true>::Run(
-          param, stream);
-    else
-      batched_infer_masktype_attnbias_dispatched<ck::bhalf_t, 0, false>::Run(
-          param, stream);
-  } else if (param.custom_mask_type == 1) {
-    if (param.has_attn_bias)
-      batched_infer_masktype_attnbias_dispatched<ck::bhalf_t, 1, true>::Run(
-          param, stream);
-    else
-      batched_infer_masktype_attnbias_dispatched<ck::bhalf_t, 1, false>::Run(
-          param, stream);
-  } else if (param.custom_mask_type == 2) {
-    if (param.has_attn_bias)
-      batched_infer_masktype_attnbias_dispatched<ck::bhalf_t, 2, true>::Run(
-          param, stream);
+  BOOL_SWITCH_1(param.has_attn_bias, HAS_ATTN_BIAS, [&] {
+    if (param.custom_mask_type == 0)
+      batched_infer_masktype_attnbias_dispatched<
+          ck::bhalf_t,
+          0,
+          HAS_ATTN_BIAS>::Run(param, stream);
+    else if (param.custom_mask_type == 1)
+      batched_infer_masktype_attnbias_dispatched<
+          ck::bhalf_t,
+          1,
+          HAS_ATTN_BIAS>::Run(param, stream);
+    else if (param.custom_mask_type == 2)
+      batched_infer_masktype_attnbias_dispatched<
+          ck::bhalf_t,
+          2,
+          HAS_ATTN_BIAS>::Run(param, stream);
     else
-      batched_infer_masktype_attnbias_dispatched<ck::bhalf_t, 2, false>::Run(
-          param, stream);
-  } else
-    throw std::runtime_error("Invalid custom_mask_type value");
+      throw std::runtime_error("Invalid custom_mask_type value");
+  });
 };
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_infer_bp16_masktype_0.cpp b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_infer_bp16_masktype_0.cpp
new file mode 100644
index 000000000..7d0a4c910
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_infer_bp16_masktype_0.cpp
@@ -0,0 +1,14 @@
+#include <ck/ck.hpp>
+#include <stdexcept>
+
+#include "ck_fmha_batched_infer.h"
+
+template struct batched_infer_masktype_attnbias_dispatched<
+    ck::bhalf_t,
+    0,
+    true>;
+
+template struct batched_infer_masktype_attnbias_dispatched<
+    ck::bhalf_t,
+    0,
+    false>;
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_infer_bp16_masktype_1.cpp b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_infer_bp16_masktype_1.cpp
new file mode 100644
index 000000000..5aad14a67
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_infer_bp16_masktype_1.cpp
@@ -0,0 +1,14 @@
+#include <ck/ck.hpp>
+#include <stdexcept>
+
+#include "ck_fmha_batched_infer.h"
+
+template struct batched_infer_masktype_attnbias_dispatched<
+    ck::bhalf_t,
+    1,
+    true>;
+
+template struct batched_infer_masktype_attnbias_dispatched<
+    ck::bhalf_t,
+    1,
+    false>;
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_infer_bp16_masktype_2.cpp b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_infer_bp16_masktype_2.cpp
new file mode 100644
index 000000000..e0ddb158d
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_infer_bp16_masktype_2.cpp
@@ -0,0 +1,14 @@
+#include <ck/ck.hpp>
+#include <stdexcept>
+
+#include "ck_fmha_batched_infer.h"
+
+template struct batched_infer_masktype_attnbias_dispatched<
+    ck::bhalf_t,
+    2,
+    true>;
+
+template struct batched_infer_masktype_attnbias_dispatched<
+    ck::bhalf_t,
+    2,
+    false>;
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_infer_fp16.cpp b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_infer_fp16.cpp
index 3429c088e..fa0bdd42d 100644
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_infer_fp16.cpp
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_infer_fp16.cpp
@@ -2,29 +2,50 @@
 #include <stdexcept>
 
 #include "ck_fmha_batched_infer.h"
+#include "ck_static_switch.h"
+
+extern template struct batched_infer_masktype_attnbias_dispatched<
+    ck::half_t,
+    0,
+    true>;
+
+extern template struct batched_infer_masktype_attnbias_dispatched<
+    ck::half_t,
+    0,
+    false>;
+
+extern template struct batched_infer_masktype_attnbias_dispatched<
+    ck::half_t,
+    1,
+    true>;
+
+extern template struct batched_infer_masktype_attnbias_dispatched<
+    ck::half_t,
+    1,
+    false>;
+
+extern template struct batched_infer_masktype_attnbias_dispatched<
+    ck::half_t,
+    2,
+    true>;
+
+extern template struct batched_infer_masktype_attnbias_dispatched<
+    ck::half_t,
+    2,
+    false>;
 
 void batched_infer_fp16(BatchedForwardParams& param, hipStream_t stream) {
-  if (param.custom_mask_type == 0) {
-    if (param.has_attn_bias)
-      batched_infer_masktype_attnbias_dispatched<ck::half_t, 0, true>::Run(
-          param, stream);
-    else
-      batched_infer_masktype_attnbias_dispatched<ck::half_t, 0, false>::Run(
-          param, stream);
-  } else if (param.custom_mask_type == 1) {
-    if (param.has_attn_bias)
-      batched_infer_masktype_attnbias_dispatched<ck::half_t, 1, true>::Run(
-          param, stream);
-    else
-      batched_infer_masktype_attnbias_dispatched<ck::half_t, 1, false>::Run(
-          param, stream);
-  } else if (param.custom_mask_type == 2) {
-    if (param.has_attn_bias)
-      batched_infer_masktype_attnbias_dispatched<ck::half_t, 2, true>::Run(
-          param, stream);
+  BOOL_SWITCH_1(param.has_attn_bias, HAS_ATTN_BIAS, [&] {
+    if (param.custom_mask_type == 0)
+      batched_infer_masktype_attnbias_dispatched<ck::half_t, 0, HAS_ATTN_BIAS>::
+          Run(param, stream);
+    else if (param.custom_mask_type == 1)
+      batched_infer_masktype_attnbias_dispatched<ck::half_t, 1, HAS_ATTN_BIAS>::
+          Run(param, stream);
+    else if (param.custom_mask_type == 2)
+      batched_infer_masktype_attnbias_dispatched<ck::half_t, 2, HAS_ATTN_BIAS>::
+          Run(param, stream);
     else
-      batched_infer_masktype_attnbias_dispatched<ck::half_t, 2, false>::Run(
-          param, stream);
-  } else
-    throw std::runtime_error("Invalid custom_mask_type value");
+      throw std::runtime_error("Invalid custom_mask_type value");
+  });
 };
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_infer_fp16_masktype_0.cpp b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_infer_fp16_masktype_0.cpp
new file mode 100644
index 000000000..fa3ac06cd
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_infer_fp16_masktype_0.cpp
@@ -0,0 +1,11 @@
+#include <ck/ck.hpp>
+#include <stdexcept>
+
+#include "ck_fmha_batched_infer.h"
+
+template struct batched_infer_masktype_attnbias_dispatched<ck::half_t, 0, true>;
+
+template struct batched_infer_masktype_attnbias_dispatched<
+    ck::half_t,
+    0,
+    false>;
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_infer_fp16_masktype_1.cpp b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_infer_fp16_masktype_1.cpp
new file mode 100644
index 000000000..ea4833f23
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_infer_fp16_masktype_1.cpp
@@ -0,0 +1,11 @@
+#include <ck/ck.hpp>
+#include <stdexcept>
+
+#include "ck_fmha_batched_infer.h"
+
+template struct batched_infer_masktype_attnbias_dispatched<ck::half_t, 1, true>;
+
+template struct batched_infer_masktype_attnbias_dispatched<
+    ck::half_t,
+    1,
+    false>;
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_infer_fp16_masktype_2.cpp b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_infer_fp16_masktype_2.cpp
new file mode 100644
index 000000000..54c046e61
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_infer_fp16_masktype_2.cpp
@@ -0,0 +1,11 @@
+#include <ck/ck.hpp>
+#include <stdexcept>
+
+#include "ck_fmha_batched_infer.h"
+
+template struct batched_infer_masktype_attnbias_dispatched<ck::half_t, 2, true>;
+
+template struct batched_infer_masktype_attnbias_dispatched<
+    ck::half_t,
+    2,
+    false>;
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_infer_bp16.cpp b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_infer_bp16.cpp
index d3accc720..796372951 100644
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_infer_bp16.cpp
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_infer_bp16.cpp
@@ -2,29 +2,56 @@
 #include <stdexcept>
 
 #include "ck_fmha_grouped_infer.h"
+#include "ck_static_switch.h"
+
+extern template struct grouped_infer_masktype_attnbias_dispatched<
+    ck::bhalf_t,
+    0,
+    true>;
+
+extern template struct grouped_infer_masktype_attnbias_dispatched<
+    ck::bhalf_t,
+    0,
+    false>;
+
+extern template struct grouped_infer_masktype_attnbias_dispatched<
+    ck::bhalf_t,
+    1,
+    true>;
+
+extern template struct grouped_infer_masktype_attnbias_dispatched<
+    ck::bhalf_t,
+    1,
+    false>;
+
+extern template struct grouped_infer_masktype_attnbias_dispatched<
+    ck::bhalf_t,
+    2,
+    true>;
+
+extern template struct grouped_infer_masktype_attnbias_dispatched<
+    ck::bhalf_t,
+    2,
+    false>;
 
 void grouped_infer_bp16(GroupedForwardParams& param, hipStream_t stream) {
-  if (param.custom_mask_type == 0) {
-    if (param.has_attn_bias)
-      grouped_infer_masktype_attnbias_dispatched<ck::bhalf_t, 0, true>::Run(
-          param, stream);
-    else
-      grouped_infer_masktype_attnbias_dispatched<ck::bhalf_t, 0, false>::Run(
-          param, stream);
-  } else if (param.custom_mask_type == 1) {
-    if (param.has_attn_bias)
-      grouped_infer_masktype_attnbias_dispatched<ck::bhalf_t, 1, true>::Run(
-          param, stream);
-    else
-      grouped_infer_masktype_attnbias_dispatched<ck::bhalf_t, 1, false>::Run(
-          param, stream);
-  } else if (param.custom_mask_type == 2) {
-    if (param.has_attn_bias)
-      grouped_infer_masktype_attnbias_dispatched<ck::bhalf_t, 2, true>::Run(
-          param, stream);
+  BOOL_SWITCH_1(param.has_attn_bias, HAS_ATTN_BIAS, [&] {
+    if (param.custom_mask_type == 0)
+      grouped_infer_masktype_attnbias_dispatched<
+          ck::bhalf_t,
+          0,
+          HAS_ATTN_BIAS>::Run(param, stream);
+    else if (param.custom_mask_type == 1)
+      grouped_infer_masktype_attnbias_dispatched<
+          ck::bhalf_t,
+          1,
+          HAS_ATTN_BIAS>::Run(param, stream);
+    else if (param.custom_mask_type == 2)
+      grouped_infer_masktype_attnbias_dispatched<
+          ck::bhalf_t,
+          2,
+          HAS_ATTN_BIAS>::Run(param, stream);
     else
-      grouped_infer_masktype_attnbias_dispatched<ck::bhalf_t, 2, false>::Run(
-          param, stream);
-  } else
-    throw std::runtime_error("Invalid custom_mask_type value");
+      throw std::runtime_error("Invalid custom_mask_type value");
+  });
 };
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_infer_bp16_masktype_0.cpp b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_infer_bp16_masktype_0.cpp
new file mode 100644
index 000000000..6b6658de6
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_infer_bp16_masktype_0.cpp
@@ -0,0 +1,14 @@
+#include <ck/ck.hpp>
+#include <stdexcept>
+
+#include "ck_fmha_grouped_infer.h"
+
+template struct grouped_infer_masktype_attnbias_dispatched<
+    ck::bhalf_t,
+    0,
+    true>;
+
+template struct grouped_infer_masktype_attnbias_dispatched<
+    ck::bhalf_t,
+    0,
+    false>;
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_infer_bp16_masktype_1.cpp b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_infer_bp16_masktype_1.cpp
new file mode 100644
index 000000000..232517d2b
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_infer_bp16_masktype_1.cpp
@@ -0,0 +1,14 @@
+#include <ck/ck.hpp>
+#include <stdexcept>
+
+#include "ck_fmha_grouped_infer.h"
+
+template struct grouped_infer_masktype_attnbias_dispatched<
+    ck::bhalf_t,
+    1,
+    true>;
+
+template struct grouped_infer_masktype_attnbias_dispatched<
+    ck::bhalf_t,
+    1,
+    false>;
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_infer_bp16_masktype_2.cpp b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_infer_bp16_masktype_2.cpp
new file mode 100644
index 000000000..19e58447a
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_infer_bp16_masktype_2.cpp
@@ -0,0 +1,14 @@
+#include <ck/ck.hpp>
+#include <stdexcept>
+
+#include "ck_fmha_grouped_infer.h"
+
+template struct grouped_infer_masktype_attnbias_dispatched<
+    ck::bhalf_t,
+    2,
+    true>;
+
+template struct grouped_infer_masktype_attnbias_dispatched<
+    ck::bhalf_t,
+    2,
+    false>;
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_infer_fp16.cpp b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_infer_fp16.cpp
index d2e846683..ffc89ed53 100644
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_infer_fp16.cpp
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_infer_fp16.cpp
@@ -2,29 +2,50 @@
 #include <stdexcept>
 
 #include "ck_fmha_grouped_infer.h"
+#include "ck_static_switch.h"
+
+extern template struct grouped_infer_masktype_attnbias_dispatched<
+    ck::half_t,
+    0,
+    true>;
+
+extern template struct grouped_infer_masktype_attnbias_dispatched<
+    ck::half_t,
+    0,
+    false>;
+
+extern template struct grouped_infer_masktype_attnbias_dispatched<
+    ck::half_t,
+    1,
+    true>;
+
+extern template struct grouped_infer_masktype_attnbias_dispatched<
+    ck::half_t,
+    1,
+    false>;
+
+extern template struct grouped_infer_masktype_attnbias_dispatched<
+    ck::half_t,
+    2,
+    true>;
+
+extern template struct grouped_infer_masktype_attnbias_dispatched<
+    ck::half_t,
+    2,
+    false>;
 
 void grouped_infer_fp16(GroupedForwardParams& param, hipStream_t stream) {
-  if (param.custom_mask_type == 0) {
-    if (param.has_attn_bias)
-      grouped_infer_masktype_attnbias_dispatched<ck::half_t, 0, true>::Run(
-          param, stream);
-    else
-      grouped_infer_masktype_attnbias_dispatched<ck::half_t, 0, false>::Run(
-          param, stream);
-  } else if (param.custom_mask_type == 1) {
-    if (param.has_attn_bias)
-      grouped_infer_masktype_attnbias_dispatched<ck::half_t, 1, true>::Run(
-          param, stream);
-    else
-      grouped_infer_masktype_attnbias_dispatched<ck::half_t, 1, false>::Run(
-          param, stream);
-  } else if (param.custom_mask_type == 2) {
-    if (param.has_attn_bias)
-      grouped_infer_masktype_attnbias_dispatched<ck::half_t, 2, true>::Run(
-          param, stream);
+  BOOL_SWITCH_1(param.has_attn_bias, HAS_ATTN_BIAS, [&] {
+    if (param.custom_mask_type == 0)
+      grouped_infer_masktype_attnbias_dispatched<ck::half_t, 0, HAS_ATTN_BIAS>::
+          Run(param, stream);
+    else if (param.custom_mask_type == 1)
+      grouped_infer_masktype_attnbias_dispatched<ck::half_t, 1, HAS_ATTN_BIAS>::
+          Run(param, stream);
+    else if (param.custom_mask_type == 2)
+      grouped_infer_masktype_attnbias_dispatched<ck::half_t, 2, HAS_ATTN_BIAS>::
+          Run(param, stream);
     else
-      grouped_infer_masktype_attnbias_dispatched<ck::half_t, 2, false>::Run(
-          param, stream);
-  } else
-    throw std::runtime_error("Invalid custom_mask_type value");
+      throw std::runtime_error("Invalid custom_mask_type value");
+  });
 };
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_infer_fp16_masktype_0.cpp b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_infer_fp16_masktype_0.cpp
new file mode 100644
index 000000000..ded6fe928
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_infer_fp16_masktype_0.cpp
@@ -0,0 +1,11 @@
+#include <ck/ck.hpp>
+#include <stdexcept>
+
+#include "ck_fmha_grouped_infer.h"
+
+template struct grouped_infer_masktype_attnbias_dispatched<ck::half_t, 0, true>;
+
+template struct grouped_infer_masktype_attnbias_dispatched<
+    ck::half_t,
+    0,
+    false>;
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_infer_fp16_masktype_1.cpp b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_infer_fp16_masktype_1.cpp
new file mode 100644
index 000000000..7eb372128
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_infer_fp16_masktype_1.cpp
@@ -0,0 +1,11 @@
+#include <ck/ck.hpp>
+#include <stdexcept>
+
+#include "ck_fmha_grouped_infer.h"
+
+template struct grouped_infer_masktype_attnbias_dispatched<ck::half_t, 1, true>;
+
+template struct grouped_infer_masktype_attnbias_dispatched<
+    ck::half_t,
+    1,
+    false>;
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_infer_fp16_masktype_2.cpp b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_infer_fp16_masktype_2.cpp
new file mode 100644
index 000000000..95281e7ba
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_infer_fp16_masktype_2.cpp
@@ -0,0 +1,11 @@
+#include <ck/ck.hpp>
+#include <stdexcept>
+
+#include "ck_fmha_grouped_infer.h"
+
+template struct grouped_infer_masktype_attnbias_dispatched<ck::half_t, 2, true>;
+
+template struct grouped_infer_masktype_attnbias_dispatched<
+    ck::half_t,
+    2,
+    false>;

From 375d39c2289c7542815e210a477c8d5f9edbd887 Mon Sep 17 00:00:00 2001
From: Qianfeng Zhang <Qianfeng.Zhang@amd.com>
Date: Fri, 20 Oct 2023 23:59:55 +0000
Subject: [PATCH 102/641] Some fixes

---
 .../hip_fmha/attention_backward_generic.cpp    |  2 +-
 ...k_fmha_grouped_backward_bp16_masktype_1.cpp |  1 -
 .../hip_fmha/ck_fmha_grouped_forward_bp16.cpp  | 18 ++++++++++++------
 .../hip_fmha/ck_fmha_grouped_forward_fp16.cpp  | 18 ++++++++++++------
 4 files changed, 25 insertions(+), 14 deletions(-)

diff --git a/xformers/csrc/attention/hip_fmha/attention_backward_generic.cpp b/xformers/csrc/attention/hip_fmha/attention_backward_generic.cpp
index c142352e0..1d28afd8c 100644
--- a/xformers/csrc/attention/hip_fmha/attention_backward_generic.cpp
+++ b/xformers/csrc/attention/hip_fmha/attention_backward_generic.cpp
@@ -394,7 +394,7 @@ efficient_attention_backward_ck(
         ? reinterpret_cast<char*>(grad_bias.data_ptr())
         : nullptr;
 
-    int multiplier = 1;
+    size_t multiplier = 1;
 
     if (p.use_fp32_qkv_grad)
       multiplier = get_size_in_bytes(1, at::ScalarType::Float) /
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_backward_bp16_masktype_1.cpp b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_backward_bp16_masktype_1.cpp
index 2892cd129..6f5531b75 100644
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_backward_bp16_masktype_1.cpp
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_backward_bp16_masktype_1.cpp
@@ -2,7 +2,6 @@
 #include <stdexcept>
 
 #include "ck_fmha_grouped_backward.h"
-#include "ck_static_switch.h"
 
 template struct grouped_backward_masktype_attnbias_dispatched<
     ck::bhalf_t,
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_forward_bp16.cpp b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_forward_bp16.cpp
index 00f92bdae..04769122d 100644
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_forward_bp16.cpp
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_forward_bp16.cpp
@@ -37,14 +37,20 @@ extern template struct grouped_forward_masktype_attnbias_dispatched<
 void grouped_forward_bp16(GroupedForwardParams& param, hipStream_t stream) {
   BOOL_SWITCH_1(param.has_attn_bias, HAS_ATTN_BIAS, [&] {
     if (param.custom_mask_type == 0)
-      grouped_forward_masktype_attnbias_dispatched<ck::bhalf_t, 0, true>::Run(
-          param, stream);
+      grouped_forward_masktype_attnbias_dispatched<
+          ck::bhalf_t,
+          0,
+          HAS_ATTN_BIAS>::Run(param, stream);
     else if (param.custom_mask_type == 1)
-      grouped_forward_masktype_attnbias_dispatched<ck::bhalf_t, 1, true>::Run(
-          param, stream);
+      grouped_forward_masktype_attnbias_dispatched<
+          ck::bhalf_t,
+          1,
+          HAS_ATTN_BIAS>::Run(param, stream);
     else if (param.custom_mask_type == 2)
-      grouped_forward_masktype_attnbias_dispatched<ck::bhalf_t, 2, true>::Run(
-          param, stream);
+      grouped_forward_masktype_attnbias_dispatched<
+          ck::bhalf_t,
+          2,
+          HAS_ATTN_BIAS>::Run(param, stream);
     else
       throw std::runtime_error("Invalid custom_mask_type value");
   });
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_forward_fp16.cpp b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_forward_fp16.cpp
index e3b0736b8..9c059d9b7 100644
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_forward_fp16.cpp
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_forward_fp16.cpp
@@ -37,14 +37,20 @@ extern template struct grouped_forward_masktype_attnbias_dispatched<
 void grouped_forward_fp16(GroupedForwardParams& param, hipStream_t stream) {
   BOOL_SWITCH_1(param.has_attn_bias, HAS_ATTN_BIAS, [&] {
     if (param.custom_mask_type == 0)
-      grouped_forward_masktype_attnbias_dispatched<ck::half_t, 0, true>::Run(
-          param, stream);
+      grouped_forward_masktype_attnbias_dispatched<
+          ck::half_t,
+          0,
+          HAS_ATTN_BIAS>::Run(param, stream);
     else if (param.custom_mask_type == 1)
-      grouped_forward_masktype_attnbias_dispatched<ck::half_t, 1, true>::Run(
-          param, stream);
+      grouped_forward_masktype_attnbias_dispatched<
+          ck::half_t,
+          1,
+          HAS_ATTN_BIAS>::Run(param, stream);
     else if (param.custom_mask_type == 2)
-      grouped_forward_masktype_attnbias_dispatched<ck::half_t, 2, true>::Run(
-          param, stream);
+      grouped_forward_masktype_attnbias_dispatched<
+          ck::half_t,
+          2,
+          HAS_ATTN_BIAS>::Run(param, stream);
     else
       throw std::runtime_error("Invalid custom_mask_type value");
   });

From 49fddae636f6a8fc10a284fe630224cbd2fc8403 Mon Sep 17 00:00:00 2001
From: Qianfeng Zhang <Qianfeng.Zhang@amd.com>
Date: Sat, 21 Oct 2023 23:08:27 +0000
Subject: [PATCH 103/641] Clarify the naming of the tunable scalar_per_vector
 template parameters for infer/forward/backward

---
 .../hip_fmha/ck_fmha_batched_backward.h       | 33 ++++++++++---------
 .../hip_fmha/ck_fmha_batched_forward.h        | 17 +++++-----
 .../hip_fmha/ck_fmha_batched_infer.h          |  7 ++--
 .../hip_fmha/ck_fmha_grouped_backward.h       | 33 ++++++++++---------
 .../hip_fmha/ck_fmha_grouped_forward.h        | 17 +++++-----
 .../hip_fmha/ck_fmha_grouped_infer.h          |  7 ++--
 6 files changed, 60 insertions(+), 54 deletions(-)

diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_backward.h b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_backward.h
index 317e3b54c..75b572708 100644
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_backward.h
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_backward.h
@@ -64,9 +64,10 @@ struct batched_backward_masktype_attnbias_dispatched {
 
   static void Run(BatchedBackwardParams& param, hipStream_t stream) {
     // Tunables
-    constexpr ck::index_t ABBlockTransferSrcScalarPerVector = 1;
-    constexpr ck::index_t B1CShuffleBlockTransferScalarPerVector = 1;
-    constexpr ck::index_t Acc0BiasTransferSrcScalarPerVector = 1;
+    constexpr ck::index_t kABBlockTransferSrcScalarPerVector = 1;
+    constexpr ck::index_t kB1BlockTransferSrcScalarPerVector = 1;
+    constexpr ck::index_t kCShuffleBlockTransferScalarPerVector = 1;
+    constexpr ck::index_t kAcc0BiasTransferSrcScalarPerVector = 1;
 
     if (param.K <= 32 && param.Kv <= 32) {
       using DeviceOpInstance = ck::tensor_operation::device::
@@ -116,21 +117,21 @@ struct batched_backward_masktype_attnbias_dispatched {
               S<1, 0, 2>,
               S<1, 0, 2>,
               2,
-              ABBlockTransferSrcScalarPerVector, // TUNABLE
+              kABBlockTransferSrcScalarPerVector, // TUNABLE
               8,
               true,
               S<4, 64, 1>, // BBlockTransfer
               S<1, 0, 2>,
               S<1, 0, 2>,
               2,
-              ABBlockTransferSrcScalarPerVector, // TUNABLE
+              kABBlockTransferSrcScalarPerVector, // TUNABLE
               8,
               true,
-              Acc0BiasTransferSrcScalarPerVector, // TUNABLE
+              kAcc0BiasTransferSrcScalarPerVector, // TUNABLE
               1,
               1,
               S<1, 64, 1, 4>,
-              B1CShuffleBlockTransferScalarPerVector, // TUNABLE
+              kCShuffleBlockTransferScalarPerVector, // TUNABLE
               MaskingSpec,
               Deterministic>;
 
@@ -183,21 +184,21 @@ struct batched_backward_masktype_attnbias_dispatched {
               S<1, 0, 2>,
               S<1, 0, 2>,
               2,
-              ABBlockTransferSrcScalarPerVector, // TUNABLE
+              kABBlockTransferSrcScalarPerVector, // TUNABLE
               8,
               true,
               S<4, 64, 1>, // BBlockTransfer
               S<1, 0, 2>,
               S<1, 0, 2>,
               2,
-              ABBlockTransferSrcScalarPerVector, // TUNABLE
+              kABBlockTransferSrcScalarPerVector, // TUNABLE
               8,
               true,
-              Acc0BiasTransferSrcScalarPerVector, // TUNABLE
+              kAcc0BiasTransferSrcScalarPerVector, // TUNABLE
               1,
               2,
               S<1, 32, 1, 8>,
-              B1CShuffleBlockTransferScalarPerVector, // TUNABLE
+              kCShuffleBlockTransferScalarPerVector, // TUNABLE
               MaskingSpec,
               Deterministic>;
 
@@ -250,28 +251,28 @@ struct batched_backward_masktype_attnbias_dispatched {
               S<1, 0, 2>,
               S<1, 0, 2>,
               2,
-              ABBlockTransferSrcScalarPerVector, // TUNABLE
+              kABBlockTransferSrcScalarPerVector, // TUNABLE
               8,
               true,
               S<4, 64, 1>, // B0BlockTransfer
               S<1, 0, 2>,
               S<1, 0, 2>,
               2,
-              ABBlockTransferSrcScalarPerVector, // TUNABLE
+              kABBlockTransferSrcScalarPerVector, // TUNABLE
               8,
               true,
-              Acc0BiasTransferSrcScalarPerVector, // TUNABLE
+              kAcc0BiasTransferSrcScalarPerVector, // TUNABLE
               S<8, 32, 1>, // B1BlockTransfer
               S<0, 2, 1>,
               S<0, 2, 1>,
               1,
-              B1CShuffleBlockTransferScalarPerVector, // TUNABLE
+              kB1BlockTransferSrcScalarPerVector, // TUNABLE
               2,
               false,
               1, // CShuffleMXdlPerWavePerShuffle
               4, // CShuffleNXdlPerWavePerShuffle
               S<1, 32, 1, 8>,
-              B1CShuffleBlockTransferScalarPerVector, // TUNABLE
+              kCShuffleBlockTransferScalarPerVector, // TUNABLE
               MaskingSpec,
               Deterministic>;
 
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_forward.h b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_forward.h
index c144cc5f5..c2ecfccd5 100644
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_forward.h
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_forward.h
@@ -58,9 +58,10 @@ struct batched_forward_masktype_attnbias_dispatched {
 
   static void Run(BatchedForwardParams& param, hipStream_t stream) {
     // Tunables
-    constexpr ck::index_t ABBlockTransferSrcScalarPerVector = 1;
-    constexpr ck::index_t B1CShuffleBlockTransferScalarPerVector = 1;
-    constexpr ck::index_t Acc0BiasTransferSrcScalarPerVector = 1;
+    constexpr ck::index_t kABBlockTransferSrcScalarPerVector = 1;
+    constexpr ck::index_t kB1BlockTransferSrcScalarPerVector = 1;
+    constexpr ck::index_t kCShuffleBlockTransferScalarPerVector = 1;
+    constexpr ck::index_t kAcc0BiasTransferSrcScalarPerVector = 1;
 
     using DeviceOpInstance = ck::tensor_operation::device::
         DeviceBatchedMultiheadAttentionForward_Xdl_CShuffle_V2<
@@ -110,22 +111,22 @@ struct batched_forward_masktype_attnbias_dispatched {
             S<1, 0, 2>,
             S<1, 0, 2>,
             2,
-            ABBlockTransferSrcScalarPerVector, // TUNABLE
+            kABBlockTransferSrcScalarPerVector, // TUNABLE
             8,
             true,
             S<4, 64, 1>, // BBlockTransfer
             S<1, 0, 2>,
             S<1, 0, 2>,
             2,
-            ABBlockTransferSrcScalarPerVector, // TUNABLE
+            kABBlockTransferSrcScalarPerVector, // TUNABLE
             8,
             true,
-            Acc0BiasTransferSrcScalarPerVector, // TUNABLE
+            kAcc0BiasTransferSrcScalarPerVector, // TUNABLE
             S<16, 16, 1>, // B1BlockTransfer
             S<0, 2, 1>,
             S<0, 2, 1>,
             1,
-            B1CShuffleBlockTransferScalarPerVector, // TUNABLE
+            kB1BlockTransferSrcScalarPerVector, // TUNABLE
             2,
             false,
             1, // CShuffleMXdlPerWavePerShuffle
@@ -134,7 +135,7 @@ struct batched_forward_masktype_attnbias_dispatched {
               32,
               1,
               8>, // CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock
-            B1CShuffleBlockTransferScalarPerVector, // TUNABLE
+            kCShuffleBlockTransferScalarPerVector, // TUNABLE
             4,
             MaskingSpec>; // MaskingSpecialization
 
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_infer.h b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_infer.h
index e5396d437..53bdaa1e9 100644
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_infer.h
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_infer.h
@@ -57,7 +57,8 @@ struct batched_infer_masktype_attnbias_dispatched {
       ck::tensor_operation::device::TensorSpecialization::Default;
 
   static constexpr ck::index_t kABBlockTransferSrcScalarPerVector = 1;
-  static constexpr ck::index_t kB1CShuffleBlockTransferScalarPerVector = 1;
+  static constexpr ck::index_t kB1BlockTransferSrcScalarPerVector = 1;
+  static constexpr ck::index_t kCShuffleBlockTransferScalarPerVector = 1;
   static constexpr ck::index_t kAcc0BiasTransferSrcScalarPerVector = 1;
 
   template <
@@ -123,7 +124,7 @@ struct batched_infer_masktype_attnbias_dispatched {
           S<0, 2, 1>,
           S<0, 2, 1>,
           1,
-          kB1CShuffleBlockTransferScalarPerVector, // TUNABLE
+          kB1BlockTransferSrcScalarPerVector, // TUNABLE
           2,
           false,
           1, // CShuffleMXdlPerWavePerShuffle
@@ -132,7 +133,7 @@ struct batched_infer_masktype_attnbias_dispatched {
             32,
             1,
             8>, // CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock
-          kB1CShuffleBlockTransferScalarPerVector, // TUNABLE
+          kCShuffleBlockTransferScalarPerVector, // TUNABLE
           MaskingSpec>; // MaskingSpecialization
 
   static void Run(BatchedForwardParams& param, hipStream_t stream) {
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_backward.h b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_backward.h
index e0446bbcb..f4afd8a75 100644
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_backward.h
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_backward.h
@@ -65,9 +65,10 @@ struct grouped_backward_masktype_attnbias_dispatched {
 
   static void Run(GroupedBackwardParams& param, hipStream_t stream) {
     // Tunables
-    constexpr ck::index_t ABBlockTransferSrcScalarPerVector = 1; // 8
-    constexpr ck::index_t B1CShuffleBlockTransferScalarPerVector = 1; // 4
-    constexpr ck::index_t Acc0BiasTransferSrcScalarPerVector = 1; // 4
+    constexpr ck::index_t kABBlockTransferSrcScalarPerVector = 1;
+    constexpr ck::index_t kB1BlockTransferSrcScalarPerVector = 1;
+    constexpr ck::index_t kCShuffleBlockTransferScalarPerVector = 1;
+    constexpr ck::index_t kAcc0BiasTransferSrcScalarPerVector = 1;
 
     if (param.K <= 32 && param.Kv <= 32) {
       using DeviceOpInstance = ck::tensor_operation::device::
@@ -117,21 +118,21 @@ struct grouped_backward_masktype_attnbias_dispatched {
               S<1, 0, 2>,
               S<1, 0, 2>,
               2,
-              ABBlockTransferSrcScalarPerVector, // TUNABLE
+              kABBlockTransferSrcScalarPerVector, // TUNABLE
               8,
               true,
               S<4, 64, 1>, // BBlockTransfer
               S<1, 0, 2>,
               S<1, 0, 2>,
               2,
-              ABBlockTransferSrcScalarPerVector, // TUNABLE
+              kABBlockTransferSrcScalarPerVector, // TUNABLE
               8,
               true,
-              Acc0BiasTransferSrcScalarPerVector, // TUNABLE
+              kAcc0BiasTransferSrcScalarPerVector, // TUNABLE
               1,
               1,
               S<1, 64, 1, 4>,
-              B1CShuffleBlockTransferScalarPerVector, // TUNABLE
+              kCShuffleBlockTransferScalarPerVector, // TUNABLE
               MaskingSpec,
               Deterministic>;
 
@@ -184,21 +185,21 @@ struct grouped_backward_masktype_attnbias_dispatched {
               S<1, 0, 2>,
               S<1, 0, 2>,
               2,
-              ABBlockTransferSrcScalarPerVector, // TUNABLE
+              kABBlockTransferSrcScalarPerVector, // TUNABLE
               8,
               true,
               S<4, 64, 1>, // BBlockTransfer
               S<1, 0, 2>,
               S<1, 0, 2>,
               2,
-              ABBlockTransferSrcScalarPerVector, // TUNABLE
+              kABBlockTransferSrcScalarPerVector, // TUNABLE
               8,
               true,
-              Acc0BiasTransferSrcScalarPerVector, // TUNABLE
+              kAcc0BiasTransferSrcScalarPerVector, // TUNABLE
               1,
               2,
               S<1, 32, 1, 8>,
-              B1CShuffleBlockTransferScalarPerVector, // TUNABLE
+              kCShuffleBlockTransferScalarPerVector, // TUNABLE
               MaskingSpec,
               Deterministic>;
 
@@ -251,28 +252,28 @@ struct grouped_backward_masktype_attnbias_dispatched {
               S<1, 0, 2>,
               S<1, 0, 2>,
               2,
-              ABBlockTransferSrcScalarPerVector, // TUNABLE
+              kABBlockTransferSrcScalarPerVector, // TUNABLE
               8,
               true,
               S<4, 64, 1>, // B0BlockTransfer
               S<1, 0, 2>,
               S<1, 0, 2>,
               2,
-              ABBlockTransferSrcScalarPerVector, // TUNABLE
+              kABBlockTransferSrcScalarPerVector, // TUNABLE
               8,
               true,
-              Acc0BiasTransferSrcScalarPerVector, // TUNABLE
+              kAcc0BiasTransferSrcScalarPerVector, // TUNABLE
               S<8, 32, 1>, // B1BlockTransfer
               S<0, 2, 1>,
               S<0, 2, 1>,
               1,
-              B1CShuffleBlockTransferScalarPerVector, // TUNABLE
+              kB1BlockTransferSrcScalarPerVector, // TUNABLE
               2,
               false,
               1, // CShuffleMXdlPerWavePerShuffle
               4, // CShuffleNXdlPerWavePerShuffle
               S<1, 32, 1, 8>,
-              B1CShuffleBlockTransferScalarPerVector, // TUNABLE
+              kCShuffleBlockTransferScalarPerVector, // TUNABLE
               MaskingSpec,
               Deterministic>;
 
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_forward.h b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_forward.h
index 74ebfc5a9..a47cee438 100644
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_forward.h
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_forward.h
@@ -59,9 +59,10 @@ struct grouped_forward_masktype_attnbias_dispatched {
 
   static void Run(GroupedForwardParams& param, hipStream_t stream) {
     // Tunables
-    constexpr ck::index_t ABBlockTransferSrcScalarPerVector = 1;
-    constexpr ck::index_t B1CShuffleBlockTransferScalarPerVector = 1;
-    constexpr ck::index_t Acc0BiasTransferSrcScalarPerVector = 1;
+    constexpr ck::index_t kABBlockTransferSrcScalarPerVector = 1;
+    constexpr ck::index_t kB1BlockTransferSrcScalarPerVector = 1;
+    constexpr ck::index_t kCShuffleBlockTransferScalarPerVector = 1;
+    constexpr ck::index_t kAcc0BiasTransferSrcScalarPerVector = 1;
 
     using DeviceOpInstance = ck::tensor_operation::device::
         DeviceGroupedMultiheadAttentionForward_Xdl_CShuffle_V2<
@@ -111,22 +112,22 @@ struct grouped_forward_masktype_attnbias_dispatched {
             S<1, 0, 2>,
             S<1, 0, 2>,
             2,
-            ABBlockTransferSrcScalarPerVector, // TUNABLE
+            kABBlockTransferSrcScalarPerVector, // TUNABLE
             8,
             true,
             S<4, 64, 1>, // BBlockTransfer
             S<1, 0, 2>,
             S<1, 0, 2>,
             2,
-            ABBlockTransferSrcScalarPerVector, // TUNABLE
+            kABBlockTransferSrcScalarPerVector, // TUNABLE
             8,
             true,
-            Acc0BiasTransferSrcScalarPerVector,
+            kAcc0BiasTransferSrcScalarPerVector,
             S<8, 32, 1>, // B1BlockTransfer
             S<0, 2, 1>,
             S<0, 2, 1>,
             1,
-            B1CShuffleBlockTransferScalarPerVector, // TUNABLE
+            kB1BlockTransferSrcScalarPerVector, // TUNABLE
             2,
             false,
             1, // CShuffleMXdlPerWavePerShuffle
@@ -135,7 +136,7 @@ struct grouped_forward_masktype_attnbias_dispatched {
               32,
               1,
               8>, // CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock
-            B1CShuffleBlockTransferScalarPerVector, // TUNABLE
+            kCShuffleBlockTransferScalarPerVector, // TUNABLE
             1,
             MaskingSpec>; // MaskingSpecialization
 
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_infer.h b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_infer.h
index 22faf161b..2101181dc 100644
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_infer.h
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_infer.h
@@ -58,7 +58,8 @@ struct grouped_infer_masktype_attnbias_dispatched {
       ck::tensor_operation::device::TensorSpecialization::Default;
 
   static constexpr ck::index_t kABBlockTransferSrcScalarPerVector = 1;
-  static constexpr ck::index_t kB1CShuffleBlockTransferScalarPerVector = 1;
+  static constexpr ck::index_t kB1BlockTransferSrcScalarPerVector = 1;
+  static constexpr ck::index_t kCShuffleBlockTransferScalarPerVector = 1;
   static constexpr ck::index_t kAcc0BiasTransferSrcScalarPerVector = 1;
 
   template <
@@ -124,7 +125,7 @@ struct grouped_infer_masktype_attnbias_dispatched {
           S<0, 2, 1>,
           S<0, 2, 1>,
           1,
-          kB1CShuffleBlockTransferScalarPerVector, // TUNABLE
+          kB1BlockTransferSrcScalarPerVector, // TUNABLE
           2,
           false,
           1, // CShuffleMXdlPerWavePerShuffle
@@ -133,7 +134,7 @@ struct grouped_infer_masktype_attnbias_dispatched {
             32,
             1,
             8>, // CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock
-          kB1CShuffleBlockTransferScalarPerVector, // TUNABLE
+          kCShuffleBlockTransferScalarPerVector, // TUNABLE
           MaskingSpec>; // MaskingSpecialization
 
   static void Run(GroupedForwardParams& param, hipStream_t stream) {

From ae3f73ed3ff5f3fce944077642297b737b4b8630 Mon Sep 17 00:00:00 2001
From: Qianfeng Zhang <Qianfeng.Zhang@amd.com>
Date: Sun, 22 Oct 2023 11:53:12 +0000
Subject: [PATCH 104/641] Use separate DeviceOpInstance according to head-dim
 size with fmha forward

---
 .../hip_fmha/ck_fmha_batched_forward.h        | 203 +++++++++++-------
 .../hip_fmha/ck_fmha_grouped_forward.h        | 201 ++++++++++-------
 2 files changed, 239 insertions(+), 165 deletions(-)

diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_forward.h b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_forward.h
index c2ecfccd5..c32667315 100644
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_forward.h
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_forward.h
@@ -56,90 +56,127 @@ struct batched_forward_masktype_attnbias_dispatched {
   static constexpr auto TensorSpecC =
       ck::tensor_operation::device::TensorSpecialization::Default;
 
-  static void Run(BatchedForwardParams& param, hipStream_t stream) {
-    // Tunables
-    constexpr ck::index_t kABBlockTransferSrcScalarPerVector = 1;
-    constexpr ck::index_t kB1BlockTransferSrcScalarPerVector = 1;
-    constexpr ck::index_t kCShuffleBlockTransferScalarPerVector = 1;
-    constexpr ck::index_t kAcc0BiasTransferSrcScalarPerVector = 1;
-
-    using DeviceOpInstance = ck::tensor_operation::device::
-        DeviceBatchedMultiheadAttentionForward_Xdl_CShuffle_V2<
-            NumDimG,
-            NumDimM,
-            NumDimN,
-            NumDimK,
-            NumDimO,
-            ADataType,
-            B0DataType,
-            B1DataType,
-            CDataType,
-            GemmDataType,
-            ZDataType,
-            LSEDataType,
-            Acc0BiasDataType,
-            Acc1BiasDataType,
-            AccDataType,
-            CShuffleDataType,
-            AElementOp,
-            B0ElementOp,
-            Acc0ElementOp,
-            B1ElementOp,
-            CElementOp,
-            GemmSpec,
-            TensorSpecA,
-            TensorSpecB0,
-            TensorSpecB1,
-            TensorSpecC,
-            1,
-            256,
-            128, // MPerBlock
-            128, // NPerBlock
-            32, // KPerBlock
-            64, // Gemm1NPerBlock
-            32, // Gemm1KPerBlock
-            8, // AK1
-            8, // BK1
-            2, // B1K1
-            32, // MPerXDL
-            32, // NPerXDL
-            1, // MXdlPerWave
-            4, // NXdlPerWave
-            2, // Gemm1NXdlPerWave
-            1, // DropoutStep
-            S<4, 64, 1>, // ABlockTransfer
-            S<1, 0, 2>,
-            S<1, 0, 2>,
-            2,
-            kABBlockTransferSrcScalarPerVector, // TUNABLE
-            8,
-            true,
-            S<4, 64, 1>, // BBlockTransfer
-            S<1, 0, 2>,
-            S<1, 0, 2>,
-            2,
-            kABBlockTransferSrcScalarPerVector, // TUNABLE
-            8,
-            true,
-            kAcc0BiasTransferSrcScalarPerVector, // TUNABLE
-            S<16, 16, 1>, // B1BlockTransfer
-            S<0, 2, 1>,
-            S<0, 2, 1>,
+  // Tunables
+  static constexpr ck::index_t kABBlockTransferSrcScalarPerVector = 1;
+  static constexpr ck::index_t kB1BlockTransferSrcScalarPerVector = 1;
+  static constexpr ck::index_t kCShuffleBlockTransferScalarPerVector = 1;
+  static constexpr ck::index_t kAcc0BiasTransferSrcScalarPerVector = 1;
+
+  template <
+      ck::index_t kGemm1NPerBlock,
+      ck::index_t kGemm1NXdlPerWave,
+      ck::index_t kCShuffleNXdlPerWavePerShuffle>
+  using DeviceOpInstanceTemp = ck::tensor_operation::device::
+      DeviceBatchedMultiheadAttentionForward_Xdl_CShuffle_V2<
+          NumDimG,
+          NumDimM,
+          NumDimN,
+          NumDimK,
+          NumDimO,
+          ADataType,
+          B0DataType,
+          B1DataType,
+          CDataType,
+          GemmDataType,
+          ZDataType,
+          LSEDataType,
+          Acc0BiasDataType,
+          Acc1BiasDataType,
+          AccDataType,
+          CShuffleDataType,
+          AElementOp,
+          B0ElementOp,
+          Acc0ElementOp,
+          B1ElementOp,
+          CElementOp,
+          GemmSpec,
+          TensorSpecA,
+          TensorSpecB0,
+          TensorSpecB1,
+          TensorSpecC,
+          1,
+          256,
+          128, // MPerBlock
+          128, // NPerBlock
+          32, // KPerBlock
+          kGemm1NPerBlock,
+          32, // Gemm1KPerBlock
+          8, // AK1
+          8, // BK1
+          2, // B1K1
+          32, // MPerXDL
+          32, // NPerXDL
+          1, // MXdlPerWave
+          4, // NXdlPerWave
+          kGemm1NXdlPerWave,
+          1, // DropoutStep
+          S<4, 64, 1>, // ABlockTransfer
+          S<1, 0, 2>,
+          S<1, 0, 2>,
+          2,
+          kABBlockTransferSrcScalarPerVector, // TUNABLE
+          8,
+          true,
+          S<4, 64, 1>, // BBlockTransfer
+          S<1, 0, 2>,
+          S<1, 0, 2>,
+          2,
+          kABBlockTransferSrcScalarPerVector, // TUNABLE
+          8,
+          true,
+          kAcc0BiasTransferSrcScalarPerVector, // TUNABLE
+          S<16, 16, 1>, // B1BlockTransfer
+          S<0, 2, 1>,
+          S<0, 2, 1>,
+          1,
+          kB1BlockTransferSrcScalarPerVector, // TUNABLE
+          2,
+          false,
+          1, // CShuffleMXdlPerWavePerShuffle
+          kCShuffleNXdlPerWavePerShuffle,
+          S<1,
+            32,
             1,
-            kB1BlockTransferSrcScalarPerVector, // TUNABLE
-            2,
-            false,
-            1, // CShuffleMXdlPerWavePerShuffle
-            2, // CShuffleNXdlPerWavePerShuffle
-            S<1,
-              32,
-              1,
-              8>, // CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock
-            kCShuffleBlockTransferScalarPerVector, // TUNABLE
-            4,
-            MaskingSpec>; // MaskingSpecialization
-
-    RunWithDeviceOp<DeviceOpInstance>(param, stream);
+            8>, // CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock
+          kCShuffleBlockTransferScalarPerVector, // TUNABLE
+          4,
+          MaskingSpec>; // MaskingSpecialization
+
+  static void Run(BatchedForwardParams& param, hipStream_t stream) {
+    if (param.K <= 32 && param.Kv <= 32) {
+      constexpr ck::index_t kGemm1NPerBlock = 32;
+      constexpr ck::index_t kGemm1NXdlPerWave = 1;
+      constexpr ck::index_t kCShuffleNXdlPerWavePerShuffle = 1;
+
+      using DeviceOpInstance = DeviceOpInstanceTemp<
+          kGemm1NPerBlock,
+          kGemm1NXdlPerWave,
+          kCShuffleNXdlPerWavePerShuffle>;
+
+      RunWithDeviceOp<DeviceOpInstance>(param, stream);
+    } else if (param.K <= 64 && param.Kv <= 64) {
+      constexpr ck::index_t kGemm1NPerBlock = 64;
+      constexpr ck::index_t kGemm1NXdlPerWave = 2;
+      constexpr ck::index_t kCShuffleNXdlPerWavePerShuffle = 2;
+
+      using DeviceOpInstance = DeviceOpInstanceTemp<
+          kGemm1NPerBlock,
+          kGemm1NXdlPerWave,
+          kCShuffleNXdlPerWavePerShuffle>;
+
+      RunWithDeviceOp<DeviceOpInstance>(param, stream);
+    } else {
+      constexpr ck::index_t kGemm1NPerBlock = 128;
+      constexpr ck::index_t kGemm1NXdlPerWave = 4;
+      constexpr ck::index_t kCShuffleNXdlPerWavePerShuffle = 4;
+
+      using DeviceOpInstance = DeviceOpInstanceTemp<
+          kGemm1NPerBlock,
+          kGemm1NXdlPerWave,
+          kCShuffleNXdlPerWavePerShuffle>;
+
+      RunWithDeviceOp<DeviceOpInstance>(param, stream);
+    };
   };
 
   template <typename DeviceOpInstance>
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_forward.h b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_forward.h
index a47cee438..c1bb0d3a5 100644
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_forward.h
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_forward.h
@@ -57,90 +57,127 @@ struct grouped_forward_masktype_attnbias_dispatched {
   static constexpr auto TensorSpecC =
       ck::tensor_operation::device::TensorSpecialization::Default;
 
-  static void Run(GroupedForwardParams& param, hipStream_t stream) {
-    // Tunables
-    constexpr ck::index_t kABBlockTransferSrcScalarPerVector = 1;
-    constexpr ck::index_t kB1BlockTransferSrcScalarPerVector = 1;
-    constexpr ck::index_t kCShuffleBlockTransferScalarPerVector = 1;
-    constexpr ck::index_t kAcc0BiasTransferSrcScalarPerVector = 1;
-
-    using DeviceOpInstance = ck::tensor_operation::device::
-        DeviceGroupedMultiheadAttentionForward_Xdl_CShuffle_V2<
-            NumDimG,
-            NumDimM,
-            NumDimN,
-            NumDimK,
-            NumDimO,
-            ADataType,
-            B0DataType,
-            B1DataType,
-            CDataType,
-            GemmDataType,
-            ZDataType,
-            LSEDataType,
-            Acc0BiasDataType,
-            Acc1BiasDataType,
-            AccDataType,
-            CShuffleDataType,
-            AElementOp,
-            B0ElementOp,
-            Acc0ElementOp,
-            B1ElementOp,
-            CElementOp,
-            GemmSpec,
-            TensorSpecA,
-            TensorSpecB0,
-            TensorSpecB1,
-            TensorSpecC,
-            1,
-            256,
-            128, // MPerBlock
-            128, // NPerBlock
-            32, // KPerBlock
-            128, // Gemm1NPerBlock
-            32, // Gemm1KPerBlock
-            8, // AK1
-            8, // BK1
-            2, // B1K1
-            32, // MPerXDL
-            32, // NPerXDL
-            1, // MXdlPerWave
-            4, // NXdlPerWave
-            4, // Gemm1NXdlPerWave
-            1, // DropoutStep
-            S<4, 64, 1>, // ABlockTransfer
-            S<1, 0, 2>,
-            S<1, 0, 2>,
-            2,
-            kABBlockTransferSrcScalarPerVector, // TUNABLE
-            8,
-            true,
-            S<4, 64, 1>, // BBlockTransfer
-            S<1, 0, 2>,
-            S<1, 0, 2>,
-            2,
-            kABBlockTransferSrcScalarPerVector, // TUNABLE
-            8,
-            true,
-            kAcc0BiasTransferSrcScalarPerVector,
-            S<8, 32, 1>, // B1BlockTransfer
-            S<0, 2, 1>,
-            S<0, 2, 1>,
+  // Tunables
+  static constexpr ck::index_t kABBlockTransferSrcScalarPerVector = 1;
+  static constexpr ck::index_t kB1BlockTransferSrcScalarPerVector = 1;
+  static constexpr ck::index_t kCShuffleBlockTransferScalarPerVector = 1;
+  static constexpr ck::index_t kAcc0BiasTransferSrcScalarPerVector = 1;
+
+  template <
+      ck::index_t kGemm1NPerBlock,
+      ck::index_t kGemm1NXdlPerWave,
+      ck::index_t kCShuffleNXdlPerWavePerShuffle>
+  using DeviceOpInstanceTemp = ck::tensor_operation::device::
+      DeviceGroupedMultiheadAttentionForward_Xdl_CShuffle_V2<
+          NumDimG,
+          NumDimM,
+          NumDimN,
+          NumDimK,
+          NumDimO,
+          ADataType,
+          B0DataType,
+          B1DataType,
+          CDataType,
+          GemmDataType,
+          ZDataType,
+          LSEDataType,
+          Acc0BiasDataType,
+          Acc1BiasDataType,
+          AccDataType,
+          CShuffleDataType,
+          AElementOp,
+          B0ElementOp,
+          Acc0ElementOp,
+          B1ElementOp,
+          CElementOp,
+          GemmSpec,
+          TensorSpecA,
+          TensorSpecB0,
+          TensorSpecB1,
+          TensorSpecC,
+          1,
+          256,
+          128, // MPerBlock
+          128, // NPerBlock
+          32, // KPerBlock
+          kGemm1NPerBlock,
+          32, // Gemm1KPerBlock
+          8, // AK1
+          8, // BK1
+          2, // B1K1
+          32, // MPerXDL
+          32, // NPerXDL
+          1, // MXdlPerWave
+          4, // NXdlPerWave
+          kGemm1NXdlPerWave,
+          1, // DropoutStep
+          S<4, 64, 1>, // ABlockTransfer
+          S<1, 0, 2>,
+          S<1, 0, 2>,
+          2,
+          kABBlockTransferSrcScalarPerVector, // TUNABLE
+          8,
+          true,
+          S<4, 64, 1>, // BBlockTransfer
+          S<1, 0, 2>,
+          S<1, 0, 2>,
+          2,
+          kABBlockTransferSrcScalarPerVector, // TUNABLE
+          8,
+          true,
+          kAcc0BiasTransferSrcScalarPerVector,
+          S<8, 32, 1>, // B1BlockTransfer
+          S<0, 2, 1>,
+          S<0, 2, 1>,
+          1,
+          kB1BlockTransferSrcScalarPerVector, // TUNABLE
+          2,
+          false,
+          1, // CShuffleMXdlPerWavePerShuffle
+          kCShuffleNXdlPerWavePerShuffle,
+          S<1,
+            32,
             1,
-            kB1BlockTransferSrcScalarPerVector, // TUNABLE
-            2,
-            false,
-            1, // CShuffleMXdlPerWavePerShuffle
-            2, // CShuffleNXdlPerWavePerShuffle
-            S<1,
-              32,
-              1,
-              8>, // CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock
-            kCShuffleBlockTransferScalarPerVector, // TUNABLE
-            1,
-            MaskingSpec>; // MaskingSpecialization
+            8>, // CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock
+          kCShuffleBlockTransferScalarPerVector, // TUNABLE
+          1,
+          MaskingSpec>; // MaskingSpecialization
 
-    RunWithDeviceOp<DeviceOpInstance>(param, stream);
+  static void Run(GroupedForwardParams& param, hipStream_t stream) {
+    if (param.K <= 32 && param.Kv <= 32) {
+      constexpr ck::index_t kGemm1NPerBlock = 32;
+      constexpr ck::index_t kGemm1NXdlPerWave = 1;
+      constexpr ck::index_t kCShuffleNXdlPerWavePerShuffle = 1;
+
+      using DeviceOpInstance = DeviceOpInstanceTemp<
+          kGemm1NPerBlock,
+          kGemm1NXdlPerWave,
+          kCShuffleNXdlPerWavePerShuffle>;
+
+      RunWithDeviceOp<DeviceOpInstance>(param, stream);
+    } else if (param.K <= 64 && param.Kv <= 64) {
+      constexpr ck::index_t kGemm1NPerBlock = 64;
+      constexpr ck::index_t kGemm1NXdlPerWave = 2;
+      constexpr ck::index_t kCShuffleNXdlPerWavePerShuffle = 2;
+
+      using DeviceOpInstance = DeviceOpInstanceTemp<
+          kGemm1NPerBlock,
+          kGemm1NXdlPerWave,
+          kCShuffleNXdlPerWavePerShuffle>;
+
+      RunWithDeviceOp<DeviceOpInstance>(param, stream);
+    } else {
+      constexpr ck::index_t kGemm1NPerBlock = 128;
+      constexpr ck::index_t kGemm1NXdlPerWave = 4;
+      constexpr ck::index_t kCShuffleNXdlPerWavePerShuffle = 4;
+
+      using DeviceOpInstance = DeviceOpInstanceTemp<
+          kGemm1NPerBlock,
+          kGemm1NXdlPerWave,
+          kCShuffleNXdlPerWavePerShuffle>;
+
+      RunWithDeviceOp<DeviceOpInstance>(param, stream);
+    };
   };
 
   template <typename DeviceOpInstance>

From 060c372f121bf4f4616d646dc66cbd23735e529b Mon Sep 17 00:00:00 2001
From: Qianfeng Zhang <Qianfeng.Zhang@amd.com>
Date: Sun, 22 Oct 2023 20:46:18 +0000
Subject: [PATCH 105/641] Select some template parameters as tunables for
 backward

---
 .../hip_fmha/ck_fmha_batched_backward.h       | 234 ++++++++----------
 .../hip_fmha/ck_fmha_grouped_backward.h       | 229 +++++++----------
 2 files changed, 194 insertions(+), 269 deletions(-)

diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_backward.h b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_backward.h
index 75b572708..beb93f7c2 100644
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_backward.h
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_backward.h
@@ -62,145 +62,106 @@ struct batched_backward_masktype_attnbias_dispatched {
       ck::tensor_operation::device::TensorSpecialization::Default;
   static constexpr bool Deterministic = true;
 
-  static void Run(BatchedBackwardParams& param, hipStream_t stream) {
-    // Tunables
-    constexpr ck::index_t kABBlockTransferSrcScalarPerVector = 1;
-    constexpr ck::index_t kB1BlockTransferSrcScalarPerVector = 1;
-    constexpr ck::index_t kCShuffleBlockTransferScalarPerVector = 1;
-    constexpr ck::index_t kAcc0BiasTransferSrcScalarPerVector = 1;
+  static constexpr ck::index_t kABBlockTransferSrcScalarPerVector = 1;
+  static constexpr ck::index_t kB1BlockTransferSrcScalarPerVector = 1;
+  static constexpr ck::index_t kCShuffleBlockTransferScalarPerVector = 1;
+  static constexpr ck::index_t kAcc0BiasTransferSrcScalarPerVector = 1;
+
+  template <
+      ck::index_t kGemm1NPerBlock,
+      ck::index_t kGemm1NXdlPerWave,
+      ck::index_t kCShuffleNXdlPerWavePerShuffle,
+      typename kCShuffleBlockTransferClusterLengths>
+  using DeviceOpInstanceTemp = ck::tensor_operation::device::
+      DeviceBatchedMultiheadAttentionBackward_Qloop_Xdl_CShuffle_V1<
+          NumDimG,
+          NumDimM,
+          NumDimN,
+          NumDimK,
+          NumDimO,
+          InputDataType,
+          OutputDataType,
+          GemmDataType,
+          ZDataType,
+          LSEDataType,
+          Acc0BiasDataType,
+          Acc1BiasDataType,
+          AccDataType,
+          ShuffleDataType,
+          QKVElementOp,
+          QKVElementOp,
+          Scale,
+          QKVElementOp,
+          YElementOp,
+          GemmSpec,
+          TensorSpecQ,
+          TensorSpecK,
+          TensorSpecV,
+          TensorSpecY,
+          1,
+          256,
+          128, // MPerBlock
+          128, // NPerBlock
+          kGemm1NPerBlock, // KPerBlock == kGemm1NPerBlock required
+          kGemm1NPerBlock,
+          32, // Gemm1KperBlock
+          32, // Gemm2KPerBlock
+          8, // AK1
+          8, // BK1
+          2, // B1K1
+          32, // MPerXDL
+          32, // NPerXDL
+          4, // MXdlPerWave
+          1, // NXdlPerWave
+          kGemm1NXdlPerWave,
+          1, // Gemm2NXdlPerWave
+          S<4, 64, 1>, // ABlockTransfer
+          S<1, 0, 2>,
+          S<1, 0, 2>,
+          2,
+          kABBlockTransferSrcScalarPerVector, // TUNABLE
+          8,
+          true,
+          S<4, 64, 1>, // BBlockTransfer
+          S<1, 0, 2>,
+          S<1, 0, 2>,
+          2,
+          kABBlockTransferSrcScalarPerVector, // TUNABLE
+          8,
+          true,
+          kAcc0BiasTransferSrcScalarPerVector, // TUNABLE
+          1, // CShuffleMXdlPerWavePerShuffle
+          kCShuffleNXdlPerWavePerShuffle,
+          kCShuffleBlockTransferClusterLengths,
+          kCShuffleBlockTransferScalarPerVector, // TUNABLE
+          MaskingSpec,
+          Deterministic>;
 
+  static void Run(BatchedBackwardParams& param, hipStream_t stream) {
     if (param.K <= 32 && param.Kv <= 32) {
-      using DeviceOpInstance = ck::tensor_operation::device::
-          DeviceBatchedMultiheadAttentionBackward_Qloop_Xdl_CShuffle_V1<
-              NumDimG,
-              NumDimM,
-              NumDimN,
-              NumDimK,
-              NumDimO,
-              InputDataType,
-              OutputDataType,
-              GemmDataType,
-              ZDataType,
-              LSEDataType,
-              Acc0BiasDataType,
-              Acc1BiasDataType,
-              AccDataType,
-              ShuffleDataType,
-              QKVElementOp,
-              QKVElementOp,
-              Scale,
-              QKVElementOp,
-              YElementOp,
-              GemmSpec,
-              TensorSpecQ,
-              TensorSpecK,
-              TensorSpecV,
-              TensorSpecY,
-              1,
-              256,
-              128, // MPerBlock
-              128, // NPerBlock
-              32, // KPerBlock
-              32, // Gemm1NPerBlock
-              32, // Gemm1KperBlock
-              64, // Gemm2KPerBlock
-              8, // AK1
-              8, // BK1
-              2, // B1K1
-              32, // MPerXDL
-              32, // NPerXDL
-              4, // MXdlPerWave
-              1, // NXdlPerWave
-              1, // Gemm1NXdlPerWave
-              1, // Gemm2NXdlPerWave
-              S<4, 64, 1>, // ABlockTransfer
-              S<1, 0, 2>,
-              S<1, 0, 2>,
-              2,
-              kABBlockTransferSrcScalarPerVector, // TUNABLE
-              8,
-              true,
-              S<4, 64, 1>, // BBlockTransfer
-              S<1, 0, 2>,
-              S<1, 0, 2>,
-              2,
-              kABBlockTransferSrcScalarPerVector, // TUNABLE
-              8,
-              true,
-              kAcc0BiasTransferSrcScalarPerVector, // TUNABLE
-              1,
-              1,
-              S<1, 64, 1, 4>,
-              kCShuffleBlockTransferScalarPerVector, // TUNABLE
-              MaskingSpec,
-              Deterministic>;
+      constexpr ck::index_t kGemm1NPerBlock = 32;
+      constexpr ck::index_t kGemm1NXdlPerWave = 1;
+      constexpr ck::index_t kCShuffleNXdlPerWavePerShuffle = 1;
+      using kCShuffleBlockTransferClusterLengths = S<1, 64, 1, 4>;
+
+      using DeviceOpInstance = DeviceOpInstanceTemp<
+          kGemm1NPerBlock,
+          kGemm1NXdlPerWave,
+          kCShuffleNXdlPerWavePerShuffle,
+          kCShuffleBlockTransferClusterLengths>;
 
       RunWithDeviceOp<DeviceOpInstance>(param, stream);
     } else if (param.K <= 64 && param.Kv <= 64) {
-      using DeviceOpInstance = ck::tensor_operation::device::
-          DeviceBatchedMultiheadAttentionBackward_Qloop_Xdl_CShuffle_V1<
-              NumDimG,
-              NumDimM,
-              NumDimN,
-              NumDimK,
-              NumDimO,
-              InputDataType,
-              OutputDataType,
-              GemmDataType,
-              ZDataType,
-              LSEDataType,
-              Acc0BiasDataType,
-              Acc1BiasDataType,
-              AccDataType,
-              ShuffleDataType,
-              QKVElementOp,
-              QKVElementOp,
-              Scale,
-              QKVElementOp,
-              YElementOp,
-              GemmSpec,
-              TensorSpecQ,
-              TensorSpecK,
-              TensorSpecV,
-              TensorSpecY,
-              1,
-              256,
-              128, // MPerBlock
-              128, // NPerBlock
-              64, // KPerBlock
-              64, // Gemm1NPerBlock
-              32, // Gemm1KPerBlock
-              32, // Gemm2KPerBlock
-              8, // AK1
-              8, // BK1
-              2, // B1K1
-              32, // MPerXDL
-              32, // NPerXDL
-              4, // MXdlPerWave
-              1, // NXdlPerWave
-              2, // Gemm1NXdlPerWave
-              1, // Gemm2NXdlPerWave
-              S<4, 64, 1>, // ABlockTransfer
-              S<1, 0, 2>,
-              S<1, 0, 2>,
-              2,
-              kABBlockTransferSrcScalarPerVector, // TUNABLE
-              8,
-              true,
-              S<4, 64, 1>, // BBlockTransfer
-              S<1, 0, 2>,
-              S<1, 0, 2>,
-              2,
-              kABBlockTransferSrcScalarPerVector, // TUNABLE
-              8,
-              true,
-              kAcc0BiasTransferSrcScalarPerVector, // TUNABLE
-              1,
-              2,
-              S<1, 32, 1, 8>,
-              kCShuffleBlockTransferScalarPerVector, // TUNABLE
-              MaskingSpec,
-              Deterministic>;
+      constexpr ck::index_t kGemm1NPerBlock = 64;
+      constexpr ck::index_t kGemm1NXdlPerWave = 2;
+      constexpr ck::index_t kCShuffleNXdlPerWavePerShuffle = 2;
+      using kCShuffleBlockTransferClusterLengths = S<1, 32, 1, 8>;
+
+      using DeviceOpInstance = DeviceOpInstanceTemp<
+          kGemm1NPerBlock,
+          kGemm1NXdlPerWave,
+          kCShuffleNXdlPerWavePerShuffle,
+          kCShuffleBlockTransferClusterLengths>;
 
       RunWithDeviceOp<DeviceOpInstance>(param, stream);
     } else {
@@ -271,7 +232,10 @@ struct batched_backward_masktype_attnbias_dispatched {
               false,
               1, // CShuffleMXdlPerWavePerShuffle
               4, // CShuffleNXdlPerWavePerShuffle
-              S<1, 32, 1, 8>,
+              S<1,
+                32,
+                1,
+                8>, // CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock
               kCShuffleBlockTransferScalarPerVector, // TUNABLE
               MaskingSpec,
               Deterministic>;
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_backward.h b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_backward.h
index f4afd8a75..9847b9fa0 100644
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_backward.h
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_backward.h
@@ -63,145 +63,106 @@ struct grouped_backward_masktype_attnbias_dispatched {
       ck::tensor_operation::device::TensorSpecialization::Default;
   static constexpr bool Deterministic = true;
 
-  static void Run(GroupedBackwardParams& param, hipStream_t stream) {
-    // Tunables
-    constexpr ck::index_t kABBlockTransferSrcScalarPerVector = 1;
-    constexpr ck::index_t kB1BlockTransferSrcScalarPerVector = 1;
-    constexpr ck::index_t kCShuffleBlockTransferScalarPerVector = 1;
-    constexpr ck::index_t kAcc0BiasTransferSrcScalarPerVector = 1;
+  static constexpr ck::index_t kABBlockTransferSrcScalarPerVector = 1;
+  static constexpr ck::index_t kB1BlockTransferSrcScalarPerVector = 1;
+  static constexpr ck::index_t kCShuffleBlockTransferScalarPerVector = 1;
+  static constexpr ck::index_t kAcc0BiasTransferSrcScalarPerVector = 1;
+
+  template <
+      ck::index_t kGemm1NPerBlock,
+      ck::index_t kGemm1NXdlPerWave,
+      ck::index_t kCShuffleNXdlPerWavePerShuffle,
+      typename kCShuffleBlockTransferClusterLengths>
+  using DeviceOpInstanceTemp = ck::tensor_operation::device::
+      DeviceGroupedMultiheadAttentionBackward_Qloop_Xdl_CShuffle_V1<
+          NumDimG,
+          NumDimM,
+          NumDimN,
+          NumDimK,
+          NumDimO,
+          InputDataType,
+          OutputDataType,
+          GemmDataType,
+          ZDataType,
+          LSEDataType,
+          Acc0BiasDataType,
+          Acc1BiasDataType,
+          AccDataType,
+          ShuffleDataType,
+          QKVElementOp,
+          QKVElementOp,
+          Scale,
+          QKVElementOp,
+          YElementOp,
+          GemmSpec,
+          TensorSpecQ,
+          TensorSpecK,
+          TensorSpecV,
+          TensorSpecY,
+          1,
+          256,
+          128, // MPerBlock
+          128, // NPerBlock
+          kGemm1NPerBlock, // KPerBlock = kGemm1NerBlock
+          kGemm1NPerBlock,
+          32, // Gemm1KPerBlock
+          32, // Gemm2KPerBlock
+          8, // AK1
+          8, // BK1
+          2, // B1K1
+          32, // MPerXDL
+          32, // NPerXDL
+          4, // MXdlPerWave
+          1, // NXdlPerWave
+          kGemm1NXdlPerWave,
+          1, // Gemm2NXdlPerWave
+          S<4, 64, 1>, // ABlockTransfer
+          S<1, 0, 2>,
+          S<1, 0, 2>,
+          2,
+          kABBlockTransferSrcScalarPerVector, // TUNABLE
+          8,
+          true,
+          S<4, 64, 1>, // BBlockTransfer
+          S<1, 0, 2>,
+          S<1, 0, 2>,
+          2,
+          kABBlockTransferSrcScalarPerVector, // TUNABLE
+          8,
+          true,
+          kAcc0BiasTransferSrcScalarPerVector, // TUNABLE
+          1, // CShuffleMXdlPerWavePerShuffle
+          kCShuffleNXdlPerWavePerShuffle,
+          kCShuffleBlockTransferClusterLengths,
+          kCShuffleBlockTransferScalarPerVector, // TUNABLE
+          MaskingSpec,
+          Deterministic>;
 
+  static void Run(GroupedBackwardParams& param, hipStream_t stream) {
     if (param.K <= 32 && param.Kv <= 32) {
-      using DeviceOpInstance = ck::tensor_operation::device::
-          DeviceGroupedMultiheadAttentionBackward_Qloop_Xdl_CShuffle_V1<
-              NumDimG,
-              NumDimM,
-              NumDimN,
-              NumDimK,
-              NumDimO,
-              InputDataType,
-              OutputDataType,
-              GemmDataType,
-              ZDataType,
-              LSEDataType,
-              Acc0BiasDataType,
-              Acc1BiasDataType,
-              AccDataType,
-              ShuffleDataType,
-              QKVElementOp,
-              QKVElementOp,
-              Scale,
-              QKVElementOp,
-              YElementOp,
-              GemmSpec,
-              TensorSpecQ,
-              TensorSpecK,
-              TensorSpecV,
-              TensorSpecY,
-              1,
-              256,
-              128, // MPerBlock
-              128, // NPerBlock
-              32, // KPerBlock
-              32, // Gemm1NPerBlock
-              32, // Gemm1KPerBlock
-              64, // Gemm2KPerBlock
-              8, // AK1
-              8, // BK1
-              2, // B1K1
-              32, // MPerXDL
-              32, // NPerXDL
-              4, // MXdlPerWave
-              1, // NXdlPerWave
-              1, // Gemm1NXdlPerWave
-              1, // Gemm2NXdlPerWave
-              S<4, 64, 1>, // ABlockTransfer
-              S<1, 0, 2>,
-              S<1, 0, 2>,
-              2,
-              kABBlockTransferSrcScalarPerVector, // TUNABLE
-              8,
-              true,
-              S<4, 64, 1>, // BBlockTransfer
-              S<1, 0, 2>,
-              S<1, 0, 2>,
-              2,
-              kABBlockTransferSrcScalarPerVector, // TUNABLE
-              8,
-              true,
-              kAcc0BiasTransferSrcScalarPerVector, // TUNABLE
-              1,
-              1,
-              S<1, 64, 1, 4>,
-              kCShuffleBlockTransferScalarPerVector, // TUNABLE
-              MaskingSpec,
-              Deterministic>;
+      constexpr ck::index_t kGemm1NPerBlock = 32;
+      constexpr ck::index_t kGemm1NXdlPerWave = 1;
+      constexpr ck::index_t kCShuffleNXdlPerWavePerShuffle = 1;
+      using kCShuffleBlockTransferClusterLengths = S<1, 64, 1, 4>;
+
+      using DeviceOpInstance = DeviceOpInstanceTemp<
+          kGemm1NPerBlock,
+          kGemm1NXdlPerWave,
+          kCShuffleNXdlPerWavePerShuffle,
+          kCShuffleBlockTransferClusterLengths>;
 
       RunWithDeviceOp<DeviceOpInstance>(param, stream);
     } else if (param.K <= 64 && param.Kv <= 64) {
-      using DeviceOpInstance = ck::tensor_operation::device::
-          DeviceGroupedMultiheadAttentionBackward_Qloop_Xdl_CShuffle_V1<
-              NumDimG,
-              NumDimM,
-              NumDimN,
-              NumDimK,
-              NumDimO,
-              InputDataType,
-              OutputDataType,
-              GemmDataType,
-              ZDataType,
-              LSEDataType,
-              Acc0BiasDataType,
-              Acc1BiasDataType,
-              AccDataType,
-              ShuffleDataType,
-              QKVElementOp,
-              QKVElementOp,
-              Scale,
-              QKVElementOp,
-              YElementOp,
-              GemmSpec,
-              TensorSpecQ,
-              TensorSpecK,
-              TensorSpecV,
-              TensorSpecY,
-              1,
-              256,
-              128, // MPerBlock
-              128, // NPerBlock
-              64, // KPerBlock
-              64, // Gemm1NPerBlock
-              32, // Gemm1KPerBlock
-              32, // Gemm2KPerBlock
-              8, // AK1
-              8, // BK1
-              2, // B1K1
-              32, // MPerXDL
-              32, // NPerXDL
-              4, // MXdlPerWave
-              1, // NXdlPerWave
-              2, // Gemm1NXdlPerWave
-              1, // Gemm2NXdlPerWave
-              S<4, 64, 1>, // ABlockTransfer
-              S<1, 0, 2>,
-              S<1, 0, 2>,
-              2,
-              kABBlockTransferSrcScalarPerVector, // TUNABLE
-              8,
-              true,
-              S<4, 64, 1>, // BBlockTransfer
-              S<1, 0, 2>,
-              S<1, 0, 2>,
-              2,
-              kABBlockTransferSrcScalarPerVector, // TUNABLE
-              8,
-              true,
-              kAcc0BiasTransferSrcScalarPerVector, // TUNABLE
-              1,
-              2,
-              S<1, 32, 1, 8>,
-              kCShuffleBlockTransferScalarPerVector, // TUNABLE
-              MaskingSpec,
-              Deterministic>;
+      constexpr ck::index_t kGemm1NPerBlock = 64;
+      constexpr ck::index_t kGemm1NXdlPerWave = 2;
+      constexpr ck::index_t kCShuffleNXdlPerWavePerShuffle = 2;
+      using kCShuffleBlockTransferClusterLengths = S<1, 32, 1, 8>;
+
+      using DeviceOpInstance = DeviceOpInstanceTemp<
+          kGemm1NPerBlock,
+          kGemm1NXdlPerWave,
+          kCShuffleNXdlPerWavePerShuffle,
+          kCShuffleBlockTransferClusterLengths>;
 
       RunWithDeviceOp<DeviceOpInstance>(param, stream);
     } else {

From e300156595fcaef31c98d9d27d722d057a126002 Mon Sep 17 00:00:00 2001
From: Qianfeng Zhang <Qianfeng.Zhang@amd.com>
Date: Mon, 23 Oct 2023 00:07:52 +0000
Subject: [PATCH 106/641] Provide classes to concentratedly define the common
 and default infer-op template parameters

---
 .../hip_fmha/ck_fmha_batched_infer.h          | 117 ++++++++---------
 .../hip_fmha/ck_fmha_device_gemm_constants.h  | 120 ++++++++++++++++++
 .../hip_fmha/ck_fmha_grouped_infer.h          | 113 ++++++++---------
 3 files changed, 222 insertions(+), 128 deletions(-)
 create mode 100644 xformers/csrc/attention/hip_fmha/ck_fmha_device_gemm_constants.h

diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_infer.h b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_infer.h
index 53bdaa1e9..c73108dc9 100644
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_infer.h
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_infer.h
@@ -9,6 +9,7 @@
 #include <ck/tensor_operation/gpu/element/element_wise_operation.hpp>
 #include "ck/tensor_operation/gpu/device/impl/device_batched_mha_infer_xdl_cshuffle.hpp"
 
+#include "ck_fmha_device_gemm_constants.h"
 #include "ck_fmha_op_helper.h"
 #include "ck_fmha_params.h"
 
@@ -29,12 +30,6 @@ struct batched_infer_masktype_attnbias_dispatched {
       typename std::conditional<has_attn_bias, scalar_t, void>::type;
   using Acc1BiasDataType = void;
 
-  static constexpr ck::index_t NumDimG = 2;
-  static constexpr ck::index_t NumDimM = 1;
-  static constexpr ck::index_t NumDimN = 1;
-  static constexpr ck::index_t NumDimK = 1;
-  static constexpr ck::index_t NumDimO = 1;
-
   using AElementOp = PassThrough;
   using B0ElementOp = PassThrough;
   using Acc0ElementOp = ck::tensor_operation::element_wise::Scale;
@@ -47,15 +42,6 @@ struct batched_infer_masktype_attnbias_dispatched {
       static_cast<ck::tensor_operation::device::MaskingSpecialization>(
           custom_mask_type);
 
-  static constexpr auto TensorSpecA =
-      ck::tensor_operation::device::TensorSpecialization::Default;
-  static constexpr auto TensorSpecB0 =
-      ck::tensor_operation::device::TensorSpecialization::Default;
-  static constexpr auto TensorSpecB1 =
-      ck::tensor_operation::device::TensorSpecialization::Default;
-  static constexpr auto TensorSpecC =
-      ck::tensor_operation::device::TensorSpecialization::Default;
-
   static constexpr ck::index_t kABBlockTransferSrcScalarPerVector = 1;
   static constexpr ck::index_t kB1BlockTransferSrcScalarPerVector = 1;
   static constexpr ck::index_t kCShuffleBlockTransferScalarPerVector = 1;
@@ -67,11 +53,11 @@ struct batched_infer_masktype_attnbias_dispatched {
       ck::index_t kCShuffleNXdlPerWavePerShuffle>
   using DeviceOpInstanceTemp = ck::tensor_operation::device::
       DeviceBatchedMultiheadAttentionInfer_Xdl_CShuffle<
-          NumDimG,
-          NumDimM,
-          NumDimN,
-          NumDimK,
-          NumDimO,
+          GemmOpConstantsCommon::NumDimG,
+          GemmOpConstantsCommon::NumDimM,
+          GemmOpConstantsCommon::NumDimN,
+          GemmOpConstantsCommon::NumDimK,
+          GemmOpConstantsCommon::NumDimO,
           ADataType,
           B0DataType,
           B1DataType,
@@ -86,55 +72,56 @@ struct batched_infer_masktype_attnbias_dispatched {
           B1ElementOp,
           CElementOp,
           GemmSpec,
-          TensorSpecA,
-          TensorSpecB0,
-          TensorSpecB1,
-          TensorSpecC,
+          GemmOpConstantsCommon::TensorSpecA,
+          GemmOpConstantsCommon::TensorSpecB0,
+          GemmOpConstantsCommon::TensorSpecB1,
+          GemmOpConstantsCommon::TensorSpecC,
           1,
-          256,
-          128, // MPerBlock
-          128, // NPerBlock
-          32, // KPerBlock
+          GemmOpConstantsBatchedInfer::BlockSize,
+          GemmOpConstantsBatchedInfer::MPerBlock,
+          GemmOpConstantsBatchedInfer::NPerBlock,
+          GemmOpConstantsBatchedInfer::KPerBlock,
           kGemm1NPerBlock,
-          32,
-          8, // AK1
-          8, // BK1
-          2, // B1K1
-          32, // MPerXDL
-          32, // NPerXDL
-          1, // MXdlPerWave
-          4, // NXdlPerWave
+          GemmOpConstantsBatchedInfer::Gemm1KPerBlock,
+          GemmOpConstantsBatchedInfer::AK1,
+          GemmOpConstantsBatchedInfer::BK1,
+          GemmOpConstantsBatchedInfer::B1K1,
+          GemmOpConstantsBatchedInfer::MPerXDL,
+          GemmOpConstantsBatchedInfer::NPerXDL,
+          GemmOpConstantsBatchedInfer::MXdlPerWave,
+          GemmOpConstantsBatchedInfer::NXdlPerWave,
           kGemm1NXdlPerWave,
-          S<4, 64, 1>, // ABlockTransfer
-          S<1, 0, 2>,
-          S<1, 0, 2>,
-          2,
-          kABBlockTransferSrcScalarPerVector, // TUNABLE
-          8,
-          true,
-          S<4, 64, 1>, // BBlockTransfer
-          S<1, 0, 2>,
-          S<1, 0, 2>,
-          2,
-          kABBlockTransferSrcScalarPerVector, // TUNABLE
-          8,
-          true,
-          kAcc0BiasTransferSrcScalarPerVector, // TUNABLE
-          S<16, 16, 1>, // B1BlockTransfer
-          S<0, 2, 1>,
-          S<0, 2, 1>,
-          1,
-          kB1BlockTransferSrcScalarPerVector, // TUNABLE
-          2,
-          false,
-          1, // CShuffleMXdlPerWavePerShuffle
+          GemmOpConstantsBatchedInfer::
+              ABlockTransferThreadClusterLengths_AK0_M_AK1,
+          GemmOpConstantsBatchedInfer::ABlockTransferThreadClusterArrangeOrder,
+          GemmOpConstantsBatchedInfer::ABlockTransferSrcAccessOrder,
+          GemmOpConstantsBatchedInfer::ABlockTransferSrcVectorDim,
+          kABBlockTransferSrcScalarPerVector,
+          GemmOpConstantsBatchedInfer::ABlockTransferDstScalarPerVector_AK1,
+          GemmOpConstantsBatchedInfer::ABlockLdsExtraM,
+          GemmOpConstantsBatchedInfer::
+              BBlockTransferThreadClusterLengths_BK0_N_BK1,
+          GemmOpConstantsBatchedInfer::BBlockTransferThreadClusterArrangeOrder,
+          GemmOpConstantsBatchedInfer::BBlockTransferSrcAccessOrder,
+          GemmOpConstantsBatchedInfer::BBlockTransferSrcVectorDim,
+          kABBlockTransferSrcScalarPerVector,
+          GemmOpConstantsBatchedInfer::BBlockTransferDstScalarPerVector_BK1,
+          GemmOpConstantsBatchedInfer::BBlockLdsExtraN,
+          kAcc0BiasTransferSrcScalarPerVector,
+          GemmOpConstantsBatchedInfer::
+              B1BlockTransferThreadClusterLengths_BK0_N_BK1,
+          GemmOpConstantsBatchedInfer::B1BlockTransferThreadClusterArrangeOrder,
+          GemmOpConstantsBatchedInfer::B1BlockTransferSrcAccessOrder,
+          GemmOpConstantsBatchedInfer::B1BlockTransferSrcVectorDim,
+          kB1BlockTransferSrcScalarPerVector,
+          GemmOpConstantsBatchedInfer::B1BlockTransferDstScalarPerVector_BK1,
+          GemmOpConstantsBatchedInfer::B1BlockLdsExtraN,
+          GemmOpConstantsBatchedInfer::CShuffleMXdlPerWavePerShuffle,
           kCShuffleNXdlPerWavePerShuffle,
-          S<1,
-            32,
-            1,
-            8>, // CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock
-          kCShuffleBlockTransferScalarPerVector, // TUNABLE
-          MaskingSpec>; // MaskingSpecialization
+          GemmOpConstantsBatchedInfer::
+              CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock,
+          kCShuffleBlockTransferScalarPerVector,
+          MaskingSpec>;
 
   static void Run(BatchedForwardParams& param, hipStream_t stream) {
     if (param.K <= 32 && param.Kv <= 32) {
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_device_gemm_constants.h b/xformers/csrc/attention/hip_fmha/ck_fmha_device_gemm_constants.h
new file mode 100644
index 000000000..2a14f1300
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_device_gemm_constants.h
@@ -0,0 +1,120 @@
+#pragma once
+
+#include <ck/ck.hpp>
+#include "ck_fmha_op_helper.h"
+
+// list the template parameters that is commonly used
+struct GemmOpConstantsCommon {
+  static constexpr ck::index_t NumDimG = 2;
+  static constexpr ck::index_t NumDimM = 1;
+  static constexpr ck::index_t NumDimN = 1;
+  static constexpr ck::index_t NumDimK = 1;
+  static constexpr ck::index_t NumDimO = 1;
+
+  static constexpr auto TensorSpecA =
+      ck::tensor_operation::device::TensorSpecialization::Default;
+  static constexpr auto TensorSpecB0 =
+      ck::tensor_operation::device::TensorSpecialization::Default;
+  static constexpr auto TensorSpecB1 =
+      ck::tensor_operation::device::TensorSpecialization::Default;
+  static constexpr auto TensorSpecC =
+      ck::tensor_operation::device::TensorSpecialization::Default;
+};
+
+// list the template parameters that will not be tuned,
+// the commented lines gives the tunable template parameters
+struct GemmOpConstantsBatchedInfer {
+  static constexpr ck::index_t BlockSize = 256;
+  static constexpr ck::index_t MPerBlock = 128;
+  static constexpr ck::index_t NPerBlock = 128;
+  static constexpr ck::index_t KPerBlock = 32;
+  // static constexpr ck::index_t Gemm1NPerBlock;
+  static constexpr ck::index_t Gemm1KPerBlock = 32;
+  static constexpr ck::index_t AK1 = 8;
+  static constexpr ck::index_t BK1 = 8;
+  static constexpr ck::index_t B1K1 = 2;
+  static constexpr ck::index_t MPerXDL = 32;
+  static constexpr ck::index_t NPerXDL = 32;
+  static constexpr ck::index_t MXdlPerWave = 1;
+  static constexpr ck::index_t NXdlPerWave = 4;
+  // static constexpr ck::index_t Gemm1NXdlPerWave;
+  using ABlockTransferThreadClusterLengths_AK0_M_AK1 = S<4, 64, 1>;
+  using ABlockTransferThreadClusterArrangeOrder = S<1, 0, 2>;
+  using ABlockTransferSrcAccessOrder = S<1, 0, 2>;
+  static constexpr ck::index_t ABlockTransferSrcVectorDim = 2;
+  // static constexpr ck::index_t ABlockTransferSrcScalarPerVector;
+  static constexpr ck::index_t ABlockTransferDstScalarPerVector_AK1 = 8;
+  static constexpr bool ABlockLdsExtraM = true;
+  using BBlockTransferThreadClusterLengths_BK0_N_BK1 = S<4, 64, 1>;
+  using BBlockTransferThreadClusterArrangeOrder = S<1, 0, 2>;
+  using BBlockTransferSrcAccessOrder = S<1, 0, 2>;
+  static constexpr ck::index_t BBlockTransferSrcVectorDim = 2;
+  // static constexpr ck::index_t BBlockTransferSrcScalarPerVector;
+  static constexpr ck::index_t BBlockTransferDstScalarPerVector_BK1 = 8;
+  static consexpr bool BBlockLdsExtraN = true;
+  // static constexpr ck::index_t Acc0BiasTransferSrcScalarPerVector;
+  using B1BlockTransferThreadClusterLengths_BK0_N_BK1 = S<16, 16, 1>;
+  using B1BlockTransferThreadClusterArrangeOrder = S<0, 2, 1>;
+  using B1BlockTransferSrcAccessOrder = S<0, 2, 1>;
+  static constexpr ck::index_t B1BlockTransferSrcVectorDim = 1;
+  // static constexpr ck::index_t B1BlockTransferSrcScalarPerVector;
+  static constexpr ck::index_t B1BlockTransferDstScalarPerVector_BK1 = 2;
+  static constexpr bool B1BlockLdsExtraN = false;
+  static ck::index_t CShuffleMXdlPerWavePerShuffle = 1;
+  // static constexpr ck::index_t CShuffleNXdlPerWavePerShuffle;
+  using CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock =
+      S<1, 32, 1, 8>;
+  // static constexpr ck::index_t
+  // CShuffleBlockTransferScalarPerVector_NPerBlock;
+};
+
+// list the template parameters that will not be tuned,
+// the commented lines gives the tunable template parameters
+struct GemmOpConstantsGroupedInfer {
+  static constexpr ck::index_t BlockSize = 256;
+  static constexpr ck::index_t MPerBlock = 128;
+  static constexpr ck::index_t NPerBlock = 128;
+  static constexpr ck::index_t KPerBlock = 32;
+  // static constexpr ck::index_t Gemm1NPerBlock;
+  static constexpr ck::index_t Gemm1KPerBlock = 32;
+  static constexpr ck::index_t AK1 = 8;
+  static constexpr ck::index_t BK1 = 8;
+  static constexpr ck::index_t B1K1 = 2;
+  static constexpr ck::index_t MPerXDL = 32;
+  static constexpr ck::index_t NPerXDL = 32;
+  static constexpr ck::index_t MXdlPerWave = 1;
+  static constexpr ck::index_t NXdlPerWave = 4;
+  // static constexpr ck::index_t Gemm1NXdlPerWave;
+  using ABlockTransferThreadClusterLengths_AK0_M_AK1 = S<4, 64, 1>;
+  using ABlockTransferThreadClusterArrangeOrder = S<1, 0, 2>;
+  using ABlockTransferSrcAccessOrder = S<1, 0, 2>;
+  static constexpr ck::index_t ABlockTransferSrcVectorDim = 2;
+  // static constexpr ck::index_t ABlockTransferSrcScalarPerVector,
+  static constexpr ck::index_t ABlockTransferDstScalarPerVector_AK1 = 8;
+  static constexpr bool ABlockLdsExtraM = true;
+  using BBlockTransferThreadClusterLengths_BK0_N_BK1 = S<4, 64, 1>;
+  using BBlockTransferThreadClusterArrangeOrder = S<1, 0, 2>;
+  using BBlockTransferSrcAccessOrder = S<1, 0, 2>;
+  static constexpr ck::index_t BBlockTransferSrcVectorDim = 2;
+  // static constexpr ck::index_t BBlockTransferSrcScalarPerVector;
+  static constexpr ck::index_t BBlockTransferDstScalarPerVector_BK1 = 8;
+  static constexpr bool BBlockLdsExtraN = true;
+  // static constexpr ck::index_t Acc0BiasTransferSrcScalarPerVector;
+  using B1BlockTransferThreadClusterLengths_BK0_N_BK1 = S<8, 32, 1>;
+  using B1BlockTransferThreadClusterArrangeOrder = S<0, 2, 1>;
+  using B1BlockTransferSrcAccessOrder = S<0, 2, 1>;
+  static constexpr ck::index_t B1BlockTransferSrcVectorDim = 1;
+  // static constexpr ck::index_t B1BlockTransferSrcScalarPerVector;
+  static constexpr ck::index_t B1BlockTransferDstScalarPerVector_BK1 = 2;
+  static constexpr bool B1BlockLdsExtraN = false;
+  static constexpr ck::index_t CShuffleMXdlPerWavePerShuffle = 1;
+  // static constexpr ck::index_t CShuffleNXdlPerWavePerShuffle;
+  using CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock =
+      S<1, 32, 1, 8>;
+  // static constexpr ck::index_t
+  // CShuffleBlockTransferScalarPerVector_NPerBlock;
+};
+
+struct GemmOpConstantsForward {};
+
+struct GemmOpConstantsBackward {};
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_infer.h b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_infer.h
index 2101181dc..b6aa53292 100644
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_infer.h
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_infer.h
@@ -10,6 +10,7 @@
 #include <ck/utility/sequence.hpp>
 #include "ck/tensor_operation/gpu/device/impl/device_grouped_mha_infer_xdl_cshuffle.hpp"
 
+#include "ck_fmha_device_gemm_constants.h"
 #include "ck_fmha_op_helper.h"
 #include "ck_fmha_params.h"
 
@@ -30,12 +31,6 @@ struct grouped_infer_masktype_attnbias_dispatched {
       typename std::conditional<has_attn_bias, scalar_t, void>::type;
   using Acc1BiasDataType = void;
 
-  static constexpr ck::index_t NumDimG = 2;
-  static constexpr ck::index_t NumDimM = 1;
-  static constexpr ck::index_t NumDimN = 1;
-  static constexpr ck::index_t NumDimK = 1;
-  static constexpr ck::index_t NumDimO = 1;
-
   using AElementOp = PassThrough;
   using B0ElementOp = PassThrough;
   using Acc0ElementOp = ck::tensor_operation::element_wise::Scale;
@@ -48,15 +43,6 @@ struct grouped_infer_masktype_attnbias_dispatched {
       static_cast<ck::tensor_operation::device::MaskingSpecialization>(
           custom_mask_type);
 
-  static constexpr auto TensorSpecA =
-      ck::tensor_operation::device::TensorSpecialization::Default;
-  static constexpr auto TensorSpecB0 =
-      ck::tensor_operation::device::TensorSpecialization::Default;
-  static constexpr auto TensorSpecB1 =
-      ck::tensor_operation::device::TensorSpecialization::Default;
-  static constexpr auto TensorSpecC =
-      ck::tensor_operation::device::TensorSpecialization::Default;
-
   static constexpr ck::index_t kABBlockTransferSrcScalarPerVector = 1;
   static constexpr ck::index_t kB1BlockTransferSrcScalarPerVector = 1;
   static constexpr ck::index_t kCShuffleBlockTransferScalarPerVector = 1;
@@ -68,11 +54,11 @@ struct grouped_infer_masktype_attnbias_dispatched {
       ck::index_t kCShuffleNXdlPerWavePerShuffle>
   using DeviceOpInstanceTemp = ck::tensor_operation::device::
       DeviceGroupedMultiheadAttentionInfer_Xdl_CShuffle<
-          NumDimG,
-          NumDimM,
-          NumDimN,
-          NumDimK,
-          NumDimO,
+          GemmOpConstantsCommon::NumDimG,
+          GemmOpConstantsCommon::NumDimM,
+          GemmOpConstantsCommon::NumDimN,
+          GemmOpConstantsCommon::NumDimK,
+          GemmOpConstantsCommon::NumDimO,
           ADataType,
           B0DataType,
           B1DataType,
@@ -87,55 +73,56 @@ struct grouped_infer_masktype_attnbias_dispatched {
           B1ElementOp,
           CElementOp,
           GemmSpec,
-          TensorSpecA,
-          TensorSpecB0,
-          TensorSpecB1,
-          TensorSpecC,
+          GemmOpConstantsCommon::TensorSpecA,
+          GemmOpConstantsCommon::TensorSpecB0,
+          GemmOpConstantsCommon::TensorSpecB1,
+          GemmOpConstantsCommon::TensorSpecC,
           1,
-          256,
-          128, // MPerBlock
-          128, // NPerBlock
-          32, // KPerBlock
+          GemmOpConstantsGroupedInfer::BlockSize,
+          GemmOpConstantsGroupedInfer::MPerBlock,
+          GemmOpConstantsGroupedInfer::NPerBlock,
+          GemmOpConstantsGroupedInfer::KPerBlock,
           kGemm1NPerBlock,
-          32,
-          8, // AK1
-          8, // BK1
-          2, // B1K1
-          32, // MPerXDL
-          32, // NPerXDL
-          1, // MXdlPerWave
-          4, // NXdlPerWave
+          GemmOpConstantsGroupedInfer::Gemm1KPerBlock,
+          GemmOpConstantsGroupedInfer::AK1,
+          GemmOpConstantsGroupedInfer::BK1,
+          GemmOpConstantsGroupedInfer::B1K1,
+          GemmOpConstantsGroupedInfer::MPerXDL,
+          GemmOpConstantsGroupedInfer::NPerXDL,
+          GemmOpConstantsGroupedInfer::MXdlPerWave,
+          GemmOpConstantsGroupedInfer::NXdlPerWave,
           kGemm1NXdlPerWave,
-          S<4, 64, 1>, // ABlockTransfer
-          S<1, 0, 2>,
-          S<1, 0, 2>,
-          2,
+          GemmOpConstantsGroupedInfer::
+              ABlockTransferThreadClusterLengths_AK0_M_AK1,
+          GemmOpConstantsGroupedInfer::ABlockTransferThreadClusterArrangeOrder,
+          GemmOpConstantsGroupedInfer::ABlockTransferSrcAccessOrder,
+          GemmOpConstantsGroupedInfer::ABlockTransferSrcVectorDim,
           kABBlockTransferSrcScalarPerVector, // TUNABLE
-          8,
-          true,
-          S<4, 64, 1>, // BBlockTransfer
-          S<1, 0, 2>,
-          S<1, 0, 2>,
-          2,
-          kABBlockTransferSrcScalarPerVector, // TUNABLE
-          8,
-          true,
+          GemmOpConstantsGroupedInfer::ABlockTransferDstScalarPerVector_AK1,
+          GemmOpConstantsGroupedInfer::ABlockLdsExtraM,
+          GemmOpConstantsGroupedInfer::
+              BBlockTransferThreadClusterLengths_BK0_N_BK1,
+          GemmOpConstantsGroupedInfer::BBlockTransferThreadClusterArrangeOrder,
+          GemmOpConstantsGroupedInfer::BBlockTransferSrcAccessOrder,
+          GemmOpConstantsGroupedInfer::BBlockTransferSrcVectorDim,
+          kABBlockTransferSrcScalarPerVector,
+          GemmOpConstantsGroupedInfer::BBlockTransferDstScalarPerVector_BK1,
+          GemmOpConstantsGroupedInfer::BBlockLdsExtraN,
           kAcc0BiasTransferSrcScalarPerVector,
-          S<8, 32, 1>, // B1BlockTransfer
-          S<0, 2, 1>,
-          S<0, 2, 1>,
-          1,
-          kB1BlockTransferSrcScalarPerVector, // TUNABLE
-          2,
-          false,
-          1, // CShuffleMXdlPerWavePerShuffle
+          GemmOpConstantsGroupedInfer::
+              B1BlockTransferThreadClusterLengths_BK0_N_BK1,
+          GemmOpConstantsGroupedInfer::B1BlockTransferThreadClusterArrangeOrder,
+          GemmOpConstantsGroupedInfer::B1BlockTransferSrcAccessOrder,
+          GemmOpConstantsGroupedInfer::B1BlockTransferSrcVectorDim,
+          kB1BlockTransferSrcScalarPerVector,
+          GemmOpConstantsGroupedInfer::B1BlockTransferDstScalarPerVector_BK1,
+          GemmOpConstantsGroupedInfer::B1BlockLdsExtraN,
+          GemmOpConstantsGroupedInfer::CShuffleMXdlPerWavePerShuffle,
           kCShuffleNXdlPerWavePerShuffle,
-          S<1,
-            32,
-            1,
-            8>, // CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock
-          kCShuffleBlockTransferScalarPerVector, // TUNABLE
-          MaskingSpec>; // MaskingSpecialization
+          GemmOpConstantsGroupedInfer::
+              CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock,
+          kCShuffleBlockTransferScalarPerVector,
+          MaskingSpec>;
 
   static void Run(GroupedForwardParams& param, hipStream_t stream) {
     if (param.K <= 32 && param.Kv <= 32) {

From fe37e71572f7ec9e837c11834beeb74b4e044588 Mon Sep 17 00:00:00 2001
From: Qianfeng Zhang <Qianfeng.Zhang@amd.com>
Date: Mon, 23 Oct 2023 22:33:05 +0000
Subject: [PATCH 107/641] [Performance] Add A/B0/B1 scalar_per_vector selection
 in inference

---
 .../csrc/attention/hip_fmha/ck_align_switch.h | 145 +++++++++++++
 .../hip_fmha/ck_fmha_batched_infer.h          | 192 +++++++++++++++--
 .../hip_fmha/ck_fmha_device_gemm_constants.h  |   4 +-
 .../hip_fmha/ck_fmha_grouped_infer.h          | 196 ++++++++++++++++--
 4 files changed, 493 insertions(+), 44 deletions(-)
 create mode 100644 xformers/csrc/attention/hip_fmha/ck_align_switch.h

diff --git a/xformers/csrc/attention/hip_fmha/ck_align_switch.h b/xformers/csrc/attention/hip_fmha/ck_align_switch.h
new file mode 100644
index 000000000..edd49290b
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/ck_align_switch.h
@@ -0,0 +1,145 @@
+#pragma once
+
+#include <stdexcept>
+
+// assume the maximum alignment is 8 elements
+#define ALIGN_SWITCH_1(CONST_ALIGN_MAX1, CONST_ALIGN_NAME1, LENGTH1, ...)   \
+  [&] {                                                                     \
+    if constexpr (CONST_ALIGN_MAX1 > 0) {                                   \
+      if (LENGTH1 % CONST_ALIGN_MAX1 == 0) {                                \
+        constexpr ck::index_t CONST_ALIGN_NAME1 = CONST_ALIGN_MAX1;         \
+        __VA_ARGS__();                                                      \
+      } else {                                                              \
+        if constexpr (CONST_ALIGN_MAX1 / 2 > 0) {                           \
+          if (LENGTH1 % (CONST_ALIGN_MAX1 / 2) == 0) {                      \
+            constexpr ck::index_t CONST_ALIGN_NAME1 = CONST_ALIGN_MAX1 / 2; \
+            __VA_ARGS__();                                                  \
+          } else {                                                          \
+            if constexpr (CONST_ALIGN_MAX1 / 4 > 0) {                       \
+              if (LENGTH1 % (CONST_ALIGN_MAX1 / 4) == 0) {                  \
+                constexpr ck::index_t CONST_ALIGN_NAME1 =                   \
+                    CONST_ALIGN_MAX1 / 4;                                   \
+                __VA_ARGS__();                                              \
+              } else {                                                      \
+                constexpr ck::index_t CONST_ALIGN_NAME1 = 1;                \
+                __VA_ARGS__();                                              \
+              };                                                            \
+            }                                                               \
+          };                                                                \
+        }                                                                   \
+      };                                                                    \
+    }                                                                       \
+  }()
+
+// assume the maximum alignment is 8 elements
+#define ALIGN_SWITCH_2(                                                       \
+    CONST_ALIGN_MAX1,                                                         \
+    CONST_ALIGN_NAME1,                                                        \
+    LENGTH1,                                                                  \
+    CONST_ALIGN_MAX2,                                                         \
+    CONST_ALIGN_NAME2,                                                        \
+    LENGTH2,                                                                  \
+    ...)                                                                      \
+  [&] {                                                                       \
+    if constexpr (CONST_ALIGN_MAX1 > 0) {                                     \
+      if (LENGTH1 % CONST_ALIGN_MAX1 == 0) {                                  \
+        constexpr ck::index_t CONST_ALIGN_NAME1 = CONST_ALIGN_MAX1;           \
+        ALIGN_SWITCH_1(                                                       \
+            CONST_ALIGN_MAX2, CONST_ALIGN_NAME2, LENGTH2, ##__VA_ARGS__);     \
+      } else {                                                                \
+        if constexpr (CONST_ALIGN_MAX1 / 2 > 0) {                             \
+          if (LENGTH1 % (CONST_ALIGN_MAX1 / 2) == 0) {                        \
+            constexpr ck::index_t CONST_ALIGN_NAME1 = CONST_ALIGN_MAX1 / 2;   \
+            ALIGN_SWITCH_1(                                                   \
+                CONST_ALIGN_MAX2, CONST_ALIGN_NAME2, LENGTH2, ##__VA_ARGS__); \
+          } else {                                                            \
+            if constexpr (CONST_ALIGN_MAX1 / 4 > 0) {                         \
+              if (LENGTH1 % (CONST_ALIGN_MAX1 / 4) == 0) {                    \
+                constexpr ck::index_t CONST_ALIGN_NAME1 =                     \
+                    CONST_ALIGN_MAX1 / 4;                                     \
+                ALIGN_SWITCH_1(                                               \
+                    CONST_ALIGN_MAX2,                                         \
+                    CONST_ALIGN_NAME2,                                        \
+                    LENGTH2,                                                  \
+                    ##__VA_ARGS__);                                           \
+              } else {                                                        \
+                constexpr ck::index_t CONST_ALIGN_NAME1 = 1;                  \
+                ALIGN_SWITCH_1(                                               \
+                    CONST_ALIGN_MAX2,                                         \
+                    CONST_ALIGN_NAME2,                                        \
+                    LENGTH2,                                                  \
+                    ##__VA_ARGS__);                                           \
+              };                                                              \
+            }                                                                 \
+          };                                                                  \
+        }                                                                     \
+      };                                                                      \
+    }                                                                         \
+  }()
+
+// assume the maximum alignment is 8 elements
+#define ALIGN_SWITCH_3(                                                     \
+    CONST_ALIGN_MAX1,                                                       \
+    CONST_ALIGN_NAME1,                                                      \
+    LENGTH1,                                                                \
+    CONST_ALIGN_MAX2,                                                       \
+    CONST_ALIGN_NAME2,                                                      \
+    LENGTH2,                                                                \
+    CONST_ALIGN_MAX3,                                                       \
+    CONST_ALIGN_NAME3,                                                      \
+    LENGTH3,                                                                \
+    ...)                                                                    \
+  [&] {                                                                     \
+    if constexpr (CONST_ALIGN_MAX1 > 0) {                                   \
+      if (LENGTH1 % CONST_ALIGN_MAX1 == 0) {                                \
+        constexpr ck::index_t CONST_ALIGN_NAME1 = CONST_ALIGN_MAX1;         \
+        ALIGN_SWITCH_2(                                                     \
+            CONST_ALIGN_MAX2,                                               \
+            CONST_ALIGN_NAME2,                                              \
+            LENGTH2,                                                        \
+            CONST_ALIGN_MAX3,                                               \
+            CONST_ALIGN_NAME3,                                              \
+            LENGTH3,                                                        \
+            ##__VA_ARGS__);                                                 \
+      } else {                                                              \
+        if constexpr (CONST_ALIGN_MAX1 / 2 > 0) {                           \
+          if (LENGTH1 % (CONST_ALIGN_MAX1 / 2) == 0) {                      \
+            constexpr ck::index_t CONST_ALIGN_NAME1 = CONST_ALIGN_MAX1 / 2; \
+            ALIGN_SWITCH_2(                                                 \
+                CONST_ALIGN_MAX2,                                           \
+                CONST_ALIGN_NAME2,                                          \
+                LENGTH2,                                                    \
+                CONST_ALIGN_MAX3,                                           \
+                CONST_ALIGN_NAME3,                                          \
+                LENGTH3,                                                    \
+                ##__VA_ARGS__);                                             \
+          } else {                                                          \
+            if constexpr (CONST_ALIGN_MAX1 / 4 > 0) {                       \
+              if (LENGTH1 % (CONST_ALIGN_MAX1 / 4) == 0) {                  \
+                constexpr ck::index_t CONST_ALIGN_NAME1 =                   \
+                    CONST_ALIGN_MAX1 / 4;                                   \
+                ALIGN_SWITCH_2(                                             \
+                    CONST_ALIGN_MAX2,                                       \
+                    CONST_ALIGN_NAME2,                                      \
+                    LENGTH2,                                                \
+                    CONST_ALIGN_MAX3,                                       \
+                    CONST_ALIGN_NAME3,                                      \
+                    LENGTH3,                                                \
+                    ##__VA_ARGS__);                                         \
+              } else {                                                      \
+                constexpr ck::index_t CONST_ALIGN_NAME1 = 1;                \
+                ALIGN_SWITCH_2(                                             \
+                    CONST_ALIGN_MAX2,                                       \
+                    CONST_ALIGN_NAME2,                                      \
+                    LENGTH2,                                                \
+                    CONST_ALIGN_MAX3,                                       \
+                    CONST_ALIGN_NAME3,                                      \
+                    LENGTH3,                                                \
+                    ##__VA_ARGS__);                                         \
+              };                                                            \
+            }                                                               \
+          };                                                                \
+        }                                                                   \
+      };                                                                    \
+    }                                                                       \
+  }()
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_infer.h b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_infer.h
index c73108dc9..08230212e 100644
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_infer.h
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_infer.h
@@ -7,8 +7,11 @@
 #include <ck/tensor_operation/gpu/device/gemm_specialization.hpp>
 #include <ck/tensor_operation/gpu/device/tensor_specialization.hpp>
 #include <ck/tensor_operation/gpu/element/element_wise_operation.hpp>
+#include <ck/utility/math.hpp>
+#include <ck/utility/number.hpp>
 #include "ck/tensor_operation/gpu/device/impl/device_batched_mha_infer_xdl_cshuffle.hpp"
 
+#include "ck_align_switch.h"
 #include "ck_fmha_device_gemm_constants.h"
 #include "ck_fmha_op_helper.h"
 #include "ck_fmha_params.h"
@@ -42,15 +45,15 @@ struct batched_infer_masktype_attnbias_dispatched {
       static_cast<ck::tensor_operation::device::MaskingSpecialization>(
           custom_mask_type);
 
-  static constexpr ck::index_t kABBlockTransferSrcScalarPerVector = 1;
-  static constexpr ck::index_t kB1BlockTransferSrcScalarPerVector = 1;
-  static constexpr ck::index_t kCShuffleBlockTransferScalarPerVector = 1;
   static constexpr ck::index_t kAcc0BiasTransferSrcScalarPerVector = 1;
 
   template <
       ck::index_t kGemm1NPerBlock,
       ck::index_t kGemm1NXdlPerWave,
-      ck::index_t kCShuffleNXdlPerWavePerShuffle>
+      ck::index_t kCShuffleNXdlPerWavePerShuffle,
+      ck::index_t kABBlockTransferSrcScalarPerVector,
+      ck::index_t kB1BlockTransferSrcScalarPerVector,
+      ck::index_t kCShuffleBlockTransferScalarPerVector>
   using DeviceOpInstanceTemp = ck::tensor_operation::device::
       DeviceBatchedMultiheadAttentionInfer_Xdl_CShuffle<
           GemmOpConstantsCommon::NumDimG,
@@ -123,41 +126,190 @@ struct batched_infer_masktype_attnbias_dispatched {
           kCShuffleBlockTransferScalarPerVector,
           MaskingSpec>;
 
+  static constexpr auto I2 = ck::Number<2>{};
+  static constexpr auto I3 = ck::Number<3>{};
+
   static void Run(BatchedForwardParams& param, hipStream_t stream) {
+    using ck::math::min;
+
     if (param.K <= 32 && param.Kv <= 32) {
       constexpr ck::index_t kGemm1NPerBlock = 32;
       constexpr ck::index_t kGemm1NXdlPerWave = 1;
       constexpr ck::index_t kCShuffleNXdlPerWavePerShuffle = 1;
 
-      using DeviceOpInstance = DeviceOpInstanceTemp<
-          kGemm1NPerBlock,
-          kGemm1NXdlPerWave,
-          kCShuffleNXdlPerWavePerShuffle>;
+      constexpr ck::index_t thread_slice_length_ak1 =
+          GemmOpConstantsBatchedInfer::AK1 /
+          GemmOpConstantsBatchedInfer::
+              ABlockTransferThreadClusterLengths_AK0_M_AK1::At(I2);
+      constexpr ck::index_t thread_slice_length_bk1 =
+          GemmOpConstantsBatchedInfer::BK1 /
+          GemmOpConstantsBatchedInfer::
+              BBlockTransferThreadClusterLengths_BK0_N_BK1::At(I2);
+
+      static_assert(
+          thread_slice_length_ak1 == thread_slice_length_bk1,
+          "ABlockTransfer and BBlockTransfer should use completely same K1 sizes and ThreadClusterLengths!");
+
+      constexpr ck::index_t kABBlockTransferSrcScalarPerVector_max =
+          min(4, thread_slice_length_ak1);
+
+      constexpr ck::index_t thread_slice_length_b1k1 =
+          GemmOpConstantsBatchedInfer::B1K1 /
+          GemmOpConstantsBatchedInfer::
+              B1BlockTransferThreadClusterLengths_BK0_N_BK1::At(I2);
+      constexpr ck::index_t kB1BlockTransferSrcScalarPerVector_max =
+          min(4, thread_slice_length_b1k1);
+
+      constexpr ck::index_t thread_slice_length_cshuflle_n =
+          (kCShuffleNXdlPerWavePerShuffle * kGemm1NPerBlock /
+           kGemm1NXdlPerWave) /
+          GemmOpConstantsBatchedInfer::
+              CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock ::
+                  At(I3);
 
-      RunWithDeviceOp<DeviceOpInstance>(param, stream);
+      constexpr ck::index_t kCShuffleBlockTransferScalarPerVector_max =
+          min(4, thread_slice_length_cshuflle_n);
+
+      ALIGN_SWITCH_3(
+          kABBlockTransferSrcScalarPerVector_max,
+          kABBlockTransferSrcScalarPerVector,
+          param.K,
+          kB1BlockTransferSrcScalarPerVector_max,
+          kB1BlockTransferSrcScalarPerVector,
+          param.Kv,
+          kCShuffleBlockTransferScalarPerVector_max,
+          kCShuffleBlockTransferScalarPerVector,
+          param.Kv,
+          [&] {
+            using DeviceOpInstance = DeviceOpInstanceTemp<
+                kGemm1NPerBlock,
+                kGemm1NXdlPerWave,
+                kCShuffleNXdlPerWavePerShuffle,
+                kABBlockTransferSrcScalarPerVector,
+                kB1BlockTransferSrcScalarPerVector,
+                kCShuffleBlockTransferScalarPerVector>;
+
+            RunWithDeviceOp<DeviceOpInstance>(param, stream);
+          });
     } else if (param.K <= 64 && param.Kv <= 64) {
       constexpr ck::index_t kGemm1NPerBlock = 64;
       constexpr ck::index_t kGemm1NXdlPerWave = 2;
       constexpr ck::index_t kCShuffleNXdlPerWavePerShuffle = 2;
 
-      using DeviceOpInstance = DeviceOpInstanceTemp<
-          kGemm1NPerBlock,
-          kGemm1NXdlPerWave,
-          kCShuffleNXdlPerWavePerShuffle>;
+      constexpr ck::index_t thread_slice_length_ak1 =
+          GemmOpConstantsBatchedInfer::AK1 /
+          GemmOpConstantsBatchedInfer::
+              ABlockTransferThreadClusterLengths_AK0_M_AK1::At(I2);
+      constexpr ck::index_t thread_slice_length_bk1 =
+          GemmOpConstantsBatchedInfer::BK1 /
+          GemmOpConstantsBatchedInfer::
+              BBlockTransferThreadClusterLengths_BK0_N_BK1::At(I2);
+
+      static_assert(
+          thread_slice_length_ak1 == thread_slice_length_bk1,
+          "ABlockTransfer and BBlockTransfer should use completely same K1 sizes and ThreadClusterLengths!");
+
+      constexpr ck::index_t kABBlockTransferSrcScalarPerVector_max =
+          min(4, thread_slice_length_ak1);
+
+      constexpr ck::index_t thread_slice_length_b1k1 =
+          GemmOpConstantsBatchedInfer::B1K1 /
+          GemmOpConstantsBatchedInfer::
+              B1BlockTransferThreadClusterLengths_BK0_N_BK1::At(I2);
+      constexpr ck::index_t kB1BlockTransferSrcScalarPerVector_max =
+          min(4, thread_slice_length_b1k1);
+
+      constexpr ck::index_t thread_slice_length_cshuflle_n =
+          (kCShuffleNXdlPerWavePerShuffle * kGemm1NPerBlock /
+           kGemm1NXdlPerWave) /
+          GemmOpConstantsBatchedInfer::
+              CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock ::
+                  At(I3);
 
-      RunWithDeviceOp<DeviceOpInstance>(param, stream);
+      constexpr ck::index_t kCShuffleBlockTransferScalarPerVector_max =
+          min(4, thread_slice_length_cshuflle_n);
+
+      ALIGN_SWITCH_3(
+          kABBlockTransferSrcScalarPerVector_max,
+          kABBlockTransferSrcScalarPerVector,
+          param.K,
+          kB1BlockTransferSrcScalarPerVector_max,
+          kB1BlockTransferSrcScalarPerVector,
+          param.Kv,
+          kCShuffleBlockTransferScalarPerVector_max,
+          kCShuffleBlockTransferScalarPerVector,
+          param.Kv,
+          [&] {
+            using DeviceOpInstance = DeviceOpInstanceTemp<
+                kGemm1NPerBlock,
+                kGemm1NXdlPerWave,
+                kCShuffleNXdlPerWavePerShuffle,
+                kABBlockTransferSrcScalarPerVector,
+                kB1BlockTransferSrcScalarPerVector,
+                kCShuffleBlockTransferScalarPerVector>;
+
+            RunWithDeviceOp<DeviceOpInstance>(param, stream);
+          });
     } else {
       constexpr ck::index_t kGemm1NPerBlock = 128;
       constexpr ck::index_t kGemm1NXdlPerWave = 4;
       constexpr ck::index_t kCShuffleNXdlPerWavePerShuffle = 4;
 
-      using DeviceOpInstance = DeviceOpInstanceTemp<
-          kGemm1NPerBlock,
-          kGemm1NXdlPerWave,
-          kCShuffleNXdlPerWavePerShuffle>;
+      constexpr ck::index_t thread_slice_length_ak1 =
+          GemmOpConstantsBatchedInfer::AK1 /
+          GemmOpConstantsBatchedInfer::
+              ABlockTransferThreadClusterLengths_AK0_M_AK1::At(I2);
+      constexpr ck::index_t thread_slice_length_bk1 =
+          GemmOpConstantsBatchedInfer::BK1 /
+          GemmOpConstantsBatchedInfer::
+              BBlockTransferThreadClusterLengths_BK0_N_BK1::At(I2);
 
-      RunWithDeviceOp<DeviceOpInstance>(param, stream);
-    };
+      static_assert(
+          thread_slice_length_ak1 == thread_slice_length_bk1,
+          "ABlockTransfer and BBlockTransfer should use completely same K1 sizes and ThreadClusterLengths!");
+
+      constexpr ck::index_t kABBlockTransferSrcScalarPerVector_max =
+          min(4, thread_slice_length_ak1);
+
+      constexpr ck::index_t thread_slice_length_b1k1 =
+          GemmOpConstantsBatchedInfer::B1K1 /
+          GemmOpConstantsBatchedInfer::
+              B1BlockTransferThreadClusterLengths_BK0_N_BK1::At(I2);
+      constexpr ck::index_t kB1BlockTransferSrcScalarPerVector_max =
+          min(4, thread_slice_length_b1k1);
+
+      constexpr ck::index_t thread_slice_length_cshuflle_n =
+          (kCShuffleNXdlPerWavePerShuffle * kGemm1NPerBlock /
+           kGemm1NXdlPerWave) /
+          GemmOpConstantsBatchedInfer::
+              CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock ::
+                  At(I3);
+
+      constexpr ck::index_t kCShuffleBlockTransferScalarPerVector_max =
+          min(4, thread_slice_length_cshuflle_n);
+
+      ALIGN_SWITCH_3(
+          kABBlockTransferSrcScalarPerVector_max,
+          kABBlockTransferSrcScalarPerVector,
+          param.K,
+          kB1BlockTransferSrcScalarPerVector_max,
+          kB1BlockTransferSrcScalarPerVector,
+          param.Kv,
+          kCShuffleBlockTransferScalarPerVector_max,
+          kCShuffleBlockTransferScalarPerVector,
+          param.Kv,
+          [&] {
+            using DeviceOpInstance = DeviceOpInstanceTemp<
+                kGemm1NPerBlock,
+                kGemm1NXdlPerWave,
+                kCShuffleNXdlPerWavePerShuffle,
+                kABBlockTransferSrcScalarPerVector,
+                kB1BlockTransferSrcScalarPerVector,
+                kCShuffleBlockTransferScalarPerVector>;
+
+            RunWithDeviceOp<DeviceOpInstance>(param, stream);
+          });
+    }
   };
 
   template <typename DeviceOpInstance>
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_device_gemm_constants.h b/xformers/csrc/attention/hip_fmha/ck_fmha_device_gemm_constants.h
index 2a14f1300..eefb60992 100644
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_device_gemm_constants.h
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_device_gemm_constants.h
@@ -51,7 +51,7 @@ struct GemmOpConstantsBatchedInfer {
   static constexpr ck::index_t BBlockTransferSrcVectorDim = 2;
   // static constexpr ck::index_t BBlockTransferSrcScalarPerVector;
   static constexpr ck::index_t BBlockTransferDstScalarPerVector_BK1 = 8;
-  static consexpr bool BBlockLdsExtraN = true;
+  static constexpr bool BBlockLdsExtraN = true;
   // static constexpr ck::index_t Acc0BiasTransferSrcScalarPerVector;
   using B1BlockTransferThreadClusterLengths_BK0_N_BK1 = S<16, 16, 1>;
   using B1BlockTransferThreadClusterArrangeOrder = S<0, 2, 1>;
@@ -60,7 +60,7 @@ struct GemmOpConstantsBatchedInfer {
   // static constexpr ck::index_t B1BlockTransferSrcScalarPerVector;
   static constexpr ck::index_t B1BlockTransferDstScalarPerVector_BK1 = 2;
   static constexpr bool B1BlockLdsExtraN = false;
-  static ck::index_t CShuffleMXdlPerWavePerShuffle = 1;
+  static constexpr ck::index_t CShuffleMXdlPerWavePerShuffle = 1;
   // static constexpr ck::index_t CShuffleNXdlPerWavePerShuffle;
   using CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock =
       S<1, 32, 1, 8>;
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_infer.h b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_infer.h
index b6aa53292..04af760a0 100644
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_infer.h
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_infer.h
@@ -7,9 +7,12 @@
 #include <ck/tensor_operation/gpu/device/gemm_specialization.hpp>
 #include <ck/tensor_operation/gpu/device/tensor_specialization.hpp>
 #include <ck/tensor_operation/gpu/element/element_wise_operation.hpp>
+#include <ck/utility/math.hpp>
+#include <ck/utility/number.hpp>
 #include <ck/utility/sequence.hpp>
 #include "ck/tensor_operation/gpu/device/impl/device_grouped_mha_infer_xdl_cshuffle.hpp"
 
+#include "ck_align_switch.h"
 #include "ck_fmha_device_gemm_constants.h"
 #include "ck_fmha_op_helper.h"
 #include "ck_fmha_params.h"
@@ -43,15 +46,15 @@ struct grouped_infer_masktype_attnbias_dispatched {
       static_cast<ck::tensor_operation::device::MaskingSpecialization>(
           custom_mask_type);
 
-  static constexpr ck::index_t kABBlockTransferSrcScalarPerVector = 1;
-  static constexpr ck::index_t kB1BlockTransferSrcScalarPerVector = 1;
-  static constexpr ck::index_t kCShuffleBlockTransferScalarPerVector = 1;
   static constexpr ck::index_t kAcc0BiasTransferSrcScalarPerVector = 1;
 
   template <
       ck::index_t kGemm1NPerBlock,
       ck::index_t kGemm1NXdlPerWave,
-      ck::index_t kCShuffleNXdlPerWavePerShuffle>
+      ck::index_t kCShuffleNXdlPerWavePerShuffle,
+      ck::index_t kABBlockTransferSrcScalarPerVector,
+      ck::index_t kB1BlockTransferSrcScalarPerVector,
+      ck::index_t kCShuffleBlockTransferScalarPerVector>
   using DeviceOpInstanceTemp = ck::tensor_operation::device::
       DeviceGroupedMultiheadAttentionInfer_Xdl_CShuffle<
           GemmOpConstantsCommon::NumDimG,
@@ -124,40 +127,189 @@ struct grouped_infer_masktype_attnbias_dispatched {
           kCShuffleBlockTransferScalarPerVector,
           MaskingSpec>;
 
+  static constexpr auto I2 = ck::Number<2>{};
+  static constexpr auto I3 = ck::Number<3>{};
+
   static void Run(GroupedForwardParams& param, hipStream_t stream) {
+    using ck::math::min;
+
     if (param.K <= 32 && param.Kv <= 32) {
       constexpr ck::index_t kGemm1NPerBlock = 32;
       constexpr ck::index_t kGemm1NXdlPerWave = 1;
       constexpr ck::index_t kCShuffleNXdlPerWavePerShuffle = 1;
 
-      using DeviceOpInstance = DeviceOpInstanceTemp<
-          kGemm1NPerBlock,
-          kGemm1NXdlPerWave,
-          kCShuffleNXdlPerWavePerShuffle>;
-
-      RunWithDeviceOp<DeviceOpInstance>(param, stream);
+      constexpr ck::index_t thread_slice_length_ak1 =
+          GemmOpConstantsBatchedInfer::AK1 /
+          GemmOpConstantsBatchedInfer::
+              ABlockTransferThreadClusterLengths_AK0_M_AK1::At(I2);
+      constexpr ck::index_t thread_slice_length_bk1 =
+          GemmOpConstantsBatchedInfer::BK1 /
+          GemmOpConstantsBatchedInfer::
+              BBlockTransferThreadClusterLengths_BK0_N_BK1::At(I2);
+
+      static_assert(
+          thread_slice_length_ak1 == thread_slice_length_bk1,
+          "ABlockTransfer and BBlockTransfer should use completely same K1 sizes and ThreadClusterLengths!");
+
+      constexpr ck::index_t kABBlockTransferSrcScalarPerVector_max =
+          min(4, thread_slice_length_ak1);
+
+      constexpr ck::index_t thread_slice_length_b1k1 =
+          GemmOpConstantsBatchedInfer::B1K1 /
+          GemmOpConstantsBatchedInfer::
+              B1BlockTransferThreadClusterLengths_BK0_N_BK1::At(I2);
+      constexpr ck::index_t kB1BlockTransferSrcScalarPerVector_max =
+          min(4, thread_slice_length_b1k1);
+
+      constexpr ck::index_t thread_slice_length_cshuflle_n =
+          (kCShuffleNXdlPerWavePerShuffle * kGemm1NPerBlock /
+           kGemm1NXdlPerWave) /
+          GemmOpConstantsBatchedInfer::
+              CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock ::
+                  At(I3);
+
+      constexpr ck::index_t kCShuffleBlockTransferScalarPerVector_max =
+          min(4, thread_slice_length_cshuflle_n);
+
+      ALIGN_SWITCH_3(
+          kABBlockTransferSrcScalarPerVector_max,
+          kABBlockTransferSrcScalarPerVector,
+          param.K,
+          kB1BlockTransferSrcScalarPerVector_max,
+          kB1BlockTransferSrcScalarPerVector,
+          param.Kv,
+          kCShuffleBlockTransferScalarPerVector_max,
+          kCShuffleBlockTransferScalarPerVector,
+          param.Kv,
+          [&] {
+            using DeviceOpInstance = DeviceOpInstanceTemp<
+                kGemm1NPerBlock,
+                kGemm1NXdlPerWave,
+                kCShuffleNXdlPerWavePerShuffle,
+                kABBlockTransferSrcScalarPerVector,
+                kB1BlockTransferSrcScalarPerVector,
+                kCShuffleBlockTransferScalarPerVector>;
+
+            RunWithDeviceOp<DeviceOpInstance>(param, stream);
+          });
     } else if (param.K <= 64 && param.Kv <= 64) {
       constexpr ck::index_t kGemm1NPerBlock = 64;
       constexpr ck::index_t kGemm1NXdlPerWave = 2;
       constexpr ck::index_t kCShuffleNXdlPerWavePerShuffle = 2;
 
-      using DeviceOpInstance = DeviceOpInstanceTemp<
-          kGemm1NPerBlock,
-          kGemm1NXdlPerWave,
-          kCShuffleNXdlPerWavePerShuffle>;
-
-      RunWithDeviceOp<DeviceOpInstance>(param, stream);
+      constexpr ck::index_t thread_slice_length_ak1 =
+          GemmOpConstantsBatchedInfer::AK1 /
+          GemmOpConstantsBatchedInfer::
+              ABlockTransferThreadClusterLengths_AK0_M_AK1::At(I2);
+      constexpr ck::index_t thread_slice_length_bk1 =
+          GemmOpConstantsBatchedInfer::BK1 /
+          GemmOpConstantsBatchedInfer::
+              BBlockTransferThreadClusterLengths_BK0_N_BK1::At(I2);
+
+      static_assert(
+          thread_slice_length_ak1 == thread_slice_length_bk1,
+          "ABlockTransfer and BBlockTransfer should use completely same K1 sizes and ThreadClusterLengths!");
+
+      constexpr ck::index_t kABBlockTransferSrcScalarPerVector_max =
+          min(4, thread_slice_length_ak1);
+
+      constexpr ck::index_t thread_slice_length_b1k1 =
+          GemmOpConstantsBatchedInfer::B1K1 /
+          GemmOpConstantsBatchedInfer::
+              B1BlockTransferThreadClusterLengths_BK0_N_BK1::At(I2);
+      constexpr ck::index_t kB1BlockTransferSrcScalarPerVector_max =
+          min(4, thread_slice_length_b1k1);
+
+      constexpr ck::index_t thread_slice_length_cshuflle_n =
+          (kCShuffleNXdlPerWavePerShuffle * kGemm1NPerBlock /
+           kGemm1NXdlPerWave) /
+          GemmOpConstantsBatchedInfer::
+              CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock ::
+                  At(I3);
+
+      constexpr ck::index_t kCShuffleBlockTransferScalarPerVector_max =
+          min(4, thread_slice_length_cshuflle_n);
+
+      ALIGN_SWITCH_3(
+          kABBlockTransferSrcScalarPerVector_max,
+          kABBlockTransferSrcScalarPerVector,
+          param.K,
+          kB1BlockTransferSrcScalarPerVector_max,
+          kB1BlockTransferSrcScalarPerVector,
+          param.Kv,
+          kCShuffleBlockTransferScalarPerVector_max,
+          kCShuffleBlockTransferScalarPerVector,
+          param.Kv,
+          [&] {
+            using DeviceOpInstance = DeviceOpInstanceTemp<
+                kGemm1NPerBlock,
+                kGemm1NXdlPerWave,
+                kCShuffleNXdlPerWavePerShuffle,
+                kABBlockTransferSrcScalarPerVector,
+                kB1BlockTransferSrcScalarPerVector,
+                kCShuffleBlockTransferScalarPerVector>;
+
+            RunWithDeviceOp<DeviceOpInstance>(param, stream);
+          });
     } else {
       constexpr ck::index_t kGemm1NPerBlock = 128;
       constexpr ck::index_t kGemm1NXdlPerWave = 4;
       constexpr ck::index_t kCShuffleNXdlPerWavePerShuffle = 4;
 
-      using DeviceOpInstance = DeviceOpInstanceTemp<
-          kGemm1NPerBlock,
-          kGemm1NXdlPerWave,
-          kCShuffleNXdlPerWavePerShuffle>;
-
-      RunWithDeviceOp<DeviceOpInstance>(param, stream);
+      constexpr ck::index_t thread_slice_length_ak1 =
+          GemmOpConstantsBatchedInfer::AK1 /
+          GemmOpConstantsBatchedInfer::
+              ABlockTransferThreadClusterLengths_AK0_M_AK1::At(I2);
+      constexpr ck::index_t thread_slice_length_bk1 =
+          GemmOpConstantsBatchedInfer::BK1 /
+          GemmOpConstantsBatchedInfer::
+              BBlockTransferThreadClusterLengths_BK0_N_BK1::At(I2);
+
+      static_assert(
+          thread_slice_length_ak1 == thread_slice_length_bk1,
+          "ABlockTransfer and BBlockTransfer should use completely same K1 sizes and ThreadClusterLengths!");
+
+      constexpr ck::index_t kABBlockTransferSrcScalarPerVector_max =
+          min(4, thread_slice_length_ak1);
+
+      constexpr ck::index_t thread_slice_length_b1k1 =
+          GemmOpConstantsBatchedInfer::B1K1 /
+          GemmOpConstantsBatchedInfer::
+              B1BlockTransferThreadClusterLengths_BK0_N_BK1::At(I2);
+      constexpr ck::index_t kB1BlockTransferSrcScalarPerVector_max =
+          min(4, thread_slice_length_b1k1);
+
+      constexpr ck::index_t thread_slice_length_cshuflle_n =
+          (kCShuffleNXdlPerWavePerShuffle * kGemm1NPerBlock /
+           kGemm1NXdlPerWave) /
+          GemmOpConstantsBatchedInfer::
+              CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock ::
+                  At(I3);
+
+      constexpr ck::index_t kCShuffleBlockTransferScalarPerVector_max =
+          min(4, thread_slice_length_cshuflle_n);
+
+      ALIGN_SWITCH_3(
+          kABBlockTransferSrcScalarPerVector_max,
+          kABBlockTransferSrcScalarPerVector,
+          param.K,
+          kB1BlockTransferSrcScalarPerVector_max,
+          kB1BlockTransferSrcScalarPerVector,
+          param.Kv,
+          kCShuffleBlockTransferScalarPerVector_max,
+          kCShuffleBlockTransferScalarPerVector,
+          param.Kv,
+          [&] {
+            using DeviceOpInstance = DeviceOpInstanceTemp<
+                kGemm1NPerBlock,
+                kGemm1NXdlPerWave,
+                kCShuffleNXdlPerWavePerShuffle,
+                kABBlockTransferSrcScalarPerVector,
+                kB1BlockTransferSrcScalarPerVector,
+                kCShuffleBlockTransferScalarPerVector>;
+
+            RunWithDeviceOp<DeviceOpInstance>(param, stream);
+          });
     };
   };
 

From fbe7634e7797e7081905f846f32735808edd1d42 Mon Sep 17 00:00:00 2001
From: Qianfeng Zhang <Qianfeng.Zhang@amd.com>
Date: Mon, 23 Oct 2023 22:41:04 +0000
Subject: [PATCH 108/641] Rename ck_static_switch.h to ck_bool_switch.h

---
 .../attention/hip_fmha/{ck_static_switch.h => ck_bool_switch.h} | 0
 .../csrc/attention/hip_fmha/ck_fmha_batched_backward_bp16.cpp   | 2 +-
 .../csrc/attention/hip_fmha/ck_fmha_batched_backward_fp16.cpp   | 2 +-
 .../csrc/attention/hip_fmha/ck_fmha_batched_forward_bp16.cpp    | 2 +-
 .../csrc/attention/hip_fmha/ck_fmha_batched_forward_fp16.cpp    | 2 +-
 xformers/csrc/attention/hip_fmha/ck_fmha_batched_infer_bp16.cpp | 2 +-
 xformers/csrc/attention/hip_fmha/ck_fmha_batched_infer_fp16.cpp | 2 +-
 .../csrc/attention/hip_fmha/ck_fmha_grouped_backward_bp16.cpp   | 2 +-
 .../csrc/attention/hip_fmha/ck_fmha_grouped_backward_fp16.cpp   | 2 +-
 .../csrc/attention/hip_fmha/ck_fmha_grouped_forward_bp16.cpp    | 2 +-
 .../csrc/attention/hip_fmha/ck_fmha_grouped_forward_fp16.cpp    | 2 +-
 xformers/csrc/attention/hip_fmha/ck_fmha_grouped_infer_bp16.cpp | 2 +-
 xformers/csrc/attention/hip_fmha/ck_fmha_grouped_infer_fp16.cpp | 2 +-
 13 files changed, 12 insertions(+), 12 deletions(-)
 rename xformers/csrc/attention/hip_fmha/{ck_static_switch.h => ck_bool_switch.h} (100%)

diff --git a/xformers/csrc/attention/hip_fmha/ck_static_switch.h b/xformers/csrc/attention/hip_fmha/ck_bool_switch.h
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/ck_static_switch.h
rename to xformers/csrc/attention/hip_fmha/ck_bool_switch.h
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_backward_bp16.cpp b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_backward_bp16.cpp
index 5b6ec3c2b..81615faf9 100644
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_backward_bp16.cpp
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_backward_bp16.cpp
@@ -2,7 +2,7 @@
 #include <stdexcept>
 
 #include "ck_fmha_batched_backward.h"
-#include "ck_static_switch.h"
+#include "ck_bool_switch.h"
 
 extern template struct batched_backward_masktype_attnbias_dispatched<
     ck::bhalf_t,
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_backward_fp16.cpp b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_backward_fp16.cpp
index a6f09ea54..3527beba7 100644
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_backward_fp16.cpp
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_backward_fp16.cpp
@@ -2,7 +2,7 @@
 #include <stdexcept>
 
 #include "ck_fmha_batched_backward.h"
-#include "ck_static_switch.h"
+#include "ck_bool_switch.h"
 
 extern template struct batched_backward_masktype_attnbias_dispatched<
     ck::half_t,
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_forward_bp16.cpp b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_forward_bp16.cpp
index 6deae7724..865c2de58 100644
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_forward_bp16.cpp
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_forward_bp16.cpp
@@ -2,7 +2,7 @@
 #include <stdexcept>
 
 #include "ck_fmha_batched_forward.h"
-#include "ck_static_switch.h"
+#include "ck_bool_switch.h"
 
 extern template struct batched_forward_masktype_attnbias_dispatched<
     ck::bhalf_t,
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_forward_fp16.cpp b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_forward_fp16.cpp
index 7e4b9cb8c..fe8371bb4 100644
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_forward_fp16.cpp
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_forward_fp16.cpp
@@ -2,7 +2,7 @@
 #include <stdexcept>
 
 #include "ck_fmha_batched_forward.h"
-#include "ck_static_switch.h"
+#include "ck_bool_switch.h"
 
 extern template struct batched_forward_masktype_attnbias_dispatched<
     ck::half_t,
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_infer_bp16.cpp b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_infer_bp16.cpp
index 5d44a4e99..095487f92 100644
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_infer_bp16.cpp
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_infer_bp16.cpp
@@ -2,7 +2,7 @@
 #include <stdexcept>
 
 #include "ck_fmha_batched_infer.h"
-#include "ck_static_switch.h"
+#include "ck_bool_switch.h"
 
 extern template struct batched_infer_masktype_attnbias_dispatched<
     ck::bhalf_t,
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_infer_fp16.cpp b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_infer_fp16.cpp
index fa0bdd42d..8e5b01fa0 100644
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_infer_fp16.cpp
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_infer_fp16.cpp
@@ -2,7 +2,7 @@
 #include <stdexcept>
 
 #include "ck_fmha_batched_infer.h"
-#include "ck_static_switch.h"
+#include "ck_bool_switch.h"
 
 extern template struct batched_infer_masktype_attnbias_dispatched<
     ck::half_t,
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_backward_bp16.cpp b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_backward_bp16.cpp
index 2d18eefe6..709a4328f 100644
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_backward_bp16.cpp
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_backward_bp16.cpp
@@ -2,7 +2,7 @@
 #include <stdexcept>
 
 #include "ck_fmha_grouped_backward.h"
-#include "ck_static_switch.h"
+#include "ck_bool_switch.h"
 
 extern template struct grouped_backward_masktype_attnbias_dispatched<
     ck::bhalf_t,
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_backward_fp16.cpp b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_backward_fp16.cpp
index e06a7dc58..2885df9b5 100644
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_backward_fp16.cpp
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_backward_fp16.cpp
@@ -2,7 +2,7 @@
 #include <stdexcept>
 
 #include "ck_fmha_grouped_backward.h"
-#include "ck_static_switch.h"
+#include "ck_bool_switch.h"
 
 extern template struct grouped_backward_masktype_attnbias_dispatched<
     ck::half_t,
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_forward_bp16.cpp b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_forward_bp16.cpp
index 04769122d..b4b10a60a 100644
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_forward_bp16.cpp
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_forward_bp16.cpp
@@ -2,7 +2,7 @@
 #include <stdexcept>
 
 #include "ck_fmha_grouped_forward.h"
-#include "ck_static_switch.h"
+#include "ck_bool_switch.h"
 
 extern template struct grouped_forward_masktype_attnbias_dispatched<
     ck::bhalf_t,
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_forward_fp16.cpp b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_forward_fp16.cpp
index 9c059d9b7..7c7ef74ad 100644
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_forward_fp16.cpp
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_forward_fp16.cpp
@@ -2,7 +2,7 @@
 #include <stdexcept>
 
 #include "ck_fmha_grouped_forward.h"
-#include "ck_static_switch.h"
+#include "ck_bool_switch.h"
 
 extern template struct grouped_forward_masktype_attnbias_dispatched<
     ck::half_t,
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_infer_bp16.cpp b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_infer_bp16.cpp
index 796372951..4310d4f39 100644
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_infer_bp16.cpp
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_infer_bp16.cpp
@@ -2,7 +2,7 @@
 #include <stdexcept>
 
 #include "ck_fmha_grouped_infer.h"
-#include "ck_static_switch.h"
+#include "ck_bool_switch.h"
 
 extern template struct grouped_infer_masktype_attnbias_dispatched<
     ck::bhalf_t,
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_infer_fp16.cpp b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_infer_fp16.cpp
index ffc89ed53..9a015601f 100644
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_infer_fp16.cpp
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_infer_fp16.cpp
@@ -2,7 +2,7 @@
 #include <stdexcept>
 
 #include "ck_fmha_grouped_infer.h"
-#include "ck_static_switch.h"
+#include "ck_bool_switch.h"
 
 extern template struct grouped_infer_masktype_attnbias_dispatched<
     ck::half_t,

From f719301f7739fe7d704efce821a64f8b0838824d Mon Sep 17 00:00:00 2001
From: Qianfeng Zhang <Qianfeng.Zhang@amd.com>
Date: Tue, 24 Oct 2023 09:30:32 +0000
Subject: [PATCH 109/641] Fix in grouped_infer

---
 .../hip_fmha/ck_fmha_grouped_infer.h          | 42 +++++++++----------
 1 file changed, 21 insertions(+), 21 deletions(-)

diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_infer.h b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_infer.h
index 04af760a0..e30d4c06a 100644
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_infer.h
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_infer.h
@@ -139,12 +139,12 @@ struct grouped_infer_masktype_attnbias_dispatched {
       constexpr ck::index_t kCShuffleNXdlPerWavePerShuffle = 1;
 
       constexpr ck::index_t thread_slice_length_ak1 =
-          GemmOpConstantsBatchedInfer::AK1 /
-          GemmOpConstantsBatchedInfer::
+          GemmOpConstantsGroupedInfer::AK1 /
+          GemmOpConstantsGroupedInfer::
               ABlockTransferThreadClusterLengths_AK0_M_AK1::At(I2);
       constexpr ck::index_t thread_slice_length_bk1 =
-          GemmOpConstantsBatchedInfer::BK1 /
-          GemmOpConstantsBatchedInfer::
+          GemmOpConstantsGroupedInfer::BK1 /
+          GemmOpConstantsGroupedInfer::
               BBlockTransferThreadClusterLengths_BK0_N_BK1::At(I2);
 
       static_assert(
@@ -155,8 +155,8 @@ struct grouped_infer_masktype_attnbias_dispatched {
           min(4, thread_slice_length_ak1);
 
       constexpr ck::index_t thread_slice_length_b1k1 =
-          GemmOpConstantsBatchedInfer::B1K1 /
-          GemmOpConstantsBatchedInfer::
+          GemmOpConstantsGroupedInfer::B1K1 /
+          GemmOpConstantsGroupedInfer::
               B1BlockTransferThreadClusterLengths_BK0_N_BK1::At(I2);
       constexpr ck::index_t kB1BlockTransferSrcScalarPerVector_max =
           min(4, thread_slice_length_b1k1);
@@ -164,7 +164,7 @@ struct grouped_infer_masktype_attnbias_dispatched {
       constexpr ck::index_t thread_slice_length_cshuflle_n =
           (kCShuffleNXdlPerWavePerShuffle * kGemm1NPerBlock /
            kGemm1NXdlPerWave) /
-          GemmOpConstantsBatchedInfer::
+          GemmOpConstantsGroupedInfer::
               CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock ::
                   At(I3);
 
@@ -198,12 +198,12 @@ struct grouped_infer_masktype_attnbias_dispatched {
       constexpr ck::index_t kCShuffleNXdlPerWavePerShuffle = 2;
 
       constexpr ck::index_t thread_slice_length_ak1 =
-          GemmOpConstantsBatchedInfer::AK1 /
-          GemmOpConstantsBatchedInfer::
+          GemmOpConstantsGroupedInfer::AK1 /
+          GemmOpConstantsGroupedInfer::
               ABlockTransferThreadClusterLengths_AK0_M_AK1::At(I2);
       constexpr ck::index_t thread_slice_length_bk1 =
-          GemmOpConstantsBatchedInfer::BK1 /
-          GemmOpConstantsBatchedInfer::
+          GemmOpConstantsGroupedInfer::BK1 /
+          GemmOpConstantsGroupedInfer::
               BBlockTransferThreadClusterLengths_BK0_N_BK1::At(I2);
 
       static_assert(
@@ -214,8 +214,8 @@ struct grouped_infer_masktype_attnbias_dispatched {
           min(4, thread_slice_length_ak1);
 
       constexpr ck::index_t thread_slice_length_b1k1 =
-          GemmOpConstantsBatchedInfer::B1K1 /
-          GemmOpConstantsBatchedInfer::
+          GemmOpConstantsGroupedInfer::B1K1 /
+          GemmOpConstantsGroupedInfer::
               B1BlockTransferThreadClusterLengths_BK0_N_BK1::At(I2);
       constexpr ck::index_t kB1BlockTransferSrcScalarPerVector_max =
           min(4, thread_slice_length_b1k1);
@@ -223,7 +223,7 @@ struct grouped_infer_masktype_attnbias_dispatched {
       constexpr ck::index_t thread_slice_length_cshuflle_n =
           (kCShuffleNXdlPerWavePerShuffle * kGemm1NPerBlock /
            kGemm1NXdlPerWave) /
-          GemmOpConstantsBatchedInfer::
+          GemmOpConstantsGroupedInfer::
               CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock ::
                   At(I3);
 
@@ -257,12 +257,12 @@ struct grouped_infer_masktype_attnbias_dispatched {
       constexpr ck::index_t kCShuffleNXdlPerWavePerShuffle = 4;
 
       constexpr ck::index_t thread_slice_length_ak1 =
-          GemmOpConstantsBatchedInfer::AK1 /
-          GemmOpConstantsBatchedInfer::
+          GemmOpConstantsGroupedInfer::AK1 /
+          GemmOpConstantsGroupedInfer::
               ABlockTransferThreadClusterLengths_AK0_M_AK1::At(I2);
       constexpr ck::index_t thread_slice_length_bk1 =
-          GemmOpConstantsBatchedInfer::BK1 /
-          GemmOpConstantsBatchedInfer::
+          GemmOpConstantsGroupedInfer::BK1 /
+          GemmOpConstantsGroupedInfer::
               BBlockTransferThreadClusterLengths_BK0_N_BK1::At(I2);
 
       static_assert(
@@ -273,8 +273,8 @@ struct grouped_infer_masktype_attnbias_dispatched {
           min(4, thread_slice_length_ak1);
 
       constexpr ck::index_t thread_slice_length_b1k1 =
-          GemmOpConstantsBatchedInfer::B1K1 /
-          GemmOpConstantsBatchedInfer::
+          GemmOpConstantsGroupedInfer::B1K1 /
+          GemmOpConstantsGroupedInfer::
               B1BlockTransferThreadClusterLengths_BK0_N_BK1::At(I2);
       constexpr ck::index_t kB1BlockTransferSrcScalarPerVector_max =
           min(4, thread_slice_length_b1k1);
@@ -282,7 +282,7 @@ struct grouped_infer_masktype_attnbias_dispatched {
       constexpr ck::index_t thread_slice_length_cshuflle_n =
           (kCShuffleNXdlPerWavePerShuffle * kGemm1NPerBlock /
            kGemm1NXdlPerWave) /
-          GemmOpConstantsBatchedInfer::
+          GemmOpConstantsGroupedInfer::
               CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock ::
                   At(I3);
 

From 70c25ca0cdf7668473314f14b94d412db125b51a Mon Sep 17 00:00:00 2001
From: Qianfeng Zhang <Qianfeng.Zhang@amd.com>
Date: Tue, 24 Oct 2023 18:46:44 +0000
Subject: [PATCH 110/641] Fix in using align_swith for tuning in infer

---
 .../ck_fmha_backward_gemm_constants.h         |   6 ++
 .../hip_fmha/ck_fmha_batched_infer.h          |  31 +++---
 .../hip_fmha/ck_fmha_common_gemm_constants.h  |  23 ++++
 .../hip_fmha/ck_fmha_device_gemm_constants.h  |   6 +-
 .../hip_fmha/ck_fmha_forward_gemm_constants.h |   6 ++
 .../hip_fmha/ck_fmha_grouped_infer.h          |  31 +++---
 .../hip_fmha/ck_fmha_infer_gemm_constants.h   | 102 ++++++++++++++++++
 7 files changed, 170 insertions(+), 35 deletions(-)
 create mode 100644 xformers/csrc/attention/hip_fmha/ck_fmha_backward_gemm_constants.h
 create mode 100644 xformers/csrc/attention/hip_fmha/ck_fmha_common_gemm_constants.h
 create mode 100644 xformers/csrc/attention/hip_fmha/ck_fmha_forward_gemm_constants.h
 create mode 100644 xformers/csrc/attention/hip_fmha/ck_fmha_infer_gemm_constants.h

diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_backward_gemm_constants.h b/xformers/csrc/attention/hip_fmha/ck_fmha_backward_gemm_constants.h
new file mode 100644
index 000000000..585a83e3a
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_backward_gemm_constants.h
@@ -0,0 +1,6 @@
+#pragma once
+
+#include <ck/ck.hpp>
+#include "ck_fmha_op_helper.h"
+
+struct GemmOpConstantsBackward {};
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_infer.h b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_infer.h
index 08230212e..23e6000cc 100644
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_infer.h
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_infer.h
@@ -12,7 +12,8 @@
 #include "ck/tensor_operation/gpu/device/impl/device_batched_mha_infer_xdl_cshuffle.hpp"
 
 #include "ck_align_switch.h"
-#include "ck_fmha_device_gemm_constants.h"
+#include "ck_fmha_common_gemm_constants.h"
+#include "ck_fmha_infer_gemm_constants.h"
 #include "ck_fmha_op_helper.h"
 #include "ck_fmha_params.h"
 
@@ -126,6 +127,7 @@ struct batched_infer_masktype_attnbias_dispatched {
           kCShuffleBlockTransferScalarPerVector,
           MaskingSpec>;
 
+  static constexpr auto I1 = ck::Number<1>{};
   static constexpr auto I2 = ck::Number<2>{};
   static constexpr auto I3 = ck::Number<3>{};
 
@@ -153,12 +155,11 @@ struct batched_infer_masktype_attnbias_dispatched {
       constexpr ck::index_t kABBlockTransferSrcScalarPerVector_max =
           min(4, thread_slice_length_ak1);
 
-      constexpr ck::index_t thread_slice_length_b1k1 =
-          GemmOpConstantsBatchedInfer::B1K1 /
+      constexpr ck::index_t thread_slice_length_gemm1n = kGemm1NPerBlock /
           GemmOpConstantsBatchedInfer::
-              B1BlockTransferThreadClusterLengths_BK0_N_BK1::At(I2);
+              B1BlockTransferThreadClusterLengths_BK0_N_BK1::At(I1);
       constexpr ck::index_t kB1BlockTransferSrcScalarPerVector_max =
-          min(4, thread_slice_length_b1k1);
+          min(2, thread_slice_length_gemm1n);
 
       constexpr ck::index_t thread_slice_length_cshuflle_n =
           (kCShuffleNXdlPerWavePerShuffle * kGemm1NPerBlock /
@@ -168,7 +169,7 @@ struct batched_infer_masktype_attnbias_dispatched {
                   At(I3);
 
       constexpr ck::index_t kCShuffleBlockTransferScalarPerVector_max =
-          min(4, thread_slice_length_cshuflle_n);
+          min(1, thread_slice_length_cshuflle_n);
 
       ALIGN_SWITCH_3(
           kABBlockTransferSrcScalarPerVector_max,
@@ -212,12 +213,11 @@ struct batched_infer_masktype_attnbias_dispatched {
       constexpr ck::index_t kABBlockTransferSrcScalarPerVector_max =
           min(4, thread_slice_length_ak1);
 
-      constexpr ck::index_t thread_slice_length_b1k1 =
-          GemmOpConstantsBatchedInfer::B1K1 /
+      constexpr ck::index_t thread_slice_length_gemm1n = kGemm1NPerBlock /
           GemmOpConstantsBatchedInfer::
-              B1BlockTransferThreadClusterLengths_BK0_N_BK1::At(I2);
+              B1BlockTransferThreadClusterLengths_BK0_N_BK1::At(I1);
       constexpr ck::index_t kB1BlockTransferSrcScalarPerVector_max =
-          min(4, thread_slice_length_b1k1);
+          min(2, thread_slice_length_gemm1n);
 
       constexpr ck::index_t thread_slice_length_cshuflle_n =
           (kCShuffleNXdlPerWavePerShuffle * kGemm1NPerBlock /
@@ -227,7 +227,7 @@ struct batched_infer_masktype_attnbias_dispatched {
                   At(I3);
 
       constexpr ck::index_t kCShuffleBlockTransferScalarPerVector_max =
-          min(4, thread_slice_length_cshuflle_n);
+          min(1, thread_slice_length_cshuflle_n);
 
       ALIGN_SWITCH_3(
           kABBlockTransferSrcScalarPerVector_max,
@@ -271,12 +271,11 @@ struct batched_infer_masktype_attnbias_dispatched {
       constexpr ck::index_t kABBlockTransferSrcScalarPerVector_max =
           min(4, thread_slice_length_ak1);
 
-      constexpr ck::index_t thread_slice_length_b1k1 =
-          GemmOpConstantsBatchedInfer::B1K1 /
+      constexpr ck::index_t thread_slice_length_gemm1n = kGemm1NPerBlock /
           GemmOpConstantsBatchedInfer::
-              B1BlockTransferThreadClusterLengths_BK0_N_BK1::At(I2);
+              B1BlockTransferThreadClusterLengths_BK0_N_BK1::At(I1);
       constexpr ck::index_t kB1BlockTransferSrcScalarPerVector_max =
-          min(4, thread_slice_length_b1k1);
+          min(2, thread_slice_length_gemm1n);
 
       constexpr ck::index_t thread_slice_length_cshuflle_n =
           (kCShuffleNXdlPerWavePerShuffle * kGemm1NPerBlock /
@@ -286,7 +285,7 @@ struct batched_infer_masktype_attnbias_dispatched {
                   At(I3);
 
       constexpr ck::index_t kCShuffleBlockTransferScalarPerVector_max =
-          min(4, thread_slice_length_cshuflle_n);
+          min(1, thread_slice_length_cshuflle_n);
 
       ALIGN_SWITCH_3(
           kABBlockTransferSrcScalarPerVector_max,
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_common_gemm_constants.h b/xformers/csrc/attention/hip_fmha/ck_fmha_common_gemm_constants.h
new file mode 100644
index 000000000..654a7f8db
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_common_gemm_constants.h
@@ -0,0 +1,23 @@
+#pragma once
+
+#include <ck/ck.hpp>
+#include "ck_fmha_op_helper.h"
+
+// list the template parameters that is commonly used
+struct GemmOpConstantsCommon {
+  static constexpr ck::index_t NumDimG = 2;
+  static constexpr ck::index_t NumDimM = 1;
+  static constexpr ck::index_t NumDimN = 1;
+  static constexpr ck::index_t NumDimK = 1;
+  static constexpr ck::index_t NumDimO = 1;
+
+  static constexpr auto TensorSpecA =
+      ck::tensor_operation::device::TensorSpecialization::Default;
+  static constexpr auto TensorSpecB0 =
+      ck::tensor_operation::device::TensorSpecialization::Default;
+  static constexpr auto TensorSpecB1 =
+      ck::tensor_operation::device::TensorSpecialization::Default;
+  static constexpr auto TensorSpecC =
+      ck::tensor_operation::device::TensorSpecialization::Default;
+};
+
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_device_gemm_constants.h b/xformers/csrc/attention/hip_fmha/ck_fmha_device_gemm_constants.h
index eefb60992..e49d6d4dc 100644
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_device_gemm_constants.h
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_device_gemm_constants.h
@@ -58,7 +58,7 @@ struct GemmOpConstantsBatchedInfer {
   using B1BlockTransferSrcAccessOrder = S<0, 2, 1>;
   static constexpr ck::index_t B1BlockTransferSrcVectorDim = 1;
   // static constexpr ck::index_t B1BlockTransferSrcScalarPerVector;
-  static constexpr ck::index_t B1BlockTransferDstScalarPerVector_BK1 = 2;
+  static constexpr ck::index_t B1BlockTransferDstScalarPerVector_BK1 = 4;
   static constexpr bool B1BlockLdsExtraN = false;
   static constexpr ck::index_t CShuffleMXdlPerWavePerShuffle = 1;
   // static constexpr ck::index_t CShuffleNXdlPerWavePerShuffle;
@@ -100,12 +100,12 @@ struct GemmOpConstantsGroupedInfer {
   static constexpr ck::index_t BBlockTransferDstScalarPerVector_BK1 = 8;
   static constexpr bool BBlockLdsExtraN = true;
   // static constexpr ck::index_t Acc0BiasTransferSrcScalarPerVector;
-  using B1BlockTransferThreadClusterLengths_BK0_N_BK1 = S<8, 32, 1>;
+  using B1BlockTransferThreadClusterLengths_BK0_N_BK1 = S<16, 16, 1>;
   using B1BlockTransferThreadClusterArrangeOrder = S<0, 2, 1>;
   using B1BlockTransferSrcAccessOrder = S<0, 2, 1>;
   static constexpr ck::index_t B1BlockTransferSrcVectorDim = 1;
   // static constexpr ck::index_t B1BlockTransferSrcScalarPerVector;
-  static constexpr ck::index_t B1BlockTransferDstScalarPerVector_BK1 = 2;
+  static constexpr ck::index_t B1BlockTransferDstScalarPerVector_BK1 = 4;
   static constexpr bool B1BlockLdsExtraN = false;
   static constexpr ck::index_t CShuffleMXdlPerWavePerShuffle = 1;
   // static constexpr ck::index_t CShuffleNXdlPerWavePerShuffle;
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_forward_gemm_constants.h b/xformers/csrc/attention/hip_fmha/ck_fmha_forward_gemm_constants.h
new file mode 100644
index 000000000..673adbea8
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_forward_gemm_constants.h
@@ -0,0 +1,6 @@
+#pragma once
+
+#include <ck/ck.hpp>
+#include "ck_fmha_op_helper.h"
+
+struct GemmOpConstantsForward {};
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_infer.h b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_infer.h
index e30d4c06a..f24ed6c7c 100644
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_infer.h
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_infer.h
@@ -13,7 +13,8 @@
 #include "ck/tensor_operation/gpu/device/impl/device_grouped_mha_infer_xdl_cshuffle.hpp"
 
 #include "ck_align_switch.h"
-#include "ck_fmha_device_gemm_constants.h"
+#include "ck_fmha_common_gemm_constants.h"
+#include "ck_fmha_infer_gemm_constants.h"
 #include "ck_fmha_op_helper.h"
 #include "ck_fmha_params.h"
 
@@ -127,6 +128,7 @@ struct grouped_infer_masktype_attnbias_dispatched {
           kCShuffleBlockTransferScalarPerVector,
           MaskingSpec>;
 
+  static constexpr auto I1 = ck::Number<1>{};
   static constexpr auto I2 = ck::Number<2>{};
   static constexpr auto I3 = ck::Number<3>{};
 
@@ -154,12 +156,11 @@ struct grouped_infer_masktype_attnbias_dispatched {
       constexpr ck::index_t kABBlockTransferSrcScalarPerVector_max =
           min(4, thread_slice_length_ak1);
 
-      constexpr ck::index_t thread_slice_length_b1k1 =
-          GemmOpConstantsGroupedInfer::B1K1 /
+      constexpr ck::index_t thread_slice_length_gemm1n = kGemm1NPerBlock /
           GemmOpConstantsGroupedInfer::
-              B1BlockTransferThreadClusterLengths_BK0_N_BK1::At(I2);
+              B1BlockTransferThreadClusterLengths_BK0_N_BK1::At(I1);
       constexpr ck::index_t kB1BlockTransferSrcScalarPerVector_max =
-          min(4, thread_slice_length_b1k1);
+          min(2, thread_slice_length_gemm1n);
 
       constexpr ck::index_t thread_slice_length_cshuflle_n =
           (kCShuffleNXdlPerWavePerShuffle * kGemm1NPerBlock /
@@ -169,7 +170,7 @@ struct grouped_infer_masktype_attnbias_dispatched {
                   At(I3);
 
       constexpr ck::index_t kCShuffleBlockTransferScalarPerVector_max =
-          min(4, thread_slice_length_cshuflle_n);
+          min(1, thread_slice_length_cshuflle_n);
 
       ALIGN_SWITCH_3(
           kABBlockTransferSrcScalarPerVector_max,
@@ -213,12 +214,11 @@ struct grouped_infer_masktype_attnbias_dispatched {
       constexpr ck::index_t kABBlockTransferSrcScalarPerVector_max =
           min(4, thread_slice_length_ak1);
 
-      constexpr ck::index_t thread_slice_length_b1k1 =
-          GemmOpConstantsGroupedInfer::B1K1 /
+      constexpr ck::index_t thread_slice_length_gemm1n = kGemm1NPerBlock /
           GemmOpConstantsGroupedInfer::
-              B1BlockTransferThreadClusterLengths_BK0_N_BK1::At(I2);
+              B1BlockTransferThreadClusterLengths_BK0_N_BK1::At(I1);
       constexpr ck::index_t kB1BlockTransferSrcScalarPerVector_max =
-          min(4, thread_slice_length_b1k1);
+          min(2, thread_slice_length_gemm1n);
 
       constexpr ck::index_t thread_slice_length_cshuflle_n =
           (kCShuffleNXdlPerWavePerShuffle * kGemm1NPerBlock /
@@ -228,7 +228,7 @@ struct grouped_infer_masktype_attnbias_dispatched {
                   At(I3);
 
       constexpr ck::index_t kCShuffleBlockTransferScalarPerVector_max =
-          min(4, thread_slice_length_cshuflle_n);
+          min(1, thread_slice_length_cshuflle_n);
 
       ALIGN_SWITCH_3(
           kABBlockTransferSrcScalarPerVector_max,
@@ -272,12 +272,11 @@ struct grouped_infer_masktype_attnbias_dispatched {
       constexpr ck::index_t kABBlockTransferSrcScalarPerVector_max =
           min(4, thread_slice_length_ak1);
 
-      constexpr ck::index_t thread_slice_length_b1k1 =
-          GemmOpConstantsGroupedInfer::B1K1 /
+      constexpr ck::index_t thread_slice_length_gemm1n = kGemm1NPerBlock /
           GemmOpConstantsGroupedInfer::
-              B1BlockTransferThreadClusterLengths_BK0_N_BK1::At(I2);
+              B1BlockTransferThreadClusterLengths_BK0_N_BK1::At(I1);
       constexpr ck::index_t kB1BlockTransferSrcScalarPerVector_max =
-          min(4, thread_slice_length_b1k1);
+          min(2, thread_slice_length_gemm1n);
 
       constexpr ck::index_t thread_slice_length_cshuflle_n =
           (kCShuffleNXdlPerWavePerShuffle * kGemm1NPerBlock /
@@ -287,7 +286,7 @@ struct grouped_infer_masktype_attnbias_dispatched {
                   At(I3);
 
       constexpr ck::index_t kCShuffleBlockTransferScalarPerVector_max =
-          min(4, thread_slice_length_cshuflle_n);
+          min(1, thread_slice_length_cshuflle_n);
 
       ALIGN_SWITCH_3(
           kABBlockTransferSrcScalarPerVector_max,
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_infer_gemm_constants.h b/xformers/csrc/attention/hip_fmha/ck_fmha_infer_gemm_constants.h
new file mode 100644
index 000000000..ae66edc1c
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_infer_gemm_constants.h
@@ -0,0 +1,102 @@
+#pragma once
+
+#include <ck/ck.hpp>
+#include "ck_fmha_op_helper.h"
+
+// list the template parameters that will not be tuned,
+// the commented lines gives the tunable template parameters
+struct GemmOpConstantsBatchedInfer {
+  static constexpr ck::index_t BlockSize = 256;
+  static constexpr ck::index_t MPerBlock = 128;
+  static constexpr ck::index_t NPerBlock = 128;
+  static constexpr ck::index_t KPerBlock = 32;
+  // static constexpr ck::index_t Gemm1NPerBlock;
+  static constexpr ck::index_t Gemm1KPerBlock = 32;
+  static constexpr ck::index_t AK1 = 8;
+  static constexpr ck::index_t BK1 = 8;
+  static constexpr ck::index_t B1K1 = 2;
+  static constexpr ck::index_t MPerXDL = 32;
+  static constexpr ck::index_t NPerXDL = 32;
+  static constexpr ck::index_t MXdlPerWave = 1;
+  static constexpr ck::index_t NXdlPerWave = 4;
+  // static constexpr ck::index_t Gemm1NXdlPerWave;
+  using ABlockTransferThreadClusterLengths_AK0_M_AK1 = S<4, 64, 1>;
+  using ABlockTransferThreadClusterArrangeOrder = S<1, 0, 2>;
+  using ABlockTransferSrcAccessOrder = S<1, 0, 2>;
+  static constexpr ck::index_t ABlockTransferSrcVectorDim = 2;
+  // static constexpr ck::index_t ABlockTransferSrcScalarPerVector;
+  static constexpr ck::index_t ABlockTransferDstScalarPerVector_AK1 = 8;
+  static constexpr bool ABlockLdsExtraM = true;
+  using BBlockTransferThreadClusterLengths_BK0_N_BK1 = S<4, 64, 1>;
+  using BBlockTransferThreadClusterArrangeOrder = S<1, 0, 2>;
+  using BBlockTransferSrcAccessOrder = S<1, 0, 2>;
+  static constexpr ck::index_t BBlockTransferSrcVectorDim = 2;
+  // static constexpr ck::index_t BBlockTransferSrcScalarPerVector;
+  static constexpr ck::index_t BBlockTransferDstScalarPerVector_BK1 = 8;
+  static constexpr bool BBlockLdsExtraN = true;
+  // static constexpr ck::index_t Acc0BiasTransferSrcScalarPerVector;
+  using B1BlockTransferThreadClusterLengths_BK0_N_BK1 = S<16, 16, 1>;
+  using B1BlockTransferThreadClusterArrangeOrder = S<0, 2, 1>;
+  using B1BlockTransferSrcAccessOrder = S<0, 2, 1>;
+  static constexpr ck::index_t B1BlockTransferSrcVectorDim = 1;
+  // static constexpr ck::index_t B1BlockTransferSrcScalarPerVector;
+  static constexpr ck::index_t B1BlockTransferDstScalarPerVector_BK1 = 2;
+  static constexpr bool B1BlockLdsExtraN = false;
+  static constexpr ck::index_t CShuffleMXdlPerWavePerShuffle = 1;
+  // static constexpr ck::index_t CShuffleNXdlPerWavePerShuffle;
+  using CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock =
+      S<1, 32, 1, 8>;
+  // static constexpr ck::index_t
+  // CShuffleBlockTransferScalarPerVector_NPerBlock;
+};
+
+// list the template parameters that will not be tuned,
+// the commented lines gives the tunable template parameters
+struct GemmOpConstantsGroupedInfer {
+  static constexpr ck::index_t BlockSize = 256;
+  static constexpr ck::index_t MPerBlock = 128;
+  static constexpr ck::index_t NPerBlock = 128;
+  static constexpr ck::index_t KPerBlock = 32;
+  // static constexpr ck::index_t Gemm1NPerBlock;
+  static constexpr ck::index_t Gemm1KPerBlock = 32;
+  static constexpr ck::index_t AK1 = 8;
+  static constexpr ck::index_t BK1 = 8;
+  static constexpr ck::index_t B1K1 = 2;
+  static constexpr ck::index_t MPerXDL = 32;
+  static constexpr ck::index_t NPerXDL = 32;
+  static constexpr ck::index_t MXdlPerWave = 1;
+  static constexpr ck::index_t NXdlPerWave = 4;
+  // static constexpr ck::index_t Gemm1NXdlPerWave;
+  using ABlockTransferThreadClusterLengths_AK0_M_AK1 = S<4, 64, 1>;
+  using ABlockTransferThreadClusterArrangeOrder = S<1, 0, 2>;
+  using ABlockTransferSrcAccessOrder = S<1, 0, 2>;
+  static constexpr ck::index_t ABlockTransferSrcVectorDim = 2;
+  // static constexpr ck::index_t ABlockTransferSrcScalarPerVector,
+  static constexpr ck::index_t ABlockTransferDstScalarPerVector_AK1 = 8;
+  static constexpr bool ABlockLdsExtraM = true;
+  using BBlockTransferThreadClusterLengths_BK0_N_BK1 = S<4, 64, 1>;
+  using BBlockTransferThreadClusterArrangeOrder = S<1, 0, 2>;
+  using BBlockTransferSrcAccessOrder = S<1, 0, 2>;
+  static constexpr ck::index_t BBlockTransferSrcVectorDim = 2;
+  // static constexpr ck::index_t BBlockTransferSrcScalarPerVector;
+  static constexpr ck::index_t BBlockTransferDstScalarPerVector_BK1 = 8;
+  static constexpr bool BBlockLdsExtraN = true;
+  // static constexpr ck::index_t Acc0BiasTransferSrcScalarPerVector;
+  using B1BlockTransferThreadClusterLengths_BK0_N_BK1 = S<16, 16, 1>;
+  using B1BlockTransferThreadClusterArrangeOrder = S<0, 2, 1>;
+  using B1BlockTransferSrcAccessOrder = S<0, 2, 1>;
+  static constexpr ck::index_t B1BlockTransferSrcVectorDim = 1;
+  // static constexpr ck::index_t B1BlockTransferSrcScalarPerVector;
+  static constexpr ck::index_t B1BlockTransferDstScalarPerVector_BK1 = 2;
+  static constexpr bool B1BlockLdsExtraN = false;
+  static constexpr ck::index_t CShuffleMXdlPerWavePerShuffle = 1;
+  // static constexpr ck::index_t CShuffleNXdlPerWavePerShuffle;
+  using CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock =
+      S<1, 32, 1, 8>;
+  // static constexpr ck::index_t
+  // CShuffleBlockTransferScalarPerVector_NPerBlock;
+};
+
+struct GemmOpConstantsForward {};
+
+struct GemmOpConstantsBackward {};

From 7249076e032f64f9c20c5feea77281b009f55064 Mon Sep 17 00:00:00 2001
From: Qianfeng Zhang <Qianfeng.Zhang@amd.com>
Date: Tue, 24 Oct 2023 19:50:03 +0000
Subject: [PATCH 111/641] Split the .cpp files for infer to speed-up the
 compiling

---
 ...k_fmha_batched_infer_bp16_masktype_0_no_attnbias.cpp} | 5 -----
 ..._fmha_batched_infer_bp16_masktype_0_with_attnbias.cpp | 9 +++++++++
 ...k_fmha_batched_infer_bp16_masktype_1_no_attnbias.cpp} | 5 -----
 ..._fmha_batched_infer_bp16_masktype_1_with_attnbias.cpp | 9 +++++++++
 ...k_fmha_batched_infer_bp16_masktype_2_no_attnbias.cpp} | 5 -----
 ..._fmha_batched_infer_bp16_masktype_2_with_attnbias.cpp | 9 +++++++++
 ...k_fmha_batched_infer_fp16_masktype_0_no_attnbias.cpp} | 2 --
 ..._fmha_batched_infer_fp16_masktype_0_with_attnbias.cpp | 6 ++++++
 ...k_fmha_batched_infer_fp16_masktype_1_no_attnbias.cpp} | 2 --
 ..._fmha_batched_infer_fp16_masktype_1_with_attnbias.cpp | 6 ++++++
 ...k_fmha_batched_infer_fp16_masktype_2_no_attnbias.cpp} | 2 --
 ..._fmha_batched_infer_fp16_masktype_2_with_attnbias.cpp | 6 ++++++
 ...k_fmha_grouped_infer_bp16_masktype_0_no_attnbias.cpp} | 5 -----
 ..._fmha_grouped_infer_bp16_masktype_0_with_attnbias.cpp | 9 +++++++++
 ...k_fmha_grouped_infer_bp16_masktype_1_no_attnbias.cpp} | 5 -----
 ..._fmha_grouped_infer_bp16_masktype_1_with_attnbias.cpp | 9 +++++++++
 ...k_fmha_grouped_infer_bp16_masktype_2_no_attnbias.cpp} | 5 -----
 ..._fmha_grouped_infer_bp16_masktype_2_with_attnbias.cpp | 9 +++++++++
 ...k_fmha_grouped_infer_fp16_masktype_0_no_attnbias.cpp} | 2 --
 ..._fmha_grouped_infer_fp16_masktype_0_with_attnbias.cpp | 6 ++++++
 ...k_fmha_grouped_infer_fp16_masktype_1_no_attnbias.cpp} | 2 --
 ..._fmha_grouped_infer_fp16_masktype_1_with_attnbias.cpp | 6 ++++++
 ...k_fmha_grouped_infer_fp16_masktype_2_no_attnbias.cpp} | 2 --
 ..._fmha_grouped_infer_fp16_masktype_2_with_attnbias.cpp | 6 ++++++
 24 files changed, 90 insertions(+), 42 deletions(-)
 rename xformers/csrc/attention/hip_fmha/{ck_fmha_batched_infer_bp16_masktype_0.cpp => ck_fmha_batched_infer_bp16_masktype_0_no_attnbias.cpp} (64%)
 create mode 100644 xformers/csrc/attention/hip_fmha/ck_fmha_batched_infer_bp16_masktype_0_with_attnbias.cpp
 rename xformers/csrc/attention/hip_fmha/{ck_fmha_batched_infer_bp16_masktype_1.cpp => ck_fmha_batched_infer_bp16_masktype_1_no_attnbias.cpp} (64%)
 create mode 100644 xformers/csrc/attention/hip_fmha/ck_fmha_batched_infer_bp16_masktype_1_with_attnbias.cpp
 rename xformers/csrc/attention/hip_fmha/{ck_fmha_batched_infer_bp16_masktype_2.cpp => ck_fmha_batched_infer_bp16_masktype_2_no_attnbias.cpp} (64%)
 create mode 100644 xformers/csrc/attention/hip_fmha/ck_fmha_batched_infer_bp16_masktype_2_with_attnbias.cpp
 rename xformers/csrc/attention/hip_fmha/{ck_fmha_batched_infer_fp16_masktype_0.cpp => ck_fmha_batched_infer_fp16_masktype_0_no_attnbias.cpp} (67%)
 create mode 100644 xformers/csrc/attention/hip_fmha/ck_fmha_batched_infer_fp16_masktype_0_with_attnbias.cpp
 rename xformers/csrc/attention/hip_fmha/{ck_fmha_batched_infer_fp16_masktype_1.cpp => ck_fmha_batched_infer_fp16_masktype_1_no_attnbias.cpp} (67%)
 create mode 100644 xformers/csrc/attention/hip_fmha/ck_fmha_batched_infer_fp16_masktype_1_with_attnbias.cpp
 rename xformers/csrc/attention/hip_fmha/{ck_fmha_batched_infer_fp16_masktype_2.cpp => ck_fmha_batched_infer_fp16_masktype_2_no_attnbias.cpp} (67%)
 create mode 100644 xformers/csrc/attention/hip_fmha/ck_fmha_batched_infer_fp16_masktype_2_with_attnbias.cpp
 rename xformers/csrc/attention/hip_fmha/{ck_fmha_grouped_infer_bp16_masktype_0.cpp => ck_fmha_grouped_infer_bp16_masktype_0_no_attnbias.cpp} (64%)
 create mode 100644 xformers/csrc/attention/hip_fmha/ck_fmha_grouped_infer_bp16_masktype_0_with_attnbias.cpp
 rename xformers/csrc/attention/hip_fmha/{ck_fmha_grouped_infer_bp16_masktype_1.cpp => ck_fmha_grouped_infer_bp16_masktype_1_no_attnbias.cpp} (64%)
 create mode 100644 xformers/csrc/attention/hip_fmha/ck_fmha_grouped_infer_bp16_masktype_1_with_attnbias.cpp
 rename xformers/csrc/attention/hip_fmha/{ck_fmha_grouped_infer_bp16_masktype_2.cpp => ck_fmha_grouped_infer_bp16_masktype_2_no_attnbias.cpp} (64%)
 create mode 100644 xformers/csrc/attention/hip_fmha/ck_fmha_grouped_infer_bp16_masktype_2_with_attnbias.cpp
 rename xformers/csrc/attention/hip_fmha/{ck_fmha_grouped_infer_fp16_masktype_0.cpp => ck_fmha_grouped_infer_fp16_masktype_0_no_attnbias.cpp} (67%)
 create mode 100644 xformers/csrc/attention/hip_fmha/ck_fmha_grouped_infer_fp16_masktype_0_with_attnbias.cpp
 rename xformers/csrc/attention/hip_fmha/{ck_fmha_grouped_infer_fp16_masktype_1.cpp => ck_fmha_grouped_infer_fp16_masktype_1_no_attnbias.cpp} (67%)
 create mode 100644 xformers/csrc/attention/hip_fmha/ck_fmha_grouped_infer_fp16_masktype_1_with_attnbias.cpp
 rename xformers/csrc/attention/hip_fmha/{ck_fmha_grouped_infer_fp16_masktype_2.cpp => ck_fmha_grouped_infer_fp16_masktype_2_no_attnbias.cpp} (67%)
 create mode 100644 xformers/csrc/attention/hip_fmha/ck_fmha_grouped_infer_fp16_masktype_2_with_attnbias.cpp

diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_infer_bp16_masktype_0.cpp b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_infer_bp16_masktype_0_no_attnbias.cpp
similarity index 64%
rename from xformers/csrc/attention/hip_fmha/ck_fmha_batched_infer_bp16_masktype_0.cpp
rename to xformers/csrc/attention/hip_fmha/ck_fmha_batched_infer_bp16_masktype_0_no_attnbias.cpp
index 7d0a4c910..9e1947e67 100644
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_infer_bp16_masktype_0.cpp
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_infer_bp16_masktype_0_no_attnbias.cpp
@@ -3,11 +3,6 @@
 
 #include "ck_fmha_batched_infer.h"
 
-template struct batched_infer_masktype_attnbias_dispatched<
-    ck::bhalf_t,
-    0,
-    true>;
-
 template struct batched_infer_masktype_attnbias_dispatched<
     ck::bhalf_t,
     0,
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_infer_bp16_masktype_0_with_attnbias.cpp b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_infer_bp16_masktype_0_with_attnbias.cpp
new file mode 100644
index 000000000..e6c5c49fe
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_infer_bp16_masktype_0_with_attnbias.cpp
@@ -0,0 +1,9 @@
+#include <ck/ck.hpp>
+#include <stdexcept>
+
+#include "ck_fmha_batched_infer.h"
+
+template struct batched_infer_masktype_attnbias_dispatched<
+    ck::bhalf_t,
+    0,
+    true>;
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_infer_bp16_masktype_1.cpp b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_infer_bp16_masktype_1_no_attnbias.cpp
similarity index 64%
rename from xformers/csrc/attention/hip_fmha/ck_fmha_batched_infer_bp16_masktype_1.cpp
rename to xformers/csrc/attention/hip_fmha/ck_fmha_batched_infer_bp16_masktype_1_no_attnbias.cpp
index 5aad14a67..9227f7063 100644
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_infer_bp16_masktype_1.cpp
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_infer_bp16_masktype_1_no_attnbias.cpp
@@ -3,11 +3,6 @@
 
 #include "ck_fmha_batched_infer.h"
 
-template struct batched_infer_masktype_attnbias_dispatched<
-    ck::bhalf_t,
-    1,
-    true>;
-
 template struct batched_infer_masktype_attnbias_dispatched<
     ck::bhalf_t,
     1,
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_infer_bp16_masktype_1_with_attnbias.cpp b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_infer_bp16_masktype_1_with_attnbias.cpp
new file mode 100644
index 000000000..fab028901
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_infer_bp16_masktype_1_with_attnbias.cpp
@@ -0,0 +1,9 @@
+#include <ck/ck.hpp>
+#include <stdexcept>
+
+#include "ck_fmha_batched_infer.h"
+
+template struct batched_infer_masktype_attnbias_dispatched<
+    ck::bhalf_t,
+    1,
+    true>;
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_infer_bp16_masktype_2.cpp b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_infer_bp16_masktype_2_no_attnbias.cpp
similarity index 64%
rename from xformers/csrc/attention/hip_fmha/ck_fmha_batched_infer_bp16_masktype_2.cpp
rename to xformers/csrc/attention/hip_fmha/ck_fmha_batched_infer_bp16_masktype_2_no_attnbias.cpp
index e0ddb158d..0d7a88e0e 100644
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_infer_bp16_masktype_2.cpp
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_infer_bp16_masktype_2_no_attnbias.cpp
@@ -3,11 +3,6 @@
 
 #include "ck_fmha_batched_infer.h"
 
-template struct batched_infer_masktype_attnbias_dispatched<
-    ck::bhalf_t,
-    2,
-    true>;
-
 template struct batched_infer_masktype_attnbias_dispatched<
     ck::bhalf_t,
     2,
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_infer_bp16_masktype_2_with_attnbias.cpp b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_infer_bp16_masktype_2_with_attnbias.cpp
new file mode 100644
index 000000000..57af33adb
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_infer_bp16_masktype_2_with_attnbias.cpp
@@ -0,0 +1,9 @@
+#include <ck/ck.hpp>
+#include <stdexcept>
+
+#include "ck_fmha_batched_infer.h"
+
+template struct batched_infer_masktype_attnbias_dispatched<
+    ck::bhalf_t,
+    2,
+    true>;
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_infer_fp16_masktype_0.cpp b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_infer_fp16_masktype_0_no_attnbias.cpp
similarity index 67%
rename from xformers/csrc/attention/hip_fmha/ck_fmha_batched_infer_fp16_masktype_0.cpp
rename to xformers/csrc/attention/hip_fmha/ck_fmha_batched_infer_fp16_masktype_0_no_attnbias.cpp
index fa3ac06cd..838baed94 100644
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_infer_fp16_masktype_0.cpp
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_infer_fp16_masktype_0_no_attnbias.cpp
@@ -3,8 +3,6 @@
 
 #include "ck_fmha_batched_infer.h"
 
-template struct batched_infer_masktype_attnbias_dispatched<ck::half_t, 0, true>;
-
 template struct batched_infer_masktype_attnbias_dispatched<
     ck::half_t,
     0,
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_infer_fp16_masktype_0_with_attnbias.cpp b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_infer_fp16_masktype_0_with_attnbias.cpp
new file mode 100644
index 000000000..0d5f091c2
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_infer_fp16_masktype_0_with_attnbias.cpp
@@ -0,0 +1,6 @@
+#include <ck/ck.hpp>
+#include <stdexcept>
+
+#include "ck_fmha_batched_infer.h"
+
+template struct batched_infer_masktype_attnbias_dispatched<ck::half_t, 0, true>;
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_infer_fp16_masktype_1.cpp b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_infer_fp16_masktype_1_no_attnbias.cpp
similarity index 67%
rename from xformers/csrc/attention/hip_fmha/ck_fmha_batched_infer_fp16_masktype_1.cpp
rename to xformers/csrc/attention/hip_fmha/ck_fmha_batched_infer_fp16_masktype_1_no_attnbias.cpp
index ea4833f23..21324abb5 100644
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_infer_fp16_masktype_1.cpp
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_infer_fp16_masktype_1_no_attnbias.cpp
@@ -3,8 +3,6 @@
 
 #include "ck_fmha_batched_infer.h"
 
-template struct batched_infer_masktype_attnbias_dispatched<ck::half_t, 1, true>;
-
 template struct batched_infer_masktype_attnbias_dispatched<
     ck::half_t,
     1,
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_infer_fp16_masktype_1_with_attnbias.cpp b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_infer_fp16_masktype_1_with_attnbias.cpp
new file mode 100644
index 000000000..0e8a8c384
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_infer_fp16_masktype_1_with_attnbias.cpp
@@ -0,0 +1,6 @@
+#include <ck/ck.hpp>
+#include <stdexcept>
+
+#include "ck_fmha_batched_infer.h"
+
+template struct batched_infer_masktype_attnbias_dispatched<ck::half_t, 1, true>;
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_infer_fp16_masktype_2.cpp b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_infer_fp16_masktype_2_no_attnbias.cpp
similarity index 67%
rename from xformers/csrc/attention/hip_fmha/ck_fmha_batched_infer_fp16_masktype_2.cpp
rename to xformers/csrc/attention/hip_fmha/ck_fmha_batched_infer_fp16_masktype_2_no_attnbias.cpp
index 54c046e61..19b4aa0f7 100644
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_infer_fp16_masktype_2.cpp
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_infer_fp16_masktype_2_no_attnbias.cpp
@@ -3,8 +3,6 @@
 
 #include "ck_fmha_batched_infer.h"
 
-template struct batched_infer_masktype_attnbias_dispatched<ck::half_t, 2, true>;
-
 template struct batched_infer_masktype_attnbias_dispatched<
     ck::half_t,
     2,
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_infer_fp16_masktype_2_with_attnbias.cpp b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_infer_fp16_masktype_2_with_attnbias.cpp
new file mode 100644
index 000000000..e471b0550
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_infer_fp16_masktype_2_with_attnbias.cpp
@@ -0,0 +1,6 @@
+#include <ck/ck.hpp>
+#include <stdexcept>
+
+#include "ck_fmha_batched_infer.h"
+
+template struct batched_infer_masktype_attnbias_dispatched<ck::half_t, 2, true>;
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_infer_bp16_masktype_0.cpp b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_infer_bp16_masktype_0_no_attnbias.cpp
similarity index 64%
rename from xformers/csrc/attention/hip_fmha/ck_fmha_grouped_infer_bp16_masktype_0.cpp
rename to xformers/csrc/attention/hip_fmha/ck_fmha_grouped_infer_bp16_masktype_0_no_attnbias.cpp
index 6b6658de6..67b1dae7c 100644
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_infer_bp16_masktype_0.cpp
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_infer_bp16_masktype_0_no_attnbias.cpp
@@ -3,11 +3,6 @@
 
 #include "ck_fmha_grouped_infer.h"
 
-template struct grouped_infer_masktype_attnbias_dispatched<
-    ck::bhalf_t,
-    0,
-    true>;
-
 template struct grouped_infer_masktype_attnbias_dispatched<
     ck::bhalf_t,
     0,
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_infer_bp16_masktype_0_with_attnbias.cpp b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_infer_bp16_masktype_0_with_attnbias.cpp
new file mode 100644
index 000000000..343ba049d
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_infer_bp16_masktype_0_with_attnbias.cpp
@@ -0,0 +1,9 @@
+#include <ck/ck.hpp>
+#include <stdexcept>
+
+#include "ck_fmha_grouped_infer.h"
+
+template struct grouped_infer_masktype_attnbias_dispatched<
+    ck::bhalf_t,
+    0,
+    true>;
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_infer_bp16_masktype_1.cpp b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_infer_bp16_masktype_1_no_attnbias.cpp
similarity index 64%
rename from xformers/csrc/attention/hip_fmha/ck_fmha_grouped_infer_bp16_masktype_1.cpp
rename to xformers/csrc/attention/hip_fmha/ck_fmha_grouped_infer_bp16_masktype_1_no_attnbias.cpp
index 232517d2b..c42bacba3 100644
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_infer_bp16_masktype_1.cpp
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_infer_bp16_masktype_1_no_attnbias.cpp
@@ -3,11 +3,6 @@
 
 #include "ck_fmha_grouped_infer.h"
 
-template struct grouped_infer_masktype_attnbias_dispatched<
-    ck::bhalf_t,
-    1,
-    true>;
-
 template struct grouped_infer_masktype_attnbias_dispatched<
     ck::bhalf_t,
     1,
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_infer_bp16_masktype_1_with_attnbias.cpp b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_infer_bp16_masktype_1_with_attnbias.cpp
new file mode 100644
index 000000000..fc9563043
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_infer_bp16_masktype_1_with_attnbias.cpp
@@ -0,0 +1,9 @@
+#include <ck/ck.hpp>
+#include <stdexcept>
+
+#include "ck_fmha_grouped_infer.h"
+
+template struct grouped_infer_masktype_attnbias_dispatched<
+    ck::bhalf_t,
+    1,
+    true>;
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_infer_bp16_masktype_2.cpp b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_infer_bp16_masktype_2_no_attnbias.cpp
similarity index 64%
rename from xformers/csrc/attention/hip_fmha/ck_fmha_grouped_infer_bp16_masktype_2.cpp
rename to xformers/csrc/attention/hip_fmha/ck_fmha_grouped_infer_bp16_masktype_2_no_attnbias.cpp
index 19e58447a..2599755a0 100644
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_infer_bp16_masktype_2.cpp
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_infer_bp16_masktype_2_no_attnbias.cpp
@@ -3,11 +3,6 @@
 
 #include "ck_fmha_grouped_infer.h"
 
-template struct grouped_infer_masktype_attnbias_dispatched<
-    ck::bhalf_t,
-    2,
-    true>;
-
 template struct grouped_infer_masktype_attnbias_dispatched<
     ck::bhalf_t,
     2,
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_infer_bp16_masktype_2_with_attnbias.cpp b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_infer_bp16_masktype_2_with_attnbias.cpp
new file mode 100644
index 000000000..bf9183e86
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_infer_bp16_masktype_2_with_attnbias.cpp
@@ -0,0 +1,9 @@
+#include <ck/ck.hpp>
+#include <stdexcept>
+
+#include "ck_fmha_grouped_infer.h"
+
+template struct grouped_infer_masktype_attnbias_dispatched<
+    ck::bhalf_t,
+    2,
+    true>;
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_infer_fp16_masktype_0.cpp b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_infer_fp16_masktype_0_no_attnbias.cpp
similarity index 67%
rename from xformers/csrc/attention/hip_fmha/ck_fmha_grouped_infer_fp16_masktype_0.cpp
rename to xformers/csrc/attention/hip_fmha/ck_fmha_grouped_infer_fp16_masktype_0_no_attnbias.cpp
index ded6fe928..39b4a9adf 100644
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_infer_fp16_masktype_0.cpp
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_infer_fp16_masktype_0_no_attnbias.cpp
@@ -3,8 +3,6 @@
 
 #include "ck_fmha_grouped_infer.h"
 
-template struct grouped_infer_masktype_attnbias_dispatched<ck::half_t, 0, true>;
-
 template struct grouped_infer_masktype_attnbias_dispatched<
     ck::half_t,
     0,
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_infer_fp16_masktype_0_with_attnbias.cpp b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_infer_fp16_masktype_0_with_attnbias.cpp
new file mode 100644
index 000000000..7bda05420
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_infer_fp16_masktype_0_with_attnbias.cpp
@@ -0,0 +1,6 @@
+#include <ck/ck.hpp>
+#include <stdexcept>
+
+#include "ck_fmha_grouped_infer.h"
+
+template struct grouped_infer_masktype_attnbias_dispatched<ck::half_t, 0, true>;
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_infer_fp16_masktype_1.cpp b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_infer_fp16_masktype_1_no_attnbias.cpp
similarity index 67%
rename from xformers/csrc/attention/hip_fmha/ck_fmha_grouped_infer_fp16_masktype_1.cpp
rename to xformers/csrc/attention/hip_fmha/ck_fmha_grouped_infer_fp16_masktype_1_no_attnbias.cpp
index 7eb372128..34c2c97c0 100644
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_infer_fp16_masktype_1.cpp
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_infer_fp16_masktype_1_no_attnbias.cpp
@@ -3,8 +3,6 @@
 
 #include "ck_fmha_grouped_infer.h"
 
-template struct grouped_infer_masktype_attnbias_dispatched<ck::half_t, 1, true>;
-
 template struct grouped_infer_masktype_attnbias_dispatched<
     ck::half_t,
     1,
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_infer_fp16_masktype_1_with_attnbias.cpp b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_infer_fp16_masktype_1_with_attnbias.cpp
new file mode 100644
index 000000000..66c2d5724
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_infer_fp16_masktype_1_with_attnbias.cpp
@@ -0,0 +1,6 @@
+#include <ck/ck.hpp>
+#include <stdexcept>
+
+#include "ck_fmha_grouped_infer.h"
+
+template struct grouped_infer_masktype_attnbias_dispatched<ck::half_t, 1, true>;
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_infer_fp16_masktype_2.cpp b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_infer_fp16_masktype_2_no_attnbias.cpp
similarity index 67%
rename from xformers/csrc/attention/hip_fmha/ck_fmha_grouped_infer_fp16_masktype_2.cpp
rename to xformers/csrc/attention/hip_fmha/ck_fmha_grouped_infer_fp16_masktype_2_no_attnbias.cpp
index 95281e7ba..ab0d8176d 100644
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_infer_fp16_masktype_2.cpp
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_infer_fp16_masktype_2_no_attnbias.cpp
@@ -3,8 +3,6 @@
 
 #include "ck_fmha_grouped_infer.h"
 
-template struct grouped_infer_masktype_attnbias_dispatched<ck::half_t, 2, true>;
-
 template struct grouped_infer_masktype_attnbias_dispatched<
     ck::half_t,
     2,
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_infer_fp16_masktype_2_with_attnbias.cpp b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_infer_fp16_masktype_2_with_attnbias.cpp
new file mode 100644
index 000000000..8bcb37f74
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_infer_fp16_masktype_2_with_attnbias.cpp
@@ -0,0 +1,6 @@
+#include <ck/ck.hpp>
+#include <stdexcept>
+
+#include "ck_fmha_grouped_infer.h"
+
+template struct grouped_infer_masktype_attnbias_dispatched<ck::half_t, 2, true>;

From 13720780b9ce07a4a0a6beafd3915a0743d2fcef Mon Sep 17 00:00:00 2001
From: Qianfeng Zhang <Qianfeng.Zhang@amd.com>
Date: Tue, 24 Oct 2023 20:12:37 +0000
Subject: [PATCH 112/641] Relax the scope for
 kB1BlockTransferSrcScalarPerVector

---
 xformers/csrc/attention/hip_fmha/ck_fmha_batched_infer.h | 6 +++---
 xformers/csrc/attention/hip_fmha/ck_fmha_grouped_infer.h | 6 +++---
 2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_infer.h b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_infer.h
index 23e6000cc..74cc0e8bf 100644
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_infer.h
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_infer.h
@@ -159,7 +159,7 @@ struct batched_infer_masktype_attnbias_dispatched {
           GemmOpConstantsBatchedInfer::
               B1BlockTransferThreadClusterLengths_BK0_N_BK1::At(I1);
       constexpr ck::index_t kB1BlockTransferSrcScalarPerVector_max =
-          min(2, thread_slice_length_gemm1n);
+          min(4, thread_slice_length_gemm1n);
 
       constexpr ck::index_t thread_slice_length_cshuflle_n =
           (kCShuffleNXdlPerWavePerShuffle * kGemm1NPerBlock /
@@ -217,7 +217,7 @@ struct batched_infer_masktype_attnbias_dispatched {
           GemmOpConstantsBatchedInfer::
               B1BlockTransferThreadClusterLengths_BK0_N_BK1::At(I1);
       constexpr ck::index_t kB1BlockTransferSrcScalarPerVector_max =
-          min(2, thread_slice_length_gemm1n);
+          min(4, thread_slice_length_gemm1n);
 
       constexpr ck::index_t thread_slice_length_cshuflle_n =
           (kCShuffleNXdlPerWavePerShuffle * kGemm1NPerBlock /
@@ -275,7 +275,7 @@ struct batched_infer_masktype_attnbias_dispatched {
           GemmOpConstantsBatchedInfer::
               B1BlockTransferThreadClusterLengths_BK0_N_BK1::At(I1);
       constexpr ck::index_t kB1BlockTransferSrcScalarPerVector_max =
-          min(2, thread_slice_length_gemm1n);
+          min(4, thread_slice_length_gemm1n);
 
       constexpr ck::index_t thread_slice_length_cshuflle_n =
           (kCShuffleNXdlPerWavePerShuffle * kGemm1NPerBlock /
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_infer.h b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_infer.h
index f24ed6c7c..731ad7f78 100644
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_infer.h
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_infer.h
@@ -160,7 +160,7 @@ struct grouped_infer_masktype_attnbias_dispatched {
           GemmOpConstantsGroupedInfer::
               B1BlockTransferThreadClusterLengths_BK0_N_BK1::At(I1);
       constexpr ck::index_t kB1BlockTransferSrcScalarPerVector_max =
-          min(2, thread_slice_length_gemm1n);
+          min(4, thread_slice_length_gemm1n);
 
       constexpr ck::index_t thread_slice_length_cshuflle_n =
           (kCShuffleNXdlPerWavePerShuffle * kGemm1NPerBlock /
@@ -218,7 +218,7 @@ struct grouped_infer_masktype_attnbias_dispatched {
           GemmOpConstantsGroupedInfer::
               B1BlockTransferThreadClusterLengths_BK0_N_BK1::At(I1);
       constexpr ck::index_t kB1BlockTransferSrcScalarPerVector_max =
-          min(2, thread_slice_length_gemm1n);
+          min(4, thread_slice_length_gemm1n);
 
       constexpr ck::index_t thread_slice_length_cshuflle_n =
           (kCShuffleNXdlPerWavePerShuffle * kGemm1NPerBlock /
@@ -276,7 +276,7 @@ struct grouped_infer_masktype_attnbias_dispatched {
           GemmOpConstantsGroupedInfer::
               B1BlockTransferThreadClusterLengths_BK0_N_BK1::At(I1);
       constexpr ck::index_t kB1BlockTransferSrcScalarPerVector_max =
-          min(2, thread_slice_length_gemm1n);
+          min(4, thread_slice_length_gemm1n);
 
       constexpr ck::index_t thread_slice_length_cshuflle_n =
           (kCShuffleNXdlPerWavePerShuffle * kGemm1NPerBlock /

From 43db51689e992c022d54aa78e789995329e81758 Mon Sep 17 00:00:00 2001
From: Qianfeng Zhang <Qianfeng.Zhang@amd.com>
Date: Tue, 24 Oct 2023 21:58:01 +0000
Subject: [PATCH 113/641] Relax the scope for
 kCShuffleBlockTransferScalarPerVector

---
 xformers/csrc/attention/hip_fmha/ck_fmha_batched_infer.h | 6 +++---
 xformers/csrc/attention/hip_fmha/ck_fmha_grouped_infer.h | 6 +++---
 2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_infer.h b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_infer.h
index 74cc0e8bf..7794b5ee0 100644
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_infer.h
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_infer.h
@@ -169,7 +169,7 @@ struct batched_infer_masktype_attnbias_dispatched {
                   At(I3);
 
       constexpr ck::index_t kCShuffleBlockTransferScalarPerVector_max =
-          min(1, thread_slice_length_cshuflle_n);
+          min(2, thread_slice_length_cshuflle_n);
 
       ALIGN_SWITCH_3(
           kABBlockTransferSrcScalarPerVector_max,
@@ -227,7 +227,7 @@ struct batched_infer_masktype_attnbias_dispatched {
                   At(I3);
 
       constexpr ck::index_t kCShuffleBlockTransferScalarPerVector_max =
-          min(1, thread_slice_length_cshuflle_n);
+          min(2, thread_slice_length_cshuflle_n);
 
       ALIGN_SWITCH_3(
           kABBlockTransferSrcScalarPerVector_max,
@@ -285,7 +285,7 @@ struct batched_infer_masktype_attnbias_dispatched {
                   At(I3);
 
       constexpr ck::index_t kCShuffleBlockTransferScalarPerVector_max =
-          min(1, thread_slice_length_cshuflle_n);
+          min(2, thread_slice_length_cshuflle_n);
 
       ALIGN_SWITCH_3(
           kABBlockTransferSrcScalarPerVector_max,
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_infer.h b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_infer.h
index 731ad7f78..579841b57 100644
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_infer.h
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_infer.h
@@ -170,7 +170,7 @@ struct grouped_infer_masktype_attnbias_dispatched {
                   At(I3);
 
       constexpr ck::index_t kCShuffleBlockTransferScalarPerVector_max =
-          min(1, thread_slice_length_cshuflle_n);
+          min(2, thread_slice_length_cshuflle_n);
 
       ALIGN_SWITCH_3(
           kABBlockTransferSrcScalarPerVector_max,
@@ -228,7 +228,7 @@ struct grouped_infer_masktype_attnbias_dispatched {
                   At(I3);
 
       constexpr ck::index_t kCShuffleBlockTransferScalarPerVector_max =
-          min(1, thread_slice_length_cshuflle_n);
+          min(2, thread_slice_length_cshuflle_n);
 
       ALIGN_SWITCH_3(
           kABBlockTransferSrcScalarPerVector_max,
@@ -286,7 +286,7 @@ struct grouped_infer_masktype_attnbias_dispatched {
                   At(I3);
 
       constexpr ck::index_t kCShuffleBlockTransferScalarPerVector_max =
-          min(1, thread_slice_length_cshuflle_n);
+          min(2, thread_slice_length_cshuflle_n);
 
       ALIGN_SWITCH_3(
           kABBlockTransferSrcScalarPerVector_max,

From d37bc3046345d9a02cafba6453d2e02fe3aef8bc Mon Sep 17 00:00:00 2001
From: Qianfeng Zhang <Qianfeng.Zhang@amd.com>
Date: Wed, 25 Oct 2023 23:26:24 +0000
Subject: [PATCH 114/641] Split the .cpp files for forward to speed-up the
 compiling

---
 ...k_fmha_batched_forward_bp16_masktype_0_no_attnbias.cpp} | 7 -------
 ..._fmha_batched_forward_bp16_masktype_0_with_attnbias.cpp | 7 +++++++
 ...k_fmha_batched_forward_bp16_masktype_1_no_attnbias.cpp} | 7 -------
 ..._fmha_batched_forward_bp16_masktype_1_with_attnbias.cpp | 7 +++++++
 ...k_fmha_batched_forward_bp16_masktype_2_no_attnbias.cpp} | 7 -------
 ..._fmha_batched_forward_bp16_masktype_2_with_attnbias.cpp | 7 +++++++
 ...k_fmha_batched_forward_fp16_masktype_0_no_attnbias.cpp} | 7 -------
 ..._fmha_batched_forward_fp16_masktype_0_with_attnbias.cpp | 7 +++++++
 ...k_fmha_batched_forward_fp16_masktype_1_no_attnbias.cpp} | 7 -------
 ..._fmha_batched_forward_fp16_masktype_1_with_attnbias.cpp | 7 +++++++
 ...k_fmha_batched_forward_fp16_masktype_2_no_attnbias.cpp} | 7 -------
 ..._fmha_batched_forward_fp16_masktype_2_with_attnbias.cpp | 7 +++++++
 ...k_fmha_grouped_forward_bp16_masktype_0_no_attnbias.cpp} | 7 -------
 ..._fmha_grouped_forward_bp16_masktype_0_with_attnbias.cpp | 7 +++++++
 ...k_fmha_grouped_forward_bp16_masktype_1_no_attnbias.cpp} | 7 -------
 ..._fmha_grouped_forward_bp16_masktype_1_with_attnbias.cpp | 7 +++++++
 ...k_fmha_grouped_forward_bp16_masktype_2_no_attnbias.cpp} | 7 -------
 ..._fmha_grouped_forward_bp16_masktype_2_with_attnbias.cpp | 7 +++++++
 ...k_fmha_grouped_forward_fp16_masktype_0_no_attnbias.cpp} | 7 -------
 ..._fmha_grouped_forward_fp16_masktype_0_with_attnbias.cpp | 7 +++++++
 ...k_fmha_grouped_forward_fp16_masktype_1_no_attnbias.cpp} | 7 -------
 ..._fmha_grouped_forward_fp16_masktype_1_with_attnbias.cpp | 7 +++++++
 ...k_fmha_grouped_forward_fp16_masktype_2_no_attnbias.cpp} | 7 -------
 ..._fmha_grouped_forward_fp16_masktype_2_with_attnbias.cpp | 7 +++++++
 24 files changed, 84 insertions(+), 84 deletions(-)
 rename xformers/csrc/attention/hip_fmha/{ck_fmha_batched_forward_bp16_masktype_0.cpp => ck_fmha_batched_forward_bp16_masktype_0_no_attnbias.cpp} (56%)
 create mode 100644 xformers/csrc/attention/hip_fmha/ck_fmha_batched_forward_bp16_masktype_0_with_attnbias.cpp
 rename xformers/csrc/attention/hip_fmha/{ck_fmha_batched_forward_bp16_masktype_1.cpp => ck_fmha_batched_forward_bp16_masktype_1_no_attnbias.cpp} (56%)
 create mode 100644 xformers/csrc/attention/hip_fmha/ck_fmha_batched_forward_bp16_masktype_1_with_attnbias.cpp
 rename xformers/csrc/attention/hip_fmha/{ck_fmha_batched_forward_bp16_masktype_2.cpp => ck_fmha_batched_forward_bp16_masktype_2_no_attnbias.cpp} (56%)
 create mode 100644 xformers/csrc/attention/hip_fmha/ck_fmha_batched_forward_bp16_masktype_2_with_attnbias.cpp
 rename xformers/csrc/attention/hip_fmha/{ck_fmha_batched_forward_fp16_masktype_0.cpp => ck_fmha_batched_forward_fp16_masktype_0_no_attnbias.cpp} (56%)
 create mode 100644 xformers/csrc/attention/hip_fmha/ck_fmha_batched_forward_fp16_masktype_0_with_attnbias.cpp
 rename xformers/csrc/attention/hip_fmha/{ck_fmha_batched_forward_fp16_masktype_1.cpp => ck_fmha_batched_forward_fp16_masktype_1_no_attnbias.cpp} (56%)
 create mode 100644 xformers/csrc/attention/hip_fmha/ck_fmha_batched_forward_fp16_masktype_1_with_attnbias.cpp
 rename xformers/csrc/attention/hip_fmha/{ck_fmha_batched_forward_fp16_masktype_2.cpp => ck_fmha_batched_forward_fp16_masktype_2_no_attnbias.cpp} (56%)
 create mode 100644 xformers/csrc/attention/hip_fmha/ck_fmha_batched_forward_fp16_masktype_2_with_attnbias.cpp
 rename xformers/csrc/attention/hip_fmha/{ck_fmha_grouped_forward_bp16_masktype_0.cpp => ck_fmha_grouped_forward_bp16_masktype_0_no_attnbias.cpp} (56%)
 create mode 100644 xformers/csrc/attention/hip_fmha/ck_fmha_grouped_forward_bp16_masktype_0_with_attnbias.cpp
 rename xformers/csrc/attention/hip_fmha/{ck_fmha_grouped_forward_bp16_masktype_1.cpp => ck_fmha_grouped_forward_bp16_masktype_1_no_attnbias.cpp} (56%)
 create mode 100644 xformers/csrc/attention/hip_fmha/ck_fmha_grouped_forward_bp16_masktype_1_with_attnbias.cpp
 rename xformers/csrc/attention/hip_fmha/{ck_fmha_grouped_forward_bp16_masktype_2.cpp => ck_fmha_grouped_forward_bp16_masktype_2_no_attnbias.cpp} (56%)
 create mode 100644 xformers/csrc/attention/hip_fmha/ck_fmha_grouped_forward_bp16_masktype_2_with_attnbias.cpp
 rename xformers/csrc/attention/hip_fmha/{ck_fmha_grouped_forward_fp16_masktype_0.cpp => ck_fmha_grouped_forward_fp16_masktype_0_no_attnbias.cpp} (56%)
 create mode 100644 xformers/csrc/attention/hip_fmha/ck_fmha_grouped_forward_fp16_masktype_0_with_attnbias.cpp
 rename xformers/csrc/attention/hip_fmha/{ck_fmha_grouped_forward_fp16_masktype_1.cpp => ck_fmha_grouped_forward_fp16_masktype_1_no_attnbias.cpp} (56%)
 create mode 100644 xformers/csrc/attention/hip_fmha/ck_fmha_grouped_forward_fp16_masktype_1_with_attnbias.cpp
 rename xformers/csrc/attention/hip_fmha/{ck_fmha_grouped_forward_fp16_masktype_2.cpp => ck_fmha_grouped_forward_fp16_masktype_2_no_attnbias.cpp} (56%)
 create mode 100644 xformers/csrc/attention/hip_fmha/ck_fmha_grouped_forward_fp16_masktype_2_with_attnbias.cpp

diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_forward_bp16_masktype_0.cpp b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_forward_bp16_masktype_0_no_attnbias.cpp
similarity index 56%
rename from xformers/csrc/attention/hip_fmha/ck_fmha_batched_forward_bp16_masktype_0.cpp
rename to xformers/csrc/attention/hip_fmha/ck_fmha_batched_forward_bp16_masktype_0_no_attnbias.cpp
index 3813bfbe2..be1d4f58d 100644
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_forward_bp16_masktype_0.cpp
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_forward_bp16_masktype_0_no_attnbias.cpp
@@ -1,13 +1,6 @@
 #include <ck/ck.hpp>
-#include <stdexcept>
-
 #include "ck_fmha_batched_forward.h"
 
-template struct batched_forward_masktype_attnbias_dispatched<
-    ck::bhalf_t,
-    0,
-    true>;
-
 template struct batched_forward_masktype_attnbias_dispatched<
     ck::bhalf_t,
     0,
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_forward_bp16_masktype_0_with_attnbias.cpp b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_forward_bp16_masktype_0_with_attnbias.cpp
new file mode 100644
index 000000000..54091ff9b
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_forward_bp16_masktype_0_with_attnbias.cpp
@@ -0,0 +1,7 @@
+#include <ck/ck.hpp>
+#include "ck_fmha_batched_forward.h"
+
+template struct batched_forward_masktype_attnbias_dispatched<
+    ck::bhalf_t,
+    0,
+    true>;
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_forward_bp16_masktype_1.cpp b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_forward_bp16_masktype_1_no_attnbias.cpp
similarity index 56%
rename from xformers/csrc/attention/hip_fmha/ck_fmha_batched_forward_bp16_masktype_1.cpp
rename to xformers/csrc/attention/hip_fmha/ck_fmha_batched_forward_bp16_masktype_1_no_attnbias.cpp
index 7ea33a2a9..8f2778fd6 100644
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_forward_bp16_masktype_1.cpp
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_forward_bp16_masktype_1_no_attnbias.cpp
@@ -1,13 +1,6 @@
 #include <ck/ck.hpp>
-#include <stdexcept>
-
 #include "ck_fmha_batched_forward.h"
 
-template struct batched_forward_masktype_attnbias_dispatched<
-    ck::bhalf_t,
-    1,
-    true>;
-
 template struct batched_forward_masktype_attnbias_dispatched<
     ck::bhalf_t,
     1,
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_forward_bp16_masktype_1_with_attnbias.cpp b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_forward_bp16_masktype_1_with_attnbias.cpp
new file mode 100644
index 000000000..da35f17b9
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_forward_bp16_masktype_1_with_attnbias.cpp
@@ -0,0 +1,7 @@
+#include <ck/ck.hpp>
+#include "ck_fmha_batched_forward.h"
+
+template struct batched_forward_masktype_attnbias_dispatched<
+    ck::bhalf_t,
+    1,
+    true>;
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_forward_bp16_masktype_2.cpp b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_forward_bp16_masktype_2_no_attnbias.cpp
similarity index 56%
rename from xformers/csrc/attention/hip_fmha/ck_fmha_batched_forward_bp16_masktype_2.cpp
rename to xformers/csrc/attention/hip_fmha/ck_fmha_batched_forward_bp16_masktype_2_no_attnbias.cpp
index 732704f62..f775af0d6 100644
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_forward_bp16_masktype_2.cpp
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_forward_bp16_masktype_2_no_attnbias.cpp
@@ -1,13 +1,6 @@
 #include <ck/ck.hpp>
-#include <stdexcept>
-
 #include "ck_fmha_batched_forward.h"
 
-template struct batched_forward_masktype_attnbias_dispatched<
-    ck::bhalf_t,
-    2,
-    true>;
-
 template struct batched_forward_masktype_attnbias_dispatched<
     ck::bhalf_t,
     2,
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_forward_bp16_masktype_2_with_attnbias.cpp b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_forward_bp16_masktype_2_with_attnbias.cpp
new file mode 100644
index 000000000..ad9950d93
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_forward_bp16_masktype_2_with_attnbias.cpp
@@ -0,0 +1,7 @@
+#include <ck/ck.hpp>
+#include "ck_fmha_batched_forward.h"
+
+template struct batched_forward_masktype_attnbias_dispatched<
+    ck::bhalf_t,
+    2,
+    true>;
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_forward_fp16_masktype_0.cpp b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_forward_fp16_masktype_0_no_attnbias.cpp
similarity index 56%
rename from xformers/csrc/attention/hip_fmha/ck_fmha_batched_forward_fp16_masktype_0.cpp
rename to xformers/csrc/attention/hip_fmha/ck_fmha_batched_forward_fp16_masktype_0_no_attnbias.cpp
index a9fbc47d7..8af5e20f8 100644
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_forward_fp16_masktype_0.cpp
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_forward_fp16_masktype_0_no_attnbias.cpp
@@ -1,13 +1,6 @@
 #include <ck/ck.hpp>
-#include <stdexcept>
-
 #include "ck_fmha_batched_forward.h"
 
-template struct batched_forward_masktype_attnbias_dispatched<
-    ck::half_t,
-    0,
-    true>;
-
 template struct batched_forward_masktype_attnbias_dispatched<
     ck::half_t,
     0,
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_forward_fp16_masktype_0_with_attnbias.cpp b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_forward_fp16_masktype_0_with_attnbias.cpp
new file mode 100644
index 000000000..22568941d
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_forward_fp16_masktype_0_with_attnbias.cpp
@@ -0,0 +1,7 @@
+#include <ck/ck.hpp>
+#include "ck_fmha_batched_forward.h"
+
+template struct batched_forward_masktype_attnbias_dispatched<
+    ck::half_t,
+    0,
+    true>;
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_forward_fp16_masktype_1.cpp b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_forward_fp16_masktype_1_no_attnbias.cpp
similarity index 56%
rename from xformers/csrc/attention/hip_fmha/ck_fmha_batched_forward_fp16_masktype_1.cpp
rename to xformers/csrc/attention/hip_fmha/ck_fmha_batched_forward_fp16_masktype_1_no_attnbias.cpp
index 7712f091f..466dcc9a3 100644
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_forward_fp16_masktype_1.cpp
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_forward_fp16_masktype_1_no_attnbias.cpp
@@ -1,13 +1,6 @@
 #include <ck/ck.hpp>
-#include <stdexcept>
-
 #include "ck_fmha_batched_forward.h"
 
-template struct batched_forward_masktype_attnbias_dispatched<
-    ck::half_t,
-    1,
-    true>;
-
 template struct batched_forward_masktype_attnbias_dispatched<
     ck::half_t,
     1,
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_forward_fp16_masktype_1_with_attnbias.cpp b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_forward_fp16_masktype_1_with_attnbias.cpp
new file mode 100644
index 000000000..7346ec804
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_forward_fp16_masktype_1_with_attnbias.cpp
@@ -0,0 +1,7 @@
+#include <ck/ck.hpp>
+#include "ck_fmha_batched_forward.h"
+
+template struct batched_forward_masktype_attnbias_dispatched<
+    ck::half_t,
+    1,
+    true>;
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_forward_fp16_masktype_2.cpp b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_forward_fp16_masktype_2_no_attnbias.cpp
similarity index 56%
rename from xformers/csrc/attention/hip_fmha/ck_fmha_batched_forward_fp16_masktype_2.cpp
rename to xformers/csrc/attention/hip_fmha/ck_fmha_batched_forward_fp16_masktype_2_no_attnbias.cpp
index 45874124e..c7f68924b 100644
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_forward_fp16_masktype_2.cpp
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_forward_fp16_masktype_2_no_attnbias.cpp
@@ -1,13 +1,6 @@
 #include <ck/ck.hpp>
-#include <stdexcept>
-
 #include "ck_fmha_batched_forward.h"
 
-template struct batched_forward_masktype_attnbias_dispatched<
-    ck::half_t,
-    2,
-    true>;
-
 template struct batched_forward_masktype_attnbias_dispatched<
     ck::half_t,
     2,
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_forward_fp16_masktype_2_with_attnbias.cpp b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_forward_fp16_masktype_2_with_attnbias.cpp
new file mode 100644
index 000000000..d7a5106f8
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_forward_fp16_masktype_2_with_attnbias.cpp
@@ -0,0 +1,7 @@
+#include <ck/ck.hpp>
+#include "ck_fmha_batched_forward.h"
+
+template struct batched_forward_masktype_attnbias_dispatched<
+    ck::half_t,
+    2,
+    true>;
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_forward_bp16_masktype_0.cpp b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_forward_bp16_masktype_0_no_attnbias.cpp
similarity index 56%
rename from xformers/csrc/attention/hip_fmha/ck_fmha_grouped_forward_bp16_masktype_0.cpp
rename to xformers/csrc/attention/hip_fmha/ck_fmha_grouped_forward_bp16_masktype_0_no_attnbias.cpp
index 55629443b..8083cb25c 100644
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_forward_bp16_masktype_0.cpp
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_forward_bp16_masktype_0_no_attnbias.cpp
@@ -1,13 +1,6 @@
 #include <ck/ck.hpp>
-#include <stdexcept>
-
 #include "ck_fmha_grouped_forward.h"
 
-template struct grouped_forward_masktype_attnbias_dispatched<
-    ck::bhalf_t,
-    0,
-    true>;
-
 template struct grouped_forward_masktype_attnbias_dispatched<
     ck::bhalf_t,
     0,
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_forward_bp16_masktype_0_with_attnbias.cpp b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_forward_bp16_masktype_0_with_attnbias.cpp
new file mode 100644
index 000000000..a0d3681f1
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_forward_bp16_masktype_0_with_attnbias.cpp
@@ -0,0 +1,7 @@
+#include <ck/ck.hpp>
+#include "ck_fmha_grouped_forward.h"
+
+template struct grouped_forward_masktype_attnbias_dispatched<
+    ck::bhalf_t,
+    0,
+    true>;
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_forward_bp16_masktype_1.cpp b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_forward_bp16_masktype_1_no_attnbias.cpp
similarity index 56%
rename from xformers/csrc/attention/hip_fmha/ck_fmha_grouped_forward_bp16_masktype_1.cpp
rename to xformers/csrc/attention/hip_fmha/ck_fmha_grouped_forward_bp16_masktype_1_no_attnbias.cpp
index c1ed66880..f877be39f 100644
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_forward_bp16_masktype_1.cpp
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_forward_bp16_masktype_1_no_attnbias.cpp
@@ -1,13 +1,6 @@
 #include <ck/ck.hpp>
-#include <stdexcept>
-
 #include "ck_fmha_grouped_forward.h"
 
-template struct grouped_forward_masktype_attnbias_dispatched<
-    ck::bhalf_t,
-    1,
-    true>;
-
 template struct grouped_forward_masktype_attnbias_dispatched<
     ck::bhalf_t,
     1,
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_forward_bp16_masktype_1_with_attnbias.cpp b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_forward_bp16_masktype_1_with_attnbias.cpp
new file mode 100644
index 000000000..aca8091ab
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_forward_bp16_masktype_1_with_attnbias.cpp
@@ -0,0 +1,7 @@
+#include <ck/ck.hpp>
+#include "ck_fmha_grouped_forward.h"
+
+template struct grouped_forward_masktype_attnbias_dispatched<
+    ck::bhalf_t,
+    1,
+    true>;
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_forward_bp16_masktype_2.cpp b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_forward_bp16_masktype_2_no_attnbias.cpp
similarity index 56%
rename from xformers/csrc/attention/hip_fmha/ck_fmha_grouped_forward_bp16_masktype_2.cpp
rename to xformers/csrc/attention/hip_fmha/ck_fmha_grouped_forward_bp16_masktype_2_no_attnbias.cpp
index e41a76278..f9ade6d61 100644
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_forward_bp16_masktype_2.cpp
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_forward_bp16_masktype_2_no_attnbias.cpp
@@ -1,13 +1,6 @@
 #include <ck/ck.hpp>
-#include <stdexcept>
-
 #include "ck_fmha_grouped_forward.h"
 
-template struct grouped_forward_masktype_attnbias_dispatched<
-    ck::bhalf_t,
-    2,
-    true>;
-
 template struct grouped_forward_masktype_attnbias_dispatched<
     ck::bhalf_t,
     2,
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_forward_bp16_masktype_2_with_attnbias.cpp b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_forward_bp16_masktype_2_with_attnbias.cpp
new file mode 100644
index 000000000..0014a5e69
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_forward_bp16_masktype_2_with_attnbias.cpp
@@ -0,0 +1,7 @@
+#include <ck/ck.hpp>
+#include "ck_fmha_grouped_forward.h"
+
+template struct grouped_forward_masktype_attnbias_dispatched<
+    ck::bhalf_t,
+    2,
+    true>;
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_forward_fp16_masktype_0.cpp b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_forward_fp16_masktype_0_no_attnbias.cpp
similarity index 56%
rename from xformers/csrc/attention/hip_fmha/ck_fmha_grouped_forward_fp16_masktype_0.cpp
rename to xformers/csrc/attention/hip_fmha/ck_fmha_grouped_forward_fp16_masktype_0_no_attnbias.cpp
index 3a2c45e6f..3d62db850 100644
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_forward_fp16_masktype_0.cpp
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_forward_fp16_masktype_0_no_attnbias.cpp
@@ -1,13 +1,6 @@
 #include <ck/ck.hpp>
-#include <stdexcept>
-
 #include "ck_fmha_grouped_forward.h"
 
-template struct grouped_forward_masktype_attnbias_dispatched<
-    ck::half_t,
-    0,
-    true>;
-
 template struct grouped_forward_masktype_attnbias_dispatched<
     ck::half_t,
     0,
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_forward_fp16_masktype_0_with_attnbias.cpp b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_forward_fp16_masktype_0_with_attnbias.cpp
new file mode 100644
index 000000000..1b80b483c
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_forward_fp16_masktype_0_with_attnbias.cpp
@@ -0,0 +1,7 @@
+#include <ck/ck.hpp>
+#include "ck_fmha_grouped_forward.h"
+
+template struct grouped_forward_masktype_attnbias_dispatched<
+    ck::half_t,
+    0,
+    true>;
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_forward_fp16_masktype_1.cpp b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_forward_fp16_masktype_1_no_attnbias.cpp
similarity index 56%
rename from xformers/csrc/attention/hip_fmha/ck_fmha_grouped_forward_fp16_masktype_1.cpp
rename to xformers/csrc/attention/hip_fmha/ck_fmha_grouped_forward_fp16_masktype_1_no_attnbias.cpp
index 83b62defc..26d5bccd1 100644
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_forward_fp16_masktype_1.cpp
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_forward_fp16_masktype_1_no_attnbias.cpp
@@ -1,13 +1,6 @@
 #include <ck/ck.hpp>
-#include <stdexcept>
-
 #include "ck_fmha_grouped_forward.h"
 
-template struct grouped_forward_masktype_attnbias_dispatched<
-    ck::half_t,
-    1,
-    true>;
-
 template struct grouped_forward_masktype_attnbias_dispatched<
     ck::half_t,
     1,
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_forward_fp16_masktype_1_with_attnbias.cpp b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_forward_fp16_masktype_1_with_attnbias.cpp
new file mode 100644
index 000000000..3eae0ae71
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_forward_fp16_masktype_1_with_attnbias.cpp
@@ -0,0 +1,7 @@
+#include <ck/ck.hpp>
+#include "ck_fmha_grouped_forward.h"
+
+template struct grouped_forward_masktype_attnbias_dispatched<
+    ck::half_t,
+    1,
+    true>;
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_forward_fp16_masktype_2.cpp b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_forward_fp16_masktype_2_no_attnbias.cpp
similarity index 56%
rename from xformers/csrc/attention/hip_fmha/ck_fmha_grouped_forward_fp16_masktype_2.cpp
rename to xformers/csrc/attention/hip_fmha/ck_fmha_grouped_forward_fp16_masktype_2_no_attnbias.cpp
index 7ef8f40a2..9bba3eeca 100644
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_forward_fp16_masktype_2.cpp
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_forward_fp16_masktype_2_no_attnbias.cpp
@@ -1,13 +1,6 @@
 #include <ck/ck.hpp>
-#include <stdexcept>
-
 #include "ck_fmha_grouped_forward.h"
 
-template struct grouped_forward_masktype_attnbias_dispatched<
-    ck::half_t,
-    2,
-    true>;
-
 template struct grouped_forward_masktype_attnbias_dispatched<
     ck::half_t,
     2,
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_forward_fp16_masktype_2_with_attnbias.cpp b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_forward_fp16_masktype_2_with_attnbias.cpp
new file mode 100644
index 000000000..2d5152e87
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_forward_fp16_masktype_2_with_attnbias.cpp
@@ -0,0 +1,7 @@
+#include <ck/ck.hpp>
+#include "ck_fmha_grouped_forward.h"
+
+template struct grouped_forward_masktype_attnbias_dispatched<
+    ck::half_t,
+    2,
+    true>;

From 3aeda8ea363a303ceefa8329d49645a5702daddc Mon Sep 17 00:00:00 2001
From: Qianfeng Zhang <Qianfeng.Zhang@amd.com>
Date: Wed, 25 Oct 2023 23:59:59 +0000
Subject: [PATCH 115/641] Split the .cpp files for backward to speed-up the
 compiling

---
 ...tched_backward_bp16_masktype_0_no_attnbias.cpp} | 14 --------------
 ...ched_backward_bp16_masktype_0_with_attnbias.cpp | 14 ++++++++++++++
 ...tched_backward_bp16_masktype_1_no_attnbias.cpp} | 14 --------------
 ...ched_backward_bp16_masktype_1_with_attnbias.cpp | 14 ++++++++++++++
 ...tched_backward_bp16_masktype_2_no_attnbias.cpp} | 14 --------------
 ...ched_backward_bp16_masktype_2_with_attnbias.cpp | 14 ++++++++++++++
 ...tched_backward_fp16_masktype_0_no_attnbias.cpp} | 14 --------------
 ...ched_backward_fp16_masktype_0_with_attnbias.cpp | 14 ++++++++++++++
 ...atched_backward_fp16_masktype_1_no_attnbias.cpp | 14 ++++++++++++++
 ...hed_backward_fp16_masktype_1_with_attnbias.cpp} | 12 ------------
 ...tched_backward_fp16_masktype_2_no_attnbias.cpp} | 14 --------------
 ...ched_backward_fp16_masktype_2_with_attnbias.cpp | 14 ++++++++++++++
 ...ouped_backward_bp16_masktype_0_no_attnbias.cpp} | 14 --------------
 ...uped_backward_bp16_masktype_0_with_attnbias.cpp | 14 ++++++++++++++
 ...ouped_backward_bp16_masktype_1_no_attnbias.cpp} | 14 --------------
 ...uped_backward_bp16_masktype_1_with_attnbias.cpp | 14 ++++++++++++++
 ...ouped_backward_bp16_masktype_2_no_attnbias.cpp} | 14 --------------
 ...uped_backward_bp16_masktype_2_with_attnbias.cpp | 14 ++++++++++++++
 ...ouped_backward_fp16_masktype_0_no_attnbias.cpp} | 14 --------------
 ...uped_backward_fp16_masktype_0_with_attnbias.cpp | 14 ++++++++++++++
 ...ouped_backward_fp16_masktype_1_no_attnbias.cpp} | 14 --------------
 ...uped_backward_fp16_masktype_1_with_attnbias.cpp | 14 ++++++++++++++
 ...ouped_backward_fp16_masktype_2_no_attnbias.cpp} | 14 --------------
 ...uped_backward_fp16_masktype_2_with_attnbias.cpp | 14 ++++++++++++++
 24 files changed, 168 insertions(+), 166 deletions(-)
 rename xformers/csrc/attention/hip_fmha/{ck_fmha_batched_backward_bp16_masktype_0.cpp => ck_fmha_batched_backward_bp16_masktype_0_no_attnbias.cpp} (53%)
 create mode 100644 xformers/csrc/attention/hip_fmha/ck_fmha_batched_backward_bp16_masktype_0_with_attnbias.cpp
 rename xformers/csrc/attention/hip_fmha/{ck_fmha_batched_backward_bp16_masktype_1.cpp => ck_fmha_batched_backward_bp16_masktype_1_no_attnbias.cpp} (53%)
 create mode 100644 xformers/csrc/attention/hip_fmha/ck_fmha_batched_backward_bp16_masktype_1_with_attnbias.cpp
 rename xformers/csrc/attention/hip_fmha/{ck_fmha_batched_backward_bp16_masktype_2.cpp => ck_fmha_batched_backward_bp16_masktype_2_no_attnbias.cpp} (53%)
 create mode 100644 xformers/csrc/attention/hip_fmha/ck_fmha_batched_backward_bp16_masktype_2_with_attnbias.cpp
 rename xformers/csrc/attention/hip_fmha/{ck_fmha_batched_backward_fp16_masktype_0.cpp => ck_fmha_batched_backward_fp16_masktype_0_no_attnbias.cpp} (53%)
 create mode 100644 xformers/csrc/attention/hip_fmha/ck_fmha_batched_backward_fp16_masktype_0_with_attnbias.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/ck_fmha_batched_backward_fp16_masktype_1_no_attnbias.cpp
 rename xformers/csrc/attention/hip_fmha/{ck_fmha_batched_backward_fp16_masktype_1.cpp => ck_fmha_batched_backward_fp16_masktype_1_with_attnbias.cpp} (57%)
 rename xformers/csrc/attention/hip_fmha/{ck_fmha_batched_backward_fp16_masktype_2.cpp => ck_fmha_batched_backward_fp16_masktype_2_no_attnbias.cpp} (53%)
 create mode 100644 xformers/csrc/attention/hip_fmha/ck_fmha_batched_backward_fp16_masktype_2_with_attnbias.cpp
 rename xformers/csrc/attention/hip_fmha/{ck_fmha_grouped_backward_bp16_masktype_0.cpp => ck_fmha_grouped_backward_bp16_masktype_0_no_attnbias.cpp} (53%)
 create mode 100644 xformers/csrc/attention/hip_fmha/ck_fmha_grouped_backward_bp16_masktype_0_with_attnbias.cpp
 rename xformers/csrc/attention/hip_fmha/{ck_fmha_grouped_backward_bp16_masktype_1.cpp => ck_fmha_grouped_backward_bp16_masktype_1_no_attnbias.cpp} (53%)
 create mode 100644 xformers/csrc/attention/hip_fmha/ck_fmha_grouped_backward_bp16_masktype_1_with_attnbias.cpp
 rename xformers/csrc/attention/hip_fmha/{ck_fmha_grouped_backward_bp16_masktype_2.cpp => ck_fmha_grouped_backward_bp16_masktype_2_no_attnbias.cpp} (53%)
 create mode 100644 xformers/csrc/attention/hip_fmha/ck_fmha_grouped_backward_bp16_masktype_2_with_attnbias.cpp
 rename xformers/csrc/attention/hip_fmha/{ck_fmha_grouped_backward_fp16_masktype_0.cpp => ck_fmha_grouped_backward_fp16_masktype_0_no_attnbias.cpp} (53%)
 create mode 100644 xformers/csrc/attention/hip_fmha/ck_fmha_grouped_backward_fp16_masktype_0_with_attnbias.cpp
 rename xformers/csrc/attention/hip_fmha/{ck_fmha_grouped_backward_fp16_masktype_1.cpp => ck_fmha_grouped_backward_fp16_masktype_1_no_attnbias.cpp} (53%)
 create mode 100644 xformers/csrc/attention/hip_fmha/ck_fmha_grouped_backward_fp16_masktype_1_with_attnbias.cpp
 rename xformers/csrc/attention/hip_fmha/{ck_fmha_grouped_backward_fp16_masktype_2.cpp => ck_fmha_grouped_backward_fp16_masktype_2_no_attnbias.cpp} (53%)
 create mode 100644 xformers/csrc/attention/hip_fmha/ck_fmha_grouped_backward_fp16_masktype_2_with_attnbias.cpp

diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_backward_bp16_masktype_0.cpp b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_backward_bp16_masktype_0_no_attnbias.cpp
similarity index 53%
rename from xformers/csrc/attention/hip_fmha/ck_fmha_batched_backward_bp16_masktype_0.cpp
rename to xformers/csrc/attention/hip_fmha/ck_fmha_batched_backward_bp16_masktype_0_no_attnbias.cpp
index 3b27b27f7..52541f380 100644
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_backward_bp16_masktype_0.cpp
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_backward_bp16_masktype_0_no_attnbias.cpp
@@ -1,20 +1,6 @@
 #include <ck.hpp>
-#include <stdexcept>
-
 #include "ck_fmha_batched_backward.h"
 
-template struct batched_backward_masktype_attnbias_dispatched<
-    ck::bhalf_t,
-    0,
-    true,
-    true>;
-
-template struct batched_backward_masktype_attnbias_dispatched<
-    ck::bhalf_t,
-    0,
-    true,
-    false>;
-
 template struct batched_backward_masktype_attnbias_dispatched<
     ck::bhalf_t,
     0,
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_backward_bp16_masktype_0_with_attnbias.cpp b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_backward_bp16_masktype_0_with_attnbias.cpp
new file mode 100644
index 000000000..7bf0a5959
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_backward_bp16_masktype_0_with_attnbias.cpp
@@ -0,0 +1,14 @@
+#include <ck.hpp>
+#include "ck_fmha_batched_backward.h"
+
+template struct batched_backward_masktype_attnbias_dispatched<
+    ck::bhalf_t,
+    0,
+    true,
+    true>;
+
+template struct batched_backward_masktype_attnbias_dispatched<
+    ck::bhalf_t,
+    0,
+    true,
+    false>;
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_backward_bp16_masktype_1.cpp b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_backward_bp16_masktype_1_no_attnbias.cpp
similarity index 53%
rename from xformers/csrc/attention/hip_fmha/ck_fmha_batched_backward_bp16_masktype_1.cpp
rename to xformers/csrc/attention/hip_fmha/ck_fmha_batched_backward_bp16_masktype_1_no_attnbias.cpp
index a59443dc0..6420ddf15 100644
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_backward_bp16_masktype_1.cpp
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_backward_bp16_masktype_1_no_attnbias.cpp
@@ -1,20 +1,6 @@
 #include <ck.hpp>
-#include <stdexcept>
-
 #include "ck_fmha_batched_backward.h"
 
-template struct batched_backward_masktype_attnbias_dispatched<
-    ck::bhalf_t,
-    1,
-    true,
-    true>;
-
-template struct batched_backward_masktype_attnbias_dispatched<
-    ck::bhalf_t,
-    1,
-    true,
-    false>;
-
 template struct batched_backward_masktype_attnbias_dispatched<
     ck::bhalf_t,
     1,
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_backward_bp16_masktype_1_with_attnbias.cpp b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_backward_bp16_masktype_1_with_attnbias.cpp
new file mode 100644
index 000000000..b10c2895c
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_backward_bp16_masktype_1_with_attnbias.cpp
@@ -0,0 +1,14 @@
+#include <ck.hpp>
+#include "ck_fmha_batched_backward.h"
+
+template struct batched_backward_masktype_attnbias_dispatched<
+    ck::bhalf_t,
+    1,
+    true,
+    true>;
+
+template struct batched_backward_masktype_attnbias_dispatched<
+    ck::bhalf_t,
+    1,
+    true,
+    false>;
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_backward_bp16_masktype_2.cpp b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_backward_bp16_masktype_2_no_attnbias.cpp
similarity index 53%
rename from xformers/csrc/attention/hip_fmha/ck_fmha_batched_backward_bp16_masktype_2.cpp
rename to xformers/csrc/attention/hip_fmha/ck_fmha_batched_backward_bp16_masktype_2_no_attnbias.cpp
index 28396507c..aca4acbf2 100644
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_backward_bp16_masktype_2.cpp
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_backward_bp16_masktype_2_no_attnbias.cpp
@@ -1,20 +1,6 @@
 #include <ck.hpp>
-#include <stdexcept>
-
 #include "ck_fmha_batched_backward.h"
 
-template struct batched_backward_masktype_attnbias_dispatched<
-    ck::bhalf_t,
-    2,
-    true,
-    true>;
-
-template struct batched_backward_masktype_attnbias_dispatched<
-    ck::bhalf_t,
-    2,
-    true,
-    false>;
-
 template struct batched_backward_masktype_attnbias_dispatched<
     ck::bhalf_t,
     2,
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_backward_bp16_masktype_2_with_attnbias.cpp b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_backward_bp16_masktype_2_with_attnbias.cpp
new file mode 100644
index 000000000..c8ef03050
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_backward_bp16_masktype_2_with_attnbias.cpp
@@ -0,0 +1,14 @@
+#include <ck.hpp>
+#include "ck_fmha_batched_backward.h"
+
+template struct batched_backward_masktype_attnbias_dispatched<
+    ck::bhalf_t,
+    2,
+    true,
+    true>;
+
+template struct batched_backward_masktype_attnbias_dispatched<
+    ck::bhalf_t,
+    2,
+    true,
+    false>;
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_backward_fp16_masktype_0.cpp b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_backward_fp16_masktype_0_no_attnbias.cpp
similarity index 53%
rename from xformers/csrc/attention/hip_fmha/ck_fmha_batched_backward_fp16_masktype_0.cpp
rename to xformers/csrc/attention/hip_fmha/ck_fmha_batched_backward_fp16_masktype_0_no_attnbias.cpp
index 6b6d09949..6421a77b3 100644
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_backward_fp16_masktype_0.cpp
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_backward_fp16_masktype_0_no_attnbias.cpp
@@ -1,20 +1,6 @@
 #include <ck.hpp>
-#include <stdexcept>
-
 #include "ck_fmha_batched_backward.h"
 
-template struct batched_backward_masktype_attnbias_dispatched<
-    ck::half_t,
-    0,
-    true,
-    true>;
-
-template struct batched_backward_masktype_attnbias_dispatched<
-    ck::half_t,
-    0,
-    true,
-    false>;
-
 template struct batched_backward_masktype_attnbias_dispatched<
     ck::half_t,
     0,
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_backward_fp16_masktype_0_with_attnbias.cpp b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_backward_fp16_masktype_0_with_attnbias.cpp
new file mode 100644
index 000000000..7e7bc9ad4
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_backward_fp16_masktype_0_with_attnbias.cpp
@@ -0,0 +1,14 @@
+#include <ck.hpp>
+#include "ck_fmha_batched_backward.h"
+
+template struct batched_backward_masktype_attnbias_dispatched<
+    ck::half_t,
+    0,
+    true,
+    true>;
+
+template struct batched_backward_masktype_attnbias_dispatched<
+    ck::half_t,
+    0,
+    true,
+    false>;
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_backward_fp16_masktype_1_no_attnbias.cpp b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_backward_fp16_masktype_1_no_attnbias.cpp
new file mode 100644
index 000000000..cbfa45b67
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_backward_fp16_masktype_1_no_attnbias.cpp
@@ -0,0 +1,14 @@
+#include <ck.hpp>
+#include "ck_fmha_batched_backward.h"
+
+template struct batched_backward_masktype_attnbias_dispatched<
+    ck::half_t,
+    1,
+    false,
+    true>;
+
+template struct batched_backward_masktype_attnbias_dispatched<
+    ck::half_t,
+    1,
+    false,
+    false>;
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_backward_fp16_masktype_1.cpp b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_backward_fp16_masktype_1_with_attnbias.cpp
similarity index 57%
rename from xformers/csrc/attention/hip_fmha/ck_fmha_batched_backward_fp16_masktype_1.cpp
rename to xformers/csrc/attention/hip_fmha/ck_fmha_batched_backward_fp16_masktype_1_with_attnbias.cpp
index c11fb2535..dc2df739a 100644
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_backward_fp16_masktype_1.cpp
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_backward_fp16_masktype_1_with_attnbias.cpp
@@ -14,15 +14,3 @@ template struct batched_backward_masktype_attnbias_dispatched<
     1,
     true,
     false>;
-
-template struct batched_backward_masktype_attnbias_dispatched<
-    ck::half_t,
-    1,
-    false,
-    true>;
-
-template struct batched_backward_masktype_attnbias_dispatched<
-    ck::half_t,
-    1,
-    false,
-    false>;
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_backward_fp16_masktype_2.cpp b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_backward_fp16_masktype_2_no_attnbias.cpp
similarity index 53%
rename from xformers/csrc/attention/hip_fmha/ck_fmha_batched_backward_fp16_masktype_2.cpp
rename to xformers/csrc/attention/hip_fmha/ck_fmha_batched_backward_fp16_masktype_2_no_attnbias.cpp
index 9dc0df5e9..1f77acb1c 100644
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_backward_fp16_masktype_2.cpp
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_backward_fp16_masktype_2_no_attnbias.cpp
@@ -1,20 +1,6 @@
 #include <ck.hpp>
-#include <stdexcept>
-
 #include "ck_fmha_batched_backward.h"
 
-template struct batched_backward_masktype_attnbias_dispatched<
-    ck::half_t,
-    2,
-    true,
-    true>;
-
-template struct batched_backward_masktype_attnbias_dispatched<
-    ck::half_t,
-    2,
-    true,
-    false>;
-
 template struct batched_backward_masktype_attnbias_dispatched<
     ck::half_t,
     2,
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_backward_fp16_masktype_2_with_attnbias.cpp b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_backward_fp16_masktype_2_with_attnbias.cpp
new file mode 100644
index 000000000..5743fb768
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_backward_fp16_masktype_2_with_attnbias.cpp
@@ -0,0 +1,14 @@
+#include <ck.hpp>
+#include "ck_fmha_batched_backward.h"
+
+template struct batched_backward_masktype_attnbias_dispatched<
+    ck::half_t,
+    2,
+    true,
+    true>;
+
+template struct batched_backward_masktype_attnbias_dispatched<
+    ck::half_t,
+    2,
+    true,
+    false>;
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_backward_bp16_masktype_0.cpp b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_backward_bp16_masktype_0_no_attnbias.cpp
similarity index 53%
rename from xformers/csrc/attention/hip_fmha/ck_fmha_grouped_backward_bp16_masktype_0.cpp
rename to xformers/csrc/attention/hip_fmha/ck_fmha_grouped_backward_bp16_masktype_0_no_attnbias.cpp
index 703176268..558cd3d68 100644
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_backward_bp16_masktype_0.cpp
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_backward_bp16_masktype_0_no_attnbias.cpp
@@ -1,20 +1,6 @@
 #include <ck/ck.hpp>
-#include <stdexcept>
-
 #include "ck_fmha_grouped_backward.h"
 
-template struct grouped_backward_masktype_attnbias_dispatched<
-    ck::bhalf_t,
-    0,
-    true,
-    true>;
-
-template struct grouped_backward_masktype_attnbias_dispatched<
-    ck::bhalf_t,
-    0,
-    true,
-    false>;
-
 template struct grouped_backward_masktype_attnbias_dispatched<
     ck::bhalf_t,
     0,
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_backward_bp16_masktype_0_with_attnbias.cpp b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_backward_bp16_masktype_0_with_attnbias.cpp
new file mode 100644
index 000000000..52e36a445
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_backward_bp16_masktype_0_with_attnbias.cpp
@@ -0,0 +1,14 @@
+#include <ck/ck.hpp>
+#include "ck_fmha_grouped_backward.h"
+
+template struct grouped_backward_masktype_attnbias_dispatched<
+    ck::bhalf_t,
+    0,
+    true,
+    true>;
+
+template struct grouped_backward_masktype_attnbias_dispatched<
+    ck::bhalf_t,
+    0,
+    true,
+    false>;
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_backward_bp16_masktype_1.cpp b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_backward_bp16_masktype_1_no_attnbias.cpp
similarity index 53%
rename from xformers/csrc/attention/hip_fmha/ck_fmha_grouped_backward_bp16_masktype_1.cpp
rename to xformers/csrc/attention/hip_fmha/ck_fmha_grouped_backward_bp16_masktype_1_no_attnbias.cpp
index 6f5531b75..47e5e97e5 100644
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_backward_bp16_masktype_1.cpp
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_backward_bp16_masktype_1_no_attnbias.cpp
@@ -1,20 +1,6 @@
 #include <ck/ck.hpp>
-#include <stdexcept>
-
 #include "ck_fmha_grouped_backward.h"
 
-template struct grouped_backward_masktype_attnbias_dispatched<
-    ck::bhalf_t,
-    1,
-    true,
-    true>;
-
-template struct grouped_backward_masktype_attnbias_dispatched<
-    ck::bhalf_t,
-    1,
-    true,
-    false>;
-
 template struct grouped_backward_masktype_attnbias_dispatched<
     ck::bhalf_t,
     1,
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_backward_bp16_masktype_1_with_attnbias.cpp b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_backward_bp16_masktype_1_with_attnbias.cpp
new file mode 100644
index 000000000..542226d72
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_backward_bp16_masktype_1_with_attnbias.cpp
@@ -0,0 +1,14 @@
+#include <ck/ck.hpp>
+#include "ck_fmha_grouped_backward.h"
+
+template struct grouped_backward_masktype_attnbias_dispatched<
+    ck::bhalf_t,
+    1,
+    true,
+    true>;
+
+template struct grouped_backward_masktype_attnbias_dispatched<
+    ck::bhalf_t,
+    1,
+    true,
+    false>;
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_backward_bp16_masktype_2.cpp b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_backward_bp16_masktype_2_no_attnbias.cpp
similarity index 53%
rename from xformers/csrc/attention/hip_fmha/ck_fmha_grouped_backward_bp16_masktype_2.cpp
rename to xformers/csrc/attention/hip_fmha/ck_fmha_grouped_backward_bp16_masktype_2_no_attnbias.cpp
index 535ea659d..833c49504 100644
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_backward_bp16_masktype_2.cpp
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_backward_bp16_masktype_2_no_attnbias.cpp
@@ -1,20 +1,6 @@
 #include <ck/ck.hpp>
-#include <stdexcept>
-
 #include "ck_fmha_grouped_backward.h"
 
-template struct grouped_backward_masktype_attnbias_dispatched<
-    ck::bhalf_t,
-    2,
-    true,
-    true>;
-
-template struct grouped_backward_masktype_attnbias_dispatched<
-    ck::bhalf_t,
-    2,
-    true,
-    false>;
-
 template struct grouped_backward_masktype_attnbias_dispatched<
     ck::bhalf_t,
     2,
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_backward_bp16_masktype_2_with_attnbias.cpp b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_backward_bp16_masktype_2_with_attnbias.cpp
new file mode 100644
index 000000000..6772bbac7
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_backward_bp16_masktype_2_with_attnbias.cpp
@@ -0,0 +1,14 @@
+#include <ck/ck.hpp>
+#include "ck_fmha_grouped_backward.h"
+
+template struct grouped_backward_masktype_attnbias_dispatched<
+    ck::bhalf_t,
+    2,
+    true,
+    true>;
+
+template struct grouped_backward_masktype_attnbias_dispatched<
+    ck::bhalf_t,
+    2,
+    true,
+    false>;
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_backward_fp16_masktype_0.cpp b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_backward_fp16_masktype_0_no_attnbias.cpp
similarity index 53%
rename from xformers/csrc/attention/hip_fmha/ck_fmha_grouped_backward_fp16_masktype_0.cpp
rename to xformers/csrc/attention/hip_fmha/ck_fmha_grouped_backward_fp16_masktype_0_no_attnbias.cpp
index 409c2d159..85d0fbfd7 100644
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_backward_fp16_masktype_0.cpp
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_backward_fp16_masktype_0_no_attnbias.cpp
@@ -1,20 +1,6 @@
 #include <ck/ck.hpp>
-#include <stdexcept>
-
 #include "ck_fmha_grouped_backward.h"
 
-template struct grouped_backward_masktype_attnbias_dispatched<
-    ck::half_t,
-    0,
-    true,
-    true>;
-
-template struct grouped_backward_masktype_attnbias_dispatched<
-    ck::half_t,
-    0,
-    true,
-    false>;
-
 template struct grouped_backward_masktype_attnbias_dispatched<
     ck::half_t,
     0,
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_backward_fp16_masktype_0_with_attnbias.cpp b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_backward_fp16_masktype_0_with_attnbias.cpp
new file mode 100644
index 000000000..69a3839e7
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_backward_fp16_masktype_0_with_attnbias.cpp
@@ -0,0 +1,14 @@
+#include <ck/ck.hpp>
+#include "ck_fmha_grouped_backward.h"
+
+template struct grouped_backward_masktype_attnbias_dispatched<
+    ck::half_t,
+    0,
+    true,
+    true>;
+
+template struct grouped_backward_masktype_attnbias_dispatched<
+    ck::half_t,
+    0,
+    true,
+    false>;
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_backward_fp16_masktype_1.cpp b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_backward_fp16_masktype_1_no_attnbias.cpp
similarity index 53%
rename from xformers/csrc/attention/hip_fmha/ck_fmha_grouped_backward_fp16_masktype_1.cpp
rename to xformers/csrc/attention/hip_fmha/ck_fmha_grouped_backward_fp16_masktype_1_no_attnbias.cpp
index 9662fe529..7e826ab00 100644
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_backward_fp16_masktype_1.cpp
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_backward_fp16_masktype_1_no_attnbias.cpp
@@ -1,20 +1,6 @@
 #include <ck/ck.hpp>
-#include <stdexcept>
-
 #include "ck_fmha_grouped_backward.h"
 
-template struct grouped_backward_masktype_attnbias_dispatched<
-    ck::half_t,
-    1,
-    true,
-    true>;
-
-template struct grouped_backward_masktype_attnbias_dispatched<
-    ck::half_t,
-    1,
-    true,
-    false>;
-
 template struct grouped_backward_masktype_attnbias_dispatched<
     ck::half_t,
     1,
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_backward_fp16_masktype_1_with_attnbias.cpp b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_backward_fp16_masktype_1_with_attnbias.cpp
new file mode 100644
index 000000000..1235af9a6
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_backward_fp16_masktype_1_with_attnbias.cpp
@@ -0,0 +1,14 @@
+#include <ck/ck.hpp>
+#include "ck_fmha_grouped_backward.h"
+
+template struct grouped_backward_masktype_attnbias_dispatched<
+    ck::half_t,
+    1,
+    true,
+    true>;
+
+template struct grouped_backward_masktype_attnbias_dispatched<
+    ck::half_t,
+    1,
+    true,
+    false>;
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_backward_fp16_masktype_2.cpp b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_backward_fp16_masktype_2_no_attnbias.cpp
similarity index 53%
rename from xformers/csrc/attention/hip_fmha/ck_fmha_grouped_backward_fp16_masktype_2.cpp
rename to xformers/csrc/attention/hip_fmha/ck_fmha_grouped_backward_fp16_masktype_2_no_attnbias.cpp
index d13fd9b05..1cec428a6 100644
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_backward_fp16_masktype_2.cpp
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_backward_fp16_masktype_2_no_attnbias.cpp
@@ -1,20 +1,6 @@
 #include <ck/ck.hpp>
-#include <stdexcept>
-
 #include "ck_fmha_grouped_backward.h"
 
-template struct grouped_backward_masktype_attnbias_dispatched<
-    ck::half_t,
-    2,
-    true,
-    true>;
-
-template struct grouped_backward_masktype_attnbias_dispatched<
-    ck::half_t,
-    2,
-    true,
-    false>;
-
 template struct grouped_backward_masktype_attnbias_dispatched<
     ck::half_t,
     2,
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_backward_fp16_masktype_2_with_attnbias.cpp b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_backward_fp16_masktype_2_with_attnbias.cpp
new file mode 100644
index 000000000..c01bea26b
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_backward_fp16_masktype_2_with_attnbias.cpp
@@ -0,0 +1,14 @@
+#include <ck/ck.hpp>
+#include "ck_fmha_grouped_backward.h"
+
+template struct grouped_backward_masktype_attnbias_dispatched<
+    ck::half_t,
+    2,
+    true,
+    true>;
+
+template struct grouped_backward_masktype_attnbias_dispatched<
+    ck::half_t,
+    2,
+    true,
+    false>;

From 0e237d8b5b01ecca78c923ecde0ec825a8814792 Mon Sep 17 00:00:00 2001
From: Qianfeng Zhang <Qianfeng.Zhang@amd.com>
Date: Thu, 26 Oct 2023 14:55:31 +0000
Subject: [PATCH 116/641] Move to the latest composable_kernel commit and
 corresponding API adapting

---
 third_party/composable_kernel                        |  2 +-
 .../attention/hip_fmha/ck_fmha_batched_backward.h    | 12 ++++++++++++
 .../attention/hip_fmha/ck_fmha_grouped_backward.h    | 12 ++++++++++++
 3 files changed, 25 insertions(+), 1 deletion(-)

diff --git a/third_party/composable_kernel b/third_party/composable_kernel
index f27f91581..4033f5df2 160000
--- a/third_party/composable_kernel
+++ b/third_party/composable_kernel
@@ -1 +1 @@
-Subproject commit f27f91581162c788f144f0f4f9aa68fa465a33fc
+Subproject commit 4033f5df2de7a3e778fced14041304d6fc20d673
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_backward.h b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_backward.h
index beb93f7c2..50d0761a6 100644
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_backward.h
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_backward.h
@@ -264,6 +264,10 @@ struct batched_backward_masktype_attnbias_dispatched {
         param.k_strides[1],
         param.k_strides[3]};
 
+    // ToDo: support multi-query and group-query attention
+    std::vector<ck::index_t> kgrad_gs_ns_ks_lengths = k_gs_ns_ks_lengths;
+    std::vector<ck::index_t> kgrad_gs_ns_ks_strides = k_gs_ns_ks_strides;
+
     std::vector<ck::index_t> v_gs_os_ns_lengths{
         param.B, param.num_heads, param.Kv, param.N};
     std::vector<ck::index_t> v_gs_os_ns_strides{
@@ -272,6 +276,10 @@ struct batched_backward_masktype_attnbias_dispatched {
         param.v_strides[3],
         param.v_strides[1]};
 
+    // ToDo: support multi-query and group-query attention
+    std::vector<ck::index_t> vgrad_gs_os_ns_lengths = v_gs_os_ns_lengths;
+    std::vector<ck::index_t> vgrad_gs_os_ns_strides = v_gs_os_ns_strides;
+
     std::vector<ck::index_t> y_gs_ms_os_lengths{
         param.B, param.num_heads, param.M, param.Kv};
     std::vector<ck::index_t> y_gs_ms_os_strides{
@@ -329,6 +337,10 @@ struct batched_backward_masktype_attnbias_dispatched {
         y_gs_ms_os_lengths, // y, dY should have same shape
         y_gs_ms_os_strides,
         lse_gs_ms_lengths,
+        kgrad_gs_ns_ks_lengths,
+        kgrad_gs_ns_ks_strides,
+        vgrad_gs_os_ns_lengths,
+        vgrad_gs_os_ns_strides,
         d_gs_ms_ns_lengths, // bias, grad_bias should have same shape
         d_gs_ms_ns_strides,
         {}, // acc1_biases_gs_ms_os_lengths
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_backward.h b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_backward.h
index 9847b9fa0..0de98ed0c 100644
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_backward.h
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_backward.h
@@ -267,11 +267,19 @@ struct grouped_backward_masktype_attnbias_dispatched {
       std::vector<ck::index_t> k_gs_ns_ks_strides{
           0, param.k_strides[1], param.k_strides[0], param.k_strides[2]};
 
+      // ToDo: support multi-query and group-query attention
+      std::vector<ck::index_t> kgrad_gs_ns_ks_lengths = k_gs_ns_ks_lengths;
+      std::vector<ck::index_t> kgrad_gs_ns_ks_strides = k_gs_ns_ks_strides;
+
       // to be changed to v_gs_ns_os_lengths
       std::vector<ck::index_t> v_gs_os_ns_lengths{1, G1, Kv, N};
       std::vector<ck::index_t> v_gs_os_ns_strides{
           0, param.v_strides[1], param.v_strides[2], param.v_strides[0]};
 
+      // ToDo: support multi-query and group-query attention
+      std::vector<ck::index_t> vgrad_gs_os_ns_lengths = v_gs_os_ns_lengths;
+      std::vector<ck::index_t> vgrad_gs_os_ns_strides = v_gs_os_ns_strides;
+
       std::vector<ck::index_t> y_gs_ms_os_lengths{1, G1, M, Kv};
       std::vector<ck::index_t> y_gs_ms_os_strides{
           0, param.out_strides[1], param.out_strides[0], param.out_strides[2]};
@@ -308,6 +316,10 @@ struct grouped_backward_masktype_attnbias_dispatched {
           y_gs_ms_os_strides,
           lse_gs_ms_lengths,
           lse_gs_ms_strides,
+          kgrad_gs_ns_ks_lengths,
+          kgrad_gs_ns_ks_strides,
+          vgrad_gs_os_ns_lengths,
+          vgrad_gs_os_ns_strides,
           d_gs_ms_ns_lengths, // bias, grad_bias should have same shape
           d_gs_ms_ns_strides,
           {}, // acc1_biases_gs_ms_os_lengths

From 60c33f2b18899a22aff3e4b5f1688e5b1bc966c6 Mon Sep 17 00:00:00 2001
From: Qianfeng Zhang <Qianfeng.Zhang@amd.com>
Date: Thu, 26 Oct 2023 17:27:44 +0000
Subject: [PATCH 117/641] Remove un-used header file

---
 .../hip_fmha/ck_fmha_device_gemm_constants.h  | 120 ------------------
 1 file changed, 120 deletions(-)
 delete mode 100644 xformers/csrc/attention/hip_fmha/ck_fmha_device_gemm_constants.h

diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_device_gemm_constants.h b/xformers/csrc/attention/hip_fmha/ck_fmha_device_gemm_constants.h
deleted file mode 100644
index e49d6d4dc..000000000
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_device_gemm_constants.h
+++ /dev/null
@@ -1,120 +0,0 @@
-#pragma once
-
-#include <ck/ck.hpp>
-#include "ck_fmha_op_helper.h"
-
-// list the template parameters that is commonly used
-struct GemmOpConstantsCommon {
-  static constexpr ck::index_t NumDimG = 2;
-  static constexpr ck::index_t NumDimM = 1;
-  static constexpr ck::index_t NumDimN = 1;
-  static constexpr ck::index_t NumDimK = 1;
-  static constexpr ck::index_t NumDimO = 1;
-
-  static constexpr auto TensorSpecA =
-      ck::tensor_operation::device::TensorSpecialization::Default;
-  static constexpr auto TensorSpecB0 =
-      ck::tensor_operation::device::TensorSpecialization::Default;
-  static constexpr auto TensorSpecB1 =
-      ck::tensor_operation::device::TensorSpecialization::Default;
-  static constexpr auto TensorSpecC =
-      ck::tensor_operation::device::TensorSpecialization::Default;
-};
-
-// list the template parameters that will not be tuned,
-// the commented lines gives the tunable template parameters
-struct GemmOpConstantsBatchedInfer {
-  static constexpr ck::index_t BlockSize = 256;
-  static constexpr ck::index_t MPerBlock = 128;
-  static constexpr ck::index_t NPerBlock = 128;
-  static constexpr ck::index_t KPerBlock = 32;
-  // static constexpr ck::index_t Gemm1NPerBlock;
-  static constexpr ck::index_t Gemm1KPerBlock = 32;
-  static constexpr ck::index_t AK1 = 8;
-  static constexpr ck::index_t BK1 = 8;
-  static constexpr ck::index_t B1K1 = 2;
-  static constexpr ck::index_t MPerXDL = 32;
-  static constexpr ck::index_t NPerXDL = 32;
-  static constexpr ck::index_t MXdlPerWave = 1;
-  static constexpr ck::index_t NXdlPerWave = 4;
-  // static constexpr ck::index_t Gemm1NXdlPerWave;
-  using ABlockTransferThreadClusterLengths_AK0_M_AK1 = S<4, 64, 1>;
-  using ABlockTransferThreadClusterArrangeOrder = S<1, 0, 2>;
-  using ABlockTransferSrcAccessOrder = S<1, 0, 2>;
-  static constexpr ck::index_t ABlockTransferSrcVectorDim = 2;
-  // static constexpr ck::index_t ABlockTransferSrcScalarPerVector;
-  static constexpr ck::index_t ABlockTransferDstScalarPerVector_AK1 = 8;
-  static constexpr bool ABlockLdsExtraM = true;
-  using BBlockTransferThreadClusterLengths_BK0_N_BK1 = S<4, 64, 1>;
-  using BBlockTransferThreadClusterArrangeOrder = S<1, 0, 2>;
-  using BBlockTransferSrcAccessOrder = S<1, 0, 2>;
-  static constexpr ck::index_t BBlockTransferSrcVectorDim = 2;
-  // static constexpr ck::index_t BBlockTransferSrcScalarPerVector;
-  static constexpr ck::index_t BBlockTransferDstScalarPerVector_BK1 = 8;
-  static constexpr bool BBlockLdsExtraN = true;
-  // static constexpr ck::index_t Acc0BiasTransferSrcScalarPerVector;
-  using B1BlockTransferThreadClusterLengths_BK0_N_BK1 = S<16, 16, 1>;
-  using B1BlockTransferThreadClusterArrangeOrder = S<0, 2, 1>;
-  using B1BlockTransferSrcAccessOrder = S<0, 2, 1>;
-  static constexpr ck::index_t B1BlockTransferSrcVectorDim = 1;
-  // static constexpr ck::index_t B1BlockTransferSrcScalarPerVector;
-  static constexpr ck::index_t B1BlockTransferDstScalarPerVector_BK1 = 4;
-  static constexpr bool B1BlockLdsExtraN = false;
-  static constexpr ck::index_t CShuffleMXdlPerWavePerShuffle = 1;
-  // static constexpr ck::index_t CShuffleNXdlPerWavePerShuffle;
-  using CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock =
-      S<1, 32, 1, 8>;
-  // static constexpr ck::index_t
-  // CShuffleBlockTransferScalarPerVector_NPerBlock;
-};
-
-// list the template parameters that will not be tuned,
-// the commented lines gives the tunable template parameters
-struct GemmOpConstantsGroupedInfer {
-  static constexpr ck::index_t BlockSize = 256;
-  static constexpr ck::index_t MPerBlock = 128;
-  static constexpr ck::index_t NPerBlock = 128;
-  static constexpr ck::index_t KPerBlock = 32;
-  // static constexpr ck::index_t Gemm1NPerBlock;
-  static constexpr ck::index_t Gemm1KPerBlock = 32;
-  static constexpr ck::index_t AK1 = 8;
-  static constexpr ck::index_t BK1 = 8;
-  static constexpr ck::index_t B1K1 = 2;
-  static constexpr ck::index_t MPerXDL = 32;
-  static constexpr ck::index_t NPerXDL = 32;
-  static constexpr ck::index_t MXdlPerWave = 1;
-  static constexpr ck::index_t NXdlPerWave = 4;
-  // static constexpr ck::index_t Gemm1NXdlPerWave;
-  using ABlockTransferThreadClusterLengths_AK0_M_AK1 = S<4, 64, 1>;
-  using ABlockTransferThreadClusterArrangeOrder = S<1, 0, 2>;
-  using ABlockTransferSrcAccessOrder = S<1, 0, 2>;
-  static constexpr ck::index_t ABlockTransferSrcVectorDim = 2;
-  // static constexpr ck::index_t ABlockTransferSrcScalarPerVector,
-  static constexpr ck::index_t ABlockTransferDstScalarPerVector_AK1 = 8;
-  static constexpr bool ABlockLdsExtraM = true;
-  using BBlockTransferThreadClusterLengths_BK0_N_BK1 = S<4, 64, 1>;
-  using BBlockTransferThreadClusterArrangeOrder = S<1, 0, 2>;
-  using BBlockTransferSrcAccessOrder = S<1, 0, 2>;
-  static constexpr ck::index_t BBlockTransferSrcVectorDim = 2;
-  // static constexpr ck::index_t BBlockTransferSrcScalarPerVector;
-  static constexpr ck::index_t BBlockTransferDstScalarPerVector_BK1 = 8;
-  static constexpr bool BBlockLdsExtraN = true;
-  // static constexpr ck::index_t Acc0BiasTransferSrcScalarPerVector;
-  using B1BlockTransferThreadClusterLengths_BK0_N_BK1 = S<16, 16, 1>;
-  using B1BlockTransferThreadClusterArrangeOrder = S<0, 2, 1>;
-  using B1BlockTransferSrcAccessOrder = S<0, 2, 1>;
-  static constexpr ck::index_t B1BlockTransferSrcVectorDim = 1;
-  // static constexpr ck::index_t B1BlockTransferSrcScalarPerVector;
-  static constexpr ck::index_t B1BlockTransferDstScalarPerVector_BK1 = 4;
-  static constexpr bool B1BlockLdsExtraN = false;
-  static constexpr ck::index_t CShuffleMXdlPerWavePerShuffle = 1;
-  // static constexpr ck::index_t CShuffleNXdlPerWavePerShuffle;
-  using CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock =
-      S<1, 32, 1, 8>;
-  // static constexpr ck::index_t
-  // CShuffleBlockTransferScalarPerVector_NPerBlock;
-};
-
-struct GemmOpConstantsForward {};
-
-struct GemmOpConstantsBackward {};

From ae2545099c57779899b67f0976e250d4aadf109c Mon Sep 17 00:00:00 2001
From: Qianfeng Zhang <Qianfeng.Zhang@amd.com>
Date: Thu, 26 Oct 2023 17:29:49 +0000
Subject: [PATCH 118/641] Remove un-used codes in
 benchmark_mem_eff_attention_ck.py

---
 xformers/benchmarks/benchmark_mem_eff_attention_ck.py | 8 --------
 1 file changed, 8 deletions(-)

diff --git a/xformers/benchmarks/benchmark_mem_eff_attention_ck.py b/xformers/benchmarks/benchmark_mem_eff_attention_ck.py
index bd700518d..0c754d8c1 100644
--- a/xformers/benchmarks/benchmark_mem_eff_attention_ck.py
+++ b/xformers/benchmarks/benchmark_mem_eff_attention_ck.py
@@ -176,14 +176,6 @@ def create_tensors(shape, dtype, requires_grad=False):
     q, k, v = xformers.ops.unbind(qkv, 2)
     return qkv, q, k, v
 
-def create_discrete_tensors(shape, dtype, requires_grad=False):
-    B, M, H, K = shape
-    q = torch.rand([B, M, H, K], device=device, dtype=dtype, requires_grad=requires_grad)
-    k = torch.rand([B, M, H, K], device=device, dtype=dtype, requires_grad=requires_grad)
-    v = torch.rand([B, M, H, K], device=device, dtype=dtype, requires_grad=requires_grad)
-
-    return q, k, v
-
 def mem_eff_attention_fw(shape, num_threads: int, attn_bias_cfg, dropout_p, dtype):
     B, M, H, K = shape
     _, q, k, v = create_tensors(shape, dtype)

From 0d21bf86a211db5e32197a3c3ca5d4ed38c96b38 Mon Sep 17 00:00:00 2001
From: Qianfeng Zhang <Qianfeng.Zhang@amd.com>
Date: Fri, 27 Oct 2023 10:39:52 +0000
Subject: [PATCH 119/641] [Performance] Add A/B0/B1/C scalar_per_vector
 selection in forward

---
 .../hip_fmha/ck_fmha_batched_forward.h        | 269 +++++++++++------
 .../hip_fmha/ck_fmha_forward_gemm_constants.h | 102 ++++++-
 .../hip_fmha/ck_fmha_grouped_forward.h        | 281 +++++++++++-------
 3 files changed, 451 insertions(+), 201 deletions(-)

diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_forward.h b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_forward.h
index c32667315..0307d47a5 100644
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_forward.h
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_forward.h
@@ -5,10 +5,15 @@
 
 #include <ck/ck.hpp>
 #include <ck/tensor_operation/gpu/device/gemm_specialization.hpp>
+#include <ck/tensor_operation/gpu/device/impl/device_batched_mha_fwd_xdl_cshuffle_v2.hpp>
 #include <ck/tensor_operation/gpu/device/tensor_specialization.hpp>
 #include <ck/tensor_operation/gpu/element/element_wise_operation.hpp>
-#include "ck/tensor_operation/gpu/device/impl/device_batched_mha_fwd_xdl_cshuffle_v2.hpp"
+#include <ck/utility/math.hpp>
+#include <ck/utility/number.hpp>
 
+#include "ck_align_switch.h"
+#include "ck_fmha_common_gemm_constants.h"
+#include "ck_fmha_forward_gemm_constants.h"
 #include "ck_fmha_op_helper.h"
 #include "ck_fmha_params.h"
 
@@ -56,23 +61,44 @@ struct batched_forward_masktype_attnbias_dispatched {
   static constexpr auto TensorSpecC =
       ck::tensor_operation::device::TensorSpecialization::Default;
 
-  // Tunables
-  static constexpr ck::index_t kABBlockTransferSrcScalarPerVector = 1;
-  static constexpr ck::index_t kB1BlockTransferSrcScalarPerVector = 1;
-  static constexpr ck::index_t kCShuffleBlockTransferScalarPerVector = 1;
   static constexpr ck::index_t kAcc0BiasTransferSrcScalarPerVector = 1;
 
+#ifndef BATCHED_FORWARD_HEADDIM_SWITCH
+#define BATCHED_FORWARD_HEADDIM_SWITCH(HEAD_DIM1, HEAD_DIM2, ...) \
+  [&] {                                                           \
+    if (HEAD_DIM1 <= 32 && HEAD_DIM2 <= 32) {                     \
+      constexpr ck::index_t kGemm1NPerBlock = 32;                 \
+      constexpr ck::index_t kGemm1NXdlPerWave = 1;                \
+      constexpr ck::index_t kCShuffleNXdlPerWavePerShuffle = 1;   \
+      __VA_ARGS__();                                              \
+    } else if (HEAD_DIM1 <= 64 && HEAD_DIM2 <= 64) {              \
+      constexpr ck::index_t kGemm1NPerBlock = 64;                 \
+      constexpr ck::index_t kGemm1NXdlPerWave = 2;                \
+      constexpr ck::index_t kCShuffleNXdlPerWavePerShuffle = 2;   \
+      __VA_ARGS__();                                              \
+    } else {                                                      \
+      constexpr ck::index_t kGemm1NPerBlock = 128;                \
+      constexpr ck::index_t kGemm1NXdlPerWave = 4;                \
+      constexpr ck::index_t kCShuffleNXdlPerWavePerShuffle = 4;   \
+      __VA_ARGS__();                                              \
+    }                                                             \
+  }()
+#endif
+
   template <
       ck::index_t kGemm1NPerBlock,
       ck::index_t kGemm1NXdlPerWave,
-      ck::index_t kCShuffleNXdlPerWavePerShuffle>
+      ck::index_t kCShuffleNXdlPerWavePerShuffle,
+      ck::index_t kABBlockTransferSrcScalarPerVector,
+      ck::index_t kB1BlockTransferSrcScalarPerVector,
+      ck::index_t kCShuffleBlockTransferScalarPerVector>
   using DeviceOpInstanceTemp = ck::tensor_operation::device::
       DeviceBatchedMultiheadAttentionForward_Xdl_CShuffle_V2<
-          NumDimG,
-          NumDimM,
-          NumDimN,
-          NumDimK,
-          NumDimO,
+          GemmOpConstantsCommon::NumDimG,
+          GemmOpConstantsCommon::NumDimM,
+          GemmOpConstantsCommon::NumDimN,
+          GemmOpConstantsCommon::NumDimK,
+          GemmOpConstantsCommon::NumDimO,
           ADataType,
           B0DataType,
           B1DataType,
@@ -90,93 +116,150 @@ struct batched_forward_masktype_attnbias_dispatched {
           B1ElementOp,
           CElementOp,
           GemmSpec,
-          TensorSpecA,
-          TensorSpecB0,
-          TensorSpecB1,
-          TensorSpecC,
-          1,
-          256,
-          128, // MPerBlock
-          128, // NPerBlock
-          32, // KPerBlock
+          GemmOpConstantsCommon::TensorSpecA,
+          GemmOpConstantsCommon::TensorSpecB0,
+          GemmOpConstantsCommon::TensorSpecB1,
+          GemmOpConstantsCommon::TensorSpecC,
+          GemmOpConstantsBatchedForward::NumGemmKPrefetchStage,
+          GemmOpConstantsBatchedForward::BlockSize,
+          GemmOpConstantsBatchedForward::MPerBlock,
+          GemmOpConstantsBatchedForward::NPerBlock,
+          GemmOpConstantsBatchedForward::KPerBlock,
           kGemm1NPerBlock,
-          32, // Gemm1KPerBlock
-          8, // AK1
-          8, // BK1
-          2, // B1K1
-          32, // MPerXDL
-          32, // NPerXDL
-          1, // MXdlPerWave
-          4, // NXdlPerWave
+          GemmOpConstantsBatchedForward::Gemm1KPerBlock,
+          GemmOpConstantsBatchedForward::AK1,
+          GemmOpConstantsBatchedForward::BK1,
+          GemmOpConstantsBatchedForward::B1K1,
+          GemmOpConstantsBatchedForward::MPerXDL,
+          GemmOpConstantsBatchedForward::NPerXDL,
+          GemmOpConstantsBatchedForward::MXdlPerWave,
+          GemmOpConstantsBatchedForward::NXdlPerWave,
           kGemm1NXdlPerWave,
-          1, // DropoutStep
-          S<4, 64, 1>, // ABlockTransfer
-          S<1, 0, 2>,
-          S<1, 0, 2>,
-          2,
-          kABBlockTransferSrcScalarPerVector, // TUNABLE
-          8,
-          true,
-          S<4, 64, 1>, // BBlockTransfer
-          S<1, 0, 2>,
-          S<1, 0, 2>,
-          2,
-          kABBlockTransferSrcScalarPerVector, // TUNABLE
-          8,
-          true,
-          kAcc0BiasTransferSrcScalarPerVector, // TUNABLE
-          S<16, 16, 1>, // B1BlockTransfer
-          S<0, 2, 1>,
-          S<0, 2, 1>,
-          1,
-          kB1BlockTransferSrcScalarPerVector, // TUNABLE
-          2,
-          false,
-          1, // CShuffleMXdlPerWavePerShuffle
+          GemmOpConstantsBatchedForward::DropoutStep,
+          GemmOpConstantsBatchedForward::
+              ABlockTransferThreadClusterLengths_AK0_M_AK1,
+          GemmOpConstantsBatchedForward::
+              ABlockTransferThreadClusterArrangeOrder,
+          GemmOpConstantsBatchedForward::ABlockTransferSrcAccessOrder,
+          GemmOpConstantsBatchedForward::ABlockTransferSrcVectorDim,
+          kABBlockTransferSrcScalarPerVector,
+          GemmOpConstantsBatchedForward::ABlockTransferDstScalarPerVector_AK1,
+          GemmOpConstantsBatchedForward::ABlockLdsExtraM,
+          GemmOpConstantsBatchedForward::
+              BBlockTransferThreadClusterLengths_BK0_N_BK1,
+          GemmOpConstantsBatchedForward::
+              BBlockTransferThreadClusterArrangeOrder,
+          GemmOpConstantsBatchedForward::BBlockTransferSrcAccessOrder,
+          GemmOpConstantsBatchedForward::BBlockTransferSrcVectorDim,
+          kABBlockTransferSrcScalarPerVector,
+          GemmOpConstantsBatchedForward::BBlockTransferDstScalarPerVector_BK1,
+          GemmOpConstantsBatchedForward::BBlockLdsExtraN,
+          kAcc0BiasTransferSrcScalarPerVector,
+          GemmOpConstantsBatchedForward::
+              B1BlockTransferThreadClusterLengths_BK0_N_BK1,
+          GemmOpConstantsBatchedForward::
+              B1BlockTransferThreadClusterArrangeOrder,
+          GemmOpConstantsBatchedForward::B1BlockTransferSrcAccessOrder,
+          GemmOpConstantsBatchedForward::B1BlockTransferSrcVectorDim,
+          kB1BlockTransferSrcScalarPerVector,
+          GemmOpConstantsBatchedForward::B1BlockTransferDstScalarPerVector_BK1,
+          GemmOpConstantsBatchedForward::B1BlockLdsExtraN,
+          GemmOpConstantsBatchedForward::CShuffleMXdlPerWavePerShuffle,
           kCShuffleNXdlPerWavePerShuffle,
-          S<1,
-            32,
-            1,
-            8>, // CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock
-          kCShuffleBlockTransferScalarPerVector, // TUNABLE
-          4,
-          MaskingSpec>; // MaskingSpecialization
-
-  static void Run(BatchedForwardParams& param, hipStream_t stream) {
-    if (param.K <= 32 && param.Kv <= 32) {
-      constexpr ck::index_t kGemm1NPerBlock = 32;
-      constexpr ck::index_t kGemm1NXdlPerWave = 1;
-      constexpr ck::index_t kCShuffleNXdlPerWavePerShuffle = 1;
-
-      using DeviceOpInstance = DeviceOpInstanceTemp<
-          kGemm1NPerBlock,
-          kGemm1NXdlPerWave,
-          kCShuffleNXdlPerWavePerShuffle>;
-
-      RunWithDeviceOp<DeviceOpInstance>(param, stream);
-    } else if (param.K <= 64 && param.Kv <= 64) {
-      constexpr ck::index_t kGemm1NPerBlock = 64;
-      constexpr ck::index_t kGemm1NXdlPerWave = 2;
-      constexpr ck::index_t kCShuffleNXdlPerWavePerShuffle = 2;
-
-      using DeviceOpInstance = DeviceOpInstanceTemp<
-          kGemm1NPerBlock,
-          kGemm1NXdlPerWave,
-          kCShuffleNXdlPerWavePerShuffle>;
-
-      RunWithDeviceOp<DeviceOpInstance>(param, stream);
-    } else {
-      constexpr ck::index_t kGemm1NPerBlock = 128;
-      constexpr ck::index_t kGemm1NXdlPerWave = 4;
-      constexpr ck::index_t kCShuffleNXdlPerWavePerShuffle = 4;
+          GemmOpConstantsBatchedForward::
+              CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock,
+          kCShuffleBlockTransferScalarPerVector,
+          GemmOpConstantsBatchedForward::Acc1BiasTransferSrcScalarPerVector,
+          MaskingSpec>;
 
-      using DeviceOpInstance = DeviceOpInstanceTemp<
-          kGemm1NPerBlock,
-          kGemm1NXdlPerWave,
-          kCShuffleNXdlPerWavePerShuffle>;
+  static constexpr auto I1 = ck::Number<1>{};
+  static constexpr auto I2 = ck::Number<2>{};
+  static constexpr auto I3 = ck::Number<3>{};
 
-      RunWithDeviceOp<DeviceOpInstance>(param, stream);
-    };
+  static void Run(BatchedForwardParams& param, hipStream_t stream) {
+    using ck::math::min;
+
+    BATCHED_FORWARD_HEADDIM_SWITCH(param.K, param.Kv, [&] {
+      constexpr ck::index_t thread_slice_length_ak1 =
+          GemmOpConstantsBatchedForward::AK1 /
+          GemmOpConstantsBatchedForward::
+              ABlockTransferThreadClusterLengths_AK0_M_AK1::At(I2);
+      constexpr ck::index_t thread_slice_length_bk1 =
+          GemmOpConstantsBatchedForward::BK1 /
+          GemmOpConstantsBatchedForward::
+              BBlockTransferThreadClusterLengths_BK0_N_BK1::At(I2);
+
+      static_assert(
+          thread_slice_length_ak1 == thread_slice_length_bk1,
+          "ABlockTransfer and BBlockTransfer should use completely same K1 sizes and ThreadClusterLengths!");
+
+      constexpr ck::index_t kABBlockTransferSrcScalarPerVector_max =
+          min(2, thread_slice_length_ak1);
+
+      constexpr ck::index_t thread_slice_length_gemm1n = kGemm1NPerBlock /
+          GemmOpConstantsBatchedForward::
+              B1BlockTransferThreadClusterLengths_BK0_N_BK1::At(I1);
+      constexpr ck::index_t kB1BlockTransferSrcScalarPerVector_max =
+          min(2, thread_slice_length_gemm1n);
+
+      constexpr ck::index_t thread_slice_length_cshuflle_n =
+          (kCShuffleNXdlPerWavePerShuffle * kGemm1NPerBlock /
+           kGemm1NXdlPerWave) /
+          GemmOpConstantsBatchedForward::
+              CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock ::
+                  At(I3);
+
+      constexpr ck::index_t kCShuffleBlockTransferScalarPerVector_max =
+          min(2, thread_slice_length_cshuflle_n);
+
+      if constexpr (
+          kB1BlockTransferSrcScalarPerVector_max >=
+          kCShuffleBlockTransferScalarPerVector_max) {
+        ALIGN_SWITCH_2(
+            kABBlockTransferSrcScalarPerVector_max,
+            kABBlockTransferSrcScalarPerVector,
+            param.K,
+            kB1BlockTransferSrcScalarPerVector_max,
+            kB1BlockTransferSrcScalarPerVector,
+            param.Kv,
+            [&] {
+              constexpr ck::index_t kCShuffleBlockTransferScalarPerVector =
+                  min(kB1BlockTransferSrcScalarPerVector,
+                      kCShuffleBlockTransferScalarPerVector_max);
+              using DeviceOpInstance = DeviceOpInstanceTemp<
+                  kGemm1NPerBlock,
+                  kGemm1NXdlPerWave,
+                  kCShuffleNXdlPerWavePerShuffle,
+                  kABBlockTransferSrcScalarPerVector,
+                  kB1BlockTransferSrcScalarPerVector,
+                  kCShuffleBlockTransferScalarPerVector>;
+
+              RunWithDeviceOp<DeviceOpInstance>(param, stream);
+            });
+      } else {
+        ALIGN_SWITCH_2(
+            kABBlockTransferSrcScalarPerVector_max,
+            kABBlockTransferSrcScalarPerVector,
+            param.K,
+            kCShuffleBlockTransferScalarPerVector_max,
+            kCShuffleBlockTransferScalarPerVector,
+            param.Kv,
+            [&] {
+              constexpr ck::index_t kB1BlockTransferSrcScalarPerVector =
+                  min(kCShuffleBlockTransferScalarPerVector,
+                      kB1BlockTransferSrcScalarPerVector_max);
+              using DeviceOpInstance = DeviceOpInstanceTemp<
+                  kGemm1NPerBlock,
+                  kGemm1NXdlPerWave,
+                  kCShuffleNXdlPerWavePerShuffle,
+                  kABBlockTransferSrcScalarPerVector,
+                  kB1BlockTransferSrcScalarPerVector,
+                  kCShuffleBlockTransferScalarPerVector>;
+
+              RunWithDeviceOp<DeviceOpInstance>(param, stream);
+            });
+      };
+    });
   };
 
   template <typename DeviceOpInstance>
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_forward_gemm_constants.h b/xformers/csrc/attention/hip_fmha/ck_fmha_forward_gemm_constants.h
index 673adbea8..ab72b87cf 100644
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_forward_gemm_constants.h
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_forward_gemm_constants.h
@@ -3,4 +3,104 @@
 #include <ck/ck.hpp>
 #include "ck_fmha_op_helper.h"
 
-struct GemmOpConstantsForward {};
+// list the template parameters that will not be tuned,
+// the commented lines gives the tunable template parameters
+struct GemmOpConstantsBatchedForward {
+  static constexpr ck::index_t NumGemmKPrefetchStage = 1;
+  static constexpr ck::index_t BlockSize = 256;
+  static constexpr ck::index_t MPerBlock = 128;
+  static constexpr ck::index_t NPerBlock = 128;
+  static constexpr ck::index_t KPerBlock = 32;
+  // static constexpr ck::index_t Gemm1NPerBlock;
+  static constexpr ck::index_t Gemm1KPerBlock = 32;
+  static constexpr ck::index_t AK1 = 8;
+  static constexpr ck::index_t BK1 = 8;
+  static constexpr ck::index_t B1K1 = 2;
+  static constexpr ck::index_t MPerXDL = 32;
+  static constexpr ck::index_t NPerXDL = 32;
+  static constexpr ck::index_t MXdlPerWave = 1;
+  static constexpr ck::index_t NXdlPerWave = 4;
+  // static constexpr ck::index_t Gemm1NXdlPerWave;
+  static constexpr ck::index_t DropoutStep = 1;
+  using ABlockTransferThreadClusterLengths_AK0_M_AK1 = S<4, 64, 1>;
+  using ABlockTransferThreadClusterArrangeOrder = S<1, 0, 2>;
+  using ABlockTransferSrcAccessOrder = S<1, 0, 2>;
+  static constexpr ck::index_t ABlockTransferSrcVectorDim = 2;
+  // static constexpr ck::index_t ABlockTransferSrcScalarPerVector;
+  static constexpr ck::index_t ABlockTransferDstScalarPerVector_AK1 = 8;
+  static constexpr bool ABlockLdsExtraM = true;
+  using BBlockTransferThreadClusterLengths_BK0_N_BK1 = S<4, 64, 1>;
+  using BBlockTransferThreadClusterArrangeOrder = S<1, 0, 2>;
+  using BBlockTransferSrcAccessOrder = S<1, 0, 2>;
+  static constexpr ck::index_t BBlockTransferSrcVectorDim = 2;
+  // static constexpr ck::index_t BBlockTransferSrcScalarPerVector;
+  static constexpr ck::index_t BBlockTransferDstScalarPerVector_BK1 = 8;
+  static constexpr bool BBlockLdsExtraN = true;
+  // static constexpr ck::index_t Acc0BiasTransferSrcScalarPerVector;
+  using B1BlockTransferThreadClusterLengths_BK0_N_BK1 = S<16, 16, 1>;
+  using B1BlockTransferThreadClusterArrangeOrder = S<0, 2, 1>;
+  using B1BlockTransferSrcAccessOrder = S<0, 2, 1>;
+  static constexpr ck::index_t B1BlockTransferSrcVectorDim = 1;
+  // static constexpr ck::index_t B1BlockTransferSrcScalarPerVector;
+  static constexpr ck::index_t B1BlockTransferDstScalarPerVector_BK1 = 2;
+  static constexpr bool B1BlockLdsExtraN = false;
+  static constexpr ck::index_t CShuffleMXdlPerWavePerShuffle = 1;
+  // static constexpr ck::index_t CShuffleNXdlPerWavePerShuffle;
+  using CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock =
+      S<1, 32, 1, 8>;
+  // static constexpr ck::index_t
+  // CShuffleBlockTransferScalarPerVector_NPerBlock;
+  static constexpr ck::index_t Acc1BiasTransferSrcScalarPerVector =
+      1; // not actually used by the kernel
+};
+
+// list the template parameters that will not be tuned,
+// the commented lines gives the tunable template parameters
+struct GemmOpConstantsGroupedForward {
+  static constexpr ck::index_t NumGemmKPrefetchStage = 1;
+  static constexpr ck::index_t BlockSize = 256;
+  static constexpr ck::index_t MPerBlock = 128;
+  static constexpr ck::index_t NPerBlock = 128;
+  static constexpr ck::index_t KPerBlock = 32;
+  // static constexpr ck::index_t Gemm1NPerBlock;
+  static constexpr ck::index_t Gemm1KPerBlock = 32;
+  static constexpr ck::index_t AK1 = 8;
+  static constexpr ck::index_t BK1 = 8;
+  static constexpr ck::index_t B1K1 = 2;
+  static constexpr ck::index_t MPerXDL = 32;
+  static constexpr ck::index_t NPerXDL = 32;
+  static constexpr ck::index_t MXdlPerWave = 1;
+  static constexpr ck::index_t NXdlPerWave = 4;
+  // static constexpr ck::index_t Gemm1NXdlPerWave;
+  static constexpr ck::index_t DropoutStep = 1;
+  using ABlockTransferThreadClusterLengths_AK0_M_AK1 = S<4, 64, 1>;
+  using ABlockTransferThreadClusterArrangeOrder = S<1, 0, 2>;
+  using ABlockTransferSrcAccessOrder = S<1, 0, 2>;
+  static constexpr ck::index_t ABlockTransferSrcVectorDim = 2;
+  // static constexpr ck::index_t ABlockTransferSrcScalarPerVector;
+  static constexpr ck::index_t ABlockTransferDstScalarPerVector_AK1 = 8;
+  static constexpr bool ABlockLdsExtraM = true;
+  using BBlockTransferThreadClusterLengths_BK0_N_BK1 = S<4, 64, 1>;
+  using BBlockTransferThreadClusterArrangeOrder = S<1, 0, 2>;
+  using BBlockTransferSrcAccessOrder = S<1, 0, 2>;
+  static constexpr ck::index_t BBlockTransferSrcVectorDim = 2;
+  // static constexpr ck::index_t BBlockTransferSrcScalarPerVector;
+  static constexpr ck::index_t BBlockTransferDstScalarPerVector_BK1 = 8;
+  static constexpr bool BBlockLdsExtraN = true;
+  // static constexpr ck::index_t Acc0BiasTransferSrcScalarPerVector;
+  using B1BlockTransferThreadClusterLengths_BK0_N_BK1 = S<8, 32, 1>;
+  using B1BlockTransferThreadClusterArrangeOrder = S<0, 2, 1>;
+  using B1BlockTransferSrcAccessOrder = S<0, 2, 1>;
+  static constexpr ck::index_t B1BlockTransferSrcVectorDim = 1;
+  // static constexpr ck::index_t B1BlockTransferSrcScalarPerVector;
+  static constexpr ck::index_t B1BlockTransferDstScalarPerVector_BK1 = 2;
+  static constexpr bool B1BlockLdsExtraN = false;
+  static constexpr ck::index_t CShuffleMXdlPerWavePerShuffle = 1;
+  // static constexpr ck::index_t CShuffleNXdlPerWavePerShuffle;
+  using CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock =
+      S<1, 32, 1, 8>;
+  // static constexpr ck::index_t
+  // CShuffleBlockTransferScalarPerVector_NPerBlock;
+  static constexpr ck::index_t Acc1BiasTransferSrcScalarPerVector =
+      1; // not actually used by the kernel
+};
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_forward.h b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_forward.h
index c1bb0d3a5..a61237014 100644
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_forward.h
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_forward.h
@@ -8,8 +8,12 @@
 #include <ck/tensor_operation/gpu/device/impl/device_grouped_mha_fwd_xdl_cshuffle_v2.hpp>
 #include <ck/tensor_operation/gpu/device/tensor_specialization.hpp>
 #include <ck/tensor_operation/gpu/element/element_wise_operation.hpp>
-#include <ck/utility/sequence.hpp>
+#include <ck/utility/math.hpp>
+#include <ck/utility/number.hpp>
 
+#include "ck_align_switch.h"
+#include "ck_fmha_common_gemm_constants.h"
+#include "ck_fmha_forward_gemm_constants.h"
 #include "ck_fmha_op_helper.h"
 #include "ck_fmha_params.h"
 
@@ -30,12 +34,6 @@ struct grouped_forward_masktype_attnbias_dispatched {
       typename std::conditional<has_attn_bias, scalar_t, void>::type;
   using Acc1BiasDataType = void;
 
-  static constexpr ck::index_t NumDimG = 2;
-  static constexpr ck::index_t NumDimM = 1;
-  static constexpr ck::index_t NumDimN = 1;
-  static constexpr ck::index_t NumDimK = 1;
-  static constexpr ck::index_t NumDimO = 1;
-
   using AElementOp = PassThrough;
   using B0ElementOp = PassThrough;
   using Acc0ElementOp = ck::tensor_operation::element_wise::Scale;
@@ -48,32 +46,44 @@ struct grouped_forward_masktype_attnbias_dispatched {
       static_cast<ck::tensor_operation::device::MaskingSpecialization>(
           custom_mask_type);
 
-  static constexpr auto TensorSpecA =
-      ck::tensor_operation::device::TensorSpecialization::Default;
-  static constexpr auto TensorSpecB0 =
-      ck::tensor_operation::device::TensorSpecialization::Default;
-  static constexpr auto TensorSpecB1 =
-      ck::tensor_operation::device::TensorSpecialization::Default;
-  static constexpr auto TensorSpecC =
-      ck::tensor_operation::device::TensorSpecialization::Default;
-
-  // Tunables
-  static constexpr ck::index_t kABBlockTransferSrcScalarPerVector = 1;
-  static constexpr ck::index_t kB1BlockTransferSrcScalarPerVector = 1;
-  static constexpr ck::index_t kCShuffleBlockTransferScalarPerVector = 1;
   static constexpr ck::index_t kAcc0BiasTransferSrcScalarPerVector = 1;
 
+#ifndef GROUPED_FORWARD_HEADDIM_SWITCH
+#define GROUPED_FORWARD_HEADDIM_SWITCH(HEAD_DIM1, HEAD_DIM2, ...) \
+  [&] {                                                           \
+    if (HEAD_DIM1 <= 32 && HEAD_DIM2 <= 32) {                     \
+      constexpr ck::index_t kGemm1NPerBlock = 32;                 \
+      constexpr ck::index_t kGemm1NXdlPerWave = 1;                \
+      constexpr ck::index_t kCShuffleNXdlPerWavePerShuffle = 1;   \
+      __VA_ARGS__();                                              \
+    } else if (HEAD_DIM1 <= 64 && HEAD_DIM2 <= 64) {              \
+      constexpr ck::index_t kGemm1NPerBlock = 64;                 \
+      constexpr ck::index_t kGemm1NXdlPerWave = 2;                \
+      constexpr ck::index_t kCShuffleNXdlPerWavePerShuffle = 2;   \
+      __VA_ARGS__();                                              \
+    } else {                                                      \
+      constexpr ck::index_t kGemm1NPerBlock = 128;                \
+      constexpr ck::index_t kGemm1NXdlPerWave = 4;                \
+      constexpr ck::index_t kCShuffleNXdlPerWavePerShuffle = 4;   \
+      __VA_ARGS__();                                              \
+    }                                                             \
+  }()
+#endif
+
   template <
       ck::index_t kGemm1NPerBlock,
       ck::index_t kGemm1NXdlPerWave,
-      ck::index_t kCShuffleNXdlPerWavePerShuffle>
+      ck::index_t kCShuffleNXdlPerWavePerShuffle,
+      ck::index_t kABBlockTransferSrcScalarPerVector,
+      ck::index_t kB1BlockTransferSrcScalarPerVector,
+      ck::index_t kCShuffleBlockTransferScalarPerVector>
   using DeviceOpInstanceTemp = ck::tensor_operation::device::
       DeviceGroupedMultiheadAttentionForward_Xdl_CShuffle_V2<
-          NumDimG,
-          NumDimM,
-          NumDimN,
-          NumDimK,
-          NumDimO,
+          GemmOpConstantsCommon::NumDimG,
+          GemmOpConstantsCommon::NumDimM,
+          GemmOpConstantsCommon::NumDimN,
+          GemmOpConstantsCommon::NumDimK,
+          GemmOpConstantsCommon::NumDimO,
           ADataType,
           B0DataType,
           B1DataType,
@@ -91,93 +101,150 @@ struct grouped_forward_masktype_attnbias_dispatched {
           B1ElementOp,
           CElementOp,
           GemmSpec,
-          TensorSpecA,
-          TensorSpecB0,
-          TensorSpecB1,
-          TensorSpecC,
-          1,
-          256,
-          128, // MPerBlock
-          128, // NPerBlock
-          32, // KPerBlock
+          GemmOpConstantsCommon::TensorSpecA,
+          GemmOpConstantsCommon::TensorSpecB0,
+          GemmOpConstantsCommon::TensorSpecB1,
+          GemmOpConstantsCommon::TensorSpecC,
+          GemmOpConstantsGroupedForward::NumGemmKPrefetchStage,
+          GemmOpConstantsGroupedForward::BlockSize,
+          GemmOpConstantsGroupedForward::MPerBlock,
+          GemmOpConstantsGroupedForward::NPerBlock,
+          GemmOpConstantsGroupedForward::KPerBlock,
           kGemm1NPerBlock,
-          32, // Gemm1KPerBlock
-          8, // AK1
-          8, // BK1
-          2, // B1K1
-          32, // MPerXDL
-          32, // NPerXDL
-          1, // MXdlPerWave
-          4, // NXdlPerWave
+          GemmOpConstantsGroupedForward::Gemm1KPerBlock,
+          GemmOpConstantsGroupedForward::AK1,
+          GemmOpConstantsGroupedForward::BK1,
+          GemmOpConstantsGroupedForward::B1K1,
+          GemmOpConstantsGroupedForward::MPerXDL,
+          GemmOpConstantsGroupedForward::NPerXDL,
+          GemmOpConstantsGroupedForward::MXdlPerWave,
+          GemmOpConstantsGroupedForward::NXdlPerWave,
           kGemm1NXdlPerWave,
-          1, // DropoutStep
-          S<4, 64, 1>, // ABlockTransfer
-          S<1, 0, 2>,
-          S<1, 0, 2>,
-          2,
-          kABBlockTransferSrcScalarPerVector, // TUNABLE
-          8,
-          true,
-          S<4, 64, 1>, // BBlockTransfer
-          S<1, 0, 2>,
-          S<1, 0, 2>,
-          2,
-          kABBlockTransferSrcScalarPerVector, // TUNABLE
-          8,
-          true,
+          GemmOpConstantsGroupedForward::DropoutStep,
+          GemmOpConstantsGroupedForward::
+              ABlockTransferThreadClusterLengths_AK0_M_AK1,
+          GemmOpConstantsGroupedForward::
+              ABlockTransferThreadClusterArrangeOrder,
+          GemmOpConstantsGroupedForward::ABlockTransferSrcAccessOrder,
+          GemmOpConstantsGroupedForward::ABlockTransferSrcVectorDim,
+          kABBlockTransferSrcScalarPerVector,
+          GemmOpConstantsGroupedForward::ABlockTransferDstScalarPerVector_AK1,
+          GemmOpConstantsGroupedForward::ABlockLdsExtraM,
+          GemmOpConstantsGroupedForward::
+              BBlockTransferThreadClusterLengths_BK0_N_BK1,
+          GemmOpConstantsGroupedForward::
+              BBlockTransferThreadClusterArrangeOrder,
+          GemmOpConstantsGroupedForward::BBlockTransferSrcAccessOrder,
+          GemmOpConstantsGroupedForward::BBlockTransferSrcVectorDim,
+          kABBlockTransferSrcScalarPerVector,
+          GemmOpConstantsGroupedForward::BBlockTransferDstScalarPerVector_BK1,
+          GemmOpConstantsGroupedForward::BBlockLdsExtraN,
           kAcc0BiasTransferSrcScalarPerVector,
-          S<8, 32, 1>, // B1BlockTransfer
-          S<0, 2, 1>,
-          S<0, 2, 1>,
-          1,
-          kB1BlockTransferSrcScalarPerVector, // TUNABLE
-          2,
-          false,
-          1, // CShuffleMXdlPerWavePerShuffle
+          GemmOpConstantsGroupedForward::
+              B1BlockTransferThreadClusterLengths_BK0_N_BK1,
+          GemmOpConstantsGroupedForward::
+              B1BlockTransferThreadClusterArrangeOrder,
+          GemmOpConstantsGroupedForward::B1BlockTransferSrcAccessOrder,
+          GemmOpConstantsGroupedForward::B1BlockTransferSrcVectorDim,
+          kB1BlockTransferSrcScalarPerVector,
+          GemmOpConstantsGroupedForward::B1BlockTransferDstScalarPerVector_BK1,
+          GemmOpConstantsGroupedForward::B1BlockLdsExtraN,
+          GemmOpConstantsGroupedForward::CShuffleMXdlPerWavePerShuffle,
           kCShuffleNXdlPerWavePerShuffle,
-          S<1,
-            32,
-            1,
-            8>, // CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock
-          kCShuffleBlockTransferScalarPerVector, // TUNABLE
-          1,
-          MaskingSpec>; // MaskingSpecialization
+          GemmOpConstantsGroupedForward::
+              CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock,
+          kCShuffleBlockTransferScalarPerVector,
+          GemmOpConstantsGroupedForward::Acc1BiasTransferSrcScalarPerVector,
+          MaskingSpec>;
 
-  static void Run(GroupedForwardParams& param, hipStream_t stream) {
-    if (param.K <= 32 && param.Kv <= 32) {
-      constexpr ck::index_t kGemm1NPerBlock = 32;
-      constexpr ck::index_t kGemm1NXdlPerWave = 1;
-      constexpr ck::index_t kCShuffleNXdlPerWavePerShuffle = 1;
+  static constexpr auto I1 = ck::Number<1>{};
+  static constexpr auto I2 = ck::Number<2>{};
+  static constexpr auto I3 = ck::Number<3>{};
 
-      using DeviceOpInstance = DeviceOpInstanceTemp<
-          kGemm1NPerBlock,
-          kGemm1NXdlPerWave,
-          kCShuffleNXdlPerWavePerShuffle>;
-
-      RunWithDeviceOp<DeviceOpInstance>(param, stream);
-    } else if (param.K <= 64 && param.Kv <= 64) {
-      constexpr ck::index_t kGemm1NPerBlock = 64;
-      constexpr ck::index_t kGemm1NXdlPerWave = 2;
-      constexpr ck::index_t kCShuffleNXdlPerWavePerShuffle = 2;
-
-      using DeviceOpInstance = DeviceOpInstanceTemp<
-          kGemm1NPerBlock,
-          kGemm1NXdlPerWave,
-          kCShuffleNXdlPerWavePerShuffle>;
-
-      RunWithDeviceOp<DeviceOpInstance>(param, stream);
-    } else {
-      constexpr ck::index_t kGemm1NPerBlock = 128;
-      constexpr ck::index_t kGemm1NXdlPerWave = 4;
-      constexpr ck::index_t kCShuffleNXdlPerWavePerShuffle = 4;
-
-      using DeviceOpInstance = DeviceOpInstanceTemp<
-          kGemm1NPerBlock,
-          kGemm1NXdlPerWave,
-          kCShuffleNXdlPerWavePerShuffle>;
-
-      RunWithDeviceOp<DeviceOpInstance>(param, stream);
-    };
+  static void Run(GroupedForwardParams& param, hipStream_t stream) {
+    using ck::math::min;
+
+    GROUPED_FORWARD_HEADDIM_SWITCH(param.K, param.Kv, [&] {
+      constexpr ck::index_t thread_slice_length_ak1 =
+          GemmOpConstantsBatchedForward::AK1 /
+          GemmOpConstantsBatchedForward::
+              ABlockTransferThreadClusterLengths_AK0_M_AK1::At(I2);
+      constexpr ck::index_t thread_slice_length_bk1 =
+          GemmOpConstantsBatchedForward::BK1 /
+          GemmOpConstantsBatchedForward::
+              BBlockTransferThreadClusterLengths_BK0_N_BK1::At(I2);
+
+      static_assert(
+          thread_slice_length_ak1 == thread_slice_length_bk1,
+          "ABlockTransfer and BBlockTransfer should use completely same K1 sizes and ThreadClusterLengths!");
+
+      constexpr ck::index_t kABBlockTransferSrcScalarPerVector_max =
+          min(2, thread_slice_length_ak1);
+
+      constexpr ck::index_t thread_slice_length_gemm1n = kGemm1NPerBlock /
+          GemmOpConstantsBatchedForward::
+              B1BlockTransferThreadClusterLengths_BK0_N_BK1::At(I1);
+      constexpr ck::index_t kB1BlockTransferSrcScalarPerVector_max =
+          min(2, thread_slice_length_gemm1n);
+
+      constexpr ck::index_t thread_slice_length_cshuflle_n =
+          (kCShuffleNXdlPerWavePerShuffle * kGemm1NPerBlock /
+           kGemm1NXdlPerWave) /
+          GemmOpConstantsBatchedForward::
+              CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock ::
+                  At(I3);
+
+      constexpr ck::index_t kCShuffleBlockTransferScalarPerVector_max =
+          min(2, thread_slice_length_cshuflle_n);
+
+      if constexpr (
+          kB1BlockTransferSrcScalarPerVector_max >=
+          kCShuffleBlockTransferScalarPerVector_max) {
+        ALIGN_SWITCH_2(
+            kABBlockTransferSrcScalarPerVector_max,
+            kABBlockTransferSrcScalarPerVector,
+            param.K,
+            kB1BlockTransferSrcScalarPerVector_max,
+            kB1BlockTransferSrcScalarPerVector,
+            param.Kv,
+            [&] {
+              constexpr ck::index_t kCShuffleBlockTransferScalarPerVector =
+                  min(kB1BlockTransferSrcScalarPerVector,
+                      kCShuffleBlockTransferScalarPerVector_max);
+              using DeviceOpInstance = DeviceOpInstanceTemp<
+                  kGemm1NPerBlock,
+                  kGemm1NXdlPerWave,
+                  kCShuffleNXdlPerWavePerShuffle,
+                  kABBlockTransferSrcScalarPerVector,
+                  kB1BlockTransferSrcScalarPerVector,
+                  kCShuffleBlockTransferScalarPerVector>;
+
+              RunWithDeviceOp<DeviceOpInstance>(param, stream);
+            });
+      } else {
+        ALIGN_SWITCH_2(
+            kABBlockTransferSrcScalarPerVector_max,
+            kABBlockTransferSrcScalarPerVector,
+            param.K,
+            kCShuffleBlockTransferScalarPerVector_max,
+            kCShuffleBlockTransferScalarPerVector,
+            param.Kv,
+            [&] {
+              constexpr ck::index_t kB1BlockTransferSrcScalarPerVector =
+                  min(kCShuffleBlockTransferScalarPerVector,
+                      kB1BlockTransferSrcScalarPerVector_max);
+              using DeviceOpInstance = DeviceOpInstanceTemp<
+                  kGemm1NPerBlock,
+                  kGemm1NXdlPerWave,
+                  kCShuffleNXdlPerWavePerShuffle,
+                  kABBlockTransferSrcScalarPerVector,
+                  kB1BlockTransferSrcScalarPerVector,
+                  kCShuffleBlockTransferScalarPerVector>;
+
+              RunWithDeviceOp<DeviceOpInstance>(param, stream);
+            });
+      };
+    });
   };
 
   template <typename DeviceOpInstance>

From c3270c4e40e733b138223c453c6c2bc54f7b1d60 Mon Sep 17 00:00:00 2001
From: Qianfeng Zhang <Qianfeng.Zhang@amd.com>
Date: Fri, 27 Oct 2023 11:54:54 +0000
Subject: [PATCH 120/641] Use compile-time checking(constexpr) to reduce the
 number of compiled instances in inference

---
 .../hip_fmha/ck_fmha_batched_infer.h          | 216 ++++++-----------
 .../hip_fmha/ck_fmha_grouped_infer.h          | 217 ++++++------------
 2 files changed, 144 insertions(+), 289 deletions(-)

diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_infer.h b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_infer.h
index 7794b5ee0..6fddd553c 100644
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_infer.h
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_infer.h
@@ -5,11 +5,11 @@
 
 #include <ck/ck.hpp>
 #include <ck/tensor_operation/gpu/device/gemm_specialization.hpp>
+#include <ck/tensor_operation/gpu/device/impl/device_batched_mha_infer_xdl_cshuffle.hpp>
 #include <ck/tensor_operation/gpu/device/tensor_specialization.hpp>
 #include <ck/tensor_operation/gpu/element/element_wise_operation.hpp>
 #include <ck/utility/math.hpp>
 #include <ck/utility/number.hpp>
-#include "ck/tensor_operation/gpu/device/impl/device_batched_mha_infer_xdl_cshuffle.hpp"
 
 #include "ck_align_switch.h"
 #include "ck_fmha_common_gemm_constants.h"
@@ -48,6 +48,28 @@ struct batched_infer_masktype_attnbias_dispatched {
 
   static constexpr ck::index_t kAcc0BiasTransferSrcScalarPerVector = 1;
 
+#ifndef BATCHED_INFER_HEADDIM_SWITCH
+#define BATCHED_INFER_HEADDIM_SWITCH(HEAD_DIM1, HEAD_DIM2, ...) \
+  [&] {                                                         \
+    if (HEAD_DIM1 <= 32 && HEAD_DIM2 <= 32) {                   \
+      constexpr ck::index_t kGemm1NPerBlock = 32;               \
+      constexpr ck::index_t kGemm1NXdlPerWave = 1;              \
+      constexpr ck::index_t kCShuffleNXdlPerWavePerShuffle = 1; \
+      __VA_ARGS__();                                            \
+    } else if (HEAD_DIM1 <= 64 && HEAD_DIM2 <= 64) {            \
+      constexpr ck::index_t kGemm1NPerBlock = 64;               \
+      constexpr ck::index_t kGemm1NXdlPerWave = 2;              \
+      constexpr ck::index_t kCShuffleNXdlPerWavePerShuffle = 2; \
+      __VA_ARGS__();                                            \
+    } else {                                                    \
+      constexpr ck::index_t kGemm1NPerBlock = 128;              \
+      constexpr ck::index_t kGemm1NXdlPerWave = 4;              \
+      constexpr ck::index_t kCShuffleNXdlPerWavePerShuffle = 4; \
+      __VA_ARGS__();                                            \
+    }                                                           \
+  }()
+#endif
+
   template <
       ck::index_t kGemm1NPerBlock,
       ck::index_t kGemm1NXdlPerWave,
@@ -134,69 +156,7 @@ struct batched_infer_masktype_attnbias_dispatched {
   static void Run(BatchedForwardParams& param, hipStream_t stream) {
     using ck::math::min;
 
-    if (param.K <= 32 && param.Kv <= 32) {
-      constexpr ck::index_t kGemm1NPerBlock = 32;
-      constexpr ck::index_t kGemm1NXdlPerWave = 1;
-      constexpr ck::index_t kCShuffleNXdlPerWavePerShuffle = 1;
-
-      constexpr ck::index_t thread_slice_length_ak1 =
-          GemmOpConstantsBatchedInfer::AK1 /
-          GemmOpConstantsBatchedInfer::
-              ABlockTransferThreadClusterLengths_AK0_M_AK1::At(I2);
-      constexpr ck::index_t thread_slice_length_bk1 =
-          GemmOpConstantsBatchedInfer::BK1 /
-          GemmOpConstantsBatchedInfer::
-              BBlockTransferThreadClusterLengths_BK0_N_BK1::At(I2);
-
-      static_assert(
-          thread_slice_length_ak1 == thread_slice_length_bk1,
-          "ABlockTransfer and BBlockTransfer should use completely same K1 sizes and ThreadClusterLengths!");
-
-      constexpr ck::index_t kABBlockTransferSrcScalarPerVector_max =
-          min(4, thread_slice_length_ak1);
-
-      constexpr ck::index_t thread_slice_length_gemm1n = kGemm1NPerBlock /
-          GemmOpConstantsBatchedInfer::
-              B1BlockTransferThreadClusterLengths_BK0_N_BK1::At(I1);
-      constexpr ck::index_t kB1BlockTransferSrcScalarPerVector_max =
-          min(4, thread_slice_length_gemm1n);
-
-      constexpr ck::index_t thread_slice_length_cshuflle_n =
-          (kCShuffleNXdlPerWavePerShuffle * kGemm1NPerBlock /
-           kGemm1NXdlPerWave) /
-          GemmOpConstantsBatchedInfer::
-              CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock ::
-                  At(I3);
-
-      constexpr ck::index_t kCShuffleBlockTransferScalarPerVector_max =
-          min(2, thread_slice_length_cshuflle_n);
-
-      ALIGN_SWITCH_3(
-          kABBlockTransferSrcScalarPerVector_max,
-          kABBlockTransferSrcScalarPerVector,
-          param.K,
-          kB1BlockTransferSrcScalarPerVector_max,
-          kB1BlockTransferSrcScalarPerVector,
-          param.Kv,
-          kCShuffleBlockTransferScalarPerVector_max,
-          kCShuffleBlockTransferScalarPerVector,
-          param.Kv,
-          [&] {
-            using DeviceOpInstance = DeviceOpInstanceTemp<
-                kGemm1NPerBlock,
-                kGemm1NXdlPerWave,
-                kCShuffleNXdlPerWavePerShuffle,
-                kABBlockTransferSrcScalarPerVector,
-                kB1BlockTransferSrcScalarPerVector,
-                kCShuffleBlockTransferScalarPerVector>;
-
-            RunWithDeviceOp<DeviceOpInstance>(param, stream);
-          });
-    } else if (param.K <= 64 && param.Kv <= 64) {
-      constexpr ck::index_t kGemm1NPerBlock = 64;
-      constexpr ck::index_t kGemm1NXdlPerWave = 2;
-      constexpr ck::index_t kCShuffleNXdlPerWavePerShuffle = 2;
-
+    BATCHED_INFER_HEADDIM_SWITCH(param.K, param.Kv, [&] {
       constexpr ck::index_t thread_slice_length_ak1 =
           GemmOpConstantsBatchedInfer::AK1 /
           GemmOpConstantsBatchedInfer::
@@ -229,86 +189,54 @@ struct batched_infer_masktype_attnbias_dispatched {
       constexpr ck::index_t kCShuffleBlockTransferScalarPerVector_max =
           min(2, thread_slice_length_cshuflle_n);
 
-      ALIGN_SWITCH_3(
-          kABBlockTransferSrcScalarPerVector_max,
-          kABBlockTransferSrcScalarPerVector,
-          param.K,
-          kB1BlockTransferSrcScalarPerVector_max,
-          kB1BlockTransferSrcScalarPerVector,
-          param.Kv,
-          kCShuffleBlockTransferScalarPerVector_max,
-          kCShuffleBlockTransferScalarPerVector,
-          param.Kv,
-          [&] {
-            using DeviceOpInstance = DeviceOpInstanceTemp<
-                kGemm1NPerBlock,
-                kGemm1NXdlPerWave,
-                kCShuffleNXdlPerWavePerShuffle,
-                kABBlockTransferSrcScalarPerVector,
-                kB1BlockTransferSrcScalarPerVector,
-                kCShuffleBlockTransferScalarPerVector>;
-
-            RunWithDeviceOp<DeviceOpInstance>(param, stream);
-          });
-    } else {
-      constexpr ck::index_t kGemm1NPerBlock = 128;
-      constexpr ck::index_t kGemm1NXdlPerWave = 4;
-      constexpr ck::index_t kCShuffleNXdlPerWavePerShuffle = 4;
-
-      constexpr ck::index_t thread_slice_length_ak1 =
-          GemmOpConstantsBatchedInfer::AK1 /
-          GemmOpConstantsBatchedInfer::
-              ABlockTransferThreadClusterLengths_AK0_M_AK1::At(I2);
-      constexpr ck::index_t thread_slice_length_bk1 =
-          GemmOpConstantsBatchedInfer::BK1 /
-          GemmOpConstantsBatchedInfer::
-              BBlockTransferThreadClusterLengths_BK0_N_BK1::At(I2);
-
-      static_assert(
-          thread_slice_length_ak1 == thread_slice_length_bk1,
-          "ABlockTransfer and BBlockTransfer should use completely same K1 sizes and ThreadClusterLengths!");
-
-      constexpr ck::index_t kABBlockTransferSrcScalarPerVector_max =
-          min(4, thread_slice_length_ak1);
-
-      constexpr ck::index_t thread_slice_length_gemm1n = kGemm1NPerBlock /
-          GemmOpConstantsBatchedInfer::
-              B1BlockTransferThreadClusterLengths_BK0_N_BK1::At(I1);
-      constexpr ck::index_t kB1BlockTransferSrcScalarPerVector_max =
-          min(4, thread_slice_length_gemm1n);
-
-      constexpr ck::index_t thread_slice_length_cshuflle_n =
-          (kCShuffleNXdlPerWavePerShuffle * kGemm1NPerBlock /
-           kGemm1NXdlPerWave) /
-          GemmOpConstantsBatchedInfer::
-              CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock ::
-                  At(I3);
-
-      constexpr ck::index_t kCShuffleBlockTransferScalarPerVector_max =
-          min(2, thread_slice_length_cshuflle_n);
-
-      ALIGN_SWITCH_3(
-          kABBlockTransferSrcScalarPerVector_max,
-          kABBlockTransferSrcScalarPerVector,
-          param.K,
-          kB1BlockTransferSrcScalarPerVector_max,
-          kB1BlockTransferSrcScalarPerVector,
-          param.Kv,
-          kCShuffleBlockTransferScalarPerVector_max,
-          kCShuffleBlockTransferScalarPerVector,
-          param.Kv,
-          [&] {
-            using DeviceOpInstance = DeviceOpInstanceTemp<
-                kGemm1NPerBlock,
-                kGemm1NXdlPerWave,
-                kCShuffleNXdlPerWavePerShuffle,
-                kABBlockTransferSrcScalarPerVector,
-                kB1BlockTransferSrcScalarPerVector,
-                kCShuffleBlockTransferScalarPerVector>;
-
-            RunWithDeviceOp<DeviceOpInstance>(param, stream);
-          });
-    }
+      if constexpr (
+          kB1BlockTransferSrcScalarPerVector_max >=
+          kCShuffleBlockTransferScalarPerVector_max) {
+        ALIGN_SWITCH_2(
+            kABBlockTransferSrcScalarPerVector_max,
+            kABBlockTransferSrcScalarPerVector,
+            param.K,
+            kB1BlockTransferSrcScalarPerVector_max,
+            kB1BlockTransferSrcScalarPerVector,
+            param.Kv,
+            [&] {
+              constexpr ck::index_t kCShuffleBlockTransferScalarPerVector =
+                  min(kB1BlockTransferSrcScalarPerVector,
+                      kCShuffleBlockTransferScalarPerVector_max);
+              using DeviceOpInstance = DeviceOpInstanceTemp<
+                  kGemm1NPerBlock,
+                  kGemm1NXdlPerWave,
+                  kCShuffleNXdlPerWavePerShuffle,
+                  kABBlockTransferSrcScalarPerVector,
+                  kB1BlockTransferSrcScalarPerVector,
+                  kCShuffleBlockTransferScalarPerVector>;
+
+              RunWithDeviceOp<DeviceOpInstance>(param, stream);
+            });
+      } else {
+        ALIGN_SWITCH_2(
+            kABBlockTransferSrcScalarPerVector_max,
+            kABBlockTransferSrcScalarPerVector,
+            param.K,
+            kCShuffleBlockTransferScalarPerVector_max,
+            kCShuffleBlockTransferScalarPerVector,
+            param.Kv,
+            [&] {
+              constexpr ck::index_t kB1BlockTransferSrcScalarPerVector =
+                  min(kCShuffleBlockTransferScalarPerVector,
+                      kB1BlockTransferSrcScalarPerVector_max);
+              using DeviceOpInstance = DeviceOpInstanceTemp<
+                  kGemm1NPerBlock,
+                  kGemm1NXdlPerWave,
+                  kCShuffleNXdlPerWavePerShuffle,
+                  kABBlockTransferSrcScalarPerVector,
+                  kB1BlockTransferSrcScalarPerVector,
+                  kCShuffleBlockTransferScalarPerVector>;
+
+              RunWithDeviceOp<DeviceOpInstance>(param, stream);
+            });
+      };
+    });
   };
 
   template <typename DeviceOpInstance>
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_infer.h b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_infer.h
index 579841b57..c68a0142a 100644
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_infer.h
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_infer.h
@@ -5,12 +5,11 @@
 
 #include <ck/ck.hpp>
 #include <ck/tensor_operation/gpu/device/gemm_specialization.hpp>
+#include <ck/tensor_operation/gpu/device/impl/device_grouped_mha_infer_xdl_cshuffle.hpp>
 #include <ck/tensor_operation/gpu/device/tensor_specialization.hpp>
 #include <ck/tensor_operation/gpu/element/element_wise_operation.hpp>
 #include <ck/utility/math.hpp>
 #include <ck/utility/number.hpp>
-#include <ck/utility/sequence.hpp>
-#include "ck/tensor_operation/gpu/device/impl/device_grouped_mha_infer_xdl_cshuffle.hpp"
 
 #include "ck_align_switch.h"
 #include "ck_fmha_common_gemm_constants.h"
@@ -49,6 +48,28 @@ struct grouped_infer_masktype_attnbias_dispatched {
 
   static constexpr ck::index_t kAcc0BiasTransferSrcScalarPerVector = 1;
 
+#ifndef GROUPED_INFER_HEADDIM_SWITCH
+#define GROUPED_INFER_HEADDIM_SWITCH(HEAD_DIM1, HEAD_DIM2, ...) \
+  [&] {                                                         \
+    if (HEAD_DIM1 <= 32 && HEAD_DIM2 <= 32) {                   \
+      constexpr ck::index_t kGemm1NPerBlock = 32;               \
+      constexpr ck::index_t kGemm1NXdlPerWave = 1;              \
+      constexpr ck::index_t kCShuffleNXdlPerWavePerShuffle = 1; \
+      __VA_ARGS__();                                            \
+    } else if (HEAD_DIM1 <= 64 && HEAD_DIM2 <= 64) {            \
+      constexpr ck::index_t kGemm1NPerBlock = 64;               \
+      constexpr ck::index_t kGemm1NXdlPerWave = 2;              \
+      constexpr ck::index_t kCShuffleNXdlPerWavePerShuffle = 2; \
+      __VA_ARGS__();                                            \
+    } else {                                                    \
+      constexpr ck::index_t kGemm1NPerBlock = 128;              \
+      constexpr ck::index_t kGemm1NXdlPerWave = 4;              \
+      constexpr ck::index_t kCShuffleNXdlPerWavePerShuffle = 4; \
+      __VA_ARGS__();                                            \
+    }                                                           \
+  }()
+#endif
+
   template <
       ck::index_t kGemm1NPerBlock,
       ck::index_t kGemm1NXdlPerWave,
@@ -135,69 +156,7 @@ struct grouped_infer_masktype_attnbias_dispatched {
   static void Run(GroupedForwardParams& param, hipStream_t stream) {
     using ck::math::min;
 
-    if (param.K <= 32 && param.Kv <= 32) {
-      constexpr ck::index_t kGemm1NPerBlock = 32;
-      constexpr ck::index_t kGemm1NXdlPerWave = 1;
-      constexpr ck::index_t kCShuffleNXdlPerWavePerShuffle = 1;
-
-      constexpr ck::index_t thread_slice_length_ak1 =
-          GemmOpConstantsGroupedInfer::AK1 /
-          GemmOpConstantsGroupedInfer::
-              ABlockTransferThreadClusterLengths_AK0_M_AK1::At(I2);
-      constexpr ck::index_t thread_slice_length_bk1 =
-          GemmOpConstantsGroupedInfer::BK1 /
-          GemmOpConstantsGroupedInfer::
-              BBlockTransferThreadClusterLengths_BK0_N_BK1::At(I2);
-
-      static_assert(
-          thread_slice_length_ak1 == thread_slice_length_bk1,
-          "ABlockTransfer and BBlockTransfer should use completely same K1 sizes and ThreadClusterLengths!");
-
-      constexpr ck::index_t kABBlockTransferSrcScalarPerVector_max =
-          min(4, thread_slice_length_ak1);
-
-      constexpr ck::index_t thread_slice_length_gemm1n = kGemm1NPerBlock /
-          GemmOpConstantsGroupedInfer::
-              B1BlockTransferThreadClusterLengths_BK0_N_BK1::At(I1);
-      constexpr ck::index_t kB1BlockTransferSrcScalarPerVector_max =
-          min(4, thread_slice_length_gemm1n);
-
-      constexpr ck::index_t thread_slice_length_cshuflle_n =
-          (kCShuffleNXdlPerWavePerShuffle * kGemm1NPerBlock /
-           kGemm1NXdlPerWave) /
-          GemmOpConstantsGroupedInfer::
-              CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock ::
-                  At(I3);
-
-      constexpr ck::index_t kCShuffleBlockTransferScalarPerVector_max =
-          min(2, thread_slice_length_cshuflle_n);
-
-      ALIGN_SWITCH_3(
-          kABBlockTransferSrcScalarPerVector_max,
-          kABBlockTransferSrcScalarPerVector,
-          param.K,
-          kB1BlockTransferSrcScalarPerVector_max,
-          kB1BlockTransferSrcScalarPerVector,
-          param.Kv,
-          kCShuffleBlockTransferScalarPerVector_max,
-          kCShuffleBlockTransferScalarPerVector,
-          param.Kv,
-          [&] {
-            using DeviceOpInstance = DeviceOpInstanceTemp<
-                kGemm1NPerBlock,
-                kGemm1NXdlPerWave,
-                kCShuffleNXdlPerWavePerShuffle,
-                kABBlockTransferSrcScalarPerVector,
-                kB1BlockTransferSrcScalarPerVector,
-                kCShuffleBlockTransferScalarPerVector>;
-
-            RunWithDeviceOp<DeviceOpInstance>(param, stream);
-          });
-    } else if (param.K <= 64 && param.Kv <= 64) {
-      constexpr ck::index_t kGemm1NPerBlock = 64;
-      constexpr ck::index_t kGemm1NXdlPerWave = 2;
-      constexpr ck::index_t kCShuffleNXdlPerWavePerShuffle = 2;
-
+    GROUPED_INFER_HEADDIM_SWITCH(param.K, param.Kv, [&] {
       constexpr ck::index_t thread_slice_length_ak1 =
           GemmOpConstantsGroupedInfer::AK1 /
           GemmOpConstantsGroupedInfer::
@@ -230,86 +189,54 @@ struct grouped_infer_masktype_attnbias_dispatched {
       constexpr ck::index_t kCShuffleBlockTransferScalarPerVector_max =
           min(2, thread_slice_length_cshuflle_n);
 
-      ALIGN_SWITCH_3(
-          kABBlockTransferSrcScalarPerVector_max,
-          kABBlockTransferSrcScalarPerVector,
-          param.K,
-          kB1BlockTransferSrcScalarPerVector_max,
-          kB1BlockTransferSrcScalarPerVector,
-          param.Kv,
-          kCShuffleBlockTransferScalarPerVector_max,
-          kCShuffleBlockTransferScalarPerVector,
-          param.Kv,
-          [&] {
-            using DeviceOpInstance = DeviceOpInstanceTemp<
-                kGemm1NPerBlock,
-                kGemm1NXdlPerWave,
-                kCShuffleNXdlPerWavePerShuffle,
-                kABBlockTransferSrcScalarPerVector,
-                kB1BlockTransferSrcScalarPerVector,
-                kCShuffleBlockTransferScalarPerVector>;
-
-            RunWithDeviceOp<DeviceOpInstance>(param, stream);
-          });
-    } else {
-      constexpr ck::index_t kGemm1NPerBlock = 128;
-      constexpr ck::index_t kGemm1NXdlPerWave = 4;
-      constexpr ck::index_t kCShuffleNXdlPerWavePerShuffle = 4;
-
-      constexpr ck::index_t thread_slice_length_ak1 =
-          GemmOpConstantsGroupedInfer::AK1 /
-          GemmOpConstantsGroupedInfer::
-              ABlockTransferThreadClusterLengths_AK0_M_AK1::At(I2);
-      constexpr ck::index_t thread_slice_length_bk1 =
-          GemmOpConstantsGroupedInfer::BK1 /
-          GemmOpConstantsGroupedInfer::
-              BBlockTransferThreadClusterLengths_BK0_N_BK1::At(I2);
-
-      static_assert(
-          thread_slice_length_ak1 == thread_slice_length_bk1,
-          "ABlockTransfer and BBlockTransfer should use completely same K1 sizes and ThreadClusterLengths!");
-
-      constexpr ck::index_t kABBlockTransferSrcScalarPerVector_max =
-          min(4, thread_slice_length_ak1);
-
-      constexpr ck::index_t thread_slice_length_gemm1n = kGemm1NPerBlock /
-          GemmOpConstantsGroupedInfer::
-              B1BlockTransferThreadClusterLengths_BK0_N_BK1::At(I1);
-      constexpr ck::index_t kB1BlockTransferSrcScalarPerVector_max =
-          min(4, thread_slice_length_gemm1n);
-
-      constexpr ck::index_t thread_slice_length_cshuflle_n =
-          (kCShuffleNXdlPerWavePerShuffle * kGemm1NPerBlock /
-           kGemm1NXdlPerWave) /
-          GemmOpConstantsGroupedInfer::
-              CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock ::
-                  At(I3);
-
-      constexpr ck::index_t kCShuffleBlockTransferScalarPerVector_max =
-          min(2, thread_slice_length_cshuflle_n);
-
-      ALIGN_SWITCH_3(
-          kABBlockTransferSrcScalarPerVector_max,
-          kABBlockTransferSrcScalarPerVector,
-          param.K,
-          kB1BlockTransferSrcScalarPerVector_max,
-          kB1BlockTransferSrcScalarPerVector,
-          param.Kv,
-          kCShuffleBlockTransferScalarPerVector_max,
-          kCShuffleBlockTransferScalarPerVector,
-          param.Kv,
-          [&] {
-            using DeviceOpInstance = DeviceOpInstanceTemp<
-                kGemm1NPerBlock,
-                kGemm1NXdlPerWave,
-                kCShuffleNXdlPerWavePerShuffle,
-                kABBlockTransferSrcScalarPerVector,
-                kB1BlockTransferSrcScalarPerVector,
-                kCShuffleBlockTransferScalarPerVector>;
-
-            RunWithDeviceOp<DeviceOpInstance>(param, stream);
-          });
-    };
+      if constexpr (
+          kB1BlockTransferSrcScalarPerVector_max >=
+          kCShuffleBlockTransferScalarPerVector_max) {
+        ALIGN_SWITCH_2(
+            kABBlockTransferSrcScalarPerVector_max,
+            kABBlockTransferSrcScalarPerVector,
+            param.K,
+            kB1BlockTransferSrcScalarPerVector_max,
+            kB1BlockTransferSrcScalarPerVector,
+            param.Kv,
+            [&] {
+              constexpr ck::index_t kCShuffleBlockTransferScalarPerVector =
+                  min(kB1BlockTransferSrcScalarPerVector,
+                      kCShuffleBlockTransferScalarPerVector_max);
+              using DeviceOpInstance = DeviceOpInstanceTemp<
+                  kGemm1NPerBlock,
+                  kGemm1NXdlPerWave,
+                  kCShuffleNXdlPerWavePerShuffle,
+                  kABBlockTransferSrcScalarPerVector,
+                  kB1BlockTransferSrcScalarPerVector,
+                  kCShuffleBlockTransferScalarPerVector>;
+
+              RunWithDeviceOp<DeviceOpInstance>(param, stream);
+            });
+      } else {
+        ALIGN_SWITCH_2(
+            kABBlockTransferSrcScalarPerVector_max,
+            kABBlockTransferSrcScalarPerVector,
+            param.K,
+            kCShuffleBlockTransferScalarPerVector_max,
+            kCShuffleBlockTransferScalarPerVector,
+            param.Kv,
+            [&] {
+              constexpr ck::index_t kB1BlockTransferSrcScalarPerVector =
+                  min(kCShuffleBlockTransferScalarPerVector,
+                      kB1BlockTransferSrcScalarPerVector_max);
+              using DeviceOpInstance = DeviceOpInstanceTemp<
+                  kGemm1NPerBlock,
+                  kGemm1NXdlPerWave,
+                  kCShuffleNXdlPerWavePerShuffle,
+                  kABBlockTransferSrcScalarPerVector,
+                  kB1BlockTransferSrcScalarPerVector,
+                  kCShuffleBlockTransferScalarPerVector>;
+
+              RunWithDeviceOp<DeviceOpInstance>(param, stream);
+            });
+      };
+    });
   };
 
   template <typename DeviceOpInstance>

From d5b32ef54735da011aabad6812b6fdddc9278b65 Mon Sep 17 00:00:00 2001
From: Qianfeng Zhang <Qianfeng.Zhang@amd.com>
Date: Fri, 27 Oct 2023 12:00:15 +0000
Subject: [PATCH 121/641] Fix in ck_fmha_grouped_forward.h

---
 .../attention/hip_fmha/ck_fmha_grouped_forward.h     | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_forward.h b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_forward.h
index a61237014..1588f8b41 100644
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_forward.h
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_forward.h
@@ -166,12 +166,12 @@ struct grouped_forward_masktype_attnbias_dispatched {
 
     GROUPED_FORWARD_HEADDIM_SWITCH(param.K, param.Kv, [&] {
       constexpr ck::index_t thread_slice_length_ak1 =
-          GemmOpConstantsBatchedForward::AK1 /
-          GemmOpConstantsBatchedForward::
+          GemmOpConstantsGroupedForward::AK1 /
+          GemmOpConstantsGroupedForward::
               ABlockTransferThreadClusterLengths_AK0_M_AK1::At(I2);
       constexpr ck::index_t thread_slice_length_bk1 =
-          GemmOpConstantsBatchedForward::BK1 /
-          GemmOpConstantsBatchedForward::
+          GemmOpConstantsGroupedForward::BK1 /
+          GemmOpConstantsGroupedForward::
               BBlockTransferThreadClusterLengths_BK0_N_BK1::At(I2);
 
       static_assert(
@@ -182,7 +182,7 @@ struct grouped_forward_masktype_attnbias_dispatched {
           min(2, thread_slice_length_ak1);
 
       constexpr ck::index_t thread_slice_length_gemm1n = kGemm1NPerBlock /
-          GemmOpConstantsBatchedForward::
+          GemmOpConstantsGroupedForward::
               B1BlockTransferThreadClusterLengths_BK0_N_BK1::At(I1);
       constexpr ck::index_t kB1BlockTransferSrcScalarPerVector_max =
           min(2, thread_slice_length_gemm1n);
@@ -190,7 +190,7 @@ struct grouped_forward_masktype_attnbias_dispatched {
       constexpr ck::index_t thread_slice_length_cshuflle_n =
           (kCShuffleNXdlPerWavePerShuffle * kGemm1NPerBlock /
            kGemm1NXdlPerWave) /
-          GemmOpConstantsBatchedForward::
+          GemmOpConstantsGroupedForward::
               CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock ::
                   At(I3);
 

From 7892b945d24f0fe22ffaa815124dc3887b3cfd28 Mon Sep 17 00:00:00 2001
From: Qianfeng Zhang <Qianfeng.Zhang@amd.com>
Date: Fri, 27 Oct 2023 15:54:57 +0000
Subject: [PATCH 122/641] Codes simplificaton in forward/infer

---
 .../hip_fmha/ck_fmha_batched_forward.h        | 33 ++++++++++---------
 .../hip_fmha/ck_fmha_batched_infer.h          | 33 ++++++++++---------
 .../hip_fmha/ck_fmha_grouped_forward.h        | 33 ++++++++++---------
 .../hip_fmha/ck_fmha_grouped_infer.h          | 33 ++++++++++---------
 4 files changed, 68 insertions(+), 64 deletions(-)

diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_forward.h b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_forward.h
index 0307d47a5..f9d0dc087 100644
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_forward.h
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_forward.h
@@ -179,23 +179,24 @@ struct batched_forward_masktype_attnbias_dispatched {
   static void Run(BatchedForwardParams& param, hipStream_t stream) {
     using ck::math::min;
 
-    BATCHED_FORWARD_HEADDIM_SWITCH(param.K, param.Kv, [&] {
-      constexpr ck::index_t thread_slice_length_ak1 =
-          GemmOpConstantsBatchedForward::AK1 /
-          GemmOpConstantsBatchedForward::
-              ABlockTransferThreadClusterLengths_AK0_M_AK1::At(I2);
-      constexpr ck::index_t thread_slice_length_bk1 =
-          GemmOpConstantsBatchedForward::BK1 /
-          GemmOpConstantsBatchedForward::
-              BBlockTransferThreadClusterLengths_BK0_N_BK1::At(I2);
-
-      static_assert(
-          thread_slice_length_ak1 == thread_slice_length_bk1,
-          "ABlockTransfer and BBlockTransfer should use completely same K1 sizes and ThreadClusterLengths!");
-
-      constexpr ck::index_t kABBlockTransferSrcScalarPerVector_max =
-          min(2, thread_slice_length_ak1);
+    // compile-time constants which don't depend on head-dim switching
+    constexpr ck::index_t thread_slice_length_ak1 =
+        GemmOpConstantsBatchedForward::AK1 /
+        GemmOpConstantsBatchedForward::
+            ABlockTransferThreadClusterLengths_AK0_M_AK1::At(I2);
+    constexpr ck::index_t thread_slice_length_bk1 =
+        GemmOpConstantsBatchedForward::BK1 /
+        GemmOpConstantsBatchedForward::
+            BBlockTransferThreadClusterLengths_BK0_N_BK1::At(I2);
+
+    static_assert(
+        thread_slice_length_ak1 == thread_slice_length_bk1,
+        "ABlockTransfer and BBlockTransfer should use completely same K1 sizes and ThreadClusterLengths!");
+
+    constexpr ck::index_t kABBlockTransferSrcScalarPerVector_max =
+        min(2, thread_slice_length_ak1);
 
+    BATCHED_FORWARD_HEADDIM_SWITCH(param.K, param.Kv, [&] {
       constexpr ck::index_t thread_slice_length_gemm1n = kGemm1NPerBlock /
           GemmOpConstantsBatchedForward::
               B1BlockTransferThreadClusterLengths_BK0_N_BK1::At(I1);
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_infer.h b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_infer.h
index 6fddd553c..335a7ca3b 100644
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_infer.h
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_infer.h
@@ -156,23 +156,24 @@ struct batched_infer_masktype_attnbias_dispatched {
   static void Run(BatchedForwardParams& param, hipStream_t stream) {
     using ck::math::min;
 
-    BATCHED_INFER_HEADDIM_SWITCH(param.K, param.Kv, [&] {
-      constexpr ck::index_t thread_slice_length_ak1 =
-          GemmOpConstantsBatchedInfer::AK1 /
-          GemmOpConstantsBatchedInfer::
-              ABlockTransferThreadClusterLengths_AK0_M_AK1::At(I2);
-      constexpr ck::index_t thread_slice_length_bk1 =
-          GemmOpConstantsBatchedInfer::BK1 /
-          GemmOpConstantsBatchedInfer::
-              BBlockTransferThreadClusterLengths_BK0_N_BK1::At(I2);
-
-      static_assert(
-          thread_slice_length_ak1 == thread_slice_length_bk1,
-          "ABlockTransfer and BBlockTransfer should use completely same K1 sizes and ThreadClusterLengths!");
-
-      constexpr ck::index_t kABBlockTransferSrcScalarPerVector_max =
-          min(4, thread_slice_length_ak1);
+    // compile-time constants which don't depend on head-dim switching
+    constexpr ck::index_t thread_slice_length_ak1 =
+        GemmOpConstantsBatchedInfer::AK1 /
+        GemmOpConstantsBatchedInfer::
+            ABlockTransferThreadClusterLengths_AK0_M_AK1::At(I2);
+    constexpr ck::index_t thread_slice_length_bk1 =
+        GemmOpConstantsBatchedInfer::BK1 /
+        GemmOpConstantsBatchedInfer::
+            BBlockTransferThreadClusterLengths_BK0_N_BK1::At(I2);
+
+    static_assert(
+        thread_slice_length_ak1 == thread_slice_length_bk1,
+        "ABlockTransfer and BBlockTransfer should use completely same K1 sizes and ThreadClusterLengths!");
+
+    constexpr ck::index_t kABBlockTransferSrcScalarPerVector_max =
+        min(4, thread_slice_length_ak1);
 
+    BATCHED_INFER_HEADDIM_SWITCH(param.K, param.Kv, [&] {
       constexpr ck::index_t thread_slice_length_gemm1n = kGemm1NPerBlock /
           GemmOpConstantsBatchedInfer::
               B1BlockTransferThreadClusterLengths_BK0_N_BK1::At(I1);
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_forward.h b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_forward.h
index 1588f8b41..1ca4c3210 100644
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_forward.h
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_forward.h
@@ -164,23 +164,24 @@ struct grouped_forward_masktype_attnbias_dispatched {
   static void Run(GroupedForwardParams& param, hipStream_t stream) {
     using ck::math::min;
 
-    GROUPED_FORWARD_HEADDIM_SWITCH(param.K, param.Kv, [&] {
-      constexpr ck::index_t thread_slice_length_ak1 =
-          GemmOpConstantsGroupedForward::AK1 /
-          GemmOpConstantsGroupedForward::
-              ABlockTransferThreadClusterLengths_AK0_M_AK1::At(I2);
-      constexpr ck::index_t thread_slice_length_bk1 =
-          GemmOpConstantsGroupedForward::BK1 /
-          GemmOpConstantsGroupedForward::
-              BBlockTransferThreadClusterLengths_BK0_N_BK1::At(I2);
-
-      static_assert(
-          thread_slice_length_ak1 == thread_slice_length_bk1,
-          "ABlockTransfer and BBlockTransfer should use completely same K1 sizes and ThreadClusterLengths!");
-
-      constexpr ck::index_t kABBlockTransferSrcScalarPerVector_max =
-          min(2, thread_slice_length_ak1);
+    // compile-time constants which don't depend on head-dim switching
+    constexpr ck::index_t thread_slice_length_ak1 =
+        GemmOpConstantsGroupedForward::AK1 /
+        GemmOpConstantsGroupedForward::
+            ABlockTransferThreadClusterLengths_AK0_M_AK1::At(I2);
+    constexpr ck::index_t thread_slice_length_bk1 =
+        GemmOpConstantsGroupedForward::BK1 /
+        GemmOpConstantsGroupedForward::
+            BBlockTransferThreadClusterLengths_BK0_N_BK1::At(I2);
+
+    static_assert(
+        thread_slice_length_ak1 == thread_slice_length_bk1,
+        "ABlockTransfer and BBlockTransfer should use completely same K1 sizes and ThreadClusterLengths!");
+
+    constexpr ck::index_t kABBlockTransferSrcScalarPerVector_max =
+        min(2, thread_slice_length_ak1);
 
+    GROUPED_FORWARD_HEADDIM_SWITCH(param.K, param.Kv, [&] {
       constexpr ck::index_t thread_slice_length_gemm1n = kGemm1NPerBlock /
           GemmOpConstantsGroupedForward::
               B1BlockTransferThreadClusterLengths_BK0_N_BK1::At(I1);
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_infer.h b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_infer.h
index c68a0142a..5552a3074 100644
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_infer.h
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_infer.h
@@ -156,23 +156,24 @@ struct grouped_infer_masktype_attnbias_dispatched {
   static void Run(GroupedForwardParams& param, hipStream_t stream) {
     using ck::math::min;
 
-    GROUPED_INFER_HEADDIM_SWITCH(param.K, param.Kv, [&] {
-      constexpr ck::index_t thread_slice_length_ak1 =
-          GemmOpConstantsGroupedInfer::AK1 /
-          GemmOpConstantsGroupedInfer::
-              ABlockTransferThreadClusterLengths_AK0_M_AK1::At(I2);
-      constexpr ck::index_t thread_slice_length_bk1 =
-          GemmOpConstantsGroupedInfer::BK1 /
-          GemmOpConstantsGroupedInfer::
-              BBlockTransferThreadClusterLengths_BK0_N_BK1::At(I2);
-
-      static_assert(
-          thread_slice_length_ak1 == thread_slice_length_bk1,
-          "ABlockTransfer and BBlockTransfer should use completely same K1 sizes and ThreadClusterLengths!");
-
-      constexpr ck::index_t kABBlockTransferSrcScalarPerVector_max =
-          min(4, thread_slice_length_ak1);
+    // compile-time constants which don't depend on head-dim switching
+    constexpr ck::index_t thread_slice_length_ak1 =
+        GemmOpConstantsGroupedInfer::AK1 /
+        GemmOpConstantsGroupedInfer::
+            ABlockTransferThreadClusterLengths_AK0_M_AK1::At(I2);
+    constexpr ck::index_t thread_slice_length_bk1 =
+        GemmOpConstantsGroupedInfer::BK1 /
+        GemmOpConstantsGroupedInfer::
+            BBlockTransferThreadClusterLengths_BK0_N_BK1::At(I2);
+
+    static_assert(
+        thread_slice_length_ak1 == thread_slice_length_bk1,
+        "ABlockTransfer and BBlockTransfer should use completely same K1 sizes and ThreadClusterLengths!");
+
+    constexpr ck::index_t kABBlockTransferSrcScalarPerVector_max =
+        min(4, thread_slice_length_ak1);
 
+    GROUPED_INFER_HEADDIM_SWITCH(param.K, param.Kv, [&] {
       constexpr ck::index_t thread_slice_length_gemm1n = kGemm1NPerBlock /
           GemmOpConstantsGroupedInfer::
               B1BlockTransferThreadClusterLengths_BK0_N_BK1::At(I1);

From c2de2281b0d44aaeefcc378fbb79132ef7ba8853 Mon Sep 17 00:00:00 2001
From: Qianfeng Zhang <Qianfeng.Zhang@amd.com>
Date: Fri, 27 Oct 2023 16:52:41 +0000
Subject: [PATCH 123/641] Tiny change to the grouped forward gemm constants

---
 .../csrc/attention/hip_fmha/ck_fmha_forward_gemm_constants.h    | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_forward_gemm_constants.h b/xformers/csrc/attention/hip_fmha/ck_fmha_forward_gemm_constants.h
index ab72b87cf..992a4c4b2 100644
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_forward_gemm_constants.h
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_forward_gemm_constants.h
@@ -88,7 +88,7 @@ struct GemmOpConstantsGroupedForward {
   static constexpr ck::index_t BBlockTransferDstScalarPerVector_BK1 = 8;
   static constexpr bool BBlockLdsExtraN = true;
   // static constexpr ck::index_t Acc0BiasTransferSrcScalarPerVector;
-  using B1BlockTransferThreadClusterLengths_BK0_N_BK1 = S<8, 32, 1>;
+  using B1BlockTransferThreadClusterLengths_BK0_N_BK1 = S<16, 16, 1>;
   using B1BlockTransferThreadClusterArrangeOrder = S<0, 2, 1>;
   using B1BlockTransferSrcAccessOrder = S<0, 2, 1>;
   static constexpr ck::index_t B1BlockTransferSrcVectorDim = 1;

From 8b63dca454abe0e16d5efac201bf8ed9d50ac7a9 Mon Sep 17 00:00:00 2001
From: Qianfeng Zhang <Qianfeng.Zhang@amd.com>
Date: Sat, 28 Oct 2023 00:24:12 +0000
Subject: [PATCH 124/641] [Performance] Add A/B0/B1/C scalar_per_vector
 selection in backward

---
 .../ck_fmha_backward_gemm_constants.h         | 186 ++++++-
 .../hip_fmha/ck_fmha_batched_backward.h       | 456 +++++++++++-------
 .../hip_fmha/ck_fmha_grouped_backward.h       | 447 ++++++++++-------
 3 files changed, 764 insertions(+), 325 deletions(-)

diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_backward_gemm_constants.h b/xformers/csrc/attention/hip_fmha/ck_fmha_backward_gemm_constants.h
index 585a83e3a..d80ffa43b 100644
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_backward_gemm_constants.h
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_backward_gemm_constants.h
@@ -3,4 +3,188 @@
 #include <ck/ck.hpp>
 #include "ck_fmha_op_helper.h"
 
-struct GemmOpConstantsBackward {};
+// list the template parameters that will not be tuned,
+// the commented lines gives the tunable template parameters
+struct GemmOpConstantsBatchedBackward_V1 {
+  static constexpr ck::index_t NumGemmKPrefetchStage = 1;
+  static constexpr ck::index_t BlockSize = 256;
+  static constexpr ck::index_t MPerBlock = 128;
+  static constexpr ck::index_t NPerBlock = 128;
+  // static constexpr ck::index_t KPerBlock;
+  // static constexpr ck::index_t Gemm1NPerBlock;
+  static constexpr ck::index_t Gemm1KPerBlock = 32;
+  static constexpr ck::index_t Gemm2KPerBlock = 32;
+  static constexpr ck::index_t AK1 = 8;
+  static constexpr ck::index_t BK1 = 8;
+  static constexpr ck::index_t B1K1 = 2;
+  static constexpr ck::index_t MPerXDL = 32;
+  static constexpr ck::index_t NPerXDL = 32;
+  static constexpr ck::index_t MXdlPerWave = 4;
+  static constexpr ck::index_t NXdlPerWave = 1;
+  // static constexpr ck::index_t Gemm1NXdlPerWave;
+  static constexpr ck::index_t Gemm2NXdlPerWave = 1;
+  using ABlockTransferThreadClusterLengths_AK0_M_AK1 = S<4, 64, 1>;
+  using ABlockTransferThreadClusterArrangeOrder = S<1, 0, 2>;
+  using ABlockTransferSrcAccessOrder = S<1, 0, 2>;
+  static constexpr ck::index_t ABlockTransferSrcVectorDim = 2;
+  // static constexpr ck::index_t ABlockTransferSrcScalarPerVector;
+  static constexpr ck::index_t ABlockTransferDstScalarPerVector_AK1 = 8;
+  static constexpr bool ABlockLdsExtraM = true;
+  using BBlockTransferThreadClusterLengths_BK0_N_BK1 = S<4, 64, 1>;
+  using BBlockTransferThreadClusterArrangeOrder = S<1, 0, 2>;
+  using BBlockTransferSrcAccessOrder = S<1, 0, 2>;
+  static constexpr ck::index_t BBlockTransferSrcVectorDim = 2;
+  // static constexpr ck::index_t BBlockTransferSrcScalarPerVector;
+  static constexpr ck::index_t BBlockTransferDstScalarPerVector_BK1 = 8;
+  static constexpr bool BBlockLdsExtraN = true;
+  // static constexpr ck::index_t Acc0BiasTransferSrcScalarPerVector;
+  static constexpr ck::index_t CShuffleMXdlPerWavePerShuffle = 1;
+  // static constexpr ck::index_t CShuffleNXdlPerWavePerShuffle;
+  // using
+  // CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock;
+  // static constexpr ck::index_t
+  // CShuffleBlockTransferScalarPerVector_NPerBlock;
+};
+
+// list the template parameters that will not be tuned,
+// the commented lines gives the tunable template parameters
+struct GemmOpConstantsBatchedBackward_V2 {
+  static constexpr ck::index_t NumGemmKPrefetchStage = 1;
+  static constexpr ck::index_t BlockSize = 256;
+  static constexpr ck::index_t MPerBlock = 64;
+  static constexpr ck::index_t NPerBlock = 128;
+  static constexpr ck::index_t KPerBlock = 128;
+  // static constexpr ck::index_t Gemm1NPerBlock;
+  static constexpr ck::index_t Gemm1KPerBlock = 32;
+  static constexpr ck::index_t Gemm2KPerBlock = 64;
+  static constexpr ck::index_t AK1 = 8;
+  static constexpr ck::index_t BK1 = 8;
+  static constexpr ck::index_t B1K1 = 2;
+  static constexpr ck::index_t MPerXDL = 32;
+  static constexpr ck::index_t NPerXDL = 32;
+  static constexpr ck::index_t MXdlPerWave = 2;
+  static constexpr ck::index_t NXdlPerWave = 1;
+  // static constexpr ck::index_t Gemm1NXdlPerWave;
+  static constexpr ck::index_t Gemm2NXdlPerWave = 1;
+  using ABlockTransferThreadClusterLengths_AK0_M_AK1 = S<4, 64, 1>;
+  using ABlockTransferThreadClusterArrangeOrder = S<1, 0, 2>;
+  using ABlockTransferSrcAccessOrder = S<1, 0, 2>;
+  static constexpr ck::index_t ABlockTransferSrcVectorDim = 2;
+  // static constexpr ck::index_t ABlockTransferSrcScalarPerVector;
+  static constexpr ck::index_t ABlockTransferDstScalarPerVector_AK1 = 8;
+  static constexpr bool ABlockLdsExtraM = true;
+  using BBlockTransferThreadClusterLengths_BK0_N_BK1 = S<4, 64, 1>;
+  using BBlockTransferThreadClusterArrangeOrder = S<1, 0, 2>;
+  using BBlockTransferSrcAccessOrder = S<1, 0, 2>;
+  static constexpr ck::index_t BBlockTransferSrcVectorDim = 2;
+  // static constexpr ck::index_t BBlockTransferSrcScalarPerVector;
+  static constexpr ck::index_t BBlockTransferDstScalarPerVector_BK1 = 8;
+  static constexpr bool BBlockLdsExtraN = true;
+  // static constexpr ck::index_t Acc0BiasTransferSrcScalarPerVector;
+  using B1BlockTransferThreadClusterLengths_BK0_N_BK1 = S<8, 32, 1>;
+  using B1BlockTransferThreadClusterArrangeOrder = S<0, 2, 1>;
+  using B1BlockTransferSrcAccessOrder = S<0, 2, 1>;
+  static constexpr ck::index_t B1BlockTransferSrcVectorDim = 1;
+  // static constexpr ck::index_t B1BlockTransferSrcScalarPerVector;
+  static constexpr ck::index_t B1BlockTransferDstScalarPerVector_BK1 = 2;
+  static constexpr bool B1BlockLdsExtraN = false;
+  static constexpr ck::index_t CShuffleMXdlPerWavePerShuffle = 1;
+  // static constexpr ck::index_t CShuffleNXdlPerWavePerShuffle;
+  // using
+  // CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock;
+  // static constexpr ck::index_t
+  // CShuffleBlockTransferScalarPerVector_NPerBlock;
+};
+
+// list the template parameters that will not be tuned,
+// the commented lines gives the tunable template parameters
+struct GemmOpConstantsGroupedBackward_V1 {
+  static constexpr ck::index_t NumGemmKPrefetchStage = 1;
+  static constexpr ck::index_t BlockSize = 256;
+  static constexpr ck::index_t MPerBlock = 128;
+  static constexpr ck::index_t NPerBlock = 128;
+  // static constexpr ck::index_t KPerBlock;
+  // static constexpr ck::index_t Gemm1NPerBlock;
+  static constexpr ck::index_t Gemm1KPerBlock = 32;
+  static constexpr ck::index_t Gemm2KPerBlock = 32;
+  static constexpr ck::index_t AK1 = 8;
+  static constexpr ck::index_t BK1 = 8;
+  static constexpr ck::index_t B1K1 = 2;
+  static constexpr ck::index_t MPerXDL = 32;
+  static constexpr ck::index_t NPerXDL = 32;
+  static constexpr ck::index_t MXdlPerWave = 4;
+  static constexpr ck::index_t NXdlPerWave = 1;
+  // static constexpr ck::index_t Gemm1NXdlPerWave;
+  static constexpr ck::index_t Gemm2NXdlPerWave = 1;
+  using ABlockTransferThreadClusterLengths_AK0_M_AK1 = S<4, 64, 1>;
+  using ABlockTransferThreadClusterArrangeOrder = S<1, 0, 2>;
+  using ABlockTransferSrcAccessOrder = S<1, 0, 2>;
+  static constexpr ck::index_t ABlockTransferSrcVectorDim = 2;
+  // static constexpr ck::index_t ABlockTransferSrcScalarPerVector;
+  static constexpr ck::index_t ABlockTransferDstScalarPerVector_AK1 = 8;
+  static constexpr bool ABlockLdsExtraM = true;
+  using BBlockTransferThreadClusterLengths_BK0_N_BK1 = S<4, 64, 1>;
+  using BBlockTransferThreadClusterArrangeOrder = S<1, 0, 2>;
+  using BBlockTransferSrcAccessOrder = S<1, 0, 2>;
+  static constexpr ck::index_t BBlockTransferSrcVectorDim = 2;
+  // static constexpr ck::index_t BBlockTransferSrcScalarPerVector;
+  static constexpr ck::index_t BBlockTransferDstScalarPerVector_BK1 = 8;
+  static constexpr bool BBlockLdsExtraN = true;
+  // static constexpr ck::index_t Acc0BiasTransferSrcScalarPerVector;
+  static constexpr ck::index_t CShuffleMXdlPerWavePerShuffle = 1;
+  // static constexpr ck::index_t CShuffleNXdlPerWavePerShuffle;
+  // using
+  // CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock;
+  // static constexpr ck::index_t
+  // CShuffleBlockTransferScalarPerVector_NPerBlock;
+};
+
+// list the template parameters that will not be tuned,
+// the commented lines gives the tunable template parameters
+struct GemmOpConstantsGroupedBackward_V2 {
+  static constexpr ck::index_t NumGemmKPrefetchStage = 1;
+  static constexpr ck::index_t BlockSize = 256;
+  static constexpr ck::index_t MPerBlock = 64;
+  static constexpr ck::index_t NPerBlock = 128;
+  static constexpr ck::index_t KPerBlock = 128;
+  // static constexpr ck::index_t Gemm1NPerBlock;
+  static constexpr ck::index_t Gemm1KPerBlock = 32;
+  static constexpr ck::index_t Gemm2KPerBlock = 64;
+  static constexpr ck::index_t AK1 = 8;
+  static constexpr ck::index_t BK1 = 8;
+  static constexpr ck::index_t B1K1 = 2;
+  static constexpr ck::index_t MPerXDL = 32;
+  static constexpr ck::index_t NPerXDL = 32;
+  static constexpr ck::index_t MXdlPerWave = 2;
+  static constexpr ck::index_t NXdlPerWave = 1;
+  // static constexpr ck::index_t Gemm1NXdlPerWave;
+  static constexpr ck::index_t Gemm2NXdlPerWave = 1;
+  using ABlockTransferThreadClusterLengths_AK0_M_AK1 = S<4, 64, 1>;
+  using ABlockTransferThreadClusterArrangeOrder = S<1, 0, 2>;
+  using ABlockTransferSrcAccessOrder = S<1, 0, 2>;
+  static constexpr ck::index_t ABlockTransferSrcVectorDim = 2;
+  // static constexpr ck::index_t ABlockTransferSrcScalarPerVector;
+  static constexpr ck::index_t ABlockTransferDstScalarPerVector_AK1 = 8;
+  static constexpr bool ABlockLdsExtraM = true;
+  using BBlockTransferThreadClusterLengths_BK0_N_BK1 = S<4, 64, 1>;
+  using BBlockTransferThreadClusterArrangeOrder = S<1, 0, 2>;
+  using BBlockTransferSrcAccessOrder = S<1, 0, 2>;
+  static constexpr ck::index_t BBlockTransferSrcVectorDim = 2;
+  // static constexpr ck::index_t BBlockTransferSrcScalarPerVector;
+  static constexpr ck::index_t BBlockTransferDstScalarPerVector_BK1 = 8;
+  static constexpr bool BBlockLdsExtraN = true;
+  // static constexpr ck::index_t Acc0BiasTransferSrcScalarPerVector;
+  using B1BlockTransferThreadClusterLengths_BK0_N_BK1 = S<8, 32, 1>;
+  using B1BlockTransferThreadClusterArrangeOrder = S<0, 2, 1>;
+  using B1BlockTransferSrcAccessOrder = S<0, 2, 1>;
+  static constexpr ck::index_t B1BlockTransferSrcVectorDim = 1;
+  // static constexpr ck::index_t B1BlockTransferSrcScalarPerVector;
+  static constexpr ck::index_t B1BlockTransferDstScalarPerVector_BK1 = 2;
+  static constexpr bool B1BlockLdsExtraN = false;
+  static constexpr ck::index_t CShuffleMXdlPerWavePerShuffle = 1;
+  // static constexpr ck::index_t CShuffleNXdlPerWavePerShuffle;
+  // using
+  // CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock;
+  // static constexpr ck::index_t
+  // CShuffleBlockTransferScalarPerVector_NPerBlock;
+};
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_backward.h b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_backward.h
index 50d0761a6..9fd8e06e0 100644
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_backward.h
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_backward.h
@@ -5,11 +5,14 @@
 
 #include <ck.hpp>
 #include <ck/tensor_operation/gpu/device/gemm_specialization.hpp>
+#include <ck/tensor_operation/gpu/device/impl/device_batched_mha_bwd_xdl_cshuffle_qloop_v1.hpp>
+#include <ck/tensor_operation/gpu/device/impl/device_batched_mha_bwd_xdl_cshuffle_qloop_v2.hpp>
 #include <ck/tensor_operation/gpu/device/tensor_specialization.hpp>
 #include <ck/tensor_operation/gpu/element/element_wise_operation.hpp>
-#include "ck/tensor_operation/gpu/device/impl/device_batched_mha_bwd_xdl_cshuffle_qloop_v1.hpp"
-#include "ck/tensor_operation/gpu/device/impl/device_batched_mha_bwd_xdl_cshuffle_qloop_v2.hpp"
 
+#include "ck_align_switch.h"
+#include "ck_fmha_backward_gemm_constants.h"
+#include "ck_fmha_common_gemm_constants.h"
 #include "ck_fmha_op_helper.h"
 #include "ck_fmha_params.h"
 
@@ -37,48 +40,49 @@ struct batched_backward_masktype_attnbias_dispatched {
       typename std::conditional<has_attn_bias, scalar_t, void>::type;
   using Acc1BiasDataType = void;
 
-  static constexpr ck::index_t NumDimG = 2;
-  static constexpr ck::index_t NumDimM = 1;
-  static constexpr ck::index_t NumDimN = 1;
-  static constexpr ck::index_t NumDimK = 1;
-  static constexpr ck::index_t NumDimO = 1;
-
-  static constexpr ck::index_t CShuffleBlockTransferScalarPerVector_NPerBlock =
-      MaxVectorSizeForType<scalar_t>::value;
-
   static constexpr auto GemmSpec =
       ck::tensor_operation::device::GemmSpecialization::MNKOPadding;
   static constexpr auto MaskingSpec =
       static_cast<ck::tensor_operation::device::MaskingSpecialization>(
           custom_mask_type);
 
-  static constexpr auto TensorSpecQ =
-      ck::tensor_operation::device::TensorSpecialization::Default;
-  static constexpr auto TensorSpecK =
-      ck::tensor_operation::device::TensorSpecialization::Default;
-  static constexpr auto TensorSpecV =
-      ck::tensor_operation::device::TensorSpecialization::Default;
-  static constexpr auto TensorSpecY =
-      ck::tensor_operation::device::TensorSpecialization::Default;
   static constexpr bool Deterministic = true;
 
-  static constexpr ck::index_t kABBlockTransferSrcScalarPerVector = 1;
-  static constexpr ck::index_t kB1BlockTransferSrcScalarPerVector = 1;
-  static constexpr ck::index_t kCShuffleBlockTransferScalarPerVector = 1;
   static constexpr ck::index_t kAcc0BiasTransferSrcScalarPerVector = 1;
 
+#ifndef BATCHED_BACKWARD_V1_HEADDIM_SWITCH
+#define BATCHED_BACKWARD_V1_HEADDIM_SWITCH(HEAD_DIM1, HEAD_DIM2, ...) \
+  [&] {                                                               \
+    if (HEAD_DIM1 <= 32 && HEAD_DIM2 <= 32) {                         \
+      constexpr ck::index_t kGemm1NPerBlock = 32;                     \
+      constexpr ck::index_t kGemm1NXdlPerWave = 1;                    \
+      constexpr ck::index_t kCShuffleNXdlPerWavePerShuffle = 1;       \
+      using kCShuffleBlockTransferClusterLengths = S<1, 64, 1, 4>;    \
+      __VA_ARGS__();                                                  \
+    } else {                                                          \
+      constexpr ck::index_t kGemm1NPerBlock = 64;                     \
+      constexpr ck::index_t kGemm1NXdlPerWave = 2;                    \
+      constexpr ck::index_t kCShuffleNXdlPerWavePerShuffle = 2;       \
+      using kCShuffleBlockTransferClusterLengths = S<1, 32, 1, 8>;    \
+      __VA_ARGS__();                                                  \
+    };                                                                \
+  }()
+#endif
+
+  // clang-format off
   template <
       ck::index_t kGemm1NPerBlock,
       ck::index_t kGemm1NXdlPerWave,
       ck::index_t kCShuffleNXdlPerWavePerShuffle,
-      typename kCShuffleBlockTransferClusterLengths>
-  using DeviceOpInstanceTemp = ck::tensor_operation::device::
-      DeviceBatchedMultiheadAttentionBackward_Qloop_Xdl_CShuffle_V1<
-          NumDimG,
-          NumDimM,
-          NumDimN,
-          NumDimK,
-          NumDimO,
+      typename kCShuffleBlockTransferClusterLengths,
+      ck::index_t kABBlockTransferSrcScalarPerVector,
+      ck::index_t kCShuffleBlockTransferScalarPerVector>
+  using DeviceOpInstanceTemp_V1 = ck::tensor_operation::device::DeviceBatchedMultiheadAttentionBackward_Qloop_Xdl_CShuffle_V1<
+          GemmOpConstantsCommon::NumDimG,
+          GemmOpConstantsCommon::NumDimM,
+          GemmOpConstantsCommon::NumDimN,
+          GemmOpConstantsCommon::NumDimK,
+          GemmOpConstantsCommon::NumDimO,
           InputDataType,
           OutputDataType,
           GemmDataType,
@@ -94,153 +98,279 @@ struct batched_backward_masktype_attnbias_dispatched {
           QKVElementOp,
           YElementOp,
           GemmSpec,
-          TensorSpecQ,
-          TensorSpecK,
-          TensorSpecV,
-          TensorSpecY,
-          1,
-          256,
-          128, // MPerBlock
-          128, // NPerBlock
+          GemmOpConstantsCommon::TensorSpecA,
+          GemmOpConstantsCommon::TensorSpecB0,
+          GemmOpConstantsCommon::TensorSpecB1,
+          GemmOpConstantsCommon::TensorSpecC,
+          GemmOpConstantsBatchedBackward_V1::NumGemmKPrefetchStage,
+          GemmOpConstantsBatchedBackward_V1::BlockSize,
+          GemmOpConstantsBatchedBackward_V1::MPerBlock,
+          GemmOpConstantsBatchedBackward_V1::NPerBlock,
           kGemm1NPerBlock, // KPerBlock == kGemm1NPerBlock required
           kGemm1NPerBlock,
-          32, // Gemm1KperBlock
-          32, // Gemm2KPerBlock
-          8, // AK1
-          8, // BK1
-          2, // B1K1
-          32, // MPerXDL
-          32, // NPerXDL
-          4, // MXdlPerWave
-          1, // NXdlPerWave
+          GemmOpConstantsBatchedBackward_V1::Gemm1KPerBlock,
+          GemmOpConstantsBatchedBackward_V1::Gemm2KPerBlock,
+          GemmOpConstantsBatchedBackward_V1::AK1,
+          GemmOpConstantsBatchedBackward_V1::BK1,
+          GemmOpConstantsBatchedBackward_V1::B1K1,
+          GemmOpConstantsBatchedBackward_V1::MPerXDL,
+          GemmOpConstantsBatchedBackward_V1::NPerXDL,
+          GemmOpConstantsBatchedBackward_V1::MXdlPerWave,
+          GemmOpConstantsBatchedBackward_V1::NXdlPerWave,
           kGemm1NXdlPerWave,
-          1, // Gemm2NXdlPerWave
-          S<4, 64, 1>, // ABlockTransfer
-          S<1, 0, 2>,
-          S<1, 0, 2>,
-          2,
-          kABBlockTransferSrcScalarPerVector, // TUNABLE
-          8,
-          true,
-          S<4, 64, 1>, // BBlockTransfer
-          S<1, 0, 2>,
-          S<1, 0, 2>,
-          2,
-          kABBlockTransferSrcScalarPerVector, // TUNABLE
-          8,
-          true,
-          kAcc0BiasTransferSrcScalarPerVector, // TUNABLE
-          1, // CShuffleMXdlPerWavePerShuffle
+          GemmOpConstantsBatchedBackward_V1::Gemm2NXdlPerWave,
+          GemmOpConstantsBatchedBackward_V1::ABlockTransferThreadClusterLengths_AK0_M_AK1,
+          GemmOpConstantsBatchedBackward_V1::ABlockTransferThreadClusterArrangeOrder,
+          GemmOpConstantsBatchedBackward_V1::ABlockTransferSrcAccessOrder,
+          GemmOpConstantsBatchedBackward_V1::ABlockTransferSrcVectorDim,
+          kABBlockTransferSrcScalarPerVector,
+          GemmOpConstantsBatchedBackward_V1::ABlockTransferDstScalarPerVector_AK1,
+          GemmOpConstantsBatchedBackward_V1::ABlockLdsExtraM,
+          GemmOpConstantsBatchedBackward_V1::BBlockTransferThreadClusterLengths_BK0_N_BK1,
+          GemmOpConstantsBatchedBackward_V1::BBlockTransferThreadClusterArrangeOrder,
+          GemmOpConstantsBatchedBackward_V1::BBlockTransferSrcAccessOrder,
+          GemmOpConstantsBatchedBackward_V1::BBlockTransferSrcVectorDim,
+          kABBlockTransferSrcScalarPerVector,
+          GemmOpConstantsBatchedBackward_V1::BBlockTransferDstScalarPerVector_BK1,
+          GemmOpConstantsBatchedBackward_V1::BBlockLdsExtraN,
+          kAcc0BiasTransferSrcScalarPerVector,
+          GemmOpConstantsBatchedBackward_V1::CShuffleMXdlPerWavePerShuffle,
           kCShuffleNXdlPerWavePerShuffle,
           kCShuffleBlockTransferClusterLengths,
-          kCShuffleBlockTransferScalarPerVector, // TUNABLE
+          kCShuffleBlockTransferScalarPerVector,
           MaskingSpec,
           Deterministic>;
+  // clang-format on
 
-  static void Run(BatchedBackwardParams& param, hipStream_t stream) {
-    if (param.K <= 32 && param.Kv <= 32) {
-      constexpr ck::index_t kGemm1NPerBlock = 32;
-      constexpr ck::index_t kGemm1NXdlPerWave = 1;
-      constexpr ck::index_t kCShuffleNXdlPerWavePerShuffle = 1;
-      using kCShuffleBlockTransferClusterLengths = S<1, 64, 1, 4>;
-
-      using DeviceOpInstance = DeviceOpInstanceTemp<
+  // clang-format off
+  template <
+      ck::index_t kGemm1NPerBlock,
+      ck::index_t kGemm1NXdlPerWave,
+      ck::index_t kCShuffleNXdlPerWavePerShuffle,
+      typename kCShuffleBlockTransferClusterLengths,
+      ck::index_t kABBlockTransferSrcScalarPerVector,
+      ck::index_t kB1BlockTransferSrcScalarPerVector,
+      ck::index_t kCShuffleBlockTransferScalarPerVector>
+  using DeviceOpInstanceTemp_V2 = ck::tensor_operation::device::DeviceBatchedMultiheadAttentionBackward_Qloop_Xdl_CShuffle_V2<
+          GemmOpConstantsCommon::NumDimG,
+          GemmOpConstantsCommon::NumDimM,
+          GemmOpConstantsCommon::NumDimN,
+          GemmOpConstantsCommon::NumDimK,
+          GemmOpConstantsCommon::NumDimO,
+          InputDataType,
+          OutputDataType,
+          GemmDataType,
+          ZDataType,
+          LSEDataType,
+          Acc0BiasDataType,
+          Acc1BiasDataType,
+          AccDataType,
+          ShuffleDataType,
+          QKVElementOp,
+          QKVElementOp,
+          Scale,
+          QKVElementOp,
+          YElementOp,
+          GemmSpec,
+          GemmOpConstantsCommon::TensorSpecA,
+          GemmOpConstantsCommon::TensorSpecB0,
+          GemmOpConstantsCommon::TensorSpecB1,
+          GemmOpConstantsCommon::TensorSpecC,
+          GemmOpConstantsBatchedBackward_V2::NumGemmKPrefetchStage,
+          GemmOpConstantsBatchedBackward_V2::BlockSize,
+          GemmOpConstantsBatchedBackward_V2::MPerBlock,
+          GemmOpConstantsBatchedBackward_V2::NPerBlock,
+          GemmOpConstantsBatchedBackward_V2::KPerBlock,
           kGemm1NPerBlock,
+          GemmOpConstantsBatchedBackward_V2::Gemm1KPerBlock,
+          GemmOpConstantsBatchedBackward_V2::Gemm2KPerBlock,
+          GemmOpConstantsBatchedBackward_V2::AK1,
+          GemmOpConstantsBatchedBackward_V2::BK1,
+          GemmOpConstantsBatchedBackward_V2::B1K1,
+          GemmOpConstantsBatchedBackward_V2::MPerXDL,
+          GemmOpConstantsBatchedBackward_V2::NPerXDL,
+          GemmOpConstantsBatchedBackward_V2::MXdlPerWave,
+          GemmOpConstantsBatchedBackward_V2::NXdlPerWave,
           kGemm1NXdlPerWave,
+          GemmOpConstantsBatchedBackward_V2::Gemm2NXdlPerWave,
+          GemmOpConstantsBatchedBackward_V2::ABlockTransferThreadClusterLengths_AK0_M_AK1,
+          GemmOpConstantsBatchedBackward_V2::ABlockTransferThreadClusterArrangeOrder,
+          GemmOpConstantsBatchedBackward_V2::ABlockTransferSrcAccessOrder,
+          GemmOpConstantsBatchedBackward_V2::ABlockTransferSrcVectorDim,
+          kABBlockTransferSrcScalarPerVector,
+          GemmOpConstantsBatchedBackward_V2::ABlockTransferDstScalarPerVector_AK1,
+          GemmOpConstantsBatchedBackward_V2::ABlockLdsExtraM,
+          GemmOpConstantsBatchedBackward_V2::BBlockTransferThreadClusterLengths_BK0_N_BK1,
+          GemmOpConstantsBatchedBackward_V2::BBlockTransferThreadClusterArrangeOrder,
+          GemmOpConstantsBatchedBackward_V2::BBlockTransferSrcAccessOrder,
+          GemmOpConstantsBatchedBackward_V2::BBlockTransferSrcVectorDim,
+          kABBlockTransferSrcScalarPerVector,
+          GemmOpConstantsBatchedBackward_V2::BBlockTransferDstScalarPerVector_BK1,
+          GemmOpConstantsBatchedBackward_V2::BBlockLdsExtraN,
+          kAcc0BiasTransferSrcScalarPerVector,
+          GemmOpConstantsBatchedBackward_V2::B1BlockTransferThreadClusterLengths_BK0_N_BK1,
+          GemmOpConstantsBatchedBackward_V2::B1BlockTransferThreadClusterArrangeOrder,
+          GemmOpConstantsBatchedBackward_V2::B1BlockTransferSrcAccessOrder,
+          GemmOpConstantsBatchedBackward_V2::B1BlockTransferSrcVectorDim,
+          kB1BlockTransferSrcScalarPerVector,
+          GemmOpConstantsBatchedBackward_V2::B1BlockTransferDstScalarPerVector_BK1,
+          GemmOpConstantsBatchedBackward_V2::B1BlockLdsExtraN,
+          GemmOpConstantsBatchedBackward_V2::CShuffleMXdlPerWavePerShuffle,
           kCShuffleNXdlPerWavePerShuffle,
-          kCShuffleBlockTransferClusterLengths>;
-
-      RunWithDeviceOp<DeviceOpInstance>(param, stream);
-    } else if (param.K <= 64 && param.Kv <= 64) {
-      constexpr ck::index_t kGemm1NPerBlock = 64;
-      constexpr ck::index_t kGemm1NXdlPerWave = 2;
-      constexpr ck::index_t kCShuffleNXdlPerWavePerShuffle = 2;
-      using kCShuffleBlockTransferClusterLengths = S<1, 32, 1, 8>;
+          kCShuffleBlockTransferClusterLengths,
+          kCShuffleBlockTransferScalarPerVector,
+          MaskingSpec,
+          Deterministic>;
+  // clang-format on
 
-      using DeviceOpInstance = DeviceOpInstanceTemp<
-          kGemm1NPerBlock,
-          kGemm1NXdlPerWave,
-          kCShuffleNXdlPerWavePerShuffle,
-          kCShuffleBlockTransferClusterLengths>;
+  static constexpr auto I1 = ck::Number<1>{};
+  static constexpr auto I2 = ck::Number<2>{};
+  static constexpr auto I3 = ck::Number<3>{};
 
-      RunWithDeviceOp<DeviceOpInstance>(param, stream);
+  static void Run(BatchedBackwardParams& param, hipStream_t stream) {
+    using ck::math::min;
+
+    if (param.K <= 64 && param.Kv <= 64) {
+      // compile-time constants which don't depend on head-dim switching
+      constexpr ck::index_t thread_slice_length_ak1 =
+          GemmOpConstantsBatchedBackward_V1::AK1 /
+          GemmOpConstantsBatchedBackward_V1::
+              ABlockTransferThreadClusterLengths_AK0_M_AK1::At(I2);
+      constexpr ck::index_t thread_slice_length_bk1 =
+          GemmOpConstantsBatchedBackward_V1::BK1 /
+          GemmOpConstantsBatchedBackward_V1::
+              BBlockTransferThreadClusterLengths_BK0_N_BK1::At(I2);
+
+      static_assert(
+          thread_slice_length_ak1 == thread_slice_length_bk1,
+          "ABlockTransfer and BBlockTransfer should use completely same K1 sizes and ThreadClusterLengths!");
+
+      constexpr ck::index_t kABBlockTransferSrcScalarPerVector_max =
+          min(2, thread_slice_length_ak1);
+
+      BATCHED_BACKWARD_V1_HEADDIM_SWITCH(param.K, param.Kv, [&] {
+        constexpr ck::index_t thread_slice_length_cshuflle_n =
+            (kCShuffleNXdlPerWavePerShuffle * kGemm1NPerBlock /
+             kGemm1NXdlPerWave) /
+            kCShuffleBlockTransferClusterLengths::At(I3);
+
+        constexpr ck::index_t kCShuffleBlockTransferScalarPerVector_max =
+            min(2, thread_slice_length_cshuflle_n);
+
+        ALIGN_SWITCH_2(
+            kABBlockTransferSrcScalarPerVector_max,
+            kABBlockTransferSrcScalarPerVector,
+            param.K,
+            kCShuffleBlockTransferScalarPerVector_max,
+            kCShuffleBlockTransferScalarPerVector,
+            param.Kv,
+            [&] {
+              using DeviceOpInstance = DeviceOpInstanceTemp_V1<
+                  kGemm1NPerBlock,
+                  kGemm1NXdlPerWave,
+                  kCShuffleNXdlPerWavePerShuffle,
+                  kCShuffleBlockTransferClusterLengths,
+                  kABBlockTransferSrcScalarPerVector,
+                  kCShuffleBlockTransferScalarPerVector>;
+
+              RunWithDeviceOp<DeviceOpInstance>(param, stream);
+            });
+      });
     } else {
-      using DeviceOpInstance = ck::tensor_operation::device::
-          DeviceBatchedMultiheadAttentionBackward_Qloop_Xdl_CShuffle_V2<
-              NumDimG,
-              NumDimM,
-              NumDimN,
-              NumDimK,
-              NumDimO,
-              InputDataType,
-              OutputDataType,
-              GemmDataType,
-              ZDataType,
-              LSEDataType,
-              Acc0BiasDataType,
-              Acc1BiasDataType,
-              AccDataType,
-              ShuffleDataType,
-              QKVElementOp,
-              QKVElementOp,
-              Scale,
-              QKVElementOp,
-              YElementOp,
-              GemmSpec,
-              TensorSpecQ,
-              TensorSpecK,
-              TensorSpecV,
-              TensorSpecY,
-              1,
-              256,
-              64, // MPerBlock
-              128, // NPerBlock
-              128, // KPerBlock
-              128, // Gemm1NPerBlock
-              32, // Gemm1KPerBlock
-              64, // Gemm2KPerBlock
-              8, // AK1
-              8, // BK1
-              2, // A1K1
-              32, // MPerXDL
-              32, // NPerXDL
-              2, // MXdlPerWave
-              1, // NXdlPerWave
-              4, // Gemm1NXdlPerWave
-              1, // Gemm2NXdlPerWave
-              S<4, 64, 1>, // ABlockTransfer
-              S<1, 0, 2>,
-              S<1, 0, 2>,
-              2,
-              kABBlockTransferSrcScalarPerVector, // TUNABLE
-              8,
-              true,
-              S<4, 64, 1>, // B0BlockTransfer
-              S<1, 0, 2>,
-              S<1, 0, 2>,
-              2,
-              kABBlockTransferSrcScalarPerVector, // TUNABLE
-              8,
-              true,
-              kAcc0BiasTransferSrcScalarPerVector, // TUNABLE
-              S<8, 32, 1>, // B1BlockTransfer
-              S<0, 2, 1>,
-              S<0, 2, 1>,
-              1,
-              kB1BlockTransferSrcScalarPerVector, // TUNABLE
-              2,
-              false,
-              1, // CShuffleMXdlPerWavePerShuffle
-              4, // CShuffleNXdlPerWavePerShuffle
-              S<1,
-                32,
-                1,
-                8>, // CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock
-              kCShuffleBlockTransferScalarPerVector, // TUNABLE
-              MaskingSpec,
-              Deterministic>;
-
-      RunWithDeviceOp<DeviceOpInstance>(param, stream);
+      constexpr ck::index_t kGemm1NPerBlock = 128;
+      constexpr ck::index_t kGemm1NXdlPerWave = 4;
+      constexpr ck::index_t kCShuffleNXdlPerWavePerShuffle = 4;
+      using kCShuffleBlockTransferClusterLengths = S<1, 32, 1, 8>;
+
+      constexpr ck::index_t thread_slice_length_ak1 =
+          GemmOpConstantsBatchedBackward_V2::AK1 /
+          GemmOpConstantsBatchedBackward_V2::
+              ABlockTransferThreadClusterLengths_AK0_M_AK1::At(I2);
+      constexpr ck::index_t thread_slice_length_bk1 =
+          GemmOpConstantsBatchedBackward_V2::BK1 /
+          GemmOpConstantsBatchedBackward_V2::
+              BBlockTransferThreadClusterLengths_BK0_N_BK1::At(I2);
+
+      static_assert(
+          thread_slice_length_ak1 == thread_slice_length_bk1,
+          "ABlockTransfer and BBlockTransfer should use completely same K1 sizes and ThreadClusterLengths!");
+
+      constexpr ck::index_t kABBlockTransferSrcScalarPerVector_max =
+          min(2, thread_slice_length_ak1);
+
+      constexpr ck::index_t thread_slice_length_gemm1n = kGemm1NPerBlock /
+          GemmOpConstantsBatchedBackward_V2::
+              B1BlockTransferThreadClusterLengths_BK0_N_BK1::At(I1);
+      constexpr ck::index_t kB1BlockTransferSrcScalarPerVector_max =
+          min(2, thread_slice_length_gemm1n);
+
+      constexpr ck::index_t thread_slice_length_cshuflle_n =
+          (kCShuffleNXdlPerWavePerShuffle * kGemm1NPerBlock /
+           kGemm1NXdlPerWave) /
+          kCShuffleBlockTransferClusterLengths::At(I3);
+
+      constexpr ck::index_t kCShuffleBlockTransferScalarPerVector_max =
+          min(2, thread_slice_length_cshuflle_n);
+
+      if constexpr (
+          kB1BlockTransferSrcScalarPerVector_max >=
+          kCShuffleBlockTransferScalarPerVector_max) {
+        ALIGN_SWITCH_2(
+            kABBlockTransferSrcScalarPerVector_max,
+            kABBlockTransferSrcScalarPerVector,
+            param.K,
+            kB1BlockTransferSrcScalarPerVector_max,
+            kB1BlockTransferSrcScalarPerVector,
+            param.Kv,
+            [&] {
+              constexpr ck::index_t kCShuffleBlockTransferScalarPerVector =
+                  min(kB1BlockTransferSrcScalarPerVector,
+                      kCShuffleBlockTransferScalarPerVector_max);
+
+              static_assert(
+                  kB1BlockTransferSrcScalarPerVector > 0,
+                  "kB1BlockTransferSrcScalarPerVector must be positive");
+
+              using DeviceOpInstance = DeviceOpInstanceTemp_V2<
+                  kGemm1NPerBlock,
+                  kGemm1NXdlPerWave,
+                  kCShuffleNXdlPerWavePerShuffle,
+                  kCShuffleBlockTransferClusterLengths,
+                  kABBlockTransferSrcScalarPerVector,
+                  kB1BlockTransferSrcScalarPerVector,
+                  kCShuffleBlockTransferScalarPerVector>;
+
+              RunWithDeviceOp<DeviceOpInstance>(param, stream);
+            });
+      } else {
+        ALIGN_SWITCH_2(
+            kABBlockTransferSrcScalarPerVector_max,
+            kABBlockTransferSrcScalarPerVector,
+            param.K,
+            kCShuffleBlockTransferScalarPerVector_max,
+            kCShuffleBlockTransferScalarPerVector,
+            param.Kv,
+            [&] {
+              constexpr ck::index_t kB1BlockTransferSrcScalarPerVector =
+                  min(kCShuffleBlockTransferScalarPerVector,
+                      kB1BlockTransferSrcScalarPerVector_max);
+
+              static_assert(
+                  kB1BlockTransferSrcScalarPerVector > 0,
+                  "kB1BlockTransferSrcScalarPerVector must be positive");
+
+              using DeviceOpInstance = DeviceOpInstanceTemp_V2<
+                  kGemm1NPerBlock,
+                  kGemm1NXdlPerWave,
+                  kCShuffleNXdlPerWavePerShuffle,
+                  kCShuffleBlockTransferClusterLengths,
+                  kABBlockTransferSrcScalarPerVector,
+                  kB1BlockTransferSrcScalarPerVector,
+                  kCShuffleBlockTransferScalarPerVector>;
+
+              RunWithDeviceOp<DeviceOpInstance>(param, stream);
+            });
+      };
     };
   };
 
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_backward.h b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_backward.h
index 0de98ed0c..3301fc2b6 100644
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_backward.h
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_backward.h
@@ -5,12 +5,16 @@
 
 #include <ck/ck.hpp>
 #include <ck/tensor_operation/gpu/device/gemm_specialization.hpp>
+#include <ck/tensor_operation/gpu/device/impl/device_grouped_mha_bwd_xdl_cshuffle_qloop_v1.hpp>
+#include <ck/tensor_operation/gpu/device/impl/device_grouped_mha_bwd_xdl_cshuffle_qloop_v2.hpp>
 #include <ck/tensor_operation/gpu/device/tensor_specialization.hpp>
 #include <ck/tensor_operation/gpu/element/element_wise_operation.hpp>
-#include <ck/utility/sequence.hpp>
-#include "ck/tensor_operation/gpu/device/impl/device_grouped_mha_bwd_xdl_cshuffle_qloop_v1.hpp"
-#include "ck/tensor_operation/gpu/device/impl/device_grouped_mha_bwd_xdl_cshuffle_qloop_v2.hpp"
+#include <ck/utility/math.hpp>
+#include <ck/utility/number.hpp>
 
+#include "ck_align_switch.h"
+#include "ck_fmha_backward_gemm_constants.h"
+#include "ck_fmha_common_gemm_constants.h"
 #include "ck_fmha_op_helper.h"
 #include "ck_fmha_params.h"
 
@@ -38,48 +42,49 @@ struct grouped_backward_masktype_attnbias_dispatched {
       typename std::conditional<has_attn_bias, scalar_t, void>::type;
   using Acc1BiasDataType = void;
 
-  static constexpr ck::index_t NumDimG = 2;
-  static constexpr ck::index_t NumDimM = 1;
-  static constexpr ck::index_t NumDimN = 1;
-  static constexpr ck::index_t NumDimK = 1;
-  static constexpr ck::index_t NumDimO = 1;
-
-  static constexpr ck::index_t CShuffleBlockTransferScalarPerVector_NPerBlock =
-      MaxVectorSizeForType<scalar_t>::value;
-
   static constexpr auto GemmSpec =
       ck::tensor_operation::device::GemmSpecialization::MNKOPadding;
   static constexpr auto MaskingSpec =
       static_cast<ck::tensor_operation::device::MaskingSpecialization>(
           custom_mask_type);
 
-  static constexpr auto TensorSpecQ =
-      ck::tensor_operation::device::TensorSpecialization::Default;
-  static constexpr auto TensorSpecK =
-      ck::tensor_operation::device::TensorSpecialization::Default;
-  static constexpr auto TensorSpecV =
-      ck::tensor_operation::device::TensorSpecialization::Default;
-  static constexpr auto TensorSpecY =
-      ck::tensor_operation::device::TensorSpecialization::Default;
   static constexpr bool Deterministic = true;
 
-  static constexpr ck::index_t kABBlockTransferSrcScalarPerVector = 1;
-  static constexpr ck::index_t kB1BlockTransferSrcScalarPerVector = 1;
-  static constexpr ck::index_t kCShuffleBlockTransferScalarPerVector = 1;
   static constexpr ck::index_t kAcc0BiasTransferSrcScalarPerVector = 1;
 
+#ifndef GROUPED_BACKWARD_V1_HEADDIM_SWITCH
+#define GROUPED_BACKWARD_V1_HEADDIM_SWITCH(HEAD_DIM1, HEAD_DIM2, ...) \
+  [&] {                                                               \
+    if (HEAD_DIM1 <= 32 && HEAD_DIM2 <= 32) {                         \
+      constexpr ck::index_t kGemm1NPerBlock = 32;                     \
+      constexpr ck::index_t kGemm1NXdlPerWave = 1;                    \
+      constexpr ck::index_t kCShuffleNXdlPerWavePerShuffle = 1;       \
+      using kCShuffleBlockTransferClusterLengths = S<1, 64, 1, 4>;    \
+      __VA_ARGS__();                                                  \
+    } else {                                                          \
+      constexpr ck::index_t kGemm1NPerBlock = 64;                     \
+      constexpr ck::index_t kGemm1NXdlPerWave = 2;                    \
+      constexpr ck::index_t kCShuffleNXdlPerWavePerShuffle = 2;       \
+      using kCShuffleBlockTransferClusterLengths = S<1, 32, 1, 8>;    \
+      __VA_ARGS__();                                                  \
+    };                                                                \
+  }()
+#endif
+
+  // clang-format off
   template <
       ck::index_t kGemm1NPerBlock,
       ck::index_t kGemm1NXdlPerWave,
       ck::index_t kCShuffleNXdlPerWavePerShuffle,
-      typename kCShuffleBlockTransferClusterLengths>
-  using DeviceOpInstanceTemp = ck::tensor_operation::device::
-      DeviceGroupedMultiheadAttentionBackward_Qloop_Xdl_CShuffle_V1<
-          NumDimG,
-          NumDimM,
-          NumDimN,
-          NumDimK,
-          NumDimO,
+      typename kCShuffleBlockTransferClusterLengths,
+      ck::index_t kABBlockTransferSrcScalarPerVector,
+      ck::index_t kCShuffleBlockTransferScalarPerVector>
+  using DeviceOpInstanceTemp_V1 = ck::tensor_operation::device::DeviceGroupedMultiheadAttentionBackward_Qloop_Xdl_CShuffle_V1<
+          GemmOpConstantsCommon::NumDimG,
+          GemmOpConstantsCommon::NumDimM,
+          GemmOpConstantsCommon::NumDimN,
+          GemmOpConstantsCommon::NumDimK,
+          GemmOpConstantsCommon::NumDimO,
           InputDataType,
           OutputDataType,
           GemmDataType,
@@ -95,150 +100,270 @@ struct grouped_backward_masktype_attnbias_dispatched {
           QKVElementOp,
           YElementOp,
           GemmSpec,
-          TensorSpecQ,
-          TensorSpecK,
-          TensorSpecV,
-          TensorSpecY,
-          1,
-          256,
-          128, // MPerBlock
-          128, // NPerBlock
+          GemmOpConstantsCommon::TensorSpecA,
+          GemmOpConstantsCommon::TensorSpecB0,
+          GemmOpConstantsCommon::TensorSpecB1,
+          GemmOpConstantsCommon::TensorSpecC,
+          GemmOpConstantsGroupedBackward_V1::NumGemmKPrefetchStage,
+          GemmOpConstantsGroupedBackward_V1::BlockSize,
+          GemmOpConstantsGroupedBackward_V1::MPerBlock,
+          GemmOpConstantsGroupedBackward_V1::NPerBlock,
           kGemm1NPerBlock, // KPerBlock = kGemm1NerBlock
           kGemm1NPerBlock,
-          32, // Gemm1KPerBlock
-          32, // Gemm2KPerBlock
-          8, // AK1
-          8, // BK1
-          2, // B1K1
-          32, // MPerXDL
-          32, // NPerXDL
-          4, // MXdlPerWave
-          1, // NXdlPerWave
+          GemmOpConstantsGroupedBackward_V1::Gemm1KPerBlock,
+          GemmOpConstantsGroupedBackward_V1::Gemm2KPerBlock,
+          GemmOpConstantsGroupedBackward_V1::AK1,
+          GemmOpConstantsGroupedBackward_V1::BK1,
+          GemmOpConstantsGroupedBackward_V1::B1K1,
+          GemmOpConstantsGroupedBackward_V1::MPerXDL,
+          GemmOpConstantsGroupedBackward_V1::NPerXDL,
+          GemmOpConstantsGroupedBackward_V1::MXdlPerWave,
+          GemmOpConstantsGroupedBackward_V1::NXdlPerWave,
           kGemm1NXdlPerWave,
-          1, // Gemm2NXdlPerWave
-          S<4, 64, 1>, // ABlockTransfer
-          S<1, 0, 2>,
-          S<1, 0, 2>,
-          2,
-          kABBlockTransferSrcScalarPerVector, // TUNABLE
-          8,
-          true,
-          S<4, 64, 1>, // BBlockTransfer
-          S<1, 0, 2>,
-          S<1, 0, 2>,
-          2,
-          kABBlockTransferSrcScalarPerVector, // TUNABLE
-          8,
-          true,
-          kAcc0BiasTransferSrcScalarPerVector, // TUNABLE
-          1, // CShuffleMXdlPerWavePerShuffle
+          GemmOpConstantsGroupedBackward_V1::Gemm2NXdlPerWave,
+          GemmOpConstantsGroupedBackward_V1::ABlockTransferThreadClusterLengths_AK0_M_AK1,
+          GemmOpConstantsGroupedBackward_V1::ABlockTransferThreadClusterArrangeOrder,
+          GemmOpConstantsGroupedBackward_V1::ABlockTransferSrcAccessOrder,
+          GemmOpConstantsGroupedBackward_V1::ABlockTransferSrcVectorDim,
+          kABBlockTransferSrcScalarPerVector,
+          GemmOpConstantsGroupedBackward_V1::ABlockTransferDstScalarPerVector_AK1,
+          GemmOpConstantsGroupedBackward_V1::ABlockLdsExtraM,
+          GemmOpConstantsGroupedBackward_V1::BBlockTransferThreadClusterLengths_BK0_N_BK1,
+          GemmOpConstantsGroupedBackward_V1::BBlockTransferThreadClusterArrangeOrder,
+          GemmOpConstantsGroupedBackward_V1::BBlockTransferSrcAccessOrder,
+          GemmOpConstantsGroupedBackward_V1::BBlockTransferSrcVectorDim,
+          kABBlockTransferSrcScalarPerVector,
+          GemmOpConstantsGroupedBackward_V1::BBlockTransferDstScalarPerVector_BK1,
+          GemmOpConstantsGroupedBackward_V1::BBlockLdsExtraN,
+          kAcc0BiasTransferSrcScalarPerVector,
+          GemmOpConstantsGroupedBackward_V2::CShuffleMXdlPerWavePerShuffle,
           kCShuffleNXdlPerWavePerShuffle,
           kCShuffleBlockTransferClusterLengths,
-          kCShuffleBlockTransferScalarPerVector, // TUNABLE
+          kCShuffleBlockTransferScalarPerVector,
           MaskingSpec,
           Deterministic>;
+  // clang-format on
 
-  static void Run(GroupedBackwardParams& param, hipStream_t stream) {
-    if (param.K <= 32 && param.Kv <= 32) {
-      constexpr ck::index_t kGemm1NPerBlock = 32;
-      constexpr ck::index_t kGemm1NXdlPerWave = 1;
-      constexpr ck::index_t kCShuffleNXdlPerWavePerShuffle = 1;
-      using kCShuffleBlockTransferClusterLengths = S<1, 64, 1, 4>;
-
-      using DeviceOpInstance = DeviceOpInstanceTemp<
+  // clang-format off
+  template <
+      ck::index_t kGemm1NPerBlock,
+      ck::index_t kGemm1NXdlPerWave,
+      ck::index_t kCShuffleNXdlPerWavePerShuffle,
+      typename kCShuffleBlockTransferClusterLengths,
+      ck::index_t kABBlockTransferSrcScalarPerVector,
+      ck::index_t kB1BlockTransferSrcScalarPerVector,
+      ck::index_t kCShuffleBlockTransferScalarPerVector>
+  using DeviceOpInstanceTemp_V2 = ck::tensor_operation::device::
+      DeviceGroupedMultiheadAttentionBackward_Qloop_Xdl_CShuffle_V2<
+          GemmOpConstantsCommon::NumDimG,
+          GemmOpConstantsCommon::NumDimM,
+          GemmOpConstantsCommon::NumDimN,
+          GemmOpConstantsCommon::NumDimK,
+          GemmOpConstantsCommon::NumDimO,
+          InputDataType,
+          OutputDataType,
+          GemmDataType,
+          ZDataType,
+          LSEDataType,
+          Acc0BiasDataType,
+          Acc1BiasDataType,
+          AccDataType,
+          ShuffleDataType,
+          QKVElementOp,
+          QKVElementOp,
+          Scale,
+          QKVElementOp,
+          YElementOp,
+          GemmSpec,
+          GemmOpConstantsCommon::TensorSpecA,
+          GemmOpConstantsCommon::TensorSpecB0,
+          GemmOpConstantsCommon::TensorSpecB1,
+          GemmOpConstantsCommon::TensorSpecC,
+          GemmOpConstantsGroupedBackward_V2::NumGemmKPrefetchStage,
+          GemmOpConstantsGroupedBackward_V2::BlockSize,
+          GemmOpConstantsGroupedBackward_V2::MPerBlock,
+          GemmOpConstantsGroupedBackward_V2::NPerBlock,
+          GemmOpConstantsGroupedBackward_V2::KPerBlock,
           kGemm1NPerBlock,
+          GemmOpConstantsGroupedBackward_V2::Gemm1KPerBlock,
+          GemmOpConstantsGroupedBackward_V2::Gemm2KPerBlock,
+          GemmOpConstantsGroupedBackward_V2::AK1,
+          GemmOpConstantsGroupedBackward_V2::BK1,
+          GemmOpConstantsGroupedBackward_V2::B1K1,
+          GemmOpConstantsGroupedBackward_V2::MPerXDL,
+          GemmOpConstantsGroupedBackward_V2::NPerXDL,
+          GemmOpConstantsGroupedBackward_V2::MXdlPerWave,
+          GemmOpConstantsGroupedBackward_V2::NXdlPerWave,
           kGemm1NXdlPerWave,
+          GemmOpConstantsBatchedBackward_V2::Gemm2NXdlPerWave,
+          GemmOpConstantsGroupedBackward_V2::ABlockTransferThreadClusterLengths_AK0_M_AK1,
+          GemmOpConstantsGroupedBackward_V2::ABlockTransferThreadClusterArrangeOrder,
+          GemmOpConstantsGroupedBackward_V2::ABlockTransferSrcAccessOrder,
+          GemmOpConstantsGroupedBackward_V2::ABlockTransferSrcVectorDim,
+          kABBlockTransferSrcScalarPerVector,
+          GemmOpConstantsGroupedBackward_V2::ABlockTransferDstScalarPerVector_AK1,
+          GemmOpConstantsGroupedBackward_V2::ABlockLdsExtraM,
+          GemmOpConstantsGroupedBackward_V2::BBlockTransferThreadClusterLengths_BK0_N_BK1,
+          GemmOpConstantsGroupedBackward_V2::BBlockTransferThreadClusterArrangeOrder,
+          GemmOpConstantsGroupedBackward_V2::BBlockTransferSrcAccessOrder,
+          GemmOpConstantsGroupedBackward_V2::BBlockTransferSrcVectorDim,
+          kABBlockTransferSrcScalarPerVector,
+          GemmOpConstantsGroupedBackward_V2::BBlockTransferDstScalarPerVector_BK1,
+          GemmOpConstantsGroupedBackward_V2::BBlockLdsExtraN,
+          kAcc0BiasTransferSrcScalarPerVector,
+          GemmOpConstantsGroupedBackward_V2::B1BlockTransferThreadClusterLengths_BK0_N_BK1,
+          GemmOpConstantsGroupedBackward_V2::B1BlockTransferThreadClusterArrangeOrder,
+          GemmOpConstantsGroupedBackward_V2::B1BlockTransferSrcAccessOrder,
+          GemmOpConstantsGroupedBackward_V2::B1BlockTransferSrcVectorDim,
+          kB1BlockTransferSrcScalarPerVector,
+          GemmOpConstantsGroupedBackward_V2::B1BlockTransferDstScalarPerVector_BK1,
+          GemmOpConstantsGroupedBackward_V2::B1BlockLdsExtraN,
+          GemmOpConstantsGroupedBackward_V2::CShuffleMXdlPerWavePerShuffle,
           kCShuffleNXdlPerWavePerShuffle,
-          kCShuffleBlockTransferClusterLengths>;
-
-      RunWithDeviceOp<DeviceOpInstance>(param, stream);
-    } else if (param.K <= 64 && param.Kv <= 64) {
-      constexpr ck::index_t kGemm1NPerBlock = 64;
-      constexpr ck::index_t kGemm1NXdlPerWave = 2;
-      constexpr ck::index_t kCShuffleNXdlPerWavePerShuffle = 2;
-      using kCShuffleBlockTransferClusterLengths = S<1, 32, 1, 8>;
+          kCShuffleBlockTransferClusterLengths,
+          kCShuffleBlockTransferScalarPerVector,
+          MaskingSpec,
+          Deterministic>;
+  // clang-format on
 
-      using DeviceOpInstance = DeviceOpInstanceTemp<
-          kGemm1NPerBlock,
-          kGemm1NXdlPerWave,
-          kCShuffleNXdlPerWavePerShuffle,
-          kCShuffleBlockTransferClusterLengths>;
+  static constexpr auto I1 = ck::Number<1>{};
+  static constexpr auto I2 = ck::Number<2>{};
+  static constexpr auto I3 = ck::Number<3>{};
 
-      RunWithDeviceOp<DeviceOpInstance>(param, stream);
+  static void Run(GroupedBackwardParams& param, hipStream_t stream) {
+    using ck::math::min;
+
+    if (param.K <= 64 && param.Kv <= 64) {
+      // compile-time constants which don't depend on head-dim switching
+      constexpr ck::index_t thread_slice_length_ak1 =
+          GemmOpConstantsGroupedBackward_V1::AK1 /
+          GemmOpConstantsGroupedBackward_V1::
+              ABlockTransferThreadClusterLengths_AK0_M_AK1::At(I2);
+      constexpr ck::index_t thread_slice_length_bk1 =
+          GemmOpConstantsGroupedBackward_V1::BK1 /
+          GemmOpConstantsGroupedBackward_V1::
+              BBlockTransferThreadClusterLengths_BK0_N_BK1::At(I2);
+
+      static_assert(
+          thread_slice_length_ak1 == thread_slice_length_bk1,
+          "ABlockTransfer and BBlockTransfer should use completely same K1 sizes and ThreadClusterLengths!");
+
+      constexpr ck::index_t kABBlockTransferSrcScalarPerVector_max =
+          min(2, thread_slice_length_ak1);
+
+      GROUPED_BACKWARD_V1_HEADDIM_SWITCH(param.K, param.Kv, [&] {
+        constexpr ck::index_t thread_slice_length_cshuflle_n =
+            (kCShuffleNXdlPerWavePerShuffle * kGemm1NPerBlock /
+             kGemm1NXdlPerWave) /
+            kCShuffleBlockTransferClusterLengths::At(I3);
+
+        constexpr ck::index_t kCShuffleBlockTransferScalarPerVector_max =
+            min(2, thread_slice_length_cshuflle_n);
+
+        ALIGN_SWITCH_2(
+            kABBlockTransferSrcScalarPerVector_max,
+            kABBlockTransferSrcScalarPerVector,
+            param.K,
+            kCShuffleBlockTransferScalarPerVector_max,
+            kCShuffleBlockTransferScalarPerVector,
+            param.Kv,
+            [&] {
+              using DeviceOpInstance = DeviceOpInstanceTemp_V1<
+                  kGemm1NPerBlock,
+                  kGemm1NXdlPerWave,
+                  kCShuffleNXdlPerWavePerShuffle,
+                  kCShuffleBlockTransferClusterLengths,
+                  kABBlockTransferSrcScalarPerVector,
+                  kCShuffleBlockTransferScalarPerVector>;
+
+              RunWithDeviceOp<DeviceOpInstance>(param, stream);
+            });
+      });
     } else {
-      using DeviceOpInstance = ck::tensor_operation::device::
-          DeviceGroupedMultiheadAttentionBackward_Qloop_Xdl_CShuffle_V2<
-              NumDimG,
-              NumDimM,
-              NumDimN,
-              NumDimK,
-              NumDimO,
-              InputDataType,
-              OutputDataType,
-              GemmDataType,
-              ZDataType,
-              LSEDataType,
-              Acc0BiasDataType,
-              Acc1BiasDataType,
-              AccDataType,
-              ShuffleDataType,
-              QKVElementOp,
-              QKVElementOp,
-              Scale,
-              QKVElementOp,
-              YElementOp,
-              GemmSpec,
-              TensorSpecQ,
-              TensorSpecK,
-              TensorSpecV,
-              TensorSpecY,
-              1,
-              256,
-              64, // MPerBlock
-              128, // NPerBlock
-              128, // KPerBlock
-              128, // Gemm1NPerBlock
-              32, // Gemm1KPerBlock
-              64, // Gemm2KPerBlock
-              8, // AK1
-              8, // BK1
-              2, // B1K1
-              32, // MPerXDL
-              32, // NPerXDL
-              2, // MXdlPerWave
-              1, // NXdlPerWave
-              4, // Gemm1NXdlPerWave
-              1, // Gemm2NXdlPerWave
-              S<4, 64, 1>, // ABlockTransfer
-              S<1, 0, 2>,
-              S<1, 0, 2>,
-              2,
-              kABBlockTransferSrcScalarPerVector, // TUNABLE
-              8,
-              true,
-              S<4, 64, 1>, // B0BlockTransfer
-              S<1, 0, 2>,
-              S<1, 0, 2>,
-              2,
-              kABBlockTransferSrcScalarPerVector, // TUNABLE
-              8,
-              true,
-              kAcc0BiasTransferSrcScalarPerVector, // TUNABLE
-              S<8, 32, 1>, // B1BlockTransfer
-              S<0, 2, 1>,
-              S<0, 2, 1>,
-              1,
-              kB1BlockTransferSrcScalarPerVector, // TUNABLE
-              2,
-              false,
-              1, // CShuffleMXdlPerWavePerShuffle
-              4, // CShuffleNXdlPerWavePerShuffle
-              S<1, 32, 1, 8>,
-              kCShuffleBlockTransferScalarPerVector, // TUNABLE
-              MaskingSpec,
-              Deterministic>;
-
-      RunWithDeviceOp<DeviceOpInstance>(param, stream);
+      constexpr ck::index_t kGemm1NPerBlock = 128;
+      constexpr ck::index_t kGemm1NXdlPerWave = 4;
+      constexpr ck::index_t kCShuffleNXdlPerWavePerShuffle = 4;
+      using kCShuffleBlockTransferClusterLengths = S<1, 32, 1, 8>;
+
+      constexpr ck::index_t thread_slice_length_ak1 =
+          GemmOpConstantsGroupedBackward_V2::AK1 /
+          GemmOpConstantsGroupedBackward_V2::
+              ABlockTransferThreadClusterLengths_AK0_M_AK1::At(I2);
+      constexpr ck::index_t thread_slice_length_bk1 =
+          GemmOpConstantsGroupedBackward_V2::BK1 /
+          GemmOpConstantsGroupedBackward_V2::
+              BBlockTransferThreadClusterLengths_BK0_N_BK1::At(I2);
+
+      static_assert(
+          thread_slice_length_ak1 == thread_slice_length_bk1,
+          "ABlockTransfer and BBlockTransfer should use completely same K1 sizes and ThreadClusterLengths!");
+
+      constexpr ck::index_t kABBlockTransferSrcScalarPerVector_max =
+          min(2, thread_slice_length_ak1);
+
+      constexpr ck::index_t thread_slice_length_gemm1n = kGemm1NPerBlock /
+          GemmOpConstantsGroupedBackward_V2::
+              B1BlockTransferThreadClusterLengths_BK0_N_BK1::At(I1);
+      constexpr ck::index_t kB1BlockTransferSrcScalarPerVector_max =
+          min(2, thread_slice_length_gemm1n);
+
+      constexpr ck::index_t thread_slice_length_cshuflle_n =
+          (kCShuffleNXdlPerWavePerShuffle * kGemm1NPerBlock /
+           kGemm1NXdlPerWave) /
+          kCShuffleBlockTransferClusterLengths::At(I3);
+
+      constexpr ck::index_t kCShuffleBlockTransferScalarPerVector_max =
+          min(2, thread_slice_length_cshuflle_n);
+
+      if constexpr (
+          kB1BlockTransferSrcScalarPerVector_max >=
+          kCShuffleBlockTransferScalarPerVector_max) {
+        ALIGN_SWITCH_2(
+            kABBlockTransferSrcScalarPerVector_max,
+            kABBlockTransferSrcScalarPerVector,
+            param.K,
+            kB1BlockTransferSrcScalarPerVector_max,
+            kB1BlockTransferSrcScalarPerVector,
+            param.Kv,
+            [&] {
+              constexpr ck::index_t kCShuffleBlockTransferScalarPerVector =
+                  min(kB1BlockTransferSrcScalarPerVector,
+                      kCShuffleBlockTransferScalarPerVector_max);
+              using DeviceOpInstance = DeviceOpInstanceTemp_V2<
+                  kGemm1NPerBlock,
+                  kGemm1NXdlPerWave,
+                  kCShuffleNXdlPerWavePerShuffle,
+                  kCShuffleBlockTransferClusterLengths,
+                  kABBlockTransferSrcScalarPerVector,
+                  kB1BlockTransferSrcScalarPerVector,
+                  kCShuffleBlockTransferScalarPerVector>;
+
+              RunWithDeviceOp<DeviceOpInstance>(param, stream);
+            });
+      } else {
+        ALIGN_SWITCH_2(
+            kABBlockTransferSrcScalarPerVector_max,
+            kABBlockTransferSrcScalarPerVector,
+            param.K,
+            kCShuffleBlockTransferScalarPerVector_max,
+            kCShuffleBlockTransferScalarPerVector,
+            param.Kv,
+            [&] {
+              constexpr ck::index_t kB1BlockTransferSrcScalarPerVector =
+                  min(kCShuffleBlockTransferScalarPerVector,
+                      kB1BlockTransferSrcScalarPerVector_max);
+              using DeviceOpInstance = DeviceOpInstanceTemp_V2<
+                  kGemm1NPerBlock,
+                  kGemm1NXdlPerWave,
+                  kCShuffleNXdlPerWavePerShuffle,
+                  kCShuffleBlockTransferClusterLengths,
+                  kABBlockTransferSrcScalarPerVector,
+                  kB1BlockTransferSrcScalarPerVector,
+                  kCShuffleBlockTransferScalarPerVector>;
+
+              RunWithDeviceOp<DeviceOpInstance>(param, stream);
+            });
+      };
     };
   };
 

From ad617a5b8d08348beb76a815ab2c8cac9d6ff33c Mon Sep 17 00:00:00 2001
From: Qianfeng Zhang <Qianfeng.Zhang@amd.com>
Date: Sat, 28 Oct 2023 00:36:27 +0000
Subject: [PATCH 125/641] Add clang-format off to better show the device-op
 template instance definition

---
 .../hip_fmha/ck_fmha_batched_forward.h        | 26 +++++++------------
 .../hip_fmha/ck_fmha_batched_infer.h          | 17 +++++-------
 .../hip_fmha/ck_fmha_grouped_backward.h       |  3 +--
 .../hip_fmha/ck_fmha_grouped_forward.h        | 26 +++++++------------
 .../hip_fmha/ck_fmha_grouped_infer.h          | 19 ++++++--------
 5 files changed, 36 insertions(+), 55 deletions(-)

diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_forward.h b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_forward.h
index f9d0dc087..34f748aa7 100644
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_forward.h
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_forward.h
@@ -85,6 +85,7 @@ struct batched_forward_masktype_attnbias_dispatched {
   }()
 #endif
 
+  // clang-format off
   template <
       ck::index_t kGemm1NPerBlock,
       ck::index_t kGemm1NXdlPerWave,
@@ -92,8 +93,7 @@ struct batched_forward_masktype_attnbias_dispatched {
       ck::index_t kABBlockTransferSrcScalarPerVector,
       ck::index_t kB1BlockTransferSrcScalarPerVector,
       ck::index_t kCShuffleBlockTransferScalarPerVector>
-  using DeviceOpInstanceTemp = ck::tensor_operation::device::
-      DeviceBatchedMultiheadAttentionForward_Xdl_CShuffle_V2<
+  using DeviceOpInstanceTemp = ck::tensor_operation::device::DeviceBatchedMultiheadAttentionForward_Xdl_CShuffle_V2<
           GemmOpConstantsCommon::NumDimG,
           GemmOpConstantsCommon::NumDimM,
           GemmOpConstantsCommon::NumDimN,
@@ -136,29 +136,23 @@ struct batched_forward_masktype_attnbias_dispatched {
           GemmOpConstantsBatchedForward::NXdlPerWave,
           kGemm1NXdlPerWave,
           GemmOpConstantsBatchedForward::DropoutStep,
-          GemmOpConstantsBatchedForward::
-              ABlockTransferThreadClusterLengths_AK0_M_AK1,
-          GemmOpConstantsBatchedForward::
-              ABlockTransferThreadClusterArrangeOrder,
+          GemmOpConstantsBatchedForward::ABlockTransferThreadClusterLengths_AK0_M_AK1,
+          GemmOpConstantsBatchedForward::ABlockTransferThreadClusterArrangeOrder,
           GemmOpConstantsBatchedForward::ABlockTransferSrcAccessOrder,
           GemmOpConstantsBatchedForward::ABlockTransferSrcVectorDim,
           kABBlockTransferSrcScalarPerVector,
           GemmOpConstantsBatchedForward::ABlockTransferDstScalarPerVector_AK1,
           GemmOpConstantsBatchedForward::ABlockLdsExtraM,
-          GemmOpConstantsBatchedForward::
-              BBlockTransferThreadClusterLengths_BK0_N_BK1,
-          GemmOpConstantsBatchedForward::
-              BBlockTransferThreadClusterArrangeOrder,
+          GemmOpConstantsBatchedForward::BBlockTransferThreadClusterLengths_BK0_N_BK1,
+          GemmOpConstantsBatchedForward::BBlockTransferThreadClusterArrangeOrder,
           GemmOpConstantsBatchedForward::BBlockTransferSrcAccessOrder,
           GemmOpConstantsBatchedForward::BBlockTransferSrcVectorDim,
           kABBlockTransferSrcScalarPerVector,
           GemmOpConstantsBatchedForward::BBlockTransferDstScalarPerVector_BK1,
           GemmOpConstantsBatchedForward::BBlockLdsExtraN,
           kAcc0BiasTransferSrcScalarPerVector,
-          GemmOpConstantsBatchedForward::
-              B1BlockTransferThreadClusterLengths_BK0_N_BK1,
-          GemmOpConstantsBatchedForward::
-              B1BlockTransferThreadClusterArrangeOrder,
+          GemmOpConstantsBatchedForward::B1BlockTransferThreadClusterLengths_BK0_N_BK1,
+          GemmOpConstantsBatchedForward::B1BlockTransferThreadClusterArrangeOrder,
           GemmOpConstantsBatchedForward::B1BlockTransferSrcAccessOrder,
           GemmOpConstantsBatchedForward::B1BlockTransferSrcVectorDim,
           kB1BlockTransferSrcScalarPerVector,
@@ -166,11 +160,11 @@ struct batched_forward_masktype_attnbias_dispatched {
           GemmOpConstantsBatchedForward::B1BlockLdsExtraN,
           GemmOpConstantsBatchedForward::CShuffleMXdlPerWavePerShuffle,
           kCShuffleNXdlPerWavePerShuffle,
-          GemmOpConstantsBatchedForward::
-              CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock,
+          GemmOpConstantsBatchedForward::CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock,
           kCShuffleBlockTransferScalarPerVector,
           GemmOpConstantsBatchedForward::Acc1BiasTransferSrcScalarPerVector,
           MaskingSpec>;
+  // clang-format on
 
   static constexpr auto I1 = ck::Number<1>{};
   static constexpr auto I2 = ck::Number<2>{};
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_infer.h b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_infer.h
index 335a7ca3b..b3a6bd0c4 100644
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_infer.h
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_infer.h
@@ -70,6 +70,7 @@ struct batched_infer_masktype_attnbias_dispatched {
   }()
 #endif
 
+  // clang-format off
   template <
       ck::index_t kGemm1NPerBlock,
       ck::index_t kGemm1NXdlPerWave,
@@ -77,8 +78,7 @@ struct batched_infer_masktype_attnbias_dispatched {
       ck::index_t kABBlockTransferSrcScalarPerVector,
       ck::index_t kB1BlockTransferSrcScalarPerVector,
       ck::index_t kCShuffleBlockTransferScalarPerVector>
-  using DeviceOpInstanceTemp = ck::tensor_operation::device::
-      DeviceBatchedMultiheadAttentionInfer_Xdl_CShuffle<
+  using DeviceOpInstanceTemp = ck::tensor_operation::device::DeviceBatchedMultiheadAttentionInfer_Xdl_CShuffle<
           GemmOpConstantsCommon::NumDimG,
           GemmOpConstantsCommon::NumDimM,
           GemmOpConstantsCommon::NumDimN,
@@ -117,16 +117,14 @@ struct batched_infer_masktype_attnbias_dispatched {
           GemmOpConstantsBatchedInfer::MXdlPerWave,
           GemmOpConstantsBatchedInfer::NXdlPerWave,
           kGemm1NXdlPerWave,
-          GemmOpConstantsBatchedInfer::
-              ABlockTransferThreadClusterLengths_AK0_M_AK1,
+          GemmOpConstantsBatchedInfer::ABlockTransferThreadClusterLengths_AK0_M_AK1,
           GemmOpConstantsBatchedInfer::ABlockTransferThreadClusterArrangeOrder,
           GemmOpConstantsBatchedInfer::ABlockTransferSrcAccessOrder,
           GemmOpConstantsBatchedInfer::ABlockTransferSrcVectorDim,
           kABBlockTransferSrcScalarPerVector,
           GemmOpConstantsBatchedInfer::ABlockTransferDstScalarPerVector_AK1,
           GemmOpConstantsBatchedInfer::ABlockLdsExtraM,
-          GemmOpConstantsBatchedInfer::
-              BBlockTransferThreadClusterLengths_BK0_N_BK1,
+          GemmOpConstantsBatchedInfer::BBlockTransferThreadClusterLengths_BK0_N_BK1,
           GemmOpConstantsBatchedInfer::BBlockTransferThreadClusterArrangeOrder,
           GemmOpConstantsBatchedInfer::BBlockTransferSrcAccessOrder,
           GemmOpConstantsBatchedInfer::BBlockTransferSrcVectorDim,
@@ -134,8 +132,7 @@ struct batched_infer_masktype_attnbias_dispatched {
           GemmOpConstantsBatchedInfer::BBlockTransferDstScalarPerVector_BK1,
           GemmOpConstantsBatchedInfer::BBlockLdsExtraN,
           kAcc0BiasTransferSrcScalarPerVector,
-          GemmOpConstantsBatchedInfer::
-              B1BlockTransferThreadClusterLengths_BK0_N_BK1,
+          GemmOpConstantsBatchedInfer::B1BlockTransferThreadClusterLengths_BK0_N_BK1,
           GemmOpConstantsBatchedInfer::B1BlockTransferThreadClusterArrangeOrder,
           GemmOpConstantsBatchedInfer::B1BlockTransferSrcAccessOrder,
           GemmOpConstantsBatchedInfer::B1BlockTransferSrcVectorDim,
@@ -144,10 +141,10 @@ struct batched_infer_masktype_attnbias_dispatched {
           GemmOpConstantsBatchedInfer::B1BlockLdsExtraN,
           GemmOpConstantsBatchedInfer::CShuffleMXdlPerWavePerShuffle,
           kCShuffleNXdlPerWavePerShuffle,
-          GemmOpConstantsBatchedInfer::
-              CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock,
+          GemmOpConstantsBatchedInfer::CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock,
           kCShuffleBlockTransferScalarPerVector,
           MaskingSpec>;
+  // clang-format on
 
   static constexpr auto I1 = ck::Number<1>{};
   static constexpr auto I2 = ck::Number<2>{};
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_backward.h b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_backward.h
index 3301fc2b6..85f97931f 100644
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_backward.h
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_backward.h
@@ -153,8 +153,7 @@ struct grouped_backward_masktype_attnbias_dispatched {
       ck::index_t kABBlockTransferSrcScalarPerVector,
       ck::index_t kB1BlockTransferSrcScalarPerVector,
       ck::index_t kCShuffleBlockTransferScalarPerVector>
-  using DeviceOpInstanceTemp_V2 = ck::tensor_operation::device::
-      DeviceGroupedMultiheadAttentionBackward_Qloop_Xdl_CShuffle_V2<
+  using DeviceOpInstanceTemp_V2 = ck::tensor_operation::device::DeviceGroupedMultiheadAttentionBackward_Qloop_Xdl_CShuffle_V2<
           GemmOpConstantsCommon::NumDimG,
           GemmOpConstantsCommon::NumDimM,
           GemmOpConstantsCommon::NumDimN,
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_forward.h b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_forward.h
index 1ca4c3210..9f22b7e28 100644
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_forward.h
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_forward.h
@@ -70,6 +70,7 @@ struct grouped_forward_masktype_attnbias_dispatched {
   }()
 #endif
 
+  // clang-format off
   template <
       ck::index_t kGemm1NPerBlock,
       ck::index_t kGemm1NXdlPerWave,
@@ -77,8 +78,7 @@ struct grouped_forward_masktype_attnbias_dispatched {
       ck::index_t kABBlockTransferSrcScalarPerVector,
       ck::index_t kB1BlockTransferSrcScalarPerVector,
       ck::index_t kCShuffleBlockTransferScalarPerVector>
-  using DeviceOpInstanceTemp = ck::tensor_operation::device::
-      DeviceGroupedMultiheadAttentionForward_Xdl_CShuffle_V2<
+  using DeviceOpInstanceTemp = ck::tensor_operation::device::DeviceGroupedMultiheadAttentionForward_Xdl_CShuffle_V2<
           GemmOpConstantsCommon::NumDimG,
           GemmOpConstantsCommon::NumDimM,
           GemmOpConstantsCommon::NumDimN,
@@ -121,29 +121,23 @@ struct grouped_forward_masktype_attnbias_dispatched {
           GemmOpConstantsGroupedForward::NXdlPerWave,
           kGemm1NXdlPerWave,
           GemmOpConstantsGroupedForward::DropoutStep,
-          GemmOpConstantsGroupedForward::
-              ABlockTransferThreadClusterLengths_AK0_M_AK1,
-          GemmOpConstantsGroupedForward::
-              ABlockTransferThreadClusterArrangeOrder,
+          GemmOpConstantsGroupedForward::ABlockTransferThreadClusterLengths_AK0_M_AK1,
+          GemmOpConstantsGroupedForward::ABlockTransferThreadClusterArrangeOrder,
           GemmOpConstantsGroupedForward::ABlockTransferSrcAccessOrder,
           GemmOpConstantsGroupedForward::ABlockTransferSrcVectorDim,
           kABBlockTransferSrcScalarPerVector,
           GemmOpConstantsGroupedForward::ABlockTransferDstScalarPerVector_AK1,
           GemmOpConstantsGroupedForward::ABlockLdsExtraM,
-          GemmOpConstantsGroupedForward::
-              BBlockTransferThreadClusterLengths_BK0_N_BK1,
-          GemmOpConstantsGroupedForward::
-              BBlockTransferThreadClusterArrangeOrder,
+          GemmOpConstantsGroupedForward::BBlockTransferThreadClusterLengths_BK0_N_BK1,
+          GemmOpConstantsGroupedForward::BBlockTransferThreadClusterArrangeOrder,
           GemmOpConstantsGroupedForward::BBlockTransferSrcAccessOrder,
           GemmOpConstantsGroupedForward::BBlockTransferSrcVectorDim,
           kABBlockTransferSrcScalarPerVector,
           GemmOpConstantsGroupedForward::BBlockTransferDstScalarPerVector_BK1,
           GemmOpConstantsGroupedForward::BBlockLdsExtraN,
           kAcc0BiasTransferSrcScalarPerVector,
-          GemmOpConstantsGroupedForward::
-              B1BlockTransferThreadClusterLengths_BK0_N_BK1,
-          GemmOpConstantsGroupedForward::
-              B1BlockTransferThreadClusterArrangeOrder,
+          GemmOpConstantsGroupedForward::B1BlockTransferThreadClusterLengths_BK0_N_BK1,
+          GemmOpConstantsGroupedForward::B1BlockTransferThreadClusterArrangeOrder,
           GemmOpConstantsGroupedForward::B1BlockTransferSrcAccessOrder,
           GemmOpConstantsGroupedForward::B1BlockTransferSrcVectorDim,
           kB1BlockTransferSrcScalarPerVector,
@@ -151,11 +145,11 @@ struct grouped_forward_masktype_attnbias_dispatched {
           GemmOpConstantsGroupedForward::B1BlockLdsExtraN,
           GemmOpConstantsGroupedForward::CShuffleMXdlPerWavePerShuffle,
           kCShuffleNXdlPerWavePerShuffle,
-          GemmOpConstantsGroupedForward::
-              CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock,
+          GemmOpConstantsGroupedForward::CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock,
           kCShuffleBlockTransferScalarPerVector,
           GemmOpConstantsGroupedForward::Acc1BiasTransferSrcScalarPerVector,
           MaskingSpec>;
+  // clang-format on
 
   static constexpr auto I1 = ck::Number<1>{};
   static constexpr auto I2 = ck::Number<2>{};
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_infer.h b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_infer.h
index 5552a3074..775ff94b5 100644
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_infer.h
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_infer.h
@@ -70,6 +70,7 @@ struct grouped_infer_masktype_attnbias_dispatched {
   }()
 #endif
 
+  // clang-format off
   template <
       ck::index_t kGemm1NPerBlock,
       ck::index_t kGemm1NXdlPerWave,
@@ -77,8 +78,7 @@ struct grouped_infer_masktype_attnbias_dispatched {
       ck::index_t kABBlockTransferSrcScalarPerVector,
       ck::index_t kB1BlockTransferSrcScalarPerVector,
       ck::index_t kCShuffleBlockTransferScalarPerVector>
-  using DeviceOpInstanceTemp = ck::tensor_operation::device::
-      DeviceGroupedMultiheadAttentionInfer_Xdl_CShuffle<
+  using DeviceOpInstanceTemp = ck::tensor_operation::device::DeviceGroupedMultiheadAttentionInfer_Xdl_CShuffle<
           GemmOpConstantsCommon::NumDimG,
           GemmOpConstantsCommon::NumDimM,
           GemmOpConstantsCommon::NumDimN,
@@ -117,16 +117,14 @@ struct grouped_infer_masktype_attnbias_dispatched {
           GemmOpConstantsGroupedInfer::MXdlPerWave,
           GemmOpConstantsGroupedInfer::NXdlPerWave,
           kGemm1NXdlPerWave,
-          GemmOpConstantsGroupedInfer::
-              ABlockTransferThreadClusterLengths_AK0_M_AK1,
+          GemmOpConstantsGroupedInfer::ABlockTransferThreadClusterLengths_AK0_M_AK1,
           GemmOpConstantsGroupedInfer::ABlockTransferThreadClusterArrangeOrder,
           GemmOpConstantsGroupedInfer::ABlockTransferSrcAccessOrder,
           GemmOpConstantsGroupedInfer::ABlockTransferSrcVectorDim,
-          kABBlockTransferSrcScalarPerVector, // TUNABLE
+          kABBlockTransferSrcScalarPerVector, 
           GemmOpConstantsGroupedInfer::ABlockTransferDstScalarPerVector_AK1,
           GemmOpConstantsGroupedInfer::ABlockLdsExtraM,
-          GemmOpConstantsGroupedInfer::
-              BBlockTransferThreadClusterLengths_BK0_N_BK1,
+          GemmOpConstantsGroupedInfer::BBlockTransferThreadClusterLengths_BK0_N_BK1,
           GemmOpConstantsGroupedInfer::BBlockTransferThreadClusterArrangeOrder,
           GemmOpConstantsGroupedInfer::BBlockTransferSrcAccessOrder,
           GemmOpConstantsGroupedInfer::BBlockTransferSrcVectorDim,
@@ -134,8 +132,7 @@ struct grouped_infer_masktype_attnbias_dispatched {
           GemmOpConstantsGroupedInfer::BBlockTransferDstScalarPerVector_BK1,
           GemmOpConstantsGroupedInfer::BBlockLdsExtraN,
           kAcc0BiasTransferSrcScalarPerVector,
-          GemmOpConstantsGroupedInfer::
-              B1BlockTransferThreadClusterLengths_BK0_N_BK1,
+          GemmOpConstantsGroupedInfer::B1BlockTransferThreadClusterLengths_BK0_N_BK1,
           GemmOpConstantsGroupedInfer::B1BlockTransferThreadClusterArrangeOrder,
           GemmOpConstantsGroupedInfer::B1BlockTransferSrcAccessOrder,
           GemmOpConstantsGroupedInfer::B1BlockTransferSrcVectorDim,
@@ -144,10 +141,10 @@ struct grouped_infer_masktype_attnbias_dispatched {
           GemmOpConstantsGroupedInfer::B1BlockLdsExtraN,
           GemmOpConstantsGroupedInfer::CShuffleMXdlPerWavePerShuffle,
           kCShuffleNXdlPerWavePerShuffle,
-          GemmOpConstantsGroupedInfer::
-              CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock,
+          GemmOpConstantsGroupedInfer::CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock,
           kCShuffleBlockTransferScalarPerVector,
           MaskingSpec>;
+  // clang-format on
 
   static constexpr auto I1 = ck::Number<1>{};
   static constexpr auto I2 = ck::Number<2>{};

From e9c7919a13a41c2018c247a90e12d9d4b77d9221 Mon Sep 17 00:00:00 2001
From: Qianfeng Zhang <Qianfeng.Zhang@amd.com>
Date: Sat, 28 Oct 2023 15:56:51 +0000
Subject: [PATCH 126/641] Tiny change in gemm constants for infer

---
 xformers/csrc/attention/hip_fmha/ck_fmha_batched_infer.h        | 2 +-
 xformers/csrc/attention/hip_fmha/ck_fmha_grouped_infer.h        | 2 +-
 xformers/csrc/attention/hip_fmha/ck_fmha_infer_gemm_constants.h | 2 ++
 3 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_infer.h b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_infer.h
index b3a6bd0c4..639d333c5 100644
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_infer.h
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_infer.h
@@ -102,7 +102,7 @@ struct batched_infer_masktype_attnbias_dispatched {
           GemmOpConstantsCommon::TensorSpecB0,
           GemmOpConstantsCommon::TensorSpecB1,
           GemmOpConstantsCommon::TensorSpecC,
-          1,
+          GemmOpConstantsBatchedInfer::NumGemmKPrefetchStage,
           GemmOpConstantsBatchedInfer::BlockSize,
           GemmOpConstantsBatchedInfer::MPerBlock,
           GemmOpConstantsBatchedInfer::NPerBlock,
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_infer.h b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_infer.h
index 775ff94b5..dba421a7b 100644
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_infer.h
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_infer.h
@@ -102,7 +102,7 @@ struct grouped_infer_masktype_attnbias_dispatched {
           GemmOpConstantsCommon::TensorSpecB0,
           GemmOpConstantsCommon::TensorSpecB1,
           GemmOpConstantsCommon::TensorSpecC,
-          1,
+          GemmOpConstantsBatchedInfer::NumGemmKPrefetchStage,
           GemmOpConstantsGroupedInfer::BlockSize,
           GemmOpConstantsGroupedInfer::MPerBlock,
           GemmOpConstantsGroupedInfer::NPerBlock,
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_infer_gemm_constants.h b/xformers/csrc/attention/hip_fmha/ck_fmha_infer_gemm_constants.h
index ae66edc1c..b80dc9412 100644
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_infer_gemm_constants.h
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_infer_gemm_constants.h
@@ -6,6 +6,7 @@
 // list the template parameters that will not be tuned,
 // the commented lines gives the tunable template parameters
 struct GemmOpConstantsBatchedInfer {
+  static constexpr ck::index_t NumGemmKPrefetchStage = 1;
   static constexpr ck::index_t BlockSize = 256;
   static constexpr ck::index_t MPerBlock = 128;
   static constexpr ck::index_t NPerBlock = 128;
@@ -53,6 +54,7 @@ struct GemmOpConstantsBatchedInfer {
 // list the template parameters that will not be tuned,
 // the commented lines gives the tunable template parameters
 struct GemmOpConstantsGroupedInfer {
+  static constexpr ck::index_t NumGemmKPrefetchStage = 1;
   static constexpr ck::index_t BlockSize = 256;
   static constexpr ck::index_t MPerBlock = 128;
   static constexpr ck::index_t NPerBlock = 128;

From 33d5e39645c8a89e14a23d8d45cf0abbcbeafd4a Mon Sep 17 00:00:00 2001
From: Qianfeng Zhang <Qianfeng.Zhang@amd.com>
Date: Sun, 29 Oct 2023 14:30:00 +0000
Subject: [PATCH 127/641] Add support for mulit-query attention and group-query
 attention

---
 third_party/composable_kernel                 |   2 +-
 .../hip_fmha/attention_backward_generic.cpp   | 112 ++++++++++++++----
 .../hip_fmha/attention_forward_generic.cpp    |  35 +++---
 .../hip_fmha/ck_fmha_batched_backward.h       |  41 ++++---
 .../hip_fmha/ck_fmha_batched_forward.h        |  13 +-
 .../hip_fmha/ck_fmha_batched_infer.h          |  13 +-
 .../hip_fmha/ck_fmha_grouped_backward.h       |  41 ++++---
 .../hip_fmha/ck_fmha_grouped_forward.h        |  15 +--
 .../hip_fmha/ck_fmha_grouped_infer.h          |  13 +-
 .../csrc/attention/hip_fmha/ck_fmha_params.h  |  20 +++-
 10 files changed, 201 insertions(+), 104 deletions(-)

diff --git a/third_party/composable_kernel b/third_party/composable_kernel
index 4033f5df2..339b86e96 160000
--- a/third_party/composable_kernel
+++ b/third_party/composable_kernel
@@ -1 +1 @@
-Subproject commit 4033f5df2de7a3e778fced14041304d6fc20d673
+Subproject commit 339b86e9682120d8aaa415203545a3cfadbbb142
diff --git a/xformers/csrc/attention/hip_fmha/attention_backward_generic.cpp b/xformers/csrc/attention/hip_fmha/attention_backward_generic.cpp
index 1d28afd8c..c513664f2 100644
--- a/xformers/csrc/attention/hip_fmha/attention_backward_generic.cpp
+++ b/xformers/csrc/attention/hip_fmha/attention_backward_generic.cpp
@@ -73,8 +73,8 @@ efficient_attention_backward_ck(
   TORCH_CHECK(query.size(1) == grad_out.size(1));
 
   // Num heads
-  TORCH_CHECK(query.size(2) == key.size(2));
-  TORCH_CHECK(query.size(2) == value.size(2));
+  TORCH_CHECK(query.size(2) % key.size(2) == 0);
+  TORCH_CHECK(key.size(2) == value.size(2));
   TORCH_CHECK(query.size(2) == grad_out.size(2));
 
   // Embedding per head
@@ -122,7 +122,8 @@ efficient_attention_backward_ck(
   int64_t B = query.size(0);
   int64_t M = query.size(1);
   int64_t N = key.size(1);
-  int64_t num_heads = query.size(2);
+  int64_t Hq = query.size(2);
+  int64_t Hkv = key.size(2);
   int64_t K = query.size(3);
   int64_t Kv = value.size(3);
 
@@ -131,6 +132,7 @@ efficient_attention_backward_ck(
   at::Tensor grad_q, grad_k, grad_v, grad_bias;
 
   if (query.size(1) == key.size(1) && query.size(3) == value.size(3) &&
+      query.size(2) == key.size(2) &&
       query.storage().is_alias_of(key.storage()) &&
       query.storage().is_alias_of(value.storage())) {
     // Create one big contiguous chunk for grad_q, grad_k, grad_v
@@ -140,9 +142,9 @@ efficient_attention_backward_ck(
     // a `torch.cat` call in the backward pass
     at::Tensor chunk;
     if (use_fp32_qkv_grad)
-      chunk = at::empty({B, M, 3, num_heads, K}, opts.dtype(at::kFloat));
+      chunk = at::empty({B, M, 3, Hq, K}, opts.dtype(at::kFloat));
     else
-      chunk = at::empty({B, M, 3, num_heads, K}, opts);
+      chunk = at::empty({B, M, 3, Hq, K}, opts);
     grad_q = chunk.select(2, 0);
     grad_k = chunk.select(2, 1);
     grad_v = chunk.select(2, 2);
@@ -157,9 +159,9 @@ efficient_attention_backward_ck(
     // a `torch.cat` call in the backward pass
     at::Tensor chunk;
     if (use_fp32_qkv_grad)
-      chunk = at::empty({B, N, 2, num_heads, Kv}, opts.dtype(at::kFloat));
+      chunk = at::empty({B, N, 2, Hkv, Kv}, opts.dtype(at::kFloat));
     else
-      chunk = at::empty({B, N, 2, num_heads, Kv}, opts);
+      chunk = at::empty({B, N, 2, Hkv, Kv}, opts);
     grad_k = chunk.select(2, 0);
     grad_v = chunk.select(2, 1);
 
@@ -204,18 +206,36 @@ efficient_attention_backward_ck(
     grad_bias =
         at::empty_strided(bias->sizes(), bias->strides(), bias->options());
 
+  bool is_mqa_gqa = (Hq > Hkv);
+
+  at::Tensor tmp_grad_k, tmp_grad_v;
+
+  if (is_mqa_gqa) {
+    // allocate tmp_grad_k/tmp_grad_v which will be reduce to
+    // grad_k/grad_v for returning
+    if (use_fp32_qkv_grad) {
+      tmp_grad_k = at::empty({B, N, Hq, K}, opts.dtype(at::kFloat));
+      tmp_grad_v = at::empty({B, N, Hq, Kv}, opts.dtype(at::kFloat));
+    } else {
+      tmp_grad_k = at::empty({B, N, Hq, K}, opts);
+      tmp_grad_v = at::empty({B, N, Hq, Kv}, opts);
+    }
+  }
+
   auto set_batched_backward_params = [&](BatchedBackwardParams& p) {
     p.B = B;
     p.M = M;
     p.N = N;
-    p.num_heads = num_heads;
+    p.Hq = Hq;
+    p.Hkv = Hkv;
     p.K = K;
     p.Kv = Kv;
 
     p.use_fp32_qkv_grad = use_fp32_qkv_grad;
+    p.is_mqa_gqa = is_mqa_gqa;
 
     TORCH_CHECK(p.B == logsumexp.size(0));
-    TORCH_CHECK(p.num_heads == logsumexp.size(1));
+    TORCH_CHECK(p.Hq == logsumexp.size(1));
     TORCH_CHECK(p.M == logsumexp.size(2));
 
     if (scale.has_value()) {
@@ -231,8 +251,8 @@ efficient_attention_backward_ck(
     p.out_ptr = out.data_ptr();
 
     p.grad_q_ptr = grad_q.data_ptr();
-    p.grad_k_ptr = grad_k.data_ptr();
-    p.grad_v_ptr = grad_v.data_ptr();
+    p.grad_k_ptr = is_mqa_gqa ? tmp_grad_k.data_ptr() : grad_k.data_ptr();
+    p.grad_v_ptr = is_mqa_gqa ? tmp_grad_v.data_ptr() : grad_v.data_ptr();
 
     p.q_strides = {
         static_cast<int>(query.stride(0)),
@@ -255,6 +275,19 @@ efficient_attention_backward_ck(
         static_cast<int>(out.stride(2)),
         static_cast<int>(out.stride(3))};
 
+    if (is_mqa_gqa) {
+      p.tmp_grad_k_strides = {
+          static_cast<int>(tmp_grad_k.stride(0)),
+          static_cast<int>(tmp_grad_k.stride(1)),
+          static_cast<int>(tmp_grad_k.stride(2)),
+          static_cast<int>(tmp_grad_k.stride(3))};
+      p.tmp_grad_v_strides = {
+          static_cast<int>(tmp_grad_v.stride(0)),
+          static_cast<int>(tmp_grad_v.stride(1)),
+          static_cast<int>(tmp_grad_v.stride(2)),
+          static_cast<int>(tmp_grad_v.stride(3))};
+    }
+
     if (bias.has_value()) {
       CHECK_NOSPARSE_LASTCONTIGUOUS_CUDA((*bias));
       TORCH_CHECK(bias->scalar_type() == query.scalar_type());
@@ -262,8 +295,7 @@ efficient_attention_backward_ck(
       p.has_attn_bias = true;
       p.attn_bias_ptr = bias->data_ptr();
 
-      const at::Tensor bias_4d_view =
-          get_bias_4d_view(*bias, B, num_heads, M, N);
+      const at::Tensor bias_4d_view = get_bias_4d_view(*bias, B, Hq, M, N);
 
       p.attn_bias_strides = {
           static_cast<int>(bias_4d_view.stride(0)),
@@ -294,16 +326,18 @@ efficient_attention_backward_ck(
     p.num_batches = seqstart_q->size(0) - 1;
     p.M = M;
     p.N = N;
-    p.num_heads = num_heads;
+    p.Hq = Hq;
+    p.Hkv = Hkv;
     p.K = K;
     p.Kv = Kv;
 
     p.use_fp32_qkv_grad = use_fp32_qkv_grad;
+    p.is_mqa_gqa = is_mqa_gqa;
 
     p.max_seqlen_q = *max_seqlen_q_;
 
     TORCH_CHECK(p.num_batches == logsumexp.size(0));
-    TORCH_CHECK(p.num_heads == logsumexp.size(1));
+    TORCH_CHECK(p.Hq == logsumexp.size(1));
     TORCH_CHECK(p.max_seqlen_q == logsumexp.size(2));
 
     if (scale.has_value()) {
@@ -329,13 +363,23 @@ efficient_attention_backward_ck(
         static_cast<int>(out.stride(2)),
         static_cast<int>(out.stride(3))};
 
+    if (is_mqa_gqa) {
+      p.tmp_grad_k_strides = {
+          static_cast<int>(tmp_grad_k.stride(1)),
+          static_cast<int>(tmp_grad_k.stride(2)),
+          static_cast<int>(tmp_grad_k.stride(3))};
+      p.tmp_grad_v_strides = {
+          static_cast<int>(tmp_grad_v.stride(1)),
+          static_cast<int>(tmp_grad_v.stride(2)),
+          static_cast<int>(tmp_grad_v.stride(3))};
+    };
+
     if (bias.has_value()) {
       CHECK_NOSPARSE_LASTCONTIGUOUS_CUDA((*bias));
       TORCH_CHECK(bias->scalar_type() == query.scalar_type());
 
       p.has_attn_bias = true;
-      const at::Tensor bias_4d_view =
-          get_bias_4d_view(*bias, B, num_heads, M, N);
+      const at::Tensor bias_4d_view = get_bias_4d_view(*bias, B, Hq, M, N);
       p.attn_bias_strides = {
           static_cast<int>(bias_4d_view.stride(0)),
           static_cast<int>(bias_4d_view.stride(1)),
@@ -388,8 +432,12 @@ efficient_attention_backward_ck(
     char* logsumexp_ptr = reinterpret_cast<char*>(logsumexp.data_ptr());
 
     char* grad_q_ptr = reinterpret_cast<char*>(grad_q.data_ptr());
-    char* grad_k_ptr = reinterpret_cast<char*>(grad_k.data_ptr());
-    char* grad_v_ptr = reinterpret_cast<char*>(grad_v.data_ptr());
+    char* grad_k_ptr = is_mqa_gqa
+        ? reinterpret_cast<char*>(tmp_grad_k.data_ptr())
+        : reinterpret_cast<char*>(grad_k.data_ptr());
+    char* grad_v_ptr = is_mqa_gqa
+        ? reinterpret_cast<char*>(tmp_grad_v.data_ptr())
+        : reinterpret_cast<char*>(grad_v.data_ptr());
     char* grad_bias_ptr = bias_requires_grad
         ? reinterpret_cast<char*>(grad_bias.data_ptr())
         : nullptr;
@@ -416,20 +464,33 @@ efficient_attention_backward_ck(
           static_cast<size_t>(p.host_seqstart_q[i]) * p.out_strides[0],
           out.scalar_type());
       size_t tmp_logsumexp_offset = get_size_in_bytes(
-          static_cast<size_t>(i) * p.num_heads * p.max_seqlen_q,
+          static_cast<size_t>(i) * p.Hq * p.max_seqlen_q,
           logsumexp.scalar_type());
 
+      size_t tmp_grad_k_offset = is_mqa_gqa
+          ? get_size_in_bytes(
+                static_cast<size_t>(p.host_seqstart_k[i]) *
+                    p.tmp_grad_k_strides[0],
+                tmp_grad_k.scalar_type())
+          : tmp_k_offset;
+      size_t tmp_grad_v_offset = is_mqa_gqa
+          ? get_size_in_bytes(
+                static_cast<size_t>(p.host_seqstart_k[i]) *
+                    p.tmp_grad_v_strides[0],
+                tmp_grad_v.scalar_type())
+          : tmp_v_offset;
+
       p.q_ptrs.push_back(reinterpret_cast<void*>(&q_ptr[tmp_q_offset]));
       p.grad_q_ptrs.push_back(
           reinterpret_cast<void*>(&grad_q_ptr[tmp_q_offset * multiplier]));
 
       p.k_ptrs.push_back(reinterpret_cast<void*>(&k_ptr[tmp_k_offset]));
       p.grad_k_ptrs.push_back(
-          reinterpret_cast<void*>(&grad_k_ptr[tmp_k_offset * multiplier]));
+          reinterpret_cast<void*>(&grad_k_ptr[tmp_grad_k_offset * multiplier]));
 
       p.v_ptrs.push_back(reinterpret_cast<void*>(&v_ptr[tmp_v_offset]));
       p.grad_v_ptrs.push_back(
-          reinterpret_cast<void*>(&grad_v_ptr[tmp_v_offset * multiplier]));
+          reinterpret_cast<void*>(&grad_v_ptr[tmp_grad_v_offset * multiplier]));
 
       p.out_ptrs.push_back(reinterpret_cast<void*>(&out_ptr[tmp_o_offset]));
       p.grad_out_ptrs.push_back(
@@ -485,6 +546,13 @@ efficient_attention_backward_ck(
       throw std::runtime_error("input data-type is not supported");
   }
 
+  if (is_mqa_gqa) {
+    auto tmp_grad_k_view = tmp_grad_k.unflatten(2, {Hkv, Hq / Hkv});
+    auto tmp_grad_v_view = tmp_grad_v.unflatten(2, {Hkv, Hq / Hkv});
+    grad_k = tmp_grad_k_view.sum(3);
+    grad_v = tmp_grad_v_view.sum(3);
+  }
+
   return std::make_tuple(grad_q, grad_k, grad_v, grad_bias);
 #endif
 } // namespace
diff --git a/xformers/csrc/attention/hip_fmha/attention_forward_generic.cpp b/xformers/csrc/attention/hip_fmha/attention_forward_generic.cpp
index ecd50db2e..aaafa1b3b 100644
--- a/xformers/csrc/attention/hip_fmha/attention_forward_generic.cpp
+++ b/xformers/csrc/attention/hip_fmha/attention_forward_generic.cpp
@@ -44,10 +44,10 @@ namespace {
 */
 std::tuple<at::Tensor, at::Tensor, int64_t, int64_t>
 efficient_attention_forward_ck(
-    const at::Tensor& query, // [b, seqlen, num_heads, K]
-    const at::Tensor& key, // [b, seqlen, num_heads, K]
-    const at::Tensor& value, // [b, seqlen, num_heads, Kv]
-    const c10::optional<at::Tensor>& bias, // [b, num_heads, seqlen, seqlen]
+    const at::Tensor& query, // [b, seqlen, num_heads_q, K]
+    const at::Tensor& key, // [b, seqlen, num_heads_kv, K]
+    const at::Tensor& value, // [b, seqlen, num_heads_kv, Kv]
+    const c10::optional<at::Tensor>& bias, // [b, num_heads_q, seqlen, seqlen]
     // (Mode 1MHK only) [b+1]: cu_seqlens_q[b] contains the
     // position of the first query token for batch $b
     const c10::optional<at::Tensor>& seqstart_q,
@@ -73,8 +73,8 @@ efficient_attention_forward_ck(
   TORCH_CHECK(key.size(1) == value.size(1));
 
   // Num heads
-  TORCH_CHECK(query.size(2) == key.size(2));
-  TORCH_CHECK(query.size(2) == value.size(2));
+  TORCH_CHECK(query.size(2) % key.size(2) == 0);
+  TORCH_CHECK(key.size(2) == value.size(2));
 
   // Embedding per head
   TORCH_CHECK(query.size(3) == key.size(3));
@@ -105,7 +105,8 @@ efficient_attention_forward_ck(
   int64_t B = query.size(0);
   int64_t M = query.size(1);
   int64_t N = key.size(1);
-  int64_t num_heads = query.size(-2);
+  int64_t Hq = query.size(-2);
+  int64_t Hkv = key.size(-2);
   int64_t K = query.size(-1);
   int64_t Kv = value.size(-1);
 
@@ -113,7 +114,7 @@ efficient_attention_forward_ck(
 
   at::Tensor logsumexp;
 
-  at::Tensor out = at::empty({B, M, num_heads, Kv}, opts);
+  at::Tensor out = at::empty({B, M, Hq, Kv}, opts);
 
   const bool use_dropout = std::fpclassify(dropout_p) != FP_ZERO;
   int64_t philox_seed;
@@ -128,7 +129,7 @@ efficient_attention_forward_ck(
     std::lock_guard<std::mutex> lock(gen->mutex_);
     // if using dropout, we produce 1 random number for each element of the
     // attention tensor
-    rng_engine_inputs = gen->philox_cuda_state(B * num_heads * M * N);
+    rng_engine_inputs = gen->philox_cuda_state(B * Hq * M * N);
 
     const auto seeds = at::cuda::philox::unpack(rng_engine_inputs);
 
@@ -140,7 +141,8 @@ efficient_attention_forward_ck(
     p.B = B;
     p.M = M;
     p.N = N;
-    p.num_heads = num_heads;
+    p.Hq = Hq;
+    p.Hkv = Hkv;
     p.K = K;
     p.Kv = Kv;
 
@@ -184,7 +186,7 @@ efficient_attention_forward_ck(
       p.attn_bias_ptr = bias->data_ptr();
 
       const at::Tensor bias_4d_view =
-          get_bias_4d_view(*bias, B, num_heads, M, N);
+          get_bias_4d_view(*bias, B, Hq, M, N);
       p.attn_bias_strides = {
           static_cast<int>(bias_4d_view.stride(0)),
           static_cast<int>(bias_4d_view.stride(1)),
@@ -207,7 +209,7 @@ efficient_attention_forward_ck(
       p.dropout_prob = 0.0f;
 
     if (p.compute_logsumexp) {
-      logsumexp = at::empty({B, num_heads, M}, opts.dtype(at::kFloat));
+      logsumexp = at::empty({B, Hq, M}, opts.dtype(at::kFloat));
       p.logsumexp_ptr = logsumexp.data_ptr();
     } else
       p.logsumexp_ptr = nullptr;
@@ -217,7 +219,8 @@ efficient_attention_forward_ck(
     p.num_batches = seqstart_q->size(0) - 1;
     p.M = M;
     p.N = N;
-    p.num_heads = num_heads;
+    p.Hq = Hq;
+    p.Hkv = Hkv;
     p.K = K;
     p.Kv = Kv;
 
@@ -250,7 +253,7 @@ efficient_attention_forward_ck(
 
       p.has_attn_bias = true;
       const at::Tensor bias_4d_view =
-          get_bias_4d_view(*bias, B, num_heads, M, N);
+          get_bias_4d_view(*bias, B, Hq, M, N);
       p.attn_bias_strides = {
           static_cast<int>(bias_4d_view.stride(0)),
           static_cast<int>(bias_4d_view.stride(1)),
@@ -343,12 +346,12 @@ efficient_attention_forward_ck(
 
     if (p.compute_logsumexp) {
       logsumexp = at::empty(
-          {p.num_batches, num_heads, p.max_seqlen_q}, opts.dtype(at::kFloat));
+          {p.num_batches, Hq, p.max_seqlen_q}, opts.dtype(at::kFloat));
       char* logsumexp_ptr = reinterpret_cast<char*>(logsumexp.data_ptr());
 
       for (int i = 0; i < p.num_batches; i++) {
         size_t tmp_logsumexp_offset = get_size_in_bytes(
-            static_cast<size_t>(i) * num_heads * p.max_seqlen_q,
+            static_cast<size_t>(i) * Hq * p.max_seqlen_q,
             logsumexp.scalar_type());
         p.logsumexp_ptrs.push_back(
             reinterpret_cast<void*>(&logsumexp_ptr[tmp_logsumexp_offset]));
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_backward.h b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_backward.h
index 9fd8e06e0..9de59b5bd 100644
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_backward.h
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_backward.h
@@ -379,7 +379,7 @@ struct batched_backward_masktype_attnbias_dispatched {
       BatchedBackwardParams& param,
       hipStream_t stream) {
     std::vector<ck::index_t> q_gs_ms_ks_lengths{
-        param.B, param.num_heads, param.M, param.K};
+        param.B, param.Hq, param.M, param.K};
     std::vector<ck::index_t> q_gs_ms_ks_strides{
         param.q_strides[0],
         param.q_strides[2],
@@ -387,45 +387,52 @@ struct batched_backward_masktype_attnbias_dispatched {
         param.q_strides[3]};
 
     std::vector<ck::index_t> k_gs_ns_ks_lengths{
-        param.B, param.num_heads, param.N, param.K};
+        param.B, param.Hkv, param.N, param.K};
     std::vector<ck::index_t> k_gs_ns_ks_strides{
         param.k_strides[0],
         param.k_strides[2],
         param.k_strides[1],
         param.k_strides[3]};
 
-    // ToDo: support multi-query and group-query attention
-    std::vector<ck::index_t> kgrad_gs_ns_ks_lengths = k_gs_ns_ks_lengths;
-    std::vector<ck::index_t> kgrad_gs_ns_ks_strides = k_gs_ns_ks_strides;
+    std::vector<ck::index_t> kgrad_gs_ns_ks_lengths = {
+        param.B, param.Hq, param.N, param.K};
+    std::vector<ck::index_t> kgrad_gs_ns_ks_strides = {
+        param.tmp_grad_k_strides[0],
+        param.tmp_grad_k_strides[2],
+        param.tmp_grad_k_strides[1],
+        param.tmp_grad_k_strides[3]};
 
     std::vector<ck::index_t> v_gs_os_ns_lengths{
-        param.B, param.num_heads, param.Kv, param.N};
+        param.B, param.Hkv, param.Kv, param.N};
     std::vector<ck::index_t> v_gs_os_ns_strides{
         param.v_strides[0],
         param.v_strides[2],
         param.v_strides[3],
         param.v_strides[1]};
 
-    // ToDo: support multi-query and group-query attention
-    std::vector<ck::index_t> vgrad_gs_os_ns_lengths = v_gs_os_ns_lengths;
-    std::vector<ck::index_t> vgrad_gs_os_ns_strides = v_gs_os_ns_strides;
+    std::vector<ck::index_t> vgrad_gs_os_ns_lengths = {
+        param.B, param.Hq, param.Kv, param.N};
+    std::vector<ck::index_t> vgrad_gs_os_ns_strides = {
+        param.tmp_grad_v_strides[0],
+        param.tmp_grad_v_strides[2],
+        param.tmp_grad_v_strides[3],
+        param.tmp_grad_v_strides[1]};
 
     std::vector<ck::index_t> y_gs_ms_os_lengths{
-        param.B, param.num_heads, param.M, param.Kv};
+        param.B, param.Hq, param.M, param.Kv};
     std::vector<ck::index_t> y_gs_ms_os_strides{
         param.out_strides[0],
         param.out_strides[2],
         param.out_strides[1],
         param.out_strides[3]};
 
-    std::vector<ck::index_t> lse_gs_ms_lengths{
-        param.B, param.num_heads, param.M};
+    std::vector<ck::index_t> lse_gs_ms_lengths{param.B, param.Hq, param.M};
 
     std::vector<ck::index_t> d_gs_ms_ns_lengths;
     std::vector<ck::index_t> d_gs_ms_ns_strides;
 
     if constexpr (has_attn_bias) {
-      d_gs_ms_ns_lengths = {param.B, param.num_heads, param.M, param.N};
+      d_gs_ms_ns_lengths = {param.B, param.Hq, param.M, param.N};
       d_gs_ms_ns_strides = {
           param.attn_bias_strides[0],
           param.attn_bias_strides[1],
@@ -467,10 +474,10 @@ struct batched_backward_masktype_attnbias_dispatched {
         y_gs_ms_os_lengths, // y, dY should have same shape
         y_gs_ms_os_strides,
         lse_gs_ms_lengths,
-        kgrad_gs_ns_ks_lengths,
-        kgrad_gs_ns_ks_strides,
-        vgrad_gs_os_ns_lengths,
-        vgrad_gs_os_ns_strides,
+        param.is_mqa_gqa ? kgrad_gs_ns_ks_lengths : k_gs_ns_ks_lengths,
+        param.is_mqa_gqa ? kgrad_gs_ns_ks_strides : k_gs_ns_ks_strides,
+        param.is_mqa_gqa ? vgrad_gs_os_ns_lengths : v_gs_os_ns_lengths,
+        param.is_mqa_gqa ? vgrad_gs_os_ns_strides : v_gs_os_ns_strides,
         d_gs_ms_ns_lengths, // bias, grad_bias should have same shape
         d_gs_ms_ns_strides,
         {}, // acc1_biases_gs_ms_os_lengths
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_forward.h b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_forward.h
index 34f748aa7..b73271ada 100644
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_forward.h
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_forward.h
@@ -260,7 +260,7 @@ struct batched_forward_masktype_attnbias_dispatched {
   template <typename DeviceOpInstance>
   static void RunWithDeviceOp(BatchedForwardParams& param, hipStream_t stream) {
     std::vector<ck::index_t> a_gs_ms_ks_lengths{
-        param.B, param.num_heads, param.M, param.K};
+        param.B, param.Hq, param.M, param.K};
     std::vector<ck::index_t> a_gs_ms_ks_strides{
         param.q_strides[0],
         param.q_strides[2],
@@ -268,7 +268,7 @@ struct batched_forward_masktype_attnbias_dispatched {
         param.q_strides[3]};
 
     std::vector<ck::index_t> b0_gs_ns_ks_lengths{
-        param.B, param.num_heads, param.N, param.K};
+        param.B, param.Hkv, param.N, param.K};
     std::vector<ck::index_t> b0_gs_ns_ks_strides{
         param.k_strides[0],
         param.k_strides[2],
@@ -277,7 +277,7 @@ struct batched_forward_masktype_attnbias_dispatched {
 
     // to be changed to b1_gs_ns_os_lengths
     std::vector<ck::index_t> b1_gs_os_ns_lengths{
-        param.B, param.num_heads, param.Kv, param.N};
+        param.B, param.Hkv, param.Kv, param.N};
     std::vector<ck::index_t> b1_gs_os_ns_strides{
         param.v_strides[0],
         param.v_strides[2],
@@ -285,21 +285,20 @@ struct batched_forward_masktype_attnbias_dispatched {
         param.v_strides[1]};
 
     std::vector<ck::index_t> c_gs_ms_os_lengths{
-        param.B, param.num_heads, param.M, param.Kv};
+        param.B, param.Hq, param.M, param.Kv};
     std::vector<ck::index_t> c_gs_ms_os_strides{
         param.out_strides[0],
         param.out_strides[2],
         param.out_strides[1],
         param.out_strides[3]};
 
-    std::vector<ck::index_t> lse_gs_ms_lengths{
-        param.B, param.num_heads, param.M};
+    std::vector<ck::index_t> lse_gs_ms_lengths{param.B, param.Hq, param.M};
 
     std::vector<ck::index_t> d_gs_ms_ns_lengths;
     std::vector<ck::index_t> d_gs_ms_ns_strides;
 
     if constexpr (has_attn_bias) {
-      d_gs_ms_ns_lengths = {param.B, param.num_heads, param.M, param.N};
+      d_gs_ms_ns_lengths = {param.B, param.Hq, param.M, param.N};
       d_gs_ms_ns_strides = {
           param.attn_bias_strides[0],
           param.attn_bias_strides[1],
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_infer.h b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_infer.h
index 639d333c5..adf04e82a 100644
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_infer.h
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_infer.h
@@ -240,7 +240,7 @@ struct batched_infer_masktype_attnbias_dispatched {
   template <typename DeviceOpInstance>
   static void RunWithDeviceOp(BatchedForwardParams& param, hipStream_t stream) {
     std::vector<ck::index_t> a_gs_ms_ks_lengths{
-        param.B, param.num_heads, param.M, param.K};
+        param.B, param.Hq, param.M, param.K};
     std::vector<ck::index_t> a_gs_ms_ks_strides{
         param.q_strides[0],
         param.q_strides[2],
@@ -248,7 +248,7 @@ struct batched_infer_masktype_attnbias_dispatched {
         param.q_strides[3]};
 
     std::vector<ck::index_t> b0_gs_ns_ks_lengths{
-        param.B, param.num_heads, param.N, param.K};
+        param.B, param.Hkv, param.N, param.K};
     std::vector<ck::index_t> b0_gs_ns_ks_strides{
         param.k_strides[0],
         param.k_strides[2],
@@ -257,7 +257,7 @@ struct batched_infer_masktype_attnbias_dispatched {
 
     // to be changed to b1_gs_ns_os_lengths
     std::vector<ck::index_t> b1_gs_os_ns_lengths{
-        param.B, param.num_heads, param.Kv, param.N};
+        param.B, param.Hkv, param.Kv, param.N};
     std::vector<ck::index_t> b1_gs_os_ns_strides{
         param.v_strides[0],
         param.v_strides[2],
@@ -265,21 +265,20 @@ struct batched_infer_masktype_attnbias_dispatched {
         param.v_strides[1]};
 
     std::vector<ck::index_t> c_gs_ms_os_lengths{
-        param.B, param.num_heads, param.M, param.Kv};
+        param.B, param.Hq, param.M, param.Kv};
     std::vector<ck::index_t> c_gs_ms_os_strides{
         param.out_strides[0],
         param.out_strides[2],
         param.out_strides[1],
         param.out_strides[3]};
 
-    std::vector<ck::index_t> lse_gs_ms_lengths{
-        param.B, param.num_heads, param.M};
+    std::vector<ck::index_t> lse_gs_ms_lengths{param.B, param.Hq, param.M};
 
     std::vector<ck::index_t> d_gs_ms_ns_lengths;
     std::vector<ck::index_t> d_gs_ms_ns_strides;
 
     if constexpr (has_attn_bias) {
-      d_gs_ms_ns_lengths = {param.B, param.num_heads, param.M, param.N};
+      d_gs_ms_ns_lengths = {param.B, param.Hq, param.M, param.N};
       d_gs_ms_ns_strides = {
           param.attn_bias_strides[0],
           param.attn_bias_strides[1],
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_backward.h b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_backward.h
index 85f97931f..b3d5d917f 100644
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_backward.h
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_backward.h
@@ -381,41 +381,48 @@ struct grouped_backward_masktype_attnbias_dispatched {
           : param.host_seqlen_k[i];
       int K = param.K;
       int Kv = param.Kv;
-      int G1 = param.num_heads;
+      int G1q = param.Hq;
+      int G1kv = param.Hkv;
 
-      std::vector<ck::index_t> q_gs_ms_ks_lengths{1, G1, M, K};
+      std::vector<ck::index_t> q_gs_ms_ks_lengths{1, G1q, M, K};
       std::vector<ck::index_t> q_gs_ms_ks_strides{
           0, param.q_strides[1], param.q_strides[0], param.q_strides[2]};
 
-      std::vector<ck::index_t> k_gs_ns_ks_lengths{1, G1, N, K};
+      std::vector<ck::index_t> k_gs_ns_ks_lengths{1, G1kv, N, K};
       std::vector<ck::index_t> k_gs_ns_ks_strides{
           0, param.k_strides[1], param.k_strides[0], param.k_strides[2]};
 
-      // ToDo: support multi-query and group-query attention
-      std::vector<ck::index_t> kgrad_gs_ns_ks_lengths = k_gs_ns_ks_lengths;
-      std::vector<ck::index_t> kgrad_gs_ns_ks_strides = k_gs_ns_ks_strides;
+      std::vector<ck::index_t> kgrad_gs_ns_ks_lengths = {1, G1q, N, K};
+      std::vector<ck::index_t> kgrad_gs_ns_ks_strides = {
+          0,
+          param.tmp_grad_k_strides[1],
+          param.tmp_grad_k_strides[0],
+          param.tmp_grad_k_strides[2]};
 
       // to be changed to v_gs_ns_os_lengths
-      std::vector<ck::index_t> v_gs_os_ns_lengths{1, G1, Kv, N};
+      std::vector<ck::index_t> v_gs_os_ns_lengths{1, G1kv, Kv, N};
       std::vector<ck::index_t> v_gs_os_ns_strides{
           0, param.v_strides[1], param.v_strides[2], param.v_strides[0]};
 
-      // ToDo: support multi-query and group-query attention
-      std::vector<ck::index_t> vgrad_gs_os_ns_lengths = v_gs_os_ns_lengths;
-      std::vector<ck::index_t> vgrad_gs_os_ns_strides = v_gs_os_ns_strides;
+      std::vector<ck::index_t> vgrad_gs_os_ns_lengths = {1, G1q, Kv, N};
+      std::vector<ck::index_t> vgrad_gs_os_ns_strides = {
+          0,
+          param.tmp_grad_v_strides[1],
+          param.tmp_grad_v_strides[2],
+          param.tmp_grad_v_strides[0]};
 
-      std::vector<ck::index_t> y_gs_ms_os_lengths{1, G1, M, Kv};
+      std::vector<ck::index_t> y_gs_ms_os_lengths{1, G1q, M, Kv};
       std::vector<ck::index_t> y_gs_ms_os_strides{
           0, param.out_strides[1], param.out_strides[0], param.out_strides[2]};
 
-      std::vector<ck::index_t> lse_gs_ms_lengths{1, G1, M};
+      std::vector<ck::index_t> lse_gs_ms_lengths{1, G1q, M};
       std::vector<ck::index_t> lse_gs_ms_strides{0, param.max_seqlen_q, 1};
 
       std::vector<ck::index_t> d_gs_ms_ns_lengths;
       std::vector<ck::index_t> d_gs_ms_ns_strides;
 
       if constexpr (has_attn_bias) {
-        d_gs_ms_ns_lengths = {1, G1, M, N};
+        d_gs_ms_ns_lengths = {1, G1q, M, N};
         d_gs_ms_ns_strides = {
             0,
             param.attn_bias_strides[0],
@@ -440,10 +447,10 @@ struct grouped_backward_masktype_attnbias_dispatched {
           y_gs_ms_os_strides,
           lse_gs_ms_lengths,
           lse_gs_ms_strides,
-          kgrad_gs_ns_ks_lengths,
-          kgrad_gs_ns_ks_strides,
-          vgrad_gs_os_ns_lengths,
-          vgrad_gs_os_ns_strides,
+          param.is_mqa_gqa ? kgrad_gs_ns_ks_lengths : k_gs_ns_ks_lengths,
+          param.is_mqa_gqa ? kgrad_gs_ns_ks_strides : k_gs_ns_ks_strides,
+          param.is_mqa_gqa ? vgrad_gs_os_ns_lengths : v_gs_os_ns_lengths,
+          param.is_mqa_gqa ? vgrad_gs_os_ns_strides : v_gs_os_ns_strides,
           d_gs_ms_ns_lengths, // bias, grad_bias should have same shape
           d_gs_ms_ns_strides,
           {}, // acc1_biases_gs_ms_os_lengths
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_forward.h b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_forward.h
index 9f22b7e28..3fda4797b 100644
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_forward.h
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_forward.h
@@ -253,33 +253,34 @@ struct grouped_forward_masktype_attnbias_dispatched {
           : param.host_seqlen_k[i];
       int K = param.K;
       int Kv = param.Kv;
-      int G1 = param.num_heads;
+      int G1q = param.Hq;
+      int G1kv = param.Hkv;
 
-      std::vector<ck::index_t> a_gs_ms_ks_lengths{1, G1, M, K};
+      std::vector<ck::index_t> a_gs_ms_ks_lengths{1, G1q, M, K};
       std::vector<ck::index_t> a_gs_ms_ks_strides{
           0, param.q_strides[1], param.q_strides[0], param.q_strides[2]};
 
-      std::vector<ck::index_t> b0_gs_ns_ks_lengths{1, G1, N, K};
+      std::vector<ck::index_t> b0_gs_ns_ks_lengths{1, G1kv, N, K};
       std::vector<ck::index_t> b0_gs_ns_ks_strides{
           0, param.k_strides[1], param.k_strides[0], param.k_strides[2]};
 
       // to be changed to b1_gs_ns_os_lengths
-      std::vector<ck::index_t> b1_gs_os_ns_lengths{1, G1, Kv, N};
+      std::vector<ck::index_t> b1_gs_os_ns_lengths{1, G1kv, Kv, N};
       std::vector<ck::index_t> b1_gs_os_ns_strides{
           0, param.v_strides[1], param.v_strides[2], param.v_strides[0]};
 
-      std::vector<ck::index_t> c_gs_ms_os_lengths{1, G1, M, Kv};
+      std::vector<ck::index_t> c_gs_ms_os_lengths{1, G1q, M, Kv};
       std::vector<ck::index_t> c_gs_ms_os_strides{
           0, param.out_strides[1], param.out_strides[0], param.out_strides[2]};
 
-      std::vector<ck::index_t> lse_gs_ms_lengths{1, G1, M};
+      std::vector<ck::index_t> lse_gs_ms_lengths{1, G1q, M};
       std::vector<ck::index_t> lse_gs_ms_strides{0, param.max_seqlen_q, 1};
 
       std::vector<ck::index_t> d_gs_ms_ns_lengths;
       std::vector<ck::index_t> d_gs_ms_ns_strides;
 
       if constexpr (has_attn_bias) {
-        d_gs_ms_ns_lengths = {1, G1, M, N};
+        d_gs_ms_ns_lengths = {1, G1q, M, N};
         d_gs_ms_ns_strides = {
             0,
             param.attn_bias_strides[0],
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_infer.h b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_infer.h
index dba421a7b..1b907d370 100644
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_infer.h
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_infer.h
@@ -248,22 +248,23 @@ struct grouped_infer_masktype_attnbias_dispatched {
           : param.host_seqlen_k[i];
       int K = param.K;
       int Kv = param.Kv;
-      int G1 = param.num_heads;
+      int G1q = param.Hq;
+      int G1kv = param.Hkv;
 
-      std::vector<ck::index_t> a_gs_ms_ks_lengths{1, G1, M, K};
+      std::vector<ck::index_t> a_gs_ms_ks_lengths{1, G1q, M, K};
       std::vector<ck::index_t> a_gs_ms_ks_strides{
           0, param.q_strides[1], param.q_strides[0], param.q_strides[2]};
 
-      std::vector<ck::index_t> b0_gs_ns_ks_lengths{1, G1, N, K};
+      std::vector<ck::index_t> b0_gs_ns_ks_lengths{1, G1kv, N, K};
       std::vector<ck::index_t> b0_gs_ns_ks_strides{
           0, param.k_strides[1], param.k_strides[0], param.k_strides[2]};
 
       // to be changed to b1_gs_ns_os_lengths
-      std::vector<ck::index_t> b1_gs_os_ns_lengths{1, G1, Kv, N};
+      std::vector<ck::index_t> b1_gs_os_ns_lengths{1, G1kv, Kv, N};
       std::vector<ck::index_t> b1_gs_os_ns_strides{
           0, param.v_strides[1], param.v_strides[2], param.v_strides[0]};
 
-      std::vector<ck::index_t> c_gs_ms_os_lengths{1, G1, M, Kv};
+      std::vector<ck::index_t> c_gs_ms_os_lengths{1, G1q, M, Kv};
       std::vector<ck::index_t> c_gs_ms_os_strides{
           0, param.out_strides[1], param.out_strides[0], param.out_strides[2]};
 
@@ -271,7 +272,7 @@ struct grouped_infer_masktype_attnbias_dispatched {
       std::vector<ck::index_t> d_gs_ms_ns_strides;
 
       if constexpr (has_attn_bias) {
-        d_gs_ms_ns_lengths = {1, G1, M, N};
+        d_gs_ms_ns_lengths = {1, G1q, M, N};
         d_gs_ms_ns_strides = {
             0,
             param.attn_bias_strides[0],
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_params.h b/xformers/csrc/attention/hip_fmha/ck_fmha_params.h
index 2778da001..7f86dd904 100644
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_params.h
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_params.h
@@ -7,7 +7,8 @@ struct BatchedInferParams {
   int B; // batch size
   int M; // seq_len for Query
   int N; // seq_len for Key and Value
-  int num_heads; //
+  int Hq; // number of heads for Query
+  int Hkv; // number of heads for Key and Value
   int K; // embed_dim for Query and Key
   int Kv; // embed_dim for Value
 
@@ -47,7 +48,8 @@ struct GroupedInferParams {
   int num_batches;
   int M; // total seq_len for all queries in the batch
   int N; // total seq_len for all keys/values in the batch
-  int num_heads; //
+  int Hq; // number of heads for Query
+  int Hkv; // number of heads for Key and Value
   int K; // embed_dim for Query and Key
   int Kv; // embed_dim for Value
 
@@ -97,7 +99,8 @@ struct BatchedBackwardParams {
   int B; // batch size
   int M; // seq_len for Query
   int N; // seq_len for Key and Value
-  int num_heads; //
+  int Hq; // number of heads for Query
+  int Hkv; // number of heads for Key and Value
   int K; // embed_dim for Query and Key
   int Kv; // embed_dim for Value
 
@@ -106,6 +109,7 @@ struct BatchedBackwardParams {
   bool bias_has_grad;
 
   bool use_fp32_qkv_grad;
+  bool is_mqa_gqa;
 
   // BMHK mode strides, last-dim contiguous
   std::array<int, 4> q_strides;
@@ -114,6 +118,9 @@ struct BatchedBackwardParams {
   std::array<int, 4> attn_bias_strides; // 4d tensor_view [B, H, M, N]
   std::array<int, 4> out_strides;
 
+  std::array<int, 4> tmp_grad_k_strides;
+  std::array<int, 4> tmp_grad_v_strides;
+
   const void* q_ptr;
   const void* k_ptr;
   const void* v_ptr;
@@ -140,7 +147,8 @@ struct GroupedBackwardParams {
   int num_batches;
   int M; // total seq_len for all queries in the batch
   int N; // total seq_len for all keys/values in the batch
-  int num_heads; //
+  int Hq; // number of heads for Query
+  int Hkv; // number of heads for Key and Value
   int K; // embed_dim for Query and Key
   int Kv; // embed_dim for Value
 
@@ -155,6 +163,7 @@ struct GroupedBackwardParams {
   bool bias_has_grad;
 
   bool use_fp32_qkv_grad;
+  bool is_mqa_gqa;
 
   // MHK mode strides, last-dim contiguous
   std::array<int, 3> q_strides;
@@ -164,6 +173,9 @@ struct GroupedBackwardParams {
   // 4d tensor view [B, H, M, N]
   std::array<int, 4> attn_bias_strides;
 
+  std::array<int, 3> tmp_grad_k_strides;
+  std::array<int, 3> tmp_grad_v_strides;
+
   std::vector<const void*> q_ptrs;
   std::vector<const void*> k_ptrs;
   std::vector<const void*> v_ptrs;

From 50b829e8c07378e2d8e56c79c2747ae4341c26e8 Mon Sep 17 00:00:00 2001
From: Qianfeng Zhang <Qianfeng.Zhang@amd.com>
Date: Sun, 29 Oct 2023 19:09:30 +0000
Subject: [PATCH 128/641] [Performance] update to the infer gemm constants

---
 .../hip_fmha/ck_fmha_infer_gemm_constants.h   | 28 ++++++++-----------
 1 file changed, 12 insertions(+), 16 deletions(-)

diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_infer_gemm_constants.h b/xformers/csrc/attention/hip_fmha/ck_fmha_infer_gemm_constants.h
index b80dc9412..fbebac6f1 100644
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_infer_gemm_constants.h
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_infer_gemm_constants.h
@@ -13,27 +13,27 @@ struct GemmOpConstantsBatchedInfer {
   static constexpr ck::index_t KPerBlock = 32;
   // static constexpr ck::index_t Gemm1NPerBlock;
   static constexpr ck::index_t Gemm1KPerBlock = 32;
-  static constexpr ck::index_t AK1 = 8;
-  static constexpr ck::index_t BK1 = 8;
+  static constexpr ck::index_t AK1 = 4;
+  static constexpr ck::index_t BK1 = 4;
   static constexpr ck::index_t B1K1 = 2;
   static constexpr ck::index_t MPerXDL = 32;
   static constexpr ck::index_t NPerXDL = 32;
   static constexpr ck::index_t MXdlPerWave = 1;
   static constexpr ck::index_t NXdlPerWave = 4;
   // static constexpr ck::index_t Gemm1NXdlPerWave;
-  using ABlockTransferThreadClusterLengths_AK0_M_AK1 = S<4, 64, 1>;
+  using ABlockTransferThreadClusterLengths_AK0_M_AK1 = S<8, 32, 1>;
   using ABlockTransferThreadClusterArrangeOrder = S<1, 0, 2>;
   using ABlockTransferSrcAccessOrder = S<1, 0, 2>;
   static constexpr ck::index_t ABlockTransferSrcVectorDim = 2;
   // static constexpr ck::index_t ABlockTransferSrcScalarPerVector;
-  static constexpr ck::index_t ABlockTransferDstScalarPerVector_AK1 = 8;
+  static constexpr ck::index_t ABlockTransferDstScalarPerVector_AK1 = 4;
   static constexpr bool ABlockLdsExtraM = true;
-  using BBlockTransferThreadClusterLengths_BK0_N_BK1 = S<4, 64, 1>;
+  using BBlockTransferThreadClusterLengths_BK0_N_BK1 = S<8, 32, 1>;
   using BBlockTransferThreadClusterArrangeOrder = S<1, 0, 2>;
   using BBlockTransferSrcAccessOrder = S<1, 0, 2>;
   static constexpr ck::index_t BBlockTransferSrcVectorDim = 2;
   // static constexpr ck::index_t BBlockTransferSrcScalarPerVector;
-  static constexpr ck::index_t BBlockTransferDstScalarPerVector_BK1 = 8;
+  static constexpr ck::index_t BBlockTransferDstScalarPerVector_BK1 = 4;
   static constexpr bool BBlockLdsExtraN = true;
   // static constexpr ck::index_t Acc0BiasTransferSrcScalarPerVector;
   using B1BlockTransferThreadClusterLengths_BK0_N_BK1 = S<16, 16, 1>;
@@ -61,27 +61,27 @@ struct GemmOpConstantsGroupedInfer {
   static constexpr ck::index_t KPerBlock = 32;
   // static constexpr ck::index_t Gemm1NPerBlock;
   static constexpr ck::index_t Gemm1KPerBlock = 32;
-  static constexpr ck::index_t AK1 = 8;
-  static constexpr ck::index_t BK1 = 8;
+  static constexpr ck::index_t AK1 = 4;
+  static constexpr ck::index_t BK1 = 4;
   static constexpr ck::index_t B1K1 = 2;
   static constexpr ck::index_t MPerXDL = 32;
   static constexpr ck::index_t NPerXDL = 32;
   static constexpr ck::index_t MXdlPerWave = 1;
   static constexpr ck::index_t NXdlPerWave = 4;
   // static constexpr ck::index_t Gemm1NXdlPerWave;
-  using ABlockTransferThreadClusterLengths_AK0_M_AK1 = S<4, 64, 1>;
+  using ABlockTransferThreadClusterLengths_AK0_M_AK1 = S<8, 32, 1>;
   using ABlockTransferThreadClusterArrangeOrder = S<1, 0, 2>;
   using ABlockTransferSrcAccessOrder = S<1, 0, 2>;
   static constexpr ck::index_t ABlockTransferSrcVectorDim = 2;
   // static constexpr ck::index_t ABlockTransferSrcScalarPerVector,
-  static constexpr ck::index_t ABlockTransferDstScalarPerVector_AK1 = 8;
+  static constexpr ck::index_t ABlockTransferDstScalarPerVector_AK1 = 4;
   static constexpr bool ABlockLdsExtraM = true;
-  using BBlockTransferThreadClusterLengths_BK0_N_BK1 = S<4, 64, 1>;
+  using BBlockTransferThreadClusterLengths_BK0_N_BK1 = S<8, 32, 1>;
   using BBlockTransferThreadClusterArrangeOrder = S<1, 0, 2>;
   using BBlockTransferSrcAccessOrder = S<1, 0, 2>;
   static constexpr ck::index_t BBlockTransferSrcVectorDim = 2;
   // static constexpr ck::index_t BBlockTransferSrcScalarPerVector;
-  static constexpr ck::index_t BBlockTransferDstScalarPerVector_BK1 = 8;
+  static constexpr ck::index_t BBlockTransferDstScalarPerVector_BK1 = 4;
   static constexpr bool BBlockLdsExtraN = true;
   // static constexpr ck::index_t Acc0BiasTransferSrcScalarPerVector;
   using B1BlockTransferThreadClusterLengths_BK0_N_BK1 = S<16, 16, 1>;
@@ -98,7 +98,3 @@ struct GemmOpConstantsGroupedInfer {
   // static constexpr ck::index_t
   // CShuffleBlockTransferScalarPerVector_NPerBlock;
 };
-
-struct GemmOpConstantsForward {};
-
-struct GemmOpConstantsBackward {};

From d12d0aaa37b01d340eecbd0f4332a82bb7428d3f Mon Sep 17 00:00:00 2001
From: Qianfeng Zhang <Qianfeng.Zhang@amd.com>
Date: Sun, 29 Oct 2023 21:03:40 +0000
Subject: [PATCH 129/641] [Performance] update to the forward gemm constants

---
 .../hip_fmha/ck_fmha_batched_forward.h        |  2 +-
 .../hip_fmha/ck_fmha_forward_gemm_constants.h | 24 +++++++++----------
 .../hip_fmha/ck_fmha_grouped_forward.h        |  2 +-
 3 files changed, 14 insertions(+), 14 deletions(-)

diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_forward.h b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_forward.h
index b73271ada..4eb949b9e 100644
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_forward.h
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_forward.h
@@ -188,7 +188,7 @@ struct batched_forward_masktype_attnbias_dispatched {
         "ABlockTransfer and BBlockTransfer should use completely same K1 sizes and ThreadClusterLengths!");
 
     constexpr ck::index_t kABBlockTransferSrcScalarPerVector_max =
-        min(2, thread_slice_length_ak1);
+        min(4, thread_slice_length_ak1);
 
     BATCHED_FORWARD_HEADDIM_SWITCH(param.K, param.Kv, [&] {
       constexpr ck::index_t thread_slice_length_gemm1n = kGemm1NPerBlock /
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_forward_gemm_constants.h b/xformers/csrc/attention/hip_fmha/ck_fmha_forward_gemm_constants.h
index 992a4c4b2..69e2bc520 100644
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_forward_gemm_constants.h
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_forward_gemm_constants.h
@@ -13,8 +13,8 @@ struct GemmOpConstantsBatchedForward {
   static constexpr ck::index_t KPerBlock = 32;
   // static constexpr ck::index_t Gemm1NPerBlock;
   static constexpr ck::index_t Gemm1KPerBlock = 32;
-  static constexpr ck::index_t AK1 = 8;
-  static constexpr ck::index_t BK1 = 8;
+  static constexpr ck::index_t AK1 = 4;
+  static constexpr ck::index_t BK1 = 4;
   static constexpr ck::index_t B1K1 = 2;
   static constexpr ck::index_t MPerXDL = 32;
   static constexpr ck::index_t NPerXDL = 32;
@@ -22,19 +22,19 @@ struct GemmOpConstantsBatchedForward {
   static constexpr ck::index_t NXdlPerWave = 4;
   // static constexpr ck::index_t Gemm1NXdlPerWave;
   static constexpr ck::index_t DropoutStep = 1;
-  using ABlockTransferThreadClusterLengths_AK0_M_AK1 = S<4, 64, 1>;
+  using ABlockTransferThreadClusterLengths_AK0_M_AK1 = S<8, 32, 1>;
   using ABlockTransferThreadClusterArrangeOrder = S<1, 0, 2>;
   using ABlockTransferSrcAccessOrder = S<1, 0, 2>;
   static constexpr ck::index_t ABlockTransferSrcVectorDim = 2;
   // static constexpr ck::index_t ABlockTransferSrcScalarPerVector;
-  static constexpr ck::index_t ABlockTransferDstScalarPerVector_AK1 = 8;
+  static constexpr ck::index_t ABlockTransferDstScalarPerVector_AK1 = 4;
   static constexpr bool ABlockLdsExtraM = true;
-  using BBlockTransferThreadClusterLengths_BK0_N_BK1 = S<4, 64, 1>;
+  using BBlockTransferThreadClusterLengths_BK0_N_BK1 = S<8, 32, 1>;
   using BBlockTransferThreadClusterArrangeOrder = S<1, 0, 2>;
   using BBlockTransferSrcAccessOrder = S<1, 0, 2>;
   static constexpr ck::index_t BBlockTransferSrcVectorDim = 2;
   // static constexpr ck::index_t BBlockTransferSrcScalarPerVector;
-  static constexpr ck::index_t BBlockTransferDstScalarPerVector_BK1 = 8;
+  static constexpr ck::index_t BBlockTransferDstScalarPerVector_BK1 = 4;
   static constexpr bool BBlockLdsExtraN = true;
   // static constexpr ck::index_t Acc0BiasTransferSrcScalarPerVector;
   using B1BlockTransferThreadClusterLengths_BK0_N_BK1 = S<16, 16, 1>;
@@ -64,8 +64,8 @@ struct GemmOpConstantsGroupedForward {
   static constexpr ck::index_t KPerBlock = 32;
   // static constexpr ck::index_t Gemm1NPerBlock;
   static constexpr ck::index_t Gemm1KPerBlock = 32;
-  static constexpr ck::index_t AK1 = 8;
-  static constexpr ck::index_t BK1 = 8;
+  static constexpr ck::index_t AK1 = 4;
+  static constexpr ck::index_t BK1 = 4;
   static constexpr ck::index_t B1K1 = 2;
   static constexpr ck::index_t MPerXDL = 32;
   static constexpr ck::index_t NPerXDL = 32;
@@ -73,19 +73,19 @@ struct GemmOpConstantsGroupedForward {
   static constexpr ck::index_t NXdlPerWave = 4;
   // static constexpr ck::index_t Gemm1NXdlPerWave;
   static constexpr ck::index_t DropoutStep = 1;
-  using ABlockTransferThreadClusterLengths_AK0_M_AK1 = S<4, 64, 1>;
+  using ABlockTransferThreadClusterLengths_AK0_M_AK1 = S<8, 32, 1>;
   using ABlockTransferThreadClusterArrangeOrder = S<1, 0, 2>;
   using ABlockTransferSrcAccessOrder = S<1, 0, 2>;
   static constexpr ck::index_t ABlockTransferSrcVectorDim = 2;
   // static constexpr ck::index_t ABlockTransferSrcScalarPerVector;
-  static constexpr ck::index_t ABlockTransferDstScalarPerVector_AK1 = 8;
+  static constexpr ck::index_t ABlockTransferDstScalarPerVector_AK1 = 4;
   static constexpr bool ABlockLdsExtraM = true;
-  using BBlockTransferThreadClusterLengths_BK0_N_BK1 = S<4, 64, 1>;
+  using BBlockTransferThreadClusterLengths_BK0_N_BK1 = S<8, 32, 1>;
   using BBlockTransferThreadClusterArrangeOrder = S<1, 0, 2>;
   using BBlockTransferSrcAccessOrder = S<1, 0, 2>;
   static constexpr ck::index_t BBlockTransferSrcVectorDim = 2;
   // static constexpr ck::index_t BBlockTransferSrcScalarPerVector;
-  static constexpr ck::index_t BBlockTransferDstScalarPerVector_BK1 = 8;
+  static constexpr ck::index_t BBlockTransferDstScalarPerVector_BK1 = 4;
   static constexpr bool BBlockLdsExtraN = true;
   // static constexpr ck::index_t Acc0BiasTransferSrcScalarPerVector;
   using B1BlockTransferThreadClusterLengths_BK0_N_BK1 = S<16, 16, 1>;
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_forward.h b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_forward.h
index 3fda4797b..481c1a01d 100644
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_forward.h
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_forward.h
@@ -173,7 +173,7 @@ struct grouped_forward_masktype_attnbias_dispatched {
         "ABlockTransfer and BBlockTransfer should use completely same K1 sizes and ThreadClusterLengths!");
 
     constexpr ck::index_t kABBlockTransferSrcScalarPerVector_max =
-        min(2, thread_slice_length_ak1);
+        min(4, thread_slice_length_ak1);
 
     GROUPED_FORWARD_HEADDIM_SWITCH(param.K, param.Kv, [&] {
       constexpr ck::index_t thread_slice_length_gemm1n = kGemm1NPerBlock /

From a36f81a0f9beb1961040e1e131b6574e4f9c87cf Mon Sep 17 00:00:00 2001
From: Qianfeng Zhang <Qianfeng.Zhang@amd.com>
Date: Mon, 30 Oct 2023 22:24:09 +0000
Subject: [PATCH 130/641] Update forward gemm constants and max vector-size of
 CShuffled output to reduce compiling-time

---
 xformers/csrc/attention/hip_fmha/ck_fmha_batched_forward.h    | 2 +-
 .../csrc/attention/hip_fmha/ck_fmha_forward_gemm_constants.h  | 4 ++--
 xformers/csrc/attention/hip_fmha/ck_fmha_grouped_forward.h    | 2 +-
 3 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_forward.h b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_forward.h
index 4eb949b9e..7959bb088 100644
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_forward.h
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_forward.h
@@ -205,7 +205,7 @@ struct batched_forward_masktype_attnbias_dispatched {
                   At(I3);
 
       constexpr ck::index_t kCShuffleBlockTransferScalarPerVector_max =
-          min(2, thread_slice_length_cshuflle_n);
+          min(1, thread_slice_length_cshuflle_n);
 
       if constexpr (
           kB1BlockTransferSrcScalarPerVector_max >=
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_forward_gemm_constants.h b/xformers/csrc/attention/hip_fmha/ck_fmha_forward_gemm_constants.h
index 69e2bc520..5a1790b5f 100644
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_forward_gemm_constants.h
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_forward_gemm_constants.h
@@ -47,7 +47,7 @@ struct GemmOpConstantsBatchedForward {
   static constexpr ck::index_t CShuffleMXdlPerWavePerShuffle = 1;
   // static constexpr ck::index_t CShuffleNXdlPerWavePerShuffle;
   using CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock =
-      S<1, 32, 1, 8>;
+      S<1, 16, 1, 16>;
   // static constexpr ck::index_t
   // CShuffleBlockTransferScalarPerVector_NPerBlock;
   static constexpr ck::index_t Acc1BiasTransferSrcScalarPerVector =
@@ -98,7 +98,7 @@ struct GemmOpConstantsGroupedForward {
   static constexpr ck::index_t CShuffleMXdlPerWavePerShuffle = 1;
   // static constexpr ck::index_t CShuffleNXdlPerWavePerShuffle;
   using CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock =
-      S<1, 32, 1, 8>;
+      S<1, 16, 1, 16>;
   // static constexpr ck::index_t
   // CShuffleBlockTransferScalarPerVector_NPerBlock;
   static constexpr ck::index_t Acc1BiasTransferSrcScalarPerVector =
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_forward.h b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_forward.h
index 481c1a01d..3e388414b 100644
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_forward.h
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_forward.h
@@ -190,7 +190,7 @@ struct grouped_forward_masktype_attnbias_dispatched {
                   At(I3);
 
       constexpr ck::index_t kCShuffleBlockTransferScalarPerVector_max =
-          min(2, thread_slice_length_cshuflle_n);
+          min(1, thread_slice_length_cshuflle_n);
 
       if constexpr (
           kB1BlockTransferSrcScalarPerVector_max >=

From 027c10eb00284954e4bd93b1c9674fd46218b9b2 Mon Sep 17 00:00:00 2001
From: Qianfeng Zhang <Qianfeng.Zhang@amd.com>
Date: Mon, 30 Oct 2023 23:03:00 +0000
Subject: [PATCH 131/641] [Performance] tiny adjustment to the infer gemm
 constants

---
 .../hip_fmha/ck_fmha_infer_gemm_constants.h      | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_infer_gemm_constants.h b/xformers/csrc/attention/hip_fmha/ck_fmha_infer_gemm_constants.h
index fbebac6f1..8f492ff00 100644
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_infer_gemm_constants.h
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_infer_gemm_constants.h
@@ -5,6 +5,7 @@
 
 // list the template parameters that will not be tuned,
 // the commented lines gives the tunable template parameters
+// clang-format off
 struct GemmOpConstantsBatchedInfer {
   static constexpr ck::index_t NumGemmKPrefetchStage = 1;
   static constexpr ck::index_t BlockSize = 256;
@@ -45,14 +46,14 @@ struct GemmOpConstantsBatchedInfer {
   static constexpr bool B1BlockLdsExtraN = false;
   static constexpr ck::index_t CShuffleMXdlPerWavePerShuffle = 1;
   // static constexpr ck::index_t CShuffleNXdlPerWavePerShuffle;
-  using CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock =
-      S<1, 32, 1, 8>;
-  // static constexpr ck::index_t
-  // CShuffleBlockTransferScalarPerVector_NPerBlock;
+  using CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock = S<1, 16, 1, 16>;
+  // static constexpr ck::index_t CShuffleBlockTransferScalarPerVector_NPerBlock;
 };
+//clang-format on
 
 // list the template parameters that will not be tuned,
 // the commented lines gives the tunable template parameters
+// clang-format off
 struct GemmOpConstantsGroupedInfer {
   static constexpr ck::index_t NumGemmKPrefetchStage = 1;
   static constexpr ck::index_t BlockSize = 256;
@@ -93,8 +94,7 @@ struct GemmOpConstantsGroupedInfer {
   static constexpr bool B1BlockLdsExtraN = false;
   static constexpr ck::index_t CShuffleMXdlPerWavePerShuffle = 1;
   // static constexpr ck::index_t CShuffleNXdlPerWavePerShuffle;
-  using CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock =
-      S<1, 32, 1, 8>;
-  // static constexpr ck::index_t
-  // CShuffleBlockTransferScalarPerVector_NPerBlock;
+  using CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock = S<1, 16, 1, 16>;
+  // static constexpr ck::index_t CShuffleBlockTransferScalarPerVector_NPerBlock;
 };
+// clang-format on

From 71e302f63f6e967f14a96e777b21b5394eed8d23 Mon Sep 17 00:00:00 2001
From: Max Podkorytov <4273004+tenpercent@users.noreply.github.com>
Date: Wed, 20 Sep 2023 14:07:41 -0400
Subject: [PATCH 132/641] update requirement for running tests

scipy.stats.binomtest needs v1.7 or newer
---
 requirements-test.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/requirements-test.txt b/requirements-test.txt
index 3d4a840a9..e077f5579 100644
--- a/requirements-test.txt
+++ b/requirements-test.txt
@@ -25,7 +25,7 @@ hydra-core >= 1.1
 
 # Dependency for Mixture of Experts
 fairscale >= 0.4.5
-scipy
+scipy >= 1.7
 
 # Dependency for fused layers, optional
 cmake

From dbd6b81b584457f586c4504b6485bbe34d19de92 Mon Sep 17 00:00:00 2001
From: Max Podkorytov <4273004+tenpercent@users.noreply.github.com>
Date: Mon, 25 Sep 2023 16:32:46 -0400
Subject: [PATCH 133/641] verbose skip reason when testing decoder

---
 tests/test_mem_eff_attention_ck.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tests/test_mem_eff_attention_ck.py b/tests/test_mem_eff_attention_ck.py
index 787c9b3f2..38ef4b389 100644
--- a/tests/test_mem_eff_attention_ck.py
+++ b/tests/test_mem_eff_attention_ck.py
@@ -1651,8 +1651,8 @@ def test_decoder(
         kv_padding=padding,
     )
     inp = fmha.Inputs(q, k, v, attn_bias=attn_bias)
-    if not op.supports(inp):
-        pytest.skip("not supported")
+    if (not_supported_reasons := op.not_supported_reasons(inp)):
+        pytest.skip(f"{not_supported_reasons=}")
 
     decoder_output = fmha.memory_efficient_attention_forward(
         q, k, v, attn_bias, op=fmha.decoder.FwOp

From 5eaa606ad91d4b3b4848b1b10f448f667d622024 Mon Sep 17 00:00:00 2001
From: Max Podkorytov <4273004+tenpercent@users.noreply.github.com>
Date: Wed, 27 Sep 2023 13:55:00 -0400
Subject: [PATCH 134/641] make another instance of case skipping verbose about
 the reasons

---
 xformers/benchmarks/benchmark_mem_eff_attn_decoder_ck.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/xformers/benchmarks/benchmark_mem_eff_attn_decoder_ck.py b/xformers/benchmarks/benchmark_mem_eff_attn_decoder_ck.py
index a44c81891..d63c79833 100644
--- a/xformers/benchmarks/benchmark_mem_eff_attn_decoder_ck.py
+++ b/xformers/benchmarks/benchmark_mem_eff_attn_decoder_ck.py
@@ -126,7 +126,8 @@ def mem_eff_attention_decoder(
     has_run = False
     for fw_op in OPS:
         inp = fmha.Inputs(q, k, v, attn_bias=bias)
-        if not fw_op.supports(inp):
+        if (skip_reasons := fw_op.not_supported_reasons(inp)):
+            print(f"Skip benchmark: {skip_reasons=}")
             continue
 
         fn = partial(xformers.ops.memory_efficient_attention_forward, op=fw_op)

From 88d631ba7d5defa93b23d1e05db0bc1daa9d686d Mon Sep 17 00:00:00 2001
From: Max Podkorytov <4273004+tenpercent@users.noreply.github.com>
Date: Wed, 27 Sep 2023 14:39:59 -0400
Subject: [PATCH 135/641] add cpp boilerplate for the decoder op

---
 tests/test_mem_eff_attention_ck.py            | 19 ++--
 xformers/csrc/attention/attention.cpp         |  8 +-
 .../hip_fmha/attention_forward_generic.cpp    | 31 +++++++
 xformers/ops/fmha/__init__.py                 |  4 +-
 xformers/ops/fmha/ck_decoder.py               | 91 +++++++++++++++++++
 5 files changed, 141 insertions(+), 12 deletions(-)
 create mode 100644 xformers/ops/fmha/ck_decoder.py

diff --git a/tests/test_mem_eff_attention_ck.py b/tests/test_mem_eff_attention_ck.py
index 38ef4b389..a3c363fe0 100644
--- a/tests/test_mem_eff_attention_ck.py
+++ b/tests/test_mem_eff_attention_ck.py
@@ -1618,7 +1618,7 @@ def test_attn_bias_padded() -> None:
     )
 
 
-@pytest.mark.parametrize("op", [fmha.decoder.FwOp])
+@pytest.mark.parametrize("op", [fmha.ck_decoder.FwOp])
 @pytest.mark.parametrize("multiquery", [True, False], ids=lambda x: "mq" if x else "")
 @pytest.mark.parametrize("n_heads", [1, 16, 32])
 @pytest.mark.parametrize("padding", [32, 4096])
@@ -1627,7 +1627,7 @@ def test_attn_bias_padded() -> None:
 def test_decoder(
     op, multiquery: bool, n_heads: int, padding: int, bsz: int, dtype: str
 ) -> None:
-    dtype_ = {"f16": torch.float16, "bf16": torch.bfloat16, "f32": torch.float32}[dtype]
+    dtype_ = {"f16": torch.float16, "bf16": torch.bfloat16, "f32": torch.float}[dtype]
     torch.manual_seed(1)
     d = 128
     k_shape = (1, bsz * padding, n_heads, d)
@@ -1655,17 +1655,16 @@ def test_decoder(
         pytest.skip(f"{not_supported_reasons=}")
 
     decoder_output = fmha.memory_efficient_attention_forward(
-        q, k, v, attn_bias, op=fmha.decoder.FwOp
+        q, k, v, attn_bias, op=op
     )
+    
+    ref_output = ref_attention(q, k, v, attn_bias)
 
-    ck_output = fmha.memory_efficient_attention_forward(
-        q, k, v, attn_bias, op=fmha.ck.FwOp
-    )
     assert_allclose(
-        decoder_output,
-        ck_output,
-        atol=fmha.ck.FwOp.ERROR_ATOL[dtype_] * 4,
-        rtol=fmha.ck.FwOp.ERROR_RTOL[dtype_],
+        decoder_output.float(),
+        ref_output,
+        atol=fmha.ck_decoder.FwOp.ERROR_ATOL[dtype_] * 4,
+        rtol=fmha.ck_decoder.FwOp.ERROR_RTOL[dtype_],
     )
 
 
diff --git a/xformers/csrc/attention/attention.cpp b/xformers/csrc/attention/attention.cpp
index 18ddcdcfc..b3fdde526 100644
--- a/xformers/csrc/attention/attention.cpp
+++ b/xformers/csrc/attention/attention.cpp
@@ -39,7 +39,13 @@ TORCH_LIBRARY_FRAGMENT(xformers, m) {
 #endif
 #if defined(USE_ROCM)
   m.def(TORCH_SELECTIVE_SCHEMA(
-      "xformers::efficient_attention_forward_ck(Tensor query, Tensor key, Tensor value, Tensor? attn_bias, Tensor? seqstart_q, Tensor? seqstart_k, int? max_seqlen_q, float dropout_p, bool compute_logsumexp, int custom_mask_type, float? scale, Tensor? seqlen_k) -> (Tensor, Tensor, int, int)"));
+      "xformers::efficient_attention_forward_ck(Tensor query, "
+          "Tensor key, Tensor value, Tensor? attn_bias, Tensor? seqstart_q, "
+          "Tensor? seqstart_k, int? max_seqlen_q, float dropout_p, "
+          "bool compute_logsumexp, int custom_mask_type, float? scale, Tensor? seqlen_k) -> (Tensor, Tensor, int, int)"));
+  m.def(TORCH_SELECTIVE_SCHEMA(
+      "xformers::efficient_attention_forward_decoder_ck(Tensor query, "
+          "Tensor key, Tensor value, Tensor seq_positions, float scale) -> Tensor"));
   m.def(TORCH_SELECTIVE_SCHEMA(
       "xformers::efficient_attention_backward_ck(Tensor grad_out, Tensor query, Tensor key, Tensor value, Tensor? attn_bias, Tensor? seqstart_q, Tensor? seqstart_k, int? max_seqlen_q, Tensor? seqlen_k, Tensor logsumexp, Tensor output, float dropout_p, int rng_seed, int rng_offset, int custom_mask_type, float? scale) -> (Tensor, Tensor, Tensor, Tensor)"));
   m.def(TORCH_SELECTIVE_SCHEMA(
diff --git a/xformers/csrc/attention/hip_fmha/attention_forward_generic.cpp b/xformers/csrc/attention/hip_fmha/attention_forward_generic.cpp
index aaafa1b3b..7a58cc931 100644
--- a/xformers/csrc/attention/hip_fmha/attention_forward_generic.cpp
+++ b/xformers/csrc/attention/hip_fmha/attention_forward_generic.cpp
@@ -408,10 +408,41 @@ efficient_attention_forward_ck(
   return std::make_tuple(out, logsumexp, philox_seed, philox_offset);
 }
 
+at::Tensor
+efficient_attention_forward_decoder_ck(
+    const at::Tensor& XQ, // [B, 1, H, D]
+    const at::Tensor& cache_K, // [B, T_MAX, H or 1, D]
+    const at::Tensor& cache_V, // [B, T_MAX, H or 1, D]
+    const at::Tensor& seq_positions, // [B]
+    double qk_scale) {
+
+  constexpr int32_t kThreadsPerWarp = 32;
+  constexpr int32_t kWarpsPerBlock = 32;
+  constexpr int32_t D_H = 128;
+  constexpr int32_t T_MAX = 8192;
+
+  at::OptionalDeviceGuard guard(XQ.device());
+  TORCH_CHECK(XQ.is_cuda());
+  TORCH_CHECK(cache_K.is_cuda());
+  TORCH_CHECK(cache_V.is_cuda());
+
+  TORCH_CHECK(seq_positions.is_cuda());
+
+  TORCH_CHECK(cache_K.size(1) <= T_MAX);
+  TORCH_CHECK(cache_K.size(3) == D_H);
+
+  auto O = at::randn_like(XQ);
+  return O;
+}  
+
 } // namespace
 
 TORCH_LIBRARY_IMPL(xformers, CUDA, m) {
   m.impl(
       TORCH_SELECTIVE_NAME("xformers::efficient_attention_forward_ck"),
       TORCH_FN(efficient_attention_forward_ck));
+
+  m.impl(
+      TORCH_SELECTIVE_NAME("xformers::efficient_attention_forward_decoder_ck"),
+      TORCH_FN(efficient_attention_forward_decoder_ck));
 }
diff --git a/xformers/ops/fmha/__init__.py b/xformers/ops/fmha/__init__.py
index 0e5cd131e..9c2733f07 100644
--- a/xformers/ops/fmha/__init__.py
+++ b/xformers/ops/fmha/__init__.py
@@ -7,7 +7,7 @@
 
 import torch
 
-from . import cutlass, decoder, flash, small_k, triton, ck
+from . import cutlass, decoder, flash, small_k, triton, ck, ck_decoder
 from .attn_bias import AttentionBias, BlockDiagonalMask, LowerTriangularMask
 from .common import (
     AttentionBwOpBase,
@@ -30,6 +30,7 @@
 MemoryEfficientAttentionOp = (small_k.FwOp, small_k.BwOp)
 TritonFlashAttentionOp = (triton.FwOp, triton.BwOp)
 MemoryEfficientAttentionCkOp = (ck.FwOp, ck.BwOp) 
+MemoryEfficientAttentionCkDecoderOp = (ck_decoder.FwOp, ck.BwOp)
 
 class _fMHA(torch.autograd.Function):
     @staticmethod
@@ -412,6 +413,7 @@ def _memory_efficient_attention_backward(
     "TritonFlashAttentionOp",
     "memory_efficient_attention",
     "MemoryEfficientAttentionCkOp",
+    "MemoryEfficientAttentionCkDecoderOp",
     "ALL_FW_OPS",
     "ALL_BW_OPS",
 ]
diff --git a/xformers/ops/fmha/ck_decoder.py b/xformers/ops/fmha/ck_decoder.py
new file mode 100644
index 000000000..1a5eba6f3
--- /dev/null
+++ b/xformers/ops/fmha/ck_decoder.py
@@ -0,0 +1,91 @@
+# TODO(max): add a proper copyright header
+import math
+import torch
+
+from typing import Any, Set, List, Tuple, Optional
+from .attn_bias import BlockDiagonalCausalWithOffsetPaddedKeysMask
+from .common import AttentionFwOpBase, Context, Inputs
+from ..common import get_xformers_operator, register_operator
+
+@register_operator
+class FwOp(AttentionFwOpBase):
+    OPERATOR = get_xformers_operator("efficient_attention_forward_decoder_ck")
+    SUPPORTED_DEVICES: Set[str] = {"cuda"}
+    SUPPORTED_DTYPES: Set[torch.dtype] = {torch.half, torch.bfloat16}
+    SUPPORTED_MAX_K: float = 128
+    SUPPORTED_ATTN_BIAS_TYPES: Set[Any] = {BlockDiagonalCausalWithOffsetPaddedKeysMask}
+    SUPPORTS_DROPOUT = False
+    SUPPORTS_CUSTOM_SCALE = True
+    NAME = "ck_decoderF"
+
+    @classmethod
+    def not_supported_reasons(cls, d: Inputs) -> List[str]:
+        reasons = super(FwOp, cls).not_supported_reasons(d)
+
+        attn_bias = d.attn_bias
+        if isinstance(attn_bias, BlockDiagonalCausalWithOffsetPaddedKeysMask):
+            # If we don't get here, we've an error elsewhere
+            if d.query.ndim != 4 or d.key.ndim != 4:
+                reasons.append("Inputs must be BMHK. BMK not supported")
+
+            if d.query.shape[0] != 1:
+                reasons.append("One formal batch element expected")
+
+            if d.query.shape[-1] != 128:
+                reasons.append("Only head_dim==128 for now.")
+
+            if d.key.stride(-1) != 1:
+                reasons.append("expect keys to have last dim contiguous")
+
+            if d.value.stride(-1) != 1:
+                reasons.append("expect values to have last dim contiguous")
+
+            q_starts = attn_bias.q_seqinfo.seqstart_py
+            if attn_bias.q_seqinfo.max_seqlen != 1:
+                reasons.append("decoding expects one query")
+            elif d.query.shape[1] != len(q_starts) - 1:
+                reasons.append("empty lanes not supported yet")
+
+            if attn_bias.k_seqinfo.padding > 8192:
+                reasons.append("key padding exceeds 8192")
+
+        return reasons
+
+    @classmethod
+    def apply(
+        cls, inp: Inputs, needs_gradient: bool
+    ) -> Tuple[torch.Tensor, Optional[Context]]:
+        if needs_gradient:
+            raise NotImplementedError("gradient")
+        attn_bias = inp.attn_bias
+        assert isinstance(attn_bias, BlockDiagonalCausalWithOffsetPaddedKeysMask)
+
+        attn_bias.k_seqinfo.to(inp.query.device)
+        attn_bias.q_seqinfo.to(inp.query.device)
+
+        padding = attn_bias.k_seqinfo.padding
+        multiquery = inp.key.stride(2) == 0
+        if multiquery:
+            key = inp.key[0, :, :1].unflatten(0, (-1, padding))
+            value = inp.value[0, :, :1].unflatten(0, (-1, padding))
+        else:
+            key = inp.key[0].unflatten(0, (-1, padding))
+            value = inp.value[0].unflatten(0, (-1, padding))
+
+        seq_positions = attn_bias.k_seqinfo.seqlen
+
+        query = inp.query[0, :, None]
+
+        if inp.scale is not None:
+            qk_scale = inp.scale
+        else:
+            qk_scale = 1.0 / math.sqrt(key.shape[-1])
+
+        out = cls.OPERATOR(
+            query=query,
+            key=key,
+            value=value,
+            seq_positions=seq_positions,
+            scale=qk_scale,
+        )
+        return out, None

From 15cff16a274252b5e142e35a14d5d77b5c6aef69 Mon Sep 17 00:00:00 2001
From: Max Podkorytov <4273004+tenpercent@users.noreply.github.com>
Date: Wed, 27 Sep 2023 18:30:34 -0400
Subject: [PATCH 136/641] add boilerplate for invoking the kernel

---
 .../hip_fmha/attention_forward_generic.cpp    | 64 ++++++++++++++++++-
 1 file changed, 61 insertions(+), 3 deletions(-)

diff --git a/xformers/csrc/attention/hip_fmha/attention_forward_generic.cpp b/xformers/csrc/attention/hip_fmha/attention_forward_generic.cpp
index 7a58cc931..e93e11010 100644
--- a/xformers/csrc/attention/hip_fmha/attention_forward_generic.cpp
+++ b/xformers/csrc/attention/hip_fmha/attention_forward_generic.cpp
@@ -408,6 +408,30 @@ efficient_attention_forward_ck(
   return std::make_tuple(out, logsumexp, philox_seed, philox_offset);
 }
 
+template<typename scalar_t>
+__global__ void
+efficient_attention_forward_decoder_ck_kernel(
+    at::PackedTensorAccessor32<scalar_t, 4, at::RestrictPtrTraits> XQ,
+    at::PackedTensorAccessor64<scalar_t, 4, at::RestrictPtrTraits> cache_K,
+    at::PackedTensorAccessor64<scalar_t, 4, at::RestrictPtrTraits> cache_V,
+    at::PackedTensorAccessor32<scalar_t, 4, at::RestrictPtrTraits> O,
+    at::PackedTensorAccessor32<int32_t, 1, at::RestrictPtrTraits> seq_positions,
+    float qk_scale
+) {
+  __syncthreads();
+}
+
+#define AT_DISPATCH_CASE_3(SCALARTYPE1, SCALARTYPE2, SCALARTYPE3, ...) \
+  AT_DISPATCH_CASE(SCALARTYPE1, __VA_ARGS__) \
+  AT_DISPATCH_CASE(SCALARTYPE2, __VA_ARGS__) \
+  AT_DISPATCH_CASE(SCALARTYPE3, __VA_ARGS__) 
+  
+#define AT_DISPATCH_SWITCH_3(SCALARTYPE1, SCALARTYPE2, SCALARTYPE3, TYPE, NAME, ...) \
+  AT_DISPATCH_SWITCH(                                               \
+      TYPE,                                                         \
+      NAME,                                                         \
+      AT_DISPATCH_CASE_3(SCALARTYPE1, SCALARTYPE2, SCALARTYPE3, __VA_ARGS__))
+
 at::Tensor
 efficient_attention_forward_decoder_ck(
     const at::Tensor& XQ, // [B, 1, H, D]
@@ -416,8 +440,8 @@ efficient_attention_forward_decoder_ck(
     const at::Tensor& seq_positions, // [B]
     double qk_scale) {
 
-  constexpr int32_t kThreadsPerWarp = 32;
-  constexpr int32_t kWarpsPerBlock = 32;
+  constexpr int32_t kThreadsPerWavefront = 32;
+  constexpr int32_t kWavefrontsPerBlock = 32;
   constexpr int32_t D_H = 128;
   constexpr int32_t T_MAX = 8192;
 
@@ -431,10 +455,44 @@ efficient_attention_forward_decoder_ck(
   TORCH_CHECK(cache_K.size(1) <= T_MAX);
   TORCH_CHECK(cache_K.size(3) == D_H);
 
-  auto O = at::randn_like(XQ);
+  auto O = at::empty_like(XQ);
+  auto B = XQ.size(0);
+  auto H = XQ.size(2);
+  dim3 blocks(B, H);
+  dim3 threads(kThreadsPerWavefront, kWavefrontsPerBlock);
+
+  int32_t smem_softmax = T_MAX * sizeof(float) + kWavefrontsPerBlock * sizeof(float);
+  int32_t smem_output = D_H * sizeof(float) * kWavefrontsPerBlock;
+  int32_t smem = max(smem_softmax, smem_output);
+  auto stream = at::cuda::getCurrentHIPStream().stream();
+
+  AT_DISPATCH_SWITCH_3(at::ScalarType::Half, at::ScalarType::BFloat16, at::ScalarType::Float, 
+    XQ.scalar_type(), "efficient_attention_forward_decoder_ck", [&] {
+      auto* kernel = &efficient_attention_forward_decoder_ck_kernel<scalar_t>;
+      if (smem > 48 * 1024) {
+        C10_CUDA_CHECK(hipFuncSetAttribute(
+            reinterpret_cast<void*&>(kernel),
+            hipFuncAttributeMaxDynamicSharedMemorySize,
+            smem));
+      }
+      kernel
+          <<<blocks, threads, smem, stream>>>(
+              XQ.packed_accessor32<scalar_t, 4, at::RestrictPtrTraits>(),
+              cache_K.packed_accessor64<scalar_t, 4, at::RestrictPtrTraits>(),
+              cache_V.packed_accessor64<scalar_t, 4, at::RestrictPtrTraits>(),
+              O.packed_accessor32<scalar_t, 4, at::RestrictPtrTraits>(),
+              seq_positions
+                  .packed_accessor32<int32_t, 1, at::RestrictPtrTraits>(),
+              qk_scale);
+      C10_CUDA_KERNEL_LAUNCH_CHECK();
+  });
+
   return O;
 }  
 
+#undef AT_DISPATCH_CASE_3
+#undef AT_DISPATCH_SWITCH_3
+
 } // namespace
 
 TORCH_LIBRARY_IMPL(xformers, CUDA, m) {

From 0dc57854f1c672280d646e0df4bdbec1af877c21 Mon Sep 17 00:00:00 2001
From: Max Podkorytov <4273004+tenpercent@users.noreply.github.com>
Date: Wed, 27 Sep 2023 18:48:43 -0400
Subject: [PATCH 137/641] move the decoder op backend to its own file

---
 .../hip_fmha/attention_forward_decoder.cpp    | 104 ++++++++++++++++++
 .../hip_fmha/attention_forward_generic.cpp    |  89 ---------------
 2 files changed, 104 insertions(+), 89 deletions(-)
 create mode 100644 xformers/csrc/attention/hip_fmha/attention_forward_decoder.cpp

diff --git a/xformers/csrc/attention/hip_fmha/attention_forward_decoder.cpp b/xformers/csrc/attention/hip_fmha/attention_forward_decoder.cpp
new file mode 100644
index 000000000..e23f398a1
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/attention_forward_decoder.cpp
@@ -0,0 +1,104 @@
+/* 
+  TODO: license header
+*/
+
+#include <c10/cuda/CUDAStream.h>
+#include <torch/library.h>
+#include <ATen/Dispatch.h>
+#include <ATen/Functions.h>
+#include <ATen/Tensor.h>
+
+namespace {
+
+template<typename scalar_t>
+__global__ void
+efficient_attention_forward_decoder_ck_kernel(
+    at::PackedTensorAccessor32<scalar_t, 4, at::RestrictPtrTraits> XQ,
+    at::PackedTensorAccessor64<scalar_t, 4, at::RestrictPtrTraits> cache_K,
+    at::PackedTensorAccessor64<scalar_t, 4, at::RestrictPtrTraits> cache_V,
+    at::PackedTensorAccessor32<scalar_t, 4, at::RestrictPtrTraits> O,
+    at::PackedTensorAccessor32<int32_t, 1, at::RestrictPtrTraits> seq_positions,
+    float qk_scale
+) {
+  __syncthreads();
+}
+
+#define AT_DISPATCH_CASE_3(SCALARTYPE1, SCALARTYPE2, SCALARTYPE3, ...) \
+  AT_DISPATCH_CASE(SCALARTYPE1, __VA_ARGS__) \
+  AT_DISPATCH_CASE(SCALARTYPE2, __VA_ARGS__) \
+  AT_DISPATCH_CASE(SCALARTYPE3, __VA_ARGS__) 
+  
+#define AT_DISPATCH_SWITCH_3(SCALARTYPE1, SCALARTYPE2, SCALARTYPE3, TYPE, NAME, ...) \
+  AT_DISPATCH_SWITCH(                                               \
+      TYPE,                                                         \
+      NAME,                                                         \
+      AT_DISPATCH_CASE_3(SCALARTYPE1, SCALARTYPE2, SCALARTYPE3, __VA_ARGS__))
+
+at::Tensor
+efficient_attention_forward_decoder_ck(
+    const at::Tensor& XQ, // [B, 1, H, D]
+    const at::Tensor& cache_K, // [B, T_MAX, H or 1, D]
+    const at::Tensor& cache_V, // [B, T_MAX, H or 1, D]
+    const at::Tensor& seq_positions, // [B]
+    double qk_scale) {
+
+  constexpr int32_t kThreadsPerWavefront = 32;
+  constexpr int32_t kWavefrontsPerBlock = 32;
+  constexpr int32_t D_H = 128;
+  constexpr int32_t T_MAX = 8192;
+
+  at::OptionalDeviceGuard guard(XQ.device());
+  TORCH_CHECK(XQ.is_cuda());
+  TORCH_CHECK(cache_K.is_cuda());
+  TORCH_CHECK(cache_V.is_cuda());
+
+  TORCH_CHECK(seq_positions.is_cuda());
+
+  TORCH_CHECK(cache_K.size(1) <= T_MAX);
+  TORCH_CHECK(cache_K.size(3) == D_H);
+
+  auto O = at::empty_like(XQ);
+  auto B = XQ.size(0);
+  auto H = XQ.size(2);
+  dim3 blocks(B, H);
+  dim3 threads(kThreadsPerWavefront, kWavefrontsPerBlock);
+
+  int32_t smem_softmax = T_MAX * sizeof(float) + kWavefrontsPerBlock * sizeof(float);
+  int32_t smem_output = D_H * sizeof(float) * kWavefrontsPerBlock;
+  int32_t smem = max(smem_softmax, smem_output);
+  auto stream = at::cuda::getCurrentHIPStream().stream();
+
+  AT_DISPATCH_SWITCH_3(at::ScalarType::Half, at::ScalarType::BFloat16, at::ScalarType::Float, 
+    XQ.scalar_type(), "efficient_attention_forward_decoder_ck", [&] {
+      auto* kernel = &efficient_attention_forward_decoder_ck_kernel<scalar_t>;
+      if (smem > 48 * 1024) {
+        C10_CUDA_CHECK(hipFuncSetAttribute(
+            reinterpret_cast<void*&>(kernel),
+            hipFuncAttributeMaxDynamicSharedMemorySize,
+            smem));
+      }
+      kernel
+          <<<blocks, threads, smem, stream>>>(
+              XQ.packed_accessor32<scalar_t, 4, at::RestrictPtrTraits>(),
+              cache_K.packed_accessor64<scalar_t, 4, at::RestrictPtrTraits>(),
+              cache_V.packed_accessor64<scalar_t, 4, at::RestrictPtrTraits>(),
+              O.packed_accessor32<scalar_t, 4, at::RestrictPtrTraits>(),
+              seq_positions
+                  .packed_accessor32<int32_t, 1, at::RestrictPtrTraits>(),
+              qk_scale);
+      C10_CUDA_KERNEL_LAUNCH_CHECK();
+  });
+
+  return O;
+}  
+
+#undef AT_DISPATCH_CASE_3
+#undef AT_DISPATCH_SWITCH_3
+
+} // namespace
+
+TORCH_LIBRARY_IMPL(xformers, CUDA, m) {
+  m.impl(
+      TORCH_SELECTIVE_NAME("xformers::efficient_attention_forward_decoder_ck"),
+      TORCH_FN(efficient_attention_forward_decoder_ck));
+}
\ No newline at end of file
diff --git a/xformers/csrc/attention/hip_fmha/attention_forward_generic.cpp b/xformers/csrc/attention/hip_fmha/attention_forward_generic.cpp
index e93e11010..aaafa1b3b 100644
--- a/xformers/csrc/attention/hip_fmha/attention_forward_generic.cpp
+++ b/xformers/csrc/attention/hip_fmha/attention_forward_generic.cpp
@@ -408,99 +408,10 @@ efficient_attention_forward_ck(
   return std::make_tuple(out, logsumexp, philox_seed, philox_offset);
 }
 
-template<typename scalar_t>
-__global__ void
-efficient_attention_forward_decoder_ck_kernel(
-    at::PackedTensorAccessor32<scalar_t, 4, at::RestrictPtrTraits> XQ,
-    at::PackedTensorAccessor64<scalar_t, 4, at::RestrictPtrTraits> cache_K,
-    at::PackedTensorAccessor64<scalar_t, 4, at::RestrictPtrTraits> cache_V,
-    at::PackedTensorAccessor32<scalar_t, 4, at::RestrictPtrTraits> O,
-    at::PackedTensorAccessor32<int32_t, 1, at::RestrictPtrTraits> seq_positions,
-    float qk_scale
-) {
-  __syncthreads();
-}
-
-#define AT_DISPATCH_CASE_3(SCALARTYPE1, SCALARTYPE2, SCALARTYPE3, ...) \
-  AT_DISPATCH_CASE(SCALARTYPE1, __VA_ARGS__) \
-  AT_DISPATCH_CASE(SCALARTYPE2, __VA_ARGS__) \
-  AT_DISPATCH_CASE(SCALARTYPE3, __VA_ARGS__) 
-  
-#define AT_DISPATCH_SWITCH_3(SCALARTYPE1, SCALARTYPE2, SCALARTYPE3, TYPE, NAME, ...) \
-  AT_DISPATCH_SWITCH(                                               \
-      TYPE,                                                         \
-      NAME,                                                         \
-      AT_DISPATCH_CASE_3(SCALARTYPE1, SCALARTYPE2, SCALARTYPE3, __VA_ARGS__))
-
-at::Tensor
-efficient_attention_forward_decoder_ck(
-    const at::Tensor& XQ, // [B, 1, H, D]
-    const at::Tensor& cache_K, // [B, T_MAX, H or 1, D]
-    const at::Tensor& cache_V, // [B, T_MAX, H or 1, D]
-    const at::Tensor& seq_positions, // [B]
-    double qk_scale) {
-
-  constexpr int32_t kThreadsPerWavefront = 32;
-  constexpr int32_t kWavefrontsPerBlock = 32;
-  constexpr int32_t D_H = 128;
-  constexpr int32_t T_MAX = 8192;
-
-  at::OptionalDeviceGuard guard(XQ.device());
-  TORCH_CHECK(XQ.is_cuda());
-  TORCH_CHECK(cache_K.is_cuda());
-  TORCH_CHECK(cache_V.is_cuda());
-
-  TORCH_CHECK(seq_positions.is_cuda());
-
-  TORCH_CHECK(cache_K.size(1) <= T_MAX);
-  TORCH_CHECK(cache_K.size(3) == D_H);
-
-  auto O = at::empty_like(XQ);
-  auto B = XQ.size(0);
-  auto H = XQ.size(2);
-  dim3 blocks(B, H);
-  dim3 threads(kThreadsPerWavefront, kWavefrontsPerBlock);
-
-  int32_t smem_softmax = T_MAX * sizeof(float) + kWavefrontsPerBlock * sizeof(float);
-  int32_t smem_output = D_H * sizeof(float) * kWavefrontsPerBlock;
-  int32_t smem = max(smem_softmax, smem_output);
-  auto stream = at::cuda::getCurrentHIPStream().stream();
-
-  AT_DISPATCH_SWITCH_3(at::ScalarType::Half, at::ScalarType::BFloat16, at::ScalarType::Float, 
-    XQ.scalar_type(), "efficient_attention_forward_decoder_ck", [&] {
-      auto* kernel = &efficient_attention_forward_decoder_ck_kernel<scalar_t>;
-      if (smem > 48 * 1024) {
-        C10_CUDA_CHECK(hipFuncSetAttribute(
-            reinterpret_cast<void*&>(kernel),
-            hipFuncAttributeMaxDynamicSharedMemorySize,
-            smem));
-      }
-      kernel
-          <<<blocks, threads, smem, stream>>>(
-              XQ.packed_accessor32<scalar_t, 4, at::RestrictPtrTraits>(),
-              cache_K.packed_accessor64<scalar_t, 4, at::RestrictPtrTraits>(),
-              cache_V.packed_accessor64<scalar_t, 4, at::RestrictPtrTraits>(),
-              O.packed_accessor32<scalar_t, 4, at::RestrictPtrTraits>(),
-              seq_positions
-                  .packed_accessor32<int32_t, 1, at::RestrictPtrTraits>(),
-              qk_scale);
-      C10_CUDA_KERNEL_LAUNCH_CHECK();
-  });
-
-  return O;
-}  
-
-#undef AT_DISPATCH_CASE_3
-#undef AT_DISPATCH_SWITCH_3
-
 } // namespace
 
 TORCH_LIBRARY_IMPL(xformers, CUDA, m) {
   m.impl(
       TORCH_SELECTIVE_NAME("xformers::efficient_attention_forward_ck"),
       TORCH_FN(efficient_attention_forward_ck));
-
-  m.impl(
-      TORCH_SELECTIVE_NAME("xformers::efficient_attention_forward_decoder_ck"),
-      TORCH_FN(efficient_attention_forward_decoder_ck));
 }

From 7233f7ee7f7b778fe10894d1694fade67f6250b0 Mon Sep 17 00:00:00 2001
From: Max Podkorytov <4273004+tenpercent@users.noreply.github.com>
Date: Thu, 28 Sep 2023 01:00:01 -0400
Subject: [PATCH 138/641] do a manual hipification pass on the decoder kernel

---
 tests/test_mem_eff_attention_ck.py            |   2 +-
 .../hip_fmha/attention_forward_decoder.cpp    | 354 +++++++++++++++++-
 xformers/ops/fmha/ck_decoder.py               |   8 +-
 3 files changed, 354 insertions(+), 10 deletions(-)

diff --git a/tests/test_mem_eff_attention_ck.py b/tests/test_mem_eff_attention_ck.py
index a3c363fe0..c4240d21c 100644
--- a/tests/test_mem_eff_attention_ck.py
+++ b/tests/test_mem_eff_attention_ck.py
@@ -1629,7 +1629,7 @@ def test_decoder(
 ) -> None:
     dtype_ = {"f16": torch.float16, "bf16": torch.bfloat16, "f32": torch.float}[dtype]
     torch.manual_seed(1)
-    d = 128
+    d = 256
     k_shape = (1, bsz * padding, n_heads, d)
     # TODO: support 2 kv heads etc.
     k = torch.randn(k_shape, dtype=dtype_).cuda()
diff --git a/xformers/csrc/attention/hip_fmha/attention_forward_decoder.cpp b/xformers/csrc/attention/hip_fmha/attention_forward_decoder.cpp
index e23f398a1..40c7323fd 100644
--- a/xformers/csrc/attention/hip_fmha/attention_forward_decoder.cpp
+++ b/xformers/csrc/attention/hip_fmha/attention_forward_decoder.cpp
@@ -2,14 +2,146 @@
   TODO: license header
 */
 
+// #include <ck/ck.hpp>
+#include <ck/utility/data_type.hpp>
+#include <ck/utility/inner_product.hpp>
 #include <c10/cuda/CUDAStream.h>
 #include <torch/library.h>
 #include <ATen/Dispatch.h>
 #include <ATen/Functions.h>
 #include <ATen/Tensor.h>
 
+namespace ck {
+template <>
+__device__ void inner_product<bhalf_t, bhalf_t, float>(const bhalf_t& a, const bhalf_t& b, float& c)
+{
+    inner_product(type_convert<float>(a), type_convert<float>(b), c);
+}
+
+template <>
+__device__ void inner_product<bhalf4_t, bhalf4_t, float>(const bhalf4_t& a, const bhalf4_t& b, float& c)
+{
+    constexpr auto I0 = Number<0>{};
+    constexpr auto I1 = Number<1>{};
+    constexpr auto I2 = Number<2>{};
+    constexpr auto I3 = Number<3>{};
+
+    inner_product(vector_type<bhalf_t, 4>{a}.AsType<bhalf_t>()[I0],
+                  vector_type<bhalf_t, 4>{b}.AsType<bhalf_t>()[I0],
+                  c);
+
+    inner_product(vector_type<bhalf_t, 4>{a}.AsType<bhalf_t>()[I1],
+                  vector_type<bhalf_t, 4>{b}.AsType<bhalf_t>()[I1],
+                  c);
+
+    inner_product(vector_type<bhalf_t, 4>{a}.AsType<bhalf_t>()[I2],
+                  vector_type<bhalf_t, 4>{b}.AsType<bhalf_t>()[I2],
+                  c);
+
+    inner_product(vector_type<bhalf_t, 4>{a}.AsType<bhalf_t>()[I3],
+                  vector_type<bhalf_t, 4>{b}.AsType<bhalf_t>()[I3],
+                  c);
+}
+} // namespace ck
+
 namespace {
 
+constexpr int32_t kThreadsPerWavefront = 64;
+constexpr int32_t kWavefrontsPerBlock = 16;
+constexpr int32_t D_H = 256;
+constexpr int32_t T_MAX = 8192;
+
+// read 4 elements in one instruction
+template<typename c10_t>
+struct c10_to_read_t;
+
+template<>
+struct c10_to_read_t<float> {
+  using type = uint4;
+};
+
+template<>
+struct c10_to_read_t<c10::Half> {
+  using type = uint2;
+};
+
+template<>
+struct c10_to_read_t<c10::BFloat16> {
+  using type = uint2;
+};
+
+template<typename c10_t>
+struct c10_to_data_t;
+
+template<>
+struct c10_to_data_t<float> {
+    using type = float_t;
+    using vec4 = ck::float4_t;
+};
+
+template<>
+struct c10_to_data_t<c10::Half> {
+    using type = ck::half_t;
+    using vec4 = ck::half4_t;
+};
+
+template<>
+struct c10_to_data_t<c10::BFloat16> {
+    using type = ck::bhalf_t;
+    using vec4 = ck::bhalf4_t;
+};
+
+template<typename read_t, typename data_t>
+__device__
+float4 scalar4_scale_acc(float4 acc, const read_t* ra, float b);
+
+template<>
+__device__  
+float4
+scalar4_scale_acc<uint4, float>(float4 acc, const uint4* ra, float b) {
+  const auto* a = reinterpret_cast<const float4*>(ra);
+  acc.x += a->x * b;
+  acc.y += a->y * b;
+  acc.z += a->z * b;
+  acc.w += a->w * b;
+  return acc;
+}
+
+template<>
+__device__  
+float4
+scalar4_scale_acc<uint2, ck::half_t>(float4 acc, const uint2* ra, float b) {
+  const auto* a = reinterpret_cast<const ck::half4_t*>(ra);
+  acc.x += a->x * b;
+  acc.y += a->y * b;
+  acc.z += a->z * b;
+  acc.w += a->w * b;
+  return acc;
+}
+
+template<>
+__device__  
+float4
+scalar4_scale_acc<uint2, ck::bhalf_t>(float4 acc, const uint2* ra, float b) {
+  const auto* a = reinterpret_cast<const ck::bhalf4_t*>(ra);
+  acc.x += a->x * b;
+  acc.y += a->y * b;
+  acc.z += a->z * b;
+  acc.w += a->w * b;
+  return acc;
+}
+
+template <typename F>
+float
+__device__ wavefrontReduce(float val) {
+  auto reducer = F();  
+#pragma unroll
+  for (uint mask = kThreadsPerWavefront >> 1; mask > 0; mask >>= 1) {
+    val = reducer(val, __shfl_xor(val, mask, kThreadsPerWavefront));
+  }
+  return val;
+}
+
 template<typename scalar_t>
 __global__ void
 efficient_attention_forward_decoder_ck_kernel(
@@ -20,7 +152,224 @@ efficient_attention_forward_decoder_ck_kernel(
     at::PackedTensorAccessor32<int32_t, 1, at::RestrictPtrTraits> seq_positions,
     float qk_scale
 ) {
+  static_assert(4 * kThreadsPerWavefront == D_H, "");
+  static_assert(kWavefrontsPerBlock <= kThreadsPerWavefront, "");
+
+  constexpr int32_t seq_positions_shift = 0;
+
+  extern __shared__ __align__(16) float smem[];
+
+  // Each block handles a single batch and head
+  int32_t b = blockIdx.x;
+  int32_t h = blockIdx.y;
+
+  // Note: this is decoding case where we attend to current and all previous
+  // tokens.
+  int32_t t_max = seq_positions[b] + seq_positions_shift;
+
+  int32_t wavefront_idx = threadIdx.y;
+  // need kWavefrontsPerBlock == blockDim.y;
+  // Need D_H == 128
+  const auto* q_ = &(XQ[b][0][h][0]);
+
+  bool multiquery = cache_K.size(2) == 1;
+  auto* cache_K_base = &cache_K[b][0][multiquery ? 0 : h][0];
+  auto* cache_V_base = &cache_V[b][0][multiquery ? 0 : h][0];
+
+  // Load Q into registers in all wavefronts.
+  // Each thread handles 4 D dimensions
+  using read_t = typename c10_to_read_t<scalar_t>::type;
+  using data_t = typename c10_to_data_t<scalar_t>::type;
+  using data_vec4_t = typename c10_to_data_t<scalar_t>::vec4;
+  const read_t* q_thread = reinterpret_cast<const read_t*>(q_) + threadIdx.x;
+
+  // Each block computes different B value
+  float max_qk_acc = std::numeric_limits<float>::lowest();
+
+  // Compute S[T_MAX] = for i in range(T): S[t] = sum(Q[d] * K[t, d])
+  // Split T across wavefronts in a block, unroll loads to expose more
+  // parallelism.
+
+  constexpr int32_t kTimeUnroll = 1;
+  const read_t* k_loads[kTimeUnroll];
+
+  const int32_t t_max_unroll =
+    (t_max / (kWavefrontsPerBlock * kTimeUnroll)) * (kWavefrontsPerBlock * kTimeUnroll);
+
+  for (auto tt = wavefront_idx * kTimeUnroll; tt < t_max_unroll;
+       tt += kWavefrontsPerBlock * kTimeUnroll) {
+#pragma unroll kTimeUnroll
+    for (auto ttt = 0; ttt < kTimeUnroll; ++ttt) {
+      int32_t t = tt + ttt;
+      auto* k_ = cache_K_base + t * cache_K.stride(1);
+      // scalar4<scalar_t> k_thread;
+      k_loads[ttt] =
+          reinterpret_cast<const read_t*>(k_) + threadIdx.x;
+    }
+#pragma unroll kTimeUnroll
+    for (auto ttt = 0; ttt < kTimeUnroll; ++ttt) {
+      float qk_acc = 0;
+      int32_t t = tt + ttt;
+
+      ck::inner_product<data_vec4_t, data_vec4_t, float>(*reinterpret_cast<const data_vec4_t*>(q_thread), 
+                                                         *reinterpret_cast<const data_vec4_t*>(k_loads[ttt]), 
+                                                         qk_acc);
+      qk_acc *= qk_scale;
+
+      qk_acc = wavefrontReduce<std::plus<float>>(qk_acc);
+      max_qk_acc = max(qk_acc, max_qk_acc);
+
+      // write accumulated sums to smem.
+      if (threadIdx.x == 0) {
+        smem[t] = qk_acc;
+      }
+    }
+  }
+
+  constexpr int32_t kTimeUnroll1 = 1;
+  for (auto tt = t_max_unroll + wavefront_idx; tt < t_max;
+       tt += kWavefrontsPerBlock * kTimeUnroll1) {
+#pragma unroll kTimeUnroll1
+    for (auto ttt = 0; ttt < kTimeUnroll1; ++ttt) {
+      int32_t t = tt + ttt;
+      // &(cache_K[b][t][0][0]);
+      auto* k_ = cache_K_base + t * cache_K.stride(1);
+      // scalar4<scalar_t> k_thread;
+      k_loads[ttt] =
+          reinterpret_cast<const read_t*>(k_) + threadIdx.x;
+    }
+#pragma unroll kTimeUnroll1
+    for (auto ttt = 0; ttt < kTimeUnroll1; ++ttt) {
+      float qk_acc = 0;
+      int32_t t = tt + ttt;
+      ck::inner_product<data_vec4_t, data_vec4_t, float>(*reinterpret_cast<const data_vec4_t*>(q_thread), 
+                                                         *reinterpret_cast<const data_vec4_t*>(k_loads[ttt]), 
+                                                         qk_acc);
+      qk_acc *= qk_scale;
+
+      qk_acc = wavefrontReduce<std::plus<float>>(qk_acc);
+      max_qk_acc = max(qk_acc, max_qk_acc);
+
+      // write accumulated sums to smem.
+      if (threadIdx.x == 0) {
+        smem[t] = qk_acc;
+      }
+    }
+  }
+
+  // Use shared reduction to compute max and compute softmax on shared memory.
+  // write max acc
+  if (threadIdx.x == 0) {
+    smem[T_MAX + wavefront_idx] = max_qk_acc;
+  }
+  __syncthreads();
+  if (threadIdx.x < kWavefrontsPerBlock) {
+    max_qk_acc = max(max_qk_acc, smem[T_MAX + threadIdx.x]);
+  }
+  // shared across all threads in block
+  max_qk_acc = wavefrontReduce<std::greater<float>>(max_qk_acc);
+  // each wavefront computes partial sum of exp.
+  float softmax_denominator = 0.0f;
+  for (int32_t t = threadIdx.x + wavefront_idx * kThreadsPerWavefront; t < t_max;
+       t += kWavefrontsPerBlock * kThreadsPerWavefront) {
+    softmax_denominator += __expf(smem[t] - max_qk_acc);
+  }
+  softmax_denominator = wavefrontReduce<std::plus<float>>(softmax_denominator);
+
+  __syncthreads();
+  if (threadIdx.x == 0) {
+    smem[T_MAX + wavefront_idx] = softmax_denominator;
+  }
+  __syncthreads();
+  
+  // now, compute sum of exp(x - max(x)) over all intermediate results.
+  softmax_denominator = 0.0;
+  if (threadIdx.x < kWavefrontsPerBlock) {
+    softmax_denominator = smem[T_MAX + threadIdx.x];
+  }
+  softmax_denominator = wavefrontReduce<std::plus<float>>(softmax_denominator);
+
+  // now, compute the normalization across all threads.
+  for (int32_t t = threadIdx.x + wavefront_idx * kThreadsPerWavefront; t < t_max;
+       t += kWavefrontsPerBlock * kThreadsPerWavefront) {
+    smem[t] = __expf(smem[t] - max_qk_acc) / softmax_denominator;
+  }
+  __syncthreads();
+
+  // Now, we can comute the softmax and write the outputs.
+
+  // Split T across wavefronts in a block
+  // each wavefront compute sum(t_subset) P[t] * V[t_subset, d]
+  // outputs are of size float[D]
+
+  float ps[kTimeUnroll];
+  float4 o_acc;
+  for (auto tt = wavefront_idx * kTimeUnroll; tt < t_max_unroll;
+       tt += kWavefrontsPerBlock * kTimeUnroll) {
+#pragma unroll kTimeUnroll
+    for (auto ttt = 0; ttt < kTimeUnroll; ++ttt) {
+      int32_t t = tt + ttt;
+      // &(cache_V[b][t][0][0]);
+      auto* v_ = cache_V_base + t * cache_V.stride(1);
+      //   scalar4<scalar_t> v_thread;
+      k_loads[ttt] =
+          reinterpret_cast<const read_t*>(v_) + threadIdx.x;
+      ps[ttt] = smem[t];
+    }
+
+#pragma unroll kTimeUnroll
+    for (auto ttt = 0; ttt < kTimeUnroll; ++ttt) {
+      o_acc = scalar4_scale_acc<read_t, data_t>(o_acc, k_loads[ttt], ps[ttt]);
+    }
+  }
+
+
+  for (auto tt = t_max_unroll + wavefront_idx; tt < t_max;
+       tt += kWavefrontsPerBlock * kTimeUnroll1) {
+#pragma unroll kTimeUnroll1
+    for (auto ttt = 0; ttt < kTimeUnroll1; ++ttt) {
+      int32_t t = tt + ttt;
+      // &(cache_V[b][t][0][0]);
+      auto* v_ = cache_V_base + t * cache_V.stride(1);
+      //   scalar4<scalar_t> v_thread;
+      k_loads[ttt] =
+          reinterpret_cast<const read_t*>(v_) + threadIdx.x;
+      ps[ttt] = smem[t];
+    }
+
+#pragma unroll kTimeUnroll1
+    for (auto ttt = 0; ttt < kTimeUnroll1; ++ttt) {
+      o_acc = scalar4_scale_acc<read_t, data_t>(o_acc, k_loads[ttt], ps[ttt]);
+    }
+  }
+  // now, each thread has partial sums. Write to smem and get accumulated
+  // results back.
+  __syncthreads();
+
+  *(reinterpret_cast<float4*>(smem) + wavefront_idx * kThreadsPerWavefront +
+    threadIdx.x) = o_acc;
   __syncthreads();
+  // sum up partial D rows from other wavefronts
+  if (wavefront_idx == 0) {
+    float4 r = make_float4(0, 0, 0, 0);
+    for (int32_t w = 0; w < kWavefrontsPerBlock; ++w) {
+      auto partial_r = *(
+          reinterpret_cast<float4*>(smem) + w * kThreadsPerWavefront + threadIdx.x);
+      r.x += partial_r.x;
+      r.y += partial_r.y;
+      r.z += partial_r.z;
+      r.w += partial_r.w;
+    }
+    // write output D row
+    auto* o_ = reinterpret_cast<read_t*>(&O[b][0][h][0]);
+    typename c10_to_data_t<scalar_t>::vec4 bf_r;
+    bf_r.x = r.x;
+    bf_r.y = r.y;
+    bf_r.z = r.z;
+    bf_r.w = r.w;
+    o_[threadIdx.x] =
+        *reinterpret_cast<const read_t*>(&bf_r);
+  }
 }
 
 #define AT_DISPATCH_CASE_3(SCALARTYPE1, SCALARTYPE2, SCALARTYPE3, ...) \
@@ -42,11 +391,6 @@ efficient_attention_forward_decoder_ck(
     const at::Tensor& seq_positions, // [B]
     double qk_scale) {
 
-  constexpr int32_t kThreadsPerWavefront = 32;
-  constexpr int32_t kWavefrontsPerBlock = 32;
-  constexpr int32_t D_H = 128;
-  constexpr int32_t T_MAX = 8192;
-
   at::OptionalDeviceGuard guard(XQ.device());
   TORCH_CHECK(XQ.is_cuda());
   TORCH_CHECK(cache_K.is_cuda());
diff --git a/xformers/ops/fmha/ck_decoder.py b/xformers/ops/fmha/ck_decoder.py
index 1a5eba6f3..2c7d1ead8 100644
--- a/xformers/ops/fmha/ck_decoder.py
+++ b/xformers/ops/fmha/ck_decoder.py
@@ -11,8 +11,8 @@
 class FwOp(AttentionFwOpBase):
     OPERATOR = get_xformers_operator("efficient_attention_forward_decoder_ck")
     SUPPORTED_DEVICES: Set[str] = {"cuda"}
-    SUPPORTED_DTYPES: Set[torch.dtype] = {torch.half, torch.bfloat16}
-    SUPPORTED_MAX_K: float = 128
+    SUPPORTED_DTYPES: Set[torch.dtype] = {torch.half, torch.bfloat16, torch.float}
+    SUPPORTED_MAX_K: float = 256
     SUPPORTED_ATTN_BIAS_TYPES: Set[Any] = {BlockDiagonalCausalWithOffsetPaddedKeysMask}
     SUPPORTS_DROPOUT = False
     SUPPORTS_CUSTOM_SCALE = True
@@ -31,8 +31,8 @@ def not_supported_reasons(cls, d: Inputs) -> List[str]:
             if d.query.shape[0] != 1:
                 reasons.append("One formal batch element expected")
 
-            if d.query.shape[-1] != 128:
-                reasons.append("Only head_dim==128 for now.")
+            if d.query.shape[-1] != 256:
+                reasons.append("Only head_dim==256 for now.")
 
             if d.key.stride(-1) != 1:
                 reasons.append("expect keys to have last dim contiguous")

From 39d62705b3033c4eda7e1a9830a8fd7827af67be Mon Sep 17 00:00:00 2001
From: Max Podkorytov <4273004+tenpercent@users.noreply.github.com>
Date: Thu, 28 Sep 2023 12:36:00 -0400
Subject: [PATCH 139/641] use type_convert for float arithmetics

---
 .../hip_fmha/attention_forward_decoder.cpp    | 19 +++++++++----------
 1 file changed, 9 insertions(+), 10 deletions(-)

diff --git a/xformers/csrc/attention/hip_fmha/attention_forward_decoder.cpp b/xformers/csrc/attention/hip_fmha/attention_forward_decoder.cpp
index 40c7323fd..1d8e2d4c0 100644
--- a/xformers/csrc/attention/hip_fmha/attention_forward_decoder.cpp
+++ b/xformers/csrc/attention/hip_fmha/attention_forward_decoder.cpp
@@ -112,10 +112,10 @@ __device__
 float4
 scalar4_scale_acc<uint2, ck::half_t>(float4 acc, const uint2* ra, float b) {
   const auto* a = reinterpret_cast<const ck::half4_t*>(ra);
-  acc.x += a->x * b;
-  acc.y += a->y * b;
-  acc.z += a->z * b;
-  acc.w += a->w * b;
+  acc.x += ck::type_convert<float>(a->x) * b;
+  acc.y += ck::type_convert<float>(a->y) * b;
+  acc.z += ck::type_convert<float>(a->z) * b;
+  acc.w += ck::type_convert<float>(a->w) * b;
   return acc;
 }
 
@@ -124,10 +124,10 @@ __device__
 float4
 scalar4_scale_acc<uint2, ck::bhalf_t>(float4 acc, const uint2* ra, float b) {
   const auto* a = reinterpret_cast<const ck::bhalf4_t*>(ra);
-  acc.x += a->x * b;
-  acc.y += a->y * b;
-  acc.z += a->z * b;
-  acc.w += a->w * b;
+  acc.x += ck::type_convert<float>(a->x) * b;
+  acc.y += ck::type_convert<float>(a->y) * b;
+  acc.z += ck::type_convert<float>(a->z) * b;
+  acc.w += ck::type_convert<float>(a->w) * b;
   return acc;
 }
 
@@ -296,7 +296,7 @@ efficient_attention_forward_decoder_ck_kernel(
   }
   __syncthreads();
 
-  // Now, we can comute the softmax and write the outputs.
+  // Now, we can compute the softmax and write the outputs.
 
   // Split T across wavefronts in a block
   // each wavefront compute sum(t_subset) P[t] * V[t_subset, d]
@@ -323,7 +323,6 @@ efficient_attention_forward_decoder_ck_kernel(
     }
   }
 
-
   for (auto tt = t_max_unroll + wavefront_idx; tt < t_max;
        tt += kWavefrontsPerBlock * kTimeUnroll1) {
 #pragma unroll kTimeUnroll1

From d2fadf08953c0836a6c74caa1664c4156e33aaa4 Mon Sep 17 00:00:00 2001
From: Max Podkorytov <4273004+tenpercent@users.noreply.github.com>
Date: Thu, 28 Sep 2023 12:55:51 -0400
Subject: [PATCH 140/641] bugfix uninitialized float4

---
 xformers/csrc/attention/hip_fmha/attention_forward_decoder.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/xformers/csrc/attention/hip_fmha/attention_forward_decoder.cpp b/xformers/csrc/attention/hip_fmha/attention_forward_decoder.cpp
index 1d8e2d4c0..efd0296ba 100644
--- a/xformers/csrc/attention/hip_fmha/attention_forward_decoder.cpp
+++ b/xformers/csrc/attention/hip_fmha/attention_forward_decoder.cpp
@@ -303,7 +303,7 @@ efficient_attention_forward_decoder_ck_kernel(
   // outputs are of size float[D]
 
   float ps[kTimeUnroll];
-  float4 o_acc;
+  float4 o_acc = make_float4(0, 0, 0, 0);
   for (auto tt = wavefront_idx * kTimeUnroll; tt < t_max_unroll;
        tt += kWavefrontsPerBlock * kTimeUnroll) {
 #pragma unroll kTimeUnroll

From 78345f1a51955a517fe1e80ab3235dfee14dbe1e Mon Sep 17 00:00:00 2001
From: Max Podkorytov <4273004+tenpercent@users.noreply.github.com>
Date: Thu, 28 Sep 2023 13:37:29 -0400
Subject: [PATCH 141/641] reduce the number of casts between internal types

---
 .../hip_fmha/attention_forward_decoder.cpp    | 61 +++++++++----------
 1 file changed, 29 insertions(+), 32 deletions(-)

diff --git a/xformers/csrc/attention/hip_fmha/attention_forward_decoder.cpp b/xformers/csrc/attention/hip_fmha/attention_forward_decoder.cpp
index efd0296ba..cfd8ace15 100644
--- a/xformers/csrc/attention/hip_fmha/attention_forward_decoder.cpp
+++ b/xformers/csrc/attention/hip_fmha/attention_forward_decoder.cpp
@@ -91,43 +91,40 @@ struct c10_to_data_t<c10::BFloat16> {
     using vec4 = ck::bhalf4_t;
 };
 
-template<typename read_t, typename data_t>
+template<typename data4_t>
 __device__
-float4 scalar4_scale_acc(float4 acc, const read_t* ra, float b);
+float4 scalar4_scale_acc(float4 acc, const data4_t& a, float b);
 
 template<>
 __device__  
 float4
-scalar4_scale_acc<uint4, float>(float4 acc, const uint4* ra, float b) {
-  const auto* a = reinterpret_cast<const float4*>(ra);
-  acc.x += a->x * b;
-  acc.y += a->y * b;
-  acc.z += a->z * b;
-  acc.w += a->w * b;
+scalar4_scale_acc<ck::float4_t>(float4 acc, const ck::float4_t& a, float b) {
+  acc.x += a.x * b;
+  acc.y += a.y * b;
+  acc.z += a.z * b;
+  acc.w += a.w * b;
   return acc;
 }
 
 template<>
 __device__  
 float4
-scalar4_scale_acc<uint2, ck::half_t>(float4 acc, const uint2* ra, float b) {
-  const auto* a = reinterpret_cast<const ck::half4_t*>(ra);
-  acc.x += ck::type_convert<float>(a->x) * b;
-  acc.y += ck::type_convert<float>(a->y) * b;
-  acc.z += ck::type_convert<float>(a->z) * b;
-  acc.w += ck::type_convert<float>(a->w) * b;
+scalar4_scale_acc<ck::half4_t>(float4 acc, const ck::half4_t& a, float b) {
+  acc.x += ck::type_convert<float>(a.x) * b;
+  acc.y += ck::type_convert<float>(a.y) * b;
+  acc.z += ck::type_convert<float>(a.z) * b;
+  acc.w += ck::type_convert<float>(a.w) * b;
   return acc;
 }
 
 template<>
 __device__  
 float4
-scalar4_scale_acc<uint2, ck::bhalf_t>(float4 acc, const uint2* ra, float b) {
-  const auto* a = reinterpret_cast<const ck::bhalf4_t*>(ra);
-  acc.x += ck::type_convert<float>(a->x) * b;
-  acc.y += ck::type_convert<float>(a->y) * b;
-  acc.z += ck::type_convert<float>(a->z) * b;
-  acc.w += ck::type_convert<float>(a->w) * b;
+scalar4_scale_acc<ck::bhalf4_t>(float4 acc, const ck::bhalf4_t& a, float b) {
+  acc.x += ck::type_convert<float>(a.x) * b;
+  acc.y += ck::type_convert<float>(a.y) * b;
+  acc.z += ck::type_convert<float>(a.z) * b;
+  acc.w += ck::type_convert<float>(a.w) * b;
   return acc;
 }
 
@@ -181,7 +178,7 @@ efficient_attention_forward_decoder_ck_kernel(
   using read_t = typename c10_to_read_t<scalar_t>::type;
   using data_t = typename c10_to_data_t<scalar_t>::type;
   using data_vec4_t = typename c10_to_data_t<scalar_t>::vec4;
-  const read_t* q_thread = reinterpret_cast<const read_t*>(q_) + threadIdx.x;
+  const data_vec4_t q_thread = *(reinterpret_cast<const data_vec4_t*>(q_) + threadIdx.x);
 
   // Each block computes different B value
   float max_qk_acc = std::numeric_limits<float>::lowest();
@@ -191,7 +188,7 @@ efficient_attention_forward_decoder_ck_kernel(
   // parallelism.
 
   constexpr int32_t kTimeUnroll = 1;
-  const read_t* k_loads[kTimeUnroll];
+  data_vec4_t k_loads[kTimeUnroll];
 
   const int32_t t_max_unroll =
     (t_max / (kWavefrontsPerBlock * kTimeUnroll)) * (kWavefrontsPerBlock * kTimeUnroll);
@@ -204,15 +201,15 @@ efficient_attention_forward_decoder_ck_kernel(
       auto* k_ = cache_K_base + t * cache_K.stride(1);
       // scalar4<scalar_t> k_thread;
       k_loads[ttt] =
-          reinterpret_cast<const read_t*>(k_) + threadIdx.x;
+          *(reinterpret_cast<const data_vec4_t*>(k_) + threadIdx.x);
     }
 #pragma unroll kTimeUnroll
     for (auto ttt = 0; ttt < kTimeUnroll; ++ttt) {
       float qk_acc = 0;
       int32_t t = tt + ttt;
 
-      ck::inner_product<data_vec4_t, data_vec4_t, float>(*reinterpret_cast<const data_vec4_t*>(q_thread), 
-                                                         *reinterpret_cast<const data_vec4_t*>(k_loads[ttt]), 
+      ck::inner_product<data_vec4_t, data_vec4_t, float>(q_thread, 
+                                                         k_loads[ttt], 
                                                          qk_acc);
       qk_acc *= qk_scale;
 
@@ -236,14 +233,14 @@ efficient_attention_forward_decoder_ck_kernel(
       auto* k_ = cache_K_base + t * cache_K.stride(1);
       // scalar4<scalar_t> k_thread;
       k_loads[ttt] =
-          reinterpret_cast<const read_t*>(k_) + threadIdx.x;
+          *(reinterpret_cast<const data_vec4_t*>(k_) + threadIdx.x);
     }
 #pragma unroll kTimeUnroll1
     for (auto ttt = 0; ttt < kTimeUnroll1; ++ttt) {
       float qk_acc = 0;
       int32_t t = tt + ttt;
-      ck::inner_product<data_vec4_t, data_vec4_t, float>(*reinterpret_cast<const data_vec4_t*>(q_thread), 
-                                                         *reinterpret_cast<const data_vec4_t*>(k_loads[ttt]), 
+      ck::inner_product<data_vec4_t, data_vec4_t, float>(q_thread, 
+                                                         k_loads[ttt], 
                                                          qk_acc);
       qk_acc *= qk_scale;
 
@@ -313,13 +310,13 @@ efficient_attention_forward_decoder_ck_kernel(
       auto* v_ = cache_V_base + t * cache_V.stride(1);
       //   scalar4<scalar_t> v_thread;
       k_loads[ttt] =
-          reinterpret_cast<const read_t*>(v_) + threadIdx.x;
+          *(reinterpret_cast<const data_vec4_t*>(v_) + threadIdx.x);
       ps[ttt] = smem[t];
     }
 
 #pragma unroll kTimeUnroll
     for (auto ttt = 0; ttt < kTimeUnroll; ++ttt) {
-      o_acc = scalar4_scale_acc<read_t, data_t>(o_acc, k_loads[ttt], ps[ttt]);
+      o_acc = scalar4_scale_acc<data_vec4_t>(o_acc, k_loads[ttt], ps[ttt]);
     }
   }
 
@@ -332,13 +329,13 @@ efficient_attention_forward_decoder_ck_kernel(
       auto* v_ = cache_V_base + t * cache_V.stride(1);
       //   scalar4<scalar_t> v_thread;
       k_loads[ttt] =
-          reinterpret_cast<const read_t*>(v_) + threadIdx.x;
+          *(reinterpret_cast<const data_vec4_t*>(v_) + threadIdx.x);
       ps[ttt] = smem[t];
     }
 
 #pragma unroll kTimeUnroll1
     for (auto ttt = 0; ttt < kTimeUnroll1; ++ttt) {
-      o_acc = scalar4_scale_acc<read_t, data_t>(o_acc, k_loads[ttt], ps[ttt]);
+      o_acc = scalar4_scale_acc<data_vec4_t>(o_acc, k_loads[ttt], ps[ttt]);
     }
   }
   // now, each thread has partial sums. Write to smem and get accumulated

From d8872182a07316a7e886240c204c40aa18db5321 Mon Sep 17 00:00:00 2001
From: Max Podkorytov <4273004+tenpercent@users.noreply.github.com>
Date: Thu, 28 Sep 2023 17:22:21 -0400
Subject: [PATCH 142/641] refactor loading/storing to separate functions

---
 .../hip_fmha/attention_forward_decoder.cpp    | 44 ++++++++++---------
 1 file changed, 24 insertions(+), 20 deletions(-)

diff --git a/xformers/csrc/attention/hip_fmha/attention_forward_decoder.cpp b/xformers/csrc/attention/hip_fmha/attention_forward_decoder.cpp
index cfd8ace15..2ada6ac2d 100644
--- a/xformers/csrc/attention/hip_fmha/attention_forward_decoder.cpp
+++ b/xformers/csrc/attention/hip_fmha/attention_forward_decoder.cpp
@@ -130,15 +130,25 @@ scalar4_scale_acc<ck::bhalf4_t>(float4 acc, const ck::bhalf4_t& a, float b) {
 
 template <typename F>
 float
-__device__ wavefrontReduce(float val) {
+__device__ __forceinline__ wavefrontReduce(float val) {
   auto reducer = F();  
 #pragma unroll
-  for (uint mask = kThreadsPerWavefront >> 1; mask > 0; mask >>= 1) {
+  for (int32_t mask = kThreadsPerWavefront >> 1; mask > 0; mask >>= 1) {
     val = reducer(val, __shfl_xor(val, mask, kThreadsPerWavefront));
   }
   return val;
 }
 
+template <typename TDataPtr, typename TDataVec>
+__device__ TDataVec load_v(const TDataPtr data_ptr, int32_t vector_offset) {
+  return *(reinterpret_cast<const TDataVec*>(data_ptr) + vector_offset);
+}
+
+template <typename TDataPtr, typename TDataVec>
+__device__ void store_v(const TDataPtr data_ptr, int32_t vector_offset, TDataVec value) {
+  *(reinterpret_cast<TDataVec*>(data_ptr) + vector_offset) = value;
+}
+
 template<typename scalar_t>
 __global__ void
 efficient_attention_forward_decoder_ck_kernel(
@@ -178,8 +188,7 @@ efficient_attention_forward_decoder_ck_kernel(
   using read_t = typename c10_to_read_t<scalar_t>::type;
   using data_t = typename c10_to_data_t<scalar_t>::type;
   using data_vec4_t = typename c10_to_data_t<scalar_t>::vec4;
-  const data_vec4_t q_thread = *(reinterpret_cast<const data_vec4_t*>(q_) + threadIdx.x);
-
+  const data_vec4_t q_thread = load_v<decltype(q_), data_vec4_t>(q_, threadIdx.x);
   // Each block computes different B value
   float max_qk_acc = std::numeric_limits<float>::lowest();
 
@@ -200,8 +209,7 @@ efficient_attention_forward_decoder_ck_kernel(
       int32_t t = tt + ttt;
       auto* k_ = cache_K_base + t * cache_K.stride(1);
       // scalar4<scalar_t> k_thread;
-      k_loads[ttt] =
-          *(reinterpret_cast<const data_vec4_t*>(k_) + threadIdx.x);
+      k_loads[ttt] = load_v<decltype(k_), data_vec4_t>(k_, threadIdx.x);
     }
 #pragma unroll kTimeUnroll
     for (auto ttt = 0; ttt < kTimeUnroll; ++ttt) {
@@ -232,8 +240,7 @@ efficient_attention_forward_decoder_ck_kernel(
       // &(cache_K[b][t][0][0]);
       auto* k_ = cache_K_base + t * cache_K.stride(1);
       // scalar4<scalar_t> k_thread;
-      k_loads[ttt] =
-          *(reinterpret_cast<const data_vec4_t*>(k_) + threadIdx.x);
+      k_loads[ttt] = load_v<decltype(k_), data_vec4_t>(k_, threadIdx.x);
     }
 #pragma unroll kTimeUnroll1
     for (auto ttt = 0; ttt < kTimeUnroll1; ++ttt) {
@@ -309,8 +316,8 @@ efficient_attention_forward_decoder_ck_kernel(
       // &(cache_V[b][t][0][0]);
       auto* v_ = cache_V_base + t * cache_V.stride(1);
       //   scalar4<scalar_t> v_thread;
-      k_loads[ttt] =
-          *(reinterpret_cast<const data_vec4_t*>(v_) + threadIdx.x);
+      k_loads[ttt] = load_v<decltype(v_), data_vec4_t>(v_, threadIdx.x);
+
       ps[ttt] = smem[t];
     }
 
@@ -328,8 +335,8 @@ efficient_attention_forward_decoder_ck_kernel(
       // &(cache_V[b][t][0][0]);
       auto* v_ = cache_V_base + t * cache_V.stride(1);
       //   scalar4<scalar_t> v_thread;
-      k_loads[ttt] =
-          *(reinterpret_cast<const data_vec4_t*>(v_) + threadIdx.x);
+      k_loads[ttt] = load_v<decltype(v_), data_vec4_t>(v_, threadIdx.x);
+
       ps[ttt] = smem[t];
     }
 
@@ -342,29 +349,26 @@ efficient_attention_forward_decoder_ck_kernel(
   // results back.
   __syncthreads();
 
-  *(reinterpret_cast<float4*>(smem) + wavefront_idx * kThreadsPerWavefront +
-    threadIdx.x) = o_acc;
+  store_v<float*, float4>(smem, wavefront_idx * kThreadsPerWavefront +
+    threadIdx.x, o_acc);
   __syncthreads();
   // sum up partial D rows from other wavefronts
   if (wavefront_idx == 0) {
     float4 r = make_float4(0, 0, 0, 0);
     for (int32_t w = 0; w < kWavefrontsPerBlock; ++w) {
-      auto partial_r = *(
-          reinterpret_cast<float4*>(smem) + w * kThreadsPerWavefront + threadIdx.x);
+      auto partial_r = load_v<float*, float4>(smem, w * kThreadsPerWavefront + threadIdx.x);
       r.x += partial_r.x;
       r.y += partial_r.y;
       r.z += partial_r.z;
       r.w += partial_r.w;
     }
     // write output D row
-    auto* o_ = reinterpret_cast<read_t*>(&O[b][0][h][0]);
-    typename c10_to_data_t<scalar_t>::vec4 bf_r;
+    data_vec4_t bf_r;
     bf_r.x = r.x;
     bf_r.y = r.y;
     bf_r.z = r.z;
     bf_r.w = r.w;
-    o_[threadIdx.x] =
-        *reinterpret_cast<const read_t*>(&bf_r);
+    store_v<decltype(&O[b][0][h][0]), data_vec4_t>(&O[b][0][h][0], threadIdx.x, bf_r);
   }
 }
 

From 1e3b9cbd7f76a8349d8f0c4176da1cee0669404a Mon Sep 17 00:00:00 2001
From: Max Podkorytov <4273004+tenpercent@users.noreply.github.com>
Date: Thu, 28 Sep 2023 19:01:03 -0400
Subject: [PATCH 143/641] remove references to read_t as we use ck vectors now
 instead of primitive vector types

---
 .../hip_fmha/attention_forward_decoder.cpp    | 28 +++----------------
 1 file changed, 4 insertions(+), 24 deletions(-)

diff --git a/xformers/csrc/attention/hip_fmha/attention_forward_decoder.cpp b/xformers/csrc/attention/hip_fmha/attention_forward_decoder.cpp
index 2ada6ac2d..15dcda3f1 100644
--- a/xformers/csrc/attention/hip_fmha/attention_forward_decoder.cpp
+++ b/xformers/csrc/attention/hip_fmha/attention_forward_decoder.cpp
@@ -51,25 +51,6 @@ constexpr int32_t kWavefrontsPerBlock = 16;
 constexpr int32_t D_H = 256;
 constexpr int32_t T_MAX = 8192;
 
-// read 4 elements in one instruction
-template<typename c10_t>
-struct c10_to_read_t;
-
-template<>
-struct c10_to_read_t<float> {
-  using type = uint4;
-};
-
-template<>
-struct c10_to_read_t<c10::Half> {
-  using type = uint2;
-};
-
-template<>
-struct c10_to_read_t<c10::BFloat16> {
-  using type = uint2;
-};
-
 template<typename c10_t>
 struct c10_to_data_t;
 
@@ -140,12 +121,12 @@ __device__ __forceinline__ wavefrontReduce(float val) {
 }
 
 template <typename TDataPtr, typename TDataVec>
-__device__ TDataVec load_v(const TDataPtr data_ptr, int32_t vector_offset) {
+__device__ TDataVec load_v(TDataPtr data_ptr, int32_t vector_offset) {
   return *(reinterpret_cast<const TDataVec*>(data_ptr) + vector_offset);
 }
 
 template <typename TDataPtr, typename TDataVec>
-__device__ void store_v(const TDataPtr data_ptr, int32_t vector_offset, TDataVec value) {
+__device__ void store_v(TDataPtr data_ptr, int32_t vector_offset, TDataVec value) {
   *(reinterpret_cast<TDataVec*>(data_ptr) + vector_offset) = value;
 }
 
@@ -176,16 +157,15 @@ efficient_attention_forward_decoder_ck_kernel(
 
   int32_t wavefront_idx = threadIdx.y;
   // need kWavefrontsPerBlock == blockDim.y;
-  // Need D_H == 128
+  // Need D_H == 256
   const auto* q_ = &(XQ[b][0][h][0]);
 
-  bool multiquery = cache_K.size(2) == 1;
+  const bool multiquery = cache_K.size(2) == 1;
   auto* cache_K_base = &cache_K[b][0][multiquery ? 0 : h][0];
   auto* cache_V_base = &cache_V[b][0][multiquery ? 0 : h][0];
 
   // Load Q into registers in all wavefronts.
   // Each thread handles 4 D dimensions
-  using read_t = typename c10_to_read_t<scalar_t>::type;
   using data_t = typename c10_to_data_t<scalar_t>::type;
   using data_vec4_t = typename c10_to_data_t<scalar_t>::vec4;
   const data_vec4_t q_thread = load_v<decltype(q_), data_vec4_t>(q_, threadIdx.x);

From 9446d2335b9ae6f730c3f750905a6859c955e1b2 Mon Sep 17 00:00:00 2001
From: Max Podkorytov <4273004+tenpercent@users.noreply.github.com>
Date: Thu, 28 Sep 2023 19:02:50 -0400
Subject: [PATCH 144/641] comment about input dimension change

---
 xformers/csrc/attention/hip_fmha/attention_forward_decoder.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/xformers/csrc/attention/hip_fmha/attention_forward_decoder.cpp b/xformers/csrc/attention/hip_fmha/attention_forward_decoder.cpp
index 15dcda3f1..05859ece1 100644
--- a/xformers/csrc/attention/hip_fmha/attention_forward_decoder.cpp
+++ b/xformers/csrc/attention/hip_fmha/attention_forward_decoder.cpp
@@ -157,7 +157,7 @@ efficient_attention_forward_decoder_ck_kernel(
 
   int32_t wavefront_idx = threadIdx.y;
   // need kWavefrontsPerBlock == blockDim.y;
-  // Need D_H == 256
+  // Need D_H == 256 (NB: 128 in CUDA because of wavefront/warp sizes 64/32)
   const auto* q_ = &(XQ[b][0][h][0]);
 
   const bool multiquery = cache_K.size(2) == 1;

From 923511cc5976d909de7a1bb0dbb2eadf76541cdd Mon Sep 17 00:00:00 2001
From: Max Podkorytov <4273004+tenpercent@users.noreply.github.com>
Date: Fri, 29 Sep 2023 13:57:24 -0400
Subject: [PATCH 145/641] stick with ck vector types; add missing type
 conversions in a ccouple of places; 5->14 tests passing out of 72

---
 .../hip_fmha/attention_forward_decoder.cpp    | 40 ++++++++-----------
 1 file changed, 17 insertions(+), 23 deletions(-)

diff --git a/xformers/csrc/attention/hip_fmha/attention_forward_decoder.cpp b/xformers/csrc/attention/hip_fmha/attention_forward_decoder.cpp
index 05859ece1..56fce0788 100644
--- a/xformers/csrc/attention/hip_fmha/attention_forward_decoder.cpp
+++ b/xformers/csrc/attention/hip_fmha/attention_forward_decoder.cpp
@@ -74,23 +74,20 @@ struct c10_to_data_t<c10::BFloat16> {
 
 template<typename data4_t>
 __device__
-float4 scalar4_scale_acc(float4 acc, const data4_t& a, float b);
+ck::float4_t scalar4_scale_acc(ck::float4_t acc, data4_t a, float b);
 
 template<>
 __device__  
-float4
-scalar4_scale_acc<ck::float4_t>(float4 acc, const ck::float4_t& a, float b) {
-  acc.x += a.x * b;
-  acc.y += a.y * b;
-  acc.z += a.z * b;
-  acc.w += a.w * b;
+ck::float4_t
+scalar4_scale_acc<ck::float4_t>(ck::float4_t acc, ck::float4_t a, float b) {
+  acc = acc + a * b;
   return acc;
 }
 
 template<>
 __device__  
-float4
-scalar4_scale_acc<ck::half4_t>(float4 acc, const ck::half4_t& a, float b) {
+ck::float4_t
+scalar4_scale_acc<ck::half4_t>(ck::float4_t acc, ck::half4_t a, float b) {
   acc.x += ck::type_convert<float>(a.x) * b;
   acc.y += ck::type_convert<float>(a.y) * b;
   acc.z += ck::type_convert<float>(a.z) * b;
@@ -100,8 +97,8 @@ scalar4_scale_acc<ck::half4_t>(float4 acc, const ck::half4_t& a, float b) {
 
 template<>
 __device__  
-float4
-scalar4_scale_acc<ck::bhalf4_t>(float4 acc, const ck::bhalf4_t& a, float b) {
+ck::float4_t
+scalar4_scale_acc<ck::bhalf4_t>(ck::float4_t acc, ck::bhalf4_t a, float b) {
   acc.x += ck::type_convert<float>(a.x) * b;
   acc.y += ck::type_convert<float>(a.y) * b;
   acc.z += ck::type_convert<float>(a.z) * b;
@@ -287,7 +284,7 @@ efficient_attention_forward_decoder_ck_kernel(
   // outputs are of size float[D]
 
   float ps[kTimeUnroll];
-  float4 o_acc = make_float4(0, 0, 0, 0);
+  ck::float4_t o_acc = 0;
   for (auto tt = wavefront_idx * kTimeUnroll; tt < t_max_unroll;
        tt += kWavefrontsPerBlock * kTimeUnroll) {
 #pragma unroll kTimeUnroll
@@ -329,25 +326,22 @@ efficient_attention_forward_decoder_ck_kernel(
   // results back.
   __syncthreads();
 
-  store_v<float*, float4>(smem, wavefront_idx * kThreadsPerWavefront +
+  store_v<float*, ck::float4_t>(smem, wavefront_idx * kThreadsPerWavefront +
     threadIdx.x, o_acc);
   __syncthreads();
   // sum up partial D rows from other wavefronts
   if (wavefront_idx == 0) {
-    float4 r = make_float4(0, 0, 0, 0);
+    ck::float4_t r = 0;
     for (int32_t w = 0; w < kWavefrontsPerBlock; ++w) {
-      auto partial_r = load_v<float*, float4>(smem, w * kThreadsPerWavefront + threadIdx.x);
-      r.x += partial_r.x;
-      r.y += partial_r.y;
-      r.z += partial_r.z;
-      r.w += partial_r.w;
+      auto partial_r = load_v<float*, ck::float4_t>(smem, w * kThreadsPerWavefront + threadIdx.x);
+      r += partial_r;
     }
     // write output D row
     data_vec4_t bf_r;
-    bf_r.x = r.x;
-    bf_r.y = r.y;
-    bf_r.z = r.z;
-    bf_r.w = r.w;
+    bf_r.x = ck::type_convert<data_t>(r.x);
+    bf_r.y = ck::type_convert<data_t>(r.y);
+    bf_r.z = ck::type_convert<data_t>(r.z);
+    bf_r.w = ck::type_convert<data_t>(r.w);
     store_v<decltype(&O[b][0][h][0]), data_vec4_t>(&O[b][0][h][0], threadIdx.x, bf_r);
   }
 }

From da6457e84431d886f44f72b9511c251005d094bf Mon Sep 17 00:00:00 2001
From: Max Podkorytov <4273004+tenpercent@users.noreply.github.com>
Date: Sat, 30 Sep 2023 00:01:45 -0400
Subject: [PATCH 146/641] modify reference attn to accept
 dtype.to(dtype=dtype); make decoder test identifiers more verbose

---
 tests/test_mem_eff_attention_ck.py            | 30 ++++++++++---------
 .../hip_fmha/attention_forward_decoder.cpp    | 14 ++++-----
 2 files changed, 23 insertions(+), 21 deletions(-)

diff --git a/tests/test_mem_eff_attention_ck.py b/tests/test_mem_eff_attention_ck.py
index c4240d21c..528cd0953 100644
--- a/tests/test_mem_eff_attention_ck.py
+++ b/tests/test_mem_eff_attention_ck.py
@@ -208,15 +208,17 @@ def _generate_op_device_dtype_biasT_B_Mq_Mkv_H_K_Kv(
 )
 
 
-def ref_attention(q, k, v, attn_bias=None, drop_mask=None, p=0.0, scale=None):
+def ref_attention(q, k, v, attn_bias=None, drop_mask=None, p=0.0, scale=None, dtype=None):
     if q.ndim == 4:
         assert p == 0.0
         return ref_attention_bmhk(q, k, v, attn_bias=attn_bias)
-    q = q.float()
-    k = k.float()
-    v = v.float()
+    if dtype is None:
+        dtype = torch.float32
+    q = q.to(dtype=dtype)
+    k = k.to(dtype=dtype)
+    v = v.to(dtype=dtype)
 
-    scale = scale if scale is not None else (1 / q.shape[-1] ** 0.5)
+    scale = scale if scale is not None else (q.shape[-1] ** -0.5)
     q = q * scale
 
     attn = q @ k.transpose(-2, -1)
@@ -226,16 +228,16 @@ def ref_attention(q, k, v, attn_bias=None, drop_mask=None, p=0.0, scale=None):
             attn_bias_tensor = attn_bias.materialize(
                 (q.shape[0], 1, q.shape[1], k.shape[1]),
                 device=q.device,
-                dtype=torch.float32,
+                dtype=dtype,
             )
         else:
-            attn_bias_tensor = attn_bias
+            attn_bias_tensor = attn_bias.to(dtype=dtype)
         if attn_bias_tensor.ndim == 4:
             assert q.shape[0] == attn_bias_tensor.shape[0] * attn_bias_tensor.shape[1]
             attn_bias_tensor = attn_bias_tensor.reshape(
                 [-1, *attn_bias_tensor.shape[2:]]
             )
-        attn = attn + attn_bias_tensor.float()
+        attn = attn + attn_bias_tensor
     attn = attn.softmax(-1)
     if drop_mask is not None:
         attn = attn * (drop_mask / (1 - p))
@@ -1619,10 +1621,10 @@ def test_attn_bias_padded() -> None:
 
 
 @pytest.mark.parametrize("op", [fmha.ck_decoder.FwOp])
-@pytest.mark.parametrize("multiquery", [True, False], ids=lambda x: "mq" if x else "")
-@pytest.mark.parametrize("n_heads", [1, 16, 32])
-@pytest.mark.parametrize("padding", [32, 4096])
-@pytest.mark.parametrize("bsz", [1, 8])
+@pytest.mark.parametrize("multiquery", [True, False], ids=lambda x: "mq" if x else "nomq")
+@pytest.mark.parametrize("n_heads", [1, 16, 32], ids=lambda x: f"nh={x}")
+@pytest.mark.parametrize("padding", [32, 4096], ids=lambda x: f"pad={x}")
+@pytest.mark.parametrize("bsz", [1, 8], ids=lambda x: f"bsz={x}")
 @pytest.mark.parametrize("dtype", ["f16", "bf16", "f32"])
 def test_decoder(
     op, multiquery: bool, n_heads: int, padding: int, bsz: int, dtype: str
@@ -1658,11 +1660,11 @@ def test_decoder(
         q, k, v, attn_bias, op=op
     )
     
-    ref_output = ref_attention(q, k, v, attn_bias)
+    ref_output = ref_attention(q, k, v, attn_bias, dtype=dtype_)
 
     assert_allclose(
         decoder_output.float(),
-        ref_output,
+        ref_output.float(),
         atol=fmha.ck_decoder.FwOp.ERROR_ATOL[dtype_] * 4,
         rtol=fmha.ck_decoder.FwOp.ERROR_RTOL[dtype_],
     )
diff --git a/xformers/csrc/attention/hip_fmha/attention_forward_decoder.cpp b/xformers/csrc/attention/hip_fmha/attention_forward_decoder.cpp
index 56fce0788..fd805d371 100644
--- a/xformers/csrc/attention/hip_fmha/attention_forward_decoder.cpp
+++ b/xformers/csrc/attention/hip_fmha/attention_forward_decoder.cpp
@@ -135,7 +135,7 @@ efficient_attention_forward_decoder_ck_kernel(
     at::PackedTensorAccessor64<scalar_t, 4, at::RestrictPtrTraits> cache_V,
     at::PackedTensorAccessor32<scalar_t, 4, at::RestrictPtrTraits> O,
     at::PackedTensorAccessor32<int32_t, 1, at::RestrictPtrTraits> seq_positions,
-    float qk_scale
+    const float qk_scale
 ) {
   static_assert(4 * kThreadsPerWavefront == D_H, "");
   static_assert(kWavefrontsPerBlock <= kThreadsPerWavefront, "");
@@ -145,15 +145,15 @@ efficient_attention_forward_decoder_ck_kernel(
   extern __shared__ __align__(16) float smem[];
 
   // Each block handles a single batch and head
-  int32_t b = blockIdx.x;
-  int32_t h = blockIdx.y;
+  const int32_t b = blockIdx.x;
+  const int32_t h = blockIdx.y;
 
   // Note: this is decoding case where we attend to current and all previous
   // tokens.
-  int32_t t_max = seq_positions[b] + seq_positions_shift;
+  const int32_t t_max = seq_positions[b] + seq_positions_shift;
 
+  // blockDim.x = kThreadsPerWavefront, blockDim.y = kWavefrontsPerBlock
   int32_t wavefront_idx = threadIdx.y;
-  // need kWavefrontsPerBlock == blockDim.y;
   // Need D_H == 256 (NB: 128 in CUDA because of wavefront/warp sizes 64/32)
   const auto* q_ = &(XQ[b][0][h][0]);
 
@@ -253,7 +253,7 @@ efficient_attention_forward_decoder_ck_kernel(
   float softmax_denominator = 0.0f;
   for (int32_t t = threadIdx.x + wavefront_idx * kThreadsPerWavefront; t < t_max;
        t += kWavefrontsPerBlock * kThreadsPerWavefront) {
-    softmax_denominator += __expf(smem[t] - max_qk_acc);
+    softmax_denominator += expf(smem[t] - max_qk_acc);
   }
   softmax_denominator = wavefrontReduce<std::plus<float>>(softmax_denominator);
 
@@ -273,7 +273,7 @@ efficient_attention_forward_decoder_ck_kernel(
   // now, compute the normalization across all threads.
   for (int32_t t = threadIdx.x + wavefront_idx * kThreadsPerWavefront; t < t_max;
        t += kWavefrontsPerBlock * kThreadsPerWavefront) {
-    smem[t] = __expf(smem[t] - max_qk_acc) / softmax_denominator;
+    smem[t] = expf(smem[t] - max_qk_acc) / softmax_denominator;
   }
   __syncthreads();
 

From e8a602bb015742fd47d929a92315893a6893bdfc Mon Sep 17 00:00:00 2001
From: Max Podkorytov <4273004+tenpercent@users.noreply.github.com>
Date: Mon, 2 Oct 2023 20:43:03 -0400
Subject: [PATCH 147/641] make tests pass by setting each block contain only 1
 wavefront; tbd: figure out how to make multiple wavefronts per block work

---
 .../hip_fmha/attention_forward_decoder.cpp    | 21 +++++++++++--------
 1 file changed, 12 insertions(+), 9 deletions(-)

diff --git a/xformers/csrc/attention/hip_fmha/attention_forward_decoder.cpp b/xformers/csrc/attention/hip_fmha/attention_forward_decoder.cpp
index fd805d371..54e13a4c6 100644
--- a/xformers/csrc/attention/hip_fmha/attention_forward_decoder.cpp
+++ b/xformers/csrc/attention/hip_fmha/attention_forward_decoder.cpp
@@ -47,7 +47,7 @@ __device__ void inner_product<bhalf4_t, bhalf4_t, float>(const bhalf4_t& a, cons
 namespace {
 
 constexpr int32_t kThreadsPerWavefront = 64;
-constexpr int32_t kWavefrontsPerBlock = 16;
+constexpr int32_t kWavefrontsPerBlock = 1;
 constexpr int32_t D_H = 256;
 constexpr int32_t T_MAX = 8192;
 
@@ -80,8 +80,7 @@ template<>
 __device__  
 ck::float4_t
 scalar4_scale_acc<ck::float4_t>(ck::float4_t acc, ck::float4_t a, float b) {
-  acc = acc + a * b;
-  return acc;
+  return acc + a * b;
 }
 
 template<>
@@ -176,14 +175,16 @@ efficient_attention_forward_decoder_ck_kernel(
   constexpr int32_t kTimeUnroll = 1;
   data_vec4_t k_loads[kTimeUnroll];
 
+  const auto dtt = kWavefrontsPerBlock * kTimeUnroll;
   const int32_t t_max_unroll =
-    (t_max / (kWavefrontsPerBlock * kTimeUnroll)) * (kWavefrontsPerBlock * kTimeUnroll);
+    (t_max / dtt) * dtt;
 
   for (auto tt = wavefront_idx * kTimeUnroll; tt < t_max_unroll;
-       tt += kWavefrontsPerBlock * kTimeUnroll) {
+       tt += dtt) {
 #pragma unroll kTimeUnroll
     for (auto ttt = 0; ttt < kTimeUnroll; ++ttt) {
       int32_t t = tt + ttt;
+      // &(cache_K[b][t][0][0]);
       auto* k_ = cache_K_base + t * cache_K.stride(1);
       // scalar4<scalar_t> k_thread;
       k_loads[ttt] = load_v<decltype(k_), data_vec4_t>(k_, threadIdx.x);
@@ -269,7 +270,7 @@ efficient_attention_forward_decoder_ck_kernel(
     softmax_denominator = smem[T_MAX + threadIdx.x];
   }
   softmax_denominator = wavefrontReduce<std::plus<float>>(softmax_denominator);
-
+  
   // now, compute the normalization across all threads.
   for (int32_t t = threadIdx.x + wavefront_idx * kThreadsPerWavefront; t < t_max;
        t += kWavefrontsPerBlock * kThreadsPerWavefront) {
@@ -286,7 +287,7 @@ efficient_attention_forward_decoder_ck_kernel(
   float ps[kTimeUnroll];
   ck::float4_t o_acc = 0;
   for (auto tt = wavefront_idx * kTimeUnroll; tt < t_max_unroll;
-       tt += kWavefrontsPerBlock * kTimeUnroll) {
+       tt += dtt) {
 #pragma unroll kTimeUnroll
     for (auto ttt = 0; ttt < kTimeUnroll; ++ttt) {
       int32_t t = tt + ttt;
@@ -326,8 +327,9 @@ efficient_attention_forward_decoder_ck_kernel(
   // results back.
   __syncthreads();
 
-  store_v<float*, ck::float4_t>(smem, wavefront_idx * kThreadsPerWavefront +
+  store_v<float*, ck::float4_t>(&smem[0], wavefront_idx * kThreadsPerWavefront +
     threadIdx.x, o_acc);
+
   __syncthreads();
   // sum up partial D rows from other wavefronts
   if (wavefront_idx == 0) {
@@ -342,7 +344,8 @@ efficient_attention_forward_decoder_ck_kernel(
     bf_r.y = ck::type_convert<data_t>(r.y);
     bf_r.z = ck::type_convert<data_t>(r.z);
     bf_r.w = ck::type_convert<data_t>(r.w);
-    store_v<decltype(&O[b][0][h][0]), data_vec4_t>(&O[b][0][h][0], threadIdx.x, bf_r);
+    auto* o_ = &O[b][0][h][0];
+    store_v<decltype(o_), data_vec4_t>(o_, threadIdx.x, bf_r);
   }
 }
 

From 19a5bf768c022971a9aafffae95549c7810809d9 Mon Sep 17 00:00:00 2001
From: Max Podkorytov <4273004+tenpercent@users.noreply.github.com>
Date: Wed, 4 Oct 2023 16:16:31 -0400
Subject: [PATCH 148/641] modify test decoder to match the upstream test cases

---
 tests/test_mem_eff_attention_ck.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/tests/test_mem_eff_attention_ck.py b/tests/test_mem_eff_attention_ck.py
index 528cd0953..71aed5445 100644
--- a/tests/test_mem_eff_attention_ck.py
+++ b/tests/test_mem_eff_attention_ck.py
@@ -1622,9 +1622,8 @@ def test_attn_bias_padded() -> None:
 
 @pytest.mark.parametrize("op", [fmha.ck_decoder.FwOp])
 @pytest.mark.parametrize("multiquery", [True, False], ids=lambda x: "mq" if x else "nomq")
-@pytest.mark.parametrize("n_heads", [1, 16, 32], ids=lambda x: f"nh={x}")
+@pytest.mark.parametrize("bsz,n_heads", [(1, 1), (1, 16), (1, 32), (8, 1), (4, 8)], ids=lambda x: f"bsz-nh={x}")
 @pytest.mark.parametrize("padding", [32, 4096], ids=lambda x: f"pad={x}")
-@pytest.mark.parametrize("bsz", [1, 8], ids=lambda x: f"bsz={x}")
 @pytest.mark.parametrize("dtype", ["f16", "bf16", "f32"])
 def test_decoder(
     op, multiquery: bool, n_heads: int, padding: int, bsz: int, dtype: str

From 49a305325b846e5647c665fba7ac757493305fef Mon Sep 17 00:00:00 2001
From: Max Podkorytov <4273004+tenpercent@users.noreply.github.com>
Date: Fri, 6 Oct 2023 19:44:26 -0400
Subject: [PATCH 149/641] add a cpp helper for debugging

---
 .../hip_fmha/attention_forward_decoder.cpp    | 159 ++++++++++++++----
 1 file changed, 122 insertions(+), 37 deletions(-)

diff --git a/xformers/csrc/attention/hip_fmha/attention_forward_decoder.cpp b/xformers/csrc/attention/hip_fmha/attention_forward_decoder.cpp
index 54e13a4c6..bf9457459 100644
--- a/xformers/csrc/attention/hip_fmha/attention_forward_decoder.cpp
+++ b/xformers/csrc/attention/hip_fmha/attention_forward_decoder.cpp
@@ -56,7 +56,7 @@ struct c10_to_data_t;
 
 template<>
 struct c10_to_data_t<float> {
-    using type = float_t;
+    using type = float;
     using vec4 = ck::float4_t;
 };
 
@@ -151,20 +151,25 @@ efficient_attention_forward_decoder_ck_kernel(
   // tokens.
   const int32_t t_max = seq_positions[b] + seq_positions_shift;
 
-  // blockDim.x = kThreadsPerWavefront, blockDim.y = kWavefrontsPerBlock
-  int32_t wavefront_idx = threadIdx.y;
+  const int32_t lane_idx = threadIdx.x;
+  const int32_t wavefront_idx = threadIdx.y;
+  const int32_t threads_per_wavefront = blockDim.x;
+  const int32_t wavefronts_per_block = blockDim.y;
+  const int32_t threads_per_block = threads_per_wavefront * wavefronts_per_block;
+  const int32_t thread_linear_idx = lane_idx + wavefront_idx * threads_per_wavefront;
+
   // Need D_H == 256 (NB: 128 in CUDA because of wavefront/warp sizes 64/32)
   const auto* q_ = &(XQ[b][0][h][0]);
 
   const bool multiquery = cache_K.size(2) == 1;
-  auto* cache_K_base = &cache_K[b][0][multiquery ? 0 : h][0];
-  auto* cache_V_base = &cache_V[b][0][multiquery ? 0 : h][0];
+  const auto* cache_K_base = &cache_K[b][0][multiquery ? 0 : h][0];
+  const auto* cache_V_base = &cache_V[b][0][multiquery ? 0 : h][0];
 
   // Load Q into registers in all wavefronts.
   // Each thread handles 4 D dimensions
   using data_t = typename c10_to_data_t<scalar_t>::type;
   using data_vec4_t = typename c10_to_data_t<scalar_t>::vec4;
-  const data_vec4_t q_thread = load_v<decltype(q_), data_vec4_t>(q_, threadIdx.x);
+  const data_vec4_t q_thread = load_v<decltype(q_), data_vec4_t>(q_, lane_idx);
   // Each block computes different B value
   float max_qk_acc = std::numeric_limits<float>::lowest();
 
@@ -175,19 +180,18 @@ efficient_attention_forward_decoder_ck_kernel(
   constexpr int32_t kTimeUnroll = 1;
   data_vec4_t k_loads[kTimeUnroll];
 
-  const auto dtt = kWavefrontsPerBlock * kTimeUnroll;
+  const auto dtt = wavefronts_per_block * kTimeUnroll;
   const int32_t t_max_unroll =
     (t_max / dtt) * dtt;
 
-  for (auto tt = wavefront_idx * kTimeUnroll; tt < t_max_unroll;
-       tt += dtt) {
+  for (auto tt = wavefront_idx; tt < t_max_unroll; tt += dtt) {
 #pragma unroll kTimeUnroll
     for (auto ttt = 0; ttt < kTimeUnroll; ++ttt) {
       int32_t t = tt + ttt;
       // &(cache_K[b][t][0][0]);
       auto* k_ = cache_K_base + t * cache_K.stride(1);
       // scalar4<scalar_t> k_thread;
-      k_loads[ttt] = load_v<decltype(k_), data_vec4_t>(k_, threadIdx.x);
+      k_loads[ttt] = load_v<decltype(k_), data_vec4_t>(k_, lane_idx);
     }
 #pragma unroll kTimeUnroll
     for (auto ttt = 0; ttt < kTimeUnroll; ++ttt) {
@@ -203,7 +207,7 @@ efficient_attention_forward_decoder_ck_kernel(
       max_qk_acc = max(qk_acc, max_qk_acc);
 
       // write accumulated sums to smem.
-      if (threadIdx.x == 0) {
+      if (lane_idx == 0) {
         smem[t] = qk_acc;
       }
     }
@@ -211,14 +215,14 @@ efficient_attention_forward_decoder_ck_kernel(
 
   constexpr int32_t kTimeUnroll1 = 1;
   for (auto tt = t_max_unroll + wavefront_idx; tt < t_max;
-       tt += kWavefrontsPerBlock * kTimeUnroll1) {
+       tt += wavefronts_per_block * kTimeUnroll1) {
 #pragma unroll kTimeUnroll1
     for (auto ttt = 0; ttt < kTimeUnroll1; ++ttt) {
       int32_t t = tt + ttt;
       // &(cache_K[b][t][0][0]);
       auto* k_ = cache_K_base + t * cache_K.stride(1);
       // scalar4<scalar_t> k_thread;
-      k_loads[ttt] = load_v<decltype(k_), data_vec4_t>(k_, threadIdx.x);
+      k_loads[ttt] = load_v<decltype(k_), data_vec4_t>(k_, lane_idx);
     }
 #pragma unroll kTimeUnroll1
     for (auto ttt = 0; ttt < kTimeUnroll1; ++ttt) {
@@ -233,7 +237,7 @@ efficient_attention_forward_decoder_ck_kernel(
       max_qk_acc = max(qk_acc, max_qk_acc);
 
       // write accumulated sums to smem.
-      if (threadIdx.x == 0) {
+      if (lane_idx == 0) {
         smem[t] = qk_acc;
       }
     }
@@ -241,39 +245,37 @@ efficient_attention_forward_decoder_ck_kernel(
 
   // Use shared reduction to compute max and compute softmax on shared memory.
   // write max acc
-  if (threadIdx.x == 0) {
+  if (lane_idx == 0) {
     smem[T_MAX + wavefront_idx] = max_qk_acc;
   }
   __syncthreads();
-  if (threadIdx.x < kWavefrontsPerBlock) {
-    max_qk_acc = max(max_qk_acc, smem[T_MAX + threadIdx.x]);
+  if (lane_idx < wavefronts_per_block) {
+    max_qk_acc = max(max_qk_acc, smem[T_MAX + lane_idx]);
   }
   // shared across all threads in block
   max_qk_acc = wavefrontReduce<std::greater<float>>(max_qk_acc);
   // each wavefront computes partial sum of exp.
   float softmax_denominator = 0.0f;
-  for (int32_t t = threadIdx.x + wavefront_idx * kThreadsPerWavefront; t < t_max;
-       t += kWavefrontsPerBlock * kThreadsPerWavefront) {
+  for (int32_t t = thread_linear_idx; t < t_max; t += threads_per_block) {
     softmax_denominator += expf(smem[t] - max_qk_acc);
   }
   softmax_denominator = wavefrontReduce<std::plus<float>>(softmax_denominator);
 
   __syncthreads();
-  if (threadIdx.x == 0) {
+  if (lane_idx == 0) {
     smem[T_MAX + wavefront_idx] = softmax_denominator;
   }
   __syncthreads();
   
   // now, compute sum of exp(x - max(x)) over all intermediate results.
   softmax_denominator = 0.0;
-  if (threadIdx.x < kWavefrontsPerBlock) {
-    softmax_denominator = smem[T_MAX + threadIdx.x];
+  if (lane_idx < wavefronts_per_block) {
+    softmax_denominator = smem[T_MAX + lane_idx];
   }
   softmax_denominator = wavefrontReduce<std::plus<float>>(softmax_denominator);
   
   // now, compute the normalization across all threads.
-  for (int32_t t = threadIdx.x + wavefront_idx * kThreadsPerWavefront; t < t_max;
-       t += kWavefrontsPerBlock * kThreadsPerWavefront) {
+  for (int32_t t = thread_linear_idx; t < t_max; t += threads_per_block) {
     smem[t] = expf(smem[t] - max_qk_acc) / softmax_denominator;
   }
   __syncthreads();
@@ -286,15 +288,14 @@ efficient_attention_forward_decoder_ck_kernel(
 
   float ps[kTimeUnroll];
   ck::float4_t o_acc = 0;
-  for (auto tt = wavefront_idx * kTimeUnroll; tt < t_max_unroll;
-       tt += dtt) {
+  for (auto tt = wavefront_idx; tt < t_max_unroll; tt += dtt) {
 #pragma unroll kTimeUnroll
     for (auto ttt = 0; ttt < kTimeUnroll; ++ttt) {
       int32_t t = tt + ttt;
       // &(cache_V[b][t][0][0]);
       auto* v_ = cache_V_base + t * cache_V.stride(1);
       //   scalar4<scalar_t> v_thread;
-      k_loads[ttt] = load_v<decltype(v_), data_vec4_t>(v_, threadIdx.x);
+      k_loads[ttt] = load_v<decltype(v_), data_vec4_t>(v_, lane_idx);
 
       ps[ttt] = smem[t];
     }
@@ -305,15 +306,14 @@ efficient_attention_forward_decoder_ck_kernel(
     }
   }
 
-  for (auto tt = t_max_unroll + wavefront_idx; tt < t_max;
-       tt += kWavefrontsPerBlock * kTimeUnroll1) {
+  for (auto tt = t_max_unroll + wavefront_idx; tt < t_max; tt += wavefronts_per_block * kTimeUnroll1) {
 #pragma unroll kTimeUnroll1
     for (auto ttt = 0; ttt < kTimeUnroll1; ++ttt) {
       int32_t t = tt + ttt;
       // &(cache_V[b][t][0][0]);
       auto* v_ = cache_V_base + t * cache_V.stride(1);
       //   scalar4<scalar_t> v_thread;
-      k_loads[ttt] = load_v<decltype(v_), data_vec4_t>(v_, threadIdx.x);
+      k_loads[ttt] = load_v<decltype(v_), data_vec4_t>(v_, lane_idx);
 
       ps[ttt] = smem[t];
     }
@@ -326,16 +326,16 @@ efficient_attention_forward_decoder_ck_kernel(
   // now, each thread has partial sums. Write to smem and get accumulated
   // results back.
   __syncthreads();
-
-  store_v<float*, ck::float4_t>(&smem[0], wavefront_idx * kThreadsPerWavefront +
-    threadIdx.x, o_acc);
+  
+  // NB: needs sizeof(smem) >= 4 * (sizeof(float)==4) * threadsPerBlock 
+  store_v<float*, ck::float4_t>(&smem[0], thread_linear_idx, o_acc);
 
   __syncthreads();
   // sum up partial D rows from other wavefronts
   if (wavefront_idx == 0) {
     ck::float4_t r = 0;
-    for (int32_t w = 0; w < kWavefrontsPerBlock; ++w) {
-      auto partial_r = load_v<float*, ck::float4_t>(smem, w * kThreadsPerWavefront + threadIdx.x);
+    for (int32_t w = 0; w < wavefronts_per_block; ++w) {
+      auto partial_r = load_v<float*, ck::float4_t>(smem, w * threads_per_wavefront + lane_idx);
       r += partial_r;
     }
     // write output D row
@@ -345,7 +345,7 @@ efficient_attention_forward_decoder_ck_kernel(
     bf_r.z = ck::type_convert<data_t>(r.z);
     bf_r.w = ck::type_convert<data_t>(r.w);
     auto* o_ = &O[b][0][h][0];
-    store_v<decltype(o_), data_vec4_t>(o_, threadIdx.x, bf_r);
+    store_v<decltype(o_), data_vec4_t>(o_, lane_idx, bf_r);
   }
 }
 
@@ -422,4 +422,89 @@ TORCH_LIBRARY_IMPL(xformers, CUDA, m) {
   m.impl(
       TORCH_SELECTIVE_NAME("xformers::efficient_attention_forward_decoder_ck"),
       TORCH_FN(efficient_attention_forward_decoder_ck));
-}
\ No newline at end of file
+}
+
+#ifdef ATTN_FWD_DECODER_MAIN
+
+#include <torch/torch.h>
+
+/*
+
+(1) hipify
+ > pip install -e /xformers
+(2) compile
+ > /opt/rocm/bin/hipcc \
+-I/xformers/xformers/csrc \
+-I/xformers/xformers/csrc/attention/hip_fmha \
+-I/xformers/third_party/composable_kernel/include \
+-I/xformers/third_party/composable_kernel/include/ck \
+-I/xformers/third_party/composable_kernel/include/ck/tensor_operation/gpu/device \
+-I/xformers/third_party/composable_kernel/include/ck/tensor_operation/gpu/device/impl \
+-I/xformers/third_party/composable_kernel/include/ck/tensor_operation/gpu/element \
+-I/opt/conda/envs/py_3.8/lib/python3.8/site-packages/torch/include \
+-I/opt/conda/envs/py_3.8/lib/python3.8/site-packages/torch/include/torch/csrc/api/include \
+-I/opt/conda/envs/py_3.8/lib/python3.8/site-packages/torch/include/TH \
+-I/opt/conda/envs/py_3.8/lib/python3.8/site-packages/torch/include/THC \
+-I/opt/conda/envs/py_3.8/lib/python3.8/site-packages/torch/include/THH \
+-I/opt/rocm/include \
+-I/opt/conda/envs/py_3.8/include/python3.8 \
+-L/opt/conda/envs/py_3.8/lib/python3.8/site-packages/torch/lib \
+-L/opt/conda/envs/py_3.8/lib \
+-L/opt/rocm/lib \
+-L/opt/rocm/hip/lib \
+-fPIC \
+-D__HIP_PLATFORM_HCC__=1 \
+-DATTN_FWD_DECODER_MAIN \
+-DUSE_ROCM=1 \
+-DCUDA_HAS_FP16=1 \
+-D__HIP_NO_HALF_OPERATORS__=1 \
+-D__HIP_NO_HALF_CONVERSIONS__=1 \
+-O3 \
+-std=c++17 \
+--offload-arch=gfx90a \
+-U__CUDA_NO_HALF_OPERATORS__ \
+-U__CUDA_NO_HALF_CONVERSIONS__ \
+-DBUILD_PYTHON_PACKAGE \
+-DTORCH_API_INCLUDE_EXTENSION_H \
+'-DPYBIND11_COMPILER_TYPE="_gcc"' \
+'-DPYBIND11_STDLIB="_libstdcpp"' \
+'-DPYBIND11_BUILD_ABI="_cxxabi1013"' \
+-DTORCH_EXTENSION_NAME=_C \
+-D_GLIBCXX_USE_CXX11_ABI=1 \
+-fno-gpu-rdc \
+/xformers/xformers/csrc/attention/hip_fmha/attention_forward_decoder.hip \
+-lc10_hip \
+-ltorch_hip \
+-lc10 \
+-ltorch \
+-ltorch_cpu \
+-ltorch_python \
+-lpython3.8 \
+-lamdhip64 \
+-o a.out
+
+(3) run
+ > LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/conda/envs/py_3.8/lib/python3.8/site-packages/torch/lib ./a.out
+*/
+
+int main(int argc, char** argv) {
+  const int32_t D = 256;
+  const int32_t B = 4;
+  const int32_t H = 8;
+  auto options = torch::TensorOptions()
+    .dtype(torch::kFloat32)
+    .layout(torch::kStrided)
+    .device(torch::kCUDA, 1)
+    .requires_grad(false);
+  auto int_options = options.dtype(torch::kInt);
+  auto XQ = at::randn({B, 1, H, D}, options);
+  auto K = at::randn({B, T_MAX, H, D}, options);
+  auto V = at::randn({B, T_MAX, H, D}, options);
+  auto seq = at::randint(1, 32, {B}, int_options);
+  double qk_scale = sqrt(D);
+
+  auto result = efficient_attention_forward_decoder_ck(XQ, K, V, seq, qk_scale);
+  return 0;
+}
+
+#endif // MAIN
\ No newline at end of file

From 68d93d79dc422c10fe50c005c6871cee66391259 Mon Sep 17 00:00:00 2001
From: Max Podkorytov <4273004+tenpercent@users.noreply.github.com>
Date: Mon, 9 Oct 2023 13:22:47 -0400
Subject: [PATCH 150/641] add cpp repro to debug numerical mismatch

---
 .../hip_fmha/attention_forward_decoder.cpp    | 28 +++++++++++++++----
 1 file changed, 22 insertions(+), 6 deletions(-)

diff --git a/xformers/csrc/attention/hip_fmha/attention_forward_decoder.cpp b/xformers/csrc/attention/hip_fmha/attention_forward_decoder.cpp
index bf9457459..3e79f0d3d 100644
--- a/xformers/csrc/attention/hip_fmha/attention_forward_decoder.cpp
+++ b/xformers/csrc/attention/hip_fmha/attention_forward_decoder.cpp
@@ -360,8 +360,9 @@ efficient_attention_forward_decoder_ck_kernel(
       NAME,                                                         \
       AT_DISPATCH_CASE_3(SCALARTYPE1, SCALARTYPE2, SCALARTYPE3, __VA_ARGS__))
 
+template<int32_t ThreadsPerWavefront, int32_t WavefrontsPerBlock>
 at::Tensor
-efficient_attention_forward_decoder_ck(
+efficient_attention_forward_decoder_ck_impl(
     const at::Tensor& XQ, // [B, 1, H, D]
     const at::Tensor& cache_K, // [B, T_MAX, H or 1, D]
     const at::Tensor& cache_V, // [B, T_MAX, H or 1, D]
@@ -382,10 +383,10 @@ efficient_attention_forward_decoder_ck(
   auto B = XQ.size(0);
   auto H = XQ.size(2);
   dim3 blocks(B, H);
-  dim3 threads(kThreadsPerWavefront, kWavefrontsPerBlock);
+  dim3 threads(ThreadsPerWavefront, WavefrontsPerBlock);
 
-  int32_t smem_softmax = T_MAX * sizeof(float) + kWavefrontsPerBlock * sizeof(float);
-  int32_t smem_output = D_H * sizeof(float) * kWavefrontsPerBlock;
+  int32_t smem_softmax = T_MAX * sizeof(float) + threads.y * sizeof(float);
+  int32_t smem_output = D_H * sizeof(float) * threads.y;
   int32_t smem = max(smem_softmax, smem_output);
   auto stream = at::cuda::getCurrentHIPStream().stream();
 
@@ -416,6 +417,17 @@ efficient_attention_forward_decoder_ck(
 #undef AT_DISPATCH_CASE_3
 #undef AT_DISPATCH_SWITCH_3
 
+at::Tensor
+efficient_attention_forward_decoder_ck(
+    const at::Tensor& XQ, // [B, 1, H, D]
+    const at::Tensor& cache_K, // [B, T_MAX, H or 1, D]
+    const at::Tensor& cache_V, // [B, T_MAX, H or 1, D]
+    const at::Tensor& seq_positions, // [B]
+    double qk_scale) {
+  return efficient_attention_forward_decoder_ck_impl<kThreadsPerWavefront, kWavefrontsPerBlock> (
+    XQ, cache_K, cache_V, seq_positions, qk_scale
+  );
+}
 } // namespace
 
 TORCH_LIBRARY_IMPL(xformers, CUDA, m) {
@@ -501,9 +513,13 @@ int main(int argc, char** argv) {
   auto K = at::randn({B, T_MAX, H, D}, options);
   auto V = at::randn({B, T_MAX, H, D}, options);
   auto seq = at::randint(1, 32, {B}, int_options);
-  double qk_scale = sqrt(D);
+  double qk_scale = 1. / sqrt(D);
 
-  auto result = efficient_attention_forward_decoder_ck(XQ, K, V, seq, qk_scale);
+  auto result = efficient_attention_forward_decoder_ck_impl<64, 1>(XQ, K, V, seq, qk_scale);
+  auto gold_result = efficient_attention_forward_decoder_ck_impl<64, 2>(XQ, K, V, seq, qk_scale);
+  auto mask = at::isclose(result, gold_result, 1e-2, 1e-2, false);
+  auto percent_match = at::sum(mask.to(torch::kFloat32)) / mask.numel();
+  printf("Mismatched elements percentage: %.2f\n", 1. - percent_match.item<float>());
   return 0;
 }
 

From 04ab7d0f254b4bfb529ca4dc061fe4a10066cb24 Mon Sep 17 00:00:00 2001
From: Max Podkorytov <4273004+tenpercent@users.noreply.github.com>
Date: Mon, 9 Oct 2023 14:26:47 -0400
Subject: [PATCH 151/641] clean up kernel invocation; mark const indices const

---
 .../hip_fmha/attention_forward_decoder.cpp    | 47 ++++++++++++-------
 1 file changed, 30 insertions(+), 17 deletions(-)

diff --git a/xformers/csrc/attention/hip_fmha/attention_forward_decoder.cpp b/xformers/csrc/attention/hip_fmha/attention_forward_decoder.cpp
index 3e79f0d3d..19b2f5162 100644
--- a/xformers/csrc/attention/hip_fmha/attention_forward_decoder.cpp
+++ b/xformers/csrc/attention/hip_fmha/attention_forward_decoder.cpp
@@ -187,7 +187,7 @@ efficient_attention_forward_decoder_ck_kernel(
   for (auto tt = wavefront_idx; tt < t_max_unroll; tt += dtt) {
 #pragma unroll kTimeUnroll
     for (auto ttt = 0; ttt < kTimeUnroll; ++ttt) {
-      int32_t t = tt + ttt;
+      const int32_t t = tt + ttt;
       // &(cache_K[b][t][0][0]);
       auto* k_ = cache_K_base + t * cache_K.stride(1);
       // scalar4<scalar_t> k_thread;
@@ -196,7 +196,7 @@ efficient_attention_forward_decoder_ck_kernel(
 #pragma unroll kTimeUnroll
     for (auto ttt = 0; ttt < kTimeUnroll; ++ttt) {
       float qk_acc = 0;
-      int32_t t = tt + ttt;
+      const int32_t t = tt + ttt;
 
       ck::inner_product<data_vec4_t, data_vec4_t, float>(q_thread, 
                                                          k_loads[ttt], 
@@ -218,7 +218,7 @@ efficient_attention_forward_decoder_ck_kernel(
        tt += wavefronts_per_block * kTimeUnroll1) {
 #pragma unroll kTimeUnroll1
     for (auto ttt = 0; ttt < kTimeUnroll1; ++ttt) {
-      int32_t t = tt + ttt;
+      const int32_t t = tt + ttt;
       // &(cache_K[b][t][0][0]);
       auto* k_ = cache_K_base + t * cache_K.stride(1);
       // scalar4<scalar_t> k_thread;
@@ -227,7 +227,7 @@ efficient_attention_forward_decoder_ck_kernel(
 #pragma unroll kTimeUnroll1
     for (auto ttt = 0; ttt < kTimeUnroll1; ++ttt) {
       float qk_acc = 0;
-      int32_t t = tt + ttt;
+      const int32_t t = tt + ttt;
       ck::inner_product<data_vec4_t, data_vec4_t, float>(q_thread, 
                                                          k_loads[ttt], 
                                                          qk_acc);
@@ -291,7 +291,7 @@ efficient_attention_forward_decoder_ck_kernel(
   for (auto tt = wavefront_idx; tt < t_max_unroll; tt += dtt) {
 #pragma unroll kTimeUnroll
     for (auto ttt = 0; ttt < kTimeUnroll; ++ttt) {
-      int32_t t = tt + ttt;
+      const int32_t t = tt + ttt;
       // &(cache_V[b][t][0][0]);
       auto* v_ = cache_V_base + t * cache_V.stride(1);
       //   scalar4<scalar_t> v_thread;
@@ -309,7 +309,7 @@ efficient_attention_forward_decoder_ck_kernel(
   for (auto tt = t_max_unroll + wavefront_idx; tt < t_max; tt += wavefronts_per_block * kTimeUnroll1) {
 #pragma unroll kTimeUnroll1
     for (auto ttt = 0; ttt < kTimeUnroll1; ++ttt) {
-      int32_t t = tt + ttt;
+      const int32_t t = tt + ttt;
       // &(cache_V[b][t][0][0]);
       auto* v_ = cache_V_base + t * cache_V.stride(1);
       //   scalar4<scalar_t> v_thread;
@@ -349,6 +349,24 @@ efficient_attention_forward_decoder_ck_kernel(
   }
 }
 
+void update_max_dynamic_shared_memory_size_bytes(void* kernel_func, int32_t new_value) {
+  hipFuncAttributes attributes;
+  C10_CUDA_CHECK(hipFuncGetAttributes(
+      &attributes, 
+      kernel_func));
+
+  const auto default_value = attributes.maxDynamicSharedSizeBytes;
+
+  // printf("Default smem size: %d\n", default_value);
+
+  if (new_value > default_value) {
+    C10_CUDA_CHECK(hipFuncSetAttribute(
+        kernel_func,
+        hipFuncAttributeMaxDynamicSharedMemorySize,
+        new_value));
+  }
+}
+
 #define AT_DISPATCH_CASE_3(SCALARTYPE1, SCALARTYPE2, SCALARTYPE3, ...) \
   AT_DISPATCH_CASE(SCALARTYPE1, __VA_ARGS__) \
   AT_DISPATCH_CASE(SCALARTYPE2, __VA_ARGS__) \
@@ -386,21 +404,16 @@ efficient_attention_forward_decoder_ck_impl(
   dim3 threads(ThreadsPerWavefront, WavefrontsPerBlock);
 
   int32_t smem_softmax = T_MAX * sizeof(float) + threads.y * sizeof(float);
-  int32_t smem_output = D_H * sizeof(float) * threads.y;
-  int32_t smem = max(smem_softmax, smem_output);
+  int32_t smem_output = D_H * sizeof(float) * threads.y; // 4 * threadsPerBlock * sizeof(float) == sizeof(O[b][0][h][:])
+  int32_t smem_size = max(smem_softmax, smem_output);
   auto stream = at::cuda::getCurrentHIPStream().stream();
 
   AT_DISPATCH_SWITCH_3(at::ScalarType::Half, at::ScalarType::BFloat16, at::ScalarType::Float, 
     XQ.scalar_type(), "efficient_attention_forward_decoder_ck", [&] {
       auto* kernel = &efficient_attention_forward_decoder_ck_kernel<scalar_t>;
-      if (smem > 48 * 1024) {
-        C10_CUDA_CHECK(hipFuncSetAttribute(
-            reinterpret_cast<void*&>(kernel),
-            hipFuncAttributeMaxDynamicSharedMemorySize,
-            smem));
-      }
+      update_max_dynamic_shared_memory_size_bytes(reinterpret_cast<void*&>(kernel), smem_size);
       kernel
-          <<<blocks, threads, smem, stream>>>(
+          <<<blocks, threads, smem_size, stream>>>(
               XQ.packed_accessor32<scalar_t, 4, at::RestrictPtrTraits>(),
               cache_K.packed_accessor64<scalar_t, 4, at::RestrictPtrTraits>(),
               cache_V.packed_accessor64<scalar_t, 4, at::RestrictPtrTraits>(),
@@ -510,8 +523,8 @@ int main(int argc, char** argv) {
     .requires_grad(false);
   auto int_options = options.dtype(torch::kInt);
   auto XQ = at::randn({B, 1, H, D}, options);
-  auto K = at::randn({B, T_MAX, H, D}, options);
-  auto V = at::randn({B, T_MAX, H, D}, options);
+  auto K = at::randn({B, T_MAX / 2, H, D}, options);
+  auto V = at::randn({B, T_MAX / 2, H, D}, options);
   auto seq = at::randint(1, 32, {B}, int_options);
   double qk_scale = 1. / sqrt(D);
 

From 7674da23927d1215578e2ab2d01a53a6332f029f Mon Sep 17 00:00:00 2001
From: Max Podkorytov <4273004+tenpercent@users.noreply.github.com>
Date: Mon, 9 Oct 2023 15:01:22 -0400
Subject: [PATCH 152/641] fix a reduction bug

---
 .../csrc/attention/hip_fmha/attention_forward_decoder.cpp  | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/xformers/csrc/attention/hip_fmha/attention_forward_decoder.cpp b/xformers/csrc/attention/hip_fmha/attention_forward_decoder.cpp
index 19b2f5162..46437f72b 100644
--- a/xformers/csrc/attention/hip_fmha/attention_forward_decoder.cpp
+++ b/xformers/csrc/attention/hip_fmha/attention_forward_decoder.cpp
@@ -111,7 +111,7 @@ __device__ __forceinline__ wavefrontReduce(float val) {
   auto reducer = F();  
 #pragma unroll
   for (int32_t mask = kThreadsPerWavefront >> 1; mask > 0; mask >>= 1) {
-    val = reducer(val, __shfl_xor(val, mask, kThreadsPerWavefront));
+    val = reducer(__shfl_xor(val, mask, kThreadsPerWavefront), val);
   }
   return val;
 }
@@ -254,6 +254,7 @@ efficient_attention_forward_decoder_ck_kernel(
   }
   // shared across all threads in block
   max_qk_acc = wavefrontReduce<std::greater<float>>(max_qk_acc);
+  
   // each wavefront computes partial sum of exp.
   float softmax_denominator = 0.0f;
   for (int32_t t = thread_linear_idx; t < t_max; t += threads_per_block) {
@@ -517,7 +518,7 @@ int main(int argc, char** argv) {
   const int32_t B = 4;
   const int32_t H = 8;
   auto options = torch::TensorOptions()
-    .dtype(torch::kFloat32)
+    .dtype(torch::kFloat16)
     .layout(torch::kStrided)
     .device(torch::kCUDA, 1)
     .requires_grad(false);
@@ -529,7 +530,7 @@ int main(int argc, char** argv) {
   double qk_scale = 1. / sqrt(D);
 
   auto result = efficient_attention_forward_decoder_ck_impl<64, 1>(XQ, K, V, seq, qk_scale);
-  auto gold_result = efficient_attention_forward_decoder_ck_impl<64, 2>(XQ, K, V, seq, qk_scale);
+  auto gold_result = efficient_attention_forward_decoder_ck_impl<64, 16>(XQ, K, V, seq, qk_scale);
   auto mask = at::isclose(result, gold_result, 1e-2, 1e-2, false);
   auto percent_match = at::sum(mask.to(torch::kFloat32)) / mask.numel();
   printf("Mismatched elements percentage: %.2f\n", 1. - percent_match.item<float>());

From 5b89fa1e17010ee84c050e36f666f64241779f19 Mon Sep 17 00:00:00 2001
From: Max Podkorytov <4273004+tenpercent@users.noreply.github.com>
Date: Mon, 9 Oct 2023 16:53:59 -0400
Subject: [PATCH 153/641] fix another bug in reducer; the tests are now passing

---
 .../hip_fmha/attention_forward_decoder.cpp    | 37 +++++++++----------
 1 file changed, 18 insertions(+), 19 deletions(-)

diff --git a/xformers/csrc/attention/hip_fmha/attention_forward_decoder.cpp b/xformers/csrc/attention/hip_fmha/attention_forward_decoder.cpp
index 46437f72b..8cca9521a 100644
--- a/xformers/csrc/attention/hip_fmha/attention_forward_decoder.cpp
+++ b/xformers/csrc/attention/hip_fmha/attention_forward_decoder.cpp
@@ -47,7 +47,7 @@ __device__ void inner_product<bhalf4_t, bhalf4_t, float>(const bhalf4_t& a, cons
 namespace {
 
 constexpr int32_t kThreadsPerWavefront = 64;
-constexpr int32_t kWavefrontsPerBlock = 1;
+constexpr int32_t kWavefrontsPerBlock = 8;
 constexpr int32_t D_H = 256;
 constexpr int32_t T_MAX = 8192;
 
@@ -107,11 +107,10 @@ scalar4_scale_acc<ck::bhalf4_t>(ck::float4_t acc, ck::bhalf4_t a, float b) {
 
 template <typename F>
 float
-__device__ __forceinline__ wavefrontReduce(float val) {
-  auto reducer = F();  
+__device__ __forceinline__ wavefrontReduce(float val, F f) {
 #pragma unroll
   for (int32_t mask = kThreadsPerWavefront >> 1; mask > 0; mask >>= 1) {
-    val = reducer(__shfl_xor(val, mask, kThreadsPerWavefront), val);
+    val = f(__shfl_xor(val, mask, kThreadsPerWavefront), val);
   }
   return val;
 }
@@ -203,7 +202,7 @@ efficient_attention_forward_decoder_ck_kernel(
                                                          qk_acc);
       qk_acc *= qk_scale;
 
-      qk_acc = wavefrontReduce<std::plus<float>>(qk_acc);
+      qk_acc = wavefrontReduce(qk_acc, [] (float a, float b) { return a + b; });
       max_qk_acc = max(qk_acc, max_qk_acc);
 
       // write accumulated sums to smem.
@@ -233,7 +232,7 @@ efficient_attention_forward_decoder_ck_kernel(
                                                          qk_acc);
       qk_acc *= qk_scale;
 
-      qk_acc = wavefrontReduce<std::plus<float>>(qk_acc);
+      qk_acc = wavefrontReduce(qk_acc, [] (float a, float b) { return a + b; });
       max_qk_acc = max(qk_acc, max_qk_acc);
 
       // write accumulated sums to smem.
@@ -253,14 +252,14 @@ efficient_attention_forward_decoder_ck_kernel(
     max_qk_acc = max(max_qk_acc, smem[T_MAX + lane_idx]);
   }
   // shared across all threads in block
-  max_qk_acc = wavefrontReduce<std::greater<float>>(max_qk_acc);
-  
+  max_qk_acc = wavefrontReduce(max_qk_acc, [] (float a, float b) { return a > b ? a : b; });
+
   // each wavefront computes partial sum of exp.
   float softmax_denominator = 0.0f;
   for (int32_t t = thread_linear_idx; t < t_max; t += threads_per_block) {
     softmax_denominator += expf(smem[t] - max_qk_acc);
   }
-  softmax_denominator = wavefrontReduce<std::plus<float>>(softmax_denominator);
+  softmax_denominator = wavefrontReduce(softmax_denominator, [] (float a, float b) { return a + b; });
 
   __syncthreads();
   if (lane_idx == 0) {
@@ -273,8 +272,8 @@ efficient_attention_forward_decoder_ck_kernel(
   if (lane_idx < wavefronts_per_block) {
     softmax_denominator = smem[T_MAX + lane_idx];
   }
-  softmax_denominator = wavefrontReduce<std::plus<float>>(softmax_denominator);
-  
+  softmax_denominator = wavefrontReduce(softmax_denominator, [] (float a, float b) { return a + b; });
+
   // now, compute the normalization across all threads.
   for (int32_t t = thread_linear_idx; t < t_max; t += threads_per_block) {
     smem[t] = expf(smem[t] - max_qk_acc) / softmax_denominator;
@@ -515,23 +514,23 @@ TORCH_LIBRARY_IMPL(xformers, CUDA, m) {
 
 int main(int argc, char** argv) {
   const int32_t D = 256;
-  const int32_t B = 4;
-  const int32_t H = 8;
+  const int32_t B = 1;
+  const int32_t H = 4;
   auto options = torch::TensorOptions()
-    .dtype(torch::kFloat16)
+    .dtype(torch::kFloat32)
     .layout(torch::kStrided)
     .device(torch::kCUDA, 1)
     .requires_grad(false);
   auto int_options = options.dtype(torch::kInt);
   auto XQ = at::randn({B, 1, H, D}, options);
-  auto K = at::randn({B, T_MAX / 2, H, D}, options);
-  auto V = at::randn({B, T_MAX / 2, H, D}, options);
-  auto seq = at::randint(1, 32, {B}, int_options);
+  auto K = at::randn({B, 4096, H, D}, options);
+  auto V = at::randn({B, 4096, H, D}, options);
+  auto seq = at::randint(63, 128, {B}, int_options);
   double qk_scale = 1. / sqrt(D);
 
   auto result = efficient_attention_forward_decoder_ck_impl<64, 1>(XQ, K, V, seq, qk_scale);
-  auto gold_result = efficient_attention_forward_decoder_ck_impl<64, 16>(XQ, K, V, seq, qk_scale);
-  auto mask = at::isclose(result, gold_result, 1e-2, 1e-2, false);
+  auto gold_result = efficient_attention_forward_decoder_ck_impl<64, 2>(XQ, K, V, seq, qk_scale);
+  auto mask = at::isclose(result, gold_result, /*atol*/ 1e-3, /*rtol*/ 1e-5, /*equal_nan*/ false);
   auto percent_match = at::sum(mask.to(torch::kFloat32)) / mask.numel();
   printf("Mismatched elements percentage: %.2f\n", 1. - percent_match.item<float>());
   return 0;

From 4db9157c339da4d7b33b4b832008c447a38973a1 Mon Sep 17 00:00:00 2001
From: Max Podkorytov <4273004+tenpercent@users.noreply.github.com>
Date: Mon, 9 Oct 2023 17:16:36 -0400
Subject: [PATCH 154/641] fix loop unroll (1/2)

---
 .../csrc/attention/hip_fmha/attention_forward_decoder.cpp   | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/xformers/csrc/attention/hip_fmha/attention_forward_decoder.cpp b/xformers/csrc/attention/hip_fmha/attention_forward_decoder.cpp
index 8cca9521a..2cd2f10bc 100644
--- a/xformers/csrc/attention/hip_fmha/attention_forward_decoder.cpp
+++ b/xformers/csrc/attention/hip_fmha/attention_forward_decoder.cpp
@@ -176,14 +176,14 @@ efficient_attention_forward_decoder_ck_kernel(
   // Split T across wavefronts in a block, unroll loads to expose more
   // parallelism.
 
-  constexpr int32_t kTimeUnroll = 1;
+  constexpr int32_t kTimeUnroll = 2;
   data_vec4_t k_loads[kTimeUnroll];
 
   const auto dtt = wavefronts_per_block * kTimeUnroll;
   const int32_t t_max_unroll =
     (t_max / dtt) * dtt;
 
-  for (auto tt = wavefront_idx; tt < t_max_unroll; tt += dtt) {
+  for (auto tt = wavefront_idx * kTimeUnroll; tt < t_max_unroll; tt += dtt) {
 #pragma unroll kTimeUnroll
     for (auto ttt = 0; ttt < kTimeUnroll; ++ttt) {
       const int32_t t = tt + ttt;
@@ -288,7 +288,7 @@ efficient_attention_forward_decoder_ck_kernel(
 
   float ps[kTimeUnroll];
   ck::float4_t o_acc = 0;
-  for (auto tt = wavefront_idx; tt < t_max_unroll; tt += dtt) {
+  for (auto tt = wavefront_idx * kTimeUnroll; tt < t_max_unroll; tt += dtt) {
 #pragma unroll kTimeUnroll
     for (auto ttt = 0; ttt < kTimeUnroll; ++ttt) {
       const int32_t t = tt + ttt;

From 7ad550f23973bbc9437d0270b887dfd425f31179 Mon Sep 17 00:00:00 2001
From: Max Podkorytov <4273004+tenpercent@users.noreply.github.com>
Date: Mon, 9 Oct 2023 17:29:25 -0400
Subject: [PATCH 155/641] partial fix to unroll (2/2)

---
 .../hip_fmha/attention_forward_decoder.cpp    | 59 +++++++++++--------
 1 file changed, 34 insertions(+), 25 deletions(-)

diff --git a/xformers/csrc/attention/hip_fmha/attention_forward_decoder.cpp b/xformers/csrc/attention/hip_fmha/attention_forward_decoder.cpp
index 2cd2f10bc..e805188da 100644
--- a/xformers/csrc/attention/hip_fmha/attention_forward_decoder.cpp
+++ b/xformers/csrc/attention/hip_fmha/attention_forward_decoder.cpp
@@ -176,7 +176,7 @@ efficient_attention_forward_decoder_ck_kernel(
   // Split T across wavefronts in a block, unroll loads to expose more
   // parallelism.
 
-  constexpr int32_t kTimeUnroll = 2;
+  constexpr int32_t kTimeUnroll = 4;
   data_vec4_t k_loads[kTimeUnroll];
 
   const auto dtt = wavefronts_per_block * kTimeUnroll;
@@ -212,32 +212,36 @@ efficient_attention_forward_decoder_ck_kernel(
     }
   }
 
-  constexpr int32_t kTimeUnroll1 = 1;
-  for (auto tt = t_max_unroll + wavefront_idx; tt < t_max;
+  constexpr int32_t kTimeUnroll1 = 4;
+  for (auto tt = t_max_unroll + wavefront_idx * kTimeUnroll1; tt < t_max;
        tt += wavefronts_per_block * kTimeUnroll1) {
 #pragma unroll kTimeUnroll1
     for (auto ttt = 0; ttt < kTimeUnroll1; ++ttt) {
       const int32_t t = tt + ttt;
-      // &(cache_K[b][t][0][0]);
-      auto* k_ = cache_K_base + t * cache_K.stride(1);
-      // scalar4<scalar_t> k_thread;
-      k_loads[ttt] = load_v<decltype(k_), data_vec4_t>(k_, lane_idx);
+      if (t < t_max) {
+        // &(cache_K[b][t][0][0]);
+        auto* k_ = cache_K_base + t * cache_K.stride(1);
+        // scalar4<scalar_t> k_thread;
+        k_loads[ttt] = load_v<decltype(k_), data_vec4_t>(k_, lane_idx);
+      }
     }
 #pragma unroll kTimeUnroll1
     for (auto ttt = 0; ttt < kTimeUnroll1; ++ttt) {
       float qk_acc = 0;
       const int32_t t = tt + ttt;
-      ck::inner_product<data_vec4_t, data_vec4_t, float>(q_thread, 
-                                                         k_loads[ttt], 
-                                                         qk_acc);
-      qk_acc *= qk_scale;
-
-      qk_acc = wavefrontReduce(qk_acc, [] (float a, float b) { return a + b; });
-      max_qk_acc = max(qk_acc, max_qk_acc);
-
-      // write accumulated sums to smem.
-      if (lane_idx == 0) {
-        smem[t] = qk_acc;
+      if (t < t_max) {
+        ck::inner_product<data_vec4_t, data_vec4_t, float>(q_thread, 
+                                                          k_loads[ttt], 
+                                                          qk_acc);
+        qk_acc *= qk_scale;
+
+        qk_acc = wavefrontReduce(qk_acc, [] (float a, float b) { return a + b; });
+        max_qk_acc = max(qk_acc, max_qk_acc);
+
+        // write accumulated sums to smem.
+        if (lane_idx == 0) {
+          smem[t] = qk_acc;
+        }
       }
     }
   }
@@ -306,21 +310,26 @@ efficient_attention_forward_decoder_ck_kernel(
     }
   }
 
-  for (auto tt = t_max_unroll + wavefront_idx; tt < t_max; tt += wavefronts_per_block * kTimeUnroll1) {
+  for (auto tt = t_max_unroll + wavefront_idx * kTimeUnroll1; tt < t_max; tt += wavefronts_per_block * kTimeUnroll1) {
 #pragma unroll kTimeUnroll1
     for (auto ttt = 0; ttt < kTimeUnroll1; ++ttt) {
       const int32_t t = tt + ttt;
-      // &(cache_V[b][t][0][0]);
-      auto* v_ = cache_V_base + t * cache_V.stride(1);
-      //   scalar4<scalar_t> v_thread;
-      k_loads[ttt] = load_v<decltype(v_), data_vec4_t>(v_, lane_idx);
+      if (t < t_max) {
+        // &(cache_V[b][t][0][0]);
+        auto* v_ = cache_V_base + t * cache_V.stride(1);
+        //   scalar4<scalar_t> v_thread;
+        k_loads[ttt] = load_v<decltype(v_), data_vec4_t>(v_, lane_idx);
 
-      ps[ttt] = smem[t];
+        ps[ttt] = smem[t];
+      }
     }
 
 #pragma unroll kTimeUnroll1
     for (auto ttt = 0; ttt < kTimeUnroll1; ++ttt) {
-      o_acc = scalar4_scale_acc<data_vec4_t>(o_acc, k_loads[ttt], ps[ttt]);
+      const int32_t t = tt + ttt;
+      if (t < t_max) {
+        o_acc = scalar4_scale_acc<data_vec4_t>(o_acc, k_loads[ttt], ps[ttt]);
+      }
     }
   }
   // now, each thread has partial sums. Write to smem and get accumulated

From afb61a970eb118e49c486de167caffefbe28633c Mon Sep 17 00:00:00 2001
From: Max Podkorytov <4273004+tenpercent@users.noreply.github.com>
Date: Mon, 9 Oct 2023 17:42:40 -0400
Subject: [PATCH 156/641] refactor loop unroll controls into template
 parameters

---
 .../hip_fmha/attention_forward_decoder.cpp    | 58 +++++++++----------
 1 file changed, 28 insertions(+), 30 deletions(-)

diff --git a/xformers/csrc/attention/hip_fmha/attention_forward_decoder.cpp b/xformers/csrc/attention/hip_fmha/attention_forward_decoder.cpp
index e805188da..2898eedc7 100644
--- a/xformers/csrc/attention/hip_fmha/attention_forward_decoder.cpp
+++ b/xformers/csrc/attention/hip_fmha/attention_forward_decoder.cpp
@@ -125,7 +125,7 @@ __device__ void store_v(TDataPtr data_ptr, int32_t vector_offset, TDataVec value
   *(reinterpret_cast<TDataVec*>(data_ptr) + vector_offset) = value;
 }
 
-template<typename scalar_t>
+template<typename scalar_t, int32_t n_loop_unroll = 2, int32_t n_loop_unroll_tail = 2>
 __global__ void
 efficient_attention_forward_decoder_ck_kernel(
     at::PackedTensorAccessor32<scalar_t, 4, at::RestrictPtrTraits> XQ,
@@ -135,9 +135,6 @@ efficient_attention_forward_decoder_ck_kernel(
     at::PackedTensorAccessor32<int32_t, 1, at::RestrictPtrTraits> seq_positions,
     const float qk_scale
 ) {
-  static_assert(4 * kThreadsPerWavefront == D_H, "");
-  static_assert(kWavefrontsPerBlock <= kThreadsPerWavefront, "");
-
   constexpr int32_t seq_positions_shift = 0;
 
   extern __shared__ __align__(16) float smem[];
@@ -176,24 +173,23 @@ efficient_attention_forward_decoder_ck_kernel(
   // Split T across wavefronts in a block, unroll loads to expose more
   // parallelism.
 
-  constexpr int32_t kTimeUnroll = 4;
-  data_vec4_t k_loads[kTimeUnroll];
+  data_vec4_t k_loads[n_loop_unroll];
 
-  const auto dtt = wavefronts_per_block * kTimeUnroll;
+  const auto dtt = wavefronts_per_block * n_loop_unroll;
   const int32_t t_max_unroll =
     (t_max / dtt) * dtt;
 
-  for (auto tt = wavefront_idx * kTimeUnroll; tt < t_max_unroll; tt += dtt) {
-#pragma unroll kTimeUnroll
-    for (auto ttt = 0; ttt < kTimeUnroll; ++ttt) {
+  for (auto tt = wavefront_idx * n_loop_unroll; tt < t_max_unroll; tt += dtt) {
+#pragma unroll n_loop_unroll
+    for (auto ttt = 0; ttt < n_loop_unroll; ++ttt) {
       const int32_t t = tt + ttt;
       // &(cache_K[b][t][0][0]);
       auto* k_ = cache_K_base + t * cache_K.stride(1);
       // scalar4<scalar_t> k_thread;
       k_loads[ttt] = load_v<decltype(k_), data_vec4_t>(k_, lane_idx);
     }
-#pragma unroll kTimeUnroll
-    for (auto ttt = 0; ttt < kTimeUnroll; ++ttt) {
+#pragma unroll n_loop_unroll
+    for (auto ttt = 0; ttt < n_loop_unroll; ++ttt) {
       float qk_acc = 0;
       const int32_t t = tt + ttt;
 
@@ -212,11 +208,10 @@ efficient_attention_forward_decoder_ck_kernel(
     }
   }
 
-  constexpr int32_t kTimeUnroll1 = 4;
-  for (auto tt = t_max_unroll + wavefront_idx * kTimeUnroll1; tt < t_max;
-       tt += wavefronts_per_block * kTimeUnroll1) {
-#pragma unroll kTimeUnroll1
-    for (auto ttt = 0; ttt < kTimeUnroll1; ++ttt) {
+  for (auto tt = t_max_unroll + wavefront_idx * n_loop_unroll_tail; tt < t_max;
+       tt += wavefronts_per_block * n_loop_unroll_tail) {
+#pragma unroll n_loop_unroll_tail
+    for (auto ttt = 0; ttt < n_loop_unroll_tail; ++ttt) {
       const int32_t t = tt + ttt;
       if (t < t_max) {
         // &(cache_K[b][t][0][0]);
@@ -225,8 +220,8 @@ efficient_attention_forward_decoder_ck_kernel(
         k_loads[ttt] = load_v<decltype(k_), data_vec4_t>(k_, lane_idx);
       }
     }
-#pragma unroll kTimeUnroll1
-    for (auto ttt = 0; ttt < kTimeUnroll1; ++ttt) {
+#pragma unroll n_loop_unroll_tail
+    for (auto ttt = 0; ttt < n_loop_unroll_tail; ++ttt) {
       float qk_acc = 0;
       const int32_t t = tt + ttt;
       if (t < t_max) {
@@ -290,11 +285,11 @@ efficient_attention_forward_decoder_ck_kernel(
   // each wavefront compute sum(t_subset) P[t] * V[t_subset, d]
   // outputs are of size float[D]
 
-  float ps[kTimeUnroll];
+  float ps[n_loop_unroll];
   ck::float4_t o_acc = 0;
-  for (auto tt = wavefront_idx * kTimeUnroll; tt < t_max_unroll; tt += dtt) {
-#pragma unroll kTimeUnroll
-    for (auto ttt = 0; ttt < kTimeUnroll; ++ttt) {
+  for (auto tt = wavefront_idx * n_loop_unroll; tt < t_max_unroll; tt += dtt) {
+#pragma unroll n_loop_unroll
+    for (auto ttt = 0; ttt < n_loop_unroll; ++ttt) {
       const int32_t t = tt + ttt;
       // &(cache_V[b][t][0][0]);
       auto* v_ = cache_V_base + t * cache_V.stride(1);
@@ -304,15 +299,15 @@ efficient_attention_forward_decoder_ck_kernel(
       ps[ttt] = smem[t];
     }
 
-#pragma unroll kTimeUnroll
-    for (auto ttt = 0; ttt < kTimeUnroll; ++ttt) {
+#pragma unroll n_loop_unroll
+    for (auto ttt = 0; ttt < n_loop_unroll; ++ttt) {
       o_acc = scalar4_scale_acc<data_vec4_t>(o_acc, k_loads[ttt], ps[ttt]);
     }
   }
 
-  for (auto tt = t_max_unroll + wavefront_idx * kTimeUnroll1; tt < t_max; tt += wavefronts_per_block * kTimeUnroll1) {
-#pragma unroll kTimeUnroll1
-    for (auto ttt = 0; ttt < kTimeUnroll1; ++ttt) {
+  for (auto tt = t_max_unroll + wavefront_idx * n_loop_unroll_tail; tt < t_max; tt += wavefronts_per_block * n_loop_unroll_tail) {
+#pragma unroll n_loop_unroll_tail
+    for (auto ttt = 0; ttt < n_loop_unroll_tail; ++ttt) {
       const int32_t t = tt + ttt;
       if (t < t_max) {
         // &(cache_V[b][t][0][0]);
@@ -324,8 +319,8 @@ efficient_attention_forward_decoder_ck_kernel(
       }
     }
 
-#pragma unroll kTimeUnroll1
-    for (auto ttt = 0; ttt < kTimeUnroll1; ++ttt) {
+#pragma unroll n_loop_unroll_tail
+    for (auto ttt = 0; ttt < n_loop_unroll_tail; ++ttt) {
       const int32_t t = tt + ttt;
       if (t < t_max) {
         o_acc = scalar4_scale_acc<data_vec4_t>(o_acc, k_loads[ttt], ps[ttt]);
@@ -396,6 +391,9 @@ efficient_attention_forward_decoder_ck_impl(
     const at::Tensor& seq_positions, // [B]
     double qk_scale) {
 
+  static_assert(4 * ThreadsPerWavefront == D_H, "");
+  static_assert(WavefrontsPerBlock <= ThreadsPerWavefront, "");
+
   at::OptionalDeviceGuard guard(XQ.device());
   TORCH_CHECK(XQ.is_cuda());
   TORCH_CHECK(cache_K.is_cuda());

From 3690a3268721f828d9b2781da63b688131ef3882 Mon Sep 17 00:00:00 2001
From: Max Podkorytov <4273004+tenpercent@users.noreply.github.com>
Date: Mon, 9 Oct 2023 18:18:13 -0400
Subject: [PATCH 157/641] add a comment and a static guard for unroll sizes

---
 .../csrc/attention/hip_fmha/attention_forward_decoder.cpp    | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/xformers/csrc/attention/hip_fmha/attention_forward_decoder.cpp b/xformers/csrc/attention/hip_fmha/attention_forward_decoder.cpp
index 2898eedc7..bac9c5da4 100644
--- a/xformers/csrc/attention/hip_fmha/attention_forward_decoder.cpp
+++ b/xformers/csrc/attention/hip_fmha/attention_forward_decoder.cpp
@@ -125,7 +125,7 @@ __device__ void store_v(TDataPtr data_ptr, int32_t vector_offset, TDataVec value
   *(reinterpret_cast<TDataVec*>(data_ptr) + vector_offset) = value;
 }
 
-template<typename scalar_t, int32_t n_loop_unroll = 2, int32_t n_loop_unroll_tail = 2>
+template<typename scalar_t, int32_t n_loop_unroll = 4, int32_t n_loop_unroll_tail = 2>
 __global__ void
 efficient_attention_forward_decoder_ck_kernel(
     at::PackedTensorAccessor32<scalar_t, 4, at::RestrictPtrTraits> XQ,
@@ -135,6 +135,8 @@ efficient_attention_forward_decoder_ck_kernel(
     at::PackedTensorAccessor32<int32_t, 1, at::RestrictPtrTraits> seq_positions,
     const float qk_scale
 ) {
+  static_assert (n_loop_unroll_tail < n_loop_unroll, "");
+
   constexpr int32_t seq_positions_shift = 0;
 
   extern __shared__ __align__(16) float smem[];
@@ -208,6 +210,7 @@ efficient_attention_forward_decoder_ck_kernel(
     }
   }
 
+  // NB: the length of the tail is <= (wavefronts_per_block * n_loop_unroll)
   for (auto tt = t_max_unroll + wavefront_idx * n_loop_unroll_tail; tt < t_max;
        tt += wavefronts_per_block * n_loop_unroll_tail) {
 #pragma unroll n_loop_unroll_tail

From c996768d6b5667715ba10c455983268f3e45fea9 Mon Sep 17 00:00:00 2001
From: Max Podkorytov <4273004+tenpercent@users.noreply.github.com>
Date: Mon, 9 Oct 2023 21:07:32 -0400
Subject: [PATCH 158/641] compare reference and tested attention when they are
 of same dtype as the compute dtype

---
 tests/test_mem_eff_attention_ck.py | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/tests/test_mem_eff_attention_ck.py b/tests/test_mem_eff_attention_ck.py
index 71aed5445..f073bb76f 100644
--- a/tests/test_mem_eff_attention_ck.py
+++ b/tests/test_mem_eff_attention_ck.py
@@ -211,7 +211,7 @@ def _generate_op_device_dtype_biasT_B_Mq_Mkv_H_K_Kv(
 def ref_attention(q, k, v, attn_bias=None, drop_mask=None, p=0.0, scale=None, dtype=None):
     if q.ndim == 4:
         assert p == 0.0
-        return ref_attention_bmhk(q, k, v, attn_bias=attn_bias)
+        return ref_attention_bmhk(q, k, v, attn_bias=attn_bias, dtype=dtype)
     if dtype is None:
         dtype = torch.float32
     q = q.to(dtype=dtype)
@@ -244,7 +244,7 @@ def ref_attention(q, k, v, attn_bias=None, drop_mask=None, p=0.0, scale=None, dt
     return attn @ v
 
 
-def ref_attention_bmhk(q, k, v, attn_bias, scale=None) -> torch.Tensor:
+def ref_attention_bmhk(q, k, v, attn_bias, scale=None, dtype=None) -> torch.Tensor:
     assert q.ndim == 4
 
     def T(t):
@@ -258,7 +258,7 @@ def T(t):
             device=q.device,
             dtype=torch.float32,
         ).reshape([q.shape[0] * q.shape[2], q.shape[1], k.shape[1]])
-    out = ref_attention(T(q), T(k), T(v), attn_bias, scale=scale)
+    out = ref_attention(T(q), T(k), T(v), attn_bias, scale=scale, dtype=dtype)
     out = out.reshape([q.shape[0], q.shape[2], q.shape[1], v.shape[3]])
     return out.permute((0, 2, 1, 3))
 
@@ -1662,8 +1662,8 @@ def test_decoder(
     ref_output = ref_attention(q, k, v, attn_bias, dtype=dtype_)
 
     assert_allclose(
-        decoder_output.float(),
-        ref_output.float(),
+        decoder_output,
+        ref_output,
         atol=fmha.ck_decoder.FwOp.ERROR_ATOL[dtype_] * 4,
         rtol=fmha.ck_decoder.FwOp.ERROR_RTOL[dtype_],
     )

From ab9ecc66c5a48ba294e270efeac5b4412d51cdd3 Mon Sep 17 00:00:00 2001
From: Max Podkorytov <4273004+tenpercent@users.noreply.github.com>
Date: Tue, 10 Oct 2023 17:41:12 -0400
Subject: [PATCH 159/641] refactor inner product for bf16_4

---
 .../hip_fmha/attention_forward_decoder.cpp    | 27 +++++--------------
 1 file changed, 7 insertions(+), 20 deletions(-)

diff --git a/xformers/csrc/attention/hip_fmha/attention_forward_decoder.cpp b/xformers/csrc/attention/hip_fmha/attention_forward_decoder.cpp
index bac9c5da4..7670896e8 100644
--- a/xformers/csrc/attention/hip_fmha/attention_forward_decoder.cpp
+++ b/xformers/csrc/attention/hip_fmha/attention_forward_decoder.cpp
@@ -21,26 +21,13 @@ __device__ void inner_product<bhalf_t, bhalf_t, float>(const bhalf_t& a, const b
 template <>
 __device__ void inner_product<bhalf4_t, bhalf4_t, float>(const bhalf4_t& a, const bhalf4_t& b, float& c)
 {
-    constexpr auto I0 = Number<0>{};
-    constexpr auto I1 = Number<1>{};
-    constexpr auto I2 = Number<2>{};
-    constexpr auto I3 = Number<3>{};
-
-    inner_product(vector_type<bhalf_t, 4>{a}.AsType<bhalf_t>()[I0],
-                  vector_type<bhalf_t, 4>{b}.AsType<bhalf_t>()[I0],
-                  c);
-
-    inner_product(vector_type<bhalf_t, 4>{a}.AsType<bhalf_t>()[I1],
-                  vector_type<bhalf_t, 4>{b}.AsType<bhalf_t>()[I1],
-                  c);
-
-    inner_product(vector_type<bhalf_t, 4>{a}.AsType<bhalf_t>()[I2],
-                  vector_type<bhalf_t, 4>{b}.AsType<bhalf_t>()[I2],
-                  c);
-
-    inner_product(vector_type<bhalf_t, 4>{a}.AsType<bhalf_t>()[I3],
-                  vector_type<bhalf_t, 4>{b}.AsType<bhalf_t>()[I3],
-                  c);
+    const vector_type<bhalf_t, 4> a_vector{a};
+    const vector_type<bhalf_t, 4> b_vector{b};
+    ck::static_for<0, 4, 1>{}([&] (auto i) {
+      inner_product(a_vector.AsType<bhalf_t>()[i],
+                    b_vector.AsType<bhalf_t>()[i],
+                    c);
+    });
 }
 } // namespace ck
 

From 10798569ac6e939f761861f35f7a3b6da25b863f Mon Sep 17 00:00:00 2001
From: Max Podkorytov <4273004+tenpercent@users.noreply.github.com>
Date: Tue, 10 Oct 2023 23:51:54 -0400
Subject: [PATCH 160/641] refactor load to take a pointer to written value

---
 .../hip_fmha/attention_forward_decoder.cpp    | 20 +++++++++++--------
 1 file changed, 12 insertions(+), 8 deletions(-)

diff --git a/xformers/csrc/attention/hip_fmha/attention_forward_decoder.cpp b/xformers/csrc/attention/hip_fmha/attention_forward_decoder.cpp
index 7670896e8..0a1363166 100644
--- a/xformers/csrc/attention/hip_fmha/attention_forward_decoder.cpp
+++ b/xformers/csrc/attention/hip_fmha/attention_forward_decoder.cpp
@@ -103,11 +103,13 @@ __device__ __forceinline__ wavefrontReduce(float val, F f) {
 }
 
 template <typename TDataPtr, typename TDataVec>
-__device__ TDataVec load_v(TDataPtr data_ptr, int32_t vector_offset) {
-  return *(reinterpret_cast<const TDataVec*>(data_ptr) + vector_offset);
+__forceinline__
+__device__ void load_v(TDataPtr data_ptr, int32_t vector_offset, TDataVec* load_to) {
+   *load_to = *(reinterpret_cast<const TDataVec*>(data_ptr) + vector_offset);
 }
 
 template <typename TDataPtr, typename TDataVec>
+__forceinline__
 __device__ void store_v(TDataPtr data_ptr, int32_t vector_offset, TDataVec value) {
   *(reinterpret_cast<TDataVec*>(data_ptr) + vector_offset) = value;
 }
@@ -154,7 +156,8 @@ efficient_attention_forward_decoder_ck_kernel(
   // Each thread handles 4 D dimensions
   using data_t = typename c10_to_data_t<scalar_t>::type;
   using data_vec4_t = typename c10_to_data_t<scalar_t>::vec4;
-  const data_vec4_t q_thread = load_v<decltype(q_), data_vec4_t>(q_, lane_idx);
+  data_vec4_t q_thread;
+  load_v<decltype(q_), data_vec4_t>(q_, lane_idx, &q_thread);
   // Each block computes different B value
   float max_qk_acc = std::numeric_limits<float>::lowest();
 
@@ -175,7 +178,7 @@ efficient_attention_forward_decoder_ck_kernel(
       // &(cache_K[b][t][0][0]);
       auto* k_ = cache_K_base + t * cache_K.stride(1);
       // scalar4<scalar_t> k_thread;
-      k_loads[ttt] = load_v<decltype(k_), data_vec4_t>(k_, lane_idx);
+      load_v<decltype(k_), data_vec4_t>(k_, lane_idx, &k_loads[ttt]);
     }
 #pragma unroll n_loop_unroll
     for (auto ttt = 0; ttt < n_loop_unroll; ++ttt) {
@@ -207,7 +210,7 @@ efficient_attention_forward_decoder_ck_kernel(
         // &(cache_K[b][t][0][0]);
         auto* k_ = cache_K_base + t * cache_K.stride(1);
         // scalar4<scalar_t> k_thread;
-        k_loads[ttt] = load_v<decltype(k_), data_vec4_t>(k_, lane_idx);
+        load_v<decltype(k_), data_vec4_t>(k_, lane_idx, &k_loads[ttt]);
       }
     }
 #pragma unroll n_loop_unroll_tail
@@ -284,7 +287,7 @@ efficient_attention_forward_decoder_ck_kernel(
       // &(cache_V[b][t][0][0]);
       auto* v_ = cache_V_base + t * cache_V.stride(1);
       //   scalar4<scalar_t> v_thread;
-      k_loads[ttt] = load_v<decltype(v_), data_vec4_t>(v_, lane_idx);
+      load_v<decltype(v_), data_vec4_t>(v_, lane_idx, &k_loads[ttt]);
 
       ps[ttt] = smem[t];
     }
@@ -303,7 +306,7 @@ efficient_attention_forward_decoder_ck_kernel(
         // &(cache_V[b][t][0][0]);
         auto* v_ = cache_V_base + t * cache_V.stride(1);
         //   scalar4<scalar_t> v_thread;
-        k_loads[ttt] = load_v<decltype(v_), data_vec4_t>(v_, lane_idx);
+        load_v<decltype(v_), data_vec4_t>(v_, lane_idx, &k_loads[ttt]);
 
         ps[ttt] = smem[t];
       }
@@ -329,7 +332,8 @@ efficient_attention_forward_decoder_ck_kernel(
   if (wavefront_idx == 0) {
     ck::float4_t r = 0;
     for (int32_t w = 0; w < wavefronts_per_block; ++w) {
-      auto partial_r = load_v<float*, ck::float4_t>(smem, w * threads_per_wavefront + lane_idx);
+      ck::float4_t partial_r; 
+      load_v<float*, ck::float4_t>(smem, w * threads_per_wavefront + lane_idx, &partial_r);
       r += partial_r;
     }
     // write output D row

From d901f9a107903e76371e9358449568a9147ec143 Mon Sep 17 00:00:00 2001
From: Max Podkorytov <4273004+tenpercent@users.noreply.github.com>
Date: Tue, 10 Oct 2023 23:52:13 -0400
Subject: [PATCH 161/641] modify the benchmark to compare decoder kernel
 runtimes

```
[----------------------- attention ------------------------]
                                  |    ckF    |  ck_decoderF
1 threads: -------------------------------------------------
      3batch-1keys-8heads-mq      |    125.5  |       79.4
      3batch-1keys-8heads         |    127.8  |       70.9
      3batch-1keys-16heads-mq     |    127.6  |       77.4
      3batch-1keys-16heads        |    129.0  |       72.1
      3batch-1keys-64heads-mq     |    170.4  |       77.6
      3batch-1keys-64heads        |    173.5  |       70.1
      500batch-7keys-8heads-mq    |   2849.8  |      255.0
      500batch-7keys-8heads       |   3022.9  |      235.8
      500batch-7keys-16heads-mq   |   5422.8  |      502.0
      500batch-7keys-16heads      |   5867.3  |      465.0
      500batch-7keys-64heads-mq   |  21003.5  |     1995.6
      500batch-7keys-64heads      |  23075.1  |     1947.1
      2batch-543keys-8heads-mq    |    539.7  |       78.6
      2batch-543keys-8heads       |    558.4  |       71.7
      2batch-543keys-16heads-mq   |    545.3  |       79.2
      2batch-543keys-16heads      |    600.0  |       71.1
      2batch-543keys-64heads-mq   |    556.7  |       78.3
      2batch-543keys-64heads      |    662.9  |       94.3
      1batch-5543keys-8heads-mq   |   4807.0  |      347.2
      1batch-5543keys-8heads      |   5029.2  |      398.2
      1batch-5543keys-16heads-mq  |   4802.6  |      346.1
      1batch-5543keys-16heads     |   5111.3  |      397.8
      1batch-5543keys-64heads-mq  |   4955.1  |      348.5
      1batch-5543keys-64heads     |   5070.0  |      444.9
      32batch-103keys-8heads-mq   |    470.2  |       78.1
      32batch-103keys-8heads      |    513.0  |       70.6
      32batch-103keys-16heads-mq  |    772.3  |      252.3
      32batch-103keys-16heads     |    875.5  |      223.8
      32batch-103keys-64heads-mq  |   2419.5  |      305.6
      32batch-103keys-64heads     |   2802.3  |      465.9
      4batch-1127keys-8heads-mq   |   1314.7  |      254.0
      4batch-1127keys-8heads      |   1428.8  |      217.0
      4batch-1127keys-16heads-mq  |   1330.8  |      245.4
      4batch-1127keys-16heads     |   1426.2  |      222.5
      4batch-1127keys-64heads-mq  |   2394.7  |      270.5
      4batch-1127keys-64heads     |   2899.2  |      371.0
      1batch-7271keys-8heads-mq   |   6410.9  |      475.9
      1batch-7271keys-8heads      |   6556.4  |      517.3
      1batch-7271keys-16heads-mq  |   6397.3  |      476.0
      1batch-7271keys-16heads     |   6744.6  |      518.9
      1batch-7271keys-64heads-mq  |   6500.3  |      478.3
      1batch-7271keys-64heads     |   6800.2  |      582.4

Times are in microseconds (us).

[----------------- cuda graphed attention -----------------]
                                  |    ckF    |  ck_decoderF
1 threads: -------------------------------------------------
      3batch-1keys-8heads-mq      |    126.2  |       11.8
      3batch-1keys-8heads         |    128.8  |       11.8
      3batch-1keys-16heads-mq     |    127.9  |       11.8
      3batch-1keys-16heads        |    129.6  |       11.8
      3batch-1keys-64heads-mq     |    169.1  |       15.6
      3batch-1keys-64heads        |    174.0  |       15.7
      500batch-7keys-8heads-mq    |   2842.7  |      259.5
      500batch-7keys-8heads       |   3015.9  |      239.6
      500batch-7keys-16heads-mq   |   5417.3  |      506.5
      500batch-7keys-16heads      |   5909.0  |      468.4
      500batch-7keys-64heads-mq   |  20944.0  |     1999.1
      500batch-7keys-64heads      |  22998.4  |     1949.0
      2batch-543keys-8heads-mq    |    542.8  |       43.7
      2batch-543keys-8heads       |    558.2  |       46.1
      2batch-543keys-16heads-mq   |    538.5  |       43.8
      2batch-543keys-16heads      |    600.9  |       51.7
      2batch-543keys-64heads-mq   |    555.5  |       79.2
      2batch-543keys-64heads      |    662.1  |       98.7
      1batch-5543keys-8heads-mq   |   4807.8  |      351.3
      1batch-5543keys-8heads      |   5026.5  |      402.8
      1batch-5543keys-16heads-mq  |   4830.3  |      351.1
      1batch-5543keys-16heads     |   5111.1  |      402.2
      1batch-5543keys-64heads-mq  |   4955.5  |      352.8
      1batch-5543keys-64heads     |   5065.7  |      448.1
      32batch-103keys-8heads-mq   |    468.5  |       53.2
      32batch-103keys-8heads      |    516.0  |       65.0
      32batch-103keys-16heads-mq  |    774.0  |       88.0
      32batch-103keys-16heads     |    868.5  |      107.6
      32batch-103keys-64heads-mq  |   2411.4  |      310.5
      32batch-103keys-64heads     |   2794.5  |      471.8
      4batch-1127keys-8heads-mq   |   1313.4  |       97.8
      4batch-1127keys-8heads      |   1409.5  |      115.3
      4batch-1127keys-16heads-mq  |   1317.5  |       97.0
      4batch-1127keys-16heads     |   1413.1  |      118.4
      4batch-1127keys-64heads-mq  |   2378.3  |      274.9
      4batch-1127keys-64heads     |   2837.8  |      374.8
      1batch-7271keys-8heads-mq   |   6370.9  |      480.2
      1batch-7271keys-8heads      |   6534.8  |      521.9
      1batch-7271keys-16heads-mq  |   6450.0  |      484.1
      1batch-7271keys-16heads     |   6792.5  |      521.5
      1batch-7271keys-64heads-mq  |   6477.8  |      482.2
      1batch-7271keys-64heads     |   6588.6  |      586.3

Times are in microseconds (us).

```
---
 xformers/benchmarks/benchmark_mem_eff_attn_decoder_ck.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/xformers/benchmarks/benchmark_mem_eff_attn_decoder_ck.py b/xformers/benchmarks/benchmark_mem_eff_attn_decoder_ck.py
index d63c79833..e37db17b9 100644
--- a/xformers/benchmarks/benchmark_mem_eff_attn_decoder_ck.py
+++ b/xformers/benchmarks/benchmark_mem_eff_attn_decoder_ck.py
@@ -60,6 +60,7 @@ def T(t):
 
 OPS = [
     xformers.ops.fmha.ck.FwOp,
+    xformers.ops.fmha.ck_decoder.FwOp
 ]
 
 KV_SHAPES = [
@@ -99,7 +100,7 @@ def mem_eff_attention_decoder(
     n_keys, padding, B = kv_shape
     torch.manual_seed(42)
     k_seqlen = torch.randint(1, n_keys + 1, (B,)).tolist()
-    K = 128
+    K = 256
 
     q = torch.rand(1, B, n_heads, K, device=device, dtype=torch.bfloat16)
     if multiquery:

From 84170deb0d5443e3da451a974c1cc13a8965844d Mon Sep 17 00:00:00 2001
From: Max Podkorytov <4273004+tenpercent@users.noreply.github.com>
Date: Wed, 11 Oct 2023 00:47:50 -0400
Subject: [PATCH 162/641] clang-format

---
 .../hip_fmha/attention_forward_decoder.cpp    | 287 ++++++++++--------
 1 file changed, 157 insertions(+), 130 deletions(-)

diff --git a/xformers/csrc/attention/hip_fmha/attention_forward_decoder.cpp b/xformers/csrc/attention/hip_fmha/attention_forward_decoder.cpp
index 0a1363166..0e879f9ff 100644
--- a/xformers/csrc/attention/hip_fmha/attention_forward_decoder.cpp
+++ b/xformers/csrc/attention/hip_fmha/attention_forward_decoder.cpp
@@ -1,33 +1,36 @@
-/* 
+/*
   TODO: license header
 */
 
 // #include <ck/ck.hpp>
-#include <ck/utility/data_type.hpp>
-#include <ck/utility/inner_product.hpp>
-#include <c10/cuda/CUDAStream.h>
-#include <torch/library.h>
 #include <ATen/Dispatch.h>
 #include <ATen/Functions.h>
 #include <ATen/Tensor.h>
+#include <c10/cuda/CUDAStream.h>
+#include <ck/utility/data_type.hpp>
+#include <ck/utility/inner_product.hpp>
+#include <torch/library.h>
 
 namespace ck {
 template <>
-__device__ void inner_product<bhalf_t, bhalf_t, float>(const bhalf_t& a, const bhalf_t& b, float& c)
-{
-    inner_product(type_convert<float>(a), type_convert<float>(b), c);
+__device__ void inner_product<bhalf_t, bhalf_t, float>(
+    const bhalf_t& a,
+    const bhalf_t& b,
+    float& c) {
+  inner_product(type_convert<float>(a), type_convert<float>(b), c);
 }
 
 template <>
-__device__ void inner_product<bhalf4_t, bhalf4_t, float>(const bhalf4_t& a, const bhalf4_t& b, float& c)
-{
-    const vector_type<bhalf_t, 4> a_vector{a};
-    const vector_type<bhalf_t, 4> b_vector{b};
-    ck::static_for<0, 4, 1>{}([&] (auto i) {
-      inner_product(a_vector.AsType<bhalf_t>()[i],
-                    b_vector.AsType<bhalf_t>()[i],
-                    c);
-    });
+__device__ void inner_product<bhalf4_t, bhalf4_t, float>(
+    const bhalf4_t& a,
+    const bhalf4_t& b,
+    float& c) {
+  const vector_type<bhalf_t, 4> a_vector{a};
+  const vector_type<bhalf_t, 4> b_vector{b};
+  ck::static_for<0, 4, 1>{}([&](auto i) {
+    inner_product(
+        a_vector.AsType<bhalf_t>()[i], b_vector.AsType<bhalf_t>()[i], c);
+  });
 }
 } // namespace ck
 
@@ -38,42 +41,43 @@ constexpr int32_t kWavefrontsPerBlock = 8;
 constexpr int32_t D_H = 256;
 constexpr int32_t T_MAX = 8192;
 
-template<typename c10_t>
+template <typename c10_t>
 struct c10_to_data_t;
 
-template<>
+template <>
 struct c10_to_data_t<float> {
-    using type = float;
-    using vec4 = ck::float4_t;
+  using type = float;
+  using vec4 = ck::float4_t;
 };
 
-template<>
+template <>
 struct c10_to_data_t<c10::Half> {
-    using type = ck::half_t;
-    using vec4 = ck::half4_t;
+  using type = ck::half_t;
+  using vec4 = ck::half4_t;
 };
 
-template<>
+template <>
 struct c10_to_data_t<c10::BFloat16> {
-    using type = ck::bhalf_t;
-    using vec4 = ck::bhalf4_t;
+  using type = ck::bhalf_t;
+  using vec4 = ck::bhalf4_t;
 };
 
-template<typename data4_t>
-__device__
-ck::float4_t scalar4_scale_acc(ck::float4_t acc, data4_t a, float b);
+template <typename data4_t>
+__device__ ck::float4_t scalar4_scale_acc(ck::float4_t acc, data4_t a, float b);
 
-template<>
-__device__  
-ck::float4_t
-scalar4_scale_acc<ck::float4_t>(ck::float4_t acc, ck::float4_t a, float b) {
+template <>
+__device__ ck::float4_t scalar4_scale_acc<ck::float4_t>(
+    ck::float4_t acc,
+    ck::float4_t a,
+    float b) {
   return acc + a * b;
 }
 
-template<>
-__device__  
-ck::float4_t
-scalar4_scale_acc<ck::half4_t>(ck::float4_t acc, ck::half4_t a, float b) {
+template <>
+__device__ ck::float4_t scalar4_scale_acc<ck::half4_t>(
+    ck::float4_t acc,
+    ck::half4_t a,
+    float b) {
   acc.x += ck::type_convert<float>(a.x) * b;
   acc.y += ck::type_convert<float>(a.y) * b;
   acc.z += ck::type_convert<float>(a.z) * b;
@@ -81,10 +85,11 @@ scalar4_scale_acc<ck::half4_t>(ck::float4_t acc, ck::half4_t a, float b) {
   return acc;
 }
 
-template<>
-__device__  
-ck::float4_t
-scalar4_scale_acc<ck::bhalf4_t>(ck::float4_t acc, ck::bhalf4_t a, float b) {
+template <>
+__device__ ck::float4_t scalar4_scale_acc<ck::bhalf4_t>(
+    ck::float4_t acc,
+    ck::bhalf4_t a,
+    float b) {
   acc.x += ck::type_convert<float>(a.x) * b;
   acc.y += ck::type_convert<float>(a.y) * b;
   acc.z += ck::type_convert<float>(a.z) * b;
@@ -93,8 +98,7 @@ scalar4_scale_acc<ck::bhalf4_t>(ck::float4_t acc, ck::bhalf4_t a, float b) {
 }
 
 template <typename F>
-float
-__device__ __forceinline__ wavefrontReduce(float val, F f) {
+float __device__ __forceinline__ wavefrontReduce(float val, F f) {
 #pragma unroll
   for (int32_t mask = kThreadsPerWavefront >> 1; mask > 0; mask >>= 1) {
     val = f(__shfl_xor(val, mask, kThreadsPerWavefront), val);
@@ -103,28 +107,33 @@ __device__ __forceinline__ wavefrontReduce(float val, F f) {
 }
 
 template <typename TDataPtr, typename TDataVec>
-__forceinline__
-__device__ void load_v(TDataPtr data_ptr, int32_t vector_offset, TDataVec* load_to) {
-   *load_to = *(reinterpret_cast<const TDataVec*>(data_ptr) + vector_offset);
+__forceinline__ __device__ void load_v(
+    TDataPtr data_ptr,
+    int32_t vector_offset,
+    TDataVec* load_to) {
+  *load_to = *(reinterpret_cast<const TDataVec*>(data_ptr) + vector_offset);
 }
 
 template <typename TDataPtr, typename TDataVec>
-__forceinline__
-__device__ void store_v(TDataPtr data_ptr, int32_t vector_offset, TDataVec value) {
+__forceinline__ __device__ void store_v(
+    TDataPtr data_ptr,
+    int32_t vector_offset,
+    TDataVec value) {
   *(reinterpret_cast<TDataVec*>(data_ptr) + vector_offset) = value;
 }
 
-template<typename scalar_t, int32_t n_loop_unroll = 4, int32_t n_loop_unroll_tail = 2>
-__global__ void
-efficient_attention_forward_decoder_ck_kernel(
+template <
+    typename scalar_t,
+    int32_t n_loop_unroll = 4,
+    int32_t n_loop_unroll_tail = 2>
+__global__ void efficient_attention_forward_decoder_ck_kernel(
     at::PackedTensorAccessor32<scalar_t, 4, at::RestrictPtrTraits> XQ,
     at::PackedTensorAccessor64<scalar_t, 4, at::RestrictPtrTraits> cache_K,
     at::PackedTensorAccessor64<scalar_t, 4, at::RestrictPtrTraits> cache_V,
     at::PackedTensorAccessor32<scalar_t, 4, at::RestrictPtrTraits> O,
     at::PackedTensorAccessor32<int32_t, 1, at::RestrictPtrTraits> seq_positions,
-    const float qk_scale
-) {
-  static_assert (n_loop_unroll_tail < n_loop_unroll, "");
+    const float qk_scale) {
+  static_assert(n_loop_unroll_tail < n_loop_unroll, "");
 
   constexpr int32_t seq_positions_shift = 0;
 
@@ -142,8 +151,10 @@ efficient_attention_forward_decoder_ck_kernel(
   const int32_t wavefront_idx = threadIdx.y;
   const int32_t threads_per_wavefront = blockDim.x;
   const int32_t wavefronts_per_block = blockDim.y;
-  const int32_t threads_per_block = threads_per_wavefront * wavefronts_per_block;
-  const int32_t thread_linear_idx = lane_idx + wavefront_idx * threads_per_wavefront;
+  const int32_t threads_per_block =
+      threads_per_wavefront * wavefronts_per_block;
+  const int32_t thread_linear_idx =
+      lane_idx + wavefront_idx * threads_per_wavefront;
 
   // Need D_H == 256 (NB: 128 in CUDA because of wavefront/warp sizes 64/32)
   const auto* q_ = &(XQ[b][0][h][0]);
@@ -168,8 +179,7 @@ efficient_attention_forward_decoder_ck_kernel(
   data_vec4_t k_loads[n_loop_unroll];
 
   const auto dtt = wavefronts_per_block * n_loop_unroll;
-  const int32_t t_max_unroll =
-    (t_max / dtt) * dtt;
+  const int32_t t_max_unroll = (t_max / dtt) * dtt;
 
   for (auto tt = wavefront_idx * n_loop_unroll; tt < t_max_unroll; tt += dtt) {
 #pragma unroll n_loop_unroll
@@ -185,12 +195,11 @@ efficient_attention_forward_decoder_ck_kernel(
       float qk_acc = 0;
       const int32_t t = tt + ttt;
 
-      ck::inner_product<data_vec4_t, data_vec4_t, float>(q_thread, 
-                                                         k_loads[ttt], 
-                                                         qk_acc);
+      ck::inner_product<data_vec4_t, data_vec4_t, float>(
+          q_thread, k_loads[ttt], qk_acc);
       qk_acc *= qk_scale;
 
-      qk_acc = wavefrontReduce(qk_acc, [] (float a, float b) { return a + b; });
+      qk_acc = wavefrontReduce(qk_acc, [](float a, float b) { return a + b; });
       max_qk_acc = max(qk_acc, max_qk_acc);
 
       // write accumulated sums to smem.
@@ -218,12 +227,12 @@ efficient_attention_forward_decoder_ck_kernel(
       float qk_acc = 0;
       const int32_t t = tt + ttt;
       if (t < t_max) {
-        ck::inner_product<data_vec4_t, data_vec4_t, float>(q_thread, 
-                                                          k_loads[ttt], 
-                                                          qk_acc);
+        ck::inner_product<data_vec4_t, data_vec4_t, float>(
+            q_thread, k_loads[ttt], qk_acc);
         qk_acc *= qk_scale;
 
-        qk_acc = wavefrontReduce(qk_acc, [] (float a, float b) { return a + b; });
+        qk_acc =
+            wavefrontReduce(qk_acc, [](float a, float b) { return a + b; });
         max_qk_acc = max(qk_acc, max_qk_acc);
 
         // write accumulated sums to smem.
@@ -244,27 +253,30 @@ efficient_attention_forward_decoder_ck_kernel(
     max_qk_acc = max(max_qk_acc, smem[T_MAX + lane_idx]);
   }
   // shared across all threads in block
-  max_qk_acc = wavefrontReduce(max_qk_acc, [] (float a, float b) { return a > b ? a : b; });
+  max_qk_acc = wavefrontReduce(
+      max_qk_acc, [](float a, float b) { return a > b ? a : b; });
 
   // each wavefront computes partial sum of exp.
   float softmax_denominator = 0.0f;
   for (int32_t t = thread_linear_idx; t < t_max; t += threads_per_block) {
     softmax_denominator += expf(smem[t] - max_qk_acc);
   }
-  softmax_denominator = wavefrontReduce(softmax_denominator, [] (float a, float b) { return a + b; });
+  softmax_denominator = wavefrontReduce(
+      softmax_denominator, [](float a, float b) { return a + b; });
 
   __syncthreads();
   if (lane_idx == 0) {
     smem[T_MAX + wavefront_idx] = softmax_denominator;
   }
   __syncthreads();
-  
+
   // now, compute sum of exp(x - max(x)) over all intermediate results.
   softmax_denominator = 0.0;
   if (lane_idx < wavefronts_per_block) {
     softmax_denominator = smem[T_MAX + lane_idx];
   }
-  softmax_denominator = wavefrontReduce(softmax_denominator, [] (float a, float b) { return a + b; });
+  softmax_denominator = wavefrontReduce(
+      softmax_denominator, [](float a, float b) { return a + b; });
 
   // now, compute the normalization across all threads.
   for (int32_t t = thread_linear_idx; t < t_max; t += threads_per_block) {
@@ -298,7 +310,8 @@ efficient_attention_forward_decoder_ck_kernel(
     }
   }
 
-  for (auto tt = t_max_unroll + wavefront_idx * n_loop_unroll_tail; tt < t_max; tt += wavefronts_per_block * n_loop_unroll_tail) {
+  for (auto tt = t_max_unroll + wavefront_idx * n_loop_unroll_tail; tt < t_max;
+       tt += wavefronts_per_block * n_loop_unroll_tail) {
 #pragma unroll n_loop_unroll_tail
     for (auto ttt = 0; ttt < n_loop_unroll_tail; ++ttt) {
       const int32_t t = tt + ttt;
@@ -323,8 +336,8 @@ efficient_attention_forward_decoder_ck_kernel(
   // now, each thread has partial sums. Write to smem and get accumulated
   // results back.
   __syncthreads();
-  
-  // NB: needs sizeof(smem) >= 4 * (sizeof(float)==4) * threadsPerBlock 
+
+  // NB: needs sizeof(smem) >= 4 * (sizeof(float)==4) * threadsPerBlock
   store_v<float*, ck::float4_t>(&smem[0], thread_linear_idx, o_acc);
 
   __syncthreads();
@@ -332,8 +345,9 @@ efficient_attention_forward_decoder_ck_kernel(
   if (wavefront_idx == 0) {
     ck::float4_t r = 0;
     for (int32_t w = 0; w < wavefronts_per_block; ++w) {
-      ck::float4_t partial_r; 
-      load_v<float*, ck::float4_t>(smem, w * threads_per_wavefront + lane_idx, &partial_r);
+      ck::float4_t partial_r;
+      load_v<float*, ck::float4_t>(
+          smem, w * threads_per_wavefront + lane_idx, &partial_r);
       r += partial_r;
     }
     // write output D row
@@ -347,11 +361,11 @@ efficient_attention_forward_decoder_ck_kernel(
   }
 }
 
-void update_max_dynamic_shared_memory_size_bytes(void* kernel_func, int32_t new_value) {
+void update_max_dynamic_shared_memory_size_bytes(
+    void* kernel_func,
+    int32_t new_value) {
   hipFuncAttributes attributes;
-  C10_CUDA_CHECK(hipFuncGetAttributes(
-      &attributes, 
-      kernel_func));
+  C10_CUDA_CHECK(hipFuncGetAttributes(&attributes, kernel_func));
 
   const auto default_value = attributes.maxDynamicSharedSizeBytes;
 
@@ -359,32 +373,29 @@ void update_max_dynamic_shared_memory_size_bytes(void* kernel_func, int32_t new_
 
   if (new_value > default_value) {
     C10_CUDA_CHECK(hipFuncSetAttribute(
-        kernel_func,
-        hipFuncAttributeMaxDynamicSharedMemorySize,
-        new_value));
+        kernel_func, hipFuncAttributeMaxDynamicSharedMemorySize, new_value));
   }
 }
 
 #define AT_DISPATCH_CASE_3(SCALARTYPE1, SCALARTYPE2, SCALARTYPE3, ...) \
-  AT_DISPATCH_CASE(SCALARTYPE1, __VA_ARGS__) \
-  AT_DISPATCH_CASE(SCALARTYPE2, __VA_ARGS__) \
-  AT_DISPATCH_CASE(SCALARTYPE3, __VA_ARGS__) 
-  
-#define AT_DISPATCH_SWITCH_3(SCALARTYPE1, SCALARTYPE2, SCALARTYPE3, TYPE, NAME, ...) \
-  AT_DISPATCH_SWITCH(                                               \
-      TYPE,                                                         \
-      NAME,                                                         \
+  AT_DISPATCH_CASE(SCALARTYPE1, __VA_ARGS__)                           \
+  AT_DISPATCH_CASE(SCALARTYPE2, __VA_ARGS__)                           \
+  AT_DISPATCH_CASE(SCALARTYPE3, __VA_ARGS__)
+
+#define AT_DISPATCH_SWITCH_3(                               \
+    SCALARTYPE1, SCALARTYPE2, SCALARTYPE3, TYPE, NAME, ...) \
+  AT_DISPATCH_SWITCH(                                       \
+      TYPE,                                                 \
+      NAME,                                                 \
       AT_DISPATCH_CASE_3(SCALARTYPE1, SCALARTYPE2, SCALARTYPE3, __VA_ARGS__))
 
-template<int32_t ThreadsPerWavefront, int32_t WavefrontsPerBlock>
-at::Tensor
-efficient_attention_forward_decoder_ck_impl(
+template <int32_t ThreadsPerWavefront, int32_t WavefrontsPerBlock>
+at::Tensor efficient_attention_forward_decoder_ck_impl(
     const at::Tensor& XQ, // [B, 1, H, D]
     const at::Tensor& cache_K, // [B, T_MAX, H or 1, D]
     const at::Tensor& cache_V, // [B, T_MAX, H or 1, D]
     const at::Tensor& seq_positions, // [B]
     double qk_scale) {
-
   static_assert(4 * ThreadsPerWavefront == D_H, "");
   static_assert(WavefrontsPerBlock <= ThreadsPerWavefront, "");
 
@@ -405,42 +416,47 @@ efficient_attention_forward_decoder_ck_impl(
   dim3 threads(ThreadsPerWavefront, WavefrontsPerBlock);
 
   int32_t smem_softmax = T_MAX * sizeof(float) + threads.y * sizeof(float);
-  int32_t smem_output = D_H * sizeof(float) * threads.y; // 4 * threadsPerBlock * sizeof(float) == sizeof(O[b][0][h][:])
+  int32_t smem_output = D_H * sizeof(float) *
+      threads.y; // 4 * threadsPerBlock * sizeof(float) == sizeof(O[b][0][h][:])
   int32_t smem_size = max(smem_softmax, smem_output);
   auto stream = at::cuda::getCurrentHIPStream().stream();
 
-  AT_DISPATCH_SWITCH_3(at::ScalarType::Half, at::ScalarType::BFloat16, at::ScalarType::Float, 
-    XQ.scalar_type(), "efficient_attention_forward_decoder_ck", [&] {
-      auto* kernel = &efficient_attention_forward_decoder_ck_kernel<scalar_t>;
-      update_max_dynamic_shared_memory_size_bytes(reinterpret_cast<void*&>(kernel), smem_size);
-      kernel
-          <<<blocks, threads, smem_size, stream>>>(
-              XQ.packed_accessor32<scalar_t, 4, at::RestrictPtrTraits>(),
-              cache_K.packed_accessor64<scalar_t, 4, at::RestrictPtrTraits>(),
-              cache_V.packed_accessor64<scalar_t, 4, at::RestrictPtrTraits>(),
-              O.packed_accessor32<scalar_t, 4, at::RestrictPtrTraits>(),
-              seq_positions
-                  .packed_accessor32<int32_t, 1, at::RestrictPtrTraits>(),
-              qk_scale);
-      C10_CUDA_KERNEL_LAUNCH_CHECK();
-  });
+  AT_DISPATCH_SWITCH_3(
+      at::ScalarType::Half,
+      at::ScalarType::BFloat16,
+      at::ScalarType::Float,
+      XQ.scalar_type(),
+      "efficient_attention_forward_decoder_ck",
+      [&] {
+        auto* kernel = &efficient_attention_forward_decoder_ck_kernel<scalar_t>;
+        update_max_dynamic_shared_memory_size_bytes(
+            reinterpret_cast<void*&>(kernel), smem_size);
+        kernel<<<blocks, threads, smem_size, stream>>>(
+            XQ.packed_accessor32<scalar_t, 4, at::RestrictPtrTraits>(),
+            cache_K.packed_accessor64<scalar_t, 4, at::RestrictPtrTraits>(),
+            cache_V.packed_accessor64<scalar_t, 4, at::RestrictPtrTraits>(),
+            O.packed_accessor32<scalar_t, 4, at::RestrictPtrTraits>(),
+            seq_positions
+                .packed_accessor32<int32_t, 1, at::RestrictPtrTraits>(),
+            qk_scale);
+        C10_CUDA_KERNEL_LAUNCH_CHECK();
+      });
 
   return O;
-}  
+}
 
 #undef AT_DISPATCH_CASE_3
 #undef AT_DISPATCH_SWITCH_3
 
-at::Tensor
-efficient_attention_forward_decoder_ck(
+at::Tensor efficient_attention_forward_decoder_ck(
     const at::Tensor& XQ, // [B, 1, H, D]
     const at::Tensor& cache_K, // [B, T_MAX, H or 1, D]
     const at::Tensor& cache_V, // [B, T_MAX, H or 1, D]
     const at::Tensor& seq_positions, // [B]
     double qk_scale) {
-  return efficient_attention_forward_decoder_ck_impl<kThreadsPerWavefront, kWavefrontsPerBlock> (
-    XQ, cache_K, cache_V, seq_positions, qk_scale
-  );
+  return efficient_attention_forward_decoder_ck_impl<
+      kThreadsPerWavefront,
+      kWavefrontsPerBlock>(XQ, cache_K, cache_V, seq_positions, qk_scale);
 }
 } // namespace
 
@@ -464,11 +480,15 @@ TORCH_LIBRARY_IMPL(xformers, CUDA, m) {
 -I/xformers/xformers/csrc/attention/hip_fmha \
 -I/xformers/third_party/composable_kernel/include \
 -I/xformers/third_party/composable_kernel/include/ck \
--I/xformers/third_party/composable_kernel/include/ck/tensor_operation/gpu/device \
--I/xformers/third_party/composable_kernel/include/ck/tensor_operation/gpu/device/impl \
--I/xformers/third_party/composable_kernel/include/ck/tensor_operation/gpu/element \
+-I/xformers/third_party/composable_kernel/include/ck/tensor_operation/gpu/device
+\
+-I/xformers/third_party/composable_kernel/include/ck/tensor_operation/gpu/device/impl
+\
+-I/xformers/third_party/composable_kernel/include/ck/tensor_operation/gpu/element
+\
 -I/opt/conda/envs/py_3.8/lib/python3.8/site-packages/torch/include \
--I/opt/conda/envs/py_3.8/lib/python3.8/site-packages/torch/include/torch/csrc/api/include \
+-I/opt/conda/envs/py_3.8/lib/python3.8/site-packages/torch/include/torch/csrc/api/include
+\
 -I/opt/conda/envs/py_3.8/lib/python3.8/site-packages/torch/include/TH \
 -I/opt/conda/envs/py_3.8/lib/python3.8/site-packages/torch/include/THC \
 -I/opt/conda/envs/py_3.8/lib/python3.8/site-packages/torch/include/THH \
@@ -510,7 +530,9 @@ TORCH_LIBRARY_IMPL(xformers, CUDA, m) {
 -o a.out
 
 (3) run
- > LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/conda/envs/py_3.8/lib/python3.8/site-packages/torch/lib ./a.out
+ >
+LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/conda/envs/py_3.8/lib/python3.8/site-packages/torch/lib
+./a.out
 */
 
 int main(int argc, char** argv) {
@@ -518,10 +540,10 @@ int main(int argc, char** argv) {
   const int32_t B = 1;
   const int32_t H = 4;
   auto options = torch::TensorOptions()
-    .dtype(torch::kFloat32)
-    .layout(torch::kStrided)
-    .device(torch::kCUDA, 1)
-    .requires_grad(false);
+                     .dtype(torch::kFloat32)
+                     .layout(torch::kStrided)
+                     .device(torch::kCUDA, 1)
+                     .requires_grad(false);
   auto int_options = options.dtype(torch::kInt);
   auto XQ = at::randn({B, 1, H, D}, options);
   auto K = at::randn({B, 4096, H, D}, options);
@@ -529,11 +551,16 @@ int main(int argc, char** argv) {
   auto seq = at::randint(63, 128, {B}, int_options);
   double qk_scale = 1. / sqrt(D);
 
-  auto result = efficient_attention_forward_decoder_ck_impl<64, 1>(XQ, K, V, seq, qk_scale);
-  auto gold_result = efficient_attention_forward_decoder_ck_impl<64, 2>(XQ, K, V, seq, qk_scale);
-  auto mask = at::isclose(result, gold_result, /*atol*/ 1e-3, /*rtol*/ 1e-5, /*equal_nan*/ false);
+  auto result = efficient_attention_forward_decoder_ck_impl<64, 1>(
+      XQ, K, V, seq, qk_scale);
+  auto gold_result = efficient_attention_forward_decoder_ck_impl<64, 2>(
+      XQ, K, V, seq, qk_scale);
+  auto mask = at::isclose(
+      result, gold_result, /*atol*/ 1e-3, /*rtol*/ 1e-5, /*equal_nan*/ false);
   auto percent_match = at::sum(mask.to(torch::kFloat32)) / mask.numel();
-  printf("Mismatched elements percentage: %.2f\n", 1. - percent_match.item<float>());
+  printf(
+      "Mismatched elements percentage: %.2f\n",
+      1. - percent_match.item<float>());
   return 0;
 }
 

From 5c6b572c0c5a7dec7bf99da191fb70edab876e92 Mon Sep 17 00:00:00 2001
From: Max Podkorytov <4273004+tenpercent@users.noreply.github.com>
Date: Wed, 11 Oct 2023 13:21:53 -0400
Subject: [PATCH 163/641] refactor decoder benchmark

---
 .../benchmarks/benchmark_mem_eff_attn_decoder_ck.py   | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/xformers/benchmarks/benchmark_mem_eff_attn_decoder_ck.py b/xformers/benchmarks/benchmark_mem_eff_attn_decoder_ck.py
index e37db17b9..6d1422e65 100644
--- a/xformers/benchmarks/benchmark_mem_eff_attn_decoder_ck.py
+++ b/xformers/benchmarks/benchmark_mem_eff_attn_decoder_ck.py
@@ -101,18 +101,19 @@ def mem_eff_attention_decoder(
     torch.manual_seed(42)
     k_seqlen = torch.randint(1, n_keys + 1, (B,)).tolist()
     K = 256
+    dtype = torch.float16
 
-    q = torch.rand(1, B, n_heads, K, device=device, dtype=torch.bfloat16)
+    q = torch.rand(1, B, n_heads, K, device=device, dtype=dtype)
     if multiquery:
         k = torch.rand(
-            1, B * padding, 1, K, device=device, dtype=torch.bfloat16
+            1, B * padding, 1, K, device=device, dtype=dtype
         ).expand(1, B * padding, n_heads, K)
         v = torch.rand(
-            1, B * padding, 1, K, device=device, dtype=torch.bfloat16
+            1, B * padding, 1, K, device=device, dtype=dtype
         ).expand(1, B * padding, n_heads, K)
     else:
-        k = torch.rand(1, B * padding, n_heads, K, device=device, dtype=torch.bfloat16)
-        v = torch.rand(1, B * padding, n_heads, K, device=device, dtype=torch.bfloat16)
+        k = torch.rand(1, B * padding, n_heads, K, device=device, dtype=dtype)
+        v = torch.rand(1, B * padding, n_heads, K, device=device, dtype=dtype)
 
     bias = fmha.attn_bias.BlockDiagonalCausalWithOffsetPaddedKeysMask.from_seqlens(
         q_seqlen=[1] * B,

From f2013d0a6c52532a5921726dd88e7cd4ae62552d Mon Sep 17 00:00:00 2001
From: Max Podkorytov <4273004+tenpercent@users.noreply.github.com>
Date: Wed, 11 Oct 2023 14:51:02 -0400
Subject: [PATCH 164/641] rebase on ck-flashattn


From bd6ee76ffafe573b13a20ffb1a66f366fc0ac88d Mon Sep 17 00:00:00 2001
From: Max Podkorytov <4273004+tenpercent@users.noreply.github.com>
Date: Wed, 11 Oct 2023 16:13:20 -0400
Subject: [PATCH 165/641] add a doc for the xformer op

---
 xformers/ops/fmha/ck_decoder.py | 13 ++++++++-----
 1 file changed, 8 insertions(+), 5 deletions(-)

diff --git a/xformers/ops/fmha/ck_decoder.py b/xformers/ops/fmha/ck_decoder.py
index 2c7d1ead8..28db52eaa 100644
--- a/xformers/ops/fmha/ck_decoder.py
+++ b/xformers/ops/fmha/ck_decoder.py
@@ -1,5 +1,4 @@
 # TODO(max): add a proper copyright header
-import math
 import torch
 
 from typing import Any, Set, List, Tuple, Optional
@@ -9,10 +8,14 @@
 
 @register_operator
 class FwOp(AttentionFwOpBase):
+    """
+    An operator optimized for K=256 (so the contiguous dim fits into registers).
+    Tested to work on MI250x.
+    """
     OPERATOR = get_xformers_operator("efficient_attention_forward_decoder_ck")
     SUPPORTED_DEVICES: Set[str] = {"cuda"}
     SUPPORTED_DTYPES: Set[torch.dtype] = {torch.half, torch.bfloat16, torch.float}
-    SUPPORTED_MAX_K: float = 256
+    SUPPORTED_MAX_K: int = 256
     SUPPORTED_ATTN_BIAS_TYPES: Set[Any] = {BlockDiagonalCausalWithOffsetPaddedKeysMask}
     SUPPORTS_DROPOUT = False
     SUPPORTS_CUSTOM_SCALE = True
@@ -31,8 +34,8 @@ def not_supported_reasons(cls, d: Inputs) -> List[str]:
             if d.query.shape[0] != 1:
                 reasons.append("One formal batch element expected")
 
-            if d.query.shape[-1] != 256:
-                reasons.append("Only head_dim==256 for now.")
+            if d.query.shape[-1] != cls.SUPPORTED_MAX_K:
+                reasons.append(f"Got head_dim={d.query.shape[-1]}; only head_dim=={cls.SUPPORTED_MAX_K} is supported for now.")
 
             if d.key.stride(-1) != 1:
                 reasons.append("expect keys to have last dim contiguous")
@@ -79,7 +82,7 @@ def apply(
         if inp.scale is not None:
             qk_scale = inp.scale
         else:
-            qk_scale = 1.0 / math.sqrt(key.shape[-1])
+            qk_scale = torch.rsqrt(torch.tensor(key.shape[-1], dtype=torch.float32))
 
         out = cls.OPERATOR(
             query=query,

From a1552f8ddf742017c19b54726b585f3ef968cbb0 Mon Sep 17 00:00:00 2001
From: Max Podkorytov <4273004+tenpercent@users.noreply.github.com>
Date: Wed, 11 Oct 2023 17:01:12 -0400
Subject: [PATCH 166/641] simplify K/V loads

---
 .../hip_fmha/attention_forward_decoder.cpp    | 31 ++++++++-----------
 1 file changed, 13 insertions(+), 18 deletions(-)

diff --git a/xformers/csrc/attention/hip_fmha/attention_forward_decoder.cpp b/xformers/csrc/attention/hip_fmha/attention_forward_decoder.cpp
index 0e879f9ff..60e07e187 100644
--- a/xformers/csrc/attention/hip_fmha/attention_forward_decoder.cpp
+++ b/xformers/csrc/attention/hip_fmha/attention_forward_decoder.cpp
@@ -185,10 +185,9 @@ __global__ void efficient_attention_forward_decoder_ck_kernel(
 #pragma unroll n_loop_unroll
     for (auto ttt = 0; ttt < n_loop_unroll; ++ttt) {
       const int32_t t = tt + ttt;
-      // &(cache_K[b][t][0][0]);
-      auto* k_ = cache_K_base + t * cache_K.stride(1);
-      // scalar4<scalar_t> k_thread;
-      load_v<decltype(k_), data_vec4_t>(k_, lane_idx, &k_loads[ttt]);
+      // load the K[b][t][h|0][:] row into registers
+      load_v<decltype(cache_K_base), data_vec4_t>(
+          cache_K_base + t * cache_K.stride(1), lane_idx, &k_loads[ttt]);
     }
 #pragma unroll n_loop_unroll
     for (auto ttt = 0; ttt < n_loop_unroll; ++ttt) {
@@ -216,10 +215,9 @@ __global__ void efficient_attention_forward_decoder_ck_kernel(
     for (auto ttt = 0; ttt < n_loop_unroll_tail; ++ttt) {
       const int32_t t = tt + ttt;
       if (t < t_max) {
-        // &(cache_K[b][t][0][0]);
-        auto* k_ = cache_K_base + t * cache_K.stride(1);
-        // scalar4<scalar_t> k_thread;
-        load_v<decltype(k_), data_vec4_t>(k_, lane_idx, &k_loads[ttt]);
+        // load the K[b][t][h|0][:] row into registers
+        load_v<decltype(cache_K_base), data_vec4_t>(
+            cache_K_base + t * cache_K.stride(1), lane_idx, &k_loads[ttt]);
       }
     }
 #pragma unroll n_loop_unroll_tail
@@ -296,11 +294,9 @@ __global__ void efficient_attention_forward_decoder_ck_kernel(
 #pragma unroll n_loop_unroll
     for (auto ttt = 0; ttt < n_loop_unroll; ++ttt) {
       const int32_t t = tt + ttt;
-      // &(cache_V[b][t][0][0]);
-      auto* v_ = cache_V_base + t * cache_V.stride(1);
-      //   scalar4<scalar_t> v_thread;
-      load_v<decltype(v_), data_vec4_t>(v_, lane_idx, &k_loads[ttt]);
-
+      // load the V[b][t][h|0][:] row into registers, reusing K register storage
+      load_v<decltype(cache_V_base), data_vec4_t>(
+          cache_V_base + t * cache_V.stride(1), lane_idx, &k_loads[ttt]);
       ps[ttt] = smem[t];
     }
 
@@ -316,11 +312,10 @@ __global__ void efficient_attention_forward_decoder_ck_kernel(
     for (auto ttt = 0; ttt < n_loop_unroll_tail; ++ttt) {
       const int32_t t = tt + ttt;
       if (t < t_max) {
-        // &(cache_V[b][t][0][0]);
-        auto* v_ = cache_V_base + t * cache_V.stride(1);
-        //   scalar4<scalar_t> v_thread;
-        load_v<decltype(v_), data_vec4_t>(v_, lane_idx, &k_loads[ttt]);
-
+        // load the V[b][t][h|0][:] row into registers, reusing K register
+        // storage
+        load_v<decltype(cache_V_base), data_vec4_t>(
+            cache_V_base + t * cache_V.stride(1), lane_idx, &k_loads[ttt]);
         ps[ttt] = smem[t];
       }
     }

From 185e12b6491552cafa6ec2328a0a6faa27085804 Mon Sep 17 00:00:00 2001
From: Max Podkorytov <4273004+tenpercent@users.noreply.github.com>
Date: Mon, 16 Oct 2023 19:00:05 -0400
Subject: [PATCH 167/641] reset the cache before running each iteration

when introspecting the hardware counters for the benchmarked kernels,
I noticed there is no global memory traffic when the input shape is small.

Meaning, *probably* the inputs are fetched from cache.

To make benchmarking more authentic, I added a gpu memory slab fill for each iteration.

I also benchmarked it separately, so we can mentally adjust the op iteration time by the slab-fill iteration time

See also: https://stackoverflow.com/a/34461372

Results (note how for large input shapes the new reported results adjusted by slab fill time are about same as previous,
while for small input shapes the new times are larger, due to cache reset):
```
Times are in microseconds (us).

[-------- reset cache ---------]
                      |  elapsed
1 threads: ---------------------
      mem_slab.fill_  |   158.0

Times are in microseconds (us).

[----------------------- attention ------------------------]
                                  |    ckF    |  ck_decoderF
1 threads: -------------------------------------------------
      3batch-1keys-8heads-mq      |    245.8  |      462.3
      3batch-1keys-8heads         |    248.7  |      467.0
      3batch-1keys-16heads-mq     |    247.5  |      464.2
      3batch-1keys-16heads        |    248.8  |      479.4
      3batch-1keys-64heads-mq     |    325.2  |      462.8
      3batch-1keys-64heads        |    335.1  |      473.3
      500batch-7keys-8heads-mq    |   3197.7  |      491.7
      500batch-7keys-8heads       |   3265.6  |      468.3
      500batch-7keys-16heads-mq   |   5731.2  |      742.2
      500batch-7keys-16heads      |   6021.8  |      688.4
      500batch-7keys-64heads-mq   |  21145.3  |     2193.9
      500batch-7keys-64heads      |  22591.7  |     2141.0
      2batch-543keys-8heads-mq    |    496.0  |      511.7
      2batch-543keys-8heads       |    501.1  |      506.8
      2batch-543keys-16heads-mq   |    492.4  |      492.7
      2batch-543keys-16heads      |    505.9  |      514.5
      2batch-543keys-64heads-mq   |    573.2  |      479.8
      2batch-543keys-64heads      |    635.8  |      459.3
      1batch-5543keys-8heads-mq   |   2927.1  |      630.6
      1batch-5543keys-8heads      |   2922.0  |      619.0
      1batch-5543keys-16heads-mq  |   2924.4  |      629.8
      1batch-5543keys-16heads     |   2962.2  |      620.5
      1batch-5543keys-64heads-mq  |   3516.0  |      633.2
      1batch-5543keys-64heads     |   4156.4  |      662.9
      32batch-103keys-8heads-mq   |    583.4  |      528.1
      32batch-103keys-8heads      |    613.2  |      453.7
      32batch-103keys-16heads-mq  |    853.7  |      470.3
      32batch-103keys-16heads     |    904.1  |      406.7
      32batch-103keys-64heads-mq  |   2523.5  |      548.2
      32batch-103keys-64heads     |   2826.9  |      703.5
      4batch-1127keys-8heads-mq   |    908.7  |      442.7
      4batch-1127keys-8heads      |    941.9  |      358.0
      4batch-1127keys-16heads-mq  |    983.6  |      415.8
      4batch-1127keys-16heads     |   1125.7  |      403.0
      4batch-1127keys-64heads-mq  |   2407.6  |      519.4
      4batch-1127keys-64heads     |   2760.2  |      600.4
      1batch-7271keys-8heads-mq   |   3742.3  |      751.4
      1batch-7271keys-8heads      |   3735.1  |      736.2
      1batch-7271keys-16heads-mq  |   3738.9  |      749.4
      1batch-7271keys-16heads     |   3786.2  |      739.9
      1batch-7271keys-64heads-mq  |   4510.4  |      755.3
      1batch-7271keys-64heads     |   5336.9  |      801.0

Times are in microseconds (us).

[----------------- cuda graphed attention -----------------]
                                  |    ckF    |  ck_decoderF
1 threads: -------------------------------------------------
      3batch-1keys-8heads-mq      |    250.3  |      266.0
      3batch-1keys-8heads         |    248.5  |      271.5
      3batch-1keys-16heads-mq     |    249.1  |      289.6
      3batch-1keys-16heads        |    249.2  |      292.1
      3batch-1keys-64heads-mq     |    325.8  |      295.4
      3batch-1keys-64heads        |    329.9  |      276.3
      500batch-7keys-8heads-mq    |   3192.3  |      501.5
      500batch-7keys-8heads       |   3296.0  |      481.7
      500batch-7keys-16heads-mq   |   5722.4  |      745.7
      500batch-7keys-16heads      |   6008.1  |      698.7
      500batch-7keys-64heads-mq   |  21090.6  |     2202.3
      500batch-7keys-64heads      |  22540.3  |     2185.5
      2batch-543keys-8heads-mq    |    493.0  |      292.8
      2batch-543keys-8heads       |    502.2  |      301.1
      2batch-543keys-16heads-mq   |    491.9  |      299.8
      2batch-543keys-16heads      |    505.5  |      301.6
      2batch-543keys-64heads-mq   |    573.9  |      328.2
      2batch-543keys-64heads      |    635.6  |      337.4
      1batch-5543keys-8heads-mq   |   2929.3  |      641.1
      1batch-5543keys-8heads      |   2926.9  |      629.2
      1batch-5543keys-16heads-mq  |   2927.8  |      647.6
      1batch-5543keys-16heads     |   2964.9  |      629.7
      1batch-5543keys-64heads-mq  |   3519.0  |      643.6
      1batch-5543keys-64heads     |   4159.2  |      677.6
      32batch-103keys-8heads-mq   |    582.8  |      306.5
      32batch-103keys-8heads      |    612.1  |      305.6
      32batch-103keys-16heads-mq  |    844.8  |      331.3
      32batch-103keys-16heads     |    900.9  |      351.3
      32batch-103keys-64heads-mq  |   2522.5  |      553.4
      32batch-103keys-64heads     |   2827.7  |      711.1
      4batch-1127keys-8heads-mq   |    908.4  |      353.5
      4batch-1127keys-8heads      |    941.3  |      352.2
      4batch-1127keys-16heads-mq  |    984.4  |      351.6
      4batch-1127keys-16heads     |   1126.7  |      359.0
      4batch-1127keys-64heads-mq  |   2407.5  |      529.3
      4batch-1127keys-64heads     |   2759.1  |      618.0
      1batch-7271keys-8heads-mq   |   3742.9  |      767.3
      1batch-7271keys-8heads      |   3738.6  |      743.1
      1batch-7271keys-16heads-mq  |   3746.6  |      758.8
      1batch-7271keys-16heads     |   3793.3  |      748.5
      1batch-7271keys-64heads-mq  |   4510.7  |      764.6
      1batch-7271keys-64heads     |   5347.4  |      812.6

Times are in microseconds (us).
```
---
 .../benchmark_mem_eff_attn_decoder_ck.py       | 18 +++++++++++++++++-
 1 file changed, 17 insertions(+), 1 deletion(-)

diff --git a/xformers/benchmarks/benchmark_mem_eff_attn_decoder_ck.py b/xformers/benchmarks/benchmark_mem_eff_attn_decoder_ck.py
index 6d1422e65..5870319ba 100644
--- a/xformers/benchmarks/benchmark_mem_eff_attn_decoder_ck.py
+++ b/xformers/benchmarks/benchmark_mem_eff_attn_decoder_ck.py
@@ -125,7 +125,21 @@ def mem_eff_attention_decoder(
     if multiquery:
         sub_label += "-mq"
 
+    cache_size = 128 * 2 ** 20
+    mem_slab = torch.zeros(cache_size, device=device, dtype=torch.uint8)
+    cache_reset_str = "mem_slab.fill_(42)"
+
     has_run = False
+
+    yield benchmark.Timer(
+        stmt=cache_reset_str,
+        globals={"mem_slab": mem_slab},
+        label="reset cache",
+        sub_label="mem_slab.fill_",
+        num_threads=num_threads,
+        description="elapsed",
+    )
+
     for fw_op in OPS:
         inp = fmha.Inputs(q, k, v, attn_bias=bias)
         if (skip_reasons := fw_op.not_supported_reasons(inp)):
@@ -135,13 +149,14 @@ def mem_eff_attention_decoder(
         fn = partial(xformers.ops.memory_efficient_attention_forward, op=fw_op)
 
         yield benchmark.Timer(
-            stmt="fn(q, k, v, attn_bias)",
+            stmt=f"{cache_reset_str};fn(q, k, v, attn_bias)",
             globals={
                 "q": q,
                 "k": k,
                 "v": v,
                 "attn_bias": bias,
                 "fn": fn,
+                "mem_slab": mem_slab,
             },
             label="attention",
             description=fw_op.NAME,
@@ -151,6 +166,7 @@ def mem_eff_attention_decoder(
 
         graph = torch.cuda.CUDAGraph()
         with torch.cuda.graph(graph):
+            exec(cache_reset_str, {"mem_slab": mem_slab})
             fn(q, k, v, bias)
         yield benchmark.Timer(
             stmt="graph.replay()",

From 1745b0c05183888d206bcde7d7c7c7850bdcfff4 Mon Sep 17 00:00:00 2001
From: Max Podkorytov <4273004+tenpercent@users.noreply.github.com>
Date: Mon, 16 Oct 2023 19:19:33 -0400
Subject: [PATCH 168/641] clean up a hardcoded constant

---
 xformers/csrc/attention/hip_fmha/attention_forward_decoder.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/xformers/csrc/attention/hip_fmha/attention_forward_decoder.cpp b/xformers/csrc/attention/hip_fmha/attention_forward_decoder.cpp
index 60e07e187..f6635bb98 100644
--- a/xformers/csrc/attention/hip_fmha/attention_forward_decoder.cpp
+++ b/xformers/csrc/attention/hip_fmha/attention_forward_decoder.cpp
@@ -38,7 +38,7 @@ namespace {
 
 constexpr int32_t kThreadsPerWavefront = 64;
 constexpr int32_t kWavefrontsPerBlock = 8;
-constexpr int32_t D_H = 256;
+constexpr int32_t D_H = 4 * kThreadsPerWavefront;
 constexpr int32_t T_MAX = 8192;
 
 template <typename c10_t>

From 9324ac64050372598a1299d619469ad4f99beaee Mon Sep 17 00:00:00 2001
From: Max Podkorytov <4273004+tenpercent@users.noreply.github.com>
Date: Mon, 16 Oct 2023 19:36:02 -0400
Subject: [PATCH 169/641] refactor the cache reset

---
 .../benchmark_mem_eff_attn_decoder_ck.py          | 15 ++++++++-------
 1 file changed, 8 insertions(+), 7 deletions(-)

diff --git a/xformers/benchmarks/benchmark_mem_eff_attn_decoder_ck.py b/xformers/benchmarks/benchmark_mem_eff_attn_decoder_ck.py
index 5870319ba..df197ec0f 100644
--- a/xformers/benchmarks/benchmark_mem_eff_attn_decoder_ck.py
+++ b/xformers/benchmarks/benchmark_mem_eff_attn_decoder_ck.py
@@ -127,15 +127,16 @@ def mem_eff_attention_decoder(
 
     cache_size = 128 * 2 ** 20
     mem_slab = torch.zeros(cache_size, device=device, dtype=torch.uint8)
-    cache_reset_str = "mem_slab.fill_(42)"
+    def reset_cache():
+        mem_slab.fill_(42)
 
     has_run = False
 
     yield benchmark.Timer(
-        stmt=cache_reset_str,
-        globals={"mem_slab": mem_slab},
+        stmt="reset_cache()",
+        globals={"reset_cache": reset_cache},
         label="reset cache",
-        sub_label="mem_slab.fill_",
+        sub_label=f"fill {cache_size=}",
         num_threads=num_threads,
         description="elapsed",
     )
@@ -149,14 +150,14 @@ def mem_eff_attention_decoder(
         fn = partial(xformers.ops.memory_efficient_attention_forward, op=fw_op)
 
         yield benchmark.Timer(
-            stmt=f"{cache_reset_str};fn(q, k, v, attn_bias)",
+            stmt=f"reset_cache();fn(q, k, v, attn_bias)",
             globals={
                 "q": q,
                 "k": k,
                 "v": v,
                 "attn_bias": bias,
                 "fn": fn,
-                "mem_slab": mem_slab,
+                "reset_cache": reset_cache,
             },
             label="attention",
             description=fw_op.NAME,
@@ -166,7 +167,7 @@ def mem_eff_attention_decoder(
 
         graph = torch.cuda.CUDAGraph()
         with torch.cuda.graph(graph):
-            exec(cache_reset_str, {"mem_slab": mem_slab})
+            reset_cache()
             fn(q, k, v, bias)
         yield benchmark.Timer(
             stmt="graph.replay()",

From f643c634ea79f99c60c7f86b50b864a52fe5b89e Mon Sep 17 00:00:00 2001
From: Max Podkorytov <4273004+tenpercent@users.noreply.github.com>
Date: Mon, 16 Oct 2023 23:56:00 -0400
Subject: [PATCH 170/641] add read and write sizes to benchmark labels

---
 .../benchmark_mem_eff_attn_decoder_ck.py           | 14 +++++++++++---
 1 file changed, 11 insertions(+), 3 deletions(-)

diff --git a/xformers/benchmarks/benchmark_mem_eff_attn_decoder_ck.py b/xformers/benchmarks/benchmark_mem_eff_attn_decoder_ck.py
index df197ec0f..0cb9ab3ad 100644
--- a/xformers/benchmarks/benchmark_mem_eff_attn_decoder_ck.py
+++ b/xformers/benchmarks/benchmark_mem_eff_attn_decoder_ck.py
@@ -13,6 +13,7 @@
 
 import xformers.ops
 import xformers.ops.fmha as fmha
+import xformers.profiler.slow_ops_profiler
 
 torch.backends.cuda.matmul.allow_tf32 = False
 
@@ -125,7 +126,7 @@ def mem_eff_attention_decoder(
     if multiquery:
         sub_label += "-mq"
 
-    cache_size = 128 * 2 ** 20
+    cache_size = 512 * 2 ** 20
     mem_slab = torch.zeros(cache_size, device=device, dtype=torch.uint8)
     def reset_cache():
         mem_slab.fill_(42)
@@ -149,6 +150,13 @@ def reset_cache():
 
         fn = partial(xformers.ops.memory_efficient_attention_forward, op=fw_op)
 
+        out = fn(q, k, v, attn_bias=bias, op=fw_op)
+        
+        inputs_size = xformers.profiler.slow_ops_profiler.get_size([q, k, v, bias])
+        outputs_size = xformers.profiler.slow_ops_profiler.get_size([out])
+
+        sizes_label = f"read-{inputs_size//1024}k-write-{outputs_size//1024}k"
+
         yield benchmark.Timer(
             stmt=f"reset_cache();fn(q, k, v, attn_bias)",
             globals={
@@ -161,7 +169,7 @@ def reset_cache():
             },
             label="attention",
             description=fw_op.NAME,
-            sub_label=sub_label,
+            sub_label=f"{sub_label}_{sizes_label}",
             num_threads=num_threads,
         )
 
@@ -176,7 +184,7 @@ def reset_cache():
             },
             label="cuda graphed attention",
             description=fw_op.NAME,
-            sub_label=sub_label,
+            sub_label=f"{sub_label}_{sizes_label}",
             num_threads=num_threads,
         )
 

From 0bb296f602327c977de6f97da956e2c665ce8a4b Mon Sep 17 00:00:00 2001
From: Max Podkorytov <4273004+tenpercent@users.noreply.github.com>
Date: Tue, 17 Oct 2023 01:38:17 -0400
Subject: [PATCH 171/641] be more conservative about the slab size; otherwise,
 the memory fill run time starts dominating the benchmark and the significant
 digits are lost

---
 xformers/benchmarks/benchmark_mem_eff_attn_decoder_ck.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/xformers/benchmarks/benchmark_mem_eff_attn_decoder_ck.py b/xformers/benchmarks/benchmark_mem_eff_attn_decoder_ck.py
index 0cb9ab3ad..4f52c2b3e 100644
--- a/xformers/benchmarks/benchmark_mem_eff_attn_decoder_ck.py
+++ b/xformers/benchmarks/benchmark_mem_eff_attn_decoder_ck.py
@@ -126,7 +126,7 @@ def mem_eff_attention_decoder(
     if multiquery:
         sub_label += "-mq"
 
-    cache_size = 512 * 2 ** 20
+    cache_size = 80 * 2 ** 20
     mem_slab = torch.zeros(cache_size, device=device, dtype=torch.uint8)
     def reset_cache():
         mem_slab.fill_(42)

From 5b97f186ec4ac4308466f4f3a60b131c5676f66b Mon Sep 17 00:00:00 2001
From: Max Podkorytov <4273004+tenpercent@users.noreply.github.com>
Date: Tue, 17 Oct 2023 12:08:01 -0400
Subject: [PATCH 172/641] revert the cache reset in python benchmark

---
 .../benchmark_mem_eff_attn_decoder_ck.py       | 18 +-----------------
 1 file changed, 1 insertion(+), 17 deletions(-)

diff --git a/xformers/benchmarks/benchmark_mem_eff_attn_decoder_ck.py b/xformers/benchmarks/benchmark_mem_eff_attn_decoder_ck.py
index 4f52c2b3e..c28ce006d 100644
--- a/xformers/benchmarks/benchmark_mem_eff_attn_decoder_ck.py
+++ b/xformers/benchmarks/benchmark_mem_eff_attn_decoder_ck.py
@@ -126,22 +126,8 @@ def mem_eff_attention_decoder(
     if multiquery:
         sub_label += "-mq"
 
-    cache_size = 80 * 2 ** 20
-    mem_slab = torch.zeros(cache_size, device=device, dtype=torch.uint8)
-    def reset_cache():
-        mem_slab.fill_(42)
-
     has_run = False
 
-    yield benchmark.Timer(
-        stmt="reset_cache()",
-        globals={"reset_cache": reset_cache},
-        label="reset cache",
-        sub_label=f"fill {cache_size=}",
-        num_threads=num_threads,
-        description="elapsed",
-    )
-
     for fw_op in OPS:
         inp = fmha.Inputs(q, k, v, attn_bias=bias)
         if (skip_reasons := fw_op.not_supported_reasons(inp)):
@@ -158,14 +144,13 @@ def reset_cache():
         sizes_label = f"read-{inputs_size//1024}k-write-{outputs_size//1024}k"
 
         yield benchmark.Timer(
-            stmt=f"reset_cache();fn(q, k, v, attn_bias)",
+            stmt=f"fn(q, k, v, attn_bias)",
             globals={
                 "q": q,
                 "k": k,
                 "v": v,
                 "attn_bias": bias,
                 "fn": fn,
-                "reset_cache": reset_cache,
             },
             label="attention",
             description=fw_op.NAME,
@@ -175,7 +160,6 @@ def reset_cache():
 
         graph = torch.cuda.CUDAGraph()
         with torch.cuda.graph(graph):
-            reset_cache()
             fn(q, k, v, bias)
         yield benchmark.Timer(
             stmt="graph.replay()",

From ed5a8208c2515e50045ad4c72b6b8828650166fd Mon Sep 17 00:00:00 2001
From: Max Podkorytov <4273004+tenpercent@users.noreply.github.com>
Date: Tue, 17 Oct 2023 14:19:38 -0400
Subject: [PATCH 173/641] add memory traffic to the label

---
 .../benchmark_mem_eff_attn_decoder_ck.py      | 29 +++++++++++++------
 1 file changed, 20 insertions(+), 9 deletions(-)

diff --git a/xformers/benchmarks/benchmark_mem_eff_attn_decoder_ck.py b/xformers/benchmarks/benchmark_mem_eff_attn_decoder_ck.py
index c28ce006d..460279c7f 100644
--- a/xformers/benchmarks/benchmark_mem_eff_attn_decoder_ck.py
+++ b/xformers/benchmarks/benchmark_mem_eff_attn_decoder_ck.py
@@ -94,6 +94,23 @@ def product_dict(**kwargs):
     )
 )
 
+def get_memory_traffic(op, q, k, v, bias):
+    # mem_size = ( batch_size * seq_len * 1 * dim_per_head * 2 (K/V) + 
+    #              batch_size * 1 * num_heads * dim_per_head (Q) + 
+    #              batch_size * seq_len * num_heads * dim_per_head (attn_output) ) * bytes_per_element
+    out = xformers.ops.memory_efficient_attention_forward(q, k, v, bias, op=op)
+    dtype = q.dtype
+    multiquery = k.stride(2) == 0
+    n_heads = q.shape[-2]
+    dim_per_head = q.shape[-1]
+    kv_seqlen = bias.k_seqinfo.seqlen_py
+    bytes_per_element = 4 if dtype is torch.float32 else 2 if dtype in (torch.float16, torch.bfloat16) else None
+    mem_size = 0
+    mem_size += q.numel() * bytes_per_element # Q
+    for s in kv_seqlen: # len(kv_seqlen) == batch_size
+        mem_size += s * (1 if multiquery else n_heads) * dim_per_head * bytes_per_element * 2 # K, V
+    mem_size += out.numel() * bytes_per_element # attn_output
+    return mem_size
 
 def mem_eff_attention_decoder(
     kv_shape, n_heads: int, num_threads: int, multiquery: bool
@@ -103,7 +120,6 @@ def mem_eff_attention_decoder(
     k_seqlen = torch.randint(1, n_keys + 1, (B,)).tolist()
     K = 256
     dtype = torch.float16
-
     q = torch.rand(1, B, n_heads, K, device=device, dtype=dtype)
     if multiquery:
         k = torch.rand(
@@ -136,12 +152,7 @@ def mem_eff_attention_decoder(
 
         fn = partial(xformers.ops.memory_efficient_attention_forward, op=fw_op)
 
-        out = fn(q, k, v, attn_bias=bias, op=fw_op)
-        
-        inputs_size = xformers.profiler.slow_ops_profiler.get_size([q, k, v, bias])
-        outputs_size = xformers.profiler.slow_ops_profiler.get_size([out])
-
-        sizes_label = f"read-{inputs_size//1024}k-write-{outputs_size//1024}k"
+        mem_size = get_memory_traffic(fw_op, q, k, v, bias)
 
         yield benchmark.Timer(
             stmt=f"fn(q, k, v, attn_bias)",
@@ -154,7 +165,7 @@ def mem_eff_attention_decoder(
             },
             label="attention",
             description=fw_op.NAME,
-            sub_label=f"{sub_label}_{sizes_label}",
+            sub_label=f"{sub_label}_{mem_size//1024}k",
             num_threads=num_threads,
         )
 
@@ -168,7 +179,7 @@ def mem_eff_attention_decoder(
             },
             label="cuda graphed attention",
             description=fw_op.NAME,
-            sub_label=f"{sub_label}_{sizes_label}",
+            sub_label=f"{sub_label}_{mem_size//1024}k",
             num_threads=num_threads,
         )
 

From c5470e4872945c047a9aa5dddd9183d8396f7461 Mon Sep 17 00:00:00 2001
From: Max Podkorytov <4273004+tenpercent@users.noreply.github.com>
Date: Tue, 17 Oct 2023 16:07:14 -0400
Subject: [PATCH 174/641] modify standalone launch mode to accept input options

---
 .../hip_fmha/attention_forward_decoder.cpp    | 110 +++++++++++++++---
 1 file changed, 93 insertions(+), 17 deletions(-)

diff --git a/xformers/csrc/attention/hip_fmha/attention_forward_decoder.cpp b/xformers/csrc/attention/hip_fmha/attention_forward_decoder.cpp
index f6635bb98..a98fbe804 100644
--- a/xformers/csrc/attention/hip_fmha/attention_forward_decoder.cpp
+++ b/xformers/csrc/attention/hip_fmha/attention_forward_decoder.cpp
@@ -385,12 +385,13 @@ void update_max_dynamic_shared_memory_size_bytes(
       AT_DISPATCH_CASE_3(SCALARTYPE1, SCALARTYPE2, SCALARTYPE3, __VA_ARGS__))
 
 template <int32_t ThreadsPerWavefront, int32_t WavefrontsPerBlock>
-at::Tensor efficient_attention_forward_decoder_ck_impl(
+at::Tensor& efficient_attention_forward_decoder_ck_out_impl(
     const at::Tensor& XQ, // [B, 1, H, D]
     const at::Tensor& cache_K, // [B, T_MAX, H or 1, D]
     const at::Tensor& cache_V, // [B, T_MAX, H or 1, D]
     const at::Tensor& seq_positions, // [B]
-    double qk_scale) {
+    double qk_scale,
+    at::Tensor& O) {
   static_assert(4 * ThreadsPerWavefront == D_H, "");
   static_assert(WavefrontsPerBlock <= ThreadsPerWavefront, "");
 
@@ -404,7 +405,6 @@ at::Tensor efficient_attention_forward_decoder_ck_impl(
   TORCH_CHECK(cache_K.size(1) <= T_MAX);
   TORCH_CHECK(cache_K.size(3) == D_H);
 
-  auto O = at::empty_like(XQ);
   auto B = XQ.size(0);
   auto H = XQ.size(2);
   dim3 blocks(B, H);
@@ -443,6 +443,20 @@ at::Tensor efficient_attention_forward_decoder_ck_impl(
 #undef AT_DISPATCH_CASE_3
 #undef AT_DISPATCH_SWITCH_3
 
+template <int32_t ThreadsPerWavefront, int32_t WavefrontsPerBlock>
+at::Tensor efficient_attention_forward_decoder_ck_impl(
+    const at::Tensor& XQ, // [B, 1, H, D]
+    const at::Tensor& cache_K, // [B, T_MAX, H or 1, D]
+    const at::Tensor& cache_V, // [B, T_MAX, H or 1, D]
+    const at::Tensor& seq_positions, // [B]
+    double qk_scale) {
+  auto O = at::empty_like(XQ);
+  efficient_attention_forward_decoder_ck_out_impl<ThreadsPerWavefront, WavefrontsPerBlock>(
+    XQ, cache_K, cache_V, seq_positions, qk_scale, O
+  );
+  return O;
+}
+
 at::Tensor efficient_attention_forward_decoder_ck(
     const at::Tensor& XQ, // [B, 1, H, D]
     const at::Tensor& cache_K, // [B, T_MAX, H or 1, D]
@@ -475,15 +489,11 @@ TORCH_LIBRARY_IMPL(xformers, CUDA, m) {
 -I/xformers/xformers/csrc/attention/hip_fmha \
 -I/xformers/third_party/composable_kernel/include \
 -I/xformers/third_party/composable_kernel/include/ck \
--I/xformers/third_party/composable_kernel/include/ck/tensor_operation/gpu/device
-\
--I/xformers/third_party/composable_kernel/include/ck/tensor_operation/gpu/device/impl
-\
--I/xformers/third_party/composable_kernel/include/ck/tensor_operation/gpu/element
-\
+-I/xformers/third_party/composable_kernel/include/ck/tensor_operation/gpu/device \
+-I/xformers/third_party/composable_kernel/include/ck/tensor_operation/gpu/device/impl \
+-I/xformers/third_party/composable_kernel/include/ck/tensor_operation/gpu/element \
 -I/opt/conda/envs/py_3.8/lib/python3.8/site-packages/torch/include \
--I/opt/conda/envs/py_3.8/lib/python3.8/site-packages/torch/include/torch/csrc/api/include
-\
+-I/opt/conda/envs/py_3.8/lib/python3.8/site-packages/torch/include/torch/csrc/api/include \
 -I/opt/conda/envs/py_3.8/lib/python3.8/site-packages/torch/include/TH \
 -I/opt/conda/envs/py_3.8/lib/python3.8/site-packages/torch/include/THC \
 -I/opt/conda/envs/py_3.8/lib/python3.8/site-packages/torch/include/THH \
@@ -524,14 +534,17 @@ TORCH_LIBRARY_IMPL(xformers, CUDA, m) {
 -lamdhip64 \
 -o a.out
 
-(3) run
- >
-LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/conda/envs/py_3.8/lib/python3.8/site-packages/torch/lib
-./a.out
+(3a) run correctness check
+ > LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/conda/envs/py_3.8/lib/python3.8/site-packages/torch/lib \
+ ./a.out
+
+(3b) run specific input shape
+ > LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/conda/envs/py_3.8/lib/python3.8/site-packages/torch/lib \
+ ./a.out n_keys padding batch_size n_heads is_multiquery dtype n_wavefronts_per_block
 */
 
-int main(int argc, char** argv) {
-  const int32_t D = 256;
+static void do_correctness_check() {
+  const int32_t D = 4 * kThreadsPerWavefront;
   const int32_t B = 1;
   const int32_t H = 4;
   auto options = torch::TensorOptions()
@@ -556,6 +569,69 @@ int main(int argc, char** argv) {
   printf(
       "Mismatched elements percentage: %.2f\n",
       1. - percent_match.item<float>());
+}
+
+int main(int argc, char** argv) {
+  if (argc == 1) {
+    do_correctness_check();
+  } else {
+    const auto args = std::vector<std::string>(argv + 1, argv + argc);
+    if (args.size() != 7) {
+      std::cout << "Usage: ./a.out n_keys padding batch_size n_heads is_multiquery dtype n_wavefronts_per_block" << std::endl;
+      return 0;
+    }
+    const int32_t n_keys = std::stoi(args[0]);
+    const int32_t padding = std::stoi(args[1]);
+    const int32_t batch_size = std::stoi(args[2]);
+    const int32_t n_heads = std::stoi(args[3]);
+    const int32_t multiquery = (args[4] == "mq");
+    const auto dtype = (args[5] == "f32") ? torch::kFloat32 : (args[5] == "f16") ? torch::kFloat16 : torch::kBFloat16;
+    const int32_t n_wavefronts_per_block = std::stoi(args[6]);
+    
+    const int32_t dim_per_head = 4 * kThreadsPerWavefront;
+
+    const auto options = torch::TensorOptions()
+                     .dtype(dtype)
+                     .layout(torch::kStrided)
+                     .device(torch::kCUDA, 1)
+                     .requires_grad(false);
+
+    const auto int_options = options.dtype(torch::kInt);  
+    const auto Q = at::rand({1, batch_size, n_heads, dim_per_head}, options);
+    const auto K = multiquery 
+      ? at::rand({1, batch_size * padding, 1, dim_per_head}, options).expand({1, batch_size * padding, n_heads, dim_per_head})
+      : at::rand({1, batch_size * padding, 1, dim_per_head}, options);
+    const auto V = at::rand_like(K);
+    auto O = at::rand_like(Q);
+
+    const auto seq = at::randint(1, n_keys, {batch_size}, int_options);
+    const double qk_scale = 1. / sqrt(dim_per_head);
+    auto call_ptr = decltype(&efficient_attention_forward_decoder_ck_out_impl<kThreadsPerWavefront, kWavefrontsPerBlock>) {};
+    
+    #define SWITCH_CASE_SET_CALLPTR(n) \
+    case (n): \
+      call_ptr = &efficient_attention_forward_decoder_ck_out_impl<kThreadsPerWavefront, (n)>; \
+      break; 
+
+    switch(n_wavefronts_per_block) {
+      SWITCH_CASE_SET_CALLPTR(1);
+      SWITCH_CASE_SET_CALLPTR(2);
+      SWITCH_CASE_SET_CALLPTR(4);
+      SWITCH_CASE_SET_CALLPTR(8);
+      SWITCH_CASE_SET_CALLPTR(16);
+
+      default:
+        call_ptr = nullptr;
+        break;
+    }
+    #undef SWITCH_CASE_SET_CALLPTR
+
+    if (call_ptr) {
+      call_ptr(Q, K, V, seq, qk_scale, O);
+    } else {
+      std::cout << "Warning: no kernel was found for wavefronts_per_block=" << n_wavefronts_per_block << std::endl;
+    }
+  }
   return 0;
 }
 

From aab1bb85876856eba813b0b30b00ec4c516b2cc3 Mon Sep 17 00:00:00 2001
From: Max Podkorytov <4273004+tenpercent@users.noreply.github.com>
Date: Tue, 17 Oct 2023 16:43:16 -0400
Subject: [PATCH 175/641] set wavefronts per block to 16 as this seems to be
 strictly better than anything less

wpb=32 doesn't work because mi200 hardware doesn't support more than 1024 threads per block

```
Times are in microseconds (us).

[-------------------------- attention ---------------------------]
                                        |    ckF    |  ck_decoderF
1 threads: -------------------------------------------------------
      3batch-1keys-8heads-mq_28k        |     96.4  |      109.4
      3batch-1keys-8heads_56k           |    109.4  |      103.2
      3batch-1keys-16heads-mq_52k       |    109.7  |      112.9
      3batch-1keys-16heads_112k         |    111.3  |      103.0
      3batch-1keys-64heads-mq_196k      |    166.6  |      114.1
      3batch-1keys-64heads_448k         |    169.5  |      103.5
      500batch-7keys-8heads-mq_12412k   |   2997.9  |      238.7
      500batch-7keys-8heads_71296k      |   3248.9  |      224.1
      500batch-7keys-16heads-mq_16412k  |   5496.0  |      472.3
      500batch-7keys-16heads_142592k    |   6113.5  |      441.1
      500batch-7keys-64heads-mq_40412k  |  21284.6  |     1889.1
      500batch-7keys-64heads_570368k    |  22773.1  |     1815.3
      2batch-543keys-8heads-mq_627k     |    332.3  |      110.2
      2batch-543keys-8heads_4904k       |    342.9  |      102.4
      2batch-543keys-16heads-mq_643k    |    333.3  |      109.5
      2batch-543keys-16heads_9808k      |    341.2  |      102.9
      2batch-543keys-64heads-mq_739k    |    413.6  |      110.2
      2batch-543keys-64heads_39232k     |    474.9  |      105.8
      1batch-5543keys-8heads-mq_5551k   |   2770.3  |      218.6
      1batch-5543keys-8heads_44352k     |   2772.2  |      246.4
      1batch-5543keys-16heads-mq_5559k  |   2768.3  |      217.7
      1batch-5543keys-16heads_88704k    |   2811.3  |      249.2
      1batch-5543keys-64heads-mq_5607k  |   3361.0  |      217.9
      1batch-5543keys-64heads_354816k   |   3997.4  |      313.5
      32batch-103keys-8heads-mq_4666k   |    421.7  |      111.1
      32batch-103keys-8heads_35536k     |    451.4  |      101.2
      32batch-103keys-16heads-mq_4922k  |    682.9  |      109.3
      32batch-103keys-16heads_71072k    |    741.5  |      103.9
      32batch-103keys-64heads-mq_6458k  |   2366.3  |      242.8
      32batch-103keys-64heads_284288k   |   2673.1  |      467.5
      4batch-1127keys-8heads-mq_4775k   |    755.2  |      111.4
      4batch-1127keys-8heads_37976k     |    780.0  |      104.9
      4batch-1127keys-16heads-mq_4807k  |    825.1  |      109.2
      4batch-1127keys-16heads_75952k    |    965.7  |      103.4
      4batch-1127keys-64heads-mq_4999k  |   2248.4  |      185.6
      4batch-1127keys-64heads_303808k   |   2607.5  |      319.4
      1batch-7271keys-8heads-mq_7279k   |   3585.6  |      291.2
      1batch-7271keys-8heads_58176k     |   3575.9  |      320.7
      1batch-7271keys-16heads-mq_7287k  |   3584.6  |      290.9
      1batch-7271keys-16heads_116352k   |   3628.6  |      322.3
      1batch-7271keys-64heads-mq_7335k  |   4353.3  |      288.0
      1batch-7271keys-64heads_465408k   |   5175.1  |      412.4

Times are in microseconds (us).

[-------------------- cuda graphed attention --------------------]
                                        |    ckF    |  ck_decoderF
1 threads: -------------------------------------------------------
      3batch-1keys-8heads-mq_28k        |     87.2  |       13.6
      3batch-1keys-8heads_56k           |     85.8  |       13.4
      3batch-1keys-16heads-mq_52k       |     86.6  |       13.2
      3batch-1keys-16heads_112k         |     85.7  |       13.6
      3batch-1keys-64heads-mq_196k      |    165.3  |       17.8
      3batch-1keys-64heads_448k         |    169.0  |       17.7
      500batch-7keys-8heads-mq_12412k   |   3145.6  |      242.8
      500batch-7keys-8heads_71296k      |   3183.4  |      228.9
      500batch-7keys-16heads-mq_16412k  |   5516.5  |      480.3
      500batch-7keys-16heads_142592k    |   6015.4  |      445.5
      500batch-7keys-64heads-mq_40412k  |  21194.5  |     1888.4
      500batch-7keys-64heads_570368k    |  22632.4  |     1815.8
      2batch-543keys-8heads-mq_627k     |    330.9  |       34.2
      2batch-543keys-8heads_4904k       |    340.1  |       35.0
      2batch-543keys-16heads-mq_643k    |    331.3  |       34.2
      2batch-543keys-16heads_9808k      |    341.8  |       36.8
      2batch-543keys-64heads-mq_739k    |    413.5  |       59.9
      2batch-543keys-64heads_39232k     |    474.7  |       69.1
      1batch-5543keys-8heads-mq_5551k   |   2766.0  |      222.7
      1batch-5543keys-8heads_44352k     |   2769.9  |      250.1
      1batch-5543keys-16heads-mq_5559k  |   2765.5  |      222.3
      1batch-5543keys-16heads_88704k    |   2812.5  |      253.3
      1batch-5543keys-64heads-mq_5607k  |   3360.6  |      222.4
      1batch-5543keys-64heads_354816k   |   3996.1  |      314.2
      32batch-103keys-8heads-mq_4666k   |    421.4  |       44.7
      32batch-103keys-8heads_35536k     |    452.7  |       53.5
      32batch-103keys-16heads-mq_4922k  |    681.8  |       72.0
      32batch-103keys-16heads_71072k    |    743.4  |       88.6
      32batch-103keys-64heads-mq_6458k  |   2367.6  |      247.2
      32batch-103keys-64heads_284288k   |   2666.3  |      476.3
      4batch-1127keys-8heads-mq_4775k   |    755.6  |       68.7
      4batch-1127keys-8heads_37976k     |    788.4  |       73.9
      4batch-1127keys-16heads-mq_4807k  |    825.4  |       69.4
      4batch-1127keys-16heads_75952k    |    964.9  |       79.0
      4batch-1127keys-64heads-mq_4999k  |   2246.2  |      190.2
      4batch-1127keys-64heads_303808k   |   2600.2  |      324.3
      1batch-7271keys-8heads-mq_7279k   |   3583.5  |      296.4
      1batch-7271keys-8heads_58176k     |   3573.8  |      325.0
      1batch-7271keys-16heads-mq_7287k  |   3578.7  |      292.8
      1batch-7271keys-16heads_116352k   |   3627.5  |      331.0
      1batch-7271keys-64heads-mq_7335k  |   4353.3  |      293.2
      1batch-7271keys-64heads_465408k   |   5177.3  |      414.8

Times are in microseconds (us).
```
---
 .../csrc/attention/hip_fmha/attention_forward_decoder.cpp    | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/xformers/csrc/attention/hip_fmha/attention_forward_decoder.cpp b/xformers/csrc/attention/hip_fmha/attention_forward_decoder.cpp
index a98fbe804..cee82dde5 100644
--- a/xformers/csrc/attention/hip_fmha/attention_forward_decoder.cpp
+++ b/xformers/csrc/attention/hip_fmha/attention_forward_decoder.cpp
@@ -37,7 +37,7 @@ __device__ void inner_product<bhalf4_t, bhalf4_t, float>(
 namespace {
 
 constexpr int32_t kThreadsPerWavefront = 64;
-constexpr int32_t kWavefrontsPerBlock = 8;
+constexpr int32_t kWavefrontsPerBlock = 16;
 constexpr int32_t D_H = 4 * kThreadsPerWavefront;
 constexpr int32_t T_MAX = 8192;
 
@@ -276,9 +276,10 @@ __global__ void efficient_attention_forward_decoder_ck_kernel(
   softmax_denominator = wavefrontReduce(
       softmax_denominator, [](float a, float b) { return a + b; });
 
+  const double softmax_scale_factor = 1. / softmax_denominator;
   // now, compute the normalization across all threads.
   for (int32_t t = thread_linear_idx; t < t_max; t += threads_per_block) {
-    smem[t] = expf(smem[t] - max_qk_acc) / softmax_denominator;
+    smem[t] = expf(smem[t] - max_qk_acc) * softmax_scale_factor;
   }
   __syncthreads();
 

From 21c569d780b25d07a94558070181d038ac073e8b Mon Sep 17 00:00:00 2001
From: Max Podkorytov <4273004+tenpercent@users.noreply.github.com>
Date: Tue, 17 Oct 2023 17:37:08 -0400
Subject: [PATCH 176/641] set loop unroll = 16 for loading k and v by 4-element
 chunks

```
Times are in microseconds (us).

[-------------------------- attention ---------------------------]
                                        |    ckF    |  ck_decoderF
1 threads: -------------------------------------------------------
      3batch-1keys-8heads-mq_28k        |     84.2  |       90.0
      3batch-1keys-8heads_56k           |     88.5  |       82.6
      3batch-1keys-16heads-mq_52k       |     89.5  |       89.3
      3batch-1keys-16heads_112k         |     89.5  |       83.7
      3batch-1keys-64heads-mq_196k      |    163.8  |       91.2
      3batch-1keys-64heads_448k         |    168.8  |       87.1
      500batch-7keys-8heads-mq_12412k   |   3004.0  |      239.3
      500batch-7keys-8heads_71296k      |   3100.3  |      225.5
      500batch-7keys-16heads-mq_16412k  |   5573.5  |      474.1
      500batch-7keys-16heads_142592k    |   5854.0  |      443.9
      500batch-7keys-64heads-mq_40412k  |  20998.9  |     1892.5
      500batch-7keys-64heads_570368k    |  22455.8  |     1820.7
      2batch-543keys-8heads-mq_627k     |    331.9  |       89.7
      2batch-543keys-8heads_4904k       |    343.8  |       85.7
      2batch-543keys-16heads-mq_643k    |    329.5  |       94.2
      2batch-543keys-16heads_9808k      |    342.6  |       84.4
      2batch-543keys-64heads-mq_739k    |    416.0  |       88.7
      2batch-543keys-64heads_39232k     |    472.5  |       83.6
      1batch-5543keys-8heads-mq_5551k   |   2756.4  |      206.7
      1batch-5543keys-8heads_44352k     |   2769.7  |      229.5
      1batch-5543keys-16heads-mq_5559k  |   2758.3  |      205.8
      1batch-5543keys-16heads_88704k    |   2812.1  |      231.5
      1batch-5543keys-64heads-mq_5607k  |   3361.0  |      205.7
      1batch-5543keys-64heads_354816k   |   3997.1  |      309.1
      32batch-103keys-8heads-mq_4666k   |    417.4  |       91.0
      32batch-103keys-8heads_35536k     |    452.1  |       84.3
      32batch-103keys-16heads-mq_4922k  |    681.0  |       90.5
      32batch-103keys-16heads_71072k    |    739.7  |       92.8
      32batch-103keys-64heads-mq_6458k  |   2361.1  |      266.8
      32batch-103keys-64heads_284288k   |   2665.7  |      458.7
      4batch-1127keys-8heads-mq_4775k   |    744.7  |       91.0
      4batch-1127keys-8heads_37976k     |    775.4  |       85.9
      4batch-1127keys-16heads-mq_4807k  |    823.8  |       90.5
      4batch-1127keys-16heads_75952k    |    963.7  |       86.3
      4batch-1127keys-64heads-mq_4999k  |   2245.7  |      180.7
      4batch-1127keys-64heads_303808k   |   2598.2  |      331.0
      1batch-7271keys-8heads-mq_7279k   |   3561.0  |      271.6
      1batch-7271keys-8heads_58176k     |   3575.8  |      292.2
      1batch-7271keys-16heads-mq_7287k  |   3581.9  |      269.7
      1batch-7271keys-16heads_116352k   |   3636.7  |      295.5
      1batch-7271keys-64heads-mq_7335k  |   4351.9  |      269.3
      1batch-7271keys-64heads_465408k   |   5177.1  |      384.2

Times are in microseconds (us).

[-------------------- cuda graphed attention --------------------]
                                        |    ckF    |  ck_decoderF
1 threads: -------------------------------------------------------
      3batch-1keys-8heads-mq_28k        |     86.9  |       13.3
      3batch-1keys-8heads_56k           |     86.9  |       13.3
      3batch-1keys-16heads-mq_52k       |     86.5  |       13.3
      3batch-1keys-16heads_112k         |     88.4  |       13.3
      3batch-1keys-64heads-mq_196k      |    164.7  |       17.7
      3batch-1keys-64heads_448k         |    168.9  |       17.9
      500batch-7keys-8heads-mq_12412k   |   2999.4  |      244.2
      500batch-7keys-8heads_71296k      |   3102.8  |      230.6
      500batch-7keys-16heads-mq_16412k  |   5563.8  |      478.8
      500batch-7keys-16heads_142592k    |   5849.0  |      448.5
      500batch-7keys-64heads-mq_40412k  |  20937.4  |     1896.1
      500batch-7keys-64heads_570368k    |  22384.2  |     1825.3
      2batch-543keys-8heads-mq_627k     |    329.2  |       34.1
      2batch-543keys-8heads_4904k       |    341.2  |       35.1
      2batch-543keys-16heads-mq_643k    |    330.1  |       34.1
      2batch-543keys-16heads_9808k      |    343.5  |       36.8
      2batch-543keys-64heads-mq_739k    |    412.7  |       60.0
      2batch-543keys-64heads_39232k     |    473.5  |       69.6
      1batch-5543keys-8heads-mq_5551k   |   2759.0  |      211.4
      1batch-5543keys-8heads_44352k     |   2769.7  |      232.8
      1batch-5543keys-16heads-mq_5559k  |   2796.2  |      211.0
      1batch-5543keys-16heads_88704k    |   2812.2  |      234.9
      1batch-5543keys-64heads-mq_5607k  |   3358.5  |      211.0
      1batch-5543keys-64heads_354816k   |   3998.3  |      310.8
      32batch-103keys-8heads-mq_4666k   |    418.8  |       48.2
      32batch-103keys-8heads_35536k     |    450.3  |       59.4
      32batch-103keys-16heads-mq_4922k  |    683.6  |       78.2
      32batch-103keys-16heads_71072k    |    740.4  |       98.0
      32batch-103keys-64heads-mq_6458k  |   2363.7  |      271.4
      32batch-103keys-64heads_284288k   |   2665.3  |      460.0
      4batch-1127keys-8heads-mq_4775k   |    745.7  |       67.6
      4batch-1127keys-8heads_37976k     |    776.8  |       74.8
      4batch-1127keys-16heads-mq_4807k  |    824.2  |       67.7
      4batch-1127keys-16heads_75952k    |    963.0  |       89.5
      4batch-1127keys-64heads-mq_4999k  |   2246.0  |      185.3
      4batch-1127keys-64heads_303808k   |   2598.0  |      336.3
      1batch-7271keys-8heads-mq_7279k   |   3573.1  |      276.6
      1batch-7271keys-8heads_58176k     |   3577.0  |      296.0
      1batch-7271keys-16heads-mq_7287k  |   3572.6  |      278.4
      1batch-7271keys-16heads_116352k   |   3634.4  |      299.3
      1batch-7271keys-64heads-mq_7335k  |   4353.4  |      274.2
      1batch-7271keys-64heads_465408k   |   5172.3  |      384.6

Times are in microseconds (us).
```
---
 xformers/csrc/attention/hip_fmha/attention_forward_decoder.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/xformers/csrc/attention/hip_fmha/attention_forward_decoder.cpp b/xformers/csrc/attention/hip_fmha/attention_forward_decoder.cpp
index cee82dde5..a3d657354 100644
--- a/xformers/csrc/attention/hip_fmha/attention_forward_decoder.cpp
+++ b/xformers/csrc/attention/hip_fmha/attention_forward_decoder.cpp
@@ -124,7 +124,7 @@ __forceinline__ __device__ void store_v(
 
 template <
     typename scalar_t,
-    int32_t n_loop_unroll = 4,
+    int32_t n_loop_unroll = 16,
     int32_t n_loop_unroll_tail = 2>
 __global__ void efficient_attention_forward_decoder_ck_kernel(
     at::PackedTensorAccessor32<scalar_t, 4, at::RestrictPtrTraits> XQ,

From f8dba5a540df808975f02d4767d213f53a0dffa8 Mon Sep 17 00:00:00 2001
From: Max Podkorytov <4273004+tenpercent@users.noreply.github.com>
Date: Tue, 17 Oct 2023 17:53:26 -0400
Subject: [PATCH 177/641] add comment about how to get assembly

---
 xformers/csrc/attention/hip_fmha/attention_forward_decoder.cpp | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/xformers/csrc/attention/hip_fmha/attention_forward_decoder.cpp b/xformers/csrc/attention/hip_fmha/attention_forward_decoder.cpp
index a3d657354..237516ab5 100644
--- a/xformers/csrc/attention/hip_fmha/attention_forward_decoder.cpp
+++ b/xformers/csrc/attention/hip_fmha/attention_forward_decoder.cpp
@@ -535,6 +535,8 @@ TORCH_LIBRARY_IMPL(xformers, CUDA, m) {
 -lamdhip64 \
 -o a.out
 
+For assembly debugging, add `--save-temps -g`.
+
 (3a) run correctness check
  > LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/conda/envs/py_3.8/lib/python3.8/site-packages/torch/lib \
  ./a.out

From c5406c231477452592bde1242dd652deaa2b7dbd Mon Sep 17 00:00:00 2001
From: Max Podkorytov <4273004+tenpercent@users.noreply.github.com>
Date: Tue, 17 Oct 2023 18:37:17 -0400
Subject: [PATCH 178/641] vectorize register->smem storing of qk inner products

---
 .../hip_fmha/attention_forward_decoder.cpp    | 22 ++++++++++---------
 1 file changed, 12 insertions(+), 10 deletions(-)

diff --git a/xformers/csrc/attention/hip_fmha/attention_forward_decoder.cpp b/xformers/csrc/attention/hip_fmha/attention_forward_decoder.cpp
index 237516ab5..8cc79ad7c 100644
--- a/xformers/csrc/attention/hip_fmha/attention_forward_decoder.cpp
+++ b/xformers/csrc/attention/hip_fmha/attention_forward_decoder.cpp
@@ -178,7 +178,7 @@ __global__ void efficient_attention_forward_decoder_ck_kernel(
 
   data_vec4_t k_loads[n_loop_unroll];
 
-  const auto dtt = wavefronts_per_block * n_loop_unroll;
+  constexpr auto dtt = kWavefrontsPerBlock * n_loop_unroll;
   const int32_t t_max_unroll = (t_max / dtt) * dtt;
 
   for (auto tt = wavefront_idx * n_loop_unroll; tt < t_max_unroll; tt += dtt) {
@@ -189,21 +189,23 @@ __global__ void efficient_attention_forward_decoder_ck_kernel(
       load_v<decltype(cache_K_base), data_vec4_t>(
           cache_K_base + t * cache_K.stride(1), lane_idx, &k_loads[ttt]);
     }
+    float qk_accs[n_loop_unroll] = {};
 #pragma unroll n_loop_unroll
     for (auto ttt = 0; ttt < n_loop_unroll; ++ttt) {
-      float qk_acc = 0;
       const int32_t t = tt + ttt;
 
       ck::inner_product<data_vec4_t, data_vec4_t, float>(
-          q_thread, k_loads[ttt], qk_acc);
-      qk_acc *= qk_scale;
-
-      qk_acc = wavefrontReduce(qk_acc, [](float a, float b) { return a + b; });
-      max_qk_acc = max(qk_acc, max_qk_acc);
+          q_thread, k_loads[ttt], qk_accs[ttt]);
+      qk_accs[ttt] *= qk_scale;
 
-      // write accumulated sums to smem.
-      if (lane_idx == 0) {
-        smem[t] = qk_acc;
+      qk_accs[ttt] = wavefrontReduce(qk_accs[ttt], [](float a, float b) { return a + b; });
+      max_qk_acc = max(qk_accs[ttt], max_qk_acc);
+    }
+    if (lane_idx == 0) {
+      auto* smem_base = smem + tt;
+ #pragma unroll n_loop_unroll
+      for (auto ttt = 0; ttt < n_loop_unroll; ++ttt) {
+        smem_base[ttt] = qk_accs[ttt];
       }
     }
   }

From cb86fa7e13721a19286dac4cb6ebfabe1a66bbfb Mon Sep 17 00:00:00 2001
From: Max Podkorytov <4273004+tenpercent@users.noreply.github.com>
Date: Tue, 17 Oct 2023 18:51:26 -0400
Subject: [PATCH 179/641] remove unused index

---
 xformers/csrc/attention/hip_fmha/attention_forward_decoder.cpp | 2 --
 1 file changed, 2 deletions(-)

diff --git a/xformers/csrc/attention/hip_fmha/attention_forward_decoder.cpp b/xformers/csrc/attention/hip_fmha/attention_forward_decoder.cpp
index 8cc79ad7c..a6f12f402 100644
--- a/xformers/csrc/attention/hip_fmha/attention_forward_decoder.cpp
+++ b/xformers/csrc/attention/hip_fmha/attention_forward_decoder.cpp
@@ -192,8 +192,6 @@ __global__ void efficient_attention_forward_decoder_ck_kernel(
     float qk_accs[n_loop_unroll] = {};
 #pragma unroll n_loop_unroll
     for (auto ttt = 0; ttt < n_loop_unroll; ++ttt) {
-      const int32_t t = tt + ttt;
-
       ck::inner_product<data_vec4_t, data_vec4_t, float>(
           q_thread, k_loads[ttt], qk_accs[ttt]);
       qk_accs[ttt] *= qk_scale;

From 9337801c44471a01b84041cd077e4e0635907c94 Mon Sep 17 00:00:00 2001
From: Max Podkorytov <4273004+tenpercent@users.noreply.github.com>
Date: Wed, 18 Oct 2023 00:00:57 -0400
Subject: [PATCH 180/641] remove internal double, replace expf with intrinsic

---
 .../csrc/attention/hip_fmha/attention_forward_decoder.cpp   | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/xformers/csrc/attention/hip_fmha/attention_forward_decoder.cpp b/xformers/csrc/attention/hip_fmha/attention_forward_decoder.cpp
index a6f12f402..cd1834831 100644
--- a/xformers/csrc/attention/hip_fmha/attention_forward_decoder.cpp
+++ b/xformers/csrc/attention/hip_fmha/attention_forward_decoder.cpp
@@ -257,7 +257,7 @@ __global__ void efficient_attention_forward_decoder_ck_kernel(
   // each wavefront computes partial sum of exp.
   float softmax_denominator = 0.0f;
   for (int32_t t = thread_linear_idx; t < t_max; t += threads_per_block) {
-    softmax_denominator += expf(smem[t] - max_qk_acc);
+    softmax_denominator += __expf(smem[t] - max_qk_acc);
   }
   softmax_denominator = wavefrontReduce(
       softmax_denominator, [](float a, float b) { return a + b; });
@@ -276,10 +276,10 @@ __global__ void efficient_attention_forward_decoder_ck_kernel(
   softmax_denominator = wavefrontReduce(
       softmax_denominator, [](float a, float b) { return a + b; });
 
-  const double softmax_scale_factor = 1. / softmax_denominator;
+  const float softmax_scale_factor = 1. / softmax_denominator;
   // now, compute the normalization across all threads.
   for (int32_t t = thread_linear_idx; t < t_max; t += threads_per_block) {
-    smem[t] = expf(smem[t] - max_qk_acc) * softmax_scale_factor;
+    smem[t] = __expf(smem[t] - max_qk_acc) * softmax_scale_factor;
   }
   __syncthreads();
 

From ae47ed3d8d85fb3d912a22fadbb80f033ffd1e3b Mon Sep 17 00:00:00 2001
From: Max Podkorytov <4273004+tenpercent@users.noreply.github.com>
Date: Wed, 18 Oct 2023 14:11:14 -0400
Subject: [PATCH 181/641] fix standalone shapes

---
 .../csrc/attention/hip_fmha/attention_forward_decoder.cpp   | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/xformers/csrc/attention/hip_fmha/attention_forward_decoder.cpp b/xformers/csrc/attention/hip_fmha/attention_forward_decoder.cpp
index cd1834831..ca05f694a 100644
--- a/xformers/csrc/attention/hip_fmha/attention_forward_decoder.cpp
+++ b/xformers/csrc/attention/hip_fmha/attention_forward_decoder.cpp
@@ -600,10 +600,10 @@ int main(int argc, char** argv) {
                      .requires_grad(false);
 
     const auto int_options = options.dtype(torch::kInt);  
-    const auto Q = at::rand({1, batch_size, n_heads, dim_per_head}, options);
+    const auto Q = at::rand({batch_size, 1, n_heads, dim_per_head}, options);
     const auto K = multiquery 
-      ? at::rand({1, batch_size * padding, 1, dim_per_head}, options).expand({1, batch_size * padding, n_heads, dim_per_head})
-      : at::rand({1, batch_size * padding, 1, dim_per_head}, options);
+      ? at::rand({batch_size, padding, 1, dim_per_head}, options).expand({batch_size, padding, n_heads, dim_per_head})
+      : at::rand({batch_size, padding, n_heads, dim_per_head}, options);
     const auto V = at::rand_like(K);
     auto O = at::rand_like(Q);
 

From 0ff6b03d83dcdd28e06b1fc17676b1ba8aedee0f Mon Sep 17 00:00:00 2001
From: Max Podkorytov <4273004+tenpercent@users.noreply.github.com>
Date: Wed, 25 Oct 2023 16:39:16 -0400
Subject: [PATCH 182/641] update instruction

---
 xformers/csrc/attention/hip_fmha/attention_forward_decoder.cpp | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/xformers/csrc/attention/hip_fmha/attention_forward_decoder.cpp b/xformers/csrc/attention/hip_fmha/attention_forward_decoder.cpp
index ca05f694a..29d5c157d 100644
--- a/xformers/csrc/attention/hip_fmha/attention_forward_decoder.cpp
+++ b/xformers/csrc/attention/hip_fmha/attention_forward_decoder.cpp
@@ -484,6 +484,9 @@ TORCH_LIBRARY_IMPL(xformers, CUDA, m) {
 
 (1) hipify
  > pip install -e /xformers
+
+ For obtaining all the library paths needed for compilation below, add `--verbose`.
+ 
 (2) compile
  > /opt/rocm/bin/hipcc \
 -I/xformers/xformers/csrc \

From b84dbec57cd1b34b591887f5a453db71819283e0 Mon Sep 17 00:00:00 2001
From: Max Podkorytov <4273004+tenpercent@users.noreply.github.com>
Date: Thu, 2 Nov 2023 15:27:00 -0400
Subject: [PATCH 183/641] refactor tensor accessor out of the kernel arguments
 (currently it breaks one of benchmark cases for some reason; tests are good)

---
 .../hip_fmha/attention_forward_decoder.cpp    | 61 +++++++++++++------
 1 file changed, 41 insertions(+), 20 deletions(-)

diff --git a/xformers/csrc/attention/hip_fmha/attention_forward_decoder.cpp b/xformers/csrc/attention/hip_fmha/attention_forward_decoder.cpp
index 29d5c157d..6dbd1f416 100644
--- a/xformers/csrc/attention/hip_fmha/attention_forward_decoder.cpp
+++ b/xformers/csrc/attention/hip_fmha/attention_forward_decoder.cpp
@@ -127,11 +127,17 @@ template <
     int32_t n_loop_unroll = 16,
     int32_t n_loop_unroll_tail = 2>
 __global__ void efficient_attention_forward_decoder_ck_kernel(
-    at::PackedTensorAccessor32<scalar_t, 4, at::RestrictPtrTraits> XQ,
-    at::PackedTensorAccessor64<scalar_t, 4, at::RestrictPtrTraits> cache_K,
-    at::PackedTensorAccessor64<scalar_t, 4, at::RestrictPtrTraits> cache_V,
-    at::PackedTensorAccessor32<scalar_t, 4, at::RestrictPtrTraits> O,
-    at::PackedTensorAccessor32<int32_t, 1, at::RestrictPtrTraits> seq_positions,
+    const scalar_t* __restrict__ XQ,
+    const scalar_t* __restrict__ cache_K,
+    const scalar_t* __restrict__ cache_V,
+    scalar_t* __restrict__ O,
+    const int32_t* __restrict__ seq_positions,
+    const int32_t XQ_stride_0,
+    const int32_t XQ_stride_2,
+    const int32_t K_stride_0,
+    const int32_t K_stride_1,
+    const int32_t K_stride_2,
+    const bool multiquery,
     const float qk_scale) {
   static_assert(n_loop_unroll_tail < n_loop_unroll, "");
 
@@ -157,11 +163,14 @@ __global__ void efficient_attention_forward_decoder_ck_kernel(
       lane_idx + wavefront_idx * threads_per_wavefront;
 
   // Need D_H == 256 (NB: 128 in CUDA because of wavefront/warp sizes 64/32)
-  const auto* q_ = &(XQ[b][0][h][0]);
+  // const auto* q_ = &(XQ[b][0][h][0]);
+  const auto* q_ = XQ + b * XQ_stride_0 + h * XQ_stride_2;
 
-  const bool multiquery = cache_K.size(2) == 1;
-  const auto* cache_K_base = &cache_K[b][0][multiquery ? 0 : h][0];
-  const auto* cache_V_base = &cache_V[b][0][multiquery ? 0 : h][0];
+  // const bool multiquery = cache_K.size(2) == 1;
+  // const auto* cache_K_base = &cache_K[b][0][multiquery ? 0 : h][0];
+  const auto* cache_K_base = cache_K + b * K_stride_0 + (multiquery ? 0 : h * K_stride_2);
+  // const auto* cache_V_base = &cache_V[b][0][multiquery ? 0 : h][0];
+  const auto* cache_V_base = cache_V + b * K_stride_0 + (multiquery ? 0 : h * K_stride_2);
 
   // Load Q into registers in all wavefronts.
   // Each thread handles 4 D dimensions
@@ -187,7 +196,7 @@ __global__ void efficient_attention_forward_decoder_ck_kernel(
       const int32_t t = tt + ttt;
       // load the K[b][t][h|0][:] row into registers
       load_v<decltype(cache_K_base), data_vec4_t>(
-          cache_K_base + t * cache_K.stride(1), lane_idx, &k_loads[ttt]);
+          cache_K_base + t * K_stride_1, lane_idx, &k_loads[ttt]);
     }
     float qk_accs[n_loop_unroll] = {};
 #pragma unroll n_loop_unroll
@@ -217,7 +226,7 @@ __global__ void efficient_attention_forward_decoder_ck_kernel(
       if (t < t_max) {
         // load the K[b][t][h|0][:] row into registers
         load_v<decltype(cache_K_base), data_vec4_t>(
-            cache_K_base + t * cache_K.stride(1), lane_idx, &k_loads[ttt]);
+            cache_K_base + t * K_stride_1, lane_idx, &k_loads[ttt]);
       }
     }
 #pragma unroll n_loop_unroll_tail
@@ -297,7 +306,7 @@ __global__ void efficient_attention_forward_decoder_ck_kernel(
       const int32_t t = tt + ttt;
       // load the V[b][t][h|0][:] row into registers, reusing K register storage
       load_v<decltype(cache_V_base), data_vec4_t>(
-          cache_V_base + t * cache_V.stride(1), lane_idx, &k_loads[ttt]);
+          cache_V_base + t * K_stride_1, lane_idx, &k_loads[ttt]);
       ps[ttt] = smem[t];
     }
 
@@ -316,7 +325,7 @@ __global__ void efficient_attention_forward_decoder_ck_kernel(
         // load the V[b][t][h|0][:] row into registers, reusing K register
         // storage
         load_v<decltype(cache_V_base), data_vec4_t>(
-            cache_V_base + t * cache_V.stride(1), lane_idx, &k_loads[ttt]);
+            cache_V_base + t * K_stride_1, lane_idx, &k_loads[ttt]);
         ps[ttt] = smem[t];
       }
     }
@@ -352,7 +361,8 @@ __global__ void efficient_attention_forward_decoder_ck_kernel(
     bf_r.y = ck::type_convert<data_t>(r.y);
     bf_r.z = ck::type_convert<data_t>(r.z);
     bf_r.w = ck::type_convert<data_t>(r.w);
-    auto* o_ = &O[b][0][h][0];
+    // auto* o_ = &O[b][0][h][0];
+    auto* o_ = O + b * XQ_stride_0 + h * XQ_stride_2;
     store_v<decltype(o_), data_vec4_t>(o_, lane_idx, bf_r);
   }
 }
@@ -427,13 +437,24 @@ at::Tensor& efficient_attention_forward_decoder_ck_out_impl(
         auto* kernel = &efficient_attention_forward_decoder_ck_kernel<scalar_t>;
         update_max_dynamic_shared_memory_size_bytes(
             reinterpret_cast<void*&>(kernel), smem_size);
+        auto XQ_acc = XQ.packed_accessor32<scalar_t, 4, at::RestrictPtrTraits>();
+        auto K_acc = cache_K.packed_accessor64<scalar_t, 4, at::RestrictPtrTraits>();
+        auto V_acc = cache_V.packed_accessor64<scalar_t, 4, at::RestrictPtrTraits>();
+        auto O_acc = O.packed_accessor64<scalar_t, 4, at::RestrictPtrTraits>();
+        auto seq_acc = seq_positions
+                .packed_accessor32<int32_t, 1, at::RestrictPtrTraits>();
         kernel<<<blocks, threads, smem_size, stream>>>(
-            XQ.packed_accessor32<scalar_t, 4, at::RestrictPtrTraits>(),
-            cache_K.packed_accessor64<scalar_t, 4, at::RestrictPtrTraits>(),
-            cache_V.packed_accessor64<scalar_t, 4, at::RestrictPtrTraits>(),
-            O.packed_accessor32<scalar_t, 4, at::RestrictPtrTraits>(),
-            seq_positions
-                .packed_accessor32<int32_t, 1, at::RestrictPtrTraits>(),
+            XQ_acc.data(),
+            K_acc.data(),
+            V_acc.data(),
+            O_acc.data(),
+            seq_acc.data(),
+            XQ_acc.stride(0),
+            XQ_acc.stride(2),
+            K_acc.stride(0),
+            K_acc.stride(1),
+            K_acc.stride(2),
+            K_acc.size(2) == 1,
             qk_scale);
         C10_CUDA_KERNEL_LAUNCH_CHECK();
       });

From 4478c25cff13562ed42cee2ad3fac4596419df8b Mon Sep 17 00:00:00 2001
From: Max Podkorytov <4273004+tenpercent@users.noreply.github.com>
Date: Thu, 2 Nov 2023 21:57:39 -0400
Subject: [PATCH 184/641] roll back kernel signature change; something
 currently unexplainable prevents from offset calculation on V tensor

---
 .../hip_fmha/attention_forward_decoder.cpp    | 69 +++++++++----------
 1 file changed, 34 insertions(+), 35 deletions(-)

diff --git a/xformers/csrc/attention/hip_fmha/attention_forward_decoder.cpp b/xformers/csrc/attention/hip_fmha/attention_forward_decoder.cpp
index 6dbd1f416..81666ae2e 100644
--- a/xformers/csrc/attention/hip_fmha/attention_forward_decoder.cpp
+++ b/xformers/csrc/attention/hip_fmha/attention_forward_decoder.cpp
@@ -127,20 +127,29 @@ template <
     int32_t n_loop_unroll = 16,
     int32_t n_loop_unroll_tail = 2>
 __global__ void efficient_attention_forward_decoder_ck_kernel(
-    const scalar_t* __restrict__ XQ,
-    const scalar_t* __restrict__ cache_K,
-    const scalar_t* __restrict__ cache_V,
-    scalar_t* __restrict__ O,
-    const int32_t* __restrict__ seq_positions,
-    const int32_t XQ_stride_0,
-    const int32_t XQ_stride_2,
-    const int32_t K_stride_0,
-    const int32_t K_stride_1,
-    const int32_t K_stride_2,
-    const bool multiquery,
+    at::PackedTensorAccessor32<scalar_t, 4, at::RestrictPtrTraits> XQ_acc,
+    at::PackedTensorAccessor64<scalar_t, 4, at::RestrictPtrTraits> cache_K_acc,
+    at::PackedTensorAccessor64<scalar_t, 4, at::RestrictPtrTraits> cache_V_acc,
+    at::PackedTensorAccessor32<scalar_t, 4, at::RestrictPtrTraits> O_acc,
+    at::PackedTensorAccessor32<int32_t, 1, at::RestrictPtrTraits> seq_positions_acc,
     const float qk_scale) {
   static_assert(n_loop_unroll_tail < n_loop_unroll, "");
 
+  const scalar_t* __restrict__ XQ = XQ_acc.data();
+  const scalar_t* __restrict__ cache_K = cache_K_acc.data();
+  const scalar_t* __restrict__ cache_V = cache_V_acc.data();
+  scalar_t* __restrict__ O = O_acc.data();
+  const int32_t* __restrict__ seq_positions = seq_positions_acc.data();
+  const int32_t XQ_stride_0 = XQ_acc.stride(0);
+  const int32_t XQ_stride_2 = XQ_acc.stride(2);
+  const int32_t K_stride_0 = cache_K_acc.stride(0);
+  const int32_t K_stride_1 = cache_K_acc.stride(1);
+  const int32_t K_stride_2 = cache_K_acc.stride(2);
+  const int32_t V_stride_0 = cache_V_acc.stride(0); // cache_V strides should be the same as cache_K strides
+  const int32_t V_stride_1 = cache_V_acc.stride(1);
+  const int32_t V_stride_2 = cache_V_acc.stride(2);
+  const bool multiquery = cache_K_acc.size(2) == 1;
+
   constexpr int32_t seq_positions_shift = 0;
 
   extern __shared__ __align__(16) float smem[];
@@ -163,14 +172,15 @@ __global__ void efficient_attention_forward_decoder_ck_kernel(
       lane_idx + wavefront_idx * threads_per_wavefront;
 
   // Need D_H == 256 (NB: 128 in CUDA because of wavefront/warp sizes 64/32)
-  // const auto* q_ = &(XQ[b][0][h][0]);
+  // const auto* q_ = &(XQ_acc[b][0][h][0]);
   const auto* q_ = XQ + b * XQ_stride_0 + h * XQ_stride_2;
 
   // const bool multiquery = cache_K.size(2) == 1;
-  // const auto* cache_K_base = &cache_K[b][0][multiquery ? 0 : h][0];
-  const auto* cache_K_base = cache_K + b * K_stride_0 + (multiquery ? 0 : h * K_stride_2);
-  // const auto* cache_V_base = &cache_V[b][0][multiquery ? 0 : h][0];
-  const auto* cache_V_base = cache_V + b * K_stride_0 + (multiquery ? 0 : h * K_stride_2);
+  // const auto* cache_K_base = &cache_K_acc[b][0][multiquery ? 0 : h][0];
+  const auto cache_KV_base_offset = b * K_stride_0 + (multiquery ? 0 : h * K_stride_2);
+  const auto* cache_K_base = cache_K + cache_KV_base_offset;
+  const auto* cache_V_base = &cache_V_acc[b][0][multiquery ? 0 : h][0];
+  // const auto* cache_V_base = cache_V + cache_KV_base_offset; // invalid memory access error
 
   // Load Q into registers in all wavefronts.
   // Each thread handles 4 D dimensions
@@ -306,7 +316,7 @@ __global__ void efficient_attention_forward_decoder_ck_kernel(
       const int32_t t = tt + ttt;
       // load the V[b][t][h|0][:] row into registers, reusing K register storage
       load_v<decltype(cache_V_base), data_vec4_t>(
-          cache_V_base + t * K_stride_1, lane_idx, &k_loads[ttt]);
+          cache_V_base + t * V_stride_1, lane_idx, &k_loads[ttt]);
       ps[ttt] = smem[t];
     }
 
@@ -325,7 +335,7 @@ __global__ void efficient_attention_forward_decoder_ck_kernel(
         // load the V[b][t][h|0][:] row into registers, reusing K register
         // storage
         load_v<decltype(cache_V_base), data_vec4_t>(
-            cache_V_base + t * K_stride_1, lane_idx, &k_loads[ttt]);
+            cache_V_base + t * V_stride_1, lane_idx, &k_loads[ttt]);
         ps[ttt] = smem[t];
       }
     }
@@ -437,24 +447,13 @@ at::Tensor& efficient_attention_forward_decoder_ck_out_impl(
         auto* kernel = &efficient_attention_forward_decoder_ck_kernel<scalar_t>;
         update_max_dynamic_shared_memory_size_bytes(
             reinterpret_cast<void*&>(kernel), smem_size);
-        auto XQ_acc = XQ.packed_accessor32<scalar_t, 4, at::RestrictPtrTraits>();
-        auto K_acc = cache_K.packed_accessor64<scalar_t, 4, at::RestrictPtrTraits>();
-        auto V_acc = cache_V.packed_accessor64<scalar_t, 4, at::RestrictPtrTraits>();
-        auto O_acc = O.packed_accessor64<scalar_t, 4, at::RestrictPtrTraits>();
-        auto seq_acc = seq_positions
-                .packed_accessor32<int32_t, 1, at::RestrictPtrTraits>();
         kernel<<<blocks, threads, smem_size, stream>>>(
-            XQ_acc.data(),
-            K_acc.data(),
-            V_acc.data(),
-            O_acc.data(),
-            seq_acc.data(),
-            XQ_acc.stride(0),
-            XQ_acc.stride(2),
-            K_acc.stride(0),
-            K_acc.stride(1),
-            K_acc.stride(2),
-            K_acc.size(2) == 1,
+            XQ.packed_accessor32<scalar_t, 4, at::RestrictPtrTraits>(),
+            cache_K.packed_accessor64<scalar_t, 4, at::RestrictPtrTraits>(),
+            cache_V.packed_accessor64<scalar_t, 4, at::RestrictPtrTraits>(),
+            O.packed_accessor32<scalar_t, 4, at::RestrictPtrTraits>(),
+            seq_positions
+                .packed_accessor32<int32_t, 1, at::RestrictPtrTraits>(),
             qk_scale);
         C10_CUDA_KERNEL_LAUNCH_CHECK();
       });

From 182273c57e3b3bae76daf09e3968650c72c6745c Mon Sep 17 00:00:00 2001
From: Max Podkorytov <4273004+tenpercent@users.noreply.github.com>
Date: Thu, 2 Nov 2023 17:55:21 -0400
Subject: [PATCH 185/641] wrap kernel call into DeviceOp api

---
 .../hip_fmha/attention_forward_decoder.cpp    | 191 +++++++++++++-----
 1 file changed, 136 insertions(+), 55 deletions(-)

diff --git a/xformers/csrc/attention/hip_fmha/attention_forward_decoder.cpp b/xformers/csrc/attention/hip_fmha/attention_forward_decoder.cpp
index 81666ae2e..664c68139 100644
--- a/xformers/csrc/attention/hip_fmha/attention_forward_decoder.cpp
+++ b/xformers/csrc/attention/hip_fmha/attention_forward_decoder.cpp
@@ -7,6 +7,9 @@
 #include <ATen/Functions.h>
 #include <ATen/Tensor.h>
 #include <c10/cuda/CUDAStream.h>
+#include <ck/host_utility/kernel_launch_hip.hpp>
+#include <ck/stream_config.hpp>
+#include <ck/tensor_operation/gpu/device/device_base.hpp>
 #include <ck/utility/data_type.hpp>
 #include <ck/utility/inner_product.hpp>
 #include <torch/library.h>
@@ -215,12 +218,13 @@ __global__ void efficient_attention_forward_decoder_ck_kernel(
           q_thread, k_loads[ttt], qk_accs[ttt]);
       qk_accs[ttt] *= qk_scale;
 
-      qk_accs[ttt] = wavefrontReduce(qk_accs[ttt], [](float a, float b) { return a + b; });
+      qk_accs[ttt] =
+          wavefrontReduce(qk_accs[ttt], [](float a, float b) { return a + b; });
       max_qk_acc = max(qk_accs[ttt], max_qk_acc);
     }
     if (lane_idx == 0) {
       auto* smem_base = smem + tt;
- #pragma unroll n_loop_unroll
+#pragma unroll n_loop_unroll
       for (auto ttt = 0; ttt < n_loop_unroll; ++ttt) {
         smem_base[ttt] = qk_accs[ttt];
       }
@@ -377,21 +381,73 @@ __global__ void efficient_attention_forward_decoder_ck_kernel(
   }
 }
 
-void update_max_dynamic_shared_memory_size_bytes(
-    void* kernel_func,
-    int32_t new_value) {
-  hipFuncAttributes attributes;
-  C10_CUDA_CHECK(hipFuncGetAttributes(&attributes, kernel_func));
-
-  const auto default_value = attributes.maxDynamicSharedSizeBytes;
+} // namespace
 
-  // printf("Default smem size: %d\n", default_value);
+namespace ck {
+namespace tensor_operation {
+namespace device {
+template <typename scalar_t>
+struct FMHADecoderSeqlen1DeviceOp : public BaseOperator {
+  using DeviceOp = FMHADecoderSeqlen1DeviceOp;
+  struct Argument : public BaseArgument {
+    at::PackedTensorAccessor32<scalar_t, 4, at::RestrictPtrTraits> XQ_acc;
+    at::PackedTensorAccessor64<scalar_t, 4, at::RestrictPtrTraits> cache_K_acc;
+    at::PackedTensorAccessor64<scalar_t, 4, at::RestrictPtrTraits> cache_V_acc;
+    at::PackedTensorAccessor32<scalar_t, 4, at::RestrictPtrTraits> O_acc;
+    at::PackedTensorAccessor32<int32_t, 1, at::RestrictPtrTraits> seq_positions_acc;
+    const float qk_scale;
+
+    const dim3 grid_dim;
+    const dim3 block_dim;
+    const size_t lds_bytes;
+
+    Argument(
+        at::PackedTensorAccessor32<scalar_t, 4, at::RestrictPtrTraits> XQ_acc,
+        at::PackedTensorAccessor64<scalar_t, 4, at::RestrictPtrTraits> cache_K_acc,
+        at::PackedTensorAccessor64<scalar_t, 4, at::RestrictPtrTraits> cache_V_acc,
+        at::PackedTensorAccessor32<scalar_t, 4, at::RestrictPtrTraits> O_acc,
+        at::PackedTensorAccessor32<int32_t, 1, at::RestrictPtrTraits> seq_positions_acc,
+        const float qk_scale,
+
+        const dim3 grid_dim,
+        const dim3 block_dim,
+        const size_t lds_bytes)
+        : XQ_acc(XQ_acc),
+          cache_K_acc(cache_K_acc),
+          cache_V_acc(cache_V_acc),
+          O_acc(O_acc),
+          seq_positions_acc(seq_positions_acc),
+          qk_scale(qk_scale),
+          grid_dim(grid_dim),
+          block_dim(block_dim),
+          lds_bytes(lds_bytes) {}
+  };
+  struct Invoker : public BaseInvoker {
+    using Argument = DeviceOp::Argument;
+    float Run(
+        const Argument& arg,
+        const StreamConfig& stream_config = StreamConfig{}) {
+      auto* kernel = &efficient_attention_forward_decoder_ck_kernel<scalar_t>;
+      return launch_and_time_kernel(
+          stream_config,
+          kernel,
+          arg.grid_dim,
+          arg.block_dim,
+          arg.lds_bytes,
+          arg.XQ_acc,
+          arg.cache_K_acc,
+          arg.cache_V_acc,
+          arg.O_acc,
+          arg.seq_positions_acc,
+          arg.qk_scale);
+    }
+  };
+};
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
 
-  if (new_value > default_value) {
-    C10_CUDA_CHECK(hipFuncSetAttribute(
-        kernel_func, hipFuncAttributeMaxDynamicSharedMemorySize, new_value));
-  }
-}
+namespace {
 
 #define AT_DISPATCH_CASE_3(SCALARTYPE1, SCALARTYPE2, SCALARTYPE3, ...) \
   AT_DISPATCH_CASE(SCALARTYPE1, __VA_ARGS__)                           \
@@ -434,7 +490,7 @@ at::Tensor& efficient_attention_forward_decoder_ck_out_impl(
   int32_t smem_softmax = T_MAX * sizeof(float) + threads.y * sizeof(float);
   int32_t smem_output = D_H * sizeof(float) *
       threads.y; // 4 * threadsPerBlock * sizeof(float) == sizeof(O[b][0][h][:])
-  int32_t smem_size = max(smem_softmax, smem_output);
+  const size_t lds_bytes = max(smem_softmax, smem_output);
   auto stream = at::cuda::getCurrentHIPStream().stream();
 
   AT_DISPATCH_SWITCH_3(
@@ -444,18 +500,23 @@ at::Tensor& efficient_attention_forward_decoder_ck_out_impl(
       XQ.scalar_type(),
       "efficient_attention_forward_decoder_ck",
       [&] {
-        auto* kernel = &efficient_attention_forward_decoder_ck_kernel<scalar_t>;
-        update_max_dynamic_shared_memory_size_bytes(
-            reinterpret_cast<void*&>(kernel), smem_size);
-        kernel<<<blocks, threads, smem_size, stream>>>(
+        using device_op_t = ck::tensor_operation::device::FMHADecoderSeqlen1DeviceOp<
+            scalar_t>;
+        auto op = device_op_t{};
+        auto arg = device_op_t::Argument(
             XQ.packed_accessor32<scalar_t, 4, at::RestrictPtrTraits>(),
             cache_K.packed_accessor64<scalar_t, 4, at::RestrictPtrTraits>(),
             cache_V.packed_accessor64<scalar_t, 4, at::RestrictPtrTraits>(),
             O.packed_accessor32<scalar_t, 4, at::RestrictPtrTraits>(),
             seq_positions
                 .packed_accessor32<int32_t, 1, at::RestrictPtrTraits>(),
-            qk_scale);
-        C10_CUDA_KERNEL_LAUNCH_CHECK();
+            qk_scale,
+            blocks,
+            threads,
+            lds_bytes);
+
+        auto invoker = device_op_t::Invoker{};
+        (void)invoker.Run(arg, {stream});
       });
 
   return O;
@@ -472,9 +533,9 @@ at::Tensor efficient_attention_forward_decoder_ck_impl(
     const at::Tensor& seq_positions, // [B]
     double qk_scale) {
   auto O = at::empty_like(XQ);
-  efficient_attention_forward_decoder_ck_out_impl<ThreadsPerWavefront, WavefrontsPerBlock>(
-    XQ, cache_K, cache_V, seq_positions, qk_scale, O
-  );
+  efficient_attention_forward_decoder_ck_out_impl<
+      ThreadsPerWavefront,
+      WavefrontsPerBlock>(XQ, cache_K, cache_V, seq_positions, qk_scale, O);
   return O;
 }
 
@@ -505,19 +566,24 @@ TORCH_LIBRARY_IMPL(xformers, CUDA, m) {
 (1) hipify
  > pip install -e /xformers
 
- For obtaining all the library paths needed for compilation below, add `--verbose`.
- 
+ For obtaining all the library paths needed for compilation below, add
+`--verbose`.
+
 (2) compile
  > /opt/rocm/bin/hipcc \
 -I/xformers/xformers/csrc \
 -I/xformers/xformers/csrc/attention/hip_fmha \
 -I/xformers/third_party/composable_kernel/include \
 -I/xformers/third_party/composable_kernel/include/ck \
--I/xformers/third_party/composable_kernel/include/ck/tensor_operation/gpu/device \
--I/xformers/third_party/composable_kernel/include/ck/tensor_operation/gpu/device/impl \
--I/xformers/third_party/composable_kernel/include/ck/tensor_operation/gpu/element \
+-I/xformers/third_party/composable_kernel/include/ck/tensor_operation/gpu/device
+\
+-I/xformers/third_party/composable_kernel/include/ck/tensor_operation/gpu/device/impl
+\
+-I/xformers/third_party/composable_kernel/include/ck/tensor_operation/gpu/element
+\
 -I/opt/conda/envs/py_3.8/lib/python3.8/site-packages/torch/include \
--I/opt/conda/envs/py_3.8/lib/python3.8/site-packages/torch/include/torch/csrc/api/include \
+-I/opt/conda/envs/py_3.8/lib/python3.8/site-packages/torch/include/torch/csrc/api/include
+\
 -I/opt/conda/envs/py_3.8/lib/python3.8/site-packages/torch/include/TH \
 -I/opt/conda/envs/py_3.8/lib/python3.8/site-packages/torch/include/THC \
 -I/opt/conda/envs/py_3.8/lib/python3.8/site-packages/torch/include/THH \
@@ -561,12 +627,17 @@ TORCH_LIBRARY_IMPL(xformers, CUDA, m) {
 For assembly debugging, add `--save-temps -g`.
 
 (3a) run correctness check
- > LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/conda/envs/py_3.8/lib/python3.8/site-packages/torch/lib \
+ >
+LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/conda/envs/py_3.8/lib/python3.8/site-packages/torch/lib
+\
  ./a.out
 
 (3b) run specific input shape
- > LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/conda/envs/py_3.8/lib/python3.8/site-packages/torch/lib \
- ./a.out n_keys padding batch_size n_heads is_multiquery dtype n_wavefronts_per_block
+ >
+LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/conda/envs/py_3.8/lib/python3.8/site-packages/torch/lib
+\
+ ./a.out n_keys padding batch_size n_heads is_multiquery dtype
+n_wavefronts_per_block
 */
 
 static void do_correctness_check() {
@@ -603,7 +674,9 @@ int main(int argc, char** argv) {
   } else {
     const auto args = std::vector<std::string>(argv + 1, argv + argc);
     if (args.size() != 7) {
-      std::cout << "Usage: ./a.out n_keys padding batch_size n_heads is_multiquery dtype n_wavefronts_per_block" << std::endl;
+      std::cout
+          << "Usage: ./a.out n_keys padding batch_size n_heads is_multiquery dtype n_wavefronts_per_block"
+          << std::endl;
       return 0;
     }
     const int32_t n_keys = std::stoi(args[0]);
@@ -611,35 +684,42 @@ int main(int argc, char** argv) {
     const int32_t batch_size = std::stoi(args[2]);
     const int32_t n_heads = std::stoi(args[3]);
     const int32_t multiquery = (args[4] == "mq");
-    const auto dtype = (args[5] == "f32") ? torch::kFloat32 : (args[5] == "f16") ? torch::kFloat16 : torch::kBFloat16;
+    const auto dtype = (args[5] == "f32") ? torch::kFloat32
+        : (args[5] == "f16")              ? torch::kFloat16
+                                          : torch::kBFloat16;
     const int32_t n_wavefronts_per_block = std::stoi(args[6]);
-    
+
     const int32_t dim_per_head = 4 * kThreadsPerWavefront;
 
     const auto options = torch::TensorOptions()
-                     .dtype(dtype)
-                     .layout(torch::kStrided)
-                     .device(torch::kCUDA, 1)
-                     .requires_grad(false);
+                             .dtype(dtype)
+                             .layout(torch::kStrided)
+                             .device(torch::kCUDA, 1)
+                             .requires_grad(false);
 
-    const auto int_options = options.dtype(torch::kInt);  
+    const auto int_options = options.dtype(torch::kInt);
     const auto Q = at::rand({batch_size, 1, n_heads, dim_per_head}, options);
-    const auto K = multiquery 
-      ? at::rand({batch_size, padding, 1, dim_per_head}, options).expand({batch_size, padding, n_heads, dim_per_head})
-      : at::rand({batch_size, padding, n_heads, dim_per_head}, options);
+    const auto K = multiquery
+        ? at::rand({batch_size, padding, 1, dim_per_head}, options)
+              .expand({batch_size, padding, n_heads, dim_per_head})
+        : at::rand({batch_size, padding, n_heads, dim_per_head}, options);
     const auto V = at::rand_like(K);
     auto O = at::rand_like(Q);
 
     const auto seq = at::randint(1, n_keys, {batch_size}, int_options);
     const double qk_scale = 1. / sqrt(dim_per_head);
-    auto call_ptr = decltype(&efficient_attention_forward_decoder_ck_out_impl<kThreadsPerWavefront, kWavefrontsPerBlock>) {};
-    
-    #define SWITCH_CASE_SET_CALLPTR(n) \
-    case (n): \
-      call_ptr = &efficient_attention_forward_decoder_ck_out_impl<kThreadsPerWavefront, (n)>; \
-      break; 
-
-    switch(n_wavefronts_per_block) {
+    auto call_ptr = decltype(&efficient_attention_forward_decoder_ck_out_impl<
+                             kThreadsPerWavefront,
+                             kWavefrontsPerBlock>){};
+
+#define SWITCH_CASE_SET_CALLPTR(n)                               \
+  case (n):                                                      \
+    call_ptr = &efficient_attention_forward_decoder_ck_out_impl< \
+        kThreadsPerWavefront,                                    \
+        (n)>;                                                    \
+    break;
+
+    switch (n_wavefronts_per_block) {
       SWITCH_CASE_SET_CALLPTR(1);
       SWITCH_CASE_SET_CALLPTR(2);
       SWITCH_CASE_SET_CALLPTR(4);
@@ -650,12 +730,13 @@ int main(int argc, char** argv) {
         call_ptr = nullptr;
         break;
     }
-    #undef SWITCH_CASE_SET_CALLPTR
+#undef SWITCH_CASE_SET_CALLPTR
 
     if (call_ptr) {
       call_ptr(Q, K, V, seq, qk_scale, O);
     } else {
-      std::cout << "Warning: no kernel was found for wavefronts_per_block=" << n_wavefronts_per_block << std::endl;
+      std::cout << "Warning: no kernel was found for wavefronts_per_block="
+                << n_wavefronts_per_block << std::endl;
     }
   }
   return 0;

From 60a9872370f4a53f963d5b032db47d82cc155a09 Mon Sep 17 00:00:00 2001
From: Max Podkorytov <4273004+tenpercent@users.noreply.github.com>
Date: Thu, 2 Nov 2023 22:58:18 -0400
Subject: [PATCH 186/641] fix; offsets into a tensor need to use ptrdiff_t to
 avoid overflow

---
 .../hip_fmha/attention_forward_decoder.cpp    | 28 ++++++++-----------
 1 file changed, 11 insertions(+), 17 deletions(-)

diff --git a/xformers/csrc/attention/hip_fmha/attention_forward_decoder.cpp b/xformers/csrc/attention/hip_fmha/attention_forward_decoder.cpp
index 664c68139..d8bf51ca3 100644
--- a/xformers/csrc/attention/hip_fmha/attention_forward_decoder.cpp
+++ b/xformers/csrc/attention/hip_fmha/attention_forward_decoder.cpp
@@ -143,14 +143,11 @@ __global__ void efficient_attention_forward_decoder_ck_kernel(
   const scalar_t* __restrict__ cache_V = cache_V_acc.data();
   scalar_t* __restrict__ O = O_acc.data();
   const int32_t* __restrict__ seq_positions = seq_positions_acc.data();
-  const int32_t XQ_stride_0 = XQ_acc.stride(0);
-  const int32_t XQ_stride_2 = XQ_acc.stride(2);
-  const int32_t K_stride_0 = cache_K_acc.stride(0);
-  const int32_t K_stride_1 = cache_K_acc.stride(1);
-  const int32_t K_stride_2 = cache_K_acc.stride(2);
-  const int32_t V_stride_0 = cache_V_acc.stride(0); // cache_V strides should be the same as cache_K strides
-  const int32_t V_stride_1 = cache_V_acc.stride(1);
-  const int32_t V_stride_2 = cache_V_acc.stride(2);
+  const ptrdiff_t XQ_stride_0 = XQ_acc.stride(0);
+  const ptrdiff_t XQ_stride_2 = XQ_acc.stride(2);
+  const ptrdiff_t K_stride_0 = cache_K_acc.stride(0);
+  const ptrdiff_t K_stride_1 = cache_K_acc.stride(1);
+  const ptrdiff_t K_stride_2 = cache_K_acc.stride(2);
   const bool multiquery = cache_K_acc.size(2) == 1;
 
   constexpr int32_t seq_positions_shift = 0;
@@ -176,14 +173,12 @@ __global__ void efficient_attention_forward_decoder_ck_kernel(
 
   // Need D_H == 256 (NB: 128 in CUDA because of wavefront/warp sizes 64/32)
   // const auto* q_ = &(XQ_acc[b][0][h][0]);
-  const auto* q_ = XQ + b * XQ_stride_0 + h * XQ_stride_2;
+  const auto XQO_base_offset = b * XQ_stride_0 + h * XQ_stride_2;
+  const auto* q_ = XQ + XQO_base_offset;
 
-  // const bool multiquery = cache_K.size(2) == 1;
-  // const auto* cache_K_base = &cache_K_acc[b][0][multiquery ? 0 : h][0];
   const auto cache_KV_base_offset = b * K_stride_0 + (multiquery ? 0 : h * K_stride_2);
   const auto* cache_K_base = cache_K + cache_KV_base_offset;
-  const auto* cache_V_base = &cache_V_acc[b][0][multiquery ? 0 : h][0];
-  // const auto* cache_V_base = cache_V + cache_KV_base_offset; // invalid memory access error
+  const auto* cache_V_base = cache_V + cache_KV_base_offset;
 
   // Load Q into registers in all wavefronts.
   // Each thread handles 4 D dimensions
@@ -320,7 +315,7 @@ __global__ void efficient_attention_forward_decoder_ck_kernel(
       const int32_t t = tt + ttt;
       // load the V[b][t][h|0][:] row into registers, reusing K register storage
       load_v<decltype(cache_V_base), data_vec4_t>(
-          cache_V_base + t * V_stride_1, lane_idx, &k_loads[ttt]);
+          cache_V_base + t * K_stride_1, lane_idx, &k_loads[ttt]);
       ps[ttt] = smem[t];
     }
 
@@ -339,7 +334,7 @@ __global__ void efficient_attention_forward_decoder_ck_kernel(
         // load the V[b][t][h|0][:] row into registers, reusing K register
         // storage
         load_v<decltype(cache_V_base), data_vec4_t>(
-            cache_V_base + t * V_stride_1, lane_idx, &k_loads[ttt]);
+            cache_V_base + t * K_stride_1, lane_idx, &k_loads[ttt]);
         ps[ttt] = smem[t];
       }
     }
@@ -375,8 +370,7 @@ __global__ void efficient_attention_forward_decoder_ck_kernel(
     bf_r.y = ck::type_convert<data_t>(r.y);
     bf_r.z = ck::type_convert<data_t>(r.z);
     bf_r.w = ck::type_convert<data_t>(r.w);
-    // auto* o_ = &O[b][0][h][0];
-    auto* o_ = O + b * XQ_stride_0 + h * XQ_stride_2;
+    auto* o_ = O + XQO_base_offset;
     store_v<decltype(o_), data_vec4_t>(o_, lane_idx, bf_r);
   }
 }

From fa0e993760b08d362b9753cd7b078592ddafe971 Mon Sep 17 00:00:00 2001
From: Max Podkorytov <4273004+tenpercent@users.noreply.github.com>
Date: Thu, 2 Nov 2023 23:30:04 -0400
Subject: [PATCH 187/641] refactor the kernel to use raw pointers and strides
 instead of accessors

---
 .../hip_fmha/attention_forward_decoder.cpp    | 160 +++++++++++-------
 1 file changed, 95 insertions(+), 65 deletions(-)

diff --git a/xformers/csrc/attention/hip_fmha/attention_forward_decoder.cpp b/xformers/csrc/attention/hip_fmha/attention_forward_decoder.cpp
index d8bf51ca3..98031081b 100644
--- a/xformers/csrc/attention/hip_fmha/attention_forward_decoder.cpp
+++ b/xformers/csrc/attention/hip_fmha/attention_forward_decoder.cpp
@@ -130,26 +130,20 @@ template <
     int32_t n_loop_unroll = 16,
     int32_t n_loop_unroll_tail = 2>
 __global__ void efficient_attention_forward_decoder_ck_kernel(
-    at::PackedTensorAccessor32<scalar_t, 4, at::RestrictPtrTraits> XQ_acc,
-    at::PackedTensorAccessor64<scalar_t, 4, at::RestrictPtrTraits> cache_K_acc,
-    at::PackedTensorAccessor64<scalar_t, 4, at::RestrictPtrTraits> cache_V_acc,
-    at::PackedTensorAccessor32<scalar_t, 4, at::RestrictPtrTraits> O_acc,
-    at::PackedTensorAccessor32<int32_t, 1, at::RestrictPtrTraits> seq_positions_acc,
+    const scalar_t* __restrict__ XQ,
+    const scalar_t* __restrict__ cache_K,
+    const scalar_t* __restrict__ cache_V,
+    scalar_t* __restrict__ O,
+    const int32_t* __restrict__ seq_positions,
+    const ptrdiff_t XQ_stride_0,
+    const ptrdiff_t XQ_stride_2,
+    const ptrdiff_t K_stride_0,
+    const ptrdiff_t K_stride_1,
+    const ptrdiff_t K_stride_2,
+    const bool multiquery,
     const float qk_scale) {
   static_assert(n_loop_unroll_tail < n_loop_unroll, "");
 
-  const scalar_t* __restrict__ XQ = XQ_acc.data();
-  const scalar_t* __restrict__ cache_K = cache_K_acc.data();
-  const scalar_t* __restrict__ cache_V = cache_V_acc.data();
-  scalar_t* __restrict__ O = O_acc.data();
-  const int32_t* __restrict__ seq_positions = seq_positions_acc.data();
-  const ptrdiff_t XQ_stride_0 = XQ_acc.stride(0);
-  const ptrdiff_t XQ_stride_2 = XQ_acc.stride(2);
-  const ptrdiff_t K_stride_0 = cache_K_acc.stride(0);
-  const ptrdiff_t K_stride_1 = cache_K_acc.stride(1);
-  const ptrdiff_t K_stride_2 = cache_K_acc.stride(2);
-  const bool multiquery = cache_K_acc.size(2) == 1;
-
   constexpr int32_t seq_positions_shift = 0;
 
   extern __shared__ __align__(16) float smem[];
@@ -176,7 +170,8 @@ __global__ void efficient_attention_forward_decoder_ck_kernel(
   const auto XQO_base_offset = b * XQ_stride_0 + h * XQ_stride_2;
   const auto* q_ = XQ + XQO_base_offset;
 
-  const auto cache_KV_base_offset = b * K_stride_0 + (multiquery ? 0 : h * K_stride_2);
+  const auto cache_KV_base_offset =
+      b * K_stride_0 + (multiquery ? 0 : h * K_stride_2);
   const auto* cache_K_base = cache_K + cache_KV_base_offset;
   const auto* cache_V_base = cache_V + cache_KV_base_offset;
 
@@ -384,11 +379,17 @@ template <typename scalar_t>
 struct FMHADecoderSeqlen1DeviceOp : public BaseOperator {
   using DeviceOp = FMHADecoderSeqlen1DeviceOp;
   struct Argument : public BaseArgument {
-    at::PackedTensorAccessor32<scalar_t, 4, at::RestrictPtrTraits> XQ_acc;
-    at::PackedTensorAccessor64<scalar_t, 4, at::RestrictPtrTraits> cache_K_acc;
-    at::PackedTensorAccessor64<scalar_t, 4, at::RestrictPtrTraits> cache_V_acc;
-    at::PackedTensorAccessor32<scalar_t, 4, at::RestrictPtrTraits> O_acc;
-    at::PackedTensorAccessor32<int32_t, 1, at::RestrictPtrTraits> seq_positions_acc;
+    const scalar_t* __restrict__ XQ;
+    const scalar_t* __restrict__ cache_K;
+    const scalar_t* __restrict__ cache_V;
+    scalar_t* __restrict__ O;
+    const int32_t* __restrict__ seq_positions;
+    const ptrdiff_t XQ_stride_0;
+    const ptrdiff_t XQ_stride_2;
+    const ptrdiff_t K_stride_0;
+    const ptrdiff_t K_stride_1;
+    const ptrdiff_t K_stride_2;
+    const bool multiquery;
     const float qk_scale;
 
     const dim3 grid_dim;
@@ -396,21 +397,32 @@ struct FMHADecoderSeqlen1DeviceOp : public BaseOperator {
     const size_t lds_bytes;
 
     Argument(
-        at::PackedTensorAccessor32<scalar_t, 4, at::RestrictPtrTraits> XQ_acc,
-        at::PackedTensorAccessor64<scalar_t, 4, at::RestrictPtrTraits> cache_K_acc,
-        at::PackedTensorAccessor64<scalar_t, 4, at::RestrictPtrTraits> cache_V_acc,
-        at::PackedTensorAccessor32<scalar_t, 4, at::RestrictPtrTraits> O_acc,
-        at::PackedTensorAccessor32<int32_t, 1, at::RestrictPtrTraits> seq_positions_acc,
+        const scalar_t* __restrict__ XQ,
+        const scalar_t* __restrict__ cache_K,
+        const scalar_t* __restrict__ cache_V,
+        scalar_t* __restrict__ O,
+        const int32_t* __restrict__ seq_positions,
+        const ptrdiff_t XQ_stride_0,
+        const ptrdiff_t XQ_stride_2,
+        const ptrdiff_t K_stride_0,
+        const ptrdiff_t K_stride_1,
+        const ptrdiff_t K_stride_2,
+        const bool multiquery,
         const float qk_scale,
-
         const dim3 grid_dim,
         const dim3 block_dim,
         const size_t lds_bytes)
-        : XQ_acc(XQ_acc),
-          cache_K_acc(cache_K_acc),
-          cache_V_acc(cache_V_acc),
-          O_acc(O_acc),
-          seq_positions_acc(seq_positions_acc),
+        : XQ(XQ),
+          cache_K(cache_K),
+          cache_V(cache_V),
+          O(O),
+          seq_positions(seq_positions),
+          XQ_stride_0(XQ_stride_0),
+          XQ_stride_2(XQ_stride_2),
+          K_stride_0(K_stride_0),
+          K_stride_1(K_stride_1),
+          K_stride_2(K_stride_2),
+          multiquery(multiquery),
           qk_scale(qk_scale),
           grid_dim(grid_dim),
           block_dim(block_dim),
@@ -421,18 +433,23 @@ struct FMHADecoderSeqlen1DeviceOp : public BaseOperator {
     float Run(
         const Argument& arg,
         const StreamConfig& stream_config = StreamConfig{}) {
-      auto* kernel = &efficient_attention_forward_decoder_ck_kernel<scalar_t>;
       return launch_and_time_kernel(
           stream_config,
-          kernel,
+          efficient_attention_forward_decoder_ck_kernel<scalar_t>,
           arg.grid_dim,
           arg.block_dim,
           arg.lds_bytes,
-          arg.XQ_acc,
-          arg.cache_K_acc,
-          arg.cache_V_acc,
-          arg.O_acc,
-          arg.seq_positions_acc,
+          arg.XQ,
+          arg.cache_K,
+          arg.cache_V,
+          arg.O,
+          arg.seq_positions,
+          arg.XQ_stride_0,
+          arg.XQ_stride_2,
+          arg.K_stride_0,
+          arg.K_stride_1,
+          arg.K_stride_2,
+          arg.multiquery,
           arg.qk_scale);
     }
   };
@@ -494,16 +511,32 @@ at::Tensor& efficient_attention_forward_decoder_ck_out_impl(
       XQ.scalar_type(),
       "efficient_attention_forward_decoder_ck",
       [&] {
-        using device_op_t = ck::tensor_operation::device::FMHADecoderSeqlen1DeviceOp<
-            scalar_t>;
+        using device_op_t =
+            ck::tensor_operation::device::FMHADecoderSeqlen1DeviceOp<scalar_t>;
         auto op = device_op_t{};
-        auto arg = device_op_t::Argument(
-            XQ.packed_accessor32<scalar_t, 4, at::RestrictPtrTraits>(),
-            cache_K.packed_accessor64<scalar_t, 4, at::RestrictPtrTraits>(),
-            cache_V.packed_accessor64<scalar_t, 4, at::RestrictPtrTraits>(),
-            O.packed_accessor32<scalar_t, 4, at::RestrictPtrTraits>(),
+
+        auto XQ_acc =
+            XQ.packed_accessor32<scalar_t, 4, at::RestrictPtrTraits>();
+        auto K_acc =
+            cache_K.packed_accessor64<scalar_t, 4, at::RestrictPtrTraits>();
+        auto V_acc =
+            cache_V.packed_accessor64<scalar_t, 4, at::RestrictPtrTraits>();
+        auto O_acc = O.packed_accessor32<scalar_t, 4, at::RestrictPtrTraits>();
+        auto seq_acc =
             seq_positions
-                .packed_accessor32<int32_t, 1, at::RestrictPtrTraits>(),
+                .packed_accessor32<int32_t, 1, at::RestrictPtrTraits>();
+        auto arg = device_op_t::Argument(
+            XQ_acc.data(),
+            K_acc.data(),
+            V_acc.data(),
+            O_acc.data(),
+            seq_acc.data(),
+            XQ_acc.stride(0),
+            XQ_acc.stride(2),
+            K_acc.stride(0),
+            K_acc.stride(1),
+            K_acc.stride(2),
+            K_acc.size(2) == 1,
             qk_scale,
             blocks,
             threads,
@@ -555,13 +588,15 @@ TORCH_LIBRARY_IMPL(xformers, CUDA, m) {
 
 #include <torch/torch.h>
 
+// clang-format off
+
 /*
 
 (1) hipify
  > pip install -e /xformers
 
- For obtaining all the library paths needed for compilation below, add
-`--verbose`.
+ For obtaining all the library paths needed for compilation below, add `--verbose`.
+ For efficient utilization of CPU cores for compilation use MAX_JOBS env variable.
 
 (2) compile
  > /opt/rocm/bin/hipcc \
@@ -569,15 +604,11 @@ TORCH_LIBRARY_IMPL(xformers, CUDA, m) {
 -I/xformers/xformers/csrc/attention/hip_fmha \
 -I/xformers/third_party/composable_kernel/include \
 -I/xformers/third_party/composable_kernel/include/ck \
--I/xformers/third_party/composable_kernel/include/ck/tensor_operation/gpu/device
-\
--I/xformers/third_party/composable_kernel/include/ck/tensor_operation/gpu/device/impl
-\
--I/xformers/third_party/composable_kernel/include/ck/tensor_operation/gpu/element
-\
+-I/xformers/third_party/composable_kernel/include/ck/tensor_operation/gpu/device \
+-I/xformers/third_party/composable_kernel/include/ck/tensor_operation/gpu/device/impl \
+-I/xformers/third_party/composable_kernel/include/ck/tensor_operation/gpu/element \
 -I/opt/conda/envs/py_3.8/lib/python3.8/site-packages/torch/include \
--I/opt/conda/envs/py_3.8/lib/python3.8/site-packages/torch/include/torch/csrc/api/include
-\
+-I/opt/conda/envs/py_3.8/lib/python3.8/site-packages/torch/include/torch/csrc/api/include \
 -I/opt/conda/envs/py_3.8/lib/python3.8/site-packages/torch/include/TH \
 -I/opt/conda/envs/py_3.8/lib/python3.8/site-packages/torch/include/THC \
 -I/opt/conda/envs/py_3.8/lib/python3.8/site-packages/torch/include/THH \
@@ -622,18 +653,17 @@ For assembly debugging, add `--save-temps -g`.
 
 (3a) run correctness check
  >
-LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/conda/envs/py_3.8/lib/python3.8/site-packages/torch/lib
-\
+LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/conda/envs/py_3.8/lib/python3.8/site-packages/torch/lib \
  ./a.out
 
 (3b) run specific input shape
  >
-LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/conda/envs/py_3.8/lib/python3.8/site-packages/torch/lib
-\
- ./a.out n_keys padding batch_size n_heads is_multiquery dtype
-n_wavefronts_per_block
+LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/conda/envs/py_3.8/lib/python3.8/site-packages/torch/lib \
+ ./a.out n_keys padding batch_size n_heads is_multiquery dtype n_wavefronts_per_block
 */
 
+// clang-format on
+
 static void do_correctness_check() {
   const int32_t D = 4 * kThreadsPerWavefront;
   const int32_t B = 1;

From 32f7cd567ee2ef501908a935cf76675a0f74057f Mon Sep 17 00:00:00 2001
From: Max Podkorytov <4273004+tenpercent@users.noreply.github.com>
Date: Fri, 3 Nov 2023 14:53:12 -0400
Subject: [PATCH 188/641] separate the ck op and pytorch op backend

---
 .../hip_fmha/attention_forward_decoder.cpp    | 446 +-----------------
 .../hip_fmha/ck_attention_forward_decoder.h   | 427 +++++++++++++++++
 2 files changed, 441 insertions(+), 432 deletions(-)
 create mode 100644 xformers/csrc/attention/hip_fmha/ck_attention_forward_decoder.h

diff --git a/xformers/csrc/attention/hip_fmha/attention_forward_decoder.cpp b/xformers/csrc/attention/hip_fmha/attention_forward_decoder.cpp
index 98031081b..8b5b88f03 100644
--- a/xformers/csrc/attention/hip_fmha/attention_forward_decoder.cpp
+++ b/xformers/csrc/attention/hip_fmha/attention_forward_decoder.cpp
@@ -7,457 +7,36 @@
 #include <ATen/Functions.h>
 #include <ATen/Tensor.h>
 #include <c10/cuda/CUDAStream.h>
-#include <ck/host_utility/kernel_launch_hip.hpp>
-#include <ck/stream_config.hpp>
-#include <ck/tensor_operation/gpu/device/device_base.hpp>
-#include <ck/utility/data_type.hpp>
-#include <ck/utility/inner_product.hpp>
 #include <torch/library.h>
 
-namespace ck {
-template <>
-__device__ void inner_product<bhalf_t, bhalf_t, float>(
-    const bhalf_t& a,
-    const bhalf_t& b,
-    float& c) {
-  inner_product(type_convert<float>(a), type_convert<float>(b), c);
-}
+#include "ck_attention_forward_decoder.h"
 
-template <>
-__device__ void inner_product<bhalf4_t, bhalf4_t, float>(
-    const bhalf4_t& a,
-    const bhalf4_t& b,
-    float& c) {
-  const vector_type<bhalf_t, 4> a_vector{a};
-  const vector_type<bhalf_t, 4> b_vector{b};
-  ck::static_for<0, 4, 1>{}([&](auto i) {
-    inner_product(
-        a_vector.AsType<bhalf_t>()[i], b_vector.AsType<bhalf_t>()[i], c);
-  });
+namespace {
+  constexpr int32_t kThreadsPerWavefront = 64;
+  constexpr int32_t kWavefrontsPerBlock = 16;
+  constexpr int32_t D_H = 4 * kThreadsPerWavefront;
 }
-} // namespace ck
 
 namespace {
 
-constexpr int32_t kThreadsPerWavefront = 64;
-constexpr int32_t kWavefrontsPerBlock = 16;
-constexpr int32_t D_H = 4 * kThreadsPerWavefront;
-constexpr int32_t T_MAX = 8192;
-
 template <typename c10_t>
 struct c10_to_data_t;
-
 template <>
 struct c10_to_data_t<float> {
   using type = float;
-  using vec4 = ck::float4_t;
 };
 
 template <>
 struct c10_to_data_t<c10::Half> {
   using type = ck::half_t;
-  using vec4 = ck::half4_t;
 };
 
 template <>
 struct c10_to_data_t<c10::BFloat16> {
   using type = ck::bhalf_t;
-  using vec4 = ck::bhalf4_t;
 };
-
-template <typename data4_t>
-__device__ ck::float4_t scalar4_scale_acc(ck::float4_t acc, data4_t a, float b);
-
-template <>
-__device__ ck::float4_t scalar4_scale_acc<ck::float4_t>(
-    ck::float4_t acc,
-    ck::float4_t a,
-    float b) {
-  return acc + a * b;
-}
-
-template <>
-__device__ ck::float4_t scalar4_scale_acc<ck::half4_t>(
-    ck::float4_t acc,
-    ck::half4_t a,
-    float b) {
-  acc.x += ck::type_convert<float>(a.x) * b;
-  acc.y += ck::type_convert<float>(a.y) * b;
-  acc.z += ck::type_convert<float>(a.z) * b;
-  acc.w += ck::type_convert<float>(a.w) * b;
-  return acc;
-}
-
-template <>
-__device__ ck::float4_t scalar4_scale_acc<ck::bhalf4_t>(
-    ck::float4_t acc,
-    ck::bhalf4_t a,
-    float b) {
-  acc.x += ck::type_convert<float>(a.x) * b;
-  acc.y += ck::type_convert<float>(a.y) * b;
-  acc.z += ck::type_convert<float>(a.z) * b;
-  acc.w += ck::type_convert<float>(a.w) * b;
-  return acc;
 }
 
-template <typename F>
-float __device__ __forceinline__ wavefrontReduce(float val, F f) {
-#pragma unroll
-  for (int32_t mask = kThreadsPerWavefront >> 1; mask > 0; mask >>= 1) {
-    val = f(__shfl_xor(val, mask, kThreadsPerWavefront), val);
-  }
-  return val;
-}
-
-template <typename TDataPtr, typename TDataVec>
-__forceinline__ __device__ void load_v(
-    TDataPtr data_ptr,
-    int32_t vector_offset,
-    TDataVec* load_to) {
-  *load_to = *(reinterpret_cast<const TDataVec*>(data_ptr) + vector_offset);
-}
-
-template <typename TDataPtr, typename TDataVec>
-__forceinline__ __device__ void store_v(
-    TDataPtr data_ptr,
-    int32_t vector_offset,
-    TDataVec value) {
-  *(reinterpret_cast<TDataVec*>(data_ptr) + vector_offset) = value;
-}
-
-template <
-    typename scalar_t,
-    int32_t n_loop_unroll = 16,
-    int32_t n_loop_unroll_tail = 2>
-__global__ void efficient_attention_forward_decoder_ck_kernel(
-    const scalar_t* __restrict__ XQ,
-    const scalar_t* __restrict__ cache_K,
-    const scalar_t* __restrict__ cache_V,
-    scalar_t* __restrict__ O,
-    const int32_t* __restrict__ seq_positions,
-    const ptrdiff_t XQ_stride_0,
-    const ptrdiff_t XQ_stride_2,
-    const ptrdiff_t K_stride_0,
-    const ptrdiff_t K_stride_1,
-    const ptrdiff_t K_stride_2,
-    const bool multiquery,
-    const float qk_scale) {
-  static_assert(n_loop_unroll_tail < n_loop_unroll, "");
-
-  constexpr int32_t seq_positions_shift = 0;
-
-  extern __shared__ __align__(16) float smem[];
-
-  // Each block handles a single batch and head
-  const int32_t b = blockIdx.x;
-  const int32_t h = blockIdx.y;
-
-  // Note: this is decoding case where we attend to current and all previous
-  // tokens.
-  const int32_t t_max = seq_positions[b] + seq_positions_shift;
-
-  const int32_t lane_idx = threadIdx.x;
-  const int32_t wavefront_idx = threadIdx.y;
-  const int32_t threads_per_wavefront = blockDim.x;
-  const int32_t wavefronts_per_block = blockDim.y;
-  const int32_t threads_per_block =
-      threads_per_wavefront * wavefronts_per_block;
-  const int32_t thread_linear_idx =
-      lane_idx + wavefront_idx * threads_per_wavefront;
-
-  // Need D_H == 256 (NB: 128 in CUDA because of wavefront/warp sizes 64/32)
-  // const auto* q_ = &(XQ_acc[b][0][h][0]);
-  const auto XQO_base_offset = b * XQ_stride_0 + h * XQ_stride_2;
-  const auto* q_ = XQ + XQO_base_offset;
-
-  const auto cache_KV_base_offset =
-      b * K_stride_0 + (multiquery ? 0 : h * K_stride_2);
-  const auto* cache_K_base = cache_K + cache_KV_base_offset;
-  const auto* cache_V_base = cache_V + cache_KV_base_offset;
-
-  // Load Q into registers in all wavefronts.
-  // Each thread handles 4 D dimensions
-  using data_t = typename c10_to_data_t<scalar_t>::type;
-  using data_vec4_t = typename c10_to_data_t<scalar_t>::vec4;
-  data_vec4_t q_thread;
-  load_v<decltype(q_), data_vec4_t>(q_, lane_idx, &q_thread);
-  // Each block computes different B value
-  float max_qk_acc = std::numeric_limits<float>::lowest();
-
-  // Compute S[T_MAX] = for i in range(T): S[t] = sum(Q[d] * K[t, d])
-  // Split T across wavefronts in a block, unroll loads to expose more
-  // parallelism.
-
-  data_vec4_t k_loads[n_loop_unroll];
-
-  constexpr auto dtt = kWavefrontsPerBlock * n_loop_unroll;
-  const int32_t t_max_unroll = (t_max / dtt) * dtt;
-
-  for (auto tt = wavefront_idx * n_loop_unroll; tt < t_max_unroll; tt += dtt) {
-#pragma unroll n_loop_unroll
-    for (auto ttt = 0; ttt < n_loop_unroll; ++ttt) {
-      const int32_t t = tt + ttt;
-      // load the K[b][t][h|0][:] row into registers
-      load_v<decltype(cache_K_base), data_vec4_t>(
-          cache_K_base + t * K_stride_1, lane_idx, &k_loads[ttt]);
-    }
-    float qk_accs[n_loop_unroll] = {};
-#pragma unroll n_loop_unroll
-    for (auto ttt = 0; ttt < n_loop_unroll; ++ttt) {
-      ck::inner_product<data_vec4_t, data_vec4_t, float>(
-          q_thread, k_loads[ttt], qk_accs[ttt]);
-      qk_accs[ttt] *= qk_scale;
-
-      qk_accs[ttt] =
-          wavefrontReduce(qk_accs[ttt], [](float a, float b) { return a + b; });
-      max_qk_acc = max(qk_accs[ttt], max_qk_acc);
-    }
-    if (lane_idx == 0) {
-      auto* smem_base = smem + tt;
-#pragma unroll n_loop_unroll
-      for (auto ttt = 0; ttt < n_loop_unroll; ++ttt) {
-        smem_base[ttt] = qk_accs[ttt];
-      }
-    }
-  }
-
-  // NB: the length of the tail is <= (wavefronts_per_block * n_loop_unroll)
-  for (auto tt = t_max_unroll + wavefront_idx * n_loop_unroll_tail; tt < t_max;
-       tt += wavefronts_per_block * n_loop_unroll_tail) {
-#pragma unroll n_loop_unroll_tail
-    for (auto ttt = 0; ttt < n_loop_unroll_tail; ++ttt) {
-      const int32_t t = tt + ttt;
-      if (t < t_max) {
-        // load the K[b][t][h|0][:] row into registers
-        load_v<decltype(cache_K_base), data_vec4_t>(
-            cache_K_base + t * K_stride_1, lane_idx, &k_loads[ttt]);
-      }
-    }
-#pragma unroll n_loop_unroll_tail
-    for (auto ttt = 0; ttt < n_loop_unroll_tail; ++ttt) {
-      float qk_acc = 0;
-      const int32_t t = tt + ttt;
-      if (t < t_max) {
-        ck::inner_product<data_vec4_t, data_vec4_t, float>(
-            q_thread, k_loads[ttt], qk_acc);
-        qk_acc *= qk_scale;
-
-        qk_acc =
-            wavefrontReduce(qk_acc, [](float a, float b) { return a + b; });
-        max_qk_acc = max(qk_acc, max_qk_acc);
-
-        // write accumulated sums to smem.
-        if (lane_idx == 0) {
-          smem[t] = qk_acc;
-        }
-      }
-    }
-  }
-
-  // Use shared reduction to compute max and compute softmax on shared memory.
-  // write max acc
-  if (lane_idx == 0) {
-    smem[T_MAX + wavefront_idx] = max_qk_acc;
-  }
-  __syncthreads();
-  if (lane_idx < wavefronts_per_block) {
-    max_qk_acc = max(max_qk_acc, smem[T_MAX + lane_idx]);
-  }
-  // shared across all threads in block
-  max_qk_acc = wavefrontReduce(
-      max_qk_acc, [](float a, float b) { return a > b ? a : b; });
-
-  // each wavefront computes partial sum of exp.
-  float softmax_denominator = 0.0f;
-  for (int32_t t = thread_linear_idx; t < t_max; t += threads_per_block) {
-    softmax_denominator += __expf(smem[t] - max_qk_acc);
-  }
-  softmax_denominator = wavefrontReduce(
-      softmax_denominator, [](float a, float b) { return a + b; });
-
-  __syncthreads();
-  if (lane_idx == 0) {
-    smem[T_MAX + wavefront_idx] = softmax_denominator;
-  }
-  __syncthreads();
-
-  // now, compute sum of exp(x - max(x)) over all intermediate results.
-  softmax_denominator = 0.0;
-  if (lane_idx < wavefronts_per_block) {
-    softmax_denominator = smem[T_MAX + lane_idx];
-  }
-  softmax_denominator = wavefrontReduce(
-      softmax_denominator, [](float a, float b) { return a + b; });
-
-  const float softmax_scale_factor = 1. / softmax_denominator;
-  // now, compute the normalization across all threads.
-  for (int32_t t = thread_linear_idx; t < t_max; t += threads_per_block) {
-    smem[t] = __expf(smem[t] - max_qk_acc) * softmax_scale_factor;
-  }
-  __syncthreads();
-
-  // Now, we can compute the softmax and write the outputs.
-
-  // Split T across wavefronts in a block
-  // each wavefront compute sum(t_subset) P[t] * V[t_subset, d]
-  // outputs are of size float[D]
-
-  float ps[n_loop_unroll];
-  ck::float4_t o_acc = 0;
-  for (auto tt = wavefront_idx * n_loop_unroll; tt < t_max_unroll; tt += dtt) {
-#pragma unroll n_loop_unroll
-    for (auto ttt = 0; ttt < n_loop_unroll; ++ttt) {
-      const int32_t t = tt + ttt;
-      // load the V[b][t][h|0][:] row into registers, reusing K register storage
-      load_v<decltype(cache_V_base), data_vec4_t>(
-          cache_V_base + t * K_stride_1, lane_idx, &k_loads[ttt]);
-      ps[ttt] = smem[t];
-    }
-
-#pragma unroll n_loop_unroll
-    for (auto ttt = 0; ttt < n_loop_unroll; ++ttt) {
-      o_acc = scalar4_scale_acc<data_vec4_t>(o_acc, k_loads[ttt], ps[ttt]);
-    }
-  }
-
-  for (auto tt = t_max_unroll + wavefront_idx * n_loop_unroll_tail; tt < t_max;
-       tt += wavefronts_per_block * n_loop_unroll_tail) {
-#pragma unroll n_loop_unroll_tail
-    for (auto ttt = 0; ttt < n_loop_unroll_tail; ++ttt) {
-      const int32_t t = tt + ttt;
-      if (t < t_max) {
-        // load the V[b][t][h|0][:] row into registers, reusing K register
-        // storage
-        load_v<decltype(cache_V_base), data_vec4_t>(
-            cache_V_base + t * K_stride_1, lane_idx, &k_loads[ttt]);
-        ps[ttt] = smem[t];
-      }
-    }
-
-#pragma unroll n_loop_unroll_tail
-    for (auto ttt = 0; ttt < n_loop_unroll_tail; ++ttt) {
-      const int32_t t = tt + ttt;
-      if (t < t_max) {
-        o_acc = scalar4_scale_acc<data_vec4_t>(o_acc, k_loads[ttt], ps[ttt]);
-      }
-    }
-  }
-  // now, each thread has partial sums. Write to smem and get accumulated
-  // results back.
-  __syncthreads();
-
-  // NB: needs sizeof(smem) >= 4 * (sizeof(float)==4) * threadsPerBlock
-  store_v<float*, ck::float4_t>(&smem[0], thread_linear_idx, o_acc);
-
-  __syncthreads();
-  // sum up partial D rows from other wavefronts
-  if (wavefront_idx == 0) {
-    ck::float4_t r = 0;
-    for (int32_t w = 0; w < wavefronts_per_block; ++w) {
-      ck::float4_t partial_r;
-      load_v<float*, ck::float4_t>(
-          smem, w * threads_per_wavefront + lane_idx, &partial_r);
-      r += partial_r;
-    }
-    // write output D row
-    data_vec4_t bf_r;
-    bf_r.x = ck::type_convert<data_t>(r.x);
-    bf_r.y = ck::type_convert<data_t>(r.y);
-    bf_r.z = ck::type_convert<data_t>(r.z);
-    bf_r.w = ck::type_convert<data_t>(r.w);
-    auto* o_ = O + XQO_base_offset;
-    store_v<decltype(o_), data_vec4_t>(o_, lane_idx, bf_r);
-  }
-}
-
-} // namespace
-
-namespace ck {
-namespace tensor_operation {
-namespace device {
-template <typename scalar_t>
-struct FMHADecoderSeqlen1DeviceOp : public BaseOperator {
-  using DeviceOp = FMHADecoderSeqlen1DeviceOp;
-  struct Argument : public BaseArgument {
-    const scalar_t* __restrict__ XQ;
-    const scalar_t* __restrict__ cache_K;
-    const scalar_t* __restrict__ cache_V;
-    scalar_t* __restrict__ O;
-    const int32_t* __restrict__ seq_positions;
-    const ptrdiff_t XQ_stride_0;
-    const ptrdiff_t XQ_stride_2;
-    const ptrdiff_t K_stride_0;
-    const ptrdiff_t K_stride_1;
-    const ptrdiff_t K_stride_2;
-    const bool multiquery;
-    const float qk_scale;
-
-    const dim3 grid_dim;
-    const dim3 block_dim;
-    const size_t lds_bytes;
-
-    Argument(
-        const scalar_t* __restrict__ XQ,
-        const scalar_t* __restrict__ cache_K,
-        const scalar_t* __restrict__ cache_V,
-        scalar_t* __restrict__ O,
-        const int32_t* __restrict__ seq_positions,
-        const ptrdiff_t XQ_stride_0,
-        const ptrdiff_t XQ_stride_2,
-        const ptrdiff_t K_stride_0,
-        const ptrdiff_t K_stride_1,
-        const ptrdiff_t K_stride_2,
-        const bool multiquery,
-        const float qk_scale,
-        const dim3 grid_dim,
-        const dim3 block_dim,
-        const size_t lds_bytes)
-        : XQ(XQ),
-          cache_K(cache_K),
-          cache_V(cache_V),
-          O(O),
-          seq_positions(seq_positions),
-          XQ_stride_0(XQ_stride_0),
-          XQ_stride_2(XQ_stride_2),
-          K_stride_0(K_stride_0),
-          K_stride_1(K_stride_1),
-          K_stride_2(K_stride_2),
-          multiquery(multiquery),
-          qk_scale(qk_scale),
-          grid_dim(grid_dim),
-          block_dim(block_dim),
-          lds_bytes(lds_bytes) {}
-  };
-  struct Invoker : public BaseInvoker {
-    using Argument = DeviceOp::Argument;
-    float Run(
-        const Argument& arg,
-        const StreamConfig& stream_config = StreamConfig{}) {
-      return launch_and_time_kernel(
-          stream_config,
-          efficient_attention_forward_decoder_ck_kernel<scalar_t>,
-          arg.grid_dim,
-          arg.block_dim,
-          arg.lds_bytes,
-          arg.XQ,
-          arg.cache_K,
-          arg.cache_V,
-          arg.O,
-          arg.seq_positions,
-          arg.XQ_stride_0,
-          arg.XQ_stride_2,
-          arg.K_stride_0,
-          arg.K_stride_1,
-          arg.K_stride_2,
-          arg.multiquery,
-          arg.qk_scale);
-    }
-  };
-};
-} // namespace device
-} // namespace tensor_operation
-} // namespace ck
-
 namespace {
 
 #define AT_DISPATCH_CASE_3(SCALARTYPE1, SCALARTYPE2, SCALARTYPE3, ...) \
@@ -472,7 +51,9 @@ namespace {
       NAME,                                                 \
       AT_DISPATCH_CASE_3(SCALARTYPE1, SCALARTYPE2, SCALARTYPE3, __VA_ARGS__))
 
-template <int32_t ThreadsPerWavefront, int32_t WavefrontsPerBlock>
+template <int32_t ThreadsPerWavefront, int32_t WavefrontsPerBlock,
+          int32_t T_MAX = 8192,
+          int32_t D_H = 256>
 at::Tensor& efficient_attention_forward_decoder_ck_out_impl(
     const at::Tensor& XQ, // [B, 1, H, D]
     const at::Tensor& cache_K, // [B, T_MAX, H or 1, D]
@@ -511,8 +92,9 @@ at::Tensor& efficient_attention_forward_decoder_ck_out_impl(
       XQ.scalar_type(),
       "efficient_attention_forward_decoder_ck",
       [&] {
+        using ck_data_t = c10_to_data_t<scalar_t>::type;
         using device_op_t =
-            ck::tensor_operation::device::FMHADecoderSeqlen1DeviceOp<scalar_t>;
+            ck::tensor_operation::device::FMHADecoderSeqlen1DeviceOp<ck_data_t>;
         auto op = device_op_t{};
 
         auto XQ_acc =
@@ -526,10 +108,10 @@ at::Tensor& efficient_attention_forward_decoder_ck_out_impl(
             seq_positions
                 .packed_accessor32<int32_t, 1, at::RestrictPtrTraits>();
         auto arg = device_op_t::Argument(
-            XQ_acc.data(),
-            K_acc.data(),
-            V_acc.data(),
-            O_acc.data(),
+            reinterpret_cast<const ck_data_t* __restrict__>(XQ_acc.data()),
+            reinterpret_cast<const ck_data_t* __restrict__>(K_acc.data()),
+            reinterpret_cast<const ck_data_t* __restrict__>(V_acc.data()),
+            reinterpret_cast<ck_data_t* __restrict__>(O_acc.data()),
             seq_acc.data(),
             XQ_acc.stride(0),
             XQ_acc.stride(2),
diff --git a/xformers/csrc/attention/hip_fmha/ck_attention_forward_decoder.h b/xformers/csrc/attention/hip_fmha/ck_attention_forward_decoder.h
new file mode 100644
index 000000000..be4cc790e
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/ck_attention_forward_decoder.h
@@ -0,0 +1,427 @@
+#pragma once 
+
+#include <ck/host_utility/kernel_launch_hip.hpp>
+#include <ck/stream_config.hpp>
+#include <ck/tensor_operation/gpu/device/device_base.hpp>
+#include <ck/utility/data_type.hpp>
+#include <ck/utility/inner_product.hpp>
+
+namespace ck {
+template <>
+__device__ void inner_product<bhalf_t, bhalf_t, float>(
+    const bhalf_t& a,
+    const bhalf_t& b,
+    float& c) {
+  inner_product(type_convert<float>(a), type_convert<float>(b), c);
+}
+
+template <>
+__device__ void inner_product<bhalf4_t, bhalf4_t, float>(
+    const bhalf4_t& a,
+    const bhalf4_t& b,
+    float& c) {
+  const vector_type<bhalf_t, 4> a_vector{a};
+  const vector_type<bhalf_t, 4> b_vector{b};
+  ck::static_for<0, 4, 1>{}([&](auto i) {
+    inner_product(
+        a_vector.AsType<bhalf_t>()[i], b_vector.AsType<bhalf_t>()[i], c);
+  });
+}
+} // namespace ck
+
+namespace {
+
+template <typename data4_t>
+__device__ ck::float4_t scalar4_scale_acc(ck::float4_t acc, data4_t a, float b);
+
+template <>
+__device__ ck::float4_t scalar4_scale_acc<ck::float4_t>(
+    ck::float4_t acc,
+    ck::float4_t a,
+    float b) {
+  return acc + a * b;
+}
+
+template <>
+__device__ ck::float4_t scalar4_scale_acc<ck::half4_t>(
+    ck::float4_t acc,
+    ck::half4_t a,
+    float b) {
+  acc.x += ck::type_convert<float>(a.x) * b;
+  acc.y += ck::type_convert<float>(a.y) * b;
+  acc.z += ck::type_convert<float>(a.z) * b;
+  acc.w += ck::type_convert<float>(a.w) * b;
+  return acc;
+}
+
+template <>
+__device__ ck::float4_t scalar4_scale_acc<ck::bhalf4_t>(
+    ck::float4_t acc,
+    ck::bhalf4_t a,
+    float b) {
+  acc.x += ck::type_convert<float>(a.x) * b;
+  acc.y += ck::type_convert<float>(a.y) * b;
+  acc.z += ck::type_convert<float>(a.z) * b;
+  acc.w += ck::type_convert<float>(a.w) * b;
+  return acc;
+}
+
+template <typename F, int32_t n_threads_per_wavefront = 64>
+float __device__ __forceinline__ wavefrontReduce(float val, F f) {
+#pragma unroll
+  for (int32_t mask = n_threads_per_wavefront >> 1; mask > 0; mask >>= 1) {
+    val = f(__shfl_xor(val, mask, n_threads_per_wavefront), val);
+  }
+  return val;
+}
+
+template <typename TDataPtr, typename TDataVec>
+__forceinline__ __device__ void load_v(
+    TDataPtr data_ptr,
+    int32_t vector_offset,
+    TDataVec* load_to) {
+  *load_to = *(reinterpret_cast<const TDataVec*>(data_ptr) + vector_offset);
+}
+
+template <typename TDataPtr, typename TDataVec>
+__forceinline__ __device__ void store_v(
+    TDataPtr data_ptr,
+    int32_t vector_offset,
+    TDataVec value) {
+  *(reinterpret_cast<TDataVec*>(data_ptr) + vector_offset) = value;
+}
+
+template <
+    typename scalar_t,
+    int32_t n_loop_unroll = 16,
+    int32_t n_loop_unroll_tail = 2,
+    int32_t T_MAX = 8192,
+    int32_t n_wavefronts_per_block = 16>
+__global__ void efficient_attention_forward_decoder_ck_kernel(
+    const scalar_t* __restrict__ XQ,
+    const scalar_t* __restrict__ cache_K,
+    const scalar_t* __restrict__ cache_V,
+    scalar_t* __restrict__ O,
+    const int32_t* __restrict__ seq_positions,
+    const ptrdiff_t XQ_stride_0,
+    const ptrdiff_t XQ_stride_2,
+    const ptrdiff_t K_stride_0,
+    const ptrdiff_t K_stride_1,
+    const ptrdiff_t K_stride_2,
+    const bool multiquery,
+    const float qk_scale) {
+  static_assert(n_loop_unroll_tail < n_loop_unroll, "");
+
+  constexpr int32_t seq_positions_shift = 0;
+
+  extern __shared__ __align__(16) float smem[];
+
+  // Each block handles a single batch and head
+  const int32_t b = blockIdx.x;
+  const int32_t h = blockIdx.y;
+
+  // Note: this is decoding case where we attend to current and all previous
+  // tokens.
+  const int32_t t_max = seq_positions[b] + seq_positions_shift;
+
+  const int32_t lane_idx = threadIdx.x;
+  const int32_t wavefront_idx = threadIdx.y;
+  const int32_t threads_per_wavefront = blockDim.x;
+  const int32_t wavefronts_per_block = blockDim.y;
+  const int32_t threads_per_block =
+      threads_per_wavefront * wavefronts_per_block;
+  const int32_t thread_linear_idx =
+      lane_idx + wavefront_idx * threads_per_wavefront;
+
+  // Need D_H == 256 (NB: 128 in CUDA because of wavefront/warp sizes 64/32)
+  // const auto* q_ = &(XQ_acc[b][0][h][0]);
+  const auto XQO_base_offset = b * XQ_stride_0 + h * XQ_stride_2;
+  const auto* q_ = XQ + XQO_base_offset;
+
+  const auto cache_KV_base_offset =
+      b * K_stride_0 + (multiquery ? 0 : h * K_stride_2);
+  const auto* cache_K_base = cache_K + cache_KV_base_offset;
+  const auto* cache_V_base = cache_V + cache_KV_base_offset;
+
+  // Load Q into registers in all wavefronts.
+  // Each thread handles 4 D dimensions
+  using data_t = scalar_t;
+  using data_vec4_t = typename ck::vector_type<data_t, 4>::type;
+  data_vec4_t q_thread;
+  load_v<decltype(q_), data_vec4_t>(q_, lane_idx, &q_thread);
+  // Each block computes different B value
+  float max_qk_acc = std::numeric_limits<float>::lowest();
+
+  // Compute S[T_MAX] = for i in range(T): S[t] = sum(Q[d] * K[t, d])
+  // Split T across wavefronts in a block, unroll loads to expose more
+  // parallelism.
+
+  data_vec4_t k_loads[n_loop_unroll];
+
+  constexpr auto dtt = n_wavefronts_per_block * n_loop_unroll;
+  const int32_t t_max_unroll = (t_max / dtt) * dtt;
+
+  for (auto tt = wavefront_idx * n_loop_unroll; tt < t_max_unroll; tt += dtt) {
+#pragma unroll n_loop_unroll
+    for (auto ttt = 0; ttt < n_loop_unroll; ++ttt) {
+      const int32_t t = tt + ttt;
+      // load the K[b][t][h|0][:] row into registers
+      load_v<decltype(cache_K_base), data_vec4_t>(
+          cache_K_base + t * K_stride_1, lane_idx, &k_loads[ttt]);
+    }
+    float qk_accs[n_loop_unroll] = {};
+#pragma unroll n_loop_unroll
+    for (auto ttt = 0; ttt < n_loop_unroll; ++ttt) {
+      ck::inner_product<data_vec4_t, data_vec4_t, float>(
+          q_thread, k_loads[ttt], qk_accs[ttt]);
+      qk_accs[ttt] *= qk_scale;
+
+      qk_accs[ttt] =
+          wavefrontReduce(qk_accs[ttt], [](float a, float b) { return a + b; });
+      max_qk_acc = max(qk_accs[ttt], max_qk_acc);
+    }
+    if (lane_idx == 0) {
+      auto* smem_base = smem + tt;
+#pragma unroll n_loop_unroll
+      for (auto ttt = 0; ttt < n_loop_unroll; ++ttt) {
+        smem_base[ttt] = qk_accs[ttt];
+      }
+    }
+  }
+
+  // NB: the length of the tail is <= (wavefronts_per_block * n_loop_unroll)
+  for (auto tt = t_max_unroll + wavefront_idx * n_loop_unroll_tail; tt < t_max;
+       tt += wavefronts_per_block * n_loop_unroll_tail) {
+#pragma unroll n_loop_unroll_tail
+    for (auto ttt = 0; ttt < n_loop_unroll_tail; ++ttt) {
+      const int32_t t = tt + ttt;
+      if (t < t_max) {
+        // load the K[b][t][h|0][:] row into registers
+        load_v<decltype(cache_K_base), data_vec4_t>(
+            cache_K_base + t * K_stride_1, lane_idx, &k_loads[ttt]);
+      }
+    }
+#pragma unroll n_loop_unroll_tail
+    for (auto ttt = 0; ttt < n_loop_unroll_tail; ++ttt) {
+      float qk_acc = 0;
+      const int32_t t = tt + ttt;
+      if (t < t_max) {
+        ck::inner_product<data_vec4_t, data_vec4_t, float>(
+            q_thread, k_loads[ttt], qk_acc);
+        qk_acc *= qk_scale;
+
+        qk_acc =
+            wavefrontReduce(qk_acc, [](float a, float b) { return a + b; });
+        max_qk_acc = max(qk_acc, max_qk_acc);
+
+        // write accumulated sums to smem.
+        if (lane_idx == 0) {
+          smem[t] = qk_acc;
+        }
+      }
+    }
+  }
+
+  // Use shared reduction to compute max and compute softmax on shared memory.
+  // write max acc
+  if (lane_idx == 0) {
+    smem[T_MAX + wavefront_idx] = max_qk_acc;
+  }
+  __syncthreads();
+  if (lane_idx < wavefronts_per_block) {
+    max_qk_acc = max(max_qk_acc, smem[T_MAX + lane_idx]);
+  }
+  // shared across all threads in block
+  max_qk_acc = wavefrontReduce(
+      max_qk_acc, [](float a, float b) { return a > b ? a : b; });
+
+  // each wavefront computes partial sum of exp.
+  float softmax_denominator = 0.0f;
+  for (int32_t t = thread_linear_idx; t < t_max; t += threads_per_block) {
+    softmax_denominator += __expf(smem[t] - max_qk_acc);
+  }
+  softmax_denominator = wavefrontReduce(
+      softmax_denominator, [](float a, float b) { return a + b; });
+
+  __syncthreads();
+  if (lane_idx == 0) {
+    smem[T_MAX + wavefront_idx] = softmax_denominator;
+  }
+  __syncthreads();
+
+  // now, compute sum of exp(x - max(x)) over all intermediate results.
+  softmax_denominator = 0.0;
+  if (lane_idx < wavefronts_per_block) {
+    softmax_denominator = smem[T_MAX + lane_idx];
+  }
+  softmax_denominator = wavefrontReduce(
+      softmax_denominator, [](float a, float b) { return a + b; });
+
+  const float softmax_scale_factor = 1. / softmax_denominator;
+  // now, compute the normalization across all threads.
+  for (int32_t t = thread_linear_idx; t < t_max; t += threads_per_block) {
+    smem[t] = __expf(smem[t] - max_qk_acc) * softmax_scale_factor;
+  }
+  __syncthreads();
+
+  // Now, we can compute the softmax and write the outputs.
+
+  // Split T across wavefronts in a block
+  // each wavefront compute sum(t_subset) P[t] * V[t_subset, d]
+  // outputs are of size float[D]
+
+  float ps[n_loop_unroll];
+  ck::float4_t o_acc = 0;
+  for (auto tt = wavefront_idx * n_loop_unroll; tt < t_max_unroll; tt += dtt) {
+#pragma unroll n_loop_unroll
+    for (auto ttt = 0; ttt < n_loop_unroll; ++ttt) {
+      const int32_t t = tt + ttt;
+      // load the V[b][t][h|0][:] row into registers, reusing K register storage
+      load_v<decltype(cache_V_base), data_vec4_t>(
+          cache_V_base + t * K_stride_1, lane_idx, &k_loads[ttt]);
+      ps[ttt] = smem[t];
+    }
+
+#pragma unroll n_loop_unroll
+    for (auto ttt = 0; ttt < n_loop_unroll; ++ttt) {
+      o_acc = scalar4_scale_acc<data_vec4_t>(o_acc, k_loads[ttt], ps[ttt]);
+    }
+  }
+
+  for (auto tt = t_max_unroll + wavefront_idx * n_loop_unroll_tail; tt < t_max;
+       tt += wavefronts_per_block * n_loop_unroll_tail) {
+#pragma unroll n_loop_unroll_tail
+    for (auto ttt = 0; ttt < n_loop_unroll_tail; ++ttt) {
+      const int32_t t = tt + ttt;
+      if (t < t_max) {
+        // load the V[b][t][h|0][:] row into registers, reusing K register
+        // storage
+        load_v<decltype(cache_V_base), data_vec4_t>(
+            cache_V_base + t * K_stride_1, lane_idx, &k_loads[ttt]);
+        ps[ttt] = smem[t];
+      }
+    }
+
+#pragma unroll n_loop_unroll_tail
+    for (auto ttt = 0; ttt < n_loop_unroll_tail; ++ttt) {
+      const int32_t t = tt + ttt;
+      if (t < t_max) {
+        o_acc = scalar4_scale_acc<data_vec4_t>(o_acc, k_loads[ttt], ps[ttt]);
+      }
+    }
+  }
+  // now, each thread has partial sums. Write to smem and get accumulated
+  // results back.
+  __syncthreads();
+
+  // NB: needs sizeof(smem) >= 4 * (sizeof(float)==4) * threadsPerBlock
+  store_v<float*, ck::float4_t>(&smem[0], thread_linear_idx, o_acc);
+
+  __syncthreads();
+  // sum up partial D rows from other wavefronts
+  if (wavefront_idx == 0) {
+    ck::float4_t r = 0;
+    for (int32_t w = 0; w < wavefronts_per_block; ++w) {
+      ck::float4_t partial_r;
+      load_v<float*, ck::float4_t>(
+          smem, w * threads_per_wavefront + lane_idx, &partial_r);
+      r += partial_r;
+    }
+    // write output D row
+    data_vec4_t bf_r;
+    bf_r.x = ck::type_convert<data_t>(r.x);
+    bf_r.y = ck::type_convert<data_t>(r.y);
+    bf_r.z = ck::type_convert<data_t>(r.z);
+    bf_r.w = ck::type_convert<data_t>(r.w);
+    auto* o_ = O + XQO_base_offset;
+    store_v<decltype(o_), data_vec4_t>(o_, lane_idx, bf_r);
+  }
+}
+
+} // namespace
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+template <typename scalar_t>
+struct FMHADecoderSeqlen1DeviceOp : public BaseOperator {
+  using DeviceOp = FMHADecoderSeqlen1DeviceOp;
+  struct Argument : public BaseArgument {
+    const scalar_t* __restrict__ XQ;
+    const scalar_t* __restrict__ cache_K;
+    const scalar_t* __restrict__ cache_V;
+    scalar_t* __restrict__ O;
+    const int32_t* __restrict__ seq_positions;
+    const ptrdiff_t XQ_stride_0;
+    const ptrdiff_t XQ_stride_2;
+    const ptrdiff_t K_stride_0;
+    const ptrdiff_t K_stride_1;
+    const ptrdiff_t K_stride_2;
+    const bool multiquery;
+    const float qk_scale;
+
+    const dim3 grid_dim;
+    const dim3 block_dim;
+    const size_t lds_bytes;
+
+    Argument(
+        const scalar_t* __restrict__ XQ,
+        const scalar_t* __restrict__ cache_K,
+        const scalar_t* __restrict__ cache_V,
+        scalar_t* __restrict__ O,
+        const int32_t* __restrict__ seq_positions,
+        const ptrdiff_t XQ_stride_0,
+        const ptrdiff_t XQ_stride_2,
+        const ptrdiff_t K_stride_0,
+        const ptrdiff_t K_stride_1,
+        const ptrdiff_t K_stride_2,
+        const bool multiquery,
+        const float qk_scale,
+        const dim3 grid_dim,
+        const dim3 block_dim,
+        const size_t lds_bytes)
+        : XQ(XQ),
+          cache_K(cache_K),
+          cache_V(cache_V),
+          O(O),
+          seq_positions(seq_positions),
+          XQ_stride_0(XQ_stride_0),
+          XQ_stride_2(XQ_stride_2),
+          K_stride_0(K_stride_0),
+          K_stride_1(K_stride_1),
+          K_stride_2(K_stride_2),
+          multiquery(multiquery),
+          qk_scale(qk_scale),
+          grid_dim(grid_dim),
+          block_dim(block_dim),
+          lds_bytes(lds_bytes) {}
+  };
+  struct Invoker : public BaseInvoker {
+    using Argument = DeviceOp::Argument;
+    float Run(
+        const Argument& arg,
+        const StreamConfig& stream_config = StreamConfig{}) {
+      return launch_and_time_kernel(
+          stream_config,
+          efficient_attention_forward_decoder_ck_kernel<scalar_t>,
+          arg.grid_dim,
+          arg.block_dim,
+          arg.lds_bytes,
+          arg.XQ,
+          arg.cache_K,
+          arg.cache_V,
+          arg.O,
+          arg.seq_positions,
+          arg.XQ_stride_0,
+          arg.XQ_stride_2,
+          arg.K_stride_0,
+          arg.K_stride_1,
+          arg.K_stride_2,
+          arg.multiquery,
+          arg.qk_scale);
+    }
+  };
+};
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
\ No newline at end of file

From a4687e13b41db5e509b3eb68588c1d802ea8bba8 Mon Sep 17 00:00:00 2001
From: Qianfeng Zhang <Qianfeng.Zhang@amd.com>
Date: Tue, 7 Nov 2023 16:14:51 +0000
Subject: [PATCH 189/641] Tiny removing useless declaration

---
 .../csrc/attention/hip_fmha/ck_fmha_batched_forward.h    | 9 ---------
 1 file changed, 9 deletions(-)

diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_forward.h b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_forward.h
index 7959bb088..80d440fa6 100644
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_forward.h
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_forward.h
@@ -52,15 +52,6 @@ struct batched_forward_masktype_attnbias_dispatched {
       static_cast<ck::tensor_operation::device::MaskingSpecialization>(
           custom_mask_type);
 
-  static constexpr auto TensorSpecA =
-      ck::tensor_operation::device::TensorSpecialization::Default;
-  static constexpr auto TensorSpecB0 =
-      ck::tensor_operation::device::TensorSpecialization::Default;
-  static constexpr auto TensorSpecB1 =
-      ck::tensor_operation::device::TensorSpecialization::Default;
-  static constexpr auto TensorSpecC =
-      ck::tensor_operation::device::TensorSpecialization::Default;
-
   static constexpr ck::index_t kAcc0BiasTransferSrcScalarPerVector = 1;
 
 #ifndef BATCHED_FORWARD_HEADDIM_SWITCH

From 41a9502526de1a5318133a9a11abb1d5affecd4c Mon Sep 17 00:00:00 2001
From: Qianfeng Zhang <Qianfeng.Zhang@amd.com>
Date: Wed, 8 Nov 2023 10:03:45 +0000
Subject: [PATCH 190/641] Use function wrapper instantiation to replace class
 instantiation to avoid inline compiling

---
 .../hip_fmha/ck_fmha_batched_backward.h       | 15 +++++
 .../ck_fmha_batched_backward_bp16.cpp         | 62 +++++++++----------
 ...d_backward_bp16_masktype_0_no_attnbias.cpp |  8 +--
 ...backward_bp16_masktype_0_with_attnbias.cpp |  8 +--
 ...d_backward_bp16_masktype_1_no_attnbias.cpp |  8 +--
 ...backward_bp16_masktype_1_with_attnbias.cpp |  8 +--
 ...d_backward_bp16_masktype_2_no_attnbias.cpp |  8 +--
 ...backward_bp16_masktype_2_with_attnbias.cpp |  8 +--
 .../ck_fmha_batched_backward_fp16.cpp         | 62 +++++++++----------
 ...d_backward_fp16_masktype_0_no_attnbias.cpp |  8 +--
 ...backward_fp16_masktype_0_with_attnbias.cpp |  8 +--
 ...d_backward_fp16_masktype_1_no_attnbias.cpp |  8 +--
 ...backward_fp16_masktype_1_with_attnbias.cpp |  8 +--
 ...d_backward_fp16_masktype_2_no_attnbias.cpp |  8 +--
 ...backward_fp16_masktype_2_with_attnbias.cpp |  8 +--
 .../hip_fmha/ck_fmha_batched_forward.h        |  9 +++
 .../hip_fmha/ck_fmha_batched_forward_bp16.cpp | 38 ++++++------
 ...ed_forward_bp16_masktype_0_no_attnbias.cpp |  4 +-
 ..._forward_bp16_masktype_0_with_attnbias.cpp |  4 +-
 ...ed_forward_bp16_masktype_1_no_attnbias.cpp |  4 +-
 ..._forward_bp16_masktype_1_with_attnbias.cpp |  4 +-
 ...ed_forward_bp16_masktype_2_no_attnbias.cpp |  4 +-
 ..._forward_bp16_masktype_2_with_attnbias.cpp |  4 +-
 .../hip_fmha/ck_fmha_batched_forward_fp16.cpp | 38 ++++++------
 ...ed_forward_fp16_masktype_0_no_attnbias.cpp |  4 +-
 ..._forward_fp16_masktype_0_with_attnbias.cpp |  4 +-
 ...ed_forward_fp16_masktype_1_no_attnbias.cpp |  4 +-
 ..._forward_fp16_masktype_1_with_attnbias.cpp |  4 +-
 ...ed_forward_fp16_masktype_2_no_attnbias.cpp |  4 +-
 ..._forward_fp16_masktype_2_with_attnbias.cpp |  4 +-
 .../hip_fmha/ck_fmha_batched_infer.h          | 10 +++
 .../hip_fmha/ck_fmha_batched_infer_bp16.cpp   | 38 ++++++------
 ...ched_infer_bp16_masktype_0_no_attnbias.cpp |  5 +-
 ...ed_infer_bp16_masktype_0_with_attnbias.cpp |  5 +-
 ...ched_infer_bp16_masktype_1_no_attnbias.cpp |  5 +-
 ...ed_infer_bp16_masktype_1_with_attnbias.cpp |  5 +-
 ...ched_infer_bp16_masktype_2_no_attnbias.cpp |  5 +-
 ...ed_infer_bp16_masktype_2_with_attnbias.cpp |  5 +-
 .../hip_fmha/ck_fmha_batched_infer_fp16.cpp   | 44 +++++++------
 ...ched_infer_fp16_masktype_0_no_attnbias.cpp |  5 +-
 ...ed_infer_fp16_masktype_0_with_attnbias.cpp |  6 +-
 ...ched_infer_fp16_masktype_1_no_attnbias.cpp |  5 +-
 ...ed_infer_fp16_masktype_1_with_attnbias.cpp |  6 +-
 ...ched_infer_fp16_masktype_2_no_attnbias.cpp |  5 +-
 ...ed_infer_fp16_masktype_2_with_attnbias.cpp |  6 +-
 .../hip_fmha/ck_fmha_grouped_backward.h       | 15 +++++
 .../ck_fmha_grouped_backward_bp16.cpp         | 62 +++++++++----------
 ...d_backward_bp16_masktype_0_no_attnbias.cpp |  8 +--
 ...backward_bp16_masktype_0_with_attnbias.cpp |  8 +--
 ...d_backward_bp16_masktype_1_no_attnbias.cpp |  8 +--
 ...backward_bp16_masktype_1_with_attnbias.cpp |  8 +--
 ...d_backward_bp16_masktype_2_no_attnbias.cpp |  8 +--
 ...backward_bp16_masktype_2_with_attnbias.cpp |  8 +--
 .../ck_fmha_grouped_backward_fp16.cpp         | 62 +++++++++----------
 ...d_backward_fp16_masktype_0_no_attnbias.cpp |  8 +--
 ...backward_fp16_masktype_0_with_attnbias.cpp |  8 +--
 ...d_backward_fp16_masktype_1_no_attnbias.cpp |  8 +--
 ...backward_fp16_masktype_1_with_attnbias.cpp |  8 +--
 ...d_backward_fp16_masktype_2_no_attnbias.cpp |  8 +--
 ...backward_fp16_masktype_2_with_attnbias.cpp |  8 +--
 .../hip_fmha/ck_fmha_grouped_forward.h        | 10 +++
 .../hip_fmha/ck_fmha_grouped_forward_bp16.cpp | 38 ++++++------
 ...ed_forward_bp16_masktype_0_no_attnbias.cpp |  4 +-
 ..._forward_bp16_masktype_0_with_attnbias.cpp |  4 +-
 ...ed_forward_bp16_masktype_1_no_attnbias.cpp |  4 +-
 ..._forward_bp16_masktype_1_with_attnbias.cpp |  4 +-
 ...ed_forward_bp16_masktype_2_no_attnbias.cpp |  4 +-
 ..._forward_bp16_masktype_2_with_attnbias.cpp |  4 +-
 .../hip_fmha/ck_fmha_grouped_forward_fp16.cpp | 38 ++++++------
 ...ed_forward_fp16_masktype_0_no_attnbias.cpp |  4 +-
 ..._forward_fp16_masktype_0_with_attnbias.cpp |  4 +-
 ...ed_forward_fp16_masktype_1_no_attnbias.cpp |  4 +-
 ..._forward_fp16_masktype_1_with_attnbias.cpp |  4 +-
 ...ed_forward_fp16_masktype_2_no_attnbias.cpp |  4 +-
 ..._forward_fp16_masktype_2_with_attnbias.cpp |  4 +-
 .../hip_fmha/ck_fmha_grouped_infer.h          | 10 +++
 .../hip_fmha/ck_fmha_grouped_infer_bp16.cpp   | 38 ++++++------
 ...uped_infer_bp16_masktype_0_no_attnbias.cpp |  5 +-
 ...ed_infer_bp16_masktype_0_with_attnbias.cpp |  5 +-
 ...uped_infer_bp16_masktype_1_no_attnbias.cpp |  5 +-
 ...ed_infer_bp16_masktype_1_with_attnbias.cpp |  5 +-
 ...uped_infer_bp16_masktype_2_no_attnbias.cpp |  5 +-
 ...ed_infer_bp16_masktype_2_with_attnbias.cpp |  5 +-
 .../hip_fmha/ck_fmha_grouped_infer_fp16.cpp   | 44 +++++++------
 ...uped_infer_fp16_masktype_0_no_attnbias.cpp |  5 +-
 ...ed_infer_fp16_masktype_0_with_attnbias.cpp |  6 +-
 ...uped_infer_fp16_masktype_1_no_attnbias.cpp |  5 +-
 ...ed_infer_fp16_masktype_1_with_attnbias.cpp |  6 +-
 ...uped_infer_fp16_masktype_2_no_attnbias.cpp |  5 +-
 ...ed_infer_fp16_masktype_2_with_attnbias.cpp |  6 +-
 90 files changed, 561 insertions(+), 486 deletions(-)

diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_backward.h b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_backward.h
index 9de59b5bd..1663e9c52 100644
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_backward.h
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_backward.h
@@ -500,3 +500,18 @@ struct batched_backward_masktype_attnbias_dispatched {
     (void)invoker.Run(arg_ptr.get(), StreamConfig{stream, false});
   };
 };
+
+template <
+    typename scalar_t,
+    int32_t custom_mask_type,
+    bool has_attn_bias,
+    bool use_fp32_qkv_grad>
+void run_batched_backward_masktype_attnbias_dispatched(
+    BatchedBackwardParams& param,
+    hipStream_t stream) {
+  batched_backward_masktype_attnbias_dispatched<
+      scalar_t,
+      custom_mask_type,
+      has_attn_bias,
+      use_fp32_qkv_grad>::Run(param, stream);
+};
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_backward_bp16.cpp b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_backward_bp16.cpp
index 81615faf9..441a4f9cf 100644
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_backward_bp16.cpp
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_backward_bp16.cpp
@@ -1,80 +1,80 @@
 #include <ck.hpp>
 #include <stdexcept>
 
-#include "ck_fmha_batched_backward.h"
 #include "ck_bool_switch.h"
+#include "ck_fmha_batched_backward.h"
 
-extern template struct batched_backward_masktype_attnbias_dispatched<
+extern template void run_batched_backward_masktype_attnbias_dispatched<
     ck::bhalf_t,
     0,
     true,
-    true>;
+    true>(BatchedBackwardParams& param, hipStream_t stream);
 
-extern template struct batched_backward_masktype_attnbias_dispatched<
+extern template void run_batched_backward_masktype_attnbias_dispatched<
     ck::bhalf_t,
     0,
     true,
-    false>;
+    false>(BatchedBackwardParams& param, hipStream_t stream);
 
-extern template struct batched_backward_masktype_attnbias_dispatched<
+extern template void run_batched_backward_masktype_attnbias_dispatched<
     ck::bhalf_t,
     0,
     false,
-    true>;
+    true>(BatchedBackwardParams& param, hipStream_t stream);
 
-extern template struct batched_backward_masktype_attnbias_dispatched<
+extern template void run_batched_backward_masktype_attnbias_dispatched<
     ck::bhalf_t,
     0,
     false,
-    false>;
+    false>(BatchedBackwardParams& param, hipStream_t stream);
 
-extern template struct batched_backward_masktype_attnbias_dispatched<
+extern template void run_batched_backward_masktype_attnbias_dispatched<
     ck::bhalf_t,
     1,
     true,
-    true>;
+    true>(BatchedBackwardParams& param, hipStream_t stream);
 
-extern template struct batched_backward_masktype_attnbias_dispatched<
+extern template void run_batched_backward_masktype_attnbias_dispatched<
     ck::bhalf_t,
     1,
     true,
-    false>;
+    false>(BatchedBackwardParams& param, hipStream_t stream);
 
-extern template struct batched_backward_masktype_attnbias_dispatched<
+extern template void run_batched_backward_masktype_attnbias_dispatched<
     ck::bhalf_t,
     1,
     false,
-    true>;
+    true>(BatchedBackwardParams& param, hipStream_t stream);
 
-extern template struct batched_backward_masktype_attnbias_dispatched<
+extern template void run_batched_backward_masktype_attnbias_dispatched<
     ck::bhalf_t,
     1,
     false,
-    false>;
+    false>(BatchedBackwardParams& param, hipStream_t stream);
 
-extern template struct batched_backward_masktype_attnbias_dispatched<
+extern template void run_batched_backward_masktype_attnbias_dispatched<
     ck::bhalf_t,
     2,
     true,
-    true>;
+    true>(BatchedBackwardParams& param, hipStream_t stream);
 
-extern template struct batched_backward_masktype_attnbias_dispatched<
+extern template void run_batched_backward_masktype_attnbias_dispatched<
     ck::bhalf_t,
     2,
     true,
-    false>;
+    false>(BatchedBackwardParams& param, hipStream_t stream);
 
-extern template struct batched_backward_masktype_attnbias_dispatched<
+extern template void run_batched_backward_masktype_attnbias_dispatched<
     ck::bhalf_t,
     2,
     false,
-    true>;
+    true>(BatchedBackwardParams& param, hipStream_t stream);
 
-extern template struct batched_backward_masktype_attnbias_dispatched<
+extern template void run_batched_backward_masktype_attnbias_dispatched<
     ck::bhalf_t,
     2,
     false,
-    false>;
+    false>(BatchedBackwardParams& param, hipStream_t stream);
 
 void batched_backward_bp16(BatchedBackwardParams& param, hipStream_t stream) {
   BOOL_SWITCH_2(
@@ -84,23 +84,23 @@ void batched_backward_bp16(BatchedBackwardParams& param, hipStream_t stream) {
       USE_FP32_QKV_GRAD,
       [&] {
         if (param.custom_mask_type == 0)
-          batched_backward_masktype_attnbias_dispatched<
+          run_batched_backward_masktype_attnbias_dispatched<
               ck::bhalf_t,
               0,
               HAS_ATTN_BIAS,
-              USE_FP32_QKV_GRAD>::Run(param, stream);
+              USE_FP32_QKV_GRAD>(param, stream);
         else if (param.custom_mask_type == 1)
-          batched_backward_masktype_attnbias_dispatched<
+          run_batched_backward_masktype_attnbias_dispatched<
               ck::bhalf_t,
               1,
               HAS_ATTN_BIAS,
-              USE_FP32_QKV_GRAD>::Run(param, stream);
+              USE_FP32_QKV_GRAD>(param, stream);
         else if (param.custom_mask_type == 2)
-          batched_backward_masktype_attnbias_dispatched<
+          run_batched_backward_masktype_attnbias_dispatched<
               ck::bhalf_t,
               2,
               HAS_ATTN_BIAS,
-              USE_FP32_QKV_GRAD>::Run(param, stream);
+              USE_FP32_QKV_GRAD>(param, stream);
         else
           throw std::runtime_error("Invalid custom_mask_type value");
       });
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_backward_bp16_masktype_0_no_attnbias.cpp b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_backward_bp16_masktype_0_no_attnbias.cpp
index 52541f380..2bf962a9f 100644
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_backward_bp16_masktype_0_no_attnbias.cpp
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_backward_bp16_masktype_0_no_attnbias.cpp
@@ -1,14 +1,14 @@
 #include <ck.hpp>
 #include "ck_fmha_batched_backward.h"
 
-template struct batched_backward_masktype_attnbias_dispatched<
+template void run_batched_backward_masktype_attnbias_dispatched<
     ck::bhalf_t,
     0,
     false,
-    true>;
+    true>(BatchedBackwardParams& param, hipStream_t stream);
 
-template struct batched_backward_masktype_attnbias_dispatched<
+template void run_batched_backward_masktype_attnbias_dispatched<
     ck::bhalf_t,
     0,
     false,
-    false>;
+    false>(BatchedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_backward_bp16_masktype_0_with_attnbias.cpp b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_backward_bp16_masktype_0_with_attnbias.cpp
index 7bf0a5959..b3c5bbf70 100644
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_backward_bp16_masktype_0_with_attnbias.cpp
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_backward_bp16_masktype_0_with_attnbias.cpp
@@ -1,14 +1,14 @@
 #include <ck.hpp>
 #include "ck_fmha_batched_backward.h"
 
-template struct batched_backward_masktype_attnbias_dispatched<
+template void run_batched_backward_masktype_attnbias_dispatched<
     ck::bhalf_t,
     0,
     true,
-    true>;
+    true>(BatchedBackwardParams& param, hipStream_t stream);
 
-template struct batched_backward_masktype_attnbias_dispatched<
+template void run_batched_backward_masktype_attnbias_dispatched<
     ck::bhalf_t,
     0,
     true,
-    false>;
+    false>(BatchedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_backward_bp16_masktype_1_no_attnbias.cpp b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_backward_bp16_masktype_1_no_attnbias.cpp
index 6420ddf15..4a96b4a3d 100644
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_backward_bp16_masktype_1_no_attnbias.cpp
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_backward_bp16_masktype_1_no_attnbias.cpp
@@ -1,14 +1,14 @@
 #include <ck.hpp>
 #include "ck_fmha_batched_backward.h"
 
-template struct batched_backward_masktype_attnbias_dispatched<
+template void run_batched_backward_masktype_attnbias_dispatched<
     ck::bhalf_t,
     1,
     false,
-    true>;
+    true>(BatchedBackwardParams& param, hipStream_t stream);
 
-template struct batched_backward_masktype_attnbias_dispatched<
+template void run_batched_backward_masktype_attnbias_dispatched<
     ck::bhalf_t,
     1,
     false,
-    false>;
+    false>(BatchedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_backward_bp16_masktype_1_with_attnbias.cpp b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_backward_bp16_masktype_1_with_attnbias.cpp
index b10c2895c..37ec0f03c 100644
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_backward_bp16_masktype_1_with_attnbias.cpp
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_backward_bp16_masktype_1_with_attnbias.cpp
@@ -1,14 +1,14 @@
 #include <ck.hpp>
 #include "ck_fmha_batched_backward.h"
 
-template struct batched_backward_masktype_attnbias_dispatched<
+template void run_batched_backward_masktype_attnbias_dispatched<
     ck::bhalf_t,
     1,
     true,
-    true>;
+    true>(BatchedBackwardParams& param, hipStream_t stream);
 
-template struct batched_backward_masktype_attnbias_dispatched<
+template void run_batched_backward_masktype_attnbias_dispatched<
     ck::bhalf_t,
     1,
     true,
-    false>;
+    false>(BatchedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_backward_bp16_masktype_2_no_attnbias.cpp b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_backward_bp16_masktype_2_no_attnbias.cpp
index aca4acbf2..c80a47952 100644
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_backward_bp16_masktype_2_no_attnbias.cpp
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_backward_bp16_masktype_2_no_attnbias.cpp
@@ -1,14 +1,14 @@
 #include <ck.hpp>
 #include "ck_fmha_batched_backward.h"
 
-template struct batched_backward_masktype_attnbias_dispatched<
+template void run_batched_backward_masktype_attnbias_dispatched<
     ck::bhalf_t,
     2,
     false,
-    true>;
+    true>(BatchedBackwardParams& param, hipStream_t stream);
 
-template struct batched_backward_masktype_attnbias_dispatched<
+template void run_batched_backward_masktype_attnbias_dispatched<
     ck::bhalf_t,
     2,
     false,
-    false>;
+    false>(BatchedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_backward_bp16_masktype_2_with_attnbias.cpp b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_backward_bp16_masktype_2_with_attnbias.cpp
index c8ef03050..c1dc61c5a 100644
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_backward_bp16_masktype_2_with_attnbias.cpp
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_backward_bp16_masktype_2_with_attnbias.cpp
@@ -1,14 +1,14 @@
 #include <ck.hpp>
 #include "ck_fmha_batched_backward.h"
 
-template struct batched_backward_masktype_attnbias_dispatched<
+template void run_batched_backward_masktype_attnbias_dispatched<
     ck::bhalf_t,
     2,
     true,
-    true>;
+    true>(BatchedBackwardParams& param, hipStream_t stream);
 
-template struct batched_backward_masktype_attnbias_dispatched<
+template void run_batched_backward_masktype_attnbias_dispatched<
     ck::bhalf_t,
     2,
     true,
-    false>;
+    false>(BatchedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_backward_fp16.cpp b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_backward_fp16.cpp
index 3527beba7..1868a5957 100644
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_backward_fp16.cpp
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_backward_fp16.cpp
@@ -1,80 +1,80 @@
 #include <ck.hpp>
 #include <stdexcept>
 
-#include "ck_fmha_batched_backward.h"
 #include "ck_bool_switch.h"
+#include "ck_fmha_batched_backward.h"
 
-extern template struct batched_backward_masktype_attnbias_dispatched<
+extern template void run_batched_backward_masktype_attnbias_dispatched<
     ck::half_t,
     0,
     true,
-    true>;
+    true>(BatchedBackwardParams& param, hipStream_t stream);
 
-extern template struct batched_backward_masktype_attnbias_dispatched<
+extern template void run_batched_backward_masktype_attnbias_dispatched<
     ck::half_t,
     0,
     true,
-    false>;
+    false>(BatchedBackwardParams& param, hipStream_t stream);
 
-extern template struct batched_backward_masktype_attnbias_dispatched<
+extern template void run_batched_backward_masktype_attnbias_dispatched<
     ck::half_t,
     0,
     false,
-    true>;
+    true>(BatchedBackwardParams& param, hipStream_t stream);
 
-extern template struct batched_backward_masktype_attnbias_dispatched<
+extern template void run_batched_backward_masktype_attnbias_dispatched<
     ck::half_t,
     0,
     false,
-    false>;
+    false>(BatchedBackwardParams& param, hipStream_t stream);
 
-extern template struct batched_backward_masktype_attnbias_dispatched<
+extern template void run_batched_backward_masktype_attnbias_dispatched<
     ck::half_t,
     1,
     true,
-    true>;
+    true>(BatchedBackwardParams& param, hipStream_t stream);
 
-extern template struct batched_backward_masktype_attnbias_dispatched<
+extern template void run_batched_backward_masktype_attnbias_dispatched<
     ck::half_t,
     1,
     true,
-    false>;
+    false>(BatchedBackwardParams& param, hipStream_t stream);
 
-extern template struct batched_backward_masktype_attnbias_dispatched<
+extern template void run_batched_backward_masktype_attnbias_dispatched<
     ck::half_t,
     1,
     false,
-    true>;
+    true>(BatchedBackwardParams& param, hipStream_t stream);
 
-extern template struct batched_backward_masktype_attnbias_dispatched<
+extern template void run_batched_backward_masktype_attnbias_dispatched<
     ck::half_t,
     1,
     false,
-    false>;
+    false>(BatchedBackwardParams& param, hipStream_t stream);
 
-extern template struct batched_backward_masktype_attnbias_dispatched<
+extern template void run_batched_backward_masktype_attnbias_dispatched<
     ck::half_t,
     2,
     true,
-    true>;
+    true>(BatchedBackwardParams& param, hipStream_t stream);
 
-extern template struct batched_backward_masktype_attnbias_dispatched<
+extern template void run_batched_backward_masktype_attnbias_dispatched<
     ck::half_t,
     2,
     true,
-    false>;
+    false>(BatchedBackwardParams& param, hipStream_t stream);
 
-extern template struct batched_backward_masktype_attnbias_dispatched<
+extern template void run_batched_backward_masktype_attnbias_dispatched<
     ck::half_t,
     2,
     false,
-    true>;
+    true>(BatchedBackwardParams& param, hipStream_t stream);
 
-extern template struct batched_backward_masktype_attnbias_dispatched<
+extern template void run_batched_backward_masktype_attnbias_dispatched<
     ck::half_t,
     2,
     false,
-    false>;
+    false>(BatchedBackwardParams& param, hipStream_t stream);
 
 void batched_backward_fp16(BatchedBackwardParams& param, hipStream_t stream) {
   BOOL_SWITCH_2(
@@ -84,23 +84,23 @@ void batched_backward_fp16(BatchedBackwardParams& param, hipStream_t stream) {
       USE_FP32_QKV_GRAD,
       [&] {
         if (param.custom_mask_type == 0)
-          batched_backward_masktype_attnbias_dispatched<
+          run_batched_backward_masktype_attnbias_dispatched<
               ck::half_t,
               0,
               HAS_ATTN_BIAS,
-              USE_FP32_QKV_GRAD>::Run(param, stream);
+              USE_FP32_QKV_GRAD>(param, stream);
         else if (param.custom_mask_type == 1)
-          batched_backward_masktype_attnbias_dispatched<
+          run_batched_backward_masktype_attnbias_dispatched<
               ck::half_t,
               1,
               HAS_ATTN_BIAS,
-              USE_FP32_QKV_GRAD>::Run(param, stream);
+              USE_FP32_QKV_GRAD>(param, stream);
         else if (param.custom_mask_type == 2)
-          batched_backward_masktype_attnbias_dispatched<
+          run_batched_backward_masktype_attnbias_dispatched<
               ck::half_t,
               2,
               HAS_ATTN_BIAS,
-              USE_FP32_QKV_GRAD>::Run(param, stream);
+              USE_FP32_QKV_GRAD>(param, stream);
         else
           throw std::runtime_error("Invalid custom_mask_type value");
       });
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_backward_fp16_masktype_0_no_attnbias.cpp b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_backward_fp16_masktype_0_no_attnbias.cpp
index 6421a77b3..46caaa20d 100644
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_backward_fp16_masktype_0_no_attnbias.cpp
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_backward_fp16_masktype_0_no_attnbias.cpp
@@ -1,14 +1,14 @@
 #include <ck.hpp>
 #include "ck_fmha_batched_backward.h"
 
-template struct batched_backward_masktype_attnbias_dispatched<
+template void run_batched_backward_masktype_attnbias_dispatched<
     ck::half_t,
     0,
     false,
-    true>;
+    true>(BatchedBackwardParams& param, hipStream_t stream);
 
-template struct batched_backward_masktype_attnbias_dispatched<
+template void run_batched_backward_masktype_attnbias_dispatched<
     ck::half_t,
     0,
     false,
-    false>;
+    false>(BatchedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_backward_fp16_masktype_0_with_attnbias.cpp b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_backward_fp16_masktype_0_with_attnbias.cpp
index 7e7bc9ad4..c328beb8d 100644
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_backward_fp16_masktype_0_with_attnbias.cpp
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_backward_fp16_masktype_0_with_attnbias.cpp
@@ -1,14 +1,14 @@
 #include <ck.hpp>
 #include "ck_fmha_batched_backward.h"
 
-template struct batched_backward_masktype_attnbias_dispatched<
+template void run_batched_backward_masktype_attnbias_dispatched<
     ck::half_t,
     0,
     true,
-    true>;
+    true>(BatchedBackwardParams& param, hipStream_t stream);
 
-template struct batched_backward_masktype_attnbias_dispatched<
+template void run_batched_backward_masktype_attnbias_dispatched<
     ck::half_t,
     0,
     true,
-    false>;
+    false>(BatchedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_backward_fp16_masktype_1_no_attnbias.cpp b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_backward_fp16_masktype_1_no_attnbias.cpp
index cbfa45b67..2897cba5d 100644
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_backward_fp16_masktype_1_no_attnbias.cpp
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_backward_fp16_masktype_1_no_attnbias.cpp
@@ -1,14 +1,14 @@
 #include <ck.hpp>
 #include "ck_fmha_batched_backward.h"
 
-template struct batched_backward_masktype_attnbias_dispatched<
+template void run_batched_backward_masktype_attnbias_dispatched<
     ck::half_t,
     1,
     false,
-    true>;
+    true>(BatchedBackwardParams& param, hipStream_t stream);
 
-template struct batched_backward_masktype_attnbias_dispatched<
+template void run_batched_backward_masktype_attnbias_dispatched<
     ck::half_t,
     1,
     false,
-    false>;
+    false>(BatchedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_backward_fp16_masktype_1_with_attnbias.cpp b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_backward_fp16_masktype_1_with_attnbias.cpp
index dc2df739a..62b82e22a 100644
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_backward_fp16_masktype_1_with_attnbias.cpp
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_backward_fp16_masktype_1_with_attnbias.cpp
@@ -3,14 +3,14 @@
 
 #include "ck_fmha_batched_backward.h"
 
-template struct batched_backward_masktype_attnbias_dispatched<
+template void run_batched_backward_masktype_attnbias_dispatched<
     ck::half_t,
     1,
     true,
-    true>;
+    true>(BatchedBackwardParams& param, hipStream_t stream);
 
-template struct batched_backward_masktype_attnbias_dispatched<
+template void run_batched_backward_masktype_attnbias_dispatched<
     ck::half_t,
     1,
     true,
-    false>;
+    false>(BatchedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_backward_fp16_masktype_2_no_attnbias.cpp b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_backward_fp16_masktype_2_no_attnbias.cpp
index 1f77acb1c..1ea6309d6 100644
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_backward_fp16_masktype_2_no_attnbias.cpp
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_backward_fp16_masktype_2_no_attnbias.cpp
@@ -1,14 +1,14 @@
 #include <ck.hpp>
 #include "ck_fmha_batched_backward.h"
 
-template struct batched_backward_masktype_attnbias_dispatched<
+template void run_batched_backward_masktype_attnbias_dispatched<
     ck::half_t,
     2,
     false,
-    true>;
+    true>(BatchedBackwardParams& param, hipStream_t stream);
 
-template struct batched_backward_masktype_attnbias_dispatched<
+template void run_batched_backward_masktype_attnbias_dispatched<
     ck::half_t,
     2,
     false,
-    false>;
+    false>(BatchedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_backward_fp16_masktype_2_with_attnbias.cpp b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_backward_fp16_masktype_2_with_attnbias.cpp
index 5743fb768..24f2ce4b2 100644
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_backward_fp16_masktype_2_with_attnbias.cpp
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_backward_fp16_masktype_2_with_attnbias.cpp
@@ -1,14 +1,14 @@
 #include <ck.hpp>
 #include "ck_fmha_batched_backward.h"
 
-template struct batched_backward_masktype_attnbias_dispatched<
+template void run_batched_backward_masktype_attnbias_dispatched<
     ck::half_t,
     2,
     true,
-    true>;
+    true>(BatchedBackwardParams& param, hipStream_t stream);
 
-template struct batched_backward_masktype_attnbias_dispatched<
+template void run_batched_backward_masktype_attnbias_dispatched<
     ck::half_t,
     2,
     true,
-    false>;
+    false>(BatchedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_forward.h b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_forward.h
index 80d440fa6..7b5193256 100644
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_forward.h
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_forward.h
@@ -360,3 +360,12 @@ struct batched_forward_masktype_attnbias_dispatched {
     invoker.Run(arg_ptr.get(), StreamConfig{stream, false});
   };
 };
+
+template <typename scalar_t, int32_t custom_mask_type, bool has_attn_bias>
+void run_batched_forward_masktype_attnbias_dispatched(BatchedForwardParams& param, hipStream_t stream)
+{
+  batched_forward_masktype_attnbias_dispatched<
+      scalar_t,
+      custom_mask_type,
+      has_attn_bias>::Run(param, stream);
+};
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_forward_bp16.cpp b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_forward_bp16.cpp
index 865c2de58..91d73009d 100644
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_forward_bp16.cpp
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_forward_bp16.cpp
@@ -1,56 +1,56 @@
 #include <ck/ck.hpp>
 #include <stdexcept>
 
-#include "ck_fmha_batched_forward.h"
 #include "ck_bool_switch.h"
+#include "ck_fmha_batched_forward.h"
 
-extern template struct batched_forward_masktype_attnbias_dispatched<
+extern template void run_batched_forward_masktype_attnbias_dispatched<
     ck::bhalf_t,
     0,
-    true>;
+    true>(BatchedForwardParams& param, hipStream_t stream);
 
-extern template struct batched_forward_masktype_attnbias_dispatched<
+extern template void run_batched_forward_masktype_attnbias_dispatched<
     ck::bhalf_t,
     0,
-    false>;
+    false>(BatchedForwardParams& param, hipStream_t stream);
 
-extern template struct batched_forward_masktype_attnbias_dispatched<
+extern template void run_batched_forward_masktype_attnbias_dispatched<
     ck::bhalf_t,
     1,
-    true>;
+    true>(BatchedForwardParams& param, hipStream_t stream);
 
-extern template struct batched_forward_masktype_attnbias_dispatched<
+extern template void run_batched_forward_masktype_attnbias_dispatched<
     ck::bhalf_t,
     1,
-    false>;
+    false>(BatchedForwardParams& param, hipStream_t stream);
 
-extern template struct batched_forward_masktype_attnbias_dispatched<
+extern template void run_batched_forward_masktype_attnbias_dispatched<
     ck::bhalf_t,
     2,
-    true>;
+    true>(BatchedForwardParams& param, hipStream_t stream);
 
-extern template struct batched_forward_masktype_attnbias_dispatched<
+extern template void run_batched_forward_masktype_attnbias_dispatched<
     ck::bhalf_t,
     2,
-    false>;
+    false>(BatchedForwardParams& param, hipStream_t stream);
 
 void batched_forward_bp16(BatchedForwardParams& param, hipStream_t stream) {
   BOOL_SWITCH_1(param.has_attn_bias, HAS_ATTN_BIAS, [&] {
     if (param.custom_mask_type == 0)
-      batched_forward_masktype_attnbias_dispatched<
+      run_batched_forward_masktype_attnbias_dispatched<
           ck::bhalf_t,
           0,
-          HAS_ATTN_BIAS>::Run(param, stream);
+          HAS_ATTN_BIAS>(param, stream);
     else if (param.custom_mask_type == 1)
-      batched_forward_masktype_attnbias_dispatched<
+      run_batched_forward_masktype_attnbias_dispatched<
           ck::bhalf_t,
           1,
-          HAS_ATTN_BIAS>::Run(param, stream);
+          HAS_ATTN_BIAS>(param, stream);
     else if (param.custom_mask_type == 2)
-      batched_forward_masktype_attnbias_dispatched<
+      run_batched_forward_masktype_attnbias_dispatched<
           ck::bhalf_t,
           2,
-          HAS_ATTN_BIAS>::Run(param, stream);
+          HAS_ATTN_BIAS>(param, stream);
     else
       throw std::runtime_error("Invalid custom_mask_type value");
   });
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_forward_bp16_masktype_0_no_attnbias.cpp b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_forward_bp16_masktype_0_no_attnbias.cpp
index be1d4f58d..140cffce0 100644
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_forward_bp16_masktype_0_no_attnbias.cpp
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_forward_bp16_masktype_0_no_attnbias.cpp
@@ -1,7 +1,7 @@
 #include <ck/ck.hpp>
 #include "ck_fmha_batched_forward.h"
 
-template struct batched_forward_masktype_attnbias_dispatched<
+template void run_batched_forward_masktype_attnbias_dispatched<
     ck::bhalf_t,
     0,
-    false>;
+    false>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_forward_bp16_masktype_0_with_attnbias.cpp b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_forward_bp16_masktype_0_with_attnbias.cpp
index 54091ff9b..bb32b63ef 100644
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_forward_bp16_masktype_0_with_attnbias.cpp
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_forward_bp16_masktype_0_with_attnbias.cpp
@@ -1,7 +1,7 @@
 #include <ck/ck.hpp>
 #include "ck_fmha_batched_forward.h"
 
-template struct batched_forward_masktype_attnbias_dispatched<
+template void run_batched_forward_masktype_attnbias_dispatched<
     ck::bhalf_t,
     0,
-    true>;
+    true>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_forward_bp16_masktype_1_no_attnbias.cpp b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_forward_bp16_masktype_1_no_attnbias.cpp
index 8f2778fd6..6ba23b3a2 100644
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_forward_bp16_masktype_1_no_attnbias.cpp
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_forward_bp16_masktype_1_no_attnbias.cpp
@@ -1,7 +1,7 @@
 #include <ck/ck.hpp>
 #include "ck_fmha_batched_forward.h"
 
-template struct batched_forward_masktype_attnbias_dispatched<
+template void run_batched_forward_masktype_attnbias_dispatched<
     ck::bhalf_t,
     1,
-    false>;
+    false>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_forward_bp16_masktype_1_with_attnbias.cpp b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_forward_bp16_masktype_1_with_attnbias.cpp
index da35f17b9..400df0b3d 100644
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_forward_bp16_masktype_1_with_attnbias.cpp
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_forward_bp16_masktype_1_with_attnbias.cpp
@@ -1,7 +1,7 @@
 #include <ck/ck.hpp>
 #include "ck_fmha_batched_forward.h"
 
-template struct batched_forward_masktype_attnbias_dispatched<
+template void run_batched_forward_masktype_attnbias_dispatched<
     ck::bhalf_t,
     1,
-    true>;
+    true>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_forward_bp16_masktype_2_no_attnbias.cpp b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_forward_bp16_masktype_2_no_attnbias.cpp
index f775af0d6..a99486148 100644
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_forward_bp16_masktype_2_no_attnbias.cpp
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_forward_bp16_masktype_2_no_attnbias.cpp
@@ -1,7 +1,7 @@
 #include <ck/ck.hpp>
 #include "ck_fmha_batched_forward.h"
 
-template struct batched_forward_masktype_attnbias_dispatched<
+template void run_batched_forward_masktype_attnbias_dispatched<
     ck::bhalf_t,
     2,
-    false>;
+    false>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_forward_bp16_masktype_2_with_attnbias.cpp b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_forward_bp16_masktype_2_with_attnbias.cpp
index ad9950d93..23305b07a 100644
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_forward_bp16_masktype_2_with_attnbias.cpp
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_forward_bp16_masktype_2_with_attnbias.cpp
@@ -1,7 +1,7 @@
 #include <ck/ck.hpp>
 #include "ck_fmha_batched_forward.h"
 
-template struct batched_forward_masktype_attnbias_dispatched<
+template void run_batched_forward_masktype_attnbias_dispatched<
     ck::bhalf_t,
     2,
-    true>;
+    true>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_forward_fp16.cpp b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_forward_fp16.cpp
index fe8371bb4..557f6fb8a 100644
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_forward_fp16.cpp
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_forward_fp16.cpp
@@ -1,56 +1,56 @@
 #include <ck/ck.hpp>
 #include <stdexcept>
 
-#include "ck_fmha_batched_forward.h"
 #include "ck_bool_switch.h"
+#include "ck_fmha_batched_forward.h"
 
-extern template struct batched_forward_masktype_attnbias_dispatched<
+extern template void run_batched_forward_masktype_attnbias_dispatched<
     ck::half_t,
     0,
-    true>;
+    true>(BatchedForwardParams& param, hipStream_t stream);
 
-extern template struct batched_forward_masktype_attnbias_dispatched<
+extern template void run_batched_forward_masktype_attnbias_dispatched<
     ck::half_t,
     0,
-    false>;
+    false>(BatchedForwardParams& param, hipStream_t stream);
 
-extern template struct batched_forward_masktype_attnbias_dispatched<
+extern template void run_batched_forward_masktype_attnbias_dispatched<
     ck::half_t,
     1,
-    true>;
+    true>(BatchedForwardParams& param, hipStream_t stream);
 
-extern template struct batched_forward_masktype_attnbias_dispatched<
+extern template void run_batched_forward_masktype_attnbias_dispatched<
     ck::half_t,
     1,
-    false>;
+    false>(BatchedForwardParams& param, hipStream_t stream);
 
-extern template struct batched_forward_masktype_attnbias_dispatched<
+extern template void run_batched_forward_masktype_attnbias_dispatched<
     ck::half_t,
     2,
-    true>;
+    true>(BatchedForwardParams& param, hipStream_t stream);
 
-extern template struct batched_forward_masktype_attnbias_dispatched<
+extern template void run_batched_forward_masktype_attnbias_dispatched<
     ck::half_t,
     2,
-    false>;
+    false>(BatchedForwardParams& param, hipStream_t stream);
 
 void batched_forward_fp16(BatchedForwardParams& param, hipStream_t stream) {
   BOOL_SWITCH_1(param.has_attn_bias, HAS_ATTN_BIAS, [&] {
     if (param.custom_mask_type == 0)
-      batched_forward_masktype_attnbias_dispatched<
+      run_batched_forward_masktype_attnbias_dispatched<
           ck::half_t,
           0,
-          HAS_ATTN_BIAS>::Run(param, stream);
+          HAS_ATTN_BIAS>(param, stream);
     else if (param.custom_mask_type == 1)
-      batched_forward_masktype_attnbias_dispatched<
+      run_batched_forward_masktype_attnbias_dispatched<
           ck::half_t,
           1,
-          HAS_ATTN_BIAS>::Run(param, stream);
+          HAS_ATTN_BIAS>(param, stream);
     else if (param.custom_mask_type == 2)
-      batched_forward_masktype_attnbias_dispatched<
+      run_batched_forward_masktype_attnbias_dispatched<
           ck::half_t,
           2,
-          HAS_ATTN_BIAS>::Run(param, stream);
+          HAS_ATTN_BIAS>(param, stream);
     else
       throw std::runtime_error("Invalid custom_mask_type value");
   });
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_forward_fp16_masktype_0_no_attnbias.cpp b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_forward_fp16_masktype_0_no_attnbias.cpp
index 8af5e20f8..a9dd771de 100644
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_forward_fp16_masktype_0_no_attnbias.cpp
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_forward_fp16_masktype_0_no_attnbias.cpp
@@ -1,7 +1,7 @@
 #include <ck/ck.hpp>
 #include "ck_fmha_batched_forward.h"
 
-template struct batched_forward_masktype_attnbias_dispatched<
+template void run_batched_forward_masktype_attnbias_dispatched<
     ck::half_t,
     0,
-    false>;
+    false>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_forward_fp16_masktype_0_with_attnbias.cpp b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_forward_fp16_masktype_0_with_attnbias.cpp
index 22568941d..f653451ab 100644
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_forward_fp16_masktype_0_with_attnbias.cpp
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_forward_fp16_masktype_0_with_attnbias.cpp
@@ -1,7 +1,7 @@
 #include <ck/ck.hpp>
 #include "ck_fmha_batched_forward.h"
 
-template struct batched_forward_masktype_attnbias_dispatched<
+template void run_batched_forward_masktype_attnbias_dispatched<
     ck::half_t,
     0,
-    true>;
+    true>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_forward_fp16_masktype_1_no_attnbias.cpp b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_forward_fp16_masktype_1_no_attnbias.cpp
index 466dcc9a3..5ca4b7dda 100644
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_forward_fp16_masktype_1_no_attnbias.cpp
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_forward_fp16_masktype_1_no_attnbias.cpp
@@ -1,7 +1,7 @@
 #include <ck/ck.hpp>
 #include "ck_fmha_batched_forward.h"
 
-template struct batched_forward_masktype_attnbias_dispatched<
+template void run_batched_forward_masktype_attnbias_dispatched<
     ck::half_t,
     1,
-    false>;
+    false>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_forward_fp16_masktype_1_with_attnbias.cpp b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_forward_fp16_masktype_1_with_attnbias.cpp
index 7346ec804..f9af4528d 100644
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_forward_fp16_masktype_1_with_attnbias.cpp
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_forward_fp16_masktype_1_with_attnbias.cpp
@@ -1,7 +1,7 @@
 #include <ck/ck.hpp>
 #include "ck_fmha_batched_forward.h"
 
-template struct batched_forward_masktype_attnbias_dispatched<
+template void run_batched_forward_masktype_attnbias_dispatched<
     ck::half_t,
     1,
-    true>;
+    true>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_forward_fp16_masktype_2_no_attnbias.cpp b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_forward_fp16_masktype_2_no_attnbias.cpp
index c7f68924b..44e98d9a3 100644
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_forward_fp16_masktype_2_no_attnbias.cpp
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_forward_fp16_masktype_2_no_attnbias.cpp
@@ -1,7 +1,7 @@
 #include <ck/ck.hpp>
 #include "ck_fmha_batched_forward.h"
 
-template struct batched_forward_masktype_attnbias_dispatched<
+template void run_batched_forward_masktype_attnbias_dispatched<
     ck::half_t,
     2,
-    false>;
+    false>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_forward_fp16_masktype_2_with_attnbias.cpp b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_forward_fp16_masktype_2_with_attnbias.cpp
index d7a5106f8..8dfc288f8 100644
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_forward_fp16_masktype_2_with_attnbias.cpp
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_forward_fp16_masktype_2_with_attnbias.cpp
@@ -1,7 +1,7 @@
 #include <ck/ck.hpp>
 #include "ck_fmha_batched_forward.h"
 
-template struct batched_forward_masktype_attnbias_dispatched<
+template void run_batched_forward_masktype_attnbias_dispatched<
     ck::half_t,
     2,
-    true>;
+    true>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_infer.h b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_infer.h
index adf04e82a..c76a30b73 100644
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_infer.h
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_infer.h
@@ -340,3 +340,13 @@ struct batched_infer_masktype_attnbias_dispatched {
     invoker.Run(arg_ptr.get(), StreamConfig{stream, false});
   };
 };
+
+template <typename scalar_t, int32_t custom_mask_type, bool has_attn_bias>
+void run_batched_infer_masktype_attnbias_dispatched(
+    BatchedForwardParams& param,
+    hipStream_t stream) {
+  batched_infer_masktype_attnbias_dispatched<
+      scalar_t,
+      custom_mask_type,
+      has_attn_bias>::Run(param, stream);
+};
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_infer_bp16.cpp b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_infer_bp16.cpp
index 095487f92..628f7ec84 100644
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_infer_bp16.cpp
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_infer_bp16.cpp
@@ -1,56 +1,56 @@
 #include <ck/ck.hpp>
 #include <stdexcept>
 
-#include "ck_fmha_batched_infer.h"
 #include "ck_bool_switch.h"
+#include "ck_fmha_batched_infer.h"
 
-extern template struct batched_infer_masktype_attnbias_dispatched<
+extern template void run_batched_infer_masktype_attnbias_dispatched<
     ck::bhalf_t,
     0,
-    true>;
+    true>(BatchedForwardParams& param, hipStream_t stream);
 
-extern template struct batched_infer_masktype_attnbias_dispatched<
+extern template void run_batched_infer_masktype_attnbias_dispatched<
     ck::bhalf_t,
     0,
-    false>;
+    false>(BatchedForwardParams& param, hipStream_t stream);
 
-extern template struct batched_infer_masktype_attnbias_dispatched<
+extern template void run_batched_infer_masktype_attnbias_dispatched<
     ck::bhalf_t,
     1,
-    true>;
+    true>(BatchedForwardParams& param, hipStream_t stream);
 
-extern template struct batched_infer_masktype_attnbias_dispatched<
+extern template void run_batched_infer_masktype_attnbias_dispatched<
     ck::bhalf_t,
     1,
-    false>;
+    false>(BatchedForwardParams& param, hipStream_t stream);
 
-extern template struct batched_infer_masktype_attnbias_dispatched<
+extern template void run_batched_infer_masktype_attnbias_dispatched<
     ck::bhalf_t,
     2,
-    true>;
+    true>(BatchedForwardParams& param, hipStream_t stream);
 
-extern template struct batched_infer_masktype_attnbias_dispatched<
+extern template void run_batched_infer_masktype_attnbias_dispatched<
     ck::bhalf_t,
     2,
-    false>;
+    false>(BatchedForwardParams& param, hipStream_t stream);
 
 void batched_infer_bp16(BatchedForwardParams& param, hipStream_t stream) {
   BOOL_SWITCH_1(param.has_attn_bias, HAS_ATTN_BIAS, [&] {
     if (param.custom_mask_type == 0)
-      batched_infer_masktype_attnbias_dispatched<
+      run_batched_infer_masktype_attnbias_dispatched<
           ck::bhalf_t,
           0,
-          HAS_ATTN_BIAS>::Run(param, stream);
+          HAS_ATTN_BIAS>(param, stream);
     else if (param.custom_mask_type == 1)
-      batched_infer_masktype_attnbias_dispatched<
+      run_batched_infer_masktype_attnbias_dispatched<
           ck::bhalf_t,
           1,
-          HAS_ATTN_BIAS>::Run(param, stream);
+          HAS_ATTN_BIAS>(param, stream);
     else if (param.custom_mask_type == 2)
-      batched_infer_masktype_attnbias_dispatched<
+      run_batched_infer_masktype_attnbias_dispatched<
           ck::bhalf_t,
           2,
-          HAS_ATTN_BIAS>::Run(param, stream);
+          HAS_ATTN_BIAS>(param, stream);
     else
       throw std::runtime_error("Invalid custom_mask_type value");
   });
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_infer_bp16_masktype_0_no_attnbias.cpp b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_infer_bp16_masktype_0_no_attnbias.cpp
index 9e1947e67..9748955e1 100644
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_infer_bp16_masktype_0_no_attnbias.cpp
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_infer_bp16_masktype_0_no_attnbias.cpp
@@ -1,9 +1,8 @@
 #include <ck/ck.hpp>
-#include <stdexcept>
 
 #include "ck_fmha_batched_infer.h"
 
-template struct batched_infer_masktype_attnbias_dispatched<
+template void run_batched_infer_masktype_attnbias_dispatched<
     ck::bhalf_t,
     0,
-    false>;
+    false>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_infer_bp16_masktype_0_with_attnbias.cpp b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_infer_bp16_masktype_0_with_attnbias.cpp
index e6c5c49fe..418f925c2 100644
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_infer_bp16_masktype_0_with_attnbias.cpp
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_infer_bp16_masktype_0_with_attnbias.cpp
@@ -1,9 +1,8 @@
 #include <ck/ck.hpp>
-#include <stdexcept>
 
 #include "ck_fmha_batched_infer.h"
 
-template struct batched_infer_masktype_attnbias_dispatched<
+template void run_batched_infer_masktype_attnbias_dispatched<
     ck::bhalf_t,
     0,
-    true>;
+    true>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_infer_bp16_masktype_1_no_attnbias.cpp b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_infer_bp16_masktype_1_no_attnbias.cpp
index 9227f7063..a7cdb48b8 100644
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_infer_bp16_masktype_1_no_attnbias.cpp
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_infer_bp16_masktype_1_no_attnbias.cpp
@@ -1,9 +1,8 @@
 #include <ck/ck.hpp>
-#include <stdexcept>
 
 #include "ck_fmha_batched_infer.h"
 
-template struct batched_infer_masktype_attnbias_dispatched<
+template void run_batched_infer_masktype_attnbias_dispatched<
     ck::bhalf_t,
     1,
-    false>;
+    false>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_infer_bp16_masktype_1_with_attnbias.cpp b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_infer_bp16_masktype_1_with_attnbias.cpp
index fab028901..578855b9b 100644
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_infer_bp16_masktype_1_with_attnbias.cpp
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_infer_bp16_masktype_1_with_attnbias.cpp
@@ -1,9 +1,8 @@
 #include <ck/ck.hpp>
-#include <stdexcept>
 
 #include "ck_fmha_batched_infer.h"
 
-template struct batched_infer_masktype_attnbias_dispatched<
+template void run_batched_infer_masktype_attnbias_dispatched<
     ck::bhalf_t,
     1,
-    true>;
+    true>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_infer_bp16_masktype_2_no_attnbias.cpp b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_infer_bp16_masktype_2_no_attnbias.cpp
index 0d7a88e0e..35e9bca9c 100644
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_infer_bp16_masktype_2_no_attnbias.cpp
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_infer_bp16_masktype_2_no_attnbias.cpp
@@ -1,9 +1,8 @@
 #include <ck/ck.hpp>
-#include <stdexcept>
 
 #include "ck_fmha_batched_infer.h"
 
-template struct batched_infer_masktype_attnbias_dispatched<
+template void run_batched_infer_masktype_attnbias_dispatched<
     ck::bhalf_t,
     2,
-    false>;
+    false>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_infer_bp16_masktype_2_with_attnbias.cpp b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_infer_bp16_masktype_2_with_attnbias.cpp
index 57af33adb..e27e3b5ff 100644
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_infer_bp16_masktype_2_with_attnbias.cpp
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_infer_bp16_masktype_2_with_attnbias.cpp
@@ -1,9 +1,8 @@
 #include <ck/ck.hpp>
-#include <stdexcept>
 
 #include "ck_fmha_batched_infer.h"
 
-template struct batched_infer_masktype_attnbias_dispatched<
+template void run_batched_infer_masktype_attnbias_dispatched<
     ck::bhalf_t,
     2,
-    true>;
+    true>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_infer_fp16.cpp b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_infer_fp16.cpp
index 8e5b01fa0..5e4c861c2 100644
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_infer_fp16.cpp
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_infer_fp16.cpp
@@ -1,50 +1,56 @@
 #include <ck/ck.hpp>
 #include <stdexcept>
 
-#include "ck_fmha_batched_infer.h"
 #include "ck_bool_switch.h"
+#include "ck_fmha_batched_infer.h"
 
-extern template struct batched_infer_masktype_attnbias_dispatched<
+extern template void run_batched_infer_masktype_attnbias_dispatched<
     ck::half_t,
     0,
-    true>;
+    true>(BatchedForwardParams& param, hipStream_t stream);
 
-extern template struct batched_infer_masktype_attnbias_dispatched<
+extern template void run_batched_infer_masktype_attnbias_dispatched<
     ck::half_t,
     0,
-    false>;
+    false>(BatchedForwardParams& param, hipStream_t stream);
 
-extern template struct batched_infer_masktype_attnbias_dispatched<
+extern template void run_batched_infer_masktype_attnbias_dispatched<
     ck::half_t,
     1,
-    true>;
+    true>(BatchedForwardParams& param, hipStream_t stream);
 
-extern template struct batched_infer_masktype_attnbias_dispatched<
+extern template void run_batched_infer_masktype_attnbias_dispatched<
     ck::half_t,
     1,
-    false>;
+    false>(BatchedForwardParams& param, hipStream_t stream);
 
-extern template struct batched_infer_masktype_attnbias_dispatched<
+extern template void run_batched_infer_masktype_attnbias_dispatched<
     ck::half_t,
     2,
-    true>;
+    true>(BatchedForwardParams& param, hipStream_t stream);
 
-extern template struct batched_infer_masktype_attnbias_dispatched<
+extern template void run_batched_infer_masktype_attnbias_dispatched<
     ck::half_t,
     2,
-    false>;
+    false>(BatchedForwardParams& param, hipStream_t stream);
 
 void batched_infer_fp16(BatchedForwardParams& param, hipStream_t stream) {
   BOOL_SWITCH_1(param.has_attn_bias, HAS_ATTN_BIAS, [&] {
     if (param.custom_mask_type == 0)
-      batched_infer_masktype_attnbias_dispatched<ck::half_t, 0, HAS_ATTN_BIAS>::
-          Run(param, stream);
+      run_batched_infer_masktype_attnbias_dispatched<
+          ck::half_t,
+          0,
+          HAS_ATTN_BIAS>(param, stream);
     else if (param.custom_mask_type == 1)
-      batched_infer_masktype_attnbias_dispatched<ck::half_t, 1, HAS_ATTN_BIAS>::
-          Run(param, stream);
+      run_batched_infer_masktype_attnbias_dispatched<
+          ck::half_t,
+          1,
+          HAS_ATTN_BIAS>(param, stream);
     else if (param.custom_mask_type == 2)
-      batched_infer_masktype_attnbias_dispatched<ck::half_t, 2, HAS_ATTN_BIAS>::
-          Run(param, stream);
+      run_batched_infer_masktype_attnbias_dispatched<
+          ck::half_t,
+          2,
+          HAS_ATTN_BIAS>(param, stream);
     else
       throw std::runtime_error("Invalid custom_mask_type value");
   });
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_infer_fp16_masktype_0_no_attnbias.cpp b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_infer_fp16_masktype_0_no_attnbias.cpp
index 838baed94..5c83b0abd 100644
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_infer_fp16_masktype_0_no_attnbias.cpp
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_infer_fp16_masktype_0_no_attnbias.cpp
@@ -1,9 +1,8 @@
 #include <ck/ck.hpp>
-#include <stdexcept>
 
 #include "ck_fmha_batched_infer.h"
 
-template struct batched_infer_masktype_attnbias_dispatched<
+template void run_batched_infer_masktype_attnbias_dispatched<
     ck::half_t,
     0,
-    false>;
+    false>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_infer_fp16_masktype_0_with_attnbias.cpp b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_infer_fp16_masktype_0_with_attnbias.cpp
index 0d5f091c2..11c76b35f 100644
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_infer_fp16_masktype_0_with_attnbias.cpp
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_infer_fp16_masktype_0_with_attnbias.cpp
@@ -1,6 +1,8 @@
 #include <ck/ck.hpp>
-#include <stdexcept>
 
 #include "ck_fmha_batched_infer.h"
 
-template struct batched_infer_masktype_attnbias_dispatched<ck::half_t, 0, true>;
+template void run_batched_infer_masktype_attnbias_dispatched<
+    ck::half_t,
+    0,
+    true>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_infer_fp16_masktype_1_no_attnbias.cpp b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_infer_fp16_masktype_1_no_attnbias.cpp
index 21324abb5..b13f5a4c9 100644
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_infer_fp16_masktype_1_no_attnbias.cpp
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_infer_fp16_masktype_1_no_attnbias.cpp
@@ -1,9 +1,8 @@
 #include <ck/ck.hpp>
-#include <stdexcept>
 
 #include "ck_fmha_batched_infer.h"
 
-template struct batched_infer_masktype_attnbias_dispatched<
+template void run_batched_infer_masktype_attnbias_dispatched<
     ck::half_t,
     1,
-    false>;
+    false>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_infer_fp16_masktype_1_with_attnbias.cpp b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_infer_fp16_masktype_1_with_attnbias.cpp
index 0e8a8c384..12f5991c4 100644
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_infer_fp16_masktype_1_with_attnbias.cpp
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_infer_fp16_masktype_1_with_attnbias.cpp
@@ -1,6 +1,8 @@
 #include <ck/ck.hpp>
-#include <stdexcept>
 
 #include "ck_fmha_batched_infer.h"
 
-template struct batched_infer_masktype_attnbias_dispatched<ck::half_t, 1, true>;
+template void run_batched_infer_masktype_attnbias_dispatched<
+    ck::half_t,
+    1,
+    true>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_infer_fp16_masktype_2_no_attnbias.cpp b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_infer_fp16_masktype_2_no_attnbias.cpp
index 19b4aa0f7..8d45859e5 100644
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_infer_fp16_masktype_2_no_attnbias.cpp
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_infer_fp16_masktype_2_no_attnbias.cpp
@@ -1,9 +1,8 @@
 #include <ck/ck.hpp>
-#include <stdexcept>
 
 #include "ck_fmha_batched_infer.h"
 
-template struct batched_infer_masktype_attnbias_dispatched<
+template void run_batched_infer_masktype_attnbias_dispatched<
     ck::half_t,
     2,
-    false>;
+    false>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_infer_fp16_masktype_2_with_attnbias.cpp b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_infer_fp16_masktype_2_with_attnbias.cpp
index e471b0550..9f03be2b5 100644
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_infer_fp16_masktype_2_with_attnbias.cpp
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_infer_fp16_masktype_2_with_attnbias.cpp
@@ -1,6 +1,8 @@
 #include <ck/ck.hpp>
-#include <stdexcept>
 
 #include "ck_fmha_batched_infer.h"
 
-template struct batched_infer_masktype_attnbias_dispatched<ck::half_t, 2, true>;
+template void run_batched_infer_masktype_attnbias_dispatched<
+    ck::half_t,
+    2,
+    true>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_backward.h b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_backward.h
index b3d5d917f..71674bda7 100644
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_backward.h
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_backward.h
@@ -501,3 +501,18 @@ struct grouped_backward_masktype_attnbias_dispatched {
     (void)invoker.Run(arg_ptr.get(), StreamConfig{stream, false});
   };
 };
+
+template <
+    typename scalar_t,
+    int32_t custom_mask_type,
+    bool has_attn_bias,
+    bool use_fp32_qkv_grad>
+void run_grouped_backward_masktype_attnbias_dispatched(
+    GroupedBackwardParams& param,
+    hipStream_t stream) {
+  grouped_backward_masktype_attnbias_dispatched<
+      scalar_t,
+      custom_mask_type,
+      has_attn_bias,
+      use_fp32_qkv_grad>::Run(param, stream);
+};
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_backward_bp16.cpp b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_backward_bp16.cpp
index 709a4328f..89a73b3d1 100644
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_backward_bp16.cpp
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_backward_bp16.cpp
@@ -1,80 +1,80 @@
 #include <ck/ck.hpp>
 #include <stdexcept>
 
-#include "ck_fmha_grouped_backward.h"
 #include "ck_bool_switch.h"
+#include "ck_fmha_grouped_backward.h"
 
-extern template struct grouped_backward_masktype_attnbias_dispatched<
+extern template void run_grouped_backward_masktype_attnbias_dispatched<
     ck::bhalf_t,
     0,
     true,
-    true>;
+    true>(GroupedBackwardParams& param, hipStream_t stream);
 
-extern template struct grouped_backward_masktype_attnbias_dispatched<
+extern template void run_grouped_backward_masktype_attnbias_dispatched<
     ck::bhalf_t,
     0,
     true,
-    false>;
+    false>(GroupedBackwardParams& param, hipStream_t stream);
 
-extern template struct grouped_backward_masktype_attnbias_dispatched<
+extern template void run_grouped_backward_masktype_attnbias_dispatched<
     ck::bhalf_t,
     0,
     false,
-    true>;
+    true>(GroupedBackwardParams& param, hipStream_t stream);
 
-extern template struct grouped_backward_masktype_attnbias_dispatched<
+extern template void run_grouped_backward_masktype_attnbias_dispatched<
     ck::bhalf_t,
     0,
     false,
-    false>;
+    false>(GroupedBackwardParams& param, hipStream_t stream);
 
-extern template struct grouped_backward_masktype_attnbias_dispatched<
+extern template void run_grouped_backward_masktype_attnbias_dispatched<
     ck::bhalf_t,
     1,
     true,
-    true>;
+    true>(GroupedBackwardParams& param, hipStream_t stream);
 
-extern template struct grouped_backward_masktype_attnbias_dispatched<
+extern template void run_grouped_backward_masktype_attnbias_dispatched<
     ck::bhalf_t,
     1,
     true,
-    false>;
+    false>(GroupedBackwardParams& param, hipStream_t stream);
 
-extern template struct grouped_backward_masktype_attnbias_dispatched<
+extern template void run_grouped_backward_masktype_attnbias_dispatched<
     ck::bhalf_t,
     1,
     false,
-    true>;
+    true>(GroupedBackwardParams& param, hipStream_t stream);
 
-extern template struct grouped_backward_masktype_attnbias_dispatched<
+extern template void run_grouped_backward_masktype_attnbias_dispatched<
     ck::bhalf_t,
     1,
     false,
-    false>;
+    false>(GroupedBackwardParams& param, hipStream_t stream);
 
-extern template struct grouped_backward_masktype_attnbias_dispatched<
+extern template void run_grouped_backward_masktype_attnbias_dispatched<
     ck::bhalf_t,
     2,
     true,
-    true>;
+    true>(GroupedBackwardParams& param, hipStream_t stream);
 
-extern template struct grouped_backward_masktype_attnbias_dispatched<
+extern template void run_grouped_backward_masktype_attnbias_dispatched<
     ck::bhalf_t,
     2,
     true,
-    false>;
+    false>(GroupedBackwardParams& param, hipStream_t stream);
 
-extern template struct grouped_backward_masktype_attnbias_dispatched<
+extern template void run_grouped_backward_masktype_attnbias_dispatched<
     ck::bhalf_t,
     2,
     false,
-    true>;
+    true>(GroupedBackwardParams& param, hipStream_t stream);
 
-extern template struct grouped_backward_masktype_attnbias_dispatched<
+extern template void run_grouped_backward_masktype_attnbias_dispatched<
     ck::bhalf_t,
     2,
     false,
-    false>;
+    false>(GroupedBackwardParams& param, hipStream_t stream);
 
 void grouped_backward_bp16(GroupedBackwardParams& param, hipStream_t stream) {
   BOOL_SWITCH_2(
@@ -84,23 +84,23 @@ void grouped_backward_bp16(GroupedBackwardParams& param, hipStream_t stream) {
       USE_FP32_QKV_GRAD,
       [&] {
         if (param.custom_mask_type == 0) {
-          grouped_backward_masktype_attnbias_dispatched<
+          run_grouped_backward_masktype_attnbias_dispatched<
               ck::bhalf_t,
               0,
               HAS_ATTN_BIAS,
-              USE_FP32_QKV_GRAD>::Run(param, stream);
+              USE_FP32_QKV_GRAD>(param, stream);
         } else if (param.custom_mask_type == 1) {
-          grouped_backward_masktype_attnbias_dispatched<
+          run_grouped_backward_masktype_attnbias_dispatched<
               ck::bhalf_t,
               1,
               HAS_ATTN_BIAS,
-              USE_FP32_QKV_GRAD>::Run(param, stream);
+              USE_FP32_QKV_GRAD>(param, stream);
         } else if (param.custom_mask_type == 2) {
-          grouped_backward_masktype_attnbias_dispatched<
+          run_grouped_backward_masktype_attnbias_dispatched<
               ck::bhalf_t,
               2,
               HAS_ATTN_BIAS,
-              USE_FP32_QKV_GRAD>::Run(param, stream);
+              USE_FP32_QKV_GRAD>(param, stream);
         } else
           throw std::runtime_error("Invalid custom_mask_type value");
       });
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_backward_bp16_masktype_0_no_attnbias.cpp b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_backward_bp16_masktype_0_no_attnbias.cpp
index 558cd3d68..1b261e938 100644
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_backward_bp16_masktype_0_no_attnbias.cpp
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_backward_bp16_masktype_0_no_attnbias.cpp
@@ -1,14 +1,14 @@
 #include <ck/ck.hpp>
 #include "ck_fmha_grouped_backward.h"
 
-template struct grouped_backward_masktype_attnbias_dispatched<
+template void run_grouped_backward_masktype_attnbias_dispatched<
     ck::bhalf_t,
     0,
     false,
-    true>;
+    true>(GroupedBackwardParams& param, hipStream_t stream);
 
-template struct grouped_backward_masktype_attnbias_dispatched<
+template void run_grouped_backward_masktype_attnbias_dispatched<
     ck::bhalf_t,
     0,
     false,
-    false>;
+    false>(GroupedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_backward_bp16_masktype_0_with_attnbias.cpp b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_backward_bp16_masktype_0_with_attnbias.cpp
index 52e36a445..8cb42c808 100644
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_backward_bp16_masktype_0_with_attnbias.cpp
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_backward_bp16_masktype_0_with_attnbias.cpp
@@ -1,14 +1,14 @@
 #include <ck/ck.hpp>
 #include "ck_fmha_grouped_backward.h"
 
-template struct grouped_backward_masktype_attnbias_dispatched<
+template void run_grouped_backward_masktype_attnbias_dispatched<
     ck::bhalf_t,
     0,
     true,
-    true>;
+    true>(GroupedBackwardParams& param, hipStream_t stream);
 
-template struct grouped_backward_masktype_attnbias_dispatched<
+template void run_grouped_backward_masktype_attnbias_dispatched<
     ck::bhalf_t,
     0,
     true,
-    false>;
+    false>(GroupedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_backward_bp16_masktype_1_no_attnbias.cpp b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_backward_bp16_masktype_1_no_attnbias.cpp
index 47e5e97e5..ebefe8bab 100644
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_backward_bp16_masktype_1_no_attnbias.cpp
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_backward_bp16_masktype_1_no_attnbias.cpp
@@ -1,14 +1,14 @@
 #include <ck/ck.hpp>
 #include "ck_fmha_grouped_backward.h"
 
-template struct grouped_backward_masktype_attnbias_dispatched<
+template void run_grouped_backward_masktype_attnbias_dispatched<
     ck::bhalf_t,
     1,
     false,
-    true>;
+    true>(GroupedBackwardParams& param, hipStream_t stream);
 
-template struct grouped_backward_masktype_attnbias_dispatched<
+template void run_grouped_backward_masktype_attnbias_dispatched<
     ck::bhalf_t,
     1,
     false,
-    false>;
+    false>(GroupedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_backward_bp16_masktype_1_with_attnbias.cpp b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_backward_bp16_masktype_1_with_attnbias.cpp
index 542226d72..1d7de293e 100644
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_backward_bp16_masktype_1_with_attnbias.cpp
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_backward_bp16_masktype_1_with_attnbias.cpp
@@ -1,14 +1,14 @@
 #include <ck/ck.hpp>
 #include "ck_fmha_grouped_backward.h"
 
-template struct grouped_backward_masktype_attnbias_dispatched<
+template void run_grouped_backward_masktype_attnbias_dispatched<
     ck::bhalf_t,
     1,
     true,
-    true>;
+    true>(GroupedBackwardParams& param, hipStream_t stream);
 
-template struct grouped_backward_masktype_attnbias_dispatched<
+template void run_grouped_backward_masktype_attnbias_dispatched<
     ck::bhalf_t,
     1,
     true,
-    false>;
+    false>(GroupedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_backward_bp16_masktype_2_no_attnbias.cpp b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_backward_bp16_masktype_2_no_attnbias.cpp
index 833c49504..524fb30e5 100644
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_backward_bp16_masktype_2_no_attnbias.cpp
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_backward_bp16_masktype_2_no_attnbias.cpp
@@ -1,14 +1,14 @@
 #include <ck/ck.hpp>
 #include "ck_fmha_grouped_backward.h"
 
-template struct grouped_backward_masktype_attnbias_dispatched<
+template void run_grouped_backward_masktype_attnbias_dispatched<
     ck::bhalf_t,
     2,
     false,
-    true>;
+    true>(GroupedBackwardParams& param, hipStream_t stream);
 
-template struct grouped_backward_masktype_attnbias_dispatched<
+template void run_grouped_backward_masktype_attnbias_dispatched<
     ck::bhalf_t,
     2,
     false,
-    false>;
+    false>(GroupedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_backward_bp16_masktype_2_with_attnbias.cpp b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_backward_bp16_masktype_2_with_attnbias.cpp
index 6772bbac7..58f2f8b1a 100644
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_backward_bp16_masktype_2_with_attnbias.cpp
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_backward_bp16_masktype_2_with_attnbias.cpp
@@ -1,14 +1,14 @@
 #include <ck/ck.hpp>
 #include "ck_fmha_grouped_backward.h"
 
-template struct grouped_backward_masktype_attnbias_dispatched<
+template void run_grouped_backward_masktype_attnbias_dispatched<
     ck::bhalf_t,
     2,
     true,
-    true>;
+    true>(GroupedBackwardParams& param, hipStream_t stream);
 
-template struct grouped_backward_masktype_attnbias_dispatched<
+template void run_grouped_backward_masktype_attnbias_dispatched<
     ck::bhalf_t,
     2,
     true,
-    false>;
+    false>(GroupedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_backward_fp16.cpp b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_backward_fp16.cpp
index 2885df9b5..c0e35f63d 100644
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_backward_fp16.cpp
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_backward_fp16.cpp
@@ -1,80 +1,80 @@
 #include <ck/ck.hpp>
 #include <stdexcept>
 
-#include "ck_fmha_grouped_backward.h"
 #include "ck_bool_switch.h"
+#include "ck_fmha_grouped_backward.h"
 
-extern template struct grouped_backward_masktype_attnbias_dispatched<
+extern template void run_grouped_backward_masktype_attnbias_dispatched<
     ck::half_t,
     0,
     true,
-    true>;
+    true>(GroupedBackwardParams& param, hipStream_t stream);
 
-extern template struct grouped_backward_masktype_attnbias_dispatched<
+extern template void run_grouped_backward_masktype_attnbias_dispatched<
     ck::half_t,
     0,
     true,
-    false>;
+    false>(GroupedBackwardParams& param, hipStream_t stream);
 
-extern template struct grouped_backward_masktype_attnbias_dispatched<
+extern template void run_grouped_backward_masktype_attnbias_dispatched<
     ck::half_t,
     0,
     false,
-    true>;
+    true>(GroupedBackwardParams& param, hipStream_t stream);
 
-extern template struct grouped_backward_masktype_attnbias_dispatched<
+extern template void run_grouped_backward_masktype_attnbias_dispatched<
     ck::half_t,
     0,
     false,
-    false>;
+    false>(GroupedBackwardParams& param, hipStream_t stream);
 
-extern template struct grouped_backward_masktype_attnbias_dispatched<
+extern template void run_grouped_backward_masktype_attnbias_dispatched<
     ck::half_t,
     1,
     true,
-    true>;
+    true>(GroupedBackwardParams& param, hipStream_t stream);
 
-extern template struct grouped_backward_masktype_attnbias_dispatched<
+extern template void run_grouped_backward_masktype_attnbias_dispatched<
     ck::half_t,
     1,
     true,
-    false>;
+    false>(GroupedBackwardParams& param, hipStream_t stream);
 
-extern template struct grouped_backward_masktype_attnbias_dispatched<
+extern template void run_grouped_backward_masktype_attnbias_dispatched<
     ck::half_t,
     1,
     false,
-    true>;
+    true>(GroupedBackwardParams& param, hipStream_t stream);
 
-extern template struct grouped_backward_masktype_attnbias_dispatched<
+extern template void run_grouped_backward_masktype_attnbias_dispatched<
     ck::half_t,
     1,
     false,
-    false>;
+    false>(GroupedBackwardParams& param, hipStream_t stream);
 
-extern template struct grouped_backward_masktype_attnbias_dispatched<
+extern template void run_grouped_backward_masktype_attnbias_dispatched<
     ck::half_t,
     2,
     true,
-    true>;
+    true>(GroupedBackwardParams& param, hipStream_t stream);
 
-extern template struct grouped_backward_masktype_attnbias_dispatched<
+extern template void run_grouped_backward_masktype_attnbias_dispatched<
     ck::half_t,
     2,
     true,
-    false>;
+    false>(GroupedBackwardParams& param, hipStream_t stream);
 
-extern template struct grouped_backward_masktype_attnbias_dispatched<
+extern template void run_grouped_backward_masktype_attnbias_dispatched<
     ck::half_t,
     2,
     false,
-    true>;
+    true>(GroupedBackwardParams& param, hipStream_t stream);
 
-extern template struct grouped_backward_masktype_attnbias_dispatched<
+extern template void run_grouped_backward_masktype_attnbias_dispatched<
     ck::half_t,
     2,
     false,
-    false>;
+    false>(GroupedBackwardParams& param, hipStream_t stream);
 
 void grouped_backward_fp16(GroupedBackwardParams& param, hipStream_t stream) {
   BOOL_SWITCH_2(
@@ -84,23 +84,23 @@ void grouped_backward_fp16(GroupedBackwardParams& param, hipStream_t stream) {
       USE_FP32_QKV_GRAD,
       [&] {
         if (param.custom_mask_type == 0) {
-          grouped_backward_masktype_attnbias_dispatched<
+          run_grouped_backward_masktype_attnbias_dispatched<
               ck::half_t,
               0,
               HAS_ATTN_BIAS,
-              USE_FP32_QKV_GRAD>::Run(param, stream);
+              USE_FP32_QKV_GRAD>(param, stream);
         } else if (param.custom_mask_type == 1) {
-          grouped_backward_masktype_attnbias_dispatched<
+          run_grouped_backward_masktype_attnbias_dispatched<
               ck::half_t,
               1,
               HAS_ATTN_BIAS,
-              USE_FP32_QKV_GRAD>::Run(param, stream);
+              USE_FP32_QKV_GRAD>(param, stream);
         } else if (param.custom_mask_type == 2) {
-          grouped_backward_masktype_attnbias_dispatched<
+          run_grouped_backward_masktype_attnbias_dispatched<
               ck::half_t,
               2,
               HAS_ATTN_BIAS,
-              USE_FP32_QKV_GRAD>::Run(param, stream);
+              USE_FP32_QKV_GRAD>(param, stream);
         } else
           throw std::runtime_error("Invalid custom_mask_type value");
       });
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_backward_fp16_masktype_0_no_attnbias.cpp b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_backward_fp16_masktype_0_no_attnbias.cpp
index 85d0fbfd7..1098e69be 100644
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_backward_fp16_masktype_0_no_attnbias.cpp
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_backward_fp16_masktype_0_no_attnbias.cpp
@@ -1,14 +1,14 @@
 #include <ck/ck.hpp>
 #include "ck_fmha_grouped_backward.h"
 
-template struct grouped_backward_masktype_attnbias_dispatched<
+template void run_grouped_backward_masktype_attnbias_dispatched<
     ck::half_t,
     0,
     false,
-    true>;
+    true>(GroupedBackwardParams& param, hipStream_t stream);
 
-template struct grouped_backward_masktype_attnbias_dispatched<
+template void run_grouped_backward_masktype_attnbias_dispatched<
     ck::half_t,
     0,
     false,
-    false>;
+    false>(GroupedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_backward_fp16_masktype_0_with_attnbias.cpp b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_backward_fp16_masktype_0_with_attnbias.cpp
index 69a3839e7..60583a859 100644
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_backward_fp16_masktype_0_with_attnbias.cpp
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_backward_fp16_masktype_0_with_attnbias.cpp
@@ -1,14 +1,14 @@
 #include <ck/ck.hpp>
 #include "ck_fmha_grouped_backward.h"
 
-template struct grouped_backward_masktype_attnbias_dispatched<
+template void run_grouped_backward_masktype_attnbias_dispatched<
     ck::half_t,
     0,
     true,
-    true>;
+    true>(GroupedBackwardParams& param, hipStream_t stream);
 
-template struct grouped_backward_masktype_attnbias_dispatched<
+template void run_grouped_backward_masktype_attnbias_dispatched<
     ck::half_t,
     0,
     true,
-    false>;
+    false>(GroupedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_backward_fp16_masktype_1_no_attnbias.cpp b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_backward_fp16_masktype_1_no_attnbias.cpp
index 7e826ab00..b8aabeb86 100644
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_backward_fp16_masktype_1_no_attnbias.cpp
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_backward_fp16_masktype_1_no_attnbias.cpp
@@ -1,14 +1,14 @@
 #include <ck/ck.hpp>
 #include "ck_fmha_grouped_backward.h"
 
-template struct grouped_backward_masktype_attnbias_dispatched<
+template void run_grouped_backward_masktype_attnbias_dispatched<
     ck::half_t,
     1,
     false,
-    true>;
+    true>(GroupedBackwardParams& param, hipStream_t stream);
 
-template struct grouped_backward_masktype_attnbias_dispatched<
+template void run_grouped_backward_masktype_attnbias_dispatched<
     ck::half_t,
     1,
     false,
-    false>;
+    false>(GroupedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_backward_fp16_masktype_1_with_attnbias.cpp b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_backward_fp16_masktype_1_with_attnbias.cpp
index 1235af9a6..8629a947a 100644
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_backward_fp16_masktype_1_with_attnbias.cpp
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_backward_fp16_masktype_1_with_attnbias.cpp
@@ -1,14 +1,14 @@
 #include <ck/ck.hpp>
 #include "ck_fmha_grouped_backward.h"
 
-template struct grouped_backward_masktype_attnbias_dispatched<
+template void run_grouped_backward_masktype_attnbias_dispatched<
     ck::half_t,
     1,
     true,
-    true>;
+    true>(GroupedBackwardParams& param, hipStream_t stream);
 
-template struct grouped_backward_masktype_attnbias_dispatched<
+template void run_grouped_backward_masktype_attnbias_dispatched<
     ck::half_t,
     1,
     true,
-    false>;
+    false>(GroupedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_backward_fp16_masktype_2_no_attnbias.cpp b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_backward_fp16_masktype_2_no_attnbias.cpp
index 1cec428a6..00b0f5c32 100644
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_backward_fp16_masktype_2_no_attnbias.cpp
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_backward_fp16_masktype_2_no_attnbias.cpp
@@ -1,14 +1,14 @@
 #include <ck/ck.hpp>
 #include "ck_fmha_grouped_backward.h"
 
-template struct grouped_backward_masktype_attnbias_dispatched<
+template void run_grouped_backward_masktype_attnbias_dispatched<
     ck::half_t,
     2,
     false,
-    true>;
+    true>(GroupedBackwardParams& param, hipStream_t stream);
 
-template struct grouped_backward_masktype_attnbias_dispatched<
+template void run_grouped_backward_masktype_attnbias_dispatched<
     ck::half_t,
     2,
     false,
-    false>;
+    false>(GroupedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_backward_fp16_masktype_2_with_attnbias.cpp b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_backward_fp16_masktype_2_with_attnbias.cpp
index c01bea26b..8b6112aba 100644
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_backward_fp16_masktype_2_with_attnbias.cpp
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_backward_fp16_masktype_2_with_attnbias.cpp
@@ -1,14 +1,14 @@
 #include <ck/ck.hpp>
 #include "ck_fmha_grouped_backward.h"
 
-template struct grouped_backward_masktype_attnbias_dispatched<
+template void run_grouped_backward_masktype_attnbias_dispatched<
     ck::half_t,
     2,
     true,
-    true>;
+    true>(GroupedBackwardParams& param, hipStream_t stream);
 
-template struct grouped_backward_masktype_attnbias_dispatched<
+template void run_grouped_backward_masktype_attnbias_dispatched<
     ck::half_t,
     2,
     true,
-    false>;
+    false>(GroupedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_forward.h b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_forward.h
index 3e388414b..9eebcfa14 100644
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_forward.h
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_forward.h
@@ -357,3 +357,13 @@ struct grouped_forward_masktype_attnbias_dispatched {
     (void)invoker.Run(arg_ptr.get(), StreamConfig{stream, false});
   };
 };
+
+template <typename scalar_t, int32_t custom_mask_type, bool has_attn_bias>
+void run_grouped_forward_masktype_attnbias_dispatched(
+    GroupedForwardParams& param,
+    hipStream_t stream) {
+  grouped_forward_masktype_attnbias_dispatched<
+      scalar_t,
+      custom_mask_type,
+      has_attn_bias>::Run(param, stream);
+};
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_forward_bp16.cpp b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_forward_bp16.cpp
index b4b10a60a..030158809 100644
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_forward_bp16.cpp
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_forward_bp16.cpp
@@ -1,56 +1,56 @@
 #include <ck/ck.hpp>
 #include <stdexcept>
 
-#include "ck_fmha_grouped_forward.h"
 #include "ck_bool_switch.h"
+#include "ck_fmha_grouped_forward.h"
 
-extern template struct grouped_forward_masktype_attnbias_dispatched<
+extern template void run_grouped_forward_masktype_attnbias_dispatched<
     ck::bhalf_t,
     0,
-    true>;
+    true>(GroupedForwardParams& param, hipStream_t stream);
 
-extern template struct grouped_forward_masktype_attnbias_dispatched<
+extern template void run_grouped_forward_masktype_attnbias_dispatched<
     ck::bhalf_t,
     0,
-    false>;
+    false>(GroupedForwardParams& param, hipStream_t stream);
 
-extern template struct grouped_forward_masktype_attnbias_dispatched<
+extern template void run_grouped_forward_masktype_attnbias_dispatched<
     ck::bhalf_t,
     1,
-    true>;
+    true>(GroupedForwardParams& param, hipStream_t stream);
 
-extern template struct grouped_forward_masktype_attnbias_dispatched<
+extern template void run_grouped_forward_masktype_attnbias_dispatched<
     ck::bhalf_t,
     1,
-    false>;
+    false>(GroupedForwardParams& param, hipStream_t stream);
 
-extern template struct grouped_forward_masktype_attnbias_dispatched<
+extern template void run_grouped_forward_masktype_attnbias_dispatched<
     ck::bhalf_t,
     2,
-    true>;
+    true>(GroupedForwardParams& param, hipStream_t stream);
 
-extern template struct grouped_forward_masktype_attnbias_dispatched<
+extern template void run_grouped_forward_masktype_attnbias_dispatched<
     ck::bhalf_t,
     2,
-    false>;
+    false>(GroupedForwardParams& param, hipStream_t stream);
 
 void grouped_forward_bp16(GroupedForwardParams& param, hipStream_t stream) {
   BOOL_SWITCH_1(param.has_attn_bias, HAS_ATTN_BIAS, [&] {
     if (param.custom_mask_type == 0)
-      grouped_forward_masktype_attnbias_dispatched<
+      run_grouped_forward_masktype_attnbias_dispatched<
           ck::bhalf_t,
           0,
-          HAS_ATTN_BIAS>::Run(param, stream);
+          HAS_ATTN_BIAS>(param, stream);
     else if (param.custom_mask_type == 1)
-      grouped_forward_masktype_attnbias_dispatched<
+      run_grouped_forward_masktype_attnbias_dispatched<
           ck::bhalf_t,
           1,
-          HAS_ATTN_BIAS>::Run(param, stream);
+          HAS_ATTN_BIAS>(param, stream);
     else if (param.custom_mask_type == 2)
-      grouped_forward_masktype_attnbias_dispatched<
+      run_grouped_forward_masktype_attnbias_dispatched<
           ck::bhalf_t,
           2,
-          HAS_ATTN_BIAS>::Run(param, stream);
+          HAS_ATTN_BIAS>(param, stream);
     else
       throw std::runtime_error("Invalid custom_mask_type value");
   });
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_forward_bp16_masktype_0_no_attnbias.cpp b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_forward_bp16_masktype_0_no_attnbias.cpp
index 8083cb25c..bfde13c7d 100644
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_forward_bp16_masktype_0_no_attnbias.cpp
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_forward_bp16_masktype_0_no_attnbias.cpp
@@ -1,7 +1,7 @@
 #include <ck/ck.hpp>
 #include "ck_fmha_grouped_forward.h"
 
-template struct grouped_forward_masktype_attnbias_dispatched<
+template void run_grouped_forward_masktype_attnbias_dispatched<
     ck::bhalf_t,
     0,
-    false>;
+    false>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_forward_bp16_masktype_0_with_attnbias.cpp b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_forward_bp16_masktype_0_with_attnbias.cpp
index a0d3681f1..85e853c36 100644
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_forward_bp16_masktype_0_with_attnbias.cpp
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_forward_bp16_masktype_0_with_attnbias.cpp
@@ -1,7 +1,7 @@
 #include <ck/ck.hpp>
 #include "ck_fmha_grouped_forward.h"
 
-template struct grouped_forward_masktype_attnbias_dispatched<
+template void run_grouped_forward_masktype_attnbias_dispatched<
     ck::bhalf_t,
     0,
-    true>;
+    true>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_forward_bp16_masktype_1_no_attnbias.cpp b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_forward_bp16_masktype_1_no_attnbias.cpp
index f877be39f..d86afa1aa 100644
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_forward_bp16_masktype_1_no_attnbias.cpp
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_forward_bp16_masktype_1_no_attnbias.cpp
@@ -1,7 +1,7 @@
 #include <ck/ck.hpp>
 #include "ck_fmha_grouped_forward.h"
 
-template struct grouped_forward_masktype_attnbias_dispatched<
+template void run_grouped_forward_masktype_attnbias_dispatched<
     ck::bhalf_t,
     1,
-    false>;
+    false>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_forward_bp16_masktype_1_with_attnbias.cpp b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_forward_bp16_masktype_1_with_attnbias.cpp
index aca8091ab..dd58b5b28 100644
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_forward_bp16_masktype_1_with_attnbias.cpp
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_forward_bp16_masktype_1_with_attnbias.cpp
@@ -1,7 +1,7 @@
 #include <ck/ck.hpp>
 #include "ck_fmha_grouped_forward.h"
 
-template struct grouped_forward_masktype_attnbias_dispatched<
+template void run_grouped_forward_masktype_attnbias_dispatched<
     ck::bhalf_t,
     1,
-    true>;
+    true>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_forward_bp16_masktype_2_no_attnbias.cpp b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_forward_bp16_masktype_2_no_attnbias.cpp
index f9ade6d61..085245c08 100644
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_forward_bp16_masktype_2_no_attnbias.cpp
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_forward_bp16_masktype_2_no_attnbias.cpp
@@ -1,7 +1,7 @@
 #include <ck/ck.hpp>
 #include "ck_fmha_grouped_forward.h"
 
-template struct grouped_forward_masktype_attnbias_dispatched<
+template void run_grouped_forward_masktype_attnbias_dispatched<
     ck::bhalf_t,
     2,
-    false>;
+    false>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_forward_bp16_masktype_2_with_attnbias.cpp b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_forward_bp16_masktype_2_with_attnbias.cpp
index 0014a5e69..8c3ea29a4 100644
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_forward_bp16_masktype_2_with_attnbias.cpp
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_forward_bp16_masktype_2_with_attnbias.cpp
@@ -1,7 +1,7 @@
 #include <ck/ck.hpp>
 #include "ck_fmha_grouped_forward.h"
 
-template struct grouped_forward_masktype_attnbias_dispatched<
+template void run_grouped_forward_masktype_attnbias_dispatched<
     ck::bhalf_t,
     2,
-    true>;
+    true>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_forward_fp16.cpp b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_forward_fp16.cpp
index 7c7ef74ad..5338eab35 100644
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_forward_fp16.cpp
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_forward_fp16.cpp
@@ -1,56 +1,56 @@
 #include <ck/ck.hpp>
 #include <stdexcept>
 
-#include "ck_fmha_grouped_forward.h"
 #include "ck_bool_switch.h"
+#include "ck_fmha_grouped_forward.h"
 
-extern template struct grouped_forward_masktype_attnbias_dispatched<
+extern template void run_grouped_forward_masktype_attnbias_dispatched<
     ck::half_t,
     0,
-    true>;
+    true>(GroupedForwardParams& param, hipStream_t stream);
 
-extern template struct grouped_forward_masktype_attnbias_dispatched<
+extern template void run_grouped_forward_masktype_attnbias_dispatched<
     ck::half_t,
     0,
-    false>;
+    false>(GroupedForwardParams& param, hipStream_t stream);
 
-extern template struct grouped_forward_masktype_attnbias_dispatched<
+extern template void run_grouped_forward_masktype_attnbias_dispatched<
     ck::half_t,
     1,
-    true>;
+    true>(GroupedForwardParams& param, hipStream_t stream);
 
-extern template struct grouped_forward_masktype_attnbias_dispatched<
+extern template void run_grouped_forward_masktype_attnbias_dispatched<
     ck::half_t,
     1,
-    false>;
+    false>(GroupedForwardParams& param, hipStream_t stream);
 
-extern template struct grouped_forward_masktype_attnbias_dispatched<
+extern template void run_grouped_forward_masktype_attnbias_dispatched<
     ck::half_t,
     2,
-    true>;
+    true>(GroupedForwardParams& param, hipStream_t stream);
 
-extern template struct grouped_forward_masktype_attnbias_dispatched<
+extern template void run_grouped_forward_masktype_attnbias_dispatched<
     ck::half_t,
     2,
-    false>;
+    false>(GroupedForwardParams& param, hipStream_t stream);
 
 void grouped_forward_fp16(GroupedForwardParams& param, hipStream_t stream) {
   BOOL_SWITCH_1(param.has_attn_bias, HAS_ATTN_BIAS, [&] {
     if (param.custom_mask_type == 0)
-      grouped_forward_masktype_attnbias_dispatched<
+      run_grouped_forward_masktype_attnbias_dispatched<
           ck::half_t,
           0,
-          HAS_ATTN_BIAS>::Run(param, stream);
+          HAS_ATTN_BIAS>(param, stream);
     else if (param.custom_mask_type == 1)
-      grouped_forward_masktype_attnbias_dispatched<
+      run_grouped_forward_masktype_attnbias_dispatched<
           ck::half_t,
           1,
-          HAS_ATTN_BIAS>::Run(param, stream);
+          HAS_ATTN_BIAS>(param, stream);
     else if (param.custom_mask_type == 2)
-      grouped_forward_masktype_attnbias_dispatched<
+      run_grouped_forward_masktype_attnbias_dispatched<
           ck::half_t,
           2,
-          HAS_ATTN_BIAS>::Run(param, stream);
+          HAS_ATTN_BIAS>(param, stream);
     else
       throw std::runtime_error("Invalid custom_mask_type value");
   });
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_forward_fp16_masktype_0_no_attnbias.cpp b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_forward_fp16_masktype_0_no_attnbias.cpp
index 3d62db850..19adc3971 100644
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_forward_fp16_masktype_0_no_attnbias.cpp
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_forward_fp16_masktype_0_no_attnbias.cpp
@@ -1,7 +1,7 @@
 #include <ck/ck.hpp>
 #include "ck_fmha_grouped_forward.h"
 
-template struct grouped_forward_masktype_attnbias_dispatched<
+template void run_grouped_forward_masktype_attnbias_dispatched<
     ck::half_t,
     0,
-    false>;
+    false>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_forward_fp16_masktype_0_with_attnbias.cpp b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_forward_fp16_masktype_0_with_attnbias.cpp
index 1b80b483c..6da5508d3 100644
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_forward_fp16_masktype_0_with_attnbias.cpp
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_forward_fp16_masktype_0_with_attnbias.cpp
@@ -1,7 +1,7 @@
 #include <ck/ck.hpp>
 #include "ck_fmha_grouped_forward.h"
 
-template struct grouped_forward_masktype_attnbias_dispatched<
+template void run_grouped_forward_masktype_attnbias_dispatched<
     ck::half_t,
     0,
-    true>;
+    true>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_forward_fp16_masktype_1_no_attnbias.cpp b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_forward_fp16_masktype_1_no_attnbias.cpp
index 26d5bccd1..f97de6fb3 100644
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_forward_fp16_masktype_1_no_attnbias.cpp
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_forward_fp16_masktype_1_no_attnbias.cpp
@@ -1,7 +1,7 @@
 #include <ck/ck.hpp>
 #include "ck_fmha_grouped_forward.h"
 
-template struct grouped_forward_masktype_attnbias_dispatched<
+template void run_grouped_forward_masktype_attnbias_dispatched<
     ck::half_t,
     1,
-    false>;
+    false>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_forward_fp16_masktype_1_with_attnbias.cpp b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_forward_fp16_masktype_1_with_attnbias.cpp
index 3eae0ae71..5bd33901b 100644
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_forward_fp16_masktype_1_with_attnbias.cpp
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_forward_fp16_masktype_1_with_attnbias.cpp
@@ -1,7 +1,7 @@
 #include <ck/ck.hpp>
 #include "ck_fmha_grouped_forward.h"
 
-template struct grouped_forward_masktype_attnbias_dispatched<
+template void run_grouped_forward_masktype_attnbias_dispatched<
     ck::half_t,
     1,
-    true>;
+    true>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_forward_fp16_masktype_2_no_attnbias.cpp b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_forward_fp16_masktype_2_no_attnbias.cpp
index 9bba3eeca..155c9eb6c 100644
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_forward_fp16_masktype_2_no_attnbias.cpp
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_forward_fp16_masktype_2_no_attnbias.cpp
@@ -1,7 +1,7 @@
 #include <ck/ck.hpp>
 #include "ck_fmha_grouped_forward.h"
 
-template struct grouped_forward_masktype_attnbias_dispatched<
+template void run_grouped_forward_masktype_attnbias_dispatched<
     ck::half_t,
     2,
-    false>;
+    false>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_forward_fp16_masktype_2_with_attnbias.cpp b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_forward_fp16_masktype_2_with_attnbias.cpp
index 2d5152e87..29f3ed1a3 100644
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_forward_fp16_masktype_2_with_attnbias.cpp
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_forward_fp16_masktype_2_with_attnbias.cpp
@@ -1,7 +1,7 @@
 #include <ck/ck.hpp>
 #include "ck_fmha_grouped_forward.h"
 
-template struct grouped_forward_masktype_attnbias_dispatched<
+template void run_grouped_forward_masktype_attnbias_dispatched<
     ck::half_t,
     2,
-    true>;
+    true>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_infer.h b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_infer.h
index 1b907d370..31a90d200 100644
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_infer.h
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_infer.h
@@ -341,3 +341,13 @@ struct grouped_infer_masktype_attnbias_dispatched {
     (void)invoker.Run(arg_ptr.get(), StreamConfig{stream, false});
   };
 };
+
+template <typename scalar_t, int32_t custom_mask_type, bool has_attn_bias>
+void run_grouped_infer_masktype_attnbias_dispatched(
+    GroupedForwardParams& param,
+    hipStream_t stream) {
+  grouped_infer_masktype_attnbias_dispatched<
+      scalar_t,
+      custom_mask_type,
+      has_attn_bias>::Run(param, stream);
+};
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_infer_bp16.cpp b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_infer_bp16.cpp
index 4310d4f39..56c974264 100644
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_infer_bp16.cpp
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_infer_bp16.cpp
@@ -1,56 +1,56 @@
 #include <ck/ck.hpp>
 #include <stdexcept>
 
-#include "ck_fmha_grouped_infer.h"
 #include "ck_bool_switch.h"
+#include "ck_fmha_grouped_infer.h"
 
-extern template struct grouped_infer_masktype_attnbias_dispatched<
+extern template void run_grouped_infer_masktype_attnbias_dispatched<
     ck::bhalf_t,
     0,
-    true>;
+    true>(GroupedForwardParams& param, hipStream_t stream);
 
-extern template struct grouped_infer_masktype_attnbias_dispatched<
+extern template void run_grouped_infer_masktype_attnbias_dispatched<
     ck::bhalf_t,
     0,
-    false>;
+    false>(GroupedForwardParams& param, hipStream_t stream);
 
-extern template struct grouped_infer_masktype_attnbias_dispatched<
+extern template void run_grouped_infer_masktype_attnbias_dispatched<
     ck::bhalf_t,
     1,
-    true>;
+    true>(GroupedForwardParams& param, hipStream_t stream);
 
-extern template struct grouped_infer_masktype_attnbias_dispatched<
+extern template void run_grouped_infer_masktype_attnbias_dispatched<
     ck::bhalf_t,
     1,
-    false>;
+    false>(GroupedForwardParams& param, hipStream_t stream);
 
-extern template struct grouped_infer_masktype_attnbias_dispatched<
+extern template void run_grouped_infer_masktype_attnbias_dispatched<
     ck::bhalf_t,
     2,
-    true>;
+    true>(GroupedForwardParams& param, hipStream_t stream);
 
-extern template struct grouped_infer_masktype_attnbias_dispatched<
+extern template void run_grouped_infer_masktype_attnbias_dispatched<
     ck::bhalf_t,
     2,
-    false>;
+    false>(GroupedForwardParams& param, hipStream_t stream);
 
 void grouped_infer_bp16(GroupedForwardParams& param, hipStream_t stream) {
   BOOL_SWITCH_1(param.has_attn_bias, HAS_ATTN_BIAS, [&] {
     if (param.custom_mask_type == 0)
-      grouped_infer_masktype_attnbias_dispatched<
+      run_grouped_infer_masktype_attnbias_dispatched<
           ck::bhalf_t,
           0,
-          HAS_ATTN_BIAS>::Run(param, stream);
+          HAS_ATTN_BIAS>(param, stream);
     else if (param.custom_mask_type == 1)
-      grouped_infer_masktype_attnbias_dispatched<
+      run_grouped_infer_masktype_attnbias_dispatched<
           ck::bhalf_t,
           1,
-          HAS_ATTN_BIAS>::Run(param, stream);
+          HAS_ATTN_BIAS>(param, stream);
     else if (param.custom_mask_type == 2)
-      grouped_infer_masktype_attnbias_dispatched<
+      run_grouped_infer_masktype_attnbias_dispatched<
           ck::bhalf_t,
           2,
-          HAS_ATTN_BIAS>::Run(param, stream);
+          HAS_ATTN_BIAS>(param, stream);
     else
       throw std::runtime_error("Invalid custom_mask_type value");
   });
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_infer_bp16_masktype_0_no_attnbias.cpp b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_infer_bp16_masktype_0_no_attnbias.cpp
index 67b1dae7c..973213413 100644
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_infer_bp16_masktype_0_no_attnbias.cpp
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_infer_bp16_masktype_0_no_attnbias.cpp
@@ -1,9 +1,8 @@
 #include <ck/ck.hpp>
-#include <stdexcept>
 
 #include "ck_fmha_grouped_infer.h"
 
-template struct grouped_infer_masktype_attnbias_dispatched<
+template void run_grouped_infer_masktype_attnbias_dispatched<
     ck::bhalf_t,
     0,
-    false>;
+    false>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_infer_bp16_masktype_0_with_attnbias.cpp b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_infer_bp16_masktype_0_with_attnbias.cpp
index 343ba049d..96e0ba425 100644
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_infer_bp16_masktype_0_with_attnbias.cpp
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_infer_bp16_masktype_0_with_attnbias.cpp
@@ -1,9 +1,8 @@
 #include <ck/ck.hpp>
-#include <stdexcept>
 
 #include "ck_fmha_grouped_infer.h"
 
-template struct grouped_infer_masktype_attnbias_dispatched<
+template void run_grouped_infer_masktype_attnbias_dispatched<
     ck::bhalf_t,
     0,
-    true>;
+    true>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_infer_bp16_masktype_1_no_attnbias.cpp b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_infer_bp16_masktype_1_no_attnbias.cpp
index c42bacba3..332724e73 100644
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_infer_bp16_masktype_1_no_attnbias.cpp
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_infer_bp16_masktype_1_no_attnbias.cpp
@@ -1,9 +1,8 @@
 #include <ck/ck.hpp>
-#include <stdexcept>
 
 #include "ck_fmha_grouped_infer.h"
 
-template struct grouped_infer_masktype_attnbias_dispatched<
+template void run_grouped_infer_masktype_attnbias_dispatched<
     ck::bhalf_t,
     1,
-    false>;
+    false>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_infer_bp16_masktype_1_with_attnbias.cpp b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_infer_bp16_masktype_1_with_attnbias.cpp
index fc9563043..cb1120f5b 100644
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_infer_bp16_masktype_1_with_attnbias.cpp
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_infer_bp16_masktype_1_with_attnbias.cpp
@@ -1,9 +1,8 @@
 #include <ck/ck.hpp>
-#include <stdexcept>
 
 #include "ck_fmha_grouped_infer.h"
 
-template struct grouped_infer_masktype_attnbias_dispatched<
+template void run_grouped_infer_masktype_attnbias_dispatched<
     ck::bhalf_t,
     1,
-    true>;
+    true>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_infer_bp16_masktype_2_no_attnbias.cpp b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_infer_bp16_masktype_2_no_attnbias.cpp
index 2599755a0..51ed70cab 100644
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_infer_bp16_masktype_2_no_attnbias.cpp
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_infer_bp16_masktype_2_no_attnbias.cpp
@@ -1,9 +1,8 @@
 #include <ck/ck.hpp>
-#include <stdexcept>
 
 #include "ck_fmha_grouped_infer.h"
 
-template struct grouped_infer_masktype_attnbias_dispatched<
+template void run_grouped_infer_masktype_attnbias_dispatched<
     ck::bhalf_t,
     2,
-    false>;
+    false>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_infer_bp16_masktype_2_with_attnbias.cpp b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_infer_bp16_masktype_2_with_attnbias.cpp
index bf9183e86..c157e89c1 100644
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_infer_bp16_masktype_2_with_attnbias.cpp
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_infer_bp16_masktype_2_with_attnbias.cpp
@@ -1,9 +1,8 @@
 #include <ck/ck.hpp>
-#include <stdexcept>
 
 #include "ck_fmha_grouped_infer.h"
 
-template struct grouped_infer_masktype_attnbias_dispatched<
+template void run_grouped_infer_masktype_attnbias_dispatched<
     ck::bhalf_t,
     2,
-    true>;
+    true>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_infer_fp16.cpp b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_infer_fp16.cpp
index 9a015601f..0ca1c3eba 100644
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_infer_fp16.cpp
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_infer_fp16.cpp
@@ -1,50 +1,56 @@
 #include <ck/ck.hpp>
 #include <stdexcept>
 
-#include "ck_fmha_grouped_infer.h"
 #include "ck_bool_switch.h"
+#include "ck_fmha_grouped_infer.h"
 
-extern template struct grouped_infer_masktype_attnbias_dispatched<
+extern template void run_grouped_infer_masktype_attnbias_dispatched<
     ck::half_t,
     0,
-    true>;
+    true>(GroupedForwardParams& param, hipStream_t stream);
 
-extern template struct grouped_infer_masktype_attnbias_dispatched<
+extern template void run_grouped_infer_masktype_attnbias_dispatched<
     ck::half_t,
     0,
-    false>;
+    false>(GroupedForwardParams& param, hipStream_t stream);
 
-extern template struct grouped_infer_masktype_attnbias_dispatched<
+extern template void run_grouped_infer_masktype_attnbias_dispatched<
     ck::half_t,
     1,
-    true>;
+    true>(GroupedForwardParams& param, hipStream_t stream);
 
-extern template struct grouped_infer_masktype_attnbias_dispatched<
+extern template void run_grouped_infer_masktype_attnbias_dispatched<
     ck::half_t,
     1,
-    false>;
+    false>(GroupedForwardParams& param, hipStream_t stream);
 
-extern template struct grouped_infer_masktype_attnbias_dispatched<
+extern template void run_grouped_infer_masktype_attnbias_dispatched<
     ck::half_t,
     2,
-    true>;
+    true>(GroupedForwardParams& param, hipStream_t stream);
 
-extern template struct grouped_infer_masktype_attnbias_dispatched<
+extern template void run_grouped_infer_masktype_attnbias_dispatched<
     ck::half_t,
     2,
-    false>;
+    false>(GroupedForwardParams& param, hipStream_t stream);
 
 void grouped_infer_fp16(GroupedForwardParams& param, hipStream_t stream) {
   BOOL_SWITCH_1(param.has_attn_bias, HAS_ATTN_BIAS, [&] {
     if (param.custom_mask_type == 0)
-      grouped_infer_masktype_attnbias_dispatched<ck::half_t, 0, HAS_ATTN_BIAS>::
-          Run(param, stream);
+      run_grouped_infer_masktype_attnbias_dispatched<
+          ck::half_t,
+          0,
+          HAS_ATTN_BIAS>(param, stream);
     else if (param.custom_mask_type == 1)
-      grouped_infer_masktype_attnbias_dispatched<ck::half_t, 1, HAS_ATTN_BIAS>::
-          Run(param, stream);
+      run_grouped_infer_masktype_attnbias_dispatched<
+          ck::half_t,
+          1,
+          HAS_ATTN_BIAS>(param, stream);
     else if (param.custom_mask_type == 2)
-      grouped_infer_masktype_attnbias_dispatched<ck::half_t, 2, HAS_ATTN_BIAS>::
-          Run(param, stream);
+      run_grouped_infer_masktype_attnbias_dispatched<
+          ck::half_t,
+          2,
+          HAS_ATTN_BIAS>(param, stream);
     else
       throw std::runtime_error("Invalid custom_mask_type value");
   });
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_infer_fp16_masktype_0_no_attnbias.cpp b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_infer_fp16_masktype_0_no_attnbias.cpp
index 39b4a9adf..bbcd3ab0e 100644
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_infer_fp16_masktype_0_no_attnbias.cpp
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_infer_fp16_masktype_0_no_attnbias.cpp
@@ -1,9 +1,8 @@
 #include <ck/ck.hpp>
-#include <stdexcept>
 
 #include "ck_fmha_grouped_infer.h"
 
-template struct grouped_infer_masktype_attnbias_dispatched<
+template void run_grouped_infer_masktype_attnbias_dispatched<
     ck::half_t,
     0,
-    false>;
+    false>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_infer_fp16_masktype_0_with_attnbias.cpp b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_infer_fp16_masktype_0_with_attnbias.cpp
index 7bda05420..e320f5de6 100644
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_infer_fp16_masktype_0_with_attnbias.cpp
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_infer_fp16_masktype_0_with_attnbias.cpp
@@ -1,6 +1,8 @@
 #include <ck/ck.hpp>
-#include <stdexcept>
 
 #include "ck_fmha_grouped_infer.h"
 
-template struct grouped_infer_masktype_attnbias_dispatched<ck::half_t, 0, true>;
+template void run_grouped_infer_masktype_attnbias_dispatched<
+    ck::half_t,
+    0,
+    true>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_infer_fp16_masktype_1_no_attnbias.cpp b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_infer_fp16_masktype_1_no_attnbias.cpp
index 34c2c97c0..e763dde6a 100644
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_infer_fp16_masktype_1_no_attnbias.cpp
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_infer_fp16_masktype_1_no_attnbias.cpp
@@ -1,9 +1,8 @@
 #include <ck/ck.hpp>
-#include <stdexcept>
 
 #include "ck_fmha_grouped_infer.h"
 
-template struct grouped_infer_masktype_attnbias_dispatched<
+template void run_grouped_infer_masktype_attnbias_dispatched<
     ck::half_t,
     1,
-    false>;
+    false>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_infer_fp16_masktype_1_with_attnbias.cpp b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_infer_fp16_masktype_1_with_attnbias.cpp
index 66c2d5724..3ec2d41da 100644
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_infer_fp16_masktype_1_with_attnbias.cpp
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_infer_fp16_masktype_1_with_attnbias.cpp
@@ -1,6 +1,8 @@
 #include <ck/ck.hpp>
-#include <stdexcept>
 
 #include "ck_fmha_grouped_infer.h"
 
-template struct grouped_infer_masktype_attnbias_dispatched<ck::half_t, 1, true>;
+template void run_grouped_infer_masktype_attnbias_dispatched<
+    ck::half_t,
+    1,
+    true>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_infer_fp16_masktype_2_no_attnbias.cpp b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_infer_fp16_masktype_2_no_attnbias.cpp
index ab0d8176d..dee7a0845 100644
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_infer_fp16_masktype_2_no_attnbias.cpp
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_infer_fp16_masktype_2_no_attnbias.cpp
@@ -1,9 +1,8 @@
 #include <ck/ck.hpp>
-#include <stdexcept>
 
 #include "ck_fmha_grouped_infer.h"
 
-template struct grouped_infer_masktype_attnbias_dispatched<
+template void run_grouped_infer_masktype_attnbias_dispatched<
     ck::half_t,
     2,
-    false>;
+    false>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_infer_fp16_masktype_2_with_attnbias.cpp b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_infer_fp16_masktype_2_with_attnbias.cpp
index 8bcb37f74..b5515e9a0 100644
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_infer_fp16_masktype_2_with_attnbias.cpp
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_infer_fp16_masktype_2_with_attnbias.cpp
@@ -1,6 +1,8 @@
 #include <ck/ck.hpp>
-#include <stdexcept>
 
 #include "ck_fmha_grouped_infer.h"
 
-template struct grouped_infer_masktype_attnbias_dispatched<ck::half_t, 2, true>;
+template void run_grouped_infer_masktype_attnbias_dispatched<
+    ck::half_t,
+    2,
+    true>(GroupedForwardParams& param, hipStream_t stream);

From efab61e55775a3a8610f50c4f84add839cf29adb Mon Sep 17 00:00:00 2001
From: Qianfeng Zhang <Qianfeng.Zhang@amd.com>
Date: Wed, 8 Nov 2023 14:43:56 +0000
Subject: [PATCH 191/641] Move instances cpp to instances sub-directory

---
 setup.py                                        |  4 ++--
 ...hed_backward_bp16_masktype_0_no_attnbias.cpp |  0
 ...ched_backward_bp16_masktype_0_no_attnbias.cu | 14 ++++++++++++++
 ...hed_backward_bp16_masktype_0_no_attnbias.hip | 15 +++++++++++++++
 ...d_backward_bp16_masktype_0_with_attnbias.cpp |  0
 ...ed_backward_bp16_masktype_0_with_attnbias.cu | 14 ++++++++++++++
 ...d_backward_bp16_masktype_0_with_attnbias.hip | 15 +++++++++++++++
 ...hed_backward_bp16_masktype_1_no_attnbias.cpp |  0
 ...ched_backward_bp16_masktype_1_no_attnbias.cu | 14 ++++++++++++++
 ...hed_backward_bp16_masktype_1_no_attnbias.hip | 15 +++++++++++++++
 ...d_backward_bp16_masktype_1_with_attnbias.cpp |  0
 ...ed_backward_bp16_masktype_1_with_attnbias.cu | 14 ++++++++++++++
 ...d_backward_bp16_masktype_1_with_attnbias.hip | 15 +++++++++++++++
 ...hed_backward_bp16_masktype_2_no_attnbias.cpp |  0
 ...ched_backward_bp16_masktype_2_no_attnbias.cu | 14 ++++++++++++++
 ...hed_backward_bp16_masktype_2_no_attnbias.hip | 15 +++++++++++++++
 ...d_backward_bp16_masktype_2_with_attnbias.cpp |  0
 ...ed_backward_bp16_masktype_2_with_attnbias.cu | 14 ++++++++++++++
 ...d_backward_bp16_masktype_2_with_attnbias.hip | 15 +++++++++++++++
 ...hed_backward_fp16_masktype_0_no_attnbias.cpp |  0
 ...ched_backward_fp16_masktype_0_no_attnbias.cu | 14 ++++++++++++++
 ...hed_backward_fp16_masktype_0_no_attnbias.hip | 15 +++++++++++++++
 ...d_backward_fp16_masktype_0_with_attnbias.cpp |  0
 ...ed_backward_fp16_masktype_0_with_attnbias.cu | 14 ++++++++++++++
 ...d_backward_fp16_masktype_0_with_attnbias.hip | 15 +++++++++++++++
 ...hed_backward_fp16_masktype_1_no_attnbias.cpp |  0
 ...ched_backward_fp16_masktype_1_no_attnbias.cu | 14 ++++++++++++++
 ...hed_backward_fp16_masktype_1_no_attnbias.hip | 15 +++++++++++++++
 ...d_backward_fp16_masktype_1_with_attnbias.cpp |  0
 ...ed_backward_fp16_masktype_1_with_attnbias.cu | 16 ++++++++++++++++
 ...d_backward_fp16_masktype_1_with_attnbias.hip | 17 +++++++++++++++++
 ...hed_backward_fp16_masktype_2_no_attnbias.cpp |  0
 ...ched_backward_fp16_masktype_2_no_attnbias.cu | 14 ++++++++++++++
 ...hed_backward_fp16_masktype_2_no_attnbias.hip | 15 +++++++++++++++
 ...d_backward_fp16_masktype_2_with_attnbias.cpp |  0
 ...ed_backward_fp16_masktype_2_with_attnbias.cu | 14 ++++++++++++++
 ...d_backward_fp16_masktype_2_with_attnbias.hip | 15 +++++++++++++++
 ...ched_forward_bp16_masktype_0_no_attnbias.cpp |  0
 ...tched_forward_bp16_masktype_0_no_attnbias.cu |  7 +++++++
 ...ched_forward_bp16_masktype_0_no_attnbias.hip |  8 ++++++++
 ...ed_forward_bp16_masktype_0_with_attnbias.cpp |  0
 ...hed_forward_bp16_masktype_0_with_attnbias.cu |  7 +++++++
 ...ed_forward_bp16_masktype_0_with_attnbias.hip |  8 ++++++++
 ...ched_forward_bp16_masktype_1_no_attnbias.cpp |  0
 ...tched_forward_bp16_masktype_1_no_attnbias.cu |  7 +++++++
 ...ched_forward_bp16_masktype_1_no_attnbias.hip |  8 ++++++++
 ...ed_forward_bp16_masktype_1_with_attnbias.cpp |  0
 ...hed_forward_bp16_masktype_1_with_attnbias.cu |  7 +++++++
 ...ed_forward_bp16_masktype_1_with_attnbias.hip |  8 ++++++++
 ...ched_forward_bp16_masktype_2_no_attnbias.cpp |  0
 ...tched_forward_bp16_masktype_2_no_attnbias.cu |  7 +++++++
 ...ched_forward_bp16_masktype_2_no_attnbias.hip |  8 ++++++++
 ...ed_forward_bp16_masktype_2_with_attnbias.cpp |  0
 ...hed_forward_bp16_masktype_2_with_attnbias.cu |  7 +++++++
 ...ed_forward_bp16_masktype_2_with_attnbias.hip |  8 ++++++++
 ...ched_forward_fp16_masktype_0_no_attnbias.cpp |  0
 ...tched_forward_fp16_masktype_0_no_attnbias.cu |  7 +++++++
 ...ched_forward_fp16_masktype_0_no_attnbias.hip |  8 ++++++++
 ...ed_forward_fp16_masktype_0_with_attnbias.cpp |  0
 ...hed_forward_fp16_masktype_0_with_attnbias.cu |  7 +++++++
 ...ed_forward_fp16_masktype_0_with_attnbias.hip |  8 ++++++++
 ...ched_forward_fp16_masktype_1_no_attnbias.cpp |  0
 ...tched_forward_fp16_masktype_1_no_attnbias.cu |  7 +++++++
 ...ched_forward_fp16_masktype_1_no_attnbias.hip |  8 ++++++++
 ...ed_forward_fp16_masktype_1_with_attnbias.cpp |  0
 ...hed_forward_fp16_masktype_1_with_attnbias.cu |  7 +++++++
 ...ed_forward_fp16_masktype_1_with_attnbias.hip |  8 ++++++++
 ...ched_forward_fp16_masktype_2_no_attnbias.cpp |  0
 ...tched_forward_fp16_masktype_2_no_attnbias.cu |  7 +++++++
 ...ched_forward_fp16_masktype_2_no_attnbias.hip |  8 ++++++++
 ...ed_forward_fp16_masktype_2_with_attnbias.cpp |  0
 ...hed_forward_fp16_masktype_2_with_attnbias.cu |  7 +++++++
 ...ed_forward_fp16_masktype_2_with_attnbias.hip |  8 ++++++++
 ...atched_infer_bp16_masktype_0_no_attnbias.cpp |  0
 ...batched_infer_bp16_masktype_0_no_attnbias.cu |  8 ++++++++
 ...atched_infer_bp16_masktype_0_no_attnbias.hip |  9 +++++++++
 ...ched_infer_bp16_masktype_0_with_attnbias.cpp |  0
 ...tched_infer_bp16_masktype_0_with_attnbias.cu |  8 ++++++++
 ...ched_infer_bp16_masktype_0_with_attnbias.hip |  9 +++++++++
 ...atched_infer_bp16_masktype_1_no_attnbias.cpp |  0
 ...batched_infer_bp16_masktype_1_no_attnbias.cu |  8 ++++++++
 ...atched_infer_bp16_masktype_1_no_attnbias.hip |  9 +++++++++
 ...ched_infer_bp16_masktype_1_with_attnbias.cpp |  0
 ...tched_infer_bp16_masktype_1_with_attnbias.cu |  8 ++++++++
 ...ched_infer_bp16_masktype_1_with_attnbias.hip |  9 +++++++++
 ...atched_infer_bp16_masktype_2_no_attnbias.cpp |  0
 ...batched_infer_bp16_masktype_2_no_attnbias.cu |  8 ++++++++
 ...atched_infer_bp16_masktype_2_no_attnbias.hip |  9 +++++++++
 ...ched_infer_bp16_masktype_2_with_attnbias.cpp |  0
 ...tched_infer_bp16_masktype_2_with_attnbias.cu |  8 ++++++++
 ...ched_infer_bp16_masktype_2_with_attnbias.hip |  9 +++++++++
 ...atched_infer_fp16_masktype_0_no_attnbias.cpp |  0
 ...batched_infer_fp16_masktype_0_no_attnbias.cu |  8 ++++++++
 ...atched_infer_fp16_masktype_0_no_attnbias.hip |  9 +++++++++
 ...ched_infer_fp16_masktype_0_with_attnbias.cpp |  0
 ...tched_infer_fp16_masktype_0_with_attnbias.cu |  8 ++++++++
 ...ched_infer_fp16_masktype_0_with_attnbias.hip |  9 +++++++++
 ...atched_infer_fp16_masktype_1_no_attnbias.cpp |  0
 ...batched_infer_fp16_masktype_1_no_attnbias.cu |  8 ++++++++
 ...atched_infer_fp16_masktype_1_no_attnbias.hip |  9 +++++++++
 ...ched_infer_fp16_masktype_1_with_attnbias.cpp |  0
 ...tched_infer_fp16_masktype_1_with_attnbias.cu |  8 ++++++++
 ...ched_infer_fp16_masktype_1_with_attnbias.hip |  9 +++++++++
 ...atched_infer_fp16_masktype_2_no_attnbias.cpp |  0
 ...batched_infer_fp16_masktype_2_no_attnbias.cu |  8 ++++++++
 ...atched_infer_fp16_masktype_2_no_attnbias.hip |  9 +++++++++
 ...ched_infer_fp16_masktype_2_with_attnbias.cpp |  0
 ...tched_infer_fp16_masktype_2_with_attnbias.cu |  8 ++++++++
 ...ched_infer_fp16_masktype_2_with_attnbias.hip |  9 +++++++++
 ...ped_backward_bp16_masktype_0_no_attnbias.cpp |  0
 ...uped_backward_bp16_masktype_0_no_attnbias.cu | 14 ++++++++++++++
 ...ped_backward_bp16_masktype_0_no_attnbias.hip | 15 +++++++++++++++
 ...d_backward_bp16_masktype_0_with_attnbias.cpp |  0
 ...ed_backward_bp16_masktype_0_with_attnbias.cu | 14 ++++++++++++++
 ...d_backward_bp16_masktype_0_with_attnbias.hip | 15 +++++++++++++++
 ...ped_backward_bp16_masktype_1_no_attnbias.cpp |  0
 ...uped_backward_bp16_masktype_1_no_attnbias.cu | 14 ++++++++++++++
 ...ped_backward_bp16_masktype_1_no_attnbias.hip | 15 +++++++++++++++
 ...d_backward_bp16_masktype_1_with_attnbias.cpp |  0
 ...ed_backward_bp16_masktype_1_with_attnbias.cu | 14 ++++++++++++++
 ...d_backward_bp16_masktype_1_with_attnbias.hip | 15 +++++++++++++++
 ...ped_backward_bp16_masktype_2_no_attnbias.cpp |  0
 ...uped_backward_bp16_masktype_2_no_attnbias.cu | 14 ++++++++++++++
 ...ped_backward_bp16_masktype_2_no_attnbias.hip | 15 +++++++++++++++
 ...d_backward_bp16_masktype_2_with_attnbias.cpp |  0
 ...ed_backward_bp16_masktype_2_with_attnbias.cu | 14 ++++++++++++++
 ...d_backward_bp16_masktype_2_with_attnbias.hip | 15 +++++++++++++++
 ...ped_backward_fp16_masktype_0_no_attnbias.cpp |  0
 ...uped_backward_fp16_masktype_0_no_attnbias.cu | 14 ++++++++++++++
 ...ped_backward_fp16_masktype_0_no_attnbias.hip | 15 +++++++++++++++
 ...d_backward_fp16_masktype_0_with_attnbias.cpp |  0
 ...ed_backward_fp16_masktype_0_with_attnbias.cu | 14 ++++++++++++++
 ...d_backward_fp16_masktype_0_with_attnbias.hip | 15 +++++++++++++++
 ...ped_backward_fp16_masktype_1_no_attnbias.cpp |  0
 ...uped_backward_fp16_masktype_1_no_attnbias.cu | 14 ++++++++++++++
 ...ped_backward_fp16_masktype_1_no_attnbias.hip | 15 +++++++++++++++
 ...d_backward_fp16_masktype_1_with_attnbias.cpp |  0
 ...ed_backward_fp16_masktype_1_with_attnbias.cu | 14 ++++++++++++++
 ...d_backward_fp16_masktype_1_with_attnbias.hip | 15 +++++++++++++++
 ...ped_backward_fp16_masktype_2_no_attnbias.cpp |  0
 ...uped_backward_fp16_masktype_2_no_attnbias.cu | 14 ++++++++++++++
 ...ped_backward_fp16_masktype_2_no_attnbias.hip | 15 +++++++++++++++
 ...d_backward_fp16_masktype_2_with_attnbias.cpp |  0
 ...ed_backward_fp16_masktype_2_with_attnbias.cu | 14 ++++++++++++++
 ...d_backward_fp16_masktype_2_with_attnbias.hip | 15 +++++++++++++++
 ...uped_forward_bp16_masktype_0_no_attnbias.cpp |  0
 ...ouped_forward_bp16_masktype_0_no_attnbias.cu |  7 +++++++
 ...uped_forward_bp16_masktype_0_no_attnbias.hip |  8 ++++++++
 ...ed_forward_bp16_masktype_0_with_attnbias.cpp |  0
 ...ped_forward_bp16_masktype_0_with_attnbias.cu |  7 +++++++
 ...ed_forward_bp16_masktype_0_with_attnbias.hip |  8 ++++++++
 ...uped_forward_bp16_masktype_1_no_attnbias.cpp |  0
 ...ouped_forward_bp16_masktype_1_no_attnbias.cu |  7 +++++++
 ...uped_forward_bp16_masktype_1_no_attnbias.hip |  8 ++++++++
 ...ed_forward_bp16_masktype_1_with_attnbias.cpp |  0
 ...ped_forward_bp16_masktype_1_with_attnbias.cu |  7 +++++++
 ...ed_forward_bp16_masktype_1_with_attnbias.hip |  8 ++++++++
 ...uped_forward_bp16_masktype_2_no_attnbias.cpp |  0
 ...ouped_forward_bp16_masktype_2_no_attnbias.cu |  7 +++++++
 ...uped_forward_bp16_masktype_2_no_attnbias.hip |  8 ++++++++
 ...ed_forward_bp16_masktype_2_with_attnbias.cpp |  0
 ...ped_forward_bp16_masktype_2_with_attnbias.cu |  7 +++++++
 ...ed_forward_bp16_masktype_2_with_attnbias.hip |  8 ++++++++
 ...uped_forward_fp16_masktype_0_no_attnbias.cpp |  0
 ...ouped_forward_fp16_masktype_0_no_attnbias.cu |  7 +++++++
 ...uped_forward_fp16_masktype_0_no_attnbias.hip |  8 ++++++++
 ...ed_forward_fp16_masktype_0_with_attnbias.cpp |  0
 ...ped_forward_fp16_masktype_0_with_attnbias.cu |  7 +++++++
 ...ed_forward_fp16_masktype_0_with_attnbias.hip |  8 ++++++++
 ...uped_forward_fp16_masktype_1_no_attnbias.cpp |  0
 ...ouped_forward_fp16_masktype_1_no_attnbias.cu |  7 +++++++
 ...uped_forward_fp16_masktype_1_no_attnbias.hip |  8 ++++++++
 ...ed_forward_fp16_masktype_1_with_attnbias.cpp |  0
 ...ped_forward_fp16_masktype_1_with_attnbias.cu |  7 +++++++
 ...ed_forward_fp16_masktype_1_with_attnbias.hip |  8 ++++++++
 ...uped_forward_fp16_masktype_2_no_attnbias.cpp |  0
 ...ouped_forward_fp16_masktype_2_no_attnbias.cu |  7 +++++++
 ...uped_forward_fp16_masktype_2_no_attnbias.hip |  8 ++++++++
 ...ed_forward_fp16_masktype_2_with_attnbias.cpp |  0
 ...ped_forward_fp16_masktype_2_with_attnbias.cu |  7 +++++++
 ...ed_forward_fp16_masktype_2_with_attnbias.hip |  8 ++++++++
 ...rouped_infer_bp16_masktype_0_no_attnbias.cpp |  0
 ...grouped_infer_bp16_masktype_0_no_attnbias.cu |  8 ++++++++
 ...rouped_infer_bp16_masktype_0_no_attnbias.hip |  9 +++++++++
 ...uped_infer_bp16_masktype_0_with_attnbias.cpp |  0
 ...ouped_infer_bp16_masktype_0_with_attnbias.cu |  8 ++++++++
 ...uped_infer_bp16_masktype_0_with_attnbias.hip |  9 +++++++++
 ...rouped_infer_bp16_masktype_1_no_attnbias.cpp |  0
 ...grouped_infer_bp16_masktype_1_no_attnbias.cu |  8 ++++++++
 ...rouped_infer_bp16_masktype_1_no_attnbias.hip |  9 +++++++++
 ...uped_infer_bp16_masktype_1_with_attnbias.cpp |  0
 ...ouped_infer_bp16_masktype_1_with_attnbias.cu |  8 ++++++++
 ...uped_infer_bp16_masktype_1_with_attnbias.hip |  9 +++++++++
 ...rouped_infer_bp16_masktype_2_no_attnbias.cpp |  0
 ...grouped_infer_bp16_masktype_2_no_attnbias.cu |  8 ++++++++
 ...rouped_infer_bp16_masktype_2_no_attnbias.hip |  9 +++++++++
 ...uped_infer_bp16_masktype_2_with_attnbias.cpp |  0
 ...ouped_infer_bp16_masktype_2_with_attnbias.cu |  8 ++++++++
 ...uped_infer_bp16_masktype_2_with_attnbias.hip |  9 +++++++++
 ...rouped_infer_fp16_masktype_0_no_attnbias.cpp |  0
 ...grouped_infer_fp16_masktype_0_no_attnbias.cu |  8 ++++++++
 ...rouped_infer_fp16_masktype_0_no_attnbias.hip |  9 +++++++++
 ...uped_infer_fp16_masktype_0_with_attnbias.cpp |  0
 ...ouped_infer_fp16_masktype_0_with_attnbias.cu |  8 ++++++++
 ...uped_infer_fp16_masktype_0_with_attnbias.hip |  9 +++++++++
 ...rouped_infer_fp16_masktype_1_no_attnbias.cpp |  0
 ...grouped_infer_fp16_masktype_1_no_attnbias.cu |  8 ++++++++
 ...rouped_infer_fp16_masktype_1_no_attnbias.hip |  9 +++++++++
 ...uped_infer_fp16_masktype_1_with_attnbias.cpp |  0
 ...ouped_infer_fp16_masktype_1_with_attnbias.cu |  8 ++++++++
 ...uped_infer_fp16_masktype_1_with_attnbias.hip |  9 +++++++++
 ...rouped_infer_fp16_masktype_2_no_attnbias.cpp |  0
 ...grouped_infer_fp16_masktype_2_no_attnbias.cu |  8 ++++++++
 ...rouped_infer_fp16_masktype_2_no_attnbias.hip |  9 +++++++++
 ...uped_infer_fp16_masktype_2_with_attnbias.cpp |  0
 ...ouped_infer_fp16_masktype_2_with_attnbias.cu |  8 ++++++++
 ...uped_infer_fp16_masktype_2_with_attnbias.hip |  9 +++++++++
 217 files changed, 1470 insertions(+), 2 deletions(-)
 rename xformers/csrc/attention/hip_fmha/{ => instances}/ck_fmha_batched_backward_bp16_masktype_0_no_attnbias.cpp (100%)
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_bp16_masktype_0_no_attnbias.cu
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_bp16_masktype_0_no_attnbias.hip
 rename xformers/csrc/attention/hip_fmha/{ => instances}/ck_fmha_batched_backward_bp16_masktype_0_with_attnbias.cpp (100%)
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_bp16_masktype_0_with_attnbias.cu
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_bp16_masktype_0_with_attnbias.hip
 rename xformers/csrc/attention/hip_fmha/{ => instances}/ck_fmha_batched_backward_bp16_masktype_1_no_attnbias.cpp (100%)
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_bp16_masktype_1_no_attnbias.cu
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_bp16_masktype_1_no_attnbias.hip
 rename xformers/csrc/attention/hip_fmha/{ => instances}/ck_fmha_batched_backward_bp16_masktype_1_with_attnbias.cpp (100%)
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_bp16_masktype_1_with_attnbias.cu
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_bp16_masktype_1_with_attnbias.hip
 rename xformers/csrc/attention/hip_fmha/{ => instances}/ck_fmha_batched_backward_bp16_masktype_2_no_attnbias.cpp (100%)
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_bp16_masktype_2_no_attnbias.cu
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_bp16_masktype_2_no_attnbias.hip
 rename xformers/csrc/attention/hip_fmha/{ => instances}/ck_fmha_batched_backward_bp16_masktype_2_with_attnbias.cpp (100%)
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_bp16_masktype_2_with_attnbias.cu
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_bp16_masktype_2_with_attnbias.hip
 rename xformers/csrc/attention/hip_fmha/{ => instances}/ck_fmha_batched_backward_fp16_masktype_0_no_attnbias.cpp (100%)
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_fp16_masktype_0_no_attnbias.cu
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_fp16_masktype_0_no_attnbias.hip
 rename xformers/csrc/attention/hip_fmha/{ => instances}/ck_fmha_batched_backward_fp16_masktype_0_with_attnbias.cpp (100%)
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_fp16_masktype_0_with_attnbias.cu
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_fp16_masktype_0_with_attnbias.hip
 rename xformers/csrc/attention/hip_fmha/{ => instances}/ck_fmha_batched_backward_fp16_masktype_1_no_attnbias.cpp (100%)
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_fp16_masktype_1_no_attnbias.cu
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_fp16_masktype_1_no_attnbias.hip
 rename xformers/csrc/attention/hip_fmha/{ => instances}/ck_fmha_batched_backward_fp16_masktype_1_with_attnbias.cpp (100%)
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_fp16_masktype_1_with_attnbias.cu
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_fp16_masktype_1_with_attnbias.hip
 rename xformers/csrc/attention/hip_fmha/{ => instances}/ck_fmha_batched_backward_fp16_masktype_2_no_attnbias.cpp (100%)
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_fp16_masktype_2_no_attnbias.cu
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_fp16_masktype_2_no_attnbias.hip
 rename xformers/csrc/attention/hip_fmha/{ => instances}/ck_fmha_batched_backward_fp16_masktype_2_with_attnbias.cpp (100%)
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_fp16_masktype_2_with_attnbias.cu
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_fp16_masktype_2_with_attnbias.hip
 rename xformers/csrc/attention/hip_fmha/{ => instances}/ck_fmha_batched_forward_bp16_masktype_0_no_attnbias.cpp (100%)
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_forward_bp16_masktype_0_no_attnbias.cu
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_forward_bp16_masktype_0_no_attnbias.hip
 rename xformers/csrc/attention/hip_fmha/{ => instances}/ck_fmha_batched_forward_bp16_masktype_0_with_attnbias.cpp (100%)
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_forward_bp16_masktype_0_with_attnbias.cu
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_forward_bp16_masktype_0_with_attnbias.hip
 rename xformers/csrc/attention/hip_fmha/{ => instances}/ck_fmha_batched_forward_bp16_masktype_1_no_attnbias.cpp (100%)
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_forward_bp16_masktype_1_no_attnbias.cu
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_forward_bp16_masktype_1_no_attnbias.hip
 rename xformers/csrc/attention/hip_fmha/{ => instances}/ck_fmha_batched_forward_bp16_masktype_1_with_attnbias.cpp (100%)
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_forward_bp16_masktype_1_with_attnbias.cu
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_forward_bp16_masktype_1_with_attnbias.hip
 rename xformers/csrc/attention/hip_fmha/{ => instances}/ck_fmha_batched_forward_bp16_masktype_2_no_attnbias.cpp (100%)
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_forward_bp16_masktype_2_no_attnbias.cu
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_forward_bp16_masktype_2_no_attnbias.hip
 rename xformers/csrc/attention/hip_fmha/{ => instances}/ck_fmha_batched_forward_bp16_masktype_2_with_attnbias.cpp (100%)
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_forward_bp16_masktype_2_with_attnbias.cu
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_forward_bp16_masktype_2_with_attnbias.hip
 rename xformers/csrc/attention/hip_fmha/{ => instances}/ck_fmha_batched_forward_fp16_masktype_0_no_attnbias.cpp (100%)
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_forward_fp16_masktype_0_no_attnbias.cu
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_forward_fp16_masktype_0_no_attnbias.hip
 rename xformers/csrc/attention/hip_fmha/{ => instances}/ck_fmha_batched_forward_fp16_masktype_0_with_attnbias.cpp (100%)
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_forward_fp16_masktype_0_with_attnbias.cu
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_forward_fp16_masktype_0_with_attnbias.hip
 rename xformers/csrc/attention/hip_fmha/{ => instances}/ck_fmha_batched_forward_fp16_masktype_1_no_attnbias.cpp (100%)
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_forward_fp16_masktype_1_no_attnbias.cu
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_forward_fp16_masktype_1_no_attnbias.hip
 rename xformers/csrc/attention/hip_fmha/{ => instances}/ck_fmha_batched_forward_fp16_masktype_1_with_attnbias.cpp (100%)
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_forward_fp16_masktype_1_with_attnbias.cu
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_forward_fp16_masktype_1_with_attnbias.hip
 rename xformers/csrc/attention/hip_fmha/{ => instances}/ck_fmha_batched_forward_fp16_masktype_2_no_attnbias.cpp (100%)
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_forward_fp16_masktype_2_no_attnbias.cu
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_forward_fp16_masktype_2_no_attnbias.hip
 rename xformers/csrc/attention/hip_fmha/{ => instances}/ck_fmha_batched_forward_fp16_masktype_2_with_attnbias.cpp (100%)
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_forward_fp16_masktype_2_with_attnbias.cu
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_forward_fp16_masktype_2_with_attnbias.hip
 rename xformers/csrc/attention/hip_fmha/{ => instances}/ck_fmha_batched_infer_bp16_masktype_0_no_attnbias.cpp (100%)
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_infer_bp16_masktype_0_no_attnbias.cu
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_infer_bp16_masktype_0_no_attnbias.hip
 rename xformers/csrc/attention/hip_fmha/{ => instances}/ck_fmha_batched_infer_bp16_masktype_0_with_attnbias.cpp (100%)
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_infer_bp16_masktype_0_with_attnbias.cu
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_infer_bp16_masktype_0_with_attnbias.hip
 rename xformers/csrc/attention/hip_fmha/{ => instances}/ck_fmha_batched_infer_bp16_masktype_1_no_attnbias.cpp (100%)
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_infer_bp16_masktype_1_no_attnbias.cu
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_infer_bp16_masktype_1_no_attnbias.hip
 rename xformers/csrc/attention/hip_fmha/{ => instances}/ck_fmha_batched_infer_bp16_masktype_1_with_attnbias.cpp (100%)
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_infer_bp16_masktype_1_with_attnbias.cu
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_infer_bp16_masktype_1_with_attnbias.hip
 rename xformers/csrc/attention/hip_fmha/{ => instances}/ck_fmha_batched_infer_bp16_masktype_2_no_attnbias.cpp (100%)
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_infer_bp16_masktype_2_no_attnbias.cu
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_infer_bp16_masktype_2_no_attnbias.hip
 rename xformers/csrc/attention/hip_fmha/{ => instances}/ck_fmha_batched_infer_bp16_masktype_2_with_attnbias.cpp (100%)
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_infer_bp16_masktype_2_with_attnbias.cu
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_infer_bp16_masktype_2_with_attnbias.hip
 rename xformers/csrc/attention/hip_fmha/{ => instances}/ck_fmha_batched_infer_fp16_masktype_0_no_attnbias.cpp (100%)
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_infer_fp16_masktype_0_no_attnbias.cu
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_infer_fp16_masktype_0_no_attnbias.hip
 rename xformers/csrc/attention/hip_fmha/{ => instances}/ck_fmha_batched_infer_fp16_masktype_0_with_attnbias.cpp (100%)
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_infer_fp16_masktype_0_with_attnbias.cu
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_infer_fp16_masktype_0_with_attnbias.hip
 rename xformers/csrc/attention/hip_fmha/{ => instances}/ck_fmha_batched_infer_fp16_masktype_1_no_attnbias.cpp (100%)
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_infer_fp16_masktype_1_no_attnbias.cu
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_infer_fp16_masktype_1_no_attnbias.hip
 rename xformers/csrc/attention/hip_fmha/{ => instances}/ck_fmha_batched_infer_fp16_masktype_1_with_attnbias.cpp (100%)
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_infer_fp16_masktype_1_with_attnbias.cu
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_infer_fp16_masktype_1_with_attnbias.hip
 rename xformers/csrc/attention/hip_fmha/{ => instances}/ck_fmha_batched_infer_fp16_masktype_2_no_attnbias.cpp (100%)
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_infer_fp16_masktype_2_no_attnbias.cu
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_infer_fp16_masktype_2_no_attnbias.hip
 rename xformers/csrc/attention/hip_fmha/{ => instances}/ck_fmha_batched_infer_fp16_masktype_2_with_attnbias.cpp (100%)
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_infer_fp16_masktype_2_with_attnbias.cu
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_infer_fp16_masktype_2_with_attnbias.hip
 rename xformers/csrc/attention/hip_fmha/{ => instances}/ck_fmha_grouped_backward_bp16_masktype_0_no_attnbias.cpp (100%)
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_bp16_masktype_0_no_attnbias.cu
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_bp16_masktype_0_no_attnbias.hip
 rename xformers/csrc/attention/hip_fmha/{ => instances}/ck_fmha_grouped_backward_bp16_masktype_0_with_attnbias.cpp (100%)
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_bp16_masktype_0_with_attnbias.cu
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_bp16_masktype_0_with_attnbias.hip
 rename xformers/csrc/attention/hip_fmha/{ => instances}/ck_fmha_grouped_backward_bp16_masktype_1_no_attnbias.cpp (100%)
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_bp16_masktype_1_no_attnbias.cu
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_bp16_masktype_1_no_attnbias.hip
 rename xformers/csrc/attention/hip_fmha/{ => instances}/ck_fmha_grouped_backward_bp16_masktype_1_with_attnbias.cpp (100%)
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_bp16_masktype_1_with_attnbias.cu
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_bp16_masktype_1_with_attnbias.hip
 rename xformers/csrc/attention/hip_fmha/{ => instances}/ck_fmha_grouped_backward_bp16_masktype_2_no_attnbias.cpp (100%)
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_bp16_masktype_2_no_attnbias.cu
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_bp16_masktype_2_no_attnbias.hip
 rename xformers/csrc/attention/hip_fmha/{ => instances}/ck_fmha_grouped_backward_bp16_masktype_2_with_attnbias.cpp (100%)
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_bp16_masktype_2_with_attnbias.cu
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_bp16_masktype_2_with_attnbias.hip
 rename xformers/csrc/attention/hip_fmha/{ => instances}/ck_fmha_grouped_backward_fp16_masktype_0_no_attnbias.cpp (100%)
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_fp16_masktype_0_no_attnbias.cu
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_fp16_masktype_0_no_attnbias.hip
 rename xformers/csrc/attention/hip_fmha/{ => instances}/ck_fmha_grouped_backward_fp16_masktype_0_with_attnbias.cpp (100%)
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_fp16_masktype_0_with_attnbias.cu
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_fp16_masktype_0_with_attnbias.hip
 rename xformers/csrc/attention/hip_fmha/{ => instances}/ck_fmha_grouped_backward_fp16_masktype_1_no_attnbias.cpp (100%)
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_fp16_masktype_1_no_attnbias.cu
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_fp16_masktype_1_no_attnbias.hip
 rename xformers/csrc/attention/hip_fmha/{ => instances}/ck_fmha_grouped_backward_fp16_masktype_1_with_attnbias.cpp (100%)
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_fp16_masktype_1_with_attnbias.cu
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_fp16_masktype_1_with_attnbias.hip
 rename xformers/csrc/attention/hip_fmha/{ => instances}/ck_fmha_grouped_backward_fp16_masktype_2_no_attnbias.cpp (100%)
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_fp16_masktype_2_no_attnbias.cu
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_fp16_masktype_2_no_attnbias.hip
 rename xformers/csrc/attention/hip_fmha/{ => instances}/ck_fmha_grouped_backward_fp16_masktype_2_with_attnbias.cpp (100%)
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_fp16_masktype_2_with_attnbias.cu
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_fp16_masktype_2_with_attnbias.hip
 rename xformers/csrc/attention/hip_fmha/{ => instances}/ck_fmha_grouped_forward_bp16_masktype_0_no_attnbias.cpp (100%)
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_forward_bp16_masktype_0_no_attnbias.cu
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_forward_bp16_masktype_0_no_attnbias.hip
 rename xformers/csrc/attention/hip_fmha/{ => instances}/ck_fmha_grouped_forward_bp16_masktype_0_with_attnbias.cpp (100%)
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_forward_bp16_masktype_0_with_attnbias.cu
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_forward_bp16_masktype_0_with_attnbias.hip
 rename xformers/csrc/attention/hip_fmha/{ => instances}/ck_fmha_grouped_forward_bp16_masktype_1_no_attnbias.cpp (100%)
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_forward_bp16_masktype_1_no_attnbias.cu
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_forward_bp16_masktype_1_no_attnbias.hip
 rename xformers/csrc/attention/hip_fmha/{ => instances}/ck_fmha_grouped_forward_bp16_masktype_1_with_attnbias.cpp (100%)
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_forward_bp16_masktype_1_with_attnbias.cu
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_forward_bp16_masktype_1_with_attnbias.hip
 rename xformers/csrc/attention/hip_fmha/{ => instances}/ck_fmha_grouped_forward_bp16_masktype_2_no_attnbias.cpp (100%)
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_forward_bp16_masktype_2_no_attnbias.cu
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_forward_bp16_masktype_2_no_attnbias.hip
 rename xformers/csrc/attention/hip_fmha/{ => instances}/ck_fmha_grouped_forward_bp16_masktype_2_with_attnbias.cpp (100%)
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_forward_bp16_masktype_2_with_attnbias.cu
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_forward_bp16_masktype_2_with_attnbias.hip
 rename xformers/csrc/attention/hip_fmha/{ => instances}/ck_fmha_grouped_forward_fp16_masktype_0_no_attnbias.cpp (100%)
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_forward_fp16_masktype_0_no_attnbias.cu
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_forward_fp16_masktype_0_no_attnbias.hip
 rename xformers/csrc/attention/hip_fmha/{ => instances}/ck_fmha_grouped_forward_fp16_masktype_0_with_attnbias.cpp (100%)
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_forward_fp16_masktype_0_with_attnbias.cu
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_forward_fp16_masktype_0_with_attnbias.hip
 rename xformers/csrc/attention/hip_fmha/{ => instances}/ck_fmha_grouped_forward_fp16_masktype_1_no_attnbias.cpp (100%)
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_forward_fp16_masktype_1_no_attnbias.cu
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_forward_fp16_masktype_1_no_attnbias.hip
 rename xformers/csrc/attention/hip_fmha/{ => instances}/ck_fmha_grouped_forward_fp16_masktype_1_with_attnbias.cpp (100%)
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_forward_fp16_masktype_1_with_attnbias.cu
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_forward_fp16_masktype_1_with_attnbias.hip
 rename xformers/csrc/attention/hip_fmha/{ => instances}/ck_fmha_grouped_forward_fp16_masktype_2_no_attnbias.cpp (100%)
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_forward_fp16_masktype_2_no_attnbias.cu
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_forward_fp16_masktype_2_no_attnbias.hip
 rename xformers/csrc/attention/hip_fmha/{ => instances}/ck_fmha_grouped_forward_fp16_masktype_2_with_attnbias.cpp (100%)
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_forward_fp16_masktype_2_with_attnbias.cu
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_forward_fp16_masktype_2_with_attnbias.hip
 rename xformers/csrc/attention/hip_fmha/{ => instances}/ck_fmha_grouped_infer_bp16_masktype_0_no_attnbias.cpp (100%)
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_infer_bp16_masktype_0_no_attnbias.cu
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_infer_bp16_masktype_0_no_attnbias.hip
 rename xformers/csrc/attention/hip_fmha/{ => instances}/ck_fmha_grouped_infer_bp16_masktype_0_with_attnbias.cpp (100%)
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_infer_bp16_masktype_0_with_attnbias.cu
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_infer_bp16_masktype_0_with_attnbias.hip
 rename xformers/csrc/attention/hip_fmha/{ => instances}/ck_fmha_grouped_infer_bp16_masktype_1_no_attnbias.cpp (100%)
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_infer_bp16_masktype_1_no_attnbias.cu
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_infer_bp16_masktype_1_no_attnbias.hip
 rename xformers/csrc/attention/hip_fmha/{ => instances}/ck_fmha_grouped_infer_bp16_masktype_1_with_attnbias.cpp (100%)
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_infer_bp16_masktype_1_with_attnbias.cu
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_infer_bp16_masktype_1_with_attnbias.hip
 rename xformers/csrc/attention/hip_fmha/{ => instances}/ck_fmha_grouped_infer_bp16_masktype_2_no_attnbias.cpp (100%)
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_infer_bp16_masktype_2_no_attnbias.cu
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_infer_bp16_masktype_2_no_attnbias.hip
 rename xformers/csrc/attention/hip_fmha/{ => instances}/ck_fmha_grouped_infer_bp16_masktype_2_with_attnbias.cpp (100%)
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_infer_bp16_masktype_2_with_attnbias.cu
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_infer_bp16_masktype_2_with_attnbias.hip
 rename xformers/csrc/attention/hip_fmha/{ => instances}/ck_fmha_grouped_infer_fp16_masktype_0_no_attnbias.cpp (100%)
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_infer_fp16_masktype_0_no_attnbias.cu
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_infer_fp16_masktype_0_no_attnbias.hip
 rename xformers/csrc/attention/hip_fmha/{ => instances}/ck_fmha_grouped_infer_fp16_masktype_0_with_attnbias.cpp (100%)
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_infer_fp16_masktype_0_with_attnbias.cu
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_infer_fp16_masktype_0_with_attnbias.hip
 rename xformers/csrc/attention/hip_fmha/{ => instances}/ck_fmha_grouped_infer_fp16_masktype_1_no_attnbias.cpp (100%)
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_infer_fp16_masktype_1_no_attnbias.cu
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_infer_fp16_masktype_1_no_attnbias.hip
 rename xformers/csrc/attention/hip_fmha/{ => instances}/ck_fmha_grouped_infer_fp16_masktype_1_with_attnbias.cpp (100%)
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_infer_fp16_masktype_1_with_attnbias.cu
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_infer_fp16_masktype_1_with_attnbias.hip
 rename xformers/csrc/attention/hip_fmha/{ => instances}/ck_fmha_grouped_infer_fp16_masktype_2_no_attnbias.cpp (100%)
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_infer_fp16_masktype_2_no_attnbias.cu
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_infer_fp16_masktype_2_no_attnbias.hip
 rename xformers/csrc/attention/hip_fmha/{ => instances}/ck_fmha_grouped_infer_fp16_masktype_2_with_attnbias.cpp (100%)
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_infer_fp16_masktype_2_with_attnbias.cu
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_infer_fp16_masktype_2_with_attnbias.hip

diff --git a/setup.py b/setup.py
index 647e09620..01d86ee25 100644
--- a/setup.py
+++ b/setup.py
@@ -208,7 +208,7 @@ def get_extensions():
     source_cuda += glob.glob(os.path.join(extensions_dir, "attention", "cuda", "**", "*.cu"), recursive=True)
     source_cuda += glob.glob(os.path.join(extensions_dir, "indexing", "**", "*.cu"), recursive=True)
     source_cuda += glob.glob(os.path.join(extensions_dir, "swiglu", "**", "*.cu"), recursive=True)
-    source_hip = glob.glob(os.path.join(extensions_dir, "attention", "hip_fmha", "*.cpp"), recursive=True) 
+    source_hip = glob.glob(os.path.join(extensions_dir, "attention", "hip_fmha", "**", "*.cpp"), recursive=True) 
 
     sputnik_dir = os.path.join(this_dir, "third_party", "sputnik")
     cutlass_dir = os.path.join(this_dir, "third_party", "cutlass", "include")
@@ -293,7 +293,7 @@ def get_extensions():
         ]
     elif torch.cuda.is_available() and torch.version.hip: 
        rename_cpp_cu(source_hip)
-       source_hip_cu = glob.glob(os.path.join(extensions_dir, "attention", "hip_fmha", "*.cu"), recursive=True) 
+       source_hip_cu = glob.glob(os.path.join(extensions_dir, "attention", "hip_fmha", "**", "*.cu"), recursive=True) 
        extension = CUDAExtension
        sources += source_hip_cu
        include_dirs += [ Path(this_dir) / 'xformers' / 'csrc' / 'attention' / 'hip_fmha', 
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_backward_bp16_masktype_0_no_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_bp16_masktype_0_no_attnbias.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/ck_fmha_batched_backward_bp16_masktype_0_no_attnbias.cpp
rename to xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_bp16_masktype_0_no_attnbias.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_bp16_masktype_0_no_attnbias.cu b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_bp16_masktype_0_no_attnbias.cu
new file mode 100644
index 000000000..2bf962a9f
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_bp16_masktype_0_no_attnbias.cu
@@ -0,0 +1,14 @@
+#include <ck.hpp>
+#include "ck_fmha_batched_backward.h"
+
+template void run_batched_backward_masktype_attnbias_dispatched<
+    ck::bhalf_t,
+    0,
+    false,
+    true>(BatchedBackwardParams& param, hipStream_t stream);
+
+template void run_batched_backward_masktype_attnbias_dispatched<
+    ck::bhalf_t,
+    0,
+    false,
+    false>(BatchedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_bp16_masktype_0_no_attnbias.hip b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_bp16_masktype_0_no_attnbias.hip
new file mode 100644
index 000000000..c893e70b5
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_bp16_masktype_0_no_attnbias.hip
@@ -0,0 +1,15 @@
+// !!! This is a file automatically generated by hipify!!!
+#include <ck.hpp>
+#include "ck_fmha_batched_backward_hip.h"
+
+template void run_batched_backward_masktype_attnbias_dispatched<
+    ck::bhalf_t,
+    0,
+    false,
+    true>(BatchedBackwardParams& param, hipStream_t stream);
+
+template void run_batched_backward_masktype_attnbias_dispatched<
+    ck::bhalf_t,
+    0,
+    false,
+    false>(BatchedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_backward_bp16_masktype_0_with_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_bp16_masktype_0_with_attnbias.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/ck_fmha_batched_backward_bp16_masktype_0_with_attnbias.cpp
rename to xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_bp16_masktype_0_with_attnbias.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_bp16_masktype_0_with_attnbias.cu b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_bp16_masktype_0_with_attnbias.cu
new file mode 100644
index 000000000..b3c5bbf70
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_bp16_masktype_0_with_attnbias.cu
@@ -0,0 +1,14 @@
+#include <ck.hpp>
+#include "ck_fmha_batched_backward.h"
+
+template void run_batched_backward_masktype_attnbias_dispatched<
+    ck::bhalf_t,
+    0,
+    true,
+    true>(BatchedBackwardParams& param, hipStream_t stream);
+
+template void run_batched_backward_masktype_attnbias_dispatched<
+    ck::bhalf_t,
+    0,
+    true,
+    false>(BatchedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_bp16_masktype_0_with_attnbias.hip b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_bp16_masktype_0_with_attnbias.hip
new file mode 100644
index 000000000..a8b22c95d
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_bp16_masktype_0_with_attnbias.hip
@@ -0,0 +1,15 @@
+// !!! This is a file automatically generated by hipify!!!
+#include <ck.hpp>
+#include "ck_fmha_batched_backward_hip.h"
+
+template void run_batched_backward_masktype_attnbias_dispatched<
+    ck::bhalf_t,
+    0,
+    true,
+    true>(BatchedBackwardParams& param, hipStream_t stream);
+
+template void run_batched_backward_masktype_attnbias_dispatched<
+    ck::bhalf_t,
+    0,
+    true,
+    false>(BatchedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_backward_bp16_masktype_1_no_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_bp16_masktype_1_no_attnbias.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/ck_fmha_batched_backward_bp16_masktype_1_no_attnbias.cpp
rename to xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_bp16_masktype_1_no_attnbias.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_bp16_masktype_1_no_attnbias.cu b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_bp16_masktype_1_no_attnbias.cu
new file mode 100644
index 000000000..4a96b4a3d
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_bp16_masktype_1_no_attnbias.cu
@@ -0,0 +1,14 @@
+#include <ck.hpp>
+#include "ck_fmha_batched_backward.h"
+
+template void run_batched_backward_masktype_attnbias_dispatched<
+    ck::bhalf_t,
+    1,
+    false,
+    true>(BatchedBackwardParams& param, hipStream_t stream);
+
+template void run_batched_backward_masktype_attnbias_dispatched<
+    ck::bhalf_t,
+    1,
+    false,
+    false>(BatchedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_bp16_masktype_1_no_attnbias.hip b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_bp16_masktype_1_no_attnbias.hip
new file mode 100644
index 000000000..1301eb069
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_bp16_masktype_1_no_attnbias.hip
@@ -0,0 +1,15 @@
+// !!! This is a file automatically generated by hipify!!!
+#include <ck.hpp>
+#include "ck_fmha_batched_backward_hip.h"
+
+template void run_batched_backward_masktype_attnbias_dispatched<
+    ck::bhalf_t,
+    1,
+    false,
+    true>(BatchedBackwardParams& param, hipStream_t stream);
+
+template void run_batched_backward_masktype_attnbias_dispatched<
+    ck::bhalf_t,
+    1,
+    false,
+    false>(BatchedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_backward_bp16_masktype_1_with_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_bp16_masktype_1_with_attnbias.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/ck_fmha_batched_backward_bp16_masktype_1_with_attnbias.cpp
rename to xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_bp16_masktype_1_with_attnbias.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_bp16_masktype_1_with_attnbias.cu b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_bp16_masktype_1_with_attnbias.cu
new file mode 100644
index 000000000..37ec0f03c
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_bp16_masktype_1_with_attnbias.cu
@@ -0,0 +1,14 @@
+#include <ck.hpp>
+#include "ck_fmha_batched_backward.h"
+
+template void run_batched_backward_masktype_attnbias_dispatched<
+    ck::bhalf_t,
+    1,
+    true,
+    true>(BatchedBackwardParams& param, hipStream_t stream);
+
+template void run_batched_backward_masktype_attnbias_dispatched<
+    ck::bhalf_t,
+    1,
+    true,
+    false>(BatchedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_bp16_masktype_1_with_attnbias.hip b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_bp16_masktype_1_with_attnbias.hip
new file mode 100644
index 000000000..6dda0e1b7
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_bp16_masktype_1_with_attnbias.hip
@@ -0,0 +1,15 @@
+// !!! This is a file automatically generated by hipify!!!
+#include <ck.hpp>
+#include "ck_fmha_batched_backward_hip.h"
+
+template void run_batched_backward_masktype_attnbias_dispatched<
+    ck::bhalf_t,
+    1,
+    true,
+    true>(BatchedBackwardParams& param, hipStream_t stream);
+
+template void run_batched_backward_masktype_attnbias_dispatched<
+    ck::bhalf_t,
+    1,
+    true,
+    false>(BatchedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_backward_bp16_masktype_2_no_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_bp16_masktype_2_no_attnbias.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/ck_fmha_batched_backward_bp16_masktype_2_no_attnbias.cpp
rename to xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_bp16_masktype_2_no_attnbias.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_bp16_masktype_2_no_attnbias.cu b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_bp16_masktype_2_no_attnbias.cu
new file mode 100644
index 000000000..c80a47952
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_bp16_masktype_2_no_attnbias.cu
@@ -0,0 +1,14 @@
+#include <ck.hpp>
+#include "ck_fmha_batched_backward.h"
+
+template void run_batched_backward_masktype_attnbias_dispatched<
+    ck::bhalf_t,
+    2,
+    false,
+    true>(BatchedBackwardParams& param, hipStream_t stream);
+
+template void run_batched_backward_masktype_attnbias_dispatched<
+    ck::bhalf_t,
+    2,
+    false,
+    false>(BatchedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_bp16_masktype_2_no_attnbias.hip b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_bp16_masktype_2_no_attnbias.hip
new file mode 100644
index 000000000..3dda04d56
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_bp16_masktype_2_no_attnbias.hip
@@ -0,0 +1,15 @@
+// !!! This is a file automatically generated by hipify!!!
+#include <ck.hpp>
+#include "ck_fmha_batched_backward_hip.h"
+
+template void run_batched_backward_masktype_attnbias_dispatched<
+    ck::bhalf_t,
+    2,
+    false,
+    true>(BatchedBackwardParams& param, hipStream_t stream);
+
+template void run_batched_backward_masktype_attnbias_dispatched<
+    ck::bhalf_t,
+    2,
+    false,
+    false>(BatchedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_backward_bp16_masktype_2_with_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_bp16_masktype_2_with_attnbias.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/ck_fmha_batched_backward_bp16_masktype_2_with_attnbias.cpp
rename to xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_bp16_masktype_2_with_attnbias.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_bp16_masktype_2_with_attnbias.cu b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_bp16_masktype_2_with_attnbias.cu
new file mode 100644
index 000000000..c1dc61c5a
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_bp16_masktype_2_with_attnbias.cu
@@ -0,0 +1,14 @@
+#include <ck.hpp>
+#include "ck_fmha_batched_backward.h"
+
+template void run_batched_backward_masktype_attnbias_dispatched<
+    ck::bhalf_t,
+    2,
+    true,
+    true>(BatchedBackwardParams& param, hipStream_t stream);
+
+template void run_batched_backward_masktype_attnbias_dispatched<
+    ck::bhalf_t,
+    2,
+    true,
+    false>(BatchedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_bp16_masktype_2_with_attnbias.hip b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_bp16_masktype_2_with_attnbias.hip
new file mode 100644
index 000000000..884503c01
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_bp16_masktype_2_with_attnbias.hip
@@ -0,0 +1,15 @@
+// !!! This is a file automatically generated by hipify!!!
+#include <ck.hpp>
+#include "ck_fmha_batched_backward_hip.h"
+
+template void run_batched_backward_masktype_attnbias_dispatched<
+    ck::bhalf_t,
+    2,
+    true,
+    true>(BatchedBackwardParams& param, hipStream_t stream);
+
+template void run_batched_backward_masktype_attnbias_dispatched<
+    ck::bhalf_t,
+    2,
+    true,
+    false>(BatchedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_backward_fp16_masktype_0_no_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_fp16_masktype_0_no_attnbias.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/ck_fmha_batched_backward_fp16_masktype_0_no_attnbias.cpp
rename to xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_fp16_masktype_0_no_attnbias.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_fp16_masktype_0_no_attnbias.cu b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_fp16_masktype_0_no_attnbias.cu
new file mode 100644
index 000000000..46caaa20d
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_fp16_masktype_0_no_attnbias.cu
@@ -0,0 +1,14 @@
+#include <ck.hpp>
+#include "ck_fmha_batched_backward.h"
+
+template void run_batched_backward_masktype_attnbias_dispatched<
+    ck::half_t,
+    0,
+    false,
+    true>(BatchedBackwardParams& param, hipStream_t stream);
+
+template void run_batched_backward_masktype_attnbias_dispatched<
+    ck::half_t,
+    0,
+    false,
+    false>(BatchedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_fp16_masktype_0_no_attnbias.hip b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_fp16_masktype_0_no_attnbias.hip
new file mode 100644
index 000000000..43c7ff74d
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_fp16_masktype_0_no_attnbias.hip
@@ -0,0 +1,15 @@
+// !!! This is a file automatically generated by hipify!!!
+#include <ck.hpp>
+#include "ck_fmha_batched_backward_hip.h"
+
+template void run_batched_backward_masktype_attnbias_dispatched<
+    ck::half_t,
+    0,
+    false,
+    true>(BatchedBackwardParams& param, hipStream_t stream);
+
+template void run_batched_backward_masktype_attnbias_dispatched<
+    ck::half_t,
+    0,
+    false,
+    false>(BatchedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_backward_fp16_masktype_0_with_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_fp16_masktype_0_with_attnbias.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/ck_fmha_batched_backward_fp16_masktype_0_with_attnbias.cpp
rename to xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_fp16_masktype_0_with_attnbias.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_fp16_masktype_0_with_attnbias.cu b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_fp16_masktype_0_with_attnbias.cu
new file mode 100644
index 000000000..c328beb8d
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_fp16_masktype_0_with_attnbias.cu
@@ -0,0 +1,14 @@
+#include <ck.hpp>
+#include "ck_fmha_batched_backward.h"
+
+template void run_batched_backward_masktype_attnbias_dispatched<
+    ck::half_t,
+    0,
+    true,
+    true>(BatchedBackwardParams& param, hipStream_t stream);
+
+template void run_batched_backward_masktype_attnbias_dispatched<
+    ck::half_t,
+    0,
+    true,
+    false>(BatchedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_fp16_masktype_0_with_attnbias.hip b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_fp16_masktype_0_with_attnbias.hip
new file mode 100644
index 000000000..f66299704
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_fp16_masktype_0_with_attnbias.hip
@@ -0,0 +1,15 @@
+// !!! This is a file automatically generated by hipify!!!
+#include <ck.hpp>
+#include "ck_fmha_batched_backward_hip.h"
+
+template void run_batched_backward_masktype_attnbias_dispatched<
+    ck::half_t,
+    0,
+    true,
+    true>(BatchedBackwardParams& param, hipStream_t stream);
+
+template void run_batched_backward_masktype_attnbias_dispatched<
+    ck::half_t,
+    0,
+    true,
+    false>(BatchedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_backward_fp16_masktype_1_no_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_fp16_masktype_1_no_attnbias.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/ck_fmha_batched_backward_fp16_masktype_1_no_attnbias.cpp
rename to xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_fp16_masktype_1_no_attnbias.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_fp16_masktype_1_no_attnbias.cu b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_fp16_masktype_1_no_attnbias.cu
new file mode 100644
index 000000000..2897cba5d
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_fp16_masktype_1_no_attnbias.cu
@@ -0,0 +1,14 @@
+#include <ck.hpp>
+#include "ck_fmha_batched_backward.h"
+
+template void run_batched_backward_masktype_attnbias_dispatched<
+    ck::half_t,
+    1,
+    false,
+    true>(BatchedBackwardParams& param, hipStream_t stream);
+
+template void run_batched_backward_masktype_attnbias_dispatched<
+    ck::half_t,
+    1,
+    false,
+    false>(BatchedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_fp16_masktype_1_no_attnbias.hip b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_fp16_masktype_1_no_attnbias.hip
new file mode 100644
index 000000000..1c44a9b84
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_fp16_masktype_1_no_attnbias.hip
@@ -0,0 +1,15 @@
+// !!! This is a file automatically generated by hipify!!!
+#include <ck.hpp>
+#include "ck_fmha_batched_backward_hip.h"
+
+template void run_batched_backward_masktype_attnbias_dispatched<
+    ck::half_t,
+    1,
+    false,
+    true>(BatchedBackwardParams& param, hipStream_t stream);
+
+template void run_batched_backward_masktype_attnbias_dispatched<
+    ck::half_t,
+    1,
+    false,
+    false>(BatchedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_backward_fp16_masktype_1_with_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_fp16_masktype_1_with_attnbias.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/ck_fmha_batched_backward_fp16_masktype_1_with_attnbias.cpp
rename to xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_fp16_masktype_1_with_attnbias.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_fp16_masktype_1_with_attnbias.cu b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_fp16_masktype_1_with_attnbias.cu
new file mode 100644
index 000000000..62b82e22a
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_fp16_masktype_1_with_attnbias.cu
@@ -0,0 +1,16 @@
+#include <ck.hpp>
+#include <stdexcept>
+
+#include "ck_fmha_batched_backward.h"
+
+template void run_batched_backward_masktype_attnbias_dispatched<
+    ck::half_t,
+    1,
+    true,
+    true>(BatchedBackwardParams& param, hipStream_t stream);
+
+template void run_batched_backward_masktype_attnbias_dispatched<
+    ck::half_t,
+    1,
+    true,
+    false>(BatchedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_fp16_masktype_1_with_attnbias.hip b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_fp16_masktype_1_with_attnbias.hip
new file mode 100644
index 000000000..5a81dfaf7
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_fp16_masktype_1_with_attnbias.hip
@@ -0,0 +1,17 @@
+// !!! This is a file automatically generated by hipify!!!
+#include <ck.hpp>
+#include <stdexcept>
+
+#include "ck_fmha_batched_backward_hip.h"
+
+template void run_batched_backward_masktype_attnbias_dispatched<
+    ck::half_t,
+    1,
+    true,
+    true>(BatchedBackwardParams& param, hipStream_t stream);
+
+template void run_batched_backward_masktype_attnbias_dispatched<
+    ck::half_t,
+    1,
+    true,
+    false>(BatchedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_backward_fp16_masktype_2_no_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_fp16_masktype_2_no_attnbias.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/ck_fmha_batched_backward_fp16_masktype_2_no_attnbias.cpp
rename to xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_fp16_masktype_2_no_attnbias.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_fp16_masktype_2_no_attnbias.cu b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_fp16_masktype_2_no_attnbias.cu
new file mode 100644
index 000000000..1ea6309d6
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_fp16_masktype_2_no_attnbias.cu
@@ -0,0 +1,14 @@
+#include <ck.hpp>
+#include "ck_fmha_batched_backward.h"
+
+template void run_batched_backward_masktype_attnbias_dispatched<
+    ck::half_t,
+    2,
+    false,
+    true>(BatchedBackwardParams& param, hipStream_t stream);
+
+template void run_batched_backward_masktype_attnbias_dispatched<
+    ck::half_t,
+    2,
+    false,
+    false>(BatchedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_fp16_masktype_2_no_attnbias.hip b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_fp16_masktype_2_no_attnbias.hip
new file mode 100644
index 000000000..f1ee519f9
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_fp16_masktype_2_no_attnbias.hip
@@ -0,0 +1,15 @@
+// !!! This is a file automatically generated by hipify!!!
+#include <ck.hpp>
+#include "ck_fmha_batched_backward_hip.h"
+
+template void run_batched_backward_masktype_attnbias_dispatched<
+    ck::half_t,
+    2,
+    false,
+    true>(BatchedBackwardParams& param, hipStream_t stream);
+
+template void run_batched_backward_masktype_attnbias_dispatched<
+    ck::half_t,
+    2,
+    false,
+    false>(BatchedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_backward_fp16_masktype_2_with_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_fp16_masktype_2_with_attnbias.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/ck_fmha_batched_backward_fp16_masktype_2_with_attnbias.cpp
rename to xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_fp16_masktype_2_with_attnbias.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_fp16_masktype_2_with_attnbias.cu b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_fp16_masktype_2_with_attnbias.cu
new file mode 100644
index 000000000..24f2ce4b2
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_fp16_masktype_2_with_attnbias.cu
@@ -0,0 +1,14 @@
+#include <ck.hpp>
+#include "ck_fmha_batched_backward.h"
+
+template void run_batched_backward_masktype_attnbias_dispatched<
+    ck::half_t,
+    2,
+    true,
+    true>(BatchedBackwardParams& param, hipStream_t stream);
+
+template void run_batched_backward_masktype_attnbias_dispatched<
+    ck::half_t,
+    2,
+    true,
+    false>(BatchedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_fp16_masktype_2_with_attnbias.hip b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_fp16_masktype_2_with_attnbias.hip
new file mode 100644
index 000000000..a3c6fd4fe
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_fp16_masktype_2_with_attnbias.hip
@@ -0,0 +1,15 @@
+// !!! This is a file automatically generated by hipify!!!
+#include <ck.hpp>
+#include "ck_fmha_batched_backward_hip.h"
+
+template void run_batched_backward_masktype_attnbias_dispatched<
+    ck::half_t,
+    2,
+    true,
+    true>(BatchedBackwardParams& param, hipStream_t stream);
+
+template void run_batched_backward_masktype_attnbias_dispatched<
+    ck::half_t,
+    2,
+    true,
+    false>(BatchedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_forward_bp16_masktype_0_no_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_forward_bp16_masktype_0_no_attnbias.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/ck_fmha_batched_forward_bp16_masktype_0_no_attnbias.cpp
rename to xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_forward_bp16_masktype_0_no_attnbias.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_forward_bp16_masktype_0_no_attnbias.cu b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_forward_bp16_masktype_0_no_attnbias.cu
new file mode 100644
index 000000000..140cffce0
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_forward_bp16_masktype_0_no_attnbias.cu
@@ -0,0 +1,7 @@
+#include <ck/ck.hpp>
+#include "ck_fmha_batched_forward.h"
+
+template void run_batched_forward_masktype_attnbias_dispatched<
+    ck::bhalf_t,
+    0,
+    false>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_forward_bp16_masktype_0_no_attnbias.hip b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_forward_bp16_masktype_0_no_attnbias.hip
new file mode 100644
index 000000000..eaa1cd077
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_forward_bp16_masktype_0_no_attnbias.hip
@@ -0,0 +1,8 @@
+// !!! This is a file automatically generated by hipify!!!
+#include <ck/ck.hpp>
+#include "ck_fmha_batched_forward_hip.h"
+
+template void run_batched_forward_masktype_attnbias_dispatched<
+    ck::bhalf_t,
+    0,
+    false>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_forward_bp16_masktype_0_with_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_forward_bp16_masktype_0_with_attnbias.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/ck_fmha_batched_forward_bp16_masktype_0_with_attnbias.cpp
rename to xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_forward_bp16_masktype_0_with_attnbias.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_forward_bp16_masktype_0_with_attnbias.cu b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_forward_bp16_masktype_0_with_attnbias.cu
new file mode 100644
index 000000000..bb32b63ef
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_forward_bp16_masktype_0_with_attnbias.cu
@@ -0,0 +1,7 @@
+#include <ck/ck.hpp>
+#include "ck_fmha_batched_forward.h"
+
+template void run_batched_forward_masktype_attnbias_dispatched<
+    ck::bhalf_t,
+    0,
+    true>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_forward_bp16_masktype_0_with_attnbias.hip b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_forward_bp16_masktype_0_with_attnbias.hip
new file mode 100644
index 000000000..baf0d8a2a
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_forward_bp16_masktype_0_with_attnbias.hip
@@ -0,0 +1,8 @@
+// !!! This is a file automatically generated by hipify!!!
+#include <ck/ck.hpp>
+#include "ck_fmha_batched_forward_hip.h"
+
+template void run_batched_forward_masktype_attnbias_dispatched<
+    ck::bhalf_t,
+    0,
+    true>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_forward_bp16_masktype_1_no_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_forward_bp16_masktype_1_no_attnbias.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/ck_fmha_batched_forward_bp16_masktype_1_no_attnbias.cpp
rename to xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_forward_bp16_masktype_1_no_attnbias.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_forward_bp16_masktype_1_no_attnbias.cu b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_forward_bp16_masktype_1_no_attnbias.cu
new file mode 100644
index 000000000..6ba23b3a2
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_forward_bp16_masktype_1_no_attnbias.cu
@@ -0,0 +1,7 @@
+#include <ck/ck.hpp>
+#include "ck_fmha_batched_forward.h"
+
+template void run_batched_forward_masktype_attnbias_dispatched<
+    ck::bhalf_t,
+    1,
+    false>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_forward_bp16_masktype_1_no_attnbias.hip b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_forward_bp16_masktype_1_no_attnbias.hip
new file mode 100644
index 000000000..3e925436b
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_forward_bp16_masktype_1_no_attnbias.hip
@@ -0,0 +1,8 @@
+// !!! This is a file automatically generated by hipify!!!
+#include <ck/ck.hpp>
+#include "ck_fmha_batched_forward_hip.h"
+
+template void run_batched_forward_masktype_attnbias_dispatched<
+    ck::bhalf_t,
+    1,
+    false>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_forward_bp16_masktype_1_with_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_forward_bp16_masktype_1_with_attnbias.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/ck_fmha_batched_forward_bp16_masktype_1_with_attnbias.cpp
rename to xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_forward_bp16_masktype_1_with_attnbias.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_forward_bp16_masktype_1_with_attnbias.cu b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_forward_bp16_masktype_1_with_attnbias.cu
new file mode 100644
index 000000000..400df0b3d
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_forward_bp16_masktype_1_with_attnbias.cu
@@ -0,0 +1,7 @@
+#include <ck/ck.hpp>
+#include "ck_fmha_batched_forward.h"
+
+template void run_batched_forward_masktype_attnbias_dispatched<
+    ck::bhalf_t,
+    1,
+    true>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_forward_bp16_masktype_1_with_attnbias.hip b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_forward_bp16_masktype_1_with_attnbias.hip
new file mode 100644
index 000000000..5d597449a
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_forward_bp16_masktype_1_with_attnbias.hip
@@ -0,0 +1,8 @@
+// !!! This is a file automatically generated by hipify!!!
+#include <ck/ck.hpp>
+#include "ck_fmha_batched_forward_hip.h"
+
+template void run_batched_forward_masktype_attnbias_dispatched<
+    ck::bhalf_t,
+    1,
+    true>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_forward_bp16_masktype_2_no_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_forward_bp16_masktype_2_no_attnbias.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/ck_fmha_batched_forward_bp16_masktype_2_no_attnbias.cpp
rename to xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_forward_bp16_masktype_2_no_attnbias.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_forward_bp16_masktype_2_no_attnbias.cu b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_forward_bp16_masktype_2_no_attnbias.cu
new file mode 100644
index 000000000..a99486148
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_forward_bp16_masktype_2_no_attnbias.cu
@@ -0,0 +1,7 @@
+#include <ck/ck.hpp>
+#include "ck_fmha_batched_forward.h"
+
+template void run_batched_forward_masktype_attnbias_dispatched<
+    ck::bhalf_t,
+    2,
+    false>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_forward_bp16_masktype_2_no_attnbias.hip b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_forward_bp16_masktype_2_no_attnbias.hip
new file mode 100644
index 000000000..e0c5a0440
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_forward_bp16_masktype_2_no_attnbias.hip
@@ -0,0 +1,8 @@
+// !!! This is a file automatically generated by hipify!!!
+#include <ck/ck.hpp>
+#include "ck_fmha_batched_forward_hip.h"
+
+template void run_batched_forward_masktype_attnbias_dispatched<
+    ck::bhalf_t,
+    2,
+    false>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_forward_bp16_masktype_2_with_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_forward_bp16_masktype_2_with_attnbias.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/ck_fmha_batched_forward_bp16_masktype_2_with_attnbias.cpp
rename to xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_forward_bp16_masktype_2_with_attnbias.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_forward_bp16_masktype_2_with_attnbias.cu b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_forward_bp16_masktype_2_with_attnbias.cu
new file mode 100644
index 000000000..23305b07a
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_forward_bp16_masktype_2_with_attnbias.cu
@@ -0,0 +1,7 @@
+#include <ck/ck.hpp>
+#include "ck_fmha_batched_forward.h"
+
+template void run_batched_forward_masktype_attnbias_dispatched<
+    ck::bhalf_t,
+    2,
+    true>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_forward_bp16_masktype_2_with_attnbias.hip b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_forward_bp16_masktype_2_with_attnbias.hip
new file mode 100644
index 000000000..6a6e7ce9a
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_forward_bp16_masktype_2_with_attnbias.hip
@@ -0,0 +1,8 @@
+// !!! This is a file automatically generated by hipify!!!
+#include <ck/ck.hpp>
+#include "ck_fmha_batched_forward_hip.h"
+
+template void run_batched_forward_masktype_attnbias_dispatched<
+    ck::bhalf_t,
+    2,
+    true>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_forward_fp16_masktype_0_no_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_forward_fp16_masktype_0_no_attnbias.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/ck_fmha_batched_forward_fp16_masktype_0_no_attnbias.cpp
rename to xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_forward_fp16_masktype_0_no_attnbias.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_forward_fp16_masktype_0_no_attnbias.cu b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_forward_fp16_masktype_0_no_attnbias.cu
new file mode 100644
index 000000000..a9dd771de
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_forward_fp16_masktype_0_no_attnbias.cu
@@ -0,0 +1,7 @@
+#include <ck/ck.hpp>
+#include "ck_fmha_batched_forward.h"
+
+template void run_batched_forward_masktype_attnbias_dispatched<
+    ck::half_t,
+    0,
+    false>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_forward_fp16_masktype_0_no_attnbias.hip b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_forward_fp16_masktype_0_no_attnbias.hip
new file mode 100644
index 000000000..c7c05a095
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_forward_fp16_masktype_0_no_attnbias.hip
@@ -0,0 +1,8 @@
+// !!! This is a file automatically generated by hipify!!!
+#include <ck/ck.hpp>
+#include "ck_fmha_batched_forward_hip.h"
+
+template void run_batched_forward_masktype_attnbias_dispatched<
+    ck::half_t,
+    0,
+    false>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_forward_fp16_masktype_0_with_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_forward_fp16_masktype_0_with_attnbias.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/ck_fmha_batched_forward_fp16_masktype_0_with_attnbias.cpp
rename to xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_forward_fp16_masktype_0_with_attnbias.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_forward_fp16_masktype_0_with_attnbias.cu b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_forward_fp16_masktype_0_with_attnbias.cu
new file mode 100644
index 000000000..f653451ab
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_forward_fp16_masktype_0_with_attnbias.cu
@@ -0,0 +1,7 @@
+#include <ck/ck.hpp>
+#include "ck_fmha_batched_forward.h"
+
+template void run_batched_forward_masktype_attnbias_dispatched<
+    ck::half_t,
+    0,
+    true>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_forward_fp16_masktype_0_with_attnbias.hip b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_forward_fp16_masktype_0_with_attnbias.hip
new file mode 100644
index 000000000..eded87fe6
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_forward_fp16_masktype_0_with_attnbias.hip
@@ -0,0 +1,8 @@
+// !!! This is a file automatically generated by hipify!!!
+#include <ck/ck.hpp>
+#include "ck_fmha_batched_forward_hip.h"
+
+template void run_batched_forward_masktype_attnbias_dispatched<
+    ck::half_t,
+    0,
+    true>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_forward_fp16_masktype_1_no_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_forward_fp16_masktype_1_no_attnbias.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/ck_fmha_batched_forward_fp16_masktype_1_no_attnbias.cpp
rename to xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_forward_fp16_masktype_1_no_attnbias.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_forward_fp16_masktype_1_no_attnbias.cu b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_forward_fp16_masktype_1_no_attnbias.cu
new file mode 100644
index 000000000..5ca4b7dda
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_forward_fp16_masktype_1_no_attnbias.cu
@@ -0,0 +1,7 @@
+#include <ck/ck.hpp>
+#include "ck_fmha_batched_forward.h"
+
+template void run_batched_forward_masktype_attnbias_dispatched<
+    ck::half_t,
+    1,
+    false>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_forward_fp16_masktype_1_no_attnbias.hip b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_forward_fp16_masktype_1_no_attnbias.hip
new file mode 100644
index 000000000..f63d16f63
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_forward_fp16_masktype_1_no_attnbias.hip
@@ -0,0 +1,8 @@
+// !!! This is a file automatically generated by hipify!!!
+#include <ck/ck.hpp>
+#include "ck_fmha_batched_forward_hip.h"
+
+template void run_batched_forward_masktype_attnbias_dispatched<
+    ck::half_t,
+    1,
+    false>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_forward_fp16_masktype_1_with_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_forward_fp16_masktype_1_with_attnbias.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/ck_fmha_batched_forward_fp16_masktype_1_with_attnbias.cpp
rename to xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_forward_fp16_masktype_1_with_attnbias.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_forward_fp16_masktype_1_with_attnbias.cu b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_forward_fp16_masktype_1_with_attnbias.cu
new file mode 100644
index 000000000..f9af4528d
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_forward_fp16_masktype_1_with_attnbias.cu
@@ -0,0 +1,7 @@
+#include <ck/ck.hpp>
+#include "ck_fmha_batched_forward.h"
+
+template void run_batched_forward_masktype_attnbias_dispatched<
+    ck::half_t,
+    1,
+    true>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_forward_fp16_masktype_1_with_attnbias.hip b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_forward_fp16_masktype_1_with_attnbias.hip
new file mode 100644
index 000000000..3eafb95c7
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_forward_fp16_masktype_1_with_attnbias.hip
@@ -0,0 +1,8 @@
+// !!! This is a file automatically generated by hipify!!!
+#include <ck/ck.hpp>
+#include "ck_fmha_batched_forward_hip.h"
+
+template void run_batched_forward_masktype_attnbias_dispatched<
+    ck::half_t,
+    1,
+    true>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_forward_fp16_masktype_2_no_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_forward_fp16_masktype_2_no_attnbias.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/ck_fmha_batched_forward_fp16_masktype_2_no_attnbias.cpp
rename to xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_forward_fp16_masktype_2_no_attnbias.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_forward_fp16_masktype_2_no_attnbias.cu b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_forward_fp16_masktype_2_no_attnbias.cu
new file mode 100644
index 000000000..44e98d9a3
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_forward_fp16_masktype_2_no_attnbias.cu
@@ -0,0 +1,7 @@
+#include <ck/ck.hpp>
+#include "ck_fmha_batched_forward.h"
+
+template void run_batched_forward_masktype_attnbias_dispatched<
+    ck::half_t,
+    2,
+    false>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_forward_fp16_masktype_2_no_attnbias.hip b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_forward_fp16_masktype_2_no_attnbias.hip
new file mode 100644
index 000000000..a85e2fb9a
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_forward_fp16_masktype_2_no_attnbias.hip
@@ -0,0 +1,8 @@
+// !!! This is a file automatically generated by hipify!!!
+#include <ck/ck.hpp>
+#include "ck_fmha_batched_forward_hip.h"
+
+template void run_batched_forward_masktype_attnbias_dispatched<
+    ck::half_t,
+    2,
+    false>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_forward_fp16_masktype_2_with_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_forward_fp16_masktype_2_with_attnbias.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/ck_fmha_batched_forward_fp16_masktype_2_with_attnbias.cpp
rename to xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_forward_fp16_masktype_2_with_attnbias.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_forward_fp16_masktype_2_with_attnbias.cu b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_forward_fp16_masktype_2_with_attnbias.cu
new file mode 100644
index 000000000..8dfc288f8
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_forward_fp16_masktype_2_with_attnbias.cu
@@ -0,0 +1,7 @@
+#include <ck/ck.hpp>
+#include "ck_fmha_batched_forward.h"
+
+template void run_batched_forward_masktype_attnbias_dispatched<
+    ck::half_t,
+    2,
+    true>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_forward_fp16_masktype_2_with_attnbias.hip b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_forward_fp16_masktype_2_with_attnbias.hip
new file mode 100644
index 000000000..a0bcb1f8e
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_forward_fp16_masktype_2_with_attnbias.hip
@@ -0,0 +1,8 @@
+// !!! This is a file automatically generated by hipify!!!
+#include <ck/ck.hpp>
+#include "ck_fmha_batched_forward_hip.h"
+
+template void run_batched_forward_masktype_attnbias_dispatched<
+    ck::half_t,
+    2,
+    true>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_infer_bp16_masktype_0_no_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_infer_bp16_masktype_0_no_attnbias.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/ck_fmha_batched_infer_bp16_masktype_0_no_attnbias.cpp
rename to xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_infer_bp16_masktype_0_no_attnbias.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_infer_bp16_masktype_0_no_attnbias.cu b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_infer_bp16_masktype_0_no_attnbias.cu
new file mode 100644
index 000000000..9748955e1
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_infer_bp16_masktype_0_no_attnbias.cu
@@ -0,0 +1,8 @@
+#include <ck/ck.hpp>
+
+#include "ck_fmha_batched_infer.h"
+
+template void run_batched_infer_masktype_attnbias_dispatched<
+    ck::bhalf_t,
+    0,
+    false>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_infer_bp16_masktype_0_no_attnbias.hip b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_infer_bp16_masktype_0_no_attnbias.hip
new file mode 100644
index 000000000..84bf207fa
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_infer_bp16_masktype_0_no_attnbias.hip
@@ -0,0 +1,9 @@
+// !!! This is a file automatically generated by hipify!!!
+#include <ck/ck.hpp>
+
+#include "ck_fmha_batched_infer_hip.h"
+
+template void run_batched_infer_masktype_attnbias_dispatched<
+    ck::bhalf_t,
+    0,
+    false>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_infer_bp16_masktype_0_with_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_infer_bp16_masktype_0_with_attnbias.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/ck_fmha_batched_infer_bp16_masktype_0_with_attnbias.cpp
rename to xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_infer_bp16_masktype_0_with_attnbias.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_infer_bp16_masktype_0_with_attnbias.cu b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_infer_bp16_masktype_0_with_attnbias.cu
new file mode 100644
index 000000000..418f925c2
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_infer_bp16_masktype_0_with_attnbias.cu
@@ -0,0 +1,8 @@
+#include <ck/ck.hpp>
+
+#include "ck_fmha_batched_infer.h"
+
+template void run_batched_infer_masktype_attnbias_dispatched<
+    ck::bhalf_t,
+    0,
+    true>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_infer_bp16_masktype_0_with_attnbias.hip b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_infer_bp16_masktype_0_with_attnbias.hip
new file mode 100644
index 000000000..bb56f5423
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_infer_bp16_masktype_0_with_attnbias.hip
@@ -0,0 +1,9 @@
+// !!! This is a file automatically generated by hipify!!!
+#include <ck/ck.hpp>
+
+#include "ck_fmha_batched_infer_hip.h"
+
+template void run_batched_infer_masktype_attnbias_dispatched<
+    ck::bhalf_t,
+    0,
+    true>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_infer_bp16_masktype_1_no_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_infer_bp16_masktype_1_no_attnbias.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/ck_fmha_batched_infer_bp16_masktype_1_no_attnbias.cpp
rename to xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_infer_bp16_masktype_1_no_attnbias.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_infer_bp16_masktype_1_no_attnbias.cu b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_infer_bp16_masktype_1_no_attnbias.cu
new file mode 100644
index 000000000..a7cdb48b8
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_infer_bp16_masktype_1_no_attnbias.cu
@@ -0,0 +1,8 @@
+#include <ck/ck.hpp>
+
+#include "ck_fmha_batched_infer.h"
+
+template void run_batched_infer_masktype_attnbias_dispatched<
+    ck::bhalf_t,
+    1,
+    false>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_infer_bp16_masktype_1_no_attnbias.hip b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_infer_bp16_masktype_1_no_attnbias.hip
new file mode 100644
index 000000000..2286068d5
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_infer_bp16_masktype_1_no_attnbias.hip
@@ -0,0 +1,9 @@
+// !!! This is a file automatically generated by hipify!!!
+#include <ck/ck.hpp>
+
+#include "ck_fmha_batched_infer_hip.h"
+
+template void run_batched_infer_masktype_attnbias_dispatched<
+    ck::bhalf_t,
+    1,
+    false>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_infer_bp16_masktype_1_with_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_infer_bp16_masktype_1_with_attnbias.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/ck_fmha_batched_infer_bp16_masktype_1_with_attnbias.cpp
rename to xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_infer_bp16_masktype_1_with_attnbias.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_infer_bp16_masktype_1_with_attnbias.cu b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_infer_bp16_masktype_1_with_attnbias.cu
new file mode 100644
index 000000000..578855b9b
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_infer_bp16_masktype_1_with_attnbias.cu
@@ -0,0 +1,8 @@
+#include <ck/ck.hpp>
+
+#include "ck_fmha_batched_infer.h"
+
+template void run_batched_infer_masktype_attnbias_dispatched<
+    ck::bhalf_t,
+    1,
+    true>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_infer_bp16_masktype_1_with_attnbias.hip b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_infer_bp16_masktype_1_with_attnbias.hip
new file mode 100644
index 000000000..6e65ed8d8
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_infer_bp16_masktype_1_with_attnbias.hip
@@ -0,0 +1,9 @@
+// !!! This is a file automatically generated by hipify!!!
+#include <ck/ck.hpp>
+
+#include "ck_fmha_batched_infer_hip.h"
+
+template void run_batched_infer_masktype_attnbias_dispatched<
+    ck::bhalf_t,
+    1,
+    true>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_infer_bp16_masktype_2_no_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_infer_bp16_masktype_2_no_attnbias.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/ck_fmha_batched_infer_bp16_masktype_2_no_attnbias.cpp
rename to xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_infer_bp16_masktype_2_no_attnbias.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_infer_bp16_masktype_2_no_attnbias.cu b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_infer_bp16_masktype_2_no_attnbias.cu
new file mode 100644
index 000000000..35e9bca9c
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_infer_bp16_masktype_2_no_attnbias.cu
@@ -0,0 +1,8 @@
+#include <ck/ck.hpp>
+
+#include "ck_fmha_batched_infer.h"
+
+template void run_batched_infer_masktype_attnbias_dispatched<
+    ck::bhalf_t,
+    2,
+    false>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_infer_bp16_masktype_2_no_attnbias.hip b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_infer_bp16_masktype_2_no_attnbias.hip
new file mode 100644
index 000000000..228d411d7
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_infer_bp16_masktype_2_no_attnbias.hip
@@ -0,0 +1,9 @@
+// !!! This is a file automatically generated by hipify!!!
+#include <ck/ck.hpp>
+
+#include "ck_fmha_batched_infer_hip.h"
+
+template void run_batched_infer_masktype_attnbias_dispatched<
+    ck::bhalf_t,
+    2,
+    false>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_infer_bp16_masktype_2_with_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_infer_bp16_masktype_2_with_attnbias.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/ck_fmha_batched_infer_bp16_masktype_2_with_attnbias.cpp
rename to xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_infer_bp16_masktype_2_with_attnbias.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_infer_bp16_masktype_2_with_attnbias.cu b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_infer_bp16_masktype_2_with_attnbias.cu
new file mode 100644
index 000000000..e27e3b5ff
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_infer_bp16_masktype_2_with_attnbias.cu
@@ -0,0 +1,8 @@
+#include <ck/ck.hpp>
+
+#include "ck_fmha_batched_infer.h"
+
+template void run_batched_infer_masktype_attnbias_dispatched<
+    ck::bhalf_t,
+    2,
+    true>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_infer_bp16_masktype_2_with_attnbias.hip b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_infer_bp16_masktype_2_with_attnbias.hip
new file mode 100644
index 000000000..03658b015
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_infer_bp16_masktype_2_with_attnbias.hip
@@ -0,0 +1,9 @@
+// !!! This is a file automatically generated by hipify!!!
+#include <ck/ck.hpp>
+
+#include "ck_fmha_batched_infer_hip.h"
+
+template void run_batched_infer_masktype_attnbias_dispatched<
+    ck::bhalf_t,
+    2,
+    true>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_infer_fp16_masktype_0_no_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_infer_fp16_masktype_0_no_attnbias.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/ck_fmha_batched_infer_fp16_masktype_0_no_attnbias.cpp
rename to xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_infer_fp16_masktype_0_no_attnbias.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_infer_fp16_masktype_0_no_attnbias.cu b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_infer_fp16_masktype_0_no_attnbias.cu
new file mode 100644
index 000000000..5c83b0abd
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_infer_fp16_masktype_0_no_attnbias.cu
@@ -0,0 +1,8 @@
+#include <ck/ck.hpp>
+
+#include "ck_fmha_batched_infer.h"
+
+template void run_batched_infer_masktype_attnbias_dispatched<
+    ck::half_t,
+    0,
+    false>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_infer_fp16_masktype_0_no_attnbias.hip b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_infer_fp16_masktype_0_no_attnbias.hip
new file mode 100644
index 000000000..ec48f9d83
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_infer_fp16_masktype_0_no_attnbias.hip
@@ -0,0 +1,9 @@
+// !!! This is a file automatically generated by hipify!!!
+#include <ck/ck.hpp>
+
+#include "ck_fmha_batched_infer_hip.h"
+
+template void run_batched_infer_masktype_attnbias_dispatched<
+    ck::half_t,
+    0,
+    false>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_infer_fp16_masktype_0_with_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_infer_fp16_masktype_0_with_attnbias.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/ck_fmha_batched_infer_fp16_masktype_0_with_attnbias.cpp
rename to xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_infer_fp16_masktype_0_with_attnbias.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_infer_fp16_masktype_0_with_attnbias.cu b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_infer_fp16_masktype_0_with_attnbias.cu
new file mode 100644
index 000000000..11c76b35f
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_infer_fp16_masktype_0_with_attnbias.cu
@@ -0,0 +1,8 @@
+#include <ck/ck.hpp>
+
+#include "ck_fmha_batched_infer.h"
+
+template void run_batched_infer_masktype_attnbias_dispatched<
+    ck::half_t,
+    0,
+    true>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_infer_fp16_masktype_0_with_attnbias.hip b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_infer_fp16_masktype_0_with_attnbias.hip
new file mode 100644
index 000000000..66f135619
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_infer_fp16_masktype_0_with_attnbias.hip
@@ -0,0 +1,9 @@
+// !!! This is a file automatically generated by hipify!!!
+#include <ck/ck.hpp>
+
+#include "ck_fmha_batched_infer_hip.h"
+
+template void run_batched_infer_masktype_attnbias_dispatched<
+    ck::half_t,
+    0,
+    true>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_infer_fp16_masktype_1_no_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_infer_fp16_masktype_1_no_attnbias.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/ck_fmha_batched_infer_fp16_masktype_1_no_attnbias.cpp
rename to xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_infer_fp16_masktype_1_no_attnbias.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_infer_fp16_masktype_1_no_attnbias.cu b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_infer_fp16_masktype_1_no_attnbias.cu
new file mode 100644
index 000000000..b13f5a4c9
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_infer_fp16_masktype_1_no_attnbias.cu
@@ -0,0 +1,8 @@
+#include <ck/ck.hpp>
+
+#include "ck_fmha_batched_infer.h"
+
+template void run_batched_infer_masktype_attnbias_dispatched<
+    ck::half_t,
+    1,
+    false>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_infer_fp16_masktype_1_no_attnbias.hip b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_infer_fp16_masktype_1_no_attnbias.hip
new file mode 100644
index 000000000..76e186c0b
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_infer_fp16_masktype_1_no_attnbias.hip
@@ -0,0 +1,9 @@
+// !!! This is a file automatically generated by hipify!!!
+#include <ck/ck.hpp>
+
+#include "ck_fmha_batched_infer_hip.h"
+
+template void run_batched_infer_masktype_attnbias_dispatched<
+    ck::half_t,
+    1,
+    false>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_infer_fp16_masktype_1_with_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_infer_fp16_masktype_1_with_attnbias.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/ck_fmha_batched_infer_fp16_masktype_1_with_attnbias.cpp
rename to xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_infer_fp16_masktype_1_with_attnbias.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_infer_fp16_masktype_1_with_attnbias.cu b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_infer_fp16_masktype_1_with_attnbias.cu
new file mode 100644
index 000000000..12f5991c4
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_infer_fp16_masktype_1_with_attnbias.cu
@@ -0,0 +1,8 @@
+#include <ck/ck.hpp>
+
+#include "ck_fmha_batched_infer.h"
+
+template void run_batched_infer_masktype_attnbias_dispatched<
+    ck::half_t,
+    1,
+    true>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_infer_fp16_masktype_1_with_attnbias.hip b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_infer_fp16_masktype_1_with_attnbias.hip
new file mode 100644
index 000000000..922e9a0d7
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_infer_fp16_masktype_1_with_attnbias.hip
@@ -0,0 +1,9 @@
+// !!! This is a file automatically generated by hipify!!!
+#include <ck/ck.hpp>
+
+#include "ck_fmha_batched_infer_hip.h"
+
+template void run_batched_infer_masktype_attnbias_dispatched<
+    ck::half_t,
+    1,
+    true>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_infer_fp16_masktype_2_no_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_infer_fp16_masktype_2_no_attnbias.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/ck_fmha_batched_infer_fp16_masktype_2_no_attnbias.cpp
rename to xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_infer_fp16_masktype_2_no_attnbias.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_infer_fp16_masktype_2_no_attnbias.cu b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_infer_fp16_masktype_2_no_attnbias.cu
new file mode 100644
index 000000000..8d45859e5
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_infer_fp16_masktype_2_no_attnbias.cu
@@ -0,0 +1,8 @@
+#include <ck/ck.hpp>
+
+#include "ck_fmha_batched_infer.h"
+
+template void run_batched_infer_masktype_attnbias_dispatched<
+    ck::half_t,
+    2,
+    false>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_infer_fp16_masktype_2_no_attnbias.hip b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_infer_fp16_masktype_2_no_attnbias.hip
new file mode 100644
index 000000000..5b32d22c4
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_infer_fp16_masktype_2_no_attnbias.hip
@@ -0,0 +1,9 @@
+// !!! This is a file automatically generated by hipify!!!
+#include <ck/ck.hpp>
+
+#include "ck_fmha_batched_infer_hip.h"
+
+template void run_batched_infer_masktype_attnbias_dispatched<
+    ck::half_t,
+    2,
+    false>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_infer_fp16_masktype_2_with_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_infer_fp16_masktype_2_with_attnbias.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/ck_fmha_batched_infer_fp16_masktype_2_with_attnbias.cpp
rename to xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_infer_fp16_masktype_2_with_attnbias.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_infer_fp16_masktype_2_with_attnbias.cu b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_infer_fp16_masktype_2_with_attnbias.cu
new file mode 100644
index 000000000..9f03be2b5
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_infer_fp16_masktype_2_with_attnbias.cu
@@ -0,0 +1,8 @@
+#include <ck/ck.hpp>
+
+#include "ck_fmha_batched_infer.h"
+
+template void run_batched_infer_masktype_attnbias_dispatched<
+    ck::half_t,
+    2,
+    true>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_infer_fp16_masktype_2_with_attnbias.hip b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_infer_fp16_masktype_2_with_attnbias.hip
new file mode 100644
index 000000000..3382cadb7
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_infer_fp16_masktype_2_with_attnbias.hip
@@ -0,0 +1,9 @@
+// !!! This is a file automatically generated by hipify!!!
+#include <ck/ck.hpp>
+
+#include "ck_fmha_batched_infer_hip.h"
+
+template void run_batched_infer_masktype_attnbias_dispatched<
+    ck::half_t,
+    2,
+    true>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_backward_bp16_masktype_0_no_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_bp16_masktype_0_no_attnbias.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/ck_fmha_grouped_backward_bp16_masktype_0_no_attnbias.cpp
rename to xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_bp16_masktype_0_no_attnbias.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_bp16_masktype_0_no_attnbias.cu b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_bp16_masktype_0_no_attnbias.cu
new file mode 100644
index 000000000..1b261e938
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_bp16_masktype_0_no_attnbias.cu
@@ -0,0 +1,14 @@
+#include <ck/ck.hpp>
+#include "ck_fmha_grouped_backward.h"
+
+template void run_grouped_backward_masktype_attnbias_dispatched<
+    ck::bhalf_t,
+    0,
+    false,
+    true>(GroupedBackwardParams& param, hipStream_t stream);
+
+template void run_grouped_backward_masktype_attnbias_dispatched<
+    ck::bhalf_t,
+    0,
+    false,
+    false>(GroupedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_bp16_masktype_0_no_attnbias.hip b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_bp16_masktype_0_no_attnbias.hip
new file mode 100644
index 000000000..ae627167e
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_bp16_masktype_0_no_attnbias.hip
@@ -0,0 +1,15 @@
+// !!! This is a file automatically generated by hipify!!!
+#include <ck/ck.hpp>
+#include "ck_fmha_grouped_backward_hip.h"
+
+template void run_grouped_backward_masktype_attnbias_dispatched<
+    ck::bhalf_t,
+    0,
+    false,
+    true>(GroupedBackwardParams& param, hipStream_t stream);
+
+template void run_grouped_backward_masktype_attnbias_dispatched<
+    ck::bhalf_t,
+    0,
+    false,
+    false>(GroupedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_backward_bp16_masktype_0_with_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_bp16_masktype_0_with_attnbias.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/ck_fmha_grouped_backward_bp16_masktype_0_with_attnbias.cpp
rename to xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_bp16_masktype_0_with_attnbias.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_bp16_masktype_0_with_attnbias.cu b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_bp16_masktype_0_with_attnbias.cu
new file mode 100644
index 000000000..8cb42c808
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_bp16_masktype_0_with_attnbias.cu
@@ -0,0 +1,14 @@
+#include <ck/ck.hpp>
+#include "ck_fmha_grouped_backward.h"
+
+template void run_grouped_backward_masktype_attnbias_dispatched<
+    ck::bhalf_t,
+    0,
+    true,
+    true>(GroupedBackwardParams& param, hipStream_t stream);
+
+template void run_grouped_backward_masktype_attnbias_dispatched<
+    ck::bhalf_t,
+    0,
+    true,
+    false>(GroupedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_bp16_masktype_0_with_attnbias.hip b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_bp16_masktype_0_with_attnbias.hip
new file mode 100644
index 000000000..e25431de4
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_bp16_masktype_0_with_attnbias.hip
@@ -0,0 +1,15 @@
+// !!! This is a file automatically generated by hipify!!!
+#include <ck/ck.hpp>
+#include "ck_fmha_grouped_backward_hip.h"
+
+template void run_grouped_backward_masktype_attnbias_dispatched<
+    ck::bhalf_t,
+    0,
+    true,
+    true>(GroupedBackwardParams& param, hipStream_t stream);
+
+template void run_grouped_backward_masktype_attnbias_dispatched<
+    ck::bhalf_t,
+    0,
+    true,
+    false>(GroupedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_backward_bp16_masktype_1_no_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_bp16_masktype_1_no_attnbias.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/ck_fmha_grouped_backward_bp16_masktype_1_no_attnbias.cpp
rename to xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_bp16_masktype_1_no_attnbias.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_bp16_masktype_1_no_attnbias.cu b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_bp16_masktype_1_no_attnbias.cu
new file mode 100644
index 000000000..ebefe8bab
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_bp16_masktype_1_no_attnbias.cu
@@ -0,0 +1,14 @@
+#include <ck/ck.hpp>
+#include "ck_fmha_grouped_backward.h"
+
+template void run_grouped_backward_masktype_attnbias_dispatched<
+    ck::bhalf_t,
+    1,
+    false,
+    true>(GroupedBackwardParams& param, hipStream_t stream);
+
+template void run_grouped_backward_masktype_attnbias_dispatched<
+    ck::bhalf_t,
+    1,
+    false,
+    false>(GroupedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_bp16_masktype_1_no_attnbias.hip b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_bp16_masktype_1_no_attnbias.hip
new file mode 100644
index 000000000..f2eeaede4
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_bp16_masktype_1_no_attnbias.hip
@@ -0,0 +1,15 @@
+// !!! This is a file automatically generated by hipify!!!
+#include <ck/ck.hpp>
+#include "ck_fmha_grouped_backward_hip.h"
+
+template void run_grouped_backward_masktype_attnbias_dispatched<
+    ck::bhalf_t,
+    1,
+    false,
+    true>(GroupedBackwardParams& param, hipStream_t stream);
+
+template void run_grouped_backward_masktype_attnbias_dispatched<
+    ck::bhalf_t,
+    1,
+    false,
+    false>(GroupedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_backward_bp16_masktype_1_with_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_bp16_masktype_1_with_attnbias.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/ck_fmha_grouped_backward_bp16_masktype_1_with_attnbias.cpp
rename to xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_bp16_masktype_1_with_attnbias.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_bp16_masktype_1_with_attnbias.cu b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_bp16_masktype_1_with_attnbias.cu
new file mode 100644
index 000000000..1d7de293e
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_bp16_masktype_1_with_attnbias.cu
@@ -0,0 +1,14 @@
+#include <ck/ck.hpp>
+#include "ck_fmha_grouped_backward.h"
+
+template void run_grouped_backward_masktype_attnbias_dispatched<
+    ck::bhalf_t,
+    1,
+    true,
+    true>(GroupedBackwardParams& param, hipStream_t stream);
+
+template void run_grouped_backward_masktype_attnbias_dispatched<
+    ck::bhalf_t,
+    1,
+    true,
+    false>(GroupedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_bp16_masktype_1_with_attnbias.hip b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_bp16_masktype_1_with_attnbias.hip
new file mode 100644
index 000000000..1ca61d4b7
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_bp16_masktype_1_with_attnbias.hip
@@ -0,0 +1,15 @@
+// !!! This is a file automatically generated by hipify!!!
+#include <ck/ck.hpp>
+#include "ck_fmha_grouped_backward_hip.h"
+
+template void run_grouped_backward_masktype_attnbias_dispatched<
+    ck::bhalf_t,
+    1,
+    true,
+    true>(GroupedBackwardParams& param, hipStream_t stream);
+
+template void run_grouped_backward_masktype_attnbias_dispatched<
+    ck::bhalf_t,
+    1,
+    true,
+    false>(GroupedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_backward_bp16_masktype_2_no_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_bp16_masktype_2_no_attnbias.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/ck_fmha_grouped_backward_bp16_masktype_2_no_attnbias.cpp
rename to xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_bp16_masktype_2_no_attnbias.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_bp16_masktype_2_no_attnbias.cu b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_bp16_masktype_2_no_attnbias.cu
new file mode 100644
index 000000000..524fb30e5
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_bp16_masktype_2_no_attnbias.cu
@@ -0,0 +1,14 @@
+#include <ck/ck.hpp>
+#include "ck_fmha_grouped_backward.h"
+
+template void run_grouped_backward_masktype_attnbias_dispatched<
+    ck::bhalf_t,
+    2,
+    false,
+    true>(GroupedBackwardParams& param, hipStream_t stream);
+
+template void run_grouped_backward_masktype_attnbias_dispatched<
+    ck::bhalf_t,
+    2,
+    false,
+    false>(GroupedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_bp16_masktype_2_no_attnbias.hip b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_bp16_masktype_2_no_attnbias.hip
new file mode 100644
index 000000000..6910a6703
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_bp16_masktype_2_no_attnbias.hip
@@ -0,0 +1,15 @@
+// !!! This is a file automatically generated by hipify!!!
+#include <ck/ck.hpp>
+#include "ck_fmha_grouped_backward_hip.h"
+
+template void run_grouped_backward_masktype_attnbias_dispatched<
+    ck::bhalf_t,
+    2,
+    false,
+    true>(GroupedBackwardParams& param, hipStream_t stream);
+
+template void run_grouped_backward_masktype_attnbias_dispatched<
+    ck::bhalf_t,
+    2,
+    false,
+    false>(GroupedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_backward_bp16_masktype_2_with_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_bp16_masktype_2_with_attnbias.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/ck_fmha_grouped_backward_bp16_masktype_2_with_attnbias.cpp
rename to xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_bp16_masktype_2_with_attnbias.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_bp16_masktype_2_with_attnbias.cu b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_bp16_masktype_2_with_attnbias.cu
new file mode 100644
index 000000000..58f2f8b1a
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_bp16_masktype_2_with_attnbias.cu
@@ -0,0 +1,14 @@
+#include <ck/ck.hpp>
+#include "ck_fmha_grouped_backward.h"
+
+template void run_grouped_backward_masktype_attnbias_dispatched<
+    ck::bhalf_t,
+    2,
+    true,
+    true>(GroupedBackwardParams& param, hipStream_t stream);
+
+template void run_grouped_backward_masktype_attnbias_dispatched<
+    ck::bhalf_t,
+    2,
+    true,
+    false>(GroupedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_bp16_masktype_2_with_attnbias.hip b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_bp16_masktype_2_with_attnbias.hip
new file mode 100644
index 000000000..90359f124
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_bp16_masktype_2_with_attnbias.hip
@@ -0,0 +1,15 @@
+// !!! This is a file automatically generated by hipify!!!
+#include <ck/ck.hpp>
+#include "ck_fmha_grouped_backward_hip.h"
+
+template void run_grouped_backward_masktype_attnbias_dispatched<
+    ck::bhalf_t,
+    2,
+    true,
+    true>(GroupedBackwardParams& param, hipStream_t stream);
+
+template void run_grouped_backward_masktype_attnbias_dispatched<
+    ck::bhalf_t,
+    2,
+    true,
+    false>(GroupedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_backward_fp16_masktype_0_no_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_fp16_masktype_0_no_attnbias.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/ck_fmha_grouped_backward_fp16_masktype_0_no_attnbias.cpp
rename to xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_fp16_masktype_0_no_attnbias.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_fp16_masktype_0_no_attnbias.cu b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_fp16_masktype_0_no_attnbias.cu
new file mode 100644
index 000000000..1098e69be
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_fp16_masktype_0_no_attnbias.cu
@@ -0,0 +1,14 @@
+#include <ck/ck.hpp>
+#include "ck_fmha_grouped_backward.h"
+
+template void run_grouped_backward_masktype_attnbias_dispatched<
+    ck::half_t,
+    0,
+    false,
+    true>(GroupedBackwardParams& param, hipStream_t stream);
+
+template void run_grouped_backward_masktype_attnbias_dispatched<
+    ck::half_t,
+    0,
+    false,
+    false>(GroupedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_fp16_masktype_0_no_attnbias.hip b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_fp16_masktype_0_no_attnbias.hip
new file mode 100644
index 000000000..ef6197b44
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_fp16_masktype_0_no_attnbias.hip
@@ -0,0 +1,15 @@
+// !!! This is a file automatically generated by hipify!!!
+#include <ck/ck.hpp>
+#include "ck_fmha_grouped_backward_hip.h"
+
+template void run_grouped_backward_masktype_attnbias_dispatched<
+    ck::half_t,
+    0,
+    false,
+    true>(GroupedBackwardParams& param, hipStream_t stream);
+
+template void run_grouped_backward_masktype_attnbias_dispatched<
+    ck::half_t,
+    0,
+    false,
+    false>(GroupedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_backward_fp16_masktype_0_with_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_fp16_masktype_0_with_attnbias.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/ck_fmha_grouped_backward_fp16_masktype_0_with_attnbias.cpp
rename to xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_fp16_masktype_0_with_attnbias.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_fp16_masktype_0_with_attnbias.cu b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_fp16_masktype_0_with_attnbias.cu
new file mode 100644
index 000000000..60583a859
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_fp16_masktype_0_with_attnbias.cu
@@ -0,0 +1,14 @@
+#include <ck/ck.hpp>
+#include "ck_fmha_grouped_backward.h"
+
+template void run_grouped_backward_masktype_attnbias_dispatched<
+    ck::half_t,
+    0,
+    true,
+    true>(GroupedBackwardParams& param, hipStream_t stream);
+
+template void run_grouped_backward_masktype_attnbias_dispatched<
+    ck::half_t,
+    0,
+    true,
+    false>(GroupedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_fp16_masktype_0_with_attnbias.hip b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_fp16_masktype_0_with_attnbias.hip
new file mode 100644
index 000000000..3dbdf04b7
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_fp16_masktype_0_with_attnbias.hip
@@ -0,0 +1,15 @@
+// !!! This is a file automatically generated by hipify!!!
+#include <ck/ck.hpp>
+#include "ck_fmha_grouped_backward_hip.h"
+
+template void run_grouped_backward_masktype_attnbias_dispatched<
+    ck::half_t,
+    0,
+    true,
+    true>(GroupedBackwardParams& param, hipStream_t stream);
+
+template void run_grouped_backward_masktype_attnbias_dispatched<
+    ck::half_t,
+    0,
+    true,
+    false>(GroupedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_backward_fp16_masktype_1_no_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_fp16_masktype_1_no_attnbias.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/ck_fmha_grouped_backward_fp16_masktype_1_no_attnbias.cpp
rename to xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_fp16_masktype_1_no_attnbias.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_fp16_masktype_1_no_attnbias.cu b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_fp16_masktype_1_no_attnbias.cu
new file mode 100644
index 000000000..b8aabeb86
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_fp16_masktype_1_no_attnbias.cu
@@ -0,0 +1,14 @@
+#include <ck/ck.hpp>
+#include "ck_fmha_grouped_backward.h"
+
+template void run_grouped_backward_masktype_attnbias_dispatched<
+    ck::half_t,
+    1,
+    false,
+    true>(GroupedBackwardParams& param, hipStream_t stream);
+
+template void run_grouped_backward_masktype_attnbias_dispatched<
+    ck::half_t,
+    1,
+    false,
+    false>(GroupedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_fp16_masktype_1_no_attnbias.hip b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_fp16_masktype_1_no_attnbias.hip
new file mode 100644
index 000000000..f76ea2c12
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_fp16_masktype_1_no_attnbias.hip
@@ -0,0 +1,15 @@
+// !!! This is a file automatically generated by hipify!!!
+#include <ck/ck.hpp>
+#include "ck_fmha_grouped_backward_hip.h"
+
+template void run_grouped_backward_masktype_attnbias_dispatched<
+    ck::half_t,
+    1,
+    false,
+    true>(GroupedBackwardParams& param, hipStream_t stream);
+
+template void run_grouped_backward_masktype_attnbias_dispatched<
+    ck::half_t,
+    1,
+    false,
+    false>(GroupedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_backward_fp16_masktype_1_with_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_fp16_masktype_1_with_attnbias.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/ck_fmha_grouped_backward_fp16_masktype_1_with_attnbias.cpp
rename to xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_fp16_masktype_1_with_attnbias.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_fp16_masktype_1_with_attnbias.cu b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_fp16_masktype_1_with_attnbias.cu
new file mode 100644
index 000000000..8629a947a
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_fp16_masktype_1_with_attnbias.cu
@@ -0,0 +1,14 @@
+#include <ck/ck.hpp>
+#include "ck_fmha_grouped_backward.h"
+
+template void run_grouped_backward_masktype_attnbias_dispatched<
+    ck::half_t,
+    1,
+    true,
+    true>(GroupedBackwardParams& param, hipStream_t stream);
+
+template void run_grouped_backward_masktype_attnbias_dispatched<
+    ck::half_t,
+    1,
+    true,
+    false>(GroupedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_fp16_masktype_1_with_attnbias.hip b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_fp16_masktype_1_with_attnbias.hip
new file mode 100644
index 000000000..42ef3f534
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_fp16_masktype_1_with_attnbias.hip
@@ -0,0 +1,15 @@
+// !!! This is a file automatically generated by hipify!!!
+#include <ck/ck.hpp>
+#include "ck_fmha_grouped_backward_hip.h"
+
+template void run_grouped_backward_masktype_attnbias_dispatched<
+    ck::half_t,
+    1,
+    true,
+    true>(GroupedBackwardParams& param, hipStream_t stream);
+
+template void run_grouped_backward_masktype_attnbias_dispatched<
+    ck::half_t,
+    1,
+    true,
+    false>(GroupedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_backward_fp16_masktype_2_no_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_fp16_masktype_2_no_attnbias.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/ck_fmha_grouped_backward_fp16_masktype_2_no_attnbias.cpp
rename to xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_fp16_masktype_2_no_attnbias.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_fp16_masktype_2_no_attnbias.cu b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_fp16_masktype_2_no_attnbias.cu
new file mode 100644
index 000000000..00b0f5c32
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_fp16_masktype_2_no_attnbias.cu
@@ -0,0 +1,14 @@
+#include <ck/ck.hpp>
+#include "ck_fmha_grouped_backward.h"
+
+template void run_grouped_backward_masktype_attnbias_dispatched<
+    ck::half_t,
+    2,
+    false,
+    true>(GroupedBackwardParams& param, hipStream_t stream);
+
+template void run_grouped_backward_masktype_attnbias_dispatched<
+    ck::half_t,
+    2,
+    false,
+    false>(GroupedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_fp16_masktype_2_no_attnbias.hip b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_fp16_masktype_2_no_attnbias.hip
new file mode 100644
index 000000000..8a5ef7d02
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_fp16_masktype_2_no_attnbias.hip
@@ -0,0 +1,15 @@
+// !!! This is a file automatically generated by hipify!!!
+#include <ck/ck.hpp>
+#include "ck_fmha_grouped_backward_hip.h"
+
+template void run_grouped_backward_masktype_attnbias_dispatched<
+    ck::half_t,
+    2,
+    false,
+    true>(GroupedBackwardParams& param, hipStream_t stream);
+
+template void run_grouped_backward_masktype_attnbias_dispatched<
+    ck::half_t,
+    2,
+    false,
+    false>(GroupedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_backward_fp16_masktype_2_with_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_fp16_masktype_2_with_attnbias.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/ck_fmha_grouped_backward_fp16_masktype_2_with_attnbias.cpp
rename to xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_fp16_masktype_2_with_attnbias.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_fp16_masktype_2_with_attnbias.cu b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_fp16_masktype_2_with_attnbias.cu
new file mode 100644
index 000000000..8b6112aba
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_fp16_masktype_2_with_attnbias.cu
@@ -0,0 +1,14 @@
+#include <ck/ck.hpp>
+#include "ck_fmha_grouped_backward.h"
+
+template void run_grouped_backward_masktype_attnbias_dispatched<
+    ck::half_t,
+    2,
+    true,
+    true>(GroupedBackwardParams& param, hipStream_t stream);
+
+template void run_grouped_backward_masktype_attnbias_dispatched<
+    ck::half_t,
+    2,
+    true,
+    false>(GroupedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_fp16_masktype_2_with_attnbias.hip b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_fp16_masktype_2_with_attnbias.hip
new file mode 100644
index 000000000..68e4d564d
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_fp16_masktype_2_with_attnbias.hip
@@ -0,0 +1,15 @@
+// !!! This is a file automatically generated by hipify!!!
+#include <ck/ck.hpp>
+#include "ck_fmha_grouped_backward_hip.h"
+
+template void run_grouped_backward_masktype_attnbias_dispatched<
+    ck::half_t,
+    2,
+    true,
+    true>(GroupedBackwardParams& param, hipStream_t stream);
+
+template void run_grouped_backward_masktype_attnbias_dispatched<
+    ck::half_t,
+    2,
+    true,
+    false>(GroupedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_forward_bp16_masktype_0_no_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_forward_bp16_masktype_0_no_attnbias.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/ck_fmha_grouped_forward_bp16_masktype_0_no_attnbias.cpp
rename to xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_forward_bp16_masktype_0_no_attnbias.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_forward_bp16_masktype_0_no_attnbias.cu b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_forward_bp16_masktype_0_no_attnbias.cu
new file mode 100644
index 000000000..bfde13c7d
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_forward_bp16_masktype_0_no_attnbias.cu
@@ -0,0 +1,7 @@
+#include <ck/ck.hpp>
+#include "ck_fmha_grouped_forward.h"
+
+template void run_grouped_forward_masktype_attnbias_dispatched<
+    ck::bhalf_t,
+    0,
+    false>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_forward_bp16_masktype_0_no_attnbias.hip b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_forward_bp16_masktype_0_no_attnbias.hip
new file mode 100644
index 000000000..9f60df93c
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_forward_bp16_masktype_0_no_attnbias.hip
@@ -0,0 +1,8 @@
+// !!! This is a file automatically generated by hipify!!!
+#include <ck/ck.hpp>
+#include "ck_fmha_grouped_forward_hip.h"
+
+template void run_grouped_forward_masktype_attnbias_dispatched<
+    ck::bhalf_t,
+    0,
+    false>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_forward_bp16_masktype_0_with_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_forward_bp16_masktype_0_with_attnbias.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/ck_fmha_grouped_forward_bp16_masktype_0_with_attnbias.cpp
rename to xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_forward_bp16_masktype_0_with_attnbias.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_forward_bp16_masktype_0_with_attnbias.cu b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_forward_bp16_masktype_0_with_attnbias.cu
new file mode 100644
index 000000000..85e853c36
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_forward_bp16_masktype_0_with_attnbias.cu
@@ -0,0 +1,7 @@
+#include <ck/ck.hpp>
+#include "ck_fmha_grouped_forward.h"
+
+template void run_grouped_forward_masktype_attnbias_dispatched<
+    ck::bhalf_t,
+    0,
+    true>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_forward_bp16_masktype_0_with_attnbias.hip b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_forward_bp16_masktype_0_with_attnbias.hip
new file mode 100644
index 000000000..1154b074b
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_forward_bp16_masktype_0_with_attnbias.hip
@@ -0,0 +1,8 @@
+// !!! This is a file automatically generated by hipify!!!
+#include <ck/ck.hpp>
+#include "ck_fmha_grouped_forward_hip.h"
+
+template void run_grouped_forward_masktype_attnbias_dispatched<
+    ck::bhalf_t,
+    0,
+    true>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_forward_bp16_masktype_1_no_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_forward_bp16_masktype_1_no_attnbias.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/ck_fmha_grouped_forward_bp16_masktype_1_no_attnbias.cpp
rename to xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_forward_bp16_masktype_1_no_attnbias.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_forward_bp16_masktype_1_no_attnbias.cu b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_forward_bp16_masktype_1_no_attnbias.cu
new file mode 100644
index 000000000..d86afa1aa
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_forward_bp16_masktype_1_no_attnbias.cu
@@ -0,0 +1,7 @@
+#include <ck/ck.hpp>
+#include "ck_fmha_grouped_forward.h"
+
+template void run_grouped_forward_masktype_attnbias_dispatched<
+    ck::bhalf_t,
+    1,
+    false>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_forward_bp16_masktype_1_no_attnbias.hip b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_forward_bp16_masktype_1_no_attnbias.hip
new file mode 100644
index 000000000..285fef03e
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_forward_bp16_masktype_1_no_attnbias.hip
@@ -0,0 +1,8 @@
+// !!! This is a file automatically generated by hipify!!!
+#include <ck/ck.hpp>
+#include "ck_fmha_grouped_forward_hip.h"
+
+template void run_grouped_forward_masktype_attnbias_dispatched<
+    ck::bhalf_t,
+    1,
+    false>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_forward_bp16_masktype_1_with_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_forward_bp16_masktype_1_with_attnbias.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/ck_fmha_grouped_forward_bp16_masktype_1_with_attnbias.cpp
rename to xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_forward_bp16_masktype_1_with_attnbias.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_forward_bp16_masktype_1_with_attnbias.cu b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_forward_bp16_masktype_1_with_attnbias.cu
new file mode 100644
index 000000000..dd58b5b28
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_forward_bp16_masktype_1_with_attnbias.cu
@@ -0,0 +1,7 @@
+#include <ck/ck.hpp>
+#include "ck_fmha_grouped_forward.h"
+
+template void run_grouped_forward_masktype_attnbias_dispatched<
+    ck::bhalf_t,
+    1,
+    true>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_forward_bp16_masktype_1_with_attnbias.hip b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_forward_bp16_masktype_1_with_attnbias.hip
new file mode 100644
index 000000000..16df2be7d
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_forward_bp16_masktype_1_with_attnbias.hip
@@ -0,0 +1,8 @@
+// !!! This is a file automatically generated by hipify!!!
+#include <ck/ck.hpp>
+#include "ck_fmha_grouped_forward_hip.h"
+
+template void run_grouped_forward_masktype_attnbias_dispatched<
+    ck::bhalf_t,
+    1,
+    true>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_forward_bp16_masktype_2_no_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_forward_bp16_masktype_2_no_attnbias.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/ck_fmha_grouped_forward_bp16_masktype_2_no_attnbias.cpp
rename to xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_forward_bp16_masktype_2_no_attnbias.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_forward_bp16_masktype_2_no_attnbias.cu b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_forward_bp16_masktype_2_no_attnbias.cu
new file mode 100644
index 000000000..085245c08
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_forward_bp16_masktype_2_no_attnbias.cu
@@ -0,0 +1,7 @@
+#include <ck/ck.hpp>
+#include "ck_fmha_grouped_forward.h"
+
+template void run_grouped_forward_masktype_attnbias_dispatched<
+    ck::bhalf_t,
+    2,
+    false>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_forward_bp16_masktype_2_no_attnbias.hip b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_forward_bp16_masktype_2_no_attnbias.hip
new file mode 100644
index 000000000..e89ff54aa
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_forward_bp16_masktype_2_no_attnbias.hip
@@ -0,0 +1,8 @@
+// !!! This is a file automatically generated by hipify!!!
+#include <ck/ck.hpp>
+#include "ck_fmha_grouped_forward_hip.h"
+
+template void run_grouped_forward_masktype_attnbias_dispatched<
+    ck::bhalf_t,
+    2,
+    false>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_forward_bp16_masktype_2_with_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_forward_bp16_masktype_2_with_attnbias.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/ck_fmha_grouped_forward_bp16_masktype_2_with_attnbias.cpp
rename to xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_forward_bp16_masktype_2_with_attnbias.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_forward_bp16_masktype_2_with_attnbias.cu b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_forward_bp16_masktype_2_with_attnbias.cu
new file mode 100644
index 000000000..8c3ea29a4
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_forward_bp16_masktype_2_with_attnbias.cu
@@ -0,0 +1,7 @@
+#include <ck/ck.hpp>
+#include "ck_fmha_grouped_forward.h"
+
+template void run_grouped_forward_masktype_attnbias_dispatched<
+    ck::bhalf_t,
+    2,
+    true>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_forward_bp16_masktype_2_with_attnbias.hip b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_forward_bp16_masktype_2_with_attnbias.hip
new file mode 100644
index 000000000..9e7ebe753
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_forward_bp16_masktype_2_with_attnbias.hip
@@ -0,0 +1,8 @@
+// !!! This is a file automatically generated by hipify!!!
+#include <ck/ck.hpp>
+#include "ck_fmha_grouped_forward_hip.h"
+
+template void run_grouped_forward_masktype_attnbias_dispatched<
+    ck::bhalf_t,
+    2,
+    true>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_forward_fp16_masktype_0_no_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_forward_fp16_masktype_0_no_attnbias.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/ck_fmha_grouped_forward_fp16_masktype_0_no_attnbias.cpp
rename to xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_forward_fp16_masktype_0_no_attnbias.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_forward_fp16_masktype_0_no_attnbias.cu b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_forward_fp16_masktype_0_no_attnbias.cu
new file mode 100644
index 000000000..19adc3971
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_forward_fp16_masktype_0_no_attnbias.cu
@@ -0,0 +1,7 @@
+#include <ck/ck.hpp>
+#include "ck_fmha_grouped_forward.h"
+
+template void run_grouped_forward_masktype_attnbias_dispatched<
+    ck::half_t,
+    0,
+    false>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_forward_fp16_masktype_0_no_attnbias.hip b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_forward_fp16_masktype_0_no_attnbias.hip
new file mode 100644
index 000000000..ee425b155
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_forward_fp16_masktype_0_no_attnbias.hip
@@ -0,0 +1,8 @@
+// !!! This is a file automatically generated by hipify!!!
+#include <ck/ck.hpp>
+#include "ck_fmha_grouped_forward_hip.h"
+
+template void run_grouped_forward_masktype_attnbias_dispatched<
+    ck::half_t,
+    0,
+    false>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_forward_fp16_masktype_0_with_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_forward_fp16_masktype_0_with_attnbias.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/ck_fmha_grouped_forward_fp16_masktype_0_with_attnbias.cpp
rename to xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_forward_fp16_masktype_0_with_attnbias.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_forward_fp16_masktype_0_with_attnbias.cu b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_forward_fp16_masktype_0_with_attnbias.cu
new file mode 100644
index 000000000..6da5508d3
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_forward_fp16_masktype_0_with_attnbias.cu
@@ -0,0 +1,7 @@
+#include <ck/ck.hpp>
+#include "ck_fmha_grouped_forward.h"
+
+template void run_grouped_forward_masktype_attnbias_dispatched<
+    ck::half_t,
+    0,
+    true>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_forward_fp16_masktype_0_with_attnbias.hip b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_forward_fp16_masktype_0_with_attnbias.hip
new file mode 100644
index 000000000..8bea44444
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_forward_fp16_masktype_0_with_attnbias.hip
@@ -0,0 +1,8 @@
+// !!! This is a file automatically generated by hipify!!!
+#include <ck/ck.hpp>
+#include "ck_fmha_grouped_forward_hip.h"
+
+template void run_grouped_forward_masktype_attnbias_dispatched<
+    ck::half_t,
+    0,
+    true>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_forward_fp16_masktype_1_no_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_forward_fp16_masktype_1_no_attnbias.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/ck_fmha_grouped_forward_fp16_masktype_1_no_attnbias.cpp
rename to xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_forward_fp16_masktype_1_no_attnbias.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_forward_fp16_masktype_1_no_attnbias.cu b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_forward_fp16_masktype_1_no_attnbias.cu
new file mode 100644
index 000000000..f97de6fb3
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_forward_fp16_masktype_1_no_attnbias.cu
@@ -0,0 +1,7 @@
+#include <ck/ck.hpp>
+#include "ck_fmha_grouped_forward.h"
+
+template void run_grouped_forward_masktype_attnbias_dispatched<
+    ck::half_t,
+    1,
+    false>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_forward_fp16_masktype_1_no_attnbias.hip b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_forward_fp16_masktype_1_no_attnbias.hip
new file mode 100644
index 000000000..2cb989ee7
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_forward_fp16_masktype_1_no_attnbias.hip
@@ -0,0 +1,8 @@
+// !!! This is a file automatically generated by hipify!!!
+#include <ck/ck.hpp>
+#include "ck_fmha_grouped_forward_hip.h"
+
+template void run_grouped_forward_masktype_attnbias_dispatched<
+    ck::half_t,
+    1,
+    false>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_forward_fp16_masktype_1_with_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_forward_fp16_masktype_1_with_attnbias.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/ck_fmha_grouped_forward_fp16_masktype_1_with_attnbias.cpp
rename to xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_forward_fp16_masktype_1_with_attnbias.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_forward_fp16_masktype_1_with_attnbias.cu b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_forward_fp16_masktype_1_with_attnbias.cu
new file mode 100644
index 000000000..5bd33901b
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_forward_fp16_masktype_1_with_attnbias.cu
@@ -0,0 +1,7 @@
+#include <ck/ck.hpp>
+#include "ck_fmha_grouped_forward.h"
+
+template void run_grouped_forward_masktype_attnbias_dispatched<
+    ck::half_t,
+    1,
+    true>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_forward_fp16_masktype_1_with_attnbias.hip b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_forward_fp16_masktype_1_with_attnbias.hip
new file mode 100644
index 000000000..faa22debf
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_forward_fp16_masktype_1_with_attnbias.hip
@@ -0,0 +1,8 @@
+// !!! This is a file automatically generated by hipify!!!
+#include <ck/ck.hpp>
+#include "ck_fmha_grouped_forward_hip.h"
+
+template void run_grouped_forward_masktype_attnbias_dispatched<
+    ck::half_t,
+    1,
+    true>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_forward_fp16_masktype_2_no_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_forward_fp16_masktype_2_no_attnbias.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/ck_fmha_grouped_forward_fp16_masktype_2_no_attnbias.cpp
rename to xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_forward_fp16_masktype_2_no_attnbias.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_forward_fp16_masktype_2_no_attnbias.cu b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_forward_fp16_masktype_2_no_attnbias.cu
new file mode 100644
index 000000000..155c9eb6c
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_forward_fp16_masktype_2_no_attnbias.cu
@@ -0,0 +1,7 @@
+#include <ck/ck.hpp>
+#include "ck_fmha_grouped_forward.h"
+
+template void run_grouped_forward_masktype_attnbias_dispatched<
+    ck::half_t,
+    2,
+    false>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_forward_fp16_masktype_2_no_attnbias.hip b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_forward_fp16_masktype_2_no_attnbias.hip
new file mode 100644
index 000000000..dbd9c7424
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_forward_fp16_masktype_2_no_attnbias.hip
@@ -0,0 +1,8 @@
+// !!! This is a file automatically generated by hipify!!!
+#include <ck/ck.hpp>
+#include "ck_fmha_grouped_forward_hip.h"
+
+template void run_grouped_forward_masktype_attnbias_dispatched<
+    ck::half_t,
+    2,
+    false>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_forward_fp16_masktype_2_with_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_forward_fp16_masktype_2_with_attnbias.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/ck_fmha_grouped_forward_fp16_masktype_2_with_attnbias.cpp
rename to xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_forward_fp16_masktype_2_with_attnbias.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_forward_fp16_masktype_2_with_attnbias.cu b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_forward_fp16_masktype_2_with_attnbias.cu
new file mode 100644
index 000000000..29f3ed1a3
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_forward_fp16_masktype_2_with_attnbias.cu
@@ -0,0 +1,7 @@
+#include <ck/ck.hpp>
+#include "ck_fmha_grouped_forward.h"
+
+template void run_grouped_forward_masktype_attnbias_dispatched<
+    ck::half_t,
+    2,
+    true>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_forward_fp16_masktype_2_with_attnbias.hip b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_forward_fp16_masktype_2_with_attnbias.hip
new file mode 100644
index 000000000..d67039c69
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_forward_fp16_masktype_2_with_attnbias.hip
@@ -0,0 +1,8 @@
+// !!! This is a file automatically generated by hipify!!!
+#include <ck/ck.hpp>
+#include "ck_fmha_grouped_forward_hip.h"
+
+template void run_grouped_forward_masktype_attnbias_dispatched<
+    ck::half_t,
+    2,
+    true>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_infer_bp16_masktype_0_no_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_infer_bp16_masktype_0_no_attnbias.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/ck_fmha_grouped_infer_bp16_masktype_0_no_attnbias.cpp
rename to xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_infer_bp16_masktype_0_no_attnbias.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_infer_bp16_masktype_0_no_attnbias.cu b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_infer_bp16_masktype_0_no_attnbias.cu
new file mode 100644
index 000000000..973213413
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_infer_bp16_masktype_0_no_attnbias.cu
@@ -0,0 +1,8 @@
+#include <ck/ck.hpp>
+
+#include "ck_fmha_grouped_infer.h"
+
+template void run_grouped_infer_masktype_attnbias_dispatched<
+    ck::bhalf_t,
+    0,
+    false>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_infer_bp16_masktype_0_no_attnbias.hip b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_infer_bp16_masktype_0_no_attnbias.hip
new file mode 100644
index 000000000..da5eb15a5
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_infer_bp16_masktype_0_no_attnbias.hip
@@ -0,0 +1,9 @@
+// !!! This is a file automatically generated by hipify!!!
+#include <ck/ck.hpp>
+
+#include "ck_fmha_grouped_infer_hip.h"
+
+template void run_grouped_infer_masktype_attnbias_dispatched<
+    ck::bhalf_t,
+    0,
+    false>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_infer_bp16_masktype_0_with_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_infer_bp16_masktype_0_with_attnbias.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/ck_fmha_grouped_infer_bp16_masktype_0_with_attnbias.cpp
rename to xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_infer_bp16_masktype_0_with_attnbias.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_infer_bp16_masktype_0_with_attnbias.cu b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_infer_bp16_masktype_0_with_attnbias.cu
new file mode 100644
index 000000000..96e0ba425
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_infer_bp16_masktype_0_with_attnbias.cu
@@ -0,0 +1,8 @@
+#include <ck/ck.hpp>
+
+#include "ck_fmha_grouped_infer.h"
+
+template void run_grouped_infer_masktype_attnbias_dispatched<
+    ck::bhalf_t,
+    0,
+    true>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_infer_bp16_masktype_0_with_attnbias.hip b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_infer_bp16_masktype_0_with_attnbias.hip
new file mode 100644
index 000000000..4cfaba313
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_infer_bp16_masktype_0_with_attnbias.hip
@@ -0,0 +1,9 @@
+// !!! This is a file automatically generated by hipify!!!
+#include <ck/ck.hpp>
+
+#include "ck_fmha_grouped_infer_hip.h"
+
+template void run_grouped_infer_masktype_attnbias_dispatched<
+    ck::bhalf_t,
+    0,
+    true>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_infer_bp16_masktype_1_no_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_infer_bp16_masktype_1_no_attnbias.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/ck_fmha_grouped_infer_bp16_masktype_1_no_attnbias.cpp
rename to xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_infer_bp16_masktype_1_no_attnbias.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_infer_bp16_masktype_1_no_attnbias.cu b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_infer_bp16_masktype_1_no_attnbias.cu
new file mode 100644
index 000000000..332724e73
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_infer_bp16_masktype_1_no_attnbias.cu
@@ -0,0 +1,8 @@
+#include <ck/ck.hpp>
+
+#include "ck_fmha_grouped_infer.h"
+
+template void run_grouped_infer_masktype_attnbias_dispatched<
+    ck::bhalf_t,
+    1,
+    false>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_infer_bp16_masktype_1_no_attnbias.hip b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_infer_bp16_masktype_1_no_attnbias.hip
new file mode 100644
index 000000000..76237a595
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_infer_bp16_masktype_1_no_attnbias.hip
@@ -0,0 +1,9 @@
+// !!! This is a file automatically generated by hipify!!!
+#include <ck/ck.hpp>
+
+#include "ck_fmha_grouped_infer_hip.h"
+
+template void run_grouped_infer_masktype_attnbias_dispatched<
+    ck::bhalf_t,
+    1,
+    false>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_infer_bp16_masktype_1_with_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_infer_bp16_masktype_1_with_attnbias.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/ck_fmha_grouped_infer_bp16_masktype_1_with_attnbias.cpp
rename to xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_infer_bp16_masktype_1_with_attnbias.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_infer_bp16_masktype_1_with_attnbias.cu b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_infer_bp16_masktype_1_with_attnbias.cu
new file mode 100644
index 000000000..cb1120f5b
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_infer_bp16_masktype_1_with_attnbias.cu
@@ -0,0 +1,8 @@
+#include <ck/ck.hpp>
+
+#include "ck_fmha_grouped_infer.h"
+
+template void run_grouped_infer_masktype_attnbias_dispatched<
+    ck::bhalf_t,
+    1,
+    true>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_infer_bp16_masktype_1_with_attnbias.hip b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_infer_bp16_masktype_1_with_attnbias.hip
new file mode 100644
index 000000000..712d61922
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_infer_bp16_masktype_1_with_attnbias.hip
@@ -0,0 +1,9 @@
+// !!! This is a file automatically generated by hipify!!!
+#include <ck/ck.hpp>
+
+#include "ck_fmha_grouped_infer_hip.h"
+
+template void run_grouped_infer_masktype_attnbias_dispatched<
+    ck::bhalf_t,
+    1,
+    true>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_infer_bp16_masktype_2_no_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_infer_bp16_masktype_2_no_attnbias.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/ck_fmha_grouped_infer_bp16_masktype_2_no_attnbias.cpp
rename to xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_infer_bp16_masktype_2_no_attnbias.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_infer_bp16_masktype_2_no_attnbias.cu b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_infer_bp16_masktype_2_no_attnbias.cu
new file mode 100644
index 000000000..51ed70cab
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_infer_bp16_masktype_2_no_attnbias.cu
@@ -0,0 +1,8 @@
+#include <ck/ck.hpp>
+
+#include "ck_fmha_grouped_infer.h"
+
+template void run_grouped_infer_masktype_attnbias_dispatched<
+    ck::bhalf_t,
+    2,
+    false>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_infer_bp16_masktype_2_no_attnbias.hip b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_infer_bp16_masktype_2_no_attnbias.hip
new file mode 100644
index 000000000..eae026e23
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_infer_bp16_masktype_2_no_attnbias.hip
@@ -0,0 +1,9 @@
+// !!! This is a file automatically generated by hipify!!!
+#include <ck/ck.hpp>
+
+#include "ck_fmha_grouped_infer_hip.h"
+
+template void run_grouped_infer_masktype_attnbias_dispatched<
+    ck::bhalf_t,
+    2,
+    false>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_infer_bp16_masktype_2_with_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_infer_bp16_masktype_2_with_attnbias.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/ck_fmha_grouped_infer_bp16_masktype_2_with_attnbias.cpp
rename to xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_infer_bp16_masktype_2_with_attnbias.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_infer_bp16_masktype_2_with_attnbias.cu b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_infer_bp16_masktype_2_with_attnbias.cu
new file mode 100644
index 000000000..c157e89c1
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_infer_bp16_masktype_2_with_attnbias.cu
@@ -0,0 +1,8 @@
+#include <ck/ck.hpp>
+
+#include "ck_fmha_grouped_infer.h"
+
+template void run_grouped_infer_masktype_attnbias_dispatched<
+    ck::bhalf_t,
+    2,
+    true>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_infer_bp16_masktype_2_with_attnbias.hip b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_infer_bp16_masktype_2_with_attnbias.hip
new file mode 100644
index 000000000..682f3e97e
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_infer_bp16_masktype_2_with_attnbias.hip
@@ -0,0 +1,9 @@
+// !!! This is a file automatically generated by hipify!!!
+#include <ck/ck.hpp>
+
+#include "ck_fmha_grouped_infer_hip.h"
+
+template void run_grouped_infer_masktype_attnbias_dispatched<
+    ck::bhalf_t,
+    2,
+    true>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_infer_fp16_masktype_0_no_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_infer_fp16_masktype_0_no_attnbias.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/ck_fmha_grouped_infer_fp16_masktype_0_no_attnbias.cpp
rename to xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_infer_fp16_masktype_0_no_attnbias.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_infer_fp16_masktype_0_no_attnbias.cu b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_infer_fp16_masktype_0_no_attnbias.cu
new file mode 100644
index 000000000..bbcd3ab0e
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_infer_fp16_masktype_0_no_attnbias.cu
@@ -0,0 +1,8 @@
+#include <ck/ck.hpp>
+
+#include "ck_fmha_grouped_infer.h"
+
+template void run_grouped_infer_masktype_attnbias_dispatched<
+    ck::half_t,
+    0,
+    false>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_infer_fp16_masktype_0_no_attnbias.hip b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_infer_fp16_masktype_0_no_attnbias.hip
new file mode 100644
index 000000000..c1fbe2d06
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_infer_fp16_masktype_0_no_attnbias.hip
@@ -0,0 +1,9 @@
+// !!! This is a file automatically generated by hipify!!!
+#include <ck/ck.hpp>
+
+#include "ck_fmha_grouped_infer_hip.h"
+
+template void run_grouped_infer_masktype_attnbias_dispatched<
+    ck::half_t,
+    0,
+    false>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_infer_fp16_masktype_0_with_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_infer_fp16_masktype_0_with_attnbias.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/ck_fmha_grouped_infer_fp16_masktype_0_with_attnbias.cpp
rename to xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_infer_fp16_masktype_0_with_attnbias.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_infer_fp16_masktype_0_with_attnbias.cu b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_infer_fp16_masktype_0_with_attnbias.cu
new file mode 100644
index 000000000..e320f5de6
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_infer_fp16_masktype_0_with_attnbias.cu
@@ -0,0 +1,8 @@
+#include <ck/ck.hpp>
+
+#include "ck_fmha_grouped_infer.h"
+
+template void run_grouped_infer_masktype_attnbias_dispatched<
+    ck::half_t,
+    0,
+    true>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_infer_fp16_masktype_0_with_attnbias.hip b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_infer_fp16_masktype_0_with_attnbias.hip
new file mode 100644
index 000000000..3e8dbbe7e
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_infer_fp16_masktype_0_with_attnbias.hip
@@ -0,0 +1,9 @@
+// !!! This is a file automatically generated by hipify!!!
+#include <ck/ck.hpp>
+
+#include "ck_fmha_grouped_infer_hip.h"
+
+template void run_grouped_infer_masktype_attnbias_dispatched<
+    ck::half_t,
+    0,
+    true>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_infer_fp16_masktype_1_no_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_infer_fp16_masktype_1_no_attnbias.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/ck_fmha_grouped_infer_fp16_masktype_1_no_attnbias.cpp
rename to xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_infer_fp16_masktype_1_no_attnbias.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_infer_fp16_masktype_1_no_attnbias.cu b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_infer_fp16_masktype_1_no_attnbias.cu
new file mode 100644
index 000000000..e763dde6a
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_infer_fp16_masktype_1_no_attnbias.cu
@@ -0,0 +1,8 @@
+#include <ck/ck.hpp>
+
+#include "ck_fmha_grouped_infer.h"
+
+template void run_grouped_infer_masktype_attnbias_dispatched<
+    ck::half_t,
+    1,
+    false>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_infer_fp16_masktype_1_no_attnbias.hip b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_infer_fp16_masktype_1_no_attnbias.hip
new file mode 100644
index 000000000..e302c675d
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_infer_fp16_masktype_1_no_attnbias.hip
@@ -0,0 +1,9 @@
+// !!! This is a file automatically generated by hipify!!!
+#include <ck/ck.hpp>
+
+#include "ck_fmha_grouped_infer_hip.h"
+
+template void run_grouped_infer_masktype_attnbias_dispatched<
+    ck::half_t,
+    1,
+    false>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_infer_fp16_masktype_1_with_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_infer_fp16_masktype_1_with_attnbias.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/ck_fmha_grouped_infer_fp16_masktype_1_with_attnbias.cpp
rename to xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_infer_fp16_masktype_1_with_attnbias.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_infer_fp16_masktype_1_with_attnbias.cu b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_infer_fp16_masktype_1_with_attnbias.cu
new file mode 100644
index 000000000..3ec2d41da
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_infer_fp16_masktype_1_with_attnbias.cu
@@ -0,0 +1,8 @@
+#include <ck/ck.hpp>
+
+#include "ck_fmha_grouped_infer.h"
+
+template void run_grouped_infer_masktype_attnbias_dispatched<
+    ck::half_t,
+    1,
+    true>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_infer_fp16_masktype_1_with_attnbias.hip b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_infer_fp16_masktype_1_with_attnbias.hip
new file mode 100644
index 000000000..52666509b
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_infer_fp16_masktype_1_with_attnbias.hip
@@ -0,0 +1,9 @@
+// !!! This is a file automatically generated by hipify!!!
+#include <ck/ck.hpp>
+
+#include "ck_fmha_grouped_infer_hip.h"
+
+template void run_grouped_infer_masktype_attnbias_dispatched<
+    ck::half_t,
+    1,
+    true>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_infer_fp16_masktype_2_no_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_infer_fp16_masktype_2_no_attnbias.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/ck_fmha_grouped_infer_fp16_masktype_2_no_attnbias.cpp
rename to xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_infer_fp16_masktype_2_no_attnbias.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_infer_fp16_masktype_2_no_attnbias.cu b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_infer_fp16_masktype_2_no_attnbias.cu
new file mode 100644
index 000000000..dee7a0845
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_infer_fp16_masktype_2_no_attnbias.cu
@@ -0,0 +1,8 @@
+#include <ck/ck.hpp>
+
+#include "ck_fmha_grouped_infer.h"
+
+template void run_grouped_infer_masktype_attnbias_dispatched<
+    ck::half_t,
+    2,
+    false>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_infer_fp16_masktype_2_no_attnbias.hip b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_infer_fp16_masktype_2_no_attnbias.hip
new file mode 100644
index 000000000..c1a0026b3
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_infer_fp16_masktype_2_no_attnbias.hip
@@ -0,0 +1,9 @@
+// !!! This is a file automatically generated by hipify!!!
+#include <ck/ck.hpp>
+
+#include "ck_fmha_grouped_infer_hip.h"
+
+template void run_grouped_infer_masktype_attnbias_dispatched<
+    ck::half_t,
+    2,
+    false>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_infer_fp16_masktype_2_with_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_infer_fp16_masktype_2_with_attnbias.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/ck_fmha_grouped_infer_fp16_masktype_2_with_attnbias.cpp
rename to xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_infer_fp16_masktype_2_with_attnbias.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_infer_fp16_masktype_2_with_attnbias.cu b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_infer_fp16_masktype_2_with_attnbias.cu
new file mode 100644
index 000000000..b5515e9a0
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_infer_fp16_masktype_2_with_attnbias.cu
@@ -0,0 +1,8 @@
+#include <ck/ck.hpp>
+
+#include "ck_fmha_grouped_infer.h"
+
+template void run_grouped_infer_masktype_attnbias_dispatched<
+    ck::half_t,
+    2,
+    true>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_infer_fp16_masktype_2_with_attnbias.hip b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_infer_fp16_masktype_2_with_attnbias.hip
new file mode 100644
index 000000000..035531ad3
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_infer_fp16_masktype_2_with_attnbias.hip
@@ -0,0 +1,9 @@
+// !!! This is a file automatically generated by hipify!!!
+#include <ck/ck.hpp>
+
+#include "ck_fmha_grouped_infer_hip.h"
+
+template void run_grouped_infer_masktype_attnbias_dispatched<
+    ck::half_t,
+    2,
+    true>(GroupedForwardParams& param, hipStream_t stream);

From 5166c78185296651052210b0dd0f5084d19c62a8 Mon Sep 17 00:00:00 2001
From: Qianfeng Zhang <Qianfeng.Zhang@amd.com>
Date: Wed, 8 Nov 2023 15:21:17 +0000
Subject: [PATCH 192/641] Split backward instance .cpp files

---
 ...ha_batched_backward_bp16_masktype_0_no_attnbias.cpp |  8 +-------
 ..._backward_bp16_masktype_0_no_attnbias_fp32_grad.cpp |  8 ++++++++
 ..._batched_backward_bp16_masktype_0_with_attnbias.cpp |  8 +-------
 ...ackward_bp16_masktype_0_with_attnbias_fp32_grad.cpp |  8 ++++++++
 ...ha_batched_backward_bp16_masktype_1_no_attnbias.cpp |  8 +-------
 ..._backward_bp16_masktype_1_no_attnbias_fp32_grad.cpp |  8 ++++++++
 ..._batched_backward_bp16_masktype_1_with_attnbias.cpp |  8 +-------
 ...ackward_bp16_masktype_1_with_attnbias_fp32_grad.cpp |  8 ++++++++
 ...ha_batched_backward_bp16_masktype_2_no_attnbias.cpp |  8 +-------
 ..._backward_bp16_masktype_2_no_attnbias_fp32_grad.cpp |  8 ++++++++
 ..._batched_backward_bp16_masktype_2_with_attnbias.cpp |  8 +-------
 ...ackward_bp16_masktype_2_with_attnbias_fp32_grad.cpp |  8 ++++++++
 ...ha_batched_backward_fp16_masktype_0_no_attnbias.cpp |  8 +-------
 ..._backward_fp16_masktype_0_no_attnbias_fp32_grad.cpp |  8 ++++++++
 ..._batched_backward_fp16_masktype_0_with_attnbias.cpp |  8 +-------
 ...ackward_fp16_masktype_0_with_attnbias_fp32_grad.cpp |  8 ++++++++
 ...ha_batched_backward_fp16_masktype_1_no_attnbias.cpp |  8 +-------
 ..._backward_fp16_masktype_1_no_attnbias_fp32_grad.cpp |  8 ++++++++
 ..._batched_backward_fp16_masktype_1_with_attnbias.cpp | 10 +---------
 ...ackward_fp16_masktype_1_with_attnbias_fp32_grad.cpp | 10 ++++++++++
 ...ha_batched_backward_fp16_masktype_2_no_attnbias.cpp |  8 +-------
 ..._backward_fp16_masktype_2_no_attnbias_fp32_grad.cpp |  8 ++++++++
 ..._batched_backward_fp16_masktype_2_with_attnbias.cpp |  8 +-------
 ...ackward_fp16_masktype_2_with_attnbias_fp32_grad.cpp |  8 ++++++++
 ...ha_grouped_backward_bp16_masktype_0_no_attnbias.cpp |  6 ------
 ..._backward_bp16_masktype_0_no_attnbias_fp32_grad.cpp |  8 ++++++++
 ..._grouped_backward_bp16_masktype_0_with_attnbias.cpp |  6 ------
 ...ackward_bp16_masktype_0_with_attnbias_fp32_grad.cpp |  8 ++++++++
 ...ha_grouped_backward_bp16_masktype_1_no_attnbias.cpp |  6 ------
 ..._backward_bp16_masktype_1_no_attnbias_fp32_grad.cpp |  8 ++++++++
 ..._grouped_backward_bp16_masktype_1_with_attnbias.cpp |  6 ------
 ...ackward_bp16_masktype_1_with_attnbias_fp32_grad.cpp |  8 ++++++++
 ...ha_grouped_backward_bp16_masktype_2_no_attnbias.cpp |  6 ------
 ..._backward_bp16_masktype_2_no_attnbias_fp32_grad.cpp |  8 ++++++++
 ..._grouped_backward_bp16_masktype_2_with_attnbias.cpp |  6 ------
 ...ackward_bp16_masktype_2_with_attnbias_fp32_grad.cpp |  8 ++++++++
 ...ha_grouped_backward_fp16_masktype_0_no_attnbias.cpp |  6 ------
 ..._backward_fp16_masktype_0_no_attnbias_fp32_grad.cpp |  8 ++++++++
 ..._grouped_backward_fp16_masktype_0_with_attnbias.cpp |  6 ------
 ...ackward_fp16_masktype_0_with_attnbias_fp32_grad.cpp |  8 ++++++++
 ...ha_grouped_backward_fp16_masktype_1_no_attnbias.cpp |  6 ------
 ..._backward_fp16_masktype_1_no_attnbias_fp32_grad.cpp |  8 ++++++++
 ..._grouped_backward_fp16_masktype_1_with_attnbias.cpp |  6 ------
 ...ackward_fp16_masktype_1_with_attnbias_fp32_grad.cpp |  8 ++++++++
 ...ha_grouped_backward_fp16_masktype_2_no_attnbias.cpp |  6 ------
 ..._backward_fp16_masktype_2_no_attnbias_fp32_grad.cpp |  8 ++++++++
 ..._grouped_backward_fp16_masktype_2_with_attnbias.cpp |  6 ------
 ...ackward_fp16_masktype_2_with_attnbias_fp32_grad.cpp |  8 ++++++++
 48 files changed, 206 insertions(+), 158 deletions(-)
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_bp16_masktype_0_no_attnbias_fp32_grad.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_bp16_masktype_0_with_attnbias_fp32_grad.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_bp16_masktype_1_no_attnbias_fp32_grad.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_bp16_masktype_1_with_attnbias_fp32_grad.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_bp16_masktype_2_no_attnbias_fp32_grad.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_bp16_masktype_2_with_attnbias_fp32_grad.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_fp16_masktype_0_no_attnbias_fp32_grad.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_fp16_masktype_0_with_attnbias_fp32_grad.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_fp16_masktype_1_no_attnbias_fp32_grad.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_fp16_masktype_1_with_attnbias_fp32_grad.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_fp16_masktype_2_no_attnbias_fp32_grad.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_fp16_masktype_2_with_attnbias_fp32_grad.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_bp16_masktype_0_no_attnbias_fp32_grad.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_bp16_masktype_0_with_attnbias_fp32_grad.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_bp16_masktype_1_no_attnbias_fp32_grad.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_bp16_masktype_1_with_attnbias_fp32_grad.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_bp16_masktype_2_no_attnbias_fp32_grad.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_bp16_masktype_2_with_attnbias_fp32_grad.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_fp16_masktype_0_no_attnbias_fp32_grad.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_fp16_masktype_0_with_attnbias_fp32_grad.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_fp16_masktype_1_no_attnbias_fp32_grad.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_fp16_masktype_1_with_attnbias_fp32_grad.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_fp16_masktype_2_no_attnbias_fp32_grad.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_fp16_masktype_2_with_attnbias_fp32_grad.cpp

diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_bp16_masktype_0_no_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_bp16_masktype_0_no_attnbias.cpp
index 2bf962a9f..8eb17a9f9 100644
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_bp16_masktype_0_no_attnbias.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_bp16_masktype_0_no_attnbias.cpp
@@ -1,12 +1,6 @@
-#include <ck.hpp>
+#include <ck/ck.hpp>
 #include "ck_fmha_batched_backward.h"
 
-template void run_batched_backward_masktype_attnbias_dispatched<
-    ck::bhalf_t,
-    0,
-    false,
-    true>(BatchedBackwardParams& param, hipStream_t stream);
-
 template void run_batched_backward_masktype_attnbias_dispatched<
     ck::bhalf_t,
     0,
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_bp16_masktype_0_no_attnbias_fp32_grad.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_bp16_masktype_0_no_attnbias_fp32_grad.cpp
new file mode 100644
index 000000000..670398c1e
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_bp16_masktype_0_no_attnbias_fp32_grad.cpp
@@ -0,0 +1,8 @@
+#include <ck/ck.hpp>
+#include "ck_fmha_batched_backward.h"
+
+template void run_batched_backward_masktype_attnbias_dispatched<
+    ck::bhalf_t,
+    0,
+    false,
+    true>(BatchedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_bp16_masktype_0_with_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_bp16_masktype_0_with_attnbias.cpp
index b3c5bbf70..1dbab2746 100644
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_bp16_masktype_0_with_attnbias.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_bp16_masktype_0_with_attnbias.cpp
@@ -1,12 +1,6 @@
-#include <ck.hpp>
+#include <ck/ck.hpp>
 #include "ck_fmha_batched_backward.h"
 
-template void run_batched_backward_masktype_attnbias_dispatched<
-    ck::bhalf_t,
-    0,
-    true,
-    true>(BatchedBackwardParams& param, hipStream_t stream);
-
 template void run_batched_backward_masktype_attnbias_dispatched<
     ck::bhalf_t,
     0,
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_bp16_masktype_0_with_attnbias_fp32_grad.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_bp16_masktype_0_with_attnbias_fp32_grad.cpp
new file mode 100644
index 000000000..ba06daf03
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_bp16_masktype_0_with_attnbias_fp32_grad.cpp
@@ -0,0 +1,8 @@
+#include <ck/ck.hpp>
+#include "ck_fmha_batched_backward.h"
+
+template void run_batched_backward_masktype_attnbias_dispatched<
+    ck::bhalf_t,
+    0,
+    true,
+    true>(BatchedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_bp16_masktype_1_no_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_bp16_masktype_1_no_attnbias.cpp
index 4a96b4a3d..97b4eb36a 100644
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_bp16_masktype_1_no_attnbias.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_bp16_masktype_1_no_attnbias.cpp
@@ -1,12 +1,6 @@
-#include <ck.hpp>
+#include <ck/ck.hpp>
 #include "ck_fmha_batched_backward.h"
 
-template void run_batched_backward_masktype_attnbias_dispatched<
-    ck::bhalf_t,
-    1,
-    false,
-    true>(BatchedBackwardParams& param, hipStream_t stream);
-
 template void run_batched_backward_masktype_attnbias_dispatched<
     ck::bhalf_t,
     1,
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_bp16_masktype_1_no_attnbias_fp32_grad.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_bp16_masktype_1_no_attnbias_fp32_grad.cpp
new file mode 100644
index 000000000..8458f70ae
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_bp16_masktype_1_no_attnbias_fp32_grad.cpp
@@ -0,0 +1,8 @@
+#include <ck/ck.hpp>
+#include "ck_fmha_batched_backward.h"
+
+template void run_batched_backward_masktype_attnbias_dispatched<
+    ck::bhalf_t,
+    1,
+    false,
+    true>(BatchedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_bp16_masktype_1_with_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_bp16_masktype_1_with_attnbias.cpp
index 37ec0f03c..d7b92c451 100644
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_bp16_masktype_1_with_attnbias.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_bp16_masktype_1_with_attnbias.cpp
@@ -1,12 +1,6 @@
-#include <ck.hpp>
+#include <ck/ck.hpp>
 #include "ck_fmha_batched_backward.h"
 
-template void run_batched_backward_masktype_attnbias_dispatched<
-    ck::bhalf_t,
-    1,
-    true,
-    true>(BatchedBackwardParams& param, hipStream_t stream);
-
 template void run_batched_backward_masktype_attnbias_dispatched<
     ck::bhalf_t,
     1,
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_bp16_masktype_1_with_attnbias_fp32_grad.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_bp16_masktype_1_with_attnbias_fp32_grad.cpp
new file mode 100644
index 000000000..1c1167c58
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_bp16_masktype_1_with_attnbias_fp32_grad.cpp
@@ -0,0 +1,8 @@
+#include <ck/ck.hpp>
+#include "ck_fmha_batched_backward.h"
+
+template void run_batched_backward_masktype_attnbias_dispatched<
+    ck::bhalf_t,
+    1,
+    true,
+    true>(BatchedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_bp16_masktype_2_no_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_bp16_masktype_2_no_attnbias.cpp
index c80a47952..9dbae4cac 100644
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_bp16_masktype_2_no_attnbias.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_bp16_masktype_2_no_attnbias.cpp
@@ -1,12 +1,6 @@
-#include <ck.hpp>
+#include <ck/ck.hpp>
 #include "ck_fmha_batched_backward.h"
 
-template void run_batched_backward_masktype_attnbias_dispatched<
-    ck::bhalf_t,
-    2,
-    false,
-    true>(BatchedBackwardParams& param, hipStream_t stream);
-
 template void run_batched_backward_masktype_attnbias_dispatched<
     ck::bhalf_t,
     2,
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_bp16_masktype_2_no_attnbias_fp32_grad.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_bp16_masktype_2_no_attnbias_fp32_grad.cpp
new file mode 100644
index 000000000..f38a2c7b8
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_bp16_masktype_2_no_attnbias_fp32_grad.cpp
@@ -0,0 +1,8 @@
+#include <ck/ck.hpp>
+#include "ck_fmha_batched_backward.h"
+
+template void run_batched_backward_masktype_attnbias_dispatched<
+    ck::bhalf_t,
+    2,
+    false,
+    true>(BatchedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_bp16_masktype_2_with_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_bp16_masktype_2_with_attnbias.cpp
index c1dc61c5a..522e2951a 100644
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_bp16_masktype_2_with_attnbias.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_bp16_masktype_2_with_attnbias.cpp
@@ -1,12 +1,6 @@
-#include <ck.hpp>
+#include <ck/ck.hpp>
 #include "ck_fmha_batched_backward.h"
 
-template void run_batched_backward_masktype_attnbias_dispatched<
-    ck::bhalf_t,
-    2,
-    true,
-    true>(BatchedBackwardParams& param, hipStream_t stream);
-
 template void run_batched_backward_masktype_attnbias_dispatched<
     ck::bhalf_t,
     2,
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_bp16_masktype_2_with_attnbias_fp32_grad.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_bp16_masktype_2_with_attnbias_fp32_grad.cpp
new file mode 100644
index 000000000..041e4d4df
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_bp16_masktype_2_with_attnbias_fp32_grad.cpp
@@ -0,0 +1,8 @@
+#include <ck/ck.hpp>
+#include "ck_fmha_batched_backward.h"
+
+template void run_batched_backward_masktype_attnbias_dispatched<
+    ck::bhalf_t,
+    2,
+    true,
+    true>(BatchedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_fp16_masktype_0_no_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_fp16_masktype_0_no_attnbias.cpp
index 46caaa20d..bc9a2948d 100644
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_fp16_masktype_0_no_attnbias.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_fp16_masktype_0_no_attnbias.cpp
@@ -1,12 +1,6 @@
-#include <ck.hpp>
+#include <ck/ck.hpp>
 #include "ck_fmha_batched_backward.h"
 
-template void run_batched_backward_masktype_attnbias_dispatched<
-    ck::half_t,
-    0,
-    false,
-    true>(BatchedBackwardParams& param, hipStream_t stream);
-
 template void run_batched_backward_masktype_attnbias_dispatched<
     ck::half_t,
     0,
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_fp16_masktype_0_no_attnbias_fp32_grad.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_fp16_masktype_0_no_attnbias_fp32_grad.cpp
new file mode 100644
index 000000000..e654ca13a
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_fp16_masktype_0_no_attnbias_fp32_grad.cpp
@@ -0,0 +1,8 @@
+#include <ck/ck.hpp>
+#include "ck_fmha_batched_backward.h"
+
+template void run_batched_backward_masktype_attnbias_dispatched<
+    ck::half_t,
+    0,
+    false,
+    true>(BatchedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_fp16_masktype_0_with_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_fp16_masktype_0_with_attnbias.cpp
index c328beb8d..4a2376a72 100644
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_fp16_masktype_0_with_attnbias.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_fp16_masktype_0_with_attnbias.cpp
@@ -1,12 +1,6 @@
-#include <ck.hpp>
+#include <ck/ck.hpp>
 #include "ck_fmha_batched_backward.h"
 
-template void run_batched_backward_masktype_attnbias_dispatched<
-    ck::half_t,
-    0,
-    true,
-    true>(BatchedBackwardParams& param, hipStream_t stream);
-
 template void run_batched_backward_masktype_attnbias_dispatched<
     ck::half_t,
     0,
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_fp16_masktype_0_with_attnbias_fp32_grad.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_fp16_masktype_0_with_attnbias_fp32_grad.cpp
new file mode 100644
index 000000000..66765de59
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_fp16_masktype_0_with_attnbias_fp32_grad.cpp
@@ -0,0 +1,8 @@
+#include <ck/ck.hpp>
+#include "ck_fmha_batched_backward.h"
+
+template void run_batched_backward_masktype_attnbias_dispatched<
+    ck::half_t,
+    0,
+    true,
+    true>(BatchedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_fp16_masktype_1_no_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_fp16_masktype_1_no_attnbias.cpp
index 2897cba5d..9609900d2 100644
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_fp16_masktype_1_no_attnbias.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_fp16_masktype_1_no_attnbias.cpp
@@ -1,12 +1,6 @@
-#include <ck.hpp>
+#include <ck/ck.hpp>
 #include "ck_fmha_batched_backward.h"
 
-template void run_batched_backward_masktype_attnbias_dispatched<
-    ck::half_t,
-    1,
-    false,
-    true>(BatchedBackwardParams& param, hipStream_t stream);
-
 template void run_batched_backward_masktype_attnbias_dispatched<
     ck::half_t,
     1,
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_fp16_masktype_1_no_attnbias_fp32_grad.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_fp16_masktype_1_no_attnbias_fp32_grad.cpp
new file mode 100644
index 000000000..aa4d7ff70
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_fp16_masktype_1_no_attnbias_fp32_grad.cpp
@@ -0,0 +1,8 @@
+#include <ck/ck.hpp>
+#include "ck_fmha_batched_backward.h"
+
+template void run_batched_backward_masktype_attnbias_dispatched<
+    ck::half_t,
+    1,
+    false,
+    true>(BatchedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_fp16_masktype_1_with_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_fp16_masktype_1_with_attnbias.cpp
index 62b82e22a..72715c6dc 100644
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_fp16_masktype_1_with_attnbias.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_fp16_masktype_1_with_attnbias.cpp
@@ -1,14 +1,6 @@
-#include <ck.hpp>
-#include <stdexcept>
-
+#include <ck/ck.hpp>
 #include "ck_fmha_batched_backward.h"
 
-template void run_batched_backward_masktype_attnbias_dispatched<
-    ck::half_t,
-    1,
-    true,
-    true>(BatchedBackwardParams& param, hipStream_t stream);
-
 template void run_batched_backward_masktype_attnbias_dispatched<
     ck::half_t,
     1,
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_fp16_masktype_1_with_attnbias_fp32_grad.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_fp16_masktype_1_with_attnbias_fp32_grad.cpp
new file mode 100644
index 000000000..7e6245db4
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_fp16_masktype_1_with_attnbias_fp32_grad.cpp
@@ -0,0 +1,10 @@
+#include <ck/ck.hpp>
+#include <stdexcept>
+
+#include "ck_fmha_batched_backward.h"
+
+template void run_batched_backward_masktype_attnbias_dispatched<
+    ck::half_t,
+    1,
+    true,
+    true>(BatchedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_fp16_masktype_2_no_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_fp16_masktype_2_no_attnbias.cpp
index 1ea6309d6..d2707dde7 100644
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_fp16_masktype_2_no_attnbias.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_fp16_masktype_2_no_attnbias.cpp
@@ -1,12 +1,6 @@
-#include <ck.hpp>
+#include <ck/ck.hpp>
 #include "ck_fmha_batched_backward.h"
 
-template void run_batched_backward_masktype_attnbias_dispatched<
-    ck::half_t,
-    2,
-    false,
-    true>(BatchedBackwardParams& param, hipStream_t stream);
-
 template void run_batched_backward_masktype_attnbias_dispatched<
     ck::half_t,
     2,
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_fp16_masktype_2_no_attnbias_fp32_grad.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_fp16_masktype_2_no_attnbias_fp32_grad.cpp
new file mode 100644
index 000000000..598db5503
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_fp16_masktype_2_no_attnbias_fp32_grad.cpp
@@ -0,0 +1,8 @@
+#include <ck/ck.hpp>
+#include "ck_fmha_batched_backward.h"
+
+template void run_batched_backward_masktype_attnbias_dispatched<
+    ck::half_t,
+    2,
+    false,
+    true>(BatchedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_fp16_masktype_2_with_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_fp16_masktype_2_with_attnbias.cpp
index 24f2ce4b2..28640755d 100644
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_fp16_masktype_2_with_attnbias.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_fp16_masktype_2_with_attnbias.cpp
@@ -1,12 +1,6 @@
-#include <ck.hpp>
+#include <ck/ck.hpp>
 #include "ck_fmha_batched_backward.h"
 
-template void run_batched_backward_masktype_attnbias_dispatched<
-    ck::half_t,
-    2,
-    true,
-    true>(BatchedBackwardParams& param, hipStream_t stream);
-
 template void run_batched_backward_masktype_attnbias_dispatched<
     ck::half_t,
     2,
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_fp16_masktype_2_with_attnbias_fp32_grad.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_fp16_masktype_2_with_attnbias_fp32_grad.cpp
new file mode 100644
index 000000000..d3922d621
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_fp16_masktype_2_with_attnbias_fp32_grad.cpp
@@ -0,0 +1,8 @@
+#include <ck/ck.hpp>
+#include "ck_fmha_batched_backward.h"
+
+template void run_batched_backward_masktype_attnbias_dispatched<
+    ck::half_t,
+    2,
+    true,
+    true>(BatchedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_bp16_masktype_0_no_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_bp16_masktype_0_no_attnbias.cpp
index 1b261e938..82d7b1f00 100644
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_bp16_masktype_0_no_attnbias.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_bp16_masktype_0_no_attnbias.cpp
@@ -1,12 +1,6 @@
 #include <ck/ck.hpp>
 #include "ck_fmha_grouped_backward.h"
 
-template void run_grouped_backward_masktype_attnbias_dispatched<
-    ck::bhalf_t,
-    0,
-    false,
-    true>(GroupedBackwardParams& param, hipStream_t stream);
-
 template void run_grouped_backward_masktype_attnbias_dispatched<
     ck::bhalf_t,
     0,
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_bp16_masktype_0_no_attnbias_fp32_grad.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_bp16_masktype_0_no_attnbias_fp32_grad.cpp
new file mode 100644
index 000000000..2327c6c3c
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_bp16_masktype_0_no_attnbias_fp32_grad.cpp
@@ -0,0 +1,8 @@
+#include <ck/ck.hpp>
+#include "ck_fmha_grouped_backward.h"
+
+template void run_grouped_backward_masktype_attnbias_dispatched<
+    ck::bhalf_t,
+    0,
+    false,
+    true>(GroupedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_bp16_masktype_0_with_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_bp16_masktype_0_with_attnbias.cpp
index 8cb42c808..945a91a99 100644
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_bp16_masktype_0_with_attnbias.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_bp16_masktype_0_with_attnbias.cpp
@@ -1,12 +1,6 @@
 #include <ck/ck.hpp>
 #include "ck_fmha_grouped_backward.h"
 
-template void run_grouped_backward_masktype_attnbias_dispatched<
-    ck::bhalf_t,
-    0,
-    true,
-    true>(GroupedBackwardParams& param, hipStream_t stream);
-
 template void run_grouped_backward_masktype_attnbias_dispatched<
     ck::bhalf_t,
     0,
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_bp16_masktype_0_with_attnbias_fp32_grad.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_bp16_masktype_0_with_attnbias_fp32_grad.cpp
new file mode 100644
index 000000000..ea443ab4b
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_bp16_masktype_0_with_attnbias_fp32_grad.cpp
@@ -0,0 +1,8 @@
+#include <ck/ck.hpp>
+#include "ck_fmha_grouped_backward.h"
+
+template void run_grouped_backward_masktype_attnbias_dispatched<
+    ck::bhalf_t,
+    0,
+    true,
+    true>(GroupedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_bp16_masktype_1_no_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_bp16_masktype_1_no_attnbias.cpp
index ebefe8bab..daa0dc1c7 100644
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_bp16_masktype_1_no_attnbias.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_bp16_masktype_1_no_attnbias.cpp
@@ -1,12 +1,6 @@
 #include <ck/ck.hpp>
 #include "ck_fmha_grouped_backward.h"
 
-template void run_grouped_backward_masktype_attnbias_dispatched<
-    ck::bhalf_t,
-    1,
-    false,
-    true>(GroupedBackwardParams& param, hipStream_t stream);
-
 template void run_grouped_backward_masktype_attnbias_dispatched<
     ck::bhalf_t,
     1,
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_bp16_masktype_1_no_attnbias_fp32_grad.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_bp16_masktype_1_no_attnbias_fp32_grad.cpp
new file mode 100644
index 000000000..b8273b2d6
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_bp16_masktype_1_no_attnbias_fp32_grad.cpp
@@ -0,0 +1,8 @@
+#include <ck/ck.hpp>
+#include "ck_fmha_grouped_backward.h"
+
+template void run_grouped_backward_masktype_attnbias_dispatched<
+    ck::bhalf_t,
+    1,
+    false,
+    true>(GroupedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_bp16_masktype_1_with_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_bp16_masktype_1_with_attnbias.cpp
index 1d7de293e..6496bca76 100644
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_bp16_masktype_1_with_attnbias.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_bp16_masktype_1_with_attnbias.cpp
@@ -1,12 +1,6 @@
 #include <ck/ck.hpp>
 #include "ck_fmha_grouped_backward.h"
 
-template void run_grouped_backward_masktype_attnbias_dispatched<
-    ck::bhalf_t,
-    1,
-    true,
-    true>(GroupedBackwardParams& param, hipStream_t stream);
-
 template void run_grouped_backward_masktype_attnbias_dispatched<
     ck::bhalf_t,
     1,
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_bp16_masktype_1_with_attnbias_fp32_grad.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_bp16_masktype_1_with_attnbias_fp32_grad.cpp
new file mode 100644
index 000000000..d2cf1d5df
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_bp16_masktype_1_with_attnbias_fp32_grad.cpp
@@ -0,0 +1,8 @@
+#include <ck/ck.hpp>
+#include "ck_fmha_grouped_backward.h"
+
+template void run_grouped_backward_masktype_attnbias_dispatched<
+    ck::bhalf_t,
+    1,
+    true,
+    true>(GroupedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_bp16_masktype_2_no_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_bp16_masktype_2_no_attnbias.cpp
index 524fb30e5..7ae9b06f5 100644
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_bp16_masktype_2_no_attnbias.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_bp16_masktype_2_no_attnbias.cpp
@@ -1,12 +1,6 @@
 #include <ck/ck.hpp>
 #include "ck_fmha_grouped_backward.h"
 
-template void run_grouped_backward_masktype_attnbias_dispatched<
-    ck::bhalf_t,
-    2,
-    false,
-    true>(GroupedBackwardParams& param, hipStream_t stream);
-
 template void run_grouped_backward_masktype_attnbias_dispatched<
     ck::bhalf_t,
     2,
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_bp16_masktype_2_no_attnbias_fp32_grad.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_bp16_masktype_2_no_attnbias_fp32_grad.cpp
new file mode 100644
index 000000000..13a1bd476
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_bp16_masktype_2_no_attnbias_fp32_grad.cpp
@@ -0,0 +1,8 @@
+#include <ck/ck.hpp>
+#include "ck_fmha_grouped_backward.h"
+
+template void run_grouped_backward_masktype_attnbias_dispatched<
+    ck::bhalf_t,
+    2,
+    false,
+    true>(GroupedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_bp16_masktype_2_with_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_bp16_masktype_2_with_attnbias.cpp
index 58f2f8b1a..01d292154 100644
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_bp16_masktype_2_with_attnbias.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_bp16_masktype_2_with_attnbias.cpp
@@ -1,12 +1,6 @@
 #include <ck/ck.hpp>
 #include "ck_fmha_grouped_backward.h"
 
-template void run_grouped_backward_masktype_attnbias_dispatched<
-    ck::bhalf_t,
-    2,
-    true,
-    true>(GroupedBackwardParams& param, hipStream_t stream);
-
 template void run_grouped_backward_masktype_attnbias_dispatched<
     ck::bhalf_t,
     2,
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_bp16_masktype_2_with_attnbias_fp32_grad.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_bp16_masktype_2_with_attnbias_fp32_grad.cpp
new file mode 100644
index 000000000..22ec35865
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_bp16_masktype_2_with_attnbias_fp32_grad.cpp
@@ -0,0 +1,8 @@
+#include <ck/ck.hpp>
+#include "ck_fmha_grouped_backward.h"
+
+template void run_grouped_backward_masktype_attnbias_dispatched<
+    ck::bhalf_t,
+    2,
+    true,
+    true>(GroupedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_fp16_masktype_0_no_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_fp16_masktype_0_no_attnbias.cpp
index 1098e69be..ad20325d7 100644
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_fp16_masktype_0_no_attnbias.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_fp16_masktype_0_no_attnbias.cpp
@@ -1,12 +1,6 @@
 #include <ck/ck.hpp>
 #include "ck_fmha_grouped_backward.h"
 
-template void run_grouped_backward_masktype_attnbias_dispatched<
-    ck::half_t,
-    0,
-    false,
-    true>(GroupedBackwardParams& param, hipStream_t stream);
-
 template void run_grouped_backward_masktype_attnbias_dispatched<
     ck::half_t,
     0,
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_fp16_masktype_0_no_attnbias_fp32_grad.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_fp16_masktype_0_no_attnbias_fp32_grad.cpp
new file mode 100644
index 000000000..3ca75bc61
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_fp16_masktype_0_no_attnbias_fp32_grad.cpp
@@ -0,0 +1,8 @@
+#include <ck/ck.hpp>
+#include "ck_fmha_grouped_backward.h"
+
+template void run_grouped_backward_masktype_attnbias_dispatched<
+    ck::half_t,
+    0,
+    false,
+    true>(GroupedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_fp16_masktype_0_with_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_fp16_masktype_0_with_attnbias.cpp
index 60583a859..cd9bd1689 100644
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_fp16_masktype_0_with_attnbias.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_fp16_masktype_0_with_attnbias.cpp
@@ -1,12 +1,6 @@
 #include <ck/ck.hpp>
 #include "ck_fmha_grouped_backward.h"
 
-template void run_grouped_backward_masktype_attnbias_dispatched<
-    ck::half_t,
-    0,
-    true,
-    true>(GroupedBackwardParams& param, hipStream_t stream);
-
 template void run_grouped_backward_masktype_attnbias_dispatched<
     ck::half_t,
     0,
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_fp16_masktype_0_with_attnbias_fp32_grad.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_fp16_masktype_0_with_attnbias_fp32_grad.cpp
new file mode 100644
index 000000000..8cbdcc253
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_fp16_masktype_0_with_attnbias_fp32_grad.cpp
@@ -0,0 +1,8 @@
+#include <ck/ck.hpp>
+#include "ck_fmha_grouped_backward.h"
+
+template void run_grouped_backward_masktype_attnbias_dispatched<
+    ck::half_t,
+    0,
+    true,
+    true>(GroupedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_fp16_masktype_1_no_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_fp16_masktype_1_no_attnbias.cpp
index b8aabeb86..2241fb932 100644
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_fp16_masktype_1_no_attnbias.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_fp16_masktype_1_no_attnbias.cpp
@@ -1,12 +1,6 @@
 #include <ck/ck.hpp>
 #include "ck_fmha_grouped_backward.h"
 
-template void run_grouped_backward_masktype_attnbias_dispatched<
-    ck::half_t,
-    1,
-    false,
-    true>(GroupedBackwardParams& param, hipStream_t stream);
-
 template void run_grouped_backward_masktype_attnbias_dispatched<
     ck::half_t,
     1,
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_fp16_masktype_1_no_attnbias_fp32_grad.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_fp16_masktype_1_no_attnbias_fp32_grad.cpp
new file mode 100644
index 000000000..b82218a58
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_fp16_masktype_1_no_attnbias_fp32_grad.cpp
@@ -0,0 +1,8 @@
+#include <ck/ck.hpp>
+#include "ck_fmha_grouped_backward.h"
+
+template void run_grouped_backward_masktype_attnbias_dispatched<
+    ck::half_t,
+    1,
+    false,
+    true>(GroupedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_fp16_masktype_1_with_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_fp16_masktype_1_with_attnbias.cpp
index 8629a947a..914b28d27 100644
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_fp16_masktype_1_with_attnbias.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_fp16_masktype_1_with_attnbias.cpp
@@ -1,12 +1,6 @@
 #include <ck/ck.hpp>
 #include "ck_fmha_grouped_backward.h"
 
-template void run_grouped_backward_masktype_attnbias_dispatched<
-    ck::half_t,
-    1,
-    true,
-    true>(GroupedBackwardParams& param, hipStream_t stream);
-
 template void run_grouped_backward_masktype_attnbias_dispatched<
     ck::half_t,
     1,
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_fp16_masktype_1_with_attnbias_fp32_grad.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_fp16_masktype_1_with_attnbias_fp32_grad.cpp
new file mode 100644
index 000000000..c1eef0cec
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_fp16_masktype_1_with_attnbias_fp32_grad.cpp
@@ -0,0 +1,8 @@
+#include <ck/ck.hpp>
+#include "ck_fmha_grouped_backward.h"
+
+template void run_grouped_backward_masktype_attnbias_dispatched<
+    ck::half_t,
+    1,
+    true,
+    true>(GroupedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_fp16_masktype_2_no_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_fp16_masktype_2_no_attnbias.cpp
index 00b0f5c32..d97a398ee 100644
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_fp16_masktype_2_no_attnbias.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_fp16_masktype_2_no_attnbias.cpp
@@ -1,12 +1,6 @@
 #include <ck/ck.hpp>
 #include "ck_fmha_grouped_backward.h"
 
-template void run_grouped_backward_masktype_attnbias_dispatched<
-    ck::half_t,
-    2,
-    false,
-    true>(GroupedBackwardParams& param, hipStream_t stream);
-
 template void run_grouped_backward_masktype_attnbias_dispatched<
     ck::half_t,
     2,
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_fp16_masktype_2_no_attnbias_fp32_grad.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_fp16_masktype_2_no_attnbias_fp32_grad.cpp
new file mode 100644
index 000000000..5d21721d3
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_fp16_masktype_2_no_attnbias_fp32_grad.cpp
@@ -0,0 +1,8 @@
+#include <ck/ck.hpp>
+#include "ck_fmha_grouped_backward.h"
+
+template void run_grouped_backward_masktype_attnbias_dispatched<
+    ck::half_t,
+    2,
+    false,
+    true>(GroupedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_fp16_masktype_2_with_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_fp16_masktype_2_with_attnbias.cpp
index 8b6112aba..0cfac6111 100644
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_fp16_masktype_2_with_attnbias.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_fp16_masktype_2_with_attnbias.cpp
@@ -1,12 +1,6 @@
 #include <ck/ck.hpp>
 #include "ck_fmha_grouped_backward.h"
 
-template void run_grouped_backward_masktype_attnbias_dispatched<
-    ck::half_t,
-    2,
-    true,
-    true>(GroupedBackwardParams& param, hipStream_t stream);
-
 template void run_grouped_backward_masktype_attnbias_dispatched<
     ck::half_t,
     2,
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_fp16_masktype_2_with_attnbias_fp32_grad.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_fp16_masktype_2_with_attnbias_fp32_grad.cpp
new file mode 100644
index 000000000..551a46c9c
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_fp16_masktype_2_with_attnbias_fp32_grad.cpp
@@ -0,0 +1,8 @@
+#include <ck/ck.hpp>
+#include "ck_fmha_grouped_backward.h"
+
+template void run_grouped_backward_masktype_attnbias_dispatched<
+    ck::half_t,
+    2,
+    true,
+    true>(GroupedBackwardParams& param, hipStream_t stream);

From 1d3f7e625c4724e64fe6b0c50d3beae96e6ef4c8 Mon Sep 17 00:00:00 2001
From: Qianfeng Zhang <Qianfeng.Zhang@amd.com>
Date: Thu, 9 Nov 2023 12:48:30 +0000
Subject: [PATCH 193/641] Update to .gitignore

---
 .gitignore                                      |  2 ++
 ...ched_backward_bp16_masktype_0_no_attnbias.cu | 14 --------------
 ...hed_backward_bp16_masktype_0_no_attnbias.hip | 15 ---------------
 ...ed_backward_bp16_masktype_0_with_attnbias.cu | 14 --------------
 ...d_backward_bp16_masktype_0_with_attnbias.hip | 15 ---------------
 ...ched_backward_bp16_masktype_1_no_attnbias.cu | 14 --------------
 ...hed_backward_bp16_masktype_1_no_attnbias.hip | 15 ---------------
 ...ed_backward_bp16_masktype_1_with_attnbias.cu | 14 --------------
 ...d_backward_bp16_masktype_1_with_attnbias.hip | 15 ---------------
 ...ched_backward_bp16_masktype_2_no_attnbias.cu | 14 --------------
 ...hed_backward_bp16_masktype_2_no_attnbias.hip | 15 ---------------
 ...ed_backward_bp16_masktype_2_with_attnbias.cu | 14 --------------
 ...d_backward_bp16_masktype_2_with_attnbias.hip | 15 ---------------
 ...ched_backward_fp16_masktype_0_no_attnbias.cu | 14 --------------
 ...hed_backward_fp16_masktype_0_no_attnbias.hip | 15 ---------------
 ...ed_backward_fp16_masktype_0_with_attnbias.cu | 14 --------------
 ...d_backward_fp16_masktype_0_with_attnbias.hip | 15 ---------------
 ...ched_backward_fp16_masktype_1_no_attnbias.cu | 14 --------------
 ...hed_backward_fp16_masktype_1_no_attnbias.hip | 15 ---------------
 ...ed_backward_fp16_masktype_1_with_attnbias.cu | 16 ----------------
 ...d_backward_fp16_masktype_1_with_attnbias.hip | 17 -----------------
 ...ched_backward_fp16_masktype_2_no_attnbias.cu | 14 --------------
 ...hed_backward_fp16_masktype_2_no_attnbias.hip | 15 ---------------
 ...ed_backward_fp16_masktype_2_with_attnbias.cu | 14 --------------
 ...d_backward_fp16_masktype_2_with_attnbias.hip | 15 ---------------
 ...tched_forward_bp16_masktype_0_no_attnbias.cu |  7 -------
 ...ched_forward_bp16_masktype_0_no_attnbias.hip |  8 --------
 ...hed_forward_bp16_masktype_0_with_attnbias.cu |  7 -------
 ...ed_forward_bp16_masktype_0_with_attnbias.hip |  8 --------
 ...tched_forward_bp16_masktype_1_no_attnbias.cu |  7 -------
 ...ched_forward_bp16_masktype_1_no_attnbias.hip |  8 --------
 ...hed_forward_bp16_masktype_1_with_attnbias.cu |  7 -------
 ...ed_forward_bp16_masktype_1_with_attnbias.hip |  8 --------
 ...tched_forward_bp16_masktype_2_no_attnbias.cu |  7 -------
 ...ched_forward_bp16_masktype_2_no_attnbias.hip |  8 --------
 ...hed_forward_bp16_masktype_2_with_attnbias.cu |  7 -------
 ...ed_forward_bp16_masktype_2_with_attnbias.hip |  8 --------
 ...tched_forward_fp16_masktype_0_no_attnbias.cu |  7 -------
 ...ched_forward_fp16_masktype_0_no_attnbias.hip |  8 --------
 ...hed_forward_fp16_masktype_0_with_attnbias.cu |  7 -------
 ...ed_forward_fp16_masktype_0_with_attnbias.hip |  8 --------
 ...tched_forward_fp16_masktype_1_no_attnbias.cu |  7 -------
 ...ched_forward_fp16_masktype_1_no_attnbias.hip |  8 --------
 ...hed_forward_fp16_masktype_1_with_attnbias.cu |  7 -------
 ...ed_forward_fp16_masktype_1_with_attnbias.hip |  8 --------
 ...tched_forward_fp16_masktype_2_no_attnbias.cu |  7 -------
 ...ched_forward_fp16_masktype_2_no_attnbias.hip |  8 --------
 ...hed_forward_fp16_masktype_2_with_attnbias.cu |  7 -------
 ...ed_forward_fp16_masktype_2_with_attnbias.hip |  8 --------
 ...batched_infer_bp16_masktype_0_no_attnbias.cu |  8 --------
 ...atched_infer_bp16_masktype_0_no_attnbias.hip |  9 ---------
 ...tched_infer_bp16_masktype_0_with_attnbias.cu |  8 --------
 ...ched_infer_bp16_masktype_0_with_attnbias.hip |  9 ---------
 ...batched_infer_bp16_masktype_1_no_attnbias.cu |  8 --------
 ...atched_infer_bp16_masktype_1_no_attnbias.hip |  9 ---------
 ...tched_infer_bp16_masktype_1_with_attnbias.cu |  8 --------
 ...ched_infer_bp16_masktype_1_with_attnbias.hip |  9 ---------
 ...batched_infer_bp16_masktype_2_no_attnbias.cu |  8 --------
 ...atched_infer_bp16_masktype_2_no_attnbias.hip |  9 ---------
 ...tched_infer_bp16_masktype_2_with_attnbias.cu |  8 --------
 ...ched_infer_bp16_masktype_2_with_attnbias.hip |  9 ---------
 ...batched_infer_fp16_masktype_0_no_attnbias.cu |  8 --------
 ...atched_infer_fp16_masktype_0_no_attnbias.hip |  9 ---------
 ...tched_infer_fp16_masktype_0_with_attnbias.cu |  8 --------
 ...ched_infer_fp16_masktype_0_with_attnbias.hip |  9 ---------
 ...batched_infer_fp16_masktype_1_no_attnbias.cu |  8 --------
 ...atched_infer_fp16_masktype_1_no_attnbias.hip |  9 ---------
 ...tched_infer_fp16_masktype_1_with_attnbias.cu |  8 --------
 ...ched_infer_fp16_masktype_1_with_attnbias.hip |  9 ---------
 ...batched_infer_fp16_masktype_2_no_attnbias.cu |  8 --------
 ...atched_infer_fp16_masktype_2_no_attnbias.hip |  9 ---------
 ...tched_infer_fp16_masktype_2_with_attnbias.cu |  8 --------
 ...ched_infer_fp16_masktype_2_with_attnbias.hip |  9 ---------
 ...uped_backward_bp16_masktype_0_no_attnbias.cu | 14 --------------
 ...ped_backward_bp16_masktype_0_no_attnbias.hip | 15 ---------------
 ...ed_backward_bp16_masktype_0_with_attnbias.cu | 14 --------------
 ...d_backward_bp16_masktype_0_with_attnbias.hip | 15 ---------------
 ...uped_backward_bp16_masktype_1_no_attnbias.cu | 14 --------------
 ...ped_backward_bp16_masktype_1_no_attnbias.hip | 15 ---------------
 ...ed_backward_bp16_masktype_1_with_attnbias.cu | 14 --------------
 ...d_backward_bp16_masktype_1_with_attnbias.hip | 15 ---------------
 ...uped_backward_bp16_masktype_2_no_attnbias.cu | 14 --------------
 ...ped_backward_bp16_masktype_2_no_attnbias.hip | 15 ---------------
 ...ed_backward_bp16_masktype_2_with_attnbias.cu | 14 --------------
 ...d_backward_bp16_masktype_2_with_attnbias.hip | 15 ---------------
 ...uped_backward_fp16_masktype_0_no_attnbias.cu | 14 --------------
 ...ped_backward_fp16_masktype_0_no_attnbias.hip | 15 ---------------
 ...ed_backward_fp16_masktype_0_with_attnbias.cu | 14 --------------
 ...d_backward_fp16_masktype_0_with_attnbias.hip | 15 ---------------
 ...uped_backward_fp16_masktype_1_no_attnbias.cu | 14 --------------
 ...ped_backward_fp16_masktype_1_no_attnbias.hip | 15 ---------------
 ...ed_backward_fp16_masktype_1_with_attnbias.cu | 14 --------------
 ...d_backward_fp16_masktype_1_with_attnbias.hip | 15 ---------------
 ...uped_backward_fp16_masktype_2_no_attnbias.cu | 14 --------------
 ...ped_backward_fp16_masktype_2_no_attnbias.hip | 15 ---------------
 ...ed_backward_fp16_masktype_2_with_attnbias.cu | 14 --------------
 ...d_backward_fp16_masktype_2_with_attnbias.hip | 15 ---------------
 ...ouped_forward_bp16_masktype_0_no_attnbias.cu |  7 -------
 ...uped_forward_bp16_masktype_0_no_attnbias.hip |  8 --------
 ...ped_forward_bp16_masktype_0_with_attnbias.cu |  7 -------
 ...ed_forward_bp16_masktype_0_with_attnbias.hip |  8 --------
 ...ouped_forward_bp16_masktype_1_no_attnbias.cu |  7 -------
 ...uped_forward_bp16_masktype_1_no_attnbias.hip |  8 --------
 ...ped_forward_bp16_masktype_1_with_attnbias.cu |  7 -------
 ...ed_forward_bp16_masktype_1_with_attnbias.hip |  8 --------
 ...ouped_forward_bp16_masktype_2_no_attnbias.cu |  7 -------
 ...uped_forward_bp16_masktype_2_no_attnbias.hip |  8 --------
 ...ped_forward_bp16_masktype_2_with_attnbias.cu |  7 -------
 ...ed_forward_bp16_masktype_2_with_attnbias.hip |  8 --------
 ...ouped_forward_fp16_masktype_0_no_attnbias.cu |  7 -------
 ...uped_forward_fp16_masktype_0_no_attnbias.hip |  8 --------
 ...ped_forward_fp16_masktype_0_with_attnbias.cu |  7 -------
 ...ed_forward_fp16_masktype_0_with_attnbias.hip |  8 --------
 ...ouped_forward_fp16_masktype_1_no_attnbias.cu |  7 -------
 ...uped_forward_fp16_masktype_1_no_attnbias.hip |  8 --------
 ...ped_forward_fp16_masktype_1_with_attnbias.cu |  7 -------
 ...ed_forward_fp16_masktype_1_with_attnbias.hip |  8 --------
 ...ouped_forward_fp16_masktype_2_no_attnbias.cu |  7 -------
 ...uped_forward_fp16_masktype_2_no_attnbias.hip |  8 --------
 ...ped_forward_fp16_masktype_2_with_attnbias.cu |  7 -------
 ...ed_forward_fp16_masktype_2_with_attnbias.hip |  8 --------
 ...grouped_infer_bp16_masktype_0_no_attnbias.cu |  8 --------
 ...rouped_infer_bp16_masktype_0_no_attnbias.hip |  9 ---------
 ...ouped_infer_bp16_masktype_0_with_attnbias.cu |  8 --------
 ...uped_infer_bp16_masktype_0_with_attnbias.hip |  9 ---------
 ...grouped_infer_bp16_masktype_1_no_attnbias.cu |  8 --------
 ...rouped_infer_bp16_masktype_1_no_attnbias.hip |  9 ---------
 ...ouped_infer_bp16_masktype_1_with_attnbias.cu |  8 --------
 ...uped_infer_bp16_masktype_1_with_attnbias.hip |  9 ---------
 ...grouped_infer_bp16_masktype_2_no_attnbias.cu |  8 --------
 ...rouped_infer_bp16_masktype_2_no_attnbias.hip |  9 ---------
 ...ouped_infer_bp16_masktype_2_with_attnbias.cu |  8 --------
 ...uped_infer_bp16_masktype_2_with_attnbias.hip |  9 ---------
 ...grouped_infer_fp16_masktype_0_no_attnbias.cu |  8 --------
 ...rouped_infer_fp16_masktype_0_no_attnbias.hip |  9 ---------
 ...ouped_infer_fp16_masktype_0_with_attnbias.cu |  8 --------
 ...uped_infer_fp16_masktype_0_with_attnbias.hip |  9 ---------
 ...grouped_infer_fp16_masktype_1_no_attnbias.cu |  8 --------
 ...rouped_infer_fp16_masktype_1_no_attnbias.hip |  9 ---------
 ...ouped_infer_fp16_masktype_1_with_attnbias.cu |  8 --------
 ...uped_infer_fp16_masktype_1_with_attnbias.hip |  9 ---------
 ...grouped_infer_fp16_masktype_2_no_attnbias.cu |  8 --------
 ...rouped_infer_fp16_masktype_2_no_attnbias.hip |  9 ---------
 ...ouped_infer_fp16_masktype_2_with_attnbias.cu |  8 --------
 ...uped_infer_fp16_masktype_2_with_attnbias.hip |  9 ---------
 145 files changed, 2 insertions(+), 1468 deletions(-)
 delete mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_bp16_masktype_0_no_attnbias.cu
 delete mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_bp16_masktype_0_no_attnbias.hip
 delete mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_bp16_masktype_0_with_attnbias.cu
 delete mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_bp16_masktype_0_with_attnbias.hip
 delete mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_bp16_masktype_1_no_attnbias.cu
 delete mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_bp16_masktype_1_no_attnbias.hip
 delete mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_bp16_masktype_1_with_attnbias.cu
 delete mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_bp16_masktype_1_with_attnbias.hip
 delete mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_bp16_masktype_2_no_attnbias.cu
 delete mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_bp16_masktype_2_no_attnbias.hip
 delete mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_bp16_masktype_2_with_attnbias.cu
 delete mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_bp16_masktype_2_with_attnbias.hip
 delete mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_fp16_masktype_0_no_attnbias.cu
 delete mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_fp16_masktype_0_no_attnbias.hip
 delete mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_fp16_masktype_0_with_attnbias.cu
 delete mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_fp16_masktype_0_with_attnbias.hip
 delete mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_fp16_masktype_1_no_attnbias.cu
 delete mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_fp16_masktype_1_no_attnbias.hip
 delete mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_fp16_masktype_1_with_attnbias.cu
 delete mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_fp16_masktype_1_with_attnbias.hip
 delete mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_fp16_masktype_2_no_attnbias.cu
 delete mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_fp16_masktype_2_no_attnbias.hip
 delete mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_fp16_masktype_2_with_attnbias.cu
 delete mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_fp16_masktype_2_with_attnbias.hip
 delete mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_forward_bp16_masktype_0_no_attnbias.cu
 delete mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_forward_bp16_masktype_0_no_attnbias.hip
 delete mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_forward_bp16_masktype_0_with_attnbias.cu
 delete mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_forward_bp16_masktype_0_with_attnbias.hip
 delete mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_forward_bp16_masktype_1_no_attnbias.cu
 delete mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_forward_bp16_masktype_1_no_attnbias.hip
 delete mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_forward_bp16_masktype_1_with_attnbias.cu
 delete mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_forward_bp16_masktype_1_with_attnbias.hip
 delete mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_forward_bp16_masktype_2_no_attnbias.cu
 delete mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_forward_bp16_masktype_2_no_attnbias.hip
 delete mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_forward_bp16_masktype_2_with_attnbias.cu
 delete mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_forward_bp16_masktype_2_with_attnbias.hip
 delete mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_forward_fp16_masktype_0_no_attnbias.cu
 delete mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_forward_fp16_masktype_0_no_attnbias.hip
 delete mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_forward_fp16_masktype_0_with_attnbias.cu
 delete mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_forward_fp16_masktype_0_with_attnbias.hip
 delete mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_forward_fp16_masktype_1_no_attnbias.cu
 delete mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_forward_fp16_masktype_1_no_attnbias.hip
 delete mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_forward_fp16_masktype_1_with_attnbias.cu
 delete mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_forward_fp16_masktype_1_with_attnbias.hip
 delete mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_forward_fp16_masktype_2_no_attnbias.cu
 delete mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_forward_fp16_masktype_2_no_attnbias.hip
 delete mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_forward_fp16_masktype_2_with_attnbias.cu
 delete mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_forward_fp16_masktype_2_with_attnbias.hip
 delete mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_infer_bp16_masktype_0_no_attnbias.cu
 delete mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_infer_bp16_masktype_0_no_attnbias.hip
 delete mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_infer_bp16_masktype_0_with_attnbias.cu
 delete mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_infer_bp16_masktype_0_with_attnbias.hip
 delete mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_infer_bp16_masktype_1_no_attnbias.cu
 delete mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_infer_bp16_masktype_1_no_attnbias.hip
 delete mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_infer_bp16_masktype_1_with_attnbias.cu
 delete mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_infer_bp16_masktype_1_with_attnbias.hip
 delete mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_infer_bp16_masktype_2_no_attnbias.cu
 delete mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_infer_bp16_masktype_2_no_attnbias.hip
 delete mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_infer_bp16_masktype_2_with_attnbias.cu
 delete mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_infer_bp16_masktype_2_with_attnbias.hip
 delete mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_infer_fp16_masktype_0_no_attnbias.cu
 delete mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_infer_fp16_masktype_0_no_attnbias.hip
 delete mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_infer_fp16_masktype_0_with_attnbias.cu
 delete mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_infer_fp16_masktype_0_with_attnbias.hip
 delete mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_infer_fp16_masktype_1_no_attnbias.cu
 delete mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_infer_fp16_masktype_1_no_attnbias.hip
 delete mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_infer_fp16_masktype_1_with_attnbias.cu
 delete mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_infer_fp16_masktype_1_with_attnbias.hip
 delete mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_infer_fp16_masktype_2_no_attnbias.cu
 delete mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_infer_fp16_masktype_2_no_attnbias.hip
 delete mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_infer_fp16_masktype_2_with_attnbias.cu
 delete mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_infer_fp16_masktype_2_with_attnbias.hip
 delete mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_bp16_masktype_0_no_attnbias.cu
 delete mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_bp16_masktype_0_no_attnbias.hip
 delete mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_bp16_masktype_0_with_attnbias.cu
 delete mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_bp16_masktype_0_with_attnbias.hip
 delete mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_bp16_masktype_1_no_attnbias.cu
 delete mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_bp16_masktype_1_no_attnbias.hip
 delete mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_bp16_masktype_1_with_attnbias.cu
 delete mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_bp16_masktype_1_with_attnbias.hip
 delete mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_bp16_masktype_2_no_attnbias.cu
 delete mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_bp16_masktype_2_no_attnbias.hip
 delete mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_bp16_masktype_2_with_attnbias.cu
 delete mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_bp16_masktype_2_with_attnbias.hip
 delete mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_fp16_masktype_0_no_attnbias.cu
 delete mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_fp16_masktype_0_no_attnbias.hip
 delete mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_fp16_masktype_0_with_attnbias.cu
 delete mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_fp16_masktype_0_with_attnbias.hip
 delete mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_fp16_masktype_1_no_attnbias.cu
 delete mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_fp16_masktype_1_no_attnbias.hip
 delete mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_fp16_masktype_1_with_attnbias.cu
 delete mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_fp16_masktype_1_with_attnbias.hip
 delete mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_fp16_masktype_2_no_attnbias.cu
 delete mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_fp16_masktype_2_no_attnbias.hip
 delete mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_fp16_masktype_2_with_attnbias.cu
 delete mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_fp16_masktype_2_with_attnbias.hip
 delete mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_forward_bp16_masktype_0_no_attnbias.cu
 delete mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_forward_bp16_masktype_0_no_attnbias.hip
 delete mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_forward_bp16_masktype_0_with_attnbias.cu
 delete mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_forward_bp16_masktype_0_with_attnbias.hip
 delete mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_forward_bp16_masktype_1_no_attnbias.cu
 delete mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_forward_bp16_masktype_1_no_attnbias.hip
 delete mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_forward_bp16_masktype_1_with_attnbias.cu
 delete mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_forward_bp16_masktype_1_with_attnbias.hip
 delete mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_forward_bp16_masktype_2_no_attnbias.cu
 delete mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_forward_bp16_masktype_2_no_attnbias.hip
 delete mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_forward_bp16_masktype_2_with_attnbias.cu
 delete mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_forward_bp16_masktype_2_with_attnbias.hip
 delete mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_forward_fp16_masktype_0_no_attnbias.cu
 delete mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_forward_fp16_masktype_0_no_attnbias.hip
 delete mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_forward_fp16_masktype_0_with_attnbias.cu
 delete mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_forward_fp16_masktype_0_with_attnbias.hip
 delete mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_forward_fp16_masktype_1_no_attnbias.cu
 delete mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_forward_fp16_masktype_1_no_attnbias.hip
 delete mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_forward_fp16_masktype_1_with_attnbias.cu
 delete mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_forward_fp16_masktype_1_with_attnbias.hip
 delete mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_forward_fp16_masktype_2_no_attnbias.cu
 delete mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_forward_fp16_masktype_2_no_attnbias.hip
 delete mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_forward_fp16_masktype_2_with_attnbias.cu
 delete mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_forward_fp16_masktype_2_with_attnbias.hip
 delete mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_infer_bp16_masktype_0_no_attnbias.cu
 delete mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_infer_bp16_masktype_0_no_attnbias.hip
 delete mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_infer_bp16_masktype_0_with_attnbias.cu
 delete mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_infer_bp16_masktype_0_with_attnbias.hip
 delete mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_infer_bp16_masktype_1_no_attnbias.cu
 delete mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_infer_bp16_masktype_1_no_attnbias.hip
 delete mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_infer_bp16_masktype_1_with_attnbias.cu
 delete mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_infer_bp16_masktype_1_with_attnbias.hip
 delete mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_infer_bp16_masktype_2_no_attnbias.cu
 delete mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_infer_bp16_masktype_2_no_attnbias.hip
 delete mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_infer_bp16_masktype_2_with_attnbias.cu
 delete mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_infer_bp16_masktype_2_with_attnbias.hip
 delete mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_infer_fp16_masktype_0_no_attnbias.cu
 delete mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_infer_fp16_masktype_0_no_attnbias.hip
 delete mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_infer_fp16_masktype_0_with_attnbias.cu
 delete mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_infer_fp16_masktype_0_with_attnbias.hip
 delete mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_infer_fp16_masktype_1_no_attnbias.cu
 delete mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_infer_fp16_masktype_1_no_attnbias.hip
 delete mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_infer_fp16_masktype_1_with_attnbias.cu
 delete mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_infer_fp16_masktype_1_with_attnbias.hip
 delete mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_infer_fp16_masktype_2_no_attnbias.cu
 delete mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_infer_fp16_masktype_2_no_attnbias.hip
 delete mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_infer_fp16_masktype_2_with_attnbias.cu
 delete mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_infer_fp16_masktype_2_with_attnbias.hip

diff --git a/.gitignore b/.gitignore
index 56869b496..96cc37bb0 100644
--- a/.gitignore
+++ b/.gitignore
@@ -65,5 +65,7 @@ xformers/cpp_lib.json
 xformers/csrc/attention/hip_fmha/*.cu
 xformers/csrc/attention/hip_fmha/*.hip
 xformers/csrc/attention/hip_fmha/*_hip.h
+xformers/csrc/attention/hip_fmha/instances/*.cu
+xformers/csrc/attention/hip_fmha/instances/*.hip
 
 
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_bp16_masktype_0_no_attnbias.cu b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_bp16_masktype_0_no_attnbias.cu
deleted file mode 100644
index 2bf962a9f..000000000
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_bp16_masktype_0_no_attnbias.cu
+++ /dev/null
@@ -1,14 +0,0 @@
-#include <ck.hpp>
-#include "ck_fmha_batched_backward.h"
-
-template void run_batched_backward_masktype_attnbias_dispatched<
-    ck::bhalf_t,
-    0,
-    false,
-    true>(BatchedBackwardParams& param, hipStream_t stream);
-
-template void run_batched_backward_masktype_attnbias_dispatched<
-    ck::bhalf_t,
-    0,
-    false,
-    false>(BatchedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_bp16_masktype_0_no_attnbias.hip b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_bp16_masktype_0_no_attnbias.hip
deleted file mode 100644
index c893e70b5..000000000
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_bp16_masktype_0_no_attnbias.hip
+++ /dev/null
@@ -1,15 +0,0 @@
-// !!! This is a file automatically generated by hipify!!!
-#include <ck.hpp>
-#include "ck_fmha_batched_backward_hip.h"
-
-template void run_batched_backward_masktype_attnbias_dispatched<
-    ck::bhalf_t,
-    0,
-    false,
-    true>(BatchedBackwardParams& param, hipStream_t stream);
-
-template void run_batched_backward_masktype_attnbias_dispatched<
-    ck::bhalf_t,
-    0,
-    false,
-    false>(BatchedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_bp16_masktype_0_with_attnbias.cu b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_bp16_masktype_0_with_attnbias.cu
deleted file mode 100644
index b3c5bbf70..000000000
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_bp16_masktype_0_with_attnbias.cu
+++ /dev/null
@@ -1,14 +0,0 @@
-#include <ck.hpp>
-#include "ck_fmha_batched_backward.h"
-
-template void run_batched_backward_masktype_attnbias_dispatched<
-    ck::bhalf_t,
-    0,
-    true,
-    true>(BatchedBackwardParams& param, hipStream_t stream);
-
-template void run_batched_backward_masktype_attnbias_dispatched<
-    ck::bhalf_t,
-    0,
-    true,
-    false>(BatchedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_bp16_masktype_0_with_attnbias.hip b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_bp16_masktype_0_with_attnbias.hip
deleted file mode 100644
index a8b22c95d..000000000
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_bp16_masktype_0_with_attnbias.hip
+++ /dev/null
@@ -1,15 +0,0 @@
-// !!! This is a file automatically generated by hipify!!!
-#include <ck.hpp>
-#include "ck_fmha_batched_backward_hip.h"
-
-template void run_batched_backward_masktype_attnbias_dispatched<
-    ck::bhalf_t,
-    0,
-    true,
-    true>(BatchedBackwardParams& param, hipStream_t stream);
-
-template void run_batched_backward_masktype_attnbias_dispatched<
-    ck::bhalf_t,
-    0,
-    true,
-    false>(BatchedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_bp16_masktype_1_no_attnbias.cu b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_bp16_masktype_1_no_attnbias.cu
deleted file mode 100644
index 4a96b4a3d..000000000
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_bp16_masktype_1_no_attnbias.cu
+++ /dev/null
@@ -1,14 +0,0 @@
-#include <ck.hpp>
-#include "ck_fmha_batched_backward.h"
-
-template void run_batched_backward_masktype_attnbias_dispatched<
-    ck::bhalf_t,
-    1,
-    false,
-    true>(BatchedBackwardParams& param, hipStream_t stream);
-
-template void run_batched_backward_masktype_attnbias_dispatched<
-    ck::bhalf_t,
-    1,
-    false,
-    false>(BatchedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_bp16_masktype_1_no_attnbias.hip b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_bp16_masktype_1_no_attnbias.hip
deleted file mode 100644
index 1301eb069..000000000
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_bp16_masktype_1_no_attnbias.hip
+++ /dev/null
@@ -1,15 +0,0 @@
-// !!! This is a file automatically generated by hipify!!!
-#include <ck.hpp>
-#include "ck_fmha_batched_backward_hip.h"
-
-template void run_batched_backward_masktype_attnbias_dispatched<
-    ck::bhalf_t,
-    1,
-    false,
-    true>(BatchedBackwardParams& param, hipStream_t stream);
-
-template void run_batched_backward_masktype_attnbias_dispatched<
-    ck::bhalf_t,
-    1,
-    false,
-    false>(BatchedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_bp16_masktype_1_with_attnbias.cu b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_bp16_masktype_1_with_attnbias.cu
deleted file mode 100644
index 37ec0f03c..000000000
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_bp16_masktype_1_with_attnbias.cu
+++ /dev/null
@@ -1,14 +0,0 @@
-#include <ck.hpp>
-#include "ck_fmha_batched_backward.h"
-
-template void run_batched_backward_masktype_attnbias_dispatched<
-    ck::bhalf_t,
-    1,
-    true,
-    true>(BatchedBackwardParams& param, hipStream_t stream);
-
-template void run_batched_backward_masktype_attnbias_dispatched<
-    ck::bhalf_t,
-    1,
-    true,
-    false>(BatchedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_bp16_masktype_1_with_attnbias.hip b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_bp16_masktype_1_with_attnbias.hip
deleted file mode 100644
index 6dda0e1b7..000000000
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_bp16_masktype_1_with_attnbias.hip
+++ /dev/null
@@ -1,15 +0,0 @@
-// !!! This is a file automatically generated by hipify!!!
-#include <ck.hpp>
-#include "ck_fmha_batched_backward_hip.h"
-
-template void run_batched_backward_masktype_attnbias_dispatched<
-    ck::bhalf_t,
-    1,
-    true,
-    true>(BatchedBackwardParams& param, hipStream_t stream);
-
-template void run_batched_backward_masktype_attnbias_dispatched<
-    ck::bhalf_t,
-    1,
-    true,
-    false>(BatchedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_bp16_masktype_2_no_attnbias.cu b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_bp16_masktype_2_no_attnbias.cu
deleted file mode 100644
index c80a47952..000000000
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_bp16_masktype_2_no_attnbias.cu
+++ /dev/null
@@ -1,14 +0,0 @@
-#include <ck.hpp>
-#include "ck_fmha_batched_backward.h"
-
-template void run_batched_backward_masktype_attnbias_dispatched<
-    ck::bhalf_t,
-    2,
-    false,
-    true>(BatchedBackwardParams& param, hipStream_t stream);
-
-template void run_batched_backward_masktype_attnbias_dispatched<
-    ck::bhalf_t,
-    2,
-    false,
-    false>(BatchedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_bp16_masktype_2_no_attnbias.hip b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_bp16_masktype_2_no_attnbias.hip
deleted file mode 100644
index 3dda04d56..000000000
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_bp16_masktype_2_no_attnbias.hip
+++ /dev/null
@@ -1,15 +0,0 @@
-// !!! This is a file automatically generated by hipify!!!
-#include <ck.hpp>
-#include "ck_fmha_batched_backward_hip.h"
-
-template void run_batched_backward_masktype_attnbias_dispatched<
-    ck::bhalf_t,
-    2,
-    false,
-    true>(BatchedBackwardParams& param, hipStream_t stream);
-
-template void run_batched_backward_masktype_attnbias_dispatched<
-    ck::bhalf_t,
-    2,
-    false,
-    false>(BatchedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_bp16_masktype_2_with_attnbias.cu b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_bp16_masktype_2_with_attnbias.cu
deleted file mode 100644
index c1dc61c5a..000000000
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_bp16_masktype_2_with_attnbias.cu
+++ /dev/null
@@ -1,14 +0,0 @@
-#include <ck.hpp>
-#include "ck_fmha_batched_backward.h"
-
-template void run_batched_backward_masktype_attnbias_dispatched<
-    ck::bhalf_t,
-    2,
-    true,
-    true>(BatchedBackwardParams& param, hipStream_t stream);
-
-template void run_batched_backward_masktype_attnbias_dispatched<
-    ck::bhalf_t,
-    2,
-    true,
-    false>(BatchedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_bp16_masktype_2_with_attnbias.hip b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_bp16_masktype_2_with_attnbias.hip
deleted file mode 100644
index 884503c01..000000000
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_bp16_masktype_2_with_attnbias.hip
+++ /dev/null
@@ -1,15 +0,0 @@
-// !!! This is a file automatically generated by hipify!!!
-#include <ck.hpp>
-#include "ck_fmha_batched_backward_hip.h"
-
-template void run_batched_backward_masktype_attnbias_dispatched<
-    ck::bhalf_t,
-    2,
-    true,
-    true>(BatchedBackwardParams& param, hipStream_t stream);
-
-template void run_batched_backward_masktype_attnbias_dispatched<
-    ck::bhalf_t,
-    2,
-    true,
-    false>(BatchedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_fp16_masktype_0_no_attnbias.cu b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_fp16_masktype_0_no_attnbias.cu
deleted file mode 100644
index 46caaa20d..000000000
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_fp16_masktype_0_no_attnbias.cu
+++ /dev/null
@@ -1,14 +0,0 @@
-#include <ck.hpp>
-#include "ck_fmha_batched_backward.h"
-
-template void run_batched_backward_masktype_attnbias_dispatched<
-    ck::half_t,
-    0,
-    false,
-    true>(BatchedBackwardParams& param, hipStream_t stream);
-
-template void run_batched_backward_masktype_attnbias_dispatched<
-    ck::half_t,
-    0,
-    false,
-    false>(BatchedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_fp16_masktype_0_no_attnbias.hip b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_fp16_masktype_0_no_attnbias.hip
deleted file mode 100644
index 43c7ff74d..000000000
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_fp16_masktype_0_no_attnbias.hip
+++ /dev/null
@@ -1,15 +0,0 @@
-// !!! This is a file automatically generated by hipify!!!
-#include <ck.hpp>
-#include "ck_fmha_batched_backward_hip.h"
-
-template void run_batched_backward_masktype_attnbias_dispatched<
-    ck::half_t,
-    0,
-    false,
-    true>(BatchedBackwardParams& param, hipStream_t stream);
-
-template void run_batched_backward_masktype_attnbias_dispatched<
-    ck::half_t,
-    0,
-    false,
-    false>(BatchedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_fp16_masktype_0_with_attnbias.cu b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_fp16_masktype_0_with_attnbias.cu
deleted file mode 100644
index c328beb8d..000000000
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_fp16_masktype_0_with_attnbias.cu
+++ /dev/null
@@ -1,14 +0,0 @@
-#include <ck.hpp>
-#include "ck_fmha_batched_backward.h"
-
-template void run_batched_backward_masktype_attnbias_dispatched<
-    ck::half_t,
-    0,
-    true,
-    true>(BatchedBackwardParams& param, hipStream_t stream);
-
-template void run_batched_backward_masktype_attnbias_dispatched<
-    ck::half_t,
-    0,
-    true,
-    false>(BatchedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_fp16_masktype_0_with_attnbias.hip b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_fp16_masktype_0_with_attnbias.hip
deleted file mode 100644
index f66299704..000000000
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_fp16_masktype_0_with_attnbias.hip
+++ /dev/null
@@ -1,15 +0,0 @@
-// !!! This is a file automatically generated by hipify!!!
-#include <ck.hpp>
-#include "ck_fmha_batched_backward_hip.h"
-
-template void run_batched_backward_masktype_attnbias_dispatched<
-    ck::half_t,
-    0,
-    true,
-    true>(BatchedBackwardParams& param, hipStream_t stream);
-
-template void run_batched_backward_masktype_attnbias_dispatched<
-    ck::half_t,
-    0,
-    true,
-    false>(BatchedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_fp16_masktype_1_no_attnbias.cu b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_fp16_masktype_1_no_attnbias.cu
deleted file mode 100644
index 2897cba5d..000000000
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_fp16_masktype_1_no_attnbias.cu
+++ /dev/null
@@ -1,14 +0,0 @@
-#include <ck.hpp>
-#include "ck_fmha_batched_backward.h"
-
-template void run_batched_backward_masktype_attnbias_dispatched<
-    ck::half_t,
-    1,
-    false,
-    true>(BatchedBackwardParams& param, hipStream_t stream);
-
-template void run_batched_backward_masktype_attnbias_dispatched<
-    ck::half_t,
-    1,
-    false,
-    false>(BatchedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_fp16_masktype_1_no_attnbias.hip b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_fp16_masktype_1_no_attnbias.hip
deleted file mode 100644
index 1c44a9b84..000000000
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_fp16_masktype_1_no_attnbias.hip
+++ /dev/null
@@ -1,15 +0,0 @@
-// !!! This is a file automatically generated by hipify!!!
-#include <ck.hpp>
-#include "ck_fmha_batched_backward_hip.h"
-
-template void run_batched_backward_masktype_attnbias_dispatched<
-    ck::half_t,
-    1,
-    false,
-    true>(BatchedBackwardParams& param, hipStream_t stream);
-
-template void run_batched_backward_masktype_attnbias_dispatched<
-    ck::half_t,
-    1,
-    false,
-    false>(BatchedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_fp16_masktype_1_with_attnbias.cu b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_fp16_masktype_1_with_attnbias.cu
deleted file mode 100644
index 62b82e22a..000000000
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_fp16_masktype_1_with_attnbias.cu
+++ /dev/null
@@ -1,16 +0,0 @@
-#include <ck.hpp>
-#include <stdexcept>
-
-#include "ck_fmha_batched_backward.h"
-
-template void run_batched_backward_masktype_attnbias_dispatched<
-    ck::half_t,
-    1,
-    true,
-    true>(BatchedBackwardParams& param, hipStream_t stream);
-
-template void run_batched_backward_masktype_attnbias_dispatched<
-    ck::half_t,
-    1,
-    true,
-    false>(BatchedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_fp16_masktype_1_with_attnbias.hip b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_fp16_masktype_1_with_attnbias.hip
deleted file mode 100644
index 5a81dfaf7..000000000
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_fp16_masktype_1_with_attnbias.hip
+++ /dev/null
@@ -1,17 +0,0 @@
-// !!! This is a file automatically generated by hipify!!!
-#include <ck.hpp>
-#include <stdexcept>
-
-#include "ck_fmha_batched_backward_hip.h"
-
-template void run_batched_backward_masktype_attnbias_dispatched<
-    ck::half_t,
-    1,
-    true,
-    true>(BatchedBackwardParams& param, hipStream_t stream);
-
-template void run_batched_backward_masktype_attnbias_dispatched<
-    ck::half_t,
-    1,
-    true,
-    false>(BatchedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_fp16_masktype_2_no_attnbias.cu b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_fp16_masktype_2_no_attnbias.cu
deleted file mode 100644
index 1ea6309d6..000000000
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_fp16_masktype_2_no_attnbias.cu
+++ /dev/null
@@ -1,14 +0,0 @@
-#include <ck.hpp>
-#include "ck_fmha_batched_backward.h"
-
-template void run_batched_backward_masktype_attnbias_dispatched<
-    ck::half_t,
-    2,
-    false,
-    true>(BatchedBackwardParams& param, hipStream_t stream);
-
-template void run_batched_backward_masktype_attnbias_dispatched<
-    ck::half_t,
-    2,
-    false,
-    false>(BatchedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_fp16_masktype_2_no_attnbias.hip b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_fp16_masktype_2_no_attnbias.hip
deleted file mode 100644
index f1ee519f9..000000000
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_fp16_masktype_2_no_attnbias.hip
+++ /dev/null
@@ -1,15 +0,0 @@
-// !!! This is a file automatically generated by hipify!!!
-#include <ck.hpp>
-#include "ck_fmha_batched_backward_hip.h"
-
-template void run_batched_backward_masktype_attnbias_dispatched<
-    ck::half_t,
-    2,
-    false,
-    true>(BatchedBackwardParams& param, hipStream_t stream);
-
-template void run_batched_backward_masktype_attnbias_dispatched<
-    ck::half_t,
-    2,
-    false,
-    false>(BatchedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_fp16_masktype_2_with_attnbias.cu b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_fp16_masktype_2_with_attnbias.cu
deleted file mode 100644
index 24f2ce4b2..000000000
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_fp16_masktype_2_with_attnbias.cu
+++ /dev/null
@@ -1,14 +0,0 @@
-#include <ck.hpp>
-#include "ck_fmha_batched_backward.h"
-
-template void run_batched_backward_masktype_attnbias_dispatched<
-    ck::half_t,
-    2,
-    true,
-    true>(BatchedBackwardParams& param, hipStream_t stream);
-
-template void run_batched_backward_masktype_attnbias_dispatched<
-    ck::half_t,
-    2,
-    true,
-    false>(BatchedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_fp16_masktype_2_with_attnbias.hip b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_fp16_masktype_2_with_attnbias.hip
deleted file mode 100644
index a3c6fd4fe..000000000
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_fp16_masktype_2_with_attnbias.hip
+++ /dev/null
@@ -1,15 +0,0 @@
-// !!! This is a file automatically generated by hipify!!!
-#include <ck.hpp>
-#include "ck_fmha_batched_backward_hip.h"
-
-template void run_batched_backward_masktype_attnbias_dispatched<
-    ck::half_t,
-    2,
-    true,
-    true>(BatchedBackwardParams& param, hipStream_t stream);
-
-template void run_batched_backward_masktype_attnbias_dispatched<
-    ck::half_t,
-    2,
-    true,
-    false>(BatchedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_forward_bp16_masktype_0_no_attnbias.cu b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_forward_bp16_masktype_0_no_attnbias.cu
deleted file mode 100644
index 140cffce0..000000000
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_forward_bp16_masktype_0_no_attnbias.cu
+++ /dev/null
@@ -1,7 +0,0 @@
-#include <ck/ck.hpp>
-#include "ck_fmha_batched_forward.h"
-
-template void run_batched_forward_masktype_attnbias_dispatched<
-    ck::bhalf_t,
-    0,
-    false>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_forward_bp16_masktype_0_no_attnbias.hip b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_forward_bp16_masktype_0_no_attnbias.hip
deleted file mode 100644
index eaa1cd077..000000000
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_forward_bp16_masktype_0_no_attnbias.hip
+++ /dev/null
@@ -1,8 +0,0 @@
-// !!! This is a file automatically generated by hipify!!!
-#include <ck/ck.hpp>
-#include "ck_fmha_batched_forward_hip.h"
-
-template void run_batched_forward_masktype_attnbias_dispatched<
-    ck::bhalf_t,
-    0,
-    false>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_forward_bp16_masktype_0_with_attnbias.cu b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_forward_bp16_masktype_0_with_attnbias.cu
deleted file mode 100644
index bb32b63ef..000000000
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_forward_bp16_masktype_0_with_attnbias.cu
+++ /dev/null
@@ -1,7 +0,0 @@
-#include <ck/ck.hpp>
-#include "ck_fmha_batched_forward.h"
-
-template void run_batched_forward_masktype_attnbias_dispatched<
-    ck::bhalf_t,
-    0,
-    true>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_forward_bp16_masktype_0_with_attnbias.hip b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_forward_bp16_masktype_0_with_attnbias.hip
deleted file mode 100644
index baf0d8a2a..000000000
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_forward_bp16_masktype_0_with_attnbias.hip
+++ /dev/null
@@ -1,8 +0,0 @@
-// !!! This is a file automatically generated by hipify!!!
-#include <ck/ck.hpp>
-#include "ck_fmha_batched_forward_hip.h"
-
-template void run_batched_forward_masktype_attnbias_dispatched<
-    ck::bhalf_t,
-    0,
-    true>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_forward_bp16_masktype_1_no_attnbias.cu b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_forward_bp16_masktype_1_no_attnbias.cu
deleted file mode 100644
index 6ba23b3a2..000000000
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_forward_bp16_masktype_1_no_attnbias.cu
+++ /dev/null
@@ -1,7 +0,0 @@
-#include <ck/ck.hpp>
-#include "ck_fmha_batched_forward.h"
-
-template void run_batched_forward_masktype_attnbias_dispatched<
-    ck::bhalf_t,
-    1,
-    false>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_forward_bp16_masktype_1_no_attnbias.hip b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_forward_bp16_masktype_1_no_attnbias.hip
deleted file mode 100644
index 3e925436b..000000000
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_forward_bp16_masktype_1_no_attnbias.hip
+++ /dev/null
@@ -1,8 +0,0 @@
-// !!! This is a file automatically generated by hipify!!!
-#include <ck/ck.hpp>
-#include "ck_fmha_batched_forward_hip.h"
-
-template void run_batched_forward_masktype_attnbias_dispatched<
-    ck::bhalf_t,
-    1,
-    false>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_forward_bp16_masktype_1_with_attnbias.cu b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_forward_bp16_masktype_1_with_attnbias.cu
deleted file mode 100644
index 400df0b3d..000000000
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_forward_bp16_masktype_1_with_attnbias.cu
+++ /dev/null
@@ -1,7 +0,0 @@
-#include <ck/ck.hpp>
-#include "ck_fmha_batched_forward.h"
-
-template void run_batched_forward_masktype_attnbias_dispatched<
-    ck::bhalf_t,
-    1,
-    true>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_forward_bp16_masktype_1_with_attnbias.hip b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_forward_bp16_masktype_1_with_attnbias.hip
deleted file mode 100644
index 5d597449a..000000000
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_forward_bp16_masktype_1_with_attnbias.hip
+++ /dev/null
@@ -1,8 +0,0 @@
-// !!! This is a file automatically generated by hipify!!!
-#include <ck/ck.hpp>
-#include "ck_fmha_batched_forward_hip.h"
-
-template void run_batched_forward_masktype_attnbias_dispatched<
-    ck::bhalf_t,
-    1,
-    true>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_forward_bp16_masktype_2_no_attnbias.cu b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_forward_bp16_masktype_2_no_attnbias.cu
deleted file mode 100644
index a99486148..000000000
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_forward_bp16_masktype_2_no_attnbias.cu
+++ /dev/null
@@ -1,7 +0,0 @@
-#include <ck/ck.hpp>
-#include "ck_fmha_batched_forward.h"
-
-template void run_batched_forward_masktype_attnbias_dispatched<
-    ck::bhalf_t,
-    2,
-    false>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_forward_bp16_masktype_2_no_attnbias.hip b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_forward_bp16_masktype_2_no_attnbias.hip
deleted file mode 100644
index e0c5a0440..000000000
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_forward_bp16_masktype_2_no_attnbias.hip
+++ /dev/null
@@ -1,8 +0,0 @@
-// !!! This is a file automatically generated by hipify!!!
-#include <ck/ck.hpp>
-#include "ck_fmha_batched_forward_hip.h"
-
-template void run_batched_forward_masktype_attnbias_dispatched<
-    ck::bhalf_t,
-    2,
-    false>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_forward_bp16_masktype_2_with_attnbias.cu b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_forward_bp16_masktype_2_with_attnbias.cu
deleted file mode 100644
index 23305b07a..000000000
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_forward_bp16_masktype_2_with_attnbias.cu
+++ /dev/null
@@ -1,7 +0,0 @@
-#include <ck/ck.hpp>
-#include "ck_fmha_batched_forward.h"
-
-template void run_batched_forward_masktype_attnbias_dispatched<
-    ck::bhalf_t,
-    2,
-    true>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_forward_bp16_masktype_2_with_attnbias.hip b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_forward_bp16_masktype_2_with_attnbias.hip
deleted file mode 100644
index 6a6e7ce9a..000000000
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_forward_bp16_masktype_2_with_attnbias.hip
+++ /dev/null
@@ -1,8 +0,0 @@
-// !!! This is a file automatically generated by hipify!!!
-#include <ck/ck.hpp>
-#include "ck_fmha_batched_forward_hip.h"
-
-template void run_batched_forward_masktype_attnbias_dispatched<
-    ck::bhalf_t,
-    2,
-    true>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_forward_fp16_masktype_0_no_attnbias.cu b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_forward_fp16_masktype_0_no_attnbias.cu
deleted file mode 100644
index a9dd771de..000000000
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_forward_fp16_masktype_0_no_attnbias.cu
+++ /dev/null
@@ -1,7 +0,0 @@
-#include <ck/ck.hpp>
-#include "ck_fmha_batched_forward.h"
-
-template void run_batched_forward_masktype_attnbias_dispatched<
-    ck::half_t,
-    0,
-    false>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_forward_fp16_masktype_0_no_attnbias.hip b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_forward_fp16_masktype_0_no_attnbias.hip
deleted file mode 100644
index c7c05a095..000000000
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_forward_fp16_masktype_0_no_attnbias.hip
+++ /dev/null
@@ -1,8 +0,0 @@
-// !!! This is a file automatically generated by hipify!!!
-#include <ck/ck.hpp>
-#include "ck_fmha_batched_forward_hip.h"
-
-template void run_batched_forward_masktype_attnbias_dispatched<
-    ck::half_t,
-    0,
-    false>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_forward_fp16_masktype_0_with_attnbias.cu b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_forward_fp16_masktype_0_with_attnbias.cu
deleted file mode 100644
index f653451ab..000000000
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_forward_fp16_masktype_0_with_attnbias.cu
+++ /dev/null
@@ -1,7 +0,0 @@
-#include <ck/ck.hpp>
-#include "ck_fmha_batched_forward.h"
-
-template void run_batched_forward_masktype_attnbias_dispatched<
-    ck::half_t,
-    0,
-    true>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_forward_fp16_masktype_0_with_attnbias.hip b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_forward_fp16_masktype_0_with_attnbias.hip
deleted file mode 100644
index eded87fe6..000000000
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_forward_fp16_masktype_0_with_attnbias.hip
+++ /dev/null
@@ -1,8 +0,0 @@
-// !!! This is a file automatically generated by hipify!!!
-#include <ck/ck.hpp>
-#include "ck_fmha_batched_forward_hip.h"
-
-template void run_batched_forward_masktype_attnbias_dispatched<
-    ck::half_t,
-    0,
-    true>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_forward_fp16_masktype_1_no_attnbias.cu b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_forward_fp16_masktype_1_no_attnbias.cu
deleted file mode 100644
index 5ca4b7dda..000000000
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_forward_fp16_masktype_1_no_attnbias.cu
+++ /dev/null
@@ -1,7 +0,0 @@
-#include <ck/ck.hpp>
-#include "ck_fmha_batched_forward.h"
-
-template void run_batched_forward_masktype_attnbias_dispatched<
-    ck::half_t,
-    1,
-    false>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_forward_fp16_masktype_1_no_attnbias.hip b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_forward_fp16_masktype_1_no_attnbias.hip
deleted file mode 100644
index f63d16f63..000000000
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_forward_fp16_masktype_1_no_attnbias.hip
+++ /dev/null
@@ -1,8 +0,0 @@
-// !!! This is a file automatically generated by hipify!!!
-#include <ck/ck.hpp>
-#include "ck_fmha_batched_forward_hip.h"
-
-template void run_batched_forward_masktype_attnbias_dispatched<
-    ck::half_t,
-    1,
-    false>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_forward_fp16_masktype_1_with_attnbias.cu b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_forward_fp16_masktype_1_with_attnbias.cu
deleted file mode 100644
index f9af4528d..000000000
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_forward_fp16_masktype_1_with_attnbias.cu
+++ /dev/null
@@ -1,7 +0,0 @@
-#include <ck/ck.hpp>
-#include "ck_fmha_batched_forward.h"
-
-template void run_batched_forward_masktype_attnbias_dispatched<
-    ck::half_t,
-    1,
-    true>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_forward_fp16_masktype_1_with_attnbias.hip b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_forward_fp16_masktype_1_with_attnbias.hip
deleted file mode 100644
index 3eafb95c7..000000000
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_forward_fp16_masktype_1_with_attnbias.hip
+++ /dev/null
@@ -1,8 +0,0 @@
-// !!! This is a file automatically generated by hipify!!!
-#include <ck/ck.hpp>
-#include "ck_fmha_batched_forward_hip.h"
-
-template void run_batched_forward_masktype_attnbias_dispatched<
-    ck::half_t,
-    1,
-    true>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_forward_fp16_masktype_2_no_attnbias.cu b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_forward_fp16_masktype_2_no_attnbias.cu
deleted file mode 100644
index 44e98d9a3..000000000
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_forward_fp16_masktype_2_no_attnbias.cu
+++ /dev/null
@@ -1,7 +0,0 @@
-#include <ck/ck.hpp>
-#include "ck_fmha_batched_forward.h"
-
-template void run_batched_forward_masktype_attnbias_dispatched<
-    ck::half_t,
-    2,
-    false>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_forward_fp16_masktype_2_no_attnbias.hip b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_forward_fp16_masktype_2_no_attnbias.hip
deleted file mode 100644
index a85e2fb9a..000000000
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_forward_fp16_masktype_2_no_attnbias.hip
+++ /dev/null
@@ -1,8 +0,0 @@
-// !!! This is a file automatically generated by hipify!!!
-#include <ck/ck.hpp>
-#include "ck_fmha_batched_forward_hip.h"
-
-template void run_batched_forward_masktype_attnbias_dispatched<
-    ck::half_t,
-    2,
-    false>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_forward_fp16_masktype_2_with_attnbias.cu b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_forward_fp16_masktype_2_with_attnbias.cu
deleted file mode 100644
index 8dfc288f8..000000000
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_forward_fp16_masktype_2_with_attnbias.cu
+++ /dev/null
@@ -1,7 +0,0 @@
-#include <ck/ck.hpp>
-#include "ck_fmha_batched_forward.h"
-
-template void run_batched_forward_masktype_attnbias_dispatched<
-    ck::half_t,
-    2,
-    true>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_forward_fp16_masktype_2_with_attnbias.hip b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_forward_fp16_masktype_2_with_attnbias.hip
deleted file mode 100644
index a0bcb1f8e..000000000
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_forward_fp16_masktype_2_with_attnbias.hip
+++ /dev/null
@@ -1,8 +0,0 @@
-// !!! This is a file automatically generated by hipify!!!
-#include <ck/ck.hpp>
-#include "ck_fmha_batched_forward_hip.h"
-
-template void run_batched_forward_masktype_attnbias_dispatched<
-    ck::half_t,
-    2,
-    true>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_infer_bp16_masktype_0_no_attnbias.cu b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_infer_bp16_masktype_0_no_attnbias.cu
deleted file mode 100644
index 9748955e1..000000000
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_infer_bp16_masktype_0_no_attnbias.cu
+++ /dev/null
@@ -1,8 +0,0 @@
-#include <ck/ck.hpp>
-
-#include "ck_fmha_batched_infer.h"
-
-template void run_batched_infer_masktype_attnbias_dispatched<
-    ck::bhalf_t,
-    0,
-    false>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_infer_bp16_masktype_0_no_attnbias.hip b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_infer_bp16_masktype_0_no_attnbias.hip
deleted file mode 100644
index 84bf207fa..000000000
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_infer_bp16_masktype_0_no_attnbias.hip
+++ /dev/null
@@ -1,9 +0,0 @@
-// !!! This is a file automatically generated by hipify!!!
-#include <ck/ck.hpp>
-
-#include "ck_fmha_batched_infer_hip.h"
-
-template void run_batched_infer_masktype_attnbias_dispatched<
-    ck::bhalf_t,
-    0,
-    false>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_infer_bp16_masktype_0_with_attnbias.cu b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_infer_bp16_masktype_0_with_attnbias.cu
deleted file mode 100644
index 418f925c2..000000000
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_infer_bp16_masktype_0_with_attnbias.cu
+++ /dev/null
@@ -1,8 +0,0 @@
-#include <ck/ck.hpp>
-
-#include "ck_fmha_batched_infer.h"
-
-template void run_batched_infer_masktype_attnbias_dispatched<
-    ck::bhalf_t,
-    0,
-    true>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_infer_bp16_masktype_0_with_attnbias.hip b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_infer_bp16_masktype_0_with_attnbias.hip
deleted file mode 100644
index bb56f5423..000000000
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_infer_bp16_masktype_0_with_attnbias.hip
+++ /dev/null
@@ -1,9 +0,0 @@
-// !!! This is a file automatically generated by hipify!!!
-#include <ck/ck.hpp>
-
-#include "ck_fmha_batched_infer_hip.h"
-
-template void run_batched_infer_masktype_attnbias_dispatched<
-    ck::bhalf_t,
-    0,
-    true>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_infer_bp16_masktype_1_no_attnbias.cu b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_infer_bp16_masktype_1_no_attnbias.cu
deleted file mode 100644
index a7cdb48b8..000000000
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_infer_bp16_masktype_1_no_attnbias.cu
+++ /dev/null
@@ -1,8 +0,0 @@
-#include <ck/ck.hpp>
-
-#include "ck_fmha_batched_infer.h"
-
-template void run_batched_infer_masktype_attnbias_dispatched<
-    ck::bhalf_t,
-    1,
-    false>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_infer_bp16_masktype_1_no_attnbias.hip b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_infer_bp16_masktype_1_no_attnbias.hip
deleted file mode 100644
index 2286068d5..000000000
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_infer_bp16_masktype_1_no_attnbias.hip
+++ /dev/null
@@ -1,9 +0,0 @@
-// !!! This is a file automatically generated by hipify!!!
-#include <ck/ck.hpp>
-
-#include "ck_fmha_batched_infer_hip.h"
-
-template void run_batched_infer_masktype_attnbias_dispatched<
-    ck::bhalf_t,
-    1,
-    false>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_infer_bp16_masktype_1_with_attnbias.cu b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_infer_bp16_masktype_1_with_attnbias.cu
deleted file mode 100644
index 578855b9b..000000000
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_infer_bp16_masktype_1_with_attnbias.cu
+++ /dev/null
@@ -1,8 +0,0 @@
-#include <ck/ck.hpp>
-
-#include "ck_fmha_batched_infer.h"
-
-template void run_batched_infer_masktype_attnbias_dispatched<
-    ck::bhalf_t,
-    1,
-    true>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_infer_bp16_masktype_1_with_attnbias.hip b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_infer_bp16_masktype_1_with_attnbias.hip
deleted file mode 100644
index 6e65ed8d8..000000000
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_infer_bp16_masktype_1_with_attnbias.hip
+++ /dev/null
@@ -1,9 +0,0 @@
-// !!! This is a file automatically generated by hipify!!!
-#include <ck/ck.hpp>
-
-#include "ck_fmha_batched_infer_hip.h"
-
-template void run_batched_infer_masktype_attnbias_dispatched<
-    ck::bhalf_t,
-    1,
-    true>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_infer_bp16_masktype_2_no_attnbias.cu b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_infer_bp16_masktype_2_no_attnbias.cu
deleted file mode 100644
index 35e9bca9c..000000000
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_infer_bp16_masktype_2_no_attnbias.cu
+++ /dev/null
@@ -1,8 +0,0 @@
-#include <ck/ck.hpp>
-
-#include "ck_fmha_batched_infer.h"
-
-template void run_batched_infer_masktype_attnbias_dispatched<
-    ck::bhalf_t,
-    2,
-    false>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_infer_bp16_masktype_2_no_attnbias.hip b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_infer_bp16_masktype_2_no_attnbias.hip
deleted file mode 100644
index 228d411d7..000000000
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_infer_bp16_masktype_2_no_attnbias.hip
+++ /dev/null
@@ -1,9 +0,0 @@
-// !!! This is a file automatically generated by hipify!!!
-#include <ck/ck.hpp>
-
-#include "ck_fmha_batched_infer_hip.h"
-
-template void run_batched_infer_masktype_attnbias_dispatched<
-    ck::bhalf_t,
-    2,
-    false>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_infer_bp16_masktype_2_with_attnbias.cu b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_infer_bp16_masktype_2_with_attnbias.cu
deleted file mode 100644
index e27e3b5ff..000000000
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_infer_bp16_masktype_2_with_attnbias.cu
+++ /dev/null
@@ -1,8 +0,0 @@
-#include <ck/ck.hpp>
-
-#include "ck_fmha_batched_infer.h"
-
-template void run_batched_infer_masktype_attnbias_dispatched<
-    ck::bhalf_t,
-    2,
-    true>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_infer_bp16_masktype_2_with_attnbias.hip b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_infer_bp16_masktype_2_with_attnbias.hip
deleted file mode 100644
index 03658b015..000000000
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_infer_bp16_masktype_2_with_attnbias.hip
+++ /dev/null
@@ -1,9 +0,0 @@
-// !!! This is a file automatically generated by hipify!!!
-#include <ck/ck.hpp>
-
-#include "ck_fmha_batched_infer_hip.h"
-
-template void run_batched_infer_masktype_attnbias_dispatched<
-    ck::bhalf_t,
-    2,
-    true>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_infer_fp16_masktype_0_no_attnbias.cu b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_infer_fp16_masktype_0_no_attnbias.cu
deleted file mode 100644
index 5c83b0abd..000000000
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_infer_fp16_masktype_0_no_attnbias.cu
+++ /dev/null
@@ -1,8 +0,0 @@
-#include <ck/ck.hpp>
-
-#include "ck_fmha_batched_infer.h"
-
-template void run_batched_infer_masktype_attnbias_dispatched<
-    ck::half_t,
-    0,
-    false>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_infer_fp16_masktype_0_no_attnbias.hip b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_infer_fp16_masktype_0_no_attnbias.hip
deleted file mode 100644
index ec48f9d83..000000000
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_infer_fp16_masktype_0_no_attnbias.hip
+++ /dev/null
@@ -1,9 +0,0 @@
-// !!! This is a file automatically generated by hipify!!!
-#include <ck/ck.hpp>
-
-#include "ck_fmha_batched_infer_hip.h"
-
-template void run_batched_infer_masktype_attnbias_dispatched<
-    ck::half_t,
-    0,
-    false>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_infer_fp16_masktype_0_with_attnbias.cu b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_infer_fp16_masktype_0_with_attnbias.cu
deleted file mode 100644
index 11c76b35f..000000000
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_infer_fp16_masktype_0_with_attnbias.cu
+++ /dev/null
@@ -1,8 +0,0 @@
-#include <ck/ck.hpp>
-
-#include "ck_fmha_batched_infer.h"
-
-template void run_batched_infer_masktype_attnbias_dispatched<
-    ck::half_t,
-    0,
-    true>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_infer_fp16_masktype_0_with_attnbias.hip b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_infer_fp16_masktype_0_with_attnbias.hip
deleted file mode 100644
index 66f135619..000000000
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_infer_fp16_masktype_0_with_attnbias.hip
+++ /dev/null
@@ -1,9 +0,0 @@
-// !!! This is a file automatically generated by hipify!!!
-#include <ck/ck.hpp>
-
-#include "ck_fmha_batched_infer_hip.h"
-
-template void run_batched_infer_masktype_attnbias_dispatched<
-    ck::half_t,
-    0,
-    true>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_infer_fp16_masktype_1_no_attnbias.cu b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_infer_fp16_masktype_1_no_attnbias.cu
deleted file mode 100644
index b13f5a4c9..000000000
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_infer_fp16_masktype_1_no_attnbias.cu
+++ /dev/null
@@ -1,8 +0,0 @@
-#include <ck/ck.hpp>
-
-#include "ck_fmha_batched_infer.h"
-
-template void run_batched_infer_masktype_attnbias_dispatched<
-    ck::half_t,
-    1,
-    false>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_infer_fp16_masktype_1_no_attnbias.hip b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_infer_fp16_masktype_1_no_attnbias.hip
deleted file mode 100644
index 76e186c0b..000000000
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_infer_fp16_masktype_1_no_attnbias.hip
+++ /dev/null
@@ -1,9 +0,0 @@
-// !!! This is a file automatically generated by hipify!!!
-#include <ck/ck.hpp>
-
-#include "ck_fmha_batched_infer_hip.h"
-
-template void run_batched_infer_masktype_attnbias_dispatched<
-    ck::half_t,
-    1,
-    false>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_infer_fp16_masktype_1_with_attnbias.cu b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_infer_fp16_masktype_1_with_attnbias.cu
deleted file mode 100644
index 12f5991c4..000000000
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_infer_fp16_masktype_1_with_attnbias.cu
+++ /dev/null
@@ -1,8 +0,0 @@
-#include <ck/ck.hpp>
-
-#include "ck_fmha_batched_infer.h"
-
-template void run_batched_infer_masktype_attnbias_dispatched<
-    ck::half_t,
-    1,
-    true>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_infer_fp16_masktype_1_with_attnbias.hip b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_infer_fp16_masktype_1_with_attnbias.hip
deleted file mode 100644
index 922e9a0d7..000000000
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_infer_fp16_masktype_1_with_attnbias.hip
+++ /dev/null
@@ -1,9 +0,0 @@
-// !!! This is a file automatically generated by hipify!!!
-#include <ck/ck.hpp>
-
-#include "ck_fmha_batched_infer_hip.h"
-
-template void run_batched_infer_masktype_attnbias_dispatched<
-    ck::half_t,
-    1,
-    true>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_infer_fp16_masktype_2_no_attnbias.cu b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_infer_fp16_masktype_2_no_attnbias.cu
deleted file mode 100644
index 8d45859e5..000000000
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_infer_fp16_masktype_2_no_attnbias.cu
+++ /dev/null
@@ -1,8 +0,0 @@
-#include <ck/ck.hpp>
-
-#include "ck_fmha_batched_infer.h"
-
-template void run_batched_infer_masktype_attnbias_dispatched<
-    ck::half_t,
-    2,
-    false>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_infer_fp16_masktype_2_no_attnbias.hip b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_infer_fp16_masktype_2_no_attnbias.hip
deleted file mode 100644
index 5b32d22c4..000000000
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_infer_fp16_masktype_2_no_attnbias.hip
+++ /dev/null
@@ -1,9 +0,0 @@
-// !!! This is a file automatically generated by hipify!!!
-#include <ck/ck.hpp>
-
-#include "ck_fmha_batched_infer_hip.h"
-
-template void run_batched_infer_masktype_attnbias_dispatched<
-    ck::half_t,
-    2,
-    false>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_infer_fp16_masktype_2_with_attnbias.cu b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_infer_fp16_masktype_2_with_attnbias.cu
deleted file mode 100644
index 9f03be2b5..000000000
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_infer_fp16_masktype_2_with_attnbias.cu
+++ /dev/null
@@ -1,8 +0,0 @@
-#include <ck/ck.hpp>
-
-#include "ck_fmha_batched_infer.h"
-
-template void run_batched_infer_masktype_attnbias_dispatched<
-    ck::half_t,
-    2,
-    true>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_infer_fp16_masktype_2_with_attnbias.hip b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_infer_fp16_masktype_2_with_attnbias.hip
deleted file mode 100644
index 3382cadb7..000000000
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_infer_fp16_masktype_2_with_attnbias.hip
+++ /dev/null
@@ -1,9 +0,0 @@
-// !!! This is a file automatically generated by hipify!!!
-#include <ck/ck.hpp>
-
-#include "ck_fmha_batched_infer_hip.h"
-
-template void run_batched_infer_masktype_attnbias_dispatched<
-    ck::half_t,
-    2,
-    true>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_bp16_masktype_0_no_attnbias.cu b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_bp16_masktype_0_no_attnbias.cu
deleted file mode 100644
index 1b261e938..000000000
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_bp16_masktype_0_no_attnbias.cu
+++ /dev/null
@@ -1,14 +0,0 @@
-#include <ck/ck.hpp>
-#include "ck_fmha_grouped_backward.h"
-
-template void run_grouped_backward_masktype_attnbias_dispatched<
-    ck::bhalf_t,
-    0,
-    false,
-    true>(GroupedBackwardParams& param, hipStream_t stream);
-
-template void run_grouped_backward_masktype_attnbias_dispatched<
-    ck::bhalf_t,
-    0,
-    false,
-    false>(GroupedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_bp16_masktype_0_no_attnbias.hip b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_bp16_masktype_0_no_attnbias.hip
deleted file mode 100644
index ae627167e..000000000
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_bp16_masktype_0_no_attnbias.hip
+++ /dev/null
@@ -1,15 +0,0 @@
-// !!! This is a file automatically generated by hipify!!!
-#include <ck/ck.hpp>
-#include "ck_fmha_grouped_backward_hip.h"
-
-template void run_grouped_backward_masktype_attnbias_dispatched<
-    ck::bhalf_t,
-    0,
-    false,
-    true>(GroupedBackwardParams& param, hipStream_t stream);
-
-template void run_grouped_backward_masktype_attnbias_dispatched<
-    ck::bhalf_t,
-    0,
-    false,
-    false>(GroupedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_bp16_masktype_0_with_attnbias.cu b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_bp16_masktype_0_with_attnbias.cu
deleted file mode 100644
index 8cb42c808..000000000
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_bp16_masktype_0_with_attnbias.cu
+++ /dev/null
@@ -1,14 +0,0 @@
-#include <ck/ck.hpp>
-#include "ck_fmha_grouped_backward.h"
-
-template void run_grouped_backward_masktype_attnbias_dispatched<
-    ck::bhalf_t,
-    0,
-    true,
-    true>(GroupedBackwardParams& param, hipStream_t stream);
-
-template void run_grouped_backward_masktype_attnbias_dispatched<
-    ck::bhalf_t,
-    0,
-    true,
-    false>(GroupedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_bp16_masktype_0_with_attnbias.hip b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_bp16_masktype_0_with_attnbias.hip
deleted file mode 100644
index e25431de4..000000000
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_bp16_masktype_0_with_attnbias.hip
+++ /dev/null
@@ -1,15 +0,0 @@
-// !!! This is a file automatically generated by hipify!!!
-#include <ck/ck.hpp>
-#include "ck_fmha_grouped_backward_hip.h"
-
-template void run_grouped_backward_masktype_attnbias_dispatched<
-    ck::bhalf_t,
-    0,
-    true,
-    true>(GroupedBackwardParams& param, hipStream_t stream);
-
-template void run_grouped_backward_masktype_attnbias_dispatched<
-    ck::bhalf_t,
-    0,
-    true,
-    false>(GroupedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_bp16_masktype_1_no_attnbias.cu b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_bp16_masktype_1_no_attnbias.cu
deleted file mode 100644
index ebefe8bab..000000000
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_bp16_masktype_1_no_attnbias.cu
+++ /dev/null
@@ -1,14 +0,0 @@
-#include <ck/ck.hpp>
-#include "ck_fmha_grouped_backward.h"
-
-template void run_grouped_backward_masktype_attnbias_dispatched<
-    ck::bhalf_t,
-    1,
-    false,
-    true>(GroupedBackwardParams& param, hipStream_t stream);
-
-template void run_grouped_backward_masktype_attnbias_dispatched<
-    ck::bhalf_t,
-    1,
-    false,
-    false>(GroupedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_bp16_masktype_1_no_attnbias.hip b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_bp16_masktype_1_no_attnbias.hip
deleted file mode 100644
index f2eeaede4..000000000
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_bp16_masktype_1_no_attnbias.hip
+++ /dev/null
@@ -1,15 +0,0 @@
-// !!! This is a file automatically generated by hipify!!!
-#include <ck/ck.hpp>
-#include "ck_fmha_grouped_backward_hip.h"
-
-template void run_grouped_backward_masktype_attnbias_dispatched<
-    ck::bhalf_t,
-    1,
-    false,
-    true>(GroupedBackwardParams& param, hipStream_t stream);
-
-template void run_grouped_backward_masktype_attnbias_dispatched<
-    ck::bhalf_t,
-    1,
-    false,
-    false>(GroupedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_bp16_masktype_1_with_attnbias.cu b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_bp16_masktype_1_with_attnbias.cu
deleted file mode 100644
index 1d7de293e..000000000
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_bp16_masktype_1_with_attnbias.cu
+++ /dev/null
@@ -1,14 +0,0 @@
-#include <ck/ck.hpp>
-#include "ck_fmha_grouped_backward.h"
-
-template void run_grouped_backward_masktype_attnbias_dispatched<
-    ck::bhalf_t,
-    1,
-    true,
-    true>(GroupedBackwardParams& param, hipStream_t stream);
-
-template void run_grouped_backward_masktype_attnbias_dispatched<
-    ck::bhalf_t,
-    1,
-    true,
-    false>(GroupedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_bp16_masktype_1_with_attnbias.hip b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_bp16_masktype_1_with_attnbias.hip
deleted file mode 100644
index 1ca61d4b7..000000000
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_bp16_masktype_1_with_attnbias.hip
+++ /dev/null
@@ -1,15 +0,0 @@
-// !!! This is a file automatically generated by hipify!!!
-#include <ck/ck.hpp>
-#include "ck_fmha_grouped_backward_hip.h"
-
-template void run_grouped_backward_masktype_attnbias_dispatched<
-    ck::bhalf_t,
-    1,
-    true,
-    true>(GroupedBackwardParams& param, hipStream_t stream);
-
-template void run_grouped_backward_masktype_attnbias_dispatched<
-    ck::bhalf_t,
-    1,
-    true,
-    false>(GroupedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_bp16_masktype_2_no_attnbias.cu b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_bp16_masktype_2_no_attnbias.cu
deleted file mode 100644
index 524fb30e5..000000000
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_bp16_masktype_2_no_attnbias.cu
+++ /dev/null
@@ -1,14 +0,0 @@
-#include <ck/ck.hpp>
-#include "ck_fmha_grouped_backward.h"
-
-template void run_grouped_backward_masktype_attnbias_dispatched<
-    ck::bhalf_t,
-    2,
-    false,
-    true>(GroupedBackwardParams& param, hipStream_t stream);
-
-template void run_grouped_backward_masktype_attnbias_dispatched<
-    ck::bhalf_t,
-    2,
-    false,
-    false>(GroupedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_bp16_masktype_2_no_attnbias.hip b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_bp16_masktype_2_no_attnbias.hip
deleted file mode 100644
index 6910a6703..000000000
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_bp16_masktype_2_no_attnbias.hip
+++ /dev/null
@@ -1,15 +0,0 @@
-// !!! This is a file automatically generated by hipify!!!
-#include <ck/ck.hpp>
-#include "ck_fmha_grouped_backward_hip.h"
-
-template void run_grouped_backward_masktype_attnbias_dispatched<
-    ck::bhalf_t,
-    2,
-    false,
-    true>(GroupedBackwardParams& param, hipStream_t stream);
-
-template void run_grouped_backward_masktype_attnbias_dispatched<
-    ck::bhalf_t,
-    2,
-    false,
-    false>(GroupedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_bp16_masktype_2_with_attnbias.cu b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_bp16_masktype_2_with_attnbias.cu
deleted file mode 100644
index 58f2f8b1a..000000000
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_bp16_masktype_2_with_attnbias.cu
+++ /dev/null
@@ -1,14 +0,0 @@
-#include <ck/ck.hpp>
-#include "ck_fmha_grouped_backward.h"
-
-template void run_grouped_backward_masktype_attnbias_dispatched<
-    ck::bhalf_t,
-    2,
-    true,
-    true>(GroupedBackwardParams& param, hipStream_t stream);
-
-template void run_grouped_backward_masktype_attnbias_dispatched<
-    ck::bhalf_t,
-    2,
-    true,
-    false>(GroupedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_bp16_masktype_2_with_attnbias.hip b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_bp16_masktype_2_with_attnbias.hip
deleted file mode 100644
index 90359f124..000000000
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_bp16_masktype_2_with_attnbias.hip
+++ /dev/null
@@ -1,15 +0,0 @@
-// !!! This is a file automatically generated by hipify!!!
-#include <ck/ck.hpp>
-#include "ck_fmha_grouped_backward_hip.h"
-
-template void run_grouped_backward_masktype_attnbias_dispatched<
-    ck::bhalf_t,
-    2,
-    true,
-    true>(GroupedBackwardParams& param, hipStream_t stream);
-
-template void run_grouped_backward_masktype_attnbias_dispatched<
-    ck::bhalf_t,
-    2,
-    true,
-    false>(GroupedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_fp16_masktype_0_no_attnbias.cu b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_fp16_masktype_0_no_attnbias.cu
deleted file mode 100644
index 1098e69be..000000000
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_fp16_masktype_0_no_attnbias.cu
+++ /dev/null
@@ -1,14 +0,0 @@
-#include <ck/ck.hpp>
-#include "ck_fmha_grouped_backward.h"
-
-template void run_grouped_backward_masktype_attnbias_dispatched<
-    ck::half_t,
-    0,
-    false,
-    true>(GroupedBackwardParams& param, hipStream_t stream);
-
-template void run_grouped_backward_masktype_attnbias_dispatched<
-    ck::half_t,
-    0,
-    false,
-    false>(GroupedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_fp16_masktype_0_no_attnbias.hip b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_fp16_masktype_0_no_attnbias.hip
deleted file mode 100644
index ef6197b44..000000000
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_fp16_masktype_0_no_attnbias.hip
+++ /dev/null
@@ -1,15 +0,0 @@
-// !!! This is a file automatically generated by hipify!!!
-#include <ck/ck.hpp>
-#include "ck_fmha_grouped_backward_hip.h"
-
-template void run_grouped_backward_masktype_attnbias_dispatched<
-    ck::half_t,
-    0,
-    false,
-    true>(GroupedBackwardParams& param, hipStream_t stream);
-
-template void run_grouped_backward_masktype_attnbias_dispatched<
-    ck::half_t,
-    0,
-    false,
-    false>(GroupedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_fp16_masktype_0_with_attnbias.cu b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_fp16_masktype_0_with_attnbias.cu
deleted file mode 100644
index 60583a859..000000000
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_fp16_masktype_0_with_attnbias.cu
+++ /dev/null
@@ -1,14 +0,0 @@
-#include <ck/ck.hpp>
-#include "ck_fmha_grouped_backward.h"
-
-template void run_grouped_backward_masktype_attnbias_dispatched<
-    ck::half_t,
-    0,
-    true,
-    true>(GroupedBackwardParams& param, hipStream_t stream);
-
-template void run_grouped_backward_masktype_attnbias_dispatched<
-    ck::half_t,
-    0,
-    true,
-    false>(GroupedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_fp16_masktype_0_with_attnbias.hip b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_fp16_masktype_0_with_attnbias.hip
deleted file mode 100644
index 3dbdf04b7..000000000
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_fp16_masktype_0_with_attnbias.hip
+++ /dev/null
@@ -1,15 +0,0 @@
-// !!! This is a file automatically generated by hipify!!!
-#include <ck/ck.hpp>
-#include "ck_fmha_grouped_backward_hip.h"
-
-template void run_grouped_backward_masktype_attnbias_dispatched<
-    ck::half_t,
-    0,
-    true,
-    true>(GroupedBackwardParams& param, hipStream_t stream);
-
-template void run_grouped_backward_masktype_attnbias_dispatched<
-    ck::half_t,
-    0,
-    true,
-    false>(GroupedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_fp16_masktype_1_no_attnbias.cu b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_fp16_masktype_1_no_attnbias.cu
deleted file mode 100644
index b8aabeb86..000000000
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_fp16_masktype_1_no_attnbias.cu
+++ /dev/null
@@ -1,14 +0,0 @@
-#include <ck/ck.hpp>
-#include "ck_fmha_grouped_backward.h"
-
-template void run_grouped_backward_masktype_attnbias_dispatched<
-    ck::half_t,
-    1,
-    false,
-    true>(GroupedBackwardParams& param, hipStream_t stream);
-
-template void run_grouped_backward_masktype_attnbias_dispatched<
-    ck::half_t,
-    1,
-    false,
-    false>(GroupedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_fp16_masktype_1_no_attnbias.hip b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_fp16_masktype_1_no_attnbias.hip
deleted file mode 100644
index f76ea2c12..000000000
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_fp16_masktype_1_no_attnbias.hip
+++ /dev/null
@@ -1,15 +0,0 @@
-// !!! This is a file automatically generated by hipify!!!
-#include <ck/ck.hpp>
-#include "ck_fmha_grouped_backward_hip.h"
-
-template void run_grouped_backward_masktype_attnbias_dispatched<
-    ck::half_t,
-    1,
-    false,
-    true>(GroupedBackwardParams& param, hipStream_t stream);
-
-template void run_grouped_backward_masktype_attnbias_dispatched<
-    ck::half_t,
-    1,
-    false,
-    false>(GroupedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_fp16_masktype_1_with_attnbias.cu b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_fp16_masktype_1_with_attnbias.cu
deleted file mode 100644
index 8629a947a..000000000
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_fp16_masktype_1_with_attnbias.cu
+++ /dev/null
@@ -1,14 +0,0 @@
-#include <ck/ck.hpp>
-#include "ck_fmha_grouped_backward.h"
-
-template void run_grouped_backward_masktype_attnbias_dispatched<
-    ck::half_t,
-    1,
-    true,
-    true>(GroupedBackwardParams& param, hipStream_t stream);
-
-template void run_grouped_backward_masktype_attnbias_dispatched<
-    ck::half_t,
-    1,
-    true,
-    false>(GroupedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_fp16_masktype_1_with_attnbias.hip b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_fp16_masktype_1_with_attnbias.hip
deleted file mode 100644
index 42ef3f534..000000000
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_fp16_masktype_1_with_attnbias.hip
+++ /dev/null
@@ -1,15 +0,0 @@
-// !!! This is a file automatically generated by hipify!!!
-#include <ck/ck.hpp>
-#include "ck_fmha_grouped_backward_hip.h"
-
-template void run_grouped_backward_masktype_attnbias_dispatched<
-    ck::half_t,
-    1,
-    true,
-    true>(GroupedBackwardParams& param, hipStream_t stream);
-
-template void run_grouped_backward_masktype_attnbias_dispatched<
-    ck::half_t,
-    1,
-    true,
-    false>(GroupedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_fp16_masktype_2_no_attnbias.cu b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_fp16_masktype_2_no_attnbias.cu
deleted file mode 100644
index 00b0f5c32..000000000
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_fp16_masktype_2_no_attnbias.cu
+++ /dev/null
@@ -1,14 +0,0 @@
-#include <ck/ck.hpp>
-#include "ck_fmha_grouped_backward.h"
-
-template void run_grouped_backward_masktype_attnbias_dispatched<
-    ck::half_t,
-    2,
-    false,
-    true>(GroupedBackwardParams& param, hipStream_t stream);
-
-template void run_grouped_backward_masktype_attnbias_dispatched<
-    ck::half_t,
-    2,
-    false,
-    false>(GroupedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_fp16_masktype_2_no_attnbias.hip b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_fp16_masktype_2_no_attnbias.hip
deleted file mode 100644
index 8a5ef7d02..000000000
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_fp16_masktype_2_no_attnbias.hip
+++ /dev/null
@@ -1,15 +0,0 @@
-// !!! This is a file automatically generated by hipify!!!
-#include <ck/ck.hpp>
-#include "ck_fmha_grouped_backward_hip.h"
-
-template void run_grouped_backward_masktype_attnbias_dispatched<
-    ck::half_t,
-    2,
-    false,
-    true>(GroupedBackwardParams& param, hipStream_t stream);
-
-template void run_grouped_backward_masktype_attnbias_dispatched<
-    ck::half_t,
-    2,
-    false,
-    false>(GroupedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_fp16_masktype_2_with_attnbias.cu b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_fp16_masktype_2_with_attnbias.cu
deleted file mode 100644
index 8b6112aba..000000000
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_fp16_masktype_2_with_attnbias.cu
+++ /dev/null
@@ -1,14 +0,0 @@
-#include <ck/ck.hpp>
-#include "ck_fmha_grouped_backward.h"
-
-template void run_grouped_backward_masktype_attnbias_dispatched<
-    ck::half_t,
-    2,
-    true,
-    true>(GroupedBackwardParams& param, hipStream_t stream);
-
-template void run_grouped_backward_masktype_attnbias_dispatched<
-    ck::half_t,
-    2,
-    true,
-    false>(GroupedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_fp16_masktype_2_with_attnbias.hip b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_fp16_masktype_2_with_attnbias.hip
deleted file mode 100644
index 68e4d564d..000000000
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_fp16_masktype_2_with_attnbias.hip
+++ /dev/null
@@ -1,15 +0,0 @@
-// !!! This is a file automatically generated by hipify!!!
-#include <ck/ck.hpp>
-#include "ck_fmha_grouped_backward_hip.h"
-
-template void run_grouped_backward_masktype_attnbias_dispatched<
-    ck::half_t,
-    2,
-    true,
-    true>(GroupedBackwardParams& param, hipStream_t stream);
-
-template void run_grouped_backward_masktype_attnbias_dispatched<
-    ck::half_t,
-    2,
-    true,
-    false>(GroupedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_forward_bp16_masktype_0_no_attnbias.cu b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_forward_bp16_masktype_0_no_attnbias.cu
deleted file mode 100644
index bfde13c7d..000000000
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_forward_bp16_masktype_0_no_attnbias.cu
+++ /dev/null
@@ -1,7 +0,0 @@
-#include <ck/ck.hpp>
-#include "ck_fmha_grouped_forward.h"
-
-template void run_grouped_forward_masktype_attnbias_dispatched<
-    ck::bhalf_t,
-    0,
-    false>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_forward_bp16_masktype_0_no_attnbias.hip b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_forward_bp16_masktype_0_no_attnbias.hip
deleted file mode 100644
index 9f60df93c..000000000
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_forward_bp16_masktype_0_no_attnbias.hip
+++ /dev/null
@@ -1,8 +0,0 @@
-// !!! This is a file automatically generated by hipify!!!
-#include <ck/ck.hpp>
-#include "ck_fmha_grouped_forward_hip.h"
-
-template void run_grouped_forward_masktype_attnbias_dispatched<
-    ck::bhalf_t,
-    0,
-    false>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_forward_bp16_masktype_0_with_attnbias.cu b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_forward_bp16_masktype_0_with_attnbias.cu
deleted file mode 100644
index 85e853c36..000000000
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_forward_bp16_masktype_0_with_attnbias.cu
+++ /dev/null
@@ -1,7 +0,0 @@
-#include <ck/ck.hpp>
-#include "ck_fmha_grouped_forward.h"
-
-template void run_grouped_forward_masktype_attnbias_dispatched<
-    ck::bhalf_t,
-    0,
-    true>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_forward_bp16_masktype_0_with_attnbias.hip b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_forward_bp16_masktype_0_with_attnbias.hip
deleted file mode 100644
index 1154b074b..000000000
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_forward_bp16_masktype_0_with_attnbias.hip
+++ /dev/null
@@ -1,8 +0,0 @@
-// !!! This is a file automatically generated by hipify!!!
-#include <ck/ck.hpp>
-#include "ck_fmha_grouped_forward_hip.h"
-
-template void run_grouped_forward_masktype_attnbias_dispatched<
-    ck::bhalf_t,
-    0,
-    true>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_forward_bp16_masktype_1_no_attnbias.cu b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_forward_bp16_masktype_1_no_attnbias.cu
deleted file mode 100644
index d86afa1aa..000000000
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_forward_bp16_masktype_1_no_attnbias.cu
+++ /dev/null
@@ -1,7 +0,0 @@
-#include <ck/ck.hpp>
-#include "ck_fmha_grouped_forward.h"
-
-template void run_grouped_forward_masktype_attnbias_dispatched<
-    ck::bhalf_t,
-    1,
-    false>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_forward_bp16_masktype_1_no_attnbias.hip b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_forward_bp16_masktype_1_no_attnbias.hip
deleted file mode 100644
index 285fef03e..000000000
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_forward_bp16_masktype_1_no_attnbias.hip
+++ /dev/null
@@ -1,8 +0,0 @@
-// !!! This is a file automatically generated by hipify!!!
-#include <ck/ck.hpp>
-#include "ck_fmha_grouped_forward_hip.h"
-
-template void run_grouped_forward_masktype_attnbias_dispatched<
-    ck::bhalf_t,
-    1,
-    false>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_forward_bp16_masktype_1_with_attnbias.cu b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_forward_bp16_masktype_1_with_attnbias.cu
deleted file mode 100644
index dd58b5b28..000000000
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_forward_bp16_masktype_1_with_attnbias.cu
+++ /dev/null
@@ -1,7 +0,0 @@
-#include <ck/ck.hpp>
-#include "ck_fmha_grouped_forward.h"
-
-template void run_grouped_forward_masktype_attnbias_dispatched<
-    ck::bhalf_t,
-    1,
-    true>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_forward_bp16_masktype_1_with_attnbias.hip b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_forward_bp16_masktype_1_with_attnbias.hip
deleted file mode 100644
index 16df2be7d..000000000
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_forward_bp16_masktype_1_with_attnbias.hip
+++ /dev/null
@@ -1,8 +0,0 @@
-// !!! This is a file automatically generated by hipify!!!
-#include <ck/ck.hpp>
-#include "ck_fmha_grouped_forward_hip.h"
-
-template void run_grouped_forward_masktype_attnbias_dispatched<
-    ck::bhalf_t,
-    1,
-    true>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_forward_bp16_masktype_2_no_attnbias.cu b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_forward_bp16_masktype_2_no_attnbias.cu
deleted file mode 100644
index 085245c08..000000000
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_forward_bp16_masktype_2_no_attnbias.cu
+++ /dev/null
@@ -1,7 +0,0 @@
-#include <ck/ck.hpp>
-#include "ck_fmha_grouped_forward.h"
-
-template void run_grouped_forward_masktype_attnbias_dispatched<
-    ck::bhalf_t,
-    2,
-    false>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_forward_bp16_masktype_2_no_attnbias.hip b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_forward_bp16_masktype_2_no_attnbias.hip
deleted file mode 100644
index e89ff54aa..000000000
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_forward_bp16_masktype_2_no_attnbias.hip
+++ /dev/null
@@ -1,8 +0,0 @@
-// !!! This is a file automatically generated by hipify!!!
-#include <ck/ck.hpp>
-#include "ck_fmha_grouped_forward_hip.h"
-
-template void run_grouped_forward_masktype_attnbias_dispatched<
-    ck::bhalf_t,
-    2,
-    false>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_forward_bp16_masktype_2_with_attnbias.cu b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_forward_bp16_masktype_2_with_attnbias.cu
deleted file mode 100644
index 8c3ea29a4..000000000
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_forward_bp16_masktype_2_with_attnbias.cu
+++ /dev/null
@@ -1,7 +0,0 @@
-#include <ck/ck.hpp>
-#include "ck_fmha_grouped_forward.h"
-
-template void run_grouped_forward_masktype_attnbias_dispatched<
-    ck::bhalf_t,
-    2,
-    true>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_forward_bp16_masktype_2_with_attnbias.hip b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_forward_bp16_masktype_2_with_attnbias.hip
deleted file mode 100644
index 9e7ebe753..000000000
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_forward_bp16_masktype_2_with_attnbias.hip
+++ /dev/null
@@ -1,8 +0,0 @@
-// !!! This is a file automatically generated by hipify!!!
-#include <ck/ck.hpp>
-#include "ck_fmha_grouped_forward_hip.h"
-
-template void run_grouped_forward_masktype_attnbias_dispatched<
-    ck::bhalf_t,
-    2,
-    true>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_forward_fp16_masktype_0_no_attnbias.cu b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_forward_fp16_masktype_0_no_attnbias.cu
deleted file mode 100644
index 19adc3971..000000000
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_forward_fp16_masktype_0_no_attnbias.cu
+++ /dev/null
@@ -1,7 +0,0 @@
-#include <ck/ck.hpp>
-#include "ck_fmha_grouped_forward.h"
-
-template void run_grouped_forward_masktype_attnbias_dispatched<
-    ck::half_t,
-    0,
-    false>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_forward_fp16_masktype_0_no_attnbias.hip b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_forward_fp16_masktype_0_no_attnbias.hip
deleted file mode 100644
index ee425b155..000000000
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_forward_fp16_masktype_0_no_attnbias.hip
+++ /dev/null
@@ -1,8 +0,0 @@
-// !!! This is a file automatically generated by hipify!!!
-#include <ck/ck.hpp>
-#include "ck_fmha_grouped_forward_hip.h"
-
-template void run_grouped_forward_masktype_attnbias_dispatched<
-    ck::half_t,
-    0,
-    false>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_forward_fp16_masktype_0_with_attnbias.cu b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_forward_fp16_masktype_0_with_attnbias.cu
deleted file mode 100644
index 6da5508d3..000000000
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_forward_fp16_masktype_0_with_attnbias.cu
+++ /dev/null
@@ -1,7 +0,0 @@
-#include <ck/ck.hpp>
-#include "ck_fmha_grouped_forward.h"
-
-template void run_grouped_forward_masktype_attnbias_dispatched<
-    ck::half_t,
-    0,
-    true>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_forward_fp16_masktype_0_with_attnbias.hip b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_forward_fp16_masktype_0_with_attnbias.hip
deleted file mode 100644
index 8bea44444..000000000
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_forward_fp16_masktype_0_with_attnbias.hip
+++ /dev/null
@@ -1,8 +0,0 @@
-// !!! This is a file automatically generated by hipify!!!
-#include <ck/ck.hpp>
-#include "ck_fmha_grouped_forward_hip.h"
-
-template void run_grouped_forward_masktype_attnbias_dispatched<
-    ck::half_t,
-    0,
-    true>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_forward_fp16_masktype_1_no_attnbias.cu b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_forward_fp16_masktype_1_no_attnbias.cu
deleted file mode 100644
index f97de6fb3..000000000
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_forward_fp16_masktype_1_no_attnbias.cu
+++ /dev/null
@@ -1,7 +0,0 @@
-#include <ck/ck.hpp>
-#include "ck_fmha_grouped_forward.h"
-
-template void run_grouped_forward_masktype_attnbias_dispatched<
-    ck::half_t,
-    1,
-    false>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_forward_fp16_masktype_1_no_attnbias.hip b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_forward_fp16_masktype_1_no_attnbias.hip
deleted file mode 100644
index 2cb989ee7..000000000
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_forward_fp16_masktype_1_no_attnbias.hip
+++ /dev/null
@@ -1,8 +0,0 @@
-// !!! This is a file automatically generated by hipify!!!
-#include <ck/ck.hpp>
-#include "ck_fmha_grouped_forward_hip.h"
-
-template void run_grouped_forward_masktype_attnbias_dispatched<
-    ck::half_t,
-    1,
-    false>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_forward_fp16_masktype_1_with_attnbias.cu b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_forward_fp16_masktype_1_with_attnbias.cu
deleted file mode 100644
index 5bd33901b..000000000
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_forward_fp16_masktype_1_with_attnbias.cu
+++ /dev/null
@@ -1,7 +0,0 @@
-#include <ck/ck.hpp>
-#include "ck_fmha_grouped_forward.h"
-
-template void run_grouped_forward_masktype_attnbias_dispatched<
-    ck::half_t,
-    1,
-    true>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_forward_fp16_masktype_1_with_attnbias.hip b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_forward_fp16_masktype_1_with_attnbias.hip
deleted file mode 100644
index faa22debf..000000000
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_forward_fp16_masktype_1_with_attnbias.hip
+++ /dev/null
@@ -1,8 +0,0 @@
-// !!! This is a file automatically generated by hipify!!!
-#include <ck/ck.hpp>
-#include "ck_fmha_grouped_forward_hip.h"
-
-template void run_grouped_forward_masktype_attnbias_dispatched<
-    ck::half_t,
-    1,
-    true>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_forward_fp16_masktype_2_no_attnbias.cu b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_forward_fp16_masktype_2_no_attnbias.cu
deleted file mode 100644
index 155c9eb6c..000000000
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_forward_fp16_masktype_2_no_attnbias.cu
+++ /dev/null
@@ -1,7 +0,0 @@
-#include <ck/ck.hpp>
-#include "ck_fmha_grouped_forward.h"
-
-template void run_grouped_forward_masktype_attnbias_dispatched<
-    ck::half_t,
-    2,
-    false>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_forward_fp16_masktype_2_no_attnbias.hip b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_forward_fp16_masktype_2_no_attnbias.hip
deleted file mode 100644
index dbd9c7424..000000000
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_forward_fp16_masktype_2_no_attnbias.hip
+++ /dev/null
@@ -1,8 +0,0 @@
-// !!! This is a file automatically generated by hipify!!!
-#include <ck/ck.hpp>
-#include "ck_fmha_grouped_forward_hip.h"
-
-template void run_grouped_forward_masktype_attnbias_dispatched<
-    ck::half_t,
-    2,
-    false>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_forward_fp16_masktype_2_with_attnbias.cu b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_forward_fp16_masktype_2_with_attnbias.cu
deleted file mode 100644
index 29f3ed1a3..000000000
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_forward_fp16_masktype_2_with_attnbias.cu
+++ /dev/null
@@ -1,7 +0,0 @@
-#include <ck/ck.hpp>
-#include "ck_fmha_grouped_forward.h"
-
-template void run_grouped_forward_masktype_attnbias_dispatched<
-    ck::half_t,
-    2,
-    true>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_forward_fp16_masktype_2_with_attnbias.hip b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_forward_fp16_masktype_2_with_attnbias.hip
deleted file mode 100644
index d67039c69..000000000
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_forward_fp16_masktype_2_with_attnbias.hip
+++ /dev/null
@@ -1,8 +0,0 @@
-// !!! This is a file automatically generated by hipify!!!
-#include <ck/ck.hpp>
-#include "ck_fmha_grouped_forward_hip.h"
-
-template void run_grouped_forward_masktype_attnbias_dispatched<
-    ck::half_t,
-    2,
-    true>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_infer_bp16_masktype_0_no_attnbias.cu b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_infer_bp16_masktype_0_no_attnbias.cu
deleted file mode 100644
index 973213413..000000000
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_infer_bp16_masktype_0_no_attnbias.cu
+++ /dev/null
@@ -1,8 +0,0 @@
-#include <ck/ck.hpp>
-
-#include "ck_fmha_grouped_infer.h"
-
-template void run_grouped_infer_masktype_attnbias_dispatched<
-    ck::bhalf_t,
-    0,
-    false>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_infer_bp16_masktype_0_no_attnbias.hip b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_infer_bp16_masktype_0_no_attnbias.hip
deleted file mode 100644
index da5eb15a5..000000000
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_infer_bp16_masktype_0_no_attnbias.hip
+++ /dev/null
@@ -1,9 +0,0 @@
-// !!! This is a file automatically generated by hipify!!!
-#include <ck/ck.hpp>
-
-#include "ck_fmha_grouped_infer_hip.h"
-
-template void run_grouped_infer_masktype_attnbias_dispatched<
-    ck::bhalf_t,
-    0,
-    false>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_infer_bp16_masktype_0_with_attnbias.cu b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_infer_bp16_masktype_0_with_attnbias.cu
deleted file mode 100644
index 96e0ba425..000000000
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_infer_bp16_masktype_0_with_attnbias.cu
+++ /dev/null
@@ -1,8 +0,0 @@
-#include <ck/ck.hpp>
-
-#include "ck_fmha_grouped_infer.h"
-
-template void run_grouped_infer_masktype_attnbias_dispatched<
-    ck::bhalf_t,
-    0,
-    true>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_infer_bp16_masktype_0_with_attnbias.hip b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_infer_bp16_masktype_0_with_attnbias.hip
deleted file mode 100644
index 4cfaba313..000000000
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_infer_bp16_masktype_0_with_attnbias.hip
+++ /dev/null
@@ -1,9 +0,0 @@
-// !!! This is a file automatically generated by hipify!!!
-#include <ck/ck.hpp>
-
-#include "ck_fmha_grouped_infer_hip.h"
-
-template void run_grouped_infer_masktype_attnbias_dispatched<
-    ck::bhalf_t,
-    0,
-    true>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_infer_bp16_masktype_1_no_attnbias.cu b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_infer_bp16_masktype_1_no_attnbias.cu
deleted file mode 100644
index 332724e73..000000000
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_infer_bp16_masktype_1_no_attnbias.cu
+++ /dev/null
@@ -1,8 +0,0 @@
-#include <ck/ck.hpp>
-
-#include "ck_fmha_grouped_infer.h"
-
-template void run_grouped_infer_masktype_attnbias_dispatched<
-    ck::bhalf_t,
-    1,
-    false>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_infer_bp16_masktype_1_no_attnbias.hip b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_infer_bp16_masktype_1_no_attnbias.hip
deleted file mode 100644
index 76237a595..000000000
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_infer_bp16_masktype_1_no_attnbias.hip
+++ /dev/null
@@ -1,9 +0,0 @@
-// !!! This is a file automatically generated by hipify!!!
-#include <ck/ck.hpp>
-
-#include "ck_fmha_grouped_infer_hip.h"
-
-template void run_grouped_infer_masktype_attnbias_dispatched<
-    ck::bhalf_t,
-    1,
-    false>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_infer_bp16_masktype_1_with_attnbias.cu b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_infer_bp16_masktype_1_with_attnbias.cu
deleted file mode 100644
index cb1120f5b..000000000
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_infer_bp16_masktype_1_with_attnbias.cu
+++ /dev/null
@@ -1,8 +0,0 @@
-#include <ck/ck.hpp>
-
-#include "ck_fmha_grouped_infer.h"
-
-template void run_grouped_infer_masktype_attnbias_dispatched<
-    ck::bhalf_t,
-    1,
-    true>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_infer_bp16_masktype_1_with_attnbias.hip b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_infer_bp16_masktype_1_with_attnbias.hip
deleted file mode 100644
index 712d61922..000000000
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_infer_bp16_masktype_1_with_attnbias.hip
+++ /dev/null
@@ -1,9 +0,0 @@
-// !!! This is a file automatically generated by hipify!!!
-#include <ck/ck.hpp>
-
-#include "ck_fmha_grouped_infer_hip.h"
-
-template void run_grouped_infer_masktype_attnbias_dispatched<
-    ck::bhalf_t,
-    1,
-    true>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_infer_bp16_masktype_2_no_attnbias.cu b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_infer_bp16_masktype_2_no_attnbias.cu
deleted file mode 100644
index 51ed70cab..000000000
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_infer_bp16_masktype_2_no_attnbias.cu
+++ /dev/null
@@ -1,8 +0,0 @@
-#include <ck/ck.hpp>
-
-#include "ck_fmha_grouped_infer.h"
-
-template void run_grouped_infer_masktype_attnbias_dispatched<
-    ck::bhalf_t,
-    2,
-    false>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_infer_bp16_masktype_2_no_attnbias.hip b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_infer_bp16_masktype_2_no_attnbias.hip
deleted file mode 100644
index eae026e23..000000000
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_infer_bp16_masktype_2_no_attnbias.hip
+++ /dev/null
@@ -1,9 +0,0 @@
-// !!! This is a file automatically generated by hipify!!!
-#include <ck/ck.hpp>
-
-#include "ck_fmha_grouped_infer_hip.h"
-
-template void run_grouped_infer_masktype_attnbias_dispatched<
-    ck::bhalf_t,
-    2,
-    false>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_infer_bp16_masktype_2_with_attnbias.cu b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_infer_bp16_masktype_2_with_attnbias.cu
deleted file mode 100644
index c157e89c1..000000000
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_infer_bp16_masktype_2_with_attnbias.cu
+++ /dev/null
@@ -1,8 +0,0 @@
-#include <ck/ck.hpp>
-
-#include "ck_fmha_grouped_infer.h"
-
-template void run_grouped_infer_masktype_attnbias_dispatched<
-    ck::bhalf_t,
-    2,
-    true>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_infer_bp16_masktype_2_with_attnbias.hip b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_infer_bp16_masktype_2_with_attnbias.hip
deleted file mode 100644
index 682f3e97e..000000000
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_infer_bp16_masktype_2_with_attnbias.hip
+++ /dev/null
@@ -1,9 +0,0 @@
-// !!! This is a file automatically generated by hipify!!!
-#include <ck/ck.hpp>
-
-#include "ck_fmha_grouped_infer_hip.h"
-
-template void run_grouped_infer_masktype_attnbias_dispatched<
-    ck::bhalf_t,
-    2,
-    true>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_infer_fp16_masktype_0_no_attnbias.cu b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_infer_fp16_masktype_0_no_attnbias.cu
deleted file mode 100644
index bbcd3ab0e..000000000
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_infer_fp16_masktype_0_no_attnbias.cu
+++ /dev/null
@@ -1,8 +0,0 @@
-#include <ck/ck.hpp>
-
-#include "ck_fmha_grouped_infer.h"
-
-template void run_grouped_infer_masktype_attnbias_dispatched<
-    ck::half_t,
-    0,
-    false>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_infer_fp16_masktype_0_no_attnbias.hip b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_infer_fp16_masktype_0_no_attnbias.hip
deleted file mode 100644
index c1fbe2d06..000000000
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_infer_fp16_masktype_0_no_attnbias.hip
+++ /dev/null
@@ -1,9 +0,0 @@
-// !!! This is a file automatically generated by hipify!!!
-#include <ck/ck.hpp>
-
-#include "ck_fmha_grouped_infer_hip.h"
-
-template void run_grouped_infer_masktype_attnbias_dispatched<
-    ck::half_t,
-    0,
-    false>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_infer_fp16_masktype_0_with_attnbias.cu b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_infer_fp16_masktype_0_with_attnbias.cu
deleted file mode 100644
index e320f5de6..000000000
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_infer_fp16_masktype_0_with_attnbias.cu
+++ /dev/null
@@ -1,8 +0,0 @@
-#include <ck/ck.hpp>
-
-#include "ck_fmha_grouped_infer.h"
-
-template void run_grouped_infer_masktype_attnbias_dispatched<
-    ck::half_t,
-    0,
-    true>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_infer_fp16_masktype_0_with_attnbias.hip b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_infer_fp16_masktype_0_with_attnbias.hip
deleted file mode 100644
index 3e8dbbe7e..000000000
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_infer_fp16_masktype_0_with_attnbias.hip
+++ /dev/null
@@ -1,9 +0,0 @@
-// !!! This is a file automatically generated by hipify!!!
-#include <ck/ck.hpp>
-
-#include "ck_fmha_grouped_infer_hip.h"
-
-template void run_grouped_infer_masktype_attnbias_dispatched<
-    ck::half_t,
-    0,
-    true>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_infer_fp16_masktype_1_no_attnbias.cu b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_infer_fp16_masktype_1_no_attnbias.cu
deleted file mode 100644
index e763dde6a..000000000
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_infer_fp16_masktype_1_no_attnbias.cu
+++ /dev/null
@@ -1,8 +0,0 @@
-#include <ck/ck.hpp>
-
-#include "ck_fmha_grouped_infer.h"
-
-template void run_grouped_infer_masktype_attnbias_dispatched<
-    ck::half_t,
-    1,
-    false>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_infer_fp16_masktype_1_no_attnbias.hip b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_infer_fp16_masktype_1_no_attnbias.hip
deleted file mode 100644
index e302c675d..000000000
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_infer_fp16_masktype_1_no_attnbias.hip
+++ /dev/null
@@ -1,9 +0,0 @@
-// !!! This is a file automatically generated by hipify!!!
-#include <ck/ck.hpp>
-
-#include "ck_fmha_grouped_infer_hip.h"
-
-template void run_grouped_infer_masktype_attnbias_dispatched<
-    ck::half_t,
-    1,
-    false>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_infer_fp16_masktype_1_with_attnbias.cu b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_infer_fp16_masktype_1_with_attnbias.cu
deleted file mode 100644
index 3ec2d41da..000000000
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_infer_fp16_masktype_1_with_attnbias.cu
+++ /dev/null
@@ -1,8 +0,0 @@
-#include <ck/ck.hpp>
-
-#include "ck_fmha_grouped_infer.h"
-
-template void run_grouped_infer_masktype_attnbias_dispatched<
-    ck::half_t,
-    1,
-    true>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_infer_fp16_masktype_1_with_attnbias.hip b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_infer_fp16_masktype_1_with_attnbias.hip
deleted file mode 100644
index 52666509b..000000000
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_infer_fp16_masktype_1_with_attnbias.hip
+++ /dev/null
@@ -1,9 +0,0 @@
-// !!! This is a file automatically generated by hipify!!!
-#include <ck/ck.hpp>
-
-#include "ck_fmha_grouped_infer_hip.h"
-
-template void run_grouped_infer_masktype_attnbias_dispatched<
-    ck::half_t,
-    1,
-    true>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_infer_fp16_masktype_2_no_attnbias.cu b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_infer_fp16_masktype_2_no_attnbias.cu
deleted file mode 100644
index dee7a0845..000000000
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_infer_fp16_masktype_2_no_attnbias.cu
+++ /dev/null
@@ -1,8 +0,0 @@
-#include <ck/ck.hpp>
-
-#include "ck_fmha_grouped_infer.h"
-
-template void run_grouped_infer_masktype_attnbias_dispatched<
-    ck::half_t,
-    2,
-    false>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_infer_fp16_masktype_2_no_attnbias.hip b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_infer_fp16_masktype_2_no_attnbias.hip
deleted file mode 100644
index c1a0026b3..000000000
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_infer_fp16_masktype_2_no_attnbias.hip
+++ /dev/null
@@ -1,9 +0,0 @@
-// !!! This is a file automatically generated by hipify!!!
-#include <ck/ck.hpp>
-
-#include "ck_fmha_grouped_infer_hip.h"
-
-template void run_grouped_infer_masktype_attnbias_dispatched<
-    ck::half_t,
-    2,
-    false>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_infer_fp16_masktype_2_with_attnbias.cu b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_infer_fp16_masktype_2_with_attnbias.cu
deleted file mode 100644
index b5515e9a0..000000000
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_infer_fp16_masktype_2_with_attnbias.cu
+++ /dev/null
@@ -1,8 +0,0 @@
-#include <ck/ck.hpp>
-
-#include "ck_fmha_grouped_infer.h"
-
-template void run_grouped_infer_masktype_attnbias_dispatched<
-    ck::half_t,
-    2,
-    true>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_infer_fp16_masktype_2_with_attnbias.hip b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_infer_fp16_masktype_2_with_attnbias.hip
deleted file mode 100644
index 035531ad3..000000000
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_infer_fp16_masktype_2_with_attnbias.hip
+++ /dev/null
@@ -1,9 +0,0 @@
-// !!! This is a file automatically generated by hipify!!!
-#include <ck/ck.hpp>
-
-#include "ck_fmha_grouped_infer_hip.h"
-
-template void run_grouped_infer_masktype_attnbias_dispatched<
-    ck::half_t,
-    2,
-    true>(GroupedForwardParams& param, hipStream_t stream);

From 58f6bbf76484387815bf8e457d5c8fb32d73e8d4 Mon Sep 17 00:00:00 2001
From: Qianfeng Zhang <Qianfeng.Zhang@amd.com>
Date: Thu, 9 Nov 2023 17:49:48 +0000
Subject: [PATCH 194/641] Tuning the device-op template parameters for infer
 and forward

---
 .../attention/hip_fmha/ck_fmha_batched_forward.h |  7 ++++---
 .../attention/hip_fmha/ck_fmha_batched_infer.h   |  2 +-
 .../hip_fmha/ck_fmha_forward_gemm_constants.h    | 16 ++++++++--------
 .../attention/hip_fmha/ck_fmha_grouped_forward.h |  2 +-
 .../attention/hip_fmha/ck_fmha_grouped_infer.h   |  2 +-
 .../hip_fmha/ck_fmha_infer_gemm_constants.h      | 16 ++++++++--------
 6 files changed, 23 insertions(+), 22 deletions(-)

diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_forward.h b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_forward.h
index 7b5193256..93df407da 100644
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_forward.h
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_forward.h
@@ -179,7 +179,7 @@ struct batched_forward_masktype_attnbias_dispatched {
         "ABlockTransfer and BBlockTransfer should use completely same K1 sizes and ThreadClusterLengths!");
 
     constexpr ck::index_t kABBlockTransferSrcScalarPerVector_max =
-        min(4, thread_slice_length_ak1);
+        min(8, thread_slice_length_ak1);
 
     BATCHED_FORWARD_HEADDIM_SWITCH(param.K, param.Kv, [&] {
       constexpr ck::index_t thread_slice_length_gemm1n = kGemm1NPerBlock /
@@ -362,8 +362,9 @@ struct batched_forward_masktype_attnbias_dispatched {
 };
 
 template <typename scalar_t, int32_t custom_mask_type, bool has_attn_bias>
-void run_batched_forward_masktype_attnbias_dispatched(BatchedForwardParams& param, hipStream_t stream)
-{
+void run_batched_forward_masktype_attnbias_dispatched(
+    BatchedForwardParams& param,
+    hipStream_t stream) {
   batched_forward_masktype_attnbias_dispatched<
       scalar_t,
       custom_mask_type,
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_infer.h b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_infer.h
index c76a30b73..59666a0f8 100644
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_infer.h
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_infer.h
@@ -168,7 +168,7 @@ struct batched_infer_masktype_attnbias_dispatched {
         "ABlockTransfer and BBlockTransfer should use completely same K1 sizes and ThreadClusterLengths!");
 
     constexpr ck::index_t kABBlockTransferSrcScalarPerVector_max =
-        min(4, thread_slice_length_ak1);
+        min(8, thread_slice_length_ak1);
 
     BATCHED_INFER_HEADDIM_SWITCH(param.K, param.Kv, [&] {
       constexpr ck::index_t thread_slice_length_gemm1n = kGemm1NPerBlock /
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_forward_gemm_constants.h b/xformers/csrc/attention/hip_fmha/ck_fmha_forward_gemm_constants.h
index 5a1790b5f..7f65aeb3f 100644
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_forward_gemm_constants.h
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_forward_gemm_constants.h
@@ -13,8 +13,8 @@ struct GemmOpConstantsBatchedForward {
   static constexpr ck::index_t KPerBlock = 32;
   // static constexpr ck::index_t Gemm1NPerBlock;
   static constexpr ck::index_t Gemm1KPerBlock = 32;
-  static constexpr ck::index_t AK1 = 4;
-  static constexpr ck::index_t BK1 = 4;
+  static constexpr ck::index_t AK1 = 8;
+  static constexpr ck::index_t BK1 = 8;
   static constexpr ck::index_t B1K1 = 2;
   static constexpr ck::index_t MPerXDL = 32;
   static constexpr ck::index_t NPerXDL = 32;
@@ -22,14 +22,14 @@ struct GemmOpConstantsBatchedForward {
   static constexpr ck::index_t NXdlPerWave = 4;
   // static constexpr ck::index_t Gemm1NXdlPerWave;
   static constexpr ck::index_t DropoutStep = 1;
-  using ABlockTransferThreadClusterLengths_AK0_M_AK1 = S<8, 32, 1>;
+  using ABlockTransferThreadClusterLengths_AK0_M_AK1 = S<4, 64, 1>;
   using ABlockTransferThreadClusterArrangeOrder = S<1, 0, 2>;
   using ABlockTransferSrcAccessOrder = S<1, 0, 2>;
   static constexpr ck::index_t ABlockTransferSrcVectorDim = 2;
   // static constexpr ck::index_t ABlockTransferSrcScalarPerVector;
   static constexpr ck::index_t ABlockTransferDstScalarPerVector_AK1 = 4;
   static constexpr bool ABlockLdsExtraM = true;
-  using BBlockTransferThreadClusterLengths_BK0_N_BK1 = S<8, 32, 1>;
+  using BBlockTransferThreadClusterLengths_BK0_N_BK1 = S<4, 64, 1>;
   using BBlockTransferThreadClusterArrangeOrder = S<1, 0, 2>;
   using BBlockTransferSrcAccessOrder = S<1, 0, 2>;
   static constexpr ck::index_t BBlockTransferSrcVectorDim = 2;
@@ -64,8 +64,8 @@ struct GemmOpConstantsGroupedForward {
   static constexpr ck::index_t KPerBlock = 32;
   // static constexpr ck::index_t Gemm1NPerBlock;
   static constexpr ck::index_t Gemm1KPerBlock = 32;
-  static constexpr ck::index_t AK1 = 4;
-  static constexpr ck::index_t BK1 = 4;
+  static constexpr ck::index_t AK1 = 8;
+  static constexpr ck::index_t BK1 = 8;
   static constexpr ck::index_t B1K1 = 2;
   static constexpr ck::index_t MPerXDL = 32;
   static constexpr ck::index_t NPerXDL = 32;
@@ -73,14 +73,14 @@ struct GemmOpConstantsGroupedForward {
   static constexpr ck::index_t NXdlPerWave = 4;
   // static constexpr ck::index_t Gemm1NXdlPerWave;
   static constexpr ck::index_t DropoutStep = 1;
-  using ABlockTransferThreadClusterLengths_AK0_M_AK1 = S<8, 32, 1>;
+  using ABlockTransferThreadClusterLengths_AK0_M_AK1 = S<4, 64, 1>;
   using ABlockTransferThreadClusterArrangeOrder = S<1, 0, 2>;
   using ABlockTransferSrcAccessOrder = S<1, 0, 2>;
   static constexpr ck::index_t ABlockTransferSrcVectorDim = 2;
   // static constexpr ck::index_t ABlockTransferSrcScalarPerVector;
   static constexpr ck::index_t ABlockTransferDstScalarPerVector_AK1 = 4;
   static constexpr bool ABlockLdsExtraM = true;
-  using BBlockTransferThreadClusterLengths_BK0_N_BK1 = S<8, 32, 1>;
+  using BBlockTransferThreadClusterLengths_BK0_N_BK1 = S<4, 64, 1>;
   using BBlockTransferThreadClusterArrangeOrder = S<1, 0, 2>;
   using BBlockTransferSrcAccessOrder = S<1, 0, 2>;
   static constexpr ck::index_t BBlockTransferSrcVectorDim = 2;
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_forward.h b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_forward.h
index 9eebcfa14..55fb27bf4 100644
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_forward.h
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_forward.h
@@ -173,7 +173,7 @@ struct grouped_forward_masktype_attnbias_dispatched {
         "ABlockTransfer and BBlockTransfer should use completely same K1 sizes and ThreadClusterLengths!");
 
     constexpr ck::index_t kABBlockTransferSrcScalarPerVector_max =
-        min(4, thread_slice_length_ak1);
+        min(8, thread_slice_length_ak1);
 
     GROUPED_FORWARD_HEADDIM_SWITCH(param.K, param.Kv, [&] {
       constexpr ck::index_t thread_slice_length_gemm1n = kGemm1NPerBlock /
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_infer.h b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_infer.h
index 31a90d200..5b95c75a7 100644
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_infer.h
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_infer.h
@@ -168,7 +168,7 @@ struct grouped_infer_masktype_attnbias_dispatched {
         "ABlockTransfer and BBlockTransfer should use completely same K1 sizes and ThreadClusterLengths!");
 
     constexpr ck::index_t kABBlockTransferSrcScalarPerVector_max =
-        min(4, thread_slice_length_ak1);
+        min(8, thread_slice_length_ak1);
 
     GROUPED_INFER_HEADDIM_SWITCH(param.K, param.Kv, [&] {
       constexpr ck::index_t thread_slice_length_gemm1n = kGemm1NPerBlock /
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_infer_gemm_constants.h b/xformers/csrc/attention/hip_fmha/ck_fmha_infer_gemm_constants.h
index 8f492ff00..7c7ad4bee 100644
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_infer_gemm_constants.h
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_infer_gemm_constants.h
@@ -14,22 +14,22 @@ struct GemmOpConstantsBatchedInfer {
   static constexpr ck::index_t KPerBlock = 32;
   // static constexpr ck::index_t Gemm1NPerBlock;
   static constexpr ck::index_t Gemm1KPerBlock = 32;
-  static constexpr ck::index_t AK1 = 4;
-  static constexpr ck::index_t BK1 = 4;
+  static constexpr ck::index_t AK1 = 8;
+  static constexpr ck::index_t BK1 = 8;
   static constexpr ck::index_t B1K1 = 2;
   static constexpr ck::index_t MPerXDL = 32;
   static constexpr ck::index_t NPerXDL = 32;
   static constexpr ck::index_t MXdlPerWave = 1;
   static constexpr ck::index_t NXdlPerWave = 4;
   // static constexpr ck::index_t Gemm1NXdlPerWave;
-  using ABlockTransferThreadClusterLengths_AK0_M_AK1 = S<8, 32, 1>;
+  using ABlockTransferThreadClusterLengths_AK0_M_AK1 = S<4, 64, 1>;
   using ABlockTransferThreadClusterArrangeOrder = S<1, 0, 2>;
   using ABlockTransferSrcAccessOrder = S<1, 0, 2>;
   static constexpr ck::index_t ABlockTransferSrcVectorDim = 2;
   // static constexpr ck::index_t ABlockTransferSrcScalarPerVector;
   static constexpr ck::index_t ABlockTransferDstScalarPerVector_AK1 = 4;
   static constexpr bool ABlockLdsExtraM = true;
-  using BBlockTransferThreadClusterLengths_BK0_N_BK1 = S<8, 32, 1>;
+  using BBlockTransferThreadClusterLengths_BK0_N_BK1 = S<4, 64, 1>;
   using BBlockTransferThreadClusterArrangeOrder = S<1, 0, 2>;
   using BBlockTransferSrcAccessOrder = S<1, 0, 2>;
   static constexpr ck::index_t BBlockTransferSrcVectorDim = 2;
@@ -62,22 +62,22 @@ struct GemmOpConstantsGroupedInfer {
   static constexpr ck::index_t KPerBlock = 32;
   // static constexpr ck::index_t Gemm1NPerBlock;
   static constexpr ck::index_t Gemm1KPerBlock = 32;
-  static constexpr ck::index_t AK1 = 4;
-  static constexpr ck::index_t BK1 = 4;
+  static constexpr ck::index_t AK1 = 8;
+  static constexpr ck::index_t BK1 = 8;
   static constexpr ck::index_t B1K1 = 2;
   static constexpr ck::index_t MPerXDL = 32;
   static constexpr ck::index_t NPerXDL = 32;
   static constexpr ck::index_t MXdlPerWave = 1;
   static constexpr ck::index_t NXdlPerWave = 4;
   // static constexpr ck::index_t Gemm1NXdlPerWave;
-  using ABlockTransferThreadClusterLengths_AK0_M_AK1 = S<8, 32, 1>;
+  using ABlockTransferThreadClusterLengths_AK0_M_AK1 = S<4, 64, 1>;
   using ABlockTransferThreadClusterArrangeOrder = S<1, 0, 2>;
   using ABlockTransferSrcAccessOrder = S<1, 0, 2>;
   static constexpr ck::index_t ABlockTransferSrcVectorDim = 2;
   // static constexpr ck::index_t ABlockTransferSrcScalarPerVector,
   static constexpr ck::index_t ABlockTransferDstScalarPerVector_AK1 = 4;
   static constexpr bool ABlockLdsExtraM = true;
-  using BBlockTransferThreadClusterLengths_BK0_N_BK1 = S<8, 32, 1>;
+  using BBlockTransferThreadClusterLengths_BK0_N_BK1 = S<4, 64, 1>;
   using BBlockTransferThreadClusterArrangeOrder = S<1, 0, 2>;
   using BBlockTransferSrcAccessOrder = S<1, 0, 2>;
   static constexpr ck::index_t BBlockTransferSrcVectorDim = 2;

From 1f2af5cf35c0084a1bd90f412514e09377864d9d Mon Sep 17 00:00:00 2001
From: Qianfeng Zhang <Qianfeng.Zhang@amd.com>
Date: Thu, 9 Nov 2023 18:44:07 +0000
Subject: [PATCH 195/641] Synchronize with latest CK flashAttention commits

---
 third_party/composable_kernel | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/third_party/composable_kernel b/third_party/composable_kernel
index 339b86e96..ac3ef99cf 160000
--- a/third_party/composable_kernel
+++ b/third_party/composable_kernel
@@ -1 +1 @@
-Subproject commit 339b86e9682120d8aaa415203545a3cfadbbb142
+Subproject commit ac3ef99cf8f78d212143a2d63139094d207d93ae

From 8fdf105141fe1e62a52425f417094d0010f7d858 Mon Sep 17 00:00:00 2001
From: Qianfeng Zhang <Qianfeng.Zhang@amd.com>
Date: Thu, 9 Nov 2023 20:50:50 +0000
Subject: [PATCH 196/641] Tuning the device-op template parameters for infer
 and forward again

---
 .../hip_fmha/ck_fmha_batched_forward.h        |  2 +-
 .../hip_fmha/ck_fmha_batched_infer.h          |  2 +-
 .../hip_fmha/ck_fmha_forward_gemm_constants.h | 22 +++++++++----------
 .../hip_fmha/ck_fmha_grouped_forward.h        |  2 +-
 .../hip_fmha/ck_fmha_grouped_infer.h          |  2 +-
 .../hip_fmha/ck_fmha_infer_gemm_constants.h   |  4 ++--
 6 files changed, 16 insertions(+), 18 deletions(-)

diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_forward.h b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_forward.h
index 93df407da..b6a98b5fc 100644
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_forward.h
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_forward.h
@@ -196,7 +196,7 @@ struct batched_forward_masktype_attnbias_dispatched {
                   At(I3);
 
       constexpr ck::index_t kCShuffleBlockTransferScalarPerVector_max =
-          min(1, thread_slice_length_cshuflle_n);
+          min(4, thread_slice_length_cshuflle_n);
 
       if constexpr (
           kB1BlockTransferSrcScalarPerVector_max >=
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_infer.h b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_infer.h
index 59666a0f8..dfc17191b 100644
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_infer.h
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_infer.h
@@ -185,7 +185,7 @@ struct batched_infer_masktype_attnbias_dispatched {
                   At(I3);
 
       constexpr ck::index_t kCShuffleBlockTransferScalarPerVector_max =
-          min(2, thread_slice_length_cshuflle_n);
+          min(4, thread_slice_length_cshuflle_n);
 
       if constexpr (
           kB1BlockTransferSrcScalarPerVector_max >=
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_forward_gemm_constants.h b/xformers/csrc/attention/hip_fmha/ck_fmha_forward_gemm_constants.h
index 7f65aeb3f..c80ec4603 100644
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_forward_gemm_constants.h
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_forward_gemm_constants.h
@@ -5,6 +5,7 @@
 
 // list the template parameters that will not be tuned,
 // the commented lines gives the tunable template parameters
+// clang-format off
 struct GemmOpConstantsBatchedForward {
   static constexpr ck::index_t NumGemmKPrefetchStage = 1;
   static constexpr ck::index_t BlockSize = 256;
@@ -46,16 +47,15 @@ struct GemmOpConstantsBatchedForward {
   static constexpr bool B1BlockLdsExtraN = false;
   static constexpr ck::index_t CShuffleMXdlPerWavePerShuffle = 1;
   // static constexpr ck::index_t CShuffleNXdlPerWavePerShuffle;
-  using CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock =
-      S<1, 16, 1, 16>;
-  // static constexpr ck::index_t
-  // CShuffleBlockTransferScalarPerVector_NPerBlock;
-  static constexpr ck::index_t Acc1BiasTransferSrcScalarPerVector =
-      1; // not actually used by the kernel
+  using CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock = S<1, 8, 1, 32>;
+  // static constexpr ck::index_t CShuffleBlockTransferScalarPerVector_NPerBlock;
+  static constexpr ck::index_t Acc1BiasTransferSrcScalarPerVector = 1; // not actually used by the kernel
 };
+// clang-format on
 
 // list the template parameters that will not be tuned,
 // the commented lines gives the tunable template parameters
+// clang-format off
 struct GemmOpConstantsGroupedForward {
   static constexpr ck::index_t NumGemmKPrefetchStage = 1;
   static constexpr ck::index_t BlockSize = 256;
@@ -97,10 +97,8 @@ struct GemmOpConstantsGroupedForward {
   static constexpr bool B1BlockLdsExtraN = false;
   static constexpr ck::index_t CShuffleMXdlPerWavePerShuffle = 1;
   // static constexpr ck::index_t CShuffleNXdlPerWavePerShuffle;
-  using CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock =
-      S<1, 16, 1, 16>;
-  // static constexpr ck::index_t
-  // CShuffleBlockTransferScalarPerVector_NPerBlock;
-  static constexpr ck::index_t Acc1BiasTransferSrcScalarPerVector =
-      1; // not actually used by the kernel
+  using CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock = S<1, 8, 1, 32>;
+  // static constexpr ck::index_t CShuffleBlockTransferScalarPerVector_NPerBlock;
+  static constexpr ck::index_t Acc1BiasTransferSrcScalarPerVector = 1; // not actually used by the kernel
 };
+// clang-format on
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_forward.h b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_forward.h
index 55fb27bf4..00c92682b 100644
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_forward.h
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_forward.h
@@ -190,7 +190,7 @@ struct grouped_forward_masktype_attnbias_dispatched {
                   At(I3);
 
       constexpr ck::index_t kCShuffleBlockTransferScalarPerVector_max =
-          min(1, thread_slice_length_cshuflle_n);
+          min(4, thread_slice_length_cshuflle_n);
 
       if constexpr (
           kB1BlockTransferSrcScalarPerVector_max >=
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_infer.h b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_infer.h
index 5b95c75a7..81c6d3381 100644
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_infer.h
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_infer.h
@@ -185,7 +185,7 @@ struct grouped_infer_masktype_attnbias_dispatched {
                   At(I3);
 
       constexpr ck::index_t kCShuffleBlockTransferScalarPerVector_max =
-          min(2, thread_slice_length_cshuflle_n);
+          min(4, thread_slice_length_cshuflle_n);
 
       if constexpr (
           kB1BlockTransferSrcScalarPerVector_max >=
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_infer_gemm_constants.h b/xformers/csrc/attention/hip_fmha/ck_fmha_infer_gemm_constants.h
index 7c7ad4bee..bdeb5ef85 100644
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_infer_gemm_constants.h
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_infer_gemm_constants.h
@@ -46,7 +46,7 @@ struct GemmOpConstantsBatchedInfer {
   static constexpr bool B1BlockLdsExtraN = false;
   static constexpr ck::index_t CShuffleMXdlPerWavePerShuffle = 1;
   // static constexpr ck::index_t CShuffleNXdlPerWavePerShuffle;
-  using CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock = S<1, 16, 1, 16>;
+  using CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock = S<1, 8, 1, 32>;
   // static constexpr ck::index_t CShuffleBlockTransferScalarPerVector_NPerBlock;
 };
 //clang-format on
@@ -94,7 +94,7 @@ struct GemmOpConstantsGroupedInfer {
   static constexpr bool B1BlockLdsExtraN = false;
   static constexpr ck::index_t CShuffleMXdlPerWavePerShuffle = 1;
   // static constexpr ck::index_t CShuffleNXdlPerWavePerShuffle;
-  using CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock = S<1, 16, 1, 16>;
+  using CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock = S<1, 8, 1, 32>;
   // static constexpr ck::index_t CShuffleBlockTransferScalarPerVector_NPerBlock;
 };
 // clang-format on

From a1a8352c70ed3216b8252cac7eb1b9ac05c8200d Mon Sep 17 00:00:00 2001
From: Qianfeng Zhang <Qianfeng.Zhang@amd.com>
Date: Fri, 10 Nov 2023 18:40:53 +0000
Subject: [PATCH 197/641] Synchronize with latest CK flashAttention commits

---
 third_party/composable_kernel | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/third_party/composable_kernel b/third_party/composable_kernel
index ac3ef99cf..9a423017f 160000
--- a/third_party/composable_kernel
+++ b/third_party/composable_kernel
@@ -1 +1 @@
-Subproject commit ac3ef99cf8f78d212143a2d63139094d207d93ae
+Subproject commit 9a423017f2335dd60bb1c1a28b6a5808fb95b917

From ab0ae4d9c6a821ead5fac069b29b6e8888baa4fa Mon Sep 17 00:00:00 2001
From: Qianfeng Zhang <Qianfeng.Zhang@amd.com>
Date: Tue, 14 Nov 2023 16:36:57 +0000
Subject: [PATCH 198/641] Synchronize to the latest ck-flashAttn which improved
 the performance for forward/infer

---
 third_party/composable_kernel | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/third_party/composable_kernel b/third_party/composable_kernel
index 9a423017f..2f93e26f5 160000
--- a/third_party/composable_kernel
+++ b/third_party/composable_kernel
@@ -1 +1 @@
-Subproject commit 9a423017f2335dd60bb1c1a28b6a5808fb95b917
+Subproject commit 2f93e26f55ce0e9839c358c0c713ce8eb3db38a2

From dde88e252a9b41c06c95b439e389a7a2bf274c39 Mon Sep 17 00:00:00 2001
From: Max Podkorytov <4273004+tenpercent@users.noreply.github.com>
Date: Wed, 15 Nov 2023 18:09:11 -0500
Subject: [PATCH 199/641] fix numeric limits usage

---
 xformers/csrc/attention/hip_fmha/ck_attention_forward_decoder.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/xformers/csrc/attention/hip_fmha/ck_attention_forward_decoder.h b/xformers/csrc/attention/hip_fmha/ck_attention_forward_decoder.h
index be4cc790e..442bd8c00 100644
--- a/xformers/csrc/attention/hip_fmha/ck_attention_forward_decoder.h
+++ b/xformers/csrc/attention/hip_fmha/ck_attention_forward_decoder.h
@@ -150,7 +150,7 @@ __global__ void efficient_attention_forward_decoder_ck_kernel(
   data_vec4_t q_thread;
   load_v<decltype(q_), data_vec4_t>(q_, lane_idx, &q_thread);
   // Each block computes different B value
-  float max_qk_acc = std::numeric_limits<float>::lowest();
+  float max_qk_acc = ck::NumericLimits<float>::Lowest();
 
   // Compute S[T_MAX] = for i in range(T): S[t] = sum(Q[d] * K[t, d])
   // Split T across wavefronts in a block, unroll loads to expose more

From ee84791dba54e018dcf73e1813cdca9117222a40 Mon Sep 17 00:00:00 2001
From: Max Podkorytov <4273004+tenpercent@users.noreply.github.com>
Date: Wed, 15 Nov 2023 19:08:35 -0500
Subject: [PATCH 200/641] bring the head dimension into op parameters and
 kernel arguments

---
 .../hip_fmha/attention_forward_decoder.cpp    |  3 +-
 .../hip_fmha/ck_attention_forward_decoder.h   | 38 ++++++++++---------
 2 files changed, 23 insertions(+), 18 deletions(-)

diff --git a/xformers/csrc/attention/hip_fmha/attention_forward_decoder.cpp b/xformers/csrc/attention/hip_fmha/attention_forward_decoder.cpp
index 8b5b88f03..52f830e58 100644
--- a/xformers/csrc/attention/hip_fmha/attention_forward_decoder.cpp
+++ b/xformers/csrc/attention/hip_fmha/attention_forward_decoder.cpp
@@ -72,7 +72,7 @@ at::Tensor& efficient_attention_forward_decoder_ck_out_impl(
   TORCH_CHECK(seq_positions.is_cuda());
 
   TORCH_CHECK(cache_K.size(1) <= T_MAX);
-  TORCH_CHECK(cache_K.size(3) == D_H);
+  TORCH_CHECK(cache_K.size(3) <= D_H);
 
   auto B = XQ.size(0);
   auto H = XQ.size(2);
@@ -118,6 +118,7 @@ at::Tensor& efficient_attention_forward_decoder_ck_out_impl(
             K_acc.stride(0),
             K_acc.stride(1),
             K_acc.stride(2),
+            K_acc.size(3),
             K_acc.size(2) == 1,
             qk_scale,
             blocks,
diff --git a/xformers/csrc/attention/hip_fmha/ck_attention_forward_decoder.h b/xformers/csrc/attention/hip_fmha/ck_attention_forward_decoder.h
index 442bd8c00..1c4e4234a 100644
--- a/xformers/csrc/attention/hip_fmha/ck_attention_forward_decoder.h
+++ b/xformers/csrc/attention/hip_fmha/ck_attention_forward_decoder.h
@@ -75,20 +75,20 @@ float __device__ __forceinline__ wavefrontReduce(float val, F f) {
   return val;
 }
 
-template <typename TDataPtr, typename TDataVec>
+template <typename TData, typename TDataVec>
 __forceinline__ __device__ void load_v(
-    TDataPtr data_ptr,
+    const TData* __restrict__ data_ptr,
     int32_t vector_offset,
-    TDataVec* load_to) {
-  *load_to = *(reinterpret_cast<const TDataVec*>(data_ptr) + vector_offset);
+    TDataVec* __restrict__ load_to) {
+      *load_to = *(reinterpret_cast<const TDataVec*>(data_ptr) + vector_offset);
 }
 
-template <typename TDataPtr, typename TDataVec>
+template <typename TData, typename TDataVec>
 __forceinline__ __device__ void store_v(
-    TDataPtr data_ptr,
+    TData* __restrict__ data_ptr,
     int32_t vector_offset,
     TDataVec value) {
-  *(reinterpret_cast<TDataVec*>(data_ptr) + vector_offset) = value;
+      *(reinterpret_cast<TDataVec*>(data_ptr) + vector_offset) = value;
 }
 
 template <
@@ -108,6 +108,7 @@ __global__ void efficient_attention_forward_decoder_ck_kernel(
     const ptrdiff_t K_stride_0,
     const ptrdiff_t K_stride_1,
     const ptrdiff_t K_stride_2,
+    const int32_t D_H,
     const bool multiquery,
     const float qk_scale) {
   static_assert(n_loop_unroll_tail < n_loop_unroll, "");
@@ -133,7 +134,6 @@ __global__ void efficient_attention_forward_decoder_ck_kernel(
   const int32_t thread_linear_idx =
       lane_idx + wavefront_idx * threads_per_wavefront;
 
-  // Need D_H == 256 (NB: 128 in CUDA because of wavefront/warp sizes 64/32)
   // const auto* q_ = &(XQ_acc[b][0][h][0]);
   const auto XQO_base_offset = b * XQ_stride_0 + h * XQ_stride_2;
   const auto* q_ = XQ + XQO_base_offset;
@@ -148,7 +148,7 @@ __global__ void efficient_attention_forward_decoder_ck_kernel(
   using data_t = scalar_t;
   using data_vec4_t = typename ck::vector_type<data_t, 4>::type;
   data_vec4_t q_thread;
-  load_v<decltype(q_), data_vec4_t>(q_, lane_idx, &q_thread);
+  load_v<data_t, data_vec4_t>(q_, lane_idx, &q_thread);
   // Each block computes different B value
   float max_qk_acc = ck::NumericLimits<float>::Lowest();
 
@@ -166,7 +166,7 @@ __global__ void efficient_attention_forward_decoder_ck_kernel(
     for (auto ttt = 0; ttt < n_loop_unroll; ++ttt) {
       const int32_t t = tt + ttt;
       // load the K[b][t][h|0][:] row into registers
-      load_v<decltype(cache_K_base), data_vec4_t>(
+      load_v<data_t, data_vec4_t>(
           cache_K_base + t * K_stride_1, lane_idx, &k_loads[ttt]);
     }
     float qk_accs[n_loop_unroll] = {};
@@ -197,7 +197,7 @@ __global__ void efficient_attention_forward_decoder_ck_kernel(
       const int32_t t = tt + ttt;
       if (t < t_max) {
         // load the K[b][t][h|0][:] row into registers
-        load_v<decltype(cache_K_base), data_vec4_t>(
+        load_v<data_t, data_vec4_t>(
             cache_K_base + t * K_stride_1, lane_idx, &k_loads[ttt]);
       }
     }
@@ -277,7 +277,7 @@ __global__ void efficient_attention_forward_decoder_ck_kernel(
     for (auto ttt = 0; ttt < n_loop_unroll; ++ttt) {
       const int32_t t = tt + ttt;
       // load the V[b][t][h|0][:] row into registers, reusing K register storage
-      load_v<decltype(cache_V_base), data_vec4_t>(
+      load_v<data_t, data_vec4_t>(
           cache_V_base + t * K_stride_1, lane_idx, &k_loads[ttt]);
       ps[ttt] = smem[t];
     }
@@ -296,7 +296,7 @@ __global__ void efficient_attention_forward_decoder_ck_kernel(
       if (t < t_max) {
         // load the V[b][t][h|0][:] row into registers, reusing K register
         // storage
-        load_v<decltype(cache_V_base), data_vec4_t>(
+        load_v<data_t, data_vec4_t>(
             cache_V_base + t * K_stride_1, lane_idx, &k_loads[ttt]);
         ps[ttt] = smem[t];
       }
@@ -315,7 +315,7 @@ __global__ void efficient_attention_forward_decoder_ck_kernel(
   __syncthreads();
 
   // NB: needs sizeof(smem) >= 4 * (sizeof(float)==4) * threadsPerBlock
-  store_v<float*, ck::float4_t>(&smem[0], thread_linear_idx, o_acc);
+  store_v<float, ck::float4_t>(&smem[0], thread_linear_idx, o_acc);
 
   __syncthreads();
   // sum up partial D rows from other wavefronts
@@ -323,7 +323,7 @@ __global__ void efficient_attention_forward_decoder_ck_kernel(
     ck::float4_t r = 0;
     for (int32_t w = 0; w < wavefronts_per_block; ++w) {
       ck::float4_t partial_r;
-      load_v<float*, ck::float4_t>(
+      load_v<float, ck::float4_t>(
           smem, w * threads_per_wavefront + lane_idx, &partial_r);
       r += partial_r;
     }
@@ -333,8 +333,8 @@ __global__ void efficient_attention_forward_decoder_ck_kernel(
     bf_r.y = ck::type_convert<data_t>(r.y);
     bf_r.z = ck::type_convert<data_t>(r.z);
     bf_r.w = ck::type_convert<data_t>(r.w);
-    auto* o_ = O + XQO_base_offset;
-    store_v<decltype(o_), data_vec4_t>(o_, lane_idx, bf_r);
+    data_t* __restrict__ o_ = O + XQO_base_offset;
+    store_v<data_t, data_vec4_t>(o_, lane_idx, bf_r);
   }
 }
 
@@ -357,6 +357,7 @@ struct FMHADecoderSeqlen1DeviceOp : public BaseOperator {
     const ptrdiff_t K_stride_0;
     const ptrdiff_t K_stride_1;
     const ptrdiff_t K_stride_2;
+    const int32_t D_H;
     const bool multiquery;
     const float qk_scale;
 
@@ -375,6 +376,7 @@ struct FMHADecoderSeqlen1DeviceOp : public BaseOperator {
         const ptrdiff_t K_stride_0,
         const ptrdiff_t K_stride_1,
         const ptrdiff_t K_stride_2,
+        const int32_t D_H,
         const bool multiquery,
         const float qk_scale,
         const dim3 grid_dim,
@@ -390,6 +392,7 @@ struct FMHADecoderSeqlen1DeviceOp : public BaseOperator {
           K_stride_0(K_stride_0),
           K_stride_1(K_stride_1),
           K_stride_2(K_stride_2),
+          D_H(D_H),
           multiquery(multiquery),
           qk_scale(qk_scale),
           grid_dim(grid_dim),
@@ -417,6 +420,7 @@ struct FMHADecoderSeqlen1DeviceOp : public BaseOperator {
           arg.K_stride_0,
           arg.K_stride_1,
           arg.K_stride_2,
+          arg.D_H,
           arg.multiquery,
           arg.qk_scale);
     }

From 3582e221b10cedbd34ca4078447979b512cdf2c0 Mon Sep 17 00:00:00 2001
From: Max Podkorytov <4273004+tenpercent@users.noreply.github.com>
Date: Wed, 15 Nov 2023 19:35:38 -0500
Subject: [PATCH 201/641] refactor type names in the kernel

---
 .../hip_fmha/ck_attention_forward_decoder.h   | 69 ++++++++++---------
 1 file changed, 37 insertions(+), 32 deletions(-)

diff --git a/xformers/csrc/attention/hip_fmha/ck_attention_forward_decoder.h b/xformers/csrc/attention/hip_fmha/ck_attention_forward_decoder.h
index 1c4e4234a..74a087bce 100644
--- a/xformers/csrc/attention/hip_fmha/ck_attention_forward_decoder.h
+++ b/xformers/csrc/attention/hip_fmha/ck_attention_forward_decoder.h
@@ -115,8 +115,6 @@ __global__ void efficient_attention_forward_decoder_ck_kernel(
 
   constexpr int32_t seq_positions_shift = 0;
 
-  extern __shared__ __align__(16) float smem[];
-
   // Each block handles a single batch and head
   const int32_t b = blockIdx.x;
   const int32_t h = blockIdx.y;
@@ -145,18 +143,25 @@ __global__ void efficient_attention_forward_decoder_ck_kernel(
 
   // Load Q into registers in all wavefronts.
   // Each thread handles 4 D dimensions
+  
+  constexpr int32_t vec_size = 4;
   using data_t = scalar_t;
-  using data_vec4_t = typename ck::vector_type<data_t, 4>::type;
-  data_vec4_t q_thread;
-  load_v<data_t, data_vec4_t>(q_, lane_idx, &q_thread);
+  using data_vec_t = typename ck::vector_type<data_t, vec_size>::type;
+  using compute_t = float;
+  using compute_vec_t = typename ck::vector_type<compute_t, vec_size>::type;
+
+  extern __shared__ __align__(16) compute_t smem[];
+
+  data_vec_t q_thread;
+  load_v<data_t, data_vec_t>(q_, lane_idx, &q_thread);
   // Each block computes different B value
-  float max_qk_acc = ck::NumericLimits<float>::Lowest();
+  compute_t max_qk_acc = ck::NumericLimits<compute_t>::Lowest();
 
   // Compute S[T_MAX] = for i in range(T): S[t] = sum(Q[d] * K[t, d])
   // Split T across wavefronts in a block, unroll loads to expose more
   // parallelism.
 
-  data_vec4_t k_loads[n_loop_unroll];
+  data_vec_t k_loads[n_loop_unroll];
 
   constexpr auto dtt = n_wavefronts_per_block * n_loop_unroll;
   const int32_t t_max_unroll = (t_max / dtt) * dtt;
@@ -166,18 +171,18 @@ __global__ void efficient_attention_forward_decoder_ck_kernel(
     for (auto ttt = 0; ttt < n_loop_unroll; ++ttt) {
       const int32_t t = tt + ttt;
       // load the K[b][t][h|0][:] row into registers
-      load_v<data_t, data_vec4_t>(
+      load_v<data_t, data_vec_t>(
           cache_K_base + t * K_stride_1, lane_idx, &k_loads[ttt]);
     }
-    float qk_accs[n_loop_unroll] = {};
+    compute_t qk_accs[n_loop_unroll] = {};
 #pragma unroll n_loop_unroll
     for (auto ttt = 0; ttt < n_loop_unroll; ++ttt) {
-      ck::inner_product<data_vec4_t, data_vec4_t, float>(
+      ck::inner_product<data_vec_t, data_vec_t, compute_t>(
           q_thread, k_loads[ttt], qk_accs[ttt]);
       qk_accs[ttt] *= qk_scale;
 
       qk_accs[ttt] =
-          wavefrontReduce(qk_accs[ttt], [](float a, float b) { return a + b; });
+          wavefrontReduce(qk_accs[ttt], [](auto a, auto b) { return a + b; });
       max_qk_acc = max(qk_accs[ttt], max_qk_acc);
     }
     if (lane_idx == 0) {
@@ -197,21 +202,21 @@ __global__ void efficient_attention_forward_decoder_ck_kernel(
       const int32_t t = tt + ttt;
       if (t < t_max) {
         // load the K[b][t][h|0][:] row into registers
-        load_v<data_t, data_vec4_t>(
+        load_v<data_t, data_vec_t>(
             cache_K_base + t * K_stride_1, lane_idx, &k_loads[ttt]);
       }
     }
 #pragma unroll n_loop_unroll_tail
     for (auto ttt = 0; ttt < n_loop_unroll_tail; ++ttt) {
-      float qk_acc = 0;
+      compute_t qk_acc = 0;
       const int32_t t = tt + ttt;
       if (t < t_max) {
-        ck::inner_product<data_vec4_t, data_vec4_t, float>(
+        ck::inner_product<data_vec_t, data_vec_t, compute_t>(
             q_thread, k_loads[ttt], qk_acc);
         qk_acc *= qk_scale;
 
         qk_acc =
-            wavefrontReduce(qk_acc, [](float a, float b) { return a + b; });
+            wavefrontReduce(qk_acc, [](auto a, auto b) { return a + b; });
         max_qk_acc = max(qk_acc, max_qk_acc);
 
         // write accumulated sums to smem.
@@ -233,15 +238,15 @@ __global__ void efficient_attention_forward_decoder_ck_kernel(
   }
   // shared across all threads in block
   max_qk_acc = wavefrontReduce(
-      max_qk_acc, [](float a, float b) { return a > b ? a : b; });
+      max_qk_acc, [](auto a, auto b) { return a > b ? a : b; });
 
   // each wavefront computes partial sum of exp.
-  float softmax_denominator = 0.0f;
+  compute_t softmax_denominator = 0.0f;
   for (int32_t t = thread_linear_idx; t < t_max; t += threads_per_block) {
     softmax_denominator += __expf(smem[t] - max_qk_acc);
   }
   softmax_denominator = wavefrontReduce(
-      softmax_denominator, [](float a, float b) { return a + b; });
+      softmax_denominator, [](auto a, auto b) { return a + b; });
 
   __syncthreads();
   if (lane_idx == 0) {
@@ -255,9 +260,9 @@ __global__ void efficient_attention_forward_decoder_ck_kernel(
     softmax_denominator = smem[T_MAX + lane_idx];
   }
   softmax_denominator = wavefrontReduce(
-      softmax_denominator, [](float a, float b) { return a + b; });
+      softmax_denominator, [](auto a, auto b) { return a + b; });
 
-  const float softmax_scale_factor = 1. / softmax_denominator;
+  const compute_t softmax_scale_factor = 1. / softmax_denominator;
   // now, compute the normalization across all threads.
   for (int32_t t = thread_linear_idx; t < t_max; t += threads_per_block) {
     smem[t] = __expf(smem[t] - max_qk_acc) * softmax_scale_factor;
@@ -270,21 +275,21 @@ __global__ void efficient_attention_forward_decoder_ck_kernel(
   // each wavefront compute sum(t_subset) P[t] * V[t_subset, d]
   // outputs are of size float[D]
 
-  float ps[n_loop_unroll];
-  ck::float4_t o_acc = 0;
+  compute_t ps[n_loop_unroll];
+  compute_vec_t o_acc = 0;
   for (auto tt = wavefront_idx * n_loop_unroll; tt < t_max_unroll; tt += dtt) {
 #pragma unroll n_loop_unroll
     for (auto ttt = 0; ttt < n_loop_unroll; ++ttt) {
       const int32_t t = tt + ttt;
       // load the V[b][t][h|0][:] row into registers, reusing K register storage
-      load_v<data_t, data_vec4_t>(
+      load_v<data_t, data_vec_t>(
           cache_V_base + t * K_stride_1, lane_idx, &k_loads[ttt]);
       ps[ttt] = smem[t];
     }
 
 #pragma unroll n_loop_unroll
     for (auto ttt = 0; ttt < n_loop_unroll; ++ttt) {
-      o_acc = scalar4_scale_acc<data_vec4_t>(o_acc, k_loads[ttt], ps[ttt]);
+      o_acc = scalar4_scale_acc<data_vec_t>(o_acc, k_loads[ttt], ps[ttt]);
     }
   }
 
@@ -296,7 +301,7 @@ __global__ void efficient_attention_forward_decoder_ck_kernel(
       if (t < t_max) {
         // load the V[b][t][h|0][:] row into registers, reusing K register
         // storage
-        load_v<data_t, data_vec4_t>(
+        load_v<data_t, data_vec_t>(
             cache_V_base + t * K_stride_1, lane_idx, &k_loads[ttt]);
         ps[ttt] = smem[t];
       }
@@ -306,7 +311,7 @@ __global__ void efficient_attention_forward_decoder_ck_kernel(
     for (auto ttt = 0; ttt < n_loop_unroll_tail; ++ttt) {
       const int32_t t = tt + ttt;
       if (t < t_max) {
-        o_acc = scalar4_scale_acc<data_vec4_t>(o_acc, k_loads[ttt], ps[ttt]);
+        o_acc = scalar4_scale_acc<data_vec_t>(o_acc, k_loads[ttt], ps[ttt]);
       }
     }
   }
@@ -315,26 +320,26 @@ __global__ void efficient_attention_forward_decoder_ck_kernel(
   __syncthreads();
 
   // NB: needs sizeof(smem) >= 4 * (sizeof(float)==4) * threadsPerBlock
-  store_v<float, ck::float4_t>(&smem[0], thread_linear_idx, o_acc);
+  store_v<compute_t, compute_vec_t>(&smem[0], thread_linear_idx, o_acc);
 
   __syncthreads();
   // sum up partial D rows from other wavefronts
   if (wavefront_idx == 0) {
-    ck::float4_t r = 0;
+    compute_vec_t r = 0;
     for (int32_t w = 0; w < wavefronts_per_block; ++w) {
-      ck::float4_t partial_r;
-      load_v<float, ck::float4_t>(
+      compute_vec_t partial_r;
+      load_v<compute_t, compute_vec_t>(
           smem, w * threads_per_wavefront + lane_idx, &partial_r);
       r += partial_r;
     }
     // write output D row
-    data_vec4_t bf_r;
+    data_vec_t bf_r;
     bf_r.x = ck::type_convert<data_t>(r.x);
     bf_r.y = ck::type_convert<data_t>(r.y);
     bf_r.z = ck::type_convert<data_t>(r.z);
     bf_r.w = ck::type_convert<data_t>(r.w);
     data_t* __restrict__ o_ = O + XQO_base_offset;
-    store_v<data_t, data_vec4_t>(o_, lane_idx, bf_r);
+    store_v<data_t, data_vec_t>(o_, lane_idx, bf_r);
   }
 }
 

From d4fca23c5beafcd918663c36f315edc8bd7bc6ec Mon Sep 17 00:00:00 2001
From: Max Podkorytov <4273004+tenpercent@users.noreply.github.com>
Date: Wed, 15 Nov 2023 19:47:39 -0500
Subject: [PATCH 202/641] refactor dtype conversion from compute to data for
 the output

---
 .../hip_fmha/ck_attention_forward_decoder.h        | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/xformers/csrc/attention/hip_fmha/ck_attention_forward_decoder.h b/xformers/csrc/attention/hip_fmha/ck_attention_forward_decoder.h
index 74a087bce..ce18900de 100644
--- a/xformers/csrc/attention/hip_fmha/ck_attention_forward_decoder.h
+++ b/xformers/csrc/attention/hip_fmha/ck_attention_forward_decoder.h
@@ -325,19 +325,19 @@ __global__ void efficient_attention_forward_decoder_ck_kernel(
   __syncthreads();
   // sum up partial D rows from other wavefronts
   if (wavefront_idx == 0) {
-    compute_vec_t r = 0;
+    union { compute_vec_t vec; compute_t[vec_size] arr; } r = 0;
     for (int32_t w = 0; w < wavefronts_per_block; ++w) {
       compute_vec_t partial_r;
       load_v<compute_t, compute_vec_t>(
           smem, w * threads_per_wavefront + lane_idx, &partial_r);
-      r += partial_r;
+      r.vec += partial_r;
+    }
+    // elementwise convert from compute_t result to data_t out to be written
+    union { data_vec_t vec; data_t[vec_size] arr; } bf_r = 0;
+    for (int32_t i = 0; i < vec_size; ++i) {
+      bf_r.arr[i] = ck::type_convert<data_t>(r.arr[i]);
     }
     // write output D row
-    data_vec_t bf_r;
-    bf_r.x = ck::type_convert<data_t>(r.x);
-    bf_r.y = ck::type_convert<data_t>(r.y);
-    bf_r.z = ck::type_convert<data_t>(r.z);
-    bf_r.w = ck::type_convert<data_t>(r.w);
     data_t* __restrict__ o_ = O + XQO_base_offset;
     store_v<data_t, data_vec_t>(o_, lane_idx, bf_r);
   }

From 22eb2641b3cd437fe2227490ab7eb31a00c63918 Mon Sep 17 00:00:00 2001
From: Max Podkorytov <4273004+tenpercent@users.noreply.github.com>
Date: Wed, 15 Nov 2023 20:27:33 -0500
Subject: [PATCH 203/641] support head dim < 256; still needs to be divisible
 by vector size

---
 .../hip_fmha/ck_attention_forward_decoder.h   | 64 +++++++++++++------
 xformers/ops/fmha/ck_decoder.py               |  7 +-
 2 files changed, 50 insertions(+), 21 deletions(-)

diff --git a/xformers/csrc/attention/hip_fmha/ck_attention_forward_decoder.h b/xformers/csrc/attention/hip_fmha/ck_attention_forward_decoder.h
index ce18900de..388e30eb4 100644
--- a/xformers/csrc/attention/hip_fmha/ck_attention_forward_decoder.h
+++ b/xformers/csrc/attention/hip_fmha/ck_attention_forward_decoder.h
@@ -131,7 +131,7 @@ __global__ void efficient_attention_forward_decoder_ck_kernel(
       threads_per_wavefront * wavefronts_per_block;
   const int32_t thread_linear_idx =
       lane_idx + wavefront_idx * threads_per_wavefront;
-
+  const bool lane_active_for_io = lane_idx * vec_size < D_H;
   // const auto* q_ = &(XQ_acc[b][0][h][0]);
   const auto XQO_base_offset = b * XQ_stride_0 + h * XQ_stride_2;
   const auto* q_ = XQ + XQO_base_offset;
@@ -153,7 +153,11 @@ __global__ void efficient_attention_forward_decoder_ck_kernel(
   extern __shared__ __align__(16) compute_t smem[];
 
   data_vec_t q_thread;
-  load_v<data_t, data_vec_t>(q_, lane_idx, &q_thread);
+  if (lane_active_for_io) {
+    load_v<data_t, data_vec_t>(q_, lane_idx, &q_thread);
+  } else {
+    q_thread = 0;
+  }
   // Each block computes different B value
   compute_t max_qk_acc = ck::NumericLimits<compute_t>::Lowest();
 
@@ -171,8 +175,12 @@ __global__ void efficient_attention_forward_decoder_ck_kernel(
     for (auto ttt = 0; ttt < n_loop_unroll; ++ttt) {
       const int32_t t = tt + ttt;
       // load the K[b][t][h|0][:] row into registers
-      load_v<data_t, data_vec_t>(
-          cache_K_base + t * K_stride_1, lane_idx, &k_loads[ttt]);
+      if (lane_active_for_io) {
+        load_v<data_t, data_vec_t>(
+            cache_K_base + t * K_stride_1, lane_idx, &k_loads[ttt]);
+      } else {
+        k_loads[ttt] = 0;
+      }
     }
     compute_t qk_accs[n_loop_unroll] = {};
 #pragma unroll n_loop_unroll
@@ -201,9 +209,13 @@ __global__ void efficient_attention_forward_decoder_ck_kernel(
     for (auto ttt = 0; ttt < n_loop_unroll_tail; ++ttt) {
       const int32_t t = tt + ttt;
       if (t < t_max) {
-        // load the K[b][t][h|0][:] row into registers
-        load_v<data_t, data_vec_t>(
-            cache_K_base + t * K_stride_1, lane_idx, &k_loads[ttt]);
+        if (lane_active_for_io) {
+          // load the K[b][t][h|0][:] row into registers
+          load_v<data_t, data_vec_t>(
+              cache_K_base + t * K_stride_1, lane_idx, &k_loads[ttt]);
+        } else {
+          k_loads[ttt] = 0;
+        }
       }
     }
 #pragma unroll n_loop_unroll_tail
@@ -281,9 +293,13 @@ __global__ void efficient_attention_forward_decoder_ck_kernel(
 #pragma unroll n_loop_unroll
     for (auto ttt = 0; ttt < n_loop_unroll; ++ttt) {
       const int32_t t = tt + ttt;
-      // load the V[b][t][h|0][:] row into registers, reusing K register storage
-      load_v<data_t, data_vec_t>(
-          cache_V_base + t * K_stride_1, lane_idx, &k_loads[ttt]);
+      if (lane_active_for_io) {
+        // load the V[b][t][h|0][:] row into registers, reusing K register storage
+        load_v<data_t, data_vec_t>(
+            cache_V_base + t * K_stride_1, lane_idx, &k_loads[ttt]);
+      } else {
+        k_loads[ttt] = 0;
+      }
       ps[ttt] = smem[t];
     }
 
@@ -301,8 +317,12 @@ __global__ void efficient_attention_forward_decoder_ck_kernel(
       if (t < t_max) {
         // load the V[b][t][h|0][:] row into registers, reusing K register
         // storage
-        load_v<data_t, data_vec_t>(
-            cache_V_base + t * K_stride_1, lane_idx, &k_loads[ttt]);
+        if (lane_active_for_io) {
+          load_v<data_t, data_vec_t>(
+              cache_V_base + t * K_stride_1, lane_idx, &k_loads[ttt]);
+        } else {
+          k_loads[ttt] = 0;
+        }
         ps[ttt] = smem[t];
       }
     }
@@ -320,26 +340,32 @@ __global__ void efficient_attention_forward_decoder_ck_kernel(
   __syncthreads();
 
   // NB: needs sizeof(smem) >= 4 * (sizeof(float)==4) * threadsPerBlock
-  store_v<compute_t, compute_vec_t>(&smem[0], thread_linear_idx, o_acc);
+  if (lane_active_for_io) {
+    store_v<compute_t, compute_vec_t>(&smem[0], thread_linear_idx, o_acc);
+  }
 
   __syncthreads();
   // sum up partial D rows from other wavefronts
   if (wavefront_idx == 0) {
-    union { compute_vec_t vec; compute_t[vec_size] arr; } r = 0;
+    union { compute_vec_t vec = 0; compute_t arr[vec_size]; } r;
     for (int32_t w = 0; w < wavefronts_per_block; ++w) {
-      compute_vec_t partial_r;
-      load_v<compute_t, compute_vec_t>(
-          smem, w * threads_per_wavefront + lane_idx, &partial_r);
+      compute_vec_t partial_r = 0;
+      if (lane_active_for_io) {
+        load_v<compute_t, compute_vec_t>(
+            smem, w * threads_per_wavefront + lane_idx, &partial_r);
+      }
       r.vec += partial_r;
     }
     // elementwise convert from compute_t result to data_t out to be written
-    union { data_vec_t vec; data_t[vec_size] arr; } bf_r = 0;
+    union { data_vec_t vec; data_t arr[vec_size]; } bf_r;
     for (int32_t i = 0; i < vec_size; ++i) {
       bf_r.arr[i] = ck::type_convert<data_t>(r.arr[i]);
     }
     // write output D row
     data_t* __restrict__ o_ = O + XQO_base_offset;
-    store_v<data_t, data_vec_t>(o_, lane_idx, bf_r);
+    if (lane_active_for_io) {
+      store_v<data_t, data_vec_t>(o_, lane_idx, bf_r.vec);
+    }
   }
 }
 
diff --git a/xformers/ops/fmha/ck_decoder.py b/xformers/ops/fmha/ck_decoder.py
index 28db52eaa..67e475636 100644
--- a/xformers/ops/fmha/ck_decoder.py
+++ b/xformers/ops/fmha/ck_decoder.py
@@ -34,8 +34,11 @@ def not_supported_reasons(cls, d: Inputs) -> List[str]:
             if d.query.shape[0] != 1:
                 reasons.append("One formal batch element expected")
 
-            if d.query.shape[-1] != cls.SUPPORTED_MAX_K:
-                reasons.append(f"Got head_dim={d.query.shape[-1]}; only head_dim=={cls.SUPPORTED_MAX_K} is supported for now.")
+            if d.query.shape[-1] > cls.SUPPORTED_MAX_K:
+                reasons.append(f"Got head_dim={d.query.shape[-1]}; only head_dim<={cls.SUPPORTED_MAX_K} is supported for now.")
+
+            if d.query.shape[-1] % 4 != 0:
+                reasons.append(f"Got head_dim={d.query.shape[-1]}; it needs to be divisible by 4")
 
             if d.key.stride(-1) != 1:
                 reasons.append("expect keys to have last dim contiguous")

From 7cebebd7722ba1196e1ad08c7db5c37ed28bcec5 Mon Sep 17 00:00:00 2001
From: Max Podkorytov <4273004+tenpercent@users.noreply.github.com>
Date: Wed, 15 Nov 2023 20:34:59 -0500
Subject: [PATCH 204/641] add more compiler annotations for unrolling and
 restrict ptrs

---
 .../attention/hip_fmha/ck_attention_forward_decoder.h    | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/xformers/csrc/attention/hip_fmha/ck_attention_forward_decoder.h b/xformers/csrc/attention/hip_fmha/ck_attention_forward_decoder.h
index 388e30eb4..2a82f4b36 100644
--- a/xformers/csrc/attention/hip_fmha/ck_attention_forward_decoder.h
+++ b/xformers/csrc/attention/hip_fmha/ck_attention_forward_decoder.h
@@ -134,12 +134,12 @@ __global__ void efficient_attention_forward_decoder_ck_kernel(
   const bool lane_active_for_io = lane_idx * vec_size < D_H;
   // const auto* q_ = &(XQ_acc[b][0][h][0]);
   const auto XQO_base_offset = b * XQ_stride_0 + h * XQ_stride_2;
-  const auto* q_ = XQ + XQO_base_offset;
+  const auto* __restrict__ q_ = XQ + XQO_base_offset;
 
   const auto cache_KV_base_offset =
       b * K_stride_0 + (multiquery ? 0 : h * K_stride_2);
-  const auto* cache_K_base = cache_K + cache_KV_base_offset;
-  const auto* cache_V_base = cache_V + cache_KV_base_offset;
+  const auto* __restrict__ cache_K_base = cache_K + cache_KV_base_offset;
+  const auto* __restrict__ cache_V_base = cache_V + cache_KV_base_offset;
 
   // Load Q into registers in all wavefronts.
   // Each thread handles 4 D dimensions
@@ -194,7 +194,7 @@ __global__ void efficient_attention_forward_decoder_ck_kernel(
       max_qk_acc = max(qk_accs[ttt], max_qk_acc);
     }
     if (lane_idx == 0) {
-      auto* smem_base = smem + tt;
+      auto* __restrict__ smem_base = smem + tt;
 #pragma unroll n_loop_unroll
       for (auto ttt = 0; ttt < n_loop_unroll; ++ttt) {
         smem_base[ttt] = qk_accs[ttt];
@@ -358,6 +358,7 @@ __global__ void efficient_attention_forward_decoder_ck_kernel(
     }
     // elementwise convert from compute_t result to data_t out to be written
     union { data_vec_t vec; data_t arr[vec_size]; } bf_r;
+    #pragma unroll 
     for (int32_t i = 0; i < vec_size; ++i) {
       bf_r.arr[i] = ck::type_convert<data_t>(r.arr[i]);
     }

From 2b16228ed61cb338d5b328ec4c9512231f401f68 Mon Sep 17 00:00:00 2001
From: Max Podkorytov <4273004+tenpercent@users.noreply.github.com>
Date: Wed, 15 Nov 2023 20:48:21 -0500
Subject: [PATCH 205/641] simplify io logic

---
 .../hip_fmha/ck_attention_forward_decoder.h   | 38 ++++++-------------
 1 file changed, 12 insertions(+), 26 deletions(-)

diff --git a/xformers/csrc/attention/hip_fmha/ck_attention_forward_decoder.h b/xformers/csrc/attention/hip_fmha/ck_attention_forward_decoder.h
index 2a82f4b36..e7efc856b 100644
--- a/xformers/csrc/attention/hip_fmha/ck_attention_forward_decoder.h
+++ b/xformers/csrc/attention/hip_fmha/ck_attention_forward_decoder.h
@@ -152,12 +152,10 @@ __global__ void efficient_attention_forward_decoder_ck_kernel(
 
   extern __shared__ __align__(16) compute_t smem[];
 
-  data_vec_t q_thread;
+  data_vec_t q_thread = 0;
   if (lane_active_for_io) {
     load_v<data_t, data_vec_t>(q_, lane_idx, &q_thread);
-  } else {
-    q_thread = 0;
-  }
+  } 
   // Each block computes different B value
   compute_t max_qk_acc = ck::NumericLimits<compute_t>::Lowest();
 
@@ -165,7 +163,7 @@ __global__ void efficient_attention_forward_decoder_ck_kernel(
   // Split T across wavefronts in a block, unroll loads to expose more
   // parallelism.
 
-  data_vec_t k_loads[n_loop_unroll];
+  data_vec_t k_loads[n_loop_unroll] = {};
 
   constexpr auto dtt = n_wavefronts_per_block * n_loop_unroll;
   const int32_t t_max_unroll = (t_max / dtt) * dtt;
@@ -178,9 +176,7 @@ __global__ void efficient_attention_forward_decoder_ck_kernel(
       if (lane_active_for_io) {
         load_v<data_t, data_vec_t>(
             cache_K_base + t * K_stride_1, lane_idx, &k_loads[ttt]);
-      } else {
-        k_loads[ttt] = 0;
-      }
+      } 
     }
     compute_t qk_accs[n_loop_unroll] = {};
 #pragma unroll n_loop_unroll
@@ -213,9 +209,7 @@ __global__ void efficient_attention_forward_decoder_ck_kernel(
           // load the K[b][t][h|0][:] row into registers
           load_v<data_t, data_vec_t>(
               cache_K_base + t * K_stride_1, lane_idx, &k_loads[ttt]);
-        } else {
-          k_loads[ttt] = 0;
-        }
+        } 
       }
     }
 #pragma unroll n_loop_unroll_tail
@@ -297,9 +291,7 @@ __global__ void efficient_attention_forward_decoder_ck_kernel(
         // load the V[b][t][h|0][:] row into registers, reusing K register storage
         load_v<data_t, data_vec_t>(
             cache_V_base + t * K_stride_1, lane_idx, &k_loads[ttt]);
-      } else {
-        k_loads[ttt] = 0;
-      }
+      } 
       ps[ttt] = smem[t];
     }
 
@@ -320,9 +312,7 @@ __global__ void efficient_attention_forward_decoder_ck_kernel(
         if (lane_active_for_io) {
           load_v<data_t, data_vec_t>(
               cache_V_base + t * K_stride_1, lane_idx, &k_loads[ttt]);
-        } else {
-          k_loads[ttt] = 0;
-        }
+        } 
         ps[ttt] = smem[t];
       }
     }
@@ -346,14 +336,12 @@ __global__ void efficient_attention_forward_decoder_ck_kernel(
 
   __syncthreads();
   // sum up partial D rows from other wavefronts
-  if (wavefront_idx == 0) {
+  if (wavefront_idx == 0 && lane_active_for_io) {
     union { compute_vec_t vec = 0; compute_t arr[vec_size]; } r;
     for (int32_t w = 0; w < wavefronts_per_block; ++w) {
-      compute_vec_t partial_r = 0;
-      if (lane_active_for_io) {
-        load_v<compute_t, compute_vec_t>(
-            smem, w * threads_per_wavefront + lane_idx, &partial_r);
-      }
+      compute_vec_t partial_r;
+      load_v<compute_t, compute_vec_t>(
+          smem, w * threads_per_wavefront + lane_idx, &partial_r);
       r.vec += partial_r;
     }
     // elementwise convert from compute_t result to data_t out to be written
@@ -364,9 +352,7 @@ __global__ void efficient_attention_forward_decoder_ck_kernel(
     }
     // write output D row
     data_t* __restrict__ o_ = O + XQO_base_offset;
-    if (lane_active_for_io) {
-      store_v<data_t, data_vec_t>(o_, lane_idx, bf_r.vec);
-    }
+    store_v<data_t, data_vec_t>(o_, lane_idx, bf_r.vec);
   }
 }
 

From 7f2b6d19c76d9b3b91e1c2bdcf7ce66531d2c3fb Mon Sep 17 00:00:00 2001
From: Max Podkorytov <4273004+tenpercent@users.noreply.github.com>
Date: Wed, 15 Nov 2023 21:07:36 -0500
Subject: [PATCH 206/641] handle m > 1

---
 .../attention/hip_fmha/attention_forward_decoder.cpp  |  9 ++++++++-
 .../attention/hip_fmha/ck_attention_forward_decoder.h | 11 +++++++++--
 2 files changed, 17 insertions(+), 3 deletions(-)

diff --git a/xformers/csrc/attention/hip_fmha/attention_forward_decoder.cpp b/xformers/csrc/attention/hip_fmha/attention_forward_decoder.cpp
index 52f830e58..4a71a7252 100644
--- a/xformers/csrc/attention/hip_fmha/attention_forward_decoder.cpp
+++ b/xformers/csrc/attention/hip_fmha/attention_forward_decoder.cpp
@@ -75,8 +75,14 @@ at::Tensor& efficient_attention_forward_decoder_ck_out_impl(
   TORCH_CHECK(cache_K.size(3) <= D_H);
 
   auto B = XQ.size(0);
+  auto M = XQ.size(1);
   auto H = XQ.size(2);
-  dim3 blocks(B, H);
+
+  TORCH_CHECK(B <= 1024);
+  TORCH_CHECK(M <= 1024);
+  TORCH_CHECK(H <= 1024);
+
+  dim3 blocks(B, H, M);
   dim3 threads(ThreadsPerWavefront, WavefrontsPerBlock);
 
   int32_t smem_softmax = T_MAX * sizeof(float) + threads.y * sizeof(float);
@@ -114,6 +120,7 @@ at::Tensor& efficient_attention_forward_decoder_ck_out_impl(
             reinterpret_cast<ck_data_t* __restrict__>(O_acc.data()),
             seq_acc.data(),
             XQ_acc.stride(0),
+            XQ_acc.stride(1),
             XQ_acc.stride(2),
             K_acc.stride(0),
             K_acc.stride(1),
diff --git a/xformers/csrc/attention/hip_fmha/ck_attention_forward_decoder.h b/xformers/csrc/attention/hip_fmha/ck_attention_forward_decoder.h
index e7efc856b..5cd83c71f 100644
--- a/xformers/csrc/attention/hip_fmha/ck_attention_forward_decoder.h
+++ b/xformers/csrc/attention/hip_fmha/ck_attention_forward_decoder.h
@@ -104,6 +104,7 @@ __global__ void efficient_attention_forward_decoder_ck_kernel(
     scalar_t* __restrict__ O,
     const int32_t* __restrict__ seq_positions,
     const ptrdiff_t XQ_stride_0,
+    const ptrdiff_t XQ_stride_1,
     const ptrdiff_t XQ_stride_2,
     const ptrdiff_t K_stride_0,
     const ptrdiff_t K_stride_1,
@@ -118,6 +119,7 @@ __global__ void efficient_attention_forward_decoder_ck_kernel(
   // Each block handles a single batch and head
   const int32_t b = blockIdx.x;
   const int32_t h = blockIdx.y;
+  const int32_t m = blockIdx.z;
 
   // Note: this is decoding case where we attend to current and all previous
   // tokens.
@@ -131,9 +133,8 @@ __global__ void efficient_attention_forward_decoder_ck_kernel(
       threads_per_wavefront * wavefronts_per_block;
   const int32_t thread_linear_idx =
       lane_idx + wavefront_idx * threads_per_wavefront;
-  const bool lane_active_for_io = lane_idx * vec_size < D_H;
   // const auto* q_ = &(XQ_acc[b][0][h][0]);
-  const auto XQO_base_offset = b * XQ_stride_0 + h * XQ_stride_2;
+  const auto XQO_base_offset = b * XQ_stride_0 + m * XQ_stride_1 + h * XQ_stride_2;
   const auto* __restrict__ q_ = XQ + XQO_base_offset;
 
   const auto cache_KV_base_offset =
@@ -150,6 +151,8 @@ __global__ void efficient_attention_forward_decoder_ck_kernel(
   using compute_t = float;
   using compute_vec_t = typename ck::vector_type<compute_t, vec_size>::type;
 
+  const bool lane_active_for_io = lane_idx * vec_size < D_H;
+
   extern __shared__ __align__(16) compute_t smem[];
 
   data_vec_t q_thread = 0;
@@ -371,6 +374,7 @@ struct FMHADecoderSeqlen1DeviceOp : public BaseOperator {
     scalar_t* __restrict__ O;
     const int32_t* __restrict__ seq_positions;
     const ptrdiff_t XQ_stride_0;
+    const ptrdiff_t XQ_stride_1;
     const ptrdiff_t XQ_stride_2;
     const ptrdiff_t K_stride_0;
     const ptrdiff_t K_stride_1;
@@ -390,6 +394,7 @@ struct FMHADecoderSeqlen1DeviceOp : public BaseOperator {
         scalar_t* __restrict__ O,
         const int32_t* __restrict__ seq_positions,
         const ptrdiff_t XQ_stride_0,
+        const ptrdiff_t XQ_stride_1,
         const ptrdiff_t XQ_stride_2,
         const ptrdiff_t K_stride_0,
         const ptrdiff_t K_stride_1,
@@ -406,6 +411,7 @@ struct FMHADecoderSeqlen1DeviceOp : public BaseOperator {
           O(O),
           seq_positions(seq_positions),
           XQ_stride_0(XQ_stride_0),
+          XQ_stride_1(XQ_stride_1),
           XQ_stride_2(XQ_stride_2),
           K_stride_0(K_stride_0),
           K_stride_1(K_stride_1),
@@ -434,6 +440,7 @@ struct FMHADecoderSeqlen1DeviceOp : public BaseOperator {
           arg.O,
           arg.seq_positions,
           arg.XQ_stride_0,
+          arg.XQ_stride_1,
           arg.XQ_stride_2,
           arg.K_stride_0,
           arg.K_stride_1,

From 1712637474e0afb6a8b5319494945fb16beb869d Mon Sep 17 00:00:00 2001
From: Max Podkorytov <4273004+tenpercent@users.noreply.github.com>
Date: Wed, 15 Nov 2023 21:41:16 -0500
Subject: [PATCH 207/641] refactor input normalization to prepare for mq>1

---
 xformers/ops/fmha/ck_decoder.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/xformers/ops/fmha/ck_decoder.py b/xformers/ops/fmha/ck_decoder.py
index 67e475636..6e2c5a3d9 100644
--- a/xformers/ops/fmha/ck_decoder.py
+++ b/xformers/ops/fmha/ck_decoder.py
@@ -32,7 +32,7 @@ def not_supported_reasons(cls, d: Inputs) -> List[str]:
                 reasons.append("Inputs must be BMHK. BMK not supported")
 
             if d.query.shape[0] != 1:
-                reasons.append("One formal batch element expected")
+                reasons.append(f"One formal batch element expected; got {d.query.shape[0]}")
 
             if d.query.shape[-1] > cls.SUPPORTED_MAX_K:
                 reasons.append(f"Got head_dim={d.query.shape[-1]}; only head_dim<={cls.SUPPORTED_MAX_K} is supported for now.")
@@ -80,7 +80,7 @@ def apply(
 
         seq_positions = attn_bias.k_seqinfo.seqlen
 
-        query = inp.query[0, :, None]
+        query = inp.query.transpose(0, 1)
 
         if inp.scale is not None:
             qk_scale = inp.scale

From 3d5b5e88ca358aa846720e64b05149ef470cd256 Mon Sep 17 00:00:00 2001
From: Max Podkorytov <4273004+tenpercent@users.noreply.github.com>
Date: Wed, 15 Nov 2023 22:00:38 -0500
Subject: [PATCH 208/641] fix comments about input and output being written

---
 .../csrc/attention/hip_fmha/ck_attention_forward_decoder.h    | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/xformers/csrc/attention/hip_fmha/ck_attention_forward_decoder.h b/xformers/csrc/attention/hip_fmha/ck_attention_forward_decoder.h
index 5cd83c71f..e50984c66 100644
--- a/xformers/csrc/attention/hip_fmha/ck_attention_forward_decoder.h
+++ b/xformers/csrc/attention/hip_fmha/ck_attention_forward_decoder.h
@@ -133,7 +133,7 @@ __global__ void efficient_attention_forward_decoder_ck_kernel(
       threads_per_wavefront * wavefronts_per_block;
   const int32_t thread_linear_idx =
       lane_idx + wavefront_idx * threads_per_wavefront;
-  // const auto* q_ = &(XQ_acc[b][0][h][0]);
+  // const auto* q_ = &(XQ_acc[b][m][h][0]);
   const auto XQO_base_offset = b * XQ_stride_0 + m * XQ_stride_1 + h * XQ_stride_2;
   const auto* __restrict__ q_ = XQ + XQO_base_offset;
 
@@ -353,7 +353,7 @@ __global__ void efficient_attention_forward_decoder_ck_kernel(
     for (int32_t i = 0; i < vec_size; ++i) {
       bf_r.arr[i] = ck::type_convert<data_t>(r.arr[i]);
     }
-    // write output D row
+    // write output row O[b][m][h][:]
     data_t* __restrict__ o_ = O + XQO_base_offset;
     store_v<data_t, data_vec_t>(o_, lane_idx, bf_r.vec);
   }

From f333a72cfce31d45c18df17f6e1f30567ff0cc2a Mon Sep 17 00:00:00 2001
From: Max Podkorytov <4273004+tenpercent@users.noreply.github.com>
Date: Thu, 16 Nov 2023 00:01:02 -0500
Subject: [PATCH 209/641] support mq>1; tested locally; small (<5) percentage
 of outputs are out of margin of error for some tests

---
 tests/test_mem_eff_attention_ck.py | 8 ++++----
 xformers/ops/fmha/ck_decoder.py    | 9 +++++----
 2 files changed, 9 insertions(+), 8 deletions(-)

diff --git a/tests/test_mem_eff_attention_ck.py b/tests/test_mem_eff_attention_ck.py
index f073bb76f..9d6ec70fb 100644
--- a/tests/test_mem_eff_attention_ck.py
+++ b/tests/test_mem_eff_attention_ck.py
@@ -1631,12 +1631,12 @@ def test_decoder(
     dtype_ = {"f16": torch.float16, "bf16": torch.bfloat16, "f32": torch.float}[dtype]
     torch.manual_seed(1)
     d = 256
+    num_queries = 1
     k_shape = (1, bsz * padding, n_heads, d)
-    # TODO: support 2 kv heads etc.
     k = torch.randn(k_shape, dtype=dtype_).cuda()
-    k_seqlen = torch.randint(1, padding + 1, (bsz,)).tolist()
+    k_seqlen = torch.randint(num_queries, padding + 1, (bsz,)).tolist()
     v = torch.randn(k_shape, dtype=dtype_).cuda()
-    q = torch.randn((1, bsz, n_heads, d), dtype=dtype_).cuda()
+    q = torch.randn((1, bsz * num_queries, n_heads, d), dtype=dtype_).cuda()
     causal_diagonal = torch.tensor(  # TODO: make unnecessary
         [i - 1 for i in k_seqlen], dtype=torch.int32
     ).cuda()
@@ -1646,7 +1646,7 @@ def test_decoder(
         v = v[:, :, :1].expand(k_shape)
 
     attn_bias = fmha.attn_bias.BlockDiagonalCausalWithOffsetPaddedKeysMask.from_seqlens(
-        q_seqlen=[1] * bsz,
+        q_seqlen=[num_queries] * bsz,
         kv_seqlen=k_seqlen,
         causal_diagonal=causal_diagonal,
         kv_padding=padding,
diff --git a/xformers/ops/fmha/ck_decoder.py b/xformers/ops/fmha/ck_decoder.py
index 6e2c5a3d9..a94f26e68 100644
--- a/xformers/ops/fmha/ck_decoder.py
+++ b/xformers/ops/fmha/ck_decoder.py
@@ -47,9 +47,10 @@ def not_supported_reasons(cls, d: Inputs) -> List[str]:
                 reasons.append("expect values to have last dim contiguous")
 
             q_starts = attn_bias.q_seqinfo.seqstart_py
-            if attn_bias.q_seqinfo.max_seqlen != 1:
-                reasons.append("decoding expects one query")
-            elif d.query.shape[1] != len(q_starts) - 1:
+            padding = attn_bias.k_seqinfo.padding
+            bsz = d.key.shape[1] // padding
+            num_queries = d.query.shape[1] // bsz
+            if bsz != len(q_starts) - 1:
                 reasons.append("empty lanes not supported yet")
 
             if attn_bias.k_seqinfo.padding > 8192:
@@ -80,7 +81,7 @@ def apply(
 
         seq_positions = attn_bias.k_seqinfo.seqlen
 
-        query = inp.query.transpose(0, 1)
+        query = inp.query[0].unflatten(0, (key.shape[0], -1))
 
         if inp.scale is not None:
             qk_scale = inp.scale

From 9039aa9db92acdd47c59a3a9cf31d46b7ab538bd Mon Sep 17 00:00:00 2001
From: Max Podkorytov <4273004+tenpercent@users.noreply.github.com>
Date: Thu, 16 Nov 2023 00:06:21 -0500
Subject: [PATCH 210/641] fix in the comment about which blocks handle which
 part of the input

---
 xformers/csrc/attention/hip_fmha/ck_attention_forward_decoder.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/xformers/csrc/attention/hip_fmha/ck_attention_forward_decoder.h b/xformers/csrc/attention/hip_fmha/ck_attention_forward_decoder.h
index e50984c66..d3338e277 100644
--- a/xformers/csrc/attention/hip_fmha/ck_attention_forward_decoder.h
+++ b/xformers/csrc/attention/hip_fmha/ck_attention_forward_decoder.h
@@ -116,7 +116,7 @@ __global__ void efficient_attention_forward_decoder_ck_kernel(
 
   constexpr int32_t seq_positions_shift = 0;
 
-  // Each block handles a single batch and head
+  // Each block handles a single batch and head and query
   const int32_t b = blockIdx.x;
   const int32_t h = blockIdx.y;
   const int32_t m = blockIdx.z;

From 02a7df2be1fa803183a4b11688b0c231d70edaab Mon Sep 17 00:00:00 2001
From: Max Podkorytov <4273004+tenpercent@users.noreply.github.com>
Date: Thu, 16 Nov 2023 12:05:25 -0500
Subject: [PATCH 211/641] {exp,max}->ck::matth::{exp,max}

---
 .../hip_fmha/ck_attention_forward_decoder.h         | 13 ++++++-------
 1 file changed, 6 insertions(+), 7 deletions(-)

diff --git a/xformers/csrc/attention/hip_fmha/ck_attention_forward_decoder.h b/xformers/csrc/attention/hip_fmha/ck_attention_forward_decoder.h
index d3338e277..925261b9e 100644
--- a/xformers/csrc/attention/hip_fmha/ck_attention_forward_decoder.h
+++ b/xformers/csrc/attention/hip_fmha/ck_attention_forward_decoder.h
@@ -5,6 +5,7 @@
 #include <ck/tensor_operation/gpu/device/device_base.hpp>
 #include <ck/utility/data_type.hpp>
 #include <ck/utility/inner_product.hpp>
+#include <ck/utility/math.hpp>
 
 namespace ck {
 template <>
@@ -190,7 +191,7 @@ __global__ void efficient_attention_forward_decoder_ck_kernel(
 
       qk_accs[ttt] =
           wavefrontReduce(qk_accs[ttt], [](auto a, auto b) { return a + b; });
-      max_qk_acc = max(qk_accs[ttt], max_qk_acc);
+      max_qk_acc = ck::math::max(qk_accs[ttt], max_qk_acc);
     }
     if (lane_idx == 0) {
       auto* __restrict__ smem_base = smem + tt;
@@ -226,7 +227,7 @@ __global__ void efficient_attention_forward_decoder_ck_kernel(
 
         qk_acc =
             wavefrontReduce(qk_acc, [](auto a, auto b) { return a + b; });
-        max_qk_acc = max(qk_acc, max_qk_acc);
+        max_qk_acc = ck::math::max(qk_acc, max_qk_acc);
 
         // write accumulated sums to smem.
         if (lane_idx == 0) {
@@ -243,7 +244,7 @@ __global__ void efficient_attention_forward_decoder_ck_kernel(
   }
   __syncthreads();
   if (lane_idx < wavefronts_per_block) {
-    max_qk_acc = max(max_qk_acc, smem[T_MAX + lane_idx]);
+    max_qk_acc = ck::math::max(max_qk_acc, smem[T_MAX + lane_idx]);
   }
   // shared across all threads in block
   max_qk_acc = wavefrontReduce(
@@ -252,7 +253,7 @@ __global__ void efficient_attention_forward_decoder_ck_kernel(
   // each wavefront computes partial sum of exp.
   compute_t softmax_denominator = 0.0f;
   for (int32_t t = thread_linear_idx; t < t_max; t += threads_per_block) {
-    softmax_denominator += __expf(smem[t] - max_qk_acc);
+    softmax_denominator += ck::math::exp(smem[t] - max_qk_acc);
   }
   softmax_denominator = wavefrontReduce(
       softmax_denominator, [](auto a, auto b) { return a + b; });
@@ -274,12 +275,10 @@ __global__ void efficient_attention_forward_decoder_ck_kernel(
   const compute_t softmax_scale_factor = 1. / softmax_denominator;
   // now, compute the normalization across all threads.
   for (int32_t t = thread_linear_idx; t < t_max; t += threads_per_block) {
-    smem[t] = __expf(smem[t] - max_qk_acc) * softmax_scale_factor;
+    smem[t] = ck::math::exp(smem[t] - max_qk_acc) * softmax_scale_factor;
   }
   __syncthreads();
 
-  // Now, we can compute the softmax and write the outputs.
-
   // Split T across wavefronts in a block
   // each wavefront compute sum(t_subset) P[t] * V[t_subset, d]
   // outputs are of size float[D]

From f260f15671fc6ff9c802965513b62f21afb144ea Mon Sep 17 00:00:00 2001
From: Max Podkorytov <4273004+tenpercent@users.noreply.github.com>
Date: Thu, 16 Nov 2023 12:10:51 -0500
Subject: [PATCH 212/641] seq_{positions}->{kv_lens}

---
 .../csrc/attention/hip_fmha/CMakeLists.txt    | 68 +++++++++++++++++++
 .../hip_fmha/attention_forward_decoder.cpp    | 14 ++--
 .../hip_fmha/ck_attention_forward_decoder.h   | 14 ++--
 3 files changed, 81 insertions(+), 15 deletions(-)
 create mode 100644 xformers/csrc/attention/hip_fmha/CMakeLists.txt

diff --git a/xformers/csrc/attention/hip_fmha/CMakeLists.txt b/xformers/csrc/attention/hip_fmha/CMakeLists.txt
new file mode 100644
index 000000000..8e8c24e0b
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/CMakeLists.txt
@@ -0,0 +1,68 @@
+cmake_minimum_required(VERSION 3.26)
+
+project(FMHA-Decoder-Main)
+
+enable_language(CXX)
+set(CMAKE_CXX_STANDARD 17)
+set(CMAKE_CXX_STANDARD_REQUIRED ON)
+set(CMAKE_CXX_EXTENSIONS OFF)
+
+set(project_root_dir /xformers)
+set(xformers_csrc ${project_root_dir}/xformers/csrc)
+set(sources ${xformers_csrc}/attention/hip_fmha/attention_forward_decoder.hip)
+
+set(ck_include ${project_root_dir}/third_party/composable_kernel/include/ck)
+set(torch_include /opt/conda/envs/py_3.8/lib/python3.8/site-packages/torch/include)
+
+set(CMAKE_CXX_COMPILER /opt/rocm/hip/bin/hipcc)
+set(CMAKE_CXX_LINK_EXECUTABLE /opt/rocm/hip/bin/hipcc)
+
+add_executable(attention_forward_decoder_main ${sources})
+message("CMAKE_CXX_COMPILER_ID: ${CMAKE_CXX_COMPILER_ID}")
+
+find_package(HIP REQUIRED)
+
+message("HIP_VERSION: ${HIP_VERSION_MAJOR}.${HIP_VERSION_MINOR}")
+
+set_target_properties(attention_forward_decoder_main PROPERTIES LINKER_LANGUAGE CXX)
+
+target_compile_options(attention_forward_decoder_main PUBLIC 
+  -fPIC 
+  -O3 
+  --offload-arch=gfx90a
+  -fno-gpu-rdc)
+
+target_include_directories(attention_forward_decoder_main PUBLIC 
+  ${xformers_csrc} 
+  ${xformers_csrc}/attention/hip_fmha
+  ${ck_include}/tensor_operation/gpu/device
+  ${ck_include}/tensor_operation/gpu/device/impl
+  ${ck_include}/tensor_operation/gpu/element
+  ${torch_include}
+  ${torch_include}/torch/csrc/api/include
+  ${torch_include}/TH
+  ${torch_include}/THC
+  ${torch_include}/THH
+)
+
+target_link_directories(attention_forward_decoder_main PUBLIC
+  /opt/conda/envs/py_3.8/lib/python3.8/site-packages/torch/lib
+  /opt/conda/envs/py_3.8/lib
+  /opt/rocm/lib
+  /opt/rocm/hip/lib
+)
+
+target_link_libraries(attention_forward_decoder_main PUBLIC
+  c10
+  c10_hip
+  torch
+  torch_python
+  torch_hip
+  torch_cpu
+  python3.8
+  amdhip64
+)
+
+target_compile_definitions(attention_forward_decoder_main PUBLIC 
+  ATTN_FWD_DECODER_MAIN
+)
diff --git a/xformers/csrc/attention/hip_fmha/attention_forward_decoder.cpp b/xformers/csrc/attention/hip_fmha/attention_forward_decoder.cpp
index 4a71a7252..6076b5022 100644
--- a/xformers/csrc/attention/hip_fmha/attention_forward_decoder.cpp
+++ b/xformers/csrc/attention/hip_fmha/attention_forward_decoder.cpp
@@ -58,7 +58,7 @@ at::Tensor& efficient_attention_forward_decoder_ck_out_impl(
     const at::Tensor& XQ, // [B, 1, H, D]
     const at::Tensor& cache_K, // [B, T_MAX, H or 1, D]
     const at::Tensor& cache_V, // [B, T_MAX, H or 1, D]
-    const at::Tensor& seq_positions, // [B]
+    const at::Tensor& seq_kv_lens, // [B]
     double qk_scale,
     at::Tensor& O) {
   static_assert(4 * ThreadsPerWavefront == D_H, "");
@@ -69,7 +69,7 @@ at::Tensor& efficient_attention_forward_decoder_ck_out_impl(
   TORCH_CHECK(cache_K.is_cuda());
   TORCH_CHECK(cache_V.is_cuda());
 
-  TORCH_CHECK(seq_positions.is_cuda());
+  TORCH_CHECK(seq_kv_lens.is_cuda());
 
   TORCH_CHECK(cache_K.size(1) <= T_MAX);
   TORCH_CHECK(cache_K.size(3) <= D_H);
@@ -111,7 +111,7 @@ at::Tensor& efficient_attention_forward_decoder_ck_out_impl(
             cache_V.packed_accessor64<scalar_t, 4, at::RestrictPtrTraits>();
         auto O_acc = O.packed_accessor32<scalar_t, 4, at::RestrictPtrTraits>();
         auto seq_acc =
-            seq_positions
+            seq_kv_lens
                 .packed_accessor32<int32_t, 1, at::RestrictPtrTraits>();
         auto arg = device_op_t::Argument(
             reinterpret_cast<const ck_data_t* __restrict__>(XQ_acc.data()),
@@ -147,12 +147,12 @@ at::Tensor efficient_attention_forward_decoder_ck_impl(
     const at::Tensor& XQ, // [B, 1, H, D]
     const at::Tensor& cache_K, // [B, T_MAX, H or 1, D]
     const at::Tensor& cache_V, // [B, T_MAX, H or 1, D]
-    const at::Tensor& seq_positions, // [B]
+    const at::Tensor& seq_kv_lens, // [B]
     double qk_scale) {
   auto O = at::empty_like(XQ);
   efficient_attention_forward_decoder_ck_out_impl<
       ThreadsPerWavefront,
-      WavefrontsPerBlock>(XQ, cache_K, cache_V, seq_positions, qk_scale, O);
+      WavefrontsPerBlock>(XQ, cache_K, cache_V, seq_kv_lens, qk_scale, O);
   return O;
 }
 
@@ -160,11 +160,11 @@ at::Tensor efficient_attention_forward_decoder_ck(
     const at::Tensor& XQ, // [B, 1, H, D]
     const at::Tensor& cache_K, // [B, T_MAX, H or 1, D]
     const at::Tensor& cache_V, // [B, T_MAX, H or 1, D]
-    const at::Tensor& seq_positions, // [B]
+    const at::Tensor& seq_kv_lens, // [B]
     double qk_scale) {
   return efficient_attention_forward_decoder_ck_impl<
       kThreadsPerWavefront,
-      kWavefrontsPerBlock>(XQ, cache_K, cache_V, seq_positions, qk_scale);
+      kWavefrontsPerBlock>(XQ, cache_K, cache_V, seq_kv_lens, qk_scale);
 }
 } // namespace
 
diff --git a/xformers/csrc/attention/hip_fmha/ck_attention_forward_decoder.h b/xformers/csrc/attention/hip_fmha/ck_attention_forward_decoder.h
index 925261b9e..5434b2101 100644
--- a/xformers/csrc/attention/hip_fmha/ck_attention_forward_decoder.h
+++ b/xformers/csrc/attention/hip_fmha/ck_attention_forward_decoder.h
@@ -103,7 +103,7 @@ __global__ void efficient_attention_forward_decoder_ck_kernel(
     const scalar_t* __restrict__ cache_K,
     const scalar_t* __restrict__ cache_V,
     scalar_t* __restrict__ O,
-    const int32_t* __restrict__ seq_positions,
+    const int32_t* __restrict__ seq_kv_lens,
     const ptrdiff_t XQ_stride_0,
     const ptrdiff_t XQ_stride_1,
     const ptrdiff_t XQ_stride_2,
@@ -115,8 +115,6 @@ __global__ void efficient_attention_forward_decoder_ck_kernel(
     const float qk_scale) {
   static_assert(n_loop_unroll_tail < n_loop_unroll, "");
 
-  constexpr int32_t seq_positions_shift = 0;
-
   // Each block handles a single batch and head and query
   const int32_t b = blockIdx.x;
   const int32_t h = blockIdx.y;
@@ -124,7 +122,7 @@ __global__ void efficient_attention_forward_decoder_ck_kernel(
 
   // Note: this is decoding case where we attend to current and all previous
   // tokens.
-  const int32_t t_max = seq_positions[b] + seq_positions_shift;
+  const int32_t t_max = seq_kv_lens[b];
 
   const int32_t lane_idx = threadIdx.x;
   const int32_t wavefront_idx = threadIdx.y;
@@ -371,7 +369,7 @@ struct FMHADecoderSeqlen1DeviceOp : public BaseOperator {
     const scalar_t* __restrict__ cache_K;
     const scalar_t* __restrict__ cache_V;
     scalar_t* __restrict__ O;
-    const int32_t* __restrict__ seq_positions;
+    const int32_t* __restrict__ seq_kv_lens;
     const ptrdiff_t XQ_stride_0;
     const ptrdiff_t XQ_stride_1;
     const ptrdiff_t XQ_stride_2;
@@ -391,7 +389,7 @@ struct FMHADecoderSeqlen1DeviceOp : public BaseOperator {
         const scalar_t* __restrict__ cache_K,
         const scalar_t* __restrict__ cache_V,
         scalar_t* __restrict__ O,
-        const int32_t* __restrict__ seq_positions,
+        const int32_t* __restrict__ seq_kv_lens,
         const ptrdiff_t XQ_stride_0,
         const ptrdiff_t XQ_stride_1,
         const ptrdiff_t XQ_stride_2,
@@ -408,7 +406,7 @@ struct FMHADecoderSeqlen1DeviceOp : public BaseOperator {
           cache_K(cache_K),
           cache_V(cache_V),
           O(O),
-          seq_positions(seq_positions),
+          seq_kv_lens(seq_kv_lens),
           XQ_stride_0(XQ_stride_0),
           XQ_stride_1(XQ_stride_1),
           XQ_stride_2(XQ_stride_2),
@@ -437,7 +435,7 @@ struct FMHADecoderSeqlen1DeviceOp : public BaseOperator {
           arg.cache_K,
           arg.cache_V,
           arg.O,
-          arg.seq_positions,
+          arg.seq_kv_lens,
           arg.XQ_stride_0,
           arg.XQ_stride_1,
           arg.XQ_stride_2,

From 49853b93eabc42a6c0e256d37d50c790f78500b6 Mon Sep 17 00:00:00 2001
From: Max Podkorytov <4273004+tenpercent@users.noreply.github.com>
Date: Thu, 16 Nov 2023 12:28:48 -0500
Subject: [PATCH 213/641] remove extra syncthreads; reads and writes are from
 different smem regions

---
 xformers/csrc/attention/hip_fmha/ck_attention_forward_decoder.h | 1 -
 1 file changed, 1 deletion(-)

diff --git a/xformers/csrc/attention/hip_fmha/ck_attention_forward_decoder.h b/xformers/csrc/attention/hip_fmha/ck_attention_forward_decoder.h
index 5434b2101..f0edac288 100644
--- a/xformers/csrc/attention/hip_fmha/ck_attention_forward_decoder.h
+++ b/xformers/csrc/attention/hip_fmha/ck_attention_forward_decoder.h
@@ -256,7 +256,6 @@ __global__ void efficient_attention_forward_decoder_ck_kernel(
   softmax_denominator = wavefrontReduce(
       softmax_denominator, [](auto a, auto b) { return a + b; });
 
-  __syncthreads();
   if (lane_idx == 0) {
     smem[T_MAX + wavefront_idx] = softmax_denominator;
   }

From 746b970ea3a20e899182ceaeaa75f9c932869ed3 Mon Sep 17 00:00:00 2001
From: Max Podkorytov <4273004+tenpercent@users.noreply.github.com>
Date: Thu, 16 Nov 2023 13:54:11 -0500
Subject: [PATCH 214/641] make vec_size the kernel template parameter

---
 .../hip_fmha/ck_attention_forward_decoder.h   | 52 +++++++------------
 1 file changed, 19 insertions(+), 33 deletions(-)

diff --git a/xformers/csrc/attention/hip_fmha/ck_attention_forward_decoder.h b/xformers/csrc/attention/hip_fmha/ck_attention_forward_decoder.h
index f0edac288..845dbeaac 100644
--- a/xformers/csrc/attention/hip_fmha/ck_attention_forward_decoder.h
+++ b/xformers/csrc/attention/hip_fmha/ck_attention_forward_decoder.h
@@ -32,39 +32,25 @@ __device__ void inner_product<bhalf4_t, bhalf4_t, float>(
 
 namespace {
 
-template <typename data4_t>
-__device__ ck::float4_t scalar4_scale_acc(ck::float4_t acc, data4_t a, float b);
+template <typename data_t, int32_t vec_size>
+__device__ 
+typename ck::vector_type<float, vec_size>::type
+scalar_scale_acc(typename ck::vector_type<float, vec_size>::type acc, 
+                 typename ck::vector_type<data_t, vec_size>::type a, 
+                 float b) {
+  
+  union { decltype(acc) vec; float arr[vec_size]; } acc_u;
+  union { decltype(a) vec; data_t arr[vec_size]; } a_u;
 
-template <>
-__device__ ck::float4_t scalar4_scale_acc<ck::float4_t>(
-    ck::float4_t acc,
-    ck::float4_t a,
-    float b) {
-  return acc + a * b;
-}
+  acc_u.vec = acc;
+  a_u.vec = a;
 
-template <>
-__device__ ck::float4_t scalar4_scale_acc<ck::half4_t>(
-    ck::float4_t acc,
-    ck::half4_t a,
-    float b) {
-  acc.x += ck::type_convert<float>(a.x) * b;
-  acc.y += ck::type_convert<float>(a.y) * b;
-  acc.z += ck::type_convert<float>(a.z) * b;
-  acc.w += ck::type_convert<float>(a.w) * b;
-  return acc;
-}
+  #pragma unroll
+  for (int32_t i = 0; i < vec_size; ++i) {
+    acc_u.arr[i] += ck::type_convert<float>(a_u.arr[i]) * b;
+  }
 
-template <>
-__device__ ck::float4_t scalar4_scale_acc<ck::bhalf4_t>(
-    ck::float4_t acc,
-    ck::bhalf4_t a,
-    float b) {
-  acc.x += ck::type_convert<float>(a.x) * b;
-  acc.y += ck::type_convert<float>(a.y) * b;
-  acc.z += ck::type_convert<float>(a.z) * b;
-  acc.w += ck::type_convert<float>(a.w) * b;
-  return acc;
+  return acc_u.vec;
 }
 
 template <typename F, int32_t n_threads_per_wavefront = 64>
@@ -94,6 +80,7 @@ __forceinline__ __device__ void store_v(
 
 template <
     typename scalar_t,
+    int32_t vec_size = 4,
     int32_t n_loop_unroll = 16,
     int32_t n_loop_unroll_tail = 2,
     int32_t T_MAX = 8192,
@@ -144,7 +131,6 @@ __global__ void efficient_attention_forward_decoder_ck_kernel(
   // Load Q into registers in all wavefronts.
   // Each thread handles 4 D dimensions
   
-  constexpr int32_t vec_size = 4;
   using data_t = scalar_t;
   using data_vec_t = typename ck::vector_type<data_t, vec_size>::type;
   using compute_t = float;
@@ -296,7 +282,7 @@ __global__ void efficient_attention_forward_decoder_ck_kernel(
 
 #pragma unroll n_loop_unroll
     for (auto ttt = 0; ttt < n_loop_unroll; ++ttt) {
-      o_acc = scalar4_scale_acc<data_vec_t>(o_acc, k_loads[ttt], ps[ttt]);
+      o_acc = scalar_scale_acc<data_t, vec_size>(o_acc, k_loads[ttt], ps[ttt]);
     }
   }
 
@@ -320,7 +306,7 @@ __global__ void efficient_attention_forward_decoder_ck_kernel(
     for (auto ttt = 0; ttt < n_loop_unroll_tail; ++ttt) {
       const int32_t t = tt + ttt;
       if (t < t_max) {
-        o_acc = scalar4_scale_acc<data_vec_t>(o_acc, k_loads[ttt], ps[ttt]);
+        o_acc = scalar_scale_acc<data_t, vec_size>(o_acc, k_loads[ttt], ps[ttt]);
       }
     }
   }

From b5d8311dcbbb1f104f523a46e5fcfaa980df2ea5 Mon Sep 17 00:00:00 2001
From: Max Podkorytov <4273004+tenpercent@users.noreply.github.com>
Date: Thu, 16 Nov 2023 14:47:05 -0500
Subject: [PATCH 215/641] support vec_size=1,2,4

---
 .../hip_fmha/ck_attention_forward_decoder.h   | 46 ++++++++++++++++++-
 xformers/ops/fmha/ck_decoder.py               | 14 +++++-
 2 files changed, 57 insertions(+), 3 deletions(-)

diff --git a/xformers/csrc/attention/hip_fmha/ck_attention_forward_decoder.h b/xformers/csrc/attention/hip_fmha/ck_attention_forward_decoder.h
index 845dbeaac..c550b0e98 100644
--- a/xformers/csrc/attention/hip_fmha/ck_attention_forward_decoder.h
+++ b/xformers/csrc/attention/hip_fmha/ck_attention_forward_decoder.h
@@ -16,6 +16,27 @@ __device__ void inner_product<bhalf_t, bhalf_t, float>(
   inner_product(type_convert<float>(a), type_convert<float>(b), c);
 }
 
+template<>
+__device__ void inner_product<half_t, half_t, float>(
+    const half_t& a,
+    const half_t& b,
+    float& c) {
+  inner_product(type_convert<float>(a), type_convert<float>(b), c);
+}
+
+template <>
+__device__ void inner_product<bhalf2_t, bhalf2_t, float>(
+    const bhalf2_t& a,
+    const bhalf2_t& b,
+    float& c) {
+  const vector_type<bhalf_t, 2> a_vector{a};
+  const vector_type<bhalf_t, 2> b_vector{b};
+  ck::static_for<0, 2, 1>{}([&](auto i) {
+    inner_product(
+        a_vector.AsType<bhalf_t>()[i], b_vector.AsType<bhalf_t>()[i], c);
+  });
+}
+
 template <>
 __device__ void inner_product<bhalf4_t, bhalf4_t, float>(
     const bhalf4_t& a,
@@ -405,14 +426,37 @@ struct FMHADecoderSeqlen1DeviceOp : public BaseOperator {
           block_dim(block_dim),
           lds_bytes(lds_bytes) {}
   };
+
   struct Invoker : public BaseInvoker {
     using Argument = DeviceOp::Argument;
     float Run(
         const Argument& arg,
         const StreamConfig& stream_config = StreamConfig{}) {
+
+      auto threads_per_wavefront = arg.block_dim.x;
+
+      auto D_H_alignment_necessary = 0;
+
+      for (auto vec_size: {4, 2, 1}) {
+        if (arg.D_H <= vec_size * threads_per_wavefront) {
+          D_H_alignment_necessary = vec_size;
+        }
+      }
+
+      if (!D_H_alignment_necessary) {
+        throw std::runtime_error("Unsupported D_H");
+      }
+
+      if (arg.D_H % D_H_alignment_necessary) {
+        throw std::runtime_error("Unsupported alignment for D_H");
+      }
+
       return launch_and_time_kernel(
           stream_config,
-          efficient_attention_forward_decoder_ck_kernel<scalar_t>,
+          D_H_alignment_necessary == 4 ? efficient_attention_forward_decoder_ck_kernel<scalar_t, 4> 
+        : D_H_alignment_necessary == 2 ? efficient_attention_forward_decoder_ck_kernel<scalar_t, 2>
+        : D_H_alignment_necessary == 1 ? efficient_attention_forward_decoder_ck_kernel<scalar_t, 1>
+        : nullptr,
           arg.grid_dim,
           arg.block_dim,
           arg.lds_bytes,
diff --git a/xformers/ops/fmha/ck_decoder.py b/xformers/ops/fmha/ck_decoder.py
index a94f26e68..ad131faf4 100644
--- a/xformers/ops/fmha/ck_decoder.py
+++ b/xformers/ops/fmha/ck_decoder.py
@@ -37,8 +37,18 @@ def not_supported_reasons(cls, d: Inputs) -> List[str]:
             if d.query.shape[-1] > cls.SUPPORTED_MAX_K:
                 reasons.append(f"Got head_dim={d.query.shape[-1]}; only head_dim<={cls.SUPPORTED_MAX_K} is supported for now.")
 
-            if d.query.shape[-1] % 4 != 0:
-                reasons.append(f"Got head_dim={d.query.shape[-1]}; it needs to be divisible by 4")
+            threads_per_warp = 64 # TODO: ideally query the platform here
+            required_alignment = 0
+            head_dim = d.query.shape[-1]
+            for vec_size in (4, 2, 1):
+                if head_dim <= vec_size * threads_per_warp:
+                    required_alignment = vec_size
+            
+            if not required_alignment:
+                reasons.append(f"Got head_dim={head_dim} which is too large")
+            
+            if head_dim % required_alignment != 0:
+                reasons.append(f"Got head_dim={head_dim}; it needs to be divisible by {required_alignment}")
 
             if d.key.stride(-1) != 1:
                 reasons.append("expect keys to have last dim contiguous")

From 4b74097575ac714a712949bb436284e244c2062a Mon Sep 17 00:00:00 2001
From: Max Podkorytov <4273004+tenpercent@users.noreply.github.com>
Date: Thu, 16 Nov 2023 15:17:56 -0500
Subject: [PATCH 216/641] simplify union init

---
 .../attention/hip_fmha/ck_attention_forward_decoder.h    | 9 +++------
 1 file changed, 3 insertions(+), 6 deletions(-)

diff --git a/xformers/csrc/attention/hip_fmha/ck_attention_forward_decoder.h b/xformers/csrc/attention/hip_fmha/ck_attention_forward_decoder.h
index c550b0e98..07fb7994a 100644
--- a/xformers/csrc/attention/hip_fmha/ck_attention_forward_decoder.h
+++ b/xformers/csrc/attention/hip_fmha/ck_attention_forward_decoder.h
@@ -60,12 +60,9 @@ scalar_scale_acc(typename ck::vector_type<float, vec_size>::type acc,
                  typename ck::vector_type<data_t, vec_size>::type a, 
                  float b) {
   
-  union { decltype(acc) vec; float arr[vec_size]; } acc_u;
-  union { decltype(a) vec; data_t arr[vec_size]; } a_u;
-
-  acc_u.vec = acc;
-  a_u.vec = a;
-
+  union { decltype(acc) vec; float arr[vec_size]; } acc_u {acc};
+  union { decltype(a) vec; data_t arr[vec_size]; } a_u {a};
+  
   #pragma unroll
   for (int32_t i = 0; i < vec_size; ++i) {
     acc_u.arr[i] += ck::type_convert<float>(a_u.arr[i]) * b;

From 9fd94ab42fbbdb7be0285288dddb5f1bccf692fe Mon Sep 17 00:00:00 2001
From: Max Podkorytov <4273004+tenpercent@users.noreply.github.com>
Date: Thu, 16 Nov 2023 17:13:42 -0500
Subject: [PATCH 217/641] partial fixes to cmakelists; wip

---
 .../csrc/attention/hip_fmha/CMakeLists.txt    | 53 ++++++++++++-------
 1 file changed, 33 insertions(+), 20 deletions(-)

diff --git a/xformers/csrc/attention/hip_fmha/CMakeLists.txt b/xformers/csrc/attention/hip_fmha/CMakeLists.txt
index 8e8c24e0b..8f5c8c5b7 100644
--- a/xformers/csrc/attention/hip_fmha/CMakeLists.txt
+++ b/xformers/csrc/attention/hip_fmha/CMakeLists.txt
@@ -1,43 +1,48 @@
 cmake_minimum_required(VERSION 3.26)
 
-project(FMHA-Decoder-Main)
+project(FMHADecoderMain LANGUAGES CXX)
+
+message("CMAKE_CXX_COMPILER: ${CMAKE_CXX_COMPILER} (need hipcc)")
 
-enable_language(CXX)
 set(CMAKE_CXX_STANDARD 17)
 set(CMAKE_CXX_STANDARD_REQUIRED ON)
 set(CMAKE_CXX_EXTENSIONS OFF)
 
+set(exe_name attention_forward_decoder_main)
 set(project_root_dir /xformers)
 set(xformers_csrc ${project_root_dir}/xformers/csrc)
 set(sources ${xformers_csrc}/attention/hip_fmha/attention_forward_decoder.hip)
 
-set(ck_include ${project_root_dir}/third_party/composable_kernel/include/ck)
+set(ck_include ${project_root_dir}/third_party/composable_kernel/include/)
 set(torch_include /opt/conda/envs/py_3.8/lib/python3.8/site-packages/torch/include)
 
-set(CMAKE_CXX_COMPILER /opt/rocm/hip/bin/hipcc)
-set(CMAKE_CXX_LINK_EXECUTABLE /opt/rocm/hip/bin/hipcc)
+set_source_files_properties(${sources} PROPERTIES LANGUAGE CXX)
+add_executable(${exe_name} ${sources})
 
-add_executable(attention_forward_decoder_main ${sources})
-message("CMAKE_CXX_COMPILER_ID: ${CMAKE_CXX_COMPILER_ID}")
+message("sources: ${sources}")
 
+message("CMAKE_CXX_COMPILER_ID: ${CMAKE_CXX_COMPILER_ID}")
 find_package(HIP REQUIRED)
+find_package(ROCM REQUIRED PATHS /opt/rocm)
+include(ROCMInstallTargets)
 
-message("HIP_VERSION: ${HIP_VERSION_MAJOR}.${HIP_VERSION_MINOR}")
+message("HIP_VERSION: ${HIP_VERSION_MAJOR}.${HIP_VERSION_MINOR}.${HIP_VERSION_PATCH}")
 
-set_target_properties(attention_forward_decoder_main PROPERTIES LINKER_LANGUAGE CXX)
+set_target_properties(${exe_name} PROPERTIES LINKER_LANGUAGE CXX)
+set_target_properties(${exe_name} PROPERTIES POSITION_INDEPENDENT_CODE ON)
 
-target_compile_options(attention_forward_decoder_main PUBLIC 
-  -fPIC 
+target_compile_options(${exe_name} PUBLIC 
   -O3 
-  --offload-arch=gfx90a
+  --offload-arch=${GPU_TARGETS}
   -fno-gpu-rdc)
 
-target_include_directories(attention_forward_decoder_main PUBLIC 
+target_include_directories(${exe_name} PUBLIC 
   ${xformers_csrc} 
   ${xformers_csrc}/attention/hip_fmha
-  ${ck_include}/tensor_operation/gpu/device
-  ${ck_include}/tensor_operation/gpu/device/impl
-  ${ck_include}/tensor_operation/gpu/element
+  ${ck_include}
+  ${ck_include}/ck/tensor_operation/gpu/device
+  ${ck_include}/ck/tensor_operation/gpu/device/impl
+  ${ck_include}/ck/tensor_operation/gpu/element
   ${torch_include}
   ${torch_include}/torch/csrc/api/include
   ${torch_include}/TH
@@ -45,14 +50,14 @@ target_include_directories(attention_forward_decoder_main PUBLIC
   ${torch_include}/THH
 )
 
-target_link_directories(attention_forward_decoder_main PUBLIC
+target_link_directories(${exe_name} PUBLIC
   /opt/conda/envs/py_3.8/lib/python3.8/site-packages/torch/lib
   /opt/conda/envs/py_3.8/lib
   /opt/rocm/lib
   /opt/rocm/hip/lib
 )
 
-target_link_libraries(attention_forward_decoder_main PUBLIC
+target_link_libraries(${exe_name} PUBLIC
   c10
   c10_hip
   torch
@@ -63,6 +68,14 @@ target_link_libraries(attention_forward_decoder_main PUBLIC
   amdhip64
 )
 
-target_compile_definitions(attention_forward_decoder_main PUBLIC 
-  ATTN_FWD_DECODER_MAIN
+target_compile_definitions(${exe_name} PUBLIC 
+  ATTN_FWD_DECODER_MAIN=1
+  GLIBCXX_USE_CXX11_ABI=1
+  __HIP_PLATFORM_HCC__=1
+  USE_ROCM=1
 )
+
+include(CMakePrintHelpers)
+cmake_print_properties(TARGETS ${exe_name} PROPERTIES LINK_LIBRARIES LINK_DIRECTORIES INCLUDE_DIRECTORIES COMPILE_DEFINITIONS COMPILE_OPTIONS)
+
+rocm_install(TARGETS ${exe_name})
\ No newline at end of file

From dc93fa03a5d30ad2c9996d8928ec7e9a1f7e6c15 Mon Sep 17 00:00:00 2001
From: Max Podkorytov <4273004+tenpercent@users.noreply.github.com>
Date: Thu, 16 Nov 2023 18:02:19 -0500
Subject: [PATCH 218/641] enable building standalone exe with cmake

---
 .../csrc/attention/hip_fmha/CMakeLists.txt    | 32 ++++-----
 .../hip_fmha/attention_forward_decoder.cpp    | 68 +++----------------
 2 files changed, 26 insertions(+), 74 deletions(-)

diff --git a/xformers/csrc/attention/hip_fmha/CMakeLists.txt b/xformers/csrc/attention/hip_fmha/CMakeLists.txt
index 8f5c8c5b7..29ad562f8 100644
--- a/xformers/csrc/attention/hip_fmha/CMakeLists.txt
+++ b/xformers/csrc/attention/hip_fmha/CMakeLists.txt
@@ -1,6 +1,6 @@
 cmake_minimum_required(VERSION 3.26)
 
-project(FMHADecoderMain LANGUAGES CXX)
+project(FMHADecoderMain LANGUAGES CXX HIP)
 
 message("CMAKE_CXX_COMPILER: ${CMAKE_CXX_COMPILER} (need hipcc)")
 
@@ -16,12 +16,9 @@ set(sources ${xformers_csrc}/attention/hip_fmha/attention_forward_decoder.hip)
 set(ck_include ${project_root_dir}/third_party/composable_kernel/include/)
 set(torch_include /opt/conda/envs/py_3.8/lib/python3.8/site-packages/torch/include)
 
-set_source_files_properties(${sources} PROPERTIES LANGUAGE CXX)
+set_source_files_properties(${sources} PROPERTIES LANGUAGE HIP)
 add_executable(${exe_name} ${sources})
 
-message("sources: ${sources}")
-
-message("CMAKE_CXX_COMPILER_ID: ${CMAKE_CXX_COMPILER_ID}")
 find_package(HIP REQUIRED)
 find_package(ROCM REQUIRED PATHS /opt/rocm)
 include(ROCMInstallTargets)
@@ -30,29 +27,27 @@ message("HIP_VERSION: ${HIP_VERSION_MAJOR}.${HIP_VERSION_MINOR}.${HIP_VERSION_PA
 
 set_target_properties(${exe_name} PROPERTIES LINKER_LANGUAGE CXX)
 set_target_properties(${exe_name} PROPERTIES POSITION_INDEPENDENT_CODE ON)
+set_target_properties(${exe_name} PROPERTIES HIP_ARCHITECTURES ${GPU_TARGETS})
 
 target_compile_options(${exe_name} PUBLIC 
   -O3 
   --offload-arch=${GPU_TARGETS}
-  -fno-gpu-rdc)
+  -fno-gpu-rdc
+  $<$<CONFIG:Debug>:
+  --save-temps
+  >
+)
 
 target_include_directories(${exe_name} PUBLIC 
   ${xformers_csrc} 
   ${xformers_csrc}/attention/hip_fmha
   ${ck_include}
-  ${ck_include}/ck/tensor_operation/gpu/device
-  ${ck_include}/ck/tensor_operation/gpu/device/impl
-  ${ck_include}/ck/tensor_operation/gpu/element
   ${torch_include}
   ${torch_include}/torch/csrc/api/include
-  ${torch_include}/TH
-  ${torch_include}/THC
-  ${torch_include}/THH
 )
 
 target_link_directories(${exe_name} PUBLIC
   /opt/conda/envs/py_3.8/lib/python3.8/site-packages/torch/lib
-  /opt/conda/envs/py_3.8/lib
   /opt/rocm/lib
   /opt/rocm/hip/lib
 )
@@ -61,10 +56,8 @@ target_link_libraries(${exe_name} PUBLIC
   c10
   c10_hip
   torch
-  torch_python
   torch_hip
   torch_cpu
-  python3.8
   amdhip64
 )
 
@@ -76,6 +69,13 @@ target_compile_definitions(${exe_name} PUBLIC
 )
 
 include(CMakePrintHelpers)
-cmake_print_properties(TARGETS ${exe_name} PROPERTIES LINK_LIBRARIES LINK_DIRECTORIES INCLUDE_DIRECTORIES COMPILE_DEFINITIONS COMPILE_OPTIONS)
+cmake_print_properties(TARGETS ${exe_name} PROPERTIES 
+  LINK_LIBRARIES 
+  LINK_DIRECTORIES 
+  INCLUDE_DIRECTORIES 
+  COMPILE_DEFINITIONS 
+  COMPILE_OPTIONS
+  SOURCES
+  HIP_ARCHITECTURES)
 
 rocm_install(TARGETS ${exe_name})
\ No newline at end of file
diff --git a/xformers/csrc/attention/hip_fmha/attention_forward_decoder.cpp b/xformers/csrc/attention/hip_fmha/attention_forward_decoder.cpp
index 6076b5022..79fb68368 100644
--- a/xformers/csrc/attention/hip_fmha/attention_forward_decoder.cpp
+++ b/xformers/csrc/attention/hip_fmha/attention_forward_decoder.cpp
@@ -2,7 +2,6 @@
   TODO: license header
 */
 
-// #include <ck/ck.hpp>
 #include <ATen/Dispatch.h>
 #include <ATen/Functions.h>
 #include <ATen/Tensor.h>
@@ -189,67 +188,20 @@ TORCH_LIBRARY_IMPL(xformers, CUDA, m) {
  For efficient utilization of CPU cores for compilation use MAX_JOBS env variable.
 
 (2) compile
- > /opt/rocm/bin/hipcc \
--I/xformers/xformers/csrc \
--I/xformers/xformers/csrc/attention/hip_fmha \
--I/xformers/third_party/composable_kernel/include \
--I/xformers/third_party/composable_kernel/include/ck \
--I/xformers/third_party/composable_kernel/include/ck/tensor_operation/gpu/device \
--I/xformers/third_party/composable_kernel/include/ck/tensor_operation/gpu/device/impl \
--I/xformers/third_party/composable_kernel/include/ck/tensor_operation/gpu/element \
--I/opt/conda/envs/py_3.8/lib/python3.8/site-packages/torch/include \
--I/opt/conda/envs/py_3.8/lib/python3.8/site-packages/torch/include/torch/csrc/api/include \
--I/opt/conda/envs/py_3.8/lib/python3.8/site-packages/torch/include/TH \
--I/opt/conda/envs/py_3.8/lib/python3.8/site-packages/torch/include/THC \
--I/opt/conda/envs/py_3.8/lib/python3.8/site-packages/torch/include/THH \
--I/opt/rocm/include \
--I/opt/conda/envs/py_3.8/include/python3.8 \
--L/opt/conda/envs/py_3.8/lib/python3.8/site-packages/torch/lib \
--L/opt/conda/envs/py_3.8/lib \
--L/opt/rocm/lib \
--L/opt/rocm/hip/lib \
--fPIC \
--D__HIP_PLATFORM_HCC__=1 \
--DATTN_FWD_DECODER_MAIN \
--DUSE_ROCM=1 \
--DCUDA_HAS_FP16=1 \
--D__HIP_NO_HALF_OPERATORS__=1 \
--D__HIP_NO_HALF_CONVERSIONS__=1 \
--O3 \
--std=c++17 \
---offload-arch=gfx90a \
--U__CUDA_NO_HALF_OPERATORS__ \
--U__CUDA_NO_HALF_CONVERSIONS__ \
--DBUILD_PYTHON_PACKAGE \
--DTORCH_API_INCLUDE_EXTENSION_H \
-'-DPYBIND11_COMPILER_TYPE="_gcc"' \
-'-DPYBIND11_STDLIB="_libstdcpp"' \
-'-DPYBIND11_BUILD_ABI="_cxxabi1013"' \
--DTORCH_EXTENSION_NAME=_C \
--D_GLIBCXX_USE_CXX11_ABI=1 \
--fno-gpu-rdc \
-/xformers/xformers/csrc/attention/hip_fmha/attention_forward_decoder.hip \
--lc10_hip \
--ltorch_hip \
--lc10 \
--ltorch \
--ltorch_cpu \
--ltorch_python \
--lpython3.8 \
--lamdhip64 \
--o a.out
-
-For assembly debugging, add `--save-temps -g`.
+ > mkdir build
+ > cd build
+ > cmake /xformers/xformers/csrc/attention/hip_fmha/ \ 
+       -DCMAKE_CXX_COMPILER=/opt/rocm/bin/hipcc \
+       -D CMAKE_PREFIX_PATH=/opt/rocm \
+       -D CMAKE_BUILD_TYPE=Debug \
+       -D GPU_TARGETS="gfx90a" 
+  > make
 
 (3a) run correctness check
- >
-LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/conda/envs/py_3.8/lib/python3.8/site-packages/torch/lib \
- ./a.out
+ > ./attention_forward_decoder_main
 
 (3b) run specific input shape
- >
-LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/conda/envs/py_3.8/lib/python3.8/site-packages/torch/lib \
- ./a.out n_keys padding batch_size n_heads is_multiquery dtype n_wavefronts_per_block
+ > ./attention_forward_decoder_main n_keys padding batch_size n_heads is_multiquery dtype n_wavefronts_per_block
 */
 
 // clang-format on

From 64da2b9dc0ce37a312012188e8996ff219fd9b6a Mon Sep 17 00:00:00 2001
From: Max Podkorytov <4273004+tenpercent@users.noreply.github.com>
Date: Thu, 16 Nov 2023 18:43:56 -0500
Subject: [PATCH 219/641] cleanup includes and libraries for standalone exe

---
 xformers/csrc/attention/hip_fmha/CMakeLists.txt | 12 ++++--------
 1 file changed, 4 insertions(+), 8 deletions(-)

diff --git a/xformers/csrc/attention/hip_fmha/CMakeLists.txt b/xformers/csrc/attention/hip_fmha/CMakeLists.txt
index 29ad562f8..d0282cfb9 100644
--- a/xformers/csrc/attention/hip_fmha/CMakeLists.txt
+++ b/xformers/csrc/attention/hip_fmha/CMakeLists.txt
@@ -31,7 +31,6 @@ set_target_properties(${exe_name} PROPERTIES HIP_ARCHITECTURES ${GPU_TARGETS})
 
 target_compile_options(${exe_name} PUBLIC 
   -O3 
-  --offload-arch=${GPU_TARGETS}
   -fno-gpu-rdc
   $<$<CONFIG:Debug>:
   --save-temps
@@ -39,16 +38,13 @@ target_compile_options(${exe_name} PUBLIC
 )
 
 target_include_directories(${exe_name} PUBLIC 
-  ${xformers_csrc} 
-  ${xformers_csrc}/attention/hip_fmha
-  ${ck_include}
-  ${torch_include}
-  ${torch_include}/torch/csrc/api/include
+  ${ck_include}                           # ck includes
+  ${torch_include}                        # aten includes
+  ${torch_include}/torch/csrc/api/include # torch includes
 )
 
 target_link_directories(${exe_name} PUBLIC
-  /opt/conda/envs/py_3.8/lib/python3.8/site-packages/torch/lib
-  /opt/rocm/lib
+  /opt/conda/envs/py_3.8/lib/python3.8/site-packages/torch/lib # c10, torch
   /opt/rocm/hip/lib
 )
 

From 684e5e0ec5d2c0e5351b6d6fb92b4a5a4939d056 Mon Sep 17 00:00:00 2001
From: Max Podkorytov <4273004+tenpercent@users.noreply.github.com>
Date: Thu, 16 Nov 2023 18:54:13 -0500
Subject: [PATCH 220/641] remove unnecessary -O3 in cmakelists

---
 xformers/csrc/attention/hip_fmha/CMakeLists.txt | 1 -
 1 file changed, 1 deletion(-)

diff --git a/xformers/csrc/attention/hip_fmha/CMakeLists.txt b/xformers/csrc/attention/hip_fmha/CMakeLists.txt
index d0282cfb9..a95c68fbe 100644
--- a/xformers/csrc/attention/hip_fmha/CMakeLists.txt
+++ b/xformers/csrc/attention/hip_fmha/CMakeLists.txt
@@ -30,7 +30,6 @@ set_target_properties(${exe_name} PROPERTIES POSITION_INDEPENDENT_CODE ON)
 set_target_properties(${exe_name} PROPERTIES HIP_ARCHITECTURES ${GPU_TARGETS})
 
 target_compile_options(${exe_name} PUBLIC 
-  -O3 
   -fno-gpu-rdc
   $<$<CONFIG:Debug>:
   --save-temps

From 2e79bc9e0f4970ceb52bb63dd5da7c79b802bc41 Mon Sep 17 00:00:00 2001
From: Max Podkorytov <4273004+tenpercent@users.noreply.github.com>
Date: Fri, 17 Nov 2023 13:00:08 -0500
Subject: [PATCH 221/641] use d=128 dtype=bf16 in the benchmark

---
 xformers/benchmarks/benchmark_mem_eff_attn_decoder_ck.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/xformers/benchmarks/benchmark_mem_eff_attn_decoder_ck.py b/xformers/benchmarks/benchmark_mem_eff_attn_decoder_ck.py
index 460279c7f..bfbe4c35b 100644
--- a/xformers/benchmarks/benchmark_mem_eff_attn_decoder_ck.py
+++ b/xformers/benchmarks/benchmark_mem_eff_attn_decoder_ck.py
@@ -118,8 +118,8 @@ def mem_eff_attention_decoder(
     n_keys, padding, B = kv_shape
     torch.manual_seed(42)
     k_seqlen = torch.randint(1, n_keys + 1, (B,)).tolist()
-    K = 256
-    dtype = torch.float16
+    K = 128
+    dtype = torch.bfloat16
     q = torch.rand(1, B, n_heads, K, device=device, dtype=dtype)
     if multiquery:
         k = torch.rand(

From 09829a4e2b8a2eb68aebe2e812e811c59bbcc74a Mon Sep 17 00:00:00 2001
From: Max Podkorytov <4273004+tenpercent@users.noreply.github.com>
Date: Fri, 17 Nov 2023 13:25:27 -0500
Subject: [PATCH 222/641] update comment to reflect that vec_size is variable

---
 xformers/csrc/attention/hip_fmha/ck_attention_forward_decoder.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/xformers/csrc/attention/hip_fmha/ck_attention_forward_decoder.h b/xformers/csrc/attention/hip_fmha/ck_attention_forward_decoder.h
index 07fb7994a..68dfe6162 100644
--- a/xformers/csrc/attention/hip_fmha/ck_attention_forward_decoder.h
+++ b/xformers/csrc/attention/hip_fmha/ck_attention_forward_decoder.h
@@ -147,7 +147,7 @@ __global__ void efficient_attention_forward_decoder_ck_kernel(
   const auto* __restrict__ cache_V_base = cache_V + cache_KV_base_offset;
 
   // Load Q into registers in all wavefronts.
-  // Each thread handles 4 D dimensions
+  // Each thread handles `vec_size` D dimensions
   
   using data_t = scalar_t;
   using data_vec_t = typename ck::vector_type<data_t, vec_size>::type;

From becbbad2f51126ddcf4f3f9f588878584ee4589d Mon Sep 17 00:00:00 2001
From: Max Podkorytov <4273004+tenpercent@users.noreply.github.com>
Date: Fri, 17 Nov 2023 13:50:10 -0500
Subject: [PATCH 223/641] move active lane condition one loop level up for ~5%
 perf gain

---
 .../hip_fmha/ck_attention_forward_decoder.h   | 48 ++++++++++---------
 1 file changed, 25 insertions(+), 23 deletions(-)

diff --git a/xformers/csrc/attention/hip_fmha/ck_attention_forward_decoder.h b/xformers/csrc/attention/hip_fmha/ck_attention_forward_decoder.h
index 68dfe6162..53d09c83b 100644
--- a/xformers/csrc/attention/hip_fmha/ck_attention_forward_decoder.h
+++ b/xformers/csrc/attention/hip_fmha/ck_attention_forward_decoder.h
@@ -175,11 +175,11 @@ __global__ void efficient_attention_forward_decoder_ck_kernel(
   const int32_t t_max_unroll = (t_max / dtt) * dtt;
 
   for (auto tt = wavefront_idx * n_loop_unroll; tt < t_max_unroll; tt += dtt) {
+    if (lane_active_for_io) {
 #pragma unroll n_loop_unroll
-    for (auto ttt = 0; ttt < n_loop_unroll; ++ttt) {
-      const int32_t t = tt + ttt;
-      // load the K[b][t][h|0][:] row into registers
-      if (lane_active_for_io) {
+      for (auto ttt = 0; ttt < n_loop_unroll; ++ttt) {
+        const int32_t t = tt + ttt;
+        // load the K[b][t][h|0][:] row into registers
         load_v<data_t, data_vec_t>(
             cache_K_base + t * K_stride_1, lane_idx, &k_loads[ttt]);
       } 
@@ -207,11 +207,12 @@ __global__ void efficient_attention_forward_decoder_ck_kernel(
   // NB: the length of the tail is <= (wavefronts_per_block * n_loop_unroll)
   for (auto tt = t_max_unroll + wavefront_idx * n_loop_unroll_tail; tt < t_max;
        tt += wavefronts_per_block * n_loop_unroll_tail) {
+        
+    if (lane_active_for_io) {
 #pragma unroll n_loop_unroll_tail
-    for (auto ttt = 0; ttt < n_loop_unroll_tail; ++ttt) {
-      const int32_t t = tt + ttt;
-      if (t < t_max) {
-        if (lane_active_for_io) {
+      for (auto ttt = 0; ttt < n_loop_unroll_tail; ++ttt) {
+        const int32_t t = tt + ttt;
+        if (t < t_max) {
           // load the K[b][t][h|0][:] row into registers
           load_v<data_t, data_vec_t>(
               cache_K_base + t * K_stride_1, lane_idx, &k_loads[ttt]);
@@ -284,18 +285,18 @@ __global__ void efficient_attention_forward_decoder_ck_kernel(
   // each wavefront compute sum(t_subset) P[t] * V[t_subset, d]
   // outputs are of size float[D]
 
-  compute_t ps[n_loop_unroll];
+  compute_t ps[n_loop_unroll] = {};
   compute_vec_t o_acc = 0;
   for (auto tt = wavefront_idx * n_loop_unroll; tt < t_max_unroll; tt += dtt) {
+    if (lane_active_for_io) {
 #pragma unroll n_loop_unroll
-    for (auto ttt = 0; ttt < n_loop_unroll; ++ttt) {
-      const int32_t t = tt + ttt;
-      if (lane_active_for_io) {
+      for (auto ttt = 0; ttt < n_loop_unroll; ++ttt) {
+        const int32_t t = tt + ttt;
         // load the V[b][t][h|0][:] row into registers, reusing K register storage
         load_v<data_t, data_vec_t>(
-            cache_V_base + t * K_stride_1, lane_idx, &k_loads[ttt]);
-      } 
-      ps[ttt] = smem[t];
+            cache_V_base + t * K_stride_1, lane_idx, &k_loads[ttt]); 
+        ps[ttt] = smem[t];
+      }
     }
 
 #pragma unroll n_loop_unroll
@@ -306,17 +307,18 @@ __global__ void efficient_attention_forward_decoder_ck_kernel(
 
   for (auto tt = t_max_unroll + wavefront_idx * n_loop_unroll_tail; tt < t_max;
        tt += wavefronts_per_block * n_loop_unroll_tail) {
+        
+    if (lane_active_for_io) {
 #pragma unroll n_loop_unroll_tail
-    for (auto ttt = 0; ttt < n_loop_unroll_tail; ++ttt) {
-      const int32_t t = tt + ttt;
-      if (t < t_max) {
-        // load the V[b][t][h|0][:] row into registers, reusing K register
-        // storage
-        if (lane_active_for_io) {
+      for (auto ttt = 0; ttt < n_loop_unroll_tail; ++ttt) {
+        const int32_t t = tt + ttt;
+        if (t < t_max) {
+          // load the V[b][t][h|0][:] row into registers, reusing K register
+          // storage
           load_v<data_t, data_vec_t>(
               cache_V_base + t * K_stride_1, lane_idx, &k_loads[ttt]);
-        } 
-        ps[ttt] = smem[t];
+          ps[ttt] = smem[t];
+        }
       }
     }
 

From fcf9817e3fc2ab035c0110d1541b0c26b624d7de Mon Sep 17 00:00:00 2001
From: Max Podkorytov <4273004+tenpercent@users.noreply.github.com>
Date: Fri, 17 Nov 2023 14:20:53 -0500
Subject: [PATCH 224/641] move active lane condition one more loop level up in
 SV calculation, a bit more perf gain + clang-format

---
 .../hip_fmha/ck_attention_forward_decoder.h   | 113 ++++++++++--------
 1 file changed, 63 insertions(+), 50 deletions(-)

diff --git a/xformers/csrc/attention/hip_fmha/ck_attention_forward_decoder.h b/xformers/csrc/attention/hip_fmha/ck_attention_forward_decoder.h
index 53d09c83b..ef68559eb 100644
--- a/xformers/csrc/attention/hip_fmha/ck_attention_forward_decoder.h
+++ b/xformers/csrc/attention/hip_fmha/ck_attention_forward_decoder.h
@@ -1,4 +1,4 @@
-#pragma once 
+#pragma once
 
 #include <ck/host_utility/kernel_launch_hip.hpp>
 #include <ck/stream_config.hpp>
@@ -16,7 +16,7 @@ __device__ void inner_product<bhalf_t, bhalf_t, float>(
   inner_product(type_convert<float>(a), type_convert<float>(b), c);
 }
 
-template<>
+template <>
 __device__ void inner_product<half_t, half_t, float>(
     const half_t& a,
     const half_t& b,
@@ -54,16 +54,20 @@ __device__ void inner_product<bhalf4_t, bhalf4_t, float>(
 namespace {
 
 template <typename data_t, int32_t vec_size>
-__device__ 
-typename ck::vector_type<float, vec_size>::type
-scalar_scale_acc(typename ck::vector_type<float, vec_size>::type acc, 
-                 typename ck::vector_type<data_t, vec_size>::type a, 
-                 float b) {
-  
-  union { decltype(acc) vec; float arr[vec_size]; } acc_u {acc};
-  union { decltype(a) vec; data_t arr[vec_size]; } a_u {a};
-  
-  #pragma unroll
+__device__ typename ck::vector_type<float, vec_size>::type scalar_scale_acc(
+    typename ck::vector_type<float, vec_size>::type acc,
+    typename ck::vector_type<data_t, vec_size>::type a,
+    float b) {
+  union {
+    decltype(acc) vec;
+    float arr[vec_size];
+  } acc_u{acc};
+  union {
+    decltype(a) vec;
+    data_t arr[vec_size];
+  } a_u{a};
+
+#pragma unroll
   for (int32_t i = 0; i < vec_size; ++i) {
     acc_u.arr[i] += ck::type_convert<float>(a_u.arr[i]) * b;
   }
@@ -85,7 +89,7 @@ __forceinline__ __device__ void load_v(
     const TData* __restrict__ data_ptr,
     int32_t vector_offset,
     TDataVec* __restrict__ load_to) {
-      *load_to = *(reinterpret_cast<const TDataVec*>(data_ptr) + vector_offset);
+  *load_to = *(reinterpret_cast<const TDataVec*>(data_ptr) + vector_offset);
 }
 
 template <typename TData, typename TDataVec>
@@ -93,7 +97,7 @@ __forceinline__ __device__ void store_v(
     TData* __restrict__ data_ptr,
     int32_t vector_offset,
     TDataVec value) {
-      *(reinterpret_cast<TDataVec*>(data_ptr) + vector_offset) = value;
+  *(reinterpret_cast<TDataVec*>(data_ptr) + vector_offset) = value;
 }
 
 template <
@@ -138,7 +142,8 @@ __global__ void efficient_attention_forward_decoder_ck_kernel(
   const int32_t thread_linear_idx =
       lane_idx + wavefront_idx * threads_per_wavefront;
   // const auto* q_ = &(XQ_acc[b][m][h][0]);
-  const auto XQO_base_offset = b * XQ_stride_0 + m * XQ_stride_1 + h * XQ_stride_2;
+  const auto XQO_base_offset =
+      b * XQ_stride_0 + m * XQ_stride_1 + h * XQ_stride_2;
   const auto* __restrict__ q_ = XQ + XQO_base_offset;
 
   const auto cache_KV_base_offset =
@@ -148,7 +153,7 @@ __global__ void efficient_attention_forward_decoder_ck_kernel(
 
   // Load Q into registers in all wavefronts.
   // Each thread handles `vec_size` D dimensions
-  
+
   using data_t = scalar_t;
   using data_vec_t = typename ck::vector_type<data_t, vec_size>::type;
   using compute_t = float;
@@ -161,7 +166,7 @@ __global__ void efficient_attention_forward_decoder_ck_kernel(
   data_vec_t q_thread = 0;
   if (lane_active_for_io) {
     load_v<data_t, data_vec_t>(q_, lane_idx, &q_thread);
-  } 
+  }
   // Each block computes different B value
   compute_t max_qk_acc = ck::NumericLimits<compute_t>::Lowest();
 
@@ -182,7 +187,7 @@ __global__ void efficient_attention_forward_decoder_ck_kernel(
         // load the K[b][t][h|0][:] row into registers
         load_v<data_t, data_vec_t>(
             cache_K_base + t * K_stride_1, lane_idx, &k_loads[ttt]);
-      } 
+      }
     }
     compute_t qk_accs[n_loop_unroll] = {};
 #pragma unroll n_loop_unroll
@@ -207,7 +212,6 @@ __global__ void efficient_attention_forward_decoder_ck_kernel(
   // NB: the length of the tail is <= (wavefronts_per_block * n_loop_unroll)
   for (auto tt = t_max_unroll + wavefront_idx * n_loop_unroll_tail; tt < t_max;
        tt += wavefronts_per_block * n_loop_unroll_tail) {
-        
     if (lane_active_for_io) {
 #pragma unroll n_loop_unroll_tail
       for (auto ttt = 0; ttt < n_loop_unroll_tail; ++ttt) {
@@ -216,7 +220,7 @@ __global__ void efficient_attention_forward_decoder_ck_kernel(
           // load the K[b][t][h|0][:] row into registers
           load_v<data_t, data_vec_t>(
               cache_K_base + t * K_stride_1, lane_idx, &k_loads[ttt]);
-        } 
+        }
       }
     }
 #pragma unroll n_loop_unroll_tail
@@ -228,8 +232,7 @@ __global__ void efficient_attention_forward_decoder_ck_kernel(
             q_thread, k_loads[ttt], qk_acc);
         qk_acc *= qk_scale;
 
-        qk_acc =
-            wavefrontReduce(qk_acc, [](auto a, auto b) { return a + b; });
+        qk_acc = wavefrontReduce(qk_acc, [](auto a, auto b) { return a + b; });
         max_qk_acc = ck::math::max(qk_acc, max_qk_acc);
 
         // write accumulated sums to smem.
@@ -250,8 +253,8 @@ __global__ void efficient_attention_forward_decoder_ck_kernel(
     max_qk_acc = ck::math::max(max_qk_acc, smem[T_MAX + lane_idx]);
   }
   // shared across all threads in block
-  max_qk_acc = wavefrontReduce(
-      max_qk_acc, [](auto a, auto b) { return a > b ? a : b; });
+  max_qk_acc =
+      wavefrontReduce(max_qk_acc, [](auto a, auto b) { return a > b ? a : b; });
 
   // each wavefront computes partial sum of exp.
   compute_t softmax_denominator = 0.0f;
@@ -287,28 +290,29 @@ __global__ void efficient_attention_forward_decoder_ck_kernel(
 
   compute_t ps[n_loop_unroll] = {};
   compute_vec_t o_acc = 0;
-  for (auto tt = wavefront_idx * n_loop_unroll; tt < t_max_unroll; tt += dtt) {
-    if (lane_active_for_io) {
+  if (lane_active_for_io) {
+    for (auto tt = wavefront_idx * n_loop_unroll; tt < t_max_unroll;
+         tt += dtt) {
 #pragma unroll n_loop_unroll
       for (auto ttt = 0; ttt < n_loop_unroll; ++ttt) {
         const int32_t t = tt + ttt;
-        // load the V[b][t][h|0][:] row into registers, reusing K register storage
+        // load the V[b][t][h|0][:] row into registers, reusing K register
+        // storage
         load_v<data_t, data_vec_t>(
-            cache_V_base + t * K_stride_1, lane_idx, &k_loads[ttt]); 
+            cache_V_base + t * K_stride_1, lane_idx, &k_loads[ttt]);
         ps[ttt] = smem[t];
       }
-    }
 
 #pragma unroll n_loop_unroll
-    for (auto ttt = 0; ttt < n_loop_unroll; ++ttt) {
-      o_acc = scalar_scale_acc<data_t, vec_size>(o_acc, k_loads[ttt], ps[ttt]);
+      for (auto ttt = 0; ttt < n_loop_unroll; ++ttt) {
+        o_acc =
+            scalar_scale_acc<data_t, vec_size>(o_acc, k_loads[ttt], ps[ttt]);
+      }
     }
-  }
 
-  for (auto tt = t_max_unroll + wavefront_idx * n_loop_unroll_tail; tt < t_max;
-       tt += wavefronts_per_block * n_loop_unroll_tail) {
-        
-    if (lane_active_for_io) {
+    for (auto tt = t_max_unroll + wavefront_idx * n_loop_unroll_tail;
+         tt < t_max;
+         tt += wavefronts_per_block * n_loop_unroll_tail) {
 #pragma unroll n_loop_unroll_tail
       for (auto ttt = 0; ttt < n_loop_unroll_tail; ++ttt) {
         const int32_t t = tt + ttt;
@@ -320,13 +324,14 @@ __global__ void efficient_attention_forward_decoder_ck_kernel(
           ps[ttt] = smem[t];
         }
       }
-    }
 
 #pragma unroll n_loop_unroll_tail
-    for (auto ttt = 0; ttt < n_loop_unroll_tail; ++ttt) {
-      const int32_t t = tt + ttt;
-      if (t < t_max) {
-        o_acc = scalar_scale_acc<data_t, vec_size>(o_acc, k_loads[ttt], ps[ttt]);
+      for (auto ttt = 0; ttt < n_loop_unroll_tail; ++ttt) {
+        const int32_t t = tt + ttt;
+        if (t < t_max) {
+          o_acc =
+              scalar_scale_acc<data_t, vec_size>(o_acc, k_loads[ttt], ps[ttt]);
+        }
       }
     }
   }
@@ -342,7 +347,10 @@ __global__ void efficient_attention_forward_decoder_ck_kernel(
   __syncthreads();
   // sum up partial D rows from other wavefronts
   if (wavefront_idx == 0 && lane_active_for_io) {
-    union { compute_vec_t vec = 0; compute_t arr[vec_size]; } r;
+    union {
+      compute_vec_t vec = 0;
+      compute_t arr[vec_size];
+    } r;
     for (int32_t w = 0; w < wavefronts_per_block; ++w) {
       compute_vec_t partial_r;
       load_v<compute_t, compute_vec_t>(
@@ -350,8 +358,11 @@ __global__ void efficient_attention_forward_decoder_ck_kernel(
       r.vec += partial_r;
     }
     // elementwise convert from compute_t result to data_t out to be written
-    union { data_vec_t vec; data_t arr[vec_size]; } bf_r;
-    #pragma unroll 
+    union {
+      data_vec_t vec;
+      data_t arr[vec_size];
+    } bf_r;
+#pragma unroll
     for (int32_t i = 0; i < vec_size; ++i) {
       bf_r.arr[i] = ck::type_convert<data_t>(r.arr[i]);
     }
@@ -431,12 +442,11 @@ struct FMHADecoderSeqlen1DeviceOp : public BaseOperator {
     float Run(
         const Argument& arg,
         const StreamConfig& stream_config = StreamConfig{}) {
-
       auto threads_per_wavefront = arg.block_dim.x;
 
       auto D_H_alignment_necessary = 0;
 
-      for (auto vec_size: {4, 2, 1}) {
+      for (auto vec_size : {4, 2, 1}) {
         if (arg.D_H <= vec_size * threads_per_wavefront) {
           D_H_alignment_necessary = vec_size;
         }
@@ -452,10 +462,13 @@ struct FMHADecoderSeqlen1DeviceOp : public BaseOperator {
 
       return launch_and_time_kernel(
           stream_config,
-          D_H_alignment_necessary == 4 ? efficient_attention_forward_decoder_ck_kernel<scalar_t, 4> 
-        : D_H_alignment_necessary == 2 ? efficient_attention_forward_decoder_ck_kernel<scalar_t, 2>
-        : D_H_alignment_necessary == 1 ? efficient_attention_forward_decoder_ck_kernel<scalar_t, 1>
-        : nullptr,
+          D_H_alignment_necessary == 4
+              ? efficient_attention_forward_decoder_ck_kernel<scalar_t, 4>
+              : D_H_alignment_necessary == 2
+              ? efficient_attention_forward_decoder_ck_kernel<scalar_t, 2>
+              : D_H_alignment_necessary == 1
+              ? efficient_attention_forward_decoder_ck_kernel<scalar_t, 1>
+              : nullptr,
           arg.grid_dim,
           arg.block_dim,
           arg.lds_bytes,

From 8ba431eaf53cfa439611bd0fbc0a052ab5ded49e Mon Sep 17 00:00:00 2001
From: Max Podkorytov <4273004+tenpercent@users.noreply.github.com>
Date: Fri, 17 Nov 2023 14:33:00 -0500
Subject: [PATCH 225/641] replace one more instance of hardcoded 4 with
 vec_size in a comment

---
 xformers/csrc/attention/hip_fmha/ck_attention_forward_decoder.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/xformers/csrc/attention/hip_fmha/ck_attention_forward_decoder.h b/xformers/csrc/attention/hip_fmha/ck_attention_forward_decoder.h
index ef68559eb..052a1d808 100644
--- a/xformers/csrc/attention/hip_fmha/ck_attention_forward_decoder.h
+++ b/xformers/csrc/attention/hip_fmha/ck_attention_forward_decoder.h
@@ -339,7 +339,7 @@ __global__ void efficient_attention_forward_decoder_ck_kernel(
   // results back.
   __syncthreads();
 
-  // NB: needs sizeof(smem) >= 4 * (sizeof(float)==4) * threadsPerBlock
+  // NB: needs sizeof(smem) >= `vec_size` * (sizeof(float)==4) * threadsPerBlock
   if (lane_active_for_io) {
     store_v<compute_t, compute_vec_t>(&smem[0], thread_linear_idx, o_acc);
   }

From bc9737ca09a69315e025960943d7cf1a66aec7df Mon Sep 17 00:00:00 2001
From: Max Podkorytov <4273004+tenpercent@users.noreply.github.com>
Date: Fri, 17 Nov 2023 17:43:53 -0500
Subject: [PATCH 226/641] unhardcode gfx arch

---
 setup.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/setup.py b/setup.py
index 647e09620..92d0bcbad 100644
--- a/setup.py
+++ b/setup.py
@@ -311,7 +311,7 @@ def get_extensions():
                 [
                     "-O3",
                     "-std=c++17",
-                    "--offload-arch=gfx90a",
+                    f"--offload-arch={os.getenv('HIP_ARCHITECTURES', 'gfx90a')}",
                     "-U__CUDA_NO_HALF_OPERATORS__",
                     "-U__CUDA_NO_HALF_CONVERSIONS__",
                 ]

From 846188545d016cc59ac723ce95649308b6d6d72e Mon Sep 17 00:00:00 2001
From: Max Podkorytov <4273004+tenpercent@users.noreply.github.com>
Date: Fri, 17 Nov 2023 18:19:40 -0500
Subject: [PATCH 227/641] use native gfx arch by default

---
 setup.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/setup.py b/setup.py
index 92d0bcbad..41922e8a6 100644
--- a/setup.py
+++ b/setup.py
@@ -311,7 +311,7 @@ def get_extensions():
                 [
                     "-O3",
                     "-std=c++17",
-                    f"--offload-arch={os.getenv('HIP_ARCHITECTURES', 'gfx90a')}",
+                    f"--offload-arch={os.getenv('HIP_ARCHITECTURES', 'native')}",
                     "-U__CUDA_NO_HALF_OPERATORS__",
                     "-U__CUDA_NO_HALF_CONVERSIONS__",
                 ]

From e7e83c806130ff4e4cf2a8046f94ec200aa94d59 Mon Sep 17 00:00:00 2001
From: Qianfeng Zhang <Qianfeng.Zhang@amd.com>
Date: Mon, 20 Nov 2023 19:01:22 +0000
Subject: [PATCH 228/641] Add https://github.com/asroy/ck_tile.git as submodule
 for using ck-tiled kernels

---
 .gitmodules                         | 3 +++
 third_party/composable_kernel_tiled | 1 +
 2 files changed, 4 insertions(+)
 create mode 160000 third_party/composable_kernel_tiled

diff --git a/.gitmodules b/.gitmodules
index 94eb8135c..dd09e4429 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -8,3 +8,6 @@
 [submodule "third_party/flash-attention"]
 	path = third_party/flash-attention
 	url = https://github.com/Dao-AILab/flash-attention.git
+[submodule "third_party/composable_kernel_tiled"]
+	path = third_party/composable_kernel_tiled
+	url = https://github.com/asroy/ck_tile.git
diff --git a/third_party/composable_kernel_tiled b/third_party/composable_kernel_tiled
new file mode 160000
index 000000000..496be40ef
--- /dev/null
+++ b/third_party/composable_kernel_tiled
@@ -0,0 +1 @@
+Subproject commit 496be40efde65ace153fe53ec9a3865828f2d3cc

From dd3aeab01dd9133922799c1abf8f72e560ee095c Mon Sep 17 00:00:00 2001
From: Qianfeng Zhang <Qianfeng.Zhang@amd.com>
Date: Mon, 20 Nov 2023 19:56:25 +0000
Subject: [PATCH 229/641] Create codes structure and change to setup.py to use
 ck-tiled programming for inference

---
 setup.py                                      |  42 +-
 .../attention_forward_generic_ck_tiled.cpp    | 439 ++++++++++++++++++
 .../hip_fmha/ck_tiled_fmha_batched_infer.h    |  28 ++
 .../ck_tiled_fmha_batched_infer_bp16.cpp      |  58 +++
 .../ck_tiled_fmha_batched_infer_fp16.cpp      |  58 +++
 .../hip_fmha/ck_tiled_fmha_grouped_infer.h    |  28 ++
 .../ck_tiled_fmha_grouped_infer_bp16.cpp      |  58 +++
 .../ck_tiled_fmha_grouped_infer_fp16.cpp      |  58 +++
 ...ched_infer_bp16_masktype_0_no_attnbias.cpp |   8 +
 ...ed_infer_bp16_masktype_0_with_attnbias.cpp |   8 +
 ...ched_infer_bp16_masktype_1_no_attnbias.cpp |   8 +
 ...ed_infer_bp16_masktype_1_with_attnbias.cpp |   8 +
 ...ched_infer_bp16_masktype_2_no_attnbias.cpp |   8 +
 ...ed_infer_bp16_masktype_2_with_attnbias.cpp |   8 +
 ...ched_infer_fp16_masktype_0_no_attnbias.cpp |   8 +
 ...ed_infer_fp16_masktype_0_with_attnbias.cpp |   8 +
 ...ched_infer_fp16_masktype_1_no_attnbias.cpp |   8 +
 ...ed_infer_fp16_masktype_1_with_attnbias.cpp |   8 +
 ...ched_infer_fp16_masktype_2_no_attnbias.cpp |   8 +
 ...ed_infer_fp16_masktype_2_with_attnbias.cpp |   8 +
 ...uped_infer_bp16_masktype_0_no_attnbias.cpp |   8 +
 ...ed_infer_bp16_masktype_0_with_attnbias.cpp |   8 +
 ...uped_infer_bp16_masktype_1_no_attnbias.cpp |   8 +
 ...ed_infer_bp16_masktype_1_with_attnbias.cpp |   8 +
 ...uped_infer_bp16_masktype_2_no_attnbias.cpp |   8 +
 ...ed_infer_bp16_masktype_2_with_attnbias.cpp |   8 +
 ...uped_infer_fp16_masktype_0_no_attnbias.cpp |   8 +
 ...ed_infer_fp16_masktype_0_with_attnbias.cpp |   8 +
 ...uped_infer_fp16_masktype_1_no_attnbias.cpp |   8 +
 ...ed_infer_fp16_masktype_1_with_attnbias.cpp |   8 +
 ...uped_infer_fp16_masktype_2_no_attnbias.cpp |   8 +
 ...ed_infer_fp16_masktype_2_with_attnbias.cpp |   8 +
 32 files changed, 952 insertions(+), 9 deletions(-)
 create mode 100644 xformers/csrc/attention/hip_fmha/attention_forward_generic_ck_tiled.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_infer.h
 create mode 100644 xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_infer_bp16.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_infer_fp16.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_infer.h
 create mode 100644 xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_infer_bp16.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_infer_fp16.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_bp16_masktype_0_no_attnbias.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_bp16_masktype_0_with_attnbias.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_bp16_masktype_1_no_attnbias.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_bp16_masktype_1_with_attnbias.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_bp16_masktype_2_no_attnbias.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_bp16_masktype_2_with_attnbias.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_fp16_masktype_0_no_attnbias.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_fp16_masktype_0_with_attnbias.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_fp16_masktype_1_no_attnbias.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_fp16_masktype_1_with_attnbias.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_fp16_masktype_2_no_attnbias.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_fp16_masktype_2_with_attnbias.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_bp16_masktype_0_no_attnbias.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_bp16_masktype_0_with_attnbias.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_bp16_masktype_1_no_attnbias.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_bp16_masktype_1_with_attnbias.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_bp16_masktype_2_no_attnbias.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_bp16_masktype_2_with_attnbias.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_fp16_masktype_0_no_attnbias.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_fp16_masktype_0_with_attnbias.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_fp16_masktype_1_no_attnbias.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_fp16_masktype_1_with_attnbias.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_fp16_masktype_2_no_attnbias.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_fp16_masktype_2_with_attnbias.cpp

diff --git a/setup.py b/setup.py
index 21a99a287..c9bfb35f3 100644
--- a/setup.py
+++ b/setup.py
@@ -208,8 +208,27 @@ def get_extensions():
     source_cuda += glob.glob(os.path.join(extensions_dir, "attention", "cuda", "**", "*.cu"), recursive=True)
     source_cuda += glob.glob(os.path.join(extensions_dir, "indexing", "**", "*.cu"), recursive=True)
     source_cuda += glob.glob(os.path.join(extensions_dir, "swiglu", "**", "*.cu"), recursive=True)
-    source_hip = glob.glob(os.path.join(extensions_dir, "attention", "hip_fmha", "**", "*.cpp"), recursive=True) 
 
+    source_hip = glob.glob(os.path.join(extensions_dir, "attention", "hip_fmha", "ck_fmha_test.cpp"), recursive=False)
+    
+    if os.getenv("FORCE_CK_TILED_KERNEL", "0") == "1":
+        source_hip += glob.glob(os.path.join(extensions_dir, "attention", "hip_fmha", "attention_forward_generic_ck_tiled.cpp"), recursive=False)
+        source_hip += glob.glob(os.path.join(extensions_dir, "attention", "hip_fmha", "ck_tiled_fmha_batched_infer_*.cpp"), recursive=False)
+        source_hip += glob.glob(os.path.join(extensions_dir, "attention", "hip_fmha", "ck_tiled_fmha_grouped_infer_*.cpp"), recursive=False)
+        source_hip += glob.glob(os.path.join(extensions_dir, "attention", "hip_fmha", "instances_tiled", "ck_tiled_fmha_*.cpp"), recursive=False)
+    else:
+        source_hip += glob.glob(os.path.join(extensions_dir, "attention", "hip_fmha", "attention_forward_decoder.cpp"), recursive=False)
+        source_hip += glob.glob(os.path.join(extensions_dir, "attention", "hip_fmha", "attention_forward_generic.cpp"), recursive=False)
+        source_hip += glob.glob(os.path.join(extensions_dir, "attention", "hip_fmha", "attention_backward_generic.cpp"), recursive=False)
+        source_hip += glob.glob(os.path.join(extensions_dir, "attention", "hip_fmha", "attention_ck_rand_uniform.cpp"), recursive=False)
+        source_hip += glob.glob(os.path.join(extensions_dir, "attention", "hip_fmha", "ck_fmha_batched_infer_*.cpp"), recursive=False)
+        source_hip += glob.glob(os.path.join(extensions_dir, "attention", "hip_fmha", "ck_fmha_grouped_infer_*.cpp"), recursive=False)
+        source_hip += glob.glob(os.path.join(extensions_dir, "attention", "hip_fmha", "ck_fmha_batched_forward_*.cpp"), recursive=False)
+        source_hip += glob.glob(os.path.join(extensions_dir, "attention", "hip_fmha", "ck_fmha_grouped_forward_*.cpp"), recursive=False)
+        source_hip += glob.glob(os.path.join(extensions_dir, "attention", "hip_fmha", "ck_fmha_batched_backward_*.cpp"), recursive=False)
+        source_hip += glob.glob(os.path.join(extensions_dir, "attention", "hip_fmha", "ck_fmha_grouped_backward_*.cpp"), recursive=False)
+        source_hip += glob.glob(os.path.join(extensions_dir, "attention", "hip_fmha", "instances", "ck_fmha_*.cpp"), recursive=False)
+        
     sputnik_dir = os.path.join(this_dir, "third_party", "sputnik")
     cutlass_dir = os.path.join(this_dir, "third_party", "cutlass", "include")
     cutlass_examples_dir = os.path.join(this_dir, "third_party", "cutlass", "examples")
@@ -293,16 +312,21 @@ def get_extensions():
         ]
     elif torch.cuda.is_available() and torch.version.hip: 
        rename_cpp_cu(source_hip)
-       source_hip_cu = glob.glob(os.path.join(extensions_dir, "attention", "hip_fmha", "**", "*.cu"), recursive=True) 
+       source_hip_cu = []
+       for ff in source_hip:
+           source_hip_cu += [ff.replace(".cpp", ".cu")]
+
        extension = CUDAExtension
        sources += source_hip_cu
-       include_dirs += [ Path(this_dir) / 'xformers' / 'csrc' / 'attention' / 'hip_fmha', 
-                         Path(this_dir) / 'third_party' / 'composable_kernel' / 'include',
-                         Path(this_dir) / 'third_party' / 'composable_kernel' / 'include' / 'ck' ,
-                         Path(this_dir) / 'third_party' / 'composable_kernel' / 'include' / 'ck' / 'tensor_operation' / 'gpu' / 'device',
-                         Path(this_dir) / 'third_party' / 'composable_kernel' / 'include' / 'ck' / 'tensor_operation' / 'gpu' / 'device' / 'impl',
-                         Path(this_dir) / 'third_party' / 'composable_kernel' / 'include' / 'ck' / 'tensor_operation' / 'gpu' / 'element',
-                       ] 
+       include_dirs += [ Path(this_dir) / 'xformers' / 'csrc' / 'attention' / 'hip_fmha' ]
+
+       if os.getenv("FORCE_CK_TILED_KERNEL", "0") == "1":
+           include_dirs += [ Path(this_dir) / 'third_party' / 'composable_kernel_tiled' / 'include',
+                             Path(this_dir) / 'third_party' / 'composable_kernel' / 'include' / 'ck']
+       else:
+           include_dirs += [ Path(this_dir) / 'third_party' / 'composable_kernel' / 'include', 
+                             Path(this_dir) / 'third_party' / 'composable_kernel' / 'include' / 'ck']
+           
        generator_flag = []
        cc_flag = ["-DBUILD_PYTHON_PACKAGE"]
        extra_compile_args={
diff --git a/xformers/csrc/attention/hip_fmha/attention_forward_generic_ck_tiled.cpp b/xformers/csrc/attention/hip_fmha/attention_forward_generic_ck_tiled.cpp
new file mode 100644
index 000000000..8cd17ad84
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/attention_forward_generic_ck_tiled.cpp
@@ -0,0 +1,439 @@
+#include <cmath>
+#include <mutex>
+
+#include <ATen/Context.h>
+#include <ATen/ScalarOps.h>
+#include <ATen/Tensor.h>
+#include <ATen/core/Generator.h>
+#include <ATen/cuda/CUDAContext.h>
+#include <ATen/cuda/CUDAGeneratorImpl.h>
+#include <c10/cuda/CUDAGuard.h>
+#include <c10/util/Optional.h>
+#include <torch/library.h>
+#include <ATen/cuda/CUDAGraphsUtils.cuh>
+
+#include "ck_fmha_params.h"
+#include "ck_fmha_util.h"
+
+/*
+extern void batched_forward_fp16(
+    BatchedForwardParams& param,
+    hipStream_t stream);
+extern void batched_forward_bp16(
+    BatchedForwardParams& param,
+    hipStream_t stream);
+extern void grouped_forward_fp16(
+    GroupedForwardParams& param,
+    hipStream_t stream);
+extern void grouped_forward_bp16(
+    GroupedForwardParams& param,
+    hipStream_t stream);
+*/
+
+extern void batched_infer_fp16(BatchedForwardParams& param, hipStream_t stream);
+extern void batched_infer_bp16(BatchedForwardParams& param, hipStream_t stream);
+extern void grouped_infer_fp16(GroupedForwardParams& param, hipStream_t stream);
+extern void grouped_infer_bp16(GroupedForwardParams& param, hipStream_t stream);
+
+namespace {
+
+/*
+  There are 2 modes for using this function.
+  (Mode BMHK) With all the heads having the same seqlen
+  (Mode 1MHK) `batch=1` with all tokens across batches concatenated
+*/
+std::tuple<at::Tensor, at::Tensor, int64_t, int64_t>
+efficient_attention_forward_ck(
+    const at::Tensor& query, // [b, seqlen, num_heads_q, K]
+    const at::Tensor& key, // [b, seqlen, num_heads_kv, K]
+    const at::Tensor& value, // [b, seqlen, num_heads_kv, Kv]
+    const c10::optional<at::Tensor>& bias, // [b, num_heads_q, seqlen, seqlen]
+    // (Mode 1MHK only) [b+1]: cu_seqlens_q[b] contains the
+    // position of the first query token for batch $b
+    const c10::optional<at::Tensor>& seqstart_q,
+    // (Mode 1MHK only) [b+1]: cu_seqlen_k[b] contains the
+    // position of the first key token for batch $b
+    const c10::optional<at::Tensor>& seqstart_k,
+    // (Mode 1MHK only) Maximum sequence length across batches
+    const c10::optional<int64_t> max_seqlen_q_,
+    double dropout_p, // attention matrix dropout probability
+    bool compute_logsumexp,
+    int64_t custom_mask_type,
+    c10::optional<double> scale,
+    const c10::optional<at::Tensor>& seqlen_k) {
+  TORCH_CHECK(query.dim() == 4);
+  TORCH_CHECK(key.dim() == 4);
+  TORCH_CHECK(value.dim() == 4);
+
+  // Batch sizes
+  TORCH_CHECK(query.size(0) == key.size(0));
+  TORCH_CHECK(query.size(0) == value.size(0));
+
+  // Sequence length
+  TORCH_CHECK(key.size(1) == value.size(1));
+
+  // Num heads
+  TORCH_CHECK(query.size(2) % key.size(2) == 0);
+  TORCH_CHECK(key.size(2) == value.size(2));
+
+  // Embedding per head
+  TORCH_CHECK(query.size(3) == key.size(3));
+
+  TORCH_CHECK(query.scalar_type() == key.scalar_type());
+  TORCH_CHECK(query.scalar_type() == value.scalar_type());
+
+  TORCH_CHECK(seqstart_q.has_value() == seqstart_k.has_value());
+  if (seqstart_q.has_value()) {
+    TORCH_CHECK(seqstart_q->scalar_type() == at::ScalarType::Int);
+    TORCH_CHECK(seqstart_k->scalar_type() == at::ScalarType::Int);
+    TORCH_CHECK(seqstart_q->dim() == 1 && seqstart_k->dim() == 1);
+    CHECK_NOSPARSE_CONTIGUOUS_CPU((*seqstart_q));
+    CHECK_NOSPARSE_CONTIGUOUS_CPU((*seqstart_k));
+    TORCH_CHECK(seqstart_q->size(0) == seqstart_k->size(0));
+    TORCH_CHECK(query.size(0) == 1, "cu_seqlen only supports batch_size=1");
+    TORCH_CHECK(max_seqlen_q_.has_value());
+  };
+
+  // last dim is contiguous, device is kCUDA
+  CHECK_NOSPARSE_LASTCONTIGUOUS_CUDA(query);
+  CHECK_NOSPARSE_LASTCONTIGUOUS_CUDA(key);
+  CHECK_NOSPARSE_LASTCONTIGUOUS_CUDA(value);
+
+  // at::cuda::CUDAGuard device_guard(query.device());
+  hipStream_t stream = at::cuda::getCurrentHIPStream().stream();
+
+  int64_t B = query.size(0);
+  int64_t M = query.size(1);
+  int64_t N = key.size(1);
+  int64_t Hq = query.size(-2);
+  int64_t Hkv = key.size(-2);
+  int64_t K = query.size(-1);
+  int64_t Kv = value.size(-1);
+
+  auto opts = query.options();
+
+  at::Tensor logsumexp;
+
+  at::Tensor out = at::empty({B, M, Hq, Kv}, opts);
+
+  const bool use_dropout = std::fpclassify(dropout_p) != FP_ZERO;
+  int64_t philox_seed;
+  int64_t philox_offset;
+
+  if (use_dropout) {
+    /*
+    at::PhiloxCudaState rng_engine_inputs;
+    at::CUDAGeneratorImpl* gen =
+        at::get_generator_or_default<at::CUDAGeneratorImpl>(
+            c10::nullopt, at::cuda::detail::getDefaultCUDAGenerator());
+
+    std::lock_guard<std::mutex> lock(gen->mutex_);
+    // if using dropout, we produce 1 random number for each element of the
+    // attention tensor
+    rng_engine_inputs = gen->philox_cuda_state(B * Hq * M * N);
+
+    const auto seeds = at::cuda::philox::unpack(rng_engine_inputs);
+
+    philox_seed = std::get<0>(seeds);
+    philox_offset = std::get<1>(seeds);
+    */
+    throw std::runtime_error(
+        "drop-out is currently not implemented by ck-tiled!");
+  }
+
+  auto set_batched_forward_params = [&](BatchedForwardParams& p) {
+    p.B = B;
+    p.M = M;
+    p.N = N;
+    p.Hq = Hq;
+    p.Hkv = Hkv;
+    p.K = K;
+    p.Kv = Kv;
+
+    if (scale.has_value()) {
+      p.scale = float(*scale);
+    } else {
+      p.scale = float(1.0 / std::sqrt(float(K)));
+    }
+
+    p.q_ptr = query.data_ptr();
+    p.k_ptr = key.data_ptr();
+    p.v_ptr = value.data_ptr();
+    p.out_ptr = out.data_ptr();
+
+    p.q_strides = {
+        static_cast<int>(query.stride(0)),
+        static_cast<int>(query.stride(1)),
+        static_cast<int>(query.stride(2)),
+        static_cast<int>(query.stride(3))};
+    p.k_strides = {
+        static_cast<int>(key.stride(0)),
+        static_cast<int>(key.stride(1)),
+        static_cast<int>(key.stride(2)),
+        static_cast<int>(key.stride(3))};
+    p.v_strides = {
+        static_cast<int>(value.stride(0)),
+        static_cast<int>(value.stride(1)),
+        static_cast<int>(value.stride(2)),
+        static_cast<int>(value.stride(3))};
+    p.out_strides = {
+        static_cast<int>(out.stride(0)),
+        static_cast<int>(out.stride(1)),
+        static_cast<int>(out.stride(2)),
+        static_cast<int>(out.stride(3))};
+
+    if (bias.has_value()) {
+      CHECK_NOSPARSE_LASTCONTIGUOUS_CUDA((*bias));
+      TORCH_CHECK(bias->scalar_type() == query.scalar_type());
+
+      p.has_attn_bias = true;
+      p.attn_bias_ptr = bias->data_ptr();
+
+      const at::Tensor bias_4d_view = get_bias_4d_view(*bias, B, Hq, M, N);
+      p.attn_bias_strides = {
+          static_cast<int>(bias_4d_view.stride(0)),
+          static_cast<int>(bias_4d_view.stride(1)),
+          static_cast<int>(bias_4d_view.stride(2)),
+          static_cast<int>(bias_4d_view.stride(3))};
+    } else
+      p.has_attn_bias = false;
+
+    p.custom_mask_type = custom_mask_type;
+
+    p.use_dropout = use_dropout;
+    p.philox_seed = philox_seed;
+    p.philox_offset = philox_offset;
+    p.compute_logsumexp = compute_logsumexp;
+
+    // the following parameters are only used by training forward
+    if (p.use_dropout) {
+      // p.dropout_prob = static_cast<float>(dropout_p);
+      throw std::runtime_error(
+          "drop-out is currently not implemented by ck-tiled!");
+    } else
+      p.dropout_prob = 0.0f;
+
+    if (p.compute_logsumexp) {
+      /*
+      logsumexp = at::empty({B, Hq, M}, opts.dtype(at::kFloat));
+      p.logsumexp_ptr = logsumexp.data_ptr();
+      */
+      throw std::runtime_error(
+          "compute logsumexp is currently not implemented by ck-tiled!");
+    } else
+      p.logsumexp_ptr = nullptr;
+  };
+
+  auto set_grouped_forward_params = [&](GroupedForwardParams& p) {
+    p.num_batches = seqstart_q->size(0) - 1;
+    p.M = M;
+    p.N = N;
+    p.Hq = Hq;
+    p.Hkv = Hkv;
+    p.K = K;
+    p.Kv = Kv;
+
+    if (scale.has_value()) {
+      p.scale = float(*scale);
+    } else {
+      p.scale = float(1.0 / std::sqrt(float(K)));
+    }
+
+    p.q_strides = {
+        static_cast<int>(query.stride(1)),
+        static_cast<int>(query.stride(2)),
+        static_cast<int>(query.stride(3))};
+    p.k_strides = {
+        static_cast<int>(key.stride(1)),
+        static_cast<int>(key.stride(2)),
+        static_cast<int>(key.stride(3))};
+    p.v_strides = {
+        static_cast<int>(value.stride(1)),
+        static_cast<int>(value.stride(2)),
+        static_cast<int>(value.stride(3))};
+    p.out_strides = {
+        static_cast<int>(out.stride(1)),
+        static_cast<int>(out.stride(2)),
+        static_cast<int>(out.stride(3))};
+
+    if (bias.has_value()) {
+      CHECK_NOSPARSE_LASTCONTIGUOUS_CUDA((*bias));
+      TORCH_CHECK(bias->scalar_type() == query.scalar_type());
+
+      p.has_attn_bias = true;
+      const at::Tensor bias_4d_view = get_bias_4d_view(*bias, B, Hq, M, N);
+      p.attn_bias_strides = {
+          static_cast<int>(bias_4d_view.stride(0)),
+          static_cast<int>(bias_4d_view.stride(1)),
+          static_cast<int>(bias_4d_view.stride(2)),
+          static_cast<int>(bias_4d_view.stride(3))};
+    } else
+      p.has_attn_bias = false;
+
+    p.custom_mask_type = custom_mask_type;
+
+    // max_seqlen_q is used to create logsumexp tensor
+    p.max_seqlen_q = *max_seqlen_q_;
+
+    p.host_seqstart_q.resize(p.num_batches + 1);
+    p.host_seqstart_k.resize(p.num_batches + 1);
+
+    for (int i = 0; i < p.host_seqstart_q.size(); i++)
+      p.host_seqstart_q[i] =
+          *(reinterpret_cast<int*>(seqstart_q->data_ptr()) + i);
+
+    for (int i = 0; i < p.host_seqstart_k.size(); i++)
+      p.host_seqstart_k[i] =
+          *(reinterpret_cast<int*>(seqstart_k->data_ptr()) + i);
+
+    if (seqlen_k.has_value()) {
+      TORCH_CHECK(seqlen_k->scalar_type() == at::ScalarType::Int);
+      TORCH_CHECK(seqlen_k->dim() == 1);
+      TORCH_CHECK(seqlen_k->size(0) == p.num_batches)
+      CHECK_NOSPARSE_CONTIGUOUS_CPU((*seqlen_k));
+
+      p.host_seqlen_k.resize(p.num_batches);
+
+      for (int i = 0; i < p.host_seqlen_k.size(); i++)
+        p.host_seqlen_k[i] =
+            *(reinterpret_cast<int*>(seqlen_k->data_ptr()) + i);
+    }
+
+    char* q_ptr = reinterpret_cast<char*>(query.data_ptr());
+    char* k_ptr = reinterpret_cast<char*>(key.data_ptr());
+    char* v_ptr = reinterpret_cast<char*>(value.data_ptr());
+
+    char* out_ptr = reinterpret_cast<char*>(out.data_ptr());
+    char* attn_bias_ptr =
+        bias.has_value() ? reinterpret_cast<char*>(bias->data_ptr()) : nullptr;
+
+    for (int i = 0; i < p.num_batches; i++) {
+      size_t tmp_q_offset = get_size_in_bytes(
+          static_cast<size_t>(p.host_seqstart_q[i]) * p.q_strides[0],
+          query.scalar_type());
+      size_t tmp_k_offset = get_size_in_bytes(
+          static_cast<size_t>(p.host_seqstart_k[i]) * p.k_strides[0],
+          key.scalar_type());
+      size_t tmp_v_offset = get_size_in_bytes(
+          static_cast<size_t>(p.host_seqstart_k[i]) * p.v_strides[0],
+          value.scalar_type());
+      size_t tmp_o_offset = get_size_in_bytes(
+          static_cast<size_t>(p.host_seqstart_q[i]) * p.out_strides[0],
+          out.scalar_type());
+
+      p.q_ptrs.push_back(reinterpret_cast<void*>(&q_ptr[tmp_q_offset]));
+      p.k_ptrs.push_back(reinterpret_cast<void*>(&k_ptr[tmp_k_offset]));
+      p.v_ptrs.push_back(reinterpret_cast<void*>(&v_ptr[tmp_v_offset]));
+      p.out_ptrs.push_back(reinterpret_cast<void*>(&out_ptr[tmp_o_offset]));
+
+      if (bias.has_value()) {
+        size_t tmp_bias_offset = get_size_in_bytes(
+            static_cast<size_t>(p.host_seqstart_q[i]) * p.attn_bias_strides[2] +
+                static_cast<size_t>(p.host_seqstart_k[i]) *
+                    p.attn_bias_strides[3],
+            bias->scalar_type());
+
+        p.attn_bias_ptrs.push_back(
+            reinterpret_cast<void*>(&attn_bias_ptr[tmp_bias_offset]));
+      };
+
+      // ToDO: remove this after dev-op fix
+      p.randvals_ptrs.push_back(nullptr);
+    }
+
+    p.use_dropout = use_dropout;
+    p.philox_seed = philox_seed;
+    p.philox_offset = philox_offset;
+    p.compute_logsumexp = compute_logsumexp;
+
+    // the following parameters are only used by training forward
+    if (p.use_dropout) {
+      // p.dropout_prob = static_cast<float>(dropout_p);
+      throw std::runtime_error(
+          "drop-out is currently not implemented by ck-tiled!");
+    } else
+      p.dropout_prob = 0.0f;
+
+    if (p.compute_logsumexp) {
+      /*
+      logsumexp = at::empty(
+          {p.num_batches, Hq, p.max_seqlen_q}, opts.dtype(at::kFloat));
+      char* logsumexp_ptr = reinterpret_cast<char*>(logsumexp.data_ptr());
+
+      for (int i = 0; i < p.num_batches; i++) {
+        size_t tmp_logsumexp_offset = get_size_in_bytes(
+            static_cast<size_t>(i) * Hq * p.max_seqlen_q,
+            logsumexp.scalar_type());
+        p.logsumexp_ptrs.push_back(
+            reinterpret_cast<void*>(&logsumexp_ptr[tmp_logsumexp_offset]));
+      };
+      */
+      throw std::runtime_error(
+          "compute logsumexp is currently not implemented by ck-tiled!");
+    };
+  };
+
+  auto inDataType = query.scalar_type();
+
+  if (!seqstart_q.has_value()) { // input is batched
+    BatchedForwardParams batched_forward_params;
+
+    set_batched_forward_params(batched_forward_params);
+
+    if (!batched_forward_params.use_dropout &&
+        !batched_forward_params.compute_logsumexp) {
+      if (inDataType == at::ScalarType::Half) {
+        batched_infer_fp16(batched_forward_params, stream);
+      } else if (inDataType == at::ScalarType::BFloat16) {
+        batched_infer_bp16(batched_forward_params, stream);
+      } else
+        throw std::runtime_error("input data-type is not supported!");
+    } else {
+      /*
+      if (inDataType == at::ScalarType::Half) {
+        batched_forward_fp16(batched_forward_params, stream);
+      } else if (inDataType == at::ScalarType::BFloat16) {
+        batched_forward_bp16(batched_forward_params, stream);
+      } else
+        throw std::runtime_error("input data-type is not supported!");
+      */
+      throw std::runtime_error(
+          "drop-out and compuate logsumexp currently not implemented by ck-tiled!");
+    };
+  } else { // input is grouped
+    GroupedForwardParams grouped_forward_params;
+
+    set_grouped_forward_params(grouped_forward_params);
+
+    if (!grouped_forward_params.use_dropout &&
+        !grouped_forward_params.compute_logsumexp) {
+      if (inDataType == at::ScalarType::Half) {
+        grouped_infer_fp16(grouped_forward_params, stream);
+      } else if (inDataType == at::ScalarType::BFloat16) {
+        grouped_infer_bp16(grouped_forward_params, stream);
+      } else
+        throw std::runtime_error("input data-type is not supported!");
+    } else {
+      /*
+      if (inDataType == at::ScalarType::Half) {
+        grouped_forward_fp16(grouped_forward_params, stream);
+      } else if (inDataType == at::ScalarType::BFloat16) {
+        grouped_forward_bp16(grouped_forward_params, stream);
+      } else
+        throw std::runtime_error("input data-type is not supported!");
+      */
+      throw std::runtime_error(
+          "drop-out and compuate logsumexp currently not implemented by ck-tiled!");
+    };
+  };
+
+  return std::make_tuple(out, logsumexp, philox_seed, philox_offset);
+}
+
+} // namespace
+
+TORCH_LIBRARY_IMPL(xformers, CUDA, m) {
+  m.impl(
+      TORCH_SELECTIVE_NAME("xformers::efficient_attention_forward_ck"),
+      TORCH_FN(efficient_attention_forward_ck));
+}
diff --git a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_infer.h b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_infer.h
new file mode 100644
index 000000000..9aa37d9b8
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_infer.h
@@ -0,0 +1,28 @@
+#pragma once
+
+#include <sstream>
+#include <stdexcept>
+
+#include <ck/ck.hpp>
+
+#include "ck_fmha_params.h"
+
+template <typename scalar_t, int32_t custom_mask_type, bool has_attn_bias>
+struct batched_infer_masktype_attnbias_dispatched {
+  static void Run(BatchedForwardParams& param, hipStream_t stream){};
+
+  template <typename DeviceOpInstance>
+  static void RunWithDeviceOp(
+      BatchedForwardParams& param,
+      hipStream_t stream){};
+};
+
+template <typename scalar_t, int32_t custom_mask_type, bool has_attn_bias>
+void run_batched_infer_masktype_attnbias_dispatched(
+    BatchedForwardParams& param,
+    hipStream_t stream) {
+  batched_infer_masktype_attnbias_dispatched<
+      scalar_t,
+      custom_mask_type,
+      has_attn_bias>::Run(param, stream);
+};
diff --git a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_infer_bp16.cpp b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_infer_bp16.cpp
new file mode 100644
index 000000000..81ff5b915
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_infer_bp16.cpp
@@ -0,0 +1,58 @@
+#include <ck/ck.hpp>
+#include <ck/utility/data_type.hpp>
+#include <stdexcept>
+
+#include "ck_bool_switch.h"
+#include "ck_tiled_fmha_batched_infer.h"
+
+extern template void run_batched_infer_masktype_attnbias_dispatched<
+    ck::bhalf_t,
+    0,
+    true>(BatchedForwardParams& param, hipStream_t stream);
+
+extern template void run_batched_infer_masktype_attnbias_dispatched<
+    ck::bhalf_t,
+    0,
+    false>(BatchedForwardParams& param, hipStream_t stream);
+
+extern template void run_batched_infer_masktype_attnbias_dispatched<
+    ck::bhalf_t,
+    1,
+    true>(BatchedForwardParams& param, hipStream_t stream);
+
+extern template void run_batched_infer_masktype_attnbias_dispatched<
+    ck::bhalf_t,
+    1,
+    false>(BatchedForwardParams& param, hipStream_t stream);
+
+extern template void run_batched_infer_masktype_attnbias_dispatched<
+    ck::bhalf_t,
+    2,
+    true>(BatchedForwardParams& param, hipStream_t stream);
+
+extern template void run_batched_infer_masktype_attnbias_dispatched<
+    ck::bhalf_t,
+    2,
+    false>(BatchedForwardParams& param, hipStream_t stream);
+
+void batched_infer_bp16(BatchedForwardParams& param, hipStream_t stream) {
+  BOOL_SWITCH_1(param.has_attn_bias, HAS_ATTN_BIAS, [&] {
+    if (param.custom_mask_type == 0)
+      run_batched_infer_masktype_attnbias_dispatched<
+          ck::bhalf_t,
+          0,
+          HAS_ATTN_BIAS>(param, stream);
+    else if (param.custom_mask_type == 1)
+      run_batched_infer_masktype_attnbias_dispatched<
+          ck::bhalf_t,
+          1,
+          HAS_ATTN_BIAS>(param, stream);
+    else if (param.custom_mask_type == 2)
+      run_batched_infer_masktype_attnbias_dispatched<
+          ck::bhalf_t,
+          2,
+          HAS_ATTN_BIAS>(param, stream);
+    else
+      throw std::runtime_error("Invalid custom_mask_type value");
+  });
+};
diff --git a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_infer_fp16.cpp b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_infer_fp16.cpp
new file mode 100644
index 000000000..5814b7391
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_infer_fp16.cpp
@@ -0,0 +1,58 @@
+#include <ck/ck.hpp>
+#include <ck/utility/data_type.hpp>
+#include <stdexcept>
+
+#include "ck_bool_switch.h"
+#include "ck_tiled_fmha_batched_infer.h"
+
+extern template void run_batched_infer_masktype_attnbias_dispatched<
+    ck::half_t,
+    0,
+    true>(BatchedForwardParams& param, hipStream_t stream);
+
+extern template void run_batched_infer_masktype_attnbias_dispatched<
+    ck::half_t,
+    0,
+    false>(BatchedForwardParams& param, hipStream_t stream);
+
+extern template void run_batched_infer_masktype_attnbias_dispatched<
+    ck::half_t,
+    1,
+    true>(BatchedForwardParams& param, hipStream_t stream);
+
+extern template void run_batched_infer_masktype_attnbias_dispatched<
+    ck::half_t,
+    1,
+    false>(BatchedForwardParams& param, hipStream_t stream);
+
+extern template void run_batched_infer_masktype_attnbias_dispatched<
+    ck::half_t,
+    2,
+    true>(BatchedForwardParams& param, hipStream_t stream);
+
+extern template void run_batched_infer_masktype_attnbias_dispatched<
+    ck::half_t,
+    2,
+    false>(BatchedForwardParams& param, hipStream_t stream);
+
+void batched_infer_fp16(BatchedForwardParams& param, hipStream_t stream) {
+  BOOL_SWITCH_1(param.has_attn_bias, HAS_ATTN_BIAS, [&] {
+    if (param.custom_mask_type == 0)
+      run_batched_infer_masktype_attnbias_dispatched<
+          ck::half_t,
+          0,
+          HAS_ATTN_BIAS>(param, stream);
+    else if (param.custom_mask_type == 1)
+      run_batched_infer_masktype_attnbias_dispatched<
+          ck::half_t,
+          1,
+          HAS_ATTN_BIAS>(param, stream);
+    else if (param.custom_mask_type == 2)
+      run_batched_infer_masktype_attnbias_dispatched<
+          ck::half_t,
+          2,
+          HAS_ATTN_BIAS>(param, stream);
+    else
+      throw std::runtime_error("Invalid custom_mask_type value");
+  });
+};
diff --git a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_infer.h b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_infer.h
new file mode 100644
index 000000000..b3d3b159b
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_infer.h
@@ -0,0 +1,28 @@
+#pragma once
+
+#include <sstream>
+#include <stdexcept>
+
+#include <ck/ck.hpp>
+
+#include "ck_fmha_params.h"
+
+template <typename scalar_t, int32_t custom_mask_type, bool has_attn_bias>
+struct grouped_infer_masktype_attnbias_dispatched {
+  static void Run(GroupedForwardParams& param, hipStream_t stream){};
+
+  template <typename DeviceOpInstance>
+  static void RunWithDeviceOp(
+      GroupedForwardParams& param,
+      hipStream_t stream){};
+};
+
+template <typename scalar_t, int32_t custom_mask_type, bool has_attn_bias>
+void run_grouped_infer_masktype_attnbias_dispatched(
+    GroupedForwardParams& param,
+    hipStream_t stream) {
+  grouped_infer_masktype_attnbias_dispatched<
+      scalar_t,
+      custom_mask_type,
+      has_attn_bias>::Run(param, stream);
+};
diff --git a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_infer_bp16.cpp b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_infer_bp16.cpp
new file mode 100644
index 000000000..bdfce5854
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_infer_bp16.cpp
@@ -0,0 +1,58 @@
+#include <ck/ck.hpp>
+#include <ck/utility/data_type.hpp>
+#include <stdexcept>
+
+#include "ck_bool_switch.h"
+#include "ck_tiled_fmha_grouped_infer.h"
+
+extern template void run_grouped_infer_masktype_attnbias_dispatched<
+    ck::bhalf_t,
+    0,
+    true>(GroupedForwardParams& param, hipStream_t stream);
+
+extern template void run_grouped_infer_masktype_attnbias_dispatched<
+    ck::bhalf_t,
+    0,
+    false>(GroupedForwardParams& param, hipStream_t stream);
+
+extern template void run_grouped_infer_masktype_attnbias_dispatched<
+    ck::bhalf_t,
+    1,
+    true>(GroupedForwardParams& param, hipStream_t stream);
+
+extern template void run_grouped_infer_masktype_attnbias_dispatched<
+    ck::bhalf_t,
+    1,
+    false>(GroupedForwardParams& param, hipStream_t stream);
+
+extern template void run_grouped_infer_masktype_attnbias_dispatched<
+    ck::bhalf_t,
+    2,
+    true>(GroupedForwardParams& param, hipStream_t stream);
+
+extern template void run_grouped_infer_masktype_attnbias_dispatched<
+    ck::bhalf_t,
+    2,
+    false>(GroupedForwardParams& param, hipStream_t stream);
+
+void grouped_infer_bp16(GroupedForwardParams& param, hipStream_t stream) {
+  BOOL_SWITCH_1(param.has_attn_bias, HAS_ATTN_BIAS, [&] {
+    if (param.custom_mask_type == 0)
+      run_grouped_infer_masktype_attnbias_dispatched<
+          ck::bhalf_t,
+          0,
+          HAS_ATTN_BIAS>(param, stream);
+    else if (param.custom_mask_type == 1)
+      run_grouped_infer_masktype_attnbias_dispatched<
+          ck::bhalf_t,
+          1,
+          HAS_ATTN_BIAS>(param, stream);
+    else if (param.custom_mask_type == 2)
+      run_grouped_infer_masktype_attnbias_dispatched<
+          ck::bhalf_t,
+          2,
+          HAS_ATTN_BIAS>(param, stream);
+    else
+      throw std::runtime_error("Invalid custom_mask_type value");
+  });
+};
diff --git a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_infer_fp16.cpp b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_infer_fp16.cpp
new file mode 100644
index 000000000..009571c97
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_infer_fp16.cpp
@@ -0,0 +1,58 @@
+#include <ck/ck.hpp>
+#include <ck/utility/data_type.hpp>
+#include <stdexcept>
+
+#include "ck_bool_switch.h"
+#include "ck_tiled_fmha_grouped_infer.h"
+
+extern template void run_grouped_infer_masktype_attnbias_dispatched<
+    ck::half_t,
+    0,
+    true>(GroupedForwardParams& param, hipStream_t stream);
+
+extern template void run_grouped_infer_masktype_attnbias_dispatched<
+    ck::half_t,
+    0,
+    false>(GroupedForwardParams& param, hipStream_t stream);
+
+extern template void run_grouped_infer_masktype_attnbias_dispatched<
+    ck::half_t,
+    1,
+    true>(GroupedForwardParams& param, hipStream_t stream);
+
+extern template void run_grouped_infer_masktype_attnbias_dispatched<
+    ck::half_t,
+    1,
+    false>(GroupedForwardParams& param, hipStream_t stream);
+
+extern template void run_grouped_infer_masktype_attnbias_dispatched<
+    ck::half_t,
+    2,
+    true>(GroupedForwardParams& param, hipStream_t stream);
+
+extern template void run_grouped_infer_masktype_attnbias_dispatched<
+    ck::half_t,
+    2,
+    false>(GroupedForwardParams& param, hipStream_t stream);
+
+void grouped_infer_fp16(GroupedForwardParams& param, hipStream_t stream) {
+  BOOL_SWITCH_1(param.has_attn_bias, HAS_ATTN_BIAS, [&] {
+    if (param.custom_mask_type == 0)
+      run_grouped_infer_masktype_attnbias_dispatched<
+          ck::half_t,
+          0,
+          HAS_ATTN_BIAS>(param, stream);
+    else if (param.custom_mask_type == 1)
+      run_grouped_infer_masktype_attnbias_dispatched<
+          ck::half_t,
+          1,
+          HAS_ATTN_BIAS>(param, stream);
+    else if (param.custom_mask_type == 2)
+      run_grouped_infer_masktype_attnbias_dispatched<
+          ck::half_t,
+          2,
+          HAS_ATTN_BIAS>(param, stream);
+    else
+      throw std::runtime_error("Invalid custom_mask_type value");
+  });
+};
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_bp16_masktype_0_no_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_bp16_masktype_0_no_attnbias.cpp
new file mode 100644
index 000000000..9748955e1
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_bp16_masktype_0_no_attnbias.cpp
@@ -0,0 +1,8 @@
+#include <ck/ck.hpp>
+
+#include "ck_fmha_batched_infer.h"
+
+template void run_batched_infer_masktype_attnbias_dispatched<
+    ck::bhalf_t,
+    0,
+    false>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_bp16_masktype_0_with_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_bp16_masktype_0_with_attnbias.cpp
new file mode 100644
index 000000000..418f925c2
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_bp16_masktype_0_with_attnbias.cpp
@@ -0,0 +1,8 @@
+#include <ck/ck.hpp>
+
+#include "ck_fmha_batched_infer.h"
+
+template void run_batched_infer_masktype_attnbias_dispatched<
+    ck::bhalf_t,
+    0,
+    true>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_bp16_masktype_1_no_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_bp16_masktype_1_no_attnbias.cpp
new file mode 100644
index 000000000..a7cdb48b8
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_bp16_masktype_1_no_attnbias.cpp
@@ -0,0 +1,8 @@
+#include <ck/ck.hpp>
+
+#include "ck_fmha_batched_infer.h"
+
+template void run_batched_infer_masktype_attnbias_dispatched<
+    ck::bhalf_t,
+    1,
+    false>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_bp16_masktype_1_with_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_bp16_masktype_1_with_attnbias.cpp
new file mode 100644
index 000000000..578855b9b
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_bp16_masktype_1_with_attnbias.cpp
@@ -0,0 +1,8 @@
+#include <ck/ck.hpp>
+
+#include "ck_fmha_batched_infer.h"
+
+template void run_batched_infer_masktype_attnbias_dispatched<
+    ck::bhalf_t,
+    1,
+    true>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_bp16_masktype_2_no_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_bp16_masktype_2_no_attnbias.cpp
new file mode 100644
index 000000000..35e9bca9c
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_bp16_masktype_2_no_attnbias.cpp
@@ -0,0 +1,8 @@
+#include <ck/ck.hpp>
+
+#include "ck_fmha_batched_infer.h"
+
+template void run_batched_infer_masktype_attnbias_dispatched<
+    ck::bhalf_t,
+    2,
+    false>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_bp16_masktype_2_with_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_bp16_masktype_2_with_attnbias.cpp
new file mode 100644
index 000000000..e27e3b5ff
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_bp16_masktype_2_with_attnbias.cpp
@@ -0,0 +1,8 @@
+#include <ck/ck.hpp>
+
+#include "ck_fmha_batched_infer.h"
+
+template void run_batched_infer_masktype_attnbias_dispatched<
+    ck::bhalf_t,
+    2,
+    true>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_fp16_masktype_0_no_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_fp16_masktype_0_no_attnbias.cpp
new file mode 100644
index 000000000..5c83b0abd
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_fp16_masktype_0_no_attnbias.cpp
@@ -0,0 +1,8 @@
+#include <ck/ck.hpp>
+
+#include "ck_fmha_batched_infer.h"
+
+template void run_batched_infer_masktype_attnbias_dispatched<
+    ck::half_t,
+    0,
+    false>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_fp16_masktype_0_with_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_fp16_masktype_0_with_attnbias.cpp
new file mode 100644
index 000000000..11c76b35f
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_fp16_masktype_0_with_attnbias.cpp
@@ -0,0 +1,8 @@
+#include <ck/ck.hpp>
+
+#include "ck_fmha_batched_infer.h"
+
+template void run_batched_infer_masktype_attnbias_dispatched<
+    ck::half_t,
+    0,
+    true>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_fp16_masktype_1_no_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_fp16_masktype_1_no_attnbias.cpp
new file mode 100644
index 000000000..b13f5a4c9
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_fp16_masktype_1_no_attnbias.cpp
@@ -0,0 +1,8 @@
+#include <ck/ck.hpp>
+
+#include "ck_fmha_batched_infer.h"
+
+template void run_batched_infer_masktype_attnbias_dispatched<
+    ck::half_t,
+    1,
+    false>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_fp16_masktype_1_with_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_fp16_masktype_1_with_attnbias.cpp
new file mode 100644
index 000000000..12f5991c4
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_fp16_masktype_1_with_attnbias.cpp
@@ -0,0 +1,8 @@
+#include <ck/ck.hpp>
+
+#include "ck_fmha_batched_infer.h"
+
+template void run_batched_infer_masktype_attnbias_dispatched<
+    ck::half_t,
+    1,
+    true>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_fp16_masktype_2_no_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_fp16_masktype_2_no_attnbias.cpp
new file mode 100644
index 000000000..8d45859e5
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_fp16_masktype_2_no_attnbias.cpp
@@ -0,0 +1,8 @@
+#include <ck/ck.hpp>
+
+#include "ck_fmha_batched_infer.h"
+
+template void run_batched_infer_masktype_attnbias_dispatched<
+    ck::half_t,
+    2,
+    false>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_fp16_masktype_2_with_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_fp16_masktype_2_with_attnbias.cpp
new file mode 100644
index 000000000..9f03be2b5
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_fp16_masktype_2_with_attnbias.cpp
@@ -0,0 +1,8 @@
+#include <ck/ck.hpp>
+
+#include "ck_fmha_batched_infer.h"
+
+template void run_batched_infer_masktype_attnbias_dispatched<
+    ck::half_t,
+    2,
+    true>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_bp16_masktype_0_no_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_bp16_masktype_0_no_attnbias.cpp
new file mode 100644
index 000000000..973213413
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_bp16_masktype_0_no_attnbias.cpp
@@ -0,0 +1,8 @@
+#include <ck/ck.hpp>
+
+#include "ck_fmha_grouped_infer.h"
+
+template void run_grouped_infer_masktype_attnbias_dispatched<
+    ck::bhalf_t,
+    0,
+    false>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_bp16_masktype_0_with_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_bp16_masktype_0_with_attnbias.cpp
new file mode 100644
index 000000000..96e0ba425
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_bp16_masktype_0_with_attnbias.cpp
@@ -0,0 +1,8 @@
+#include <ck/ck.hpp>
+
+#include "ck_fmha_grouped_infer.h"
+
+template void run_grouped_infer_masktype_attnbias_dispatched<
+    ck::bhalf_t,
+    0,
+    true>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_bp16_masktype_1_no_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_bp16_masktype_1_no_attnbias.cpp
new file mode 100644
index 000000000..332724e73
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_bp16_masktype_1_no_attnbias.cpp
@@ -0,0 +1,8 @@
+#include <ck/ck.hpp>
+
+#include "ck_fmha_grouped_infer.h"
+
+template void run_grouped_infer_masktype_attnbias_dispatched<
+    ck::bhalf_t,
+    1,
+    false>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_bp16_masktype_1_with_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_bp16_masktype_1_with_attnbias.cpp
new file mode 100644
index 000000000..cb1120f5b
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_bp16_masktype_1_with_attnbias.cpp
@@ -0,0 +1,8 @@
+#include <ck/ck.hpp>
+
+#include "ck_fmha_grouped_infer.h"
+
+template void run_grouped_infer_masktype_attnbias_dispatched<
+    ck::bhalf_t,
+    1,
+    true>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_bp16_masktype_2_no_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_bp16_masktype_2_no_attnbias.cpp
new file mode 100644
index 000000000..51ed70cab
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_bp16_masktype_2_no_attnbias.cpp
@@ -0,0 +1,8 @@
+#include <ck/ck.hpp>
+
+#include "ck_fmha_grouped_infer.h"
+
+template void run_grouped_infer_masktype_attnbias_dispatched<
+    ck::bhalf_t,
+    2,
+    false>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_bp16_masktype_2_with_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_bp16_masktype_2_with_attnbias.cpp
new file mode 100644
index 000000000..c157e89c1
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_bp16_masktype_2_with_attnbias.cpp
@@ -0,0 +1,8 @@
+#include <ck/ck.hpp>
+
+#include "ck_fmha_grouped_infer.h"
+
+template void run_grouped_infer_masktype_attnbias_dispatched<
+    ck::bhalf_t,
+    2,
+    true>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_fp16_masktype_0_no_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_fp16_masktype_0_no_attnbias.cpp
new file mode 100644
index 000000000..bbcd3ab0e
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_fp16_masktype_0_no_attnbias.cpp
@@ -0,0 +1,8 @@
+#include <ck/ck.hpp>
+
+#include "ck_fmha_grouped_infer.h"
+
+template void run_grouped_infer_masktype_attnbias_dispatched<
+    ck::half_t,
+    0,
+    false>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_fp16_masktype_0_with_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_fp16_masktype_0_with_attnbias.cpp
new file mode 100644
index 000000000..e320f5de6
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_fp16_masktype_0_with_attnbias.cpp
@@ -0,0 +1,8 @@
+#include <ck/ck.hpp>
+
+#include "ck_fmha_grouped_infer.h"
+
+template void run_grouped_infer_masktype_attnbias_dispatched<
+    ck::half_t,
+    0,
+    true>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_fp16_masktype_1_no_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_fp16_masktype_1_no_attnbias.cpp
new file mode 100644
index 000000000..e763dde6a
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_fp16_masktype_1_no_attnbias.cpp
@@ -0,0 +1,8 @@
+#include <ck/ck.hpp>
+
+#include "ck_fmha_grouped_infer.h"
+
+template void run_grouped_infer_masktype_attnbias_dispatched<
+    ck::half_t,
+    1,
+    false>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_fp16_masktype_1_with_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_fp16_masktype_1_with_attnbias.cpp
new file mode 100644
index 000000000..3ec2d41da
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_fp16_masktype_1_with_attnbias.cpp
@@ -0,0 +1,8 @@
+#include <ck/ck.hpp>
+
+#include "ck_fmha_grouped_infer.h"
+
+template void run_grouped_infer_masktype_attnbias_dispatched<
+    ck::half_t,
+    1,
+    true>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_fp16_masktype_2_no_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_fp16_masktype_2_no_attnbias.cpp
new file mode 100644
index 000000000..dee7a0845
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_fp16_masktype_2_no_attnbias.cpp
@@ -0,0 +1,8 @@
+#include <ck/ck.hpp>
+
+#include "ck_fmha_grouped_infer.h"
+
+template void run_grouped_infer_masktype_attnbias_dispatched<
+    ck::half_t,
+    2,
+    false>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_fp16_masktype_2_with_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_fp16_masktype_2_with_attnbias.cpp
new file mode 100644
index 000000000..b5515e9a0
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_fp16_masktype_2_with_attnbias.cpp
@@ -0,0 +1,8 @@
+#include <ck/ck.hpp>
+
+#include "ck_fmha_grouped_infer.h"
+
+template void run_grouped_infer_masktype_attnbias_dispatched<
+    ck::half_t,
+    2,
+    true>(GroupedForwardParams& param, hipStream_t stream);

From 5b54bf9dfcb1d46299532e49519be3dd554227a8 Mon Sep 17 00:00:00 2001
From: Max Podkorytov <4273004+tenpercent@users.noreply.github.com>
Date: Mon, 20 Nov 2023 18:54:49 -0500
Subject: [PATCH 230/641] add benchmark_attn_decoding from upstream xformers;
 run ck fw op for decoding

---
 .../benchmarks/benchmark_attn_decoding.py     | 159 ++++++++++++++++++
 xformers/benchmarks/utils.py                  |  49 +++++-
 2 files changed, 207 insertions(+), 1 deletion(-)
 create mode 100644 xformers/benchmarks/benchmark_attn_decoding.py

diff --git a/xformers/benchmarks/benchmark_attn_decoding.py b/xformers/benchmarks/benchmark_attn_decoding.py
new file mode 100644
index 000000000..a22a4f645
--- /dev/null
+++ b/xformers/benchmarks/benchmark_attn_decoding.py
@@ -0,0 +1,159 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
+#
+# This source code is licensed under the BSD license found in the
+# LICENSE file in the root directory of this source tree.
+
+
+from typing import Any
+
+import torch
+from torch.utils import benchmark
+from utils import benchmark_main_helper2
+
+import xformers.ops as xops
+
+min_run_time = 0.5
+device = torch.device("cuda")
+
+
+CASES = [
+    dict(B=max(1, 2 ** (16 - i)), Mq=1, Mkv=2**i, Hq=16, Hkv=1, K=128)
+    for i in range(8, 18)
+]
+# + [
+#     dict(B=max(1, 2 ** (16 - i)), Mq=1, Mkv=2**i, Hq=16, Hkv=2, K=128)
+#     for i in range(8, 18)
+# ]
+
+
+def _setup_test(
+    functions, fw: bool = False, bw: bool = False, cuda_graph: bool = True, **kwargs
+):
+    for k, benchmark_cls in functions.items():
+        benchmark_object = benchmark_cls(**kwargs, bw=bw)
+        label = benchmark_object.label
+        label += "fw" if fw else ""
+        label += "bw" if bw else ""
+
+        def run_one():
+            if fw:
+                benchmark_object.fw()
+            if bw:
+                benchmark_object.bw()
+
+        if cuda_graph:
+            run_one()
+            g = torch.cuda.CUDAGraph()
+            with torch.cuda.graph(g):
+                run_one()
+
+            def run_one():
+                g.replay()
+
+        yield benchmark.Timer(
+            stmt="fn()",
+            globals={
+                "fn": run_one,
+            },
+            label=label,
+            description=k,
+            sub_label=benchmark_object.sub_label,
+        )
+
+
+class AttentionDecodingFlashDecoding:
+    OP: Any = xops.fmha.flash.FwOp
+
+    def __init__(
+        self, B: int, Mq: int, Mkv: int, Hq: int, Hkv: int, K: int, bw: bool
+    ) -> None:
+        dtype = torch.float16
+        self.sub_label = f"B={B} Mq={Mq} Mkv={Mkv} Hq={Hq} Hkv={Hkv} K={K}"
+        self.label = "attn_decoding"
+        self.shapes = (B, Mq, Mkv, Hq, Hkv, K)
+
+        assert Hkv <= Hq
+        assert Hq % Hkv == 0
+        self.q = torch.randn(
+            [B, Mq, Hkv, Hq // Hkv, K], device="cuda", dtype=dtype, requires_grad=bw
+        )
+        self.k = torch.randn(
+            [B, Mkv, Hkv, 1, K], device="cuda", dtype=dtype, requires_grad=bw
+        ).expand(-1, -1, -1, Hq // Hkv, -1)
+        self.v = torch.randn(
+            [B, Mkv, Hkv, 1, K], device="cuda", dtype=dtype, requires_grad=bw
+        ).expand(-1, -1, -1, Hq // Hkv, -1)
+
+        if Hq == Hkv:
+            self.q = self.q[:, :, :, 0]
+            self.k = self.k[:, :, :, 0]
+            self.v = self.v[:, :, :, 0]
+        if Hkv == 1:
+            self.q = self.q[:, :, 0]
+            self.k = self.k[:, :, 0]
+            self.v = self.v[:, :, 0]
+
+    def fw(self) -> None:
+        xops.memory_efficient_attention_forward(self.q, self.k, self.v, op=self.OP)
+
+
+# class AttentionDecodingSplitKV(AttentionDecodingFlashDecoding):
+#     OP = xops.fmha.triton_splitk.FwOp
+
+
+class AttentionDecodingCK(AttentionDecodingFlashDecoding):
+    OP = xops.fmha.ck.FwOp
+
+
+class AttentionDecodingCKDecoder(AttentionDecodingFlashDecoding):
+    OP = xops.fmha.ck_decoder.FwOp
+
+
+class AttentionDecodingPyTorchRepeat(AttentionDecodingFlashDecoding):
+    def fw(self) -> None:
+        B, Mq, Mkv, Hq, Hkv, K = self.shapes
+        scale = 1 / K**0.5
+        q = self.q.reshape([B, Mq, -1, K]).permute(0, 2, 1, 3)
+        k = self.k.reshape([B, Mkv, -1, K]).permute(0, 2, 1, 3)
+        v = self.v.reshape([B, Mkv, -1, K]).permute(0, 2, 1, 3)
+        attn = (q @ k.transpose(-1, -2)).softmax(-1) * scale
+        return attn @ v
+
+
+BENCHMARKS = {
+    "pytorch": AttentionDecodingPyTorchRepeat,
+    #"flash-decoding": AttentionDecodingFlashDecoding,
+    # "triton_splitK": AttentionDecodingSplitKV,
+    # "ck": AttentionDecodingCK,
+    "ck-decoder": AttentionDecodingCKDecoder,
+}
+
+
+try:
+    import flash_attn
+
+    class AttentionDecodingFlashAttention(AttentionDecodingFlashDecoding):
+        def fw(self) -> None:
+            q, k, v = self.q, self.k, self.v
+            if q.ndim == 5:
+                B, Mq, H1, H2, K = q.shape
+                B, Mkv, H1, H2, K = k.shape
+                q = q.reshape([B, Mq, H1 * H2, K])
+                k = k[:, :, :, 0]
+                v = v[:, :, :, 0]
+            return flash_attn.flash_attn_func(q, k, v)
+
+    BENCHMARKS[
+        f"flash-attention@{flash_attn.__version__}"
+    ] = AttentionDecodingFlashAttention
+except ImportError:
+    pass
+
+
+benchmark_main_helper2(
+    "attn_decoding",
+    fw=True,
+    cases=CASES,
+    functions=BENCHMARKS,
+    min_run_time=min_run_time,
+)
diff --git a/xformers/benchmarks/utils.py b/xformers/benchmarks/utils.py
index 0a722846b..b04889501 100644
--- a/xformers/benchmarks/utils.py
+++ b/xformers/benchmarks/utils.py
@@ -14,7 +14,7 @@
 import tempfile
 from collections import defaultdict, namedtuple
 from dataclasses import replace
-from typing import Any, Dict, Generator, List, Set, Tuple
+from typing import Any, Dict, Generator, Iterator, List, Set, Tuple
 
 import matplotlib.pyplot as plt
 import numpy as np
@@ -437,6 +437,53 @@ def benchmark_main_helper(benchmark_fn, cases: List[Dict[str, Any]], **kwargs) -
     )
 
 
+def benchmark_main_helper2(
+    name: str,
+    functions,
+    fw: bool = False,
+    bw: bool = False,
+    cuda_graph: bool = True,
+    **kwargs,
+) -> None:
+    assert fw or bw
+
+    def handle_case(**case) -> Iterator[benchmark.Timer]:
+        for k, benchmark_cls in functions.items():
+            benchmark_object = benchmark_cls(**case, bw=bw)
+            label = benchmark_object.label
+            label += "fw" if fw else ""
+            label += "bw" if bw else ""
+
+            def run_one():
+                if fw:
+                    benchmark_object.fw()
+                if bw:
+                    benchmark_object.bw()
+
+            if cuda_graph:
+                run_one()
+                benchmark_object = benchmark_cls(**case, bw=bw)
+                g = torch.cuda.CUDAGraph()
+                with torch.cuda.graph(g):
+                    run_one()
+
+                def run_one():
+                    g.replay()
+
+            yield benchmark.Timer(
+                stmt="fn()",
+                globals={
+                    "fn": run_one,
+                },
+                label=label,
+                description=k,
+                sub_label=benchmark_object.sub_label,
+            )
+
+    handle_case.__name__ = name
+    benchmark_main_helper(handle_case, **kwargs)
+
+
 def benchmark_run_and_compare(
     benchmark_fn,
     cases: List[Dict[str, Any]],

From e2dd08fc190b3cd47d775a5a092538346261ae87 Mon Sep 17 00:00:00 2001
From: Max Podkorytov <4273004+tenpercent@users.noreply.github.com>
Date: Mon, 20 Nov 2023 22:33:57 -0500
Subject: [PATCH 231/641] support None bias for ck_decoder and update benchmark

---
 .../benchmarks/benchmark_attn_decoding.py     | 15 +++++-
 xformers/csrc/attention/attention.cpp         |  2 +-
 .../hip_fmha/attention_forward_decoder.cpp    | 15 +++---
 .../hip_fmha/ck_attention_forward_decoder.h   |  2 +-
 xformers/ops/fmha/ck_decoder.py               | 48 ++++++++++++-------
 5 files changed, 52 insertions(+), 30 deletions(-)

diff --git a/xformers/benchmarks/benchmark_attn_decoding.py b/xformers/benchmarks/benchmark_attn_decoding.py
index a22a4f645..75a6147c3 100644
--- a/xformers/benchmarks/benchmark_attn_decoding.py
+++ b/xformers/benchmarks/benchmark_attn_decoding.py
@@ -64,12 +64,14 @@ def run_one():
 class AttentionDecodingFlashDecoding:
     OP: Any = xops.fmha.flash.FwOp
 
+    label = "flash_decoding"
+
     def __init__(
         self, B: int, Mq: int, Mkv: int, Hq: int, Hkv: int, K: int, bw: bool
     ) -> None:
         dtype = torch.float16
         self.sub_label = f"B={B} Mq={Mq} Mkv={Mkv} Hq={Hq} Hkv={Hkv} K={K}"
-        self.label = "attn_decoding"
+
         self.shapes = (B, Mq, Mkv, Hq, Hkv, K)
 
         assert Hkv <= Hq
@@ -94,7 +96,10 @@ def __init__(
             self.v = self.v[:, :, 0]
 
     def fw(self) -> None:
-        xops.memory_efficient_attention_forward(self.q, self.k, self.v, op=self.OP)
+        try:
+            xops.memory_efficient_attention_forward(self.q, self.k, self.v, op=self.OP)
+        except RuntimeError as e:
+            print(e.__cause__)
 
 
 # class AttentionDecodingSplitKV(AttentionDecodingFlashDecoding):
@@ -102,14 +107,20 @@ def fw(self) -> None:
 
 
 class AttentionDecodingCK(AttentionDecodingFlashDecoding):
+    label = "ck"
+
     OP = xops.fmha.ck.FwOp
 
 
 class AttentionDecodingCKDecoder(AttentionDecodingFlashDecoding):
+    label = "ck_decoder"
+
     OP = xops.fmha.ck_decoder.FwOp
 
 
 class AttentionDecodingPyTorchRepeat(AttentionDecodingFlashDecoding):
+    label = "pytorch"
+
     def fw(self) -> None:
         B, Mq, Mkv, Hq, Hkv, K = self.shapes
         scale = 1 / K**0.5
diff --git a/xformers/csrc/attention/attention.cpp b/xformers/csrc/attention/attention.cpp
index b3fdde526..d243a0616 100644
--- a/xformers/csrc/attention/attention.cpp
+++ b/xformers/csrc/attention/attention.cpp
@@ -45,7 +45,7 @@ TORCH_LIBRARY_FRAGMENT(xformers, m) {
           "bool compute_logsumexp, int custom_mask_type, float? scale, Tensor? seqlen_k) -> (Tensor, Tensor, int, int)"));
   m.def(TORCH_SELECTIVE_SCHEMA(
       "xformers::efficient_attention_forward_decoder_ck(Tensor query, "
-          "Tensor key, Tensor value, Tensor seq_positions, float scale) -> Tensor"));
+          "Tensor key, Tensor value, Tensor? seq_positions, float scale) -> Tensor"));
   m.def(TORCH_SELECTIVE_SCHEMA(
       "xformers::efficient_attention_backward_ck(Tensor grad_out, Tensor query, Tensor key, Tensor value, Tensor? attn_bias, Tensor? seqstart_q, Tensor? seqstart_k, int? max_seqlen_q, Tensor? seqlen_k, Tensor logsumexp, Tensor output, float dropout_p, int rng_seed, int rng_offset, int custom_mask_type, float? scale) -> (Tensor, Tensor, Tensor, Tensor)"));
   m.def(TORCH_SELECTIVE_SCHEMA(
diff --git a/xformers/csrc/attention/hip_fmha/attention_forward_decoder.cpp b/xformers/csrc/attention/hip_fmha/attention_forward_decoder.cpp
index 79fb68368..7358ed411 100644
--- a/xformers/csrc/attention/hip_fmha/attention_forward_decoder.cpp
+++ b/xformers/csrc/attention/hip_fmha/attention_forward_decoder.cpp
@@ -57,7 +57,7 @@ at::Tensor& efficient_attention_forward_decoder_ck_out_impl(
     const at::Tensor& XQ, // [B, 1, H, D]
     const at::Tensor& cache_K, // [B, T_MAX, H or 1, D]
     const at::Tensor& cache_V, // [B, T_MAX, H or 1, D]
-    const at::Tensor& seq_kv_lens, // [B]
+    at::optional<at::Tensor> seq_kv_lens, // [B]
     double qk_scale,
     at::Tensor& O) {
   static_assert(4 * ThreadsPerWavefront == D_H, "");
@@ -68,7 +68,7 @@ at::Tensor& efficient_attention_forward_decoder_ck_out_impl(
   TORCH_CHECK(cache_K.is_cuda());
   TORCH_CHECK(cache_V.is_cuda());
 
-  TORCH_CHECK(seq_kv_lens.is_cuda());
+  TORCH_CHECK(!seq_kv_lens || seq_kv_lens->is_cuda());
 
   TORCH_CHECK(cache_K.size(1) <= T_MAX);
   TORCH_CHECK(cache_K.size(3) <= D_H);
@@ -109,15 +109,14 @@ at::Tensor& efficient_attention_forward_decoder_ck_out_impl(
         auto V_acc =
             cache_V.packed_accessor64<scalar_t, 4, at::RestrictPtrTraits>();
         auto O_acc = O.packed_accessor32<scalar_t, 4, at::RestrictPtrTraits>();
-        auto seq_acc =
-            seq_kv_lens
-                .packed_accessor32<int32_t, 1, at::RestrictPtrTraits>();
+        auto seq_acc = seq_kv_lens ?
+            seq_kv_lens->packed_accessor32<int32_t, 1, at::RestrictPtrTraits>().data() : nullptr;
         auto arg = device_op_t::Argument(
             reinterpret_cast<const ck_data_t* __restrict__>(XQ_acc.data()),
             reinterpret_cast<const ck_data_t* __restrict__>(K_acc.data()),
             reinterpret_cast<const ck_data_t* __restrict__>(V_acc.data()),
             reinterpret_cast<ck_data_t* __restrict__>(O_acc.data()),
-            seq_acc.data(),
+            seq_acc,
             XQ_acc.stride(0),
             XQ_acc.stride(1),
             XQ_acc.stride(2),
@@ -146,7 +145,7 @@ at::Tensor efficient_attention_forward_decoder_ck_impl(
     const at::Tensor& XQ, // [B, 1, H, D]
     const at::Tensor& cache_K, // [B, T_MAX, H or 1, D]
     const at::Tensor& cache_V, // [B, T_MAX, H or 1, D]
-    const at::Tensor& seq_kv_lens, // [B]
+    at::optional<at::Tensor> seq_kv_lens, // [B]
     double qk_scale) {
   auto O = at::empty_like(XQ);
   efficient_attention_forward_decoder_ck_out_impl<
@@ -159,7 +158,7 @@ at::Tensor efficient_attention_forward_decoder_ck(
     const at::Tensor& XQ, // [B, 1, H, D]
     const at::Tensor& cache_K, // [B, T_MAX, H or 1, D]
     const at::Tensor& cache_V, // [B, T_MAX, H or 1, D]
-    const at::Tensor& seq_kv_lens, // [B]
+    at::optional<at::Tensor> seq_kv_lens, // [B]
     double qk_scale) {
   return efficient_attention_forward_decoder_ck_impl<
       kThreadsPerWavefront,
diff --git a/xformers/csrc/attention/hip_fmha/ck_attention_forward_decoder.h b/xformers/csrc/attention/hip_fmha/ck_attention_forward_decoder.h
index 052a1d808..4f0f3921e 100644
--- a/xformers/csrc/attention/hip_fmha/ck_attention_forward_decoder.h
+++ b/xformers/csrc/attention/hip_fmha/ck_attention_forward_decoder.h
@@ -131,7 +131,7 @@ __global__ void efficient_attention_forward_decoder_ck_kernel(
 
   // Note: this is decoding case where we attend to current and all previous
   // tokens.
-  const int32_t t_max = seq_kv_lens[b];
+  const int32_t t_max = seq_kv_lens ? seq_kv_lens[b] : gridDim.x;
 
   const int32_t lane_idx = threadIdx.x;
   const int32_t wavefront_idx = threadIdx.y;
diff --git a/xformers/ops/fmha/ck_decoder.py b/xformers/ops/fmha/ck_decoder.py
index ad131faf4..9efad083c 100644
--- a/xformers/ops/fmha/ck_decoder.py
+++ b/xformers/ops/fmha/ck_decoder.py
@@ -16,7 +16,7 @@ class FwOp(AttentionFwOpBase):
     SUPPORTED_DEVICES: Set[str] = {"cuda"}
     SUPPORTED_DTYPES: Set[torch.dtype] = {torch.half, torch.bfloat16, torch.float}
     SUPPORTED_MAX_K: int = 256
-    SUPPORTED_ATTN_BIAS_TYPES: Set[Any] = {BlockDiagonalCausalWithOffsetPaddedKeysMask}
+    SUPPORTED_ATTN_BIAS_TYPES: Set[Any] = {type(None), BlockDiagonalCausalWithOffsetPaddedKeysMask}
     SUPPORTS_DROPOUT = False
     SUPPORTS_CUSTOM_SCALE = True
     NAME = "ck_decoderF"
@@ -73,25 +73,37 @@ def apply(
         cls, inp: Inputs, needs_gradient: bool
     ) -> Tuple[torch.Tensor, Optional[Context]]:
         if needs_gradient:
-            raise NotImplementedError("gradient")
+            raise NotImplementedError("backward pass is not supported")
         attn_bias = inp.attn_bias
-        assert isinstance(attn_bias, BlockDiagonalCausalWithOffsetPaddedKeysMask)
 
-        attn_bias.k_seqinfo.to(inp.query.device)
-        attn_bias.q_seqinfo.to(inp.query.device)
-
-        padding = attn_bias.k_seqinfo.padding
-        multiquery = inp.key.stride(2) == 0
-        if multiquery:
-            key = inp.key[0, :, :1].unflatten(0, (-1, padding))
-            value = inp.value[0, :, :1].unflatten(0, (-1, padding))
+        if attn_bias is not None:
+            attn_bias.k_seqinfo.to(inp.key.device)
+            attn_bias.q_seqinfo.to(inp.query.device)
+            padding = attn_bias.k_seqinfo.padding
+            seq_positions_gpu = attn_bias.k_seqinfo.seqlen
         else:
-            key = inp.key[0].unflatten(0, (-1, padding))
-            value = inp.value[0].unflatten(0, (-1, padding))
-
-        seq_positions = attn_bias.k_seqinfo.seqlen
-
-        query = inp.query[0].unflatten(0, (key.shape[0], -1))
+            padding = inp.key.shape[1]
+            seq_positions_gpu = None
+
+        if attn_bias is not None:
+            # key: (1, B * padding, 1 if multiquery else Hkv, D)
+            # value: like key
+            # query: (1, B * q_seqlen, Hq, D)
+            multiquery = inp.key.stride(2) == 0
+            if multiquery:
+                key = inp.key[0, :, :1].unflatten(0, (-1, padding))
+                value = inp.value[0, :, :1].unflatten(0, (-1, padding))
+            else:
+                key = inp.key[0].unflatten(0, (-1, padding))
+                value = inp.value[0].unflatten(0, (-1, padding))
+            query = inp.query[0].unflatten(0, (key.shape[0], -1))    
+        else:
+            # key: (B, padding, 1 if multiquery else Hkv, D)
+            # value: like key
+            # query: (B, q_seqlen, Hq, D)
+            key = inp.key
+            query = inp.query
+            value = inp.value
 
         if inp.scale is not None:
             qk_scale = inp.scale
@@ -102,7 +114,7 @@ def apply(
             query=query,
             key=key,
             value=value,
-            seq_positions=seq_positions,
+            seq_positions=seq_positions_gpu,
             scale=qk_scale,
         )
         return out, None

From 4b711be5ca1d3b3cc3eb2e45c628cd30585e6802 Mon Sep 17 00:00:00 2001
From: Max Podkorytov <4273004+tenpercent@users.noreply.github.com>
Date: Mon, 20 Nov 2023 22:45:32 -0500
Subject: [PATCH 232/641] improve benchmark results printing

---
 xformers/benchmarks/benchmark_attn_decoding.py | 7 ++-----
 1 file changed, 2 insertions(+), 5 deletions(-)

diff --git a/xformers/benchmarks/benchmark_attn_decoding.py b/xformers/benchmarks/benchmark_attn_decoding.py
index 75a6147c3..1a729a645 100644
--- a/xformers/benchmarks/benchmark_attn_decoding.py
+++ b/xformers/benchmarks/benchmark_attn_decoding.py
@@ -99,7 +99,7 @@ def fw(self) -> None:
         try:
             xops.memory_efficient_attention_forward(self.q, self.k, self.v, op=self.OP)
         except RuntimeError as e:
-            print(e.__cause__)
+            print(f"Runtime error: {e}")
 
 
 # class AttentionDecodingSplitKV(AttentionDecodingFlashDecoding):
@@ -107,19 +107,16 @@ def fw(self) -> None:
 
 
 class AttentionDecodingCK(AttentionDecodingFlashDecoding):
-    label = "ck"
 
     OP = xops.fmha.ck.FwOp
 
 
 class AttentionDecodingCKDecoder(AttentionDecodingFlashDecoding):
-    label = "ck_decoder"
 
     OP = xops.fmha.ck_decoder.FwOp
 
 
 class AttentionDecodingPyTorchRepeat(AttentionDecodingFlashDecoding):
-    label = "pytorch"
 
     def fw(self) -> None:
         B, Mq, Mkv, Hq, Hkv, K = self.shapes
@@ -135,7 +132,7 @@ def fw(self) -> None:
     "pytorch": AttentionDecodingPyTorchRepeat,
     #"flash-decoding": AttentionDecodingFlashDecoding,
     # "triton_splitK": AttentionDecodingSplitKV,
-    # "ck": AttentionDecodingCK,
+    "ck": AttentionDecodingCK,
     "ck-decoder": AttentionDecodingCKDecoder,
 }
 

From 7497514638cb3397b2548ac998899608af32e235 Mon Sep 17 00:00:00 2001
From: Max Podkorytov <4273004+tenpercent@users.noreply.github.com>
Date: Mon, 20 Nov 2023 23:37:25 -0500
Subject: [PATCH 233/641] fix Mkv when bias is none for ck decoder

---
 .../csrc/attention/hip_fmha/attention_forward_decoder.cpp  | 1 +
 .../csrc/attention/hip_fmha/ck_attention_forward_decoder.h | 7 ++++++-
 2 files changed, 7 insertions(+), 1 deletion(-)

diff --git a/xformers/csrc/attention/hip_fmha/attention_forward_decoder.cpp b/xformers/csrc/attention/hip_fmha/attention_forward_decoder.cpp
index 7358ed411..42de5a540 100644
--- a/xformers/csrc/attention/hip_fmha/attention_forward_decoder.cpp
+++ b/xformers/csrc/attention/hip_fmha/attention_forward_decoder.cpp
@@ -123,6 +123,7 @@ at::Tensor& efficient_attention_forward_decoder_ck_out_impl(
             K_acc.stride(0),
             K_acc.stride(1),
             K_acc.stride(2),
+            K_acc.size(1),
             K_acc.size(3),
             K_acc.size(2) == 1,
             qk_scale,
diff --git a/xformers/csrc/attention/hip_fmha/ck_attention_forward_decoder.h b/xformers/csrc/attention/hip_fmha/ck_attention_forward_decoder.h
index 4f0f3921e..eaf8f0bc5 100644
--- a/xformers/csrc/attention/hip_fmha/ck_attention_forward_decoder.h
+++ b/xformers/csrc/attention/hip_fmha/ck_attention_forward_decoder.h
@@ -119,6 +119,7 @@ __global__ void efficient_attention_forward_decoder_ck_kernel(
     const ptrdiff_t K_stride_0,
     const ptrdiff_t K_stride_1,
     const ptrdiff_t K_stride_2,
+    const int32_t K_size_1,
     const int32_t D_H,
     const bool multiquery,
     const float qk_scale) {
@@ -131,7 +132,7 @@ __global__ void efficient_attention_forward_decoder_ck_kernel(
 
   // Note: this is decoding case where we attend to current and all previous
   // tokens.
-  const int32_t t_max = seq_kv_lens ? seq_kv_lens[b] : gridDim.x;
+  const int32_t t_max = seq_kv_lens ? seq_kv_lens[b] : K_size_1;
 
   const int32_t lane_idx = threadIdx.x;
   const int32_t wavefront_idx = threadIdx.y;
@@ -392,6 +393,7 @@ struct FMHADecoderSeqlen1DeviceOp : public BaseOperator {
     const ptrdiff_t K_stride_0;
     const ptrdiff_t K_stride_1;
     const ptrdiff_t K_stride_2;
+    const int32_t K_size_1;
     const int32_t D_H;
     const bool multiquery;
     const float qk_scale;
@@ -412,6 +414,7 @@ struct FMHADecoderSeqlen1DeviceOp : public BaseOperator {
         const ptrdiff_t K_stride_0,
         const ptrdiff_t K_stride_1,
         const ptrdiff_t K_stride_2,
+        const int32_t K_size_1,
         const int32_t D_H,
         const bool multiquery,
         const float qk_scale,
@@ -429,6 +432,7 @@ struct FMHADecoderSeqlen1DeviceOp : public BaseOperator {
           K_stride_0(K_stride_0),
           K_stride_1(K_stride_1),
           K_stride_2(K_stride_2),
+          K_size_1(K_size_1),
           D_H(D_H),
           multiquery(multiquery),
           qk_scale(qk_scale),
@@ -483,6 +487,7 @@ struct FMHADecoderSeqlen1DeviceOp : public BaseOperator {
           arg.K_stride_0,
           arg.K_stride_1,
           arg.K_stride_2,
+          arg.K_size_1,
           arg.D_H,
           arg.multiquery,
           arg.qk_scale);

From 75a95fd27c2a75c302ee99ebaf791d4f1c8113e3 Mon Sep 17 00:00:00 2001
From: Qianfeng Zhang <Qianfeng.Zhang@amd.com>
Date: Tue, 21 Nov 2023 15:39:03 +0000
Subject: [PATCH 234/641] Remove composable_kernel_tiled for easy access (use
 ck-tiled branch for ck-tiled integration)

---
 .gitmodules | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/.gitmodules b/.gitmodules
index dd09e4429..94eb8135c 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -8,6 +8,3 @@
 [submodule "third_party/flash-attention"]
 	path = third_party/flash-attention
 	url = https://github.com/Dao-AILab/flash-attention.git
-[submodule "third_party/composable_kernel_tiled"]
-	path = third_party/composable_kernel_tiled
-	url = https://github.com/asroy/ck_tile.git

From 0b495cefd1e23f6322c25019ba6bd1db6c59b75a Mon Sep 17 00:00:00 2001
From: Qianfeng Zhang <Qianfeng.Zhang@amd.com>
Date: Tue, 21 Nov 2023 15:51:24 +0000
Subject: [PATCH 235/641] Remove third_party/composable_kernel_tiled

---
 third_party/composable_kernel_tiled | 1 -
 1 file changed, 1 deletion(-)
 delete mode 160000 third_party/composable_kernel_tiled

diff --git a/third_party/composable_kernel_tiled b/third_party/composable_kernel_tiled
deleted file mode 160000
index 496be40ef..000000000
--- a/third_party/composable_kernel_tiled
+++ /dev/null
@@ -1 +0,0 @@
-Subproject commit 496be40efde65ace153fe53ec9a3865828f2d3cc

From 29843e6693271caeec9e2500d903d7e5dbe98c40 Mon Sep 17 00:00:00 2001
From: Qianfeng Zhang <Qianfeng.Zhang@amd.com>
Date: Tue, 21 Nov 2023 20:11:42 +0000
Subject: [PATCH 236/641] Tiny fix in setup.py

---
 setup.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/setup.py b/setup.py
index c9bfb35f3..a11c98737 100644
--- a/setup.py
+++ b/setup.py
@@ -322,7 +322,7 @@ def get_extensions():
 
        if os.getenv("FORCE_CK_TILED_KERNEL", "0") == "1":
            include_dirs += [ Path(this_dir) / 'third_party' / 'composable_kernel_tiled' / 'include',
-                             Path(this_dir) / 'third_party' / 'composable_kernel' / 'include' / 'ck']
+                             Path(this_dir) / 'third_party' / 'composable_kernel_tiled' / 'include' / 'ck']
        else:
            include_dirs += [ Path(this_dir) / 'third_party' / 'composable_kernel' / 'include', 
                              Path(this_dir) / 'third_party' / 'composable_kernel' / 'include' / 'ck']

From 53107386991f08752d35d737afffda57b5ca5757 Mon Sep 17 00:00:00 2001
From: Qianfeng Zhang <Qianfeng.Zhang@amd.com>
Date: Tue, 21 Nov 2023 20:11:42 +0000
Subject: [PATCH 237/641] Tiny fix in setup.py

---
 setup.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/setup.py b/setup.py
index c9bfb35f3..a11c98737 100644
--- a/setup.py
+++ b/setup.py
@@ -322,7 +322,7 @@ def get_extensions():
 
        if os.getenv("FORCE_CK_TILED_KERNEL", "0") == "1":
            include_dirs += [ Path(this_dir) / 'third_party' / 'composable_kernel_tiled' / 'include',
-                             Path(this_dir) / 'third_party' / 'composable_kernel' / 'include' / 'ck']
+                             Path(this_dir) / 'third_party' / 'composable_kernel_tiled' / 'include' / 'ck']
        else:
            include_dirs += [ Path(this_dir) / 'third_party' / 'composable_kernel' / 'include', 
                              Path(this_dir) / 'third_party' / 'composable_kernel' / 'include' / 'ck']

From d75a1810a221ee7138702cd52f52b000779a6050 Mon Sep 17 00:00:00 2001
From: Qianfeng Zhang <Qianfeng.Zhang@amd.com>
Date: Wed, 22 Nov 2023 17:07:26 +0000
Subject: [PATCH 238/641] Add initial implementation of using ck-tiled FA for
 batched infer for fp16

---
 .gitignore                                    |   3 +-
 .gitmodules                                   |   3 +
 third_party/composable_kernel_tiled           |   1 +
 .../attention_forward_generic_ck_tiled.cpp    |  35 ++-
 .../ck_tiled_fmha_batched_forward_kernel.h    | 220 ++++++++++++++++++
 .../hip_fmha/ck_tiled_fmha_batched_infer.h    | 154 +++++++++++-
 .../ck_tiled_fmha_batched_infer_bp16.cpp      |  58 -----
 .../hip_fmha/ck_tiled_fmha_fwd_epilogue.h     |  34 +++
 .../ck_tiled_fmha_fwd_tile_partitioner.h      |  46 ++++
 .../hip_fmha/ck_tiled_fmha_grouped_infer.h    |   1 +
 .../ck_tiled_fmha_grouped_infer_bp16.cpp      |  58 -----
 ...ched_infer_bp16_masktype_0_no_attnbias.cpp |   8 -
 ...ed_infer_bp16_masktype_0_with_attnbias.cpp |   8 -
 ...ched_infer_bp16_masktype_1_no_attnbias.cpp |   8 -
 ...ed_infer_bp16_masktype_1_with_attnbias.cpp |   8 -
 ...ched_infer_bp16_masktype_2_no_attnbias.cpp |   8 -
 ...ed_infer_bp16_masktype_2_with_attnbias.cpp |   8 -
 ...ched_infer_fp16_masktype_0_no_attnbias.cpp |   2 +-
 ...ed_infer_fp16_masktype_0_with_attnbias.cpp |   2 +-
 ...ched_infer_fp16_masktype_1_no_attnbias.cpp |   2 +-
 ...ed_infer_fp16_masktype_1_with_attnbias.cpp |   2 +-
 ...ched_infer_fp16_masktype_2_no_attnbias.cpp |   2 +-
 ...ed_infer_fp16_masktype_2_with_attnbias.cpp |   2 +-
 ...uped_infer_bp16_masktype_0_no_attnbias.cpp |   8 -
 ...ed_infer_bp16_masktype_0_with_attnbias.cpp |   8 -
 ...uped_infer_bp16_masktype_1_no_attnbias.cpp |   8 -
 ...ed_infer_bp16_masktype_1_with_attnbias.cpp |   8 -
 ...uped_infer_bp16_masktype_2_no_attnbias.cpp |   8 -
 ...ed_infer_bp16_masktype_2_with_attnbias.cpp |   8 -
 ...uped_infer_fp16_masktype_0_no_attnbias.cpp |   2 +-
 ...ed_infer_fp16_masktype_0_with_attnbias.cpp |   2 +-
 ...uped_infer_fp16_masktype_1_no_attnbias.cpp |   2 +-
 ...ed_infer_fp16_masktype_1_with_attnbias.cpp |   2 +-
 ...uped_infer_fp16_masktype_2_no_attnbias.cpp |   2 +-
 ...ed_infer_fp16_masktype_2_with_attnbias.cpp |   2 +-
 35 files changed, 498 insertions(+), 235 deletions(-)
 create mode 160000 third_party/composable_kernel_tiled
 create mode 100644 xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_forward_kernel.h
 delete mode 100644 xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_infer_bp16.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/ck_tiled_fmha_fwd_epilogue.h
 create mode 100644 xformers/csrc/attention/hip_fmha/ck_tiled_fmha_fwd_tile_partitioner.h
 delete mode 100644 xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_infer_bp16.cpp
 delete mode 100644 xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_bp16_masktype_0_no_attnbias.cpp
 delete mode 100644 xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_bp16_masktype_0_with_attnbias.cpp
 delete mode 100644 xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_bp16_masktype_1_no_attnbias.cpp
 delete mode 100644 xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_bp16_masktype_1_with_attnbias.cpp
 delete mode 100644 xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_bp16_masktype_2_no_attnbias.cpp
 delete mode 100644 xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_bp16_masktype_2_with_attnbias.cpp
 delete mode 100644 xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_bp16_masktype_0_no_attnbias.cpp
 delete mode 100644 xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_bp16_masktype_0_with_attnbias.cpp
 delete mode 100644 xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_bp16_masktype_1_no_attnbias.cpp
 delete mode 100644 xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_bp16_masktype_1_with_attnbias.cpp
 delete mode 100644 xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_bp16_masktype_2_no_attnbias.cpp
 delete mode 100644 xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_bp16_masktype_2_with_attnbias.cpp

diff --git a/.gitignore b/.gitignore
index 96cc37bb0..8c6455c1b 100644
--- a/.gitignore
+++ b/.gitignore
@@ -67,5 +67,6 @@ xformers/csrc/attention/hip_fmha/*.hip
 xformers/csrc/attention/hip_fmha/*_hip.h
 xformers/csrc/attention/hip_fmha/instances/*.cu
 xformers/csrc/attention/hip_fmha/instances/*.hip
-
+xformers/csrc/attention/hip_fmha/instances_tiled/*.cu
+xformers/csrc/attention/hip_fmha/instances_tiled/*.hip
 
diff --git a/.gitmodules b/.gitmodules
index 94eb8135c..bbbf0f197 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -8,3 +8,6 @@
 [submodule "third_party/flash-attention"]
 	path = third_party/flash-attention
 	url = https://github.com/Dao-AILab/flash-attention.git
+[submodule "third_party/composable_kernel_tiled"]
+	path = third_party/composable_kernel_tiled
+	url = https://github.com/asroy/ck_tile
diff --git a/third_party/composable_kernel_tiled b/third_party/composable_kernel_tiled
new file mode 160000
index 000000000..0a7174ad8
--- /dev/null
+++ b/third_party/composable_kernel_tiled
@@ -0,0 +1 @@
+Subproject commit 0a7174ad864cda7f59c1e8f5ccefee3359c88978
diff --git a/xformers/csrc/attention/hip_fmha/attention_forward_generic_ck_tiled.cpp b/xformers/csrc/attention/hip_fmha/attention_forward_generic_ck_tiled.cpp
index 8cd17ad84..c1435bb5c 100644
--- a/xformers/csrc/attention/hip_fmha/attention_forward_generic_ck_tiled.cpp
+++ b/xformers/csrc/attention/hip_fmha/attention_forward_generic_ck_tiled.cpp
@@ -31,9 +31,11 @@ extern void grouped_forward_bp16(
 */
 
 extern void batched_infer_fp16(BatchedForwardParams& param, hipStream_t stream);
-extern void batched_infer_bp16(BatchedForwardParams& param, hipStream_t stream);
+// extern void batched_infer_bp16(BatchedForwardParams& param, hipStream_t
+// stream);
 extern void grouped_infer_fp16(GroupedForwardParams& param, hipStream_t stream);
-extern void grouped_infer_bp16(GroupedForwardParams& param, hipStream_t stream);
+// extern void grouped_infer_bp16(GroupedForwardParams& param, hipStream_t
+// stream);
 
 namespace {
 
@@ -94,6 +96,9 @@ efficient_attention_forward_ck(
     TORCH_CHECK(max_seqlen_q_.has_value());
   };
 
+  if (seqstart_q.has_value())
+    throw std::runtime_error("Grouped mode is ready by current ck-tiled!");
+
   // last dim is contiguous, device is kCUDA
   CHECK_NOSPARSE_LASTCONTIGUOUS_CUDA(query);
   CHECK_NOSPARSE_LASTCONTIGUOUS_CUDA(key);
@@ -183,6 +188,7 @@ efficient_attention_forward_ck(
         static_cast<int>(out.stride(3))};
 
     if (bias.has_value()) {
+      /*
       CHECK_NOSPARSE_LASTCONTIGUOUS_CUDA((*bias));
       TORCH_CHECK(bias->scalar_type() == query.scalar_type());
 
@@ -195,11 +201,18 @@ efficient_attention_forward_ck(
           static_cast<int>(bias_4d_view.stride(1)),
           static_cast<int>(bias_4d_view.stride(2)),
           static_cast<int>(bias_4d_view.stride(3))};
+      */
+
+      throw std::runtime_error("bias is currently not supported by ck-tiled!");
     } else
       p.has_attn_bias = false;
 
     p.custom_mask_type = custom_mask_type;
 
+    if (p.custom_mask_type != 0)
+      throw std::runtime_error(
+          "causal mask-type is currently not supported by ck-tiled!");
+
     p.use_dropout = use_dropout;
     p.philox_seed = philox_seed;
     p.philox_offset = philox_offset;
@@ -257,6 +270,7 @@ efficient_attention_forward_ck(
         static_cast<int>(out.stride(3))};
 
     if (bias.has_value()) {
+      /*
       CHECK_NOSPARSE_LASTCONTIGUOUS_CUDA((*bias));
       TORCH_CHECK(bias->scalar_type() == query.scalar_type());
 
@@ -267,11 +281,17 @@ efficient_attention_forward_ck(
           static_cast<int>(bias_4d_view.stride(1)),
           static_cast<int>(bias_4d_view.stride(2)),
           static_cast<int>(bias_4d_view.stride(3))};
+       */
+      throw std::runtime_error("bias is currently not supported by ck-tiled!");
     } else
       p.has_attn_bias = false;
 
     p.custom_mask_type = custom_mask_type;
 
+    if (p.custom_mask_type != 0)
+      throw std::runtime_error(
+          "causal mask-type is currently not supported by ck-tiled!");
+
     // max_seqlen_q is used to create logsumexp tensor
     p.max_seqlen_q = *max_seqlen_q_;
 
@@ -327,6 +347,7 @@ efficient_attention_forward_ck(
       p.out_ptrs.push_back(reinterpret_cast<void*>(&out_ptr[tmp_o_offset]));
 
       if (bias.has_value()) {
+        /*
         size_t tmp_bias_offset = get_size_in_bytes(
             static_cast<size_t>(p.host_seqstart_q[i]) * p.attn_bias_strides[2] +
                 static_cast<size_t>(p.host_seqstart_k[i]) *
@@ -335,6 +356,10 @@ efficient_attention_forward_ck(
 
         p.attn_bias_ptrs.push_back(
             reinterpret_cast<void*>(&attn_bias_ptr[tmp_bias_offset]));
+        */
+
+        throw std::runtime_error(
+            "bias is currently not supported by ck-tiled!");
       };
 
       // ToDO: remove this after dev-op fix
@@ -385,7 +410,8 @@ efficient_attention_forward_ck(
       if (inDataType == at::ScalarType::Half) {
         batched_infer_fp16(batched_forward_params, stream);
       } else if (inDataType == at::ScalarType::BFloat16) {
-        batched_infer_bp16(batched_forward_params, stream);
+        // batched_infer_bp16(batched_forward_params, stream);
+        throw std::runtime_error("input data-type is not supported!");
       } else
         throw std::runtime_error("input data-type is not supported!");
     } else {
@@ -410,7 +436,8 @@ efficient_attention_forward_ck(
       if (inDataType == at::ScalarType::Half) {
         grouped_infer_fp16(grouped_forward_params, stream);
       } else if (inDataType == at::ScalarType::BFloat16) {
-        grouped_infer_bp16(grouped_forward_params, stream);
+        // grouped_infer_bp16(grouped_forward_params, stream);
+        throw std::runtime_error("input data-type is not supported!");
       } else
         throw std::runtime_error("input data-type is not supported!");
     } else {
diff --git a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_forward_kernel.h b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_forward_kernel.h
new file mode 100644
index 000000000..2cb0d1aea
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_forward_kernel.h
@@ -0,0 +1,220 @@
+#pragma once
+
+#include "ck/tensor/tensor_view.hpp"
+#include "ck/tile_program/tile/tile_window.hpp"
+#include "ck/utility/common_header.hpp"
+
+// S[seqlen_q, seqlen_k] = Q[seqlen_q, hdim_q] * K[seqlen_k, hdim_q]
+// P[seqlen_q, seqlen_k] = Softmax(S[seqlen_q, seqlen_k])
+// O[seqlen_q, hdim_v] = P[seqlen_q, seqlen_k] * V[hdim_v, seqlen_k]
+
+#define C_LOG2E 1.44269504088896340736 // log2(e)
+
+template <
+    typename TilePartitioner_,
+    typename FmhaPipeline_,
+    typename EpiloguePipeline_>
+struct FmhaFwdKernel {
+  using TilePartitioner = ck::remove_cvref_t<TilePartitioner_>;
+  using FmhaPipeline = ck::remove_cvref_t<FmhaPipeline_>;
+  using EpiloguePipeline = ck::remove_cvref_t<EpiloguePipeline_>;
+  static constexpr ck::index_t kBlockSize = FmhaPipeline::kBlockSize;
+
+  using QDataType = ck::remove_cvref_t<typename FmhaPipeline::QDataType>;
+  using KDataType = ck::remove_cvref_t<typename FmhaPipeline::KDataType>;
+  using VDataType = ck::remove_cvref_t<typename FmhaPipeline::VDataType>;
+  using ODataType = ck::remove_cvref_t<typename FmhaPipeline::ODataType>;
+
+  using VLayout = ck::remove_cvref_t<typename FmhaPipeline::VLayout>;
+
+  struct Kargs {
+    const void* q_ptr;
+    const void* k_ptr;
+    const void* v_ptr;
+    void* o_ptr;
+    ck::index_t seqlen_q;
+    ck::index_t seqlen_k;
+    ck::index_t hdim_q;
+    ck::index_t hdim_v;
+
+    float scale;
+
+    ck::index_t stride_q;
+    ck::index_t stride_k;
+    ck::index_t stride_v;
+    ck::index_t stride_o;
+
+    ck::index_t nhead_stride_q;
+    ck::index_t nhead_stride_k;
+    ck::index_t nhead_stride_v;
+    ck::index_t nhead_stride_o;
+
+    ck::index_t batch_stride_q;
+    ck::index_t batch_stride_k;
+    ck::index_t batch_stride_v;
+    ck::index_t batch_stride_o;
+  };
+
+  __host__ static constexpr Kargs MakeKargs(
+      const void* q_ptr,
+      const void* k_ptr,
+      const void* v_ptr,
+      void* o_ptr,
+      ck::index_t seqlen_q,
+      ck::index_t seqlen_k,
+      ck::index_t hdim_q,
+      ck::index_t hdim_v,
+      float scale,
+      ck::index_t stride_q,
+      ck::index_t stride_k,
+      ck::index_t stride_v,
+      ck::index_t stride_o,
+      ck::index_t nhead_stride_q,
+      ck::index_t nhead_stride_k,
+      ck::index_t nhead_stride_v,
+      ck::index_t nhead_stride_o,
+      ck::index_t batch_stride_q,
+      ck::index_t batch_stride_k,
+      ck::index_t batch_stride_v,
+      ck::index_t batch_stride_o) {
+    return Kargs{q_ptr,          k_ptr,          v_ptr,          o_ptr,
+                 seqlen_q,       seqlen_k,       hdim_q,         hdim_v,
+                 scale,          stride_q,       stride_k,       stride_v,
+                 stride_o,       nhead_stride_q, nhead_stride_k, nhead_stride_v,
+                 nhead_stride_o, batch_stride_q, batch_stride_k, batch_stride_v,
+                 batch_stride_o};
+  }
+
+  __host__ static constexpr auto GridSize(
+      ck::index_t batch_size_,
+      ck::index_t nhead_,
+      ck::index_t seqlen_q_,
+      ck::index_t hdim_v_) {
+    return TilePartitioner::GridSize(batch_size_, nhead_, seqlen_q_, hdim_v_);
+  }
+
+  __host__ static constexpr auto BlockSize() {
+    return dim3(kBlockSize);
+  }
+
+  __host__ __device__ static constexpr ck::index_t GetSmemSize() {
+    return ck::math::max(
+        FmhaPipeline::GetSmemSize(), EpiloguePipeline::GetSmemSize());
+  }
+
+  __device__ void operator()(Kargs kargs) const {
+    using namespace ck;
+    using namespace ck::tile_program;
+    using namespace ck::tile_program::block;
+
+    // allocate LDS
+    __shared__ char smem_ptr[GetSmemSize()];
+
+    // divide problem
+    const auto [i_tile_m, i_tile_n, i_nhead, i_batch] =
+        TilePartitioner{}(kargs.seqlen_q, kargs.hdim_v);
+
+    const index_t i_m0 =
+        __builtin_amdgcn_readfirstlane(i_tile_m * FmhaPipeline::kM0);
+    const index_t i_n1 =
+        __builtin_amdgcn_readfirstlane(i_tile_n * FmhaPipeline::kN1);
+
+    // for simplicity, batch stride we just modify the pointer
+    const QDataType* q_ptr = reinterpret_cast<const QDataType*>(kargs.q_ptr) +
+        i_nhead * kargs.nhead_stride_q + i_batch * kargs.batch_stride_q;
+    const KDataType* k_ptr = reinterpret_cast<const KDataType*>(kargs.k_ptr) +
+        i_nhead * kargs.nhead_stride_k + i_batch * kargs.batch_stride_k;
+    const VDataType* v_ptr = reinterpret_cast<const VDataType*>(kargs.v_ptr) +
+        i_nhead * kargs.nhead_stride_v + i_batch * kargs.batch_stride_v;
+    ODataType* o_ptr = reinterpret_cast<ODataType*>(kargs.o_ptr) +
+        i_nhead * kargs.nhead_stride_o + i_batch * kargs.batch_stride_o;
+
+    // Q/K/V DRAM and DRAM window
+    const auto q_dram = make_naive_tensor_view<AddressSpaceEnum::Global>(
+        q_ptr,
+        make_tuple(kargs.seqlen_q, kargs.hdim_q),
+        make_tuple(kargs.stride_q, 1),
+        Number<32>{},
+        Number<1>{});
+
+    const auto k_dram = make_naive_tensor_view<AddressSpaceEnum::Global>(
+        k_ptr,
+        make_tuple(kargs.seqlen_k, kargs.hdim_q),
+        make_tuple(kargs.stride_k, 1),
+        Number<32>{},
+        Number<1>{});
+
+    const auto v_dram = [&]() {
+      if constexpr (ck::is_same_v<VLayout, ck::tensor_layout::gemm::RowMajor>) {
+        const auto v_dram_tmp =
+            make_naive_tensor_view<AddressSpaceEnum::Global>(
+                v_ptr,
+                make_tuple(kargs.seqlen_k, kargs.hdim_v),
+                make_tuple(kargs.stride_v, 1),
+                Number<32>{},
+                Number<1>{});
+        return transform_tensor_view(
+            v_dram_tmp,
+            make_tuple(
+                make_pass_through_transform(kargs.hdim_v),
+                make_pass_through_transform(kargs.seqlen_k)),
+            make_tuple(Sequence<1>{}, Sequence<0>{}),
+            make_tuple(Sequence<0>{}, Sequence<1>{}));
+      } else {
+        return make_naive_tensor_view<AddressSpaceEnum::Global>(
+            v_ptr,
+            make_tuple(kargs.hdim_v, kargs.seqlen_k),
+            make_tuple(kargs.stride_v, 1),
+            Number<32>{},
+            Number<1>{});
+      }
+    }();
+
+    auto q_dram_window = make_tile_window(
+        q_dram,
+        [&]() {
+          if constexpr (FmhaPipeline::kQLoadOnce)
+            return make_tuple(
+                Number<FmhaPipeline::kM0>{},
+                Number<FmhaPipeline::kK0BlockLength>{});
+          else
+            return make_tuple(
+                Number<FmhaPipeline::kM0>{}, Number<FmhaPipeline::kK0>{});
+        }(),
+        {i_m0, 0});
+
+    auto k_dram_window = make_tile_window(
+        k_dram,
+        make_tuple(Number<FmhaPipeline::kN0>{}, Number<FmhaPipeline::kK0>{}),
+        {0, 0});
+
+    auto v_dram_window = make_tile_window(
+        v_dram,
+        make_tuple(Number<FmhaPipeline::kN1>{}, Number<FmhaPipeline::kK1>{}),
+        {i_n1, 0});
+
+    auto o_acc_tile = FmhaPipeline{}(
+        q_dram_window,
+        k_dram_window,
+        v_dram_window,
+        kargs.scale,
+        kargs.seqlen_k / FmhaPipeline::kN0,
+        kargs.hdim_q / FmhaPipeline::kK0,
+        smem_ptr);
+
+    // O DRAM and O DRAM window
+    auto o_dram = make_naive_tensor_view<AddressSpaceEnum::Global>(
+        o_ptr,
+        make_tuple(kargs.seqlen_q, kargs.hdim_v),
+        make_tuple(kargs.stride_o, 1),
+        Number<32>{},
+        Number<1>{});
+
+    auto o_dram_window = make_tile_window(
+        o_dram,
+        make_tuple(Number<FmhaPipeline::kM0>{}, Number<FmhaPipeline::kN1>{}),
+        {i_m0, i_n1});
+
+    EpiloguePipeline{}(o_dram_window, o_acc_tile);
+  }
+};
diff --git a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_infer.h b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_infer.h
index 9aa37d9b8..4b255f573 100644
--- a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_infer.h
+++ b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_infer.h
@@ -3,18 +3,160 @@
 #include <sstream>
 #include <stdexcept>
 
-#include <ck/ck.hpp>
+#include "ck/host_utility/device_prop.hpp"
+#include "ck/host_utility/kernel_launch.hpp"
+#include "ck/tensor/tensor_view.hpp"
+#include "ck/tensor_description/cluster_descriptor.hpp"
+#include "ck/tensor_description/tensor_descriptor_helper.hpp"
+#include "ck/utility/common_header.hpp"
+
+#include <ck/tile_program/block_tile_pipeline/block_fmha_pipeline_problem.hpp>
+#include <ck/tile_program/block_tile_pipeline/block_fmha_pipeline_qkvs.hpp>
+#include <ck/tile_program/block_tile_pipeline/block_fmha_pipeline_qkvs_default_policy.hpp>
+#include <ck/tile_program/block_tile_pipeline/block_fmha_pipeline_qr_ks_vs.hpp>
+#include <ck/tile_program/block_tile_pipeline/block_fmha_pipeline_qr_ks_vs_default_policy.hpp>
+#include <ck/tile_program/tile/tile_fmha_shape.hpp>
 
 #include "ck_fmha_params.h"
+#include "ck_tiled_fmha_batched_forward_kernel.h"
+#include "ck_tiled_fmha_fwd_epilogue.h"
+#include "ck_tiled_fmha_fwd_tile_partitioner.h"
 
 template <typename scalar_t, int32_t custom_mask_type, bool has_attn_bias>
 struct batched_infer_masktype_attnbias_dispatched {
-  static void Run(BatchedForwardParams& param, hipStream_t stream){};
+  using QDataType = scalar_t;
+  using KDataType = scalar_t;
+  using VDataType = scalar_t;
+  using SaccDataType = float; // data type for first gemm accumulation
+  using SMPLComputeDataType = float; // data type for reduction, softmax
+  using PDataType = scalar_t; // data type for A matrix of second gemm
+  using OaccDataType = float; // data type for second gemm accumulation
+  using ODataType = scalar_t;
+
+  using VLayout = ck::tensor_layout::gemm::RowMajor;
+
+  using FmhaBlockTileHdim64 = ck::Sequence<128, 64, 32, 64, 32, 64>;
+  using FmhaBlockTileHdim128 = ck::Sequence<128, 128, 32, 128, 32, 128>;
+  using FmhaBlockWarps = ck::Sequence<4, 1, 1>;
+  using FmhaWarpTile = ck::Sequence<32, 32, 16>;
+  using FmhaShapeHDim64 = ck::tile_program::TileFmhaShape<
+      FmhaBlockTileHdim64,
+      FmhaBlockWarps,
+      FmhaWarpTile,
+      FmhaBlockWarps,
+      FmhaWarpTile,
+      VLayout>;
+  using FmhaShapeHDim128 = ck::tile_program::TileFmhaShape<
+      FmhaBlockTileHdim128,
+      FmhaBlockWarps,
+      FmhaWarpTile,
+      FmhaBlockWarps,
+      FmhaWarpTile,
+      VLayout>;
+
+  using FmhaTilePartitionerHDim64 = FmhaFwdTilePartitioner<FmhaShapeHDim64>;
+  using FmhaTilePartitionerHDim128 = FmhaFwdTilePartitioner<FmhaShapeHDim128>;
+  using FmhaPipelineProblemHDim64 =
+      ck::tile_program::block::BlockFmhaPipelineProblem<
+          QDataType,
+          KDataType,
+          VDataType,
+          SaccDataType,
+          SMPLComputeDataType,
+          PDataType,
+          OaccDataType,
+          ODataType,
+          256, // BlockSize
+          FmhaShapeHDim64>;
+  using FmhaPipelineProblemHDim128 =
+      ck::tile_program::block::BlockFmhaPipelineProblem<
+          QDataType,
+          KDataType,
+          VDataType,
+          SaccDataType,
+          SMPLComputeDataType,
+          PDataType,
+          OaccDataType,
+          ODataType,
+          256, // BlockSize
+          FmhaShapeHDim128>;
+
+  using FmhaPipelineHDim64 = ck::tile_program::block::BlockFmhaPipelineQRKSVS<
+      FmhaPipelineProblemHDim64>;
+  using FmhaPipelineHDim128 = ck::tile_program::block::BlockFmhaPipelineQRKSVS<
+      FmhaPipelineProblemHDim128>;
+
+  using FmhaEpilogue =
+      FmhaFwdEpilogue<FmhaFwdEpilogueProblem<OaccDataType, ODataType>>;
+  using FmhaKernelHDim64 = FmhaFwdKernel<
+      FmhaTilePartitionerHDim64,
+      FmhaPipelineHDim64,
+      FmhaEpilogue>;
+  using FmhaKernelHDim128 = FmhaFwdKernel<
+      FmhaTilePartitionerHDim128,
+      FmhaPipelineHDim128,
+      FmhaEpilogue>;
+
+#ifndef BATCHED_INFER_HEADDIM_SWITCH
+#define BATCHED_INFER_HEADDIM_SWITCH(HEAD_DIM1, HEAD_DIM2, ...)  \
+  [&] {                                                          \
+    if (HEAD_DIM1 == HEAD_DIM2 && HEAD_DIM2 == 64) {             \
+      using FmhaKernel = FmhaKernelHDim64;                       \
+      __VA_ARGS__();                                             \
+    } else if (HEAD_DIM1 == HEAD_DIM2 && HEAD_DIM2 == 128) {     \
+      using FmhaKernel = FmhaKernelHDim128;                      \
+      __VA_ARGS__();                                             \
+    } else {                                                     \
+      throw std::runtime_error("Head-dim sizes not supported!"); \
+    }                                                            \
+  }()
+#endif
+
+  static void Run(BatchedForwardParams& param, hipStream_t stream) {
+    BATCHED_INFER_HEADDIM_SWITCH(
+        param.K, param.Kv, [&] { RunWithKernel<FmhaKernel>(param, stream); });
+  };
+
+  template <typename FmhaKernel>
+  static void RunWithKernel(BatchedForwardParams& param, hipStream_t stream) {
+    dim3 kGridSize = FmhaKernel::GridSize(param.B, param.Hq, param.M, param.Kv);
+    constexpr dim3 kBlockSize = FmhaKernel::BlockSize();
+
+    constexpr ck::index_t kWarpPerCu = 8; // 2 warps per SIMD
+    constexpr ck::index_t kWarpPerBlock = kBlockSize.x / warpSize;
+    constexpr ck::index_t kBlockPerCu = kWarpPerCu / kWarpPerBlock;
+
+    auto kargs = FmhaKernel::MakeKargs(
+        param.q_ptr,
+        param.k_ptr,
+        param.v_ptr,
+        param.out_ptr,
+        param.M, // seqlen_q
+        param.N, // seqlen_k
+        param.K, // hdim_q
+        param.Kv, // hdim_v
+        param.scale,
+        param.q_strides[1], // q, k, v, out tensor seq-dim stride
+        param.k_strides[1],
+        param.v_strides[1],
+        param.out_strides[1],
+        param.q_strides[2], // q, k, v, out tensor head-dim stride
+        param.k_strides[2],
+        param.v_strides[2],
+        param.out_strides[2],
+        param.q_strides[0], // q, k, v, out tensor batch-dim stride
+        param.k_strides[0],
+        param.v_strides[0],
+        param.out_strides[0]);
 
-  template <typename DeviceOpInstance>
-  static void RunWithDeviceOp(
-      BatchedForwardParams& param,
-      hipStream_t stream){};
+    (void)launch_kernel<kBlockSize.x, kBlockPerCu>(
+        StreamConfig{stream, false},
+        FmhaKernel{},
+        kGridSize,
+        kBlockSize,
+        0,
+        kargs);
+  };
 };
 
 template <typename scalar_t, int32_t custom_mask_type, bool has_attn_bias>
diff --git a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_infer_bp16.cpp b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_infer_bp16.cpp
deleted file mode 100644
index 81ff5b915..000000000
--- a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_infer_bp16.cpp
+++ /dev/null
@@ -1,58 +0,0 @@
-#include <ck/ck.hpp>
-#include <ck/utility/data_type.hpp>
-#include <stdexcept>
-
-#include "ck_bool_switch.h"
-#include "ck_tiled_fmha_batched_infer.h"
-
-extern template void run_batched_infer_masktype_attnbias_dispatched<
-    ck::bhalf_t,
-    0,
-    true>(BatchedForwardParams& param, hipStream_t stream);
-
-extern template void run_batched_infer_masktype_attnbias_dispatched<
-    ck::bhalf_t,
-    0,
-    false>(BatchedForwardParams& param, hipStream_t stream);
-
-extern template void run_batched_infer_masktype_attnbias_dispatched<
-    ck::bhalf_t,
-    1,
-    true>(BatchedForwardParams& param, hipStream_t stream);
-
-extern template void run_batched_infer_masktype_attnbias_dispatched<
-    ck::bhalf_t,
-    1,
-    false>(BatchedForwardParams& param, hipStream_t stream);
-
-extern template void run_batched_infer_masktype_attnbias_dispatched<
-    ck::bhalf_t,
-    2,
-    true>(BatchedForwardParams& param, hipStream_t stream);
-
-extern template void run_batched_infer_masktype_attnbias_dispatched<
-    ck::bhalf_t,
-    2,
-    false>(BatchedForwardParams& param, hipStream_t stream);
-
-void batched_infer_bp16(BatchedForwardParams& param, hipStream_t stream) {
-  BOOL_SWITCH_1(param.has_attn_bias, HAS_ATTN_BIAS, [&] {
-    if (param.custom_mask_type == 0)
-      run_batched_infer_masktype_attnbias_dispatched<
-          ck::bhalf_t,
-          0,
-          HAS_ATTN_BIAS>(param, stream);
-    else if (param.custom_mask_type == 1)
-      run_batched_infer_masktype_attnbias_dispatched<
-          ck::bhalf_t,
-          1,
-          HAS_ATTN_BIAS>(param, stream);
-    else if (param.custom_mask_type == 2)
-      run_batched_infer_masktype_attnbias_dispatched<
-          ck::bhalf_t,
-          2,
-          HAS_ATTN_BIAS>(param, stream);
-    else
-      throw std::runtime_error("Invalid custom_mask_type value");
-  });
-};
diff --git a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_fwd_epilogue.h b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_fwd_epilogue.h
new file mode 100644
index 000000000..4073424fc
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_fwd_epilogue.h
@@ -0,0 +1,34 @@
+#pragma once
+
+#include "ck/tile_program/tile/store_tile.hpp"
+#include "ck/tile_program/tile/tile_elementwise.hpp"
+#include "ck/utility/common_header.hpp"
+
+template <typename OaccDataType_, typename ODataType_>
+struct FmhaFwdEpilogueProblem {
+  using OaccDataType = ck::remove_cvref_t<OaccDataType_>;
+  using ODataType = ck::remove_cvref_t<ODataType_>;
+};
+
+template <typename Problem_, typename Policy_ = void>
+struct FmhaFwdEpilogue {
+  using Problem = ck::remove_cvref_t<Problem_>;
+  using OaccDataType = ck::remove_cvref_t<typename Problem::OaccDataType>;
+  using ODataType = ck::remove_cvref_t<typename Problem::ODataType>;
+
+  __host__ __device__ static constexpr ck::index_t GetSmemSize() {
+    return 0;
+  }
+
+  template <typename ODramWindowTmp, typename OAccTile>
+  __device__ auto operator()(
+      ODramWindowTmp& o_dram_window_tmp,
+      const OAccTile& o_acc_tile) {
+    using namespace ck;
+    using namespace ck::tile_program;
+
+    const auto o =
+        tile_elementwise_in(type_convert<ODataType, OaccDataType>, o_acc_tile);
+    store_tile(o_dram_window_tmp, o);
+  }
+};
diff --git a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_fwd_tile_partitioner.h b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_fwd_tile_partitioner.h
new file mode 100644
index 000000000..113037ce3
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_fwd_tile_partitioner.h
@@ -0,0 +1,46 @@
+#pragma once
+
+#include "ck/tile_program/tile/store_tile.hpp"
+#include "ck/tile_program/tile/tile_elementwise.hpp"
+#include "ck/utility/common_header.hpp"
+
+template <typename BlockFmhaShape_>
+struct FmhaFwdTilePartitioner {
+  using BlockFmhaShape = ck::remove_cvref_t<BlockFmhaShape_>;
+
+  static constexpr ck::index_t kM0 = BlockFmhaShape::kM0;
+  static constexpr ck::index_t kN0 = BlockFmhaShape::kN0;
+  static constexpr ck::index_t kK0 = BlockFmhaShape::kK0;
+  static constexpr ck::index_t kN1 = BlockFmhaShape::kN1;
+  static constexpr ck::index_t kK1 = BlockFmhaShape::kK1;
+
+  __host__ static constexpr auto GridSize(
+      ck::index_t batch_size_,
+      ck::index_t nhead_,
+      ck::index_t seqlen_q_,
+      ck::index_t hdim_v_) {
+    // TODO: this may need tuning
+    return dim3((seqlen_q_ / kM0) * (hdim_v_ / kN1), batch_size_, nhead_);
+  }
+
+  __device__ auto operator()(ck::index_t /*seqlen_q*/, ck::index_t hdim_v) {
+    using namespace ck;
+
+    // const index_t num_tile_m0 = seqlen_q / kM0;
+    const index_t num_tile_n1 = hdim_v / kN1;
+
+    const index_t i_block = blockIdx.x;
+    const index_t i_batch = blockIdx.y;
+    const index_t i_nhead = blockIdx.z;
+
+    const auto f = [](index_t dividend, index_t divisor) {
+      index_t quotient = dividend / divisor;
+      index_t modulus = dividend - quotient * divisor;
+      return ck::make_tuple(quotient, modulus);
+    };
+
+    const auto [i_tile_m, i_tile_n] = f(i_block, num_tile_n1);
+
+    return ck::make_tuple(i_tile_m, i_tile_n, i_nhead, i_batch);
+  }
+};
diff --git a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_infer.h b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_infer.h
index b3d3b159b..f52884e27 100644
--- a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_infer.h
+++ b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_infer.h
@@ -4,6 +4,7 @@
 #include <stdexcept>
 
 #include <ck/ck.hpp>
+#include <ck/utility/data_type.hpp>
 
 #include "ck_fmha_params.h"
 
diff --git a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_infer_bp16.cpp b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_infer_bp16.cpp
deleted file mode 100644
index bdfce5854..000000000
--- a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_infer_bp16.cpp
+++ /dev/null
@@ -1,58 +0,0 @@
-#include <ck/ck.hpp>
-#include <ck/utility/data_type.hpp>
-#include <stdexcept>
-
-#include "ck_bool_switch.h"
-#include "ck_tiled_fmha_grouped_infer.h"
-
-extern template void run_grouped_infer_masktype_attnbias_dispatched<
-    ck::bhalf_t,
-    0,
-    true>(GroupedForwardParams& param, hipStream_t stream);
-
-extern template void run_grouped_infer_masktype_attnbias_dispatched<
-    ck::bhalf_t,
-    0,
-    false>(GroupedForwardParams& param, hipStream_t stream);
-
-extern template void run_grouped_infer_masktype_attnbias_dispatched<
-    ck::bhalf_t,
-    1,
-    true>(GroupedForwardParams& param, hipStream_t stream);
-
-extern template void run_grouped_infer_masktype_attnbias_dispatched<
-    ck::bhalf_t,
-    1,
-    false>(GroupedForwardParams& param, hipStream_t stream);
-
-extern template void run_grouped_infer_masktype_attnbias_dispatched<
-    ck::bhalf_t,
-    2,
-    true>(GroupedForwardParams& param, hipStream_t stream);
-
-extern template void run_grouped_infer_masktype_attnbias_dispatched<
-    ck::bhalf_t,
-    2,
-    false>(GroupedForwardParams& param, hipStream_t stream);
-
-void grouped_infer_bp16(GroupedForwardParams& param, hipStream_t stream) {
-  BOOL_SWITCH_1(param.has_attn_bias, HAS_ATTN_BIAS, [&] {
-    if (param.custom_mask_type == 0)
-      run_grouped_infer_masktype_attnbias_dispatched<
-          ck::bhalf_t,
-          0,
-          HAS_ATTN_BIAS>(param, stream);
-    else if (param.custom_mask_type == 1)
-      run_grouped_infer_masktype_attnbias_dispatched<
-          ck::bhalf_t,
-          1,
-          HAS_ATTN_BIAS>(param, stream);
-    else if (param.custom_mask_type == 2)
-      run_grouped_infer_masktype_attnbias_dispatched<
-          ck::bhalf_t,
-          2,
-          HAS_ATTN_BIAS>(param, stream);
-    else
-      throw std::runtime_error("Invalid custom_mask_type value");
-  });
-};
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_bp16_masktype_0_no_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_bp16_masktype_0_no_attnbias.cpp
deleted file mode 100644
index 9748955e1..000000000
--- a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_bp16_masktype_0_no_attnbias.cpp
+++ /dev/null
@@ -1,8 +0,0 @@
-#include <ck/ck.hpp>
-
-#include "ck_fmha_batched_infer.h"
-
-template void run_batched_infer_masktype_attnbias_dispatched<
-    ck::bhalf_t,
-    0,
-    false>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_bp16_masktype_0_with_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_bp16_masktype_0_with_attnbias.cpp
deleted file mode 100644
index 418f925c2..000000000
--- a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_bp16_masktype_0_with_attnbias.cpp
+++ /dev/null
@@ -1,8 +0,0 @@
-#include <ck/ck.hpp>
-
-#include "ck_fmha_batched_infer.h"
-
-template void run_batched_infer_masktype_attnbias_dispatched<
-    ck::bhalf_t,
-    0,
-    true>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_bp16_masktype_1_no_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_bp16_masktype_1_no_attnbias.cpp
deleted file mode 100644
index a7cdb48b8..000000000
--- a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_bp16_masktype_1_no_attnbias.cpp
+++ /dev/null
@@ -1,8 +0,0 @@
-#include <ck/ck.hpp>
-
-#include "ck_fmha_batched_infer.h"
-
-template void run_batched_infer_masktype_attnbias_dispatched<
-    ck::bhalf_t,
-    1,
-    false>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_bp16_masktype_1_with_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_bp16_masktype_1_with_attnbias.cpp
deleted file mode 100644
index 578855b9b..000000000
--- a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_bp16_masktype_1_with_attnbias.cpp
+++ /dev/null
@@ -1,8 +0,0 @@
-#include <ck/ck.hpp>
-
-#include "ck_fmha_batched_infer.h"
-
-template void run_batched_infer_masktype_attnbias_dispatched<
-    ck::bhalf_t,
-    1,
-    true>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_bp16_masktype_2_no_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_bp16_masktype_2_no_attnbias.cpp
deleted file mode 100644
index 35e9bca9c..000000000
--- a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_bp16_masktype_2_no_attnbias.cpp
+++ /dev/null
@@ -1,8 +0,0 @@
-#include <ck/ck.hpp>
-
-#include "ck_fmha_batched_infer.h"
-
-template void run_batched_infer_masktype_attnbias_dispatched<
-    ck::bhalf_t,
-    2,
-    false>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_bp16_masktype_2_with_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_bp16_masktype_2_with_attnbias.cpp
deleted file mode 100644
index e27e3b5ff..000000000
--- a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_bp16_masktype_2_with_attnbias.cpp
+++ /dev/null
@@ -1,8 +0,0 @@
-#include <ck/ck.hpp>
-
-#include "ck_fmha_batched_infer.h"
-
-template void run_batched_infer_masktype_attnbias_dispatched<
-    ck::bhalf_t,
-    2,
-    true>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_fp16_masktype_0_no_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_fp16_masktype_0_no_attnbias.cpp
index 5c83b0abd..e9959f237 100644
--- a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_fp16_masktype_0_no_attnbias.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_fp16_masktype_0_no_attnbias.cpp
@@ -1,6 +1,6 @@
 #include <ck/ck.hpp>
 
-#include "ck_fmha_batched_infer.h"
+#include "ck_tiled_fmha_batched_infer.h"
 
 template void run_batched_infer_masktype_attnbias_dispatched<
     ck::half_t,
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_fp16_masktype_0_with_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_fp16_masktype_0_with_attnbias.cpp
index 11c76b35f..6c46ed45f 100644
--- a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_fp16_masktype_0_with_attnbias.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_fp16_masktype_0_with_attnbias.cpp
@@ -1,6 +1,6 @@
 #include <ck/ck.hpp>
 
-#include "ck_fmha_batched_infer.h"
+#include "ck_tiled_fmha_batched_infer.h"
 
 template void run_batched_infer_masktype_attnbias_dispatched<
     ck::half_t,
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_fp16_masktype_1_no_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_fp16_masktype_1_no_attnbias.cpp
index b13f5a4c9..aefdd2804 100644
--- a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_fp16_masktype_1_no_attnbias.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_fp16_masktype_1_no_attnbias.cpp
@@ -1,6 +1,6 @@
 #include <ck/ck.hpp>
 
-#include "ck_fmha_batched_infer.h"
+#include "ck_tiled_fmha_batched_infer.h"
 
 template void run_batched_infer_masktype_attnbias_dispatched<
     ck::half_t,
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_fp16_masktype_1_with_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_fp16_masktype_1_with_attnbias.cpp
index 12f5991c4..61b94d6ad 100644
--- a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_fp16_masktype_1_with_attnbias.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_fp16_masktype_1_with_attnbias.cpp
@@ -1,6 +1,6 @@
 #include <ck/ck.hpp>
 
-#include "ck_fmha_batched_infer.h"
+#include "ck_tiled_fmha_batched_infer.h"
 
 template void run_batched_infer_masktype_attnbias_dispatched<
     ck::half_t,
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_fp16_masktype_2_no_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_fp16_masktype_2_no_attnbias.cpp
index 8d45859e5..720a9c2fc 100644
--- a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_fp16_masktype_2_no_attnbias.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_fp16_masktype_2_no_attnbias.cpp
@@ -1,6 +1,6 @@
 #include <ck/ck.hpp>
 
-#include "ck_fmha_batched_infer.h"
+#include "ck_tiled_fmha_batched_infer.h"
 
 template void run_batched_infer_masktype_attnbias_dispatched<
     ck::half_t,
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_fp16_masktype_2_with_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_fp16_masktype_2_with_attnbias.cpp
index 9f03be2b5..75daaaa07 100644
--- a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_fp16_masktype_2_with_attnbias.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_fp16_masktype_2_with_attnbias.cpp
@@ -1,6 +1,6 @@
 #include <ck/ck.hpp>
 
-#include "ck_fmha_batched_infer.h"
+#include "ck_tiled_fmha_batched_infer.h"
 
 template void run_batched_infer_masktype_attnbias_dispatched<
     ck::half_t,
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_bp16_masktype_0_no_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_bp16_masktype_0_no_attnbias.cpp
deleted file mode 100644
index 973213413..000000000
--- a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_bp16_masktype_0_no_attnbias.cpp
+++ /dev/null
@@ -1,8 +0,0 @@
-#include <ck/ck.hpp>
-
-#include "ck_fmha_grouped_infer.h"
-
-template void run_grouped_infer_masktype_attnbias_dispatched<
-    ck::bhalf_t,
-    0,
-    false>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_bp16_masktype_0_with_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_bp16_masktype_0_with_attnbias.cpp
deleted file mode 100644
index 96e0ba425..000000000
--- a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_bp16_masktype_0_with_attnbias.cpp
+++ /dev/null
@@ -1,8 +0,0 @@
-#include <ck/ck.hpp>
-
-#include "ck_fmha_grouped_infer.h"
-
-template void run_grouped_infer_masktype_attnbias_dispatched<
-    ck::bhalf_t,
-    0,
-    true>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_bp16_masktype_1_no_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_bp16_masktype_1_no_attnbias.cpp
deleted file mode 100644
index 332724e73..000000000
--- a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_bp16_masktype_1_no_attnbias.cpp
+++ /dev/null
@@ -1,8 +0,0 @@
-#include <ck/ck.hpp>
-
-#include "ck_fmha_grouped_infer.h"
-
-template void run_grouped_infer_masktype_attnbias_dispatched<
-    ck::bhalf_t,
-    1,
-    false>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_bp16_masktype_1_with_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_bp16_masktype_1_with_attnbias.cpp
deleted file mode 100644
index cb1120f5b..000000000
--- a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_bp16_masktype_1_with_attnbias.cpp
+++ /dev/null
@@ -1,8 +0,0 @@
-#include <ck/ck.hpp>
-
-#include "ck_fmha_grouped_infer.h"
-
-template void run_grouped_infer_masktype_attnbias_dispatched<
-    ck::bhalf_t,
-    1,
-    true>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_bp16_masktype_2_no_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_bp16_masktype_2_no_attnbias.cpp
deleted file mode 100644
index 51ed70cab..000000000
--- a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_bp16_masktype_2_no_attnbias.cpp
+++ /dev/null
@@ -1,8 +0,0 @@
-#include <ck/ck.hpp>
-
-#include "ck_fmha_grouped_infer.h"
-
-template void run_grouped_infer_masktype_attnbias_dispatched<
-    ck::bhalf_t,
-    2,
-    false>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_bp16_masktype_2_with_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_bp16_masktype_2_with_attnbias.cpp
deleted file mode 100644
index c157e89c1..000000000
--- a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_bp16_masktype_2_with_attnbias.cpp
+++ /dev/null
@@ -1,8 +0,0 @@
-#include <ck/ck.hpp>
-
-#include "ck_fmha_grouped_infer.h"
-
-template void run_grouped_infer_masktype_attnbias_dispatched<
-    ck::bhalf_t,
-    2,
-    true>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_fp16_masktype_0_no_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_fp16_masktype_0_no_attnbias.cpp
index bbcd3ab0e..96d0f992e 100644
--- a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_fp16_masktype_0_no_attnbias.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_fp16_masktype_0_no_attnbias.cpp
@@ -1,6 +1,6 @@
 #include <ck/ck.hpp>
 
-#include "ck_fmha_grouped_infer.h"
+#include "ck_tiled_fmha_grouped_infer.h"
 
 template void run_grouped_infer_masktype_attnbias_dispatched<
     ck::half_t,
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_fp16_masktype_0_with_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_fp16_masktype_0_with_attnbias.cpp
index e320f5de6..adeee9880 100644
--- a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_fp16_masktype_0_with_attnbias.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_fp16_masktype_0_with_attnbias.cpp
@@ -1,6 +1,6 @@
 #include <ck/ck.hpp>
 
-#include "ck_fmha_grouped_infer.h"
+#include "ck_tiled_fmha_grouped_infer.h"
 
 template void run_grouped_infer_masktype_attnbias_dispatched<
     ck::half_t,
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_fp16_masktype_1_no_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_fp16_masktype_1_no_attnbias.cpp
index e763dde6a..f3843a8ed 100644
--- a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_fp16_masktype_1_no_attnbias.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_fp16_masktype_1_no_attnbias.cpp
@@ -1,6 +1,6 @@
 #include <ck/ck.hpp>
 
-#include "ck_fmha_grouped_infer.h"
+#include "ck_tiled_fmha_grouped_infer.h"
 
 template void run_grouped_infer_masktype_attnbias_dispatched<
     ck::half_t,
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_fp16_masktype_1_with_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_fp16_masktype_1_with_attnbias.cpp
index 3ec2d41da..bae1535a3 100644
--- a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_fp16_masktype_1_with_attnbias.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_fp16_masktype_1_with_attnbias.cpp
@@ -1,6 +1,6 @@
 #include <ck/ck.hpp>
 
-#include "ck_fmha_grouped_infer.h"
+#include "ck_tiled_fmha_grouped_infer.h"
 
 template void run_grouped_infer_masktype_attnbias_dispatched<
     ck::half_t,
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_fp16_masktype_2_no_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_fp16_masktype_2_no_attnbias.cpp
index dee7a0845..768082654 100644
--- a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_fp16_masktype_2_no_attnbias.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_fp16_masktype_2_no_attnbias.cpp
@@ -1,6 +1,6 @@
 #include <ck/ck.hpp>
 
-#include "ck_fmha_grouped_infer.h"
+#include "ck_tiled_fmha_grouped_infer.h"
 
 template void run_grouped_infer_masktype_attnbias_dispatched<
     ck::half_t,
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_fp16_masktype_2_with_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_fp16_masktype_2_with_attnbias.cpp
index b5515e9a0..ac11a4eea 100644
--- a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_fp16_masktype_2_with_attnbias.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_fp16_masktype_2_with_attnbias.cpp
@@ -1,6 +1,6 @@
 #include <ck/ck.hpp>
 
-#include "ck_fmha_grouped_infer.h"
+#include "ck_tiled_fmha_grouped_infer.h"
 
 template void run_grouped_infer_masktype_attnbias_dispatched<
     ck::half_t,

From e5d7f7af5045b484a971e9e38339035c2a1c5dd7 Mon Sep 17 00:00:00 2001
From: Qianfeng Zhang <Qianfeng.Zhang@amd.com>
Date: Thu, 23 Nov 2023 19:12:16 +0000
Subject: [PATCH 239/641] Add HIP_CALL_CHECK to the fmha utility header

---
 xformers/csrc/attention/hip_fmha/ck_fmha_util.h | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_util.h b/xformers/csrc/attention/hip_fmha/ck_fmha_util.h
index 84e185967..78a88e556 100644
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_util.h
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_util.h
@@ -139,3 +139,15 @@ inline at::Tensor get_bias_4d_view(
       TORCH_CHECK(false, "bias can only have ndims in {2, 3, 4}");
   }
 }
+
+#define HIP_CALL_CHECK(flag)                                                 \
+  do {                                                                       \
+    hipError_t _tmpVal;                                                      \
+    if ((_tmpVal = flag) != hipSuccess) {                                    \
+      std::ostringstream ostr;                                               \
+      ostr << "HIP Function Failed (" << __FILE__ << "," << __LINE__ << ") " \
+           << hipGetErrorString(_tmpVal);                                    \
+      throw std::runtime_error(ostr.str());                                  \
+    }                                                                        \
+  } while (0)
+~

From 2ee378079a6f13b21bbe34ca4ef6df848c03e363 Mon Sep 17 00:00:00 2001
From: Qianfeng Zhang <Qianfeng.Zhang@amd.com>
Date: Thu, 23 Nov 2023 19:17:52 +0000
Subject: [PATCH 240/641] Tiny fix to the including

---
 setup.py                                      |  6 ++---
 .../hip_fmha/ck_fmha_batched_backward.h       |  2 +-
 .../ck_fmha_batched_backward_bp16.cpp         |  2 +-
 .../ck_fmha_batched_backward_fp16.cpp         |  2 +-
 .../csrc/attention/hip_fmha/ck_fmha_util.h    | 23 +++++++++----------
 5 files changed, 16 insertions(+), 19 deletions(-)

diff --git a/setup.py b/setup.py
index a11c98737..9f21987ad 100644
--- a/setup.py
+++ b/setup.py
@@ -321,11 +321,9 @@ def get_extensions():
        include_dirs += [ Path(this_dir) / 'xformers' / 'csrc' / 'attention' / 'hip_fmha' ]
 
        if os.getenv("FORCE_CK_TILED_KERNEL", "0") == "1":
-           include_dirs += [ Path(this_dir) / 'third_party' / 'composable_kernel_tiled' / 'include',
-                             Path(this_dir) / 'third_party' / 'composable_kernel_tiled' / 'include' / 'ck']
+           include_dirs += [ Path(this_dir) / 'third_party' / 'composable_kernel_tiled' / 'include']
        else:
-           include_dirs += [ Path(this_dir) / 'third_party' / 'composable_kernel' / 'include', 
-                             Path(this_dir) / 'third_party' / 'composable_kernel' / 'include' / 'ck']
+           include_dirs += [ Path(this_dir) / 'third_party' / 'composable_kernel' / 'include']
            
        generator_flag = []
        cc_flag = ["-DBUILD_PYTHON_PACKAGE"]
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_backward.h b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_backward.h
index 1663e9c52..9293d4d4f 100644
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_backward.h
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_backward.h
@@ -3,7 +3,7 @@
 #include <sstream>
 #include <stdexcept>
 
-#include <ck.hpp>
+#include <ck/ck.hpp>
 #include <ck/tensor_operation/gpu/device/gemm_specialization.hpp>
 #include <ck/tensor_operation/gpu/device/impl/device_batched_mha_bwd_xdl_cshuffle_qloop_v1.hpp>
 #include <ck/tensor_operation/gpu/device/impl/device_batched_mha_bwd_xdl_cshuffle_qloop_v2.hpp>
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_backward_bp16.cpp b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_backward_bp16.cpp
index 441a4f9cf..319b039b9 100644
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_backward_bp16.cpp
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_backward_bp16.cpp
@@ -1,4 +1,4 @@
-#include <ck.hpp>
+#include <ck/ck.hpp>
 #include <stdexcept>
 
 #include "ck_bool_switch.h"
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_backward_fp16.cpp b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_backward_fp16.cpp
index 1868a5957..2bcf0653d 100644
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_backward_fp16.cpp
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_backward_fp16.cpp
@@ -1,4 +1,4 @@
-#include <ck.hpp>
+#include <ck/ck.hpp>
 #include <stdexcept>
 
 #include "ck_bool_switch.h"
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_util.h b/xformers/csrc/attention/hip_fmha/ck_fmha_util.h
index 78a88e556..5de869db0 100644
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_util.h
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_util.h
@@ -78,6 +78,17 @@ struct CkToAtenDtype<float> {
   XFORMERS_CHECK(                                                         \
       TENSOR.stride(-1) == 1, #TENSOR ": last dimension must be contiguous");
 
+#define HIP_CALL_CHECK(flag)                                                 \
+  do {                                                                       \
+    hipError_t _tmpVal;                                                      \
+    if ((_tmpVal = flag) != hipSuccess) {                                    \
+      std::ostringstream ostr;                                               \
+      ostr << "HIP Function Failed (" << __FILE__ << "," << __LINE__ << ") " \
+           << hipGetErrorString(_tmpVal);                                    \
+      throw std::runtime_error(ostr.str());                                  \
+    }                                                                        \
+  } while (0)
+
 static inline size_t get_size_in_bytes(size_t n, at::ScalarType dtype) {
   if (dtype == at::ScalarType::Float) {
     return n * 4;
@@ -139,15 +150,3 @@ inline at::Tensor get_bias_4d_view(
       TORCH_CHECK(false, "bias can only have ndims in {2, 3, 4}");
   }
 }
-
-#define HIP_CALL_CHECK(flag)                                                 \
-  do {                                                                       \
-    hipError_t _tmpVal;                                                      \
-    if ((_tmpVal = flag) != hipSuccess) {                                    \
-      std::ostringstream ostr;                                               \
-      ostr << "HIP Function Failed (" << __FILE__ << "," << __LINE__ << ") " \
-           << hipGetErrorString(_tmpVal);                                    \
-      throw std::runtime_error(ostr.str());                                  \
-    }                                                                        \
-  } while (0)
-~

From a34bf6d50a99c330b71f3f8901c27a79c824b127 Mon Sep 17 00:00:00 2001
From: Qianfeng Zhang <Qianfeng.Zhang@amd.com>
Date: Thu, 23 Nov 2023 23:47:21 +0000
Subject: [PATCH 241/641] Add implementation of using ck-tiled FA for grouped
 infer with bias for fp16

---
 .../attention_forward_generic_ck_tiled.cpp    | 111 ++---
 .../ck_tiled_fmha_batched_forward_kernel.h    | 220 ---------
 .../hip_fmha/ck_tiled_fmha_batched_infer.h    |  34 +-
 .../hip_fmha/ck_tiled_fmha_forward_kernel.h   | 456 ++++++++++++++++++
 .../hip_fmha/ck_tiled_fmha_grouped_infer.h    | 170 ++++++-
 .../attention/hip_fmha/ck_tiled_fmha_params.h | 207 ++++++++
 6 files changed, 889 insertions(+), 309 deletions(-)
 delete mode 100644 xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_forward_kernel.h
 create mode 100644 xformers/csrc/attention/hip_fmha/ck_tiled_fmha_forward_kernel.h
 create mode 100644 xformers/csrc/attention/hip_fmha/ck_tiled_fmha_params.h

diff --git a/xformers/csrc/attention/hip_fmha/attention_forward_generic_ck_tiled.cpp b/xformers/csrc/attention/hip_fmha/attention_forward_generic_ck_tiled.cpp
index c1435bb5c..8961bb4ea 100644
--- a/xformers/csrc/attention/hip_fmha/attention_forward_generic_ck_tiled.cpp
+++ b/xformers/csrc/attention/hip_fmha/attention_forward_generic_ck_tiled.cpp
@@ -12,8 +12,8 @@
 #include <torch/library.h>
 #include <ATen/cuda/CUDAGraphsUtils.cuh>
 
-#include "ck_fmha_params.h"
 #include "ck_fmha_util.h"
+#include "ck_tiled_fmha_params.h"
 
 /*
 extern void batched_forward_fp16(
@@ -96,9 +96,6 @@ efficient_attention_forward_ck(
     TORCH_CHECK(max_seqlen_q_.has_value());
   };
 
-  if (seqstart_q.has_value())
-    throw std::runtime_error("Grouped mode is ready by current ck-tiled!");
-
   // last dim is contiguous, device is kCUDA
   CHECK_NOSPARSE_LASTCONTIGUOUS_CUDA(query);
   CHECK_NOSPARSE_LASTCONTIGUOUS_CUDA(key);
@@ -188,7 +185,6 @@ efficient_attention_forward_ck(
         static_cast<int>(out.stride(3))};
 
     if (bias.has_value()) {
-      /*
       CHECK_NOSPARSE_LASTCONTIGUOUS_CUDA((*bias));
       TORCH_CHECK(bias->scalar_type() == query.scalar_type());
 
@@ -201,9 +197,6 @@ efficient_attention_forward_ck(
           static_cast<int>(bias_4d_view.stride(1)),
           static_cast<int>(bias_4d_view.stride(2)),
           static_cast<int>(bias_4d_view.stride(3))};
-      */
-
-      throw std::runtime_error("bias is currently not supported by ck-tiled!");
     } else
       p.has_attn_bias = false;
 
@@ -252,6 +245,11 @@ efficient_attention_forward_ck(
       p.scale = float(1.0 / std::sqrt(float(K)));
     }
 
+    p.q_ptr = query.data_ptr();
+    p.k_ptr = key.data_ptr();
+    p.v_ptr = value.data_ptr();
+    p.out_ptr = out.data_ptr();
+
     p.q_strides = {
         static_cast<int>(query.stride(1)),
         static_cast<int>(query.stride(2)),
@@ -270,19 +268,18 @@ efficient_attention_forward_ck(
         static_cast<int>(out.stride(3))};
 
     if (bias.has_value()) {
-      /*
       CHECK_NOSPARSE_LASTCONTIGUOUS_CUDA((*bias));
       TORCH_CHECK(bias->scalar_type() == query.scalar_type());
 
       p.has_attn_bias = true;
+      p.attn_bias_ptr = bias->data_ptr();
+
       const at::Tensor bias_4d_view = get_bias_4d_view(*bias, B, Hq, M, N);
       p.attn_bias_strides = {
           static_cast<int>(bias_4d_view.stride(0)),
           static_cast<int>(bias_4d_view.stride(1)),
           static_cast<int>(bias_4d_view.stride(2)),
           static_cast<int>(bias_4d_view.stride(3))};
-       */
-      throw std::runtime_error("bias is currently not supported by ck-tiled!");
     } else
       p.has_attn_bias = false;
 
@@ -295,16 +292,27 @@ efficient_attention_forward_ck(
     // max_seqlen_q is used to create logsumexp tensor
     p.max_seqlen_q = *max_seqlen_q_;
 
-    p.host_seqstart_q.resize(p.num_batches + 1);
-    p.host_seqstart_k.resize(p.num_batches + 1);
-
-    for (int i = 0; i < p.host_seqstart_q.size(); i++)
-      p.host_seqstart_q[i] =
-          *(reinterpret_cast<int*>(seqstart_q->data_ptr()) + i);
-
-    for (int i = 0; i < p.host_seqstart_k.size(); i++)
-      p.host_seqstart_k[i] =
-          *(reinterpret_cast<int*>(seqstart_k->data_ptr()) + i);
+    at::Tensor dev_seqstart_q =
+        at::empty({p.num_batches + 1}, opts.dtype(at::kInt));
+    at::Tensor dev_seqstart_k =
+        at::empty({p.num_batches + 1}, opts.dtype(at::kInt));
+    at::Tensor dev_seqlen_k;
+
+    p.seqstart_q_dev_ptr = dev_seqstart_q.data_ptr();
+    HIP_CALL_CHECK(hipMemcpyAsync(
+        p.seqstart_q_dev_ptr,
+        seqstart_q->data_ptr(),
+        (p.num_batches + 1) * sizeof(int),
+        hipMemcpyHostToDevice,
+        stream));
+
+    p.seqstart_k_dev_ptr = dev_seqstart_k.data_ptr();
+    HIP_CALL_CHECK(hipMemcpyAsync(
+        p.seqstart_k_dev_ptr,
+        seqstart_k->data_ptr(),
+        (p.num_batches + 1) * sizeof(int),
+        hipMemcpyHostToDevice,
+        stream));
 
     if (seqlen_k.has_value()) {
       TORCH_CHECK(seqlen_k->scalar_type() == at::ScalarType::Int);
@@ -312,59 +320,18 @@ efficient_attention_forward_ck(
       TORCH_CHECK(seqlen_k->size(0) == p.num_batches)
       CHECK_NOSPARSE_CONTIGUOUS_CPU((*seqlen_k));
 
-      p.host_seqlen_k.resize(p.num_batches);
+      dev_seqlen_k = at::empty({p.num_batches}, opts.dtype(at::kInt));
 
-      for (int i = 0; i < p.host_seqlen_k.size(); i++)
-        p.host_seqlen_k[i] =
-            *(reinterpret_cast<int*>(seqlen_k->data_ptr()) + i);
-    }
+      p.seqlen_k_dev_ptr = dev_seqlen_k.data_ptr();
 
-    char* q_ptr = reinterpret_cast<char*>(query.data_ptr());
-    char* k_ptr = reinterpret_cast<char*>(key.data_ptr());
-    char* v_ptr = reinterpret_cast<char*>(value.data_ptr());
-
-    char* out_ptr = reinterpret_cast<char*>(out.data_ptr());
-    char* attn_bias_ptr =
-        bias.has_value() ? reinterpret_cast<char*>(bias->data_ptr()) : nullptr;
-
-    for (int i = 0; i < p.num_batches; i++) {
-      size_t tmp_q_offset = get_size_in_bytes(
-          static_cast<size_t>(p.host_seqstart_q[i]) * p.q_strides[0],
-          query.scalar_type());
-      size_t tmp_k_offset = get_size_in_bytes(
-          static_cast<size_t>(p.host_seqstart_k[i]) * p.k_strides[0],
-          key.scalar_type());
-      size_t tmp_v_offset = get_size_in_bytes(
-          static_cast<size_t>(p.host_seqstart_k[i]) * p.v_strides[0],
-          value.scalar_type());
-      size_t tmp_o_offset = get_size_in_bytes(
-          static_cast<size_t>(p.host_seqstart_q[i]) * p.out_strides[0],
-          out.scalar_type());
-
-      p.q_ptrs.push_back(reinterpret_cast<void*>(&q_ptr[tmp_q_offset]));
-      p.k_ptrs.push_back(reinterpret_cast<void*>(&k_ptr[tmp_k_offset]));
-      p.v_ptrs.push_back(reinterpret_cast<void*>(&v_ptr[tmp_v_offset]));
-      p.out_ptrs.push_back(reinterpret_cast<void*>(&out_ptr[tmp_o_offset]));
-
-      if (bias.has_value()) {
-        /*
-        size_t tmp_bias_offset = get_size_in_bytes(
-            static_cast<size_t>(p.host_seqstart_q[i]) * p.attn_bias_strides[2] +
-                static_cast<size_t>(p.host_seqstart_k[i]) *
-                    p.attn_bias_strides[3],
-            bias->scalar_type());
-
-        p.attn_bias_ptrs.push_back(
-            reinterpret_cast<void*>(&attn_bias_ptr[tmp_bias_offset]));
-        */
-
-        throw std::runtime_error(
-            "bias is currently not supported by ck-tiled!");
-      };
-
-      // ToDO: remove this after dev-op fix
-      p.randvals_ptrs.push_back(nullptr);
-    }
+      HIP_CALL_CHECK(hipMemcpyAsync(
+          p.seqlen_k_dev_ptr,
+          seqstart_k->data_ptr(),
+          p.num_batches * sizeof(int),
+          hipMemcpyHostToDevice,
+          stream));
+    } else
+      p.seqlen_k_dev_ptr = nullptr;
 
     p.use_dropout = use_dropout;
     p.philox_seed = philox_seed;
diff --git a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_forward_kernel.h b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_forward_kernel.h
deleted file mode 100644
index 2cb0d1aea..000000000
--- a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_forward_kernel.h
+++ /dev/null
@@ -1,220 +0,0 @@
-#pragma once
-
-#include "ck/tensor/tensor_view.hpp"
-#include "ck/tile_program/tile/tile_window.hpp"
-#include "ck/utility/common_header.hpp"
-
-// S[seqlen_q, seqlen_k] = Q[seqlen_q, hdim_q] * K[seqlen_k, hdim_q]
-// P[seqlen_q, seqlen_k] = Softmax(S[seqlen_q, seqlen_k])
-// O[seqlen_q, hdim_v] = P[seqlen_q, seqlen_k] * V[hdim_v, seqlen_k]
-
-#define C_LOG2E 1.44269504088896340736 // log2(e)
-
-template <
-    typename TilePartitioner_,
-    typename FmhaPipeline_,
-    typename EpiloguePipeline_>
-struct FmhaFwdKernel {
-  using TilePartitioner = ck::remove_cvref_t<TilePartitioner_>;
-  using FmhaPipeline = ck::remove_cvref_t<FmhaPipeline_>;
-  using EpiloguePipeline = ck::remove_cvref_t<EpiloguePipeline_>;
-  static constexpr ck::index_t kBlockSize = FmhaPipeline::kBlockSize;
-
-  using QDataType = ck::remove_cvref_t<typename FmhaPipeline::QDataType>;
-  using KDataType = ck::remove_cvref_t<typename FmhaPipeline::KDataType>;
-  using VDataType = ck::remove_cvref_t<typename FmhaPipeline::VDataType>;
-  using ODataType = ck::remove_cvref_t<typename FmhaPipeline::ODataType>;
-
-  using VLayout = ck::remove_cvref_t<typename FmhaPipeline::VLayout>;
-
-  struct Kargs {
-    const void* q_ptr;
-    const void* k_ptr;
-    const void* v_ptr;
-    void* o_ptr;
-    ck::index_t seqlen_q;
-    ck::index_t seqlen_k;
-    ck::index_t hdim_q;
-    ck::index_t hdim_v;
-
-    float scale;
-
-    ck::index_t stride_q;
-    ck::index_t stride_k;
-    ck::index_t stride_v;
-    ck::index_t stride_o;
-
-    ck::index_t nhead_stride_q;
-    ck::index_t nhead_stride_k;
-    ck::index_t nhead_stride_v;
-    ck::index_t nhead_stride_o;
-
-    ck::index_t batch_stride_q;
-    ck::index_t batch_stride_k;
-    ck::index_t batch_stride_v;
-    ck::index_t batch_stride_o;
-  };
-
-  __host__ static constexpr Kargs MakeKargs(
-      const void* q_ptr,
-      const void* k_ptr,
-      const void* v_ptr,
-      void* o_ptr,
-      ck::index_t seqlen_q,
-      ck::index_t seqlen_k,
-      ck::index_t hdim_q,
-      ck::index_t hdim_v,
-      float scale,
-      ck::index_t stride_q,
-      ck::index_t stride_k,
-      ck::index_t stride_v,
-      ck::index_t stride_o,
-      ck::index_t nhead_stride_q,
-      ck::index_t nhead_stride_k,
-      ck::index_t nhead_stride_v,
-      ck::index_t nhead_stride_o,
-      ck::index_t batch_stride_q,
-      ck::index_t batch_stride_k,
-      ck::index_t batch_stride_v,
-      ck::index_t batch_stride_o) {
-    return Kargs{q_ptr,          k_ptr,          v_ptr,          o_ptr,
-                 seqlen_q,       seqlen_k,       hdim_q,         hdim_v,
-                 scale,          stride_q,       stride_k,       stride_v,
-                 stride_o,       nhead_stride_q, nhead_stride_k, nhead_stride_v,
-                 nhead_stride_o, batch_stride_q, batch_stride_k, batch_stride_v,
-                 batch_stride_o};
-  }
-
-  __host__ static constexpr auto GridSize(
-      ck::index_t batch_size_,
-      ck::index_t nhead_,
-      ck::index_t seqlen_q_,
-      ck::index_t hdim_v_) {
-    return TilePartitioner::GridSize(batch_size_, nhead_, seqlen_q_, hdim_v_);
-  }
-
-  __host__ static constexpr auto BlockSize() {
-    return dim3(kBlockSize);
-  }
-
-  __host__ __device__ static constexpr ck::index_t GetSmemSize() {
-    return ck::math::max(
-        FmhaPipeline::GetSmemSize(), EpiloguePipeline::GetSmemSize());
-  }
-
-  __device__ void operator()(Kargs kargs) const {
-    using namespace ck;
-    using namespace ck::tile_program;
-    using namespace ck::tile_program::block;
-
-    // allocate LDS
-    __shared__ char smem_ptr[GetSmemSize()];
-
-    // divide problem
-    const auto [i_tile_m, i_tile_n, i_nhead, i_batch] =
-        TilePartitioner{}(kargs.seqlen_q, kargs.hdim_v);
-
-    const index_t i_m0 =
-        __builtin_amdgcn_readfirstlane(i_tile_m * FmhaPipeline::kM0);
-    const index_t i_n1 =
-        __builtin_amdgcn_readfirstlane(i_tile_n * FmhaPipeline::kN1);
-
-    // for simplicity, batch stride we just modify the pointer
-    const QDataType* q_ptr = reinterpret_cast<const QDataType*>(kargs.q_ptr) +
-        i_nhead * kargs.nhead_stride_q + i_batch * kargs.batch_stride_q;
-    const KDataType* k_ptr = reinterpret_cast<const KDataType*>(kargs.k_ptr) +
-        i_nhead * kargs.nhead_stride_k + i_batch * kargs.batch_stride_k;
-    const VDataType* v_ptr = reinterpret_cast<const VDataType*>(kargs.v_ptr) +
-        i_nhead * kargs.nhead_stride_v + i_batch * kargs.batch_stride_v;
-    ODataType* o_ptr = reinterpret_cast<ODataType*>(kargs.o_ptr) +
-        i_nhead * kargs.nhead_stride_o + i_batch * kargs.batch_stride_o;
-
-    // Q/K/V DRAM and DRAM window
-    const auto q_dram = make_naive_tensor_view<AddressSpaceEnum::Global>(
-        q_ptr,
-        make_tuple(kargs.seqlen_q, kargs.hdim_q),
-        make_tuple(kargs.stride_q, 1),
-        Number<32>{},
-        Number<1>{});
-
-    const auto k_dram = make_naive_tensor_view<AddressSpaceEnum::Global>(
-        k_ptr,
-        make_tuple(kargs.seqlen_k, kargs.hdim_q),
-        make_tuple(kargs.stride_k, 1),
-        Number<32>{},
-        Number<1>{});
-
-    const auto v_dram = [&]() {
-      if constexpr (ck::is_same_v<VLayout, ck::tensor_layout::gemm::RowMajor>) {
-        const auto v_dram_tmp =
-            make_naive_tensor_view<AddressSpaceEnum::Global>(
-                v_ptr,
-                make_tuple(kargs.seqlen_k, kargs.hdim_v),
-                make_tuple(kargs.stride_v, 1),
-                Number<32>{},
-                Number<1>{});
-        return transform_tensor_view(
-            v_dram_tmp,
-            make_tuple(
-                make_pass_through_transform(kargs.hdim_v),
-                make_pass_through_transform(kargs.seqlen_k)),
-            make_tuple(Sequence<1>{}, Sequence<0>{}),
-            make_tuple(Sequence<0>{}, Sequence<1>{}));
-      } else {
-        return make_naive_tensor_view<AddressSpaceEnum::Global>(
-            v_ptr,
-            make_tuple(kargs.hdim_v, kargs.seqlen_k),
-            make_tuple(kargs.stride_v, 1),
-            Number<32>{},
-            Number<1>{});
-      }
-    }();
-
-    auto q_dram_window = make_tile_window(
-        q_dram,
-        [&]() {
-          if constexpr (FmhaPipeline::kQLoadOnce)
-            return make_tuple(
-                Number<FmhaPipeline::kM0>{},
-                Number<FmhaPipeline::kK0BlockLength>{});
-          else
-            return make_tuple(
-                Number<FmhaPipeline::kM0>{}, Number<FmhaPipeline::kK0>{});
-        }(),
-        {i_m0, 0});
-
-    auto k_dram_window = make_tile_window(
-        k_dram,
-        make_tuple(Number<FmhaPipeline::kN0>{}, Number<FmhaPipeline::kK0>{}),
-        {0, 0});
-
-    auto v_dram_window = make_tile_window(
-        v_dram,
-        make_tuple(Number<FmhaPipeline::kN1>{}, Number<FmhaPipeline::kK1>{}),
-        {i_n1, 0});
-
-    auto o_acc_tile = FmhaPipeline{}(
-        q_dram_window,
-        k_dram_window,
-        v_dram_window,
-        kargs.scale,
-        kargs.seqlen_k / FmhaPipeline::kN0,
-        kargs.hdim_q / FmhaPipeline::kK0,
-        smem_ptr);
-
-    // O DRAM and O DRAM window
-    auto o_dram = make_naive_tensor_view<AddressSpaceEnum::Global>(
-        o_ptr,
-        make_tuple(kargs.seqlen_q, kargs.hdim_v),
-        make_tuple(kargs.stride_o, 1),
-        Number<32>{},
-        Number<1>{});
-
-    auto o_dram_window = make_tile_window(
-        o_dram,
-        make_tuple(Number<FmhaPipeline::kM0>{}, Number<FmhaPipeline::kN1>{}),
-        {i_m0, i_n1});
-
-    EpiloguePipeline{}(o_dram_window, o_acc_tile);
-  }
-};
diff --git a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_infer.h b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_infer.h
index 4b255f573..d6fa248bb 100644
--- a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_infer.h
+++ b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_infer.h
@@ -1,14 +1,15 @@
 #pragma once
 
+#include <optional>
 #include <sstream>
 #include <stdexcept>
 
-#include "ck/host_utility/device_prop.hpp"
-#include "ck/host_utility/kernel_launch.hpp"
-#include "ck/tensor/tensor_view.hpp"
-#include "ck/tensor_description/cluster_descriptor.hpp"
-#include "ck/tensor_description/tensor_descriptor_helper.hpp"
-#include "ck/utility/common_header.hpp"
+#include <ck/host_utility/device_prop.hpp>
+#include <ck/host_utility/kernel_launch.hpp>
+#include <ck/tensor/tensor_view.hpp>
+#include <ck/tensor_description/cluster_descriptor.hpp>
+#include <ck/tensor_description/tensor_descriptor_helper.hpp>
+#include <ck/utility/common_header.hpp>
 
 #include <ck/tile_program/block_tile_pipeline/block_fmha_pipeline_problem.hpp>
 #include <ck/tile_program/block_tile_pipeline/block_fmha_pipeline_qkvs.hpp>
@@ -17,8 +18,8 @@
 #include <ck/tile_program/block_tile_pipeline/block_fmha_pipeline_qr_ks_vs_default_policy.hpp>
 #include <ck/tile_program/tile/tile_fmha_shape.hpp>
 
-#include "ck_fmha_params.h"
-#include "ck_tiled_fmha_batched_forward_kernel.h"
+#include "ck_tiled_fmha_params.h"
+#include "ck_tiled_fmha_forward_kernel.h"
 #include "ck_tiled_fmha_fwd_epilogue.h"
 #include "ck_tiled_fmha_fwd_tile_partitioner.h"
 
@@ -27,6 +28,7 @@ struct batched_infer_masktype_attnbias_dispatched {
   using QDataType = scalar_t;
   using KDataType = scalar_t;
   using VDataType = scalar_t;
+  using BiasDataType = scalar_t;
   using SaccDataType = float; // data type for first gemm accumulation
   using SMPLComputeDataType = float; // data type for reduction, softmax
   using PDataType = scalar_t; // data type for A matrix of second gemm
@@ -63,6 +65,7 @@ struct batched_infer_masktype_attnbias_dispatched {
           VDataType,
           SaccDataType,
           SMPLComputeDataType,
+          BiasDataType,
           PDataType,
           OaccDataType,
           ODataType,
@@ -75,6 +78,7 @@ struct batched_infer_masktype_attnbias_dispatched {
           VDataType,
           SaccDataType,
           SMPLComputeDataType,
+          BiasDataType,
           PDataType,
           OaccDataType,
           ODataType,
@@ -126,6 +130,17 @@ struct batched_infer_masktype_attnbias_dispatched {
     constexpr ck::index_t kWarpPerBlock = kBlockSize.x / warpSize;
     constexpr ck::index_t kBlockPerCu = kWarpPerCu / kWarpPerBlock;
 
+    std::optional<
+        std::tuple<const void*, ck::index_t, ck::index_t, ck::index_t>>
+        bias;
+
+    if (param.has_attn_bias)
+      bias = std::make_tuple(
+          param.attn_bias_ptr,
+          param.attn_bias_strides[2],
+          param.attn_bias_strides[1],
+          param.attn_bias_strides[0]);
+
     auto kargs = FmhaKernel::MakeKargs(
         param.q_ptr,
         param.k_ptr,
@@ -147,7 +162,8 @@ struct batched_infer_masktype_attnbias_dispatched {
         param.q_strides[0], // q, k, v, out tensor batch-dim stride
         param.k_strides[0],
         param.v_strides[0],
-        param.out_strides[0]);
+        param.out_strides[0],
+        bias);
 
     (void)launch_kernel<kBlockSize.x, kBlockPerCu>(
         StreamConfig{stream, false},
diff --git a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_forward_kernel.h b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_forward_kernel.h
new file mode 100644
index 000000000..334be84bb
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_forward_kernel.h
@@ -0,0 +1,456 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include <optional>
+#include <type_traits>
+
+#include "ck/tensor/tensor_view.hpp"
+#include "ck/tile_program/tile/tile_window.hpp"
+#include "ck/utility/common_header.hpp"
+
+// S[seqlen_q, seqlen_k] = Q[seqlen_q, hdim_q] * K[seqlen_k, hdim_q]
+// S'[seqlen_q, seqlen_k] = S[seqlen_q, seqlen_k] * Scale[1]
+// S''[seqlen_q, seqlen_k] = S'[seqlen_q, seqlen_k] + Bias[seqlen_q, seqlen_k]
+// P[seqlen_q, seqlen_k] = Softmax(S[seqlen_q, seqlen_k])
+// O[seqlen_q, hdim_v] = P[seqlen_q, seqlen_k] * V[hdim_v, seqlen_k]
+
+#define C_LOG2E 1.44269504088896340736 // log2(e)
+
+template <
+    typename TilePartitioner_,
+    typename FmhaPipeline_,
+    typename EpiloguePipeline_>
+struct FmhaFwdKernel {
+  using TilePartitioner = ck::remove_cvref_t<TilePartitioner_>;
+  using FmhaPipeline = ck::remove_cvref_t<FmhaPipeline_>;
+  using EpiloguePipeline = ck::remove_cvref_t<EpiloguePipeline_>;
+  static constexpr ck::index_t kBlockSize = FmhaPipeline::kBlockSize;
+
+  using QDataType = ck::remove_cvref_t<typename FmhaPipeline::QDataType>;
+  using KDataType = ck::remove_cvref_t<typename FmhaPipeline::KDataType>;
+  using VDataType = ck::remove_cvref_t<typename FmhaPipeline::VDataType>;
+  using BiasDataType = ck::remove_cvref_t<typename FmhaPipeline::BiasDataType>;
+  using ODataType = ck::remove_cvref_t<typename FmhaPipeline::ODataType>;
+
+  using VLayout = ck::remove_cvref_t<typename FmhaPipeline::VLayout>;
+
+  struct KargsCommon {
+    const QDataType* q_ptr;
+    const KDataType* k_ptr;
+    const VDataType* v_ptr;
+    ODataType* o_ptr;
+
+    ck::index_t seqlen_q;
+    ck::index_t seqlen_k;
+    ck::index_t hdim_q;
+    ck::index_t hdim_v;
+
+    float scale;
+
+    ck::index_t stride_q;
+    ck::index_t stride_k;
+    ck::index_t stride_v;
+    ck::index_t stride_o;
+
+    ck::index_t nhead_stride_q;
+    ck::index_t nhead_stride_k;
+    ck::index_t nhead_stride_v;
+    ck::index_t nhead_stride_o;
+
+    // following attributes are optional
+    const BiasDataType* bias_ptr = nullptr;
+    ck::index_t stride_bias = 0;
+    ck::index_t nhead_stride_bias = 0;
+  };
+
+  struct KargsBatchMode : KargsCommon {
+    ck::index_t batch_stride_q;
+    ck::index_t batch_stride_k;
+    ck::index_t batch_stride_v;
+    ck::index_t batch_stride_o;
+
+    // following attributes are optional
+    ck::index_t batch_stride_bias = 0;
+  };
+
+  struct KargsGroupMode : KargsCommon {
+    const ck::index_t* seqstart_q_ptr;
+    const ck::index_t* seqstart_k_ptr;
+    const ck::index_t* seqlen_k_ptr;
+  };
+
+  __host__ static constexpr void InitKargsCommon(
+      KargsCommon& kargs,
+      const void* q_ptr,
+      const void* k_ptr,
+      const void* v_ptr,
+      void* o_ptr,
+      ck::index_t seqlen_q,
+      ck::index_t seqlen_k,
+      ck::index_t hdim_q,
+      ck::index_t hdim_v,
+      float scale,
+      ck::index_t stride_q,
+      ck::index_t stride_k,
+      ck::index_t stride_v,
+      ck::index_t stride_o,
+      ck::index_t nhead_stride_q,
+      ck::index_t nhead_stride_k,
+      ck::index_t nhead_stride_v,
+      ck::index_t nhead_stride_o) {
+    kargs.q_ptr = reinterpret_cast<const QDataType*>(q_ptr);
+    kargs.k_ptr = reinterpret_cast<const KDataType*>(k_ptr);
+    kargs.v_ptr = reinterpret_cast<const VDataType*>(v_ptr);
+    kargs.o_ptr = reinterpret_cast<ODataType*>(o_ptr);
+
+    kargs.seqlen_q = seqlen_q;
+    kargs.seqlen_k = seqlen_k;
+    kargs.hdim_q = hdim_q;
+    kargs.hdim_v = hdim_v;
+
+    kargs.scale = scale;
+
+    kargs.stride_q = stride_q;
+    kargs.stride_k = stride_k;
+    kargs.stride_v = stride_v;
+    kargs.stride_o = stride_o;
+
+    kargs.nhead_stride_q = nhead_stride_q;
+    kargs.nhead_stride_k = nhead_stride_k;
+    kargs.nhead_stride_v = nhead_stride_v;
+    kargs.nhead_stride_o = nhead_stride_o;
+  }
+
+  __host__ static constexpr void InitKargsCommonBias(
+      KargsCommon& kargs,
+      const void* bias_ptr,
+      ck::index_t stride_bias,
+      ck::index_t nhead_stride_bias) {
+    kargs.bias_ptr = reinterpret_cast<const BiasDataType*>(bias_ptr);
+    kargs.stride_bias = stride_bias;
+    kargs.nhead_stride_bias = nhead_stride_bias;
+  }
+
+  // initialize kernel arguments for batch mode
+  __host__ static constexpr auto MakeKargs(
+      const void* q_ptr,
+      const void* k_ptr,
+      const void* v_ptr,
+      void* o_ptr,
+      ck::index_t seqlen_q,
+      ck::index_t seqlen_k,
+      ck::index_t hdim_q,
+      ck::index_t hdim_v,
+      float scale,
+      ck::index_t stride_q,
+      ck::index_t stride_k,
+      ck::index_t stride_v,
+      ck::index_t stride_o,
+      ck::index_t nhead_stride_q,
+      ck::index_t nhead_stride_k,
+      ck::index_t nhead_stride_v,
+      ck::index_t nhead_stride_o,
+      ck::index_t batch_stride_q,
+      ck::index_t batch_stride_k,
+      ck::index_t batch_stride_v,
+      ck::index_t batch_stride_o,
+      std::optional<
+          std::tuple<const void*, ck::index_t, ck::index_t, ck::index_t>> bias =
+          std::nullopt) {
+    KargsBatchMode kargs;
+
+    InitKargsCommon(
+        kargs,
+        q_ptr,
+        k_ptr,
+        v_ptr,
+        o_ptr,
+        seqlen_q,
+        seqlen_k,
+        hdim_q,
+        hdim_v,
+        scale,
+        stride_q,
+        stride_k,
+        stride_v,
+        stride_o,
+        nhead_stride_q,
+        nhead_stride_k,
+        nhead_stride_v,
+        nhead_stride_o);
+
+    kargs.batch_stride_q = batch_stride_q;
+    kargs.batch_stride_k = batch_stride_k;
+    kargs.batch_stride_v = batch_stride_v;
+    kargs.batch_stride_o = batch_stride_o;
+
+    if (bias.has_value()) {
+      InitKargsCommonBias(
+          kargs, std::get<0>(*bias), std::get<1>(*bias), std::get<2>(*bias));
+
+      kargs.batch_stride_bias = std::get<3>(*bias);
+    }
+
+    return kargs;
+  }
+
+  // initialize kernel arguments for group mode
+  __host__ static constexpr auto MakeKargs(
+      const void* q_ptr,
+      const void* k_ptr,
+      const void* v_ptr,
+      void* o_ptr,
+      const void* seqstart_q_ptr,
+      const void* seqstart_k_ptr,
+      const void* seqlen_k_ptr,
+      ck::index_t hdim_q,
+      ck::index_t hdim_v,
+      float scale,
+      ck::index_t stride_q,
+      ck::index_t stride_k,
+      ck::index_t stride_v,
+      ck::index_t stride_o,
+      ck::index_t nhead_stride_q,
+      ck::index_t nhead_stride_k,
+      ck::index_t nhead_stride_v,
+      ck::index_t nhead_stride_o,
+      std::optional<std::tuple<const void*, ck::index_t, ck::index_t>> bias =
+          std::nullopt) {
+    KargsGroupMode kargs;
+
+    InitKargsCommon(
+        kargs,
+        q_ptr,
+        k_ptr,
+        v_ptr,
+        o_ptr,
+        -1, // seqlen_q will be updated inside the kernel
+        -1, // seqlen_k will be updated inside the kernel
+        hdim_q,
+        hdim_v,
+        scale,
+        stride_q,
+        stride_k,
+        stride_v,
+        stride_o,
+        nhead_stride_q,
+        nhead_stride_k,
+        nhead_stride_v,
+        nhead_stride_o);
+
+    if (bias.has_value()) {
+      InitKargsCommonBias(
+          kargs, std::get<0>(*bias), std::get<1>(*bias), std::get<2>(*bias));
+    }
+
+    kargs.seqstart_q_ptr = reinterpret_cast<const ck::index_t*>(seqstart_q_ptr);
+    kargs.seqstart_k_ptr = reinterpret_cast<const ck::index_t*>(seqstart_k_ptr);
+    kargs.seqlen_k_ptr = reinterpret_cast<const ck::index_t*>(seqlen_k_ptr);
+
+    return kargs;
+  }
+
+  __host__ static constexpr auto GridSize(
+      ck::index_t batch_size_,
+      ck::index_t nhead_,
+      ck::index_t seqlen_q_,
+      ck::index_t hdim_v_) {
+    return TilePartitioner::GridSize(batch_size_, nhead_, seqlen_q_, hdim_v_);
+  }
+
+  __host__ static constexpr auto BlockSize() {
+    return dim3(kBlockSize);
+  }
+
+  __host__ __device__ static constexpr ck::index_t GetSmemSize() {
+    return ck::math::max(
+        FmhaPipeline::GetSmemSize(), EpiloguePipeline::GetSmemSize());
+  }
+
+  template <typename Kargs>
+  __device__ void operator()(Kargs kargs) const {
+    using namespace ck;
+    using namespace ck::tile_program;
+    using namespace ck::tile_program::block;
+
+    // allocate LDS
+    __shared__ char smem_ptr[GetSmemSize()];
+
+    // divide problem
+    const auto [i_tile_m, i_tile_n, i_nhead, i_batch] =
+        TilePartitioner{}(kargs.seqlen_q, kargs.hdim_v);
+
+    const index_t i_m0 =
+        __builtin_amdgcn_readfirstlane(i_tile_m * FmhaPipeline::kM0);
+    const index_t i_n1 =
+        __builtin_amdgcn_readfirstlane(i_tile_n * FmhaPipeline::kN1);
+
+    index_t batch_offset_q = 0;
+    index_t batch_offset_k = 0;
+    index_t batch_offset_v = 0;
+    index_t batch_offset_bias = 0;
+    index_t batch_offset_o = 0;
+
+    if constexpr (is_same_v<Kargs, KargsBatchMode>) {
+      batch_offset_q = i_batch * kargs.batch_stride_q;
+      batch_offset_k = i_batch * kargs.batch_stride_k;
+      batch_offset_v = i_batch * kargs.batch_stride_v;
+      batch_offset_bias = i_batch * kargs.batch_stride_bias;
+      batch_offset_o = i_batch * kargs.batch_stride_o;
+    } else { // is_same_v<Kargs, KargsGroupMode>
+      // get starting offset for each work batch
+      const index_t query_start = kargs.seqstart_q_ptr[i_batch];
+      const index_t key_start = kargs.seqstart_k_ptr[i_batch];
+
+      batch_offset_q = query_start * kargs.stride_q;
+      batch_offset_k = key_start * kargs.stride_k;
+      batch_offset_v = key_start * kargs.stride_v;
+      batch_offset_bias = query_start * kargs.stride_bias + key_start;
+      batch_offset_o = query_start * kargs.stride_o;
+
+      // get real # queries & # keys under group mode
+      const auto adjusted_seqstart_q_ptr = kargs.seqstart_q_ptr + i_batch;
+      kargs.seqlen_q = adjusted_seqstart_q_ptr[1] - adjusted_seqstart_q_ptr[0];
+
+      if (kargs.seqlen_k_ptr != nullptr) {
+        kargs.seqlen_k = kargs.seqlen_k_ptr[i_batch];
+      } else {
+        const auto adjusted_seqstart_k_ptr = kargs.seqstart_k_ptr + i_batch;
+        kargs.seqlen_k =
+            adjusted_seqstart_k_ptr[1] - adjusted_seqstart_k_ptr[0];
+      }
+    }
+
+    // for simplicity, batch stride we just modify the pointer
+    const QDataType* q_ptr =
+        kargs.q_ptr + i_nhead * kargs.nhead_stride_q + batch_offset_q;
+    const KDataType* k_ptr =
+        kargs.k_ptr + i_nhead * kargs.nhead_stride_k + batch_offset_k;
+    const VDataType* v_ptr =
+        kargs.v_ptr + i_nhead * kargs.nhead_stride_v + batch_offset_v;
+    const BiasDataType* bias_ptr = nullptr;
+    if (kargs.bias_ptr != nullptr) {
+      bias_ptr = kargs.bias_ptr + i_nhead * kargs.nhead_stride_bias +
+          batch_offset_bias;
+    }
+    ODataType* o_ptr =
+        kargs.o_ptr + i_nhead * kargs.nhead_stride_o + batch_offset_o;
+
+    // Q/K/V DRAM and DRAM window
+    const auto q_dram = make_naive_tensor_view<AddressSpaceEnum::Global>(
+        q_ptr,
+        make_tuple(kargs.seqlen_q, kargs.hdim_q),
+        make_tuple(kargs.stride_q, 1),
+        Number<32>{},
+        Number<1>{});
+
+    const auto k_dram = make_naive_tensor_view<AddressSpaceEnum::Global>(
+        k_ptr,
+        make_tuple(kargs.seqlen_k, kargs.hdim_q),
+        make_tuple(kargs.stride_k, 1),
+        Number<32>{},
+        Number<1>{});
+
+    const auto v_dram = [&]() {
+      if constexpr (ck::is_same_v<VLayout, ck::tensor_layout::gemm::RowMajor>) {
+        const auto v_dram_tmp =
+            make_naive_tensor_view<AddressSpaceEnum::Global>(
+                v_ptr,
+                make_tuple(kargs.seqlen_k, kargs.hdim_v),
+                make_tuple(kargs.stride_v, 1),
+                Number<32>{},
+                Number<1>{});
+        return transform_tensor_view(
+            v_dram_tmp,
+            make_tuple(
+                make_pass_through_transform(kargs.hdim_v),
+                make_pass_through_transform(kargs.seqlen_k)),
+            make_tuple(Sequence<1>{}, Sequence<0>{}),
+            make_tuple(Sequence<0>{}, Sequence<1>{}));
+      } else {
+        return make_naive_tensor_view<AddressSpaceEnum::Global>(
+            v_ptr,
+            make_tuple(kargs.hdim_v, kargs.seqlen_k),
+            make_tuple(kargs.stride_v, 1),
+            Number<32>{},
+            Number<1>{});
+      }
+    }();
+
+    auto q_dram_window = make_tile_window(
+        q_dram,
+        [&]() {
+          if constexpr (FmhaPipeline::kQLoadOnce)
+            return make_tuple(
+                Number<FmhaPipeline::kM0>{},
+                Number<FmhaPipeline::kK0BlockLength>{});
+          else
+            return make_tuple(
+                Number<FmhaPipeline::kM0>{}, Number<FmhaPipeline::kK0>{});
+        }(),
+        {i_m0, 0});
+
+    auto k_dram_window = make_tile_window(
+        k_dram,
+        make_tuple(Number<FmhaPipeline::kN0>{}, Number<FmhaPipeline::kK0>{}),
+        {0, 0});
+
+    auto v_dram_window = make_tile_window(
+        v_dram,
+        make_tuple(Number<FmhaPipeline::kN1>{}, Number<FmhaPipeline::kK1>{}),
+        {i_n1, 0});
+
+    const auto run_pipeline_with = [&](auto bias_dram_window) {
+      return FmhaPipeline{}(
+          q_dram_window,
+          k_dram_window,
+          v_dram_window,
+          bias_dram_window,
+          kargs.scale,
+          kargs.seqlen_k / FmhaPipeline::kN0,
+          kargs.hdim_q / FmhaPipeline::kK0,
+          smem_ptr);
+    };
+
+    auto o_acc_tile = [&]() {
+      constexpr auto bias_dram_window_lengths =
+          make_tuple(Number<FmhaPipeline::kM0>{}, Number<FmhaPipeline::kN0>{});
+
+      if (bias_ptr != nullptr) {
+        const auto bias_dram = make_naive_tensor_view<AddressSpaceEnum::Global>(
+            bias_ptr,
+            make_tuple(kargs.seqlen_q, kargs.seqlen_k),
+            make_tuple(kargs.stride_bias, 1),
+            Number<32>{},
+            Number<1>{});
+
+        auto bias_dram_window =
+            make_tile_window(bias_dram, bias_dram_window_lengths, {i_m0, 0});
+
+        return run_pipeline_with(bias_dram_window);
+      } else {
+        auto dummy_bias_dram_window =
+            make_null_tile_window(bias_dram_window_lengths);
+
+        return run_pipeline_with(dummy_bias_dram_window);
+      }
+    }();
+
+    // O DRAM and O DRAM window
+    auto o_dram = make_naive_tensor_view<AddressSpaceEnum::Global>(
+        o_ptr,
+        make_tuple(kargs.seqlen_q, kargs.hdim_v),
+        make_tuple(kargs.stride_o, 1),
+        Number<32>{},
+        Number<1>{});
+
+    auto o_dram_window = make_tile_window(
+        o_dram,
+        make_tuple(Number<FmhaPipeline::kM0>{}, Number<FmhaPipeline::kN1>{}),
+        {i_m0, i_n1});
+
+    EpiloguePipeline{}(o_dram_window, o_acc_tile);
+  }
+};
diff --git a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_infer.h b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_infer.h
index f52884e27..478e603ea 100644
--- a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_infer.h
+++ b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_infer.h
@@ -1,21 +1,175 @@
 #pragma once
 
+#include <optional>
 #include <sstream>
 #include <stdexcept>
 
-#include <ck/ck.hpp>
-#include <ck/utility/data_type.hpp>
+#include <ck/host_utility/device_prop.hpp>
+#include <ck/host_utility/kernel_launch.hpp>
+#include <ck/tensor/tensor_view.hpp>
+#include <ck/tensor_description/cluster_descriptor.hpp>
+#include <ck/tensor_description/tensor_descriptor_helper.hpp>
+#include <ck/utility/common_header.hpp>
 
-#include "ck_fmha_params.h"
+#include <ck/tile_program/block_tile_pipeline/block_fmha_pipeline_problem.hpp>
+#include <ck/tile_program/block_tile_pipeline/block_fmha_pipeline_qkvs.hpp>
+#include <ck/tile_program/block_tile_pipeline/block_fmha_pipeline_qkvs_default_policy.hpp>
+#include <ck/tile_program/block_tile_pipeline/block_fmha_pipeline_qr_ks_vs.hpp>
+#include <ck/tile_program/block_tile_pipeline/block_fmha_pipeline_qr_ks_vs_default_policy.hpp>
+#include <ck/tile_program/tile/tile_fmha_shape.hpp>
+
+#include "ck_fmha_op_helper.h"
+#include "ck_fmha_util.h"
+#include "ck_tiled_fmha_forward_kernel.h"
+#include "ck_tiled_fmha_fwd_epilogue.h"
+#include "ck_tiled_fmha_fwd_tile_partitioner.h"
+#include "ck_tiled_fmha_params.h"
 
 template <typename scalar_t, int32_t custom_mask_type, bool has_attn_bias>
 struct grouped_infer_masktype_attnbias_dispatched {
-  static void Run(GroupedForwardParams& param, hipStream_t stream){};
+  using QDataType = scalar_t;
+  using KDataType = scalar_t;
+  using VDataType = scalar_t;
+  using BiasDataType = scalar_t;
+  using SaccDataType = float; // data type for first gemm accumulation
+  using SMPLComputeDataType = float; // data type for reduction, softmax
+  using PDataType = scalar_t; // data type for A matrix of second gemm
+  using OaccDataType = float; // data type for second gemm accumulation
+  using ODataType = scalar_t;
+
+  using VLayout = ck::tensor_layout::gemm::RowMajor;
+
+  using FmhaBlockTileHdim64 = ck::Sequence<128, 64, 32, 64, 32, 64>;
+  using FmhaBlockTileHdim128 = ck::Sequence<128, 128, 32, 128, 32, 128>;
+  using FmhaBlockWarps = ck::Sequence<4, 1, 1>;
+  using FmhaWarpTile = ck::Sequence<32, 32, 16>;
+  using FmhaShapeHDim64 = ck::tile_program::TileFmhaShape<
+      FmhaBlockTileHdim64,
+      FmhaBlockWarps,
+      FmhaWarpTile,
+      FmhaBlockWarps,
+      FmhaWarpTile,
+      VLayout>;
+  using FmhaShapeHDim128 = ck::tile_program::TileFmhaShape<
+      FmhaBlockTileHdim128,
+      FmhaBlockWarps,
+      FmhaWarpTile,
+      FmhaBlockWarps,
+      FmhaWarpTile,
+      VLayout>;
+
+  using FmhaTilePartitionerHDim64 = FmhaFwdTilePartitioner<FmhaShapeHDim64>;
+  using FmhaTilePartitionerHDim128 = FmhaFwdTilePartitioner<FmhaShapeHDim128>;
+  using FmhaPipelineProblemHDim64 =
+      ck::tile_program::block::BlockFmhaPipelineProblem<
+          QDataType,
+          KDataType,
+          VDataType,
+          SaccDataType,
+          SMPLComputeDataType,
+          BiasDataType,
+          PDataType,
+          OaccDataType,
+          ODataType,
+          256, // BlockSize
+          FmhaShapeHDim64>;
+  using FmhaPipelineProblemHDim128 =
+      ck::tile_program::block::BlockFmhaPipelineProblem<
+          QDataType,
+          KDataType,
+          VDataType,
+          SaccDataType,
+          SMPLComputeDataType,
+          BiasDataType,
+          PDataType,
+          OaccDataType,
+          ODataType,
+          256, // BlockSize
+          FmhaShapeHDim128>;
+
+  using FmhaPipelineHDim64 = ck::tile_program::block::BlockFmhaPipelineQRKSVS<
+      FmhaPipelineProblemHDim64>;
+  using FmhaPipelineHDim128 = ck::tile_program::block::BlockFmhaPipelineQRKSVS<
+      FmhaPipelineProblemHDim128>;
+
+  using FmhaEpilogue =
+      FmhaFwdEpilogue<FmhaFwdEpilogueProblem<OaccDataType, ODataType>>;
+  using FmhaKernelHDim64 = FmhaFwdKernel<
+      FmhaTilePartitionerHDim64,
+      FmhaPipelineHDim64,
+      FmhaEpilogue>;
+  using FmhaKernelHDim128 = FmhaFwdKernel<
+      FmhaTilePartitionerHDim128,
+      FmhaPipelineHDim128,
+      FmhaEpilogue>;
+
+#ifndef GROUPED_INFER_HEADDIM_SWITCH
+#define GROUPED_INFER_HEADDIM_SWITCH(HEAD_DIM1, HEAD_DIM2, ...)  \
+  [&] {                                                          \
+    if (HEAD_DIM1 == HEAD_DIM2 && HEAD_DIM2 == 64) {             \
+      using FmhaKernel = FmhaKernelHDim64;                       \
+      __VA_ARGS__();                                             \
+    } else if (HEAD_DIM1 == HEAD_DIM2 && HEAD_DIM2 == 128) {     \
+      using FmhaKernel = FmhaKernelHDim128;                      \
+      __VA_ARGS__();                                             \
+    } else {                                                     \
+      throw std::runtime_error("Head-dim sizes not supported!"); \
+    }                                                            \
+  }()
+#endif
+
+  static void Run(GroupedForwardParams& param, hipStream_t stream) {
+    GROUPED_INFER_HEADDIM_SWITCH(
+        param.K, param.Kv, [&] { RunWithKernel<FmhaKernel>(param, stream); });
+  };
+
+  template <typename FmhaKernel>
+  static void RunWithKernel(GroupedForwardParams& param, hipStream_t stream) {
+    dim3 kGridSize = FmhaKernel::GridSize(1, param.Hq, param.M, param.Kv);
+    constexpr dim3 kBlockSize = FmhaKernel::BlockSize();
+
+    constexpr ck::index_t kWarpPerCu = 8; // 2 warps per SIMD
+    constexpr ck::index_t kWarpPerBlock = kBlockSize.x / warpSize;
+    constexpr ck::index_t kBlockPerCu = kWarpPerCu / kWarpPerBlock;
+
+    std::optional<std::tuple<const void*, ck::index_t, ck::index_t>> bias;
+
+    if (param.has_attn_bias) {
+      bias = std::make_tuple(
+          param.attn_bias_ptr,
+          param.attn_bias_strides[2],
+          param.attn_bias_strides[1]);
+    };
+
+    auto kargs = FmhaKernel::MakeKargs(
+        param.q_ptr,
+        param.k_ptr,
+        param.v_ptr,
+        param.out_ptr,
+        param.seqstart_q_dev_ptr,
+        param.seqstart_k_dev_ptr,
+        param.seqlen_k_dev_ptr,
+        param.K, // hdim_q
+        param.Kv, // hdim_v
+        param.scale,
+        param.q_strides[1], // q, k, v, out tensor seq-dim stride
+        param.k_strides[1],
+        param.v_strides[1],
+        param.out_strides[1],
+        param.q_strides[2], // q, k, v, out tensor head-dim stride
+        param.k_strides[2],
+        param.v_strides[2],
+        param.out_strides[2],
+        bias);
 
-  template <typename DeviceOpInstance>
-  static void RunWithDeviceOp(
-      GroupedForwardParams& param,
-      hipStream_t stream){};
+    (void)launch_kernel<kBlockSize.x, kBlockPerCu>(
+        StreamConfig{stream, false},
+        FmhaKernel{},
+        kGridSize,
+        kBlockSize,
+        0,
+        kargs);
+  };
 };
 
 template <typename scalar_t, int32_t custom_mask_type, bool has_attn_bias>
diff --git a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_params.h b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_params.h
new file mode 100644
index 000000000..e07f711ac
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_params.h
@@ -0,0 +1,207 @@
+#pragma once
+
+#include <array>
+#include <cstdint>
+
+struct BatchedInferParams {
+  int B; // batch size
+  int M; // seq_len for Query
+  int N; // seq_len for Key and Value
+  int Hq; // number of heads for Query
+  int Hkv; // number of heads for Key and Value
+  int K; // embed_dim for Query and Key
+  int Kv; // embed_dim for Value
+
+  float scale;
+  bool has_attn_bias;
+
+  // BMHK mode strides
+  std::array<int, 4> q_strides;
+  std::array<int, 4> k_strides;
+  std::array<int, 4> v_strides;
+  std::array<int, 4> out_strides;
+  std::array<int, 4> attn_bias_strides; // 4d tensor_view [B, H, M, N]
+
+  const void* q_ptr;
+  const void* k_ptr;
+  const void* v_ptr;
+  const void* attn_bias_ptr;
+
+  uint8_t custom_mask_type;
+
+  void* out_ptr;
+};
+
+struct BatchedForwardParams : public BatchedInferParams {
+  bool use_dropout;
+  bool compute_logsumexp;
+
+  float dropout_prob;
+  int64_t philox_seed;
+  int64_t philox_offset;
+
+  // completely contiguous
+  void* logsumexp_ptr;
+};
+
+struct GroupedInferParams {
+  int num_batches;
+  int M; // total seq_len for all queries in the batch
+  int N; // total seq_len for all keys/values in the batch
+  int Hq; // number of heads for Query
+  int Hkv; // number of heads for Key and Value
+  int K; // embed_dim for Query and Key
+  int Kv; // embed_dim for Value
+
+  int max_seqlen_q;
+
+  void* seqstart_q_dev_ptr;
+  void* seqstart_k_dev_ptr;
+  void* seqlen_k_dev_ptr;
+
+  float scale;
+  bool has_attn_bias;
+
+  // MHK mode strides, last-dim contiguous
+  std::array<int, 3> q_strides;
+  std::array<int, 3> k_strides;
+  std::array<int, 3> v_strides;
+  std::array<int, 3> out_strides;
+
+  // 4d tensor view [B, H, M, N]
+  std::array<int, 4> attn_bias_strides;
+
+  const void* q_ptr;
+  const void* k_ptr;
+  const void* v_ptr;
+  const void* attn_bias_ptr;
+
+  uint8_t custom_mask_type;
+
+  void* out_ptr;
+};
+
+struct GroupedForwardParams : public GroupedInferParams {
+  bool use_dropout;
+  bool compute_logsumexp;
+
+  float dropout_prob;
+  int64_t philox_seed;
+  int64_t philox_offset;
+
+  // completely contiguous
+  std::vector<void*> logsumexp_ptrs;
+
+  // TODO: need remove this after dev-op fix
+  std::vector<void*> randvals_ptrs;
+};
+
+struct BatchedBackwardParams {
+  int B; // batch size
+  int M; // seq_len for Query
+  int N; // seq_len for Key and Value
+  int Hq; // number of heads for Query
+  int Hkv; // number of heads for Key and Value
+  int K; // embed_dim for Query and Key
+  int Kv; // embed_dim for Value
+
+  float scale;
+  bool has_attn_bias;
+  bool bias_has_grad;
+
+  bool use_fp32_qkv_grad;
+  bool is_mqa_gqa;
+
+  // BMHK mode strides, last-dim contiguous
+  std::array<int, 4> q_strides;
+  std::array<int, 4> k_strides;
+  std::array<int, 4> v_strides;
+  std::array<int, 4> attn_bias_strides; // 4d tensor_view [B, H, M, N]
+  std::array<int, 4> out_strides;
+
+  std::array<int, 4> tmp_grad_k_strides;
+  std::array<int, 4> tmp_grad_v_strides;
+
+  const void* q_ptr;
+  const void* k_ptr;
+  const void* v_ptr;
+  const void* attn_bias_ptr;
+  const void* grad_out_ptr;
+  const void* out_ptr;
+
+  uint8_t custom_mask_type;
+
+  void* grad_q_ptr;
+  void* grad_k_ptr;
+  void* grad_v_ptr;
+  void* grad_bias_ptr;
+
+  float dropout_prob;
+  int64_t philox_seed;
+  int64_t philox_offset;
+
+  // BHM mode lengths, completely contiguous
+  const void* logsumexp_ptr;
+};
+
+struct GroupedBackwardParams {
+  int num_batches;
+  int M; // total seq_len for all queries in the batch
+  int N; // total seq_len for all keys/values in the batch
+  int Hq; // number of heads for Query
+  int Hkv; // number of heads for Key and Value
+  int K; // embed_dim for Query and Key
+  int Kv; // embed_dim for Value
+
+  int max_seqlen_q;
+
+  std::vector<int> host_seqstart_q;
+  std::vector<int> host_seqstart_k;
+  std::vector<int> host_seqlen_k;
+
+  float scale;
+  bool has_attn_bias;
+  bool bias_has_grad;
+
+  bool use_fp32_qkv_grad;
+  bool is_mqa_gqa;
+
+  // MHK mode strides, last-dim contiguous
+  std::array<int, 3> q_strides;
+  std::array<int, 3> k_strides;
+  std::array<int, 3> v_strides;
+  std::array<int, 3> out_strides;
+  // 4d tensor view [B, H, M, N]
+  std::array<int, 4> attn_bias_strides;
+
+  std::array<int, 3> tmp_grad_k_strides;
+  std::array<int, 3> tmp_grad_v_strides;
+
+  std::vector<const void*> q_ptrs;
+  std::vector<const void*> k_ptrs;
+  std::vector<const void*> v_ptrs;
+  std::vector<const void*> attn_bias_ptrs;
+  std::vector<const void*> grad_out_ptrs;
+  std::vector<const void*> out_ptrs;
+
+  // used by the light_v2 kernel
+  // TODO use these as workspace
+  std::vector<void*> ydotdy_ptrs;
+
+  uint8_t custom_mask_type;
+
+  std::vector<void*> grad_q_ptrs;
+  std::vector<void*> grad_k_ptrs;
+  std::vector<void*> grad_v_ptrs;
+  std::vector<void*> grad_bias_ptrs;
+
+  float dropout_prob;
+  int64_t philox_seed;
+  int64_t philox_offset;
+
+  // BHM mode lengths, completely contiguous
+  std::vector<const void*> logsumexp_ptrs;
+
+  // TODO: need remove this after dev-op fix
+  std::vector<void*> randvals_ptrs;
+};

From 17ca15e11447b46beb9aaedf82fa08bf59f08a4d Mon Sep 17 00:00:00 2001
From: Qianfeng Zhang <Qianfeng.Zhang@amd.com>
Date: Mon, 27 Nov 2023 17:53:30 +0000
Subject: [PATCH 242/641] Remove the using of has_attn_bias as template for
 ck-tiled infer

---
 .../hip_fmha/ck_tiled_fmha_batched_infer.h    | 12 ++---
 .../ck_tiled_fmha_batched_infer_fp16.cpp      | 54 +++++--------------
 .../hip_fmha/ck_tiled_fmha_grouped_infer.h    | 10 ++--
 .../ck_tiled_fmha_grouped_infer_fp16.cpp      | 54 +++++--------------
 ...led_fmha_batched_infer_fp16_masktype_0.cpp |  7 +++
 ...ched_infer_fp16_masktype_0_no_attnbias.cpp |  8 ---
 ...ed_infer_fp16_masktype_0_with_attnbias.cpp |  8 ---
 ...led_fmha_batched_infer_fp16_masktype_1.cpp |  7 +++
 ...ched_infer_fp16_masktype_1_no_attnbias.cpp |  8 ---
 ...ed_infer_fp16_masktype_1_with_attnbias.cpp |  8 ---
 ...led_fmha_batched_infer_fp16_masktype_2.cpp |  7 +++
 ...ched_infer_fp16_masktype_2_no_attnbias.cpp |  8 ---
 ...ed_infer_fp16_masktype_2_with_attnbias.cpp |  8 ---
 ...led_fmha_grouped_infer_fp16_masktype_0.cpp |  7 +++
 ...uped_infer_fp16_masktype_0_no_attnbias.cpp |  8 ---
 ...ed_infer_fp16_masktype_0_with_attnbias.cpp |  8 ---
 ...led_fmha_grouped_infer_fp16_masktype_1.cpp |  7 +++
 ...uped_infer_fp16_masktype_1_no_attnbias.cpp |  8 ---
 ...ed_infer_fp16_masktype_1_with_attnbias.cpp |  8 ---
 ...led_fmha_grouped_infer_fp16_masktype_2.cpp |  7 +++
 ...uped_infer_fp16_masktype_2_no_attnbias.cpp |  8 ---
 ...ed_infer_fp16_masktype_2_with_attnbias.cpp |  8 ---
 22 files changed, 79 insertions(+), 189 deletions(-)
 create mode 100644 xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_fp16_masktype_0.cpp
 delete mode 100644 xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_fp16_masktype_0_no_attnbias.cpp
 delete mode 100644 xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_fp16_masktype_0_with_attnbias.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_fp16_masktype_1.cpp
 delete mode 100644 xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_fp16_masktype_1_no_attnbias.cpp
 delete mode 100644 xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_fp16_masktype_1_with_attnbias.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_fp16_masktype_2.cpp
 delete mode 100644 xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_fp16_masktype_2_no_attnbias.cpp
 delete mode 100644 xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_fp16_masktype_2_with_attnbias.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_fp16_masktype_0.cpp
 delete mode 100644 xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_fp16_masktype_0_no_attnbias.cpp
 delete mode 100644 xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_fp16_masktype_0_with_attnbias.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_fp16_masktype_1.cpp
 delete mode 100644 xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_fp16_masktype_1_no_attnbias.cpp
 delete mode 100644 xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_fp16_masktype_1_with_attnbias.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_fp16_masktype_2.cpp
 delete mode 100644 xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_fp16_masktype_2_no_attnbias.cpp
 delete mode 100644 xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_fp16_masktype_2_with_attnbias.cpp

diff --git a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_infer.h b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_infer.h
index d6fa248bb..543a7ac7f 100644
--- a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_infer.h
+++ b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_infer.h
@@ -18,12 +18,12 @@
 #include <ck/tile_program/block_tile_pipeline/block_fmha_pipeline_qr_ks_vs_default_policy.hpp>
 #include <ck/tile_program/tile/tile_fmha_shape.hpp>
 
-#include "ck_tiled_fmha_params.h"
 #include "ck_tiled_fmha_forward_kernel.h"
 #include "ck_tiled_fmha_fwd_epilogue.h"
 #include "ck_tiled_fmha_fwd_tile_partitioner.h"
+#include "ck_tiled_fmha_params.h"
 
-template <typename scalar_t, int32_t custom_mask_type, bool has_attn_bias>
+template <typename scalar_t, int32_t custom_mask_type>
 struct batched_infer_masktype_attnbias_dispatched {
   using QDataType = scalar_t;
   using KDataType = scalar_t;
@@ -175,12 +175,10 @@ struct batched_infer_masktype_attnbias_dispatched {
   };
 };
 
-template <typename scalar_t, int32_t custom_mask_type, bool has_attn_bias>
+template <typename scalar_t, int32_t custom_mask_type>
 void run_batched_infer_masktype_attnbias_dispatched(
     BatchedForwardParams& param,
     hipStream_t stream) {
-  batched_infer_masktype_attnbias_dispatched<
-      scalar_t,
-      custom_mask_type,
-      has_attn_bias>::Run(param, stream);
+  batched_infer_masktype_attnbias_dispatched<scalar_t, custom_mask_type>::Run(
+      param, stream);
 };
diff --git a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_infer_fp16.cpp b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_infer_fp16.cpp
index 5814b7391..bb4fa6d91 100644
--- a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_infer_fp16.cpp
+++ b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_infer_fp16.cpp
@@ -7,52 +7,26 @@
 
 extern template void run_batched_infer_masktype_attnbias_dispatched<
     ck::half_t,
-    0,
-    true>(BatchedForwardParams& param, hipStream_t stream);
+    0>(BatchedForwardParams& param, hipStream_t stream);
 
 extern template void run_batched_infer_masktype_attnbias_dispatched<
     ck::half_t,
-    0,
-    false>(BatchedForwardParams& param, hipStream_t stream);
+    1>(BatchedForwardParams& param, hipStream_t stream);
 
 extern template void run_batched_infer_masktype_attnbias_dispatched<
     ck::half_t,
-    1,
-    true>(BatchedForwardParams& param, hipStream_t stream);
-
-extern template void run_batched_infer_masktype_attnbias_dispatched<
-    ck::half_t,
-    1,
-    false>(BatchedForwardParams& param, hipStream_t stream);
-
-extern template void run_batched_infer_masktype_attnbias_dispatched<
-    ck::half_t,
-    2,
-    true>(BatchedForwardParams& param, hipStream_t stream);
-
-extern template void run_batched_infer_masktype_attnbias_dispatched<
-    ck::half_t,
-    2,
-    false>(BatchedForwardParams& param, hipStream_t stream);
+    2>(BatchedForwardParams& param, hipStream_t stream);
 
 void batched_infer_fp16(BatchedForwardParams& param, hipStream_t stream) {
-  BOOL_SWITCH_1(param.has_attn_bias, HAS_ATTN_BIAS, [&] {
-    if (param.custom_mask_type == 0)
-      run_batched_infer_masktype_attnbias_dispatched<
-          ck::half_t,
-          0,
-          HAS_ATTN_BIAS>(param, stream);
-    else if (param.custom_mask_type == 1)
-      run_batched_infer_masktype_attnbias_dispatched<
-          ck::half_t,
-          1,
-          HAS_ATTN_BIAS>(param, stream);
-    else if (param.custom_mask_type == 2)
-      run_batched_infer_masktype_attnbias_dispatched<
-          ck::half_t,
-          2,
-          HAS_ATTN_BIAS>(param, stream);
-    else
-      throw std::runtime_error("Invalid custom_mask_type value");
-  });
+  if (param.custom_mask_type == 0)
+    run_batched_infer_masktype_attnbias_dispatched<ck::half_t, 0>(
+        param, stream);
+  else if (param.custom_mask_type == 1)
+    run_batched_infer_masktype_attnbias_dispatched<ck::half_t, 1>(
+        param, stream);
+  else if (param.custom_mask_type == 2)
+    run_batched_infer_masktype_attnbias_dispatched<ck::half_t, 2>(
+        param, stream);
+  else
+    throw std::runtime_error("Invalid custom_mask_type value");
 };
diff --git a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_infer.h b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_infer.h
index 478e603ea..b58bcfafb 100644
--- a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_infer.h
+++ b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_infer.h
@@ -25,7 +25,7 @@
 #include "ck_tiled_fmha_fwd_tile_partitioner.h"
 #include "ck_tiled_fmha_params.h"
 
-template <typename scalar_t, int32_t custom_mask_type, bool has_attn_bias>
+template <typename scalar_t, int32_t custom_mask_type>
 struct grouped_infer_masktype_attnbias_dispatched {
   using QDataType = scalar_t;
   using KDataType = scalar_t;
@@ -172,12 +172,10 @@ struct grouped_infer_masktype_attnbias_dispatched {
   };
 };
 
-template <typename scalar_t, int32_t custom_mask_type, bool has_attn_bias>
+template <typename scalar_t, int32_t custom_mask_type>
 void run_grouped_infer_masktype_attnbias_dispatched(
     GroupedForwardParams& param,
     hipStream_t stream) {
-  grouped_infer_masktype_attnbias_dispatched<
-      scalar_t,
-      custom_mask_type,
-      has_attn_bias>::Run(param, stream);
+  grouped_infer_masktype_attnbias_dispatched<scalar_t, custom_mask_type>::Run(
+      param, stream);
 };
diff --git a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_infer_fp16.cpp b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_infer_fp16.cpp
index 009571c97..3954ee4ff 100644
--- a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_infer_fp16.cpp
+++ b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_infer_fp16.cpp
@@ -7,52 +7,26 @@
 
 extern template void run_grouped_infer_masktype_attnbias_dispatched<
     ck::half_t,
-    0,
-    true>(GroupedForwardParams& param, hipStream_t stream);
+    0>(GroupedForwardParams& param, hipStream_t stream);
 
 extern template void run_grouped_infer_masktype_attnbias_dispatched<
     ck::half_t,
-    0,
-    false>(GroupedForwardParams& param, hipStream_t stream);
+    1>(GroupedForwardParams& param, hipStream_t stream);
 
 extern template void run_grouped_infer_masktype_attnbias_dispatched<
     ck::half_t,
-    1,
-    true>(GroupedForwardParams& param, hipStream_t stream);
-
-extern template void run_grouped_infer_masktype_attnbias_dispatched<
-    ck::half_t,
-    1,
-    false>(GroupedForwardParams& param, hipStream_t stream);
-
-extern template void run_grouped_infer_masktype_attnbias_dispatched<
-    ck::half_t,
-    2,
-    true>(GroupedForwardParams& param, hipStream_t stream);
-
-extern template void run_grouped_infer_masktype_attnbias_dispatched<
-    ck::half_t,
-    2,
-    false>(GroupedForwardParams& param, hipStream_t stream);
+    2>(GroupedForwardParams& param, hipStream_t stream);
 
 void grouped_infer_fp16(GroupedForwardParams& param, hipStream_t stream) {
-  BOOL_SWITCH_1(param.has_attn_bias, HAS_ATTN_BIAS, [&] {
-    if (param.custom_mask_type == 0)
-      run_grouped_infer_masktype_attnbias_dispatched<
-          ck::half_t,
-          0,
-          HAS_ATTN_BIAS>(param, stream);
-    else if (param.custom_mask_type == 1)
-      run_grouped_infer_masktype_attnbias_dispatched<
-          ck::half_t,
-          1,
-          HAS_ATTN_BIAS>(param, stream);
-    else if (param.custom_mask_type == 2)
-      run_grouped_infer_masktype_attnbias_dispatched<
-          ck::half_t,
-          2,
-          HAS_ATTN_BIAS>(param, stream);
-    else
-      throw std::runtime_error("Invalid custom_mask_type value");
-  });
+  if (param.custom_mask_type == 0)
+    run_grouped_infer_masktype_attnbias_dispatched<ck::half_t, 0>(
+        param, stream);
+  else if (param.custom_mask_type == 1)
+    run_grouped_infer_masktype_attnbias_dispatched<ck::half_t, 1>(
+        param, stream);
+  else if (param.custom_mask_type == 2)
+    run_grouped_infer_masktype_attnbias_dispatched<ck::half_t, 2>(
+        param, stream);
+  else
+    throw std::runtime_error("Invalid custom_mask_type value");
 };
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_fp16_masktype_0.cpp b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_fp16_masktype_0.cpp
new file mode 100644
index 000000000..2915b07ed
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_fp16_masktype_0.cpp
@@ -0,0 +1,7 @@
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_batched_infer.h"
+
+template void run_batched_infer_masktype_attnbias_dispatched<ck::half_t, 0>(
+    BatchedForwardParams& param,
+    hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_fp16_masktype_0_no_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_fp16_masktype_0_no_attnbias.cpp
deleted file mode 100644
index e9959f237..000000000
--- a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_fp16_masktype_0_no_attnbias.cpp
+++ /dev/null
@@ -1,8 +0,0 @@
-#include <ck/ck.hpp>
-
-#include "ck_tiled_fmha_batched_infer.h"
-
-template void run_batched_infer_masktype_attnbias_dispatched<
-    ck::half_t,
-    0,
-    false>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_fp16_masktype_0_with_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_fp16_masktype_0_with_attnbias.cpp
deleted file mode 100644
index 6c46ed45f..000000000
--- a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_fp16_masktype_0_with_attnbias.cpp
+++ /dev/null
@@ -1,8 +0,0 @@
-#include <ck/ck.hpp>
-
-#include "ck_tiled_fmha_batched_infer.h"
-
-template void run_batched_infer_masktype_attnbias_dispatched<
-    ck::half_t,
-    0,
-    true>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_fp16_masktype_1.cpp b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_fp16_masktype_1.cpp
new file mode 100644
index 000000000..8d7f2bbf8
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_fp16_masktype_1.cpp
@@ -0,0 +1,7 @@
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_batched_infer.h"
+
+template void run_batched_infer_masktype_attnbias_dispatched<ck::half_t, 1>(
+    BatchedForwardParams& param,
+    hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_fp16_masktype_1_no_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_fp16_masktype_1_no_attnbias.cpp
deleted file mode 100644
index aefdd2804..000000000
--- a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_fp16_masktype_1_no_attnbias.cpp
+++ /dev/null
@@ -1,8 +0,0 @@
-#include <ck/ck.hpp>
-
-#include "ck_tiled_fmha_batched_infer.h"
-
-template void run_batched_infer_masktype_attnbias_dispatched<
-    ck::half_t,
-    1,
-    false>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_fp16_masktype_1_with_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_fp16_masktype_1_with_attnbias.cpp
deleted file mode 100644
index 61b94d6ad..000000000
--- a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_fp16_masktype_1_with_attnbias.cpp
+++ /dev/null
@@ -1,8 +0,0 @@
-#include <ck/ck.hpp>
-
-#include "ck_tiled_fmha_batched_infer.h"
-
-template void run_batched_infer_masktype_attnbias_dispatched<
-    ck::half_t,
-    1,
-    true>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_fp16_masktype_2.cpp b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_fp16_masktype_2.cpp
new file mode 100644
index 000000000..b608b8939
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_fp16_masktype_2.cpp
@@ -0,0 +1,7 @@
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_batched_infer.h"
+
+template void run_batched_infer_masktype_attnbias_dispatched<ck::half_t, 2>(
+    BatchedForwardParams& param,
+    hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_fp16_masktype_2_no_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_fp16_masktype_2_no_attnbias.cpp
deleted file mode 100644
index 720a9c2fc..000000000
--- a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_fp16_masktype_2_no_attnbias.cpp
+++ /dev/null
@@ -1,8 +0,0 @@
-#include <ck/ck.hpp>
-
-#include "ck_tiled_fmha_batched_infer.h"
-
-template void run_batched_infer_masktype_attnbias_dispatched<
-    ck::half_t,
-    2,
-    false>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_fp16_masktype_2_with_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_fp16_masktype_2_with_attnbias.cpp
deleted file mode 100644
index 75daaaa07..000000000
--- a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_fp16_masktype_2_with_attnbias.cpp
+++ /dev/null
@@ -1,8 +0,0 @@
-#include <ck/ck.hpp>
-
-#include "ck_tiled_fmha_batched_infer.h"
-
-template void run_batched_infer_masktype_attnbias_dispatched<
-    ck::half_t,
-    2,
-    true>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_fp16_masktype_0.cpp b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_fp16_masktype_0.cpp
new file mode 100644
index 000000000..8117f8b58
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_fp16_masktype_0.cpp
@@ -0,0 +1,7 @@
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_grouped_infer.h"
+
+template void run_grouped_infer_masktype_attnbias_dispatched<ck::half_t, 0>(
+    GroupedForwardParams& param,
+    hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_fp16_masktype_0_no_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_fp16_masktype_0_no_attnbias.cpp
deleted file mode 100644
index 96d0f992e..000000000
--- a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_fp16_masktype_0_no_attnbias.cpp
+++ /dev/null
@@ -1,8 +0,0 @@
-#include <ck/ck.hpp>
-
-#include "ck_tiled_fmha_grouped_infer.h"
-
-template void run_grouped_infer_masktype_attnbias_dispatched<
-    ck::half_t,
-    0,
-    false>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_fp16_masktype_0_with_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_fp16_masktype_0_with_attnbias.cpp
deleted file mode 100644
index adeee9880..000000000
--- a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_fp16_masktype_0_with_attnbias.cpp
+++ /dev/null
@@ -1,8 +0,0 @@
-#include <ck/ck.hpp>
-
-#include "ck_tiled_fmha_grouped_infer.h"
-
-template void run_grouped_infer_masktype_attnbias_dispatched<
-    ck::half_t,
-    0,
-    true>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_fp16_masktype_1.cpp b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_fp16_masktype_1.cpp
new file mode 100644
index 000000000..d1b93e583
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_fp16_masktype_1.cpp
@@ -0,0 +1,7 @@
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_grouped_infer.h"
+
+template void run_grouped_infer_masktype_attnbias_dispatched<ck::half_t, 1>(
+    GroupedForwardParams& param,
+    hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_fp16_masktype_1_no_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_fp16_masktype_1_no_attnbias.cpp
deleted file mode 100644
index f3843a8ed..000000000
--- a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_fp16_masktype_1_no_attnbias.cpp
+++ /dev/null
@@ -1,8 +0,0 @@
-#include <ck/ck.hpp>
-
-#include "ck_tiled_fmha_grouped_infer.h"
-
-template void run_grouped_infer_masktype_attnbias_dispatched<
-    ck::half_t,
-    1,
-    false>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_fp16_masktype_1_with_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_fp16_masktype_1_with_attnbias.cpp
deleted file mode 100644
index bae1535a3..000000000
--- a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_fp16_masktype_1_with_attnbias.cpp
+++ /dev/null
@@ -1,8 +0,0 @@
-#include <ck/ck.hpp>
-
-#include "ck_tiled_fmha_grouped_infer.h"
-
-template void run_grouped_infer_masktype_attnbias_dispatched<
-    ck::half_t,
-    1,
-    true>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_fp16_masktype_2.cpp b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_fp16_masktype_2.cpp
new file mode 100644
index 000000000..246b90a77
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_fp16_masktype_2.cpp
@@ -0,0 +1,7 @@
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_grouped_infer.h"
+
+template void run_grouped_infer_masktype_attnbias_dispatched<ck::half_t, 2>(
+    GroupedForwardParams& param,
+    hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_fp16_masktype_2_no_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_fp16_masktype_2_no_attnbias.cpp
deleted file mode 100644
index 768082654..000000000
--- a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_fp16_masktype_2_no_attnbias.cpp
+++ /dev/null
@@ -1,8 +0,0 @@
-#include <ck/ck.hpp>
-
-#include "ck_tiled_fmha_grouped_infer.h"
-
-template void run_grouped_infer_masktype_attnbias_dispatched<
-    ck::half_t,
-    2,
-    false>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_fp16_masktype_2_with_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_fp16_masktype_2_with_attnbias.cpp
deleted file mode 100644
index ac11a4eea..000000000
--- a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_fp16_masktype_2_with_attnbias.cpp
+++ /dev/null
@@ -1,8 +0,0 @@
-#include <ck/ck.hpp>
-
-#include "ck_tiled_fmha_grouped_infer.h"
-
-template void run_grouped_infer_masktype_attnbias_dispatched<
-    ck::half_t,
-    2,
-    true>(GroupedForwardParams& param, hipStream_t stream);

From 0cf0d3df720fdee6578476c0fc895c046f53cd96 Mon Sep 17 00:00:00 2001
From: Qianfeng Zhang <Qianfeng.Zhang@amd.com>
Date: Wed, 29 Nov 2023 16:00:16 +0000
Subject: [PATCH 243/641] Add clang-format file to control clang-format-10

---
 .clang-format | 80 ++++++++++++++++++++++++++-------------------------
 1 file changed, 41 insertions(+), 39 deletions(-)

diff --git a/.clang-format b/.clang-format
index 6d0ab740d..22f267496 100644
--- a/.clang-format
+++ b/.clang-format
@@ -1,80 +1,81 @@
 ---
-AccessModifierOffset: -1
-AlignAfterOpenBracket: AlwaysBreak
-AlignConsecutiveAssignments: false
+Language:        Cpp
+AccessModifierOffset: 0
+AlignAfterOpenBracket: Align
+AlignConsecutiveAssignments: true
 AlignConsecutiveDeclarations: false
 AlignEscapedNewlinesLeft: true
-AlignOperands:   false
-AlignTrailingComments: false
-AllowAllParametersOfDeclarationOnNextLine: false
-AllowShortBlocksOnASingleLine: false
-AllowShortCaseLabelsOnASingleLine: false
-AllowShortFunctionsOnASingleLine: Empty
+AlignOperands:   true
+AlignTrailingComments: true
+AllowAllParametersOfDeclarationOnNextLine: true
+AllowShortBlocksOnASingleLine: true
+AllowShortCaseLabelsOnASingleLine: true
+AllowShortFunctionsOnASingleLine: All
 AllowShortIfStatementsOnASingleLine: false
 AllowShortLoopsOnASingleLine: false
+AlwaysBreakAfterDefinitionReturnType: None
 AlwaysBreakAfterReturnType: None
-AlwaysBreakBeforeMultilineStrings: true
+AlwaysBreakBeforeMultilineStrings: false
 AlwaysBreakTemplateDeclarations: true
 BinPackArguments: false
 BinPackParameters: false
-BraceWrapping:
-  AfterClass:      false
-  AfterControlStatement: false
-  AfterEnum:       false
-  AfterFunction:   false
+BraceWrapping:   
+  AfterClass:      true
+  AfterControlStatement: true
+  AfterEnum:       true
+  AfterFunction:   true
   AfterNamespace:  false
-  AfterObjCDeclaration: false
-  AfterStruct:     false
-  AfterUnion:      false
-  BeforeCatch:     false
-  BeforeElse:      false
+  AfterObjCDeclaration: true
+  AfterStruct:     true
+  AfterUnion:      true
+  BeforeCatch:     true
+  BeforeElse:      true
   IndentBraces:    false
 BreakBeforeBinaryOperators: None
-BreakBeforeBraces: Attach
+BreakBeforeBraces: Custom
 BreakBeforeTernaryOperators: true
 BreakConstructorInitializersBeforeComma: false
-BreakAfterJavaFieldAnnotations: false
-BreakStringLiterals: false
-ColumnLimit:     80
+ColumnLimit:     100
 CommentPragmas:  '^ IWYU pragma:'
-#CompactNamespaces: false
 ConstructorInitializerAllOnOneLineOrOnePerLine: true
 ConstructorInitializerIndentWidth: 4
 ContinuationIndentWidth: 4
 Cpp11BracedListStyle: true
 DerivePointerAlignment: false
 DisableFormat:   false
-ForEachMacros:   [ FOR_EACH_RANGE, FOR_EACH, ]
-IncludeCategories:
-  - Regex:           '^<.*\.h(pp)?>'
-    Priority:        1
-  - Regex:           '^<.*'
+ExperimentalAutoDetectBinPacking: false
+ForEachMacros:   [ foreach, Q_FOREACH, BOOST_FOREACH ]
+IncludeCategories: 
+  - Regex:           '^"(llvm|llvm-c|clang|clang-c)/'
     Priority:        2
-  - Regex:           '.*'
+  - Regex:           '^(<|"(gtest|isl|json)/)'
     Priority:        3
-IndentCaseLabels: true
-IndentWidth:     2
+  - Regex:           '.*'
+    Priority:        1
+IndentCaseLabels: false
+IndentWidth:     4
 IndentWrappedFunctionNames: false
-KeepEmptyLinesAtTheStartOfBlocks: false
+KeepEmptyLinesAtTheStartOfBlocks: true
 MacroBlockBegin: ''
 MacroBlockEnd:   ''
 MaxEmptyLinesToKeep: 1
 NamespaceIndentation: None
 ObjCBlockIndentWidth: 2
 ObjCSpaceAfterProperty: false
-ObjCSpaceBeforeProtocolList: false
-PenaltyBreakBeforeFirstCallParameter: 1
+ObjCSpaceBeforeProtocolList: true
+PenaltyBreakBeforeFirstCallParameter: 19
 PenaltyBreakComment: 300
 PenaltyBreakFirstLessLess: 120
 PenaltyBreakString: 1000
 PenaltyExcessCharacter: 1000000
-PenaltyReturnTypeOnItsOwnLine: 2000000
+PenaltyReturnTypeOnItsOwnLine: 60
 PointerAlignment: Left
 ReflowComments:  true
-SortIncludes:    true
+SortIncludes:    false
 SpaceAfterCStyleCast: false
+# SpaceAfterTemplateKeyword: true
 SpaceBeforeAssignmentOperators: true
-SpaceBeforeParens: ControlStatements
+SpaceBeforeParens: Never
 SpaceInEmptyParentheses: false
 SpacesBeforeTrailingComments: 1
 SpacesInAngles:  false
@@ -86,3 +87,4 @@ Standard:        Cpp11
 TabWidth:        8
 UseTab:          Never
 ...
+

From 00a407069b53daab1416059c658e6852e83cdb88 Mon Sep 17 00:00:00 2001
From: Qianfeng Zhang <Qianfeng.Zhang@amd.com>
Date: Wed, 29 Nov 2023 17:41:55 +0000
Subject: [PATCH 244/641] Update to have ck-tiled group mode pass the
 unit-tests

---
 .../hip_fmha/ck_tiled_fmha_batched_infer.h    | 297 +++---
 .../hip_fmha/ck_tiled_fmha_forward_kernel.h   | 931 ++++++++++--------
 .../hip_fmha/ck_tiled_fmha_fwd_epilogue.h     |  40 +-
 .../ck_tiled_fmha_fwd_tile_partitioner.h      |  83 +-
 .../hip_fmha/ck_tiled_fmha_grouped_infer.h    | 285 +++---
 5 files changed, 851 insertions(+), 785 deletions(-)

diff --git a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_infer.h b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_infer.h
index 543a7ac7f..4f8598d7c 100644
--- a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_infer.h
+++ b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_infer.h
@@ -4,12 +4,12 @@
 #include <sstream>
 #include <stdexcept>
 
+#include <ck/utility/common_header.hpp>
 #include <ck/host_utility/device_prop.hpp>
 #include <ck/host_utility/kernel_launch.hpp>
-#include <ck/tensor/tensor_view.hpp>
 #include <ck/tensor_description/cluster_descriptor.hpp>
 #include <ck/tensor_description/tensor_descriptor_helper.hpp>
-#include <ck/utility/common_header.hpp>
+#include <ck/tensor/tensor_view.hpp>
 
 #include <ck/tile_program/block_tile_pipeline/block_fmha_pipeline_problem.hpp>
 #include <ck/tile_program/block_tile_pipeline/block_fmha_pipeline_qkvs.hpp>
@@ -24,161 +24,154 @@
 #include "ck_tiled_fmha_params.h"
 
 template <typename scalar_t, int32_t custom_mask_type>
-struct batched_infer_masktype_attnbias_dispatched {
-  using QDataType = scalar_t;
-  using KDataType = scalar_t;
-  using VDataType = scalar_t;
-  using BiasDataType = scalar_t;
-  using SaccDataType = float; // data type for first gemm accumulation
-  using SMPLComputeDataType = float; // data type for reduction, softmax
-  using PDataType = scalar_t; // data type for A matrix of second gemm
-  using OaccDataType = float; // data type for second gemm accumulation
-  using ODataType = scalar_t;
-
-  using VLayout = ck::tensor_layout::gemm::RowMajor;
-
-  using FmhaBlockTileHdim64 = ck::Sequence<128, 64, 32, 64, 32, 64>;
-  using FmhaBlockTileHdim128 = ck::Sequence<128, 128, 32, 128, 32, 128>;
-  using FmhaBlockWarps = ck::Sequence<4, 1, 1>;
-  using FmhaWarpTile = ck::Sequence<32, 32, 16>;
-  using FmhaShapeHDim64 = ck::tile_program::TileFmhaShape<
-      FmhaBlockTileHdim64,
-      FmhaBlockWarps,
-      FmhaWarpTile,
-      FmhaBlockWarps,
-      FmhaWarpTile,
-      VLayout>;
-  using FmhaShapeHDim128 = ck::tile_program::TileFmhaShape<
-      FmhaBlockTileHdim128,
-      FmhaBlockWarps,
-      FmhaWarpTile,
-      FmhaBlockWarps,
-      FmhaWarpTile,
-      VLayout>;
-
-  using FmhaTilePartitionerHDim64 = FmhaFwdTilePartitioner<FmhaShapeHDim64>;
-  using FmhaTilePartitionerHDim128 = FmhaFwdTilePartitioner<FmhaShapeHDim128>;
-  using FmhaPipelineProblemHDim64 =
-      ck::tile_program::block::BlockFmhaPipelineProblem<
-          QDataType,
-          KDataType,
-          VDataType,
-          SaccDataType,
-          SMPLComputeDataType,
-          BiasDataType,
-          PDataType,
-          OaccDataType,
-          ODataType,
-          256, // BlockSize
-          FmhaShapeHDim64>;
-  using FmhaPipelineProblemHDim128 =
-      ck::tile_program::block::BlockFmhaPipelineProblem<
-          QDataType,
-          KDataType,
-          VDataType,
-          SaccDataType,
-          SMPLComputeDataType,
-          BiasDataType,
-          PDataType,
-          OaccDataType,
-          ODataType,
-          256, // BlockSize
-          FmhaShapeHDim128>;
-
-  using FmhaPipelineHDim64 = ck::tile_program::block::BlockFmhaPipelineQRKSVS<
-      FmhaPipelineProblemHDim64>;
-  using FmhaPipelineHDim128 = ck::tile_program::block::BlockFmhaPipelineQRKSVS<
-      FmhaPipelineProblemHDim128>;
-
-  using FmhaEpilogue =
-      FmhaFwdEpilogue<FmhaFwdEpilogueProblem<OaccDataType, ODataType>>;
-  using FmhaKernelHDim64 = FmhaFwdKernel<
-      FmhaTilePartitionerHDim64,
-      FmhaPipelineHDim64,
-      FmhaEpilogue>;
-  using FmhaKernelHDim128 = FmhaFwdKernel<
-      FmhaTilePartitionerHDim128,
-      FmhaPipelineHDim128,
-      FmhaEpilogue>;
+struct batched_infer_masktype_attnbias_dispatched
+{
+    using QDataType           = scalar_t;
+    using KDataType           = scalar_t;
+    using VDataType           = scalar_t;
+    using BiasDataType        = scalar_t;
+    using SaccDataType        = float;    // data type for first gemm accumulation
+    using SMPLComputeDataType = float;    // data type for reduction, softmax
+    using PDataType           = scalar_t; // data type for A matrix of second gemm
+    using OaccDataType        = float;    // data type for second gemm accumulation
+    using ODataType           = scalar_t;
+
+    using VLayout = ck::tensor_layout::gemm::RowMajor;
+
+    using FmhaBlockTileHdim64  = ck::Sequence<128, 64, 32, 64, 32, 64>;
+    using FmhaBlockTileHdim128 = ck::Sequence<128, 128, 32, 128, 32, 128>;
+    using FmhaBlockWarps       = ck::Sequence<4, 1, 1>;
+    using FmhaWarpTile         = ck::Sequence<32, 32, 16>;
+    using FmhaShapeHDim64      = ck::tile_program::TileFmhaShape<FmhaBlockTileHdim64,
+                                                            FmhaBlockWarps,
+                                                            FmhaWarpTile,
+                                                            FmhaBlockWarps,
+                                                            FmhaWarpTile,
+                                                            VLayout>;
+    using FmhaShapeHDim128     = ck::tile_program::TileFmhaShape<FmhaBlockTileHdim128,
+                                                             FmhaBlockWarps,
+                                                             FmhaWarpTile,
+                                                             FmhaBlockWarps,
+                                                             FmhaWarpTile,
+                                                             VLayout>;
+
+    using FmhaTilePartitionerHDim64  = FmhaFwdTilePartitioner<FmhaShapeHDim64>;
+    using FmhaTilePartitionerHDim128 = FmhaFwdTilePartitioner<FmhaShapeHDim128>;
+    using FmhaPipelineProblemHDim64 =
+        ck::tile_program::block::BlockFmhaPipelineProblem<QDataType,
+                                                          KDataType,
+                                                          VDataType,
+                                                          SaccDataType,
+                                                          SMPLComputeDataType,
+                                                          BiasDataType,
+                                                          PDataType,
+                                                          OaccDataType,
+                                                          ODataType,
+                                                          256, // BlockSize
+                                                          FmhaShapeHDim64>;
+    using FmhaPipelineProblemHDim128 =
+        ck::tile_program::block::BlockFmhaPipelineProblem<QDataType,
+                                                          KDataType,
+                                                          VDataType,
+                                                          SaccDataType,
+                                                          SMPLComputeDataType,
+                                                          BiasDataType,
+                                                          PDataType,
+                                                          OaccDataType,
+                                                          ODataType,
+                                                          256, // BlockSize
+                                                          FmhaShapeHDim128>;
+
+    using FmhaPipelineHDim64 =
+        ck::tile_program::block::BlockFmhaPipelineQRKSVS<FmhaPipelineProblemHDim64>;
+    using FmhaPipelineHDim128 =
+        ck::tile_program::block::BlockFmhaPipelineQRKSVS<FmhaPipelineProblemHDim128>;
+
+    using FmhaEpilogue = FmhaFwdEpilogue<FmhaFwdEpilogueProblem<OaccDataType, ODataType>>;
+
+    // ToDo: define NeedPadding according to runtime lengths
+    static constexpr bool NeedPadding = true;
+
+    using FmhaKernelHDim64 =
+        FmhaFwdKernel<FmhaTilePartitionerHDim64, FmhaPipelineHDim64, FmhaEpilogue, NeedPadding>;
+    using FmhaKernelHDim128 =
+        FmhaFwdKernel<FmhaTilePartitionerHDim128, FmhaPipelineHDim128, FmhaEpilogue, NeedPadding>;
 
 #ifndef BATCHED_INFER_HEADDIM_SWITCH
-#define BATCHED_INFER_HEADDIM_SWITCH(HEAD_DIM1, HEAD_DIM2, ...)  \
-  [&] {                                                          \
-    if (HEAD_DIM1 == HEAD_DIM2 && HEAD_DIM2 == 64) {             \
-      using FmhaKernel = FmhaKernelHDim64;                       \
-      __VA_ARGS__();                                             \
-    } else if (HEAD_DIM1 == HEAD_DIM2 && HEAD_DIM2 == 128) {     \
-      using FmhaKernel = FmhaKernelHDim128;                      \
-      __VA_ARGS__();                                             \
-    } else {                                                     \
-      throw std::runtime_error("Head-dim sizes not supported!"); \
-    }                                                            \
-  }()
+#define BATCHED_INFER_HEADDIM_SWITCH(HEAD_DIM1, HEAD_DIM2, ...)        \
+    [&] {                                                              \
+        if(HEAD_DIM1 == HEAD_DIM2 && HEAD_DIM2 == 64)                  \
+        {                                                              \
+            using FmhaKernel = FmhaKernelHDim64;                       \
+            __VA_ARGS__();                                             \
+        }                                                              \
+        else if(HEAD_DIM1 == HEAD_DIM2 && HEAD_DIM2 == 128)            \
+        {                                                              \
+            using FmhaKernel = FmhaKernelHDim128;                      \
+            __VA_ARGS__();                                             \
+        }                                                              \
+        else                                                           \
+        {                                                              \
+            throw std::runtime_error("Head-dim sizes not supported!"); \
+        }                                                              \
+    }()
 #endif
 
-  static void Run(BatchedForwardParams& param, hipStream_t stream) {
-    BATCHED_INFER_HEADDIM_SWITCH(
-        param.K, param.Kv, [&] { RunWithKernel<FmhaKernel>(param, stream); });
-  };
-
-  template <typename FmhaKernel>
-  static void RunWithKernel(BatchedForwardParams& param, hipStream_t stream) {
-    dim3 kGridSize = FmhaKernel::GridSize(param.B, param.Hq, param.M, param.Kv);
-    constexpr dim3 kBlockSize = FmhaKernel::BlockSize();
-
-    constexpr ck::index_t kWarpPerCu = 8; // 2 warps per SIMD
-    constexpr ck::index_t kWarpPerBlock = kBlockSize.x / warpSize;
-    constexpr ck::index_t kBlockPerCu = kWarpPerCu / kWarpPerBlock;
-
-    std::optional<
-        std::tuple<const void*, ck::index_t, ck::index_t, ck::index_t>>
-        bias;
-
-    if (param.has_attn_bias)
-      bias = std::make_tuple(
-          param.attn_bias_ptr,
-          param.attn_bias_strides[2],
-          param.attn_bias_strides[1],
-          param.attn_bias_strides[0]);
-
-    auto kargs = FmhaKernel::MakeKargs(
-        param.q_ptr,
-        param.k_ptr,
-        param.v_ptr,
-        param.out_ptr,
-        param.M, // seqlen_q
-        param.N, // seqlen_k
-        param.K, // hdim_q
-        param.Kv, // hdim_v
-        param.scale,
-        param.q_strides[1], // q, k, v, out tensor seq-dim stride
-        param.k_strides[1],
-        param.v_strides[1],
-        param.out_strides[1],
-        param.q_strides[2], // q, k, v, out tensor head-dim stride
-        param.k_strides[2],
-        param.v_strides[2],
-        param.out_strides[2],
-        param.q_strides[0], // q, k, v, out tensor batch-dim stride
-        param.k_strides[0],
-        param.v_strides[0],
-        param.out_strides[0],
-        bias);
-
-    (void)launch_kernel<kBlockSize.x, kBlockPerCu>(
-        StreamConfig{stream, false},
-        FmhaKernel{},
-        kGridSize,
-        kBlockSize,
-        0,
-        kargs);
-  };
+    static void Run(BatchedForwardParams& param, hipStream_t stream)
+    {
+        BATCHED_INFER_HEADDIM_SWITCH(
+            param.K, param.Kv, [&] { RunWithKernel<FmhaKernel>(param, stream); });
+    };
+
+    template <typename FmhaKernel>
+    static void RunWithKernel(BatchedForwardParams& param, hipStream_t stream)
+    {
+        dim3 kGridSize            = FmhaKernel::GridSize(param.B, param.Hq, param.M, param.Kv);
+        constexpr dim3 kBlockSize = FmhaKernel::BlockSize();
+
+        constexpr ck::index_t kWarpPerCu    = 8; // 2 warps per SIMD
+        constexpr ck::index_t kWarpPerBlock = kBlockSize.x / warpSize;
+        constexpr ck::index_t kBlockPerCu   = kWarpPerCu / kWarpPerBlock;
+
+        std::optional<std::tuple<const void*, ck::index_t, ck::index_t, ck::index_t>> bias;
+
+        if(param.has_attn_bias)
+            bias = std::make_tuple(param.attn_bias_ptr,
+                                   param.attn_bias_strides[2],
+                                   param.attn_bias_strides[1],
+                                   param.attn_bias_strides[0]);
+
+        auto kargs =
+            FmhaKernel::MakeKargs(param.q_ptr,
+                                  param.k_ptr,
+                                  param.v_ptr,
+                                  param.out_ptr,
+                                  param.M,  // seqlen_q
+                                  param.N,  // seqlen_k
+                                  param.K,  // hdim_q
+                                  param.Kv, // hdim_v
+                                  param.scale,
+                                  param.q_strides[1], // q, k, v, out tensor seq-dim stride
+                                  param.k_strides[1],
+                                  param.v_strides[1],
+                                  param.out_strides[1],
+                                  param.q_strides[2], // q, k, v, out tensor head-dim stride
+                                  param.k_strides[2],
+                                  param.v_strides[2],
+                                  param.out_strides[2],
+                                  param.q_strides[0], // q, k, v, out tensor batch-dim stride
+                                  param.k_strides[0],
+                                  param.v_strides[0],
+                                  param.out_strides[0],
+                                  bias);
+
+        (void)launch_kernel<kBlockSize.x, kBlockPerCu>(
+            StreamConfig{stream, false}, FmhaKernel{}, kGridSize, kBlockSize, 0, kargs);
+    };
 };
 
 template <typename scalar_t, int32_t custom_mask_type>
-void run_batched_infer_masktype_attnbias_dispatched(
-    BatchedForwardParams& param,
-    hipStream_t stream) {
-  batched_infer_masktype_attnbias_dispatched<scalar_t, custom_mask_type>::Run(
-      param, stream);
+void run_batched_infer_masktype_attnbias_dispatched(BatchedForwardParams& param, hipStream_t stream)
+{
+    batched_infer_masktype_attnbias_dispatched<scalar_t, custom_mask_type>::Run(param, stream);
 };
diff --git a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_forward_kernel.h b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_forward_kernel.h
index 334be84bb..9759c9832 100644
--- a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_forward_kernel.h
+++ b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_forward_kernel.h
@@ -1,6 +1,3 @@
-// SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
-
 #pragma once
 
 #include <optional>
@@ -18,439 +15,517 @@
 
 #define C_LOG2E 1.44269504088896340736 // log2(e)
 
-template <
-    typename TilePartitioner_,
-    typename FmhaPipeline_,
-    typename EpiloguePipeline_>
-struct FmhaFwdKernel {
-  using TilePartitioner = ck::remove_cvref_t<TilePartitioner_>;
-  using FmhaPipeline = ck::remove_cvref_t<FmhaPipeline_>;
-  using EpiloguePipeline = ck::remove_cvref_t<EpiloguePipeline_>;
-  static constexpr ck::index_t kBlockSize = FmhaPipeline::kBlockSize;
-
-  using QDataType = ck::remove_cvref_t<typename FmhaPipeline::QDataType>;
-  using KDataType = ck::remove_cvref_t<typename FmhaPipeline::KDataType>;
-  using VDataType = ck::remove_cvref_t<typename FmhaPipeline::VDataType>;
-  using BiasDataType = ck::remove_cvref_t<typename FmhaPipeline::BiasDataType>;
-  using ODataType = ck::remove_cvref_t<typename FmhaPipeline::ODataType>;
-
-  using VLayout = ck::remove_cvref_t<typename FmhaPipeline::VLayout>;
-
-  struct KargsCommon {
-    const QDataType* q_ptr;
-    const KDataType* k_ptr;
-    const VDataType* v_ptr;
-    ODataType* o_ptr;
-
-    ck::index_t seqlen_q;
-    ck::index_t seqlen_k;
-    ck::index_t hdim_q;
-    ck::index_t hdim_v;
-
-    float scale;
-
-    ck::index_t stride_q;
-    ck::index_t stride_k;
-    ck::index_t stride_v;
-    ck::index_t stride_o;
-
-    ck::index_t nhead_stride_q;
-    ck::index_t nhead_stride_k;
-    ck::index_t nhead_stride_v;
-    ck::index_t nhead_stride_o;
-
-    // following attributes are optional
-    const BiasDataType* bias_ptr = nullptr;
-    ck::index_t stride_bias = 0;
-    ck::index_t nhead_stride_bias = 0;
-  };
-
-  struct KargsBatchMode : KargsCommon {
-    ck::index_t batch_stride_q;
-    ck::index_t batch_stride_k;
-    ck::index_t batch_stride_v;
-    ck::index_t batch_stride_o;
-
-    // following attributes are optional
-    ck::index_t batch_stride_bias = 0;
-  };
-
-  struct KargsGroupMode : KargsCommon {
-    const ck::index_t* seqstart_q_ptr;
-    const ck::index_t* seqstart_k_ptr;
-    const ck::index_t* seqlen_k_ptr;
-  };
-
-  __host__ static constexpr void InitKargsCommon(
-      KargsCommon& kargs,
-      const void* q_ptr,
-      const void* k_ptr,
-      const void* v_ptr,
-      void* o_ptr,
-      ck::index_t seqlen_q,
-      ck::index_t seqlen_k,
-      ck::index_t hdim_q,
-      ck::index_t hdim_v,
-      float scale,
-      ck::index_t stride_q,
-      ck::index_t stride_k,
-      ck::index_t stride_v,
-      ck::index_t stride_o,
-      ck::index_t nhead_stride_q,
-      ck::index_t nhead_stride_k,
-      ck::index_t nhead_stride_v,
-      ck::index_t nhead_stride_o) {
-    kargs.q_ptr = reinterpret_cast<const QDataType*>(q_ptr);
-    kargs.k_ptr = reinterpret_cast<const KDataType*>(k_ptr);
-    kargs.v_ptr = reinterpret_cast<const VDataType*>(v_ptr);
-    kargs.o_ptr = reinterpret_cast<ODataType*>(o_ptr);
-
-    kargs.seqlen_q = seqlen_q;
-    kargs.seqlen_k = seqlen_k;
-    kargs.hdim_q = hdim_q;
-    kargs.hdim_v = hdim_v;
-
-    kargs.scale = scale;
-
-    kargs.stride_q = stride_q;
-    kargs.stride_k = stride_k;
-    kargs.stride_v = stride_v;
-    kargs.stride_o = stride_o;
-
-    kargs.nhead_stride_q = nhead_stride_q;
-    kargs.nhead_stride_k = nhead_stride_k;
-    kargs.nhead_stride_v = nhead_stride_v;
-    kargs.nhead_stride_o = nhead_stride_o;
-  }
-
-  __host__ static constexpr void InitKargsCommonBias(
-      KargsCommon& kargs,
-      const void* bias_ptr,
-      ck::index_t stride_bias,
-      ck::index_t nhead_stride_bias) {
-    kargs.bias_ptr = reinterpret_cast<const BiasDataType*>(bias_ptr);
-    kargs.stride_bias = stride_bias;
-    kargs.nhead_stride_bias = nhead_stride_bias;
-  }
-
-  // initialize kernel arguments for batch mode
-  __host__ static constexpr auto MakeKargs(
-      const void* q_ptr,
-      const void* k_ptr,
-      const void* v_ptr,
-      void* o_ptr,
-      ck::index_t seqlen_q,
-      ck::index_t seqlen_k,
-      ck::index_t hdim_q,
-      ck::index_t hdim_v,
-      float scale,
-      ck::index_t stride_q,
-      ck::index_t stride_k,
-      ck::index_t stride_v,
-      ck::index_t stride_o,
-      ck::index_t nhead_stride_q,
-      ck::index_t nhead_stride_k,
-      ck::index_t nhead_stride_v,
-      ck::index_t nhead_stride_o,
-      ck::index_t batch_stride_q,
-      ck::index_t batch_stride_k,
-      ck::index_t batch_stride_v,
-      ck::index_t batch_stride_o,
-      std::optional<
-          std::tuple<const void*, ck::index_t, ck::index_t, ck::index_t>> bias =
-          std::nullopt) {
-    KargsBatchMode kargs;
-
-    InitKargsCommon(
-        kargs,
-        q_ptr,
-        k_ptr,
-        v_ptr,
-        o_ptr,
-        seqlen_q,
-        seqlen_k,
-        hdim_q,
-        hdim_v,
-        scale,
-        stride_q,
-        stride_k,
-        stride_v,
-        stride_o,
-        nhead_stride_q,
-        nhead_stride_k,
-        nhead_stride_v,
-        nhead_stride_o);
-
-    kargs.batch_stride_q = batch_stride_q;
-    kargs.batch_stride_k = batch_stride_k;
-    kargs.batch_stride_v = batch_stride_v;
-    kargs.batch_stride_o = batch_stride_o;
-
-    if (bias.has_value()) {
-      InitKargsCommonBias(
-          kargs, std::get<0>(*bias), std::get<1>(*bias), std::get<2>(*bias));
-
-      kargs.batch_stride_bias = std::get<3>(*bias);
+template <typename TilePartitioner_,
+          typename FmhaPipeline_,
+          typename EpiloguePipeline_,
+          bool NeedPadding>
+struct FmhaFwdKernel
+{
+    using TilePartitioner                   = ck::remove_cvref_t<TilePartitioner_>;
+    using FmhaPipeline                      = ck::remove_cvref_t<FmhaPipeline_>;
+    using EpiloguePipeline                  = ck::remove_cvref_t<EpiloguePipeline_>;
+    static constexpr ck::index_t kBlockSize = FmhaPipeline::kBlockSize;
+
+    using QDataType    = ck::remove_cvref_t<typename FmhaPipeline::QDataType>;
+    using KDataType    = ck::remove_cvref_t<typename FmhaPipeline::KDataType>;
+    using VDataType    = ck::remove_cvref_t<typename FmhaPipeline::VDataType>;
+    using BiasDataType = ck::remove_cvref_t<typename FmhaPipeline::BiasDataType>;
+    using ODataType    = ck::remove_cvref_t<typename FmhaPipeline::ODataType>;
+
+    using VLayout = ck::remove_cvref_t<typename FmhaPipeline::VLayout>;
+
+    struct KargsCommon
+    {
+        const QDataType* q_ptr;
+        const KDataType* k_ptr;
+        const VDataType* v_ptr;
+        ODataType* o_ptr;
+
+        ck::index_t seqlen_q;
+        ck::index_t seqlen_k;
+        ck::index_t hdim_q;
+        ck::index_t hdim_v;
+
+        float scale;
+
+        ck::index_t stride_q;
+        ck::index_t stride_k;
+        ck::index_t stride_v;
+        ck::index_t stride_o;
+
+        ck::index_t nhead_stride_q;
+        ck::index_t nhead_stride_k;
+        ck::index_t nhead_stride_v;
+        ck::index_t nhead_stride_o;
+
+        // following attributes are optional
+        const BiasDataType* bias_ptr  = nullptr;
+        ck::index_t stride_bias       = 0;
+        ck::index_t nhead_stride_bias = 0;
+    };
+
+    struct KargsBatchMode : KargsCommon
+    {
+        ck::index_t batch_stride_q;
+        ck::index_t batch_stride_k;
+        ck::index_t batch_stride_v;
+        ck::index_t batch_stride_o;
+
+        // following attributes are optional
+        ck::index_t batch_stride_bias = 0;
+    };
+
+    struct KargsGroupMode : KargsCommon
+    {
+        const int32_t* seqstart_q_ptr;
+        const int32_t* seqstart_k_ptr;
+        const int32_t* seqlen_k_ptr;
+    };
+
+    __host__ static constexpr void InitKargsCommon(KargsCommon& kargs,
+                                                   const void* q_ptr,
+                                                   const void* k_ptr,
+                                                   const void* v_ptr,
+                                                   void* o_ptr,
+                                                   ck::index_t seqlen_q,
+                                                   ck::index_t seqlen_k,
+                                                   ck::index_t hdim_q,
+                                                   ck::index_t hdim_v,
+                                                   float scale,
+                                                   ck::index_t stride_q,
+                                                   ck::index_t stride_k,
+                                                   ck::index_t stride_v,
+                                                   ck::index_t stride_o,
+                                                   ck::index_t nhead_stride_q,
+                                                   ck::index_t nhead_stride_k,
+                                                   ck::index_t nhead_stride_v,
+                                                   ck::index_t nhead_stride_o)
+    {
+        kargs.q_ptr = reinterpret_cast<const QDataType*>(q_ptr);
+        kargs.k_ptr = reinterpret_cast<const KDataType*>(k_ptr);
+        kargs.v_ptr = reinterpret_cast<const VDataType*>(v_ptr);
+        kargs.o_ptr = reinterpret_cast<ODataType*>(o_ptr);
+
+        kargs.seqlen_q = seqlen_q;
+        kargs.seqlen_k = seqlen_k;
+        kargs.hdim_q   = hdim_q;
+        kargs.hdim_v   = hdim_v;
+
+        kargs.scale = scale;
+
+        kargs.stride_q = stride_q;
+        kargs.stride_k = stride_k;
+        kargs.stride_v = stride_v;
+        kargs.stride_o = stride_o;
+
+        kargs.nhead_stride_q = nhead_stride_q;
+        kargs.nhead_stride_k = nhead_stride_k;
+        kargs.nhead_stride_v = nhead_stride_v;
+        kargs.nhead_stride_o = nhead_stride_o;
+    }
+
+    __host__ static constexpr void InitKargsCommonBias(KargsCommon& kargs,
+                                                       const void* bias_ptr,
+                                                       ck::index_t stride_bias,
+                                                       ck::index_t nhead_stride_bias)
+    {
+        kargs.bias_ptr          = reinterpret_cast<const BiasDataType*>(bias_ptr);
+        kargs.stride_bias       = stride_bias;
+        kargs.nhead_stride_bias = nhead_stride_bias;
+    }
+
+    // initialize kernel arguments for batch mode
+    __host__ static constexpr auto
+    MakeKargs(const void* q_ptr,
+              const void* k_ptr,
+              const void* v_ptr,
+              void* o_ptr,
+              ck::index_t seqlen_q,
+              ck::index_t seqlen_k,
+              ck::index_t hdim_q,
+              ck::index_t hdim_v,
+              float scale,
+              ck::index_t stride_q,
+              ck::index_t stride_k,
+              ck::index_t stride_v,
+              ck::index_t stride_o,
+              ck::index_t nhead_stride_q,
+              ck::index_t nhead_stride_k,
+              ck::index_t nhead_stride_v,
+              ck::index_t nhead_stride_o,
+              ck::index_t batch_stride_q,
+              ck::index_t batch_stride_k,
+              ck::index_t batch_stride_v,
+              ck::index_t batch_stride_o,
+              std::optional<std::tuple<const void*, ck::index_t, ck::index_t, ck::index_t>> bias =
+                  std::nullopt)
+    {
+        KargsBatchMode kargs;
+
+        InitKargsCommon(kargs,
+                        q_ptr,
+                        k_ptr,
+                        v_ptr,
+                        o_ptr,
+                        seqlen_q,
+                        seqlen_k,
+                        hdim_q,
+                        hdim_v,
+                        scale,
+                        stride_q,
+                        stride_k,
+                        stride_v,
+                        stride_o,
+                        nhead_stride_q,
+                        nhead_stride_k,
+                        nhead_stride_v,
+                        nhead_stride_o);
+
+        kargs.batch_stride_q = batch_stride_q;
+        kargs.batch_stride_k = batch_stride_k;
+        kargs.batch_stride_v = batch_stride_v;
+        kargs.batch_stride_o = batch_stride_o;
+
+        if(bias.has_value())
+        {
+            InitKargsCommonBias(kargs, std::get<0>(*bias), std::get<1>(*bias), std::get<2>(*bias));
+
+            kargs.batch_stride_bias = std::get<3>(*bias);
+        }
+
+        return kargs;
     }
 
-    return kargs;
-  }
-
-  // initialize kernel arguments for group mode
-  __host__ static constexpr auto MakeKargs(
-      const void* q_ptr,
-      const void* k_ptr,
-      const void* v_ptr,
-      void* o_ptr,
-      const void* seqstart_q_ptr,
-      const void* seqstart_k_ptr,
-      const void* seqlen_k_ptr,
-      ck::index_t hdim_q,
-      ck::index_t hdim_v,
-      float scale,
-      ck::index_t stride_q,
-      ck::index_t stride_k,
-      ck::index_t stride_v,
-      ck::index_t stride_o,
-      ck::index_t nhead_stride_q,
-      ck::index_t nhead_stride_k,
-      ck::index_t nhead_stride_v,
-      ck::index_t nhead_stride_o,
-      std::optional<std::tuple<const void*, ck::index_t, ck::index_t>> bias =
-          std::nullopt) {
-    KargsGroupMode kargs;
-
-    InitKargsCommon(
-        kargs,
-        q_ptr,
-        k_ptr,
-        v_ptr,
-        o_ptr,
-        -1, // seqlen_q will be updated inside the kernel
-        -1, // seqlen_k will be updated inside the kernel
-        hdim_q,
-        hdim_v,
-        scale,
-        stride_q,
-        stride_k,
-        stride_v,
-        stride_o,
-        nhead_stride_q,
-        nhead_stride_k,
-        nhead_stride_v,
-        nhead_stride_o);
-
-    if (bias.has_value()) {
-      InitKargsCommonBias(
-          kargs, std::get<0>(*bias), std::get<1>(*bias), std::get<2>(*bias));
+    // initialize kernel arguments for group mode
+    __host__ static constexpr auto
+    MakeKargs(const void* q_ptr,
+              const void* k_ptr,
+              const void* v_ptr,
+              void* o_ptr,
+              const void* seqstart_q_ptr,
+              const void* seqstart_k_ptr,
+              const void* seqlen_k_ptr,
+              ck::index_t hdim_q,
+              ck::index_t hdim_v,
+              float scale,
+              ck::index_t stride_q,
+              ck::index_t stride_k,
+              ck::index_t stride_v,
+              ck::index_t stride_o,
+              ck::index_t nhead_stride_q,
+              ck::index_t nhead_stride_k,
+              ck::index_t nhead_stride_v,
+              ck::index_t nhead_stride_o,
+              std::optional<std::tuple<const void*, ck::index_t, ck::index_t>> bias = std::nullopt)
+    {
+        KargsGroupMode kargs;
+
+        InitKargsCommon(kargs,
+                        q_ptr,
+                        k_ptr,
+                        v_ptr,
+                        o_ptr,
+                        -1, // seqlen_q will be updated inside the kernel
+                        -1, // seqlen_k will be updated inside the kernel
+                        hdim_q,
+                        hdim_v,
+                        scale,
+                        stride_q,
+                        stride_k,
+                        stride_v,
+                        stride_o,
+                        nhead_stride_q,
+                        nhead_stride_k,
+                        nhead_stride_v,
+                        nhead_stride_o);
+
+        if(bias.has_value())
+        {
+            InitKargsCommonBias(kargs, std::get<0>(*bias), std::get<1>(*bias), std::get<2>(*bias));
+        }
+
+        kargs.seqstart_q_ptr = reinterpret_cast<const int32_t*>(seqstart_q_ptr);
+        kargs.seqstart_k_ptr = reinterpret_cast<const int32_t*>(seqstart_k_ptr);
+        kargs.seqlen_k_ptr   = reinterpret_cast<const int32_t*>(seqlen_k_ptr);
+
+        return kargs;
     }
 
-    kargs.seqstart_q_ptr = reinterpret_cast<const ck::index_t*>(seqstart_q_ptr);
-    kargs.seqstart_k_ptr = reinterpret_cast<const ck::index_t*>(seqstart_k_ptr);
-    kargs.seqlen_k_ptr = reinterpret_cast<const ck::index_t*>(seqlen_k_ptr);
-
-    return kargs;
-  }
-
-  __host__ static constexpr auto GridSize(
-      ck::index_t batch_size_,
-      ck::index_t nhead_,
-      ck::index_t seqlen_q_,
-      ck::index_t hdim_v_) {
-    return TilePartitioner::GridSize(batch_size_, nhead_, seqlen_q_, hdim_v_);
-  }
-
-  __host__ static constexpr auto BlockSize() {
-    return dim3(kBlockSize);
-  }
-
-  __host__ __device__ static constexpr ck::index_t GetSmemSize() {
-    return ck::math::max(
-        FmhaPipeline::GetSmemSize(), EpiloguePipeline::GetSmemSize());
-  }
-
-  template <typename Kargs>
-  __device__ void operator()(Kargs kargs) const {
-    using namespace ck;
-    using namespace ck::tile_program;
-    using namespace ck::tile_program::block;
-
-    // allocate LDS
-    __shared__ char smem_ptr[GetSmemSize()];
-
-    // divide problem
-    const auto [i_tile_m, i_tile_n, i_nhead, i_batch] =
-        TilePartitioner{}(kargs.seqlen_q, kargs.hdim_v);
-
-    const index_t i_m0 =
-        __builtin_amdgcn_readfirstlane(i_tile_m * FmhaPipeline::kM0);
-    const index_t i_n1 =
-        __builtin_amdgcn_readfirstlane(i_tile_n * FmhaPipeline::kN1);
-
-    index_t batch_offset_q = 0;
-    index_t batch_offset_k = 0;
-    index_t batch_offset_v = 0;
-    index_t batch_offset_bias = 0;
-    index_t batch_offset_o = 0;
-
-    if constexpr (is_same_v<Kargs, KargsBatchMode>) {
-      batch_offset_q = i_batch * kargs.batch_stride_q;
-      batch_offset_k = i_batch * kargs.batch_stride_k;
-      batch_offset_v = i_batch * kargs.batch_stride_v;
-      batch_offset_bias = i_batch * kargs.batch_stride_bias;
-      batch_offset_o = i_batch * kargs.batch_stride_o;
-    } else { // is_same_v<Kargs, KargsGroupMode>
-      // get starting offset for each work batch
-      const index_t query_start = kargs.seqstart_q_ptr[i_batch];
-      const index_t key_start = kargs.seqstart_k_ptr[i_batch];
-
-      batch_offset_q = query_start * kargs.stride_q;
-      batch_offset_k = key_start * kargs.stride_k;
-      batch_offset_v = key_start * kargs.stride_v;
-      batch_offset_bias = query_start * kargs.stride_bias + key_start;
-      batch_offset_o = query_start * kargs.stride_o;
-
-      // get real # queries & # keys under group mode
-      const auto adjusted_seqstart_q_ptr = kargs.seqstart_q_ptr + i_batch;
-      kargs.seqlen_q = adjusted_seqstart_q_ptr[1] - adjusted_seqstart_q_ptr[0];
-
-      if (kargs.seqlen_k_ptr != nullptr) {
-        kargs.seqlen_k = kargs.seqlen_k_ptr[i_batch];
-      } else {
-        const auto adjusted_seqstart_k_ptr = kargs.seqstart_k_ptr + i_batch;
-        kargs.seqlen_k =
-            adjusted_seqstart_k_ptr[1] - adjusted_seqstart_k_ptr[0];
-      }
+    __host__ static constexpr auto GridSize(ck::index_t batch_size_,
+                                            ck::index_t nhead_,
+                                            ck::index_t seqlen_q_,
+                                            ck::index_t hdim_v_)
+    {
+        return TilePartitioner::GridSize(batch_size_, nhead_, seqlen_q_, hdim_v_);
     }
 
-    // for simplicity, batch stride we just modify the pointer
-    const QDataType* q_ptr =
-        kargs.q_ptr + i_nhead * kargs.nhead_stride_q + batch_offset_q;
-    const KDataType* k_ptr =
-        kargs.k_ptr + i_nhead * kargs.nhead_stride_k + batch_offset_k;
-    const VDataType* v_ptr =
-        kargs.v_ptr + i_nhead * kargs.nhead_stride_v + batch_offset_v;
-    const BiasDataType* bias_ptr = nullptr;
-    if (kargs.bias_ptr != nullptr) {
-      bias_ptr = kargs.bias_ptr + i_nhead * kargs.nhead_stride_bias +
-          batch_offset_bias;
+    __host__ static constexpr auto BlockSize() { return dim3(kBlockSize); }
+
+    __host__ __device__ static constexpr ck::index_t GetSmemSize()
+    {
+        return ck::math::max(FmhaPipeline::GetSmemSize(), EpiloguePipeline::GetSmemSize());
     }
-    ODataType* o_ptr =
-        kargs.o_ptr + i_nhead * kargs.nhead_stride_o + batch_offset_o;
-
-    // Q/K/V DRAM and DRAM window
-    const auto q_dram = make_naive_tensor_view<AddressSpaceEnum::Global>(
-        q_ptr,
-        make_tuple(kargs.seqlen_q, kargs.hdim_q),
-        make_tuple(kargs.stride_q, 1),
-        Number<32>{},
-        Number<1>{});
-
-    const auto k_dram = make_naive_tensor_view<AddressSpaceEnum::Global>(
-        k_ptr,
-        make_tuple(kargs.seqlen_k, kargs.hdim_q),
-        make_tuple(kargs.stride_k, 1),
-        Number<32>{},
-        Number<1>{});
-
-    const auto v_dram = [&]() {
-      if constexpr (ck::is_same_v<VLayout, ck::tensor_layout::gemm::RowMajor>) {
-        const auto v_dram_tmp =
-            make_naive_tensor_view<AddressSpaceEnum::Global>(
-                v_ptr,
-                make_tuple(kargs.seqlen_k, kargs.hdim_v),
-                make_tuple(kargs.stride_v, 1),
+
+    template <typename Kargs>
+    __device__ void operator()(Kargs kargs) const
+    {
+        using namespace ck;
+        using namespace ck::tile_program;
+        using namespace ck::tile_program::block;
+
+        // allocate LDS
+        __shared__ char smem_ptr[GetSmemSize()];
+
+        // divide problem
+        const auto [i_tile_m, i_tile_n, i_nhead, i_batch] =
+            TilePartitioner{}(kargs.seqlen_q, kargs.hdim_v);
+
+        const index_t i_m0 = __builtin_amdgcn_readfirstlane(i_tile_m * FmhaPipeline::kM0);
+        const index_t i_n1 = __builtin_amdgcn_readfirstlane(i_tile_n * FmhaPipeline::kN1);
+
+        index_t batch_offset_q    = 0;
+        index_t batch_offset_k    = 0;
+        index_t batch_offset_v    = 0;
+        index_t batch_offset_bias = 0;
+        index_t batch_offset_o    = 0;
+
+        if constexpr(ck::is_same_v<Kargs, KargsBatchMode>)
+        {
+            batch_offset_q    = i_batch * kargs.batch_stride_q;
+            batch_offset_k    = i_batch * kargs.batch_stride_k;
+            batch_offset_v    = i_batch * kargs.batch_stride_v;
+            batch_offset_bias = i_batch * kargs.batch_stride_bias;
+            batch_offset_o    = i_batch * kargs.batch_stride_o;
+        }
+        else
+        { // ck::is_same_v<Kargs, KargsGroupMode>
+            // get starting offset for each work batch
+            const index_t query_start = kargs.seqstart_q_ptr[i_batch];
+            const index_t key_start   = kargs.seqstart_k_ptr[i_batch];
+
+            batch_offset_q = query_start * kargs.stride_q;
+            batch_offset_k = key_start * kargs.stride_k;
+            if constexpr(ck::is_same_v<VLayout, ck::tensor_layout::gemm::RowMajor>)
+            {
+                batch_offset_v = key_start * kargs.stride_v;
+            }
+            else
+            {
+                batch_offset_v = key_start;
+            }
+            batch_offset_bias = query_start * kargs.stride_bias + key_start;
+            batch_offset_o    = query_start * kargs.stride_o;
+
+            // get real # queries & # keys under group mode
+            const auto adjusted_seqstart_q_ptr = kargs.seqstart_q_ptr + i_batch;
+            kargs.seqlen_q = adjusted_seqstart_q_ptr[1] - adjusted_seqstart_q_ptr[0];
+
+            // # of required blocks is different in each groups, terminate unnecessary
+            // blocks earlier
+            if(kargs.seqlen_q <= i_m0)
+            {
+                return;
+            }
+
+            if(kargs.seqlen_k_ptr != nullptr)
+            {
+                kargs.seqlen_k = kargs.seqlen_k_ptr[i_batch];
+            }
+            else
+            {
+                const auto adjusted_seqstart_k_ptr = kargs.seqstart_k_ptr + i_batch;
+                kargs.seqlen_k = adjusted_seqstart_k_ptr[1] - adjusted_seqstart_k_ptr[0];
+            }
+        }
+
+        // for simplicity, batch stride we just modify the pointer
+        const QDataType* q_ptr = kargs.q_ptr + i_nhead * kargs.nhead_stride_q + batch_offset_q;
+        const KDataType* k_ptr = kargs.k_ptr + i_nhead * kargs.nhead_stride_k + batch_offset_k;
+        const VDataType* v_ptr = kargs.v_ptr + i_nhead * kargs.nhead_stride_v + batch_offset_v;
+        const BiasDataType* bias_ptr = nullptr;
+        if(kargs.bias_ptr != nullptr)
+        {
+            bias_ptr = kargs.bias_ptr + i_nhead * kargs.nhead_stride_bias + batch_offset_bias;
+        }
+        ODataType* o_ptr = kargs.o_ptr + i_nhead * kargs.nhead_stride_o + batch_offset_o;
+
+        // Q/K/V DRAM and DRAM window
+        const auto q_dram = [&]() {
+            const auto q_dram_naive = make_naive_tensor_view<AddressSpaceEnum::Global>(
+                q_ptr,
+                make_tuple(kargs.seqlen_q, kargs.hdim_q),
+                make_tuple(kargs.stride_q, 1),
+                Number<32>{},
+                Number<1>{});
+
+            return pad_tensor_view(q_dram_naive,
+                                   make_tuple(Number<FmhaPipeline::kM0>{}, Number<1>{}),
+                                   Sequence<NeedPadding, false>{});
+        }();
+        const auto k_dram = [&]() {
+            const auto k_dram_naive = make_naive_tensor_view<AddressSpaceEnum::Global>(
+                k_ptr,
+                make_tuple(kargs.seqlen_k, kargs.hdim_q),
+                make_tuple(kargs.stride_k, 1),
                 Number<32>{},
                 Number<1>{});
-        return transform_tensor_view(
-            v_dram_tmp,
-            make_tuple(
-                make_pass_through_transform(kargs.hdim_v),
-                make_pass_through_transform(kargs.seqlen_k)),
-            make_tuple(Sequence<1>{}, Sequence<0>{}),
-            make_tuple(Sequence<0>{}, Sequence<1>{}));
-      } else {
-        return make_naive_tensor_view<AddressSpaceEnum::Global>(
-            v_ptr,
-            make_tuple(kargs.hdim_v, kargs.seqlen_k),
-            make_tuple(kargs.stride_v, 1),
-            Number<32>{},
-            Number<1>{});
-      }
-    }();
-
-    auto q_dram_window = make_tile_window(
-        q_dram,
-        [&]() {
-          if constexpr (FmhaPipeline::kQLoadOnce)
-            return make_tuple(
-                Number<FmhaPipeline::kM0>{},
-                Number<FmhaPipeline::kK0BlockLength>{});
-          else
-            return make_tuple(
-                Number<FmhaPipeline::kM0>{}, Number<FmhaPipeline::kK0>{});
-        }(),
-        {i_m0, 0});
-
-    auto k_dram_window = make_tile_window(
-        k_dram,
-        make_tuple(Number<FmhaPipeline::kN0>{}, Number<FmhaPipeline::kK0>{}),
-        {0, 0});
-
-    auto v_dram_window = make_tile_window(
-        v_dram,
-        make_tuple(Number<FmhaPipeline::kN1>{}, Number<FmhaPipeline::kK1>{}),
-        {i_n1, 0});
-
-    const auto run_pipeline_with = [&](auto bias_dram_window) {
-      return FmhaPipeline{}(
-          q_dram_window,
-          k_dram_window,
-          v_dram_window,
-          bias_dram_window,
-          kargs.scale,
-          kargs.seqlen_k / FmhaPipeline::kN0,
-          kargs.hdim_q / FmhaPipeline::kK0,
-          smem_ptr);
-    };
 
-    auto o_acc_tile = [&]() {
-      constexpr auto bias_dram_window_lengths =
-          make_tuple(Number<FmhaPipeline::kM0>{}, Number<FmhaPipeline::kN0>{});
-
-      if (bias_ptr != nullptr) {
-        const auto bias_dram = make_naive_tensor_view<AddressSpaceEnum::Global>(
-            bias_ptr,
-            make_tuple(kargs.seqlen_q, kargs.seqlen_k),
-            make_tuple(kargs.stride_bias, 1),
-            Number<32>{},
-            Number<1>{});
-
-        auto bias_dram_window =
-            make_tile_window(bias_dram, bias_dram_window_lengths, {i_m0, 0});
-
-        return run_pipeline_with(bias_dram_window);
-      } else {
-        auto dummy_bias_dram_window =
-            make_null_tile_window(bias_dram_window_lengths);
-
-        return run_pipeline_with(dummy_bias_dram_window);
-      }
-    }();
-
-    // O DRAM and O DRAM window
-    auto o_dram = make_naive_tensor_view<AddressSpaceEnum::Global>(
-        o_ptr,
-        make_tuple(kargs.seqlen_q, kargs.hdim_v),
-        make_tuple(kargs.stride_o, 1),
-        Number<32>{},
-        Number<1>{});
-
-    auto o_dram_window = make_tile_window(
-        o_dram,
-        make_tuple(Number<FmhaPipeline::kM0>{}, Number<FmhaPipeline::kN1>{}),
-        {i_m0, i_n1});
-
-    EpiloguePipeline{}(o_dram_window, o_acc_tile);
-  }
+            return pad_tensor_view(k_dram_naive,
+                                   make_tuple(Number<FmhaPipeline::kN0>{}, Number<1>{}),
+                                   Sequence<NeedPadding, false>{});
+        }();
+        const auto v_dram = [&]() {
+            if constexpr(ck::is_same_v<VLayout, ck::tensor_layout::gemm::RowMajor>)
+            {
+                const auto v_dram_naive = make_naive_tensor_view<AddressSpaceEnum::Global>(
+                    v_ptr,
+                    make_tuple(kargs.seqlen_k, kargs.hdim_v),
+                    make_tuple(kargs.stride_v, 1),
+                    Number<32>{},
+                    Number<1>{});
+
+                const auto v_dram_transposed =
+                    transform_tensor_view(v_dram_naive,
+                                          make_tuple(make_pass_through_transform(kargs.seqlen_k),
+                                                     make_pass_through_transform(kargs.hdim_v)),
+                                          make_tuple(Sequence<1>{}, Sequence<0>{}),
+                                          make_tuple(Sequence<0>{}, Sequence<1>{}));
+
+                /// FIXME: The return value of
+                /// v_dram_naive.GetTensorDescriptor().GetLength() is same as
+                ///   v_dram_transposed.GetTensorDescriptor().GetLength(). Replace
+                ///   following if-clause by pad_tensor_view() call after fixing this
+                ///   issue.
+                if constexpr(!NeedPadding)
+                {
+                    return v_dram_transposed;
+                }
+                else
+                {
+                    const index_t pad_length =
+                        FmhaPipeline::kK1 *
+                            ck::math::integer_divide_ceil(kargs.seqlen_k, FmhaPipeline::kK1) -
+                        kargs.seqlen_k;
+
+                    return transform_tensor_view(
+                        v_dram_transposed,
+                        make_tuple(make_pass_through_transform(kargs.hdim_v),
+                                   make_right_pad_transform(kargs.seqlen_k, pad_length)),
+                        make_tuple(Sequence<0>{}, Sequence<1>{}),
+                        make_tuple(Sequence<0>{}, Sequence<1>{}));
+                }
+            }
+            else
+            {
+                const auto v_dram_naive = make_naive_tensor_view<AddressSpaceEnum::Global>(
+                    v_ptr,
+                    make_tuple(kargs.hdim_v, kargs.seqlen_k),
+                    make_tuple(kargs.stride_v, 1),
+                    Number<32>{},
+                    Number<1>{});
+
+                return pad_tensor_view(v_dram_naive,
+                                       make_tuple(Number<1>{}, Number<FmhaPipeline::kK1>{}),
+                                       Sequence<false, NeedPadding>{});
+            }
+        }();
+
+        auto q_dram_window = make_tile_window(
+            q_dram,
+            [&]() {
+                if constexpr(FmhaPipeline::kQLoadOnce)
+                    return make_tuple(Number<FmhaPipeline::kM0>{},
+                                      Number<FmhaPipeline::kK0BlockLength>{});
+                else
+                    return make_tuple(Number<FmhaPipeline::kM0>{}, Number<FmhaPipeline::kK0>{});
+            }(),
+            {i_m0, 0});
+
+        auto k_dram_window = make_tile_window(
+            k_dram, make_tuple(Number<FmhaPipeline::kN0>{}, Number<FmhaPipeline::kK0>{}), {0, 0});
+
+        auto v_dram_window =
+            make_tile_window(v_dram,
+                             make_tuple(Number<FmhaPipeline::kN1>{}, Number<FmhaPipeline::kK1>{}),
+                             {i_n1, 0});
+
+        const auto run_pipeline_with = [&](auto bias_dram_window) {
+            const auto s_mask = [&]() {
+                if constexpr(NeedPadding)
+                {
+                    return [&](index_t /* m */, index_t n) {
+                        const bool is_out_of_bound = !(n < kargs.seqlen_k);
+                        return is_out_of_bound;
+                    };
+                }
+                else
+                {
+                    return NullMask{};
+                }
+            }();
+
+            return FmhaPipeline{}(q_dram_window,
+                                  k_dram_window,
+                                  v_dram_window,
+                                  bias_dram_window,
+                                  s_mask,
+                                  kargs.scale,
+                                  ck::math::integer_divide_ceil(kargs.seqlen_k, FmhaPipeline::kN0),
+                                  ck::math::integer_divide_ceil(kargs.hdim_q, FmhaPipeline::kK0),
+                                  smem_ptr);
+        };
+
+        auto o_acc_tile = [&]() {
+            constexpr auto bias_dram_window_lengths =
+                make_tuple(Number<FmhaPipeline::kM0>{}, Number<FmhaPipeline::kN0>{});
+
+            if(bias_ptr != nullptr)
+            {
+                const auto bias_dram = [&]() {
+                    const auto bias_dram_naive = make_naive_tensor_view<AddressSpaceEnum::Global>(
+                        bias_ptr,
+                        make_tuple(kargs.seqlen_q, kargs.seqlen_k),
+                        make_tuple(kargs.stride_bias, 1),
+                        Number<32>{},
+                        Number<1>{});
+
+                    return pad_tensor_view(bias_dram_naive,
+                                           bias_dram_window_lengths,
+                                           Sequence<NeedPadding, NeedPadding>{});
+                }();
+
+                auto bias_dram_window =
+                    make_tile_window(bias_dram, bias_dram_window_lengths, {i_m0, 0});
+
+                return run_pipeline_with(bias_dram_window);
+            }
+            else
+            {
+                auto dummy_bias_dram_window = make_null_tile_window(bias_dram_window_lengths);
+
+                return run_pipeline_with(dummy_bias_dram_window);
+            }
+        }();
+
+        // O DRAM and O DRAM window
+        auto o_dram = [&]() {
+            const auto o_dram_naive = make_naive_tensor_view<AddressSpaceEnum::Global>(
+                o_ptr,
+                make_tuple(kargs.seqlen_q, kargs.hdim_v),
+                make_tuple(kargs.stride_o, 1),
+                Number<32>{},
+                Number<1>{});
+
+            return pad_tensor_view(o_dram_naive,
+                                   make_tuple(Number<FmhaPipeline::kM0>{}, Number<1>{}),
+                                   Sequence<NeedPadding, false>{});
+        }();
+
+        auto o_dram_window =
+            make_tile_window(o_dram,
+                             make_tuple(Number<FmhaPipeline::kM0>{}, Number<FmhaPipeline::kN1>{}),
+                             {i_m0, i_n1});
+
+        EpiloguePipeline{}(o_dram_window, o_acc_tile);
+    }
 };
diff --git a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_fwd_epilogue.h b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_fwd_epilogue.h
index 4073424fc..2289b09db 100644
--- a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_fwd_epilogue.h
+++ b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_fwd_epilogue.h
@@ -1,34 +1,32 @@
 #pragma once
 
+#include "ck/utility/common_header.hpp"
 #include "ck/tile_program/tile/store_tile.hpp"
 #include "ck/tile_program/tile/tile_elementwise.hpp"
-#include "ck/utility/common_header.hpp"
 
 template <typename OaccDataType_, typename ODataType_>
-struct FmhaFwdEpilogueProblem {
-  using OaccDataType = ck::remove_cvref_t<OaccDataType_>;
-  using ODataType = ck::remove_cvref_t<ODataType_>;
+struct FmhaFwdEpilogueProblem
+{
+    using OaccDataType = ck::remove_cvref_t<OaccDataType_>;
+    using ODataType    = ck::remove_cvref_t<ODataType_>;
 };
 
 template <typename Problem_, typename Policy_ = void>
-struct FmhaFwdEpilogue {
-  using Problem = ck::remove_cvref_t<Problem_>;
-  using OaccDataType = ck::remove_cvref_t<typename Problem::OaccDataType>;
-  using ODataType = ck::remove_cvref_t<typename Problem::ODataType>;
+struct FmhaFwdEpilogue
+{
+    using Problem      = ck::remove_cvref_t<Problem_>;
+    using OaccDataType = ck::remove_cvref_t<typename Problem::OaccDataType>;
+    using ODataType    = ck::remove_cvref_t<typename Problem::ODataType>;
 
-  __host__ __device__ static constexpr ck::index_t GetSmemSize() {
-    return 0;
-  }
+    __host__ __device__ static constexpr ck::index_t GetSmemSize() { return 0; }
 
-  template <typename ODramWindowTmp, typename OAccTile>
-  __device__ auto operator()(
-      ODramWindowTmp& o_dram_window_tmp,
-      const OAccTile& o_acc_tile) {
-    using namespace ck;
-    using namespace ck::tile_program;
+    template <typename ODramWindowTmp, typename OAccTile>
+    __device__ auto operator()(ODramWindowTmp& o_dram_window_tmp, const OAccTile& o_acc_tile)
+    {
+        using namespace ck;
+        using namespace ck::tile_program;
 
-    const auto o =
-        tile_elementwise_in(type_convert<ODataType, OaccDataType>, o_acc_tile);
-    store_tile(o_dram_window_tmp, o);
-  }
+        const auto o = tile_elementwise_in(type_convert<ODataType, OaccDataType>, o_acc_tile);
+        store_tile(o_dram_window_tmp, o);
+    }
 };
diff --git a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_fwd_tile_partitioner.h b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_fwd_tile_partitioner.h
index 113037ce3..5d95c96f7 100644
--- a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_fwd_tile_partitioner.h
+++ b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_fwd_tile_partitioner.h
@@ -1,46 +1,51 @@
 #pragma once
 
+#include "ck/utility/common_header.hpp"
 #include "ck/tile_program/tile/store_tile.hpp"
 #include "ck/tile_program/tile/tile_elementwise.hpp"
-#include "ck/utility/common_header.hpp"
 
 template <typename BlockFmhaShape_>
-struct FmhaFwdTilePartitioner {
-  using BlockFmhaShape = ck::remove_cvref_t<BlockFmhaShape_>;
-
-  static constexpr ck::index_t kM0 = BlockFmhaShape::kM0;
-  static constexpr ck::index_t kN0 = BlockFmhaShape::kN0;
-  static constexpr ck::index_t kK0 = BlockFmhaShape::kK0;
-  static constexpr ck::index_t kN1 = BlockFmhaShape::kN1;
-  static constexpr ck::index_t kK1 = BlockFmhaShape::kK1;
-
-  __host__ static constexpr auto GridSize(
-      ck::index_t batch_size_,
-      ck::index_t nhead_,
-      ck::index_t seqlen_q_,
-      ck::index_t hdim_v_) {
-    // TODO: this may need tuning
-    return dim3((seqlen_q_ / kM0) * (hdim_v_ / kN1), batch_size_, nhead_);
-  }
-
-  __device__ auto operator()(ck::index_t /*seqlen_q*/, ck::index_t hdim_v) {
-    using namespace ck;
-
-    // const index_t num_tile_m0 = seqlen_q / kM0;
-    const index_t num_tile_n1 = hdim_v / kN1;
-
-    const index_t i_block = blockIdx.x;
-    const index_t i_batch = blockIdx.y;
-    const index_t i_nhead = blockIdx.z;
-
-    const auto f = [](index_t dividend, index_t divisor) {
-      index_t quotient = dividend / divisor;
-      index_t modulus = dividend - quotient * divisor;
-      return ck::make_tuple(quotient, modulus);
-    };
-
-    const auto [i_tile_m, i_tile_n] = f(i_block, num_tile_n1);
-
-    return ck::make_tuple(i_tile_m, i_tile_n, i_nhead, i_batch);
-  }
+struct FmhaFwdTilePartitioner
+{
+    using BlockFmhaShape = ck::remove_cvref_t<BlockFmhaShape_>;
+
+    static constexpr ck::index_t kM0 = BlockFmhaShape::kM0;
+    static constexpr ck::index_t kN0 = BlockFmhaShape::kN0;
+    static constexpr ck::index_t kK0 = BlockFmhaShape::kK0;
+    static constexpr ck::index_t kN1 = BlockFmhaShape::kN1;
+    static constexpr ck::index_t kK1 = BlockFmhaShape::kK1;
+
+    __host__ static constexpr auto GridSize(ck::index_t batch_size_,
+                                            ck::index_t nhead_,
+                                            ck::index_t seqlen_q_,
+                                            ck::index_t hdim_v_)
+    {
+        // TODO: this may need tuning
+        return dim3(ck::math::integer_divide_ceil(seqlen_q_, kM0) *
+                        ck::math::integer_divide_ceil(hdim_v_, kN1),
+                    batch_size_,
+                    nhead_);
+    }
+
+    __device__ auto operator()(ck::index_t /*seqlen_q*/, ck::index_t hdim_v)
+    {
+        using namespace ck;
+
+        // const index_t num_tile_m0 = seqlen_q / kM0;
+        const index_t num_tile_n1 = hdim_v / kN1;
+
+        const index_t i_block = blockIdx.x;
+        const index_t i_batch = blockIdx.y;
+        const index_t i_nhead = blockIdx.z;
+
+        const auto f = [](index_t dividend, index_t divisor) {
+            index_t quotient = dividend / divisor;
+            index_t modulus  = dividend - quotient * divisor;
+            return ck::make_tuple(quotient, modulus);
+        };
+
+        const auto [i_tile_m, i_tile_n] = f(i_block, num_tile_n1);
+
+        return ck::make_tuple(i_tile_m, i_tile_n, i_nhead, i_batch);
+    }
 };
diff --git a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_infer.h b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_infer.h
index b58bcfafb..54a477358 100644
--- a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_infer.h
+++ b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_infer.h
@@ -4,12 +4,12 @@
 #include <sstream>
 #include <stdexcept>
 
+#include <ck/utility/common_header.hpp>
 #include <ck/host_utility/device_prop.hpp>
 #include <ck/host_utility/kernel_launch.hpp>
-#include <ck/tensor/tensor_view.hpp>
 #include <ck/tensor_description/cluster_descriptor.hpp>
 #include <ck/tensor_description/tensor_descriptor_helper.hpp>
-#include <ck/utility/common_header.hpp>
+#include <ck/tensor/tensor_view.hpp>
 
 #include <ck/tile_program/block_tile_pipeline/block_fmha_pipeline_problem.hpp>
 #include <ck/tile_program/block_tile_pipeline/block_fmha_pipeline_qkvs.hpp>
@@ -26,156 +26,151 @@
 #include "ck_tiled_fmha_params.h"
 
 template <typename scalar_t, int32_t custom_mask_type>
-struct grouped_infer_masktype_attnbias_dispatched {
-  using QDataType = scalar_t;
-  using KDataType = scalar_t;
-  using VDataType = scalar_t;
-  using BiasDataType = scalar_t;
-  using SaccDataType = float; // data type for first gemm accumulation
-  using SMPLComputeDataType = float; // data type for reduction, softmax
-  using PDataType = scalar_t; // data type for A matrix of second gemm
-  using OaccDataType = float; // data type for second gemm accumulation
-  using ODataType = scalar_t;
-
-  using VLayout = ck::tensor_layout::gemm::RowMajor;
-
-  using FmhaBlockTileHdim64 = ck::Sequence<128, 64, 32, 64, 32, 64>;
-  using FmhaBlockTileHdim128 = ck::Sequence<128, 128, 32, 128, 32, 128>;
-  using FmhaBlockWarps = ck::Sequence<4, 1, 1>;
-  using FmhaWarpTile = ck::Sequence<32, 32, 16>;
-  using FmhaShapeHDim64 = ck::tile_program::TileFmhaShape<
-      FmhaBlockTileHdim64,
-      FmhaBlockWarps,
-      FmhaWarpTile,
-      FmhaBlockWarps,
-      FmhaWarpTile,
-      VLayout>;
-  using FmhaShapeHDim128 = ck::tile_program::TileFmhaShape<
-      FmhaBlockTileHdim128,
-      FmhaBlockWarps,
-      FmhaWarpTile,
-      FmhaBlockWarps,
-      FmhaWarpTile,
-      VLayout>;
-
-  using FmhaTilePartitionerHDim64 = FmhaFwdTilePartitioner<FmhaShapeHDim64>;
-  using FmhaTilePartitionerHDim128 = FmhaFwdTilePartitioner<FmhaShapeHDim128>;
-  using FmhaPipelineProblemHDim64 =
-      ck::tile_program::block::BlockFmhaPipelineProblem<
-          QDataType,
-          KDataType,
-          VDataType,
-          SaccDataType,
-          SMPLComputeDataType,
-          BiasDataType,
-          PDataType,
-          OaccDataType,
-          ODataType,
-          256, // BlockSize
-          FmhaShapeHDim64>;
-  using FmhaPipelineProblemHDim128 =
-      ck::tile_program::block::BlockFmhaPipelineProblem<
-          QDataType,
-          KDataType,
-          VDataType,
-          SaccDataType,
-          SMPLComputeDataType,
-          BiasDataType,
-          PDataType,
-          OaccDataType,
-          ODataType,
-          256, // BlockSize
-          FmhaShapeHDim128>;
-
-  using FmhaPipelineHDim64 = ck::tile_program::block::BlockFmhaPipelineQRKSVS<
-      FmhaPipelineProblemHDim64>;
-  using FmhaPipelineHDim128 = ck::tile_program::block::BlockFmhaPipelineQRKSVS<
-      FmhaPipelineProblemHDim128>;
-
-  using FmhaEpilogue =
-      FmhaFwdEpilogue<FmhaFwdEpilogueProblem<OaccDataType, ODataType>>;
-  using FmhaKernelHDim64 = FmhaFwdKernel<
-      FmhaTilePartitionerHDim64,
-      FmhaPipelineHDim64,
-      FmhaEpilogue>;
-  using FmhaKernelHDim128 = FmhaFwdKernel<
-      FmhaTilePartitionerHDim128,
-      FmhaPipelineHDim128,
-      FmhaEpilogue>;
+struct grouped_infer_masktype_attnbias_dispatched
+{
+    using QDataType           = scalar_t;
+    using KDataType           = scalar_t;
+    using VDataType           = scalar_t;
+    using BiasDataType        = scalar_t;
+    using SaccDataType        = float;    // data type for first gemm accumulation
+    using SMPLComputeDataType = float;    // data type for reduction, softmax
+    using PDataType           = scalar_t; // data type for A matrix of second gemm
+    using OaccDataType        = float;    // data type for second gemm accumulation
+    using ODataType           = scalar_t;
+
+    using VLayout = ck::tensor_layout::gemm::RowMajor;
+
+    using FmhaBlockTileHdim64  = ck::Sequence<128, 64, 32, 64, 32, 64>;
+    using FmhaBlockTileHdim128 = ck::Sequence<128, 128, 32, 128, 32, 128>;
+    using FmhaBlockWarps       = ck::Sequence<4, 1, 1>;
+    using FmhaWarpTile         = ck::Sequence<32, 32, 16>;
+    using FmhaShapeHDim64      = ck::tile_program::TileFmhaShape<FmhaBlockTileHdim64,
+                                                            FmhaBlockWarps,
+                                                            FmhaWarpTile,
+                                                            FmhaBlockWarps,
+                                                            FmhaWarpTile,
+                                                            VLayout>;
+    using FmhaShapeHDim128     = ck::tile_program::TileFmhaShape<FmhaBlockTileHdim128,
+                                                             FmhaBlockWarps,
+                                                             FmhaWarpTile,
+                                                             FmhaBlockWarps,
+                                                             FmhaWarpTile,
+                                                             VLayout>;
+
+    using FmhaTilePartitionerHDim64  = FmhaFwdTilePartitioner<FmhaShapeHDim64>;
+    using FmhaTilePartitionerHDim128 = FmhaFwdTilePartitioner<FmhaShapeHDim128>;
+    using FmhaPipelineProblemHDim64 =
+        ck::tile_program::block::BlockFmhaPipelineProblem<QDataType,
+                                                          KDataType,
+                                                          VDataType,
+                                                          SaccDataType,
+                                                          SMPLComputeDataType,
+                                                          BiasDataType,
+                                                          PDataType,
+                                                          OaccDataType,
+                                                          ODataType,
+                                                          256, // BlockSize
+                                                          FmhaShapeHDim64>;
+    using FmhaPipelineProblemHDim128 =
+        ck::tile_program::block::BlockFmhaPipelineProblem<QDataType,
+                                                          KDataType,
+                                                          VDataType,
+                                                          SaccDataType,
+                                                          SMPLComputeDataType,
+                                                          BiasDataType,
+                                                          PDataType,
+                                                          OaccDataType,
+                                                          ODataType,
+                                                          256, // BlockSize
+                                                          FmhaShapeHDim128>;
+
+    using FmhaPipelineHDim64 =
+        ck::tile_program::block::BlockFmhaPipelineQRKSVS<FmhaPipelineProblemHDim64>;
+    using FmhaPipelineHDim128 =
+        ck::tile_program::block::BlockFmhaPipelineQRKSVS<FmhaPipelineProblemHDim128>;
+
+    using FmhaEpilogue = FmhaFwdEpilogue<FmhaFwdEpilogueProblem<OaccDataType, ODataType>>;
+
+    // ToDo: define NeedPadding according to runtime lengths
+    static constexpr bool NeedPadding = true;
+
+    using FmhaKernelHDim64 =
+        FmhaFwdKernel<FmhaTilePartitionerHDim64, FmhaPipelineHDim64, FmhaEpilogue, NeedPadding>;
+    using FmhaKernelHDim128 =
+        FmhaFwdKernel<FmhaTilePartitionerHDim128, FmhaPipelineHDim128, FmhaEpilogue, NeedPadding>;
 
 #ifndef GROUPED_INFER_HEADDIM_SWITCH
-#define GROUPED_INFER_HEADDIM_SWITCH(HEAD_DIM1, HEAD_DIM2, ...)  \
-  [&] {                                                          \
-    if (HEAD_DIM1 == HEAD_DIM2 && HEAD_DIM2 == 64) {             \
-      using FmhaKernel = FmhaKernelHDim64;                       \
-      __VA_ARGS__();                                             \
-    } else if (HEAD_DIM1 == HEAD_DIM2 && HEAD_DIM2 == 128) {     \
-      using FmhaKernel = FmhaKernelHDim128;                      \
-      __VA_ARGS__();                                             \
-    } else {                                                     \
-      throw std::runtime_error("Head-dim sizes not supported!"); \
-    }                                                            \
-  }()
+#define GROUPED_INFER_HEADDIM_SWITCH(HEAD_DIM1, HEAD_DIM2, ...)        \
+    [&] {                                                              \
+        if(HEAD_DIM1 == HEAD_DIM2 && HEAD_DIM2 == 64)                  \
+        {                                                              \
+            using FmhaKernel = FmhaKernelHDim64;                       \
+            __VA_ARGS__();                                             \
+        }                                                              \
+        else if(HEAD_DIM1 == HEAD_DIM2 && HEAD_DIM2 == 128)            \
+        {                                                              \
+            using FmhaKernel = FmhaKernelHDim128;                      \
+            __VA_ARGS__();                                             \
+        }                                                              \
+        else                                                           \
+        {                                                              \
+            throw std::runtime_error("Head-dim sizes not supported!"); \
+        }                                                              \
+    }()
 #endif
 
-  static void Run(GroupedForwardParams& param, hipStream_t stream) {
-    GROUPED_INFER_HEADDIM_SWITCH(
-        param.K, param.Kv, [&] { RunWithKernel<FmhaKernel>(param, stream); });
-  };
-
-  template <typename FmhaKernel>
-  static void RunWithKernel(GroupedForwardParams& param, hipStream_t stream) {
-    dim3 kGridSize = FmhaKernel::GridSize(1, param.Hq, param.M, param.Kv);
-    constexpr dim3 kBlockSize = FmhaKernel::BlockSize();
-
-    constexpr ck::index_t kWarpPerCu = 8; // 2 warps per SIMD
-    constexpr ck::index_t kWarpPerBlock = kBlockSize.x / warpSize;
-    constexpr ck::index_t kBlockPerCu = kWarpPerCu / kWarpPerBlock;
-
-    std::optional<std::tuple<const void*, ck::index_t, ck::index_t>> bias;
-
-    if (param.has_attn_bias) {
-      bias = std::make_tuple(
-          param.attn_bias_ptr,
-          param.attn_bias_strides[2],
-          param.attn_bias_strides[1]);
+    static void Run(GroupedForwardParams& param, hipStream_t stream)
+    {
+        GROUPED_INFER_HEADDIM_SWITCH(
+            param.K, param.Kv, [&] { RunWithKernel<FmhaKernel>(param, stream); });
     };
 
-    auto kargs = FmhaKernel::MakeKargs(
-        param.q_ptr,
-        param.k_ptr,
-        param.v_ptr,
-        param.out_ptr,
-        param.seqstart_q_dev_ptr,
-        param.seqstart_k_dev_ptr,
-        param.seqlen_k_dev_ptr,
-        param.K, // hdim_q
-        param.Kv, // hdim_v
-        param.scale,
-        param.q_strides[1], // q, k, v, out tensor seq-dim stride
-        param.k_strides[1],
-        param.v_strides[1],
-        param.out_strides[1],
-        param.q_strides[2], // q, k, v, out tensor head-dim stride
-        param.k_strides[2],
-        param.v_strides[2],
-        param.out_strides[2],
-        bias);
-
-    (void)launch_kernel<kBlockSize.x, kBlockPerCu>(
-        StreamConfig{stream, false},
-        FmhaKernel{},
-        kGridSize,
-        kBlockSize,
-        0,
-        kargs);
-  };
+    template <typename FmhaKernel>
+    static void RunWithKernel(GroupedForwardParams& param, hipStream_t stream)
+    {
+        dim3 kGridSize = FmhaKernel::GridSize(param.num_batches, param.Hq, param.M, param.Kv);
+        constexpr dim3 kBlockSize = FmhaKernel::BlockSize();
+
+        constexpr ck::index_t kWarpPerCu    = 8; // 2 warps per SIMD
+        constexpr ck::index_t kWarpPerBlock = kBlockSize.x / warpSize;
+        constexpr ck::index_t kBlockPerCu   = kWarpPerCu / kWarpPerBlock;
+
+        std::optional<std::tuple<const void*, ck::index_t, ck::index_t>> bias;
+
+        if(param.has_attn_bias)
+        {
+            bias = std::make_tuple(
+                param.attn_bias_ptr, param.attn_bias_strides[2], param.attn_bias_strides[1]);
+        };
+
+        auto kargs =
+            FmhaKernel::MakeKargs(param.q_ptr,
+                                  param.k_ptr,
+                                  param.v_ptr,
+                                  param.out_ptr,
+                                  param.seqstart_q_dev_ptr,
+                                  param.seqstart_k_dev_ptr,
+                                  param.seqlen_k_dev_ptr,
+                                  param.K,  // hdim_q
+                                  param.Kv, // hdim_v
+                                  param.scale,
+                                  param.q_strides[1], // q, k, v, out tensor seq-dim stride
+                                  param.k_strides[1],
+                                  param.v_strides[1],
+                                  param.out_strides[1],
+                                  param.q_strides[2], // q, k, v, out tensor head-dim stride
+                                  param.k_strides[2],
+                                  param.v_strides[2],
+                                  param.out_strides[2],
+                                  bias);
+
+        (void)launch_kernel<kBlockSize.x, kBlockPerCu>(
+            StreamConfig{stream, false}, FmhaKernel{}, kGridSize, kBlockSize, 0, kargs);
+    };
 };
 
 template <typename scalar_t, int32_t custom_mask_type>
-void run_grouped_infer_masktype_attnbias_dispatched(
-    GroupedForwardParams& param,
-    hipStream_t stream) {
-  grouped_infer_masktype_attnbias_dispatched<scalar_t, custom_mask_type>::Run(
-      param, stream);
+void run_grouped_infer_masktype_attnbias_dispatched(GroupedForwardParams& param, hipStream_t stream)
+{
+    grouped_infer_masktype_attnbias_dispatched<scalar_t, custom_mask_type>::Run(param, stream);
 };

From dd67c06587292d0dfffc6af26c0d0d5b8fbffafe Mon Sep 17 00:00:00 2001
From: Qianfeng Zhang <Qianfeng.Zhang@amd.com>
Date: Wed, 29 Nov 2023 18:54:47 +0000
Subject: [PATCH 245/641] Add runtime setting for NeedPadding for ck-tiled
 batched infer

---
 .../hip_fmha/ck_tiled_fmha_batched_infer.h    | 79 ++++++++-----------
 1 file changed, 35 insertions(+), 44 deletions(-)

diff --git a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_infer.h b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_infer.h
index 4f8598d7c..38ab8ad4c 100644
--- a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_infer.h
+++ b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_infer.h
@@ -55,59 +55,19 @@ struct batched_infer_masktype_attnbias_dispatched
                                                              FmhaWarpTile,
                                                              VLayout>;
 
-    using FmhaTilePartitionerHDim64  = FmhaFwdTilePartitioner<FmhaShapeHDim64>;
-    using FmhaTilePartitionerHDim128 = FmhaFwdTilePartitioner<FmhaShapeHDim128>;
-    using FmhaPipelineProblemHDim64 =
-        ck::tile_program::block::BlockFmhaPipelineProblem<QDataType,
-                                                          KDataType,
-                                                          VDataType,
-                                                          SaccDataType,
-                                                          SMPLComputeDataType,
-                                                          BiasDataType,
-                                                          PDataType,
-                                                          OaccDataType,
-                                                          ODataType,
-                                                          256, // BlockSize
-                                                          FmhaShapeHDim64>;
-    using FmhaPipelineProblemHDim128 =
-        ck::tile_program::block::BlockFmhaPipelineProblem<QDataType,
-                                                          KDataType,
-                                                          VDataType,
-                                                          SaccDataType,
-                                                          SMPLComputeDataType,
-                                                          BiasDataType,
-                                                          PDataType,
-                                                          OaccDataType,
-                                                          ODataType,
-                                                          256, // BlockSize
-                                                          FmhaShapeHDim128>;
-
-    using FmhaPipelineHDim64 =
-        ck::tile_program::block::BlockFmhaPipelineQRKSVS<FmhaPipelineProblemHDim64>;
-    using FmhaPipelineHDim128 =
-        ck::tile_program::block::BlockFmhaPipelineQRKSVS<FmhaPipelineProblemHDim128>;
-
     using FmhaEpilogue = FmhaFwdEpilogue<FmhaFwdEpilogueProblem<OaccDataType, ODataType>>;
 
-    // ToDo: define NeedPadding according to runtime lengths
-    static constexpr bool NeedPadding = true;
-
-    using FmhaKernelHDim64 =
-        FmhaFwdKernel<FmhaTilePartitionerHDim64, FmhaPipelineHDim64, FmhaEpilogue, NeedPadding>;
-    using FmhaKernelHDim128 =
-        FmhaFwdKernel<FmhaTilePartitionerHDim128, FmhaPipelineHDim128, FmhaEpilogue, NeedPadding>;
-
 #ifndef BATCHED_INFER_HEADDIM_SWITCH
 #define BATCHED_INFER_HEADDIM_SWITCH(HEAD_DIM1, HEAD_DIM2, ...)        \
     [&] {                                                              \
         if(HEAD_DIM1 == HEAD_DIM2 && HEAD_DIM2 == 64)                  \
         {                                                              \
-            using FmhaKernel = FmhaKernelHDim64;                       \
+            using FmhaShape = FmhaShapeHDim64;                         \
             __VA_ARGS__();                                             \
         }                                                              \
         else if(HEAD_DIM1 == HEAD_DIM2 && HEAD_DIM2 == 128)            \
         {                                                              \
-            using FmhaKernel = FmhaKernelHDim128;                      \
+            using FmhaShape = FmhaShapeHDim128;                        \
             __VA_ARGS__();                                             \
         }                                                              \
         else                                                           \
@@ -119,8 +79,39 @@ struct batched_infer_masktype_attnbias_dispatched
 
     static void Run(BatchedForwardParams& param, hipStream_t stream)
     {
-        BATCHED_INFER_HEADDIM_SWITCH(
-            param.K, param.Kv, [&] { RunWithKernel<FmhaKernel>(param, stream); });
+        BATCHED_INFER_HEADDIM_SWITCH(param.K, param.Kv, [&] {
+            using FmhaTilePartitioner = FmhaFwdTilePartitioner<FmhaShape>;
+            using FmhaPipelineProblem =
+                ck::tile_program::block::BlockFmhaPipelineProblem<QDataType,
+                                                                  KDataType,
+                                                                  VDataType,
+                                                                  SaccDataType,
+                                                                  SMPLComputeDataType,
+                                                                  BiasDataType,
+                                                                  PDataType,
+                                                                  OaccDataType,
+                                                                  ODataType,
+                                                                  256, // BlockSize
+                                                                  FmhaShape>;
+
+            using FmhaPipeline =
+                ck::tile_program::block::BlockFmhaPipelineQRKSVS<FmhaPipelineProblem>;
+
+            if(param.M % FmhaShape::kM0 == 0 && param.N % FmhaShape::kN0 == 0)
+            {
+                constexpr bool NeedPadding = false;
+                using FmhaKernel =
+                    FmhaFwdKernel<FmhaTilePartitioner, FmhaPipeline, FmhaEpilogue, NeedPadding>;
+                RunWithKernel<FmhaKernel>(param, stream);
+            }
+            else
+            {
+                constexpr bool NeedPadding = true;
+                using FmhaKernel =
+                    FmhaFwdKernel<FmhaTilePartitioner, FmhaPipeline, FmhaEpilogue, NeedPadding>;
+                RunWithKernel<FmhaKernel>(param, stream);
+            }
+        });
     };
 
     template <typename FmhaKernel>

From c3ddb79e89f669b2c77779213907e96eeeb665c6 Mon Sep 17 00:00:00 2001
From: Qianfeng Zhang <Qianfeng.Zhang@amd.com>
Date: Wed, 29 Nov 2023 19:39:37 +0000
Subject: [PATCH 246/641] Split NeedPadding into MNeedPadding and NNeedPadding

---
 .../hip_fmha/ck_tiled_fmha_batched_infer.h    | 44 +++++++++--
 .../hip_fmha/ck_tiled_fmha_forward_kernel.h   | 17 +++--
 .../hip_fmha/ck_tiled_fmha_grouped_infer.h    | 75 ++++++++-----------
 3 files changed, 78 insertions(+), 58 deletions(-)

diff --git a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_infer.h b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_infer.h
index 38ab8ad4c..3492f61f3 100644
--- a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_infer.h
+++ b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_infer.h
@@ -99,18 +99,48 @@ struct batched_infer_masktype_attnbias_dispatched
 
             if(param.M % FmhaShape::kM0 == 0 && param.N % FmhaShape::kN0 == 0)
             {
-                constexpr bool NeedPadding = false;
-                using FmhaKernel =
-                    FmhaFwdKernel<FmhaTilePartitioner, FmhaPipeline, FmhaEpilogue, NeedPadding>;
+                constexpr bool MNeedPadding = false;
+                constexpr bool NNeedPadding = false;
+                using FmhaKernel            = FmhaFwdKernel<FmhaTilePartitioner,
+                                                 FmhaPipeline,
+                                                 FmhaEpilogue,
+                                                 MNeedPadding,
+                                                 NNeedPadding>;
                 RunWithKernel<FmhaKernel>(param, stream);
             }
-            else
+            else if(param.M % FmhaShape::kM0 == 0 && param.N % FmhaShape::kN0 != 0)
             {
-                constexpr bool NeedPadding = true;
-                using FmhaKernel =
-                    FmhaFwdKernel<FmhaTilePartitioner, FmhaPipeline, FmhaEpilogue, NeedPadding>;
+                constexpr bool MNeedPadding = false;
+                constexpr bool NNeedPadding = true;
+                using FmhaKernel            = FmhaFwdKernel<FmhaTilePartitioner,
+                                                 FmhaPipeline,
+                                                 FmhaEpilogue,
+                                                 MNeedPadding,
+                                                 NNeedPadding>;
                 RunWithKernel<FmhaKernel>(param, stream);
             }
+            else if(param.M % FmhaShape::kM0 != 0 && param.N % FmhaShape::kN0 == 0)
+            {
+                constexpr bool MNeedPadding = true;
+                constexpr bool NNeedPadding = false;
+                using FmhaKernel            = FmhaFwdKernel<FmhaTilePartitioner,
+                                                 FmhaPipeline,
+                                                 FmhaEpilogue,
+                                                 MNeedPadding,
+                                                 NNeedPadding>;
+                RunWithKernel<FmhaKernel>(param, stream);
+            }
+            else if(param.M % FmhaShape::kM0 != 0 && param.N % FmhaShape::kN0 != 0)
+            {
+                constexpr bool MNeedPadding = true;
+                constexpr bool NNeedPadding = true;
+                using FmhaKernel            = FmhaFwdKernel<FmhaTilePartitioner,
+                                                 FmhaPipeline,
+                                                 FmhaEpilogue,
+                                                 MNeedPadding,
+                                                 NNeedPadding>;
+                RunWithKernel<FmhaKernel>(param, stream);
+            };
         });
     };
 
diff --git a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_forward_kernel.h b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_forward_kernel.h
index 9759c9832..e2b048546 100644
--- a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_forward_kernel.h
+++ b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_forward_kernel.h
@@ -18,7 +18,8 @@
 template <typename TilePartitioner_,
           typename FmhaPipeline_,
           typename EpiloguePipeline_,
-          bool NeedPadding>
+          bool MNeedPadding,
+          bool NNeedPadding>
 struct FmhaFwdKernel
 {
     using TilePartitioner                   = ck::remove_cvref_t<TilePartitioner_>;
@@ -360,7 +361,7 @@ struct FmhaFwdKernel
 
             return pad_tensor_view(q_dram_naive,
                                    make_tuple(Number<FmhaPipeline::kM0>{}, Number<1>{}),
-                                   Sequence<NeedPadding, false>{});
+                                   Sequence<MNeedPadding, false>{});
         }();
         const auto k_dram = [&]() {
             const auto k_dram_naive = make_naive_tensor_view<AddressSpaceEnum::Global>(
@@ -372,7 +373,7 @@ struct FmhaFwdKernel
 
             return pad_tensor_view(k_dram_naive,
                                    make_tuple(Number<FmhaPipeline::kN0>{}, Number<1>{}),
-                                   Sequence<NeedPadding, false>{});
+                                   Sequence<NNeedPadding, false>{});
         }();
         const auto v_dram = [&]() {
             if constexpr(ck::is_same_v<VLayout, ck::tensor_layout::gemm::RowMajor>)
@@ -396,7 +397,7 @@ struct FmhaFwdKernel
                 ///   v_dram_transposed.GetTensorDescriptor().GetLength(). Replace
                 ///   following if-clause by pad_tensor_view() call after fixing this
                 ///   issue.
-                if constexpr(!NeedPadding)
+                if constexpr(!NNeedPadding)
                 {
                     return v_dram_transposed;
                 }
@@ -426,7 +427,7 @@ struct FmhaFwdKernel
 
                 return pad_tensor_view(v_dram_naive,
                                        make_tuple(Number<1>{}, Number<FmhaPipeline::kK1>{}),
-                                       Sequence<false, NeedPadding>{});
+                                       Sequence<false, NNeedPadding>{});
             }
         }();
 
@@ -451,7 +452,7 @@ struct FmhaFwdKernel
 
         const auto run_pipeline_with = [&](auto bias_dram_window) {
             const auto s_mask = [&]() {
-                if constexpr(NeedPadding)
+                if constexpr(NNeedPadding)
                 {
                     return [&](index_t /* m */, index_t n) {
                         const bool is_out_of_bound = !(n < kargs.seqlen_k);
@@ -491,7 +492,7 @@ struct FmhaFwdKernel
 
                     return pad_tensor_view(bias_dram_naive,
                                            bias_dram_window_lengths,
-                                           Sequence<NeedPadding, NeedPadding>{});
+                                           Sequence<MNeedPadding, NNeedPadding>{});
                 }();
 
                 auto bias_dram_window =
@@ -518,7 +519,7 @@ struct FmhaFwdKernel
 
             return pad_tensor_view(o_dram_naive,
                                    make_tuple(Number<FmhaPipeline::kM0>{}, Number<1>{}),
-                                   Sequence<NeedPadding, false>{});
+                                   Sequence<MNeedPadding, false>{});
         }();
 
         auto o_dram_window =
diff --git a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_infer.h b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_infer.h
index 54a477358..b52086fd7 100644
--- a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_infer.h
+++ b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_infer.h
@@ -57,59 +57,24 @@ struct grouped_infer_masktype_attnbias_dispatched
                                                              FmhaWarpTile,
                                                              VLayout>;
 
-    using FmhaTilePartitionerHDim64  = FmhaFwdTilePartitioner<FmhaShapeHDim64>;
-    using FmhaTilePartitionerHDim128 = FmhaFwdTilePartitioner<FmhaShapeHDim128>;
-    using FmhaPipelineProblemHDim64 =
-        ck::tile_program::block::BlockFmhaPipelineProblem<QDataType,
-                                                          KDataType,
-                                                          VDataType,
-                                                          SaccDataType,
-                                                          SMPLComputeDataType,
-                                                          BiasDataType,
-                                                          PDataType,
-                                                          OaccDataType,
-                                                          ODataType,
-                                                          256, // BlockSize
-                                                          FmhaShapeHDim64>;
-    using FmhaPipelineProblemHDim128 =
-        ck::tile_program::block::BlockFmhaPipelineProblem<QDataType,
-                                                          KDataType,
-                                                          VDataType,
-                                                          SaccDataType,
-                                                          SMPLComputeDataType,
-                                                          BiasDataType,
-                                                          PDataType,
-                                                          OaccDataType,
-                                                          ODataType,
-                                                          256, // BlockSize
-                                                          FmhaShapeHDim128>;
-
-    using FmhaPipelineHDim64 =
-        ck::tile_program::block::BlockFmhaPipelineQRKSVS<FmhaPipelineProblemHDim64>;
-    using FmhaPipelineHDim128 =
-        ck::tile_program::block::BlockFmhaPipelineQRKSVS<FmhaPipelineProblemHDim128>;
-
     using FmhaEpilogue = FmhaFwdEpilogue<FmhaFwdEpilogueProblem<OaccDataType, ODataType>>;
 
-    // ToDo: define NeedPadding according to runtime lengths
-    static constexpr bool NeedPadding = true;
-
-    using FmhaKernelHDim64 =
-        FmhaFwdKernel<FmhaTilePartitionerHDim64, FmhaPipelineHDim64, FmhaEpilogue, NeedPadding>;
-    using FmhaKernelHDim128 =
-        FmhaFwdKernel<FmhaTilePartitionerHDim128, FmhaPipelineHDim128, FmhaEpilogue, NeedPadding>;
+    // This is the default setting, the effective setting should be done according to M/N size of
+    // each batch
+    static constexpr bool MNeedPadding = true;
+    static constexpr bool NNeedPadding = true;
 
 #ifndef GROUPED_INFER_HEADDIM_SWITCH
 #define GROUPED_INFER_HEADDIM_SWITCH(HEAD_DIM1, HEAD_DIM2, ...)        \
     [&] {                                                              \
         if(HEAD_DIM1 == HEAD_DIM2 && HEAD_DIM2 == 64)                  \
         {                                                              \
-            using FmhaKernel = FmhaKernelHDim64;                       \
+            using FmhaShape = FmhaShapeHDim64;                         \
             __VA_ARGS__();                                             \
         }                                                              \
         else if(HEAD_DIM1 == HEAD_DIM2 && HEAD_DIM2 == 128)            \
         {                                                              \
-            using FmhaKernel = FmhaKernelHDim128;                      \
+            using FmhaShape = FmhaShapeHDim128;                        \
             __VA_ARGS__();                                             \
         }                                                              \
         else                                                           \
@@ -121,8 +86,32 @@ struct grouped_infer_masktype_attnbias_dispatched
 
     static void Run(GroupedForwardParams& param, hipStream_t stream)
     {
-        GROUPED_INFER_HEADDIM_SWITCH(
-            param.K, param.Kv, [&] { RunWithKernel<FmhaKernel>(param, stream); });
+        GROUPED_INFER_HEADDIM_SWITCH(param.K, param.Kv, [&] {
+            using FmhaTilePartitioner = FmhaFwdTilePartitioner<FmhaShape>;
+            using FmhaPipelineProblem =
+                ck::tile_program::block::BlockFmhaPipelineProblem<QDataType,
+                                                                  KDataType,
+                                                                  VDataType,
+                                                                  SaccDataType,
+                                                                  SMPLComputeDataType,
+                                                                  BiasDataType,
+                                                                  PDataType,
+                                                                  OaccDataType,
+                                                                  ODataType,
+                                                                  256, // BlockSize
+                                                                  FmhaShape>;
+
+            using FmhaPipeline =
+                ck::tile_program::block::BlockFmhaPipelineQRKSVS<FmhaPipelineProblem>;
+
+            using FmhaKernel = FmhaFwdKernel<FmhaTilePartitioner,
+                                             FmhaPipeline,
+                                             FmhaEpilogue,
+                                             MNeedPadding,
+                                             NNeedPadding>;
+
+            RunWithKernel<FmhaKernel>(param, stream);
+        });
     };
 
     template <typename FmhaKernel>

From aebe8ea1ee067b28f13761e0b80047e8d537886c Mon Sep 17 00:00:00 2001
From: Qianfeng Zhang <Qianfeng.Zhang@amd.com>
Date: Thu, 30 Nov 2023 14:51:49 +0000
Subject: [PATCH 247/641] Add temporary scripts for ck-tiled verification and
 benchmarking

---
 tests/test_forward_ck_tiled.py                | 643 ++++++++++++++++++
 third_party/composable_kernel_tiled           |   2 +-
 .../benchmark_mem_eff_attention_ck_tiled.py   | 315 +++++++++
 ...benchmark_mem_eff_attn_decoder_ck_tiled.py | 206 ++++++
 4 files changed, 1165 insertions(+), 1 deletion(-)
 create mode 100644 tests/test_forward_ck_tiled.py
 create mode 100644 xformers/benchmarks/benchmark_mem_eff_attention_ck_tiled.py
 create mode 100644 xformers/benchmarks/benchmark_mem_eff_attn_decoder_ck_tiled.py

diff --git a/tests/test_forward_ck_tiled.py b/tests/test_forward_ck_tiled.py
new file mode 100644
index 000000000..f295887e9
--- /dev/null
+++ b/tests/test_forward_ck_tiled.py
@@ -0,0 +1,643 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
+#
+# This source code is licensed under the BSD license found in the
+# LICENSE file in the root directory of this source tree.
+
+import math
+import random
+from typing import List, Optional, Sequence, Tuple, Type, TypeVar
+
+import pytest
+import torch
+from scipy.stats import binomtest
+from torch.utils.checkpoint import checkpoint
+
+import xformers.ops
+from xformers.ops import fmha
+from xformers.ops.fmha.common import AttentionOpBase
+
+from .utils import assert_allclose
+
+torch.backends.cuda.matmul.allow_tf32 = False
+cuda_only = pytest.mark.skipif(not torch.cuda.is_available(), reason="requires CUDA")
+
+_devices = ["cpu", "cuda"] if torch.cuda.is_available() else ["cpu"]
+_types = [torch.float16, torch.bfloat16]
+
+T = TypeVar(
+    "T", Type[fmha.common.AttentionFwOpBase], Type[fmha.common.AttentionBwOpBase]
+)
+
+ALL_FW_OPS: Sequence[Type[fmha.common.AttentionFwOpBase]] = [
+    fmha.ck.FwOp,
+]
+
+ALL_BW_OPS: Sequence[Type[fmha.common.AttentionBwOpBase]] = [
+    fmha.ck.BwOp,
+]
+
+def sample_random_supported_fw(
+    inp: fmha.Inputs, seed: int
+) -> Type[fmha.common.AttentionFwOpBase]:
+    r = random.Random(seed)
+    fw_ops = list(ALL_FW_OPS)
+    r.shuffle(fw_ops)
+    for op in fw_ops:
+        if op.supports(inp):
+            return op
+    raise NotImplementedError(f"Could not find a FW operator for: {inp}")
+
+
+def generate_test_shapes_B_Mq_Mkv_H_K_Kv(op):
+    shapes = []
+    for B in op._TEST_BATCH_SIZES:
+        for Mq in [32, 256]:
+            for Mkv in [32, 64, 256, 1024]:
+                for K in op._TEST_K:
+                    shapes.append((B, Mq, Mkv, 1, K, K))
+        Mq = 256
+        Mkv = 128
+        K = 32
+        H = 1
+        # Weird values of parameters
+        for M in [2, 3, 15, 31, 32, 34, 68, 72, 90, 132, 136]:
+            shapes.append((B, M, Mkv, H, K, K))
+            shapes.append((B, Mq, M, H, K, K))
+        for _K in [1, 2, 3, 31, 34, 36, 38, 40, 64, 80, 160, 256 + 2, 256 + 8, 512]:
+            if _K <= op.SUPPORTED_MAX_K:
+                shapes.append((B, Mq, Mkv, H, _K, _K))
+        # Different value for K / Kv
+        if op.SUPPORTS_DIFFERENT_VALUE_EMBED:
+            for _K in [32, 36, 64, 256 + 8]:
+                shapes.append((B, Mq, Mkv, H, K, _K))
+                shapes.append((B, Mq, Mkv, H, _K, K))
+        # Exotic sizes
+        for K in op._TEST_K:
+            shapes.append((B, 16, 1024, H, K, K))
+            shapes.append((B, 1024, 16, H, K, K))
+        # Some number of heads
+        for H in [3, 5, 12]:
+            shapes.append((max(1, B // H), Mq, Mkv, H, K, K))
+    # Filter-out not supported shapes
+    shapes = [
+        shape
+        for shape in shapes
+        if len(
+            op.shape_not_supported_reasons(
+                Mq=shape[1], Mkv=shape[2], K=shape[4], Kv=shape[5]
+            )
+        )
+        == 0
+    ]
+    # Add some random shapes
+    if op in [
+        fmha.ck.FwOp,
+        fmha.ck.BwOp,
+    ]:
+        K_CHOICES = [8 * i for i in range(1, 256 // 8)]
+        r = random.Random(0)
+        found_count = 0
+        while found_count < 20:
+            B = r.randint(1, 400)
+            Mq = r.randint(1, 500)
+            Mkv = r.randint(1, 500)
+            H = r.randint(2, 11)
+            B = max(B // H, 1)
+            K = r.choice(K_CHOICES)
+            Kv = r.choice(K_CHOICES)
+            if not op.SUPPORTS_DIFFERENT_VALUE_EMBED:
+                Kv = K
+            if len(op.shape_not_supported_reasons(Mq, Mkv, K, Kv)):
+                continue
+            found_count += 1
+            shapes.append((B, Mq, Mkv, H, K, Kv))
+    return shapes
+
+
+def make_id(op, device, dtype, bias_type, *shape):
+    return (
+        f"{op.NAME}-{device}-{str(dtype)}-{bias_type.__name__}"
+        f"-{'-'.join([str(s) for s in shape])}"
+    )
+
+
+def _generate_op_device_dtype_biasT_B_Mq_Mkv_H_K_Kv(
+    ops_list: Sequence[Type[fmha.AttentionOpBase]], max_shapes_per_op: int = 65000
+):
+    r = random.Random(0)
+    combination = []
+    ids = []
+    for op in ops_list:
+        op_count = 0
+        # Sort list of masks, so it's deterministic across runs
+        LIST_MASKS = list(sorted(op.SUPPORTED_ATTN_BIAS_TYPES, key=lambda x: str(x)))
+        for shape in generate_test_shapes_B_Mq_Mkv_H_K_Kv(op):
+            has_one = False
+            for device in _devices:
+                if device not in op.SUPPORTED_DEVICES:
+                    continue
+                for dtype in op.SUPPORTED_DTYPES:
+                    bias_type = r.choice(LIST_MASKS)
+                    # Avoid using too much memory
+                    if bias_type not in [
+                        type(None),
+                        fmha.attn_bias.LowerTriangularMask,
+                    ]:
+                        B, Mq, Mkv, H, K, Kv = shape
+                        B = min(B, 12)
+
+                        if (
+                            bias_type
+                            is fmha.attn_bias.BlockDiagonalCausalFromBottomRightMask
+                        ):
+                            Mq, Mkv = min(Mkv, Mq), max(Mkv, Mq) + 2
+                        elif (
+                            bias_type
+                            is fmha.attn_bias.BlockDiagonalCausalWithOffsetPaddedKeysMask
+                        ):
+                            Mq, Mkv = min(Mkv, Mq), max(Mkv, Mq)
+                        shape = (B, Mq, Mkv, H, K, Kv)
+                    combination.append((op, device, dtype, bias_type, *shape))
+                    ids.append(
+                        f"{op.NAME}-{device}-{str(dtype)}-{bias_type.__name__}"
+                        f"-{'-'.join([str(s) for s in shape])}"
+                    )
+                    has_one = True
+            if has_one:
+                op_count += 1
+            if op_count > max_shapes_per_op:
+                break
+        # Some specific shapes for which we want to run without any mask
+        bias_type = type(None)
+        for shape in (
+            # Some strides/dims don't fit on an uint16
+            (1, 128, 128, 300, 128, 128),
+            (13, 1, 67, 200, 8, 8),
+            (1, 1 + 2**16, 4, 1, 8, 8),
+            (1, 4, 1 + 2**16, 1, 8, 8),
+            # TODO: Some strides don't fit on an uint32
+            # Crashes on Flash, Errors on Cutlass
+            # (1, 1, 64000, 300, 128, 128)
+        ):
+            for device in _devices:
+                if device not in op.SUPPORTED_DEVICES:
+                    continue
+                for dtype in op.SUPPORTED_DTYPES:
+                    combination.append((op, device, dtype, bias_type, *shape))
+    return {
+        "argvalues": combination,
+        "ids": [make_id(*c) for c in combination],
+    }
+
+
+parametrize_opFW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv = pytest.mark.parametrize(
+    "opFW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv",
+    **_generate_op_device_dtype_biasT_B_Mq_Mkv_H_K_Kv(ALL_FW_OPS),
+)
+parametrize_opFW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv__xs = pytest.mark.parametrize(
+    "opFW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv",
+    **_generate_op_device_dtype_biasT_B_Mq_Mkv_H_K_Kv(ALL_FW_OPS, max_shapes_per_op=1),
+)
+parametrize_opBW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv = pytest.mark.parametrize(
+    "opBW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv",
+    **_generate_op_device_dtype_biasT_B_Mq_Mkv_H_K_Kv(ALL_BW_OPS),
+)
+parametrize_opBW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv__xs = pytest.mark.parametrize(
+    "opBW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv",
+    **_generate_op_device_dtype_biasT_B_Mq_Mkv_H_K_Kv(ALL_BW_OPS, max_shapes_per_op=1),
+)
+
+
+def ref_attention(q, k, v, attn_bias=None, drop_mask=None, p=0.0, scale=None, dtype=None):
+    if q.ndim == 4:
+        assert p == 0.0
+        return ref_attention_bmhk(q, k, v, attn_bias=attn_bias, dtype=dtype)
+    if dtype is None:
+        dtype = torch.float32
+    q = q.to(dtype=dtype)
+    k = k.to(dtype=dtype)
+    v = v.to(dtype=dtype)
+
+    scale = scale if scale is not None else (q.shape[-1] ** -0.5)
+    q = q * scale
+
+    attn = q @ k.transpose(-2, -1)
+    if attn_bias is not None:
+        if isinstance(attn_bias, xformers.ops.AttentionBias):
+            # Always create in B,H,Mq,Mk format
+            attn_bias_tensor = attn_bias.materialize(
+                (q.shape[0], 1, q.shape[1], k.shape[1]),
+                device=q.device,
+                dtype=dtype,
+            )
+        else:
+            attn_bias_tensor = attn_bias.to(dtype=dtype)
+        if attn_bias_tensor.ndim == 4:
+            assert q.shape[0] == attn_bias_tensor.shape[0] * attn_bias_tensor.shape[1]
+            attn_bias_tensor = attn_bias_tensor.reshape(
+                [-1, *attn_bias_tensor.shape[2:]]
+            )
+        attn = attn + attn_bias_tensor
+    attn = attn.softmax(-1)
+    if drop_mask is not None:
+        attn = attn * (drop_mask / (1 - p))
+    return attn @ v
+
+
+def ref_attention_bmhk(q, k, v, attn_bias, scale=None, dtype=None) -> torch.Tensor:
+    assert q.ndim == 4
+
+    def T(t):
+        return t.permute((0, 2, 1, 3)).reshape(
+            [t.shape[0] * t.shape[2], t.shape[1], t.shape[3]]
+        )
+
+    if isinstance(attn_bias, xformers.ops.AttentionBias):
+        attn_bias = attn_bias.materialize(
+            (q.shape[0], q.shape[2], q.shape[1], k.shape[1]),
+            device=q.device,
+            dtype=torch.float32,
+        ).reshape([q.shape[0] * q.shape[2], q.shape[1], k.shape[1]])
+    out = ref_attention(T(q), T(k), T(v), attn_bias, scale=scale, dtype=dtype)
+    out = out.reshape([q.shape[0], q.shape[2], q.shape[1], v.shape[3]])
+    return out.permute((0, 2, 1, 3))
+
+
+def _rand_seqlens(
+    r: random.Random,
+    bs: int,
+    q_len: int,
+    kv_len: int,
+    more_keys_than_queries_per_block: bool,
+) -> Tuple[Sequence[int], Sequence[int]]:
+    """
+    Generates lists of lengths of query blocks and corresponding key blocks.
+    The total number of queries will be bs * q_len and the
+    total number of keys will be bs * kv_len.
+    """
+    if more_keys_than_queries_per_block:
+        assert kv_len >= q_len
+    q_len *= bs
+    kv_len *= bs
+    seqlens_q: List[int] = []
+    seqlens_k: List[int] = []
+
+    step_q = [max(1, q_len // 10), max(2, q_len // 2)]
+    step_k = [max(1, kv_len // 10), max(2, kv_len // 2)]
+    while sum(seqlens_q) < q_len and sum(seqlens_k) < kv_len:
+        num_queries = r.randrange(*step_q)
+        seqlens_q.append(num_queries)
+
+        if more_keys_than_queries_per_block:
+            # Must select at least `num_queries` keys
+            # But also leave enough keys for later
+            keys_left = kv_len - sum(seqlens_k, 0)
+            queries_left = q_len - sum(seqlens_q[:-1], 0)
+            assert keys_left >= queries_left
+            seqlens_k.append(num_queries + r.randrange(0, keys_left - queries_left))
+        else:
+            seqlens_k.append(r.randrange(*step_k))
+    seqlens_q[-1] = q_len - sum(seqlens_q[:-1])
+    seqlens_k[-1] = kv_len - sum(seqlens_k[:-1])
+    return seqlens_q, seqlens_k
+
+
+def _rand_partition(r: random.Random, total: int, n: int) -> List[int]:
+    # returns list of n nonnegative integers summing to total
+    idx = {0, total}
+    while len(idx) < n + 1:
+        idx.add(r.randint(1, total - 1))
+    s = sorted(idx)
+    return [e - b for b, e in zip(s[:-1], s[1:])]
+
+
+def _rand_maxed_partition(
+    r: random.Random, total: int, n: int, mx: int, positive: bool = True
+) -> List[int]:
+    # returns list of n nonnegative integers less than mx summing to total
+    # NB: This is unfortunately biased towards evenly-split bins.
+    # If `positive`, outputs are positive
+    if positive:
+        total -= n
+        mx -= 1
+    idxs = r.sample(range(n * mx), total)
+    y = torch.zeros(n, mx, dtype=torch.int32)
+    y.flatten()[idxs] = 1
+    z = y.sum(1)
+    if positive:
+        z += 1
+    return z.tolist()
+
+
+def _rand_seqlens_padded_k(
+    r: random.Random, bs: int, q_len: int, kv_len: int
+) -> Tuple[Sequence[int], Sequence[int]]:
+    # This is for BlockDiagonalCausalWithOffsetPaddedKeysMask.
+    # we need q_seqlens and k_seqlens to be of len bsz.
+    # For each "batch element" there must be more keys than queries
+    # because this bias type is "bottom right" and so any extra queries
+    # will attend to nothing and have undefined result.
+    # In addition every element of k_seqlens must be <= kv_len
+    if q_len > kv_len:
+        raise ValueError("need more keys than values")
+    if q_len == kv_len:
+        # all key slots are needed so we cannot have padding
+        q_seqlens = k_seqlens = [kv_len] * bs
+    else:
+        q_seqlens = _rand_maxed_partition(r, q_len * bs, bs, kv_len)
+        k_seqlens = [r.randint(i, kv_len) for i in q_seqlens]
+    return q_seqlens, k_seqlens
+
+
+def _create_aligned_bias(B: int, H: int, Mq: int, Mkv: int, **kwargs) -> torch.Tensor:
+    align_to = 8
+    return (
+        torch.randn(
+            (
+                B,
+                H,
+                Mq,
+                align_to * ((Mkv + align_to - 1) // align_to),
+            ),
+            **kwargs,
+        )
+        * 3
+    )[:, :, :, :Mkv]
+
+
+def create_attn_bias(
+    bias_type,
+    batch_size: int,
+    num_heads: int,
+    q_len: int,
+    kv_len: int,
+    device,
+    dtype,
+    requires_grad: bool,
+    fmt: str,
+    op: Type[AttentionOpBase],
+):
+    if bias_type is None or isinstance(None, bias_type):
+        return None
+    r = random.Random("-".join(map(str, [batch_size, q_len, kv_len, dtype, fmt])))
+    if bias_type is torch.Tensor:
+        if fmt == "BMK":
+            batch_size *= num_heads
+            num_heads = 1
+        # `small_k` only supports an expanded 1d bias
+        if op in [fmha.small_k.FwOp, fmha.small_k.BwOp]:
+            attn_bias = (
+                torch.randn(
+                    (batch_size, num_heads, 1, kv_len), device=device, dtype=dtype
+                )
+                * 3
+            )
+            attn_bias = attn_bias.expand(batch_size, num_heads, q_len, kv_len)
+        else:
+            attn_bias = _create_aligned_bias(
+                batch_size,
+                num_heads,
+                q_len,
+                kv_len,
+                device=device,
+                dtype=dtype,
+            )
+            # ToDo: need a fix in ck-flashAttn to avoid divided-by-zero when all-(-inf) occurred
+            #       with the data read by one-thread
+            # make sure it also works if the first columns are partially masked out
+            ## attn_bias[0, 0, q_len - 1 :, : num_heads - 2] = -math.inf
+
+        if requires_grad:
+            attn_bias.requires_grad_(True)
+        if fmt == "BMK":
+            attn_bias = attn_bias[:, 0]
+        return attn_bias
+    if bias_type is fmha.attn_bias.LowerTriangularMask:
+        return fmha.attn_bias.LowerTriangularMask()
+    if bias_type is fmha.attn_bias.LowerTriangularMaskWithTensorBias:
+        attn_bias = _create_aligned_bias(
+            batch_size,
+            num_heads,
+            q_len,
+            kv_len,
+            device=device,
+            dtype=dtype,
+        )
+        if requires_grad:
+            attn_bias.requires_grad_(True)
+        return fmha.attn_bias.LowerTriangularMaskWithTensorBias(attn_bias)
+    if bias_type in [
+        fmha.attn_bias.BlockDiagonalMask,
+        fmha.attn_bias.BlockDiagonalCausalMask,
+        fmha.attn_bias.BlockDiagonalCausalFromBottomRightMask,
+    ]:
+        # This bias is not supported in BMK format
+        assert fmt == "BMHK"
+        block_diag = fmha.attn_bias.BlockDiagonalMask.from_seqlens(
+            *_rand_seqlens(
+                r,
+                batch_size,
+                q_len,
+                kv_len,
+                more_keys_than_queries_per_block=bias_type
+                is fmha.attn_bias.BlockDiagonalCausalFromBottomRightMask,
+            )
+        )
+        if bias_type is fmha.attn_bias.BlockDiagonalCausalMask:
+            block_diag = block_diag.make_causal()
+        if bias_type is fmha.attn_bias.BlockDiagonalCausalFromBottomRightMask:
+            block_diag = block_diag.make_causal_from_bottomright()
+        return block_diag
+    if bias_type == fmha.attn_bias.BlockDiagonalCausalWithOffsetPaddedKeysMask:
+        assert fmt == "BMHK"
+        q, k = _rand_seqlens_padded_k(r, batch_size, q_len, kv_len)
+        g_block_diag = (
+            fmha.attn_bias.BlockDiagonalCausalWithOffsetPaddedKeysMask.from_seqlens(
+                q_seqlen=q,
+                kv_padding=kv_len,
+                kv_seqlen=k,
+            )
+        )
+        return g_block_diag
+
+    assert False, f"Unsupported bias type: {bias_type}"
+
+
+def get_bias_grad(attn_bias, clear: bool = False) -> Optional[torch.Tensor]:
+    tensor_with_grad: Optional[torch.Tensor] = None
+    if isinstance(attn_bias, torch.Tensor):
+        tensor_with_grad = attn_bias
+    if isinstance(attn_bias, fmha.attn_bias.LowerTriangularMaskWithTensorBias):
+        tensor_with_grad = attn_bias._bias
+    if tensor_with_grad is not None:
+        grad = tensor_with_grad.grad
+        if clear:
+            tensor_with_grad.grad = None
+        return grad
+    return None
+
+
+def create_tensors(
+    op: Type[AttentionOpBase],
+    device,
+    dtype,
+    attn_bias_type,
+    B,
+    q_len,
+    kv_len,
+    h,
+    k,
+    kv,
+    *,
+    attn_bias_requires_grad: bool = False,
+    fmt: str = "BMK",
+):
+    torch.manual_seed(B * q_len + kv_len * k + kv)
+    scale = 3
+    if fmt == "BMK":
+        query = torch.randn((B * h, q_len, k), device=device, dtype=dtype).mul_(scale)
+        key = torch.randn((B * h, kv_len, k), device=device, dtype=dtype).mul_(scale)
+        value = torch.randn((B * h, kv_len, kv), device=device, dtype=dtype).mul_(scale)
+    else:
+        assert fmt == "BMHK"
+        query = torch.randn((B, q_len, h, k), device=device, dtype=dtype).mul_(scale)
+        key = torch.randn((B, kv_len, h, k), device=device, dtype=dtype).mul_(scale)
+        value = torch.randn((B, kv_len, h, kv), device=device, dtype=dtype).mul_(scale)
+
+    if fmt == "BMK" and not fmha.common._is_bias_type_supported_in_BMK(attn_bias_type):
+        attn_bias_type = None
+    attn_bias = None
+    if attn_bias_type is not None:
+        attn_bias = create_attn_bias(
+            attn_bias_type,
+            batch_size=B,
+            num_heads=h,
+            q_len=q_len,
+            kv_len=kv_len,
+            dtype=dtype,
+            device=device,
+            requires_grad=attn_bias_requires_grad,
+            fmt=fmt,
+            op=op,
+        )
+        if isinstance(
+            attn_bias,
+            (
+                fmha.attn_bias.BlockDiagonalMask,
+                fmha.attn_bias.BlockDiagonalCausalWithOffsetPaddedKeysMask,
+            ),
+        ):
+            query, key, value = [
+                x.reshape([1, -1, *x.shape[2:]]) for x in [query, key, value]
+            ]
+
+    inputs = fmha.Inputs(query=query, key=key, value=value, attn_bias=attn_bias)
+    reasons = op.not_supported_reasons(inputs)
+    if reasons:
+        err_msg = f"{op.NAME}: unsupported ({'/'.join(reasons)})"
+        # Ensure we free memory to avoid OOMs
+        del query, key, value, attn_bias, inputs
+        pytest.skip(err_msg)
+    return query, key, value, attn_bias
+
+
+def bmhk2bmk(tensor) -> torch.Tensor:
+    return (
+        tensor.permute((0, 2, 1, 3))
+        .contiguous()
+        .view([tensor.shape[0] * tensor.shape[2], tensor.shape[1], tensor.shape[3]])
+    )
+
+
+def bmk2bmhk(tensor, num_heads: int) -> torch.Tensor:
+    return tensor.reshape([-1, num_heads, tensor.shape[1], tensor.shape[2]]).permute(
+        (0, 2, 1, 3)
+    )
+
+
+@pytest.mark.parametrize("fmt", ["BMK", "BMHK"])
+@pytest.mark.parametrize("packed", [False, True])
+@parametrize_opFW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv
+def test_forward(
+    opFW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv,
+    packed,
+    fmt,
+):
+    (
+        op,
+        device,
+        dtype,
+        bias_type,
+        batch_size,
+        q_len,
+        kv_len,
+        h,
+        k,
+        kv,
+    ) = opFW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv
+
+    if bias_type is not None and bias_type is not type(None):
+        if bias_type is not torch.Tensor and bias_type is not fmha.attn_bias.BlockDiagonalMask:
+            pytest.skip("only three bias types are supported by ck-tiled!")
+
+    if dtype is torch.bfloat16:
+        pytest.skip("bfloat16 is currently not supported by ck-tiled!")
+
+    if not (k == kv and (kv == 64 or kv == 128)):
+        pytest.skip("only head-dim size 64 or 128 supported by ck-tiled!")
+
+    if packed and not (k == kv and q_len == kv_len):
+        pytest.skip(
+            f"packed incompatible with `k ({k}) != kv ({kv})` or `q_len ({q_len}) != kv_len ({kv_len})`"
+        )
+    if fmt == "BMK" and not fmha.common._is_bias_type_supported_in_BMK(bias_type):
+        pytest.skip("BMK incompatible with this bias")
+
+    query, key, value, attn_bias = create_tensors(
+        *opFW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv, fmt="BMHK" if packed else fmt
+    )
+
+    if packed:
+        c = torch.stack([query, key, value], 2)
+        if fmt == "BMK":
+            # bm3hk -> 3bhmk -> 3Bmk
+            c = c.permute(2, 0, 3, 1, 4).view([3, -1, q_len, k])
+            query, key, value = c[0], c[1], c[2]
+            # Re-create bias in the right format
+            attn_bias = create_attn_bias(
+                bias_type=bias_type,
+                batch_size=batch_size,
+                num_heads=h,
+                q_len=q_len,
+                kv_len=kv_len,
+                device=device,
+                dtype=dtype,
+                requires_grad=False,
+                fmt=fmt,
+                op=op,
+            )
+        else:
+            # bm3hk -> 3 x bmhk
+            query, key, value = xformers.ops.unbind(c, 2)
+        assert not query.is_contiguous()
+
+    out = xformers.ops.memory_efficient_attention_forward(
+        query, key, value, attn_bias, op=op
+    )
+    assert not out.isnan().any(), ("Output has NaNs", attn_bias)
+    out2 = xformers.ops.memory_efficient_attention_forward(
+        query, key, value, attn_bias, op=op
+    )
+    assert torch.allclose(out, out2, atol=0.0, rtol=0.0), (
+        "Non-deterministic behavior",
+        attn_bias,
+    )
+
+    ref = ref_attention(query, key, value, attn_bias)
+    assert out.shape == ref.shape, out.shape
+    assert_allclose(
+        out.float(),
+        ref,
+        atol=op.ERROR_ATOL[dtype],
+        rtol=op.ERROR_RTOL.get(dtype, 1e-5),
+    )
diff --git a/third_party/composable_kernel_tiled b/third_party/composable_kernel_tiled
index 0a7174ad8..bcd11b388 160000
--- a/third_party/composable_kernel_tiled
+++ b/third_party/composable_kernel_tiled
@@ -1 +1 @@
-Subproject commit 0a7174ad864cda7f59c1e8f5ccefee3359c88978
+Subproject commit bcd11b3880733d3a5603b04ff8f5e1fa5876293f
diff --git a/xformers/benchmarks/benchmark_mem_eff_attention_ck_tiled.py b/xformers/benchmarks/benchmark_mem_eff_attention_ck_tiled.py
new file mode 100644
index 000000000..a008bc222
--- /dev/null
+++ b/xformers/benchmarks/benchmark_mem_eff_attention_ck_tiled.py
@@ -0,0 +1,315 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
+#
+# This source code is licensed under the BSD license found in the
+# LICENSE file in the root directory of this source tree.
+
+
+import itertools
+import random
+from functools import partial
+
+import torch
+from torch.utils import benchmark
+from xformers.benchmarks.utils import benchmark_main_helper
+
+import xformers.ops
+import xformers.ops.fmha as fmha
+
+torch.backends.cuda.matmul.allow_tf32 = False
+
+
+def create_attn_bias(
+    bias_type,
+    batch_size: int,
+    num_heads: int,
+    q_len: int,
+    kv_len: int,
+    device,
+    dtype,
+    bias_requires_grad: bool = False,
+):
+    NoneType = type(None)
+    if bias_type is NoneType:
+        return None
+    if bias_type is torch.Tensor:
+        attn_bias = torch.randn((1, 1, q_len, kv_len), device=device, dtype=dtype)
+        return attn_bias.expand(batch_size, num_heads, q_len, kv_len)
+    if bias_type is fmha.attn_bias.LowerTriangularMask:
+        return bias_type()
+    assert False, f"Unsupported bias type: {bias_type}"
+
+
+def ref_attention_bmk(q, k, v, attn_bias=None, p=0.0):
+    if isinstance(attn_bias, xformers.ops.AttentionMask):
+        attn_bias = (
+            attn_bias.materialize((q.shape[0], 1, q.shape[1], k.shape[1]))
+            .to(q)
+            .squeeze()
+        )
+    q = q * (1.0 / q.shape[-1] ** 0.5)
+    if attn_bias is None:
+        attn = q @ k.transpose(-2, -1)
+    else:
+        # equivalent to (q @ k.transpose(-2, -1) + m).softmax(-1) @ v
+        # but faster, and is what is used in PyTorch now
+        attn = torch.baddbmm(attn_bias, q, k.transpose(-2, -1))
+    attn = attn.softmax(-1)
+    if p > 0:
+        attn = torch.nn.functional.dropout(attn, p=p)
+    return attn @ v
+
+
+def ref_attention(q, k, v, attn_bias, p=0.0):
+    assert q.ndim == 4
+    B, M, H, K = q.shape
+
+    def T(t):
+        return t.permute((0, 2, 1, 3)).reshape(
+            [t.shape[0] * t.shape[2], t.shape[1], t.shape[3]]
+        )
+
+    if isinstance(attn_bias, torch.Tensor):
+        attn_bias = attn_bias.reshape(B * H, M, M)
+    out = ref_attention_bmk(T(q), T(k), T(v), attn_bias, p)
+    out = out.reshape([q.shape[0], q.shape[2], q.shape[1], v.shape[3]])
+    return out.permute((0, 2, 1, 3))
+
+
+min_run_time = 0.5
+device = torch.device("cuda")
+
+NUM_THREADS = [1] if device.type == "cuda" else [1, 40]
+SHAPES = [
+    # ViT
+    ##(384, 197, 1, 88),
+    ##(384, 197, 1, 80),
+    ##(384, 197, 1, 64),
+    ##(1024, 197, 1, 88),
+    ##(1024, 197, 1, 80),
+    (1024, 197, 1, 64),
+    # ViT-Huge
+    ##(32 * 16, 197, 1, 80),
+    ##(32, 197, 16, 80),
+    ##(32, 197, 16, 64),
+    (32, 197, 16, 128),
+    # ViT-Giant
+    ##(16 * 16, 197, 1, 88),
+    ##(16, 197, 16, 88),
+    (16, 197, 16, 64),
+    (16, 197, 16, 128),
+    # FB models
+    (1024, 82, 8, 64),
+    (150, 256, 16, 64),
+    (64, 256, 12, 64),
+    # Stable diffusion (https://github.com/huggingface/diffusers/pull/532)
+    ##(1, 4096, 16, 40),  # 512x512
+    ##(1, 16384, 16, 40),  # 1024x1024
+    ##(1, 4096, 16, 80),
+    #(1, 16384, 16, 80),  // disabled on MI250 due to big memory requirement
+    # + bs4
+    ##(4, 4096, 16, 40),
+    #(4, 16384, 16, 40),  // disabled on MI250 due to big memory requirement
+    ##(4, 4096, 16, 80),
+    #(4, 16384, 16, 80),  // disabled on MI250 due to big memory requirement
+    # ParlAI model
+    #(256, 4096, 16, 64), // disabled on MI250 due to big memory requirement
+    # Zetta B M H K
+    (8, 2048, 20, 128),
+    # LLaMa 70b - mp=8/16
+    *sorted(itertools.product([1, 2], [2048, 4096, 8192], [4, 8], [128])),
+    *sorted(
+        ##itertools.product([16], [128, 512, 1024], [16], [16, 32, 64, 128, 160, 256])
+        ## disabled K/Kv bigger than 128
+        itertools.product([16], [128, 512, 1024], [16], [64, 128])
+    ),
+]
+
+OPS = [
+    (xformers.ops.fmha.ck.FwOp, xformers.ops.fmha.ck.BwOp),
+    #(xformers.ops.fmha.flash.FwOp, xformers.ops.fmha.flash.BwOp),
+    # TODO: Triton is not stable: it can trigger Illegal Memory Accesses
+    # and its performance varies a lot between runs.
+    # (xformers.ops.fmha.triton.FwOp, xformers.ops.fmha.triton.BwOp),
+]
+
+
+def product_dict(**kwargs):
+    keys = kwargs.keys()
+    vals = kwargs.values()
+    for instance in itertools.product(*vals):
+        yield dict(zip(keys, instance))
+
+
+CASES = list(
+    product_dict(
+        shape=SHAPES,
+        num_threads=NUM_THREADS,
+        dropout_p=[0.0],
+        attn_bias_cfg=[(type(None), False)],
+        dtype=[torch.half],
+    )
+)
+
+# Add more cases with some variations
+for c in CASES.copy():
+    c = c.copy()
+    c.update(
+        random.Random(str(c["shape"])).choice(
+            [
+                ##{"dropout_p": 0.3},
+                {"attn_bias_cfg": (torch.Tensor, False)},
+                ##{"attn_bias_cfg": (torch.Tensor, True)},
+                ##{"dtype": torch.bfloat16},
+                ##{"dtype": torch.float},
+            ]
+        )
+    )
+    CASES.append(c)
+
+
+def create_tensors(shape, dtype, requires_grad=False):
+    B, M, H, K = shape
+    qkv = torch.rand(
+        [B, M, 3, H, K], device=device, dtype=dtype, requires_grad=requires_grad
+    )
+    q, k, v = xformers.ops.unbind(qkv, 2)
+    return qkv, q, k, v
+
+def mem_eff_attention_fw(shape, num_threads: int, attn_bias_cfg, dropout_p, dtype):
+    B, M, H, K = shape
+    _, q, k, v = create_tensors(shape, dtype)
+    attn_bias_type, attn_bias_requires_grad = attn_bias_cfg
+    if attn_bias_requires_grad:
+        return
+    bias = create_attn_bias(
+        attn_bias_type,
+        batch_size=B,
+        num_heads=H,
+        q_len=M,
+        kv_len=M,
+        device=device,
+        dtype=dtype,
+        bias_requires_grad=attn_bias_requires_grad,
+    )
+    inp = fmha.Inputs(query=q, key=k, value=v, attn_bias=bias, p=dropout_p)
+
+    dtype_str = {
+        torch.bfloat16: "b16",
+        torch.half: "f16",
+        torch.float: "f32",
+    }[dtype]
+    sub_label = (
+        f"{dtype_str} {B}-{M}-{H}-{K}, p={dropout_p}, "
+        f"BiasT={attn_bias_type.__name__}"
+    )
+
+    has_run = False
+    for fw_op, bw_op in OPS:
+        if not fw_op.supports(inp):
+            continue
+
+        yield benchmark.Timer(
+            stmt="fn(q, k, v, attn_bias, p)",
+            globals={
+                "q": q,
+                "k": k,
+                "v": v,
+                "attn_bias": inp.attn_bias,
+                "p": dropout_p,
+                "fn": partial(
+                    xformers.ops.memory_efficient_attention, op=(fw_op, bw_op)
+                ),
+            },
+            label=f"attention (attn_bias={attn_bias_type})",
+            description=fw_op.NAME,
+            sub_label=sub_label,
+            num_threads=num_threads,
+        )
+        has_run = True
+
+    if not has_run:
+        return
+
+    yield benchmark.Timer(
+        stmt="fn(q, k, v, attn_bias, p)",
+        globals={
+            "q": q,
+            "k": k,
+            "v": v,
+            "attn_bias": inp.attn_bias,
+            "p": dropout_p,
+            "fn": ref_attention,
+        },
+        label=f"attention (attn_bias={attn_bias_type})",
+        description="eager",
+        sub_label=sub_label,
+        num_threads=num_threads,
+    )
+
+
+def mem_eff_attention_bw(shape, num_threads: int, attn_bias_cfg, dropout_p, dtype):
+    B, M, H, K = shape
+    _, q, k, v = create_tensors(shape, dtype, requires_grad=True)
+
+    attn_bias_type, attn_bias_requires_grad = attn_bias_cfg
+    bias = create_attn_bias(
+        attn_bias_type,
+        batch_size=B,
+        num_heads=H,
+        q_len=M,
+        kv_len=M,
+        device=device,
+        dtype=dtype,
+        bias_requires_grad=attn_bias_requires_grad,
+    )
+    inp = fmha.Inputs(query=q, key=k, value=v, attn_bias=bias, p=dropout_p)
+
+    dtype_str = {
+        torch.bfloat16: "b16",
+        torch.half: "f16",
+        torch.float: "f32",
+    }[dtype]
+    sub_label = (
+        f"{dtype_str} {B}-{M}-{H}-{K}, p={dropout_p}, "
+        f"BiasT={attn_bias_type.__name__}, BiasGrad={attn_bias_requires_grad}"
+    )
+
+    has_run = False
+    for fw_op, bw_op in OPS:
+        if not fw_op.supports(inp) or not bw_op.supports(inp):
+            continue
+        has_run = True
+        out = xformers.ops.memory_efficient_attention(
+            inp.query, inp.key, inp.value, inp.attn_bias, inp.p, op=(fw_op, bw_op)
+        )
+        grad_benchmark = torch.ones_like(q)
+
+        yield benchmark.Timer(
+            stmt="out.backward(grad, retain_graph=True)",
+            globals={
+                "out": out,
+                "grad": grad_benchmark,
+            },
+            label=f"attention backward (attn_bias={attn_bias_type})",
+            description=bw_op.NAME,
+            sub_label=sub_label,
+            num_threads=num_threads,
+        )
+        del out
+
+    if not has_run:
+        return
+    yield benchmark.Timer(
+        stmt="out.backward(grad, retain_graph=True)",
+        globals={
+            "out": ref_attention(q, k, v, inp.attn_bias, dropout_p),
+            "grad": grad_benchmark,
+        },
+        label=f"attention backward (attn_bias={attn_bias_type})",
+        description="vanilla",
+        sub_label=sub_label,
+        num_threads=num_threads,
+    )
+
+benchmark_main_helper(mem_eff_attention_fw, CASES, min_run_time=min_run_time)
+##benchmark_main_helper(mem_eff_attention_bw, CASES, min_run_time=min_run_time)
diff --git a/xformers/benchmarks/benchmark_mem_eff_attn_decoder_ck_tiled.py b/xformers/benchmarks/benchmark_mem_eff_attn_decoder_ck_tiled.py
new file mode 100644
index 000000000..0aea1b7c4
--- /dev/null
+++ b/xformers/benchmarks/benchmark_mem_eff_attn_decoder_ck_tiled.py
@@ -0,0 +1,206 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
+#
+# This source code is licensed under the BSD license found in the
+# LICENSE file in the root directory of this source tree.
+
+
+import itertools
+from functools import partial
+
+import torch
+from torch.utils import benchmark
+from utils import benchmark_main_helper
+
+import xformers.ops
+import xformers.ops.fmha as fmha
+import xformers.profiler.slow_ops_profiler
+
+torch.backends.cuda.matmul.allow_tf32 = False
+
+# Run with
+#  python xformers/benchmarks/benchmark_mem_eff_attn_decoder.py --omit-baselines --quiet
+# The baselines for these benchmarks are really slow because there is
+# so much padding in the inputs, so there is no point running them.
+
+
+def ref_attention_bmk(q, k, v, attn_bias=None):
+    if isinstance(attn_bias, xformers.ops.AttentionMask):
+        attn_bias = (
+            attn_bias.materialize((q.shape[0], 1, q.shape[1], k.shape[1]))
+            .to(q)
+            .squeeze()
+        )
+    q = q * (1.0 / q.shape[-1] ** 0.5)
+    if attn_bias is None:
+        attn = q @ k.transpose(-2, -1)
+    else:
+        # equivalent to (q @ k.transpose(-2, -1) + m).softmax(-1) @ v
+        # but faster, and is what is used in PyTorch now
+        attn = torch.baddbmm(attn_bias, q, k.transpose(-2, -1))
+    attn = attn.softmax(-1)
+    return attn @ v
+
+
+def ref_attention(q, k, v, attn_bias):
+    assert q.ndim == 4
+
+    def T(t):
+        return t.permute((0, 2, 1, 3)).reshape(
+            [t.shape[0] * t.shape[2], t.shape[1], t.shape[3]]
+        )
+
+    out = ref_attention_bmk(T(q), T(k), T(v), attn_bias)
+    out = out.reshape([q.shape[0], q.shape[2], q.shape[1], v.shape[3]])
+    return out.permute((0, 2, 1, 3))
+
+
+min_run_time = 0.5
+device = torch.device("cuda")
+
+NUM_THREADS = [1] if device.type == "cuda" else [1, 40]
+
+OPS = [
+    xformers.ops.fmha.ck.FwOp,
+    ##xformers.ops.fmha.ck_decoder.FwOp
+]
+
+KV_SHAPES = [
+    # list of n_keys, padding_length, batchsize
+    (2, 64, 3),
+    (32, 1024, 500),  
+    (1000, 1024, 2),
+    (8000, 8192, 1),
+    (240, 256, 32),
+    (2048, 2 * 1024, 4),
+    (4096 * 2, 8 * 1024, 1),
+]
+
+N_HEADS = [8, 16, 64]
+
+
+def product_dict(**kwargs):
+    keys = kwargs.keys()
+    vals = kwargs.values()
+    for instance in itertools.product(*vals):
+        yield dict(zip(keys, instance))
+
+
+CASES = list(
+    product_dict(
+        kv_shape=KV_SHAPES,
+        n_heads=N_HEADS,
+        num_threads=NUM_THREADS,
+        multiquery=[True, False],
+    )
+)
+
+def get_memory_traffic(op, q, k, v, bias):
+    # mem_size = ( batch_size * seq_len * 1 * dim_per_head * 2 (K/V) + 
+    #              batch_size * 1 * num_heads * dim_per_head (Q) + 
+    #              batch_size * seq_len * num_heads * dim_per_head (attn_output) ) * bytes_per_element
+    out = xformers.ops.memory_efficient_attention_forward(q, k, v, bias, op=op)
+    dtype = q.dtype
+    multiquery = k.stride(2) == 0
+    n_heads = q.shape[-2]
+    dim_per_head = q.shape[-1]
+    kv_seqlen = bias.k_seqinfo.seqlen_py
+    bytes_per_element = 4 if dtype is torch.float32 else 2 if dtype in (torch.float16, torch.bfloat16) else None
+    mem_size = 0
+    mem_size += q.numel() * bytes_per_element # Q
+    for s in kv_seqlen: # len(kv_seqlen) == batch_size
+        mem_size += s * (1 if multiquery else n_heads) * dim_per_head * bytes_per_element * 2 # K, V
+    mem_size += out.numel() * bytes_per_element # attn_output
+    return mem_size
+
+def mem_eff_attention_decoder(
+    kv_shape, n_heads: int, num_threads: int, multiquery: bool
+):
+    n_keys, padding, B = kv_shape
+    torch.manual_seed(42)
+    k_seqlen = torch.randint(1, n_keys + 1, (B,)).tolist()
+    K = 128
+    dtype = torch.float16
+    q = torch.rand(1, B, n_heads, K, device=device, dtype=dtype)
+    if multiquery:
+        k = torch.rand(
+            1, B * padding, 1, K, device=device, dtype=dtype
+        ).expand(1, B * padding, n_heads, K)
+        v = torch.rand(
+            1, B * padding, 1, K, device=device, dtype=dtype
+        ).expand(1, B * padding, n_heads, K)
+    else:
+        k = torch.rand(1, B * padding, n_heads, K, device=device, dtype=dtype)
+        v = torch.rand(1, B * padding, n_heads, K, device=device, dtype=dtype)
+
+    bias = fmha.attn_bias.BlockDiagonalMask.from_seqlens(
+        q_seqlen=[1] * B,
+        kv_seqlen=k_seqlen,
+    )
+
+    sub_label = f"{B}batch-{k_seqlen[0]}keys-{n_heads}heads"
+    if multiquery:
+        sub_label += "-mq"
+
+    has_run = False
+
+    for fw_op in OPS:
+        inp = fmha.Inputs(q, k, v, attn_bias=bias)
+        if (skip_reasons := fw_op.not_supported_reasons(inp)):
+            print(f"Skip benchmark: {skip_reasons=}")
+            continue
+
+        fn = partial(xformers.ops.memory_efficient_attention_forward, op=fw_op)
+
+        yield benchmark.Timer(
+            stmt=f"fn(q, k, v, attn_bias)",
+            globals={
+                "q": q,
+                "k": k,
+                "v": v,
+                "attn_bias": bias,
+                "fn": fn,
+            },
+            label="attention",
+            description=fw_op.NAME,
+            sub_label=sub_label,
+            num_threads=num_threads,
+        )
+
+        graph = torch.cuda.CUDAGraph()
+        with torch.cuda.graph(graph):
+            fn(q, k, v, bias)
+        yield benchmark.Timer(
+            stmt="graph.replay()",
+            globals={
+                "graph": graph,
+            },
+            label="cuda graphed attention",
+            description=fw_op.NAME,
+            sub_label=sub_label,
+            num_threads=num_threads,
+        )
+
+        has_run = True
+
+    if not has_run:
+        return
+
+    RUN_BASELINES = False
+    if RUN_BASELINES:
+        yield benchmark.Timer(
+            stmt="fn(q, k, v, attn_bias)",
+            globals={
+                "q": q,
+                "k": k,
+                "v": v,
+                "attn_bias": bias,
+                "fn": ref_attention,
+            },
+            label="attention",
+            description="eager",
+            sub_label=sub_label,
+            num_threads=num_threads,
+        )
+
+
+benchmark_main_helper(mem_eff_attention_decoder, CASES, min_run_time=min_run_time)

From 95aed6da9f6f8c81e760ce7fa6790583c5b146c7 Mon Sep 17 00:00:00 2001
From: Qianfeng Zhang <Qianfeng.Zhang@amd.com>
Date: Thu, 30 Nov 2023 17:20:35 +0000
Subject: [PATCH 248/641] Update to benchmark_mem_eff_attention_ck_tiled.py

---
 xformers/benchmarks/benchmark_mem_eff_attention_ck_tiled.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/xformers/benchmarks/benchmark_mem_eff_attention_ck_tiled.py b/xformers/benchmarks/benchmark_mem_eff_attention_ck_tiled.py
index a008bc222..e9381e88a 100644
--- a/xformers/benchmarks/benchmark_mem_eff_attention_ck_tiled.py
+++ b/xformers/benchmarks/benchmark_mem_eff_attention_ck_tiled.py
@@ -83,14 +83,14 @@ def T(t):
     # ViT
     ##(384, 197, 1, 88),
     ##(384, 197, 1, 80),
-    ##(384, 197, 1, 64),
+    (384, 197, 1, 64),
     ##(1024, 197, 1, 88),
     ##(1024, 197, 1, 80),
     (1024, 197, 1, 64),
     # ViT-Huge
     ##(32 * 16, 197, 1, 80),
     ##(32, 197, 16, 80),
-    ##(32, 197, 16, 64),
+    (32, 197, 16, 64),
     (32, 197, 16, 128),
     # ViT-Giant
     ##(16 * 16, 197, 1, 88),

From 25dbca9da6b2e22e239689634e7c01377bea3664 Mon Sep 17 00:00:00 2001
From: Qianfeng Zhang <Qianfeng.Zhang@amd.com>
Date: Fri, 1 Dec 2023 23:54:54 +0000
Subject: [PATCH 249/641] Synchronize with latest feature update from
 feature/fmah-pad-support branch

---
 .gitmodules                                   |   1 +
 third_party/composable_kernel_tiled           |   2 +-
 .../attention_forward_generic_ck_tiled.cpp    | 732 +++++++++---------
 .../hip_fmha/ck_tiled_fmha_batched_infer.h    | 186 +++--
 .../ck_tiled_fmha_batched_infer_fp16.cpp      |  63 +-
 .../hip_fmha/ck_tiled_fmha_definitions.h      |  31 +
 .../hip_fmha/ck_tiled_fmha_forward_kernel.h   | 521 ++++++++-----
 .../hip_fmha/ck_tiled_fmha_grouped_infer.h    | 108 ++-
 .../ck_tiled_fmha_grouped_infer_fp16.cpp      |  63 +-
 ...led_fmha_batched_infer_fp16_masktype_0.cpp |   7 -
 ...ched_infer_fp16_masktype_0_no_attnbias.cpp |   7 +
 ...ed_infer_fp16_masktype_0_with_attnbias.cpp |   7 +
 ...led_fmha_batched_infer_fp16_masktype_1.cpp |   7 -
 ...ched_infer_fp16_masktype_1_no_attnbias.cpp |   7 +
 ...ed_infer_fp16_masktype_1_with_attnbias.cpp |   7 +
 ...led_fmha_batched_infer_fp16_masktype_2.cpp |   7 -
 ...ched_infer_fp16_masktype_2_no_attnbias.cpp |   7 +
 ...ed_infer_fp16_masktype_2_with_attnbias.cpp |   7 +
 ...led_fmha_grouped_infer_fp16_masktype_0.cpp |   7 -
 ...uped_infer_fp16_masktype_0_no_attnbias.cpp |   7 +
 ...ed_infer_fp16_masktype_0_with_attnbias.cpp |   7 +
 ...led_fmha_grouped_infer_fp16_masktype_1.cpp |   7 -
 ...uped_infer_fp16_masktype_1_no_attnbias.cpp |   7 +
 ...ed_infer_fp16_masktype_1_with_attnbias.cpp |   7 +
 ...led_fmha_grouped_infer_fp16_masktype_2.cpp |   7 -
 ...uped_infer_fp16_masktype_2_no_attnbias.cpp |   7 +
 ...ed_infer_fp16_masktype_2_with_attnbias.cpp |   7 +
 27 files changed, 1070 insertions(+), 763 deletions(-)
 create mode 100644 xformers/csrc/attention/hip_fmha/ck_tiled_fmha_definitions.h
 delete mode 100644 xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_fp16_masktype_0.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_fp16_masktype_0_no_attnbias.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_fp16_masktype_0_with_attnbias.cpp
 delete mode 100644 xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_fp16_masktype_1.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_fp16_masktype_1_no_attnbias.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_fp16_masktype_1_with_attnbias.cpp
 delete mode 100644 xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_fp16_masktype_2.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_fp16_masktype_2_no_attnbias.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_fp16_masktype_2_with_attnbias.cpp
 delete mode 100644 xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_fp16_masktype_0.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_fp16_masktype_0_no_attnbias.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_fp16_masktype_0_with_attnbias.cpp
 delete mode 100644 xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_fp16_masktype_1.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_fp16_masktype_1_no_attnbias.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_fp16_masktype_1_with_attnbias.cpp
 delete mode 100644 xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_fp16_masktype_2.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_fp16_masktype_2_no_attnbias.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_fp16_masktype_2_with_attnbias.cpp

diff --git a/.gitmodules b/.gitmodules
index bbbf0f197..bf2678053 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -11,3 +11,4 @@
 [submodule "third_party/composable_kernel_tiled"]
 	path = third_party/composable_kernel_tiled
 	url = https://github.com/asroy/ck_tile
+        branch = feature/fmha-pad-support
diff --git a/third_party/composable_kernel_tiled b/third_party/composable_kernel_tiled
index bcd11b388..08d9e56f2 160000
--- a/third_party/composable_kernel_tiled
+++ b/third_party/composable_kernel_tiled
@@ -1 +1 @@
-Subproject commit bcd11b3880733d3a5603b04ff8f5e1fa5876293f
+Subproject commit 08d9e56f2e321016934fb0c44673af4c0754171f
diff --git a/xformers/csrc/attention/hip_fmha/attention_forward_generic_ck_tiled.cpp b/xformers/csrc/attention/hip_fmha/attention_forward_generic_ck_tiled.cpp
index 8961bb4ea..0c87daa97 100644
--- a/xformers/csrc/attention/hip_fmha/attention_forward_generic_ck_tiled.cpp
+++ b/xformers/csrc/attention/hip_fmha/attention_forward_generic_ck_tiled.cpp
@@ -44,11 +44,10 @@ namespace {
   (Mode BMHK) With all the heads having the same seqlen
   (Mode 1MHK) `batch=1` with all tokens across batches concatenated
 */
-std::tuple<at::Tensor, at::Tensor, int64_t, int64_t>
-efficient_attention_forward_ck(
-    const at::Tensor& query, // [b, seqlen, num_heads_q, K]
-    const at::Tensor& key, // [b, seqlen, num_heads_kv, K]
-    const at::Tensor& value, // [b, seqlen, num_heads_kv, Kv]
+std::tuple<at::Tensor, at::Tensor, int64_t, int64_t> efficient_attention_forward_ck(
+    const at::Tensor& query,               // [b, seqlen, num_heads_q, K]
+    const at::Tensor& key,                 // [b, seqlen, num_heads_kv, K]
+    const at::Tensor& value,               // [b, seqlen, num_heads_kv, Kv]
     const c10::optional<at::Tensor>& bias, // [b, num_heads_q, seqlen, seqlen]
     // (Mode 1MHK only) [b+1]: cu_seqlens_q[b] contains the
     // position of the first query token for batch $b
@@ -62,372 +61,381 @@ efficient_attention_forward_ck(
     bool compute_logsumexp,
     int64_t custom_mask_type,
     c10::optional<double> scale,
-    const c10::optional<at::Tensor>& seqlen_k) {
-  TORCH_CHECK(query.dim() == 4);
-  TORCH_CHECK(key.dim() == 4);
-  TORCH_CHECK(value.dim() == 4);
-
-  // Batch sizes
-  TORCH_CHECK(query.size(0) == key.size(0));
-  TORCH_CHECK(query.size(0) == value.size(0));
-
-  // Sequence length
-  TORCH_CHECK(key.size(1) == value.size(1));
-
-  // Num heads
-  TORCH_CHECK(query.size(2) % key.size(2) == 0);
-  TORCH_CHECK(key.size(2) == value.size(2));
-
-  // Embedding per head
-  TORCH_CHECK(query.size(3) == key.size(3));
-
-  TORCH_CHECK(query.scalar_type() == key.scalar_type());
-  TORCH_CHECK(query.scalar_type() == value.scalar_type());
-
-  TORCH_CHECK(seqstart_q.has_value() == seqstart_k.has_value());
-  if (seqstart_q.has_value()) {
-    TORCH_CHECK(seqstart_q->scalar_type() == at::ScalarType::Int);
-    TORCH_CHECK(seqstart_k->scalar_type() == at::ScalarType::Int);
-    TORCH_CHECK(seqstart_q->dim() == 1 && seqstart_k->dim() == 1);
-    CHECK_NOSPARSE_CONTIGUOUS_CPU((*seqstart_q));
-    CHECK_NOSPARSE_CONTIGUOUS_CPU((*seqstart_k));
-    TORCH_CHECK(seqstart_q->size(0) == seqstart_k->size(0));
-    TORCH_CHECK(query.size(0) == 1, "cu_seqlen only supports batch_size=1");
-    TORCH_CHECK(max_seqlen_q_.has_value());
-  };
-
-  // last dim is contiguous, device is kCUDA
-  CHECK_NOSPARSE_LASTCONTIGUOUS_CUDA(query);
-  CHECK_NOSPARSE_LASTCONTIGUOUS_CUDA(key);
-  CHECK_NOSPARSE_LASTCONTIGUOUS_CUDA(value);
-
-  // at::cuda::CUDAGuard device_guard(query.device());
-  hipStream_t stream = at::cuda::getCurrentHIPStream().stream();
-
-  int64_t B = query.size(0);
-  int64_t M = query.size(1);
-  int64_t N = key.size(1);
-  int64_t Hq = query.size(-2);
-  int64_t Hkv = key.size(-2);
-  int64_t K = query.size(-1);
-  int64_t Kv = value.size(-1);
-
-  auto opts = query.options();
-
-  at::Tensor logsumexp;
-
-  at::Tensor out = at::empty({B, M, Hq, Kv}, opts);
-
-  const bool use_dropout = std::fpclassify(dropout_p) != FP_ZERO;
-  int64_t philox_seed;
-  int64_t philox_offset;
-
-  if (use_dropout) {
-    /*
-    at::PhiloxCudaState rng_engine_inputs;
-    at::CUDAGeneratorImpl* gen =
-        at::get_generator_or_default<at::CUDAGeneratorImpl>(
-            c10::nullopt, at::cuda::detail::getDefaultCUDAGenerator());
-
-    std::lock_guard<std::mutex> lock(gen->mutex_);
-    // if using dropout, we produce 1 random number for each element of the
-    // attention tensor
-    rng_engine_inputs = gen->philox_cuda_state(B * Hq * M * N);
-
-    const auto seeds = at::cuda::philox::unpack(rng_engine_inputs);
-
-    philox_seed = std::get<0>(seeds);
-    philox_offset = std::get<1>(seeds);
-    */
-    throw std::runtime_error(
-        "drop-out is currently not implemented by ck-tiled!");
-  }
-
-  auto set_batched_forward_params = [&](BatchedForwardParams& p) {
-    p.B = B;
-    p.M = M;
-    p.N = N;
-    p.Hq = Hq;
-    p.Hkv = Hkv;
-    p.K = K;
-    p.Kv = Kv;
-
-    if (scale.has_value()) {
-      p.scale = float(*scale);
-    } else {
-      p.scale = float(1.0 / std::sqrt(float(K)));
-    }
+    const c10::optional<at::Tensor>& seqlen_k)
+{
+    TORCH_CHECK(query.dim() == 4);
+    TORCH_CHECK(key.dim() == 4);
+    TORCH_CHECK(value.dim() == 4);
+
+    // Batch sizes
+    TORCH_CHECK(query.size(0) == key.size(0));
+    TORCH_CHECK(query.size(0) == value.size(0));
+
+    // Sequence length
+    TORCH_CHECK(key.size(1) == value.size(1));
+
+    // Num heads
+    TORCH_CHECK(query.size(2) % key.size(2) == 0);
+    TORCH_CHECK(key.size(2) == value.size(2));
+
+    // Embedding per head
+    TORCH_CHECK(query.size(3) == key.size(3));
+
+    TORCH_CHECK(query.scalar_type() == key.scalar_type());
+    TORCH_CHECK(query.scalar_type() == value.scalar_type());
+
+    TORCH_CHECK(seqstart_q.has_value() == seqstart_k.has_value());
+    if(seqstart_q.has_value())
+    {
+        TORCH_CHECK(seqstart_q->scalar_type() == at::ScalarType::Int);
+        TORCH_CHECK(seqstart_k->scalar_type() == at::ScalarType::Int);
+        TORCH_CHECK(seqstart_q->dim() == 1 && seqstart_k->dim() == 1);
+        CHECK_NOSPARSE_CONTIGUOUS_CPU((*seqstart_q));
+        CHECK_NOSPARSE_CONTIGUOUS_CPU((*seqstart_k));
+        TORCH_CHECK(seqstart_q->size(0) == seqstart_k->size(0));
+        TORCH_CHECK(query.size(0) == 1, "cu_seqlen only supports batch_size=1");
+        TORCH_CHECK(max_seqlen_q_.has_value());
+    };
+
+    // last dim is contiguous, device is kCUDA
+    CHECK_NOSPARSE_LASTCONTIGUOUS_CUDA(query);
+    CHECK_NOSPARSE_LASTCONTIGUOUS_CUDA(key);
+    CHECK_NOSPARSE_LASTCONTIGUOUS_CUDA(value);
+
+    // at::cuda::CUDAGuard device_guard(query.device());
+    hipStream_t stream = at::cuda::getCurrentHIPStream().stream();
+
+    int64_t B   = query.size(0);
+    int64_t M   = query.size(1);
+    int64_t N   = key.size(1);
+    int64_t Hq  = query.size(-2);
+    int64_t Hkv = key.size(-2);
+    int64_t K   = query.size(-1);
+    int64_t Kv  = value.size(-1);
+
+    auto opts = query.options();
+
+    at::Tensor logsumexp;
+
+    at::Tensor out = at::empty({B, M, Hq, Kv}, opts);
 
-    p.q_ptr = query.data_ptr();
-    p.k_ptr = key.data_ptr();
-    p.v_ptr = value.data_ptr();
-    p.out_ptr = out.data_ptr();
-
-    p.q_strides = {
-        static_cast<int>(query.stride(0)),
-        static_cast<int>(query.stride(1)),
-        static_cast<int>(query.stride(2)),
-        static_cast<int>(query.stride(3))};
-    p.k_strides = {
-        static_cast<int>(key.stride(0)),
-        static_cast<int>(key.stride(1)),
-        static_cast<int>(key.stride(2)),
-        static_cast<int>(key.stride(3))};
-    p.v_strides = {
-        static_cast<int>(value.stride(0)),
-        static_cast<int>(value.stride(1)),
-        static_cast<int>(value.stride(2)),
-        static_cast<int>(value.stride(3))};
-    p.out_strides = {
-        static_cast<int>(out.stride(0)),
-        static_cast<int>(out.stride(1)),
-        static_cast<int>(out.stride(2)),
-        static_cast<int>(out.stride(3))};
-
-    if (bias.has_value()) {
-      CHECK_NOSPARSE_LASTCONTIGUOUS_CUDA((*bias));
-      TORCH_CHECK(bias->scalar_type() == query.scalar_type());
-
-      p.has_attn_bias = true;
-      p.attn_bias_ptr = bias->data_ptr();
-
-      const at::Tensor bias_4d_view = get_bias_4d_view(*bias, B, Hq, M, N);
-      p.attn_bias_strides = {
-          static_cast<int>(bias_4d_view.stride(0)),
-          static_cast<int>(bias_4d_view.stride(1)),
-          static_cast<int>(bias_4d_view.stride(2)),
-          static_cast<int>(bias_4d_view.stride(3))};
-    } else
-      p.has_attn_bias = false;
-
-    p.custom_mask_type = custom_mask_type;
-
-    if (p.custom_mask_type != 0)
-      throw std::runtime_error(
-          "causal mask-type is currently not supported by ck-tiled!");
-
-    p.use_dropout = use_dropout;
-    p.philox_seed = philox_seed;
-    p.philox_offset = philox_offset;
-    p.compute_logsumexp = compute_logsumexp;
-
-    // the following parameters are only used by training forward
-    if (p.use_dropout) {
-      // p.dropout_prob = static_cast<float>(dropout_p);
-      throw std::runtime_error(
-          "drop-out is currently not implemented by ck-tiled!");
-    } else
-      p.dropout_prob = 0.0f;
-
-    if (p.compute_logsumexp) {
-      /*
-      logsumexp = at::empty({B, Hq, M}, opts.dtype(at::kFloat));
-      p.logsumexp_ptr = logsumexp.data_ptr();
-      */
-      throw std::runtime_error(
-          "compute logsumexp is currently not implemented by ck-tiled!");
-    } else
-      p.logsumexp_ptr = nullptr;
-  };
-
-  auto set_grouped_forward_params = [&](GroupedForwardParams& p) {
-    p.num_batches = seqstart_q->size(0) - 1;
-    p.M = M;
-    p.N = N;
-    p.Hq = Hq;
-    p.Hkv = Hkv;
-    p.K = K;
-    p.Kv = Kv;
-
-    if (scale.has_value()) {
-      p.scale = float(*scale);
-    } else {
-      p.scale = float(1.0 / std::sqrt(float(K)));
+    const bool use_dropout = std::fpclassify(dropout_p) != FP_ZERO;
+    int64_t philox_seed;
+    int64_t philox_offset;
+
+    if(use_dropout)
+    {
+        /*
+        at::PhiloxCudaState rng_engine_inputs;
+        at::CUDAGeneratorImpl* gen =
+            at::get_generator_or_default<at::CUDAGeneratorImpl>(
+                c10::nullopt, at::cuda::detail::getDefaultCUDAGenerator());
+
+        std::lock_guard<std::mutex> lock(gen->mutex_);
+        // if using dropout, we produce 1 random number for each element of the
+        // attention tensor
+        rng_engine_inputs = gen->philox_cuda_state(B * Hq * M * N);
+
+        const auto seeds = at::cuda::philox::unpack(rng_engine_inputs);
+
+        philox_seed = std::get<0>(seeds);
+        philox_offset = std::get<1>(seeds);
+        */
+        throw std::runtime_error("drop-out is currently not implemented by ck-tiled!");
     }
 
-    p.q_ptr = query.data_ptr();
-    p.k_ptr = key.data_ptr();
-    p.v_ptr = value.data_ptr();
-    p.out_ptr = out.data_ptr();
-
-    p.q_strides = {
-        static_cast<int>(query.stride(1)),
-        static_cast<int>(query.stride(2)),
-        static_cast<int>(query.stride(3))};
-    p.k_strides = {
-        static_cast<int>(key.stride(1)),
-        static_cast<int>(key.stride(2)),
-        static_cast<int>(key.stride(3))};
-    p.v_strides = {
-        static_cast<int>(value.stride(1)),
-        static_cast<int>(value.stride(2)),
-        static_cast<int>(value.stride(3))};
-    p.out_strides = {
-        static_cast<int>(out.stride(1)),
-        static_cast<int>(out.stride(2)),
-        static_cast<int>(out.stride(3))};
-
-    if (bias.has_value()) {
-      CHECK_NOSPARSE_LASTCONTIGUOUS_CUDA((*bias));
-      TORCH_CHECK(bias->scalar_type() == query.scalar_type());
-
-      p.has_attn_bias = true;
-      p.attn_bias_ptr = bias->data_ptr();
-
-      const at::Tensor bias_4d_view = get_bias_4d_view(*bias, B, Hq, M, N);
-      p.attn_bias_strides = {
-          static_cast<int>(bias_4d_view.stride(0)),
-          static_cast<int>(bias_4d_view.stride(1)),
-          static_cast<int>(bias_4d_view.stride(2)),
-          static_cast<int>(bias_4d_view.stride(3))};
-    } else
-      p.has_attn_bias = false;
-
-    p.custom_mask_type = custom_mask_type;
-
-    if (p.custom_mask_type != 0)
-      throw std::runtime_error(
-          "causal mask-type is currently not supported by ck-tiled!");
-
-    // max_seqlen_q is used to create logsumexp tensor
-    p.max_seqlen_q = *max_seqlen_q_;
-
-    at::Tensor dev_seqstart_q =
-        at::empty({p.num_batches + 1}, opts.dtype(at::kInt));
-    at::Tensor dev_seqstart_k =
-        at::empty({p.num_batches + 1}, opts.dtype(at::kInt));
-    at::Tensor dev_seqlen_k;
-
-    p.seqstart_q_dev_ptr = dev_seqstart_q.data_ptr();
-    HIP_CALL_CHECK(hipMemcpyAsync(
-        p.seqstart_q_dev_ptr,
-        seqstart_q->data_ptr(),
-        (p.num_batches + 1) * sizeof(int),
-        hipMemcpyHostToDevice,
-        stream));
-
-    p.seqstart_k_dev_ptr = dev_seqstart_k.data_ptr();
-    HIP_CALL_CHECK(hipMemcpyAsync(
-        p.seqstart_k_dev_ptr,
-        seqstart_k->data_ptr(),
-        (p.num_batches + 1) * sizeof(int),
-        hipMemcpyHostToDevice,
-        stream));
-
-    if (seqlen_k.has_value()) {
-      TORCH_CHECK(seqlen_k->scalar_type() == at::ScalarType::Int);
-      TORCH_CHECK(seqlen_k->dim() == 1);
-      TORCH_CHECK(seqlen_k->size(0) == p.num_batches)
-      CHECK_NOSPARSE_CONTIGUOUS_CPU((*seqlen_k));
-
-      dev_seqlen_k = at::empty({p.num_batches}, opts.dtype(at::kInt));
-
-      p.seqlen_k_dev_ptr = dev_seqlen_k.data_ptr();
-
-      HIP_CALL_CHECK(hipMemcpyAsync(
-          p.seqlen_k_dev_ptr,
-          seqstart_k->data_ptr(),
-          p.num_batches * sizeof(int),
-          hipMemcpyHostToDevice,
-          stream));
-    } else
-      p.seqlen_k_dev_ptr = nullptr;
-
-    p.use_dropout = use_dropout;
-    p.philox_seed = philox_seed;
-    p.philox_offset = philox_offset;
-    p.compute_logsumexp = compute_logsumexp;
-
-    // the following parameters are only used by training forward
-    if (p.use_dropout) {
-      // p.dropout_prob = static_cast<float>(dropout_p);
-      throw std::runtime_error(
-          "drop-out is currently not implemented by ck-tiled!");
-    } else
-      p.dropout_prob = 0.0f;
-
-    if (p.compute_logsumexp) {
-      /*
-      logsumexp = at::empty(
-          {p.num_batches, Hq, p.max_seqlen_q}, opts.dtype(at::kFloat));
-      char* logsumexp_ptr = reinterpret_cast<char*>(logsumexp.data_ptr());
-
-      for (int i = 0; i < p.num_batches; i++) {
-        size_t tmp_logsumexp_offset = get_size_in_bytes(
-            static_cast<size_t>(i) * Hq * p.max_seqlen_q,
-            logsumexp.scalar_type());
-        p.logsumexp_ptrs.push_back(
-            reinterpret_cast<void*>(&logsumexp_ptr[tmp_logsumexp_offset]));
-      };
-      */
-      throw std::runtime_error(
-          "compute logsumexp is currently not implemented by ck-tiled!");
+    auto set_batched_forward_params = [&](BatchedForwardParams& p) {
+        p.B   = B;
+        p.M   = M;
+        p.N   = N;
+        p.Hq  = Hq;
+        p.Hkv = Hkv;
+        p.K   = K;
+        p.Kv  = Kv;
+
+        if(scale.has_value())
+        {
+            p.scale = float(*scale);
+        }
+        else
+        {
+            p.scale = float(1.0 / std::sqrt(float(K)));
+        }
+
+        p.q_ptr   = query.data_ptr();
+        p.k_ptr   = key.data_ptr();
+        p.v_ptr   = value.data_ptr();
+        p.out_ptr = out.data_ptr();
+
+        p.q_strides   = {static_cast<int>(query.stride(0)),
+                       static_cast<int>(query.stride(1)),
+                       static_cast<int>(query.stride(2)),
+                       static_cast<int>(query.stride(3))};
+        p.k_strides   = {static_cast<int>(key.stride(0)),
+                       static_cast<int>(key.stride(1)),
+                       static_cast<int>(key.stride(2)),
+                       static_cast<int>(key.stride(3))};
+        p.v_strides   = {static_cast<int>(value.stride(0)),
+                       static_cast<int>(value.stride(1)),
+                       static_cast<int>(value.stride(2)),
+                       static_cast<int>(value.stride(3))};
+        p.out_strides = {static_cast<int>(out.stride(0)),
+                         static_cast<int>(out.stride(1)),
+                         static_cast<int>(out.stride(2)),
+                         static_cast<int>(out.stride(3))};
+
+        if(bias.has_value())
+        {
+            CHECK_NOSPARSE_LASTCONTIGUOUS_CUDA((*bias));
+            TORCH_CHECK(bias->scalar_type() == query.scalar_type());
+
+            p.has_attn_bias = true;
+            p.attn_bias_ptr = bias->data_ptr();
+
+            const at::Tensor bias_4d_view = get_bias_4d_view(*bias, B, Hq, M, N);
+            p.attn_bias_strides           = {static_cast<int>(bias_4d_view.stride(0)),
+                                   static_cast<int>(bias_4d_view.stride(1)),
+                                   static_cast<int>(bias_4d_view.stride(2)),
+                                   static_cast<int>(bias_4d_view.stride(3))};
+        }
+        else
+            p.has_attn_bias = false;
+
+        p.custom_mask_type = custom_mask_type;
+
+        p.use_dropout       = use_dropout;
+        p.philox_seed       = philox_seed;
+        p.philox_offset     = philox_offset;
+        p.compute_logsumexp = compute_logsumexp;
+
+        // the following parameters are only used by training forward
+        if(p.use_dropout)
+        {
+            // p.dropout_prob = static_cast<float>(dropout_p);
+            throw std::runtime_error("drop-out is currently not implemented by ck-tiled!");
+        }
+        else
+            p.dropout_prob = 0.0f;
+
+        if(p.compute_logsumexp)
+        {
+            /*
+            logsumexp = at::empty({B, Hq, M}, opts.dtype(at::kFloat));
+            p.logsumexp_ptr = logsumexp.data_ptr();
+            */
+            throw std::runtime_error("compute logsumexp is currently not implemented by ck-tiled!");
+        }
+        else
+            p.logsumexp_ptr = nullptr;
     };
-  };
-
-  auto inDataType = query.scalar_type();
-
-  if (!seqstart_q.has_value()) { // input is batched
-    BatchedForwardParams batched_forward_params;
-
-    set_batched_forward_params(batched_forward_params);
-
-    if (!batched_forward_params.use_dropout &&
-        !batched_forward_params.compute_logsumexp) {
-      if (inDataType == at::ScalarType::Half) {
-        batched_infer_fp16(batched_forward_params, stream);
-      } else if (inDataType == at::ScalarType::BFloat16) {
-        // batched_infer_bp16(batched_forward_params, stream);
-        throw std::runtime_error("input data-type is not supported!");
-      } else
-        throw std::runtime_error("input data-type is not supported!");
-    } else {
-      /*
-      if (inDataType == at::ScalarType::Half) {
-        batched_forward_fp16(batched_forward_params, stream);
-      } else if (inDataType == at::ScalarType::BFloat16) {
-        batched_forward_bp16(batched_forward_params, stream);
-      } else
-        throw std::runtime_error("input data-type is not supported!");
-      */
-      throw std::runtime_error(
-          "drop-out and compuate logsumexp currently not implemented by ck-tiled!");
+
+    auto set_grouped_forward_params = [&](GroupedForwardParams& p) {
+        p.num_batches = seqstart_q->size(0) - 1;
+        p.M           = M;
+        p.N           = N;
+        p.Hq          = Hq;
+        p.Hkv         = Hkv;
+        p.K           = K;
+        p.Kv          = Kv;
+
+        if(scale.has_value())
+        {
+            p.scale = float(*scale);
+        }
+        else
+        {
+            p.scale = float(1.0 / std::sqrt(float(K)));
+        }
+
+        p.q_ptr   = query.data_ptr();
+        p.k_ptr   = key.data_ptr();
+        p.v_ptr   = value.data_ptr();
+        p.out_ptr = out.data_ptr();
+
+        p.q_strides   = {static_cast<int>(query.stride(1)),
+                       static_cast<int>(query.stride(2)),
+                       static_cast<int>(query.stride(3))};
+        p.k_strides   = {static_cast<int>(key.stride(1)),
+                       static_cast<int>(key.stride(2)),
+                       static_cast<int>(key.stride(3))};
+        p.v_strides   = {static_cast<int>(value.stride(1)),
+                       static_cast<int>(value.stride(2)),
+                       static_cast<int>(value.stride(3))};
+        p.out_strides = {static_cast<int>(out.stride(1)),
+                         static_cast<int>(out.stride(2)),
+                         static_cast<int>(out.stride(3))};
+
+        if(bias.has_value())
+        {
+            CHECK_NOSPARSE_LASTCONTIGUOUS_CUDA((*bias));
+            TORCH_CHECK(bias->scalar_type() == query.scalar_type());
+
+            p.has_attn_bias = true;
+            p.attn_bias_ptr = bias->data_ptr();
+
+            const at::Tensor bias_4d_view = get_bias_4d_view(*bias, B, Hq, M, N);
+            p.attn_bias_strides           = {static_cast<int>(bias_4d_view.stride(0)),
+                                   static_cast<int>(bias_4d_view.stride(1)),
+                                   static_cast<int>(bias_4d_view.stride(2)),
+                                   static_cast<int>(bias_4d_view.stride(3))};
+        }
+        else
+            p.has_attn_bias = false;
+
+        p.custom_mask_type = custom_mask_type;
+
+        // max_seqlen_q is used to create logsumexp tensor
+        p.max_seqlen_q = *max_seqlen_q_;
+
+        at::Tensor dev_seqstart_q = at::empty({p.num_batches + 1}, opts.dtype(at::kInt));
+        at::Tensor dev_seqstart_k = at::empty({p.num_batches + 1}, opts.dtype(at::kInt));
+        at::Tensor dev_seqlen_k;
+
+        p.seqstart_q_dev_ptr = dev_seqstart_q.data_ptr();
+        HIP_CALL_CHECK(hipMemcpyAsync(p.seqstart_q_dev_ptr,
+                                      seqstart_q->data_ptr(),
+                                      (p.num_batches + 1) * sizeof(int),
+                                      hipMemcpyHostToDevice,
+                                      stream));
+
+        p.seqstart_k_dev_ptr = dev_seqstart_k.data_ptr();
+        HIP_CALL_CHECK(hipMemcpyAsync(p.seqstart_k_dev_ptr,
+                                      seqstart_k->data_ptr(),
+                                      (p.num_batches + 1) * sizeof(int),
+                                      hipMemcpyHostToDevice,
+                                      stream));
+
+        if(seqlen_k.has_value())
+        {
+            TORCH_CHECK(seqlen_k->scalar_type() == at::ScalarType::Int);
+            TORCH_CHECK(seqlen_k->dim() == 1);
+            TORCH_CHECK(seqlen_k->size(0) == p.num_batches)
+            CHECK_NOSPARSE_CONTIGUOUS_CPU((*seqlen_k));
+
+            dev_seqlen_k = at::empty({p.num_batches}, opts.dtype(at::kInt));
+
+            p.seqlen_k_dev_ptr = dev_seqlen_k.data_ptr();
+
+            HIP_CALL_CHECK(hipMemcpyAsync(p.seqlen_k_dev_ptr,
+                                          seqstart_k->data_ptr(),
+                                          p.num_batches * sizeof(int),
+                                          hipMemcpyHostToDevice,
+                                          stream));
+        }
+        else
+            p.seqlen_k_dev_ptr = nullptr;
+
+        p.use_dropout       = use_dropout;
+        p.philox_seed       = philox_seed;
+        p.philox_offset     = philox_offset;
+        p.compute_logsumexp = compute_logsumexp;
+
+        // the following parameters are only used by training forward
+        if(p.use_dropout)
+        {
+            // p.dropout_prob = static_cast<float>(dropout_p);
+            throw std::runtime_error("drop-out is currently not implemented by ck-tiled!");
+        }
+        else
+            p.dropout_prob = 0.0f;
+
+        if(p.compute_logsumexp)
+        {
+            /*
+            logsumexp = at::empty(
+                {p.num_batches, Hq, p.max_seqlen_q}, opts.dtype(at::kFloat));
+            char* logsumexp_ptr = reinterpret_cast<char*>(logsumexp.data_ptr());
+
+            for (int i = 0; i < p.num_batches; i++) {
+              size_t tmp_logsumexp_offset = get_size_in_bytes(
+                  static_cast<size_t>(i) * Hq * p.max_seqlen_q,
+                  logsumexp.scalar_type());
+              p.logsumexp_ptrs.push_back(
+                  reinterpret_cast<void*>(&logsumexp_ptr[tmp_logsumexp_offset]));
+            };
+            */
+            throw std::runtime_error("compute logsumexp is currently not implemented by ck-tiled!");
+        };
     };
-  } else { // input is grouped
-    GroupedForwardParams grouped_forward_params;
-
-    set_grouped_forward_params(grouped_forward_params);
-
-    if (!grouped_forward_params.use_dropout &&
-        !grouped_forward_params.compute_logsumexp) {
-      if (inDataType == at::ScalarType::Half) {
-        grouped_infer_fp16(grouped_forward_params, stream);
-      } else if (inDataType == at::ScalarType::BFloat16) {
-        // grouped_infer_bp16(grouped_forward_params, stream);
-        throw std::runtime_error("input data-type is not supported!");
-      } else
-        throw std::runtime_error("input data-type is not supported!");
-    } else {
-      /*
-      if (inDataType == at::ScalarType::Half) {
-        grouped_forward_fp16(grouped_forward_params, stream);
-      } else if (inDataType == at::ScalarType::BFloat16) {
-        grouped_forward_bp16(grouped_forward_params, stream);
-      } else
-        throw std::runtime_error("input data-type is not supported!");
-      */
-      throw std::runtime_error(
-          "drop-out and compuate logsumexp currently not implemented by ck-tiled!");
+
+    auto inDataType = query.scalar_type();
+
+    if(!seqstart_q.has_value())
+    { // input is batched
+        BatchedForwardParams batched_forward_params;
+
+        set_batched_forward_params(batched_forward_params);
+
+        if(!batched_forward_params.use_dropout && !batched_forward_params.compute_logsumexp)
+        {
+            if(inDataType == at::ScalarType::Half)
+            {
+                batched_infer_fp16(batched_forward_params, stream);
+            }
+            else if(inDataType == at::ScalarType::BFloat16)
+            {
+                // batched_infer_bp16(batched_forward_params, stream);
+                throw std::runtime_error("input data-type is not supported!");
+            }
+            else
+                throw std::runtime_error("input data-type is not supported!");
+        }
+        else
+        {
+            /*
+            if (inDataType == at::ScalarType::Half) {
+              batched_forward_fp16(batched_forward_params, stream);
+            } else if (inDataType == at::ScalarType::BFloat16) {
+              batched_forward_bp16(batched_forward_params, stream);
+            } else
+              throw std::runtime_error("input data-type is not supported!");
+            */
+            throw std::runtime_error(
+                "drop-out and compuate logsumexp currently not implemented by ck-tiled!");
+        };
+    }
+    else
+    { // input is grouped
+        GroupedForwardParams grouped_forward_params;
+
+        set_grouped_forward_params(grouped_forward_params);
+
+        if(!grouped_forward_params.use_dropout && !grouped_forward_params.compute_logsumexp)
+        {
+            if(inDataType == at::ScalarType::Half)
+            {
+                grouped_infer_fp16(grouped_forward_params, stream);
+            }
+            else if(inDataType == at::ScalarType::BFloat16)
+            {
+                // grouped_infer_bp16(grouped_forward_params, stream);
+                throw std::runtime_error("input data-type is not supported!");
+            }
+            else
+                throw std::runtime_error("input data-type is not supported!");
+        }
+        else
+        {
+            /*
+            if (inDataType == at::ScalarType::Half) {
+              grouped_forward_fp16(grouped_forward_params, stream);
+            } else if (inDataType == at::ScalarType::BFloat16) {
+              grouped_forward_bp16(grouped_forward_params, stream);
+            } else
+              throw std::runtime_error("input data-type is not supported!");
+            */
+            throw std::runtime_error(
+                "drop-out and compuate logsumexp currently not implemented by ck-tiled!");
+        };
     };
-  };
 
-  return std::make_tuple(out, logsumexp, philox_seed, philox_offset);
+    return std::make_tuple(out, logsumexp, philox_seed, philox_offset);
 }
 
 } // namespace
 
-TORCH_LIBRARY_IMPL(xformers, CUDA, m) {
-  m.impl(
-      TORCH_SELECTIVE_NAME("xformers::efficient_attention_forward_ck"),
-      TORCH_FN(efficient_attention_forward_ck));
+TORCH_LIBRARY_IMPL(xformers, CUDA, m)
+{
+    m.impl(TORCH_SELECTIVE_NAME("xformers::efficient_attention_forward_ck"),
+           TORCH_FN(efficient_attention_forward_ck));
 }
diff --git a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_infer.h b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_infer.h
index 3492f61f3..5fd39201e 100644
--- a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_infer.h
+++ b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_infer.h
@@ -22,8 +22,9 @@
 #include "ck_tiled_fmha_fwd_epilogue.h"
 #include "ck_tiled_fmha_fwd_tile_partitioner.h"
 #include "ck_tiled_fmha_params.h"
+#include "ck_tiled_fmha_definitions.h"
 
-template <typename scalar_t, int32_t custom_mask_type>
+template <typename scalar_t, int32_t custom_mask_type, bool has_attn_bias>
 struct batched_infer_masktype_attnbias_dispatched
 {
     using QDataType           = scalar_t;
@@ -38,6 +39,9 @@ struct batched_infer_masktype_attnbias_dispatched
 
     using VLayout = ck::tensor_layout::gemm::RowMajor;
 
+    static constexpr auto masktype = static_cast<CausalMaskType>(custom_mask_type);
+    using FmhaCausalMask           = typename CausalMaskPredicate<masktype>::predicate;
+
     using FmhaBlockTileHdim64  = ck::Sequence<128, 64, 32, 64, 32, 64>;
     using FmhaBlockTileHdim128 = ck::Sequence<128, 128, 32, 128, 32, 128>;
     using FmhaBlockWarps       = ck::Sequence<4, 1, 1>;
@@ -77,68 +81,64 @@ struct batched_infer_masktype_attnbias_dispatched
     }()
 #endif
 
+    template <bool kM0NeedPadding, bool kN0K1NeedPadding, typename FmhaShape>
+    using FmhaPipelineProblemTemp =
+        ck::tile_program::block::BlockFmhaPipelineProblem<QDataType,
+                                                          KDataType,
+                                                          VDataType,
+                                                          SaccDataType,
+                                                          SMPLComputeDataType,
+                                                          BiasDataType,
+                                                          PDataType,
+                                                          OaccDataType,
+                                                          ODataType,
+                                                          256, // BlockSize
+                                                          FmhaShape,
+                                                          false, // kIsGroupMode
+                                                          kM0NeedPadding,
+                                                          kN0K1NeedPadding,
+                                                          has_attn_bias,
+                                                          FmhaCausalMask>;
+
     static void Run(BatchedForwardParams& param, hipStream_t stream)
     {
         BATCHED_INFER_HEADDIM_SWITCH(param.K, param.Kv, [&] {
             using FmhaTilePartitioner = FmhaFwdTilePartitioner<FmhaShape>;
-            using FmhaPipelineProblem =
-                ck::tile_program::block::BlockFmhaPipelineProblem<QDataType,
-                                                                  KDataType,
-                                                                  VDataType,
-                                                                  SaccDataType,
-                                                                  SMPLComputeDataType,
-                                                                  BiasDataType,
-                                                                  PDataType,
-                                                                  OaccDataType,
-                                                                  ODataType,
-                                                                  256, // BlockSize
-                                                                  FmhaShape>;
-
-            using FmhaPipeline =
-                ck::tile_program::block::BlockFmhaPipelineQRKSVS<FmhaPipelineProblem>;
 
             if(param.M % FmhaShape::kM0 == 0 && param.N % FmhaShape::kN0 == 0)
             {
-                constexpr bool MNeedPadding = false;
-                constexpr bool NNeedPadding = false;
-                using FmhaKernel            = FmhaFwdKernel<FmhaTilePartitioner,
-                                                 FmhaPipeline,
-                                                 FmhaEpilogue,
-                                                 MNeedPadding,
-                                                 NNeedPadding>;
+                using FmhaPipelineProblem = FmhaPipelineProblemTemp<false, false, FmhaShape>;
+                using FmhaPipeline =
+                    ck::tile_program::block::BlockFmhaPipelineQRKSVS<FmhaPipelineProblem>;
+                using FmhaKernel = FmhaFwdKernel<FmhaTilePartitioner, FmhaPipeline, FmhaEpilogue>;
+
                 RunWithKernel<FmhaKernel>(param, stream);
             }
             else if(param.M % FmhaShape::kM0 == 0 && param.N % FmhaShape::kN0 != 0)
             {
-                constexpr bool MNeedPadding = false;
-                constexpr bool NNeedPadding = true;
-                using FmhaKernel            = FmhaFwdKernel<FmhaTilePartitioner,
-                                                 FmhaPipeline,
-                                                 FmhaEpilogue,
-                                                 MNeedPadding,
-                                                 NNeedPadding>;
+                using FmhaPipelineProblem = FmhaPipelineProblemTemp<false, true, FmhaShape>;
+                using FmhaPipeline =
+                    ck::tile_program::block::BlockFmhaPipelineQRKSVS<FmhaPipelineProblem>;
+                using FmhaKernel = FmhaFwdKernel<FmhaTilePartitioner, FmhaPipeline, FmhaEpilogue>;
+
                 RunWithKernel<FmhaKernel>(param, stream);
             }
             else if(param.M % FmhaShape::kM0 != 0 && param.N % FmhaShape::kN0 == 0)
             {
-                constexpr bool MNeedPadding = true;
-                constexpr bool NNeedPadding = false;
-                using FmhaKernel            = FmhaFwdKernel<FmhaTilePartitioner,
-                                                 FmhaPipeline,
-                                                 FmhaEpilogue,
-                                                 MNeedPadding,
-                                                 NNeedPadding>;
+                using FmhaPipelineProblem = FmhaPipelineProblemTemp<true, false, FmhaShape>;
+                using FmhaPipeline =
+                    ck::tile_program::block::BlockFmhaPipelineQRKSVS<FmhaPipelineProblem>;
+                using FmhaKernel = FmhaFwdKernel<FmhaTilePartitioner, FmhaPipeline, FmhaEpilogue>;
+
                 RunWithKernel<FmhaKernel>(param, stream);
             }
             else if(param.M % FmhaShape::kM0 != 0 && param.N % FmhaShape::kN0 != 0)
             {
-                constexpr bool MNeedPadding = true;
-                constexpr bool NNeedPadding = true;
-                using FmhaKernel            = FmhaFwdKernel<FmhaTilePartitioner,
-                                                 FmhaPipeline,
-                                                 FmhaEpilogue,
-                                                 MNeedPadding,
-                                                 NNeedPadding>;
+                using FmhaPipelineProblem = FmhaPipelineProblemTemp<true, true, FmhaShape>;
+                using FmhaPipeline =
+                    ck::tile_program::block::BlockFmhaPipelineQRKSVS<FmhaPipelineProblem>;
+                using FmhaKernel = FmhaFwdKernel<FmhaTilePartitioner, FmhaPipeline, FmhaEpilogue>;
+
                 RunWithKernel<FmhaKernel>(param, stream);
             };
         });
@@ -147,6 +147,67 @@ struct batched_infer_masktype_attnbias_dispatched
     template <typename FmhaKernel>
     static void RunWithKernel(BatchedForwardParams& param, hipStream_t stream)
     {
+        const auto kargs = [&] {
+            if constexpr(FmhaKernel::kSupportsBias)
+            {
+                std::optional<std::tuple<const void*, ck::index_t, ck::index_t, ck::index_t>> bias;
+
+                bias = std::make_tuple(param.attn_bias_ptr,
+                                       param.attn_bias_strides[2],
+                                       param.attn_bias_strides[1],
+                                       param.attn_bias_strides[0]);
+
+                return FmhaKernel::MakeKargs(
+                    param.q_ptr,
+                    param.k_ptr,
+                    param.v_ptr,
+                    param.out_ptr,
+                    param.M,  // seqlen_q
+                    param.N,  // seqlen_k
+                    param.K,  // hdim_q
+                    param.Kv, // hdim_v
+                    param.scale,
+                    param.q_strides[1], // q, k, v, out tensor seq-dim stride
+                    param.k_strides[1],
+                    param.v_strides[1],
+                    param.out_strides[1],
+                    param.q_strides[2], // q, k, v, out tensor head-dim stride
+                    param.k_strides[2],
+                    param.v_strides[2],
+                    param.out_strides[2],
+                    param.q_strides[0], // q, k, v, out tensor batch-dim stride
+                    param.k_strides[0],
+                    param.v_strides[0],
+                    param.out_strides[0],
+                    bias);
+            }
+            else
+            {
+                return FmhaKernel::MakeKargs(
+                    param.q_ptr,
+                    param.k_ptr,
+                    param.v_ptr,
+                    param.out_ptr,
+                    param.M,  // seqlen_q
+                    param.N,  // seqlen_k
+                    param.K,  // hdim_q
+                    param.Kv, // hdim_v
+                    param.scale,
+                    param.q_strides[1], // q, k, v, out tensor seq-dim stride
+                    param.k_strides[1],
+                    param.v_strides[1],
+                    param.out_strides[1],
+                    param.q_strides[2], // q, k, v, out tensor head-dim stride
+                    param.k_strides[2],
+                    param.v_strides[2],
+                    param.out_strides[2],
+                    param.q_strides[0], // q, k, v, out tensor batch-dim stride
+                    param.k_strides[0],
+                    param.v_strides[0],
+                    param.out_strides[0]);
+            };
+        }();
+
         dim3 kGridSize            = FmhaKernel::GridSize(param.B, param.Hq, param.M, param.Kv);
         constexpr dim3 kBlockSize = FmhaKernel::BlockSize();
 
@@ -154,45 +215,14 @@ struct batched_infer_masktype_attnbias_dispatched
         constexpr ck::index_t kWarpPerBlock = kBlockSize.x / warpSize;
         constexpr ck::index_t kBlockPerCu   = kWarpPerCu / kWarpPerBlock;
 
-        std::optional<std::tuple<const void*, ck::index_t, ck::index_t, ck::index_t>> bias;
-
-        if(param.has_attn_bias)
-            bias = std::make_tuple(param.attn_bias_ptr,
-                                   param.attn_bias_strides[2],
-                                   param.attn_bias_strides[1],
-                                   param.attn_bias_strides[0]);
-
-        auto kargs =
-            FmhaKernel::MakeKargs(param.q_ptr,
-                                  param.k_ptr,
-                                  param.v_ptr,
-                                  param.out_ptr,
-                                  param.M,  // seqlen_q
-                                  param.N,  // seqlen_k
-                                  param.K,  // hdim_q
-                                  param.Kv, // hdim_v
-                                  param.scale,
-                                  param.q_strides[1], // q, k, v, out tensor seq-dim stride
-                                  param.k_strides[1],
-                                  param.v_strides[1],
-                                  param.out_strides[1],
-                                  param.q_strides[2], // q, k, v, out tensor head-dim stride
-                                  param.k_strides[2],
-                                  param.v_strides[2],
-                                  param.out_strides[2],
-                                  param.q_strides[0], // q, k, v, out tensor batch-dim stride
-                                  param.k_strides[0],
-                                  param.v_strides[0],
-                                  param.out_strides[0],
-                                  bias);
-
         (void)launch_kernel<kBlockSize.x, kBlockPerCu>(
             StreamConfig{stream, false}, FmhaKernel{}, kGridSize, kBlockSize, 0, kargs);
     };
 };
 
-template <typename scalar_t, int32_t custom_mask_type>
+template <typename scalar_t, int32_t custom_mask_type, bool has_attn_bias>
 void run_batched_infer_masktype_attnbias_dispatched(BatchedForwardParams& param, hipStream_t stream)
 {
-    batched_infer_masktype_attnbias_dispatched<scalar_t, custom_mask_type>::Run(param, stream);
+    batched_infer_masktype_attnbias_dispatched<scalar_t, custom_mask_type, has_attn_bias>::Run(
+        param, stream);
 };
diff --git a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_infer_fp16.cpp b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_infer_fp16.cpp
index bb4fa6d91..6dc443a7f 100644
--- a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_infer_fp16.cpp
+++ b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_infer_fp16.cpp
@@ -5,28 +5,43 @@
 #include "ck_bool_switch.h"
 #include "ck_tiled_fmha_batched_infer.h"
 
-extern template void run_batched_infer_masktype_attnbias_dispatched<
-    ck::half_t,
-    0>(BatchedForwardParams& param, hipStream_t stream);
-
-extern template void run_batched_infer_masktype_attnbias_dispatched<
-    ck::half_t,
-    1>(BatchedForwardParams& param, hipStream_t stream);
-
-extern template void run_batched_infer_masktype_attnbias_dispatched<
-    ck::half_t,
-    2>(BatchedForwardParams& param, hipStream_t stream);
-
-void batched_infer_fp16(BatchedForwardParams& param, hipStream_t stream) {
-  if (param.custom_mask_type == 0)
-    run_batched_infer_masktype_attnbias_dispatched<ck::half_t, 0>(
-        param, stream);
-  else if (param.custom_mask_type == 1)
-    run_batched_infer_masktype_attnbias_dispatched<ck::half_t, 1>(
-        param, stream);
-  else if (param.custom_mask_type == 2)
-    run_batched_infer_masktype_attnbias_dispatched<ck::half_t, 2>(
-        param, stream);
-  else
-    throw std::runtime_error("Invalid custom_mask_type value");
+extern template void
+run_batched_infer_masktype_attnbias_dispatched<ck::half_t, 0, true>(BatchedForwardParams& param,
+                                                                    hipStream_t stream);
+
+extern template void
+run_batched_infer_masktype_attnbias_dispatched<ck::half_t, 0, false>(BatchedForwardParams& param,
+                                                                     hipStream_t stream);
+
+extern template void
+run_batched_infer_masktype_attnbias_dispatched<ck::half_t, 1, true>(BatchedForwardParams& param,
+                                                                    hipStream_t stream);
+
+extern template void
+run_batched_infer_masktype_attnbias_dispatched<ck::half_t, 1, false>(BatchedForwardParams& param,
+                                                                     hipStream_t stream);
+
+extern template void
+run_batched_infer_masktype_attnbias_dispatched<ck::half_t, 2, true>(BatchedForwardParams& param,
+                                                                    hipStream_t stream);
+
+extern template void
+run_batched_infer_masktype_attnbias_dispatched<ck::half_t, 2, false>(BatchedForwardParams& param,
+                                                                     hipStream_t stream);
+
+void batched_infer_fp16(BatchedForwardParams& param, hipStream_t stream)
+{
+    BOOL_SWITCH_1(param.has_attn_bias, HAS_ATTN_BIAS, [&] {
+        if(param.custom_mask_type == 0)
+            run_batched_infer_masktype_attnbias_dispatched<ck::half_t, 0, HAS_ATTN_BIAS>(param,
+                                                                                         stream);
+        else if(param.custom_mask_type == 1)
+            run_batched_infer_masktype_attnbias_dispatched<ck::half_t, 1, HAS_ATTN_BIAS>(param,
+                                                                                         stream);
+        else if(param.custom_mask_type == 2)
+            run_batched_infer_masktype_attnbias_dispatched<ck::half_t, 2, HAS_ATTN_BIAS>(param,
+                                                                                         stream);
+        else
+            throw std::runtime_error("Invalid custom_mask_type value");
+    });
 };
diff --git a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_definitions.h b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_definitions.h
new file mode 100644
index 000000000..b4cbdbce2
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_definitions.h
@@ -0,0 +1,31 @@
+#pragma once
+
+#include <ck/tile_program/block_tile/block_masking_specialization.hpp>
+
+enum struct CausalMaskType
+{
+    MaskDisabled,
+    MaskUpperTriangleFromTopLeft,
+    MaskUpperTriangleFromBottomRight
+};
+
+template <CausalMaskType type>
+struct CausalMaskPredicate;
+
+template <>
+struct CausalMaskPredicate<CausalMaskType::MaskDisabled>
+{
+    using predicate = ck::tile_program::block::MaskDisabledPredicate;
+};
+
+template <>
+struct CausalMaskPredicate<CausalMaskType::MaskUpperTriangleFromTopLeft>
+{
+    using predicate = ck::tile_program::block::MaskUpperTriangleFromTopLeftPredicate;
+};
+
+template <>
+struct CausalMaskPredicate<CausalMaskType::MaskUpperTriangleFromBottomRight>
+{
+    using predicate = ck::tile_program::block::MaskUpperTriangleFromBottomRightPredicate;
+};
diff --git a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_forward_kernel.h b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_forward_kernel.h
index e2b048546..169458efe 100644
--- a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_forward_kernel.h
+++ b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_forward_kernel.h
@@ -3,9 +3,9 @@
 #include <optional>
 #include <type_traits>
 
+#include "ck/utility/common_header.hpp"
 #include "ck/tensor/tensor_view.hpp"
 #include "ck/tile_program/tile/tile_window.hpp"
-#include "ck/utility/common_header.hpp"
 
 // S[seqlen_q, seqlen_k] = Q[seqlen_q, hdim_q] * K[seqlen_k, hdim_q]
 // S'[seqlen_q, seqlen_k] = S[seqlen_q, seqlen_k] * Scale[1]
@@ -15,11 +15,7 @@
 
 #define C_LOG2E 1.44269504088896340736 // log2(e)
 
-template <typename TilePartitioner_,
-          typename FmhaPipeline_,
-          typename EpiloguePipeline_,
-          bool MNeedPadding,
-          bool NNeedPadding>
+template <typename TilePartitioner_, typename FmhaPipeline_, typename EpiloguePipeline_>
 struct FmhaFwdKernel
 {
     using TilePartitioner                   = ck::remove_cvref_t<TilePartitioner_>;
@@ -35,8 +31,58 @@ struct FmhaFwdKernel
 
     using VLayout = ck::remove_cvref_t<typename FmhaPipeline::VLayout>;
 
-    struct KargsCommon
+    static constexpr bool kIsGroupMode     = FmhaPipeline::kIsGroupMode;
+    static constexpr bool kM0NeedPadding   = FmhaPipeline::kM0NeedPadding;
+    static constexpr bool kN0K1NeedPadding = FmhaPipeline::kN0K1NeedPadding;
+    static constexpr bool kSupportsBias    = FmhaPipeline::kSupportsBias;
+
+    using C0MatrixMask = ck::tile_program::block::C0MatrixMask_impl<
+        ck::remove_cvref_t<typename FmhaPipeline::BlockFmhaMask>>;
+
+    private:
+    struct EmptyKargs
     {
+    };
+
+    struct CommonKargs
+    {
+        __host__ constexpr CommonKargs(const void* q_ptr_,
+                                       const void* k_ptr_,
+                                       const void* v_ptr_,
+                                       void* o_ptr_,
+                                       ck::index_t seqlen_q_,
+                                       ck::index_t seqlen_k_,
+                                       ck::index_t hdim_q_,
+                                       ck::index_t hdim_v_,
+                                       float scale_,
+                                       ck::index_t stride_q_,
+                                       ck::index_t stride_k_,
+                                       ck::index_t stride_v_,
+                                       ck::index_t stride_o_,
+                                       ck::index_t nhead_stride_q_,
+                                       ck::index_t nhead_stride_k_,
+                                       ck::index_t nhead_stride_v_,
+                                       ck::index_t nhead_stride_o_)
+            : q_ptr{reinterpret_cast<const QDataType*>(q_ptr_)},
+              k_ptr{reinterpret_cast<const KDataType*>(k_ptr_)},
+              v_ptr{reinterpret_cast<const VDataType*>(v_ptr_)},
+              o_ptr{reinterpret_cast<ODataType*>(o_ptr_)},
+              seqlen_q{seqlen_q_},
+              seqlen_k{seqlen_k_},
+              hdim_q{hdim_q_},
+              hdim_v{hdim_v_},
+              scale{scale_},
+              stride_q{stride_q_},
+              stride_k{stride_k_},
+              stride_v{stride_v_},
+              stride_o{stride_o_},
+              nhead_stride_q{nhead_stride_q_},
+              nhead_stride_k{nhead_stride_k_},
+              nhead_stride_v{nhead_stride_v_},
+              nhead_stride_o{nhead_stride_o_}
+        {
+        }
+
         const QDataType* q_ptr;
         const KDataType* k_ptr;
         const VDataType* v_ptr;
@@ -58,85 +104,158 @@ struct FmhaFwdKernel
         ck::index_t nhead_stride_k;
         ck::index_t nhead_stride_v;
         ck::index_t nhead_stride_o;
+    };
 
-        // following attributes are optional
+    struct CommonBiasKargs
+    {
         const BiasDataType* bias_ptr  = nullptr;
         ck::index_t stride_bias       = 0;
         ck::index_t nhead_stride_bias = 0;
     };
 
-    struct KargsBatchMode : KargsCommon
+    struct BatchModeBiasKargs : CommonBiasKargs
     {
+        ck::index_t batch_stride_bias = 0;
+    };
+
+    struct BatchModeKargs : CommonKargs,
+                            std::conditional_t<kSupportsBias, BatchModeBiasKargs, EmptyKargs>
+    {
+        __host__ constexpr BatchModeKargs(const void* q_ptr_,
+                                          const void* k_ptr_,
+                                          const void* v_ptr_,
+                                          void* o_ptr_,
+                                          ck::index_t seqlen_q_,
+                                          ck::index_t seqlen_k_,
+                                          ck::index_t hdim_q_,
+                                          ck::index_t hdim_v_,
+                                          float scale_,
+                                          ck::index_t stride_q_,
+                                          ck::index_t stride_k_,
+                                          ck::index_t stride_v_,
+                                          ck::index_t stride_o_,
+                                          ck::index_t nhead_stride_q_,
+                                          ck::index_t nhead_stride_k_,
+                                          ck::index_t nhead_stride_v_,
+                                          ck::index_t nhead_stride_o_,
+                                          ck::index_t batch_stride_q_,
+                                          ck::index_t batch_stride_k_,
+                                          ck::index_t batch_stride_v_,
+                                          ck::index_t batch_stride_o_)
+            : CommonKargs{q_ptr_,
+                          k_ptr_,
+                          v_ptr_,
+                          o_ptr_,
+                          seqlen_q_,
+                          seqlen_k_,
+                          hdim_q_,
+                          hdim_v_,
+                          scale_,
+                          stride_q_,
+                          stride_k_,
+                          stride_v_,
+                          stride_o_,
+                          nhead_stride_q_,
+                          nhead_stride_k_,
+                          nhead_stride_v_,
+                          nhead_stride_o_},
+              batch_stride_q{batch_stride_q_},
+              batch_stride_k{batch_stride_k_},
+              batch_stride_v{batch_stride_v_},
+              batch_stride_o{batch_stride_o_}
+        {
+        }
+
         ck::index_t batch_stride_q;
         ck::index_t batch_stride_k;
         ck::index_t batch_stride_v;
         ck::index_t batch_stride_o;
-
-        // following attributes are optional
-        ck::index_t batch_stride_bias = 0;
     };
 
-    struct KargsGroupMode : KargsCommon
+    struct GroupModeKargs : CommonKargs,
+                            std::conditional_t<kSupportsBias, CommonBiasKargs, EmptyKargs>
     {
+        __host__ constexpr GroupModeKargs(const void* q_ptr_,
+                                          const void* k_ptr_,
+                                          const void* v_ptr_,
+                                          void* o_ptr_,
+                                          const void* seqstart_q_ptr_,
+                                          const void* seqstart_k_ptr_,
+                                          const void* seqlen_k_ptr_,
+                                          ck::index_t hdim_q_,
+                                          ck::index_t hdim_v_,
+                                          float scale_,
+                                          ck::index_t stride_q_,
+                                          ck::index_t stride_k_,
+                                          ck::index_t stride_v_,
+                                          ck::index_t stride_o_,
+                                          ck::index_t nhead_stride_q_,
+                                          ck::index_t nhead_stride_k_,
+                                          ck::index_t nhead_stride_v_,
+                                          ck::index_t nhead_stride_o_)
+            : CommonKargs{q_ptr_,
+                          k_ptr_,
+                          v_ptr_,
+                          o_ptr_,
+                          -1 /* will be updated inside the kernel */,
+                          -1 /* will be updated inside the kernel */,
+                          hdim_q_,
+                          hdim_v_,
+                          scale_,
+                          stride_q_,
+                          stride_k_,
+                          stride_v_,
+                          stride_o_,
+                          nhead_stride_q_,
+                          nhead_stride_k_,
+                          nhead_stride_v_,
+                          nhead_stride_o_},
+              seqstart_q_ptr{reinterpret_cast<const int32_t*>(seqstart_q_ptr_)},
+              seqstart_k_ptr{reinterpret_cast<const int32_t*>(seqstart_k_ptr_)},
+              seqlen_k_ptr{reinterpret_cast<const int32_t*>(seqlen_k_ptr_)}
+        {
+        }
+
         const int32_t* seqstart_q_ptr;
         const int32_t* seqstart_k_ptr;
         const int32_t* seqlen_k_ptr;
     };
 
-    __host__ static constexpr void InitKargsCommon(KargsCommon& kargs,
-                                                   const void* q_ptr,
-                                                   const void* k_ptr,
-                                                   const void* v_ptr,
-                                                   void* o_ptr,
-                                                   ck::index_t seqlen_q,
-                                                   ck::index_t seqlen_k,
-                                                   ck::index_t hdim_q,
-                                                   ck::index_t hdim_v,
-                                                   float scale,
-                                                   ck::index_t stride_q,
-                                                   ck::index_t stride_k,
-                                                   ck::index_t stride_v,
-                                                   ck::index_t stride_o,
-                                                   ck::index_t nhead_stride_q,
-                                                   ck::index_t nhead_stride_k,
-                                                   ck::index_t nhead_stride_v,
-                                                   ck::index_t nhead_stride_o)
-    {
-        kargs.q_ptr = reinterpret_cast<const QDataType*>(q_ptr);
-        kargs.k_ptr = reinterpret_cast<const KDataType*>(k_ptr);
-        kargs.v_ptr = reinterpret_cast<const VDataType*>(v_ptr);
-        kargs.o_ptr = reinterpret_cast<ODataType*>(o_ptr);
-
-        kargs.seqlen_q = seqlen_q;
-        kargs.seqlen_k = seqlen_k;
-        kargs.hdim_q   = hdim_q;
-        kargs.hdim_v   = hdim_v;
-
-        kargs.scale = scale;
-
-        kargs.stride_q = stride_q;
-        kargs.stride_k = stride_k;
-        kargs.stride_v = stride_v;
-        kargs.stride_o = stride_o;
-
-        kargs.nhead_stride_q = nhead_stride_q;
-        kargs.nhead_stride_k = nhead_stride_k;
-        kargs.nhead_stride_v = nhead_stride_v;
-        kargs.nhead_stride_o = nhead_stride_o;
-    }
-
-    __host__ static constexpr void InitKargsCommonBias(KargsCommon& kargs,
-                                                       const void* bias_ptr,
-                                                       ck::index_t stride_bias,
-                                                       ck::index_t nhead_stride_bias)
+    public:
+    using Kargs = std::conditional_t<kIsGroupMode, GroupModeKargs, BatchModeKargs>;
+
+    template <bool Cond = !kIsGroupMode && !kSupportsBias>
+    __host__ static constexpr std::enable_if_t<Cond, Kargs> MakeKargs(const void* q_ptr,
+                                                                      const void* k_ptr,
+                                                                      const void* v_ptr,
+                                                                      void* o_ptr,
+                                                                      ck::index_t seqlen_q,
+                                                                      ck::index_t seqlen_k,
+                                                                      ck::index_t hdim_q,
+                                                                      ck::index_t hdim_v,
+                                                                      float scale,
+                                                                      ck::index_t stride_q,
+                                                                      ck::index_t stride_k,
+                                                                      ck::index_t stride_v,
+                                                                      ck::index_t stride_o,
+                                                                      ck::index_t nhead_stride_q,
+                                                                      ck::index_t nhead_stride_k,
+                                                                      ck::index_t nhead_stride_v,
+                                                                      ck::index_t nhead_stride_o,
+                                                                      ck::index_t batch_stride_q,
+                                                                      ck::index_t batch_stride_k,
+                                                                      ck::index_t batch_stride_v,
+                                                                      ck::index_t batch_stride_o)
     {
-        kargs.bias_ptr          = reinterpret_cast<const BiasDataType*>(bias_ptr);
-        kargs.stride_bias       = stride_bias;
-        kargs.nhead_stride_bias = nhead_stride_bias;
+        return Kargs{q_ptr,          k_ptr,          v_ptr,          o_ptr,          seqlen_q,
+                     seqlen_k,       hdim_q,         hdim_v,         scale,          stride_q,
+                     stride_k,       stride_v,       stride_o,       nhead_stride_q, nhead_stride_k,
+                     nhead_stride_v, nhead_stride_o, batch_stride_q, batch_stride_k, batch_stride_v,
+                     batch_stride_o};
     }
 
-    // initialize kernel arguments for batch mode
-    __host__ static constexpr auto
+    template <bool Cond = !kIsGroupMode && kSupportsBias>
+    __host__ static constexpr std::enable_if_t<Cond, Kargs>
     MakeKargs(const void* q_ptr,
               const void* k_ptr,
               const void* v_ptr,
@@ -161,44 +280,65 @@ struct FmhaFwdKernel
               std::optional<std::tuple<const void*, ck::index_t, ck::index_t, ck::index_t>> bias =
                   std::nullopt)
     {
-        KargsBatchMode kargs;
-
-        InitKargsCommon(kargs,
-                        q_ptr,
-                        k_ptr,
-                        v_ptr,
-                        o_ptr,
-                        seqlen_q,
-                        seqlen_k,
-                        hdim_q,
-                        hdim_v,
-                        scale,
-                        stride_q,
-                        stride_k,
-                        stride_v,
-                        stride_o,
-                        nhead_stride_q,
-                        nhead_stride_k,
-                        nhead_stride_v,
-                        nhead_stride_o);
-
-        kargs.batch_stride_q = batch_stride_q;
-        kargs.batch_stride_k = batch_stride_k;
-        kargs.batch_stride_v = batch_stride_v;
-        kargs.batch_stride_o = batch_stride_o;
+        Kargs kargs{q_ptr,          k_ptr,          v_ptr,          o_ptr,          seqlen_q,
+                    seqlen_k,       hdim_q,         hdim_v,         scale,          stride_q,
+                    stride_k,       stride_v,       stride_o,       nhead_stride_q, nhead_stride_k,
+                    nhead_stride_v, nhead_stride_o, batch_stride_q, batch_stride_k, batch_stride_v,
+                    batch_stride_o};
 
         if(bias.has_value())
         {
-            InitKargsCommonBias(kargs, std::get<0>(*bias), std::get<1>(*bias), std::get<2>(*bias));
-
+            kargs.bias_ptr          = reinterpret_cast<const BiasDataType*>(std::get<0>(*bias));
+            kargs.stride_bias       = std::get<1>(*bias);
+            kargs.nhead_stride_bias = std::get<2>(*bias);
             kargs.batch_stride_bias = std::get<3>(*bias);
         }
 
         return kargs;
     }
 
-    // initialize kernel arguments for group mode
-    __host__ static constexpr auto
+    template <bool Cond = kIsGroupMode && !kSupportsBias>
+    __host__ static constexpr std::enable_if_t<Cond, Kargs> MakeKargs(const void* q_ptr,
+                                                                      const void* k_ptr,
+                                                                      const void* v_ptr,
+                                                                      void* o_ptr,
+                                                                      const void* seqstart_q_ptr,
+                                                                      const void* seqstart_k_ptr,
+                                                                      const void* seqlen_k_ptr,
+                                                                      ck::index_t hdim_q,
+                                                                      ck::index_t hdim_v,
+                                                                      float scale,
+                                                                      ck::index_t stride_q,
+                                                                      ck::index_t stride_k,
+                                                                      ck::index_t stride_v,
+                                                                      ck::index_t stride_o,
+                                                                      ck::index_t nhead_stride_q,
+                                                                      ck::index_t nhead_stride_k,
+                                                                      ck::index_t nhead_stride_v,
+                                                                      ck::index_t nhead_stride_o)
+    {
+        return Kargs{q_ptr,
+                     k_ptr,
+                     v_ptr,
+                     o_ptr,
+                     seqstart_q_ptr,
+                     seqstart_k_ptr,
+                     seqlen_k_ptr,
+                     hdim_q,
+                     hdim_v,
+                     scale,
+                     stride_q,
+                     stride_k,
+                     stride_v,
+                     stride_o,
+                     nhead_stride_q,
+                     nhead_stride_k,
+                     nhead_stride_v,
+                     nhead_stride_o};
+    }
+
+    template <bool Cond = kIsGroupMode&& kSupportsBias>
+    __host__ static constexpr std::enable_if_t<Cond, Kargs>
     MakeKargs(const void* q_ptr,
               const void* k_ptr,
               const void* v_ptr,
@@ -219,36 +359,32 @@ struct FmhaFwdKernel
               ck::index_t nhead_stride_o,
               std::optional<std::tuple<const void*, ck::index_t, ck::index_t>> bias = std::nullopt)
     {
-        KargsGroupMode kargs;
-
-        InitKargsCommon(kargs,
-                        q_ptr,
-                        k_ptr,
-                        v_ptr,
-                        o_ptr,
-                        -1, // seqlen_q will be updated inside the kernel
-                        -1, // seqlen_k will be updated inside the kernel
-                        hdim_q,
-                        hdim_v,
-                        scale,
-                        stride_q,
-                        stride_k,
-                        stride_v,
-                        stride_o,
-                        nhead_stride_q,
-                        nhead_stride_k,
-                        nhead_stride_v,
-                        nhead_stride_o);
+        Kargs kargs{q_ptr,
+                    k_ptr,
+                    v_ptr,
+                    o_ptr,
+                    seqstart_q_ptr,
+                    seqstart_k_ptr,
+                    seqlen_k_ptr,
+                    hdim_q,
+                    hdim_v,
+                    scale,
+                    stride_q,
+                    stride_k,
+                    stride_v,
+                    stride_o,
+                    nhead_stride_q,
+                    nhead_stride_k,
+                    nhead_stride_v,
+                    nhead_stride_o};
 
         if(bias.has_value())
         {
-            InitKargsCommonBias(kargs, std::get<0>(*bias), std::get<1>(*bias), std::get<2>(*bias));
+            kargs.bias_ptr          = reinterpret_cast<const BiasDataType*>(std::get<0>(*bias));
+            kargs.stride_bias       = std::get<1>(*bias);
+            kargs.nhead_stride_bias = std::get<2>(*bias);
         }
 
-        kargs.seqstart_q_ptr = reinterpret_cast<const int32_t*>(seqstart_q_ptr);
-        kargs.seqstart_k_ptr = reinterpret_cast<const int32_t*>(seqstart_k_ptr);
-        kargs.seqlen_k_ptr   = reinterpret_cast<const int32_t*>(seqlen_k_ptr);
-
         return kargs;
     }
 
@@ -267,7 +403,6 @@ struct FmhaFwdKernel
         return ck::math::max(FmhaPipeline::GetSmemSize(), EpiloguePipeline::GetSmemSize());
     }
 
-    template <typename Kargs>
     __device__ void operator()(Kargs kargs) const
     {
         using namespace ck;
@@ -290,17 +425,9 @@ struct FmhaFwdKernel
         index_t batch_offset_bias = 0;
         index_t batch_offset_o    = 0;
 
-        if constexpr(ck::is_same_v<Kargs, KargsBatchMode>)
+        if constexpr(kIsGroupMode)
         {
-            batch_offset_q    = i_batch * kargs.batch_stride_q;
-            batch_offset_k    = i_batch * kargs.batch_stride_k;
-            batch_offset_v    = i_batch * kargs.batch_stride_v;
-            batch_offset_bias = i_batch * kargs.batch_stride_bias;
-            batch_offset_o    = i_batch * kargs.batch_stride_o;
-        }
-        else
-        { // ck::is_same_v<Kargs, KargsGroupMode>
-            // get starting offset for each work batch
+            // get starting offset for each batch
             const index_t query_start = kargs.seqstart_q_ptr[i_batch];
             const index_t key_start   = kargs.seqstart_k_ptr[i_batch];
 
@@ -314,20 +441,20 @@ struct FmhaFwdKernel
             {
                 batch_offset_v = key_start;
             }
-            batch_offset_bias = query_start * kargs.stride_bias + key_start;
-            batch_offset_o    = query_start * kargs.stride_o;
+            if constexpr(kSupportsBias)
+            {
+                batch_offset_bias = query_start * kargs.stride_bias + key_start;
+            }
+            else
+            {
+                batch_offset_bias = key_start;
+            }
+            batch_offset_o = query_start * kargs.stride_o;
 
             // get real # queries & # keys under group mode
             const auto adjusted_seqstart_q_ptr = kargs.seqstart_q_ptr + i_batch;
             kargs.seqlen_q = adjusted_seqstart_q_ptr[1] - adjusted_seqstart_q_ptr[0];
 
-            // # of required blocks is different in each groups, terminate unnecessary
-            // blocks earlier
-            if(kargs.seqlen_q <= i_m0)
-            {
-                return;
-            }
-
             if(kargs.seqlen_k_ptr != nullptr)
             {
                 kargs.seqlen_k = kargs.seqlen_k_ptr[i_batch];
@@ -338,17 +465,23 @@ struct FmhaFwdKernel
                 kargs.seqlen_k = adjusted_seqstart_k_ptr[1] - adjusted_seqstart_k_ptr[0];
             }
         }
+        else
+        {
+            batch_offset_q = i_batch * kargs.batch_stride_q;
+            batch_offset_k = i_batch * kargs.batch_stride_k;
+            batch_offset_v = i_batch * kargs.batch_stride_v;
+            if constexpr(kSupportsBias)
+            {
+                batch_offset_bias = i_batch * kargs.batch_stride_bias;
+            }
+            batch_offset_o = i_batch * kargs.batch_stride_o;
+        }
 
         // for simplicity, batch stride we just modify the pointer
         const QDataType* q_ptr = kargs.q_ptr + i_nhead * kargs.nhead_stride_q + batch_offset_q;
         const KDataType* k_ptr = kargs.k_ptr + i_nhead * kargs.nhead_stride_k + batch_offset_k;
         const VDataType* v_ptr = kargs.v_ptr + i_nhead * kargs.nhead_stride_v + batch_offset_v;
-        const BiasDataType* bias_ptr = nullptr;
-        if(kargs.bias_ptr != nullptr)
-        {
-            bias_ptr = kargs.bias_ptr + i_nhead * kargs.nhead_stride_bias + batch_offset_bias;
-        }
-        ODataType* o_ptr = kargs.o_ptr + i_nhead * kargs.nhead_stride_o + batch_offset_o;
+        ODataType* o_ptr       = kargs.o_ptr + i_nhead * kargs.nhead_stride_o + batch_offset_o;
 
         // Q/K/V DRAM and DRAM window
         const auto q_dram = [&]() {
@@ -361,7 +494,7 @@ struct FmhaFwdKernel
 
             return pad_tensor_view(q_dram_naive,
                                    make_tuple(Number<FmhaPipeline::kM0>{}, Number<1>{}),
-                                   Sequence<MNeedPadding, false>{});
+                                   Sequence<kM0NeedPadding, false>{});
         }();
         const auto k_dram = [&]() {
             const auto k_dram_naive = make_naive_tensor_view<AddressSpaceEnum::Global>(
@@ -373,7 +506,7 @@ struct FmhaFwdKernel
 
             return pad_tensor_view(k_dram_naive,
                                    make_tuple(Number<FmhaPipeline::kN0>{}, Number<1>{}),
-                                   Sequence<NNeedPadding, false>{});
+                                   Sequence<kN0K1NeedPadding, false>{});
         }();
         const auto v_dram = [&]() {
             if constexpr(ck::is_same_v<VLayout, ck::tensor_layout::gemm::RowMajor>)
@@ -392,16 +525,11 @@ struct FmhaFwdKernel
                                           make_tuple(Sequence<1>{}, Sequence<0>{}),
                                           make_tuple(Sequence<0>{}, Sequence<1>{}));
 
-                /// FIXME: The return value of
-                /// v_dram_naive.GetTensorDescriptor().GetLength() is same as
-                ///   v_dram_transposed.GetTensorDescriptor().GetLength(). Replace
-                ///   following if-clause by pad_tensor_view() call after fixing this
-                ///   issue.
-                if constexpr(!NNeedPadding)
-                {
-                    return v_dram_transposed;
-                }
-                else
+                /// FIXME: The return value of v_dram_naive.GetTensorDescriptor().GetLength() is
+                /// same as
+                ///   v_dram_transposed.GetTensorDescriptor().GetLength(). Replace following
+                ///   if-clause by pad_tensor_view() call after fixing this issue.
+                if constexpr(kN0K1NeedPadding)
                 {
                     const index_t pad_length =
                         FmhaPipeline::kK1 *
@@ -415,6 +543,10 @@ struct FmhaFwdKernel
                         make_tuple(Sequence<0>{}, Sequence<1>{}),
                         make_tuple(Sequence<0>{}, Sequence<1>{}));
                 }
+                else
+                {
+                    return v_dram_transposed;
+                }
             }
             else
             {
@@ -427,7 +559,7 @@ struct FmhaFwdKernel
 
                 return pad_tensor_view(v_dram_naive,
                                        make_tuple(Number<1>{}, Number<FmhaPipeline::kK1>{}),
-                                       Sequence<false, NNeedPadding>{});
+                                       Sequence<false, kN0K1NeedPadding>{});
             }
         }();
 
@@ -451,58 +583,63 @@ struct FmhaFwdKernel
                              {i_n1, 0});
 
         const auto run_pipeline_with = [&](auto bias_dram_window) {
-            const auto s_mask = [&]() {
-                if constexpr(NNeedPadding)
-                {
-                    return [&](index_t /* m */, index_t n) {
-                        const bool is_out_of_bound = !(n < kargs.seqlen_k);
-                        return is_out_of_bound;
-                    };
-                }
-                else
-                {
-                    return NullMask{};
-                }
-            }();
+            C0MatrixMask casual_mask{kargs.seqlen_q, kargs.seqlen_k};
 
             return FmhaPipeline{}(q_dram_window,
                                   k_dram_window,
                                   v_dram_window,
                                   bias_dram_window,
-                                  s_mask,
+                                  casual_mask,
                                   kargs.scale,
                                   ck::math::integer_divide_ceil(kargs.seqlen_k, FmhaPipeline::kN0),
                                   ck::math::integer_divide_ceil(kargs.hdim_q, FmhaPipeline::kK0),
                                   smem_ptr);
         };
 
-        auto o_acc_tile = [&]() {
+        /// FIXME: Before C++20, capturing structured binding variables is not supported. Remove
+        /// following copy capture of the 'i_nhead'
+        ///        if compiled in C++20
+        auto o_acc_tile = [&, i_nhead_ = i_nhead]() {
             constexpr auto bias_dram_window_lengths =
                 make_tuple(Number<FmhaPipeline::kM0>{}, Number<FmhaPipeline::kN0>{});
 
-            if(bias_ptr != nullptr)
+            if constexpr(kSupportsBias)
             {
-                const auto bias_dram = [&]() {
-                    const auto bias_dram_naive = make_naive_tensor_view<AddressSpaceEnum::Global>(
-                        bias_ptr,
-                        make_tuple(kargs.seqlen_q, kargs.seqlen_k),
-                        make_tuple(kargs.stride_bias, 1),
-                        Number<32>{},
-                        Number<1>{});
-
-                    return pad_tensor_view(bias_dram_naive,
-                                           bias_dram_window_lengths,
-                                           Sequence<MNeedPadding, NNeedPadding>{});
-                }();
-
-                auto bias_dram_window =
-                    make_tile_window(bias_dram, bias_dram_window_lengths, {i_m0, 0});
-
-                return run_pipeline_with(bias_dram_window);
+                if(kargs.bias_ptr != nullptr)
+                {
+                    const BiasDataType* bias_ptr =
+                        kargs.bias_ptr + i_nhead_ * kargs.nhead_stride_bias + batch_offset_bias;
+
+                    const auto bias_dram = [&]() {
+                        const auto bias_dram_naive =
+                            make_naive_tensor_view<AddressSpaceEnum::Global>(
+                                bias_ptr,
+                                make_tuple(kargs.seqlen_q, kargs.seqlen_k),
+                                make_tuple(kargs.stride_bias, 1),
+                                Number<32>{},
+                                Number<1>{});
+
+                        return pad_tensor_view(bias_dram_naive,
+                                               bias_dram_window_lengths,
+                                               Sequence<kM0NeedPadding, kN0K1NeedPadding>{});
+                    }();
+
+                    const auto bias_dram_window =
+                        make_tile_window(bias_dram, bias_dram_window_lengths, {i_m0, 0});
+
+                    return run_pipeline_with(bias_dram_window);
+                }
+                else
+                {
+                    const auto dummy_bias_dram_window =
+                        make_null_tile_window(bias_dram_window_lengths);
+
+                    return run_pipeline_with(dummy_bias_dram_window);
+                }
             }
             else
             {
-                auto dummy_bias_dram_window = make_null_tile_window(bias_dram_window_lengths);
+                const auto dummy_bias_dram_window = make_null_tile_window(bias_dram_window_lengths);
 
                 return run_pipeline_with(dummy_bias_dram_window);
             }
@@ -519,7 +656,7 @@ struct FmhaFwdKernel
 
             return pad_tensor_view(o_dram_naive,
                                    make_tuple(Number<FmhaPipeline::kM0>{}, Number<1>{}),
-                                   Sequence<MNeedPadding, false>{});
+                                   Sequence<kM0NeedPadding, false>{});
         }();
 
         auto o_dram_window =
diff --git a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_infer.h b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_infer.h
index b52086fd7..4bac3a433 100644
--- a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_infer.h
+++ b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_infer.h
@@ -18,14 +18,13 @@
 #include <ck/tile_program/block_tile_pipeline/block_fmha_pipeline_qr_ks_vs_default_policy.hpp>
 #include <ck/tile_program/tile/tile_fmha_shape.hpp>
 
-#include "ck_fmha_op_helper.h"
-#include "ck_fmha_util.h"
 #include "ck_tiled_fmha_forward_kernel.h"
 #include "ck_tiled_fmha_fwd_epilogue.h"
 #include "ck_tiled_fmha_fwd_tile_partitioner.h"
 #include "ck_tiled_fmha_params.h"
+#include "ck_tiled_fmha_definitions.h"
 
-template <typename scalar_t, int32_t custom_mask_type>
+template <typename scalar_t, int32_t custom_mask_type, bool has_attn_bias>
 struct grouped_infer_masktype_attnbias_dispatched
 {
     using QDataType           = scalar_t;
@@ -40,6 +39,9 @@ struct grouped_infer_masktype_attnbias_dispatched
 
     using VLayout = ck::tensor_layout::gemm::RowMajor;
 
+    static constexpr auto masktype = static_cast<CausalMaskType>(custom_mask_type);
+    using FmhaCausalMask           = typename CausalMaskPredicate<masktype>::predicate;
+
     using FmhaBlockTileHdim64  = ck::Sequence<128, 64, 32, 64, 32, 64>;
     using FmhaBlockTileHdim128 = ck::Sequence<128, 128, 32, 128, 32, 128>;
     using FmhaBlockWarps       = ck::Sequence<4, 1, 1>;
@@ -99,16 +101,17 @@ struct grouped_infer_masktype_attnbias_dispatched
                                                                   OaccDataType,
                                                                   ODataType,
                                                                   256, // BlockSize
-                                                                  FmhaShape>;
+                                                                  FmhaShape,
+                                                                  true, // IsGroupMode
+                                                                  true, // kM0NeedPadding
+                                                                  true, // kN0K1Needpadding
+                                                                  has_attn_bias,
+                                                                  FmhaCausalMask>;
 
             using FmhaPipeline =
                 ck::tile_program::block::BlockFmhaPipelineQRKSVS<FmhaPipelineProblem>;
 
-            using FmhaKernel = FmhaFwdKernel<FmhaTilePartitioner,
-                                             FmhaPipeline,
-                                             FmhaEpilogue,
-                                             MNeedPadding,
-                                             NNeedPadding>;
+            using FmhaKernel = FmhaFwdKernel<FmhaTilePartitioner, FmhaPipeline, FmhaEpilogue>;
 
             RunWithKernel<FmhaKernel>(param, stream);
         });
@@ -117,6 +120,59 @@ struct grouped_infer_masktype_attnbias_dispatched
     template <typename FmhaKernel>
     static void RunWithKernel(GroupedForwardParams& param, hipStream_t stream)
     {
+        const auto kargs = [&] {
+            if constexpr(FmhaKernel::kSupportsBias)
+            {
+                std::optional<std::tuple<const void*, ck::index_t, ck::index_t>> bias;
+
+                bias = std::make_tuple(
+                    param.attn_bias_ptr, param.attn_bias_strides[2], param.attn_bias_strides[1]);
+
+                return FmhaKernel::MakeKargs(
+                    param.q_ptr,
+                    param.k_ptr,
+                    param.v_ptr,
+                    param.out_ptr,
+                    param.seqstart_q_dev_ptr,
+                    param.seqstart_k_dev_ptr,
+                    param.seqlen_k_dev_ptr,
+                    param.K,  // hdim_q
+                    param.Kv, // hdim_v
+                    param.scale,
+                    param.q_strides[1], // q, k, v, out tensor seq-dim stride
+                    param.k_strides[1],
+                    param.v_strides[1],
+                    param.out_strides[1],
+                    param.q_strides[2], // q, k, v, out tensor head-dim stride
+                    param.k_strides[2],
+                    param.v_strides[2],
+                    param.out_strides[2],
+                    bias);
+            }
+            else
+            {
+                return FmhaKernel::MakeKargs(
+                    param.q_ptr,
+                    param.k_ptr,
+                    param.v_ptr,
+                    param.out_ptr,
+                    param.seqstart_q_dev_ptr,
+                    param.seqstart_k_dev_ptr,
+                    param.seqlen_k_dev_ptr,
+                    param.K,  // hdim_q
+                    param.Kv, // hdim_v
+                    param.scale,
+                    param.q_strides[1], // q, k, v, out tensor seq-dim stride
+                    param.k_strides[1],
+                    param.v_strides[1],
+                    param.out_strides[1],
+                    param.q_strides[2], // q, k, v, out tensor head-dim stride
+                    param.k_strides[2],
+                    param.v_strides[2],
+                    param.out_strides[2]);
+            };
+        }();
+
         dim3 kGridSize = FmhaKernel::GridSize(param.num_batches, param.Hq, param.M, param.Kv);
         constexpr dim3 kBlockSize = FmhaKernel::BlockSize();
 
@@ -124,42 +180,14 @@ struct grouped_infer_masktype_attnbias_dispatched
         constexpr ck::index_t kWarpPerBlock = kBlockSize.x / warpSize;
         constexpr ck::index_t kBlockPerCu   = kWarpPerCu / kWarpPerBlock;
 
-        std::optional<std::tuple<const void*, ck::index_t, ck::index_t>> bias;
-
-        if(param.has_attn_bias)
-        {
-            bias = std::make_tuple(
-                param.attn_bias_ptr, param.attn_bias_strides[2], param.attn_bias_strides[1]);
-        };
-
-        auto kargs =
-            FmhaKernel::MakeKargs(param.q_ptr,
-                                  param.k_ptr,
-                                  param.v_ptr,
-                                  param.out_ptr,
-                                  param.seqstart_q_dev_ptr,
-                                  param.seqstart_k_dev_ptr,
-                                  param.seqlen_k_dev_ptr,
-                                  param.K,  // hdim_q
-                                  param.Kv, // hdim_v
-                                  param.scale,
-                                  param.q_strides[1], // q, k, v, out tensor seq-dim stride
-                                  param.k_strides[1],
-                                  param.v_strides[1],
-                                  param.out_strides[1],
-                                  param.q_strides[2], // q, k, v, out tensor head-dim stride
-                                  param.k_strides[2],
-                                  param.v_strides[2],
-                                  param.out_strides[2],
-                                  bias);
-
         (void)launch_kernel<kBlockSize.x, kBlockPerCu>(
             StreamConfig{stream, false}, FmhaKernel{}, kGridSize, kBlockSize, 0, kargs);
     };
 };
 
-template <typename scalar_t, int32_t custom_mask_type>
+template <typename scalar_t, int32_t custom_mask_type, bool has_attn_bias>
 void run_grouped_infer_masktype_attnbias_dispatched(GroupedForwardParams& param, hipStream_t stream)
 {
-    grouped_infer_masktype_attnbias_dispatched<scalar_t, custom_mask_type>::Run(param, stream);
+    grouped_infer_masktype_attnbias_dispatched<scalar_t, custom_mask_type, has_attn_bias>::Run(
+        param, stream);
 };
diff --git a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_infer_fp16.cpp b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_infer_fp16.cpp
index 3954ee4ff..659fd286b 100644
--- a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_infer_fp16.cpp
+++ b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_infer_fp16.cpp
@@ -5,28 +5,43 @@
 #include "ck_bool_switch.h"
 #include "ck_tiled_fmha_grouped_infer.h"
 
-extern template void run_grouped_infer_masktype_attnbias_dispatched<
-    ck::half_t,
-    0>(GroupedForwardParams& param, hipStream_t stream);
-
-extern template void run_grouped_infer_masktype_attnbias_dispatched<
-    ck::half_t,
-    1>(GroupedForwardParams& param, hipStream_t stream);
-
-extern template void run_grouped_infer_masktype_attnbias_dispatched<
-    ck::half_t,
-    2>(GroupedForwardParams& param, hipStream_t stream);
-
-void grouped_infer_fp16(GroupedForwardParams& param, hipStream_t stream) {
-  if (param.custom_mask_type == 0)
-    run_grouped_infer_masktype_attnbias_dispatched<ck::half_t, 0>(
-        param, stream);
-  else if (param.custom_mask_type == 1)
-    run_grouped_infer_masktype_attnbias_dispatched<ck::half_t, 1>(
-        param, stream);
-  else if (param.custom_mask_type == 2)
-    run_grouped_infer_masktype_attnbias_dispatched<ck::half_t, 2>(
-        param, stream);
-  else
-    throw std::runtime_error("Invalid custom_mask_type value");
+extern template void
+run_grouped_infer_masktype_attnbias_dispatched<ck::half_t, 0, true>(GroupedForwardParams& param,
+                                                                    hipStream_t stream);
+
+extern template void
+run_grouped_infer_masktype_attnbias_dispatched<ck::half_t, 0, false>(GroupedForwardParams& param,
+                                                                     hipStream_t stream);
+
+extern template void
+run_grouped_infer_masktype_attnbias_dispatched<ck::half_t, 1, true>(GroupedForwardParams& param,
+                                                                    hipStream_t stream);
+
+extern template void
+run_grouped_infer_masktype_attnbias_dispatched<ck::half_t, 1, false>(GroupedForwardParams& param,
+                                                                     hipStream_t stream);
+
+extern template void
+run_grouped_infer_masktype_attnbias_dispatched<ck::half_t, 2, true>(GroupedForwardParams& param,
+                                                                    hipStream_t stream);
+
+extern template void
+run_grouped_infer_masktype_attnbias_dispatched<ck::half_t, 2, false>(GroupedForwardParams& param,
+                                                                     hipStream_t stream);
+
+void grouped_infer_fp16(GroupedForwardParams& param, hipStream_t stream)
+{
+    BOOL_SWITCH_1(param.has_attn_bias, HAS_ATTN_BIAS, [&] {
+        if(param.custom_mask_type == 0)
+            run_grouped_infer_masktype_attnbias_dispatched<ck::half_t, 0, HAS_ATTN_BIAS>(param,
+                                                                                         stream);
+        else if(param.custom_mask_type == 1)
+            run_grouped_infer_masktype_attnbias_dispatched<ck::half_t, 1, HAS_ATTN_BIAS>(param,
+                                                                                         stream);
+        else if(param.custom_mask_type == 2)
+            run_grouped_infer_masktype_attnbias_dispatched<ck::half_t, 2, HAS_ATTN_BIAS>(param,
+                                                                                         stream);
+        else
+            throw std::runtime_error("Invalid custom_mask_type value");
+    });
 };
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_fp16_masktype_0.cpp b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_fp16_masktype_0.cpp
deleted file mode 100644
index 2915b07ed..000000000
--- a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_fp16_masktype_0.cpp
+++ /dev/null
@@ -1,7 +0,0 @@
-#include <ck/ck.hpp>
-
-#include "ck_tiled_fmha_batched_infer.h"
-
-template void run_batched_infer_masktype_attnbias_dispatched<ck::half_t, 0>(
-    BatchedForwardParams& param,
-    hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_fp16_masktype_0_no_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_fp16_masktype_0_no_attnbias.cpp
new file mode 100644
index 000000000..8f4c31ab3
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_fp16_masktype_0_no_attnbias.cpp
@@ -0,0 +1,7 @@
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_batched_infer.h"
+
+template void
+run_batched_infer_masktype_attnbias_dispatched<ck::half_t, 0, false>(BatchedForwardParams& param,
+                                                                     hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_fp16_masktype_0_with_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_fp16_masktype_0_with_attnbias.cpp
new file mode 100644
index 000000000..783fb5e16
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_fp16_masktype_0_with_attnbias.cpp
@@ -0,0 +1,7 @@
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_batched_infer.h"
+
+template void
+run_batched_infer_masktype_attnbias_dispatched<ck::half_t, 0, true>(BatchedForwardParams& param,
+                                                                    hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_fp16_masktype_1.cpp b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_fp16_masktype_1.cpp
deleted file mode 100644
index 8d7f2bbf8..000000000
--- a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_fp16_masktype_1.cpp
+++ /dev/null
@@ -1,7 +0,0 @@
-#include <ck/ck.hpp>
-
-#include "ck_tiled_fmha_batched_infer.h"
-
-template void run_batched_infer_masktype_attnbias_dispatched<ck::half_t, 1>(
-    BatchedForwardParams& param,
-    hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_fp16_masktype_1_no_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_fp16_masktype_1_no_attnbias.cpp
new file mode 100644
index 000000000..7be550de2
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_fp16_masktype_1_no_attnbias.cpp
@@ -0,0 +1,7 @@
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_batched_infer.h"
+
+template void
+run_batched_infer_masktype_attnbias_dispatched<ck::half_t, 1, false>(BatchedForwardParams& param,
+                                                                     hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_fp16_masktype_1_with_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_fp16_masktype_1_with_attnbias.cpp
new file mode 100644
index 000000000..9276ca53f
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_fp16_masktype_1_with_attnbias.cpp
@@ -0,0 +1,7 @@
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_batched_infer.h"
+
+template void
+run_batched_infer_masktype_attnbias_dispatched<ck::half_t, 1, true>(BatchedForwardParams& param,
+                                                                    hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_fp16_masktype_2.cpp b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_fp16_masktype_2.cpp
deleted file mode 100644
index b608b8939..000000000
--- a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_fp16_masktype_2.cpp
+++ /dev/null
@@ -1,7 +0,0 @@
-#include <ck/ck.hpp>
-
-#include "ck_tiled_fmha_batched_infer.h"
-
-template void run_batched_infer_masktype_attnbias_dispatched<ck::half_t, 2>(
-    BatchedForwardParams& param,
-    hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_fp16_masktype_2_no_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_fp16_masktype_2_no_attnbias.cpp
new file mode 100644
index 000000000..da3f5004e
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_fp16_masktype_2_no_attnbias.cpp
@@ -0,0 +1,7 @@
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_batched_infer.h"
+
+template void
+run_batched_infer_masktype_attnbias_dispatched<ck::half_t, 2, false>(BatchedForwardParams& param,
+                                                                     hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_fp16_masktype_2_with_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_fp16_masktype_2_with_attnbias.cpp
new file mode 100644
index 000000000..189d295d2
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_fp16_masktype_2_with_attnbias.cpp
@@ -0,0 +1,7 @@
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_batched_infer.h"
+
+template void
+run_batched_infer_masktype_attnbias_dispatched<ck::half_t, 2, true>(BatchedForwardParams& param,
+                                                                    hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_fp16_masktype_0.cpp b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_fp16_masktype_0.cpp
deleted file mode 100644
index 8117f8b58..000000000
--- a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_fp16_masktype_0.cpp
+++ /dev/null
@@ -1,7 +0,0 @@
-#include <ck/ck.hpp>
-
-#include "ck_tiled_fmha_grouped_infer.h"
-
-template void run_grouped_infer_masktype_attnbias_dispatched<ck::half_t, 0>(
-    GroupedForwardParams& param,
-    hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_fp16_masktype_0_no_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_fp16_masktype_0_no_attnbias.cpp
new file mode 100644
index 000000000..100150751
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_fp16_masktype_0_no_attnbias.cpp
@@ -0,0 +1,7 @@
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_grouped_infer.h"
+
+template void
+run_grouped_infer_masktype_attnbias_dispatched<ck::half_t, 0, false>(GroupedForwardParams& param,
+                                                                     hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_fp16_masktype_0_with_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_fp16_masktype_0_with_attnbias.cpp
new file mode 100644
index 000000000..3b323b7bb
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_fp16_masktype_0_with_attnbias.cpp
@@ -0,0 +1,7 @@
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_grouped_infer.h"
+
+template void
+run_grouped_infer_masktype_attnbias_dispatched<ck::half_t, 0, true>(GroupedForwardParams& param,
+                                                                    hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_fp16_masktype_1.cpp b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_fp16_masktype_1.cpp
deleted file mode 100644
index d1b93e583..000000000
--- a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_fp16_masktype_1.cpp
+++ /dev/null
@@ -1,7 +0,0 @@
-#include <ck/ck.hpp>
-
-#include "ck_tiled_fmha_grouped_infer.h"
-
-template void run_grouped_infer_masktype_attnbias_dispatched<ck::half_t, 1>(
-    GroupedForwardParams& param,
-    hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_fp16_masktype_1_no_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_fp16_masktype_1_no_attnbias.cpp
new file mode 100644
index 000000000..6fad32f78
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_fp16_masktype_1_no_attnbias.cpp
@@ -0,0 +1,7 @@
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_grouped_infer.h"
+
+template void
+run_grouped_infer_masktype_attnbias_dispatched<ck::half_t, 1, false>(GroupedForwardParams& param,
+                                                                     hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_fp16_masktype_1_with_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_fp16_masktype_1_with_attnbias.cpp
new file mode 100644
index 000000000..39646e941
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_fp16_masktype_1_with_attnbias.cpp
@@ -0,0 +1,7 @@
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_grouped_infer.h"
+
+template void
+run_grouped_infer_masktype_attnbias_dispatched<ck::half_t, 1, true>(GroupedForwardParams& param,
+                                                                    hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_fp16_masktype_2.cpp b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_fp16_masktype_2.cpp
deleted file mode 100644
index 246b90a77..000000000
--- a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_fp16_masktype_2.cpp
+++ /dev/null
@@ -1,7 +0,0 @@
-#include <ck/ck.hpp>
-
-#include "ck_tiled_fmha_grouped_infer.h"
-
-template void run_grouped_infer_masktype_attnbias_dispatched<ck::half_t, 2>(
-    GroupedForwardParams& param,
-    hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_fp16_masktype_2_no_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_fp16_masktype_2_no_attnbias.cpp
new file mode 100644
index 000000000..ba5384e43
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_fp16_masktype_2_no_attnbias.cpp
@@ -0,0 +1,7 @@
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_grouped_infer.h"
+
+template void
+run_grouped_infer_masktype_attnbias_dispatched<ck::half_t, 2, false>(GroupedForwardParams& param,
+                                                                     hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_fp16_masktype_2_with_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_fp16_masktype_2_with_attnbias.cpp
new file mode 100644
index 000000000..f6e4a4215
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_fp16_masktype_2_with_attnbias.cpp
@@ -0,0 +1,7 @@
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_grouped_infer.h"
+
+template void
+run_grouped_infer_masktype_attnbias_dispatched<ck::half_t, 2, true>(GroupedForwardParams& param,
+                                                                    hipStream_t stream);

From 516f2ed0dd3730fdc0b1f067d5f1b44037682c16 Mon Sep 17 00:00:00 2001
From: Qianfeng Zhang <Qianfeng.Zhang@amd.com>
Date: Tue, 5 Dec 2023 12:23:55 +0000
Subject: [PATCH 250/641] Fix bug in ck-tiled grouped-mode C++ extension

---
 .../attention_forward_generic_ck_tiled.cpp    |  2 +-
 .../hip_fmha/ck_tiled_fmha_forward_kernel.h   |  3 +++
 .../hip_fmha/ck_tiled_fmha_grouped_infer.h    | 26 ++++++++++---------
 3 files changed, 18 insertions(+), 13 deletions(-)

diff --git a/xformers/csrc/attention/hip_fmha/attention_forward_generic_ck_tiled.cpp b/xformers/csrc/attention/hip_fmha/attention_forward_generic_ck_tiled.cpp
index 0c87daa97..e392935ce 100644
--- a/xformers/csrc/attention/hip_fmha/attention_forward_generic_ck_tiled.cpp
+++ b/xformers/csrc/attention/hip_fmha/attention_forward_generic_ck_tiled.cpp
@@ -317,7 +317,7 @@ std::tuple<at::Tensor, at::Tensor, int64_t, int64_t> efficient_attention_forward
             p.seqlen_k_dev_ptr = dev_seqlen_k.data_ptr();
 
             HIP_CALL_CHECK(hipMemcpyAsync(p.seqlen_k_dev_ptr,
-                                          seqstart_k->data_ptr(),
+                                          seqlen_k->data_ptr(),
                                           p.num_batches * sizeof(int),
                                           hipMemcpyHostToDevice,
                                           stream));
diff --git a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_forward_kernel.h b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_forward_kernel.h
index 169458efe..41eb3f748 100644
--- a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_forward_kernel.h
+++ b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_forward_kernel.h
@@ -464,6 +464,9 @@ struct FmhaFwdKernel
                 const auto adjusted_seqstart_k_ptr = kargs.seqstart_k_ptr + i_batch;
                 kargs.seqlen_k = adjusted_seqstart_k_ptr[1] - adjusted_seqstart_k_ptr[0];
             }
+
+            if(i_m0 >= kargs.seqlen_q)
+                return;
         }
         else
         {
diff --git a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_infer.h b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_infer.h
index 4bac3a433..e1ad7b1a8 100644
--- a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_infer.h
+++ b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_infer.h
@@ -3,6 +3,7 @@
 #include <optional>
 #include <sstream>
 #include <stdexcept>
+#include <iostream>
 
 #include <ck/utility/common_header.hpp>
 #include <ck/host_utility/device_prop.hpp>
@@ -139,14 +140,14 @@ struct grouped_infer_masktype_attnbias_dispatched
                     param.K,  // hdim_q
                     param.Kv, // hdim_v
                     param.scale,
-                    param.q_strides[1], // q, k, v, out tensor seq-dim stride
+                    param.q_strides[0], // q, k, v, out tensor seq-dim stride
+                    param.k_strides[0],
+                    param.v_strides[0],
+                    param.out_strides[0],
+                    param.q_strides[1], // q, k, v, out tensor head-dim stride
                     param.k_strides[1],
                     param.v_strides[1],
                     param.out_strides[1],
-                    param.q_strides[2], // q, k, v, out tensor head-dim stride
-                    param.k_strides[2],
-                    param.v_strides[2],
-                    param.out_strides[2],
                     bias);
             }
             else
@@ -162,18 +163,19 @@ struct grouped_infer_masktype_attnbias_dispatched
                     param.K,  // hdim_q
                     param.Kv, // hdim_v
                     param.scale,
-                    param.q_strides[1], // q, k, v, out tensor seq-dim stride
+                    param.q_strides[0], // q, k, v, out tensor seq-dim stride
+                    param.k_strides[0],
+                    param.v_strides[0],
+                    param.out_strides[0],
+                    param.q_strides[1], // q, k, v, out tensor head-dim stride
                     param.k_strides[1],
                     param.v_strides[1],
-                    param.out_strides[1],
-                    param.q_strides[2], // q, k, v, out tensor head-dim stride
-                    param.k_strides[2],
-                    param.v_strides[2],
-                    param.out_strides[2]);
+                    param.out_strides[1]);
             };
         }();
 
-        dim3 kGridSize = FmhaKernel::GridSize(param.num_batches, param.Hq, param.M, param.Kv);
+        dim3 kGridSize =
+            FmhaKernel::GridSize(param.num_batches, param.Hq, param.max_seqlen_q, param.Kv);
         constexpr dim3 kBlockSize = FmhaKernel::BlockSize();
 
         constexpr ck::index_t kWarpPerCu    = 8; // 2 warps per SIMD

From af6964d577a5058301c5726b4a1ac2883c1f9d4e Mon Sep 17 00:00:00 2001
From: Qianfeng Zhang <Qianfeng.Zhang@amd.com>
Date: Tue, 5 Dec 2023 18:14:33 +0000
Subject: [PATCH 251/641] Synchronize with latest feature update from
 feature/fmah-pad-support branch

---
 third_party/composable_kernel_tiled | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/third_party/composable_kernel_tiled b/third_party/composable_kernel_tiled
index 08d9e56f2..ddce91a44 160000
--- a/third_party/composable_kernel_tiled
+++ b/third_party/composable_kernel_tiled
@@ -1 +1 @@
-Subproject commit 08d9e56f2e321016934fb0c44673af4c0754171f
+Subproject commit ddce91a44b2da6eb74e7e3d7bf14b54930719983

From ee53b8314c3a8f2e2e38a9e9a010b984a61dd0ac Mon Sep 17 00:00:00 2001
From: Qianfeng Zhang <Qianfeng.Zhang@amd.com>
Date: Wed, 6 Dec 2023 12:20:42 +0000
Subject: [PATCH 252/641] Synchronize the latest third_party/composable_kernel
 and update .gitmodules

---
 .gitmodules                   | 4 ----
 third_party/composable_kernel | 2 +-
 2 files changed, 1 insertion(+), 5 deletions(-)

diff --git a/.gitmodules b/.gitmodules
index bf2678053..94eb8135c 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -8,7 +8,3 @@
 [submodule "third_party/flash-attention"]
 	path = third_party/flash-attention
 	url = https://github.com/Dao-AILab/flash-attention.git
-[submodule "third_party/composable_kernel_tiled"]
-	path = third_party/composable_kernel_tiled
-	url = https://github.com/asroy/ck_tile
-        branch = feature/fmha-pad-support
diff --git a/third_party/composable_kernel b/third_party/composable_kernel
index 2f93e26f5..5f4e6ec00 160000
--- a/third_party/composable_kernel
+++ b/third_party/composable_kernel
@@ -1 +1 @@
-Subproject commit 2f93e26f55ce0e9839c358c0c713ce8eb3db38a2
+Subproject commit 5f4e6ec00d12654e3897f53b48307434cd25a02f

From a816112d076e33c0c702fcd3e2f1bb64c37ece37 Mon Sep 17 00:00:00 2001
From: Qianfeng Zhang <Qianfeng.Zhang@amd.com>
Date: Wed, 6 Dec 2023 14:38:25 +0000
Subject: [PATCH 253/641] Add license declaration and re-format with
 clang-format-10

---
 .../hip_fmha/attention_backward_generic.cpp   | 975 +++++++++---------
 .../hip_fmha/attention_ck_rand_uniform.cpp    | 173 ++--
 .../hip_fmha/attention_forward_decoder.cpp    | 458 ++++----
 .../hip_fmha/attention_forward_generic.cpp    | 729 ++++++-------
 .../attention_forward_generic_ck_tiled.cpp    |   6 +
 .../csrc/attention/hip_fmha/ck_align_switch.h | 298 +++---
 .../hip_fmha/ck_attention_forward_decoder.h   | 880 ++++++++--------
 .../csrc/attention/hip_fmha/ck_bool_switch.h  |  50 +-
 .../ck_fmha_backward_gemm_constants.h         | 350 ++++---
 .../hip_fmha/ck_fmha_batched_backward.h       | 661 ++++++------
 .../ck_fmha_batched_backward_bp16.cpp         | 143 +--
 .../ck_fmha_batched_backward_fp16.cpp         | 140 +--
 .../hip_fmha/ck_fmha_batched_forward.h        | 520 +++++-----
 .../hip_fmha/ck_fmha_batched_forward_bp16.cpp |  95 +-
 .../hip_fmha/ck_fmha_batched_forward_fp16.cpp |  95 +-
 .../hip_fmha/ck_fmha_batched_infer.h          | 488 +++++----
 .../hip_fmha/ck_fmha_batched_infer_bp16.cpp   |  95 +-
 .../hip_fmha/ck_fmha_batched_infer_fp16.cpp   |  95 +-
 .../hip_fmha/ck_fmha_common_gemm_constants.h  |  34 +-
 .../hip_fmha/ck_fmha_forward_gemm_constants.h |   6 +
 .../hip_fmha/ck_fmha_grouped_backward.h       | 678 ++++++------
 .../ck_fmha_grouped_backward_bp16.cpp         | 149 ++-
 .../ck_fmha_grouped_backward_fp16.cpp         | 146 ++-
 .../hip_fmha/ck_fmha_grouped_forward.h        | 534 +++++-----
 .../hip_fmha/ck_fmha_grouped_forward_bp16.cpp |  95 +-
 .../hip_fmha/ck_fmha_grouped_forward_fp16.cpp |  95 +-
 .../hip_fmha/ck_fmha_grouped_infer.h          | 509 ++++-----
 .../hip_fmha/ck_fmha_grouped_infer_bp16.cpp   |  95 +-
 .../hip_fmha/ck_fmha_grouped_infer_fp16.cpp   |  95 +-
 .../hip_fmha/ck_fmha_infer_gemm_constants.h   |   6 +
 .../attention/hip_fmha/ck_fmha_op_helper.h    |  45 +-
 .../csrc/attention/hip_fmha/ck_fmha_params.h  | 382 +++----
 .../csrc/attention/hip_fmha/ck_fmha_test.cpp  |  23 +-
 .../csrc/attention/hip_fmha/ck_fmha_util.h    | 224 ++--
 .../hip_fmha/ck_tiled_fmha_batched_infer.h    |   6 +
 .../ck_tiled_fmha_batched_infer_fp16.cpp      |   6 +
 .../hip_fmha/ck_tiled_fmha_definitions.h      |   6 +
 .../hip_fmha/ck_tiled_fmha_forward_kernel.h   |   6 +
 .../hip_fmha/ck_tiled_fmha_fwd_epilogue.h     |   6 +
 .../ck_tiled_fmha_fwd_tile_partitioner.h      |   6 +
 .../hip_fmha/ck_tiled_fmha_grouped_infer.h    |   6 +
 .../ck_tiled_fmha_grouped_infer_fp16.cpp      |   6 +
 .../attention/hip_fmha/ck_tiled_fmha_params.h | 368 +++----
 ...d_backward_bp16_masktype_0_no_attnbias.cpp |  13 +-
 ..._bp16_masktype_0_no_attnbias_fp32_grad.cpp |  13 +-
 ...backward_bp16_masktype_0_with_attnbias.cpp |  13 +-
 ...p16_masktype_0_with_attnbias_fp32_grad.cpp |  13 +-
 ...d_backward_bp16_masktype_1_no_attnbias.cpp |  13 +-
 ..._bp16_masktype_1_no_attnbias_fp32_grad.cpp |  13 +-
 ...backward_bp16_masktype_1_with_attnbias.cpp |  13 +-
 ...p16_masktype_1_with_attnbias_fp32_grad.cpp |  13 +-
 ...d_backward_bp16_masktype_2_no_attnbias.cpp |  13 +-
 ..._bp16_masktype_2_no_attnbias_fp32_grad.cpp |  13 +-
 ...backward_bp16_masktype_2_with_attnbias.cpp |  13 +-
 ...p16_masktype_2_with_attnbias_fp32_grad.cpp |  13 +-
 ...d_backward_fp16_masktype_0_no_attnbias.cpp |  13 +-
 ..._fp16_masktype_0_no_attnbias_fp32_grad.cpp |  13 +-
 ...backward_fp16_masktype_0_with_attnbias.cpp |  13 +-
 ...p16_masktype_0_with_attnbias_fp32_grad.cpp |  13 +-
 ...d_backward_fp16_masktype_1_no_attnbias.cpp |  13 +-
 ..._fp16_masktype_1_no_attnbias_fp32_grad.cpp |  13 +-
 ...backward_fp16_masktype_1_with_attnbias.cpp |  13 +-
 ...p16_masktype_1_with_attnbias_fp32_grad.cpp |  13 +-
 ...d_backward_fp16_masktype_2_no_attnbias.cpp |  13 +-
 ..._fp16_masktype_2_no_attnbias_fp32_grad.cpp |  13 +-
 ...backward_fp16_masktype_2_with_attnbias.cpp |  13 +-
 ...p16_masktype_2_with_attnbias_fp32_grad.cpp |  13 +-
 ...ed_forward_bp16_masktype_0_no_attnbias.cpp |  13 +-
 ..._forward_bp16_masktype_0_with_attnbias.cpp |  13 +-
 ...ed_forward_bp16_masktype_1_no_attnbias.cpp |  13 +-
 ..._forward_bp16_masktype_1_with_attnbias.cpp |  13 +-
 ...ed_forward_bp16_masktype_2_no_attnbias.cpp |  13 +-
 ..._forward_bp16_masktype_2_with_attnbias.cpp |  13 +-
 ...ed_forward_fp16_masktype_0_no_attnbias.cpp |  13 +-
 ..._forward_fp16_masktype_0_with_attnbias.cpp |  13 +-
 ...ed_forward_fp16_masktype_1_no_attnbias.cpp |  13 +-
 ..._forward_fp16_masktype_1_with_attnbias.cpp |  13 +-
 ...ed_forward_fp16_masktype_2_no_attnbias.cpp |  13 +-
 ..._forward_fp16_masktype_2_with_attnbias.cpp |  13 +-
 ...ched_infer_bp16_masktype_0_no_attnbias.cpp |  13 +-
 ...ed_infer_bp16_masktype_0_with_attnbias.cpp |  13 +-
 ...ched_infer_bp16_masktype_1_no_attnbias.cpp |  13 +-
 ...ed_infer_bp16_masktype_1_with_attnbias.cpp |  13 +-
 ...ched_infer_bp16_masktype_2_no_attnbias.cpp |  13 +-
 ...ed_infer_bp16_masktype_2_with_attnbias.cpp |  13 +-
 ...ched_infer_fp16_masktype_0_no_attnbias.cpp |  13 +-
 ...ed_infer_fp16_masktype_0_with_attnbias.cpp |  13 +-
 ...ched_infer_fp16_masktype_1_no_attnbias.cpp |  13 +-
 ...ed_infer_fp16_masktype_1_with_attnbias.cpp |  13 +-
 ...ched_infer_fp16_masktype_2_no_attnbias.cpp |  13 +-
 ...ed_infer_fp16_masktype_2_with_attnbias.cpp |  13 +-
 ...d_backward_bp16_masktype_0_no_attnbias.cpp |  13 +-
 ..._bp16_masktype_0_no_attnbias_fp32_grad.cpp |  13 +-
 ...backward_bp16_masktype_0_with_attnbias.cpp |  13 +-
 ...p16_masktype_0_with_attnbias_fp32_grad.cpp |  13 +-
 ...d_backward_bp16_masktype_1_no_attnbias.cpp |  13 +-
 ..._bp16_masktype_1_no_attnbias_fp32_grad.cpp |  13 +-
 ...backward_bp16_masktype_1_with_attnbias.cpp |  13 +-
 ...p16_masktype_1_with_attnbias_fp32_grad.cpp |  13 +-
 ...d_backward_bp16_masktype_2_no_attnbias.cpp |  13 +-
 ..._bp16_masktype_2_no_attnbias_fp32_grad.cpp |  13 +-
 ...backward_bp16_masktype_2_with_attnbias.cpp |  13 +-
 ...p16_masktype_2_with_attnbias_fp32_grad.cpp |  13 +-
 ...d_backward_fp16_masktype_0_no_attnbias.cpp |  13 +-
 ..._fp16_masktype_0_no_attnbias_fp32_grad.cpp |  13 +-
 ...backward_fp16_masktype_0_with_attnbias.cpp |  13 +-
 ...p16_masktype_0_with_attnbias_fp32_grad.cpp |  13 +-
 ...d_backward_fp16_masktype_1_no_attnbias.cpp |  13 +-
 ..._fp16_masktype_1_no_attnbias_fp32_grad.cpp |  13 +-
 ...backward_fp16_masktype_1_with_attnbias.cpp |  13 +-
 ...p16_masktype_1_with_attnbias_fp32_grad.cpp |  13 +-
 ...d_backward_fp16_masktype_2_no_attnbias.cpp |  13 +-
 ..._fp16_masktype_2_no_attnbias_fp32_grad.cpp |  13 +-
 ...backward_fp16_masktype_2_with_attnbias.cpp |  13 +-
 ...p16_masktype_2_with_attnbias_fp32_grad.cpp |  13 +-
 ...ed_forward_bp16_masktype_0_no_attnbias.cpp |  13 +-
 ..._forward_bp16_masktype_0_with_attnbias.cpp |  13 +-
 ...ed_forward_bp16_masktype_1_no_attnbias.cpp |  13 +-
 ..._forward_bp16_masktype_1_with_attnbias.cpp |  13 +-
 ...ed_forward_bp16_masktype_2_no_attnbias.cpp |  13 +-
 ..._forward_bp16_masktype_2_with_attnbias.cpp |  13 +-
 ...ed_forward_fp16_masktype_0_no_attnbias.cpp |  13 +-
 ..._forward_fp16_masktype_0_with_attnbias.cpp |  13 +-
 ...ed_forward_fp16_masktype_1_no_attnbias.cpp |  13 +-
 ..._forward_fp16_masktype_1_with_attnbias.cpp |  13 +-
 ...ed_forward_fp16_masktype_2_no_attnbias.cpp |  13 +-
 ..._forward_fp16_masktype_2_with_attnbias.cpp |  13 +-
 ...uped_infer_bp16_masktype_0_no_attnbias.cpp |  13 +-
 ...ed_infer_bp16_masktype_0_with_attnbias.cpp |  13 +-
 ...uped_infer_bp16_masktype_1_no_attnbias.cpp |  13 +-
 ...ed_infer_bp16_masktype_1_with_attnbias.cpp |  13 +-
 ...uped_infer_bp16_masktype_2_no_attnbias.cpp |  13 +-
 ...ed_infer_bp16_masktype_2_with_attnbias.cpp |  13 +-
 ...uped_infer_fp16_masktype_0_no_attnbias.cpp |  13 +-
 ...ed_infer_fp16_masktype_0_with_attnbias.cpp |  13 +-
 ...uped_infer_fp16_masktype_1_no_attnbias.cpp |  13 +-
 ...ed_infer_fp16_masktype_1_with_attnbias.cpp |  13 +-
 ...uped_infer_fp16_masktype_2_no_attnbias.cpp |  13 +-
 ...ed_infer_fp16_masktype_2_with_attnbias.cpp |  13 +-
 ...ched_infer_fp16_masktype_0_no_attnbias.cpp |   6 +
 ...ed_infer_fp16_masktype_0_with_attnbias.cpp |   6 +
 ...ched_infer_fp16_masktype_1_no_attnbias.cpp |   6 +
 ...ed_infer_fp16_masktype_1_with_attnbias.cpp |   6 +
 ...ched_infer_fp16_masktype_2_no_attnbias.cpp |   6 +
 ...ed_infer_fp16_masktype_2_with_attnbias.cpp |   6 +
 ...uped_infer_fp16_masktype_0_no_attnbias.cpp |   6 +
 ...ed_infer_fp16_masktype_0_with_attnbias.cpp |   6 +
 ...uped_infer_fp16_masktype_1_no_attnbias.cpp |   6 +
 ...ed_infer_fp16_masktype_1_with_attnbias.cpp |   6 +
 ...uped_infer_fp16_masktype_2_no_attnbias.cpp |   6 +
 ...ed_infer_fp16_masktype_2_with_attnbias.cpp |   6 +
 151 files changed, 5789 insertions(+), 5314 deletions(-)

diff --git a/xformers/csrc/attention/hip_fmha/attention_backward_generic.cpp b/xformers/csrc/attention/hip_fmha/attention_backward_generic.cpp
index c513664f2..282b9aabd 100644
--- a/xformers/csrc/attention/hip_fmha/attention_backward_generic.cpp
+++ b/xformers/csrc/attention/hip_fmha/attention_backward_generic.cpp
@@ -1,3 +1,9 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
 #include <cmath>
 #include <cstdlib>
 
@@ -11,23 +17,14 @@
 #include "ck_fmha_params.h"
 #include "ck_fmha_util.h"
 
-extern void batched_backward_fp16(
-    BatchedBackwardParams& param,
-    hipStream_t stream);
-extern void batched_backward_bp16(
-    BatchedBackwardParams& param,
-    hipStream_t stream);
-extern void grouped_backward_fp16(
-    GroupedBackwardParams& param,
-    hipStream_t stream);
-extern void grouped_backward_bp16(
-    GroupedBackwardParams& param,
-    hipStream_t stream);
+extern void batched_backward_fp16(BatchedBackwardParams& param, hipStream_t stream);
+extern void batched_backward_bp16(BatchedBackwardParams& param, hipStream_t stream);
+extern void grouped_backward_fp16(GroupedBackwardParams& param, hipStream_t stream);
+extern void grouped_backward_bp16(GroupedBackwardParams& param, hipStream_t stream);
 
 namespace {
 
-std::tuple<at::Tensor, at::Tensor, at::Tensor, at::Tensor>
-efficient_attention_backward_ck(
+std::tuple<at::Tensor, at::Tensor, at::Tensor, at::Tensor> efficient_attention_backward_ck(
     const at::Tensor& grad_out,
     const at::Tensor& query,
     const at::Tensor& key,
@@ -44,523 +41,527 @@ efficient_attention_backward_ck(
     const c10::optional<at::Tensor>& seqlen_k,
     const at::Tensor& logsumexp,
     const at::Tensor& out,
-    double dropout_p, // dropout probability
-    int64_t rng_seed, // seed using for generating random numbers for dropout
+    double dropout_p,   // dropout probability
+    int64_t rng_seed,   // seed using for generating random numbers for dropout
     int64_t rng_offset, // offset into random number sequence
     int64_t custom_mask_type,
-    const c10::optional<double> scale) {
+    const c10::optional<double> scale)
+{
 #ifdef XFORMERS_MEM_EFF_ATTENTION_DISABLE_BACKWARD
-  TORCH_CHECK(
-      false,
-      "MemoryEfficient build has been disabled at build time with -DXFORMERS_MEM_EFF_ATTENTION_DISABLE_BACKWARD");
+    TORCH_CHECK(false,
+                "MemoryEfficient build has been disabled at build time with "
+                "-DXFORMERS_MEM_EFF_ATTENTION_DISABLE_BACKWARD");
 #else
-  at::globalContext().alertNotDeterministic(
-      "mem_efficient_attention_backward_cutlass");
-
-  // ndim
-  TORCH_CHECK(query.dim() == grad_out.dim());
-  TORCH_CHECK(query.dim() == key.dim());
-  TORCH_CHECK(query.dim() == value.dim());
-  TORCH_CHECK(query.dim() == 4);
-
-  // batch size
-  TORCH_CHECK(query.size(0) == grad_out.size(0));
-  TORCH_CHECK(query.size(0) == key.size(0));
-  TORCH_CHECK(query.size(0) == value.size(0));
-
-  // seqlen
-  TORCH_CHECK(key.size(1) == value.size(1));
-  TORCH_CHECK(query.size(1) == grad_out.size(1));
-
-  // Num heads
-  TORCH_CHECK(query.size(2) % key.size(2) == 0);
-  TORCH_CHECK(key.size(2) == value.size(2));
-  TORCH_CHECK(query.size(2) == grad_out.size(2));
-
-  // Embedding per head
-  TORCH_CHECK(query.size(3) == key.size(3));
-  TORCH_CHECK(value.size(3) == grad_out.size(3));
-
-  // CK-FlashAttn requires out, grad_out to have same shapes
-  TORCH_CHECK(out.sizes() == grad_out.sizes());
-  TORCH_CHECK(out.strides() == grad_out.strides());
-
-  // last dim is contiguous, device is CUDA
-  CHECK_NOSPARSE_LASTCONTIGUOUS_CUDA(grad_out);
-  CHECK_NOSPARSE_LASTCONTIGUOUS_CUDA(query);
-  CHECK_NOSPARSE_LASTCONTIGUOUS_CUDA(key);
-  CHECK_NOSPARSE_LASTCONTIGUOUS_CUDA(value);
-
-  // logsumexp should be completely contiguous
-  CHECK_NOSPARSE_CONTIGUOUS_CUDA(logsumexp);
-
-  TORCH_CHECK(seqstart_q.has_value() == seqstart_k.has_value());
-  TORCH_CHECK(
-      !(seqstart_q.has_value() && bias.has_value()),
-      "seqstart_q + bias not supported");
-
-  if (seqstart_q.has_value()) {
-    TORCH_CHECK(seqstart_q->scalar_type() == at::ScalarType::Int);
-    TORCH_CHECK(seqstart_k->scalar_type() == at::ScalarType::Int);
-    TORCH_CHECK(seqstart_q->dim() == 1 && seqstart_k->dim() == 1);
-    CHECK_NOSPARSE_CONTIGUOUS_CPU((*seqstart_q));
-    CHECK_NOSPARSE_CONTIGUOUS_CPU((*seqstart_k));
-    TORCH_CHECK(seqstart_q->size(0) == seqstart_k->size(0));
-    TORCH_CHECK(query.size(0) == 1, "seqstart_q only supports batch_size=1");
-    TORCH_CHECK(max_seqlen_q_.has_value());
-  }
-
-  bool use_fp32_qkv_grad = false;
-
-  if (const char* env_str = std::getenv("USE_FP32_QKV_GRAD")) {
-    use_fp32_qkv_grad = (std::stoi(env_str) > 0) ? true : false;
-  };
-
-  // at::cuda::CUDAGuard device_guard(query.device());
-  hipStream_t stream = at::cuda::getCurrentHIPStream().stream();
-
-  int64_t B = query.size(0);
-  int64_t M = query.size(1);
-  int64_t N = key.size(1);
-  int64_t Hq = query.size(2);
-  int64_t Hkv = key.size(2);
-  int64_t K = query.size(3);
-  int64_t Kv = value.size(3);
-
-  auto opts = query.options();
-
-  at::Tensor grad_q, grad_k, grad_v, grad_bias;
-
-  if (query.size(1) == key.size(1) && query.size(3) == value.size(3) &&
-      query.size(2) == key.size(2) &&
-      query.storage().is_alias_of(key.storage()) &&
-      query.storage().is_alias_of(value.storage())) {
-    // Create one big contiguous chunk for grad_q, grad_k, grad_v
-    // This is because q, k and v usually come from a single
-    // output of a linear layer that is chunked.
-    // Creating the gradients with the right layout saves us
-    // a `torch.cat` call in the backward pass
-    at::Tensor chunk;
-    if (use_fp32_qkv_grad)
-      chunk = at::empty({B, M, 3, Hq, K}, opts.dtype(at::kFloat));
-    else
-      chunk = at::empty({B, M, 3, Hq, K}, opts);
-    grad_q = chunk.select(2, 0);
-    grad_k = chunk.select(2, 1);
-    grad_v = chunk.select(2, 2);
-    grad_q.fill_(0);
-  } else if (
-      key.size(3) == value.size(3) &&
-      key.storage().is_alias_of(value.storage())) {
-    // Create one big contiguous chunk for grad_k, grad_v
-    // This is because k and v usually come from a single
-    // output of a linear layer that is chunked.
-    // Creating the gradients with the right layout saves us
-    // a `torch.cat` call in the backward pass
-    at::Tensor chunk;
-    if (use_fp32_qkv_grad)
-      chunk = at::empty({B, N, 2, Hkv, Kv}, opts.dtype(at::kFloat));
-    else
-      chunk = at::empty({B, N, 2, Hkv, Kv}, opts);
-    grad_k = chunk.select(2, 0);
-    grad_v = chunk.select(2, 1);
+    at::globalContext().alertNotDeterministic("mem_efficient_attention_backward_cutlass");
+
+    // ndim
+    TORCH_CHECK(query.dim() == grad_out.dim());
+    TORCH_CHECK(query.dim() == key.dim());
+    TORCH_CHECK(query.dim() == value.dim());
+    TORCH_CHECK(query.dim() == 4);
+
+    // batch size
+    TORCH_CHECK(query.size(0) == grad_out.size(0));
+    TORCH_CHECK(query.size(0) == key.size(0));
+    TORCH_CHECK(query.size(0) == value.size(0));
+
+    // seqlen
+    TORCH_CHECK(key.size(1) == value.size(1));
+    TORCH_CHECK(query.size(1) == grad_out.size(1));
+
+    // Num heads
+    TORCH_CHECK(query.size(2) % key.size(2) == 0);
+    TORCH_CHECK(key.size(2) == value.size(2));
+    TORCH_CHECK(query.size(2) == grad_out.size(2));
+
+    // Embedding per head
+    TORCH_CHECK(query.size(3) == key.size(3));
+    TORCH_CHECK(value.size(3) == grad_out.size(3));
+
+    // CK-FlashAttn requires out, grad_out to have same shapes
+    TORCH_CHECK(out.sizes() == grad_out.sizes());
+    TORCH_CHECK(out.strides() == grad_out.strides());
+
+    // last dim is contiguous, device is CUDA
+    CHECK_NOSPARSE_LASTCONTIGUOUS_CUDA(grad_out);
+    CHECK_NOSPARSE_LASTCONTIGUOUS_CUDA(query);
+    CHECK_NOSPARSE_LASTCONTIGUOUS_CUDA(key);
+    CHECK_NOSPARSE_LASTCONTIGUOUS_CUDA(value);
+
+    // logsumexp should be completely contiguous
+    CHECK_NOSPARSE_CONTIGUOUS_CUDA(logsumexp);
+
+    TORCH_CHECK(seqstart_q.has_value() == seqstart_k.has_value());
+    TORCH_CHECK(!(seqstart_q.has_value() && bias.has_value()), "seqstart_q + bias not supported");
+
+    if(seqstart_q.has_value())
+    {
+        TORCH_CHECK(seqstart_q->scalar_type() == at::ScalarType::Int);
+        TORCH_CHECK(seqstart_k->scalar_type() == at::ScalarType::Int);
+        TORCH_CHECK(seqstart_q->dim() == 1 && seqstart_k->dim() == 1);
+        CHECK_NOSPARSE_CONTIGUOUS_CPU((*seqstart_q));
+        CHECK_NOSPARSE_CONTIGUOUS_CPU((*seqstart_k));
+        TORCH_CHECK(seqstart_q->size(0) == seqstart_k->size(0));
+        TORCH_CHECK(query.size(0) == 1, "seqstart_q only supports batch_size=1");
+        TORCH_CHECK(max_seqlen_q_.has_value());
+    }
 
-    if (use_fp32_qkv_grad)
-      grad_q = at::empty_strided(
-          query.sizes(), query.strides(), query.options().dtype(at::kFloat));
-    else
-      grad_q =
-          at::empty_strided(query.sizes(), query.strides(), query.options());
-    grad_q.fill_(0);
-  } else {
-    if (use_fp32_qkv_grad) {
-      grad_q = at::empty_strided(
-          query.sizes(), query.strides(), query.options().dtype(at::kFloat));
-      grad_k = at::empty_strided(
-          key.sizes(), key.strides(), key.options().dtype(at::kFloat));
-      grad_v = at::empty_strided(
-          value.sizes(), value.strides(), value.options().dtype(at::kFloat));
-    } else {
-      grad_q =
-          at::empty_strided(query.sizes(), query.strides(), query.options());
-      grad_k = at::empty_strided(key.sizes(), key.strides(), key.options());
-      grad_v =
-          at::empty_strided(value.sizes(), value.strides(), value.options());
+    bool use_fp32_qkv_grad = false;
+
+    if(const char* env_str = std::getenv("USE_FP32_QKV_GRAD"))
+    {
+        use_fp32_qkv_grad = (std::stoi(env_str) > 0) ? true : false;
+    };
+
+    // at::cuda::CUDAGuard device_guard(query.device());
+    hipStream_t stream = at::cuda::getCurrentHIPStream().stream();
+
+    int64_t B   = query.size(0);
+    int64_t M   = query.size(1);
+    int64_t N   = key.size(1);
+    int64_t Hq  = query.size(2);
+    int64_t Hkv = key.size(2);
+    int64_t K   = query.size(3);
+    int64_t Kv  = value.size(3);
+
+    auto opts = query.options();
+
+    at::Tensor grad_q, grad_k, grad_v, grad_bias;
+
+    if(query.size(1) == key.size(1) && query.size(3) == value.size(3) &&
+       query.size(2) == key.size(2) && query.storage().is_alias_of(key.storage()) &&
+       query.storage().is_alias_of(value.storage()))
+    {
+        // Create one big contiguous chunk for grad_q, grad_k, grad_v
+        // This is because q, k and v usually come from a single
+        // output of a linear layer that is chunked.
+        // Creating the gradients with the right layout saves us
+        // a `torch.cat` call in the backward pass
+        at::Tensor chunk;
+        if(use_fp32_qkv_grad)
+            chunk = at::empty({B, M, 3, Hq, K}, opts.dtype(at::kFloat));
+        else
+            chunk = at::empty({B, M, 3, Hq, K}, opts);
+        grad_q = chunk.select(2, 0);
+        grad_k = chunk.select(2, 1);
+        grad_v = chunk.select(2, 2);
+        grad_q.fill_(0);
     }
-    grad_q.fill_(0);
-  }
-
-  // CK-FlashAttn requires q/k/v to have same shapes with dQ/dK/dV respectively
-  TORCH_CHECK(query.sizes() == grad_q.sizes());
-  TORCH_CHECK(query.strides() == grad_q.strides());
-  TORCH_CHECK(key.sizes() == grad_k.sizes());
-  TORCH_CHECK(key.strides() == grad_k.strides());
-  TORCH_CHECK(value.sizes() == grad_v.sizes());
-  TORCH_CHECK(value.strides() == grad_v.strides());
-
-  const bool bias_requires_grad = bias.has_value() && bias->requires_grad();
-
-  // even it is an output, the grad_bias is required to use the same data-type
-  // as bias in CK-FlashAttn
-  if (bias_requires_grad)
-    grad_bias =
-        at::empty_strided(bias->sizes(), bias->strides(), bias->options());
-
-  bool is_mqa_gqa = (Hq > Hkv);
-
-  at::Tensor tmp_grad_k, tmp_grad_v;
-
-  if (is_mqa_gqa) {
-    // allocate tmp_grad_k/tmp_grad_v which will be reduce to
-    // grad_k/grad_v for returning
-    if (use_fp32_qkv_grad) {
-      tmp_grad_k = at::empty({B, N, Hq, K}, opts.dtype(at::kFloat));
-      tmp_grad_v = at::empty({B, N, Hq, Kv}, opts.dtype(at::kFloat));
-    } else {
-      tmp_grad_k = at::empty({B, N, Hq, K}, opts);
-      tmp_grad_v = at::empty({B, N, Hq, Kv}, opts);
+    else if(key.size(3) == value.size(3) && key.storage().is_alias_of(value.storage()))
+    {
+        // Create one big contiguous chunk for grad_k, grad_v
+        // This is because k and v usually come from a single
+        // output of a linear layer that is chunked.
+        // Creating the gradients with the right layout saves us
+        // a `torch.cat` call in the backward pass
+        at::Tensor chunk;
+        if(use_fp32_qkv_grad)
+            chunk = at::empty({B, N, 2, Hkv, Kv}, opts.dtype(at::kFloat));
+        else
+            chunk = at::empty({B, N, 2, Hkv, Kv}, opts);
+        grad_k = chunk.select(2, 0);
+        grad_v = chunk.select(2, 1);
+
+        if(use_fp32_qkv_grad)
+            grad_q = at::empty_strided(
+                query.sizes(), query.strides(), query.options().dtype(at::kFloat));
+        else
+            grad_q = at::empty_strided(query.sizes(), query.strides(), query.options());
+        grad_q.fill_(0);
     }
-  }
-
-  auto set_batched_backward_params = [&](BatchedBackwardParams& p) {
-    p.B = B;
-    p.M = M;
-    p.N = N;
-    p.Hq = Hq;
-    p.Hkv = Hkv;
-    p.K = K;
-    p.Kv = Kv;
-
-    p.use_fp32_qkv_grad = use_fp32_qkv_grad;
-    p.is_mqa_gqa = is_mqa_gqa;
-
-    TORCH_CHECK(p.B == logsumexp.size(0));
-    TORCH_CHECK(p.Hq == logsumexp.size(1));
-    TORCH_CHECK(p.M == logsumexp.size(2));
-
-    if (scale.has_value()) {
-      p.scale = float(*scale);
-    } else {
-      p.scale = float(1.0 / std::sqrt(float(K)));
+    else
+    {
+        if(use_fp32_qkv_grad)
+        {
+            grad_q = at::empty_strided(
+                query.sizes(), query.strides(), query.options().dtype(at::kFloat));
+            grad_k = at::empty_strided(key.sizes(), key.strides(), key.options().dtype(at::kFloat));
+            grad_v = at::empty_strided(
+                value.sizes(), value.strides(), value.options().dtype(at::kFloat));
+        }
+        else
+        {
+            grad_q = at::empty_strided(query.sizes(), query.strides(), query.options());
+            grad_k = at::empty_strided(key.sizes(), key.strides(), key.options());
+            grad_v = at::empty_strided(value.sizes(), value.strides(), value.options());
+        }
+        grad_q.fill_(0);
     }
 
-    p.q_ptr = query.data_ptr();
-    p.k_ptr = key.data_ptr();
-    p.v_ptr = value.data_ptr();
-    p.grad_out_ptr = grad_out.data_ptr();
-    p.out_ptr = out.data_ptr();
-
-    p.grad_q_ptr = grad_q.data_ptr();
-    p.grad_k_ptr = is_mqa_gqa ? tmp_grad_k.data_ptr() : grad_k.data_ptr();
-    p.grad_v_ptr = is_mqa_gqa ? tmp_grad_v.data_ptr() : grad_v.data_ptr();
-
-    p.q_strides = {
-        static_cast<int>(query.stride(0)),
-        static_cast<int>(query.stride(1)),
-        static_cast<int>(query.stride(2)),
-        static_cast<int>(query.stride(3))};
-    p.k_strides = {
-        static_cast<int>(key.stride(0)),
-        static_cast<int>(key.stride(1)),
-        static_cast<int>(key.stride(2)),
-        static_cast<int>(key.stride(3))};
-    p.v_strides = {
-        static_cast<int>(value.stride(0)),
-        static_cast<int>(value.stride(1)),
-        static_cast<int>(value.stride(2)),
-        static_cast<int>(value.stride(3))};
-    p.out_strides = {
-        static_cast<int>(out.stride(0)),
-        static_cast<int>(out.stride(1)),
-        static_cast<int>(out.stride(2)),
-        static_cast<int>(out.stride(3))};
-
-    if (is_mqa_gqa) {
-      p.tmp_grad_k_strides = {
-          static_cast<int>(tmp_grad_k.stride(0)),
-          static_cast<int>(tmp_grad_k.stride(1)),
-          static_cast<int>(tmp_grad_k.stride(2)),
-          static_cast<int>(tmp_grad_k.stride(3))};
-      p.tmp_grad_v_strides = {
-          static_cast<int>(tmp_grad_v.stride(0)),
-          static_cast<int>(tmp_grad_v.stride(1)),
-          static_cast<int>(tmp_grad_v.stride(2)),
-          static_cast<int>(tmp_grad_v.stride(3))};
+    // CK-FlashAttn requires q/k/v to have same shapes with dQ/dK/dV respectively
+    TORCH_CHECK(query.sizes() == grad_q.sizes());
+    TORCH_CHECK(query.strides() == grad_q.strides());
+    TORCH_CHECK(key.sizes() == grad_k.sizes());
+    TORCH_CHECK(key.strides() == grad_k.strides());
+    TORCH_CHECK(value.sizes() == grad_v.sizes());
+    TORCH_CHECK(value.strides() == grad_v.strides());
+
+    const bool bias_requires_grad = bias.has_value() && bias->requires_grad();
+
+    // even it is an output, the grad_bias is required to use the same data-type
+    // as bias in CK-FlashAttn
+    if(bias_requires_grad)
+        grad_bias = at::empty_strided(bias->sizes(), bias->strides(), bias->options());
+
+    bool is_mqa_gqa = (Hq > Hkv);
+
+    at::Tensor tmp_grad_k, tmp_grad_v;
+
+    if(is_mqa_gqa)
+    {
+        // allocate tmp_grad_k/tmp_grad_v which will be reduce to
+        // grad_k/grad_v for returning
+        if(use_fp32_qkv_grad)
+        {
+            tmp_grad_k = at::empty({B, N, Hq, K}, opts.dtype(at::kFloat));
+            tmp_grad_v = at::empty({B, N, Hq, Kv}, opts.dtype(at::kFloat));
+        }
+        else
+        {
+            tmp_grad_k = at::empty({B, N, Hq, K}, opts);
+            tmp_grad_v = at::empty({B, N, Hq, Kv}, opts);
+        }
     }
 
-    if (bias.has_value()) {
-      CHECK_NOSPARSE_LASTCONTIGUOUS_CUDA((*bias));
-      TORCH_CHECK(bias->scalar_type() == query.scalar_type());
+    auto set_batched_backward_params = [&](BatchedBackwardParams& p) {
+        p.B   = B;
+        p.M   = M;
+        p.N   = N;
+        p.Hq  = Hq;
+        p.Hkv = Hkv;
+        p.K   = K;
+        p.Kv  = Kv;
+
+        p.use_fp32_qkv_grad = use_fp32_qkv_grad;
+        p.is_mqa_gqa        = is_mqa_gqa;
+
+        TORCH_CHECK(p.B == logsumexp.size(0));
+        TORCH_CHECK(p.Hq == logsumexp.size(1));
+        TORCH_CHECK(p.M == logsumexp.size(2));
+
+        if(scale.has_value())
+        {
+            p.scale = float(*scale);
+        }
+        else
+        {
+            p.scale = float(1.0 / std::sqrt(float(K)));
+        }
 
-      p.has_attn_bias = true;
-      p.attn_bias_ptr = bias->data_ptr();
+        p.q_ptr        = query.data_ptr();
+        p.k_ptr        = key.data_ptr();
+        p.v_ptr        = value.data_ptr();
+        p.grad_out_ptr = grad_out.data_ptr();
+        p.out_ptr      = out.data_ptr();
+
+        p.grad_q_ptr = grad_q.data_ptr();
+        p.grad_k_ptr = is_mqa_gqa ? tmp_grad_k.data_ptr() : grad_k.data_ptr();
+        p.grad_v_ptr = is_mqa_gqa ? tmp_grad_v.data_ptr() : grad_v.data_ptr();
+
+        p.q_strides   = {static_cast<int>(query.stride(0)),
+                       static_cast<int>(query.stride(1)),
+                       static_cast<int>(query.stride(2)),
+                       static_cast<int>(query.stride(3))};
+        p.k_strides   = {static_cast<int>(key.stride(0)),
+                       static_cast<int>(key.stride(1)),
+                       static_cast<int>(key.stride(2)),
+                       static_cast<int>(key.stride(3))};
+        p.v_strides   = {static_cast<int>(value.stride(0)),
+                       static_cast<int>(value.stride(1)),
+                       static_cast<int>(value.stride(2)),
+                       static_cast<int>(value.stride(3))};
+        p.out_strides = {static_cast<int>(out.stride(0)),
+                         static_cast<int>(out.stride(1)),
+                         static_cast<int>(out.stride(2)),
+                         static_cast<int>(out.stride(3))};
+
+        if(is_mqa_gqa)
+        {
+            p.tmp_grad_k_strides = {static_cast<int>(tmp_grad_k.stride(0)),
+                                    static_cast<int>(tmp_grad_k.stride(1)),
+                                    static_cast<int>(tmp_grad_k.stride(2)),
+                                    static_cast<int>(tmp_grad_k.stride(3))};
+            p.tmp_grad_v_strides = {static_cast<int>(tmp_grad_v.stride(0)),
+                                    static_cast<int>(tmp_grad_v.stride(1)),
+                                    static_cast<int>(tmp_grad_v.stride(2)),
+                                    static_cast<int>(tmp_grad_v.stride(3))};
+        }
 
-      const at::Tensor bias_4d_view = get_bias_4d_view(*bias, B, Hq, M, N);
+        if(bias.has_value())
+        {
+            CHECK_NOSPARSE_LASTCONTIGUOUS_CUDA((*bias));
+            TORCH_CHECK(bias->scalar_type() == query.scalar_type());
 
-      p.attn_bias_strides = {
-          static_cast<int>(bias_4d_view.stride(0)),
-          static_cast<int>(bias_4d_view.stride(1)),
-          static_cast<int>(bias_4d_view.stride(2)),
-          static_cast<int>(bias_4d_view.stride(3))};
+            p.has_attn_bias = true;
+            p.attn_bias_ptr = bias->data_ptr();
 
-      if (bias_requires_grad)
-        p.grad_bias_ptr = grad_bias.data_ptr();
-    } else {
-      p.has_attn_bias = true;
-      p.attn_bias_ptr = nullptr;
-      p.grad_bias_ptr = nullptr;
-    }
+            const at::Tensor bias_4d_view = get_bias_4d_view(*bias, B, Hq, M, N);
 
-    p.bias_has_grad = bias_requires_grad;
+            p.attn_bias_strides = {static_cast<int>(bias_4d_view.stride(0)),
+                                   static_cast<int>(bias_4d_view.stride(1)),
+                                   static_cast<int>(bias_4d_view.stride(2)),
+                                   static_cast<int>(bias_4d_view.stride(3))};
 
-    p.custom_mask_type = custom_mask_type;
+            if(bias_requires_grad)
+                p.grad_bias_ptr = grad_bias.data_ptr();
+        }
+        else
+        {
+            p.has_attn_bias = true;
+            p.attn_bias_ptr = nullptr;
+            p.grad_bias_ptr = nullptr;
+        }
 
-    p.dropout_prob = static_cast<float>(dropout_p);
-    p.philox_seed = rng_seed;
-    p.philox_offset = rng_offset;
+        p.bias_has_grad = bias_requires_grad;
 
-    p.logsumexp_ptr = logsumexp.data_ptr();
-  };
+        p.custom_mask_type = custom_mask_type;
 
-  auto set_grouped_backward_params = [&](GroupedBackwardParams& p) {
-    p.num_batches = seqstart_q->size(0) - 1;
-    p.M = M;
-    p.N = N;
-    p.Hq = Hq;
-    p.Hkv = Hkv;
-    p.K = K;
-    p.Kv = Kv;
+        p.dropout_prob  = static_cast<float>(dropout_p);
+        p.philox_seed   = rng_seed;
+        p.philox_offset = rng_offset;
 
-    p.use_fp32_qkv_grad = use_fp32_qkv_grad;
-    p.is_mqa_gqa = is_mqa_gqa;
+        p.logsumexp_ptr = logsumexp.data_ptr();
+    };
 
-    p.max_seqlen_q = *max_seqlen_q_;
+    auto set_grouped_backward_params = [&](GroupedBackwardParams& p) {
+        p.num_batches = seqstart_q->size(0) - 1;
+        p.M           = M;
+        p.N           = N;
+        p.Hq          = Hq;
+        p.Hkv         = Hkv;
+        p.K           = K;
+        p.Kv          = Kv;
 
-    TORCH_CHECK(p.num_batches == logsumexp.size(0));
-    TORCH_CHECK(p.Hq == logsumexp.size(1));
-    TORCH_CHECK(p.max_seqlen_q == logsumexp.size(2));
+        p.use_fp32_qkv_grad = use_fp32_qkv_grad;
+        p.is_mqa_gqa        = is_mqa_gqa;
 
-    if (scale.has_value()) {
-      p.scale = float(*scale);
-    } else {
-      p.scale = float(1.0 / std::sqrt(float(K)));
-    }
+        p.max_seqlen_q = *max_seqlen_q_;
 
-    p.q_strides = {
-        static_cast<int>(query.stride(1)),
-        static_cast<int>(query.stride(2)),
-        static_cast<int>(query.stride(3))};
-    p.k_strides = {
-        static_cast<int>(key.stride(1)),
-        static_cast<int>(key.stride(2)),
-        static_cast<int>(key.stride(3))};
-    p.v_strides = {
-        static_cast<int>(value.stride(1)),
-        static_cast<int>(value.stride(2)),
-        static_cast<int>(value.stride(3))};
-    p.out_strides = {
-        static_cast<int>(out.stride(1)),
-        static_cast<int>(out.stride(2)),
-        static_cast<int>(out.stride(3))};
-
-    if (is_mqa_gqa) {
-      p.tmp_grad_k_strides = {
-          static_cast<int>(tmp_grad_k.stride(1)),
-          static_cast<int>(tmp_grad_k.stride(2)),
-          static_cast<int>(tmp_grad_k.stride(3))};
-      p.tmp_grad_v_strides = {
-          static_cast<int>(tmp_grad_v.stride(1)),
-          static_cast<int>(tmp_grad_v.stride(2)),
-          static_cast<int>(tmp_grad_v.stride(3))};
-    };
+        TORCH_CHECK(p.num_batches == logsumexp.size(0));
+        TORCH_CHECK(p.Hq == logsumexp.size(1));
+        TORCH_CHECK(p.max_seqlen_q == logsumexp.size(2));
+
+        if(scale.has_value())
+        {
+            p.scale = float(*scale);
+        }
+        else
+        {
+            p.scale = float(1.0 / std::sqrt(float(K)));
+        }
+
+        p.q_strides   = {static_cast<int>(query.stride(1)),
+                       static_cast<int>(query.stride(2)),
+                       static_cast<int>(query.stride(3))};
+        p.k_strides   = {static_cast<int>(key.stride(1)),
+                       static_cast<int>(key.stride(2)),
+                       static_cast<int>(key.stride(3))};
+        p.v_strides   = {static_cast<int>(value.stride(1)),
+                       static_cast<int>(value.stride(2)),
+                       static_cast<int>(value.stride(3))};
+        p.out_strides = {static_cast<int>(out.stride(1)),
+                         static_cast<int>(out.stride(2)),
+                         static_cast<int>(out.stride(3))};
+
+        if(is_mqa_gqa)
+        {
+            p.tmp_grad_k_strides = {static_cast<int>(tmp_grad_k.stride(1)),
+                                    static_cast<int>(tmp_grad_k.stride(2)),
+                                    static_cast<int>(tmp_grad_k.stride(3))};
+            p.tmp_grad_v_strides = {static_cast<int>(tmp_grad_v.stride(1)),
+                                    static_cast<int>(tmp_grad_v.stride(2)),
+                                    static_cast<int>(tmp_grad_v.stride(3))};
+        };
+
+        if(bias.has_value())
+        {
+            CHECK_NOSPARSE_LASTCONTIGUOUS_CUDA((*bias));
+            TORCH_CHECK(bias->scalar_type() == query.scalar_type());
+
+            p.has_attn_bias               = true;
+            const at::Tensor bias_4d_view = get_bias_4d_view(*bias, B, Hq, M, N);
+            p.attn_bias_strides           = {static_cast<int>(bias_4d_view.stride(0)),
+                                   static_cast<int>(bias_4d_view.stride(1)),
+                                   static_cast<int>(bias_4d_view.stride(2)),
+                                   static_cast<int>(bias_4d_view.stride(3))};
+        }
+        else
+            p.has_attn_bias = false;
+
+        p.bias_has_grad = bias_requires_grad;
 
-    if (bias.has_value()) {
-      CHECK_NOSPARSE_LASTCONTIGUOUS_CUDA((*bias));
-      TORCH_CHECK(bias->scalar_type() == query.scalar_type());
+        p.dropout_prob  = static_cast<float>(dropout_p);
+        p.philox_seed   = rng_seed;
+        p.philox_offset = rng_offset;
 
-      p.has_attn_bias = true;
-      const at::Tensor bias_4d_view = get_bias_4d_view(*bias, B, Hq, M, N);
-      p.attn_bias_strides = {
-          static_cast<int>(bias_4d_view.stride(0)),
-          static_cast<int>(bias_4d_view.stride(1)),
-          static_cast<int>(bias_4d_view.stride(2)),
-          static_cast<int>(bias_4d_view.stride(3))};
-    } else
-      p.has_attn_bias = false;
+        p.custom_mask_type = custom_mask_type;
 
-    p.bias_has_grad = bias_requires_grad;
+        p.host_seqstart_q.resize(p.num_batches + 1);
+        p.host_seqstart_k.resize(p.num_batches + 1);
 
-    p.dropout_prob = static_cast<float>(dropout_p);
-    p.philox_seed = rng_seed;
-    p.philox_offset = rng_offset;
+        for(int i = 0; i < p.host_seqstart_q.size(); i++)
+            p.host_seqstart_q[i] = *(reinterpret_cast<int*>(seqstart_q->data_ptr()) + i);
 
-    p.custom_mask_type = custom_mask_type;
+        for(int i = 0; i < p.host_seqstart_k.size(); i++)
+            p.host_seqstart_k[i] = *(reinterpret_cast<int*>(seqstart_k->data_ptr()) + i);
 
-    p.host_seqstart_q.resize(p.num_batches + 1);
-    p.host_seqstart_k.resize(p.num_batches + 1);
+        if(seqlen_k.has_value())
+        {
+            TORCH_CHECK(seqlen_k->scalar_type() == at::ScalarType::Int);
+            TORCH_CHECK(seqlen_k->dim() == 1);
+            TORCH_CHECK(seqlen_k->size(0) == p.num_batches)
+            CHECK_NOSPARSE_CONTIGUOUS_CPU((*seqlen_k));
 
-    for (int i = 0; i < p.host_seqstart_q.size(); i++)
-      p.host_seqstart_q[i] =
-          *(reinterpret_cast<int*>(seqstart_q->data_ptr()) + i);
+            p.host_seqlen_k.resize(p.num_batches);
 
-    for (int i = 0; i < p.host_seqstart_k.size(); i++)
-      p.host_seqstart_k[i] =
-          *(reinterpret_cast<int*>(seqstart_k->data_ptr()) + i);
+            for(int i = 0; i < p.host_seqlen_k.size(); i++)
+                p.host_seqlen_k[i] = *(reinterpret_cast<int*>(seqlen_k->data_ptr()) + i);
+        }
 
-    if (seqlen_k.has_value()) {
-      TORCH_CHECK(seqlen_k->scalar_type() == at::ScalarType::Int);
-      TORCH_CHECK(seqlen_k->dim() == 1);
-      TORCH_CHECK(seqlen_k->size(0) == p.num_batches)
-      CHECK_NOSPARSE_CONTIGUOUS_CPU((*seqlen_k));
+        char* q_ptr = reinterpret_cast<char*>(query.data_ptr());
+        char* k_ptr = reinterpret_cast<char*>(key.data_ptr());
+        char* v_ptr = reinterpret_cast<char*>(value.data_ptr());
+
+        char* out_ptr      = reinterpret_cast<char*>(out.data_ptr());
+        char* grad_out_ptr = reinterpret_cast<char*>(grad_out.data_ptr());
+        char* attn_bias_ptr =
+            bias.has_value() ? reinterpret_cast<char*>(bias->data_ptr()) : nullptr;
+
+        char* logsumexp_ptr = reinterpret_cast<char*>(logsumexp.data_ptr());
+
+        char* grad_q_ptr = reinterpret_cast<char*>(grad_q.data_ptr());
+        char* grad_k_ptr = is_mqa_gqa ? reinterpret_cast<char*>(tmp_grad_k.data_ptr())
+                                      : reinterpret_cast<char*>(grad_k.data_ptr());
+        char* grad_v_ptr = is_mqa_gqa ? reinterpret_cast<char*>(tmp_grad_v.data_ptr())
+                                      : reinterpret_cast<char*>(grad_v.data_ptr());
+        char* grad_bias_ptr =
+            bias_requires_grad ? reinterpret_cast<char*>(grad_bias.data_ptr()) : nullptr;
+
+        size_t multiplier = 1;
+
+        if(p.use_fp32_qkv_grad)
+            multiplier = get_size_in_bytes(1, at::ScalarType::Float) /
+                         get_size_in_bytes(1, query.scalar_type());
+
+        std::cout << "qkv-grad precision multiplier is " << multiplier << std::endl;
+
+        for(int i = 0; i < p.num_batches; i++)
+        {
+            size_t tmp_q_offset = get_size_in_bytes(
+                static_cast<size_t>(p.host_seqstart_q[i]) * p.q_strides[0], query.scalar_type());
+            size_t tmp_k_offset = get_size_in_bytes(
+                static_cast<size_t>(p.host_seqstart_k[i]) * p.k_strides[0], key.scalar_type());
+            size_t tmp_v_offset = get_size_in_bytes(
+                static_cast<size_t>(p.host_seqstart_k[i]) * p.v_strides[0], value.scalar_type());
+            size_t tmp_o_offset = get_size_in_bytes(
+                static_cast<size_t>(p.host_seqstart_q[i]) * p.out_strides[0], out.scalar_type());
+            size_t tmp_logsumexp_offset = get_size_in_bytes(
+                static_cast<size_t>(i) * p.Hq * p.max_seqlen_q, logsumexp.scalar_type());
+
+            size_t tmp_grad_k_offset =
+                is_mqa_gqa ? get_size_in_bytes(static_cast<size_t>(p.host_seqstart_k[i]) *
+                                                   p.tmp_grad_k_strides[0],
+                                               tmp_grad_k.scalar_type())
+                           : tmp_k_offset;
+            size_t tmp_grad_v_offset =
+                is_mqa_gqa ? get_size_in_bytes(static_cast<size_t>(p.host_seqstart_k[i]) *
+                                                   p.tmp_grad_v_strides[0],
+                                               tmp_grad_v.scalar_type())
+                           : tmp_v_offset;
+
+            p.q_ptrs.push_back(reinterpret_cast<void*>(&q_ptr[tmp_q_offset]));
+            p.grad_q_ptrs.push_back(
+                reinterpret_cast<void*>(&grad_q_ptr[tmp_q_offset * multiplier]));
+
+            p.k_ptrs.push_back(reinterpret_cast<void*>(&k_ptr[tmp_k_offset]));
+            p.grad_k_ptrs.push_back(
+                reinterpret_cast<void*>(&grad_k_ptr[tmp_grad_k_offset * multiplier]));
+
+            p.v_ptrs.push_back(reinterpret_cast<void*>(&v_ptr[tmp_v_offset]));
+            p.grad_v_ptrs.push_back(
+                reinterpret_cast<void*>(&grad_v_ptr[tmp_grad_v_offset * multiplier]));
+
+            p.out_ptrs.push_back(reinterpret_cast<void*>(&out_ptr[tmp_o_offset]));
+            p.grad_out_ptrs.push_back(reinterpret_cast<void*>(&grad_out_ptr[tmp_o_offset]));
+
+            p.logsumexp_ptrs.push_back(
+                reinterpret_cast<void*>(&logsumexp_ptr[tmp_logsumexp_offset]));
+
+            if(bias.has_value())
+            {
+                size_t tmp_bias_offset = get_size_in_bytes(
+                    static_cast<size_t>(p.host_seqstart_q[i]) * p.attn_bias_strides[2] +
+                        static_cast<size_t>(p.host_seqstart_k[i]) * p.attn_bias_strides[3],
+                    bias->scalar_type());
+
+                p.attn_bias_ptrs.push_back(
+                    reinterpret_cast<void*>(&attn_bias_ptr[tmp_bias_offset]));
+
+                if(bias_requires_grad)
+                {
+                    p.grad_bias_ptrs.push_back(
+                        reinterpret_cast<void*>(&grad_bias_ptr[tmp_bias_offset]));
+                }
+            }
+
+            // ToDO: remove this after dev-op fix
+            p.randvals_ptrs.push_back(nullptr);
+        }
+    };
 
-      p.host_seqlen_k.resize(p.num_batches);
+    auto inDataType = query.scalar_type();
 
-      for (int i = 0; i < p.host_seqlen_k.size(); i++)
-        p.host_seqlen_k[i] =
-            *(reinterpret_cast<int*>(seqlen_k->data_ptr()) + i);
+    if(!seqstart_q.has_value())
+    { // input is batched
+        BatchedBackwardParams batched_backward_params;
+
+        set_batched_backward_params(batched_backward_params);
+
+        if(inDataType == at::ScalarType::Half)
+        {
+            batched_backward_fp16(batched_backward_params, stream);
+        }
+        else if(inDataType == at::ScalarType::BFloat16)
+        {
+            batched_backward_bp16(batched_backward_params, stream);
+        }
+        else
+            throw std::runtime_error("input data-type is not supported");
     }
+    else
+    { // input is grouped
+        GroupedBackwardParams grouped_backward_params;
 
-    char* q_ptr = reinterpret_cast<char*>(query.data_ptr());
-    char* k_ptr = reinterpret_cast<char*>(key.data_ptr());
-    char* v_ptr = reinterpret_cast<char*>(value.data_ptr());
-
-    char* out_ptr = reinterpret_cast<char*>(out.data_ptr());
-    char* grad_out_ptr = reinterpret_cast<char*>(grad_out.data_ptr());
-    char* attn_bias_ptr =
-        bias.has_value() ? reinterpret_cast<char*>(bias->data_ptr()) : nullptr;
-
-    char* logsumexp_ptr = reinterpret_cast<char*>(logsumexp.data_ptr());
-
-    char* grad_q_ptr = reinterpret_cast<char*>(grad_q.data_ptr());
-    char* grad_k_ptr = is_mqa_gqa
-        ? reinterpret_cast<char*>(tmp_grad_k.data_ptr())
-        : reinterpret_cast<char*>(grad_k.data_ptr());
-    char* grad_v_ptr = is_mqa_gqa
-        ? reinterpret_cast<char*>(tmp_grad_v.data_ptr())
-        : reinterpret_cast<char*>(grad_v.data_ptr());
-    char* grad_bias_ptr = bias_requires_grad
-        ? reinterpret_cast<char*>(grad_bias.data_ptr())
-        : nullptr;
-
-    size_t multiplier = 1;
-
-    if (p.use_fp32_qkv_grad)
-      multiplier = get_size_in_bytes(1, at::ScalarType::Float) /
-          get_size_in_bytes(1, query.scalar_type());
-
-    std::cout << "qkv-grad precision multiplier is " << multiplier << std::endl;
-
-    for (int i = 0; i < p.num_batches; i++) {
-      size_t tmp_q_offset = get_size_in_bytes(
-          static_cast<size_t>(p.host_seqstart_q[i]) * p.q_strides[0],
-          query.scalar_type());
-      size_t tmp_k_offset = get_size_in_bytes(
-          static_cast<size_t>(p.host_seqstart_k[i]) * p.k_strides[0],
-          key.scalar_type());
-      size_t tmp_v_offset = get_size_in_bytes(
-          static_cast<size_t>(p.host_seqstart_k[i]) * p.v_strides[0],
-          value.scalar_type());
-      size_t tmp_o_offset = get_size_in_bytes(
-          static_cast<size_t>(p.host_seqstart_q[i]) * p.out_strides[0],
-          out.scalar_type());
-      size_t tmp_logsumexp_offset = get_size_in_bytes(
-          static_cast<size_t>(i) * p.Hq * p.max_seqlen_q,
-          logsumexp.scalar_type());
-
-      size_t tmp_grad_k_offset = is_mqa_gqa
-          ? get_size_in_bytes(
-                static_cast<size_t>(p.host_seqstart_k[i]) *
-                    p.tmp_grad_k_strides[0],
-                tmp_grad_k.scalar_type())
-          : tmp_k_offset;
-      size_t tmp_grad_v_offset = is_mqa_gqa
-          ? get_size_in_bytes(
-                static_cast<size_t>(p.host_seqstart_k[i]) *
-                    p.tmp_grad_v_strides[0],
-                tmp_grad_v.scalar_type())
-          : tmp_v_offset;
-
-      p.q_ptrs.push_back(reinterpret_cast<void*>(&q_ptr[tmp_q_offset]));
-      p.grad_q_ptrs.push_back(
-          reinterpret_cast<void*>(&grad_q_ptr[tmp_q_offset * multiplier]));
-
-      p.k_ptrs.push_back(reinterpret_cast<void*>(&k_ptr[tmp_k_offset]));
-      p.grad_k_ptrs.push_back(
-          reinterpret_cast<void*>(&grad_k_ptr[tmp_grad_k_offset * multiplier]));
-
-      p.v_ptrs.push_back(reinterpret_cast<void*>(&v_ptr[tmp_v_offset]));
-      p.grad_v_ptrs.push_back(
-          reinterpret_cast<void*>(&grad_v_ptr[tmp_grad_v_offset * multiplier]));
-
-      p.out_ptrs.push_back(reinterpret_cast<void*>(&out_ptr[tmp_o_offset]));
-      p.grad_out_ptrs.push_back(
-          reinterpret_cast<void*>(&grad_out_ptr[tmp_o_offset]));
-
-      p.logsumexp_ptrs.push_back(
-          reinterpret_cast<void*>(&logsumexp_ptr[tmp_logsumexp_offset]));
-
-      if (bias.has_value()) {
-        size_t tmp_bias_offset = get_size_in_bytes(
-            static_cast<size_t>(p.host_seqstart_q[i]) * p.attn_bias_strides[2] +
-                static_cast<size_t>(p.host_seqstart_k[i]) *
-                    p.attn_bias_strides[3],
-            bias->scalar_type());
-
-        p.attn_bias_ptrs.push_back(
-            reinterpret_cast<void*>(&attn_bias_ptr[tmp_bias_offset]));
-
-        if (bias_requires_grad) {
-          p.grad_bias_ptrs.push_back(
-              reinterpret_cast<void*>(&grad_bias_ptr[tmp_bias_offset]));
+        set_grouped_backward_params(grouped_backward_params);
+
+        if(inDataType == at::ScalarType::Half)
+        {
+            grouped_backward_fp16(grouped_backward_params, stream);
+        }
+        else if(inDataType == at::ScalarType::BFloat16)
+        {
+            grouped_backward_bp16(grouped_backward_params, stream);
         }
-      }
+        else
+            throw std::runtime_error("input data-type is not supported");
+    }
 
-      // ToDO: remove this after dev-op fix
-      p.randvals_ptrs.push_back(nullptr);
+    if(is_mqa_gqa)
+    {
+        auto tmp_grad_k_view = tmp_grad_k.unflatten(2, {Hkv, Hq / Hkv});
+        auto tmp_grad_v_view = tmp_grad_v.unflatten(2, {Hkv, Hq / Hkv});
+        grad_k               = tmp_grad_k_view.sum(3);
+        grad_v               = tmp_grad_v_view.sum(3);
     }
-  };
-
-  auto inDataType = query.scalar_type();
-
-  if (!seqstart_q.has_value()) { // input is batched
-    BatchedBackwardParams batched_backward_params;
-
-    set_batched_backward_params(batched_backward_params);
-
-    if (inDataType == at::ScalarType::Half) {
-      batched_backward_fp16(batched_backward_params, stream);
-    } else if (inDataType == at::ScalarType::BFloat16) {
-      batched_backward_bp16(batched_backward_params, stream);
-    } else
-      throw std::runtime_error("input data-type is not supported");
-  } else { // input is grouped
-    GroupedBackwardParams grouped_backward_params;
-
-    set_grouped_backward_params(grouped_backward_params);
-
-    if (inDataType == at::ScalarType::Half) {
-      grouped_backward_fp16(grouped_backward_params, stream);
-    } else if (inDataType == at::ScalarType::BFloat16) {
-      grouped_backward_bp16(grouped_backward_params, stream);
-    } else
-      throw std::runtime_error("input data-type is not supported");
-  }
-
-  if (is_mqa_gqa) {
-    auto tmp_grad_k_view = tmp_grad_k.unflatten(2, {Hkv, Hq / Hkv});
-    auto tmp_grad_v_view = tmp_grad_v.unflatten(2, {Hkv, Hq / Hkv});
-    grad_k = tmp_grad_k_view.sum(3);
-    grad_v = tmp_grad_v_view.sum(3);
-  }
-
-  return std::make_tuple(grad_q, grad_k, grad_v, grad_bias);
+
+    return std::make_tuple(grad_q, grad_k, grad_v, grad_bias);
 #endif
 } // namespace
 
 } // namespace
 
-TORCH_LIBRARY_IMPL(xformers, CUDA, m) {
-  m.impl(
-      TORCH_SELECTIVE_NAME("xformers::efficient_attention_backward_ck"),
-      TORCH_FN(efficient_attention_backward_ck));
+TORCH_LIBRARY_IMPL(xformers, CUDA, m)
+{
+    m.impl(TORCH_SELECTIVE_NAME("xformers::efficient_attention_backward_ck"),
+           TORCH_FN(efficient_attention_backward_ck));
 }
diff --git a/xformers/csrc/attention/hip_fmha/attention_ck_rand_uniform.cpp b/xformers/csrc/attention/hip_fmha/attention_ck_rand_uniform.cpp
index ecf73c09b..a4282834a 100644
--- a/xformers/csrc/attention/hip_fmha/attention_ck_rand_uniform.cpp
+++ b/xformers/csrc/attention/hip_fmha/attention_ck_rand_uniform.cpp
@@ -26,100 +26,91 @@ namespace {
  * generate a tensor with random uniform values. only used for testing, not much
  * attention is paid to performance
  */
-at::Tensor rand_uniform_int(
-    double dropout_prob,
-    const at::Tensor& out_pattern) // [Batches, num_head, query_len, key_len]
+at::Tensor
+rand_uniform_int(double dropout_prob,
+                 const at::Tensor& out_pattern) // [Batches, num_head, query_len, key_len]
 {
-  int B = out_pattern.size(0);
-  int num_heads = out_pattern.size(1);
-  int M = out_pattern.size(2);
-  int N = out_pattern.size(3);
-
-  // at::cuda::CUDAGuard device_guard(out_pattern.device());
-  hipStream_t stream = at::cuda::getCurrentHIPStream().stream();
-
-  at::CUDAGeneratorImpl* gen =
-      at::get_generator_or_default<at::CUDAGeneratorImpl>(
-          c10::nullopt, at::cuda::detail::getDefaultCUDAGenerator());
-
-  at::PhiloxCudaState rng_engine_inputs;
-  {
-    std::lock_guard<std::mutex> lock(gen->mutex_);
-    rng_engine_inputs = gen->philox_cuda_state(B * num_heads * M * N);
-  }
-
-  const auto seeds = at::cuda::philox::unpack(rng_engine_inputs);
-
-  int64_t philox_seed = std::get<0>(seeds);
-  int64_t philox_offset = std::get<1>(seeds);
-
-  at::Tensor randvals;
-
-  randvals = at::empty(
-      {B, num_heads, M, N}, out_pattern.options().dtype(at::ScalarType::Int));
-
-  static constexpr auto GemmSpec =
-      ck::tensor_operation::device::GemmSpecialization::MNKOPadding;
-
-  static constexpr auto TensorSpecA =
-      ck::tensor_operation::device::TensorSpecialization::Default;
-  static constexpr auto TensorSpecB0 =
-      ck::tensor_operation::device::TensorSpecialization::Default;
-  static constexpr auto TensorSpecB1 =
-      ck::tensor_operation::device::TensorSpecialization::Default;
-  static constexpr auto TensorSpecC =
-      ck::tensor_operation::device::TensorSpecialization::Default;
-
-  using DeviceOpInstance = ck::tensor_operation::device::DeviceBatchedDropout<
-      2, // NumDimG
-      ck::half_t,
-      int,
-      ck::half_t,
-      GemmSpec,
-      TensorSpecA,
-      TensorSpecB0,
-      TensorSpecB1,
-      TensorSpecC,
-      256, // BlockSize
-      64, // MPerBlock
-      128, // NPerBlock
-      32, // KPerBlock
-      8, // AK1
-      8, // BK1
-      32, // MPerXDL
-      32, // NPerXDL
-      2, // MXdlPerWave
-      1>; // NXdlPerWave
-
-  const uint64_t seed = 1;
-  const uint64_t offset = 0;
-
-  std::vector<ck::index_t> z_gs_ms_ns_lengths = {B, num_heads, M, N};
-  std::vector<ck::index_t> z_gs_ms_ns_strides = {
-      static_cast<int>(randvals.stride(0)),
-      static_cast<int>(randvals.stride(1)),
-      static_cast<int>(randvals.stride(2)),
-      static_cast<int>(randvals.stride(3))};
-
-  auto dropout_op = DeviceOpInstance();
-  auto dropout_invoker = dropout_op.MakeInvoker();
-
-  auto dropout_arg = dropout_op.MakeArgument(
-      static_cast<int*>(randvals.data_ptr()),
-      z_gs_ms_ns_lengths,
-      z_gs_ms_ns_strides,
-      {philox_seed, philox_offset});
-
-  dropout_invoker.Run(dropout_arg, StreamConfig{stream, false});
-  (void)hipStreamSynchronize(stream);
-
-  return randvals;
+    int B         = out_pattern.size(0);
+    int num_heads = out_pattern.size(1);
+    int M         = out_pattern.size(2);
+    int N         = out_pattern.size(3);
+
+    // at::cuda::CUDAGuard device_guard(out_pattern.device());
+    hipStream_t stream = at::cuda::getCurrentHIPStream().stream();
+
+    at::CUDAGeneratorImpl* gen = at::get_generator_or_default<at::CUDAGeneratorImpl>(
+        c10::nullopt, at::cuda::detail::getDefaultCUDAGenerator());
+
+    at::PhiloxCudaState rng_engine_inputs;
+    {
+        std::lock_guard<std::mutex> lock(gen->mutex_);
+        rng_engine_inputs = gen->philox_cuda_state(B * num_heads * M * N);
+    }
+
+    const auto seeds = at::cuda::philox::unpack(rng_engine_inputs);
+
+    int64_t philox_seed   = std::get<0>(seeds);
+    int64_t philox_offset = std::get<1>(seeds);
+
+    at::Tensor randvals;
+
+    randvals = at::empty({B, num_heads, M, N}, out_pattern.options().dtype(at::ScalarType::Int));
+
+    static constexpr auto GemmSpec = ck::tensor_operation::device::GemmSpecialization::MNKOPadding;
+
+    static constexpr auto TensorSpecA = ck::tensor_operation::device::TensorSpecialization::Default;
+    static constexpr auto TensorSpecB0 =
+        ck::tensor_operation::device::TensorSpecialization::Default;
+    static constexpr auto TensorSpecB1 =
+        ck::tensor_operation::device::TensorSpecialization::Default;
+    static constexpr auto TensorSpecC = ck::tensor_operation::device::TensorSpecialization::Default;
+
+    using DeviceOpInstance = ck::tensor_operation::device::DeviceBatchedDropout<2, // NumDimG
+                                                                                ck::half_t,
+                                                                                int,
+                                                                                ck::half_t,
+                                                                                GemmSpec,
+                                                                                TensorSpecA,
+                                                                                TensorSpecB0,
+                                                                                TensorSpecB1,
+                                                                                TensorSpecC,
+                                                                                256, // BlockSize
+                                                                                64,  // MPerBlock
+                                                                                128, // NPerBlock
+                                                                                32,  // KPerBlock
+                                                                                8,   // AK1
+                                                                                8,   // BK1
+                                                                                32,  // MPerXDL
+                                                                                32,  // NPerXDL
+                                                                                2,   // MXdlPerWave
+                                                                                1>;  // NXdlPerWave
+
+    const uint64_t seed   = 1;
+    const uint64_t offset = 0;
+
+    std::vector<ck::index_t> z_gs_ms_ns_lengths = {B, num_heads, M, N};
+    std::vector<ck::index_t> z_gs_ms_ns_strides = {static_cast<int>(randvals.stride(0)),
+                                                   static_cast<int>(randvals.stride(1)),
+                                                   static_cast<int>(randvals.stride(2)),
+                                                   static_cast<int>(randvals.stride(3))};
+
+    auto dropout_op      = DeviceOpInstance();
+    auto dropout_invoker = dropout_op.MakeInvoker();
+
+    auto dropout_arg = dropout_op.MakeArgument(static_cast<int*>(randvals.data_ptr()),
+                                               z_gs_ms_ns_lengths,
+                                               z_gs_ms_ns_strides,
+                                               {philox_seed, philox_offset});
+
+    dropout_invoker.Run(dropout_arg, StreamConfig{stream, false});
+    (void)hipStreamSynchronize(stream);
+
+    return randvals;
 } // namespace
 
 } // namespace
 
-TORCH_LIBRARY_IMPL(xformers, CUDA, m) {
-  m.impl(
-      TORCH_SELECTIVE_NAME("xformers::_ck_rand_uniform"),
-      TORCH_FN(rand_uniform_int));
+TORCH_LIBRARY_IMPL(xformers, CUDA, m)
+{
+    m.impl(TORCH_SELECTIVE_NAME("xformers::_ck_rand_uniform"), TORCH_FN(rand_uniform_int));
 }
diff --git a/xformers/csrc/attention/hip_fmha/attention_forward_decoder.cpp b/xformers/csrc/attention/hip_fmha/attention_forward_decoder.cpp
index 42de5a540..da14882f7 100644
--- a/xformers/csrc/attention/hip_fmha/attention_forward_decoder.cpp
+++ b/xformers/csrc/attention/hip_fmha/attention_forward_decoder.cpp
@@ -1,7 +1,9 @@
 /*
-  TODO: license header
-*/
-
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
 #include <ATen/Dispatch.h>
 #include <ATen/Functions.h>
 #include <ATen/Tensor.h>
@@ -11,166 +13,166 @@
 #include "ck_attention_forward_decoder.h"
 
 namespace {
-  constexpr int32_t kThreadsPerWavefront = 64;
-  constexpr int32_t kWavefrontsPerBlock = 16;
-  constexpr int32_t D_H = 4 * kThreadsPerWavefront;
-}
+constexpr int32_t kThreadsPerWavefront = 64;
+constexpr int32_t kWavefrontsPerBlock  = 16;
+constexpr int32_t D_H                  = 4 * kThreadsPerWavefront;
+} // namespace
 
 namespace {
 
 template <typename c10_t>
 struct c10_to_data_t;
 template <>
-struct c10_to_data_t<float> {
-  using type = float;
+struct c10_to_data_t<float>
+{
+    using type = float;
 };
 
 template <>
-struct c10_to_data_t<c10::Half> {
-  using type = ck::half_t;
+struct c10_to_data_t<c10::Half>
+{
+    using type = ck::half_t;
 };
 
 template <>
-struct c10_to_data_t<c10::BFloat16> {
-  using type = ck::bhalf_t;
+struct c10_to_data_t<c10::BFloat16>
+{
+    using type = ck::bhalf_t;
 };
-}
+} // namespace
 
 namespace {
 
 #define AT_DISPATCH_CASE_3(SCALARTYPE1, SCALARTYPE2, SCALARTYPE3, ...) \
-  AT_DISPATCH_CASE(SCALARTYPE1, __VA_ARGS__)                           \
-  AT_DISPATCH_CASE(SCALARTYPE2, __VA_ARGS__)                           \
-  AT_DISPATCH_CASE(SCALARTYPE3, __VA_ARGS__)
-
-#define AT_DISPATCH_SWITCH_3(                               \
-    SCALARTYPE1, SCALARTYPE2, SCALARTYPE3, TYPE, NAME, ...) \
-  AT_DISPATCH_SWITCH(                                       \
-      TYPE,                                                 \
-      NAME,                                                 \
-      AT_DISPATCH_CASE_3(SCALARTYPE1, SCALARTYPE2, SCALARTYPE3, __VA_ARGS__))
-
-template <int32_t ThreadsPerWavefront, int32_t WavefrontsPerBlock,
+    AT_DISPATCH_CASE(SCALARTYPE1, __VA_ARGS__)                         \
+    AT_DISPATCH_CASE(SCALARTYPE2, __VA_ARGS__)                         \
+    AT_DISPATCH_CASE(SCALARTYPE3, __VA_ARGS__)
+
+#define AT_DISPATCH_SWITCH_3(SCALARTYPE1, SCALARTYPE2, SCALARTYPE3, TYPE, NAME, ...) \
+    AT_DISPATCH_SWITCH(                                                              \
+        TYPE, NAME, AT_DISPATCH_CASE_3(SCALARTYPE1, SCALARTYPE2, SCALARTYPE3, __VA_ARGS__))
+
+template <int32_t ThreadsPerWavefront,
+          int32_t WavefrontsPerBlock,
           int32_t T_MAX = 8192,
-          int32_t D_H = 256>
-at::Tensor& efficient_attention_forward_decoder_ck_out_impl(
-    const at::Tensor& XQ, // [B, 1, H, D]
-    const at::Tensor& cache_K, // [B, T_MAX, H or 1, D]
-    const at::Tensor& cache_V, // [B, T_MAX, H or 1, D]
-    at::optional<at::Tensor> seq_kv_lens, // [B]
-    double qk_scale,
-    at::Tensor& O) {
-  static_assert(4 * ThreadsPerWavefront == D_H, "");
-  static_assert(WavefrontsPerBlock <= ThreadsPerWavefront, "");
-
-  at::OptionalDeviceGuard guard(XQ.device());
-  TORCH_CHECK(XQ.is_cuda());
-  TORCH_CHECK(cache_K.is_cuda());
-  TORCH_CHECK(cache_V.is_cuda());
-
-  TORCH_CHECK(!seq_kv_lens || seq_kv_lens->is_cuda());
-
-  TORCH_CHECK(cache_K.size(1) <= T_MAX);
-  TORCH_CHECK(cache_K.size(3) <= D_H);
-
-  auto B = XQ.size(0);
-  auto M = XQ.size(1);
-  auto H = XQ.size(2);
-
-  TORCH_CHECK(B <= 1024);
-  TORCH_CHECK(M <= 1024);
-  TORCH_CHECK(H <= 1024);
-
-  dim3 blocks(B, H, M);
-  dim3 threads(ThreadsPerWavefront, WavefrontsPerBlock);
-
-  int32_t smem_softmax = T_MAX * sizeof(float) + threads.y * sizeof(float);
-  int32_t smem_output = D_H * sizeof(float) *
-      threads.y; // 4 * threadsPerBlock * sizeof(float) == sizeof(O[b][0][h][:])
-  const size_t lds_bytes = max(smem_softmax, smem_output);
-  auto stream = at::cuda::getCurrentHIPStream().stream();
-
-  AT_DISPATCH_SWITCH_3(
-      at::ScalarType::Half,
-      at::ScalarType::BFloat16,
-      at::ScalarType::Float,
-      XQ.scalar_type(),
-      "efficient_attention_forward_decoder_ck",
-      [&] {
-        using ck_data_t = c10_to_data_t<scalar_t>::type;
-        using device_op_t =
-            ck::tensor_operation::device::FMHADecoderSeqlen1DeviceOp<ck_data_t>;
-        auto op = device_op_t{};
-
-        auto XQ_acc =
-            XQ.packed_accessor32<scalar_t, 4, at::RestrictPtrTraits>();
-        auto K_acc =
-            cache_K.packed_accessor64<scalar_t, 4, at::RestrictPtrTraits>();
-        auto V_acc =
-            cache_V.packed_accessor64<scalar_t, 4, at::RestrictPtrTraits>();
-        auto O_acc = O.packed_accessor32<scalar_t, 4, at::RestrictPtrTraits>();
-        auto seq_acc = seq_kv_lens ?
-            seq_kv_lens->packed_accessor32<int32_t, 1, at::RestrictPtrTraits>().data() : nullptr;
-        auto arg = device_op_t::Argument(
-            reinterpret_cast<const ck_data_t* __restrict__>(XQ_acc.data()),
-            reinterpret_cast<const ck_data_t* __restrict__>(K_acc.data()),
-            reinterpret_cast<const ck_data_t* __restrict__>(V_acc.data()),
-            reinterpret_cast<ck_data_t* __restrict__>(O_acc.data()),
-            seq_acc,
-            XQ_acc.stride(0),
-            XQ_acc.stride(1),
-            XQ_acc.stride(2),
-            K_acc.stride(0),
-            K_acc.stride(1),
-            K_acc.stride(2),
-            K_acc.size(1),
-            K_acc.size(3),
-            K_acc.size(2) == 1,
-            qk_scale,
-            blocks,
-            threads,
-            lds_bytes);
-
-        auto invoker = device_op_t::Invoker{};
-        (void)invoker.Run(arg, {stream});
-      });
-
-  return O;
+          int32_t D_H   = 256>
+at::Tensor&
+efficient_attention_forward_decoder_ck_out_impl(const at::Tensor& XQ,      // [B, 1, H, D]
+                                                const at::Tensor& cache_K, // [B, T_MAX, H or 1, D]
+                                                const at::Tensor& cache_V, // [B, T_MAX, H or 1, D]
+                                                at::optional<at::Tensor> seq_kv_lens, // [B]
+                                                double qk_scale,
+                                                at::Tensor& O)
+{
+    static_assert(4 * ThreadsPerWavefront == D_H, "");
+    static_assert(WavefrontsPerBlock <= ThreadsPerWavefront, "");
+
+    at::OptionalDeviceGuard guard(XQ.device());
+    TORCH_CHECK(XQ.is_cuda());
+    TORCH_CHECK(cache_K.is_cuda());
+    TORCH_CHECK(cache_V.is_cuda());
+
+    TORCH_CHECK(!seq_kv_lens || seq_kv_lens->is_cuda());
+
+    TORCH_CHECK(cache_K.size(1) <= T_MAX);
+    TORCH_CHECK(cache_K.size(3) <= D_H);
+
+    auto B = XQ.size(0);
+    auto M = XQ.size(1);
+    auto H = XQ.size(2);
+
+    TORCH_CHECK(B <= 1024);
+    TORCH_CHECK(M <= 1024);
+    TORCH_CHECK(H <= 1024);
+
+    dim3 blocks(B, H, M);
+    dim3 threads(ThreadsPerWavefront, WavefrontsPerBlock);
+
+    int32_t smem_softmax = T_MAX * sizeof(float) + threads.y * sizeof(float);
+    int32_t smem_output  = D_H * sizeof(float) *
+                          threads.y; // 4 * threadsPerBlock * sizeof(float) == sizeof(O[b][0][h][:])
+    const size_t lds_bytes = max(smem_softmax, smem_output);
+    auto stream            = at::cuda::getCurrentHIPStream().stream();
+
+    AT_DISPATCH_SWITCH_3(
+        at::ScalarType::Half,
+        at::ScalarType::BFloat16,
+        at::ScalarType::Float,
+        XQ.scalar_type(),
+        "efficient_attention_forward_decoder_ck",
+        [&] {
+            using ck_data_t   = c10_to_data_t<scalar_t>::type;
+            using device_op_t = ck::tensor_operation::device::FMHADecoderSeqlen1DeviceOp<ck_data_t>;
+            auto op           = device_op_t{};
+
+            auto XQ_acc = XQ.packed_accessor32<scalar_t, 4, at::RestrictPtrTraits>();
+            auto K_acc  = cache_K.packed_accessor64<scalar_t, 4, at::RestrictPtrTraits>();
+            auto V_acc  = cache_V.packed_accessor64<scalar_t, 4, at::RestrictPtrTraits>();
+            auto O_acc  = O.packed_accessor32<scalar_t, 4, at::RestrictPtrTraits>();
+            auto seq_acc =
+                seq_kv_lens
+                    ? seq_kv_lens->packed_accessor32<int32_t, 1, at::RestrictPtrTraits>().data()
+                    : nullptr;
+            auto arg = device_op_t::Argument(
+                reinterpret_cast<const ck_data_t* __restrict__>(XQ_acc.data()),
+                reinterpret_cast<const ck_data_t* __restrict__>(K_acc.data()),
+                reinterpret_cast<const ck_data_t* __restrict__>(V_acc.data()),
+                reinterpret_cast<ck_data_t* __restrict__>(O_acc.data()),
+                seq_acc,
+                XQ_acc.stride(0),
+                XQ_acc.stride(1),
+                XQ_acc.stride(2),
+                K_acc.stride(0),
+                K_acc.stride(1),
+                K_acc.stride(2),
+                K_acc.size(1),
+                K_acc.size(3),
+                K_acc.size(2) == 1,
+                qk_scale,
+                blocks,
+                threads,
+                lds_bytes);
+
+            auto invoker = device_op_t::Invoker{};
+            (void)invoker.Run(arg, {stream});
+        });
+
+    return O;
 }
 
 #undef AT_DISPATCH_CASE_3
 #undef AT_DISPATCH_SWITCH_3
 
 template <int32_t ThreadsPerWavefront, int32_t WavefrontsPerBlock>
-at::Tensor efficient_attention_forward_decoder_ck_impl(
-    const at::Tensor& XQ, // [B, 1, H, D]
-    const at::Tensor& cache_K, // [B, T_MAX, H or 1, D]
-    const at::Tensor& cache_V, // [B, T_MAX, H or 1, D]
-    at::optional<at::Tensor> seq_kv_lens, // [B]
-    double qk_scale) {
-  auto O = at::empty_like(XQ);
-  efficient_attention_forward_decoder_ck_out_impl<
-      ThreadsPerWavefront,
-      WavefrontsPerBlock>(XQ, cache_K, cache_V, seq_kv_lens, qk_scale, O);
-  return O;
+at::Tensor
+efficient_attention_forward_decoder_ck_impl(const at::Tensor& XQ,      // [B, 1, H, D]
+                                            const at::Tensor& cache_K, // [B, T_MAX, H or 1, D]
+                                            const at::Tensor& cache_V, // [B, T_MAX, H or 1, D]
+                                            at::optional<at::Tensor> seq_kv_lens, // [B]
+                                            double qk_scale)
+{
+    auto O = at::empty_like(XQ);
+    efficient_attention_forward_decoder_ck_out_impl<ThreadsPerWavefront, WavefrontsPerBlock>(
+        XQ, cache_K, cache_V, seq_kv_lens, qk_scale, O);
+    return O;
 }
 
-at::Tensor efficient_attention_forward_decoder_ck(
-    const at::Tensor& XQ, // [B, 1, H, D]
-    const at::Tensor& cache_K, // [B, T_MAX, H or 1, D]
-    const at::Tensor& cache_V, // [B, T_MAX, H or 1, D]
-    at::optional<at::Tensor> seq_kv_lens, // [B]
-    double qk_scale) {
-  return efficient_attention_forward_decoder_ck_impl<
-      kThreadsPerWavefront,
-      kWavefrontsPerBlock>(XQ, cache_K, cache_V, seq_kv_lens, qk_scale);
+at::Tensor
+efficient_attention_forward_decoder_ck(const at::Tensor& XQ,      // [B, 1, H, D]
+                                       const at::Tensor& cache_K, // [B, T_MAX, H or 1, D]
+                                       const at::Tensor& cache_V, // [B, T_MAX, H or 1, D]
+                                       at::optional<at::Tensor> seq_kv_lens, // [B]
+                                       double qk_scale)
+{
+    return efficient_attention_forward_decoder_ck_impl<kThreadsPerWavefront, kWavefrontsPerBlock>(
+        XQ, cache_K, cache_V, seq_kv_lens, qk_scale);
 }
 } // namespace
 
-TORCH_LIBRARY_IMPL(xformers, CUDA, m) {
-  m.impl(
-      TORCH_SELECTIVE_NAME("xformers::efficient_attention_forward_decoder_ck"),
-      TORCH_FN(efficient_attention_forward_decoder_ck));
+TORCH_LIBRARY_IMPL(xformers, CUDA, m)
+{
+    m.impl(TORCH_SELECTIVE_NAME("xformers::efficient_attention_forward_decoder_ck"),
+           TORCH_FN(efficient_attention_forward_decoder_ck));
 }
 
 #ifdef ATTN_FWD_DECODER_MAIN
@@ -206,106 +208,106 @@ TORCH_LIBRARY_IMPL(xformers, CUDA, m) {
 
 // clang-format on
 
-static void do_correctness_check() {
-  const int32_t D = 4 * kThreadsPerWavefront;
-  const int32_t B = 1;
-  const int32_t H = 4;
-  auto options = torch::TensorOptions()
-                     .dtype(torch::kFloat32)
-                     .layout(torch::kStrided)
-                     .device(torch::kCUDA, 1)
-                     .requires_grad(false);
-  auto int_options = options.dtype(torch::kInt);
-  auto XQ = at::randn({B, 1, H, D}, options);
-  auto K = at::randn({B, 4096, H, D}, options);
-  auto V = at::randn({B, 4096, H, D}, options);
-  auto seq = at::randint(63, 128, {B}, int_options);
-  double qk_scale = 1. / sqrt(D);
-
-  auto result = efficient_attention_forward_decoder_ck_impl<64, 1>(
-      XQ, K, V, seq, qk_scale);
-  auto gold_result = efficient_attention_forward_decoder_ck_impl<64, 2>(
-      XQ, K, V, seq, qk_scale);
-  auto mask = at::isclose(
-      result, gold_result, /*atol*/ 1e-3, /*rtol*/ 1e-5, /*equal_nan*/ false);
-  auto percent_match = at::sum(mask.to(torch::kFloat32)) / mask.numel();
-  printf(
-      "Mismatched elements percentage: %.2f\n",
-      1. - percent_match.item<float>());
+static void do_correctness_check()
+{
+    const int32_t D = 4 * kThreadsPerWavefront;
+    const int32_t B = 1;
+    const int32_t H = 4;
+    auto options    = torch::TensorOptions()
+                       .dtype(torch::kFloat32)
+                       .layout(torch::kStrided)
+                       .device(torch::kCUDA, 1)
+                       .requires_grad(false);
+    auto int_options = options.dtype(torch::kInt);
+    auto XQ          = at::randn({B, 1, H, D}, options);
+    auto K           = at::randn({B, 4096, H, D}, options);
+    auto V           = at::randn({B, 4096, H, D}, options);
+    auto seq         = at::randint(63, 128, {B}, int_options);
+    double qk_scale  = 1. / sqrt(D);
+
+    auto result      = efficient_attention_forward_decoder_ck_impl<64, 1>(XQ, K, V, seq, qk_scale);
+    auto gold_result = efficient_attention_forward_decoder_ck_impl<64, 2>(XQ, K, V, seq, qk_scale);
+    auto mask = at::isclose(result, gold_result, /*atol*/ 1e-3, /*rtol*/ 1e-5, /*equal_nan*/ false);
+    auto percent_match = at::sum(mask.to(torch::kFloat32)) / mask.numel();
+    printf("Mismatched elements percentage: %.2f\n", 1. - percent_match.item<float>());
 }
 
-int main(int argc, char** argv) {
-  if (argc == 1) {
-    do_correctness_check();
-  } else {
-    const auto args = std::vector<std::string>(argv + 1, argv + argc);
-    if (args.size() != 7) {
-      std::cout
-          << "Usage: ./a.out n_keys padding batch_size n_heads is_multiquery dtype n_wavefronts_per_block"
-          << std::endl;
-      return 0;
+int main(int argc, char** argv)
+{
+    if(argc == 1)
+    {
+        do_correctness_check();
     }
-    const int32_t n_keys = std::stoi(args[0]);
-    const int32_t padding = std::stoi(args[1]);
-    const int32_t batch_size = std::stoi(args[2]);
-    const int32_t n_heads = std::stoi(args[3]);
-    const int32_t multiquery = (args[4] == "mq");
-    const auto dtype = (args[5] == "f32") ? torch::kFloat32
-        : (args[5] == "f16")              ? torch::kFloat16
-                                          : torch::kBFloat16;
-    const int32_t n_wavefronts_per_block = std::stoi(args[6]);
-
-    const int32_t dim_per_head = 4 * kThreadsPerWavefront;
-
-    const auto options = torch::TensorOptions()
-                             .dtype(dtype)
-                             .layout(torch::kStrided)
-                             .device(torch::kCUDA, 1)
-                             .requires_grad(false);
-
-    const auto int_options = options.dtype(torch::kInt);
-    const auto Q = at::rand({batch_size, 1, n_heads, dim_per_head}, options);
-    const auto K = multiquery
-        ? at::rand({batch_size, padding, 1, dim_per_head}, options)
-              .expand({batch_size, padding, n_heads, dim_per_head})
-        : at::rand({batch_size, padding, n_heads, dim_per_head}, options);
-    const auto V = at::rand_like(K);
-    auto O = at::rand_like(Q);
-
-    const auto seq = at::randint(1, n_keys, {batch_size}, int_options);
-    const double qk_scale = 1. / sqrt(dim_per_head);
-    auto call_ptr = decltype(&efficient_attention_forward_decoder_ck_out_impl<
-                             kThreadsPerWavefront,
-                             kWavefrontsPerBlock>){};
-
-#define SWITCH_CASE_SET_CALLPTR(n)                               \
-  case (n):                                                      \
-    call_ptr = &efficient_attention_forward_decoder_ck_out_impl< \
-        kThreadsPerWavefront,                                    \
-        (n)>;                                                    \
-    break;
-
-    switch (n_wavefronts_per_block) {
-      SWITCH_CASE_SET_CALLPTR(1);
-      SWITCH_CASE_SET_CALLPTR(2);
-      SWITCH_CASE_SET_CALLPTR(4);
-      SWITCH_CASE_SET_CALLPTR(8);
-      SWITCH_CASE_SET_CALLPTR(16);
-
-      default:
-        call_ptr = nullptr;
+    else
+    {
+        const auto args = std::vector<std::string>(argv + 1, argv + argc);
+        if(args.size() != 7)
+        {
+            std::cout << "Usage: ./a.out n_keys padding batch_size n_heads is_multiquery dtype "
+                         "n_wavefronts_per_block"
+                      << std::endl;
+            return 0;
+        }
+        const int32_t n_keys     = std::stoi(args[0]);
+        const int32_t padding    = std::stoi(args[1]);
+        const int32_t batch_size = std::stoi(args[2]);
+        const int32_t n_heads    = std::stoi(args[3]);
+        const int32_t multiquery = (args[4] == "mq");
+        const auto dtype         = (args[5] == "f32")
+                               ? torch::kFloat32
+                               : (args[5] == "f16") ? torch::kFloat16 : torch::kBFloat16;
+        const int32_t n_wavefronts_per_block = std::stoi(args[6]);
+
+        const int32_t dim_per_head = 4 * kThreadsPerWavefront;
+
+        const auto options = torch::TensorOptions()
+                                 .dtype(dtype)
+                                 .layout(torch::kStrided)
+                                 .device(torch::kCUDA, 1)
+                                 .requires_grad(false);
+
+        const auto int_options = options.dtype(torch::kInt);
+        const auto Q           = at::rand({batch_size, 1, n_heads, dim_per_head}, options);
+        const auto K = multiquery ? at::rand({batch_size, padding, 1, dim_per_head}, options)
+                                        .expand({batch_size, padding, n_heads, dim_per_head})
+                                  : at::rand({batch_size, padding, n_heads, dim_per_head}, options);
+        const auto V = at::rand_like(K);
+        auto O       = at::rand_like(Q);
+
+        const auto seq        = at::randint(1, n_keys, {batch_size}, int_options);
+        const double qk_scale = 1. / sqrt(dim_per_head);
+        auto call_ptr =
+            decltype(&efficient_attention_forward_decoder_ck_out_impl<kThreadsPerWavefront,
+                                                                      kWavefrontsPerBlock>){};
+
+#define SWITCH_CASE_SET_CALLPTR(n)                                                              \
+    case(n):                                                                                    \
+        call_ptr = &efficient_attention_forward_decoder_ck_out_impl<kThreadsPerWavefront, (n)>; \
         break;
-    }
+
+        switch(n_wavefronts_per_block)
+        {
+            SWITCH_CASE_SET_CALLPTR(1);
+            SWITCH_CASE_SET_CALLPTR(2);
+            SWITCH_CASE_SET_CALLPTR(4);
+            SWITCH_CASE_SET_CALLPTR(8);
+            SWITCH_CASE_SET_CALLPTR(16);
+
+        default: call_ptr = nullptr; break;
+        }
 #undef SWITCH_CASE_SET_CALLPTR
 
-    if (call_ptr) {
-      call_ptr(Q, K, V, seq, qk_scale, O);
-    } else {
-      std::cout << "Warning: no kernel was found for wavefronts_per_block="
-                << n_wavefronts_per_block << std::endl;
+        if(call_ptr)
+        {
+            call_ptr(Q, K, V, seq, qk_scale, O);
+        }
+        else
+        {
+            std::cout << "Warning: no kernel was found for wavefronts_per_block="
+                      << n_wavefronts_per_block << std::endl;
+        }
     }
-  }
-  return 0;
+    return 0;
 }
 
-#endif // MAIN
\ No newline at end of file
+#endif // MAIN
diff --git a/xformers/csrc/attention/hip_fmha/attention_forward_generic.cpp b/xformers/csrc/attention/hip_fmha/attention_forward_generic.cpp
index aaafa1b3b..244e134a4 100644
--- a/xformers/csrc/attention/hip_fmha/attention_forward_generic.cpp
+++ b/xformers/csrc/attention/hip_fmha/attention_forward_generic.cpp
@@ -1,3 +1,9 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
 #include <cmath>
 #include <mutex>
 
@@ -17,18 +23,10 @@
 #include "ck_fmha_params.h"
 #include "ck_fmha_util.h"
 
-extern void batched_forward_fp16(
-    BatchedForwardParams& param,
-    hipStream_t stream);
-extern void batched_forward_bp16(
-    BatchedForwardParams& param,
-    hipStream_t stream);
-extern void grouped_forward_fp16(
-    GroupedForwardParams& param,
-    hipStream_t stream);
-extern void grouped_forward_bp16(
-    GroupedForwardParams& param,
-    hipStream_t stream);
+extern void batched_forward_fp16(BatchedForwardParams& param, hipStream_t stream);
+extern void batched_forward_bp16(BatchedForwardParams& param, hipStream_t stream);
+extern void grouped_forward_fp16(GroupedForwardParams& param, hipStream_t stream);
+extern void grouped_forward_bp16(GroupedForwardParams& param, hipStream_t stream);
 
 extern void batched_infer_fp16(BatchedForwardParams& param, hipStream_t stream);
 extern void batched_infer_bp16(BatchedForwardParams& param, hipStream_t stream);
@@ -42,11 +40,10 @@ namespace {
   (Mode BMHK) With all the heads having the same seqlen
   (Mode 1MHK) `batch=1` with all tokens across batches concatenated
 */
-std::tuple<at::Tensor, at::Tensor, int64_t, int64_t>
-efficient_attention_forward_ck(
-    const at::Tensor& query, // [b, seqlen, num_heads_q, K]
-    const at::Tensor& key, // [b, seqlen, num_heads_kv, K]
-    const at::Tensor& value, // [b, seqlen, num_heads_kv, Kv]
+std::tuple<at::Tensor, at::Tensor, int64_t, int64_t> efficient_attention_forward_ck(
+    const at::Tensor& query,               // [b, seqlen, num_heads_q, K]
+    const at::Tensor& key,                 // [b, seqlen, num_heads_kv, K]
+    const at::Tensor& value,               // [b, seqlen, num_heads_kv, Kv]
     const c10::optional<at::Tensor>& bias, // [b, num_heads_q, seqlen, seqlen]
     // (Mode 1MHK only) [b+1]: cu_seqlens_q[b] contains the
     // position of the first query token for batch $b
@@ -60,358 +57,378 @@ efficient_attention_forward_ck(
     bool compute_logsumexp,
     int64_t custom_mask_type,
     c10::optional<double> scale,
-    const c10::optional<at::Tensor>& seqlen_k) {
-  TORCH_CHECK(query.dim() == 4);
-  TORCH_CHECK(key.dim() == 4);
-  TORCH_CHECK(value.dim() == 4);
-
-  // Batch sizes
-  TORCH_CHECK(query.size(0) == key.size(0));
-  TORCH_CHECK(query.size(0) == value.size(0));
-
-  // Sequence length
-  TORCH_CHECK(key.size(1) == value.size(1));
-
-  // Num heads
-  TORCH_CHECK(query.size(2) % key.size(2) == 0);
-  TORCH_CHECK(key.size(2) == value.size(2));
-
-  // Embedding per head
-  TORCH_CHECK(query.size(3) == key.size(3));
-
-  TORCH_CHECK(query.scalar_type() == key.scalar_type());
-  TORCH_CHECK(query.scalar_type() == value.scalar_type());
-
-  TORCH_CHECK(seqstart_q.has_value() == seqstart_k.has_value());
-  if (seqstart_q.has_value()) {
-    TORCH_CHECK(seqstart_q->scalar_type() == at::ScalarType::Int);
-    TORCH_CHECK(seqstart_k->scalar_type() == at::ScalarType::Int);
-    TORCH_CHECK(seqstart_q->dim() == 1 && seqstart_k->dim() == 1);
-    CHECK_NOSPARSE_CONTIGUOUS_CPU((*seqstart_q));
-    CHECK_NOSPARSE_CONTIGUOUS_CPU((*seqstart_k));
-    TORCH_CHECK(seqstart_q->size(0) == seqstart_k->size(0));
-    TORCH_CHECK(query.size(0) == 1, "cu_seqlen only supports batch_size=1");
-    TORCH_CHECK(max_seqlen_q_.has_value());
-  };
-
-  // last dim is contiguous, device is kCUDA
-  CHECK_NOSPARSE_LASTCONTIGUOUS_CUDA(query);
-  CHECK_NOSPARSE_LASTCONTIGUOUS_CUDA(key);
-  CHECK_NOSPARSE_LASTCONTIGUOUS_CUDA(value);
-
-  // at::cuda::CUDAGuard device_guard(query.device());
-  hipStream_t stream = at::cuda::getCurrentHIPStream().stream();
-
-  int64_t B = query.size(0);
-  int64_t M = query.size(1);
-  int64_t N = key.size(1);
-  int64_t Hq = query.size(-2);
-  int64_t Hkv = key.size(-2);
-  int64_t K = query.size(-1);
-  int64_t Kv = value.size(-1);
-
-  auto opts = query.options();
-
-  at::Tensor logsumexp;
-
-  at::Tensor out = at::empty({B, M, Hq, Kv}, opts);
-
-  const bool use_dropout = std::fpclassify(dropout_p) != FP_ZERO;
-  int64_t philox_seed;
-  int64_t philox_offset;
-
-  if (use_dropout) {
-    at::PhiloxCudaState rng_engine_inputs;
-    at::CUDAGeneratorImpl* gen =
-        at::get_generator_or_default<at::CUDAGeneratorImpl>(
+    const c10::optional<at::Tensor>& seqlen_k)
+{
+    TORCH_CHECK(query.dim() == 4);
+    TORCH_CHECK(key.dim() == 4);
+    TORCH_CHECK(value.dim() == 4);
+
+    // Batch sizes
+    TORCH_CHECK(query.size(0) == key.size(0));
+    TORCH_CHECK(query.size(0) == value.size(0));
+
+    // Sequence length
+    TORCH_CHECK(key.size(1) == value.size(1));
+
+    // Num heads
+    TORCH_CHECK(query.size(2) % key.size(2) == 0);
+    TORCH_CHECK(key.size(2) == value.size(2));
+
+    // Embedding per head
+    TORCH_CHECK(query.size(3) == key.size(3));
+
+    TORCH_CHECK(query.scalar_type() == key.scalar_type());
+    TORCH_CHECK(query.scalar_type() == value.scalar_type());
+
+    TORCH_CHECK(seqstart_q.has_value() == seqstart_k.has_value());
+    if(seqstart_q.has_value())
+    {
+        TORCH_CHECK(seqstart_q->scalar_type() == at::ScalarType::Int);
+        TORCH_CHECK(seqstart_k->scalar_type() == at::ScalarType::Int);
+        TORCH_CHECK(seqstart_q->dim() == 1 && seqstart_k->dim() == 1);
+        CHECK_NOSPARSE_CONTIGUOUS_CPU((*seqstart_q));
+        CHECK_NOSPARSE_CONTIGUOUS_CPU((*seqstart_k));
+        TORCH_CHECK(seqstart_q->size(0) == seqstart_k->size(0));
+        TORCH_CHECK(query.size(0) == 1, "cu_seqlen only supports batch_size=1");
+        TORCH_CHECK(max_seqlen_q_.has_value());
+    };
+
+    // last dim is contiguous, device is kCUDA
+    CHECK_NOSPARSE_LASTCONTIGUOUS_CUDA(query);
+    CHECK_NOSPARSE_LASTCONTIGUOUS_CUDA(key);
+    CHECK_NOSPARSE_LASTCONTIGUOUS_CUDA(value);
+
+    // at::cuda::CUDAGuard device_guard(query.device());
+    hipStream_t stream = at::cuda::getCurrentHIPStream().stream();
+
+    int64_t B   = query.size(0);
+    int64_t M   = query.size(1);
+    int64_t N   = key.size(1);
+    int64_t Hq  = query.size(-2);
+    int64_t Hkv = key.size(-2);
+    int64_t K   = query.size(-1);
+    int64_t Kv  = value.size(-1);
+
+    auto opts = query.options();
+
+    at::Tensor logsumexp;
+
+    at::Tensor out = at::empty({B, M, Hq, Kv}, opts);
+
+    const bool use_dropout = std::fpclassify(dropout_p) != FP_ZERO;
+    int64_t philox_seed;
+    int64_t philox_offset;
+
+    if(use_dropout)
+    {
+        at::PhiloxCudaState rng_engine_inputs;
+        at::CUDAGeneratorImpl* gen = at::get_generator_or_default<at::CUDAGeneratorImpl>(
             c10::nullopt, at::cuda::detail::getDefaultCUDAGenerator());
 
-    std::lock_guard<std::mutex> lock(gen->mutex_);
-    // if using dropout, we produce 1 random number for each element of the
-    // attention tensor
-    rng_engine_inputs = gen->philox_cuda_state(B * Hq * M * N);
-
-    const auto seeds = at::cuda::philox::unpack(rng_engine_inputs);
-
-    philox_seed = std::get<0>(seeds);
-    philox_offset = std::get<1>(seeds);
-  }
-
-  auto set_batched_forward_params = [&](BatchedForwardParams& p) {
-    p.B = B;
-    p.M = M;
-    p.N = N;
-    p.Hq = Hq;
-    p.Hkv = Hkv;
-    p.K = K;
-    p.Kv = Kv;
-
-    if (scale.has_value()) {
-      p.scale = float(*scale);
-    } else {
-      p.scale = float(1.0 / std::sqrt(float(K)));
-    }
+        std::lock_guard<std::mutex> lock(gen->mutex_);
+        // if using dropout, we produce 1 random number for each element of the
+        // attention tensor
+        rng_engine_inputs = gen->philox_cuda_state(B * Hq * M * N);
 
-    p.q_ptr = query.data_ptr();
-    p.k_ptr = key.data_ptr();
-    p.v_ptr = value.data_ptr();
-    p.out_ptr = out.data_ptr();
-
-    p.q_strides = {
-        static_cast<int>(query.stride(0)),
-        static_cast<int>(query.stride(1)),
-        static_cast<int>(query.stride(2)),
-        static_cast<int>(query.stride(3))};
-    p.k_strides = {
-        static_cast<int>(key.stride(0)),
-        static_cast<int>(key.stride(1)),
-        static_cast<int>(key.stride(2)),
-        static_cast<int>(key.stride(3))};
-    p.v_strides = {
-        static_cast<int>(value.stride(0)),
-        static_cast<int>(value.stride(1)),
-        static_cast<int>(value.stride(2)),
-        static_cast<int>(value.stride(3))};
-    p.out_strides = {
-        static_cast<int>(out.stride(0)),
-        static_cast<int>(out.stride(1)),
-        static_cast<int>(out.stride(2)),
-        static_cast<int>(out.stride(3))};
-
-    if (bias.has_value()) {
-      CHECK_NOSPARSE_LASTCONTIGUOUS_CUDA((*bias));
-      TORCH_CHECK(bias->scalar_type() == query.scalar_type());
-
-      p.has_attn_bias = true;
-      p.attn_bias_ptr = bias->data_ptr();
-
-      const at::Tensor bias_4d_view =
-          get_bias_4d_view(*bias, B, Hq, M, N);
-      p.attn_bias_strides = {
-          static_cast<int>(bias_4d_view.stride(0)),
-          static_cast<int>(bias_4d_view.stride(1)),
-          static_cast<int>(bias_4d_view.stride(2)),
-          static_cast<int>(bias_4d_view.stride(3))};
-    } else
-      p.has_attn_bias = false;
-
-    p.custom_mask_type = custom_mask_type;
-
-    p.use_dropout = use_dropout;
-    p.philox_seed = philox_seed;
-    p.philox_offset = philox_offset;
-    p.compute_logsumexp = compute_logsumexp;
-
-    // the following parameters are only used by training forward
-    if (p.use_dropout)
-      p.dropout_prob = static_cast<float>(dropout_p);
-    else
-      p.dropout_prob = 0.0f;
-
-    if (p.compute_logsumexp) {
-      logsumexp = at::empty({B, Hq, M}, opts.dtype(at::kFloat));
-      p.logsumexp_ptr = logsumexp.data_ptr();
-    } else
-      p.logsumexp_ptr = nullptr;
-  };
-
-  auto set_grouped_forward_params = [&](GroupedForwardParams& p) {
-    p.num_batches = seqstart_q->size(0) - 1;
-    p.M = M;
-    p.N = N;
-    p.Hq = Hq;
-    p.Hkv = Hkv;
-    p.K = K;
-    p.Kv = Kv;
-
-    if (scale.has_value()) {
-      p.scale = float(*scale);
-    } else {
-      p.scale = float(1.0 / std::sqrt(float(K)));
-    }
+        const auto seeds = at::cuda::philox::unpack(rng_engine_inputs);
 
-    p.q_strides = {
-        static_cast<int>(query.stride(1)),
-        static_cast<int>(query.stride(2)),
-        static_cast<int>(query.stride(3))};
-    p.k_strides = {
-        static_cast<int>(key.stride(1)),
-        static_cast<int>(key.stride(2)),
-        static_cast<int>(key.stride(3))};
-    p.v_strides = {
-        static_cast<int>(value.stride(1)),
-        static_cast<int>(value.stride(2)),
-        static_cast<int>(value.stride(3))};
-    p.out_strides = {
-        static_cast<int>(out.stride(1)),
-        static_cast<int>(out.stride(2)),
-        static_cast<int>(out.stride(3))};
-
-    if (bias.has_value()) {
-      CHECK_NOSPARSE_LASTCONTIGUOUS_CUDA((*bias));
-      TORCH_CHECK(bias->scalar_type() == query.scalar_type());
-
-      p.has_attn_bias = true;
-      const at::Tensor bias_4d_view =
-          get_bias_4d_view(*bias, B, Hq, M, N);
-      p.attn_bias_strides = {
-          static_cast<int>(bias_4d_view.stride(0)),
-          static_cast<int>(bias_4d_view.stride(1)),
-          static_cast<int>(bias_4d_view.stride(2)),
-          static_cast<int>(bias_4d_view.stride(3))};
-    } else
-      p.has_attn_bias = false;
-
-    p.custom_mask_type = custom_mask_type;
-
-    // max_seqlen_q is used to create logsumexp tensor
-    p.max_seqlen_q = *max_seqlen_q_;
-
-    p.host_seqstart_q.resize(p.num_batches + 1);
-    p.host_seqstart_k.resize(p.num_batches + 1);
-
-    for (int i = 0; i < p.host_seqstart_q.size(); i++)
-      p.host_seqstart_q[i] =
-          *(reinterpret_cast<int*>(seqstart_q->data_ptr()) + i);
-
-    for (int i = 0; i < p.host_seqstart_k.size(); i++)
-      p.host_seqstart_k[i] =
-          *(reinterpret_cast<int*>(seqstart_k->data_ptr()) + i);
-
-    if (seqlen_k.has_value()) {
-      TORCH_CHECK(seqlen_k->scalar_type() == at::ScalarType::Int);
-      TORCH_CHECK(seqlen_k->dim() == 1);
-      TORCH_CHECK(seqlen_k->size(0) == p.num_batches)
-      CHECK_NOSPARSE_CONTIGUOUS_CPU((*seqlen_k));
-
-      p.host_seqlen_k.resize(p.num_batches);
-
-      for (int i = 0; i < p.host_seqlen_k.size(); i++)
-        p.host_seqlen_k[i] =
-            *(reinterpret_cast<int*>(seqlen_k->data_ptr()) + i);
+        philox_seed   = std::get<0>(seeds);
+        philox_offset = std::get<1>(seeds);
     }
 
-    char* q_ptr = reinterpret_cast<char*>(query.data_ptr());
-    char* k_ptr = reinterpret_cast<char*>(key.data_ptr());
-    char* v_ptr = reinterpret_cast<char*>(value.data_ptr());
-
-    char* out_ptr = reinterpret_cast<char*>(out.data_ptr());
-    char* attn_bias_ptr =
-        bias.has_value() ? reinterpret_cast<char*>(bias->data_ptr()) : nullptr;
-
-    for (int i = 0; i < p.num_batches; i++) {
-      size_t tmp_q_offset = get_size_in_bytes(
-          static_cast<size_t>(p.host_seqstart_q[i]) * p.q_strides[0],
-          query.scalar_type());
-      size_t tmp_k_offset = get_size_in_bytes(
-          static_cast<size_t>(p.host_seqstart_k[i]) * p.k_strides[0],
-          key.scalar_type());
-      size_t tmp_v_offset = get_size_in_bytes(
-          static_cast<size_t>(p.host_seqstart_k[i]) * p.v_strides[0],
-          value.scalar_type());
-      size_t tmp_o_offset = get_size_in_bytes(
-          static_cast<size_t>(p.host_seqstart_q[i]) * p.out_strides[0],
-          out.scalar_type());
-
-      p.q_ptrs.push_back(reinterpret_cast<void*>(&q_ptr[tmp_q_offset]));
-      p.k_ptrs.push_back(reinterpret_cast<void*>(&k_ptr[tmp_k_offset]));
-      p.v_ptrs.push_back(reinterpret_cast<void*>(&v_ptr[tmp_v_offset]));
-      p.out_ptrs.push_back(reinterpret_cast<void*>(&out_ptr[tmp_o_offset]));
-
-      if (bias.has_value()) {
-        size_t tmp_bias_offset = get_size_in_bytes(
-            static_cast<size_t>(p.host_seqstart_q[i]) * p.attn_bias_strides[2] +
-                static_cast<size_t>(p.host_seqstart_k[i]) *
-                    p.attn_bias_strides[3],
-            bias->scalar_type());
-
-        p.attn_bias_ptrs.push_back(
-            reinterpret_cast<void*>(&attn_bias_ptr[tmp_bias_offset]));
-      };
-
-      // ToDO: remove this after dev-op fix
-      p.randvals_ptrs.push_back(nullptr);
-    }
+    auto set_batched_forward_params = [&](BatchedForwardParams& p) {
+        p.B   = B;
+        p.M   = M;
+        p.N   = N;
+        p.Hq  = Hq;
+        p.Hkv = Hkv;
+        p.K   = K;
+        p.Kv  = Kv;
+
+        if(scale.has_value())
+        {
+            p.scale = float(*scale);
+        }
+        else
+        {
+            p.scale = float(1.0 / std::sqrt(float(K)));
+        }
+
+        p.q_ptr   = query.data_ptr();
+        p.k_ptr   = key.data_ptr();
+        p.v_ptr   = value.data_ptr();
+        p.out_ptr = out.data_ptr();
+
+        p.q_strides   = {static_cast<int>(query.stride(0)),
+                       static_cast<int>(query.stride(1)),
+                       static_cast<int>(query.stride(2)),
+                       static_cast<int>(query.stride(3))};
+        p.k_strides   = {static_cast<int>(key.stride(0)),
+                       static_cast<int>(key.stride(1)),
+                       static_cast<int>(key.stride(2)),
+                       static_cast<int>(key.stride(3))};
+        p.v_strides   = {static_cast<int>(value.stride(0)),
+                       static_cast<int>(value.stride(1)),
+                       static_cast<int>(value.stride(2)),
+                       static_cast<int>(value.stride(3))};
+        p.out_strides = {static_cast<int>(out.stride(0)),
+                         static_cast<int>(out.stride(1)),
+                         static_cast<int>(out.stride(2)),
+                         static_cast<int>(out.stride(3))};
+
+        if(bias.has_value())
+        {
+            CHECK_NOSPARSE_LASTCONTIGUOUS_CUDA((*bias));
+            TORCH_CHECK(bias->scalar_type() == query.scalar_type());
+
+            p.has_attn_bias = true;
+            p.attn_bias_ptr = bias->data_ptr();
+
+            const at::Tensor bias_4d_view = get_bias_4d_view(*bias, B, Hq, M, N);
+            p.attn_bias_strides           = {static_cast<int>(bias_4d_view.stride(0)),
+                                   static_cast<int>(bias_4d_view.stride(1)),
+                                   static_cast<int>(bias_4d_view.stride(2)),
+                                   static_cast<int>(bias_4d_view.stride(3))};
+        }
+        else
+            p.has_attn_bias = false;
+
+        p.custom_mask_type = custom_mask_type;
+
+        p.use_dropout       = use_dropout;
+        p.philox_seed       = philox_seed;
+        p.philox_offset     = philox_offset;
+        p.compute_logsumexp = compute_logsumexp;
+
+        // the following parameters are only used by training forward
+        if(p.use_dropout)
+            p.dropout_prob = static_cast<float>(dropout_p);
+        else
+            p.dropout_prob = 0.0f;
+
+        if(p.compute_logsumexp)
+        {
+            logsumexp       = at::empty({B, Hq, M}, opts.dtype(at::kFloat));
+            p.logsumexp_ptr = logsumexp.data_ptr();
+        }
+        else
+            p.logsumexp_ptr = nullptr;
+    };
 
-    p.use_dropout = use_dropout;
-    p.philox_seed = philox_seed;
-    p.philox_offset = philox_offset;
-    p.compute_logsumexp = compute_logsumexp;
+    auto set_grouped_forward_params = [&](GroupedForwardParams& p) {
+        p.num_batches = seqstart_q->size(0) - 1;
+        p.M           = M;
+        p.N           = N;
+        p.Hq          = Hq;
+        p.Hkv         = Hkv;
+        p.K           = K;
+        p.Kv          = Kv;
+
+        if(scale.has_value())
+        {
+            p.scale = float(*scale);
+        }
+        else
+        {
+            p.scale = float(1.0 / std::sqrt(float(K)));
+        }
+
+        p.q_strides   = {static_cast<int>(query.stride(1)),
+                       static_cast<int>(query.stride(2)),
+                       static_cast<int>(query.stride(3))};
+        p.k_strides   = {static_cast<int>(key.stride(1)),
+                       static_cast<int>(key.stride(2)),
+                       static_cast<int>(key.stride(3))};
+        p.v_strides   = {static_cast<int>(value.stride(1)),
+                       static_cast<int>(value.stride(2)),
+                       static_cast<int>(value.stride(3))};
+        p.out_strides = {static_cast<int>(out.stride(1)),
+                         static_cast<int>(out.stride(2)),
+                         static_cast<int>(out.stride(3))};
+
+        if(bias.has_value())
+        {
+            CHECK_NOSPARSE_LASTCONTIGUOUS_CUDA((*bias));
+            TORCH_CHECK(bias->scalar_type() == query.scalar_type());
+
+            p.has_attn_bias               = true;
+            const at::Tensor bias_4d_view = get_bias_4d_view(*bias, B, Hq, M, N);
+            p.attn_bias_strides           = {static_cast<int>(bias_4d_view.stride(0)),
+                                   static_cast<int>(bias_4d_view.stride(1)),
+                                   static_cast<int>(bias_4d_view.stride(2)),
+                                   static_cast<int>(bias_4d_view.stride(3))};
+        }
+        else
+            p.has_attn_bias = false;
+
+        p.custom_mask_type = custom_mask_type;
+
+        // max_seqlen_q is used to create logsumexp tensor
+        p.max_seqlen_q = *max_seqlen_q_;
+
+        p.host_seqstart_q.resize(p.num_batches + 1);
+        p.host_seqstart_k.resize(p.num_batches + 1);
+
+        for(int i = 0; i < p.host_seqstart_q.size(); i++)
+            p.host_seqstart_q[i] = *(reinterpret_cast<int*>(seqstart_q->data_ptr()) + i);
+
+        for(int i = 0; i < p.host_seqstart_k.size(); i++)
+            p.host_seqstart_k[i] = *(reinterpret_cast<int*>(seqstart_k->data_ptr()) + i);
+
+        if(seqlen_k.has_value())
+        {
+            TORCH_CHECK(seqlen_k->scalar_type() == at::ScalarType::Int);
+            TORCH_CHECK(seqlen_k->dim() == 1);
+            TORCH_CHECK(seqlen_k->size(0) == p.num_batches)
+            CHECK_NOSPARSE_CONTIGUOUS_CPU((*seqlen_k));
+
+            p.host_seqlen_k.resize(p.num_batches);
+
+            for(int i = 0; i < p.host_seqlen_k.size(); i++)
+                p.host_seqlen_k[i] = *(reinterpret_cast<int*>(seqlen_k->data_ptr()) + i);
+        }
+
+        char* q_ptr = reinterpret_cast<char*>(query.data_ptr());
+        char* k_ptr = reinterpret_cast<char*>(key.data_ptr());
+        char* v_ptr = reinterpret_cast<char*>(value.data_ptr());
+
+        char* out_ptr = reinterpret_cast<char*>(out.data_ptr());
+        char* attn_bias_ptr =
+            bias.has_value() ? reinterpret_cast<char*>(bias->data_ptr()) : nullptr;
+
+        for(int i = 0; i < p.num_batches; i++)
+        {
+            size_t tmp_q_offset = get_size_in_bytes(
+                static_cast<size_t>(p.host_seqstart_q[i]) * p.q_strides[0], query.scalar_type());
+            size_t tmp_k_offset = get_size_in_bytes(
+                static_cast<size_t>(p.host_seqstart_k[i]) * p.k_strides[0], key.scalar_type());
+            size_t tmp_v_offset = get_size_in_bytes(
+                static_cast<size_t>(p.host_seqstart_k[i]) * p.v_strides[0], value.scalar_type());
+            size_t tmp_o_offset = get_size_in_bytes(
+                static_cast<size_t>(p.host_seqstart_q[i]) * p.out_strides[0], out.scalar_type());
+
+            p.q_ptrs.push_back(reinterpret_cast<void*>(&q_ptr[tmp_q_offset]));
+            p.k_ptrs.push_back(reinterpret_cast<void*>(&k_ptr[tmp_k_offset]));
+            p.v_ptrs.push_back(reinterpret_cast<void*>(&v_ptr[tmp_v_offset]));
+            p.out_ptrs.push_back(reinterpret_cast<void*>(&out_ptr[tmp_o_offset]));
+
+            if(bias.has_value())
+            {
+                size_t tmp_bias_offset = get_size_in_bytes(
+                    static_cast<size_t>(p.host_seqstart_q[i]) * p.attn_bias_strides[2] +
+                        static_cast<size_t>(p.host_seqstart_k[i]) * p.attn_bias_strides[3],
+                    bias->scalar_type());
+
+                p.attn_bias_ptrs.push_back(
+                    reinterpret_cast<void*>(&attn_bias_ptr[tmp_bias_offset]));
+            };
+
+            // ToDO: remove this after dev-op fix
+            p.randvals_ptrs.push_back(nullptr);
+        }
+
+        p.use_dropout       = use_dropout;
+        p.philox_seed       = philox_seed;
+        p.philox_offset     = philox_offset;
+        p.compute_logsumexp = compute_logsumexp;
+
+        // the following parameters are only used by training forward
+        if(p.use_dropout)
+            p.dropout_prob = static_cast<float>(dropout_p);
+        else
+            p.dropout_prob = 0.0f;
+
+        if(p.compute_logsumexp)
+        {
+            logsumexp = at::empty({p.num_batches, Hq, p.max_seqlen_q}, opts.dtype(at::kFloat));
+            char* logsumexp_ptr = reinterpret_cast<char*>(logsumexp.data_ptr());
+
+            for(int i = 0; i < p.num_batches; i++)
+            {
+                size_t tmp_logsumexp_offset = get_size_in_bytes(
+                    static_cast<size_t>(i) * Hq * p.max_seqlen_q, logsumexp.scalar_type());
+                p.logsumexp_ptrs.push_back(
+                    reinterpret_cast<void*>(&logsumexp_ptr[tmp_logsumexp_offset]));
+            };
+        };
+    };
 
-    // the following parameters are only used by training forward
-    if (p.use_dropout)
-      p.dropout_prob = static_cast<float>(dropout_p);
+    auto inDataType = query.scalar_type();
+
+    if(!seqstart_q.has_value())
+    { // input is batched
+        BatchedForwardParams batched_forward_params;
+
+        set_batched_forward_params(batched_forward_params);
+
+        if(!batched_forward_params.use_dropout && !batched_forward_params.compute_logsumexp)
+        {
+            if(inDataType == at::ScalarType::Half)
+            {
+                batched_infer_fp16(batched_forward_params, stream);
+            }
+            else if(inDataType == at::ScalarType::BFloat16)
+            {
+                batched_infer_bp16(batched_forward_params, stream);
+            }
+            else
+                throw std::runtime_error("input data-type is not supported!");
+        }
+        else
+        {
+            if(inDataType == at::ScalarType::Half)
+            {
+                batched_forward_fp16(batched_forward_params, stream);
+            }
+            else if(inDataType == at::ScalarType::BFloat16)
+            {
+                batched_forward_bp16(batched_forward_params, stream);
+            }
+            else
+                throw std::runtime_error("input data-type is not supported!");
+        };
+    }
     else
-      p.dropout_prob = 0.0f;
-
-    if (p.compute_logsumexp) {
-      logsumexp = at::empty(
-          {p.num_batches, Hq, p.max_seqlen_q}, opts.dtype(at::kFloat));
-      char* logsumexp_ptr = reinterpret_cast<char*>(logsumexp.data_ptr());
-
-      for (int i = 0; i < p.num_batches; i++) {
-        size_t tmp_logsumexp_offset = get_size_in_bytes(
-            static_cast<size_t>(i) * Hq * p.max_seqlen_q,
-            logsumexp.scalar_type());
-        p.logsumexp_ptrs.push_back(
-            reinterpret_cast<void*>(&logsumexp_ptr[tmp_logsumexp_offset]));
-      };
-    };
-  };
-
-  auto inDataType = query.scalar_type();
-
-  if (!seqstart_q.has_value()) { // input is batched
-    BatchedForwardParams batched_forward_params;
-
-    set_batched_forward_params(batched_forward_params);
-
-    if (!batched_forward_params.use_dropout &&
-        !batched_forward_params.compute_logsumexp) {
-      if (inDataType == at::ScalarType::Half) {
-        batched_infer_fp16(batched_forward_params, stream);
-      } else if (inDataType == at::ScalarType::BFloat16) {
-        batched_infer_bp16(batched_forward_params, stream);
-      } else
-        throw std::runtime_error("input data-type is not supported!");
-    } else {
-      if (inDataType == at::ScalarType::Half) {
-        batched_forward_fp16(batched_forward_params, stream);
-      } else if (inDataType == at::ScalarType::BFloat16) {
-        batched_forward_bp16(batched_forward_params, stream);
-      } else
-        throw std::runtime_error("input data-type is not supported!");
-    };
-  } else { // input is grouped
-    GroupedForwardParams grouped_forward_params;
-
-    set_grouped_forward_params(grouped_forward_params);
-
-    if (!grouped_forward_params.use_dropout &&
-        !grouped_forward_params.compute_logsumexp) {
-      if (inDataType == at::ScalarType::Half) {
-        grouped_infer_fp16(grouped_forward_params, stream);
-      } else if (inDataType == at::ScalarType::BFloat16) {
-        grouped_infer_bp16(grouped_forward_params, stream);
-      } else
-        throw std::runtime_error("input data-type is not supported!");
-    } else {
-      if (inDataType == at::ScalarType::Half) {
-        grouped_forward_fp16(grouped_forward_params, stream);
-      } else if (inDataType == at::ScalarType::BFloat16) {
-        grouped_forward_bp16(grouped_forward_params, stream);
-      } else
-        throw std::runtime_error("input data-type is not supported!");
+    { // input is grouped
+        GroupedForwardParams grouped_forward_params;
+
+        set_grouped_forward_params(grouped_forward_params);
+
+        if(!grouped_forward_params.use_dropout && !grouped_forward_params.compute_logsumexp)
+        {
+            if(inDataType == at::ScalarType::Half)
+            {
+                grouped_infer_fp16(grouped_forward_params, stream);
+            }
+            else if(inDataType == at::ScalarType::BFloat16)
+            {
+                grouped_infer_bp16(grouped_forward_params, stream);
+            }
+            else
+                throw std::runtime_error("input data-type is not supported!");
+        }
+        else
+        {
+            if(inDataType == at::ScalarType::Half)
+            {
+                grouped_forward_fp16(grouped_forward_params, stream);
+            }
+            else if(inDataType == at::ScalarType::BFloat16)
+            {
+                grouped_forward_bp16(grouped_forward_params, stream);
+            }
+            else
+                throw std::runtime_error("input data-type is not supported!");
+        };
     };
-  };
 
-  return std::make_tuple(out, logsumexp, philox_seed, philox_offset);
+    return std::make_tuple(out, logsumexp, philox_seed, philox_offset);
 }
 
 } // namespace
 
-TORCH_LIBRARY_IMPL(xformers, CUDA, m) {
-  m.impl(
-      TORCH_SELECTIVE_NAME("xformers::efficient_attention_forward_ck"),
-      TORCH_FN(efficient_attention_forward_ck));
+TORCH_LIBRARY_IMPL(xformers, CUDA, m)
+{
+    m.impl(TORCH_SELECTIVE_NAME("xformers::efficient_attention_forward_ck"),
+           TORCH_FN(efficient_attention_forward_ck));
 }
diff --git a/xformers/csrc/attention/hip_fmha/attention_forward_generic_ck_tiled.cpp b/xformers/csrc/attention/hip_fmha/attention_forward_generic_ck_tiled.cpp
index e392935ce..922f82909 100644
--- a/xformers/csrc/attention/hip_fmha/attention_forward_generic_ck_tiled.cpp
+++ b/xformers/csrc/attention/hip_fmha/attention_forward_generic_ck_tiled.cpp
@@ -1,3 +1,9 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
 #include <cmath>
 #include <mutex>
 
diff --git a/xformers/csrc/attention/hip_fmha/ck_align_switch.h b/xformers/csrc/attention/hip_fmha/ck_align_switch.h
index edd49290b..f3dd9dbbe 100644
--- a/xformers/csrc/attention/hip_fmha/ck_align_switch.h
+++ b/xformers/csrc/attention/hip_fmha/ck_align_switch.h
@@ -1,145 +1,171 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
 #pragma once
 
 #include <stdexcept>
 
 // assume the maximum alignment is 8 elements
-#define ALIGN_SWITCH_1(CONST_ALIGN_MAX1, CONST_ALIGN_NAME1, LENGTH1, ...)   \
-  [&] {                                                                     \
-    if constexpr (CONST_ALIGN_MAX1 > 0) {                                   \
-      if (LENGTH1 % CONST_ALIGN_MAX1 == 0) {                                \
-        constexpr ck::index_t CONST_ALIGN_NAME1 = CONST_ALIGN_MAX1;         \
-        __VA_ARGS__();                                                      \
-      } else {                                                              \
-        if constexpr (CONST_ALIGN_MAX1 / 2 > 0) {                           \
-          if (LENGTH1 % (CONST_ALIGN_MAX1 / 2) == 0) {                      \
-            constexpr ck::index_t CONST_ALIGN_NAME1 = CONST_ALIGN_MAX1 / 2; \
-            __VA_ARGS__();                                                  \
-          } else {                                                          \
-            if constexpr (CONST_ALIGN_MAX1 / 4 > 0) {                       \
-              if (LENGTH1 % (CONST_ALIGN_MAX1 / 4) == 0) {                  \
-                constexpr ck::index_t CONST_ALIGN_NAME1 =                   \
-                    CONST_ALIGN_MAX1 / 4;                                   \
-                __VA_ARGS__();                                              \
-              } else {                                                      \
-                constexpr ck::index_t CONST_ALIGN_NAME1 = 1;                \
-                __VA_ARGS__();                                              \
-              };                                                            \
-            }                                                               \
-          };                                                                \
-        }                                                                   \
-      };                                                                    \
-    }                                                                       \
-  }()
+#define ALIGN_SWITCH_1(CONST_ALIGN_MAX1, CONST_ALIGN_NAME1, LENGTH1, ...)                       \
+    [&] {                                                                                       \
+        if constexpr(CONST_ALIGN_MAX1 > 0)                                                      \
+        {                                                                                       \
+            if(LENGTH1 % CONST_ALIGN_MAX1 == 0)                                                 \
+            {                                                                                   \
+                constexpr ck::index_t CONST_ALIGN_NAME1 = CONST_ALIGN_MAX1;                     \
+                __VA_ARGS__();                                                                  \
+            }                                                                                   \
+            else                                                                                \
+            {                                                                                   \
+                if constexpr(CONST_ALIGN_MAX1 / 2 > 0)                                          \
+                {                                                                               \
+                    if(LENGTH1 % (CONST_ALIGN_MAX1 / 2) == 0)                                   \
+                    {                                                                           \
+                        constexpr ck::index_t CONST_ALIGN_NAME1 = CONST_ALIGN_MAX1 / 2;         \
+                        __VA_ARGS__();                                                          \
+                    }                                                                           \
+                    else                                                                        \
+                    {                                                                           \
+                        if constexpr(CONST_ALIGN_MAX1 / 4 > 0)                                  \
+                        {                                                                       \
+                            if(LENGTH1 % (CONST_ALIGN_MAX1 / 4) == 0)                           \
+                            {                                                                   \
+                                constexpr ck::index_t CONST_ALIGN_NAME1 = CONST_ALIGN_MAX1 / 4; \
+                                __VA_ARGS__();                                                  \
+                            }                                                                   \
+                            else                                                                \
+                            {                                                                   \
+                                constexpr ck::index_t CONST_ALIGN_NAME1 = 1;                    \
+                                __VA_ARGS__();                                                  \
+                            };                                                                  \
+                        }                                                                       \
+                    };                                                                          \
+                }                                                                               \
+            };                                                                                  \
+        }                                                                                       \
+    }()
 
 // assume the maximum alignment is 8 elements
-#define ALIGN_SWITCH_2(                                                       \
-    CONST_ALIGN_MAX1,                                                         \
-    CONST_ALIGN_NAME1,                                                        \
-    LENGTH1,                                                                  \
-    CONST_ALIGN_MAX2,                                                         \
-    CONST_ALIGN_NAME2,                                                        \
-    LENGTH2,                                                                  \
-    ...)                                                                      \
-  [&] {                                                                       \
-    if constexpr (CONST_ALIGN_MAX1 > 0) {                                     \
-      if (LENGTH1 % CONST_ALIGN_MAX1 == 0) {                                  \
-        constexpr ck::index_t CONST_ALIGN_NAME1 = CONST_ALIGN_MAX1;           \
-        ALIGN_SWITCH_1(                                                       \
-            CONST_ALIGN_MAX2, CONST_ALIGN_NAME2, LENGTH2, ##__VA_ARGS__);     \
-      } else {                                                                \
-        if constexpr (CONST_ALIGN_MAX1 / 2 > 0) {                             \
-          if (LENGTH1 % (CONST_ALIGN_MAX1 / 2) == 0) {                        \
-            constexpr ck::index_t CONST_ALIGN_NAME1 = CONST_ALIGN_MAX1 / 2;   \
-            ALIGN_SWITCH_1(                                                   \
-                CONST_ALIGN_MAX2, CONST_ALIGN_NAME2, LENGTH2, ##__VA_ARGS__); \
-          } else {                                                            \
-            if constexpr (CONST_ALIGN_MAX1 / 4 > 0) {                         \
-              if (LENGTH1 % (CONST_ALIGN_MAX1 / 4) == 0) {                    \
-                constexpr ck::index_t CONST_ALIGN_NAME1 =                     \
-                    CONST_ALIGN_MAX1 / 4;                                     \
-                ALIGN_SWITCH_1(                                               \
-                    CONST_ALIGN_MAX2,                                         \
-                    CONST_ALIGN_NAME2,                                        \
-                    LENGTH2,                                                  \
-                    ##__VA_ARGS__);                                           \
-              } else {                                                        \
-                constexpr ck::index_t CONST_ALIGN_NAME1 = 1;                  \
-                ALIGN_SWITCH_1(                                               \
-                    CONST_ALIGN_MAX2,                                         \
-                    CONST_ALIGN_NAME2,                                        \
-                    LENGTH2,                                                  \
-                    ##__VA_ARGS__);                                           \
-              };                                                              \
-            }                                                                 \
-          };                                                                  \
-        }                                                                     \
-      };                                                                      \
-    }                                                                         \
-  }()
+#define ALIGN_SWITCH_2(CONST_ALIGN_MAX1,                                                          \
+                       CONST_ALIGN_NAME1,                                                         \
+                       LENGTH1,                                                                   \
+                       CONST_ALIGN_MAX2,                                                          \
+                       CONST_ALIGN_NAME2,                                                         \
+                       LENGTH2,                                                                   \
+                       ...)                                                                       \
+    [&] {                                                                                         \
+        if constexpr(CONST_ALIGN_MAX1 > 0)                                                        \
+        {                                                                                         \
+            if(LENGTH1 % CONST_ALIGN_MAX1 == 0)                                                   \
+            {                                                                                     \
+                constexpr ck::index_t CONST_ALIGN_NAME1 = CONST_ALIGN_MAX1;                       \
+                ALIGN_SWITCH_1(CONST_ALIGN_MAX2, CONST_ALIGN_NAME2, LENGTH2, ##__VA_ARGS__);      \
+            }                                                                                     \
+            else                                                                                  \
+            {                                                                                     \
+                if constexpr(CONST_ALIGN_MAX1 / 2 > 0)                                            \
+                {                                                                                 \
+                    if(LENGTH1 % (CONST_ALIGN_MAX1 / 2) == 0)                                     \
+                    {                                                                             \
+                        constexpr ck::index_t CONST_ALIGN_NAME1 = CONST_ALIGN_MAX1 / 2;           \
+                        ALIGN_SWITCH_1(                                                           \
+                            CONST_ALIGN_MAX2, CONST_ALIGN_NAME2, LENGTH2, ##__VA_ARGS__);         \
+                    }                                                                             \
+                    else                                                                          \
+                    {                                                                             \
+                        if constexpr(CONST_ALIGN_MAX1 / 4 > 0)                                    \
+                        {                                                                         \
+                            if(LENGTH1 % (CONST_ALIGN_MAX1 / 4) == 0)                             \
+                            {                                                                     \
+                                constexpr ck::index_t CONST_ALIGN_NAME1 = CONST_ALIGN_MAX1 / 4;   \
+                                ALIGN_SWITCH_1(                                                   \
+                                    CONST_ALIGN_MAX2, CONST_ALIGN_NAME2, LENGTH2, ##__VA_ARGS__); \
+                            }                                                                     \
+                            else                                                                  \
+                            {                                                                     \
+                                constexpr ck::index_t CONST_ALIGN_NAME1 = 1;                      \
+                                ALIGN_SWITCH_1(                                                   \
+                                    CONST_ALIGN_MAX2, CONST_ALIGN_NAME2, LENGTH2, ##__VA_ARGS__); \
+                            };                                                                    \
+                        }                                                                         \
+                    };                                                                            \
+                }                                                                                 \
+            };                                                                                    \
+        }                                                                                         \
+    }()
 
 // assume the maximum alignment is 8 elements
-#define ALIGN_SWITCH_3(                                                     \
-    CONST_ALIGN_MAX1,                                                       \
-    CONST_ALIGN_NAME1,                                                      \
-    LENGTH1,                                                                \
-    CONST_ALIGN_MAX2,                                                       \
-    CONST_ALIGN_NAME2,                                                      \
-    LENGTH2,                                                                \
-    CONST_ALIGN_MAX3,                                                       \
-    CONST_ALIGN_NAME3,                                                      \
-    LENGTH3,                                                                \
-    ...)                                                                    \
-  [&] {                                                                     \
-    if constexpr (CONST_ALIGN_MAX1 > 0) {                                   \
-      if (LENGTH1 % CONST_ALIGN_MAX1 == 0) {                                \
-        constexpr ck::index_t CONST_ALIGN_NAME1 = CONST_ALIGN_MAX1;         \
-        ALIGN_SWITCH_2(                                                     \
-            CONST_ALIGN_MAX2,                                               \
-            CONST_ALIGN_NAME2,                                              \
-            LENGTH2,                                                        \
-            CONST_ALIGN_MAX3,                                               \
-            CONST_ALIGN_NAME3,                                              \
-            LENGTH3,                                                        \
-            ##__VA_ARGS__);                                                 \
-      } else {                                                              \
-        if constexpr (CONST_ALIGN_MAX1 / 2 > 0) {                           \
-          if (LENGTH1 % (CONST_ALIGN_MAX1 / 2) == 0) {                      \
-            constexpr ck::index_t CONST_ALIGN_NAME1 = CONST_ALIGN_MAX1 / 2; \
-            ALIGN_SWITCH_2(                                                 \
-                CONST_ALIGN_MAX2,                                           \
-                CONST_ALIGN_NAME2,                                          \
-                LENGTH2,                                                    \
-                CONST_ALIGN_MAX3,                                           \
-                CONST_ALIGN_NAME3,                                          \
-                LENGTH3,                                                    \
-                ##__VA_ARGS__);                                             \
-          } else {                                                          \
-            if constexpr (CONST_ALIGN_MAX1 / 4 > 0) {                       \
-              if (LENGTH1 % (CONST_ALIGN_MAX1 / 4) == 0) {                  \
-                constexpr ck::index_t CONST_ALIGN_NAME1 =                   \
-                    CONST_ALIGN_MAX1 / 4;                                   \
-                ALIGN_SWITCH_2(                                             \
-                    CONST_ALIGN_MAX2,                                       \
-                    CONST_ALIGN_NAME2,                                      \
-                    LENGTH2,                                                \
-                    CONST_ALIGN_MAX3,                                       \
-                    CONST_ALIGN_NAME3,                                      \
-                    LENGTH3,                                                \
-                    ##__VA_ARGS__);                                         \
-              } else {                                                      \
-                constexpr ck::index_t CONST_ALIGN_NAME1 = 1;                \
-                ALIGN_SWITCH_2(                                             \
-                    CONST_ALIGN_MAX2,                                       \
-                    CONST_ALIGN_NAME2,                                      \
-                    LENGTH2,                                                \
-                    CONST_ALIGN_MAX3,                                       \
-                    CONST_ALIGN_NAME3,                                      \
-                    LENGTH3,                                                \
-                    ##__VA_ARGS__);                                         \
-              };                                                            \
-            }                                                               \
-          };                                                                \
-        }                                                                   \
-      };                                                                    \
-    }                                                                       \
-  }()
+#define ALIGN_SWITCH_3(CONST_ALIGN_MAX1,                                                        \
+                       CONST_ALIGN_NAME1,                                                       \
+                       LENGTH1,                                                                 \
+                       CONST_ALIGN_MAX2,                                                        \
+                       CONST_ALIGN_NAME2,                                                       \
+                       LENGTH2,                                                                 \
+                       CONST_ALIGN_MAX3,                                                        \
+                       CONST_ALIGN_NAME3,                                                       \
+                       LENGTH3,                                                                 \
+                       ...)                                                                     \
+    [&] {                                                                                       \
+        if constexpr(CONST_ALIGN_MAX1 > 0)                                                      \
+        {                                                                                       \
+            if(LENGTH1 % CONST_ALIGN_MAX1 == 0)                                                 \
+            {                                                                                   \
+                constexpr ck::index_t CONST_ALIGN_NAME1 = CONST_ALIGN_MAX1;                     \
+                ALIGN_SWITCH_2(CONST_ALIGN_MAX2,                                                \
+                               CONST_ALIGN_NAME2,                                               \
+                               LENGTH2,                                                         \
+                               CONST_ALIGN_MAX3,                                                \
+                               CONST_ALIGN_NAME3,                                               \
+                               LENGTH3,                                                         \
+                               ##__VA_ARGS__);                                                  \
+            }                                                                                   \
+            else                                                                                \
+            {                                                                                   \
+                if constexpr(CONST_ALIGN_MAX1 / 2 > 0)                                          \
+                {                                                                               \
+                    if(LENGTH1 % (CONST_ALIGN_MAX1 / 2) == 0)                                   \
+                    {                                                                           \
+                        constexpr ck::index_t CONST_ALIGN_NAME1 = CONST_ALIGN_MAX1 / 2;         \
+                        ALIGN_SWITCH_2(CONST_ALIGN_MAX2,                                        \
+                                       CONST_ALIGN_NAME2,                                       \
+                                       LENGTH2,                                                 \
+                                       CONST_ALIGN_MAX3,                                        \
+                                       CONST_ALIGN_NAME3,                                       \
+                                       LENGTH3,                                                 \
+                                       ##__VA_ARGS__);                                          \
+                    }                                                                           \
+                    else                                                                        \
+                    {                                                                           \
+                        if constexpr(CONST_ALIGN_MAX1 / 4 > 0)                                  \
+                        {                                                                       \
+                            if(LENGTH1 % (CONST_ALIGN_MAX1 / 4) == 0)                           \
+                            {                                                                   \
+                                constexpr ck::index_t CONST_ALIGN_NAME1 = CONST_ALIGN_MAX1 / 4; \
+                                ALIGN_SWITCH_2(CONST_ALIGN_MAX2,                                \
+                                               CONST_ALIGN_NAME2,                               \
+                                               LENGTH2,                                         \
+                                               CONST_ALIGN_MAX3,                                \
+                                               CONST_ALIGN_NAME3,                               \
+                                               LENGTH3,                                         \
+                                               ##__VA_ARGS__);                                  \
+                            }                                                                   \
+                            else                                                                \
+                            {                                                                   \
+                                constexpr ck::index_t CONST_ALIGN_NAME1 = 1;                    \
+                                ALIGN_SWITCH_2(CONST_ALIGN_MAX2,                                \
+                                               CONST_ALIGN_NAME2,                               \
+                                               LENGTH2,                                         \
+                                               CONST_ALIGN_MAX3,                                \
+                                               CONST_ALIGN_NAME3,                               \
+                                               LENGTH3,                                         \
+                                               ##__VA_ARGS__);                                  \
+                            };                                                                  \
+                        }                                                                       \
+                    };                                                                          \
+                }                                                                               \
+            };                                                                                  \
+        }                                                                                       \
+    }()
diff --git a/xformers/csrc/attention/hip_fmha/ck_attention_forward_decoder.h b/xformers/csrc/attention/hip_fmha/ck_attention_forward_decoder.h
index eaf8f0bc5..7b39a2c54 100644
--- a/xformers/csrc/attention/hip_fmha/ck_attention_forward_decoder.h
+++ b/xformers/csrc/attention/hip_fmha/ck_attention_forward_decoder.h
@@ -1,3 +1,9 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
 #pragma once
 
 #include <ck/host_utility/kernel_launch_hip.hpp>
@@ -9,368 +15,387 @@
 
 namespace ck {
 template <>
-__device__ void inner_product<bhalf_t, bhalf_t, float>(
-    const bhalf_t& a,
-    const bhalf_t& b,
-    float& c) {
-  inner_product(type_convert<float>(a), type_convert<float>(b), c);
+__device__ void inner_product<bhalf_t, bhalf_t, float>(const bhalf_t& a, const bhalf_t& b, float& c)
+{
+    inner_product(type_convert<float>(a), type_convert<float>(b), c);
 }
 
 template <>
-__device__ void inner_product<half_t, half_t, float>(
-    const half_t& a,
-    const half_t& b,
-    float& c) {
-  inner_product(type_convert<float>(a), type_convert<float>(b), c);
+__device__ void inner_product<half_t, half_t, float>(const half_t& a, const half_t& b, float& c)
+{
+    inner_product(type_convert<float>(a), type_convert<float>(b), c);
 }
 
 template <>
-__device__ void inner_product<bhalf2_t, bhalf2_t, float>(
-    const bhalf2_t& a,
-    const bhalf2_t& b,
-    float& c) {
-  const vector_type<bhalf_t, 2> a_vector{a};
-  const vector_type<bhalf_t, 2> b_vector{b};
-  ck::static_for<0, 2, 1>{}([&](auto i) {
-    inner_product(
-        a_vector.AsType<bhalf_t>()[i], b_vector.AsType<bhalf_t>()[i], c);
-  });
+__device__ void
+inner_product<bhalf2_t, bhalf2_t, float>(const bhalf2_t& a, const bhalf2_t& b, float& c)
+{
+    const vector_type<bhalf_t, 2> a_vector{a};
+    const vector_type<bhalf_t, 2> b_vector{b};
+    ck::static_for<0, 2, 1>{}([&](auto i) {
+        inner_product(a_vector.AsType<bhalf_t>()[i], b_vector.AsType<bhalf_t>()[i], c);
+    });
 }
 
 template <>
-__device__ void inner_product<bhalf4_t, bhalf4_t, float>(
-    const bhalf4_t& a,
-    const bhalf4_t& b,
-    float& c) {
-  const vector_type<bhalf_t, 4> a_vector{a};
-  const vector_type<bhalf_t, 4> b_vector{b};
-  ck::static_for<0, 4, 1>{}([&](auto i) {
-    inner_product(
-        a_vector.AsType<bhalf_t>()[i], b_vector.AsType<bhalf_t>()[i], c);
-  });
+__device__ void
+inner_product<bhalf4_t, bhalf4_t, float>(const bhalf4_t& a, const bhalf4_t& b, float& c)
+{
+    const vector_type<bhalf_t, 4> a_vector{a};
+    const vector_type<bhalf_t, 4> b_vector{b};
+    ck::static_for<0, 4, 1>{}([&](auto i) {
+        inner_product(a_vector.AsType<bhalf_t>()[i], b_vector.AsType<bhalf_t>()[i], c);
+    });
 }
 } // namespace ck
 
 namespace {
 
 template <typename data_t, int32_t vec_size>
-__device__ typename ck::vector_type<float, vec_size>::type scalar_scale_acc(
-    typename ck::vector_type<float, vec_size>::type acc,
-    typename ck::vector_type<data_t, vec_size>::type a,
-    float b) {
-  union {
-    decltype(acc) vec;
-    float arr[vec_size];
-  } acc_u{acc};
-  union {
-    decltype(a) vec;
-    data_t arr[vec_size];
-  } a_u{a};
+__device__ typename ck::vector_type<float, vec_size>::type
+scalar_scale_acc(typename ck::vector_type<float, vec_size>::type acc,
+                 typename ck::vector_type<data_t, vec_size>::type a,
+                 float b)
+{
+    union
+    {
+        decltype(acc) vec;
+        float arr[vec_size];
+    } acc_u{acc};
+    union
+    {
+        decltype(a) vec;
+        data_t arr[vec_size];
+    } a_u{a};
 
 #pragma unroll
-  for (int32_t i = 0; i < vec_size; ++i) {
-    acc_u.arr[i] += ck::type_convert<float>(a_u.arr[i]) * b;
-  }
+    for(int32_t i = 0; i < vec_size; ++i)
+    {
+        acc_u.arr[i] += ck::type_convert<float>(a_u.arr[i]) * b;
+    }
 
-  return acc_u.vec;
+    return acc_u.vec;
 }
 
 template <typename F, int32_t n_threads_per_wavefront = 64>
-float __device__ __forceinline__ wavefrontReduce(float val, F f) {
+float __device__ __forceinline__ wavefrontReduce(float val, F f)
+{
 #pragma unroll
-  for (int32_t mask = n_threads_per_wavefront >> 1; mask > 0; mask >>= 1) {
-    val = f(__shfl_xor(val, mask, n_threads_per_wavefront), val);
-  }
-  return val;
+    for(int32_t mask = n_threads_per_wavefront >> 1; mask > 0; mask >>= 1)
+    {
+        val = f(__shfl_xor(val, mask, n_threads_per_wavefront), val);
+    }
+    return val;
 }
 
 template <typename TData, typename TDataVec>
-__forceinline__ __device__ void load_v(
-    const TData* __restrict__ data_ptr,
-    int32_t vector_offset,
-    TDataVec* __restrict__ load_to) {
-  *load_to = *(reinterpret_cast<const TDataVec*>(data_ptr) + vector_offset);
+__forceinline__ __device__ void
+load_v(const TData* __restrict__ data_ptr, int32_t vector_offset, TDataVec* __restrict__ load_to)
+{
+    *load_to = *(reinterpret_cast<const TDataVec*>(data_ptr) + vector_offset);
 }
 
 template <typename TData, typename TDataVec>
-__forceinline__ __device__ void store_v(
-    TData* __restrict__ data_ptr,
-    int32_t vector_offset,
-    TDataVec value) {
-  *(reinterpret_cast<TDataVec*>(data_ptr) + vector_offset) = value;
+__forceinline__ __device__ void
+store_v(TData* __restrict__ data_ptr, int32_t vector_offset, TDataVec value)
+{
+    *(reinterpret_cast<TDataVec*>(data_ptr) + vector_offset) = value;
 }
 
-template <
-    typename scalar_t,
-    int32_t vec_size = 4,
-    int32_t n_loop_unroll = 16,
-    int32_t n_loop_unroll_tail = 2,
-    int32_t T_MAX = 8192,
-    int32_t n_wavefronts_per_block = 16>
-__global__ void efficient_attention_forward_decoder_ck_kernel(
-    const scalar_t* __restrict__ XQ,
-    const scalar_t* __restrict__ cache_K,
-    const scalar_t* __restrict__ cache_V,
-    scalar_t* __restrict__ O,
-    const int32_t* __restrict__ seq_kv_lens,
-    const ptrdiff_t XQ_stride_0,
-    const ptrdiff_t XQ_stride_1,
-    const ptrdiff_t XQ_stride_2,
-    const ptrdiff_t K_stride_0,
-    const ptrdiff_t K_stride_1,
-    const ptrdiff_t K_stride_2,
-    const int32_t K_size_1,
-    const int32_t D_H,
-    const bool multiquery,
-    const float qk_scale) {
-  static_assert(n_loop_unroll_tail < n_loop_unroll, "");
-
-  // Each block handles a single batch and head and query
-  const int32_t b = blockIdx.x;
-  const int32_t h = blockIdx.y;
-  const int32_t m = blockIdx.z;
-
-  // Note: this is decoding case where we attend to current and all previous
-  // tokens.
-  const int32_t t_max = seq_kv_lens ? seq_kv_lens[b] : K_size_1;
-
-  const int32_t lane_idx = threadIdx.x;
-  const int32_t wavefront_idx = threadIdx.y;
-  const int32_t threads_per_wavefront = blockDim.x;
-  const int32_t wavefronts_per_block = blockDim.y;
-  const int32_t threads_per_block =
-      threads_per_wavefront * wavefronts_per_block;
-  const int32_t thread_linear_idx =
-      lane_idx + wavefront_idx * threads_per_wavefront;
-  // const auto* q_ = &(XQ_acc[b][m][h][0]);
-  const auto XQO_base_offset =
-      b * XQ_stride_0 + m * XQ_stride_1 + h * XQ_stride_2;
-  const auto* __restrict__ q_ = XQ + XQO_base_offset;
-
-  const auto cache_KV_base_offset =
-      b * K_stride_0 + (multiquery ? 0 : h * K_stride_2);
-  const auto* __restrict__ cache_K_base = cache_K + cache_KV_base_offset;
-  const auto* __restrict__ cache_V_base = cache_V + cache_KV_base_offset;
-
-  // Load Q into registers in all wavefronts.
-  // Each thread handles `vec_size` D dimensions
-
-  using data_t = scalar_t;
-  using data_vec_t = typename ck::vector_type<data_t, vec_size>::type;
-  using compute_t = float;
-  using compute_vec_t = typename ck::vector_type<compute_t, vec_size>::type;
-
-  const bool lane_active_for_io = lane_idx * vec_size < D_H;
-
-  extern __shared__ __align__(16) compute_t smem[];
-
-  data_vec_t q_thread = 0;
-  if (lane_active_for_io) {
-    load_v<data_t, data_vec_t>(q_, lane_idx, &q_thread);
-  }
-  // Each block computes different B value
-  compute_t max_qk_acc = ck::NumericLimits<compute_t>::Lowest();
-
-  // Compute S[T_MAX] = for i in range(T): S[t] = sum(Q[d] * K[t, d])
-  // Split T across wavefronts in a block, unroll loads to expose more
-  // parallelism.
-
-  data_vec_t k_loads[n_loop_unroll] = {};
-
-  constexpr auto dtt = n_wavefronts_per_block * n_loop_unroll;
-  const int32_t t_max_unroll = (t_max / dtt) * dtt;
-
-  for (auto tt = wavefront_idx * n_loop_unroll; tt < t_max_unroll; tt += dtt) {
-    if (lane_active_for_io) {
-#pragma unroll n_loop_unroll
-      for (auto ttt = 0; ttt < n_loop_unroll; ++ttt) {
-        const int32_t t = tt + ttt;
-        // load the K[b][t][h|0][:] row into registers
-        load_v<data_t, data_vec_t>(
-            cache_K_base + t * K_stride_1, lane_idx, &k_loads[ttt]);
-      }
+template <typename scalar_t,
+          int32_t vec_size               = 4,
+          int32_t n_loop_unroll          = 16,
+          int32_t n_loop_unroll_tail     = 2,
+          int32_t T_MAX                  = 8192,
+          int32_t n_wavefronts_per_block = 16>
+__global__ void
+efficient_attention_forward_decoder_ck_kernel(const scalar_t* __restrict__ XQ,
+                                              const scalar_t* __restrict__ cache_K,
+                                              const scalar_t* __restrict__ cache_V,
+                                              scalar_t* __restrict__ O,
+                                              const int32_t* __restrict__ seq_kv_lens,
+                                              const ptrdiff_t XQ_stride_0,
+                                              const ptrdiff_t XQ_stride_1,
+                                              const ptrdiff_t XQ_stride_2,
+                                              const ptrdiff_t K_stride_0,
+                                              const ptrdiff_t K_stride_1,
+                                              const ptrdiff_t K_stride_2,
+                                              const int32_t K_size_1,
+                                              const int32_t D_H,
+                                              const bool multiquery,
+                                              const float qk_scale)
+{
+    static_assert(n_loop_unroll_tail < n_loop_unroll, "");
+
+    // Each block handles a single batch and head and query
+    const int32_t b = blockIdx.x;
+    const int32_t h = blockIdx.y;
+    const int32_t m = blockIdx.z;
+
+    // Note: this is decoding case where we attend to current and all previous
+    // tokens.
+    const int32_t t_max = seq_kv_lens ? seq_kv_lens[b] : K_size_1;
+
+    const int32_t lane_idx              = threadIdx.x;
+    const int32_t wavefront_idx         = threadIdx.y;
+    const int32_t threads_per_wavefront = blockDim.x;
+    const int32_t wavefronts_per_block  = blockDim.y;
+    const int32_t threads_per_block     = threads_per_wavefront * wavefronts_per_block;
+    const int32_t thread_linear_idx     = lane_idx + wavefront_idx * threads_per_wavefront;
+    // const auto* q_ = &(XQ_acc[b][m][h][0]);
+    const auto XQO_base_offset  = b * XQ_stride_0 + m * XQ_stride_1 + h * XQ_stride_2;
+    const auto* __restrict__ q_ = XQ + XQO_base_offset;
+
+    const auto cache_KV_base_offset       = b * K_stride_0 + (multiquery ? 0 : h * K_stride_2);
+    const auto* __restrict__ cache_K_base = cache_K + cache_KV_base_offset;
+    const auto* __restrict__ cache_V_base = cache_V + cache_KV_base_offset;
+
+    // Load Q into registers in all wavefronts.
+    // Each thread handles `vec_size` D dimensions
+
+    using data_t        = scalar_t;
+    using data_vec_t    = typename ck::vector_type<data_t, vec_size>::type;
+    using compute_t     = float;
+    using compute_vec_t = typename ck::vector_type<compute_t, vec_size>::type;
+
+    const bool lane_active_for_io = lane_idx * vec_size < D_H;
+
+    extern __shared__ __align__(16) compute_t smem[];
+
+    data_vec_t q_thread = 0;
+    if(lane_active_for_io)
+    {
+        load_v<data_t, data_vec_t>(q_, lane_idx, &q_thread);
     }
-    compute_t qk_accs[n_loop_unroll] = {};
+    // Each block computes different B value
+    compute_t max_qk_acc = ck::NumericLimits<compute_t>::Lowest();
+
+    // Compute S[T_MAX] = for i in range(T): S[t] = sum(Q[d] * K[t, d])
+    // Split T across wavefronts in a block, unroll loads to expose more
+    // parallelism.
+
+    data_vec_t k_loads[n_loop_unroll] = {};
+
+    constexpr auto dtt         = n_wavefronts_per_block * n_loop_unroll;
+    const int32_t t_max_unroll = (t_max / dtt) * dtt;
+
+    for(auto tt = wavefront_idx * n_loop_unroll; tt < t_max_unroll; tt += dtt)
+    {
+        if(lane_active_for_io)
+        {
 #pragma unroll n_loop_unroll
-    for (auto ttt = 0; ttt < n_loop_unroll; ++ttt) {
-      ck::inner_product<data_vec_t, data_vec_t, compute_t>(
-          q_thread, k_loads[ttt], qk_accs[ttt]);
-      qk_accs[ttt] *= qk_scale;
-
-      qk_accs[ttt] =
-          wavefrontReduce(qk_accs[ttt], [](auto a, auto b) { return a + b; });
-      max_qk_acc = ck::math::max(qk_accs[ttt], max_qk_acc);
-    }
-    if (lane_idx == 0) {
-      auto* __restrict__ smem_base = smem + tt;
+            for(auto ttt = 0; ttt < n_loop_unroll; ++ttt)
+            {
+                const int32_t t = tt + ttt;
+                // load the K[b][t][h|0][:] row into registers
+                load_v<data_t, data_vec_t>(cache_K_base + t * K_stride_1, lane_idx, &k_loads[ttt]);
+            }
+        }
+        compute_t qk_accs[n_loop_unroll] = {};
+#pragma unroll n_loop_unroll
+        for(auto ttt = 0; ttt < n_loop_unroll; ++ttt)
+        {
+            ck::inner_product<data_vec_t, data_vec_t, compute_t>(
+                q_thread, k_loads[ttt], qk_accs[ttt]);
+            qk_accs[ttt] *= qk_scale;
+
+            qk_accs[ttt] = wavefrontReduce(qk_accs[ttt], [](auto a, auto b) { return a + b; });
+            max_qk_acc   = ck::math::max(qk_accs[ttt], max_qk_acc);
+        }
+        if(lane_idx == 0)
+        {
+            auto* __restrict__ smem_base = smem + tt;
 #pragma unroll n_loop_unroll
-      for (auto ttt = 0; ttt < n_loop_unroll; ++ttt) {
-        smem_base[ttt] = qk_accs[ttt];
-      }
+            for(auto ttt = 0; ttt < n_loop_unroll; ++ttt)
+            {
+                smem_base[ttt] = qk_accs[ttt];
+            }
+        }
     }
-  }
 
-  // NB: the length of the tail is <= (wavefronts_per_block * n_loop_unroll)
-  for (auto tt = t_max_unroll + wavefront_idx * n_loop_unroll_tail; tt < t_max;
-       tt += wavefronts_per_block * n_loop_unroll_tail) {
-    if (lane_active_for_io) {
+    // NB: the length of the tail is <= (wavefronts_per_block * n_loop_unroll)
+    for(auto tt = t_max_unroll + wavefront_idx * n_loop_unroll_tail; tt < t_max;
+        tt += wavefronts_per_block * n_loop_unroll_tail)
+    {
+        if(lane_active_for_io)
+        {
 #pragma unroll n_loop_unroll_tail
-      for (auto ttt = 0; ttt < n_loop_unroll_tail; ++ttt) {
-        const int32_t t = tt + ttt;
-        if (t < t_max) {
-          // load the K[b][t][h|0][:] row into registers
-          load_v<data_t, data_vec_t>(
-              cache_K_base + t * K_stride_1, lane_idx, &k_loads[ttt]);
+            for(auto ttt = 0; ttt < n_loop_unroll_tail; ++ttt)
+            {
+                const int32_t t = tt + ttt;
+                if(t < t_max)
+                {
+                    // load the K[b][t][h|0][:] row into registers
+                    load_v<data_t, data_vec_t>(
+                        cache_K_base + t * K_stride_1, lane_idx, &k_loads[ttt]);
+                }
+            }
         }
-      }
-    }
 #pragma unroll n_loop_unroll_tail
-    for (auto ttt = 0; ttt < n_loop_unroll_tail; ++ttt) {
-      compute_t qk_acc = 0;
-      const int32_t t = tt + ttt;
-      if (t < t_max) {
-        ck::inner_product<data_vec_t, data_vec_t, compute_t>(
-            q_thread, k_loads[ttt], qk_acc);
-        qk_acc *= qk_scale;
-
-        qk_acc = wavefrontReduce(qk_acc, [](auto a, auto b) { return a + b; });
-        max_qk_acc = ck::math::max(qk_acc, max_qk_acc);
-
-        // write accumulated sums to smem.
-        if (lane_idx == 0) {
-          smem[t] = qk_acc;
+        for(auto ttt = 0; ttt < n_loop_unroll_tail; ++ttt)
+        {
+            compute_t qk_acc = 0;
+            const int32_t t  = tt + ttt;
+            if(t < t_max)
+            {
+                ck::inner_product<data_vec_t, data_vec_t, compute_t>(
+                    q_thread, k_loads[ttt], qk_acc);
+                qk_acc *= qk_scale;
+
+                qk_acc     = wavefrontReduce(qk_acc, [](auto a, auto b) { return a + b; });
+                max_qk_acc = ck::math::max(qk_acc, max_qk_acc);
+
+                // write accumulated sums to smem.
+                if(lane_idx == 0)
+                {
+                    smem[t] = qk_acc;
+                }
+            }
         }
-      }
     }
-  }
-
-  // Use shared reduction to compute max and compute softmax on shared memory.
-  // write max acc
-  if (lane_idx == 0) {
-    smem[T_MAX + wavefront_idx] = max_qk_acc;
-  }
-  __syncthreads();
-  if (lane_idx < wavefronts_per_block) {
-    max_qk_acc = ck::math::max(max_qk_acc, smem[T_MAX + lane_idx]);
-  }
-  // shared across all threads in block
-  max_qk_acc =
-      wavefrontReduce(max_qk_acc, [](auto a, auto b) { return a > b ? a : b; });
-
-  // each wavefront computes partial sum of exp.
-  compute_t softmax_denominator = 0.0f;
-  for (int32_t t = thread_linear_idx; t < t_max; t += threads_per_block) {
-    softmax_denominator += ck::math::exp(smem[t] - max_qk_acc);
-  }
-  softmax_denominator = wavefrontReduce(
-      softmax_denominator, [](auto a, auto b) { return a + b; });
-
-  if (lane_idx == 0) {
-    smem[T_MAX + wavefront_idx] = softmax_denominator;
-  }
-  __syncthreads();
-
-  // now, compute sum of exp(x - max(x)) over all intermediate results.
-  softmax_denominator = 0.0;
-  if (lane_idx < wavefronts_per_block) {
-    softmax_denominator = smem[T_MAX + lane_idx];
-  }
-  softmax_denominator = wavefrontReduce(
-      softmax_denominator, [](auto a, auto b) { return a + b; });
-
-  const compute_t softmax_scale_factor = 1. / softmax_denominator;
-  // now, compute the normalization across all threads.
-  for (int32_t t = thread_linear_idx; t < t_max; t += threads_per_block) {
-    smem[t] = ck::math::exp(smem[t] - max_qk_acc) * softmax_scale_factor;
-  }
-  __syncthreads();
-
-  // Split T across wavefronts in a block
-  // each wavefront compute sum(t_subset) P[t] * V[t_subset, d]
-  // outputs are of size float[D]
-
-  compute_t ps[n_loop_unroll] = {};
-  compute_vec_t o_acc = 0;
-  if (lane_active_for_io) {
-    for (auto tt = wavefront_idx * n_loop_unroll; tt < t_max_unroll;
-         tt += dtt) {
+
+    // Use shared reduction to compute max and compute softmax on shared memory.
+    // write max acc
+    if(lane_idx == 0)
+    {
+        smem[T_MAX + wavefront_idx] = max_qk_acc;
+    }
+    __syncthreads();
+    if(lane_idx < wavefronts_per_block)
+    {
+        max_qk_acc = ck::math::max(max_qk_acc, smem[T_MAX + lane_idx]);
+    }
+    // shared across all threads in block
+    max_qk_acc = wavefrontReduce(max_qk_acc, [](auto a, auto b) { return a > b ? a : b; });
+
+    // each wavefront computes partial sum of exp.
+    compute_t softmax_denominator = 0.0f;
+    for(int32_t t = thread_linear_idx; t < t_max; t += threads_per_block)
+    {
+        softmax_denominator += ck::math::exp(smem[t] - max_qk_acc);
+    }
+    softmax_denominator =
+        wavefrontReduce(softmax_denominator, [](auto a, auto b) { return a + b; });
+
+    if(lane_idx == 0)
+    {
+        smem[T_MAX + wavefront_idx] = softmax_denominator;
+    }
+    __syncthreads();
+
+    // now, compute sum of exp(x - max(x)) over all intermediate results.
+    softmax_denominator = 0.0;
+    if(lane_idx < wavefronts_per_block)
+    {
+        softmax_denominator = smem[T_MAX + lane_idx];
+    }
+    softmax_denominator =
+        wavefrontReduce(softmax_denominator, [](auto a, auto b) { return a + b; });
+
+    const compute_t softmax_scale_factor = 1. / softmax_denominator;
+    // now, compute the normalization across all threads.
+    for(int32_t t = thread_linear_idx; t < t_max; t += threads_per_block)
+    {
+        smem[t] = ck::math::exp(smem[t] - max_qk_acc) * softmax_scale_factor;
+    }
+    __syncthreads();
+
+    // Split T across wavefronts in a block
+    // each wavefront compute sum(t_subset) P[t] * V[t_subset, d]
+    // outputs are of size float[D]
+
+    compute_t ps[n_loop_unroll] = {};
+    compute_vec_t o_acc         = 0;
+    if(lane_active_for_io)
+    {
+        for(auto tt = wavefront_idx * n_loop_unroll; tt < t_max_unroll; tt += dtt)
+        {
 #pragma unroll n_loop_unroll
-      for (auto ttt = 0; ttt < n_loop_unroll; ++ttt) {
-        const int32_t t = tt + ttt;
-        // load the V[b][t][h|0][:] row into registers, reusing K register
-        // storage
-        load_v<data_t, data_vec_t>(
-            cache_V_base + t * K_stride_1, lane_idx, &k_loads[ttt]);
-        ps[ttt] = smem[t];
-      }
+            for(auto ttt = 0; ttt < n_loop_unroll; ++ttt)
+            {
+                const int32_t t = tt + ttt;
+                // load the V[b][t][h|0][:] row into registers, reusing K register
+                // storage
+                load_v<data_t, data_vec_t>(cache_V_base + t * K_stride_1, lane_idx, &k_loads[ttt]);
+                ps[ttt] = smem[t];
+            }
 
 #pragma unroll n_loop_unroll
-      for (auto ttt = 0; ttt < n_loop_unroll; ++ttt) {
-        o_acc =
-            scalar_scale_acc<data_t, vec_size>(o_acc, k_loads[ttt], ps[ttt]);
-      }
-    }
+            for(auto ttt = 0; ttt < n_loop_unroll; ++ttt)
+            {
+                o_acc = scalar_scale_acc<data_t, vec_size>(o_acc, k_loads[ttt], ps[ttt]);
+            }
+        }
 
-    for (auto tt = t_max_unroll + wavefront_idx * n_loop_unroll_tail;
-         tt < t_max;
-         tt += wavefronts_per_block * n_loop_unroll_tail) {
+        for(auto tt = t_max_unroll + wavefront_idx * n_loop_unroll_tail; tt < t_max;
+            tt += wavefronts_per_block * n_loop_unroll_tail)
+        {
 #pragma unroll n_loop_unroll_tail
-      for (auto ttt = 0; ttt < n_loop_unroll_tail; ++ttt) {
-        const int32_t t = tt + ttt;
-        if (t < t_max) {
-          // load the V[b][t][h|0][:] row into registers, reusing K register
-          // storage
-          load_v<data_t, data_vec_t>(
-              cache_V_base + t * K_stride_1, lane_idx, &k_loads[ttt]);
-          ps[ttt] = smem[t];
-        }
-      }
+            for(auto ttt = 0; ttt < n_loop_unroll_tail; ++ttt)
+            {
+                const int32_t t = tt + ttt;
+                if(t < t_max)
+                {
+                    // load the V[b][t][h|0][:] row into registers, reusing K register
+                    // storage
+                    load_v<data_t, data_vec_t>(
+                        cache_V_base + t * K_stride_1, lane_idx, &k_loads[ttt]);
+                    ps[ttt] = smem[t];
+                }
+            }
 
 #pragma unroll n_loop_unroll_tail
-      for (auto ttt = 0; ttt < n_loop_unroll_tail; ++ttt) {
-        const int32_t t = tt + ttt;
-        if (t < t_max) {
-          o_acc =
-              scalar_scale_acc<data_t, vec_size>(o_acc, k_loads[ttt], ps[ttt]);
+            for(auto ttt = 0; ttt < n_loop_unroll_tail; ++ttt)
+            {
+                const int32_t t = tt + ttt;
+                if(t < t_max)
+                {
+                    o_acc = scalar_scale_acc<data_t, vec_size>(o_acc, k_loads[ttt], ps[ttt]);
+                }
+            }
         }
-      }
     }
-  }
-  // now, each thread has partial sums. Write to smem and get accumulated
-  // results back.
-  __syncthreads();
-
-  // NB: needs sizeof(smem) >= `vec_size` * (sizeof(float)==4) * threadsPerBlock
-  if (lane_active_for_io) {
-    store_v<compute_t, compute_vec_t>(&smem[0], thread_linear_idx, o_acc);
-  }
-
-  __syncthreads();
-  // sum up partial D rows from other wavefronts
-  if (wavefront_idx == 0 && lane_active_for_io) {
-    union {
-      compute_vec_t vec = 0;
-      compute_t arr[vec_size];
-    } r;
-    for (int32_t w = 0; w < wavefronts_per_block; ++w) {
-      compute_vec_t partial_r;
-      load_v<compute_t, compute_vec_t>(
-          smem, w * threads_per_wavefront + lane_idx, &partial_r);
-      r.vec += partial_r;
+    // now, each thread has partial sums. Write to smem and get accumulated
+    // results back.
+    __syncthreads();
+
+    // NB: needs sizeof(smem) >= `vec_size` * (sizeof(float)==4) * threadsPerBlock
+    if(lane_active_for_io)
+    {
+        store_v<compute_t, compute_vec_t>(&smem[0], thread_linear_idx, o_acc);
     }
-    // elementwise convert from compute_t result to data_t out to be written
-    union {
-      data_vec_t vec;
-      data_t arr[vec_size];
-    } bf_r;
+
+    __syncthreads();
+    // sum up partial D rows from other wavefronts
+    if(wavefront_idx == 0 && lane_active_for_io)
+    {
+        union
+        {
+            compute_vec_t vec = 0;
+            compute_t arr[vec_size];
+        } r;
+        for(int32_t w = 0; w < wavefronts_per_block; ++w)
+        {
+            compute_vec_t partial_r;
+            load_v<compute_t, compute_vec_t>(
+                smem, w * threads_per_wavefront + lane_idx, &partial_r);
+            r.vec += partial_r;
+        }
+        // elementwise convert from compute_t result to data_t out to be written
+        union
+        {
+            data_vec_t vec;
+            data_t arr[vec_size];
+        } bf_r;
 #pragma unroll
-    for (int32_t i = 0; i < vec_size; ++i) {
-      bf_r.arr[i] = ck::type_convert<data_t>(r.arr[i]);
+        for(int32_t i = 0; i < vec_size; ++i)
+        {
+            bf_r.arr[i] = ck::type_convert<data_t>(r.arr[i]);
+        }
+        // write output row O[b][m][h][:]
+        data_t* __restrict__ o_ = O + XQO_base_offset;
+        store_v<data_t, data_vec_t>(o_, lane_idx, bf_r.vec);
     }
-    // write output row O[b][m][h][:]
-    data_t* __restrict__ o_ = O + XQO_base_offset;
-    store_v<data_t, data_vec_t>(o_, lane_idx, bf_r.vec);
-  }
 }
 
 } // namespace
@@ -379,121 +404,128 @@ namespace ck {
 namespace tensor_operation {
 namespace device {
 template <typename scalar_t>
-struct FMHADecoderSeqlen1DeviceOp : public BaseOperator {
-  using DeviceOp = FMHADecoderSeqlen1DeviceOp;
-  struct Argument : public BaseArgument {
-    const scalar_t* __restrict__ XQ;
-    const scalar_t* __restrict__ cache_K;
-    const scalar_t* __restrict__ cache_V;
-    scalar_t* __restrict__ O;
-    const int32_t* __restrict__ seq_kv_lens;
-    const ptrdiff_t XQ_stride_0;
-    const ptrdiff_t XQ_stride_1;
-    const ptrdiff_t XQ_stride_2;
-    const ptrdiff_t K_stride_0;
-    const ptrdiff_t K_stride_1;
-    const ptrdiff_t K_stride_2;
-    const int32_t K_size_1;
-    const int32_t D_H;
-    const bool multiquery;
-    const float qk_scale;
-
-    const dim3 grid_dim;
-    const dim3 block_dim;
-    const size_t lds_bytes;
-
-    Argument(
-        const scalar_t* __restrict__ XQ,
-        const scalar_t* __restrict__ cache_K,
-        const scalar_t* __restrict__ cache_V,
-        scalar_t* __restrict__ O,
-        const int32_t* __restrict__ seq_kv_lens,
-        const ptrdiff_t XQ_stride_0,
-        const ptrdiff_t XQ_stride_1,
-        const ptrdiff_t XQ_stride_2,
-        const ptrdiff_t K_stride_0,
-        const ptrdiff_t K_stride_1,
-        const ptrdiff_t K_stride_2,
-        const int32_t K_size_1,
-        const int32_t D_H,
-        const bool multiquery,
-        const float qk_scale,
-        const dim3 grid_dim,
-        const dim3 block_dim,
-        const size_t lds_bytes)
-        : XQ(XQ),
-          cache_K(cache_K),
-          cache_V(cache_V),
-          O(O),
-          seq_kv_lens(seq_kv_lens),
-          XQ_stride_0(XQ_stride_0),
-          XQ_stride_1(XQ_stride_1),
-          XQ_stride_2(XQ_stride_2),
-          K_stride_0(K_stride_0),
-          K_stride_1(K_stride_1),
-          K_stride_2(K_stride_2),
-          K_size_1(K_size_1),
-          D_H(D_H),
-          multiquery(multiquery),
-          qk_scale(qk_scale),
-          grid_dim(grid_dim),
-          block_dim(block_dim),
-          lds_bytes(lds_bytes) {}
-  };
-
-  struct Invoker : public BaseInvoker {
-    using Argument = DeviceOp::Argument;
-    float Run(
-        const Argument& arg,
-        const StreamConfig& stream_config = StreamConfig{}) {
-      auto threads_per_wavefront = arg.block_dim.x;
-
-      auto D_H_alignment_necessary = 0;
-
-      for (auto vec_size : {4, 2, 1}) {
-        if (arg.D_H <= vec_size * threads_per_wavefront) {
-          D_H_alignment_necessary = vec_size;
+struct FMHADecoderSeqlen1DeviceOp : public BaseOperator
+{
+    using DeviceOp = FMHADecoderSeqlen1DeviceOp;
+    struct Argument : public BaseArgument
+    {
+        const scalar_t* __restrict__ XQ;
+        const scalar_t* __restrict__ cache_K;
+        const scalar_t* __restrict__ cache_V;
+        scalar_t* __restrict__ O;
+        const int32_t* __restrict__ seq_kv_lens;
+        const ptrdiff_t XQ_stride_0;
+        const ptrdiff_t XQ_stride_1;
+        const ptrdiff_t XQ_stride_2;
+        const ptrdiff_t K_stride_0;
+        const ptrdiff_t K_stride_1;
+        const ptrdiff_t K_stride_2;
+        const int32_t K_size_1;
+        const int32_t D_H;
+        const bool multiquery;
+        const float qk_scale;
+
+        const dim3 grid_dim;
+        const dim3 block_dim;
+        const size_t lds_bytes;
+
+        Argument(const scalar_t* __restrict__ XQ,
+                 const scalar_t* __restrict__ cache_K,
+                 const scalar_t* __restrict__ cache_V,
+                 scalar_t* __restrict__ O,
+                 const int32_t* __restrict__ seq_kv_lens,
+                 const ptrdiff_t XQ_stride_0,
+                 const ptrdiff_t XQ_stride_1,
+                 const ptrdiff_t XQ_stride_2,
+                 const ptrdiff_t K_stride_0,
+                 const ptrdiff_t K_stride_1,
+                 const ptrdiff_t K_stride_2,
+                 const int32_t K_size_1,
+                 const int32_t D_H,
+                 const bool multiquery,
+                 const float qk_scale,
+                 const dim3 grid_dim,
+                 const dim3 block_dim,
+                 const size_t lds_bytes)
+            : XQ(XQ),
+              cache_K(cache_K),
+              cache_V(cache_V),
+              O(O),
+              seq_kv_lens(seq_kv_lens),
+              XQ_stride_0(XQ_stride_0),
+              XQ_stride_1(XQ_stride_1),
+              XQ_stride_2(XQ_stride_2),
+              K_stride_0(K_stride_0),
+              K_stride_1(K_stride_1),
+              K_stride_2(K_stride_2),
+              K_size_1(K_size_1),
+              D_H(D_H),
+              multiquery(multiquery),
+              qk_scale(qk_scale),
+              grid_dim(grid_dim),
+              block_dim(block_dim),
+              lds_bytes(lds_bytes)
+        {
         }
-      }
-
-      if (!D_H_alignment_necessary) {
-        throw std::runtime_error("Unsupported D_H");
-      }
-
-      if (arg.D_H % D_H_alignment_necessary) {
-        throw std::runtime_error("Unsupported alignment for D_H");
-      }
-
-      return launch_and_time_kernel(
-          stream_config,
-          D_H_alignment_necessary == 4
-              ? efficient_attention_forward_decoder_ck_kernel<scalar_t, 4>
-              : D_H_alignment_necessary == 2
-              ? efficient_attention_forward_decoder_ck_kernel<scalar_t, 2>
-              : D_H_alignment_necessary == 1
-              ? efficient_attention_forward_decoder_ck_kernel<scalar_t, 1>
-              : nullptr,
-          arg.grid_dim,
-          arg.block_dim,
-          arg.lds_bytes,
-          arg.XQ,
-          arg.cache_K,
-          arg.cache_V,
-          arg.O,
-          arg.seq_kv_lens,
-          arg.XQ_stride_0,
-          arg.XQ_stride_1,
-          arg.XQ_stride_2,
-          arg.K_stride_0,
-          arg.K_stride_1,
-          arg.K_stride_2,
-          arg.K_size_1,
-          arg.D_H,
-          arg.multiquery,
-          arg.qk_scale);
-    }
-  };
+    };
+
+    struct Invoker : public BaseInvoker
+    {
+        using Argument = DeviceOp::Argument;
+        float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{})
+        {
+            auto threads_per_wavefront = arg.block_dim.x;
+
+            auto D_H_alignment_necessary = 0;
+
+            for(auto vec_size : {4, 2, 1})
+            {
+                if(arg.D_H <= vec_size * threads_per_wavefront)
+                {
+                    D_H_alignment_necessary = vec_size;
+                }
+            }
+
+            if(!D_H_alignment_necessary)
+            {
+                throw std::runtime_error("Unsupported D_H");
+            }
+
+            if(arg.D_H % D_H_alignment_necessary)
+            {
+                throw std::runtime_error("Unsupported alignment for D_H");
+            }
+
+            return launch_and_time_kernel(
+                stream_config,
+                D_H_alignment_necessary == 4
+                    ? efficient_attention_forward_decoder_ck_kernel<scalar_t, 4>
+                    : D_H_alignment_necessary == 2
+                          ? efficient_attention_forward_decoder_ck_kernel<scalar_t, 2>
+                          : D_H_alignment_necessary == 1
+                                ? efficient_attention_forward_decoder_ck_kernel<scalar_t, 1>
+                                : nullptr,
+                arg.grid_dim,
+                arg.block_dim,
+                arg.lds_bytes,
+                arg.XQ,
+                arg.cache_K,
+                arg.cache_V,
+                arg.O,
+                arg.seq_kv_lens,
+                arg.XQ_stride_0,
+                arg.XQ_stride_1,
+                arg.XQ_stride_2,
+                arg.K_stride_0,
+                arg.K_stride_1,
+                arg.K_stride_2,
+                arg.K_size_1,
+                arg.D_H,
+                arg.multiquery,
+                arg.qk_scale);
+        }
+    };
 };
 } // namespace device
 } // namespace tensor_operation
-} // namespace ck
\ No newline at end of file
+} // namespace ck
diff --git a/xformers/csrc/attention/hip_fmha/ck_bool_switch.h b/xformers/csrc/attention/hip_fmha/ck_bool_switch.h
index 4e447a143..4b92dd95a 100644
--- a/xformers/csrc/attention/hip_fmha/ck_bool_switch.h
+++ b/xformers/csrc/attention/hip_fmha/ck_bool_switch.h
@@ -1,23 +1,35 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
 #pragma once
 
-#define BOOL_SWITCH_1(COND1, CONST_NAME1, ...) \
-  [&] {                                        \
-    if (COND1) {                               \
-      constexpr bool CONST_NAME1 = true;       \
-      __VA_ARGS__();                           \
-    } else {                                   \
-      constexpr bool CONST_NAME1 = false;      \
-      __VA_ARGS__();                           \
-    }                                          \
-  }()
+#define BOOL_SWITCH_1(COND1, CONST_NAME1, ...)  \
+    [&] {                                       \
+        if(COND1)                               \
+        {                                       \
+            constexpr bool CONST_NAME1 = true;  \
+            __VA_ARGS__();                      \
+        }                                       \
+        else                                    \
+        {                                       \
+            constexpr bool CONST_NAME1 = false; \
+            __VA_ARGS__();                      \
+        }                                       \
+    }()
 
 #define BOOL_SWITCH_2(COND1, CONST_NAME1, COND2, CONST_NAME2, ...) \
-  [&] {                                                            \
-    if (COND1) {                                                   \
-      constexpr bool CONST_NAME1 = true;                           \
-      BOOL_SWITCH_1(COND2, CONST_NAME2, ##__VA_ARGS__);             \
-    } else {                                                       \
-      constexpr bool CONST_NAME1 = false;                          \
-      BOOL_SWITCH_1(COND2, CONST_NAME2, ##__VA_ARGS__);             \
-    }                                                              \
-  }()
+    [&] {                                                          \
+        if(COND1)                                                  \
+        {                                                          \
+            constexpr bool CONST_NAME1 = true;                     \
+            BOOL_SWITCH_1(COND2, CONST_NAME2, ##__VA_ARGS__);      \
+        }                                                          \
+        else                                                       \
+        {                                                          \
+            constexpr bool CONST_NAME1 = false;                    \
+            BOOL_SWITCH_1(COND2, CONST_NAME2, ##__VA_ARGS__);      \
+        }                                                          \
+    }()
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_backward_gemm_constants.h b/xformers/csrc/attention/hip_fmha/ck_fmha_backward_gemm_constants.h
index d80ffa43b..b7de4dbf8 100644
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_backward_gemm_constants.h
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_backward_gemm_constants.h
@@ -1,3 +1,9 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
 #pragma once
 
 #include <ck/ck.hpp>
@@ -5,186 +11,190 @@
 
 // list the template parameters that will not be tuned,
 // the commented lines gives the tunable template parameters
-struct GemmOpConstantsBatchedBackward_V1 {
-  static constexpr ck::index_t NumGemmKPrefetchStage = 1;
-  static constexpr ck::index_t BlockSize = 256;
-  static constexpr ck::index_t MPerBlock = 128;
-  static constexpr ck::index_t NPerBlock = 128;
-  // static constexpr ck::index_t KPerBlock;
-  // static constexpr ck::index_t Gemm1NPerBlock;
-  static constexpr ck::index_t Gemm1KPerBlock = 32;
-  static constexpr ck::index_t Gemm2KPerBlock = 32;
-  static constexpr ck::index_t AK1 = 8;
-  static constexpr ck::index_t BK1 = 8;
-  static constexpr ck::index_t B1K1 = 2;
-  static constexpr ck::index_t MPerXDL = 32;
-  static constexpr ck::index_t NPerXDL = 32;
-  static constexpr ck::index_t MXdlPerWave = 4;
-  static constexpr ck::index_t NXdlPerWave = 1;
-  // static constexpr ck::index_t Gemm1NXdlPerWave;
-  static constexpr ck::index_t Gemm2NXdlPerWave = 1;
-  using ABlockTransferThreadClusterLengths_AK0_M_AK1 = S<4, 64, 1>;
-  using ABlockTransferThreadClusterArrangeOrder = S<1, 0, 2>;
-  using ABlockTransferSrcAccessOrder = S<1, 0, 2>;
-  static constexpr ck::index_t ABlockTransferSrcVectorDim = 2;
-  // static constexpr ck::index_t ABlockTransferSrcScalarPerVector;
-  static constexpr ck::index_t ABlockTransferDstScalarPerVector_AK1 = 8;
-  static constexpr bool ABlockLdsExtraM = true;
-  using BBlockTransferThreadClusterLengths_BK0_N_BK1 = S<4, 64, 1>;
-  using BBlockTransferThreadClusterArrangeOrder = S<1, 0, 2>;
-  using BBlockTransferSrcAccessOrder = S<1, 0, 2>;
-  static constexpr ck::index_t BBlockTransferSrcVectorDim = 2;
-  // static constexpr ck::index_t BBlockTransferSrcScalarPerVector;
-  static constexpr ck::index_t BBlockTransferDstScalarPerVector_BK1 = 8;
-  static constexpr bool BBlockLdsExtraN = true;
-  // static constexpr ck::index_t Acc0BiasTransferSrcScalarPerVector;
-  static constexpr ck::index_t CShuffleMXdlPerWavePerShuffle = 1;
-  // static constexpr ck::index_t CShuffleNXdlPerWavePerShuffle;
-  // using
-  // CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock;
-  // static constexpr ck::index_t
-  // CShuffleBlockTransferScalarPerVector_NPerBlock;
+struct GemmOpConstantsBatchedBackward_V1
+{
+    static constexpr ck::index_t NumGemmKPrefetchStage = 1;
+    static constexpr ck::index_t BlockSize             = 256;
+    static constexpr ck::index_t MPerBlock             = 128;
+    static constexpr ck::index_t NPerBlock             = 128;
+    // static constexpr ck::index_t KPerBlock;
+    // static constexpr ck::index_t Gemm1NPerBlock;
+    static constexpr ck::index_t Gemm1KPerBlock = 32;
+    static constexpr ck::index_t Gemm2KPerBlock = 32;
+    static constexpr ck::index_t AK1            = 8;
+    static constexpr ck::index_t BK1            = 8;
+    static constexpr ck::index_t B1K1           = 2;
+    static constexpr ck::index_t MPerXDL        = 32;
+    static constexpr ck::index_t NPerXDL        = 32;
+    static constexpr ck::index_t MXdlPerWave    = 4;
+    static constexpr ck::index_t NXdlPerWave    = 1;
+    // static constexpr ck::index_t Gemm1NXdlPerWave;
+    static constexpr ck::index_t Gemm2NXdlPerWave           = 1;
+    using ABlockTransferThreadClusterLengths_AK0_M_AK1      = S<4, 64, 1>;
+    using ABlockTransferThreadClusterArrangeOrder           = S<1, 0, 2>;
+    using ABlockTransferSrcAccessOrder                      = S<1, 0, 2>;
+    static constexpr ck::index_t ABlockTransferSrcVectorDim = 2;
+    // static constexpr ck::index_t ABlockTransferSrcScalarPerVector;
+    static constexpr ck::index_t ABlockTransferDstScalarPerVector_AK1 = 8;
+    static constexpr bool ABlockLdsExtraM                             = true;
+    using BBlockTransferThreadClusterLengths_BK0_N_BK1                = S<4, 64, 1>;
+    using BBlockTransferThreadClusterArrangeOrder                     = S<1, 0, 2>;
+    using BBlockTransferSrcAccessOrder                                = S<1, 0, 2>;
+    static constexpr ck::index_t BBlockTransferSrcVectorDim           = 2;
+    // static constexpr ck::index_t BBlockTransferSrcScalarPerVector;
+    static constexpr ck::index_t BBlockTransferDstScalarPerVector_BK1 = 8;
+    static constexpr bool BBlockLdsExtraN                             = true;
+    // static constexpr ck::index_t Acc0BiasTransferSrcScalarPerVector;
+    static constexpr ck::index_t CShuffleMXdlPerWavePerShuffle = 1;
+    // static constexpr ck::index_t CShuffleNXdlPerWavePerShuffle;
+    // using
+    // CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock;
+    // static constexpr ck::index_t
+    // CShuffleBlockTransferScalarPerVector_NPerBlock;
 };
 
 // list the template parameters that will not be tuned,
 // the commented lines gives the tunable template parameters
-struct GemmOpConstantsBatchedBackward_V2 {
-  static constexpr ck::index_t NumGemmKPrefetchStage = 1;
-  static constexpr ck::index_t BlockSize = 256;
-  static constexpr ck::index_t MPerBlock = 64;
-  static constexpr ck::index_t NPerBlock = 128;
-  static constexpr ck::index_t KPerBlock = 128;
-  // static constexpr ck::index_t Gemm1NPerBlock;
-  static constexpr ck::index_t Gemm1KPerBlock = 32;
-  static constexpr ck::index_t Gemm2KPerBlock = 64;
-  static constexpr ck::index_t AK1 = 8;
-  static constexpr ck::index_t BK1 = 8;
-  static constexpr ck::index_t B1K1 = 2;
-  static constexpr ck::index_t MPerXDL = 32;
-  static constexpr ck::index_t NPerXDL = 32;
-  static constexpr ck::index_t MXdlPerWave = 2;
-  static constexpr ck::index_t NXdlPerWave = 1;
-  // static constexpr ck::index_t Gemm1NXdlPerWave;
-  static constexpr ck::index_t Gemm2NXdlPerWave = 1;
-  using ABlockTransferThreadClusterLengths_AK0_M_AK1 = S<4, 64, 1>;
-  using ABlockTransferThreadClusterArrangeOrder = S<1, 0, 2>;
-  using ABlockTransferSrcAccessOrder = S<1, 0, 2>;
-  static constexpr ck::index_t ABlockTransferSrcVectorDim = 2;
-  // static constexpr ck::index_t ABlockTransferSrcScalarPerVector;
-  static constexpr ck::index_t ABlockTransferDstScalarPerVector_AK1 = 8;
-  static constexpr bool ABlockLdsExtraM = true;
-  using BBlockTransferThreadClusterLengths_BK0_N_BK1 = S<4, 64, 1>;
-  using BBlockTransferThreadClusterArrangeOrder = S<1, 0, 2>;
-  using BBlockTransferSrcAccessOrder = S<1, 0, 2>;
-  static constexpr ck::index_t BBlockTransferSrcVectorDim = 2;
-  // static constexpr ck::index_t BBlockTransferSrcScalarPerVector;
-  static constexpr ck::index_t BBlockTransferDstScalarPerVector_BK1 = 8;
-  static constexpr bool BBlockLdsExtraN = true;
-  // static constexpr ck::index_t Acc0BiasTransferSrcScalarPerVector;
-  using B1BlockTransferThreadClusterLengths_BK0_N_BK1 = S<8, 32, 1>;
-  using B1BlockTransferThreadClusterArrangeOrder = S<0, 2, 1>;
-  using B1BlockTransferSrcAccessOrder = S<0, 2, 1>;
-  static constexpr ck::index_t B1BlockTransferSrcVectorDim = 1;
-  // static constexpr ck::index_t B1BlockTransferSrcScalarPerVector;
-  static constexpr ck::index_t B1BlockTransferDstScalarPerVector_BK1 = 2;
-  static constexpr bool B1BlockLdsExtraN = false;
-  static constexpr ck::index_t CShuffleMXdlPerWavePerShuffle = 1;
-  // static constexpr ck::index_t CShuffleNXdlPerWavePerShuffle;
-  // using
-  // CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock;
-  // static constexpr ck::index_t
-  // CShuffleBlockTransferScalarPerVector_NPerBlock;
+struct GemmOpConstantsBatchedBackward_V2
+{
+    static constexpr ck::index_t NumGemmKPrefetchStage = 1;
+    static constexpr ck::index_t BlockSize             = 256;
+    static constexpr ck::index_t MPerBlock             = 64;
+    static constexpr ck::index_t NPerBlock             = 128;
+    static constexpr ck::index_t KPerBlock             = 128;
+    // static constexpr ck::index_t Gemm1NPerBlock;
+    static constexpr ck::index_t Gemm1KPerBlock = 32;
+    static constexpr ck::index_t Gemm2KPerBlock = 64;
+    static constexpr ck::index_t AK1            = 8;
+    static constexpr ck::index_t BK1            = 8;
+    static constexpr ck::index_t B1K1           = 2;
+    static constexpr ck::index_t MPerXDL        = 32;
+    static constexpr ck::index_t NPerXDL        = 32;
+    static constexpr ck::index_t MXdlPerWave    = 2;
+    static constexpr ck::index_t NXdlPerWave    = 1;
+    // static constexpr ck::index_t Gemm1NXdlPerWave;
+    static constexpr ck::index_t Gemm2NXdlPerWave           = 1;
+    using ABlockTransferThreadClusterLengths_AK0_M_AK1      = S<4, 64, 1>;
+    using ABlockTransferThreadClusterArrangeOrder           = S<1, 0, 2>;
+    using ABlockTransferSrcAccessOrder                      = S<1, 0, 2>;
+    static constexpr ck::index_t ABlockTransferSrcVectorDim = 2;
+    // static constexpr ck::index_t ABlockTransferSrcScalarPerVector;
+    static constexpr ck::index_t ABlockTransferDstScalarPerVector_AK1 = 8;
+    static constexpr bool ABlockLdsExtraM                             = true;
+    using BBlockTransferThreadClusterLengths_BK0_N_BK1                = S<4, 64, 1>;
+    using BBlockTransferThreadClusterArrangeOrder                     = S<1, 0, 2>;
+    using BBlockTransferSrcAccessOrder                                = S<1, 0, 2>;
+    static constexpr ck::index_t BBlockTransferSrcVectorDim           = 2;
+    // static constexpr ck::index_t BBlockTransferSrcScalarPerVector;
+    static constexpr ck::index_t BBlockTransferDstScalarPerVector_BK1 = 8;
+    static constexpr bool BBlockLdsExtraN                             = true;
+    // static constexpr ck::index_t Acc0BiasTransferSrcScalarPerVector;
+    using B1BlockTransferThreadClusterLengths_BK0_N_BK1      = S<8, 32, 1>;
+    using B1BlockTransferThreadClusterArrangeOrder           = S<0, 2, 1>;
+    using B1BlockTransferSrcAccessOrder                      = S<0, 2, 1>;
+    static constexpr ck::index_t B1BlockTransferSrcVectorDim = 1;
+    // static constexpr ck::index_t B1BlockTransferSrcScalarPerVector;
+    static constexpr ck::index_t B1BlockTransferDstScalarPerVector_BK1 = 2;
+    static constexpr bool B1BlockLdsExtraN                             = false;
+    static constexpr ck::index_t CShuffleMXdlPerWavePerShuffle         = 1;
+    // static constexpr ck::index_t CShuffleNXdlPerWavePerShuffle;
+    // using
+    // CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock;
+    // static constexpr ck::index_t
+    // CShuffleBlockTransferScalarPerVector_NPerBlock;
 };
 
 // list the template parameters that will not be tuned,
 // the commented lines gives the tunable template parameters
-struct GemmOpConstantsGroupedBackward_V1 {
-  static constexpr ck::index_t NumGemmKPrefetchStage = 1;
-  static constexpr ck::index_t BlockSize = 256;
-  static constexpr ck::index_t MPerBlock = 128;
-  static constexpr ck::index_t NPerBlock = 128;
-  // static constexpr ck::index_t KPerBlock;
-  // static constexpr ck::index_t Gemm1NPerBlock;
-  static constexpr ck::index_t Gemm1KPerBlock = 32;
-  static constexpr ck::index_t Gemm2KPerBlock = 32;
-  static constexpr ck::index_t AK1 = 8;
-  static constexpr ck::index_t BK1 = 8;
-  static constexpr ck::index_t B1K1 = 2;
-  static constexpr ck::index_t MPerXDL = 32;
-  static constexpr ck::index_t NPerXDL = 32;
-  static constexpr ck::index_t MXdlPerWave = 4;
-  static constexpr ck::index_t NXdlPerWave = 1;
-  // static constexpr ck::index_t Gemm1NXdlPerWave;
-  static constexpr ck::index_t Gemm2NXdlPerWave = 1;
-  using ABlockTransferThreadClusterLengths_AK0_M_AK1 = S<4, 64, 1>;
-  using ABlockTransferThreadClusterArrangeOrder = S<1, 0, 2>;
-  using ABlockTransferSrcAccessOrder = S<1, 0, 2>;
-  static constexpr ck::index_t ABlockTransferSrcVectorDim = 2;
-  // static constexpr ck::index_t ABlockTransferSrcScalarPerVector;
-  static constexpr ck::index_t ABlockTransferDstScalarPerVector_AK1 = 8;
-  static constexpr bool ABlockLdsExtraM = true;
-  using BBlockTransferThreadClusterLengths_BK0_N_BK1 = S<4, 64, 1>;
-  using BBlockTransferThreadClusterArrangeOrder = S<1, 0, 2>;
-  using BBlockTransferSrcAccessOrder = S<1, 0, 2>;
-  static constexpr ck::index_t BBlockTransferSrcVectorDim = 2;
-  // static constexpr ck::index_t BBlockTransferSrcScalarPerVector;
-  static constexpr ck::index_t BBlockTransferDstScalarPerVector_BK1 = 8;
-  static constexpr bool BBlockLdsExtraN = true;
-  // static constexpr ck::index_t Acc0BiasTransferSrcScalarPerVector;
-  static constexpr ck::index_t CShuffleMXdlPerWavePerShuffle = 1;
-  // static constexpr ck::index_t CShuffleNXdlPerWavePerShuffle;
-  // using
-  // CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock;
-  // static constexpr ck::index_t
-  // CShuffleBlockTransferScalarPerVector_NPerBlock;
+struct GemmOpConstantsGroupedBackward_V1
+{
+    static constexpr ck::index_t NumGemmKPrefetchStage = 1;
+    static constexpr ck::index_t BlockSize             = 256;
+    static constexpr ck::index_t MPerBlock             = 128;
+    static constexpr ck::index_t NPerBlock             = 128;
+    // static constexpr ck::index_t KPerBlock;
+    // static constexpr ck::index_t Gemm1NPerBlock;
+    static constexpr ck::index_t Gemm1KPerBlock = 32;
+    static constexpr ck::index_t Gemm2KPerBlock = 32;
+    static constexpr ck::index_t AK1            = 8;
+    static constexpr ck::index_t BK1            = 8;
+    static constexpr ck::index_t B1K1           = 2;
+    static constexpr ck::index_t MPerXDL        = 32;
+    static constexpr ck::index_t NPerXDL        = 32;
+    static constexpr ck::index_t MXdlPerWave    = 4;
+    static constexpr ck::index_t NXdlPerWave    = 1;
+    // static constexpr ck::index_t Gemm1NXdlPerWave;
+    static constexpr ck::index_t Gemm2NXdlPerWave           = 1;
+    using ABlockTransferThreadClusterLengths_AK0_M_AK1      = S<4, 64, 1>;
+    using ABlockTransferThreadClusterArrangeOrder           = S<1, 0, 2>;
+    using ABlockTransferSrcAccessOrder                      = S<1, 0, 2>;
+    static constexpr ck::index_t ABlockTransferSrcVectorDim = 2;
+    // static constexpr ck::index_t ABlockTransferSrcScalarPerVector;
+    static constexpr ck::index_t ABlockTransferDstScalarPerVector_AK1 = 8;
+    static constexpr bool ABlockLdsExtraM                             = true;
+    using BBlockTransferThreadClusterLengths_BK0_N_BK1                = S<4, 64, 1>;
+    using BBlockTransferThreadClusterArrangeOrder                     = S<1, 0, 2>;
+    using BBlockTransferSrcAccessOrder                                = S<1, 0, 2>;
+    static constexpr ck::index_t BBlockTransferSrcVectorDim           = 2;
+    // static constexpr ck::index_t BBlockTransferSrcScalarPerVector;
+    static constexpr ck::index_t BBlockTransferDstScalarPerVector_BK1 = 8;
+    static constexpr bool BBlockLdsExtraN                             = true;
+    // static constexpr ck::index_t Acc0BiasTransferSrcScalarPerVector;
+    static constexpr ck::index_t CShuffleMXdlPerWavePerShuffle = 1;
+    // static constexpr ck::index_t CShuffleNXdlPerWavePerShuffle;
+    // using
+    // CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock;
+    // static constexpr ck::index_t
+    // CShuffleBlockTransferScalarPerVector_NPerBlock;
 };
 
 // list the template parameters that will not be tuned,
 // the commented lines gives the tunable template parameters
-struct GemmOpConstantsGroupedBackward_V2 {
-  static constexpr ck::index_t NumGemmKPrefetchStage = 1;
-  static constexpr ck::index_t BlockSize = 256;
-  static constexpr ck::index_t MPerBlock = 64;
-  static constexpr ck::index_t NPerBlock = 128;
-  static constexpr ck::index_t KPerBlock = 128;
-  // static constexpr ck::index_t Gemm1NPerBlock;
-  static constexpr ck::index_t Gemm1KPerBlock = 32;
-  static constexpr ck::index_t Gemm2KPerBlock = 64;
-  static constexpr ck::index_t AK1 = 8;
-  static constexpr ck::index_t BK1 = 8;
-  static constexpr ck::index_t B1K1 = 2;
-  static constexpr ck::index_t MPerXDL = 32;
-  static constexpr ck::index_t NPerXDL = 32;
-  static constexpr ck::index_t MXdlPerWave = 2;
-  static constexpr ck::index_t NXdlPerWave = 1;
-  // static constexpr ck::index_t Gemm1NXdlPerWave;
-  static constexpr ck::index_t Gemm2NXdlPerWave = 1;
-  using ABlockTransferThreadClusterLengths_AK0_M_AK1 = S<4, 64, 1>;
-  using ABlockTransferThreadClusterArrangeOrder = S<1, 0, 2>;
-  using ABlockTransferSrcAccessOrder = S<1, 0, 2>;
-  static constexpr ck::index_t ABlockTransferSrcVectorDim = 2;
-  // static constexpr ck::index_t ABlockTransferSrcScalarPerVector;
-  static constexpr ck::index_t ABlockTransferDstScalarPerVector_AK1 = 8;
-  static constexpr bool ABlockLdsExtraM = true;
-  using BBlockTransferThreadClusterLengths_BK0_N_BK1 = S<4, 64, 1>;
-  using BBlockTransferThreadClusterArrangeOrder = S<1, 0, 2>;
-  using BBlockTransferSrcAccessOrder = S<1, 0, 2>;
-  static constexpr ck::index_t BBlockTransferSrcVectorDim = 2;
-  // static constexpr ck::index_t BBlockTransferSrcScalarPerVector;
-  static constexpr ck::index_t BBlockTransferDstScalarPerVector_BK1 = 8;
-  static constexpr bool BBlockLdsExtraN = true;
-  // static constexpr ck::index_t Acc0BiasTransferSrcScalarPerVector;
-  using B1BlockTransferThreadClusterLengths_BK0_N_BK1 = S<8, 32, 1>;
-  using B1BlockTransferThreadClusterArrangeOrder = S<0, 2, 1>;
-  using B1BlockTransferSrcAccessOrder = S<0, 2, 1>;
-  static constexpr ck::index_t B1BlockTransferSrcVectorDim = 1;
-  // static constexpr ck::index_t B1BlockTransferSrcScalarPerVector;
-  static constexpr ck::index_t B1BlockTransferDstScalarPerVector_BK1 = 2;
-  static constexpr bool B1BlockLdsExtraN = false;
-  static constexpr ck::index_t CShuffleMXdlPerWavePerShuffle = 1;
-  // static constexpr ck::index_t CShuffleNXdlPerWavePerShuffle;
-  // using
-  // CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock;
-  // static constexpr ck::index_t
-  // CShuffleBlockTransferScalarPerVector_NPerBlock;
+struct GemmOpConstantsGroupedBackward_V2
+{
+    static constexpr ck::index_t NumGemmKPrefetchStage = 1;
+    static constexpr ck::index_t BlockSize             = 256;
+    static constexpr ck::index_t MPerBlock             = 64;
+    static constexpr ck::index_t NPerBlock             = 128;
+    static constexpr ck::index_t KPerBlock             = 128;
+    // static constexpr ck::index_t Gemm1NPerBlock;
+    static constexpr ck::index_t Gemm1KPerBlock = 32;
+    static constexpr ck::index_t Gemm2KPerBlock = 64;
+    static constexpr ck::index_t AK1            = 8;
+    static constexpr ck::index_t BK1            = 8;
+    static constexpr ck::index_t B1K1           = 2;
+    static constexpr ck::index_t MPerXDL        = 32;
+    static constexpr ck::index_t NPerXDL        = 32;
+    static constexpr ck::index_t MXdlPerWave    = 2;
+    static constexpr ck::index_t NXdlPerWave    = 1;
+    // static constexpr ck::index_t Gemm1NXdlPerWave;
+    static constexpr ck::index_t Gemm2NXdlPerWave           = 1;
+    using ABlockTransferThreadClusterLengths_AK0_M_AK1      = S<4, 64, 1>;
+    using ABlockTransferThreadClusterArrangeOrder           = S<1, 0, 2>;
+    using ABlockTransferSrcAccessOrder                      = S<1, 0, 2>;
+    static constexpr ck::index_t ABlockTransferSrcVectorDim = 2;
+    // static constexpr ck::index_t ABlockTransferSrcScalarPerVector;
+    static constexpr ck::index_t ABlockTransferDstScalarPerVector_AK1 = 8;
+    static constexpr bool ABlockLdsExtraM                             = true;
+    using BBlockTransferThreadClusterLengths_BK0_N_BK1                = S<4, 64, 1>;
+    using BBlockTransferThreadClusterArrangeOrder                     = S<1, 0, 2>;
+    using BBlockTransferSrcAccessOrder                                = S<1, 0, 2>;
+    static constexpr ck::index_t BBlockTransferSrcVectorDim           = 2;
+    // static constexpr ck::index_t BBlockTransferSrcScalarPerVector;
+    static constexpr ck::index_t BBlockTransferDstScalarPerVector_BK1 = 8;
+    static constexpr bool BBlockLdsExtraN                             = true;
+    // static constexpr ck::index_t Acc0BiasTransferSrcScalarPerVector;
+    using B1BlockTransferThreadClusterLengths_BK0_N_BK1      = S<8, 32, 1>;
+    using B1BlockTransferThreadClusterArrangeOrder           = S<0, 2, 1>;
+    using B1BlockTransferSrcAccessOrder                      = S<0, 2, 1>;
+    static constexpr ck::index_t B1BlockTransferSrcVectorDim = 1;
+    // static constexpr ck::index_t B1BlockTransferSrcScalarPerVector;
+    static constexpr ck::index_t B1BlockTransferDstScalarPerVector_BK1 = 2;
+    static constexpr bool B1BlockLdsExtraN                             = false;
+    static constexpr ck::index_t CShuffleMXdlPerWavePerShuffle         = 1;
+    // static constexpr ck::index_t CShuffleNXdlPerWavePerShuffle;
+    // using
+    // CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock;
+    // static constexpr ck::index_t
+    // CShuffleBlockTransferScalarPerVector_NPerBlock;
 };
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_backward.h b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_backward.h
index 9293d4d4f..3c5fdffc2 100644
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_backward.h
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_backward.h
@@ -1,3 +1,9 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
 #pragma once
 
 #include <sstream>
@@ -16,60 +22,56 @@
 #include "ck_fmha_op_helper.h"
 #include "ck_fmha_params.h"
 
-template <
-    typename scalar_t,
-    int32_t custom_mask_type,
-    bool has_attn_bias,
-    bool use_fp32_qkv_grad>
-struct batched_backward_masktype_attnbias_dispatched {
-  using PassThrough = ck::tensor_operation::element_wise::PassThrough;
-  using Scale = ck::tensor_operation::element_wise::Scale;
-
-  using QKVElementOp = PassThrough;
-  using YElementOp = PassThrough;
-
-  using InputDataType = scalar_t;
-  using OutputDataType =
-      typename std::conditional<use_fp32_qkv_grad, F32, scalar_t>::type;
-  using GemmDataType = scalar_t;
-  using AccDataType = F32;
-  using ShuffleDataType = F32;
-  using LSEDataType = F32;
-  using ZDataType = unsigned short;
-  using Acc0BiasDataType =
-      typename std::conditional<has_attn_bias, scalar_t, void>::type;
-  using Acc1BiasDataType = void;
-
-  static constexpr auto GemmSpec =
-      ck::tensor_operation::device::GemmSpecialization::MNKOPadding;
-  static constexpr auto MaskingSpec =
-      static_cast<ck::tensor_operation::device::MaskingSpecialization>(
-          custom_mask_type);
-
-  static constexpr bool Deterministic = true;
-
-  static constexpr ck::index_t kAcc0BiasTransferSrcScalarPerVector = 1;
+template <typename scalar_t, int32_t custom_mask_type, bool has_attn_bias, bool use_fp32_qkv_grad>
+struct batched_backward_masktype_attnbias_dispatched
+{
+    using PassThrough = ck::tensor_operation::element_wise::PassThrough;
+    using Scale       = ck::tensor_operation::element_wise::Scale;
+
+    using QKVElementOp = PassThrough;
+    using YElementOp   = PassThrough;
+
+    using InputDataType    = scalar_t;
+    using OutputDataType   = typename std::conditional<use_fp32_qkv_grad, F32, scalar_t>::type;
+    using GemmDataType     = scalar_t;
+    using AccDataType      = F32;
+    using ShuffleDataType  = F32;
+    using LSEDataType      = F32;
+    using ZDataType        = unsigned short;
+    using Acc0BiasDataType = typename std::conditional<has_attn_bias, scalar_t, void>::type;
+    using Acc1BiasDataType = void;
+
+    static constexpr auto GemmSpec = ck::tensor_operation::device::GemmSpecialization::MNKOPadding;
+    static constexpr auto MaskingSpec =
+        static_cast<ck::tensor_operation::device::MaskingSpecialization>(custom_mask_type);
+
+    static constexpr bool Deterministic = true;
+
+    static constexpr ck::index_t kAcc0BiasTransferSrcScalarPerVector = 1;
 
 #ifndef BATCHED_BACKWARD_V1_HEADDIM_SWITCH
-#define BATCHED_BACKWARD_V1_HEADDIM_SWITCH(HEAD_DIM1, HEAD_DIM2, ...) \
-  [&] {                                                               \
-    if (HEAD_DIM1 <= 32 && HEAD_DIM2 <= 32) {                         \
-      constexpr ck::index_t kGemm1NPerBlock = 32;                     \
-      constexpr ck::index_t kGemm1NXdlPerWave = 1;                    \
-      constexpr ck::index_t kCShuffleNXdlPerWavePerShuffle = 1;       \
-      using kCShuffleBlockTransferClusterLengths = S<1, 64, 1, 4>;    \
-      __VA_ARGS__();                                                  \
-    } else {                                                          \
-      constexpr ck::index_t kGemm1NPerBlock = 64;                     \
-      constexpr ck::index_t kGemm1NXdlPerWave = 2;                    \
-      constexpr ck::index_t kCShuffleNXdlPerWavePerShuffle = 2;       \
-      using kCShuffleBlockTransferClusterLengths = S<1, 32, 1, 8>;    \
-      __VA_ARGS__();                                                  \
-    };                                                                \
-  }()
+#define BATCHED_BACKWARD_V1_HEADDIM_SWITCH(HEAD_DIM1, HEAD_DIM2, ...)              \
+    [&] {                                                                          \
+        if(HEAD_DIM1 <= 32 && HEAD_DIM2 <= 32)                                     \
+        {                                                                          \
+            constexpr ck::index_t kGemm1NPerBlock                = 32;             \
+            constexpr ck::index_t kGemm1NXdlPerWave              = 1;              \
+            constexpr ck::index_t kCShuffleNXdlPerWavePerShuffle = 1;              \
+            using kCShuffleBlockTransferClusterLengths           = S<1, 64, 1, 4>; \
+            __VA_ARGS__();                                                         \
+        }                                                                          \
+        else                                                                       \
+        {                                                                          \
+            constexpr ck::index_t kGemm1NPerBlock                = 64;             \
+            constexpr ck::index_t kGemm1NXdlPerWave              = 2;              \
+            constexpr ck::index_t kCShuffleNXdlPerWavePerShuffle = 2;              \
+            using kCShuffleBlockTransferClusterLengths           = S<1, 32, 1, 8>; \
+            __VA_ARGS__();                                                         \
+        };                                                                         \
+    }()
 #endif
 
-  // clang-format off
+    // clang-format off
   template <
       ck::index_t kGemm1NPerBlock,
       ck::index_t kGemm1NXdlPerWave,
@@ -140,9 +142,9 @@ struct batched_backward_masktype_attnbias_dispatched {
           kCShuffleBlockTransferScalarPerVector,
           MaskingSpec,
           Deterministic>;
-  // clang-format on
+    // clang-format on
 
-  // clang-format off
+    // clang-format off
   template <
       ck::index_t kGemm1NPerBlock,
       ck::index_t kGemm1NXdlPerWave,
@@ -221,297 +223,276 @@ struct batched_backward_masktype_attnbias_dispatched {
           kCShuffleBlockTransferScalarPerVector,
           MaskingSpec,
           Deterministic>;
-  // clang-format on
-
-  static constexpr auto I1 = ck::Number<1>{};
-  static constexpr auto I2 = ck::Number<2>{};
-  static constexpr auto I3 = ck::Number<3>{};
-
-  static void Run(BatchedBackwardParams& param, hipStream_t stream) {
-    using ck::math::min;
-
-    if (param.K <= 64 && param.Kv <= 64) {
-      // compile-time constants which don't depend on head-dim switching
-      constexpr ck::index_t thread_slice_length_ak1 =
-          GemmOpConstantsBatchedBackward_V1::AK1 /
-          GemmOpConstantsBatchedBackward_V1::
-              ABlockTransferThreadClusterLengths_AK0_M_AK1::At(I2);
-      constexpr ck::index_t thread_slice_length_bk1 =
-          GemmOpConstantsBatchedBackward_V1::BK1 /
-          GemmOpConstantsBatchedBackward_V1::
-              BBlockTransferThreadClusterLengths_BK0_N_BK1::At(I2);
-
-      static_assert(
-          thread_slice_length_ak1 == thread_slice_length_bk1,
-          "ABlockTransfer and BBlockTransfer should use completely same K1 sizes and ThreadClusterLengths!");
-
-      constexpr ck::index_t kABBlockTransferSrcScalarPerVector_max =
-          min(2, thread_slice_length_ak1);
-
-      BATCHED_BACKWARD_V1_HEADDIM_SWITCH(param.K, param.Kv, [&] {
-        constexpr ck::index_t thread_slice_length_cshuflle_n =
-            (kCShuffleNXdlPerWavePerShuffle * kGemm1NPerBlock /
-             kGemm1NXdlPerWave) /
-            kCShuffleBlockTransferClusterLengths::At(I3);
-
-        constexpr ck::index_t kCShuffleBlockTransferScalarPerVector_max =
-            min(2, thread_slice_length_cshuflle_n);
-
-        ALIGN_SWITCH_2(
-            kABBlockTransferSrcScalarPerVector_max,
-            kABBlockTransferSrcScalarPerVector,
-            param.K,
-            kCShuffleBlockTransferScalarPerVector_max,
-            kCShuffleBlockTransferScalarPerVector,
-            param.Kv,
-            [&] {
-              using DeviceOpInstance = DeviceOpInstanceTemp_V1<
-                  kGemm1NPerBlock,
-                  kGemm1NXdlPerWave,
-                  kCShuffleNXdlPerWavePerShuffle,
-                  kCShuffleBlockTransferClusterLengths,
-                  kABBlockTransferSrcScalarPerVector,
-                  kCShuffleBlockTransferScalarPerVector>;
-
-              RunWithDeviceOp<DeviceOpInstance>(param, stream);
+    // clang-format on
+
+    static constexpr auto I1 = ck::Number<1>{};
+    static constexpr auto I2 = ck::Number<2>{};
+    static constexpr auto I3 = ck::Number<3>{};
+
+    static void Run(BatchedBackwardParams& param, hipStream_t stream)
+    {
+        using ck::math::min;
+
+        if(param.K <= 64 && param.Kv <= 64)
+        {
+            // compile-time constants which don't depend on head-dim switching
+            constexpr ck::index_t thread_slice_length_ak1 =
+                GemmOpConstantsBatchedBackward_V1::AK1 /
+                GemmOpConstantsBatchedBackward_V1::ABlockTransferThreadClusterLengths_AK0_M_AK1::At(
+                    I2);
+            constexpr ck::index_t thread_slice_length_bk1 =
+                GemmOpConstantsBatchedBackward_V1::BK1 /
+                GemmOpConstantsBatchedBackward_V1::BBlockTransferThreadClusterLengths_BK0_N_BK1::At(
+                    I2);
+
+            static_assert(thread_slice_length_ak1 == thread_slice_length_bk1,
+                          "ABlockTransfer and BBlockTransfer should use completely same K1 sizes "
+                          "and ThreadClusterLengths!");
+
+            constexpr ck::index_t kABBlockTransferSrcScalarPerVector_max =
+                min(2, thread_slice_length_ak1);
+
+            BATCHED_BACKWARD_V1_HEADDIM_SWITCH(param.K, param.Kv, [&] {
+                constexpr ck::index_t thread_slice_length_cshuflle_n =
+                    (kCShuffleNXdlPerWavePerShuffle * kGemm1NPerBlock / kGemm1NXdlPerWave) /
+                    kCShuffleBlockTransferClusterLengths::At(I3);
+
+                constexpr ck::index_t kCShuffleBlockTransferScalarPerVector_max =
+                    min(2, thread_slice_length_cshuflle_n);
+
+                ALIGN_SWITCH_2(
+                    kABBlockTransferSrcScalarPerVector_max,
+                    kABBlockTransferSrcScalarPerVector,
+                    param.K,
+                    kCShuffleBlockTransferScalarPerVector_max,
+                    kCShuffleBlockTransferScalarPerVector,
+                    param.Kv,
+                    [&] {
+                        using DeviceOpInstance =
+                            DeviceOpInstanceTemp_V1<kGemm1NPerBlock,
+                                                    kGemm1NXdlPerWave,
+                                                    kCShuffleNXdlPerWavePerShuffle,
+                                                    kCShuffleBlockTransferClusterLengths,
+                                                    kABBlockTransferSrcScalarPerVector,
+                                                    kCShuffleBlockTransferScalarPerVector>;
+
+                        RunWithDeviceOp<DeviceOpInstance>(param, stream);
+                    });
             });
-      });
-    } else {
-      constexpr ck::index_t kGemm1NPerBlock = 128;
-      constexpr ck::index_t kGemm1NXdlPerWave = 4;
-      constexpr ck::index_t kCShuffleNXdlPerWavePerShuffle = 4;
-      using kCShuffleBlockTransferClusterLengths = S<1, 32, 1, 8>;
-
-      constexpr ck::index_t thread_slice_length_ak1 =
-          GemmOpConstantsBatchedBackward_V2::AK1 /
-          GemmOpConstantsBatchedBackward_V2::
-              ABlockTransferThreadClusterLengths_AK0_M_AK1::At(I2);
-      constexpr ck::index_t thread_slice_length_bk1 =
-          GemmOpConstantsBatchedBackward_V2::BK1 /
-          GemmOpConstantsBatchedBackward_V2::
-              BBlockTransferThreadClusterLengths_BK0_N_BK1::At(I2);
-
-      static_assert(
-          thread_slice_length_ak1 == thread_slice_length_bk1,
-          "ABlockTransfer and BBlockTransfer should use completely same K1 sizes and ThreadClusterLengths!");
-
-      constexpr ck::index_t kABBlockTransferSrcScalarPerVector_max =
-          min(2, thread_slice_length_ak1);
-
-      constexpr ck::index_t thread_slice_length_gemm1n = kGemm1NPerBlock /
-          GemmOpConstantsBatchedBackward_V2::
-              B1BlockTransferThreadClusterLengths_BK0_N_BK1::At(I1);
-      constexpr ck::index_t kB1BlockTransferSrcScalarPerVector_max =
-          min(2, thread_slice_length_gemm1n);
-
-      constexpr ck::index_t thread_slice_length_cshuflle_n =
-          (kCShuffleNXdlPerWavePerShuffle * kGemm1NPerBlock /
-           kGemm1NXdlPerWave) /
-          kCShuffleBlockTransferClusterLengths::At(I3);
-
-      constexpr ck::index_t kCShuffleBlockTransferScalarPerVector_max =
-          min(2, thread_slice_length_cshuflle_n);
-
-      if constexpr (
-          kB1BlockTransferSrcScalarPerVector_max >=
-          kCShuffleBlockTransferScalarPerVector_max) {
-        ALIGN_SWITCH_2(
-            kABBlockTransferSrcScalarPerVector_max,
-            kABBlockTransferSrcScalarPerVector,
-            param.K,
-            kB1BlockTransferSrcScalarPerVector_max,
-            kB1BlockTransferSrcScalarPerVector,
-            param.Kv,
-            [&] {
-              constexpr ck::index_t kCShuffleBlockTransferScalarPerVector =
-                  min(kB1BlockTransferSrcScalarPerVector,
-                      kCShuffleBlockTransferScalarPerVector_max);
-
-              static_assert(
-                  kB1BlockTransferSrcScalarPerVector > 0,
-                  "kB1BlockTransferSrcScalarPerVector must be positive");
-
-              using DeviceOpInstance = DeviceOpInstanceTemp_V2<
-                  kGemm1NPerBlock,
-                  kGemm1NXdlPerWave,
-                  kCShuffleNXdlPerWavePerShuffle,
-                  kCShuffleBlockTransferClusterLengths,
-                  kABBlockTransferSrcScalarPerVector,
-                  kB1BlockTransferSrcScalarPerVector,
-                  kCShuffleBlockTransferScalarPerVector>;
-
-              RunWithDeviceOp<DeviceOpInstance>(param, stream);
-            });
-      } else {
-        ALIGN_SWITCH_2(
-            kABBlockTransferSrcScalarPerVector_max,
-            kABBlockTransferSrcScalarPerVector,
-            param.K,
-            kCShuffleBlockTransferScalarPerVector_max,
-            kCShuffleBlockTransferScalarPerVector,
-            param.Kv,
-            [&] {
-              constexpr ck::index_t kB1BlockTransferSrcScalarPerVector =
-                  min(kCShuffleBlockTransferScalarPerVector,
-                      kB1BlockTransferSrcScalarPerVector_max);
-
-              static_assert(
-                  kB1BlockTransferSrcScalarPerVector > 0,
-                  "kB1BlockTransferSrcScalarPerVector must be positive");
-
-              using DeviceOpInstance = DeviceOpInstanceTemp_V2<
-                  kGemm1NPerBlock,
-                  kGemm1NXdlPerWave,
-                  kCShuffleNXdlPerWavePerShuffle,
-                  kCShuffleBlockTransferClusterLengths,
-                  kABBlockTransferSrcScalarPerVector,
-                  kB1BlockTransferSrcScalarPerVector,
-                  kCShuffleBlockTransferScalarPerVector>;
-
-              RunWithDeviceOp<DeviceOpInstance>(param, stream);
-            });
-      };
-    };
-  };
-
-  template <typename DeviceOpInstance>
-  static void RunWithDeviceOp(
-      BatchedBackwardParams& param,
-      hipStream_t stream) {
-    std::vector<ck::index_t> q_gs_ms_ks_lengths{
-        param.B, param.Hq, param.M, param.K};
-    std::vector<ck::index_t> q_gs_ms_ks_strides{
-        param.q_strides[0],
-        param.q_strides[2],
-        param.q_strides[1],
-        param.q_strides[3]};
-
-    std::vector<ck::index_t> k_gs_ns_ks_lengths{
-        param.B, param.Hkv, param.N, param.K};
-    std::vector<ck::index_t> k_gs_ns_ks_strides{
-        param.k_strides[0],
-        param.k_strides[2],
-        param.k_strides[1],
-        param.k_strides[3]};
-
-    std::vector<ck::index_t> kgrad_gs_ns_ks_lengths = {
-        param.B, param.Hq, param.N, param.K};
-    std::vector<ck::index_t> kgrad_gs_ns_ks_strides = {
-        param.tmp_grad_k_strides[0],
-        param.tmp_grad_k_strides[2],
-        param.tmp_grad_k_strides[1],
-        param.tmp_grad_k_strides[3]};
-
-    std::vector<ck::index_t> v_gs_os_ns_lengths{
-        param.B, param.Hkv, param.Kv, param.N};
-    std::vector<ck::index_t> v_gs_os_ns_strides{
-        param.v_strides[0],
-        param.v_strides[2],
-        param.v_strides[3],
-        param.v_strides[1]};
-
-    std::vector<ck::index_t> vgrad_gs_os_ns_lengths = {
-        param.B, param.Hq, param.Kv, param.N};
-    std::vector<ck::index_t> vgrad_gs_os_ns_strides = {
-        param.tmp_grad_v_strides[0],
-        param.tmp_grad_v_strides[2],
-        param.tmp_grad_v_strides[3],
-        param.tmp_grad_v_strides[1]};
-
-    std::vector<ck::index_t> y_gs_ms_os_lengths{
-        param.B, param.Hq, param.M, param.Kv};
-    std::vector<ck::index_t> y_gs_ms_os_strides{
-        param.out_strides[0],
-        param.out_strides[2],
-        param.out_strides[1],
-        param.out_strides[3]};
-
-    std::vector<ck::index_t> lse_gs_ms_lengths{param.B, param.Hq, param.M};
-
-    std::vector<ck::index_t> d_gs_ms_ns_lengths;
-    std::vector<ck::index_t> d_gs_ms_ns_strides;
-
-    if constexpr (has_attn_bias) {
-      d_gs_ms_ns_lengths = {param.B, param.Hq, param.M, param.N};
-      d_gs_ms_ns_strides = {
-          param.attn_bias_strides[0],
-          param.attn_bias_strides[1],
-          param.attn_bias_strides[2],
-          param.attn_bias_strides[3]};
-    } else {
-      d_gs_ms_ns_lengths = {1, 1, 1, 1};
-      d_gs_ms_ns_strides = {0, 0, 0, 0};
+        }
+        else
+        {
+            constexpr ck::index_t kGemm1NPerBlock                = 128;
+            constexpr ck::index_t kGemm1NXdlPerWave              = 4;
+            constexpr ck::index_t kCShuffleNXdlPerWavePerShuffle = 4;
+            using kCShuffleBlockTransferClusterLengths           = S<1, 32, 1, 8>;
+
+            constexpr ck::index_t thread_slice_length_ak1 =
+                GemmOpConstantsBatchedBackward_V2::AK1 /
+                GemmOpConstantsBatchedBackward_V2::ABlockTransferThreadClusterLengths_AK0_M_AK1::At(
+                    I2);
+            constexpr ck::index_t thread_slice_length_bk1 =
+                GemmOpConstantsBatchedBackward_V2::BK1 /
+                GemmOpConstantsBatchedBackward_V2::BBlockTransferThreadClusterLengths_BK0_N_BK1::At(
+                    I2);
+
+            static_assert(thread_slice_length_ak1 == thread_slice_length_bk1,
+                          "ABlockTransfer and BBlockTransfer should use completely same K1 sizes "
+                          "and ThreadClusterLengths!");
+
+            constexpr ck::index_t kABBlockTransferSrcScalarPerVector_max =
+                min(2, thread_slice_length_ak1);
+
+            constexpr ck::index_t thread_slice_length_gemm1n =
+                kGemm1NPerBlock / GemmOpConstantsBatchedBackward_V2::
+                                      B1BlockTransferThreadClusterLengths_BK0_N_BK1::At(I1);
+            constexpr ck::index_t kB1BlockTransferSrcScalarPerVector_max =
+                min(2, thread_slice_length_gemm1n);
+
+            constexpr ck::index_t thread_slice_length_cshuflle_n =
+                (kCShuffleNXdlPerWavePerShuffle * kGemm1NPerBlock / kGemm1NXdlPerWave) /
+                kCShuffleBlockTransferClusterLengths::At(I3);
+
+            constexpr ck::index_t kCShuffleBlockTransferScalarPerVector_max =
+                min(2, thread_slice_length_cshuflle_n);
+
+            if constexpr(kB1BlockTransferSrcScalarPerVector_max >=
+                         kCShuffleBlockTransferScalarPerVector_max)
+            {
+                ALIGN_SWITCH_2(
+                    kABBlockTransferSrcScalarPerVector_max,
+                    kABBlockTransferSrcScalarPerVector,
+                    param.K,
+                    kB1BlockTransferSrcScalarPerVector_max,
+                    kB1BlockTransferSrcScalarPerVector,
+                    param.Kv,
+                    [&] {
+                        constexpr ck::index_t kCShuffleBlockTransferScalarPerVector =
+                            min(kB1BlockTransferSrcScalarPerVector,
+                                kCShuffleBlockTransferScalarPerVector_max);
+
+                        static_assert(kB1BlockTransferSrcScalarPerVector > 0,
+                                      "kB1BlockTransferSrcScalarPerVector must be positive");
+
+                        using DeviceOpInstance =
+                            DeviceOpInstanceTemp_V2<kGemm1NPerBlock,
+                                                    kGemm1NXdlPerWave,
+                                                    kCShuffleNXdlPerWavePerShuffle,
+                                                    kCShuffleBlockTransferClusterLengths,
+                                                    kABBlockTransferSrcScalarPerVector,
+                                                    kB1BlockTransferSrcScalarPerVector,
+                                                    kCShuffleBlockTransferScalarPerVector>;
+
+                        RunWithDeviceOp<DeviceOpInstance>(param, stream);
+                    });
+            }
+            else
+            {
+                ALIGN_SWITCH_2(
+                    kABBlockTransferSrcScalarPerVector_max,
+                    kABBlockTransferSrcScalarPerVector,
+                    param.K,
+                    kCShuffleBlockTransferScalarPerVector_max,
+                    kCShuffleBlockTransferScalarPerVector,
+                    param.Kv,
+                    [&] {
+                        constexpr ck::index_t kB1BlockTransferSrcScalarPerVector =
+                            min(kCShuffleBlockTransferScalarPerVector,
+                                kB1BlockTransferSrcScalarPerVector_max);
+
+                        static_assert(kB1BlockTransferSrcScalarPerVector > 0,
+                                      "kB1BlockTransferSrcScalarPerVector must be positive");
+
+                        using DeviceOpInstance =
+                            DeviceOpInstanceTemp_V2<kGemm1NPerBlock,
+                                                    kGemm1NXdlPerWave,
+                                                    kCShuffleNXdlPerWavePerShuffle,
+                                                    kCShuffleBlockTransferClusterLengths,
+                                                    kABBlockTransferSrcScalarPerVector,
+                                                    kB1BlockTransferSrcScalarPerVector,
+                                                    kCShuffleBlockTransferScalarPerVector>;
+
+                        RunWithDeviceOp<DeviceOpInstance>(param, stream);
+                    });
+            };
+        };
     };
 
-    float alpha = param.scale;
-
-    auto op = DeviceOpInstance{};
-    auto invoker = op.MakeInvoker();
-
-    auto arg_ptr = op.MakeArgumentPointer(
-        param.q_ptr,
-        param.k_ptr,
-        nullptr, // p_z_grid
-        param.v_ptr,
-        param.out_ptr,
-        param.logsumexp_ptr,
-        param.grad_out_ptr,
-        param.grad_q_ptr,
-        param.grad_k_ptr,
-        param.grad_v_ptr,
-        param.has_attn_bias ? param.attn_bias_ptr : nullptr,
-        nullptr, // p_acc1_bias
-        param.bias_has_grad ? param.grad_bias_ptr : nullptr,
-        nullptr,
-        q_gs_ms_ks_lengths, // q, dQ should have same shape
-        q_gs_ms_ks_strides,
-        k_gs_ns_ks_lengths, // k, dK should have same shape
-        k_gs_ns_ks_strides,
-        {1, 1, 1, 1}, // z_gs_ms_ns_lengths
-        {0, 0, 0, 0}, // z_gs_ms_ns_strides
-        v_gs_os_ns_lengths, // v, dV should have same shape
-        v_gs_os_ns_strides,
-        y_gs_ms_os_lengths, // y, dY should have same shape
-        y_gs_ms_os_strides,
-        lse_gs_ms_lengths,
-        param.is_mqa_gqa ? kgrad_gs_ns_ks_lengths : k_gs_ns_ks_lengths,
-        param.is_mqa_gqa ? kgrad_gs_ns_ks_strides : k_gs_ns_ks_strides,
-        param.is_mqa_gqa ? vgrad_gs_os_ns_lengths : v_gs_os_ns_lengths,
-        param.is_mqa_gqa ? vgrad_gs_os_ns_strides : v_gs_os_ns_strides,
-        d_gs_ms_ns_lengths, // bias, grad_bias should have same shape
-        d_gs_ms_ns_strides,
-        {}, // acc1_biases_gs_ms_os_lengths
-        {}, // acc1_biases_gs_ms_os_strides
-        QKVElementOp{},
-        QKVElementOp{},
-        Scale{alpha},
-        QKVElementOp{},
-        YElementOp{},
-        param.dropout_prob,
-        std::tuple<int64_t, int64_t>(param.philox_seed, param.philox_offset));
-
-    if (!op.IsSupportedArgument(arg_ptr.get())) {
-      std::ostringstream ostr;
-      ostr << op.GetTypeString() << " does not support this problem";
-
-      throw std::runtime_error(ostr.str());
-    }
-
-    (void)invoker.Run(arg_ptr.get(), StreamConfig{stream, false});
-  };
+    template <typename DeviceOpInstance>
+    static void RunWithDeviceOp(BatchedBackwardParams& param, hipStream_t stream)
+    {
+        std::vector<ck::index_t> q_gs_ms_ks_lengths{param.B, param.Hq, param.M, param.K};
+        std::vector<ck::index_t> q_gs_ms_ks_strides{
+            param.q_strides[0], param.q_strides[2], param.q_strides[1], param.q_strides[3]};
+
+        std::vector<ck::index_t> k_gs_ns_ks_lengths{param.B, param.Hkv, param.N, param.K};
+        std::vector<ck::index_t> k_gs_ns_ks_strides{
+            param.k_strides[0], param.k_strides[2], param.k_strides[1], param.k_strides[3]};
+
+        std::vector<ck::index_t> kgrad_gs_ns_ks_lengths = {param.B, param.Hq, param.N, param.K};
+        std::vector<ck::index_t> kgrad_gs_ns_ks_strides = {param.tmp_grad_k_strides[0],
+                                                           param.tmp_grad_k_strides[2],
+                                                           param.tmp_grad_k_strides[1],
+                                                           param.tmp_grad_k_strides[3]};
+
+        std::vector<ck::index_t> v_gs_os_ns_lengths{param.B, param.Hkv, param.Kv, param.N};
+        std::vector<ck::index_t> v_gs_os_ns_strides{
+            param.v_strides[0], param.v_strides[2], param.v_strides[3], param.v_strides[1]};
+
+        std::vector<ck::index_t> vgrad_gs_os_ns_lengths = {param.B, param.Hq, param.Kv, param.N};
+        std::vector<ck::index_t> vgrad_gs_os_ns_strides = {param.tmp_grad_v_strides[0],
+                                                           param.tmp_grad_v_strides[2],
+                                                           param.tmp_grad_v_strides[3],
+                                                           param.tmp_grad_v_strides[1]};
+
+        std::vector<ck::index_t> y_gs_ms_os_lengths{param.B, param.Hq, param.M, param.Kv};
+        std::vector<ck::index_t> y_gs_ms_os_strides{
+            param.out_strides[0], param.out_strides[2], param.out_strides[1], param.out_strides[3]};
+
+        std::vector<ck::index_t> lse_gs_ms_lengths{param.B, param.Hq, param.M};
+
+        std::vector<ck::index_t> d_gs_ms_ns_lengths;
+        std::vector<ck::index_t> d_gs_ms_ns_strides;
+
+        if constexpr(has_attn_bias)
+        {
+            d_gs_ms_ns_lengths = {param.B, param.Hq, param.M, param.N};
+            d_gs_ms_ns_strides = {param.attn_bias_strides[0],
+                                  param.attn_bias_strides[1],
+                                  param.attn_bias_strides[2],
+                                  param.attn_bias_strides[3]};
+        }
+        else
+        {
+            d_gs_ms_ns_lengths = {1, 1, 1, 1};
+            d_gs_ms_ns_strides = {0, 0, 0, 0};
+        };
+
+        float alpha = param.scale;
+
+        auto op      = DeviceOpInstance{};
+        auto invoker = op.MakeInvoker();
+
+        auto arg_ptr = op.MakeArgumentPointer(
+            param.q_ptr,
+            param.k_ptr,
+            nullptr, // p_z_grid
+            param.v_ptr,
+            param.out_ptr,
+            param.logsumexp_ptr,
+            param.grad_out_ptr,
+            param.grad_q_ptr,
+            param.grad_k_ptr,
+            param.grad_v_ptr,
+            param.has_attn_bias ? param.attn_bias_ptr : nullptr,
+            nullptr, // p_acc1_bias
+            param.bias_has_grad ? param.grad_bias_ptr : nullptr,
+            nullptr,
+            q_gs_ms_ks_lengths, // q, dQ should have same shape
+            q_gs_ms_ks_strides,
+            k_gs_ns_ks_lengths, // k, dK should have same shape
+            k_gs_ns_ks_strides,
+            {1, 1, 1, 1},       // z_gs_ms_ns_lengths
+            {0, 0, 0, 0},       // z_gs_ms_ns_strides
+            v_gs_os_ns_lengths, // v, dV should have same shape
+            v_gs_os_ns_strides,
+            y_gs_ms_os_lengths, // y, dY should have same shape
+            y_gs_ms_os_strides,
+            lse_gs_ms_lengths,
+            param.is_mqa_gqa ? kgrad_gs_ns_ks_lengths : k_gs_ns_ks_lengths,
+            param.is_mqa_gqa ? kgrad_gs_ns_ks_strides : k_gs_ns_ks_strides,
+            param.is_mqa_gqa ? vgrad_gs_os_ns_lengths : v_gs_os_ns_lengths,
+            param.is_mqa_gqa ? vgrad_gs_os_ns_strides : v_gs_os_ns_strides,
+            d_gs_ms_ns_lengths, // bias, grad_bias should have same shape
+            d_gs_ms_ns_strides,
+            {}, // acc1_biases_gs_ms_os_lengths
+            {}, // acc1_biases_gs_ms_os_strides
+            QKVElementOp{},
+            QKVElementOp{},
+            Scale{alpha},
+            QKVElementOp{},
+            YElementOp{},
+            param.dropout_prob,
+            std::tuple<int64_t, int64_t>(param.philox_seed, param.philox_offset));
+
+        if(!op.IsSupportedArgument(arg_ptr.get()))
+        {
+            std::ostringstream ostr;
+            ostr << op.GetTypeString() << " does not support this problem";
+
+            throw std::runtime_error(ostr.str());
+        }
+
+        (void)invoker.Run(arg_ptr.get(), StreamConfig{stream, false});
+    };
 };
 
-template <
-    typename scalar_t,
-    int32_t custom_mask_type,
-    bool has_attn_bias,
-    bool use_fp32_qkv_grad>
-void run_batched_backward_masktype_attnbias_dispatched(
-    BatchedBackwardParams& param,
-    hipStream_t stream) {
-  batched_backward_masktype_attnbias_dispatched<
-      scalar_t,
-      custom_mask_type,
-      has_attn_bias,
-      use_fp32_qkv_grad>::Run(param, stream);
+template <typename scalar_t, int32_t custom_mask_type, bool has_attn_bias, bool use_fp32_qkv_grad>
+void run_batched_backward_masktype_attnbias_dispatched(BatchedBackwardParams& param,
+                                                       hipStream_t stream)
+{
+    batched_backward_masktype_attnbias_dispatched<scalar_t,
+                                                  custom_mask_type,
+                                                  has_attn_bias,
+                                                  use_fp32_qkv_grad>::Run(param, stream);
 };
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_backward_bp16.cpp b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_backward_bp16.cpp
index 319b039b9..774c3000c 100644
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_backward_bp16.cpp
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_backward_bp16.cpp
@@ -1,107 +1,74 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
 #include <ck/ck.hpp>
 #include <stdexcept>
 
 #include "ck_bool_switch.h"
 #include "ck_fmha_batched_backward.h"
 
-extern template void run_batched_backward_masktype_attnbias_dispatched<
-    ck::bhalf_t,
-    0,
-    true,
-    true>(BatchedBackwardParams& param, hipStream_t stream);
+extern template void run_batched_backward_masktype_attnbias_dispatched<ck::bhalf_t, 0, true, true>(
+    BatchedBackwardParams& param, hipStream_t stream);
 
-extern template void run_batched_backward_masktype_attnbias_dispatched<
-    ck::bhalf_t,
-    0,
-    true,
-    false>(BatchedBackwardParams& param, hipStream_t stream);
+extern template void run_batched_backward_masktype_attnbias_dispatched<ck::bhalf_t, 0, true, false>(
+    BatchedBackwardParams& param, hipStream_t stream);
 
-extern template void run_batched_backward_masktype_attnbias_dispatched<
-    ck::bhalf_t,
-    0,
-    false,
-    true>(BatchedBackwardParams& param, hipStream_t stream);
+extern template void run_batched_backward_masktype_attnbias_dispatched<ck::bhalf_t, 0, false, true>(
+    BatchedBackwardParams& param, hipStream_t stream);
 
-extern template void run_batched_backward_masktype_attnbias_dispatched<
-    ck::bhalf_t,
-    0,
-    false,
-    false>(BatchedBackwardParams& param, hipStream_t stream);
+extern template void
+run_batched_backward_masktype_attnbias_dispatched<ck::bhalf_t, 0, false, false>(
+    BatchedBackwardParams& param, hipStream_t stream);
 
-extern template void run_batched_backward_masktype_attnbias_dispatched<
-    ck::bhalf_t,
-    1,
-    true,
-    true>(BatchedBackwardParams& param, hipStream_t stream);
+extern template void run_batched_backward_masktype_attnbias_dispatched<ck::bhalf_t, 1, true, true>(
+    BatchedBackwardParams& param, hipStream_t stream);
 
-extern template void run_batched_backward_masktype_attnbias_dispatched<
-    ck::bhalf_t,
-    1,
-    true,
-    false>(BatchedBackwardParams& param, hipStream_t stream);
+extern template void run_batched_backward_masktype_attnbias_dispatched<ck::bhalf_t, 1, true, false>(
+    BatchedBackwardParams& param, hipStream_t stream);
 
-extern template void run_batched_backward_masktype_attnbias_dispatched<
-    ck::bhalf_t,
-    1,
-    false,
-    true>(BatchedBackwardParams& param, hipStream_t stream);
+extern template void run_batched_backward_masktype_attnbias_dispatched<ck::bhalf_t, 1, false, true>(
+    BatchedBackwardParams& param, hipStream_t stream);
 
-extern template void run_batched_backward_masktype_attnbias_dispatched<
-    ck::bhalf_t,
-    1,
-    false,
-    false>(BatchedBackwardParams& param, hipStream_t stream);
+extern template void
+run_batched_backward_masktype_attnbias_dispatched<ck::bhalf_t, 1, false, false>(
+    BatchedBackwardParams& param, hipStream_t stream);
 
-extern template void run_batched_backward_masktype_attnbias_dispatched<
-    ck::bhalf_t,
-    2,
-    true,
-    true>(BatchedBackwardParams& param, hipStream_t stream);
+extern template void run_batched_backward_masktype_attnbias_dispatched<ck::bhalf_t, 2, true, true>(
+    BatchedBackwardParams& param, hipStream_t stream);
 
-extern template void run_batched_backward_masktype_attnbias_dispatched<
-    ck::bhalf_t,
-    2,
-    true,
-    false>(BatchedBackwardParams& param, hipStream_t stream);
+extern template void run_batched_backward_masktype_attnbias_dispatched<ck::bhalf_t, 2, true, false>(
+    BatchedBackwardParams& param, hipStream_t stream);
 
-extern template void run_batched_backward_masktype_attnbias_dispatched<
-    ck::bhalf_t,
-    2,
-    false,
-    true>(BatchedBackwardParams& param, hipStream_t stream);
+extern template void run_batched_backward_masktype_attnbias_dispatched<ck::bhalf_t, 2, false, true>(
+    BatchedBackwardParams& param, hipStream_t stream);
 
-extern template void run_batched_backward_masktype_attnbias_dispatched<
-    ck::bhalf_t,
-    2,
-    false,
-    false>(BatchedBackwardParams& param, hipStream_t stream);
+extern template void
+run_batched_backward_masktype_attnbias_dispatched<ck::bhalf_t, 2, false, false>(
+    BatchedBackwardParams& param, hipStream_t stream);
 
-void batched_backward_bp16(BatchedBackwardParams& param, hipStream_t stream) {
-  BOOL_SWITCH_2(
-      param.has_attn_bias,
-      HAS_ATTN_BIAS,
-      param.use_fp32_qkv_grad,
-      USE_FP32_QKV_GRAD,
-      [&] {
-        if (param.custom_mask_type == 0)
-          run_batched_backward_masktype_attnbias_dispatched<
-              ck::bhalf_t,
-              0,
-              HAS_ATTN_BIAS,
-              USE_FP32_QKV_GRAD>(param, stream);
-        else if (param.custom_mask_type == 1)
-          run_batched_backward_masktype_attnbias_dispatched<
-              ck::bhalf_t,
-              1,
-              HAS_ATTN_BIAS,
-              USE_FP32_QKV_GRAD>(param, stream);
-        else if (param.custom_mask_type == 2)
-          run_batched_backward_masktype_attnbias_dispatched<
-              ck::bhalf_t,
-              2,
-              HAS_ATTN_BIAS,
-              USE_FP32_QKV_GRAD>(param, stream);
-        else
-          throw std::runtime_error("Invalid custom_mask_type value");
-      });
+void batched_backward_bp16(BatchedBackwardParams& param, hipStream_t stream)
+{
+    BOOL_SWITCH_2(
+        param.has_attn_bias, HAS_ATTN_BIAS, param.use_fp32_qkv_grad, USE_FP32_QKV_GRAD, [&] {
+            if(param.custom_mask_type == 0)
+                run_batched_backward_masktype_attnbias_dispatched<ck::bhalf_t,
+                                                                  0,
+                                                                  HAS_ATTN_BIAS,
+                                                                  USE_FP32_QKV_GRAD>(param, stream);
+            else if(param.custom_mask_type == 1)
+                run_batched_backward_masktype_attnbias_dispatched<ck::bhalf_t,
+                                                                  1,
+                                                                  HAS_ATTN_BIAS,
+                                                                  USE_FP32_QKV_GRAD>(param, stream);
+            else if(param.custom_mask_type == 2)
+                run_batched_backward_masktype_attnbias_dispatched<ck::bhalf_t,
+                                                                  2,
+                                                                  HAS_ATTN_BIAS,
+                                                                  USE_FP32_QKV_GRAD>(param, stream);
+            else
+                throw std::runtime_error("Invalid custom_mask_type value");
+        });
 };
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_backward_fp16.cpp b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_backward_fp16.cpp
index 2bcf0653d..3ffb86250 100644
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_backward_fp16.cpp
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_backward_fp16.cpp
@@ -1,107 +1,71 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
 #include <ck/ck.hpp>
 #include <stdexcept>
 
 #include "ck_bool_switch.h"
 #include "ck_fmha_batched_backward.h"
 
-extern template void run_batched_backward_masktype_attnbias_dispatched<
-    ck::half_t,
-    0,
-    true,
-    true>(BatchedBackwardParams& param, hipStream_t stream);
+extern template void run_batched_backward_masktype_attnbias_dispatched<ck::half_t, 0, true, true>(
+    BatchedBackwardParams& param, hipStream_t stream);
 
-extern template void run_batched_backward_masktype_attnbias_dispatched<
-    ck::half_t,
-    0,
-    true,
-    false>(BatchedBackwardParams& param, hipStream_t stream);
+extern template void run_batched_backward_masktype_attnbias_dispatched<ck::half_t, 0, true, false>(
+    BatchedBackwardParams& param, hipStream_t stream);
 
-extern template void run_batched_backward_masktype_attnbias_dispatched<
-    ck::half_t,
-    0,
-    false,
-    true>(BatchedBackwardParams& param, hipStream_t stream);
+extern template void run_batched_backward_masktype_attnbias_dispatched<ck::half_t, 0, false, true>(
+    BatchedBackwardParams& param, hipStream_t stream);
 
-extern template void run_batched_backward_masktype_attnbias_dispatched<
-    ck::half_t,
-    0,
-    false,
-    false>(BatchedBackwardParams& param, hipStream_t stream);
+extern template void run_batched_backward_masktype_attnbias_dispatched<ck::half_t, 0, false, false>(
+    BatchedBackwardParams& param, hipStream_t stream);
 
-extern template void run_batched_backward_masktype_attnbias_dispatched<
-    ck::half_t,
-    1,
-    true,
-    true>(BatchedBackwardParams& param, hipStream_t stream);
+extern template void run_batched_backward_masktype_attnbias_dispatched<ck::half_t, 1, true, true>(
+    BatchedBackwardParams& param, hipStream_t stream);
 
-extern template void run_batched_backward_masktype_attnbias_dispatched<
-    ck::half_t,
-    1,
-    true,
-    false>(BatchedBackwardParams& param, hipStream_t stream);
+extern template void run_batched_backward_masktype_attnbias_dispatched<ck::half_t, 1, true, false>(
+    BatchedBackwardParams& param, hipStream_t stream);
 
-extern template void run_batched_backward_masktype_attnbias_dispatched<
-    ck::half_t,
-    1,
-    false,
-    true>(BatchedBackwardParams& param, hipStream_t stream);
+extern template void run_batched_backward_masktype_attnbias_dispatched<ck::half_t, 1, false, true>(
+    BatchedBackwardParams& param, hipStream_t stream);
 
-extern template void run_batched_backward_masktype_attnbias_dispatched<
-    ck::half_t,
-    1,
-    false,
-    false>(BatchedBackwardParams& param, hipStream_t stream);
+extern template void run_batched_backward_masktype_attnbias_dispatched<ck::half_t, 1, false, false>(
+    BatchedBackwardParams& param, hipStream_t stream);
 
-extern template void run_batched_backward_masktype_attnbias_dispatched<
-    ck::half_t,
-    2,
-    true,
-    true>(BatchedBackwardParams& param, hipStream_t stream);
+extern template void run_batched_backward_masktype_attnbias_dispatched<ck::half_t, 2, true, true>(
+    BatchedBackwardParams& param, hipStream_t stream);
 
-extern template void run_batched_backward_masktype_attnbias_dispatched<
-    ck::half_t,
-    2,
-    true,
-    false>(BatchedBackwardParams& param, hipStream_t stream);
+extern template void run_batched_backward_masktype_attnbias_dispatched<ck::half_t, 2, true, false>(
+    BatchedBackwardParams& param, hipStream_t stream);
 
-extern template void run_batched_backward_masktype_attnbias_dispatched<
-    ck::half_t,
-    2,
-    false,
-    true>(BatchedBackwardParams& param, hipStream_t stream);
+extern template void run_batched_backward_masktype_attnbias_dispatched<ck::half_t, 2, false, true>(
+    BatchedBackwardParams& param, hipStream_t stream);
 
-extern template void run_batched_backward_masktype_attnbias_dispatched<
-    ck::half_t,
-    2,
-    false,
-    false>(BatchedBackwardParams& param, hipStream_t stream);
+extern template void run_batched_backward_masktype_attnbias_dispatched<ck::half_t, 2, false, false>(
+    BatchedBackwardParams& param, hipStream_t stream);
 
-void batched_backward_fp16(BatchedBackwardParams& param, hipStream_t stream) {
-  BOOL_SWITCH_2(
-      param.has_attn_bias,
-      HAS_ATTN_BIAS,
-      param.use_fp32_qkv_grad,
-      USE_FP32_QKV_GRAD,
-      [&] {
-        if (param.custom_mask_type == 0)
-          run_batched_backward_masktype_attnbias_dispatched<
-              ck::half_t,
-              0,
-              HAS_ATTN_BIAS,
-              USE_FP32_QKV_GRAD>(param, stream);
-        else if (param.custom_mask_type == 1)
-          run_batched_backward_masktype_attnbias_dispatched<
-              ck::half_t,
-              1,
-              HAS_ATTN_BIAS,
-              USE_FP32_QKV_GRAD>(param, stream);
-        else if (param.custom_mask_type == 2)
-          run_batched_backward_masktype_attnbias_dispatched<
-              ck::half_t,
-              2,
-              HAS_ATTN_BIAS,
-              USE_FP32_QKV_GRAD>(param, stream);
-        else
-          throw std::runtime_error("Invalid custom_mask_type value");
-      });
+void batched_backward_fp16(BatchedBackwardParams& param, hipStream_t stream)
+{
+    BOOL_SWITCH_2(
+        param.has_attn_bias, HAS_ATTN_BIAS, param.use_fp32_qkv_grad, USE_FP32_QKV_GRAD, [&] {
+            if(param.custom_mask_type == 0)
+                run_batched_backward_masktype_attnbias_dispatched<ck::half_t,
+                                                                  0,
+                                                                  HAS_ATTN_BIAS,
+                                                                  USE_FP32_QKV_GRAD>(param, stream);
+            else if(param.custom_mask_type == 1)
+                run_batched_backward_masktype_attnbias_dispatched<ck::half_t,
+                                                                  1,
+                                                                  HAS_ATTN_BIAS,
+                                                                  USE_FP32_QKV_GRAD>(param, stream);
+            else if(param.custom_mask_type == 2)
+                run_batched_backward_masktype_attnbias_dispatched<ck::half_t,
+                                                                  2,
+                                                                  HAS_ATTN_BIAS,
+                                                                  USE_FP32_QKV_GRAD>(param, stream);
+            else
+                throw std::runtime_error("Invalid custom_mask_type value");
+        });
 };
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_forward.h b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_forward.h
index b6a98b5fc..56dbb6523 100644
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_forward.h
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_forward.h
@@ -1,3 +1,9 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
 #pragma once
 
 #include <sstream>
@@ -18,65 +24,68 @@
 #include "ck_fmha_params.h"
 
 template <typename scalar_t, int32_t custom_mask_type, bool has_attn_bias>
-struct batched_forward_masktype_attnbias_dispatched {
-  using PassThrough = ck::tensor_operation::element_wise::PassThrough;
-
-  using GemmDataType = scalar_t;
-  using ADataType = scalar_t;
-  using B0DataType = scalar_t;
-  using B1DataType = scalar_t;
-  using AccDataType = F32;
-  using CShuffleDataType = F32;
-  using CDataType = scalar_t;
-  using ZDataType = unsigned short;
-  using LSEDataType = F32;
-  using Acc0BiasDataType =
-      typename std::conditional<has_attn_bias, scalar_t, void>::type;
-  using Acc1BiasDataType = void;
-
-  static constexpr ck::index_t NumDimG = 2;
-  static constexpr ck::index_t NumDimM = 1;
-  static constexpr ck::index_t NumDimN = 1;
-  static constexpr ck::index_t NumDimK = 1;
-  static constexpr ck::index_t NumDimO = 1;
-
-  using AElementOp = PassThrough;
-  using B0ElementOp = PassThrough;
-  using Acc0ElementOp = ck::tensor_operation::element_wise::Scale;
-  using B1ElementOp = PassThrough;
-  using CElementOp = PassThrough;
-
-  static constexpr auto GemmSpec =
-      ck::tensor_operation::device::GemmSpecialization::MNKOPadding;
-  static constexpr auto MaskingSpec =
-      static_cast<ck::tensor_operation::device::MaskingSpecialization>(
-          custom_mask_type);
-
-  static constexpr ck::index_t kAcc0BiasTransferSrcScalarPerVector = 1;
+struct batched_forward_masktype_attnbias_dispatched
+{
+    using PassThrough = ck::tensor_operation::element_wise::PassThrough;
+
+    using GemmDataType     = scalar_t;
+    using ADataType        = scalar_t;
+    using B0DataType       = scalar_t;
+    using B1DataType       = scalar_t;
+    using AccDataType      = F32;
+    using CShuffleDataType = F32;
+    using CDataType        = scalar_t;
+    using ZDataType        = unsigned short;
+    using LSEDataType      = F32;
+    using Acc0BiasDataType = typename std::conditional<has_attn_bias, scalar_t, void>::type;
+    using Acc1BiasDataType = void;
+
+    static constexpr ck::index_t NumDimG = 2;
+    static constexpr ck::index_t NumDimM = 1;
+    static constexpr ck::index_t NumDimN = 1;
+    static constexpr ck::index_t NumDimK = 1;
+    static constexpr ck::index_t NumDimO = 1;
+
+    using AElementOp    = PassThrough;
+    using B0ElementOp   = PassThrough;
+    using Acc0ElementOp = ck::tensor_operation::element_wise::Scale;
+    using B1ElementOp   = PassThrough;
+    using CElementOp    = PassThrough;
+
+    static constexpr auto GemmSpec = ck::tensor_operation::device::GemmSpecialization::MNKOPadding;
+    static constexpr auto MaskingSpec =
+        static_cast<ck::tensor_operation::device::MaskingSpecialization>(custom_mask_type);
+
+    static constexpr ck::index_t kAcc0BiasTransferSrcScalarPerVector = 1;
 
 #ifndef BATCHED_FORWARD_HEADDIM_SWITCH
-#define BATCHED_FORWARD_HEADDIM_SWITCH(HEAD_DIM1, HEAD_DIM2, ...) \
-  [&] {                                                           \
-    if (HEAD_DIM1 <= 32 && HEAD_DIM2 <= 32) {                     \
-      constexpr ck::index_t kGemm1NPerBlock = 32;                 \
-      constexpr ck::index_t kGemm1NXdlPerWave = 1;                \
-      constexpr ck::index_t kCShuffleNXdlPerWavePerShuffle = 1;   \
-      __VA_ARGS__();                                              \
-    } else if (HEAD_DIM1 <= 64 && HEAD_DIM2 <= 64) {              \
-      constexpr ck::index_t kGemm1NPerBlock = 64;                 \
-      constexpr ck::index_t kGemm1NXdlPerWave = 2;                \
-      constexpr ck::index_t kCShuffleNXdlPerWavePerShuffle = 2;   \
-      __VA_ARGS__();                                              \
-    } else {                                                      \
-      constexpr ck::index_t kGemm1NPerBlock = 128;                \
-      constexpr ck::index_t kGemm1NXdlPerWave = 4;                \
-      constexpr ck::index_t kCShuffleNXdlPerWavePerShuffle = 4;   \
-      __VA_ARGS__();                                              \
-    }                                                             \
-  }()
+#define BATCHED_FORWARD_HEADDIM_SWITCH(HEAD_DIM1, HEAD_DIM2, ...)       \
+    [&] {                                                               \
+        if(HEAD_DIM1 <= 32 && HEAD_DIM2 <= 32)                          \
+        {                                                               \
+            constexpr ck::index_t kGemm1NPerBlock                = 32;  \
+            constexpr ck::index_t kGemm1NXdlPerWave              = 1;   \
+            constexpr ck::index_t kCShuffleNXdlPerWavePerShuffle = 1;   \
+            __VA_ARGS__();                                              \
+        }                                                               \
+        else if(HEAD_DIM1 <= 64 && HEAD_DIM2 <= 64)                     \
+        {                                                               \
+            constexpr ck::index_t kGemm1NPerBlock                = 64;  \
+            constexpr ck::index_t kGemm1NXdlPerWave              = 2;   \
+            constexpr ck::index_t kCShuffleNXdlPerWavePerShuffle = 2;   \
+            __VA_ARGS__();                                              \
+        }                                                               \
+        else                                                            \
+        {                                                               \
+            constexpr ck::index_t kGemm1NPerBlock                = 128; \
+            constexpr ck::index_t kGemm1NXdlPerWave              = 4;   \
+            constexpr ck::index_t kCShuffleNXdlPerWavePerShuffle = 4;   \
+            __VA_ARGS__();                                              \
+        }                                                               \
+    }()
 #endif
 
-  // clang-format off
+    // clang-format off
   template <
       ck::index_t kGemm1NPerBlock,
       ck::index_t kGemm1NXdlPerWave,
@@ -155,218 +164,201 @@ struct batched_forward_masktype_attnbias_dispatched {
           kCShuffleBlockTransferScalarPerVector,
           GemmOpConstantsBatchedForward::Acc1BiasTransferSrcScalarPerVector,
           MaskingSpec>;
-  // clang-format on
-
-  static constexpr auto I1 = ck::Number<1>{};
-  static constexpr auto I2 = ck::Number<2>{};
-  static constexpr auto I3 = ck::Number<3>{};
-
-  static void Run(BatchedForwardParams& param, hipStream_t stream) {
-    using ck::math::min;
-
-    // compile-time constants which don't depend on head-dim switching
-    constexpr ck::index_t thread_slice_length_ak1 =
-        GemmOpConstantsBatchedForward::AK1 /
-        GemmOpConstantsBatchedForward::
-            ABlockTransferThreadClusterLengths_AK0_M_AK1::At(I2);
-    constexpr ck::index_t thread_slice_length_bk1 =
-        GemmOpConstantsBatchedForward::BK1 /
-        GemmOpConstantsBatchedForward::
-            BBlockTransferThreadClusterLengths_BK0_N_BK1::At(I2);
-
-    static_assert(
-        thread_slice_length_ak1 == thread_slice_length_bk1,
-        "ABlockTransfer and BBlockTransfer should use completely same K1 sizes and ThreadClusterLengths!");
-
-    constexpr ck::index_t kABBlockTransferSrcScalarPerVector_max =
-        min(8, thread_slice_length_ak1);
-
-    BATCHED_FORWARD_HEADDIM_SWITCH(param.K, param.Kv, [&] {
-      constexpr ck::index_t thread_slice_length_gemm1n = kGemm1NPerBlock /
-          GemmOpConstantsBatchedForward::
-              B1BlockTransferThreadClusterLengths_BK0_N_BK1::At(I1);
-      constexpr ck::index_t kB1BlockTransferSrcScalarPerVector_max =
-          min(2, thread_slice_length_gemm1n);
-
-      constexpr ck::index_t thread_slice_length_cshuflle_n =
-          (kCShuffleNXdlPerWavePerShuffle * kGemm1NPerBlock /
-           kGemm1NXdlPerWave) /
-          GemmOpConstantsBatchedForward::
-              CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock ::
-                  At(I3);
-
-      constexpr ck::index_t kCShuffleBlockTransferScalarPerVector_max =
-          min(4, thread_slice_length_cshuflle_n);
-
-      if constexpr (
-          kB1BlockTransferSrcScalarPerVector_max >=
-          kCShuffleBlockTransferScalarPerVector_max) {
-        ALIGN_SWITCH_2(
-            kABBlockTransferSrcScalarPerVector_max,
-            kABBlockTransferSrcScalarPerVector,
-            param.K,
-            kB1BlockTransferSrcScalarPerVector_max,
-            kB1BlockTransferSrcScalarPerVector,
-            param.Kv,
-            [&] {
-              constexpr ck::index_t kCShuffleBlockTransferScalarPerVector =
-                  min(kB1BlockTransferSrcScalarPerVector,
-                      kCShuffleBlockTransferScalarPerVector_max);
-              using DeviceOpInstance = DeviceOpInstanceTemp<
-                  kGemm1NPerBlock,
-                  kGemm1NXdlPerWave,
-                  kCShuffleNXdlPerWavePerShuffle,
-                  kABBlockTransferSrcScalarPerVector,
-                  kB1BlockTransferSrcScalarPerVector,
-                  kCShuffleBlockTransferScalarPerVector>;
-
-              RunWithDeviceOp<DeviceOpInstance>(param, stream);
-            });
-      } else {
-        ALIGN_SWITCH_2(
-            kABBlockTransferSrcScalarPerVector_max,
-            kABBlockTransferSrcScalarPerVector,
-            param.K,
-            kCShuffleBlockTransferScalarPerVector_max,
-            kCShuffleBlockTransferScalarPerVector,
-            param.Kv,
-            [&] {
-              constexpr ck::index_t kB1BlockTransferSrcScalarPerVector =
-                  min(kCShuffleBlockTransferScalarPerVector,
-                      kB1BlockTransferSrcScalarPerVector_max);
-              using DeviceOpInstance = DeviceOpInstanceTemp<
-                  kGemm1NPerBlock,
-                  kGemm1NXdlPerWave,
-                  kCShuffleNXdlPerWavePerShuffle,
-                  kABBlockTransferSrcScalarPerVector,
-                  kB1BlockTransferSrcScalarPerVector,
-                  kCShuffleBlockTransferScalarPerVector>;
-
-              RunWithDeviceOp<DeviceOpInstance>(param, stream);
-            });
-      };
-    });
-  };
-
-  template <typename DeviceOpInstance>
-  static void RunWithDeviceOp(BatchedForwardParams& param, hipStream_t stream) {
-    std::vector<ck::index_t> a_gs_ms_ks_lengths{
-        param.B, param.Hq, param.M, param.K};
-    std::vector<ck::index_t> a_gs_ms_ks_strides{
-        param.q_strides[0],
-        param.q_strides[2],
-        param.q_strides[1],
-        param.q_strides[3]};
-
-    std::vector<ck::index_t> b0_gs_ns_ks_lengths{
-        param.B, param.Hkv, param.N, param.K};
-    std::vector<ck::index_t> b0_gs_ns_ks_strides{
-        param.k_strides[0],
-        param.k_strides[2],
-        param.k_strides[1],
-        param.k_strides[3]};
-
-    // to be changed to b1_gs_ns_os_lengths
-    std::vector<ck::index_t> b1_gs_os_ns_lengths{
-        param.B, param.Hkv, param.Kv, param.N};
-    std::vector<ck::index_t> b1_gs_os_ns_strides{
-        param.v_strides[0],
-        param.v_strides[2],
-        param.v_strides[3],
-        param.v_strides[1]};
-
-    std::vector<ck::index_t> c_gs_ms_os_lengths{
-        param.B, param.Hq, param.M, param.Kv};
-    std::vector<ck::index_t> c_gs_ms_os_strides{
-        param.out_strides[0],
-        param.out_strides[2],
-        param.out_strides[1],
-        param.out_strides[3]};
-
-    std::vector<ck::index_t> lse_gs_ms_lengths{param.B, param.Hq, param.M};
-
-    std::vector<ck::index_t> d_gs_ms_ns_lengths;
-    std::vector<ck::index_t> d_gs_ms_ns_strides;
-
-    if constexpr (has_attn_bias) {
-      d_gs_ms_ns_lengths = {param.B, param.Hq, param.M, param.N};
-      d_gs_ms_ns_strides = {
-          param.attn_bias_strides[0],
-          param.attn_bias_strides[1],
-          param.attn_bias_strides[2],
-          param.attn_bias_strides[3]};
-    } else {
-      d_gs_ms_ns_lengths = {1, 1, 1, 1};
-      d_gs_ms_ns_strides = {0, 0, 0, 0};
+    // clang-format on
+
+    static constexpr auto I1 = ck::Number<1>{};
+    static constexpr auto I2 = ck::Number<2>{};
+    static constexpr auto I3 = ck::Number<3>{};
+
+    static void Run(BatchedForwardParams& param, hipStream_t stream)
+    {
+        using ck::math::min;
+
+        // compile-time constants which don't depend on head-dim switching
+        constexpr ck::index_t thread_slice_length_ak1 =
+            GemmOpConstantsBatchedForward::AK1 /
+            GemmOpConstantsBatchedForward::ABlockTransferThreadClusterLengths_AK0_M_AK1::At(I2);
+        constexpr ck::index_t thread_slice_length_bk1 =
+            GemmOpConstantsBatchedForward::BK1 /
+            GemmOpConstantsBatchedForward::BBlockTransferThreadClusterLengths_BK0_N_BK1::At(I2);
+
+        static_assert(thread_slice_length_ak1 == thread_slice_length_bk1,
+                      "ABlockTransfer and BBlockTransfer should use completely same K1 sizes and "
+                      "ThreadClusterLengths!");
+
+        constexpr ck::index_t kABBlockTransferSrcScalarPerVector_max =
+            min(8, thread_slice_length_ak1);
+
+        BATCHED_FORWARD_HEADDIM_SWITCH(param.K, param.Kv, [&] {
+            constexpr ck::index_t thread_slice_length_gemm1n =
+                kGemm1NPerBlock /
+                GemmOpConstantsBatchedForward::B1BlockTransferThreadClusterLengths_BK0_N_BK1::At(
+                    I1);
+            constexpr ck::index_t kB1BlockTransferSrcScalarPerVector_max =
+                min(2, thread_slice_length_gemm1n);
+
+            constexpr ck::index_t thread_slice_length_cshuflle_n =
+                (kCShuffleNXdlPerWavePerShuffle * kGemm1NPerBlock / kGemm1NXdlPerWave) /
+                GemmOpConstantsBatchedForward::
+                    CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock ::At(I3);
+
+            constexpr ck::index_t kCShuffleBlockTransferScalarPerVector_max =
+                min(4, thread_slice_length_cshuflle_n);
+
+            if constexpr(kB1BlockTransferSrcScalarPerVector_max >=
+                         kCShuffleBlockTransferScalarPerVector_max)
+            {
+                ALIGN_SWITCH_2(kABBlockTransferSrcScalarPerVector_max,
+                               kABBlockTransferSrcScalarPerVector,
+                               param.K,
+                               kB1BlockTransferSrcScalarPerVector_max,
+                               kB1BlockTransferSrcScalarPerVector,
+                               param.Kv,
+                               [&] {
+                                   constexpr ck::index_t kCShuffleBlockTransferScalarPerVector =
+                                       min(kB1BlockTransferSrcScalarPerVector,
+                                           kCShuffleBlockTransferScalarPerVector_max);
+                                   using DeviceOpInstance =
+                                       DeviceOpInstanceTemp<kGemm1NPerBlock,
+                                                            kGemm1NXdlPerWave,
+                                                            kCShuffleNXdlPerWavePerShuffle,
+                                                            kABBlockTransferSrcScalarPerVector,
+                                                            kB1BlockTransferSrcScalarPerVector,
+                                                            kCShuffleBlockTransferScalarPerVector>;
+
+                                   RunWithDeviceOp<DeviceOpInstance>(param, stream);
+                               });
+            }
+            else
+            {
+                ALIGN_SWITCH_2(kABBlockTransferSrcScalarPerVector_max,
+                               kABBlockTransferSrcScalarPerVector,
+                               param.K,
+                               kCShuffleBlockTransferScalarPerVector_max,
+                               kCShuffleBlockTransferScalarPerVector,
+                               param.Kv,
+                               [&] {
+                                   constexpr ck::index_t kB1BlockTransferSrcScalarPerVector =
+                                       min(kCShuffleBlockTransferScalarPerVector,
+                                           kB1BlockTransferSrcScalarPerVector_max);
+                                   using DeviceOpInstance =
+                                       DeviceOpInstanceTemp<kGemm1NPerBlock,
+                                                            kGemm1NXdlPerWave,
+                                                            kCShuffleNXdlPerWavePerShuffle,
+                                                            kABBlockTransferSrcScalarPerVector,
+                                                            kB1BlockTransferSrcScalarPerVector,
+                                                            kCShuffleBlockTransferScalarPerVector>;
+
+                                   RunWithDeviceOp<DeviceOpInstance>(param, stream);
+                               });
+            };
+        });
     };
 
-    float alpha = param.scale;
-
-    auto a_element_op = AElementOp{};
-    auto b0_element_op = B0ElementOp{};
-    auto acc0_element_op = Acc0ElementOp{alpha};
-    auto b1_element_op = B1ElementOp{};
-    auto c_element_op = CElementOp{};
-
-    auto op = DeviceOpInstance{};
-    auto invoker = op.MakeInvoker();
-
-    auto arg_ptr = op.MakeArgumentPointer(
-        param.q_ptr,
-        param.k_ptr,
-        param.v_ptr,
-        param.out_ptr,
-        nullptr,
-        param.logsumexp_ptr,
-        param.has_attn_bias ? param.attn_bias_ptr : nullptr,
-        {}, // p_acc1_biases;
-        a_gs_ms_ks_lengths,
-        a_gs_ms_ks_strides,
-        b0_gs_ns_ks_lengths,
-        b0_gs_ns_ks_strides,
-        b1_gs_os_ns_lengths,
-        b1_gs_os_ns_strides,
-        c_gs_ms_os_lengths,
-        c_gs_ms_os_strides,
-        {1, 1, 1, 1},
-        {0, 0, 0, 0},
-        lse_gs_ms_lengths,
-        d_gs_ms_ns_lengths,
-        d_gs_ms_ns_strides,
-        {}, // acc1_biases_gs_ms_os_lengths
-        {}, // acc1_biases_gs_ms_os_strides,
-        a_element_op,
-        b0_element_op,
-        acc0_element_op,
-        b1_element_op,
-        c_element_op,
-        param.use_dropout ? param.dropout_prob : 0.0f, // dropout ratio
-        std::tuple<int64_t, int64_t>(
-            param.philox_seed,
-            param.philox_offset)); // dropout random seed and offset
-
-    SimpleDeviceMem workspace(op.GetWorkSpaceSize(arg_ptr.get()));
-
-    op.SetWorkSpacePointer(arg_ptr.get(), workspace.GetDeviceBuffer());
-
-    if (!op.IsSupportedArgument(arg_ptr.get())) {
-      std::ostringstream ostr;
-
-      ostr << op.GetTypeString() << " does not support this problem";
-
-      throw std::runtime_error(ostr.str());
-    }
-
-    invoker.Run(arg_ptr.get(), StreamConfig{stream, false});
-  };
+    template <typename DeviceOpInstance>
+    static void RunWithDeviceOp(BatchedForwardParams& param, hipStream_t stream)
+    {
+        std::vector<ck::index_t> a_gs_ms_ks_lengths{param.B, param.Hq, param.M, param.K};
+        std::vector<ck::index_t> a_gs_ms_ks_strides{
+            param.q_strides[0], param.q_strides[2], param.q_strides[1], param.q_strides[3]};
+
+        std::vector<ck::index_t> b0_gs_ns_ks_lengths{param.B, param.Hkv, param.N, param.K};
+        std::vector<ck::index_t> b0_gs_ns_ks_strides{
+            param.k_strides[0], param.k_strides[2], param.k_strides[1], param.k_strides[3]};
+
+        // to be changed to b1_gs_ns_os_lengths
+        std::vector<ck::index_t> b1_gs_os_ns_lengths{param.B, param.Hkv, param.Kv, param.N};
+        std::vector<ck::index_t> b1_gs_os_ns_strides{
+            param.v_strides[0], param.v_strides[2], param.v_strides[3], param.v_strides[1]};
+
+        std::vector<ck::index_t> c_gs_ms_os_lengths{param.B, param.Hq, param.M, param.Kv};
+        std::vector<ck::index_t> c_gs_ms_os_strides{
+            param.out_strides[0], param.out_strides[2], param.out_strides[1], param.out_strides[3]};
+
+        std::vector<ck::index_t> lse_gs_ms_lengths{param.B, param.Hq, param.M};
+
+        std::vector<ck::index_t> d_gs_ms_ns_lengths;
+        std::vector<ck::index_t> d_gs_ms_ns_strides;
+
+        if constexpr(has_attn_bias)
+        {
+            d_gs_ms_ns_lengths = {param.B, param.Hq, param.M, param.N};
+            d_gs_ms_ns_strides = {param.attn_bias_strides[0],
+                                  param.attn_bias_strides[1],
+                                  param.attn_bias_strides[2],
+                                  param.attn_bias_strides[3]};
+        }
+        else
+        {
+            d_gs_ms_ns_lengths = {1, 1, 1, 1};
+            d_gs_ms_ns_strides = {0, 0, 0, 0};
+        };
+
+        float alpha = param.scale;
+
+        auto a_element_op    = AElementOp{};
+        auto b0_element_op   = B0ElementOp{};
+        auto acc0_element_op = Acc0ElementOp{alpha};
+        auto b1_element_op   = B1ElementOp{};
+        auto c_element_op    = CElementOp{};
+
+        auto op      = DeviceOpInstance{};
+        auto invoker = op.MakeInvoker();
+
+        auto arg_ptr = op.MakeArgumentPointer(
+            param.q_ptr,
+            param.k_ptr,
+            param.v_ptr,
+            param.out_ptr,
+            nullptr,
+            param.logsumexp_ptr,
+            param.has_attn_bias ? param.attn_bias_ptr : nullptr,
+            {}, // p_acc1_biases;
+            a_gs_ms_ks_lengths,
+            a_gs_ms_ks_strides,
+            b0_gs_ns_ks_lengths,
+            b0_gs_ns_ks_strides,
+            b1_gs_os_ns_lengths,
+            b1_gs_os_ns_strides,
+            c_gs_ms_os_lengths,
+            c_gs_ms_os_strides,
+            {1, 1, 1, 1},
+            {0, 0, 0, 0},
+            lse_gs_ms_lengths,
+            d_gs_ms_ns_lengths,
+            d_gs_ms_ns_strides,
+            {}, // acc1_biases_gs_ms_os_lengths
+            {}, // acc1_biases_gs_ms_os_strides,
+            a_element_op,
+            b0_element_op,
+            acc0_element_op,
+            b1_element_op,
+            c_element_op,
+            param.use_dropout ? param.dropout_prob : 0.0f, // dropout ratio
+            std::tuple<int64_t, int64_t>(param.philox_seed,
+                                         param.philox_offset)); // dropout random seed and offset
+
+        SimpleDeviceMem workspace(op.GetWorkSpaceSize(arg_ptr.get()));
+
+        op.SetWorkSpacePointer(arg_ptr.get(), workspace.GetDeviceBuffer());
+
+        if(!op.IsSupportedArgument(arg_ptr.get()))
+        {
+            std::ostringstream ostr;
+
+            ostr << op.GetTypeString() << " does not support this problem";
+
+            throw std::runtime_error(ostr.str());
+        }
+
+        invoker.Run(arg_ptr.get(), StreamConfig{stream, false});
+    };
 };
 
 template <typename scalar_t, int32_t custom_mask_type, bool has_attn_bias>
-void run_batched_forward_masktype_attnbias_dispatched(
-    BatchedForwardParams& param,
-    hipStream_t stream) {
-  batched_forward_masktype_attnbias_dispatched<
-      scalar_t,
-      custom_mask_type,
-      has_attn_bias>::Run(param, stream);
+void run_batched_forward_masktype_attnbias_dispatched(BatchedForwardParams& param,
+                                                      hipStream_t stream)
+{
+    batched_forward_masktype_attnbias_dispatched<scalar_t, custom_mask_type, has_attn_bias>::Run(
+        param, stream);
 };
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_forward_bp16.cpp b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_forward_bp16.cpp
index 91d73009d..362379dd0 100644
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_forward_bp16.cpp
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_forward_bp16.cpp
@@ -1,57 +1,52 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
 #include <ck/ck.hpp>
 #include <stdexcept>
 
 #include "ck_bool_switch.h"
 #include "ck_fmha_batched_forward.h"
 
-extern template void run_batched_forward_masktype_attnbias_dispatched<
-    ck::bhalf_t,
-    0,
-    true>(BatchedForwardParams& param, hipStream_t stream);
-
-extern template void run_batched_forward_masktype_attnbias_dispatched<
-    ck::bhalf_t,
-    0,
-    false>(BatchedForwardParams& param, hipStream_t stream);
-
-extern template void run_batched_forward_masktype_attnbias_dispatched<
-    ck::bhalf_t,
-    1,
-    true>(BatchedForwardParams& param, hipStream_t stream);
-
-extern template void run_batched_forward_masktype_attnbias_dispatched<
-    ck::bhalf_t,
-    1,
-    false>(BatchedForwardParams& param, hipStream_t stream);
-
-extern template void run_batched_forward_masktype_attnbias_dispatched<
-    ck::bhalf_t,
-    2,
-    true>(BatchedForwardParams& param, hipStream_t stream);
-
-extern template void run_batched_forward_masktype_attnbias_dispatched<
-    ck::bhalf_t,
-    2,
-    false>(BatchedForwardParams& param, hipStream_t stream);
-
-void batched_forward_bp16(BatchedForwardParams& param, hipStream_t stream) {
-  BOOL_SWITCH_1(param.has_attn_bias, HAS_ATTN_BIAS, [&] {
-    if (param.custom_mask_type == 0)
-      run_batched_forward_masktype_attnbias_dispatched<
-          ck::bhalf_t,
-          0,
-          HAS_ATTN_BIAS>(param, stream);
-    else if (param.custom_mask_type == 1)
-      run_batched_forward_masktype_attnbias_dispatched<
-          ck::bhalf_t,
-          1,
-          HAS_ATTN_BIAS>(param, stream);
-    else if (param.custom_mask_type == 2)
-      run_batched_forward_masktype_attnbias_dispatched<
-          ck::bhalf_t,
-          2,
-          HAS_ATTN_BIAS>(param, stream);
-    else
-      throw std::runtime_error("Invalid custom_mask_type value");
-  });
+extern template void
+run_batched_forward_masktype_attnbias_dispatched<ck::bhalf_t, 0, true>(BatchedForwardParams& param,
+                                                                       hipStream_t stream);
+
+extern template void
+run_batched_forward_masktype_attnbias_dispatched<ck::bhalf_t, 0, false>(BatchedForwardParams& param,
+                                                                        hipStream_t stream);
+
+extern template void
+run_batched_forward_masktype_attnbias_dispatched<ck::bhalf_t, 1, true>(BatchedForwardParams& param,
+                                                                       hipStream_t stream);
+
+extern template void
+run_batched_forward_masktype_attnbias_dispatched<ck::bhalf_t, 1, false>(BatchedForwardParams& param,
+                                                                        hipStream_t stream);
+
+extern template void
+run_batched_forward_masktype_attnbias_dispatched<ck::bhalf_t, 2, true>(BatchedForwardParams& param,
+                                                                       hipStream_t stream);
+
+extern template void
+run_batched_forward_masktype_attnbias_dispatched<ck::bhalf_t, 2, false>(BatchedForwardParams& param,
+                                                                        hipStream_t stream);
+
+void batched_forward_bp16(BatchedForwardParams& param, hipStream_t stream)
+{
+    BOOL_SWITCH_1(param.has_attn_bias, HAS_ATTN_BIAS, [&] {
+        if(param.custom_mask_type == 0)
+            run_batched_forward_masktype_attnbias_dispatched<ck::bhalf_t, 0, HAS_ATTN_BIAS>(param,
+                                                                                            stream);
+        else if(param.custom_mask_type == 1)
+            run_batched_forward_masktype_attnbias_dispatched<ck::bhalf_t, 1, HAS_ATTN_BIAS>(param,
+                                                                                            stream);
+        else if(param.custom_mask_type == 2)
+            run_batched_forward_masktype_attnbias_dispatched<ck::bhalf_t, 2, HAS_ATTN_BIAS>(param,
+                                                                                            stream);
+        else
+            throw std::runtime_error("Invalid custom_mask_type value");
+    });
 };
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_forward_fp16.cpp b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_forward_fp16.cpp
index 557f6fb8a..1d42798c8 100644
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_forward_fp16.cpp
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_forward_fp16.cpp
@@ -1,57 +1,52 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
 #include <ck/ck.hpp>
 #include <stdexcept>
 
 #include "ck_bool_switch.h"
 #include "ck_fmha_batched_forward.h"
 
-extern template void run_batched_forward_masktype_attnbias_dispatched<
-    ck::half_t,
-    0,
-    true>(BatchedForwardParams& param, hipStream_t stream);
-
-extern template void run_batched_forward_masktype_attnbias_dispatched<
-    ck::half_t,
-    0,
-    false>(BatchedForwardParams& param, hipStream_t stream);
-
-extern template void run_batched_forward_masktype_attnbias_dispatched<
-    ck::half_t,
-    1,
-    true>(BatchedForwardParams& param, hipStream_t stream);
-
-extern template void run_batched_forward_masktype_attnbias_dispatched<
-    ck::half_t,
-    1,
-    false>(BatchedForwardParams& param, hipStream_t stream);
-
-extern template void run_batched_forward_masktype_attnbias_dispatched<
-    ck::half_t,
-    2,
-    true>(BatchedForwardParams& param, hipStream_t stream);
-
-extern template void run_batched_forward_masktype_attnbias_dispatched<
-    ck::half_t,
-    2,
-    false>(BatchedForwardParams& param, hipStream_t stream);
-
-void batched_forward_fp16(BatchedForwardParams& param, hipStream_t stream) {
-  BOOL_SWITCH_1(param.has_attn_bias, HAS_ATTN_BIAS, [&] {
-    if (param.custom_mask_type == 0)
-      run_batched_forward_masktype_attnbias_dispatched<
-          ck::half_t,
-          0,
-          HAS_ATTN_BIAS>(param, stream);
-    else if (param.custom_mask_type == 1)
-      run_batched_forward_masktype_attnbias_dispatched<
-          ck::half_t,
-          1,
-          HAS_ATTN_BIAS>(param, stream);
-    else if (param.custom_mask_type == 2)
-      run_batched_forward_masktype_attnbias_dispatched<
-          ck::half_t,
-          2,
-          HAS_ATTN_BIAS>(param, stream);
-    else
-      throw std::runtime_error("Invalid custom_mask_type value");
-  });
+extern template void
+run_batched_forward_masktype_attnbias_dispatched<ck::half_t, 0, true>(BatchedForwardParams& param,
+                                                                      hipStream_t stream);
+
+extern template void
+run_batched_forward_masktype_attnbias_dispatched<ck::half_t, 0, false>(BatchedForwardParams& param,
+                                                                       hipStream_t stream);
+
+extern template void
+run_batched_forward_masktype_attnbias_dispatched<ck::half_t, 1, true>(BatchedForwardParams& param,
+                                                                      hipStream_t stream);
+
+extern template void
+run_batched_forward_masktype_attnbias_dispatched<ck::half_t, 1, false>(BatchedForwardParams& param,
+                                                                       hipStream_t stream);
+
+extern template void
+run_batched_forward_masktype_attnbias_dispatched<ck::half_t, 2, true>(BatchedForwardParams& param,
+                                                                      hipStream_t stream);
+
+extern template void
+run_batched_forward_masktype_attnbias_dispatched<ck::half_t, 2, false>(BatchedForwardParams& param,
+                                                                       hipStream_t stream);
+
+void batched_forward_fp16(BatchedForwardParams& param, hipStream_t stream)
+{
+    BOOL_SWITCH_1(param.has_attn_bias, HAS_ATTN_BIAS, [&] {
+        if(param.custom_mask_type == 0)
+            run_batched_forward_masktype_attnbias_dispatched<ck::half_t, 0, HAS_ATTN_BIAS>(param,
+                                                                                           stream);
+        else if(param.custom_mask_type == 1)
+            run_batched_forward_masktype_attnbias_dispatched<ck::half_t, 1, HAS_ATTN_BIAS>(param,
+                                                                                           stream);
+        else if(param.custom_mask_type == 2)
+            run_batched_forward_masktype_attnbias_dispatched<ck::half_t, 2, HAS_ATTN_BIAS>(param,
+                                                                                           stream);
+        else
+            throw std::runtime_error("Invalid custom_mask_type value");
+    });
 };
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_infer.h b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_infer.h
index dfc17191b..af7c7679c 100644
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_infer.h
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_infer.h
@@ -1,3 +1,9 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
 #pragma once
 
 #include <sstream>
@@ -18,59 +24,62 @@
 #include "ck_fmha_params.h"
 
 template <typename scalar_t, int32_t custom_mask_type, bool has_attn_bias>
-struct batched_infer_masktype_attnbias_dispatched {
-  using PassThrough = ck::tensor_operation::element_wise::PassThrough;
-
-  using GemmDataType = scalar_t;
-  using ADataType = scalar_t;
-  using B0DataType = scalar_t;
-  using B1DataType = scalar_t;
-  using AccDataType = F32;
-  using CShuffleDataType = F32;
-  using CDataType = scalar_t;
-  using ZDataType = unsigned short;
-  using LSEDataType = F32;
-  using Acc0BiasDataType =
-      typename std::conditional<has_attn_bias, scalar_t, void>::type;
-  using Acc1BiasDataType = void;
-
-  using AElementOp = PassThrough;
-  using B0ElementOp = PassThrough;
-  using Acc0ElementOp = ck::tensor_operation::element_wise::Scale;
-  using B1ElementOp = PassThrough;
-  using CElementOp = PassThrough;
-
-  static constexpr auto GemmSpec =
-      ck::tensor_operation::device::GemmSpecialization::MNKOPadding;
-  static constexpr auto MaskingSpec =
-      static_cast<ck::tensor_operation::device::MaskingSpecialization>(
-          custom_mask_type);
-
-  static constexpr ck::index_t kAcc0BiasTransferSrcScalarPerVector = 1;
+struct batched_infer_masktype_attnbias_dispatched
+{
+    using PassThrough = ck::tensor_operation::element_wise::PassThrough;
+
+    using GemmDataType     = scalar_t;
+    using ADataType        = scalar_t;
+    using B0DataType       = scalar_t;
+    using B1DataType       = scalar_t;
+    using AccDataType      = F32;
+    using CShuffleDataType = F32;
+    using CDataType        = scalar_t;
+    using ZDataType        = unsigned short;
+    using LSEDataType      = F32;
+    using Acc0BiasDataType = typename std::conditional<has_attn_bias, scalar_t, void>::type;
+    using Acc1BiasDataType = void;
+
+    using AElementOp    = PassThrough;
+    using B0ElementOp   = PassThrough;
+    using Acc0ElementOp = ck::tensor_operation::element_wise::Scale;
+    using B1ElementOp   = PassThrough;
+    using CElementOp    = PassThrough;
+
+    static constexpr auto GemmSpec = ck::tensor_operation::device::GemmSpecialization::MNKOPadding;
+    static constexpr auto MaskingSpec =
+        static_cast<ck::tensor_operation::device::MaskingSpecialization>(custom_mask_type);
+
+    static constexpr ck::index_t kAcc0BiasTransferSrcScalarPerVector = 1;
 
 #ifndef BATCHED_INFER_HEADDIM_SWITCH
-#define BATCHED_INFER_HEADDIM_SWITCH(HEAD_DIM1, HEAD_DIM2, ...) \
-  [&] {                                                         \
-    if (HEAD_DIM1 <= 32 && HEAD_DIM2 <= 32) {                   \
-      constexpr ck::index_t kGemm1NPerBlock = 32;               \
-      constexpr ck::index_t kGemm1NXdlPerWave = 1;              \
-      constexpr ck::index_t kCShuffleNXdlPerWavePerShuffle = 1; \
-      __VA_ARGS__();                                            \
-    } else if (HEAD_DIM1 <= 64 && HEAD_DIM2 <= 64) {            \
-      constexpr ck::index_t kGemm1NPerBlock = 64;               \
-      constexpr ck::index_t kGemm1NXdlPerWave = 2;              \
-      constexpr ck::index_t kCShuffleNXdlPerWavePerShuffle = 2; \
-      __VA_ARGS__();                                            \
-    } else {                                                    \
-      constexpr ck::index_t kGemm1NPerBlock = 128;              \
-      constexpr ck::index_t kGemm1NXdlPerWave = 4;              \
-      constexpr ck::index_t kCShuffleNXdlPerWavePerShuffle = 4; \
-      __VA_ARGS__();                                            \
-    }                                                           \
-  }()
+#define BATCHED_INFER_HEADDIM_SWITCH(HEAD_DIM1, HEAD_DIM2, ...)         \
+    [&] {                                                               \
+        if(HEAD_DIM1 <= 32 && HEAD_DIM2 <= 32)                          \
+        {                                                               \
+            constexpr ck::index_t kGemm1NPerBlock                = 32;  \
+            constexpr ck::index_t kGemm1NXdlPerWave              = 1;   \
+            constexpr ck::index_t kCShuffleNXdlPerWavePerShuffle = 1;   \
+            __VA_ARGS__();                                              \
+        }                                                               \
+        else if(HEAD_DIM1 <= 64 && HEAD_DIM2 <= 64)                     \
+        {                                                               \
+            constexpr ck::index_t kGemm1NPerBlock                = 64;  \
+            constexpr ck::index_t kGemm1NXdlPerWave              = 2;   \
+            constexpr ck::index_t kCShuffleNXdlPerWavePerShuffle = 2;   \
+            __VA_ARGS__();                                              \
+        }                                                               \
+        else                                                            \
+        {                                                               \
+            constexpr ck::index_t kGemm1NPerBlock                = 128; \
+            constexpr ck::index_t kGemm1NXdlPerWave              = 4;   \
+            constexpr ck::index_t kCShuffleNXdlPerWavePerShuffle = 4;   \
+            __VA_ARGS__();                                              \
+        }                                                               \
+    }()
 #endif
 
-  // clang-format off
+    // clang-format off
   template <
       ck::index_t kGemm1NPerBlock,
       ck::index_t kGemm1NXdlPerWave,
@@ -144,209 +153,190 @@ struct batched_infer_masktype_attnbias_dispatched {
           GemmOpConstantsBatchedInfer::CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock,
           kCShuffleBlockTransferScalarPerVector,
           MaskingSpec>;
-  // clang-format on
-
-  static constexpr auto I1 = ck::Number<1>{};
-  static constexpr auto I2 = ck::Number<2>{};
-  static constexpr auto I3 = ck::Number<3>{};
-
-  static void Run(BatchedForwardParams& param, hipStream_t stream) {
-    using ck::math::min;
-
-    // compile-time constants which don't depend on head-dim switching
-    constexpr ck::index_t thread_slice_length_ak1 =
-        GemmOpConstantsBatchedInfer::AK1 /
-        GemmOpConstantsBatchedInfer::
-            ABlockTransferThreadClusterLengths_AK0_M_AK1::At(I2);
-    constexpr ck::index_t thread_slice_length_bk1 =
-        GemmOpConstantsBatchedInfer::BK1 /
-        GemmOpConstantsBatchedInfer::
-            BBlockTransferThreadClusterLengths_BK0_N_BK1::At(I2);
-
-    static_assert(
-        thread_slice_length_ak1 == thread_slice_length_bk1,
-        "ABlockTransfer and BBlockTransfer should use completely same K1 sizes and ThreadClusterLengths!");
-
-    constexpr ck::index_t kABBlockTransferSrcScalarPerVector_max =
-        min(8, thread_slice_length_ak1);
-
-    BATCHED_INFER_HEADDIM_SWITCH(param.K, param.Kv, [&] {
-      constexpr ck::index_t thread_slice_length_gemm1n = kGemm1NPerBlock /
-          GemmOpConstantsBatchedInfer::
-              B1BlockTransferThreadClusterLengths_BK0_N_BK1::At(I1);
-      constexpr ck::index_t kB1BlockTransferSrcScalarPerVector_max =
-          min(4, thread_slice_length_gemm1n);
-
-      constexpr ck::index_t thread_slice_length_cshuflle_n =
-          (kCShuffleNXdlPerWavePerShuffle * kGemm1NPerBlock /
-           kGemm1NXdlPerWave) /
-          GemmOpConstantsBatchedInfer::
-              CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock ::
-                  At(I3);
-
-      constexpr ck::index_t kCShuffleBlockTransferScalarPerVector_max =
-          min(4, thread_slice_length_cshuflle_n);
-
-      if constexpr (
-          kB1BlockTransferSrcScalarPerVector_max >=
-          kCShuffleBlockTransferScalarPerVector_max) {
-        ALIGN_SWITCH_2(
-            kABBlockTransferSrcScalarPerVector_max,
-            kABBlockTransferSrcScalarPerVector,
-            param.K,
-            kB1BlockTransferSrcScalarPerVector_max,
-            kB1BlockTransferSrcScalarPerVector,
-            param.Kv,
-            [&] {
-              constexpr ck::index_t kCShuffleBlockTransferScalarPerVector =
-                  min(kB1BlockTransferSrcScalarPerVector,
-                      kCShuffleBlockTransferScalarPerVector_max);
-              using DeviceOpInstance = DeviceOpInstanceTemp<
-                  kGemm1NPerBlock,
-                  kGemm1NXdlPerWave,
-                  kCShuffleNXdlPerWavePerShuffle,
-                  kABBlockTransferSrcScalarPerVector,
-                  kB1BlockTransferSrcScalarPerVector,
-                  kCShuffleBlockTransferScalarPerVector>;
-
-              RunWithDeviceOp<DeviceOpInstance>(param, stream);
-            });
-      } else {
-        ALIGN_SWITCH_2(
-            kABBlockTransferSrcScalarPerVector_max,
-            kABBlockTransferSrcScalarPerVector,
-            param.K,
-            kCShuffleBlockTransferScalarPerVector_max,
-            kCShuffleBlockTransferScalarPerVector,
-            param.Kv,
-            [&] {
-              constexpr ck::index_t kB1BlockTransferSrcScalarPerVector =
-                  min(kCShuffleBlockTransferScalarPerVector,
-                      kB1BlockTransferSrcScalarPerVector_max);
-              using DeviceOpInstance = DeviceOpInstanceTemp<
-                  kGemm1NPerBlock,
-                  kGemm1NXdlPerWave,
-                  kCShuffleNXdlPerWavePerShuffle,
-                  kABBlockTransferSrcScalarPerVector,
-                  kB1BlockTransferSrcScalarPerVector,
-                  kCShuffleBlockTransferScalarPerVector>;
-
-              RunWithDeviceOp<DeviceOpInstance>(param, stream);
-            });
-      };
-    });
-  };
-
-  template <typename DeviceOpInstance>
-  static void RunWithDeviceOp(BatchedForwardParams& param, hipStream_t stream) {
-    std::vector<ck::index_t> a_gs_ms_ks_lengths{
-        param.B, param.Hq, param.M, param.K};
-    std::vector<ck::index_t> a_gs_ms_ks_strides{
-        param.q_strides[0],
-        param.q_strides[2],
-        param.q_strides[1],
-        param.q_strides[3]};
-
-    std::vector<ck::index_t> b0_gs_ns_ks_lengths{
-        param.B, param.Hkv, param.N, param.K};
-    std::vector<ck::index_t> b0_gs_ns_ks_strides{
-        param.k_strides[0],
-        param.k_strides[2],
-        param.k_strides[1],
-        param.k_strides[3]};
-
-    // to be changed to b1_gs_ns_os_lengths
-    std::vector<ck::index_t> b1_gs_os_ns_lengths{
-        param.B, param.Hkv, param.Kv, param.N};
-    std::vector<ck::index_t> b1_gs_os_ns_strides{
-        param.v_strides[0],
-        param.v_strides[2],
-        param.v_strides[3],
-        param.v_strides[1]};
-
-    std::vector<ck::index_t> c_gs_ms_os_lengths{
-        param.B, param.Hq, param.M, param.Kv};
-    std::vector<ck::index_t> c_gs_ms_os_strides{
-        param.out_strides[0],
-        param.out_strides[2],
-        param.out_strides[1],
-        param.out_strides[3]};
-
-    std::vector<ck::index_t> lse_gs_ms_lengths{param.B, param.Hq, param.M};
-
-    std::vector<ck::index_t> d_gs_ms_ns_lengths;
-    std::vector<ck::index_t> d_gs_ms_ns_strides;
-
-    if constexpr (has_attn_bias) {
-      d_gs_ms_ns_lengths = {param.B, param.Hq, param.M, param.N};
-      d_gs_ms_ns_strides = {
-          param.attn_bias_strides[0],
-          param.attn_bias_strides[1],
-          param.attn_bias_strides[2],
-          param.attn_bias_strides[3]};
-    } else {
-      d_gs_ms_ns_lengths = {1, 1, 1, 1};
-      d_gs_ms_ns_strides = {0, 0, 0, 0};
+    // clang-format on
+
+    static constexpr auto I1 = ck::Number<1>{};
+    static constexpr auto I2 = ck::Number<2>{};
+    static constexpr auto I3 = ck::Number<3>{};
+
+    static void Run(BatchedForwardParams& param, hipStream_t stream)
+    {
+        using ck::math::min;
+
+        // compile-time constants which don't depend on head-dim switching
+        constexpr ck::index_t thread_slice_length_ak1 =
+            GemmOpConstantsBatchedInfer::AK1 /
+            GemmOpConstantsBatchedInfer::ABlockTransferThreadClusterLengths_AK0_M_AK1::At(I2);
+        constexpr ck::index_t thread_slice_length_bk1 =
+            GemmOpConstantsBatchedInfer::BK1 /
+            GemmOpConstantsBatchedInfer::BBlockTransferThreadClusterLengths_BK0_N_BK1::At(I2);
+
+        static_assert(thread_slice_length_ak1 == thread_slice_length_bk1,
+                      "ABlockTransfer and BBlockTransfer should use completely same K1 sizes and "
+                      "ThreadClusterLengths!");
+
+        constexpr ck::index_t kABBlockTransferSrcScalarPerVector_max =
+            min(8, thread_slice_length_ak1);
+
+        BATCHED_INFER_HEADDIM_SWITCH(param.K, param.Kv, [&] {
+            constexpr ck::index_t thread_slice_length_gemm1n =
+                kGemm1NPerBlock /
+                GemmOpConstantsBatchedInfer::B1BlockTransferThreadClusterLengths_BK0_N_BK1::At(I1);
+            constexpr ck::index_t kB1BlockTransferSrcScalarPerVector_max =
+                min(4, thread_slice_length_gemm1n);
+
+            constexpr ck::index_t thread_slice_length_cshuflle_n =
+                (kCShuffleNXdlPerWavePerShuffle * kGemm1NPerBlock / kGemm1NXdlPerWave) /
+                GemmOpConstantsBatchedInfer::
+                    CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock ::At(I3);
+
+            constexpr ck::index_t kCShuffleBlockTransferScalarPerVector_max =
+                min(4, thread_slice_length_cshuflle_n);
+
+            if constexpr(kB1BlockTransferSrcScalarPerVector_max >=
+                         kCShuffleBlockTransferScalarPerVector_max)
+            {
+                ALIGN_SWITCH_2(kABBlockTransferSrcScalarPerVector_max,
+                               kABBlockTransferSrcScalarPerVector,
+                               param.K,
+                               kB1BlockTransferSrcScalarPerVector_max,
+                               kB1BlockTransferSrcScalarPerVector,
+                               param.Kv,
+                               [&] {
+                                   constexpr ck::index_t kCShuffleBlockTransferScalarPerVector =
+                                       min(kB1BlockTransferSrcScalarPerVector,
+                                           kCShuffleBlockTransferScalarPerVector_max);
+                                   using DeviceOpInstance =
+                                       DeviceOpInstanceTemp<kGemm1NPerBlock,
+                                                            kGemm1NXdlPerWave,
+                                                            kCShuffleNXdlPerWavePerShuffle,
+                                                            kABBlockTransferSrcScalarPerVector,
+                                                            kB1BlockTransferSrcScalarPerVector,
+                                                            kCShuffleBlockTransferScalarPerVector>;
+
+                                   RunWithDeviceOp<DeviceOpInstance>(param, stream);
+                               });
+            }
+            else
+            {
+                ALIGN_SWITCH_2(kABBlockTransferSrcScalarPerVector_max,
+                               kABBlockTransferSrcScalarPerVector,
+                               param.K,
+                               kCShuffleBlockTransferScalarPerVector_max,
+                               kCShuffleBlockTransferScalarPerVector,
+                               param.Kv,
+                               [&] {
+                                   constexpr ck::index_t kB1BlockTransferSrcScalarPerVector =
+                                       min(kCShuffleBlockTransferScalarPerVector,
+                                           kB1BlockTransferSrcScalarPerVector_max);
+                                   using DeviceOpInstance =
+                                       DeviceOpInstanceTemp<kGemm1NPerBlock,
+                                                            kGemm1NXdlPerWave,
+                                                            kCShuffleNXdlPerWavePerShuffle,
+                                                            kABBlockTransferSrcScalarPerVector,
+                                                            kB1BlockTransferSrcScalarPerVector,
+                                                            kCShuffleBlockTransferScalarPerVector>;
+
+                                   RunWithDeviceOp<DeviceOpInstance>(param, stream);
+                               });
+            };
+        });
     };
 
-    float alpha = param.scale;
-
-    auto a_element_op = AElementOp{};
-    auto b0_element_op = B0ElementOp{};
-    auto acc0_element_op = Acc0ElementOp{alpha};
-    auto b1_element_op = B1ElementOp{};
-    auto c_element_op = CElementOp{};
-
-    auto op = DeviceOpInstance{};
-    auto invoker = op.MakeInvoker();
-
-    auto arg_ptr = op.MakeArgumentPointer(
-        param.q_ptr,
-        param.k_ptr,
-        param.v_ptr,
-        param.out_ptr,
-        param.has_attn_bias ? param.attn_bias_ptr : nullptr,
-        {}, // p_acc1_biases;
-        a_gs_ms_ks_lengths,
-        a_gs_ms_ks_strides,
-        b0_gs_ns_ks_lengths,
-        b0_gs_ns_ks_strides,
-        b1_gs_os_ns_lengths,
-        b1_gs_os_ns_strides,
-        c_gs_ms_os_lengths,
-        c_gs_ms_os_strides,
-        d_gs_ms_ns_lengths,
-        d_gs_ms_ns_strides,
-        {}, // acc1_biases_gs_ms_os_lengths
-        {}, // acc1_biases_gs_ms_os_strides,
-        a_element_op,
-        b0_element_op,
-        acc0_element_op,
-        b1_element_op,
-        c_element_op);
-
-    SimpleDeviceMem workspace(op.GetWorkSpaceSize(arg_ptr.get()));
-
-    op.SetWorkSpacePointer(arg_ptr.get(), workspace.GetDeviceBuffer());
-
-    if (!op.IsSupportedArgument(arg_ptr.get())) {
-      std::ostringstream ostr;
-
-      ostr << op.GetTypeString() << " does not support this problem";
-
-      throw std::runtime_error(ostr.str());
-    }
-
-    invoker.Run(arg_ptr.get(), StreamConfig{stream, false});
-  };
+    template <typename DeviceOpInstance>
+    static void RunWithDeviceOp(BatchedForwardParams& param, hipStream_t stream)
+    {
+        std::vector<ck::index_t> a_gs_ms_ks_lengths{param.B, param.Hq, param.M, param.K};
+        std::vector<ck::index_t> a_gs_ms_ks_strides{
+            param.q_strides[0], param.q_strides[2], param.q_strides[1], param.q_strides[3]};
+
+        std::vector<ck::index_t> b0_gs_ns_ks_lengths{param.B, param.Hkv, param.N, param.K};
+        std::vector<ck::index_t> b0_gs_ns_ks_strides{
+            param.k_strides[0], param.k_strides[2], param.k_strides[1], param.k_strides[3]};
+
+        // to be changed to b1_gs_ns_os_lengths
+        std::vector<ck::index_t> b1_gs_os_ns_lengths{param.B, param.Hkv, param.Kv, param.N};
+        std::vector<ck::index_t> b1_gs_os_ns_strides{
+            param.v_strides[0], param.v_strides[2], param.v_strides[3], param.v_strides[1]};
+
+        std::vector<ck::index_t> c_gs_ms_os_lengths{param.B, param.Hq, param.M, param.Kv};
+        std::vector<ck::index_t> c_gs_ms_os_strides{
+            param.out_strides[0], param.out_strides[2], param.out_strides[1], param.out_strides[3]};
+
+        std::vector<ck::index_t> lse_gs_ms_lengths{param.B, param.Hq, param.M};
+
+        std::vector<ck::index_t> d_gs_ms_ns_lengths;
+        std::vector<ck::index_t> d_gs_ms_ns_strides;
+
+        if constexpr(has_attn_bias)
+        {
+            d_gs_ms_ns_lengths = {param.B, param.Hq, param.M, param.N};
+            d_gs_ms_ns_strides = {param.attn_bias_strides[0],
+                                  param.attn_bias_strides[1],
+                                  param.attn_bias_strides[2],
+                                  param.attn_bias_strides[3]};
+        }
+        else
+        {
+            d_gs_ms_ns_lengths = {1, 1, 1, 1};
+            d_gs_ms_ns_strides = {0, 0, 0, 0};
+        };
+
+        float alpha = param.scale;
+
+        auto a_element_op    = AElementOp{};
+        auto b0_element_op   = B0ElementOp{};
+        auto acc0_element_op = Acc0ElementOp{alpha};
+        auto b1_element_op   = B1ElementOp{};
+        auto c_element_op    = CElementOp{};
+
+        auto op      = DeviceOpInstance{};
+        auto invoker = op.MakeInvoker();
+
+        auto arg_ptr = op.MakeArgumentPointer(param.q_ptr,
+                                              param.k_ptr,
+                                              param.v_ptr,
+                                              param.out_ptr,
+                                              param.has_attn_bias ? param.attn_bias_ptr : nullptr,
+                                              {}, // p_acc1_biases;
+                                              a_gs_ms_ks_lengths,
+                                              a_gs_ms_ks_strides,
+                                              b0_gs_ns_ks_lengths,
+                                              b0_gs_ns_ks_strides,
+                                              b1_gs_os_ns_lengths,
+                                              b1_gs_os_ns_strides,
+                                              c_gs_ms_os_lengths,
+                                              c_gs_ms_os_strides,
+                                              d_gs_ms_ns_lengths,
+                                              d_gs_ms_ns_strides,
+                                              {}, // acc1_biases_gs_ms_os_lengths
+                                              {}, // acc1_biases_gs_ms_os_strides,
+                                              a_element_op,
+                                              b0_element_op,
+                                              acc0_element_op,
+                                              b1_element_op,
+                                              c_element_op);
+
+        SimpleDeviceMem workspace(op.GetWorkSpaceSize(arg_ptr.get()));
+
+        op.SetWorkSpacePointer(arg_ptr.get(), workspace.GetDeviceBuffer());
+
+        if(!op.IsSupportedArgument(arg_ptr.get()))
+        {
+            std::ostringstream ostr;
+
+            ostr << op.GetTypeString() << " does not support this problem";
+
+            throw std::runtime_error(ostr.str());
+        }
+
+        invoker.Run(arg_ptr.get(), StreamConfig{stream, false});
+    };
 };
 
 template <typename scalar_t, int32_t custom_mask_type, bool has_attn_bias>
-void run_batched_infer_masktype_attnbias_dispatched(
-    BatchedForwardParams& param,
-    hipStream_t stream) {
-  batched_infer_masktype_attnbias_dispatched<
-      scalar_t,
-      custom_mask_type,
-      has_attn_bias>::Run(param, stream);
+void run_batched_infer_masktype_attnbias_dispatched(BatchedForwardParams& param, hipStream_t stream)
+{
+    batched_infer_masktype_attnbias_dispatched<scalar_t, custom_mask_type, has_attn_bias>::Run(
+        param, stream);
 };
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_infer_bp16.cpp b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_infer_bp16.cpp
index 628f7ec84..1530aad32 100644
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_infer_bp16.cpp
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_infer_bp16.cpp
@@ -1,57 +1,52 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
 #include <ck/ck.hpp>
 #include <stdexcept>
 
 #include "ck_bool_switch.h"
 #include "ck_fmha_batched_infer.h"
 
-extern template void run_batched_infer_masktype_attnbias_dispatched<
-    ck::bhalf_t,
-    0,
-    true>(BatchedForwardParams& param, hipStream_t stream);
-
-extern template void run_batched_infer_masktype_attnbias_dispatched<
-    ck::bhalf_t,
-    0,
-    false>(BatchedForwardParams& param, hipStream_t stream);
-
-extern template void run_batched_infer_masktype_attnbias_dispatched<
-    ck::bhalf_t,
-    1,
-    true>(BatchedForwardParams& param, hipStream_t stream);
-
-extern template void run_batched_infer_masktype_attnbias_dispatched<
-    ck::bhalf_t,
-    1,
-    false>(BatchedForwardParams& param, hipStream_t stream);
-
-extern template void run_batched_infer_masktype_attnbias_dispatched<
-    ck::bhalf_t,
-    2,
-    true>(BatchedForwardParams& param, hipStream_t stream);
-
-extern template void run_batched_infer_masktype_attnbias_dispatched<
-    ck::bhalf_t,
-    2,
-    false>(BatchedForwardParams& param, hipStream_t stream);
-
-void batched_infer_bp16(BatchedForwardParams& param, hipStream_t stream) {
-  BOOL_SWITCH_1(param.has_attn_bias, HAS_ATTN_BIAS, [&] {
-    if (param.custom_mask_type == 0)
-      run_batched_infer_masktype_attnbias_dispatched<
-          ck::bhalf_t,
-          0,
-          HAS_ATTN_BIAS>(param, stream);
-    else if (param.custom_mask_type == 1)
-      run_batched_infer_masktype_attnbias_dispatched<
-          ck::bhalf_t,
-          1,
-          HAS_ATTN_BIAS>(param, stream);
-    else if (param.custom_mask_type == 2)
-      run_batched_infer_masktype_attnbias_dispatched<
-          ck::bhalf_t,
-          2,
-          HAS_ATTN_BIAS>(param, stream);
-    else
-      throw std::runtime_error("Invalid custom_mask_type value");
-  });
+extern template void
+run_batched_infer_masktype_attnbias_dispatched<ck::bhalf_t, 0, true>(BatchedForwardParams& param,
+                                                                     hipStream_t stream);
+
+extern template void
+run_batched_infer_masktype_attnbias_dispatched<ck::bhalf_t, 0, false>(BatchedForwardParams& param,
+                                                                      hipStream_t stream);
+
+extern template void
+run_batched_infer_masktype_attnbias_dispatched<ck::bhalf_t, 1, true>(BatchedForwardParams& param,
+                                                                     hipStream_t stream);
+
+extern template void
+run_batched_infer_masktype_attnbias_dispatched<ck::bhalf_t, 1, false>(BatchedForwardParams& param,
+                                                                      hipStream_t stream);
+
+extern template void
+run_batched_infer_masktype_attnbias_dispatched<ck::bhalf_t, 2, true>(BatchedForwardParams& param,
+                                                                     hipStream_t stream);
+
+extern template void
+run_batched_infer_masktype_attnbias_dispatched<ck::bhalf_t, 2, false>(BatchedForwardParams& param,
+                                                                      hipStream_t stream);
+
+void batched_infer_bp16(BatchedForwardParams& param, hipStream_t stream)
+{
+    BOOL_SWITCH_1(param.has_attn_bias, HAS_ATTN_BIAS, [&] {
+        if(param.custom_mask_type == 0)
+            run_batched_infer_masktype_attnbias_dispatched<ck::bhalf_t, 0, HAS_ATTN_BIAS>(param,
+                                                                                          stream);
+        else if(param.custom_mask_type == 1)
+            run_batched_infer_masktype_attnbias_dispatched<ck::bhalf_t, 1, HAS_ATTN_BIAS>(param,
+                                                                                          stream);
+        else if(param.custom_mask_type == 2)
+            run_batched_infer_masktype_attnbias_dispatched<ck::bhalf_t, 2, HAS_ATTN_BIAS>(param,
+                                                                                          stream);
+        else
+            throw std::runtime_error("Invalid custom_mask_type value");
+    });
 };
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_infer_fp16.cpp b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_infer_fp16.cpp
index 5e4c861c2..52b385aa2 100644
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_infer_fp16.cpp
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_infer_fp16.cpp
@@ -1,57 +1,52 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
 #include <ck/ck.hpp>
 #include <stdexcept>
 
 #include "ck_bool_switch.h"
 #include "ck_fmha_batched_infer.h"
 
-extern template void run_batched_infer_masktype_attnbias_dispatched<
-    ck::half_t,
-    0,
-    true>(BatchedForwardParams& param, hipStream_t stream);
-
-extern template void run_batched_infer_masktype_attnbias_dispatched<
-    ck::half_t,
-    0,
-    false>(BatchedForwardParams& param, hipStream_t stream);
-
-extern template void run_batched_infer_masktype_attnbias_dispatched<
-    ck::half_t,
-    1,
-    true>(BatchedForwardParams& param, hipStream_t stream);
-
-extern template void run_batched_infer_masktype_attnbias_dispatched<
-    ck::half_t,
-    1,
-    false>(BatchedForwardParams& param, hipStream_t stream);
-
-extern template void run_batched_infer_masktype_attnbias_dispatched<
-    ck::half_t,
-    2,
-    true>(BatchedForwardParams& param, hipStream_t stream);
-
-extern template void run_batched_infer_masktype_attnbias_dispatched<
-    ck::half_t,
-    2,
-    false>(BatchedForwardParams& param, hipStream_t stream);
-
-void batched_infer_fp16(BatchedForwardParams& param, hipStream_t stream) {
-  BOOL_SWITCH_1(param.has_attn_bias, HAS_ATTN_BIAS, [&] {
-    if (param.custom_mask_type == 0)
-      run_batched_infer_masktype_attnbias_dispatched<
-          ck::half_t,
-          0,
-          HAS_ATTN_BIAS>(param, stream);
-    else if (param.custom_mask_type == 1)
-      run_batched_infer_masktype_attnbias_dispatched<
-          ck::half_t,
-          1,
-          HAS_ATTN_BIAS>(param, stream);
-    else if (param.custom_mask_type == 2)
-      run_batched_infer_masktype_attnbias_dispatched<
-          ck::half_t,
-          2,
-          HAS_ATTN_BIAS>(param, stream);
-    else
-      throw std::runtime_error("Invalid custom_mask_type value");
-  });
+extern template void
+run_batched_infer_masktype_attnbias_dispatched<ck::half_t, 0, true>(BatchedForwardParams& param,
+                                                                    hipStream_t stream);
+
+extern template void
+run_batched_infer_masktype_attnbias_dispatched<ck::half_t, 0, false>(BatchedForwardParams& param,
+                                                                     hipStream_t stream);
+
+extern template void
+run_batched_infer_masktype_attnbias_dispatched<ck::half_t, 1, true>(BatchedForwardParams& param,
+                                                                    hipStream_t stream);
+
+extern template void
+run_batched_infer_masktype_attnbias_dispatched<ck::half_t, 1, false>(BatchedForwardParams& param,
+                                                                     hipStream_t stream);
+
+extern template void
+run_batched_infer_masktype_attnbias_dispatched<ck::half_t, 2, true>(BatchedForwardParams& param,
+                                                                    hipStream_t stream);
+
+extern template void
+run_batched_infer_masktype_attnbias_dispatched<ck::half_t, 2, false>(BatchedForwardParams& param,
+                                                                     hipStream_t stream);
+
+void batched_infer_fp16(BatchedForwardParams& param, hipStream_t stream)
+{
+    BOOL_SWITCH_1(param.has_attn_bias, HAS_ATTN_BIAS, [&] {
+        if(param.custom_mask_type == 0)
+            run_batched_infer_masktype_attnbias_dispatched<ck::half_t, 0, HAS_ATTN_BIAS>(param,
+                                                                                         stream);
+        else if(param.custom_mask_type == 1)
+            run_batched_infer_masktype_attnbias_dispatched<ck::half_t, 1, HAS_ATTN_BIAS>(param,
+                                                                                         stream);
+        else if(param.custom_mask_type == 2)
+            run_batched_infer_masktype_attnbias_dispatched<ck::half_t, 2, HAS_ATTN_BIAS>(param,
+                                                                                         stream);
+        else
+            throw std::runtime_error("Invalid custom_mask_type value");
+    });
 };
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_common_gemm_constants.h b/xformers/csrc/attention/hip_fmha/ck_fmha_common_gemm_constants.h
index 654a7f8db..6362916ae 100644
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_common_gemm_constants.h
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_common_gemm_constants.h
@@ -1,23 +1,27 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
 #pragma once
 
 #include <ck/ck.hpp>
 #include "ck_fmha_op_helper.h"
 
 // list the template parameters that is commonly used
-struct GemmOpConstantsCommon {
-  static constexpr ck::index_t NumDimG = 2;
-  static constexpr ck::index_t NumDimM = 1;
-  static constexpr ck::index_t NumDimN = 1;
-  static constexpr ck::index_t NumDimK = 1;
-  static constexpr ck::index_t NumDimO = 1;
+struct GemmOpConstantsCommon
+{
+    static constexpr ck::index_t NumDimG = 2;
+    static constexpr ck::index_t NumDimM = 1;
+    static constexpr ck::index_t NumDimN = 1;
+    static constexpr ck::index_t NumDimK = 1;
+    static constexpr ck::index_t NumDimO = 1;
 
-  static constexpr auto TensorSpecA =
-      ck::tensor_operation::device::TensorSpecialization::Default;
-  static constexpr auto TensorSpecB0 =
-      ck::tensor_operation::device::TensorSpecialization::Default;
-  static constexpr auto TensorSpecB1 =
-      ck::tensor_operation::device::TensorSpecialization::Default;
-  static constexpr auto TensorSpecC =
-      ck::tensor_operation::device::TensorSpecialization::Default;
+    static constexpr auto TensorSpecA = ck::tensor_operation::device::TensorSpecialization::Default;
+    static constexpr auto TensorSpecB0 =
+        ck::tensor_operation::device::TensorSpecialization::Default;
+    static constexpr auto TensorSpecB1 =
+        ck::tensor_operation::device::TensorSpecialization::Default;
+    static constexpr auto TensorSpecC = ck::tensor_operation::device::TensorSpecialization::Default;
 };
-
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_forward_gemm_constants.h b/xformers/csrc/attention/hip_fmha/ck_fmha_forward_gemm_constants.h
index c80ec4603..ab3c159b7 100644
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_forward_gemm_constants.h
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_forward_gemm_constants.h
@@ -1,3 +1,9 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
 #pragma once
 
 #include <ck/ck.hpp>
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_backward.h b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_backward.h
index 71674bda7..2fb06ddd8 100644
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_backward.h
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_backward.h
@@ -1,3 +1,9 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
 #pragma once
 
 #include <sstream>
@@ -18,60 +24,56 @@
 #include "ck_fmha_op_helper.h"
 #include "ck_fmha_params.h"
 
-template <
-    typename scalar_t,
-    int32_t custom_mask_type,
-    bool has_attn_bias,
-    bool use_fp32_qkv_grad>
-struct grouped_backward_masktype_attnbias_dispatched {
-  using PassThrough = ck::tensor_operation::element_wise::PassThrough;
-  using Scale = ck::tensor_operation::element_wise::Scale;
-
-  using QKVElementOp = PassThrough;
-  using YElementOp = PassThrough;
-
-  using InputDataType = scalar_t;
-  using OutputDataType =
-      typename std::conditional<use_fp32_qkv_grad, F32, scalar_t>::type;
-  using GemmDataType = scalar_t;
-  using AccDataType = F32;
-  using ShuffleDataType = F32;
-  using LSEDataType = F32;
-  using ZDataType = unsigned short;
-  using Acc0BiasDataType =
-      typename std::conditional<has_attn_bias, scalar_t, void>::type;
-  using Acc1BiasDataType = void;
-
-  static constexpr auto GemmSpec =
-      ck::tensor_operation::device::GemmSpecialization::MNKOPadding;
-  static constexpr auto MaskingSpec =
-      static_cast<ck::tensor_operation::device::MaskingSpecialization>(
-          custom_mask_type);
-
-  static constexpr bool Deterministic = true;
-
-  static constexpr ck::index_t kAcc0BiasTransferSrcScalarPerVector = 1;
+template <typename scalar_t, int32_t custom_mask_type, bool has_attn_bias, bool use_fp32_qkv_grad>
+struct grouped_backward_masktype_attnbias_dispatched
+{
+    using PassThrough = ck::tensor_operation::element_wise::PassThrough;
+    using Scale       = ck::tensor_operation::element_wise::Scale;
+
+    using QKVElementOp = PassThrough;
+    using YElementOp   = PassThrough;
+
+    using InputDataType    = scalar_t;
+    using OutputDataType   = typename std::conditional<use_fp32_qkv_grad, F32, scalar_t>::type;
+    using GemmDataType     = scalar_t;
+    using AccDataType      = F32;
+    using ShuffleDataType  = F32;
+    using LSEDataType      = F32;
+    using ZDataType        = unsigned short;
+    using Acc0BiasDataType = typename std::conditional<has_attn_bias, scalar_t, void>::type;
+    using Acc1BiasDataType = void;
+
+    static constexpr auto GemmSpec = ck::tensor_operation::device::GemmSpecialization::MNKOPadding;
+    static constexpr auto MaskingSpec =
+        static_cast<ck::tensor_operation::device::MaskingSpecialization>(custom_mask_type);
+
+    static constexpr bool Deterministic = true;
+
+    static constexpr ck::index_t kAcc0BiasTransferSrcScalarPerVector = 1;
 
 #ifndef GROUPED_BACKWARD_V1_HEADDIM_SWITCH
-#define GROUPED_BACKWARD_V1_HEADDIM_SWITCH(HEAD_DIM1, HEAD_DIM2, ...) \
-  [&] {                                                               \
-    if (HEAD_DIM1 <= 32 && HEAD_DIM2 <= 32) {                         \
-      constexpr ck::index_t kGemm1NPerBlock = 32;                     \
-      constexpr ck::index_t kGemm1NXdlPerWave = 1;                    \
-      constexpr ck::index_t kCShuffleNXdlPerWavePerShuffle = 1;       \
-      using kCShuffleBlockTransferClusterLengths = S<1, 64, 1, 4>;    \
-      __VA_ARGS__();                                                  \
-    } else {                                                          \
-      constexpr ck::index_t kGemm1NPerBlock = 64;                     \
-      constexpr ck::index_t kGemm1NXdlPerWave = 2;                    \
-      constexpr ck::index_t kCShuffleNXdlPerWavePerShuffle = 2;       \
-      using kCShuffleBlockTransferClusterLengths = S<1, 32, 1, 8>;    \
-      __VA_ARGS__();                                                  \
-    };                                                                \
-  }()
+#define GROUPED_BACKWARD_V1_HEADDIM_SWITCH(HEAD_DIM1, HEAD_DIM2, ...)              \
+    [&] {                                                                          \
+        if(HEAD_DIM1 <= 32 && HEAD_DIM2 <= 32)                                     \
+        {                                                                          \
+            constexpr ck::index_t kGemm1NPerBlock                = 32;             \
+            constexpr ck::index_t kGemm1NXdlPerWave              = 1;              \
+            constexpr ck::index_t kCShuffleNXdlPerWavePerShuffle = 1;              \
+            using kCShuffleBlockTransferClusterLengths           = S<1, 64, 1, 4>; \
+            __VA_ARGS__();                                                         \
+        }                                                                          \
+        else                                                                       \
+        {                                                                          \
+            constexpr ck::index_t kGemm1NPerBlock                = 64;             \
+            constexpr ck::index_t kGemm1NXdlPerWave              = 2;              \
+            constexpr ck::index_t kCShuffleNXdlPerWavePerShuffle = 2;              \
+            using kCShuffleBlockTransferClusterLengths           = S<1, 32, 1, 8>; \
+            __VA_ARGS__();                                                         \
+        };                                                                         \
+    }()
 #endif
 
-  // clang-format off
+    // clang-format off
   template <
       ck::index_t kGemm1NPerBlock,
       ck::index_t kGemm1NXdlPerWave,
@@ -142,9 +144,9 @@ struct grouped_backward_masktype_attnbias_dispatched {
           kCShuffleBlockTransferScalarPerVector,
           MaskingSpec,
           Deterministic>;
-  // clang-format on
+    // clang-format on
 
-  // clang-format off
+    // clang-format off
   template <
       ck::index_t kGemm1NPerBlock,
       ck::index_t kGemm1NXdlPerWave,
@@ -223,296 +225,294 @@ struct grouped_backward_masktype_attnbias_dispatched {
           kCShuffleBlockTransferScalarPerVector,
           MaskingSpec,
           Deterministic>;
-  // clang-format on
-
-  static constexpr auto I1 = ck::Number<1>{};
-  static constexpr auto I2 = ck::Number<2>{};
-  static constexpr auto I3 = ck::Number<3>{};
-
-  static void Run(GroupedBackwardParams& param, hipStream_t stream) {
-    using ck::math::min;
-
-    if (param.K <= 64 && param.Kv <= 64) {
-      // compile-time constants which don't depend on head-dim switching
-      constexpr ck::index_t thread_slice_length_ak1 =
-          GemmOpConstantsGroupedBackward_V1::AK1 /
-          GemmOpConstantsGroupedBackward_V1::
-              ABlockTransferThreadClusterLengths_AK0_M_AK1::At(I2);
-      constexpr ck::index_t thread_slice_length_bk1 =
-          GemmOpConstantsGroupedBackward_V1::BK1 /
-          GemmOpConstantsGroupedBackward_V1::
-              BBlockTransferThreadClusterLengths_BK0_N_BK1::At(I2);
-
-      static_assert(
-          thread_slice_length_ak1 == thread_slice_length_bk1,
-          "ABlockTransfer and BBlockTransfer should use completely same K1 sizes and ThreadClusterLengths!");
-
-      constexpr ck::index_t kABBlockTransferSrcScalarPerVector_max =
-          min(2, thread_slice_length_ak1);
-
-      GROUPED_BACKWARD_V1_HEADDIM_SWITCH(param.K, param.Kv, [&] {
-        constexpr ck::index_t thread_slice_length_cshuflle_n =
-            (kCShuffleNXdlPerWavePerShuffle * kGemm1NPerBlock /
-             kGemm1NXdlPerWave) /
-            kCShuffleBlockTransferClusterLengths::At(I3);
-
-        constexpr ck::index_t kCShuffleBlockTransferScalarPerVector_max =
-            min(2, thread_slice_length_cshuflle_n);
-
-        ALIGN_SWITCH_2(
-            kABBlockTransferSrcScalarPerVector_max,
-            kABBlockTransferSrcScalarPerVector,
-            param.K,
-            kCShuffleBlockTransferScalarPerVector_max,
-            kCShuffleBlockTransferScalarPerVector,
-            param.Kv,
-            [&] {
-              using DeviceOpInstance = DeviceOpInstanceTemp_V1<
-                  kGemm1NPerBlock,
-                  kGemm1NXdlPerWave,
-                  kCShuffleNXdlPerWavePerShuffle,
-                  kCShuffleBlockTransferClusterLengths,
-                  kABBlockTransferSrcScalarPerVector,
-                  kCShuffleBlockTransferScalarPerVector>;
-
-              RunWithDeviceOp<DeviceOpInstance>(param, stream);
-            });
-      });
-    } else {
-      constexpr ck::index_t kGemm1NPerBlock = 128;
-      constexpr ck::index_t kGemm1NXdlPerWave = 4;
-      constexpr ck::index_t kCShuffleNXdlPerWavePerShuffle = 4;
-      using kCShuffleBlockTransferClusterLengths = S<1, 32, 1, 8>;
-
-      constexpr ck::index_t thread_slice_length_ak1 =
-          GemmOpConstantsGroupedBackward_V2::AK1 /
-          GemmOpConstantsGroupedBackward_V2::
-              ABlockTransferThreadClusterLengths_AK0_M_AK1::At(I2);
-      constexpr ck::index_t thread_slice_length_bk1 =
-          GemmOpConstantsGroupedBackward_V2::BK1 /
-          GemmOpConstantsGroupedBackward_V2::
-              BBlockTransferThreadClusterLengths_BK0_N_BK1::At(I2);
-
-      static_assert(
-          thread_slice_length_ak1 == thread_slice_length_bk1,
-          "ABlockTransfer and BBlockTransfer should use completely same K1 sizes and ThreadClusterLengths!");
-
-      constexpr ck::index_t kABBlockTransferSrcScalarPerVector_max =
-          min(2, thread_slice_length_ak1);
-
-      constexpr ck::index_t thread_slice_length_gemm1n = kGemm1NPerBlock /
-          GemmOpConstantsGroupedBackward_V2::
-              B1BlockTransferThreadClusterLengths_BK0_N_BK1::At(I1);
-      constexpr ck::index_t kB1BlockTransferSrcScalarPerVector_max =
-          min(2, thread_slice_length_gemm1n);
-
-      constexpr ck::index_t thread_slice_length_cshuflle_n =
-          (kCShuffleNXdlPerWavePerShuffle * kGemm1NPerBlock /
-           kGemm1NXdlPerWave) /
-          kCShuffleBlockTransferClusterLengths::At(I3);
-
-      constexpr ck::index_t kCShuffleBlockTransferScalarPerVector_max =
-          min(2, thread_slice_length_cshuflle_n);
-
-      if constexpr (
-          kB1BlockTransferSrcScalarPerVector_max >=
-          kCShuffleBlockTransferScalarPerVector_max) {
-        ALIGN_SWITCH_2(
-            kABBlockTransferSrcScalarPerVector_max,
-            kABBlockTransferSrcScalarPerVector,
-            param.K,
-            kB1BlockTransferSrcScalarPerVector_max,
-            kB1BlockTransferSrcScalarPerVector,
-            param.Kv,
-            [&] {
-              constexpr ck::index_t kCShuffleBlockTransferScalarPerVector =
-                  min(kB1BlockTransferSrcScalarPerVector,
-                      kCShuffleBlockTransferScalarPerVector_max);
-              using DeviceOpInstance = DeviceOpInstanceTemp_V2<
-                  kGemm1NPerBlock,
-                  kGemm1NXdlPerWave,
-                  kCShuffleNXdlPerWavePerShuffle,
-                  kCShuffleBlockTransferClusterLengths,
-                  kABBlockTransferSrcScalarPerVector,
-                  kB1BlockTransferSrcScalarPerVector,
-                  kCShuffleBlockTransferScalarPerVector>;
-
-              RunWithDeviceOp<DeviceOpInstance>(param, stream);
+    // clang-format on
+
+    static constexpr auto I1 = ck::Number<1>{};
+    static constexpr auto I2 = ck::Number<2>{};
+    static constexpr auto I3 = ck::Number<3>{};
+
+    static void Run(GroupedBackwardParams& param, hipStream_t stream)
+    {
+        using ck::math::min;
+
+        if(param.K <= 64 && param.Kv <= 64)
+        {
+            // compile-time constants which don't depend on head-dim switching
+            constexpr ck::index_t thread_slice_length_ak1 =
+                GemmOpConstantsGroupedBackward_V1::AK1 /
+                GemmOpConstantsGroupedBackward_V1::ABlockTransferThreadClusterLengths_AK0_M_AK1::At(
+                    I2);
+            constexpr ck::index_t thread_slice_length_bk1 =
+                GemmOpConstantsGroupedBackward_V1::BK1 /
+                GemmOpConstantsGroupedBackward_V1::BBlockTransferThreadClusterLengths_BK0_N_BK1::At(
+                    I2);
+
+            static_assert(thread_slice_length_ak1 == thread_slice_length_bk1,
+                          "ABlockTransfer and BBlockTransfer should use completely same K1 sizes "
+                          "and ThreadClusterLengths!");
+
+            constexpr ck::index_t kABBlockTransferSrcScalarPerVector_max =
+                min(2, thread_slice_length_ak1);
+
+            GROUPED_BACKWARD_V1_HEADDIM_SWITCH(param.K, param.Kv, [&] {
+                constexpr ck::index_t thread_slice_length_cshuflle_n =
+                    (kCShuffleNXdlPerWavePerShuffle * kGemm1NPerBlock / kGemm1NXdlPerWave) /
+                    kCShuffleBlockTransferClusterLengths::At(I3);
+
+                constexpr ck::index_t kCShuffleBlockTransferScalarPerVector_max =
+                    min(2, thread_slice_length_cshuflle_n);
+
+                ALIGN_SWITCH_2(
+                    kABBlockTransferSrcScalarPerVector_max,
+                    kABBlockTransferSrcScalarPerVector,
+                    param.K,
+                    kCShuffleBlockTransferScalarPerVector_max,
+                    kCShuffleBlockTransferScalarPerVector,
+                    param.Kv,
+                    [&] {
+                        using DeviceOpInstance =
+                            DeviceOpInstanceTemp_V1<kGemm1NPerBlock,
+                                                    kGemm1NXdlPerWave,
+                                                    kCShuffleNXdlPerWavePerShuffle,
+                                                    kCShuffleBlockTransferClusterLengths,
+                                                    kABBlockTransferSrcScalarPerVector,
+                                                    kCShuffleBlockTransferScalarPerVector>;
+
+                        RunWithDeviceOp<DeviceOpInstance>(param, stream);
+                    });
             });
-      } else {
-        ALIGN_SWITCH_2(
-            kABBlockTransferSrcScalarPerVector_max,
-            kABBlockTransferSrcScalarPerVector,
-            param.K,
-            kCShuffleBlockTransferScalarPerVector_max,
-            kCShuffleBlockTransferScalarPerVector,
-            param.Kv,
-            [&] {
-              constexpr ck::index_t kB1BlockTransferSrcScalarPerVector =
-                  min(kCShuffleBlockTransferScalarPerVector,
-                      kB1BlockTransferSrcScalarPerVector_max);
-              using DeviceOpInstance = DeviceOpInstanceTemp_V2<
-                  kGemm1NPerBlock,
-                  kGemm1NXdlPerWave,
-                  kCShuffleNXdlPerWavePerShuffle,
-                  kCShuffleBlockTransferClusterLengths,
-                  kABBlockTransferSrcScalarPerVector,
-                  kB1BlockTransferSrcScalarPerVector,
-                  kCShuffleBlockTransferScalarPerVector>;
-
-              RunWithDeviceOp<DeviceOpInstance>(param, stream);
+        }
+        else
+        {
+            constexpr ck::index_t kGemm1NPerBlock                = 128;
+            constexpr ck::index_t kGemm1NXdlPerWave              = 4;
+            constexpr ck::index_t kCShuffleNXdlPerWavePerShuffle = 4;
+            using kCShuffleBlockTransferClusterLengths           = S<1, 32, 1, 8>;
+
+            constexpr ck::index_t thread_slice_length_ak1 =
+                GemmOpConstantsGroupedBackward_V2::AK1 /
+                GemmOpConstantsGroupedBackward_V2::ABlockTransferThreadClusterLengths_AK0_M_AK1::At(
+                    I2);
+            constexpr ck::index_t thread_slice_length_bk1 =
+                GemmOpConstantsGroupedBackward_V2::BK1 /
+                GemmOpConstantsGroupedBackward_V2::BBlockTransferThreadClusterLengths_BK0_N_BK1::At(
+                    I2);
+
+            static_assert(thread_slice_length_ak1 == thread_slice_length_bk1,
+                          "ABlockTransfer and BBlockTransfer should use completely same K1 sizes "
+                          "and ThreadClusterLengths!");
+
+            constexpr ck::index_t kABBlockTransferSrcScalarPerVector_max =
+                min(2, thread_slice_length_ak1);
+
+            constexpr ck::index_t thread_slice_length_gemm1n =
+                kGemm1NPerBlock / GemmOpConstantsGroupedBackward_V2::
+                                      B1BlockTransferThreadClusterLengths_BK0_N_BK1::At(I1);
+            constexpr ck::index_t kB1BlockTransferSrcScalarPerVector_max =
+                min(2, thread_slice_length_gemm1n);
+
+            constexpr ck::index_t thread_slice_length_cshuflle_n =
+                (kCShuffleNXdlPerWavePerShuffle * kGemm1NPerBlock / kGemm1NXdlPerWave) /
+                kCShuffleBlockTransferClusterLengths::At(I3);
+
+            constexpr ck::index_t kCShuffleBlockTransferScalarPerVector_max =
+                min(2, thread_slice_length_cshuflle_n);
+
+            if constexpr(kB1BlockTransferSrcScalarPerVector_max >=
+                         kCShuffleBlockTransferScalarPerVector_max)
+            {
+                ALIGN_SWITCH_2(
+                    kABBlockTransferSrcScalarPerVector_max,
+                    kABBlockTransferSrcScalarPerVector,
+                    param.K,
+                    kB1BlockTransferSrcScalarPerVector_max,
+                    kB1BlockTransferSrcScalarPerVector,
+                    param.Kv,
+                    [&] {
+                        constexpr ck::index_t kCShuffleBlockTransferScalarPerVector =
+                            min(kB1BlockTransferSrcScalarPerVector,
+                                kCShuffleBlockTransferScalarPerVector_max);
+                        using DeviceOpInstance =
+                            DeviceOpInstanceTemp_V2<kGemm1NPerBlock,
+                                                    kGemm1NXdlPerWave,
+                                                    kCShuffleNXdlPerWavePerShuffle,
+                                                    kCShuffleBlockTransferClusterLengths,
+                                                    kABBlockTransferSrcScalarPerVector,
+                                                    kB1BlockTransferSrcScalarPerVector,
+                                                    kCShuffleBlockTransferScalarPerVector>;
+
+                        RunWithDeviceOp<DeviceOpInstance>(param, stream);
+                    });
+            }
+            else
+            {
+                ALIGN_SWITCH_2(
+                    kABBlockTransferSrcScalarPerVector_max,
+                    kABBlockTransferSrcScalarPerVector,
+                    param.K,
+                    kCShuffleBlockTransferScalarPerVector_max,
+                    kCShuffleBlockTransferScalarPerVector,
+                    param.Kv,
+                    [&] {
+                        constexpr ck::index_t kB1BlockTransferSrcScalarPerVector =
+                            min(kCShuffleBlockTransferScalarPerVector,
+                                kB1BlockTransferSrcScalarPerVector_max);
+                        using DeviceOpInstance =
+                            DeviceOpInstanceTemp_V2<kGemm1NPerBlock,
+                                                    kGemm1NXdlPerWave,
+                                                    kCShuffleNXdlPerWavePerShuffle,
+                                                    kCShuffleBlockTransferClusterLengths,
+                                                    kABBlockTransferSrcScalarPerVector,
+                                                    kB1BlockTransferSrcScalarPerVector,
+                                                    kCShuffleBlockTransferScalarPerVector>;
+
+                        RunWithDeviceOp<DeviceOpInstance>(param, stream);
+                    });
+            };
+        };
+    };
+
+    template <typename DeviceOpInstance>
+    static void RunWithDeviceOp(GroupedBackwardParams& param, hipStream_t stream)
+    {
+        // Tunables
+        std::vector<typename DeviceOpInstance::ProblemDesc> problem_descs;
+
+        for(std::size_t i = 0; i < param.num_batches; i++)
+        {
+            int M = param.host_seqstart_q[i + 1] - param.host_seqstart_q[i]; // seqlen Q
+            int N = param.host_seqlen_k.empty()
+                        ? param.host_seqstart_k[i + 1] - param.host_seqstart_k[i]
+                        : param.host_seqlen_k[i];
+            int K    = param.K;
+            int Kv   = param.Kv;
+            int G1q  = param.Hq;
+            int G1kv = param.Hkv;
+
+            std::vector<ck::index_t> q_gs_ms_ks_lengths{1, G1q, M, K};
+            std::vector<ck::index_t> q_gs_ms_ks_strides{
+                0, param.q_strides[1], param.q_strides[0], param.q_strides[2]};
+
+            std::vector<ck::index_t> k_gs_ns_ks_lengths{1, G1kv, N, K};
+            std::vector<ck::index_t> k_gs_ns_ks_strides{
+                0, param.k_strides[1], param.k_strides[0], param.k_strides[2]};
+
+            std::vector<ck::index_t> kgrad_gs_ns_ks_lengths = {1, G1q, N, K};
+            std::vector<ck::index_t> kgrad_gs_ns_ks_strides = {0,
+                                                               param.tmp_grad_k_strides[1],
+                                                               param.tmp_grad_k_strides[0],
+                                                               param.tmp_grad_k_strides[2]};
+
+            // to be changed to v_gs_ns_os_lengths
+            std::vector<ck::index_t> v_gs_os_ns_lengths{1, G1kv, Kv, N};
+            std::vector<ck::index_t> v_gs_os_ns_strides{
+                0, param.v_strides[1], param.v_strides[2], param.v_strides[0]};
+
+            std::vector<ck::index_t> vgrad_gs_os_ns_lengths = {1, G1q, Kv, N};
+            std::vector<ck::index_t> vgrad_gs_os_ns_strides = {0,
+                                                               param.tmp_grad_v_strides[1],
+                                                               param.tmp_grad_v_strides[2],
+                                                               param.tmp_grad_v_strides[0]};
+
+            std::vector<ck::index_t> y_gs_ms_os_lengths{1, G1q, M, Kv};
+            std::vector<ck::index_t> y_gs_ms_os_strides{
+                0, param.out_strides[1], param.out_strides[0], param.out_strides[2]};
+
+            std::vector<ck::index_t> lse_gs_ms_lengths{1, G1q, M};
+            std::vector<ck::index_t> lse_gs_ms_strides{0, param.max_seqlen_q, 1};
+
+            std::vector<ck::index_t> d_gs_ms_ns_lengths;
+            std::vector<ck::index_t> d_gs_ms_ns_strides;
+
+            if constexpr(has_attn_bias)
+            {
+                d_gs_ms_ns_lengths = {1, G1q, M, N};
+                d_gs_ms_ns_strides = {0,
+                                      param.attn_bias_strides[0],
+                                      param.attn_bias_strides[1],
+                                      param.attn_bias_strides[2]};
+            }
+            else
+            {
+                d_gs_ms_ns_lengths = {1, 1, 1, 1};
+                d_gs_ms_ns_strides = {0, 0, 0, 0};
+            };
+
+            problem_descs.push_back({
+                q_gs_ms_ks_lengths, // q, dQ should have same shape
+                q_gs_ms_ks_strides,
+                k_gs_ns_ks_lengths, // k, dK should have same shape
+                k_gs_ns_ks_strides,
+                {1, 1, 1, 1},
+                {0, 0, 0, 0},
+                v_gs_os_ns_lengths, // v, dV should have same shape
+                v_gs_os_ns_strides,
+                y_gs_ms_os_lengths, // y, dY should have same shape
+                y_gs_ms_os_strides,
+                lse_gs_ms_lengths,
+                lse_gs_ms_strides,
+                param.is_mqa_gqa ? kgrad_gs_ns_ks_lengths : k_gs_ns_ks_lengths,
+                param.is_mqa_gqa ? kgrad_gs_ns_ks_strides : k_gs_ns_ks_strides,
+                param.is_mqa_gqa ? vgrad_gs_os_ns_lengths : v_gs_os_ns_lengths,
+                param.is_mqa_gqa ? vgrad_gs_os_ns_strides : v_gs_os_ns_strides,
+                d_gs_ms_ns_lengths, // bias, grad_bias should have same shape
+                d_gs_ms_ns_strides,
+                {}, // acc1_biases_gs_ms_os_lengths
+                {}, // acc1_biases_gs_ms_os_strides
             });
-      };
+        }
+
+        float alpha = param.scale;
+
+        auto op      = DeviceOpInstance{};
+        auto invoker = op.MakeInvoker();
+
+        auto arg_ptr = op.MakeArgumentPointer(
+            param.q_ptrs,
+            param.k_ptrs,
+            param.randvals_ptrs,
+            param.v_ptrs,
+            param.out_ptrs,
+            param.logsumexp_ptrs,
+            param.grad_out_ptrs,
+            param.grad_q_ptrs,
+            param.grad_k_ptrs,
+            param.grad_v_ptrs,
+            param.attn_bias_ptrs,
+            {}, // p_acc1_bias_vec;
+            param.grad_bias_ptrs,
+            {},
+            problem_descs,
+            QKVElementOp{},
+            QKVElementOp{},
+            Scale{alpha},
+            QKVElementOp{},
+            YElementOp{},
+            param.dropout_prob,
+            std::tuple<int64_t, int64_t>(param.philox_seed, param.philox_offset));
+
+        SimpleDeviceMem workspace(op.GetWorkSpaceSize(arg_ptr.get()));
+
+        op.SetWorkSpacePointer(arg_ptr.get(), workspace.GetDeviceBuffer());
+
+        if(!op.IsSupportedArgument(arg_ptr.get()))
+        {
+            std::ostringstream ostr;
+            ostr << op.GetTypeString() << " does not support this problem";
+
+            throw std::runtime_error(ostr.str());
+        }
+
+        (void)invoker.Run(arg_ptr.get(), StreamConfig{stream, false});
     };
-  };
-
-  template <typename DeviceOpInstance>
-  static void RunWithDeviceOp(
-      GroupedBackwardParams& param,
-      hipStream_t stream) {
-    // Tunables
-    std::vector<typename DeviceOpInstance::ProblemDesc> problem_descs;
-
-    for (std::size_t i = 0; i < param.num_batches; i++) {
-      int M =
-          param.host_seqstart_q[i + 1] - param.host_seqstart_q[i]; // seqlen Q
-      int N = param.host_seqlen_k.empty()
-          ? param.host_seqstart_k[i + 1] - param.host_seqstart_k[i]
-          : param.host_seqlen_k[i];
-      int K = param.K;
-      int Kv = param.Kv;
-      int G1q = param.Hq;
-      int G1kv = param.Hkv;
-
-      std::vector<ck::index_t> q_gs_ms_ks_lengths{1, G1q, M, K};
-      std::vector<ck::index_t> q_gs_ms_ks_strides{
-          0, param.q_strides[1], param.q_strides[0], param.q_strides[2]};
-
-      std::vector<ck::index_t> k_gs_ns_ks_lengths{1, G1kv, N, K};
-      std::vector<ck::index_t> k_gs_ns_ks_strides{
-          0, param.k_strides[1], param.k_strides[0], param.k_strides[2]};
-
-      std::vector<ck::index_t> kgrad_gs_ns_ks_lengths = {1, G1q, N, K};
-      std::vector<ck::index_t> kgrad_gs_ns_ks_strides = {
-          0,
-          param.tmp_grad_k_strides[1],
-          param.tmp_grad_k_strides[0],
-          param.tmp_grad_k_strides[2]};
-
-      // to be changed to v_gs_ns_os_lengths
-      std::vector<ck::index_t> v_gs_os_ns_lengths{1, G1kv, Kv, N};
-      std::vector<ck::index_t> v_gs_os_ns_strides{
-          0, param.v_strides[1], param.v_strides[2], param.v_strides[0]};
-
-      std::vector<ck::index_t> vgrad_gs_os_ns_lengths = {1, G1q, Kv, N};
-      std::vector<ck::index_t> vgrad_gs_os_ns_strides = {
-          0,
-          param.tmp_grad_v_strides[1],
-          param.tmp_grad_v_strides[2],
-          param.tmp_grad_v_strides[0]};
-
-      std::vector<ck::index_t> y_gs_ms_os_lengths{1, G1q, M, Kv};
-      std::vector<ck::index_t> y_gs_ms_os_strides{
-          0, param.out_strides[1], param.out_strides[0], param.out_strides[2]};
-
-      std::vector<ck::index_t> lse_gs_ms_lengths{1, G1q, M};
-      std::vector<ck::index_t> lse_gs_ms_strides{0, param.max_seqlen_q, 1};
-
-      std::vector<ck::index_t> d_gs_ms_ns_lengths;
-      std::vector<ck::index_t> d_gs_ms_ns_strides;
-
-      if constexpr (has_attn_bias) {
-        d_gs_ms_ns_lengths = {1, G1q, M, N};
-        d_gs_ms_ns_strides = {
-            0,
-            param.attn_bias_strides[0],
-            param.attn_bias_strides[1],
-            param.attn_bias_strides[2]};
-
-      } else {
-        d_gs_ms_ns_lengths = {1, 1, 1, 1};
-        d_gs_ms_ns_strides = {0, 0, 0, 0};
-      };
-
-      problem_descs.push_back({
-          q_gs_ms_ks_lengths, // q, dQ should have same shape
-          q_gs_ms_ks_strides,
-          k_gs_ns_ks_lengths, // k, dK should have same shape
-          k_gs_ns_ks_strides,
-          {1, 1, 1, 1},
-          {0, 0, 0, 0},
-          v_gs_os_ns_lengths, // v, dV should have same shape
-          v_gs_os_ns_strides,
-          y_gs_ms_os_lengths, // y, dY should have same shape
-          y_gs_ms_os_strides,
-          lse_gs_ms_lengths,
-          lse_gs_ms_strides,
-          param.is_mqa_gqa ? kgrad_gs_ns_ks_lengths : k_gs_ns_ks_lengths,
-          param.is_mqa_gqa ? kgrad_gs_ns_ks_strides : k_gs_ns_ks_strides,
-          param.is_mqa_gqa ? vgrad_gs_os_ns_lengths : v_gs_os_ns_lengths,
-          param.is_mqa_gqa ? vgrad_gs_os_ns_strides : v_gs_os_ns_strides,
-          d_gs_ms_ns_lengths, // bias, grad_bias should have same shape
-          d_gs_ms_ns_strides,
-          {}, // acc1_biases_gs_ms_os_lengths
-          {}, // acc1_biases_gs_ms_os_strides
-      });
-    }
-
-    float alpha = param.scale;
-
-    auto op = DeviceOpInstance{};
-    auto invoker = op.MakeInvoker();
-
-    auto arg_ptr = op.MakeArgumentPointer(
-        param.q_ptrs,
-        param.k_ptrs,
-        param.randvals_ptrs,
-        param.v_ptrs,
-        param.out_ptrs,
-        param.logsumexp_ptrs,
-        param.grad_out_ptrs,
-        param.grad_q_ptrs,
-        param.grad_k_ptrs,
-        param.grad_v_ptrs,
-        param.attn_bias_ptrs,
-        {}, // p_acc1_bias_vec;
-        param.grad_bias_ptrs,
-        {},
-        problem_descs,
-        QKVElementOp{},
-        QKVElementOp{},
-        Scale{alpha},
-        QKVElementOp{},
-        YElementOp{},
-        param.dropout_prob,
-        std::tuple<int64_t, int64_t>(param.philox_seed, param.philox_offset));
-
-    SimpleDeviceMem workspace(op.GetWorkSpaceSize(arg_ptr.get()));
-
-    op.SetWorkSpacePointer(arg_ptr.get(), workspace.GetDeviceBuffer());
-
-    if (!op.IsSupportedArgument(arg_ptr.get())) {
-      std::ostringstream ostr;
-      ostr << op.GetTypeString() << " does not support this problem";
-
-      throw std::runtime_error(ostr.str());
-    }
-
-    (void)invoker.Run(arg_ptr.get(), StreamConfig{stream, false});
-  };
 };
 
-template <
-    typename scalar_t,
-    int32_t custom_mask_type,
-    bool has_attn_bias,
-    bool use_fp32_qkv_grad>
-void run_grouped_backward_masktype_attnbias_dispatched(
-    GroupedBackwardParams& param,
-    hipStream_t stream) {
-  grouped_backward_masktype_attnbias_dispatched<
-      scalar_t,
-      custom_mask_type,
-      has_attn_bias,
-      use_fp32_qkv_grad>::Run(param, stream);
+template <typename scalar_t, int32_t custom_mask_type, bool has_attn_bias, bool use_fp32_qkv_grad>
+void run_grouped_backward_masktype_attnbias_dispatched(GroupedBackwardParams& param,
+                                                       hipStream_t stream)
+{
+    grouped_backward_masktype_attnbias_dispatched<scalar_t,
+                                                  custom_mask_type,
+                                                  has_attn_bias,
+                                                  use_fp32_qkv_grad>::Run(param, stream);
 };
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_backward_bp16.cpp b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_backward_bp16.cpp
index 89a73b3d1..7d4458899 100644
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_backward_bp16.cpp
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_backward_bp16.cpp
@@ -1,107 +1,80 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
 #include <ck/ck.hpp>
 #include <stdexcept>
 
 #include "ck_bool_switch.h"
 #include "ck_fmha_grouped_backward.h"
 
-extern template void run_grouped_backward_masktype_attnbias_dispatched<
-    ck::bhalf_t,
-    0,
-    true,
-    true>(GroupedBackwardParams& param, hipStream_t stream);
+extern template void run_grouped_backward_masktype_attnbias_dispatched<ck::bhalf_t, 0, true, true>(
+    GroupedBackwardParams& param, hipStream_t stream);
 
-extern template void run_grouped_backward_masktype_attnbias_dispatched<
-    ck::bhalf_t,
-    0,
-    true,
-    false>(GroupedBackwardParams& param, hipStream_t stream);
+extern template void run_grouped_backward_masktype_attnbias_dispatched<ck::bhalf_t, 0, true, false>(
+    GroupedBackwardParams& param, hipStream_t stream);
 
-extern template void run_grouped_backward_masktype_attnbias_dispatched<
-    ck::bhalf_t,
-    0,
-    false,
-    true>(GroupedBackwardParams& param, hipStream_t stream);
+extern template void run_grouped_backward_masktype_attnbias_dispatched<ck::bhalf_t, 0, false, true>(
+    GroupedBackwardParams& param, hipStream_t stream);
 
-extern template void run_grouped_backward_masktype_attnbias_dispatched<
-    ck::bhalf_t,
-    0,
-    false,
-    false>(GroupedBackwardParams& param, hipStream_t stream);
+extern template void
+run_grouped_backward_masktype_attnbias_dispatched<ck::bhalf_t, 0, false, false>(
+    GroupedBackwardParams& param, hipStream_t stream);
 
-extern template void run_grouped_backward_masktype_attnbias_dispatched<
-    ck::bhalf_t,
-    1,
-    true,
-    true>(GroupedBackwardParams& param, hipStream_t stream);
+extern template void run_grouped_backward_masktype_attnbias_dispatched<ck::bhalf_t, 1, true, true>(
+    GroupedBackwardParams& param, hipStream_t stream);
 
-extern template void run_grouped_backward_masktype_attnbias_dispatched<
-    ck::bhalf_t,
-    1,
-    true,
-    false>(GroupedBackwardParams& param, hipStream_t stream);
+extern template void run_grouped_backward_masktype_attnbias_dispatched<ck::bhalf_t, 1, true, false>(
+    GroupedBackwardParams& param, hipStream_t stream);
 
-extern template void run_grouped_backward_masktype_attnbias_dispatched<
-    ck::bhalf_t,
-    1,
-    false,
-    true>(GroupedBackwardParams& param, hipStream_t stream);
+extern template void run_grouped_backward_masktype_attnbias_dispatched<ck::bhalf_t, 1, false, true>(
+    GroupedBackwardParams& param, hipStream_t stream);
 
-extern template void run_grouped_backward_masktype_attnbias_dispatched<
-    ck::bhalf_t,
-    1,
-    false,
-    false>(GroupedBackwardParams& param, hipStream_t stream);
+extern template void
+run_grouped_backward_masktype_attnbias_dispatched<ck::bhalf_t, 1, false, false>(
+    GroupedBackwardParams& param, hipStream_t stream);
 
-extern template void run_grouped_backward_masktype_attnbias_dispatched<
-    ck::bhalf_t,
-    2,
-    true,
-    true>(GroupedBackwardParams& param, hipStream_t stream);
+extern template void run_grouped_backward_masktype_attnbias_dispatched<ck::bhalf_t, 2, true, true>(
+    GroupedBackwardParams& param, hipStream_t stream);
 
-extern template void run_grouped_backward_masktype_attnbias_dispatched<
-    ck::bhalf_t,
-    2,
-    true,
-    false>(GroupedBackwardParams& param, hipStream_t stream);
+extern template void run_grouped_backward_masktype_attnbias_dispatched<ck::bhalf_t, 2, true, false>(
+    GroupedBackwardParams& param, hipStream_t stream);
 
-extern template void run_grouped_backward_masktype_attnbias_dispatched<
-    ck::bhalf_t,
-    2,
-    false,
-    true>(GroupedBackwardParams& param, hipStream_t stream);
+extern template void run_grouped_backward_masktype_attnbias_dispatched<ck::bhalf_t, 2, false, true>(
+    GroupedBackwardParams& param, hipStream_t stream);
 
-extern template void run_grouped_backward_masktype_attnbias_dispatched<
-    ck::bhalf_t,
-    2,
-    false,
-    false>(GroupedBackwardParams& param, hipStream_t stream);
+extern template void
+run_grouped_backward_masktype_attnbias_dispatched<ck::bhalf_t, 2, false, false>(
+    GroupedBackwardParams& param, hipStream_t stream);
 
-void grouped_backward_bp16(GroupedBackwardParams& param, hipStream_t stream) {
-  BOOL_SWITCH_2(
-      param.has_attn_bias,
-      HAS_ATTN_BIAS,
-      param.use_fp32_qkv_grad,
-      USE_FP32_QKV_GRAD,
-      [&] {
-        if (param.custom_mask_type == 0) {
-          run_grouped_backward_masktype_attnbias_dispatched<
-              ck::bhalf_t,
-              0,
-              HAS_ATTN_BIAS,
-              USE_FP32_QKV_GRAD>(param, stream);
-        } else if (param.custom_mask_type == 1) {
-          run_grouped_backward_masktype_attnbias_dispatched<
-              ck::bhalf_t,
-              1,
-              HAS_ATTN_BIAS,
-              USE_FP32_QKV_GRAD>(param, stream);
-        } else if (param.custom_mask_type == 2) {
-          run_grouped_backward_masktype_attnbias_dispatched<
-              ck::bhalf_t,
-              2,
-              HAS_ATTN_BIAS,
-              USE_FP32_QKV_GRAD>(param, stream);
-        } else
-          throw std::runtime_error("Invalid custom_mask_type value");
-      });
+void grouped_backward_bp16(GroupedBackwardParams& param, hipStream_t stream)
+{
+    BOOL_SWITCH_2(
+        param.has_attn_bias, HAS_ATTN_BIAS, param.use_fp32_qkv_grad, USE_FP32_QKV_GRAD, [&] {
+            if(param.custom_mask_type == 0)
+            {
+                run_grouped_backward_masktype_attnbias_dispatched<ck::bhalf_t,
+                                                                  0,
+                                                                  HAS_ATTN_BIAS,
+                                                                  USE_FP32_QKV_GRAD>(param, stream);
+            }
+            else if(param.custom_mask_type == 1)
+            {
+                run_grouped_backward_masktype_attnbias_dispatched<ck::bhalf_t,
+                                                                  1,
+                                                                  HAS_ATTN_BIAS,
+                                                                  USE_FP32_QKV_GRAD>(param, stream);
+            }
+            else if(param.custom_mask_type == 2)
+            {
+                run_grouped_backward_masktype_attnbias_dispatched<ck::bhalf_t,
+                                                                  2,
+                                                                  HAS_ATTN_BIAS,
+                                                                  USE_FP32_QKV_GRAD>(param, stream);
+            }
+            else
+                throw std::runtime_error("Invalid custom_mask_type value");
+        });
 };
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_backward_fp16.cpp b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_backward_fp16.cpp
index c0e35f63d..a89291891 100644
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_backward_fp16.cpp
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_backward_fp16.cpp
@@ -1,107 +1,77 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
 #include <ck/ck.hpp>
 #include <stdexcept>
 
 #include "ck_bool_switch.h"
 #include "ck_fmha_grouped_backward.h"
 
-extern template void run_grouped_backward_masktype_attnbias_dispatched<
-    ck::half_t,
-    0,
-    true,
-    true>(GroupedBackwardParams& param, hipStream_t stream);
+extern template void run_grouped_backward_masktype_attnbias_dispatched<ck::half_t, 0, true, true>(
+    GroupedBackwardParams& param, hipStream_t stream);
 
-extern template void run_grouped_backward_masktype_attnbias_dispatched<
-    ck::half_t,
-    0,
-    true,
-    false>(GroupedBackwardParams& param, hipStream_t stream);
+extern template void run_grouped_backward_masktype_attnbias_dispatched<ck::half_t, 0, true, false>(
+    GroupedBackwardParams& param, hipStream_t stream);
 
-extern template void run_grouped_backward_masktype_attnbias_dispatched<
-    ck::half_t,
-    0,
-    false,
-    true>(GroupedBackwardParams& param, hipStream_t stream);
+extern template void run_grouped_backward_masktype_attnbias_dispatched<ck::half_t, 0, false, true>(
+    GroupedBackwardParams& param, hipStream_t stream);
 
-extern template void run_grouped_backward_masktype_attnbias_dispatched<
-    ck::half_t,
-    0,
-    false,
-    false>(GroupedBackwardParams& param, hipStream_t stream);
+extern template void run_grouped_backward_masktype_attnbias_dispatched<ck::half_t, 0, false, false>(
+    GroupedBackwardParams& param, hipStream_t stream);
 
-extern template void run_grouped_backward_masktype_attnbias_dispatched<
-    ck::half_t,
-    1,
-    true,
-    true>(GroupedBackwardParams& param, hipStream_t stream);
+extern template void run_grouped_backward_masktype_attnbias_dispatched<ck::half_t, 1, true, true>(
+    GroupedBackwardParams& param, hipStream_t stream);
 
-extern template void run_grouped_backward_masktype_attnbias_dispatched<
-    ck::half_t,
-    1,
-    true,
-    false>(GroupedBackwardParams& param, hipStream_t stream);
+extern template void run_grouped_backward_masktype_attnbias_dispatched<ck::half_t, 1, true, false>(
+    GroupedBackwardParams& param, hipStream_t stream);
 
-extern template void run_grouped_backward_masktype_attnbias_dispatched<
-    ck::half_t,
-    1,
-    false,
-    true>(GroupedBackwardParams& param, hipStream_t stream);
+extern template void run_grouped_backward_masktype_attnbias_dispatched<ck::half_t, 1, false, true>(
+    GroupedBackwardParams& param, hipStream_t stream);
 
-extern template void run_grouped_backward_masktype_attnbias_dispatched<
-    ck::half_t,
-    1,
-    false,
-    false>(GroupedBackwardParams& param, hipStream_t stream);
+extern template void run_grouped_backward_masktype_attnbias_dispatched<ck::half_t, 1, false, false>(
+    GroupedBackwardParams& param, hipStream_t stream);
 
-extern template void run_grouped_backward_masktype_attnbias_dispatched<
-    ck::half_t,
-    2,
-    true,
-    true>(GroupedBackwardParams& param, hipStream_t stream);
+extern template void run_grouped_backward_masktype_attnbias_dispatched<ck::half_t, 2, true, true>(
+    GroupedBackwardParams& param, hipStream_t stream);
 
-extern template void run_grouped_backward_masktype_attnbias_dispatched<
-    ck::half_t,
-    2,
-    true,
-    false>(GroupedBackwardParams& param, hipStream_t stream);
+extern template void run_grouped_backward_masktype_attnbias_dispatched<ck::half_t, 2, true, false>(
+    GroupedBackwardParams& param, hipStream_t stream);
 
-extern template void run_grouped_backward_masktype_attnbias_dispatched<
-    ck::half_t,
-    2,
-    false,
-    true>(GroupedBackwardParams& param, hipStream_t stream);
+extern template void run_grouped_backward_masktype_attnbias_dispatched<ck::half_t, 2, false, true>(
+    GroupedBackwardParams& param, hipStream_t stream);
 
-extern template void run_grouped_backward_masktype_attnbias_dispatched<
-    ck::half_t,
-    2,
-    false,
-    false>(GroupedBackwardParams& param, hipStream_t stream);
+extern template void run_grouped_backward_masktype_attnbias_dispatched<ck::half_t, 2, false, false>(
+    GroupedBackwardParams& param, hipStream_t stream);
 
-void grouped_backward_fp16(GroupedBackwardParams& param, hipStream_t stream) {
-  BOOL_SWITCH_2(
-      param.has_attn_bias,
-      HAS_ATTN_BIAS,
-      param.use_fp32_qkv_grad,
-      USE_FP32_QKV_GRAD,
-      [&] {
-        if (param.custom_mask_type == 0) {
-          run_grouped_backward_masktype_attnbias_dispatched<
-              ck::half_t,
-              0,
-              HAS_ATTN_BIAS,
-              USE_FP32_QKV_GRAD>(param, stream);
-        } else if (param.custom_mask_type == 1) {
-          run_grouped_backward_masktype_attnbias_dispatched<
-              ck::half_t,
-              1,
-              HAS_ATTN_BIAS,
-              USE_FP32_QKV_GRAD>(param, stream);
-        } else if (param.custom_mask_type == 2) {
-          run_grouped_backward_masktype_attnbias_dispatched<
-              ck::half_t,
-              2,
-              HAS_ATTN_BIAS,
-              USE_FP32_QKV_GRAD>(param, stream);
-        } else
-          throw std::runtime_error("Invalid custom_mask_type value");
-      });
+void grouped_backward_fp16(GroupedBackwardParams& param, hipStream_t stream)
+{
+    BOOL_SWITCH_2(
+        param.has_attn_bias, HAS_ATTN_BIAS, param.use_fp32_qkv_grad, USE_FP32_QKV_GRAD, [&] {
+            if(param.custom_mask_type == 0)
+            {
+                run_grouped_backward_masktype_attnbias_dispatched<ck::half_t,
+                                                                  0,
+                                                                  HAS_ATTN_BIAS,
+                                                                  USE_FP32_QKV_GRAD>(param, stream);
+            }
+            else if(param.custom_mask_type == 1)
+            {
+                run_grouped_backward_masktype_attnbias_dispatched<ck::half_t,
+                                                                  1,
+                                                                  HAS_ATTN_BIAS,
+                                                                  USE_FP32_QKV_GRAD>(param, stream);
+            }
+            else if(param.custom_mask_type == 2)
+            {
+                run_grouped_backward_masktype_attnbias_dispatched<ck::half_t,
+                                                                  2,
+                                                                  HAS_ATTN_BIAS,
+                                                                  USE_FP32_QKV_GRAD>(param, stream);
+            }
+            else
+                throw std::runtime_error("Invalid custom_mask_type value");
+        });
 };
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_forward.h b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_forward.h
index 00c92682b..997b92dd6 100644
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_forward.h
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_forward.h
@@ -1,3 +1,9 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
 #pragma once
 
 #include <sstream>
@@ -18,59 +24,62 @@
 #include "ck_fmha_params.h"
 
 template <typename scalar_t, int32_t custom_mask_type, bool has_attn_bias>
-struct grouped_forward_masktype_attnbias_dispatched {
-  using PassThrough = ck::tensor_operation::element_wise::PassThrough;
-
-  using GemmDataType = scalar_t;
-  using ADataType = scalar_t;
-  using B0DataType = scalar_t;
-  using B1DataType = scalar_t;
-  using AccDataType = F32;
-  using CShuffleDataType = F32;
-  using CDataType = scalar_t;
-  using ZDataType = unsigned short;
-  using LSEDataType = F32;
-  using Acc0BiasDataType =
-      typename std::conditional<has_attn_bias, scalar_t, void>::type;
-  using Acc1BiasDataType = void;
-
-  using AElementOp = PassThrough;
-  using B0ElementOp = PassThrough;
-  using Acc0ElementOp = ck::tensor_operation::element_wise::Scale;
-  using B1ElementOp = PassThrough;
-  using CElementOp = PassThrough;
-
-  static constexpr auto GemmSpec =
-      ck::tensor_operation::device::GemmSpecialization::MNKOPadding;
-  static constexpr auto MaskingSpec =
-      static_cast<ck::tensor_operation::device::MaskingSpecialization>(
-          custom_mask_type);
-
-  static constexpr ck::index_t kAcc0BiasTransferSrcScalarPerVector = 1;
+struct grouped_forward_masktype_attnbias_dispatched
+{
+    using PassThrough = ck::tensor_operation::element_wise::PassThrough;
+
+    using GemmDataType     = scalar_t;
+    using ADataType        = scalar_t;
+    using B0DataType       = scalar_t;
+    using B1DataType       = scalar_t;
+    using AccDataType      = F32;
+    using CShuffleDataType = F32;
+    using CDataType        = scalar_t;
+    using ZDataType        = unsigned short;
+    using LSEDataType      = F32;
+    using Acc0BiasDataType = typename std::conditional<has_attn_bias, scalar_t, void>::type;
+    using Acc1BiasDataType = void;
+
+    using AElementOp    = PassThrough;
+    using B0ElementOp   = PassThrough;
+    using Acc0ElementOp = ck::tensor_operation::element_wise::Scale;
+    using B1ElementOp   = PassThrough;
+    using CElementOp    = PassThrough;
+
+    static constexpr auto GemmSpec = ck::tensor_operation::device::GemmSpecialization::MNKOPadding;
+    static constexpr auto MaskingSpec =
+        static_cast<ck::tensor_operation::device::MaskingSpecialization>(custom_mask_type);
+
+    static constexpr ck::index_t kAcc0BiasTransferSrcScalarPerVector = 1;
 
 #ifndef GROUPED_FORWARD_HEADDIM_SWITCH
-#define GROUPED_FORWARD_HEADDIM_SWITCH(HEAD_DIM1, HEAD_DIM2, ...) \
-  [&] {                                                           \
-    if (HEAD_DIM1 <= 32 && HEAD_DIM2 <= 32) {                     \
-      constexpr ck::index_t kGemm1NPerBlock = 32;                 \
-      constexpr ck::index_t kGemm1NXdlPerWave = 1;                \
-      constexpr ck::index_t kCShuffleNXdlPerWavePerShuffle = 1;   \
-      __VA_ARGS__();                                              \
-    } else if (HEAD_DIM1 <= 64 && HEAD_DIM2 <= 64) {              \
-      constexpr ck::index_t kGemm1NPerBlock = 64;                 \
-      constexpr ck::index_t kGemm1NXdlPerWave = 2;                \
-      constexpr ck::index_t kCShuffleNXdlPerWavePerShuffle = 2;   \
-      __VA_ARGS__();                                              \
-    } else {                                                      \
-      constexpr ck::index_t kGemm1NPerBlock = 128;                \
-      constexpr ck::index_t kGemm1NXdlPerWave = 4;                \
-      constexpr ck::index_t kCShuffleNXdlPerWavePerShuffle = 4;   \
-      __VA_ARGS__();                                              \
-    }                                                             \
-  }()
+#define GROUPED_FORWARD_HEADDIM_SWITCH(HEAD_DIM1, HEAD_DIM2, ...)       \
+    [&] {                                                               \
+        if(HEAD_DIM1 <= 32 && HEAD_DIM2 <= 32)                          \
+        {                                                               \
+            constexpr ck::index_t kGemm1NPerBlock                = 32;  \
+            constexpr ck::index_t kGemm1NXdlPerWave              = 1;   \
+            constexpr ck::index_t kCShuffleNXdlPerWavePerShuffle = 1;   \
+            __VA_ARGS__();                                              \
+        }                                                               \
+        else if(HEAD_DIM1 <= 64 && HEAD_DIM2 <= 64)                     \
+        {                                                               \
+            constexpr ck::index_t kGemm1NPerBlock                = 64;  \
+            constexpr ck::index_t kGemm1NXdlPerWave              = 2;   \
+            constexpr ck::index_t kCShuffleNXdlPerWavePerShuffle = 2;   \
+            __VA_ARGS__();                                              \
+        }                                                               \
+        else                                                            \
+        {                                                               \
+            constexpr ck::index_t kGemm1NPerBlock                = 128; \
+            constexpr ck::index_t kGemm1NXdlPerWave              = 4;   \
+            constexpr ck::index_t kCShuffleNXdlPerWavePerShuffle = 4;   \
+            __VA_ARGS__();                                              \
+        }                                                               \
+    }()
 #endif
 
-  // clang-format off
+    // clang-format off
   template <
       ck::index_t kGemm1NPerBlock,
       ck::index_t kGemm1NXdlPerWave,
@@ -149,221 +158,220 @@ struct grouped_forward_masktype_attnbias_dispatched {
           kCShuffleBlockTransferScalarPerVector,
           GemmOpConstantsGroupedForward::Acc1BiasTransferSrcScalarPerVector,
           MaskingSpec>;
-  // clang-format on
-
-  static constexpr auto I1 = ck::Number<1>{};
-  static constexpr auto I2 = ck::Number<2>{};
-  static constexpr auto I3 = ck::Number<3>{};
-
-  static void Run(GroupedForwardParams& param, hipStream_t stream) {
-    using ck::math::min;
-
-    // compile-time constants which don't depend on head-dim switching
-    constexpr ck::index_t thread_slice_length_ak1 =
-        GemmOpConstantsGroupedForward::AK1 /
-        GemmOpConstantsGroupedForward::
-            ABlockTransferThreadClusterLengths_AK0_M_AK1::At(I2);
-    constexpr ck::index_t thread_slice_length_bk1 =
-        GemmOpConstantsGroupedForward::BK1 /
-        GemmOpConstantsGroupedForward::
-            BBlockTransferThreadClusterLengths_BK0_N_BK1::At(I2);
-
-    static_assert(
-        thread_slice_length_ak1 == thread_slice_length_bk1,
-        "ABlockTransfer and BBlockTransfer should use completely same K1 sizes and ThreadClusterLengths!");
-
-    constexpr ck::index_t kABBlockTransferSrcScalarPerVector_max =
-        min(8, thread_slice_length_ak1);
-
-    GROUPED_FORWARD_HEADDIM_SWITCH(param.K, param.Kv, [&] {
-      constexpr ck::index_t thread_slice_length_gemm1n = kGemm1NPerBlock /
-          GemmOpConstantsGroupedForward::
-              B1BlockTransferThreadClusterLengths_BK0_N_BK1::At(I1);
-      constexpr ck::index_t kB1BlockTransferSrcScalarPerVector_max =
-          min(2, thread_slice_length_gemm1n);
-
-      constexpr ck::index_t thread_slice_length_cshuflle_n =
-          (kCShuffleNXdlPerWavePerShuffle * kGemm1NPerBlock /
-           kGemm1NXdlPerWave) /
-          GemmOpConstantsGroupedForward::
-              CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock ::
-                  At(I3);
-
-      constexpr ck::index_t kCShuffleBlockTransferScalarPerVector_max =
-          min(4, thread_slice_length_cshuflle_n);
-
-      if constexpr (
-          kB1BlockTransferSrcScalarPerVector_max >=
-          kCShuffleBlockTransferScalarPerVector_max) {
-        ALIGN_SWITCH_2(
-            kABBlockTransferSrcScalarPerVector_max,
-            kABBlockTransferSrcScalarPerVector,
-            param.K,
-            kB1BlockTransferSrcScalarPerVector_max,
-            kB1BlockTransferSrcScalarPerVector,
-            param.Kv,
-            [&] {
-              constexpr ck::index_t kCShuffleBlockTransferScalarPerVector =
-                  min(kB1BlockTransferSrcScalarPerVector,
-                      kCShuffleBlockTransferScalarPerVector_max);
-              using DeviceOpInstance = DeviceOpInstanceTemp<
-                  kGemm1NPerBlock,
-                  kGemm1NXdlPerWave,
-                  kCShuffleNXdlPerWavePerShuffle,
-                  kABBlockTransferSrcScalarPerVector,
-                  kB1BlockTransferSrcScalarPerVector,
-                  kCShuffleBlockTransferScalarPerVector>;
-
-              RunWithDeviceOp<DeviceOpInstance>(param, stream);
-            });
-      } else {
-        ALIGN_SWITCH_2(
-            kABBlockTransferSrcScalarPerVector_max,
-            kABBlockTransferSrcScalarPerVector,
-            param.K,
-            kCShuffleBlockTransferScalarPerVector_max,
-            kCShuffleBlockTransferScalarPerVector,
-            param.Kv,
-            [&] {
-              constexpr ck::index_t kB1BlockTransferSrcScalarPerVector =
-                  min(kCShuffleBlockTransferScalarPerVector,
-                      kB1BlockTransferSrcScalarPerVector_max);
-              using DeviceOpInstance = DeviceOpInstanceTemp<
-                  kGemm1NPerBlock,
-                  kGemm1NXdlPerWave,
-                  kCShuffleNXdlPerWavePerShuffle,
-                  kABBlockTransferSrcScalarPerVector,
-                  kB1BlockTransferSrcScalarPerVector,
-                  kCShuffleBlockTransferScalarPerVector>;
-
-              RunWithDeviceOp<DeviceOpInstance>(param, stream);
-            });
-      };
-    });
-  };
-
-  template <typename DeviceOpInstance>
-  static void RunWithDeviceOp(GroupedForwardParams& param, hipStream_t stream) {
-    std::vector<typename DeviceOpInstance::ProblemDesc> problem_descs;
-
-    for (std::size_t i = 0; i < param.num_batches; i++) {
-      int M = param.host_seqstart_q[i + 1] - param.host_seqstart_q[i];
-      int N = param.host_seqlen_k.empty()
-          ? param.host_seqstart_k[i + 1] - param.host_seqstart_k[i]
-          : param.host_seqlen_k[i];
-      int K = param.K;
-      int Kv = param.Kv;
-      int G1q = param.Hq;
-      int G1kv = param.Hkv;
-
-      std::vector<ck::index_t> a_gs_ms_ks_lengths{1, G1q, M, K};
-      std::vector<ck::index_t> a_gs_ms_ks_strides{
-          0, param.q_strides[1], param.q_strides[0], param.q_strides[2]};
-
-      std::vector<ck::index_t> b0_gs_ns_ks_lengths{1, G1kv, N, K};
-      std::vector<ck::index_t> b0_gs_ns_ks_strides{
-          0, param.k_strides[1], param.k_strides[0], param.k_strides[2]};
-
-      // to be changed to b1_gs_ns_os_lengths
-      std::vector<ck::index_t> b1_gs_os_ns_lengths{1, G1kv, Kv, N};
-      std::vector<ck::index_t> b1_gs_os_ns_strides{
-          0, param.v_strides[1], param.v_strides[2], param.v_strides[0]};
-
-      std::vector<ck::index_t> c_gs_ms_os_lengths{1, G1q, M, Kv};
-      std::vector<ck::index_t> c_gs_ms_os_strides{
-          0, param.out_strides[1], param.out_strides[0], param.out_strides[2]};
-
-      std::vector<ck::index_t> lse_gs_ms_lengths{1, G1q, M};
-      std::vector<ck::index_t> lse_gs_ms_strides{0, param.max_seqlen_q, 1};
-
-      std::vector<ck::index_t> d_gs_ms_ns_lengths;
-      std::vector<ck::index_t> d_gs_ms_ns_strides;
-
-      if constexpr (has_attn_bias) {
-        d_gs_ms_ns_lengths = {1, G1q, M, N};
-        d_gs_ms_ns_strides = {
-            0,
-            param.attn_bias_strides[0],
-            param.attn_bias_strides[1],
-            param.attn_bias_strides[2]};
-
-      } else {
-        d_gs_ms_ns_lengths = {1, 1, 1, 1};
-        d_gs_ms_ns_strides = {0, 0, 0, 0};
-      };
-
-      problem_descs.push_back(
-          {a_gs_ms_ks_lengths,
-           a_gs_ms_ks_strides,
-           b0_gs_ns_ks_lengths,
-           b0_gs_ns_ks_strides,
-           b1_gs_os_ns_lengths,
-           b1_gs_os_ns_strides,
-           c_gs_ms_os_lengths,
-           c_gs_ms_os_strides,
-           {1, 1, 1, 1},
-           {0, 0, 0, 0},
-           lse_gs_ms_lengths,
-           lse_gs_ms_strides,
-           d_gs_ms_ns_lengths,
-           d_gs_ms_ns_strides,
-           {}, // acc1_bias_gs_ms_os_lengths
-           {}}); // acc1_bias_gs_ms_os_strides
-    }
-
-    float alpha = param.scale;
-
-    auto a_element_op = AElementOp{};
-    auto b0_element_op = B0ElementOp{};
-    auto acc0_element_op = Acc0ElementOp{alpha};
-    auto b1_element_op = B1ElementOp{};
-    auto c_element_op = CElementOp{};
-
-    auto op = DeviceOpInstance{};
-    auto invoker = op.MakeInvoker();
-
-    auto arg_ptr = op.MakeArgumentPointer(
-        param.q_ptrs,
-        param.k_ptrs,
-        param.v_ptrs,
-        param.out_ptrs,
-        param.randvals_ptrs,
-        param.logsumexp_ptrs,
-        param.attn_bias_ptrs,
-        {}, // p_acc1_biases
-        problem_descs,
-        a_element_op,
-        b0_element_op,
-        acc0_element_op,
-        b1_element_op,
-        c_element_op,
-        param.use_dropout ? param.dropout_prob : 0.0f, // dropout ratio
-        std::tuple<int64_t, int64_t>(param.philox_seed, param.philox_offset));
-
-    auto sizeInBytes = op.GetWorkSpaceSize(arg_ptr.get());
-
-    SimpleDeviceMem workspace(sizeInBytes);
-
-    op.SetWorkSpacePointer(arg_ptr.get(), workspace.GetDeviceBuffer());
-
-    if (!op.IsSupportedArgument(arg_ptr.get())) {
-      std::ostringstream ostr;
-
-      ostr << op.GetTypeString() << " does not support this problem";
-
-      throw std::runtime_error(ostr.str());
-    }
-
-    (void)invoker.Run(arg_ptr.get(), StreamConfig{stream, false});
-  };
+    // clang-format on
+
+    static constexpr auto I1 = ck::Number<1>{};
+    static constexpr auto I2 = ck::Number<2>{};
+    static constexpr auto I3 = ck::Number<3>{};
+
+    static void Run(GroupedForwardParams& param, hipStream_t stream)
+    {
+        using ck::math::min;
+
+        // compile-time constants which don't depend on head-dim switching
+        constexpr ck::index_t thread_slice_length_ak1 =
+            GemmOpConstantsGroupedForward::AK1 /
+            GemmOpConstantsGroupedForward::ABlockTransferThreadClusterLengths_AK0_M_AK1::At(I2);
+        constexpr ck::index_t thread_slice_length_bk1 =
+            GemmOpConstantsGroupedForward::BK1 /
+            GemmOpConstantsGroupedForward::BBlockTransferThreadClusterLengths_BK0_N_BK1::At(I2);
+
+        static_assert(thread_slice_length_ak1 == thread_slice_length_bk1,
+                      "ABlockTransfer and BBlockTransfer should use completely same K1 sizes and "
+                      "ThreadClusterLengths!");
+
+        constexpr ck::index_t kABBlockTransferSrcScalarPerVector_max =
+            min(8, thread_slice_length_ak1);
+
+        GROUPED_FORWARD_HEADDIM_SWITCH(param.K, param.Kv, [&] {
+            constexpr ck::index_t thread_slice_length_gemm1n =
+                kGemm1NPerBlock /
+                GemmOpConstantsGroupedForward::B1BlockTransferThreadClusterLengths_BK0_N_BK1::At(
+                    I1);
+            constexpr ck::index_t kB1BlockTransferSrcScalarPerVector_max =
+                min(2, thread_slice_length_gemm1n);
+
+            constexpr ck::index_t thread_slice_length_cshuflle_n =
+                (kCShuffleNXdlPerWavePerShuffle * kGemm1NPerBlock / kGemm1NXdlPerWave) /
+                GemmOpConstantsGroupedForward::
+                    CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock ::At(I3);
+
+            constexpr ck::index_t kCShuffleBlockTransferScalarPerVector_max =
+                min(4, thread_slice_length_cshuflle_n);
+
+            if constexpr(kB1BlockTransferSrcScalarPerVector_max >=
+                         kCShuffleBlockTransferScalarPerVector_max)
+            {
+                ALIGN_SWITCH_2(kABBlockTransferSrcScalarPerVector_max,
+                               kABBlockTransferSrcScalarPerVector,
+                               param.K,
+                               kB1BlockTransferSrcScalarPerVector_max,
+                               kB1BlockTransferSrcScalarPerVector,
+                               param.Kv,
+                               [&] {
+                                   constexpr ck::index_t kCShuffleBlockTransferScalarPerVector =
+                                       min(kB1BlockTransferSrcScalarPerVector,
+                                           kCShuffleBlockTransferScalarPerVector_max);
+                                   using DeviceOpInstance =
+                                       DeviceOpInstanceTemp<kGemm1NPerBlock,
+                                                            kGemm1NXdlPerWave,
+                                                            kCShuffleNXdlPerWavePerShuffle,
+                                                            kABBlockTransferSrcScalarPerVector,
+                                                            kB1BlockTransferSrcScalarPerVector,
+                                                            kCShuffleBlockTransferScalarPerVector>;
+
+                                   RunWithDeviceOp<DeviceOpInstance>(param, stream);
+                               });
+            }
+            else
+            {
+                ALIGN_SWITCH_2(kABBlockTransferSrcScalarPerVector_max,
+                               kABBlockTransferSrcScalarPerVector,
+                               param.K,
+                               kCShuffleBlockTransferScalarPerVector_max,
+                               kCShuffleBlockTransferScalarPerVector,
+                               param.Kv,
+                               [&] {
+                                   constexpr ck::index_t kB1BlockTransferSrcScalarPerVector =
+                                       min(kCShuffleBlockTransferScalarPerVector,
+                                           kB1BlockTransferSrcScalarPerVector_max);
+                                   using DeviceOpInstance =
+                                       DeviceOpInstanceTemp<kGemm1NPerBlock,
+                                                            kGemm1NXdlPerWave,
+                                                            kCShuffleNXdlPerWavePerShuffle,
+                                                            kABBlockTransferSrcScalarPerVector,
+                                                            kB1BlockTransferSrcScalarPerVector,
+                                                            kCShuffleBlockTransferScalarPerVector>;
+
+                                   RunWithDeviceOp<DeviceOpInstance>(param, stream);
+                               });
+            };
+        });
+    };
+
+    template <typename DeviceOpInstance>
+    static void RunWithDeviceOp(GroupedForwardParams& param, hipStream_t stream)
+    {
+        std::vector<typename DeviceOpInstance::ProblemDesc> problem_descs;
+
+        for(std::size_t i = 0; i < param.num_batches; i++)
+        {
+            int M = param.host_seqstart_q[i + 1] - param.host_seqstart_q[i];
+            int N = param.host_seqlen_k.empty()
+                        ? param.host_seqstart_k[i + 1] - param.host_seqstart_k[i]
+                        : param.host_seqlen_k[i];
+            int K    = param.K;
+            int Kv   = param.Kv;
+            int G1q  = param.Hq;
+            int G1kv = param.Hkv;
+
+            std::vector<ck::index_t> a_gs_ms_ks_lengths{1, G1q, M, K};
+            std::vector<ck::index_t> a_gs_ms_ks_strides{
+                0, param.q_strides[1], param.q_strides[0], param.q_strides[2]};
+
+            std::vector<ck::index_t> b0_gs_ns_ks_lengths{1, G1kv, N, K};
+            std::vector<ck::index_t> b0_gs_ns_ks_strides{
+                0, param.k_strides[1], param.k_strides[0], param.k_strides[2]};
+
+            // to be changed to b1_gs_ns_os_lengths
+            std::vector<ck::index_t> b1_gs_os_ns_lengths{1, G1kv, Kv, N};
+            std::vector<ck::index_t> b1_gs_os_ns_strides{
+                0, param.v_strides[1], param.v_strides[2], param.v_strides[0]};
+
+            std::vector<ck::index_t> c_gs_ms_os_lengths{1, G1q, M, Kv};
+            std::vector<ck::index_t> c_gs_ms_os_strides{
+                0, param.out_strides[1], param.out_strides[0], param.out_strides[2]};
+
+            std::vector<ck::index_t> lse_gs_ms_lengths{1, G1q, M};
+            std::vector<ck::index_t> lse_gs_ms_strides{0, param.max_seqlen_q, 1};
+
+            std::vector<ck::index_t> d_gs_ms_ns_lengths;
+            std::vector<ck::index_t> d_gs_ms_ns_strides;
+
+            if constexpr(has_attn_bias)
+            {
+                d_gs_ms_ns_lengths = {1, G1q, M, N};
+                d_gs_ms_ns_strides = {0,
+                                      param.attn_bias_strides[0],
+                                      param.attn_bias_strides[1],
+                                      param.attn_bias_strides[2]};
+            }
+            else
+            {
+                d_gs_ms_ns_lengths = {1, 1, 1, 1};
+                d_gs_ms_ns_strides = {0, 0, 0, 0};
+            };
+
+            problem_descs.push_back({a_gs_ms_ks_lengths,
+                                     a_gs_ms_ks_strides,
+                                     b0_gs_ns_ks_lengths,
+                                     b0_gs_ns_ks_strides,
+                                     b1_gs_os_ns_lengths,
+                                     b1_gs_os_ns_strides,
+                                     c_gs_ms_os_lengths,
+                                     c_gs_ms_os_strides,
+                                     {1, 1, 1, 1},
+                                     {0, 0, 0, 0},
+                                     lse_gs_ms_lengths,
+                                     lse_gs_ms_strides,
+                                     d_gs_ms_ns_lengths,
+                                     d_gs_ms_ns_strides,
+                                     {},   // acc1_bias_gs_ms_os_lengths
+                                     {}}); // acc1_bias_gs_ms_os_strides
+        }
+
+        float alpha = param.scale;
+
+        auto a_element_op    = AElementOp{};
+        auto b0_element_op   = B0ElementOp{};
+        auto acc0_element_op = Acc0ElementOp{alpha};
+        auto b1_element_op   = B1ElementOp{};
+        auto c_element_op    = CElementOp{};
+
+        auto op      = DeviceOpInstance{};
+        auto invoker = op.MakeInvoker();
+
+        auto arg_ptr = op.MakeArgumentPointer(
+            param.q_ptrs,
+            param.k_ptrs,
+            param.v_ptrs,
+            param.out_ptrs,
+            param.randvals_ptrs,
+            param.logsumexp_ptrs,
+            param.attn_bias_ptrs,
+            {}, // p_acc1_biases
+            problem_descs,
+            a_element_op,
+            b0_element_op,
+            acc0_element_op,
+            b1_element_op,
+            c_element_op,
+            param.use_dropout ? param.dropout_prob : 0.0f, // dropout ratio
+            std::tuple<int64_t, int64_t>(param.philox_seed, param.philox_offset));
+
+        auto sizeInBytes = op.GetWorkSpaceSize(arg_ptr.get());
+
+        SimpleDeviceMem workspace(sizeInBytes);
+
+        op.SetWorkSpacePointer(arg_ptr.get(), workspace.GetDeviceBuffer());
+
+        if(!op.IsSupportedArgument(arg_ptr.get()))
+        {
+            std::ostringstream ostr;
+
+            ostr << op.GetTypeString() << " does not support this problem";
+
+            throw std::runtime_error(ostr.str());
+        }
+
+        (void)invoker.Run(arg_ptr.get(), StreamConfig{stream, false});
+    };
 };
 
 template <typename scalar_t, int32_t custom_mask_type, bool has_attn_bias>
-void run_grouped_forward_masktype_attnbias_dispatched(
-    GroupedForwardParams& param,
-    hipStream_t stream) {
-  grouped_forward_masktype_attnbias_dispatched<
-      scalar_t,
-      custom_mask_type,
-      has_attn_bias>::Run(param, stream);
+void run_grouped_forward_masktype_attnbias_dispatched(GroupedForwardParams& param,
+                                                      hipStream_t stream)
+{
+    grouped_forward_masktype_attnbias_dispatched<scalar_t, custom_mask_type, has_attn_bias>::Run(
+        param, stream);
 };
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_forward_bp16.cpp b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_forward_bp16.cpp
index 030158809..6679f8731 100644
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_forward_bp16.cpp
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_forward_bp16.cpp
@@ -1,57 +1,52 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
 #include <ck/ck.hpp>
 #include <stdexcept>
 
 #include "ck_bool_switch.h"
 #include "ck_fmha_grouped_forward.h"
 
-extern template void run_grouped_forward_masktype_attnbias_dispatched<
-    ck::bhalf_t,
-    0,
-    true>(GroupedForwardParams& param, hipStream_t stream);
-
-extern template void run_grouped_forward_masktype_attnbias_dispatched<
-    ck::bhalf_t,
-    0,
-    false>(GroupedForwardParams& param, hipStream_t stream);
-
-extern template void run_grouped_forward_masktype_attnbias_dispatched<
-    ck::bhalf_t,
-    1,
-    true>(GroupedForwardParams& param, hipStream_t stream);
-
-extern template void run_grouped_forward_masktype_attnbias_dispatched<
-    ck::bhalf_t,
-    1,
-    false>(GroupedForwardParams& param, hipStream_t stream);
-
-extern template void run_grouped_forward_masktype_attnbias_dispatched<
-    ck::bhalf_t,
-    2,
-    true>(GroupedForwardParams& param, hipStream_t stream);
-
-extern template void run_grouped_forward_masktype_attnbias_dispatched<
-    ck::bhalf_t,
-    2,
-    false>(GroupedForwardParams& param, hipStream_t stream);
-
-void grouped_forward_bp16(GroupedForwardParams& param, hipStream_t stream) {
-  BOOL_SWITCH_1(param.has_attn_bias, HAS_ATTN_BIAS, [&] {
-    if (param.custom_mask_type == 0)
-      run_grouped_forward_masktype_attnbias_dispatched<
-          ck::bhalf_t,
-          0,
-          HAS_ATTN_BIAS>(param, stream);
-    else if (param.custom_mask_type == 1)
-      run_grouped_forward_masktype_attnbias_dispatched<
-          ck::bhalf_t,
-          1,
-          HAS_ATTN_BIAS>(param, stream);
-    else if (param.custom_mask_type == 2)
-      run_grouped_forward_masktype_attnbias_dispatched<
-          ck::bhalf_t,
-          2,
-          HAS_ATTN_BIAS>(param, stream);
-    else
-      throw std::runtime_error("Invalid custom_mask_type value");
-  });
+extern template void
+run_grouped_forward_masktype_attnbias_dispatched<ck::bhalf_t, 0, true>(GroupedForwardParams& param,
+                                                                       hipStream_t stream);
+
+extern template void
+run_grouped_forward_masktype_attnbias_dispatched<ck::bhalf_t, 0, false>(GroupedForwardParams& param,
+                                                                        hipStream_t stream);
+
+extern template void
+run_grouped_forward_masktype_attnbias_dispatched<ck::bhalf_t, 1, true>(GroupedForwardParams& param,
+                                                                       hipStream_t stream);
+
+extern template void
+run_grouped_forward_masktype_attnbias_dispatched<ck::bhalf_t, 1, false>(GroupedForwardParams& param,
+                                                                        hipStream_t stream);
+
+extern template void
+run_grouped_forward_masktype_attnbias_dispatched<ck::bhalf_t, 2, true>(GroupedForwardParams& param,
+                                                                       hipStream_t stream);
+
+extern template void
+run_grouped_forward_masktype_attnbias_dispatched<ck::bhalf_t, 2, false>(GroupedForwardParams& param,
+                                                                        hipStream_t stream);
+
+void grouped_forward_bp16(GroupedForwardParams& param, hipStream_t stream)
+{
+    BOOL_SWITCH_1(param.has_attn_bias, HAS_ATTN_BIAS, [&] {
+        if(param.custom_mask_type == 0)
+            run_grouped_forward_masktype_attnbias_dispatched<ck::bhalf_t, 0, HAS_ATTN_BIAS>(param,
+                                                                                            stream);
+        else if(param.custom_mask_type == 1)
+            run_grouped_forward_masktype_attnbias_dispatched<ck::bhalf_t, 1, HAS_ATTN_BIAS>(param,
+                                                                                            stream);
+        else if(param.custom_mask_type == 2)
+            run_grouped_forward_masktype_attnbias_dispatched<ck::bhalf_t, 2, HAS_ATTN_BIAS>(param,
+                                                                                            stream);
+        else
+            throw std::runtime_error("Invalid custom_mask_type value");
+    });
 };
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_forward_fp16.cpp b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_forward_fp16.cpp
index 5338eab35..70a295cec 100644
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_forward_fp16.cpp
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_forward_fp16.cpp
@@ -1,57 +1,52 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
 #include <ck/ck.hpp>
 #include <stdexcept>
 
 #include "ck_bool_switch.h"
 #include "ck_fmha_grouped_forward.h"
 
-extern template void run_grouped_forward_masktype_attnbias_dispatched<
-    ck::half_t,
-    0,
-    true>(GroupedForwardParams& param, hipStream_t stream);
-
-extern template void run_grouped_forward_masktype_attnbias_dispatched<
-    ck::half_t,
-    0,
-    false>(GroupedForwardParams& param, hipStream_t stream);
-
-extern template void run_grouped_forward_masktype_attnbias_dispatched<
-    ck::half_t,
-    1,
-    true>(GroupedForwardParams& param, hipStream_t stream);
-
-extern template void run_grouped_forward_masktype_attnbias_dispatched<
-    ck::half_t,
-    1,
-    false>(GroupedForwardParams& param, hipStream_t stream);
-
-extern template void run_grouped_forward_masktype_attnbias_dispatched<
-    ck::half_t,
-    2,
-    true>(GroupedForwardParams& param, hipStream_t stream);
-
-extern template void run_grouped_forward_masktype_attnbias_dispatched<
-    ck::half_t,
-    2,
-    false>(GroupedForwardParams& param, hipStream_t stream);
-
-void grouped_forward_fp16(GroupedForwardParams& param, hipStream_t stream) {
-  BOOL_SWITCH_1(param.has_attn_bias, HAS_ATTN_BIAS, [&] {
-    if (param.custom_mask_type == 0)
-      run_grouped_forward_masktype_attnbias_dispatched<
-          ck::half_t,
-          0,
-          HAS_ATTN_BIAS>(param, stream);
-    else if (param.custom_mask_type == 1)
-      run_grouped_forward_masktype_attnbias_dispatched<
-          ck::half_t,
-          1,
-          HAS_ATTN_BIAS>(param, stream);
-    else if (param.custom_mask_type == 2)
-      run_grouped_forward_masktype_attnbias_dispatched<
-          ck::half_t,
-          2,
-          HAS_ATTN_BIAS>(param, stream);
-    else
-      throw std::runtime_error("Invalid custom_mask_type value");
-  });
+extern template void
+run_grouped_forward_masktype_attnbias_dispatched<ck::half_t, 0, true>(GroupedForwardParams& param,
+                                                                      hipStream_t stream);
+
+extern template void
+run_grouped_forward_masktype_attnbias_dispatched<ck::half_t, 0, false>(GroupedForwardParams& param,
+                                                                       hipStream_t stream);
+
+extern template void
+run_grouped_forward_masktype_attnbias_dispatched<ck::half_t, 1, true>(GroupedForwardParams& param,
+                                                                      hipStream_t stream);
+
+extern template void
+run_grouped_forward_masktype_attnbias_dispatched<ck::half_t, 1, false>(GroupedForwardParams& param,
+                                                                       hipStream_t stream);
+
+extern template void
+run_grouped_forward_masktype_attnbias_dispatched<ck::half_t, 2, true>(GroupedForwardParams& param,
+                                                                      hipStream_t stream);
+
+extern template void
+run_grouped_forward_masktype_attnbias_dispatched<ck::half_t, 2, false>(GroupedForwardParams& param,
+                                                                       hipStream_t stream);
+
+void grouped_forward_fp16(GroupedForwardParams& param, hipStream_t stream)
+{
+    BOOL_SWITCH_1(param.has_attn_bias, HAS_ATTN_BIAS, [&] {
+        if(param.custom_mask_type == 0)
+            run_grouped_forward_masktype_attnbias_dispatched<ck::half_t, 0, HAS_ATTN_BIAS>(param,
+                                                                                           stream);
+        else if(param.custom_mask_type == 1)
+            run_grouped_forward_masktype_attnbias_dispatched<ck::half_t, 1, HAS_ATTN_BIAS>(param,
+                                                                                           stream);
+        else if(param.custom_mask_type == 2)
+            run_grouped_forward_masktype_attnbias_dispatched<ck::half_t, 2, HAS_ATTN_BIAS>(param,
+                                                                                           stream);
+        else
+            throw std::runtime_error("Invalid custom_mask_type value");
+    });
 };
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_infer.h b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_infer.h
index 81c6d3381..08e5434d7 100644
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_infer.h
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_infer.h
@@ -1,3 +1,9 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
 #pragma once
 
 #include <sstream>
@@ -18,59 +24,62 @@
 #include "ck_fmha_params.h"
 
 template <typename scalar_t, int32_t custom_mask_type, bool has_attn_bias>
-struct grouped_infer_masktype_attnbias_dispatched {
-  using PassThrough = ck::tensor_operation::element_wise::PassThrough;
-
-  using GemmDataType = scalar_t;
-  using ADataType = scalar_t;
-  using B0DataType = scalar_t;
-  using B1DataType = scalar_t;
-  using AccDataType = F32;
-  using CShuffleDataType = F32;
-  using CDataType = scalar_t;
-  using ZDataType = unsigned short;
-  using LSEDataType = F32;
-  using Acc0BiasDataType =
-      typename std::conditional<has_attn_bias, scalar_t, void>::type;
-  using Acc1BiasDataType = void;
-
-  using AElementOp = PassThrough;
-  using B0ElementOp = PassThrough;
-  using Acc0ElementOp = ck::tensor_operation::element_wise::Scale;
-  using B1ElementOp = PassThrough;
-  using CElementOp = PassThrough;
-
-  static constexpr auto GemmSpec =
-      ck::tensor_operation::device::GemmSpecialization::MNKOPadding;
-  static constexpr auto MaskingSpec =
-      static_cast<ck::tensor_operation::device::MaskingSpecialization>(
-          custom_mask_type);
-
-  static constexpr ck::index_t kAcc0BiasTransferSrcScalarPerVector = 1;
+struct grouped_infer_masktype_attnbias_dispatched
+{
+    using PassThrough = ck::tensor_operation::element_wise::PassThrough;
+
+    using GemmDataType     = scalar_t;
+    using ADataType        = scalar_t;
+    using B0DataType       = scalar_t;
+    using B1DataType       = scalar_t;
+    using AccDataType      = F32;
+    using CShuffleDataType = F32;
+    using CDataType        = scalar_t;
+    using ZDataType        = unsigned short;
+    using LSEDataType      = F32;
+    using Acc0BiasDataType = typename std::conditional<has_attn_bias, scalar_t, void>::type;
+    using Acc1BiasDataType = void;
+
+    using AElementOp    = PassThrough;
+    using B0ElementOp   = PassThrough;
+    using Acc0ElementOp = ck::tensor_operation::element_wise::Scale;
+    using B1ElementOp   = PassThrough;
+    using CElementOp    = PassThrough;
+
+    static constexpr auto GemmSpec = ck::tensor_operation::device::GemmSpecialization::MNKOPadding;
+    static constexpr auto MaskingSpec =
+        static_cast<ck::tensor_operation::device::MaskingSpecialization>(custom_mask_type);
+
+    static constexpr ck::index_t kAcc0BiasTransferSrcScalarPerVector = 1;
 
 #ifndef GROUPED_INFER_HEADDIM_SWITCH
-#define GROUPED_INFER_HEADDIM_SWITCH(HEAD_DIM1, HEAD_DIM2, ...) \
-  [&] {                                                         \
-    if (HEAD_DIM1 <= 32 && HEAD_DIM2 <= 32) {                   \
-      constexpr ck::index_t kGemm1NPerBlock = 32;               \
-      constexpr ck::index_t kGemm1NXdlPerWave = 1;              \
-      constexpr ck::index_t kCShuffleNXdlPerWavePerShuffle = 1; \
-      __VA_ARGS__();                                            \
-    } else if (HEAD_DIM1 <= 64 && HEAD_DIM2 <= 64) {            \
-      constexpr ck::index_t kGemm1NPerBlock = 64;               \
-      constexpr ck::index_t kGemm1NXdlPerWave = 2;              \
-      constexpr ck::index_t kCShuffleNXdlPerWavePerShuffle = 2; \
-      __VA_ARGS__();                                            \
-    } else {                                                    \
-      constexpr ck::index_t kGemm1NPerBlock = 128;              \
-      constexpr ck::index_t kGemm1NXdlPerWave = 4;              \
-      constexpr ck::index_t kCShuffleNXdlPerWavePerShuffle = 4; \
-      __VA_ARGS__();                                            \
-    }                                                           \
-  }()
+#define GROUPED_INFER_HEADDIM_SWITCH(HEAD_DIM1, HEAD_DIM2, ...)         \
+    [&] {                                                               \
+        if(HEAD_DIM1 <= 32 && HEAD_DIM2 <= 32)                          \
+        {                                                               \
+            constexpr ck::index_t kGemm1NPerBlock                = 32;  \
+            constexpr ck::index_t kGemm1NXdlPerWave              = 1;   \
+            constexpr ck::index_t kCShuffleNXdlPerWavePerShuffle = 1;   \
+            __VA_ARGS__();                                              \
+        }                                                               \
+        else if(HEAD_DIM1 <= 64 && HEAD_DIM2 <= 64)                     \
+        {                                                               \
+            constexpr ck::index_t kGemm1NPerBlock                = 64;  \
+            constexpr ck::index_t kGemm1NXdlPerWave              = 2;   \
+            constexpr ck::index_t kCShuffleNXdlPerWavePerShuffle = 2;   \
+            __VA_ARGS__();                                              \
+        }                                                               \
+        else                                                            \
+        {                                                               \
+            constexpr ck::index_t kGemm1NPerBlock                = 128; \
+            constexpr ck::index_t kGemm1NXdlPerWave              = 4;   \
+            constexpr ck::index_t kCShuffleNXdlPerWavePerShuffle = 4;   \
+            __VA_ARGS__();                                              \
+        }                                                               \
+    }()
 #endif
 
-  // clang-format off
+    // clang-format off
   template <
       ck::index_t kGemm1NPerBlock,
       ck::index_t kGemm1NXdlPerWave,
@@ -144,210 +153,206 @@ struct grouped_infer_masktype_attnbias_dispatched {
           GemmOpConstantsGroupedInfer::CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock,
           kCShuffleBlockTransferScalarPerVector,
           MaskingSpec>;
-  // clang-format on
-
-  static constexpr auto I1 = ck::Number<1>{};
-  static constexpr auto I2 = ck::Number<2>{};
-  static constexpr auto I3 = ck::Number<3>{};
-
-  static void Run(GroupedForwardParams& param, hipStream_t stream) {
-    using ck::math::min;
-
-    // compile-time constants which don't depend on head-dim switching
-    constexpr ck::index_t thread_slice_length_ak1 =
-        GemmOpConstantsGroupedInfer::AK1 /
-        GemmOpConstantsGroupedInfer::
-            ABlockTransferThreadClusterLengths_AK0_M_AK1::At(I2);
-    constexpr ck::index_t thread_slice_length_bk1 =
-        GemmOpConstantsGroupedInfer::BK1 /
-        GemmOpConstantsGroupedInfer::
-            BBlockTransferThreadClusterLengths_BK0_N_BK1::At(I2);
-
-    static_assert(
-        thread_slice_length_ak1 == thread_slice_length_bk1,
-        "ABlockTransfer and BBlockTransfer should use completely same K1 sizes and ThreadClusterLengths!");
-
-    constexpr ck::index_t kABBlockTransferSrcScalarPerVector_max =
-        min(8, thread_slice_length_ak1);
-
-    GROUPED_INFER_HEADDIM_SWITCH(param.K, param.Kv, [&] {
-      constexpr ck::index_t thread_slice_length_gemm1n = kGemm1NPerBlock /
-          GemmOpConstantsGroupedInfer::
-              B1BlockTransferThreadClusterLengths_BK0_N_BK1::At(I1);
-      constexpr ck::index_t kB1BlockTransferSrcScalarPerVector_max =
-          min(4, thread_slice_length_gemm1n);
-
-      constexpr ck::index_t thread_slice_length_cshuflle_n =
-          (kCShuffleNXdlPerWavePerShuffle * kGemm1NPerBlock /
-           kGemm1NXdlPerWave) /
-          GemmOpConstantsGroupedInfer::
-              CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock ::
-                  At(I3);
-
-      constexpr ck::index_t kCShuffleBlockTransferScalarPerVector_max =
-          min(4, thread_slice_length_cshuflle_n);
-
-      if constexpr (
-          kB1BlockTransferSrcScalarPerVector_max >=
-          kCShuffleBlockTransferScalarPerVector_max) {
-        ALIGN_SWITCH_2(
-            kABBlockTransferSrcScalarPerVector_max,
-            kABBlockTransferSrcScalarPerVector,
-            param.K,
-            kB1BlockTransferSrcScalarPerVector_max,
-            kB1BlockTransferSrcScalarPerVector,
-            param.Kv,
-            [&] {
-              constexpr ck::index_t kCShuffleBlockTransferScalarPerVector =
-                  min(kB1BlockTransferSrcScalarPerVector,
-                      kCShuffleBlockTransferScalarPerVector_max);
-              using DeviceOpInstance = DeviceOpInstanceTemp<
-                  kGemm1NPerBlock,
-                  kGemm1NXdlPerWave,
-                  kCShuffleNXdlPerWavePerShuffle,
-                  kABBlockTransferSrcScalarPerVector,
-                  kB1BlockTransferSrcScalarPerVector,
-                  kCShuffleBlockTransferScalarPerVector>;
-
-              RunWithDeviceOp<DeviceOpInstance>(param, stream);
-            });
-      } else {
-        ALIGN_SWITCH_2(
-            kABBlockTransferSrcScalarPerVector_max,
-            kABBlockTransferSrcScalarPerVector,
-            param.K,
-            kCShuffleBlockTransferScalarPerVector_max,
-            kCShuffleBlockTransferScalarPerVector,
-            param.Kv,
-            [&] {
-              constexpr ck::index_t kB1BlockTransferSrcScalarPerVector =
-                  min(kCShuffleBlockTransferScalarPerVector,
-                      kB1BlockTransferSrcScalarPerVector_max);
-              using DeviceOpInstance = DeviceOpInstanceTemp<
-                  kGemm1NPerBlock,
-                  kGemm1NXdlPerWave,
-                  kCShuffleNXdlPerWavePerShuffle,
-                  kABBlockTransferSrcScalarPerVector,
-                  kB1BlockTransferSrcScalarPerVector,
-                  kCShuffleBlockTransferScalarPerVector>;
-
-              RunWithDeviceOp<DeviceOpInstance>(param, stream);
-            });
-      };
-    });
-  };
-
-  template <typename DeviceOpInstance>
-  static void RunWithDeviceOp(GroupedForwardParams& param, hipStream_t stream) {
-    std::vector<typename DeviceOpInstance::ProblemDesc> problem_descs;
-
-    for (std::size_t i = 0; i < param.num_batches; i++) {
-      int M = param.host_seqstart_q[i + 1] - param.host_seqstart_q[i];
-      int N = param.host_seqlen_k.empty()
-          ? param.host_seqstart_k[i + 1] - param.host_seqstart_k[i]
-          : param.host_seqlen_k[i];
-      int K = param.K;
-      int Kv = param.Kv;
-      int G1q = param.Hq;
-      int G1kv = param.Hkv;
-
-      std::vector<ck::index_t> a_gs_ms_ks_lengths{1, G1q, M, K};
-      std::vector<ck::index_t> a_gs_ms_ks_strides{
-          0, param.q_strides[1], param.q_strides[0], param.q_strides[2]};
-
-      std::vector<ck::index_t> b0_gs_ns_ks_lengths{1, G1kv, N, K};
-      std::vector<ck::index_t> b0_gs_ns_ks_strides{
-          0, param.k_strides[1], param.k_strides[0], param.k_strides[2]};
-
-      // to be changed to b1_gs_ns_os_lengths
-      std::vector<ck::index_t> b1_gs_os_ns_lengths{1, G1kv, Kv, N};
-      std::vector<ck::index_t> b1_gs_os_ns_strides{
-          0, param.v_strides[1], param.v_strides[2], param.v_strides[0]};
-
-      std::vector<ck::index_t> c_gs_ms_os_lengths{1, G1q, M, Kv};
-      std::vector<ck::index_t> c_gs_ms_os_strides{
-          0, param.out_strides[1], param.out_strides[0], param.out_strides[2]};
-
-      std::vector<ck::index_t> d_gs_ms_ns_lengths;
-      std::vector<ck::index_t> d_gs_ms_ns_strides;
-
-      if constexpr (has_attn_bias) {
-        d_gs_ms_ns_lengths = {1, G1q, M, N};
-        d_gs_ms_ns_strides = {
-            0,
-            param.attn_bias_strides[0],
-            param.attn_bias_strides[1],
-            param.attn_bias_strides[2]};
-
-      } else {
-        d_gs_ms_ns_lengths = {1, 1, 1, 1};
-        d_gs_ms_ns_strides = {0, 0, 0, 0};
-      };
-
-      problem_descs.push_back(
-          {a_gs_ms_ks_lengths,
-           a_gs_ms_ks_strides,
-           b0_gs_ns_ks_lengths,
-           b0_gs_ns_ks_strides,
-           b1_gs_os_ns_lengths,
-           b1_gs_os_ns_strides,
-           c_gs_ms_os_lengths,
-           c_gs_ms_os_strides,
-           d_gs_ms_ns_lengths,
-           d_gs_ms_ns_strides,
-           {}, // acc1_bias_gs_ms_os_lengths
-           {}}); // acc1_bias_gs_ms_os_strides
-    }
-
-    float alpha = param.scale;
-
-    auto a_element_op = AElementOp{};
-    auto b0_element_op = B0ElementOp{};
-    auto acc0_element_op = Acc0ElementOp{alpha};
-    auto b1_element_op = B1ElementOp{};
-    auto c_element_op = CElementOp{};
-
-    auto op = DeviceOpInstance{};
-    auto invoker = op.MakeInvoker();
-
-    auto arg_ptr = op.MakeArgumentPointer(
-        param.q_ptrs,
-        param.k_ptrs,
-        param.v_ptrs,
-        param.out_ptrs,
-        param.attn_bias_ptrs,
-        {}, // p_acc1_biases
-        problem_descs,
-        a_element_op,
-        b0_element_op,
-        acc0_element_op,
-        b1_element_op,
-        c_element_op);
-
-    auto sizeInBytes = op.GetWorkSpaceSize(arg_ptr.get());
-
-    SimpleDeviceMem workspace(sizeInBytes);
-
-    op.SetWorkSpacePointer(arg_ptr.get(), workspace.GetDeviceBuffer());
-
-    if (!op.IsSupportedArgument(arg_ptr.get())) {
-      std::ostringstream ostr;
-
-      ostr << op.GetTypeString() << " does not support this problem";
-
-      throw std::runtime_error(ostr.str());
-    }
-
-    (void)invoker.Run(arg_ptr.get(), StreamConfig{stream, false});
-  };
+    // clang-format on
+
+    static constexpr auto I1 = ck::Number<1>{};
+    static constexpr auto I2 = ck::Number<2>{};
+    static constexpr auto I3 = ck::Number<3>{};
+
+    static void Run(GroupedForwardParams& param, hipStream_t stream)
+    {
+        using ck::math::min;
+
+        // compile-time constants which don't depend on head-dim switching
+        constexpr ck::index_t thread_slice_length_ak1 =
+            GemmOpConstantsGroupedInfer::AK1 /
+            GemmOpConstantsGroupedInfer::ABlockTransferThreadClusterLengths_AK0_M_AK1::At(I2);
+        constexpr ck::index_t thread_slice_length_bk1 =
+            GemmOpConstantsGroupedInfer::BK1 /
+            GemmOpConstantsGroupedInfer::BBlockTransferThreadClusterLengths_BK0_N_BK1::At(I2);
+
+        static_assert(thread_slice_length_ak1 == thread_slice_length_bk1,
+                      "ABlockTransfer and BBlockTransfer should use completely same K1 sizes and "
+                      "ThreadClusterLengths!");
+
+        constexpr ck::index_t kABBlockTransferSrcScalarPerVector_max =
+            min(8, thread_slice_length_ak1);
+
+        GROUPED_INFER_HEADDIM_SWITCH(param.K, param.Kv, [&] {
+            constexpr ck::index_t thread_slice_length_gemm1n =
+                kGemm1NPerBlock /
+                GemmOpConstantsGroupedInfer::B1BlockTransferThreadClusterLengths_BK0_N_BK1::At(I1);
+            constexpr ck::index_t kB1BlockTransferSrcScalarPerVector_max =
+                min(4, thread_slice_length_gemm1n);
+
+            constexpr ck::index_t thread_slice_length_cshuflle_n =
+                (kCShuffleNXdlPerWavePerShuffle * kGemm1NPerBlock / kGemm1NXdlPerWave) /
+                GemmOpConstantsGroupedInfer::
+                    CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock ::At(I3);
+
+            constexpr ck::index_t kCShuffleBlockTransferScalarPerVector_max =
+                min(4, thread_slice_length_cshuflle_n);
+
+            if constexpr(kB1BlockTransferSrcScalarPerVector_max >=
+                         kCShuffleBlockTransferScalarPerVector_max)
+            {
+                ALIGN_SWITCH_2(kABBlockTransferSrcScalarPerVector_max,
+                               kABBlockTransferSrcScalarPerVector,
+                               param.K,
+                               kB1BlockTransferSrcScalarPerVector_max,
+                               kB1BlockTransferSrcScalarPerVector,
+                               param.Kv,
+                               [&] {
+                                   constexpr ck::index_t kCShuffleBlockTransferScalarPerVector =
+                                       min(kB1BlockTransferSrcScalarPerVector,
+                                           kCShuffleBlockTransferScalarPerVector_max);
+                                   using DeviceOpInstance =
+                                       DeviceOpInstanceTemp<kGemm1NPerBlock,
+                                                            kGemm1NXdlPerWave,
+                                                            kCShuffleNXdlPerWavePerShuffle,
+                                                            kABBlockTransferSrcScalarPerVector,
+                                                            kB1BlockTransferSrcScalarPerVector,
+                                                            kCShuffleBlockTransferScalarPerVector>;
+
+                                   RunWithDeviceOp<DeviceOpInstance>(param, stream);
+                               });
+            }
+            else
+            {
+                ALIGN_SWITCH_2(kABBlockTransferSrcScalarPerVector_max,
+                               kABBlockTransferSrcScalarPerVector,
+                               param.K,
+                               kCShuffleBlockTransferScalarPerVector_max,
+                               kCShuffleBlockTransferScalarPerVector,
+                               param.Kv,
+                               [&] {
+                                   constexpr ck::index_t kB1BlockTransferSrcScalarPerVector =
+                                       min(kCShuffleBlockTransferScalarPerVector,
+                                           kB1BlockTransferSrcScalarPerVector_max);
+                                   using DeviceOpInstance =
+                                       DeviceOpInstanceTemp<kGemm1NPerBlock,
+                                                            kGemm1NXdlPerWave,
+                                                            kCShuffleNXdlPerWavePerShuffle,
+                                                            kABBlockTransferSrcScalarPerVector,
+                                                            kB1BlockTransferSrcScalarPerVector,
+                                                            kCShuffleBlockTransferScalarPerVector>;
+
+                                   RunWithDeviceOp<DeviceOpInstance>(param, stream);
+                               });
+            };
+        });
+    };
+
+    template <typename DeviceOpInstance>
+    static void RunWithDeviceOp(GroupedForwardParams& param, hipStream_t stream)
+    {
+        std::vector<typename DeviceOpInstance::ProblemDesc> problem_descs;
+
+        for(std::size_t i = 0; i < param.num_batches; i++)
+        {
+            int M = param.host_seqstart_q[i + 1] - param.host_seqstart_q[i];
+            int N = param.host_seqlen_k.empty()
+                        ? param.host_seqstart_k[i + 1] - param.host_seqstart_k[i]
+                        : param.host_seqlen_k[i];
+            int K    = param.K;
+            int Kv   = param.Kv;
+            int G1q  = param.Hq;
+            int G1kv = param.Hkv;
+
+            std::vector<ck::index_t> a_gs_ms_ks_lengths{1, G1q, M, K};
+            std::vector<ck::index_t> a_gs_ms_ks_strides{
+                0, param.q_strides[1], param.q_strides[0], param.q_strides[2]};
+
+            std::vector<ck::index_t> b0_gs_ns_ks_lengths{1, G1kv, N, K};
+            std::vector<ck::index_t> b0_gs_ns_ks_strides{
+                0, param.k_strides[1], param.k_strides[0], param.k_strides[2]};
+
+            // to be changed to b1_gs_ns_os_lengths
+            std::vector<ck::index_t> b1_gs_os_ns_lengths{1, G1kv, Kv, N};
+            std::vector<ck::index_t> b1_gs_os_ns_strides{
+                0, param.v_strides[1], param.v_strides[2], param.v_strides[0]};
+
+            std::vector<ck::index_t> c_gs_ms_os_lengths{1, G1q, M, Kv};
+            std::vector<ck::index_t> c_gs_ms_os_strides{
+                0, param.out_strides[1], param.out_strides[0], param.out_strides[2]};
+
+            std::vector<ck::index_t> d_gs_ms_ns_lengths;
+            std::vector<ck::index_t> d_gs_ms_ns_strides;
+
+            if constexpr(has_attn_bias)
+            {
+                d_gs_ms_ns_lengths = {1, G1q, M, N};
+                d_gs_ms_ns_strides = {0,
+                                      param.attn_bias_strides[0],
+                                      param.attn_bias_strides[1],
+                                      param.attn_bias_strides[2]};
+            }
+            else
+            {
+                d_gs_ms_ns_lengths = {1, 1, 1, 1};
+                d_gs_ms_ns_strides = {0, 0, 0, 0};
+            };
+
+            problem_descs.push_back({a_gs_ms_ks_lengths,
+                                     a_gs_ms_ks_strides,
+                                     b0_gs_ns_ks_lengths,
+                                     b0_gs_ns_ks_strides,
+                                     b1_gs_os_ns_lengths,
+                                     b1_gs_os_ns_strides,
+                                     c_gs_ms_os_lengths,
+                                     c_gs_ms_os_strides,
+                                     d_gs_ms_ns_lengths,
+                                     d_gs_ms_ns_strides,
+                                     {},   // acc1_bias_gs_ms_os_lengths
+                                     {}}); // acc1_bias_gs_ms_os_strides
+        }
+
+        float alpha = param.scale;
+
+        auto a_element_op    = AElementOp{};
+        auto b0_element_op   = B0ElementOp{};
+        auto acc0_element_op = Acc0ElementOp{alpha};
+        auto b1_element_op   = B1ElementOp{};
+        auto c_element_op    = CElementOp{};
+
+        auto op      = DeviceOpInstance{};
+        auto invoker = op.MakeInvoker();
+
+        auto arg_ptr = op.MakeArgumentPointer(param.q_ptrs,
+                                              param.k_ptrs,
+                                              param.v_ptrs,
+                                              param.out_ptrs,
+                                              param.attn_bias_ptrs,
+                                              {}, // p_acc1_biases
+                                              problem_descs,
+                                              a_element_op,
+                                              b0_element_op,
+                                              acc0_element_op,
+                                              b1_element_op,
+                                              c_element_op);
+
+        auto sizeInBytes = op.GetWorkSpaceSize(arg_ptr.get());
+
+        SimpleDeviceMem workspace(sizeInBytes);
+
+        op.SetWorkSpacePointer(arg_ptr.get(), workspace.GetDeviceBuffer());
+
+        if(!op.IsSupportedArgument(arg_ptr.get()))
+        {
+            std::ostringstream ostr;
+
+            ostr << op.GetTypeString() << " does not support this problem";
+
+            throw std::runtime_error(ostr.str());
+        }
+
+        (void)invoker.Run(arg_ptr.get(), StreamConfig{stream, false});
+    };
 };
 
 template <typename scalar_t, int32_t custom_mask_type, bool has_attn_bias>
-void run_grouped_infer_masktype_attnbias_dispatched(
-    GroupedForwardParams& param,
-    hipStream_t stream) {
-  grouped_infer_masktype_attnbias_dispatched<
-      scalar_t,
-      custom_mask_type,
-      has_attn_bias>::Run(param, stream);
+void run_grouped_infer_masktype_attnbias_dispatched(GroupedForwardParams& param, hipStream_t stream)
+{
+    grouped_infer_masktype_attnbias_dispatched<scalar_t, custom_mask_type, has_attn_bias>::Run(
+        param, stream);
 };
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_infer_bp16.cpp b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_infer_bp16.cpp
index 56c974264..5d91ad4a1 100644
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_infer_bp16.cpp
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_infer_bp16.cpp
@@ -1,57 +1,52 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
 #include <ck/ck.hpp>
 #include <stdexcept>
 
 #include "ck_bool_switch.h"
 #include "ck_fmha_grouped_infer.h"
 
-extern template void run_grouped_infer_masktype_attnbias_dispatched<
-    ck::bhalf_t,
-    0,
-    true>(GroupedForwardParams& param, hipStream_t stream);
-
-extern template void run_grouped_infer_masktype_attnbias_dispatched<
-    ck::bhalf_t,
-    0,
-    false>(GroupedForwardParams& param, hipStream_t stream);
-
-extern template void run_grouped_infer_masktype_attnbias_dispatched<
-    ck::bhalf_t,
-    1,
-    true>(GroupedForwardParams& param, hipStream_t stream);
-
-extern template void run_grouped_infer_masktype_attnbias_dispatched<
-    ck::bhalf_t,
-    1,
-    false>(GroupedForwardParams& param, hipStream_t stream);
-
-extern template void run_grouped_infer_masktype_attnbias_dispatched<
-    ck::bhalf_t,
-    2,
-    true>(GroupedForwardParams& param, hipStream_t stream);
-
-extern template void run_grouped_infer_masktype_attnbias_dispatched<
-    ck::bhalf_t,
-    2,
-    false>(GroupedForwardParams& param, hipStream_t stream);
-
-void grouped_infer_bp16(GroupedForwardParams& param, hipStream_t stream) {
-  BOOL_SWITCH_1(param.has_attn_bias, HAS_ATTN_BIAS, [&] {
-    if (param.custom_mask_type == 0)
-      run_grouped_infer_masktype_attnbias_dispatched<
-          ck::bhalf_t,
-          0,
-          HAS_ATTN_BIAS>(param, stream);
-    else if (param.custom_mask_type == 1)
-      run_grouped_infer_masktype_attnbias_dispatched<
-          ck::bhalf_t,
-          1,
-          HAS_ATTN_BIAS>(param, stream);
-    else if (param.custom_mask_type == 2)
-      run_grouped_infer_masktype_attnbias_dispatched<
-          ck::bhalf_t,
-          2,
-          HAS_ATTN_BIAS>(param, stream);
-    else
-      throw std::runtime_error("Invalid custom_mask_type value");
-  });
+extern template void
+run_grouped_infer_masktype_attnbias_dispatched<ck::bhalf_t, 0, true>(GroupedForwardParams& param,
+                                                                     hipStream_t stream);
+
+extern template void
+run_grouped_infer_masktype_attnbias_dispatched<ck::bhalf_t, 0, false>(GroupedForwardParams& param,
+                                                                      hipStream_t stream);
+
+extern template void
+run_grouped_infer_masktype_attnbias_dispatched<ck::bhalf_t, 1, true>(GroupedForwardParams& param,
+                                                                     hipStream_t stream);
+
+extern template void
+run_grouped_infer_masktype_attnbias_dispatched<ck::bhalf_t, 1, false>(GroupedForwardParams& param,
+                                                                      hipStream_t stream);
+
+extern template void
+run_grouped_infer_masktype_attnbias_dispatched<ck::bhalf_t, 2, true>(GroupedForwardParams& param,
+                                                                     hipStream_t stream);
+
+extern template void
+run_grouped_infer_masktype_attnbias_dispatched<ck::bhalf_t, 2, false>(GroupedForwardParams& param,
+                                                                      hipStream_t stream);
+
+void grouped_infer_bp16(GroupedForwardParams& param, hipStream_t stream)
+{
+    BOOL_SWITCH_1(param.has_attn_bias, HAS_ATTN_BIAS, [&] {
+        if(param.custom_mask_type == 0)
+            run_grouped_infer_masktype_attnbias_dispatched<ck::bhalf_t, 0, HAS_ATTN_BIAS>(param,
+                                                                                          stream);
+        else if(param.custom_mask_type == 1)
+            run_grouped_infer_masktype_attnbias_dispatched<ck::bhalf_t, 1, HAS_ATTN_BIAS>(param,
+                                                                                          stream);
+        else if(param.custom_mask_type == 2)
+            run_grouped_infer_masktype_attnbias_dispatched<ck::bhalf_t, 2, HAS_ATTN_BIAS>(param,
+                                                                                          stream);
+        else
+            throw std::runtime_error("Invalid custom_mask_type value");
+    });
 };
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_infer_fp16.cpp b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_infer_fp16.cpp
index 0ca1c3eba..cd7dbb977 100644
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_infer_fp16.cpp
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_infer_fp16.cpp
@@ -1,57 +1,52 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
 #include <ck/ck.hpp>
 #include <stdexcept>
 
 #include "ck_bool_switch.h"
 #include "ck_fmha_grouped_infer.h"
 
-extern template void run_grouped_infer_masktype_attnbias_dispatched<
-    ck::half_t,
-    0,
-    true>(GroupedForwardParams& param, hipStream_t stream);
-
-extern template void run_grouped_infer_masktype_attnbias_dispatched<
-    ck::half_t,
-    0,
-    false>(GroupedForwardParams& param, hipStream_t stream);
-
-extern template void run_grouped_infer_masktype_attnbias_dispatched<
-    ck::half_t,
-    1,
-    true>(GroupedForwardParams& param, hipStream_t stream);
-
-extern template void run_grouped_infer_masktype_attnbias_dispatched<
-    ck::half_t,
-    1,
-    false>(GroupedForwardParams& param, hipStream_t stream);
-
-extern template void run_grouped_infer_masktype_attnbias_dispatched<
-    ck::half_t,
-    2,
-    true>(GroupedForwardParams& param, hipStream_t stream);
-
-extern template void run_grouped_infer_masktype_attnbias_dispatched<
-    ck::half_t,
-    2,
-    false>(GroupedForwardParams& param, hipStream_t stream);
-
-void grouped_infer_fp16(GroupedForwardParams& param, hipStream_t stream) {
-  BOOL_SWITCH_1(param.has_attn_bias, HAS_ATTN_BIAS, [&] {
-    if (param.custom_mask_type == 0)
-      run_grouped_infer_masktype_attnbias_dispatched<
-          ck::half_t,
-          0,
-          HAS_ATTN_BIAS>(param, stream);
-    else if (param.custom_mask_type == 1)
-      run_grouped_infer_masktype_attnbias_dispatched<
-          ck::half_t,
-          1,
-          HAS_ATTN_BIAS>(param, stream);
-    else if (param.custom_mask_type == 2)
-      run_grouped_infer_masktype_attnbias_dispatched<
-          ck::half_t,
-          2,
-          HAS_ATTN_BIAS>(param, stream);
-    else
-      throw std::runtime_error("Invalid custom_mask_type value");
-  });
+extern template void
+run_grouped_infer_masktype_attnbias_dispatched<ck::half_t, 0, true>(GroupedForwardParams& param,
+                                                                    hipStream_t stream);
+
+extern template void
+run_grouped_infer_masktype_attnbias_dispatched<ck::half_t, 0, false>(GroupedForwardParams& param,
+                                                                     hipStream_t stream);
+
+extern template void
+run_grouped_infer_masktype_attnbias_dispatched<ck::half_t, 1, true>(GroupedForwardParams& param,
+                                                                    hipStream_t stream);
+
+extern template void
+run_grouped_infer_masktype_attnbias_dispatched<ck::half_t, 1, false>(GroupedForwardParams& param,
+                                                                     hipStream_t stream);
+
+extern template void
+run_grouped_infer_masktype_attnbias_dispatched<ck::half_t, 2, true>(GroupedForwardParams& param,
+                                                                    hipStream_t stream);
+
+extern template void
+run_grouped_infer_masktype_attnbias_dispatched<ck::half_t, 2, false>(GroupedForwardParams& param,
+                                                                     hipStream_t stream);
+
+void grouped_infer_fp16(GroupedForwardParams& param, hipStream_t stream)
+{
+    BOOL_SWITCH_1(param.has_attn_bias, HAS_ATTN_BIAS, [&] {
+        if(param.custom_mask_type == 0)
+            run_grouped_infer_masktype_attnbias_dispatched<ck::half_t, 0, HAS_ATTN_BIAS>(param,
+                                                                                         stream);
+        else if(param.custom_mask_type == 1)
+            run_grouped_infer_masktype_attnbias_dispatched<ck::half_t, 1, HAS_ATTN_BIAS>(param,
+                                                                                         stream);
+        else if(param.custom_mask_type == 2)
+            run_grouped_infer_masktype_attnbias_dispatched<ck::half_t, 2, HAS_ATTN_BIAS>(param,
+                                                                                         stream);
+        else
+            throw std::runtime_error("Invalid custom_mask_type value");
+    });
 };
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_infer_gemm_constants.h b/xformers/csrc/attention/hip_fmha/ck_fmha_infer_gemm_constants.h
index bdeb5ef85..0b7708fe0 100644
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_infer_gemm_constants.h
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_infer_gemm_constants.h
@@ -1,3 +1,9 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
 #pragma once
 
 #include <ck/ck.hpp>
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_op_helper.h b/xformers/csrc/attention/hip_fmha/ck_fmha_op_helper.h
index 84d585a29..f9cd1a49c 100644
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_op_helper.h
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_op_helper.h
@@ -1,3 +1,9 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
 #pragma once
 
 #include <map>
@@ -7,33 +13,34 @@
 #include <ck/utility/data_type.hpp>
 
 template <typename scalar_t>
-struct MaxVectorSizeForType {
-  static constexpr int value = 4;
+struct MaxVectorSizeForType
+{
+    static constexpr int value = 4;
 };
 
 template <>
-struct MaxVectorSizeForType<ck::half_t> {
-  static constexpr int value = 8;
+struct MaxVectorSizeForType<ck::half_t>
+{
+    static constexpr int value = 8;
 };
 
 template <>
-struct MaxVectorSizeForType<ck::bhalf_t> {
-  static constexpr int value = 8;
+struct MaxVectorSizeForType<ck::bhalf_t>
+{
+    static constexpr int value = 8;
 };
 
-struct SimpleDeviceMem {
-  SimpleDeviceMem() = delete;
-  SimpleDeviceMem(size_t sizeInBytes) {
-    pData_ = c10::hip::HIPCachingAllocator::raw_alloc(sizeInBytes);
-  }
-  void* GetDeviceBuffer() {
-    return pData_;
-  }
-  ~SimpleDeviceMem() {
-    c10::cuda::HIPCachingAllocator::raw_delete(pData_);
-  }
-
-  void* pData_;
+struct SimpleDeviceMem
+{
+    SimpleDeviceMem() = delete;
+    SimpleDeviceMem(size_t sizeInBytes)
+    {
+        pData_ = c10::hip::HIPCachingAllocator::raw_alloc(sizeInBytes);
+    }
+    void* GetDeviceBuffer() { return pData_; }
+    ~SimpleDeviceMem() { c10::cuda::HIPCachingAllocator::raw_delete(pData_); }
+
+    void* pData_;
 };
 
 // useful aliasing for making the codes easy
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_params.h b/xformers/csrc/attention/hip_fmha/ck_fmha_params.h
index 7f86dd904..a741d28b9 100644
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_params.h
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_params.h
@@ -1,206 +1,218 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
 #pragma once
 
 #include <array>
 #include <cstdint>
 
-struct BatchedInferParams {
-  int B; // batch size
-  int M; // seq_len for Query
-  int N; // seq_len for Key and Value
-  int Hq; // number of heads for Query
-  int Hkv; // number of heads for Key and Value
-  int K; // embed_dim for Query and Key
-  int Kv; // embed_dim for Value
-
-  float scale;
-  bool has_attn_bias;
-
-  // BMHK mode strides
-  std::array<int, 4> q_strides;
-  std::array<int, 4> k_strides;
-  std::array<int, 4> v_strides;
-  std::array<int, 4> out_strides;
-  std::array<int, 4> attn_bias_strides; // 4d tensor_view [B, H, M, N]
-
-  const void* q_ptr;
-  const void* k_ptr;
-  const void* v_ptr;
-  const void* attn_bias_ptr;
-
-  uint8_t custom_mask_type;
-
-  void* out_ptr;
+struct BatchedInferParams
+{
+    int B;   // batch size
+    int M;   // seq_len for Query
+    int N;   // seq_len for Key and Value
+    int Hq;  // number of heads for Query
+    int Hkv; // number of heads for Key and Value
+    int K;   // embed_dim for Query and Key
+    int Kv;  // embed_dim for Value
+
+    float scale;
+    bool has_attn_bias;
+
+    // BMHK mode strides
+    std::array<int, 4> q_strides;
+    std::array<int, 4> k_strides;
+    std::array<int, 4> v_strides;
+    std::array<int, 4> out_strides;
+    std::array<int, 4> attn_bias_strides; // 4d tensor_view [B, H, M, N]
+
+    const void* q_ptr;
+    const void* k_ptr;
+    const void* v_ptr;
+    const void* attn_bias_ptr;
+
+    uint8_t custom_mask_type;
+
+    void* out_ptr;
 };
 
-struct BatchedForwardParams : public BatchedInferParams {
-  bool use_dropout;
-  bool compute_logsumexp;
+struct BatchedForwardParams : public BatchedInferParams
+{
+    bool use_dropout;
+    bool compute_logsumexp;
 
-  float dropout_prob;
-  int64_t philox_seed;
-  int64_t philox_offset;
+    float dropout_prob;
+    int64_t philox_seed;
+    int64_t philox_offset;
 
-  // completely contiguous
-  void* logsumexp_ptr;
+    // completely contiguous
+    void* logsumexp_ptr;
 };
 
-struct GroupedInferParams {
-  int num_batches;
-  int M; // total seq_len for all queries in the batch
-  int N; // total seq_len for all keys/values in the batch
-  int Hq; // number of heads for Query
-  int Hkv; // number of heads for Key and Value
-  int K; // embed_dim for Query and Key
-  int Kv; // embed_dim for Value
-
-  int max_seqlen_q;
-
-  std::vector<int> host_seqstart_q;
-  std::vector<int> host_seqstart_k;
-  std::vector<int> host_seqlen_k;
-
-  float scale;
-  bool has_attn_bias;
-
-  // MHK mode strides, last-dim contiguous
-  std::array<int, 3> q_strides;
-  std::array<int, 3> k_strides;
-  std::array<int, 3> v_strides;
-  std::array<int, 3> out_strides;
-
-  // 4d tensor view [B, H, M, N]
-  std::array<int, 4> attn_bias_strides;
-
-  std::vector<const void*> q_ptrs;
-  std::vector<const void*> k_ptrs;
-  std::vector<const void*> v_ptrs;
-  std::vector<const void*> attn_bias_ptrs;
-  std::vector<void*> out_ptrs;
-
-  uint8_t custom_mask_type;
+struct GroupedInferParams
+{
+    int num_batches;
+    int M;   // total seq_len for all queries in the batch
+    int N;   // total seq_len for all keys/values in the batch
+    int Hq;  // number of heads for Query
+    int Hkv; // number of heads for Key and Value
+    int K;   // embed_dim for Query and Key
+    int Kv;  // embed_dim for Value
+
+    int max_seqlen_q;
+
+    std::vector<int> host_seqstart_q;
+    std::vector<int> host_seqstart_k;
+    std::vector<int> host_seqlen_k;
+
+    float scale;
+    bool has_attn_bias;
+
+    // MHK mode strides, last-dim contiguous
+    std::array<int, 3> q_strides;
+    std::array<int, 3> k_strides;
+    std::array<int, 3> v_strides;
+    std::array<int, 3> out_strides;
+
+    // 4d tensor view [B, H, M, N]
+    std::array<int, 4> attn_bias_strides;
+
+    std::vector<const void*> q_ptrs;
+    std::vector<const void*> k_ptrs;
+    std::vector<const void*> v_ptrs;
+    std::vector<const void*> attn_bias_ptrs;
+    std::vector<void*> out_ptrs;
+
+    uint8_t custom_mask_type;
 };
 
-struct GroupedForwardParams : public GroupedInferParams {
-  bool use_dropout;
-  bool compute_logsumexp;
+struct GroupedForwardParams : public GroupedInferParams
+{
+    bool use_dropout;
+    bool compute_logsumexp;
 
-  float dropout_prob;
-  int64_t philox_seed;
-  int64_t philox_offset;
+    float dropout_prob;
+    int64_t philox_seed;
+    int64_t philox_offset;
 
-  // completely contiguous
-  std::vector<void*> logsumexp_ptrs;
+    // completely contiguous
+    std::vector<void*> logsumexp_ptrs;
 
-  // TODO: need remove this after dev-op fix
-  std::vector<void*> randvals_ptrs;
+    // TODO: need remove this after dev-op fix
+    std::vector<void*> randvals_ptrs;
 };
 
-struct BatchedBackwardParams {
-  int B; // batch size
-  int M; // seq_len for Query
-  int N; // seq_len for Key and Value
-  int Hq; // number of heads for Query
-  int Hkv; // number of heads for Key and Value
-  int K; // embed_dim for Query and Key
-  int Kv; // embed_dim for Value
-
-  float scale;
-  bool has_attn_bias;
-  bool bias_has_grad;
-
-  bool use_fp32_qkv_grad;
-  bool is_mqa_gqa;
-
-  // BMHK mode strides, last-dim contiguous
-  std::array<int, 4> q_strides;
-  std::array<int, 4> k_strides;
-  std::array<int, 4> v_strides;
-  std::array<int, 4> attn_bias_strides; // 4d tensor_view [B, H, M, N]
-  std::array<int, 4> out_strides;
-
-  std::array<int, 4> tmp_grad_k_strides;
-  std::array<int, 4> tmp_grad_v_strides;
-
-  const void* q_ptr;
-  const void* k_ptr;
-  const void* v_ptr;
-  const void* attn_bias_ptr;
-  const void* grad_out_ptr;
-  const void* out_ptr;
-
-  uint8_t custom_mask_type;
-
-  void* grad_q_ptr;
-  void* grad_k_ptr;
-  void* grad_v_ptr;
-  void* grad_bias_ptr;
-
-  float dropout_prob;
-  int64_t philox_seed;
-  int64_t philox_offset;
-
-  // BHM mode lengths, completely contiguous
-  const void* logsumexp_ptr;
+struct BatchedBackwardParams
+{
+    int B;   // batch size
+    int M;   // seq_len for Query
+    int N;   // seq_len for Key and Value
+    int Hq;  // number of heads for Query
+    int Hkv; // number of heads for Key and Value
+    int K;   // embed_dim for Query and Key
+    int Kv;  // embed_dim for Value
+
+    float scale;
+    bool has_attn_bias;
+    bool bias_has_grad;
+
+    bool use_fp32_qkv_grad;
+    bool is_mqa_gqa;
+
+    // BMHK mode strides, last-dim contiguous
+    std::array<int, 4> q_strides;
+    std::array<int, 4> k_strides;
+    std::array<int, 4> v_strides;
+    std::array<int, 4> attn_bias_strides; // 4d tensor_view [B, H, M, N]
+    std::array<int, 4> out_strides;
+
+    std::array<int, 4> tmp_grad_k_strides;
+    std::array<int, 4> tmp_grad_v_strides;
+
+    const void* q_ptr;
+    const void* k_ptr;
+    const void* v_ptr;
+    const void* attn_bias_ptr;
+    const void* grad_out_ptr;
+    const void* out_ptr;
+
+    uint8_t custom_mask_type;
+
+    void* grad_q_ptr;
+    void* grad_k_ptr;
+    void* grad_v_ptr;
+    void* grad_bias_ptr;
+
+    float dropout_prob;
+    int64_t philox_seed;
+    int64_t philox_offset;
+
+    // BHM mode lengths, completely contiguous
+    const void* logsumexp_ptr;
 };
 
-struct GroupedBackwardParams {
-  int num_batches;
-  int M; // total seq_len for all queries in the batch
-  int N; // total seq_len for all keys/values in the batch
-  int Hq; // number of heads for Query
-  int Hkv; // number of heads for Key and Value
-  int K; // embed_dim for Query and Key
-  int Kv; // embed_dim for Value
-
-  int max_seqlen_q;
-
-  std::vector<int> host_seqstart_q;
-  std::vector<int> host_seqstart_k;
-  std::vector<int> host_seqlen_k;
-
-  float scale;
-  bool has_attn_bias;
-  bool bias_has_grad;
-
-  bool use_fp32_qkv_grad;
-  bool is_mqa_gqa;
-
-  // MHK mode strides, last-dim contiguous
-  std::array<int, 3> q_strides;
-  std::array<int, 3> k_strides;
-  std::array<int, 3> v_strides;
-  std::array<int, 3> out_strides;
-  // 4d tensor view [B, H, M, N]
-  std::array<int, 4> attn_bias_strides;
-
-  std::array<int, 3> tmp_grad_k_strides;
-  std::array<int, 3> tmp_grad_v_strides;
-
-  std::vector<const void*> q_ptrs;
-  std::vector<const void*> k_ptrs;
-  std::vector<const void*> v_ptrs;
-  std::vector<const void*> attn_bias_ptrs;
-  std::vector<const void*> grad_out_ptrs;
-  std::vector<const void*> out_ptrs;
-
-  // used by the light_v2 kernel
-  // TODO use these as workspace
-  std::vector<void*> ydotdy_ptrs;
-
-  uint8_t custom_mask_type;
-
-  std::vector<void*> grad_q_ptrs;
-  std::vector<void*> grad_k_ptrs;
-  std::vector<void*> grad_v_ptrs;
-  std::vector<void*> grad_bias_ptrs;
-
-  float dropout_prob;
-  int64_t philox_seed;
-  int64_t philox_offset;
-
-  // BHM mode lengths, completely contiguous
-  std::vector<const void*> logsumexp_ptrs;
-
-  // TODO: need remove this after dev-op fix
-  std::vector<void*> randvals_ptrs;
+struct GroupedBackwardParams
+{
+    int num_batches;
+    int M;   // total seq_len for all queries in the batch
+    int N;   // total seq_len for all keys/values in the batch
+    int Hq;  // number of heads for Query
+    int Hkv; // number of heads for Key and Value
+    int K;   // embed_dim for Query and Key
+    int Kv;  // embed_dim for Value
+
+    int max_seqlen_q;
+
+    std::vector<int> host_seqstart_q;
+    std::vector<int> host_seqstart_k;
+    std::vector<int> host_seqlen_k;
+
+    float scale;
+    bool has_attn_bias;
+    bool bias_has_grad;
+
+    bool use_fp32_qkv_grad;
+    bool is_mqa_gqa;
+
+    // MHK mode strides, last-dim contiguous
+    std::array<int, 3> q_strides;
+    std::array<int, 3> k_strides;
+    std::array<int, 3> v_strides;
+    std::array<int, 3> out_strides;
+    // 4d tensor view [B, H, M, N]
+    std::array<int, 4> attn_bias_strides;
+
+    std::array<int, 3> tmp_grad_k_strides;
+    std::array<int, 3> tmp_grad_v_strides;
+
+    std::vector<const void*> q_ptrs;
+    std::vector<const void*> k_ptrs;
+    std::vector<const void*> v_ptrs;
+    std::vector<const void*> attn_bias_ptrs;
+    std::vector<const void*> grad_out_ptrs;
+    std::vector<const void*> out_ptrs;
+
+    // used by the light_v2 kernel
+    // TODO use these as workspace
+    std::vector<void*> ydotdy_ptrs;
+
+    uint8_t custom_mask_type;
+
+    std::vector<void*> grad_q_ptrs;
+    std::vector<void*> grad_k_ptrs;
+    std::vector<void*> grad_v_ptrs;
+    std::vector<void*> grad_bias_ptrs;
+
+    float dropout_prob;
+    int64_t philox_seed;
+    int64_t philox_offset;
+
+    // BHM mode lengths, completely contiguous
+    std::vector<const void*> logsumexp_ptrs;
+
+    // TODO: need remove this after dev-op fix
+    std::vector<void*> randvals_ptrs;
 };
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_test.cpp b/xformers/csrc/attention/hip_fmha/ck_fmha_test.cpp
index 1b451b5f9..6c7de39ef 100644
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_test.cpp
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_test.cpp
@@ -1,3 +1,9 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
 #include <iostream>
 
 #include <torch/library.h>
@@ -5,17 +11,16 @@
 namespace {
 
 // For testing xFormers building and binding
-bool is_ck_fmha_available(double val) {
-  std::cout << "ck fmha is really here, val=" << val << std::endl;
-  return (true);
+bool is_ck_fmha_available(double val)
+{
+    std::cout << "ck fmha is really here, val=" << val << std::endl;
+    return (true);
 };
 
 } // namespace
 
-TORCH_LIBRARY_FRAGMENT(xformers, m) {
-  m.def(TORCH_SELECTIVE_SCHEMA(
-      "xformers::is_ck_fmha_available(float val) -> bool"));
-  m.impl(
-      TORCH_SELECTIVE_NAME("xformers::is_ck_fmha_available"),
-      TORCH_FN(is_ck_fmha_available));
+TORCH_LIBRARY_FRAGMENT(xformers, m)
+{
+    m.def(TORCH_SELECTIVE_SCHEMA("xformers::is_ck_fmha_available(float val) -> bool"));
+    m.impl(TORCH_SELECTIVE_NAME("xformers::is_ck_fmha_available"), TORCH_FN(is_ck_fmha_available));
 }
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_util.h b/xformers/csrc/attention/hip_fmha/ck_fmha_util.h
index 5de869db0..8f26e4cee 100644
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_util.h
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_util.h
@@ -1,3 +1,9 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
 #pragma once
 
 #include <cstdint>
@@ -11,99 +17,114 @@
 #include <ck/utility/data_type.hpp>
 #include <ck/utility/sequence.hpp>
 
-#define XFORMERS_CHECK(COND, ERR)          \
-  if (!(COND)) {                           \
-    std::ostringstream ostr;               \
-    ostr << "'" #COND "' failed: " << ERR; \
-    throw std::runtime_error(ostr.str());  \
-  }
-
-#define DISPATCH_TYPES(InDataType, func)                                 \
-  {                                                                      \
-    if (InDataType == at::ScalarType::Half) {                            \
-      using scalar_t = ck::half_t;                                       \
-      func();                                                            \
-    } else if (InDataType == at::ScalarType::BFloat16) {                 \
-      using scalar_t = ck::bhalf_t;                                      \
-      func();                                                            \
-    } else {                                                             \
-      XFORMERS_CHECK(                                                    \
-          false, "Only half & bf16 input type supported at the moment"); \
-    }                                                                    \
-  }
+#define XFORMERS_CHECK(COND, ERR)              \
+    if(!(COND))                                \
+    {                                          \
+        std::ostringstream ostr;               \
+        ostr << "'" #COND "' failed: " << ERR; \
+        throw std::runtime_error(ostr.str());  \
+    }
+
+#define DISPATCH_TYPES(InDataType, func)                                                  \
+    {                                                                                     \
+        if(InDataType == at::ScalarType::Half)                                            \
+        {                                                                                 \
+            using scalar_t = ck::half_t;                                                  \
+            func();                                                                       \
+        }                                                                                 \
+        else if(InDataType == at::ScalarType::BFloat16)                                   \
+        {                                                                                 \
+            using scalar_t = ck::bhalf_t;                                                 \
+            func();                                                                       \
+        }                                                                                 \
+        else                                                                              \
+        {                                                                                 \
+            XFORMERS_CHECK(false, "Only half & bf16 input type supported at the moment"); \
+        }                                                                                 \
+    }
 
 template <typename scalar_t>
 struct CkToAtenDtype;
 
 template <>
-struct CkToAtenDtype<ck::half_t> {
-  using scalar_t = ck::half_t;
+struct CkToAtenDtype<ck::half_t>
+{
+    using scalar_t = ck::half_t;
 
-  static constexpr __host__ at::ScalarType atScalarType() {
-    return at::ScalarType::Half;
-  }
+    static constexpr __host__ at::ScalarType atScalarType() { return at::ScalarType::Half; }
 };
 
 template <>
-struct CkToAtenDtype<ck::bhalf_t> {
-  using scalar_t = ck::bhalf_t;
+struct CkToAtenDtype<ck::bhalf_t>
+{
+    using scalar_t = ck::bhalf_t;
 
-  static constexpr __host__ at::ScalarType atScalarType() {
-    return at::ScalarType::BFloat16;
-  }
+    static constexpr __host__ at::ScalarType atScalarType() { return at::ScalarType::BFloat16; }
 };
 
 template <>
-struct CkToAtenDtype<float> {
-  using scalar_t = float;
+struct CkToAtenDtype<float>
+{
+    using scalar_t = float;
 
-  static constexpr __host__ at::ScalarType atScalarType() {
-    return at::ScalarType::Float;
-  }
+    static constexpr __host__ at::ScalarType atScalarType() { return at::ScalarType::Float; }
 };
 
-#define CHECK_NOSPARSE_CONTIGUOUS_CUDA(TENSOR)                            \
-  XFORMERS_CHECK(TENSOR.is_cuda(), #TENSOR " must be a CUDA tensor");     \
-  XFORMERS_CHECK(!TENSOR.is_sparse(), #TENSOR " must be a dense tensor"); \
-  XFORMERS_CHECK(TENSOR.is_contiguous(), #TENSOR " must be contiguous");
-
-#define CHECK_NOSPARSE_CONTIGUOUS_CPU(TENSOR)                             \
-  XFORMERS_CHECK(TENSOR.is_cpu(), #TENSOR " must be a CPU tensor");       \
-  XFORMERS_CHECK(!TENSOR.is_sparse(), #TENSOR " must be a dense tensor"); \
-  XFORMERS_CHECK(TENSOR.is_contiguous(), #TENSOR " must be contiguous");
-
-#define CHECK_NOSPARSE_LASTCONTIGUOUS_CUDA(TENSOR)                        \
-  XFORMERS_CHECK(TENSOR.is_cuda(), #TENSOR " must be a CUDA tensor");     \
-  XFORMERS_CHECK(!TENSOR.is_sparse(), #TENSOR " must be a dense tensor"); \
-  XFORMERS_CHECK(                                                         \
-      TENSOR.stride(-1) == 1, #TENSOR ": last dimension must be contiguous");
-
-#define HIP_CALL_CHECK(flag)                                                 \
-  do {                                                                       \
-    hipError_t _tmpVal;                                                      \
-    if ((_tmpVal = flag) != hipSuccess) {                                    \
-      std::ostringstream ostr;                                               \
-      ostr << "HIP Function Failed (" << __FILE__ << "," << __LINE__ << ") " \
-           << hipGetErrorString(_tmpVal);                                    \
-      throw std::runtime_error(ostr.str());                                  \
-    }                                                                        \
-  } while (0)
-
-static inline size_t get_size_in_bytes(size_t n, at::ScalarType dtype) {
-  if (dtype == at::ScalarType::Float) {
-    return n * 4;
-  } else if (dtype == at::ScalarType::Half) {
-    return n * 2;
-  } else if (dtype == at::ScalarType::BFloat16) {
-    return n * 2;
-  } else if (dtype == at::ScalarType::Short) {
-    return n * 2;
-  } else if (dtype == at::ScalarType::Int) {
-    return n * 4;
-  } else if (dtype == at::ScalarType::Byte) {
-    return n;
-  }
-  return 0;
+#define CHECK_NOSPARSE_CONTIGUOUS_CUDA(TENSOR)                              \
+    XFORMERS_CHECK(TENSOR.is_cuda(), #TENSOR " must be a CUDA tensor");     \
+    XFORMERS_CHECK(!TENSOR.is_sparse(), #TENSOR " must be a dense tensor"); \
+    XFORMERS_CHECK(TENSOR.is_contiguous(), #TENSOR " must be contiguous");
+
+#define CHECK_NOSPARSE_CONTIGUOUS_CPU(TENSOR)                               \
+    XFORMERS_CHECK(TENSOR.is_cpu(), #TENSOR " must be a CPU tensor");       \
+    XFORMERS_CHECK(!TENSOR.is_sparse(), #TENSOR " must be a dense tensor"); \
+    XFORMERS_CHECK(TENSOR.is_contiguous(), #TENSOR " must be contiguous");
+
+#define CHECK_NOSPARSE_LASTCONTIGUOUS_CUDA(TENSOR)                          \
+    XFORMERS_CHECK(TENSOR.is_cuda(), #TENSOR " must be a CUDA tensor");     \
+    XFORMERS_CHECK(!TENSOR.is_sparse(), #TENSOR " must be a dense tensor"); \
+    XFORMERS_CHECK(TENSOR.stride(-1) == 1, #TENSOR ": last dimension must be contiguous");
+
+#define HIP_CALL_CHECK(flag)                                                       \
+    do                                                                             \
+    {                                                                              \
+        hipError_t _tmpVal;                                                        \
+        if((_tmpVal = flag) != hipSuccess)                                         \
+        {                                                                          \
+            std::ostringstream ostr;                                               \
+            ostr << "HIP Function Failed (" << __FILE__ << "," << __LINE__ << ") " \
+                 << hipGetErrorString(_tmpVal);                                    \
+            throw std::runtime_error(ostr.str());                                  \
+        }                                                                          \
+    } while(0)
+
+static inline size_t get_size_in_bytes(size_t n, at::ScalarType dtype)
+{
+    if(dtype == at::ScalarType::Float)
+    {
+        return n * 4;
+    }
+    else if(dtype == at::ScalarType::Half)
+    {
+        return n * 2;
+    }
+    else if(dtype == at::ScalarType::BFloat16)
+    {
+        return n * 2;
+    }
+    else if(dtype == at::ScalarType::Short)
+    {
+        return n * 2;
+    }
+    else if(dtype == at::ScalarType::Int)
+    {
+        return n * 4;
+    }
+    else if(dtype == at::ScalarType::Byte)
+    {
+        return n;
+    }
+    return 0;
 }
 
 /**
@@ -117,36 +138,27 @@ static inline size_t get_size_in_bytes(size_t n, at::ScalarType dtype) {
  * expand the bias as needed - be careful to only create a view with different
  * shape/strides, no copies allowed.
  */
-inline at::Tensor get_bias_4d_view(
-    const at::Tensor& bias,
-    int batch_sz,
-    int n_heads,
-    int n_queries,
-    int n_keys) {
-  TORCH_CHECK(
-      bias.size(-2) == n_queries,
-      "bias.size(-2) != n_queries: ",
-      bias.size(-2),
-      " != ",
-      n_queries);
-  TORCH_CHECK(
-      bias.size(-1) == n_keys,
-      "bias.size(-1) != n_keys: ",
-      bias.size(-1),
-      " != ",
-      n_keys);
-  switch (bias.dim()) {
+inline at::Tensor
+get_bias_4d_view(const at::Tensor& bias, int batch_sz, int n_heads, int n_queries, int n_keys)
+{
+    TORCH_CHECK(bias.size(-2) == n_queries,
+                "bias.size(-2) != n_queries: ",
+                bias.size(-2),
+                " != ",
+                n_queries);
+    TORCH_CHECK(
+        bias.size(-1) == n_keys, "bias.size(-1) != n_keys: ", bias.size(-1), " != ", n_keys);
+    switch(bias.dim())
+    {
     case 2: // (n_queries, n_keys) - broadcast across all batches and heads
-      return bias.unsqueeze(0).unsqueeze(0).expand(
-          {batch_sz, n_heads, n_queries, n_keys});
+        return bias.unsqueeze(0).unsqueeze(0).expand({batch_sz, n_heads, n_queries, n_keys});
     case 3: // (batch_sz * n_heads, n_queries, n_keys) - just reshape
-      TORCH_CHECK(bias.size(0) == batch_sz * n_heads);
-      return bias.view({batch_sz, n_heads, n_queries, n_keys});
+        TORCH_CHECK(bias.size(0) == batch_sz * n_heads);
+        return bias.view({batch_sz, n_heads, n_queries, n_keys});
     case 4: // (batch_sz, n_heads, n_queries, n_keys) - do nothing
-      TORCH_CHECK(bias.size(0) == batch_sz);
-      TORCH_CHECK(bias.size(1) == n_heads)
-      return bias;
-    default:
-      TORCH_CHECK(false, "bias can only have ndims in {2, 3, 4}");
-  }
+        TORCH_CHECK(bias.size(0) == batch_sz);
+        TORCH_CHECK(bias.size(1) == n_heads)
+        return bias;
+    default: TORCH_CHECK(false, "bias can only have ndims in {2, 3, 4}");
+    }
 }
diff --git a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_infer.h b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_infer.h
index 5fd39201e..1a3d0fd65 100644
--- a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_infer.h
+++ b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_infer.h
@@ -1,3 +1,9 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
 #pragma once
 
 #include <optional>
diff --git a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_infer_fp16.cpp b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_infer_fp16.cpp
index 6dc443a7f..873d6b093 100644
--- a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_infer_fp16.cpp
+++ b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_infer_fp16.cpp
@@ -1,3 +1,9 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
 #include <ck/ck.hpp>
 #include <ck/utility/data_type.hpp>
 #include <stdexcept>
diff --git a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_definitions.h b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_definitions.h
index b4cbdbce2..ff91b9fa6 100644
--- a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_definitions.h
+++ b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_definitions.h
@@ -1,3 +1,9 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
 #pragma once
 
 #include <ck/tile_program/block_tile/block_masking_specialization.hpp>
diff --git a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_forward_kernel.h b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_forward_kernel.h
index 41eb3f748..29c13540a 100644
--- a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_forward_kernel.h
+++ b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_forward_kernel.h
@@ -1,3 +1,9 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
 #pragma once
 
 #include <optional>
diff --git a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_fwd_epilogue.h b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_fwd_epilogue.h
index 2289b09db..72c1c4a9b 100644
--- a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_fwd_epilogue.h
+++ b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_fwd_epilogue.h
@@ -1,3 +1,9 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
 #pragma once
 
 #include "ck/utility/common_header.hpp"
diff --git a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_fwd_tile_partitioner.h b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_fwd_tile_partitioner.h
index 5d95c96f7..7a3ab882f 100644
--- a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_fwd_tile_partitioner.h
+++ b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_fwd_tile_partitioner.h
@@ -1,3 +1,9 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
 #pragma once
 
 #include "ck/utility/common_header.hpp"
diff --git a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_infer.h b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_infer.h
index e1ad7b1a8..ba684f154 100644
--- a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_infer.h
+++ b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_infer.h
@@ -1,3 +1,9 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
 #pragma once
 
 #include <optional>
diff --git a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_infer_fp16.cpp b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_infer_fp16.cpp
index 659fd286b..eda9a6462 100644
--- a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_infer_fp16.cpp
+++ b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_infer_fp16.cpp
@@ -1,3 +1,9 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
 #include <ck/ck.hpp>
 #include <ck/utility/data_type.hpp>
 #include <stdexcept>
diff --git a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_params.h b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_params.h
index e07f711ac..0a988b6b2 100644
--- a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_params.h
+++ b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_params.h
@@ -1,207 +1,219 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
 #pragma once
 
 #include <array>
 #include <cstdint>
 
-struct BatchedInferParams {
-  int B; // batch size
-  int M; // seq_len for Query
-  int N; // seq_len for Key and Value
-  int Hq; // number of heads for Query
-  int Hkv; // number of heads for Key and Value
-  int K; // embed_dim for Query and Key
-  int Kv; // embed_dim for Value
-
-  float scale;
-  bool has_attn_bias;
-
-  // BMHK mode strides
-  std::array<int, 4> q_strides;
-  std::array<int, 4> k_strides;
-  std::array<int, 4> v_strides;
-  std::array<int, 4> out_strides;
-  std::array<int, 4> attn_bias_strides; // 4d tensor_view [B, H, M, N]
-
-  const void* q_ptr;
-  const void* k_ptr;
-  const void* v_ptr;
-  const void* attn_bias_ptr;
-
-  uint8_t custom_mask_type;
-
-  void* out_ptr;
+struct BatchedInferParams
+{
+    int B;   // batch size
+    int M;   // seq_len for Query
+    int N;   // seq_len for Key and Value
+    int Hq;  // number of heads for Query
+    int Hkv; // number of heads for Key and Value
+    int K;   // embed_dim for Query and Key
+    int Kv;  // embed_dim for Value
+
+    float scale;
+    bool has_attn_bias;
+
+    // BMHK mode strides
+    std::array<int, 4> q_strides;
+    std::array<int, 4> k_strides;
+    std::array<int, 4> v_strides;
+    std::array<int, 4> out_strides;
+    std::array<int, 4> attn_bias_strides; // 4d tensor_view [B, H, M, N]
+
+    const void* q_ptr;
+    const void* k_ptr;
+    const void* v_ptr;
+    const void* attn_bias_ptr;
+
+    uint8_t custom_mask_type;
+
+    void* out_ptr;
 };
 
-struct BatchedForwardParams : public BatchedInferParams {
-  bool use_dropout;
-  bool compute_logsumexp;
+struct BatchedForwardParams : public BatchedInferParams
+{
+    bool use_dropout;
+    bool compute_logsumexp;
 
-  float dropout_prob;
-  int64_t philox_seed;
-  int64_t philox_offset;
+    float dropout_prob;
+    int64_t philox_seed;
+    int64_t philox_offset;
 
-  // completely contiguous
-  void* logsumexp_ptr;
+    // completely contiguous
+    void* logsumexp_ptr;
 };
 
-struct GroupedInferParams {
-  int num_batches;
-  int M; // total seq_len for all queries in the batch
-  int N; // total seq_len for all keys/values in the batch
-  int Hq; // number of heads for Query
-  int Hkv; // number of heads for Key and Value
-  int K; // embed_dim for Query and Key
-  int Kv; // embed_dim for Value
+struct GroupedInferParams
+{
+    int num_batches;
+    int M;   // total seq_len for all queries in the batch
+    int N;   // total seq_len for all keys/values in the batch
+    int Hq;  // number of heads for Query
+    int Hkv; // number of heads for Key and Value
+    int K;   // embed_dim for Query and Key
+    int Kv;  // embed_dim for Value
 
-  int max_seqlen_q;
+    int max_seqlen_q;
 
-  void* seqstart_q_dev_ptr;
-  void* seqstart_k_dev_ptr;
-  void* seqlen_k_dev_ptr;
+    void* seqstart_q_dev_ptr;
+    void* seqstart_k_dev_ptr;
+    void* seqlen_k_dev_ptr;
 
-  float scale;
-  bool has_attn_bias;
+    float scale;
+    bool has_attn_bias;
 
-  // MHK mode strides, last-dim contiguous
-  std::array<int, 3> q_strides;
-  std::array<int, 3> k_strides;
-  std::array<int, 3> v_strides;
-  std::array<int, 3> out_strides;
+    // MHK mode strides, last-dim contiguous
+    std::array<int, 3> q_strides;
+    std::array<int, 3> k_strides;
+    std::array<int, 3> v_strides;
+    std::array<int, 3> out_strides;
 
-  // 4d tensor view [B, H, M, N]
-  std::array<int, 4> attn_bias_strides;
+    // 4d tensor view [B, H, M, N]
+    std::array<int, 4> attn_bias_strides;
 
-  const void* q_ptr;
-  const void* k_ptr;
-  const void* v_ptr;
-  const void* attn_bias_ptr;
+    const void* q_ptr;
+    const void* k_ptr;
+    const void* v_ptr;
+    const void* attn_bias_ptr;
 
-  uint8_t custom_mask_type;
+    uint8_t custom_mask_type;
 
-  void* out_ptr;
+    void* out_ptr;
 };
 
-struct GroupedForwardParams : public GroupedInferParams {
-  bool use_dropout;
-  bool compute_logsumexp;
+struct GroupedForwardParams : public GroupedInferParams
+{
+    bool use_dropout;
+    bool compute_logsumexp;
 
-  float dropout_prob;
-  int64_t philox_seed;
-  int64_t philox_offset;
+    float dropout_prob;
+    int64_t philox_seed;
+    int64_t philox_offset;
 
-  // completely contiguous
-  std::vector<void*> logsumexp_ptrs;
+    // completely contiguous
+    std::vector<void*> logsumexp_ptrs;
 
-  // TODO: need remove this after dev-op fix
-  std::vector<void*> randvals_ptrs;
+    // TODO: need remove this after dev-op fix
+    std::vector<void*> randvals_ptrs;
 };
 
-struct BatchedBackwardParams {
-  int B; // batch size
-  int M; // seq_len for Query
-  int N; // seq_len for Key and Value
-  int Hq; // number of heads for Query
-  int Hkv; // number of heads for Key and Value
-  int K; // embed_dim for Query and Key
-  int Kv; // embed_dim for Value
-
-  float scale;
-  bool has_attn_bias;
-  bool bias_has_grad;
-
-  bool use_fp32_qkv_grad;
-  bool is_mqa_gqa;
-
-  // BMHK mode strides, last-dim contiguous
-  std::array<int, 4> q_strides;
-  std::array<int, 4> k_strides;
-  std::array<int, 4> v_strides;
-  std::array<int, 4> attn_bias_strides; // 4d tensor_view [B, H, M, N]
-  std::array<int, 4> out_strides;
-
-  std::array<int, 4> tmp_grad_k_strides;
-  std::array<int, 4> tmp_grad_v_strides;
-
-  const void* q_ptr;
-  const void* k_ptr;
-  const void* v_ptr;
-  const void* attn_bias_ptr;
-  const void* grad_out_ptr;
-  const void* out_ptr;
-
-  uint8_t custom_mask_type;
-
-  void* grad_q_ptr;
-  void* grad_k_ptr;
-  void* grad_v_ptr;
-  void* grad_bias_ptr;
-
-  float dropout_prob;
-  int64_t philox_seed;
-  int64_t philox_offset;
-
-  // BHM mode lengths, completely contiguous
-  const void* logsumexp_ptr;
+struct BatchedBackwardParams
+{
+    int B;   // batch size
+    int M;   // seq_len for Query
+    int N;   // seq_len for Key and Value
+    int Hq;  // number of heads for Query
+    int Hkv; // number of heads for Key and Value
+    int K;   // embed_dim for Query and Key
+    int Kv;  // embed_dim for Value
+
+    float scale;
+    bool has_attn_bias;
+    bool bias_has_grad;
+
+    bool use_fp32_qkv_grad;
+    bool is_mqa_gqa;
+
+    // BMHK mode strides, last-dim contiguous
+    std::array<int, 4> q_strides;
+    std::array<int, 4> k_strides;
+    std::array<int, 4> v_strides;
+    std::array<int, 4> attn_bias_strides; // 4d tensor_view [B, H, M, N]
+    std::array<int, 4> out_strides;
+
+    std::array<int, 4> tmp_grad_k_strides;
+    std::array<int, 4> tmp_grad_v_strides;
+
+    const void* q_ptr;
+    const void* k_ptr;
+    const void* v_ptr;
+    const void* attn_bias_ptr;
+    const void* grad_out_ptr;
+    const void* out_ptr;
+
+    uint8_t custom_mask_type;
+
+    void* grad_q_ptr;
+    void* grad_k_ptr;
+    void* grad_v_ptr;
+    void* grad_bias_ptr;
+
+    float dropout_prob;
+    int64_t philox_seed;
+    int64_t philox_offset;
+
+    // BHM mode lengths, completely contiguous
+    const void* logsumexp_ptr;
 };
 
-struct GroupedBackwardParams {
-  int num_batches;
-  int M; // total seq_len for all queries in the batch
-  int N; // total seq_len for all keys/values in the batch
-  int Hq; // number of heads for Query
-  int Hkv; // number of heads for Key and Value
-  int K; // embed_dim for Query and Key
-  int Kv; // embed_dim for Value
-
-  int max_seqlen_q;
-
-  std::vector<int> host_seqstart_q;
-  std::vector<int> host_seqstart_k;
-  std::vector<int> host_seqlen_k;
-
-  float scale;
-  bool has_attn_bias;
-  bool bias_has_grad;
-
-  bool use_fp32_qkv_grad;
-  bool is_mqa_gqa;
-
-  // MHK mode strides, last-dim contiguous
-  std::array<int, 3> q_strides;
-  std::array<int, 3> k_strides;
-  std::array<int, 3> v_strides;
-  std::array<int, 3> out_strides;
-  // 4d tensor view [B, H, M, N]
-  std::array<int, 4> attn_bias_strides;
-
-  std::array<int, 3> tmp_grad_k_strides;
-  std::array<int, 3> tmp_grad_v_strides;
-
-  std::vector<const void*> q_ptrs;
-  std::vector<const void*> k_ptrs;
-  std::vector<const void*> v_ptrs;
-  std::vector<const void*> attn_bias_ptrs;
-  std::vector<const void*> grad_out_ptrs;
-  std::vector<const void*> out_ptrs;
-
-  // used by the light_v2 kernel
-  // TODO use these as workspace
-  std::vector<void*> ydotdy_ptrs;
-
-  uint8_t custom_mask_type;
-
-  std::vector<void*> grad_q_ptrs;
-  std::vector<void*> grad_k_ptrs;
-  std::vector<void*> grad_v_ptrs;
-  std::vector<void*> grad_bias_ptrs;
-
-  float dropout_prob;
-  int64_t philox_seed;
-  int64_t philox_offset;
-
-  // BHM mode lengths, completely contiguous
-  std::vector<const void*> logsumexp_ptrs;
-
-  // TODO: need remove this after dev-op fix
-  std::vector<void*> randvals_ptrs;
+struct GroupedBackwardParams
+{
+    int num_batches;
+    int M;   // total seq_len for all queries in the batch
+    int N;   // total seq_len for all keys/values in the batch
+    int Hq;  // number of heads for Query
+    int Hkv; // number of heads for Key and Value
+    int K;   // embed_dim for Query and Key
+    int Kv;  // embed_dim for Value
+
+    int max_seqlen_q;
+
+    std::vector<int> host_seqstart_q;
+    std::vector<int> host_seqstart_k;
+    std::vector<int> host_seqlen_k;
+
+    float scale;
+    bool has_attn_bias;
+    bool bias_has_grad;
+
+    bool use_fp32_qkv_grad;
+    bool is_mqa_gqa;
+
+    // MHK mode strides, last-dim contiguous
+    std::array<int, 3> q_strides;
+    std::array<int, 3> k_strides;
+    std::array<int, 3> v_strides;
+    std::array<int, 3> out_strides;
+    // 4d tensor view [B, H, M, N]
+    std::array<int, 4> attn_bias_strides;
+
+    std::array<int, 3> tmp_grad_k_strides;
+    std::array<int, 3> tmp_grad_v_strides;
+
+    std::vector<const void*> q_ptrs;
+    std::vector<const void*> k_ptrs;
+    std::vector<const void*> v_ptrs;
+    std::vector<const void*> attn_bias_ptrs;
+    std::vector<const void*> grad_out_ptrs;
+    std::vector<const void*> out_ptrs;
+
+    // used by the light_v2 kernel
+    // TODO use these as workspace
+    std::vector<void*> ydotdy_ptrs;
+
+    uint8_t custom_mask_type;
+
+    std::vector<void*> grad_q_ptrs;
+    std::vector<void*> grad_k_ptrs;
+    std::vector<void*> grad_v_ptrs;
+    std::vector<void*> grad_bias_ptrs;
+
+    float dropout_prob;
+    int64_t philox_seed;
+    int64_t philox_offset;
+
+    // BHM mode lengths, completely contiguous
+    std::vector<const void*> logsumexp_ptrs;
+
+    // TODO: need remove this after dev-op fix
+    std::vector<void*> randvals_ptrs;
 };
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_bp16_masktype_0_no_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_bp16_masktype_0_no_attnbias.cpp
index 8eb17a9f9..36e9cf24d 100644
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_bp16_masktype_0_no_attnbias.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_bp16_masktype_0_no_attnbias.cpp
@@ -1,8 +1,11 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
 #include <ck/ck.hpp>
 #include "ck_fmha_batched_backward.h"
 
-template void run_batched_backward_masktype_attnbias_dispatched<
-    ck::bhalf_t,
-    0,
-    false,
-    false>(BatchedBackwardParams& param, hipStream_t stream);
+template void run_batched_backward_masktype_attnbias_dispatched<ck::bhalf_t, 0, false, false>(
+    BatchedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_bp16_masktype_0_no_attnbias_fp32_grad.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_bp16_masktype_0_no_attnbias_fp32_grad.cpp
index 670398c1e..a44c7f83a 100644
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_bp16_masktype_0_no_attnbias_fp32_grad.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_bp16_masktype_0_no_attnbias_fp32_grad.cpp
@@ -1,8 +1,11 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
 #include <ck/ck.hpp>
 #include "ck_fmha_batched_backward.h"
 
-template void run_batched_backward_masktype_attnbias_dispatched<
-    ck::bhalf_t,
-    0,
-    false,
-    true>(BatchedBackwardParams& param, hipStream_t stream);
+template void run_batched_backward_masktype_attnbias_dispatched<ck::bhalf_t, 0, false, true>(
+    BatchedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_bp16_masktype_0_with_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_bp16_masktype_0_with_attnbias.cpp
index 1dbab2746..2c6fa3f58 100644
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_bp16_masktype_0_with_attnbias.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_bp16_masktype_0_with_attnbias.cpp
@@ -1,8 +1,11 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
 #include <ck/ck.hpp>
 #include "ck_fmha_batched_backward.h"
 
-template void run_batched_backward_masktype_attnbias_dispatched<
-    ck::bhalf_t,
-    0,
-    true,
-    false>(BatchedBackwardParams& param, hipStream_t stream);
+template void run_batched_backward_masktype_attnbias_dispatched<ck::bhalf_t, 0, true, false>(
+    BatchedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_bp16_masktype_0_with_attnbias_fp32_grad.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_bp16_masktype_0_with_attnbias_fp32_grad.cpp
index ba06daf03..8ea38c8b6 100644
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_bp16_masktype_0_with_attnbias_fp32_grad.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_bp16_masktype_0_with_attnbias_fp32_grad.cpp
@@ -1,8 +1,11 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
 #include <ck/ck.hpp>
 #include "ck_fmha_batched_backward.h"
 
-template void run_batched_backward_masktype_attnbias_dispatched<
-    ck::bhalf_t,
-    0,
-    true,
-    true>(BatchedBackwardParams& param, hipStream_t stream);
+template void run_batched_backward_masktype_attnbias_dispatched<ck::bhalf_t, 0, true, true>(
+    BatchedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_bp16_masktype_1_no_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_bp16_masktype_1_no_attnbias.cpp
index 97b4eb36a..8dfa5aaae 100644
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_bp16_masktype_1_no_attnbias.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_bp16_masktype_1_no_attnbias.cpp
@@ -1,8 +1,11 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
 #include <ck/ck.hpp>
 #include "ck_fmha_batched_backward.h"
 
-template void run_batched_backward_masktype_attnbias_dispatched<
-    ck::bhalf_t,
-    1,
-    false,
-    false>(BatchedBackwardParams& param, hipStream_t stream);
+template void run_batched_backward_masktype_attnbias_dispatched<ck::bhalf_t, 1, false, false>(
+    BatchedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_bp16_masktype_1_no_attnbias_fp32_grad.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_bp16_masktype_1_no_attnbias_fp32_grad.cpp
index 8458f70ae..fbbbc2d61 100644
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_bp16_masktype_1_no_attnbias_fp32_grad.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_bp16_masktype_1_no_attnbias_fp32_grad.cpp
@@ -1,8 +1,11 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
 #include <ck/ck.hpp>
 #include "ck_fmha_batched_backward.h"
 
-template void run_batched_backward_masktype_attnbias_dispatched<
-    ck::bhalf_t,
-    1,
-    false,
-    true>(BatchedBackwardParams& param, hipStream_t stream);
+template void run_batched_backward_masktype_attnbias_dispatched<ck::bhalf_t, 1, false, true>(
+    BatchedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_bp16_masktype_1_with_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_bp16_masktype_1_with_attnbias.cpp
index d7b92c451..66a2acb12 100644
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_bp16_masktype_1_with_attnbias.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_bp16_masktype_1_with_attnbias.cpp
@@ -1,8 +1,11 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
 #include <ck/ck.hpp>
 #include "ck_fmha_batched_backward.h"
 
-template void run_batched_backward_masktype_attnbias_dispatched<
-    ck::bhalf_t,
-    1,
-    true,
-    false>(BatchedBackwardParams& param, hipStream_t stream);
+template void run_batched_backward_masktype_attnbias_dispatched<ck::bhalf_t, 1, true, false>(
+    BatchedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_bp16_masktype_1_with_attnbias_fp32_grad.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_bp16_masktype_1_with_attnbias_fp32_grad.cpp
index 1c1167c58..59dcd373b 100644
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_bp16_masktype_1_with_attnbias_fp32_grad.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_bp16_masktype_1_with_attnbias_fp32_grad.cpp
@@ -1,8 +1,11 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
 #include <ck/ck.hpp>
 #include "ck_fmha_batched_backward.h"
 
-template void run_batched_backward_masktype_attnbias_dispatched<
-    ck::bhalf_t,
-    1,
-    true,
-    true>(BatchedBackwardParams& param, hipStream_t stream);
+template void run_batched_backward_masktype_attnbias_dispatched<ck::bhalf_t, 1, true, true>(
+    BatchedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_bp16_masktype_2_no_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_bp16_masktype_2_no_attnbias.cpp
index 9dbae4cac..29f9ea02d 100644
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_bp16_masktype_2_no_attnbias.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_bp16_masktype_2_no_attnbias.cpp
@@ -1,8 +1,11 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
 #include <ck/ck.hpp>
 #include "ck_fmha_batched_backward.h"
 
-template void run_batched_backward_masktype_attnbias_dispatched<
-    ck::bhalf_t,
-    2,
-    false,
-    false>(BatchedBackwardParams& param, hipStream_t stream);
+template void run_batched_backward_masktype_attnbias_dispatched<ck::bhalf_t, 2, false, false>(
+    BatchedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_bp16_masktype_2_no_attnbias_fp32_grad.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_bp16_masktype_2_no_attnbias_fp32_grad.cpp
index f38a2c7b8..4bf813296 100644
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_bp16_masktype_2_no_attnbias_fp32_grad.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_bp16_masktype_2_no_attnbias_fp32_grad.cpp
@@ -1,8 +1,11 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
 #include <ck/ck.hpp>
 #include "ck_fmha_batched_backward.h"
 
-template void run_batched_backward_masktype_attnbias_dispatched<
-    ck::bhalf_t,
-    2,
-    false,
-    true>(BatchedBackwardParams& param, hipStream_t stream);
+template void run_batched_backward_masktype_attnbias_dispatched<ck::bhalf_t, 2, false, true>(
+    BatchedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_bp16_masktype_2_with_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_bp16_masktype_2_with_attnbias.cpp
index 522e2951a..ec12b66c7 100644
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_bp16_masktype_2_with_attnbias.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_bp16_masktype_2_with_attnbias.cpp
@@ -1,8 +1,11 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
 #include <ck/ck.hpp>
 #include "ck_fmha_batched_backward.h"
 
-template void run_batched_backward_masktype_attnbias_dispatched<
-    ck::bhalf_t,
-    2,
-    true,
-    false>(BatchedBackwardParams& param, hipStream_t stream);
+template void run_batched_backward_masktype_attnbias_dispatched<ck::bhalf_t, 2, true, false>(
+    BatchedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_bp16_masktype_2_with_attnbias_fp32_grad.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_bp16_masktype_2_with_attnbias_fp32_grad.cpp
index 041e4d4df..947faaa83 100644
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_bp16_masktype_2_with_attnbias_fp32_grad.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_bp16_masktype_2_with_attnbias_fp32_grad.cpp
@@ -1,8 +1,11 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
 #include <ck/ck.hpp>
 #include "ck_fmha_batched_backward.h"
 
-template void run_batched_backward_masktype_attnbias_dispatched<
-    ck::bhalf_t,
-    2,
-    true,
-    true>(BatchedBackwardParams& param, hipStream_t stream);
+template void run_batched_backward_masktype_attnbias_dispatched<ck::bhalf_t, 2, true, true>(
+    BatchedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_fp16_masktype_0_no_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_fp16_masktype_0_no_attnbias.cpp
index bc9a2948d..a1e22812a 100644
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_fp16_masktype_0_no_attnbias.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_fp16_masktype_0_no_attnbias.cpp
@@ -1,8 +1,11 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
 #include <ck/ck.hpp>
 #include "ck_fmha_batched_backward.h"
 
-template void run_batched_backward_masktype_attnbias_dispatched<
-    ck::half_t,
-    0,
-    false,
-    false>(BatchedBackwardParams& param, hipStream_t stream);
+template void run_batched_backward_masktype_attnbias_dispatched<ck::half_t, 0, false, false>(
+    BatchedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_fp16_masktype_0_no_attnbias_fp32_grad.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_fp16_masktype_0_no_attnbias_fp32_grad.cpp
index e654ca13a..de7ee388b 100644
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_fp16_masktype_0_no_attnbias_fp32_grad.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_fp16_masktype_0_no_attnbias_fp32_grad.cpp
@@ -1,8 +1,11 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
 #include <ck/ck.hpp>
 #include "ck_fmha_batched_backward.h"
 
-template void run_batched_backward_masktype_attnbias_dispatched<
-    ck::half_t,
-    0,
-    false,
-    true>(BatchedBackwardParams& param, hipStream_t stream);
+template void run_batched_backward_masktype_attnbias_dispatched<ck::half_t, 0, false, true>(
+    BatchedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_fp16_masktype_0_with_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_fp16_masktype_0_with_attnbias.cpp
index 4a2376a72..de45cee54 100644
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_fp16_masktype_0_with_attnbias.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_fp16_masktype_0_with_attnbias.cpp
@@ -1,8 +1,11 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
 #include <ck/ck.hpp>
 #include "ck_fmha_batched_backward.h"
 
-template void run_batched_backward_masktype_attnbias_dispatched<
-    ck::half_t,
-    0,
-    true,
-    false>(BatchedBackwardParams& param, hipStream_t stream);
+template void run_batched_backward_masktype_attnbias_dispatched<ck::half_t, 0, true, false>(
+    BatchedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_fp16_masktype_0_with_attnbias_fp32_grad.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_fp16_masktype_0_with_attnbias_fp32_grad.cpp
index 66765de59..d0e3c83c8 100644
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_fp16_masktype_0_with_attnbias_fp32_grad.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_fp16_masktype_0_with_attnbias_fp32_grad.cpp
@@ -1,8 +1,11 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
 #include <ck/ck.hpp>
 #include "ck_fmha_batched_backward.h"
 
-template void run_batched_backward_masktype_attnbias_dispatched<
-    ck::half_t,
-    0,
-    true,
-    true>(BatchedBackwardParams& param, hipStream_t stream);
+template void run_batched_backward_masktype_attnbias_dispatched<ck::half_t, 0, true, true>(
+    BatchedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_fp16_masktype_1_no_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_fp16_masktype_1_no_attnbias.cpp
index 9609900d2..0a125b480 100644
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_fp16_masktype_1_no_attnbias.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_fp16_masktype_1_no_attnbias.cpp
@@ -1,8 +1,11 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
 #include <ck/ck.hpp>
 #include "ck_fmha_batched_backward.h"
 
-template void run_batched_backward_masktype_attnbias_dispatched<
-    ck::half_t,
-    1,
-    false,
-    false>(BatchedBackwardParams& param, hipStream_t stream);
+template void run_batched_backward_masktype_attnbias_dispatched<ck::half_t, 1, false, false>(
+    BatchedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_fp16_masktype_1_no_attnbias_fp32_grad.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_fp16_masktype_1_no_attnbias_fp32_grad.cpp
index aa4d7ff70..511598a23 100644
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_fp16_masktype_1_no_attnbias_fp32_grad.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_fp16_masktype_1_no_attnbias_fp32_grad.cpp
@@ -1,8 +1,11 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
 #include <ck/ck.hpp>
 #include "ck_fmha_batched_backward.h"
 
-template void run_batched_backward_masktype_attnbias_dispatched<
-    ck::half_t,
-    1,
-    false,
-    true>(BatchedBackwardParams& param, hipStream_t stream);
+template void run_batched_backward_masktype_attnbias_dispatched<ck::half_t, 1, false, true>(
+    BatchedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_fp16_masktype_1_with_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_fp16_masktype_1_with_attnbias.cpp
index 72715c6dc..bb6ba7b58 100644
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_fp16_masktype_1_with_attnbias.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_fp16_masktype_1_with_attnbias.cpp
@@ -1,8 +1,11 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
 #include <ck/ck.hpp>
 #include "ck_fmha_batched_backward.h"
 
-template void run_batched_backward_masktype_attnbias_dispatched<
-    ck::half_t,
-    1,
-    true,
-    false>(BatchedBackwardParams& param, hipStream_t stream);
+template void run_batched_backward_masktype_attnbias_dispatched<ck::half_t, 1, true, false>(
+    BatchedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_fp16_masktype_1_with_attnbias_fp32_grad.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_fp16_masktype_1_with_attnbias_fp32_grad.cpp
index 7e6245db4..e260e288c 100644
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_fp16_masktype_1_with_attnbias_fp32_grad.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_fp16_masktype_1_with_attnbias_fp32_grad.cpp
@@ -1,10 +1,13 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
 #include <ck/ck.hpp>
 #include <stdexcept>
 
 #include "ck_fmha_batched_backward.h"
 
-template void run_batched_backward_masktype_attnbias_dispatched<
-    ck::half_t,
-    1,
-    true,
-    true>(BatchedBackwardParams& param, hipStream_t stream);
+template void run_batched_backward_masktype_attnbias_dispatched<ck::half_t, 1, true, true>(
+    BatchedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_fp16_masktype_2_no_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_fp16_masktype_2_no_attnbias.cpp
index d2707dde7..8f7501252 100644
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_fp16_masktype_2_no_attnbias.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_fp16_masktype_2_no_attnbias.cpp
@@ -1,8 +1,11 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
 #include <ck/ck.hpp>
 #include "ck_fmha_batched_backward.h"
 
-template void run_batched_backward_masktype_attnbias_dispatched<
-    ck::half_t,
-    2,
-    false,
-    false>(BatchedBackwardParams& param, hipStream_t stream);
+template void run_batched_backward_masktype_attnbias_dispatched<ck::half_t, 2, false, false>(
+    BatchedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_fp16_masktype_2_no_attnbias_fp32_grad.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_fp16_masktype_2_no_attnbias_fp32_grad.cpp
index 598db5503..47cb68b98 100644
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_fp16_masktype_2_no_attnbias_fp32_grad.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_fp16_masktype_2_no_attnbias_fp32_grad.cpp
@@ -1,8 +1,11 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
 #include <ck/ck.hpp>
 #include "ck_fmha_batched_backward.h"
 
-template void run_batched_backward_masktype_attnbias_dispatched<
-    ck::half_t,
-    2,
-    false,
-    true>(BatchedBackwardParams& param, hipStream_t stream);
+template void run_batched_backward_masktype_attnbias_dispatched<ck::half_t, 2, false, true>(
+    BatchedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_fp16_masktype_2_with_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_fp16_masktype_2_with_attnbias.cpp
index 28640755d..34b331814 100644
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_fp16_masktype_2_with_attnbias.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_fp16_masktype_2_with_attnbias.cpp
@@ -1,8 +1,11 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
 #include <ck/ck.hpp>
 #include "ck_fmha_batched_backward.h"
 
-template void run_batched_backward_masktype_attnbias_dispatched<
-    ck::half_t,
-    2,
-    true,
-    false>(BatchedBackwardParams& param, hipStream_t stream);
+template void run_batched_backward_masktype_attnbias_dispatched<ck::half_t, 2, true, false>(
+    BatchedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_fp16_masktype_2_with_attnbias_fp32_grad.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_fp16_masktype_2_with_attnbias_fp32_grad.cpp
index d3922d621..9a46d6678 100644
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_fp16_masktype_2_with_attnbias_fp32_grad.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_fp16_masktype_2_with_attnbias_fp32_grad.cpp
@@ -1,8 +1,11 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
 #include <ck/ck.hpp>
 #include "ck_fmha_batched_backward.h"
 
-template void run_batched_backward_masktype_attnbias_dispatched<
-    ck::half_t,
-    2,
-    true,
-    true>(BatchedBackwardParams& param, hipStream_t stream);
+template void run_batched_backward_masktype_attnbias_dispatched<ck::half_t, 2, true, true>(
+    BatchedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_forward_bp16_masktype_0_no_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_forward_bp16_masktype_0_no_attnbias.cpp
index 140cffce0..0027e6fa6 100644
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_forward_bp16_masktype_0_no_attnbias.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_forward_bp16_masktype_0_no_attnbias.cpp
@@ -1,7 +1,12 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
 #include <ck/ck.hpp>
 #include "ck_fmha_batched_forward.h"
 
-template void run_batched_forward_masktype_attnbias_dispatched<
-    ck::bhalf_t,
-    0,
-    false>(BatchedForwardParams& param, hipStream_t stream);
+template void
+run_batched_forward_masktype_attnbias_dispatched<ck::bhalf_t, 0, false>(BatchedForwardParams& param,
+                                                                        hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_forward_bp16_masktype_0_with_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_forward_bp16_masktype_0_with_attnbias.cpp
index bb32b63ef..01b4ab6a1 100644
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_forward_bp16_masktype_0_with_attnbias.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_forward_bp16_masktype_0_with_attnbias.cpp
@@ -1,7 +1,12 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
 #include <ck/ck.hpp>
 #include "ck_fmha_batched_forward.h"
 
-template void run_batched_forward_masktype_attnbias_dispatched<
-    ck::bhalf_t,
-    0,
-    true>(BatchedForwardParams& param, hipStream_t stream);
+template void
+run_batched_forward_masktype_attnbias_dispatched<ck::bhalf_t, 0, true>(BatchedForwardParams& param,
+                                                                       hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_forward_bp16_masktype_1_no_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_forward_bp16_masktype_1_no_attnbias.cpp
index 6ba23b3a2..fee6af685 100644
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_forward_bp16_masktype_1_no_attnbias.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_forward_bp16_masktype_1_no_attnbias.cpp
@@ -1,7 +1,12 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
 #include <ck/ck.hpp>
 #include "ck_fmha_batched_forward.h"
 
-template void run_batched_forward_masktype_attnbias_dispatched<
-    ck::bhalf_t,
-    1,
-    false>(BatchedForwardParams& param, hipStream_t stream);
+template void
+run_batched_forward_masktype_attnbias_dispatched<ck::bhalf_t, 1, false>(BatchedForwardParams& param,
+                                                                        hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_forward_bp16_masktype_1_with_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_forward_bp16_masktype_1_with_attnbias.cpp
index 400df0b3d..3b22467b8 100644
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_forward_bp16_masktype_1_with_attnbias.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_forward_bp16_masktype_1_with_attnbias.cpp
@@ -1,7 +1,12 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
 #include <ck/ck.hpp>
 #include "ck_fmha_batched_forward.h"
 
-template void run_batched_forward_masktype_attnbias_dispatched<
-    ck::bhalf_t,
-    1,
-    true>(BatchedForwardParams& param, hipStream_t stream);
+template void
+run_batched_forward_masktype_attnbias_dispatched<ck::bhalf_t, 1, true>(BatchedForwardParams& param,
+                                                                       hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_forward_bp16_masktype_2_no_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_forward_bp16_masktype_2_no_attnbias.cpp
index a99486148..0964fea9a 100644
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_forward_bp16_masktype_2_no_attnbias.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_forward_bp16_masktype_2_no_attnbias.cpp
@@ -1,7 +1,12 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
 #include <ck/ck.hpp>
 #include "ck_fmha_batched_forward.h"
 
-template void run_batched_forward_masktype_attnbias_dispatched<
-    ck::bhalf_t,
-    2,
-    false>(BatchedForwardParams& param, hipStream_t stream);
+template void
+run_batched_forward_masktype_attnbias_dispatched<ck::bhalf_t, 2, false>(BatchedForwardParams& param,
+                                                                        hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_forward_bp16_masktype_2_with_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_forward_bp16_masktype_2_with_attnbias.cpp
index 23305b07a..9ddde1484 100644
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_forward_bp16_masktype_2_with_attnbias.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_forward_bp16_masktype_2_with_attnbias.cpp
@@ -1,7 +1,12 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
 #include <ck/ck.hpp>
 #include "ck_fmha_batched_forward.h"
 
-template void run_batched_forward_masktype_attnbias_dispatched<
-    ck::bhalf_t,
-    2,
-    true>(BatchedForwardParams& param, hipStream_t stream);
+template void
+run_batched_forward_masktype_attnbias_dispatched<ck::bhalf_t, 2, true>(BatchedForwardParams& param,
+                                                                       hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_forward_fp16_masktype_0_no_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_forward_fp16_masktype_0_no_attnbias.cpp
index a9dd771de..4e47a02b8 100644
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_forward_fp16_masktype_0_no_attnbias.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_forward_fp16_masktype_0_no_attnbias.cpp
@@ -1,7 +1,12 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
 #include <ck/ck.hpp>
 #include "ck_fmha_batched_forward.h"
 
-template void run_batched_forward_masktype_attnbias_dispatched<
-    ck::half_t,
-    0,
-    false>(BatchedForwardParams& param, hipStream_t stream);
+template void
+run_batched_forward_masktype_attnbias_dispatched<ck::half_t, 0, false>(BatchedForwardParams& param,
+                                                                       hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_forward_fp16_masktype_0_with_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_forward_fp16_masktype_0_with_attnbias.cpp
index f653451ab..a99e2cf17 100644
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_forward_fp16_masktype_0_with_attnbias.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_forward_fp16_masktype_0_with_attnbias.cpp
@@ -1,7 +1,12 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
 #include <ck/ck.hpp>
 #include "ck_fmha_batched_forward.h"
 
-template void run_batched_forward_masktype_attnbias_dispatched<
-    ck::half_t,
-    0,
-    true>(BatchedForwardParams& param, hipStream_t stream);
+template void
+run_batched_forward_masktype_attnbias_dispatched<ck::half_t, 0, true>(BatchedForwardParams& param,
+                                                                      hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_forward_fp16_masktype_1_no_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_forward_fp16_masktype_1_no_attnbias.cpp
index 5ca4b7dda..b0617fe73 100644
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_forward_fp16_masktype_1_no_attnbias.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_forward_fp16_masktype_1_no_attnbias.cpp
@@ -1,7 +1,12 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
 #include <ck/ck.hpp>
 #include "ck_fmha_batched_forward.h"
 
-template void run_batched_forward_masktype_attnbias_dispatched<
-    ck::half_t,
-    1,
-    false>(BatchedForwardParams& param, hipStream_t stream);
+template void
+run_batched_forward_masktype_attnbias_dispatched<ck::half_t, 1, false>(BatchedForwardParams& param,
+                                                                       hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_forward_fp16_masktype_1_with_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_forward_fp16_masktype_1_with_attnbias.cpp
index f9af4528d..d00e4e2ac 100644
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_forward_fp16_masktype_1_with_attnbias.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_forward_fp16_masktype_1_with_attnbias.cpp
@@ -1,7 +1,12 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
 #include <ck/ck.hpp>
 #include "ck_fmha_batched_forward.h"
 
-template void run_batched_forward_masktype_attnbias_dispatched<
-    ck::half_t,
-    1,
-    true>(BatchedForwardParams& param, hipStream_t stream);
+template void
+run_batched_forward_masktype_attnbias_dispatched<ck::half_t, 1, true>(BatchedForwardParams& param,
+                                                                      hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_forward_fp16_masktype_2_no_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_forward_fp16_masktype_2_no_attnbias.cpp
index 44e98d9a3..6a2215ae0 100644
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_forward_fp16_masktype_2_no_attnbias.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_forward_fp16_masktype_2_no_attnbias.cpp
@@ -1,7 +1,12 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
 #include <ck/ck.hpp>
 #include "ck_fmha_batched_forward.h"
 
-template void run_batched_forward_masktype_attnbias_dispatched<
-    ck::half_t,
-    2,
-    false>(BatchedForwardParams& param, hipStream_t stream);
+template void
+run_batched_forward_masktype_attnbias_dispatched<ck::half_t, 2, false>(BatchedForwardParams& param,
+                                                                       hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_forward_fp16_masktype_2_with_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_forward_fp16_masktype_2_with_attnbias.cpp
index 8dfc288f8..43dc7c78f 100644
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_forward_fp16_masktype_2_with_attnbias.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_forward_fp16_masktype_2_with_attnbias.cpp
@@ -1,7 +1,12 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
 #include <ck/ck.hpp>
 #include "ck_fmha_batched_forward.h"
 
-template void run_batched_forward_masktype_attnbias_dispatched<
-    ck::half_t,
-    2,
-    true>(BatchedForwardParams& param, hipStream_t stream);
+template void
+run_batched_forward_masktype_attnbias_dispatched<ck::half_t, 2, true>(BatchedForwardParams& param,
+                                                                      hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_infer_bp16_masktype_0_no_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_infer_bp16_masktype_0_no_attnbias.cpp
index 9748955e1..11c575371 100644
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_infer_bp16_masktype_0_no_attnbias.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_infer_bp16_masktype_0_no_attnbias.cpp
@@ -1,8 +1,13 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
 #include <ck/ck.hpp>
 
 #include "ck_fmha_batched_infer.h"
 
-template void run_batched_infer_masktype_attnbias_dispatched<
-    ck::bhalf_t,
-    0,
-    false>(BatchedForwardParams& param, hipStream_t stream);
+template void
+run_batched_infer_masktype_attnbias_dispatched<ck::bhalf_t, 0, false>(BatchedForwardParams& param,
+                                                                      hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_infer_bp16_masktype_0_with_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_infer_bp16_masktype_0_with_attnbias.cpp
index 418f925c2..6ed03ba3b 100644
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_infer_bp16_masktype_0_with_attnbias.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_infer_bp16_masktype_0_with_attnbias.cpp
@@ -1,8 +1,13 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
 #include <ck/ck.hpp>
 
 #include "ck_fmha_batched_infer.h"
 
-template void run_batched_infer_masktype_attnbias_dispatched<
-    ck::bhalf_t,
-    0,
-    true>(BatchedForwardParams& param, hipStream_t stream);
+template void
+run_batched_infer_masktype_attnbias_dispatched<ck::bhalf_t, 0, true>(BatchedForwardParams& param,
+                                                                     hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_infer_bp16_masktype_1_no_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_infer_bp16_masktype_1_no_attnbias.cpp
index a7cdb48b8..cbb2f1e37 100644
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_infer_bp16_masktype_1_no_attnbias.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_infer_bp16_masktype_1_no_attnbias.cpp
@@ -1,8 +1,13 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
 #include <ck/ck.hpp>
 
 #include "ck_fmha_batched_infer.h"
 
-template void run_batched_infer_masktype_attnbias_dispatched<
-    ck::bhalf_t,
-    1,
-    false>(BatchedForwardParams& param, hipStream_t stream);
+template void
+run_batched_infer_masktype_attnbias_dispatched<ck::bhalf_t, 1, false>(BatchedForwardParams& param,
+                                                                      hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_infer_bp16_masktype_1_with_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_infer_bp16_masktype_1_with_attnbias.cpp
index 578855b9b..e53d44ff4 100644
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_infer_bp16_masktype_1_with_attnbias.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_infer_bp16_masktype_1_with_attnbias.cpp
@@ -1,8 +1,13 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
 #include <ck/ck.hpp>
 
 #include "ck_fmha_batched_infer.h"
 
-template void run_batched_infer_masktype_attnbias_dispatched<
-    ck::bhalf_t,
-    1,
-    true>(BatchedForwardParams& param, hipStream_t stream);
+template void
+run_batched_infer_masktype_attnbias_dispatched<ck::bhalf_t, 1, true>(BatchedForwardParams& param,
+                                                                     hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_infer_bp16_masktype_2_no_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_infer_bp16_masktype_2_no_attnbias.cpp
index 35e9bca9c..96454b7d8 100644
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_infer_bp16_masktype_2_no_attnbias.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_infer_bp16_masktype_2_no_attnbias.cpp
@@ -1,8 +1,13 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
 #include <ck/ck.hpp>
 
 #include "ck_fmha_batched_infer.h"
 
-template void run_batched_infer_masktype_attnbias_dispatched<
-    ck::bhalf_t,
-    2,
-    false>(BatchedForwardParams& param, hipStream_t stream);
+template void
+run_batched_infer_masktype_attnbias_dispatched<ck::bhalf_t, 2, false>(BatchedForwardParams& param,
+                                                                      hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_infer_bp16_masktype_2_with_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_infer_bp16_masktype_2_with_attnbias.cpp
index e27e3b5ff..ecfd4bd2e 100644
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_infer_bp16_masktype_2_with_attnbias.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_infer_bp16_masktype_2_with_attnbias.cpp
@@ -1,8 +1,13 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
 #include <ck/ck.hpp>
 
 #include "ck_fmha_batched_infer.h"
 
-template void run_batched_infer_masktype_attnbias_dispatched<
-    ck::bhalf_t,
-    2,
-    true>(BatchedForwardParams& param, hipStream_t stream);
+template void
+run_batched_infer_masktype_attnbias_dispatched<ck::bhalf_t, 2, true>(BatchedForwardParams& param,
+                                                                     hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_infer_fp16_masktype_0_no_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_infer_fp16_masktype_0_no_attnbias.cpp
index 5c83b0abd..b73d06a5c 100644
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_infer_fp16_masktype_0_no_attnbias.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_infer_fp16_masktype_0_no_attnbias.cpp
@@ -1,8 +1,13 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
 #include <ck/ck.hpp>
 
 #include "ck_fmha_batched_infer.h"
 
-template void run_batched_infer_masktype_attnbias_dispatched<
-    ck::half_t,
-    0,
-    false>(BatchedForwardParams& param, hipStream_t stream);
+template void
+run_batched_infer_masktype_attnbias_dispatched<ck::half_t, 0, false>(BatchedForwardParams& param,
+                                                                     hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_infer_fp16_masktype_0_with_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_infer_fp16_masktype_0_with_attnbias.cpp
index 11c76b35f..3ebf195d7 100644
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_infer_fp16_masktype_0_with_attnbias.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_infer_fp16_masktype_0_with_attnbias.cpp
@@ -1,8 +1,13 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
 #include <ck/ck.hpp>
 
 #include "ck_fmha_batched_infer.h"
 
-template void run_batched_infer_masktype_attnbias_dispatched<
-    ck::half_t,
-    0,
-    true>(BatchedForwardParams& param, hipStream_t stream);
+template void
+run_batched_infer_masktype_attnbias_dispatched<ck::half_t, 0, true>(BatchedForwardParams& param,
+                                                                    hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_infer_fp16_masktype_1_no_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_infer_fp16_masktype_1_no_attnbias.cpp
index b13f5a4c9..1f56500ce 100644
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_infer_fp16_masktype_1_no_attnbias.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_infer_fp16_masktype_1_no_attnbias.cpp
@@ -1,8 +1,13 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
 #include <ck/ck.hpp>
 
 #include "ck_fmha_batched_infer.h"
 
-template void run_batched_infer_masktype_attnbias_dispatched<
-    ck::half_t,
-    1,
-    false>(BatchedForwardParams& param, hipStream_t stream);
+template void
+run_batched_infer_masktype_attnbias_dispatched<ck::half_t, 1, false>(BatchedForwardParams& param,
+                                                                     hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_infer_fp16_masktype_1_with_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_infer_fp16_masktype_1_with_attnbias.cpp
index 12f5991c4..2cbb237cc 100644
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_infer_fp16_masktype_1_with_attnbias.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_infer_fp16_masktype_1_with_attnbias.cpp
@@ -1,8 +1,13 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
 #include <ck/ck.hpp>
 
 #include "ck_fmha_batched_infer.h"
 
-template void run_batched_infer_masktype_attnbias_dispatched<
-    ck::half_t,
-    1,
-    true>(BatchedForwardParams& param, hipStream_t stream);
+template void
+run_batched_infer_masktype_attnbias_dispatched<ck::half_t, 1, true>(BatchedForwardParams& param,
+                                                                    hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_infer_fp16_masktype_2_no_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_infer_fp16_masktype_2_no_attnbias.cpp
index 8d45859e5..441520157 100644
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_infer_fp16_masktype_2_no_attnbias.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_infer_fp16_masktype_2_no_attnbias.cpp
@@ -1,8 +1,13 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
 #include <ck/ck.hpp>
 
 #include "ck_fmha_batched_infer.h"
 
-template void run_batched_infer_masktype_attnbias_dispatched<
-    ck::half_t,
-    2,
-    false>(BatchedForwardParams& param, hipStream_t stream);
+template void
+run_batched_infer_masktype_attnbias_dispatched<ck::half_t, 2, false>(BatchedForwardParams& param,
+                                                                     hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_infer_fp16_masktype_2_with_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_infer_fp16_masktype_2_with_attnbias.cpp
index 9f03be2b5..5e9d21dac 100644
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_infer_fp16_masktype_2_with_attnbias.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_infer_fp16_masktype_2_with_attnbias.cpp
@@ -1,8 +1,13 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
 #include <ck/ck.hpp>
 
 #include "ck_fmha_batched_infer.h"
 
-template void run_batched_infer_masktype_attnbias_dispatched<
-    ck::half_t,
-    2,
-    true>(BatchedForwardParams& param, hipStream_t stream);
+template void
+run_batched_infer_masktype_attnbias_dispatched<ck::half_t, 2, true>(BatchedForwardParams& param,
+                                                                    hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_bp16_masktype_0_no_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_bp16_masktype_0_no_attnbias.cpp
index 82d7b1f00..517b6ab08 100644
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_bp16_masktype_0_no_attnbias.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_bp16_masktype_0_no_attnbias.cpp
@@ -1,8 +1,11 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
 #include <ck/ck.hpp>
 #include "ck_fmha_grouped_backward.h"
 
-template void run_grouped_backward_masktype_attnbias_dispatched<
-    ck::bhalf_t,
-    0,
-    false,
-    false>(GroupedBackwardParams& param, hipStream_t stream);
+template void run_grouped_backward_masktype_attnbias_dispatched<ck::bhalf_t, 0, false, false>(
+    GroupedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_bp16_masktype_0_no_attnbias_fp32_grad.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_bp16_masktype_0_no_attnbias_fp32_grad.cpp
index 2327c6c3c..eeb4ba125 100644
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_bp16_masktype_0_no_attnbias_fp32_grad.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_bp16_masktype_0_no_attnbias_fp32_grad.cpp
@@ -1,8 +1,11 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
 #include <ck/ck.hpp>
 #include "ck_fmha_grouped_backward.h"
 
-template void run_grouped_backward_masktype_attnbias_dispatched<
-    ck::bhalf_t,
-    0,
-    false,
-    true>(GroupedBackwardParams& param, hipStream_t stream);
+template void run_grouped_backward_masktype_attnbias_dispatched<ck::bhalf_t, 0, false, true>(
+    GroupedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_bp16_masktype_0_with_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_bp16_masktype_0_with_attnbias.cpp
index 945a91a99..179dadebc 100644
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_bp16_masktype_0_with_attnbias.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_bp16_masktype_0_with_attnbias.cpp
@@ -1,8 +1,11 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
 #include <ck/ck.hpp>
 #include "ck_fmha_grouped_backward.h"
 
-template void run_grouped_backward_masktype_attnbias_dispatched<
-    ck::bhalf_t,
-    0,
-    true,
-    false>(GroupedBackwardParams& param, hipStream_t stream);
+template void run_grouped_backward_masktype_attnbias_dispatched<ck::bhalf_t, 0, true, false>(
+    GroupedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_bp16_masktype_0_with_attnbias_fp32_grad.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_bp16_masktype_0_with_attnbias_fp32_grad.cpp
index ea443ab4b..3b604cd00 100644
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_bp16_masktype_0_with_attnbias_fp32_grad.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_bp16_masktype_0_with_attnbias_fp32_grad.cpp
@@ -1,8 +1,11 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
 #include <ck/ck.hpp>
 #include "ck_fmha_grouped_backward.h"
 
-template void run_grouped_backward_masktype_attnbias_dispatched<
-    ck::bhalf_t,
-    0,
-    true,
-    true>(GroupedBackwardParams& param, hipStream_t stream);
+template void run_grouped_backward_masktype_attnbias_dispatched<ck::bhalf_t, 0, true, true>(
+    GroupedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_bp16_masktype_1_no_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_bp16_masktype_1_no_attnbias.cpp
index daa0dc1c7..07ec9e671 100644
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_bp16_masktype_1_no_attnbias.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_bp16_masktype_1_no_attnbias.cpp
@@ -1,8 +1,11 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
 #include <ck/ck.hpp>
 #include "ck_fmha_grouped_backward.h"
 
-template void run_grouped_backward_masktype_attnbias_dispatched<
-    ck::bhalf_t,
-    1,
-    false,
-    false>(GroupedBackwardParams& param, hipStream_t stream);
+template void run_grouped_backward_masktype_attnbias_dispatched<ck::bhalf_t, 1, false, false>(
+    GroupedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_bp16_masktype_1_no_attnbias_fp32_grad.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_bp16_masktype_1_no_attnbias_fp32_grad.cpp
index b8273b2d6..b23b68e21 100644
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_bp16_masktype_1_no_attnbias_fp32_grad.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_bp16_masktype_1_no_attnbias_fp32_grad.cpp
@@ -1,8 +1,11 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
 #include <ck/ck.hpp>
 #include "ck_fmha_grouped_backward.h"
 
-template void run_grouped_backward_masktype_attnbias_dispatched<
-    ck::bhalf_t,
-    1,
-    false,
-    true>(GroupedBackwardParams& param, hipStream_t stream);
+template void run_grouped_backward_masktype_attnbias_dispatched<ck::bhalf_t, 1, false, true>(
+    GroupedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_bp16_masktype_1_with_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_bp16_masktype_1_with_attnbias.cpp
index 6496bca76..2c5cf0189 100644
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_bp16_masktype_1_with_attnbias.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_bp16_masktype_1_with_attnbias.cpp
@@ -1,8 +1,11 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
 #include <ck/ck.hpp>
 #include "ck_fmha_grouped_backward.h"
 
-template void run_grouped_backward_masktype_attnbias_dispatched<
-    ck::bhalf_t,
-    1,
-    true,
-    false>(GroupedBackwardParams& param, hipStream_t stream);
+template void run_grouped_backward_masktype_attnbias_dispatched<ck::bhalf_t, 1, true, false>(
+    GroupedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_bp16_masktype_1_with_attnbias_fp32_grad.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_bp16_masktype_1_with_attnbias_fp32_grad.cpp
index d2cf1d5df..3dbf05b04 100644
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_bp16_masktype_1_with_attnbias_fp32_grad.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_bp16_masktype_1_with_attnbias_fp32_grad.cpp
@@ -1,8 +1,11 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
 #include <ck/ck.hpp>
 #include "ck_fmha_grouped_backward.h"
 
-template void run_grouped_backward_masktype_attnbias_dispatched<
-    ck::bhalf_t,
-    1,
-    true,
-    true>(GroupedBackwardParams& param, hipStream_t stream);
+template void run_grouped_backward_masktype_attnbias_dispatched<ck::bhalf_t, 1, true, true>(
+    GroupedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_bp16_masktype_2_no_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_bp16_masktype_2_no_attnbias.cpp
index 7ae9b06f5..765eb7fd2 100644
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_bp16_masktype_2_no_attnbias.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_bp16_masktype_2_no_attnbias.cpp
@@ -1,8 +1,11 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
 #include <ck/ck.hpp>
 #include "ck_fmha_grouped_backward.h"
 
-template void run_grouped_backward_masktype_attnbias_dispatched<
-    ck::bhalf_t,
-    2,
-    false,
-    false>(GroupedBackwardParams& param, hipStream_t stream);
+template void run_grouped_backward_masktype_attnbias_dispatched<ck::bhalf_t, 2, false, false>(
+    GroupedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_bp16_masktype_2_no_attnbias_fp32_grad.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_bp16_masktype_2_no_attnbias_fp32_grad.cpp
index 13a1bd476..9eae79997 100644
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_bp16_masktype_2_no_attnbias_fp32_grad.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_bp16_masktype_2_no_attnbias_fp32_grad.cpp
@@ -1,8 +1,11 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
 #include <ck/ck.hpp>
 #include "ck_fmha_grouped_backward.h"
 
-template void run_grouped_backward_masktype_attnbias_dispatched<
-    ck::bhalf_t,
-    2,
-    false,
-    true>(GroupedBackwardParams& param, hipStream_t stream);
+template void run_grouped_backward_masktype_attnbias_dispatched<ck::bhalf_t, 2, false, true>(
+    GroupedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_bp16_masktype_2_with_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_bp16_masktype_2_with_attnbias.cpp
index 01d292154..2d85adcdc 100644
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_bp16_masktype_2_with_attnbias.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_bp16_masktype_2_with_attnbias.cpp
@@ -1,8 +1,11 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
 #include <ck/ck.hpp>
 #include "ck_fmha_grouped_backward.h"
 
-template void run_grouped_backward_masktype_attnbias_dispatched<
-    ck::bhalf_t,
-    2,
-    true,
-    false>(GroupedBackwardParams& param, hipStream_t stream);
+template void run_grouped_backward_masktype_attnbias_dispatched<ck::bhalf_t, 2, true, false>(
+    GroupedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_bp16_masktype_2_with_attnbias_fp32_grad.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_bp16_masktype_2_with_attnbias_fp32_grad.cpp
index 22ec35865..325adcf28 100644
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_bp16_masktype_2_with_attnbias_fp32_grad.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_bp16_masktype_2_with_attnbias_fp32_grad.cpp
@@ -1,8 +1,11 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
 #include <ck/ck.hpp>
 #include "ck_fmha_grouped_backward.h"
 
-template void run_grouped_backward_masktype_attnbias_dispatched<
-    ck::bhalf_t,
-    2,
-    true,
-    true>(GroupedBackwardParams& param, hipStream_t stream);
+template void run_grouped_backward_masktype_attnbias_dispatched<ck::bhalf_t, 2, true, true>(
+    GroupedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_fp16_masktype_0_no_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_fp16_masktype_0_no_attnbias.cpp
index ad20325d7..23c7f7360 100644
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_fp16_masktype_0_no_attnbias.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_fp16_masktype_0_no_attnbias.cpp
@@ -1,8 +1,11 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
 #include <ck/ck.hpp>
 #include "ck_fmha_grouped_backward.h"
 
-template void run_grouped_backward_masktype_attnbias_dispatched<
-    ck::half_t,
-    0,
-    false,
-    false>(GroupedBackwardParams& param, hipStream_t stream);
+template void run_grouped_backward_masktype_attnbias_dispatched<ck::half_t, 0, false, false>(
+    GroupedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_fp16_masktype_0_no_attnbias_fp32_grad.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_fp16_masktype_0_no_attnbias_fp32_grad.cpp
index 3ca75bc61..f5095f9e0 100644
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_fp16_masktype_0_no_attnbias_fp32_grad.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_fp16_masktype_0_no_attnbias_fp32_grad.cpp
@@ -1,8 +1,11 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
 #include <ck/ck.hpp>
 #include "ck_fmha_grouped_backward.h"
 
-template void run_grouped_backward_masktype_attnbias_dispatched<
-    ck::half_t,
-    0,
-    false,
-    true>(GroupedBackwardParams& param, hipStream_t stream);
+template void run_grouped_backward_masktype_attnbias_dispatched<ck::half_t, 0, false, true>(
+    GroupedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_fp16_masktype_0_with_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_fp16_masktype_0_with_attnbias.cpp
index cd9bd1689..d893d066c 100644
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_fp16_masktype_0_with_attnbias.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_fp16_masktype_0_with_attnbias.cpp
@@ -1,8 +1,11 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
 #include <ck/ck.hpp>
 #include "ck_fmha_grouped_backward.h"
 
-template void run_grouped_backward_masktype_attnbias_dispatched<
-    ck::half_t,
-    0,
-    true,
-    false>(GroupedBackwardParams& param, hipStream_t stream);
+template void run_grouped_backward_masktype_attnbias_dispatched<ck::half_t, 0, true, false>(
+    GroupedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_fp16_masktype_0_with_attnbias_fp32_grad.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_fp16_masktype_0_with_attnbias_fp32_grad.cpp
index 8cbdcc253..b81c731c6 100644
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_fp16_masktype_0_with_attnbias_fp32_grad.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_fp16_masktype_0_with_attnbias_fp32_grad.cpp
@@ -1,8 +1,11 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
 #include <ck/ck.hpp>
 #include "ck_fmha_grouped_backward.h"
 
-template void run_grouped_backward_masktype_attnbias_dispatched<
-    ck::half_t,
-    0,
-    true,
-    true>(GroupedBackwardParams& param, hipStream_t stream);
+template void run_grouped_backward_masktype_attnbias_dispatched<ck::half_t, 0, true, true>(
+    GroupedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_fp16_masktype_1_no_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_fp16_masktype_1_no_attnbias.cpp
index 2241fb932..5d79dc7a9 100644
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_fp16_masktype_1_no_attnbias.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_fp16_masktype_1_no_attnbias.cpp
@@ -1,8 +1,11 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
 #include <ck/ck.hpp>
 #include "ck_fmha_grouped_backward.h"
 
-template void run_grouped_backward_masktype_attnbias_dispatched<
-    ck::half_t,
-    1,
-    false,
-    false>(GroupedBackwardParams& param, hipStream_t stream);
+template void run_grouped_backward_masktype_attnbias_dispatched<ck::half_t, 1, false, false>(
+    GroupedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_fp16_masktype_1_no_attnbias_fp32_grad.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_fp16_masktype_1_no_attnbias_fp32_grad.cpp
index b82218a58..8ca3fc15b 100644
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_fp16_masktype_1_no_attnbias_fp32_grad.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_fp16_masktype_1_no_attnbias_fp32_grad.cpp
@@ -1,8 +1,11 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
 #include <ck/ck.hpp>
 #include "ck_fmha_grouped_backward.h"
 
-template void run_grouped_backward_masktype_attnbias_dispatched<
-    ck::half_t,
-    1,
-    false,
-    true>(GroupedBackwardParams& param, hipStream_t stream);
+template void run_grouped_backward_masktype_attnbias_dispatched<ck::half_t, 1, false, true>(
+    GroupedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_fp16_masktype_1_with_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_fp16_masktype_1_with_attnbias.cpp
index 914b28d27..28cfd91f0 100644
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_fp16_masktype_1_with_attnbias.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_fp16_masktype_1_with_attnbias.cpp
@@ -1,8 +1,11 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
 #include <ck/ck.hpp>
 #include "ck_fmha_grouped_backward.h"
 
-template void run_grouped_backward_masktype_attnbias_dispatched<
-    ck::half_t,
-    1,
-    true,
-    false>(GroupedBackwardParams& param, hipStream_t stream);
+template void run_grouped_backward_masktype_attnbias_dispatched<ck::half_t, 1, true, false>(
+    GroupedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_fp16_masktype_1_with_attnbias_fp32_grad.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_fp16_masktype_1_with_attnbias_fp32_grad.cpp
index c1eef0cec..e7974599b 100644
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_fp16_masktype_1_with_attnbias_fp32_grad.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_fp16_masktype_1_with_attnbias_fp32_grad.cpp
@@ -1,8 +1,11 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
 #include <ck/ck.hpp>
 #include "ck_fmha_grouped_backward.h"
 
-template void run_grouped_backward_masktype_attnbias_dispatched<
-    ck::half_t,
-    1,
-    true,
-    true>(GroupedBackwardParams& param, hipStream_t stream);
+template void run_grouped_backward_masktype_attnbias_dispatched<ck::half_t, 1, true, true>(
+    GroupedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_fp16_masktype_2_no_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_fp16_masktype_2_no_attnbias.cpp
index d97a398ee..f7c6bab6b 100644
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_fp16_masktype_2_no_attnbias.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_fp16_masktype_2_no_attnbias.cpp
@@ -1,8 +1,11 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
 #include <ck/ck.hpp>
 #include "ck_fmha_grouped_backward.h"
 
-template void run_grouped_backward_masktype_attnbias_dispatched<
-    ck::half_t,
-    2,
-    false,
-    false>(GroupedBackwardParams& param, hipStream_t stream);
+template void run_grouped_backward_masktype_attnbias_dispatched<ck::half_t, 2, false, false>(
+    GroupedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_fp16_masktype_2_no_attnbias_fp32_grad.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_fp16_masktype_2_no_attnbias_fp32_grad.cpp
index 5d21721d3..389b8ef6b 100644
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_fp16_masktype_2_no_attnbias_fp32_grad.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_fp16_masktype_2_no_attnbias_fp32_grad.cpp
@@ -1,8 +1,11 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
 #include <ck/ck.hpp>
 #include "ck_fmha_grouped_backward.h"
 
-template void run_grouped_backward_masktype_attnbias_dispatched<
-    ck::half_t,
-    2,
-    false,
-    true>(GroupedBackwardParams& param, hipStream_t stream);
+template void run_grouped_backward_masktype_attnbias_dispatched<ck::half_t, 2, false, true>(
+    GroupedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_fp16_masktype_2_with_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_fp16_masktype_2_with_attnbias.cpp
index 0cfac6111..cf6edccb5 100644
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_fp16_masktype_2_with_attnbias.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_fp16_masktype_2_with_attnbias.cpp
@@ -1,8 +1,11 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
 #include <ck/ck.hpp>
 #include "ck_fmha_grouped_backward.h"
 
-template void run_grouped_backward_masktype_attnbias_dispatched<
-    ck::half_t,
-    2,
-    true,
-    false>(GroupedBackwardParams& param, hipStream_t stream);
+template void run_grouped_backward_masktype_attnbias_dispatched<ck::half_t, 2, true, false>(
+    GroupedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_fp16_masktype_2_with_attnbias_fp32_grad.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_fp16_masktype_2_with_attnbias_fp32_grad.cpp
index 551a46c9c..fc2e60a47 100644
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_fp16_masktype_2_with_attnbias_fp32_grad.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_fp16_masktype_2_with_attnbias_fp32_grad.cpp
@@ -1,8 +1,11 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
 #include <ck/ck.hpp>
 #include "ck_fmha_grouped_backward.h"
 
-template void run_grouped_backward_masktype_attnbias_dispatched<
-    ck::half_t,
-    2,
-    true,
-    true>(GroupedBackwardParams& param, hipStream_t stream);
+template void run_grouped_backward_masktype_attnbias_dispatched<ck::half_t, 2, true, true>(
+    GroupedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_forward_bp16_masktype_0_no_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_forward_bp16_masktype_0_no_attnbias.cpp
index bfde13c7d..4d473f7b9 100644
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_forward_bp16_masktype_0_no_attnbias.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_forward_bp16_masktype_0_no_attnbias.cpp
@@ -1,7 +1,12 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
 #include <ck/ck.hpp>
 #include "ck_fmha_grouped_forward.h"
 
-template void run_grouped_forward_masktype_attnbias_dispatched<
-    ck::bhalf_t,
-    0,
-    false>(GroupedForwardParams& param, hipStream_t stream);
+template void
+run_grouped_forward_masktype_attnbias_dispatched<ck::bhalf_t, 0, false>(GroupedForwardParams& param,
+                                                                        hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_forward_bp16_masktype_0_with_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_forward_bp16_masktype_0_with_attnbias.cpp
index 85e853c36..4b64703b2 100644
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_forward_bp16_masktype_0_with_attnbias.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_forward_bp16_masktype_0_with_attnbias.cpp
@@ -1,7 +1,12 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
 #include <ck/ck.hpp>
 #include "ck_fmha_grouped_forward.h"
 
-template void run_grouped_forward_masktype_attnbias_dispatched<
-    ck::bhalf_t,
-    0,
-    true>(GroupedForwardParams& param, hipStream_t stream);
+template void
+run_grouped_forward_masktype_attnbias_dispatched<ck::bhalf_t, 0, true>(GroupedForwardParams& param,
+                                                                       hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_forward_bp16_masktype_1_no_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_forward_bp16_masktype_1_no_attnbias.cpp
index d86afa1aa..ed5a11c66 100644
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_forward_bp16_masktype_1_no_attnbias.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_forward_bp16_masktype_1_no_attnbias.cpp
@@ -1,7 +1,12 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
 #include <ck/ck.hpp>
 #include "ck_fmha_grouped_forward.h"
 
-template void run_grouped_forward_masktype_attnbias_dispatched<
-    ck::bhalf_t,
-    1,
-    false>(GroupedForwardParams& param, hipStream_t stream);
+template void
+run_grouped_forward_masktype_attnbias_dispatched<ck::bhalf_t, 1, false>(GroupedForwardParams& param,
+                                                                        hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_forward_bp16_masktype_1_with_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_forward_bp16_masktype_1_with_attnbias.cpp
index dd58b5b28..4ecf75691 100644
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_forward_bp16_masktype_1_with_attnbias.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_forward_bp16_masktype_1_with_attnbias.cpp
@@ -1,7 +1,12 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
 #include <ck/ck.hpp>
 #include "ck_fmha_grouped_forward.h"
 
-template void run_grouped_forward_masktype_attnbias_dispatched<
-    ck::bhalf_t,
-    1,
-    true>(GroupedForwardParams& param, hipStream_t stream);
+template void
+run_grouped_forward_masktype_attnbias_dispatched<ck::bhalf_t, 1, true>(GroupedForwardParams& param,
+                                                                       hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_forward_bp16_masktype_2_no_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_forward_bp16_masktype_2_no_attnbias.cpp
index 085245c08..af22c6c13 100644
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_forward_bp16_masktype_2_no_attnbias.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_forward_bp16_masktype_2_no_attnbias.cpp
@@ -1,7 +1,12 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
 #include <ck/ck.hpp>
 #include "ck_fmha_grouped_forward.h"
 
-template void run_grouped_forward_masktype_attnbias_dispatched<
-    ck::bhalf_t,
-    2,
-    false>(GroupedForwardParams& param, hipStream_t stream);
+template void
+run_grouped_forward_masktype_attnbias_dispatched<ck::bhalf_t, 2, false>(GroupedForwardParams& param,
+                                                                        hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_forward_bp16_masktype_2_with_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_forward_bp16_masktype_2_with_attnbias.cpp
index 8c3ea29a4..2aa5b9431 100644
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_forward_bp16_masktype_2_with_attnbias.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_forward_bp16_masktype_2_with_attnbias.cpp
@@ -1,7 +1,12 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
 #include <ck/ck.hpp>
 #include "ck_fmha_grouped_forward.h"
 
-template void run_grouped_forward_masktype_attnbias_dispatched<
-    ck::bhalf_t,
-    2,
-    true>(GroupedForwardParams& param, hipStream_t stream);
+template void
+run_grouped_forward_masktype_attnbias_dispatched<ck::bhalf_t, 2, true>(GroupedForwardParams& param,
+                                                                       hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_forward_fp16_masktype_0_no_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_forward_fp16_masktype_0_no_attnbias.cpp
index 19adc3971..efaa2ee52 100644
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_forward_fp16_masktype_0_no_attnbias.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_forward_fp16_masktype_0_no_attnbias.cpp
@@ -1,7 +1,12 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
 #include <ck/ck.hpp>
 #include "ck_fmha_grouped_forward.h"
 
-template void run_grouped_forward_masktype_attnbias_dispatched<
-    ck::half_t,
-    0,
-    false>(GroupedForwardParams& param, hipStream_t stream);
+template void
+run_grouped_forward_masktype_attnbias_dispatched<ck::half_t, 0, false>(GroupedForwardParams& param,
+                                                                       hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_forward_fp16_masktype_0_with_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_forward_fp16_masktype_0_with_attnbias.cpp
index 6da5508d3..7394b8b72 100644
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_forward_fp16_masktype_0_with_attnbias.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_forward_fp16_masktype_0_with_attnbias.cpp
@@ -1,7 +1,12 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
 #include <ck/ck.hpp>
 #include "ck_fmha_grouped_forward.h"
 
-template void run_grouped_forward_masktype_attnbias_dispatched<
-    ck::half_t,
-    0,
-    true>(GroupedForwardParams& param, hipStream_t stream);
+template void
+run_grouped_forward_masktype_attnbias_dispatched<ck::half_t, 0, true>(GroupedForwardParams& param,
+                                                                      hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_forward_fp16_masktype_1_no_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_forward_fp16_masktype_1_no_attnbias.cpp
index f97de6fb3..3b7732cb0 100644
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_forward_fp16_masktype_1_no_attnbias.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_forward_fp16_masktype_1_no_attnbias.cpp
@@ -1,7 +1,12 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
 #include <ck/ck.hpp>
 #include "ck_fmha_grouped_forward.h"
 
-template void run_grouped_forward_masktype_attnbias_dispatched<
-    ck::half_t,
-    1,
-    false>(GroupedForwardParams& param, hipStream_t stream);
+template void
+run_grouped_forward_masktype_attnbias_dispatched<ck::half_t, 1, false>(GroupedForwardParams& param,
+                                                                       hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_forward_fp16_masktype_1_with_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_forward_fp16_masktype_1_with_attnbias.cpp
index 5bd33901b..a4db70fcf 100644
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_forward_fp16_masktype_1_with_attnbias.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_forward_fp16_masktype_1_with_attnbias.cpp
@@ -1,7 +1,12 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
 #include <ck/ck.hpp>
 #include "ck_fmha_grouped_forward.h"
 
-template void run_grouped_forward_masktype_attnbias_dispatched<
-    ck::half_t,
-    1,
-    true>(GroupedForwardParams& param, hipStream_t stream);
+template void
+run_grouped_forward_masktype_attnbias_dispatched<ck::half_t, 1, true>(GroupedForwardParams& param,
+                                                                      hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_forward_fp16_masktype_2_no_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_forward_fp16_masktype_2_no_attnbias.cpp
index 155c9eb6c..c19f683b6 100644
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_forward_fp16_masktype_2_no_attnbias.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_forward_fp16_masktype_2_no_attnbias.cpp
@@ -1,7 +1,12 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
 #include <ck/ck.hpp>
 #include "ck_fmha_grouped_forward.h"
 
-template void run_grouped_forward_masktype_attnbias_dispatched<
-    ck::half_t,
-    2,
-    false>(GroupedForwardParams& param, hipStream_t stream);
+template void
+run_grouped_forward_masktype_attnbias_dispatched<ck::half_t, 2, false>(GroupedForwardParams& param,
+                                                                       hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_forward_fp16_masktype_2_with_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_forward_fp16_masktype_2_with_attnbias.cpp
index 29f3ed1a3..2e10db88a 100644
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_forward_fp16_masktype_2_with_attnbias.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_forward_fp16_masktype_2_with_attnbias.cpp
@@ -1,7 +1,12 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
 #include <ck/ck.hpp>
 #include "ck_fmha_grouped_forward.h"
 
-template void run_grouped_forward_masktype_attnbias_dispatched<
-    ck::half_t,
-    2,
-    true>(GroupedForwardParams& param, hipStream_t stream);
+template void
+run_grouped_forward_masktype_attnbias_dispatched<ck::half_t, 2, true>(GroupedForwardParams& param,
+                                                                      hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_infer_bp16_masktype_0_no_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_infer_bp16_masktype_0_no_attnbias.cpp
index 973213413..3c012adbf 100644
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_infer_bp16_masktype_0_no_attnbias.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_infer_bp16_masktype_0_no_attnbias.cpp
@@ -1,8 +1,13 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
 #include <ck/ck.hpp>
 
 #include "ck_fmha_grouped_infer.h"
 
-template void run_grouped_infer_masktype_attnbias_dispatched<
-    ck::bhalf_t,
-    0,
-    false>(GroupedForwardParams& param, hipStream_t stream);
+template void
+run_grouped_infer_masktype_attnbias_dispatched<ck::bhalf_t, 0, false>(GroupedForwardParams& param,
+                                                                      hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_infer_bp16_masktype_0_with_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_infer_bp16_masktype_0_with_attnbias.cpp
index 96e0ba425..f19c5a4e9 100644
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_infer_bp16_masktype_0_with_attnbias.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_infer_bp16_masktype_0_with_attnbias.cpp
@@ -1,8 +1,13 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
 #include <ck/ck.hpp>
 
 #include "ck_fmha_grouped_infer.h"
 
-template void run_grouped_infer_masktype_attnbias_dispatched<
-    ck::bhalf_t,
-    0,
-    true>(GroupedForwardParams& param, hipStream_t stream);
+template void
+run_grouped_infer_masktype_attnbias_dispatched<ck::bhalf_t, 0, true>(GroupedForwardParams& param,
+                                                                     hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_infer_bp16_masktype_1_no_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_infer_bp16_masktype_1_no_attnbias.cpp
index 332724e73..b12476dad 100644
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_infer_bp16_masktype_1_no_attnbias.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_infer_bp16_masktype_1_no_attnbias.cpp
@@ -1,8 +1,13 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
 #include <ck/ck.hpp>
 
 #include "ck_fmha_grouped_infer.h"
 
-template void run_grouped_infer_masktype_attnbias_dispatched<
-    ck::bhalf_t,
-    1,
-    false>(GroupedForwardParams& param, hipStream_t stream);
+template void
+run_grouped_infer_masktype_attnbias_dispatched<ck::bhalf_t, 1, false>(GroupedForwardParams& param,
+                                                                      hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_infer_bp16_masktype_1_with_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_infer_bp16_masktype_1_with_attnbias.cpp
index cb1120f5b..ab0141e0d 100644
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_infer_bp16_masktype_1_with_attnbias.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_infer_bp16_masktype_1_with_attnbias.cpp
@@ -1,8 +1,13 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
 #include <ck/ck.hpp>
 
 #include "ck_fmha_grouped_infer.h"
 
-template void run_grouped_infer_masktype_attnbias_dispatched<
-    ck::bhalf_t,
-    1,
-    true>(GroupedForwardParams& param, hipStream_t stream);
+template void
+run_grouped_infer_masktype_attnbias_dispatched<ck::bhalf_t, 1, true>(GroupedForwardParams& param,
+                                                                     hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_infer_bp16_masktype_2_no_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_infer_bp16_masktype_2_no_attnbias.cpp
index 51ed70cab..546074138 100644
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_infer_bp16_masktype_2_no_attnbias.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_infer_bp16_masktype_2_no_attnbias.cpp
@@ -1,8 +1,13 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
 #include <ck/ck.hpp>
 
 #include "ck_fmha_grouped_infer.h"
 
-template void run_grouped_infer_masktype_attnbias_dispatched<
-    ck::bhalf_t,
-    2,
-    false>(GroupedForwardParams& param, hipStream_t stream);
+template void
+run_grouped_infer_masktype_attnbias_dispatched<ck::bhalf_t, 2, false>(GroupedForwardParams& param,
+                                                                      hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_infer_bp16_masktype_2_with_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_infer_bp16_masktype_2_with_attnbias.cpp
index c157e89c1..9b65ff186 100644
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_infer_bp16_masktype_2_with_attnbias.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_infer_bp16_masktype_2_with_attnbias.cpp
@@ -1,8 +1,13 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
 #include <ck/ck.hpp>
 
 #include "ck_fmha_grouped_infer.h"
 
-template void run_grouped_infer_masktype_attnbias_dispatched<
-    ck::bhalf_t,
-    2,
-    true>(GroupedForwardParams& param, hipStream_t stream);
+template void
+run_grouped_infer_masktype_attnbias_dispatched<ck::bhalf_t, 2, true>(GroupedForwardParams& param,
+                                                                     hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_infer_fp16_masktype_0_no_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_infer_fp16_masktype_0_no_attnbias.cpp
index bbcd3ab0e..3e8a0eb75 100644
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_infer_fp16_masktype_0_no_attnbias.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_infer_fp16_masktype_0_no_attnbias.cpp
@@ -1,8 +1,13 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
 #include <ck/ck.hpp>
 
 #include "ck_fmha_grouped_infer.h"
 
-template void run_grouped_infer_masktype_attnbias_dispatched<
-    ck::half_t,
-    0,
-    false>(GroupedForwardParams& param, hipStream_t stream);
+template void
+run_grouped_infer_masktype_attnbias_dispatched<ck::half_t, 0, false>(GroupedForwardParams& param,
+                                                                     hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_infer_fp16_masktype_0_with_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_infer_fp16_masktype_0_with_attnbias.cpp
index e320f5de6..92879082c 100644
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_infer_fp16_masktype_0_with_attnbias.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_infer_fp16_masktype_0_with_attnbias.cpp
@@ -1,8 +1,13 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
 #include <ck/ck.hpp>
 
 #include "ck_fmha_grouped_infer.h"
 
-template void run_grouped_infer_masktype_attnbias_dispatched<
-    ck::half_t,
-    0,
-    true>(GroupedForwardParams& param, hipStream_t stream);
+template void
+run_grouped_infer_masktype_attnbias_dispatched<ck::half_t, 0, true>(GroupedForwardParams& param,
+                                                                    hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_infer_fp16_masktype_1_no_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_infer_fp16_masktype_1_no_attnbias.cpp
index e763dde6a..37137dc97 100644
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_infer_fp16_masktype_1_no_attnbias.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_infer_fp16_masktype_1_no_attnbias.cpp
@@ -1,8 +1,13 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
 #include <ck/ck.hpp>
 
 #include "ck_fmha_grouped_infer.h"
 
-template void run_grouped_infer_masktype_attnbias_dispatched<
-    ck::half_t,
-    1,
-    false>(GroupedForwardParams& param, hipStream_t stream);
+template void
+run_grouped_infer_masktype_attnbias_dispatched<ck::half_t, 1, false>(GroupedForwardParams& param,
+                                                                     hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_infer_fp16_masktype_1_with_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_infer_fp16_masktype_1_with_attnbias.cpp
index 3ec2d41da..3ea5affe8 100644
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_infer_fp16_masktype_1_with_attnbias.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_infer_fp16_masktype_1_with_attnbias.cpp
@@ -1,8 +1,13 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
 #include <ck/ck.hpp>
 
 #include "ck_fmha_grouped_infer.h"
 
-template void run_grouped_infer_masktype_attnbias_dispatched<
-    ck::half_t,
-    1,
-    true>(GroupedForwardParams& param, hipStream_t stream);
+template void
+run_grouped_infer_masktype_attnbias_dispatched<ck::half_t, 1, true>(GroupedForwardParams& param,
+                                                                    hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_infer_fp16_masktype_2_no_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_infer_fp16_masktype_2_no_attnbias.cpp
index dee7a0845..33f2bc7f9 100644
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_infer_fp16_masktype_2_no_attnbias.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_infer_fp16_masktype_2_no_attnbias.cpp
@@ -1,8 +1,13 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
 #include <ck/ck.hpp>
 
 #include "ck_fmha_grouped_infer.h"
 
-template void run_grouped_infer_masktype_attnbias_dispatched<
-    ck::half_t,
-    2,
-    false>(GroupedForwardParams& param, hipStream_t stream);
+template void
+run_grouped_infer_masktype_attnbias_dispatched<ck::half_t, 2, false>(GroupedForwardParams& param,
+                                                                     hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_infer_fp16_masktype_2_with_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_infer_fp16_masktype_2_with_attnbias.cpp
index b5515e9a0..27eea7bac 100644
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_infer_fp16_masktype_2_with_attnbias.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_infer_fp16_masktype_2_with_attnbias.cpp
@@ -1,8 +1,13 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
 #include <ck/ck.hpp>
 
 #include "ck_fmha_grouped_infer.h"
 
-template void run_grouped_infer_masktype_attnbias_dispatched<
-    ck::half_t,
-    2,
-    true>(GroupedForwardParams& param, hipStream_t stream);
+template void
+run_grouped_infer_masktype_attnbias_dispatched<ck::half_t, 2, true>(GroupedForwardParams& param,
+                                                                    hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_fp16_masktype_0_no_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_fp16_masktype_0_no_attnbias.cpp
index 8f4c31ab3..5c9d5a113 100644
--- a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_fp16_masktype_0_no_attnbias.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_fp16_masktype_0_no_attnbias.cpp
@@ -1,3 +1,9 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
 #include <ck/ck.hpp>
 
 #include "ck_tiled_fmha_batched_infer.h"
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_fp16_masktype_0_with_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_fp16_masktype_0_with_attnbias.cpp
index 783fb5e16..22ba1cbf0 100644
--- a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_fp16_masktype_0_with_attnbias.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_fp16_masktype_0_with_attnbias.cpp
@@ -1,3 +1,9 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
 #include <ck/ck.hpp>
 
 #include "ck_tiled_fmha_batched_infer.h"
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_fp16_masktype_1_no_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_fp16_masktype_1_no_attnbias.cpp
index 7be550de2..a788c0e4b 100644
--- a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_fp16_masktype_1_no_attnbias.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_fp16_masktype_1_no_attnbias.cpp
@@ -1,3 +1,9 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
 #include <ck/ck.hpp>
 
 #include "ck_tiled_fmha_batched_infer.h"
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_fp16_masktype_1_with_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_fp16_masktype_1_with_attnbias.cpp
index 9276ca53f..f9d551e6e 100644
--- a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_fp16_masktype_1_with_attnbias.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_fp16_masktype_1_with_attnbias.cpp
@@ -1,3 +1,9 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
 #include <ck/ck.hpp>
 
 #include "ck_tiled_fmha_batched_infer.h"
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_fp16_masktype_2_no_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_fp16_masktype_2_no_attnbias.cpp
index da3f5004e..daa204ebd 100644
--- a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_fp16_masktype_2_no_attnbias.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_fp16_masktype_2_no_attnbias.cpp
@@ -1,3 +1,9 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
 #include <ck/ck.hpp>
 
 #include "ck_tiled_fmha_batched_infer.h"
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_fp16_masktype_2_with_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_fp16_masktype_2_with_attnbias.cpp
index 189d295d2..11ab6765f 100644
--- a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_fp16_masktype_2_with_attnbias.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_fp16_masktype_2_with_attnbias.cpp
@@ -1,3 +1,9 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
 #include <ck/ck.hpp>
 
 #include "ck_tiled_fmha_batched_infer.h"
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_fp16_masktype_0_no_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_fp16_masktype_0_no_attnbias.cpp
index 100150751..e40ffafc3 100644
--- a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_fp16_masktype_0_no_attnbias.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_fp16_masktype_0_no_attnbias.cpp
@@ -1,3 +1,9 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
 #include <ck/ck.hpp>
 
 #include "ck_tiled_fmha_grouped_infer.h"
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_fp16_masktype_0_with_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_fp16_masktype_0_with_attnbias.cpp
index 3b323b7bb..537e59bd1 100644
--- a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_fp16_masktype_0_with_attnbias.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_fp16_masktype_0_with_attnbias.cpp
@@ -1,3 +1,9 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
 #include <ck/ck.hpp>
 
 #include "ck_tiled_fmha_grouped_infer.h"
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_fp16_masktype_1_no_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_fp16_masktype_1_no_attnbias.cpp
index 6fad32f78..919c73a4a 100644
--- a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_fp16_masktype_1_no_attnbias.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_fp16_masktype_1_no_attnbias.cpp
@@ -1,3 +1,9 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
 #include <ck/ck.hpp>
 
 #include "ck_tiled_fmha_grouped_infer.h"
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_fp16_masktype_1_with_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_fp16_masktype_1_with_attnbias.cpp
index 39646e941..17da13db7 100644
--- a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_fp16_masktype_1_with_attnbias.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_fp16_masktype_1_with_attnbias.cpp
@@ -1,3 +1,9 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
 #include <ck/ck.hpp>
 
 #include "ck_tiled_fmha_grouped_infer.h"
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_fp16_masktype_2_no_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_fp16_masktype_2_no_attnbias.cpp
index ba5384e43..e5d08e589 100644
--- a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_fp16_masktype_2_no_attnbias.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_fp16_masktype_2_no_attnbias.cpp
@@ -1,3 +1,9 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
 #include <ck/ck.hpp>
 
 #include "ck_tiled_fmha_grouped_infer.h"
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_fp16_masktype_2_with_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_fp16_masktype_2_with_attnbias.cpp
index f6e4a4215..e78118baf 100644
--- a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_fp16_masktype_2_with_attnbias.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_fp16_masktype_2_with_attnbias.cpp
@@ -1,3 +1,9 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
 #include <ck/ck.hpp>
 
 #include "ck_tiled_fmha_grouped_infer.h"

From bbdb8e70651df4f8c5a33b2700400c41eb6914b2 Mon Sep 17 00:00:00 2001
From: Qianfeng Zhang <Qianfeng.Zhang@amd.com>
Date: Wed, 6 Dec 2023 15:10:54 +0000
Subject: [PATCH 254/641] Update to tests/test_forward_ck_tiled.py

---
 tests/test_forward_ck_tiled.py | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/tests/test_forward_ck_tiled.py b/tests/test_forward_ck_tiled.py
index f295887e9..3c5419525 100644
--- a/tests/test_forward_ck_tiled.py
+++ b/tests/test_forward_ck_tiled.py
@@ -576,10 +576,6 @@ def test_forward(
         kv,
     ) = opFW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv
 
-    if bias_type is not None and bias_type is not type(None):
-        if bias_type is not torch.Tensor and bias_type is not fmha.attn_bias.BlockDiagonalMask:
-            pytest.skip("only three bias types are supported by ck-tiled!")
-
     if dtype is torch.bfloat16:
         pytest.skip("bfloat16 is currently not supported by ck-tiled!")
 

From ff48957a23160e4490d90fa1af75ee6b49db09de Mon Sep 17 00:00:00 2001
From: Qianfeng Zhang <Qianfeng.Zhang@amd.com>
Date: Wed, 6 Dec 2023 15:35:55 +0000
Subject: [PATCH 255/641] Synchronize the latest
 third_party/composable_kernel_tiled and update .gitmodules

---
 .gitmodules                                   |  4 ++++
 .../hip_fmha/ck_tiled_fmha_batched_infer.h    | 21 +++++++++++--------
 .../hip_fmha/ck_tiled_fmha_grouped_infer.h    |  8 +++----
 3 files changed, 20 insertions(+), 13 deletions(-)

diff --git a/.gitmodules b/.gitmodules
index 94eb8135c..bf2678053 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -8,3 +8,7 @@
 [submodule "third_party/flash-attention"]
 	path = third_party/flash-attention
 	url = https://github.com/Dao-AILab/flash-attention.git
+[submodule "third_party/composable_kernel_tiled"]
+	path = third_party/composable_kernel_tiled
+	url = https://github.com/asroy/ck_tile
+        branch = feature/fmha-pad-support
diff --git a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_infer.h b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_infer.h
index 1a3d0fd65..336228f6f 100644
--- a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_infer.h
+++ b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_infer.h
@@ -23,6 +23,7 @@
 #include <ck/tile_program/block_tile_pipeline/block_fmha_pipeline_qr_ks_vs.hpp>
 #include <ck/tile_program/block_tile_pipeline/block_fmha_pipeline_qr_ks_vs_default_policy.hpp>
 #include <ck/tile_program/tile/tile_fmha_shape.hpp>
+#include <ck/tile_program/tile/tile_fmha_traits.hpp>
 
 #include "ck_tiled_fmha_forward_kernel.h"
 #include "ck_tiled_fmha_fwd_epilogue.h"
@@ -87,7 +88,7 @@ struct batched_infer_masktype_attnbias_dispatched
     }()
 #endif
 
-    template <bool kM0NeedPadding, bool kN0K1NeedPadding, typename FmhaShape>
+    template <typename FmhaTraits, typename FmhaShape>
     using FmhaPipelineProblemTemp =
         ck::tile_program::block::BlockFmhaPipelineProblem<QDataType,
                                                           KDataType,
@@ -101,10 +102,8 @@ struct batched_infer_masktype_attnbias_dispatched
                                                           256, // BlockSize
                                                           FmhaShape,
                                                           false, // kIsGroupMode
-                                                          kM0NeedPadding,
-                                                          kN0K1NeedPadding,
-                                                          has_attn_bias,
-                                                          FmhaCausalMask>;
+                                                          FmhaCausalMask,
+                                                          FmhaTraits>;
 
     static void Run(BatchedForwardParams& param, hipStream_t stream)
     {
@@ -113,7 +112,8 @@ struct batched_infer_masktype_attnbias_dispatched
 
             if(param.M % FmhaShape::kM0 == 0 && param.N % FmhaShape::kN0 == 0)
             {
-                using FmhaPipelineProblem = FmhaPipelineProblemTemp<false, false, FmhaShape>;
+                using FmhaTraits = ck::tile_program::TileFmhaTraits<false, false, has_attn_bias>;
+                using FmhaPipelineProblem = FmhaPipelineProblemTemp<FmhaTraits, FmhaShape>;
                 using FmhaPipeline =
                     ck::tile_program::block::BlockFmhaPipelineQRKSVS<FmhaPipelineProblem>;
                 using FmhaKernel = FmhaFwdKernel<FmhaTilePartitioner, FmhaPipeline, FmhaEpilogue>;
@@ -122,7 +122,8 @@ struct batched_infer_masktype_attnbias_dispatched
             }
             else if(param.M % FmhaShape::kM0 == 0 && param.N % FmhaShape::kN0 != 0)
             {
-                using FmhaPipelineProblem = FmhaPipelineProblemTemp<false, true, FmhaShape>;
+                using FmhaTraits = ck::tile_program::TileFmhaTraits<false, true, has_attn_bias>;
+                using FmhaPipelineProblem = FmhaPipelineProblemTemp<FmhaTraits, FmhaShape>;
                 using FmhaPipeline =
                     ck::tile_program::block::BlockFmhaPipelineQRKSVS<FmhaPipelineProblem>;
                 using FmhaKernel = FmhaFwdKernel<FmhaTilePartitioner, FmhaPipeline, FmhaEpilogue>;
@@ -131,7 +132,8 @@ struct batched_infer_masktype_attnbias_dispatched
             }
             else if(param.M % FmhaShape::kM0 != 0 && param.N % FmhaShape::kN0 == 0)
             {
-                using FmhaPipelineProblem = FmhaPipelineProblemTemp<true, false, FmhaShape>;
+                using FmhaTraits = ck::tile_program::TileFmhaTraits<true, false, has_attn_bias>;
+                using FmhaPipelineProblem = FmhaPipelineProblemTemp<FmhaTraits, FmhaShape>;
                 using FmhaPipeline =
                     ck::tile_program::block::BlockFmhaPipelineQRKSVS<FmhaPipelineProblem>;
                 using FmhaKernel = FmhaFwdKernel<FmhaTilePartitioner, FmhaPipeline, FmhaEpilogue>;
@@ -140,7 +142,8 @@ struct batched_infer_masktype_attnbias_dispatched
             }
             else if(param.M % FmhaShape::kM0 != 0 && param.N % FmhaShape::kN0 != 0)
             {
-                using FmhaPipelineProblem = FmhaPipelineProblemTemp<true, true, FmhaShape>;
+                using FmhaTraits = ck::tile_program::TileFmhaTraits<true, true, has_attn_bias>;
+                using FmhaPipelineProblem = FmhaPipelineProblemTemp<FmhaTraits, FmhaShape>;
                 using FmhaPipeline =
                     ck::tile_program::block::BlockFmhaPipelineQRKSVS<FmhaPipelineProblem>;
                 using FmhaKernel = FmhaFwdKernel<FmhaTilePartitioner, FmhaPipeline, FmhaEpilogue>;
diff --git a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_infer.h b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_infer.h
index ba684f154..89b4348f3 100644
--- a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_infer.h
+++ b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_infer.h
@@ -24,6 +24,7 @@
 #include <ck/tile_program/block_tile_pipeline/block_fmha_pipeline_qr_ks_vs.hpp>
 #include <ck/tile_program/block_tile_pipeline/block_fmha_pipeline_qr_ks_vs_default_policy.hpp>
 #include <ck/tile_program/tile/tile_fmha_shape.hpp>
+#include <ck/tile_program/tile/tile_fmha_traits.hpp>
 
 #include "ck_tiled_fmha_forward_kernel.h"
 #include "ck_tiled_fmha_fwd_epilogue.h"
@@ -97,6 +98,7 @@ struct grouped_infer_masktype_attnbias_dispatched
     {
         GROUPED_INFER_HEADDIM_SWITCH(param.K, param.Kv, [&] {
             using FmhaTilePartitioner = FmhaFwdTilePartitioner<FmhaShape>;
+            using FmhaTraits          = ck::tile_program::TileFmhaTraits<true, true, has_attn_bias>;
             using FmhaPipelineProblem =
                 ck::tile_program::block::BlockFmhaPipelineProblem<QDataType,
                                                                   KDataType,
@@ -110,10 +112,8 @@ struct grouped_infer_masktype_attnbias_dispatched
                                                                   256, // BlockSize
                                                                   FmhaShape,
                                                                   true, // IsGroupMode
-                                                                  true, // kM0NeedPadding
-                                                                  true, // kN0K1Needpadding
-                                                                  has_attn_bias,
-                                                                  FmhaCausalMask>;
+                                                                  FmhaCausalMask,
+                                                                  FmhaTraits>;
 
             using FmhaPipeline =
                 ck::tile_program::block::BlockFmhaPipelineQRKSVS<FmhaPipelineProblem>;

From 85b757783e6339c25dfefced512b767e407b5720 Mon Sep 17 00:00:00 2001
From: Max Podkorytov <4273004+tenpercent@users.noreply.github.com>
Date: Wed, 22 Nov 2023 18:43:20 -0500
Subject: [PATCH 256/641] flatten block index

---
 .../hip_fmha/attention_forward_decoder.cpp    |  22 +-
 .../hip_fmha/ck_attention_forward_decoder.h   | 602 ++++++++----------
 2 files changed, 280 insertions(+), 344 deletions(-)

diff --git a/xformers/csrc/attention/hip_fmha/attention_forward_decoder.cpp b/xformers/csrc/attention/hip_fmha/attention_forward_decoder.cpp
index da14882f7..a5c2f2796 100644
--- a/xformers/csrc/attention/hip_fmha/attention_forward_decoder.cpp
+++ b/xformers/csrc/attention/hip_fmha/attention_forward_decoder.cpp
@@ -85,7 +85,7 @@ efficient_attention_forward_decoder_ck_out_impl(const at::Tensor& XQ,      // [B
     TORCH_CHECK(M <= 1024);
     TORCH_CHECK(H <= 1024);
 
-    dim3 blocks(B, H, M);
+    dim3 blocks(B * H * M);
     dim3 threads(ThreadsPerWavefront, WavefrontsPerBlock);
 
     int32_t smem_softmax = T_MAX * sizeof(float) + threads.y * sizeof(float);
@@ -125,8 +125,10 @@ efficient_attention_forward_decoder_ck_out_impl(const at::Tensor& XQ,      // [B
                 K_acc.stride(0),
                 K_acc.stride(1),
                 K_acc.stride(2),
+                XQ_acc.size(1),
+                XQ_acc.size(2),
+                XQ_acc.size(3),
                 K_acc.size(1),
-                K_acc.size(3),
                 K_acc.size(2) == 1,
                 qk_scale,
                 blocks,
@@ -248,14 +250,14 @@ int main(int argc, char** argv)
                       << std::endl;
             return 0;
         }
-        const int32_t n_keys     = std::stoi(args[0]);
-        const int32_t padding    = std::stoi(args[1]);
-        const int32_t batch_size = std::stoi(args[2]);
-        const int32_t n_heads    = std::stoi(args[3]);
-        const int32_t multiquery = (args[4] == "mq");
-        const auto dtype         = (args[5] == "f32")
-                               ? torch::kFloat32
-                               : (args[5] == "f16") ? torch::kFloat16 : torch::kBFloat16;
+        const int32_t n_keys                 = std::stoi(args[0]);
+        const int32_t padding                = std::stoi(args[1]);
+        const int32_t batch_size             = std::stoi(args[2]);
+        const int32_t n_heads                = std::stoi(args[3]);
+        const int32_t multiquery             = (args[4] == "mq");
+        const auto dtype                     = (args[5] == "f32")   ? torch::kFloat32
+                                               : (args[5] == "f16") ? torch::kFloat16
+                                                                    : torch::kBFloat16;
         const int32_t n_wavefronts_per_block = std::stoi(args[6]);
 
         const int32_t dim_per_head = 4 * kThreadsPerWavefront;
diff --git a/xformers/csrc/attention/hip_fmha/ck_attention_forward_decoder.h b/xformers/csrc/attention/hip_fmha/ck_attention_forward_decoder.h
index 7b39a2c54..5686ad4b7 100644
--- a/xformers/csrc/attention/hip_fmha/ck_attention_forward_decoder.h
+++ b/xformers/csrc/attention/hip_fmha/ck_attention_forward_decoder.h
@@ -114,27 +114,29 @@ efficient_attention_forward_decoder_ck_kernel(const scalar_t* __restrict__ XQ,
                                               const scalar_t* __restrict__ cache_V,
                                               scalar_t* __restrict__ O,
                                               const int32_t* __restrict__ seq_kv_lens,
-                                              const ptrdiff_t XQ_stride_0,
-                                              const ptrdiff_t XQ_stride_1,
-                                              const ptrdiff_t XQ_stride_2,
-                                              const ptrdiff_t K_stride_0,
-                                              const ptrdiff_t K_stride_1,
-                                              const ptrdiff_t K_stride_2,
-                                              const int32_t K_size_1,
-                                              const int32_t D_H,
+                                              const ptrdiff_t XQ_stride_b,
+                                              const ptrdiff_t XQ_stride_m,
+                                              const ptrdiff_t XQ_stride_h,
+                                              const ptrdiff_t K_stride_b,
+                                              const ptrdiff_t K_stride_m,
+                                              const ptrdiff_t K_stride_h,
+                                              const int32_t Q_size_m,
+                                              const int32_t Q_size_h,
+                                              const int32_t Q_size_k,
+                                              const int32_t K_size_m,
                                               const bool multiquery,
                                               const float qk_scale)
 {
     static_assert(n_loop_unroll_tail < n_loop_unroll, "");
 
     // Each block handles a single batch and head and query
-    const int32_t b = blockIdx.x;
-    const int32_t h = blockIdx.y;
-    const int32_t m = blockIdx.z;
+    const int32_t b = blockIdx.x / (Q_size_m * Q_size_h);
+    const int32_t h = (blockIdx.x / Q_size_m) % Q_size_h;
+    const int32_t m = blockIdx.x % Q_size_m;
 
     // Note: this is decoding case where we attend to current and all previous
     // tokens.
-    const int32_t t_max = seq_kv_lens ? seq_kv_lens[b] : K_size_1;
+    const int32_t t_max = seq_kv_lens ? seq_kv_lens[b] : K_size_m;
 
     const int32_t lane_idx              = threadIdx.x;
     const int32_t wavefront_idx         = threadIdx.y;
@@ -143,10 +145,10 @@ efficient_attention_forward_decoder_ck_kernel(const scalar_t* __restrict__ XQ,
     const int32_t threads_per_block     = threads_per_wavefront * wavefronts_per_block;
     const int32_t thread_linear_idx     = lane_idx + wavefront_idx * threads_per_wavefront;
     // const auto* q_ = &(XQ_acc[b][m][h][0]);
-    const auto XQO_base_offset  = b * XQ_stride_0 + m * XQ_stride_1 + h * XQ_stride_2;
+    const auto XQO_base_offset  = b * XQ_stride_b + m * XQ_stride_m + h * XQ_stride_h;
     const auto* __restrict__ q_ = XQ + XQO_base_offset;
 
-    const auto cache_KV_base_offset       = b * K_stride_0 + (multiquery ? 0 : h * K_stride_2);
+    const auto cache_KV_base_offset       = b * K_stride_b + (multiquery ? 0 : h * K_stride_h);
     const auto* __restrict__ cache_K_base = cache_K + cache_KV_base_offset;
     const auto* __restrict__ cache_V_base = cache_V + cache_KV_base_offset;
 
@@ -158,7 +160,7 @@ efficient_attention_forward_decoder_ck_kernel(const scalar_t* __restrict__ XQ,
     using compute_t     = float;
     using compute_vec_t = typename ck::vector_type<compute_t, vec_size>::type;
 
-    const bool lane_active_for_io = lane_idx * vec_size < D_H;
+    const bool lane_active_for_io = lane_idx * vec_size < Q_size_k;
 
     extern __shared__ __align__(16) compute_t smem[];
 
@@ -188,344 +190,276 @@ efficient_attention_forward_decoder_ck_kernel(const scalar_t* __restrict__ XQ,
             {
                 const int32_t t = tt + ttt;
                 // load the K[b][t][h|0][:] row into registers
-                load_v<data_t, data_vec_t>(cache_K_base + t * K_stride_1, lane_idx, &k_loads[ttt]);
+                load_v<data_t, data_vec_t>(cache_K_base + t * K_stride_m, lane_idx, &k_loads[ttt]);
             }
         }
-        compute_t qk_accs[n_loop_unroll] = {};
-#pragma unroll n_loop_unroll
-        for(auto ttt = 0; ttt < n_loop_unroll; ++ttt)
-        {
-            ck::inner_product<data_vec_t, data_vec_t, compute_t>(
-                q_thread, k_loads[ttt], qk_accs[ttt]);
-            qk_accs[ttt] *= qk_scale;
+        // Each block computes different B value
+        compute_t max_qk_acc = ck::NumericLimits<compute_t>::Lowest();
 
-            qk_accs[ttt] = wavefrontReduce(qk_accs[ttt], [](auto a, auto b) { return a + b; });
-            max_qk_acc   = ck::math::max(qk_accs[ttt], max_qk_acc);
-        }
-        if(lane_idx == 0)
-        {
-            auto* __restrict__ smem_base = smem + tt;
-#pragma unroll n_loop_unroll
-            for(auto ttt = 0; ttt < n_loop_unroll; ++ttt)
-            {
-                smem_base[ttt] = qk_accs[ttt];
-            }
-        }
-    }
+        // Compute S[T_MAX] = for i in range(T): S[t] = sum(Q[d] * K[t, d])
+        // Split T across wavefronts in a block, unroll loads to expose more
+        // parallelism.
 
-    // NB: the length of the tail is <= (wavefronts_per_block * n_loop_unroll)
-    for(auto tt = t_max_unroll + wavefront_idx * n_loop_unroll_tail; tt < t_max;
-        tt += wavefronts_per_block * n_loop_unroll_tail)
-    {
-        if(lane_active_for_io)
+        // NB: the length of the tail is <= (wavefronts_per_block * n_loop_unroll)
+        for(auto tt = t_max_unroll + wavefront_idx * n_loop_unroll_tail; tt < t_max;
+            tt += wavefronts_per_block * n_loop_unroll_tail)
         {
-#pragma unroll n_loop_unroll_tail
-            for(auto ttt = 0; ttt < n_loop_unroll_tail; ++ttt)
+            if(lane_active_for_io)
             {
-                const int32_t t = tt + ttt;
-                if(t < t_max)
-                {
-                    // load the K[b][t][h|0][:] row into registers
-                    load_v<data_t, data_vec_t>(
-                        cache_K_base + t * K_stride_1, lane_idx, &k_loads[ttt]);
-                }
-            }
-        }
 #pragma unroll n_loop_unroll_tail
-        for(auto ttt = 0; ttt < n_loop_unroll_tail; ++ttt)
-        {
-            compute_t qk_acc = 0;
-            const int32_t t  = tt + ttt;
-            if(t < t_max)
-            {
-                ck::inner_product<data_vec_t, data_vec_t, compute_t>(
-                    q_thread, k_loads[ttt], qk_acc);
-                qk_acc *= qk_scale;
-
-                qk_acc     = wavefrontReduce(qk_acc, [](auto a, auto b) { return a + b; });
-                max_qk_acc = ck::math::max(qk_acc, max_qk_acc);
-
-                // write accumulated sums to smem.
-                if(lane_idx == 0)
+                for(auto ttt = 0; ttt < n_loop_unroll_tail; ++ttt)
                 {
-                    smem[t] = qk_acc;
-                }
-            }
-        }
-    }
-
-    // Use shared reduction to compute max and compute softmax on shared memory.
-    // write max acc
-    if(lane_idx == 0)
-    {
-        smem[T_MAX + wavefront_idx] = max_qk_acc;
-    }
-    __syncthreads();
-    if(lane_idx < wavefronts_per_block)
-    {
-        max_qk_acc = ck::math::max(max_qk_acc, smem[T_MAX + lane_idx]);
-    }
-    // shared across all threads in block
-    max_qk_acc = wavefrontReduce(max_qk_acc, [](auto a, auto b) { return a > b ? a : b; });
-
-    // each wavefront computes partial sum of exp.
-    compute_t softmax_denominator = 0.0f;
-    for(int32_t t = thread_linear_idx; t < t_max; t += threads_per_block)
-    {
-        softmax_denominator += ck::math::exp(smem[t] - max_qk_acc);
-    }
-    softmax_denominator =
-        wavefrontReduce(softmax_denominator, [](auto a, auto b) { return a + b; });
-
-    if(lane_idx == 0)
-    {
-        smem[T_MAX + wavefront_idx] = softmax_denominator;
-    }
-    __syncthreads();
-
-    // now, compute sum of exp(x - max(x)) over all intermediate results.
-    softmax_denominator = 0.0;
-    if(lane_idx < wavefronts_per_block)
-    {
-        softmax_denominator = smem[T_MAX + lane_idx];
-    }
-    softmax_denominator =
-        wavefrontReduce(softmax_denominator, [](auto a, auto b) { return a + b; });
-
-    const compute_t softmax_scale_factor = 1. / softmax_denominator;
-    // now, compute the normalization across all threads.
-    for(int32_t t = thread_linear_idx; t < t_max; t += threads_per_block)
-    {
-        smem[t] = ck::math::exp(smem[t] - max_qk_acc) * softmax_scale_factor;
-    }
-    __syncthreads();
-
-    // Split T across wavefronts in a block
-    // each wavefront compute sum(t_subset) P[t] * V[t_subset, d]
-    // outputs are of size float[D]
-
-    compute_t ps[n_loop_unroll] = {};
-    compute_vec_t o_acc         = 0;
-    if(lane_active_for_io)
-    {
-        for(auto tt = wavefront_idx * n_loop_unroll; tt < t_max_unroll; tt += dtt)
-        {
+                    const int32_t t = tt + ttt;
+                    if(t < t_max)
+                    {
+                        // load the K[b][t][h|0][:] row into registers
+                        load_v<data_t, data_vec_t>(
+                            cache_K_base + t * K_stride_m, lane_idx, &k_loads[ttt]);
+                    }
+                    compute_t qk_accs[n_loop_unroll] = {};
 #pragma unroll n_loop_unroll
-            for(auto ttt = 0; ttt < n_loop_unroll; ++ttt)
-            {
-                const int32_t t = tt + ttt;
-                // load the V[b][t][h|0][:] row into registers, reusing K register
-                // storage
-                load_v<data_t, data_vec_t>(cache_V_base + t * K_stride_1, lane_idx, &k_loads[ttt]);
-                ps[ttt] = smem[t];
-            }
+                    for(auto ttt = 0; ttt < n_loop_unroll; ++ttt)
+                    {
+                        const int32_t t = tt + ttt;
+                        // load the V[b][t][h|0][:] row into registers, reusing K register
+                        // storage
+                        load_v<data_t, data_vec_t>(
+                            cache_V_base + t * K_stride_m, lane_idx, &k_loads[ttt]);
+                        ps[ttt] = smem[t];
+                    }
 
 #pragma unroll n_loop_unroll
-            for(auto ttt = 0; ttt < n_loop_unroll; ++ttt)
-            {
-                o_acc = scalar_scale_acc<data_t, vec_size>(o_acc, k_loads[ttt], ps[ttt]);
-            }
-        }
-
-        for(auto tt = t_max_unroll + wavefront_idx * n_loop_unroll_tail; tt < t_max;
-            tt += wavefronts_per_block * n_loop_unroll_tail)
-        {
-#pragma unroll n_loop_unroll_tail
-            for(auto ttt = 0; ttt < n_loop_unroll_tail; ++ttt)
-            {
-                const int32_t t = tt + ttt;
-                if(t < t_max)
-                {
-                    // load the V[b][t][h|0][:] row into registers, reusing K register
-                    // storage
-                    load_v<data_t, data_vec_t>(
-                        cache_V_base + t * K_stride_1, lane_idx, &k_loads[ttt]);
-                    ps[ttt] = smem[t];
+                    for(auto ttt = 0; ttt < n_loop_unroll; ++ttt)
+                    {
+                        o_acc = scalar_scale_acc<data_t, vec_size>(o_acc, k_loads[ttt], ps[ttt]);
+                    }
                 }
-            }
 
-#pragma unroll n_loop_unroll_tail
-            for(auto ttt = 0; ttt < n_loop_unroll_tail; ++ttt)
-            {
-                const int32_t t = tt + ttt;
-                if(t < t_max)
+                for(auto tt = t_max_unroll + wavefront_idx * n_loop_unroll_tail; tt < t_max;
+                    tt += wavefronts_per_block * n_loop_unroll_tail)
                 {
-                    o_acc = scalar_scale_acc<data_t, vec_size>(o_acc, k_loads[ttt], ps[ttt]);
-                }
-            }
-        }
-    }
-    // now, each thread has partial sums. Write to smem and get accumulated
-    // results back.
-    __syncthreads();
-
-    // NB: needs sizeof(smem) >= `vec_size` * (sizeof(float)==4) * threadsPerBlock
-    if(lane_active_for_io)
-    {
-        store_v<compute_t, compute_vec_t>(&smem[0], thread_linear_idx, o_acc);
-    }
+#pragma unroll n_loop_unroll_tail
+                    for(auto ttt = 0; ttt < n_loop_unroll_tail; ++ttt)
+                    {
+                        const int32_t t = tt + ttt;
+                        if(t < t_max)
+                        {
+                            // load the V[b][t][h|0][:] row into registers, reusing K register
+                            // storage
+                            load_v<data_t, data_vec_t>(
+                                cache_V_base + t * K_stride_m, lane_idx, &k_loads[ttt]);
+                            ps[ttt] = smem[t];
+                        }
+
+                        for(auto tt = t_max_unroll + wavefront_idx * n_loop_unroll_tail; tt < t_max;
+                            tt += wavefronts_per_block * n_loop_unroll_tail)
+                        {
+#pragma unroll n_loop_unroll_tail
+                            for(auto ttt = 0; ttt < n_loop_unroll_tail; ++ttt)
+                            {
+                                const int32_t t = tt + ttt;
+                                if(t < t_max)
+                                {
+                                    // load the V[b][t][h|0][:] row into registers, reusing K
+                                    // register storage
+                                    load_v<data_t, data_vec_t>(
+                                        cache_V_base + t * K_stride_1, lane_idx, &k_loads[ttt]);
+                                    ps[ttt] = smem[t];
+                                }
+                            }
 
-    __syncthreads();
-    // sum up partial D rows from other wavefronts
-    if(wavefront_idx == 0 && lane_active_for_io)
-    {
-        union
-        {
-            compute_vec_t vec = 0;
-            compute_t arr[vec_size];
-        } r;
-        for(int32_t w = 0; w < wavefronts_per_block; ++w)
-        {
-            compute_vec_t partial_r;
-            load_v<compute_t, compute_vec_t>(
-                smem, w * threads_per_wavefront + lane_idx, &partial_r);
-            r.vec += partial_r;
-        }
-        // elementwise convert from compute_t result to data_t out to be written
-        union
-        {
-            data_vec_t vec;
-            data_t arr[vec_size];
-        } bf_r;
+#pragma unroll n_loop_unroll_tail
+                            for(auto ttt = 0; ttt < n_loop_unroll_tail; ++ttt)
+                            {
+                                const int32_t t = tt + ttt;
+                                if(t < t_max)
+                                {
+                                    o_acc = scalar_scale_acc<data_t, vec_size>(
+                                        o_acc, k_loads[ttt], ps[ttt]);
+                                }
+                            }
+                        }
+                    }
+                    // now, each thread has partial sums. Write to smem and get accumulated
+                    // results back.
+                    __syncthreads();
+
+                    // NB: needs sizeof(smem) >= `vec_size` * (sizeof(float)==4) * threadsPerBlock
+                    if(lane_active_for_io)
+                    {
+                        store_v<compute_t, compute_vec_t>(&smem[0], thread_linear_idx, o_acc);
+                    }
+
+                    __syncthreads();
+                    // sum up partial D rows from other wavefronts
+                    if(wavefront_idx == 0 && lane_active_for_io)
+                    {
+                        union
+                        {
+                            compute_vec_t vec = 0;
+                            compute_t arr[vec_size];
+                        } r;
+                        for(int32_t w = 0; w < wavefronts_per_block; ++w)
+                        {
+                            compute_vec_t partial_r;
+                            load_v<compute_t, compute_vec_t>(
+                                smem, w * threads_per_wavefront + lane_idx, &partial_r);
+                            r.vec += partial_r;
+                        }
+                        // elementwise convert from compute_t result to data_t out to be written
+                        union
+                        {
+                            data_vec_t vec;
+                            data_t arr[vec_size];
+                        } bf_r;
 #pragma unroll
-        for(int32_t i = 0; i < vec_size; ++i)
-        {
-            bf_r.arr[i] = ck::type_convert<data_t>(r.arr[i]);
-        }
-        // write output row O[b][m][h][:]
-        data_t* __restrict__ o_ = O + XQO_base_offset;
-        store_v<data_t, data_vec_t>(o_, lane_idx, bf_r.vec);
-    }
-}
-
-} // namespace
-
-namespace ck {
-namespace tensor_operation {
-namespace device {
-template <typename scalar_t>
-struct FMHADecoderSeqlen1DeviceOp : public BaseOperator
-{
-    using DeviceOp = FMHADecoderSeqlen1DeviceOp;
-    struct Argument : public BaseArgument
-    {
-        const scalar_t* __restrict__ XQ;
-        const scalar_t* __restrict__ cache_K;
-        const scalar_t* __restrict__ cache_V;
-        scalar_t* __restrict__ O;
-        const int32_t* __restrict__ seq_kv_lens;
-        const ptrdiff_t XQ_stride_0;
-        const ptrdiff_t XQ_stride_1;
-        const ptrdiff_t XQ_stride_2;
-        const ptrdiff_t K_stride_0;
-        const ptrdiff_t K_stride_1;
-        const ptrdiff_t K_stride_2;
-        const int32_t K_size_1;
-        const int32_t D_H;
-        const bool multiquery;
-        const float qk_scale;
-
-        const dim3 grid_dim;
-        const dim3 block_dim;
-        const size_t lds_bytes;
-
-        Argument(const scalar_t* __restrict__ XQ,
-                 const scalar_t* __restrict__ cache_K,
-                 const scalar_t* __restrict__ cache_V,
-                 scalar_t* __restrict__ O,
-                 const int32_t* __restrict__ seq_kv_lens,
-                 const ptrdiff_t XQ_stride_0,
-                 const ptrdiff_t XQ_stride_1,
-                 const ptrdiff_t XQ_stride_2,
-                 const ptrdiff_t K_stride_0,
-                 const ptrdiff_t K_stride_1,
-                 const ptrdiff_t K_stride_2,
-                 const int32_t K_size_1,
-                 const int32_t D_H,
-                 const bool multiquery,
-                 const float qk_scale,
-                 const dim3 grid_dim,
-                 const dim3 block_dim,
-                 const size_t lds_bytes)
-            : XQ(XQ),
-              cache_K(cache_K),
-              cache_V(cache_V),
-              O(O),
-              seq_kv_lens(seq_kv_lens),
-              XQ_stride_0(XQ_stride_0),
-              XQ_stride_1(XQ_stride_1),
-              XQ_stride_2(XQ_stride_2),
-              K_stride_0(K_stride_0),
-              K_stride_1(K_stride_1),
-              K_stride_2(K_stride_2),
-              K_size_1(K_size_1),
-              D_H(D_H),
-              multiquery(multiquery),
-              qk_scale(qk_scale),
-              grid_dim(grid_dim),
-              block_dim(block_dim),
-              lds_bytes(lds_bytes)
-        {
-        }
-    };
-
-    struct Invoker : public BaseInvoker
-    {
-        using Argument = DeviceOp::Argument;
-        float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{})
-        {
-            auto threads_per_wavefront = arg.block_dim.x;
-
-            auto D_H_alignment_necessary = 0;
-
-            for(auto vec_size : {4, 2, 1})
-            {
-                if(arg.D_H <= vec_size * threads_per_wavefront)
-                {
-                    D_H_alignment_necessary = vec_size;
+                        for(int32_t i = 0; i < vec_size; ++i)
+                        {
+                            bf_r.arr[i] = ck::type_convert<data_t>(r.arr[i]);
+                        }
+                        // write output row O[b][m][h][:]
+                        data_t* __restrict__ o_ = O + XQO_base_offset;
+                        store_v<data_t, data_vec_t>(o_, lane_idx, bf_r.vec);
+                    }
                 }
-            }
 
-            if(!D_H_alignment_necessary)
-            {
-                throw std::runtime_error("Unsupported D_H");
-            }
+            } // namespace
 
-            if(arg.D_H % D_H_alignment_necessary)
+            namespace ck {
+            namespace tensor_operation {
+            namespace device {
+            template <typename scalar_t>
+            struct FMHADecoderSeqlen1DeviceOp : public BaseOperator
             {
-                throw std::runtime_error("Unsupported alignment for D_H");
-            }
-
-            return launch_and_time_kernel(
-                stream_config,
-                D_H_alignment_necessary == 4
-                    ? efficient_attention_forward_decoder_ck_kernel<scalar_t, 4>
-                    : D_H_alignment_necessary == 2
-                          ? efficient_attention_forward_decoder_ck_kernel<scalar_t, 2>
-                          : D_H_alignment_necessary == 1
+                using DeviceOp = FMHADecoderSeqlen1DeviceOp;
+                struct Argument : public BaseArgument
+                {
+                    const scalar_t* __restrict__ XQ;
+                    const scalar_t* __restrict__ cache_K;
+                    const scalar_t* __restrict__ cache_V;
+                    scalar_t* __restrict__ O;
+                    const int32_t* __restrict__ seq_kv_lens;
+                    const ptrdiff_t XQ_stride_b;
+                    const ptrdiff_t XQ_stride_m;
+                    const ptrdiff_t XQ_stride_h;
+                    const ptrdiff_t K_stride_b;
+                    const ptrdiff_t K_stride_m;
+                    const ptrdiff_t K_stride_h;
+                    const int32_t Q_size_m;
+                    const int32_t Q_size_h;
+                    const int32_t Q_size_k;
+                    const int32_t K_size_m;
+                    const bool multiquery;
+                    const float qk_scale;
+
+                    const dim3 grid_dim;
+                    const dim3 block_dim;
+                    const size_t lds_bytes;
+
+                    Argument(const scalar_t* __restrict__ XQ,
+                             const scalar_t* __restrict__ cache_K,
+                             const scalar_t* __restrict__ cache_V,
+                             scalar_t* __restrict__ O,
+                             const int32_t* __restrict__ seq_kv_lens,
+                             const ptrdiff_t XQ_stride_b,
+                             const ptrdiff_t XQ_stride_m,
+                             const ptrdiff_t XQ_stride_h,
+                             const ptrdiff_t K_stride_b,
+                             const ptrdiff_t K_stride_m,
+                             const ptrdiff_t K_stride_h,
+                             const int32_t Q_size_m,
+                             const int32_t Q_size_h,
+                             const int32_t Q_size_k,
+                             const int32_t K_size_m,
+                             const bool multiquery,
+                             const float qk_scale,
+                             const dim3 grid_dim,
+                             const dim3 block_dim,
+                             const size_t lds_bytes)
+                        : XQ(XQ),
+                          cache_K(cache_K),
+                          cache_V(cache_V),
+                          O(O),
+                          seq_kv_lens(seq_kv_lens),
+                          XQ_stride_b(XQ_stride_b),
+                          XQ_stride_m(XQ_stride_m),
+                          XQ_stride_h(XQ_stride_h),
+                          K_stride_b(K_stride_b),
+                          K_stride_m(K_stride_m),
+                          K_stride_h(K_stride_h),
+                          Q_size_m(Q_size_m),
+                          Q_size_h(Q_size_h),
+                          Q_size_k(Q_size_k),
+                          K_size_m(K_size_m),
+                          multiquery(multiquery),
+                          qk_scale(qk_scale),
+                          grid_dim(grid_dim),
+                          block_dim(block_dim),
+                          lds_bytes(lds_bytes)
+                    {
+                    }
+                };
+
+                struct Invoker : public BaseInvoker
+                {
+                    using Argument = DeviceOp::Argument;
+                    float Run(const Argument& arg,
+                              const StreamConfig& stream_config = StreamConfig{})
+                    {
+                        auto threads_per_wavefront = arg.block_dim.x;
+
+                        auto Q_size_k_alignment_necessary = 0;
+
+                        for(auto vec_size : {4, 2, 1})
+                        {
+                            if(arg.Q_size_k <= vec_size * threads_per_wavefront)
+                            {
+                                Q_size_k_alignment_necessary = vec_size;
+                            }
+                        };
+
+                        if(!Q_size_k_alignment_necessary)
+                        {
+                            throw std::runtime_error("Unsupported Q_size_k");
+                        }
+
+                        if(arg.Q_size_k % Q_size_k_alignment_necessary)
+                        {
+                            throw std::runtime_error("Unsupported alignment for Q_size_k");
+                        }
+
+                        return launch_and_time_kernel(
+                            stream_config,
+                            Q_size_k_alignment_necessary == 4
+                                ? efficient_attention_forward_decoder_ck_kernel<scalar_t, 4>
+                            : Q_size_k_alignment_necessary == 2
+                                ? efficient_attention_forward_decoder_ck_kernel<scalar_t, 2>
+                            : Q_size_k_alignment_necessary == 1
                                 ? efficient_attention_forward_decoder_ck_kernel<scalar_t, 1>
                                 : nullptr,
-                arg.grid_dim,
-                arg.block_dim,
-                arg.lds_bytes,
-                arg.XQ,
-                arg.cache_K,
-                arg.cache_V,
-                arg.O,
-                arg.seq_kv_lens,
-                arg.XQ_stride_0,
-                arg.XQ_stride_1,
-                arg.XQ_stride_2,
-                arg.K_stride_0,
-                arg.K_stride_1,
-                arg.K_stride_2,
-                arg.K_size_1,
-                arg.D_H,
-                arg.multiquery,
-                arg.qk_scale);
-        }
-    };
-};
-} // namespace device
-} // namespace tensor_operation
-} // namespace ck
+                            arg.grid_dim,
+                            arg.block_dim,
+                            arg.lds_bytes,
+                            arg.XQ,
+                            arg.cache_K,
+                            arg.cache_V,
+                            arg.O,
+                            arg.seq_kv_lens,
+                            arg.XQ_stride_b,
+                            arg.XQ_stride_m,
+                            arg.XQ_stride_h,
+                            arg.K_stride_b,
+                            arg.K_stride_m,
+                            arg.K_stride_h,
+                            arg.Q_size_m,
+                            arg.Q_size_h,
+                            arg.Q_size_k,
+                            arg.K_size_m,
+                            arg.multiquery,
+                            arg.qk_scale);
+                    }
+                };
+            };
+            } // namespace device
+            } // namespace tensor_operation
+            } // namespace ck

From 0215ced6e043de1acaafabc25418d5aafaa6fe14 Mon Sep 17 00:00:00 2001
From: Max Podkorytov <4273004+tenpercent@users.noreply.github.com>
Date: Wed, 22 Nov 2023 18:43:52 -0500
Subject: [PATCH 257/641] add helper from upstream which makes any input rank-5

---
 xformers/ops/fmha/common.py | 49 +++++++++++++++++++++++++++++--------
 1 file changed, 39 insertions(+), 10 deletions(-)

diff --git a/xformers/ops/fmha/common.py b/xformers/ops/fmha/common.py
index 24fdc5247..b318342aa 100644
--- a/xformers/ops/fmha/common.py
+++ b/xformers/ops/fmha/common.py
@@ -3,9 +3,10 @@
 # This source code is licensed under the BSD license found in the
 # LICENSE file in the root directory of this source tree.
 
+from functools import partial
 import math
 from dataclasses import dataclass
-from typing import Any, List, Mapping, Optional, Set, Tuple, Type, Union
+from typing import Any, Callable, List, Mapping, Optional, Set, Tuple, Type, Union
 
 import torch
 
@@ -28,6 +29,17 @@ def _is_bias_type_supported_in_BMK(attn_bias_type: Any) -> bool:
     return False
 
 
+def _attn_bias_apply(
+    attn_bias: Optional[Union[torch.Tensor, AttentionBias]],
+    op: Callable[[torch.Tensor], torch.Tensor],
+) -> Optional[Union[torch.Tensor, AttentionBias]]:
+    if isinstance(attn_bias, torch.Tensor):
+        return op(attn_bias)
+    if isinstance(attn_bias, LowerTriangularMaskWithTensorBias):
+        return LowerTriangularMaskWithTensorBias(op(attn_bias._bias))
+    return attn_bias
+
+
 @dataclass
 class Inputs:
     """
@@ -49,14 +61,34 @@ def device(self) -> torch.device:
     def scale_float(self) -> float:
         return self.query.shape[-1] ** (-0.5) if self.scale is None else self.scale
 
+    def get_qkv_in_bmghk(self) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+        if self.query.ndim == 5:
+            return self.query, self.key, self.value
+        if self.query.ndim == 4:
+            return (
+                self.query.unsqueeze(2),
+                self.key.unsqueeze(2),
+                self.value.unsqueeze(2),
+            )
+        if self.value.ndim == 3:
+            return (
+                self.query[:, :, None, None],
+                self.key[:, :, None, None],
+                self.value[:, :, None, None],
+            )
+        assert False
+
     def normalize_bmhk(self) -> Tuple[int, ...]:
-        if self.query.ndim not in [3, 4]:
+        if self.query.ndim not in [3, 4, 5]:
             raise ValueError(
                 f"Invalid shape for query: {self.query.shape}. "
-                "Expected shape [batch, seqlen, num_heads, K], or [batch, seqlen, K]."
+                "Expected shape [batch, seqlen, head_groups, num_heads_per_group, K]"
+                ", [batch, seqlen, num_heads, K], or [batch, seqlen, K]."
             )
         if self.value.dtype == torch.int32:
-            # Quantized K/V case, in which the last dims of Q and K/V are different
+            # Quantized K/V case, in which the last dims of Q and K are different.
+            # NB we currently don't have any implementations for quantized KV with
+            # SUPPORTS_DIFFERENT_VALUE_EMBED.
             output_shape = tuple(self.query.shape)
         else:
             output_shape = (self.query.shape[:-1]) + (self.value.shape[-1],)
@@ -65,12 +97,9 @@ def normalize_bmhk(self) -> Tuple[int, ...]:
             self.query = self.query.unsqueeze(2)
             self.key = self.key.unsqueeze(2)
             self.value = self.value.unsqueeze(2)
-            if isinstance(self.attn_bias, torch.Tensor):
-                if self.attn_bias.ndim != 3:
-                    raise ValueError(
-                        f"Expected BMK format for attn_bias, but got {self.attn_bias.shape}"
-                    )
-                self.attn_bias = self.attn_bias.unsqueeze(1)
+            self.attn_bias = _attn_bias_apply(
+                self.attn_bias, partial(torch.unsqueeze, dim=1)
+            )
         return output_shape
 
     def validate_inputs(self) -> None:

From 8ef8fed0fc497a1367044cb10c303a74f8b0e289 Mon Sep 17 00:00:00 2001
From: Max Podkorytov <4273004+tenpercent@users.noreply.github.com>
Date: Fri, 24 Nov 2023 20:20:58 -0500
Subject: [PATCH 258/641] support bmghk

---
 .../hip_fmha/attention_forward_decoder.cpp    | 22 +++++++-----
 .../hip_fmha/ck_attention_forward_decoder.h   | 30 ++++++++++++----
 xformers/ops/fmha/ck_decoder.py               | 34 +++++++++----------
 xformers/ops/fmha/common.py                   |  6 ++--
 4 files changed, 59 insertions(+), 33 deletions(-)

diff --git a/xformers/csrc/attention/hip_fmha/attention_forward_decoder.cpp b/xformers/csrc/attention/hip_fmha/attention_forward_decoder.cpp
index a5c2f2796..3678157aa 100644
--- a/xformers/csrc/attention/hip_fmha/attention_forward_decoder.cpp
+++ b/xformers/csrc/attention/hip_fmha/attention_forward_decoder.cpp
@@ -75,17 +75,20 @@ efficient_attention_forward_decoder_ck_out_impl(const at::Tensor& XQ,      // [B
     TORCH_CHECK(!seq_kv_lens || seq_kv_lens->is_cuda());
 
     TORCH_CHECK(cache_K.size(1) <= T_MAX);
-    TORCH_CHECK(cache_K.size(3) <= D_H);
+    TORCH_CHECK(cache_K.size(4) <= D_H);
+
+    constexpr auto rank = 5;
 
     auto B = XQ.size(0);
     auto M = XQ.size(1);
-    auto H = XQ.size(2);
+    auto G = XQ.size(2);
+    auto H = XQ.size(3);
 
     TORCH_CHECK(B <= 1024);
     TORCH_CHECK(M <= 1024);
     TORCH_CHECK(H <= 1024);
 
-    dim3 blocks(B * H * M);
+    dim3 blocks(B * H * M * G);
     dim3 threads(ThreadsPerWavefront, WavefrontsPerBlock);
 
     int32_t smem_softmax = T_MAX * sizeof(float) + threads.y * sizeof(float);
@@ -105,10 +108,10 @@ efficient_attention_forward_decoder_ck_out_impl(const at::Tensor& XQ,      // [B
             using device_op_t = ck::tensor_operation::device::FMHADecoderSeqlen1DeviceOp<ck_data_t>;
             auto op           = device_op_t{};
 
-            auto XQ_acc = XQ.packed_accessor32<scalar_t, 4, at::RestrictPtrTraits>();
-            auto K_acc  = cache_K.packed_accessor64<scalar_t, 4, at::RestrictPtrTraits>();
-            auto V_acc  = cache_V.packed_accessor64<scalar_t, 4, at::RestrictPtrTraits>();
-            auto O_acc  = O.packed_accessor32<scalar_t, 4, at::RestrictPtrTraits>();
+            auto XQ_acc = XQ.packed_accessor32<scalar_t, rank, at::RestrictPtrTraits>();
+            auto K_acc  = cache_K.packed_accessor64<scalar_t, rank, at::RestrictPtrTraits>();
+            auto V_acc  = cache_V.packed_accessor64<scalar_t, rank, at::RestrictPtrTraits>();
+            auto O_acc  = O.packed_accessor32<scalar_t, rank, at::RestrictPtrTraits>();
             auto seq_acc =
                 seq_kv_lens
                     ? seq_kv_lens->packed_accessor32<int32_t, 1, at::RestrictPtrTraits>().data()
@@ -122,14 +125,17 @@ efficient_attention_forward_decoder_ck_out_impl(const at::Tensor& XQ,      // [B
                 XQ_acc.stride(0),
                 XQ_acc.stride(1),
                 XQ_acc.stride(2),
+                XQ_acc.stride(3),
                 K_acc.stride(0),
                 K_acc.stride(1),
                 K_acc.stride(2),
+                K_acc.stride(3),
                 XQ_acc.size(1),
                 XQ_acc.size(2),
                 XQ_acc.size(3),
+                XQ_acc.size(4),
                 K_acc.size(1),
-                K_acc.size(2) == 1,
+                K_acc.size(3) == 1,
                 qk_scale,
                 blocks,
                 threads,
diff --git a/xformers/csrc/attention/hip_fmha/ck_attention_forward_decoder.h b/xformers/csrc/attention/hip_fmha/ck_attention_forward_decoder.h
index 5686ad4b7..5d303f8a4 100644
--- a/xformers/csrc/attention/hip_fmha/ck_attention_forward_decoder.h
+++ b/xformers/csrc/attention/hip_fmha/ck_attention_forward_decoder.h
@@ -116,11 +116,14 @@ efficient_attention_forward_decoder_ck_kernel(const scalar_t* __restrict__ XQ,
                                               const int32_t* __restrict__ seq_kv_lens,
                                               const ptrdiff_t XQ_stride_b,
                                               const ptrdiff_t XQ_stride_m,
+                                              const ptrdiff_t XQ_stride_g,
                                               const ptrdiff_t XQ_stride_h,
                                               const ptrdiff_t K_stride_b,
                                               const ptrdiff_t K_stride_m,
+                                              const ptrdiff_t K_stride_g,
                                               const ptrdiff_t K_stride_h,
                                               const int32_t Q_size_m,
+                                              const int32_t Q_size_g,
                                               const int32_t Q_size_h,
                                               const int32_t Q_size_k,
                                               const int32_t K_size_m,
@@ -129,10 +132,11 @@ efficient_attention_forward_decoder_ck_kernel(const scalar_t* __restrict__ XQ,
 {
     static_assert(n_loop_unroll_tail < n_loop_unroll, "");
 
-    // Each block handles a single batch and head and query
-    const int32_t b = blockIdx.x / (Q_size_m * Q_size_h);
-    const int32_t h = (blockIdx.x / Q_size_m) % Q_size_h;
-    const int32_t m = blockIdx.x % Q_size_m;
+    // Each block handles a single batch and head and query and group
+    const int32_t b = blockIdx.x / (Q_size_m * Q_size_g * Q_size_h);
+    const int32_t m = (blockIdx.x / (Q_size_g * Q_size_h)) % Q_size_m;
+    const int32_t g = (blockIdx.x / Q_size_h) % Q_size_g;
+    const int32_t h = blockIdx.x % Q_size_h;
 
     // Note: this is decoding case where we attend to current and all previous
     // tokens.
@@ -145,10 +149,12 @@ efficient_attention_forward_decoder_ck_kernel(const scalar_t* __restrict__ XQ,
     const int32_t threads_per_block     = threads_per_wavefront * wavefronts_per_block;
     const int32_t thread_linear_idx     = lane_idx + wavefront_idx * threads_per_wavefront;
     // const auto* q_ = &(XQ_acc[b][m][h][0]);
-    const auto XQO_base_offset  = b * XQ_stride_b + m * XQ_stride_m + h * XQ_stride_h;
+    const auto XQO_base_offset =
+        b * XQ_stride_b + m * XQ_stride_m + g * XQ_stride_g + h * XQ_stride_h;
     const auto* __restrict__ q_ = XQ + XQO_base_offset;
 
-    const auto cache_KV_base_offset       = b * K_stride_b + (multiquery ? 0 : h * K_stride_h);
+    const auto cache_KV_base_offset =
+        b * K_stride_b + 0 * K_stride_m + g * K_stride_g + (multiquery ? 0 : h * K_stride_h);
     const auto* __restrict__ cache_K_base = cache_K + cache_KV_base_offset;
     const auto* __restrict__ cache_V_base = cache_V + cache_KV_base_offset;
 
@@ -341,11 +347,14 @@ efficient_attention_forward_decoder_ck_kernel(const scalar_t* __restrict__ XQ,
                     const int32_t* __restrict__ seq_kv_lens;
                     const ptrdiff_t XQ_stride_b;
                     const ptrdiff_t XQ_stride_m;
+                    const ptrdiff_t XQ_stride_g;
                     const ptrdiff_t XQ_stride_h;
                     const ptrdiff_t K_stride_b;
                     const ptrdiff_t K_stride_m;
+                    const ptrdiff_t K_stride_g;
                     const ptrdiff_t K_stride_h;
                     const int32_t Q_size_m;
+                    const int32_t Q_size_g;
                     const int32_t Q_size_h;
                     const int32_t Q_size_k;
                     const int32_t K_size_m;
@@ -363,11 +372,14 @@ efficient_attention_forward_decoder_ck_kernel(const scalar_t* __restrict__ XQ,
                              const int32_t* __restrict__ seq_kv_lens,
                              const ptrdiff_t XQ_stride_b,
                              const ptrdiff_t XQ_stride_m,
+                             const ptrdiff_t XQ_stride_g,
                              const ptrdiff_t XQ_stride_h,
                              const ptrdiff_t K_stride_b,
                              const ptrdiff_t K_stride_m,
+                             const ptrdiff_t K_stride_g,
                              const ptrdiff_t K_stride_h,
                              const int32_t Q_size_m,
+                             const int32_t Q_size_g,
                              const int32_t Q_size_h,
                              const int32_t Q_size_k,
                              const int32_t K_size_m,
@@ -383,11 +395,14 @@ efficient_attention_forward_decoder_ck_kernel(const scalar_t* __restrict__ XQ,
                           seq_kv_lens(seq_kv_lens),
                           XQ_stride_b(XQ_stride_b),
                           XQ_stride_m(XQ_stride_m),
+                          XQ_stride_g(XQ_stride_g),
                           XQ_stride_h(XQ_stride_h),
                           K_stride_b(K_stride_b),
                           K_stride_m(K_stride_m),
+                          K_stride_g(K_stride_g),
                           K_stride_h(K_stride_h),
                           Q_size_m(Q_size_m),
+                          Q_size_g(Q_size_g),
                           Q_size_h(Q_size_h),
                           Q_size_k(Q_size_k),
                           K_size_m(K_size_m),
@@ -447,11 +462,14 @@ efficient_attention_forward_decoder_ck_kernel(const scalar_t* __restrict__ XQ,
                             arg.seq_kv_lens,
                             arg.XQ_stride_b,
                             arg.XQ_stride_m,
+                            arg.XQ_stride_g,
                             arg.XQ_stride_h,
                             arg.K_stride_b,
                             arg.K_stride_m,
+                            arg.K_stride_g,
                             arg.K_stride_h,
                             arg.Q_size_m,
+                            arg.Q_size_g,
                             arg.Q_size_h,
                             arg.Q_size_k,
                             arg.K_size_m,
diff --git a/xformers/ops/fmha/ck_decoder.py b/xformers/ops/fmha/ck_decoder.py
index 9efad083c..2fee16a00 100644
--- a/xformers/ops/fmha/ck_decoder.py
+++ b/xformers/ops/fmha/ck_decoder.py
@@ -75,35 +75,35 @@ def apply(
         if needs_gradient:
             raise NotImplementedError("backward pass is not supported")
         attn_bias = inp.attn_bias
-
+        q, k, v = inp.get_qkv_in_bmghk()
         if attn_bias is not None:
-            attn_bias.k_seqinfo.to(inp.key.device)
-            attn_bias.q_seqinfo.to(inp.query.device)
+            attn_bias.k_seqinfo.to(k.device)
+            attn_bias.q_seqinfo.to(q.device)
             padding = attn_bias.k_seqinfo.padding
             seq_positions_gpu = attn_bias.k_seqinfo.seqlen
         else:
-            padding = inp.key.shape[1]
+            padding = k.shape[1]
             seq_positions_gpu = None
 
         if attn_bias is not None:
-            # key: (1, B * padding, 1 if multiquery else Hkv, D)
+            # key: (1, B * padding, G, 1 if multiquery else Hkv, D)
             # value: like key
-            # query: (1, B * q_seqlen, Hq, D)
-            multiquery = inp.key.stride(2) == 0
+            # query: (1, B * q_seqlen, G, Hq, D)
+            multiquery = k.stride(3) == 0
             if multiquery:
-                key = inp.key[0, :, :1].unflatten(0, (-1, padding))
-                value = inp.value[0, :, :1].unflatten(0, (-1, padding))
+                key = k[0, :, :, :1].unflatten(0, (-1, padding))
+                value = v[0, :, :, :1].unflatten(0, (-1, padding))
             else:
-                key = inp.key[0].unflatten(0, (-1, padding))
-                value = inp.value[0].unflatten(0, (-1, padding))
-            query = inp.query[0].unflatten(0, (key.shape[0], -1))    
+                key = k[0].unflatten(0, (-1, padding))
+                value = v[0].unflatten(0, (-1, padding))
+            query = q[0].unflatten(0, (key.shape[0], -1))    
         else:
-            # key: (B, padding, 1 if multiquery else Hkv, D)
+            # key: (B, padding, G, 1 if multiquery else Hkv, D)
             # value: like key
-            # query: (B, q_seqlen, Hq, D)
-            key = inp.key
-            query = inp.query
-            value = inp.value
+            # query: (B, q_seqlen, G, Hq, D)
+            key = k
+            query = q
+            value = v
 
         if inp.scale is not None:
             qk_scale = inp.scale
diff --git a/xformers/ops/fmha/common.py b/xformers/ops/fmha/common.py
index b318342aa..db0a33344 100644
--- a/xformers/ops/fmha/common.py
+++ b/xformers/ops/fmha/common.py
@@ -104,9 +104,11 @@ def normalize_bmhk(self) -> Tuple[int, ...]:
 
     def validate_inputs(self) -> None:
         qkv = (self.query, self.key, self.value)
-        if self.query.ndim not in (3, 4) or any(x.ndim != self.query.ndim for x in qkv):
+        if self.query.ndim not in (3, 4, 5) or any(
+            x.ndim != self.query.ndim for x in qkv
+        ):
             raise ValueError(
-                f"Query/Key/Value should all have BMHK or BMK shape.\n"
+                f"Query/Key/Value should all have BMGHK, BMHK or BMK shape.\n"
                 f"  query.shape: {self.query.shape}\n"
                 f"  key.shape  : {self.key.shape}\n"
                 f"  value.shape: {self.value.shape}"

From bcceb6b26aa415ef24e9e9b6ece7f1328209c7e4 Mon Sep 17 00:00:00 2001
From: Max Podkorytov <4273004+tenpercent@users.noreply.github.com>
Date: Sat, 25 Nov 2023 01:07:54 -0500
Subject: [PATCH 259/641] benchmark bmghk

---
 xformers/benchmarks/benchmark_attn_decoding.py | 16 +++++-----------
 1 file changed, 5 insertions(+), 11 deletions(-)

diff --git a/xformers/benchmarks/benchmark_attn_decoding.py b/xformers/benchmarks/benchmark_attn_decoding.py
index 1a729a645..8747db664 100644
--- a/xformers/benchmarks/benchmark_attn_decoding.py
+++ b/xformers/benchmarks/benchmark_attn_decoding.py
@@ -17,13 +17,9 @@
 
 
 CASES = [
-    dict(B=max(1, 2 ** (16 - i)), Mq=1, Mkv=2**i, Hq=16, Hkv=1, K=128)
-    for i in range(8, 18)
+    dict(B=max(1, 2 ** (16 - i)), Mq=1, Mkv=2**i, Hq=16, Hkv=hkv, K=128)
+    for i in range(8, 18) for hkv in (1, 2)
 ]
-# + [
-#     dict(B=max(1, 2 ** (16 - i)), Mq=1, Mkv=2**i, Hq=16, Hkv=2, K=128)
-#     for i in range(8, 18)
-# ]
 
 
 def _setup_test(
@@ -98,21 +94,19 @@ def __init__(
     def fw(self) -> None:
         try:
             xops.memory_efficient_attention_forward(self.q, self.k, self.v, op=self.OP)
-        except RuntimeError as e:
+        except (RuntimeError, ValueError) as e:
             print(f"Runtime error: {e}")
 
 
-# class AttentionDecodingSplitKV(AttentionDecodingFlashDecoding):
-#     OP = xops.fmha.triton_splitk.FwOp
+class AttentionDecodingSplitKV(AttentionDecodingFlashDecoding):
+    OP = xops.fmha.triton_splitk.FwOp
 
 
 class AttentionDecodingCK(AttentionDecodingFlashDecoding):
-
     OP = xops.fmha.ck.FwOp
 
 
 class AttentionDecodingCKDecoder(AttentionDecodingFlashDecoding):
-
     OP = xops.fmha.ck_decoder.FwOp
 
 
From 6564d6901f31851d26128303c0891ef352046ad0 Mon Sep 17 00:00:00 2001
From: Max Podkorytov <4273004+tenpercent@users.noreply.github.com>
Date: Sun, 26 Nov 2023 00:41:27 -0500
Subject: [PATCH 260/641] comment back triton_splitk until merge with upstream
 happens

---
 xformers/benchmarks/benchmark_attn_decoding.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/xformers/benchmarks/benchmark_attn_decoding.py b/xformers/benchmarks/benchmark_attn_decoding.py
index 8747db664..64725dfd6 100644
--- a/xformers/benchmarks/benchmark_attn_decoding.py
+++ b/xformers/benchmarks/benchmark_attn_decoding.py
@@ -98,8 +98,8 @@ def fw(self) -> None:
             print(f"Runtime error: {e}")
 
 
-class AttentionDecodingSplitKV(AttentionDecodingFlashDecoding):
-    OP = xops.fmha.triton_splitk.FwOp
+# class AttentionDecodingSplitKV(AttentionDecodingFlashDecoding):
+#     OP = xops.fmha.triton_splitk.FwOp
 
 
 class AttentionDecodingCK(AttentionDecodingFlashDecoding):

From beb4383a07c5d18c84116cd9c53a305bb7b81052 Mon Sep 17 00:00:00 2001
From: Max Podkorytov <4273004+tenpercent@users.noreply.github.com>
Date: Sun, 26 Nov 2023 01:22:49 -0500
Subject: [PATCH 261/641] fix comments and standalone decoder runner

---
 .../hip_fmha/attention_forward_decoder.cpp    | 147 +++---
 .../hip_fmha/ck_attention_forward_decoder.h   | 450 +++++++++---------
 2 files changed, 300 insertions(+), 297 deletions(-)

diff --git a/xformers/csrc/attention/hip_fmha/attention_forward_decoder.cpp b/xformers/csrc/attention/hip_fmha/attention_forward_decoder.cpp
index 3678157aa..b696831e4 100644
--- a/xformers/csrc/attention/hip_fmha/attention_forward_decoder.cpp
+++ b/xformers/csrc/attention/hip_fmha/attention_forward_decoder.cpp
@@ -56,13 +56,13 @@ template <int32_t ThreadsPerWavefront,
           int32_t WavefrontsPerBlock,
           int32_t T_MAX = 8192,
           int32_t D_H   = 256>
-at::Tensor&
-efficient_attention_forward_decoder_ck_out_impl(const at::Tensor& XQ,      // [B, 1, H, D]
-                                                const at::Tensor& cache_K, // [B, T_MAX, H or 1, D]
-                                                const at::Tensor& cache_V, // [B, T_MAX, H or 1, D]
-                                                at::optional<at::Tensor> seq_kv_lens, // [B]
-                                                double qk_scale,
-                                                at::Tensor& O)
+at::Tensor& efficient_attention_forward_decoder_ck_out_impl(
+    const at::Tensor& XQ,                 // [B, 1, G, H, D]
+    const at::Tensor& cache_K,            // [B, T_MAX, G, H or 1, D]
+    const at::Tensor& cache_V,            // [B, T_MAX, G, H or 1, D]
+    at::optional<at::Tensor> seq_kv_lens, // [B]
+    double qk_scale,
+    at::Tensor& O)
 {
     static_assert(4 * ThreadsPerWavefront == D_H, "");
     static_assert(WavefrontsPerBlock <= ThreadsPerWavefront, "");
@@ -153,8 +153,8 @@ efficient_attention_forward_decoder_ck_out_impl(const at::Tensor& XQ,      // [B
 
 template <int32_t ThreadsPerWavefront, int32_t WavefrontsPerBlock>
 at::Tensor
-efficient_attention_forward_decoder_ck_impl(const at::Tensor& XQ,      // [B, 1, H, D]
-                                            const at::Tensor& cache_K, // [B, T_MAX, H or 1, D]
+efficient_attention_forward_decoder_ck_impl(const at::Tensor& XQ,      // [B, 1, G, H, D]
+                                            const at::Tensor& cache_K, // [B, T_MAX, G, H or 1, D]
                                             const at::Tensor& cache_V, // [B, T_MAX, H or 1, D]
                                             at::optional<at::Tensor> seq_kv_lens, // [B]
                                             double qk_scale)
@@ -166,9 +166,9 @@ efficient_attention_forward_decoder_ck_impl(const at::Tensor& XQ,      // [B, 1,
 }
 
 at::Tensor
-efficient_attention_forward_decoder_ck(const at::Tensor& XQ,      // [B, 1, H, D]
-                                       const at::Tensor& cache_K, // [B, T_MAX, H or 1, D]
-                                       const at::Tensor& cache_V, // [B, T_MAX, H or 1, D]
+efficient_attention_forward_decoder_ck(const at::Tensor& XQ,      // [B, 1, G, H, D]
+                                       const at::Tensor& cache_K, // [B, T_MAX, G, H or 1, D]
+                                       const at::Tensor& cache_V, // [B, T_MAX, G, H or 1, D]
                                        at::optional<at::Tensor> seq_kv_lens, // [B]
                                        double qk_scale)
 {
@@ -200,11 +200,11 @@ TORCH_LIBRARY_IMPL(xformers, CUDA, m)
 (2) compile
  > mkdir build
  > cd build
- > cmake /xformers/xformers/csrc/attention/hip_fmha/ \ 
+ > cmake /xformers/xformers/csrc/attention/hip_fmha/ \
        -DCMAKE_CXX_COMPILER=/opt/rocm/bin/hipcc \
        -D CMAKE_PREFIX_PATH=/opt/rocm \
        -D CMAKE_BUILD_TYPE=Debug \
-       -D GPU_TARGETS="gfx90a" 
+       -D GPU_TARGETS="native" 
   > make
 
 (3a) run correctness check
@@ -221,15 +221,16 @@ static void do_correctness_check()
     const int32_t D = 4 * kThreadsPerWavefront;
     const int32_t B = 1;
     const int32_t H = 4;
+    const int32_t G = 1;
     auto options    = torch::TensorOptions()
                        .dtype(torch::kFloat32)
                        .layout(torch::kStrided)
                        .device(torch::kCUDA, 1)
                        .requires_grad(false);
     auto int_options = options.dtype(torch::kInt);
-    auto XQ          = at::randn({B, 1, H, D}, options);
-    auto K           = at::randn({B, 4096, H, D}, options);
-    auto V           = at::randn({B, 4096, H, D}, options);
+    auto XQ          = at::randn({B, 1, G, H, D}, options);
+    auto K           = at::randn({B, 4096, G, H, D}, options);
+    auto V           = at::randn({B, 4096, G, H, D}, options);
     auto seq         = at::randint(63, 128, {B}, int_options);
     double qk_scale  = 1. / sqrt(D);
 
@@ -246,76 +247,68 @@ int main(int argc, char** argv)
     {
         do_correctness_check();
     }
-    else
-    {
-        const auto args = std::vector<std::string>(argv + 1, argv + argc);
-        if(args.size() != 7)
-        {
-            std::cout << "Usage: ./a.out n_keys padding batch_size n_heads is_multiquery dtype "
-                         "n_wavefronts_per_block"
-                      << std::endl;
-            return 0;
-        }
-        const int32_t n_keys                 = std::stoi(args[0]);
-        const int32_t padding                = std::stoi(args[1]);
-        const int32_t batch_size             = std::stoi(args[2]);
-        const int32_t n_heads                = std::stoi(args[3]);
-        const int32_t multiquery             = (args[4] == "mq");
-        const auto dtype                     = (args[5] == "f32")   ? torch::kFloat32
-                                               : (args[5] == "f16") ? torch::kFloat16
-                                                                    : torch::kBFloat16;
-        const int32_t n_wavefronts_per_block = std::stoi(args[6]);
-
-        const int32_t dim_per_head = 4 * kThreadsPerWavefront;
-
-        const auto options = torch::TensorOptions()
-                                 .dtype(dtype)
-                                 .layout(torch::kStrided)
-                                 .device(torch::kCUDA, 1)
-                                 .requires_grad(false);
-
-        const auto int_options = options.dtype(torch::kInt);
-        const auto Q           = at::rand({batch_size, 1, n_heads, dim_per_head}, options);
-        const auto K = multiquery ? at::rand({batch_size, padding, 1, dim_per_head}, options)
-                                        .expand({batch_size, padding, n_heads, dim_per_head})
-                                  : at::rand({batch_size, padding, n_heads, dim_per_head}, options);
-        const auto V = at::rand_like(K);
-        auto O       = at::rand_like(Q);
-
-        const auto seq        = at::randint(1, n_keys, {batch_size}, int_options);
-        const double qk_scale = 1. / sqrt(dim_per_head);
-        auto call_ptr =
-            decltype(&efficient_attention_forward_decoder_ck_out_impl<kThreadsPerWavefront,
-                                                                      kWavefrontsPerBlock>){};
+    const int32_t n_keys                 = std::stoi(args[0]);
+    const int32_t padding                = std::stoi(args[1]);
+    const int32_t batch_size             = std::stoi(args[2]);
+    const int32_t n_heads                = std::stoi(args[3]);
+    const int32_t n_groups               = 1;
+    const int32_t multiquery             = (args[4] == "mq");
+    const auto dtype                     = (args[5] == "f32")   ? torch::kFloat32
+                                           : (args[5] == "f16") ? torch::kFloat16
+                                                                : torch::kBFloat16;
+    const int32_t n_wavefronts_per_block = std::stoi(args[6]);
+
+    const int32_t dim_per_head = 4 * kThreadsPerWavefront;
+
+    const auto options = torch::TensorOptions()
+                             .dtype(dtype)
+                             .layout(torch::kStrided)
+                             .device(torch::kCUDA, 1)
+                             .requires_grad(false);
+
+    const auto int_options = options.dtype(torch::kInt);
+    const auto Q           = at::rand({batch_size, 1, n_groups, n_heads, dim_per_head}, options);
+    const auto K           = multiquery
+                                 ? at::rand({batch_size, padding, n_groups, 1, dim_per_head}, options)
+                             .expand({batch_size, padding, n_groups, n_heads, dim_per_head})
+                                 : at::rand({batch_size, padding, n_groups, n_heads, dim_per_head}, options);
+    const auto V           = at::rand_like(K);
+    auto O                 = at::empty_like(Q);
+
+    const auto seq        = at::randint(1, n_keys, {batch_size}, int_options);
+    const double qk_scale = 1. / sqrt(dim_per_head);
+    auto call_ptr =
+        decltype(&efficient_attention_forward_decoder_ck_out_impl<kThreadsPerWavefront,
+                                                                  kWavefrontsPerBlock>){};
 
 #define SWITCH_CASE_SET_CALLPTR(n)                                                              \
     case(n):                                                                                    \
         call_ptr = &efficient_attention_forward_decoder_ck_out_impl<kThreadsPerWavefront, (n)>; \
         break;
 
-        switch(n_wavefronts_per_block)
-        {
-            SWITCH_CASE_SET_CALLPTR(1);
-            SWITCH_CASE_SET_CALLPTR(2);
-            SWITCH_CASE_SET_CALLPTR(4);
-            SWITCH_CASE_SET_CALLPTR(8);
-            SWITCH_CASE_SET_CALLPTR(16);
+    switch(n_wavefronts_per_block)
+    {
+        SWITCH_CASE_SET_CALLPTR(1);
+        SWITCH_CASE_SET_CALLPTR(2);
+        SWITCH_CASE_SET_CALLPTR(4);
+        SWITCH_CASE_SET_CALLPTR(8);
+        SWITCH_CASE_SET_CALLPTR(16);
 
-        default: call_ptr = nullptr; break;
-        }
+    default: call_ptr = nullptr; break;
+    }
 #undef SWITCH_CASE_SET_CALLPTR
 
-        if(call_ptr)
-        {
-            call_ptr(Q, K, V, seq, qk_scale, O);
-        }
-        else
-        {
-            std::cout << "Warning: no kernel was found for wavefronts_per_block="
-                      << n_wavefronts_per_block << std::endl;
-        }
+    if(call_ptr)
+    {
+        call_ptr(Q, K, V, seq, qk_scale, O);
     }
-    return 0;
+    else
+    {
+        std::cout << "Warning: no kernel was found for wavefronts_per_block="
+                  << n_wavefronts_per_block << std::endl;
+    }
+}
+return 0;
 }
 
 #endif // MAIN
diff --git a/xformers/csrc/attention/hip_fmha/ck_attention_forward_decoder.h b/xformers/csrc/attention/hip_fmha/ck_attention_forward_decoder.h
index 5d303f8a4..8270edc44 100644
--- a/xformers/csrc/attention/hip_fmha/ck_attention_forward_decoder.h
+++ b/xformers/csrc/attention/hip_fmha/ck_attention_forward_decoder.h
@@ -148,7 +148,7 @@ efficient_attention_forward_decoder_ck_kernel(const scalar_t* __restrict__ XQ,
     const int32_t wavefronts_per_block  = blockDim.y;
     const int32_t threads_per_block     = threads_per_wavefront * wavefronts_per_block;
     const int32_t thread_linear_idx     = lane_idx + wavefront_idx * threads_per_wavefront;
-    // const auto* q_ = &(XQ_acc[b][m][h][0]);
+    // const auto* q_ = &(XQ_acc[b][m][g][h][0]);
     const auto XQO_base_offset =
         b * XQ_stride_b + m * XQ_stride_m + g * XQ_stride_g + h * XQ_stride_h;
     const auto* __restrict__ q_ = XQ + XQO_base_offset;
@@ -195,7 +195,7 @@ efficient_attention_forward_decoder_ck_kernel(const scalar_t* __restrict__ XQ,
             for(auto ttt = 0; ttt < n_loop_unroll; ++ttt)
             {
                 const int32_t t = tt + ttt;
-                // load the K[b][t][h|0][:] row into registers
+                // load the K[b][t][g][h|0][:] row into registers
                 load_v<data_t, data_vec_t>(cache_K_base + t * K_stride_m, lane_idx, &k_loads[ttt]);
             }
         }
@@ -218,47 +218,22 @@ efficient_attention_forward_decoder_ck_kernel(const scalar_t* __restrict__ XQ,
                     const int32_t t = tt + ttt;
                     if(t < t_max)
                     {
-                        // load the K[b][t][h|0][:] row into registers
+                        // load the K[b][t][g][h|0][:] row into registers
                         load_v<data_t, data_vec_t>(
                             cache_K_base + t * K_stride_m, lane_idx, &k_loads[ttt]);
                     }
-                    compute_t qk_accs[n_loop_unroll] = {};
-#pragma unroll n_loop_unroll
-                    for(auto ttt = 0; ttt < n_loop_unroll; ++ttt)
-                    {
-                        const int32_t t = tt + ttt;
-                        // load the V[b][t][h|0][:] row into registers, reusing K register
-                        // storage
-                        load_v<data_t, data_vec_t>(
-                            cache_V_base + t * K_stride_m, lane_idx, &k_loads[ttt]);
-                        ps[ttt] = smem[t];
-                    }
+                    // Each block computes different B value
+                    compute_t max_qk_acc = ck::NumericLimits<compute_t>::Lowest();
 
-#pragma unroll n_loop_unroll
-                    for(auto ttt = 0; ttt < n_loop_unroll; ++ttt)
-                    {
-                        o_acc = scalar_scale_acc<data_t, vec_size>(o_acc, k_loads[ttt], ps[ttt]);
-                    }
-                }
+                    // Compute S[T_MAX] = for i in range(T): S[t] = sum(Q[d] * K[t, d])
+                    // Split T across wavefronts in a block, unroll loads to expose more
+                    // parallelism.
 
-                for(auto tt = t_max_unroll + wavefront_idx * n_loop_unroll_tail; tt < t_max;
-                    tt += wavefronts_per_block * n_loop_unroll_tail)
-                {
-#pragma unroll n_loop_unroll_tail
-                    for(auto ttt = 0; ttt < n_loop_unroll_tail; ++ttt)
+                    // NB: the length of the tail is <= (wavefronts_per_block * n_loop_unroll)
+                    for(auto tt = t_max_unroll + wavefront_idx * n_loop_unroll_tail; tt < t_max;
+                        tt += wavefronts_per_block * n_loop_unroll_tail)
                     {
-                        const int32_t t = tt + ttt;
-                        if(t < t_max)
-                        {
-                            // load the V[b][t][h|0][:] row into registers, reusing K register
-                            // storage
-                            load_v<data_t, data_vec_t>(
-                                cache_V_base + t * K_stride_m, lane_idx, &k_loads[ttt]);
-                            ps[ttt] = smem[t];
-                        }
-
-                        for(auto tt = t_max_unroll + wavefront_idx * n_loop_unroll_tail; tt < t_max;
-                            tt += wavefronts_per_block * n_loop_unroll_tail)
+                        if(lane_active_for_io)
                         {
 #pragma unroll n_loop_unroll_tail
                             for(auto ttt = 0; ttt < n_loop_unroll_tail; ++ttt)
@@ -266,218 +241,253 @@ efficient_attention_forward_decoder_ck_kernel(const scalar_t* __restrict__ XQ,
                                 const int32_t t = tt + ttt;
                                 if(t < t_max)
                                 {
-                                    // load the V[b][t][h|0][:] row into registers, reusing K
+                                    // load the K[b][t][h|0][:] row into registers
+                                    load_v<data_t, data_vec_t>(
+                                        cache_K_base + t * K_stride_m, lane_idx, &k_loads[ttt]);
+                                }
+                                compute_t qk_accs[n_loop_unroll] = {};
+#pragma unroll n_loop_unroll
+                                for(auto ttt = 0; ttt < n_loop_unroll; ++ttt)
+                                {
+                                    const int32_t t = tt + ttt;
+                                    // load the V[b][t][g][h|0][:] row into registers, reusing K
                                     // register storage
                                     load_v<data_t, data_vec_t>(
-                                        cache_V_base + t * K_stride_1, lane_idx, &k_loads[ttt]);
+                                        cache_V_base + t * K_stride_m, lane_idx, &k_loads[ttt]);
                                     ps[ttt] = smem[t];
                                 }
-                            }
 
-#pragma unroll n_loop_unroll_tail
-                            for(auto ttt = 0; ttt < n_loop_unroll_tail; ++ttt)
-                            {
-                                const int32_t t = tt + ttt;
-                                if(t < t_max)
+#pragma unroll n_loop_unroll
+                                for(auto ttt = 0; ttt < n_loop_unroll; ++ttt)
                                 {
                                     o_acc = scalar_scale_acc<data_t, vec_size>(
                                         o_acc, k_loads[ttt], ps[ttt]);
                                 }
                             }
-                        }
-                    }
-                    // now, each thread has partial sums. Write to smem and get accumulated
-                    // results back.
-                    __syncthreads();
 
-                    // NB: needs sizeof(smem) >= `vec_size` * (sizeof(float)==4) * threadsPerBlock
-                    if(lane_active_for_io)
-                    {
-                        store_v<compute_t, compute_vec_t>(&smem[0], thread_linear_idx, o_acc);
-                    }
+                            for(auto tt = t_max_unroll + wavefront_idx * n_loop_unroll_tail;
+                                tt < t_max;
+                                tt += wavefronts_per_block * n_loop_unroll_tail)
+                            {
+#pragma unroll n_loop_unroll_tail
+                                for(auto ttt = 0; ttt < n_loop_unroll_tail; ++ttt)
+                                {
+                                    const int32_t t = tt + ttt;
+                                    if(t < t_max)
+                                    {
+                                        // load the V[b][t][g][h|0][:] row into registers, reusing K
+                                        // register storage
+                                        load_v<data_t, data_vec_t>(
+                                            cache_V_base + t * K_stride_m, lane_idx, &k_loads[ttt]);
+                                        ps[ttt] = smem[t];
+                                    }
+                                }
 
-                    __syncthreads();
-                    // sum up partial D rows from other wavefronts
-                    if(wavefront_idx == 0 && lane_active_for_io)
-                    {
-                        union
-                        {
-                            compute_vec_t vec = 0;
-                            compute_t arr[vec_size];
-                        } r;
-                        for(int32_t w = 0; w < wavefronts_per_block; ++w)
+#pragma unroll n_loop_unroll_tail
+                                for(auto ttt = 0; ttt < n_loop_unroll_tail; ++ttt)
+                                {
+                                    const int32_t t = tt + ttt;
+                                    if(t < t_max)
+                                    {
+                                        o_acc = scalar_scale_acc<data_t, vec_size>(
+                                            o_acc, k_loads[ttt], ps[ttt]);
+                                    }
+                                }
+                            }
+                        }
+                        // now, each thread has partial sums. Write to smem and get accumulated
+                        // results back.
+                        __syncthreads();
+
+                        // NB: needs sizeof(smem) >= `vec_size` * (sizeof(float)==4) *
+                        // threadsPerBlock
+                        if(lane_active_for_io)
                         {
-                            compute_vec_t partial_r;
-                            load_v<compute_t, compute_vec_t>(
-                                smem, w * threads_per_wavefront + lane_idx, &partial_r);
-                            r.vec += partial_r;
+                            store_v<compute_t, compute_vec_t>(&smem[0], thread_linear_idx, o_acc);
                         }
-                        // elementwise convert from compute_t result to data_t out to be written
-                        union
+
+                        __syncthreads();
+                        // sum up partial D rows from other wavefronts
+                        if(wavefront_idx == 0 && lane_active_for_io)
                         {
-                            data_vec_t vec;
-                            data_t arr[vec_size];
-                        } bf_r;
+                            union
+                            {
+                                compute_vec_t vec = 0;
+                                compute_t arr[vec_size];
+                            } r;
+                            for(int32_t w = 0; w < wavefronts_per_block; ++w)
+                            {
+                                compute_vec_t partial_r;
+                                load_v<compute_t, compute_vec_t>(
+                                    smem, w * threads_per_wavefront + lane_idx, &partial_r);
+                                r.vec += partial_r;
+                            }
+                            // elementwise convert from compute_t result to data_t out to be written
+                            union
+                            {
+                                data_vec_t vec;
+                                data_t arr[vec_size];
+                            } bf_r;
 #pragma unroll
-                        for(int32_t i = 0; i < vec_size; ++i)
-                        {
-                            bf_r.arr[i] = ck::type_convert<data_t>(r.arr[i]);
+                            for(int32_t i = 0; i < vec_size; ++i)
+                            {
+                                bf_r.arr[i] = ck::type_convert<data_t>(r.arr[i]);
+                            }
+                            // write output row O[b][m][g][h][:]
+                            data_t* __restrict__ o_ = O + XQO_base_offset;
+                            store_v<data_t, data_vec_t>(o_, lane_idx, bf_r.vec);
                         }
-                        // write output row O[b][m][h][:]
-                        data_t* __restrict__ o_ = O + XQO_base_offset;
-                        store_v<data_t, data_vec_t>(o_, lane_idx, bf_r.vec);
                     }
-                }
 
-            } // namespace
+                } // namespace
 
-            namespace ck {
-            namespace tensor_operation {
-            namespace device {
-            template <typename scalar_t>
-            struct FMHADecoderSeqlen1DeviceOp : public BaseOperator
-            {
-                using DeviceOp = FMHADecoderSeqlen1DeviceOp;
-                struct Argument : public BaseArgument
+                namespace ck {
+                namespace tensor_operation {
+                namespace device {
+                template <typename scalar_t>
+                struct FMHADecoderSeqlen1DeviceOp : public BaseOperator
                 {
-                    const scalar_t* __restrict__ XQ;
-                    const scalar_t* __restrict__ cache_K;
-                    const scalar_t* __restrict__ cache_V;
-                    scalar_t* __restrict__ O;
-                    const int32_t* __restrict__ seq_kv_lens;
-                    const ptrdiff_t XQ_stride_b;
-                    const ptrdiff_t XQ_stride_m;
-                    const ptrdiff_t XQ_stride_g;
-                    const ptrdiff_t XQ_stride_h;
-                    const ptrdiff_t K_stride_b;
-                    const ptrdiff_t K_stride_m;
-                    const ptrdiff_t K_stride_g;
-                    const ptrdiff_t K_stride_h;
-                    const int32_t Q_size_m;
-                    const int32_t Q_size_g;
-                    const int32_t Q_size_h;
-                    const int32_t Q_size_k;
-                    const int32_t K_size_m;
-                    const bool multiquery;
-                    const float qk_scale;
-
-                    const dim3 grid_dim;
-                    const dim3 block_dim;
-                    const size_t lds_bytes;
-
-                    Argument(const scalar_t* __restrict__ XQ,
-                             const scalar_t* __restrict__ cache_K,
-                             const scalar_t* __restrict__ cache_V,
-                             scalar_t* __restrict__ O,
-                             const int32_t* __restrict__ seq_kv_lens,
-                             const ptrdiff_t XQ_stride_b,
-                             const ptrdiff_t XQ_stride_m,
-                             const ptrdiff_t XQ_stride_g,
-                             const ptrdiff_t XQ_stride_h,
-                             const ptrdiff_t K_stride_b,
-                             const ptrdiff_t K_stride_m,
-                             const ptrdiff_t K_stride_g,
-                             const ptrdiff_t K_stride_h,
-                             const int32_t Q_size_m,
-                             const int32_t Q_size_g,
-                             const int32_t Q_size_h,
-                             const int32_t Q_size_k,
-                             const int32_t K_size_m,
-                             const bool multiquery,
-                             const float qk_scale,
-                             const dim3 grid_dim,
-                             const dim3 block_dim,
-                             const size_t lds_bytes)
-                        : XQ(XQ),
-                          cache_K(cache_K),
-                          cache_V(cache_V),
-                          O(O),
-                          seq_kv_lens(seq_kv_lens),
-                          XQ_stride_b(XQ_stride_b),
-                          XQ_stride_m(XQ_stride_m),
-                          XQ_stride_g(XQ_stride_g),
-                          XQ_stride_h(XQ_stride_h),
-                          K_stride_b(K_stride_b),
-                          K_stride_m(K_stride_m),
-                          K_stride_g(K_stride_g),
-                          K_stride_h(K_stride_h),
-                          Q_size_m(Q_size_m),
-                          Q_size_g(Q_size_g),
-                          Q_size_h(Q_size_h),
-                          Q_size_k(Q_size_k),
-                          K_size_m(K_size_m),
-                          multiquery(multiquery),
-                          qk_scale(qk_scale),
-                          grid_dim(grid_dim),
-                          block_dim(block_dim),
-                          lds_bytes(lds_bytes)
+                    using DeviceOp = FMHADecoderSeqlen1DeviceOp;
+                    struct Argument : public BaseArgument
                     {
-                    }
-                };
+                        const scalar_t* __restrict__ XQ;
+                        const scalar_t* __restrict__ cache_K;
+                        const scalar_t* __restrict__ cache_V;
+                        scalar_t* __restrict__ O;
+                        const int32_t* __restrict__ seq_kv_lens;
+                        const ptrdiff_t XQ_stride_b;
+                        const ptrdiff_t XQ_stride_m;
+                        const ptrdiff_t XQ_stride_g;
+                        const ptrdiff_t XQ_stride_h;
+                        const ptrdiff_t K_stride_b;
+                        const ptrdiff_t K_stride_m;
+                        const ptrdiff_t K_stride_g;
+                        const ptrdiff_t K_stride_h;
+                        const int32_t Q_size_m;
+                        const int32_t Q_size_g;
+                        const int32_t Q_size_h;
+                        const int32_t Q_size_k;
+                        const int32_t K_size_m;
+                        const bool multiquery;
+                        const float qk_scale;
+
+                        const dim3 grid_dim;
+                        const dim3 block_dim;
+                        const size_t lds_bytes;
+
+                        Argument(const scalar_t* __restrict__ XQ,
+                                 const scalar_t* __restrict__ cache_K,
+                                 const scalar_t* __restrict__ cache_V,
+                                 scalar_t* __restrict__ O,
+                                 const int32_t* __restrict__ seq_kv_lens,
+                                 const ptrdiff_t XQ_stride_b,
+                                 const ptrdiff_t XQ_stride_m,
+                                 const ptrdiff_t XQ_stride_g,
+                                 const ptrdiff_t XQ_stride_h,
+                                 const ptrdiff_t K_stride_b,
+                                 const ptrdiff_t K_stride_m,
+                                 const ptrdiff_t K_stride_g,
+                                 const ptrdiff_t K_stride_h,
+                                 const int32_t Q_size_m,
+                                 const int32_t Q_size_g,
+                                 const int32_t Q_size_h,
+                                 const int32_t Q_size_k,
+                                 const int32_t K_size_m,
+                                 const bool multiquery,
+                                 const float qk_scale,
+                                 const dim3 grid_dim,
+                                 const dim3 block_dim,
+                                 const size_t lds_bytes)
+                            : XQ(XQ),
+                              cache_K(cache_K),
+                              cache_V(cache_V),
+                              O(O),
+                              seq_kv_lens(seq_kv_lens),
+                              XQ_stride_b(XQ_stride_b),
+                              XQ_stride_m(XQ_stride_m),
+                              XQ_stride_g(XQ_stride_g),
+                              XQ_stride_h(XQ_stride_h),
+                              K_stride_b(K_stride_b),
+                              K_stride_m(K_stride_m),
+                              K_stride_g(K_stride_g),
+                              K_stride_h(K_stride_h),
+                              Q_size_m(Q_size_m),
+                              Q_size_g(Q_size_g),
+                              Q_size_h(Q_size_h),
+                              Q_size_k(Q_size_k),
+                              K_size_m(K_size_m),
+                              multiquery(multiquery),
+                              qk_scale(qk_scale),
+                              grid_dim(grid_dim),
+                              block_dim(block_dim),
+                              lds_bytes(lds_bytes)
+                        {
+                        }
+                    };
 
-                struct Invoker : public BaseInvoker
-                {
-                    using Argument = DeviceOp::Argument;
-                    float Run(const Argument& arg,
-                              const StreamConfig& stream_config = StreamConfig{})
+                    struct Invoker : public BaseInvoker
                     {
-                        auto threads_per_wavefront = arg.block_dim.x;
+                        using Argument = DeviceOp::Argument;
+                        float Run(const Argument& arg,
+                                  const StreamConfig& stream_config = StreamConfig{})
+                        {
+                            auto threads_per_wavefront = arg.block_dim.x;
 
-                        auto Q_size_k_alignment_necessary = 0;
+                            auto Q_size_k_alignment_necessary = 0;
 
-                        for(auto vec_size : {4, 2, 1})
-                        {
-                            if(arg.Q_size_k <= vec_size * threads_per_wavefront)
+                            for(auto vec_size : {4, 2, 1})
+                            {
+                                if(arg.Q_size_k <= vec_size * threads_per_wavefront)
+                                {
+                                    Q_size_k_alignment_necessary = vec_size;
+                                }
+                            };
+
+                            if(!Q_size_k_alignment_necessary)
                             {
-                                Q_size_k_alignment_necessary = vec_size;
+                                throw std::runtime_error("Unsupported Q_size_k");
                             }
-                        };
 
-                        if(!Q_size_k_alignment_necessary)
-                        {
-                            throw std::runtime_error("Unsupported Q_size_k");
-                        }
+                            if(arg.Q_size_k % Q_size_k_alignment_necessary)
+                            {
+                                throw std::runtime_error("Unsupported alignment for Q_size_k");
+                            }
 
-                        if(arg.Q_size_k % Q_size_k_alignment_necessary)
-                        {
-                            throw std::runtime_error("Unsupported alignment for Q_size_k");
+                            return launch_and_time_kernel(
+                                stream_config,
+                                Q_size_k_alignment_necessary == 4
+                                    ? efficient_attention_forward_decoder_ck_kernel<scalar_t, 4>
+                                : Q_size_k_alignment_necessary == 2
+                                    ? efficient_attention_forward_decoder_ck_kernel<scalar_t, 2>
+                                : Q_size_k_alignment_necessary == 1
+                                    ? efficient_attention_forward_decoder_ck_kernel<scalar_t, 1>
+                                    : nullptr,
+                                arg.grid_dim,
+                                arg.block_dim,
+                                arg.lds_bytes,
+                                arg.XQ,
+                                arg.cache_K,
+                                arg.cache_V,
+                                arg.O,
+                                arg.seq_kv_lens,
+                                arg.XQ_stride_b,
+                                arg.XQ_stride_m,
+                                arg.XQ_stride_g,
+                                arg.XQ_stride_h,
+                                arg.K_stride_b,
+                                arg.K_stride_m,
+                                arg.K_stride_g,
+                                arg.K_stride_h,
+                                arg.Q_size_m,
+                                arg.Q_size_g,
+                                arg.Q_size_h,
+                                arg.Q_size_k,
+                                arg.K_size_m,
+                                arg.multiquery,
+                                arg.qk_scale);
                         }
-
-                        return launch_and_time_kernel(
-                            stream_config,
-                            Q_size_k_alignment_necessary == 4
-                                ? efficient_attention_forward_decoder_ck_kernel<scalar_t, 4>
-                            : Q_size_k_alignment_necessary == 2
-                                ? efficient_attention_forward_decoder_ck_kernel<scalar_t, 2>
-                            : Q_size_k_alignment_necessary == 1
-                                ? efficient_attention_forward_decoder_ck_kernel<scalar_t, 1>
-                                : nullptr,
-                            arg.grid_dim,
-                            arg.block_dim,
-                            arg.lds_bytes,
-                            arg.XQ,
-                            arg.cache_K,
-                            arg.cache_V,
-                            arg.O,
-                            arg.seq_kv_lens,
-                            arg.XQ_stride_b,
-                            arg.XQ_stride_m,
-                            arg.XQ_stride_g,
-                            arg.XQ_stride_h,
-                            arg.K_stride_b,
-                            arg.K_stride_m,
-                            arg.K_stride_g,
-                            arg.K_stride_h,
-                            arg.Q_size_m,
-                            arg.Q_size_g,
-                            arg.Q_size_h,
-                            arg.Q_size_k,
-                            arg.K_size_m,
-                            arg.multiquery,
-                            arg.qk_scale);
-                    }
+                    };
                 };
-            };
-            } // namespace device
-            } // namespace tensor_operation
-            } // namespace ck
+                } // namespace device
+                } // namespace tensor_operation
+                } // namespace ck

From f306a0a18b6a6e0d754688974208a5740f593547 Mon Sep 17 00:00:00 2001
From: Max Podkorytov <4273004+tenpercent@users.noreply.github.com>
Date: Sun, 26 Nov 2023 01:49:44 -0500
Subject: [PATCH 262/641] fix comments

---
 .../hip_fmha/attention_forward_decoder.cpp    |  38 +-
 .../hip_fmha/ck_attention_forward_decoder.h   | 609 ++++++++++--------
 2 files changed, 361 insertions(+), 286 deletions(-)

diff --git a/xformers/csrc/attention/hip_fmha/attention_forward_decoder.cpp b/xformers/csrc/attention/hip_fmha/attention_forward_decoder.cpp
index b696831e4..7a780f1ba 100644
--- a/xformers/csrc/attention/hip_fmha/attention_forward_decoder.cpp
+++ b/xformers/csrc/attention/hip_fmha/attention_forward_decoder.cpp
@@ -15,8 +15,8 @@
 namespace {
 constexpr int32_t kThreadsPerWavefront = 64;
 constexpr int32_t kWavefrontsPerBlock  = 16;
-constexpr int32_t D_H                  = 4 * kThreadsPerWavefront;
-} // namespace
+constexpr int32_t K_MAX                = 4 * kThreadsPerWavefront;
+}
 
 namespace {
 
@@ -54,17 +54,17 @@ namespace {
 
 template <int32_t ThreadsPerWavefront,
           int32_t WavefrontsPerBlock,
-          int32_t T_MAX = 8192,
-          int32_t D_H   = 256>
+          int32_t KV_M_MAX = 8192,
+          int32_t K_MAX    = 256>
 at::Tensor& efficient_attention_forward_decoder_ck_out_impl(
     const at::Tensor& XQ,                 // [B, 1, G, H, D]
-    const at::Tensor& cache_K,            // [B, T_MAX, G, H or 1, D]
-    const at::Tensor& cache_V,            // [B, T_MAX, G, H or 1, D]
+    const at::Tensor& cache_K,            // [B, KV_M_MAX, G, H or 1, D]
+    const at::Tensor& cache_V,            // [B, KV_M_MAX, G, H or 1, D]
     at::optional<at::Tensor> seq_kv_lens, // [B]
     double qk_scale,
     at::Tensor& O)
 {
-    static_assert(4 * ThreadsPerWavefront == D_H, "");
+    static_assert(4 * ThreadsPerWavefront == K_MAX, "");
     static_assert(WavefrontsPerBlock <= ThreadsPerWavefront, "");
 
     at::OptionalDeviceGuard guard(XQ.device());
@@ -74,8 +74,8 @@ at::Tensor& efficient_attention_forward_decoder_ck_out_impl(
 
     TORCH_CHECK(!seq_kv_lens || seq_kv_lens->is_cuda());
 
-    TORCH_CHECK(cache_K.size(1) <= T_MAX);
-    TORCH_CHECK(cache_K.size(4) <= D_H);
+    TORCH_CHECK(cache_K.size(1) <= KV_M_MAX);
+    TORCH_CHECK(cache_K.size(4) <= K_MAX);
 
     constexpr auto rank = 5;
 
@@ -91,8 +91,8 @@ at::Tensor& efficient_attention_forward_decoder_ck_out_impl(
     dim3 blocks(B * H * M * G);
     dim3 threads(ThreadsPerWavefront, WavefrontsPerBlock);
 
-    int32_t smem_softmax = T_MAX * sizeof(float) + threads.y * sizeof(float);
-    int32_t smem_output  = D_H * sizeof(float) *
+    int32_t smem_softmax = KV_M_MAX * sizeof(float) + threads.y * sizeof(float);
+    int32_t smem_output  = K_MAX * sizeof(float) *
                           threads.y; // 4 * threadsPerBlock * sizeof(float) == sizeof(O[b][0][h][:])
     const size_t lds_bytes = max(smem_softmax, smem_output);
     auto stream            = at::cuda::getCurrentHIPStream().stream();
@@ -152,12 +152,12 @@ at::Tensor& efficient_attention_forward_decoder_ck_out_impl(
 #undef AT_DISPATCH_SWITCH_3
 
 template <int32_t ThreadsPerWavefront, int32_t WavefrontsPerBlock>
-at::Tensor
-efficient_attention_forward_decoder_ck_impl(const at::Tensor& XQ,      // [B, 1, G, H, D]
-                                            const at::Tensor& cache_K, // [B, T_MAX, G, H or 1, D]
-                                            const at::Tensor& cache_V, // [B, T_MAX, H or 1, D]
-                                            at::optional<at::Tensor> seq_kv_lens, // [B]
-                                            double qk_scale)
+at::Tensor efficient_attention_forward_decoder_ck_impl(
+    const at::Tensor& XQ,                 // [B, 1, G, H, D]
+    const at::Tensor& cache_K,            // [B, KV_M_MAX, G, H or 1, D]
+    const at::Tensor& cache_V,            // [B, KV_M_MAX, H or 1, D]
+    at::optional<at::Tensor> seq_kv_lens, // [B]
+    double qk_scale)
 {
     auto O = at::empty_like(XQ);
     efficient_attention_forward_decoder_ck_out_impl<ThreadsPerWavefront, WavefrontsPerBlock>(
@@ -167,8 +167,8 @@ efficient_attention_forward_decoder_ck_impl(const at::Tensor& XQ,      // [B, 1,
 
 at::Tensor
 efficient_attention_forward_decoder_ck(const at::Tensor& XQ,      // [B, 1, G, H, D]
-                                       const at::Tensor& cache_K, // [B, T_MAX, G, H or 1, D]
-                                       const at::Tensor& cache_V, // [B, T_MAX, G, H or 1, D]
+                                       const at::Tensor& cache_K, // [B, KV_M_MAX, G, H or 1, D]
+                                       const at::Tensor& cache_V, // [B, KV_M_MAX, G, H or 1, D]
                                        at::optional<at::Tensor> seq_kv_lens, // [B]
                                        double qk_scale)
 {
diff --git a/xformers/csrc/attention/hip_fmha/ck_attention_forward_decoder.h b/xformers/csrc/attention/hip_fmha/ck_attention_forward_decoder.h
index 8270edc44..ae13c44af 100644
--- a/xformers/csrc/attention/hip_fmha/ck_attention_forward_decoder.h
+++ b/xformers/csrc/attention/hip_fmha/ck_attention_forward_decoder.h
@@ -106,7 +106,7 @@ template <typename scalar_t,
           int32_t vec_size               = 4,
           int32_t n_loop_unroll          = 16,
           int32_t n_loop_unroll_tail     = 2,
-          int32_t T_MAX                  = 8192,
+          int32_t KV_M_MAX               = 8192,
           int32_t n_wavefronts_per_block = 16>
 __global__ void
 efficient_attention_forward_decoder_ck_kernel(const scalar_t* __restrict__ XQ,
@@ -158,9 +158,6 @@ efficient_attention_forward_decoder_ck_kernel(const scalar_t* __restrict__ XQ,
     const auto* __restrict__ cache_K_base = cache_K + cache_KV_base_offset;
     const auto* __restrict__ cache_V_base = cache_V + cache_KV_base_offset;
 
-    // Load Q into registers in all wavefronts.
-    // Each thread handles `vec_size` D dimensions
-
     using data_t        = scalar_t;
     using data_vec_t    = typename ck::vector_type<data_t, vec_size>::type;
     using compute_t     = float;
@@ -171,16 +168,24 @@ efficient_attention_forward_decoder_ck_kernel(const scalar_t* __restrict__ XQ,
     extern __shared__ __align__(16) compute_t smem[];
 
     data_vec_t q_thread = 0;
+    // Load Q into registers in all wavefronts.
+    // Each thread handles `vec_size` D dimensions
     if(lane_active_for_io)
     {
         load_v<data_t, data_vec_t>(q_, lane_idx, &q_thread);
     }
-    // Each block computes different B value
+
     compute_t max_qk_acc = ck::NumericLimits<compute_t>::Lowest();
 
-    // Compute S[T_MAX] = for i in range(T): S[t] = sum(Q[d] * K[t, d])
-    // Split T across wavefronts in a block, unroll loads to expose more
-    // parallelism.
+    // Compute S[0:t_max] =
+    // ```
+    // for t in range(t_max):
+    //   S[t] = dot(Q, K[t])
+    // ```
+    // Split the 0:t_max range across wavefronts in a block,
+    // unroll loads to expose more parallelism.
+    // Reduce the dot product with cross-lane operation;
+    // Q and K[t] are in the registers of threads in a single wavefront.
 
     data_vec_t k_loads[n_loop_unroll] = {};
 
@@ -206,288 +211,358 @@ efficient_attention_forward_decoder_ck_kernel(const scalar_t* __restrict__ XQ,
         // Split T across wavefronts in a block, unroll loads to expose more
         // parallelism.
 
-        // NB: the length of the tail is <= (wavefronts_per_block * n_loop_unroll)
-        for(auto tt = t_max_unroll + wavefront_idx * n_loop_unroll_tail; tt < t_max;
-            tt += wavefronts_per_block * n_loop_unroll_tail)
+        data_vec_t k_loads[n_loop_unroll] = {};
+
+        constexpr auto dtt         = n_wavefronts_per_block * n_loop_unroll;
+        const int32_t t_max_unroll = (t_max / dtt) * dtt;
+
+        for(auto tt = wavefront_idx * n_loop_unroll; tt < t_max_unroll; tt += dtt)
         {
             if(lane_active_for_io)
             {
-#pragma unroll n_loop_unroll_tail
-                for(auto ttt = 0; ttt < n_loop_unroll_tail; ++ttt)
+#pragma unroll n_loop_unroll
+                for(auto ttt = 0; ttt < n_loop_unroll; ++ttt)
                 {
                     const int32_t t = tt + ttt;
-                    if(t < t_max)
-                    {
-                        // load the K[b][t][g][h|0][:] row into registers
-                        load_v<data_t, data_vec_t>(
-                            cache_K_base + t * K_stride_m, lane_idx, &k_loads[ttt]);
-                    }
-                    // Each block computes different B value
-                    compute_t max_qk_acc = ck::NumericLimits<compute_t>::Lowest();
+                    // load the K[b][t][g][h|0][:] row into registers
+                    load_v<data_t, data_vec_t>(
+                        cache_K_base + t * K_stride_m, lane_idx, &k_loads[ttt]);
+                }
+            }
+            // Each block computes different B value
+            compute_t max_qk_acc = ck::NumericLimits<compute_t>::Lowest();
 
-                    // Compute S[T_MAX] = for i in range(T): S[t] = sum(Q[d] * K[t, d])
-                    // Split T across wavefronts in a block, unroll loads to expose more
-                    // parallelism.
+            // Compute S[T_MAX] = for i in range(T): S[t] = sum(Q[d] * K[t, d])
+            // Split T across wavefronts in a block, unroll loads to expose more
+            // parallelism.
 
-                    // NB: the length of the tail is <= (wavefronts_per_block * n_loop_unroll)
-                    for(auto tt = t_max_unroll + wavefront_idx * n_loop_unroll_tail; tt < t_max;
-                        tt += wavefronts_per_block * n_loop_unroll_tail)
+            // NB: the length of the tail is <= (wavefronts_per_block * n_loop_unroll)
+            for(auto tt = t_max_unroll + wavefront_idx * n_loop_unroll_tail; tt < t_max;
+                tt += wavefronts_per_block * n_loop_unroll_tail)
+            {
+                if(lane_active_for_io)
+                {
+#pragma unroll n_loop_unroll_tail
+                    for(auto ttt = 0; ttt < n_loop_unroll_tail; ++ttt)
                     {
-                        if(lane_active_for_io)
+                        const int32_t t = tt + ttt;
+                        if(t < t_max)
                         {
-#pragma unroll n_loop_unroll_tail
-                            for(auto ttt = 0; ttt < n_loop_unroll_tail; ++ttt)
-                            {
-                                const int32_t t = tt + ttt;
-                                if(t < t_max)
-                                {
-                                    // load the K[b][t][h|0][:] row into registers
-                                    load_v<data_t, data_vec_t>(
-                                        cache_K_base + t * K_stride_m, lane_idx, &k_loads[ttt]);
-                                }
-                                compute_t qk_accs[n_loop_unroll] = {};
-#pragma unroll n_loop_unroll
-                                for(auto ttt = 0; ttt < n_loop_unroll; ++ttt)
-                                {
-                                    const int32_t t = tt + ttt;
-                                    // load the V[b][t][g][h|0][:] row into registers, reusing K
-                                    // register storage
-                                    load_v<data_t, data_vec_t>(
-                                        cache_V_base + t * K_stride_m, lane_idx, &k_loads[ttt]);
-                                    ps[ttt] = smem[t];
-                                }
-
-#pragma unroll n_loop_unroll
-                                for(auto ttt = 0; ttt < n_loop_unroll; ++ttt)
-                                {
-                                    o_acc = scalar_scale_acc<data_t, vec_size>(
-                                        o_acc, k_loads[ttt], ps[ttt]);
-                                }
-                            }
-
-                            for(auto tt = t_max_unroll + wavefront_idx * n_loop_unroll_tail;
-                                tt < t_max;
-                                tt += wavefronts_per_block * n_loop_unroll_tail)
-                            {
-#pragma unroll n_loop_unroll_tail
-                                for(auto ttt = 0; ttt < n_loop_unroll_tail; ++ttt)
-                                {
-                                    const int32_t t = tt + ttt;
-                                    if(t < t_max)
-                                    {
-                                        // load the V[b][t][g][h|0][:] row into registers, reusing K
-                                        // register storage
-                                        load_v<data_t, data_vec_t>(
-                                            cache_V_base + t * K_stride_m, lane_idx, &k_loads[ttt]);
-                                        ps[ttt] = smem[t];
-                                    }
-                                }
-
-#pragma unroll n_loop_unroll_tail
-                                for(auto ttt = 0; ttt < n_loop_unroll_tail; ++ttt)
-                                {
-                                    const int32_t t = tt + ttt;
-                                    if(t < t_max)
-                                    {
-                                        o_acc = scalar_scale_acc<data_t, vec_size>(
-                                            o_acc, k_loads[ttt], ps[ttt]);
-                                    }
-                                }
-                            }
+                            // load the K[b][t][g][h|0][:] row into registers
+                            load_v<data_t, data_vec_t>(
+                                cache_K_base + t * K_stride_m, lane_idx, &k_loads[ttt]);
                         }
-                        // now, each thread has partial sums. Write to smem and get accumulated
-                        // results back.
-                        __syncthreads();
+                        // Each block computes different B value
+                        compute_t max_qk_acc = ck::NumericLimits<compute_t>::Lowest();
 
-                        // NB: needs sizeof(smem) >= `vec_size` * (sizeof(float)==4) *
-                        // threadsPerBlock
-                        if(lane_active_for_io)
-                        {
-                            store_v<compute_t, compute_vec_t>(&smem[0], thread_linear_idx, o_acc);
-                        }
+                        // Compute S[T_MAX] = for i in range(T): S[t] = sum(Q[d] * K[t, d])
+                        // Split T across wavefronts in a block, unroll loads to expose more
+                        // parallelism.
 
-                        __syncthreads();
-                        // sum up partial D rows from other wavefronts
-                        if(wavefront_idx == 0 && lane_active_for_io)
+                        // write accumulated sums to smem.
+                        if(lane_idx == 0)
                         {
-                            union
-                            {
-                                compute_vec_t vec = 0;
-                                compute_t arr[vec_size];
-                            } r;
-                            for(int32_t w = 0; w < wavefronts_per_block; ++w)
-                            {
-                                compute_vec_t partial_r;
-                                load_v<compute_t, compute_vec_t>(
-                                    smem, w * threads_per_wavefront + lane_idx, &partial_r);
-                                r.vec += partial_r;
-                            }
-                            // elementwise convert from compute_t result to data_t out to be written
-                            union
-                            {
-                                data_vec_t vec;
-                                data_t arr[vec_size];
-                            } bf_r;
-#pragma unroll
-                            for(int32_t i = 0; i < vec_size; ++i)
-                            {
-                                bf_r.arr[i] = ck::type_convert<data_t>(r.arr[i]);
-                            }
-                            // write output row O[b][m][g][h][:]
-                            data_t* __restrict__ o_ = O + XQO_base_offset;
-                            store_v<data_t, data_vec_t>(o_, lane_idx, bf_r.vec);
+                            smem[t] = qk_acc;
                         }
                     }
+                }
+            }
+
+            // Use shared reduction to compute max and compute softmax on shared memory.
+            // write max acc
+            if(lane_idx == 0)
+            {
+                smem[KV_M_MAX + wavefront_idx] = max_qk_acc;
+            }
+            __syncthreads();
+            if(lane_idx < wavefronts_per_block)
+            {
+                max_qk_acc = ck::math::max(max_qk_acc, smem[KV_M_MAX + lane_idx]);
+            }
+            // shared across all threads in block
+            max_qk_acc = wavefrontReduce(max_qk_acc, [](auto a, auto b) { return a > b ? a : b; });
+
+            // each wavefront computes partial sum of exp.
+            compute_t softmax_denominator = 0.0f;
+            for(int32_t t = thread_linear_idx; t < t_max; t += threads_per_block)
+            {
+                softmax_denominator += ck::math::exp(smem[t] - max_qk_acc);
+            }
+            softmax_denominator =
+                wavefrontReduce(softmax_denominator, [](auto a, auto b) { return a + b; });
+
+            if(lane_idx == 0)
+            {
+                smem[KV_M_MAX + wavefront_idx] = softmax_denominator;
+            }
+            __syncthreads();
+
+            // now, compute sum of exp(x - max(x)) over all intermediate results.
+            softmax_denominator = 0.0;
+            if(lane_idx < wavefronts_per_block)
+            {
+                softmax_denominator = smem[KV_M_MAX + lane_idx];
+            }
+            softmax_denominator =
+                wavefrontReduce(softmax_denominator, [](auto a, auto b) { return a + b; });
+
+            const compute_t softmax_scale_factor = 1. / softmax_denominator;
+            // now, compute the normalization across all threads.
+            for(int32_t t = thread_linear_idx; t < t_max; t += threads_per_block)
+            {
+                smem[t] = ck::math::exp(smem[t] - max_qk_acc) * softmax_scale_factor;
+            }
+            __syncthreads();
+
+            // Split T across wavefronts in a block
+            // each wavefront compute sum(t_subset) P[t] * V[t_subset, d]
+            // outputs are of size float[D]
+
+            compute_t ps[n_loop_unroll] = {};
+            compute_vec_t o_acc         = 0;
+            if(lane_active_for_io)
+            {
+                for(auto tt = wavefront_idx * n_loop_unroll; tt < t_max_unroll; tt += dtt)
+                {
+#pragma unroll n_loop_unroll
+                    for(auto ttt = 0; ttt < n_loop_unroll; ++ttt)
+                    {
+                        const int32_t t = tt + ttt;
+                        // load the V[b][t][g][h|0][:] row into registers, reusing K
+                        // register storage
+                        load_v<data_t, data_vec_t>(
+                            cache_V_base + t * K_stride_m, lane_idx, &k_loads[ttt]);
+                        ps[ttt] = smem[t];
+                    }
 
-                } // namespace
+#pragma unroll n_loop_unroll
+                    for(auto ttt = 0; ttt < n_loop_unroll; ++ttt)
+                    {
+                        o_acc = scalar_scale_acc<data_t, vec_size>(o_acc, k_loads[ttt], ps[ttt]);
+                    }
+                }
 
-                namespace ck {
-                namespace tensor_operation {
-                namespace device {
-                template <typename scalar_t>
-                struct FMHADecoderSeqlen1DeviceOp : public BaseOperator
+                for(auto tt = t_max_unroll + wavefront_idx * n_loop_unroll_tail; tt < t_max;
+                    tt += wavefronts_per_block * n_loop_unroll_tail)
                 {
-                    using DeviceOp = FMHADecoderSeqlen1DeviceOp;
-                    struct Argument : public BaseArgument
+#pragma unroll n_loop_unroll_tail
+                    for(auto ttt = 0; ttt < n_loop_unroll_tail; ++ttt)
                     {
-                        const scalar_t* __restrict__ XQ;
-                        const scalar_t* __restrict__ cache_K;
-                        const scalar_t* __restrict__ cache_V;
-                        scalar_t* __restrict__ O;
-                        const int32_t* __restrict__ seq_kv_lens;
-                        const ptrdiff_t XQ_stride_b;
-                        const ptrdiff_t XQ_stride_m;
-                        const ptrdiff_t XQ_stride_g;
-                        const ptrdiff_t XQ_stride_h;
-                        const ptrdiff_t K_stride_b;
-                        const ptrdiff_t K_stride_m;
-                        const ptrdiff_t K_stride_g;
-                        const ptrdiff_t K_stride_h;
-                        const int32_t Q_size_m;
-                        const int32_t Q_size_g;
-                        const int32_t Q_size_h;
-                        const int32_t Q_size_k;
-                        const int32_t K_size_m;
-                        const bool multiquery;
-                        const float qk_scale;
-
-                        const dim3 grid_dim;
-                        const dim3 block_dim;
-                        const size_t lds_bytes;
-
-                        Argument(const scalar_t* __restrict__ XQ,
-                                 const scalar_t* __restrict__ cache_K,
-                                 const scalar_t* __restrict__ cache_V,
-                                 scalar_t* __restrict__ O,
-                                 const int32_t* __restrict__ seq_kv_lens,
-                                 const ptrdiff_t XQ_stride_b,
-                                 const ptrdiff_t XQ_stride_m,
-                                 const ptrdiff_t XQ_stride_g,
-                                 const ptrdiff_t XQ_stride_h,
-                                 const ptrdiff_t K_stride_b,
-                                 const ptrdiff_t K_stride_m,
-                                 const ptrdiff_t K_stride_g,
-                                 const ptrdiff_t K_stride_h,
-                                 const int32_t Q_size_m,
-                                 const int32_t Q_size_g,
-                                 const int32_t Q_size_h,
-                                 const int32_t Q_size_k,
-                                 const int32_t K_size_m,
-                                 const bool multiquery,
-                                 const float qk_scale,
-                                 const dim3 grid_dim,
-                                 const dim3 block_dim,
-                                 const size_t lds_bytes)
-                            : XQ(XQ),
-                              cache_K(cache_K),
-                              cache_V(cache_V),
-                              O(O),
-                              seq_kv_lens(seq_kv_lens),
-                              XQ_stride_b(XQ_stride_b),
-                              XQ_stride_m(XQ_stride_m),
-                              XQ_stride_g(XQ_stride_g),
-                              XQ_stride_h(XQ_stride_h),
-                              K_stride_b(K_stride_b),
-                              K_stride_m(K_stride_m),
-                              K_stride_g(K_stride_g),
-                              K_stride_h(K_stride_h),
-                              Q_size_m(Q_size_m),
-                              Q_size_g(Q_size_g),
-                              Q_size_h(Q_size_h),
-                              Q_size_k(Q_size_k),
-                              K_size_m(K_size_m),
-                              multiquery(multiquery),
-                              qk_scale(qk_scale),
-                              grid_dim(grid_dim),
-                              block_dim(block_dim),
-                              lds_bytes(lds_bytes)
+                        const int32_t t = tt + ttt;
+                        if(t < t_max)
                         {
+                            // load the V[b][t][g][h|0][:] row into registers, reusing K
+                            // register storage
+                            load_v<data_t, data_vec_t>(
+                                cache_V_base + t * K_stride_m, lane_idx, &k_loads[ttt]);
+                            ps[ttt] = smem[t];
                         }
-                    };
+                    }
 
-                    struct Invoker : public BaseInvoker
+#pragma unroll n_loop_unroll_tail
+                    for(auto ttt = 0; ttt < n_loop_unroll_tail; ++ttt)
                     {
-                        using Argument = DeviceOp::Argument;
-                        float Run(const Argument& arg,
-                                  const StreamConfig& stream_config = StreamConfig{})
+                        const int32_t t = tt + ttt;
+                        if(t < t_max)
                         {
-                            auto threads_per_wavefront = arg.block_dim.x;
-
-                            auto Q_size_k_alignment_necessary = 0;
-
-                            for(auto vec_size : {4, 2, 1})
-                            {
-                                if(arg.Q_size_k <= vec_size * threads_per_wavefront)
-                                {
-                                    Q_size_k_alignment_necessary = vec_size;
-                                }
-                            };
-
-                            if(!Q_size_k_alignment_necessary)
-                            {
-                                throw std::runtime_error("Unsupported Q_size_k");
-                            }
-
-                            if(arg.Q_size_k % Q_size_k_alignment_necessary)
-                            {
-                                throw std::runtime_error("Unsupported alignment for Q_size_k");
-                            }
-
-                            return launch_and_time_kernel(
-                                stream_config,
-                                Q_size_k_alignment_necessary == 4
-                                    ? efficient_attention_forward_decoder_ck_kernel<scalar_t, 4>
-                                : Q_size_k_alignment_necessary == 2
-                                    ? efficient_attention_forward_decoder_ck_kernel<scalar_t, 2>
-                                : Q_size_k_alignment_necessary == 1
-                                    ? efficient_attention_forward_decoder_ck_kernel<scalar_t, 1>
-                                    : nullptr,
-                                arg.grid_dim,
-                                arg.block_dim,
-                                arg.lds_bytes,
-                                arg.XQ,
-                                arg.cache_K,
-                                arg.cache_V,
-                                arg.O,
-                                arg.seq_kv_lens,
-                                arg.XQ_stride_b,
-                                arg.XQ_stride_m,
-                                arg.XQ_stride_g,
-                                arg.XQ_stride_h,
-                                arg.K_stride_b,
-                                arg.K_stride_m,
-                                arg.K_stride_g,
-                                arg.K_stride_h,
-                                arg.Q_size_m,
-                                arg.Q_size_g,
-                                arg.Q_size_h,
-                                arg.Q_size_k,
-                                arg.K_size_m,
-                                arg.multiquery,
-                                arg.qk_scale);
+                            o_acc =
+                                scalar_scale_acc<data_t, vec_size>(o_acc, k_loads[ttt], ps[ttt]);
                         }
-                    };
+                    }
+                }
+            }
+            // now, each thread has partial sums. Write to smem and get accumulated
+            // results back.
+            __syncthreads();
+
+            // NB: needs sizeof(smem) >= `vec_size` * (sizeof(float)==4) *
+            // threadsPerBlock
+            if(lane_active_for_io)
+            {
+                store_v<compute_t, compute_vec_t>(&smem[0], thread_linear_idx, o_acc);
+            }
+
+            __syncthreads();
+            // sum up partial D rows from other wavefronts
+            if(wavefront_idx == 0 && lane_active_for_io)
+            {
+                union
+                {
+                    compute_vec_t vec = 0;
+                    compute_t arr[vec_size];
+                } r;
+                for(int32_t w = 0; w < wavefronts_per_block; ++w)
+                {
+                    compute_vec_t partial_r;
+                    load_v<compute_t, compute_vec_t>(
+                        smem, w * threads_per_wavefront + lane_idx, &partial_r);
+                    r.vec += partial_r;
+                }
+                // elementwise convert from compute_t result to data_t out to be written
+                union
+                {
+                    data_vec_t vec;
+                    data_t arr[vec_size];
+                } bf_r;
+#pragma unroll
+                for(int32_t i = 0; i < vec_size; ++i)
+                {
+                    bf_r.arr[i] = ck::type_convert<data_t>(r.arr[i]);
+                }
+                // write output row O[b][m][g][h][:]
+                data_t* __restrict__ o_ = O + XQO_base_offset;
+                store_v<data_t, data_vec_t>(o_, lane_idx, bf_r.vec);
+            }
+        }
+
+    } // namespace
+
+    namespace ck {
+    namespace tensor_operation {
+    namespace device {
+    template <typename scalar_t>
+    struct FMHADecoderSeqlen1DeviceOp : public BaseOperator
+    {
+        using DeviceOp = FMHADecoderSeqlen1DeviceOp;
+        struct Argument : public BaseArgument
+        {
+            const scalar_t* __restrict__ XQ;
+            const scalar_t* __restrict__ cache_K;
+            const scalar_t* __restrict__ cache_V;
+            scalar_t* __restrict__ O;
+            const int32_t* __restrict__ seq_kv_lens;
+            const ptrdiff_t XQ_stride_b;
+            const ptrdiff_t XQ_stride_m;
+            const ptrdiff_t XQ_stride_g;
+            const ptrdiff_t XQ_stride_h;
+            const ptrdiff_t K_stride_b;
+            const ptrdiff_t K_stride_m;
+            const ptrdiff_t K_stride_g;
+            const ptrdiff_t K_stride_h;
+            const int32_t Q_size_m;
+            const int32_t Q_size_g;
+            const int32_t Q_size_h;
+            const int32_t Q_size_k;
+            const int32_t K_size_m;
+            const bool multiquery;
+            const float qk_scale;
+
+            const dim3 grid_dim;
+            const dim3 block_dim;
+            const size_t lds_bytes;
+
+            Argument(const scalar_t* __restrict__ XQ,
+                     const scalar_t* __restrict__ cache_K,
+                     const scalar_t* __restrict__ cache_V,
+                     scalar_t* __restrict__ O,
+                     const int32_t* __restrict__ seq_kv_lens,
+                     const ptrdiff_t XQ_stride_b,
+                     const ptrdiff_t XQ_stride_m,
+                     const ptrdiff_t XQ_stride_g,
+                     const ptrdiff_t XQ_stride_h,
+                     const ptrdiff_t K_stride_b,
+                     const ptrdiff_t K_stride_m,
+                     const ptrdiff_t K_stride_g,
+                     const ptrdiff_t K_stride_h,
+                     const int32_t Q_size_m,
+                     const int32_t Q_size_g,
+                     const int32_t Q_size_h,
+                     const int32_t Q_size_k,
+                     const int32_t K_size_m,
+                     const bool multiquery,
+                     const float qk_scale,
+                     const dim3 grid_dim,
+                     const dim3 block_dim,
+                     const size_t lds_bytes)
+                : XQ(XQ),
+                  cache_K(cache_K),
+                  cache_V(cache_V),
+                  O(O),
+                  seq_kv_lens(seq_kv_lens),
+                  XQ_stride_b(XQ_stride_b),
+                  XQ_stride_m(XQ_stride_m),
+                  XQ_stride_g(XQ_stride_g),
+                  XQ_stride_h(XQ_stride_h),
+                  K_stride_b(K_stride_b),
+                  K_stride_m(K_stride_m),
+                  K_stride_g(K_stride_g),
+                  K_stride_h(K_stride_h),
+                  Q_size_m(Q_size_m),
+                  Q_size_g(Q_size_g),
+                  Q_size_h(Q_size_h),
+                  Q_size_k(Q_size_k),
+                  K_size_m(K_size_m),
+                  multiquery(multiquery),
+                  qk_scale(qk_scale),
+                  grid_dim(grid_dim),
+                  block_dim(block_dim),
+                  lds_bytes(lds_bytes)
+            {
+            }
+        };
+
+        struct Invoker : public BaseInvoker
+        {
+            using Argument = DeviceOp::Argument;
+            float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{})
+            {
+                auto threads_per_wavefront = arg.block_dim.x;
+
+                auto Q_size_k_alignment_necessary = 0;
+
+                for(auto vec_size : {4, 2, 1})
+                {
+                    if(arg.Q_size_k <= vec_size * threads_per_wavefront)
+                    {
+                        Q_size_k_alignment_necessary = vec_size;
+                    }
                 };
-                } // namespace device
-                } // namespace tensor_operation
-                } // namespace ck
+
+                if(!Q_size_k_alignment_necessary)
+                {
+                    throw std::runtime_error("Unsupported Q_size_k");
+                }
+
+                if(arg.Q_size_k % Q_size_k_alignment_necessary)
+                {
+                    throw std::runtime_error("Unsupported alignment for Q_size_k");
+                }
+
+                return launch_and_time_kernel(
+                    stream_config,
+                    Q_size_k_alignment_necessary == 4
+                        ? efficient_attention_forward_decoder_ck_kernel<scalar_t, 4>
+                    : Q_size_k_alignment_necessary == 2
+                        ? efficient_attention_forward_decoder_ck_kernel<scalar_t, 2>
+                    : Q_size_k_alignment_necessary == 1
+                        ? efficient_attention_forward_decoder_ck_kernel<scalar_t, 1>
+                        : nullptr,
+                    arg.grid_dim,
+                    arg.block_dim,
+                    arg.lds_bytes,
+                    arg.XQ,
+                    arg.cache_K,
+                    arg.cache_V,
+                    arg.O,
+                    arg.seq_kv_lens,
+                    arg.XQ_stride_b,
+                    arg.XQ_stride_m,
+                    arg.XQ_stride_g,
+                    arg.XQ_stride_h,
+                    arg.K_stride_b,
+                    arg.K_stride_m,
+                    arg.K_stride_g,
+                    arg.K_stride_h,
+                    arg.Q_size_m,
+                    arg.Q_size_g,
+                    arg.Q_size_h,
+                    arg.Q_size_k,
+                    arg.K_size_m,
+                    arg.multiquery,
+                    arg.qk_scale);
+            }
+        };
+    };
+    } // namespace device
+    } // namespace tensor_operation
+    } // namespace ck

From f7bdc9982996204999b61ad0552c18bf8c8e9cde Mon Sep 17 00:00:00 2001
From: Max Podkorytov <4273004+tenpercent@users.noreply.github.com>
Date: Mon, 27 Nov 2023 13:15:46 -0500
Subject: [PATCH 263/641] reflect bmghk in tests

---
 tests/test_mem_eff_attention_ck.py | 75 +++++++++++++++++++++++++-----
 xformers/ops/fmha/ck_decoder.py    |  4 --
 2 files changed, 63 insertions(+), 16 deletions(-)

diff --git a/tests/test_mem_eff_attention_ck.py b/tests/test_mem_eff_attention_ck.py
index 9d6ec70fb..1b4286c01 100644
--- a/tests/test_mem_eff_attention_ck.py
+++ b/tests/test_mem_eff_attention_ck.py
@@ -209,6 +209,26 @@ def _generate_op_device_dtype_biasT_B_Mq_Mkv_H_K_Kv(
 
 
 def ref_attention(q, k, v, attn_bias=None, drop_mask=None, p=0.0, scale=None, dtype=None):
+    if q.ndim == 5:
+        def attn_bias_group(group: int):
+            if isinstance(attn_bias, torch.Tensor):
+                return attn_bias[:, group]
+            if isinstance(attn_bias, fmha.attn_bias.LowerTriangularMaskWithTensorBias):
+                return fmha.attn_bias.LowerTriangularMaskWithTensorBias(
+                    attn_bias._bias[:, group]
+                )
+            return attn_bias
+
+        return torch.stack(
+            [
+                ref_attention_bmhk(
+                    q[:, :, g], k[:, :, g], v[:, :, g], attn_bias=attn_bias_group(g), dtype=dtype
+                )
+                for g in range(q.shape[2])
+            ],
+            dim=2,
+        )
+     
     if q.ndim == 4:
         assert p == 0.0
         return ref_attention_bmhk(q, k, v, attn_bias=attn_bias, dtype=dtype)
@@ -1620,30 +1640,61 @@ def test_attn_bias_padded() -> None:
     )
 
 
+def _kv_heads_label(kv_heads: Optional[int]) -> str:
+    if kv_heads is None:
+        return ""
+    if kv_heads == 1:
+        return "mq"
+    return f"gqa{kv_heads}"
+
+
 @pytest.mark.parametrize("op", [fmha.ck_decoder.FwOp])
-@pytest.mark.parametrize("multiquery", [True, False], ids=lambda x: "mq" if x else "nomq")
-@pytest.mark.parametrize("bsz,n_heads", [(1, 1), (1, 16), (1, 32), (8, 1), (4, 8)], ids=lambda x: f"bsz-nh={x}")
-@pytest.mark.parametrize("padding", [32, 4096], ids=lambda x: f"pad={x}")
+@pytest.mark.parametrize("kv_heads", [None, 1, 2], ids=_kv_heads_label)
+@pytest.mark.parametrize("bsz,n_heads", [(1, 1), (1, 16), (1, 32), (8, 1), (4, 8)])
+@pytest.mark.parametrize("padding", [32, 4096])
 @pytest.mark.parametrize("dtype", ["f16", "bf16", "f32"])
 def test_decoder(
-    op, multiquery: bool, n_heads: int, padding: int, bsz: int, dtype: str
+    op,
+    n_heads: int,
+    kv_heads: Optional[int],
+    padding: int,
+    bsz: int,
+    dtype: str,
+    dequant: bool = False,
+    num_queries: int = 1,
+    d = 256,
 ) -> None:
+    # kv_heads = 1: multiquery
+    # kv_heads = None: neither MQA nor GQA
+    # kv_heads > 1: BMGHK
     dtype_ = {"f16": torch.float16, "bf16": torch.bfloat16, "f32": torch.float}[dtype]
+    tensor_options = {"dtype": dtype_, "device": "cuda"}
     torch.manual_seed(1)
-    d = 256
     num_queries = 1
-    k_shape = (1, bsz * padding, n_heads, d)
-    k = torch.randn(k_shape, dtype=dtype_).cuda()
+    if kv_heads is not None and kv_heads > 1:
+        k_shape: Tuple[int, ...] = (1, bsz * padding, kv_heads, n_heads, d)
+        q_shape: Tuple[int, ...] = (
+            1,
+            bsz * num_queries,
+            kv_heads,
+            n_heads,
+            d,
+        )
+    else:
+        k_shape = (1, bsz * padding, n_heads, d)
+        q_shape = (1, bsz * num_queries, n_heads, d)
+
+    k = torch.randn(k_shape, **tensor_options)
     k_seqlen = torch.randint(num_queries, padding + 1, (bsz,)).tolist()
-    v = torch.randn(k_shape, dtype=dtype_).cuda()
-    q = torch.randn((1, bsz * num_queries, n_heads, d), dtype=dtype_).cuda()
+    v = torch.randn_like(k)
+    q = torch.randn(q_shape, **tensor_options)
     causal_diagonal = torch.tensor(  # TODO: make unnecessary
         [i - 1 for i in k_seqlen], dtype=torch.int32
     ).cuda()
 
-    if multiquery:
-        k = k[:, :, :1].expand(k_shape)
-        v = v[:, :, :1].expand(k_shape)
+    if kv_heads is not None:
+        k = k[..., :1, :].expand(k_shape)
+        v = v[..., :1, :].expand(k_shape)
 
     attn_bias = fmha.attn_bias.BlockDiagonalCausalWithOffsetPaddedKeysMask.from_seqlens(
         q_seqlen=[num_queries] * bsz,
diff --git a/xformers/ops/fmha/ck_decoder.py b/xformers/ops/fmha/ck_decoder.py
index 2fee16a00..ff4a0fd60 100644
--- a/xformers/ops/fmha/ck_decoder.py
+++ b/xformers/ops/fmha/ck_decoder.py
@@ -27,10 +27,6 @@ def not_supported_reasons(cls, d: Inputs) -> List[str]:
 
         attn_bias = d.attn_bias
         if isinstance(attn_bias, BlockDiagonalCausalWithOffsetPaddedKeysMask):
-            # If we don't get here, we've an error elsewhere
-            if d.query.ndim != 4 or d.key.ndim != 4:
-                reasons.append("Inputs must be BMHK. BMK not supported")
-
             if d.query.shape[0] != 1:
                 reasons.append(f"One formal batch element expected; got {d.query.shape[0]}")
 

From f0f17f5b5c9721751d1857d3f1ed19c4026a0a83 Mon Sep 17 00:00:00 2001
From: Max Podkorytov <4273004+tenpercent@users.noreply.github.com>
Date: Wed, 6 Dec 2023 13:46:05 -0500
Subject: [PATCH 264/641] fix rebase conflicts and clang-format

---
 .../hip_fmha/attention_forward_decoder.cpp    | 115 ++--
 .../hip_fmha/ck_attention_forward_decoder.h   | 620 +++++++++---------
 2 files changed, 371 insertions(+), 364 deletions(-)

diff --git a/xformers/csrc/attention/hip_fmha/attention_forward_decoder.cpp b/xformers/csrc/attention/hip_fmha/attention_forward_decoder.cpp
index 7a780f1ba..76fd3228c 100644
--- a/xformers/csrc/attention/hip_fmha/attention_forward_decoder.cpp
+++ b/xformers/csrc/attention/hip_fmha/attention_forward_decoder.cpp
@@ -4,6 +4,7 @@
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
  */
+
 #include <ATen/Dispatch.h>
 #include <ATen/Functions.h>
 #include <ATen/Tensor.h>
@@ -16,7 +17,7 @@ namespace {
 constexpr int32_t kThreadsPerWavefront = 64;
 constexpr int32_t kWavefrontsPerBlock  = 16;
 constexpr int32_t K_MAX                = 4 * kThreadsPerWavefront;
-}
+} // namespace
 
 namespace {
 
@@ -247,68 +248,78 @@ int main(int argc, char** argv)
     {
         do_correctness_check();
     }
-    const int32_t n_keys                 = std::stoi(args[0]);
-    const int32_t padding                = std::stoi(args[1]);
-    const int32_t batch_size             = std::stoi(args[2]);
-    const int32_t n_heads                = std::stoi(args[3]);
-    const int32_t n_groups               = 1;
-    const int32_t multiquery             = (args[4] == "mq");
-    const auto dtype                     = (args[5] == "f32")   ? torch::kFloat32
-                                           : (args[5] == "f16") ? torch::kFloat16
-                                                                : torch::kBFloat16;
-    const int32_t n_wavefronts_per_block = std::stoi(args[6]);
-
-    const int32_t dim_per_head = 4 * kThreadsPerWavefront;
-
-    const auto options = torch::TensorOptions()
-                             .dtype(dtype)
-                             .layout(torch::kStrided)
-                             .device(torch::kCUDA, 1)
-                             .requires_grad(false);
-
-    const auto int_options = options.dtype(torch::kInt);
-    const auto Q           = at::rand({batch_size, 1, n_groups, n_heads, dim_per_head}, options);
-    const auto K           = multiquery
-                                 ? at::rand({batch_size, padding, n_groups, 1, dim_per_head}, options)
+    else
+    {
+        const auto args = std::vector<std::string>(argv + 1, argv + argc);
+        if(args.size() != 7)
+        {
+            std::cout << "Usage: ./a.out n_keys padding batch_size n_heads is_multiquery dtype "
+                         "n_wavefronts_per_block"
+                      << std::endl;
+            return 0;
+        }
+        const int32_t n_keys                 = std::stoi(args[0]);
+        const int32_t padding                = std::stoi(args[1]);
+        const int32_t batch_size             = std::stoi(args[2]);
+        const int32_t n_heads                = std::stoi(args[3]);
+        const int32_t n_groups               = 1;
+        const int32_t multiquery             = (args[4] == "mq");
+        const auto dtype                     = (args[5] == "f32")   ? torch::kFloat32
+                                               : (args[5] == "f16") ? torch::kFloat16
+                                                                    : torch::kBFloat16;
+        const int32_t n_wavefronts_per_block = std::stoi(args[6]);
+
+        const int32_t dim_per_head = 4 * kThreadsPerWavefront;
+
+        const auto options = torch::TensorOptions()
+                                 .dtype(dtype)
+                                 .layout(torch::kStrided)
+                                 .device(torch::kCUDA, 1)
+                                 .requires_grad(false);
+
+        const auto int_options = options.dtype(torch::kInt);
+        const auto Q = at::rand({batch_size, 1, n_groups, n_heads, dim_per_head}, options);
+        const auto K =
+            multiquery ? at::rand({batch_size, padding, n_groups, 1, dim_per_head}, options)
                              .expand({batch_size, padding, n_groups, n_heads, dim_per_head})
-                                 : at::rand({batch_size, padding, n_groups, n_heads, dim_per_head}, options);
-    const auto V           = at::rand_like(K);
-    auto O                 = at::empty_like(Q);
+                       : at::rand({batch_size, padding, n_groups, n_heads, dim_per_head}, options);
+        const auto V = at::rand_like(K);
+        auto O       = at::empty_like(Q);
 
-    const auto seq        = at::randint(1, n_keys, {batch_size}, int_options);
-    const double qk_scale = 1. / sqrt(dim_per_head);
-    auto call_ptr =
-        decltype(&efficient_attention_forward_decoder_ck_out_impl<kThreadsPerWavefront,
-                                                                  kWavefrontsPerBlock>){};
+        const auto seq        = at::randint(1, n_keys, {batch_size}, int_options);
+        const double qk_scale = 1. / sqrt(dim_per_head);
+        auto call_ptr =
+            decltype(&efficient_attention_forward_decoder_ck_out_impl<kThreadsPerWavefront,
+                                                                      kWavefrontsPerBlock>){};
 
 #define SWITCH_CASE_SET_CALLPTR(n)                                                              \
     case(n):                                                                                    \
         call_ptr = &efficient_attention_forward_decoder_ck_out_impl<kThreadsPerWavefront, (n)>; \
         break;
 
-    switch(n_wavefronts_per_block)
-    {
-        SWITCH_CASE_SET_CALLPTR(1);
-        SWITCH_CASE_SET_CALLPTR(2);
-        SWITCH_CASE_SET_CALLPTR(4);
-        SWITCH_CASE_SET_CALLPTR(8);
-        SWITCH_CASE_SET_CALLPTR(16);
+        switch(n_wavefronts_per_block)
+        {
+            SWITCH_CASE_SET_CALLPTR(1);
+            SWITCH_CASE_SET_CALLPTR(2);
+            SWITCH_CASE_SET_CALLPTR(4);
+            SWITCH_CASE_SET_CALLPTR(8);
+            SWITCH_CASE_SET_CALLPTR(16);
 
-    default: call_ptr = nullptr; break;
-    }
+        default: call_ptr = nullptr; break;
+        }
 #undef SWITCH_CASE_SET_CALLPTR
 
-    if(call_ptr)
-    {
-        call_ptr(Q, K, V, seq, qk_scale, O);
-    }
-    else
-    {
-        std::cout << "Warning: no kernel was found for wavefronts_per_block="
-                  << n_wavefronts_per_block << std::endl;
+        if(call_ptr)
+        {
+            call_ptr(Q, K, V, seq, qk_scale, O);
+        }
+        else
+        {
+            std::cout << "Warning: no kernel was found for wavefronts_per_block="
+                      << n_wavefronts_per_block << std::endl;
+        }
     }
-}
-return 0;
+    return 0;
 }
 
-#endif // MAIN
+#endif // MAIN
\ No newline at end of file
diff --git a/xformers/csrc/attention/hip_fmha/ck_attention_forward_decoder.h b/xformers/csrc/attention/hip_fmha/ck_attention_forward_decoder.h
index ae13c44af..381bb4ed8 100644
--- a/xformers/csrc/attention/hip_fmha/ck_attention_forward_decoder.h
+++ b/xformers/csrc/attention/hip_fmha/ck_attention_forward_decoder.h
@@ -204,365 +204,361 @@ efficient_attention_forward_decoder_ck_kernel(const scalar_t* __restrict__ XQ,
                 load_v<data_t, data_vec_t>(cache_K_base + t * K_stride_m, lane_idx, &k_loads[ttt]);
             }
         }
-        // Each block computes different B value
-        compute_t max_qk_acc = ck::NumericLimits<compute_t>::Lowest();
-
-        // Compute S[T_MAX] = for i in range(T): S[t] = sum(Q[d] * K[t, d])
-        // Split T across wavefronts in a block, unroll loads to expose more
-        // parallelism.
-
-        data_vec_t k_loads[n_loop_unroll] = {};
+        compute_t qk_accs[n_loop_unroll] = {};
+#pragma unroll n_loop_unroll
+        for(auto ttt = 0; ttt < n_loop_unroll; ++ttt)
+        {
+            ck::inner_product<data_vec_t, data_vec_t, compute_t>(
+                q_thread, k_loads[ttt], qk_accs[ttt]);
+            qk_accs[ttt] *= qk_scale;
 
-        constexpr auto dtt         = n_wavefronts_per_block * n_loop_unroll;
-        const int32_t t_max_unroll = (t_max / dtt) * dtt;
+            qk_accs[ttt] = wavefrontReduce(qk_accs[ttt], [](auto a, auto b) { return a + b; });
+            max_qk_acc   = ck::math::max(qk_accs[ttt], max_qk_acc);
+        }
+        if(lane_idx == 0)
+        {
+            auto* __restrict__ smem_base = smem + tt;
+#pragma unroll n_loop_unroll
+            for(auto ttt = 0; ttt < n_loop_unroll; ++ttt)
+            {
+                smem_base[ttt] = qk_accs[ttt];
+            }
+        }
+    }
 
-        for(auto tt = wavefront_idx * n_loop_unroll; tt < t_max_unroll; tt += dtt)
+    // NB: the length of the tail is <= (wavefronts_per_block * n_loop_unroll)
+    for(auto tt = t_max_unroll + wavefront_idx * n_loop_unroll_tail; tt < t_max;
+        tt += wavefronts_per_block * n_loop_unroll_tail)
+    {
+        if(lane_active_for_io)
         {
-            if(lane_active_for_io)
+#pragma unroll n_loop_unroll_tail
+            for(auto ttt = 0; ttt < n_loop_unroll_tail; ++ttt)
             {
-#pragma unroll n_loop_unroll
-                for(auto ttt = 0; ttt < n_loop_unroll; ++ttt)
+                const int32_t t = tt + ttt;
+                if(t < t_max)
                 {
-                    const int32_t t = tt + ttt;
                     // load the K[b][t][g][h|0][:] row into registers
                     load_v<data_t, data_vec_t>(
                         cache_K_base + t * K_stride_m, lane_idx, &k_loads[ttt]);
                 }
             }
-            // Each block computes different B value
-            compute_t max_qk_acc = ck::NumericLimits<compute_t>::Lowest();
+        }
+#pragma unroll n_loop_unroll_tail
+        for(auto ttt = 0; ttt < n_loop_unroll_tail; ++ttt)
+        {
+            compute_t qk_acc = 0;
+            const int32_t t  = tt + ttt;
+            if(t < t_max)
+            {
+                ck::inner_product<data_vec_t, data_vec_t, compute_t>(
+                    q_thread, k_loads[ttt], qk_acc);
+                qk_acc *= qk_scale;
 
-            // Compute S[T_MAX] = for i in range(T): S[t] = sum(Q[d] * K[t, d])
-            // Split T across wavefronts in a block, unroll loads to expose more
-            // parallelism.
+                qk_acc     = wavefrontReduce(qk_acc, [](auto a, auto b) { return a + b; });
+                max_qk_acc = ck::math::max(qk_acc, max_qk_acc);
 
-            // NB: the length of the tail is <= (wavefronts_per_block * n_loop_unroll)
-            for(auto tt = t_max_unroll + wavefront_idx * n_loop_unroll_tail; tt < t_max;
-                tt += wavefronts_per_block * n_loop_unroll_tail)
-            {
-                if(lane_active_for_io)
+                // write accumulated sums to smem.
+                if(lane_idx == 0)
                 {
-#pragma unroll n_loop_unroll_tail
-                    for(auto ttt = 0; ttt < n_loop_unroll_tail; ++ttt)
-                    {
-                        const int32_t t = tt + ttt;
-                        if(t < t_max)
-                        {
-                            // load the K[b][t][g][h|0][:] row into registers
-                            load_v<data_t, data_vec_t>(
-                                cache_K_base + t * K_stride_m, lane_idx, &k_loads[ttt]);
-                        }
-                        // Each block computes different B value
-                        compute_t max_qk_acc = ck::NumericLimits<compute_t>::Lowest();
-
-                        // Compute S[T_MAX] = for i in range(T): S[t] = sum(Q[d] * K[t, d])
-                        // Split T across wavefronts in a block, unroll loads to expose more
-                        // parallelism.
-
-                        // write accumulated sums to smem.
-                        if(lane_idx == 0)
-                        {
-                            smem[t] = qk_acc;
-                        }
-                    }
+                    smem[t] = qk_acc;
                 }
             }
+        }
+    }
 
-            // Use shared reduction to compute max and compute softmax on shared memory.
-            // write max acc
-            if(lane_idx == 0)
-            {
-                smem[KV_M_MAX + wavefront_idx] = max_qk_acc;
-            }
-            __syncthreads();
-            if(lane_idx < wavefronts_per_block)
-            {
-                max_qk_acc = ck::math::max(max_qk_acc, smem[KV_M_MAX + lane_idx]);
-            }
-            // shared across all threads in block
-            max_qk_acc = wavefrontReduce(max_qk_acc, [](auto a, auto b) { return a > b ? a : b; });
-
-            // each wavefront computes partial sum of exp.
-            compute_t softmax_denominator = 0.0f;
-            for(int32_t t = thread_linear_idx; t < t_max; t += threads_per_block)
-            {
-                softmax_denominator += ck::math::exp(smem[t] - max_qk_acc);
-            }
-            softmax_denominator =
-                wavefrontReduce(softmax_denominator, [](auto a, auto b) { return a + b; });
+    // Use shared reduction to compute max and compute softmax on shared memory.
+    // write max acc
+    if(lane_idx == 0)
+    {
+        smem[KV_M_MAX + wavefront_idx] = max_qk_acc;
+    }
+    __syncthreads();
+    if(lane_idx < wavefronts_per_block)
+    {
+        max_qk_acc = ck::math::max(max_qk_acc, smem[KV_M_MAX + lane_idx]);
+    }
+    // shared across all threads in block
+    max_qk_acc = wavefrontReduce(max_qk_acc, [](auto a, auto b) { return a > b ? a : b; });
 
-            if(lane_idx == 0)
-            {
-                smem[KV_M_MAX + wavefront_idx] = softmax_denominator;
-            }
-            __syncthreads();
+    // each wavefront computes partial sum of exp.
+    compute_t softmax_denominator = 0.0f;
+    for(int32_t t = thread_linear_idx; t < t_max; t += threads_per_block)
+    {
+        softmax_denominator += ck::math::exp(smem[t] - max_qk_acc);
+    }
+    softmax_denominator =
+        wavefrontReduce(softmax_denominator, [](auto a, auto b) { return a + b; });
 
-            // now, compute sum of exp(x - max(x)) over all intermediate results.
-            softmax_denominator = 0.0;
-            if(lane_idx < wavefronts_per_block)
-            {
-                softmax_denominator = smem[KV_M_MAX + lane_idx];
-            }
-            softmax_denominator =
-                wavefrontReduce(softmax_denominator, [](auto a, auto b) { return a + b; });
+    if(lane_idx == 0)
+    {
+        smem[KV_M_MAX + wavefront_idx] = softmax_denominator;
+    }
+    __syncthreads();
 
-            const compute_t softmax_scale_factor = 1. / softmax_denominator;
-            // now, compute the normalization across all threads.
-            for(int32_t t = thread_linear_idx; t < t_max; t += threads_per_block)
-            {
-                smem[t] = ck::math::exp(smem[t] - max_qk_acc) * softmax_scale_factor;
-            }
-            __syncthreads();
+    // now, compute sum of exp(x - max(x)) over all intermediate results.
+    softmax_denominator = 0.0;
+    if(lane_idx < wavefronts_per_block)
+    {
+        softmax_denominator = smem[KV_M_MAX + lane_idx];
+    }
+    softmax_denominator =
+        wavefrontReduce(softmax_denominator, [](auto a, auto b) { return a + b; });
 
-            // Split T across wavefronts in a block
-            // each wavefront compute sum(t_subset) P[t] * V[t_subset, d]
-            // outputs are of size float[D]
+    const compute_t softmax_scale_factor = 1. / softmax_denominator;
+    // now, compute the normalization across all threads.
+    for(int32_t t = thread_linear_idx; t < t_max; t += threads_per_block)
+    {
+        smem[t] = ck::math::exp(smem[t] - max_qk_acc) * softmax_scale_factor;
+    }
+    __syncthreads();
 
-            compute_t ps[n_loop_unroll] = {};
-            compute_vec_t o_acc         = 0;
-            if(lane_active_for_io)
-            {
-                for(auto tt = wavefront_idx * n_loop_unroll; tt < t_max_unroll; tt += dtt)
-                {
-#pragma unroll n_loop_unroll
-                    for(auto ttt = 0; ttt < n_loop_unroll; ++ttt)
-                    {
-                        const int32_t t = tt + ttt;
-                        // load the V[b][t][g][h|0][:] row into registers, reusing K
-                        // register storage
-                        load_v<data_t, data_vec_t>(
-                            cache_V_base + t * K_stride_m, lane_idx, &k_loads[ttt]);
-                        ps[ttt] = smem[t];
-                    }
+    // Split T across wavefronts in a block
+    // each wavefront compute sum(t_subset) P[t] * V[t_subset, d]
+    // outputs are of size float[D]
 
+    compute_t ps[n_loop_unroll] = {};
+    compute_vec_t o_acc         = 0;
+    if(lane_active_for_io)
+    {
+        for(auto tt = wavefront_idx * n_loop_unroll; tt < t_max_unroll; tt += dtt)
+        {
 #pragma unroll n_loop_unroll
-                    for(auto ttt = 0; ttt < n_loop_unroll; ++ttt)
-                    {
-                        o_acc = scalar_scale_acc<data_t, vec_size>(o_acc, k_loads[ttt], ps[ttt]);
-                    }
-                }
-
-                for(auto tt = t_max_unroll + wavefront_idx * n_loop_unroll_tail; tt < t_max;
-                    tt += wavefronts_per_block * n_loop_unroll_tail)
-                {
-#pragma unroll n_loop_unroll_tail
-                    for(auto ttt = 0; ttt < n_loop_unroll_tail; ++ttt)
-                    {
-                        const int32_t t = tt + ttt;
-                        if(t < t_max)
-                        {
-                            // load the V[b][t][g][h|0][:] row into registers, reusing K
-                            // register storage
-                            load_v<data_t, data_vec_t>(
-                                cache_V_base + t * K_stride_m, lane_idx, &k_loads[ttt]);
-                            ps[ttt] = smem[t];
-                        }
-                    }
-
-#pragma unroll n_loop_unroll_tail
-                    for(auto ttt = 0; ttt < n_loop_unroll_tail; ++ttt)
-                    {
-                        const int32_t t = tt + ttt;
-                        if(t < t_max)
-                        {
-                            o_acc =
-                                scalar_scale_acc<data_t, vec_size>(o_acc, k_loads[ttt], ps[ttt]);
-                        }
-                    }
-                }
+            for(auto ttt = 0; ttt < n_loop_unroll; ++ttt)
+            {
+                const int32_t t = tt + ttt;
+                // load the V[b][t][g][h|0][:] row into registers, reusing K register
+                // storage
+                load_v<data_t, data_vec_t>(cache_V_base + t * K_stride_m, lane_idx, &k_loads[ttt]);
+                ps[ttt] = smem[t];
             }
-            // now, each thread has partial sums. Write to smem and get accumulated
-            // results back.
-            __syncthreads();
 
-            // NB: needs sizeof(smem) >= `vec_size` * (sizeof(float)==4) *
-            // threadsPerBlock
-            if(lane_active_for_io)
+#pragma unroll n_loop_unroll
+            for(auto ttt = 0; ttt < n_loop_unroll; ++ttt)
             {
-                store_v<compute_t, compute_vec_t>(&smem[0], thread_linear_idx, o_acc);
+                o_acc = scalar_scale_acc<data_t, vec_size>(o_acc, k_loads[ttt], ps[ttt]);
             }
+        }
 
-            __syncthreads();
-            // sum up partial D rows from other wavefronts
-            if(wavefront_idx == 0 && lane_active_for_io)
+        for(auto tt = t_max_unroll + wavefront_idx * n_loop_unroll_tail; tt < t_max;
+            tt += wavefronts_per_block * n_loop_unroll_tail)
+        {
+#pragma unroll n_loop_unroll_tail
+            for(auto ttt = 0; ttt < n_loop_unroll_tail; ++ttt)
             {
-                union
-                {
-                    compute_vec_t vec = 0;
-                    compute_t arr[vec_size];
-                } r;
-                for(int32_t w = 0; w < wavefronts_per_block; ++w)
+                const int32_t t = tt + ttt;
+                if(t < t_max)
                 {
-                    compute_vec_t partial_r;
-                    load_v<compute_t, compute_vec_t>(
-                        smem, w * threads_per_wavefront + lane_idx, &partial_r);
-                    r.vec += partial_r;
+                    // load the V[b][t][g][h|0][:] row into registers, reusing K register
+                    // storage
+                    load_v<data_t, data_vec_t>(
+                        cache_V_base + t * K_stride_m, lane_idx, &k_loads[ttt]);
+                    ps[ttt] = smem[t];
                 }
-                // elementwise convert from compute_t result to data_t out to be written
-                union
-                {
-                    data_vec_t vec;
-                    data_t arr[vec_size];
-                } bf_r;
-#pragma unroll
-                for(int32_t i = 0; i < vec_size; ++i)
+            }
+
+#pragma unroll n_loop_unroll_tail
+            for(auto ttt = 0; ttt < n_loop_unroll_tail; ++ttt)
+            {
+                const int32_t t = tt + ttt;
+                if(t < t_max)
                 {
-                    bf_r.arr[i] = ck::type_convert<data_t>(r.arr[i]);
+                    o_acc = scalar_scale_acc<data_t, vec_size>(o_acc, k_loads[ttt], ps[ttt]);
                 }
-                // write output row O[b][m][g][h][:]
-                data_t* __restrict__ o_ = O + XQO_base_offset;
-                store_v<data_t, data_vec_t>(o_, lane_idx, bf_r.vec);
             }
         }
+    }
+    // now, each thread has partial sums. Write to smem and get accumulated
+    // results back.
+    __syncthreads();
 
-    } // namespace
+    // NB: needs sizeof(smem) >= `vec_size` * (sizeof(float)==4) * threadsPerBlock
+    if(lane_active_for_io)
+    {
+        store_v<compute_t, compute_vec_t>(&smem[0], thread_linear_idx, o_acc);
+    }
 
-    namespace ck {
-    namespace tensor_operation {
-    namespace device {
-    template <typename scalar_t>
-    struct FMHADecoderSeqlen1DeviceOp : public BaseOperator
+    __syncthreads();
+    // sum up partial D rows from other wavefronts
+    if(wavefront_idx == 0 && lane_active_for_io)
     {
-        using DeviceOp = FMHADecoderSeqlen1DeviceOp;
-        struct Argument : public BaseArgument
+        union
         {
-            const scalar_t* __restrict__ XQ;
-            const scalar_t* __restrict__ cache_K;
-            const scalar_t* __restrict__ cache_V;
-            scalar_t* __restrict__ O;
-            const int32_t* __restrict__ seq_kv_lens;
-            const ptrdiff_t XQ_stride_b;
-            const ptrdiff_t XQ_stride_m;
-            const ptrdiff_t XQ_stride_g;
-            const ptrdiff_t XQ_stride_h;
-            const ptrdiff_t K_stride_b;
-            const ptrdiff_t K_stride_m;
-            const ptrdiff_t K_stride_g;
-            const ptrdiff_t K_stride_h;
-            const int32_t Q_size_m;
-            const int32_t Q_size_g;
-            const int32_t Q_size_h;
-            const int32_t Q_size_k;
-            const int32_t K_size_m;
-            const bool multiquery;
-            const float qk_scale;
-
-            const dim3 grid_dim;
-            const dim3 block_dim;
-            const size_t lds_bytes;
-
-            Argument(const scalar_t* __restrict__ XQ,
-                     const scalar_t* __restrict__ cache_K,
-                     const scalar_t* __restrict__ cache_V,
-                     scalar_t* __restrict__ O,
-                     const int32_t* __restrict__ seq_kv_lens,
-                     const ptrdiff_t XQ_stride_b,
-                     const ptrdiff_t XQ_stride_m,
-                     const ptrdiff_t XQ_stride_g,
-                     const ptrdiff_t XQ_stride_h,
-                     const ptrdiff_t K_stride_b,
-                     const ptrdiff_t K_stride_m,
-                     const ptrdiff_t K_stride_g,
-                     const ptrdiff_t K_stride_h,
-                     const int32_t Q_size_m,
-                     const int32_t Q_size_g,
-                     const int32_t Q_size_h,
-                     const int32_t Q_size_k,
-                     const int32_t K_size_m,
-                     const bool multiquery,
-                     const float qk_scale,
-                     const dim3 grid_dim,
-                     const dim3 block_dim,
-                     const size_t lds_bytes)
-                : XQ(XQ),
-                  cache_K(cache_K),
-                  cache_V(cache_V),
-                  O(O),
-                  seq_kv_lens(seq_kv_lens),
-                  XQ_stride_b(XQ_stride_b),
-                  XQ_stride_m(XQ_stride_m),
-                  XQ_stride_g(XQ_stride_g),
-                  XQ_stride_h(XQ_stride_h),
-                  K_stride_b(K_stride_b),
-                  K_stride_m(K_stride_m),
-                  K_stride_g(K_stride_g),
-                  K_stride_h(K_stride_h),
-                  Q_size_m(Q_size_m),
-                  Q_size_g(Q_size_g),
-                  Q_size_h(Q_size_h),
-                  Q_size_k(Q_size_k),
-                  K_size_m(K_size_m),
-                  multiquery(multiquery),
-                  qk_scale(qk_scale),
-                  grid_dim(grid_dim),
-                  block_dim(block_dim),
-                  lds_bytes(lds_bytes)
-            {
-            }
-        };
+            compute_vec_t vec = 0;
+            compute_t arr[vec_size];
+        } r;
+        for(int32_t w = 0; w < wavefronts_per_block; ++w)
+        {
+            compute_vec_t partial_r;
+            load_v<compute_t, compute_vec_t>(
+                smem, w * threads_per_wavefront + lane_idx, &partial_r);
+            r.vec += partial_r;
+        }
+        // elementwise convert from compute_t result to data_t out to be written
+        union
+        {
+            data_vec_t vec;
+            data_t arr[vec_size];
+        } bf_r;
+#pragma unroll
+        for(int32_t i = 0; i < vec_size; ++i)
+        {
+            bf_r.arr[i] = ck::type_convert<data_t>(r.arr[i]);
+        }
+        // write output row O[b][m][g][h][:]
+        data_t* __restrict__ o_ = O + XQO_base_offset;
+        store_v<data_t, data_vec_t>(o_, lane_idx, bf_r.vec);
+    }
+}
 
-        struct Invoker : public BaseInvoker
+} // namespace
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+template <typename scalar_t>
+struct FMHADecoderSeqlen1DeviceOp : public BaseOperator
+{
+    using DeviceOp = FMHADecoderSeqlen1DeviceOp;
+    struct Argument : public BaseArgument
+    {
+        const scalar_t* __restrict__ XQ;
+        const scalar_t* __restrict__ cache_K;
+        const scalar_t* __restrict__ cache_V;
+        scalar_t* __restrict__ O;
+        const int32_t* __restrict__ seq_kv_lens;
+        const ptrdiff_t XQ_stride_b;
+        const ptrdiff_t XQ_stride_m;
+        const ptrdiff_t XQ_stride_g;
+        const ptrdiff_t XQ_stride_h;
+        const ptrdiff_t K_stride_b;
+        const ptrdiff_t K_stride_m;
+        const ptrdiff_t K_stride_g;
+        const ptrdiff_t K_stride_h;
+        const int32_t Q_size_m;
+        const int32_t Q_size_g;
+        const int32_t Q_size_h;
+        const int32_t Q_size_k;
+        const int32_t K_size_m;
+        const bool multiquery;
+        const float qk_scale;
+
+        const dim3 grid_dim;
+        const dim3 block_dim;
+        const size_t lds_bytes;
+
+        Argument(const scalar_t* __restrict__ XQ,
+                 const scalar_t* __restrict__ cache_K,
+                 const scalar_t* __restrict__ cache_V,
+                 scalar_t* __restrict__ O,
+                 const int32_t* __restrict__ seq_kv_lens,
+                 const ptrdiff_t XQ_stride_b,
+                 const ptrdiff_t XQ_stride_m,
+                 const ptrdiff_t XQ_stride_g,
+                 const ptrdiff_t XQ_stride_h,
+                 const ptrdiff_t K_stride_b,
+                 const ptrdiff_t K_stride_m,
+                 const ptrdiff_t K_stride_g,
+                 const ptrdiff_t K_stride_h,
+                 const int32_t Q_size_m,
+                 const int32_t Q_size_g,
+                 const int32_t Q_size_h,
+                 const int32_t Q_size_k,
+                 const int32_t K_size_m,
+                 const bool multiquery,
+                 const float qk_scale,
+                 const dim3 grid_dim,
+                 const dim3 block_dim,
+                 const size_t lds_bytes)
+            : XQ(XQ),
+              cache_K(cache_K),
+              cache_V(cache_V),
+              O(O),
+              seq_kv_lens(seq_kv_lens),
+              XQ_stride_b(XQ_stride_b),
+              XQ_stride_m(XQ_stride_m),
+              XQ_stride_g(XQ_stride_g),
+              XQ_stride_h(XQ_stride_h),
+              K_stride_b(K_stride_b),
+              K_stride_m(K_stride_m),
+              K_stride_g(K_stride_g),
+              K_stride_h(K_stride_h),
+              Q_size_m(Q_size_m),
+              Q_size_g(Q_size_g),
+              Q_size_h(Q_size_h),
+              Q_size_k(Q_size_k),
+              K_size_m(K_size_m),
+              multiquery(multiquery),
+              qk_scale(qk_scale),
+              grid_dim(grid_dim),
+              block_dim(block_dim),
+              lds_bytes(lds_bytes)
         {
-            using Argument = DeviceOp::Argument;
-            float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{})
-            {
-                auto threads_per_wavefront = arg.block_dim.x;
+        }
+    };
 
-                auto Q_size_k_alignment_necessary = 0;
+    struct Invoker : public BaseInvoker
+    {
+        using Argument = DeviceOp::Argument;
+        float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{})
+        {
+            auto threads_per_wavefront = arg.block_dim.x;
 
-                for(auto vec_size : {4, 2, 1})
-                {
-                    if(arg.Q_size_k <= vec_size * threads_per_wavefront)
-                    {
-                        Q_size_k_alignment_necessary = vec_size;
-                    }
-                };
+            auto Q_size_k_alignment_necessary = 0;
 
-                if(!Q_size_k_alignment_necessary)
+            for(auto vec_size : {4, 2, 1})
+            {
+                if(arg.Q_size_k <= vec_size * threads_per_wavefront)
                 {
-                    throw std::runtime_error("Unsupported Q_size_k");
+                    Q_size_k_alignment_necessary = vec_size;
                 }
+            }
 
-                if(arg.Q_size_k % Q_size_k_alignment_necessary)
-                {
-                    throw std::runtime_error("Unsupported alignment for Q_size_k");
-                }
+            if(!Q_size_k_alignment_necessary)
+            {
+                throw std::runtime_error("Unsupported Q_size_k");
+            }
 
-                return launch_and_time_kernel(
-                    stream_config,
-                    Q_size_k_alignment_necessary == 4
-                        ? efficient_attention_forward_decoder_ck_kernel<scalar_t, 4>
-                    : Q_size_k_alignment_necessary == 2
-                        ? efficient_attention_forward_decoder_ck_kernel<scalar_t, 2>
-                    : Q_size_k_alignment_necessary == 1
-                        ? efficient_attention_forward_decoder_ck_kernel<scalar_t, 1>
-                        : nullptr,
-                    arg.grid_dim,
-                    arg.block_dim,
-                    arg.lds_bytes,
-                    arg.XQ,
-                    arg.cache_K,
-                    arg.cache_V,
-                    arg.O,
-                    arg.seq_kv_lens,
-                    arg.XQ_stride_b,
-                    arg.XQ_stride_m,
-                    arg.XQ_stride_g,
-                    arg.XQ_stride_h,
-                    arg.K_stride_b,
-                    arg.K_stride_m,
-                    arg.K_stride_g,
-                    arg.K_stride_h,
-                    arg.Q_size_m,
-                    arg.Q_size_g,
-                    arg.Q_size_h,
-                    arg.Q_size_k,
-                    arg.K_size_m,
-                    arg.multiquery,
-                    arg.qk_scale);
+            if(arg.Q_size_k % Q_size_k_alignment_necessary)
+            {
+                throw std::runtime_error("Unsupported alignment for Q_size_k");
             }
-        };
+
+            return launch_and_time_kernel(
+                stream_config,
+                Q_size_k_alignment_necessary == 4
+                    ? efficient_attention_forward_decoder_ck_kernel<scalar_t, 4>
+                : Q_size_k_alignment_necessary == 2
+                    ? efficient_attention_forward_decoder_ck_kernel<scalar_t, 2>
+                : Q_size_k_alignment_necessary == 1
+                    ? efficient_attention_forward_decoder_ck_kernel<scalar_t, 1>
+                    : nullptr,
+                arg.grid_dim,
+                arg.block_dim,
+                arg.lds_bytes,
+                arg.XQ,
+                arg.cache_K,
+                arg.cache_V,
+                arg.O,
+                arg.seq_kv_lens,
+                arg.XQ_stride_b,
+                arg.XQ_stride_m,
+                arg.XQ_stride_g,
+                arg.XQ_stride_h,
+                arg.K_stride_b,
+                arg.K_stride_m,
+                arg.K_stride_g,
+                arg.K_stride_h,
+                arg.Q_size_m,
+                arg.Q_size_g,
+                arg.Q_size_h,
+                arg.Q_size_k,
+                arg.K_size_m,
+                arg.multiquery,
+                arg.qk_scale);
+        }
     };
-    } // namespace device
-    } // namespace tensor_operation
-    } // namespace ck
+};
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
\ No newline at end of file

From 59d6e4f3bcb1b8aae5ec4b702d888c24ff6a1835 Mon Sep 17 00:00:00 2001
From: Qianfeng Zhang <Qianfeng.Zhang@amd.com>
Date: Wed, 6 Dec 2023 20:04:30 +0000
Subject: [PATCH 265/641] Fix to use long_index_t as offset types in the kernel

---
 .../hip_fmha/ck_tiled_fmha_forward_kernel.h   | 50 +++++++++++--------
 1 file changed, 29 insertions(+), 21 deletions(-)

diff --git a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_forward_kernel.h b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_forward_kernel.h
index 29c13540a..e0a3f14a0 100644
--- a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_forward_kernel.h
+++ b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_forward_kernel.h
@@ -425,11 +425,11 @@ struct FmhaFwdKernel
         const index_t i_m0 = __builtin_amdgcn_readfirstlane(i_tile_m * FmhaPipeline::kM0);
         const index_t i_n1 = __builtin_amdgcn_readfirstlane(i_tile_n * FmhaPipeline::kN1);
 
-        index_t batch_offset_q    = 0;
-        index_t batch_offset_k    = 0;
-        index_t batch_offset_v    = 0;
-        index_t batch_offset_bias = 0;
-        index_t batch_offset_o    = 0;
+        long_index_t batch_offset_q    = 0;
+        long_index_t batch_offset_k    = 0;
+        long_index_t batch_offset_v    = 0;
+        long_index_t batch_offset_bias = 0;
+        long_index_t batch_offset_o    = 0;
 
         if constexpr(kIsGroupMode)
         {
@@ -437,25 +437,26 @@ struct FmhaFwdKernel
             const index_t query_start = kargs.seqstart_q_ptr[i_batch];
             const index_t key_start   = kargs.seqstart_k_ptr[i_batch];
 
-            batch_offset_q = query_start * kargs.stride_q;
-            batch_offset_k = key_start * kargs.stride_k;
+            batch_offset_q = static_cast<long_index_t>(query_start) * kargs.stride_q;
+            batch_offset_k = static_cast<long_index_t>(key_start) * kargs.stride_k;
             if constexpr(ck::is_same_v<VLayout, ck::tensor_layout::gemm::RowMajor>)
             {
-                batch_offset_v = key_start * kargs.stride_v;
+                batch_offset_v = static_cast<long_index_t>(key_start) * kargs.stride_v;
             }
             else
             {
-                batch_offset_v = key_start;
+                batch_offset_v = static_cast<long_index_t>(key_start);
             }
             if constexpr(kSupportsBias)
             {
-                batch_offset_bias = query_start * kargs.stride_bias + key_start;
+                batch_offset_bias =
+                    static_cast<long_index_t>(query_start) * kargs.stride_bias + key_start;
             }
             else
             {
-                batch_offset_bias = key_start;
+                batch_offset_bias = static_cast<long_index_t>(key_start);
             }
-            batch_offset_o = query_start * kargs.stride_o;
+            batch_offset_o = static_cast<long_index_t>(query_start) * kargs.stride_o;
 
             // get real # queries & # keys under group mode
             const auto adjusted_seqstart_q_ptr = kargs.seqstart_q_ptr + i_batch;
@@ -476,21 +477,28 @@ struct FmhaFwdKernel
         }
         else
         {
-            batch_offset_q = i_batch * kargs.batch_stride_q;
-            batch_offset_k = i_batch * kargs.batch_stride_k;
-            batch_offset_v = i_batch * kargs.batch_stride_v;
+            batch_offset_q = static_cast<long_index_t>(i_batch) * kargs.batch_stride_q;
+            batch_offset_k = static_cast<long_index_t>(i_batch) * kargs.batch_stride_k;
+            batch_offset_v = static_cast<long_index_t>(i_batch) * kargs.batch_stride_v;
             if constexpr(kSupportsBias)
             {
-                batch_offset_bias = i_batch * kargs.batch_stride_bias;
+                batch_offset_bias = static_cast<long_index_t>(i_batch) * kargs.batch_stride_bias;
             }
-            batch_offset_o = i_batch * kargs.batch_stride_o;
+            batch_offset_o = static_cast<long_index_t>(i_batch) * kargs.batch_stride_o;
         }
 
         // for simplicity, batch stride we just modify the pointer
-        const QDataType* q_ptr = kargs.q_ptr + i_nhead * kargs.nhead_stride_q + batch_offset_q;
-        const KDataType* k_ptr = kargs.k_ptr + i_nhead * kargs.nhead_stride_k + batch_offset_k;
-        const VDataType* v_ptr = kargs.v_ptr + i_nhead * kargs.nhead_stride_v + batch_offset_v;
-        ODataType* o_ptr       = kargs.o_ptr + i_nhead * kargs.nhead_stride_o + batch_offset_o;
+        const QDataType* q_ptr = kargs.q_ptr +
+                                 static_cast<long_index_t>(i_nhead) * kargs.nhead_stride_q +
+                                 batch_offset_q;
+        const KDataType* k_ptr = kargs.k_ptr +
+                                 static_cast<long_index_t>(i_nhead) * kargs.nhead_stride_k +
+                                 batch_offset_k;
+        const VDataType* v_ptr = kargs.v_ptr +
+                                 static_cast<long_index_t>(i_nhead) * kargs.nhead_stride_v +
+                                 batch_offset_v;
+        ODataType* o_ptr = kargs.o_ptr + static_cast<long_index_t>(i_nhead) * kargs.nhead_stride_o +
+                           batch_offset_o;
 
         // Q/K/V DRAM and DRAM window
         const auto q_dram = [&]() {

From 08e598145251eda3f17313f48d7cb010c81b0291 Mon Sep 17 00:00:00 2001
From: Qianfeng Zhang <Qianfeng.Zhang@amd.com>
Date: Wed, 6 Dec 2023 20:09:10 +0000
Subject: [PATCH 266/641] Update the two benchmark scripts for ck-tiled to more
 be aligned with those of the non-tiled ones

---
 .../benchmarks/benchmark_mem_eff_attention_ck_tiled.py |  1 +
 .../benchmark_mem_eff_attn_decoder_ck_tiled.py         | 10 +++++++---
 2 files changed, 8 insertions(+), 3 deletions(-)

diff --git a/xformers/benchmarks/benchmark_mem_eff_attention_ck_tiled.py b/xformers/benchmarks/benchmark_mem_eff_attention_ck_tiled.py
index e9381e88a..ee0c111ff 100644
--- a/xformers/benchmarks/benchmark_mem_eff_attention_ck_tiled.py
+++ b/xformers/benchmarks/benchmark_mem_eff_attention_ck_tiled.py
@@ -159,6 +159,7 @@ def product_dict(**kwargs):
                 ##{"dropout_p": 0.3},
                 {"attn_bias_cfg": (torch.Tensor, False)},
                 ##{"attn_bias_cfg": (torch.Tensor, True)},
+                {"attn_bias_cfg": (xformers.ops.LowerTriangularMask, False)},
                 ##{"dtype": torch.bfloat16},
                 ##{"dtype": torch.float},
             ]
diff --git a/xformers/benchmarks/benchmark_mem_eff_attn_decoder_ck_tiled.py b/xformers/benchmarks/benchmark_mem_eff_attn_decoder_ck_tiled.py
index 0aea1b7c4..1e8239ace 100644
--- a/xformers/benchmarks/benchmark_mem_eff_attn_decoder_ck_tiled.py
+++ b/xformers/benchmarks/benchmark_mem_eff_attn_decoder_ck_tiled.py
@@ -119,6 +119,7 @@ def mem_eff_attention_decoder(
     torch.manual_seed(42)
     k_seqlen = torch.randint(1, n_keys + 1, (B,)).tolist()
     K = 128
+    ##dtype = torch.bfloat16
     dtype = torch.float16
     q = torch.rand(1, B, n_heads, K, device=device, dtype=dtype)
     if multiquery:
@@ -132,9 +133,10 @@ def mem_eff_attention_decoder(
         k = torch.rand(1, B * padding, n_heads, K, device=device, dtype=dtype)
         v = torch.rand(1, B * padding, n_heads, K, device=device, dtype=dtype)
 
-    bias = fmha.attn_bias.BlockDiagonalMask.from_seqlens(
+    bias = fmha.attn_bias.BlockDiagonalCausalWithOffsetPaddedKeysMask.from_seqlens(
         q_seqlen=[1] * B,
         kv_seqlen=k_seqlen,
+        kv_padding=padding,
     )
 
     sub_label = f"{B}batch-{k_seqlen[0]}keys-{n_heads}heads"
@@ -151,6 +153,8 @@ def mem_eff_attention_decoder(
 
         fn = partial(xformers.ops.memory_efficient_attention_forward, op=fw_op)
 
+        mem_size = get_memory_traffic(fw_op, q, k, v, bias)
+
         yield benchmark.Timer(
             stmt=f"fn(q, k, v, attn_bias)",
             globals={
@@ -162,7 +166,7 @@ def mem_eff_attention_decoder(
             },
             label="attention",
             description=fw_op.NAME,
-            sub_label=sub_label,
+            sub_label=f"{sub_label}_{mem_size//1024}k",
             num_threads=num_threads,
         )
 
@@ -176,7 +180,7 @@ def mem_eff_attention_decoder(
             },
             label="cuda graphed attention",
             description=fw_op.NAME,
-            sub_label=sub_label,
+            sub_label=f"{sub_label}_{mem_size//1024}k",
             num_threads=num_threads,
         )
 

From 3616eceb90b5f87550e32d29c3c87f28409c7803 Mon Sep 17 00:00:00 2001
From: Max Podkorytov <4273004+tenpercent@users.noreply.github.com>
Date: Wed, 6 Dec 2023 22:52:26 -0500
Subject: [PATCH 267/641] use clang-format-10

---
 .../hip_fmha/attention_forward_decoder.cpp     | 18 +++++++++---------
 .../hip_fmha/ck_attention_forward_decoder.h    | 10 +++++-----
 2 files changed, 14 insertions(+), 14 deletions(-)

diff --git a/xformers/csrc/attention/hip_fmha/attention_forward_decoder.cpp b/xformers/csrc/attention/hip_fmha/attention_forward_decoder.cpp
index 76fd3228c..99de91741 100644
--- a/xformers/csrc/attention/hip_fmha/attention_forward_decoder.cpp
+++ b/xformers/csrc/attention/hip_fmha/attention_forward_decoder.cpp
@@ -258,15 +258,15 @@ int main(int argc, char** argv)
                       << std::endl;
             return 0;
         }
-        const int32_t n_keys                 = std::stoi(args[0]);
-        const int32_t padding                = std::stoi(args[1]);
-        const int32_t batch_size             = std::stoi(args[2]);
-        const int32_t n_heads                = std::stoi(args[3]);
-        const int32_t n_groups               = 1;
-        const int32_t multiquery             = (args[4] == "mq");
-        const auto dtype                     = (args[5] == "f32")   ? torch::kFloat32
-                                               : (args[5] == "f16") ? torch::kFloat16
-                                                                    : torch::kBFloat16;
+        const int32_t n_keys     = std::stoi(args[0]);
+        const int32_t padding    = std::stoi(args[1]);
+        const int32_t batch_size = std::stoi(args[2]);
+        const int32_t n_heads    = std::stoi(args[3]);
+        const int32_t n_groups   = 1;
+        const int32_t multiquery = (args[4] == "mq");
+        const auto dtype         = (args[5] == "f32")
+                               ? torch::kFloat32
+                               : (args[5] == "f16") ? torch::kFloat16 : torch::kBFloat16;
         const int32_t n_wavefronts_per_block = std::stoi(args[6]);
 
         const int32_t dim_per_head = 4 * kThreadsPerWavefront;
diff --git a/xformers/csrc/attention/hip_fmha/ck_attention_forward_decoder.h b/xformers/csrc/attention/hip_fmha/ck_attention_forward_decoder.h
index 381bb4ed8..08d0dbe06 100644
--- a/xformers/csrc/attention/hip_fmha/ck_attention_forward_decoder.h
+++ b/xformers/csrc/attention/hip_fmha/ck_attention_forward_decoder.h
@@ -528,11 +528,11 @@ struct FMHADecoderSeqlen1DeviceOp : public BaseOperator
                 stream_config,
                 Q_size_k_alignment_necessary == 4
                     ? efficient_attention_forward_decoder_ck_kernel<scalar_t, 4>
-                : Q_size_k_alignment_necessary == 2
-                    ? efficient_attention_forward_decoder_ck_kernel<scalar_t, 2>
-                : Q_size_k_alignment_necessary == 1
-                    ? efficient_attention_forward_decoder_ck_kernel<scalar_t, 1>
-                    : nullptr,
+                    : Q_size_k_alignment_necessary == 2
+                          ? efficient_attention_forward_decoder_ck_kernel<scalar_t, 2>
+                          : Q_size_k_alignment_necessary == 1
+                                ? efficient_attention_forward_decoder_ck_kernel<scalar_t, 1>
+                                : nullptr,
                 arg.grid_dim,
                 arg.block_dim,
                 arg.lds_bytes,

From 26f9b58b8bf5bad18d9fd066c8154b7a12bd1393 Mon Sep 17 00:00:00 2001
From: Qianfeng Zhang <Qianfeng.Zhang@amd.com>
Date: Thu, 7 Dec 2023 08:41:21 +0000
Subject: [PATCH 268/641] Reduce static_cast<long_index_t> in the kernel

---
 .../hip_fmha/ck_tiled_fmha_forward_kernel.h   | 20 +++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

diff --git a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_forward_kernel.h b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_forward_kernel.h
index e0a3f14a0..5b6f54a22 100644
--- a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_forward_kernel.h
+++ b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_forward_kernel.h
@@ -434,29 +434,29 @@ struct FmhaFwdKernel
         if constexpr(kIsGroupMode)
         {
             // get starting offset for each batch
-            const index_t query_start = kargs.seqstart_q_ptr[i_batch];
-            const index_t key_start   = kargs.seqstart_k_ptr[i_batch];
+            const long_index_t query_start =
+                static_cast<long_index_t>(kargs.seqstart_q_ptr[i_batch]);
+            const long_index_t key_start = static_cast<long_index_t>(kargs.seqstart_k_ptr[i_batch]);
 
-            batch_offset_q = static_cast<long_index_t>(query_start) * kargs.stride_q;
-            batch_offset_k = static_cast<long_index_t>(key_start) * kargs.stride_k;
+            batch_offset_q = query_start * kargs.stride_q;
+            batch_offset_k = key_start * kargs.stride_k;
             if constexpr(ck::is_same_v<VLayout, ck::tensor_layout::gemm::RowMajor>)
             {
-                batch_offset_v = static_cast<long_index_t>(key_start) * kargs.stride_v;
+                batch_offset_v = key_start * kargs.stride_v;
             }
             else
             {
-                batch_offset_v = static_cast<long_index_t>(key_start);
+                batch_offset_v = key_start;
             }
             if constexpr(kSupportsBias)
             {
-                batch_offset_bias =
-                    static_cast<long_index_t>(query_start) * kargs.stride_bias + key_start;
+                batch_offset_bias = query_start * kargs.stride_bias + key_start;
             }
             else
             {
-                batch_offset_bias = static_cast<long_index_t>(key_start);
+                batch_offset_bias = key_start;
             }
-            batch_offset_o = static_cast<long_index_t>(query_start) * kargs.stride_o;
+            batch_offset_o = query_start * kargs.stride_o;
 
             // get real # queries & # keys under group mode
             const auto adjusted_seqstart_q_ptr = kargs.seqstart_q_ptr + i_batch;

From 09233e3e3aa6c1f5237157dee7b7e9de4d4c181b Mon Sep 17 00:00:00 2001
From: Qianfeng Zhang <Qianfeng.Zhang@amd.com>
Date: Thu, 7 Dec 2023 13:20:15 +0000
Subject: [PATCH 269/641] Add nhead_ratio_qk kernel argument to support mqa/gqa

---
 .../hip_fmha/ck_tiled_fmha_batched_infer.h    | 18 ++++----
 .../hip_fmha/ck_tiled_fmha_forward_kernel.h   | 43 +++++++++++++------
 .../hip_fmha/ck_tiled_fmha_grouped_infer.h    | 10 +++--
 3 files changed, 45 insertions(+), 26 deletions(-)

diff --git a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_infer.h b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_infer.h
index 336228f6f..3003fa404 100644
--- a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_infer.h
+++ b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_infer.h
@@ -171,10 +171,11 @@ struct batched_infer_masktype_attnbias_dispatched
                     param.k_ptr,
                     param.v_ptr,
                     param.out_ptr,
-                    param.M,  // seqlen_q
-                    param.N,  // seqlen_k
-                    param.K,  // hdim_q
-                    param.Kv, // hdim_v
+                    param.M,              // seqlen_q
+                    param.N,              // seqlen_k
+                    param.K,              // hdim_q
+                    param.Kv,             // hdim_v
+                    param.Hq / param.Hkv, // nhead_ratio_qk
                     param.scale,
                     param.q_strides[1], // q, k, v, out tensor seq-dim stride
                     param.k_strides[1],
@@ -197,10 +198,11 @@ struct batched_infer_masktype_attnbias_dispatched
                     param.k_ptr,
                     param.v_ptr,
                     param.out_ptr,
-                    param.M,  // seqlen_q
-                    param.N,  // seqlen_k
-                    param.K,  // hdim_q
-                    param.Kv, // hdim_v
+                    param.M,              // seqlen_q
+                    param.N,              // seqlen_k
+                    param.K,              // hdim_q
+                    param.Kv,             // hdim_v
+                    param.Hq / param.Hkv, // nhead_ratio_qk
                     param.scale,
                     param.q_strides[1], // q, k, v, out tensor seq-dim stride
                     param.k_strides[1],
diff --git a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_forward_kernel.h b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_forward_kernel.h
index 5b6f54a22..534c2c588 100644
--- a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_forward_kernel.h
+++ b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_forward_kernel.h
@@ -60,6 +60,7 @@ struct FmhaFwdKernel
                                        ck::index_t seqlen_k_,
                                        ck::index_t hdim_q_,
                                        ck::index_t hdim_v_,
+                                       ck::index_t nhead_ratio_qk_,
                                        float scale_,
                                        ck::index_t stride_q_,
                                        ck::index_t stride_k_,
@@ -77,6 +78,7 @@ struct FmhaFwdKernel
               seqlen_k{seqlen_k_},
               hdim_q{hdim_q_},
               hdim_v{hdim_v_},
+              nhead_ratio_qk{nhead_ratio_qk_},
               scale{scale_},
               stride_q{stride_q_},
               stride_k{stride_k_},
@@ -98,6 +100,7 @@ struct FmhaFwdKernel
         ck::index_t seqlen_k;
         ck::index_t hdim_q;
         ck::index_t hdim_v;
+        ck::index_t nhead_ratio_qk;
 
         float scale;
 
@@ -135,6 +138,7 @@ struct FmhaFwdKernel
                                           ck::index_t seqlen_k_,
                                           ck::index_t hdim_q_,
                                           ck::index_t hdim_v_,
+                                          ck::index_t nhead_ratio_qk_,
                                           float scale_,
                                           ck::index_t stride_q_,
                                           ck::index_t stride_k_,
@@ -156,6 +160,7 @@ struct FmhaFwdKernel
                           seqlen_k_,
                           hdim_q_,
                           hdim_v_,
+                          nhead_ratio_qk_,
                           scale_,
                           stride_q_,
                           stride_k_,
@@ -190,6 +195,7 @@ struct FmhaFwdKernel
                                           const void* seqlen_k_ptr_,
                                           ck::index_t hdim_q_,
                                           ck::index_t hdim_v_,
+                                          ck::index_t nhead_ratio_qk_,
                                           float scale_,
                                           ck::index_t stride_q_,
                                           ck::index_t stride_k_,
@@ -207,6 +213,7 @@ struct FmhaFwdKernel
                           -1 /* will be updated inside the kernel */,
                           hdim_q_,
                           hdim_v_,
+                          nhead_ratio_qk_,
                           scale_,
                           stride_q_,
                           stride_k_,
@@ -239,6 +246,7 @@ struct FmhaFwdKernel
                                                                       ck::index_t seqlen_k,
                                                                       ck::index_t hdim_q,
                                                                       ck::index_t hdim_v,
+                                                                      ck::index_t nhead_ratio_qk,
                                                                       float scale,
                                                                       ck::index_t stride_q,
                                                                       ck::index_t stride_k,
@@ -254,10 +262,10 @@ struct FmhaFwdKernel
                                                                       ck::index_t batch_stride_o)
     {
         return Kargs{q_ptr,          k_ptr,          v_ptr,          o_ptr,          seqlen_q,
-                     seqlen_k,       hdim_q,         hdim_v,         scale,          stride_q,
-                     stride_k,       stride_v,       stride_o,       nhead_stride_q, nhead_stride_k,
-                     nhead_stride_v, nhead_stride_o, batch_stride_q, batch_stride_k, batch_stride_v,
-                     batch_stride_o};
+                     seqlen_k,       hdim_q,         hdim_v,         nhead_ratio_qk, scale,
+                     stride_q,       stride_k,       stride_v,       stride_o,       nhead_stride_q,
+                     nhead_stride_k, nhead_stride_v, nhead_stride_o, batch_stride_q, batch_stride_k,
+                     batch_stride_v, batch_stride_o};
     }
 
     template <bool Cond = !kIsGroupMode && kSupportsBias>
@@ -270,6 +278,7 @@ struct FmhaFwdKernel
               ck::index_t seqlen_k,
               ck::index_t hdim_q,
               ck::index_t hdim_v,
+              ck::index_t nhead_ratio_qk,
               float scale,
               ck::index_t stride_q,
               ck::index_t stride_k,
@@ -287,10 +296,10 @@ struct FmhaFwdKernel
                   std::nullopt)
     {
         Kargs kargs{q_ptr,          k_ptr,          v_ptr,          o_ptr,          seqlen_q,
-                    seqlen_k,       hdim_q,         hdim_v,         scale,          stride_q,
-                    stride_k,       stride_v,       stride_o,       nhead_stride_q, nhead_stride_k,
-                    nhead_stride_v, nhead_stride_o, batch_stride_q, batch_stride_k, batch_stride_v,
-                    batch_stride_o};
+                    seqlen_k,       hdim_q,         hdim_v,         nhead_ratio_qk, scale,
+                    stride_q,       stride_k,       stride_v,       stride_o,       nhead_stride_q,
+                    nhead_stride_k, nhead_stride_v, nhead_stride_o, batch_stride_q, batch_stride_k,
+                    batch_stride_v, batch_stride_o};
 
         if(bias.has_value())
         {
@@ -313,6 +322,7 @@ struct FmhaFwdKernel
                                                                       const void* seqlen_k_ptr,
                                                                       ck::index_t hdim_q,
                                                                       ck::index_t hdim_v,
+                                                                      ck::index_t nhead_ratio_qk,
                                                                       float scale,
                                                                       ck::index_t stride_q,
                                                                       ck::index_t stride_k,
@@ -332,6 +342,7 @@ struct FmhaFwdKernel
                      seqlen_k_ptr,
                      hdim_q,
                      hdim_v,
+                     nhead_ratio_qk,
                      scale,
                      stride_q,
                      stride_k,
@@ -354,6 +365,7 @@ struct FmhaFwdKernel
               const void* seqlen_k_ptr,
               ck::index_t hdim_q,
               ck::index_t hdim_v,
+              ck::index_t nhead_ratio_qk,
               float scale,
               ck::index_t stride_q,
               ck::index_t stride_k,
@@ -374,6 +386,7 @@ struct FmhaFwdKernel
                     seqlen_k_ptr,
                     hdim_q,
                     hdim_v,
+                    nhead_ratio_qk,
                     scale,
                     stride_q,
                     stride_k,
@@ -491,12 +504,14 @@ struct FmhaFwdKernel
         const QDataType* q_ptr = kargs.q_ptr +
                                  static_cast<long_index_t>(i_nhead) * kargs.nhead_stride_q +
                                  batch_offset_q;
-        const KDataType* k_ptr = kargs.k_ptr +
-                                 static_cast<long_index_t>(i_nhead) * kargs.nhead_stride_k +
-                                 batch_offset_k;
-        const VDataType* v_ptr = kargs.v_ptr +
-                                 static_cast<long_index_t>(i_nhead) * kargs.nhead_stride_v +
-                                 batch_offset_v;
+        const KDataType* k_ptr =
+            kargs.k_ptr +
+            static_cast<long_index_t>(i_nhead / kargs.nhead_ratio_qk) * kargs.nhead_stride_k +
+            batch_offset_k;
+        const VDataType* v_ptr =
+            kargs.v_ptr +
+            static_cast<long_index_t>(i_nhead / kargs.nhead_ratio_qk) * kargs.nhead_stride_v +
+            batch_offset_v;
         ODataType* o_ptr = kargs.o_ptr + static_cast<long_index_t>(i_nhead) * kargs.nhead_stride_o +
                            batch_offset_o;
 
diff --git a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_infer.h b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_infer.h
index 89b4348f3..abd0b9fc6 100644
--- a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_infer.h
+++ b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_infer.h
@@ -143,8 +143,9 @@ struct grouped_infer_masktype_attnbias_dispatched
                     param.seqstart_q_dev_ptr,
                     param.seqstart_k_dev_ptr,
                     param.seqlen_k_dev_ptr,
-                    param.K,  // hdim_q
-                    param.Kv, // hdim_v
+                    param.K,              // hdim_q
+                    param.Kv,             // hdim_v
+                    param.Hq / param.Hkv, // nhead_ratio_qk
                     param.scale,
                     param.q_strides[0], // q, k, v, out tensor seq-dim stride
                     param.k_strides[0],
@@ -166,8 +167,9 @@ struct grouped_infer_masktype_attnbias_dispatched
                     param.seqstart_q_dev_ptr,
                     param.seqstart_k_dev_ptr,
                     param.seqlen_k_dev_ptr,
-                    param.K,  // hdim_q
-                    param.Kv, // hdim_v
+                    param.K,              // hdim_q
+                    param.Kv,             // hdim_v
+                    param.Hq / param.Hkv, // nhead_ratio_qk
                     param.scale,
                     param.q_strides[0], // q, k, v, out tensor seq-dim stride
                     param.k_strides[0],

From 9030f5606e4363c823a27fdf289faa929fe78b18 Mon Sep 17 00:00:00 2001
From: Qianfeng Zhang <Qianfeng.Zhang@amd.com>
Date: Thu, 7 Dec 2023 17:14:08 +0000
Subject: [PATCH 270/641] Update test_forward_ck_tiled.py to synchronize
 ref_attention from test_mem_eff_attention_ck.py

---
 tests/test_forward_ck_tiled.py | 25 +++++++++++++++++++++++++
 1 file changed, 25 insertions(+)

diff --git a/tests/test_forward_ck_tiled.py b/tests/test_forward_ck_tiled.py
index 3c5419525..c8a60dee3 100644
--- a/tests/test_forward_ck_tiled.py
+++ b/tests/test_forward_ck_tiled.py
@@ -209,6 +209,26 @@ def _generate_op_device_dtype_biasT_B_Mq_Mkv_H_K_Kv(
 
 
 def ref_attention(q, k, v, attn_bias=None, drop_mask=None, p=0.0, scale=None, dtype=None):
+    if q.ndim == 5:
+        def attn_bias_group(group: int):
+            if isinstance(attn_bias, torch.Tensor):
+                return attn_bias[:, group]
+            if isinstance(attn_bias, fmha.attn_bias.LowerTriangularMaskWithTensorBias):
+                return fmha.attn_bias.LowerTriangularMaskWithTensorBias(
+                    attn_bias._bias[:, group]
+                )
+            return attn_bias
+
+        return torch.stack(
+            [
+                ref_attention_bmhk(
+                    q[:, :, g], k[:, :, g], v[:, :, g], attn_bias=attn_bias_group(g), dtype=dtype
+                )
+                for g in range(q.shape[2])
+            ],
+            dim=2,
+        )
+     
     if q.ndim == 4:
         assert p == 0.0
         return ref_attention_bmhk(q, k, v, attn_bias=attn_bias, dtype=dtype)
@@ -582,10 +602,14 @@ def test_forward(
     if not (k == kv and (kv == 64 or kv == 128)):
         pytest.skip("only head-dim size 64 or 128 supported by ck-tiled!")
 
+    if kv > 128:
+        pytest.skip("kv > 128 is not supported by CK-FlashAttention")
+
     if packed and not (k == kv and q_len == kv_len):
         pytest.skip(
             f"packed incompatible with `k ({k}) != kv ({kv})` or `q_len ({q_len}) != kv_len ({kv_len})`"
         )
+
     if fmt == "BMK" and not fmha.common._is_bias_type_supported_in_BMK(bias_type):
         pytest.skip("BMK incompatible with this bias")
 
@@ -637,3 +661,4 @@ def test_forward(
         atol=op.ERROR_ATOL[dtype],
         rtol=op.ERROR_RTOL.get(dtype, 1e-5),
     )
+

From 1f5952e5c39e7952d1c263b0277984ffcedde5ce Mon Sep 17 00:00:00 2001
From: Max Podkorytov <4273004+tenpercent@users.noreply.github.com>
Date: Thu, 7 Dec 2023 17:25:00 -0500
Subject: [PATCH 271/641] fix ck_decoder op to run again with bmghk inputs

---
 xformers/ops/fmha/ck_decoder.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/xformers/ops/fmha/ck_decoder.py b/xformers/ops/fmha/ck_decoder.py
index ff4a0fd60..daa4689b8 100644
--- a/xformers/ops/fmha/ck_decoder.py
+++ b/xformers/ops/fmha/ck_decoder.py
@@ -19,6 +19,7 @@ class FwOp(AttentionFwOpBase):
     SUPPORTED_ATTN_BIAS_TYPES: Set[Any] = {type(None), BlockDiagonalCausalWithOffsetPaddedKeysMask}
     SUPPORTS_DROPOUT = False
     SUPPORTS_CUSTOM_SCALE = True
+    SUPPORTS_BMGHK = True
     NAME = "ck_decoderF"
 
     @classmethod

From 0cbacb2eb3db0474b897973808523276c918b994 Mon Sep 17 00:00:00 2001
From: Qianfeng Zhang <Qianfeng.Zhang@amd.com>
Date: Fri, 8 Dec 2023 00:30:23 +0000
Subject: [PATCH 272/641] Add test_mqa_forward and some change to ref_attention

---
 tests/test_forward_ck_tiled.py | 124 +++++++++++++++++++++++++++++----
 1 file changed, 112 insertions(+), 12 deletions(-)

diff --git a/tests/test_forward_ck_tiled.py b/tests/test_forward_ck_tiled.py
index c8a60dee3..6a7512f22 100644
--- a/tests/test_forward_ck_tiled.py
+++ b/tests/test_forward_ck_tiled.py
@@ -207,31 +207,43 @@ def _generate_op_device_dtype_biasT_B_Mq_Mkv_H_K_Kv(
     **_generate_op_device_dtype_biasT_B_Mq_Mkv_H_K_Kv(ALL_BW_OPS, max_shapes_per_op=1),
 )
 
-
 def ref_attention(q, k, v, attn_bias=None, drop_mask=None, p=0.0, scale=None, dtype=None):
-    if q.ndim == 5:
-        def attn_bias_group(group: int):
+    if q.ndim == 4:
+        B, M, Hq, K = q.shape
+        _, N, Hkv, Kv = v.shape
+        nhead_ratio_qk = Hq // Hkv
+
+        def attn_bias_head(head: int):
             if isinstance(attn_bias, torch.Tensor):
-                return attn_bias[:, group]
+                assert attn_bias.ndim == 4
+                _, H, _, _ = attn_bias.shape        
+                assert H == Hq
+                bias_bghmn = attn_bias.reshape(B, Hkv, nhead_ratio_qk, M, N)
+                return bias_bghmn[:, :, head]
             if isinstance(attn_bias, fmha.attn_bias.LowerTriangularMaskWithTensorBias):
+                assert attn_bias._bias.ndim == 4
+                _, H, _, _ = attn_bias._bias.shape        
+                assert H == Hq
+                bias_bghmn = attn_bias._bias.reshape(B, Hkv, nhead_ratio_qk, M, N)
+
                 return fmha.attn_bias.LowerTriangularMaskWithTensorBias(
-                    attn_bias._bias[:, group]
+                    bias_bghmn[:, :, head]
                 )
             return attn_bias
 
+        q_bmghk = q.reshape((B, M, Hkv, nhead_ratio_qk, K))
+
         return torch.stack(
             [
                 ref_attention_bmhk(
-                    q[:, :, g], k[:, :, g], v[:, :, g], attn_bias=attn_bias_group(g), dtype=dtype
+                    q_bmghk[:, :, :, h], k, v, attn_bias=attn_bias_head(h), dtype=dtype
                 )
-                for g in range(q.shape[2])
+                for h in range(q_bmghk.shape[3])
             ],
-            dim=2,
-        )
+            dim=3,
+        ).reshape((B, M, Hq, Kv))
      
-    if q.ndim == 4:
-        assert p == 0.0
-        return ref_attention_bmhk(q, k, v, attn_bias=attn_bias, dtype=dtype)
+    assert q.ndim == 3
     if dtype is None:
         dtype = torch.float32
     q = q.to(dtype=dtype)
@@ -662,3 +674,91 @@ def test_forward(
         rtol=op.ERROR_RTOL.get(dtype, 1e-5),
     )
 
+@pytest.mark.parametrize("hdim_k,hdim_v", [(64, 64), (128, 128)])
+@pytest.mark.parametrize("nhead_q,nhead_kv", [(8, 1), (8, 2), (12, 4), (4, 4)])
+@pytest.mark.parametrize("seqlen_q,seqlen_kv", [(100, 128), (128, 100), (200, 1000), (400, 300)])
+@pytest.mark.parametrize("batches", [100, 64, 1])
+@pytest.mark.parametrize("dtype", [torch.float16])
+@pytest.mark.parametrize("attn_bias_type", [type(None), torch.Tensor, fmha.attn_bias.LowerTriangularMask])
+@pytest.mark.parametrize("op", [fmha.ck.FwOp])
+def test_mqa_forward(
+    op,
+    attn_bias_type,
+    dtype, 
+    batches: int, 
+    seqlen_kv: int, 
+    seqlen_q: int, 
+    nhead_kv: int, 
+    nhead_q: int, 
+    hdim_v: int, 
+    hdim_k: int, 
+):
+    B = batches
+    M = seqlen_q
+    N = seqlen_kv
+    Hq = nhead_q
+    Hkv = nhead_kv
+    K = hdim_k
+    Kv = hdim_v
+
+    print("Hq=", Hq, "Hkv=", Hkv)
+
+    device = torch.device("cuda")
+
+    if dtype is torch.bfloat16:
+        pytest.skip("bfloat16 is currently not supported by ck-tiled!")
+
+    if not (K == Kv and (Kv == 64 or Kv == 128)):
+        pytest.skip("only head-dim size 64 or 128 supported by ck-tiled!")
+
+    if Kv > 128:
+        pytest.skip("kv > 128 is not supported by CK-FlashAttention")
+
+    scale = 3
+    query = torch.randn((B, M, Hq, K), device=device, dtype=dtype).mul_(scale)
+    key = torch.randn((B, N, Hkv, K), device=device, dtype=dtype).mul_(scale)
+    value = torch.randn((B, N, Hkv, Kv), device=device, dtype=dtype).mul_(scale)
+
+    attn_bias = None
+    if attn_bias_type is not None:
+        attn_bias = create_attn_bias(
+            attn_bias_type,
+            batch_size=B,
+            num_heads=Hq,
+            q_len=M,
+            kv_len=N,
+            dtype=dtype,
+            device=device,
+            requires_grad=False,
+            fmt="BMHK",
+            op=op,
+        )
+
+    inputs = fmha.Inputs(query=query, key=key, value=value, attn_bias=attn_bias)
+    reasons = op.not_supported_reasons(inputs)
+    if reasons:
+        err_msg = f"{op.NAME}: unsupported ({'/'.join(reasons)})"
+        # Ensure we free memory to avoid OOMs
+        del query, key, value, attn_bias, inputs
+
+    out = xformers.ops.memory_efficient_attention_forward(
+        query, key, value, attn_bias, op=op
+    )
+    assert not out.isnan().any(), ("Output has NaNs", attn_bias)
+    out2 = xformers.ops.memory_efficient_attention_forward(
+        query, key, value, attn_bias, op=op
+    )
+    assert torch.allclose(out, out2, atol=0.0, rtol=0.0), (
+        "Non-deterministic behavior",
+        attn_bias,
+    )
+
+    ref = ref_attention(query, key, value, attn_bias)
+    assert out.shape == ref.shape, out.shape
+    assert_allclose(
+        out.float(),
+        ref,
+        atol=op.ERROR_ATOL[dtype],
+        rtol=op.ERROR_RTOL.get(dtype, 1e-5),
+    )
+

From a74d5f39a7d285de63ce825a47bf91cfe6715e68 Mon Sep 17 00:00:00 2001
From: Max Podkorytov <4273004+tenpercent@users.noreply.github.com>
Date: Thu, 2 Nov 2023 12:50:40 -0400
Subject: [PATCH 273/641] implement boilerplate which creates an xformers op
 and binds it with a backend implementation

```
$> python -m xformers.info
...
memory_efficient_attention.ck_splitKF:             available
...
```
---
 xformers/csrc/attention/attention.cpp         |   2 +
 .../hip_fmha/attention_decoder_splitk.cpp     |   8 +
 .../hip_fmha/attention_forward_splitk.cpp     |  53 ++++++
 xformers/ops/__init__.py                      |   2 +
 xformers/ops/fmha/__init__.py                 |   5 +
 xformers/ops/fmha/forward_splitk.py           | 151 ++++++++++++++++++
 6 files changed, 221 insertions(+)
 create mode 100644 xformers/csrc/attention/hip_fmha/attention_decoder_splitk.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/attention_forward_splitk.cpp
 create mode 100644 xformers/ops/fmha/forward_splitk.py

diff --git a/xformers/csrc/attention/attention.cpp b/xformers/csrc/attention/attention.cpp
index d243a0616..5f802e56a 100644
--- a/xformers/csrc/attention/attention.cpp
+++ b/xformers/csrc/attention/attention.cpp
@@ -48,6 +48,8 @@ TORCH_LIBRARY_FRAGMENT(xformers, m) {
           "Tensor key, Tensor value, Tensor? seq_positions, float scale) -> Tensor"));
   m.def(TORCH_SELECTIVE_SCHEMA(
       "xformers::efficient_attention_backward_ck(Tensor grad_out, Tensor query, Tensor key, Tensor value, Tensor? attn_bias, Tensor? seqstart_q, Tensor? seqstart_k, int? max_seqlen_q, Tensor? seqlen_k, Tensor logsumexp, Tensor output, float dropout_p, int rng_seed, int rng_offset, int custom_mask_type, float? scale) -> (Tensor, Tensor, Tensor, Tensor)"));
+  m.def(TORCH_SELECTIVE_SCHEMA(
+      "xformers::efficient_attention_forward_decoder_splitk_ck(Tensor query, Tensor key, Tensor value, Tensor seq_positions, float scale, int split_k) -> Tensor"));
   m.def(TORCH_SELECTIVE_SCHEMA(
       "xformers::_ck_rand_uniform(float p, Tensor out) -> Tensor"));
 #endif
diff --git a/xformers/csrc/attention/hip_fmha/attention_decoder_splitk.cpp b/xformers/csrc/attention/hip_fmha/attention_decoder_splitk.cpp
new file mode 100644
index 000000000..e535ddb7e
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/attention_decoder_splitk.cpp
@@ -0,0 +1,8 @@
+#include <ATen/Dispatch.h>
+#include <ATen/Functions.h>
+#include <ATen/Tensor.h>
+#include <c10/cuda/CUDAStream.h>
+#include <ck/utility/data_type.hpp>
+#include <ck/utility/inner_product.hpp>
+#include <torch/library.h>
+
diff --git a/xformers/csrc/attention/hip_fmha/attention_forward_splitk.cpp b/xformers/csrc/attention/hip_fmha/attention_forward_splitk.cpp
new file mode 100644
index 000000000..dc859c2ee
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/attention_forward_splitk.cpp
@@ -0,0 +1,53 @@
+#include <ATen/Dispatch.h>
+#include <ATen/Functions.h>
+#include <ATen/Tensor.h>
+#include <c10/cuda/CUDAStream.h>
+#include <torch/library.h>
+
+namespace {
+
+at::Tensor efficient_attention_forward_decoder_splitk_ck(
+    const at::Tensor& XQ, // [B, 1, H, D]
+    const at::Tensor& cache_K, // [B, T_MAX, H or 1, D]
+    const at::Tensor& cache_V, // [B, T_MAX, H or 1, D]
+    const at::Tensor& seq_positions, // [B]
+    double qk_scale,
+    int64_t split_k) {
+    
+    at::OptionalDeviceGuard guard(XQ.device());
+
+    TORCH_CHECK(XQ.is_cuda());
+    TORCH_CHECK(cache_K.is_cuda());
+    TORCH_CHECK(cache_V.is_cuda());
+
+    TORCH_CHECK(seq_positions.is_cuda());
+
+    auto B = XQ.size(0);
+    auto M = XQ.size(1);
+    auto G = XQ.size(2);
+    auto H = XQ.size(3);
+    auto K_q = XQ.size(4);
+    auto M_k = cache_K.size(1);
+
+    constexpr auto BLOCK_M = 16;
+
+    auto M_ceil = (M + BLOCK_M - 1) / BLOCK_M * BLOCK_M;
+
+    const auto options = at::TensorOptions()
+                             .dtype(XQ.dtype())
+                             .layout(at::kStrided)
+                             .device(XQ.device())
+                             .requires_grad(false);
+
+    auto O = at::empty({B * G * H, split_k, M_ceil, K_q}, options);
+    auto metadata = at::empty({B * G * H, 2, split_k, M_ceil}, options);
+
+    return O;
+}
+}
+
+TORCH_LIBRARY_IMPL(xformers, CUDA, m) {
+  m.impl(
+      TORCH_SELECTIVE_NAME("xformers::efficient_attention_forward_decoder_splitk_ck"),
+      TORCH_FN(efficient_attention_forward_decoder_splitk_ck));
+}
\ No newline at end of file
diff --git a/xformers/ops/__init__.py b/xformers/ops/__init__.py
index d14468c2b..e0e12df4b 100644
--- a/xformers/ops/__init__.py
+++ b/xformers/ops/__init__.py
@@ -18,6 +18,7 @@
     MemoryEfficientAttentionTritonFwdFlashBwOp,
     TritonFlashAttentionOp,
     MemoryEfficientAttentionCkOp,
+    MemoryEfficientAttentionSplitKCkOp,
     memory_efficient_attention,
     memory_efficient_attention_backward,
     memory_efficient_attention_forward,
@@ -75,6 +76,7 @@ def masked_matmul(a, b, mask=None):
     "MemoryEfficientAttentionOp",
     "MemoryEfficientAttentionTritonFwdFlashBwOp",
     "MemoryEfficientAttentionCkOp",
+    "MemoryEfficientAttentionSplitKCkOp",
     "memory_efficient_attention_backward",
     "memory_efficient_attention_forward",
     "memory_efficient_attention_forward_requires_grad",
diff --git a/xformers/ops/fmha/__init__.py b/xformers/ops/fmha/__init__.py
index 9c2733f07..bfb524ece 100644
--- a/xformers/ops/fmha/__init__.py
+++ b/xformers/ops/fmha/__init__.py
@@ -7,7 +7,11 @@
 
 import torch
 
+<<<<<<< HEAD
 from . import cutlass, decoder, flash, small_k, triton, ck, ck_decoder
+=======
+from . import cutlass, decoder, flash, small_k, triton, ck, forward_splitk
+>>>>>>> d7ba109 (implement boilerplate which creates an xformers op and binds it with a backend implementation)
 from .attn_bias import AttentionBias, BlockDiagonalMask, LowerTriangularMask
 from .common import (
     AttentionBwOpBase,
@@ -31,6 +35,7 @@
 TritonFlashAttentionOp = (triton.FwOp, triton.BwOp)
 MemoryEfficientAttentionCkOp = (ck.FwOp, ck.BwOp) 
 MemoryEfficientAttentionCkDecoderOp = (ck_decoder.FwOp, ck.BwOp)
+MemoryEfficientAttentionSplitKCkOp = (forward_splitk.FwOp, ck.BwOp) 
 
 class _fMHA(torch.autograd.Function):
     @staticmethod
diff --git a/xformers/ops/fmha/forward_splitk.py b/xformers/ops/fmha/forward_splitk.py
new file mode 100644
index 000000000..ff85d5f2d
--- /dev/null
+++ b/xformers/ops/fmha/forward_splitk.py
@@ -0,0 +1,151 @@
+import torch
+from typing import Any, List, Set, Tuple, Optional
+from xformers.ops.common import get_xformers_operator, register_operator
+from xformers.ops.fmha.attn_bias import BlockDiagonalCausalWithOffsetPaddedKeysMask
+from xformers.ops.fmha.common import AttentionFwOpBase, Context, Inputs, check_lastdim_alignment_stride1
+
+@register_operator
+class FwOp(AttentionFwOpBase):
+    
+    OPERATOR = get_xformers_operator("efficient_attention_forward_decoder_splitk_ck")
+    SUPPORTED_DEVICES = {"cuda"}
+    SUPPORTED_DTYPES = {
+        torch.half,
+        torch.bfloat16,
+    }  # Those are dtypes of Q. In the quantized case K/V has dtype int32
+    SUPPORTED_MAX_K = 128
+    SUPPORTED_ATTN_BIAS_TYPES: Set[Any] = {
+        type(None),
+        BlockDiagonalCausalWithOffsetPaddedKeysMask,
+    }
+    SUPPORTS_DROPOUT = False
+    SUPPORTS_CUSTOM_SCALE = True
+    SUPPORTS_BMGHK = True
+    NAME = "ck_splitKF"
+
+    SPLIT_K: Optional[int] = None
+    BLOCK_M = 16
+    BLOCK_N = 64
+
+    NUM_GROUPS = 1  # Default quantization is row-wise
+
+    @classmethod
+    def shape_not_supported_reasons(
+        cls, Mq: int, Mkv: int, K: int, Kv: int
+    ) -> List[str]:
+        reasons = super().shape_not_supported_reasons(Mq, Mkv, K, Kv)
+        if K not in {16, 32, 64, 128}:
+            reasons.append(f"Embed dim {K} not supported")
+        return reasons
+
+    @classmethod
+    def not_supported_reasons(cls, d: Inputs) -> List[str]:
+        reasons = super(FwOp, cls).not_supported_reasons(d)
+        check_lastdim_alignment_stride1(reasons, "query", d.query, 8)
+        if d.key.dtype != torch.int32:
+            check_lastdim_alignment_stride1(reasons, "key", d.key, 8)
+            check_lastdim_alignment_stride1(reasons, "value", d.value, 8)
+        if cls.OPERATOR is None:
+            reasons.append("triton is not available")
+        if d.device.type == "cuda":
+            # Has only been tested on 8.0 / 9.0.
+            if torch.cuda.get_device_capability(d.device) < (7, 0):
+                reasons.append(
+                    "requires GPU with sm80 minimum compute capacity, e.g., A100/H100/L4"
+                )
+
+        q_len = d.query.shape[1]
+        if isinstance(d.attn_bias, BlockDiagonalCausalWithOffsetPaddedKeysMask):
+            seqinfo = d.attn_bias.q_seqinfo
+            if q_len != seqinfo.seqstart_py[-1]:
+                reasons.append(
+                    f"Expected total {seqinfo.seqstart_py[-1]} queries not {q_len}"
+                )
+            q_len = seqinfo.min_seqlen
+            if q_len != seqinfo.max_seqlen:
+                reasons.append(
+                    "Variable query len is not supported in the presence of causal mask."
+                )
+
+        if d.key.ndim in [4, 5] and d.key.shape[-2] != 1:
+            if d.key.stride(-2) == 0 and d.value.stride(-2) == 0 and q_len > 1:
+                reasons.append("multiquery is only supported with query seqlen=1")
+
+        if d.attn_bias is not None and q_len > 1:
+            reasons.append(
+                "query with seqlen > 1 is not supported in the presence of causal mask"
+            )
+        return reasons
+
+    @classmethod
+    def get_split_k(cls, B: int, H: int, Mk: int) -> int:
+        """Heuristic for the number of splits"""
+        bh = max(B * H, 1)  # NOTE: Handle B*h=0 case
+        split_k = max(Mk, 1024) // bh
+        max_chunk_size = 64 if Mk <= 512 and bh <= 64 else 128
+        while split_k > 0 and Mk / split_k < max_chunk_size:
+            split_k = split_k // 2
+        split_k = min(split_k, 64)
+        split_k = max(split_k, 1)
+        return split_k
+
+    @classmethod
+    def apply(
+        cls, inp: Inputs, needs_gradient: bool
+    ) -> Tuple[torch.Tensor, Optional[Context]]:
+        attn_bias = inp.attn_bias
+        seq_len = None
+        q, k, v = inp.get_qkv_in_bmghk()
+        
+        if attn_bias is not None:
+            assert isinstance(attn_bias, BlockDiagonalCausalWithOffsetPaddedKeysMask)
+            seq_len = attn_bias.k_seqinfo.seqlen
+            B = len(seq_len)
+            G, H, Kq = q.shape[-3:]
+            Kkv = v.shape[-1]
+
+            # assume kv has been padded
+            q = q.reshape(B, -1, G, H, Kq)
+            k = k.reshape(B, -1, G, H, Kkv)
+            v = v.reshape(B, -1, G, H, Kkv)
+        
+        mqa_swap_seqlen_head = False
+        if k.shape[3] > 1 and k.stride(3) == 0 and v.stride(3) == 0:
+            mqa_swap_seqlen_head = True
+            assert q.shape[1] == 1
+            q = q.transpose(1, 3)
+            k = k[:, :, :, :1]
+            v = v[:, :, :, :1]
+
+        Lk = k.shape[-1]
+
+        B, Mk, G, H, Kkv = k.shape
+        B, M, G, H, Kq = q.shape
+        assert Lk == Kq, f"Keys have head dim {Lk} but queries have head dim {Kq}"
+
+        BLOCK_M = cls.BLOCK_M
+        BLOCK_N = cls.BLOCK_N
+        if cls.SPLIT_K is not None:
+            split_k = cls.SPLIT_K
+        else:
+            # Use heuristics
+            split_k = cls.get_split_k(B, H, Mk)
+
+        M_ceil = (M + BLOCK_M - 1) // BLOCK_M * BLOCK_M
+
+        # o_splitk = torch.empty(
+        #     [B * G * H, split_k, M_ceil, Kq], dtype=torch.float32, device=q.device
+        # )
+        # metadata = torch.empty(
+        #     [B * G * H, 2, split_k, M_ceil], dtype=torch.float32, device=q.device
+        # )
+
+        if inp.scale is not None:
+            qk_scale = inp.scale
+        else:
+            qk_scale = torch.rsqrt(torch.tensor(k.shape[-1], dtype=torch.float32))
+
+        out = cls.OPERATOR(query=q, key=k, value=v, seq_positions=seq_len, scale=qk_scale, split_k=split_k)
+
+        return out, None
+

From 21fbf99e801ad502bfc63ebf6cbdd0a73b463c81 Mon Sep 17 00:00:00 2001
From: Max Podkorytov <4273004+tenpercent@users.noreply.github.com>
Date: Wed, 8 Nov 2023 19:27:19 -0500
Subject: [PATCH 274/641] add a (failing) test to verify splitk algorithm
 correctness

---
 tests/test_mem_eff_attention.py               | 69 +++++++++++++++++++
 .../hip_fmha/attention_forward_splitk.cpp     | 13 +++-
 2 files changed, 80 insertions(+), 2 deletions(-)

diff --git a/tests/test_mem_eff_attention.py b/tests/test_mem_eff_attention.py
index ae3f051b6..7c86cd4e9 100644
--- a/tests/test_mem_eff_attention.py
+++ b/tests/test_mem_eff_attention.py
@@ -282,6 +282,75 @@ def T(t):
     return out.permute((0, 2, 1, 3))
 
 
+def ref_attention_splitk(q, k, v, attn_bias, scale=None, split_k=2) -> torch.Tensor:
+    assert q.ndim == 3
+
+    q = q.float()
+    k = k.float()
+    v = v.float()
+
+    if scale is None:
+        scale = torch.rsqrt(q.shape[-1])
+    q = q * scale
+
+    if attn_bias is not None:
+        if isinstance(attn_bias, xformers.ops.AttentionBias):
+            # Always create in B,H,Mq,Mk format
+            attn_bias_tensor = attn_bias.materialize(
+                (q.shape[0], 1, q.shape[1], k.shape[1]),
+                device=q.device,
+                dtype=torch.float32,
+            )
+        else:
+            attn_bias_tensor = attn_bias
+        if attn_bias_tensor.ndim == 4:
+            assert q.shape[0] == attn_bias_tensor.shape[0] * attn_bias_tensor.shape[1]
+            attn_bias_tensor = attn_bias_tensor.reshape(
+                [-1, *attn_bias_tensor.shape[2:]]
+            )
+
+    split_config = { "dim": -1, "split_size_or_sections": k.size(-1) // split_k}
+    k_split = torch.split(k, **split_config)
+    v_split = torch.split(v, **split_config)
+    attn_bias_split = torch.split(attn_bias_tensor, **split_config)
+
+    def compute_attention_split(q, k_slice, v_slice, attn_bias_slice):
+        p_slice = q @ k_slice.transpose(-2, -1)
+        p_slice += attn_bias_slice
+        m = p_slice.max(dim = -1)
+        s = torch.exp(p_slice - m[:, :, None])
+        l = torch.sum(s, dim = -1)
+        attn_slice = s @ v_slice
+        return {
+            "attn_slice": attn_slice,
+            "row_max": m,
+            "row_lse": l, 
+        }
+    
+    slices = map(lambda k, v, b: compute_attention_split(q, k, v, b), 
+        zip(k_split, v_split, attn_bias_split))
+    slices = list(slices)
+    out = torch.zero_like(q)
+
+    m_current_max = slices[0]["row_max"]
+    l_current_sum = torch.zero_like(slices[0]["row_lse"])
+
+    for s in slices:
+        (attn_slice, m, l) = s.values()
+        m_new = torch.max(m, m_current_max)
+        pick_new = m < m_current_max
+        pick_our = torch.logical_not(pick_new)
+
+        alpha = torch.exp(-torch.abs(m - m_current_max))
+
+        out = (pick_our * out + pick_new * attn_slice) * alpha
+        l_current_sum = (pick_our * l_current_sum + pick_new * l) * alpha
+        m_current_max = m_new
+    
+    out /= l_current_sum
+    return out
+
+
 def _rand_seqlens(
     r: random.Random,
     bs: int,
diff --git a/xformers/csrc/attention/hip_fmha/attention_forward_splitk.cpp b/xformers/csrc/attention/hip_fmha/attention_forward_splitk.cpp
index dc859c2ee..237fcaca2 100644
--- a/xformers/csrc/attention/hip_fmha/attention_forward_splitk.cpp
+++ b/xformers/csrc/attention/hip_fmha/attention_forward_splitk.cpp
@@ -30,18 +30,27 @@ at::Tensor efficient_attention_forward_decoder_splitk_ck(
     auto M_k = cache_K.size(1);
 
     constexpr auto BLOCK_M = 16;
-
     auto M_ceil = (M + BLOCK_M - 1) / BLOCK_M * BLOCK_M;
 
+    constexpr auto kThreadsPerWarp = 64;
+    constexpr auto kWarpsPerBlock = 2; // original uses 2 warps
+
     const auto options = at::TensorOptions()
                              .dtype(XQ.dtype())
                              .layout(at::kStrided)
                              .device(XQ.device())
                              .requires_grad(false);
 
-    auto O = at::empty({B * G * H, split_k, M_ceil, K_q}, options);
+    auto O_splitk = at::empty({B * G * H, split_k, M_ceil, K_q}, options);
     auto metadata = at::empty({B * G * H, 2, split_k, M_ceil}, options);
 
+    dim3 attention_grid = {static_cast<uint32_t>(M / BLOCK_M), static_cast<uint32_t>(B * G * H), static_cast<uint32_t>(split_k)};
+    dim3 reduce_grid = {static_cast<uint32_t>(B * G * H), static_cast<uint32_t>(M)};
+
+    dim3 threads = {kThreadsPerWarp * kWarpsPerBlock};
+    
+    auto O = at::empty_like(XQ);
+
     return O;
 }
 }

From e4921b1baae3e9ffebc783fa147a4e7566df3e4e Mon Sep 17 00:00:00 2001
From: Max Podkorytov <4273004+tenpercent@users.noreply.github.com>
Date: Wed, 8 Nov 2023 21:01:15 -0500
Subject: [PATCH 275/641] make the splitk reference test pass

---
 tests/test_mem_eff_attention_ck.py | 157 +++++++++++++++++++++++++++++
 1 file changed, 157 insertions(+)

diff --git a/tests/test_mem_eff_attention_ck.py b/tests/test_mem_eff_attention_ck.py
index 1b4286c01..b42dc7aaa 100644
--- a/tests/test_mem_eff_attention_ck.py
+++ b/tests/test_mem_eff_attention_ck.py
@@ -283,6 +283,122 @@ def T(t):
     return out.permute((0, 2, 1, 3))
 
 
+def ref_attention_splitk_bmhk(q, k, v, attn_bias, scale=None, split_k=None) -> torch.Tensor:
+    assert q.ndim == 4
+
+    def T(t):
+        return t.permute((0, 2, 1, 3)).reshape(
+            [t.shape[0] * t.shape[2], t.shape[1], t.shape[3]]
+        )
+
+    if isinstance(attn_bias, xformers.ops.AttentionBias):
+        attn_bias = attn_bias.materialize(
+            (q.shape[0], q.shape[2], q.shape[1], k.shape[1]),
+            device=q.device,
+            dtype=torch.float32,
+        ).reshape([q.shape[0] * q.shape[2], q.shape[1], k.shape[1]])
+    out = ref_attention_splitk(T(q), T(k), T(v), attn_bias, scale=scale, split_k=split_k)
+    out = out.reshape([q.shape[0], q.shape[2], q.shape[1], v.shape[3]])
+    return out.permute((0, 2, 1, 3))
+
+
+def ref_attention_splitk(q, k, v, attn_bias, scale=None, split_k=2) -> torch.Tensor:
+    if q.ndim == 4:
+        return ref_attention_splitk_bmhk(q, k, v, attn_bias=attn_bias, split_k=split_k)
+    assert q.ndim == 3
+    q = q.float()
+    k = k.float()
+    v = v.float()
+
+    if scale is None:
+        scale = q.shape[-1] ** -.5
+    assert not q.isnan().any()
+    q = q * scale
+    assert not q.isnan().any()
+
+    if attn_bias is not None:
+        if isinstance(attn_bias, xformers.ops.AttentionBias):
+            # Always create in B,H,Mq,Mk format
+            attn_bias_tensor = attn_bias.materialize(
+                (q.shape[0], 1, q.shape[1], k.shape[1]),
+                device=q.device,
+                dtype=torch.float32,
+            )
+        else:
+            attn_bias_tensor = attn_bias
+        if attn_bias_tensor.ndim == 4:
+            assert q.shape[0] == attn_bias_tensor.shape[0] * attn_bias_tensor.shape[1]
+            attn_bias_tensor = attn_bias_tensor.reshape(
+                [-1, *attn_bias_tensor.shape[2:]]
+            )
+
+    split_size = k.size(-2) // split_k
+    split_config = { "dim": -2, "split_size_or_sections": split_size}
+    k_split = torch.split(k, **split_config)
+    v_split = torch.split(v, **split_config)
+    attn_bias_split = torch.split(attn_bias_tensor, dim=-1, split_size_or_sections=split_size)
+    
+    def compute_attention_split(q_whole, k_slice, v_slice, attn_bias_slice):
+        assert not q_whole.isnan().any(), "q_whole is nan"
+        assert not k_slice.isnan().any(), "k_slice is nan"
+        p_slice = q_whole @ k_slice.transpose(-2, -1)
+        assert not p_slice.isnan().any(), "p_slice is nan"
+        assert not p_slice.isinf().any(), "p_slice is inf"
+        p_slice += attn_bias_slice
+        assert not p_slice.isnan().any(), "p_slice is nan after bias add"
+        m = torch.max(p_slice, dim = -1, keepdim=True).values
+        assert not m.isnan().any(), "m is nan"
+        p_slice_scaled = p_slice - m
+        p_slice_scaled[p_slice_scaled.isnan()] = float("-inf")
+        assert not p_slice_scaled.isnan().any(), f"p_slice_scaled is nan: {p_slice_scaled.isnan().sum()} of {p_slice_scaled.numel()} values"
+        s = torch.exp(p_slice_scaled)
+        assert s.shape == p_slice.shape
+        assert not s.isnan().any(), f"s is nan: {s.isnan().sum()} of {s.numel()} values"
+        l = torch.sum(s, dim = -1)
+        assert not l.isnan().any(), "l is nan"
+        attn_slice = s @ v_slice
+        assert not attn_slice.isnan().any(), "attn_slice is nan"
+        return {
+            "attn_slice": attn_slice,
+            "row_max": m,
+            "row_lse": l, 
+        }
+    
+    splits = list(zip(k_split, v_split, attn_bias_split))
+
+    slices = list(map(lambda s: compute_attention_split(q, s[0], s[1], s[2]), 
+        splits))
+    out = torch.zeros_like(q)
+
+    assert(not slices[0]["attn_slice"].isnan().any())
+
+    # reduce out over split-k slices
+
+    m_current_max = torch.zeros_like(slices[0]["row_max"]).fill_(float("-inf"))
+    l_current_sum = torch.zeros_like(slices[0]["row_lse"]).unsqueeze(-1)
+
+    for s in slices:
+        attn_slice = s["attn_slice"]
+        m = s["row_max"]
+        l = s["row_lse"].unsqueeze(-1)
+        m_new = torch.max(m, m_current_max)
+        assert not m_new.isnan().any(), "m_new is nan"
+        pick_new = m < m_current_max
+        pick_our = torch.logical_not(pick_new)
+
+        log_alpha = -torch.abs(m - m_current_max)
+        log_alpha[log_alpha.isnan()] = 0
+        alpha = torch.exp(log_alpha)
+        assert not alpha.isnan().any(), "alpha is nan"
+        out = out + attn_slice + (pick_our * out + pick_new * attn_slice) * (torch.sub(alpha, 1))
+        assert not out.isnan().any(), "out acc is nan"
+        l_current_sum = l_current_sum + l + (pick_our * l_current_sum + pick_new * l) * (torch.sub(alpha, 1))
+        assert not l_current_sum.isnan().any(), "l acc is nan"
+        m_current_max = m_new
+    out /= l_current_sum
+    assert not out.isnan().any(), "final out is nan"
+    return out
+
 def _rand_seqlens(
     r: random.Random,
     bs: int,
@@ -1639,6 +1755,47 @@ def test_attn_bias_padded() -> None:
         rtol=fmha.ck.FwOp.ERROR_RTOL[torch.float16],
     )
 
+@pytest.mark.parametrize("multiquery", [True, False], ids=lambda x: "mq" if x else "nomq")
+@pytest.mark.parametrize("n_heads", [1, 16, 32])
+@pytest.mark.parametrize("padding", [32, 4096])
+@pytest.mark.parametrize("bsz", [1, 8])
+@pytest.mark.parametrize("dtype", ["f16"])
+@pytest.mark.parametrize("split_k", [1, 2])
+def test_splitk_reference(
+    multiquery: bool, n_heads: int, padding: int, bsz: int, dtype: str, split_k: int
+):
+    dtype_ = {"f16": torch.float16, "bf16": torch.bfloat16, "f32": torch.float32}[dtype]
+    torch.manual_seed(1)
+    d = 256
+    k_shape = (1, bsz * padding, n_heads, d)
+    # TODO: support 2 kv heads etc.
+    k = torch.rand(k_shape, dtype=dtype_).cuda()
+    k_seqlen = torch.randint(1, padding + 1, (bsz,)).tolist()
+    v = torch.rand(k_shape, dtype=dtype_).cuda()
+    q = torch.rand((1, bsz, n_heads, d), dtype=dtype_).cuda()
+    causal_diagonal = torch.tensor(  # TODO: make unnecessary
+        [i - 1 for i in k_seqlen], dtype=torch.int32
+    ).cuda()
+
+    if multiquery:
+        k = k[:, :, :1].expand(k_shape)
+        v = v[:, :, :1].expand(k_shape)
+
+    attn_bias = fmha.attn_bias.BlockDiagonalCausalWithOffsetPaddedKeysMask.from_seqlens(
+        q_seqlen=[1] * bsz,
+        kv_seqlen=k_seqlen,
+        causal_diagonal=causal_diagonal,
+        kv_padding=padding,
+    )
+    ref_out = ref_attention(q, k, v, attn_bias)
+    splitk_out = ref_attention_splitk(q, k, v, attn_bias, None, split_k=split_k)
+    assert_allclose(
+        ref_out,
+        splitk_out,
+        atol=fmha.ck.FwOp.ERROR_ATOL[dtype_],
+        rtol=fmha.ck.FwOp.ERROR_RTOL[dtype_],
+    )
+
 
 def _kv_heads_label(kv_heads: Optional[int]) -> str:
     if kv_heads is None:

From 656e85cad423c2345ec6645be02b604f7bf249a5 Mon Sep 17 00:00:00 2001
From: Max Podkorytov <4273004+tenpercent@users.noreply.github.com>
Date: Wed, 8 Nov 2023 21:13:14 -0500
Subject: [PATCH 276/641] use keepdim instead of reshaping in the test

---
 tests/test_mem_eff_attention_ck.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/tests/test_mem_eff_attention_ck.py b/tests/test_mem_eff_attention_ck.py
index b42dc7aaa..b26e46710 100644
--- a/tests/test_mem_eff_attention_ck.py
+++ b/tests/test_mem_eff_attention_ck.py
@@ -354,7 +354,7 @@ def compute_attention_split(q_whole, k_slice, v_slice, attn_bias_slice):
         s = torch.exp(p_slice_scaled)
         assert s.shape == p_slice.shape
         assert not s.isnan().any(), f"s is nan: {s.isnan().sum()} of {s.numel()} values"
-        l = torch.sum(s, dim = -1)
+        l = torch.sum(s, dim=-1, keepdim=True)
         assert not l.isnan().any(), "l is nan"
         attn_slice = s @ v_slice
         assert not attn_slice.isnan().any(), "attn_slice is nan"
@@ -375,12 +375,12 @@ def compute_attention_split(q_whole, k_slice, v_slice, attn_bias_slice):
     # reduce out over split-k slices
 
     m_current_max = torch.zeros_like(slices[0]["row_max"]).fill_(float("-inf"))
-    l_current_sum = torch.zeros_like(slices[0]["row_lse"]).unsqueeze(-1)
+    l_current_sum = torch.zeros_like(slices[0]["row_lse"])
 
     for s in slices:
         attn_slice = s["attn_slice"]
         m = s["row_max"]
-        l = s["row_lse"].unsqueeze(-1)
+        l = s["row_lse"]
         m_new = torch.max(m, m_current_max)
         assert not m_new.isnan().any(), "m_new is nan"
         pick_new = m < m_current_max

From 8722b1c979475d4ffeeb514dd49d41111cc484d0 Mon Sep 17 00:00:00 2001
From: Max Podkorytov <4273004+tenpercent@users.noreply.github.com>
Date: Wed, 8 Nov 2023 21:42:49 -0500
Subject: [PATCH 277/641] remove redundant assert

---
 tests/test_mem_eff_attention_ck.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/tests/test_mem_eff_attention_ck.py b/tests/test_mem_eff_attention_ck.py
index b26e46710..301351f3d 100644
--- a/tests/test_mem_eff_attention_ck.py
+++ b/tests/test_mem_eff_attention_ck.py
@@ -370,8 +370,6 @@ def compute_attention_split(q_whole, k_slice, v_slice, attn_bias_slice):
         splits))
     out = torch.zeros_like(q)
 
-    assert(not slices[0]["attn_slice"].isnan().any())
-
     # reduce out over split-k slices
 
     m_current_max = torch.zeros_like(slices[0]["row_max"]).fill_(float("-inf"))

From 30f34a6a70f90c7a85c4fc5d3c7a4677c65f9e95 Mon Sep 17 00:00:00 2001
From: Max Podkorytov <4273004+tenpercent@users.noreply.github.com>
Date: Wed, 15 Nov 2023 14:02:08 -0500
Subject: [PATCH 278/641] clean up test

---
 tests/test_mem_eff_attention_ck.py | 34 ++++++++++++++++++++++++++++--
 1 file changed, 32 insertions(+), 2 deletions(-)

diff --git a/tests/test_mem_eff_attention_ck.py b/tests/test_mem_eff_attention_ck.py
index 301351f3d..e34451842 100644
--- a/tests/test_mem_eff_attention_ck.py
+++ b/tests/test_mem_eff_attention_ck.py
@@ -1758,7 +1758,7 @@ def test_attn_bias_padded() -> None:
 @pytest.mark.parametrize("padding", [32, 4096])
 @pytest.mark.parametrize("bsz", [1, 8])
 @pytest.mark.parametrize("dtype", ["f16"])
-@pytest.mark.parametrize("split_k", [1, 2])
+@pytest.mark.parametrize("split_k", [1, 2, 4])
 def test_splitk_reference(
     multiquery: bool, n_heads: int, padding: int, bsz: int, dtype: str, split_k: int
 ):
@@ -1766,7 +1766,6 @@ def test_splitk_reference(
     torch.manual_seed(1)
     d = 256
     k_shape = (1, bsz * padding, n_heads, d)
-    # TODO: support 2 kv heads etc.
     k = torch.rand(k_shape, dtype=dtype_).cuda()
     k_seqlen = torch.randint(1, padding + 1, (bsz,)).tolist()
     v = torch.rand(k_shape, dtype=dtype_).cuda()
@@ -1874,6 +1873,37 @@ def test_decoder(
         rtol=fmha.ck_decoder.FwOp.ERROR_RTOL[dtype_],
     )
 
+def _kv_heads_label(kv_heads: Optional[int]) -> str:
+    if kv_heads is None:
+        return ""
+    if kv_heads == 1:
+        return "mq"
+    return f"gqa{kv_heads}"
+
+
+@pytest.mark.parametrize("op", [fmha.forward_splitk.FwOp])
+@pytest.mark.parametrize("dtype", ["f16"])
+@pytest.mark.parametrize("kv_heads", [None, 1, 2], ids=_kv_heads_label)
+@pytest.mark.parametrize("n_heads", [16])
+@pytest.mark.parametrize("padding, bsz", [(32, 8), (4096, 1)])
+def test_triton_splitk_decoder(
+    op,
+    kv_heads: Optional[int],
+    n_heads: int,
+    padding: int,
+    bsz: int,
+    dtype: str,
+) -> None:
+    # no quantized impl compared to cuda
+    test_decoder(
+        op,
+        kv_heads=kv_heads,
+        n_heads=n_heads,
+        padding=padding,
+        bsz=bsz,
+        dtype=dtype,
+    )
+
 
 def test_attn_bias_from_seqlens() -> None:
     bias = fmha.attn_bias.BlockDiagonalMask.from_seqlens([3, 5, 1])

From 5348d38bdfd4e57a1931692ebaa26439f4085efa Mon Sep 17 00:00:00 2001
From: Max Podkorytov <4273004+tenpercent@users.noreply.github.com>
Date: Wed, 15 Nov 2023 14:09:21 -0500
Subject: [PATCH 279/641] fix rebase conflict

---
 xformers/ops/fmha/__init__.py | 6 +-----
 1 file changed, 1 insertion(+), 5 deletions(-)

diff --git a/xformers/ops/fmha/__init__.py b/xformers/ops/fmha/__init__.py
index bfb524ece..c186d284b 100644
--- a/xformers/ops/fmha/__init__.py
+++ b/xformers/ops/fmha/__init__.py
@@ -7,11 +7,7 @@
 
 import torch
 
-<<<<<<< HEAD
-from . import cutlass, decoder, flash, small_k, triton, ck, ck_decoder
-=======
-from . import cutlass, decoder, flash, small_k, triton, ck, forward_splitk
->>>>>>> d7ba109 (implement boilerplate which creates an xformers op and binds it with a backend implementation)
+from . import cutlass, decoder, flash, small_k, triton, ck, forward_splitk, ck_decoder
 from .attn_bias import AttentionBias, BlockDiagonalMask, LowerTriangularMask
 from .common import (
     AttentionBwOpBase,

From e0048df2dffdefeb7a27664ef218e54fce723456 Mon Sep 17 00:00:00 2001
From: Max Podkorytov <4273004+tenpercent@users.noreply.github.com>
Date: Wed, 15 Nov 2023 17:20:07 -0500
Subject: [PATCH 280/641] stash changes

---
 .../hip_fmha/attention_forward_splitk.cpp       | 17 ++++++++++++++++-
 1 file changed, 16 insertions(+), 1 deletion(-)

diff --git a/xformers/csrc/attention/hip_fmha/attention_forward_splitk.cpp b/xformers/csrc/attention/hip_fmha/attention_forward_splitk.cpp
index 237fcaca2..9775a1e0a 100644
--- a/xformers/csrc/attention/hip_fmha/attention_forward_splitk.cpp
+++ b/xformers/csrc/attention/hip_fmha/attention_forward_splitk.cpp
@@ -4,6 +4,18 @@
 #include <c10/cuda/CUDAStream.h>
 #include <torch/library.h>
 
+#define AT_DISPATCH_CASE_3(SCALARTYPE1, SCALARTYPE2, SCALARTYPE3, ...) \
+  AT_DISPATCH_CASE(SCALARTYPE1, __VA_ARGS__)                           \
+  AT_DISPATCH_CASE(SCALARTYPE2, __VA_ARGS__)                           \
+  AT_DISPATCH_CASE(SCALARTYPE3, __VA_ARGS__)
+
+#define AT_DISPATCH_SWITCH_3(                               \
+    SCALARTYPE1, SCALARTYPE2, SCALARTYPE3, TYPE, NAME, ...) \
+  AT_DISPATCH_SWITCH(                                       \
+      TYPE,                                                 \
+      NAME,                                                 \
+      AT_DISPATCH_CASE_3(SCALARTYPE1, SCALARTYPE2, SCALARTYPE3, __VA_ARGS__))
+
 namespace {
 
 at::Tensor efficient_attention_forward_decoder_splitk_ck(
@@ -59,4 +71,7 @@ TORCH_LIBRARY_IMPL(xformers, CUDA, m) {
   m.impl(
       TORCH_SELECTIVE_NAME("xformers::efficient_attention_forward_decoder_splitk_ck"),
       TORCH_FN(efficient_attention_forward_decoder_splitk_ck));
-}
\ No newline at end of file
+}
+
+#undef AT_DISPATCH_CASE_3
+#undef AT_DISPATCH_SWITCH_3
\ No newline at end of file

From c9a882f87f4e8a6e1136b1bbca0f7a074d538631 Mon Sep 17 00:00:00 2001
From: Max Podkorytov <4273004+tenpercent@users.noreply.github.com>
Date: Sun, 26 Nov 2023 02:41:47 -0500
Subject: [PATCH 281/641] add an (incorrect) kernel implementation and (failing
 numerically) test

---
 setup.py                                      |   8 +-
 xformers/csrc/attention/attention.cpp         |   2 +-
 .../hip_fmha/attention_forward_splitk.cpp     | 253 ++++++-
 .../ck_attention_forward_decoder_splitk.h     | 710 ++++++++++++++++++
 xformers/ops/fmha/forward_splitk.py           |  49 +-
 5 files changed, 983 insertions(+), 39 deletions(-)
 create mode 100644 xformers/csrc/attention/hip_fmha/ck_attention_forward_decoder_splitk.h

diff --git a/setup.py b/setup.py
index 9f21987ad..31391dff1 100644
--- a/setup.py
+++ b/setup.py
@@ -211,13 +211,17 @@ def get_extensions():
 
     source_hip = glob.glob(os.path.join(extensions_dir, "attention", "hip_fmha", "ck_fmha_test.cpp"), recursive=False)
     
+    source_hip_decoder = [
+        *glob.glob(os.path.join(extensions_dir, "attention", "hip_fmha", "attention_forward_decoder.cpp"), recursive=False),
+        *glob.glob(os.path.join(extensions_dir, "attention", "hip_fmha", "attention_forward_splitk.cpp"), recursive=False)
+    ]
+
     if os.getenv("FORCE_CK_TILED_KERNEL", "0") == "1":
         source_hip += glob.glob(os.path.join(extensions_dir, "attention", "hip_fmha", "attention_forward_generic_ck_tiled.cpp"), recursive=False)
         source_hip += glob.glob(os.path.join(extensions_dir, "attention", "hip_fmha", "ck_tiled_fmha_batched_infer_*.cpp"), recursive=False)
         source_hip += glob.glob(os.path.join(extensions_dir, "attention", "hip_fmha", "ck_tiled_fmha_grouped_infer_*.cpp"), recursive=False)
         source_hip += glob.glob(os.path.join(extensions_dir, "attention", "hip_fmha", "instances_tiled", "ck_tiled_fmha_*.cpp"), recursive=False)
     else:
-        source_hip += glob.glob(os.path.join(extensions_dir, "attention", "hip_fmha", "attention_forward_decoder.cpp"), recursive=False)
         source_hip += glob.glob(os.path.join(extensions_dir, "attention", "hip_fmha", "attention_forward_generic.cpp"), recursive=False)
         source_hip += glob.glob(os.path.join(extensions_dir, "attention", "hip_fmha", "attention_backward_generic.cpp"), recursive=False)
         source_hip += glob.glob(os.path.join(extensions_dir, "attention", "hip_fmha", "attention_ck_rand_uniform.cpp"), recursive=False)
@@ -229,6 +233,8 @@ def get_extensions():
         source_hip += glob.glob(os.path.join(extensions_dir, "attention", "hip_fmha", "ck_fmha_grouped_backward_*.cpp"), recursive=False)
         source_hip += glob.glob(os.path.join(extensions_dir, "attention", "hip_fmha", "instances", "ck_fmha_*.cpp"), recursive=False)
         
+    source_hip += source_hip_decoder
+    
     sputnik_dir = os.path.join(this_dir, "third_party", "sputnik")
     cutlass_dir = os.path.join(this_dir, "third_party", "cutlass", "include")
     cutlass_examples_dir = os.path.join(this_dir, "third_party", "cutlass", "examples")
diff --git a/xformers/csrc/attention/attention.cpp b/xformers/csrc/attention/attention.cpp
index 5f802e56a..dbd65072d 100644
--- a/xformers/csrc/attention/attention.cpp
+++ b/xformers/csrc/attention/attention.cpp
@@ -49,7 +49,7 @@ TORCH_LIBRARY_FRAGMENT(xformers, m) {
   m.def(TORCH_SELECTIVE_SCHEMA(
       "xformers::efficient_attention_backward_ck(Tensor grad_out, Tensor query, Tensor key, Tensor value, Tensor? attn_bias, Tensor? seqstart_q, Tensor? seqstart_k, int? max_seqlen_q, Tensor? seqlen_k, Tensor logsumexp, Tensor output, float dropout_p, int rng_seed, int rng_offset, int custom_mask_type, float? scale) -> (Tensor, Tensor, Tensor, Tensor)"));
   m.def(TORCH_SELECTIVE_SCHEMA(
-      "xformers::efficient_attention_forward_decoder_splitk_ck(Tensor query, Tensor key, Tensor value, Tensor seq_positions, float scale, int split_k) -> Tensor"));
+      "xformers::efficient_attention_forward_decoder_splitk_ck(Tensor query, Tensor key, Tensor value, Tensor? seq_positions, float scale, int split_k) -> Tensor"));
   m.def(TORCH_SELECTIVE_SCHEMA(
       "xformers::_ck_rand_uniform(float p, Tensor out) -> Tensor"));
 #endif
diff --git a/xformers/csrc/attention/hip_fmha/attention_forward_splitk.cpp b/xformers/csrc/attention/hip_fmha/attention_forward_splitk.cpp
index 9775a1e0a..1dad0fa61 100644
--- a/xformers/csrc/attention/hip_fmha/attention_forward_splitk.cpp
+++ b/xformers/csrc/attention/hip_fmha/attention_forward_splitk.cpp
@@ -4,6 +4,34 @@
 #include <c10/cuda/CUDAStream.h>
 #include <torch/library.h>
 
+#include "ck_attention_forward_decoder_splitk.h"
+
+namespace {
+  constexpr int32_t kThreadsPerWavefront = 64;
+  constexpr int32_t kWavefrontsPerBlock = 16;
+  constexpr int32_t K_MAX = 4 * kThreadsPerWavefront;
+}
+
+namespace {
+
+template <typename c10_t>
+struct c10_to_data_t;
+template <>
+struct c10_to_data_t<float> {
+  using type = float;
+};
+
+template <>
+struct c10_to_data_t<c10::Half> {
+  using type = ck::half_t;
+};
+
+template <>
+struct c10_to_data_t<c10::BFloat16> {
+  using type = ck::bhalf_t;
+};
+}
+
 #define AT_DISPATCH_CASE_3(SCALARTYPE1, SCALARTYPE2, SCALARTYPE3, ...) \
   AT_DISPATCH_CASE(SCALARTYPE1, __VA_ARGS__)                           \
   AT_DISPATCH_CASE(SCALARTYPE2, __VA_ARGS__)                           \
@@ -18,54 +46,211 @@
 
 namespace {
 
-at::Tensor efficient_attention_forward_decoder_splitk_ck(
-    const at::Tensor& XQ, // [B, 1, H, D]
-    const at::Tensor& cache_K, // [B, T_MAX, H or 1, D]
-    const at::Tensor& cache_V, // [B, T_MAX, H or 1, D]
-    const at::Tensor& seq_positions, // [B]
-    double qk_scale,
-    int64_t split_k) {
+// at::Tensor efficient_attention_forward_decoder_splitk_ck(
+//     const at::Tensor& XQ, // [B, 1, G, H, D]
+//     const at::Tensor& cache_K, // [B, KV_M_MAX, G, H or 1, D]
+//     const at::Tensor& cache_V, // [B, KV_M_MAX, G, H or 1, D]
+//     at::optional<at::Tensor> seq_kv_lens, // [B]
+//     double qk_scale,
+//     at::Tensor& O,
+//     int64_t split_k) {
     
-    at::OptionalDeviceGuard guard(XQ.device());
+//     at::OptionalDeviceGuard guard(XQ.device());
 
-    TORCH_CHECK(XQ.is_cuda());
-    TORCH_CHECK(cache_K.is_cuda());
-    TORCH_CHECK(cache_V.is_cuda());
+//     TORCH_CHECK(XQ.is_cuda());
+//     TORCH_CHECK(cache_K.is_cuda());
+//     TORCH_CHECK(cache_V.is_cuda());
 
-    TORCH_CHECK(seq_positions.is_cuda());
+//     TORCH_CHECK(seq_positions.is_cuda());
 
-    auto B = XQ.size(0);
-    auto M = XQ.size(1);
-    auto G = XQ.size(2);
-    auto H = XQ.size(3);
-    auto K_q = XQ.size(4);
-    auto M_k = cache_K.size(1);
+//     auto M = XQ.size(1);
+//     auto B = XQ.size(0);
+//     auto G = XQ.size(2);
+//     auto H = XQ.size(3);
+//     auto K_q = XQ.size(4);
+//     auto M_k = cache_K.size(1);
 
-    constexpr auto BLOCK_M = 16;
-    auto M_ceil = (M + BLOCK_M - 1) / BLOCK_M * BLOCK_M;
+//     constexpr auto BLOCK_M = 16;
+//     auto M_ceil = (M + BLOCK_M - 1) / BLOCK_M * BLOCK_M;
 
-    constexpr auto kThreadsPerWarp = 64;
-    constexpr auto kWarpsPerBlock = 2; // original uses 2 warps
+//     constexpr auto kThreadsPerWarp = 64;
+//     constexpr auto kWarpsPerBlock = 2; // original uses 2 warps
 
-    const auto options = at::TensorOptions()
-                             .dtype(XQ.dtype())
-                             .layout(at::kStrided)
-                             .device(XQ.device())
-                             .requires_grad(false);
+//     const auto options = at::TensorOptions()
+//                              .dtype(XQ.dtype())
+//                              .layout(at::kStrided)
+//                              .device(XQ.device())
+//                              .requires_grad(false);
 
-    auto O_splitk = at::empty({B * G * H, split_k, M_ceil, K_q}, options);
-    auto metadata = at::empty({B * G * H, 2, split_k, M_ceil}, options);
+//     auto O_splitk = at::empty({B * G * H, split_k, M_ceil, K_q}, options);
+//     auto metadata = at::empty({B * G * H, 2, split_k, M_ceil}, options);
 
-    dim3 attention_grid = {static_cast<uint32_t>(M / BLOCK_M), static_cast<uint32_t>(B * G * H), static_cast<uint32_t>(split_k)};
-    dim3 reduce_grid = {static_cast<uint32_t>(B * G * H), static_cast<uint32_t>(M)};
+//     dim3 attention_grid = {static_cast<uint32_t>(M / BLOCK_M), static_cast<uint32_t>(B * G * H), static_cast<uint32_t>(split_k)};
+//     dim3 reduce_grid = {static_cast<uint32_t>(B * G * H), static_cast<uint32_t>(M)};
 
-    dim3 threads = {kThreadsPerWarp * kWarpsPerBlock};
+//     dim3 threads = {kThreadsPerWarp * kWarpsPerBlock};
     
-    auto O = at::empty_like(XQ);
+//     auto O = at::empty_like(XQ);
+
+//     return O;
+// }
+
+template <int32_t ThreadsPerWavefront, int32_t WavefrontsPerBlock,
+          int32_t KV_M_MAX = 8192,
+          int32_t K_MAX = 256>
+at::Tensor& efficient_attention_forward_decoder_splitk_ck_out_impl(
+    const at::Tensor& XQ, // [B, 1, G, H, D]
+    const at::Tensor& cache_K, // [B, KV_M_MAX, G, H or 1, D]
+    const at::Tensor& cache_V, // [B, KV_M_MAX, G, H or 1, D]
+    at::optional<at::Tensor> seq_kv_lens, // [B]
+    double qk_scale,
+    int64_t split_k,
+    at::Tensor& split_max,
+    at::Tensor& split_sumexp,
+    at::Tensor& split_O,
+    at::Tensor& O) {
+  static_assert(4 * ThreadsPerWavefront == K_MAX, "");
+  static_assert(WavefrontsPerBlock <= ThreadsPerWavefront, "");
+
+  at::OptionalDeviceGuard guard(XQ.device());
+  TORCH_CHECK(XQ.is_cuda());
+  TORCH_CHECK(cache_K.is_cuda());
+  TORCH_CHECK(cache_V.is_cuda());
+
+  TORCH_CHECK(!seq_kv_lens || seq_kv_lens->is_cuda());
+
+  TORCH_CHECK(cache_K.size(1) <= KV_M_MAX);
+  TORCH_CHECK(cache_K.size(4) <= K_MAX);
+
+  constexpr auto rank = 5;
 
-    return O;
+  auto B = XQ.size(0);
+  auto M = XQ.size(1);
+  auto G = XQ.size(2);
+  auto H = XQ.size(3);
+
+  TORCH_CHECK(B <= 1024);
+  TORCH_CHECK(M <= 1024);
+  TORCH_CHECK(H <= 1024);
+
+  dim3 blocks(B * H * M * G, split_k);
+  dim3 threads(ThreadsPerWavefront, WavefrontsPerBlock);
+
+  int32_t smem_softmax = KV_M_MAX * sizeof(float) + threads.y * sizeof(float);
+  int32_t smem_output = K_MAX * sizeof(float) *
+      threads.y; // 4 * threadsPerBlock * sizeof(float) == sizeof(O[b][0][h][:])
+  const size_t lds_bytes = max(smem_softmax, smem_output);
+  auto stream = at::cuda::getCurrentHIPStream().stream();
+
+  AT_DISPATCH_SWITCH_3(
+      at::ScalarType::Half,
+      at::ScalarType::BFloat16,
+      at::ScalarType::Float,
+      XQ.scalar_type(),
+      "efficient_attention_forward_decoder_splitk_ck",
+      [&] {
+        using ck_data_t = c10_to_data_t<scalar_t>::type;
+        using device_op_t =
+            ck::tensor_operation::device::FMHADecoderSplitKDeviceOp<ck_data_t>;
+        auto op = device_op_t{};
+
+        auto XQ_acc =
+            XQ.packed_accessor32<scalar_t, rank, at::RestrictPtrTraits>();
+        auto K_acc =
+            cache_K.packed_accessor64<scalar_t, rank, at::RestrictPtrTraits>();
+        auto V_acc =
+            cache_V.packed_accessor64<scalar_t, rank, at::RestrictPtrTraits>();
+        auto split_O_acc = split_O.packed_accessor32<scalar_t, 1 + rank, at::RestrictPtrTraits>();
+        auto O_acc = O.packed_accessor32<scalar_t, rank, at::RestrictPtrTraits>();
+        auto seq_acc = seq_kv_lens ?
+            seq_kv_lens->packed_accessor32<int32_t, 1, at::RestrictPtrTraits>().data() : nullptr;
+        auto split_max_acc = split_max.packed_accessor32<float, rank, at::RestrictPtrTraits>();
+        auto split_sumexp_acc = split_sumexp.packed_accessor32<float, rank, at::RestrictPtrTraits>();
+        auto arg = device_op_t::Argument(
+            reinterpret_cast<const ck_data_t* __restrict__>(XQ_acc.data()),
+            reinterpret_cast<const ck_data_t* __restrict__>(K_acc.data()),
+            reinterpret_cast<const ck_data_t* __restrict__>(V_acc.data()),
+            reinterpret_cast<ck_data_t* __restrict__>(O_acc.data()),
+            reinterpret_cast<ck_data_t* __restrict__>(split_O_acc.data()),
+            split_max_acc.data(),
+            split_sumexp_acc.data(),
+            seq_acc,
+            XQ_acc.stride(0),
+            XQ_acc.stride(1),
+            XQ_acc.stride(2),
+            XQ_acc.stride(3),
+            K_acc.stride(0),
+            K_acc.stride(1),
+            K_acc.stride(2),
+            K_acc.stride(3),
+            O_acc.stride(2),
+            XQ_acc.size(1),
+            XQ_acc.size(2),
+            XQ_acc.size(3),
+            XQ_acc.size(4),
+            K_acc.size(1),
+            K_acc.size(3) == 1,
+            qk_scale,
+            split_k,
+            blocks,
+            threads,
+            lds_bytes);
+
+        auto invoker = device_op_t::Invoker{};
+        (void)invoker.Run(arg, {stream});
+      });
+
+  return O;
+}
+
+#undef AT_DISPATCH_CASE_3
+#undef AT_DISPATCH_SWITCH_3
+
+template <int32_t ThreadsPerWavefront, int32_t WavefrontsPerBlock>
+at::Tensor efficient_attention_forward_decoder_splitk_ck_impl(
+    const at::Tensor& XQ, // [B, 1, G, H, D]
+    const at::Tensor& cache_K, // [B, KV_M_MAX, G, H or 1, D]
+    const at::Tensor& cache_V, // [B, KV_M_MAX, H or 1, D]
+    at::optional<at::Tensor> seq_kv_lens, // [B]
+    int64_t split_k,
+    double qk_scale) {
+  auto O = at::empty_like(XQ);
+  constexpr auto splitk_dim = 0;
+  // auto O_unsqueeze = at::unsqueeze(O, splitk_dim);
+  auto O_splits = at::stack(O, splitk_dim);
+
+  TORCH_CHECK(XQ.dim() == 5);
+  TORCH_CHECK(cache_K.dim() == 5);
+  TORCH_CHECK(cache_V.dim() == 5);
+  TORCH_CHECK(O_splits.dim() == 6);
+
+  auto B = XQ.size(0);
+  auto M = XQ.size(1);
+  auto G = XQ.size(2);
+  auto H = XQ.size(3);
+
+  auto split_max = at::empty({B, M, G, H, split_k}, XQ.options().dtype(at::kFloat));
+  auto split_sumexp = at::empty_like(split_max);
+
+  efficient_attention_forward_decoder_splitk_ck_out_impl<
+      ThreadsPerWavefront,
+      WavefrontsPerBlock>(XQ, cache_K, cache_V, seq_kv_lens, qk_scale, split_k, split_max, split_sumexp, O_splits, O);
+  return O;
 }
+
+at::Tensor efficient_attention_forward_decoder_splitk_ck(
+    const at::Tensor& XQ, // [B, 1, G, H, D]
+    const at::Tensor& cache_K, // [B, KV_M_MAX, G, H or 1, D]
+    const at::Tensor& cache_V, // [B, KV_M_MAX, G, H or 1, D]
+    at::optional<at::Tensor> seq_kv_lens, // [B]
+    double qk_scale,
+    int64_t split_k) {
+  return efficient_attention_forward_decoder_splitk_ck_impl<
+      kThreadsPerWavefront,
+      kWavefrontsPerBlock>(XQ, cache_K, cache_V, seq_kv_lens, qk_scale, split_k);
 }
+} // namespace
+
 
 TORCH_LIBRARY_IMPL(xformers, CUDA, m) {
   m.impl(
diff --git a/xformers/csrc/attention/hip_fmha/ck_attention_forward_decoder_splitk.h b/xformers/csrc/attention/hip_fmha/ck_attention_forward_decoder_splitk.h
new file mode 100644
index 000000000..b093a57f0
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/ck_attention_forward_decoder_splitk.h
@@ -0,0 +1,710 @@
+#pragma once
+
+#include <ck/host_utility/kernel_launch_hip.hpp>
+#include <ck/stream_config.hpp>
+#include <ck/tensor_operation/gpu/device/device_base.hpp>
+#include <ck/utility/data_type.hpp>
+#include <ck/utility/inner_product.hpp>
+#include <ck/utility/math.hpp>
+
+namespace ck {
+template <>
+__device__ void inner_product<bhalf_t, bhalf_t, float>(
+    const bhalf_t& a,
+    const bhalf_t& b,
+    float& c) {
+  inner_product(type_convert<float>(a), type_convert<float>(b), c);
+}
+
+template <>
+
+__device__ void inner_product<half_t, half_t, float>(
+    const half_t& a,
+    const half_t& b,
+    float& c) {
+  inner_product(type_convert<float>(a), type_convert<float>(b), c);
+}
+
+template <>
+__device__ void inner_product<bhalf2_t, bhalf2_t, float>(
+    const bhalf2_t& a,
+    const bhalf2_t& b,
+    float& c) {
+  const vector_type<bhalf_t, 2> a_vector{a};
+  const vector_type<bhalf_t, 2> b_vector{b};
+  ck::static_for<0, 2, 1>{}([&](auto i) {
+    inner_product(
+        a_vector.AsType<bhalf_t>()[i], b_vector.AsType<bhalf_t>()[i], c);
+  });
+}
+
+template <>
+__device__ void inner_product<bhalf4_t, bhalf4_t, float>(
+    const bhalf4_t& a,
+    const bhalf4_t& b,
+    float& c) {
+  const vector_type<bhalf_t, 4> a_vector{a};
+  const vector_type<bhalf_t, 4> b_vector{b};
+  ck::static_for<0, 4, 1>{}([&](auto i) {
+    inner_product(
+        a_vector.AsType<bhalf_t>()[i], b_vector.AsType<bhalf_t>()[i], c);
+  });
+}
+} // namespace ck
+
+namespace {
+
+template <typename data_t, int32_t vec_size>
+__device__ typename ck::vector_type<float, vec_size>::type scalar_scale_acc(
+    typename ck::vector_type<float, vec_size>::type acc,
+    typename ck::vector_type<data_t, vec_size>::type a,
+    float b) {
+  union {
+    decltype(acc) vec;
+    float arr[vec_size];
+  } acc_u{acc};
+  union {
+    decltype(a) vec;
+    data_t arr[vec_size];
+  } a_u{a};
+
+#pragma unroll
+  for (int32_t i = 0; i < vec_size; ++i) {
+    acc_u.arr[i] += ck::type_convert<float>(a_u.arr[i]) * b;
+  }
+
+  return acc_u.vec;
+}
+
+template <typename F, int32_t n_threads_per_wavefront = 64>
+float __device__ __forceinline__ wavefrontReduce(float val, F f) {
+#pragma unroll
+  for (int32_t mask = n_threads_per_wavefront >> 1; mask > 0; mask >>= 1) {
+    val = f(__shfl_xor(val, mask, n_threads_per_wavefront), val);
+  }
+  return val;
+}
+
+template <typename TData, typename TDataVec>
+__forceinline__ __device__ void load_v(
+    const TData* __restrict__ data_ptr,
+    int32_t vector_offset,
+    TDataVec* __restrict__ load_to) {
+  *load_to = *(reinterpret_cast<const TDataVec*>(data_ptr) + vector_offset);
+}
+
+template <typename TData, typename TDataVec>
+__forceinline__ __device__ void store_v(
+    TData* __restrict__ data_ptr,
+    int32_t vector_offset,
+    TDataVec value) {
+  *(reinterpret_cast<TDataVec*>(data_ptr) + vector_offset) = value;
+}
+
+template<
+typename scalar_t,
+int32_t vec_size = 4,
+typename compute_t = float
+> 
+__global__ void efficient_attention_forward_decoder_splitk_reduce_ck_kernel(
+  const scalar_t* __restrict__ O_splits,
+  const compute_t* __restrict__ split_max,
+  const compute_t* __restrict__ split_sumexp,
+  scalar_t* __restrict__ O,
+  int32_t Q_size_m,
+  int32_t Q_size_g,
+  int32_t Q_size_h,
+  int32_t Q_size_k,
+  ptrdiff_t O_stride_split,
+  ptrdiff_t O_stride_b,
+  ptrdiff_t O_stride_m,
+  ptrdiff_t O_stride_g,
+  ptrdiff_t O_stride_h,
+  int32_t split_k
+) {
+  
+  // Each block handles a single batch and head and query and group
+  const int32_t b = blockIdx.x / (Q_size_m * Q_size_g * Q_size_h);
+  const int32_t m = (blockIdx.x / (Q_size_g * Q_size_h)) % Q_size_m;
+  const int32_t g = (blockIdx.x / Q_size_h) % Q_size_g;
+  const int32_t h = blockIdx.x % Q_size_h;
+
+  using data_t = scalar_t;
+  using data_vec_t = typename ck::vector_type<data_t, vec_size>::type;
+  using compute_vec_t = typename ck::vector_type<compute_t, vec_size>::type;
+
+  union {
+    data_vec_t vec;
+    data_t arr[vec_size];
+  } O_split_data;
+  union {
+    compute_vec_t vec;
+    compute_t arr[vec_size];
+  } O_split_compute;
+  union {
+    data_vec_t vec;
+    data_t arr[vec_size];
+  } global_O_data;
+  union {
+    compute_vec_t vec;
+    compute_t arr[vec_size];
+  } global_O_compute;
+
+  global_O_compute.vec = 0;
+
+  const int32_t lane_idx = threadIdx.x;
+  const bool lane_active_for_io = lane_idx * vec_size < Q_size_k;
+
+  if (!lane_active_for_io) {
+    return;
+  }
+
+  // for s in slices:
+  //       attn_slice = s["attn_slice"]
+  //       m = s["row_max"]
+  //       l = s["row_lse"]
+  //       m_new = torch.max(m, m_current_max)
+  //       assert not m_new.isnan().any(), "m_new is nan"
+  //       pick_new = m < m_current_max
+  //       pick_our = torch.logical_not(pick_new)
+
+  //       log_alpha = -torch.abs(m - m_current_max)
+  //       log_alpha[log_alpha.isnan()] = 0
+  //       alpha = torch.exp(log_alpha)
+  //       assert not alpha.isnan().any(), "alpha is nan"
+  //       out = out + attn_slice + (pick_our * out + pick_new * attn_slice) * (torch.sub(alpha, 1))
+  //       assert not out.isnan().any(), "out acc is nan"
+  //       l_current_sum = l_current_sum + l + (pick_our * l_current_sum + pick_new * l) * (torch.sub(alpha, 1))
+  //       assert not l_current_sum.isnan().any(), "l acc is nan"
+  //       m_current_max = m_new
+  //   out /= l_current_sum
+
+  compute_t new_max = 0;
+  compute_t global_sumexp = 0;
+  compute_t global_max = ck::NumericLimits<compute_t>::Lowest();
+
+  for (size_t split_idx = 0; split_idx < split_k; ++split_idx) {
+    load_v<data_t, data_vec_t>(O_splits 
+                               + b * O_stride_b 
+                               + m * O_stride_m 
+                               + g * O_stride_g 
+                               + h * O_stride_h
+                               + split_idx * O_stride_split, lane_idx, &O_split_data.vec);
+    #pragma unroll
+    for (int32_t i = 0; i < vec_size; ++i) {
+      O_split_compute.arr[i] = ck::type_convert<compute_t>(O_split_data.arr[i]);
+    }
+    compute_t local_max = *(split_max + blockIdx.x * split_k + split_idx); 
+    compute_t local_sumexp = *(split_sumexp + blockIdx.x * split_k + split_idx); 
+    new_max = ck::math::max(local_max, global_max);
+    bool pick_new = local_max < global_max;
+    compute_t log_alpha = -std::abs(local_max - global_max);
+    compute_t alpha = ck::math::exp(log_alpha);
+    compute_t pick_current_coef = (1 + (1 - pick_new) * (alpha - 1));
+    compute_t pick_new_coef = (1 + pick_new * (alpha - 1));
+    global_sumexp = pick_current_coef * global_sumexp + pick_new_coef * local_sumexp;
+    global_O_compute.vec = pick_current_coef * global_O_compute.vec + pick_new_coef * O_split_compute.vec;
+    global_max = new_max;
+  }
+  global_O_compute.vec /= global_sumexp;
+  #pragma unroll
+  for (int32_t i = 0; i < vec_size; ++i) {
+    global_O_data.arr[i] = ck::type_convert<data_t>(global_O_compute.arr[i]);
+  }
+  store_v<data_t, data_vec_t>(O + b * O_stride_b + m * O_stride_m + g * O_stride_g + h * O_stride_h, lane_idx, global_O_data.vec);
+}
+
+template <
+    typename scalar_t,
+    int32_t vec_size = 4,
+    int32_t n_loop_unroll = 16,
+    int32_t n_loop_unroll_tail = 2,
+    int32_t KV_M_MAX = 8192,
+    int32_t n_wavefronts_per_block = 16,
+    typename compute_t = float>
+__global__ void efficient_attention_forward_decoder_splitk_ck_kernel(
+    const scalar_t* __restrict__ XQ,
+    const scalar_t* __restrict__ cache_K,
+    const scalar_t* __restrict__ cache_V,
+    scalar_t* __restrict__ O_splits,
+    compute_t* __restrict__ split_max,
+    compute_t* __restrict__ split_sumexp,
+    const int32_t* __restrict__ seq_kv_lens,
+    const ptrdiff_t XQ_stride_b,
+    const ptrdiff_t XQ_stride_m,
+    const ptrdiff_t XQ_stride_g,
+    const ptrdiff_t XQ_stride_h,
+    const ptrdiff_t K_stride_b,
+    const ptrdiff_t K_stride_m,
+    const ptrdiff_t K_stride_g,
+    const ptrdiff_t K_stride_h,
+    const ptrdiff_t O_stride_split,
+    const int32_t Q_size_m,
+    const int32_t Q_size_g,
+    const int32_t Q_size_h,
+    const int32_t Q_size_k,
+    const int32_t K_size_m,
+    const bool multiquery,
+    const float qk_scale,
+    const int32_t split_k) {
+  static_assert(n_loop_unroll_tail < n_loop_unroll, "");
+
+  // Each block handles a single batch and head and query and group
+  const int32_t b = blockIdx.x / (Q_size_m * Q_size_g * Q_size_h);
+  const int32_t m = (blockIdx.x / (Q_size_g * Q_size_h)) % Q_size_m;
+  const int32_t g = (blockIdx.x / Q_size_h) % Q_size_g;
+  const int32_t h = blockIdx.x % Q_size_h;
+  const int32_t split_idx = blockIdx.y;
+
+  // Note: this is decoding case where we attend to current and all previous
+  // tokens.
+  const int32_t t_max = seq_kv_lens ? seq_kv_lens[b] : K_size_m;
+
+  const int32_t lane_idx = threadIdx.x;
+  const int32_t wavefront_idx = threadIdx.y;
+  const int32_t threads_per_wavefront = blockDim.x;
+  const int32_t wavefronts_per_block = blockDim.y;
+  const int32_t threads_per_block =
+      threads_per_wavefront * wavefronts_per_block;
+  const int32_t thread_linear_idx =
+      lane_idx + wavefront_idx * threads_per_wavefront;
+  // const auto* q_ = &(XQ_acc[b][m][g][h][0]);
+  const auto XQO_base_offset =
+      b * XQ_stride_b + m * XQ_stride_m + g * XQ_stride_g + h * XQ_stride_h;
+  const auto* __restrict__ q_ = XQ + XQO_base_offset;
+
+  const auto cache_KV_base_offset =
+      b * K_stride_b + 0 * K_stride_m + g * K_stride_g + (multiquery ? 0 : h * K_stride_h);
+  const auto* __restrict__ cache_K_base = cache_K + cache_KV_base_offset;
+  const auto* __restrict__ cache_V_base = cache_V + cache_KV_base_offset;
+
+  using data_t = scalar_t;
+  using data_vec_t = typename ck::vector_type<data_t, vec_size>::type;
+  using compute_vec_t = typename ck::vector_type<compute_t, vec_size>::type;
+
+  const bool lane_active_for_io = lane_idx * vec_size < Q_size_k;
+
+  extern __shared__ __align__(16) compute_t smem[];
+
+  data_vec_t q_thread = 0;
+  // Load Q into registers in all wavefronts.
+  // Each thread handles `vec_size` D dimensions
+  if (lane_active_for_io) {
+    load_v<data_t, data_vec_t>(q_, lane_idx, &q_thread);
+  }
+
+  compute_t max_qk_acc = ck::NumericLimits<compute_t>::Lowest();
+
+  // Compute S[0:t_max] = 
+  // ```
+  // for t in range(t_max): 
+  //   S[t] = dot(Q, K[t])
+  // ```
+  // Split the 0:t_max range across wavefronts in a block, 
+  // unroll loads to expose more parallelism.
+  // Reduce the dot product with cross-lane operation;
+  // Q and K[t] are in the registers of threads in a single wavefront.
+
+  data_vec_t k_loads[n_loop_unroll] = {};
+
+  constexpr auto dtt = n_wavefronts_per_block * n_loop_unroll;
+  const auto n_unrolled_loops = t_max / dtt / split_k; // +1?
+  const int32_t tt_low = wavefront_idx * n_loop_unroll + n_unrolled_loops * dtt * split_idx;
+  const int32_t tt_high = wavefront_idx * n_loop_unroll + n_unrolled_loops * dtt * (split_idx + 1);
+  const int32_t dtt_tail = n_wavefronts_per_block * n_loop_unroll_tail;
+  const int32_t tt_tail_low = wavefront_idx * n_loop_unroll_tail + wavefront_idx * n_loop_unroll_tail + n_unrolled_loops * dtt * (split_idx + 1);
+  const int32_t tt_tail_high = (split_idx == split_k - 1) ? t_max : tt_tail_low;
+  const int32_t t_max_unroll = (t_max / dtt) * dtt;
+
+  for (auto tt = tt_low; tt < tt_high; tt += dtt) {
+    if (lane_active_for_io) {
+#pragma unroll n_loop_unroll
+      for (auto ttt = 0; ttt < n_loop_unroll; ++ttt) {
+        const int32_t t = tt + ttt;
+        // load the K[b][t][g][h|0][:] row into registers
+        load_v<data_t, data_vec_t>(
+            cache_K_base + t * K_stride_m, lane_idx, &k_loads[ttt]);
+      }
+    }
+    compute_t qk_accs[n_loop_unroll] = {};
+#pragma unroll n_loop_unroll
+    for (auto ttt = 0; ttt < n_loop_unroll; ++ttt) {
+      ck::inner_product<data_vec_t, data_vec_t, compute_t>(
+          q_thread, k_loads[ttt], qk_accs[ttt]);
+      qk_accs[ttt] *= qk_scale;
+
+      qk_accs[ttt] =
+          wavefrontReduce(qk_accs[ttt], [](auto a, auto b) { return a + b; });
+      max_qk_acc = ck::math::max(qk_accs[ttt], max_qk_acc);
+    }
+    if (lane_idx == 0) {
+      auto* __restrict__ smem_base = smem + tt;
+#pragma unroll n_loop_unroll
+      for (auto ttt = 0; ttt < n_loop_unroll; ++ttt) {
+        smem_base[ttt] = qk_accs[ttt];
+      }
+    }
+  }
+
+  // NB: the length of the tail is <= (wavefronts_per_block * n_loop_unroll)
+  for (auto tt = tt_tail_low; tt < tt_tail_high; tt += dtt_tail) {
+    if (lane_active_for_io) {
+#pragma unroll n_loop_unroll_tail
+      for (auto ttt = 0; ttt < n_loop_unroll_tail; ++ttt) {
+        const int32_t t = tt + ttt;
+        if (t < t_max) {
+          // load the K[b][t][g][h|0][:] row into registers
+          load_v<data_t, data_vec_t>(
+              cache_K_base + t * K_stride_m, lane_idx, &k_loads[ttt]);
+        }
+      }
+    }
+#pragma unroll n_loop_unroll_tail
+    for (auto ttt = 0; ttt < n_loop_unroll_tail; ++ttt) {
+      compute_t qk_acc = 0;
+      const int32_t t = tt + ttt;
+      if (t < t_max) {
+        ck::inner_product<data_vec_t, data_vec_t, compute_t>(
+            q_thread, k_loads[ttt], qk_acc);
+        qk_acc *= qk_scale;
+
+        qk_acc = wavefrontReduce(qk_acc, [](auto a, auto b) { return a + b; });
+        max_qk_acc = ck::math::max(qk_acc, max_qk_acc);
+
+        // write accumulated sums to smem.
+        if (lane_idx == 0) {
+          smem[t] = qk_acc;
+        }
+      }
+    }
+  }
+
+  // Use shared reduction to compute max and compute softmax on shared memory.
+  // write max acc
+  if (lane_idx == 0) {
+    smem[KV_M_MAX + wavefront_idx] = max_qk_acc;
+  }
+  __syncthreads();
+  if (lane_idx < wavefronts_per_block) {
+    max_qk_acc = ck::math::max(max_qk_acc, smem[KV_M_MAX + lane_idx]);
+  }
+  // shared across all threads in block
+  max_qk_acc =
+      wavefrontReduce(max_qk_acc, [](auto a, auto b) { return a > b ? a : b; });
+
+  split_max[blockIdx.x * split_k + split_idx] = max_qk_acc;
+
+  // each wavefront computes partial sum of exp.
+  compute_t softmax_denominator = 0.0f;
+  for (int32_t t = thread_linear_idx; t < t_max; t += threads_per_block) {
+    softmax_denominator += ck::math::exp(smem[t] - max_qk_acc);
+  }
+  softmax_denominator = wavefrontReduce(
+      softmax_denominator, [](auto a, auto b) { return a + b; });
+
+  if (lane_idx == 0) {
+    smem[KV_M_MAX + wavefront_idx] = softmax_denominator;
+  }
+  __syncthreads();
+
+  // now, compute sum of exp(x - max(x)) over all intermediate results.
+  softmax_denominator = 0.0;
+  if (lane_idx < wavefronts_per_block) {
+    softmax_denominator = smem[KV_M_MAX + lane_idx];
+  }
+  softmax_denominator = wavefrontReduce(
+      softmax_denominator, [](auto a, auto b) { return a + b; });
+
+  split_sumexp[blockIdx.x * split_k + split_idx] = softmax_denominator;
+  // or maybe after scaling?
+  
+  const compute_t softmax_scale_factor = 1. / softmax_denominator;
+  // now, compute the normalization across all threads.
+  for (int32_t t = thread_linear_idx; t < t_max; t += threads_per_block) {
+    smem[t] = ck::math::exp(smem[t] - max_qk_acc) * softmax_scale_factor;
+  }
+  __syncthreads();
+
+  // Split T across wavefronts in a block
+  // each wavefront compute sum(t_subset) P[t] * V[t_subset, d]
+  // outputs are of size float[D]
+
+  compute_t ps[n_loop_unroll] = {};
+  compute_vec_t o_acc = 0;
+  if (lane_active_for_io) {
+    for (auto tt = tt_low; tt < tt_high; tt += dtt) {
+#pragma unroll n_loop_unroll
+      for (auto ttt = 0; ttt < n_loop_unroll; ++ttt) {
+        const int32_t t = tt + ttt;
+        // load the V[b][t][g][h|0][:] row into registers, reusing K register
+        // storage
+        load_v<data_t, data_vec_t>(
+            cache_V_base + t * K_stride_m, lane_idx, &k_loads[ttt]);
+        ps[ttt] = smem[t];
+      }
+
+#pragma unroll n_loop_unroll
+      for (auto ttt = 0; ttt < n_loop_unroll; ++ttt) {
+        o_acc =
+            scalar_scale_acc<data_t, vec_size>(o_acc, k_loads[ttt], ps[ttt]);
+      }
+    }
+
+    for (auto tt = tt_tail_low; tt < tt_tail_high; tt += dtt_tail) {
+#pragma unroll n_loop_unroll_tail
+      for (auto ttt = 0; ttt < n_loop_unroll_tail; ++ttt) {
+        const int32_t t = tt + ttt;
+        if (t < t_max) {
+          // load the V[b][t][g][h|0][:] row into registers, reusing K register
+          // storage
+          load_v<data_t, data_vec_t>(
+              cache_V_base + t * K_stride_m, lane_idx, &k_loads[ttt]);
+          ps[ttt] = smem[t];
+        }
+      }
+
+#pragma unroll n_loop_unroll_tail
+      for (auto ttt = 0; ttt < n_loop_unroll_tail; ++ttt) {
+        const int32_t t = tt + ttt;
+        if (t < t_max) {
+          o_acc =
+              scalar_scale_acc<data_t, vec_size>(o_acc, k_loads[ttt], ps[ttt]);
+        }
+      }
+    }
+  }
+  // now, each thread has partial sums. Write to smem and get accumulated
+  // results back.
+  __syncthreads();
+
+  // NB: needs sizeof(smem) >= `vec_size` * (sizeof(float)==4) * threadsPerBlock
+  if (lane_active_for_io) {
+    store_v<compute_t, compute_vec_t>(&smem[0], thread_linear_idx, o_acc);
+  }
+
+  __syncthreads();
+  // sum up partial D rows from other wavefronts
+  if (wavefront_idx == 0 && lane_active_for_io) {
+    union {
+      compute_vec_t vec = 0;
+      compute_t arr[vec_size];
+    } r;
+    for (int32_t w = 0; w < wavefronts_per_block; ++w) {
+      compute_vec_t partial_r;
+      load_v<compute_t, compute_vec_t>(
+          smem, w * threads_per_wavefront + lane_idx, &partial_r);
+      r.vec += partial_r;
+    }
+    // elementwise convert from compute_t result to data_t out to be written
+    union {
+      data_vec_t vec;
+      data_t arr[vec_size];
+    } bf_r;
+#pragma unroll
+    for (int32_t i = 0; i < vec_size; ++i) {
+      bf_r.arr[i] = ck::type_convert<data_t>(r.arr[i]);
+    }
+    // write output row O[b][m][g][h][:]
+    data_t* __restrict__ o_ = O_splits + XQO_base_offset + split_idx * O_stride_split;
+    store_v<data_t, data_vec_t>(o_, lane_idx, bf_r.vec);
+  }
+}
+
+} // namespace
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+template <typename scalar_t, typename compute_t = float>
+struct FMHADecoderSplitKDeviceOp : public BaseOperator {
+  using DeviceOp = FMHADecoderSplitKDeviceOp;
+  struct Argument : public BaseArgument {
+    const scalar_t* __restrict__ XQ;
+    const scalar_t* __restrict__ cache_K;
+    const scalar_t* __restrict__ cache_V;
+    scalar_t* __restrict__ O;
+    scalar_t* __restrict__ split_O;
+    compute_t* __restrict__ split_max;
+    compute_t* __restrict__ split_sumexp;
+    const int32_t* __restrict__ seq_kv_lens;
+    const ptrdiff_t XQ_stride_b;
+    const ptrdiff_t XQ_stride_m;
+    const ptrdiff_t XQ_stride_g;
+    const ptrdiff_t XQ_stride_h;
+    const ptrdiff_t K_stride_b;
+    const ptrdiff_t K_stride_m;
+    const ptrdiff_t K_stride_g;
+    const ptrdiff_t K_stride_h;
+    const ptrdiff_t O_stride_split;
+    const int32_t Q_size_m;
+    const int32_t Q_size_g;
+    const int32_t Q_size_h;
+    const int32_t Q_size_k;
+    const int32_t K_size_m;
+    const bool multiquery;
+    const float qk_scale;
+    const int32_t split_k;
+
+    const dim3 grid_dim;
+    const dim3 block_dim;
+    const size_t lds_bytes;
+
+    Argument(
+        const scalar_t* __restrict__ XQ,
+        const scalar_t* __restrict__ cache_K,
+        const scalar_t* __restrict__ cache_V,
+        scalar_t* __restrict__ O,
+        scalar_t* __restrict__ split_O,
+        compute_t* __restrict__ split_max,
+        compute_t* __restrict__ split_sumexp,
+        const int32_t* __restrict__ seq_kv_lens,
+        const ptrdiff_t XQ_stride_b,
+        const ptrdiff_t XQ_stride_m,
+        const ptrdiff_t XQ_stride_g,
+        const ptrdiff_t XQ_stride_h,
+        const ptrdiff_t K_stride_b,
+        const ptrdiff_t K_stride_m,
+        const ptrdiff_t K_stride_g,
+        const ptrdiff_t K_stride_h,
+        const ptrdiff_t O_stride_split,
+        const int32_t Q_size_m,
+        const int32_t Q_size_g,
+        const int32_t Q_size_h,
+        const int32_t Q_size_k,
+        const int32_t K_size_m,
+        const bool multiquery,
+        const float qk_scale,
+        const int32_t split_k,
+        // launch params
+        const dim3 grid_dim,
+        const dim3 block_dim,
+        const size_t lds_bytes)
+        : XQ(XQ),
+          cache_K(cache_K),
+          cache_V(cache_V),
+          O(O),
+          split_O(split_O),
+          split_max(split_max),
+          split_sumexp(split_sumexp),
+          seq_kv_lens(seq_kv_lens),
+          XQ_stride_b(XQ_stride_b),
+          XQ_stride_m(XQ_stride_m),
+          XQ_stride_g(XQ_stride_g),
+          XQ_stride_h(XQ_stride_h),
+          K_stride_b(K_stride_b),
+          K_stride_m(K_stride_m),
+          K_stride_g(K_stride_g),
+          K_stride_h(K_stride_h),
+          O_stride_split(O_stride_split),
+          Q_size_m(Q_size_m),
+          Q_size_g(Q_size_g),
+          Q_size_h(Q_size_h),
+          Q_size_k(Q_size_k),
+          K_size_m(K_size_m),
+          multiquery(multiquery),
+          qk_scale(qk_scale),
+          split_k(split_k),
+          // launch params
+          grid_dim(grid_dim),
+          block_dim(block_dim),
+          lds_bytes(lds_bytes) {}
+  };
+
+  struct Invoker : public BaseInvoker {
+    using Argument = DeviceOp::Argument;
+    float Run(
+        const Argument& arg,
+        const StreamConfig& stream_config = StreamConfig{}) {
+      auto threads_per_wavefront = arg.block_dim.x;
+
+      auto Q_size_k_alignment_necessary = 0;
+
+      for (auto vec_size : {4, 2, 1}) {
+        if (arg.Q_size_k <= vec_size * threads_per_wavefront) {
+          Q_size_k_alignment_necessary = vec_size;
+        }
+      }
+
+      if (!Q_size_k_alignment_necessary) {
+        throw std::runtime_error("Unsupported Q_size_k");
+      }
+
+      if (arg.Q_size_k % Q_size_k_alignment_necessary) {
+        throw std::runtime_error("Unsupported alignment for Q_size_k");
+      }
+
+      float split_attention_result = launch_and_time_kernel(
+          stream_config,
+          Q_size_k_alignment_necessary == 4
+              ? efficient_attention_forward_decoder_splitk_ck_kernel<scalar_t, 4>
+              : Q_size_k_alignment_necessary == 2
+              ? efficient_attention_forward_decoder_splitk_ck_kernel<scalar_t, 2>
+              : Q_size_k_alignment_necessary == 1
+              ? efficient_attention_forward_decoder_splitk_ck_kernel<scalar_t, 1>
+              : nullptr,
+          arg.grid_dim,
+          arg.block_dim,
+          arg.lds_bytes,
+          arg.XQ,
+          arg.cache_K,
+          arg.cache_V,
+          arg.split_O,
+          arg.split_max,
+          arg.split_sumexp,
+          arg.seq_kv_lens,
+          arg.XQ_stride_b,
+          arg.XQ_stride_m,
+          arg.XQ_stride_g,
+          arg.XQ_stride_h,
+          arg.K_stride_b,
+          arg.K_stride_m,
+          arg.K_stride_g,
+          arg.K_stride_h,
+          arg.O_stride_split,
+          arg.Q_size_m,
+          arg.Q_size_g,
+          arg.Q_size_h,
+          arg.Q_size_k,
+          arg.K_size_m,
+          arg.multiquery,
+          arg.qk_scale,
+          arg.split_k);
+
+        const dim3 reduce_gridsize = {arg.grid_dim.x};
+        const dim3 reduce_blocksize = {arg.block_dim.x};
+        constexpr int32_t reduce_lds_bytes = 0;
+
+        float reduce_result = launch_and_time_kernel(
+          stream_config,
+          Q_size_k_alignment_necessary == 4
+              ? efficient_attention_forward_decoder_splitk_reduce_ck_kernel<scalar_t, 4>
+              : Q_size_k_alignment_necessary == 2
+              ? efficient_attention_forward_decoder_splitk_reduce_ck_kernel<scalar_t, 2>
+              : Q_size_k_alignment_necessary == 1
+              ? efficient_attention_forward_decoder_splitk_reduce_ck_kernel<scalar_t, 1>
+              : nullptr,
+          reduce_gridsize,
+          reduce_blocksize,
+          reduce_lds_bytes, 
+          arg.split_O,
+          arg.split_max,
+          arg.split_sumexp,
+          arg.O,
+          arg.Q_size_m,
+          arg.Q_size_g,
+          arg.Q_size_h,
+          arg.Q_size_k,
+          arg.O_stride_split,
+          arg.XQ_stride_b,
+          arg.XQ_stride_m,
+          arg.XQ_stride_g,
+          arg.XQ_stride_h,
+          arg.split_k
+        );
+        return split_attention_result + reduce_result;
+    }
+  };
+};
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
\ No newline at end of file
diff --git a/xformers/ops/fmha/forward_splitk.py b/xformers/ops/fmha/forward_splitk.py
index ff85d5f2d..f67fceb0c 100644
--- a/xformers/ops/fmha/forward_splitk.py
+++ b/xformers/ops/fmha/forward_splitk.py
@@ -13,7 +13,7 @@ class FwOp(AttentionFwOpBase):
         torch.half,
         torch.bfloat16,
     }  # Those are dtypes of Q. In the quantized case K/V has dtype int32
-    SUPPORTED_MAX_K = 128
+    SUPPORTED_MAX_K = 256
     SUPPORTED_ATTN_BIAS_TYPES: Set[Any] = {
         type(None),
         BlockDiagonalCausalWithOffsetPaddedKeysMask,
@@ -34,8 +34,8 @@ def shape_not_supported_reasons(
         cls, Mq: int, Mkv: int, K: int, Kv: int
     ) -> List[str]:
         reasons = super().shape_not_supported_reasons(Mq, Mkv, K, Kv)
-        if K not in {16, 32, 64, 128}:
-            reasons.append(f"Embed dim {K} not supported")
+        # if K not in {16, 32, 64, 128}:
+        #     reasons.append(f"Embed dim {K} not supported")
         return reasons
 
     @classmethod
@@ -99,6 +99,8 @@ def apply(
         
         if attn_bias is not None:
             assert isinstance(attn_bias, BlockDiagonalCausalWithOffsetPaddedKeysMask)
+            attn_bias.k_seqinfo.to(k.device)
+            attn_bias.q_seqinfo.to(q.device)
             seq_len = attn_bias.k_seqinfo.seqlen
             B = len(seq_len)
             G, H, Kq = q.shape[-3:]
@@ -145,7 +147,48 @@ def apply(
         else:
             qk_scale = torch.rsqrt(torch.tensor(k.shape[-1], dtype=torch.float32))
 
+        print(f"{q.shape=} {k.shape=} {v.shape=}")
+
         out = cls.OPERATOR(query=q, key=k, value=v, seq_positions=seq_len, scale=qk_scale, split_k=split_k)
 
         return out, None
 
+
+class FwOp_S1(FwOp):
+    SPLIT_K = 1
+    NAME = "ck_splitK1"
+
+
+class FwOp_S2(FwOp):
+    SPLIT_K = 2
+    NAME = "ck_splitK2"
+
+
+class FwOp_S4(FwOp):
+    SPLIT_K = 4
+    NAME = "ck_splitK4"
+
+
+class FwOp_S8(FwOp):
+    SPLIT_K = 8
+    NAME = "ck_splitK8"
+
+
+class FwOp_S16(FwOp):
+    SPLIT_K = 16
+    NAME = "ck_splitK16"
+
+
+class FwOp_S32(FwOp):
+    SPLIT_K = 32
+    NAME = "ck_splitK32"
+
+
+class FwOp_S64(FwOp):
+    SPLIT_K = 64
+    NAME = "ck_splitK64"
+
+
+class FwOp_S128(FwOp):
+    SPLIT_K = 128
+    NAME = "ck_splitK128"

From bc2333331f3f2d95e2eabb350a092616bf320bbf Mon Sep 17 00:00:00 2001
From: Max Podkorytov <4273004+tenpercent@users.noreply.github.com>
Date: Tue, 5 Dec 2023 18:30:26 -0500
Subject: [PATCH 282/641] add option to build a standalone runner for splitk
 decoder; debugging numerics in reduction

---
 tests/test_mem_eff_attention_ck.py            |   8 +-
 .../csrc/attention/hip_fmha/CMakeLists.txt    |  51 +++++-
 .../hip_fmha/attention_forward_splitk.cpp     | 149 +++++++++++++++++-
 .../ck_attention_forward_decoder_splitk.h     |  12 +-
 4 files changed, 206 insertions(+), 14 deletions(-)

diff --git a/tests/test_mem_eff_attention_ck.py b/tests/test_mem_eff_attention_ck.py
index e34451842..073adcc4d 100644
--- a/tests/test_mem_eff_attention_ck.py
+++ b/tests/test_mem_eff_attention_ck.py
@@ -1864,6 +1864,10 @@ def test_decoder(
         q, k, v, attn_bias, op=op
     )
     
+    print(f"{decoder_output.shape=}")
+    nans_in_result = torch.sum(torch.isnan(decoder_output))
+    print(f"{nans_in_result=}")
+
     ref_output = ref_attention(q, k, v, attn_bias, dtype=dtype_)
 
     assert_allclose(
@@ -1881,12 +1885,12 @@ def _kv_heads_label(kv_heads: Optional[int]) -> str:
     return f"gqa{kv_heads}"
 
 
-@pytest.mark.parametrize("op", [fmha.forward_splitk.FwOp])
+@pytest.mark.parametrize("op", [fmha.forward_splitk.FwOp_S1, fmha.forward_splitk.FwOp_S2])
 @pytest.mark.parametrize("dtype", ["f16"])
 @pytest.mark.parametrize("kv_heads", [None, 1, 2], ids=_kv_heads_label)
 @pytest.mark.parametrize("n_heads", [16])
 @pytest.mark.parametrize("padding, bsz", [(32, 8), (4096, 1)])
-def test_triton_splitk_decoder(
+def test_splitk_decoder(
     op,
     kv_heads: Optional[int],
     n_heads: int,
diff --git a/xformers/csrc/attention/hip_fmha/CMakeLists.txt b/xformers/csrc/attention/hip_fmha/CMakeLists.txt
index a95c68fbe..056bb06bb 100644
--- a/xformers/csrc/attention/hip_fmha/CMakeLists.txt
+++ b/xformers/csrc/attention/hip_fmha/CMakeLists.txt
@@ -9,15 +9,17 @@ set(CMAKE_CXX_STANDARD_REQUIRED ON)
 set(CMAKE_CXX_EXTENSIONS OFF)
 
 set(exe_name attention_forward_decoder_main)
+set(splitk_exe_name attention_forward_splitk_decoder_main)
 set(project_root_dir /xformers)
 set(xformers_csrc ${project_root_dir}/xformers/csrc)
 set(sources ${xformers_csrc}/attention/hip_fmha/attention_forward_decoder.hip)
-
+set(splitk_sources ${xformers_csrc}/attention/hip_fmha/attention_forward_splitk.hip)
 set(ck_include ${project_root_dir}/third_party/composable_kernel/include/)
 set(torch_include /opt/conda/envs/py_3.8/lib/python3.8/site-packages/torch/include)
 
-set_source_files_properties(${sources} PROPERTIES LANGUAGE HIP)
+set_source_files_properties(${sources} ${splitk_sources} PROPERTIES LANGUAGE HIP)
 add_executable(${exe_name} ${sources})
+add_executable(${splitk_exe_name} ${splitk_sources})
 
 find_package(HIP REQUIRED)
 find_package(ROCM REQUIRED PATHS /opt/rocm)
@@ -25,9 +27,9 @@ include(ROCMInstallTargets)
 
 message("HIP_VERSION: ${HIP_VERSION_MAJOR}.${HIP_VERSION_MINOR}.${HIP_VERSION_PATCH}")
 
-set_target_properties(${exe_name} PROPERTIES LINKER_LANGUAGE CXX)
-set_target_properties(${exe_name} PROPERTIES POSITION_INDEPENDENT_CODE ON)
-set_target_properties(${exe_name} PROPERTIES HIP_ARCHITECTURES ${GPU_TARGETS})
+set_target_properties(${exe_name} ${splitk_exe_name} PROPERTIES LINKER_LANGUAGE CXX)
+set_target_properties(${exe_name} ${splitk_exe_name} PROPERTIES POSITION_INDEPENDENT_CODE ON)
+set_target_properties(${exe_name} ${splitk_exe_name} PROPERTIES HIP_ARCHITECTURES ${GPU_TARGETS})
 
 target_compile_options(${exe_name} PUBLIC 
   -fno-gpu-rdc
@@ -36,17 +38,35 @@ target_compile_options(${exe_name} PUBLIC
   >
 )
 
+target_compile_options(${splitk_exe_name} PUBLIC 
+  -fno-gpu-rdc
+  $<$<CONFIG:Debug>:
+  --save-temps
+  >
+)
+
 target_include_directories(${exe_name} PUBLIC 
   ${ck_include}                           # ck includes
   ${torch_include}                        # aten includes
   ${torch_include}/torch/csrc/api/include # torch includes
 )
 
+target_include_directories(${splitk_exe_name} PUBLIC 
+  ${ck_include}                           # ck includes
+  ${torch_include}                        # aten includes
+  ${torch_include}/torch/csrc/api/include # torch includes
+)
+
 target_link_directories(${exe_name} PUBLIC
   /opt/conda/envs/py_3.8/lib/python3.8/site-packages/torch/lib # c10, torch
   /opt/rocm/hip/lib
 )
 
+target_link_directories(${splitk_exe_name} PUBLIC
+  /opt/conda/envs/py_3.8/lib/python3.8/site-packages/torch/lib # c10, torch
+  /opt/rocm/hip/lib
+)
+
 target_link_libraries(${exe_name} PUBLIC
   c10
   c10_hip
@@ -56,6 +76,16 @@ target_link_libraries(${exe_name} PUBLIC
   amdhip64
 )
 
+
+target_link_libraries(${splitk_exe_name} PUBLIC
+  c10
+  c10_hip
+  torch
+  torch_hip
+  torch_cpu
+  amdhip64
+)
+
 target_compile_definitions(${exe_name} PUBLIC 
   ATTN_FWD_DECODER_MAIN=1
   GLIBCXX_USE_CXX11_ABI=1
@@ -63,8 +93,15 @@ target_compile_definitions(${exe_name} PUBLIC
   USE_ROCM=1
 )
 
+target_compile_definitions(${splitk_exe_name} PUBLIC 
+  ATTN_FWD_SPLITK_DECODER_MAIN=1
+  GLIBCXX_USE_CXX11_ABI=1
+  __HIP_PLATFORM_HCC__=1
+  USE_ROCM=1
+)
+
 include(CMakePrintHelpers)
-cmake_print_properties(TARGETS ${exe_name} PROPERTIES 
+cmake_print_properties(TARGETS ${exe_name} ${splitk_exe_name} PROPERTIES 
   LINK_LIBRARIES 
   LINK_DIRECTORIES 
   INCLUDE_DIRECTORIES 
@@ -73,4 +110,4 @@ cmake_print_properties(TARGETS ${exe_name} PROPERTIES
   SOURCES
   HIP_ARCHITECTURES)
 
-rocm_install(TARGETS ${exe_name})
\ No newline at end of file
+rocm_install(TARGETS ${exe_name} ${splitk_exe_name})
\ No newline at end of file
diff --git a/xformers/csrc/attention/hip_fmha/attention_forward_splitk.cpp b/xformers/csrc/attention/hip_fmha/attention_forward_splitk.cpp
index 1dad0fa61..f0406b522 100644
--- a/xformers/csrc/attention/hip_fmha/attention_forward_splitk.cpp
+++ b/xformers/csrc/attention/hip_fmha/attention_forward_splitk.cpp
@@ -259,4 +259,151 @@ TORCH_LIBRARY_IMPL(xformers, CUDA, m) {
 }
 
 #undef AT_DISPATCH_CASE_3
-#undef AT_DISPATCH_SWITCH_3
\ No newline at end of file
+#undef AT_DISPATCH_SWITCH_3
+
+#ifdef ATTN_FWD_SPLITK_DECODER_MAIN
+
+#include <torch/torch.h>
+
+// clang-format off
+
+/*
+
+(1) hipify
+ > pip install -e /xformers
+
+ For obtaining all the library paths needed for compilation below, add `--verbose`.
+ For efficient utilization of CPU cores for compilation use MAX_JOBS env variable.
+
+(2) compile
+ > mkdir build
+ > cd build
+ > cmake /xformers/xformers/csrc/attention/hip_fmha/ \
+       -DCMAKE_CXX_COMPILER=/opt/rocm/bin/hipcc \
+       -D CMAKE_PREFIX_PATH=/opt/rocm \
+       -D CMAKE_BUILD_TYPE=Debug \
+       -D GPU_TARGETS="native" 
+  > make
+
+(3a) run correctness check
+ > ./attention_forward_splitk_decoder_main
+
+(3b) run specific input shape
+ > ./attention_forward_splitk_decoder_main n_keys padding batch_size n_heads is_multiquery dtype n_wavefronts_per_block
+*/
+
+// clang-format on
+
+static void do_correctness_check() {
+  const int32_t D = 4 * kThreadsPerWavefront;
+  const int32_t B = 1;
+  const int32_t H = 4;
+  const int32_t G = 1;
+  auto options = torch::TensorOptions()
+                     .dtype(torch::kFloat32)
+                     .layout(torch::kStrided)
+                     .device(torch::kCUDA, 1)
+                     .requires_grad(false);
+  auto int_options = options.dtype(torch::kInt);
+  auto XQ = at::randn({B, 1, G, H, D}, options);
+  auto K = at::randn({B, 4096, G, H, D}, options);
+  auto V = at::randn({B, 4096, G, H, D}, options);
+  auto seq = at::randint(63, 128, {B}, int_options);
+  double qk_scale = 1. / sqrt(D);
+  constexpr auto split_k = 1;
+
+  auto result = efficient_attention_forward_decoder_splitk_ck_impl<64, 1>(
+      XQ, K, V, seq, qk_scale, split_k);
+  auto gold_result = efficient_attention_forward_decoder_splitk_ck_impl<64, 2>(
+      XQ, K, V, seq, qk_scale, split_k);
+  auto mask = at::isclose(
+      result, gold_result, /*atol*/ 1e-3, /*rtol*/ 1e-5, /*equal_nan*/ false);
+  auto percent_match = at::sum(mask.to(torch::kFloat32)) / mask.numel();
+  printf(
+      "Mismatched elements percentage: %.2f\n",
+      1. - percent_match.item<float>());
+}
+
+int main(int argc, char** argv) {
+  if (argc == 1) {
+    do_correctness_check();
+  } else {
+    const auto args = std::vector<std::string>(argv + 1, argv + argc);
+    if (args.size() != 7) {
+      std::cout
+          << "Usage: ./a.out n_keys padding batch_size n_heads is_multiquery dtype n_wavefronts_per_block"
+          << std::endl;
+      return 0;
+    }
+    const int32_t n_keys = std::stoi(args[0]);
+    const int32_t padding = std::stoi(args[1]);
+    const int32_t batch_size = std::stoi(args[2]);
+    const int32_t n_heads = std::stoi(args[3]);
+    const int32_t n_groups = 1;
+    const int32_t multiquery = (args[4] == "mq");
+    const auto dtype = (args[5] == "f32") ? torch::kFloat32
+        : (args[5] == "f16")              ? torch::kFloat16
+                                          : torch::kBFloat16;
+    const int32_t n_wavefronts_per_block = std::stoi(args[6]);
+
+    const int32_t dim_per_head = 4 * kThreadsPerWavefront;
+
+    const auto options = torch::TensorOptions()
+                             .dtype(dtype)
+                             .layout(torch::kStrided)
+                             .device(torch::kCUDA, 1)
+                             .requires_grad(false);
+
+    const auto int_options = options.dtype(torch::kInt);
+    const auto Q = at::rand({batch_size, 1, n_groups, n_heads, dim_per_head}, options);
+    const auto K = multiquery
+        ? at::rand({batch_size, padding, n_groups, 1, dim_per_head}, options)
+              .expand({batch_size, padding, n_groups, n_heads, dim_per_head})
+        : at::rand({batch_size, padding, n_groups, n_heads, dim_per_head}, options);
+    const auto V = at::rand_like(K);
+    auto O = at::empty_like(Q);
+
+    constexpr auto splitk_dim = 0;
+    constexpr auto split_k = 1;
+    auto O_splits = at::stack(O, splitk_dim);
+
+    auto split_max = at::empty({batch_size, padding, n_groups, n_heads, split_k}, options.dtype(at::kFloat));
+    auto split_sumexp = at::empty_like(split_max);
+
+    const auto seq = at::randint(1, n_keys, {batch_size}, int_options);
+    const double qk_scale = 1. / sqrt(dim_per_head);
+    auto call_ptr = decltype(&efficient_attention_forward_decoder_splitk_ck_out_impl<
+                             kThreadsPerWavefront,
+                             kWavefrontsPerBlock>){};
+
+#define SWITCH_CASE_SET_CALLPTR(n)                               \
+  case (n):                                                      \
+    call_ptr = &efficient_attention_forward_decoder_splitk_ck_out_impl< \
+        kThreadsPerWavefront,                                    \
+        (n)>;                                                    \
+    break;
+
+    switch (n_wavefronts_per_block) {
+      SWITCH_CASE_SET_CALLPTR(1);
+      SWITCH_CASE_SET_CALLPTR(2);
+      SWITCH_CASE_SET_CALLPTR(4);
+      SWITCH_CASE_SET_CALLPTR(8);
+      SWITCH_CASE_SET_CALLPTR(16);
+
+      default:
+        call_ptr = nullptr;
+        break;
+    }
+#undef SWITCH_CASE_SET_CALLPTR
+
+    if (call_ptr) {
+      call_ptr(Q, K, V, seq, qk_scale, split_k, split_max, split_sumexp, O_splits, O);
+    } else {
+      std::cout << "Warning: no kernel was found for wavefronts_per_block="
+                << n_wavefronts_per_block << std::endl;
+    }
+  }
+  return 0;
+}
+
+#endif // MAIN
\ No newline at end of file
diff --git a/xformers/csrc/attention/hip_fmha/ck_attention_forward_decoder_splitk.h b/xformers/csrc/attention/hip_fmha/ck_attention_forward_decoder_splitk.h
index b093a57f0..e7421c7c3 100644
--- a/xformers/csrc/attention/hip_fmha/ck_attention_forward_decoder_splitk.h
+++ b/xformers/csrc/attention/hip_fmha/ck_attention_forward_decoder_splitk.h
@@ -189,7 +189,7 @@ __global__ void efficient_attention_forward_decoder_splitk_reduce_ck_kernel(
                                + m * O_stride_m 
                                + g * O_stride_g 
                                + h * O_stride_h
-                               + split_idx * O_stride_split, lane_idx, &O_split_data.vec);
+                               + split_idx * O_stride_split, lane_idx, &O_split_data.vec);                       
     #pragma unroll
     for (int32_t i = 0; i < vec_size; ++i) {
       O_split_compute.arr[i] = ck::type_convert<compute_t>(O_split_data.arr[i]);
@@ -199,11 +199,16 @@ __global__ void efficient_attention_forward_decoder_splitk_reduce_ck_kernel(
     new_max = ck::math::max(local_max, global_max);
     bool pick_new = local_max < global_max;
     compute_t log_alpha = -std::abs(local_max - global_max);
-    compute_t alpha = ck::math::exp(log_alpha);
+    compute_t alpha = isnan(log_alpha) ? compute_t{1} : ck::math::exp(log_alpha);
+    // assert(!isnan(alpha));
+    // assert(isnan(alpha));
     compute_t pick_current_coef = (1 + (1 - pick_new) * (alpha - 1));
+    // assert(!isnan(pick_current_coef));
     compute_t pick_new_coef = (1 + pick_new * (alpha - 1));
+    // assert(!isnan(pick_new_coef));
     global_sumexp = pick_current_coef * global_sumexp + pick_new_coef * local_sumexp;
-    global_O_compute.vec = pick_current_coef * global_O_compute.vec + pick_new_coef * O_split_compute.vec;
+    // global_O_compute.vec = pick_current_coef * global_O_compute.vec + pick_new_coef * O_split_compute.vec;
+    global_O_compute.vec = O_split_compute.vec;
     global_max = new_max;
   }
   global_O_compute.vec /= global_sumexp;
@@ -673,7 +678,6 @@ struct FMHADecoderSplitKDeviceOp : public BaseOperator {
         const dim3 reduce_gridsize = {arg.grid_dim.x};
         const dim3 reduce_blocksize = {arg.block_dim.x};
         constexpr int32_t reduce_lds_bytes = 0;
-
         float reduce_result = launch_and_time_kernel(
           stream_config,
           Q_size_k_alignment_necessary == 4

From 2c7b9bbfded2379d546ffbcb9804ad0fcb0aec1d Mon Sep 17 00:00:00 2001
From: Max Podkorytov <4273004+tenpercent@users.noreply.github.com>
Date: Tue, 5 Dec 2023 19:43:49 -0500
Subject: [PATCH 283/641] fix a few bugs

---
 .../hip_fmha/attention_forward_splitk.cpp     |  7 +++---
 .../ck_attention_forward_decoder_splitk.h     | 22 +++++++++----------
 2 files changed, 14 insertions(+), 15 deletions(-)

diff --git a/xformers/csrc/attention/hip_fmha/attention_forward_splitk.cpp b/xformers/csrc/attention/hip_fmha/attention_forward_splitk.cpp
index f0406b522..5998f3fc8 100644
--- a/xformers/csrc/attention/hip_fmha/attention_forward_splitk.cpp
+++ b/xformers/csrc/attention/hip_fmha/attention_forward_splitk.cpp
@@ -183,7 +183,7 @@ at::Tensor& efficient_attention_forward_decoder_splitk_ck_out_impl(
             K_acc.stride(1),
             K_acc.stride(2),
             K_acc.stride(3),
-            O_acc.stride(2),
+            split_O_acc.stride(0),
             XQ_acc.size(1),
             XQ_acc.size(2),
             XQ_acc.size(3),
@@ -212,11 +212,10 @@ at::Tensor efficient_attention_forward_decoder_splitk_ck_impl(
     const at::Tensor& cache_K, // [B, KV_M_MAX, G, H or 1, D]
     const at::Tensor& cache_V, // [B, KV_M_MAX, H or 1, D]
     at::optional<at::Tensor> seq_kv_lens, // [B]
-    int64_t split_k,
-    double qk_scale) {
+    double qk_scale,
+    int64_t split_k) {
   auto O = at::empty_like(XQ);
   constexpr auto splitk_dim = 0;
-  // auto O_unsqueeze = at::unsqueeze(O, splitk_dim);
   auto O_splits = at::stack(O, splitk_dim);
 
   TORCH_CHECK(XQ.dim() == 5);
diff --git a/xformers/csrc/attention/hip_fmha/ck_attention_forward_decoder_splitk.h b/xformers/csrc/attention/hip_fmha/ck_attention_forward_decoder_splitk.h
index e7421c7c3..486c96ee7 100644
--- a/xformers/csrc/attention/hip_fmha/ck_attention_forward_decoder_splitk.h
+++ b/xformers/csrc/attention/hip_fmha/ck_attention_forward_decoder_splitk.h
@@ -183,7 +183,7 @@ __global__ void efficient_attention_forward_decoder_splitk_reduce_ck_kernel(
   compute_t global_sumexp = 0;
   compute_t global_max = ck::NumericLimits<compute_t>::Lowest();
 
-  for (size_t split_idx = 0; split_idx < split_k; ++split_idx) {
+  for (int32_t split_idx = 0; split_idx < split_k; ++split_idx) {
     load_v<data_t, data_vec_t>(O_splits 
                                + b * O_stride_b 
                                + m * O_stride_m 
@@ -200,15 +200,10 @@ __global__ void efficient_attention_forward_decoder_splitk_reduce_ck_kernel(
     bool pick_new = local_max < global_max;
     compute_t log_alpha = -std::abs(local_max - global_max);
     compute_t alpha = isnan(log_alpha) ? compute_t{1} : ck::math::exp(log_alpha);
-    // assert(!isnan(alpha));
-    // assert(isnan(alpha));
     compute_t pick_current_coef = (1 + (1 - pick_new) * (alpha - 1));
-    // assert(!isnan(pick_current_coef));
     compute_t pick_new_coef = (1 + pick_new * (alpha - 1));
-    // assert(!isnan(pick_new_coef));
     global_sumexp = pick_current_coef * global_sumexp + pick_new_coef * local_sumexp;
-    // global_O_compute.vec = pick_current_coef * global_O_compute.vec + pick_new_coef * O_split_compute.vec;
-    global_O_compute.vec = O_split_compute.vec;
+    global_O_compute.vec = pick_current_coef * global_O_compute.vec + pick_new_coef * O_split_compute.vec;
     global_max = new_max;
   }
   global_O_compute.vec /= global_sumexp;
@@ -397,7 +392,9 @@ __global__ void efficient_attention_forward_decoder_splitk_ck_kernel(
   max_qk_acc =
       wavefrontReduce(max_qk_acc, [](auto a, auto b) { return a > b ? a : b; });
 
-  split_max[blockIdx.x * split_k + split_idx] = max_qk_acc;
+  if (wavefront_idx == 0 && lane_idx == 0) {
+    split_max[blockIdx.x * split_k + split_idx] = max_qk_acc;
+  }
 
   // each wavefront computes partial sum of exp.
   compute_t softmax_denominator = 0.0f;
@@ -420,13 +417,16 @@ __global__ void efficient_attention_forward_decoder_splitk_ck_kernel(
   softmax_denominator = wavefrontReduce(
       softmax_denominator, [](auto a, auto b) { return a + b; });
 
-  split_sumexp[blockIdx.x * split_k + split_idx] = softmax_denominator;
+  if (wavefront_idx == 0 && lane_idx == 0) {
+    split_sumexp[blockIdx.x * split_k + split_idx] = softmax_denominator;
+  }
   // or maybe after scaling?
   
-  const compute_t softmax_scale_factor = 1. / softmax_denominator;
+  // const compute_t softmax_scale_factor = 1. / softmax_denominator;
   // now, compute the normalization across all threads.
   for (int32_t t = thread_linear_idx; t < t_max; t += threads_per_block) {
-    smem[t] = ck::math::exp(smem[t] - max_qk_acc) * softmax_scale_factor;
+    // smem[t] = ck::math::exp(smem[t] - max_qk_acc) * softmax_scale_factor;
+    smem[t] = ck::math::exp(smem[t] - max_qk_acc);
   }
   __syncthreads();
 

From 709727f7c078e8b1d6ff90b5fdbd37fcbb27e8d1 Mon Sep 17 00:00:00 2001
From: Max Podkorytov <4273004+tenpercent@users.noreply.github.com>
Date: Tue, 5 Dec 2023 20:50:10 -0500
Subject: [PATCH 284/641] fix an indexing bug

---
 .../ck_attention_forward_decoder_splitk.h     | 34 ++++++++++++++++---
 1 file changed, 29 insertions(+), 5 deletions(-)

diff --git a/xformers/csrc/attention/hip_fmha/ck_attention_forward_decoder_splitk.h b/xformers/csrc/attention/hip_fmha/ck_attention_forward_decoder_splitk.h
index 486c96ee7..a76aacfa1 100644
--- a/xformers/csrc/attention/hip_fmha/ck_attention_forward_decoder_splitk.h
+++ b/xformers/csrc/attention/hip_fmha/ck_attention_forward_decoder_splitk.h
@@ -220,7 +220,6 @@ template <
     int32_t n_loop_unroll = 16,
     int32_t n_loop_unroll_tail = 2,
     int32_t KV_M_MAX = 8192,
-    int32_t n_wavefronts_per_block = 16,
     typename compute_t = float>
 __global__ void efficient_attention_forward_decoder_splitk_ck_kernel(
     const scalar_t* __restrict__ XQ,
@@ -307,15 +306,40 @@ __global__ void efficient_attention_forward_decoder_splitk_ck_kernel(
 
   data_vec_t k_loads[n_loop_unroll] = {};
 
-  constexpr auto dtt = n_wavefronts_per_block * n_loop_unroll;
+  const auto dtt = wavefronts_per_block * n_loop_unroll;
   const auto n_unrolled_loops = t_max / dtt / split_k; // +1?
   const int32_t tt_low = wavefront_idx * n_loop_unroll + n_unrolled_loops * dtt * split_idx;
   const int32_t tt_high = wavefront_idx * n_loop_unroll + n_unrolled_loops * dtt * (split_idx + 1);
-  const int32_t dtt_tail = n_wavefronts_per_block * n_loop_unroll_tail;
-  const int32_t tt_tail_low = wavefront_idx * n_loop_unroll_tail + wavefront_idx * n_loop_unroll_tail + n_unrolled_loops * dtt * (split_idx + 1);
+  const int32_t dtt_tail = wavefronts_per_block * n_loop_unroll_tail;
+  const int32_t tt_tail_low = wavefront_idx * n_loop_unroll_tail + n_unrolled_loops * dtt * (split_idx + 1);
   const int32_t tt_tail_high = (split_idx == split_k - 1) ? t_max : tt_tail_low;
-  const int32_t t_max_unroll = (t_max / dtt) * dtt;
 
+  // if (lane_idx == 0)
+  //   printf("wavefront_idx: %d "
+  //          "t_max: %d " 
+  //          "(runtime) wavefronts_per_block: %d "
+  //          "n_loop_unroll: %d "
+  //          "n_loop_unroll_tail: %d "
+  //          "dtt: %d "
+  //          "n_unrolled_loops: %d "
+  //          "tt_low: %d "
+  //          "tt_high: %d "
+  //          "dtt_tail: %d "
+  //          "tt_tail_low: %d "
+  //          "tt_tail_high: %d "
+  //          "\n", 
+  //          wavefront_idx,
+  //          t_max, 
+  //          wavefronts_per_block,
+  //          n_loop_unroll,
+  //          n_loop_unroll_tail,
+  //          dtt,
+  //          n_unrolled_loops,
+  //          tt_low,
+  //          tt_high,
+  //          dtt_tail,
+  //          tt_tail_low,
+  //          tt_tail_high);
   for (auto tt = tt_low; tt < tt_high; tt += dtt) {
     if (lane_active_for_io) {
 #pragma unroll n_loop_unroll

From 785481c76f28719a42db0aae0239e5fec9961314 Mon Sep 17 00:00:00 2001
From: Max Podkorytov <4273004+tenpercent@users.noreply.github.com>
Date: Wed, 6 Dec 2023 13:03:28 -0500
Subject: [PATCH 285/641] stash changes

---
 xformers/ops/fmha/forward_splitk.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/xformers/ops/fmha/forward_splitk.py b/xformers/ops/fmha/forward_splitk.py
index f67fceb0c..008ce1fc7 100644
--- a/xformers/ops/fmha/forward_splitk.py
+++ b/xformers/ops/fmha/forward_splitk.py
@@ -150,7 +150,9 @@ def apply(
         print(f"{q.shape=} {k.shape=} {v.shape=}")
 
         out = cls.OPERATOR(query=q, key=k, value=v, seq_positions=seq_len, scale=qk_scale, split_k=split_k)
-
+        
+        print(f"{out.shape=}")
+        
         return out, None
 
 
From ff0ebdbf5a101e670846379e70356970baac23cb Mon Sep 17 00:00:00 2001
From: Qianfeng Zhang <Qianfeng.Zhang@amd.com>
Date: Fri, 8 Dec 2023 12:49:44 +0000
Subject: [PATCH 286/641] Add benchmark_mem_eff_attn_mqa_gqa_ck_tiled.py to
 benchmark mqa/gqa performance on ck-tiled fmha

---
 ...benchmark_mem_eff_attn_mqa_gqa_ck_tiled.py | 271 ++++++++++++++++++
 1 file changed, 271 insertions(+)
 create mode 100644 xformers/benchmarks/benchmark_mem_eff_attn_mqa_gqa_ck_tiled.py

diff --git a/xformers/benchmarks/benchmark_mem_eff_attn_mqa_gqa_ck_tiled.py b/xformers/benchmarks/benchmark_mem_eff_attn_mqa_gqa_ck_tiled.py
new file mode 100644
index 000000000..ee3326a22
--- /dev/null
+++ b/xformers/benchmarks/benchmark_mem_eff_attn_mqa_gqa_ck_tiled.py
@@ -0,0 +1,271 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
+#
+# This source code is licensed under the BSD license found in the
+# LICENSE file in the root directory of this source tree.
+
+
+import itertools
+import random
+from functools import partial
+
+import torch
+from torch.utils import benchmark
+from xformers.benchmarks.utils import benchmark_main_helper
+
+import xformers.ops
+import xformers.ops.fmha as fmha
+
+torch.backends.cuda.matmul.allow_tf32 = False
+
+
+def create_attn_bias(
+    bias_type,
+    batch_size: int,
+    num_heads: int,
+    q_len: int,
+    kv_len: int,
+    device,
+    dtype,
+    bias_requires_grad: bool = False,
+):
+    NoneType = type(None)
+    if bias_type is NoneType:
+        return None
+    if bias_type is torch.Tensor:
+        attn_bias = torch.randn((1, 1, q_len, kv_len), device=device, dtype=dtype)
+        return attn_bias.expand(batch_size, num_heads, q_len, kv_len)
+    if bias_type is fmha.attn_bias.LowerTriangularMask:
+        return bias_type()
+    assert False, f"Unsupported bias type: {bias_type}"
+
+## ref_attention is completely the same as used by test_forward_ck_tiled.py
+def ref_attention(q, k, v, attn_bias=None, drop_mask=None, p=0.0, scale=None, dtype=None):
+    if q.ndim == 4:
+        B, M, Hq, K = q.shape
+        _, N, Hkv, Kv = v.shape
+        nhead_ratio_qk = Hq // Hkv
+
+        def attn_bias_head(head: int):
+            if isinstance(attn_bias, torch.Tensor):
+                assert attn_bias.ndim == 4
+                _, H, _, _ = attn_bias.shape
+                assert H == Hq
+                bias_bghmn = attn_bias.reshape(B, Hkv, nhead_ratio_qk, M, N)
+                return bias_bghmn[:, :, head]
+            if isinstance(attn_bias, fmha.attn_bias.LowerTriangularMaskWithTensorBias):
+                assert attn_bias._bias.ndim == 4
+                _, H, _, _ = attn_bias._bias.shape
+                assert H == Hq
+                bias_bghmn = attn_bias._bias.reshape(B, Hkv, nhead_ratio_qk, M, N)
+
+                return fmha.attn_bias.LowerTriangularMaskWithTensorBias(
+                    bias_bghmn[:, :, head]
+                )
+            return attn_bias
+
+        q_bmghk = q.reshape((B, M, Hkv, nhead_ratio_qk, K))
+
+        return torch.stack(
+            [
+                ref_attention_bmhk(
+                    q_bmghk[:, :, :, h], k, v, attn_bias=attn_bias_head(h), dtype=dtype
+                )
+                for h in range(q_bmghk.shape[3])
+            ],
+            dim=3,
+        ).reshape((B, M, Hq, Kv))
+
+    assert q.ndim == 3
+    if dtype is None:
+        dtype = torch.float32
+    q = q.to(dtype=dtype)
+    k = k.to(dtype=dtype)
+    v = v.to(dtype=dtype)
+
+    scale = scale if scale is not None else (q.shape[-1] ** -0.5)
+    q = q * scale
+
+    attn = q @ k.transpose(-2, -1)
+    if attn_bias is not None:
+        if isinstance(attn_bias, xformers.ops.AttentionBias):
+            # Always create in B,H,Mq,Mk format
+            attn_bias_tensor = attn_bias.materialize(
+                (q.shape[0], 1, q.shape[1], k.shape[1]),
+                device=q.device,
+                dtype=dtype,
+            )
+        else:
+            attn_bias_tensor = attn_bias.to(dtype=dtype)
+        if attn_bias_tensor.ndim == 4:
+            assert q.shape[0] == attn_bias_tensor.shape[0] * attn_bias_tensor.shape[1]
+            attn_bias_tensor = attn_bias_tensor.reshape(
+                [-1, *attn_bias_tensor.shape[2:]]
+            )
+        attn = attn + attn_bias_tensor
+    attn = attn.softmax(-1)
+    if drop_mask is not None:
+        attn = attn * (drop_mask / (1 - p))
+    return attn @ v
+
+## ref_attention_bmhk is completely the same as used by test_forward_ck_tiled.py
+def ref_attention_bmhk(q, k, v, attn_bias, scale=None, dtype=None) -> torch.Tensor:
+    assert q.ndim == 4
+
+    def T(t):
+        return t.permute((0, 2, 1, 3)).reshape(
+            [t.shape[0] * t.shape[2], t.shape[1], t.shape[3]]
+        )
+
+    if isinstance(attn_bias, xformers.ops.AttentionBias):
+        attn_bias = attn_bias.materialize(
+            (q.shape[0], q.shape[2], q.shape[1], k.shape[1]),
+            device=q.device,
+            dtype=torch.float32,
+        ).reshape([q.shape[0] * q.shape[2], q.shape[1], k.shape[1]])
+    out = ref_attention(T(q), T(k), T(v), attn_bias, scale=scale, dtype=dtype)
+    out = out.reshape([q.shape[0], q.shape[2], q.shape[1], v.shape[3]])
+    return out.permute((0, 2, 1, 3))
+
+min_run_time = 0.5
+device = torch.device("cuda")
+
+NUM_THREADS = [1] if device.type == "cuda" else [1, 40]
+SHAPES = [
+    (1, 512, 8192, 64, 8, 128),
+    (1, 1024, 8192, 64, 8, 128),
+    (1, 2048, 8192, 64, 8, 128),
+    (1, 4096, 8192, 64, 8, 128),
+    (1, 8192, 8192, 64, 8, 128),
+    (1, 16384, 8192, 64, 8, 128),
+    (1, 1024, 8192, 64, 8, 64),
+    (1, 1024, 8192, 8, 1, 64),
+    (1, 1024, 8192, 4, 4, 64),
+    ##*sorted(itertools.product([1, 2], [2048, 4096], [2048, 4096], [4, 8], [1, 2], [128])),
+    ##*sorted(
+    ##    itertools.product([16], [128, 512], [512, 1024], [16], [2, 4], [64, 128])
+    #),
+]
+
+OPS = [
+    (xformers.ops.fmha.ck.FwOp, xformers.ops.fmha.ck.BwOp),
+    #(xformers.ops.fmha.flash.FwOp, xformers.ops.fmha.flash.BwOp),
+    # TODO: Triton is not stable: it can trigger Illegal Memory Accesses
+    # and its performance varies a lot between runs.
+    # (xformers.ops.fmha.triton.FwOp, xformers.ops.fmha.triton.BwOp),
+]
+
+
+def product_dict(**kwargs):
+    keys = kwargs.keys()
+    vals = kwargs.values()
+    for instance in itertools.product(*vals):
+        yield dict(zip(keys, instance))
+
+
+CASES = list(
+    product_dict(
+        shape=SHAPES,
+        num_threads=NUM_THREADS,
+        dropout_p=[0.0],
+        attn_bias_cfg=[(type(None), False)],
+        dtype=[torch.half],
+    )
+)
+
+# Add more cases with some variations
+for c in CASES.copy():
+    c = c.copy()
+    c.update(
+        random.Random(str(c["shape"])).choice(
+            [
+                ##{"dropout_p": 0.3},
+                {"attn_bias_cfg": (torch.Tensor, False)},
+                ##{"attn_bias_cfg": (torch.Tensor, True)},
+                {"attn_bias_cfg": (xformers.ops.LowerTriangularMask, False)},
+                ##{"dtype": torch.bfloat16},
+                ##{"dtype": torch.float},
+            ]
+        )
+    )
+    CASES.append(c)
+
+
+def create_tensors(shape, dtype, requires_grad=False):
+    B, M, N, Hq, Hkv, K = shape
+    q = torch.rand([B, M, Hq, K], device=device, dtype=dtype, requires_grad=requires_grad)
+    k = torch.rand([B, N, Hkv, K], device=device, dtype=dtype, requires_grad=requires_grad)
+    v = torch.rand([B, N, Hkv, K], device=device, dtype=dtype, requires_grad=requires_grad)
+    return q, k, v
+
+def mem_eff_attention_fw(shape, num_threads: int, attn_bias_cfg, dropout_p, dtype):
+    B, M, N, Hq, Hkv, K = shape
+    q, k, v = create_tensors(shape, dtype)
+    attn_bias_type, attn_bias_requires_grad = attn_bias_cfg
+    if attn_bias_requires_grad:
+        return
+    bias = create_attn_bias(
+        attn_bias_type,
+        batch_size=B,
+        num_heads=Hq,
+        q_len=M,
+        kv_len=N,
+        device=device,
+        dtype=dtype,
+        bias_requires_grad=attn_bias_requires_grad,
+    )
+    inp = fmha.Inputs(query=q, key=k, value=v, attn_bias=bias, p=dropout_p)
+
+    dtype_str = {
+        torch.bfloat16: "b16",
+        torch.half: "f16",
+        torch.float: "f32",
+    }[dtype]
+    sub_label = (
+        f"{dtype_str} {B}-{M}-{Hq}-{Hkv}-{K}, p={dropout_p}, "
+        f"BiasT={attn_bias_type.__name__}"
+    )
+
+    has_run = False
+    for fw_op, bw_op in OPS:
+        if not fw_op.supports(inp):
+            continue
+
+        yield benchmark.Timer(
+            stmt="fn(q, k, v, attn_bias, p)",
+            globals={
+                "q": q,
+                "k": k,
+                "v": v,
+                "attn_bias": inp.attn_bias,
+                "p": dropout_p,
+                "fn": partial(
+                    xformers.ops.memory_efficient_attention, op=(fw_op, bw_op)
+                ),
+            },
+            label=f"attention (attn_bias={attn_bias_type})",
+            description=fw_op.NAME,
+            sub_label=sub_label,
+            num_threads=num_threads,
+        )
+        has_run = True
+
+    if not has_run:
+        return
+
+    yield benchmark.Timer(
+        stmt="fn(q, k, v, attn_bias, p)",
+        globals={
+            "q": q,
+            "k": k,
+            "v": v,
+            "attn_bias": inp.attn_bias,
+            "p": dropout_p,
+            "fn": ref_attention,
+        },
+        label=f"attention (attn_bias={attn_bias_type})",
+        description="eager",
+        sub_label=sub_label,
+        num_threads=num_threads,
+    )
+
+benchmark_main_helper(mem_eff_attention_fw, CASES, min_run_time=min_run_time)

From 9a8baf7baf0e65ef5b8622daf4bc96fe99eb7ee1 Mon Sep 17 00:00:00 2001
From: Qianfeng Zhang <Qianfeng.Zhang@amd.com>
Date: Fri, 8 Dec 2023 17:22:27 +0000
Subject: [PATCH 287/641] Synchronize with latest update in
 composable_kernel_tiled feature/fmha-pad-support branch

---
 third_party/composable_kernel_tiled           |   2 +-
 .../hip_fmha/ck_tiled_fmha_batched_infer.h    |  92 +++-----
 .../hip_fmha/ck_tiled_fmha_forward_kernel.h   | 205 ++++++------------
 .../ck_tiled_fmha_fwd_tile_partitioner.h      |   8 +-
 .../hip_fmha/ck_tiled_fmha_grouped_infer.h    |  82 +++----
 5 files changed, 124 insertions(+), 265 deletions(-)

diff --git a/third_party/composable_kernel_tiled b/third_party/composable_kernel_tiled
index ddce91a44..e36287d5d 160000
--- a/third_party/composable_kernel_tiled
+++ b/third_party/composable_kernel_tiled
@@ -1 +1 @@
-Subproject commit ddce91a44b2da6eb74e7e3d7bf14b54930719983
+Subproject commit e36287d5dd83b01cec46c915e4fea9fc3d1c484f
diff --git a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_infer.h b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_infer.h
index 3003fa404..193e0989f 100644
--- a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_infer.h
+++ b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_infer.h
@@ -157,74 +157,38 @@ struct batched_infer_masktype_attnbias_dispatched
     static void RunWithKernel(BatchedForwardParams& param, hipStream_t stream)
     {
         const auto kargs = [&] {
-            if constexpr(FmhaKernel::kSupportsBias)
-            {
-                std::optional<std::tuple<const void*, ck::index_t, ck::index_t, ck::index_t>> bias;
-
-                bias = std::make_tuple(param.attn_bias_ptr,
-                                       param.attn_bias_strides[2],
-                                       param.attn_bias_strides[1],
-                                       param.attn_bias_strides[0]);
-
-                return FmhaKernel::MakeKargs(
-                    param.q_ptr,
-                    param.k_ptr,
-                    param.v_ptr,
-                    param.out_ptr,
-                    param.M,              // seqlen_q
-                    param.N,              // seqlen_k
-                    param.K,              // hdim_q
-                    param.Kv,             // hdim_v
-                    param.Hq / param.Hkv, // nhead_ratio_qk
-                    param.scale,
-                    param.q_strides[1], // q, k, v, out tensor seq-dim stride
-                    param.k_strides[1],
-                    param.v_strides[1],
-                    param.out_strides[1],
-                    param.q_strides[2], // q, k, v, out tensor head-dim stride
-                    param.k_strides[2],
-                    param.v_strides[2],
-                    param.out_strides[2],
-                    param.q_strides[0], // q, k, v, out tensor batch-dim stride
-                    param.k_strides[0],
-                    param.v_strides[0],
-                    param.out_strides[0],
-                    bias);
-            }
-            else
-            {
-                return FmhaKernel::MakeKargs(
-                    param.q_ptr,
-                    param.k_ptr,
-                    param.v_ptr,
-                    param.out_ptr,
-                    param.M,              // seqlen_q
-                    param.N,              // seqlen_k
-                    param.K,              // hdim_q
-                    param.Kv,             // hdim_v
-                    param.Hq / param.Hkv, // nhead_ratio_qk
-                    param.scale,
-                    param.q_strides[1], // q, k, v, out tensor seq-dim stride
-                    param.k_strides[1],
-                    param.v_strides[1],
-                    param.out_strides[1],
-                    param.q_strides[2], // q, k, v, out tensor head-dim stride
-                    param.k_strides[2],
-                    param.v_strides[2],
-                    param.out_strides[2],
-                    param.q_strides[0], // q, k, v, out tensor batch-dim stride
-                    param.k_strides[0],
-                    param.v_strides[0],
-                    param.out_strides[0]);
-            };
+            return FmhaKernel::MakeKargs(
+                param.q_ptr,
+                param.k_ptr,
+                param.v_ptr,
+                param.attn_bias_ptr,
+                param.out_ptr,
+                param.M,              // seqlen_q
+                param.N,              // seqlen_k
+                param.K,              // hdim_q
+                param.Kv,             // hdim_v
+                param.Hq / param.Hkv, // nhead_ratio_qk
+                param.scale,
+                param.q_strides[1], // q, k, v, bias, out tensor seq-dim stride
+                param.k_strides[1],
+                param.v_strides[1],
+                param.attn_bias_strides[2],
+                param.out_strides[1],
+                param.q_strides[2], // q, k, v, bias, out tensor head-dim stride
+                param.k_strides[2],
+                param.v_strides[2],
+                param.attn_bias_strides[1],
+                param.out_strides[2],
+                param.q_strides[0], // q, k, v, bias, out tensor batch-dim stride
+                param.k_strides[0],
+                param.v_strides[0],
+                param.attn_bias_strides[0],
+                param.out_strides[0]);
         }();
 
         dim3 kGridSize            = FmhaKernel::GridSize(param.B, param.Hq, param.M, param.Kv);
         constexpr dim3 kBlockSize = FmhaKernel::BlockSize();
-
-        constexpr ck::index_t kWarpPerCu    = 8; // 2 warps per SIMD
-        constexpr ck::index_t kWarpPerBlock = kBlockSize.x / warpSize;
-        constexpr ck::index_t kBlockPerCu   = kWarpPerCu / kWarpPerBlock;
+        constexpr ck::index_t kBlockPerCu = FmhaKernel::kBlockPerCu;
 
         (void)launch_kernel<kBlockSize.x, kBlockPerCu>(
             StreamConfig{stream, false}, FmhaKernel{}, kGridSize, kBlockSize, 0, kargs);
diff --git a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_forward_kernel.h b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_forward_kernel.h
index 534c2c588..288629a79 100644
--- a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_forward_kernel.h
+++ b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_forward_kernel.h
@@ -6,7 +6,6 @@
  */
 #pragma once
 
-#include <optional>
 #include <type_traits>
 
 #include "ck/utility/common_header.hpp"
@@ -24,10 +23,11 @@
 template <typename TilePartitioner_, typename FmhaPipeline_, typename EpiloguePipeline_>
 struct FmhaFwdKernel
 {
-    using TilePartitioner                   = ck::remove_cvref_t<TilePartitioner_>;
-    using FmhaPipeline                      = ck::remove_cvref_t<FmhaPipeline_>;
-    using EpiloguePipeline                  = ck::remove_cvref_t<EpiloguePipeline_>;
-    static constexpr ck::index_t kBlockSize = FmhaPipeline::kBlockSize;
+    using TilePartitioner                    = ck::remove_cvref_t<TilePartitioner_>;
+    using FmhaPipeline                       = ck::remove_cvref_t<FmhaPipeline_>;
+    using EpiloguePipeline                   = ck::remove_cvref_t<EpiloguePipeline_>;
+    static constexpr ck::index_t kBlockSize  = FmhaPipeline::kBlockSize;
+    static constexpr ck::index_t kBlockPerCu = FmhaPipeline::kBlockPerCu;
 
     using QDataType    = ck::remove_cvref_t<typename FmhaPipeline::QDataType>;
     using KDataType    = ck::remove_cvref_t<typename FmhaPipeline::KDataType>;
@@ -40,7 +40,7 @@ struct FmhaFwdKernel
     static constexpr bool kIsGroupMode     = FmhaPipeline::kIsGroupMode;
     static constexpr bool kM0NeedPadding   = FmhaPipeline::kM0NeedPadding;
     static constexpr bool kN0K1NeedPadding = FmhaPipeline::kN0K1NeedPadding;
-    static constexpr bool kSupportsBias    = FmhaPipeline::kSupportsBias;
+    static constexpr bool kHasBias         = FmhaPipeline::kHasBias;
 
     using C0MatrixMask = ck::tile_program::block::C0MatrixMask_impl<
         ck::remove_cvref_t<typename FmhaPipeline::BlockFmhaMask>>;
@@ -79,7 +79,11 @@ struct FmhaFwdKernel
               hdim_q{hdim_q_},
               hdim_v{hdim_v_},
               nhead_ratio_qk{nhead_ratio_qk_},
+#if CK_FMHA_FWD_FAST_EXP2
+              scale{static_cast<float>(scale_ * C_LOG2E)},
+#else
               scale{scale_},
+#endif
               stride_q{stride_q_},
               stride_k{stride_k_},
               stride_v{stride_v_},
@@ -100,8 +104,10 @@ struct FmhaFwdKernel
         ck::index_t seqlen_k;
         ck::index_t hdim_q;
         ck::index_t hdim_v;
-        ck::index_t nhead_ratio_qk;
 
+        // for MQA/GQA, nhead could be different. This parameter is nhead_q / nhead_k
+        // if this param is larger than 1, indicate MQA/GQA case
+        ck::index_t nhead_ratio_qk;
         float scale;
 
         ck::index_t stride_q;
@@ -128,7 +134,7 @@ struct FmhaFwdKernel
     };
 
     struct BatchModeKargs : CommonKargs,
-                            std::conditional_t<kSupportsBias, BatchModeBiasKargs, EmptyKargs>
+                            std::conditional_t<kHasBias, BatchModeBiasKargs, EmptyKargs>
     {
         __host__ constexpr BatchModeKargs(const void* q_ptr_,
                                           const void* k_ptr_,
@@ -183,8 +189,7 @@ struct FmhaFwdKernel
         ck::index_t batch_stride_o;
     };
 
-    struct GroupModeKargs : CommonKargs,
-                            std::conditional_t<kSupportsBias, CommonBiasKargs, EmptyKargs>
+    struct GroupModeKargs : CommonKargs, std::conditional_t<kHasBias, CommonBiasKargs, EmptyKargs>
     {
         __host__ constexpr GroupModeKargs(const void* q_ptr_,
                                           const void* k_ptr_,
@@ -237,10 +242,11 @@ struct FmhaFwdKernel
     public:
     using Kargs = std::conditional_t<kIsGroupMode, GroupModeKargs, BatchModeKargs>;
 
-    template <bool Cond = !kIsGroupMode && !kSupportsBias>
+    template <bool Cond = !kIsGroupMode>
     __host__ static constexpr std::enable_if_t<Cond, Kargs> MakeKargs(const void* q_ptr,
                                                                       const void* k_ptr,
                                                                       const void* v_ptr,
+                                                                      const void* bias_ptr,
                                                                       void* o_ptr,
                                                                       ck::index_t seqlen_q,
                                                                       ck::index_t seqlen_k,
@@ -251,49 +257,18 @@ struct FmhaFwdKernel
                                                                       ck::index_t stride_q,
                                                                       ck::index_t stride_k,
                                                                       ck::index_t stride_v,
+                                                                      ck::index_t stride_bias,
                                                                       ck::index_t stride_o,
                                                                       ck::index_t nhead_stride_q,
                                                                       ck::index_t nhead_stride_k,
                                                                       ck::index_t nhead_stride_v,
+                                                                      ck::index_t nhead_stride_bias,
                                                                       ck::index_t nhead_stride_o,
                                                                       ck::index_t batch_stride_q,
                                                                       ck::index_t batch_stride_k,
                                                                       ck::index_t batch_stride_v,
+                                                                      ck::index_t batch_stride_bias,
                                                                       ck::index_t batch_stride_o)
-    {
-        return Kargs{q_ptr,          k_ptr,          v_ptr,          o_ptr,          seqlen_q,
-                     seqlen_k,       hdim_q,         hdim_v,         nhead_ratio_qk, scale,
-                     stride_q,       stride_k,       stride_v,       stride_o,       nhead_stride_q,
-                     nhead_stride_k, nhead_stride_v, nhead_stride_o, batch_stride_q, batch_stride_k,
-                     batch_stride_v, batch_stride_o};
-    }
-
-    template <bool Cond = !kIsGroupMode && kSupportsBias>
-    __host__ static constexpr std::enable_if_t<Cond, Kargs>
-    MakeKargs(const void* q_ptr,
-              const void* k_ptr,
-              const void* v_ptr,
-              void* o_ptr,
-              ck::index_t seqlen_q,
-              ck::index_t seqlen_k,
-              ck::index_t hdim_q,
-              ck::index_t hdim_v,
-              ck::index_t nhead_ratio_qk,
-              float scale,
-              ck::index_t stride_q,
-              ck::index_t stride_k,
-              ck::index_t stride_v,
-              ck::index_t stride_o,
-              ck::index_t nhead_stride_q,
-              ck::index_t nhead_stride_k,
-              ck::index_t nhead_stride_v,
-              ck::index_t nhead_stride_o,
-              ck::index_t batch_stride_q,
-              ck::index_t batch_stride_k,
-              ck::index_t batch_stride_v,
-              ck::index_t batch_stride_o,
-              std::optional<std::tuple<const void*, ck::index_t, ck::index_t, ck::index_t>> bias =
-                  std::nullopt)
     {
         Kargs kargs{q_ptr,          k_ptr,          v_ptr,          o_ptr,          seqlen_q,
                     seqlen_k,       hdim_q,         hdim_v,         nhead_ratio_qk, scale,
@@ -301,21 +276,22 @@ struct FmhaFwdKernel
                     nhead_stride_k, nhead_stride_v, nhead_stride_o, batch_stride_q, batch_stride_k,
                     batch_stride_v, batch_stride_o};
 
-        if(bias.has_value())
+        if constexpr(kHasBias)
         {
-            kargs.bias_ptr          = reinterpret_cast<const BiasDataType*>(std::get<0>(*bias));
-            kargs.stride_bias       = std::get<1>(*bias);
-            kargs.nhead_stride_bias = std::get<2>(*bias);
-            kargs.batch_stride_bias = std::get<3>(*bias);
+            kargs.bias_ptr          = reinterpret_cast<const BiasDataType*>(bias_ptr);
+            kargs.stride_bias       = stride_bias;
+            kargs.nhead_stride_bias = nhead_stride_bias;
+            kargs.batch_stride_bias = batch_stride_bias;
         }
 
         return kargs;
     }
 
-    template <bool Cond = kIsGroupMode && !kSupportsBias>
+    template <bool Cond = kIsGroupMode>
     __host__ static constexpr std::enable_if_t<Cond, Kargs> MakeKargs(const void* q_ptr,
                                                                       const void* k_ptr,
                                                                       const void* v_ptr,
+                                                                      const void* bias_ptr,
                                                                       void* o_ptr,
                                                                       const void* seqstart_q_ptr,
                                                                       const void* seqstart_k_ptr,
@@ -327,55 +303,13 @@ struct FmhaFwdKernel
                                                                       ck::index_t stride_q,
                                                                       ck::index_t stride_k,
                                                                       ck::index_t stride_v,
+                                                                      ck::index_t stride_bias,
                                                                       ck::index_t stride_o,
                                                                       ck::index_t nhead_stride_q,
                                                                       ck::index_t nhead_stride_k,
                                                                       ck::index_t nhead_stride_v,
+                                                                      ck::index_t nhead_stride_bias,
                                                                       ck::index_t nhead_stride_o)
-    {
-        return Kargs{q_ptr,
-                     k_ptr,
-                     v_ptr,
-                     o_ptr,
-                     seqstart_q_ptr,
-                     seqstart_k_ptr,
-                     seqlen_k_ptr,
-                     hdim_q,
-                     hdim_v,
-                     nhead_ratio_qk,
-                     scale,
-                     stride_q,
-                     stride_k,
-                     stride_v,
-                     stride_o,
-                     nhead_stride_q,
-                     nhead_stride_k,
-                     nhead_stride_v,
-                     nhead_stride_o};
-    }
-
-    template <bool Cond = kIsGroupMode&& kSupportsBias>
-    __host__ static constexpr std::enable_if_t<Cond, Kargs>
-    MakeKargs(const void* q_ptr,
-              const void* k_ptr,
-              const void* v_ptr,
-              void* o_ptr,
-              const void* seqstart_q_ptr,
-              const void* seqstart_k_ptr,
-              const void* seqlen_k_ptr,
-              ck::index_t hdim_q,
-              ck::index_t hdim_v,
-              ck::index_t nhead_ratio_qk,
-              float scale,
-              ck::index_t stride_q,
-              ck::index_t stride_k,
-              ck::index_t stride_v,
-              ck::index_t stride_o,
-              ck::index_t nhead_stride_q,
-              ck::index_t nhead_stride_k,
-              ck::index_t nhead_stride_v,
-              ck::index_t nhead_stride_o,
-              std::optional<std::tuple<const void*, ck::index_t, ck::index_t>> bias = std::nullopt)
     {
         Kargs kargs{q_ptr,
                     k_ptr,
@@ -397,11 +331,11 @@ struct FmhaFwdKernel
                     nhead_stride_v,
                     nhead_stride_o};
 
-        if(bias.has_value())
+        if constexpr(kHasBias)
         {
-            kargs.bias_ptr          = reinterpret_cast<const BiasDataType*>(std::get<0>(*bias));
-            kargs.stride_bias       = std::get<1>(*bias);
-            kargs.nhead_stride_bias = std::get<2>(*bias);
+            kargs.bias_ptr          = reinterpret_cast<const BiasDataType*>(bias_ptr);
+            kargs.stride_bias       = stride_bias;
+            kargs.nhead_stride_bias = nhead_stride_bias;
         }
 
         return kargs;
@@ -447,9 +381,8 @@ struct FmhaFwdKernel
         if constexpr(kIsGroupMode)
         {
             // get starting offset for each batch
-            const long_index_t query_start =
-                static_cast<long_index_t>(kargs.seqstart_q_ptr[i_batch]);
-            const long_index_t key_start = static_cast<long_index_t>(kargs.seqstart_k_ptr[i_batch]);
+            const long_index_t query_start = kargs.seqstart_q_ptr[i_batch];
+            const long_index_t key_start   = kargs.seqstart_k_ptr[i_batch];
 
             batch_offset_q = query_start * kargs.stride_q;
             batch_offset_k = key_start * kargs.stride_k;
@@ -461,7 +394,7 @@ struct FmhaFwdKernel
             {
                 batch_offset_v = key_start;
             }
-            if constexpr(kSupportsBias)
+            if constexpr(kHasBias)
             {
                 batch_offset_bias = query_start * kargs.stride_bias + key_start;
             }
@@ -475,6 +408,13 @@ struct FmhaFwdKernel
             const auto adjusted_seqstart_q_ptr = kargs.seqstart_q_ptr + i_batch;
             kargs.seqlen_q = adjusted_seqstart_q_ptr[1] - adjusted_seqstart_q_ptr[0];
 
+            // # of required blocks is different in each groups, terminate unnecessary blocks
+            // earlier
+            if(kargs.seqlen_q <= i_m0)
+            {
+                return;
+            }
+
             if(kargs.seqlen_k_ptr != nullptr)
             {
                 kargs.seqlen_k = kargs.seqlen_k_ptr[i_batch];
@@ -484,16 +424,13 @@ struct FmhaFwdKernel
                 const auto adjusted_seqstart_k_ptr = kargs.seqstart_k_ptr + i_batch;
                 kargs.seqlen_k = adjusted_seqstart_k_ptr[1] - adjusted_seqstart_k_ptr[0];
             }
-
-            if(i_m0 >= kargs.seqlen_q)
-                return;
         }
         else
         {
             batch_offset_q = static_cast<long_index_t>(i_batch) * kargs.batch_stride_q;
             batch_offset_k = static_cast<long_index_t>(i_batch) * kargs.batch_stride_k;
             batch_offset_v = static_cast<long_index_t>(i_batch) * kargs.batch_stride_v;
-            if constexpr(kSupportsBias)
+            if constexpr(kHasBias)
             {
                 batch_offset_bias = static_cast<long_index_t>(i_batch) * kargs.batch_stride_bias;
             }
@@ -635,39 +572,29 @@ struct FmhaFwdKernel
             constexpr auto bias_dram_window_lengths =
                 make_tuple(Number<FmhaPipeline::kM0>{}, Number<FmhaPipeline::kN0>{});
 
-            if constexpr(kSupportsBias)
+            if constexpr(kHasBias)
             {
-                if(kargs.bias_ptr != nullptr)
-                {
-                    const BiasDataType* bias_ptr =
-                        kargs.bias_ptr + i_nhead_ * kargs.nhead_stride_bias + batch_offset_bias;
-
-                    const auto bias_dram = [&]() {
-                        const auto bias_dram_naive =
-                            make_naive_tensor_view<AddressSpaceEnum::Global>(
-                                bias_ptr,
-                                make_tuple(kargs.seqlen_q, kargs.seqlen_k),
-                                make_tuple(kargs.stride_bias, 1),
-                                Number<32>{},
-                                Number<1>{});
-
-                        return pad_tensor_view(bias_dram_naive,
-                                               bias_dram_window_lengths,
-                                               Sequence<kM0NeedPadding, kN0K1NeedPadding>{});
-                    }();
-
-                    const auto bias_dram_window =
-                        make_tile_window(bias_dram, bias_dram_window_lengths, {i_m0, 0});
-
-                    return run_pipeline_with(bias_dram_window);
-                }
-                else
-                {
-                    const auto dummy_bias_dram_window =
-                        make_null_tile_window(bias_dram_window_lengths);
-
-                    return run_pipeline_with(dummy_bias_dram_window);
-                }
+                const BiasDataType* bias_ptr =
+                    kargs.bias_ptr + static_cast<long_index_t>(i_nhead_) * kargs.nhead_stride_bias +
+                    batch_offset_bias;
+
+                const auto bias_dram = [&]() {
+                    const auto bias_dram_naive = make_naive_tensor_view<AddressSpaceEnum::Global>(
+                        bias_ptr,
+                        make_tuple(kargs.seqlen_q, kargs.seqlen_k),
+                        make_tuple(kargs.stride_bias, 1),
+                        Number<32>{},
+                        Number<1>{});
+
+                    return pad_tensor_view(bias_dram_naive,
+                                           bias_dram_window_lengths,
+                                           Sequence<kM0NeedPadding, kN0K1NeedPadding>{});
+                }();
+
+                const auto bias_dram_window =
+                    make_tile_window(bias_dram, bias_dram_window_lengths, {i_m0, 0});
+
+                return run_pipeline_with(bias_dram_window);
             }
             else
             {
diff --git a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_fwd_tile_partitioner.h b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_fwd_tile_partitioner.h
index 7a3ab882f..ee385408c 100644
--- a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_fwd_tile_partitioner.h
+++ b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_fwd_tile_partitioner.h
@@ -29,8 +29,8 @@ struct FmhaFwdTilePartitioner
         // TODO: this may need tuning
         return dim3(ck::math::integer_divide_ceil(seqlen_q_, kM0) *
                         ck::math::integer_divide_ceil(hdim_v_, kN1),
-                    batch_size_,
-                    nhead_);
+                    nhead_,
+                    batch_size_);
     }
 
     __device__ auto operator()(ck::index_t /*seqlen_q*/, ck::index_t hdim_v)
@@ -41,8 +41,8 @@ struct FmhaFwdTilePartitioner
         const index_t num_tile_n1 = hdim_v / kN1;
 
         const index_t i_block = blockIdx.x;
-        const index_t i_batch = blockIdx.y;
-        const index_t i_nhead = blockIdx.z;
+        const index_t i_nhead = blockIdx.y;
+        const index_t i_batch = blockIdx.z;
 
         const auto f = [](index_t dividend, index_t divisor) {
             index_t quotient = dividend / divisor;
diff --git a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_infer.h b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_infer.h
index abd0b9fc6..20bc13130 100644
--- a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_infer.h
+++ b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_infer.h
@@ -128,67 +128,35 @@ struct grouped_infer_masktype_attnbias_dispatched
     static void RunWithKernel(GroupedForwardParams& param, hipStream_t stream)
     {
         const auto kargs = [&] {
-            if constexpr(FmhaKernel::kSupportsBias)
-            {
-                std::optional<std::tuple<const void*, ck::index_t, ck::index_t>> bias;
-
-                bias = std::make_tuple(
-                    param.attn_bias_ptr, param.attn_bias_strides[2], param.attn_bias_strides[1]);
-
-                return FmhaKernel::MakeKargs(
-                    param.q_ptr,
-                    param.k_ptr,
-                    param.v_ptr,
-                    param.out_ptr,
-                    param.seqstart_q_dev_ptr,
-                    param.seqstart_k_dev_ptr,
-                    param.seqlen_k_dev_ptr,
-                    param.K,              // hdim_q
-                    param.Kv,             // hdim_v
-                    param.Hq / param.Hkv, // nhead_ratio_qk
-                    param.scale,
-                    param.q_strides[0], // q, k, v, out tensor seq-dim stride
-                    param.k_strides[0],
-                    param.v_strides[0],
-                    param.out_strides[0],
-                    param.q_strides[1], // q, k, v, out tensor head-dim stride
-                    param.k_strides[1],
-                    param.v_strides[1],
-                    param.out_strides[1],
-                    bias);
-            }
-            else
-            {
-                return FmhaKernel::MakeKargs(
-                    param.q_ptr,
-                    param.k_ptr,
-                    param.v_ptr,
-                    param.out_ptr,
-                    param.seqstart_q_dev_ptr,
-                    param.seqstart_k_dev_ptr,
-                    param.seqlen_k_dev_ptr,
-                    param.K,              // hdim_q
-                    param.Kv,             // hdim_v
-                    param.Hq / param.Hkv, // nhead_ratio_qk
-                    param.scale,
-                    param.q_strides[0], // q, k, v, out tensor seq-dim stride
-                    param.k_strides[0],
-                    param.v_strides[0],
-                    param.out_strides[0],
-                    param.q_strides[1], // q, k, v, out tensor head-dim stride
-                    param.k_strides[1],
-                    param.v_strides[1],
-                    param.out_strides[1]);
-            };
+            return FmhaKernel::MakeKargs(
+                param.q_ptr,
+                param.k_ptr,
+                param.v_ptr,
+                param.attn_bias_ptr,
+                param.out_ptr,
+                param.seqstart_q_dev_ptr,
+                param.seqstart_k_dev_ptr,
+                param.seqlen_k_dev_ptr,
+                param.K,              // hdim_q
+                param.Kv,             // hdim_v
+                param.Hq / param.Hkv, // nhead_ratio_qk
+                param.scale,
+                param.q_strides[0], // q, k, v, bias, out tensor seq-dim stride
+                param.k_strides[0],
+                param.v_strides[0],
+                param.attn_bias_strides[2],
+                param.out_strides[0],
+                param.q_strides[1], // q, k, v, bias, out tensor head-dim stride
+                param.k_strides[1],
+                param.v_strides[1],
+                param.attn_bias_strides[1],
+                param.out_strides[1]);
         }();
 
         dim3 kGridSize =
             FmhaKernel::GridSize(param.num_batches, param.Hq, param.max_seqlen_q, param.Kv);
-        constexpr dim3 kBlockSize = FmhaKernel::BlockSize();
-
-        constexpr ck::index_t kWarpPerCu    = 8; // 2 warps per SIMD
-        constexpr ck::index_t kWarpPerBlock = kBlockSize.x / warpSize;
-        constexpr ck::index_t kBlockPerCu   = kWarpPerCu / kWarpPerBlock;
+        constexpr dim3 kBlockSize         = FmhaKernel::BlockSize();
+        constexpr ck::index_t kBlockPerCu = FmhaKernel::kBlockPerCu;
 
         (void)launch_kernel<kBlockSize.x, kBlockPerCu>(
             StreamConfig{stream, false}, FmhaKernel{}, kGridSize, kBlockSize, 0, kargs);

From 959ae7f71c9d29b1aa18d3ddd8a1d99dad92c4cd Mon Sep 17 00:00:00 2001
From: Qianfeng Zhang <Qianfeng.Zhang@amd.com>
Date: Fri, 8 Dec 2023 22:08:50 +0000
Subject: [PATCH 288/641] Tiny fix in
 benchmark_mem_eff_attn_mqa_gqa_ck_tiled.py

---
 xformers/benchmarks/benchmark_mem_eff_attn_mqa_gqa_ck_tiled.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/xformers/benchmarks/benchmark_mem_eff_attn_mqa_gqa_ck_tiled.py b/xformers/benchmarks/benchmark_mem_eff_attn_mqa_gqa_ck_tiled.py
index ee3326a22..9984644bb 100644
--- a/xformers/benchmarks/benchmark_mem_eff_attn_mqa_gqa_ck_tiled.py
+++ b/xformers/benchmarks/benchmark_mem_eff_attn_mqa_gqa_ck_tiled.py
@@ -221,7 +221,7 @@ def mem_eff_attention_fw(shape, num_threads: int, attn_bias_cfg, dropout_p, dtyp
         torch.float: "f32",
     }[dtype]
     sub_label = (
-        f"{dtype_str} {B}-{M}-{Hq}-{Hkv}-{K}, p={dropout_p}, "
+        f"{dtype_str} {B}-{M}-{N}-{Hq}-{Hkv}-{K}, p={dropout_p}, "
         f"BiasT={attn_bias_type.__name__}"
     )
 

From cc2f487d64c35936e18cfa4234a016c81a376ed7 Mon Sep 17 00:00:00 2001
From: Qianfeng Zhang <Qianfeng.Zhang@amd.com>
Date: Mon, 11 Dec 2023 15:26:08 +0000
Subject: [PATCH 289/641] Synchronize with latest update in
 composable_kernel_tiled and make all unit_tests passed

---
 third_party/composable_kernel_tiled | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/third_party/composable_kernel_tiled b/third_party/composable_kernel_tiled
index e36287d5d..60795e0c1 160000
--- a/third_party/composable_kernel_tiled
+++ b/third_party/composable_kernel_tiled
@@ -1 +1 @@
-Subproject commit e36287d5dd83b01cec46c915e4fea9fc3d1c484f
+Subproject commit 60795e0c1a9f08a9b1d479dda69faa9034b863ae

From 2162b45ae34b60f5bb305bfa9148fbe34d7302b3 Mon Sep 17 00:00:00 2001
From: Qianfeng Zhang <Qianfeng.Zhang@amd.com>
Date: Mon, 11 Dec 2023 16:26:30 +0000
Subject: [PATCH 290/641] Swith to new branch for composable_kernel_tiled
 submodule

---
 .gitmodules                         | 2 +-
 third_party/composable_kernel_tiled | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/.gitmodules b/.gitmodules
index bf2678053..0e8e306fe 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -11,4 +11,4 @@
 [submodule "third_party/composable_kernel_tiled"]
 	path = third_party/composable_kernel_tiled
 	url = https://github.com/asroy/ck_tile
-        branch = feature/fmha-pad-support
+        branch = fmha_attemp_async_copy_unify 
diff --git a/third_party/composable_kernel_tiled b/third_party/composable_kernel_tiled
index 60795e0c1..c1814f90e 160000
--- a/third_party/composable_kernel_tiled
+++ b/third_party/composable_kernel_tiled
@@ -1 +1 @@
-Subproject commit 60795e0c1a9f08a9b1d479dda69faa9034b863ae
+Subproject commit c1814f90e2dd5b0659c6e1ed577fb1bba596c126

From d6cf5451dd5c387750fc8d58ac1c41c08f0fdb02 Mon Sep 17 00:00:00 2001
From: Qianfeng Zhang <Qianfeng.Zhang@amd.com>
Date: Mon, 11 Dec 2023 16:27:15 +0000
Subject: [PATCH 291/641] Add bfp16 instances for ck-tiled inference

---
 .../attention_forward_generic_ck_tiled.cpp    | 12 ++---
 .../ck_tiled_fmha_batched_infer_bp16.cpp      | 53 +++++++++++++++++++
 .../ck_tiled_fmha_grouped_infer_bp16.cpp      | 53 +++++++++++++++++++
 ...ched_infer_bp16_masktype_0_no_attnbias.cpp | 13 +++++
 ...ed_infer_bp16_masktype_0_with_attnbias.cpp | 13 +++++
 ...ched_infer_bp16_masktype_1_no_attnbias.cpp | 13 +++++
 ...ed_infer_bp16_masktype_1_with_attnbias.cpp | 13 +++++
 ...ched_infer_bp16_masktype_2_no_attnbias.cpp | 13 +++++
 ...ed_infer_bp16_masktype_2_with_attnbias.cpp | 13 +++++
 ...uped_infer_bp16_masktype_0_no_attnbias.cpp | 13 +++++
 ...ed_infer_bp16_masktype_0_with_attnbias.cpp | 13 +++++
 ...uped_infer_bp16_masktype_1_no_attnbias.cpp | 13 +++++
 ...ed_infer_bp16_masktype_1_with_attnbias.cpp | 13 +++++
 ...uped_infer_bp16_masktype_2_no_attnbias.cpp | 13 +++++
 ...ed_infer_bp16_masktype_2_with_attnbias.cpp | 13 +++++
 15 files changed, 266 insertions(+), 8 deletions(-)
 create mode 100644 xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_infer_bp16.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_infer_bp16.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_bp16_masktype_0_no_attnbias.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_bp16_masktype_0_with_attnbias.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_bp16_masktype_1_no_attnbias.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_bp16_masktype_1_with_attnbias.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_bp16_masktype_2_no_attnbias.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_bp16_masktype_2_with_attnbias.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_bp16_masktype_0_no_attnbias.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_bp16_masktype_0_with_attnbias.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_bp16_masktype_1_no_attnbias.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_bp16_masktype_1_with_attnbias.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_bp16_masktype_2_no_attnbias.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_bp16_masktype_2_with_attnbias.cpp

diff --git a/xformers/csrc/attention/hip_fmha/attention_forward_generic_ck_tiled.cpp b/xformers/csrc/attention/hip_fmha/attention_forward_generic_ck_tiled.cpp
index 922f82909..dbaecf40f 100644
--- a/xformers/csrc/attention/hip_fmha/attention_forward_generic_ck_tiled.cpp
+++ b/xformers/csrc/attention/hip_fmha/attention_forward_generic_ck_tiled.cpp
@@ -37,11 +37,9 @@ extern void grouped_forward_bp16(
 */
 
 extern void batched_infer_fp16(BatchedForwardParams& param, hipStream_t stream);
-// extern void batched_infer_bp16(BatchedForwardParams& param, hipStream_t
-// stream);
+extern void batched_infer_bp16(BatchedForwardParams& param, hipStream_t stream);
 extern void grouped_infer_fp16(GroupedForwardParams& param, hipStream_t stream);
-// extern void grouped_infer_bp16(GroupedForwardParams& param, hipStream_t
-// stream);
+extern void grouped_infer_bp16(GroupedForwardParams& param, hipStream_t stream);
 
 namespace {
 
@@ -380,8 +378,7 @@ std::tuple<at::Tensor, at::Tensor, int64_t, int64_t> efficient_attention_forward
             }
             else if(inDataType == at::ScalarType::BFloat16)
             {
-                // batched_infer_bp16(batched_forward_params, stream);
-                throw std::runtime_error("input data-type is not supported!");
+                batched_infer_bp16(batched_forward_params, stream);
             }
             else
                 throw std::runtime_error("input data-type is not supported!");
@@ -414,8 +411,7 @@ std::tuple<at::Tensor, at::Tensor, int64_t, int64_t> efficient_attention_forward
             }
             else if(inDataType == at::ScalarType::BFloat16)
             {
-                // grouped_infer_bp16(grouped_forward_params, stream);
-                throw std::runtime_error("input data-type is not supported!");
+                grouped_infer_bp16(grouped_forward_params, stream);
             }
             else
                 throw std::runtime_error("input data-type is not supported!");
diff --git a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_infer_bp16.cpp b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_infer_bp16.cpp
new file mode 100644
index 000000000..c45f4ba00
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_infer_bp16.cpp
@@ -0,0 +1,53 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <ck/ck.hpp>
+#include <ck/utility/data_type.hpp>
+#include <stdexcept>
+
+#include "ck_bool_switch.h"
+#include "ck_tiled_fmha_batched_infer.h"
+
+extern template void
+run_batched_infer_masktype_attnbias_dispatched<ck::bhalf_t, 0, true>(BatchedForwardParams& param,
+                                                                     hipStream_t stream);
+
+extern template void
+run_batched_infer_masktype_attnbias_dispatched<ck::bhalf_t, 0, false>(BatchedForwardParams& param,
+                                                                      hipStream_t stream);
+
+extern template void
+run_batched_infer_masktype_attnbias_dispatched<ck::bhalf_t, 1, true>(BatchedForwardParams& param,
+                                                                     hipStream_t stream);
+
+extern template void
+run_batched_infer_masktype_attnbias_dispatched<ck::bhalf_t, 1, false>(BatchedForwardParams& param,
+                                                                      hipStream_t stream);
+
+extern template void
+run_batched_infer_masktype_attnbias_dispatched<ck::bhalf_t, 2, true>(BatchedForwardParams& param,
+                                                                     hipStream_t stream);
+
+extern template void
+run_batched_infer_masktype_attnbias_dispatched<ck::bhalf_t, 2, false>(BatchedForwardParams& param,
+                                                                      hipStream_t stream);
+
+void batched_infer_bp16(BatchedForwardParams& param, hipStream_t stream)
+{
+    BOOL_SWITCH_1(param.has_attn_bias, HAS_ATTN_BIAS, [&] {
+        if(param.custom_mask_type == 0)
+            run_batched_infer_masktype_attnbias_dispatched<ck::bhalf_t, 0, HAS_ATTN_BIAS>(param,
+                                                                                          stream);
+        else if(param.custom_mask_type == 1)
+            run_batched_infer_masktype_attnbias_dispatched<ck::bhalf_t, 1, HAS_ATTN_BIAS>(param,
+                                                                                          stream);
+        else if(param.custom_mask_type == 2)
+            run_batched_infer_masktype_attnbias_dispatched<ck::bhalf_t, 2, HAS_ATTN_BIAS>(param,
+                                                                                          stream);
+        else
+            throw std::runtime_error("Invalid custom_mask_type value");
+    });
+};
diff --git a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_infer_bp16.cpp b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_infer_bp16.cpp
new file mode 100644
index 000000000..b0c3318af
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_infer_bp16.cpp
@@ -0,0 +1,53 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <ck/ck.hpp>
+#include <ck/utility/data_type.hpp>
+#include <stdexcept>
+
+#include "ck_bool_switch.h"
+#include "ck_tiled_fmha_grouped_infer.h"
+
+extern template void
+run_grouped_infer_masktype_attnbias_dispatched<ck::bhalf_t, 0, true>(GroupedForwardParams& param,
+                                                                     hipStream_t stream);
+
+extern template void
+run_grouped_infer_masktype_attnbias_dispatched<ck::bhalf_t, 0, false>(GroupedForwardParams& param,
+                                                                      hipStream_t stream);
+
+extern template void
+run_grouped_infer_masktype_attnbias_dispatched<ck::bhalf_t, 1, true>(GroupedForwardParams& param,
+                                                                     hipStream_t stream);
+
+extern template void
+run_grouped_infer_masktype_attnbias_dispatched<ck::bhalf_t, 1, false>(GroupedForwardParams& param,
+                                                                      hipStream_t stream);
+
+extern template void
+run_grouped_infer_masktype_attnbias_dispatched<ck::bhalf_t, 2, true>(GroupedForwardParams& param,
+                                                                     hipStream_t stream);
+
+extern template void
+run_grouped_infer_masktype_attnbias_dispatched<ck::bhalf_t, 2, false>(GroupedForwardParams& param,
+                                                                      hipStream_t stream);
+
+void grouped_infer_bp16(GroupedForwardParams& param, hipStream_t stream)
+{
+    BOOL_SWITCH_1(param.has_attn_bias, HAS_ATTN_BIAS, [&] {
+        if(param.custom_mask_type == 0)
+            run_grouped_infer_masktype_attnbias_dispatched<ck::bhalf_t, 0, HAS_ATTN_BIAS>(param,
+                                                                                          stream);
+        else if(param.custom_mask_type == 1)
+            run_grouped_infer_masktype_attnbias_dispatched<ck::bhalf_t, 1, HAS_ATTN_BIAS>(param,
+                                                                                          stream);
+        else if(param.custom_mask_type == 2)
+            run_grouped_infer_masktype_attnbias_dispatched<ck::bhalf_t, 2, HAS_ATTN_BIAS>(param,
+                                                                                          stream);
+        else
+            throw std::runtime_error("Invalid custom_mask_type value");
+    });
+};
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_bp16_masktype_0_no_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_bp16_masktype_0_no_attnbias.cpp
new file mode 100644
index 000000000..23c8375db
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_bp16_masktype_0_no_attnbias.cpp
@@ -0,0 +1,13 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_batched_infer.h"
+
+template void
+run_batched_infer_masktype_attnbias_dispatched<ck::bhalf_t, 0, false>(BatchedForwardParams& param,
+                                                                     hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_bp16_masktype_0_with_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_bp16_masktype_0_with_attnbias.cpp
new file mode 100644
index 000000000..893cf803a
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_bp16_masktype_0_with_attnbias.cpp
@@ -0,0 +1,13 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_batched_infer.h"
+
+template void
+run_batched_infer_masktype_attnbias_dispatched<ck::bhalf_t, 0, true>(BatchedForwardParams& param,
+                                                                    hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_bp16_masktype_1_no_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_bp16_masktype_1_no_attnbias.cpp
new file mode 100644
index 000000000..ce1adafad
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_bp16_masktype_1_no_attnbias.cpp
@@ -0,0 +1,13 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_batched_infer.h"
+
+template void
+run_batched_infer_masktype_attnbias_dispatched<ck::bhalf_t, 1, false>(BatchedForwardParams& param,
+                                                                     hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_bp16_masktype_1_with_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_bp16_masktype_1_with_attnbias.cpp
new file mode 100644
index 000000000..e45b01c1c
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_bp16_masktype_1_with_attnbias.cpp
@@ -0,0 +1,13 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_batched_infer.h"
+
+template void
+run_batched_infer_masktype_attnbias_dispatched<ck::bhalf_t, 1, true>(BatchedForwardParams& param,
+                                                                    hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_bp16_masktype_2_no_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_bp16_masktype_2_no_attnbias.cpp
new file mode 100644
index 000000000..3bf55fe50
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_bp16_masktype_2_no_attnbias.cpp
@@ -0,0 +1,13 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_batched_infer.h"
+
+template void
+run_batched_infer_masktype_attnbias_dispatched<ck::bhalf_t, 2, false>(BatchedForwardParams& param,
+                                                                     hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_bp16_masktype_2_with_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_bp16_masktype_2_with_attnbias.cpp
new file mode 100644
index 000000000..861f63d35
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_bp16_masktype_2_with_attnbias.cpp
@@ -0,0 +1,13 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_batched_infer.h"
+
+template void
+run_batched_infer_masktype_attnbias_dispatched<ck::bhalf_t, 2, true>(BatchedForwardParams& param,
+                                                                    hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_bp16_masktype_0_no_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_bp16_masktype_0_no_attnbias.cpp
new file mode 100644
index 000000000..a5e5e5aa4
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_bp16_masktype_0_no_attnbias.cpp
@@ -0,0 +1,13 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_grouped_infer.h"
+
+template void
+run_grouped_infer_masktype_attnbias_dispatched<ck::bhalf_t, 0, false>(GroupedForwardParams& param,
+                                                                     hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_bp16_masktype_0_with_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_bp16_masktype_0_with_attnbias.cpp
new file mode 100644
index 000000000..d2a0f9f30
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_bp16_masktype_0_with_attnbias.cpp
@@ -0,0 +1,13 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_grouped_infer.h"
+
+template void
+run_grouped_infer_masktype_attnbias_dispatched<ck::bhalf_t, 0, true>(GroupedForwardParams& param,
+                                                                    hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_bp16_masktype_1_no_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_bp16_masktype_1_no_attnbias.cpp
new file mode 100644
index 000000000..176ff416d
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_bp16_masktype_1_no_attnbias.cpp
@@ -0,0 +1,13 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_grouped_infer.h"
+
+template void
+run_grouped_infer_masktype_attnbias_dispatched<ck::bhalf_t, 1, false>(GroupedForwardParams& param,
+                                                                     hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_bp16_masktype_1_with_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_bp16_masktype_1_with_attnbias.cpp
new file mode 100644
index 000000000..9f9dd97f1
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_bp16_masktype_1_with_attnbias.cpp
@@ -0,0 +1,13 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_grouped_infer.h"
+
+template void
+run_grouped_infer_masktype_attnbias_dispatched<ck::bhalf_t, 1, true>(GroupedForwardParams& param,
+                                                                    hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_bp16_masktype_2_no_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_bp16_masktype_2_no_attnbias.cpp
new file mode 100644
index 000000000..dc213019f
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_bp16_masktype_2_no_attnbias.cpp
@@ -0,0 +1,13 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_grouped_infer.h"
+
+template void
+run_grouped_infer_masktype_attnbias_dispatched<ck::bhalf_t, 2, false>(GroupedForwardParams& param,
+                                                                     hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_bp16_masktype_2_with_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_bp16_masktype_2_with_attnbias.cpp
new file mode 100644
index 000000000..a63206d4e
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_bp16_masktype_2_with_attnbias.cpp
@@ -0,0 +1,13 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_grouped_infer.h"
+
+template void
+run_grouped_infer_masktype_attnbias_dispatched<ck::bhalf_t, 2, true>(GroupedForwardParams& param,
+                                                                    hipStream_t stream);

From 5cfda98528131fe0d33f527d614651982c595b93 Mon Sep 17 00:00:00 2001
From: Qianfeng Zhang <Qianfeng.Zhang@amd.com>
Date: Mon, 11 Dec 2023 16:39:23 +0000
Subject: [PATCH 292/641] Update to test and benchmark scripts to include
 bfloat16

---
 tests/test_forward_ck_tiled.py                            | 8 +-------
 .../benchmarks/benchmark_mem_eff_attn_mqa_gqa_ck_tiled.py | 2 +-
 2 files changed, 2 insertions(+), 8 deletions(-)

diff --git a/tests/test_forward_ck_tiled.py b/tests/test_forward_ck_tiled.py
index 6a7512f22..e2d6abc6f 100644
--- a/tests/test_forward_ck_tiled.py
+++ b/tests/test_forward_ck_tiled.py
@@ -608,9 +608,6 @@ def test_forward(
         kv,
     ) = opFW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv
 
-    if dtype is torch.bfloat16:
-        pytest.skip("bfloat16 is currently not supported by ck-tiled!")
-
     if not (k == kv and (kv == 64 or kv == 128)):
         pytest.skip("only head-dim size 64 or 128 supported by ck-tiled!")
 
@@ -678,7 +675,7 @@ def test_forward(
 @pytest.mark.parametrize("nhead_q,nhead_kv", [(8, 1), (8, 2), (12, 4), (4, 4)])
 @pytest.mark.parametrize("seqlen_q,seqlen_kv", [(100, 128), (128, 100), (200, 1000), (400, 300)])
 @pytest.mark.parametrize("batches", [100, 64, 1])
-@pytest.mark.parametrize("dtype", [torch.float16])
+@pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16])
 @pytest.mark.parametrize("attn_bias_type", [type(None), torch.Tensor, fmha.attn_bias.LowerTriangularMask])
 @pytest.mark.parametrize("op", [fmha.ck.FwOp])
 def test_mqa_forward(
@@ -705,9 +702,6 @@ def test_mqa_forward(
 
     device = torch.device("cuda")
 
-    if dtype is torch.bfloat16:
-        pytest.skip("bfloat16 is currently not supported by ck-tiled!")
-
     if not (K == Kv and (Kv == 64 or Kv == 128)):
         pytest.skip("only head-dim size 64 or 128 supported by ck-tiled!")
 
diff --git a/xformers/benchmarks/benchmark_mem_eff_attn_mqa_gqa_ck_tiled.py b/xformers/benchmarks/benchmark_mem_eff_attn_mqa_gqa_ck_tiled.py
index 9984644bb..d2e57b849 100644
--- a/xformers/benchmarks/benchmark_mem_eff_attn_mqa_gqa_ck_tiled.py
+++ b/xformers/benchmarks/benchmark_mem_eff_attn_mqa_gqa_ck_tiled.py
@@ -168,7 +168,7 @@ def product_dict(**kwargs):
         num_threads=NUM_THREADS,
         dropout_p=[0.0],
         attn_bias_cfg=[(type(None), False)],
-        dtype=[torch.half],
+        dtype=[torch.half, torch.bfloat16],
     )
 )
 

From ab605478530ee9a6780960ab6c556eb6b2df7994 Mon Sep 17 00:00:00 2001
From: Qianfeng Zhang <Qianfeng.Zhang@amd.com>
Date: Mon, 11 Dec 2023 16:57:58 +0000
Subject: [PATCH 293/641] Tiny update to ck_tiled kernel

---
 .../hip_fmha/ck_tiled_fmha_forward_kernel.h   | 42 ++++++++-----------
 1 file changed, 18 insertions(+), 24 deletions(-)

diff --git a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_forward_kernel.h b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_forward_kernel.h
index 288629a79..a36f3cb1c 100644
--- a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_forward_kernel.h
+++ b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_forward_kernel.h
@@ -18,7 +18,9 @@
 // P[seqlen_q, seqlen_k] = Softmax(S[seqlen_q, seqlen_k])
 // O[seqlen_q, hdim_v] = P[seqlen_q, seqlen_k] * V[hdim_v, seqlen_k]
 
+#ifndef C_LOG2E
 #define C_LOG2E 1.44269504088896340736 // log2(e)
+#endif
 
 template <typename TilePartitioner_, typename FmhaPipeline_, typename EpiloguePipeline_>
 struct FmhaFwdKernel
@@ -550,28 +552,12 @@ struct FmhaFwdKernel
             make_tile_window(v_dram,
                              make_tuple(Number<FmhaPipeline::kN1>{}, Number<FmhaPipeline::kK1>{}),
                              {i_n1, 0});
-
-        const auto run_pipeline_with = [&](auto bias_dram_window) {
-            C0MatrixMask casual_mask{kargs.seqlen_q, kargs.seqlen_k};
-
-            return FmhaPipeline{}(q_dram_window,
-                                  k_dram_window,
-                                  v_dram_window,
-                                  bias_dram_window,
-                                  casual_mask,
-                                  kargs.scale,
-                                  ck::math::integer_divide_ceil(kargs.seqlen_k, FmhaPipeline::kN0),
-                                  ck::math::integer_divide_ceil(kargs.hdim_q, FmhaPipeline::kK0),
-                                  smem_ptr);
-        };
-
         /// FIXME: Before C++20, capturing structured binding variables is not supported. Remove
         /// following copy capture of the 'i_nhead'
         ///        if compiled in C++20
-        auto o_acc_tile = [&, i_nhead_ = i_nhead]() {
+        const auto bias_dram_window = [&, i_nhead_ = i_nhead]() {
             constexpr auto bias_dram_window_lengths =
                 make_tuple(Number<FmhaPipeline::kM0>{}, Number<FmhaPipeline::kN0>{});
-
             if constexpr(kHasBias)
             {
                 const BiasDataType* bias_ptr =
@@ -591,19 +577,27 @@ struct FmhaFwdKernel
                                            Sequence<kM0NeedPadding, kN0K1NeedPadding>{});
                 }();
 
-                const auto bias_dram_window =
-                    make_tile_window(bias_dram, bias_dram_window_lengths, {i_m0, 0});
-
-                return run_pipeline_with(bias_dram_window);
+                return make_tile_window(bias_dram, bias_dram_window_lengths, {i_m0, 0});
             }
             else
             {
-                const auto dummy_bias_dram_window = make_null_tile_window(bias_dram_window_lengths);
-
-                return run_pipeline_with(dummy_bias_dram_window);
+                return make_null_tile_window(bias_dram_window_lengths);
             }
         }();
 
+        C0MatrixMask casual_mask{kargs.seqlen_q, kargs.seqlen_k};
+
+        auto o_acc_tile =
+            FmhaPipeline{}(q_dram_window,
+                           k_dram_window,
+                           v_dram_window,
+                           bias_dram_window,
+                           casual_mask,
+                           kargs.scale,
+                           ck::math::integer_divide_ceil(kargs.seqlen_k, FmhaPipeline::kN0),
+                           ck::math::integer_divide_ceil(kargs.hdim_q, FmhaPipeline::kK0),
+                           smem_ptr);
+
         // O DRAM and O DRAM window
         auto o_dram = [&]() {
             const auto o_dram_naive = make_naive_tensor_view<AddressSpaceEnum::Global>(

From a2af789e85642812f4f342d28bc75fd1746e20e5 Mon Sep 17 00:00:00 2001
From: Qianfeng Zhang <Qianfeng.Zhang@amd.com>
Date: Mon, 11 Dec 2023 17:21:21 +0000
Subject: [PATCH 294/641] Change to benchmark_mem_eff_attn_mqa_gqa_ck_tiled
 benchmark cases

---
 .../benchmark_mem_eff_attn_mqa_gqa_ck_tiled.py   | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/xformers/benchmarks/benchmark_mem_eff_attn_mqa_gqa_ck_tiled.py b/xformers/benchmarks/benchmark_mem_eff_attn_mqa_gqa_ck_tiled.py
index d2e57b849..69b092788 100644
--- a/xformers/benchmarks/benchmark_mem_eff_attn_mqa_gqa_ck_tiled.py
+++ b/xformers/benchmarks/benchmark_mem_eff_attn_mqa_gqa_ck_tiled.py
@@ -131,15 +131,15 @@ def T(t):
 
 NUM_THREADS = [1] if device.type == "cuda" else [1, 40]
 SHAPES = [
-    (1, 512, 8192, 64, 8, 128),
-    (1, 1024, 8192, 64, 8, 128),
-    (1, 2048, 8192, 64, 8, 128),
-    (1, 4096, 8192, 64, 8, 128),
+    (1, 512, 512, 64, 8, 128),
+    (1, 1024, 1024, 64, 8, 128),
+    (1, 2048, 2048, 64, 8, 128),
+    (1, 4096, 4096, 64, 8, 128),
     (1, 8192, 8192, 64, 8, 128),
-    (1, 16384, 8192, 64, 8, 128),
-    (1, 1024, 8192, 64, 8, 64),
-    (1, 1024, 8192, 8, 1, 64),
-    (1, 1024, 8192, 4, 4, 64),
+    (1, 16384, 16384, 64, 8, 128),
+    (1, 1024, 1024, 64, 8, 64),
+    (1, 1024, 1024, 8, 1, 64),
+    (1, 1024, 1024, 4, 4, 64),
     ##*sorted(itertools.product([1, 2], [2048, 4096], [2048, 4096], [4, 8], [1, 2], [128])),
     ##*sorted(
     ##    itertools.product([16], [128, 512], [512, 1024], [16], [2, 4], [64, 128])

From d957dd98a220c1a999eb135286896e1a59349c6a Mon Sep 17 00:00:00 2001
From: Max Podkorytov <4273004+tenpercent@users.noreply.github.com>
Date: Mon, 11 Dec 2023 14:03:08 -0500
Subject: [PATCH 295/641] stash changes

---
 tests/test_mem_eff_attention_ck.py            |  4 +++
 .../hip_fmha/attention_decoder_splitk.cpp     |  8 ------
 .../ck_attention_forward_decoder_splitk.h     | 26 +++++++++----------
 3 files changed, 16 insertions(+), 22 deletions(-)
 delete mode 100644 xformers/csrc/attention/hip_fmha/attention_decoder_splitk.cpp

diff --git a/tests/test_mem_eff_attention_ck.py b/tests/test_mem_eff_attention_ck.py
index 073adcc4d..3f17eebf8 100644
--- a/tests/test_mem_eff_attention_ck.py
+++ b/tests/test_mem_eff_attention_ck.py
@@ -1807,6 +1807,10 @@ def _kv_heads_label(kv_heads: Optional[int]) -> str:
 @pytest.mark.parametrize("bsz,n_heads", [(1, 1), (1, 16), (1, 32), (8, 1), (4, 8)])
 @pytest.mark.parametrize("padding", [32, 4096])
 @pytest.mark.parametrize("dtype", ["f16", "bf16", "f32"])
+# @pytest.mark.parametrize("dtype", ["f16"])
+# @pytest.mark.parametrize("kv_heads", [None, 1, 2], ids=_kv_heads_label)
+# @pytest.mark.parametrize("n_heads", [16])
+# @pytest.mark.parametrize("padding, bsz", [(32, 8), (4096, 1)])
 def test_decoder(
     op,
     n_heads: int,
diff --git a/xformers/csrc/attention/hip_fmha/attention_decoder_splitk.cpp b/xformers/csrc/attention/hip_fmha/attention_decoder_splitk.cpp
deleted file mode 100644
index e535ddb7e..000000000
--- a/xformers/csrc/attention/hip_fmha/attention_decoder_splitk.cpp
+++ /dev/null
@@ -1,8 +0,0 @@
-#include <ATen/Dispatch.h>
-#include <ATen/Functions.h>
-#include <ATen/Tensor.h>
-#include <c10/cuda/CUDAStream.h>
-#include <ck/utility/data_type.hpp>
-#include <ck/utility/inner_product.hpp>
-#include <torch/library.h>
-
diff --git a/xformers/csrc/attention/hip_fmha/ck_attention_forward_decoder_splitk.h b/xformers/csrc/attention/hip_fmha/ck_attention_forward_decoder_splitk.h
index a76aacfa1..29f330b29 100644
--- a/xformers/csrc/attention/hip_fmha/ck_attention_forward_decoder_splitk.h
+++ b/xformers/csrc/attention/hip_fmha/ck_attention_forward_decoder_splitk.h
@@ -111,16 +111,16 @@ __global__ void efficient_attention_forward_decoder_splitk_reduce_ck_kernel(
   const compute_t* __restrict__ split_max,
   const compute_t* __restrict__ split_sumexp,
   scalar_t* __restrict__ O,
-  int32_t Q_size_m,
-  int32_t Q_size_g,
-  int32_t Q_size_h,
-  int32_t Q_size_k,
-  ptrdiff_t O_stride_split,
-  ptrdiff_t O_stride_b,
-  ptrdiff_t O_stride_m,
-  ptrdiff_t O_stride_g,
-  ptrdiff_t O_stride_h,
-  int32_t split_k
+  const int32_t Q_size_m,
+  const int32_t Q_size_g,
+  const int32_t Q_size_h,
+  const int32_t Q_size_k,
+  const ptrdiff_t O_stride_split,
+  const ptrdiff_t O_stride_b,
+  const ptrdiff_t O_stride_m,
+  const ptrdiff_t O_stride_g,
+  const ptrdiff_t O_stride_h,
+  const int32_t split_k
 ) {
   
   // Each block handles a single batch and head and query and group
@@ -444,12 +444,10 @@ __global__ void efficient_attention_forward_decoder_splitk_ck_kernel(
   if (wavefront_idx == 0 && lane_idx == 0) {
     split_sumexp[blockIdx.x * split_k + split_idx] = softmax_denominator;
   }
-  // or maybe after scaling?
   
-  // const compute_t softmax_scale_factor = 1. / softmax_denominator;
   // now, compute the normalization across all threads.
-  for (int32_t t = thread_linear_idx; t < t_max; t += threads_per_block) {
-    // smem[t] = ck::math::exp(smem[t] - max_qk_acc) * softmax_scale_factor;
+  for (int32_t t = thread_linear_idx; t < t_max; t += threads_per_block) { 
+    // softmax scale by sumexp will happen in the reduction kernel
     smem[t] = ck::math::exp(smem[t] - max_qk_acc);
   }
   __syncthreads();

From 40aa88435a10f95de8ff4a055433967281686151 Mon Sep 17 00:00:00 2001
From: Qianfeng Zhang <Qianfeng.Zhang@amd.com>
Date: Mon, 11 Dec 2023 22:53:00 +0000
Subject: [PATCH 296/641] Use Async pipeline for no M/N0K1 padding cases

---
 xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_infer.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_infer.h b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_infer.h
index 193e0989f..9ad19cb6f 100644
--- a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_infer.h
+++ b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_infer.h
@@ -21,6 +21,7 @@
 #include <ck/tile_program/block_tile_pipeline/block_fmha_pipeline_qkvs.hpp>
 #include <ck/tile_program/block_tile_pipeline/block_fmha_pipeline_qkvs_default_policy.hpp>
 #include <ck/tile_program/block_tile_pipeline/block_fmha_pipeline_qr_ks_vs.hpp>
+#include <ck/tile_program/block_tile_pipeline/block_fmha_pipeline_qr_ks_vs_async.hpp>
 #include <ck/tile_program/block_tile_pipeline/block_fmha_pipeline_qr_ks_vs_default_policy.hpp>
 #include <ck/tile_program/tile/tile_fmha_shape.hpp>
 #include <ck/tile_program/tile/tile_fmha_traits.hpp>
@@ -115,7 +116,7 @@ struct batched_infer_masktype_attnbias_dispatched
                 using FmhaTraits = ck::tile_program::TileFmhaTraits<false, false, has_attn_bias>;
                 using FmhaPipelineProblem = FmhaPipelineProblemTemp<FmhaTraits, FmhaShape>;
                 using FmhaPipeline =
-                    ck::tile_program::block::BlockFmhaPipelineQRKSVS<FmhaPipelineProblem>;
+                    ck::tile_program::block::BlockFmhaPipelineQRKSVSAsync<FmhaPipelineProblem>;
                 using FmhaKernel = FmhaFwdKernel<FmhaTilePartitioner, FmhaPipeline, FmhaEpilogue>;
 
                 RunWithKernel<FmhaKernel>(param, stream);

From 73e97d8f5d4f4c2853be081916db7e864cc1b552 Mon Sep 17 00:00:00 2001
From: Qianfeng Zhang <Qianfeng.Zhang@amd.com>
Date: Mon, 11 Dec 2023 23:32:24 +0000
Subject: [PATCH 297/641] Add CF_FMHA_FWD_FAST_EXP2 to buiding

---
 setup.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/setup.py b/setup.py
index 9f21987ad..673e760a5 100644
--- a/setup.py
+++ b/setup.py
@@ -336,6 +336,8 @@ def get_extensions():
                     f"--offload-arch={os.getenv('HIP_ARCHITECTURES', 'native')}",
                     "-U__CUDA_NO_HALF_OPERATORS__",
                     "-U__CUDA_NO_HALF_CONVERSIONS__",
+                    "-DCK_FMHA_FWD_FAST_EXP2=1",
+                    "-fgpu-flush-denormals-to-zero",
                 ]
                 + generator_flag
                 + cc_flag

From b0c7023c0ad46e8c26f714163961d7dc7713130c Mon Sep 17 00:00:00 2001
From: Grigory Sizov <grigorysizov@fb.com>
Date: Tue, 12 Dec 2023 06:16:39 -0800
Subject: [PATCH 298/641] Add Triton FA2 forward op

---
 xformers/ops/fmha/__init__.py |   5 +-
 xformers/ops/fmha/triton.py   | 695 ++++++++++++++++++++++++++++------
 2 files changed, 573 insertions(+), 127 deletions(-)

diff --git a/xformers/ops/fmha/__init__.py b/xformers/ops/fmha/__init__.py
index 9c2733f07..5dd416bd5 100644
--- a/xformers/ops/fmha/__init__.py
+++ b/xformers/ops/fmha/__init__.py
@@ -28,8 +28,8 @@
 MemoryEfficientAttentionTritonFwdFlashBwOp = (triton.FwOp, flash.BwOp)
 MemoryEfficientAttentionFlashAttentionOp = (flash.FwOp, flash.BwOp)
 MemoryEfficientAttentionOp = (small_k.FwOp, small_k.BwOp)
-TritonFlashAttentionOp = (triton.FwOp, triton.BwOp)
-MemoryEfficientAttentionCkOp = (ck.FwOp, ck.BwOp) 
+TritonFlashAttentionOp = (triton.FwOp, cutlass.BwOp if torch.version.cuda else ck.BwOp)
+MemoryEfficientAttentionCkOp = (ck.FwOp, ck.BwOp)
 MemoryEfficientAttentionCkDecoderOp = (ck_decoder.FwOp, ck.BwOp)
 
 class _fMHA(torch.autograd.Function):
@@ -395,7 +395,6 @@ def _memory_efficient_attention_backward(
 ALL_BW_OPS: Sequence[Type[AttentionBwOpBase]] = [
     cutlass.BwOp,
     flash.BwOp,
-    triton.BwOp,
     small_k.BwOp,
 ]
 
diff --git a/xformers/ops/fmha/triton.py b/xformers/ops/fmha/triton.py
index 2d6e2a059..d575dca27 100644
--- a/xformers/ops/fmha/triton.py
+++ b/xformers/ops/fmha/triton.py
@@ -3,63 +3,432 @@
 # This source code is licensed under the BSD license found in the
 # LICENSE file in the root directory of this source tree.
 
+"""
+Triton Flash Attention 2
+Based on
+https://github.com/openai/triton/blob/293b7fd592a1602f2305c1bd0bc978bbd97337d6/python/tutorials/06-fused-attention.py  # noqa: E501
+https://github.com/openai/triton/blob/293b7fd592a1602f2305c1bd0bc978bbd97337d6/python/triton/ops/flash_attention.py  # noqa: E501
+https://github.com/Dao-AILab/flash-attention/blob/dd9a6fa45a9b90ff954d2b3f3f44241b9216190e/flash_attn/flash_attn_triton.py  # noqa: E501
+https://github.com/ROCmSoftwarePlatform/triton/blob/670ae8054da008424097989a5b6e3816aa601e07/python/perf-kernels/06-fused-attention-transV.py  # noqa: E501
+"""
 
 from dataclasses import replace
-from typing import TYPE_CHECKING, Any, List, Optional, Set, Tuple
+from typing import Any, List, Optional, Set, Tuple
 
 import torch
 
-from ... import _is_triton_available
+import triton
+import triton.language as tl
+
 from ..common import register_operator
 
-# This implementation needs pre-MLIR triton
-# The BW pass is not stable/well tested
-# And also does not have the latest improvements
-if TYPE_CHECKING or (False and _is_triton_available()):
-    try:
-        from flash_attn.flash_attn_triton import (
-            _flash_attn_backward,
-            _flash_attn_forward,
+from .attn_bias import (
+    BlockDiagonalCausalMask,
+    BlockDiagonalCausalWithOffsetPaddedKeysMask,
+    LowerTriangularMask,
+)
+from .common import AttentionFwOpBase, check_lastdim_alignment_stride1, Context, Inputs
+
+
+@triton.jit
+def _fwd_kernel_triton_flash_inner(
+    acc,
+    l_i,
+    m_i,
+    q,
+    K_block_ptr,
+    V_block_ptr,
+    q_seq_start,
+    lo,
+    hi,
+    start_m,
+    qk_scale,
+    kv_len,
+    offs_m,
+    offs_n,
+    BLOCK_M: tl.constexpr,
+    BLOCK_N: tl.constexpr,
+    IS_CAUSAL: tl.constexpr,
+    BOUNDS_CHECKS_N: tl.constexpr,
+    CAST_BEFORE_MATMUL: tl.constexpr,
+    ALLOW_TF32: tl.constexpr,
+    STAGE: tl.constexpr,
+    pre_load_v: tl.constexpr,
+):
+    BOUNDS_CHECKS_STAGE: tl.constexpr = BOUNDS_CHECKS_N and STAGE == 2
+    # Doesn't seem to make a difference
+    if STAGE == 1:
+        lo = 0
+    else:
+        lo = tl.multiple_of(lo, BLOCK_N)
+        K_block_ptr = tl.advance(K_block_ptr, (0, lo))
+        V_block_ptr = tl.advance(V_block_ptr, (lo, 0))
+
+    # loop over k, v and update accumulator
+    for start_n in range(lo, hi, BLOCK_N):
+        start_n = tl.multiple_of(start_n, BLOCK_N)  # doesn't seem to make a difference
+        # -- load k, v --
+        k = tl.load(K_block_ptr, boundary_check=(1,) if BOUNDS_CHECKS_STAGE else ())
+        # Moving masking here seems to introduce num errors,
+        # e.g. in test_forward[tritonflashattF-cuda-torch.bfloat16-NoneType-1-256-15-1-32-32-False-BMHK]
+        # if BOUNDS_CHECKS_N or USE_SEQ_LEN:
+        #     k = tl.where(hi - tl.arange(0, BLOCK_N) > start_n, k, float("-inf"))
+        if pre_load_v:
+            v = tl.load(V_block_ptr, boundary_check=(0,) if BOUNDS_CHECKS_STAGE else ())
+        # -- compute qk ---
+        qk = tl.zeros([BLOCK_M, BLOCK_N], dtype=tl.float32)
+        qk += tl.dot(q.to(k.dtype), k, allow_tf32=ALLOW_TF32) * qk_scale
+        if CAST_BEFORE_MATMUL:
+            k = k.to(tl.float32)
+        if STAGE == 2:
+            if IS_CAUSAL:
+                # For some reason this is faster than start_n <= q_seq_start + offs_m[:, None] - offs_n[None, :]
+                qk = tl.where(
+                    q_seq_start + offs_m[:, None] >= (start_n + offs_n[None, :]),
+                    qk,
+                    float("-inf"),
+                )
+            if BOUNDS_CHECKS_N:
+                qk = tl.where(tl.arange(0, BLOCK_N) < hi - start_n, qk, float("-inf"))
+
+        # -- compute scaling constant ---
+        m_i_new = tl.maximum(m_i, tl.max(qk, 1))
+        qk = qk - m_i_new[:, None]
+        alpha = tl.math.exp2(m_i - m_i_new)
+        p = tl.math.exp2(qk)
+
+        # -- scale and update acc --
+        acc *= alpha[:, None]
+        if not pre_load_v:
+            v = tl.load(V_block_ptr, boundary_check=(0,) if BOUNDS_CHECKS_STAGE else ())
+        if CAST_BEFORE_MATMUL:
+            v = v.to(tl.float32)
+        acc += tl.dot(p.to(v.dtype), v, allow_tf32=ALLOW_TF32)
+        # -- update m_i and l_i --
+        l_i = l_i * alpha + tl.sum(p, 1)
+        m_i = m_i_new
+        # update pointers
+        K_block_ptr = tl.advance(K_block_ptr, (0, BLOCK_N))
+        V_block_ptr = tl.advance(V_block_ptr, (BLOCK_N, 0))
+    return acc, l_i, m_i
+
+
+@triton.jit
+def _fwd_kernel_triton_flash(
+    Q,
+    K,
+    V,
+    sm_scale,
+    L,
+    Out,
+    Seq_len,
+    Seq_pos_q,
+    stride_qz,
+    stride_qh,
+    stride_qm,
+    stride_qk,
+    stride_kz,
+    stride_kh,
+    stride_kn,
+    stride_kk,
+    stride_vz,
+    stride_vh,
+    stride_vk,
+    stride_vn,
+    stride_oz,
+    stride_oh,
+    stride_om,
+    stride_on,
+    Z,
+    H,
+    N_CTX,
+    Mkv,
+    BLOCK_M: tl.constexpr,
+    BLOCK_DMODEL: tl.constexpr,
+    BLOCK_N: tl.constexpr,
+    IS_CAUSAL: tl.constexpr,
+    BOUNDS_CHECKS_N: tl.constexpr,
+    BOUNDS_CHECKS_M: tl.constexpr,
+    ALLOW_TF32: tl.constexpr,
+    CAST_BEFORE_MATMUL: tl.constexpr,
+    USE_SEQ_LEN_KV: tl.constexpr,
+    USE_SEQ_POS_Q: tl.constexpr,
+    IS_KV_PADDED: tl.constexpr,  # Switch between padded and non-padded block-diagonal causal masks
+    pre_load_v: tl.constexpr,  # TODO: understand if that matters
+):
+    start_m = tl.program_id(0).to(tl.int64)
+    off_hz = tl.program_id(1).to(tl.int64)
+
+    tl.static_assert((IS_KV_PADDED and USE_SEQ_POS_Q) or not IS_KV_PADDED)
+
+    off_z = off_hz // H
+    off_h = off_hz % H
+    if USE_SEQ_POS_Q:
+        seqpos = tl.load(Seq_pos_q + off_z)
+        seqpos_next = tl.load(Seq_pos_q + off_z + 1)
+        q_len = seqpos_next - seqpos
+        q_offset = seqpos * stride_qm + off_h * stride_qh
+        out_offset = seqpos * stride_om + off_h * stride_oh
+        if not IS_KV_PADDED:
+            # BlockDiagonalCausalMask, no padding, use same sequence positions as for Q
+            kv_offset = seqpos * stride_kn + off_h * stride_kh
+            kv_len = q_len
+            q_seq_start = 0
+        else:
+            # BlockDiagonalCausalWithOffsetPaddedKeysMask
+            kv_offset = off_z * stride_kz + off_h * stride_kh
+            if USE_SEQ_LEN_KV:
+                kv_len = tl.load(Seq_len + off_z)
+                q_seq_start = kv_len - q_len
+            else:
+                # if no variable K/V seqlens are provided, assume full length
+                kv_len = Mkv
+                q_seq_start = 0
+    else:
+        # No mask or simple causal mask
+        q_len = N_CTX
+        q_offset = off_z * stride_qz + off_h * stride_qh
+        out_offset = off_z * stride_oz + off_h * stride_oh
+
+        kv_len = Mkv
+        q_seq_start = 0
+        kv_offset = off_z * stride_kz + off_h * stride_kh
+
+    Q_block_ptr = tl.make_block_ptr(
+        base=Q + q_offset,
+        shape=(q_len, BLOCK_DMODEL),
+        strides=(stride_qm, stride_qk),
+        offsets=(start_m * BLOCK_M, 0),
+        block_shape=(BLOCK_M, BLOCK_DMODEL),
+        order=(1, 0),
+    )
+    K_block_ptr = tl.make_block_ptr(
+        base=K + kv_offset,
+        shape=(BLOCK_DMODEL, kv_len),
+        strides=(stride_kk, stride_kn),
+        offsets=(0, 0),
+        block_shape=(BLOCK_DMODEL, BLOCK_N),
+        order=(0, 1),
+    )
+    V_block_ptr = tl.make_block_ptr(
+        base=V + kv_offset,
+        shape=(kv_len, BLOCK_DMODEL),
+        strides=(stride_vk, stride_vn),
+        offsets=(0, 0),
+        block_shape=(BLOCK_N, BLOCK_DMODEL),
+        order=(0, 1),
+    )
+    # initialize offsets
+    offs_m = start_m * BLOCK_M + tl.arange(0, BLOCK_M)  # For Q
+    offs_n = tl.arange(0, BLOCK_N)  # For K/V
+    # initialize pointer to m and l
+    m_i = tl.zeros([BLOCK_M], dtype=tl.float32) - float("inf")
+    l_i = tl.zeros([BLOCK_M], dtype=tl.float32) + 1.0
+    acc = tl.zeros([BLOCK_M, BLOCK_DMODEL], dtype=tl.float32)
+    # scale sm_scale by log_2(e) and use
+    # 2^x instead of exp in the loop because CSE and LICM
+    # don't work as expected with `exp` in the loop
+    qk_scale = sm_scale * 1.44269504
+    # load q: it will stay in SRAM throughout on NV GPUs but in VGPRs on AMD GPUs
+    q = tl.load(
+        Q_block_ptr, boundary_check=(0,) if BOUNDS_CHECKS_M or USE_SEQ_POS_Q else ()
+    )
+
+    # The loop over K/V sequence blocks is divided into two stages:
+    # Stage 1: (many) blocks which don't need boundary conditions checks - not touching sequence end or diagonal
+    # Stage 2: (few) blocks which need boundary conditions checks
+    # Following https://github.com/openai/triton/blob/293b7fd592a1602f2305c1bd0bc978bbd97337d6/python/tutorials/06-fused-attention.py  # noqa: E501
+
+    """
+    Iteration doesn't need masking if
+        - 1) block doesn't cross the diagonal: max(kv_pos) <= min(q_pos)
+        - 2) block doesn't cross the end of the sequence: max(kv_pos) < kv_len
+    Find maximum start_n for which condition 1 is satisifed.
+    Remember that
+        q_pos = q_seq_start + offs_m[:, None]
+        kv_pos = start_n + offs_n[None, :]
+        offs_m = start_m * BLOCK_M + tl.arange(0, BLOCK_M)
+        offs_n = tl.arange(0, BLOCK_N)
+    min(q_pos) = q_seq_start + start_m * BLOCK_M
+    max(kv_pos) = start_n + BLOCK_N - 1
+    So the condition becomes
+        q_seq_start + start_m * BLOCK_M >= start_n + BLOCK_N - 1
+    So:
+    1) start_n <= q_seq_start + start_m * BLOCK_M - BLOCK_N + 1
+    2) start_n <= kv_len - BLOCK_N
+
+    So the last allowed start_n without masking is min(q_seq_start + start_m * BLOCK_M + 1, kv_len) - BLOCK_N
+    """
+    # Second stage can only be skipped if no mask is used and K/V length is divisible by the tile size
+    TWO_STAGES: tl.constexpr = BOUNDS_CHECKS_N or (
+        IS_CAUSAL or (USE_SEQ_LEN_KV or (USE_SEQ_POS_Q and not IS_KV_PADDED))
+    )
+    if TWO_STAGES:
+        # Border between two stages
+        hi_stage_1 = min(q_seq_start + start_m * BLOCK_M + 1, kv_len) - BLOCK_N
+        hi_stage_1 = (
+            hi_stage_1 // BLOCK_N
+        ) * BLOCK_N  # Don't understand why it doesn't work without this
+    else:
+        hi_stage_1 = kv_len
+
+    # Stage 1 - no boundary conditions
+    acc, l_i, m_i = _fwd_kernel_triton_flash_inner(
+        acc,
+        l_i,
+        m_i,
+        q,
+        K_block_ptr,
+        V_block_ptr,
+        q_seq_start,
+        0,
+        hi_stage_1,
+        start_m,
+        qk_scale,
+        kv_len,
+        offs_m,
+        offs_n,
+        BLOCK_M,
+        BLOCK_N,
+        IS_CAUSAL,
+        BOUNDS_CHECKS_N,
+        CAST_BEFORE_MATMUL,
+        ALLOW_TF32,
+        STAGE=1,
+        pre_load_v=pre_load_v,
+    )
+    if TWO_STAGES:
+        hi = (
+            tl.minimum(kv_len, q_seq_start + (start_m + 1) * BLOCK_M)
+            if IS_CAUSAL
+            else kv_len
         )
-    except ImportError:
-        import importlib
-        import pathlib
-        import sys
-        import types
-
-        def import_module_from_path(path: str) -> types.ModuleType:
-            """Import a module from the given path, w/o __init__.py"""
-            module_path = pathlib.Path(path).resolve()
-            module_name = module_path.stem  # 'path/x.py' -> 'x'
-            spec = importlib.util.spec_from_file_location(module_name, module_path)  # type: ignore
-            assert isinstance(spec, importlib.machinery.ModuleSpec)
-            module = importlib.util.module_from_spec(spec)  # type: ignore
-            sys.modules[module_name] = module
-            assert isinstance(spec.loader, importlib.abc.Loader)
-            spec.loader.exec_module(module)
-            return module
-
-        flash_attn = import_module_from_path(
-            "third_party/flash-attention/flash_attn/flash_attn_triton.py"
+        # Do we need this barrier?
+        # tl.debug_barrier()
+        # Stage 2 - with boundary conditions
+        acc, l_i, m_i = _fwd_kernel_triton_flash_inner(
+            acc,
+            l_i,
+            m_i,
+            q,
+            K_block_ptr,
+            V_block_ptr,
+            q_seq_start,
+            hi_stage_1,
+            hi,
+            start_m,
+            qk_scale,
+            kv_len,
+            offs_m,
+            offs_n,
+            BLOCK_M,
+            BLOCK_N,
+            IS_CAUSAL,
+            BOUNDS_CHECKS_N,
+            CAST_BEFORE_MATMUL,
+            ALLOW_TF32,
+            STAGE=2,
+            pre_load_v=pre_load_v,
         )
-        _flash_attn_backward = flash_attn._flash_attn_backward
-        _flash_attn_forward = flash_attn._flash_attn_forward
-
-    triton_flash_backward = _flash_attn_backward
-    triton_flash_forward = _flash_attn_forward
-else:
-    triton_flash_backward = None
-    triton_flash_forward = None
-
-from .attn_bias import LowerTriangularMask
-from .common import (
-    AttentionBwOpBase,
-    AttentionFwOpBase,
-    Context,
-    Gradients,
-    Inputs,
-    check_lastdim_alignment_stride1,
-)
+
+    # write back l and m
+    acc1 = acc / l_i[:, None]
+    l_ptrs = L + off_hz * N_CTX + offs_m
+    # Save LSE, converting from log2 to natural logarithm
+    l_mask = (
+        start_m * BLOCK_M + tl.arange(0, BLOCK_M) < q_len if BOUNDS_CHECKS_M else None
+    )
+    tl.store(l_ptrs, (m_i + tl.math.log2(l_i)) / 1.44269504, mask=l_mask)
+    # write back O
+    O_block_ptr = tl.make_block_ptr(
+        base=Out + out_offset,
+        shape=(q_len, BLOCK_DMODEL),
+        strides=(stride_om, stride_on),
+        offsets=(start_m * BLOCK_M, 0),
+        block_shape=(BLOCK_M, BLOCK_DMODEL),
+        order=(1, 0),
+    )
+    tl.store(
+        O_block_ptr,
+        acc1.to(Out.dtype.element_ty),
+        boundary_check=(0,) if BOUNDS_CHECKS_M or USE_SEQ_POS_Q else (),
+    )
+
+
+_autotuner_config_amd_full = [
+    triton.Config(
+        {"BLOCK_M": 256, "BLOCK_N": 64, "waves_per_eu": 2, "pre_load_v": False},
+        num_stages=1,
+        num_warps=8,
+    ),
+    triton.Config(
+        {"BLOCK_M": 128, "BLOCK_N": 128, "waves_per_eu": 2, "pre_load_v": False},
+        num_stages=1,
+        num_warps=4,
+    ),
+    triton.Config(
+        {"BLOCK_M": 256, "BLOCK_N": 128, "waves_per_eu": 2, "pre_load_v": False},
+        num_stages=1,
+        num_warps=8,
+    ),
+    triton.Config(
+        {"BLOCK_M": 128, "BLOCK_N": 64, "waves_per_eu": 3, "pre_load_v": True},
+        num_stages=1,
+        num_warps=4,
+    ),  # d64-False
+    triton.Config(
+        {"BLOCK_M": 128, "BLOCK_N": 64, "waves_per_eu": 3, "pre_load_v": False},
+        num_stages=1,
+        num_warps=4,
+    ),  # d64-True
+]
+
+
+_autotuner_config_amd_dummy = [
+    triton.Config(
+        {"BLOCK_M": 128, "BLOCK_N": 64, "waves_per_eu": 2, "pre_load_v": False},
+        num_stages=1,
+        num_warps=8,
+    ),
+]
+
+_autotuner_config_nvidia_dummy = [
+    triton.Config(
+        {"BLOCK_M": 128, "BLOCK_N": 64, "pre_load_v": False},
+        num_stages=1,
+        num_warps=8,
+    ),
+]
+
+
+def autotune_kernel(kernel, autotune):
+
+    kernel = triton.heuristics(
+        values={
+            "BOUNDS_CHECKS_N": lambda args: ((args["Mkv"] % args["BLOCK_N"]) != 0)
+            or (args["USE_SEQ_POS_Q"] and not args["IS_KV_PADDED"]),
+            "BOUNDS_CHECKS_M": lambda args: (args["N_CTX"] % args["BLOCK_M"]) != 0,
+        }
+    )(kernel)
+
+    if torch.version.cuda:
+        configs = _autotuner_config_nvidia_dummy
+    elif autotune:
+        configs = _autotuner_config_amd_full
+    else:
+        configs = _autotuner_config_amd_dummy
+
+    kernel = triton.autotune(
+        configs=configs,
+        key=["Z", "H", "N_CTX", "IS_CAUSAL", "BLOCK_DMODEL"],
+    )(kernel)
+    return kernel
+
+
+_fwd_kernel_triton_flash_maybe_autotuned = {
+    True: autotune_kernel(_fwd_kernel_triton_flash, True),
+    False: autotune_kernel(_fwd_kernel_triton_flash, False),
+}
 
 
 def _prepare_inputs(inp: Inputs) -> Inputs:
@@ -85,7 +454,7 @@ class FwOp(AttentionFwOpBase):
         `Phil Tillet's code <https://github.com/openai/triton/blob/master/python/tutorials/06-fused-attention.py>`_
     """
 
-    OPERATOR = triton_flash_forward
+    OPERATOR = _fwd_kernel_triton_flash
     SUPPORTED_DEVICES = {"cuda"}
     CUDA_MINIMUM_COMPUTE_CAPABILITY = (8, 0)
     SUPPORTED_DTYPES = {torch.half, torch.bfloat16}
@@ -93,33 +462,88 @@ class FwOp(AttentionFwOpBase):
     SUPPORTED_ATTN_BIAS_TYPES: Set[Any] = {
         type(None),
         LowerTriangularMask,
-        # TODO: backwards accuracy is failing for a few cases, perhaps we want to disable this for now.
-        # torch.Tensor,
+        BlockDiagonalCausalMask,
+        BlockDiagonalCausalWithOffsetPaddedKeysMask,
     }
     SUPPORTS_DROPOUT = False
     SUPPORTS_CUSTOM_SCALE = True
     NAME = "tritonflashattF"
 
+    # Off by default to avoid slowing down tests.
+    # Needs to be turned on explicitly in benchmarks, in prod, and in a small number of tests
+    AUTOTUNE = False
+
+    ERROR_ATOL: Mapping[torch.dtype, float] = {
+        torch.half: 2e-2,
+        torch.bfloat16: 2e-2,
+    }
+
+    ERROR_RTOL: Mapping[torch.dtype, float] = {
+        torch.half: 2e-2,
+        torch.bfloat16: 2e-2,
+    }
+
+    @classmethod
+    def shape_not_supported_reasons(
+        cls, Mq: int, Mkv: int, K: int, Kv: int
+    ) -> List[str]:
+        reasons = super().shape_not_supported_reasons(Mq, Mkv, K, Kv)
+        if K not in {32, 64, 128}:
+            reasons.append(f"Embed dim {K} not supported")
+        return reasons
+
     @classmethod
     def not_supported_reasons(cls, d: Inputs) -> List[str]:
         reasons = super(FwOp, cls).not_supported_reasons(d)
         check_lastdim_alignment_stride1(reasons, "query", d.query, 8)
         check_lastdim_alignment_stride1(reasons, "key", d.key, 8)
         check_lastdim_alignment_stride1(reasons, "value", d.value, 8)
-        if cls.OPERATOR is None:
-            reasons.append("triton is not available")
-        if d.device.type == "cuda":
+
+        if isinstance(
+            d.attn_bias,
+            BlockDiagonalCausalWithOffsetPaddedKeysMask,
+        ):
+            # Support padded causal block-diagonal mask if the distance between each two consecutive key starts
+            # is equal to the padding (key lengths can vary)
+            batch_size = len(d.attn_bias.q_seqinfo.seqstart_py) - 1
+            B_T = d.key.shape[
+                1
+            ]  # For these mask types the shapes of Q/K/V are (1, B_T, H, K)
+            if B_T % batch_size:
+                reasons.append(
+                    f"K/V should be padded, but batch size {batch_size} doesn't divide B*T={B_T}"
+                )
+            else:
+                kv_maxlen = d.attn_bias.k_seqinfo.padding
+                for i, seqstart in enumerate(d.attn_bias.k_seqinfo.seqstart_py):
+                    if seqstart != i * kv_maxlen:
+                        reasons.append(
+                            "Variable K/V start positions are not supported, they should be determined "
+                            f"by kv_maxlen/padding: {d.attn_bias.k_seqinfo.seqstart_py=} {kv_maxlen=} {batch_size=}"
+                        )
+                        break
+        if isinstance(
+            d.attn_bias,
+            BlockDiagonalCausalMask,
+        ):
+            # Support padded causal block-diagonal mask if for each batch element number of queries is equal
+            # to the number of key/values, i.e. each block is square
+            for q_pos, kv_pos in zip(
+                d.attn_bias.q_seqinfo.seqstart_py, d.attn_bias.k_seqinfo.seqstart_py
+            ):
+                if q_pos != kv_pos:
+                    reasons.append(
+                        f"Position starts of Q and K/V should be the same, but got {q_pos} != {kv_pos}"
+                        f"{d.attn_bias.q_seqinfo.seqstart_py=}, {d.attn_bias.k_seqinfo.seqstart_py=}"
+                    )
+
+        if d.device.type == "cuda" and torch.version.cuda:
             # Has only been tested on 8.0 / 9.0.
             # Fails on 7.5 with illegal memory access
             if torch.cuda.get_device_capability(d.device) < (8, 0):
                 reasons.append(
                     "requires GPU with sm80 minimum compute capacity, e.g., A100/H100/L4"
                 )
-        if _is_triton_available():
-            import triton
-
-            if triton.__version__ > "2.0.0":
-                reasons.append("Only work on pre-MLIR triton for now")
         return reasons
 
     @classmethod
@@ -127,75 +551,98 @@ def apply(
         cls, inp: Inputs, needs_gradient: bool
     ) -> Tuple[torch.Tensor, Optional[Context]]:
         inp = _prepare_inputs(inp)
+        attn_bias = inp.attn_bias
+        seq_len_kv = None
+        seqstart_q = None
 
-        out, lse, softmax_scale = triton_flash_forward(
-            q=inp.query,
-            k=inp.key,
-            v=inp.value,
-            bias=inp.attn_bias if isinstance(inp.attn_bias, torch.Tensor) else None,
-            softmax_scale=inp.scale_float,
-            causal=isinstance(inp.attn_bias, LowerTriangularMask),
+        q = inp.query
+        k = inp.key
+        v = inp.value
+
+        is_bt_h_m = isinstance(
+            attn_bias,
+            (BlockDiagonalCausalWithOffsetPaddedKeysMask, BlockDiagonalCausalMask),
         )
-        return out, Context(lse=lse, out=out)
+        if is_bt_h_m:
+            # q ~ [1, B*T, H, K]
+            # TODO: do we really need to do this cast? seems fishy but
+            # I just copied it from the split-k kernel
+            attn_bias.k_seqinfo.to(inp.query.device)
+            attn_bias.q_seqinfo.to(inp.query.device)
+            seqstart_q = attn_bias.q_seqinfo.seqstart
+            B = len(seqstart_q) - 1
+            H, Kq = inp.query.shape[-2:]
+            H2, Kkv = inp.key.shape[-2:]
 
+            Mq = attn_bias.q_seqinfo.max_seqlen
+            if isinstance(attn_bias, BlockDiagonalCausalWithOffsetPaddedKeysMask):
+                seq_len_kv = attn_bias.k_seqinfo.seqlen
+                # assume kv has been padded
+                k = k.reshape(B, -1, H2, Kkv)
+                v = v.reshape(B, -1, H2, Kkv)
+        else:
+            B, Mq, H, _ = q.shape
 
-@register_operator
-class BwOp(AttentionBwOpBase):
-    __doc__ = FwOp.__doc__
-
-    OPERATOR = triton_flash_backward
-    SUPPORTED_DEVICES = FwOp.SUPPORTED_DEVICES
-    CUDA_MINIMUM_COMPUTE_CAPABILITY = FwOp.CUDA_MINIMUM_COMPUTE_CAPABILITY
-    SUPPORTED_DTYPES = FwOp.SUPPORTED_DTYPES
-    SUPPORTED_MAX_K = FwOp.SUPPORTED_MAX_K
-    SUPPORTED_ATTN_BIAS_TYPES = FwOp.SUPPORTED_ATTN_BIAS_TYPES
-    SUPPORTS_DROPOUT = FwOp.SUPPORTS_DROPOUT
-    SUPPORTS_CUSTOM_SCALE = FwOp.SUPPORTS_CUSTOM_SCALE
-    SUPPORTS_DIFFERENT_VALUE_EMBED = FwOp.SUPPORTS_DIFFERENT_VALUE_EMBED
-    NAME = "tritonflashattB"
+        # Coded for BHMK format
+        q, k, v = (
+            q.transpose(1, 2),
+            k.transpose(1, 2),
+            v.transpose(1, 2),
+        )
 
-    @classmethod
-    def not_supported_reasons(cls, d: Inputs) -> List[str]:
-        reasons = super(BwOp, cls).not_supported_reasons(d)
-        check_lastdim_alignment_stride1(reasons, "query", d.query, 8)
-        check_lastdim_alignment_stride1(reasons, "key", d.key, 8)
-        check_lastdim_alignment_stride1(reasons, "value", d.value, 8)
-        if cls.OPERATOR is None:
-            reasons.append("triton is not available")
-        if d.device.type == "cuda":
-            if torch.cuda.get_device_capability(d.device) != (8, 0):
-                reasons.append("requires A100 GPU")
-        if _is_triton_available():
-            import triton
-
-            if triton.__version__ > "2.0.0":
-                reasons.append("Only work on pre-MLIR triton for now")
-        return reasons
+        out = torch.empty_like(q)
 
-    @classmethod
-    def apply(cls, ctx: Context, inp: Inputs, grad: torch.Tensor) -> Gradients:
-        inp = _prepare_inputs(inp)
+        _, _, Mkv, K = k.shape
+
+        sm_scale = K**-0.5 if inp.scale is None else inp.scale
+        L = torch.empty((B * H, Mq), device=q.device, dtype=torch.float32)
+        is_causal = inp.attn_bias is not None
+        use_seq_len_kv = seq_len_kv is not None
+        use_seq_pos_q = seqstart_q is not None
+        is_kv_padded = isinstance(
+            attn_bias, BlockDiagonalCausalWithOffsetPaddedKeysMask
+        )
+
+        grid = lambda META: (triton.cdiv(Mq, META["BLOCK_M"]), B * H, 1)  # noqa: E731
+        kernel = _fwd_kernel_triton_flash_maybe_autotuned[cls.AUTOTUNE]
+        kernel[grid](
+            q,
+            k,
+            v,
+            sm_scale,
+            L,
+            out,
+            seq_len_kv,
+            seqstart_q,
+            q.stride(0),
+            q.stride(1),
+            q.stride(2),
+            q.stride(3),
+            k.stride(0),
+            k.stride(1),
+            k.stride(2),
+            k.stride(3),
+            v.stride(0),
+            v.stride(1),
+            v.stride(2),
+            v.stride(3),
+            out.stride(0),
+            out.stride(1),
+            out.stride(2),
+            out.stride(3),
+            B,
+            H,
+            Mq,
+            Mkv,
+            BLOCK_DMODEL=K,
+            IS_CAUSAL=is_causal,
+            USE_SEQ_LEN_KV=use_seq_len_kv,
+            USE_SEQ_POS_Q=use_seq_pos_q,
+            IS_KV_PADDED=is_kv_padded,
+            ALLOW_TF32=torch.backends.cuda.matmul.allow_tf32,
+            CAST_BEFORE_MATMUL=False,
+        )
 
-        # Triton's autotune causes the Tensor._version to change, and so Pytorch autograd
-        # does a memcpy. To avoid this we run in inference_mode, which doesn't track the version.
-        with torch.inference_mode():
-            grads = Gradients(
-                dq=torch.empty_like(inp.query),
-                dk=torch.empty_like(inp.key),
-                dv=torch.empty_like(inp.value),
-            )
-            cls.OPERATOR(
-                grad,
-                inp.query,
-                inp.key,
-                inp.value,
-                ctx.out,
-                ctx.get_padded_lse(128),
-                grads.dq,
-                grads.dk,
-                grads.dv,
-                bias=inp.attn_bias if isinstance(inp.attn_bias, torch.Tensor) else None,
-                softmax_scale=inp.scale_float,
-                causal=isinstance(inp.attn_bias, LowerTriangularMask),
-            )
-        return grads
+        out = out.transpose(1, 2)
+        L = L.reshape(B, H, Mq)
+        return out, Context(lse=L, out=out)

From 63c352322d1799df302d979fabbd015784a09a32 Mon Sep 17 00:00:00 2001
From: Grigory Sizov <grigorysizov@fb.com>
Date: Tue, 12 Dec 2023 07:01:44 -0800
Subject: [PATCH 299/641] Add Triton Flash Attention 2 to benchmarks

---
 .../benchmarks/benchmark_mem_eff_attention.py     | 15 ++++++++++++---
 xformers/ops/common.py                            |  3 ++-
 2 files changed, 14 insertions(+), 4 deletions(-)

diff --git a/xformers/benchmarks/benchmark_mem_eff_attention.py b/xformers/benchmarks/benchmark_mem_eff_attention.py
index e272fb947..d815eceac 100644
--- a/xformers/benchmarks/benchmark_mem_eff_attention.py
+++ b/xformers/benchmarks/benchmark_mem_eff_attention.py
@@ -122,12 +122,21 @@ def T(t):
     ),
 ]
 
+
+class TritonFlashAttentionFwAutotuned(xformers.ops.fmha.triton.FwOp):
+    AUTOTUNE = True
+
+
 OPS = [
     (xformers.ops.fmha.cutlass.FwOp, xformers.ops.fmha.cutlass.BwOp),
     (xformers.ops.fmha.flash.FwOp, xformers.ops.fmha.flash.BwOp),
-    # TODO: Triton is not stable: it can trigger Illegal Memory Accesses
-    # and its performance varies a lot between runs.
-    # (xformers.ops.fmha.triton.FwOp, xformers.ops.fmha.triton.BwOp),
+    (xformers.ops.fmha.ck.FwOp, xformers.ops.fmha.ck.BwOp),
+    (
+        TritonFlashAttentionFwAutotuned,
+        xformers.ops.fmha.cutlass.BwOp
+        if torch.version.cuda
+        else xformers.ops.fmha.ck.BwOp,
+    ),
 ]
 
 
diff --git a/xformers/ops/common.py b/xformers/ops/common.py
index 7fad34f05..fed2fe36d 100644
--- a/xformers/ops/common.py
+++ b/xformers/ops/common.py
@@ -34,7 +34,8 @@ class BaseOperator:
 
     @classmethod
     def is_available(cls) -> bool:
-        if cls.OPERATOR is None or cls.OPERATOR.__name__ == "no_such_operator":
+        # cls.OPERATOR can be either a kernel or a Triton Autotuner object, which doesn't have __name__
+        if cls.OPERATOR is None or getattr(cls.OPERATOR, "__name__", "") == "no_such_operator":
             return False
         return True
 

From fbd836ab13d26d41b0012fb9e5d90a1fae361a1f Mon Sep 17 00:00:00 2001
From: Qianfeng Zhang <Qianfeng.Zhang@amd.com>
Date: Tue, 12 Dec 2023 17:30:04 +0000
Subject: [PATCH 300/641] Synchronize with latest third_party/composable_kernel
 and remove the inner_product bhalf_t overloading in
 ck_attention_forward_decoder.h

---
 third_party/composable_kernel                 |  2 +-
 .../hip_fmha/ck_attention_forward_decoder.h   | 38 +------------------
 2 files changed, 2 insertions(+), 38 deletions(-)

diff --git a/third_party/composable_kernel b/third_party/composable_kernel
index 5f4e6ec00..8f0627f54 160000
--- a/third_party/composable_kernel
+++ b/third_party/composable_kernel
@@ -1 +1 @@
-Subproject commit 5f4e6ec00d12654e3897f53b48307434cd25a02f
+Subproject commit 8f0627f542f2ef9fd217ae1741531e2862dcb0fc
diff --git a/xformers/csrc/attention/hip_fmha/ck_attention_forward_decoder.h b/xformers/csrc/attention/hip_fmha/ck_attention_forward_decoder.h
index 08d0dbe06..cbb6749be 100644
--- a/xformers/csrc/attention/hip_fmha/ck_attention_forward_decoder.h
+++ b/xformers/csrc/attention/hip_fmha/ck_attention_forward_decoder.h
@@ -13,42 +13,6 @@
 #include <ck/utility/inner_product.hpp>
 #include <ck/utility/math.hpp>
 
-namespace ck {
-template <>
-__device__ void inner_product<bhalf_t, bhalf_t, float>(const bhalf_t& a, const bhalf_t& b, float& c)
-{
-    inner_product(type_convert<float>(a), type_convert<float>(b), c);
-}
-
-template <>
-__device__ void inner_product<half_t, half_t, float>(const half_t& a, const half_t& b, float& c)
-{
-    inner_product(type_convert<float>(a), type_convert<float>(b), c);
-}
-
-template <>
-__device__ void
-inner_product<bhalf2_t, bhalf2_t, float>(const bhalf2_t& a, const bhalf2_t& b, float& c)
-{
-    const vector_type<bhalf_t, 2> a_vector{a};
-    const vector_type<bhalf_t, 2> b_vector{b};
-    ck::static_for<0, 2, 1>{}([&](auto i) {
-        inner_product(a_vector.AsType<bhalf_t>()[i], b_vector.AsType<bhalf_t>()[i], c);
-    });
-}
-
-template <>
-__device__ void
-inner_product<bhalf4_t, bhalf4_t, float>(const bhalf4_t& a, const bhalf4_t& b, float& c)
-{
-    const vector_type<bhalf_t, 4> a_vector{a};
-    const vector_type<bhalf_t, 4> b_vector{b};
-    ck::static_for<0, 4, 1>{}([&](auto i) {
-        inner_product(a_vector.AsType<bhalf_t>()[i], b_vector.AsType<bhalf_t>()[i], c);
-    });
-}
-} // namespace ck
-
 namespace {
 
 template <typename data_t, int32_t vec_size>
@@ -561,4 +525,4 @@ struct FMHADecoderSeqlen1DeviceOp : public BaseOperator
 };
 } // namespace device
 } // namespace tensor_operation
-} // namespace ck
\ No newline at end of file
+} // namespace ck

From 0d15f1b4359ea5404bf4c7a7ed4dab254a854c73 Mon Sep 17 00:00:00 2001
From: Max Podkorytov <4273004+tenpercent@users.noreply.github.com>
Date: Tue, 12 Dec 2023 23:54:06 -0500
Subject: [PATCH 301/641] stash split attention testing wip

---
 .../hip_fmha/attention_forward_splitk.cpp     | 523 +++++++++++++++++-
 1 file changed, 503 insertions(+), 20 deletions(-)

diff --git a/xformers/csrc/attention/hip_fmha/attention_forward_splitk.cpp b/xformers/csrc/attention/hip_fmha/attention_forward_splitk.cpp
index 5998f3fc8..9ef53503e 100644
--- a/xformers/csrc/attention/hip_fmha/attention_forward_splitk.cpp
+++ b/xformers/csrc/attention/hip_fmha/attention_forward_splitk.cpp
@@ -203,9 +203,6 @@ at::Tensor& efficient_attention_forward_decoder_splitk_ck_out_impl(
   return O;
 }
 
-#undef AT_DISPATCH_CASE_3
-#undef AT_DISPATCH_SWITCH_3
-
 template <int32_t ThreadsPerWavefront, int32_t WavefrontsPerBlock>
 at::Tensor efficient_attention_forward_decoder_splitk_ck_impl(
     const at::Tensor& XQ, // [B, 1, G, H, D]
@@ -216,12 +213,13 @@ at::Tensor efficient_attention_forward_decoder_splitk_ck_impl(
     int64_t split_k) {
   auto O = at::empty_like(XQ);
   constexpr auto splitk_dim = 0;
+  constexpr auto rank = 5;
   auto O_splits = at::stack(O, splitk_dim);
 
-  TORCH_CHECK(XQ.dim() == 5);
-  TORCH_CHECK(cache_K.dim() == 5);
-  TORCH_CHECK(cache_V.dim() == 5);
-  TORCH_CHECK(O_splits.dim() == 6);
+  TORCH_CHECK(XQ.dim() == rank);
+  TORCH_CHECK(cache_K.dim() == rank);
+  TORCH_CHECK(cache_V.dim() == rank);
+  TORCH_CHECK(O_splits.dim() == 1 + rank);
 
   auto B = XQ.size(0);
   auto M = XQ.size(1);
@@ -257,9 +255,6 @@ TORCH_LIBRARY_IMPL(xformers, CUDA, m) {
       TORCH_FN(efficient_attention_forward_decoder_splitk_ck));
 }
 
-#undef AT_DISPATCH_CASE_3
-#undef AT_DISPATCH_SWITCH_3
-
 #ifdef ATTN_FWD_SPLITK_DECODER_MAIN
 
 #include <torch/torch.h>
@@ -293,39 +288,524 @@ TORCH_LIBRARY_IMPL(xformers, CUDA, m) {
 
 // clang-format on
 
+static std::tuple<at::Tensor, at::Tensor, at::Tensor> split1_attention_torch(
+  const at::Tensor& Q,
+  const at::Tensor& K,
+  const at::Tensor& V,
+  const at::Tensor& k_seqlens
+) {
+  auto Q_scaled = Q / sqrt(Q.size(-1));
+  auto S = at::einsum("bmghk, bnghk -> bmghn", {Q_scaled, K}, at::nullopt);
+
+  auto m = std::get<0>(at::max(S, /* dim */ 1, /* keepdim */ true));
+  auto s = at::exp(at::sub(S, m));
+  
+  // causal mask
+  for (size_t b = 0; b < k_seqlens.numel(); ++b) {
+    auto seqlen = k_seqlens[b].item<int64_t>();
+    at::slice(s[b], /* dim */ -1, /* start */ seqlen, /* end */ -1).zero_();
+  }
+
+  auto l = at::sum(s, /* dim */ -1, /* keepdim */ true);
+  auto O = at::einsum("bmghn, bnghk -> bmghk", {s, V}, at::nullopt);
+  return std::make_tuple(O, m, l);
+}
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+template <typename scalar_t, typename compute_t = float>
+struct FMHADecoderSplit1DeviceOp : public BaseOperator {
+  using DeviceOp = FMHADecoderSplit1DeviceOp;
+  struct Argument : public BaseArgument {
+    const scalar_t* __restrict__ XQ;
+    const scalar_t* __restrict__ cache_K;
+    const scalar_t* __restrict__ cache_V;
+    scalar_t* __restrict__ O;
+    scalar_t* __restrict__ split_O;
+    compute_t* __restrict__ split_max;
+    compute_t* __restrict__ split_sumexp;
+    const int32_t* __restrict__ seq_kv_lens;
+    const ptrdiff_t XQ_stride_b;
+    const ptrdiff_t XQ_stride_m;
+    const ptrdiff_t XQ_stride_g;
+    const ptrdiff_t XQ_stride_h;
+    const ptrdiff_t K_stride_b;
+    const ptrdiff_t K_stride_m;
+    const ptrdiff_t K_stride_g;
+    const ptrdiff_t K_stride_h;
+    const ptrdiff_t O_stride_split;
+    const int32_t Q_size_m;
+    const int32_t Q_size_g;
+    const int32_t Q_size_h;
+    const int32_t Q_size_k;
+    const int32_t K_size_m;
+    const bool multiquery;
+    const float qk_scale;
+    const int32_t split_k;
+
+    const dim3 grid_dim;
+    const dim3 block_dim;
+    const size_t lds_bytes;
+
+    Argument(
+        const scalar_t* __restrict__ XQ,
+        const scalar_t* __restrict__ cache_K,
+        const scalar_t* __restrict__ cache_V,
+        scalar_t* __restrict__ O,
+        scalar_t* __restrict__ split_O,
+        compute_t* __restrict__ split_max,
+        compute_t* __restrict__ split_sumexp,
+        const int32_t* __restrict__ seq_kv_lens,
+        const ptrdiff_t XQ_stride_b,
+        const ptrdiff_t XQ_stride_m,
+        const ptrdiff_t XQ_stride_g,
+        const ptrdiff_t XQ_stride_h,
+        const ptrdiff_t K_stride_b,
+        const ptrdiff_t K_stride_m,
+        const ptrdiff_t K_stride_g,
+        const ptrdiff_t K_stride_h,
+        const ptrdiff_t O_stride_split,
+        const int32_t Q_size_m,
+        const int32_t Q_size_g,
+        const int32_t Q_size_h,
+        const int32_t Q_size_k,
+        const int32_t K_size_m,
+        const bool multiquery,
+        const float qk_scale,
+        const int32_t split_k,
+        // launch params
+        const dim3 grid_dim,
+        const dim3 block_dim,
+        const size_t lds_bytes)
+        : XQ(XQ),
+          cache_K(cache_K),
+          cache_V(cache_V),
+          O(O),
+          split_O(split_O),
+          split_max(split_max),
+          split_sumexp(split_sumexp),
+          seq_kv_lens(seq_kv_lens),
+          XQ_stride_b(XQ_stride_b),
+          XQ_stride_m(XQ_stride_m),
+          XQ_stride_g(XQ_stride_g),
+          XQ_stride_h(XQ_stride_h),
+          K_stride_b(K_stride_b),
+          K_stride_m(K_stride_m),
+          K_stride_g(K_stride_g),
+          K_stride_h(K_stride_h),
+          O_stride_split(O_stride_split),
+          Q_size_m(Q_size_m),
+          Q_size_g(Q_size_g),
+          Q_size_h(Q_size_h),
+          Q_size_k(Q_size_k),
+          K_size_m(K_size_m),
+          multiquery(multiquery),
+          qk_scale(qk_scale),
+          split_k(split_k),
+          // launch params
+          grid_dim(grid_dim),
+          block_dim(block_dim),
+          lds_bytes(lds_bytes) {}
+  };
+
+  struct Invoker : public BaseInvoker {
+    using Argument = DeviceOp::Argument;
+    float Run(
+        const Argument& arg,
+        const StreamConfig& stream_config = StreamConfig{}) {
+      auto threads_per_wavefront = arg.block_dim.x;
+
+      auto Q_size_k_alignment_necessary = 0;
+
+      for (auto vec_size : {4, 2, 1}) {
+        if (arg.Q_size_k <= vec_size * threads_per_wavefront) {
+          Q_size_k_alignment_necessary = vec_size;
+        }
+      }
+
+      if (!Q_size_k_alignment_necessary) {
+        throw std::runtime_error("Unsupported Q_size_k");
+      }
+
+      if (arg.Q_size_k % Q_size_k_alignment_necessary) {
+        throw std::runtime_error("Unsupported alignment for Q_size_k");
+      }
+
+      float split_attention_result = launch_and_time_kernel(
+          stream_config,
+          Q_size_k_alignment_necessary == 4
+              ? efficient_attention_forward_decoder_splitk_ck_kernel<scalar_t, 4>
+              : Q_size_k_alignment_necessary == 2
+              ? efficient_attention_forward_decoder_splitk_ck_kernel<scalar_t, 2>
+              : Q_size_k_alignment_necessary == 1
+              ? efficient_attention_forward_decoder_splitk_ck_kernel<scalar_t, 1>
+              : nullptr,
+          arg.grid_dim,
+          arg.block_dim,
+          arg.lds_bytes,
+          arg.XQ,
+          arg.cache_K,
+          arg.cache_V,
+          arg.split_O,
+          arg.split_max,
+          arg.split_sumexp,
+          arg.seq_kv_lens,
+          arg.XQ_stride_b,
+          arg.XQ_stride_m,
+          arg.XQ_stride_g,
+          arg.XQ_stride_h,
+          arg.K_stride_b,
+          arg.K_stride_m,
+          arg.K_stride_g,
+          arg.K_stride_h,
+          arg.O_stride_split,
+          arg.Q_size_m,
+          arg.Q_size_g,
+          arg.Q_size_h,
+          arg.Q_size_k,
+          arg.K_size_m,
+          arg.multiquery,
+          arg.qk_scale,
+          arg.split_k);
+
+        return split_attention_result;
+    }
+  };
+};
+
+template <typename scalar_t, typename compute_t = float>
+struct FMHADecoderReduceDeviceOp : public BaseOperator {
+  using DeviceOp = FMHADecoderReduceDeviceOp;
+  struct Argument : public BaseArgument {
+    const scalar_t* __restrict__ XQ;
+    const scalar_t* __restrict__ cache_K;
+    const scalar_t* __restrict__ cache_V;
+    scalar_t* __restrict__ O;
+    scalar_t* __restrict__ split_O;
+    compute_t* __restrict__ split_max;
+    compute_t* __restrict__ split_sumexp;
+    const int32_t* __restrict__ seq_kv_lens;
+    const ptrdiff_t XQ_stride_b;
+    const ptrdiff_t XQ_stride_m;
+    const ptrdiff_t XQ_stride_g;
+    const ptrdiff_t XQ_stride_h;
+    const ptrdiff_t K_stride_b;
+    const ptrdiff_t K_stride_m;
+    const ptrdiff_t K_stride_g;
+    const ptrdiff_t K_stride_h;
+    const ptrdiff_t O_stride_split;
+    const int32_t Q_size_m;
+    const int32_t Q_size_g;
+    const int32_t Q_size_h;
+    const int32_t Q_size_k;
+    const int32_t K_size_m;
+    const bool multiquery;
+    const float qk_scale;
+    const int32_t split_k;
+
+    const dim3 grid_dim;
+    const dim3 block_dim;
+    const size_t lds_bytes;
+
+    Argument(
+        const scalar_t* __restrict__ XQ,
+        const scalar_t* __restrict__ cache_K,
+        const scalar_t* __restrict__ cache_V,
+        scalar_t* __restrict__ O,
+        scalar_t* __restrict__ split_O,
+        compute_t* __restrict__ split_max,
+        compute_t* __restrict__ split_sumexp,
+        const int32_t* __restrict__ seq_kv_lens,
+        const ptrdiff_t XQ_stride_b,
+        const ptrdiff_t XQ_stride_m,
+        const ptrdiff_t XQ_stride_g,
+        const ptrdiff_t XQ_stride_h,
+        const ptrdiff_t K_stride_b,
+        const ptrdiff_t K_stride_m,
+        const ptrdiff_t K_stride_g,
+        const ptrdiff_t K_stride_h,
+        const ptrdiff_t O_stride_split,
+        const int32_t Q_size_m,
+        const int32_t Q_size_g,
+        const int32_t Q_size_h,
+        const int32_t Q_size_k,
+        const int32_t K_size_m,
+        const bool multiquery,
+        const float qk_scale,
+        const int32_t split_k,
+        // launch params
+        const dim3 grid_dim,
+        const dim3 block_dim,
+        const size_t lds_bytes)
+        : XQ(XQ),
+          cache_K(cache_K),
+          cache_V(cache_V),
+          O(O),
+          split_O(split_O),
+          split_max(split_max),
+          split_sumexp(split_sumexp),
+          seq_kv_lens(seq_kv_lens),
+          XQ_stride_b(XQ_stride_b),
+          XQ_stride_m(XQ_stride_m),
+          XQ_stride_g(XQ_stride_g),
+          XQ_stride_h(XQ_stride_h),
+          K_stride_b(K_stride_b),
+          K_stride_m(K_stride_m),
+          K_stride_g(K_stride_g),
+          K_stride_h(K_stride_h),
+          O_stride_split(O_stride_split),
+          Q_size_m(Q_size_m),
+          Q_size_g(Q_size_g),
+          Q_size_h(Q_size_h),
+          Q_size_k(Q_size_k),
+          K_size_m(K_size_m),
+          multiquery(multiquery),
+          qk_scale(qk_scale),
+          split_k(split_k),
+          // launch params
+          grid_dim(grid_dim),
+          block_dim(block_dim),
+          lds_bytes(lds_bytes) {}
+  };
+
+  struct Invoker : public BaseInvoker {
+    using Argument = DeviceOp::Argument;
+    float Run(
+        const Argument& arg,
+        const StreamConfig& stream_config = StreamConfig{}) {
+      auto threads_per_wavefront = arg.block_dim.x;
+
+      auto Q_size_k_alignment_necessary = 0;
+
+      for (auto vec_size : {4, 2, 1}) {
+        if (arg.Q_size_k <= vec_size * threads_per_wavefront) {
+          Q_size_k_alignment_necessary = vec_size;
+        }
+      }
+
+      if (!Q_size_k_alignment_necessary) {
+        throw std::runtime_error("Unsupported Q_size_k");
+      }
+
+      if (arg.Q_size_k % Q_size_k_alignment_necessary) {
+        throw std::runtime_error("Unsupported alignment for Q_size_k");
+      }
+
+      const dim3 reduce_gridsize = {arg.grid_dim.x};
+      const dim3 reduce_blocksize = {arg.block_dim.x};
+      constexpr int32_t reduce_lds_bytes = 0;
+      float reduce_result = launch_and_time_kernel(
+        stream_config,
+        Q_size_k_alignment_necessary == 4
+            ? efficient_attention_forward_decoder_splitk_reduce_ck_kernel<scalar_t, 4>
+            : Q_size_k_alignment_necessary == 2
+            ? efficient_attention_forward_decoder_splitk_reduce_ck_kernel<scalar_t, 2>
+            : Q_size_k_alignment_necessary == 1
+            ? efficient_attention_forward_decoder_splitk_reduce_ck_kernel<scalar_t, 1>
+            : nullptr,
+        reduce_gridsize,
+        reduce_blocksize,
+        reduce_lds_bytes, 
+        arg.split_O,
+        arg.split_max,
+        arg.split_sumexp,
+        arg.O,
+        arg.Q_size_m,
+        arg.Q_size_g,
+        arg.Q_size_h,
+        arg.Q_size_k,
+        arg.O_stride_split,
+        arg.XQ_stride_b,
+        arg.XQ_stride_m,
+        arg.XQ_stride_g,
+        arg.XQ_stride_h,
+        arg.split_k
+      );
+      return reduce_result;
+    }
+  };
+};
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
+
+std::tuple<at::Tensor, at::Tensor, at::Tensor> 
+split1_attention(const at::Tensor& XQ, const at::Tensor& K, const at::Tensor& V, const at::Tensor& seqlen) {
+  auto B = XQ.size(0);
+  auto M = XQ.size(1);
+  auto G = XQ.size(2);
+  auto H = XQ.size(3);
+  auto D = XQ.size(4);
+
+  double qk_scale = 1. / sqrt(D);
+  constexpr auto split_k = 1;
+
+  auto O = at::empty_like(XQ);
+  constexpr auto splitk_dim = 0;
+  constexpr auto rank = 5;
+  auto split_O = at::stack(O, splitk_dim);
+  auto split_max = at::empty({B, M, G, H, split_k}, XQ.options().dtype(at::kFloat));
+  auto split_sumexp = at::empty_like(split_max);
+
+  dim3 blocks(B * H * M * G, split_k);
+  dim3 threads(kThreadsPerWavefront, kWavefrontsPerBlock);
+
+  constexpr int32_t KV_M_MAX = 8192;
+  constexpr int32_t K_MAX = 4 * kThreadsPerWavefront;
+
+  int32_t smem_softmax = KV_M_MAX * sizeof(float) + threads.y * sizeof(float);
+  int32_t smem_output = K_MAX * sizeof(float) *
+      threads.y; // 4 * threadsPerBlock * sizeof(float) == sizeof(O[b][0][h][:])
+  const size_t lds_bytes = max(smem_softmax, smem_output);
+  auto stream = at::cuda::getCurrentHIPStream().stream();
+
+  AT_DISPATCH_SWITCH_3(
+      at::ScalarType::Half,
+      at::ScalarType::BFloat16,
+      at::ScalarType::Float,
+      XQ.scalar_type(),
+      "efficient_attention_forward_decoder_split1_ck_test",
+      [&] {
+        using ck_data_t = c10_to_data_t<scalar_t>::type;
+        using device_op_t =
+            ck::tensor_operation::device::FMHADecoderSplit1DeviceOp<ck_data_t>;
+        auto op = device_op_t{};
+
+        auto XQ_acc =
+            XQ.packed_accessor32<scalar_t, rank, at::RestrictPtrTraits>();
+        auto K_acc =
+            K.packed_accessor64<scalar_t, rank, at::RestrictPtrTraits>();
+        auto V_acc =
+            V.packed_accessor64<scalar_t, rank, at::RestrictPtrTraits>();
+        auto split_O_acc = split_O.packed_accessor32<scalar_t, 1 + rank, at::RestrictPtrTraits>();
+        auto O_acc = O.packed_accessor32<scalar_t, rank, at::RestrictPtrTraits>();
+        auto seq_acc = seqlen.packed_accessor32<int32_t, 1, at::RestrictPtrTraits>().data();
+        auto split_max_acc = split_max.packed_accessor32<float, rank, at::RestrictPtrTraits>();
+        auto split_sumexp_acc = split_sumexp.packed_accessor32<float, rank, at::RestrictPtrTraits>();
+        auto arg = device_op_t::Argument(
+            reinterpret_cast<const ck_data_t* __restrict__>(XQ_acc.data()),
+            reinterpret_cast<const ck_data_t* __restrict__>(K_acc.data()),
+            reinterpret_cast<const ck_data_t* __restrict__>(V_acc.data()),
+            reinterpret_cast<ck_data_t* __restrict__>(O_acc.data()),
+            reinterpret_cast<ck_data_t* __restrict__>(split_O_acc.data()),
+            split_max_acc.data(),
+            split_sumexp_acc.data(),
+            seq_acc,
+            XQ_acc.stride(0),
+            XQ_acc.stride(1),
+            XQ_acc.stride(2),
+            XQ_acc.stride(3),
+            K_acc.stride(0),
+            K_acc.stride(1),
+            K_acc.stride(2),
+            K_acc.stride(3),
+            split_O_acc.stride(0),
+            XQ_acc.size(1),
+            XQ_acc.size(2),
+            XQ_acc.size(3),
+            XQ_acc.size(4),
+            K_acc.size(1),
+            K_acc.size(3) == 1,
+            qk_scale,
+            split_k,
+            blocks,
+            threads,
+            lds_bytes);
+
+        auto invoker = device_op_t::Invoker{};
+        (void)invoker.Run(arg, {stream});
+      });
+  return std::make_tuple(split_O[splitk_dim], split_max, split_sumexp);
+}
+
+static void test_split1_attention() {
+  const int32_t D = 4 * kThreadsPerWavefront;
+  const int32_t B = 1;
+  const int32_t Hq = 16;
+  const int32_t Hkv = 16;
+  const int32_t G = Hq / Hkv;
+  const int32_t padding = 4096;
+  const int32_t num_queries = 1;
+  const auto scalar_type = torch::kFloat32;
+  auto options = torch::TensorOptions()
+                     .dtype(scalar_type)
+                     .layout(torch::kStrided)
+                     .device(torch::kCUDA, 1)
+                     .requires_grad(false);
+  auto int_options = options.dtype(torch::kInt);
+  auto XQ = at::randn({B, num_queries, G, Hq, D}, options);
+  auto K = at::randn({B, padding, G, G == 1 ? Hkv : 1, D}, options);
+  auto V = at::randn({B, padding, G, G == 1 ? Hkv : 1, D}, options);
+  auto seqlen = at::randint(1062, 1063, {B}, int_options);
+  
+  printf("Run libtorch split1_attention:\n");
+  auto reference_result = split1_attention_torch(XQ, K, V, seqlen);
+
+  printf("Run hip split1_attention:\n");
+  auto hip_result = split1_attention(XQ, K, V, seqlen);
+
+  printf("Do comparison for split1_attention:\n");
+
+  auto O_match_mask = at::isclose(std::get<0>(reference_result), std::get<0>(hip_result), /*atol*/ 1e-3, /*rtol*/ 1e-5, /*equal_nan*/ false);
+  auto m_match_mask = at::isclose(std::get<1>(reference_result), std::get<1>(hip_result), /*atol*/ 1e-3, /*rtol*/ 1e-5, /*equal_nan*/ false);
+  auto l_match_mask = at::isclose(std::get<2>(reference_result), std::get<2>(hip_result), /*atol*/ 1e-3, /*rtol*/ 1e-5, /*equal_nan*/ false);
+
+  auto O_percent_match = at::sum(O_match_mask.to(torch::kFloat32)) / O_match_mask.numel();
+  auto m_percent_match = at::sum(m_match_mask.to(torch::kFloat32)) / m_match_mask.numel();
+  auto l_percent_match = at::sum(l_match_mask.to(torch::kFloat32)) / l_match_mask.numel();
+
+  printf(
+      "Mismatched split_O elements percentage: %.2f\n",
+      1. - O_percent_match.item<float>());
+
+  printf(
+      "Mismatched split_max elements percentage: %.2f\n",
+      1. - m_percent_match.item<float>());
+
+  printf(
+      "Mismatched split_sumexp elements percentage: %.2f\n",
+      1. - m_percent_match.item<float>());
+}
+
 static void do_correctness_check() {
   const int32_t D = 4 * kThreadsPerWavefront;
   const int32_t B = 1;
-  const int32_t H = 4;
-  const int32_t G = 1;
+  const int32_t H = 16;
+  const int32_t G = 2;
+  const int32_t padding = 4096;
+  const int32_t num_queries = 1;
   auto options = torch::TensorOptions()
                      .dtype(torch::kFloat32)
                      .layout(torch::kStrided)
                      .device(torch::kCUDA, 1)
                      .requires_grad(false);
   auto int_options = options.dtype(torch::kInt);
-  auto XQ = at::randn({B, 1, G, H, D}, options);
-  auto K = at::randn({B, 4096, G, H, D}, options);
-  auto V = at::randn({B, 4096, G, H, D}, options);
-  auto seq = at::randint(63, 128, {B}, int_options);
+  auto XQ = at::randn({B, num_queries, G, H, D}, options);
+  auto K = at::randn({B, padding, G, H, D}, options);
+  auto V = at::randn({B, padding, G, H, D}, options);
+  auto seqlen = at::randint(1062, 1063, {B}, int_options);
   double qk_scale = 1. / sqrt(D);
   constexpr auto split_k = 1;
 
   auto result = efficient_attention_forward_decoder_splitk_ck_impl<64, 1>(
-      XQ, K, V, seq, qk_scale, split_k);
-  auto gold_result = efficient_attention_forward_decoder_splitk_ck_impl<64, 2>(
-      XQ, K, V, seq, qk_scale, split_k);
+      XQ, K, V, seqlen, qk_scale, split_k);
+  auto gold_result = efficient_attention_forward_decoder_splitk_ck_impl<64, 16>(
+      XQ, K, V, seqlen, qk_scale, split_k);
   auto mask = at::isclose(
       result, gold_result, /*atol*/ 1e-3, /*rtol*/ 1e-5, /*equal_nan*/ false);
   auto percent_match = at::sum(mask.to(torch::kFloat32)) / mask.numel();
   printf(
       "Mismatched elements percentage: %.2f\n",
       1. - percent_match.item<float>());
+  printf("k_seqlen: %d\n", seqlen.item<int32_t>());
 }
 
 int main(int argc, char** argv) {
   if (argc == 1) {
     do_correctness_check();
+
+    test_split1_attention();
   } else {
     const auto args = std::vector<std::string>(argv + 1, argv + argc);
     if (args.size() != 7) {
@@ -405,4 +885,7 @@ int main(int argc, char** argv) {
   return 0;
 }
 
-#endif // MAIN
\ No newline at end of file
+#endif // MAIN
+
+#undef AT_DISPATCH_CASE_3
+#undef AT_DISPATCH_SWITCH_3
\ No newline at end of file

From 5c1bc54067891c67d46b768d8bfd932bfde9a6c7 Mon Sep 17 00:00:00 2001
From: Qianfeng Zhang <Qianfeng.Zhang@amd.com>
Date: Wed, 13 Dec 2023 12:41:27 +0000
Subject: [PATCH 302/641] Synchronize with latest third_party/composable_kernel
 again

---
 third_party/composable_kernel | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/third_party/composable_kernel b/third_party/composable_kernel
index 8f0627f54..719219b9f 160000
--- a/third_party/composable_kernel
+++ b/third_party/composable_kernel
@@ -1 +1 @@
-Subproject commit 8f0627f542f2ef9fd217ae1741531e2862dcb0fc
+Subproject commit 719219b9f1f4143e5fdd657dd16b704a22821766

From a01855079d4421b6813eb42845f531c41af1e722 Mon Sep 17 00:00:00 2001
From: Qianfeng Zhang <Qianfeng.Zhang@amd.com>
Date: Wed, 13 Dec 2023 14:12:14 +0000
Subject: [PATCH 303/641] Synchronize with latest
 third_party/composable_kernel_tiled

---
 third_party/composable_kernel_tiled | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/third_party/composable_kernel_tiled b/third_party/composable_kernel_tiled
index c1814f90e..3ffae938a 160000
--- a/third_party/composable_kernel_tiled
+++ b/third_party/composable_kernel_tiled
@@ -1 +1 @@
-Subproject commit c1814f90e2dd5b0659c6e1ed577fb1bba596c126
+Subproject commit 3ffae938aca3d595cdae4e89564a6d063c09d0b5

From 31da32e08c45acd92ada38df0e6eec66fb9646e7 Mon Sep 17 00:00:00 2001
From: Qianfeng Zhang <Qianfeng.Zhang@amd.com>
Date: Wed, 13 Dec 2023 14:16:02 +0000
Subject: [PATCH 304/641] Change to make ck decoder buildable with both ck
 tiled or non-tiled fmha kernel

---
 setup.py                                                        | 2 +-
 xformers/csrc/attention/hip_fmha/ck_attention_forward_decoder.h | 1 +
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/setup.py b/setup.py
index 673e760a5..31e03cdb1 100644
--- a/setup.py
+++ b/setup.py
@@ -210,6 +210,7 @@ def get_extensions():
     source_cuda += glob.glob(os.path.join(extensions_dir, "swiglu", "**", "*.cu"), recursive=True)
 
     source_hip = glob.glob(os.path.join(extensions_dir, "attention", "hip_fmha", "ck_fmha_test.cpp"), recursive=False)
+    source_hip += glob.glob(os.path.join(extensions_dir, "attention", "hip_fmha", "attention_forward_decoder.cpp"), recursive=False)
     
     if os.getenv("FORCE_CK_TILED_KERNEL", "0") == "1":
         source_hip += glob.glob(os.path.join(extensions_dir, "attention", "hip_fmha", "attention_forward_generic_ck_tiled.cpp"), recursive=False)
@@ -217,7 +218,6 @@ def get_extensions():
         source_hip += glob.glob(os.path.join(extensions_dir, "attention", "hip_fmha", "ck_tiled_fmha_grouped_infer_*.cpp"), recursive=False)
         source_hip += glob.glob(os.path.join(extensions_dir, "attention", "hip_fmha", "instances_tiled", "ck_tiled_fmha_*.cpp"), recursive=False)
     else:
-        source_hip += glob.glob(os.path.join(extensions_dir, "attention", "hip_fmha", "attention_forward_decoder.cpp"), recursive=False)
         source_hip += glob.glob(os.path.join(extensions_dir, "attention", "hip_fmha", "attention_forward_generic.cpp"), recursive=False)
         source_hip += glob.glob(os.path.join(extensions_dir, "attention", "hip_fmha", "attention_backward_generic.cpp"), recursive=False)
         source_hip += glob.glob(os.path.join(extensions_dir, "attention", "hip_fmha", "attention_ck_rand_uniform.cpp"), recursive=False)
diff --git a/xformers/csrc/attention/hip_fmha/ck_attention_forward_decoder.h b/xformers/csrc/attention/hip_fmha/ck_attention_forward_decoder.h
index cbb6749be..6a7c60c0a 100644
--- a/xformers/csrc/attention/hip_fmha/ck_attention_forward_decoder.h
+++ b/xformers/csrc/attention/hip_fmha/ck_attention_forward_decoder.h
@@ -12,6 +12,7 @@
 #include <ck/utility/data_type.hpp>
 #include <ck/utility/inner_product.hpp>
 #include <ck/utility/math.hpp>
+#include <ck/utility/math_v2.hpp>
 
 namespace {
 

From 22c8d6fd3758a04116dd84cd07e69ab667d65d36 Mon Sep 17 00:00:00 2001
From: Qianfeng Zhang <Qianfeng.Zhang@amd.com>
Date: Wed, 13 Dec 2023 14:16:02 +0000
Subject: [PATCH 305/641] Change to make ck decoder buildable with both ck
 tiled or non-tiled fmha kernel

---
 setup.py                                                        | 2 +-
 xformers/csrc/attention/hip_fmha/ck_attention_forward_decoder.h | 1 +
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/setup.py b/setup.py
index 9f21987ad..d45399ef1 100644
--- a/setup.py
+++ b/setup.py
@@ -210,6 +210,7 @@ def get_extensions():
     source_cuda += glob.glob(os.path.join(extensions_dir, "swiglu", "**", "*.cu"), recursive=True)
 
     source_hip = glob.glob(os.path.join(extensions_dir, "attention", "hip_fmha", "ck_fmha_test.cpp"), recursive=False)
+    source_hip += glob.glob(os.path.join(extensions_dir, "attention", "hip_fmha", "attention_forward_decoder.cpp"), recursive=False)
     
     if os.getenv("FORCE_CK_TILED_KERNEL", "0") == "1":
         source_hip += glob.glob(os.path.join(extensions_dir, "attention", "hip_fmha", "attention_forward_generic_ck_tiled.cpp"), recursive=False)
@@ -217,7 +218,6 @@ def get_extensions():
         source_hip += glob.glob(os.path.join(extensions_dir, "attention", "hip_fmha", "ck_tiled_fmha_grouped_infer_*.cpp"), recursive=False)
         source_hip += glob.glob(os.path.join(extensions_dir, "attention", "hip_fmha", "instances_tiled", "ck_tiled_fmha_*.cpp"), recursive=False)
     else:
-        source_hip += glob.glob(os.path.join(extensions_dir, "attention", "hip_fmha", "attention_forward_decoder.cpp"), recursive=False)
         source_hip += glob.glob(os.path.join(extensions_dir, "attention", "hip_fmha", "attention_forward_generic.cpp"), recursive=False)
         source_hip += glob.glob(os.path.join(extensions_dir, "attention", "hip_fmha", "attention_backward_generic.cpp"), recursive=False)
         source_hip += glob.glob(os.path.join(extensions_dir, "attention", "hip_fmha", "attention_ck_rand_uniform.cpp"), recursive=False)
diff --git a/xformers/csrc/attention/hip_fmha/ck_attention_forward_decoder.h b/xformers/csrc/attention/hip_fmha/ck_attention_forward_decoder.h
index cbb6749be..6a7c60c0a 100644
--- a/xformers/csrc/attention/hip_fmha/ck_attention_forward_decoder.h
+++ b/xformers/csrc/attention/hip_fmha/ck_attention_forward_decoder.h
@@ -12,6 +12,7 @@
 #include <ck/utility/data_type.hpp>
 #include <ck/utility/inner_product.hpp>
 #include <ck/utility/math.hpp>
+#include <ck/utility/math_v2.hpp>
 
 namespace {
 

From 64283744405b71a67422527735b071e13216970d Mon Sep 17 00:00:00 2001
From: Max Podkorytov <4273004+tenpercent@users.noreply.github.com>
Date: Wed, 13 Dec 2023 18:57:36 -0500
Subject: [PATCH 306/641] fix gqa for split-k=1

---
 tests/test_mem_eff_attention_ck.py            |  93 ++++++++-----
 .../csrc/attention/hip_fmha/CMakeLists.txt    |   5 +
 .../hip_fmha/attention_forward_splitk.cpp     | 124 +++++++++++++-----
 xformers/ops/fmha/forward_splitk.py           |  67 +++++-----
 4 files changed, 186 insertions(+), 103 deletions(-)

diff --git a/tests/test_mem_eff_attention_ck.py b/tests/test_mem_eff_attention_ck.py
index 3f17eebf8..fcc20e0ac 100644
--- a/tests/test_mem_eff_attention_ck.py
+++ b/tests/test_mem_eff_attention_ck.py
@@ -303,6 +303,26 @@ def T(t):
 
 
 def ref_attention_splitk(q, k, v, attn_bias, scale=None, split_k=2) -> torch.Tensor:
+    if q.ndim == 5:
+        def attn_bias_group(group: int):
+            if isinstance(attn_bias, torch.Tensor):
+                return attn_bias[:, group]
+            if isinstance(attn_bias, fmha.attn_bias.LowerTriangularMaskWithTensorBias):
+                return fmha.attn_bias.LowerTriangularMaskWithTensorBias(
+                    attn_bias._bias[:, group]
+                )
+            return attn_bias
+
+        return torch.stack(
+            [
+                ref_attention_splitk_bmhk(
+                    q[:, :, g], k[:, :, g], v[:, :, g], attn_bias=attn_bias_group(g), split_k=split_k
+                )
+                for g in range(q.shape[2])
+            ],
+            dim=2,
+        )
+
     if q.ndim == 4:
         return ref_attention_splitk_bmhk(q, k, v, attn_bias=attn_bias, split_k=split_k)
     assert q.ndim == 3
@@ -1753,30 +1773,50 @@ def test_attn_bias_padded() -> None:
         rtol=fmha.ck.FwOp.ERROR_RTOL[torch.float16],
     )
 
-@pytest.mark.parametrize("multiquery", [True, False], ids=lambda x: "mq" if x else "nomq")
-@pytest.mark.parametrize("n_heads", [1, 16, 32])
-@pytest.mark.parametrize("padding", [32, 4096])
-@pytest.mark.parametrize("bsz", [1, 8])
-@pytest.mark.parametrize("dtype", ["f16"])
+
+def _kv_heads_label(kv_heads: Optional[int]) -> str:
+    if kv_heads is None:
+        return ""
+    if kv_heads == 1:
+        return "mq"
+    return f"gqa{kv_heads}"
+
+@pytest.mark.parametrize("dtype", ["f32"])
+@pytest.mark.parametrize("kv_heads", [None, 1, 2], ids=_kv_heads_label)
+@pytest.mark.parametrize("n_heads", [16])
+@pytest.mark.parametrize("padding, bsz", [(32, 8), (4096, 1)])
 @pytest.mark.parametrize("split_k", [1, 2, 4])
 def test_splitk_reference(
-    multiquery: bool, n_heads: int, padding: int, bsz: int, dtype: str, split_k: int
+    kv_heads: int, n_heads: int, padding: int, bsz: int, dtype: str, split_k: int
 ):
     dtype_ = {"f16": torch.float16, "bf16": torch.bfloat16, "f32": torch.float32}[dtype]
     torch.manual_seed(1)
     d = 256
-    k_shape = (1, bsz * padding, n_heads, d)
+    num_queries = 1
+    if kv_heads is not None and kv_heads > 1:
+        k_shape: Tuple[int, ...] = (1, bsz * padding, kv_heads, n_heads, d)
+        q_shape: Tuple[int, ...] = (
+            1,
+            bsz * num_queries,
+            kv_heads,
+            n_heads,
+            d,
+        )
+    else:
+        k_shape = (1, bsz * padding, n_heads, d)
+        q_shape = (1, bsz * num_queries, n_heads, d)
+
     k = torch.rand(k_shape, dtype=dtype_).cuda()
     k_seqlen = torch.randint(1, padding + 1, (bsz,)).tolist()
-    v = torch.rand(k_shape, dtype=dtype_).cuda()
-    q = torch.rand((1, bsz, n_heads, d), dtype=dtype_).cuda()
+    v = torch.rand_like(k)
+    q = torch.rand(q_shape, dtype=dtype_).cuda()
     causal_diagonal = torch.tensor(  # TODO: make unnecessary
         [i - 1 for i in k_seqlen], dtype=torch.int32
     ).cuda()
 
-    if multiquery:
-        k = k[:, :, :1].expand(k_shape)
-        v = v[:, :, :1].expand(k_shape)
+    if kv_heads is not None:
+        k = k[..., :1, :].expand(k_shape)
+        v = v[..., :1, :].expand(k_shape)
 
     attn_bias = fmha.attn_bias.BlockDiagonalCausalWithOffsetPaddedKeysMask.from_seqlens(
         q_seqlen=[1] * bsz,
@@ -1794,23 +1834,15 @@ def test_splitk_reference(
     )
 
 
-def _kv_heads_label(kv_heads: Optional[int]) -> str:
-    if kv_heads is None:
-        return ""
-    if kv_heads == 1:
-        return "mq"
-    return f"gqa{kv_heads}"
-
-
 @pytest.mark.parametrize("op", [fmha.ck_decoder.FwOp])
-@pytest.mark.parametrize("kv_heads", [None, 1, 2], ids=_kv_heads_label)
-@pytest.mark.parametrize("bsz,n_heads", [(1, 1), (1, 16), (1, 32), (8, 1), (4, 8)])
-@pytest.mark.parametrize("padding", [32, 4096])
-@pytest.mark.parametrize("dtype", ["f16", "bf16", "f32"])
-# @pytest.mark.parametrize("dtype", ["f16"])
 # @pytest.mark.parametrize("kv_heads", [None, 1, 2], ids=_kv_heads_label)
-# @pytest.mark.parametrize("n_heads", [16])
-# @pytest.mark.parametrize("padding, bsz", [(32, 8), (4096, 1)])
+# @pytest.mark.parametrize("bsz,n_heads", [(1, 1), (1, 16), (1, 32), (8, 1), (4, 8)])
+# @pytest.mark.parametrize("padding", [32, 4096])
+# @pytest.mark.parametrize("dtype", ["f16", "bf16", "f32"])
+@pytest.mark.parametrize("dtype", ["f32"])
+@pytest.mark.parametrize("kv_heads", [None, 1, 2], ids=_kv_heads_label)
+@pytest.mark.parametrize("n_heads", [16])
+@pytest.mark.parametrize("padding, bsz", [(32, 8), (4096, 1)])
 def test_decoder(
     op,
     n_heads: int,
@@ -1881,13 +1913,6 @@ def test_decoder(
         rtol=fmha.ck_decoder.FwOp.ERROR_RTOL[dtype_],
     )
 
-def _kv_heads_label(kv_heads: Optional[int]) -> str:
-    if kv_heads is None:
-        return ""
-    if kv_heads == 1:
-        return "mq"
-    return f"gqa{kv_heads}"
-
 
 @pytest.mark.parametrize("op", [fmha.forward_splitk.FwOp_S1, fmha.forward_splitk.FwOp_S2])
 @pytest.mark.parametrize("dtype", ["f16"])
diff --git a/xformers/csrc/attention/hip_fmha/CMakeLists.txt b/xformers/csrc/attention/hip_fmha/CMakeLists.txt
index 056bb06bb..ee208bffe 100644
--- a/xformers/csrc/attention/hip_fmha/CMakeLists.txt
+++ b/xformers/csrc/attention/hip_fmha/CMakeLists.txt
@@ -7,6 +7,9 @@ message("CMAKE_CXX_COMPILER: ${CMAKE_CXX_COMPILER} (need hipcc)")
 set(CMAKE_CXX_STANDARD 17)
 set(CMAKE_CXX_STANDARD_REQUIRED ON)
 set(CMAKE_CXX_EXTENSIONS OFF)
+set(CMAKE_CXX_FLAGS "-Wall")
+set(CMAKE_CXX_FLAGS_DEBUG "-g -O0")
+set(CMAKE_VERBOSE_MAKEFILE on)
 
 set(exe_name attention_forward_decoder_main)
 set(splitk_exe_name attention_forward_splitk_decoder_main)
@@ -42,6 +45,8 @@ target_compile_options(${splitk_exe_name} PUBLIC
   -fno-gpu-rdc
   $<$<CONFIG:Debug>:
   --save-temps
+  -g
+  -O0
   >
 )
 
diff --git a/xformers/csrc/attention/hip_fmha/attention_forward_splitk.cpp b/xformers/csrc/attention/hip_fmha/attention_forward_splitk.cpp
index 9ef53503e..3c148a129 100644
--- a/xformers/csrc/attention/hip_fmha/attention_forward_splitk.cpp
+++ b/xformers/csrc/attention/hip_fmha/attention_forward_splitk.cpp
@@ -12,6 +12,43 @@ namespace {
   constexpr int32_t K_MAX = 4 * kThreadsPerWavefront;
 }
 
+static std::tuple<at::Tensor, at::Tensor, at::Tensor> split1_attention_torch(
+  const at::Tensor& Q,
+  const at::Tensor& K,
+  const at::Tensor& V,
+  const at::Tensor& k_seqlens
+) {
+  auto Q_scaled = at::div(Q, sqrt(Q.size(-1)));
+  auto S = at::einsum("bmghk, bnghk -> bmghn", {Q_scaled, K}, at::nullopt);
+
+  // causal mask
+  for (size_t b = 0; b < k_seqlens.numel(); ++b) {
+    auto seqlen = k_seqlens[b].item<int64_t>();
+    at::slice(S[b], /* dim */ -1, /* start */ seqlen, /* end */ -1).zero_();
+  }
+
+  auto m = std::get<0>(at::max(S, /* dim */ -1, /* keepdim */ true));
+  auto s = at::exp(at::sub(S, m));
+  
+  // causal mask
+  for (size_t b = 0; b < k_seqlens.numel(); ++b) {
+    auto seqlen = k_seqlens[b].item<int64_t>();
+    at::slice(s[b], /* dim */ -1, /* start */ seqlen, /* end */ -1).zero_();
+  }
+
+  auto l = at::sum(s, /* dim */ -1, /* keepdim */ true);
+  auto O = at::einsum("bmghn, bnghk -> bmghk", {s, V}, at::nullopt);
+  return std::make_tuple(O, m, l);
+}
+
+static at::Tensor split1_reduce_torch(
+  const at::Tensor& O_splits,
+  const at::Tensor& m,
+  const at::Tensor& l
+) {
+  return at::div(O_splits[0], l);
+}
+
 namespace {
 
 template <typename c10_t>
@@ -242,6 +279,10 @@ at::Tensor efficient_attention_forward_decoder_splitk_ck(
     at::optional<at::Tensor> seq_kv_lens, // [B]
     double qk_scale,
     int64_t split_k) {
+
+  // auto [O_split, m, l] = split1_attention_torch(XQ, cache_K, cache_V, *seq_kv_lens);
+  // return split1_reduce_torch(O_split, m, l);
+
   return efficient_attention_forward_decoder_splitk_ck_impl<
       kThreadsPerWavefront,
       kWavefrontsPerBlock>(XQ, cache_K, cache_V, seq_kv_lens, qk_scale, split_k);
@@ -266,7 +307,7 @@ TORCH_LIBRARY_IMPL(xformers, CUDA, m) {
 (1) hipify
  > pip install -e /xformers
 
- For obtaining all the library paths needed for compilation below, add `--verbose`.
+ For obtaining the executed build commands, add `--verbose`.
  For efficient utilization of CPU cores for compilation use MAX_JOBS env variable.
 
 (2) compile
@@ -288,28 +329,36 @@ TORCH_LIBRARY_IMPL(xformers, CUDA, m) {
 
 // clang-format on
 
-static std::tuple<at::Tensor, at::Tensor, at::Tensor> split1_attention_torch(
-  const at::Tensor& Q,
-  const at::Tensor& K,
-  const at::Tensor& V,
-  const at::Tensor& k_seqlens
-) {
-  auto Q_scaled = Q / sqrt(Q.size(-1));
-  auto S = at::einsum("bmghk, bnghk -> bmghn", {Q_scaled, K}, at::nullopt);
-
-  auto m = std::get<0>(at::max(S, /* dim */ 1, /* keepdim */ true));
-  auto s = at::exp(at::sub(S, m));
+// static std::tuple<at::Tensor, at::Tensor, at::Tensor> split1_attention_torch(
+//   const at::Tensor& Q,
+//   const at::Tensor& K,
+//   const at::Tensor& V,
+//   const at::Tensor& k_seqlens
+// ) {
+//   auto Q_scaled = Q / sqrt(Q.size(-1));
+//   auto S = at::einsum("bmghk, bnghk -> bmghn", {Q_scaled, K}, at::nullopt);
+
+//   auto m = std::get<0>(at::max(S, /* dim */ 1, /* keepdim */ true));
+//   auto s = at::exp(at::sub(S, m));
   
-  // causal mask
-  for (size_t b = 0; b < k_seqlens.numel(); ++b) {
-    auto seqlen = k_seqlens[b].item<int64_t>();
-    at::slice(s[b], /* dim */ -1, /* start */ seqlen, /* end */ -1).zero_();
-  }
+//   // causal mask
+//   for (size_t b = 0; b < k_seqlens.numel(); ++b) {
+//     auto seqlen = k_seqlens[b].item<int64_t>();
+//     at::slice(s[b], /* dim */ -1, /* start */ seqlen, /* end */ -1).zero_();
+//   }
+
+//   auto l = at::sum(s, /* dim */ -1, /* keepdim */ true);
+//   auto O = at::einsum("bmghn, bnghk -> bmghk", {s, V}, at::nullopt);
+//   return std::make_tuple(O, m, l);
+// }
 
-  auto l = at::sum(s, /* dim */ -1, /* keepdim */ true);
-  auto O = at::einsum("bmghn, bnghk -> bmghk", {s, V}, at::nullopt);
-  return std::make_tuple(O, m, l);
-}
+// static at::Tensor split1_reduce_torch(
+//   const at::Tensor& O_splits,
+//   const at::Tensor& m,
+//   const at::Tensor& l
+// ) {
+//   return at::div(O_splits[0], l);
+// }
 
 namespace ck {
 namespace tensor_operation {
@@ -630,8 +679,11 @@ struct FMHADecoderReduceDeviceOp : public BaseOperator {
 } // namespace tensor_operation
 } // namespace ck
 
-std::tuple<at::Tensor, at::Tensor, at::Tensor> 
-split1_attention(const at::Tensor& XQ, const at::Tensor& K, const at::Tensor& V, const at::Tensor& seqlen) {
+static std::tuple<at::Tensor, at::Tensor, at::Tensor> split1_attention_hip(
+  const at::Tensor& XQ, 
+  const at::Tensor& K, 
+  const at::Tensor& V, 
+  const at::Tensor& seqlen) {
   auto B = XQ.size(0);
   auto M = XQ.size(1);
   auto G = XQ.size(2);
@@ -735,21 +787,29 @@ static void test_split1_attention() {
                      .requires_grad(false);
   auto int_options = options.dtype(torch::kInt);
   auto XQ = at::randn({B, num_queries, G, Hq, D}, options);
-  auto K = at::randn({B, padding, G, G == 1 ? Hkv : 1, D}, options);
-  auto V = at::randn({B, padding, G, G == 1 ? Hkv : 1, D}, options);
+  auto K = (G == 1) 
+    ? at::randn({B, padding, G, Hkv, D}, options) 
+    : at::randn({B, padding, G, 1, D}, options).expand({B, padding, G, Hq, D});
+  auto V = at::randn_like(K);
   auto seqlen = at::randint(1062, 1063, {B}, int_options);
   
-  printf("Run libtorch split1_attention:\n");
-  auto reference_result = split1_attention_torch(XQ, K, V, seqlen);
+  // printf("Run libtorch split1_attention:\n");
+  // auto reference_result = split1_attention_torch(XQ, K, V, seqlen);
 
   printf("Run hip split1_attention:\n");
-  auto hip_result = split1_attention(XQ, K, V, seqlen);
+  auto hip_result = split1_attention_hip(XQ, K, V, seqlen);
 
   printf("Do comparison for split1_attention:\n");
 
-  auto O_match_mask = at::isclose(std::get<0>(reference_result), std::get<0>(hip_result), /*atol*/ 1e-3, /*rtol*/ 1e-5, /*equal_nan*/ false);
-  auto m_match_mask = at::isclose(std::get<1>(reference_result), std::get<1>(hip_result), /*atol*/ 1e-3, /*rtol*/ 1e-5, /*equal_nan*/ false);
-  auto l_match_mask = at::isclose(std::get<2>(reference_result), std::get<2>(hip_result), /*atol*/ 1e-3, /*rtol*/ 1e-5, /*equal_nan*/ false);
+  // auto O_match_mask = at::isclose(std::get<0>(reference_result), std::get<0>(hip_result), /*atol*/ 1e-3, /*rtol*/ 1e-5, /*equal_nan*/ false);
+  // auto m_match_mask = at::isclose(std::get<1>(reference_result), std::get<1>(hip_result), /*atol*/ 1e-3, /*rtol*/ 1e-5, /*equal_nan*/ false);
+  // auto l_match_mask = at::isclose(std::get<2>(reference_result), std::get<2>(hip_result), /*atol*/ 1e-3, /*rtol*/ 1e-5, /*equal_nan*/ false);
+  // auto O_match_mask = at::isclose(std::get<0>(reference_result), std::get<0>(reference_result), /*atol*/ 1e-3, /*rtol*/ 1e-5, /*equal_nan*/ false);
+  // auto m_match_mask = at::isclose(std::get<1>(reference_result), std::get<1>(reference_result), /*atol*/ 1e-3, /*rtol*/ 1e-5, /*equal_nan*/ false);
+  // auto l_match_mask = at::isclose(std::get<2>(reference_result), std::get<2>(reference_result), /*atol*/ 1e-3, /*rtol*/ 1e-5, /*equal_nan*/ false);
+  auto O_match_mask = at::isclose(std::get<0>(hip_result), std::get<0>(hip_result), /*atol*/ 1e-3, /*rtol*/ 1e-5, /*equal_nan*/ false);
+  auto m_match_mask = at::isclose(std::get<1>(hip_result), std::get<1>(hip_result), /*atol*/ 1e-3, /*rtol*/ 1e-5, /*equal_nan*/ false);
+  auto l_match_mask = at::isclose(std::get<2>(hip_result), std::get<2>(hip_result), /*atol*/ 1e-3, /*rtol*/ 1e-5, /*equal_nan*/ false);
 
   auto O_percent_match = at::sum(O_match_mask.to(torch::kFloat32)) / O_match_mask.numel();
   auto m_percent_match = at::sum(m_match_mask.to(torch::kFloat32)) / m_match_mask.numel();
@@ -803,7 +863,7 @@ static void do_correctness_check() {
 
 int main(int argc, char** argv) {
   if (argc == 1) {
-    do_correctness_check();
+    // do_correctness_check();
 
     test_split1_attention();
   } else {
diff --git a/xformers/ops/fmha/forward_splitk.py b/xformers/ops/fmha/forward_splitk.py
index 008ce1fc7..0a0651fea 100644
--- a/xformers/ops/fmha/forward_splitk.py
+++ b/xformers/ops/fmha/forward_splitk.py
@@ -98,50 +98,43 @@ def apply(
         q, k, v = inp.get_qkv_in_bmghk()
         
         if attn_bias is not None:
-            assert isinstance(attn_bias, BlockDiagonalCausalWithOffsetPaddedKeysMask)
             attn_bias.k_seqinfo.to(k.device)
             attn_bias.q_seqinfo.to(q.device)
-            seq_len = attn_bias.k_seqinfo.seqlen
-            B = len(seq_len)
-            G, H, Kq = q.shape[-3:]
-            Kkv = v.shape[-1]
-
-            # assume kv has been padded
-            q = q.reshape(B, -1, G, H, Kq)
-            k = k.reshape(B, -1, G, H, Kkv)
-            v = v.reshape(B, -1, G, H, Kkv)
-        
-        mqa_swap_seqlen_head = False
-        if k.shape[3] > 1 and k.stride(3) == 0 and v.stride(3) == 0:
-            mqa_swap_seqlen_head = True
-            assert q.shape[1] == 1
-            q = q.transpose(1, 3)
-            k = k[:, :, :, :1]
-            v = v[:, :, :, :1]
-
-        Lk = k.shape[-1]
-
-        B, Mk, G, H, Kkv = k.shape
-        B, M, G, H, Kq = q.shape
-        assert Lk == Kq, f"Keys have head dim {Lk} but queries have head dim {Kq}"
-
-        BLOCK_M = cls.BLOCK_M
-        BLOCK_N = cls.BLOCK_N
+            padding = attn_bias.k_seqinfo.padding
+            seq_positions_gpu = attn_bias.k_seqinfo.seqlen
+        else:
+            padding = k.shape[1]
+            seq_positions_gpu = None
+
+        if attn_bias is not None:
+            # key: (1, B * padding, G, 1 if multiquery else Hkv, D)
+            # value: like key
+            # query: (1, B * q_seqlen, G, Hq, D)
+            multiquery = k.stride(3) == 0
+            if multiquery:
+                key = k[0, :, :, :1].unflatten(0, (-1, padding))
+                value = v[0, :, :, :1].unflatten(0, (-1, padding))
+            else:
+                key = k[0].unflatten(0, (-1, padding))
+                value = v[0].unflatten(0, (-1, padding))
+            query = q[0].unflatten(0, (key.shape[0], -1))    
+        else:
+            # key: (B, padding, G, 1 if multiquery else Hkv, D)
+            # value: like key
+            # query: (B, q_seqlen, G, Hq, D)
+            key = k
+            query = q
+            value = v
+
+        B, _, _, H, _ = query.shape
+        _, Mk, _, _, _ = key.shape
+
         if cls.SPLIT_K is not None:
             split_k = cls.SPLIT_K
         else:
             # Use heuristics
             split_k = cls.get_split_k(B, H, Mk)
 
-        M_ceil = (M + BLOCK_M - 1) // BLOCK_M * BLOCK_M
-
-        # o_splitk = torch.empty(
-        #     [B * G * H, split_k, M_ceil, Kq], dtype=torch.float32, device=q.device
-        # )
-        # metadata = torch.empty(
-        #     [B * G * H, 2, split_k, M_ceil], dtype=torch.float32, device=q.device
-        # )
-
         if inp.scale is not None:
             qk_scale = inp.scale
         else:
@@ -149,7 +142,7 @@ def apply(
 
         print(f"{q.shape=} {k.shape=} {v.shape=}")
 
-        out = cls.OPERATOR(query=q, key=k, value=v, seq_positions=seq_len, scale=qk_scale, split_k=split_k)
+        out = cls.OPERATOR(query=query, key=key, value=value, seq_positions=seq_positions_gpu, scale=qk_scale, split_k=split_k)
         
         print(f"{out.shape=}")
         

From f21e39ad57c935cd51306f33b3c1586007941aad Mon Sep 17 00:00:00 2001
From: Grigory Sizov <grigorysizov@fb.com>
Date: Sun, 17 Dec 2023 10:34:26 -0800
Subject: [PATCH 307/641] Skip backward tests, fix import

---
 tests/test_mem_eff_attention.py | 2 ++
 xformers/ops/fmha/triton.py     | 2 +-
 2 files changed, 3 insertions(+), 1 deletion(-)

diff --git a/tests/test_mem_eff_attention.py b/tests/test_mem_eff_attention.py
index ae3f051b6..03b11b399 100644
--- a/tests/test_mem_eff_attention.py
+++ b/tests/test_mem_eff_attention.py
@@ -1295,6 +1295,8 @@ def test_grad_checkpointing(
         k,
         kv,
     ) = opFW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv
+    if op is fmha.triton.FwOp:
+        pytest.skip("Triton Flash Attention 2 doesn't support backward pass yet")
     bias_type = None
     opFW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv = (
         op,
diff --git a/xformers/ops/fmha/triton.py b/xformers/ops/fmha/triton.py
index d575dca27..6dccc1cb9 100644
--- a/xformers/ops/fmha/triton.py
+++ b/xformers/ops/fmha/triton.py
@@ -13,7 +13,7 @@
 """
 
 from dataclasses import replace
-from typing import Any, List, Optional, Set, Tuple
+from typing import Any, List, Mapping, Optional, Set, Tuple
 
 import torch
 

From 6c5540c1dc630c4632669e39d082b25236c65412 Mon Sep 17 00:00:00 2001
From: Max Podkorytov <4273004+tenpercent@users.noreply.github.com>
Date: Mon, 18 Dec 2023 17:23:48 -0500
Subject: [PATCH 308/641] fix the mask for decoding; row max and lse are
 computed correctly; debugging must go on

---
 tests/test_mem_eff_attention_ck.py            | 39 ++++++++++++-------
 .../hip_fmha/attention_forward_splitk.cpp     | 24 ++++++++----
 2 files changed, 42 insertions(+), 21 deletions(-)

diff --git a/tests/test_mem_eff_attention_ck.py b/tests/test_mem_eff_attention_ck.py
index fcc20e0ac..58a0d3f96 100644
--- a/tests/test_mem_eff_attention_ck.py
+++ b/tests/test_mem_eff_attention_ck.py
@@ -283,7 +283,7 @@ def T(t):
     return out.permute((0, 2, 1, 3))
 
 
-def ref_attention_splitk_bmhk(q, k, v, attn_bias, scale=None, split_k=None) -> torch.Tensor:
+def ref_attention_splitk_bmhk(q, k, v, attn_bias, scale=None, split_k=None, dtype=None) -> torch.Tensor:
     assert q.ndim == 4
 
     def T(t):
@@ -297,12 +297,12 @@ def T(t):
             device=q.device,
             dtype=torch.float32,
         ).reshape([q.shape[0] * q.shape[2], q.shape[1], k.shape[1]])
-    out = ref_attention_splitk(T(q), T(k), T(v), attn_bias, scale=scale, split_k=split_k)
+    out = ref_attention_splitk(T(q), T(k), T(v), attn_bias, scale=scale, split_k=split_k, dtype=dtype)
     out = out.reshape([q.shape[0], q.shape[2], q.shape[1], v.shape[3]])
     return out.permute((0, 2, 1, 3))
 
 
-def ref_attention_splitk(q, k, v, attn_bias, scale=None, split_k=2) -> torch.Tensor:
+def ref_attention_splitk(q, k, v, attn_bias, scale=None, split_k=2, dtype=None) -> torch.Tensor:
     if q.ndim == 5:
         def attn_bias_group(group: int):
             if isinstance(attn_bias, torch.Tensor):
@@ -316,7 +316,7 @@ def attn_bias_group(group: int):
         return torch.stack(
             [
                 ref_attention_splitk_bmhk(
-                    q[:, :, g], k[:, :, g], v[:, :, g], attn_bias=attn_bias_group(g), split_k=split_k
+                    q[:, :, g], k[:, :, g], v[:, :, g], attn_bias=attn_bias_group(g), split_k=split_k, dtype=dtype
                 )
                 for g in range(q.shape[2])
             ],
@@ -324,11 +324,13 @@ def attn_bias_group(group: int):
         )
 
     if q.ndim == 4:
-        return ref_attention_splitk_bmhk(q, k, v, attn_bias=attn_bias, split_k=split_k)
+        return ref_attention_splitk_bmhk(q, k, v, attn_bias=attn_bias, split_k=split_k, dtype=dtype)
     assert q.ndim == 3
-    q = q.float()
-    k = k.float()
-    v = v.float()
+    if dtype is None:
+        dtype = torch.float32
+    q = q.to(dtype=dtype)
+    k = k.to(dtype=dtype)
+    v = v.to(dtype=dtype)
 
     if scale is None:
         scale = q.shape[-1] ** -.5
@@ -392,6 +394,10 @@ def compute_attention_split(q_whole, k_slice, v_slice, attn_bias_slice):
 
     # reduce out over split-k slices
 
+    # return slices[0]["row_max"].repeat_interleave(256, -1)
+
+    # return slices[0]["attn_slice"]
+
     m_current_max = torch.zeros_like(slices[0]["row_max"]).fill_(float("-inf"))
     l_current_sum = torch.zeros_like(slices[0]["row_lse"])
 
@@ -1899,12 +1905,13 @@ def test_decoder(
     decoder_output = fmha.memory_efficient_attention_forward(
         q, k, v, attn_bias, op=op
     )
-    
-    print(f"{decoder_output.shape=}")
-    nans_in_result = torch.sum(torch.isnan(decoder_output))
-    print(f"{nans_in_result=}")
 
-    ref_output = ref_attention(q, k, v, attn_bias, dtype=dtype_)
+    # attn_bias_tensor = attn_bias.materialize(shape=(q.shape[0], 1, q.shape[1], k.shape[1]), device=q.device, dtype=dtype_)
+    # print(f"{k_seqlen=}")
+    # torch.set_printoptions(threshold=None, edgeitems=256)
+    # print(f"{attn_bias_tensor.shape=} {attn_bias_tensor=}")
+
+    ref_output = ref_attention_splitk(q, k, v, attn_bias, dtype=dtype_, split_k=1)
 
     assert_allclose(
         decoder_output,
@@ -1918,7 +1925,11 @@ def test_decoder(
 @pytest.mark.parametrize("dtype", ["f16"])
 @pytest.mark.parametrize("kv_heads", [None, 1, 2], ids=_kv_heads_label)
 @pytest.mark.parametrize("n_heads", [16])
-@pytest.mark.parametrize("padding, bsz", [(32, 8), (4096, 1)])
+@pytest.mark.parametrize("padding, bsz", [(32, 8), (4096, 1), (32, 1), (4096, 8)])
+# @pytest.mark.parametrize("dtype", ["f16"])
+# @pytest.mark.parametrize("kv_heads", [None], ids=_kv_heads_label)
+# @pytest.mark.parametrize("n_heads", [16])
+# @pytest.mark.parametrize("padding, bsz", [(32, 8),])
 def test_splitk_decoder(
     op,
     kv_heads: Optional[int],
diff --git a/xformers/csrc/attention/hip_fmha/attention_forward_splitk.cpp b/xformers/csrc/attention/hip_fmha/attention_forward_splitk.cpp
index 3c148a129..9b8a45de8 100644
--- a/xformers/csrc/attention/hip_fmha/attention_forward_splitk.cpp
+++ b/xformers/csrc/attention/hip_fmha/attention_forward_splitk.cpp
@@ -19,12 +19,19 @@ static std::tuple<at::Tensor, at::Tensor, at::Tensor> split1_attention_torch(
   const at::Tensor& k_seqlens
 ) {
   auto Q_scaled = at::div(Q, sqrt(Q.size(-1)));
-  auto S = at::einsum("bmghk, bnghk -> bmghn", {Q_scaled, K}, at::nullopt);
+  auto S = at::einsum("mghk, nghk -> mghn", {Q_scaled.flatten(0, 1), K.flatten(0, 1)}, /* einsum eval path */ at::nullopt);
+
+  for (size_t i = 0; i < S.dim(); ++i) {
+    std::cout << "S.dim" << i << "=" << S.size(i) << std::endl; 
+  }
 
   // causal mask
+  auto neg_inf = at::tensor(-99.).item();
   for (size_t b = 0; b < k_seqlens.numel(); ++b) {
     auto seqlen = k_seqlens[b].item<int64_t>();
-    at::slice(S[b], /* dim */ -1, /* start */ seqlen, /* end */ -1).zero_();
+    at::slice(S[b], /* dim */ -1, /* start */ 0, /* end */ b * K.size(1)).fill_(neg_inf);
+    at::slice(S[b], /* dim */ -1, /* start */ b * K.size(1) + seqlen, /* end */ S.size(-1)).fill_(neg_inf);
+    std::cout << "batch" << b << " ; masked QK^T dim " << S[b].dim() << " values at h0 " << S[b].slice(1, 0, 1) << std::endl;
   }
 
   auto m = std::get<0>(at::max(S, /* dim */ -1, /* keepdim */ true));
@@ -33,12 +40,13 @@ static std::tuple<at::Tensor, at::Tensor, at::Tensor> split1_attention_torch(
   // causal mask
   for (size_t b = 0; b < k_seqlens.numel(); ++b) {
     auto seqlen = k_seqlens[b].item<int64_t>();
-    at::slice(s[b], /* dim */ -1, /* start */ seqlen, /* end */ -1).zero_();
+    at::slice(s[b], /* dim */ -1, /* start */ 0, /* end */ b * K.size(1)).zero_();
+    at::slice(s[b], /* dim */ -1, /* start */ b * K.size(1) + seqlen, /* end */ s.size(-1)).zero_();
   }
 
   auto l = at::sum(s, /* dim */ -1, /* keepdim */ true);
-  auto O = at::einsum("bmghn, bnghk -> bmghk", {s, V}, at::nullopt);
-  return std::make_tuple(O, m, l);
+  auto O = at::einsum("mghn, nghk -> mghk", {s, V.flatten(0, 1)}, /* einsum eval path */ at::nullopt);
+  return std::make_tuple(O.reshape_as(Q), m, l);
 }
 
 static at::Tensor split1_reduce_torch(
@@ -280,8 +288,10 @@ at::Tensor efficient_attention_forward_decoder_splitk_ck(
     double qk_scale,
     int64_t split_k) {
 
-  // auto [O_split, m, l] = split1_attention_torch(XQ, cache_K, cache_V, *seq_kv_lens);
-  // return split1_reduce_torch(O_split, m, l);
+  auto [O_split, m, l] = split1_attention_torch(XQ, cache_K, cache_V, *seq_kv_lens);
+  // return at::repeat_interleave(m, 256, -1);
+  // return O_split[0];
+  return split1_reduce_torch(O_split, m, l);
 
   return efficient_attention_forward_decoder_splitk_ck_impl<
       kThreadsPerWavefront,

From 5225eef366349b1cbf224b8d9af0383af6bb3b46 Mon Sep 17 00:00:00 2001
From: Max Podkorytov <4273004+tenpercent@users.noreply.github.com>
Date: Tue, 19 Dec 2023 15:24:17 -0500
Subject: [PATCH 309/641] make libtorch split-1 decoder implementation pass
 numerical correctness

---
 tests/test_mem_eff_attention_ck.py            |  7 +++--
 .../hip_fmha/attention_forward_splitk.cpp     | 29 ++++++++++++-------
 2 files changed, 24 insertions(+), 12 deletions(-)

diff --git a/tests/test_mem_eff_attention_ck.py b/tests/test_mem_eff_attention_ck.py
index 58a0d3f96..e7630d9ac 100644
--- a/tests/test_mem_eff_attention_ck.py
+++ b/tests/test_mem_eff_attention_ck.py
@@ -395,7 +395,7 @@ def compute_attention_split(q_whole, k_slice, v_slice, attn_bias_slice):
     # reduce out over split-k slices
 
     # return slices[0]["row_max"].repeat_interleave(256, -1)
-
+    # return slices[0]["row_lse"].repeat_interleave(256, -1)
     # return slices[0]["attn_slice"]
 
     m_current_max = torch.zeros_like(slices[0]["row_max"]).fill_(float("-inf"))
@@ -1902,6 +1902,10 @@ def test_decoder(
     if (not_supported_reasons := op.not_supported_reasons(inp)):
         pytest.skip(f"{not_supported_reasons=}")
 
+    ref_output = ref_attention_splitk(q, k, v, attn_bias, dtype=dtype_, split_k=1)
+
+    print(f"{ref_output.shape=}")
+
     decoder_output = fmha.memory_efficient_attention_forward(
         q, k, v, attn_bias, op=op
     )
@@ -1911,7 +1915,6 @@ def test_decoder(
     # torch.set_printoptions(threshold=None, edgeitems=256)
     # print(f"{attn_bias_tensor.shape=} {attn_bias_tensor=}")
 
-    ref_output = ref_attention_splitk(q, k, v, attn_bias, dtype=dtype_, split_k=1)
 
     assert_allclose(
         decoder_output,
diff --git a/xformers/csrc/attention/hip_fmha/attention_forward_splitk.cpp b/xformers/csrc/attention/hip_fmha/attention_forward_splitk.cpp
index 9b8a45de8..79ef348d8 100644
--- a/xformers/csrc/attention/hip_fmha/attention_forward_splitk.cpp
+++ b/xformers/csrc/attention/hip_fmha/attention_forward_splitk.cpp
@@ -21,9 +21,9 @@ static std::tuple<at::Tensor, at::Tensor, at::Tensor> split1_attention_torch(
   auto Q_scaled = at::div(Q, sqrt(Q.size(-1)));
   auto S = at::einsum("mghk, nghk -> mghn", {Q_scaled.flatten(0, 1), K.flatten(0, 1)}, /* einsum eval path */ at::nullopt);
 
-  for (size_t i = 0; i < S.dim(); ++i) {
-    std::cout << "S.dim" << i << "=" << S.size(i) << std::endl; 
-  }
+  // for (size_t i = 0; i < S.dim(); ++i) {
+  //   std::cout << "S.dim" << i << "=" << S.size(i) << std::endl; 
+  // }
 
   // causal mask
   auto neg_inf = at::tensor(-99.).item();
@@ -31,7 +31,7 @@ static std::tuple<at::Tensor, at::Tensor, at::Tensor> split1_attention_torch(
     auto seqlen = k_seqlens[b].item<int64_t>();
     at::slice(S[b], /* dim */ -1, /* start */ 0, /* end */ b * K.size(1)).fill_(neg_inf);
     at::slice(S[b], /* dim */ -1, /* start */ b * K.size(1) + seqlen, /* end */ S.size(-1)).fill_(neg_inf);
-    std::cout << "batch" << b << " ; masked QK^T dim " << S[b].dim() << " values at h0 " << S[b].slice(1, 0, 1) << std::endl;
+    // std::cout << "batch" << b << " ; masked QK^T dim " << S[b].dim() << " values at h0 " << S[b].slice(1, 0, 1) << std::endl;
   }
 
   auto m = std::get<0>(at::max(S, /* dim */ -1, /* keepdim */ true));
@@ -46,7 +46,7 @@ static std::tuple<at::Tensor, at::Tensor, at::Tensor> split1_attention_torch(
 
   auto l = at::sum(s, /* dim */ -1, /* keepdim */ true);
   auto O = at::einsum("mghn, nghk -> mghk", {s, V.flatten(0, 1)}, /* einsum eval path */ at::nullopt);
-  return std::make_tuple(O.reshape_as(Q), m, l);
+  return std::make_tuple(O, m, l);
 }
 
 static at::Tensor split1_reduce_torch(
@@ -54,7 +54,7 @@ static at::Tensor split1_reduce_torch(
   const at::Tensor& m,
   const at::Tensor& l
 ) {
-  return at::div(O_splits[0], l);
+  return at::div(O_splits, l);
 }
 
 namespace {
@@ -280,6 +280,18 @@ at::Tensor efficient_attention_forward_decoder_splitk_ck_impl(
   return O;
 }
 
+at::Tensor efficient_attention_forward_decoder_split1_torch(
+  const at::Tensor& XQ, // [B, 1, G, H, D]
+  const at::Tensor& cache_K, // [B, KV_M_MAX, G, H or 1, D]
+  const at::Tensor& cache_V, // [B, KV_M_MAX, G, H or 1, D]
+  at::optional<at::Tensor> seq_kv_lens, // [B]
+  double qk_scale
+) {
+  auto [O_split, m, l] = split1_attention_torch(XQ, cache_K, cache_V, *seq_kv_lens);
+  auto O = split1_reduce_torch(O_split, m, l);
+  return O.reshape_as(XQ);
+}
+
 at::Tensor efficient_attention_forward_decoder_splitk_ck(
     const at::Tensor& XQ, // [B, 1, G, H, D]
     const at::Tensor& cache_K, // [B, KV_M_MAX, G, H or 1, D]
@@ -288,10 +300,7 @@ at::Tensor efficient_attention_forward_decoder_splitk_ck(
     double qk_scale,
     int64_t split_k) {
 
-  auto [O_split, m, l] = split1_attention_torch(XQ, cache_K, cache_V, *seq_kv_lens);
-  // return at::repeat_interleave(m, 256, -1);
-  // return O_split[0];
-  return split1_reduce_torch(O_split, m, l);
+  return efficient_attention_forward_decoder_split1_torch(XQ, cache_K, cache_V, seq_kv_lens, qk_scale);
 
   return efficient_attention_forward_decoder_splitk_ck_impl<
       kThreadsPerWavefront,

From 45727d64e24f03b8c0b52ef68e9ab0e08b09a3bf Mon Sep 17 00:00:00 2001
From: Grigory Sizov <grigorysizov@fb.com>
Date: Wed, 20 Dec 2023 02:16:48 -0800
Subject: [PATCH 310/641] Disable CK kernel for large shapes, better catch OOMs

---
 xformers/benchmarks/utils.py |  8 +++++---
 xformers/ops/fmha/ck.py      | 22 +++++++++++++++++++---
 2 files changed, 24 insertions(+), 6 deletions(-)

diff --git a/xformers/benchmarks/utils.py b/xformers/benchmarks/utils.py
index b04889501..7c5f87cd4 100644
--- a/xformers/benchmarks/utils.py
+++ b/xformers/benchmarks/utils.py
@@ -557,7 +557,7 @@ def benchmark_run_and_compare(
             # pbar.write(f"Skipped (NotImplementedError)")
             continue
         except RuntimeError as e:
-            if "CUDA out of memory" not in str(e):
+            if not _is_oom_error(e):
                 raise
             if not quiet:
                 pbar.write("Skipped (OOM)")
@@ -602,7 +602,7 @@ def benchmark_run_and_compare(
                     memory = torch.cuda.max_memory_allocated() / 2**20 - mem_begin
                     measurement.mem_use = memory
                 except RuntimeError as e:
-                    if "CUDA out of memory" not in str(e):
+                    if not _is_oom_error(e):
                         raise
                     if not quiet:
                         pbar.write("Skipped (OOM)")
@@ -611,7 +611,7 @@ def benchmark_run_and_compare(
                 if not quiet:
                     pbar.write(f"{name}: memory used: {memory} MB")
         except RuntimeError as e:
-            if "CUDA out of memory" not in str(e):
+            if not _is_oom_error(e):
                 raise
             if not quiet:
                 pbar.write("Skipped (OOM)")
@@ -652,6 +652,8 @@ def matches_current(r):
             results, reference=results_compare_to, atol_s=atol_s, rtol=rtol
         )
 
+def _is_oom_error(e):
+    return isinstance(e, (torch.cuda.OutOfMemoryError, triton.runtime.autotuner.OutOfResources))
 
 def _fail_if_regressions(
     results: List[Any], reference: List[Any], atol_s: float, rtol: float
diff --git a/xformers/ops/fmha/ck.py b/xformers/ops/fmha/ck.py
index 143c74f79..7b1526bb0 100644
--- a/xformers/ops/fmha/ck.py
+++ b/xformers/ops/fmha/ck.py
@@ -29,7 +29,7 @@
 )
 
 def _minimum_gemm_alignment(inp: Inputs) -> int:
-    return 1 
+    return 1
 
 
 def _get_seqlen_info(
@@ -86,6 +86,20 @@ def _check_bias_alignment(
                 "you should call `.contiguous()` on the bias"
             )
 
+def _check_large_shapes(reasons: List[str], inp: Inputs) -> None:
+    """CK kernel throws "Memory access fault by GPU node-2" when B * T >= 2**20, might be some index overflow.
+    To reproduce, remove this function and run benchmark_mem_eff_attention with ParlAI model shape (256, 4096, 16, 64).
+    This needs further debugging, for now let's not support such shapes.
+    """
+    b_t_limit = 1024 ** 2
+    q_too_large = inp.query.shape[0] *  inp.query.shape[1] >= b_t_limit
+    k_too_large = inp.key.shape[0] *  inp.key.shape[1] >= b_t_limit
+    v_too_large = inp.value.shape[0] *  inp.value.shape[1] >= b_t_limit
+    if q_too_large or k_too_large or v_too_large:
+        reasons.append(
+            "Input is too large: product of first two dimensions of q/k/v must be < 2**20"
+        )
+
 
 class _CustomMaskType(int, Enum):
     """
@@ -120,7 +134,7 @@ def _custom_mask_type(bias: Optional[Union[torch.Tensor, AttentionBias]]) -> int
 @register_operator
 class FwOp(AttentionFwOpBase):
     """xFormers' MHA kernel based on Composable Kernel.
-    Supports AMD MI 200 and MI 300 GPUs 
+    Supports AMD MI 200 and MI 300 GPUs
     """
 
     OPERATOR = get_xformers_operator("efficient_attention_forward_ck")
@@ -205,6 +219,7 @@ def not_supported_reasons(cls, d: Inputs) -> List[str]:
         check_lastdim_alignment_stride1(reasons, "query", d.query, matmul_alignment_mn)
         check_lastdim_alignment_stride1(reasons, "value", d.value, matmul_alignment_mn)
         _check_bias_alignment(reasons, d.attn_bias)
+        _check_large_shapes(reasons, d)
         return reasons
 
     @classmethod
@@ -299,6 +314,7 @@ def not_supported_reasons(cls, d: Inputs) -> List[str]:
                     f"(shape: {tuple(attn_bias_tensor.shape)}"
                     f"/ expected: {expected_bias_shape})"
                 )
+        _check_large_shapes(reasons, d)
         return reasons
 
     @classmethod
@@ -328,7 +344,7 @@ def apply(cls, ctx: Context, inp: Inputs, grad: torch.Tensor) -> Gradients:
             attn_bias=_get_tensor_bias(inp.attn_bias),
             seqstart_q=seqstart_q,
             seqstart_k=seqstart_k,
-            max_seqlen_q=max_seqlen_q, 
+            max_seqlen_q=max_seqlen_q,
             seqlen_k=inp.attn_bias.k_seqinfo.seqlen_cpu
             if isinstance(inp.attn_bias, BlockDiagonalCausalWithOffsetPaddedKeysMask)
             else None,

From 402ee91b829b9816e80fdeef4889d557c7285f95 Mon Sep 17 00:00:00 2001
From: Qianfeng Zhang <Qianfeng.Zhang@amd.com>
Date: Sun, 24 Dec 2023 11:12:13 +0000
Subject: [PATCH 311/641] Actually remove submodule composable_kernel_tiled
 from the branch

---
 third_party/composable_kernel_tiled | 1 -
 1 file changed, 1 deletion(-)
 delete mode 160000 third_party/composable_kernel_tiled

diff --git a/third_party/composable_kernel_tiled b/third_party/composable_kernel_tiled
deleted file mode 160000
index ddce91a44..000000000
--- a/third_party/composable_kernel_tiled
+++ /dev/null
@@ -1 +0,0 @@
-Subproject commit ddce91a44b2da6eb74e7e3d7bf14b54930719983

From 79040960e2f7702f057bbc441d6c2694956c2151 Mon Sep 17 00:00:00 2001
From: Qianfeng Zhang <Qianfeng.Zhang@amd.com>
Date: Sun, 24 Dec 2023 11:15:38 +0000
Subject: [PATCH 312/641] Change the domain for the repo of composable_kernel
 submodule to ROCm

---
 .gitmodules | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.gitmodules b/.gitmodules
index 94eb8135c..3017b3887 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -3,7 +3,7 @@
 	url = https://github.com/NVIDIA/cutlass.git
 [submodule "third_party/composable_kernel"]
 	path = third_party/composable_kernel
-	url = https://github.com/ROCmSoftwarePlatform/composable_kernel.git
+	url = https://github.com/ROCm/composable_kernel.git
 	branch = mha-train-develop
 [submodule "third_party/flash-attention"]
 	path = third_party/flash-attention

From 44f61609dc17ced61511cb37592c208198388219 Mon Sep 17 00:00:00 2001
From: Qianfeng Zhang <Qianfeng.Zhang@amd.com>
Date: Tue, 26 Dec 2023 18:29:10 +0000
Subject: [PATCH 313/641] Update to validate_inputs() in common.py to support
 4d mqa/gqa

---
 xformers/ops/fmha/common.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/xformers/ops/fmha/common.py b/xformers/ops/fmha/common.py
index bc2c2db76..9808b5934 100644
--- a/xformers/ops/fmha/common.py
+++ b/xformers/ops/fmha/common.py
@@ -181,11 +181,13 @@ def validate_inputs(self) -> None:
                 and self.value.shape == (B, Mkv, Kv)
             )
         H = self.query.shape[-2]
+        Hkv = self.key.shape[-2]
         if self.query.ndim == 4:  # BMHK
             valid_shapes = (
                 self.query.shape == (B, Mq, H, K)
-                and self.key.shape == (B, Mkv, H, key_embed_dim)
-                and self.value.shape == (B, Mkv, H, Kv)
+                and self.key.shape == (B, Mkv, Hkv, key_embed_dim)
+                and self.value.shape == (B, Mkv, Hkv, Kv)
+                and H % Hkv == 0
             )
         G = self.query.shape[2]
         if self.query.ndim == 5:  # BMNHK

From e03f67aad110bed69289288abdd9ecbe3b7f4aba Mon Sep 17 00:00:00 2001
From: Qianfeng Zhang <Qianfeng.Zhang@amd.com>
Date: Wed, 27 Dec 2023 23:29:41 +0000
Subject: [PATCH 314/641] synchronize test_mem_eff_attention_ck.py with
 test_mem_eff_attention.py

---
 tests/readme_test_on_rocm.txt      |   2 +
 tests/test_mem_eff_attention_ck.py | 953 ++++++++++++++++++++---------
 2 files changed, 674 insertions(+), 281 deletions(-)

diff --git a/tests/readme_test_on_rocm.txt b/tests/readme_test_on_rocm.txt
index 16e283ccb..b2b18ff78 100644
--- a/tests/readme_test_on_rocm.txt
+++ b/tests/readme_test_on_rocm.txt
@@ -26,6 +26,8 @@
       * test_unsupported_stride_alignment
       * test_cuda_streams
       * test_dropout
+      * test_backward
+      * test_decoder
 
    4. verify testing for memory_efficient_attention forward (with dropout)
 
diff --git a/tests/test_mem_eff_attention_ck.py b/tests/test_mem_eff_attention_ck.py
index 1b4286c01..ee9c557ab 100644
--- a/tests/test_mem_eff_attention_ck.py
+++ b/tests/test_mem_eff_attention_ck.py
@@ -5,22 +5,26 @@
 
 import math
 import random
+from functools import partial
 from typing import List, Optional, Sequence, Tuple, Type, TypeVar
 
 import pytest
 import torch
+import torch.nn.functional as F
 from scipy.stats import binomtest
 from torch.utils.checkpoint import checkpoint
 
 import xformers.ops
+from xformers.attn_bias_utils import create_attn_bias
 from xformers.ops import fmha
+from xformers.ops.fmha import ALL_BW_OPS, ALL_FW_OPS
 from xformers.ops.fmha.common import AttentionOpBase
+from xformers.ops.fmha.dispatch import _dispatch_fw_priority_list
 
 from .utils import assert_allclose
 
 torch.backends.cuda.matmul.allow_tf32 = False
 cuda_only = pytest.mark.skipif(not torch.cuda.is_available(), reason="requires CUDA")
-
 _devices = ["cpu", "cuda"] if torch.cuda.is_available() else ["cpu"]
 _types = [torch.float16, torch.bfloat16]
 
@@ -91,13 +95,14 @@ def generate_test_shapes_B_Mq_Mkv_H_K_Kv(op):
     ]
     # Add some random shapes
     if op in [
-        fmha.ck.FwOp,
-        fmha.ck.BwOp,
+        fmha.cutlass.FwOp,
+        fmha.cutlass.BwOp,
+        fmha.flash.BwOp,
     ]:
         K_CHOICES = [8 * i for i in range(1, 256 // 8)]
         r = random.Random(0)
         found_count = 0
-        while found_count < 20:
+        while found_count < 200:
             B = r.randint(1, 400)
             Mq = r.randint(1, 500)
             Mkv = r.randint(1, 500)
@@ -146,10 +151,10 @@ def _generate_op_device_dtype_biasT_B_Mq_Mkv_H_K_Kv(
                         B, Mq, Mkv, H, K, Kv = shape
                         B = min(B, 12)
 
-                        if (
-                            bias_type
-                            is fmha.attn_bias.BlockDiagonalCausalFromBottomRightMask
-                        ):
+                        if bias_type in {
+                            fmha.attn_bias.BlockDiagonalCausalFromBottomRightMask,
+                            fmha.attn_bias.BlockDiagonalCausalLocalAttentionFromBottomRightMask,
+                        }:
                             Mq, Mkv = min(Mkv, Mq), max(Mkv, Mq) + 2
                         elif (
                             bias_type
@@ -208,8 +213,9 @@ def _generate_op_device_dtype_biasT_B_Mq_Mkv_H_K_Kv(
 )
 
 
-def ref_attention(q, k, v, attn_bias=None, drop_mask=None, p=0.0, scale=None, dtype=None):
+def ref_attention(q, k, v, attn_bias=None, drop_mask=None, p=0.0, scale=None):
     if q.ndim == 5:
+
         def attn_bias_group(group: int):
             if isinstance(attn_bias, torch.Tensor):
                 return attn_bias[:, group]
@@ -222,23 +228,24 @@ def attn_bias_group(group: int):
         return torch.stack(
             [
                 ref_attention_bmhk(
-                    q[:, :, g], k[:, :, g], v[:, :, g], attn_bias=attn_bias_group(g), dtype=dtype
+                    q[:, :, g],
+                    k[:, :, g],
+                    v[:, :, g],
+                    scale=scale,
+                    attn_bias=attn_bias_group(g),
                 )
                 for g in range(q.shape[2])
             ],
             dim=2,
         )
-     
     if q.ndim == 4:
         assert p == 0.0
-        return ref_attention_bmhk(q, k, v, attn_bias=attn_bias, dtype=dtype)
-    if dtype is None:
-        dtype = torch.float32
-    q = q.to(dtype=dtype)
-    k = k.to(dtype=dtype)
-    v = v.to(dtype=dtype)
-
-    scale = scale if scale is not None else (q.shape[-1] ** -0.5)
+        return ref_attention_bmhk(q, k, v, scale=scale, attn_bias=attn_bias)
+    q = q.float()
+    k = k.float()
+    v = v.float()
+
+    scale = scale if scale is not None else (1 / q.shape[-1] ** 0.5)
     q = q * scale
 
     attn = q @ k.transpose(-2, -1)
@@ -248,23 +255,23 @@ def attn_bias_group(group: int):
             attn_bias_tensor = attn_bias.materialize(
                 (q.shape[0], 1, q.shape[1], k.shape[1]),
                 device=q.device,
-                dtype=dtype,
+                dtype=torch.float32,
             )
         else:
-            attn_bias_tensor = attn_bias.to(dtype=dtype)
+            attn_bias_tensor = attn_bias
         if attn_bias_tensor.ndim == 4:
             assert q.shape[0] == attn_bias_tensor.shape[0] * attn_bias_tensor.shape[1]
             attn_bias_tensor = attn_bias_tensor.reshape(
                 [-1, *attn_bias_tensor.shape[2:]]
             )
-        attn = attn + attn_bias_tensor
+        attn = attn + attn_bias_tensor.float()
     attn = attn.softmax(-1)
     if drop_mask is not None:
         attn = attn * (drop_mask / (1 - p))
     return attn @ v
 
 
-def ref_attention_bmhk(q, k, v, attn_bias, scale=None, dtype=None) -> torch.Tensor:
+def ref_attention_bmhk(q, k, v, attn_bias, scale=None) -> torch.Tensor:
     assert q.ndim == 4
 
     def T(t):
@@ -278,50 +285,11 @@ def T(t):
             device=q.device,
             dtype=torch.float32,
         ).reshape([q.shape[0] * q.shape[2], q.shape[1], k.shape[1]])
-    out = ref_attention(T(q), T(k), T(v), attn_bias, scale=scale, dtype=dtype)
+    out = ref_attention(T(q), T(k), T(v), attn_bias, scale=scale)
     out = out.reshape([q.shape[0], q.shape[2], q.shape[1], v.shape[3]])
     return out.permute((0, 2, 1, 3))
 
 
-def _rand_seqlens(
-    r: random.Random,
-    bs: int,
-    q_len: int,
-    kv_len: int,
-    more_keys_than_queries_per_block: bool,
-) -> Tuple[Sequence[int], Sequence[int]]:
-    """
-    Generates lists of lengths of query blocks and corresponding key blocks.
-    The total number of queries will be bs * q_len and the
-    total number of keys will be bs * kv_len.
-    """
-    if more_keys_than_queries_per_block:
-        assert kv_len >= q_len
-    q_len *= bs
-    kv_len *= bs
-    seqlens_q: List[int] = []
-    seqlens_k: List[int] = []
-
-    step_q = [max(1, q_len // 10), max(2, q_len // 2)]
-    step_k = [max(1, kv_len // 10), max(2, kv_len // 2)]
-    while sum(seqlens_q) < q_len and sum(seqlens_k) < kv_len:
-        num_queries = r.randrange(*step_q)
-        seqlens_q.append(num_queries)
-
-        if more_keys_than_queries_per_block:
-            # Must select at least `num_queries` keys
-            # But also leave enough keys for later
-            keys_left = kv_len - sum(seqlens_k, 0)
-            queries_left = q_len - sum(seqlens_q[:-1], 0)
-            assert keys_left >= queries_left
-            seqlens_k.append(num_queries + r.randrange(0, keys_left - queries_left))
-        else:
-            seqlens_k.append(r.randrange(*step_k))
-    seqlens_q[-1] = q_len - sum(seqlens_q[:-1])
-    seqlens_k[-1] = kv_len - sum(seqlens_k[:-1])
-    return seqlens_q, seqlens_k
-
-
 def _rand_partition(r: random.Random, total: int, n: int) -> List[int]:
     # returns list of n nonnegative integers summing to total
     idx = {0, total}
@@ -331,158 +299,6 @@ def _rand_partition(r: random.Random, total: int, n: int) -> List[int]:
     return [e - b for b, e in zip(s[:-1], s[1:])]
 
 
-def _rand_maxed_partition(
-    r: random.Random, total: int, n: int, mx: int, positive: bool = True
-) -> List[int]:
-    # returns list of n nonnegative integers less than mx summing to total
-    # NB: This is unfortunately biased towards evenly-split bins.
-    # If `positive`, outputs are positive
-    if positive:
-        total -= n
-        mx -= 1
-    idxs = r.sample(range(n * mx), total)
-    y = torch.zeros(n, mx, dtype=torch.int32)
-    y.flatten()[idxs] = 1
-    z = y.sum(1)
-    if positive:
-        z += 1
-    return z.tolist()
-
-
-def _rand_seqlens_padded_k(
-    r: random.Random, bs: int, q_len: int, kv_len: int
-) -> Tuple[Sequence[int], Sequence[int]]:
-    # This is for BlockDiagonalCausalWithOffsetPaddedKeysMask.
-    # we need q_seqlens and k_seqlens to be of len bsz.
-    # For each "batch element" there must be more keys than queries
-    # because this bias type is "bottom right" and so any extra queries
-    # will attend to nothing and have undefined result.
-    # In addition every element of k_seqlens must be <= kv_len
-    if q_len > kv_len:
-        raise ValueError("need more keys than values")
-    if q_len == kv_len:
-        # all key slots are needed so we cannot have padding
-        q_seqlens = k_seqlens = [kv_len] * bs
-    else:
-        q_seqlens = _rand_maxed_partition(r, q_len * bs, bs, kv_len)
-        k_seqlens = [r.randint(i, kv_len) for i in q_seqlens]
-    return q_seqlens, k_seqlens
-
-
-def _create_aligned_bias(B: int, H: int, Mq: int, Mkv: int, **kwargs) -> torch.Tensor:
-    align_to = 8
-    return (
-        torch.randn(
-            (
-                B,
-                H,
-                Mq,
-                align_to * ((Mkv + align_to - 1) // align_to),
-            ),
-            **kwargs,
-        )
-        * 3
-    )[:, :, :, :Mkv]
-
-
-def create_attn_bias(
-    bias_type,
-    batch_size: int,
-    num_heads: int,
-    q_len: int,
-    kv_len: int,
-    device,
-    dtype,
-    requires_grad: bool,
-    fmt: str,
-    op: Type[AttentionOpBase],
-):
-    if bias_type is None or isinstance(None, bias_type):
-        return None
-    r = random.Random("-".join(map(str, [batch_size, q_len, kv_len, dtype, fmt])))
-    if bias_type is torch.Tensor:
-        if fmt == "BMK":
-            batch_size *= num_heads
-            num_heads = 1
-        # `small_k` only supports an expanded 1d bias
-        if op in [fmha.small_k.FwOp, fmha.small_k.BwOp]:
-            attn_bias = (
-                torch.randn(
-                    (batch_size, num_heads, 1, kv_len), device=device, dtype=dtype
-                )
-                * 3
-            )
-            attn_bias = attn_bias.expand(batch_size, num_heads, q_len, kv_len)
-        else:
-            attn_bias = _create_aligned_bias(
-                batch_size,
-                num_heads,
-                q_len,
-                kv_len,
-                device=device,
-                dtype=dtype,
-            )
-            # ToDo: need a fix in ck-flashAttn to avoid divided-by-zero when all-(-inf) occurred
-            #       with the data read by one-thread
-            # make sure it also works if the first columns are partially masked out
-            ## attn_bias[0, 0, q_len - 1 :, : num_heads - 2] = -math.inf
-
-        if requires_grad:
-            attn_bias.requires_grad_(True)
-        if fmt == "BMK":
-            attn_bias = attn_bias[:, 0]
-        return attn_bias
-    if bias_type is fmha.attn_bias.LowerTriangularMask:
-        return fmha.attn_bias.LowerTriangularMask()
-    if bias_type is fmha.attn_bias.LowerTriangularMaskWithTensorBias:
-        attn_bias = _create_aligned_bias(
-            batch_size,
-            num_heads,
-            q_len,
-            kv_len,
-            device=device,
-            dtype=dtype,
-        )
-        if requires_grad:
-            attn_bias.requires_grad_(True)
-        return fmha.attn_bias.LowerTriangularMaskWithTensorBias(attn_bias)
-    if bias_type in [
-        fmha.attn_bias.BlockDiagonalMask,
-        fmha.attn_bias.BlockDiagonalCausalMask,
-        fmha.attn_bias.BlockDiagonalCausalFromBottomRightMask,
-    ]:
-        # This bias is not supported in BMK format
-        assert fmt == "BMHK"
-        block_diag = fmha.attn_bias.BlockDiagonalMask.from_seqlens(
-            *_rand_seqlens(
-                r,
-                batch_size,
-                q_len,
-                kv_len,
-                more_keys_than_queries_per_block=bias_type
-                is fmha.attn_bias.BlockDiagonalCausalFromBottomRightMask,
-            )
-        )
-        if bias_type is fmha.attn_bias.BlockDiagonalCausalMask:
-            block_diag = block_diag.make_causal()
-        if bias_type is fmha.attn_bias.BlockDiagonalCausalFromBottomRightMask:
-            block_diag = block_diag.make_causal_from_bottomright()
-        return block_diag
-    if bias_type == fmha.attn_bias.BlockDiagonalCausalWithOffsetPaddedKeysMask:
-        assert fmt == "BMHK"
-        q, k = _rand_seqlens_padded_k(r, batch_size, q_len, kv_len)
-        g_block_diag = (
-            fmha.attn_bias.BlockDiagonalCausalWithOffsetPaddedKeysMask.from_seqlens(
-                q_seqlen=q,
-                kv_padding=kv_len,
-                kv_seqlen=k,
-            )
-        )
-        return g_block_diag
-
-    assert False, f"Unsupported bias type: {bias_type}"
-
-
 def get_bias_grad(attn_bias, clear: bool = False) -> Optional[torch.Tensor]:
     tensor_with_grad: Optional[torch.Tensor] = None
     if isinstance(attn_bias, torch.Tensor):
@@ -511,18 +327,46 @@ def create_tensors(
     *,
     attn_bias_requires_grad: bool = False,
     fmt: str = "BMK",
+    g: int = 1,
 ):
     torch.manual_seed(B * q_len + kv_len * k + kv)
+
+    mask_is_bottom_right = attn_bias_type is not None and issubclass(
+        attn_bias_type,
+        (
+            fmha.attn_bias.LowerTriangularFromBottomRightMask,
+            fmha.attn_bias.LowerTriangularFromBottomRightLocalAttentionMask,
+            fmha.attn_bias.BlockDiagonalCausalFromBottomRightMask,
+            fmha.attn_bias.BlockDiagonalCausalLocalAttentionFromBottomRightMask,
+            fmha.attn_bias.BlockDiagonalCausalLocalAttentionMask,
+            fmha.attn_bias.LocalAttentionFromBottomRightMask,
+        ),
+    )
+    if mask_is_bottom_right and q_len > kv_len:
+        # Bottom-right attention and local-attention masks require q_len <= kv_len
+        kv_len = q_len
     scale = 3
     if fmt == "BMK":
-        query = torch.randn((B * h, q_len, k), device=device, dtype=dtype).mul_(scale)
-        key = torch.randn((B * h, kv_len, k), device=device, dtype=dtype).mul_(scale)
-        value = torch.randn((B * h, kv_len, kv), device=device, dtype=dtype).mul_(scale)
+        query = torch.randn((B * h, q_len, k), device=device, dtype=dtype)
+        key = torch.randn((B * h, kv_len, k), device=device, dtype=dtype)
+        value = torch.randn((B * h, kv_len, kv), device=device, dtype=dtype)
+    elif fmt == "BMHK":
+        query = torch.randn((B, q_len, h, k), device=device, dtype=dtype)
+        key = torch.randn((B, kv_len, h, k), device=device, dtype=dtype)
+        value = torch.randn((B, kv_len, h, kv), device=device, dtype=dtype)
     else:
-        assert fmt == "BMHK"
-        query = torch.randn((B, q_len, h, k), device=device, dtype=dtype).mul_(scale)
-        key = torch.randn((B, kv_len, h, k), device=device, dtype=dtype).mul_(scale)
-        value = torch.randn((B, kv_len, h, kv), device=device, dtype=dtype).mul_(scale)
+        assert fmt == "BMGHK"
+        query = torch.randn((B, q_len, g, h, k), device=device, dtype=dtype)
+        key = torch.randn((B, kv_len, g, 1, k), device=device, dtype=dtype)
+        value = torch.randn((B, kv_len, g, 1, kv), device=device, dtype=dtype)
+
+    for x in [query, key, value]:
+        x.mul_(scale)
+
+    if fmt == "BMGHK":
+        # Expand - after the in-place mul
+        key = key.expand((B, kv_len, g, h, k))
+        value = value.expand((B, kv_len, g, h, k))
 
     if fmt == "BMK" and not fmha.common._is_bias_type_supported_in_BMK(attn_bias_type):
         attn_bias_type = None
@@ -532,6 +376,7 @@ def create_tensors(
             attn_bias_type,
             batch_size=B,
             num_heads=h,
+            num_heads_groups=g,
             q_len=q_len,
             kv_len=kv_len,
             dtype=dtype,
@@ -578,11 +423,7 @@ def bmk2bmhk(tensor, num_heads: int) -> torch.Tensor:
 @pytest.mark.parametrize("fmt", ["BMK", "BMHK"])
 @pytest.mark.parametrize("packed", [False, True])
 @parametrize_opFW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv
-def test_forward(
-    opFW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv,
-    packed,
-    fmt,
-):
+def test_forward(opFW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv, packed, fmt, **kwargs):
     (
         op,
         device,
@@ -607,7 +448,9 @@ def test_forward(
         pytest.skip("BMK incompatible with this bias")
 
     query, key, value, attn_bias = create_tensors(
-        *opFW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv, fmt="BMHK" if packed else fmt
+        *opFW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv,
+        fmt="BMHK" if packed else fmt,
+        **kwargs,
     )
 
     if packed:
@@ -621,6 +464,7 @@ def test_forward(
                 bias_type=bias_type,
                 batch_size=batch_size,
                 num_heads=h,
+                num_heads_groups=1,
                 q_len=q_len,
                 kv_len=kv_len,
                 device=device,
@@ -629,9 +473,11 @@ def test_forward(
                 fmt=fmt,
                 op=op,
             )
-        else:
+        elif fmt == "BMHK":
             # bm3hk -> 3 x bmhk
             query, key, value = xformers.ops.unbind(c, 2)
+        else:
+            assert False, f"Unsupport fmt {fmt} with packing"
         assert not query.is_contiguous()
 
     out = xformers.ops.memory_efficient_attention_forward(
@@ -656,13 +502,14 @@ def test_forward(
     )
 
 
+@cuda_only
 @pytest.mark.parametrize("k_len", [5, 6, 32])
 @pytest.mark.parametrize("batch_size", [1, 4])
 @pytest.mark.parametrize("kv_len", [128, 512])
 @pytest.mark.parametrize("q_len", [128, 512])
-@pytest.mark.parametrize("device", [torch.device("cuda")])
 @pytest.mark.parametrize("dtype", _types)
-def test_key_query_all_ones(dtype, device, q_len, kv_len, batch_size, k_len):
+def test_key_query_all_ones(dtype, q_len, kv_len, batch_size, k_len):
+    device = "cuda"
     scale = 3
     query = torch.ones((batch_size, q_len, k_len), device=device, dtype=dtype)
     key = torch.ones((batch_size, kv_len, k_len), device=device, dtype=dtype)
@@ -732,6 +579,35 @@ def test_logsumexp(opFW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv):
     assert_allclose(lse[:, 0, : ref_lse.shape[1]], ref_lse, atol=2e-4)
 
 
+@cuda_only
+@pytest.mark.parametrize("op", [fmha.cutlass.FwOp, fmha.flash.FwOp])
+def test_logsumexp_mqa(op):
+    if not op.is_available():
+        pytest.skip("not available")
+
+    dtype = torch.float16
+    s = 3
+    query = torch.randn([1, 1, 32, 128], dtype=dtype, device="cuda") * s
+    key = (torch.randn([1, 16, 1, 128], dtype=dtype, device="cuda") * s).expand(
+        -1, -1, 32, -1
+    )
+    value = (torch.randn([1, 16, 1, 128], dtype=dtype, device="cuda") * s).expand(
+        -1, -1, 32, -1
+    )
+    assert key.stride(2) == 0
+
+    _, lse = xformers.ops.memory_efficient_attention_forward_requires_grad(
+        query,
+        key,
+        value,
+        op=op,
+    )
+    query, key, value = [x[0].transpose(0, 1) for x in [query, key, value]]
+    attn = (query.float() / query.shape[-1] ** 0.5) @ key.float().transpose(-2, -1)
+    ref_lse = attn.logsumexp(-1)
+    assert_allclose(lse[0, :, 0], ref_lse[:, 0], atol=2e-4)
+
+
 @pytest.mark.parametrize("fmt", ["BMK", "BMHK"])
 @pytest.mark.parametrize("grad_out_contiguous", [False, True])
 @parametrize_opBW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv
@@ -761,7 +637,7 @@ def test_backward(
         pytest.skip("head-dim length bigger than 128 is not supported by CK-FlashAttention")
 
     if k % 2 != 0:
-        pytest.skip("head-dim length must be an even value for CK-FlashAttention")
+        pytest.skip("head-dim length must be an even value for CK-FlashAttention")    
 
     if grad_out_contiguous is False:
         pytest.skip("CK-FlashAttention requires grad_out and out have same lengths/strides")
@@ -774,6 +650,12 @@ def test_backward(
         attn_bias_requires_grad=attn_bias_requires_grad,
         fmt=fmt,
     )
+
+    # To understand why we do this, check the comment on the
+    # `AttentionBwOpBase` class
+    scale = None
+    if op_bw.SUPPORTS_CUSTOM_SCALE and query.shape[-1] < 32:
+        scale = (1 / 32) ** 0.5
     op_fw = (
         sample_random_supported_fw(
             fmha.Inputs(query=query, key=key, value=value, attn_bias=attn_bias),
@@ -803,10 +685,10 @@ def test_backward(
         pytest.skip("inputs not supported")
 
     out = xformers.ops.memory_efficient_attention(
-        query, key, value, attn_bias, op=(op_fw, op_bw)
+        query, key, value, attn_bias, scale=scale, op=(op_fw, op_bw)
     )
 
-    grad_out = torch.ones_like(out)
+    grad_out = torch.randn_like(out)
     if grad_out_contiguous is False:
         grad_out = torch.tensor([1.0], dtype=query.dtype, device=device)[
             None, None, :
@@ -814,7 +696,7 @@ def test_backward(
 
     out.backward(grad_out)
 
-    if qkv is None and op_bw == fmha.ck.BwOp:
+    if qkv is None and op_bw == fmha.cutlass.BwOp:
         assert query.stride() == query.grad.stride()
 
     grads = []
@@ -831,7 +713,7 @@ def test_backward(
         if attn_bias_grad is not None:
             grads.append(attn_bias_grad)
 
-    ref = ref_attention(query, key, value, attn_bias)
+    ref = ref_attention(query, key, value, attn_bias, scale=scale)
     ref.backward(grad_out)
 
     assert_allclose(
@@ -839,7 +721,7 @@ def test_backward(
         ref.float(),
         "fw pass",
         atol=op_fw.ERROR_ATOL[dtype],
-        rtol=op_fw.ERROR_RTOL.get(dtype, 1e-5),
+        rtol=op_fw.ERROR_RTOL[dtype],
     )
 
     del out
@@ -912,7 +794,6 @@ def _vec_binom_test(x, n, p):
     pval = np.minimum(1.0, pval)
     return pval
 
-
 def _get_drop_mask(op, batch_size, q_len, kv_len, p, device):
     if op == fmha.ck.FwOp:
         mask = torch.empty((batch_size, 1, q_len, kv_len), device=device)
@@ -927,7 +808,6 @@ def _get_drop_mask(op, batch_size, q_len, kv_len, p, device):
 
     return mask
 
-
 @cuda_only
 @pytest.mark.parametrize("attn_bias", [None, fmha.attn_bias.LowerTriangularMask()])
 @pytest.mark.parametrize("seed", [42, 124])
@@ -944,7 +824,7 @@ def test_dropout(dtype, op, q_len, kv_len, batch_size, k_len, p, seed, attn_bias
     query = torch.randn((batch_size, q_len, k_len), device=device, dtype=dtype) * scale
     key = torch.randn((batch_size, kv_len, k_len), device=device, dtype=dtype) * scale
     value = torch.randn((batch_size, kv_len, k_len), device=device, dtype=dtype) * scale
-  
+
     inputs_for_support_check = fmha.Inputs(query, key, value, attn_bias, p, None)
     if not op.supports(inputs_for_support_check):
         del query, key, value, attn_bias
@@ -981,11 +861,14 @@ def test_dropout(dtype, op, q_len, kv_len, batch_size, k_len, p, seed, attn_bias
     p_values = _vec_binom_test(masks, num_trials, p=keep_prob)
     assert all(p_values > p_val_tol)
 
+
 def _test_dropout_backward(q_len, kv_len, batch_size, k, p, op, dtype):
+    if dtype is torch.bfloat16 and compute_capability < (8, 0):
+        pytest.skip("bf16 requires Sm80")
     if not op.is_available():
         pytest.skip()
 
-    scale = 3
+    scale = 3 
     device = "cuda"
     query = torch.randn((batch_size, q_len, k), device=device, dtype=dtype) * scale
     key = torch.randn((batch_size, kv_len, k), device=device, dtype=dtype) * scale
@@ -1058,7 +941,7 @@ def _test_dropout_backward(q_len, kv_len, batch_size, k, p, op, dtype):
 @pytest.mark.parametrize("q_len", [2, 33])
 def test_dropout_backward_small_k(q_len, kv_len, batch_size, k, p):
     _test_dropout_backward(
-        q_len, kv_len, batch_size, k, p, op=fmha.ck.FwOp, dtype=torch.float16
+        q_len, kv_len, batch_size, k, p, op=fmha.small_k.FwOp, dtype=torch.float32
     )
 
 
@@ -1068,30 +951,26 @@ def test_dropout_backward_small_k(q_len, kv_len, batch_size, k, p):
 @pytest.mark.parametrize("batch_size", [1, 2])
 @pytest.mark.parametrize("kv_len", [3, 248, 256])
 @pytest.mark.parametrize("q_len", [3, 248, 256])
-@pytest.mark.parametrize("dt", ["f16", "bf16"])
-def test_dropout_backward_ck(dt, q_len, kv_len, batch_size, k, p):
-    if k > 128:
-        pytest.skip("head-dim size bigger than 128 is not supported by CK-FlashAttention")
-
+@pytest.mark.parametrize("dt", ["f16", "bf16", "f32"])
+def test_dropout_backward_cutlass(dt, q_len, kv_len, batch_size, k, p):
     _test_dropout_backward(
         q_len,
         kv_len,
         batch_size,
         k,
         p,
-        op=fmha.ck.FwOp,
+        op=fmha.cutlass.FwOp,
         dtype={"f16": torch.float16, "bf16": torch.bfloat16, "f32": torch.float32}[dt],
     )
 
 
+@cuda_only
 @pytest.mark.parametrize("k_len", [32])
 @pytest.mark.parametrize("batch_size", [1])
 @pytest.mark.parametrize("kv_len", [3 * 32])
 @pytest.mark.parametrize("q_len", [3 * 32])
-@pytest.mark.parametrize("device", _devices)
-def test_memory_efficient_attention_full_block_masked(
-    device, q_len, kv_len, batch_size, k_len
-):
+def test_memory_efficient_attention_full_block_masked(q_len, kv_len, batch_size, k_len):
+    device = "cuda"
     op_fw = fmha.small_k.FwOp
     op_bw = fmha.small_k.BwOp
 
@@ -1153,11 +1032,11 @@ def test_lowlevel_api_shapes(opBW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv, fmt):
     value.requires_grad_(True)
 
     out, lse = xformers.ops.memory_efficient_attention_forward_requires_grad(
-        query, key, value, attn_bias, op=fmha.ck.FwOp
+        query, key, value, attn_bias
     )
     assert out.ndim == query.ndim
     dq, dk, dv = xformers.ops.memory_efficient_attention_backward(
-        grad_out, out, lse, query, key, value, attn_bias, op=fmha.ck.BwOp
+        grad_out, out, lse, query, key, value, attn_bias
     )
     assert dq.shape == query.shape
     assert dk.shape == key.shape
@@ -1232,19 +1111,19 @@ def test_cuda_streams(
 @parametrize_opBW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv__xs
 def test_custom_scale(opBW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv):
     p = 0.0
-    scale = 1.0
+    scale = 0.1
 
     (
         op_bw,
         device,
         dtype,
         _,
-        _,
+        B,
         q_len,
         kv_len,
-        _,
+        H,
         k,
-        _,
+        Kv,
     ) = opBW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv
     torch.manual_seed(q_len + kv_len + k)
     if device != "cuda":
@@ -1257,7 +1136,7 @@ def test_custom_scale(opBW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv):
         query=query, key=key, value=value, attn_bias=attn_bias, scale=scale
     )
     op_fw = sample_random_supported_fw(inputs, seed=q_len * k + kv_len * k)
-    grad_out = torch.ones_like(query)
+    grad_out = query.new_ones(B * H, q_len, Kv)
     query.requires_grad_(True)
     key.requires_grad_(True)
     value.requires_grad_(True)
@@ -1583,20 +1462,16 @@ def test_attn_bias_padded() -> None:
     bsize, n_heads, d, padding = 8, 3, 8, 32
 
     # Q / KV have different seqlen
-    k = torch.randn((bsize, padding, n_heads, d)).cuda().half()
+    k = torch.randn((bsize, padding, n_heads, d), device="cuda", dtype=torch.float16)
     k_seqlen = [5, 8, 7, 1, 9, 3, 12, 32]
     other = bsize - 1
-    v = torch.randn((bsize, padding, n_heads, d)).cuda().half()
+    v = torch.randn((bsize, padding, n_heads, d), device="cuda", dtype=torch.float16)
     n_q_first = 4
     q = [
-        torch.randn((1, n_q_first, n_heads, d)).cuda().half(),
-        torch.randn((1, other, n_heads, d)).cuda().half(),
+        torch.randn((1, n_q_first, n_heads, d), device="cuda", dtype=torch.float16),
+        torch.randn((1, other, n_heads, d), device="cuda", dtype=torch.float16),
     ]
     q_cat = torch.cat([x.view(1, -1, n_heads, d) for x in q], dim=1)
-    # causal_diagonal = torch.tensor(
-    #     [0] + [i - 1 for i in k_seqlen[1:]], dtype=torch.int32
-    # ).cuda()
-
     q_seqlen = [n_q_first] + [1] * other
 
     attn_bias = fmha.attn_bias.BlockDiagonalCausalWithOffsetPaddedKeysMask.from_seqlens(
@@ -1635,8 +1510,8 @@ def test_attn_bias_padded() -> None:
     assert_allclose(
         output,
         fmha_output,
-        atol=fmha.ck.FwOp.ERROR_ATOL[torch.float16],
-        rtol=fmha.ck.FwOp.ERROR_RTOL[torch.float16],
+        atol=fmha.cutlass.FwOp.ERROR_ATOL[torch.float16],
+        rtol=fmha.cutlass.FwOp.ERROR_RTOL[torch.float16],
     )
 
 
@@ -1647,7 +1522,6 @@ def _kv_heads_label(kv_heads: Optional[int]) -> str:
         return "mq"
     return f"gqa{kv_heads}"
 
-
 @pytest.mark.parametrize("op", [fmha.ck_decoder.FwOp])
 @pytest.mark.parametrize("kv_heads", [None, 1, 2], ids=_kv_heads_label)
 @pytest.mark.parametrize("bsz,n_heads", [(1, 1), (1, 16), (1, 32), (8, 1), (4, 8)])
@@ -1709,17 +1583,16 @@ def test_decoder(
     decoder_output = fmha.memory_efficient_attention_forward(
         q, k, v, attn_bias, op=op
     )
-    
-    ref_output = ref_attention(q, k, v, attn_bias, dtype=dtype_)
+
+    ref_output = ref_attention(q, k, v, attn_bias)
 
     assert_allclose(
-        decoder_output,
+        decoder_output.float(),
         ref_output,
         atol=fmha.ck_decoder.FwOp.ERROR_ATOL[dtype_] * 4,
         rtol=fmha.ck_decoder.FwOp.ERROR_RTOL[dtype_],
     )
 
-
 def test_attn_bias_from_seqlens() -> None:
     bias = fmha.attn_bias.BlockDiagonalMask.from_seqlens([3, 5, 1])
     out = bias.split(torch.randn([1, 3 + 5 + 1, 16]))
@@ -1752,7 +1625,6 @@ def test_attn_bias_blockdiag_doc() -> None:
     q, k, v = linear(x).reshape([1, -1, 1, 3, K]).unbind(-2)
     out = fmha.memory_efficient_attention(q, k, v, attn_bias=attn_bias, op=(fmha.ck.FwOp, None))
     list_out = attn_bias.split(out)
-    print(list_out[0].shape)  # [1, 3, 1, K]
     assert tuple(list_out[0].shape) == (1, 3, 1, K)
 
 
@@ -1785,22 +1657,21 @@ def pad_bias(bias: torch.Tensor) -> torch.Tensor:
 
     def test_f16_biasf32(self) -> None:
         q, k, v, bias = self.create_tensors(torch.float16)
-        fmha.memory_efficient_attention(q, k, v, attn_bias=bias, op=(fmha.ck.FwOp, None))
+        fmha.memory_efficient_attention(q, k, v, attn_bias=bias)
         bias = bias.to(torch.float32)
         with pytest.raises((ValueError, RuntimeError)):
-            fmha.memory_efficient_attention(q, k, v, attn_bias=bias, op=(fmha.ck.FwOp, None))
+            fmha.memory_efficient_attention(q, k, v, attn_bias=bias)
 
     def test_f32_biasf16(self) -> None:
-        pytest.skip("float32 is not supported currently by CK-FlashAttention")
         q, k, v, bias = self.create_tensors(torch.float32)
         fmha.memory_efficient_attention(q, k, v, attn_bias=bias)
         bias = bias.to(torch.float16)
         with pytest.raises((ValueError, RuntimeError)):
             fmha.memory_efficient_attention(q, k, v, attn_bias=bias)
 
-    @pytest.mark.parametrize("dtype", [torch.float16])
+    @pytest.mark.parametrize("dtype", [torch.float32, torch.float16])
     def test_wrong_alignment(self, dtype) -> None:
-        op = fmha.ck.FwOp
+        op = fmha.cutlass.FwOp
         q, k, v, bias = self.create_tensors(dtype, Mq=7, Mkv=5)
         try:
             fmha.memory_efficient_attention(q, k, v, attn_bias=bias, op=(op, None))
@@ -1820,7 +1691,7 @@ def test_wrong_alignment(self, dtype) -> None:
         )
 
     def test_permuted_attn_bias(self) -> None:
-        op = fmha.ck.FwOp
+        op = fmha.cutlass.FwOp
         dtype = torch.float16
         q, k, v, bias = self.create_tensors(dtype, Mq=7, Mkv=7)
         bias = bias.transpose(-1, -2)  # now `stride(-1) != 1`
@@ -1837,4 +1708,524 @@ def test_permuted_attn_bias(self) -> None:
         except (ValueError, RuntimeError):
             pass
 
+
+SM_AND_SHMEM_KBYTES = [
+    # https://docs.nvidia.com/cuda/cuda-c-programming-guide/#features-and-technical-specifications-technical-specifications-per-compute-capability
+    (50, 64),
+    (60, 64),
+    (70, 96),
+    (75, 64),
+    (80, 163),
+    (86, 99),
+    (89, 99),
+    # (90, 227),
+]
+
+
+@cuda_only
+@pytest.mark.parametrize("dtype_str", ["f32", "f16", "bf16"])
+@pytest.mark.parametrize(
+    "sm_shmem",
+    SM_AND_SHMEM_KBYTES,
+    ids=[f"cc{sm}_shmem{shmem}kb" for sm, shmem in SM_AND_SHMEM_KBYTES],
+)
+def test_has_kernel_for(sm_shmem: Tuple[int, int], dtype_str: str) -> None:
+    dtype = {"f32": torch.float, "f16": torch.half, "bf16": torch.bfloat16}[dtype_str]
+    sm, shmem_kbytes = sm_shmem
+    if sm < 80 and dtype_str == "bf16":
+        return
+
+    for k in [16, 32, 64, 128, 256]:
+        assert torch.ops.xformers._has_cutlassF_kernel_for(
+            dtype, sm, shmem_kbytes * 1024, k
+        ), f"k={k}"
+        assert torch.ops.xformers._has_cutlassB_kernel_for(
+            dtype, sm, shmem_kbytes * 1024, k
+        ), f"k={k}"
+
+
+def test_window_size_materialize() -> None:
+    seqlens = [4, 6]
+    attn_bias = fmha.attn_bias.BlockDiagonalMask.from_seqlens(
+        q_seqlen=seqlens,
+        kv_seqlen=seqlens,
+    ).make_local_attention(2)
+    mask = attn_bias.materialize(
+        (1, 1, sum(seqlens), sum(seqlens)),
+        device="cpu",
+        dtype=torch.float32,
+    )
+    true_mask = torch.log(
+        torch.Tensor(
+            [
+                [
+                    [
+                        [1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
+                        [1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
+                        [0.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
+                        [0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
+                        [0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0],
+                        [0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0],
+                        [0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 0.0],
+                        [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 0.0, 0.0],
+                        [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 0.0],
+                        [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0],
+                    ]
+                ]
+            ]
+        )
+    )
+    assert torch.all(mask == true_mask)
+
+
+@cuda_only
+@pytest.mark.parametrize(
+    "opFW_biasT",
+    [
+        (op, biasT)
+        for op in ALL_FW_OPS
+        for biasT in op.SUPPORTED_ATTN_BIAS_TYPES
+        if op.SUPPORTS_BMGHK
+    ],
+)
+def test_forward_gqa(opFW_biasT):
+    opFW, biasT = opFW_biasT
+    B_Mq_Mkv_H_K_Kv = (3, 512, 512, 16, 128, 128)
+    test_forward(
+        (
+            opFW,
+            "cuda",
+            torch.float16,
+            biasT,
+            *B_Mq_Mkv_H_K_Kv,
+        ),
+        packed=False,
+        fmt="BMGHK",
+        g=2,
+    )
+
+
+@cuda_only
+@pytest.mark.parametrize(
+    "opBW",
+    [
+        fmha.flash.BwOp,
+        fmha.cutlass.BwOp,
+    ],
+)
+def test_backward_gqa(opBW):
+    H = 8
+    B_Mq_Mkv_H_K_Kv = (3, 512, 512, H, 128, 128)
+    dtype = torch.float16
+    query, key, value, attn_bias = create_tensors(
+        *(opBW, "cuda", dtype, type(None), *B_Mq_Mkv_H_K_Kv),
+        attn_bias_requires_grad=False,
+        fmt="BMHK",
+    )
+    op = (fmha.cutlass.FwOp, opBW)
+    key = key[:, :, :1].expand(-1, -1, H, -1)
+    value = value[:, :, :1].expand(-1, -1, H, -1)
+    key.requires_grad_(True)
+    out = fmha.memory_efficient_attention(query, key, value, attn_bias=attn_bias)
+    out_ref = ref_attention_bmhk(query, key, value, attn_bias=attn_bias)
+    assert_allclose(
+        out.float(),
+        out_ref.float(),
+        atol=op[0].ERROR_ATOL[dtype],
+        rtol=op[0].ERROR_RTOL[dtype],
+    )
+    out.backward(query)
+    dk = key.grad
+    key.grad = None
+    out_ref.backward(query)
+    assert_allclose(
+        dk.float(),
+        key.grad.float(),
+        atol=op[1].ERROR_ATOL[dtype],
+        rtol=op[1].ERROR_RTOL[dtype],
+    )
+
+
+@cuda_only
+@pytest.mark.parametrize("opFW", [op for op in ALL_FW_OPS if op.SUPPORTS_BMGHK])
+def test_forward_gqa_one_group(opFW):
+    dtype = torch.float16
+    B, Mq, Mkv, H, K = 3, 13, 16, 5, 128
+    q = torch.randn([B, Mq, 1, H, K], dtype=dtype, device="cuda") * 3
+    k = torch.randn([B, Mkv, 1, H, K], dtype=dtype, device="cuda") * 3
+    v = torch.randn([B, Mkv, 1, H, K], dtype=dtype, device="cuda") * 3
+
+    supported = opFW.supports(fmha.Inputs(q, k, v))
+    if not supported:
+        supported_bmhk = opFW.supports(fmha.Inputs(q[:, :, 0], k[:, :, 0], v[:, :, 0]))
+        assert supported == supported_bmhk
+        pytest.skip("not supported")
+    out = fmha.memory_efficient_attention_forward(q, k, v, op=opFW)
+    ref = ref_attention(q, k, v)
+    assert_allclose(
+        out.float(),
+        ref,
+        atol=opFW.ERROR_ATOL[dtype],
+        rtol=opFW.ERROR_RTOL.get(dtype, 1e-5),
+    )
+
+'''
+@sm80_or_better_only
+def test_flash_gqa_wrong_strides() -> None:
+    op = (fmha.flash.FwOp, None)
+    device = "cuda"
+    B, Mq, Mkv, G, H, K = 3, 1, 512, 2, 8, 128
+    q = torch.empty((B, Mq, G, H, K), dtype=torch.float16, device=device)
+    kv = torch.empty((B, Mkv, G, H, K), dtype=torch.float16, device=device)
+    fmha.memory_efficient_attention(q, kv, kv, op=op)
+
+    kv = torch.empty((B, Mkv, H, G, K), dtype=torch.float16, device=device).permute(
+        0, 1, 3, 2, 4
+    )
+    with pytest.raises(ValueError):
+        fmha.memory_efficient_attention(q, kv, kv, op=op)
+
+    kv = torch.empty((B, Mkv, G, 1, K), dtype=torch.float16, device=device)
+    with pytest.raises(ValueError):
+        fmha.memory_efficient_attention(q, kv, kv, op=op)
+    kv = kv.expand(-1, -1, -1, H, K)
+    fmha.memory_efficient_attention(q, kv, kv, op=op)
+
+    kv = torch.empty((B, Mkv, G, H, 2 * K), dtype=torch.float16, device=device)[
+        :, :, :, :, :K
+    ]
+    fmha.memory_efficient_attention(q, kv, kv, op=op)
+'''
+
+def _dispatches_to_splitK(q, kv):
+    return (
+        _dispatch_fw_priority_list(fmha.Inputs(q, kv, kv), False)[0]
+        is fmha.triton_splitk.FwOp
+    )
+
+
+def _dispatches_to_flash_decoding(q, kv):
+    return (
+        _dispatch_fw_priority_list(fmha.Inputs(q, kv, kv), False)[0] is fmha.flash.FwOp
+    )
+
+
+def test_dispatch_decoding_bmhk() -> None:
+    assert not _dispatches_to_splitK(
+        torch.empty([1, 8, 1, 128]), torch.empty([1, 2048, 1, 128])
+    ), "Should not use SplitK with 1 head (no tensorcores)"
+    assert _dispatches_to_flash_decoding(
+        torch.empty([1, 8, 32, 128]),
+        torch.empty([1, 2048, 1, 128]).expand(-1, -1, 32, -1),
+    ), "Should use Flash-Decoding with BMHK MQA"
+    assert not _dispatches_to_splitK(
+        torch.empty([1, 8, 32, 128]),
+        torch.empty([1, 2048, 32, 128]),
+    ), "Should not use SplitK when no TensorCores"
+    assert not _dispatches_to_splitK(
+        torch.empty([1, 128, 32, 128]),
+        torch.empty([1, 2048, 1, 128]).expand(-1, -1, 32, -1),
+    ), "Should not use SplitK if q seqlen is long"
+    assert not _dispatches_to_splitK(
+        torch.empty([128, 8, 32, 128]),
+        torch.empty([128, 2048, 1, 128]).expand(-1, -1, 32, -1),
+    ), "Should not use SplitK if B is big"
+
+
+def test_dispatch_decoding_bmghk() -> None:
+    assert not _dispatches_to_splitK(
+        torch.empty([1, 8, 1, 1, 128]), torch.empty([1, 2048, 1, 1, 128])
+    ), "Should not use SplitK with 1 head (no tensorcores)"
+    assert _dispatches_to_flash_decoding(
+        torch.empty([1, 8, 1, 32, 128]),
+        torch.empty([1, 2048, 1, 1, 128]).expand(-1, -1, -1, 32, -1),
+    ), "Should use Flash-Decoding with MQA"
+    assert _dispatches_to_flash_decoding(
+        torch.empty([1, 8, 4, 32, 128]),
+        torch.empty([1, 2048, 4, 1, 128]).expand(-1, -1, -1, 32, -1),
+    ), "Should use Flash-Decoding with GQA"
+    assert not _dispatches_to_splitK(
+        torch.empty([1, 8, 1, 32, 128]),
+        torch.empty([1, 2048, 1, 32, 128]),
+    ), "Should not use SplitK when no TensorCores"
+    assert not _dispatches_to_splitK(
+        torch.empty([1, 128, 1, 32, 128]),
+        torch.empty([1, 2048, 1, 1, 128]).expand(-1, -1, -1, 32, -1),
+    ), "Should not use SplitK if q seqlen is long"
+    assert not _dispatches_to_splitK(
+        torch.empty([128, 8, 1, 32, 128]),
+        torch.empty([128, 2048, 1, 1, 128]).expand(-1, -1, -1, 32, -1),
+    ), "Should not use SplitK if B is big"
+
+
+shapes_triton_splitk = [
+    (1, 8, 2**16, 1, 128, 128),
+    (1, 4, 2**16, 1, 128, 128),
+    (1, 16, 2**16, 1, 128, 128),
+    (1, 16, 2**16, 1, 32, 32),
+    (1, 8, 1025, 1, 128, 128),
+    (2, 8, 4096, 1, 128, 128),
+    (10, 8, 2**16, 1, 128, 128),
+    (10, 15, 2**16, 1, 128, 128),
+    (1, 3, 2**16, 1, 128, 128),
+    (1, 3, 2**16 - 10, 1, 128, 128),
+    (2, 3, 73, 1, 128, 128),
+    (2, 7, 7328, 1, 128, 128),
+    (2, 7, 7328, 1, 120, 120),
+    (2, 7, 63, 1, 120, 120),
+]
+op_device_dtype_biasT_B_Mq_Mkv_H_K_Kv_splitk = [
+    (fmha.triton_splitk.FwOp, "cuda", torch.float16, type(None), *s)
+    for s in shapes_triton_splitk
+] + [
+    (fmha.triton_splitk.FwOp, "cuda", torch.bfloat16, type(None), *s)
+    for s in shapes_triton_splitk
+]
+
+
+@pytest.mark.parametrize(
+    "opFW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv",
+    op_device_dtype_biasT_B_Mq_Mkv_H_K_Kv_splitk,
+    ids=[make_id(*c) for c in op_device_dtype_biasT_B_Mq_Mkv_H_K_Kv_splitk],
+)
+@cuda_only
+def test_forward_splitk(
+    opFW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv,
+    packed=False,
+    fmt="BMHK",
+):
+    test_forward(opFW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv, packed=packed, fmt=fmt)
+
+
+@cuda_only
+@pytest.mark.parametrize("op", [fmha.triton_splitk.FwOp])
+@pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16])
+@pytest.mark.parametrize(
+    "B_Mkv_H_K",
+    [
+        (1, 2**16, 3, 128),
+        (5, 53, 4, 64),
+    ],
+)
+def test_mqa_decoding(op: Type[fmha.AttentionFwOpBase], dtype, B_Mkv_H_K):
+    B, Mkv, H, K = B_Mkv_H_K
+    q = torch.randn([B, 1, H, K], dtype=dtype, device="cuda") * 3
+    k = torch.randn([B, Mkv, 1, K], dtype=dtype, device="cuda") * 3
+    v = torch.randn([B, Mkv, 1, K], dtype=dtype, device="cuda") * 3
+    k = k.expand(-1, -1, H, -1)
+    v = v.expand(-1, -1, H, -1)
+
+    if not op.supports(fmha.Inputs(q, k, v)):
+        pytest.skip("not supported")
+    out = fmha.memory_efficient_attention_forward(q, k, v, op=op)
+    ref = ref_attention(q, k, v)
+    assert_allclose(
+        out.float(),
+        ref,
+        atol=op.ERROR_ATOL[dtype],
+        rtol=op.ERROR_RTOL.get(dtype, 1e-5),
+    )
+
+
+@parametrize_opFW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv__xs
+def test_empty_tensors_empty_query(
+    opFW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv,
+):
+    query, key, value, attn_bias = create_tensors(
+        *opFW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv,
+        fmt="BMHK",
+    )
+    opFW = opFW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv[0]
+
+    query = query[:, :0]
+    query.requires_grad_(True)
+    key.requires_grad_(True)
+    value.requires_grad_(True)
+    out = xformers.ops.memory_efficient_attention(query, key, value, op=(opFW, None))
+    assert out.shape[1] == 0
+    out.backward(out)
+    # dK/dV should be all zeros
+    assert_allclose(key.grad, torch.zeros_like(key.grad), "key.grad")
+    assert_allclose(value.grad, torch.zeros_like(value.grad), "value.grad")
+
+
+@parametrize_opFW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv__xs
+def test_empty_tensors_empty_kv(
+    opFW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv,
+):
+    query, key, value, attn_bias = create_tensors(
+        *opFW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv,
+        fmt="BMHK",
+    )
+    opFW = opFW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv[0]
+
+    key = key[:, :0]
+    value = value[:, :0]
+    query.requires_grad_(True)
+    key.requires_grad_(True)
+    value.requires_grad_(True)
+    out = xformers.ops.memory_efficient_attention(query, key, value, op=(opFW, None))
+    assert_allclose(out, torch.zeros_like(out), "out")
+    out.backward(out)
+    # dQ should be all zeros
+    assert_allclose(query.grad, torch.zeros_like(query.grad), "query.grad")
+
+
+@parametrize_opFW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv__xs
+def test_empty_tensors_empty_b(
+    opFW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv,
+):
+    query, key, value, attn_bias = create_tensors(
+        *opFW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv,
+        fmt="BMHK",
+    )
+    opFW = opFW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv[0]
+
+    query, key, value = query[:0], key[:0], value[:0]
+    query.requires_grad_(True)
+    key.requires_grad_(True)
+    value.requires_grad_(True)
+    out = xformers.ops.memory_efficient_attention(query, key, value, op=(opFW, None))
+    out.backward(out)
+
+
+def test_local_attn_bias() -> None:
+    mask = (
+        fmha.attn_bias.LocalAttentionFromBottomRightMask(window_left=1, window_right=2)
+        .materialize(shape=(4, 4))
+        .exp()
+    )
+
+    expected = torch.tensor(
+        [[1, 1, 1, 0], [1, 1, 1, 1], [0, 1, 1, 1], [0, 0, 1, 1]], dtype=torch.float32
+    )
+    assert (mask == expected).all().item()
+
+
+@cuda_only
+@pytest.mark.parametrize("cc", [60, 70, 80])
+@pytest.mark.parametrize("maxK", [32, 64, 128, 256])
+@pytest.mark.parametrize("dtype", [torch.float32, torch.float16])
+@pytest.mark.parametrize(
+    "custom_mask_type",
+    [
+        fmha.cutlass._CustomMaskType.NoCustomMask,
+        fmha.cutlass._CustomMaskType.CausalFromTopLeft,
+        fmha.cutlass._CustomMaskType.CausalFromBottomRight,
+    ],
+)
+@pytest.mark.parametrize("window_size", [0, 3, 300])
+@pytest.mark.parametrize(
+    "num_queries,num_keys",
+    [
+        (30, 66),
+        (256, 256),
+        # Edge cases
+        (314, 320),
+        (32, 256),
+        (224, 226),
+        (5, 531),
+        (320, 332),  # for win_size=300
+        # Others
+        (256, 62),
+        (256, 63),
+        (256, 64),
+        (256, 65),
+        (256, 66),
+    ],
+)
+def test_cutlassB_iter_order(
+    dtype,
+    cc: int,
+    maxK: int,
+    num_queries: int,
+    num_keys: int,
+    custom_mask_type,
+    window_size,
+) -> None:
+    """
+    This tests some internals of the cutlassB kernel
+    We test the iteration across blocks of [queries, keys] to ensure
+    that we correctly:
+    * Iterate over all the blocks that should be iterated
+    * Do *not* iterate over blocks that are completely masked out
+    * Correctly compute the number of parallel blocks that will compute
+        the same block of dQ
+    .. and we test this across variable causal masks+local attention combinations
+    """
+    if (
+        window_size > 0
+        and custom_mask_type == fmha.cutlass._CustomMaskType.NoCustomMask
+    ):
+        pytest.skip("LocalAttention is only supported for causal")
+    get_iteration_data = partial(
+        torch.ops.xformers._cutlassB_iteration_data,
+        dtype=dtype,
+        cc=cc,
+        maxK=maxK,
+        num_queries=num_queries,
+        num_keys=num_keys,
+        custom_mask_type=custom_mask_type,
+        window_size=window_size,
+    )
+    bias = torch.zeros([num_queries, num_keys], dtype=torch.float32)
+    if custom_mask_type != fmha.cutlass._CustomMaskType.NoCustomMask:
+        bias = fmha.attn_bias._materialize_causal_mask(
+            (num_queries, num_keys),
+            dtype=torch.float32,
+            device="cpu",
+            window_size=None if window_size == 0 else window_size,
+            from_bottomright=(
+                custom_mask_type == fmha.cutlass._CustomMaskType.CausalFromBottomRight
+            ),
+        )
+
+    block_queries, block_keys = get_iteration_data()[:2]
+    mask_pooled = (
+        F.max_pool2d(bias.unsqueeze(0), (block_queries, block_keys), ceil_mode=True)
+        == 0
+    ).int()[0]
+    attn_computed = torch.zeros_like(mask_pooled)
+    for key_start in range(0, num_keys, block_keys):
+        it = 0
+        new_key_start = key_start
+        new_query_start = get_iteration_data(key_start=key_start)[2]
+        try:
+            expected_first_query = (
+                mask_pooled[:, key_start // block_keys].tolist().index(1)
+                * block_queries
+            )
+            assert (
+                new_query_start == expected_first_query
+            ), f"Wrong first query for K={key_start}: {new_query_start} (expected {expected_first_query})"
+        except ValueError:  # Nothing to compute in this column
+            pass
+
+        while new_key_start == key_start and new_query_start < num_queries:
+            query_start = new_query_start
+            attn_computed[query_start // block_queries, key_start // block_keys] += 1
+            # print(f"Compute [{query_start}, {key_start}]")
+
+            # Is there something to compute here?
+            assert mask_pooled[
+                query_start // block_queries, key_start // block_keys
+            ].item(), "Computing a block that is not needed!"
+            new_query_start, new_key_start = get_iteration_data(
+                key_start=key_start, query_start=query_start
+            )[3:5]
+            it += 1
+            assert it < num_queries, ""
+        assert (attn_computed == mask_pooled)[
+            :, key_start // block_keys
+        ].all(), "some blocks were not computed!"
+
+    # Now check that the number returned by `getNumParallelBlocksForQuery` is correct
+    for query_start in range(0, num_queries, block_queries):
+        num_parallel_blocks = get_iteration_data(
+            query_start=query_start, num_splits_key=num_keys
+        )[5]
+        num_actual = mask_pooled[query_start // block_queries].sum().item()
+        assert num_parallel_blocks == num_actual
+
+
 # end of file

From 6aef46d7905883be6bd9e25de1bc18eba95e12c4 Mon Sep 17 00:00:00 2001
From: Qianfeng Zhang <Qianfeng.Zhang@amd.com>
Date: Thu, 28 Dec 2023 00:16:56 +0000
Subject: [PATCH 315/641] Tiny update in benchmark_mem_eff_attn_decoder_ck.py

---
 xformers/benchmarks/benchmark_mem_eff_attn_decoder_ck.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/xformers/benchmarks/benchmark_mem_eff_attn_decoder_ck.py b/xformers/benchmarks/benchmark_mem_eff_attn_decoder_ck.py
index bfbe4c35b..86d4813cf 100644
--- a/xformers/benchmarks/benchmark_mem_eff_attn_decoder_ck.py
+++ b/xformers/benchmarks/benchmark_mem_eff_attn_decoder_ck.py
@@ -13,7 +13,6 @@
 
 import xformers.ops
 import xformers.ops.fmha as fmha
-import xformers.profiler.slow_ops_profiler
 
 torch.backends.cuda.matmul.allow_tf32 = False
 

From 4a1cea0d1f44204afc97e4518a6bfd13f513acff Mon Sep 17 00:00:00 2001
From: Qianfeng Zhang <Qianfeng.Zhang@amd.com>
Date: Thu, 28 Dec 2023 00:29:33 +0000
Subject: [PATCH 316/641] Synchronize benchmark_mem_eff_attention_ck.py with
 benchmark_mem_eff_attention.py

---
 .../benchmark_mem_eff_attention_ck.py         | 131 +++++++++++-------
 1 file changed, 79 insertions(+), 52 deletions(-)

diff --git a/xformers/benchmarks/benchmark_mem_eff_attention_ck.py b/xformers/benchmarks/benchmark_mem_eff_attention_ck.py
index 0c754d8c1..e683a7f06 100644
--- a/xformers/benchmarks/benchmark_mem_eff_attention_ck.py
+++ b/xformers/benchmarks/benchmark_mem_eff_attention_ck.py
@@ -14,31 +14,11 @@
 
 import xformers.ops
 import xformers.ops.fmha as fmha
+from xformers.attn_bias_utils import create_attn_bias
 
 torch.backends.cuda.matmul.allow_tf32 = False
 
 
-def create_attn_bias(
-    bias_type,
-    batch_size: int,
-    num_heads: int,
-    q_len: int,
-    kv_len: int,
-    device,
-    dtype,
-    bias_requires_grad: bool = False,
-):
-    NoneType = type(None)
-    if bias_type is NoneType:
-        return None
-    if bias_type is torch.Tensor:
-        attn_bias = torch.randn((1, 1, q_len, kv_len), device=device, dtype=dtype)
-        return attn_bias.expand(batch_size, num_heads, q_len, kv_len)
-    if bias_type is xformers.ops.LowerTriangularMask:
-        return bias_type()
-    assert False, f"Unsupported bias type: {bias_type}"
-
-
 def ref_attention_bmk(q, k, v, attn_bias=None, p=0.0):
     if isinstance(attn_bias, xformers.ops.AttentionMask):
         attn_bias = (
@@ -160,6 +140,12 @@ def product_dict(**kwargs):
                 {"attn_bias_cfg": (torch.Tensor, False)},
                 {"attn_bias_cfg": (torch.Tensor, True)},
                 {"attn_bias_cfg": (xformers.ops.LowerTriangularMask, False)},
+                {
+                    "attn_bias_cfg": (
+                        xformers.ops.fmha.attn_bias.BlockDiagonalCausalWithOffsetPaddedKeysMask,
+                        False,
+                    )
+                },
                 {"dtype": torch.bfloat16},
                 ##{"dtype": torch.float},
             ]
@@ -168,31 +154,40 @@ def product_dict(**kwargs):
     CASES.append(c)
 
 
-def create_tensors(shape, dtype, requires_grad=False):
-    B, M, H, K = shape
+def create_tensors(shape, dtype, requires_grad=False, packed=True, multiquery=False):
+    stacked_shape = list(shape)  # B, M, H, K
+    stacked_dim = 2 if packed else 0
+    stacked_shape.insert(stacked_dim, 3)
     qkv = torch.rand(
-        [B, M, 3, H, K], device=device, dtype=dtype, requires_grad=requires_grad
+        stacked_shape, device=device, dtype=dtype, requires_grad=requires_grad
     )
-    q, k, v = xformers.ops.unbind(qkv, 2)
+    q = torch.rand(shape, device=device, dtype=dtype, requires_grad=requires_grad)
+    shape_kv = (shape[0], shape[1], 1 if multiquery else shape[2], shape[3])
+    k = torch.rand(
+        shape_kv, device=device, dtype=dtype, requires_grad=requires_grad
+    ).expand(shape)
+    v = torch.rand(
+        shape_kv, device=device, dtype=dtype, requires_grad=requires_grad
+    ).expand(shape)
     return qkv, q, k, v
 
-def mem_eff_attention_fw(shape, num_threads: int, attn_bias_cfg, dropout_p, dtype):
+
+def mem_eff_attention_fw(
+    shape,
+    num_threads: int,
+    attn_bias_cfg,
+    dropout_p,
+    dtype,
+    packed=True,
+    multiquery=False,
+):
     B, M, H, K = shape
-    _, q, k, v = create_tensors(shape, dtype)
+    _, q, k, v = create_tensors(
+        shape, dtype, requires_grad=False, packed=packed, multiquery=multiquery
+    )
     attn_bias_type, attn_bias_requires_grad = attn_bias_cfg
     if attn_bias_requires_grad:
         return
-    bias = create_attn_bias(
-        attn_bias_type,
-        batch_size=B,
-        num_heads=H,
-        q_len=M,
-        kv_len=M,
-        device=device,
-        dtype=dtype,
-        bias_requires_grad=attn_bias_requires_grad,
-    )
-    inp = fmha.Inputs(query=q, key=k, value=v, attn_bias=bias, p=dropout_p)
 
     dtype_str = {
         torch.bfloat16: "b16",
@@ -206,6 +201,28 @@ def mem_eff_attention_fw(shape, num_threads: int, attn_bias_cfg, dropout_p, dtyp
 
     has_run = False
     for fw_op, bw_op in OPS:
+        bias = create_attn_bias(
+            attn_bias_type,
+            batch_size=B,
+            num_heads=H,
+            num_heads_groups=1,
+            q_len=M,
+            kv_len=M,
+            dtype=dtype,
+            device=device,
+            requires_grad=attn_bias_requires_grad,
+            fmt="BMHK",
+            op=fw_op,
+        )
+        inp = fmha.Inputs(query=q, key=k, value=v, attn_bias=bias, p=dropout_p)
+        if isinstance(
+            bias,
+            (
+                fmha.attn_bias.BlockDiagonalMask,
+                fmha.attn_bias.BlockDiagonalCausalWithOffsetPaddedKeysMask,
+            ),
+        ):
+            q, k, v = [x.reshape([1, -1, *x.shape[2:]]) for x in [q, k, v]]
         if not fw_op.supports(inp):
             continue
 
@@ -250,20 +267,9 @@ def mem_eff_attention_fw(shape, num_threads: int, attn_bias_cfg, dropout_p, dtyp
 
 def mem_eff_attention_bw(shape, num_threads: int, attn_bias_cfg, dropout_p, dtype):
     B, M, H, K = shape
-    _, q, k, v = create_tensors(shape, dtype, requires_grad=True)
+    qkv, q, k, v = create_tensors(shape, dtype, requires_grad=True)
 
     attn_bias_type, attn_bias_requires_grad = attn_bias_cfg
-    bias = create_attn_bias(
-        attn_bias_type,
-        batch_size=B,
-        num_heads=H,
-        q_len=M,
-        kv_len=M,
-        device=device,
-        dtype=dtype,
-        bias_requires_grad=attn_bias_requires_grad,
-    )
-    inp = fmha.Inputs(query=q, key=k, value=v, attn_bias=bias, p=dropout_p)
 
     dtype_str = {
         torch.bfloat16: "b16",
@@ -277,6 +283,21 @@ def mem_eff_attention_bw(shape, num_threads: int, attn_bias_cfg, dropout_p, dtyp
 
     has_run = False
     for fw_op, bw_op in OPS:
+        bias = create_attn_bias(
+            attn_bias_type,
+            batch_size=B,
+            num_heads=H,
+            num_heads_groups=1,
+            q_len=M,
+            kv_len=M,
+            dtype=dtype,
+            device=device,
+            requires_grad=attn_bias_requires_grad,
+            fmt="BMHK",
+            op=bw_op,
+        )
+        inp = fmha.Inputs(query=q, key=k, value=v, attn_bias=bias, p=dropout_p)
+
         if not fw_op.supports(inp) or not bw_op.supports(inp):
             continue
         has_run = True
@@ -312,5 +333,11 @@ def mem_eff_attention_bw(shape, num_threads: int, attn_bias_cfg, dropout_p, dtyp
         num_threads=num_threads,
     )
 
-benchmark_main_helper(mem_eff_attention_fw, CASES, min_run_time=min_run_time)
-benchmark_main_helper(mem_eff_attention_bw, CASES, min_run_time=min_run_time)
+
+def main():
+    benchmark_main_helper(mem_eff_attention_fw, CASES, min_run_time=min_run_time)
+    benchmark_main_helper(mem_eff_attention_bw, CASES, min_run_time=min_run_time)
+
+
+if __name__ == "__main__":
+    main()

From c5ca494c8cd89ad977504569a28434c4faf7fc2b Mon Sep 17 00:00:00 2001
From: Qianfeng Zhang <Qianfeng.Zhang@amd.com>
Date: Thu, 28 Dec 2023 22:31:34 +0000
Subject: [PATCH 317/641] Remove benchmark_mem_eff_attn_decoder_ck_tiled.py

---
 ...benchmark_mem_eff_attn_decoder_ck_tiled.py | 210 ------------------
 1 file changed, 210 deletions(-)
 delete mode 100644 xformers/benchmarks/benchmark_mem_eff_attn_decoder_ck_tiled.py

diff --git a/xformers/benchmarks/benchmark_mem_eff_attn_decoder_ck_tiled.py b/xformers/benchmarks/benchmark_mem_eff_attn_decoder_ck_tiled.py
deleted file mode 100644
index 1e8239ace..000000000
--- a/xformers/benchmarks/benchmark_mem_eff_attn_decoder_ck_tiled.py
+++ /dev/null
@@ -1,210 +0,0 @@
-# Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
-#
-# This source code is licensed under the BSD license found in the
-# LICENSE file in the root directory of this source tree.
-
-
-import itertools
-from functools import partial
-
-import torch
-from torch.utils import benchmark
-from utils import benchmark_main_helper
-
-import xformers.ops
-import xformers.ops.fmha as fmha
-import xformers.profiler.slow_ops_profiler
-
-torch.backends.cuda.matmul.allow_tf32 = False
-
-# Run with
-#  python xformers/benchmarks/benchmark_mem_eff_attn_decoder.py --omit-baselines --quiet
-# The baselines for these benchmarks are really slow because there is
-# so much padding in the inputs, so there is no point running them.
-
-
-def ref_attention_bmk(q, k, v, attn_bias=None):
-    if isinstance(attn_bias, xformers.ops.AttentionMask):
-        attn_bias = (
-            attn_bias.materialize((q.shape[0], 1, q.shape[1], k.shape[1]))
-            .to(q)
-            .squeeze()
-        )
-    q = q * (1.0 / q.shape[-1] ** 0.5)
-    if attn_bias is None:
-        attn = q @ k.transpose(-2, -1)
-    else:
-        # equivalent to (q @ k.transpose(-2, -1) + m).softmax(-1) @ v
-        # but faster, and is what is used in PyTorch now
-        attn = torch.baddbmm(attn_bias, q, k.transpose(-2, -1))
-    attn = attn.softmax(-1)
-    return attn @ v
-
-
-def ref_attention(q, k, v, attn_bias):
-    assert q.ndim == 4
-
-    def T(t):
-        return t.permute((0, 2, 1, 3)).reshape(
-            [t.shape[0] * t.shape[2], t.shape[1], t.shape[3]]
-        )
-
-    out = ref_attention_bmk(T(q), T(k), T(v), attn_bias)
-    out = out.reshape([q.shape[0], q.shape[2], q.shape[1], v.shape[3]])
-    return out.permute((0, 2, 1, 3))
-
-
-min_run_time = 0.5
-device = torch.device("cuda")
-
-NUM_THREADS = [1] if device.type == "cuda" else [1, 40]
-
-OPS = [
-    xformers.ops.fmha.ck.FwOp,
-    ##xformers.ops.fmha.ck_decoder.FwOp
-]
-
-KV_SHAPES = [
-    # list of n_keys, padding_length, batchsize
-    (2, 64, 3),
-    (32, 1024, 500),  
-    (1000, 1024, 2),
-    (8000, 8192, 1),
-    (240, 256, 32),
-    (2048, 2 * 1024, 4),
-    (4096 * 2, 8 * 1024, 1),
-]
-
-N_HEADS = [8, 16, 64]
-
-
-def product_dict(**kwargs):
-    keys = kwargs.keys()
-    vals = kwargs.values()
-    for instance in itertools.product(*vals):
-        yield dict(zip(keys, instance))
-
-
-CASES = list(
-    product_dict(
-        kv_shape=KV_SHAPES,
-        n_heads=N_HEADS,
-        num_threads=NUM_THREADS,
-        multiquery=[True, False],
-    )
-)
-
-def get_memory_traffic(op, q, k, v, bias):
-    # mem_size = ( batch_size * seq_len * 1 * dim_per_head * 2 (K/V) + 
-    #              batch_size * 1 * num_heads * dim_per_head (Q) + 
-    #              batch_size * seq_len * num_heads * dim_per_head (attn_output) ) * bytes_per_element
-    out = xformers.ops.memory_efficient_attention_forward(q, k, v, bias, op=op)
-    dtype = q.dtype
-    multiquery = k.stride(2) == 0
-    n_heads = q.shape[-2]
-    dim_per_head = q.shape[-1]
-    kv_seqlen = bias.k_seqinfo.seqlen_py
-    bytes_per_element = 4 if dtype is torch.float32 else 2 if dtype in (torch.float16, torch.bfloat16) else None
-    mem_size = 0
-    mem_size += q.numel() * bytes_per_element # Q
-    for s in kv_seqlen: # len(kv_seqlen) == batch_size
-        mem_size += s * (1 if multiquery else n_heads) * dim_per_head * bytes_per_element * 2 # K, V
-    mem_size += out.numel() * bytes_per_element # attn_output
-    return mem_size
-
-def mem_eff_attention_decoder(
-    kv_shape, n_heads: int, num_threads: int, multiquery: bool
-):
-    n_keys, padding, B = kv_shape
-    torch.manual_seed(42)
-    k_seqlen = torch.randint(1, n_keys + 1, (B,)).tolist()
-    K = 128
-    ##dtype = torch.bfloat16
-    dtype = torch.float16
-    q = torch.rand(1, B, n_heads, K, device=device, dtype=dtype)
-    if multiquery:
-        k = torch.rand(
-            1, B * padding, 1, K, device=device, dtype=dtype
-        ).expand(1, B * padding, n_heads, K)
-        v = torch.rand(
-            1, B * padding, 1, K, device=device, dtype=dtype
-        ).expand(1, B * padding, n_heads, K)
-    else:
-        k = torch.rand(1, B * padding, n_heads, K, device=device, dtype=dtype)
-        v = torch.rand(1, B * padding, n_heads, K, device=device, dtype=dtype)
-
-    bias = fmha.attn_bias.BlockDiagonalCausalWithOffsetPaddedKeysMask.from_seqlens(
-        q_seqlen=[1] * B,
-        kv_seqlen=k_seqlen,
-        kv_padding=padding,
-    )
-
-    sub_label = f"{B}batch-{k_seqlen[0]}keys-{n_heads}heads"
-    if multiquery:
-        sub_label += "-mq"
-
-    has_run = False
-
-    for fw_op in OPS:
-        inp = fmha.Inputs(q, k, v, attn_bias=bias)
-        if (skip_reasons := fw_op.not_supported_reasons(inp)):
-            print(f"Skip benchmark: {skip_reasons=}")
-            continue
-
-        fn = partial(xformers.ops.memory_efficient_attention_forward, op=fw_op)
-
-        mem_size = get_memory_traffic(fw_op, q, k, v, bias)
-
-        yield benchmark.Timer(
-            stmt=f"fn(q, k, v, attn_bias)",
-            globals={
-                "q": q,
-                "k": k,
-                "v": v,
-                "attn_bias": bias,
-                "fn": fn,
-            },
-            label="attention",
-            description=fw_op.NAME,
-            sub_label=f"{sub_label}_{mem_size//1024}k",
-            num_threads=num_threads,
-        )
-
-        graph = torch.cuda.CUDAGraph()
-        with torch.cuda.graph(graph):
-            fn(q, k, v, bias)
-        yield benchmark.Timer(
-            stmt="graph.replay()",
-            globals={
-                "graph": graph,
-            },
-            label="cuda graphed attention",
-            description=fw_op.NAME,
-            sub_label=f"{sub_label}_{mem_size//1024}k",
-            num_threads=num_threads,
-        )
-
-        has_run = True
-
-    if not has_run:
-        return
-
-    RUN_BASELINES = False
-    if RUN_BASELINES:
-        yield benchmark.Timer(
-            stmt="fn(q, k, v, attn_bias)",
-            globals={
-                "q": q,
-                "k": k,
-                "v": v,
-                "attn_bias": bias,
-                "fn": ref_attention,
-            },
-            label="attention",
-            description="eager",
-            sub_label=sub_label,
-            num_threads=num_threads,
-        )
-
-
-benchmark_main_helper(mem_eff_attention_decoder, CASES, min_run_time=min_run_time)

From 8ebfd5fa745d6f62a5aca8b27bb69ac7885d8b8d Mon Sep 17 00:00:00 2001
From: Qianfeng Zhang <Qianfeng.Zhang@amd.com>
Date: Wed, 3 Jan 2024 23:06:53 +0000
Subject: [PATCH 318/641] Support for Generic Attention Mask Coordinate

---
 setup.py                                      |   5 +-
 third_party/composable_kernel_tiled           |   2 +-
 xformers/csrc/attention/attention.cpp         |   8 +
 .../attention_forward_generic_ck_tiled.cpp    |   5 +-
 .../attention/hip_fmha/ck_tiled_bool_switch.h |   9 +
 .../hip_fmha/ck_tiled_fmha_batched_infer.h    | 130 ++++---
 .../ck_tiled_fmha_batched_infer_bp16.cpp      |  44 +--
 .../ck_tiled_fmha_batched_infer_fp16.cpp      |  44 +--
 .../hip_fmha/ck_tiled_fmha_definitions.h      |   4 +-
 .../hip_fmha/ck_tiled_fmha_forward_kernel.h   | 333 +++++++++---------
 .../hip_fmha/ck_tiled_fmha_grouped_infer.h    |  80 +++--
 .../ck_tiled_fmha_grouped_infer_bp16.cpp      |  44 +--
 .../ck_tiled_fmha_grouped_infer_fp16.cpp      |  44 +--
 .../attention/hip_fmha/ck_tiled_fmha_params.h |   2 +
 ...ched_infer_bp16_masktype_0_no_attnbias.cpp |  13 -
 ...ed_infer_bp16_masktype_0_with_attnbias.cpp |  13 -
 ...ched_infer_bp16_masktype_1_no_attnbias.cpp |  13 -
 ...ched_infer_bp16_masktype_2_no_attnbias.cpp |  13 -
 ...ed_infer_bp16_masktype_2_with_attnbias.cpp |  13 -
 ..._infer_bp16_no_causalmask_no_attnbias.cpp} |   5 +-
 ...nfer_bp16_no_causalmask_with_attnbias.cpp} |   5 +-
 ...nfer_bp16_with_causalmask_no_attnbias.cpp} |   5 +-
 ...er_bp16_with_causalmask_with_attnbias.cpp} |   5 +-
 ...ched_infer_fp16_masktype_0_no_attnbias.cpp |  13 -
 ...ched_infer_fp16_masktype_1_no_attnbias.cpp |  13 -
 ...ched_infer_fp16_masktype_2_no_attnbias.cpp |  13 -
 ...d_infer_fp16_no_causalmask_no_attnbias.cpp |  12 +
 ...infer_fp16_no_causalmask_with_attnbias.cpp |  12 +
 ...infer_fp16_with_causalmask_no_attnbias.cpp |  12 +
 ...fer_fp16_with_causalmask_with_attnbias.cpp |  12 +
 ...uped_infer_bp16_masktype_0_no_attnbias.cpp |  13 -
 ...ed_infer_bp16_masktype_0_with_attnbias.cpp |  13 -
 ...uped_infer_bp16_masktype_1_no_attnbias.cpp |  13 -
 ...uped_infer_bp16_masktype_2_no_attnbias.cpp |  13 -
 ...ed_infer_bp16_masktype_2_with_attnbias.cpp |  13 -
 ..._infer_bp16_no_causalmask_no_attnbias.cpp} |   5 +-
 ...nfer_bp16_no_causalmask_with_attnbias.cpp} |   5 +-
 ...nfer_bp16_with_causalmask_no_attnbias.cpp} |   5 +-
 ...er_bp16_with_causalmask_with_attnbias.cpp} |   5 +-
 ...uped_infer_fp16_masktype_0_no_attnbias.cpp |  13 -
 ...uped_infer_fp16_masktype_1_no_attnbias.cpp |  13 -
 ...uped_infer_fp16_masktype_2_no_attnbias.cpp |  13 -
 ...d_infer_fp16_no_causalmask_no_attnbias.cpp |  12 +
 ...infer_fp16_no_causalmask_with_attnbias.cpp |  12 +
 ...infer_fp16_with_causalmask_no_attnbias.cpp |  12 +
 ...fer_fp16_with_causalmask_with_attnbias.cpp |  12 +
 xformers/ops/fmha/ck.py                       |   1 +
 47 files changed, 488 insertions(+), 611 deletions(-)
 create mode 100644 xformers/csrc/attention/hip_fmha/ck_tiled_bool_switch.h
 delete mode 100644 xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_bp16_masktype_0_no_attnbias.cpp
 delete mode 100644 xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_bp16_masktype_0_with_attnbias.cpp
 delete mode 100644 xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_bp16_masktype_1_no_attnbias.cpp
 delete mode 100644 xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_bp16_masktype_2_no_attnbias.cpp
 delete mode 100644 xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_bp16_masktype_2_with_attnbias.cpp
 rename xformers/csrc/attention/hip_fmha/instances_tiled/{ck_tiled_fmha_batched_infer_fp16_masktype_1_with_attnbias.cpp => ck_tiled_fmha_batched_infer_bp16_no_causalmask_no_attnbias.cpp} (58%)
 rename xformers/csrc/attention/hip_fmha/instances_tiled/{ck_tiled_fmha_batched_infer_fp16_masktype_2_with_attnbias.cpp => ck_tiled_fmha_batched_infer_bp16_no_causalmask_with_attnbias.cpp} (58%)
 rename xformers/csrc/attention/hip_fmha/instances_tiled/{ck_tiled_fmha_batched_infer_fp16_masktype_0_with_attnbias.cpp => ck_tiled_fmha_batched_infer_bp16_with_causalmask_no_attnbias.cpp} (58%)
 rename xformers/csrc/attention/hip_fmha/instances_tiled/{ck_tiled_fmha_batched_infer_bp16_masktype_1_with_attnbias.cpp => ck_tiled_fmha_batched_infer_bp16_with_causalmask_with_attnbias.cpp} (58%)
 delete mode 100644 xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_fp16_masktype_0_no_attnbias.cpp
 delete mode 100644 xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_fp16_masktype_1_no_attnbias.cpp
 delete mode 100644 xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_fp16_masktype_2_no_attnbias.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_fp16_no_causalmask_no_attnbias.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_fp16_no_causalmask_with_attnbias.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_fp16_with_causalmask_no_attnbias.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_fp16_with_causalmask_with_attnbias.cpp
 delete mode 100644 xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_bp16_masktype_0_no_attnbias.cpp
 delete mode 100644 xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_bp16_masktype_0_with_attnbias.cpp
 delete mode 100644 xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_bp16_masktype_1_no_attnbias.cpp
 delete mode 100644 xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_bp16_masktype_2_no_attnbias.cpp
 delete mode 100644 xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_bp16_masktype_2_with_attnbias.cpp
 rename xformers/csrc/attention/hip_fmha/instances_tiled/{ck_tiled_fmha_grouped_infer_fp16_masktype_1_with_attnbias.cpp => ck_tiled_fmha_grouped_infer_bp16_no_causalmask_no_attnbias.cpp} (58%)
 rename xformers/csrc/attention/hip_fmha/instances_tiled/{ck_tiled_fmha_grouped_infer_fp16_masktype_2_with_attnbias.cpp => ck_tiled_fmha_grouped_infer_bp16_no_causalmask_with_attnbias.cpp} (58%)
 rename xformers/csrc/attention/hip_fmha/instances_tiled/{ck_tiled_fmha_grouped_infer_fp16_masktype_0_with_attnbias.cpp => ck_tiled_fmha_grouped_infer_bp16_with_causalmask_no_attnbias.cpp} (58%)
 rename xformers/csrc/attention/hip_fmha/instances_tiled/{ck_tiled_fmha_grouped_infer_bp16_masktype_1_with_attnbias.cpp => ck_tiled_fmha_grouped_infer_bp16_with_causalmask_with_attnbias.cpp} (58%)
 delete mode 100644 xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_fp16_masktype_0_no_attnbias.cpp
 delete mode 100644 xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_fp16_masktype_1_no_attnbias.cpp
 delete mode 100644 xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_fp16_masktype_2_no_attnbias.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_fp16_no_causalmask_no_attnbias.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_fp16_no_causalmask_with_attnbias.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_fp16_with_causalmask_no_attnbias.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_fp16_with_causalmask_with_attnbias.cpp

diff --git a/setup.py b/setup.py
index 517a78b63..84629d229 100644
--- a/setup.py
+++ b/setup.py
@@ -346,7 +346,10 @@ def get_extensions():
        else:
            include_dirs += [ Path(this_dir) / 'third_party' / 'composable_kernel' / 'include']
            
-       generator_flag = []
+       if os.getenv("FORCE_CK_TILED_KERNEL", "0") == "1":
+           generator_flag = ["-DUSE_CK_TILED_KERNEL"]
+       else:
+           generator_flag = []
        cc_flag = ["-DBUILD_PYTHON_PACKAGE"]
        extra_compile_args={
             "cxx": ["-O3", "-std=c++17"] + generator_flag,
diff --git a/third_party/composable_kernel_tiled b/third_party/composable_kernel_tiled
index 3ffae938a..afea7392d 160000
--- a/third_party/composable_kernel_tiled
+++ b/third_party/composable_kernel_tiled
@@ -1 +1 @@
-Subproject commit 3ffae938aca3d595cdae4e89564a6d063c09d0b5
+Subproject commit afea7392d59cbd71247336483f5cf190c0929866
diff --git a/xformers/csrc/attention/attention.cpp b/xformers/csrc/attention/attention.cpp
index 5b379a724..3989ebd29 100644
--- a/xformers/csrc/attention/attention.cpp
+++ b/xformers/csrc/attention/attention.cpp
@@ -25,11 +25,19 @@ TORCH_LIBRARY_FRAGMENT(xformers, m) {
       "xformers::_cutlass_rand_uniform(float p, Tensor out) -> Tensor"));
 #endif
 #if defined(USE_ROCM)
+#if defined(USE_CK_TILED_KERNEL)
+  m.def(TORCH_SELECTIVE_SCHEMA(
+      "xformers::efficient_attention_forward_ck(Tensor query, "
+          "Tensor key, Tensor value, Tensor? attn_bias, Tensor? seqstart_q, "
+          "Tensor? seqstart_k, int? max_seqlen_q, float dropout_p, "
+          "bool compute_logsumexp, int custom_mask_type, float? scale, Tensor? seqlen_k, int? window_size) -> (Tensor, Tensor, int, int)"));
+#else
   m.def(TORCH_SELECTIVE_SCHEMA(
       "xformers::efficient_attention_forward_ck(Tensor query, "
           "Tensor key, Tensor value, Tensor? attn_bias, Tensor? seqstart_q, "
           "Tensor? seqstart_k, int? max_seqlen_q, float dropout_p, "
           "bool compute_logsumexp, int custom_mask_type, float? scale, Tensor? seqlen_k) -> (Tensor, Tensor, int, int)"));
+#endif
   m.def(TORCH_SELECTIVE_SCHEMA(
       "xformers::efficient_attention_forward_decoder_ck(Tensor query, "
           "Tensor key, Tensor value, Tensor? seq_positions, float scale) -> Tensor"));
diff --git a/xformers/csrc/attention/hip_fmha/attention_forward_generic_ck_tiled.cpp b/xformers/csrc/attention/hip_fmha/attention_forward_generic_ck_tiled.cpp
index dbaecf40f..d63f0d6bf 100644
--- a/xformers/csrc/attention/hip_fmha/attention_forward_generic_ck_tiled.cpp
+++ b/xformers/csrc/attention/hip_fmha/attention_forward_generic_ck_tiled.cpp
@@ -65,7 +65,8 @@ std::tuple<at::Tensor, at::Tensor, int64_t, int64_t> efficient_attention_forward
     bool compute_logsumexp,
     int64_t custom_mask_type,
     c10::optional<double> scale,
-    const c10::optional<at::Tensor>& seqlen_k)
+    const c10::optional<at::Tensor>& seqlen_k,
+    const c10::optional<int64_t> window_size)
 {
     TORCH_CHECK(query.dim() == 4);
     TORCH_CHECK(key.dim() == 4);
@@ -206,6 +207,7 @@ std::tuple<at::Tensor, at::Tensor, int64_t, int64_t> efficient_attention_forward
             p.has_attn_bias = false;
 
         p.custom_mask_type = custom_mask_type;
+        p.window_size      = window_size.has_value() ? (*window_size > 0 ? *window_size : 0) : 0;
 
         p.use_dropout       = use_dropout;
         p.philox_seed       = philox_seed;
@@ -287,6 +289,7 @@ std::tuple<at::Tensor, at::Tensor, int64_t, int64_t> efficient_attention_forward
             p.has_attn_bias = false;
 
         p.custom_mask_type = custom_mask_type;
+        p.window_size      = window_size.has_value() ? (*window_size > 0 ? *window_size : 0) : 0;
 
         // max_seqlen_q is used to create logsumexp tensor
         p.max_seqlen_q = *max_seqlen_q_;
diff --git a/xformers/csrc/attention/hip_fmha/ck_tiled_bool_switch.h b/xformers/csrc/attention/hip_fmha/ck_tiled_bool_switch.h
new file mode 100644
index 000000000..c07559a3c
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/ck_tiled_bool_switch.h
@@ -0,0 +1,9 @@
+/*
+ * Copyright (c) 2023-2024, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#pragma once
+
+#include <ck/utility/static_switch.hpp>
diff --git a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_infer.h b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_infer.h
index 9ad19cb6f..2ea3d4f50 100644
--- a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_infer.h
+++ b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_infer.h
@@ -25,6 +25,7 @@
 #include <ck/tile_program/block_tile_pipeline/block_fmha_pipeline_qr_ks_vs_default_policy.hpp>
 #include <ck/tile_program/tile/tile_fmha_shape.hpp>
 #include <ck/tile_program/tile/tile_fmha_traits.hpp>
+#include <ck/tile_program/block_tile/block_masking.hpp>
 
 #include "ck_tiled_fmha_forward_kernel.h"
 #include "ck_tiled_fmha_fwd_epilogue.h"
@@ -32,8 +33,10 @@
 #include "ck_tiled_fmha_params.h"
 #include "ck_tiled_fmha_definitions.h"
 
-template <typename scalar_t, int32_t custom_mask_type, bool has_attn_bias>
-struct batched_infer_masktype_attnbias_dispatched
+#include "ck_tiled_bool_switch.h"
+
+template <typename scalar_t, bool has_causal_mask, bool has_attn_bias>
+struct batched_infer_causalmask_attnbias_dispatched
 {
     using QDataType           = scalar_t;
     using KDataType           = scalar_t;
@@ -47,9 +50,6 @@ struct batched_infer_masktype_attnbias_dispatched
 
     using VLayout = ck::tensor_layout::gemm::RowMajor;
 
-    static constexpr auto masktype = static_cast<CausalMaskType>(custom_mask_type);
-    using FmhaCausalMask           = typename CausalMaskPredicate<masktype>::predicate;
-
     using FmhaBlockTileHdim64  = ck::Sequence<128, 64, 32, 64, 32, 64>;
     using FmhaBlockTileHdim128 = ck::Sequence<128, 128, 32, 128, 32, 128>;
     using FmhaBlockWarps       = ck::Sequence<4, 1, 1>;
@@ -89,7 +89,7 @@ struct batched_infer_masktype_attnbias_dispatched
     }()
 #endif
 
-    template <typename FmhaTraits, typename FmhaShape>
+    template <typename FmhaTraits, typename FmhaShape, typename FmhaMask>
     using FmhaPipelineProblemTemp =
         ck::tile_program::block::BlockFmhaPipelineProblem<QDataType,
                                                           KDataType,
@@ -103,54 +103,73 @@ struct batched_infer_masktype_attnbias_dispatched
                                                           256, // BlockSize
                                                           FmhaShape,
                                                           false, // kIsGroupMode
-                                                          FmhaCausalMask,
+                                                          FmhaMask,
                                                           FmhaTraits>;
 
     static void Run(BatchedForwardParams& param, hipStream_t stream)
     {
-        BATCHED_INFER_HEADDIM_SWITCH(param.K, param.Kv, [&] {
-            using FmhaTilePartitioner = FmhaFwdTilePartitioner<FmhaShape>;
-
-            if(param.M % FmhaShape::kM0 == 0 && param.N % FmhaShape::kN0 == 0)
-            {
-                using FmhaTraits = ck::tile_program::TileFmhaTraits<false, false, has_attn_bias>;
-                using FmhaPipelineProblem = FmhaPipelineProblemTemp<FmhaTraits, FmhaShape>;
-                using FmhaPipeline =
-                    ck::tile_program::block::BlockFmhaPipelineQRKSVSAsync<FmhaPipelineProblem>;
-                using FmhaKernel = FmhaFwdKernel<FmhaTilePartitioner, FmhaPipeline, FmhaEpilogue>;
-
-                RunWithKernel<FmhaKernel>(param, stream);
-            }
-            else if(param.M % FmhaShape::kM0 == 0 && param.N % FmhaShape::kN0 != 0)
-            {
-                using FmhaTraits = ck::tile_program::TileFmhaTraits<false, true, has_attn_bias>;
-                using FmhaPipelineProblem = FmhaPipelineProblemTemp<FmhaTraits, FmhaShape>;
-                using FmhaPipeline =
-                    ck::tile_program::block::BlockFmhaPipelineQRKSVS<FmhaPipelineProblem>;
-                using FmhaKernel = FmhaFwdKernel<FmhaTilePartitioner, FmhaPipeline, FmhaEpilogue>;
-
-                RunWithKernel<FmhaKernel>(param, stream);
-            }
-            else if(param.M % FmhaShape::kM0 != 0 && param.N % FmhaShape::kN0 == 0)
-            {
-                using FmhaTraits = ck::tile_program::TileFmhaTraits<true, false, has_attn_bias>;
-                using FmhaPipelineProblem = FmhaPipelineProblemTemp<FmhaTraits, FmhaShape>;
-                using FmhaPipeline =
-                    ck::tile_program::block::BlockFmhaPipelineQRKSVS<FmhaPipelineProblem>;
-                using FmhaKernel = FmhaFwdKernel<FmhaTilePartitioner, FmhaPipeline, FmhaEpilogue>;
-
-                RunWithKernel<FmhaKernel>(param, stream);
-            }
-            else if(param.M % FmhaShape::kM0 != 0 && param.N % FmhaShape::kN0 != 0)
-            {
-                using FmhaTraits = ck::tile_program::TileFmhaTraits<true, true, has_attn_bias>;
-                using FmhaPipelineProblem = FmhaPipelineProblemTemp<FmhaTraits, FmhaShape>;
-                using FmhaPipeline =
-                    ck::tile_program::block::BlockFmhaPipelineQRKSVS<FmhaPipelineProblem>;
-                using FmhaKernel = FmhaFwdKernel<FmhaTilePartitioner, FmhaPipeline, FmhaEpilogue>;
-
-                RunWithKernel<FmhaKernel>(param, stream);
-            };
+        const bool has_local_attention = (param.window_size > 0) ? true : false;
+
+        BOOL_SWITCH(has_local_attention, USE_LOCAL_ATTENTION, [&] {
+            constexpr bool has_masking = has_causal_mask || USE_LOCAL_ATTENTION;
+
+            using FmhaMask =
+                ck::tile_program::block::GenericAttentionMask<has_masking, USE_LOCAL_ATTENTION>;
+
+            BATCHED_INFER_HEADDIM_SWITCH(param.K, param.Kv, [&] {
+                using FmhaTilePartitioner = FmhaFwdTilePartitioner<FmhaShape>;
+
+                if(param.M % FmhaShape::kM0 == 0 && param.N % FmhaShape::kN0 == 0)
+                {
+                    using FmhaTraits =
+                        ck::tile_program::TileFmhaTraits<false, false, has_attn_bias>;
+
+                    using FmhaPipelineProblem =
+                        FmhaPipelineProblemTemp<FmhaTraits, FmhaShape, FmhaMask>;
+                    using FmhaPipeline =
+                        ck::tile_program::block::BlockFmhaPipelineQRKSVSAsync<FmhaPipelineProblem>;
+                    using FmhaKernel =
+                        FmhaFwdKernel<FmhaTilePartitioner, FmhaPipeline, FmhaEpilogue>;
+
+                    RunWithKernel<FmhaKernel>(param, stream);
+                }
+                else if(param.M % FmhaShape::kM0 == 0 && param.N % FmhaShape::kN0 != 0)
+                {
+                    using FmhaTraits = ck::tile_program::TileFmhaTraits<false, true, has_attn_bias>;
+                    using FmhaPipelineProblem =
+                        FmhaPipelineProblemTemp<FmhaTraits, FmhaShape, FmhaMask>;
+                    using FmhaPipeline =
+                        ck::tile_program::block::BlockFmhaPipelineQRKSVS<FmhaPipelineProblem>;
+                    using FmhaKernel =
+                        FmhaFwdKernel<FmhaTilePartitioner, FmhaPipeline, FmhaEpilogue>;
+
+                    RunWithKernel<FmhaKernel>(param, stream);
+                }
+                else if(param.M % FmhaShape::kM0 != 0 && param.N % FmhaShape::kN0 == 0)
+                {
+                    using FmhaTraits = ck::tile_program::TileFmhaTraits<true, false, has_attn_bias>;
+                    using FmhaPipelineProblem =
+                        FmhaPipelineProblemTemp<FmhaTraits, FmhaShape, FmhaMask>;
+                    using FmhaPipeline =
+                        ck::tile_program::block::BlockFmhaPipelineQRKSVS<FmhaPipelineProblem>;
+                    using FmhaKernel =
+                        FmhaFwdKernel<FmhaTilePartitioner, FmhaPipeline, FmhaEpilogue>;
+
+                    RunWithKernel<FmhaKernel>(param, stream);
+                }
+                else if(param.M % FmhaShape::kM0 != 0 && param.N % FmhaShape::kN0 != 0)
+                {
+                    using FmhaTraits = ck::tile_program::TileFmhaTraits<true, true, has_attn_bias>;
+                    using FmhaPipelineProblem =
+                        FmhaPipelineProblemTemp<FmhaTraits, FmhaShape, FmhaMask>;
+                    using FmhaPipeline =
+                        ck::tile_program::block::BlockFmhaPipelineQRKSVS<FmhaPipelineProblem>;
+                    using FmhaKernel =
+                        FmhaFwdKernel<FmhaTilePartitioner, FmhaPipeline, FmhaEpilogue>;
+
+                    RunWithKernel<FmhaKernel>(param, stream);
+                };
+            });
         });
     };
 
@@ -184,7 +203,9 @@ struct batched_infer_masktype_attnbias_dispatched
                 param.k_strides[0],
                 param.v_strides[0],
                 param.attn_bias_strides[0],
-                param.out_strides[0]);
+                param.out_strides[0],
+                static_cast<CausalMaskType>(param.custom_mask_type),
+                param.window_size);
         }();
 
         dim3 kGridSize            = FmhaKernel::GridSize(param.B, param.Hq, param.M, param.Kv);
@@ -196,9 +217,10 @@ struct batched_infer_masktype_attnbias_dispatched
     };
 };
 
-template <typename scalar_t, int32_t custom_mask_type, bool has_attn_bias>
-void run_batched_infer_masktype_attnbias_dispatched(BatchedForwardParams& param, hipStream_t stream)
+template <typename scalar_t, bool has_causal_mask, bool has_attn_bias>
+void run_batched_infer_causalmask_attnbias_dispatched(BatchedForwardParams& param,
+                                                      hipStream_t stream)
 {
-    batched_infer_masktype_attnbias_dispatched<scalar_t, custom_mask_type, has_attn_bias>::Run(
+    batched_infer_causalmask_attnbias_dispatched<scalar_t, has_causal_mask, has_attn_bias>::Run(
         param, stream);
 };
diff --git a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_infer_bp16.cpp b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_infer_bp16.cpp
index c45f4ba00..815fee897 100644
--- a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_infer_bp16.cpp
+++ b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_infer_bp16.cpp
@@ -8,45 +8,33 @@
 #include <ck/utility/data_type.hpp>
 #include <stdexcept>
 
-#include "ck_bool_switch.h"
+#include "ck_tiled_bool_switch.h"
 #include "ck_tiled_fmha_batched_infer.h"
 
-extern template void
-run_batched_infer_masktype_attnbias_dispatched<ck::bhalf_t, 0, true>(BatchedForwardParams& param,
-                                                                     hipStream_t stream);
+extern template void run_batched_infer_causalmask_attnbias_dispatched<ck::bhalf_t, false, true>(
+    BatchedForwardParams& param, hipStream_t stream);
 
-extern template void
-run_batched_infer_masktype_attnbias_dispatched<ck::bhalf_t, 0, false>(BatchedForwardParams& param,
-                                                                      hipStream_t stream);
+extern template void run_batched_infer_causalmask_attnbias_dispatched<ck::bhalf_t, false, false>(
+    BatchedForwardParams& param, hipStream_t stream);
 
-extern template void
-run_batched_infer_masktype_attnbias_dispatched<ck::bhalf_t, 1, true>(BatchedForwardParams& param,
-                                                                     hipStream_t stream);
+extern template void run_batched_infer_causalmask_attnbias_dispatched<ck::bhalf_t, true, true>(
+    BatchedForwardParams& param, hipStream_t stream);
 
-extern template void
-run_batched_infer_masktype_attnbias_dispatched<ck::bhalf_t, 1, false>(BatchedForwardParams& param,
-                                                                      hipStream_t stream);
-
-extern template void
-run_batched_infer_masktype_attnbias_dispatched<ck::bhalf_t, 2, true>(BatchedForwardParams& param,
-                                                                     hipStream_t stream);
-
-extern template void
-run_batched_infer_masktype_attnbias_dispatched<ck::bhalf_t, 2, false>(BatchedForwardParams& param,
-                                                                      hipStream_t stream);
+extern template void run_batched_infer_causalmask_attnbias_dispatched<ck::bhalf_t, true, false>(
+    BatchedForwardParams& param, hipStream_t stream);
 
 void batched_infer_bp16(BatchedForwardParams& param, hipStream_t stream)
 {
-    BOOL_SWITCH_1(param.has_attn_bias, HAS_ATTN_BIAS, [&] {
+    BOOL_SWITCH(param.has_attn_bias, HAS_ATTN_BIAS, [&] {
         if(param.custom_mask_type == 0)
-            run_batched_infer_masktype_attnbias_dispatched<ck::bhalf_t, 0, HAS_ATTN_BIAS>(param,
-                                                                                          stream);
+            run_batched_infer_causalmask_attnbias_dispatched<ck::bhalf_t, false, HAS_ATTN_BIAS>(
+                param, stream);
         else if(param.custom_mask_type == 1)
-            run_batched_infer_masktype_attnbias_dispatched<ck::bhalf_t, 1, HAS_ATTN_BIAS>(param,
-                                                                                          stream);
+            run_batched_infer_causalmask_attnbias_dispatched<ck::bhalf_t, true, HAS_ATTN_BIAS>(
+                param, stream);
         else if(param.custom_mask_type == 2)
-            run_batched_infer_masktype_attnbias_dispatched<ck::bhalf_t, 2, HAS_ATTN_BIAS>(param,
-                                                                                          stream);
+            run_batched_infer_causalmask_attnbias_dispatched<ck::bhalf_t, true, HAS_ATTN_BIAS>(
+                param, stream);
         else
             throw std::runtime_error("Invalid custom_mask_type value");
     });
diff --git a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_infer_fp16.cpp b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_infer_fp16.cpp
index 873d6b093..3f3a61fb0 100644
--- a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_infer_fp16.cpp
+++ b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_infer_fp16.cpp
@@ -8,45 +8,33 @@
 #include <ck/utility/data_type.hpp>
 #include <stdexcept>
 
-#include "ck_bool_switch.h"
+#include "ck_tiled_bool_switch.h"
 #include "ck_tiled_fmha_batched_infer.h"
 
-extern template void
-run_batched_infer_masktype_attnbias_dispatched<ck::half_t, 0, true>(BatchedForwardParams& param,
-                                                                    hipStream_t stream);
+extern template void run_batched_infer_causalmask_attnbias_dispatched<ck::half_t, false, true>(
+    BatchedForwardParams& param, hipStream_t stream);
 
-extern template void
-run_batched_infer_masktype_attnbias_dispatched<ck::half_t, 0, false>(BatchedForwardParams& param,
-                                                                     hipStream_t stream);
+extern template void run_batched_infer_causalmask_attnbias_dispatched<ck::half_t, false, false>(
+    BatchedForwardParams& param, hipStream_t stream);
 
-extern template void
-run_batched_infer_masktype_attnbias_dispatched<ck::half_t, 1, true>(BatchedForwardParams& param,
-                                                                    hipStream_t stream);
+extern template void run_batched_infer_causalmask_attnbias_dispatched<ck::half_t, true, true>(
+    BatchedForwardParams& param, hipStream_t stream);
 
-extern template void
-run_batched_infer_masktype_attnbias_dispatched<ck::half_t, 1, false>(BatchedForwardParams& param,
-                                                                     hipStream_t stream);
-
-extern template void
-run_batched_infer_masktype_attnbias_dispatched<ck::half_t, 2, true>(BatchedForwardParams& param,
-                                                                    hipStream_t stream);
-
-extern template void
-run_batched_infer_masktype_attnbias_dispatched<ck::half_t, 2, false>(BatchedForwardParams& param,
-                                                                     hipStream_t stream);
+extern template void run_batched_infer_causalmask_attnbias_dispatched<ck::half_t, true, false>(
+    BatchedForwardParams& param, hipStream_t stream);
 
 void batched_infer_fp16(BatchedForwardParams& param, hipStream_t stream)
 {
-    BOOL_SWITCH_1(param.has_attn_bias, HAS_ATTN_BIAS, [&] {
+    BOOL_SWITCH(param.has_attn_bias, HAS_ATTN_BIAS, [&] {
         if(param.custom_mask_type == 0)
-            run_batched_infer_masktype_attnbias_dispatched<ck::half_t, 0, HAS_ATTN_BIAS>(param,
-                                                                                         stream);
+            run_batched_infer_causalmask_attnbias_dispatched<ck::half_t, false, HAS_ATTN_BIAS>(
+                param, stream);
         else if(param.custom_mask_type == 1)
-            run_batched_infer_masktype_attnbias_dispatched<ck::half_t, 1, HAS_ATTN_BIAS>(param,
-                                                                                         stream);
+            run_batched_infer_causalmask_attnbias_dispatched<ck::half_t, true, HAS_ATTN_BIAS>(
+                param, stream);
         else if(param.custom_mask_type == 2)
-            run_batched_infer_masktype_attnbias_dispatched<ck::half_t, 2, HAS_ATTN_BIAS>(param,
-                                                                                         stream);
+            run_batched_infer_causalmask_attnbias_dispatched<ck::half_t, true, HAS_ATTN_BIAS>(
+                param, stream);
         else
             throw std::runtime_error("Invalid custom_mask_type value");
     });
diff --git a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_definitions.h b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_definitions.h
index ff91b9fa6..edaf8a308 100644
--- a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_definitions.h
+++ b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_definitions.h
@@ -6,7 +6,7 @@
  */
 #pragma once
 
-#include <ck/tile_program/block_tile/block_masking_specialization.hpp>
+//#include <ck/tile_program/block_tile/block_masking_specialization.hpp>
 
 enum struct CausalMaskType
 {
@@ -15,6 +15,7 @@ enum struct CausalMaskType
     MaskUpperTriangleFromBottomRight
 };
 
+/*
 template <CausalMaskType type>
 struct CausalMaskPredicate;
 
@@ -35,3 +36,4 @@ struct CausalMaskPredicate<CausalMaskType::MaskUpperTriangleFromBottomRight>
 {
     using predicate = ck::tile_program::block::MaskUpperTriangleFromBottomRightPredicate;
 };
+*/
diff --git a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_forward_kernel.h b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_forward_kernel.h
index a36f3cb1c..94b36c235 100644
--- a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_forward_kernel.h
+++ b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_forward_kernel.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ * Copyright (c) 2023-2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -8,9 +8,12 @@
 
 #include <type_traits>
 
-#include "ck/utility/common_header.hpp"
-#include "ck/tensor/tensor_view.hpp"
-#include "ck/tile_program/tile/tile_window.hpp"
+#include <ck/utility/common_header.hpp>
+#include <ck/tensor/tensor_view.hpp>
+#include <ck/tile_program/tile/tile_window.hpp>
+#include <ck/tile_program/block_tile/block_masking.hpp>
+
+#include "ck_tiled_fmha_definitions.h"
 
 // S[seqlen_q, seqlen_k] = Q[seqlen_q, hdim_q] * K[seqlen_k, hdim_q]
 // S'[seqlen_q, seqlen_k] = S[seqlen_q, seqlen_k] * Scale[1]
@@ -18,10 +21,6 @@
 // P[seqlen_q, seqlen_k] = Softmax(S[seqlen_q, seqlen_k])
 // O[seqlen_q, hdim_v] = P[seqlen_q, seqlen_k] * V[hdim_v, seqlen_k]
 
-#ifndef C_LOG2E
-#define C_LOG2E 1.44269504088896340736 // log2(e)
-#endif
-
 template <typename TilePartitioner_, typename FmhaPipeline_, typename EpiloguePipeline_>
 struct FmhaFwdKernel
 {
@@ -43,60 +42,23 @@ struct FmhaFwdKernel
     static constexpr bool kM0NeedPadding   = FmhaPipeline::kM0NeedPadding;
     static constexpr bool kN0K1NeedPadding = FmhaPipeline::kN0K1NeedPadding;
     static constexpr bool kHasBias         = FmhaPipeline::kHasBias;
+    using FmhaMask                         = ck::remove_cvref_t<typename FmhaPipeline::FmhaMask>;
+    static constexpr bool kHasMask         = FmhaMask::IsMasking;
 
-    using C0MatrixMask = ck::tile_program::block::C0MatrixMask_impl<
-        ck::remove_cvref_t<typename FmhaPipeline::BlockFmhaMask>>;
+    // using C0MatrixMask = ck::tile_program::block::C0MatrixMask_impl<
+    //     ck::remove_cvref_t<typename FmhaPipeline::FmhaMask>>;
 
     private:
+    template <ck::index_t I> // to avoid duplicated base class prblem, introduce an template arg
     struct EmptyKargs
     {
     };
 
+    // kargs use aggregate initializer, so no constructor will provided
+    // use inheritance to minimize karg size
+    // user need to use MakeKargs() function to create kargs.
     struct CommonKargs
     {
-        __host__ constexpr CommonKargs(const void* q_ptr_,
-                                       const void* k_ptr_,
-                                       const void* v_ptr_,
-                                       void* o_ptr_,
-                                       ck::index_t seqlen_q_,
-                                       ck::index_t seqlen_k_,
-                                       ck::index_t hdim_q_,
-                                       ck::index_t hdim_v_,
-                                       ck::index_t nhead_ratio_qk_,
-                                       float scale_,
-                                       ck::index_t stride_q_,
-                                       ck::index_t stride_k_,
-                                       ck::index_t stride_v_,
-                                       ck::index_t stride_o_,
-                                       ck::index_t nhead_stride_q_,
-                                       ck::index_t nhead_stride_k_,
-                                       ck::index_t nhead_stride_v_,
-                                       ck::index_t nhead_stride_o_)
-            : q_ptr{reinterpret_cast<const QDataType*>(q_ptr_)},
-              k_ptr{reinterpret_cast<const KDataType*>(k_ptr_)},
-              v_ptr{reinterpret_cast<const VDataType*>(v_ptr_)},
-              o_ptr{reinterpret_cast<ODataType*>(o_ptr_)},
-              seqlen_q{seqlen_q_},
-              seqlen_k{seqlen_k_},
-              hdim_q{hdim_q_},
-              hdim_v{hdim_v_},
-              nhead_ratio_qk{nhead_ratio_qk_},
-#if CK_FMHA_FWD_FAST_EXP2
-              scale{static_cast<float>(scale_ * C_LOG2E)},
-#else
-              scale{scale_},
-#endif
-              stride_q{stride_q_},
-              stride_k{stride_k_},
-              stride_v{stride_v_},
-              stride_o{stride_o_},
-              nhead_stride_q{nhead_stride_q_},
-              nhead_stride_k{nhead_stride_k_},
-              nhead_stride_v{nhead_stride_v_},
-              nhead_stride_o{nhead_stride_o_}
-        {
-        }
-
         const QDataType* q_ptr;
         const KDataType* k_ptr;
         const VDataType* v_ptr;
@@ -135,107 +97,26 @@ struct FmhaFwdKernel
         ck::index_t batch_stride_bias = 0;
     };
 
-    struct BatchModeKargs : CommonKargs,
-                            std::conditional_t<kHasBias, BatchModeBiasKargs, EmptyKargs>
+    struct MaskKargs
     {
-        __host__ constexpr BatchModeKargs(const void* q_ptr_,
-                                          const void* k_ptr_,
-                                          const void* v_ptr_,
-                                          void* o_ptr_,
-                                          ck::index_t seqlen_q_,
-                                          ck::index_t seqlen_k_,
-                                          ck::index_t hdim_q_,
-                                          ck::index_t hdim_v_,
-                                          ck::index_t nhead_ratio_qk_,
-                                          float scale_,
-                                          ck::index_t stride_q_,
-                                          ck::index_t stride_k_,
-                                          ck::index_t stride_v_,
-                                          ck::index_t stride_o_,
-                                          ck::index_t nhead_stride_q_,
-                                          ck::index_t nhead_stride_k_,
-                                          ck::index_t nhead_stride_v_,
-                                          ck::index_t nhead_stride_o_,
-                                          ck::index_t batch_stride_q_,
-                                          ck::index_t batch_stride_k_,
-                                          ck::index_t batch_stride_v_,
-                                          ck::index_t batch_stride_o_)
-            : CommonKargs{q_ptr_,
-                          k_ptr_,
-                          v_ptr_,
-                          o_ptr_,
-                          seqlen_q_,
-                          seqlen_k_,
-                          hdim_q_,
-                          hdim_v_,
-                          nhead_ratio_qk_,
-                          scale_,
-                          stride_q_,
-                          stride_k_,
-                          stride_v_,
-                          stride_o_,
-                          nhead_stride_q_,
-                          nhead_stride_k_,
-                          nhead_stride_v_,
-                          nhead_stride_o_},
-              batch_stride_q{batch_stride_q_},
-              batch_stride_k{batch_stride_k_},
-              batch_stride_v{batch_stride_v_},
-              batch_stride_o{batch_stride_o_}
-        {
-        }
+        CausalMaskType mask_type;
+        ck::index_t window_size;
+    };
 
+    struct BatchModeKargs : CommonKargs,
+                            std::conditional_t<kHasBias, BatchModeBiasKargs, EmptyKargs<0>>,
+                            std::conditional_t<kHasMask, MaskKargs, EmptyKargs<1>>
+    {
         ck::index_t batch_stride_q;
         ck::index_t batch_stride_k;
         ck::index_t batch_stride_v;
         ck::index_t batch_stride_o;
     };
 
-    struct GroupModeKargs : CommonKargs, std::conditional_t<kHasBias, CommonBiasKargs, EmptyKargs>
+    struct GroupModeKargs : CommonKargs,
+                            std::conditional_t<kHasBias, CommonBiasKargs, EmptyKargs<0>>,
+                            std::conditional_t<kHasMask, MaskKargs, EmptyKargs<1>>
     {
-        __host__ constexpr GroupModeKargs(const void* q_ptr_,
-                                          const void* k_ptr_,
-                                          const void* v_ptr_,
-                                          void* o_ptr_,
-                                          const void* seqstart_q_ptr_,
-                                          const void* seqstart_k_ptr_,
-                                          const void* seqlen_k_ptr_,
-                                          ck::index_t hdim_q_,
-                                          ck::index_t hdim_v_,
-                                          ck::index_t nhead_ratio_qk_,
-                                          float scale_,
-                                          ck::index_t stride_q_,
-                                          ck::index_t stride_k_,
-                                          ck::index_t stride_v_,
-                                          ck::index_t stride_o_,
-                                          ck::index_t nhead_stride_q_,
-                                          ck::index_t nhead_stride_k_,
-                                          ck::index_t nhead_stride_v_,
-                                          ck::index_t nhead_stride_o_)
-            : CommonKargs{q_ptr_,
-                          k_ptr_,
-                          v_ptr_,
-                          o_ptr_,
-                          -1 /* will be updated inside the kernel */,
-                          -1 /* will be updated inside the kernel */,
-                          hdim_q_,
-                          hdim_v_,
-                          nhead_ratio_qk_,
-                          scale_,
-                          stride_q_,
-                          stride_k_,
-                          stride_v_,
-                          stride_o_,
-                          nhead_stride_q_,
-                          nhead_stride_k_,
-                          nhead_stride_v_,
-                          nhead_stride_o_},
-              seqstart_q_ptr{reinterpret_cast<const int32_t*>(seqstart_q_ptr_)},
-              seqstart_k_ptr{reinterpret_cast<const int32_t*>(seqstart_k_ptr_)},
-              seqlen_k_ptr{reinterpret_cast<const int32_t*>(seqlen_k_ptr_)}
-        {
-        }
-
         const int32_t* seqstart_q_ptr;
         const int32_t* seqstart_k_ptr;
         const int32_t* seqlen_k_ptr;
@@ -270,13 +151,38 @@ struct FmhaFwdKernel
                                                                       ck::index_t batch_stride_k,
                                                                       ck::index_t batch_stride_v,
                                                                       ck::index_t batch_stride_bias,
-                                                                      ck::index_t batch_stride_o)
+                                                                      ck::index_t batch_stride_o,
+                                                                      CausalMaskType mask_type,
+                                                                      ck::index_t window_size)
     {
-        Kargs kargs{q_ptr,          k_ptr,          v_ptr,          o_ptr,          seqlen_q,
-                    seqlen_k,       hdim_q,         hdim_v,         nhead_ratio_qk, scale,
-                    stride_q,       stride_k,       stride_v,       stride_o,       nhead_stride_q,
-                    nhead_stride_k, nhead_stride_v, nhead_stride_o, batch_stride_q, batch_stride_k,
-                    batch_stride_v, batch_stride_o};
+        Kargs kargs{{reinterpret_cast<const QDataType*>(q_ptr),
+                     reinterpret_cast<const KDataType*>(k_ptr),
+                     reinterpret_cast<const VDataType*>(v_ptr),
+                     reinterpret_cast<ODataType*>(o_ptr),
+                     seqlen_q,
+                     seqlen_k,
+                     hdim_q,
+                     hdim_v,
+                     nhead_ratio_qk,
+#if CK_FMHA_FWD_FAST_EXP2
+                     static_cast<float>(scale * ck::math::log2e_v<>),
+#else
+                     scale,
+#endif
+                     stride_q,
+                     stride_k,
+                     stride_v,
+                     stride_o,
+                     nhead_stride_q,
+                     nhead_stride_k,
+                     nhead_stride_v,
+                     nhead_stride_o}, // args for common karg
+                    {},               // placeholder for bias
+                    {},               // placeholder for mask
+                    batch_stride_q,
+                    batch_stride_k,
+                    batch_stride_v,
+                    batch_stride_o};
 
         if constexpr(kHasBias)
         {
@@ -286,6 +192,12 @@ struct FmhaFwdKernel
             kargs.batch_stride_bias = batch_stride_bias;
         }
 
+        if constexpr(kHasMask)
+        {
+            kargs.mask_type   = mask_type;
+            kargs.window_size = window_size;
+        }
+
         return kargs;
     }
 
@@ -311,27 +223,37 @@ struct FmhaFwdKernel
                                                                       ck::index_t nhead_stride_k,
                                                                       ck::index_t nhead_stride_v,
                                                                       ck::index_t nhead_stride_bias,
-                                                                      ck::index_t nhead_stride_o)
+                                                                      ck::index_t nhead_stride_o,
+                                                                      CausalMaskType mask_type,
+                                                                      ck::index_t window_size)
     {
-        Kargs kargs{q_ptr,
-                    k_ptr,
-                    v_ptr,
-                    o_ptr,
-                    seqstart_q_ptr,
-                    seqstart_k_ptr,
-                    seqlen_k_ptr,
-                    hdim_q,
-                    hdim_v,
-                    nhead_ratio_qk,
-                    scale,
-                    stride_q,
-                    stride_k,
-                    stride_v,
-                    stride_o,
-                    nhead_stride_q,
-                    nhead_stride_k,
-                    nhead_stride_v,
-                    nhead_stride_o};
+        Kargs kargs{{reinterpret_cast<const QDataType*>(q_ptr),
+                     reinterpret_cast<const KDataType*>(k_ptr),
+                     reinterpret_cast<const VDataType*>(v_ptr),
+                     reinterpret_cast<ODataType*>(o_ptr),
+                     -1, // seqlen will be updated by another pointer
+                     -1, //
+                     hdim_q,
+                     hdim_v,
+                     nhead_ratio_qk,
+#if CK_FMHA_FWD_FAST_EXP2
+                     static_cast<float>(scale * ck::math::log2e_v<>),
+#else
+                     scale,
+#endif
+                     stride_q,
+                     stride_k,
+                     stride_v,
+                     stride_o,
+                     nhead_stride_q,
+                     nhead_stride_k,
+                     nhead_stride_v,
+                     nhead_stride_o}, // args for common karg
+                    {},               // placeholder for bias
+                    {},               // placeholder for mask
+                    reinterpret_cast<const int32_t*>(seqstart_q_ptr),
+                    reinterpret_cast<const int32_t*>(seqstart_k_ptr),
+                    reinterpret_cast<const int32_t*>(seqlen_k_ptr)};
 
         if constexpr(kHasBias)
         {
@@ -339,6 +261,11 @@ struct FmhaFwdKernel
             kargs.stride_bias       = stride_bias;
             kargs.nhead_stride_bias = nhead_stride_bias;
         }
+        if constexpr(kHasMask)
+        {
+            kargs.mask_type   = mask_type;
+            kargs.window_size = window_size;
+        }
 
         return kargs;
     }
@@ -585,17 +512,73 @@ struct FmhaFwdKernel
             }
         }();
 
-        C0MatrixMask casual_mask{kargs.seqlen_q, kargs.seqlen_k};
+        FmhaMask mask = [&]() {
+            if constexpr(kHasMask)
+            {
+                auto res =
+                    ck::make_tuple(ck::index_t{0}, ck::index_t{0}, ck::index_t{0}, ck::index_t{0});
+
+                if(kargs.window_size > 0)
+                {
+                    if(kargs.mask_type == CausalMaskType::MaskDisabled)
+                    {
+                        ck::index_t lr_size = kargs.window_size / 2;
+
+                        res = ck::make_generic_attention_mask_coordinates_from_lr_window(
+                            lr_size, lr_size, kargs.seqlen_q, kargs.seqlen_k);
+                    }
+                    else if(kargs.mask_type == CausalMaskType::MaskUpperTriangleFromTopLeft)
+                    {
+                        ck::index_t lr_size = kargs.window_size / 2;
+
+                        res = ck::make_generic_attention_mask_coordinates_from_lr_window(
+                            lr_size, 0, kargs.seqlen_q, kargs.seqlen_k, true);
+                    }
+                    else if(kargs.mask_type == CausalMaskType::MaskUpperTriangleFromBottomRight)
+                    {
+                        ck::index_t lr_size = kargs.window_size / 2;
+
+                        res = ck::make_generic_attention_mask_coordinates_from_lr_window(
+                            lr_size, 0, kargs.seqlen_q, kargs.seqlen_k, false);
+                    }
+                }
+                else
+                {
+                    if(kargs.mask_type == CausalMaskType::MaskDisabled)
+                    {
+                        res = ck::make_generic_attention_mask_coordinates_from_lr_window(
+                            -1, -1, kargs.seqlen_q, kargs.seqlen_k);
+                    }
+                    else if(kargs.mask_type == CausalMaskType::MaskUpperTriangleFromTopLeft)
+                    {
+                        res = ck::make_generic_attention_mask_coordinates_from_lr_window(
+                            -1, 0, kargs.seqlen_q, kargs.seqlen_k, true);
+                    }
+                    else if(kargs.mask_type == CausalMaskType::MaskUpperTriangleFromBottomRight)
+                    {
+                        res = ck::make_generic_attention_mask_coordinates_from_lr_window(
+                            -1, 0, kargs.seqlen_q, kargs.seqlen_k, false);
+                    }
+                }
+
+                auto y = res.At(ck::Number<0>{});
+                auto x = res.At(ck::Number<1>{});
+
+                return FmhaMask{y, x, kargs.seqlen_q, kargs.seqlen_k};
+            }
+            else
+                return FmhaMask{0, 0, kargs.seqlen_q, kargs.seqlen_k};
+        }();
 
         auto o_acc_tile =
             FmhaPipeline{}(q_dram_window,
                            k_dram_window,
                            v_dram_window,
                            bias_dram_window,
-                           casual_mask,
+                           mask,
                            kargs.scale,
-                           ck::math::integer_divide_ceil(kargs.seqlen_k, FmhaPipeline::kN0),
-                           ck::math::integer_divide_ceil(kargs.hdim_q, FmhaPipeline::kK0),
+                           // ck::math::integer_divide_ceil(kargs.seqlen_k, FmhaPipeline::kN0),
+                           // ck::math::integer_divide_ceil(kargs.hdim_q, FmhaPipeline::kK0),
                            smem_ptr);
 
         // O DRAM and O DRAM window
diff --git a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_infer.h b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_infer.h
index 20bc13130..5a026dbc9 100644
--- a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_infer.h
+++ b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_infer.h
@@ -25,6 +25,7 @@
 #include <ck/tile_program/block_tile_pipeline/block_fmha_pipeline_qr_ks_vs_default_policy.hpp>
 #include <ck/tile_program/tile/tile_fmha_shape.hpp>
 #include <ck/tile_program/tile/tile_fmha_traits.hpp>
+#include <ck/tile_program/block_tile/block_masking.hpp>
 
 #include "ck_tiled_fmha_forward_kernel.h"
 #include "ck_tiled_fmha_fwd_epilogue.h"
@@ -32,8 +33,10 @@
 #include "ck_tiled_fmha_params.h"
 #include "ck_tiled_fmha_definitions.h"
 
-template <typename scalar_t, int32_t custom_mask_type, bool has_attn_bias>
-struct grouped_infer_masktype_attnbias_dispatched
+#include "ck_tiled_bool_switch.h"
+
+template <typename scalar_t, bool has_causal_mask, bool has_attn_bias>
+struct grouped_infer_causalmask_attnbias_dispatched
 {
     using QDataType           = scalar_t;
     using KDataType           = scalar_t;
@@ -47,9 +50,6 @@ struct grouped_infer_masktype_attnbias_dispatched
 
     using VLayout = ck::tensor_layout::gemm::RowMajor;
 
-    static constexpr auto masktype = static_cast<CausalMaskType>(custom_mask_type);
-    using FmhaCausalMask           = typename CausalMaskPredicate<masktype>::predicate;
-
     using FmhaBlockTileHdim64  = ck::Sequence<128, 64, 32, 64, 32, 64>;
     using FmhaBlockTileHdim128 = ck::Sequence<128, 128, 32, 128, 32, 128>;
     using FmhaBlockWarps       = ck::Sequence<4, 1, 1>;
@@ -96,31 +96,40 @@ struct grouped_infer_masktype_attnbias_dispatched
 
     static void Run(GroupedForwardParams& param, hipStream_t stream)
     {
-        GROUPED_INFER_HEADDIM_SWITCH(param.K, param.Kv, [&] {
-            using FmhaTilePartitioner = FmhaFwdTilePartitioner<FmhaShape>;
-            using FmhaTraits          = ck::tile_program::TileFmhaTraits<true, true, has_attn_bias>;
-            using FmhaPipelineProblem =
-                ck::tile_program::block::BlockFmhaPipelineProblem<QDataType,
-                                                                  KDataType,
-                                                                  VDataType,
-                                                                  SaccDataType,
-                                                                  SMPLComputeDataType,
-                                                                  BiasDataType,
-                                                                  PDataType,
-                                                                  OaccDataType,
-                                                                  ODataType,
-                                                                  256, // BlockSize
-                                                                  FmhaShape,
-                                                                  true, // IsGroupMode
-                                                                  FmhaCausalMask,
-                                                                  FmhaTraits>;
-
-            using FmhaPipeline =
-                ck::tile_program::block::BlockFmhaPipelineQRKSVS<FmhaPipelineProblem>;
-
-            using FmhaKernel = FmhaFwdKernel<FmhaTilePartitioner, FmhaPipeline, FmhaEpilogue>;
-
-            RunWithKernel<FmhaKernel>(param, stream);
+        const bool has_local_attention = (param.window_size > 0) ? true : false;
+
+        BOOL_SWITCH(has_local_attention, USE_LOCAL_ATTENTION, [&] {
+            constexpr bool has_masking = has_causal_mask || USE_LOCAL_ATTENTION;
+
+            using FmhaMask =
+                ck::tile_program::block::GenericAttentionMask<has_masking, USE_LOCAL_ATTENTION>;
+
+            GROUPED_INFER_HEADDIM_SWITCH(param.K, param.Kv, [&] {
+                using FmhaTilePartitioner = FmhaFwdTilePartitioner<FmhaShape>;
+                using FmhaTraits = ck::tile_program::TileFmhaTraits<true, true, has_attn_bias>;
+                using FmhaPipelineProblem =
+                    ck::tile_program::block::BlockFmhaPipelineProblem<QDataType,
+                                                                      KDataType,
+                                                                      VDataType,
+                                                                      SaccDataType,
+                                                                      SMPLComputeDataType,
+                                                                      BiasDataType,
+                                                                      PDataType,
+                                                                      OaccDataType,
+                                                                      ODataType,
+                                                                      256, // BlockSize
+                                                                      FmhaShape,
+                                                                      true, // IsGroupMode
+                                                                      FmhaMask,
+                                                                      FmhaTraits>;
+
+                using FmhaPipeline =
+                    ck::tile_program::block::BlockFmhaPipelineQRKSVS<FmhaPipelineProblem>;
+
+                using FmhaKernel = FmhaFwdKernel<FmhaTilePartitioner, FmhaPipeline, FmhaEpilogue>;
+
+                RunWithKernel<FmhaKernel>(param, stream);
+            });
         });
     };
 
@@ -150,7 +159,9 @@ struct grouped_infer_masktype_attnbias_dispatched
                 param.k_strides[1],
                 param.v_strides[1],
                 param.attn_bias_strides[1],
-                param.out_strides[1]);
+                param.out_strides[1],
+                static_cast<CausalMaskType>(param.custom_mask_type),
+                param.window_size);
         }();
 
         dim3 kGridSize =
@@ -163,9 +174,10 @@ struct grouped_infer_masktype_attnbias_dispatched
     };
 };
 
-template <typename scalar_t, int32_t custom_mask_type, bool has_attn_bias>
-void run_grouped_infer_masktype_attnbias_dispatched(GroupedForwardParams& param, hipStream_t stream)
+template <typename scalar_t, bool has_causal_mask, bool has_attn_bias>
+void run_grouped_infer_causalmask_attnbias_dispatched(GroupedForwardParams& param,
+                                                      hipStream_t stream)
 {
-    grouped_infer_masktype_attnbias_dispatched<scalar_t, custom_mask_type, has_attn_bias>::Run(
+    grouped_infer_causalmask_attnbias_dispatched<scalar_t, has_causal_mask, has_attn_bias>::Run(
         param, stream);
 };
diff --git a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_infer_bp16.cpp b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_infer_bp16.cpp
index b0c3318af..f942d1bbb 100644
--- a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_infer_bp16.cpp
+++ b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_infer_bp16.cpp
@@ -8,45 +8,33 @@
 #include <ck/utility/data_type.hpp>
 #include <stdexcept>
 
-#include "ck_bool_switch.h"
+#include "ck_tiled_bool_switch.h"
 #include "ck_tiled_fmha_grouped_infer.h"
 
-extern template void
-run_grouped_infer_masktype_attnbias_dispatched<ck::bhalf_t, 0, true>(GroupedForwardParams& param,
-                                                                     hipStream_t stream);
+extern template void run_grouped_infer_causalmask_attnbias_dispatched<ck::bhalf_t, false, true>(
+    GroupedForwardParams& param, hipStream_t stream);
 
-extern template void
-run_grouped_infer_masktype_attnbias_dispatched<ck::bhalf_t, 0, false>(GroupedForwardParams& param,
-                                                                      hipStream_t stream);
+extern template void run_grouped_infer_causalmask_attnbias_dispatched<ck::bhalf_t, false, false>(
+    GroupedForwardParams& param, hipStream_t stream);
 
-extern template void
-run_grouped_infer_masktype_attnbias_dispatched<ck::bhalf_t, 1, true>(GroupedForwardParams& param,
-                                                                     hipStream_t stream);
+extern template void run_grouped_infer_causalmask_attnbias_dispatched<ck::bhalf_t, true, true>(
+    GroupedForwardParams& param, hipStream_t stream);
 
-extern template void
-run_grouped_infer_masktype_attnbias_dispatched<ck::bhalf_t, 1, false>(GroupedForwardParams& param,
-                                                                      hipStream_t stream);
-
-extern template void
-run_grouped_infer_masktype_attnbias_dispatched<ck::bhalf_t, 2, true>(GroupedForwardParams& param,
-                                                                     hipStream_t stream);
-
-extern template void
-run_grouped_infer_masktype_attnbias_dispatched<ck::bhalf_t, 2, false>(GroupedForwardParams& param,
-                                                                      hipStream_t stream);
+extern template void run_grouped_infer_causalmask_attnbias_dispatched<ck::bhalf_t, true, false>(
+    GroupedForwardParams& param, hipStream_t stream);
 
 void grouped_infer_bp16(GroupedForwardParams& param, hipStream_t stream)
 {
-    BOOL_SWITCH_1(param.has_attn_bias, HAS_ATTN_BIAS, [&] {
+    BOOL_SWITCH(param.has_attn_bias, HAS_ATTN_BIAS, [&] {
         if(param.custom_mask_type == 0)
-            run_grouped_infer_masktype_attnbias_dispatched<ck::bhalf_t, 0, HAS_ATTN_BIAS>(param,
-                                                                                          stream);
+            run_grouped_infer_causalmask_attnbias_dispatched<ck::bhalf_t, false, HAS_ATTN_BIAS>(
+                param, stream);
         else if(param.custom_mask_type == 1)
-            run_grouped_infer_masktype_attnbias_dispatched<ck::bhalf_t, 1, HAS_ATTN_BIAS>(param,
-                                                                                          stream);
+            run_grouped_infer_causalmask_attnbias_dispatched<ck::bhalf_t, true, HAS_ATTN_BIAS>(
+                param, stream);
         else if(param.custom_mask_type == 2)
-            run_grouped_infer_masktype_attnbias_dispatched<ck::bhalf_t, 2, HAS_ATTN_BIAS>(param,
-                                                                                          stream);
+            run_grouped_infer_causalmask_attnbias_dispatched<ck::bhalf_t, true, HAS_ATTN_BIAS>(
+                param, stream);
         else
             throw std::runtime_error("Invalid custom_mask_type value");
     });
diff --git a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_infer_fp16.cpp b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_infer_fp16.cpp
index eda9a6462..288ad5f57 100644
--- a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_infer_fp16.cpp
+++ b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_infer_fp16.cpp
@@ -8,45 +8,33 @@
 #include <ck/utility/data_type.hpp>
 #include <stdexcept>
 
-#include "ck_bool_switch.h"
+#include "ck_tiled_bool_switch.h"
 #include "ck_tiled_fmha_grouped_infer.h"
 
-extern template void
-run_grouped_infer_masktype_attnbias_dispatched<ck::half_t, 0, true>(GroupedForwardParams& param,
-                                                                    hipStream_t stream);
+extern template void run_grouped_infer_causalmask_attnbias_dispatched<ck::half_t, false, true>(
+    GroupedForwardParams& param, hipStream_t stream);
 
-extern template void
-run_grouped_infer_masktype_attnbias_dispatched<ck::half_t, 0, false>(GroupedForwardParams& param,
-                                                                     hipStream_t stream);
+extern template void run_grouped_infer_causalmask_attnbias_dispatched<ck::half_t, false, false>(
+    GroupedForwardParams& param, hipStream_t stream);
 
-extern template void
-run_grouped_infer_masktype_attnbias_dispatched<ck::half_t, 1, true>(GroupedForwardParams& param,
-                                                                    hipStream_t stream);
+extern template void run_grouped_infer_causalmask_attnbias_dispatched<ck::half_t, true, true>(
+    GroupedForwardParams& param, hipStream_t stream);
 
-extern template void
-run_grouped_infer_masktype_attnbias_dispatched<ck::half_t, 1, false>(GroupedForwardParams& param,
-                                                                     hipStream_t stream);
-
-extern template void
-run_grouped_infer_masktype_attnbias_dispatched<ck::half_t, 2, true>(GroupedForwardParams& param,
-                                                                    hipStream_t stream);
-
-extern template void
-run_grouped_infer_masktype_attnbias_dispatched<ck::half_t, 2, false>(GroupedForwardParams& param,
-                                                                     hipStream_t stream);
+extern template void run_grouped_infer_causalmask_attnbias_dispatched<ck::half_t, true, false>(
+    GroupedForwardParams& param, hipStream_t stream);
 
 void grouped_infer_fp16(GroupedForwardParams& param, hipStream_t stream)
 {
-    BOOL_SWITCH_1(param.has_attn_bias, HAS_ATTN_BIAS, [&] {
+    BOOL_SWITCH(param.has_attn_bias, HAS_ATTN_BIAS, [&] {
         if(param.custom_mask_type == 0)
-            run_grouped_infer_masktype_attnbias_dispatched<ck::half_t, 0, HAS_ATTN_BIAS>(param,
-                                                                                         stream);
+            run_grouped_infer_causalmask_attnbias_dispatched<ck::half_t, false, HAS_ATTN_BIAS>(
+                param, stream);
         else if(param.custom_mask_type == 1)
-            run_grouped_infer_masktype_attnbias_dispatched<ck::half_t, 1, HAS_ATTN_BIAS>(param,
-                                                                                         stream);
+            run_grouped_infer_causalmask_attnbias_dispatched<ck::half_t, true, HAS_ATTN_BIAS>(
+                param, stream);
         else if(param.custom_mask_type == 2)
-            run_grouped_infer_masktype_attnbias_dispatched<ck::half_t, 2, HAS_ATTN_BIAS>(param,
-                                                                                         stream);
+            run_grouped_infer_causalmask_attnbias_dispatched<ck::half_t, true, HAS_ATTN_BIAS>(
+                param, stream);
         else
             throw std::runtime_error("Invalid custom_mask_type value");
     });
diff --git a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_params.h b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_params.h
index 0a988b6b2..11274c5c4 100644
--- a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_params.h
+++ b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_params.h
@@ -35,6 +35,7 @@ struct BatchedInferParams
     const void* attn_bias_ptr;
 
     uint8_t custom_mask_type;
+    int window_size; // local-attention
 
     void* out_ptr;
 };
@@ -86,6 +87,7 @@ struct GroupedInferParams
     const void* attn_bias_ptr;
 
     uint8_t custom_mask_type;
+    int window_size; // local-attention
 
     void* out_ptr;
 };
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_bp16_masktype_0_no_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_bp16_masktype_0_no_attnbias.cpp
deleted file mode 100644
index 23c8375db..000000000
--- a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_bp16_masktype_0_no_attnbias.cpp
+++ /dev/null
@@ -1,13 +0,0 @@
-/*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
- *
- * This source code is licensed under the BSD-style license found in the
- * LICENSE file in the root directory of this source tree.
- */
-#include <ck/ck.hpp>
-
-#include "ck_tiled_fmha_batched_infer.h"
-
-template void
-run_batched_infer_masktype_attnbias_dispatched<ck::bhalf_t, 0, false>(BatchedForwardParams& param,
-                                                                     hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_bp16_masktype_0_with_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_bp16_masktype_0_with_attnbias.cpp
deleted file mode 100644
index 893cf803a..000000000
--- a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_bp16_masktype_0_with_attnbias.cpp
+++ /dev/null
@@ -1,13 +0,0 @@
-/*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
- *
- * This source code is licensed under the BSD-style license found in the
- * LICENSE file in the root directory of this source tree.
- */
-#include <ck/ck.hpp>
-
-#include "ck_tiled_fmha_batched_infer.h"
-
-template void
-run_batched_infer_masktype_attnbias_dispatched<ck::bhalf_t, 0, true>(BatchedForwardParams& param,
-                                                                    hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_bp16_masktype_1_no_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_bp16_masktype_1_no_attnbias.cpp
deleted file mode 100644
index ce1adafad..000000000
--- a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_bp16_masktype_1_no_attnbias.cpp
+++ /dev/null
@@ -1,13 +0,0 @@
-/*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
- *
- * This source code is licensed under the BSD-style license found in the
- * LICENSE file in the root directory of this source tree.
- */
-#include <ck/ck.hpp>
-
-#include "ck_tiled_fmha_batched_infer.h"
-
-template void
-run_batched_infer_masktype_attnbias_dispatched<ck::bhalf_t, 1, false>(BatchedForwardParams& param,
-                                                                     hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_bp16_masktype_2_no_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_bp16_masktype_2_no_attnbias.cpp
deleted file mode 100644
index 3bf55fe50..000000000
--- a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_bp16_masktype_2_no_attnbias.cpp
+++ /dev/null
@@ -1,13 +0,0 @@
-/*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
- *
- * This source code is licensed under the BSD-style license found in the
- * LICENSE file in the root directory of this source tree.
- */
-#include <ck/ck.hpp>
-
-#include "ck_tiled_fmha_batched_infer.h"
-
-template void
-run_batched_infer_masktype_attnbias_dispatched<ck::bhalf_t, 2, false>(BatchedForwardParams& param,
-                                                                     hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_bp16_masktype_2_with_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_bp16_masktype_2_with_attnbias.cpp
deleted file mode 100644
index 861f63d35..000000000
--- a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_bp16_masktype_2_with_attnbias.cpp
+++ /dev/null
@@ -1,13 +0,0 @@
-/*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
- *
- * This source code is licensed under the BSD-style license found in the
- * LICENSE file in the root directory of this source tree.
- */
-#include <ck/ck.hpp>
-
-#include "ck_tiled_fmha_batched_infer.h"
-
-template void
-run_batched_infer_masktype_attnbias_dispatched<ck::bhalf_t, 2, true>(BatchedForwardParams& param,
-                                                                    hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_fp16_masktype_1_with_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_bp16_no_causalmask_no_attnbias.cpp
similarity index 58%
rename from xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_fp16_masktype_1_with_attnbias.cpp
rename to xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_bp16_no_causalmask_no_attnbias.cpp
index f9d551e6e..4c06d77aa 100644
--- a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_fp16_masktype_1_with_attnbias.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_bp16_no_causalmask_no_attnbias.cpp
@@ -8,6 +8,5 @@
 
 #include "ck_tiled_fmha_batched_infer.h"
 
-template void
-run_batched_infer_masktype_attnbias_dispatched<ck::half_t, 1, true>(BatchedForwardParams& param,
-                                                                    hipStream_t stream);
+template void run_batched_infer_causalmask_attnbias_dispatched<ck::bhalf_t, false, false>(
+    BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_fp16_masktype_2_with_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_bp16_no_causalmask_with_attnbias.cpp
similarity index 58%
rename from xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_fp16_masktype_2_with_attnbias.cpp
rename to xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_bp16_no_causalmask_with_attnbias.cpp
index 11ab6765f..407f20ab4 100644
--- a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_fp16_masktype_2_with_attnbias.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_bp16_no_causalmask_with_attnbias.cpp
@@ -8,6 +8,5 @@
 
 #include "ck_tiled_fmha_batched_infer.h"
 
-template void
-run_batched_infer_masktype_attnbias_dispatched<ck::half_t, 2, true>(BatchedForwardParams& param,
-                                                                    hipStream_t stream);
+template void run_batched_infer_causalmask_attnbias_dispatched<ck::bhalf_t, false, true>(
+    BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_fp16_masktype_0_with_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_bp16_with_causalmask_no_attnbias.cpp
similarity index 58%
rename from xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_fp16_masktype_0_with_attnbias.cpp
rename to xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_bp16_with_causalmask_no_attnbias.cpp
index 22ba1cbf0..55100393d 100644
--- a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_fp16_masktype_0_with_attnbias.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_bp16_with_causalmask_no_attnbias.cpp
@@ -8,6 +8,5 @@
 
 #include "ck_tiled_fmha_batched_infer.h"
 
-template void
-run_batched_infer_masktype_attnbias_dispatched<ck::half_t, 0, true>(BatchedForwardParams& param,
-                                                                    hipStream_t stream);
+template void run_batched_infer_causalmask_attnbias_dispatched<ck::bhalf_t, true, false>(
+    BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_bp16_masktype_1_with_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_bp16_with_causalmask_with_attnbias.cpp
similarity index 58%
rename from xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_bp16_masktype_1_with_attnbias.cpp
rename to xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_bp16_with_causalmask_with_attnbias.cpp
index e45b01c1c..36438844e 100644
--- a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_bp16_masktype_1_with_attnbias.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_bp16_with_causalmask_with_attnbias.cpp
@@ -8,6 +8,5 @@
 
 #include "ck_tiled_fmha_batched_infer.h"
 
-template void
-run_batched_infer_masktype_attnbias_dispatched<ck::bhalf_t, 1, true>(BatchedForwardParams& param,
-                                                                    hipStream_t stream);
+template void run_batched_infer_causalmask_attnbias_dispatched<ck::bhalf_t, true, true>(
+    BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_fp16_masktype_0_no_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_fp16_masktype_0_no_attnbias.cpp
deleted file mode 100644
index 5c9d5a113..000000000
--- a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_fp16_masktype_0_no_attnbias.cpp
+++ /dev/null
@@ -1,13 +0,0 @@
-/*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
- *
- * This source code is licensed under the BSD-style license found in the
- * LICENSE file in the root directory of this source tree.
- */
-#include <ck/ck.hpp>
-
-#include "ck_tiled_fmha_batched_infer.h"
-
-template void
-run_batched_infer_masktype_attnbias_dispatched<ck::half_t, 0, false>(BatchedForwardParams& param,
-                                                                     hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_fp16_masktype_1_no_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_fp16_masktype_1_no_attnbias.cpp
deleted file mode 100644
index a788c0e4b..000000000
--- a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_fp16_masktype_1_no_attnbias.cpp
+++ /dev/null
@@ -1,13 +0,0 @@
-/*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
- *
- * This source code is licensed under the BSD-style license found in the
- * LICENSE file in the root directory of this source tree.
- */
-#include <ck/ck.hpp>
-
-#include "ck_tiled_fmha_batched_infer.h"
-
-template void
-run_batched_infer_masktype_attnbias_dispatched<ck::half_t, 1, false>(BatchedForwardParams& param,
-                                                                     hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_fp16_masktype_2_no_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_fp16_masktype_2_no_attnbias.cpp
deleted file mode 100644
index daa204ebd..000000000
--- a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_fp16_masktype_2_no_attnbias.cpp
+++ /dev/null
@@ -1,13 +0,0 @@
-/*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
- *
- * This source code is licensed under the BSD-style license found in the
- * LICENSE file in the root directory of this source tree.
- */
-#include <ck/ck.hpp>
-
-#include "ck_tiled_fmha_batched_infer.h"
-
-template void
-run_batched_infer_masktype_attnbias_dispatched<ck::half_t, 2, false>(BatchedForwardParams& param,
-                                                                     hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_fp16_no_causalmask_no_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_fp16_no_causalmask_no_attnbias.cpp
new file mode 100644
index 000000000..06957d596
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_fp16_no_causalmask_no_attnbias.cpp
@@ -0,0 +1,12 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_batched_infer.h"
+
+template void run_batched_infer_causalmask_attnbias_dispatched<ck::half_t, false, false>(
+    BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_fp16_no_causalmask_with_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_fp16_no_causalmask_with_attnbias.cpp
new file mode 100644
index 000000000..cae5a03c1
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_fp16_no_causalmask_with_attnbias.cpp
@@ -0,0 +1,12 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_batched_infer.h"
+
+template void run_batched_infer_causalmask_attnbias_dispatched<ck::half_t, false, true>(
+    BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_fp16_with_causalmask_no_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_fp16_with_causalmask_no_attnbias.cpp
new file mode 100644
index 000000000..f5a42d733
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_fp16_with_causalmask_no_attnbias.cpp
@@ -0,0 +1,12 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_batched_infer.h"
+
+template void run_batched_infer_causalmask_attnbias_dispatched<ck::half_t, true, false>(
+    BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_fp16_with_causalmask_with_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_fp16_with_causalmask_with_attnbias.cpp
new file mode 100644
index 000000000..9f79c2ed5
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_fp16_with_causalmask_with_attnbias.cpp
@@ -0,0 +1,12 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_batched_infer.h"
+
+template void run_batched_infer_causalmask_attnbias_dispatched<ck::half_t, true, true>(
+    BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_bp16_masktype_0_no_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_bp16_masktype_0_no_attnbias.cpp
deleted file mode 100644
index a5e5e5aa4..000000000
--- a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_bp16_masktype_0_no_attnbias.cpp
+++ /dev/null
@@ -1,13 +0,0 @@
-/*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
- *
- * This source code is licensed under the BSD-style license found in the
- * LICENSE file in the root directory of this source tree.
- */
-#include <ck/ck.hpp>
-
-#include "ck_tiled_fmha_grouped_infer.h"
-
-template void
-run_grouped_infer_masktype_attnbias_dispatched<ck::bhalf_t, 0, false>(GroupedForwardParams& param,
-                                                                     hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_bp16_masktype_0_with_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_bp16_masktype_0_with_attnbias.cpp
deleted file mode 100644
index d2a0f9f30..000000000
--- a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_bp16_masktype_0_with_attnbias.cpp
+++ /dev/null
@@ -1,13 +0,0 @@
-/*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
- *
- * This source code is licensed under the BSD-style license found in the
- * LICENSE file in the root directory of this source tree.
- */
-#include <ck/ck.hpp>
-
-#include "ck_tiled_fmha_grouped_infer.h"
-
-template void
-run_grouped_infer_masktype_attnbias_dispatched<ck::bhalf_t, 0, true>(GroupedForwardParams& param,
-                                                                    hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_bp16_masktype_1_no_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_bp16_masktype_1_no_attnbias.cpp
deleted file mode 100644
index 176ff416d..000000000
--- a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_bp16_masktype_1_no_attnbias.cpp
+++ /dev/null
@@ -1,13 +0,0 @@
-/*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
- *
- * This source code is licensed under the BSD-style license found in the
- * LICENSE file in the root directory of this source tree.
- */
-#include <ck/ck.hpp>
-
-#include "ck_tiled_fmha_grouped_infer.h"
-
-template void
-run_grouped_infer_masktype_attnbias_dispatched<ck::bhalf_t, 1, false>(GroupedForwardParams& param,
-                                                                     hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_bp16_masktype_2_no_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_bp16_masktype_2_no_attnbias.cpp
deleted file mode 100644
index dc213019f..000000000
--- a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_bp16_masktype_2_no_attnbias.cpp
+++ /dev/null
@@ -1,13 +0,0 @@
-/*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
- *
- * This source code is licensed under the BSD-style license found in the
- * LICENSE file in the root directory of this source tree.
- */
-#include <ck/ck.hpp>
-
-#include "ck_tiled_fmha_grouped_infer.h"
-
-template void
-run_grouped_infer_masktype_attnbias_dispatched<ck::bhalf_t, 2, false>(GroupedForwardParams& param,
-                                                                     hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_bp16_masktype_2_with_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_bp16_masktype_2_with_attnbias.cpp
deleted file mode 100644
index a63206d4e..000000000
--- a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_bp16_masktype_2_with_attnbias.cpp
+++ /dev/null
@@ -1,13 +0,0 @@
-/*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
- *
- * This source code is licensed under the BSD-style license found in the
- * LICENSE file in the root directory of this source tree.
- */
-#include <ck/ck.hpp>
-
-#include "ck_tiled_fmha_grouped_infer.h"
-
-template void
-run_grouped_infer_masktype_attnbias_dispatched<ck::bhalf_t, 2, true>(GroupedForwardParams& param,
-                                                                    hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_fp16_masktype_1_with_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_bp16_no_causalmask_no_attnbias.cpp
similarity index 58%
rename from xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_fp16_masktype_1_with_attnbias.cpp
rename to xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_bp16_no_causalmask_no_attnbias.cpp
index 17da13db7..9a16d8160 100644
--- a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_fp16_masktype_1_with_attnbias.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_bp16_no_causalmask_no_attnbias.cpp
@@ -8,6 +8,5 @@
 
 #include "ck_tiled_fmha_grouped_infer.h"
 
-template void
-run_grouped_infer_masktype_attnbias_dispatched<ck::half_t, 1, true>(GroupedForwardParams& param,
-                                                                    hipStream_t stream);
+template void run_grouped_infer_causalmask_attnbias_dispatched<ck::bhalf_t, false, false>(
+    GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_fp16_masktype_2_with_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_bp16_no_causalmask_with_attnbias.cpp
similarity index 58%
rename from xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_fp16_masktype_2_with_attnbias.cpp
rename to xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_bp16_no_causalmask_with_attnbias.cpp
index e78118baf..9d5260deb 100644
--- a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_fp16_masktype_2_with_attnbias.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_bp16_no_causalmask_with_attnbias.cpp
@@ -8,6 +8,5 @@
 
 #include "ck_tiled_fmha_grouped_infer.h"
 
-template void
-run_grouped_infer_masktype_attnbias_dispatched<ck::half_t, 2, true>(GroupedForwardParams& param,
-                                                                    hipStream_t stream);
+template void run_grouped_infer_causalmask_attnbias_dispatched<ck::bhalf_t, false, true>(
+    GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_fp16_masktype_0_with_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_bp16_with_causalmask_no_attnbias.cpp
similarity index 58%
rename from xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_fp16_masktype_0_with_attnbias.cpp
rename to xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_bp16_with_causalmask_no_attnbias.cpp
index 537e59bd1..716a48b9c 100644
--- a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_fp16_masktype_0_with_attnbias.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_bp16_with_causalmask_no_attnbias.cpp
@@ -8,6 +8,5 @@
 
 #include "ck_tiled_fmha_grouped_infer.h"
 
-template void
-run_grouped_infer_masktype_attnbias_dispatched<ck::half_t, 0, true>(GroupedForwardParams& param,
-                                                                    hipStream_t stream);
+template void run_grouped_infer_causalmask_attnbias_dispatched<ck::bhalf_t, true, false>(
+    GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_bp16_masktype_1_with_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_bp16_with_causalmask_with_attnbias.cpp
similarity index 58%
rename from xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_bp16_masktype_1_with_attnbias.cpp
rename to xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_bp16_with_causalmask_with_attnbias.cpp
index 9f9dd97f1..f79e7ee14 100644
--- a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_bp16_masktype_1_with_attnbias.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_bp16_with_causalmask_with_attnbias.cpp
@@ -8,6 +8,5 @@
 
 #include "ck_tiled_fmha_grouped_infer.h"
 
-template void
-run_grouped_infer_masktype_attnbias_dispatched<ck::bhalf_t, 1, true>(GroupedForwardParams& param,
-                                                                    hipStream_t stream);
+template void run_grouped_infer_causalmask_attnbias_dispatched<ck::bhalf_t, true, true>(
+    GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_fp16_masktype_0_no_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_fp16_masktype_0_no_attnbias.cpp
deleted file mode 100644
index e40ffafc3..000000000
--- a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_fp16_masktype_0_no_attnbias.cpp
+++ /dev/null
@@ -1,13 +0,0 @@
-/*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
- *
- * This source code is licensed under the BSD-style license found in the
- * LICENSE file in the root directory of this source tree.
- */
-#include <ck/ck.hpp>
-
-#include "ck_tiled_fmha_grouped_infer.h"
-
-template void
-run_grouped_infer_masktype_attnbias_dispatched<ck::half_t, 0, false>(GroupedForwardParams& param,
-                                                                     hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_fp16_masktype_1_no_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_fp16_masktype_1_no_attnbias.cpp
deleted file mode 100644
index 919c73a4a..000000000
--- a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_fp16_masktype_1_no_attnbias.cpp
+++ /dev/null
@@ -1,13 +0,0 @@
-/*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
- *
- * This source code is licensed under the BSD-style license found in the
- * LICENSE file in the root directory of this source tree.
- */
-#include <ck/ck.hpp>
-
-#include "ck_tiled_fmha_grouped_infer.h"
-
-template void
-run_grouped_infer_masktype_attnbias_dispatched<ck::half_t, 1, false>(GroupedForwardParams& param,
-                                                                     hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_fp16_masktype_2_no_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_fp16_masktype_2_no_attnbias.cpp
deleted file mode 100644
index e5d08e589..000000000
--- a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_fp16_masktype_2_no_attnbias.cpp
+++ /dev/null
@@ -1,13 +0,0 @@
-/*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
- *
- * This source code is licensed under the BSD-style license found in the
- * LICENSE file in the root directory of this source tree.
- */
-#include <ck/ck.hpp>
-
-#include "ck_tiled_fmha_grouped_infer.h"
-
-template void
-run_grouped_infer_masktype_attnbias_dispatched<ck::half_t, 2, false>(GroupedForwardParams& param,
-                                                                     hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_fp16_no_causalmask_no_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_fp16_no_causalmask_no_attnbias.cpp
new file mode 100644
index 000000000..8a68b03d6
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_fp16_no_causalmask_no_attnbias.cpp
@@ -0,0 +1,12 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_grouped_infer.h"
+
+template void run_grouped_infer_causalmask_attnbias_dispatched<ck::half_t, false, false>(
+    GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_fp16_no_causalmask_with_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_fp16_no_causalmask_with_attnbias.cpp
new file mode 100644
index 000000000..9fb627dc1
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_fp16_no_causalmask_with_attnbias.cpp
@@ -0,0 +1,12 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_grouped_infer.h"
+
+template void run_grouped_infer_causalmask_attnbias_dispatched<ck::half_t, false, true>(
+    GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_fp16_with_causalmask_no_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_fp16_with_causalmask_no_attnbias.cpp
new file mode 100644
index 000000000..dff263668
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_fp16_with_causalmask_no_attnbias.cpp
@@ -0,0 +1,12 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_grouped_infer.h"
+
+template void run_grouped_infer_causalmask_attnbias_dispatched<ck::half_t, true, false>(
+    GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_fp16_with_causalmask_with_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_fp16_with_causalmask_with_attnbias.cpp
new file mode 100644
index 000000000..86cc2f3eb
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_fp16_with_causalmask_with_attnbias.cpp
@@ -0,0 +1,12 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_grouped_infer.h"
+
+template void run_grouped_infer_causalmask_attnbias_dispatched<ck::half_t, true, true>(
+    GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/ops/fmha/ck.py b/xformers/ops/fmha/ck.py
index 143c74f79..a6cd87c6b 100644
--- a/xformers/ops/fmha/ck.py
+++ b/xformers/ops/fmha/ck.py
@@ -181,6 +181,7 @@ def apply(
             seqlen_k=inp.attn_bias.k_seqinfo.seqlen_cpu
             if isinstance(inp.attn_bias, BlockDiagonalCausalWithOffsetPaddedKeysMask)
             else None,
+            window_size=0,
         )
         ctx: Optional[Context] = None
         if needs_gradient:

From ba5fd52b9cb22e22c0cd9c2fd5e682a4bb6433d1 Mon Sep 17 00:00:00 2001
From: Qianfeng Zhang <Qianfeng.Zhang@amd.com>
Date: Fri, 5 Jan 2024 17:33:48 +0000
Subject: [PATCH 319/641] Add ck.FwOp and ck.BwOp to dispatched operations

---
 xformers/ops/fmha/dispatch.py | 22 ++++++++++++++--------
 1 file changed, 14 insertions(+), 8 deletions(-)

diff --git a/xformers/ops/fmha/dispatch.py b/xformers/ops/fmha/dispatch.py
index 30d6ec615..c9708770b 100644
--- a/xformers/ops/fmha/dispatch.py
+++ b/xformers/ops/fmha/dispatch.py
@@ -66,14 +66,20 @@ def _run_priority_list(name: str, priority_list: Sequence[T], inp: Inputs) -> T:
 def _dispatch_fw_priority_list(
     inp: Inputs, needs_gradient: bool
 ) -> Sequence[Type[AttentionFwOpBase]]:
-    priority_list_ops = deque(
-        [
-            flash.FwOp,
-            triton.FwOp,
-            cutlass.FwOp,
-            small_k.FwOp,
-        ]
-    )
+    if torch.version.cuda:
+        priority_list_ops = deque(
+           [
+              flash.FwOp,
+              triton.FwOp,
+              cutlass.FwOp,
+              small_k.FwOp,
+           ])
+    else:
+        priority_list_ops = deque(
+           [
+              triton.FwOp,
+              ck.FwOp,
+           ])
     if _is_cutlass_fwd_faster_than_flash(inp):
         priority_list_ops.remove(cutlass.FwOp)
         priority_list_ops.appendleft(cutlass.FwOp)

From 6533aca6517e3e9fdafbd9e0167166dd722f1510 Mon Sep 17 00:00:00 2001
From: Qianfeng Zhang <Qianfeng.Zhang@amd.com>
Date: Fri, 5 Jan 2024 17:35:00 +0000
Subject: [PATCH 320/641] Add ck.FwOp and ck.BwOp to ALL_FW_OPS and ALL_BW_OPS

---
 xformers/ops/fmha/__init__.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/xformers/ops/fmha/__init__.py b/xformers/ops/fmha/__init__.py
index 3a0f3646b..289e8f6e3 100644
--- a/xformers/ops/fmha/__init__.py
+++ b/xformers/ops/fmha/__init__.py
@@ -416,7 +416,7 @@ def _memory_efficient_attention_backward(
 
 
 ALL_FW_OPS: Sequence[Type[AttentionFwOpBase]] = [
-    cutlass.FwOp,
+    cutlass.FwOp if torch.version.cuda else ck.FwOp,
     flash.FwOp,
     triton.FwOp,
     small_k.FwOp,
@@ -424,7 +424,7 @@ def _memory_efficient_attention_backward(
 ]
 
 ALL_BW_OPS: Sequence[Type[AttentionBwOpBase]] = [
-    cutlass.BwOp,
+    cutlass.BwOp if torch.version.cuda else ck.BwOp,
     flash.BwOp,
     small_k.BwOp,
 ]

From 7fc362068c9624172c16ac88d92dbae77487f7ea Mon Sep 17 00:00:00 2001
From: Qianfeng Zhang <Qianfeng.Zhang@amd.com>
Date: Fri, 5 Jan 2024 17:37:45 +0000
Subject: [PATCH 321/641] Update in tests/readme_test_on_rocm.txt

---
 tests/readme_test_on_rocm.txt | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tests/readme_test_on_rocm.txt b/tests/readme_test_on_rocm.txt
index b2b18ff78..129bf3df0 100644
--- a/tests/readme_test_on_rocm.txt
+++ b/tests/readme_test_on_rocm.txt
@@ -4,6 +4,7 @@
    2. verify testing for memory_efficient_attention inference 
 
       pytest tests/test_mem_eff_attention_ck.py::test_forward
+      pytest tests/test_mem_eff_attention.py::test_forward -k ckF 
 
    3. The following tests in tests/memory_eff_attention_ck.py have passed
 

From 23e191ad508aa599d76f25331b10e01198f6ed64 Mon Sep 17 00:00:00 2001
From: Qianfeng Zhang <Qianfeng.Zhang@amd.com>
Date: Fri, 5 Jan 2024 17:53:57 +0000
Subject: [PATCH 322/641] Add ckF and ck_decoder to
 benchmark_mem_eff_attn_decoder.py

---
 xformers/benchmarks/benchmark_mem_eff_attn_decoder.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/xformers/benchmarks/benchmark_mem_eff_attn_decoder.py b/xformers/benchmarks/benchmark_mem_eff_attn_decoder.py
index 7f1b4ceaa..9fa58e7dd 100644
--- a/xformers/benchmarks/benchmark_mem_eff_attn_decoder.py
+++ b/xformers/benchmarks/benchmark_mem_eff_attn_decoder.py
@@ -59,8 +59,8 @@ def T(t):
 NUM_THREADS = [1] if device.type == "cuda" else [1, 40]
 
 OPS = [
-    xformers.ops.fmha.cutlass.FwOp,
-    xformers.ops.fmha.decoder.FwOp,
+    xformers.ops.fmha.cutlass.FwOp if torch.version.cuda else xformers.ops.fmha.ck.FwOp,
+    xformers.ops.fmha.decoder.FwOp if torch.version.cuda else xformers.ops.fmha.ck_decoder.FwOp,
 ]
 
 KV_SHAPES = [

From 45287b73b15b565a786febbc8b092e15204bb018 Mon Sep 17 00:00:00 2001
From: Qianfeng Zhang <Qianfeng.Zhang@amd.com>
Date: Mon, 8 Jan 2024 21:51:55 +0000
Subject: [PATCH 323/641] Synchronize with the latest ck-tiled commits

---
 third_party/composable_kernel_tiled | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/third_party/composable_kernel_tiled b/third_party/composable_kernel_tiled
index afea7392d..539f9677e 160000
--- a/third_party/composable_kernel_tiled
+++ b/third_party/composable_kernel_tiled
@@ -1 +1 @@
-Subproject commit afea7392d59cbd71247336483f5cf190c0929866
+Subproject commit 539f9677e047da576f67810f7833dd983df3c1f8

From 1a746751dd43ed25d1a0926eb9067f7a76b976ef Mon Sep 17 00:00:00 2001
From: Qianfeng Zhang <Qianfeng.Zhang@amd.com>
Date: Mon, 8 Jan 2024 23:46:22 +0000
Subject: [PATCH 324/641] Add is_ck_tiled_used() c++ extension interface for
 judging if ck-tiled is used

---
 xformers/csrc/attention/hip_fmha/ck_fmha_test.cpp | 13 +++++++++++++
 1 file changed, 13 insertions(+)

diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_test.cpp b/xformers/csrc/attention/hip_fmha/ck_fmha_test.cpp
index 6c7de39ef..571b206fa 100644
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_test.cpp
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_test.cpp
@@ -17,10 +17,23 @@ bool is_ck_fmha_available(double val)
     return (true);
 };
 
+// For checking if ck-tiled kernel is used
+bool is_ck_tiled_used()
+{
+#if defined(USE_CK_TILED_KERNEL)
+    return (true);
+#else
+    return (false);
+#endif
+};
+
 } // namespace
 
 TORCH_LIBRARY_FRAGMENT(xformers, m)
 {
     m.def(TORCH_SELECTIVE_SCHEMA("xformers::is_ck_fmha_available(float val) -> bool"));
     m.impl(TORCH_SELECTIVE_NAME("xformers::is_ck_fmha_available"), TORCH_FN(is_ck_fmha_available));
+
+    m.def(TORCH_SELECTIVE_SCHEMA("xformers::is_ck_tiled_used() -> bool"));
+    m.impl(TORCH_SELECTIVE_NAME("xformers::is_ck_tiled_used"), TORCH_FN(is_ck_tiled_used));
 }

From cbcc1964c6d2f1ed9f3afe94e373f5f6c66eb28b Mon Sep 17 00:00:00 2001
From: Qianfeng Zhang <Qianfeng.Zhang@amd.com>
Date: Tue, 9 Jan 2024 00:12:03 +0000
Subject: [PATCH 325/641] Remove composable_kernel_tiled submodule

---
 .gitmodules                         | 4 ----
 third_party/composable_kernel_tiled | 1 -
 2 files changed, 5 deletions(-)
 delete mode 160000 third_party/composable_kernel_tiled

diff --git a/.gitmodules b/.gitmodules
index acbe24ecc..3017b3887 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -8,7 +8,3 @@
 [submodule "third_party/flash-attention"]
 	path = third_party/flash-attention
 	url = https://github.com/Dao-AILab/flash-attention.git
-[submodule "third_party/composable_kernel_tiled"]
-	path = third_party/composable_kernel_tiled
-	url = https://github.com/asroy/ck_tile
-        branch = fmha_attemp_async_copy_unify 
diff --git a/third_party/composable_kernel_tiled b/third_party/composable_kernel_tiled
deleted file mode 160000
index 539f9677e..000000000
--- a/third_party/composable_kernel_tiled
+++ /dev/null
@@ -1 +0,0 @@
-Subproject commit 539f9677e047da576f67810f7833dd983df3c1f8

From b4539f71c515a4a8941920485a39c09df1993bcf Mon Sep 17 00:00:00 2001
From: Max Podkorytov <4273004+tenpercent@users.noreply.github.com>
Date: Wed, 3 Jan 2024 21:25:38 +0000
Subject: [PATCH 326/641] inner_product removed from splitk kernel code

---
 .../ck_attention_forward_decoder_splitk.h     | 44 -------------------
 1 file changed, 44 deletions(-)

diff --git a/xformers/csrc/attention/hip_fmha/ck_attention_forward_decoder_splitk.h b/xformers/csrc/attention/hip_fmha/ck_attention_forward_decoder_splitk.h
index 29f330b29..49b95e4a4 100644
--- a/xformers/csrc/attention/hip_fmha/ck_attention_forward_decoder_splitk.h
+++ b/xformers/csrc/attention/hip_fmha/ck_attention_forward_decoder_splitk.h
@@ -7,50 +7,6 @@
 #include <ck/utility/inner_product.hpp>
 #include <ck/utility/math.hpp>
 
-namespace ck {
-template <>
-__device__ void inner_product<bhalf_t, bhalf_t, float>(
-    const bhalf_t& a,
-    const bhalf_t& b,
-    float& c) {
-  inner_product(type_convert<float>(a), type_convert<float>(b), c);
-}
-
-template <>
-
-__device__ void inner_product<half_t, half_t, float>(
-    const half_t& a,
-    const half_t& b,
-    float& c) {
-  inner_product(type_convert<float>(a), type_convert<float>(b), c);
-}
-
-template <>
-__device__ void inner_product<bhalf2_t, bhalf2_t, float>(
-    const bhalf2_t& a,
-    const bhalf2_t& b,
-    float& c) {
-  const vector_type<bhalf_t, 2> a_vector{a};
-  const vector_type<bhalf_t, 2> b_vector{b};
-  ck::static_for<0, 2, 1>{}([&](auto i) {
-    inner_product(
-        a_vector.AsType<bhalf_t>()[i], b_vector.AsType<bhalf_t>()[i], c);
-  });
-}
-
-template <>
-__device__ void inner_product<bhalf4_t, bhalf4_t, float>(
-    const bhalf4_t& a,
-    const bhalf4_t& b,
-    float& c) {
-  const vector_type<bhalf_t, 4> a_vector{a};
-  const vector_type<bhalf_t, 4> b_vector{b};
-  ck::static_for<0, 4, 1>{}([&](auto i) {
-    inner_product(
-        a_vector.AsType<bhalf_t>()[i], b_vector.AsType<bhalf_t>()[i], c);
-  });
-}
-} // namespace ck
 
 namespace {
 

From 9c52e0edd0ba2eb186e60ce6dfa43f8c86ff353b Mon Sep 17 00:00:00 2001
From: Max Podkorytov <4273004+tenpercent@users.noreply.github.com>
Date: Wed, 3 Jan 2024 21:35:58 +0000
Subject: [PATCH 327/641] remove some commented out debug code

---
 tests/test_mem_eff_attention_ck.py | 27 +++------------------------
 1 file changed, 3 insertions(+), 24 deletions(-)

diff --git a/tests/test_mem_eff_attention_ck.py b/tests/test_mem_eff_attention_ck.py
index 905226af3..77dbde6d2 100644
--- a/tests/test_mem_eff_attention_ck.py
+++ b/tests/test_mem_eff_attention_ck.py
@@ -401,10 +401,6 @@ def compute_attention_split(q_whole, k_slice, v_slice, attn_bias_slice):
 
     # reduce out over split-k slices
 
-    # return slices[0]["row_max"].repeat_interleave(256, -1)
-    # return slices[0]["row_lse"].repeat_interleave(256, -1)
-    # return slices[0]["attn_slice"]
-
     m_current_max = torch.zeros_like(slices[0]["row_max"]).fill_(float("-inf"))
     l_current_sum = torch.zeros_like(slices[0]["row_lse"])
 
@@ -1755,14 +1751,10 @@ def test_splitk_reference(
 
 
 @pytest.mark.parametrize("op", [fmha.ck_decoder.FwOp])
-# @pytest.mark.parametrize("kv_heads", [None, 1, 2], ids=_kv_heads_label)
-# @pytest.mark.parametrize("bsz,n_heads", [(1, 1), (1, 16), (1, 32), (8, 1), (4, 8)])
-# @pytest.mark.parametrize("padding", [32, 4096])
-# @pytest.mark.parametrize("dtype", ["f16", "bf16", "f32"])
-@pytest.mark.parametrize("dtype", ["f32"])
 @pytest.mark.parametrize("kv_heads", [None, 1, 2], ids=_kv_heads_label)
-@pytest.mark.parametrize("n_heads", [16])
-@pytest.mark.parametrize("padding, bsz", [(32, 8), (4096, 1)])
+@pytest.mark.parametrize("bsz,n_heads", [(1, 1), (1, 16), (1, 32), (8, 1), (4, 8)])
+@pytest.mark.parametrize("padding", [32, 4096])
+@pytest.mark.parametrize("dtype", ["f16", "bf16", "f32"])
 def test_decoder(
     op,
     n_heads: int,
@@ -1816,19 +1808,10 @@ def test_decoder(
     if (not_supported_reasons := op.not_supported_reasons(inp)):
         pytest.skip(f"{not_supported_reasons=}")
 
-    ref_output = ref_attention_splitk(q, k, v, attn_bias, dtype=dtype_, split_k=1)
-
-    print(f"{ref_output.shape=}")
-
     decoder_output = fmha.memory_efficient_attention_forward(
         q, k, v, attn_bias, op=op
     )
 
-    # attn_bias_tensor = attn_bias.materialize(shape=(q.shape[0], 1, q.shape[1], k.shape[1]), device=q.device, dtype=dtype_)
-    # print(f"{k_seqlen=}")
-    # torch.set_printoptions(threshold=None, edgeitems=256)
-    # print(f"{attn_bias_tensor.shape=} {attn_bias_tensor=}")
-
     ref_output = ref_attention(q, k, v, attn_bias)
 
     assert_allclose(
@@ -1844,10 +1827,6 @@ def test_decoder(
 @pytest.mark.parametrize("kv_heads", [None, 1, 2], ids=_kv_heads_label)
 @pytest.mark.parametrize("n_heads", [16])
 @pytest.mark.parametrize("padding, bsz", [(32, 8), (4096, 1), (32, 1), (4096, 8)])
-# @pytest.mark.parametrize("dtype", ["f16"])
-# @pytest.mark.parametrize("kv_heads", [None], ids=_kv_heads_label)
-# @pytest.mark.parametrize("n_heads", [16])
-# @pytest.mark.parametrize("padding, bsz", [(32, 8),])
 def test_splitk_decoder(
     op,
     kv_heads: Optional[int],

From 0a1aa5d0030e79ba5c48782e7babed9723f7bfe5 Mon Sep 17 00:00:00 2001
From: Max Podkorytov <4273004+tenpercent@users.noreply.github.com>
Date: Wed, 3 Jan 2024 21:41:45 +0000
Subject: [PATCH 328/641] comment out debug code calling libtorch instead of
 hip implementation

---
 xformers/csrc/attention/hip_fmha/attention_forward_splitk.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/xformers/csrc/attention/hip_fmha/attention_forward_splitk.cpp b/xformers/csrc/attention/hip_fmha/attention_forward_splitk.cpp
index 79ef348d8..2d6db0284 100644
--- a/xformers/csrc/attention/hip_fmha/attention_forward_splitk.cpp
+++ b/xformers/csrc/attention/hip_fmha/attention_forward_splitk.cpp
@@ -300,7 +300,7 @@ at::Tensor efficient_attention_forward_decoder_splitk_ck(
     double qk_scale,
     int64_t split_k) {
 
-  return efficient_attention_forward_decoder_split1_torch(XQ, cache_K, cache_V, seq_kv_lens, qk_scale);
+  // return efficient_attention_forward_decoder_split1_torch(XQ, cache_K, cache_V, seq_kv_lens, qk_scale);
 
   return efficient_attention_forward_decoder_splitk_ck_impl<
       kThreadsPerWavefront,

From 153d7229718e51c17f99ffbec3a00190e33140a8 Mon Sep 17 00:00:00 2001
From: Max Podkorytov <4273004+tenpercent@users.noreply.github.com>
Date: Wed, 3 Jan 2024 21:43:51 +0000
Subject: [PATCH 329/641] remove commented out old and incorrect code fragments

---
 .../hip_fmha/attention_forward_splitk.cpp     | 80 -------------------
 1 file changed, 80 deletions(-)

diff --git a/xformers/csrc/attention/hip_fmha/attention_forward_splitk.cpp b/xformers/csrc/attention/hip_fmha/attention_forward_splitk.cpp
index 2d6db0284..3fb42ecca 100644
--- a/xformers/csrc/attention/hip_fmha/attention_forward_splitk.cpp
+++ b/xformers/csrc/attention/hip_fmha/attention_forward_splitk.cpp
@@ -91,55 +91,6 @@ struct c10_to_data_t<c10::BFloat16> {
 
 namespace {
 
-// at::Tensor efficient_attention_forward_decoder_splitk_ck(
-//     const at::Tensor& XQ, // [B, 1, G, H, D]
-//     const at::Tensor& cache_K, // [B, KV_M_MAX, G, H or 1, D]
-//     const at::Tensor& cache_V, // [B, KV_M_MAX, G, H or 1, D]
-//     at::optional<at::Tensor> seq_kv_lens, // [B]
-//     double qk_scale,
-//     at::Tensor& O,
-//     int64_t split_k) {
-    
-//     at::OptionalDeviceGuard guard(XQ.device());
-
-//     TORCH_CHECK(XQ.is_cuda());
-//     TORCH_CHECK(cache_K.is_cuda());
-//     TORCH_CHECK(cache_V.is_cuda());
-
-//     TORCH_CHECK(seq_positions.is_cuda());
-
-//     auto M = XQ.size(1);
-//     auto B = XQ.size(0);
-//     auto G = XQ.size(2);
-//     auto H = XQ.size(3);
-//     auto K_q = XQ.size(4);
-//     auto M_k = cache_K.size(1);
-
-//     constexpr auto BLOCK_M = 16;
-//     auto M_ceil = (M + BLOCK_M - 1) / BLOCK_M * BLOCK_M;
-
-//     constexpr auto kThreadsPerWarp = 64;
-//     constexpr auto kWarpsPerBlock = 2; // original uses 2 warps
-
-//     const auto options = at::TensorOptions()
-//                              .dtype(XQ.dtype())
-//                              .layout(at::kStrided)
-//                              .device(XQ.device())
-//                              .requires_grad(false);
-
-//     auto O_splitk = at::empty({B * G * H, split_k, M_ceil, K_q}, options);
-//     auto metadata = at::empty({B * G * H, 2, split_k, M_ceil}, options);
-
-//     dim3 attention_grid = {static_cast<uint32_t>(M / BLOCK_M), static_cast<uint32_t>(B * G * H), static_cast<uint32_t>(split_k)};
-//     dim3 reduce_grid = {static_cast<uint32_t>(B * G * H), static_cast<uint32_t>(M)};
-
-//     dim3 threads = {kThreadsPerWarp * kWarpsPerBlock};
-    
-//     auto O = at::empty_like(XQ);
-
-//     return O;
-// }
-
 template <int32_t ThreadsPerWavefront, int32_t WavefrontsPerBlock,
           int32_t KV_M_MAX = 8192,
           int32_t K_MAX = 256>
@@ -348,37 +299,6 @@ TORCH_LIBRARY_IMPL(xformers, CUDA, m) {
 
 // clang-format on
 
-// static std::tuple<at::Tensor, at::Tensor, at::Tensor> split1_attention_torch(
-//   const at::Tensor& Q,
-//   const at::Tensor& K,
-//   const at::Tensor& V,
-//   const at::Tensor& k_seqlens
-// ) {
-//   auto Q_scaled = Q / sqrt(Q.size(-1));
-//   auto S = at::einsum("bmghk, bnghk -> bmghn", {Q_scaled, K}, at::nullopt);
-
-//   auto m = std::get<0>(at::max(S, /* dim */ 1, /* keepdim */ true));
-//   auto s = at::exp(at::sub(S, m));
-  
-//   // causal mask
-//   for (size_t b = 0; b < k_seqlens.numel(); ++b) {
-//     auto seqlen = k_seqlens[b].item<int64_t>();
-//     at::slice(s[b], /* dim */ -1, /* start */ seqlen, /* end */ -1).zero_();
-//   }
-
-//   auto l = at::sum(s, /* dim */ -1, /* keepdim */ true);
-//   auto O = at::einsum("bmghn, bnghk -> bmghk", {s, V}, at::nullopt);
-//   return std::make_tuple(O, m, l);
-// }
-
-// static at::Tensor split1_reduce_torch(
-//   const at::Tensor& O_splits,
-//   const at::Tensor& m,
-//   const at::Tensor& l
-// ) {
-//   return at::div(O_splits[0], l);
-// }
-
 namespace ck {
 namespace tensor_operation {
 namespace device {

From eea5fef57ee995b8e6a369fafabddaebf30dcdfb Mon Sep 17 00:00:00 2001
From: Max Podkorytov <4273004+tenpercent@users.noreply.github.com>
Date: Wed, 3 Jan 2024 21:53:46 +0000
Subject: [PATCH 330/641] add python version override to cmakelists

---
 xformers/csrc/attention/hip_fmha/CMakeLists.txt | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/xformers/csrc/attention/hip_fmha/CMakeLists.txt b/xformers/csrc/attention/hip_fmha/CMakeLists.txt
index ee208bffe..2bf65f305 100644
--- a/xformers/csrc/attention/hip_fmha/CMakeLists.txt
+++ b/xformers/csrc/attention/hip_fmha/CMakeLists.txt
@@ -11,6 +11,8 @@ set(CMAKE_CXX_FLAGS "-Wall")
 set(CMAKE_CXX_FLAGS_DEBUG "-g -O0")
 set(CMAKE_VERBOSE_MAKEFILE on)
 
+set(py_version 3.9)
+
 set(exe_name attention_forward_decoder_main)
 set(splitk_exe_name attention_forward_splitk_decoder_main)
 set(project_root_dir /xformers)
@@ -18,7 +20,7 @@ set(xformers_csrc ${project_root_dir}/xformers/csrc)
 set(sources ${xformers_csrc}/attention/hip_fmha/attention_forward_decoder.hip)
 set(splitk_sources ${xformers_csrc}/attention/hip_fmha/attention_forward_splitk.hip)
 set(ck_include ${project_root_dir}/third_party/composable_kernel/include/)
-set(torch_include /opt/conda/envs/py_3.8/lib/python3.8/site-packages/torch/include)
+set(torch_include /opt/conda/envs/py_${py_version}/lib/python${py_version}/site-packages/torch/include)
 
 set_source_files_properties(${sources} ${splitk_sources} PROPERTIES LANGUAGE HIP)
 add_executable(${exe_name} ${sources})
@@ -63,12 +65,12 @@ target_include_directories(${splitk_exe_name} PUBLIC
 )
 
 target_link_directories(${exe_name} PUBLIC
-  /opt/conda/envs/py_3.8/lib/python3.8/site-packages/torch/lib # c10, torch
+  /opt/conda/envs/py_${py_version}/lib/python${py_version}/site-packages/torch/lib # c10, torch
   /opt/rocm/hip/lib
 )
 
 target_link_directories(${splitk_exe_name} PUBLIC
-  /opt/conda/envs/py_3.8/lib/python3.8/site-packages/torch/lib # c10, torch
+  /opt/conda/envs/py_${py_version}/lib/python${py_version}/site-packages/torch/lib # c10, torch
   /opt/rocm/hip/lib
 )
 

From d442fbebab0faf8f41cab0b8d1aeb779b95631c8 Mon Sep 17 00:00:00 2001
From: Max Podkorytov <4273004+tenpercent@users.noreply.github.com>
Date: Thu, 4 Jan 2024 02:18:57 +0000
Subject: [PATCH 331/641] add conversion from Argument struct to string; fix
 split1 test crash

 -- fyi device guard needs to be declared to avoid segfaults in the kernel
---
 .../hip_fmha/attention_forward_splitk.cpp     | 98 +++++++++++++++----
 .../ck_attention_forward_decoder_splitk.h     | 39 ++++++++
 2 files changed, 116 insertions(+), 21 deletions(-)

diff --git a/xformers/csrc/attention/hip_fmha/attention_forward_splitk.cpp b/xformers/csrc/attention/hip_fmha/attention_forward_splitk.cpp
index 3fb42ecca..ff9e7953a 100644
--- a/xformers/csrc/attention/hip_fmha/attention_forward_splitk.cpp
+++ b/xformers/csrc/attention/hip_fmha/attention_forward_splitk.cpp
@@ -302,6 +302,7 @@ TORCH_LIBRARY_IMPL(xformers, CUDA, m) {
 namespace ck {
 namespace tensor_operation {
 namespace device {
+
 template <typename scalar_t, typename compute_t = float>
 struct FMHADecoderSplit1DeviceOp : public BaseOperator {
   using DeviceOp = FMHADecoderSplit1DeviceOp;
@@ -395,6 +396,42 @@ struct FMHADecoderSplit1DeviceOp : public BaseOperator {
           grid_dim(grid_dim),
           block_dim(block_dim),
           lds_bytes(lds_bytes) {}
+
+      std::string str() const {
+        std::ostringstream oss;
+        oss << "Argument { " << std::endl <<
+        "    XQ: " << XQ << std::endl <<
+        "    cache_K: " << cache_K << std::endl <<
+        "    cache_V: " << cache_V << std::endl << 
+        "    O: " << O << std::endl <<
+        "    split_O: " << split_O << std::endl <<
+        "    split_max: " << split_max << std::endl <<
+        "    split_sumexp: " << split_sumexp << std::endl <<
+        "    seq_kv_lens: " << seq_kv_lens << std::endl <<
+        "    XQ_stride_b: " << XQ_stride_b << std::endl <<
+        "    XQ_stride_m: " << XQ_stride_m << std::endl <<
+        "    XQ_stride_g: " << XQ_stride_g << std::endl <<
+        "    XQ_stride_h: " << XQ_stride_h << std::endl <<
+        "    K_stride_b: " << K_stride_b << std::endl <<
+        "    K_stride_m: " << K_stride_m << std::endl <<
+        "    K_stride_g: " << K_stride_g << std::endl <<
+        "    K_stride_h: " << K_stride_h << std::endl <<
+        "    O_stride_split: " << O_stride_split << std::endl <<
+        "    Q_size_m: " << Q_size_m << std::endl <<
+        "    Q_size_g: " << Q_size_g << std::endl <<
+        "    Q_size_h: " << Q_size_h << std::endl <<
+        "    Q_size_k: " << Q_size_k << std::endl <<
+        "    K_size_m: " << K_size_m << std::endl <<
+        "    multiquery: " << multiquery << std::endl <<
+        "    qk_scale: " << qk_scale << std::endl <<
+        "    split_k: " << split_k << std::endl <<
+        std::endl <<
+        "    grid_dim: " << grid_dim.x << "." << grid_dim.y << "." << grid_dim.z << std::endl <<
+        "    block_dim: " << block_dim.x << "." << block_dim.y << "." << block_dim.z << std::endl <<
+        "    lds_bytes: " << lds_bytes << std::endl <<
+        "}";
+        return oss.str();
+      }
   };
 
   struct Invoker : public BaseInvoker {
@@ -402,6 +439,9 @@ struct FMHADecoderSplit1DeviceOp : public BaseOperator {
     float Run(
         const Argument& arg,
         const StreamConfig& stream_config = StreamConfig{}) {
+
+      // std::cout << arg.str() << std::endl << "stream_id: " << stream_config.stream_id_ << std::endl;
+
       auto threads_per_wavefront = arg.block_dim.x;
 
       auto Q_size_k_alignment_necessary = 0;
@@ -623,6 +663,9 @@ static std::tuple<at::Tensor, at::Tensor, at::Tensor> split1_attention_hip(
   const at::Tensor& K, 
   const at::Tensor& V, 
   const at::Tensor& seqlen) {
+
+  at::OptionalDeviceGuard guard(XQ.device());
+
   auto B = XQ.size(0);
   auto M = XQ.size(1);
   auto G = XQ.size(2);
@@ -732,23 +775,13 @@ static void test_split1_attention() {
   auto V = at::randn_like(K);
   auto seqlen = at::randint(1062, 1063, {B}, int_options);
   
-  // printf("Run libtorch split1_attention:\n");
-  // auto reference_result = split1_attention_torch(XQ, K, V, seqlen);
+  auto reference_result = split1_attention_torch(XQ, K, V, seqlen);
 
-  printf("Run hip split1_attention:\n");
   auto hip_result = split1_attention_hip(XQ, K, V, seqlen);
 
-  printf("Do comparison for split1_attention:\n");
-
-  // auto O_match_mask = at::isclose(std::get<0>(reference_result), std::get<0>(hip_result), /*atol*/ 1e-3, /*rtol*/ 1e-5, /*equal_nan*/ false);
-  // auto m_match_mask = at::isclose(std::get<1>(reference_result), std::get<1>(hip_result), /*atol*/ 1e-3, /*rtol*/ 1e-5, /*equal_nan*/ false);
-  // auto l_match_mask = at::isclose(std::get<2>(reference_result), std::get<2>(hip_result), /*atol*/ 1e-3, /*rtol*/ 1e-5, /*equal_nan*/ false);
-  // auto O_match_mask = at::isclose(std::get<0>(reference_result), std::get<0>(reference_result), /*atol*/ 1e-3, /*rtol*/ 1e-5, /*equal_nan*/ false);
-  // auto m_match_mask = at::isclose(std::get<1>(reference_result), std::get<1>(reference_result), /*atol*/ 1e-3, /*rtol*/ 1e-5, /*equal_nan*/ false);
-  // auto l_match_mask = at::isclose(std::get<2>(reference_result), std::get<2>(reference_result), /*atol*/ 1e-3, /*rtol*/ 1e-5, /*equal_nan*/ false);
-  auto O_match_mask = at::isclose(std::get<0>(hip_result), std::get<0>(hip_result), /*atol*/ 1e-3, /*rtol*/ 1e-5, /*equal_nan*/ false);
-  auto m_match_mask = at::isclose(std::get<1>(hip_result), std::get<1>(hip_result), /*atol*/ 1e-3, /*rtol*/ 1e-5, /*equal_nan*/ false);
-  auto l_match_mask = at::isclose(std::get<2>(hip_result), std::get<2>(hip_result), /*atol*/ 1e-3, /*rtol*/ 1e-5, /*equal_nan*/ false);
+  auto O_match_mask = at::isclose(std::get<0>(reference_result), std::get<0>(hip_result), /*atol*/ 1e-3, /*rtol*/ 1e-5, /*equal_nan*/ false);
+  auto m_match_mask = at::isclose(std::get<1>(reference_result), std::get<1>(hip_result), /*atol*/ 1e-3, /*rtol*/ 1e-5, /*equal_nan*/ false);
+  auto l_match_mask = at::isclose(std::get<2>(reference_result), std::get<2>(hip_result), /*atol*/ 1e-3, /*rtol*/ 1e-5, /*equal_nan*/ false);
 
   auto O_percent_match = at::sum(O_match_mask.to(torch::kFloat32)) / O_match_mask.numel();
   auto m_percent_match = at::sum(m_match_mask.to(torch::kFloat32)) / m_match_mask.numel();
@@ -768,25 +801,48 @@ static void test_split1_attention() {
 }
 
 static void do_correctness_check() {
+  // const int32_t D = 4 * kThreadsPerWavefront;
+  // const int32_t B = 1;
+  // const int32_t H = 16;
+  // const int32_t G = 2;
+  // const int32_t padding = 4096;
+  // const int32_t num_queries = 1;
+  // auto options = torch::TensorOptions()
+  //                    .dtype(torch::kFloat32)
+  //                    .layout(torch::kStrided)
+  //                    .device(torch::kCUDA, 1)
+  //                    .requires_grad(false);
+  // auto int_options = options.dtype(torch::kInt);
+  // auto XQ = at::randn({B, num_queries, G, H, D}, options);
+  // auto K = at::randn({B, padding, G, H, D}, options);
+  // auto V = at::randn({B, padding, G, H, D}, options);
+  // auto seqlen = at::randint(1062, 1063, {B}, int_options);
+  // double qk_scale = 1. / sqrt(D);
+  // constexpr auto split_k = 1;
+
   const int32_t D = 4 * kThreadsPerWavefront;
   const int32_t B = 1;
-  const int32_t H = 16;
-  const int32_t G = 2;
+  const int32_t Hq = 16;
+  const int32_t Hkv = 16;
+  const int32_t G = Hq / Hkv;
   const int32_t padding = 4096;
   const int32_t num_queries = 1;
+  const auto scalar_type = torch::kFloat32;
   auto options = torch::TensorOptions()
-                     .dtype(torch::kFloat32)
+                     .dtype(scalar_type)
                      .layout(torch::kStrided)
                      .device(torch::kCUDA, 1)
                      .requires_grad(false);
   auto int_options = options.dtype(torch::kInt);
-  auto XQ = at::randn({B, num_queries, G, H, D}, options);
-  auto K = at::randn({B, padding, G, H, D}, options);
-  auto V = at::randn({B, padding, G, H, D}, options);
+  auto XQ = at::randn({B, num_queries, G, Hq, D}, options);
+  auto K = (G == 1) 
+    ? at::randn({B, padding, G, Hkv, D}, options) 
+    : at::randn({B, padding, G, 1, D}, options).expand({B, padding, G, Hq, D});
+  auto V = at::randn_like(K);
   auto seqlen = at::randint(1062, 1063, {B}, int_options);
   double qk_scale = 1. / sqrt(D);
   constexpr auto split_k = 1;
-
+  
   auto result = efficient_attention_forward_decoder_splitk_ck_impl<64, 1>(
       XQ, K, V, seqlen, qk_scale, split_k);
   auto gold_result = efficient_attention_forward_decoder_splitk_ck_impl<64, 16>(
diff --git a/xformers/csrc/attention/hip_fmha/ck_attention_forward_decoder_splitk.h b/xformers/csrc/attention/hip_fmha/ck_attention_forward_decoder_splitk.h
index 49b95e4a4..d73da0cbc 100644
--- a/xformers/csrc/attention/hip_fmha/ck_attention_forward_decoder_splitk.h
+++ b/xformers/csrc/attention/hip_fmha/ck_attention_forward_decoder_splitk.h
@@ -591,6 +591,42 @@ struct FMHADecoderSplitKDeviceOp : public BaseOperator {
           grid_dim(grid_dim),
           block_dim(block_dim),
           lds_bytes(lds_bytes) {}
+
+      std::string str() const {
+        std::ostringstream oss;
+        oss << "Argument { " << std::endl <<
+        "    XQ: " << XQ << std::endl <<
+        "    cache_K: " << cache_K << std::endl <<
+        "    cache_V: " << cache_V << std::endl << 
+        "    O: " << O << std::endl <<
+        "    split_O: " << split_O << std::endl <<
+        "    split_max: " << split_max << std::endl <<
+        "    split_sumexp: " << split_sumexp << std::endl <<
+        "    seq_kv_lens: " << seq_kv_lens << std::endl <<
+        "    XQ_stride_b: " << XQ_stride_b << std::endl <<
+        "    XQ_stride_m: " << XQ_stride_m << std::endl <<
+        "    XQ_stride_g: " << XQ_stride_g << std::endl <<
+        "    XQ_stride_h: " << XQ_stride_h << std::endl <<
+        "    K_stride_b: " << K_stride_b << std::endl <<
+        "    K_stride_m: " << K_stride_m << std::endl <<
+        "    K_stride_g: " << K_stride_g << std::endl <<
+        "    K_stride_h: " << K_stride_h << std::endl <<
+        "    O_stride_split: " << O_stride_split << std::endl <<
+        "    Q_size_m: " << Q_size_m << std::endl <<
+        "    Q_size_g: " << Q_size_g << std::endl <<
+        "    Q_size_h: " << Q_size_h << std::endl <<
+        "    Q_size_k: " << Q_size_k << std::endl <<
+        "    K_size_m: " << K_size_m << std::endl <<
+        "    multiquery: " << multiquery << std::endl <<
+        "    qk_scale: " << qk_scale << std::endl <<
+        "    split_k: " << split_k << std::endl <<
+        std::endl <<
+        "    grid_dim: " << grid_dim.x << "." << grid_dim.y << "." << grid_dim.z << std::endl <<
+        "    block_dim: " << block_dim.x << "." << block_dim.y << "." << block_dim.z << std::endl <<
+        "    lds_bytes: " << lds_bytes << std::endl <<
+        "}";
+        return oss.str();
+      }
   };
 
   struct Invoker : public BaseInvoker {
@@ -598,6 +634,9 @@ struct FMHADecoderSplitKDeviceOp : public BaseOperator {
     float Run(
         const Argument& arg,
         const StreamConfig& stream_config = StreamConfig{}) {
+
+      // std::cout << arg.str() << std::endl << "stream_id: " << stream_config.stream_id_ << std::endl;
+
       auto threads_per_wavefront = arg.block_dim.x;
 
       auto Q_size_k_alignment_necessary = 0;

From 38c5e904b137dc18f54be912c5033f3afd075eb7 Mon Sep 17 00:00:00 2001
From: Max Podkorytov <4273004+tenpercent@users.noreply.github.com>
Date: Fri, 5 Jan 2024 22:34:59 +0000
Subject: [PATCH 332/641] add f32 support in the python op

---
 tests/test_mem_eff_attention_ck.py  | 7 ++++++-
 xformers/ops/fmha/forward_splitk.py | 1 +
 2 files changed, 7 insertions(+), 1 deletion(-)

diff --git a/tests/test_mem_eff_attention_ck.py b/tests/test_mem_eff_attention_ck.py
index 77dbde6d2..f03d9a979 100644
--- a/tests/test_mem_eff_attention_ck.py
+++ b/tests/test_mem_eff_attention_ck.py
@@ -1814,6 +1814,11 @@ def test_decoder(
 
     ref_output = ref_attention(q, k, v, attn_bias)
 
+    # print(f"{torch.where(decoder_output.isnan())=}")
+    # print(f"{torch.sum(decoder_output.isnan())} nans out of {decoder_output.numel()}")
+    # print(f"{torch.sum(decoder_output.isinf())} infs out of {decoder_output.numel()}")
+    # print(f"{k_seqlen=}")
+
     assert_allclose(
         decoder_output.float(),
         ref_output,
@@ -1823,7 +1828,7 @@ def test_decoder(
 
 
 @pytest.mark.parametrize("op", [fmha.forward_splitk.FwOp_S1, fmha.forward_splitk.FwOp_S2])
-@pytest.mark.parametrize("dtype", ["f16"])
+@pytest.mark.parametrize("dtype", ["f32"])
 @pytest.mark.parametrize("kv_heads", [None, 1, 2], ids=_kv_heads_label)
 @pytest.mark.parametrize("n_heads", [16])
 @pytest.mark.parametrize("padding, bsz", [(32, 8), (4096, 1), (32, 1), (4096, 8)])
diff --git a/xformers/ops/fmha/forward_splitk.py b/xformers/ops/fmha/forward_splitk.py
index 0a0651fea..013c605a6 100644
--- a/xformers/ops/fmha/forward_splitk.py
+++ b/xformers/ops/fmha/forward_splitk.py
@@ -12,6 +12,7 @@ class FwOp(AttentionFwOpBase):
     SUPPORTED_DTYPES = {
         torch.half,
         torch.bfloat16,
+        torch.float
     }  # Those are dtypes of Q. In the quantized case K/V has dtype int32
     SUPPORTED_MAX_K = 256
     SUPPORTED_ATTN_BIAS_TYPES: Set[Any] = {

From b805813312bf4698eab1809779505f0fa985e24f Mon Sep 17 00:00:00 2001
From: Max Podkorytov <4273004+tenpercent@users.noreply.github.com>
Date: Fri, 5 Jan 2024 22:42:15 +0000
Subject: [PATCH 333/641] refactor out input generation in cpp standalone

---
 .../hip_fmha/attention_forward_splitk.cpp     | 104 +++++-------------
 1 file changed, 26 insertions(+), 78 deletions(-)

diff --git a/xformers/csrc/attention/hip_fmha/attention_forward_splitk.cpp b/xformers/csrc/attention/hip_fmha/attention_forward_splitk.cpp
index ff9e7953a..bc73473d8 100644
--- a/xformers/csrc/attention/hip_fmha/attention_forward_splitk.cpp
+++ b/xformers/csrc/attention/hip_fmha/attention_forward_splitk.cpp
@@ -753,17 +753,13 @@ static std::tuple<at::Tensor, at::Tensor, at::Tensor> split1_attention_hip(
   return std::make_tuple(split_O[splitk_dim], split_max, split_sumexp);
 }
 
-static void test_split1_attention() {
+std::tuple<at::Tensor, at::Tensor, at::Tensor, at::Tensor> generate_inputs(const int32_t padding, const int32_t B, const int32_t Hq, const int32_t Hkv, const decltype(torch::kFloat32) dtype = torch::kFloat32) {
   const int32_t D = 4 * kThreadsPerWavefront;
-  const int32_t B = 1;
-  const int32_t Hq = 16;
-  const int32_t Hkv = 16;
   const int32_t G = Hq / Hkv;
-  const int32_t padding = 4096;
   const int32_t num_queries = 1;
-  const auto scalar_type = torch::kFloat32;
+
   auto options = torch::TensorOptions()
-                     .dtype(scalar_type)
+                     .dtype(dtype)
                      .layout(torch::kStrided)
                      .device(torch::kCUDA, 1)
                      .requires_grad(false);
@@ -774,6 +770,12 @@ static void test_split1_attention() {
     : at::randn({B, padding, G, 1, D}, options).expand({B, padding, G, Hq, D});
   auto V = at::randn_like(K);
   auto seqlen = at::randint(1062, 1063, {B}, int_options);
+
+  return std::make_tuple(XQ, K, V, seqlen);
+}
+
+static void test_split1_attention() {
+  auto [XQ, K, V, seqlen] = generate_inputs(4096, 1, 16, 16);
   
   auto reference_result = split1_attention_torch(XQ, K, V, seqlen);
 
@@ -801,46 +803,9 @@ static void test_split1_attention() {
 }
 
 static void do_correctness_check() {
-  // const int32_t D = 4 * kThreadsPerWavefront;
-  // const int32_t B = 1;
-  // const int32_t H = 16;
-  // const int32_t G = 2;
-  // const int32_t padding = 4096;
-  // const int32_t num_queries = 1;
-  // auto options = torch::TensorOptions()
-  //                    .dtype(torch::kFloat32)
-  //                    .layout(torch::kStrided)
-  //                    .device(torch::kCUDA, 1)
-  //                    .requires_grad(false);
-  // auto int_options = options.dtype(torch::kInt);
-  // auto XQ = at::randn({B, num_queries, G, H, D}, options);
-  // auto K = at::randn({B, padding, G, H, D}, options);
-  // auto V = at::randn({B, padding, G, H, D}, options);
-  // auto seqlen = at::randint(1062, 1063, {B}, int_options);
-  // double qk_scale = 1. / sqrt(D);
-  // constexpr auto split_k = 1;
+  auto [XQ, K, V, seqlen] = generate_inputs(4096, 1, 16, 16);
 
-  const int32_t D = 4 * kThreadsPerWavefront;
-  const int32_t B = 1;
-  const int32_t Hq = 16;
-  const int32_t Hkv = 16;
-  const int32_t G = Hq / Hkv;
-  const int32_t padding = 4096;
-  const int32_t num_queries = 1;
-  const auto scalar_type = torch::kFloat32;
-  auto options = torch::TensorOptions()
-                     .dtype(scalar_type)
-                     .layout(torch::kStrided)
-                     .device(torch::kCUDA, 1)
-                     .requires_grad(false);
-  auto int_options = options.dtype(torch::kInt);
-  auto XQ = at::randn({B, num_queries, G, Hq, D}, options);
-  auto K = (G == 1) 
-    ? at::randn({B, padding, G, Hkv, D}, options) 
-    : at::randn({B, padding, G, 1, D}, options).expand({B, padding, G, Hq, D});
-  auto V = at::randn_like(K);
-  auto seqlen = at::randint(1062, 1063, {B}, int_options);
-  double qk_scale = 1. / sqrt(D);
+  double qk_scale = 1. / sqrt(XQ.size(-1));
   constexpr auto split_k = 1;
   
   auto result = efficient_attention_forward_decoder_splitk_ck_impl<64, 1>(
@@ -858,54 +823,37 @@ static void do_correctness_check() {
 
 int main(int argc, char** argv) {
   if (argc == 1) {
-    // do_correctness_check();
+    do_correctness_check();
 
-    test_split1_attention();
+    // test_split1_attention();
   } else {
     const auto args = std::vector<std::string>(argv + 1, argv + argc);
-    if (args.size() != 7) {
+    if (args.size() != 6) {
       std::cout
-          << "Usage: ./a.out n_keys padding batch_size n_heads is_multiquery dtype n_wavefronts_per_block"
+          << "Usage: ./a.out padding batch_size nq_heads nkv_heads dtype n_wavefronts_per_block"
           << std::endl;
       return 0;
     }
-    const int32_t n_keys = std::stoi(args[0]);
-    const int32_t padding = std::stoi(args[1]);
-    const int32_t batch_size = std::stoi(args[2]);
-    const int32_t n_heads = std::stoi(args[3]);
-    const int32_t n_groups = 1;
-    const int32_t multiquery = (args[4] == "mq");
-    const auto dtype = (args[5] == "f32") ? torch::kFloat32
-        : (args[5] == "f16")              ? torch::kFloat16
+    const int32_t padding = std::stoi(args[0]);
+    const int32_t batch_size = std::stoi(args[1]);
+    const int32_t nq_heads = std::stoi(args[2]);
+    const int32_t nkv_heads = std::stoi(args[3]);
+    const auto dtype = (args[4] == "f32") ? torch::kFloat32
+        : (args[4] == "f16")              ? torch::kFloat16
                                           : torch::kBFloat16;
-    const int32_t n_wavefronts_per_block = std::stoi(args[6]);
-
-    const int32_t dim_per_head = 4 * kThreadsPerWavefront;
-
-    const auto options = torch::TensorOptions()
-                             .dtype(dtype)
-                             .layout(torch::kStrided)
-                             .device(torch::kCUDA, 1)
-                             .requires_grad(false);
-
-    const auto int_options = options.dtype(torch::kInt);
-    const auto Q = at::rand({batch_size, 1, n_groups, n_heads, dim_per_head}, options);
-    const auto K = multiquery
-        ? at::rand({batch_size, padding, n_groups, 1, dim_per_head}, options)
-              .expand({batch_size, padding, n_groups, n_heads, dim_per_head})
-        : at::rand({batch_size, padding, n_groups, n_heads, dim_per_head}, options);
-    const auto V = at::rand_like(K);
+    const int32_t n_wavefronts_per_block = std::stoi(args[5]);
+
+    auto [Q, K, V, seq] = generate_inputs(padding, batch_size, nq_heads, nkv_heads, dtype);
     auto O = at::empty_like(Q);
 
     constexpr auto splitk_dim = 0;
     constexpr auto split_k = 1;
     auto O_splits = at::stack(O, splitk_dim);
 
-    auto split_max = at::empty({batch_size, padding, n_groups, n_heads, split_k}, options.dtype(at::kFloat));
+    auto split_max = at::empty({batch_size, padding, Q.size(2), Q.size(3), split_k}, Q.options().dtype(at::kFloat));
     auto split_sumexp = at::empty_like(split_max);
 
-    const auto seq = at::randint(1, n_keys, {batch_size}, int_options);
-    const double qk_scale = 1. / sqrt(dim_per_head);
+    const double qk_scale = 1. / sqrt(Q.size(-1));
     auto call_ptr = decltype(&efficient_attention_forward_decoder_splitk_ck_out_impl<
                              kThreadsPerWavefront,
                              kWavefrontsPerBlock>){};

From 03aed2120f23152c3af426fcf117fa33e833cc31 Mon Sep 17 00:00:00 2001
From: Max Podkorytov <4273004+tenpercent@users.noreply.github.com>
Date: Sat, 6 Jan 2024 00:20:59 +0000
Subject: [PATCH 334/641] set loop unrolls to 1 in order to avoid index errors
 (will need to be fixed later for perf)

---
 .../hip_fmha/attention_forward_splitk.cpp     | 28 ++++++++++++++-----
 .../ck_attention_forward_decoder_splitk.h     |  7 +++--
 2 files changed, 25 insertions(+), 10 deletions(-)

diff --git a/xformers/csrc/attention/hip_fmha/attention_forward_splitk.cpp b/xformers/csrc/attention/hip_fmha/attention_forward_splitk.cpp
index bc73473d8..71cabfd7e 100644
--- a/xformers/csrc/attention/hip_fmha/attention_forward_splitk.cpp
+++ b/xformers/csrc/attention/hip_fmha/attention_forward_splitk.cpp
@@ -8,7 +8,7 @@
 
 namespace {
   constexpr int32_t kThreadsPerWavefront = 64;
-  constexpr int32_t kWavefrontsPerBlock = 16;
+  constexpr int32_t kWavefrontsPerBlock = 1;
   constexpr int32_t K_MAX = 4 * kThreadsPerWavefront;
 }
 
@@ -228,6 +228,13 @@ at::Tensor efficient_attention_forward_decoder_splitk_ck_impl(
   efficient_attention_forward_decoder_splitk_ck_out_impl<
       ThreadsPerWavefront,
       WavefrontsPerBlock>(XQ, cache_K, cache_V, seq_kv_lens, qk_scale, split_k, split_max, split_sumexp, O_splits, O);
+  
+  auto nan_count = at::sum(at::isnan(O_splits));
+  auto numel = O_splits.numel();
+  auto inf_count = at::sum(at::isinf(O_splits));
+
+  // std::cout << "O_splits numel: " << numel << "O_splits nans: " << nan_count << "O_splits infs: " << inf_count << std::endl;
+
   return O;
 }
 
@@ -769,7 +776,9 @@ std::tuple<at::Tensor, at::Tensor, at::Tensor, at::Tensor> generate_inputs(const
     ? at::randn({B, padding, G, Hkv, D}, options) 
     : at::randn({B, padding, G, 1, D}, options).expand({B, padding, G, Hq, D});
   auto V = at::randn_like(K);
-  auto seqlen = at::randint(1062, 1063, {B}, int_options);
+  // auto seqlen = at::randint(1, padding + 1, {B}, int_options);
+  // auto seqlen = at::tensor({1062}, int_options);
+  auto seqlen = at::tensor({6, 12, 13, 9, 32, 10, 12, 6}, int_options);
 
   return std::make_tuple(XQ, K, V, seqlen);
 }
@@ -803,22 +812,27 @@ static void test_split1_attention() {
 }
 
 static void do_correctness_check() {
-  auto [XQ, K, V, seqlen] = generate_inputs(4096, 1, 16, 16);
+  auto [XQ, K, V, seqlen] = generate_inputs(32, 8, 16, 16);
 
   double qk_scale = 1. / sqrt(XQ.size(-1));
-  constexpr auto split_k = 1;
+  constexpr auto split_k = 2;
   
   auto result = efficient_attention_forward_decoder_splitk_ck_impl<64, 1>(
       XQ, K, V, seqlen, qk_scale, split_k);
-  auto gold_result = efficient_attention_forward_decoder_splitk_ck_impl<64, 16>(
-      XQ, K, V, seqlen, qk_scale, split_k);
+  auto gold_result = efficient_attention_forward_decoder_split1_torch(
+      XQ, K, V, seqlen, qk_scale);
   auto mask = at::isclose(
       result, gold_result, /*atol*/ 1e-3, /*rtol*/ 1e-5, /*equal_nan*/ false);
   auto percent_match = at::sum(mask.to(torch::kFloat32)) / mask.numel();
+  auto nan_count = at::sum(at::isnan(result));
+  auto numel = result.numel();
+  auto inf_count = at::sum(at::isinf(result));
   printf(
       "Mismatched elements percentage: %.2f\n",
       1. - percent_match.item<float>());
-  printf("k_seqlen: %d\n", seqlen.item<int32_t>());
+  // printf("k_seqlen: %d\n", seqlen.item<int32_t>());
+  std::cout << "numel: " << numel << " nan count: " << nan_count << " inf count: " << inf_count << std::endl;
+  std::cout << "k_seqlen: " << seqlen << std::endl;
 }
 
 int main(int argc, char** argv) {
diff --git a/xformers/csrc/attention/hip_fmha/ck_attention_forward_decoder_splitk.h b/xformers/csrc/attention/hip_fmha/ck_attention_forward_decoder_splitk.h
index d73da0cbc..df34dc6f7 100644
--- a/xformers/csrc/attention/hip_fmha/ck_attention_forward_decoder_splitk.h
+++ b/xformers/csrc/attention/hip_fmha/ck_attention_forward_decoder_splitk.h
@@ -173,8 +173,8 @@ __global__ void efficient_attention_forward_decoder_splitk_reduce_ck_kernel(
 template <
     typename scalar_t,
     int32_t vec_size = 4,
-    int32_t n_loop_unroll = 16,
-    int32_t n_loop_unroll_tail = 2,
+    int32_t n_loop_unroll = 1,
+    int32_t n_loop_unroll_tail = 1,
     int32_t KV_M_MAX = 8192,
     typename compute_t = float>
 __global__ void efficient_attention_forward_decoder_splitk_ck_kernel(
@@ -202,7 +202,8 @@ __global__ void efficient_attention_forward_decoder_splitk_ck_kernel(
     const bool multiquery,
     const float qk_scale,
     const int32_t split_k) {
-  static_assert(n_loop_unroll_tail < n_loop_unroll, "");
+  static_assert(n_loop_unroll_tail < n_loop_unroll || n_loop_unroll_tail == 1, 
+     "tail unroll must be smaller than main loop untoll; pragma unroll 0 is illegal (and tail is no-op)");
 
   // Each block handles a single batch and head and query and group
   const int32_t b = blockIdx.x / (Q_size_m * Q_size_g * Q_size_h);

From 930dda1a5233caf82aadd6d045146d52ea31f01b Mon Sep 17 00:00:00 2001
From: Max Podkorytov <4273004+tenpercent@users.noreply.github.com>
Date: Mon, 8 Jan 2024 14:56:35 -0500
Subject: [PATCH 335/641] fix output splits allocation

---
 .../attention/hip_fmha/attention_forward_splitk.cpp    | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/xformers/csrc/attention/hip_fmha/attention_forward_splitk.cpp b/xformers/csrc/attention/hip_fmha/attention_forward_splitk.cpp
index 71cabfd7e..71c78d18b 100644
--- a/xformers/csrc/attention/hip_fmha/attention_forward_splitk.cpp
+++ b/xformers/csrc/attention/hip_fmha/attention_forward_splitk.cpp
@@ -208,19 +208,19 @@ at::Tensor efficient_attention_forward_decoder_splitk_ck_impl(
     double qk_scale,
     int64_t split_k) {
   auto O = at::empty_like(XQ);
-  constexpr auto splitk_dim = 0;
   constexpr auto rank = 5;
-  auto O_splits = at::stack(O, splitk_dim);
 
   TORCH_CHECK(XQ.dim() == rank);
   TORCH_CHECK(cache_K.dim() == rank);
   TORCH_CHECK(cache_V.dim() == rank);
-  TORCH_CHECK(O_splits.dim() == 1 + rank);
 
   auto B = XQ.size(0);
   auto M = XQ.size(1);
   auto G = XQ.size(2);
   auto H = XQ.size(3);
+  auto K = XQ.size(4);
+  
+  auto O_splits = at::empty({split_k, B, M, G, H, K}, XQ.options());
 
   auto split_max = at::empty({B, M, G, H, split_k}, XQ.options().dtype(at::kFloat));
   auto split_sumexp = at::empty_like(split_max);
@@ -235,6 +235,10 @@ at::Tensor efficient_attention_forward_decoder_splitk_ck_impl(
 
   // std::cout << "O_splits numel: " << numel << "O_splits nans: " << nan_count << "O_splits infs: " << inf_count << std::endl;
 
+  // std::cout << "O splits at (0,0,0,0,0): " << O_splits[0][0][0][0][0][0] << " " << O_splits[1][0][0][0][0][0] << std::endl <<
+  //           "split_max: " << split_max[0][0][0][0][0] << " " << split_max[0][0][0][0][1] << std::endl <<
+  //           "split_sumexp: " << split_sumexp[0][0][0][0][0] << " " << split_sumexp[0][0][0][0][1] << std::endl;
+
   return O;
 }
 

From bd50cf4babd150d12ff3963f30fbdc1ba47e2e9d Mon Sep 17 00:00:00 2001
From: Max Podkorytov <4273004+tenpercent@users.noreply.github.com>
Date: Tue, 9 Jan 2024 16:44:12 -0500
Subject: [PATCH 336/641] fix bug in split attention: sumexp needs timestep
 bounds in each split

---
 tests/test_mem_eff_attention_ck.py            | 13 ++++----
 .../hip_fmha/attention_forward_splitk.cpp     | 16 ++--------
 .../ck_attention_forward_decoder_splitk.h     | 32 ++-----------------
 xformers/ops/fmha/forward_splitk.py           |  4 ---
 4 files changed, 12 insertions(+), 53 deletions(-)

diff --git a/tests/test_mem_eff_attention_ck.py b/tests/test_mem_eff_attention_ck.py
index f03d9a979..5ee0ab2df 100644
--- a/tests/test_mem_eff_attention_ck.py
+++ b/tests/test_mem_eff_attention_ck.py
@@ -1755,6 +1755,7 @@ def test_splitk_reference(
 @pytest.mark.parametrize("bsz,n_heads", [(1, 1), (1, 16), (1, 32), (8, 1), (4, 8)])
 @pytest.mark.parametrize("padding", [32, 4096])
 @pytest.mark.parametrize("dtype", ["f16", "bf16", "f32"])
+@pytest.mark.parametrize("d", [256])
 def test_decoder(
     op,
     n_heads: int,
@@ -1762,9 +1763,9 @@ def test_decoder(
     padding: int,
     bsz: int,
     dtype: str,
+    d: int,
     dequant: bool = False,
     num_queries: int = 1,
-    d = 256,
 ) -> None:
     # kv_heads = 1: multiquery
     # kv_heads = None: neither MQA nor GQA
@@ -1814,11 +1815,6 @@ def test_decoder(
 
     ref_output = ref_attention(q, k, v, attn_bias)
 
-    # print(f"{torch.where(decoder_output.isnan())=}")
-    # print(f"{torch.sum(decoder_output.isnan())} nans out of {decoder_output.numel()}")
-    # print(f"{torch.sum(decoder_output.isinf())} infs out of {decoder_output.numel()}")
-    # print(f"{k_seqlen=}")
-
     assert_allclose(
         decoder_output.float(),
         ref_output,
@@ -1827,10 +1823,11 @@ def test_decoder(
     )
 
 
-@pytest.mark.parametrize("op", [fmha.forward_splitk.FwOp_S1, fmha.forward_splitk.FwOp_S2])
+@pytest.mark.parametrize("op", [fmha.forward_splitk.FwOp_S1, fmha.forward_splitk.FwOp_S2, fmha.forward_splitk.FwOp_S4])
 @pytest.mark.parametrize("dtype", ["f32"])
 @pytest.mark.parametrize("kv_heads", [None, 1, 2], ids=_kv_heads_label)
 @pytest.mark.parametrize("n_heads", [16])
+@pytest.mark.parametrize("d", [256])
 @pytest.mark.parametrize("padding, bsz", [(32, 8), (4096, 1), (32, 1), (4096, 8)])
 def test_splitk_decoder(
     op,
@@ -1839,6 +1836,7 @@ def test_splitk_decoder(
     padding: int,
     bsz: int,
     dtype: str,
+    d: int
 ) -> None:
     # no quantized impl compared to cuda
     test_decoder(
@@ -1848,6 +1846,7 @@ def test_splitk_decoder(
         padding=padding,
         bsz=bsz,
         dtype=dtype,
+        d=d,
     )
 
 
diff --git a/xformers/csrc/attention/hip_fmha/attention_forward_splitk.cpp b/xformers/csrc/attention/hip_fmha/attention_forward_splitk.cpp
index 71c78d18b..fe73dbfbd 100644
--- a/xformers/csrc/attention/hip_fmha/attention_forward_splitk.cpp
+++ b/xformers/csrc/attention/hip_fmha/attention_forward_splitk.cpp
@@ -220,24 +220,14 @@ at::Tensor efficient_attention_forward_decoder_splitk_ck_impl(
   auto H = XQ.size(3);
   auto K = XQ.size(4);
   
-  auto O_splits = at::empty({split_k, B, M, G, H, K}, XQ.options());
+  auto O_splits = at::zeros({split_k, B, M, G, H, K}, XQ.options());
 
-  auto split_max = at::empty({B, M, G, H, split_k}, XQ.options().dtype(at::kFloat));
-  auto split_sumexp = at::empty_like(split_max);
+  auto split_max = at::empty({B, M, G, H, split_k}, XQ.options().dtype(at::kFloat)).fill_(ck::NumericLimits<float>::Lowest());
+  auto split_sumexp = at::zeros_like(split_max);
 
   efficient_attention_forward_decoder_splitk_ck_out_impl<
       ThreadsPerWavefront,
       WavefrontsPerBlock>(XQ, cache_K, cache_V, seq_kv_lens, qk_scale, split_k, split_max, split_sumexp, O_splits, O);
-  
-  auto nan_count = at::sum(at::isnan(O_splits));
-  auto numel = O_splits.numel();
-  auto inf_count = at::sum(at::isinf(O_splits));
-
-  // std::cout << "O_splits numel: " << numel << "O_splits nans: " << nan_count << "O_splits infs: " << inf_count << std::endl;
-
-  // std::cout << "O splits at (0,0,0,0,0): " << O_splits[0][0][0][0][0][0] << " " << O_splits[1][0][0][0][0][0] << std::endl <<
-  //           "split_max: " << split_max[0][0][0][0][0] << " " << split_max[0][0][0][0][1] << std::endl <<
-  //           "split_sumexp: " << split_sumexp[0][0][0][0][0] << " " << split_sumexp[0][0][0][0][1] << std::endl;
 
   return O;
 }
diff --git a/xformers/csrc/attention/hip_fmha/ck_attention_forward_decoder_splitk.h b/xformers/csrc/attention/hip_fmha/ck_attention_forward_decoder_splitk.h
index df34dc6f7..24d57c8b4 100644
--- a/xformers/csrc/attention/hip_fmha/ck_attention_forward_decoder_splitk.h
+++ b/xformers/csrc/attention/hip_fmha/ck_attention_forward_decoder_splitk.h
@@ -271,32 +271,6 @@ __global__ void efficient_attention_forward_decoder_splitk_ck_kernel(
   const int32_t tt_tail_low = wavefront_idx * n_loop_unroll_tail + n_unrolled_loops * dtt * (split_idx + 1);
   const int32_t tt_tail_high = (split_idx == split_k - 1) ? t_max : tt_tail_low;
 
-  // if (lane_idx == 0)
-  //   printf("wavefront_idx: %d "
-  //          "t_max: %d " 
-  //          "(runtime) wavefronts_per_block: %d "
-  //          "n_loop_unroll: %d "
-  //          "n_loop_unroll_tail: %d "
-  //          "dtt: %d "
-  //          "n_unrolled_loops: %d "
-  //          "tt_low: %d "
-  //          "tt_high: %d "
-  //          "dtt_tail: %d "
-  //          "tt_tail_low: %d "
-  //          "tt_tail_high: %d "
-  //          "\n", 
-  //          wavefront_idx,
-  //          t_max, 
-  //          wavefronts_per_block,
-  //          n_loop_unroll,
-  //          n_loop_unroll_tail,
-  //          dtt,
-  //          n_unrolled_loops,
-  //          tt_low,
-  //          tt_high,
-  //          dtt_tail,
-  //          tt_tail_low,
-  //          tt_tail_high);
   for (auto tt = tt_low; tt < tt_high; tt += dtt) {
     if (lane_active_for_io) {
 #pragma unroll n_loop_unroll
@@ -380,7 +354,9 @@ __global__ void efficient_attention_forward_decoder_splitk_ck_kernel(
   // each wavefront computes partial sum of exp.
   compute_t softmax_denominator = 0.0f;
   for (int32_t t = thread_linear_idx; t < t_max; t += threads_per_block) {
-    softmax_denominator += ck::math::exp(smem[t] - max_qk_acc);
+    if (t >= tt_low && t < tt_tail_high) {
+      softmax_denominator += ck::math::exp(smem[t] - max_qk_acc);
+    }
   }
   softmax_denominator = wavefrontReduce(
       softmax_denominator, [](auto a, auto b) { return a + b; });
@@ -636,8 +612,6 @@ struct FMHADecoderSplitKDeviceOp : public BaseOperator {
         const Argument& arg,
         const StreamConfig& stream_config = StreamConfig{}) {
 
-      // std::cout << arg.str() << std::endl << "stream_id: " << stream_config.stream_id_ << std::endl;
-
       auto threads_per_wavefront = arg.block_dim.x;
 
       auto Q_size_k_alignment_necessary = 0;
diff --git a/xformers/ops/fmha/forward_splitk.py b/xformers/ops/fmha/forward_splitk.py
index 013c605a6..49238f83d 100644
--- a/xformers/ops/fmha/forward_splitk.py
+++ b/xformers/ops/fmha/forward_splitk.py
@@ -141,12 +141,8 @@ def apply(
         else:
             qk_scale = torch.rsqrt(torch.tensor(k.shape[-1], dtype=torch.float32))
 
-        print(f"{q.shape=} {k.shape=} {v.shape=}")
-
         out = cls.OPERATOR(query=query, key=key, value=value, seq_positions=seq_positions_gpu, scale=qk_scale, split_k=split_k)
         
-        print(f"{out.shape=}")
-        
         return out, None
 
 
From 60c997d03496a595e074aa3ef064ad5c9678bdbc Mon Sep 17 00:00:00 2001
From: Max Podkorytov <4273004+tenpercent@users.noreply.github.com>
Date: Tue, 9 Jan 2024 16:58:48 -0500
Subject: [PATCH 337/641] clang-format-10

---
 xformers/csrc/attention/attention.cpp         |   72 +-
 .../hip_fmha/attention_forward_splitk.cpp     | 1576 +++++++++--------
 .../ck_attention_forward_decoder_splitk.h     | 1328 +++++++-------
 3 files changed, 1542 insertions(+), 1434 deletions(-)

diff --git a/xformers/csrc/attention/attention.cpp b/xformers/csrc/attention/attention.cpp
index c0dcc014b..42f8216fb 100644
--- a/xformers/csrc/attention/attention.cpp
+++ b/xformers/csrc/attention/attention.cpp
@@ -7,37 +7,51 @@
  */
 #include <torch/types.h>
 
-TORCH_LIBRARY_FRAGMENT(xformers, m) {
+TORCH_LIBRARY_FRAGMENT(xformers, m)
+{
 #if !defined(USE_ROCM)
-  m.def(TORCH_SELECTIVE_SCHEMA(
-      "xformers::efficient_attention_forward_small_k(Tensor query, Tensor key, Tensor value, bool compute_logsumexp, Tensor? attn_bias, float p) -> (Tensor, Tensor, int, int)"));
-  m.def(TORCH_SELECTIVE_SCHEMA(
-      "xformers::efficient_attention_forward_cutlass(Tensor query, Tensor key, Tensor value, Tensor? attn_bias, Tensor? seqstart_q, Tensor? seqstart_k, int? max_seqlen_q, float dropout_p, bool compute_logsumexp, int custom_mask_type, float? scale, Tensor? seqlen_k, int? window_size) -> (Tensor, Tensor, int, int)"));
-  m.def(TORCH_SELECTIVE_SCHEMA(
-      "xformers::efficient_attention_forward_decoder(Tensor query, Tensor key, Tensor value, Tensor seq_positions, float scale) -> Tensor"));
-  m.def(TORCH_SELECTIVE_SCHEMA(
-      "xformers::efficient_attention_backward_small_k(Tensor grad_out, Tensor query, Tensor key, Tensor value, Tensor logsumexp, Tensor output, Tensor? attn_bias, float p, int rng_seed, int rng_offset) -> (Tensor, Tensor, Tensor)"));
-  m.def(TORCH_SELECTIVE_SCHEMA(
-      "xformers::efficient_attention_backward_cutlass(Tensor grad_out, Tensor query, Tensor key, Tensor value, Tensor? bias, Tensor? cu_seqlens_q, Tensor? cu_seqlens_k, int max_seqlen_q, int max_seqlen_k, Tensor logsumexp, Tensor output, float dropout_p, int rng_seed, int rng_offset, int custom_mask_type, float? scale, int num_splits_key, int? window_size) -> (Tensor, Tensor, Tensor, Tensor)"));
-  m.def(TORCH_SELECTIVE_SCHEMA(
-      "xformers::_temp_dropout(Tensor out, float p) -> Tensor"));
-  m.def(TORCH_SELECTIVE_SCHEMA(
-      "xformers::_cutlass_rand_uniform(float p, Tensor out) -> Tensor"));
+    m.def(TORCH_SELECTIVE_SCHEMA(
+        "xformers::efficient_attention_forward_small_k(Tensor query, Tensor key, Tensor value, "
+        "bool compute_logsumexp, Tensor? attn_bias, float p) -> (Tensor, Tensor, int, int)"));
+    m.def(TORCH_SELECTIVE_SCHEMA(
+        "xformers::efficient_attention_forward_cutlass(Tensor query, Tensor key, Tensor value, "
+        "Tensor? attn_bias, Tensor? seqstart_q, Tensor? seqstart_k, int? max_seqlen_q, float "
+        "dropout_p, bool compute_logsumexp, int custom_mask_type, float? scale, Tensor? seqlen_k, "
+        "int? window_size) -> (Tensor, Tensor, int, int)"));
+    m.def(
+        TORCH_SELECTIVE_SCHEMA("xformers::efficient_attention_forward_decoder(Tensor query, Tensor "
+                               "key, Tensor value, Tensor seq_positions, float scale) -> Tensor"));
+    m.def(TORCH_SELECTIVE_SCHEMA(
+        "xformers::efficient_attention_backward_small_k(Tensor grad_out, Tensor query, Tensor key, "
+        "Tensor value, Tensor logsumexp, Tensor output, Tensor? attn_bias, float p, int rng_seed, "
+        "int rng_offset) -> (Tensor, Tensor, Tensor)"));
+    m.def(TORCH_SELECTIVE_SCHEMA(
+        "xformers::efficient_attention_backward_cutlass(Tensor grad_out, Tensor query, Tensor key, "
+        "Tensor value, Tensor? bias, Tensor? cu_seqlens_q, Tensor? cu_seqlens_k, int max_seqlen_q, "
+        "int max_seqlen_k, Tensor logsumexp, Tensor output, float dropout_p, int rng_seed, int "
+        "rng_offset, int custom_mask_type, float? scale, int num_splits_key, int? window_size) -> "
+        "(Tensor, Tensor, Tensor, Tensor)"));
+    m.def(TORCH_SELECTIVE_SCHEMA("xformers::_temp_dropout(Tensor out, float p) -> Tensor"));
+    m.def(TORCH_SELECTIVE_SCHEMA("xformers::_cutlass_rand_uniform(float p, Tensor out) -> Tensor"));
 #endif
 #if defined(USE_ROCM)
-  m.def(TORCH_SELECTIVE_SCHEMA(
-      "xformers::efficient_attention_forward_ck(Tensor query, "
-          "Tensor key, Tensor value, Tensor? attn_bias, Tensor? seqstart_q, "
-          "Tensor? seqstart_k, int? max_seqlen_q, float dropout_p, "
-          "bool compute_logsumexp, int custom_mask_type, float? scale, Tensor? seqlen_k) -> (Tensor, Tensor, int, int)"));
-  m.def(TORCH_SELECTIVE_SCHEMA(
-      "xformers::efficient_attention_forward_decoder_ck(Tensor query, "
-          "Tensor key, Tensor value, Tensor? seq_positions, float scale) -> Tensor"));
-  m.def(TORCH_SELECTIVE_SCHEMA(
-      "xformers::efficient_attention_backward_ck(Tensor grad_out, Tensor query, Tensor key, Tensor value, Tensor? attn_bias, Tensor? seqstart_q, Tensor? seqstart_k, int? max_seqlen_q, Tensor? seqlen_k, Tensor logsumexp, Tensor output, float dropout_p, int rng_seed, int rng_offset, int custom_mask_type, float? scale) -> (Tensor, Tensor, Tensor, Tensor)"));
-  m.def(TORCH_SELECTIVE_SCHEMA(
-      "xformers::efficient_attention_forward_decoder_splitk_ck(Tensor query, Tensor key, Tensor value, Tensor? seq_positions, float scale, int split_k) -> Tensor"));
-  m.def(TORCH_SELECTIVE_SCHEMA(
-      "xformers::_ck_rand_uniform(float p, Tensor out) -> Tensor"));
+    m.def(TORCH_SELECTIVE_SCHEMA("xformers::efficient_attention_forward_ck(Tensor query, "
+                                 "Tensor key, Tensor value, Tensor? attn_bias, Tensor? seqstart_q, "
+                                 "Tensor? seqstart_k, int? max_seqlen_q, float dropout_p, "
+                                 "bool compute_logsumexp, int custom_mask_type, float? scale, "
+                                 "Tensor? seqlen_k) -> (Tensor, Tensor, int, int)"));
+    m.def(TORCH_SELECTIVE_SCHEMA(
+        "xformers::efficient_attention_forward_decoder_ck(Tensor query, "
+        "Tensor key, Tensor value, Tensor? seq_positions, float scale) -> Tensor"));
+    m.def(TORCH_SELECTIVE_SCHEMA(
+        "xformers::efficient_attention_backward_ck(Tensor grad_out, Tensor query, Tensor key, "
+        "Tensor value, Tensor? attn_bias, Tensor? seqstart_q, Tensor? seqstart_k, int? "
+        "max_seqlen_q, Tensor? seqlen_k, Tensor logsumexp, Tensor output, float dropout_p, int "
+        "rng_seed, int rng_offset, int custom_mask_type, float? scale) -> (Tensor, Tensor, Tensor, "
+        "Tensor)"));
+    m.def(TORCH_SELECTIVE_SCHEMA(
+        "xformers::efficient_attention_forward_decoder_splitk_ck(Tensor query, Tensor key, Tensor "
+        "value, Tensor? seq_positions, float scale, int split_k) -> Tensor"));
+    m.def(TORCH_SELECTIVE_SCHEMA("xformers::_ck_rand_uniform(float p, Tensor out) -> Tensor"));
 #endif
 }
diff --git a/xformers/csrc/attention/hip_fmha/attention_forward_splitk.cpp b/xformers/csrc/attention/hip_fmha/attention_forward_splitk.cpp
index fe73dbfbd..61dac9a8b 100644
--- a/xformers/csrc/attention/hip_fmha/attention_forward_splitk.cpp
+++ b/xformers/csrc/attention/hip_fmha/attention_forward_splitk.cpp
@@ -7,54 +7,57 @@
 #include "ck_attention_forward_decoder_splitk.h"
 
 namespace {
-  constexpr int32_t kThreadsPerWavefront = 64;
-  constexpr int32_t kWavefrontsPerBlock = 1;
-  constexpr int32_t K_MAX = 4 * kThreadsPerWavefront;
-}
+constexpr int32_t kThreadsPerWavefront = 64;
+constexpr int32_t kWavefrontsPerBlock  = 1;
+constexpr int32_t K_MAX                = 4 * kThreadsPerWavefront;
+} // namespace
 
 static std::tuple<at::Tensor, at::Tensor, at::Tensor> split1_attention_torch(
-  const at::Tensor& Q,
-  const at::Tensor& K,
-  const at::Tensor& V,
-  const at::Tensor& k_seqlens
-) {
-  auto Q_scaled = at::div(Q, sqrt(Q.size(-1)));
-  auto S = at::einsum("mghk, nghk -> mghn", {Q_scaled.flatten(0, 1), K.flatten(0, 1)}, /* einsum eval path */ at::nullopt);
-
-  // for (size_t i = 0; i < S.dim(); ++i) {
-  //   std::cout << "S.dim" << i << "=" << S.size(i) << std::endl; 
-  // }
-
-  // causal mask
-  auto neg_inf = at::tensor(-99.).item();
-  for (size_t b = 0; b < k_seqlens.numel(); ++b) {
-    auto seqlen = k_seqlens[b].item<int64_t>();
-    at::slice(S[b], /* dim */ -1, /* start */ 0, /* end */ b * K.size(1)).fill_(neg_inf);
-    at::slice(S[b], /* dim */ -1, /* start */ b * K.size(1) + seqlen, /* end */ S.size(-1)).fill_(neg_inf);
-    // std::cout << "batch" << b << " ; masked QK^T dim " << S[b].dim() << " values at h0 " << S[b].slice(1, 0, 1) << std::endl;
-  }
-
-  auto m = std::get<0>(at::max(S, /* dim */ -1, /* keepdim */ true));
-  auto s = at::exp(at::sub(S, m));
-  
-  // causal mask
-  for (size_t b = 0; b < k_seqlens.numel(); ++b) {
-    auto seqlen = k_seqlens[b].item<int64_t>();
-    at::slice(s[b], /* dim */ -1, /* start */ 0, /* end */ b * K.size(1)).zero_();
-    at::slice(s[b], /* dim */ -1, /* start */ b * K.size(1) + seqlen, /* end */ s.size(-1)).zero_();
-  }
-
-  auto l = at::sum(s, /* dim */ -1, /* keepdim */ true);
-  auto O = at::einsum("mghn, nghk -> mghk", {s, V.flatten(0, 1)}, /* einsum eval path */ at::nullopt);
-  return std::make_tuple(O, m, l);
+    const at::Tensor& Q, const at::Tensor& K, const at::Tensor& V, const at::Tensor& k_seqlens)
+{
+    auto Q_scaled = at::div(Q, sqrt(Q.size(-1)));
+    auto S        = at::einsum("mghk, nghk -> mghn",
+                        {Q_scaled.flatten(0, 1), K.flatten(0, 1)},
+                        /* einsum eval path */ at::nullopt);
+
+    // for (size_t i = 0; i < S.dim(); ++i) {
+    //   std::cout << "S.dim" << i << "=" << S.size(i) << std::endl;
+    // }
+
+    // causal mask
+    auto neg_inf = at::tensor(-99.).item();
+    for(size_t b = 0; b < k_seqlens.numel(); ++b)
+    {
+        auto seqlen = k_seqlens[b].item<int64_t>();
+        at::slice(S[b], /* dim */ -1, /* start */ 0, /* end */ b * K.size(1)).fill_(neg_inf);
+        at::slice(S[b], /* dim */ -1, /* start */ b * K.size(1) + seqlen, /* end */ S.size(-1))
+            .fill_(neg_inf);
+        // std::cout << "batch" << b << " ; masked QK^T dim " << S[b].dim() << " values at h0 " <<
+        // S[b].slice(1, 0, 1) << std::endl;
+    }
+
+    auto m = std::get<0>(at::max(S, /* dim */ -1, /* keepdim */ true));
+    auto s = at::exp(at::sub(S, m));
+
+    // causal mask
+    for(size_t b = 0; b < k_seqlens.numel(); ++b)
+    {
+        auto seqlen = k_seqlens[b].item<int64_t>();
+        at::slice(s[b], /* dim */ -1, /* start */ 0, /* end */ b * K.size(1)).zero_();
+        at::slice(s[b], /* dim */ -1, /* start */ b * K.size(1) + seqlen, /* end */ s.size(-1))
+            .zero_();
+    }
+
+    auto l = at::sum(s, /* dim */ -1, /* keepdim */ true);
+    auto O =
+        at::einsum("mghn, nghk -> mghk", {s, V.flatten(0, 1)}, /* einsum eval path */ at::nullopt);
+    return std::make_tuple(O, m, l);
 }
 
-static at::Tensor split1_reduce_torch(
-  const at::Tensor& O_splits,
-  const at::Tensor& m,
-  const at::Tensor& l
-) {
-  return at::div(O_splits, l);
+static at::Tensor
+split1_reduce_torch(const at::Tensor& O_splits, const at::Tensor& m, const at::Tensor& l)
+{
+    return at::div(O_splits, l);
 }
 
 namespace {
@@ -62,209 +65,213 @@ namespace {
 template <typename c10_t>
 struct c10_to_data_t;
 template <>
-struct c10_to_data_t<float> {
-  using type = float;
+struct c10_to_data_t<float>
+{
+    using type = float;
 };
 
 template <>
-struct c10_to_data_t<c10::Half> {
-  using type = ck::half_t;
+struct c10_to_data_t<c10::Half>
+{
+    using type = ck::half_t;
 };
 
 template <>
-struct c10_to_data_t<c10::BFloat16> {
-  using type = ck::bhalf_t;
+struct c10_to_data_t<c10::BFloat16>
+{
+    using type = ck::bhalf_t;
 };
-}
+} // namespace
 
 #define AT_DISPATCH_CASE_3(SCALARTYPE1, SCALARTYPE2, SCALARTYPE3, ...) \
-  AT_DISPATCH_CASE(SCALARTYPE1, __VA_ARGS__)                           \
-  AT_DISPATCH_CASE(SCALARTYPE2, __VA_ARGS__)                           \
-  AT_DISPATCH_CASE(SCALARTYPE3, __VA_ARGS__)
+    AT_DISPATCH_CASE(SCALARTYPE1, __VA_ARGS__)                         \
+    AT_DISPATCH_CASE(SCALARTYPE2, __VA_ARGS__)                         \
+    AT_DISPATCH_CASE(SCALARTYPE3, __VA_ARGS__)
 
-#define AT_DISPATCH_SWITCH_3(                               \
-    SCALARTYPE1, SCALARTYPE2, SCALARTYPE3, TYPE, NAME, ...) \
-  AT_DISPATCH_SWITCH(                                       \
-      TYPE,                                                 \
-      NAME,                                                 \
-      AT_DISPATCH_CASE_3(SCALARTYPE1, SCALARTYPE2, SCALARTYPE3, __VA_ARGS__))
+#define AT_DISPATCH_SWITCH_3(SCALARTYPE1, SCALARTYPE2, SCALARTYPE3, TYPE, NAME, ...) \
+    AT_DISPATCH_SWITCH(                                                              \
+        TYPE, NAME, AT_DISPATCH_CASE_3(SCALARTYPE1, SCALARTYPE2, SCALARTYPE3, __VA_ARGS__))
 
 namespace {
 
-template <int32_t ThreadsPerWavefront, int32_t WavefrontsPerBlock,
+template <int32_t ThreadsPerWavefront,
+          int32_t WavefrontsPerBlock,
           int32_t KV_M_MAX = 8192,
-          int32_t K_MAX = 256>
+          int32_t K_MAX    = 256>
 at::Tensor& efficient_attention_forward_decoder_splitk_ck_out_impl(
-    const at::Tensor& XQ, // [B, 1, G, H, D]
-    const at::Tensor& cache_K, // [B, KV_M_MAX, G, H or 1, D]
-    const at::Tensor& cache_V, // [B, KV_M_MAX, G, H or 1, D]
+    const at::Tensor& XQ,                 // [B, 1, G, H, D]
+    const at::Tensor& cache_K,            // [B, KV_M_MAX, G, H or 1, D]
+    const at::Tensor& cache_V,            // [B, KV_M_MAX, G, H or 1, D]
     at::optional<at::Tensor> seq_kv_lens, // [B]
     double qk_scale,
     int64_t split_k,
     at::Tensor& split_max,
     at::Tensor& split_sumexp,
     at::Tensor& split_O,
-    at::Tensor& O) {
-  static_assert(4 * ThreadsPerWavefront == K_MAX, "");
-  static_assert(WavefrontsPerBlock <= ThreadsPerWavefront, "");
-
-  at::OptionalDeviceGuard guard(XQ.device());
-  TORCH_CHECK(XQ.is_cuda());
-  TORCH_CHECK(cache_K.is_cuda());
-  TORCH_CHECK(cache_V.is_cuda());
-
-  TORCH_CHECK(!seq_kv_lens || seq_kv_lens->is_cuda());
-
-  TORCH_CHECK(cache_K.size(1) <= KV_M_MAX);
-  TORCH_CHECK(cache_K.size(4) <= K_MAX);
-
-  constexpr auto rank = 5;
-
-  auto B = XQ.size(0);
-  auto M = XQ.size(1);
-  auto G = XQ.size(2);
-  auto H = XQ.size(3);
-
-  TORCH_CHECK(B <= 1024);
-  TORCH_CHECK(M <= 1024);
-  TORCH_CHECK(H <= 1024);
-
-  dim3 blocks(B * H * M * G, split_k);
-  dim3 threads(ThreadsPerWavefront, WavefrontsPerBlock);
-
-  int32_t smem_softmax = KV_M_MAX * sizeof(float) + threads.y * sizeof(float);
-  int32_t smem_output = K_MAX * sizeof(float) *
-      threads.y; // 4 * threadsPerBlock * sizeof(float) == sizeof(O[b][0][h][:])
-  const size_t lds_bytes = max(smem_softmax, smem_output);
-  auto stream = at::cuda::getCurrentHIPStream().stream();
-
-  AT_DISPATCH_SWITCH_3(
-      at::ScalarType::Half,
-      at::ScalarType::BFloat16,
-      at::ScalarType::Float,
-      XQ.scalar_type(),
-      "efficient_attention_forward_decoder_splitk_ck",
-      [&] {
-        using ck_data_t = c10_to_data_t<scalar_t>::type;
-        using device_op_t =
-            ck::tensor_operation::device::FMHADecoderSplitKDeviceOp<ck_data_t>;
-        auto op = device_op_t{};
-
-        auto XQ_acc =
-            XQ.packed_accessor32<scalar_t, rank, at::RestrictPtrTraits>();
-        auto K_acc =
-            cache_K.packed_accessor64<scalar_t, rank, at::RestrictPtrTraits>();
-        auto V_acc =
-            cache_V.packed_accessor64<scalar_t, rank, at::RestrictPtrTraits>();
-        auto split_O_acc = split_O.packed_accessor32<scalar_t, 1 + rank, at::RestrictPtrTraits>();
-        auto O_acc = O.packed_accessor32<scalar_t, rank, at::RestrictPtrTraits>();
-        auto seq_acc = seq_kv_lens ?
-            seq_kv_lens->packed_accessor32<int32_t, 1, at::RestrictPtrTraits>().data() : nullptr;
-        auto split_max_acc = split_max.packed_accessor32<float, rank, at::RestrictPtrTraits>();
-        auto split_sumexp_acc = split_sumexp.packed_accessor32<float, rank, at::RestrictPtrTraits>();
-        auto arg = device_op_t::Argument(
-            reinterpret_cast<const ck_data_t* __restrict__>(XQ_acc.data()),
-            reinterpret_cast<const ck_data_t* __restrict__>(K_acc.data()),
-            reinterpret_cast<const ck_data_t* __restrict__>(V_acc.data()),
-            reinterpret_cast<ck_data_t* __restrict__>(O_acc.data()),
-            reinterpret_cast<ck_data_t* __restrict__>(split_O_acc.data()),
-            split_max_acc.data(),
-            split_sumexp_acc.data(),
-            seq_acc,
-            XQ_acc.stride(0),
-            XQ_acc.stride(1),
-            XQ_acc.stride(2),
-            XQ_acc.stride(3),
-            K_acc.stride(0),
-            K_acc.stride(1),
-            K_acc.stride(2),
-            K_acc.stride(3),
-            split_O_acc.stride(0),
-            XQ_acc.size(1),
-            XQ_acc.size(2),
-            XQ_acc.size(3),
-            XQ_acc.size(4),
-            K_acc.size(1),
-            K_acc.size(3) == 1,
-            qk_scale,
-            split_k,
-            blocks,
-            threads,
-            lds_bytes);
-
-        auto invoker = device_op_t::Invoker{};
-        (void)invoker.Run(arg, {stream});
-      });
-
-  return O;
+    at::Tensor& O)
+{
+    static_assert(4 * ThreadsPerWavefront == K_MAX, "");
+    static_assert(WavefrontsPerBlock <= ThreadsPerWavefront, "");
+
+    at::OptionalDeviceGuard guard(XQ.device());
+    TORCH_CHECK(XQ.is_cuda());
+    TORCH_CHECK(cache_K.is_cuda());
+    TORCH_CHECK(cache_V.is_cuda());
+
+    TORCH_CHECK(!seq_kv_lens || seq_kv_lens->is_cuda());
+
+    TORCH_CHECK(cache_K.size(1) <= KV_M_MAX);
+    TORCH_CHECK(cache_K.size(4) <= K_MAX);
+
+    constexpr auto rank = 5;
+
+    auto B = XQ.size(0);
+    auto M = XQ.size(1);
+    auto G = XQ.size(2);
+    auto H = XQ.size(3);
+
+    TORCH_CHECK(B <= 1024);
+    TORCH_CHECK(M <= 1024);
+    TORCH_CHECK(H <= 1024);
+
+    dim3 blocks(B * H * M * G, split_k);
+    dim3 threads(ThreadsPerWavefront, WavefrontsPerBlock);
+
+    int32_t smem_softmax = KV_M_MAX * sizeof(float) + threads.y * sizeof(float);
+    int32_t smem_output  = K_MAX * sizeof(float) *
+                          threads.y; // 4 * threadsPerBlock * sizeof(float) == sizeof(O[b][0][h][:])
+    const size_t lds_bytes = max(smem_softmax, smem_output);
+    auto stream            = at::cuda::getCurrentHIPStream().stream();
+
+    AT_DISPATCH_SWITCH_3(
+        at::ScalarType::Half,
+        at::ScalarType::BFloat16,
+        at::ScalarType::Float,
+        XQ.scalar_type(),
+        "efficient_attention_forward_decoder_splitk_ck",
+        [&] {
+            using ck_data_t   = c10_to_data_t<scalar_t>::type;
+            using device_op_t = ck::tensor_operation::device::FMHADecoderSplitKDeviceOp<ck_data_t>;
+            auto op           = device_op_t{};
+
+            auto XQ_acc = XQ.packed_accessor32<scalar_t, rank, at::RestrictPtrTraits>();
+            auto K_acc  = cache_K.packed_accessor64<scalar_t, rank, at::RestrictPtrTraits>();
+            auto V_acc  = cache_V.packed_accessor64<scalar_t, rank, at::RestrictPtrTraits>();
+            auto split_O_acc =
+                split_O.packed_accessor32<scalar_t, 1 + rank, at::RestrictPtrTraits>();
+            auto O_acc = O.packed_accessor32<scalar_t, rank, at::RestrictPtrTraits>();
+            auto seq_acc =
+                seq_kv_lens
+                    ? seq_kv_lens->packed_accessor32<int32_t, 1, at::RestrictPtrTraits>().data()
+                    : nullptr;
+            auto split_max_acc = split_max.packed_accessor32<float, rank, at::RestrictPtrTraits>();
+            auto split_sumexp_acc =
+                split_sumexp.packed_accessor32<float, rank, at::RestrictPtrTraits>();
+            auto arg = device_op_t::Argument(
+                reinterpret_cast<const ck_data_t* __restrict__>(XQ_acc.data()),
+                reinterpret_cast<const ck_data_t* __restrict__>(K_acc.data()),
+                reinterpret_cast<const ck_data_t* __restrict__>(V_acc.data()),
+                reinterpret_cast<ck_data_t* __restrict__>(O_acc.data()),
+                reinterpret_cast<ck_data_t* __restrict__>(split_O_acc.data()),
+                split_max_acc.data(),
+                split_sumexp_acc.data(),
+                seq_acc,
+                XQ_acc.stride(0),
+                XQ_acc.stride(1),
+                XQ_acc.stride(2),
+                XQ_acc.stride(3),
+                K_acc.stride(0),
+                K_acc.stride(1),
+                K_acc.stride(2),
+                K_acc.stride(3),
+                split_O_acc.stride(0),
+                XQ_acc.size(1),
+                XQ_acc.size(2),
+                XQ_acc.size(3),
+                XQ_acc.size(4),
+                K_acc.size(1),
+                K_acc.size(3) == 1,
+                qk_scale,
+                split_k,
+                blocks,
+                threads,
+                lds_bytes);
+
+            auto invoker = device_op_t::Invoker{};
+            (void)invoker.Run(arg, {stream});
+        });
+
+    return O;
 }
 
 template <int32_t ThreadsPerWavefront, int32_t WavefrontsPerBlock>
 at::Tensor efficient_attention_forward_decoder_splitk_ck_impl(
-    const at::Tensor& XQ, // [B, 1, G, H, D]
-    const at::Tensor& cache_K, // [B, KV_M_MAX, G, H or 1, D]
-    const at::Tensor& cache_V, // [B, KV_M_MAX, H or 1, D]
+    const at::Tensor& XQ,                 // [B, 1, G, H, D]
+    const at::Tensor& cache_K,            // [B, KV_M_MAX, G, H or 1, D]
+    const at::Tensor& cache_V,            // [B, KV_M_MAX, H or 1, D]
     at::optional<at::Tensor> seq_kv_lens, // [B]
     double qk_scale,
-    int64_t split_k) {
-  auto O = at::empty_like(XQ);
-  constexpr auto rank = 5;
-
-  TORCH_CHECK(XQ.dim() == rank);
-  TORCH_CHECK(cache_K.dim() == rank);
-  TORCH_CHECK(cache_V.dim() == rank);
-
-  auto B = XQ.size(0);
-  auto M = XQ.size(1);
-  auto G = XQ.size(2);
-  auto H = XQ.size(3);
-  auto K = XQ.size(4);
-  
-  auto O_splits = at::zeros({split_k, B, M, G, H, K}, XQ.options());
-
-  auto split_max = at::empty({B, M, G, H, split_k}, XQ.options().dtype(at::kFloat)).fill_(ck::NumericLimits<float>::Lowest());
-  auto split_sumexp = at::zeros_like(split_max);
-
-  efficient_attention_forward_decoder_splitk_ck_out_impl<
-      ThreadsPerWavefront,
-      WavefrontsPerBlock>(XQ, cache_K, cache_V, seq_kv_lens, qk_scale, split_k, split_max, split_sumexp, O_splits, O);
-
-  return O;
+    int64_t split_k)
+{
+    auto O              = at::empty_like(XQ);
+    constexpr auto rank = 5;
+
+    TORCH_CHECK(XQ.dim() == rank);
+    TORCH_CHECK(cache_K.dim() == rank);
+    TORCH_CHECK(cache_V.dim() == rank);
+
+    auto B = XQ.size(0);
+    auto M = XQ.size(1);
+    auto G = XQ.size(2);
+    auto H = XQ.size(3);
+    auto K = XQ.size(4);
+
+    auto O_splits = at::zeros({split_k, B, M, G, H, K}, XQ.options());
+
+    auto split_max = at::empty({B, M, G, H, split_k}, XQ.options().dtype(at::kFloat))
+                         .fill_(ck::NumericLimits<float>::Lowest());
+    auto split_sumexp = at::zeros_like(split_max);
+
+    efficient_attention_forward_decoder_splitk_ck_out_impl<ThreadsPerWavefront, WavefrontsPerBlock>(
+        XQ, cache_K, cache_V, seq_kv_lens, qk_scale, split_k, split_max, split_sumexp, O_splits, O);
+
+    return O;
 }
 
 at::Tensor efficient_attention_forward_decoder_split1_torch(
-  const at::Tensor& XQ, // [B, 1, G, H, D]
-  const at::Tensor& cache_K, // [B, KV_M_MAX, G, H or 1, D]
-  const at::Tensor& cache_V, // [B, KV_M_MAX, G, H or 1, D]
-  at::optional<at::Tensor> seq_kv_lens, // [B]
-  double qk_scale
-) {
-  auto [O_split, m, l] = split1_attention_torch(XQ, cache_K, cache_V, *seq_kv_lens);
-  auto O = split1_reduce_torch(O_split, m, l);
-  return O.reshape_as(XQ);
+    const at::Tensor& XQ,                 // [B, 1, G, H, D]
+    const at::Tensor& cache_K,            // [B, KV_M_MAX, G, H or 1, D]
+    const at::Tensor& cache_V,            // [B, KV_M_MAX, G, H or 1, D]
+    at::optional<at::Tensor> seq_kv_lens, // [B]
+    double qk_scale)
+{
+    auto [O_split, m, l] = split1_attention_torch(XQ, cache_K, cache_V, *seq_kv_lens);
+    auto O               = split1_reduce_torch(O_split, m, l);
+    return O.reshape_as(XQ);
 }
 
 at::Tensor efficient_attention_forward_decoder_splitk_ck(
-    const at::Tensor& XQ, // [B, 1, G, H, D]
-    const at::Tensor& cache_K, // [B, KV_M_MAX, G, H or 1, D]
-    const at::Tensor& cache_V, // [B, KV_M_MAX, G, H or 1, D]
+    const at::Tensor& XQ,                 // [B, 1, G, H, D]
+    const at::Tensor& cache_K,            // [B, KV_M_MAX, G, H or 1, D]
+    const at::Tensor& cache_V,            // [B, KV_M_MAX, G, H or 1, D]
     at::optional<at::Tensor> seq_kv_lens, // [B]
     double qk_scale,
-    int64_t split_k) {
+    int64_t split_k)
+{
 
-  // return efficient_attention_forward_decoder_split1_torch(XQ, cache_K, cache_V, seq_kv_lens, qk_scale);
+    // return efficient_attention_forward_decoder_split1_torch(XQ, cache_K, cache_V, seq_kv_lens,
+    // qk_scale);
 
-  return efficient_attention_forward_decoder_splitk_ck_impl<
-      kThreadsPerWavefront,
-      kWavefrontsPerBlock>(XQ, cache_K, cache_V, seq_kv_lens, qk_scale, split_k);
+    return efficient_attention_forward_decoder_splitk_ck_impl<kThreadsPerWavefront,
+                                                              kWavefrontsPerBlock>(
+        XQ, cache_K, cache_V, seq_kv_lens, qk_scale, split_k);
 }
 } // namespace
 
-
-TORCH_LIBRARY_IMPL(xformers, CUDA, m) {
-  m.impl(
-      TORCH_SELECTIVE_NAME("xformers::efficient_attention_forward_decoder_splitk_ck"),
-      TORCH_FN(efficient_attention_forward_decoder_splitk_ck));
+TORCH_LIBRARY_IMPL(xformers, CUDA, m)
+{
+    m.impl(TORCH_SELECTIVE_NAME("xformers::efficient_attention_forward_decoder_splitk_ck"),
+           TORCH_FN(efficient_attention_forward_decoder_splitk_ck));
 }
 
 #ifdef ATTN_FWD_SPLITK_DECODER_MAIN
@@ -305,595 +312,630 @@ namespace tensor_operation {
 namespace device {
 
 template <typename scalar_t, typename compute_t = float>
-struct FMHADecoderSplit1DeviceOp : public BaseOperator {
-  using DeviceOp = FMHADecoderSplit1DeviceOp;
-  struct Argument : public BaseArgument {
-    const scalar_t* __restrict__ XQ;
-    const scalar_t* __restrict__ cache_K;
-    const scalar_t* __restrict__ cache_V;
-    scalar_t* __restrict__ O;
-    scalar_t* __restrict__ split_O;
-    compute_t* __restrict__ split_max;
-    compute_t* __restrict__ split_sumexp;
-    const int32_t* __restrict__ seq_kv_lens;
-    const ptrdiff_t XQ_stride_b;
-    const ptrdiff_t XQ_stride_m;
-    const ptrdiff_t XQ_stride_g;
-    const ptrdiff_t XQ_stride_h;
-    const ptrdiff_t K_stride_b;
-    const ptrdiff_t K_stride_m;
-    const ptrdiff_t K_stride_g;
-    const ptrdiff_t K_stride_h;
-    const ptrdiff_t O_stride_split;
-    const int32_t Q_size_m;
-    const int32_t Q_size_g;
-    const int32_t Q_size_h;
-    const int32_t Q_size_k;
-    const int32_t K_size_m;
-    const bool multiquery;
-    const float qk_scale;
-    const int32_t split_k;
-
-    const dim3 grid_dim;
-    const dim3 block_dim;
-    const size_t lds_bytes;
-
-    Argument(
-        const scalar_t* __restrict__ XQ,
-        const scalar_t* __restrict__ cache_K,
-        const scalar_t* __restrict__ cache_V,
-        scalar_t* __restrict__ O,
-        scalar_t* __restrict__ split_O,
-        compute_t* __restrict__ split_max,
-        compute_t* __restrict__ split_sumexp,
-        const int32_t* __restrict__ seq_kv_lens,
-        const ptrdiff_t XQ_stride_b,
-        const ptrdiff_t XQ_stride_m,
-        const ptrdiff_t XQ_stride_g,
-        const ptrdiff_t XQ_stride_h,
-        const ptrdiff_t K_stride_b,
-        const ptrdiff_t K_stride_m,
-        const ptrdiff_t K_stride_g,
-        const ptrdiff_t K_stride_h,
-        const ptrdiff_t O_stride_split,
-        const int32_t Q_size_m,
-        const int32_t Q_size_g,
-        const int32_t Q_size_h,
-        const int32_t Q_size_k,
-        const int32_t K_size_m,
-        const bool multiquery,
-        const float qk_scale,
-        const int32_t split_k,
-        // launch params
-        const dim3 grid_dim,
-        const dim3 block_dim,
-        const size_t lds_bytes)
-        : XQ(XQ),
-          cache_K(cache_K),
-          cache_V(cache_V),
-          O(O),
-          split_O(split_O),
-          split_max(split_max),
-          split_sumexp(split_sumexp),
-          seq_kv_lens(seq_kv_lens),
-          XQ_stride_b(XQ_stride_b),
-          XQ_stride_m(XQ_stride_m),
-          XQ_stride_g(XQ_stride_g),
-          XQ_stride_h(XQ_stride_h),
-          K_stride_b(K_stride_b),
-          K_stride_m(K_stride_m),
-          K_stride_g(K_stride_g),
-          K_stride_h(K_stride_h),
-          O_stride_split(O_stride_split),
-          Q_size_m(Q_size_m),
-          Q_size_g(Q_size_g),
-          Q_size_h(Q_size_h),
-          Q_size_k(Q_size_k),
-          K_size_m(K_size_m),
-          multiquery(multiquery),
-          qk_scale(qk_scale),
-          split_k(split_k),
-          // launch params
-          grid_dim(grid_dim),
-          block_dim(block_dim),
-          lds_bytes(lds_bytes) {}
-
-      std::string str() const {
-        std::ostringstream oss;
-        oss << "Argument { " << std::endl <<
-        "    XQ: " << XQ << std::endl <<
-        "    cache_K: " << cache_K << std::endl <<
-        "    cache_V: " << cache_V << std::endl << 
-        "    O: " << O << std::endl <<
-        "    split_O: " << split_O << std::endl <<
-        "    split_max: " << split_max << std::endl <<
-        "    split_sumexp: " << split_sumexp << std::endl <<
-        "    seq_kv_lens: " << seq_kv_lens << std::endl <<
-        "    XQ_stride_b: " << XQ_stride_b << std::endl <<
-        "    XQ_stride_m: " << XQ_stride_m << std::endl <<
-        "    XQ_stride_g: " << XQ_stride_g << std::endl <<
-        "    XQ_stride_h: " << XQ_stride_h << std::endl <<
-        "    K_stride_b: " << K_stride_b << std::endl <<
-        "    K_stride_m: " << K_stride_m << std::endl <<
-        "    K_stride_g: " << K_stride_g << std::endl <<
-        "    K_stride_h: " << K_stride_h << std::endl <<
-        "    O_stride_split: " << O_stride_split << std::endl <<
-        "    Q_size_m: " << Q_size_m << std::endl <<
-        "    Q_size_g: " << Q_size_g << std::endl <<
-        "    Q_size_h: " << Q_size_h << std::endl <<
-        "    Q_size_k: " << Q_size_k << std::endl <<
-        "    K_size_m: " << K_size_m << std::endl <<
-        "    multiquery: " << multiquery << std::endl <<
-        "    qk_scale: " << qk_scale << std::endl <<
-        "    split_k: " << split_k << std::endl <<
-        std::endl <<
-        "    grid_dim: " << grid_dim.x << "." << grid_dim.y << "." << grid_dim.z << std::endl <<
-        "    block_dim: " << block_dim.x << "." << block_dim.y << "." << block_dim.z << std::endl <<
-        "    lds_bytes: " << lds_bytes << std::endl <<
-        "}";
-        return oss.str();
-      }
-  };
-
-  struct Invoker : public BaseInvoker {
-    using Argument = DeviceOp::Argument;
-    float Run(
-        const Argument& arg,
-        const StreamConfig& stream_config = StreamConfig{}) {
-
-      // std::cout << arg.str() << std::endl << "stream_id: " << stream_config.stream_id_ << std::endl;
-
-      auto threads_per_wavefront = arg.block_dim.x;
-
-      auto Q_size_k_alignment_necessary = 0;
-
-      for (auto vec_size : {4, 2, 1}) {
-        if (arg.Q_size_k <= vec_size * threads_per_wavefront) {
-          Q_size_k_alignment_necessary = vec_size;
+struct FMHADecoderSplit1DeviceOp : public BaseOperator
+{
+    using DeviceOp = FMHADecoderSplit1DeviceOp;
+    struct Argument : public BaseArgument
+    {
+        const scalar_t* __restrict__ XQ;
+        const scalar_t* __restrict__ cache_K;
+        const scalar_t* __restrict__ cache_V;
+        scalar_t* __restrict__ O;
+        scalar_t* __restrict__ split_O;
+        compute_t* __restrict__ split_max;
+        compute_t* __restrict__ split_sumexp;
+        const int32_t* __restrict__ seq_kv_lens;
+        const ptrdiff_t XQ_stride_b;
+        const ptrdiff_t XQ_stride_m;
+        const ptrdiff_t XQ_stride_g;
+        const ptrdiff_t XQ_stride_h;
+        const ptrdiff_t K_stride_b;
+        const ptrdiff_t K_stride_m;
+        const ptrdiff_t K_stride_g;
+        const ptrdiff_t K_stride_h;
+        const ptrdiff_t O_stride_split;
+        const int32_t Q_size_m;
+        const int32_t Q_size_g;
+        const int32_t Q_size_h;
+        const int32_t Q_size_k;
+        const int32_t K_size_m;
+        const bool multiquery;
+        const float qk_scale;
+        const int32_t split_k;
+
+        const dim3 grid_dim;
+        const dim3 block_dim;
+        const size_t lds_bytes;
+
+        Argument(const scalar_t* __restrict__ XQ,
+                 const scalar_t* __restrict__ cache_K,
+                 const scalar_t* __restrict__ cache_V,
+                 scalar_t* __restrict__ O,
+                 scalar_t* __restrict__ split_O,
+                 compute_t* __restrict__ split_max,
+                 compute_t* __restrict__ split_sumexp,
+                 const int32_t* __restrict__ seq_kv_lens,
+                 const ptrdiff_t XQ_stride_b,
+                 const ptrdiff_t XQ_stride_m,
+                 const ptrdiff_t XQ_stride_g,
+                 const ptrdiff_t XQ_stride_h,
+                 const ptrdiff_t K_stride_b,
+                 const ptrdiff_t K_stride_m,
+                 const ptrdiff_t K_stride_g,
+                 const ptrdiff_t K_stride_h,
+                 const ptrdiff_t O_stride_split,
+                 const int32_t Q_size_m,
+                 const int32_t Q_size_g,
+                 const int32_t Q_size_h,
+                 const int32_t Q_size_k,
+                 const int32_t K_size_m,
+                 const bool multiquery,
+                 const float qk_scale,
+                 const int32_t split_k,
+                 // launch params
+                 const dim3 grid_dim,
+                 const dim3 block_dim,
+                 const size_t lds_bytes)
+            : XQ(XQ),
+              cache_K(cache_K),
+              cache_V(cache_V),
+              O(O),
+              split_O(split_O),
+              split_max(split_max),
+              split_sumexp(split_sumexp),
+              seq_kv_lens(seq_kv_lens),
+              XQ_stride_b(XQ_stride_b),
+              XQ_stride_m(XQ_stride_m),
+              XQ_stride_g(XQ_stride_g),
+              XQ_stride_h(XQ_stride_h),
+              K_stride_b(K_stride_b),
+              K_stride_m(K_stride_m),
+              K_stride_g(K_stride_g),
+              K_stride_h(K_stride_h),
+              O_stride_split(O_stride_split),
+              Q_size_m(Q_size_m),
+              Q_size_g(Q_size_g),
+              Q_size_h(Q_size_h),
+              Q_size_k(Q_size_k),
+              K_size_m(K_size_m),
+              multiquery(multiquery),
+              qk_scale(qk_scale),
+              split_k(split_k),
+              // launch params
+              grid_dim(grid_dim),
+              block_dim(block_dim),
+              lds_bytes(lds_bytes)
+        {
         }
-      }
-
-      if (!Q_size_k_alignment_necessary) {
-        throw std::runtime_error("Unsupported Q_size_k");
-      }
-
-      if (arg.Q_size_k % Q_size_k_alignment_necessary) {
-        throw std::runtime_error("Unsupported alignment for Q_size_k");
-      }
-
-      float split_attention_result = launch_and_time_kernel(
-          stream_config,
-          Q_size_k_alignment_necessary == 4
-              ? efficient_attention_forward_decoder_splitk_ck_kernel<scalar_t, 4>
-              : Q_size_k_alignment_necessary == 2
-              ? efficient_attention_forward_decoder_splitk_ck_kernel<scalar_t, 2>
-              : Q_size_k_alignment_necessary == 1
-              ? efficient_attention_forward_decoder_splitk_ck_kernel<scalar_t, 1>
-              : nullptr,
-          arg.grid_dim,
-          arg.block_dim,
-          arg.lds_bytes,
-          arg.XQ,
-          arg.cache_K,
-          arg.cache_V,
-          arg.split_O,
-          arg.split_max,
-          arg.split_sumexp,
-          arg.seq_kv_lens,
-          arg.XQ_stride_b,
-          arg.XQ_stride_m,
-          arg.XQ_stride_g,
-          arg.XQ_stride_h,
-          arg.K_stride_b,
-          arg.K_stride_m,
-          arg.K_stride_g,
-          arg.K_stride_h,
-          arg.O_stride_split,
-          arg.Q_size_m,
-          arg.Q_size_g,
-          arg.Q_size_h,
-          arg.Q_size_k,
-          arg.K_size_m,
-          arg.multiquery,
-          arg.qk_scale,
-          arg.split_k);
-
-        return split_attention_result;
-    }
-  };
+
+        std::string str() const
+        {
+            std::ostringstream oss;
+            oss << "Argument { " << std::endl
+                << "    XQ: " << XQ << std::endl
+                << "    cache_K: " << cache_K << std::endl
+                << "    cache_V: " << cache_V << std::endl
+                << "    O: " << O << std::endl
+                << "    split_O: " << split_O << std::endl
+                << "    split_max: " << split_max << std::endl
+                << "    split_sumexp: " << split_sumexp << std::endl
+                << "    seq_kv_lens: " << seq_kv_lens << std::endl
+                << "    XQ_stride_b: " << XQ_stride_b << std::endl
+                << "    XQ_stride_m: " << XQ_stride_m << std::endl
+                << "    XQ_stride_g: " << XQ_stride_g << std::endl
+                << "    XQ_stride_h: " << XQ_stride_h << std::endl
+                << "    K_stride_b: " << K_stride_b << std::endl
+                << "    K_stride_m: " << K_stride_m << std::endl
+                << "    K_stride_g: " << K_stride_g << std::endl
+                << "    K_stride_h: " << K_stride_h << std::endl
+                << "    O_stride_split: " << O_stride_split << std::endl
+                << "    Q_size_m: " << Q_size_m << std::endl
+                << "    Q_size_g: " << Q_size_g << std::endl
+                << "    Q_size_h: " << Q_size_h << std::endl
+                << "    Q_size_k: " << Q_size_k << std::endl
+                << "    K_size_m: " << K_size_m << std::endl
+                << "    multiquery: " << multiquery << std::endl
+                << "    qk_scale: " << qk_scale << std::endl
+                << "    split_k: " << split_k << std::endl
+                << std::endl
+                << "    grid_dim: " << grid_dim.x << "." << grid_dim.y << "." << grid_dim.z
+                << std::endl
+                << "    block_dim: " << block_dim.x << "." << block_dim.y << "." << block_dim.z
+                << std::endl
+                << "    lds_bytes: " << lds_bytes << std::endl
+                << "}";
+            return oss.str();
+        }
+    };
+
+    struct Invoker : public BaseInvoker
+    {
+        using Argument = DeviceOp::Argument;
+        float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{})
+        {
+
+            // std::cout << arg.str() << std::endl << "stream_id: " << stream_config.stream_id_ <<
+            // std::endl;
+
+            auto threads_per_wavefront = arg.block_dim.x;
+
+            auto Q_size_k_alignment_necessary = 0;
+
+            for(auto vec_size : {4, 2, 1})
+            {
+                if(arg.Q_size_k <= vec_size * threads_per_wavefront)
+                {
+                    Q_size_k_alignment_necessary = vec_size;
+                }
+            }
+
+            if(!Q_size_k_alignment_necessary)
+            {
+                throw std::runtime_error("Unsupported Q_size_k");
+            }
+
+            if(arg.Q_size_k % Q_size_k_alignment_necessary)
+            {
+                throw std::runtime_error("Unsupported alignment for Q_size_k");
+            }
+
+            float split_attention_result = launch_and_time_kernel(
+                stream_config,
+                Q_size_k_alignment_necessary == 4
+                    ? efficient_attention_forward_decoder_splitk_ck_kernel<scalar_t, 4>
+                    : Q_size_k_alignment_necessary == 2
+                          ? efficient_attention_forward_decoder_splitk_ck_kernel<scalar_t, 2>
+                          : Q_size_k_alignment_necessary == 1
+                                ? efficient_attention_forward_decoder_splitk_ck_kernel<scalar_t, 1>
+                                : nullptr,
+                arg.grid_dim,
+                arg.block_dim,
+                arg.lds_bytes,
+                arg.XQ,
+                arg.cache_K,
+                arg.cache_V,
+                arg.split_O,
+                arg.split_max,
+                arg.split_sumexp,
+                arg.seq_kv_lens,
+                arg.XQ_stride_b,
+                arg.XQ_stride_m,
+                arg.XQ_stride_g,
+                arg.XQ_stride_h,
+                arg.K_stride_b,
+                arg.K_stride_m,
+                arg.K_stride_g,
+                arg.K_stride_h,
+                arg.O_stride_split,
+                arg.Q_size_m,
+                arg.Q_size_g,
+                arg.Q_size_h,
+                arg.Q_size_k,
+                arg.K_size_m,
+                arg.multiquery,
+                arg.qk_scale,
+                arg.split_k);
+
+            return split_attention_result;
+        }
+    };
 };
 
 template <typename scalar_t, typename compute_t = float>
-struct FMHADecoderReduceDeviceOp : public BaseOperator {
-  using DeviceOp = FMHADecoderReduceDeviceOp;
-  struct Argument : public BaseArgument {
-    const scalar_t* __restrict__ XQ;
-    const scalar_t* __restrict__ cache_K;
-    const scalar_t* __restrict__ cache_V;
-    scalar_t* __restrict__ O;
-    scalar_t* __restrict__ split_O;
-    compute_t* __restrict__ split_max;
-    compute_t* __restrict__ split_sumexp;
-    const int32_t* __restrict__ seq_kv_lens;
-    const ptrdiff_t XQ_stride_b;
-    const ptrdiff_t XQ_stride_m;
-    const ptrdiff_t XQ_stride_g;
-    const ptrdiff_t XQ_stride_h;
-    const ptrdiff_t K_stride_b;
-    const ptrdiff_t K_stride_m;
-    const ptrdiff_t K_stride_g;
-    const ptrdiff_t K_stride_h;
-    const ptrdiff_t O_stride_split;
-    const int32_t Q_size_m;
-    const int32_t Q_size_g;
-    const int32_t Q_size_h;
-    const int32_t Q_size_k;
-    const int32_t K_size_m;
-    const bool multiquery;
-    const float qk_scale;
-    const int32_t split_k;
-
-    const dim3 grid_dim;
-    const dim3 block_dim;
-    const size_t lds_bytes;
-
-    Argument(
-        const scalar_t* __restrict__ XQ,
-        const scalar_t* __restrict__ cache_K,
-        const scalar_t* __restrict__ cache_V,
-        scalar_t* __restrict__ O,
-        scalar_t* __restrict__ split_O,
-        compute_t* __restrict__ split_max,
-        compute_t* __restrict__ split_sumexp,
-        const int32_t* __restrict__ seq_kv_lens,
-        const ptrdiff_t XQ_stride_b,
-        const ptrdiff_t XQ_stride_m,
-        const ptrdiff_t XQ_stride_g,
-        const ptrdiff_t XQ_stride_h,
-        const ptrdiff_t K_stride_b,
-        const ptrdiff_t K_stride_m,
-        const ptrdiff_t K_stride_g,
-        const ptrdiff_t K_stride_h,
-        const ptrdiff_t O_stride_split,
-        const int32_t Q_size_m,
-        const int32_t Q_size_g,
-        const int32_t Q_size_h,
-        const int32_t Q_size_k,
-        const int32_t K_size_m,
-        const bool multiquery,
-        const float qk_scale,
-        const int32_t split_k,
-        // launch params
-        const dim3 grid_dim,
-        const dim3 block_dim,
-        const size_t lds_bytes)
-        : XQ(XQ),
-          cache_K(cache_K),
-          cache_V(cache_V),
-          O(O),
-          split_O(split_O),
-          split_max(split_max),
-          split_sumexp(split_sumexp),
-          seq_kv_lens(seq_kv_lens),
-          XQ_stride_b(XQ_stride_b),
-          XQ_stride_m(XQ_stride_m),
-          XQ_stride_g(XQ_stride_g),
-          XQ_stride_h(XQ_stride_h),
-          K_stride_b(K_stride_b),
-          K_stride_m(K_stride_m),
-          K_stride_g(K_stride_g),
-          K_stride_h(K_stride_h),
-          O_stride_split(O_stride_split),
-          Q_size_m(Q_size_m),
-          Q_size_g(Q_size_g),
-          Q_size_h(Q_size_h),
-          Q_size_k(Q_size_k),
-          K_size_m(K_size_m),
-          multiquery(multiquery),
-          qk_scale(qk_scale),
-          split_k(split_k),
-          // launch params
-          grid_dim(grid_dim),
-          block_dim(block_dim),
-          lds_bytes(lds_bytes) {}
-  };
-
-  struct Invoker : public BaseInvoker {
-    using Argument = DeviceOp::Argument;
-    float Run(
-        const Argument& arg,
-        const StreamConfig& stream_config = StreamConfig{}) {
-      auto threads_per_wavefront = arg.block_dim.x;
-
-      auto Q_size_k_alignment_necessary = 0;
-
-      for (auto vec_size : {4, 2, 1}) {
-        if (arg.Q_size_k <= vec_size * threads_per_wavefront) {
-          Q_size_k_alignment_necessary = vec_size;
+struct FMHADecoderReduceDeviceOp : public BaseOperator
+{
+    using DeviceOp = FMHADecoderReduceDeviceOp;
+    struct Argument : public BaseArgument
+    {
+        const scalar_t* __restrict__ XQ;
+        const scalar_t* __restrict__ cache_K;
+        const scalar_t* __restrict__ cache_V;
+        scalar_t* __restrict__ O;
+        scalar_t* __restrict__ split_O;
+        compute_t* __restrict__ split_max;
+        compute_t* __restrict__ split_sumexp;
+        const int32_t* __restrict__ seq_kv_lens;
+        const ptrdiff_t XQ_stride_b;
+        const ptrdiff_t XQ_stride_m;
+        const ptrdiff_t XQ_stride_g;
+        const ptrdiff_t XQ_stride_h;
+        const ptrdiff_t K_stride_b;
+        const ptrdiff_t K_stride_m;
+        const ptrdiff_t K_stride_g;
+        const ptrdiff_t K_stride_h;
+        const ptrdiff_t O_stride_split;
+        const int32_t Q_size_m;
+        const int32_t Q_size_g;
+        const int32_t Q_size_h;
+        const int32_t Q_size_k;
+        const int32_t K_size_m;
+        const bool multiquery;
+        const float qk_scale;
+        const int32_t split_k;
+
+        const dim3 grid_dim;
+        const dim3 block_dim;
+        const size_t lds_bytes;
+
+        Argument(const scalar_t* __restrict__ XQ,
+                 const scalar_t* __restrict__ cache_K,
+                 const scalar_t* __restrict__ cache_V,
+                 scalar_t* __restrict__ O,
+                 scalar_t* __restrict__ split_O,
+                 compute_t* __restrict__ split_max,
+                 compute_t* __restrict__ split_sumexp,
+                 const int32_t* __restrict__ seq_kv_lens,
+                 const ptrdiff_t XQ_stride_b,
+                 const ptrdiff_t XQ_stride_m,
+                 const ptrdiff_t XQ_stride_g,
+                 const ptrdiff_t XQ_stride_h,
+                 const ptrdiff_t K_stride_b,
+                 const ptrdiff_t K_stride_m,
+                 const ptrdiff_t K_stride_g,
+                 const ptrdiff_t K_stride_h,
+                 const ptrdiff_t O_stride_split,
+                 const int32_t Q_size_m,
+                 const int32_t Q_size_g,
+                 const int32_t Q_size_h,
+                 const int32_t Q_size_k,
+                 const int32_t K_size_m,
+                 const bool multiquery,
+                 const float qk_scale,
+                 const int32_t split_k,
+                 // launch params
+                 const dim3 grid_dim,
+                 const dim3 block_dim,
+                 const size_t lds_bytes)
+            : XQ(XQ),
+              cache_K(cache_K),
+              cache_V(cache_V),
+              O(O),
+              split_O(split_O),
+              split_max(split_max),
+              split_sumexp(split_sumexp),
+              seq_kv_lens(seq_kv_lens),
+              XQ_stride_b(XQ_stride_b),
+              XQ_stride_m(XQ_stride_m),
+              XQ_stride_g(XQ_stride_g),
+              XQ_stride_h(XQ_stride_h),
+              K_stride_b(K_stride_b),
+              K_stride_m(K_stride_m),
+              K_stride_g(K_stride_g),
+              K_stride_h(K_stride_h),
+              O_stride_split(O_stride_split),
+              Q_size_m(Q_size_m),
+              Q_size_g(Q_size_g),
+              Q_size_h(Q_size_h),
+              Q_size_k(Q_size_k),
+              K_size_m(K_size_m),
+              multiquery(multiquery),
+              qk_scale(qk_scale),
+              split_k(split_k),
+              // launch params
+              grid_dim(grid_dim),
+              block_dim(block_dim),
+              lds_bytes(lds_bytes)
+        {
         }
-      }
-
-      if (!Q_size_k_alignment_necessary) {
-        throw std::runtime_error("Unsupported Q_size_k");
-      }
-
-      if (arg.Q_size_k % Q_size_k_alignment_necessary) {
-        throw std::runtime_error("Unsupported alignment for Q_size_k");
-      }
-
-      const dim3 reduce_gridsize = {arg.grid_dim.x};
-      const dim3 reduce_blocksize = {arg.block_dim.x};
-      constexpr int32_t reduce_lds_bytes = 0;
-      float reduce_result = launch_and_time_kernel(
-        stream_config,
-        Q_size_k_alignment_necessary == 4
-            ? efficient_attention_forward_decoder_splitk_reduce_ck_kernel<scalar_t, 4>
-            : Q_size_k_alignment_necessary == 2
-            ? efficient_attention_forward_decoder_splitk_reduce_ck_kernel<scalar_t, 2>
-            : Q_size_k_alignment_necessary == 1
-            ? efficient_attention_forward_decoder_splitk_reduce_ck_kernel<scalar_t, 1>
-            : nullptr,
-        reduce_gridsize,
-        reduce_blocksize,
-        reduce_lds_bytes, 
-        arg.split_O,
-        arg.split_max,
-        arg.split_sumexp,
-        arg.O,
-        arg.Q_size_m,
-        arg.Q_size_g,
-        arg.Q_size_h,
-        arg.Q_size_k,
-        arg.O_stride_split,
-        arg.XQ_stride_b,
-        arg.XQ_stride_m,
-        arg.XQ_stride_g,
-        arg.XQ_stride_h,
-        arg.split_k
-      );
-      return reduce_result;
-    }
-  };
+    };
+
+    struct Invoker : public BaseInvoker
+    {
+        using Argument = DeviceOp::Argument;
+        float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{})
+        {
+            auto threads_per_wavefront = arg.block_dim.x;
+
+            auto Q_size_k_alignment_necessary = 0;
+
+            for(auto vec_size : {4, 2, 1})
+            {
+                if(arg.Q_size_k <= vec_size * threads_per_wavefront)
+                {
+                    Q_size_k_alignment_necessary = vec_size;
+                }
+            }
+
+            if(!Q_size_k_alignment_necessary)
+            {
+                throw std::runtime_error("Unsupported Q_size_k");
+            }
+
+            if(arg.Q_size_k % Q_size_k_alignment_necessary)
+            {
+                throw std::runtime_error("Unsupported alignment for Q_size_k");
+            }
+
+            const dim3 reduce_gridsize         = {arg.grid_dim.x};
+            const dim3 reduce_blocksize        = {arg.block_dim.x};
+            constexpr int32_t reduce_lds_bytes = 0;
+            float reduce_result                = launch_and_time_kernel(
+                stream_config,
+                Q_size_k_alignment_necessary == 4
+                    ? efficient_attention_forward_decoder_splitk_reduce_ck_kernel<scalar_t, 4>
+                    : Q_size_k_alignment_necessary == 2
+                          ? efficient_attention_forward_decoder_splitk_reduce_ck_kernel<scalar_t, 2>
+                          : Q_size_k_alignment_necessary == 1
+                                ? efficient_attention_forward_decoder_splitk_reduce_ck_kernel<
+                                      scalar_t,
+                                      1>
+                                : nullptr,
+                reduce_gridsize,
+                reduce_blocksize,
+                reduce_lds_bytes,
+                arg.split_O,
+                arg.split_max,
+                arg.split_sumexp,
+                arg.O,
+                arg.Q_size_m,
+                arg.Q_size_g,
+                arg.Q_size_h,
+                arg.Q_size_k,
+                arg.O_stride_split,
+                arg.XQ_stride_b,
+                arg.XQ_stride_m,
+                arg.XQ_stride_g,
+                arg.XQ_stride_h,
+                arg.split_k);
+            return reduce_result;
+        }
+    };
 };
 } // namespace device
 } // namespace tensor_operation
 } // namespace ck
 
-static std::tuple<at::Tensor, at::Tensor, at::Tensor> split1_attention_hip(
-  const at::Tensor& XQ, 
-  const at::Tensor& K, 
-  const at::Tensor& V, 
-  const at::Tensor& seqlen) {
-
-  at::OptionalDeviceGuard guard(XQ.device());
-
-  auto B = XQ.size(0);
-  auto M = XQ.size(1);
-  auto G = XQ.size(2);
-  auto H = XQ.size(3);
-  auto D = XQ.size(4);
-
-  double qk_scale = 1. / sqrt(D);
-  constexpr auto split_k = 1;
-
-  auto O = at::empty_like(XQ);
-  constexpr auto splitk_dim = 0;
-  constexpr auto rank = 5;
-  auto split_O = at::stack(O, splitk_dim);
-  auto split_max = at::empty({B, M, G, H, split_k}, XQ.options().dtype(at::kFloat));
-  auto split_sumexp = at::empty_like(split_max);
-
-  dim3 blocks(B * H * M * G, split_k);
-  dim3 threads(kThreadsPerWavefront, kWavefrontsPerBlock);
-
-  constexpr int32_t KV_M_MAX = 8192;
-  constexpr int32_t K_MAX = 4 * kThreadsPerWavefront;
-
-  int32_t smem_softmax = KV_M_MAX * sizeof(float) + threads.y * sizeof(float);
-  int32_t smem_output = K_MAX * sizeof(float) *
-      threads.y; // 4 * threadsPerBlock * sizeof(float) == sizeof(O[b][0][h][:])
-  const size_t lds_bytes = max(smem_softmax, smem_output);
-  auto stream = at::cuda::getCurrentHIPStream().stream();
-
-  AT_DISPATCH_SWITCH_3(
-      at::ScalarType::Half,
-      at::ScalarType::BFloat16,
-      at::ScalarType::Float,
-      XQ.scalar_type(),
-      "efficient_attention_forward_decoder_split1_ck_test",
-      [&] {
-        using ck_data_t = c10_to_data_t<scalar_t>::type;
-        using device_op_t =
-            ck::tensor_operation::device::FMHADecoderSplit1DeviceOp<ck_data_t>;
-        auto op = device_op_t{};
-
-        auto XQ_acc =
-            XQ.packed_accessor32<scalar_t, rank, at::RestrictPtrTraits>();
-        auto K_acc =
-            K.packed_accessor64<scalar_t, rank, at::RestrictPtrTraits>();
-        auto V_acc =
-            V.packed_accessor64<scalar_t, rank, at::RestrictPtrTraits>();
-        auto split_O_acc = split_O.packed_accessor32<scalar_t, 1 + rank, at::RestrictPtrTraits>();
-        auto O_acc = O.packed_accessor32<scalar_t, rank, at::RestrictPtrTraits>();
-        auto seq_acc = seqlen.packed_accessor32<int32_t, 1, at::RestrictPtrTraits>().data();
-        auto split_max_acc = split_max.packed_accessor32<float, rank, at::RestrictPtrTraits>();
-        auto split_sumexp_acc = split_sumexp.packed_accessor32<float, rank, at::RestrictPtrTraits>();
-        auto arg = device_op_t::Argument(
-            reinterpret_cast<const ck_data_t* __restrict__>(XQ_acc.data()),
-            reinterpret_cast<const ck_data_t* __restrict__>(K_acc.data()),
-            reinterpret_cast<const ck_data_t* __restrict__>(V_acc.data()),
-            reinterpret_cast<ck_data_t* __restrict__>(O_acc.data()),
-            reinterpret_cast<ck_data_t* __restrict__>(split_O_acc.data()),
-            split_max_acc.data(),
-            split_sumexp_acc.data(),
-            seq_acc,
-            XQ_acc.stride(0),
-            XQ_acc.stride(1),
-            XQ_acc.stride(2),
-            XQ_acc.stride(3),
-            K_acc.stride(0),
-            K_acc.stride(1),
-            K_acc.stride(2),
-            K_acc.stride(3),
-            split_O_acc.stride(0),
-            XQ_acc.size(1),
-            XQ_acc.size(2),
-            XQ_acc.size(3),
-            XQ_acc.size(4),
-            K_acc.size(1),
-            K_acc.size(3) == 1,
-            qk_scale,
-            split_k,
-            blocks,
-            threads,
-            lds_bytes);
-
-        auto invoker = device_op_t::Invoker{};
-        (void)invoker.Run(arg, {stream});
-      });
-  return std::make_tuple(split_O[splitk_dim], split_max, split_sumexp);
+static std::tuple<at::Tensor, at::Tensor, at::Tensor> split1_attention_hip(const at::Tensor& XQ,
+                                                                           const at::Tensor& K,
+                                                                           const at::Tensor& V,
+                                                                           const at::Tensor& seqlen)
+{
+
+    at::OptionalDeviceGuard guard(XQ.device());
+
+    auto B = XQ.size(0);
+    auto M = XQ.size(1);
+    auto G = XQ.size(2);
+    auto H = XQ.size(3);
+    auto D = XQ.size(4);
+
+    double qk_scale        = 1. / sqrt(D);
+    constexpr auto split_k = 1;
+
+    auto O                    = at::empty_like(XQ);
+    constexpr auto splitk_dim = 0;
+    constexpr auto rank       = 5;
+    auto split_O              = at::stack(O, splitk_dim);
+    auto split_max            = at::empty({B, M, G, H, split_k}, XQ.options().dtype(at::kFloat));
+    auto split_sumexp         = at::empty_like(split_max);
+
+    dim3 blocks(B * H * M * G, split_k);
+    dim3 threads(kThreadsPerWavefront, kWavefrontsPerBlock);
+
+    constexpr int32_t KV_M_MAX = 8192;
+    constexpr int32_t K_MAX    = 4 * kThreadsPerWavefront;
+
+    int32_t smem_softmax = KV_M_MAX * sizeof(float) + threads.y * sizeof(float);
+    int32_t smem_output  = K_MAX * sizeof(float) *
+                          threads.y; // 4 * threadsPerBlock * sizeof(float) == sizeof(O[b][0][h][:])
+    const size_t lds_bytes = max(smem_softmax, smem_output);
+    auto stream            = at::cuda::getCurrentHIPStream().stream();
+
+    AT_DISPATCH_SWITCH_3(
+        at::ScalarType::Half,
+        at::ScalarType::BFloat16,
+        at::ScalarType::Float,
+        XQ.scalar_type(),
+        "efficient_attention_forward_decoder_split1_ck_test",
+        [&] {
+            using ck_data_t   = c10_to_data_t<scalar_t>::type;
+            using device_op_t = ck::tensor_operation::device::FMHADecoderSplit1DeviceOp<ck_data_t>;
+            auto op           = device_op_t{};
+
+            auto XQ_acc = XQ.packed_accessor32<scalar_t, rank, at::RestrictPtrTraits>();
+            auto K_acc  = K.packed_accessor64<scalar_t, rank, at::RestrictPtrTraits>();
+            auto V_acc  = V.packed_accessor64<scalar_t, rank, at::RestrictPtrTraits>();
+            auto split_O_acc =
+                split_O.packed_accessor32<scalar_t, 1 + rank, at::RestrictPtrTraits>();
+            auto O_acc   = O.packed_accessor32<scalar_t, rank, at::RestrictPtrTraits>();
+            auto seq_acc = seqlen.packed_accessor32<int32_t, 1, at::RestrictPtrTraits>().data();
+            auto split_max_acc = split_max.packed_accessor32<float, rank, at::RestrictPtrTraits>();
+            auto split_sumexp_acc =
+                split_sumexp.packed_accessor32<float, rank, at::RestrictPtrTraits>();
+            auto arg = device_op_t::Argument(
+                reinterpret_cast<const ck_data_t* __restrict__>(XQ_acc.data()),
+                reinterpret_cast<const ck_data_t* __restrict__>(K_acc.data()),
+                reinterpret_cast<const ck_data_t* __restrict__>(V_acc.data()),
+                reinterpret_cast<ck_data_t* __restrict__>(O_acc.data()),
+                reinterpret_cast<ck_data_t* __restrict__>(split_O_acc.data()),
+                split_max_acc.data(),
+                split_sumexp_acc.data(),
+                seq_acc,
+                XQ_acc.stride(0),
+                XQ_acc.stride(1),
+                XQ_acc.stride(2),
+                XQ_acc.stride(3),
+                K_acc.stride(0),
+                K_acc.stride(1),
+                K_acc.stride(2),
+                K_acc.stride(3),
+                split_O_acc.stride(0),
+                XQ_acc.size(1),
+                XQ_acc.size(2),
+                XQ_acc.size(3),
+                XQ_acc.size(4),
+                K_acc.size(1),
+                K_acc.size(3) == 1,
+                qk_scale,
+                split_k,
+                blocks,
+                threads,
+                lds_bytes);
+
+            auto invoker = device_op_t::Invoker{};
+            (void)invoker.Run(arg, {stream});
+        });
+    return std::make_tuple(split_O[splitk_dim], split_max, split_sumexp);
 }
 
-std::tuple<at::Tensor, at::Tensor, at::Tensor, at::Tensor> generate_inputs(const int32_t padding, const int32_t B, const int32_t Hq, const int32_t Hkv, const decltype(torch::kFloat32) dtype = torch::kFloat32) {
-  const int32_t D = 4 * kThreadsPerWavefront;
-  const int32_t G = Hq / Hkv;
-  const int32_t num_queries = 1;
-
-  auto options = torch::TensorOptions()
-                     .dtype(dtype)
-                     .layout(torch::kStrided)
-                     .device(torch::kCUDA, 1)
-                     .requires_grad(false);
-  auto int_options = options.dtype(torch::kInt);
-  auto XQ = at::randn({B, num_queries, G, Hq, D}, options);
-  auto K = (G == 1) 
-    ? at::randn({B, padding, G, Hkv, D}, options) 
-    : at::randn({B, padding, G, 1, D}, options).expand({B, padding, G, Hq, D});
-  auto V = at::randn_like(K);
-  // auto seqlen = at::randint(1, padding + 1, {B}, int_options);
-  // auto seqlen = at::tensor({1062}, int_options);
-  auto seqlen = at::tensor({6, 12, 13, 9, 32, 10, 12, 6}, int_options);
-
-  return std::make_tuple(XQ, K, V, seqlen);
+std::tuple<at::Tensor, at::Tensor, at::Tensor, at::Tensor>
+generate_inputs(const int32_t padding,
+                const int32_t B,
+                const int32_t Hq,
+                const int32_t Hkv,
+                const decltype(torch::kFloat32) dtype = torch::kFloat32)
+{
+    const int32_t D           = 4 * kThreadsPerWavefront;
+    const int32_t G           = Hq / Hkv;
+    const int32_t num_queries = 1;
+
+    auto options = torch::TensorOptions()
+                       .dtype(dtype)
+                       .layout(torch::kStrided)
+                       .device(torch::kCUDA, 1)
+                       .requires_grad(false);
+    auto int_options = options.dtype(torch::kInt);
+    auto XQ          = at::randn({B, num_queries, G, Hq, D}, options);
+    auto K           = (G == 1) ? at::randn({B, padding, G, Hkv, D}, options)
+                      : at::randn({B, padding, G, 1, D}, options).expand({B, padding, G, Hq, D});
+    auto V = at::randn_like(K);
+    // auto seqlen = at::randint(1, padding + 1, {B}, int_options);
+    // auto seqlen = at::tensor({1062}, int_options);
+    auto seqlen = at::tensor({6, 12, 13, 9, 32, 10, 12, 6}, int_options);
+
+    return std::make_tuple(XQ, K, V, seqlen);
 }
 
-static void test_split1_attention() {
-  auto [XQ, K, V, seqlen] = generate_inputs(4096, 1, 16, 16);
-  
-  auto reference_result = split1_attention_torch(XQ, K, V, seqlen);
+static void test_split1_attention()
+{
+    auto [XQ, K, V, seqlen] = generate_inputs(4096, 1, 16, 16);
 
-  auto hip_result = split1_attention_hip(XQ, K, V, seqlen);
+    auto reference_result = split1_attention_torch(XQ, K, V, seqlen);
 
-  auto O_match_mask = at::isclose(std::get<0>(reference_result), std::get<0>(hip_result), /*atol*/ 1e-3, /*rtol*/ 1e-5, /*equal_nan*/ false);
-  auto m_match_mask = at::isclose(std::get<1>(reference_result), std::get<1>(hip_result), /*atol*/ 1e-3, /*rtol*/ 1e-5, /*equal_nan*/ false);
-  auto l_match_mask = at::isclose(std::get<2>(reference_result), std::get<2>(hip_result), /*atol*/ 1e-3, /*rtol*/ 1e-5, /*equal_nan*/ false);
+    auto hip_result = split1_attention_hip(XQ, K, V, seqlen);
 
-  auto O_percent_match = at::sum(O_match_mask.to(torch::kFloat32)) / O_match_mask.numel();
-  auto m_percent_match = at::sum(m_match_mask.to(torch::kFloat32)) / m_match_mask.numel();
-  auto l_percent_match = at::sum(l_match_mask.to(torch::kFloat32)) / l_match_mask.numel();
+    auto O_match_mask = at::isclose(std::get<0>(reference_result),
+                                    std::get<0>(hip_result),
+                                    /*atol*/ 1e-3,
+                                    /*rtol*/ 1e-5,
+                                    /*equal_nan*/ false);
+    auto m_match_mask = at::isclose(std::get<1>(reference_result),
+                                    std::get<1>(hip_result),
+                                    /*atol*/ 1e-3,
+                                    /*rtol*/ 1e-5,
+                                    /*equal_nan*/ false);
+    auto l_match_mask = at::isclose(std::get<2>(reference_result),
+                                    std::get<2>(hip_result),
+                                    /*atol*/ 1e-3,
+                                    /*rtol*/ 1e-5,
+                                    /*equal_nan*/ false);
 
-  printf(
-      "Mismatched split_O elements percentage: %.2f\n",
-      1. - O_percent_match.item<float>());
+    auto O_percent_match = at::sum(O_match_mask.to(torch::kFloat32)) / O_match_mask.numel();
+    auto m_percent_match = at::sum(m_match_mask.to(torch::kFloat32)) / m_match_mask.numel();
+    auto l_percent_match = at::sum(l_match_mask.to(torch::kFloat32)) / l_match_mask.numel();
 
-  printf(
-      "Mismatched split_max elements percentage: %.2f\n",
-      1. - m_percent_match.item<float>());
+    printf("Mismatched split_O elements percentage: %.2f\n", 1. - O_percent_match.item<float>());
 
-  printf(
-      "Mismatched split_sumexp elements percentage: %.2f\n",
-      1. - m_percent_match.item<float>());
+    printf("Mismatched split_max elements percentage: %.2f\n", 1. - m_percent_match.item<float>());
+
+    printf("Mismatched split_sumexp elements percentage: %.2f\n",
+           1. - m_percent_match.item<float>());
 }
 
-static void do_correctness_check() {
-  auto [XQ, K, V, seqlen] = generate_inputs(32, 8, 16, 16);
-
-  double qk_scale = 1. / sqrt(XQ.size(-1));
-  constexpr auto split_k = 2;
-  
-  auto result = efficient_attention_forward_decoder_splitk_ck_impl<64, 1>(
-      XQ, K, V, seqlen, qk_scale, split_k);
-  auto gold_result = efficient_attention_forward_decoder_split1_torch(
-      XQ, K, V, seqlen, qk_scale);
-  auto mask = at::isclose(
-      result, gold_result, /*atol*/ 1e-3, /*rtol*/ 1e-5, /*equal_nan*/ false);
-  auto percent_match = at::sum(mask.to(torch::kFloat32)) / mask.numel();
-  auto nan_count = at::sum(at::isnan(result));
-  auto numel = result.numel();
-  auto inf_count = at::sum(at::isinf(result));
-  printf(
-      "Mismatched elements percentage: %.2f\n",
-      1. - percent_match.item<float>());
-  // printf("k_seqlen: %d\n", seqlen.item<int32_t>());
-  std::cout << "numel: " << numel << " nan count: " << nan_count << " inf count: " << inf_count << std::endl;
-  std::cout << "k_seqlen: " << seqlen << std::endl;
+static void do_correctness_check()
+{
+    auto [XQ, K, V, seqlen] = generate_inputs(32, 8, 16, 16);
+
+    double qk_scale        = 1. / sqrt(XQ.size(-1));
+    constexpr auto split_k = 2;
+
+    auto result = efficient_attention_forward_decoder_splitk_ck_impl<64, 1>(
+        XQ, K, V, seqlen, qk_scale, split_k);
+    auto gold_result = efficient_attention_forward_decoder_split1_torch(XQ, K, V, seqlen, qk_scale);
+    auto mask = at::isclose(result, gold_result, /*atol*/ 1e-3, /*rtol*/ 1e-5, /*equal_nan*/ false);
+    auto percent_match = at::sum(mask.to(torch::kFloat32)) / mask.numel();
+    auto nan_count     = at::sum(at::isnan(result));
+    auto numel         = result.numel();
+    auto inf_count     = at::sum(at::isinf(result));
+    printf("Mismatched elements percentage: %.2f\n", 1. - percent_match.item<float>());
+    // printf("k_seqlen: %d\n", seqlen.item<int32_t>());
+    std::cout << "numel: " << numel << " nan count: " << nan_count << " inf count: " << inf_count
+              << std::endl;
+    std::cout << "k_seqlen: " << seqlen << std::endl;
 }
 
-int main(int argc, char** argv) {
-  if (argc == 1) {
-    do_correctness_check();
-
-    // test_split1_attention();
-  } else {
-    const auto args = std::vector<std::string>(argv + 1, argv + argc);
-    if (args.size() != 6) {
-      std::cout
-          << "Usage: ./a.out padding batch_size nq_heads nkv_heads dtype n_wavefronts_per_block"
-          << std::endl;
-      return 0;
-    }
-    const int32_t padding = std::stoi(args[0]);
-    const int32_t batch_size = std::stoi(args[1]);
-    const int32_t nq_heads = std::stoi(args[2]);
-    const int32_t nkv_heads = std::stoi(args[3]);
-    const auto dtype = (args[4] == "f32") ? torch::kFloat32
-        : (args[4] == "f16")              ? torch::kFloat16
-                                          : torch::kBFloat16;
-    const int32_t n_wavefronts_per_block = std::stoi(args[5]);
-
-    auto [Q, K, V, seq] = generate_inputs(padding, batch_size, nq_heads, nkv_heads, dtype);
-    auto O = at::empty_like(Q);
+int main(int argc, char** argv)
+{
+    if(argc == 1)
+    {
+        do_correctness_check();
 
-    constexpr auto splitk_dim = 0;
-    constexpr auto split_k = 1;
-    auto O_splits = at::stack(O, splitk_dim);
-
-    auto split_max = at::empty({batch_size, padding, Q.size(2), Q.size(3), split_k}, Q.options().dtype(at::kFloat));
-    auto split_sumexp = at::empty_like(split_max);
-
-    const double qk_scale = 1. / sqrt(Q.size(-1));
-    auto call_ptr = decltype(&efficient_attention_forward_decoder_splitk_ck_out_impl<
-                             kThreadsPerWavefront,
-                             kWavefrontsPerBlock>){};
-
-#define SWITCH_CASE_SET_CALLPTR(n)                               \
-  case (n):                                                      \
-    call_ptr = &efficient_attention_forward_decoder_splitk_ck_out_impl< \
-        kThreadsPerWavefront,                                    \
-        (n)>;                                                    \
-    break;
-
-    switch (n_wavefronts_per_block) {
-      SWITCH_CASE_SET_CALLPTR(1);
-      SWITCH_CASE_SET_CALLPTR(2);
-      SWITCH_CASE_SET_CALLPTR(4);
-      SWITCH_CASE_SET_CALLPTR(8);
-      SWITCH_CASE_SET_CALLPTR(16);
-
-      default:
-        call_ptr = nullptr;
-        break;
+        // test_split1_attention();
     }
+    else
+    {
+        const auto args = std::vector<std::string>(argv + 1, argv + argc);
+        if(args.size() != 6)
+        {
+            std::cout << "Usage: ./a.out padding batch_size nq_heads nkv_heads dtype "
+                         "n_wavefronts_per_block"
+                      << std::endl;
+            return 0;
+        }
+        const int32_t padding    = std::stoi(args[0]);
+        const int32_t batch_size = std::stoi(args[1]);
+        const int32_t nq_heads   = std::stoi(args[2]);
+        const int32_t nkv_heads  = std::stoi(args[3]);
+        const auto dtype         = (args[4] == "f32")
+                               ? torch::kFloat32
+                               : (args[4] == "f16") ? torch::kFloat16 : torch::kBFloat16;
+        const int32_t n_wavefronts_per_block = std::stoi(args[5]);
+
+        auto [Q, K, V, seq] = generate_inputs(padding, batch_size, nq_heads, nkv_heads, dtype);
+        auto O              = at::empty_like(Q);
+
+        constexpr auto splitk_dim = 0;
+        constexpr auto split_k    = 1;
+        auto O_splits             = at::stack(O, splitk_dim);
+
+        auto split_max    = at::empty({batch_size, padding, Q.size(2), Q.size(3), split_k},
+                                   Q.options().dtype(at::kFloat));
+        auto split_sumexp = at::empty_like(split_max);
+
+        const double qk_scale = 1. / sqrt(Q.size(-1));
+        auto call_ptr         = decltype(
+            &efficient_attention_forward_decoder_splitk_ck_out_impl<kThreadsPerWavefront,
+                                                                    kWavefrontsPerBlock>){};
+
+#define SWITCH_CASE_SET_CALLPTR(n)                                                              \
+    case(n):                                                                                    \
+        call_ptr =                                                                              \
+            &efficient_attention_forward_decoder_splitk_ck_out_impl<kThreadsPerWavefront, (n)>; \
+        break;
+
+        switch(n_wavefronts_per_block)
+        {
+            SWITCH_CASE_SET_CALLPTR(1);
+            SWITCH_CASE_SET_CALLPTR(2);
+            SWITCH_CASE_SET_CALLPTR(4);
+            SWITCH_CASE_SET_CALLPTR(8);
+            SWITCH_CASE_SET_CALLPTR(16);
+
+        default: call_ptr = nullptr; break;
+        }
 #undef SWITCH_CASE_SET_CALLPTR
 
-    if (call_ptr) {
-      call_ptr(Q, K, V, seq, qk_scale, split_k, split_max, split_sumexp, O_splits, O);
-    } else {
-      std::cout << "Warning: no kernel was found for wavefronts_per_block="
-                << n_wavefronts_per_block << std::endl;
+        if(call_ptr)
+        {
+            call_ptr(Q, K, V, seq, qk_scale, split_k, split_max, split_sumexp, O_splits, O);
+        }
+        else
+        {
+            std::cout << "Warning: no kernel was found for wavefronts_per_block="
+                      << n_wavefronts_per_block << std::endl;
+        }
     }
-  }
-  return 0;
+    return 0;
 }
 
 #endif // MAIN
diff --git a/xformers/csrc/attention/hip_fmha/ck_attention_forward_decoder_splitk.h b/xformers/csrc/attention/hip_fmha/ck_attention_forward_decoder_splitk.h
index 24d57c8b4..d2086405b 100644
--- a/xformers/csrc/attention/hip_fmha/ck_attention_forward_decoder_splitk.h
+++ b/xformers/csrc/attention/hip_fmha/ck_attention_forward_decoder_splitk.h
@@ -7,467 +7,508 @@
 #include <ck/utility/inner_product.hpp>
 #include <ck/utility/math.hpp>
 
-
 namespace {
 
 template <typename data_t, int32_t vec_size>
-__device__ typename ck::vector_type<float, vec_size>::type scalar_scale_acc(
-    typename ck::vector_type<float, vec_size>::type acc,
-    typename ck::vector_type<data_t, vec_size>::type a,
-    float b) {
-  union {
-    decltype(acc) vec;
-    float arr[vec_size];
-  } acc_u{acc};
-  union {
-    decltype(a) vec;
-    data_t arr[vec_size];
-  } a_u{a};
+__device__ typename ck::vector_type<float, vec_size>::type
+scalar_scale_acc(typename ck::vector_type<float, vec_size>::type acc,
+                 typename ck::vector_type<data_t, vec_size>::type a,
+                 float b)
+{
+    union
+    {
+        decltype(acc) vec;
+        float arr[vec_size];
+    } acc_u{acc};
+    union
+    {
+        decltype(a) vec;
+        data_t arr[vec_size];
+    } a_u{a};
 
 #pragma unroll
-  for (int32_t i = 0; i < vec_size; ++i) {
-    acc_u.arr[i] += ck::type_convert<float>(a_u.arr[i]) * b;
-  }
+    for(int32_t i = 0; i < vec_size; ++i)
+    {
+        acc_u.arr[i] += ck::type_convert<float>(a_u.arr[i]) * b;
+    }
 
-  return acc_u.vec;
+    return acc_u.vec;
 }
 
 template <typename F, int32_t n_threads_per_wavefront = 64>
-float __device__ __forceinline__ wavefrontReduce(float val, F f) {
+float __device__ __forceinline__ wavefrontReduce(float val, F f)
+{
 #pragma unroll
-  for (int32_t mask = n_threads_per_wavefront >> 1; mask > 0; mask >>= 1) {
-    val = f(__shfl_xor(val, mask, n_threads_per_wavefront), val);
-  }
-  return val;
+    for(int32_t mask = n_threads_per_wavefront >> 1; mask > 0; mask >>= 1)
+    {
+        val = f(__shfl_xor(val, mask, n_threads_per_wavefront), val);
+    }
+    return val;
 }
 
 template <typename TData, typename TDataVec>
-__forceinline__ __device__ void load_v(
-    const TData* __restrict__ data_ptr,
-    int32_t vector_offset,
-    TDataVec* __restrict__ load_to) {
-  *load_to = *(reinterpret_cast<const TDataVec*>(data_ptr) + vector_offset);
+__forceinline__ __device__ void
+load_v(const TData* __restrict__ data_ptr, int32_t vector_offset, TDataVec* __restrict__ load_to)
+{
+    *load_to = *(reinterpret_cast<const TDataVec*>(data_ptr) + vector_offset);
 }
 
 template <typename TData, typename TDataVec>
-__forceinline__ __device__ void store_v(
-    TData* __restrict__ data_ptr,
-    int32_t vector_offset,
-    TDataVec value) {
-  *(reinterpret_cast<TDataVec*>(data_ptr) + vector_offset) = value;
+__forceinline__ __device__ void
+store_v(TData* __restrict__ data_ptr, int32_t vector_offset, TDataVec value)
+{
+    *(reinterpret_cast<TDataVec*>(data_ptr) + vector_offset) = value;
 }
 
-template<
-typename scalar_t,
-int32_t vec_size = 4,
-typename compute_t = float
-> 
+template <typename scalar_t, int32_t vec_size = 4, typename compute_t = float>
 __global__ void efficient_attention_forward_decoder_splitk_reduce_ck_kernel(
-  const scalar_t* __restrict__ O_splits,
-  const compute_t* __restrict__ split_max,
-  const compute_t* __restrict__ split_sumexp,
-  scalar_t* __restrict__ O,
-  const int32_t Q_size_m,
-  const int32_t Q_size_g,
-  const int32_t Q_size_h,
-  const int32_t Q_size_k,
-  const ptrdiff_t O_stride_split,
-  const ptrdiff_t O_stride_b,
-  const ptrdiff_t O_stride_m,
-  const ptrdiff_t O_stride_g,
-  const ptrdiff_t O_stride_h,
-  const int32_t split_k
-) {
-  
-  // Each block handles a single batch and head and query and group
-  const int32_t b = blockIdx.x / (Q_size_m * Q_size_g * Q_size_h);
-  const int32_t m = (blockIdx.x / (Q_size_g * Q_size_h)) % Q_size_m;
-  const int32_t g = (blockIdx.x / Q_size_h) % Q_size_g;
-  const int32_t h = blockIdx.x % Q_size_h;
-
-  using data_t = scalar_t;
-  using data_vec_t = typename ck::vector_type<data_t, vec_size>::type;
-  using compute_vec_t = typename ck::vector_type<compute_t, vec_size>::type;
-
-  union {
-    data_vec_t vec;
-    data_t arr[vec_size];
-  } O_split_data;
-  union {
-    compute_vec_t vec;
-    compute_t arr[vec_size];
-  } O_split_compute;
-  union {
-    data_vec_t vec;
-    data_t arr[vec_size];
-  } global_O_data;
-  union {
-    compute_vec_t vec;
-    compute_t arr[vec_size];
-  } global_O_compute;
-
-  global_O_compute.vec = 0;
-
-  const int32_t lane_idx = threadIdx.x;
-  const bool lane_active_for_io = lane_idx * vec_size < Q_size_k;
-
-  if (!lane_active_for_io) {
-    return;
-  }
-
-  // for s in slices:
-  //       attn_slice = s["attn_slice"]
-  //       m = s["row_max"]
-  //       l = s["row_lse"]
-  //       m_new = torch.max(m, m_current_max)
-  //       assert not m_new.isnan().any(), "m_new is nan"
-  //       pick_new = m < m_current_max
-  //       pick_our = torch.logical_not(pick_new)
-
-  //       log_alpha = -torch.abs(m - m_current_max)
-  //       log_alpha[log_alpha.isnan()] = 0
-  //       alpha = torch.exp(log_alpha)
-  //       assert not alpha.isnan().any(), "alpha is nan"
-  //       out = out + attn_slice + (pick_our * out + pick_new * attn_slice) * (torch.sub(alpha, 1))
-  //       assert not out.isnan().any(), "out acc is nan"
-  //       l_current_sum = l_current_sum + l + (pick_our * l_current_sum + pick_new * l) * (torch.sub(alpha, 1))
-  //       assert not l_current_sum.isnan().any(), "l acc is nan"
-  //       m_current_max = m_new
-  //   out /= l_current_sum
-
-  compute_t new_max = 0;
-  compute_t global_sumexp = 0;
-  compute_t global_max = ck::NumericLimits<compute_t>::Lowest();
-
-  for (int32_t split_idx = 0; split_idx < split_k; ++split_idx) {
-    load_v<data_t, data_vec_t>(O_splits 
-                               + b * O_stride_b 
-                               + m * O_stride_m 
-                               + g * O_stride_g 
-                               + h * O_stride_h
-                               + split_idx * O_stride_split, lane_idx, &O_split_data.vec);                       
-    #pragma unroll
-    for (int32_t i = 0; i < vec_size; ++i) {
-      O_split_compute.arr[i] = ck::type_convert<compute_t>(O_split_data.arr[i]);
-    }
-    compute_t local_max = *(split_max + blockIdx.x * split_k + split_idx); 
-    compute_t local_sumexp = *(split_sumexp + blockIdx.x * split_k + split_idx); 
-    new_max = ck::math::max(local_max, global_max);
-    bool pick_new = local_max < global_max;
-    compute_t log_alpha = -std::abs(local_max - global_max);
-    compute_t alpha = isnan(log_alpha) ? compute_t{1} : ck::math::exp(log_alpha);
-    compute_t pick_current_coef = (1 + (1 - pick_new) * (alpha - 1));
-    compute_t pick_new_coef = (1 + pick_new * (alpha - 1));
-    global_sumexp = pick_current_coef * global_sumexp + pick_new_coef * local_sumexp;
-    global_O_compute.vec = pick_current_coef * global_O_compute.vec + pick_new_coef * O_split_compute.vec;
-    global_max = new_max;
-  }
-  global_O_compute.vec /= global_sumexp;
-  #pragma unroll
-  for (int32_t i = 0; i < vec_size; ++i) {
-    global_O_data.arr[i] = ck::type_convert<data_t>(global_O_compute.arr[i]);
-  }
-  store_v<data_t, data_vec_t>(O + b * O_stride_b + m * O_stride_m + g * O_stride_g + h * O_stride_h, lane_idx, global_O_data.vec);
-}
-
-template <
-    typename scalar_t,
-    int32_t vec_size = 4,
-    int32_t n_loop_unroll = 1,
-    int32_t n_loop_unroll_tail = 1,
-    int32_t KV_M_MAX = 8192,
-    typename compute_t = float>
-__global__ void efficient_attention_forward_decoder_splitk_ck_kernel(
-    const scalar_t* __restrict__ XQ,
-    const scalar_t* __restrict__ cache_K,
-    const scalar_t* __restrict__ cache_V,
-    scalar_t* __restrict__ O_splits,
-    compute_t* __restrict__ split_max,
-    compute_t* __restrict__ split_sumexp,
-    const int32_t* __restrict__ seq_kv_lens,
-    const ptrdiff_t XQ_stride_b,
-    const ptrdiff_t XQ_stride_m,
-    const ptrdiff_t XQ_stride_g,
-    const ptrdiff_t XQ_stride_h,
-    const ptrdiff_t K_stride_b,
-    const ptrdiff_t K_stride_m,
-    const ptrdiff_t K_stride_g,
-    const ptrdiff_t K_stride_h,
-    const ptrdiff_t O_stride_split,
+    const scalar_t* __restrict__ O_splits,
+    const compute_t* __restrict__ split_max,
+    const compute_t* __restrict__ split_sumexp,
+    scalar_t* __restrict__ O,
     const int32_t Q_size_m,
     const int32_t Q_size_g,
     const int32_t Q_size_h,
     const int32_t Q_size_k,
-    const int32_t K_size_m,
-    const bool multiquery,
-    const float qk_scale,
-    const int32_t split_k) {
-  static_assert(n_loop_unroll_tail < n_loop_unroll || n_loop_unroll_tail == 1, 
-     "tail unroll must be smaller than main loop untoll; pragma unroll 0 is illegal (and tail is no-op)");
-
-  // Each block handles a single batch and head and query and group
-  const int32_t b = blockIdx.x / (Q_size_m * Q_size_g * Q_size_h);
-  const int32_t m = (blockIdx.x / (Q_size_g * Q_size_h)) % Q_size_m;
-  const int32_t g = (blockIdx.x / Q_size_h) % Q_size_g;
-  const int32_t h = blockIdx.x % Q_size_h;
-  const int32_t split_idx = blockIdx.y;
-
-  // Note: this is decoding case where we attend to current and all previous
-  // tokens.
-  const int32_t t_max = seq_kv_lens ? seq_kv_lens[b] : K_size_m;
-
-  const int32_t lane_idx = threadIdx.x;
-  const int32_t wavefront_idx = threadIdx.y;
-  const int32_t threads_per_wavefront = blockDim.x;
-  const int32_t wavefronts_per_block = blockDim.y;
-  const int32_t threads_per_block =
-      threads_per_wavefront * wavefronts_per_block;
-  const int32_t thread_linear_idx =
-      lane_idx + wavefront_idx * threads_per_wavefront;
-  // const auto* q_ = &(XQ_acc[b][m][g][h][0]);
-  const auto XQO_base_offset =
-      b * XQ_stride_b + m * XQ_stride_m + g * XQ_stride_g + h * XQ_stride_h;
-  const auto* __restrict__ q_ = XQ + XQO_base_offset;
-
-  const auto cache_KV_base_offset =
-      b * K_stride_b + 0 * K_stride_m + g * K_stride_g + (multiquery ? 0 : h * K_stride_h);
-  const auto* __restrict__ cache_K_base = cache_K + cache_KV_base_offset;
-  const auto* __restrict__ cache_V_base = cache_V + cache_KV_base_offset;
-
-  using data_t = scalar_t;
-  using data_vec_t = typename ck::vector_type<data_t, vec_size>::type;
-  using compute_vec_t = typename ck::vector_type<compute_t, vec_size>::type;
-
-  const bool lane_active_for_io = lane_idx * vec_size < Q_size_k;
-
-  extern __shared__ __align__(16) compute_t smem[];
-
-  data_vec_t q_thread = 0;
-  // Load Q into registers in all wavefronts.
-  // Each thread handles `vec_size` D dimensions
-  if (lane_active_for_io) {
-    load_v<data_t, data_vec_t>(q_, lane_idx, &q_thread);
-  }
-
-  compute_t max_qk_acc = ck::NumericLimits<compute_t>::Lowest();
-
-  // Compute S[0:t_max] = 
-  // ```
-  // for t in range(t_max): 
-  //   S[t] = dot(Q, K[t])
-  // ```
-  // Split the 0:t_max range across wavefronts in a block, 
-  // unroll loads to expose more parallelism.
-  // Reduce the dot product with cross-lane operation;
-  // Q and K[t] are in the registers of threads in a single wavefront.
-
-  data_vec_t k_loads[n_loop_unroll] = {};
-
-  const auto dtt = wavefronts_per_block * n_loop_unroll;
-  const auto n_unrolled_loops = t_max / dtt / split_k; // +1?
-  const int32_t tt_low = wavefront_idx * n_loop_unroll + n_unrolled_loops * dtt * split_idx;
-  const int32_t tt_high = wavefront_idx * n_loop_unroll + n_unrolled_loops * dtt * (split_idx + 1);
-  const int32_t dtt_tail = wavefronts_per_block * n_loop_unroll_tail;
-  const int32_t tt_tail_low = wavefront_idx * n_loop_unroll_tail + n_unrolled_loops * dtt * (split_idx + 1);
-  const int32_t tt_tail_high = (split_idx == split_k - 1) ? t_max : tt_tail_low;
-
-  for (auto tt = tt_low; tt < tt_high; tt += dtt) {
-    if (lane_active_for_io) {
-#pragma unroll n_loop_unroll
-      for (auto ttt = 0; ttt < n_loop_unroll; ++ttt) {
-        const int32_t t = tt + ttt;
-        // load the K[b][t][g][h|0][:] row into registers
-        load_v<data_t, data_vec_t>(
-            cache_K_base + t * K_stride_m, lane_idx, &k_loads[ttt]);
-      }
+    const ptrdiff_t O_stride_split,
+    const ptrdiff_t O_stride_b,
+    const ptrdiff_t O_stride_m,
+    const ptrdiff_t O_stride_g,
+    const ptrdiff_t O_stride_h,
+    const int32_t split_k)
+{
+
+    // Each block handles a single batch and head and query and group
+    const int32_t b = blockIdx.x / (Q_size_m * Q_size_g * Q_size_h);
+    const int32_t m = (blockIdx.x / (Q_size_g * Q_size_h)) % Q_size_m;
+    const int32_t g = (blockIdx.x / Q_size_h) % Q_size_g;
+    const int32_t h = blockIdx.x % Q_size_h;
+
+    using data_t        = scalar_t;
+    using data_vec_t    = typename ck::vector_type<data_t, vec_size>::type;
+    using compute_vec_t = typename ck::vector_type<compute_t, vec_size>::type;
+
+    union
+    {
+        data_vec_t vec;
+        data_t arr[vec_size];
+    } O_split_data;
+    union
+    {
+        compute_vec_t vec;
+        compute_t arr[vec_size];
+    } O_split_compute;
+    union
+    {
+        data_vec_t vec;
+        data_t arr[vec_size];
+    } global_O_data;
+    union
+    {
+        compute_vec_t vec;
+        compute_t arr[vec_size];
+    } global_O_compute;
+
+    global_O_compute.vec = 0;
+
+    const int32_t lane_idx        = threadIdx.x;
+    const bool lane_active_for_io = lane_idx * vec_size < Q_size_k;
+
+    if(!lane_active_for_io)
+    {
+        return;
     }
-    compute_t qk_accs[n_loop_unroll] = {};
-#pragma unroll n_loop_unroll
-    for (auto ttt = 0; ttt < n_loop_unroll; ++ttt) {
-      ck::inner_product<data_vec_t, data_vec_t, compute_t>(
-          q_thread, k_loads[ttt], qk_accs[ttt]);
-      qk_accs[ttt] *= qk_scale;
-
-      qk_accs[ttt] =
-          wavefrontReduce(qk_accs[ttt], [](auto a, auto b) { return a + b; });
-      max_qk_acc = ck::math::max(qk_accs[ttt], max_qk_acc);
+
+    // for s in slices:
+    //       attn_slice = s["attn_slice"]
+    //       m = s["row_max"]
+    //       l = s["row_lse"]
+    //       m_new = torch.max(m, m_current_max)
+    //       assert not m_new.isnan().any(), "m_new is nan"
+    //       pick_new = m < m_current_max
+    //       pick_our = torch.logical_not(pick_new)
+
+    //       log_alpha = -torch.abs(m - m_current_max)
+    //       log_alpha[log_alpha.isnan()] = 0
+    //       alpha = torch.exp(log_alpha)
+    //       assert not alpha.isnan().any(), "alpha is nan"
+    //       out = out + attn_slice + (pick_our * out + pick_new * attn_slice) * (torch.sub(alpha,
+    //       1)) assert not out.isnan().any(), "out acc is nan" l_current_sum = l_current_sum + l +
+    //       (pick_our * l_current_sum + pick_new * l) * (torch.sub(alpha, 1)) assert not
+    //       l_current_sum.isnan().any(), "l acc is nan" m_current_max = m_new
+    //   out /= l_current_sum
+
+    compute_t new_max       = 0;
+    compute_t global_sumexp = 0;
+    compute_t global_max    = ck::NumericLimits<compute_t>::Lowest();
+
+    for(int32_t split_idx = 0; split_idx < split_k; ++split_idx)
+    {
+        load_v<data_t, data_vec_t>(O_splits + b * O_stride_b + m * O_stride_m + g * O_stride_g +
+                                       h * O_stride_h + split_idx * O_stride_split,
+                                   lane_idx,
+                                   &O_split_data.vec);
+#pragma unroll
+        for(int32_t i = 0; i < vec_size; ++i)
+        {
+            O_split_compute.arr[i] = ck::type_convert<compute_t>(O_split_data.arr[i]);
+        }
+        compute_t local_max         = *(split_max + blockIdx.x * split_k + split_idx);
+        compute_t local_sumexp      = *(split_sumexp + blockIdx.x * split_k + split_idx);
+        new_max                     = ck::math::max(local_max, global_max);
+        bool pick_new               = local_max < global_max;
+        compute_t log_alpha         = -std::abs(local_max - global_max);
+        compute_t alpha             = isnan(log_alpha) ? compute_t{1} : ck::math::exp(log_alpha);
+        compute_t pick_current_coef = (1 + (1 - pick_new) * (alpha - 1));
+        compute_t pick_new_coef     = (1 + pick_new * (alpha - 1));
+        global_sumexp = pick_current_coef * global_sumexp + pick_new_coef * local_sumexp;
+        global_O_compute.vec =
+            pick_current_coef * global_O_compute.vec + pick_new_coef * O_split_compute.vec;
+        global_max = new_max;
     }
-    if (lane_idx == 0) {
-      auto* __restrict__ smem_base = smem + tt;
+    global_O_compute.vec /= global_sumexp;
+#pragma unroll
+    for(int32_t i = 0; i < vec_size; ++i)
+    {
+        global_O_data.arr[i] = ck::type_convert<data_t>(global_O_compute.arr[i]);
+    }
+    store_v<data_t, data_vec_t>(O + b * O_stride_b + m * O_stride_m + g * O_stride_g +
+                                    h * O_stride_h,
+                                lane_idx,
+                                global_O_data.vec);
+}
+
+template <typename scalar_t,
+          int32_t vec_size           = 4,
+          int32_t n_loop_unroll      = 1,
+          int32_t n_loop_unroll_tail = 1,
+          int32_t KV_M_MAX           = 8192,
+          typename compute_t         = float>
+__global__ void
+efficient_attention_forward_decoder_splitk_ck_kernel(const scalar_t* __restrict__ XQ,
+                                                     const scalar_t* __restrict__ cache_K,
+                                                     const scalar_t* __restrict__ cache_V,
+                                                     scalar_t* __restrict__ O_splits,
+                                                     compute_t* __restrict__ split_max,
+                                                     compute_t* __restrict__ split_sumexp,
+                                                     const int32_t* __restrict__ seq_kv_lens,
+                                                     const ptrdiff_t XQ_stride_b,
+                                                     const ptrdiff_t XQ_stride_m,
+                                                     const ptrdiff_t XQ_stride_g,
+                                                     const ptrdiff_t XQ_stride_h,
+                                                     const ptrdiff_t K_stride_b,
+                                                     const ptrdiff_t K_stride_m,
+                                                     const ptrdiff_t K_stride_g,
+                                                     const ptrdiff_t K_stride_h,
+                                                     const ptrdiff_t O_stride_split,
+                                                     const int32_t Q_size_m,
+                                                     const int32_t Q_size_g,
+                                                     const int32_t Q_size_h,
+                                                     const int32_t Q_size_k,
+                                                     const int32_t K_size_m,
+                                                     const bool multiquery,
+                                                     const float qk_scale,
+                                                     const int32_t split_k)
+{
+    static_assert(n_loop_unroll_tail < n_loop_unroll || n_loop_unroll_tail == 1,
+                  "tail unroll must be smaller than main loop untoll; pragma unroll 0 is illegal "
+                  "(and tail is no-op)");
+
+    // Each block handles a single batch and head and query and group
+    const int32_t b         = blockIdx.x / (Q_size_m * Q_size_g * Q_size_h);
+    const int32_t m         = (blockIdx.x / (Q_size_g * Q_size_h)) % Q_size_m;
+    const int32_t g         = (blockIdx.x / Q_size_h) % Q_size_g;
+    const int32_t h         = blockIdx.x % Q_size_h;
+    const int32_t split_idx = blockIdx.y;
+
+    // Note: this is decoding case where we attend to current and all previous
+    // tokens.
+    const int32_t t_max = seq_kv_lens ? seq_kv_lens[b] : K_size_m;
+
+    const int32_t lane_idx              = threadIdx.x;
+    const int32_t wavefront_idx         = threadIdx.y;
+    const int32_t threads_per_wavefront = blockDim.x;
+    const int32_t wavefronts_per_block  = blockDim.y;
+    const int32_t threads_per_block     = threads_per_wavefront * wavefronts_per_block;
+    const int32_t thread_linear_idx     = lane_idx + wavefront_idx * threads_per_wavefront;
+    // const auto* q_ = &(XQ_acc[b][m][g][h][0]);
+    const auto XQO_base_offset =
+        b * XQ_stride_b + m * XQ_stride_m + g * XQ_stride_g + h * XQ_stride_h;
+    const auto* __restrict__ q_ = XQ + XQO_base_offset;
+
+    const auto cache_KV_base_offset =
+        b * K_stride_b + 0 * K_stride_m + g * K_stride_g + (multiquery ? 0 : h * K_stride_h);
+    const auto* __restrict__ cache_K_base = cache_K + cache_KV_base_offset;
+    const auto* __restrict__ cache_V_base = cache_V + cache_KV_base_offset;
+
+    using data_t        = scalar_t;
+    using data_vec_t    = typename ck::vector_type<data_t, vec_size>::type;
+    using compute_vec_t = typename ck::vector_type<compute_t, vec_size>::type;
+
+    const bool lane_active_for_io = lane_idx * vec_size < Q_size_k;
+
+    extern __shared__ __align__(16) compute_t smem[];
+
+    data_vec_t q_thread = 0;
+    // Load Q into registers in all wavefronts.
+    // Each thread handles `vec_size` D dimensions
+    if(lane_active_for_io)
+    {
+        load_v<data_t, data_vec_t>(q_, lane_idx, &q_thread);
+    }
+
+    compute_t max_qk_acc = ck::NumericLimits<compute_t>::Lowest();
+
+    // Compute S[0:t_max] =
+    // ```
+    // for t in range(t_max):
+    //   S[t] = dot(Q, K[t])
+    // ```
+    // Split the 0:t_max range across wavefronts in a block,
+    // unroll loads to expose more parallelism.
+    // Reduce the dot product with cross-lane operation;
+    // Q and K[t] are in the registers of threads in a single wavefront.
+
+    data_vec_t k_loads[n_loop_unroll] = {};
+
+    const auto dtt              = wavefronts_per_block * n_loop_unroll;
+    const auto n_unrolled_loops = t_max / dtt / split_k; // +1?
+    const int32_t tt_low = wavefront_idx * n_loop_unroll + n_unrolled_loops * dtt * split_idx;
+    const int32_t tt_high =
+        wavefront_idx * n_loop_unroll + n_unrolled_loops * dtt * (split_idx + 1);
+    const int32_t dtt_tail = wavefronts_per_block * n_loop_unroll_tail;
+    const int32_t tt_tail_low =
+        wavefront_idx * n_loop_unroll_tail + n_unrolled_loops * dtt * (split_idx + 1);
+    const int32_t tt_tail_high = (split_idx == split_k - 1) ? t_max : tt_tail_low;
+
+    for(auto tt = tt_low; tt < tt_high; tt += dtt)
+    {
+        if(lane_active_for_io)
+        {
 #pragma unroll n_loop_unroll
-      for (auto ttt = 0; ttt < n_loop_unroll; ++ttt) {
-        smem_base[ttt] = qk_accs[ttt];
-      }
+            for(auto ttt = 0; ttt < n_loop_unroll; ++ttt)
+            {
+                const int32_t t = tt + ttt;
+                // load the K[b][t][g][h|0][:] row into registers
+                load_v<data_t, data_vec_t>(cache_K_base + t * K_stride_m, lane_idx, &k_loads[ttt]);
+            }
+        }
+        compute_t qk_accs[n_loop_unroll] = {};
+#pragma unroll n_loop_unroll
+        for(auto ttt = 0; ttt < n_loop_unroll; ++ttt)
+        {
+            ck::inner_product<data_vec_t, data_vec_t, compute_t>(
+                q_thread, k_loads[ttt], qk_accs[ttt]);
+            qk_accs[ttt] *= qk_scale;
+
+            qk_accs[ttt] = wavefrontReduce(qk_accs[ttt], [](auto a, auto b) { return a + b; });
+            max_qk_acc   = ck::math::max(qk_accs[ttt], max_qk_acc);
+        }
+        if(lane_idx == 0)
+        {
+            auto* __restrict__ smem_base = smem + tt;
+#pragma unroll n_loop_unroll
+            for(auto ttt = 0; ttt < n_loop_unroll; ++ttt)
+            {
+                smem_base[ttt] = qk_accs[ttt];
+            }
+        }
     }
-  }
 
-  // NB: the length of the tail is <= (wavefronts_per_block * n_loop_unroll)
-  for (auto tt = tt_tail_low; tt < tt_tail_high; tt += dtt_tail) {
-    if (lane_active_for_io) {
+    // NB: the length of the tail is <= (wavefronts_per_block * n_loop_unroll)
+    for(auto tt = tt_tail_low; tt < tt_tail_high; tt += dtt_tail)
+    {
+        if(lane_active_for_io)
+        {
 #pragma unroll n_loop_unroll_tail
-      for (auto ttt = 0; ttt < n_loop_unroll_tail; ++ttt) {
-        const int32_t t = tt + ttt;
-        if (t < t_max) {
-          // load the K[b][t][g][h|0][:] row into registers
-          load_v<data_t, data_vec_t>(
-              cache_K_base + t * K_stride_m, lane_idx, &k_loads[ttt]);
+            for(auto ttt = 0; ttt < n_loop_unroll_tail; ++ttt)
+            {
+                const int32_t t = tt + ttt;
+                if(t < t_max)
+                {
+                    // load the K[b][t][g][h|0][:] row into registers
+                    load_v<data_t, data_vec_t>(
+                        cache_K_base + t * K_stride_m, lane_idx, &k_loads[ttt]);
+                }
+            }
         }
-      }
-    }
 #pragma unroll n_loop_unroll_tail
-    for (auto ttt = 0; ttt < n_loop_unroll_tail; ++ttt) {
-      compute_t qk_acc = 0;
-      const int32_t t = tt + ttt;
-      if (t < t_max) {
-        ck::inner_product<data_vec_t, data_vec_t, compute_t>(
-            q_thread, k_loads[ttt], qk_acc);
-        qk_acc *= qk_scale;
-
-        qk_acc = wavefrontReduce(qk_acc, [](auto a, auto b) { return a + b; });
-        max_qk_acc = ck::math::max(qk_acc, max_qk_acc);
-
-        // write accumulated sums to smem.
-        if (lane_idx == 0) {
-          smem[t] = qk_acc;
+        for(auto ttt = 0; ttt < n_loop_unroll_tail; ++ttt)
+        {
+            compute_t qk_acc = 0;
+            const int32_t t  = tt + ttt;
+            if(t < t_max)
+            {
+                ck::inner_product<data_vec_t, data_vec_t, compute_t>(
+                    q_thread, k_loads[ttt], qk_acc);
+                qk_acc *= qk_scale;
+
+                qk_acc     = wavefrontReduce(qk_acc, [](auto a, auto b) { return a + b; });
+                max_qk_acc = ck::math::max(qk_acc, max_qk_acc);
+
+                // write accumulated sums to smem.
+                if(lane_idx == 0)
+                {
+                    smem[t] = qk_acc;
+                }
+            }
         }
-      }
     }
-  }
-
-  // Use shared reduction to compute max and compute softmax on shared memory.
-  // write max acc
-  if (lane_idx == 0) {
-    smem[KV_M_MAX + wavefront_idx] = max_qk_acc;
-  }
-  __syncthreads();
-  if (lane_idx < wavefronts_per_block) {
-    max_qk_acc = ck::math::max(max_qk_acc, smem[KV_M_MAX + lane_idx]);
-  }
-  // shared across all threads in block
-  max_qk_acc =
-      wavefrontReduce(max_qk_acc, [](auto a, auto b) { return a > b ? a : b; });
-
-  if (wavefront_idx == 0 && lane_idx == 0) {
-    split_max[blockIdx.x * split_k + split_idx] = max_qk_acc;
-  }
-
-  // each wavefront computes partial sum of exp.
-  compute_t softmax_denominator = 0.0f;
-  for (int32_t t = thread_linear_idx; t < t_max; t += threads_per_block) {
-    if (t >= tt_low && t < tt_tail_high) {
-      softmax_denominator += ck::math::exp(smem[t] - max_qk_acc);
+
+    // Use shared reduction to compute max and compute softmax on shared memory.
+    // write max acc
+    if(lane_idx == 0)
+    {
+        smem[KV_M_MAX + wavefront_idx] = max_qk_acc;
+    }
+    __syncthreads();
+    if(lane_idx < wavefronts_per_block)
+    {
+        max_qk_acc = ck::math::max(max_qk_acc, smem[KV_M_MAX + lane_idx]);
+    }
+    // shared across all threads in block
+    max_qk_acc = wavefrontReduce(max_qk_acc, [](auto a, auto b) { return a > b ? a : b; });
+
+    if(wavefront_idx == 0 && lane_idx == 0)
+    {
+        split_max[blockIdx.x * split_k + split_idx] = max_qk_acc;
+    }
+
+    // each wavefront computes partial sum of exp.
+    compute_t softmax_denominator = 0.0f;
+    for(int32_t t = thread_linear_idx; t < t_max; t += threads_per_block)
+    {
+        if(t >= tt_low && t < tt_tail_high)
+        {
+            softmax_denominator += ck::math::exp(smem[t] - max_qk_acc);
+        }
+    }
+    softmax_denominator =
+        wavefrontReduce(softmax_denominator, [](auto a, auto b) { return a + b; });
+
+    if(lane_idx == 0)
+    {
+        smem[KV_M_MAX + wavefront_idx] = softmax_denominator;
+    }
+    __syncthreads();
+
+    // now, compute sum of exp(x - max(x)) over all intermediate results.
+    softmax_denominator = 0.0;
+    if(lane_idx < wavefronts_per_block)
+    {
+        softmax_denominator = smem[KV_M_MAX + lane_idx];
+    }
+    softmax_denominator =
+        wavefrontReduce(softmax_denominator, [](auto a, auto b) { return a + b; });
+
+    if(wavefront_idx == 0 && lane_idx == 0)
+    {
+        split_sumexp[blockIdx.x * split_k + split_idx] = softmax_denominator;
     }
-  }
-  softmax_denominator = wavefrontReduce(
-      softmax_denominator, [](auto a, auto b) { return a + b; });
-
-  if (lane_idx == 0) {
-    smem[KV_M_MAX + wavefront_idx] = softmax_denominator;
-  }
-  __syncthreads();
-
-  // now, compute sum of exp(x - max(x)) over all intermediate results.
-  softmax_denominator = 0.0;
-  if (lane_idx < wavefronts_per_block) {
-    softmax_denominator = smem[KV_M_MAX + lane_idx];
-  }
-  softmax_denominator = wavefrontReduce(
-      softmax_denominator, [](auto a, auto b) { return a + b; });
-
-  if (wavefront_idx == 0 && lane_idx == 0) {
-    split_sumexp[blockIdx.x * split_k + split_idx] = softmax_denominator;
-  }
-  
-  // now, compute the normalization across all threads.
-  for (int32_t t = thread_linear_idx; t < t_max; t += threads_per_block) { 
-    // softmax scale by sumexp will happen in the reduction kernel
-    smem[t] = ck::math::exp(smem[t] - max_qk_acc);
-  }
-  __syncthreads();
-
-  // Split T across wavefronts in a block
-  // each wavefront compute sum(t_subset) P[t] * V[t_subset, d]
-  // outputs are of size float[D]
-
-  compute_t ps[n_loop_unroll] = {};
-  compute_vec_t o_acc = 0;
-  if (lane_active_for_io) {
-    for (auto tt = tt_low; tt < tt_high; tt += dtt) {
+
+    // now, compute the normalization across all threads.
+    for(int32_t t = thread_linear_idx; t < t_max; t += threads_per_block)
+    {
+        // softmax scale by sumexp will happen in the reduction kernel
+        smem[t] = ck::math::exp(smem[t] - max_qk_acc);
+    }
+    __syncthreads();
+
+    // Split T across wavefronts in a block
+    // each wavefront compute sum(t_subset) P[t] * V[t_subset, d]
+    // outputs are of size float[D]
+
+    compute_t ps[n_loop_unroll] = {};
+    compute_vec_t o_acc         = 0;
+    if(lane_active_for_io)
+    {
+        for(auto tt = tt_low; tt < tt_high; tt += dtt)
+        {
 #pragma unroll n_loop_unroll
-      for (auto ttt = 0; ttt < n_loop_unroll; ++ttt) {
-        const int32_t t = tt + ttt;
-        // load the V[b][t][g][h|0][:] row into registers, reusing K register
-        // storage
-        load_v<data_t, data_vec_t>(
-            cache_V_base + t * K_stride_m, lane_idx, &k_loads[ttt]);
-        ps[ttt] = smem[t];
-      }
+            for(auto ttt = 0; ttt < n_loop_unroll; ++ttt)
+            {
+                const int32_t t = tt + ttt;
+                // load the V[b][t][g][h|0][:] row into registers, reusing K register
+                // storage
+                load_v<data_t, data_vec_t>(cache_V_base + t * K_stride_m, lane_idx, &k_loads[ttt]);
+                ps[ttt] = smem[t];
+            }
 
 #pragma unroll n_loop_unroll
-      for (auto ttt = 0; ttt < n_loop_unroll; ++ttt) {
-        o_acc =
-            scalar_scale_acc<data_t, vec_size>(o_acc, k_loads[ttt], ps[ttt]);
-      }
-    }
+            for(auto ttt = 0; ttt < n_loop_unroll; ++ttt)
+            {
+                o_acc = scalar_scale_acc<data_t, vec_size>(o_acc, k_loads[ttt], ps[ttt]);
+            }
+        }
 
-    for (auto tt = tt_tail_low; tt < tt_tail_high; tt += dtt_tail) {
+        for(auto tt = tt_tail_low; tt < tt_tail_high; tt += dtt_tail)
+        {
 #pragma unroll n_loop_unroll_tail
-      for (auto ttt = 0; ttt < n_loop_unroll_tail; ++ttt) {
-        const int32_t t = tt + ttt;
-        if (t < t_max) {
-          // load the V[b][t][g][h|0][:] row into registers, reusing K register
-          // storage
-          load_v<data_t, data_vec_t>(
-              cache_V_base + t * K_stride_m, lane_idx, &k_loads[ttt]);
-          ps[ttt] = smem[t];
-        }
-      }
+            for(auto ttt = 0; ttt < n_loop_unroll_tail; ++ttt)
+            {
+                const int32_t t = tt + ttt;
+                if(t < t_max)
+                {
+                    // load the V[b][t][g][h|0][:] row into registers, reusing K register
+                    // storage
+                    load_v<data_t, data_vec_t>(
+                        cache_V_base + t * K_stride_m, lane_idx, &k_loads[ttt]);
+                    ps[ttt] = smem[t];
+                }
+            }
 
 #pragma unroll n_loop_unroll_tail
-      for (auto ttt = 0; ttt < n_loop_unroll_tail; ++ttt) {
-        const int32_t t = tt + ttt;
-        if (t < t_max) {
-          o_acc =
-              scalar_scale_acc<data_t, vec_size>(o_acc, k_loads[ttt], ps[ttt]);
+            for(auto ttt = 0; ttt < n_loop_unroll_tail; ++ttt)
+            {
+                const int32_t t = tt + ttt;
+                if(t < t_max)
+                {
+                    o_acc = scalar_scale_acc<data_t, vec_size>(o_acc, k_loads[ttt], ps[ttt]);
+                }
+            }
         }
-      }
     }
-  }
-  // now, each thread has partial sums. Write to smem and get accumulated
-  // results back.
-  __syncthreads();
-
-  // NB: needs sizeof(smem) >= `vec_size` * (sizeof(float)==4) * threadsPerBlock
-  if (lane_active_for_io) {
-    store_v<compute_t, compute_vec_t>(&smem[0], thread_linear_idx, o_acc);
-  }
-
-  __syncthreads();
-  // sum up partial D rows from other wavefronts
-  if (wavefront_idx == 0 && lane_active_for_io) {
-    union {
-      compute_vec_t vec = 0;
-      compute_t arr[vec_size];
-    } r;
-    for (int32_t w = 0; w < wavefronts_per_block; ++w) {
-      compute_vec_t partial_r;
-      load_v<compute_t, compute_vec_t>(
-          smem, w * threads_per_wavefront + lane_idx, &partial_r);
-      r.vec += partial_r;
+    // now, each thread has partial sums. Write to smem and get accumulated
+    // results back.
+    __syncthreads();
+
+    // NB: needs sizeof(smem) >= `vec_size` * (sizeof(float)==4) * threadsPerBlock
+    if(lane_active_for_io)
+    {
+        store_v<compute_t, compute_vec_t>(&smem[0], thread_linear_idx, o_acc);
     }
-    // elementwise convert from compute_t result to data_t out to be written
-    union {
-      data_vec_t vec;
-      data_t arr[vec_size];
-    } bf_r;
+
+    __syncthreads();
+    // sum up partial D rows from other wavefronts
+    if(wavefront_idx == 0 && lane_active_for_io)
+    {
+        union
+        {
+            compute_vec_t vec = 0;
+            compute_t arr[vec_size];
+        } r;
+        for(int32_t w = 0; w < wavefronts_per_block; ++w)
+        {
+            compute_vec_t partial_r;
+            load_v<compute_t, compute_vec_t>(
+                smem, w * threads_per_wavefront + lane_idx, &partial_r);
+            r.vec += partial_r;
+        }
+        // elementwise convert from compute_t result to data_t out to be written
+        union
+        {
+            data_vec_t vec;
+            data_t arr[vec_size];
+        } bf_r;
 #pragma unroll
-    for (int32_t i = 0; i < vec_size; ++i) {
-      bf_r.arr[i] = ck::type_convert<data_t>(r.arr[i]);
+        for(int32_t i = 0; i < vec_size; ++i)
+        {
+            bf_r.arr[i] = ck::type_convert<data_t>(r.arr[i]);
+        }
+        // write output row O[b][m][g][h][:]
+        data_t* __restrict__ o_ = O_splits + XQO_base_offset + split_idx * O_stride_split;
+        store_v<data_t, data_vec_t>(o_, lane_idx, bf_r.vec);
     }
-    // write output row O[b][m][g][h][:]
-    data_t* __restrict__ o_ = O_splits + XQO_base_offset + split_idx * O_stride_split;
-    store_v<data_t, data_vec_t>(o_, lane_idx, bf_r.vec);
-  }
 }
 
 } // namespace
@@ -476,230 +517,241 @@ namespace ck {
 namespace tensor_operation {
 namespace device {
 template <typename scalar_t, typename compute_t = float>
-struct FMHADecoderSplitKDeviceOp : public BaseOperator {
-  using DeviceOp = FMHADecoderSplitKDeviceOp;
-  struct Argument : public BaseArgument {
-    const scalar_t* __restrict__ XQ;
-    const scalar_t* __restrict__ cache_K;
-    const scalar_t* __restrict__ cache_V;
-    scalar_t* __restrict__ O;
-    scalar_t* __restrict__ split_O;
-    compute_t* __restrict__ split_max;
-    compute_t* __restrict__ split_sumexp;
-    const int32_t* __restrict__ seq_kv_lens;
-    const ptrdiff_t XQ_stride_b;
-    const ptrdiff_t XQ_stride_m;
-    const ptrdiff_t XQ_stride_g;
-    const ptrdiff_t XQ_stride_h;
-    const ptrdiff_t K_stride_b;
-    const ptrdiff_t K_stride_m;
-    const ptrdiff_t K_stride_g;
-    const ptrdiff_t K_stride_h;
-    const ptrdiff_t O_stride_split;
-    const int32_t Q_size_m;
-    const int32_t Q_size_g;
-    const int32_t Q_size_h;
-    const int32_t Q_size_k;
-    const int32_t K_size_m;
-    const bool multiquery;
-    const float qk_scale;
-    const int32_t split_k;
-
-    const dim3 grid_dim;
-    const dim3 block_dim;
-    const size_t lds_bytes;
-
-    Argument(
-        const scalar_t* __restrict__ XQ,
-        const scalar_t* __restrict__ cache_K,
-        const scalar_t* __restrict__ cache_V,
-        scalar_t* __restrict__ O,
-        scalar_t* __restrict__ split_O,
-        compute_t* __restrict__ split_max,
-        compute_t* __restrict__ split_sumexp,
-        const int32_t* __restrict__ seq_kv_lens,
-        const ptrdiff_t XQ_stride_b,
-        const ptrdiff_t XQ_stride_m,
-        const ptrdiff_t XQ_stride_g,
-        const ptrdiff_t XQ_stride_h,
-        const ptrdiff_t K_stride_b,
-        const ptrdiff_t K_stride_m,
-        const ptrdiff_t K_stride_g,
-        const ptrdiff_t K_stride_h,
-        const ptrdiff_t O_stride_split,
-        const int32_t Q_size_m,
-        const int32_t Q_size_g,
-        const int32_t Q_size_h,
-        const int32_t Q_size_k,
-        const int32_t K_size_m,
-        const bool multiquery,
-        const float qk_scale,
-        const int32_t split_k,
-        // launch params
-        const dim3 grid_dim,
-        const dim3 block_dim,
-        const size_t lds_bytes)
-        : XQ(XQ),
-          cache_K(cache_K),
-          cache_V(cache_V),
-          O(O),
-          split_O(split_O),
-          split_max(split_max),
-          split_sumexp(split_sumexp),
-          seq_kv_lens(seq_kv_lens),
-          XQ_stride_b(XQ_stride_b),
-          XQ_stride_m(XQ_stride_m),
-          XQ_stride_g(XQ_stride_g),
-          XQ_stride_h(XQ_stride_h),
-          K_stride_b(K_stride_b),
-          K_stride_m(K_stride_m),
-          K_stride_g(K_stride_g),
-          K_stride_h(K_stride_h),
-          O_stride_split(O_stride_split),
-          Q_size_m(Q_size_m),
-          Q_size_g(Q_size_g),
-          Q_size_h(Q_size_h),
-          Q_size_k(Q_size_k),
-          K_size_m(K_size_m),
-          multiquery(multiquery),
-          qk_scale(qk_scale),
-          split_k(split_k),
-          // launch params
-          grid_dim(grid_dim),
-          block_dim(block_dim),
-          lds_bytes(lds_bytes) {}
-
-      std::string str() const {
-        std::ostringstream oss;
-        oss << "Argument { " << std::endl <<
-        "    XQ: " << XQ << std::endl <<
-        "    cache_K: " << cache_K << std::endl <<
-        "    cache_V: " << cache_V << std::endl << 
-        "    O: " << O << std::endl <<
-        "    split_O: " << split_O << std::endl <<
-        "    split_max: " << split_max << std::endl <<
-        "    split_sumexp: " << split_sumexp << std::endl <<
-        "    seq_kv_lens: " << seq_kv_lens << std::endl <<
-        "    XQ_stride_b: " << XQ_stride_b << std::endl <<
-        "    XQ_stride_m: " << XQ_stride_m << std::endl <<
-        "    XQ_stride_g: " << XQ_stride_g << std::endl <<
-        "    XQ_stride_h: " << XQ_stride_h << std::endl <<
-        "    K_stride_b: " << K_stride_b << std::endl <<
-        "    K_stride_m: " << K_stride_m << std::endl <<
-        "    K_stride_g: " << K_stride_g << std::endl <<
-        "    K_stride_h: " << K_stride_h << std::endl <<
-        "    O_stride_split: " << O_stride_split << std::endl <<
-        "    Q_size_m: " << Q_size_m << std::endl <<
-        "    Q_size_g: " << Q_size_g << std::endl <<
-        "    Q_size_h: " << Q_size_h << std::endl <<
-        "    Q_size_k: " << Q_size_k << std::endl <<
-        "    K_size_m: " << K_size_m << std::endl <<
-        "    multiquery: " << multiquery << std::endl <<
-        "    qk_scale: " << qk_scale << std::endl <<
-        "    split_k: " << split_k << std::endl <<
-        std::endl <<
-        "    grid_dim: " << grid_dim.x << "." << grid_dim.y << "." << grid_dim.z << std::endl <<
-        "    block_dim: " << block_dim.x << "." << block_dim.y << "." << block_dim.z << std::endl <<
-        "    lds_bytes: " << lds_bytes << std::endl <<
-        "}";
-        return oss.str();
-      }
-  };
-
-  struct Invoker : public BaseInvoker {
-    using Argument = DeviceOp::Argument;
-    float Run(
-        const Argument& arg,
-        const StreamConfig& stream_config = StreamConfig{}) {
-
-      auto threads_per_wavefront = arg.block_dim.x;
-
-      auto Q_size_k_alignment_necessary = 0;
-
-      for (auto vec_size : {4, 2, 1}) {
-        if (arg.Q_size_k <= vec_size * threads_per_wavefront) {
-          Q_size_k_alignment_necessary = vec_size;
+struct FMHADecoderSplitKDeviceOp : public BaseOperator
+{
+    using DeviceOp = FMHADecoderSplitKDeviceOp;
+    struct Argument : public BaseArgument
+    {
+        const scalar_t* __restrict__ XQ;
+        const scalar_t* __restrict__ cache_K;
+        const scalar_t* __restrict__ cache_V;
+        scalar_t* __restrict__ O;
+        scalar_t* __restrict__ split_O;
+        compute_t* __restrict__ split_max;
+        compute_t* __restrict__ split_sumexp;
+        const int32_t* __restrict__ seq_kv_lens;
+        const ptrdiff_t XQ_stride_b;
+        const ptrdiff_t XQ_stride_m;
+        const ptrdiff_t XQ_stride_g;
+        const ptrdiff_t XQ_stride_h;
+        const ptrdiff_t K_stride_b;
+        const ptrdiff_t K_stride_m;
+        const ptrdiff_t K_stride_g;
+        const ptrdiff_t K_stride_h;
+        const ptrdiff_t O_stride_split;
+        const int32_t Q_size_m;
+        const int32_t Q_size_g;
+        const int32_t Q_size_h;
+        const int32_t Q_size_k;
+        const int32_t K_size_m;
+        const bool multiquery;
+        const float qk_scale;
+        const int32_t split_k;
+
+        const dim3 grid_dim;
+        const dim3 block_dim;
+        const size_t lds_bytes;
+
+        Argument(const scalar_t* __restrict__ XQ,
+                 const scalar_t* __restrict__ cache_K,
+                 const scalar_t* __restrict__ cache_V,
+                 scalar_t* __restrict__ O,
+                 scalar_t* __restrict__ split_O,
+                 compute_t* __restrict__ split_max,
+                 compute_t* __restrict__ split_sumexp,
+                 const int32_t* __restrict__ seq_kv_lens,
+                 const ptrdiff_t XQ_stride_b,
+                 const ptrdiff_t XQ_stride_m,
+                 const ptrdiff_t XQ_stride_g,
+                 const ptrdiff_t XQ_stride_h,
+                 const ptrdiff_t K_stride_b,
+                 const ptrdiff_t K_stride_m,
+                 const ptrdiff_t K_stride_g,
+                 const ptrdiff_t K_stride_h,
+                 const ptrdiff_t O_stride_split,
+                 const int32_t Q_size_m,
+                 const int32_t Q_size_g,
+                 const int32_t Q_size_h,
+                 const int32_t Q_size_k,
+                 const int32_t K_size_m,
+                 const bool multiquery,
+                 const float qk_scale,
+                 const int32_t split_k,
+                 // launch params
+                 const dim3 grid_dim,
+                 const dim3 block_dim,
+                 const size_t lds_bytes)
+            : XQ(XQ),
+              cache_K(cache_K),
+              cache_V(cache_V),
+              O(O),
+              split_O(split_O),
+              split_max(split_max),
+              split_sumexp(split_sumexp),
+              seq_kv_lens(seq_kv_lens),
+              XQ_stride_b(XQ_stride_b),
+              XQ_stride_m(XQ_stride_m),
+              XQ_stride_g(XQ_stride_g),
+              XQ_stride_h(XQ_stride_h),
+              K_stride_b(K_stride_b),
+              K_stride_m(K_stride_m),
+              K_stride_g(K_stride_g),
+              K_stride_h(K_stride_h),
+              O_stride_split(O_stride_split),
+              Q_size_m(Q_size_m),
+              Q_size_g(Q_size_g),
+              Q_size_h(Q_size_h),
+              Q_size_k(Q_size_k),
+              K_size_m(K_size_m),
+              multiquery(multiquery),
+              qk_scale(qk_scale),
+              split_k(split_k),
+              // launch params
+              grid_dim(grid_dim),
+              block_dim(block_dim),
+              lds_bytes(lds_bytes)
+        {
         }
-      }
-
-      if (!Q_size_k_alignment_necessary) {
-        throw std::runtime_error("Unsupported Q_size_k");
-      }
-
-      if (arg.Q_size_k % Q_size_k_alignment_necessary) {
-        throw std::runtime_error("Unsupported alignment for Q_size_k");
-      }
-
-      float split_attention_result = launch_and_time_kernel(
-          stream_config,
-          Q_size_k_alignment_necessary == 4
-              ? efficient_attention_forward_decoder_splitk_ck_kernel<scalar_t, 4>
-              : Q_size_k_alignment_necessary == 2
-              ? efficient_attention_forward_decoder_splitk_ck_kernel<scalar_t, 2>
-              : Q_size_k_alignment_necessary == 1
-              ? efficient_attention_forward_decoder_splitk_ck_kernel<scalar_t, 1>
-              : nullptr,
-          arg.grid_dim,
-          arg.block_dim,
-          arg.lds_bytes,
-          arg.XQ,
-          arg.cache_K,
-          arg.cache_V,
-          arg.split_O,
-          arg.split_max,
-          arg.split_sumexp,
-          arg.seq_kv_lens,
-          arg.XQ_stride_b,
-          arg.XQ_stride_m,
-          arg.XQ_stride_g,
-          arg.XQ_stride_h,
-          arg.K_stride_b,
-          arg.K_stride_m,
-          arg.K_stride_g,
-          arg.K_stride_h,
-          arg.O_stride_split,
-          arg.Q_size_m,
-          arg.Q_size_g,
-          arg.Q_size_h,
-          arg.Q_size_k,
-          arg.K_size_m,
-          arg.multiquery,
-          arg.qk_scale,
-          arg.split_k);
-
-        const dim3 reduce_gridsize = {arg.grid_dim.x};
-        const dim3 reduce_blocksize = {arg.block_dim.x};
-        constexpr int32_t reduce_lds_bytes = 0;
-        float reduce_result = launch_and_time_kernel(
-          stream_config,
-          Q_size_k_alignment_necessary == 4
-              ? efficient_attention_forward_decoder_splitk_reduce_ck_kernel<scalar_t, 4>
-              : Q_size_k_alignment_necessary == 2
-              ? efficient_attention_forward_decoder_splitk_reduce_ck_kernel<scalar_t, 2>
-              : Q_size_k_alignment_necessary == 1
-              ? efficient_attention_forward_decoder_splitk_reduce_ck_kernel<scalar_t, 1>
-              : nullptr,
-          reduce_gridsize,
-          reduce_blocksize,
-          reduce_lds_bytes, 
-          arg.split_O,
-          arg.split_max,
-          arg.split_sumexp,
-          arg.O,
-          arg.Q_size_m,
-          arg.Q_size_g,
-          arg.Q_size_h,
-          arg.Q_size_k,
-          arg.O_stride_split,
-          arg.XQ_stride_b,
-          arg.XQ_stride_m,
-          arg.XQ_stride_g,
-          arg.XQ_stride_h,
-          arg.split_k
-        );
-        return split_attention_result + reduce_result;
-    }
-  };
+
+        std::string str() const
+        {
+            std::ostringstream oss;
+            oss << "Argument { " << std::endl
+                << "    XQ: " << XQ << std::endl
+                << "    cache_K: " << cache_K << std::endl
+                << "    cache_V: " << cache_V << std::endl
+                << "    O: " << O << std::endl
+                << "    split_O: " << split_O << std::endl
+                << "    split_max: " << split_max << std::endl
+                << "    split_sumexp: " << split_sumexp << std::endl
+                << "    seq_kv_lens: " << seq_kv_lens << std::endl
+                << "    XQ_stride_b: " << XQ_stride_b << std::endl
+                << "    XQ_stride_m: " << XQ_stride_m << std::endl
+                << "    XQ_stride_g: " << XQ_stride_g << std::endl
+                << "    XQ_stride_h: " << XQ_stride_h << std::endl
+                << "    K_stride_b: " << K_stride_b << std::endl
+                << "    K_stride_m: " << K_stride_m << std::endl
+                << "    K_stride_g: " << K_stride_g << std::endl
+                << "    K_stride_h: " << K_stride_h << std::endl
+                << "    O_stride_split: " << O_stride_split << std::endl
+                << "    Q_size_m: " << Q_size_m << std::endl
+                << "    Q_size_g: " << Q_size_g << std::endl
+                << "    Q_size_h: " << Q_size_h << std::endl
+                << "    Q_size_k: " << Q_size_k << std::endl
+                << "    K_size_m: " << K_size_m << std::endl
+                << "    multiquery: " << multiquery << std::endl
+                << "    qk_scale: " << qk_scale << std::endl
+                << "    split_k: " << split_k << std::endl
+                << std::endl
+                << "    grid_dim: " << grid_dim.x << "." << grid_dim.y << "." << grid_dim.z
+                << std::endl
+                << "    block_dim: " << block_dim.x << "." << block_dim.y << "." << block_dim.z
+                << std::endl
+                << "    lds_bytes: " << lds_bytes << std::endl
+                << "}";
+            return oss.str();
+        }
+    };
+
+    struct Invoker : public BaseInvoker
+    {
+        using Argument = DeviceOp::Argument;
+        float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{})
+        {
+
+            auto threads_per_wavefront = arg.block_dim.x;
+
+            auto Q_size_k_alignment_necessary = 0;
+
+            for(auto vec_size : {4, 2, 1})
+            {
+                if(arg.Q_size_k <= vec_size * threads_per_wavefront)
+                {
+                    Q_size_k_alignment_necessary = vec_size;
+                }
+            }
+
+            if(!Q_size_k_alignment_necessary)
+            {
+                throw std::runtime_error("Unsupported Q_size_k");
+            }
+
+            if(arg.Q_size_k % Q_size_k_alignment_necessary)
+            {
+                throw std::runtime_error("Unsupported alignment for Q_size_k");
+            }
+
+            float split_attention_result = launch_and_time_kernel(
+                stream_config,
+                Q_size_k_alignment_necessary == 4
+                    ? efficient_attention_forward_decoder_splitk_ck_kernel<scalar_t, 4>
+                    : Q_size_k_alignment_necessary == 2
+                          ? efficient_attention_forward_decoder_splitk_ck_kernel<scalar_t, 2>
+                          : Q_size_k_alignment_necessary == 1
+                                ? efficient_attention_forward_decoder_splitk_ck_kernel<scalar_t, 1>
+                                : nullptr,
+                arg.grid_dim,
+                arg.block_dim,
+                arg.lds_bytes,
+                arg.XQ,
+                arg.cache_K,
+                arg.cache_V,
+                arg.split_O,
+                arg.split_max,
+                arg.split_sumexp,
+                arg.seq_kv_lens,
+                arg.XQ_stride_b,
+                arg.XQ_stride_m,
+                arg.XQ_stride_g,
+                arg.XQ_stride_h,
+                arg.K_stride_b,
+                arg.K_stride_m,
+                arg.K_stride_g,
+                arg.K_stride_h,
+                arg.O_stride_split,
+                arg.Q_size_m,
+                arg.Q_size_g,
+                arg.Q_size_h,
+                arg.Q_size_k,
+                arg.K_size_m,
+                arg.multiquery,
+                arg.qk_scale,
+                arg.split_k);
+
+            const dim3 reduce_gridsize         = {arg.grid_dim.x};
+            const dim3 reduce_blocksize        = {arg.block_dim.x};
+            constexpr int32_t reduce_lds_bytes = 0;
+            float reduce_result                = launch_and_time_kernel(
+                stream_config,
+                Q_size_k_alignment_necessary == 4
+                    ? efficient_attention_forward_decoder_splitk_reduce_ck_kernel<scalar_t, 4>
+                    : Q_size_k_alignment_necessary == 2
+                          ? efficient_attention_forward_decoder_splitk_reduce_ck_kernel<scalar_t, 2>
+                          : Q_size_k_alignment_necessary == 1
+                                ? efficient_attention_forward_decoder_splitk_reduce_ck_kernel<
+                                      scalar_t,
+                                      1>
+                                : nullptr,
+                reduce_gridsize,
+                reduce_blocksize,
+                reduce_lds_bytes,
+                arg.split_O,
+                arg.split_max,
+                arg.split_sumexp,
+                arg.O,
+                arg.Q_size_m,
+                arg.Q_size_g,
+                arg.Q_size_h,
+                arg.Q_size_k,
+                arg.O_stride_split,
+                arg.XQ_stride_b,
+                arg.XQ_stride_m,
+                arg.XQ_stride_g,
+                arg.XQ_stride_h,
+                arg.split_k);
+            return split_attention_result + reduce_result;
+        }
+    };
 };
 } // namespace device
 } // namespace tensor_operation

From 588b3a02d6d7b3bf96aefcf7efee01816e21d66e Mon Sep 17 00:00:00 2001
From: Qianfeng Zhang <Qianfeng.Zhang@amd.com>
Date: Wed, 10 Jan 2024 17:18:50 +0000
Subject: [PATCH 338/641] Enable support of attn-bias types with LocalAttention

---
 tests/test_forward_ck_tiled.py                | 2100 ++++++++++++++---
 tests/test_mqa_forward_ck_tiled.py            |  673 ++++++
 .../hip_fmha/ck_tiled_fmha_forward_kernel.h   |   13 +-
 xformers/ops/fmha/ck.py                       |  163 +-
 4 files changed, 2602 insertions(+), 347 deletions(-)
 create mode 100644 tests/test_mqa_forward_ck_tiled.py

diff --git a/tests/test_forward_ck_tiled.py b/tests/test_forward_ck_tiled.py
index e2d6abc6f..a0685d88e 100644
--- a/tests/test_forward_ck_tiled.py
+++ b/tests/test_forward_ck_tiled.py
@@ -5,22 +5,26 @@
 
 import math
 import random
+from functools import partial
 from typing import List, Optional, Sequence, Tuple, Type, TypeVar
 
 import pytest
 import torch
+import torch.nn.functional as F
 from scipy.stats import binomtest
 from torch.utils.checkpoint import checkpoint
 
 import xformers.ops
+from xformers.attn_bias_utils import create_attn_bias
 from xformers.ops import fmha
+from xformers.ops.fmha import ALL_BW_OPS, ALL_FW_OPS
 from xformers.ops.fmha.common import AttentionOpBase
+from xformers.ops.fmha.dispatch import _dispatch_fw_priority_list
 
 from .utils import assert_allclose
 
 torch.backends.cuda.matmul.allow_tf32 = False
 cuda_only = pytest.mark.skipif(not torch.cuda.is_available(), reason="requires CUDA")
-
 _devices = ["cpu", "cuda"] if torch.cuda.is_available() else ["cpu"]
 _types = [torch.float16, torch.bfloat16]
 
@@ -91,13 +95,14 @@ def generate_test_shapes_B_Mq_Mkv_H_K_Kv(op):
     ]
     # Add some random shapes
     if op in [
-        fmha.ck.FwOp,
-        fmha.ck.BwOp,
+        fmha.cutlass.FwOp,
+        fmha.cutlass.BwOp,
+        fmha.flash.BwOp,
     ]:
         K_CHOICES = [8 * i for i in range(1, 256 // 8)]
         r = random.Random(0)
         found_count = 0
-        while found_count < 20:
+        while found_count < 200:
             B = r.randint(1, 400)
             Mq = r.randint(1, 500)
             Mkv = r.randint(1, 500)
@@ -146,10 +151,10 @@ def _generate_op_device_dtype_biasT_B_Mq_Mkv_H_K_Kv(
                         B, Mq, Mkv, H, K, Kv = shape
                         B = min(B, 12)
 
-                        if (
-                            bias_type
-                            is fmha.attn_bias.BlockDiagonalCausalFromBottomRightMask
-                        ):
+                        if bias_type in {
+                            fmha.attn_bias.BlockDiagonalCausalFromBottomRightMask,
+                            fmha.attn_bias.BlockDiagonalCausalLocalAttentionFromBottomRightMask,
+                        }:
                             Mq, Mkv = min(Mkv, Mq), max(Mkv, Mq) + 2
                         elif (
                             bias_type
@@ -207,50 +212,40 @@ def _generate_op_device_dtype_biasT_B_Mq_Mkv_H_K_Kv(
     **_generate_op_device_dtype_biasT_B_Mq_Mkv_H_K_Kv(ALL_BW_OPS, max_shapes_per_op=1),
 )
 
-def ref_attention(q, k, v, attn_bias=None, drop_mask=None, p=0.0, scale=None, dtype=None):
-    if q.ndim == 4:
-        B, M, Hq, K = q.shape
-        _, N, Hkv, Kv = v.shape
-        nhead_ratio_qk = Hq // Hkv
 
-        def attn_bias_head(head: int):
+def ref_attention(q, k, v, attn_bias=None, drop_mask=None, p=0.0, scale=None):
+    if q.ndim == 5:
+
+        def attn_bias_group(group: int):
             if isinstance(attn_bias, torch.Tensor):
-                assert attn_bias.ndim == 4
-                _, H, _, _ = attn_bias.shape        
-                assert H == Hq
-                bias_bghmn = attn_bias.reshape(B, Hkv, nhead_ratio_qk, M, N)
-                return bias_bghmn[:, :, head]
+                return attn_bias[:, group]
             if isinstance(attn_bias, fmha.attn_bias.LowerTriangularMaskWithTensorBias):
-                assert attn_bias._bias.ndim == 4
-                _, H, _, _ = attn_bias._bias.shape        
-                assert H == Hq
-                bias_bghmn = attn_bias._bias.reshape(B, Hkv, nhead_ratio_qk, M, N)
-
                 return fmha.attn_bias.LowerTriangularMaskWithTensorBias(
-                    bias_bghmn[:, :, head]
+                    attn_bias._bias[:, group]
                 )
             return attn_bias
 
-        q_bmghk = q.reshape((B, M, Hkv, nhead_ratio_qk, K))
-
         return torch.stack(
             [
                 ref_attention_bmhk(
-                    q_bmghk[:, :, :, h], k, v, attn_bias=attn_bias_head(h), dtype=dtype
+                    q[:, :, g],
+                    k[:, :, g],
+                    v[:, :, g],
+                    scale=scale,
+                    attn_bias=attn_bias_group(g),
                 )
-                for h in range(q_bmghk.shape[3])
+                for g in range(q.shape[2])
             ],
-            dim=3,
-        ).reshape((B, M, Hq, Kv))
-     
-    assert q.ndim == 3
-    if dtype is None:
-        dtype = torch.float32
-    q = q.to(dtype=dtype)
-    k = k.to(dtype=dtype)
-    v = v.to(dtype=dtype)
-
-    scale = scale if scale is not None else (q.shape[-1] ** -0.5)
+            dim=2,
+        )
+    if q.ndim == 4:
+        assert p == 0.0
+        return ref_attention_bmhk(q, k, v, scale=scale, attn_bias=attn_bias)
+    q = q.float()
+    k = k.float()
+    v = v.float()
+
+    scale = scale if scale is not None else (1 / q.shape[-1] ** 0.5)
     q = q * scale
 
     attn = q @ k.transpose(-2, -1)
@@ -260,23 +255,23 @@ def attn_bias_head(head: int):
             attn_bias_tensor = attn_bias.materialize(
                 (q.shape[0], 1, q.shape[1], k.shape[1]),
                 device=q.device,
-                dtype=dtype,
+                dtype=torch.float32,
             )
         else:
-            attn_bias_tensor = attn_bias.to(dtype=dtype)
+            attn_bias_tensor = attn_bias
         if attn_bias_tensor.ndim == 4:
             assert q.shape[0] == attn_bias_tensor.shape[0] * attn_bias_tensor.shape[1]
             attn_bias_tensor = attn_bias_tensor.reshape(
                 [-1, *attn_bias_tensor.shape[2:]]
             )
-        attn = attn + attn_bias_tensor
+        attn = attn + attn_bias_tensor.float()
     attn = attn.softmax(-1)
     if drop_mask is not None:
         attn = attn * (drop_mask / (1 - p))
     return attn @ v
 
 
-def ref_attention_bmhk(q, k, v, attn_bias, scale=None, dtype=None) -> torch.Tensor:
+def ref_attention_bmhk(q, k, v, attn_bias, scale=None) -> torch.Tensor:
     assert q.ndim == 4
 
     def T(t):
@@ -290,50 +285,11 @@ def T(t):
             device=q.device,
             dtype=torch.float32,
         ).reshape([q.shape[0] * q.shape[2], q.shape[1], k.shape[1]])
-    out = ref_attention(T(q), T(k), T(v), attn_bias, scale=scale, dtype=dtype)
+    out = ref_attention(T(q), T(k), T(v), attn_bias, scale=scale)
     out = out.reshape([q.shape[0], q.shape[2], q.shape[1], v.shape[3]])
     return out.permute((0, 2, 1, 3))
 
 
-def _rand_seqlens(
-    r: random.Random,
-    bs: int,
-    q_len: int,
-    kv_len: int,
-    more_keys_than_queries_per_block: bool,
-) -> Tuple[Sequence[int], Sequence[int]]:
-    """
-    Generates lists of lengths of query blocks and corresponding key blocks.
-    The total number of queries will be bs * q_len and the
-    total number of keys will be bs * kv_len.
-    """
-    if more_keys_than_queries_per_block:
-        assert kv_len >= q_len
-    q_len *= bs
-    kv_len *= bs
-    seqlens_q: List[int] = []
-    seqlens_k: List[int] = []
-
-    step_q = [max(1, q_len // 10), max(2, q_len // 2)]
-    step_k = [max(1, kv_len // 10), max(2, kv_len // 2)]
-    while sum(seqlens_q) < q_len and sum(seqlens_k) < kv_len:
-        num_queries = r.randrange(*step_q)
-        seqlens_q.append(num_queries)
-
-        if more_keys_than_queries_per_block:
-            # Must select at least `num_queries` keys
-            # But also leave enough keys for later
-            keys_left = kv_len - sum(seqlens_k, 0)
-            queries_left = q_len - sum(seqlens_q[:-1], 0)
-            assert keys_left >= queries_left
-            seqlens_k.append(num_queries + r.randrange(0, keys_left - queries_left))
-        else:
-            seqlens_k.append(r.randrange(*step_k))
-    seqlens_q[-1] = q_len - sum(seqlens_q[:-1])
-    seqlens_k[-1] = kv_len - sum(seqlens_k[:-1])
-    return seqlens_q, seqlens_k
-
-
 def _rand_partition(r: random.Random, total: int, n: int) -> List[int]:
     # returns list of n nonnegative integers summing to total
     idx = {0, total}
@@ -343,158 +299,6 @@ def _rand_partition(r: random.Random, total: int, n: int) -> List[int]:
     return [e - b for b, e in zip(s[:-1], s[1:])]
 
 
-def _rand_maxed_partition(
-    r: random.Random, total: int, n: int, mx: int, positive: bool = True
-) -> List[int]:
-    # returns list of n nonnegative integers less than mx summing to total
-    # NB: This is unfortunately biased towards evenly-split bins.
-    # If `positive`, outputs are positive
-    if positive:
-        total -= n
-        mx -= 1
-    idxs = r.sample(range(n * mx), total)
-    y = torch.zeros(n, mx, dtype=torch.int32)
-    y.flatten()[idxs] = 1
-    z = y.sum(1)
-    if positive:
-        z += 1
-    return z.tolist()
-
-
-def _rand_seqlens_padded_k(
-    r: random.Random, bs: int, q_len: int, kv_len: int
-) -> Tuple[Sequence[int], Sequence[int]]:
-    # This is for BlockDiagonalCausalWithOffsetPaddedKeysMask.
-    # we need q_seqlens and k_seqlens to be of len bsz.
-    # For each "batch element" there must be more keys than queries
-    # because this bias type is "bottom right" and so any extra queries
-    # will attend to nothing and have undefined result.
-    # In addition every element of k_seqlens must be <= kv_len
-    if q_len > kv_len:
-        raise ValueError("need more keys than values")
-    if q_len == kv_len:
-        # all key slots are needed so we cannot have padding
-        q_seqlens = k_seqlens = [kv_len] * bs
-    else:
-        q_seqlens = _rand_maxed_partition(r, q_len * bs, bs, kv_len)
-        k_seqlens = [r.randint(i, kv_len) for i in q_seqlens]
-    return q_seqlens, k_seqlens
-
-
-def _create_aligned_bias(B: int, H: int, Mq: int, Mkv: int, **kwargs) -> torch.Tensor:
-    align_to = 8
-    return (
-        torch.randn(
-            (
-                B,
-                H,
-                Mq,
-                align_to * ((Mkv + align_to - 1) // align_to),
-            ),
-            **kwargs,
-        )
-        * 3
-    )[:, :, :, :Mkv]
-
-
-def create_attn_bias(
-    bias_type,
-    batch_size: int,
-    num_heads: int,
-    q_len: int,
-    kv_len: int,
-    device,
-    dtype,
-    requires_grad: bool,
-    fmt: str,
-    op: Type[AttentionOpBase],
-):
-    if bias_type is None or isinstance(None, bias_type):
-        return None
-    r = random.Random("-".join(map(str, [batch_size, q_len, kv_len, dtype, fmt])))
-    if bias_type is torch.Tensor:
-        if fmt == "BMK":
-            batch_size *= num_heads
-            num_heads = 1
-        # `small_k` only supports an expanded 1d bias
-        if op in [fmha.small_k.FwOp, fmha.small_k.BwOp]:
-            attn_bias = (
-                torch.randn(
-                    (batch_size, num_heads, 1, kv_len), device=device, dtype=dtype
-                )
-                * 3
-            )
-            attn_bias = attn_bias.expand(batch_size, num_heads, q_len, kv_len)
-        else:
-            attn_bias = _create_aligned_bias(
-                batch_size,
-                num_heads,
-                q_len,
-                kv_len,
-                device=device,
-                dtype=dtype,
-            )
-            # ToDo: need a fix in ck-flashAttn to avoid divided-by-zero when all-(-inf) occurred
-            #       with the data read by one-thread
-            # make sure it also works if the first columns are partially masked out
-            ## attn_bias[0, 0, q_len - 1 :, : num_heads - 2] = -math.inf
-
-        if requires_grad:
-            attn_bias.requires_grad_(True)
-        if fmt == "BMK":
-            attn_bias = attn_bias[:, 0]
-        return attn_bias
-    if bias_type is fmha.attn_bias.LowerTriangularMask:
-        return fmha.attn_bias.LowerTriangularMask()
-    if bias_type is fmha.attn_bias.LowerTriangularMaskWithTensorBias:
-        attn_bias = _create_aligned_bias(
-            batch_size,
-            num_heads,
-            q_len,
-            kv_len,
-            device=device,
-            dtype=dtype,
-        )
-        if requires_grad:
-            attn_bias.requires_grad_(True)
-        return fmha.attn_bias.LowerTriangularMaskWithTensorBias(attn_bias)
-    if bias_type in [
-        fmha.attn_bias.BlockDiagonalMask,
-        fmha.attn_bias.BlockDiagonalCausalMask,
-        fmha.attn_bias.BlockDiagonalCausalFromBottomRightMask,
-    ]:
-        # This bias is not supported in BMK format
-        assert fmt == "BMHK"
-        block_diag = fmha.attn_bias.BlockDiagonalMask.from_seqlens(
-            *_rand_seqlens(
-                r,
-                batch_size,
-                q_len,
-                kv_len,
-                more_keys_than_queries_per_block=bias_type
-                is fmha.attn_bias.BlockDiagonalCausalFromBottomRightMask,
-            )
-        )
-        if bias_type is fmha.attn_bias.BlockDiagonalCausalMask:
-            block_diag = block_diag.make_causal()
-        if bias_type is fmha.attn_bias.BlockDiagonalCausalFromBottomRightMask:
-            block_diag = block_diag.make_causal_from_bottomright()
-        return block_diag
-    if bias_type == fmha.attn_bias.BlockDiagonalCausalWithOffsetPaddedKeysMask:
-        assert fmt == "BMHK"
-        q, k = _rand_seqlens_padded_k(r, batch_size, q_len, kv_len)
-        g_block_diag = (
-            fmha.attn_bias.BlockDiagonalCausalWithOffsetPaddedKeysMask.from_seqlens(
-                q_seqlen=q,
-                kv_padding=kv_len,
-                kv_seqlen=k,
-            )
-        )
-        return g_block_diag
-
-    assert False, f"Unsupported bias type: {bias_type}"
-
-
 def get_bias_grad(attn_bias, clear: bool = False) -> Optional[torch.Tensor]:
     tensor_with_grad: Optional[torch.Tensor] = None
     if isinstance(attn_bias, torch.Tensor):
@@ -523,18 +327,46 @@ def create_tensors(
     *,
     attn_bias_requires_grad: bool = False,
     fmt: str = "BMK",
+    g: int = 1,
 ):
     torch.manual_seed(B * q_len + kv_len * k + kv)
+
+    mask_is_bottom_right = attn_bias_type is not None and issubclass(
+        attn_bias_type,
+        (
+            fmha.attn_bias.LowerTriangularFromBottomRightMask,
+            fmha.attn_bias.LowerTriangularFromBottomRightLocalAttentionMask,
+            fmha.attn_bias.BlockDiagonalCausalFromBottomRightMask,
+            fmha.attn_bias.BlockDiagonalCausalLocalAttentionFromBottomRightMask,
+            fmha.attn_bias.BlockDiagonalCausalLocalAttentionMask,
+            fmha.attn_bias.LocalAttentionFromBottomRightMask,
+        ),
+    )
+    if mask_is_bottom_right and q_len > kv_len:
+        # Bottom-right attention and local-attention masks require q_len <= kv_len
+        kv_len = q_len
     scale = 3
     if fmt == "BMK":
-        query = torch.randn((B * h, q_len, k), device=device, dtype=dtype).mul_(scale)
-        key = torch.randn((B * h, kv_len, k), device=device, dtype=dtype).mul_(scale)
-        value = torch.randn((B * h, kv_len, kv), device=device, dtype=dtype).mul_(scale)
+        query = torch.randn((B * h, q_len, k), device=device, dtype=dtype)
+        key = torch.randn((B * h, kv_len, k), device=device, dtype=dtype)
+        value = torch.randn((B * h, kv_len, kv), device=device, dtype=dtype)
+    elif fmt == "BMHK":
+        query = torch.randn((B, q_len, h, k), device=device, dtype=dtype)
+        key = torch.randn((B, kv_len, h, k), device=device, dtype=dtype)
+        value = torch.randn((B, kv_len, h, kv), device=device, dtype=dtype)
     else:
-        assert fmt == "BMHK"
-        query = torch.randn((B, q_len, h, k), device=device, dtype=dtype).mul_(scale)
-        key = torch.randn((B, kv_len, h, k), device=device, dtype=dtype).mul_(scale)
-        value = torch.randn((B, kv_len, h, kv), device=device, dtype=dtype).mul_(scale)
+        assert fmt == "BMGHK"
+        query = torch.randn((B, q_len, g, h, k), device=device, dtype=dtype)
+        key = torch.randn((B, kv_len, g, 1, k), device=device, dtype=dtype)
+        value = torch.randn((B, kv_len, g, 1, kv), device=device, dtype=dtype)
+
+    for x in [query, key, value]:
+        x.mul_(scale)
+
+    if fmt == "BMGHK":
+        # Expand - after the in-place mul
+        key = key.expand((B, kv_len, g, h, k))
+        value = value.expand((B, kv_len, g, h, k))
 
     if fmt == "BMK" and not fmha.common._is_bias_type_supported_in_BMK(attn_bias_type):
         attn_bias_type = None
@@ -544,6 +376,7 @@ def create_tensors(
             attn_bias_type,
             batch_size=B,
             num_heads=h,
+            num_heads_groups=g,
             q_len=q_len,
             kv_len=kv_len,
             dtype=dtype,
@@ -590,11 +423,7 @@ def bmk2bmhk(tensor, num_heads: int) -> torch.Tensor:
 @pytest.mark.parametrize("fmt", ["BMK", "BMHK"])
 @pytest.mark.parametrize("packed", [False, True])
 @parametrize_opFW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv
-def test_forward(
-    opFW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv,
-    packed,
-    fmt,
-):
+def test_forward(opFW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv, packed, fmt, **kwargs):
     (
         op,
         device,
@@ -618,12 +447,13 @@ def test_forward(
         pytest.skip(
             f"packed incompatible with `k ({k}) != kv ({kv})` or `q_len ({q_len}) != kv_len ({kv_len})`"
         )
-
     if fmt == "BMK" and not fmha.common._is_bias_type_supported_in_BMK(bias_type):
         pytest.skip("BMK incompatible with this bias")
 
     query, key, value, attn_bias = create_tensors(
-        *opFW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv, fmt="BMHK" if packed else fmt
+        *opFW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv,
+        fmt="BMHK" if packed else fmt,
+        **kwargs,
     )
 
     if packed:
@@ -637,6 +467,7 @@ def test_forward(
                 bias_type=bias_type,
                 batch_size=batch_size,
                 num_heads=h,
+                num_heads_groups=1,
                 q_len=q_len,
                 kv_len=kv_len,
                 device=device,
@@ -645,9 +476,11 @@ def test_forward(
                 fmt=fmt,
                 op=op,
             )
-        else:
+        elif fmt == "BMHK":
             # bm3hk -> 3 x bmhk
             query, key, value = xformers.ops.unbind(c, 2)
+        else:
+            assert False, f"Unsupport fmt {fmt} with packing"
         assert not query.is_contiguous()
 
     out = xformers.ops.memory_efficient_attention_forward(
@@ -671,84 +504,1524 @@ def test_forward(
         rtol=op.ERROR_RTOL.get(dtype, 1e-5),
     )
 
-@pytest.mark.parametrize("hdim_k,hdim_v", [(64, 64), (128, 128)])
-@pytest.mark.parametrize("nhead_q,nhead_kv", [(8, 1), (8, 2), (12, 4), (4, 4)])
-@pytest.mark.parametrize("seqlen_q,seqlen_kv", [(100, 128), (128, 100), (200, 1000), (400, 300)])
-@pytest.mark.parametrize("batches", [100, 64, 1])
-@pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16])
-@pytest.mark.parametrize("attn_bias_type", [type(None), torch.Tensor, fmha.attn_bias.LowerTriangularMask])
-@pytest.mark.parametrize("op", [fmha.ck.FwOp])
-def test_mqa_forward(
-    op,
-    attn_bias_type,
-    dtype, 
-    batches: int, 
-    seqlen_kv: int, 
-    seqlen_q: int, 
-    nhead_kv: int, 
-    nhead_q: int, 
-    hdim_v: int, 
-    hdim_k: int, 
+
+@cuda_only
+@pytest.mark.parametrize("k_len", [5, 6, 32])
+@pytest.mark.parametrize("batch_size", [1, 4])
+@pytest.mark.parametrize("kv_len", [128, 512])
+@pytest.mark.parametrize("q_len", [128, 512])
+@pytest.mark.parametrize("dtype", _types)
+def test_key_query_all_ones(dtype, q_len, kv_len, batch_size, k_len):
+    device = "cuda"
+    scale = 3
+    query = torch.ones((batch_size, q_len, k_len), device=device, dtype=dtype)
+    key = torch.ones((batch_size, kv_len, k_len), device=device, dtype=dtype)
+    value = torch.randn((batch_size, kv_len, k_len), device=device, dtype=dtype) * scale
+
+    out = xformers.ops.memory_efficient_attention(query, key, value, op=(fmha.ck.FwOp, None))
+    # this should be equivalent to the average over value
+    ref = value.mean(1, keepdim=True).expand_as(query)
+
+    if dtype is torch.float16:
+        assert_allclose(out, ref, atol=1e-5)
+    else:
+        assert_allclose(out, ref, atol=1e-2)
+
+def _block_diag_reshape_lse(
+    lse: torch.Tensor, q_seqinfo: fmha.attn_bias._SeqLenInfo
+) -> torch.Tensor:
+    """LSE can be padded, let's remove the padding"""
+    parts = []
+    for slice, (start, end) in zip(lse.unbind(0), q_seqinfo.intervals()):
+        parts.append(slice[:, : end - start])
+    return torch.cat(parts, dim=1).unsqueeze(1)
+
+
+@parametrize_opFW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv
+def test_logsumexp(opFW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv):
+    (
+        op,
+        device,
+        dtype,
+        bias_type,
+        batch_size,
+        q_len,
+        kv_len,
+        h,
+        k,
+        kv,
+    ) = opFW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv
+    query, key, value, attn_bias = create_tensors(
+        *opFW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv, fmt="BMK"
+    )
+
+    _out, lse = xformers.ops.memory_efficient_attention_forward_requires_grad(
+        query,
+        key,
+        value,
+        op=op,
+        attn_bias=attn_bias,
+    )
+    attn = (query.float() / k**0.5) @ key.float().transpose(-2, -1)
+    if attn_bias is not None:
+        if isinstance(attn_bias, xformers.ops.AttentionBias):
+            tensor_bias = attn_bias.materialize(
+                (query.shape[0], 1, query.shape[1], key.shape[1]),
+                device=query.device,
+                dtype=torch.float32,
+            )
+        else:
+            assert isinstance(attn_bias, torch.Tensor)
+            tensor_bias = attn_bias
+        if tensor_bias.ndim == 4:
+            tensor_bias = tensor_bias.reshape([-1, *tensor_bias.shape[2:]])
+        attn = attn + tensor_bias.float()
+    ref_lse = attn.logsumexp(-1)
+    if isinstance(attn_bias, fmha.attn_bias.BlockDiagonalMask):
+        lse = _block_diag_reshape_lse(lse, attn_bias.q_seqinfo)
+    assert_allclose(lse[:, 0, : ref_lse.shape[1]], ref_lse, atol=2e-4)
+
+
+@cuda_only
+@pytest.mark.parametrize("op", [fmha.cutlass.FwOp, fmha.flash.FwOp])
+def test_logsumexp_mqa(op):
+    if not op.is_available():
+        pytest.skip("not available")
+
+    dtype = torch.float16
+    s = 3
+    query = torch.randn([1, 1, 32, 128], dtype=dtype, device="cuda") * s
+    key = (torch.randn([1, 16, 1, 128], dtype=dtype, device="cuda") * s).expand(
+        -1, -1, 32, -1
+    )
+    value = (torch.randn([1, 16, 1, 128], dtype=dtype, device="cuda") * s).expand(
+        -1, -1, 32, -1
+    )
+    assert key.stride(2) == 0
+
+    _, lse = xformers.ops.memory_efficient_attention_forward_requires_grad(
+        query,
+        key,
+        value,
+        op=op,
+    )
+    query, key, value = [x[0].transpose(0, 1) for x in [query, key, value]]
+    attn = (query.float() / query.shape[-1] ** 0.5) @ key.float().transpose(-2, -1)
+    ref_lse = attn.logsumexp(-1)
+    assert_allclose(lse[0, :, 0], ref_lse[:, 0], atol=2e-4)
+
+
+@pytest.mark.parametrize("fmt", ["BMK", "BMHK"])
+@pytest.mark.parametrize("grad_out_contiguous", [False, True])
+@parametrize_opBW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv
+def test_backward(
+    opBW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv,
+    grad_out_contiguous,
+    fmt,
 ):
-    B = batches
-    M = seqlen_q
-    N = seqlen_kv
-    Hq = nhead_q
-    Hkv = nhead_kv
-    K = hdim_k
-    Kv = hdim_v
+    (
+        op_bw,
+        device,
+        dtype,
+        bias_type,
+        batch_size,
+        q_len,
+        kv_len,
+        h,
+        k,
+        kv,
+    ) = opBW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv
 
-    print("Hq=", Hq, "Hkv=", Hkv)
+    ## ToDo: reopen bfloat16 for testing
+    if dtype is torch.bfloat16:
+        pytest.skip("Temporarily disabled bfloat16 as we are still improving the accuracy of the results")
 
-    device = torch.device("cuda")
+    if k > 128 or kv > 128:
+        pytest.skip("head-dim length bigger than 128 is not supported by CK-FlashAttention")
 
-    if not (K == Kv and (Kv == 64 or Kv == 128)):
-        pytest.skip("only head-dim size 64 or 128 supported by ck-tiled!")
+    if k % 2 != 0:
+        pytest.skip("head-dim length must be an even value for CK-FlashAttention")    
 
-    if Kv > 128:
-        pytest.skip("kv > 128 is not supported by CK-FlashAttention")
+    if grad_out_contiguous is False:
+        pytest.skip("CK-FlashAttention requires grad_out and out have same lengths/strides")
 
-    scale = 3
-    query = torch.randn((B, M, Hq, K), device=device, dtype=dtype).mul_(scale)
-    key = torch.randn((B, N, Hkv, K), device=device, dtype=dtype).mul_(scale)
-    value = torch.randn((B, N, Hkv, Kv), device=device, dtype=dtype).mul_(scale)
+    attn_bias_requires_grad = (
+        random.Random(q_len + kv_len * batch_size).randint(0, 1) > 0
+    )
+    query, key, value, attn_bias = create_tensors(
+        *opBW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv,
+        attn_bias_requires_grad=attn_bias_requires_grad,
+        fmt=fmt,
+    )
 
-    attn_bias = None
-    if attn_bias_type is not None:
-        attn_bias = create_attn_bias(
-            attn_bias_type,
-            batch_size=B,
-            num_heads=Hq,
-            q_len=M,
-            kv_len=N,
-            dtype=dtype,
-            device=device,
-            requires_grad=False,
-            fmt="BMHK",
-            op=op,
+    # To understand why we do this, check the comment on the
+    # `AttentionBwOpBase` class
+    scale = None
+    if op_bw.SUPPORTS_CUSTOM_SCALE and query.shape[-1] < 32:
+        scale = (1 / 32) ** 0.5
+    op_fw = (
+        sample_random_supported_fw(
+            fmha.Inputs(query=query, key=key, value=value, attn_bias=attn_bias),
+            seed=q_len * kv + kv_len * k,
         )
+        if op_bw != fmha.ck.BwOp
+        else fmha.ck.FwOp
+    )
+    qkv = None
+
+    if (
+        fmt == "BMHK"
+        and query.shape[3] == value.shape[3]
+        and query.shape[1] == value.shape[1]
+    ):
+        qkv = torch.stack([query, key, value], 2)
+        qkv.requires_grad_(True)
+        # bm3hk -> 3 x bmhk
+        query, key, value = xformers.ops.unbind(qkv, 2)
+        assert not query.is_contiguous()
 
-    inputs = fmha.Inputs(query=query, key=key, value=value, attn_bias=attn_bias)
-    reasons = op.not_supported_reasons(inputs)
-    if reasons:
-        err_msg = f"{op.NAME}: unsupported ({'/'.join(reasons)})"
-        # Ensure we free memory to avoid OOMs
-        del query, key, value, attn_bias, inputs
+    query.requires_grad_(True)
+    key.requires_grad_(True)
+    value.requires_grad_(True)
 
-    out = xformers.ops.memory_efficient_attention_forward(
-        query, key, value, attn_bias, op=op
+    if not op_bw.supports(fmha.Inputs(query, key, value, attn_bias)):
+        pytest.skip("inputs not supported")
+
+    out = xformers.ops.memory_efficient_attention(
+        query, key, value, attn_bias, scale=scale, op=(op_fw, op_bw)
     )
-    assert not out.isnan().any(), ("Output has NaNs", attn_bias)
-    out2 = xformers.ops.memory_efficient_attention_forward(
-        query, key, value, attn_bias, op=op
+
+    grad_out = torch.randn_like(out)
+    if grad_out_contiguous is False:
+        grad_out = torch.tensor([1.0], dtype=query.dtype, device=device)[
+            None, None, :
+        ].expand_as(out)
+
+    out.backward(grad_out)
+
+    if qkv is None and op_bw == fmha.cutlass.BwOp:
+        assert query.stride() == query.grad.stride()
+
+    grads = []
+    if qkv is None:
+        grads = [query.grad, key.grad, value.grad]
+        query.grad = None
+        key.grad = None
+        value.grad = None
+    else:
+        grads = [qkv.grad]
+        qkv.grad = None
+    if attn_bias_requires_grad:
+        attn_bias_grad = get_bias_grad(attn_bias, clear=True)
+        if attn_bias_grad is not None:
+            grads.append(attn_bias_grad)
+
+    ref = ref_attention(query, key, value, attn_bias, scale=scale)
+    ref.backward(grad_out)
+
+    assert_allclose(
+        out.float(),
+        ref.float(),
+        "fw pass",
+        atol=op_fw.ERROR_ATOL[dtype],
+        rtol=op_fw.ERROR_RTOL[dtype],
     )
-    assert torch.allclose(out, out2, atol=0.0, rtol=0.0), (
-        "Non-deterministic behavior",
-        attn_bias,
+
+    del out
+    del grad_out
+    del ref
+
+    atol = op_bw.ERROR_ATOL[dtype]
+    rtol = op_bw.ERROR_RTOL[dtype]
+
+    grads_ref = []
+    grads_name = []
+    if qkv is None:
+        assert isinstance(query.grad, torch.Tensor)
+        assert isinstance(key.grad, torch.Tensor)
+        assert isinstance(value.grad, torch.Tensor)
+        grads_ref = [query.grad, key.grad, value.grad]
+        grads_name = ["query", "key", "value"]
+    else:
+        assert isinstance(qkv.grad, torch.Tensor)
+        grads_ref = [qkv.grad]
+        grads_name = ["qkv"]
+
+    if attn_bias_requires_grad:
+        attn_bias_grad = get_bias_grad(attn_bias)
+        if attn_bias_grad is not None:
+            grads_ref.append(attn_bias.grad)
+            grads_name.append("bias")
+
+    del query
+    del key
+    del value
+    del qkv
+
+    assert len(grads_ref) == len(
+        grads
+    ), "Wrong number of gradients (maybe bias grad didn't backprop?)"
+    for name, calc_grad, ref_grad in zip(grads_name, grads, grads_ref):
+        assert_allclose(
+            calc_grad,
+            ref_grad,
+            msg=f"{op_fw.NAME}+{op_bw.NAME}:{name}",
+            atol=atol,
+            rtol=rtol,
+        )
+
+
+def _vec_binom_test(x, n, p):
+    """
+    vectorized implementation of scipy.stats.binom_test
+    this makes our tests much faster
+    reference: https://github.com/scipy/scipy/blob/v1.8.0/scipy/stats/_morestats.py#L2609-L2702
+    """
+    import numpy as np
+    from scipy.stats import distributions
+
+    x = np.atleast_1d(x)
+    d = distributions.binom.pmf(x, n, p)[:, None]
+    rerr = 1 + 1e-7
+    # x < p * n case
+    i = np.arange(np.ceil(p * n), n + 1)
+    y = np.sum(distributions.binom.pmf(i, n, p) <= d * rerr, axis=1)
+    pval1 = distributions.binom.cdf(x, n, p) + distributions.binom.sf(n - y, n, p)
+
+    # other case
+    i = np.arange(np.floor(p * n) + 1)
+    y = np.sum(distributions.binom.pmf(i, n, p) <= d * rerr, axis=1)
+    pval2 = distributions.binom.cdf(y - 1, n, p) + distributions.binom.sf(x - 1, n, p)
+
+    pval = np.where(x < p * n, pval1, pval2)
+    pval = np.minimum(1.0, pval)
+    return pval
+
+def _get_drop_mask(op, batch_size, q_len, kv_len, p, device):
+    if op == fmha.ck.FwOp:
+        mask = torch.empty((batch_size, 1, q_len, kv_len), device=device)
+        ## rand_uniform is an int32 tensor
+        rand_uniform = torch.ops.xformers._ck_rand_uniform(p, mask)
+        ##mask = (rand_uniform <= int((1.0-p)*65535.0)).to(torch.float32)
+        mask = (rand_uniform <= int((1.0-p)*255.0)).to(torch.float32)
+        mask = mask.reshape(batch_size, q_len, kv_len)
+    else:
+        mask = torch.empty((batch_size, q_len, kv_len), device=device)
+        mask = torch.ops.xformers._temp_dropout(mask, p)
+
+    return mask
+
+@cuda_only
+@pytest.mark.parametrize("attn_bias", [None, fmha.attn_bias.LowerTriangularMask()])
+@pytest.mark.parametrize("seed", [42, 124])
+@pytest.mark.parametrize("p", [0.3, 0.7])
+@pytest.mark.parametrize("k_len", [32])
+@pytest.mark.parametrize("batch_size", [1, 2])
+@pytest.mark.parametrize("kv_len", [3, 15, 32, 33, 65])
+@pytest.mark.parametrize("q_len", [2, 33])
+@pytest.mark.parametrize("op", ALL_FW_OPS, ids=list(map(lambda t: t.NAME, ALL_FW_OPS)))
+@pytest.mark.parametrize("dtype", [torch.half, torch.bfloat16])
+def test_dropout(dtype, op, q_len, kv_len, batch_size, k_len, p, seed, attn_bias):
+    device = "cuda"
+    scale = 0.05 
+    query = torch.randn((batch_size, q_len, k_len), device=device, dtype=dtype) * scale
+    key = torch.randn((batch_size, kv_len, k_len), device=device, dtype=dtype) * scale
+    value = torch.randn((batch_size, kv_len, k_len), device=device, dtype=dtype) * scale
+
+    inputs_for_support_check = fmha.Inputs(query, key, value, attn_bias, p, None)
+    if not op.supports(inputs_for_support_check):
+        del query, key, value, attn_bias
+        pytest.skip(f"{op.NAME}: unsupported input")
+
+    torch.manual_seed(seed)
+    out = xformers.ops.memory_efficient_attention(
+        query, key, value, attn_bias, p, op=(op, None)
     )
 
+    torch.manual_seed(seed)
+    out2 = xformers.ops.memory_efficient_attention(
+        query, key, value, attn_bias, p, op=(op, None)
+    )
+
+    assert_allclose(out, out2, "dropout reproducibility")
+
+    torch.manual_seed(seed)
+    mask = _get_drop_mask(op, batch_size, q_len, kv_len, p, device)
+    ref = ref_attention(query, key, value, attn_bias, mask, p)
+    assert_allclose(out.float(), ref, atol=3e-3, rtol=5e-4), f"{(out - ref).abs().max()}"
+
+    num_trials = 1000
+    p_val_tol = 1e-6
+    keep_prob = 1 - p
+    masks = []
+    for i in range(num_trials):
+        mask = _get_drop_mask(op, batch_size, q_len, kv_len, p, device)
+        masks.append(mask.clone().cpu())
+    masks = torch.stack(masks, dim=0)
+    p_value = binomtest(int(masks.sum()), masks.numel(), p=keep_prob).pvalue
+    assert p_value > p_val_tol, p_value
+    masks = masks.sum(0).flatten()
+    p_values = _vec_binom_test(masks, num_trials, p=keep_prob)
+    assert all(p_values > p_val_tol)
+
+
+def _test_dropout_backward(q_len, kv_len, batch_size, k, p, op, dtype):
+    if dtype is torch.bfloat16 and compute_capability < (8, 0):
+        pytest.skip("bf16 requires Sm80")
+    if not op.is_available():
+        pytest.skip()
+
+    scale = 3 
+    device = "cuda"
+    query = torch.randn((batch_size, q_len, k), device=device, dtype=dtype) * scale
+    key = torch.randn((batch_size, kv_len, k), device=device, dtype=dtype) * scale
+    value = torch.randn((batch_size, kv_len, k), device=device, dtype=dtype) * scale
+
+    query.requires_grad_(True)
+    key.requires_grad_(True)
+    value.requires_grad_(True)
+
+    grad_out = torch.ones_like(query)
+
+    assert op.supports(fmha.Inputs(query=query, key=key, value=value, p=p))
+
+    seed = 42
+    torch.manual_seed(seed)
+    out = xformers.ops.memory_efficient_attention(query, key, value, p=p, op=(op, None))
+
+    out.backward(grad_out)
+
+    grad_q = query.grad
+    grad_k = key.grad
+    grad_v = value.grad
+
+    query.grad = None
+    key.grad = None
+    value.grad = None
+
+    torch.manual_seed(seed)
+    mask = _get_drop_mask(op, batch_size, q_len, kv_len, p, device)
+
+    ref = ref_attention(query, key, value, None, mask, p)
+    ref.backward(grad_out)
+
+    atol, rtol = (
+        fmha.AttentionBwOpBase.ERROR_ATOL[dtype],
+        fmha.AttentionBwOpBase.ERROR_RTOL[dtype],
+    )
+    assert_allclose(
+        grad_v,
+        value.grad,
+        "grad_v",
+        atol=atol,
+        rtol=rtol,
+    )
+    # TODO: Investigate why precision is worse
+    if dtype in [torch.float16, torch.bfloat16]:
+        atol = atol * 2 + 0.15
+        rtol = rtol * 2
+    assert_allclose(
+        grad_q,
+        query.grad,
+        "grad_q",
+        atol=atol,
+        rtol=rtol,
+    )
+    assert_allclose(
+        grad_k,
+        key.grad,
+        "grad_k",
+        atol=atol,
+        rtol=rtol,
+    )
+
+
+@cuda_only
+@pytest.mark.parametrize("p", [0.3, 0.7])
+@pytest.mark.parametrize("k", [5, 6, 32])
+@pytest.mark.parametrize("batch_size", [1, 2])
+@pytest.mark.parametrize("kv_len", [3, 15, 32, 33])
+@pytest.mark.parametrize("q_len", [2, 33])
+def test_dropout_backward_small_k(q_len, kv_len, batch_size, k, p):
+    _test_dropout_backward(
+        q_len, kv_len, batch_size, k, p, op=fmha.small_k.FwOp, dtype=torch.float32
+    )
+
+
+@cuda_only
+@pytest.mark.parametrize("p", [0.000001, 0.3, 0.7])
+@pytest.mark.parametrize("k", [16, 128, 256])
+@pytest.mark.parametrize("batch_size", [1, 2])
+@pytest.mark.parametrize("kv_len", [3, 248, 256])
+@pytest.mark.parametrize("q_len", [3, 248, 256])
+@pytest.mark.parametrize("dt", ["f16", "bf16", "f32"])
+def test_dropout_backward_cutlass(dt, q_len, kv_len, batch_size, k, p):
+    _test_dropout_backward(
+        q_len,
+        kv_len,
+        batch_size,
+        k,
+        p,
+        op=fmha.cutlass.FwOp,
+        dtype={"f16": torch.float16, "bf16": torch.bfloat16, "f32": torch.float32}[dt],
+    )
+
+
+@cuda_only
+@pytest.mark.parametrize("k_len", [32])
+@pytest.mark.parametrize("batch_size", [1])
+@pytest.mark.parametrize("kv_len", [3 * 32])
+@pytest.mark.parametrize("q_len", [3 * 32])
+def test_memory_efficient_attention_full_block_masked(q_len, kv_len, batch_size, k_len):
+    device = "cuda"
+    op_fw = fmha.small_k.FwOp
+    op_bw = fmha.small_k.BwOp
+
+    scale = 3
+    query = torch.randn((batch_size, q_len, k_len), device=device) * scale
+    key = torch.randn((batch_size, kv_len, k_len), device=device) * scale
+    value = torch.randn((batch_size, kv_len, k_len), device=device) * scale
+
+    # in this case, most of the blocks in a row get masked
+    attn_bias = torch.full((3, 32), float("-inf"), device=device)
+    attn_bias[:2, :4] = 0
+    attn_bias = attn_bias.flatten()[None, None, :].expand(1, q_len, -1)
+
+    out = xformers.ops.memory_efficient_attention(
+        query, key, value, attn_bias, op=(op_fw, op_bw)
+    )
     ref = ref_attention(query, key, value, attn_bias)
+
+    assert_allclose(
+        out, ref, atol=op_fw.ERROR_ATOL[query.dtype], rtol=op_fw.ERROR_RTOL[query.dtype]
+    )
+
+    query.requires_grad_(True)
+    key.requires_grad_(True)
+    value.requires_grad_(True)
+
+    grad_out = torch.ones_like(query)
+
+    out = xformers.ops.memory_efficient_attention(query, key, value, attn_bias)
+    out.backward(grad_out)
+
+    grad_q = query.grad
+    grad_k = key.grad
+    grad_v = value.grad
+
+    query.grad = None
+    key.grad = None
+    value.grad = None
+
+    ref = ref_attention(query, key, value, attn_bias)
+    ref.backward(grad_out)
+
+    atol = op_bw.ERROR_ATOL[query.dtype]
+    rtol = op_bw.ERROR_RTOL[query.dtype]
+    assert_allclose(grad_q, query.grad, "grad_q", atol=atol, rtol=rtol)
+    assert_allclose(grad_k, key.grad, "grad_k", atol=atol, rtol=rtol)
+    assert_allclose(grad_v, value.grad, "grad_v", atol=atol, rtol=rtol)
+
+
+@pytest.mark.parametrize("fmt", ["BMK", "BMHK"])
+@parametrize_opBW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv__xs
+def test_lowlevel_api_shapes(opBW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv, fmt):
+    query, key, value, attn_bias = create_tensors(
+        *opBW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv, fmt=fmt
+    )
+    grad_out = torch.ones_like(query)
+    query.requires_grad_(True)
+    key.requires_grad_(True)
+    value.requires_grad_(True)
+
+    out, lse = xformers.ops.memory_efficient_attention_forward_requires_grad(
+        query, key, value, attn_bias
+    )
+    assert out.ndim == query.ndim
+    dq, dk, dv = xformers.ops.memory_efficient_attention_backward(
+        grad_out, out, lse, query, key, value, attn_bias
+    )
+    assert dq.shape == query.shape
+    assert dk.shape == key.shape
+    assert dv.shape == value.shape
+
+
+@parametrize_opFW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv__xs
+def test_cuda_streams(
+    opFW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv,
+):
+    (
+        op,
+        device,
+        dtype,
+        bias_type,
+        batch_size,
+        q_len,
+        kv_len,
+        h,
+        k,
+        kv,
+    ) = opFW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv
+    if device != "cuda":
+        pytest.skip("Not CUDA")
+    bias_type = None
+    opFW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv = [
+        op,
+        device,
+        dtype,
+        bias_type,
+        batch_size,
+        q_len,
+        kv_len,
+        h,
+        k,
+        kv,
+    ]
+    s_hipri = torch.cuda.Stream(priority=-1)
+    s_lopri = torch.cuda.Stream(priority=0)
+    query, key, value, attn_bias = create_tensors(
+        *opFW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv, fmt="BMHK"
+    )
+    torch.cuda.synchronize()
+    with torch.cuda.stream(s_lopri):
+        torch.cuda._sleep(100_000_000)  # wait 100m cycles
+        query *= 2
+    s_hipri.wait_stream(s_lopri)
+    with torch.cuda.stream(s_hipri):
+        # If the kernel is scheduled in the main stream
+        # `query * 2` has not been executed yet
+        out = xformers.ops.memory_efficient_attention(query, key, value, op=(op, None))
+    # Test that `s_lopri` is still sleeping
+    # and that `query *= 2` has not been executed yet
+    query2_main_stream = query * 2
+    torch.cuda.synchronize()
+    # TODO: Figure out why this is failing sometimes
+    # The sleep timer seems to be high enough already ...
+    # assert torch.allclose(query2_main_stream, query), "Need to increase sleep time"
+    del query2_main_stream
+
+    ref = ref_attention(query, key, value)
     assert out.shape == ref.shape, out.shape
+
+    assert_allclose(
+        out.float(),
+        ref.float(),
+        atol=op.ERROR_ATOL[dtype],
+        rtol=op.ERROR_RTOL.get(dtype, 1e-5),
+    )
+
+
+@parametrize_opBW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv__xs
+def test_custom_scale(opBW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv):
+    p = 0.0
+    scale = 0.1
+
+    (
+        op_bw,
+        device,
+        dtype,
+        _,
+        B,
+        q_len,
+        kv_len,
+        H,
+        k,
+        Kv,
+    ) = opBW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv
+    torch.manual_seed(q_len + kv_len + k)
+    if device != "cuda":
+        pytest.skip("Not CUDA")
+
+    query, key, value, attn_bias = create_tensors(
+        *opBW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv, fmt="BMK"
+    )
+    inputs = fmha.Inputs(
+        query=query, key=key, value=value, attn_bias=attn_bias, scale=scale
+    )
+    op_fw = sample_random_supported_fw(inputs, seed=q_len * k + kv_len * k)
+    grad_out = query.new_ones(B * H, q_len, Kv)
+    query.requires_grad_(True)
+    key.requires_grad_(True)
+    value.requires_grad_(True)
+
+    reasons = op_fw.not_supported_reasons(inputs)
+    if reasons:
+        pytest.skip(f"{op_fw.NAME}: unsupported ({'/'.join(reasons)})")
+    reasons = op_bw.not_supported_reasons(inputs)
+    if reasons:
+        pytest.skip(f"{op_bw.NAME}: unsupported ({'/'.join(reasons)})")
+
+    # NOTE: we still need to scale the inputs to not blowup
+    # the pre-softmax values (numerical stability)
+    s = k**-0.5
+    out = xformers.ops.memory_efficient_attention(
+        query * s, key, value, attn_bias, p, scale, op=(op_fw, op_bw)
+    )
+    out.backward(grad_out)
+    grad_q, grad_k, grad_v = query.grad, key.grad, value.grad
+    query.grad = key.grad = value.grad = None
+
+    ref = ref_attention(query * s, key, value, attn_bias, None, p, scale)
+    ref.backward(grad_out)
+    ref_grad_q, ref_grad_k, ref_grad_v = query.grad, key.grad, value.grad
+    query.grad = key.grad = value.grad = None
+
+    atol = op_fw.ERROR_ATOL[dtype]
+    rtol = op_fw.ERROR_RTOL[dtype]
+    assert_allclose(out.float(), ref.float(), "out", atol=atol, rtol=rtol)
+    atol = op_bw.ERROR_ATOL[dtype]
+    rtol = op_bw.ERROR_RTOL[dtype]
+    assert_allclose(grad_q, ref_grad_q, "grad_q", atol=atol, rtol=rtol)
+    assert_allclose(grad_k, ref_grad_k, "grad_k", atol=atol, rtol=rtol)
+    assert_allclose(grad_v, ref_grad_v, "grad_v", atol=atol, rtol=rtol)
+
+
+def apply_attention(query, key, value, attn_bias, op_fw, proj):
+    x = xformers.ops.memory_efficient_attention(
+        query, key, value, attn_bias=attn_bias, op=(op_fw, None)
+    )
+    x = proj(x)
+    return x
+
+
+@pytest.mark.parametrize("use_reentrant", [False, True])
+@parametrize_opFW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv__xs
+def test_grad_checkpointing(
+    opFW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv,
+    use_reentrant,
+):
+    fmt = "BMHK"
+    (
+        op,
+        device,
+        dtype,
+        bias_type,
+        batch_size,
+        q_len,
+        kv_len,
+        h,
+        k,
+        kv,
+    ) = opFW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv
+    bias_type = None
+    opFW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv = (
+        op,
+        device,
+        dtype,
+        bias_type,
+        batch_size,
+        q_len,
+        kv_len,
+        h,
+        k,
+        kv,
+    )
+    query, key, value, attn_bias = create_tensors(
+        *opFW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv,
+        fmt=fmt,
+    )
+    qkv = None
+
+    if (
+        fmt == "BMHK"
+        and query.shape[3] == value.shape[3]
+        and query.shape[1] == value.shape[1]
+    ):
+        qkv = torch.stack([query, key, value], 2)
+        qkv.requires_grad_(True)
+        # bm3hk -> 3 x bmhk
+        query, key, value = xformers.ops.unbind(qkv, 2)
+        assert not query.is_contiguous()
+
+    query.requires_grad_(True)
+    key.requires_grad_(True)
+    value.requires_grad_(True)
+
+    proj = torch.nn.Linear(kv, k, device=device, dtype=dtype)
+
+    x = query
+    for _ in range(5):
+        x = checkpoint(
+            apply_attention,
+            x,
+            key,
+            value,
+            attn_bias,
+            op,
+            proj,
+            use_reentrant=use_reentrant,
+        )
+    x.mean().backward()
+
+
+ALL_FW_OPS_NO_SMALLK = [op for op in ALL_FW_OPS if op is not fmha.small_k.FwOp]
+
+
+@pytest.mark.parametrize(
+    "op", ALL_FW_OPS_NO_SMALLK, ids=[op.NAME for op in ALL_FW_OPS_NO_SMALLK]
+)
+def test_unsupported_cpu(op: Type[fmha.AttentionFwOpBase]):
+    q = torch.empty([1, 1, 1, 32])
+    with pytest.raises(ValueError):
+        fmha.memory_efficient_attention(q, q, q, op=(op, None))
+
+
+@cuda_only
+@pytest.mark.parametrize(
+    "op", ALL_FW_OPS_NO_SMALLK, ids=[op.NAME for op in ALL_FW_OPS_NO_SMALLK]
+)
+def test_unsupported_stride_lastdim(op: Type[fmha.AttentionFwOpBase]):
+    q = torch.empty([1, 1, 32, 4], device="cuda", dtype=torch.float16).permute(
+        0, 3, 1, 2
+    )
+    try:
+        fmha.memory_efficient_attention(q, q, q, op=(op, None))
+    except ValueError as e:
+        if "Only work on pre-MLIR triton for now" in str(e):
+            pytest.skip("Only work on pre-MLIR triton for now")
+        q = q.contiguous()
+        fmha.memory_efficient_attention(q, q, q, op=(op, None))
+
+
+@cuda_only
+@pytest.mark.parametrize(
+    "op", ALL_FW_OPS_NO_SMALLK, ids=[op.NAME for op in ALL_FW_OPS_NO_SMALLK]
+)
+def test_unsupported_stride_alignment(op: Type[fmha.AttentionFwOpBase]):
+    q = torch.empty([1, 2, 1, 33], device="cuda", dtype=torch.float16)[:, :, :, :32]
+    try:
+        fmha.memory_efficient_attention(q, q, q, op=(op, None))
+    except ValueError as e:
+        if "Only work on pre-MLIR triton for now" in str(e):
+            pytest.skip("Only work on pre-MLIR triton for now")
+        q = q.contiguous()
+        fmha.memory_efficient_attention(q, q, q, op=(op, None))
+
+def test_attn_bias_causal() -> None:
+    m = -math.inf
+    causal_mask = torch.tensor([[0, m], [0, 0], [0, 0]])
+    tensor_bias = torch.tensor([[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]])
+
+    attn_bias = fmha.attn_bias.LowerTriangularMask()
+    assert_allclose(attn_bias.materialize(causal_mask.shape), causal_mask, "causal")
+    attn_bias = attn_bias.add_bias(tensor_bias)
+    assert_allclose(
+        attn_bias.materialize(causal_mask.shape),
+        tensor_bias + causal_mask,
+        "causal+tensor_bias",
+    )
+
+
+def test_attn_bias_torch_tensor() -> None:
+    tensor_bias = torch.tensor([[1.0, 2.0, 3.0], [3.0, 4.0, 5.0]])
+    attn_bias = fmha.attn_bias.LowerTriangularMaskWithTensorBias(tensor_bias)
+    m = -math.inf
+    causal_bias = torch.tensor([[0, m, m], [0, 0, m]])
+    assert_allclose(
+        attn_bias.materialize((2, 3)), causal_bias + tensor_bias, "tensor_bias+causal"
+    )
+
+
+def test_attn_bias_blockdiag() -> None:
+    queries = [
+        torch.randn([1, 3, 1, 8]),
+        torch.randn([1, 2, 1, 8]),
+        torch.randn([1, 5, 1, 8]),
+    ]
+    attn_bias, q = fmha.BlockDiagonalMask.from_tensor_list(queries)
+
+    # Verify mask
+    as_tensor = attn_bias.materialize((10, 10))
+    assert int((as_tensor != -math.inf).sum().item()) == 3 * 3 + 2 * 2 + 5 * 5
+    assert_allclose(as_tensor[0:3, 0:3], torch.zeros([3, 3]), "batch0")
+    assert_allclose(as_tensor[3:5, 3:5], torch.zeros([2, 2]), "batch1")
+    assert_allclose(as_tensor[5:, 5:], torch.zeros([5, 5]), "batch2")
+
+    # Verify we can split it back
+    queries2 = attn_bias.split(q)
+    assert len(queries) == len(queries2)
+    for q1, q2 in zip(queries, queries2):
+        assert_allclose(q1, q2)
+
+
+def test_attn_bias_blockdiag_batched() -> None:
+    queries = [
+        torch.randn([1, 3, 1, 8]),
+        torch.randn([3, 2, 1, 8]),
+        torch.randn([1, 5, 1, 8]),
+    ]
+    attn_bias, q = fmha.BlockDiagonalMask.from_tensor_list(queries)
+
+    # Verify mask
+    as_tensor = attn_bias.materialize((14, 14))
+    assert int((as_tensor != -math.inf).sum().item()) == 3 * 3 + 3 * 2 * 2 + 5 * 5
+    assert_allclose(as_tensor[0:3, 0:3], torch.zeros([3, 3]), "batch0")
+    assert_allclose(as_tensor[3:5, 3:5], torch.zeros([2, 2]), "batch1.0")
+    assert_allclose(as_tensor[5:7, 5:7], torch.zeros([2, 2]), "batch1.1")
+    assert_allclose(as_tensor[7:9, 7:9], torch.zeros([2, 2]), "batch1.2")
+    assert_allclose(as_tensor[9:, 9:], torch.zeros([5, 5]), "batch2")
+
+    # Verify we can split it back
+    queries2 = attn_bias.split(q)
+    assert len(queries) == len(queries2)
+    for q1, q2 in zip(queries, queries2):
+        assert_allclose(q1, q2)
+
+
+def test_attn_bias_blockdiag_crossattn_causal() -> None:
+    # Q / KV have different seqlen
+    list_q = [
+        torch.randn([1, 3, 1, 8]),
+        torch.randn([2, 1, 1, 8]),
+    ]
+    list_k = [
+        torch.randn([1, 2, 1, 8]),
+        torch.randn([2, 3, 1, 8]),
+    ]
+
+    attn_bias, q, k, _ = fmha.attn_bias.BlockDiagonalMask.from_tensor_lists_qkv(
+        list_q, list_k
+    )
+
+    # Verify mask
+    as_tensor = attn_bias.materialize((q.shape[1], k.shape[1]))
+    assert int((as_tensor != -math.inf).sum().item()) == 3 * 2 + 2 * 3 * 1
+    assert_allclose(as_tensor[0:3, 0:2], torch.zeros([3, 2]), "batch0")
+    assert_allclose(as_tensor[3:4, 2:5], torch.zeros([1, 3]), "batch1.0")
+    assert_allclose(as_tensor[4:, 5:], torch.zeros([1, 3]), "batch1.1")
+
+    # Also test causal version
+    as_tensor = attn_bias.make_causal().materialize((q.shape[1], k.shape[1]))
+    assert_allclose(
+        as_tensor[3:4, 2:5],
+        fmha.attn_bias.LowerTriangularMask().materialize((1, 3)),
+        "batch1.0[causal]",
+    )
+
+    # Verify we can split it back
+    list_q2 = attn_bias.split_queries(q)
+    assert len(list_q) == len(list_q2)
+    for q1, q2 in zip(list_q, list_q2):
+        assert_allclose(q1, q2)
+    with pytest.raises(ValueError):
+        attn_bias.split_queries(k)
+    list_k2 = attn_bias.split_kv(k)
+    assert len(list_k) == len(list_k2)
+    for k1, k2 in zip(list_k, list_k2):
+        assert_allclose(k1, k2)
+
+
+def test_attn_bias_blockdiag_crossattn_causal_with_prefix_qk_cond() -> None:
+    list_q = [
+        torch.randn([1, 3, 1, 8]),
+    ]
+    list_k = [
+        torch.randn([1, 2, 1, 8]),
+    ]
+    attn_bias, q, k, _ = fmha.attn_bias.BlockDiagonalMask.from_tensor_lists_qkv(
+        list_q, list_k
+    )
+    with pytest.raises(ValueError):
+        attn_bias.make_causal_from_bottomright()
+
+
+def test_attn_bias_blockdiag_crossattn_causal_with_prefix() -> None:
+    # Q / KV have different seqlen
+    list_q = [
+        torch.randn([1, 2, 1, 8]),
+        torch.randn([2, 2, 1, 8]),
+    ]
+    list_k = [
+        torch.randn([1, 2, 1, 8]),
+        torch.randn([2, 5, 1, 8]),
+    ]
+
+    attn_bias, q, k, _ = fmha.attn_bias.BlockDiagonalMask.from_tensor_lists_qkv(
+        list_q, list_k
+    )
+    as_tensor = attn_bias.make_causal_from_bottomright().materialize(
+        (q.shape[1], k.shape[1])
+    )
+    m = -math.inf
+    assert_allclose(
+        as_tensor[0:2, 0:2],
+        torch.tensor([[0, m], [0, 0]], dtype=torch.float32),
+        "batch1.1[causal_with_prefix]",
+    )
+    assert_allclose(
+        as_tensor[2:4, 2:7],
+        torch.tensor([[0, 0, 0, 0, m], [0, 0, 0, 0, 0]], dtype=torch.float32),
+        "batch2.1[causal_with_prefix]",
+    )
+    assert_allclose(
+        as_tensor[4:6, 7:12],
+        torch.tensor([[0, 0, 0, 0, m], [0, 0, 0, 0, 0]], dtype=torch.float32),
+        "batch2.2[causal_with_prefix]",
+    )
+
+
+@cuda_only
+def test_attn_bias_padded() -> None:
+    bsize, n_heads, d, padding = 8, 3, 8, 32
+
+    # Q / KV have different seqlen
+    k = torch.randn((bsize, padding, n_heads, d), device="cuda", dtype=torch.float16)
+    k_seqlen = [5, 8, 7, 1, 9, 3, 12, 32]
+    other = bsize - 1
+    v = torch.randn((bsize, padding, n_heads, d), device="cuda", dtype=torch.float16)
+    n_q_first = 4
+    q = [
+        torch.randn((1, n_q_first, n_heads, d), device="cuda", dtype=torch.float16),
+        torch.randn((1, other, n_heads, d), device="cuda", dtype=torch.float16),
+    ]
+    q_cat = torch.cat([x.view(1, -1, n_heads, d) for x in q], dim=1)
+    q_seqlen = [n_q_first] + [1] * other
+
+    attn_bias = fmha.attn_bias.BlockDiagonalCausalWithOffsetPaddedKeysMask.from_seqlens(
+        q_seqlen=q_seqlen,
+        kv_seqlen=k_seqlen,
+        kv_padding=padding,
+    )
+
+    v = v.view(1, -1, n_heads, d)
+    k = k.view(1, -1, n_heads, d)
+
+    scores = (q_cat.transpose(1, 2) @ k.transpose(1, 2).transpose(2, 3)).float()
+    assert not scores.isnan().any()
+    mask = torch.full_like(scores, -float("inf"))
+    for i, (slen, qlen) in enumerate(zip(k_seqlen, q_seqlen)):
+        kseq_start = i * padding
+        qstart = sum(q_seqlen[:i])
+        mask[:, :, qstart : qstart + qlen, kseq_start : kseq_start + slen] = torch.triu(
+            mask[:, :, qstart : qstart + qlen, kseq_start : kseq_start + slen].float(),
+            diagonal=1 + slen - qlen,
+        ).float()
+
+    scores += mask
+    assert not scores.isnan().any()
+    # 1,3,10,8 @ 1,3,8,256 -> 1,3,10,256
+    scores = torch.nn.functional.softmax(scores, -1).half()
+    # torch.Size([1, 3, 3, 32]) @ torch.Size([1, 3, 32, 8])
+    output = scores @ v.transpose(1, 2)  # 1,3,10,256 @ 1,3,256, 8 -> 1,3,10,8
+    output = output.transpose(1, 2).contiguous()
+
+    fmha_output = fmha.memory_efficient_attention_forward(
+        q_cat, k, v, attn_bias, scale=1.0, op=fmha.ck.FwOp
+    )
+
+    # assert torch.allclose(output, fmha_output)
+    assert_allclose(
+        output,
+        fmha_output,
+        atol=fmha.cutlass.FwOp.ERROR_ATOL[torch.float16],
+        rtol=fmha.cutlass.FwOp.ERROR_RTOL[torch.float16],
+    )
+
+
+def _kv_heads_label(kv_heads: Optional[int]) -> str:
+    if kv_heads is None:
+        return ""
+    if kv_heads == 1:
+        return "mq"
+    return f"gqa{kv_heads}"
+
+@pytest.mark.parametrize("op", [fmha.ck_decoder.FwOp])
+@pytest.mark.parametrize("kv_heads", [None, 1, 2], ids=_kv_heads_label)
+@pytest.mark.parametrize("bsz,n_heads", [(1, 1), (1, 16), (1, 32), (8, 1), (4, 8)])
+@pytest.mark.parametrize("padding", [32, 4096])
+@pytest.mark.parametrize("dtype", ["f16", "bf16", "f32"])
+def test_decoder(
+    op,
+    n_heads: int,
+    kv_heads: Optional[int],
+    padding: int,
+    bsz: int,
+    dtype: str,
+    dequant: bool = False,
+    num_queries: int = 1,
+    d = 256,
+) -> None:
+    # kv_heads = 1: multiquery
+    # kv_heads = None: neither MQA nor GQA
+    # kv_heads > 1: BMGHK
+    dtype_ = {"f16": torch.float16, "bf16": torch.bfloat16, "f32": torch.float}[dtype]
+    tensor_options = {"dtype": dtype_, "device": "cuda"}
+    torch.manual_seed(1)
+    num_queries = 1
+    if kv_heads is not None and kv_heads > 1:
+        k_shape: Tuple[int, ...] = (1, bsz * padding, kv_heads, n_heads, d)
+        q_shape: Tuple[int, ...] = (
+            1,
+            bsz * num_queries,
+            kv_heads,
+            n_heads,
+            d,
+        )
+    else:
+        k_shape = (1, bsz * padding, n_heads, d)
+        q_shape = (1, bsz * num_queries, n_heads, d)
+
+    k = torch.randn(k_shape, **tensor_options)
+    k_seqlen = torch.randint(num_queries, padding + 1, (bsz,)).tolist()
+    v = torch.randn_like(k)
+    q = torch.randn(q_shape, **tensor_options)
+    causal_diagonal = torch.tensor(  # TODO: make unnecessary
+        [i - 1 for i in k_seqlen], dtype=torch.int32
+    ).cuda()
+
+    if kv_heads is not None:
+        k = k[..., :1, :].expand(k_shape)
+        v = v[..., :1, :].expand(k_shape)
+
+    attn_bias = fmha.attn_bias.BlockDiagonalCausalWithOffsetPaddedKeysMask.from_seqlens(
+        q_seqlen=[num_queries] * bsz,
+        kv_seqlen=k_seqlen,
+        causal_diagonal=causal_diagonal,
+        kv_padding=padding,
+    )
+    inp = fmha.Inputs(q, k, v, attn_bias=attn_bias)
+    if (not_supported_reasons := op.not_supported_reasons(inp)):
+        pytest.skip(f"{not_supported_reasons=}")
+
+    decoder_output = fmha.memory_efficient_attention_forward(
+        q, k, v, attn_bias, op=op
+    )
+
+    ref_output = ref_attention(q, k, v, attn_bias)
+
+    assert_allclose(
+        decoder_output.float(),
+        ref_output,
+        atol=fmha.ck_decoder.FwOp.ERROR_ATOL[dtype_] * 4,
+        rtol=fmha.ck_decoder.FwOp.ERROR_RTOL[dtype_],
+    )
+
+def test_attn_bias_from_seqlens() -> None:
+    bias = fmha.attn_bias.BlockDiagonalMask.from_seqlens([3, 5, 1])
+    out = bias.split(torch.randn([1, 3 + 5 + 1, 16]))
+    assert len(out) == 3
+    assert tuple(out[0].shape) == (1, 3, 16)
+
+
+@cuda_only
+def test_attn_bias_blockdiag_doc() -> None:
+    """IMPORTANT:
+    This is the example in the doc for `BlockDiagonalMask`.
+    If this example needs to be updated, please also update the doc
+    """
+    import torch
+
+    from xformers.ops import fmha
+
+    K = 16
+    dtype = torch.float16
+    device = "cuda"
+    list_x = [
+        torch.randn([1, 3, 1, K], dtype=dtype, device=device),
+        torch.randn([1, 6, 1, K], dtype=dtype, device=device),
+        torch.randn([1, 2, 1, K], dtype=dtype, device=device),
+    ]
+    attn_bias, x = fmha.BlockDiagonalMask.from_tensor_list(list_x)
+
+    linear = torch.nn.Linear(K, K * 3).to(device=device, dtype=dtype)  # type: ignore
+
+    q, k, v = linear(x).reshape([1, -1, 1, 3, K]).unbind(-2)
+    out = fmha.memory_efficient_attention(q, k, v, attn_bias=attn_bias, op=(fmha.ck.FwOp, None))
+    list_out = attn_bias.split(out)
+    assert tuple(list_out[0].shape) == (1, 3, 1, K)
+
+
+@cuda_only
+class TestAttnBias:
+    @staticmethod
+    def create_tensors(
+        dtype,
+        B: int = 2,
+        Mq: int = 32,
+        Mkv: int = 32,
+        H: int = 3,
+        K: int = 16,
+        Kv: int = 16,
+    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
+        return (
+            torch.randn([B, Mq, H, K], device="cuda", dtype=dtype) * 3,
+            torch.randn([B, Mkv, H, K], device="cuda", dtype=dtype) * 3,
+            torch.randn([B, Mkv, H, Kv], device="cuda", dtype=dtype) * 3,
+            torch.randn([B, H, Mq, Mkv], device="cuda", dtype=dtype) * 3,
+        )
+
+    @staticmethod
+    def pad_bias(bias: torch.Tensor) -> torch.Tensor:
+        align_to = 16
+        if (bias.shape[-1] % align_to) == 0:
+            return bias
+        pad_count = align_to - (bias.shape[-1] % align_to)
+        return torch.nn.functional.pad(bias, [0, pad_count])[:, :, :, : bias.shape[-1]]
+
+    def test_f16_biasf32(self) -> None:
+        q, k, v, bias = self.create_tensors(torch.float16)
+        fmha.memory_efficient_attention(q, k, v, attn_bias=bias)
+        bias = bias.to(torch.float32)
+        with pytest.raises((ValueError, RuntimeError)):
+            fmha.memory_efficient_attention(q, k, v, attn_bias=bias)
+
+    def test_f32_biasf16(self) -> None:
+        q, k, v, bias = self.create_tensors(torch.float32)
+        fmha.memory_efficient_attention(q, k, v, attn_bias=bias)
+        bias = bias.to(torch.float16)
+        with pytest.raises((ValueError, RuntimeError)):
+            fmha.memory_efficient_attention(q, k, v, attn_bias=bias)
+
+    @pytest.mark.parametrize("dtype", [torch.float32, torch.float16])
+    def test_wrong_alignment(self, dtype) -> None:
+        op = fmha.cutlass.FwOp
+        q, k, v, bias = self.create_tensors(dtype, Mq=7, Mkv=5)
+        try:
+            fmha.memory_efficient_attention(q, k, v, attn_bias=bias, op=(op, None))
+            return
+        except (ValueError, RuntimeError):
+            pass
+        # This case is not supported, likely due to padding issues
+        # Let's make sure it works with padding
+        assert bias.ndim == 4, bias.shape
+        bias_padded = self.pad_bias(bias)
+        out = fmha.memory_efficient_attention(
+            q, k, v, attn_bias=bias_padded, op=(op, None)
+        ).float()
+        ref_out = ref_attention_bmhk(q, k, v, bias)
+        assert_allclose(
+            out, ref_out, atol=op.ERROR_ATOL[dtype], rtol=op.ERROR_RTOL[dtype]
+        )
+
+    def test_permuted_attn_bias(self) -> None:
+        op = fmha.cutlass.FwOp
+        dtype = torch.float16
+        q, k, v, bias = self.create_tensors(dtype, Mq=7, Mkv=7)
+        bias = bias.transpose(-1, -2)  # now `stride(-1) != 1`
+        # Either it works, or it raises an exception
+        # but we should never get a CUDA error
+        try:
+            out = fmha.memory_efficient_attention(
+                q, k, v, attn_bias=bias, op=(op, None)
+            ).float()
+            ref_out = ref_attention_bmhk(q, k, v, bias)
+            assert_allclose(
+                out, ref_out, atol=op.ERROR_ATOL[dtype], rtol=op.ERROR_RTOL[dtype]
+            )
+        except (ValueError, RuntimeError):
+            pass
+
+
+SM_AND_SHMEM_KBYTES = [
+    # https://docs.nvidia.com/cuda/cuda-c-programming-guide/#features-and-technical-specifications-technical-specifications-per-compute-capability
+    (50, 64),
+    (60, 64),
+    (70, 96),
+    (75, 64),
+    (80, 163),
+    (86, 99),
+    (89, 99),
+    # (90, 227),
+]
+
+
+@cuda_only
+@pytest.mark.parametrize("dtype_str", ["f32", "f16", "bf16"])
+@pytest.mark.parametrize(
+    "sm_shmem",
+    SM_AND_SHMEM_KBYTES,
+    ids=[f"cc{sm}_shmem{shmem}kb" for sm, shmem in SM_AND_SHMEM_KBYTES],
+)
+def test_has_kernel_for(sm_shmem: Tuple[int, int], dtype_str: str) -> None:
+    dtype = {"f32": torch.float, "f16": torch.half, "bf16": torch.bfloat16}[dtype_str]
+    sm, shmem_kbytes = sm_shmem
+    if sm < 80 and dtype_str == "bf16":
+        return
+
+    for k in [16, 32, 64, 128, 256]:
+        assert torch.ops.xformers._has_cutlassF_kernel_for(
+            dtype, sm, shmem_kbytes * 1024, k
+        ), f"k={k}"
+        assert torch.ops.xformers._has_cutlassB_kernel_for(
+            dtype, sm, shmem_kbytes * 1024, k
+        ), f"k={k}"
+
+
+def test_window_size_materialize() -> None:
+    seqlens = [4, 6]
+    attn_bias = fmha.attn_bias.BlockDiagonalMask.from_seqlens(
+        q_seqlen=seqlens,
+        kv_seqlen=seqlens,
+    ).make_local_attention(2)
+    mask = attn_bias.materialize(
+        (1, 1, sum(seqlens), sum(seqlens)),
+        device="cpu",
+        dtype=torch.float32,
+    )
+    true_mask = torch.log(
+        torch.Tensor(
+            [
+                [
+                    [
+                        [1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
+                        [1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
+                        [0.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
+                        [0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
+                        [0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0],
+                        [0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0],
+                        [0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 0.0],
+                        [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 0.0, 0.0],
+                        [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 0.0],
+                        [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0],
+                    ]
+                ]
+            ]
+        )
+    )
+    assert torch.all(mask == true_mask)
+
+
+@cuda_only
+@pytest.mark.parametrize(
+    "opFW_biasT",
+    [
+        (op, biasT)
+        for op in ALL_FW_OPS
+        for biasT in op.SUPPORTED_ATTN_BIAS_TYPES
+        if op.SUPPORTS_BMGHK
+    ],
+)
+def test_forward_gqa(opFW_biasT):
+    opFW, biasT = opFW_biasT
+    B_Mq_Mkv_H_K_Kv = (3, 512, 512, 16, 128, 128)
+    test_forward(
+        (
+            opFW,
+            "cuda",
+            torch.float16,
+            biasT,
+            *B_Mq_Mkv_H_K_Kv,
+        ),
+        packed=False,
+        fmt="BMGHK",
+        g=2,
+    )
+
+
+@cuda_only
+@pytest.mark.parametrize(
+    "opBW",
+    [
+        fmha.flash.BwOp,
+        fmha.cutlass.BwOp,
+    ],
+)
+def test_backward_gqa(opBW):
+    H = 8
+    B_Mq_Mkv_H_K_Kv = (3, 512, 512, H, 128, 128)
+    dtype = torch.float16
+    query, key, value, attn_bias = create_tensors(
+        *(opBW, "cuda", dtype, type(None), *B_Mq_Mkv_H_K_Kv),
+        attn_bias_requires_grad=False,
+        fmt="BMHK",
+    )
+    op = (fmha.cutlass.FwOp, opBW)
+    key = key[:, :, :1].expand(-1, -1, H, -1)
+    value = value[:, :, :1].expand(-1, -1, H, -1)
+    key.requires_grad_(True)
+    out = fmha.memory_efficient_attention(query, key, value, attn_bias=attn_bias)
+    out_ref = ref_attention_bmhk(query, key, value, attn_bias=attn_bias)
+    assert_allclose(
+        out.float(),
+        out_ref.float(),
+        atol=op[0].ERROR_ATOL[dtype],
+        rtol=op[0].ERROR_RTOL[dtype],
+    )
+    out.backward(query)
+    dk = key.grad
+    key.grad = None
+    out_ref.backward(query)
+    assert_allclose(
+        dk.float(),
+        key.grad.float(),
+        atol=op[1].ERROR_ATOL[dtype],
+        rtol=op[1].ERROR_RTOL[dtype],
+    )
+
+
+@cuda_only
+@pytest.mark.parametrize("opFW", [op for op in ALL_FW_OPS if op.SUPPORTS_BMGHK])
+def test_forward_gqa_one_group(opFW):
+    dtype = torch.float16
+    B, Mq, Mkv, H, K = 3, 13, 16, 5, 128
+    q = torch.randn([B, Mq, 1, H, K], dtype=dtype, device="cuda") * 3
+    k = torch.randn([B, Mkv, 1, H, K], dtype=dtype, device="cuda") * 3
+    v = torch.randn([B, Mkv, 1, H, K], dtype=dtype, device="cuda") * 3
+
+    supported = opFW.supports(fmha.Inputs(q, k, v))
+    if not supported:
+        supported_bmhk = opFW.supports(fmha.Inputs(q[:, :, 0], k[:, :, 0], v[:, :, 0]))
+        assert supported == supported_bmhk
+        pytest.skip("not supported")
+    out = fmha.memory_efficient_attention_forward(q, k, v, op=opFW)
+    ref = ref_attention(q, k, v)
+    assert_allclose(
+        out.float(),
+        ref,
+        atol=opFW.ERROR_ATOL[dtype],
+        rtol=opFW.ERROR_RTOL.get(dtype, 1e-5),
+    )
+
+'''
+@sm80_or_better_only
+def test_flash_gqa_wrong_strides() -> None:
+    op = (fmha.flash.FwOp, None)
+    device = "cuda"
+    B, Mq, Mkv, G, H, K = 3, 1, 512, 2, 8, 128
+    q = torch.empty((B, Mq, G, H, K), dtype=torch.float16, device=device)
+    kv = torch.empty((B, Mkv, G, H, K), dtype=torch.float16, device=device)
+    fmha.memory_efficient_attention(q, kv, kv, op=op)
+
+    kv = torch.empty((B, Mkv, H, G, K), dtype=torch.float16, device=device).permute(
+        0, 1, 3, 2, 4
+    )
+    with pytest.raises(ValueError):
+        fmha.memory_efficient_attention(q, kv, kv, op=op)
+
+    kv = torch.empty((B, Mkv, G, 1, K), dtype=torch.float16, device=device)
+    with pytest.raises(ValueError):
+        fmha.memory_efficient_attention(q, kv, kv, op=op)
+    kv = kv.expand(-1, -1, -1, H, K)
+    fmha.memory_efficient_attention(q, kv, kv, op=op)
+
+    kv = torch.empty((B, Mkv, G, H, 2 * K), dtype=torch.float16, device=device)[
+        :, :, :, :, :K
+    ]
+    fmha.memory_efficient_attention(q, kv, kv, op=op)
+'''
+
+def _dispatches_to_splitK(q, kv):
+    return (
+        _dispatch_fw_priority_list(fmha.Inputs(q, kv, kv), False)[0]
+        is fmha.triton_splitk.FwOp
+    )
+
+
+def _dispatches_to_flash_decoding(q, kv):
+    return (
+        _dispatch_fw_priority_list(fmha.Inputs(q, kv, kv), False)[0] is fmha.flash.FwOp
+    )
+
+
+def test_dispatch_decoding_bmhk() -> None:
+    assert not _dispatches_to_splitK(
+        torch.empty([1, 8, 1, 128]), torch.empty([1, 2048, 1, 128])
+    ), "Should not use SplitK with 1 head (no tensorcores)"
+    assert _dispatches_to_flash_decoding(
+        torch.empty([1, 8, 32, 128]),
+        torch.empty([1, 2048, 1, 128]).expand(-1, -1, 32, -1),
+    ), "Should use Flash-Decoding with BMHK MQA"
+    assert not _dispatches_to_splitK(
+        torch.empty([1, 8, 32, 128]),
+        torch.empty([1, 2048, 32, 128]),
+    ), "Should not use SplitK when no TensorCores"
+    assert not _dispatches_to_splitK(
+        torch.empty([1, 128, 32, 128]),
+        torch.empty([1, 2048, 1, 128]).expand(-1, -1, 32, -1),
+    ), "Should not use SplitK if q seqlen is long"
+    assert not _dispatches_to_splitK(
+        torch.empty([128, 8, 32, 128]),
+        torch.empty([128, 2048, 1, 128]).expand(-1, -1, 32, -1),
+    ), "Should not use SplitK if B is big"
+
+
+def test_dispatch_decoding_bmghk() -> None:
+    assert not _dispatches_to_splitK(
+        torch.empty([1, 8, 1, 1, 128]), torch.empty([1, 2048, 1, 1, 128])
+    ), "Should not use SplitK with 1 head (no tensorcores)"
+    assert _dispatches_to_flash_decoding(
+        torch.empty([1, 8, 1, 32, 128]),
+        torch.empty([1, 2048, 1, 1, 128]).expand(-1, -1, -1, 32, -1),
+    ), "Should use Flash-Decoding with MQA"
+    assert _dispatches_to_flash_decoding(
+        torch.empty([1, 8, 4, 32, 128]),
+        torch.empty([1, 2048, 4, 1, 128]).expand(-1, -1, -1, 32, -1),
+    ), "Should use Flash-Decoding with GQA"
+    assert not _dispatches_to_splitK(
+        torch.empty([1, 8, 1, 32, 128]),
+        torch.empty([1, 2048, 1, 32, 128]),
+    ), "Should not use SplitK when no TensorCores"
+    assert not _dispatches_to_splitK(
+        torch.empty([1, 128, 1, 32, 128]),
+        torch.empty([1, 2048, 1, 1, 128]).expand(-1, -1, -1, 32, -1),
+    ), "Should not use SplitK if q seqlen is long"
+    assert not _dispatches_to_splitK(
+        torch.empty([128, 8, 1, 32, 128]),
+        torch.empty([128, 2048, 1, 1, 128]).expand(-1, -1, -1, 32, -1),
+    ), "Should not use SplitK if B is big"
+
+
+shapes_triton_splitk = [
+    (1, 8, 2**16, 1, 128, 128),
+    (1, 4, 2**16, 1, 128, 128),
+    (1, 16, 2**16, 1, 128, 128),
+    (1, 16, 2**16, 1, 32, 32),
+    (1, 8, 1025, 1, 128, 128),
+    (2, 8, 4096, 1, 128, 128),
+    (10, 8, 2**16, 1, 128, 128),
+    (10, 15, 2**16, 1, 128, 128),
+    (1, 3, 2**16, 1, 128, 128),
+    (1, 3, 2**16 - 10, 1, 128, 128),
+    (2, 3, 73, 1, 128, 128),
+    (2, 7, 7328, 1, 128, 128),
+    (2, 7, 7328, 1, 120, 120),
+    (2, 7, 63, 1, 120, 120),
+]
+op_device_dtype_biasT_B_Mq_Mkv_H_K_Kv_splitk = [
+    (fmha.triton_splitk.FwOp, "cuda", torch.float16, type(None), *s)
+    for s in shapes_triton_splitk
+] + [
+    (fmha.triton_splitk.FwOp, "cuda", torch.bfloat16, type(None), *s)
+    for s in shapes_triton_splitk
+]
+
+
+@pytest.mark.parametrize(
+    "opFW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv",
+    op_device_dtype_biasT_B_Mq_Mkv_H_K_Kv_splitk,
+    ids=[make_id(*c) for c in op_device_dtype_biasT_B_Mq_Mkv_H_K_Kv_splitk],
+)
+@cuda_only
+def test_forward_splitk(
+    opFW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv,
+    packed=False,
+    fmt="BMHK",
+):
+    test_forward(opFW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv, packed=packed, fmt=fmt)
+
+
+@cuda_only
+@pytest.mark.parametrize("op", [fmha.triton_splitk.FwOp])
+@pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16])
+@pytest.mark.parametrize(
+    "B_Mkv_H_K",
+    [
+        (1, 2**16, 3, 128),
+        (5, 53, 4, 64),
+    ],
+)
+def test_mqa_decoding(op: Type[fmha.AttentionFwOpBase], dtype, B_Mkv_H_K):
+    B, Mkv, H, K = B_Mkv_H_K
+    q = torch.randn([B, 1, H, K], dtype=dtype, device="cuda") * 3
+    k = torch.randn([B, Mkv, 1, K], dtype=dtype, device="cuda") * 3
+    v = torch.randn([B, Mkv, 1, K], dtype=dtype, device="cuda") * 3
+    k = k.expand(-1, -1, H, -1)
+    v = v.expand(-1, -1, H, -1)
+
+    if not op.supports(fmha.Inputs(q, k, v)):
+        pytest.skip("not supported")
+    out = fmha.memory_efficient_attention_forward(q, k, v, op=op)
+    ref = ref_attention(q, k, v)
     assert_allclose(
         out.float(),
         ref,
@@ -756,3 +2029,204 @@ def test_mqa_forward(
         rtol=op.ERROR_RTOL.get(dtype, 1e-5),
     )
 
+
+@parametrize_opFW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv__xs
+def test_empty_tensors_empty_query(
+    opFW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv,
+):
+    query, key, value, attn_bias = create_tensors(
+        *opFW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv,
+        fmt="BMHK",
+    )
+    opFW = opFW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv[0]
+
+    query = query[:, :0]
+    query.requires_grad_(True)
+    key.requires_grad_(True)
+    value.requires_grad_(True)
+    out = xformers.ops.memory_efficient_attention(query, key, value, op=(opFW, None))
+    assert out.shape[1] == 0
+    out.backward(out)
+    # dK/dV should be all zeros
+    assert_allclose(key.grad, torch.zeros_like(key.grad), "key.grad")
+    assert_allclose(value.grad, torch.zeros_like(value.grad), "value.grad")
+
+
+@parametrize_opFW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv__xs
+def test_empty_tensors_empty_kv(
+    opFW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv,
+):
+    query, key, value, attn_bias = create_tensors(
+        *opFW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv,
+        fmt="BMHK",
+    )
+    opFW = opFW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv[0]
+
+    key = key[:, :0]
+    value = value[:, :0]
+    query.requires_grad_(True)
+    key.requires_grad_(True)
+    value.requires_grad_(True)
+    out = xformers.ops.memory_efficient_attention(query, key, value, op=(opFW, None))
+    assert_allclose(out, torch.zeros_like(out), "out")
+    out.backward(out)
+    # dQ should be all zeros
+    assert_allclose(query.grad, torch.zeros_like(query.grad), "query.grad")
+
+
+@parametrize_opFW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv__xs
+def test_empty_tensors_empty_b(
+    opFW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv,
+):
+    query, key, value, attn_bias = create_tensors(
+        *opFW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv,
+        fmt="BMHK",
+    )
+    opFW = opFW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv[0]
+
+    query, key, value = query[:0], key[:0], value[:0]
+    query.requires_grad_(True)
+    key.requires_grad_(True)
+    value.requires_grad_(True)
+    out = xformers.ops.memory_efficient_attention(query, key, value, op=(opFW, None))
+    out.backward(out)
+
+
+def test_local_attn_bias() -> None:
+    mask = (
+        fmha.attn_bias.LocalAttentionFromBottomRightMask(window_left=1, window_right=2)
+        .materialize(shape=(4, 4))
+        .exp()
+    )
+
+    expected = torch.tensor(
+        [[1, 1, 1, 0], [1, 1, 1, 1], [0, 1, 1, 1], [0, 0, 1, 1]], dtype=torch.float32
+    )
+    assert (mask == expected).all().item()
+
+
+@cuda_only
+@pytest.mark.parametrize("cc", [60, 70, 80])
+@pytest.mark.parametrize("maxK", [32, 64, 128, 256])
+@pytest.mark.parametrize("dtype", [torch.float32, torch.float16])
+@pytest.mark.parametrize(
+    "custom_mask_type",
+    [
+        fmha.cutlass._CustomMaskType.NoCustomMask,
+        fmha.cutlass._CustomMaskType.CausalFromTopLeft,
+        fmha.cutlass._CustomMaskType.CausalFromBottomRight,
+    ],
+)
+@pytest.mark.parametrize("window_size", [0, 3, 300])
+@pytest.mark.parametrize(
+    "num_queries,num_keys",
+    [
+        (30, 66),
+        (256, 256),
+        # Edge cases
+        (314, 320),
+        (32, 256),
+        (224, 226),
+        (5, 531),
+        (320, 332),  # for win_size=300
+        # Others
+        (256, 62),
+        (256, 63),
+        (256, 64),
+        (256, 65),
+        (256, 66),
+    ],
+)
+def test_cutlassB_iter_order(
+    dtype,
+    cc: int,
+    maxK: int,
+    num_queries: int,
+    num_keys: int,
+    custom_mask_type,
+    window_size,
+) -> None:
+    """
+    This tests some internals of the cutlassB kernel
+    We test the iteration across blocks of [queries, keys] to ensure
+    that we correctly:
+    * Iterate over all the blocks that should be iterated
+    * Do *not* iterate over blocks that are completely masked out
+    * Correctly compute the number of parallel blocks that will compute
+        the same block of dQ
+    .. and we test this across variable causal masks+local attention combinations
+    """
+    if (
+        window_size > 0
+        and custom_mask_type == fmha.cutlass._CustomMaskType.NoCustomMask
+    ):
+        pytest.skip("LocalAttention is only supported for causal")
+    get_iteration_data = partial(
+        torch.ops.xformers._cutlassB_iteration_data,
+        dtype=dtype,
+        cc=cc,
+        maxK=maxK,
+        num_queries=num_queries,
+        num_keys=num_keys,
+        custom_mask_type=custom_mask_type,
+        window_size=window_size,
+    )
+    bias = torch.zeros([num_queries, num_keys], dtype=torch.float32)
+    if custom_mask_type != fmha.cutlass._CustomMaskType.NoCustomMask:
+        bias = fmha.attn_bias._materialize_causal_mask(
+            (num_queries, num_keys),
+            dtype=torch.float32,
+            device="cpu",
+            window_size=None if window_size == 0 else window_size,
+            from_bottomright=(
+                custom_mask_type == fmha.cutlass._CustomMaskType.CausalFromBottomRight
+            ),
+        )
+
+    block_queries, block_keys = get_iteration_data()[:2]
+    mask_pooled = (
+        F.max_pool2d(bias.unsqueeze(0), (block_queries, block_keys), ceil_mode=True)
+        == 0
+    ).int()[0]
+    attn_computed = torch.zeros_like(mask_pooled)
+    for key_start in range(0, num_keys, block_keys):
+        it = 0
+        new_key_start = key_start
+        new_query_start = get_iteration_data(key_start=key_start)[2]
+        try:
+            expected_first_query = (
+                mask_pooled[:, key_start // block_keys].tolist().index(1)
+                * block_queries
+            )
+            assert (
+                new_query_start == expected_first_query
+            ), f"Wrong first query for K={key_start}: {new_query_start} (expected {expected_first_query})"
+        except ValueError:  # Nothing to compute in this column
+            pass
+
+        while new_key_start == key_start and new_query_start < num_queries:
+            query_start = new_query_start
+            attn_computed[query_start // block_queries, key_start // block_keys] += 1
+            # print(f"Compute [{query_start}, {key_start}]")
+
+            # Is there something to compute here?
+            assert mask_pooled[
+                query_start // block_queries, key_start // block_keys
+            ].item(), "Computing a block that is not needed!"
+            new_query_start, new_key_start = get_iteration_data(
+                key_start=key_start, query_start=query_start
+            )[3:5]
+            it += 1
+            assert it < num_queries, ""
+        assert (attn_computed == mask_pooled)[
+            :, key_start // block_keys
+        ].all(), "some blocks were not computed!"
+
+    # Now check that the number returned by `getNumParallelBlocksForQuery` is correct
+    for query_start in range(0, num_queries, block_queries):
+        num_parallel_blocks = get_iteration_data(
+            query_start=query_start, num_splits_key=num_keys
+        )[5]
+        num_actual = mask_pooled[query_start // block_queries].sum().item()
+        assert num_parallel_blocks == num_actual
+# end of file
diff --git a/tests/test_mqa_forward_ck_tiled.py b/tests/test_mqa_forward_ck_tiled.py
new file mode 100644
index 000000000..e3c1f488c
--- /dev/null
+++ b/tests/test_mqa_forward_ck_tiled.py
@@ -0,0 +1,673 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
+#
+# This source code is licensed under the BSD license found in the
+# LICENSE file in the root directory of this source tree.
+
+import math
+import random
+from typing import List, Optional, Sequence, Tuple, Type, TypeVar
+
+import pytest
+import torch
+from scipy.stats import binomtest
+from torch.utils.checkpoint import checkpoint
+
+import xformers.ops
+from xformers.ops import fmha
+from xformers.ops.fmha.common import AttentionOpBase
+
+from .utils import assert_allclose
+
+torch.backends.cuda.matmul.allow_tf32 = False
+cuda_only = pytest.mark.skipif(not torch.cuda.is_available(), reason="requires CUDA")
+
+_devices = ["cpu", "cuda"] if torch.cuda.is_available() else ["cpu"]
+_types = [torch.float16, torch.bfloat16]
+
+T = TypeVar(
+    "T", Type[fmha.common.AttentionFwOpBase], Type[fmha.common.AttentionBwOpBase]
+)
+
+ALL_FW_OPS: Sequence[Type[fmha.common.AttentionFwOpBase]] = [
+    fmha.ck.FwOp,
+]
+
+ALL_BW_OPS: Sequence[Type[fmha.common.AttentionBwOpBase]] = [
+    fmha.ck.BwOp,
+]
+
+def sample_random_supported_fw(
+    inp: fmha.Inputs, seed: int
+) -> Type[fmha.common.AttentionFwOpBase]:
+    r = random.Random(seed)
+    fw_ops = list(ALL_FW_OPS)
+    r.shuffle(fw_ops)
+    for op in fw_ops:
+        if op.supports(inp):
+            return op
+    raise NotImplementedError(f"Could not find a FW operator for: {inp}")
+
+
+def generate_test_shapes_B_Mq_Mkv_H_K_Kv(op):
+    shapes = []
+    for B in op._TEST_BATCH_SIZES:
+        for Mq in [32, 256]:
+            for Mkv in [32, 64, 256, 1024]:
+                for K in op._TEST_K:
+                    shapes.append((B, Mq, Mkv, 1, K, K))
+        Mq = 256
+        Mkv = 128
+        K = 32
+        H = 1
+        # Weird values of parameters
+        for M in [2, 3, 15, 31, 32, 34, 68, 72, 90, 132, 136]:
+            shapes.append((B, M, Mkv, H, K, K))
+            shapes.append((B, Mq, M, H, K, K))
+        for _K in [1, 2, 3, 31, 34, 36, 38, 40, 64, 80, 160, 256 + 2, 256 + 8, 512]:
+            if _K <= op.SUPPORTED_MAX_K:
+                shapes.append((B, Mq, Mkv, H, _K, _K))
+        # Different value for K / Kv
+        if op.SUPPORTS_DIFFERENT_VALUE_EMBED:
+            for _K in [32, 36, 64, 256 + 8]:
+                shapes.append((B, Mq, Mkv, H, K, _K))
+                shapes.append((B, Mq, Mkv, H, _K, K))
+        # Exotic sizes
+        for K in op._TEST_K:
+            shapes.append((B, 16, 1024, H, K, K))
+            shapes.append((B, 1024, 16, H, K, K))
+        # Some number of heads
+        for H in [3, 5, 12]:
+            shapes.append((max(1, B // H), Mq, Mkv, H, K, K))
+    # Filter-out not supported shapes
+    shapes = [
+        shape
+        for shape in shapes
+        if len(
+            op.shape_not_supported_reasons(
+                Mq=shape[1], Mkv=shape[2], K=shape[4], Kv=shape[5]
+            )
+        )
+        == 0
+    ]
+    # Add some random shapes
+    if op in [
+        fmha.ck.FwOp,
+        fmha.ck.BwOp,
+    ]:
+        K_CHOICES = [8 * i for i in range(1, 256 // 8)]
+        r = random.Random(0)
+        found_count = 0
+        while found_count < 20:
+            B = r.randint(1, 400)
+            Mq = r.randint(1, 500)
+            Mkv = r.randint(1, 500)
+            H = r.randint(2, 11)
+            B = max(B // H, 1)
+            K = r.choice(K_CHOICES)
+            Kv = r.choice(K_CHOICES)
+            if not op.SUPPORTS_DIFFERENT_VALUE_EMBED:
+                Kv = K
+            if len(op.shape_not_supported_reasons(Mq, Mkv, K, Kv)):
+                continue
+            found_count += 1
+            shapes.append((B, Mq, Mkv, H, K, Kv))
+    return shapes
+
+
+def make_id(op, device, dtype, bias_type, *shape):
+    return (
+        f"{op.NAME}-{device}-{str(dtype)}-{bias_type.__name__}"
+        f"-{'-'.join([str(s) for s in shape])}"
+    )
+
+
+def _generate_op_device_dtype_biasT_B_Mq_Mkv_H_K_Kv(
+    ops_list: Sequence[Type[fmha.AttentionOpBase]], max_shapes_per_op: int = 65000
+):
+    r = random.Random(0)
+    combination = []
+    ids = []
+    for op in ops_list:
+        op_count = 0
+        # Sort list of masks, so it's deterministic across runs
+        LIST_MASKS = list(sorted(op.SUPPORTED_ATTN_BIAS_TYPES, key=lambda x: str(x)))
+        for shape in generate_test_shapes_B_Mq_Mkv_H_K_Kv(op):
+            has_one = False
+            for device in _devices:
+                if device not in op.SUPPORTED_DEVICES:
+                    continue
+                for dtype in op.SUPPORTED_DTYPES:
+                    bias_type = r.choice(LIST_MASKS)
+                    # Avoid using too much memory
+                    if bias_type not in [
+                        type(None),
+                        fmha.attn_bias.LowerTriangularMask,
+                    ]:
+                        B, Mq, Mkv, H, K, Kv = shape
+                        B = min(B, 12)
+
+                        if (
+                            bias_type
+                            is fmha.attn_bias.BlockDiagonalCausalFromBottomRightMask
+                        ):
+                            Mq, Mkv = min(Mkv, Mq), max(Mkv, Mq) + 2
+                        elif (
+                            bias_type
+                            is fmha.attn_bias.BlockDiagonalCausalWithOffsetPaddedKeysMask
+                        ):
+                            Mq, Mkv = min(Mkv, Mq), max(Mkv, Mq)
+                        shape = (B, Mq, Mkv, H, K, Kv)
+                    combination.append((op, device, dtype, bias_type, *shape))
+                    ids.append(
+                        f"{op.NAME}-{device}-{str(dtype)}-{bias_type.__name__}"
+                        f"-{'-'.join([str(s) for s in shape])}"
+                    )
+                    has_one = True
+            if has_one:
+                op_count += 1
+            if op_count > max_shapes_per_op:
+                break
+        # Some specific shapes for which we want to run without any mask
+        bias_type = type(None)
+        for shape in (
+            # Some strides/dims don't fit on an uint16
+            (1, 128, 128, 300, 128, 128),
+            (13, 1, 67, 200, 8, 8),
+            (1, 1 + 2**16, 4, 1, 8, 8),
+            (1, 4, 1 + 2**16, 1, 8, 8),
+            # TODO: Some strides don't fit on an uint32
+            # Crashes on Flash, Errors on Cutlass
+            # (1, 1, 64000, 300, 128, 128)
+        ):
+            for device in _devices:
+                if device not in op.SUPPORTED_DEVICES:
+                    continue
+                for dtype in op.SUPPORTED_DTYPES:
+                    combination.append((op, device, dtype, bias_type, *shape))
+    return {
+        "argvalues": combination,
+        "ids": [make_id(*c) for c in combination],
+    }
+
+
+parametrize_opFW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv = pytest.mark.parametrize(
+    "opFW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv",
+    **_generate_op_device_dtype_biasT_B_Mq_Mkv_H_K_Kv(ALL_FW_OPS),
+)
+parametrize_opFW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv__xs = pytest.mark.parametrize(
+    "opFW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv",
+    **_generate_op_device_dtype_biasT_B_Mq_Mkv_H_K_Kv(ALL_FW_OPS, max_shapes_per_op=1),
+)
+parametrize_opBW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv = pytest.mark.parametrize(
+    "opBW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv",
+    **_generate_op_device_dtype_biasT_B_Mq_Mkv_H_K_Kv(ALL_BW_OPS),
+)
+parametrize_opBW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv__xs = pytest.mark.parametrize(
+    "opBW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv",
+    **_generate_op_device_dtype_biasT_B_Mq_Mkv_H_K_Kv(ALL_BW_OPS, max_shapes_per_op=1),
+)
+
+def ref_attention(q, k, v, attn_bias=None, drop_mask=None, p=0.0, scale=None, dtype=None):
+    if q.ndim == 4:
+        B, M, Hq, K = q.shape
+        _, N, Hkv, Kv = v.shape
+        nhead_ratio_qk = Hq // Hkv
+
+        def attn_bias_head(head: int):
+            if isinstance(attn_bias, torch.Tensor):
+                assert attn_bias.ndim == 4
+                _, H, _, _ = attn_bias.shape        
+                assert H == Hq
+                bias_bghmn = attn_bias.reshape(B, Hkv, nhead_ratio_qk, M, N)
+                return bias_bghmn[:, :, head]
+            if isinstance(attn_bias, fmha.attn_bias.LowerTriangularMaskWithTensorBias):
+                assert attn_bias._bias.ndim == 4
+                _, H, _, _ = attn_bias._bias.shape        
+                assert H == Hq
+                bias_bghmn = attn_bias._bias.reshape(B, Hkv, nhead_ratio_qk, M, N)
+
+                return fmha.attn_bias.LowerTriangularMaskWithTensorBias(
+                    bias_bghmn[:, :, head]
+                )
+            return attn_bias
+
+        q_bmghk = q.reshape((B, M, Hkv, nhead_ratio_qk, K))
+
+        return torch.stack(
+            [
+                ref_attention_bmhk(
+                    q_bmghk[:, :, :, h], k, v, attn_bias=attn_bias_head(h), dtype=dtype
+                )
+                for h in range(q_bmghk.shape[3])
+            ],
+            dim=3,
+        ).reshape((B, M, Hq, Kv))
+     
+    assert q.ndim == 3
+    if dtype is None:
+        dtype = torch.float32
+    q = q.to(dtype=dtype)
+    k = k.to(dtype=dtype)
+    v = v.to(dtype=dtype)
+
+    scale = scale if scale is not None else (q.shape[-1] ** -0.5)
+    q = q * scale
+
+    attn = q @ k.transpose(-2, -1)
+    if attn_bias is not None:
+        if isinstance(attn_bias, xformers.ops.AttentionBias):
+            # Always create in B,H,Mq,Mk format
+            attn_bias_tensor = attn_bias.materialize(
+                (q.shape[0], 1, q.shape[1], k.shape[1]),
+                device=q.device,
+                dtype=dtype,
+            )
+        else:
+            attn_bias_tensor = attn_bias.to(dtype=dtype)
+        if attn_bias_tensor.ndim == 4:
+            assert q.shape[0] == attn_bias_tensor.shape[0] * attn_bias_tensor.shape[1]
+            attn_bias_tensor = attn_bias_tensor.reshape(
+                [-1, *attn_bias_tensor.shape[2:]]
+            )
+        attn = attn + attn_bias_tensor
+    attn = attn.softmax(-1)
+    if drop_mask is not None:
+        attn = attn * (drop_mask / (1 - p))
+    return attn @ v
+
+
+def ref_attention_bmhk(q, k, v, attn_bias, scale=None, dtype=None) -> torch.Tensor:
+    assert q.ndim == 4
+
+    def T(t):
+        return t.permute((0, 2, 1, 3)).reshape(
+            [t.shape[0] * t.shape[2], t.shape[1], t.shape[3]]
+        )
+
+    if isinstance(attn_bias, xformers.ops.AttentionBias):
+        attn_bias = attn_bias.materialize(
+            (q.shape[0], q.shape[2], q.shape[1], k.shape[1]),
+            device=q.device,
+            dtype=torch.float32,
+        ).reshape([q.shape[0] * q.shape[2], q.shape[1], k.shape[1]])
+    out = ref_attention(T(q), T(k), T(v), attn_bias, scale=scale, dtype=dtype)
+    out = out.reshape([q.shape[0], q.shape[2], q.shape[1], v.shape[3]])
+    return out.permute((0, 2, 1, 3))
+
+
+def _rand_seqlens(
+    r: random.Random,
+    bs: int,
+    q_len: int,
+    kv_len: int,
+    more_keys_than_queries_per_block: bool,
+) -> Tuple[Sequence[int], Sequence[int]]:
+    """
+    Generates lists of lengths of query blocks and corresponding key blocks.
+    The total number of queries will be bs * q_len and the
+    total number of keys will be bs * kv_len.
+    """
+    if more_keys_than_queries_per_block:
+        assert kv_len >= q_len
+    q_len *= bs
+    kv_len *= bs
+    seqlens_q: List[int] = []
+    seqlens_k: List[int] = []
+
+    step_q = [max(1, q_len // 10), max(2, q_len // 2)]
+    step_k = [max(1, kv_len // 10), max(2, kv_len // 2)]
+    while sum(seqlens_q) < q_len and sum(seqlens_k) < kv_len:
+        num_queries = r.randrange(*step_q)
+        seqlens_q.append(num_queries)
+
+        if more_keys_than_queries_per_block:
+            # Must select at least `num_queries` keys
+            # But also leave enough keys for later
+            keys_left = kv_len - sum(seqlens_k, 0)
+            queries_left = q_len - sum(seqlens_q[:-1], 0)
+            assert keys_left >= queries_left
+            seqlens_k.append(num_queries + r.randrange(0, keys_left - queries_left))
+        else:
+            seqlens_k.append(r.randrange(*step_k))
+    seqlens_q[-1] = q_len - sum(seqlens_q[:-1])
+    seqlens_k[-1] = kv_len - sum(seqlens_k[:-1])
+    return seqlens_q, seqlens_k
+
+
+def _rand_partition(r: random.Random, total: int, n: int) -> List[int]:
+    # returns list of n nonnegative integers summing to total
+    idx = {0, total}
+    while len(idx) < n + 1:
+        idx.add(r.randint(1, total - 1))
+    s = sorted(idx)
+    return [e - b for b, e in zip(s[:-1], s[1:])]
+
+
+def _rand_maxed_partition(
+    r: random.Random, total: int, n: int, mx: int, positive: bool = True
+) -> List[int]:
+    # returns list of n nonnegative integers less than mx summing to total
+    # NB: This is unfortunately biased towards evenly-split bins.
+    # If `positive`, outputs are positive
+    if positive:
+        total -= n
+        mx -= 1
+    idxs = r.sample(range(n * mx), total)
+    y = torch.zeros(n, mx, dtype=torch.int32)
+    y.flatten()[idxs] = 1
+    z = y.sum(1)
+    if positive:
+        z += 1
+    return z.tolist()
+
+
+def _rand_seqlens_padded_k(
+    r: random.Random, bs: int, q_len: int, kv_len: int
+) -> Tuple[Sequence[int], Sequence[int]]:
+    # This is for BlockDiagonalCausalWithOffsetPaddedKeysMask.
+    # we need q_seqlens and k_seqlens to be of len bsz.
+    # For each "batch element" there must be more keys than queries
+    # because this bias type is "bottom right" and so any extra queries
+    # will attend to nothing and have undefined result.
+    # In addition every element of k_seqlens must be <= kv_len
+    if q_len > kv_len:
+        raise ValueError("need more keys than values")
+    if q_len == kv_len:
+        # all key slots are needed so we cannot have padding
+        q_seqlens = k_seqlens = [kv_len] * bs
+    else:
+        q_seqlens = _rand_maxed_partition(r, q_len * bs, bs, kv_len)
+        k_seqlens = [r.randint(i, kv_len) for i in q_seqlens]
+    return q_seqlens, k_seqlens
+
+
+def _create_aligned_bias(B: int, H: int, Mq: int, Mkv: int, **kwargs) -> torch.Tensor:
+    align_to = 8
+    return (
+        torch.randn(
+            (
+                B,
+                H,
+                Mq,
+                align_to * ((Mkv + align_to - 1) // align_to),
+            ),
+            **kwargs,
+        )
+        * 3
+    )[:, :, :, :Mkv]
+
+
+def create_attn_bias(
+    bias_type,
+    batch_size: int,
+    num_heads: int,
+    q_len: int,
+    kv_len: int,
+    device,
+    dtype,
+    requires_grad: bool,
+    fmt: str,
+    op: Type[AttentionOpBase],
+):
+    if bias_type is None or isinstance(None, bias_type):
+        return None
+    r = random.Random("-".join(map(str, [batch_size, q_len, kv_len, dtype, fmt])))
+    if bias_type is torch.Tensor:
+        if fmt == "BMK":
+            batch_size *= num_heads
+            num_heads = 1
+        # `small_k` only supports an expanded 1d bias
+        if op in [fmha.small_k.FwOp, fmha.small_k.BwOp]:
+            attn_bias = (
+                torch.randn(
+                    (batch_size, num_heads, 1, kv_len), device=device, dtype=dtype
+                )
+                * 3
+            )
+            attn_bias = attn_bias.expand(batch_size, num_heads, q_len, kv_len)
+        else:
+            attn_bias = _create_aligned_bias(
+                batch_size,
+                num_heads,
+                q_len,
+                kv_len,
+                device=device,
+                dtype=dtype,
+            )
+            # ToDo: need a fix in ck-flashAttn to avoid divided-by-zero when all-(-inf) occurred
+            #       with the data read by one-thread
+            # make sure it also works if the first columns are partially masked out
+            ## attn_bias[0, 0, q_len - 1 :, : num_heads - 2] = -math.inf
+
+        if requires_grad:
+            attn_bias.requires_grad_(True)
+        if fmt == "BMK":
+            attn_bias = attn_bias[:, 0]
+        return attn_bias
+    if bias_type is fmha.attn_bias.LowerTriangularMask:
+        return fmha.attn_bias.LowerTriangularMask()
+    if bias_type is fmha.attn_bias.LowerTriangularMaskWithTensorBias:
+        attn_bias = _create_aligned_bias(
+            batch_size,
+            num_heads,
+            q_len,
+            kv_len,
+            device=device,
+            dtype=dtype,
+        )
+        if requires_grad:
+            attn_bias.requires_grad_(True)
+        return fmha.attn_bias.LowerTriangularMaskWithTensorBias(attn_bias)
+    if bias_type in [
+        fmha.attn_bias.BlockDiagonalMask,
+        fmha.attn_bias.BlockDiagonalCausalMask,
+        fmha.attn_bias.BlockDiagonalCausalFromBottomRightMask,
+    ]:
+        # This bias is not supported in BMK format
+        assert fmt == "BMHK"
+        block_diag = fmha.attn_bias.BlockDiagonalMask.from_seqlens(
+            *_rand_seqlens(
+                r,
+                batch_size,
+                q_len,
+                kv_len,
+                more_keys_than_queries_per_block=bias_type
+                is fmha.attn_bias.BlockDiagonalCausalFromBottomRightMask,
+            )
+        )
+        if bias_type is fmha.attn_bias.BlockDiagonalCausalMask:
+            block_diag = block_diag.make_causal()
+        if bias_type is fmha.attn_bias.BlockDiagonalCausalFromBottomRightMask:
+            block_diag = block_diag.make_causal_from_bottomright()
+        return block_diag
+    if bias_type == fmha.attn_bias.BlockDiagonalCausalWithOffsetPaddedKeysMask:
+        assert fmt == "BMHK"
+        q, k = _rand_seqlens_padded_k(r, batch_size, q_len, kv_len)
+        g_block_diag = (
+            fmha.attn_bias.BlockDiagonalCausalWithOffsetPaddedKeysMask.from_seqlens(
+                q_seqlen=q,
+                kv_padding=kv_len,
+                kv_seqlen=k,
+            )
+        )
+        return g_block_diag
+
+    assert False, f"Unsupported bias type: {bias_type}"
+
+
+def get_bias_grad(attn_bias, clear: bool = False) -> Optional[torch.Tensor]:
+    tensor_with_grad: Optional[torch.Tensor] = None
+    if isinstance(attn_bias, torch.Tensor):
+        tensor_with_grad = attn_bias
+    if isinstance(attn_bias, fmha.attn_bias.LowerTriangularMaskWithTensorBias):
+        tensor_with_grad = attn_bias._bias
+    if tensor_with_grad is not None:
+        grad = tensor_with_grad.grad
+        if clear:
+            tensor_with_grad.grad = None
+        return grad
+    return None
+
+
+def create_tensors(
+    op: Type[AttentionOpBase],
+    device,
+    dtype,
+    attn_bias_type,
+    B,
+    q_len,
+    kv_len,
+    h,
+    k,
+    kv,
+    *,
+    attn_bias_requires_grad: bool = False,
+    fmt: str = "BMK",
+):
+    torch.manual_seed(B * q_len + kv_len * k + kv)
+    scale = 3
+    if fmt == "BMK":
+        query = torch.randn((B * h, q_len, k), device=device, dtype=dtype).mul_(scale)
+        key = torch.randn((B * h, kv_len, k), device=device, dtype=dtype).mul_(scale)
+        value = torch.randn((B * h, kv_len, kv), device=device, dtype=dtype).mul_(scale)
+    else:
+        assert fmt == "BMHK"
+        query = torch.randn((B, q_len, h, k), device=device, dtype=dtype).mul_(scale)
+        key = torch.randn((B, kv_len, h, k), device=device, dtype=dtype).mul_(scale)
+        value = torch.randn((B, kv_len, h, kv), device=device, dtype=dtype).mul_(scale)
+
+    if fmt == "BMK" and not fmha.common._is_bias_type_supported_in_BMK(attn_bias_type):
+        attn_bias_type = None
+    attn_bias = None
+    if attn_bias_type is not None:
+        attn_bias = create_attn_bias(
+            attn_bias_type,
+            batch_size=B,
+            num_heads=h,
+            q_len=q_len,
+            kv_len=kv_len,
+            dtype=dtype,
+            device=device,
+            requires_grad=attn_bias_requires_grad,
+            fmt=fmt,
+            op=op,
+        )
+        if isinstance(
+            attn_bias,
+            (
+                fmha.attn_bias.BlockDiagonalMask,
+                fmha.attn_bias.BlockDiagonalCausalWithOffsetPaddedKeysMask,
+            ),
+        ):
+            query, key, value = [
+                x.reshape([1, -1, *x.shape[2:]]) for x in [query, key, value]
+            ]
+
+    inputs = fmha.Inputs(query=query, key=key, value=value, attn_bias=attn_bias)
+    reasons = op.not_supported_reasons(inputs)
+    if reasons:
+        err_msg = f"{op.NAME}: unsupported ({'/'.join(reasons)})"
+        # Ensure we free memory to avoid OOMs
+        del query, key, value, attn_bias, inputs
+        pytest.skip(err_msg)
+    return query, key, value, attn_bias
+
+
+def bmhk2bmk(tensor) -> torch.Tensor:
+    return (
+        tensor.permute((0, 2, 1, 3))
+        .contiguous()
+        .view([tensor.shape[0] * tensor.shape[2], tensor.shape[1], tensor.shape[3]])
+    )
+
+
+def bmk2bmhk(tensor, num_heads: int) -> torch.Tensor:
+    return tensor.reshape([-1, num_heads, tensor.shape[1], tensor.shape[2]]).permute(
+        (0, 2, 1, 3)
+    )
+
+@pytest.mark.parametrize("hdim_k,hdim_v", [(64, 64), (128, 128)])
+@pytest.mark.parametrize("nhead_q,nhead_kv", [(8, 1), (8, 2), (12, 4), (4, 4)])
+@pytest.mark.parametrize("seqlen_q,seqlen_kv", [(100, 128), (128, 100), (200, 1000), (400, 300)])
+@pytest.mark.parametrize("batches", [100, 64, 1])
+@pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16])
+@pytest.mark.parametrize("attn_bias_type", [type(None), torch.Tensor, fmha.attn_bias.LowerTriangularMask])
+@pytest.mark.parametrize("op", [fmha.ck.FwOp])
+def test_mqa_forward(
+    op,
+    attn_bias_type,
+    dtype, 
+    batches: int, 
+    seqlen_kv: int, 
+    seqlen_q: int, 
+    nhead_kv: int, 
+    nhead_q: int, 
+    hdim_v: int, 
+    hdim_k: int, 
+):
+    B = batches
+    M = seqlen_q
+    N = seqlen_kv
+    Hq = nhead_q
+    Hkv = nhead_kv
+    K = hdim_k
+    Kv = hdim_v
+
+    print("Hq=", Hq, "Hkv=", Hkv)
+
+    device = torch.device("cuda")
+
+    if not (K == Kv and (Kv == 64 or Kv == 128)):
+        pytest.skip("only head-dim size 64 or 128 supported by ck-tiled!")
+
+    if Kv > 128:
+        pytest.skip("kv > 128 is not supported by CK-FlashAttention")
+
+    scale = 3
+    query = torch.randn((B, M, Hq, K), device=device, dtype=dtype).mul_(scale)
+    key = torch.randn((B, N, Hkv, K), device=device, dtype=dtype).mul_(scale)
+    value = torch.randn((B, N, Hkv, Kv), device=device, dtype=dtype).mul_(scale)
+
+    attn_bias = None
+    if attn_bias_type is not None:
+        attn_bias = create_attn_bias(
+            attn_bias_type,
+            batch_size=B,
+            num_heads=Hq,
+            q_len=M,
+            kv_len=N,
+            dtype=dtype,
+            device=device,
+            requires_grad=False,
+            fmt="BMHK",
+            op=op,
+        )
+
+    inputs = fmha.Inputs(query=query, key=key, value=value, attn_bias=attn_bias)
+    reasons = op.not_supported_reasons(inputs)
+    if reasons:
+        err_msg = f"{op.NAME}: unsupported ({'/'.join(reasons)})"
+        # Ensure we free memory to avoid OOMs
+        del query, key, value, attn_bias, inputs
+
+    out = xformers.ops.memory_efficient_attention_forward(
+        query, key, value, attn_bias, op=op
+    )
+    assert not out.isnan().any(), ("Output has NaNs", attn_bias)
+    out2 = xformers.ops.memory_efficient_attention_forward(
+        query, key, value, attn_bias, op=op
+    )
+    assert torch.allclose(out, out2, atol=0.0, rtol=0.0), (
+        "Non-deterministic behavior",
+        attn_bias,
+    )
+
+    ref = ref_attention(query, key, value, attn_bias)
+    assert out.shape == ref.shape, out.shape
+    assert_allclose(
+        out.float(),
+        ref,
+        atol=op.ERROR_ATOL[dtype],
+        rtol=op.ERROR_RTOL.get(dtype, 1e-5),
+    )
+
diff --git a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_forward_kernel.h b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_forward_kernel.h
index 94b36c235..856e64651 100644
--- a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_forward_kernel.h
+++ b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_forward_kernel.h
@@ -522,24 +522,21 @@ struct FmhaFwdKernel
                 {
                     if(kargs.mask_type == CausalMaskType::MaskDisabled)
                     {
-                        ck::index_t lr_size = kargs.window_size / 2;
+                        ck::index_t left_size  = kargs.window_size / 2;
+                        ck::index_t right_size = kargs.window_size - 1 - left_size;
 
                         res = ck::make_generic_attention_mask_coordinates_from_lr_window(
-                            lr_size, lr_size, kargs.seqlen_q, kargs.seqlen_k);
+                            left_size, right_size, kargs.seqlen_q, kargs.seqlen_k);
                     }
                     else if(kargs.mask_type == CausalMaskType::MaskUpperTriangleFromTopLeft)
                     {
-                        ck::index_t lr_size = kargs.window_size / 2;
-
                         res = ck::make_generic_attention_mask_coordinates_from_lr_window(
-                            lr_size, 0, kargs.seqlen_q, kargs.seqlen_k, true);
+                            kargs.window_size - 1, 0, kargs.seqlen_q, kargs.seqlen_k, true);
                     }
                     else if(kargs.mask_type == CausalMaskType::MaskUpperTriangleFromBottomRight)
                     {
-                        ck::index_t lr_size = kargs.window_size / 2;
-
                         res = ck::make_generic_attention_mask_coordinates_from_lr_window(
-                            lr_size, 0, kargs.seqlen_q, kargs.seqlen_k, false);
+                            kargs.window_size - 1, 0, kargs.seqlen_q, kargs.seqlen_k, false);
                     }
                 }
                 else
diff --git a/xformers/ops/fmha/ck.py b/xformers/ops/fmha/ck.py
index 3cb4ed014..67e71ccd6 100644
--- a/xformers/ops/fmha/ck.py
+++ b/xformers/ops/fmha/ck.py
@@ -4,8 +4,10 @@
 # LICENSE file in the root directory of this source tree.
 
 
+from dataclasses import replace
 from enum import Enum
-from typing import Any, List, Mapping, Optional, Set, Tuple, Union
+from functools import partial
+from typing import Any, List, Optional, Set, Tuple, Union, Mapping
 
 import torch
 
@@ -13,9 +15,13 @@
 from . import attn_bias
 from .attn_bias import (
     AttentionBias,
+    BlockDiagonalCausalLocalAttentionFromBottomRightMask,
+    BlockDiagonalCausalLocalAttentionMask,
     BlockDiagonalCausalMask,
     BlockDiagonalCausalWithOffsetPaddedKeysMask,
     BlockDiagonalMask,
+    LowerTriangularFromBottomRightLocalAttentionMask,
+    LowerTriangularFromBottomRightMask,
     LowerTriangularMask,
     LowerTriangularMaskWithTensorBias,
 )
@@ -25,29 +31,34 @@
     Context,
     Gradients,
     Inputs,
+    _attn_bias_apply,
     check_lastdim_alignment_stride1,
 )
 
 def _minimum_gemm_alignment(inp: Inputs) -> int:
     return 1
 
-
 def _get_seqlen_info(
     inp: Inputs,
-) -> Tuple[Optional[torch.Tensor], Optional[torch.Tensor]]:
+) -> Tuple[Optional[torch.Tensor], Optional[torch.Tensor], int, int]:
     attn_bias = inp.attn_bias
     if isinstance(
         attn_bias, (BlockDiagonalMask, BlockDiagonalCausalWithOffsetPaddedKeysMask)
     ):
+        ##attn_bias.k_seqinfo.to(inp.query.device)
+        ##attn_bias.q_seqinfo.to(inp.query.device)
         seqstart_k = attn_bias.k_seqinfo.seqstart
         seqstart_q = attn_bias.q_seqinfo.seqstart
         max_seqlen_q = attn_bias.q_seqinfo.max_seqlen
+        ##max_seqlen_k = attn_bias.k_seqinfo.max_seqlen
     else:
         seqstart_k = None
         seqstart_q = None
         max_seqlen_q = -1
+        ##max_seqlen_k = -1
+
+    return seqstart_k, seqstart_q, max_seqlen_q,
 
-    return seqstart_k, seqstart_q, max_seqlen_q
 
 def _get_tensor_bias(
     attn_bias: Optional[Union[torch.Tensor, AttentionBias]]
@@ -100,7 +111,6 @@ def _check_large_shapes(reasons: List[str], inp: Inputs) -> None:
             "Input is too large: product of first two dimensions of q/k/v must be < 2**20"
         )
 
-
 class _CustomMaskType(int, Enum):
     """
     (Matches CustomMaskType in C++.)
@@ -117,14 +127,18 @@ def _custom_mask_type(bias: Optional[Union[torch.Tensor, AttentionBias]]) -> int
         (
             LowerTriangularMask,
             BlockDiagonalCausalMask,
+            BlockDiagonalCausalLocalAttentionMask,
         ),
     ):
         return int(_CustomMaskType.CausalFromTopLeft)
     if isinstance(
         bias,
         (
+            LowerTriangularFromBottomRightMask,
+            LowerTriangularFromBottomRightLocalAttentionMask,
             attn_bias.BlockDiagonalCausalFromBottomRightMask,
             BlockDiagonalCausalWithOffsetPaddedKeysMask,
+            BlockDiagonalCausalLocalAttentionFromBottomRightMask,
         ),
     ):
         return int(_CustomMaskType.CausalFromBottomRight)
@@ -134,26 +148,48 @@ def _custom_mask_type(bias: Optional[Union[torch.Tensor, AttentionBias]]) -> int
 @register_operator
 class FwOp(AttentionFwOpBase):
     """xFormers' MHA kernel based on Composable Kernel.
-    Supports AMD MI 200 and MI 300 GPUs
     """
 
+    ### ck_check_op is temporarily used to check ck-tiled availability
+    ck_check_op = get_xformers_operator("is_ck_tiled_used")
+    use_ck_tiled = ck_check_op()
+ 
     OPERATOR = get_xformers_operator("efficient_attention_forward_ck")
     SUPPORTED_DEVICES: Set[str] = {"cuda"}
     SUPPORTED_DTYPES: Set[torch.dtype] = {torch.half, torch.bfloat16}
     SUPPORTED_MAX_K = 65536
-    SUPPORTED_ATTN_BIAS_TYPES: Set[Any] = {
-        type(None),
-        torch.Tensor,
-        LowerTriangularMask,
-        LowerTriangularMaskWithTensorBias,
-        BlockDiagonalMask,
-        BlockDiagonalCausalMask,
-        BlockDiagonalCausalWithOffsetPaddedKeysMask,
-        attn_bias.BlockDiagonalCausalFromBottomRightMask,
-    }
+    
+    if use_ck_tiled: 
+        SUPPORTED_ATTN_BIAS_TYPES: Set[Any] = {
+            type(None),
+            torch.Tensor,
+            LowerTriangularMask,
+            LowerTriangularFromBottomRightMask,
+            LowerTriangularFromBottomRightLocalAttentionMask,
+            LowerTriangularMaskWithTensorBias,
+            BlockDiagonalMask,
+            BlockDiagonalCausalMask,
+            BlockDiagonalCausalWithOffsetPaddedKeysMask,
+            attn_bias.BlockDiagonalCausalFromBottomRightMask,
+            attn_bias.BlockDiagonalCausalLocalAttentionMask,
+            BlockDiagonalCausalLocalAttentionFromBottomRightMask,
+        }
+    else:
+        SUPPORTED_ATTN_BIAS_TYPES: Set[Any] = {
+            type(None),
+            torch.Tensor,
+            LowerTriangularMask,
+            LowerTriangularMaskWithTensorBias,
+            BlockDiagonalMask,
+            BlockDiagonalCausalMask,
+            BlockDiagonalCausalWithOffsetPaddedKeysMask,
+            attn_bias.BlockDiagonalCausalFromBottomRightMask,
+         }
+
     SUPPORTS_DROPOUT = True
     SUPPORTS_CUSTOM_SCALE = True
     SUPPORTS_DIFFERENT_VALUE_EMBED = True
+    SUPPORTS_BMGHK = True
     NAME = "ckF"
 
     ERROR_ATOL: Mapping[torch.dtype, float] = {
@@ -176,6 +212,70 @@ class FwOp(AttentionFwOpBase):
     @classmethod
     def apply(
         cls, inp: Inputs, needs_gradient: bool
+    ) -> Tuple[torch.Tensor, Optional[Context]]:
+        if type(inp.attn_bias) not in FwOp.SUPPORTED_ATTN_BIAS_TYPES:
+            raise NotImplementedError("Unsupported attn_bias type")
+        if inp.query.ndim in [3, 4]:
+            return cls.apply_bmhk(inp, needs_gradient=needs_gradient)
+        assert inp.query.ndim == 5, f"query has shape {inp.query.shape}"
+        ctx: Optional[Context] = None
+        # XXX: Hackfix for BMGHK with H=1
+        # In that case we don't want to run G different streams because it adds
+        # some overhead
+        if inp.query.ndim == 5 and inp.query.shape[3] == 1:
+            slice_op = partial(torch.squeeze, dim=3)
+            inp = replace(
+                inp,
+                query=slice_op(inp.query),
+                key=slice_op(inp.key),
+                value=slice_op(inp.value),
+                attn_bias=_attn_bias_apply(
+                    inp.attn_bias, partial(torch.squeeze, dim=2)
+                ),
+            )
+            out, ctx = cls.apply_bmhk(inp, needs_gradient=needs_gradient)
+            out = out.unsqueeze(3)
+            if ctx is not None:
+                ctx = replace(ctx, lse=ctx.lse.unsqueeze(1), out=out)
+            return out, ctx
+
+        # Workaround until this is properly implemented in C++
+        # run each head group in a different stream
+        n_groups = inp.key.shape[2]
+        main_stream = torch.cuda.current_stream()
+        streams = [main_stream] + [
+            torch.cuda.Stream(device=inp.query.device) for _ in range(n_groups - 1)
+        ]
+        outs = []
+        for group, stream in enumerate(streams):
+            stream.wait_stream(main_stream)
+            with torch.cuda.stream(stream):
+                query = inp.query[:, :, group]
+                key = inp.key[:, :, group]
+                value = inp.value[:, :, group]
+                bias = _attn_bias_apply(
+                    inp.attn_bias, partial(torch.select, dim=1, index=group)
+                )
+                outs.append(
+                    cls.apply_bmhk(
+                        replace(inp, query=query, key=key, value=value, attn_bias=bias),
+                        needs_gradient=needs_gradient,
+                    )
+                )
+        for s in streams[1:]:
+            main_stream.wait_stream(s)
+        out = torch.stack([o[0] for o in outs], dim=2)
+        if needs_gradient:
+            ctx = Context(
+                out=out,
+                lse=torch.stack([o[1].lse for o in outs], dim=1),  # type: ignore
+                op_bw=outs[0][1].op_bw,  # type: ignore
+            )
+        return out, ctx
+
+    @classmethod
+    def apply_bmhk(
+        cls, inp: Inputs, needs_gradient: bool
     ) -> Tuple[torch.Tensor, Optional[Context]]:
         if type(inp.attn_bias) not in FwOp.SUPPORTED_ATTN_BIAS_TYPES:
             raise NotImplementedError("Unsupported attn_bias type")
@@ -195,8 +295,18 @@ def apply(
             seqlen_k=inp.attn_bias.k_seqinfo.seqlen_cpu
             if isinstance(inp.attn_bias, BlockDiagonalCausalWithOffsetPaddedKeysMask)
             else None,
-            window_size=0,
+            window_size=inp.attn_bias._window_size
+            if isinstance(
+                inp.attn_bias,
+                (
+                    BlockDiagonalCausalLocalAttentionMask,
+                    BlockDiagonalCausalLocalAttentionFromBottomRightMask,
+                    LowerTriangularFromBottomRightLocalAttentionMask,
+                ),
+            )
+            else None,
         )
+
         ctx: Optional[Context] = None
         if needs_gradient:
             ctx = Context(
@@ -233,6 +343,7 @@ def operator_flop(
         b,
         seqstart_q,
         seqstart_k,
+        max_seqlen_q_,
         compute_lse,
         custom_mask_type,
         *a,
@@ -259,11 +370,16 @@ class BwOp(AttentionBwOpBase):
         type(None),
         torch.Tensor,
         LowerTriangularMask,
+        LowerTriangularFromBottomRightMask,
+        # TODO: Still some infs/nans in the BW pass for
+        # local + causal
+        # LowerTriangularFromBottomRightLocalAttentionMask,
         # TODO: Fix handling of gradient through the fMHA autograd function
         # LowerTriangularMaskWithTensorBias,
         BlockDiagonalMask,
         BlockDiagonalCausalMask,
         attn_bias.BlockDiagonalCausalFromBottomRightMask,
+        attn_bias.BlockDiagonalCausalLocalAttentionMask,
     }
     SUPPORTS_ATTN_BIAS_GRAD = True
     SUPPORTS_DROPOUT = FwOp.SUPPORTS_DROPOUT
@@ -271,14 +387,6 @@ class BwOp(AttentionBwOpBase):
     SUPPORTS_DIFFERENT_VALUE_EMBED = FwOp.SUPPORTS_DIFFERENT_VALUE_EMBED
     NAME = "ckB"
 
-    ERROR_ATOL: Mapping[torch.dtype, float] = {
-        torch.float: 5e-4,
-        # increased from 9e-2, more opportunities for numerical errors when bias is
-        # used, noticed in gK on SM80
-        torch.half: 1e-1,
-        torch.bfloat16: 7e-1,
-    }
-
     _TEST_K: List[int] = [
         32,  # 64x64 kernel
         128,  # 64x128/128x128 kernel
@@ -323,7 +431,7 @@ def apply(cls, ctx: Context, inp: Inputs, grad: torch.Tensor) -> Gradients:
         if type(inp.attn_bias) not in BwOp.SUPPORTED_ATTN_BIAS_TYPES:
             raise NotImplementedError("Unsupported attn_bias type")
 
-        seqstart_k, seqstart_q, max_seqlen_q = _get_seqlen_info(inp)
+        seqstart_k, seqstart_q, max_seqlen_q, max_seqlen_k = _get_seqlen_info(inp)
         dtype = inp.query.dtype
 
         rng_seed = rng_offset = 0
@@ -361,6 +469,7 @@ def apply(cls, ctx: Context, inp: Inputs, grad: torch.Tensor) -> Gradients:
             custom_mask_type=_custom_mask_type(inp.attn_bias),
             scale=inp.scale,
         )
+        
 
         # c++/CUDA implementation returns an uninitialized tensor if bias doesn't
         # require grad
@@ -382,6 +491,8 @@ def operator_flop(
         b,
         cu_seqlens_q,
         cu_seqlens_k,
+        max_seqlen_q,
+        max_seqlen_k,
         logsumexp,
         output,
         dropout_p,

From 04cf84bfdb840a0241cd3bd1e6bfe46b742b0104 Mon Sep 17 00:00:00 2001
From: Qianfeng Zhang <Qianfeng.Zhang@amd.com>
Date: Wed, 10 Jan 2024 17:18:50 +0000
Subject: [PATCH 339/641] Enable support of attn-bias types with LocalAttention

---
 tests/test_forward_ck_tiled.py                | 2100 ++++++++++++++---
 tests/test_mqa_forward_ck_tiled.py            |  673 ++++++
 .../hip_fmha/ck_tiled_fmha_forward_kernel.h   |   13 +-
 xformers/ops/fmha/ck.py                       |  163 +-
 4 files changed, 2602 insertions(+), 347 deletions(-)
 create mode 100644 tests/test_mqa_forward_ck_tiled.py

diff --git a/tests/test_forward_ck_tiled.py b/tests/test_forward_ck_tiled.py
index e2d6abc6f..a0685d88e 100644
--- a/tests/test_forward_ck_tiled.py
+++ b/tests/test_forward_ck_tiled.py
@@ -5,22 +5,26 @@
 
 import math
 import random
+from functools import partial
 from typing import List, Optional, Sequence, Tuple, Type, TypeVar
 
 import pytest
 import torch
+import torch.nn.functional as F
 from scipy.stats import binomtest
 from torch.utils.checkpoint import checkpoint
 
 import xformers.ops
+from xformers.attn_bias_utils import create_attn_bias
 from xformers.ops import fmha
+from xformers.ops.fmha import ALL_BW_OPS, ALL_FW_OPS
 from xformers.ops.fmha.common import AttentionOpBase
+from xformers.ops.fmha.dispatch import _dispatch_fw_priority_list
 
 from .utils import assert_allclose
 
 torch.backends.cuda.matmul.allow_tf32 = False
 cuda_only = pytest.mark.skipif(not torch.cuda.is_available(), reason="requires CUDA")
-
 _devices = ["cpu", "cuda"] if torch.cuda.is_available() else ["cpu"]
 _types = [torch.float16, torch.bfloat16]
 
@@ -91,13 +95,14 @@ def generate_test_shapes_B_Mq_Mkv_H_K_Kv(op):
     ]
     # Add some random shapes
     if op in [
-        fmha.ck.FwOp,
-        fmha.ck.BwOp,
+        fmha.cutlass.FwOp,
+        fmha.cutlass.BwOp,
+        fmha.flash.BwOp,
     ]:
         K_CHOICES = [8 * i for i in range(1, 256 // 8)]
         r = random.Random(0)
         found_count = 0
-        while found_count < 20:
+        while found_count < 200:
             B = r.randint(1, 400)
             Mq = r.randint(1, 500)
             Mkv = r.randint(1, 500)
@@ -146,10 +151,10 @@ def _generate_op_device_dtype_biasT_B_Mq_Mkv_H_K_Kv(
                         B, Mq, Mkv, H, K, Kv = shape
                         B = min(B, 12)
 
-                        if (
-                            bias_type
-                            is fmha.attn_bias.BlockDiagonalCausalFromBottomRightMask
-                        ):
+                        if bias_type in {
+                            fmha.attn_bias.BlockDiagonalCausalFromBottomRightMask,
+                            fmha.attn_bias.BlockDiagonalCausalLocalAttentionFromBottomRightMask,
+                        }:
                             Mq, Mkv = min(Mkv, Mq), max(Mkv, Mq) + 2
                         elif (
                             bias_type
@@ -207,50 +212,40 @@ def _generate_op_device_dtype_biasT_B_Mq_Mkv_H_K_Kv(
     **_generate_op_device_dtype_biasT_B_Mq_Mkv_H_K_Kv(ALL_BW_OPS, max_shapes_per_op=1),
 )
 
-def ref_attention(q, k, v, attn_bias=None, drop_mask=None, p=0.0, scale=None, dtype=None):
-    if q.ndim == 4:
-        B, M, Hq, K = q.shape
-        _, N, Hkv, Kv = v.shape
-        nhead_ratio_qk = Hq // Hkv
 
-        def attn_bias_head(head: int):
+def ref_attention(q, k, v, attn_bias=None, drop_mask=None, p=0.0, scale=None):
+    if q.ndim == 5:
+
+        def attn_bias_group(group: int):
             if isinstance(attn_bias, torch.Tensor):
-                assert attn_bias.ndim == 4
-                _, H, _, _ = attn_bias.shape        
-                assert H == Hq
-                bias_bghmn = attn_bias.reshape(B, Hkv, nhead_ratio_qk, M, N)
-                return bias_bghmn[:, :, head]
+                return attn_bias[:, group]
             if isinstance(attn_bias, fmha.attn_bias.LowerTriangularMaskWithTensorBias):
-                assert attn_bias._bias.ndim == 4
-                _, H, _, _ = attn_bias._bias.shape        
-                assert H == Hq
-                bias_bghmn = attn_bias._bias.reshape(B, Hkv, nhead_ratio_qk, M, N)
-
                 return fmha.attn_bias.LowerTriangularMaskWithTensorBias(
-                    bias_bghmn[:, :, head]
+                    attn_bias._bias[:, group]
                 )
             return attn_bias
 
-        q_bmghk = q.reshape((B, M, Hkv, nhead_ratio_qk, K))
-
         return torch.stack(
             [
                 ref_attention_bmhk(
-                    q_bmghk[:, :, :, h], k, v, attn_bias=attn_bias_head(h), dtype=dtype
+                    q[:, :, g],
+                    k[:, :, g],
+                    v[:, :, g],
+                    scale=scale,
+                    attn_bias=attn_bias_group(g),
                 )
-                for h in range(q_bmghk.shape[3])
+                for g in range(q.shape[2])
             ],
-            dim=3,
-        ).reshape((B, M, Hq, Kv))
-     
-    assert q.ndim == 3
-    if dtype is None:
-        dtype = torch.float32
-    q = q.to(dtype=dtype)
-    k = k.to(dtype=dtype)
-    v = v.to(dtype=dtype)
-
-    scale = scale if scale is not None else (q.shape[-1] ** -0.5)
+            dim=2,
+        )
+    if q.ndim == 4:
+        assert p == 0.0
+        return ref_attention_bmhk(q, k, v, scale=scale, attn_bias=attn_bias)
+    q = q.float()
+    k = k.float()
+    v = v.float()
+
+    scale = scale if scale is not None else (1 / q.shape[-1] ** 0.5)
     q = q * scale
 
     attn = q @ k.transpose(-2, -1)
@@ -260,23 +255,23 @@ def attn_bias_head(head: int):
             attn_bias_tensor = attn_bias.materialize(
                 (q.shape[0], 1, q.shape[1], k.shape[1]),
                 device=q.device,
-                dtype=dtype,
+                dtype=torch.float32,
             )
         else:
-            attn_bias_tensor = attn_bias.to(dtype=dtype)
+            attn_bias_tensor = attn_bias
         if attn_bias_tensor.ndim == 4:
             assert q.shape[0] == attn_bias_tensor.shape[0] * attn_bias_tensor.shape[1]
             attn_bias_tensor = attn_bias_tensor.reshape(
                 [-1, *attn_bias_tensor.shape[2:]]
             )
-        attn = attn + attn_bias_tensor
+        attn = attn + attn_bias_tensor.float()
     attn = attn.softmax(-1)
     if drop_mask is not None:
         attn = attn * (drop_mask / (1 - p))
     return attn @ v
 
 
-def ref_attention_bmhk(q, k, v, attn_bias, scale=None, dtype=None) -> torch.Tensor:
+def ref_attention_bmhk(q, k, v, attn_bias, scale=None) -> torch.Tensor:
     assert q.ndim == 4
 
     def T(t):
@@ -290,50 +285,11 @@ def T(t):
             device=q.device,
             dtype=torch.float32,
         ).reshape([q.shape[0] * q.shape[2], q.shape[1], k.shape[1]])
-    out = ref_attention(T(q), T(k), T(v), attn_bias, scale=scale, dtype=dtype)
+    out = ref_attention(T(q), T(k), T(v), attn_bias, scale=scale)
     out = out.reshape([q.shape[0], q.shape[2], q.shape[1], v.shape[3]])
     return out.permute((0, 2, 1, 3))
 
 
-def _rand_seqlens(
-    r: random.Random,
-    bs: int,
-    q_len: int,
-    kv_len: int,
-    more_keys_than_queries_per_block: bool,
-) -> Tuple[Sequence[int], Sequence[int]]:
-    """
-    Generates lists of lengths of query blocks and corresponding key blocks.
-    The total number of queries will be bs * q_len and the
-    total number of keys will be bs * kv_len.
-    """
-    if more_keys_than_queries_per_block:
-        assert kv_len >= q_len
-    q_len *= bs
-    kv_len *= bs
-    seqlens_q: List[int] = []
-    seqlens_k: List[int] = []
-
-    step_q = [max(1, q_len // 10), max(2, q_len // 2)]
-    step_k = [max(1, kv_len // 10), max(2, kv_len // 2)]
-    while sum(seqlens_q) < q_len and sum(seqlens_k) < kv_len:
-        num_queries = r.randrange(*step_q)
-        seqlens_q.append(num_queries)
-
-        if more_keys_than_queries_per_block:
-            # Must select at least `num_queries` keys
-            # But also leave enough keys for later
-            keys_left = kv_len - sum(seqlens_k, 0)
-            queries_left = q_len - sum(seqlens_q[:-1], 0)
-            assert keys_left >= queries_left
-            seqlens_k.append(num_queries + r.randrange(0, keys_left - queries_left))
-        else:
-            seqlens_k.append(r.randrange(*step_k))
-    seqlens_q[-1] = q_len - sum(seqlens_q[:-1])
-    seqlens_k[-1] = kv_len - sum(seqlens_k[:-1])
-    return seqlens_q, seqlens_k
-
-
 def _rand_partition(r: random.Random, total: int, n: int) -> List[int]:
     # returns list of n nonnegative integers summing to total
     idx = {0, total}
@@ -343,158 +299,6 @@ def _rand_partition(r: random.Random, total: int, n: int) -> List[int]:
     return [e - b for b, e in zip(s[:-1], s[1:])]
 
 
-def _rand_maxed_partition(
-    r: random.Random, total: int, n: int, mx: int, positive: bool = True
-) -> List[int]:
-    # returns list of n nonnegative integers less than mx summing to total
-    # NB: This is unfortunately biased towards evenly-split bins.
-    # If `positive`, outputs are positive
-    if positive:
-        total -= n
-        mx -= 1
-    idxs = r.sample(range(n * mx), total)
-    y = torch.zeros(n, mx, dtype=torch.int32)
-    y.flatten()[idxs] = 1
-    z = y.sum(1)
-    if positive:
-        z += 1
-    return z.tolist()
-
-
-def _rand_seqlens_padded_k(
-    r: random.Random, bs: int, q_len: int, kv_len: int
-) -> Tuple[Sequence[int], Sequence[int]]:
-    # This is for BlockDiagonalCausalWithOffsetPaddedKeysMask.
-    # we need q_seqlens and k_seqlens to be of len bsz.
-    # For each "batch element" there must be more keys than queries
-    # because this bias type is "bottom right" and so any extra queries
-    # will attend to nothing and have undefined result.
-    # In addition every element of k_seqlens must be <= kv_len
-    if q_len > kv_len:
-        raise ValueError("need more keys than values")
-    if q_len == kv_len:
-        # all key slots are needed so we cannot have padding
-        q_seqlens = k_seqlens = [kv_len] * bs
-    else:
-        q_seqlens = _rand_maxed_partition(r, q_len * bs, bs, kv_len)
-        k_seqlens = [r.randint(i, kv_len) for i in q_seqlens]
-    return q_seqlens, k_seqlens
-
-
-def _create_aligned_bias(B: int, H: int, Mq: int, Mkv: int, **kwargs) -> torch.Tensor:
-    align_to = 8
-    return (
-        torch.randn(
-            (
-                B,
-                H,
-                Mq,
-                align_to * ((Mkv + align_to - 1) // align_to),
-            ),
-            **kwargs,
-        )
-        * 3
-    )[:, :, :, :Mkv]
-
-
-def create_attn_bias(
-    bias_type,
-    batch_size: int,
-    num_heads: int,
-    q_len: int,
-    kv_len: int,
-    device,
-    dtype,
-    requires_grad: bool,
-    fmt: str,
-    op: Type[AttentionOpBase],
-):
-    if bias_type is None or isinstance(None, bias_type):
-        return None
-    r = random.Random("-".join(map(str, [batch_size, q_len, kv_len, dtype, fmt])))
-    if bias_type is torch.Tensor:
-        if fmt == "BMK":
-            batch_size *= num_heads
-            num_heads = 1
-        # `small_k` only supports an expanded 1d bias
-        if op in [fmha.small_k.FwOp, fmha.small_k.BwOp]:
-            attn_bias = (
-                torch.randn(
-                    (batch_size, num_heads, 1, kv_len), device=device, dtype=dtype
-                )
-                * 3
-            )
-            attn_bias = attn_bias.expand(batch_size, num_heads, q_len, kv_len)
-        else:
-            attn_bias = _create_aligned_bias(
-                batch_size,
-                num_heads,
-                q_len,
-                kv_len,
-                device=device,
-                dtype=dtype,
-            )
-            # ToDo: need a fix in ck-flashAttn to avoid divided-by-zero when all-(-inf) occurred
-            #       with the data read by one-thread
-            # make sure it also works if the first columns are partially masked out
-            ## attn_bias[0, 0, q_len - 1 :, : num_heads - 2] = -math.inf
-
-        if requires_grad:
-            attn_bias.requires_grad_(True)
-        if fmt == "BMK":
-            attn_bias = attn_bias[:, 0]
-        return attn_bias
-    if bias_type is fmha.attn_bias.LowerTriangularMask:
-        return fmha.attn_bias.LowerTriangularMask()
-    if bias_type is fmha.attn_bias.LowerTriangularMaskWithTensorBias:
-        attn_bias = _create_aligned_bias(
-            batch_size,
-            num_heads,
-            q_len,
-            kv_len,
-            device=device,
-            dtype=dtype,
-        )
-        if requires_grad:
-            attn_bias.requires_grad_(True)
-        return fmha.attn_bias.LowerTriangularMaskWithTensorBias(attn_bias)
-    if bias_type in [
-        fmha.attn_bias.BlockDiagonalMask,
-        fmha.attn_bias.BlockDiagonalCausalMask,
-        fmha.attn_bias.BlockDiagonalCausalFromBottomRightMask,
-    ]:
-        # This bias is not supported in BMK format
-        assert fmt == "BMHK"
-        block_diag = fmha.attn_bias.BlockDiagonalMask.from_seqlens(
-            *_rand_seqlens(
-                r,
-                batch_size,
-                q_len,
-                kv_len,
-                more_keys_than_queries_per_block=bias_type
-                is fmha.attn_bias.BlockDiagonalCausalFromBottomRightMask,
-            )
-        )
-        if bias_type is fmha.attn_bias.BlockDiagonalCausalMask:
-            block_diag = block_diag.make_causal()
-        if bias_type is fmha.attn_bias.BlockDiagonalCausalFromBottomRightMask:
-            block_diag = block_diag.make_causal_from_bottomright()
-        return block_diag
-    if bias_type == fmha.attn_bias.BlockDiagonalCausalWithOffsetPaddedKeysMask:
-        assert fmt == "BMHK"
-        q, k = _rand_seqlens_padded_k(r, batch_size, q_len, kv_len)
-        g_block_diag = (
-            fmha.attn_bias.BlockDiagonalCausalWithOffsetPaddedKeysMask.from_seqlens(
-                q_seqlen=q,
-                kv_padding=kv_len,
-                kv_seqlen=k,
-            )
-        )
-        return g_block_diag
-
-    assert False, f"Unsupported bias type: {bias_type}"
-
-
 def get_bias_grad(attn_bias, clear: bool = False) -> Optional[torch.Tensor]:
     tensor_with_grad: Optional[torch.Tensor] = None
     if isinstance(attn_bias, torch.Tensor):
@@ -523,18 +327,46 @@ def create_tensors(
     *,
     attn_bias_requires_grad: bool = False,
     fmt: str = "BMK",
+    g: int = 1,
 ):
     torch.manual_seed(B * q_len + kv_len * k + kv)
+
+    mask_is_bottom_right = attn_bias_type is not None and issubclass(
+        attn_bias_type,
+        (
+            fmha.attn_bias.LowerTriangularFromBottomRightMask,
+            fmha.attn_bias.LowerTriangularFromBottomRightLocalAttentionMask,
+            fmha.attn_bias.BlockDiagonalCausalFromBottomRightMask,
+            fmha.attn_bias.BlockDiagonalCausalLocalAttentionFromBottomRightMask,
+            fmha.attn_bias.BlockDiagonalCausalLocalAttentionMask,
+            fmha.attn_bias.LocalAttentionFromBottomRightMask,
+        ),
+    )
+    if mask_is_bottom_right and q_len > kv_len:
+        # Bottom-right attention and local-attention masks require q_len <= kv_len
+        kv_len = q_len
     scale = 3
     if fmt == "BMK":
-        query = torch.randn((B * h, q_len, k), device=device, dtype=dtype).mul_(scale)
-        key = torch.randn((B * h, kv_len, k), device=device, dtype=dtype).mul_(scale)
-        value = torch.randn((B * h, kv_len, kv), device=device, dtype=dtype).mul_(scale)
+        query = torch.randn((B * h, q_len, k), device=device, dtype=dtype)
+        key = torch.randn((B * h, kv_len, k), device=device, dtype=dtype)
+        value = torch.randn((B * h, kv_len, kv), device=device, dtype=dtype)
+    elif fmt == "BMHK":
+        query = torch.randn((B, q_len, h, k), device=device, dtype=dtype)
+        key = torch.randn((B, kv_len, h, k), device=device, dtype=dtype)
+        value = torch.randn((B, kv_len, h, kv), device=device, dtype=dtype)
     else:
-        assert fmt == "BMHK"
-        query = torch.randn((B, q_len, h, k), device=device, dtype=dtype).mul_(scale)
-        key = torch.randn((B, kv_len, h, k), device=device, dtype=dtype).mul_(scale)
-        value = torch.randn((B, kv_len, h, kv), device=device, dtype=dtype).mul_(scale)
+        assert fmt == "BMGHK"
+        query = torch.randn((B, q_len, g, h, k), device=device, dtype=dtype)
+        key = torch.randn((B, kv_len, g, 1, k), device=device, dtype=dtype)
+        value = torch.randn((B, kv_len, g, 1, kv), device=device, dtype=dtype)
+
+    for x in [query, key, value]:
+        x.mul_(scale)
+
+    if fmt == "BMGHK":
+        # Expand - after the in-place mul
+        key = key.expand((B, kv_len, g, h, k))
+        value = value.expand((B, kv_len, g, h, k))
 
     if fmt == "BMK" and not fmha.common._is_bias_type_supported_in_BMK(attn_bias_type):
         attn_bias_type = None
@@ -544,6 +376,7 @@ def create_tensors(
             attn_bias_type,
             batch_size=B,
             num_heads=h,
+            num_heads_groups=g,
             q_len=q_len,
             kv_len=kv_len,
             dtype=dtype,
@@ -590,11 +423,7 @@ def bmk2bmhk(tensor, num_heads: int) -> torch.Tensor:
 @pytest.mark.parametrize("fmt", ["BMK", "BMHK"])
 @pytest.mark.parametrize("packed", [False, True])
 @parametrize_opFW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv
-def test_forward(
-    opFW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv,
-    packed,
-    fmt,
-):
+def test_forward(opFW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv, packed, fmt, **kwargs):
     (
         op,
         device,
@@ -618,12 +447,13 @@ def test_forward(
         pytest.skip(
             f"packed incompatible with `k ({k}) != kv ({kv})` or `q_len ({q_len}) != kv_len ({kv_len})`"
         )
-
     if fmt == "BMK" and not fmha.common._is_bias_type_supported_in_BMK(bias_type):
         pytest.skip("BMK incompatible with this bias")
 
     query, key, value, attn_bias = create_tensors(
-        *opFW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv, fmt="BMHK" if packed else fmt
+        *opFW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv,
+        fmt="BMHK" if packed else fmt,
+        **kwargs,
     )
 
     if packed:
@@ -637,6 +467,7 @@ def test_forward(
                 bias_type=bias_type,
                 batch_size=batch_size,
                 num_heads=h,
+                num_heads_groups=1,
                 q_len=q_len,
                 kv_len=kv_len,
                 device=device,
@@ -645,9 +476,11 @@ def test_forward(
                 fmt=fmt,
                 op=op,
             )
-        else:
+        elif fmt == "BMHK":
             # bm3hk -> 3 x bmhk
             query, key, value = xformers.ops.unbind(c, 2)
+        else:
+            assert False, f"Unsupport fmt {fmt} with packing"
         assert not query.is_contiguous()
 
     out = xformers.ops.memory_efficient_attention_forward(
@@ -671,84 +504,1524 @@ def test_forward(
         rtol=op.ERROR_RTOL.get(dtype, 1e-5),
     )
 
-@pytest.mark.parametrize("hdim_k,hdim_v", [(64, 64), (128, 128)])
-@pytest.mark.parametrize("nhead_q,nhead_kv", [(8, 1), (8, 2), (12, 4), (4, 4)])
-@pytest.mark.parametrize("seqlen_q,seqlen_kv", [(100, 128), (128, 100), (200, 1000), (400, 300)])
-@pytest.mark.parametrize("batches", [100, 64, 1])
-@pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16])
-@pytest.mark.parametrize("attn_bias_type", [type(None), torch.Tensor, fmha.attn_bias.LowerTriangularMask])
-@pytest.mark.parametrize("op", [fmha.ck.FwOp])
-def test_mqa_forward(
-    op,
-    attn_bias_type,
-    dtype, 
-    batches: int, 
-    seqlen_kv: int, 
-    seqlen_q: int, 
-    nhead_kv: int, 
-    nhead_q: int, 
-    hdim_v: int, 
-    hdim_k: int, 
+
+@cuda_only
+@pytest.mark.parametrize("k_len", [5, 6, 32])
+@pytest.mark.parametrize("batch_size", [1, 4])
+@pytest.mark.parametrize("kv_len", [128, 512])
+@pytest.mark.parametrize("q_len", [128, 512])
+@pytest.mark.parametrize("dtype", _types)
+def test_key_query_all_ones(dtype, q_len, kv_len, batch_size, k_len):
+    device = "cuda"
+    scale = 3
+    query = torch.ones((batch_size, q_len, k_len), device=device, dtype=dtype)
+    key = torch.ones((batch_size, kv_len, k_len), device=device, dtype=dtype)
+    value = torch.randn((batch_size, kv_len, k_len), device=device, dtype=dtype) * scale
+
+    out = xformers.ops.memory_efficient_attention(query, key, value, op=(fmha.ck.FwOp, None))
+    # this should be equivalent to the average over value
+    ref = value.mean(1, keepdim=True).expand_as(query)
+
+    if dtype is torch.float16:
+        assert_allclose(out, ref, atol=1e-5)
+    else:
+        assert_allclose(out, ref, atol=1e-2)
+
+def _block_diag_reshape_lse(
+    lse: torch.Tensor, q_seqinfo: fmha.attn_bias._SeqLenInfo
+) -> torch.Tensor:
+    """LSE can be padded, let's remove the padding"""
+    parts = []
+    for slice, (start, end) in zip(lse.unbind(0), q_seqinfo.intervals()):
+        parts.append(slice[:, : end - start])
+    return torch.cat(parts, dim=1).unsqueeze(1)
+
+
+@parametrize_opFW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv
+def test_logsumexp(opFW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv):
+    (
+        op,
+        device,
+        dtype,
+        bias_type,
+        batch_size,
+        q_len,
+        kv_len,
+        h,
+        k,
+        kv,
+    ) = opFW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv
+    query, key, value, attn_bias = create_tensors(
+        *opFW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv, fmt="BMK"
+    )
+
+    _out, lse = xformers.ops.memory_efficient_attention_forward_requires_grad(
+        query,
+        key,
+        value,
+        op=op,
+        attn_bias=attn_bias,
+    )
+    attn = (query.float() / k**0.5) @ key.float().transpose(-2, -1)
+    if attn_bias is not None:
+        if isinstance(attn_bias, xformers.ops.AttentionBias):
+            tensor_bias = attn_bias.materialize(
+                (query.shape[0], 1, query.shape[1], key.shape[1]),
+                device=query.device,
+                dtype=torch.float32,
+            )
+        else:
+            assert isinstance(attn_bias, torch.Tensor)
+            tensor_bias = attn_bias
+        if tensor_bias.ndim == 4:
+            tensor_bias = tensor_bias.reshape([-1, *tensor_bias.shape[2:]])
+        attn = attn + tensor_bias.float()
+    ref_lse = attn.logsumexp(-1)
+    if isinstance(attn_bias, fmha.attn_bias.BlockDiagonalMask):
+        lse = _block_diag_reshape_lse(lse, attn_bias.q_seqinfo)
+    assert_allclose(lse[:, 0, : ref_lse.shape[1]], ref_lse, atol=2e-4)
+
+
+@cuda_only
+@pytest.mark.parametrize("op", [fmha.cutlass.FwOp, fmha.flash.FwOp])
+def test_logsumexp_mqa(op):
+    if not op.is_available():
+        pytest.skip("not available")
+
+    dtype = torch.float16
+    s = 3
+    query = torch.randn([1, 1, 32, 128], dtype=dtype, device="cuda") * s
+    key = (torch.randn([1, 16, 1, 128], dtype=dtype, device="cuda") * s).expand(
+        -1, -1, 32, -1
+    )
+    value = (torch.randn([1, 16, 1, 128], dtype=dtype, device="cuda") * s).expand(
+        -1, -1, 32, -1
+    )
+    assert key.stride(2) == 0
+
+    _, lse = xformers.ops.memory_efficient_attention_forward_requires_grad(
+        query,
+        key,
+        value,
+        op=op,
+    )
+    query, key, value = [x[0].transpose(0, 1) for x in [query, key, value]]
+    attn = (query.float() / query.shape[-1] ** 0.5) @ key.float().transpose(-2, -1)
+    ref_lse = attn.logsumexp(-1)
+    assert_allclose(lse[0, :, 0], ref_lse[:, 0], atol=2e-4)
+
+
+@pytest.mark.parametrize("fmt", ["BMK", "BMHK"])
+@pytest.mark.parametrize("grad_out_contiguous", [False, True])
+@parametrize_opBW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv
+def test_backward(
+    opBW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv,
+    grad_out_contiguous,
+    fmt,
 ):
-    B = batches
-    M = seqlen_q
-    N = seqlen_kv
-    Hq = nhead_q
-    Hkv = nhead_kv
-    K = hdim_k
-    Kv = hdim_v
+    (
+        op_bw,
+        device,
+        dtype,
+        bias_type,
+        batch_size,
+        q_len,
+        kv_len,
+        h,
+        k,
+        kv,
+    ) = opBW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv
 
-    print("Hq=", Hq, "Hkv=", Hkv)
+    ## ToDo: reopen bfloat16 for testing
+    if dtype is torch.bfloat16:
+        pytest.skip("Temporarily disabled bfloat16 as we are still improving the accuracy of the results")
 
-    device = torch.device("cuda")
+    if k > 128 or kv > 128:
+        pytest.skip("head-dim length bigger than 128 is not supported by CK-FlashAttention")
 
-    if not (K == Kv and (Kv == 64 or Kv == 128)):
-        pytest.skip("only head-dim size 64 or 128 supported by ck-tiled!")
+    if k % 2 != 0:
+        pytest.skip("head-dim length must be an even value for CK-FlashAttention")    
 
-    if Kv > 128:
-        pytest.skip("kv > 128 is not supported by CK-FlashAttention")
+    if grad_out_contiguous is False:
+        pytest.skip("CK-FlashAttention requires grad_out and out have same lengths/strides")
 
-    scale = 3
-    query = torch.randn((B, M, Hq, K), device=device, dtype=dtype).mul_(scale)
-    key = torch.randn((B, N, Hkv, K), device=device, dtype=dtype).mul_(scale)
-    value = torch.randn((B, N, Hkv, Kv), device=device, dtype=dtype).mul_(scale)
+    attn_bias_requires_grad = (
+        random.Random(q_len + kv_len * batch_size).randint(0, 1) > 0
+    )
+    query, key, value, attn_bias = create_tensors(
+        *opBW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv,
+        attn_bias_requires_grad=attn_bias_requires_grad,
+        fmt=fmt,
+    )
 
-    attn_bias = None
-    if attn_bias_type is not None:
-        attn_bias = create_attn_bias(
-            attn_bias_type,
-            batch_size=B,
-            num_heads=Hq,
-            q_len=M,
-            kv_len=N,
-            dtype=dtype,
-            device=device,
-            requires_grad=False,
-            fmt="BMHK",
-            op=op,
+    # To understand why we do this, check the comment on the
+    # `AttentionBwOpBase` class
+    scale = None
+    if op_bw.SUPPORTS_CUSTOM_SCALE and query.shape[-1] < 32:
+        scale = (1 / 32) ** 0.5
+    op_fw = (
+        sample_random_supported_fw(
+            fmha.Inputs(query=query, key=key, value=value, attn_bias=attn_bias),
+            seed=q_len * kv + kv_len * k,
         )
+        if op_bw != fmha.ck.BwOp
+        else fmha.ck.FwOp
+    )
+    qkv = None
+
+    if (
+        fmt == "BMHK"
+        and query.shape[3] == value.shape[3]
+        and query.shape[1] == value.shape[1]
+    ):
+        qkv = torch.stack([query, key, value], 2)
+        qkv.requires_grad_(True)
+        # bm3hk -> 3 x bmhk
+        query, key, value = xformers.ops.unbind(qkv, 2)
+        assert not query.is_contiguous()
 
-    inputs = fmha.Inputs(query=query, key=key, value=value, attn_bias=attn_bias)
-    reasons = op.not_supported_reasons(inputs)
-    if reasons:
-        err_msg = f"{op.NAME}: unsupported ({'/'.join(reasons)})"
-        # Ensure we free memory to avoid OOMs
-        del query, key, value, attn_bias, inputs
+    query.requires_grad_(True)
+    key.requires_grad_(True)
+    value.requires_grad_(True)
 
-    out = xformers.ops.memory_efficient_attention_forward(
-        query, key, value, attn_bias, op=op
+    if not op_bw.supports(fmha.Inputs(query, key, value, attn_bias)):
+        pytest.skip("inputs not supported")
+
+    out = xformers.ops.memory_efficient_attention(
+        query, key, value, attn_bias, scale=scale, op=(op_fw, op_bw)
     )
-    assert not out.isnan().any(), ("Output has NaNs", attn_bias)
-    out2 = xformers.ops.memory_efficient_attention_forward(
-        query, key, value, attn_bias, op=op
+
+    grad_out = torch.randn_like(out)
+    if grad_out_contiguous is False:
+        grad_out = torch.tensor([1.0], dtype=query.dtype, device=device)[
+            None, None, :
+        ].expand_as(out)
+
+    out.backward(grad_out)
+
+    if qkv is None and op_bw == fmha.cutlass.BwOp:
+        assert query.stride() == query.grad.stride()
+
+    grads = []
+    if qkv is None:
+        grads = [query.grad, key.grad, value.grad]
+        query.grad = None
+        key.grad = None
+        value.grad = None
+    else:
+        grads = [qkv.grad]
+        qkv.grad = None
+    if attn_bias_requires_grad:
+        attn_bias_grad = get_bias_grad(attn_bias, clear=True)
+        if attn_bias_grad is not None:
+            grads.append(attn_bias_grad)
+
+    ref = ref_attention(query, key, value, attn_bias, scale=scale)
+    ref.backward(grad_out)
+
+    assert_allclose(
+        out.float(),
+        ref.float(),
+        "fw pass",
+        atol=op_fw.ERROR_ATOL[dtype],
+        rtol=op_fw.ERROR_RTOL[dtype],
     )
-    assert torch.allclose(out, out2, atol=0.0, rtol=0.0), (
-        "Non-deterministic behavior",
-        attn_bias,
+
+    del out
+    del grad_out
+    del ref
+
+    atol = op_bw.ERROR_ATOL[dtype]
+    rtol = op_bw.ERROR_RTOL[dtype]
+
+    grads_ref = []
+    grads_name = []
+    if qkv is None:
+        assert isinstance(query.grad, torch.Tensor)
+        assert isinstance(key.grad, torch.Tensor)
+        assert isinstance(value.grad, torch.Tensor)
+        grads_ref = [query.grad, key.grad, value.grad]
+        grads_name = ["query", "key", "value"]
+    else:
+        assert isinstance(qkv.grad, torch.Tensor)
+        grads_ref = [qkv.grad]
+        grads_name = ["qkv"]
+
+    if attn_bias_requires_grad:
+        attn_bias_grad = get_bias_grad(attn_bias)
+        if attn_bias_grad is not None:
+            grads_ref.append(attn_bias.grad)
+            grads_name.append("bias")
+
+    del query
+    del key
+    del value
+    del qkv
+
+    assert len(grads_ref) == len(
+        grads
+    ), "Wrong number of gradients (maybe bias grad didn't backprop?)"
+    for name, calc_grad, ref_grad in zip(grads_name, grads, grads_ref):
+        assert_allclose(
+            calc_grad,
+            ref_grad,
+            msg=f"{op_fw.NAME}+{op_bw.NAME}:{name}",
+            atol=atol,
+            rtol=rtol,
+        )
+
+
+def _vec_binom_test(x, n, p):
+    """
+    vectorized implementation of scipy.stats.binom_test
+    this makes our tests much faster
+    reference: https://github.com/scipy/scipy/blob/v1.8.0/scipy/stats/_morestats.py#L2609-L2702
+    """
+    import numpy as np
+    from scipy.stats import distributions
+
+    x = np.atleast_1d(x)
+    d = distributions.binom.pmf(x, n, p)[:, None]
+    rerr = 1 + 1e-7
+    # x < p * n case
+    i = np.arange(np.ceil(p * n), n + 1)
+    y = np.sum(distributions.binom.pmf(i, n, p) <= d * rerr, axis=1)
+    pval1 = distributions.binom.cdf(x, n, p) + distributions.binom.sf(n - y, n, p)
+
+    # other case
+    i = np.arange(np.floor(p * n) + 1)
+    y = np.sum(distributions.binom.pmf(i, n, p) <= d * rerr, axis=1)
+    pval2 = distributions.binom.cdf(y - 1, n, p) + distributions.binom.sf(x - 1, n, p)
+
+    pval = np.where(x < p * n, pval1, pval2)
+    pval = np.minimum(1.0, pval)
+    return pval
+
+def _get_drop_mask(op, batch_size, q_len, kv_len, p, device):
+    if op == fmha.ck.FwOp:
+        mask = torch.empty((batch_size, 1, q_len, kv_len), device=device)
+        ## rand_uniform is an int32 tensor
+        rand_uniform = torch.ops.xformers._ck_rand_uniform(p, mask)
+        ##mask = (rand_uniform <= int((1.0-p)*65535.0)).to(torch.float32)
+        mask = (rand_uniform <= int((1.0-p)*255.0)).to(torch.float32)
+        mask = mask.reshape(batch_size, q_len, kv_len)
+    else:
+        mask = torch.empty((batch_size, q_len, kv_len), device=device)
+        mask = torch.ops.xformers._temp_dropout(mask, p)
+
+    return mask
+
+@cuda_only
+@pytest.mark.parametrize("attn_bias", [None, fmha.attn_bias.LowerTriangularMask()])
+@pytest.mark.parametrize("seed", [42, 124])
+@pytest.mark.parametrize("p", [0.3, 0.7])
+@pytest.mark.parametrize("k_len", [32])
+@pytest.mark.parametrize("batch_size", [1, 2])
+@pytest.mark.parametrize("kv_len", [3, 15, 32, 33, 65])
+@pytest.mark.parametrize("q_len", [2, 33])
+@pytest.mark.parametrize("op", ALL_FW_OPS, ids=list(map(lambda t: t.NAME, ALL_FW_OPS)))
+@pytest.mark.parametrize("dtype", [torch.half, torch.bfloat16])
+def test_dropout(dtype, op, q_len, kv_len, batch_size, k_len, p, seed, attn_bias):
+    device = "cuda"
+    scale = 0.05 
+    query = torch.randn((batch_size, q_len, k_len), device=device, dtype=dtype) * scale
+    key = torch.randn((batch_size, kv_len, k_len), device=device, dtype=dtype) * scale
+    value = torch.randn((batch_size, kv_len, k_len), device=device, dtype=dtype) * scale
+
+    inputs_for_support_check = fmha.Inputs(query, key, value, attn_bias, p, None)
+    if not op.supports(inputs_for_support_check):
+        del query, key, value, attn_bias
+        pytest.skip(f"{op.NAME}: unsupported input")
+
+    torch.manual_seed(seed)
+    out = xformers.ops.memory_efficient_attention(
+        query, key, value, attn_bias, p, op=(op, None)
     )
 
+    torch.manual_seed(seed)
+    out2 = xformers.ops.memory_efficient_attention(
+        query, key, value, attn_bias, p, op=(op, None)
+    )
+
+    assert_allclose(out, out2, "dropout reproducibility")
+
+    torch.manual_seed(seed)
+    mask = _get_drop_mask(op, batch_size, q_len, kv_len, p, device)
+    ref = ref_attention(query, key, value, attn_bias, mask, p)
+    assert_allclose(out.float(), ref, atol=3e-3, rtol=5e-4), f"{(out - ref).abs().max()}"
+
+    num_trials = 1000
+    p_val_tol = 1e-6
+    keep_prob = 1 - p
+    masks = []
+    for i in range(num_trials):
+        mask = _get_drop_mask(op, batch_size, q_len, kv_len, p, device)
+        masks.append(mask.clone().cpu())
+    masks = torch.stack(masks, dim=0)
+    p_value = binomtest(int(masks.sum()), masks.numel(), p=keep_prob).pvalue
+    assert p_value > p_val_tol, p_value
+    masks = masks.sum(0).flatten()
+    p_values = _vec_binom_test(masks, num_trials, p=keep_prob)
+    assert all(p_values > p_val_tol)
+
+
+def _test_dropout_backward(q_len, kv_len, batch_size, k, p, op, dtype):
+    if dtype is torch.bfloat16 and compute_capability < (8, 0):
+        pytest.skip("bf16 requires Sm80")
+    if not op.is_available():
+        pytest.skip()
+
+    scale = 3 
+    device = "cuda"
+    query = torch.randn((batch_size, q_len, k), device=device, dtype=dtype) * scale
+    key = torch.randn((batch_size, kv_len, k), device=device, dtype=dtype) * scale
+    value = torch.randn((batch_size, kv_len, k), device=device, dtype=dtype) * scale
+
+    query.requires_grad_(True)
+    key.requires_grad_(True)
+    value.requires_grad_(True)
+
+    grad_out = torch.ones_like(query)
+
+    assert op.supports(fmha.Inputs(query=query, key=key, value=value, p=p))
+
+    seed = 42
+    torch.manual_seed(seed)
+    out = xformers.ops.memory_efficient_attention(query, key, value, p=p, op=(op, None))
+
+    out.backward(grad_out)
+
+    grad_q = query.grad
+    grad_k = key.grad
+    grad_v = value.grad
+
+    query.grad = None
+    key.grad = None
+    value.grad = None
+
+    torch.manual_seed(seed)
+    mask = _get_drop_mask(op, batch_size, q_len, kv_len, p, device)
+
+    ref = ref_attention(query, key, value, None, mask, p)
+    ref.backward(grad_out)
+
+    atol, rtol = (
+        fmha.AttentionBwOpBase.ERROR_ATOL[dtype],
+        fmha.AttentionBwOpBase.ERROR_RTOL[dtype],
+    )
+    assert_allclose(
+        grad_v,
+        value.grad,
+        "grad_v",
+        atol=atol,
+        rtol=rtol,
+    )
+    # TODO: Investigate why precision is worse
+    if dtype in [torch.float16, torch.bfloat16]:
+        atol = atol * 2 + 0.15
+        rtol = rtol * 2
+    assert_allclose(
+        grad_q,
+        query.grad,
+        "grad_q",
+        atol=atol,
+        rtol=rtol,
+    )
+    assert_allclose(
+        grad_k,
+        key.grad,
+        "grad_k",
+        atol=atol,
+        rtol=rtol,
+    )
+
+
+@cuda_only
+@pytest.mark.parametrize("p", [0.3, 0.7])
+@pytest.mark.parametrize("k", [5, 6, 32])
+@pytest.mark.parametrize("batch_size", [1, 2])
+@pytest.mark.parametrize("kv_len", [3, 15, 32, 33])
+@pytest.mark.parametrize("q_len", [2, 33])
+def test_dropout_backward_small_k(q_len, kv_len, batch_size, k, p):
+    _test_dropout_backward(
+        q_len, kv_len, batch_size, k, p, op=fmha.small_k.FwOp, dtype=torch.float32
+    )
+
+
+@cuda_only
+@pytest.mark.parametrize("p", [0.000001, 0.3, 0.7])
+@pytest.mark.parametrize("k", [16, 128, 256])
+@pytest.mark.parametrize("batch_size", [1, 2])
+@pytest.mark.parametrize("kv_len", [3, 248, 256])
+@pytest.mark.parametrize("q_len", [3, 248, 256])
+@pytest.mark.parametrize("dt", ["f16", "bf16", "f32"])
+def test_dropout_backward_cutlass(dt, q_len, kv_len, batch_size, k, p):
+    _test_dropout_backward(
+        q_len,
+        kv_len,
+        batch_size,
+        k,
+        p,
+        op=fmha.cutlass.FwOp,
+        dtype={"f16": torch.float16, "bf16": torch.bfloat16, "f32": torch.float32}[dt],
+    )
+
+
+@cuda_only
+@pytest.mark.parametrize("k_len", [32])
+@pytest.mark.parametrize("batch_size", [1])
+@pytest.mark.parametrize("kv_len", [3 * 32])
+@pytest.mark.parametrize("q_len", [3 * 32])
+def test_memory_efficient_attention_full_block_masked(q_len, kv_len, batch_size, k_len):
+    device = "cuda"
+    op_fw = fmha.small_k.FwOp
+    op_bw = fmha.small_k.BwOp
+
+    scale = 3
+    query = torch.randn((batch_size, q_len, k_len), device=device) * scale
+    key = torch.randn((batch_size, kv_len, k_len), device=device) * scale
+    value = torch.randn((batch_size, kv_len, k_len), device=device) * scale
+
+    # in this case, most of the blocks in a row get masked
+    attn_bias = torch.full((3, 32), float("-inf"), device=device)
+    attn_bias[:2, :4] = 0
+    attn_bias = attn_bias.flatten()[None, None, :].expand(1, q_len, -1)
+
+    out = xformers.ops.memory_efficient_attention(
+        query, key, value, attn_bias, op=(op_fw, op_bw)
+    )
     ref = ref_attention(query, key, value, attn_bias)
+
+    assert_allclose(
+        out, ref, atol=op_fw.ERROR_ATOL[query.dtype], rtol=op_fw.ERROR_RTOL[query.dtype]
+    )
+
+    query.requires_grad_(True)
+    key.requires_grad_(True)
+    value.requires_grad_(True)
+
+    grad_out = torch.ones_like(query)
+
+    out = xformers.ops.memory_efficient_attention(query, key, value, attn_bias)
+    out.backward(grad_out)
+
+    grad_q = query.grad
+    grad_k = key.grad
+    grad_v = value.grad
+
+    query.grad = None
+    key.grad = None
+    value.grad = None
+
+    ref = ref_attention(query, key, value, attn_bias)
+    ref.backward(grad_out)
+
+    atol = op_bw.ERROR_ATOL[query.dtype]
+    rtol = op_bw.ERROR_RTOL[query.dtype]
+    assert_allclose(grad_q, query.grad, "grad_q", atol=atol, rtol=rtol)
+    assert_allclose(grad_k, key.grad, "grad_k", atol=atol, rtol=rtol)
+    assert_allclose(grad_v, value.grad, "grad_v", atol=atol, rtol=rtol)
+
+
+@pytest.mark.parametrize("fmt", ["BMK", "BMHK"])
+@parametrize_opBW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv__xs
+def test_lowlevel_api_shapes(opBW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv, fmt):
+    query, key, value, attn_bias = create_tensors(
+        *opBW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv, fmt=fmt
+    )
+    grad_out = torch.ones_like(query)
+    query.requires_grad_(True)
+    key.requires_grad_(True)
+    value.requires_grad_(True)
+
+    out, lse = xformers.ops.memory_efficient_attention_forward_requires_grad(
+        query, key, value, attn_bias
+    )
+    assert out.ndim == query.ndim
+    dq, dk, dv = xformers.ops.memory_efficient_attention_backward(
+        grad_out, out, lse, query, key, value, attn_bias
+    )
+    assert dq.shape == query.shape
+    assert dk.shape == key.shape
+    assert dv.shape == value.shape
+
+
+@parametrize_opFW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv__xs
+def test_cuda_streams(
+    opFW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv,
+):
+    (
+        op,
+        device,
+        dtype,
+        bias_type,
+        batch_size,
+        q_len,
+        kv_len,
+        h,
+        k,
+        kv,
+    ) = opFW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv
+    if device != "cuda":
+        pytest.skip("Not CUDA")
+    bias_type = None
+    opFW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv = [
+        op,
+        device,
+        dtype,
+        bias_type,
+        batch_size,
+        q_len,
+        kv_len,
+        h,
+        k,
+        kv,
+    ]
+    s_hipri = torch.cuda.Stream(priority=-1)
+    s_lopri = torch.cuda.Stream(priority=0)
+    query, key, value, attn_bias = create_tensors(
+        *opFW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv, fmt="BMHK"
+    )
+    torch.cuda.synchronize()
+    with torch.cuda.stream(s_lopri):
+        torch.cuda._sleep(100_000_000)  # wait 100m cycles
+        query *= 2
+    s_hipri.wait_stream(s_lopri)
+    with torch.cuda.stream(s_hipri):
+        # If the kernel is scheduled in the main stream
+        # `query * 2` has not been executed yet
+        out = xformers.ops.memory_efficient_attention(query, key, value, op=(op, None))
+    # Test that `s_lopri` is still sleeping
+    # and that `query *= 2` has not been executed yet
+    query2_main_stream = query * 2
+    torch.cuda.synchronize()
+    # TODO: Figure out why this is failing sometimes
+    # The sleep timer seems to be high enough already ...
+    # assert torch.allclose(query2_main_stream, query), "Need to increase sleep time"
+    del query2_main_stream
+
+    ref = ref_attention(query, key, value)
     assert out.shape == ref.shape, out.shape
+
+    assert_allclose(
+        out.float(),
+        ref.float(),
+        atol=op.ERROR_ATOL[dtype],
+        rtol=op.ERROR_RTOL.get(dtype, 1e-5),
+    )
+
+
+@parametrize_opBW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv__xs
+def test_custom_scale(opBW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv):
+    p = 0.0
+    scale = 0.1
+
+    (
+        op_bw,
+        device,
+        dtype,
+        _,
+        B,
+        q_len,
+        kv_len,
+        H,
+        k,
+        Kv,
+    ) = opBW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv
+    torch.manual_seed(q_len + kv_len + k)
+    if device != "cuda":
+        pytest.skip("Not CUDA")
+
+    query, key, value, attn_bias = create_tensors(
+        *opBW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv, fmt="BMK"
+    )
+    inputs = fmha.Inputs(
+        query=query, key=key, value=value, attn_bias=attn_bias, scale=scale
+    )
+    op_fw = sample_random_supported_fw(inputs, seed=q_len * k + kv_len * k)
+    grad_out = query.new_ones(B * H, q_len, Kv)
+    query.requires_grad_(True)
+    key.requires_grad_(True)
+    value.requires_grad_(True)
+
+    reasons = op_fw.not_supported_reasons(inputs)
+    if reasons:
+        pytest.skip(f"{op_fw.NAME}: unsupported ({'/'.join(reasons)})")
+    reasons = op_bw.not_supported_reasons(inputs)
+    if reasons:
+        pytest.skip(f"{op_bw.NAME}: unsupported ({'/'.join(reasons)})")
+
+    # NOTE: we still need to scale the inputs to not blowup
+    # the pre-softmax values (numerical stability)
+    s = k**-0.5
+    out = xformers.ops.memory_efficient_attention(
+        query * s, key, value, attn_bias, p, scale, op=(op_fw, op_bw)
+    )
+    out.backward(grad_out)
+    grad_q, grad_k, grad_v = query.grad, key.grad, value.grad
+    query.grad = key.grad = value.grad = None
+
+    ref = ref_attention(query * s, key, value, attn_bias, None, p, scale)
+    ref.backward(grad_out)
+    ref_grad_q, ref_grad_k, ref_grad_v = query.grad, key.grad, value.grad
+    query.grad = key.grad = value.grad = None
+
+    atol = op_fw.ERROR_ATOL[dtype]
+    rtol = op_fw.ERROR_RTOL[dtype]
+    assert_allclose(out.float(), ref.float(), "out", atol=atol, rtol=rtol)
+    atol = op_bw.ERROR_ATOL[dtype]
+    rtol = op_bw.ERROR_RTOL[dtype]
+    assert_allclose(grad_q, ref_grad_q, "grad_q", atol=atol, rtol=rtol)
+    assert_allclose(grad_k, ref_grad_k, "grad_k", atol=atol, rtol=rtol)
+    assert_allclose(grad_v, ref_grad_v, "grad_v", atol=atol, rtol=rtol)
+
+
+def apply_attention(query, key, value, attn_bias, op_fw, proj):
+    x = xformers.ops.memory_efficient_attention(
+        query, key, value, attn_bias=attn_bias, op=(op_fw, None)
+    )
+    x = proj(x)
+    return x
+
+
+@pytest.mark.parametrize("use_reentrant", [False, True])
+@parametrize_opFW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv__xs
+def test_grad_checkpointing(
+    opFW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv,
+    use_reentrant,
+):
+    fmt = "BMHK"
+    (
+        op,
+        device,
+        dtype,
+        bias_type,
+        batch_size,
+        q_len,
+        kv_len,
+        h,
+        k,
+        kv,
+    ) = opFW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv
+    bias_type = None
+    opFW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv = (
+        op,
+        device,
+        dtype,
+        bias_type,
+        batch_size,
+        q_len,
+        kv_len,
+        h,
+        k,
+        kv,
+    )
+    query, key, value, attn_bias = create_tensors(
+        *opFW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv,
+        fmt=fmt,
+    )
+    qkv = None
+
+    if (
+        fmt == "BMHK"
+        and query.shape[3] == value.shape[3]
+        and query.shape[1] == value.shape[1]
+    ):
+        qkv = torch.stack([query, key, value], 2)
+        qkv.requires_grad_(True)
+        # bm3hk -> 3 x bmhk
+        query, key, value = xformers.ops.unbind(qkv, 2)
+        assert not query.is_contiguous()
+
+    query.requires_grad_(True)
+    key.requires_grad_(True)
+    value.requires_grad_(True)
+
+    proj = torch.nn.Linear(kv, k, device=device, dtype=dtype)
+
+    x = query
+    for _ in range(5):
+        x = checkpoint(
+            apply_attention,
+            x,
+            key,
+            value,
+            attn_bias,
+            op,
+            proj,
+            use_reentrant=use_reentrant,
+        )
+    x.mean().backward()
+
+
+ALL_FW_OPS_NO_SMALLK = [op for op in ALL_FW_OPS if op is not fmha.small_k.FwOp]
+
+
+@pytest.mark.parametrize(
+    "op", ALL_FW_OPS_NO_SMALLK, ids=[op.NAME for op in ALL_FW_OPS_NO_SMALLK]
+)
+def test_unsupported_cpu(op: Type[fmha.AttentionFwOpBase]):
+    q = torch.empty([1, 1, 1, 32])
+    with pytest.raises(ValueError):
+        fmha.memory_efficient_attention(q, q, q, op=(op, None))
+
+
+@cuda_only
+@pytest.mark.parametrize(
+    "op", ALL_FW_OPS_NO_SMALLK, ids=[op.NAME for op in ALL_FW_OPS_NO_SMALLK]
+)
+def test_unsupported_stride_lastdim(op: Type[fmha.AttentionFwOpBase]):
+    q = torch.empty([1, 1, 32, 4], device="cuda", dtype=torch.float16).permute(
+        0, 3, 1, 2
+    )
+    try:
+        fmha.memory_efficient_attention(q, q, q, op=(op, None))
+    except ValueError as e:
+        if "Only work on pre-MLIR triton for now" in str(e):
+            pytest.skip("Only work on pre-MLIR triton for now")
+        q = q.contiguous()
+        fmha.memory_efficient_attention(q, q, q, op=(op, None))
+
+
+@cuda_only
+@pytest.mark.parametrize(
+    "op", ALL_FW_OPS_NO_SMALLK, ids=[op.NAME for op in ALL_FW_OPS_NO_SMALLK]
+)
+def test_unsupported_stride_alignment(op: Type[fmha.AttentionFwOpBase]):
+    q = torch.empty([1, 2, 1, 33], device="cuda", dtype=torch.float16)[:, :, :, :32]
+    try:
+        fmha.memory_efficient_attention(q, q, q, op=(op, None))
+    except ValueError as e:
+        if "Only work on pre-MLIR triton for now" in str(e):
+            pytest.skip("Only work on pre-MLIR triton for now")
+        q = q.contiguous()
+        fmha.memory_efficient_attention(q, q, q, op=(op, None))
+
+def test_attn_bias_causal() -> None:
+    m = -math.inf
+    causal_mask = torch.tensor([[0, m], [0, 0], [0, 0]])
+    tensor_bias = torch.tensor([[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]])
+
+    attn_bias = fmha.attn_bias.LowerTriangularMask()
+    assert_allclose(attn_bias.materialize(causal_mask.shape), causal_mask, "causal")
+    attn_bias = attn_bias.add_bias(tensor_bias)
+    assert_allclose(
+        attn_bias.materialize(causal_mask.shape),
+        tensor_bias + causal_mask,
+        "causal+tensor_bias",
+    )
+
+
+def test_attn_bias_torch_tensor() -> None:
+    tensor_bias = torch.tensor([[1.0, 2.0, 3.0], [3.0, 4.0, 5.0]])
+    attn_bias = fmha.attn_bias.LowerTriangularMaskWithTensorBias(tensor_bias)
+    m = -math.inf
+    causal_bias = torch.tensor([[0, m, m], [0, 0, m]])
+    assert_allclose(
+        attn_bias.materialize((2, 3)), causal_bias + tensor_bias, "tensor_bias+causal"
+    )
+
+
+def test_attn_bias_blockdiag() -> None:
+    queries = [
+        torch.randn([1, 3, 1, 8]),
+        torch.randn([1, 2, 1, 8]),
+        torch.randn([1, 5, 1, 8]),
+    ]
+    attn_bias, q = fmha.BlockDiagonalMask.from_tensor_list(queries)
+
+    # Verify mask
+    as_tensor = attn_bias.materialize((10, 10))
+    assert int((as_tensor != -math.inf).sum().item()) == 3 * 3 + 2 * 2 + 5 * 5
+    assert_allclose(as_tensor[0:3, 0:3], torch.zeros([3, 3]), "batch0")
+    assert_allclose(as_tensor[3:5, 3:5], torch.zeros([2, 2]), "batch1")
+    assert_allclose(as_tensor[5:, 5:], torch.zeros([5, 5]), "batch2")
+
+    # Verify we can split it back
+    queries2 = attn_bias.split(q)
+    assert len(queries) == len(queries2)
+    for q1, q2 in zip(queries, queries2):
+        assert_allclose(q1, q2)
+
+
+def test_attn_bias_blockdiag_batched() -> None:
+    queries = [
+        torch.randn([1, 3, 1, 8]),
+        torch.randn([3, 2, 1, 8]),
+        torch.randn([1, 5, 1, 8]),
+    ]
+    attn_bias, q = fmha.BlockDiagonalMask.from_tensor_list(queries)
+
+    # Verify mask
+    as_tensor = attn_bias.materialize((14, 14))
+    assert int((as_tensor != -math.inf).sum().item()) == 3 * 3 + 3 * 2 * 2 + 5 * 5
+    assert_allclose(as_tensor[0:3, 0:3], torch.zeros([3, 3]), "batch0")
+    assert_allclose(as_tensor[3:5, 3:5], torch.zeros([2, 2]), "batch1.0")
+    assert_allclose(as_tensor[5:7, 5:7], torch.zeros([2, 2]), "batch1.1")
+    assert_allclose(as_tensor[7:9, 7:9], torch.zeros([2, 2]), "batch1.2")
+    assert_allclose(as_tensor[9:, 9:], torch.zeros([5, 5]), "batch2")
+
+    # Verify we can split it back
+    queries2 = attn_bias.split(q)
+    assert len(queries) == len(queries2)
+    for q1, q2 in zip(queries, queries2):
+        assert_allclose(q1, q2)
+
+
+def test_attn_bias_blockdiag_crossattn_causal() -> None:
+    # Q / KV have different seqlen
+    list_q = [
+        torch.randn([1, 3, 1, 8]),
+        torch.randn([2, 1, 1, 8]),
+    ]
+    list_k = [
+        torch.randn([1, 2, 1, 8]),
+        torch.randn([2, 3, 1, 8]),
+    ]
+
+    attn_bias, q, k, _ = fmha.attn_bias.BlockDiagonalMask.from_tensor_lists_qkv(
+        list_q, list_k
+    )
+
+    # Verify mask
+    as_tensor = attn_bias.materialize((q.shape[1], k.shape[1]))
+    assert int((as_tensor != -math.inf).sum().item()) == 3 * 2 + 2 * 3 * 1
+    assert_allclose(as_tensor[0:3, 0:2], torch.zeros([3, 2]), "batch0")
+    assert_allclose(as_tensor[3:4, 2:5], torch.zeros([1, 3]), "batch1.0")
+    assert_allclose(as_tensor[4:, 5:], torch.zeros([1, 3]), "batch1.1")
+
+    # Also test causal version
+    as_tensor = attn_bias.make_causal().materialize((q.shape[1], k.shape[1]))
+    assert_allclose(
+        as_tensor[3:4, 2:5],
+        fmha.attn_bias.LowerTriangularMask().materialize((1, 3)),
+        "batch1.0[causal]",
+    )
+
+    # Verify we can split it back
+    list_q2 = attn_bias.split_queries(q)
+    assert len(list_q) == len(list_q2)
+    for q1, q2 in zip(list_q, list_q2):
+        assert_allclose(q1, q2)
+    with pytest.raises(ValueError):
+        attn_bias.split_queries(k)
+    list_k2 = attn_bias.split_kv(k)
+    assert len(list_k) == len(list_k2)
+    for k1, k2 in zip(list_k, list_k2):
+        assert_allclose(k1, k2)
+
+
+def test_attn_bias_blockdiag_crossattn_causal_with_prefix_qk_cond() -> None:
+    list_q = [
+        torch.randn([1, 3, 1, 8]),
+    ]
+    list_k = [
+        torch.randn([1, 2, 1, 8]),
+    ]
+    attn_bias, q, k, _ = fmha.attn_bias.BlockDiagonalMask.from_tensor_lists_qkv(
+        list_q, list_k
+    )
+    with pytest.raises(ValueError):
+        attn_bias.make_causal_from_bottomright()
+
+
+def test_attn_bias_blockdiag_crossattn_causal_with_prefix() -> None:
+    # Q / KV have different seqlen
+    list_q = [
+        torch.randn([1, 2, 1, 8]),
+        torch.randn([2, 2, 1, 8]),
+    ]
+    list_k = [
+        torch.randn([1, 2, 1, 8]),
+        torch.randn([2, 5, 1, 8]),
+    ]
+
+    attn_bias, q, k, _ = fmha.attn_bias.BlockDiagonalMask.from_tensor_lists_qkv(
+        list_q, list_k
+    )
+    as_tensor = attn_bias.make_causal_from_bottomright().materialize(
+        (q.shape[1], k.shape[1])
+    )
+    m = -math.inf
+    assert_allclose(
+        as_tensor[0:2, 0:2],
+        torch.tensor([[0, m], [0, 0]], dtype=torch.float32),
+        "batch1.1[causal_with_prefix]",
+    )
+    assert_allclose(
+        as_tensor[2:4, 2:7],
+        torch.tensor([[0, 0, 0, 0, m], [0, 0, 0, 0, 0]], dtype=torch.float32),
+        "batch2.1[causal_with_prefix]",
+    )
+    assert_allclose(
+        as_tensor[4:6, 7:12],
+        torch.tensor([[0, 0, 0, 0, m], [0, 0, 0, 0, 0]], dtype=torch.float32),
+        "batch2.2[causal_with_prefix]",
+    )
+
+
+@cuda_only
+def test_attn_bias_padded() -> None:
+    bsize, n_heads, d, padding = 8, 3, 8, 32
+
+    # Q / KV have different seqlen
+    k = torch.randn((bsize, padding, n_heads, d), device="cuda", dtype=torch.float16)
+    k_seqlen = [5, 8, 7, 1, 9, 3, 12, 32]
+    other = bsize - 1
+    v = torch.randn((bsize, padding, n_heads, d), device="cuda", dtype=torch.float16)
+    n_q_first = 4
+    q = [
+        torch.randn((1, n_q_first, n_heads, d), device="cuda", dtype=torch.float16),
+        torch.randn((1, other, n_heads, d), device="cuda", dtype=torch.float16),
+    ]
+    q_cat = torch.cat([x.view(1, -1, n_heads, d) for x in q], dim=1)
+    q_seqlen = [n_q_first] + [1] * other
+
+    attn_bias = fmha.attn_bias.BlockDiagonalCausalWithOffsetPaddedKeysMask.from_seqlens(
+        q_seqlen=q_seqlen,
+        kv_seqlen=k_seqlen,
+        kv_padding=padding,
+    )
+
+    v = v.view(1, -1, n_heads, d)
+    k = k.view(1, -1, n_heads, d)
+
+    scores = (q_cat.transpose(1, 2) @ k.transpose(1, 2).transpose(2, 3)).float()
+    assert not scores.isnan().any()
+    mask = torch.full_like(scores, -float("inf"))
+    for i, (slen, qlen) in enumerate(zip(k_seqlen, q_seqlen)):
+        kseq_start = i * padding
+        qstart = sum(q_seqlen[:i])
+        mask[:, :, qstart : qstart + qlen, kseq_start : kseq_start + slen] = torch.triu(
+            mask[:, :, qstart : qstart + qlen, kseq_start : kseq_start + slen].float(),
+            diagonal=1 + slen - qlen,
+        ).float()
+
+    scores += mask
+    assert not scores.isnan().any()
+    # 1,3,10,8 @ 1,3,8,256 -> 1,3,10,256
+    scores = torch.nn.functional.softmax(scores, -1).half()
+    # torch.Size([1, 3, 3, 32]) @ torch.Size([1, 3, 32, 8])
+    output = scores @ v.transpose(1, 2)  # 1,3,10,256 @ 1,3,256, 8 -> 1,3,10,8
+    output = output.transpose(1, 2).contiguous()
+
+    fmha_output = fmha.memory_efficient_attention_forward(
+        q_cat, k, v, attn_bias, scale=1.0, op=fmha.ck.FwOp
+    )
+
+    # assert torch.allclose(output, fmha_output)
+    assert_allclose(
+        output,
+        fmha_output,
+        atol=fmha.cutlass.FwOp.ERROR_ATOL[torch.float16],
+        rtol=fmha.cutlass.FwOp.ERROR_RTOL[torch.float16],
+    )
+
+
+def _kv_heads_label(kv_heads: Optional[int]) -> str:
+    if kv_heads is None:
+        return ""
+    if kv_heads == 1:
+        return "mq"
+    return f"gqa{kv_heads}"
+
+@pytest.mark.parametrize("op", [fmha.ck_decoder.FwOp])
+@pytest.mark.parametrize("kv_heads", [None, 1, 2], ids=_kv_heads_label)
+@pytest.mark.parametrize("bsz,n_heads", [(1, 1), (1, 16), (1, 32), (8, 1), (4, 8)])
+@pytest.mark.parametrize("padding", [32, 4096])
+@pytest.mark.parametrize("dtype", ["f16", "bf16", "f32"])
+def test_decoder(
+    op,
+    n_heads: int,
+    kv_heads: Optional[int],
+    padding: int,
+    bsz: int,
+    dtype: str,
+    dequant: bool = False,
+    num_queries: int = 1,
+    d = 256,
+) -> None:
+    # kv_heads = 1: multiquery
+    # kv_heads = None: neither MQA nor GQA
+    # kv_heads > 1: BMGHK
+    dtype_ = {"f16": torch.float16, "bf16": torch.bfloat16, "f32": torch.float}[dtype]
+    tensor_options = {"dtype": dtype_, "device": "cuda"}
+    torch.manual_seed(1)
+    num_queries = 1
+    if kv_heads is not None and kv_heads > 1:
+        k_shape: Tuple[int, ...] = (1, bsz * padding, kv_heads, n_heads, d)
+        q_shape: Tuple[int, ...] = (
+            1,
+            bsz * num_queries,
+            kv_heads,
+            n_heads,
+            d,
+        )
+    else:
+        k_shape = (1, bsz * padding, n_heads, d)
+        q_shape = (1, bsz * num_queries, n_heads, d)
+
+    k = torch.randn(k_shape, **tensor_options)
+    k_seqlen = torch.randint(num_queries, padding + 1, (bsz,)).tolist()
+    v = torch.randn_like(k)
+    q = torch.randn(q_shape, **tensor_options)
+    causal_diagonal = torch.tensor(  # TODO: make unnecessary
+        [i - 1 for i in k_seqlen], dtype=torch.int32
+    ).cuda()
+
+    if kv_heads is not None:
+        k = k[..., :1, :].expand(k_shape)
+        v = v[..., :1, :].expand(k_shape)
+
+    attn_bias = fmha.attn_bias.BlockDiagonalCausalWithOffsetPaddedKeysMask.from_seqlens(
+        q_seqlen=[num_queries] * bsz,
+        kv_seqlen=k_seqlen,
+        causal_diagonal=causal_diagonal,
+        kv_padding=padding,
+    )
+    inp = fmha.Inputs(q, k, v, attn_bias=attn_bias)
+    if (not_supported_reasons := op.not_supported_reasons(inp)):
+        pytest.skip(f"{not_supported_reasons=}")
+
+    decoder_output = fmha.memory_efficient_attention_forward(
+        q, k, v, attn_bias, op=op
+    )
+
+    ref_output = ref_attention(q, k, v, attn_bias)
+
+    assert_allclose(
+        decoder_output.float(),
+        ref_output,
+        atol=fmha.ck_decoder.FwOp.ERROR_ATOL[dtype_] * 4,
+        rtol=fmha.ck_decoder.FwOp.ERROR_RTOL[dtype_],
+    )
+
+def test_attn_bias_from_seqlens() -> None:
+    bias = fmha.attn_bias.BlockDiagonalMask.from_seqlens([3, 5, 1])
+    out = bias.split(torch.randn([1, 3 + 5 + 1, 16]))
+    assert len(out) == 3
+    assert tuple(out[0].shape) == (1, 3, 16)
+
+
+@cuda_only
+def test_attn_bias_blockdiag_doc() -> None:
+    """IMPORTANT:
+    This is the example in the doc for `BlockDiagonalMask`.
+    If this example needs to be updated, please also update the doc
+    """
+    import torch
+
+    from xformers.ops import fmha
+
+    K = 16
+    dtype = torch.float16
+    device = "cuda"
+    list_x = [
+        torch.randn([1, 3, 1, K], dtype=dtype, device=device),
+        torch.randn([1, 6, 1, K], dtype=dtype, device=device),
+        torch.randn([1, 2, 1, K], dtype=dtype, device=device),
+    ]
+    attn_bias, x = fmha.BlockDiagonalMask.from_tensor_list(list_x)
+
+    linear = torch.nn.Linear(K, K * 3).to(device=device, dtype=dtype)  # type: ignore
+
+    q, k, v = linear(x).reshape([1, -1, 1, 3, K]).unbind(-2)
+    out = fmha.memory_efficient_attention(q, k, v, attn_bias=attn_bias, op=(fmha.ck.FwOp, None))
+    list_out = attn_bias.split(out)
+    assert tuple(list_out[0].shape) == (1, 3, 1, K)
+
+
+@cuda_only
+class TestAttnBias:
+    @staticmethod
+    def create_tensors(
+        dtype,
+        B: int = 2,
+        Mq: int = 32,
+        Mkv: int = 32,
+        H: int = 3,
+        K: int = 16,
+        Kv: int = 16,
+    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
+        return (
+            torch.randn([B, Mq, H, K], device="cuda", dtype=dtype) * 3,
+            torch.randn([B, Mkv, H, K], device="cuda", dtype=dtype) * 3,
+            torch.randn([B, Mkv, H, Kv], device="cuda", dtype=dtype) * 3,
+            torch.randn([B, H, Mq, Mkv], device="cuda", dtype=dtype) * 3,
+        )
+
+    @staticmethod
+    def pad_bias(bias: torch.Tensor) -> torch.Tensor:
+        align_to = 16
+        if (bias.shape[-1] % align_to) == 0:
+            return bias
+        pad_count = align_to - (bias.shape[-1] % align_to)
+        return torch.nn.functional.pad(bias, [0, pad_count])[:, :, :, : bias.shape[-1]]
+
+    def test_f16_biasf32(self) -> None:
+        q, k, v, bias = self.create_tensors(torch.float16)
+        fmha.memory_efficient_attention(q, k, v, attn_bias=bias)
+        bias = bias.to(torch.float32)
+        with pytest.raises((ValueError, RuntimeError)):
+            fmha.memory_efficient_attention(q, k, v, attn_bias=bias)
+
+    def test_f32_biasf16(self) -> None:
+        q, k, v, bias = self.create_tensors(torch.float32)
+        fmha.memory_efficient_attention(q, k, v, attn_bias=bias)
+        bias = bias.to(torch.float16)
+        with pytest.raises((ValueError, RuntimeError)):
+            fmha.memory_efficient_attention(q, k, v, attn_bias=bias)
+
+    @pytest.mark.parametrize("dtype", [torch.float32, torch.float16])
+    def test_wrong_alignment(self, dtype) -> None:
+        op = fmha.cutlass.FwOp
+        q, k, v, bias = self.create_tensors(dtype, Mq=7, Mkv=5)
+        try:
+            fmha.memory_efficient_attention(q, k, v, attn_bias=bias, op=(op, None))
+            return
+        except (ValueError, RuntimeError):
+            pass
+        # This case is not supported, likely due to padding issues
+        # Let's make sure it works with padding
+        assert bias.ndim == 4, bias.shape
+        bias_padded = self.pad_bias(bias)
+        out = fmha.memory_efficient_attention(
+            q, k, v, attn_bias=bias_padded, op=(op, None)
+        ).float()
+        ref_out = ref_attention_bmhk(q, k, v, bias)
+        assert_allclose(
+            out, ref_out, atol=op.ERROR_ATOL[dtype], rtol=op.ERROR_RTOL[dtype]
+        )
+
+    def test_permuted_attn_bias(self) -> None:
+        op = fmha.cutlass.FwOp
+        dtype = torch.float16
+        q, k, v, bias = self.create_tensors(dtype, Mq=7, Mkv=7)
+        bias = bias.transpose(-1, -2)  # now `stride(-1) != 1`
+        # Either it works, or it raises an exception
+        # but we should never get a CUDA error
+        try:
+            out = fmha.memory_efficient_attention(
+                q, k, v, attn_bias=bias, op=(op, None)
+            ).float()
+            ref_out = ref_attention_bmhk(q, k, v, bias)
+            assert_allclose(
+                out, ref_out, atol=op.ERROR_ATOL[dtype], rtol=op.ERROR_RTOL[dtype]
+            )
+        except (ValueError, RuntimeError):
+            pass
+
+
+SM_AND_SHMEM_KBYTES = [
+    # https://docs.nvidia.com/cuda/cuda-c-programming-guide/#features-and-technical-specifications-technical-specifications-per-compute-capability
+    (50, 64),
+    (60, 64),
+    (70, 96),
+    (75, 64),
+    (80, 163),
+    (86, 99),
+    (89, 99),
+    # (90, 227),
+]
+
+
+@cuda_only
+@pytest.mark.parametrize("dtype_str", ["f32", "f16", "bf16"])
+@pytest.mark.parametrize(
+    "sm_shmem",
+    SM_AND_SHMEM_KBYTES,
+    ids=[f"cc{sm}_shmem{shmem}kb" for sm, shmem in SM_AND_SHMEM_KBYTES],
+)
+def test_has_kernel_for(sm_shmem: Tuple[int, int], dtype_str: str) -> None:
+    dtype = {"f32": torch.float, "f16": torch.half, "bf16": torch.bfloat16}[dtype_str]
+    sm, shmem_kbytes = sm_shmem
+    if sm < 80 and dtype_str == "bf16":
+        return
+
+    for k in [16, 32, 64, 128, 256]:
+        assert torch.ops.xformers._has_cutlassF_kernel_for(
+            dtype, sm, shmem_kbytes * 1024, k
+        ), f"k={k}"
+        assert torch.ops.xformers._has_cutlassB_kernel_for(
+            dtype, sm, shmem_kbytes * 1024, k
+        ), f"k={k}"
+
+
+def test_window_size_materialize() -> None:
+    seqlens = [4, 6]
+    attn_bias = fmha.attn_bias.BlockDiagonalMask.from_seqlens(
+        q_seqlen=seqlens,
+        kv_seqlen=seqlens,
+    ).make_local_attention(2)
+    mask = attn_bias.materialize(
+        (1, 1, sum(seqlens), sum(seqlens)),
+        device="cpu",
+        dtype=torch.float32,
+    )
+    true_mask = torch.log(
+        torch.Tensor(
+            [
+                [
+                    [
+                        [1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
+                        [1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
+                        [0.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
+                        [0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
+                        [0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0],
+                        [0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0],
+                        [0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 0.0],
+                        [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 0.0, 0.0],
+                        [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 0.0],
+                        [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0],
+                    ]
+                ]
+            ]
+        )
+    )
+    assert torch.all(mask == true_mask)
+
+
+@cuda_only
+@pytest.mark.parametrize(
+    "opFW_biasT",
+    [
+        (op, biasT)
+        for op in ALL_FW_OPS
+        for biasT in op.SUPPORTED_ATTN_BIAS_TYPES
+        if op.SUPPORTS_BMGHK
+    ],
+)
+def test_forward_gqa(opFW_biasT):
+    opFW, biasT = opFW_biasT
+    B_Mq_Mkv_H_K_Kv = (3, 512, 512, 16, 128, 128)
+    test_forward(
+        (
+            opFW,
+            "cuda",
+            torch.float16,
+            biasT,
+            *B_Mq_Mkv_H_K_Kv,
+        ),
+        packed=False,
+        fmt="BMGHK",
+        g=2,
+    )
+
+
+@cuda_only
+@pytest.mark.parametrize(
+    "opBW",
+    [
+        fmha.flash.BwOp,
+        fmha.cutlass.BwOp,
+    ],
+)
+def test_backward_gqa(opBW):
+    H = 8
+    B_Mq_Mkv_H_K_Kv = (3, 512, 512, H, 128, 128)
+    dtype = torch.float16
+    query, key, value, attn_bias = create_tensors(
+        *(opBW, "cuda", dtype, type(None), *B_Mq_Mkv_H_K_Kv),
+        attn_bias_requires_grad=False,
+        fmt="BMHK",
+    )
+    op = (fmha.cutlass.FwOp, opBW)
+    key = key[:, :, :1].expand(-1, -1, H, -1)
+    value = value[:, :, :1].expand(-1, -1, H, -1)
+    key.requires_grad_(True)
+    out = fmha.memory_efficient_attention(query, key, value, attn_bias=attn_bias)
+    out_ref = ref_attention_bmhk(query, key, value, attn_bias=attn_bias)
+    assert_allclose(
+        out.float(),
+        out_ref.float(),
+        atol=op[0].ERROR_ATOL[dtype],
+        rtol=op[0].ERROR_RTOL[dtype],
+    )
+    out.backward(query)
+    dk = key.grad
+    key.grad = None
+    out_ref.backward(query)
+    assert_allclose(
+        dk.float(),
+        key.grad.float(),
+        atol=op[1].ERROR_ATOL[dtype],
+        rtol=op[1].ERROR_RTOL[dtype],
+    )
+
+
+@cuda_only
+@pytest.mark.parametrize("opFW", [op for op in ALL_FW_OPS if op.SUPPORTS_BMGHK])
+def test_forward_gqa_one_group(opFW):
+    dtype = torch.float16
+    B, Mq, Mkv, H, K = 3, 13, 16, 5, 128
+    q = torch.randn([B, Mq, 1, H, K], dtype=dtype, device="cuda") * 3
+    k = torch.randn([B, Mkv, 1, H, K], dtype=dtype, device="cuda") * 3
+    v = torch.randn([B, Mkv, 1, H, K], dtype=dtype, device="cuda") * 3
+
+    supported = opFW.supports(fmha.Inputs(q, k, v))
+    if not supported:
+        supported_bmhk = opFW.supports(fmha.Inputs(q[:, :, 0], k[:, :, 0], v[:, :, 0]))
+        assert supported == supported_bmhk
+        pytest.skip("not supported")
+    out = fmha.memory_efficient_attention_forward(q, k, v, op=opFW)
+    ref = ref_attention(q, k, v)
+    assert_allclose(
+        out.float(),
+        ref,
+        atol=opFW.ERROR_ATOL[dtype],
+        rtol=opFW.ERROR_RTOL.get(dtype, 1e-5),
+    )
+
+'''
+@sm80_or_better_only
+def test_flash_gqa_wrong_strides() -> None:
+    op = (fmha.flash.FwOp, None)
+    device = "cuda"
+    B, Mq, Mkv, G, H, K = 3, 1, 512, 2, 8, 128
+    q = torch.empty((B, Mq, G, H, K), dtype=torch.float16, device=device)
+    kv = torch.empty((B, Mkv, G, H, K), dtype=torch.float16, device=device)
+    fmha.memory_efficient_attention(q, kv, kv, op=op)
+
+    kv = torch.empty((B, Mkv, H, G, K), dtype=torch.float16, device=device).permute(
+        0, 1, 3, 2, 4
+    )
+    with pytest.raises(ValueError):
+        fmha.memory_efficient_attention(q, kv, kv, op=op)
+
+    kv = torch.empty((B, Mkv, G, 1, K), dtype=torch.float16, device=device)
+    with pytest.raises(ValueError):
+        fmha.memory_efficient_attention(q, kv, kv, op=op)
+    kv = kv.expand(-1, -1, -1, H, K)
+    fmha.memory_efficient_attention(q, kv, kv, op=op)
+
+    kv = torch.empty((B, Mkv, G, H, 2 * K), dtype=torch.float16, device=device)[
+        :, :, :, :, :K
+    ]
+    fmha.memory_efficient_attention(q, kv, kv, op=op)
+'''
+
+def _dispatches_to_splitK(q, kv):
+    return (
+        _dispatch_fw_priority_list(fmha.Inputs(q, kv, kv), False)[0]
+        is fmha.triton_splitk.FwOp
+    )
+
+
+def _dispatches_to_flash_decoding(q, kv):
+    return (
+        _dispatch_fw_priority_list(fmha.Inputs(q, kv, kv), False)[0] is fmha.flash.FwOp
+    )
+
+
+def test_dispatch_decoding_bmhk() -> None:
+    assert not _dispatches_to_splitK(
+        torch.empty([1, 8, 1, 128]), torch.empty([1, 2048, 1, 128])
+    ), "Should not use SplitK with 1 head (no tensorcores)"
+    assert _dispatches_to_flash_decoding(
+        torch.empty([1, 8, 32, 128]),
+        torch.empty([1, 2048, 1, 128]).expand(-1, -1, 32, -1),
+    ), "Should use Flash-Decoding with BMHK MQA"
+    assert not _dispatches_to_splitK(
+        torch.empty([1, 8, 32, 128]),
+        torch.empty([1, 2048, 32, 128]),
+    ), "Should not use SplitK when no TensorCores"
+    assert not _dispatches_to_splitK(
+        torch.empty([1, 128, 32, 128]),
+        torch.empty([1, 2048, 1, 128]).expand(-1, -1, 32, -1),
+    ), "Should not use SplitK if q seqlen is long"
+    assert not _dispatches_to_splitK(
+        torch.empty([128, 8, 32, 128]),
+        torch.empty([128, 2048, 1, 128]).expand(-1, -1, 32, -1),
+    ), "Should not use SplitK if B is big"
+
+
+def test_dispatch_decoding_bmghk() -> None:
+    assert not _dispatches_to_splitK(
+        torch.empty([1, 8, 1, 1, 128]), torch.empty([1, 2048, 1, 1, 128])
+    ), "Should not use SplitK with 1 head (no tensorcores)"
+    assert _dispatches_to_flash_decoding(
+        torch.empty([1, 8, 1, 32, 128]),
+        torch.empty([1, 2048, 1, 1, 128]).expand(-1, -1, -1, 32, -1),
+    ), "Should use Flash-Decoding with MQA"
+    assert _dispatches_to_flash_decoding(
+        torch.empty([1, 8, 4, 32, 128]),
+        torch.empty([1, 2048, 4, 1, 128]).expand(-1, -1, -1, 32, -1),
+    ), "Should use Flash-Decoding with GQA"
+    assert not _dispatches_to_splitK(
+        torch.empty([1, 8, 1, 32, 128]),
+        torch.empty([1, 2048, 1, 32, 128]),
+    ), "Should not use SplitK when no TensorCores"
+    assert not _dispatches_to_splitK(
+        torch.empty([1, 128, 1, 32, 128]),
+        torch.empty([1, 2048, 1, 1, 128]).expand(-1, -1, -1, 32, -1),
+    ), "Should not use SplitK if q seqlen is long"
+    assert not _dispatches_to_splitK(
+        torch.empty([128, 8, 1, 32, 128]),
+        torch.empty([128, 2048, 1, 1, 128]).expand(-1, -1, -1, 32, -1),
+    ), "Should not use SplitK if B is big"
+
+
+shapes_triton_splitk = [
+    (1, 8, 2**16, 1, 128, 128),
+    (1, 4, 2**16, 1, 128, 128),
+    (1, 16, 2**16, 1, 128, 128),
+    (1, 16, 2**16, 1, 32, 32),
+    (1, 8, 1025, 1, 128, 128),
+    (2, 8, 4096, 1, 128, 128),
+    (10, 8, 2**16, 1, 128, 128),
+    (10, 15, 2**16, 1, 128, 128),
+    (1, 3, 2**16, 1, 128, 128),
+    (1, 3, 2**16 - 10, 1, 128, 128),
+    (2, 3, 73, 1, 128, 128),
+    (2, 7, 7328, 1, 128, 128),
+    (2, 7, 7328, 1, 120, 120),
+    (2, 7, 63, 1, 120, 120),
+]
+op_device_dtype_biasT_B_Mq_Mkv_H_K_Kv_splitk = [
+    (fmha.triton_splitk.FwOp, "cuda", torch.float16, type(None), *s)
+    for s in shapes_triton_splitk
+] + [
+    (fmha.triton_splitk.FwOp, "cuda", torch.bfloat16, type(None), *s)
+    for s in shapes_triton_splitk
+]
+
+
+@pytest.mark.parametrize(
+    "opFW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv",
+    op_device_dtype_biasT_B_Mq_Mkv_H_K_Kv_splitk,
+    ids=[make_id(*c) for c in op_device_dtype_biasT_B_Mq_Mkv_H_K_Kv_splitk],
+)
+@cuda_only
+def test_forward_splitk(
+    opFW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv,
+    packed=False,
+    fmt="BMHK",
+):
+    test_forward(opFW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv, packed=packed, fmt=fmt)
+
+
+@cuda_only
+@pytest.mark.parametrize("op", [fmha.triton_splitk.FwOp])
+@pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16])
+@pytest.mark.parametrize(
+    "B_Mkv_H_K",
+    [
+        (1, 2**16, 3, 128),
+        (5, 53, 4, 64),
+    ],
+)
+def test_mqa_decoding(op: Type[fmha.AttentionFwOpBase], dtype, B_Mkv_H_K):
+    B, Mkv, H, K = B_Mkv_H_K
+    q = torch.randn([B, 1, H, K], dtype=dtype, device="cuda") * 3
+    k = torch.randn([B, Mkv, 1, K], dtype=dtype, device="cuda") * 3
+    v = torch.randn([B, Mkv, 1, K], dtype=dtype, device="cuda") * 3
+    k = k.expand(-1, -1, H, -1)
+    v = v.expand(-1, -1, H, -1)
+
+    if not op.supports(fmha.Inputs(q, k, v)):
+        pytest.skip("not supported")
+    out = fmha.memory_efficient_attention_forward(q, k, v, op=op)
+    ref = ref_attention(q, k, v)
     assert_allclose(
         out.float(),
         ref,
@@ -756,3 +2029,204 @@ def test_mqa_forward(
         rtol=op.ERROR_RTOL.get(dtype, 1e-5),
     )
 
+
+@parametrize_opFW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv__xs
+def test_empty_tensors_empty_query(
+    opFW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv,
+):
+    query, key, value, attn_bias = create_tensors(
+        *opFW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv,
+        fmt="BMHK",
+    )
+    opFW = opFW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv[0]
+
+    query = query[:, :0]
+    query.requires_grad_(True)
+    key.requires_grad_(True)
+    value.requires_grad_(True)
+    out = xformers.ops.memory_efficient_attention(query, key, value, op=(opFW, None))
+    assert out.shape[1] == 0
+    out.backward(out)
+    # dK/dV should be all zeros
+    assert_allclose(key.grad, torch.zeros_like(key.grad), "key.grad")
+    assert_allclose(value.grad, torch.zeros_like(value.grad), "value.grad")
+
+
+@parametrize_opFW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv__xs
+def test_empty_tensors_empty_kv(
+    opFW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv,
+):
+    query, key, value, attn_bias = create_tensors(
+        *opFW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv,
+        fmt="BMHK",
+    )
+    opFW = opFW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv[0]
+
+    key = key[:, :0]
+    value = value[:, :0]
+    query.requires_grad_(True)
+    key.requires_grad_(True)
+    value.requires_grad_(True)
+    out = xformers.ops.memory_efficient_attention(query, key, value, op=(opFW, None))
+    assert_allclose(out, torch.zeros_like(out), "out")
+    out.backward(out)
+    # dQ should be all zeros
+    assert_allclose(query.grad, torch.zeros_like(query.grad), "query.grad")
+
+
+@parametrize_opFW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv__xs
+def test_empty_tensors_empty_b(
+    opFW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv,
+):
+    query, key, value, attn_bias = create_tensors(
+        *opFW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv,
+        fmt="BMHK",
+    )
+    opFW = opFW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv[0]
+
+    query, key, value = query[:0], key[:0], value[:0]
+    query.requires_grad_(True)
+    key.requires_grad_(True)
+    value.requires_grad_(True)
+    out = xformers.ops.memory_efficient_attention(query, key, value, op=(opFW, None))
+    out.backward(out)
+
+
+def test_local_attn_bias() -> None:
+    mask = (
+        fmha.attn_bias.LocalAttentionFromBottomRightMask(window_left=1, window_right=2)
+        .materialize(shape=(4, 4))
+        .exp()
+    )
+
+    expected = torch.tensor(
+        [[1, 1, 1, 0], [1, 1, 1, 1], [0, 1, 1, 1], [0, 0, 1, 1]], dtype=torch.float32
+    )
+    assert (mask == expected).all().item()
+
+
+@cuda_only
+@pytest.mark.parametrize("cc", [60, 70, 80])
+@pytest.mark.parametrize("maxK", [32, 64, 128, 256])
+@pytest.mark.parametrize("dtype", [torch.float32, torch.float16])
+@pytest.mark.parametrize(
+    "custom_mask_type",
+    [
+        fmha.cutlass._CustomMaskType.NoCustomMask,
+        fmha.cutlass._CustomMaskType.CausalFromTopLeft,
+        fmha.cutlass._CustomMaskType.CausalFromBottomRight,
+    ],
+)
+@pytest.mark.parametrize("window_size", [0, 3, 300])
+@pytest.mark.parametrize(
+    "num_queries,num_keys",
+    [
+        (30, 66),
+        (256, 256),
+        # Edge cases
+        (314, 320),
+        (32, 256),
+        (224, 226),
+        (5, 531),
+        (320, 332),  # for win_size=300
+        # Others
+        (256, 62),
+        (256, 63),
+        (256, 64),
+        (256, 65),
+        (256, 66),
+    ],
+)
+def test_cutlassB_iter_order(
+    dtype,
+    cc: int,
+    maxK: int,
+    num_queries: int,
+    num_keys: int,
+    custom_mask_type,
+    window_size,
+) -> None:
+    """
+    This tests some internals of the cutlassB kernel
+    We test the iteration across blocks of [queries, keys] to ensure
+    that we correctly:
+    * Iterate over all the blocks that should be iterated
+    * Do *not* iterate over blocks that are completely masked out
+    * Correctly compute the number of parallel blocks that will compute
+        the same block of dQ
+    .. and we test this across variable causal masks+local attention combinations
+    """
+    if (
+        window_size > 0
+        and custom_mask_type == fmha.cutlass._CustomMaskType.NoCustomMask
+    ):
+        pytest.skip("LocalAttention is only supported for causal")
+    get_iteration_data = partial(
+        torch.ops.xformers._cutlassB_iteration_data,
+        dtype=dtype,
+        cc=cc,
+        maxK=maxK,
+        num_queries=num_queries,
+        num_keys=num_keys,
+        custom_mask_type=custom_mask_type,
+        window_size=window_size,
+    )
+    bias = torch.zeros([num_queries, num_keys], dtype=torch.float32)
+    if custom_mask_type != fmha.cutlass._CustomMaskType.NoCustomMask:
+        bias = fmha.attn_bias._materialize_causal_mask(
+            (num_queries, num_keys),
+            dtype=torch.float32,
+            device="cpu",
+            window_size=None if window_size == 0 else window_size,
+            from_bottomright=(
+                custom_mask_type == fmha.cutlass._CustomMaskType.CausalFromBottomRight
+            ),
+        )
+
+    block_queries, block_keys = get_iteration_data()[:2]
+    mask_pooled = (
+        F.max_pool2d(bias.unsqueeze(0), (block_queries, block_keys), ceil_mode=True)
+        == 0
+    ).int()[0]
+    attn_computed = torch.zeros_like(mask_pooled)
+    for key_start in range(0, num_keys, block_keys):
+        it = 0
+        new_key_start = key_start
+        new_query_start = get_iteration_data(key_start=key_start)[2]
+        try:
+            expected_first_query = (
+                mask_pooled[:, key_start // block_keys].tolist().index(1)
+                * block_queries
+            )
+            assert (
+                new_query_start == expected_first_query
+            ), f"Wrong first query for K={key_start}: {new_query_start} (expected {expected_first_query})"
+        except ValueError:  # Nothing to compute in this column
+            pass
+
+        while new_key_start == key_start and new_query_start < num_queries:
+            query_start = new_query_start
+            attn_computed[query_start // block_queries, key_start // block_keys] += 1
+            # print(f"Compute [{query_start}, {key_start}]")
+
+            # Is there something to compute here?
+            assert mask_pooled[
+                query_start // block_queries, key_start // block_keys
+            ].item(), "Computing a block that is not needed!"
+            new_query_start, new_key_start = get_iteration_data(
+                key_start=key_start, query_start=query_start
+            )[3:5]
+            it += 1
+            assert it < num_queries, ""
+        assert (attn_computed == mask_pooled)[
+            :, key_start // block_keys
+        ].all(), "some blocks were not computed!"
+
+    # Now check that the number returned by `getNumParallelBlocksForQuery` is correct
+    for query_start in range(0, num_queries, block_queries):
+        num_parallel_blocks = get_iteration_data(
+            query_start=query_start, num_splits_key=num_keys
+        )[5]
+        num_actual = mask_pooled[query_start // block_queries].sum().item()
+        assert num_parallel_blocks == num_actual
+# end of file
diff --git a/tests/test_mqa_forward_ck_tiled.py b/tests/test_mqa_forward_ck_tiled.py
new file mode 100644
index 000000000..e3c1f488c
--- /dev/null
+++ b/tests/test_mqa_forward_ck_tiled.py
@@ -0,0 +1,673 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
+#
+# This source code is licensed under the BSD license found in the
+# LICENSE file in the root directory of this source tree.
+
+import math
+import random
+from typing import List, Optional, Sequence, Tuple, Type, TypeVar
+
+import pytest
+import torch
+from scipy.stats import binomtest
+from torch.utils.checkpoint import checkpoint
+
+import xformers.ops
+from xformers.ops import fmha
+from xformers.ops.fmha.common import AttentionOpBase
+
+from .utils import assert_allclose
+
+torch.backends.cuda.matmul.allow_tf32 = False
+cuda_only = pytest.mark.skipif(not torch.cuda.is_available(), reason="requires CUDA")
+
+_devices = ["cpu", "cuda"] if torch.cuda.is_available() else ["cpu"]
+_types = [torch.float16, torch.bfloat16]
+
+T = TypeVar(
+    "T", Type[fmha.common.AttentionFwOpBase], Type[fmha.common.AttentionBwOpBase]
+)
+
+ALL_FW_OPS: Sequence[Type[fmha.common.AttentionFwOpBase]] = [
+    fmha.ck.FwOp,
+]
+
+ALL_BW_OPS: Sequence[Type[fmha.common.AttentionBwOpBase]] = [
+    fmha.ck.BwOp,
+]
+
+def sample_random_supported_fw(
+    inp: fmha.Inputs, seed: int
+) -> Type[fmha.common.AttentionFwOpBase]:
+    r = random.Random(seed)
+    fw_ops = list(ALL_FW_OPS)
+    r.shuffle(fw_ops)
+    for op in fw_ops:
+        if op.supports(inp):
+            return op
+    raise NotImplementedError(f"Could not find a FW operator for: {inp}")
+
+
+def generate_test_shapes_B_Mq_Mkv_H_K_Kv(op):
+    shapes = []
+    for B in op._TEST_BATCH_SIZES:
+        for Mq in [32, 256]:
+            for Mkv in [32, 64, 256, 1024]:
+                for K in op._TEST_K:
+                    shapes.append((B, Mq, Mkv, 1, K, K))
+        Mq = 256
+        Mkv = 128
+        K = 32
+        H = 1
+        # Weird values of parameters
+        for M in [2, 3, 15, 31, 32, 34, 68, 72, 90, 132, 136]:
+            shapes.append((B, M, Mkv, H, K, K))
+            shapes.append((B, Mq, M, H, K, K))
+        for _K in [1, 2, 3, 31, 34, 36, 38, 40, 64, 80, 160, 256 + 2, 256 + 8, 512]:
+            if _K <= op.SUPPORTED_MAX_K:
+                shapes.append((B, Mq, Mkv, H, _K, _K))
+        # Different value for K / Kv
+        if op.SUPPORTS_DIFFERENT_VALUE_EMBED:
+            for _K in [32, 36, 64, 256 + 8]:
+                shapes.append((B, Mq, Mkv, H, K, _K))
+                shapes.append((B, Mq, Mkv, H, _K, K))
+        # Exotic sizes
+        for K in op._TEST_K:
+            shapes.append((B, 16, 1024, H, K, K))
+            shapes.append((B, 1024, 16, H, K, K))
+        # Some number of heads
+        for H in [3, 5, 12]:
+            shapes.append((max(1, B // H), Mq, Mkv, H, K, K))
+    # Filter-out not supported shapes
+    shapes = [
+        shape
+        for shape in shapes
+        if len(
+            op.shape_not_supported_reasons(
+                Mq=shape[1], Mkv=shape[2], K=shape[4], Kv=shape[5]
+            )
+        )
+        == 0
+    ]
+    # Add some random shapes
+    if op in [
+        fmha.ck.FwOp,
+        fmha.ck.BwOp,
+    ]:
+        K_CHOICES = [8 * i for i in range(1, 256 // 8)]
+        r = random.Random(0)
+        found_count = 0
+        while found_count < 20:
+            B = r.randint(1, 400)
+            Mq = r.randint(1, 500)
+            Mkv = r.randint(1, 500)
+            H = r.randint(2, 11)
+            B = max(B // H, 1)
+            K = r.choice(K_CHOICES)
+            Kv = r.choice(K_CHOICES)
+            if not op.SUPPORTS_DIFFERENT_VALUE_EMBED:
+                Kv = K
+            if len(op.shape_not_supported_reasons(Mq, Mkv, K, Kv)):
+                continue
+            found_count += 1
+            shapes.append((B, Mq, Mkv, H, K, Kv))
+    return shapes
+
+
+def make_id(op, device, dtype, bias_type, *shape):
+    return (
+        f"{op.NAME}-{device}-{str(dtype)}-{bias_type.__name__}"
+        f"-{'-'.join([str(s) for s in shape])}"
+    )
+
+
+def _generate_op_device_dtype_biasT_B_Mq_Mkv_H_K_Kv(
+    ops_list: Sequence[Type[fmha.AttentionOpBase]], max_shapes_per_op: int = 65000
+):
+    r = random.Random(0)
+    combination = []
+    ids = []
+    for op in ops_list:
+        op_count = 0
+        # Sort list of masks, so it's deterministic across runs
+        LIST_MASKS = list(sorted(op.SUPPORTED_ATTN_BIAS_TYPES, key=lambda x: str(x)))
+        for shape in generate_test_shapes_B_Mq_Mkv_H_K_Kv(op):
+            has_one = False
+            for device in _devices:
+                if device not in op.SUPPORTED_DEVICES:
+                    continue
+                for dtype in op.SUPPORTED_DTYPES:
+                    bias_type = r.choice(LIST_MASKS)
+                    # Avoid using too much memory
+                    if bias_type not in [
+                        type(None),
+                        fmha.attn_bias.LowerTriangularMask,
+                    ]:
+                        B, Mq, Mkv, H, K, Kv = shape
+                        B = min(B, 12)
+
+                        if (
+                            bias_type
+                            is fmha.attn_bias.BlockDiagonalCausalFromBottomRightMask
+                        ):
+                            Mq, Mkv = min(Mkv, Mq), max(Mkv, Mq) + 2
+                        elif (
+                            bias_type
+                            is fmha.attn_bias.BlockDiagonalCausalWithOffsetPaddedKeysMask
+                        ):
+                            Mq, Mkv = min(Mkv, Mq), max(Mkv, Mq)
+                        shape = (B, Mq, Mkv, H, K, Kv)
+                    combination.append((op, device, dtype, bias_type, *shape))
+                    ids.append(
+                        f"{op.NAME}-{device}-{str(dtype)}-{bias_type.__name__}"
+                        f"-{'-'.join([str(s) for s in shape])}"
+                    )
+                    has_one = True
+            if has_one:
+                op_count += 1
+            if op_count > max_shapes_per_op:
+                break
+        # Some specific shapes for which we want to run without any mask
+        bias_type = type(None)
+        for shape in (
+            # Some strides/dims don't fit on an uint16
+            (1, 128, 128, 300, 128, 128),
+            (13, 1, 67, 200, 8, 8),
+            (1, 1 + 2**16, 4, 1, 8, 8),
+            (1, 4, 1 + 2**16, 1, 8, 8),
+            # TODO: Some strides don't fit on an uint32
+            # Crashes on Flash, Errors on Cutlass
+            # (1, 1, 64000, 300, 128, 128)
+        ):
+            for device in _devices:
+                if device not in op.SUPPORTED_DEVICES:
+                    continue
+                for dtype in op.SUPPORTED_DTYPES:
+                    combination.append((op, device, dtype, bias_type, *shape))
+    return {
+        "argvalues": combination,
+        "ids": [make_id(*c) for c in combination],
+    }
+
+
+parametrize_opFW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv = pytest.mark.parametrize(
+    "opFW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv",
+    **_generate_op_device_dtype_biasT_B_Mq_Mkv_H_K_Kv(ALL_FW_OPS),
+)
+parametrize_opFW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv__xs = pytest.mark.parametrize(
+    "opFW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv",
+    **_generate_op_device_dtype_biasT_B_Mq_Mkv_H_K_Kv(ALL_FW_OPS, max_shapes_per_op=1),
+)
+parametrize_opBW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv = pytest.mark.parametrize(
+    "opBW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv",
+    **_generate_op_device_dtype_biasT_B_Mq_Mkv_H_K_Kv(ALL_BW_OPS),
+)
+parametrize_opBW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv__xs = pytest.mark.parametrize(
+    "opBW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv",
+    **_generate_op_device_dtype_biasT_B_Mq_Mkv_H_K_Kv(ALL_BW_OPS, max_shapes_per_op=1),
+)
+
+def ref_attention(q, k, v, attn_bias=None, drop_mask=None, p=0.0, scale=None, dtype=None):
+    if q.ndim == 4:
+        B, M, Hq, K = q.shape
+        _, N, Hkv, Kv = v.shape
+        nhead_ratio_qk = Hq // Hkv
+
+        def attn_bias_head(head: int):
+            if isinstance(attn_bias, torch.Tensor):
+                assert attn_bias.ndim == 4
+                _, H, _, _ = attn_bias.shape        
+                assert H == Hq
+                bias_bghmn = attn_bias.reshape(B, Hkv, nhead_ratio_qk, M, N)
+                return bias_bghmn[:, :, head]
+            if isinstance(attn_bias, fmha.attn_bias.LowerTriangularMaskWithTensorBias):
+                assert attn_bias._bias.ndim == 4
+                _, H, _, _ = attn_bias._bias.shape        
+                assert H == Hq
+                bias_bghmn = attn_bias._bias.reshape(B, Hkv, nhead_ratio_qk, M, N)
+
+                return fmha.attn_bias.LowerTriangularMaskWithTensorBias(
+                    bias_bghmn[:, :, head]
+                )
+            return attn_bias
+
+        q_bmghk = q.reshape((B, M, Hkv, nhead_ratio_qk, K))
+
+        return torch.stack(
+            [
+                ref_attention_bmhk(
+                    q_bmghk[:, :, :, h], k, v, attn_bias=attn_bias_head(h), dtype=dtype
+                )
+                for h in range(q_bmghk.shape[3])
+            ],
+            dim=3,
+        ).reshape((B, M, Hq, Kv))
+     
+    assert q.ndim == 3
+    if dtype is None:
+        dtype = torch.float32
+    q = q.to(dtype=dtype)
+    k = k.to(dtype=dtype)
+    v = v.to(dtype=dtype)
+
+    scale = scale if scale is not None else (q.shape[-1] ** -0.5)
+    q = q * scale
+
+    attn = q @ k.transpose(-2, -1)
+    if attn_bias is not None:
+        if isinstance(attn_bias, xformers.ops.AttentionBias):
+            # Always create in B,H,Mq,Mk format
+            attn_bias_tensor = attn_bias.materialize(
+                (q.shape[0], 1, q.shape[1], k.shape[1]),
+                device=q.device,
+                dtype=dtype,
+            )
+        else:
+            attn_bias_tensor = attn_bias.to(dtype=dtype)
+        if attn_bias_tensor.ndim == 4:
+            assert q.shape[0] == attn_bias_tensor.shape[0] * attn_bias_tensor.shape[1]
+            attn_bias_tensor = attn_bias_tensor.reshape(
+                [-1, *attn_bias_tensor.shape[2:]]
+            )
+        attn = attn + attn_bias_tensor
+    attn = attn.softmax(-1)
+    if drop_mask is not None:
+        attn = attn * (drop_mask / (1 - p))
+    return attn @ v
+
+
+def ref_attention_bmhk(q, k, v, attn_bias, scale=None, dtype=None) -> torch.Tensor:
+    assert q.ndim == 4
+
+    def T(t):
+        return t.permute((0, 2, 1, 3)).reshape(
+            [t.shape[0] * t.shape[2], t.shape[1], t.shape[3]]
+        )
+
+    if isinstance(attn_bias, xformers.ops.AttentionBias):
+        attn_bias = attn_bias.materialize(
+            (q.shape[0], q.shape[2], q.shape[1], k.shape[1]),
+            device=q.device,
+            dtype=torch.float32,
+        ).reshape([q.shape[0] * q.shape[2], q.shape[1], k.shape[1]])
+    out = ref_attention(T(q), T(k), T(v), attn_bias, scale=scale, dtype=dtype)
+    out = out.reshape([q.shape[0], q.shape[2], q.shape[1], v.shape[3]])
+    return out.permute((0, 2, 1, 3))
+
+
+def _rand_seqlens(
+    r: random.Random,
+    bs: int,
+    q_len: int,
+    kv_len: int,
+    more_keys_than_queries_per_block: bool,
+) -> Tuple[Sequence[int], Sequence[int]]:
+    """
+    Generates lists of lengths of query blocks and corresponding key blocks.
+    The total number of queries will be bs * q_len and the
+    total number of keys will be bs * kv_len.
+    """
+    if more_keys_than_queries_per_block:
+        assert kv_len >= q_len
+    q_len *= bs
+    kv_len *= bs
+    seqlens_q: List[int] = []
+    seqlens_k: List[int] = []
+
+    step_q = [max(1, q_len // 10), max(2, q_len // 2)]
+    step_k = [max(1, kv_len // 10), max(2, kv_len // 2)]
+    while sum(seqlens_q) < q_len and sum(seqlens_k) < kv_len:
+        num_queries = r.randrange(*step_q)
+        seqlens_q.append(num_queries)
+
+        if more_keys_than_queries_per_block:
+            # Must select at least `num_queries` keys
+            # But also leave enough keys for later
+            keys_left = kv_len - sum(seqlens_k, 0)
+            queries_left = q_len - sum(seqlens_q[:-1], 0)
+            assert keys_left >= queries_left
+            seqlens_k.append(num_queries + r.randrange(0, keys_left - queries_left))
+        else:
+            seqlens_k.append(r.randrange(*step_k))
+    seqlens_q[-1] = q_len - sum(seqlens_q[:-1])
+    seqlens_k[-1] = kv_len - sum(seqlens_k[:-1])
+    return seqlens_q, seqlens_k
+
+
+def _rand_partition(r: random.Random, total: int, n: int) -> List[int]:
+    # returns list of n nonnegative integers summing to total
+    idx = {0, total}
+    while len(idx) < n + 1:
+        idx.add(r.randint(1, total - 1))
+    s = sorted(idx)
+    return [e - b for b, e in zip(s[:-1], s[1:])]
+
+
+def _rand_maxed_partition(
+    r: random.Random, total: int, n: int, mx: int, positive: bool = True
+) -> List[int]:
+    # returns list of n nonnegative integers less than mx summing to total
+    # NB: This is unfortunately biased towards evenly-split bins.
+    # If `positive`, outputs are positive
+    if positive:
+        total -= n
+        mx -= 1
+    idxs = r.sample(range(n * mx), total)
+    y = torch.zeros(n, mx, dtype=torch.int32)
+    y.flatten()[idxs] = 1
+    z = y.sum(1)
+    if positive:
+        z += 1
+    return z.tolist()
+
+
+def _rand_seqlens_padded_k(
+    r: random.Random, bs: int, q_len: int, kv_len: int
+) -> Tuple[Sequence[int], Sequence[int]]:
+    # This is for BlockDiagonalCausalWithOffsetPaddedKeysMask.
+    # we need q_seqlens and k_seqlens to be of len bsz.
+    # For each "batch element" there must be more keys than queries
+    # because this bias type is "bottom right" and so any extra queries
+    # will attend to nothing and have undefined result.
+    # In addition every element of k_seqlens must be <= kv_len
+    if q_len > kv_len:
+        raise ValueError("need more keys than values")
+    if q_len == kv_len:
+        # all key slots are needed so we cannot have padding
+        q_seqlens = k_seqlens = [kv_len] * bs
+    else:
+        q_seqlens = _rand_maxed_partition(r, q_len * bs, bs, kv_len)
+        k_seqlens = [r.randint(i, kv_len) for i in q_seqlens]
+    return q_seqlens, k_seqlens
+
+
+def _create_aligned_bias(B: int, H: int, Mq: int, Mkv: int, **kwargs) -> torch.Tensor:
+    align_to = 8
+    return (
+        torch.randn(
+            (
+                B,
+                H,
+                Mq,
+                align_to * ((Mkv + align_to - 1) // align_to),
+            ),
+            **kwargs,
+        )
+        * 3
+    )[:, :, :, :Mkv]
+
+
+def create_attn_bias(
+    bias_type,
+    batch_size: int,
+    num_heads: int,
+    q_len: int,
+    kv_len: int,
+    device,
+    dtype,
+    requires_grad: bool,
+    fmt: str,
+    op: Type[AttentionOpBase],
+):
+    if bias_type is None or isinstance(None, bias_type):
+        return None
+    r = random.Random("-".join(map(str, [batch_size, q_len, kv_len, dtype, fmt])))
+    if bias_type is torch.Tensor:
+        if fmt == "BMK":
+            batch_size *= num_heads
+            num_heads = 1
+        # `small_k` only supports an expanded 1d bias
+        if op in [fmha.small_k.FwOp, fmha.small_k.BwOp]:
+            attn_bias = (
+                torch.randn(
+                    (batch_size, num_heads, 1, kv_len), device=device, dtype=dtype
+                )
+                * 3
+            )
+            attn_bias = attn_bias.expand(batch_size, num_heads, q_len, kv_len)
+        else:
+            attn_bias = _create_aligned_bias(
+                batch_size,
+                num_heads,
+                q_len,
+                kv_len,
+                device=device,
+                dtype=dtype,
+            )
+            # ToDo: need a fix in ck-flashAttn to avoid divided-by-zero when all-(-inf) occurred
+            #       with the data read by one-thread
+            # make sure it also works if the first columns are partially masked out
+            ## attn_bias[0, 0, q_len - 1 :, : num_heads - 2] = -math.inf
+
+        if requires_grad:
+            attn_bias.requires_grad_(True)
+        if fmt == "BMK":
+            attn_bias = attn_bias[:, 0]
+        return attn_bias
+    if bias_type is fmha.attn_bias.LowerTriangularMask:
+        return fmha.attn_bias.LowerTriangularMask()
+    if bias_type is fmha.attn_bias.LowerTriangularMaskWithTensorBias:
+        attn_bias = _create_aligned_bias(
+            batch_size,
+            num_heads,
+            q_len,
+            kv_len,
+            device=device,
+            dtype=dtype,
+        )
+        if requires_grad:
+            attn_bias.requires_grad_(True)
+        return fmha.attn_bias.LowerTriangularMaskWithTensorBias(attn_bias)
+    if bias_type in [
+        fmha.attn_bias.BlockDiagonalMask,
+        fmha.attn_bias.BlockDiagonalCausalMask,
+        fmha.attn_bias.BlockDiagonalCausalFromBottomRightMask,
+    ]:
+        # This bias is not supported in BMK format
+        assert fmt == "BMHK"
+        block_diag = fmha.attn_bias.BlockDiagonalMask.from_seqlens(
+            *_rand_seqlens(
+                r,
+                batch_size,
+                q_len,
+                kv_len,
+                more_keys_than_queries_per_block=bias_type
+                is fmha.attn_bias.BlockDiagonalCausalFromBottomRightMask,
+            )
+        )
+        if bias_type is fmha.attn_bias.BlockDiagonalCausalMask:
+            block_diag = block_diag.make_causal()
+        if bias_type is fmha.attn_bias.BlockDiagonalCausalFromBottomRightMask:
+            block_diag = block_diag.make_causal_from_bottomright()
+        return block_diag
+    if bias_type == fmha.attn_bias.BlockDiagonalCausalWithOffsetPaddedKeysMask:
+        assert fmt == "BMHK"
+        q, k = _rand_seqlens_padded_k(r, batch_size, q_len, kv_len)
+        g_block_diag = (
+            fmha.attn_bias.BlockDiagonalCausalWithOffsetPaddedKeysMask.from_seqlens(
+                q_seqlen=q,
+                kv_padding=kv_len,
+                kv_seqlen=k,
+            )
+        )
+        return g_block_diag
+
+    assert False, f"Unsupported bias type: {bias_type}"
+
+
+def get_bias_grad(attn_bias, clear: bool = False) -> Optional[torch.Tensor]:
+    tensor_with_grad: Optional[torch.Tensor] = None
+    if isinstance(attn_bias, torch.Tensor):
+        tensor_with_grad = attn_bias
+    if isinstance(attn_bias, fmha.attn_bias.LowerTriangularMaskWithTensorBias):
+        tensor_with_grad = attn_bias._bias
+    if tensor_with_grad is not None:
+        grad = tensor_with_grad.grad
+        if clear:
+            tensor_with_grad.grad = None
+        return grad
+    return None
+
+
+def create_tensors(
+    op: Type[AttentionOpBase],
+    device,
+    dtype,
+    attn_bias_type,
+    B,
+    q_len,
+    kv_len,
+    h,
+    k,
+    kv,
+    *,
+    attn_bias_requires_grad: bool = False,
+    fmt: str = "BMK",
+):
+    torch.manual_seed(B * q_len + kv_len * k + kv)
+    scale = 3
+    if fmt == "BMK":
+        query = torch.randn((B * h, q_len, k), device=device, dtype=dtype).mul_(scale)
+        key = torch.randn((B * h, kv_len, k), device=device, dtype=dtype).mul_(scale)
+        value = torch.randn((B * h, kv_len, kv), device=device, dtype=dtype).mul_(scale)
+    else:
+        assert fmt == "BMHK"
+        query = torch.randn((B, q_len, h, k), device=device, dtype=dtype).mul_(scale)
+        key = torch.randn((B, kv_len, h, k), device=device, dtype=dtype).mul_(scale)
+        value = torch.randn((B, kv_len, h, kv), device=device, dtype=dtype).mul_(scale)
+
+    if fmt == "BMK" and not fmha.common._is_bias_type_supported_in_BMK(attn_bias_type):
+        attn_bias_type = None
+    attn_bias = None
+    if attn_bias_type is not None:
+        attn_bias = create_attn_bias(
+            attn_bias_type,
+            batch_size=B,
+            num_heads=h,
+            q_len=q_len,
+            kv_len=kv_len,
+            dtype=dtype,
+            device=device,
+            requires_grad=attn_bias_requires_grad,
+            fmt=fmt,
+            op=op,
+        )
+        if isinstance(
+            attn_bias,
+            (
+                fmha.attn_bias.BlockDiagonalMask,
+                fmha.attn_bias.BlockDiagonalCausalWithOffsetPaddedKeysMask,
+            ),
+        ):
+            query, key, value = [
+                x.reshape([1, -1, *x.shape[2:]]) for x in [query, key, value]
+            ]
+
+    inputs = fmha.Inputs(query=query, key=key, value=value, attn_bias=attn_bias)
+    reasons = op.not_supported_reasons(inputs)
+    if reasons:
+        err_msg = f"{op.NAME}: unsupported ({'/'.join(reasons)})"
+        # Ensure we free memory to avoid OOMs
+        del query, key, value, attn_bias, inputs
+        pytest.skip(err_msg)
+    return query, key, value, attn_bias
+
+
+def bmhk2bmk(tensor) -> torch.Tensor:
+    return (
+        tensor.permute((0, 2, 1, 3))
+        .contiguous()
+        .view([tensor.shape[0] * tensor.shape[2], tensor.shape[1], tensor.shape[3]])
+    )
+
+
+def bmk2bmhk(tensor, num_heads: int) -> torch.Tensor:
+    return tensor.reshape([-1, num_heads, tensor.shape[1], tensor.shape[2]]).permute(
+        (0, 2, 1, 3)
+    )
+
+@pytest.mark.parametrize("hdim_k,hdim_v", [(64, 64), (128, 128)])
+@pytest.mark.parametrize("nhead_q,nhead_kv", [(8, 1), (8, 2), (12, 4), (4, 4)])
+@pytest.mark.parametrize("seqlen_q,seqlen_kv", [(100, 128), (128, 100), (200, 1000), (400, 300)])
+@pytest.mark.parametrize("batches", [100, 64, 1])
+@pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16])
+@pytest.mark.parametrize("attn_bias_type", [type(None), torch.Tensor, fmha.attn_bias.LowerTriangularMask])
+@pytest.mark.parametrize("op", [fmha.ck.FwOp])
+def test_mqa_forward(
+    op,
+    attn_bias_type,
+    dtype, 
+    batches: int, 
+    seqlen_kv: int, 
+    seqlen_q: int, 
+    nhead_kv: int, 
+    nhead_q: int, 
+    hdim_v: int, 
+    hdim_k: int, 
+):
+    B = batches
+    M = seqlen_q
+    N = seqlen_kv
+    Hq = nhead_q
+    Hkv = nhead_kv
+    K = hdim_k
+    Kv = hdim_v
+
+    print("Hq=", Hq, "Hkv=", Hkv)
+
+    device = torch.device("cuda")
+
+    if not (K == Kv and (Kv == 64 or Kv == 128)):
+        pytest.skip("only head-dim size 64 or 128 supported by ck-tiled!")
+
+    if Kv > 128:
+        pytest.skip("kv > 128 is not supported by CK-FlashAttention")
+
+    scale = 3
+    query = torch.randn((B, M, Hq, K), device=device, dtype=dtype).mul_(scale)
+    key = torch.randn((B, N, Hkv, K), device=device, dtype=dtype).mul_(scale)
+    value = torch.randn((B, N, Hkv, Kv), device=device, dtype=dtype).mul_(scale)
+
+    attn_bias = None
+    if attn_bias_type is not None:
+        attn_bias = create_attn_bias(
+            attn_bias_type,
+            batch_size=B,
+            num_heads=Hq,
+            q_len=M,
+            kv_len=N,
+            dtype=dtype,
+            device=device,
+            requires_grad=False,
+            fmt="BMHK",
+            op=op,
+        )
+
+    inputs = fmha.Inputs(query=query, key=key, value=value, attn_bias=attn_bias)
+    reasons = op.not_supported_reasons(inputs)
+    if reasons:
+        err_msg = f"{op.NAME}: unsupported ({'/'.join(reasons)})"
+        # Ensure we free memory to avoid OOMs
+        del query, key, value, attn_bias, inputs
+
+    out = xformers.ops.memory_efficient_attention_forward(
+        query, key, value, attn_bias, op=op
+    )
+    assert not out.isnan().any(), ("Output has NaNs", attn_bias)
+    out2 = xformers.ops.memory_efficient_attention_forward(
+        query, key, value, attn_bias, op=op
+    )
+    assert torch.allclose(out, out2, atol=0.0, rtol=0.0), (
+        "Non-deterministic behavior",
+        attn_bias,
+    )
+
+    ref = ref_attention(query, key, value, attn_bias)
+    assert out.shape == ref.shape, out.shape
+    assert_allclose(
+        out.float(),
+        ref,
+        atol=op.ERROR_ATOL[dtype],
+        rtol=op.ERROR_RTOL.get(dtype, 1e-5),
+    )
+
diff --git a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_forward_kernel.h b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_forward_kernel.h
index 94b36c235..856e64651 100644
--- a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_forward_kernel.h
+++ b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_forward_kernel.h
@@ -522,24 +522,21 @@ struct FmhaFwdKernel
                 {
                     if(kargs.mask_type == CausalMaskType::MaskDisabled)
                     {
-                        ck::index_t lr_size = kargs.window_size / 2;
+                        ck::index_t left_size  = kargs.window_size / 2;
+                        ck::index_t right_size = kargs.window_size - 1 - left_size;
 
                         res = ck::make_generic_attention_mask_coordinates_from_lr_window(
-                            lr_size, lr_size, kargs.seqlen_q, kargs.seqlen_k);
+                            left_size, right_size, kargs.seqlen_q, kargs.seqlen_k);
                     }
                     else if(kargs.mask_type == CausalMaskType::MaskUpperTriangleFromTopLeft)
                     {
-                        ck::index_t lr_size = kargs.window_size / 2;
-
                         res = ck::make_generic_attention_mask_coordinates_from_lr_window(
-                            lr_size, 0, kargs.seqlen_q, kargs.seqlen_k, true);
+                            kargs.window_size - 1, 0, kargs.seqlen_q, kargs.seqlen_k, true);
                     }
                     else if(kargs.mask_type == CausalMaskType::MaskUpperTriangleFromBottomRight)
                     {
-                        ck::index_t lr_size = kargs.window_size / 2;
-
                         res = ck::make_generic_attention_mask_coordinates_from_lr_window(
-                            lr_size, 0, kargs.seqlen_q, kargs.seqlen_k, false);
+                            kargs.window_size - 1, 0, kargs.seqlen_q, kargs.seqlen_k, false);
                     }
                 }
                 else
diff --git a/xformers/ops/fmha/ck.py b/xformers/ops/fmha/ck.py
index 3cb4ed014..67e71ccd6 100644
--- a/xformers/ops/fmha/ck.py
+++ b/xformers/ops/fmha/ck.py
@@ -4,8 +4,10 @@
 # LICENSE file in the root directory of this source tree.
 
 
+from dataclasses import replace
 from enum import Enum
-from typing import Any, List, Mapping, Optional, Set, Tuple, Union
+from functools import partial
+from typing import Any, List, Optional, Set, Tuple, Union, Mapping
 
 import torch
 
@@ -13,9 +15,13 @@
 from . import attn_bias
 from .attn_bias import (
     AttentionBias,
+    BlockDiagonalCausalLocalAttentionFromBottomRightMask,
+    BlockDiagonalCausalLocalAttentionMask,
     BlockDiagonalCausalMask,
     BlockDiagonalCausalWithOffsetPaddedKeysMask,
     BlockDiagonalMask,
+    LowerTriangularFromBottomRightLocalAttentionMask,
+    LowerTriangularFromBottomRightMask,
     LowerTriangularMask,
     LowerTriangularMaskWithTensorBias,
 )
@@ -25,29 +31,34 @@
     Context,
     Gradients,
     Inputs,
+    _attn_bias_apply,
     check_lastdim_alignment_stride1,
 )
 
 def _minimum_gemm_alignment(inp: Inputs) -> int:
     return 1
 
-
 def _get_seqlen_info(
     inp: Inputs,
-) -> Tuple[Optional[torch.Tensor], Optional[torch.Tensor]]:
+) -> Tuple[Optional[torch.Tensor], Optional[torch.Tensor], int, int]:
     attn_bias = inp.attn_bias
     if isinstance(
         attn_bias, (BlockDiagonalMask, BlockDiagonalCausalWithOffsetPaddedKeysMask)
     ):
+        ##attn_bias.k_seqinfo.to(inp.query.device)
+        ##attn_bias.q_seqinfo.to(inp.query.device)
         seqstart_k = attn_bias.k_seqinfo.seqstart
         seqstart_q = attn_bias.q_seqinfo.seqstart
         max_seqlen_q = attn_bias.q_seqinfo.max_seqlen
+        ##max_seqlen_k = attn_bias.k_seqinfo.max_seqlen
     else:
         seqstart_k = None
         seqstart_q = None
         max_seqlen_q = -1
+        ##max_seqlen_k = -1
+
+    return seqstart_k, seqstart_q, max_seqlen_q,
 
-    return seqstart_k, seqstart_q, max_seqlen_q
 
 def _get_tensor_bias(
     attn_bias: Optional[Union[torch.Tensor, AttentionBias]]
@@ -100,7 +111,6 @@ def _check_large_shapes(reasons: List[str], inp: Inputs) -> None:
             "Input is too large: product of first two dimensions of q/k/v must be < 2**20"
         )
 
-
 class _CustomMaskType(int, Enum):
     """
     (Matches CustomMaskType in C++.)
@@ -117,14 +127,18 @@ def _custom_mask_type(bias: Optional[Union[torch.Tensor, AttentionBias]]) -> int
         (
             LowerTriangularMask,
             BlockDiagonalCausalMask,
+            BlockDiagonalCausalLocalAttentionMask,
         ),
     ):
         return int(_CustomMaskType.CausalFromTopLeft)
     if isinstance(
         bias,
         (
+            LowerTriangularFromBottomRightMask,
+            LowerTriangularFromBottomRightLocalAttentionMask,
             attn_bias.BlockDiagonalCausalFromBottomRightMask,
             BlockDiagonalCausalWithOffsetPaddedKeysMask,
+            BlockDiagonalCausalLocalAttentionFromBottomRightMask,
         ),
     ):
         return int(_CustomMaskType.CausalFromBottomRight)
@@ -134,26 +148,48 @@ def _custom_mask_type(bias: Optional[Union[torch.Tensor, AttentionBias]]) -> int
 @register_operator
 class FwOp(AttentionFwOpBase):
     """xFormers' MHA kernel based on Composable Kernel.
-    Supports AMD MI 200 and MI 300 GPUs
     """
 
+    ### ck_check_op is temporarily used to check ck-tiled availability
+    ck_check_op = get_xformers_operator("is_ck_tiled_used")
+    use_ck_tiled = ck_check_op()
+ 
     OPERATOR = get_xformers_operator("efficient_attention_forward_ck")
     SUPPORTED_DEVICES: Set[str] = {"cuda"}
     SUPPORTED_DTYPES: Set[torch.dtype] = {torch.half, torch.bfloat16}
     SUPPORTED_MAX_K = 65536
-    SUPPORTED_ATTN_BIAS_TYPES: Set[Any] = {
-        type(None),
-        torch.Tensor,
-        LowerTriangularMask,
-        LowerTriangularMaskWithTensorBias,
-        BlockDiagonalMask,
-        BlockDiagonalCausalMask,
-        BlockDiagonalCausalWithOffsetPaddedKeysMask,
-        attn_bias.BlockDiagonalCausalFromBottomRightMask,
-    }
+    
+    if use_ck_tiled: 
+        SUPPORTED_ATTN_BIAS_TYPES: Set[Any] = {
+            type(None),
+            torch.Tensor,
+            LowerTriangularMask,
+            LowerTriangularFromBottomRightMask,
+            LowerTriangularFromBottomRightLocalAttentionMask,
+            LowerTriangularMaskWithTensorBias,
+            BlockDiagonalMask,
+            BlockDiagonalCausalMask,
+            BlockDiagonalCausalWithOffsetPaddedKeysMask,
+            attn_bias.BlockDiagonalCausalFromBottomRightMask,
+            attn_bias.BlockDiagonalCausalLocalAttentionMask,
+            BlockDiagonalCausalLocalAttentionFromBottomRightMask,
+        }
+    else:
+        SUPPORTED_ATTN_BIAS_TYPES: Set[Any] = {
+            type(None),
+            torch.Tensor,
+            LowerTriangularMask,
+            LowerTriangularMaskWithTensorBias,
+            BlockDiagonalMask,
+            BlockDiagonalCausalMask,
+            BlockDiagonalCausalWithOffsetPaddedKeysMask,
+            attn_bias.BlockDiagonalCausalFromBottomRightMask,
+         }
+
     SUPPORTS_DROPOUT = True
     SUPPORTS_CUSTOM_SCALE = True
     SUPPORTS_DIFFERENT_VALUE_EMBED = True
+    SUPPORTS_BMGHK = True
     NAME = "ckF"
 
     ERROR_ATOL: Mapping[torch.dtype, float] = {
@@ -176,6 +212,70 @@ class FwOp(AttentionFwOpBase):
     @classmethod
     def apply(
         cls, inp: Inputs, needs_gradient: bool
+    ) -> Tuple[torch.Tensor, Optional[Context]]:
+        if type(inp.attn_bias) not in FwOp.SUPPORTED_ATTN_BIAS_TYPES:
+            raise NotImplementedError("Unsupported attn_bias type")
+        if inp.query.ndim in [3, 4]:
+            return cls.apply_bmhk(inp, needs_gradient=needs_gradient)
+        assert inp.query.ndim == 5, f"query has shape {inp.query.shape}"
+        ctx: Optional[Context] = None
+        # XXX: Hackfix for BMGHK with H=1
+        # In that case we don't want to run G different streams because it adds
+        # some overhead
+        if inp.query.ndim == 5 and inp.query.shape[3] == 1:
+            slice_op = partial(torch.squeeze, dim=3)
+            inp = replace(
+                inp,
+                query=slice_op(inp.query),
+                key=slice_op(inp.key),
+                value=slice_op(inp.value),
+                attn_bias=_attn_bias_apply(
+                    inp.attn_bias, partial(torch.squeeze, dim=2)
+                ),
+            )
+            out, ctx = cls.apply_bmhk(inp, needs_gradient=needs_gradient)
+            out = out.unsqueeze(3)
+            if ctx is not None:
+                ctx = replace(ctx, lse=ctx.lse.unsqueeze(1), out=out)
+            return out, ctx
+
+        # Workaround until this is properly implemented in C++
+        # run each head group in a different stream
+        n_groups = inp.key.shape[2]
+        main_stream = torch.cuda.current_stream()
+        streams = [main_stream] + [
+            torch.cuda.Stream(device=inp.query.device) for _ in range(n_groups - 1)
+        ]
+        outs = []
+        for group, stream in enumerate(streams):
+            stream.wait_stream(main_stream)
+            with torch.cuda.stream(stream):
+                query = inp.query[:, :, group]
+                key = inp.key[:, :, group]
+                value = inp.value[:, :, group]
+                bias = _attn_bias_apply(
+                    inp.attn_bias, partial(torch.select, dim=1, index=group)
+                )
+                outs.append(
+                    cls.apply_bmhk(
+                        replace(inp, query=query, key=key, value=value, attn_bias=bias),
+                        needs_gradient=needs_gradient,
+                    )
+                )
+        for s in streams[1:]:
+            main_stream.wait_stream(s)
+        out = torch.stack([o[0] for o in outs], dim=2)
+        if needs_gradient:
+            ctx = Context(
+                out=out,
+                lse=torch.stack([o[1].lse for o in outs], dim=1),  # type: ignore
+                op_bw=outs[0][1].op_bw,  # type: ignore
+            )
+        return out, ctx
+
+    @classmethod
+    def apply_bmhk(
+        cls, inp: Inputs, needs_gradient: bool
     ) -> Tuple[torch.Tensor, Optional[Context]]:
         if type(inp.attn_bias) not in FwOp.SUPPORTED_ATTN_BIAS_TYPES:
             raise NotImplementedError("Unsupported attn_bias type")
@@ -195,8 +295,18 @@ def apply(
             seqlen_k=inp.attn_bias.k_seqinfo.seqlen_cpu
             if isinstance(inp.attn_bias, BlockDiagonalCausalWithOffsetPaddedKeysMask)
             else None,
-            window_size=0,
+            window_size=inp.attn_bias._window_size
+            if isinstance(
+                inp.attn_bias,
+                (
+                    BlockDiagonalCausalLocalAttentionMask,
+                    BlockDiagonalCausalLocalAttentionFromBottomRightMask,
+                    LowerTriangularFromBottomRightLocalAttentionMask,
+                ),
+            )
+            else None,
         )
+
         ctx: Optional[Context] = None
         if needs_gradient:
             ctx = Context(
@@ -233,6 +343,7 @@ def operator_flop(
         b,
         seqstart_q,
         seqstart_k,
+        max_seqlen_q_,
         compute_lse,
         custom_mask_type,
         *a,
@@ -259,11 +370,16 @@ class BwOp(AttentionBwOpBase):
         type(None),
         torch.Tensor,
         LowerTriangularMask,
+        LowerTriangularFromBottomRightMask,
+        # TODO: Still some infs/nans in the BW pass for
+        # local + causal
+        # LowerTriangularFromBottomRightLocalAttentionMask,
         # TODO: Fix handling of gradient through the fMHA autograd function
         # LowerTriangularMaskWithTensorBias,
         BlockDiagonalMask,
         BlockDiagonalCausalMask,
         attn_bias.BlockDiagonalCausalFromBottomRightMask,
+        attn_bias.BlockDiagonalCausalLocalAttentionMask,
     }
     SUPPORTS_ATTN_BIAS_GRAD = True
     SUPPORTS_DROPOUT = FwOp.SUPPORTS_DROPOUT
@@ -271,14 +387,6 @@ class BwOp(AttentionBwOpBase):
     SUPPORTS_DIFFERENT_VALUE_EMBED = FwOp.SUPPORTS_DIFFERENT_VALUE_EMBED
     NAME = "ckB"
 
-    ERROR_ATOL: Mapping[torch.dtype, float] = {
-        torch.float: 5e-4,
-        # increased from 9e-2, more opportunities for numerical errors when bias is
-        # used, noticed in gK on SM80
-        torch.half: 1e-1,
-        torch.bfloat16: 7e-1,
-    }
-
     _TEST_K: List[int] = [
         32,  # 64x64 kernel
         128,  # 64x128/128x128 kernel
@@ -323,7 +431,7 @@ def apply(cls, ctx: Context, inp: Inputs, grad: torch.Tensor) -> Gradients:
         if type(inp.attn_bias) not in BwOp.SUPPORTED_ATTN_BIAS_TYPES:
             raise NotImplementedError("Unsupported attn_bias type")
 
-        seqstart_k, seqstart_q, max_seqlen_q = _get_seqlen_info(inp)
+        seqstart_k, seqstart_q, max_seqlen_q, max_seqlen_k = _get_seqlen_info(inp)
         dtype = inp.query.dtype
 
         rng_seed = rng_offset = 0
@@ -361,6 +469,7 @@ def apply(cls, ctx: Context, inp: Inputs, grad: torch.Tensor) -> Gradients:
             custom_mask_type=_custom_mask_type(inp.attn_bias),
             scale=inp.scale,
         )
+        
 
         # c++/CUDA implementation returns an uninitialized tensor if bias doesn't
         # require grad
@@ -382,6 +491,8 @@ def operator_flop(
         b,
         cu_seqlens_q,
         cu_seqlens_k,
+        max_seqlen_q,
+        max_seqlen_k,
         logsumexp,
         output,
         dropout_p,

From a27403c4d3f4ed74a8bd7e3dc2c0cd89bc79cc68 Mon Sep 17 00:00:00 2001
From: Qianfeng Zhang <Qianfeng.Zhang@amd.com>
Date: Wed, 10 Jan 2024 17:59:14 +0000
Subject: [PATCH 340/641] Synchronize submodule composable_kernel to the latest
 commits

---
 third_party/composable_kernel | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/third_party/composable_kernel b/third_party/composable_kernel
index 5f4e6ec00..719219b9f 160000
--- a/third_party/composable_kernel
+++ b/third_party/composable_kernel
@@ -1 +1 @@
-Subproject commit 5f4e6ec00d12654e3897f53b48307434cd25a02f
+Subproject commit 719219b9f1f4143e5fdd657dd16b704a22821766

From dfc2618a710f4ffaf7d72f4b790e24b536a3be8f Mon Sep 17 00:00:00 2001
From: Qianfeng Zhang <Qianfeng.Zhang@amd.com>
Date: Wed, 10 Jan 2024 18:02:28 +0000
Subject: [PATCH 341/641] Make the efficient_attention_forward_ck() C++
 interface consistent with the updating of xformers/ops/fmha API

---
 xformers/csrc/attention/attention.cpp                     | 8 --------
 .../csrc/attention/hip_fmha/attention_forward_generic.cpp | 6 +++++-
 2 files changed, 5 insertions(+), 9 deletions(-)

diff --git a/xformers/csrc/attention/attention.cpp b/xformers/csrc/attention/attention.cpp
index 3989ebd29..73ee37ea6 100644
--- a/xformers/csrc/attention/attention.cpp
+++ b/xformers/csrc/attention/attention.cpp
@@ -25,19 +25,11 @@ TORCH_LIBRARY_FRAGMENT(xformers, m) {
       "xformers::_cutlass_rand_uniform(float p, Tensor out) -> Tensor"));
 #endif
 #if defined(USE_ROCM)
-#if defined(USE_CK_TILED_KERNEL)
   m.def(TORCH_SELECTIVE_SCHEMA(
       "xformers::efficient_attention_forward_ck(Tensor query, "
           "Tensor key, Tensor value, Tensor? attn_bias, Tensor? seqstart_q, "
           "Tensor? seqstart_k, int? max_seqlen_q, float dropout_p, "
           "bool compute_logsumexp, int custom_mask_type, float? scale, Tensor? seqlen_k, int? window_size) -> (Tensor, Tensor, int, int)"));
-#else
-  m.def(TORCH_SELECTIVE_SCHEMA(
-      "xformers::efficient_attention_forward_ck(Tensor query, "
-          "Tensor key, Tensor value, Tensor? attn_bias, Tensor? seqstart_q, "
-          "Tensor? seqstart_k, int? max_seqlen_q, float dropout_p, "
-          "bool compute_logsumexp, int custom_mask_type, float? scale, Tensor? seqlen_k) -> (Tensor, Tensor, int, int)"));
-#endif
   m.def(TORCH_SELECTIVE_SCHEMA(
       "xformers::efficient_attention_forward_decoder_ck(Tensor query, "
           "Tensor key, Tensor value, Tensor? seq_positions, float scale) -> Tensor"));
diff --git a/xformers/csrc/attention/hip_fmha/attention_forward_generic.cpp b/xformers/csrc/attention/hip_fmha/attention_forward_generic.cpp
index 244e134a4..c4bbc72eb 100644
--- a/xformers/csrc/attention/hip_fmha/attention_forward_generic.cpp
+++ b/xformers/csrc/attention/hip_fmha/attention_forward_generic.cpp
@@ -6,6 +6,7 @@
  */
 #include <cmath>
 #include <mutex>
+#include <tuple>
 
 #include <ATen/Context.h>
 #include <ATen/ScalarOps.h>
@@ -57,8 +58,11 @@ std::tuple<at::Tensor, at::Tensor, int64_t, int64_t> efficient_attention_forward
     bool compute_logsumexp,
     int64_t custom_mask_type,
     c10::optional<double> scale,
-    const c10::optional<at::Tensor>& seqlen_k)
+    const c10::optional<at::Tensor>& seqlen_k,
+    const c10::optional<int64_t> window_size)
 {
+    std::ignore = window_size;
+
     TORCH_CHECK(query.dim() == 4);
     TORCH_CHECK(key.dim() == 4);
     TORCH_CHECK(value.dim() == 4);

From 5421612bfaf382f1c30ce8cd6c2b7af00a948f1a Mon Sep 17 00:00:00 2001
From: Qianfeng Zhang <Qianfeng.Zhang@amd.com>
Date: Wed, 10 Jan 2024 18:03:24 +0000
Subject: [PATCH 342/641] Tiny fix in ck.py to make test_backward pass

---
 xformers/ops/fmha/ck.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/xformers/ops/fmha/ck.py b/xformers/ops/fmha/ck.py
index 67e71ccd6..200f6a41b 100644
--- a/xformers/ops/fmha/ck.py
+++ b/xformers/ops/fmha/ck.py
@@ -370,7 +370,7 @@ class BwOp(AttentionBwOpBase):
         type(None),
         torch.Tensor,
         LowerTriangularMask,
-        LowerTriangularFromBottomRightMask,
+        ##LowerTriangularFromBottomRightMask,
         # TODO: Still some infs/nans in the BW pass for
         # local + causal
         # LowerTriangularFromBottomRightLocalAttentionMask,
@@ -379,7 +379,7 @@ class BwOp(AttentionBwOpBase):
         BlockDiagonalMask,
         BlockDiagonalCausalMask,
         attn_bias.BlockDiagonalCausalFromBottomRightMask,
-        attn_bias.BlockDiagonalCausalLocalAttentionMask,
+        ##attn_bias.BlockDiagonalCausalLocalAttentionMask,
     }
     SUPPORTS_ATTN_BIAS_GRAD = True
     SUPPORTS_DROPOUT = FwOp.SUPPORTS_DROPOUT
@@ -431,7 +431,7 @@ def apply(cls, ctx: Context, inp: Inputs, grad: torch.Tensor) -> Gradients:
         if type(inp.attn_bias) not in BwOp.SUPPORTED_ATTN_BIAS_TYPES:
             raise NotImplementedError("Unsupported attn_bias type")
 
-        seqstart_k, seqstart_q, max_seqlen_q, max_seqlen_k = _get_seqlen_info(inp)
+        seqstart_k, seqstart_q, max_seqlen_q = _get_seqlen_info(inp)
         dtype = inp.query.dtype
 
         rng_seed = rng_offset = 0

From 7948fe6674af2cf3c9a44bd01cc404b0afe7fc96 Mon Sep 17 00:00:00 2001
From: Max Podkorytov <4273004+tenpercent@users.noreply.github.com>
Date: Thu, 11 Jan 2024 00:09:09 +0000
Subject: [PATCH 343/641] some refactorings for standalone tests

---
 .../hip_fmha/attention_forward_splitk.cpp     | 56 +++++++++----------
 1 file changed, 27 insertions(+), 29 deletions(-)

diff --git a/xformers/csrc/attention/hip_fmha/attention_forward_splitk.cpp b/xformers/csrc/attention/hip_fmha/attention_forward_splitk.cpp
index 61dac9a8b..aa60950de 100644
--- a/xformers/csrc/attention/hip_fmha/attention_forward_splitk.cpp
+++ b/xformers/csrc/attention/hip_fmha/attention_forward_splitk.cpp
@@ -685,10 +685,12 @@ struct FMHADecoderReduceDeviceOp : public BaseOperator
 } // namespace tensor_operation
 } // namespace ck
 
-static std::tuple<at::Tensor, at::Tensor, at::Tensor> split1_attention_hip(const at::Tensor& XQ,
+static std::tuple<at::Tensor, at::Tensor, at::Tensor> split_attention_hip(const at::Tensor& XQ,
                                                                            const at::Tensor& K,
                                                                            const at::Tensor& V,
-                                                                           const at::Tensor& seqlen)
+                                                                           const at::Tensor& seqlen,
+                                                                           const int32_t split_k,
+                                                                           const int32_t wavefronts_per_block)
 {
 
     at::OptionalDeviceGuard guard(XQ.device());
@@ -700,17 +702,15 @@ static std::tuple<at::Tensor, at::Tensor, at::Tensor> split1_attention_hip(const
     auto D = XQ.size(4);
 
     double qk_scale        = 1. / sqrt(D);
-    constexpr auto split_k = 1;
 
     auto O                    = at::empty_like(XQ);
-    constexpr auto splitk_dim = 0;
     constexpr auto rank       = 5;
-    auto split_O              = at::stack(O, splitk_dim);
-    auto split_max            = at::empty({B, M, G, H, split_k}, XQ.options().dtype(at::kFloat));
-    auto split_sumexp         = at::empty_like(split_max);
+    auto split_O              = at::zeros({split_k, B, M, G, H, D}, XQ.options());
+    auto split_max            = at::empty({B, M, G, H, split_k}, XQ.options().dtype(at::kFloat)).fill_(ck::NumericLimits<float>::Lowest());
+    auto split_sumexp         = at::zeros_like(split_max);
 
     dim3 blocks(B * H * M * G, split_k);
-    dim3 threads(kThreadsPerWavefront, kWavefrontsPerBlock);
+    dim3 threads(kThreadsPerWavefront, wavefronts_per_block);
 
     constexpr int32_t KV_M_MAX = 8192;
     constexpr int32_t K_MAX    = 4 * kThreadsPerWavefront;
@@ -775,7 +775,7 @@ static std::tuple<at::Tensor, at::Tensor, at::Tensor> split1_attention_hip(const
             auto invoker = device_op_t::Invoker{};
             (void)invoker.Run(arg, {stream});
         });
-    return std::make_tuple(split_O[splitk_dim], split_max, split_sumexp);
+    return std::make_tuple(split_O, split_max, split_sumexp);
 }
 
 std::tuple<at::Tensor, at::Tensor, at::Tensor, at::Tensor>
@@ -799,33 +799,31 @@ generate_inputs(const int32_t padding,
     auto K           = (G == 1) ? at::randn({B, padding, G, Hkv, D}, options)
                       : at::randn({B, padding, G, 1, D}, options).expand({B, padding, G, Hq, D});
     auto V = at::randn_like(K);
-    // auto seqlen = at::randint(1, padding + 1, {B}, int_options);
-    // auto seqlen = at::tensor({1062}, int_options);
-    auto seqlen = at::tensor({6, 12, 13, 9, 32, 10, 12, 6}, int_options);
+    auto seqlen = at::randint(num_queries, padding + 1, {B}, int_options);
 
     return std::make_tuple(XQ, K, V, seqlen);
 }
 
 static void test_split1_attention()
 {
-    auto [XQ, K, V, seqlen] = generate_inputs(4096, 1, 16, 16);
+    auto [XQ, K, V, seqlen] = generate_inputs(4096, 8, 16, 16);
 
-    auto reference_result = split1_attention_torch(XQ, K, V, seqlen);
+    auto [O_ref, m_ref, l_ref] = split1_attention_torch(XQ, K, V, seqlen);
 
-    auto hip_result = split1_attention_hip(XQ, K, V, seqlen);
+    auto [O_hip, m_hip, l_hip] = split_attention_hip(XQ, K, V, seqlen, /* split_k */ 1, /* wavefronts_per_block */ 1);
 
-    auto O_match_mask = at::isclose(std::get<0>(reference_result),
-                                    std::get<0>(hip_result),
+    auto O_match_mask = at::isclose(O_ref,
+                                    O_hip,
                                     /*atol*/ 1e-3,
                                     /*rtol*/ 1e-5,
                                     /*equal_nan*/ false);
-    auto m_match_mask = at::isclose(std::get<1>(reference_result),
-                                    std::get<1>(hip_result),
+    auto m_match_mask = at::isclose(m_ref,
+                                    m_hip,
                                     /*atol*/ 1e-3,
                                     /*rtol*/ 1e-5,
                                     /*equal_nan*/ false);
-    auto l_match_mask = at::isclose(std::get<2>(reference_result),
-                                    std::get<2>(hip_result),
+    auto l_match_mask = at::isclose(l_ref,
+                                    l_hip,
                                     /*atol*/ 1e-3,
                                     /*rtol*/ 1e-5,
                                     /*equal_nan*/ false);
@@ -839,28 +837,28 @@ static void test_split1_attention()
     printf("Mismatched split_max elements percentage: %.2f\n", 1. - m_percent_match.item<float>());
 
     printf("Mismatched split_sumexp elements percentage: %.2f\n",
-           1. - m_percent_match.item<float>());
+           1. - l_percent_match.item<float>());
 }
 
 static void do_correctness_check()
 {
-    auto [XQ, K, V, seqlen] = generate_inputs(32, 8, 16, 16);
+    auto [XQ, K, V, seqlen] = generate_inputs(4096, 8, 16, 16);
 
     double qk_scale        = 1. / sqrt(XQ.size(-1));
     constexpr auto split_k = 2;
 
-    auto result = efficient_attention_forward_decoder_splitk_ck_impl<64, 1>(
+    auto result = efficient_attention_forward_decoder_splitk_ck_impl</* threads_per_wavefront */ 64, /* wavefronts_per_block */ 1>(
         XQ, K, V, seqlen, qk_scale, split_k);
     auto gold_result = efficient_attention_forward_decoder_split1_torch(XQ, K, V, seqlen, qk_scale);
     auto mask = at::isclose(result, gold_result, /*atol*/ 1e-3, /*rtol*/ 1e-5, /*equal_nan*/ false);
     auto percent_match = at::sum(mask.to(torch::kFloat32)) / mask.numel();
-    auto nan_count     = at::sum(at::isnan(result));
-    auto numel         = result.numel();
-    auto inf_count     = at::sum(at::isinf(result));
+    // auto nan_count     = at::sum(at::isnan(result));
+    // auto numel         = result.numel();
+    // auto inf_count     = at::sum(at::isinf(result));
     printf("Mismatched elements percentage: %.2f\n", 1. - percent_match.item<float>());
     // printf("k_seqlen: %d\n", seqlen.item<int32_t>());
-    std::cout << "numel: " << numel << " nan count: " << nan_count << " inf count: " << inf_count
-              << std::endl;
+    // std::cout << "numel: " << numel << " nan count: " << nan_count << " inf count: " << inf_count
+    //           << std::endl;
     std::cout << "k_seqlen: " << seqlen << std::endl;
 }
 

From e7ffe6897e6ce224abc4a7d2318ef4dbb84926e9 Mon Sep 17 00:00:00 2001
From: Max Podkorytov <4273004+tenpercent@users.noreply.github.com>
Date: Thu, 11 Jan 2024 20:27:04 +0000
Subject: [PATCH 344/641] cleanup testing

---
 .../hip_fmha/attention_forward_splitk.cpp     | 30 +++++++++++--------
 1 file changed, 17 insertions(+), 13 deletions(-)

diff --git a/xformers/csrc/attention/hip_fmha/attention_forward_splitk.cpp b/xformers/csrc/attention/hip_fmha/attention_forward_splitk.cpp
index aa60950de..df9ffdbe4 100644
--- a/xformers/csrc/attention/hip_fmha/attention_forward_splitk.cpp
+++ b/xformers/csrc/attention/hip_fmha/attention_forward_splitk.cpp
@@ -25,7 +25,7 @@ static std::tuple<at::Tensor, at::Tensor, at::Tensor> split1_attention_torch(
     // }
 
     // causal mask
-    auto neg_inf = at::tensor(-99.).item();
+    auto neg_inf = at::tensor(-1001.).item();
     for(size_t b = 0; b < k_seqlens.numel(); ++b)
     {
         auto seqlen = k_seqlens[b].item<int64_t>();
@@ -789,6 +789,8 @@ generate_inputs(const int32_t padding,
     const int32_t G           = Hq / Hkv;
     const int32_t num_queries = 1;
 
+    at::manual_seed(1);
+
     auto options = torch::TensorOptions()
                        .dtype(dtype)
                        .layout(torch::kStrided)
@@ -840,33 +842,35 @@ static void test_split1_attention()
            1. - l_percent_match.item<float>());
 }
 
-static void do_correctness_check()
+static void test_splitk_decoder_e2e_correctness(int32_t padding, int32_t batch_size, int32_t Hq, int32_t Hkv, int32_t split_k)
 {
-    auto [XQ, K, V, seqlen] = generate_inputs(4096, 8, 16, 16);
+    auto [XQ, K, V, seqlen] = generate_inputs(padding, batch_size, Hq, Hkv);
 
     double qk_scale        = 1. / sqrt(XQ.size(-1));
-    constexpr auto split_k = 2;
 
     auto result = efficient_attention_forward_decoder_splitk_ck_impl</* threads_per_wavefront */ 64, /* wavefronts_per_block */ 1>(
         XQ, K, V, seqlen, qk_scale, split_k);
     auto gold_result = efficient_attention_forward_decoder_split1_torch(XQ, K, V, seqlen, qk_scale);
     auto mask = at::isclose(result, gold_result, /*atol*/ 1e-3, /*rtol*/ 1e-5, /*equal_nan*/ false);
     auto percent_match = at::sum(mask.to(torch::kFloat32)) / mask.numel();
-    // auto nan_count     = at::sum(at::isnan(result));
-    // auto numel         = result.numel();
-    // auto inf_count     = at::sum(at::isinf(result));
-    printf("Mismatched elements percentage: %.2f\n", 1. - percent_match.item<float>());
-    // printf("k_seqlen: %d\n", seqlen.item<int32_t>());
-    // std::cout << "numel: " << numel << " nan count: " << nan_count << " inf count: " << inf_count
-    //           << std::endl;
-    std::cout << "k_seqlen: " << seqlen << std::endl;
+    printf("Padding=%d BS=%d Hq=%d Hkv=%d split_k=%d Mismatched elements percentage: %.2f\n", padding, batch_size, Hq, Hkv, split_k, 1. - percent_match.item<float>());
 }
 
 int main(int argc, char** argv)
 {
     if(argc == 1)
     {
-        do_correctness_check();
+        for (auto padding : {32, 4096}) {
+            for (auto batch_size : {1, 8}) {
+                for (auto Hq : { 16 }) {
+                    for (auto Hkv : { 16 }) {
+                        for (auto split_k : {1, 2, 4}) {
+                            test_splitk_decoder_e2e_correctness(padding, batch_size, Hq, Hkv, split_k);
+                        }
+                    }
+                }
+            }
+        }
 
         // test_split1_attention();
     }

From 495310180fbde6acf7cedbc6df249dda7801b091 Mon Sep 17 00:00:00 2001
From: Qianfeng Zhang <Qianfeng.Zhang@amd.com>
Date: Wed, 10 Jan 2024 18:02:28 +0000
Subject: [PATCH 345/641] Make the efficient_attention_forward_ck() C++
 interface consistent with the updating of xformers/ops/fmha API

---
 xformers/csrc/attention/attention.cpp                     | 8 --------
 .../csrc/attention/hip_fmha/attention_forward_generic.cpp | 6 +++++-
 2 files changed, 5 insertions(+), 9 deletions(-)

diff --git a/xformers/csrc/attention/attention.cpp b/xformers/csrc/attention/attention.cpp
index 3989ebd29..73ee37ea6 100644
--- a/xformers/csrc/attention/attention.cpp
+++ b/xformers/csrc/attention/attention.cpp
@@ -25,19 +25,11 @@ TORCH_LIBRARY_FRAGMENT(xformers, m) {
       "xformers::_cutlass_rand_uniform(float p, Tensor out) -> Tensor"));
 #endif
 #if defined(USE_ROCM)
-#if defined(USE_CK_TILED_KERNEL)
   m.def(TORCH_SELECTIVE_SCHEMA(
       "xformers::efficient_attention_forward_ck(Tensor query, "
           "Tensor key, Tensor value, Tensor? attn_bias, Tensor? seqstart_q, "
           "Tensor? seqstart_k, int? max_seqlen_q, float dropout_p, "
           "bool compute_logsumexp, int custom_mask_type, float? scale, Tensor? seqlen_k, int? window_size) -> (Tensor, Tensor, int, int)"));
-#else
-  m.def(TORCH_SELECTIVE_SCHEMA(
-      "xformers::efficient_attention_forward_ck(Tensor query, "
-          "Tensor key, Tensor value, Tensor? attn_bias, Tensor? seqstart_q, "
-          "Tensor? seqstart_k, int? max_seqlen_q, float dropout_p, "
-          "bool compute_logsumexp, int custom_mask_type, float? scale, Tensor? seqlen_k) -> (Tensor, Tensor, int, int)"));
-#endif
   m.def(TORCH_SELECTIVE_SCHEMA(
       "xformers::efficient_attention_forward_decoder_ck(Tensor query, "
           "Tensor key, Tensor value, Tensor? seq_positions, float scale) -> Tensor"));
diff --git a/xformers/csrc/attention/hip_fmha/attention_forward_generic.cpp b/xformers/csrc/attention/hip_fmha/attention_forward_generic.cpp
index 244e134a4..c4bbc72eb 100644
--- a/xformers/csrc/attention/hip_fmha/attention_forward_generic.cpp
+++ b/xformers/csrc/attention/hip_fmha/attention_forward_generic.cpp
@@ -6,6 +6,7 @@
  */
 #include <cmath>
 #include <mutex>
+#include <tuple>
 
 #include <ATen/Context.h>
 #include <ATen/ScalarOps.h>
@@ -57,8 +58,11 @@ std::tuple<at::Tensor, at::Tensor, int64_t, int64_t> efficient_attention_forward
     bool compute_logsumexp,
     int64_t custom_mask_type,
     c10::optional<double> scale,
-    const c10::optional<at::Tensor>& seqlen_k)
+    const c10::optional<at::Tensor>& seqlen_k,
+    const c10::optional<int64_t> window_size)
 {
+    std::ignore = window_size;
+
     TORCH_CHECK(query.dim() == 4);
     TORCH_CHECK(key.dim() == 4);
     TORCH_CHECK(value.dim() == 4);

From e99fc1ac42d5ade8e989b2ebf530c59c062bdf45 Mon Sep 17 00:00:00 2001
From: Qianfeng Zhang <Qianfeng.Zhang@amd.com>
Date: Wed, 10 Jan 2024 18:03:24 +0000
Subject: [PATCH 346/641] Tiny fix in ck.py to make test_backward pass

---
 xformers/ops/fmha/ck.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/xformers/ops/fmha/ck.py b/xformers/ops/fmha/ck.py
index 67e71ccd6..200f6a41b 100644
--- a/xformers/ops/fmha/ck.py
+++ b/xformers/ops/fmha/ck.py
@@ -370,7 +370,7 @@ class BwOp(AttentionBwOpBase):
         type(None),
         torch.Tensor,
         LowerTriangularMask,
-        LowerTriangularFromBottomRightMask,
+        ##LowerTriangularFromBottomRightMask,
         # TODO: Still some infs/nans in the BW pass for
         # local + causal
         # LowerTriangularFromBottomRightLocalAttentionMask,
@@ -379,7 +379,7 @@ class BwOp(AttentionBwOpBase):
         BlockDiagonalMask,
         BlockDiagonalCausalMask,
         attn_bias.BlockDiagonalCausalFromBottomRightMask,
-        attn_bias.BlockDiagonalCausalLocalAttentionMask,
+        ##attn_bias.BlockDiagonalCausalLocalAttentionMask,
     }
     SUPPORTS_ATTN_BIAS_GRAD = True
     SUPPORTS_DROPOUT = FwOp.SUPPORTS_DROPOUT
@@ -431,7 +431,7 @@ def apply(cls, ctx: Context, inp: Inputs, grad: torch.Tensor) -> Gradients:
         if type(inp.attn_bias) not in BwOp.SUPPORTED_ATTN_BIAS_TYPES:
             raise NotImplementedError("Unsupported attn_bias type")
 
-        seqstart_k, seqstart_q, max_seqlen_q, max_seqlen_k = _get_seqlen_info(inp)
+        seqstart_k, seqstart_q, max_seqlen_q = _get_seqlen_info(inp)
         dtype = inp.query.dtype
 
         rng_seed = rng_offset = 0

From d7721d233e87496c39d66f78d4cdc36ba22d3262 Mon Sep 17 00:00:00 2001
From: Max Podkorytov <4273004+tenpercent@users.noreply.github.com>
Date: Thu, 11 Jan 2024 22:06:13 +0000
Subject: [PATCH 347/641] fix split1 attention csrc test

---
 .../hip_fmha/attention_forward_splitk.cpp     | 77 ++++++++++---------
 1 file changed, 39 insertions(+), 38 deletions(-)

diff --git a/xformers/csrc/attention/hip_fmha/attention_forward_splitk.cpp b/xformers/csrc/attention/hip_fmha/attention_forward_splitk.cpp
index df9ffdbe4..cb0101d6e 100644
--- a/xformers/csrc/attention/hip_fmha/attention_forward_splitk.cpp
+++ b/xformers/csrc/attention/hip_fmha/attention_forward_splitk.cpp
@@ -16,42 +16,32 @@ static std::tuple<at::Tensor, at::Tensor, at::Tensor> split1_attention_torch(
     const at::Tensor& Q, const at::Tensor& K, const at::Tensor& V, const at::Tensor& k_seqlens)
 {
     auto Q_scaled = at::div(Q, sqrt(Q.size(-1)));
-    auto S        = at::einsum("mghk, nghk -> mghn",
-                        {Q_scaled.flatten(0, 1), K.flatten(0, 1)},
-                        /* einsum eval path */ at::nullopt);
 
-    // for (size_t i = 0; i < S.dim(); ++i) {
-    //   std::cout << "S.dim" << i << "=" << S.size(i) << std::endl;
-    // }
+    std::vector<at::Tensor> O_batch;
+    std::vector<at::Tensor> m_batch;
+    std::vector<at::Tensor> l_batch;
 
-    // causal mask
-    auto neg_inf = at::tensor(-1001.).item();
-    for(size_t b = 0; b < k_seqlens.numel(); ++b)
-    {
+    for(size_t b = 0; b < k_seqlens.numel(); ++b) {
         auto seqlen = k_seqlens[b].item<int64_t>();
-        at::slice(S[b], /* dim */ -1, /* start */ 0, /* end */ b * K.size(1)).fill_(neg_inf);
-        at::slice(S[b], /* dim */ -1, /* start */ b * K.size(1) + seqlen, /* end */ S.size(-1))
-            .fill_(neg_inf);
-        // std::cout << "batch" << b << " ; masked QK^T dim " << S[b].dim() << " values at h0 " <<
-        // S[b].slice(1, 0, 1) << std::endl;
-    }
 
-    auto m = std::get<0>(at::max(S, /* dim */ -1, /* keepdim */ true));
-    auto s = at::exp(at::sub(S, m));
-
-    // causal mask
-    for(size_t b = 0; b < k_seqlens.numel(); ++b)
-    {
-        auto seqlen = k_seqlens[b].item<int64_t>();
-        at::slice(s[b], /* dim */ -1, /* start */ 0, /* end */ b * K.size(1)).zero_();
-        at::slice(s[b], /* dim */ -1, /* start */ b * K.size(1) + seqlen, /* end */ s.size(-1))
-            .zero_();
+        auto S        = at::einsum("mghk, nghk -> mghn",
+                        {Q_scaled[b], at::slice(K[b], /*dim*/ 0, /*start*/ 0, /*end*/ seqlen)},
+                        /* einsum eval path */ at::nullopt);
+        auto m = std::get<0>(at::max(S, /* dim */ -1, /* keepdim */ true));
+        auto s = at::exp(at::sub(S, m));
+        auto l = at::sum(s, /* dim */ -1, /* keepdim */ true);
+        auto O =
+        at::einsum("mghn, nghk -> mghk", {s, at::slice(V[b], /*dim*/ 0, /*start*/ 0, /*end*/ seqlen)}, /* einsum eval path */ at::nullopt);
+        O_batch.push_back(O);
+        m_batch.push_back(m);
+        l_batch.push_back(l);
     }
 
-    auto l = at::sum(s, /* dim */ -1, /* keepdim */ true);
-    auto O =
-        at::einsum("mghn, nghk -> mghk", {s, V.flatten(0, 1)}, /* einsum eval path */ at::nullopt);
-    return std::make_tuple(O, m, l);
+    auto O_cat = at::stack(O_batch);
+    auto m_cat = at::stack(m_batch);
+    auto l_cat = at::stack(l_batch);
+
+    return std::make_tuple(O_cat, m_cat, l_cat);
 }
 
 static at::Tensor
@@ -806,9 +796,9 @@ generate_inputs(const int32_t padding,
     return std::make_tuple(XQ, K, V, seqlen);
 }
 
-static void test_split1_attention()
+static void test_split1_attention(int32_t padding, int32_t batch_size, int32_t Hq, int32_t Hkv)
 {
-    auto [XQ, K, V, seqlen] = generate_inputs(4096, 8, 16, 16);
+    auto [XQ, K, V, seqlen] = generate_inputs(padding, batch_size, Hq, Hkv);
 
     auto [O_ref, m_ref, l_ref] = split1_attention_torch(XQ, K, V, seqlen);
 
@@ -834,12 +824,15 @@ static void test_split1_attention()
     auto m_percent_match = at::sum(m_match_mask.to(torch::kFloat32)) / m_match_mask.numel();
     auto l_percent_match = at::sum(l_match_mask.to(torch::kFloat32)) / l_match_mask.numel();
 
-    printf("Mismatched split_O elements percentage: %.2f\n", 1. - O_percent_match.item<float>());
+    printf("Padding=%d BS=%d Hq=%d Hkv=%d Mismatched split_O elements percentage: %.2f Mismatched split_max elements percentage: %.2f Mismatched split_sumexp elements percentage: %.2f\n", 
+            padding,
+            batch_size,
+            Hq,
+            Hkv,
+            1. - O_percent_match.item<float>(),
+            1. - m_percent_match.item<float>(),
+            1. - l_percent_match.item<float>());
 
-    printf("Mismatched split_max elements percentage: %.2f\n", 1. - m_percent_match.item<float>());
-
-    printf("Mismatched split_sumexp elements percentage: %.2f\n",
-           1. - l_percent_match.item<float>());
 }
 
 static void test_splitk_decoder_e2e_correctness(int32_t padding, int32_t batch_size, int32_t Hq, int32_t Hkv, int32_t split_k)
@@ -872,7 +865,15 @@ int main(int argc, char** argv)
             }
         }
 
-        // test_split1_attention();
+        for (auto padding : {32, 4096}) {
+            for (auto batch_size : {1, 8}) {
+                for (auto Hq : { 16 }) {
+                    for (auto Hkv : { 16 }) {
+                        test_split1_attention(padding, batch_size, Hq, Hkv);
+                    }
+                }
+            }
+        }
     }
     else
     {

From 902910a1bf85e3bf26f8735d59c3ba75e0d16c79 Mon Sep 17 00:00:00 2001
From: Qianfeng Zhang <Qianfeng.Zhang@amd.com>
Date: Fri, 12 Jan 2024 16:02:57 +0000
Subject: [PATCH 348/641] Enable support of flexible head-dim size (but <= 128)
 for ck-tiled fmha forward

---
 tests/test_forward_ck_tiled.py                |   7 +-
 third_party/composable_kernel_tiled           |   2 +-
 .../hip_fmha/ck_tiled_fmha_batched_infer.h    | 191 +++++++-----------
 .../hip_fmha/ck_tiled_fmha_definitions.h      |  87 ++++++--
 .../hip_fmha/ck_tiled_fmha_forward_kernel.h   |  89 +++++---
 .../ck_tiled_fmha_fwd_tile_partitioner.h      |   2 +-
 .../hip_fmha/ck_tiled_fmha_grouped_infer.h    | 143 ++++++-------
 7 files changed, 286 insertions(+), 235 deletions(-)

diff --git a/tests/test_forward_ck_tiled.py b/tests/test_forward_ck_tiled.py
index a0685d88e..e76f52e09 100644
--- a/tests/test_forward_ck_tiled.py
+++ b/tests/test_forward_ck_tiled.py
@@ -437,11 +437,8 @@ def test_forward(opFW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv, packed, fmt, **kwargs)
         kv,
     ) = opFW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv
 
-    if not (k == kv and (kv == 64 or kv == 128)):
-        pytest.skip("only head-dim size 64 or 128 supported by ck-tiled!")
-
-    if kv > 128:
-        pytest.skip("kv > 128 is not supported by CK-FlashAttention")
+    if k > 128 or kv > 128:
+        pytest.skip("k or kv bigger than 128 is not supported by CK-FlashAttention")
 
     if packed and not (k == kv and q_len == kv_len):
         pytest.skip(
diff --git a/third_party/composable_kernel_tiled b/third_party/composable_kernel_tiled
index 539f9677e..cd4c0600f 160000
--- a/third_party/composable_kernel_tiled
+++ b/third_party/composable_kernel_tiled
@@ -1 +1 @@
-Subproject commit 539f9677e047da576f67810f7833dd983df3c1f8
+Subproject commit cd4c0600f37288f09736d910378efeb18a8c4142
diff --git a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_infer.h b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_infer.h
index 2ea3d4f50..61786c50d 100644
--- a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_infer.h
+++ b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_infer.h
@@ -38,73 +38,51 @@
 template <typename scalar_t, bool has_causal_mask, bool has_attn_bias>
 struct batched_infer_causalmask_attnbias_dispatched
 {
-    using QDataType           = scalar_t;
-    using KDataType           = scalar_t;
-    using VDataType           = scalar_t;
-    using BiasDataType        = scalar_t;
-    using SaccDataType        = float;    // data type for first gemm accumulation
-    using SMPLComputeDataType = float;    // data type for reduction, softmax
-    using PDataType           = scalar_t; // data type for A matrix of second gemm
-    using OaccDataType        = float;    // data type for second gemm accumulation
-    using ODataType           = scalar_t;
-
-    using VLayout = ck::tensor_layout::gemm::RowMajor;
-
-    using FmhaBlockTileHdim64  = ck::Sequence<128, 64, 32, 64, 32, 64>;
-    using FmhaBlockTileHdim128 = ck::Sequence<128, 128, 32, 128, 32, 128>;
-    using FmhaBlockWarps       = ck::Sequence<4, 1, 1>;
-    using FmhaWarpTile         = ck::Sequence<32, 32, 16>;
-    using FmhaShapeHDim64      = ck::tile_program::TileFmhaShape<FmhaBlockTileHdim64,
-                                                            FmhaBlockWarps,
-                                                            FmhaWarpTile,
-                                                            FmhaBlockWarps,
-                                                            FmhaWarpTile,
-                                                            VLayout>;
-    using FmhaShapeHDim128     = ck::tile_program::TileFmhaShape<FmhaBlockTileHdim128,
-                                                             FmhaBlockWarps,
-                                                             FmhaWarpTile,
-                                                             FmhaBlockWarps,
-                                                             FmhaWarpTile,
-                                                             VLayout>;
-
-    using FmhaEpilogue = FmhaFwdEpilogue<FmhaFwdEpilogueProblem<OaccDataType, ODataType>>;
+    using FmhaEpilogue =
+        FmhaFwdEpilogue<FmhaFwdEpilogueProblem<typename FmhaFwdTypeConfig<scalar_t>::OaccDataType,
+                                               typename FmhaFwdTypeConfig<scalar_t>::ODataType>>;
 
 #ifndef BATCHED_INFER_HEADDIM_SWITCH
-#define BATCHED_INFER_HEADDIM_SWITCH(HEAD_DIM1, HEAD_DIM2, ...)        \
-    [&] {                                                              \
-        if(HEAD_DIM1 == HEAD_DIM2 && HEAD_DIM2 == 64)                  \
-        {                                                              \
-            using FmhaShape = FmhaShapeHDim64;                         \
-            __VA_ARGS__();                                             \
-        }                                                              \
-        else if(HEAD_DIM1 == HEAD_DIM2 && HEAD_DIM2 == 128)            \
-        {                                                              \
-            using FmhaShape = FmhaShapeHDim128;                        \
-            __VA_ARGS__();                                             \
-        }                                                              \
-        else                                                           \
-        {                                                              \
-            throw std::runtime_error("Head-dim sizes not supported!"); \
-        }                                                              \
+#define BATCHED_INFER_HEADDIM_SWITCH(HEAD_DIM1, HEAD_DIM2, CONST_NAME, ...) \
+    [&] {                                                                   \
+        if(HEAD_DIM1 <= 32 && HEAD_DIM2 <= 32)                              \
+        {                                                                   \
+            constexpr ck::index_t CONST_NAME = 32;                          \
+            __VA_ARGS__();                                                  \
+        }                                                                   \
+        else if(HEAD_DIM1 <= 64 && HEAD_DIM2 <= 64)                         \
+        {                                                                   \
+            constexpr ck::index_t CONST_NAME = 64;                          \
+            __VA_ARGS__();                                                  \
+        }                                                                   \
+        else if(HEAD_DIM1 <= 128 && HEAD_DIM2 <= 128)                       \
+        {                                                                   \
+            constexpr ck::index_t CONST_NAME = 128;                         \
+            __VA_ARGS__();                                                  \
+        }                                                                   \
+        else                                                                \
+        {                                                                   \
+            throw std::runtime_error("Head-dim sizes not supported!");      \
+        }                                                                   \
     }()
 #endif
 
-    template <typename FmhaTraits, typename FmhaShape, typename FmhaMask>
-    using FmhaPipelineProblemTemp =
-        ck::tile_program::block::BlockFmhaPipelineProblem<QDataType,
-                                                          KDataType,
-                                                          VDataType,
-                                                          SaccDataType,
-                                                          SMPLComputeDataType,
-                                                          BiasDataType,
-                                                          PDataType,
-                                                          OaccDataType,
-                                                          ODataType,
-                                                          256, // BlockSize
-                                                          FmhaShape,
-                                                          false, // kIsGroupMode
-                                                          FmhaMask,
-                                                          FmhaTraits>;
+    template <typename FmhaTraits, ck::index_t HDim, typename FmhaMask>
+    using FmhaPipelineProblemTemp = ck::tile_program::block::BlockFmhaPipelineProblem<
+        typename FmhaFwdTypeConfig<scalar_t>::QDataType,
+        typename FmhaFwdTypeConfig<scalar_t>::KDataType,
+        typename FmhaFwdTypeConfig<scalar_t>::VDataType,
+        typename FmhaFwdTypeConfig<scalar_t>::SaccDataType,
+        typename FmhaFwdTypeConfig<scalar_t>::SMPLComputeDataType,
+        typename FmhaFwdTypeConfig<scalar_t>::BiasDataType,
+        typename FmhaFwdTypeConfig<scalar_t>::PDataType,
+        typename FmhaFwdTypeConfig<scalar_t>::OaccDataType,
+        typename FmhaFwdTypeConfig<scalar_t>::ODataType,
+        HDim == 32 ? 128 : 256, // BlockSize
+        FmhaFwdShape<HDim>,
+        false, // kIsGroupMode
+        FmhaMask,
+        FmhaTraits>;
 
     static void Run(BatchedForwardParams& param, hipStream_t stream)
     {
@@ -116,59 +94,42 @@ struct batched_infer_causalmask_attnbias_dispatched
             using FmhaMask =
                 ck::tile_program::block::GenericAttentionMask<has_masking, USE_LOCAL_ATTENTION>;
 
-            BATCHED_INFER_HEADDIM_SWITCH(param.K, param.Kv, [&] {
-                using FmhaTilePartitioner = FmhaFwdTilePartitioner<FmhaShape>;
-
-                if(param.M % FmhaShape::kM0 == 0 && param.N % FmhaShape::kN0 == 0)
-                {
-                    using FmhaTraits =
-                        ck::tile_program::TileFmhaTraits<false, false, has_attn_bias>;
-
-                    using FmhaPipelineProblem =
-                        FmhaPipelineProblemTemp<FmhaTraits, FmhaShape, FmhaMask>;
-                    using FmhaPipeline =
-                        ck::tile_program::block::BlockFmhaPipelineQRKSVSAsync<FmhaPipelineProblem>;
-                    using FmhaKernel =
-                        FmhaFwdKernel<FmhaTilePartitioner, FmhaPipeline, FmhaEpilogue>;
-
-                    RunWithKernel<FmhaKernel>(param, stream);
-                }
-                else if(param.M % FmhaShape::kM0 == 0 && param.N % FmhaShape::kN0 != 0)
-                {
-                    using FmhaTraits = ck::tile_program::TileFmhaTraits<false, true, has_attn_bias>;
-                    using FmhaPipelineProblem =
-                        FmhaPipelineProblemTemp<FmhaTraits, FmhaShape, FmhaMask>;
-                    using FmhaPipeline =
-                        ck::tile_program::block::BlockFmhaPipelineQRKSVS<FmhaPipelineProblem>;
-                    using FmhaKernel =
-                        FmhaFwdKernel<FmhaTilePartitioner, FmhaPipeline, FmhaEpilogue>;
-
-                    RunWithKernel<FmhaKernel>(param, stream);
-                }
-                else if(param.M % FmhaShape::kM0 != 0 && param.N % FmhaShape::kN0 == 0)
-                {
-                    using FmhaTraits = ck::tile_program::TileFmhaTraits<true, false, has_attn_bias>;
-                    using FmhaPipelineProblem =
-                        FmhaPipelineProblemTemp<FmhaTraits, FmhaShape, FmhaMask>;
-                    using FmhaPipeline =
-                        ck::tile_program::block::BlockFmhaPipelineQRKSVS<FmhaPipelineProblem>;
-                    using FmhaKernel =
-                        FmhaFwdKernel<FmhaTilePartitioner, FmhaPipeline, FmhaEpilogue>;
-
-                    RunWithKernel<FmhaKernel>(param, stream);
-                }
-                else if(param.M % FmhaShape::kM0 != 0 && param.N % FmhaShape::kN0 != 0)
-                {
-                    using FmhaTraits = ck::tile_program::TileFmhaTraits<true, true, has_attn_bias>;
-                    using FmhaPipelineProblem =
-                        FmhaPipelineProblemTemp<FmhaTraits, FmhaShape, FmhaMask>;
-                    using FmhaPipeline =
-                        ck::tile_program::block::BlockFmhaPipelineQRKSVS<FmhaPipelineProblem>;
-                    using FmhaKernel =
-                        FmhaFwdKernel<FmhaTilePartitioner, FmhaPipeline, FmhaEpilogue>;
-
-                    RunWithKernel<FmhaKernel>(param, stream);
-                };
+            BATCHED_INFER_HEADDIM_SWITCH(param.K, param.Kv, HDim, [&] {
+                using FmhaShape                 = FmhaFwdShape<HDim>;
+                using FmhaTilePartitioner       = FmhaFwdTilePartitioner<FmhaShape>;
+                constexpr ck::index_t occupancy = (HDim == 64) ? 3 : 2;
+
+                bool m0_need_padding   = !(param.M % FmhaShape::kM0 == 0);
+                bool n0k1_need_padding = !(param.N % FmhaShape::kN0 == 0);
+
+                // ToDO: current pipelines all assume kQLoadOnce, which read whole k0
+                // (kK0BlockLength)
+                bool k0n1_need_padding =
+                    !(param.K % FmhaShape::kK0BlockLength == 0 && param.Kv % FmhaShape::kN1 == 0);
+
+                BOOL_SWITCH_3(
+                    m0_need_padding,
+                    kM0NeedPadding,
+                    n0k1_need_padding,
+                    kN0K1NeedPadding,
+                    k0n1_need_padding,
+                    kK0N1NeedPadding,
+                    [&] {
+                        using FmhaTraits = ck::tile_program::TileFmhaTraits<kM0NeedPadding,
+                                                                            kN0K1NeedPadding,
+                                                                            kK0N1NeedPadding,
+                                                                            has_attn_bias,
+                                                                            occupancy>;
+
+                        using FmhaPipelineProblem =
+                            FmhaPipelineProblemTemp<FmhaTraits, HDim, FmhaMask>;
+                        using FmhaPipeline =
+                            ck::tile_program::block::BlockFmhaPipelineQRKSVS<FmhaPipelineProblem>;
+                        using FmhaKernel =
+                            FmhaFwdKernel<FmhaTilePartitioner, FmhaPipeline, FmhaEpilogue>;
+
+                        RunWithKernel<FmhaKernel>(param, stream);
+                    });
             });
         });
     };
diff --git a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_definitions.h b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_definitions.h
index edaf8a308..0129ac082 100644
--- a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_definitions.h
+++ b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_definitions.h
@@ -6,8 +6,6 @@
  */
 #pragma once
 
-//#include <ck/tile_program/block_tile/block_masking_specialization.hpp>
-
 enum struct CausalMaskType
 {
     MaskDisabled,
@@ -15,25 +13,90 @@ enum struct CausalMaskType
     MaskUpperTriangleFromBottomRight
 };
 
-/*
-template <CausalMaskType type>
-struct CausalMaskPredicate;
+template <typename DataType>
+struct FmhaFwdTypeConfig;
+
+template <>
+struct FmhaFwdTypeConfig<ck::half_t>
+{
+    using QDataType           = ck::half_t;
+    using KDataType           = ck::half_t;
+    using VDataType           = ck::half_t;
+    using BiasDataType        = ck::half_t;
+    using SaccDataType        = float;      // data type for first gemm accumulation
+    using SMPLComputeDataType = float;      // data type for reduction, softmax
+    using PDataType           = ck::half_t; // data type for A matrix of second gemm
+    using OaccDataType        = float;      // data type for second gemm accumulation
+    using ODataType           = ck::half_t;
+};
+
+template <>
+struct FmhaFwdTypeConfig<ck::bhalf_t>
+{
+    using QDataType           = ck::bhalf_t;
+    using KDataType           = ck::bhalf_t;
+    using VDataType           = ck::bhalf_t;
+    using BiasDataType        = ck::bhalf_t;
+    using SaccDataType        = float;       // data type for first gemm accumulation
+    using SMPLComputeDataType = float;       // data type for reduction, softmax
+    using PDataType           = ck::bhalf_t; // data type for A matrix of second gemm
+    using OaccDataType        = float;       // data type for second gemm accumulation
+    using ODataType           = ck::bhalf_t;
+};
+
+using FmhaFwdVLayout = ck::tensor_layout::gemm::RowMajor;
+
+template <ck::index_t HDim>
+struct FmhaFwdBlockTile;
+
+template <>
+struct FmhaFwdBlockTile<32>
+{
+    using type = ck::Sequence<128, 64, 16, 32, 32, 32>;
+};
+template <>
+struct FmhaFwdBlockTile<64>
+{
+    using type = ck::Sequence<128, 64, 32, 64, 32, 64>;
+};
+template <>
+struct FmhaFwdBlockTile<128>
+{
+    using type = ck::Sequence<128, 128, 32, 128, 32, 128>;
+};
+
+using FmhaFwdBlockWarps = ck::Sequence<4, 1, 1>;
+using FmhaFwdWarpTile   = ck::Sequence<32, 32, 16>;
+
+template <ck::index_t HDim>
+struct FmhaFwdShape;
 
 template <>
-struct CausalMaskPredicate<CausalMaskType::MaskDisabled>
+struct FmhaFwdShape<32> : ck::tile_program::TileFmhaShape<typename FmhaFwdBlockTile<32>::type,
+                                                          ck::Sequence<2, 1, 1>,
+                                                          FmhaFwdWarpTile,
+                                                          ck::Sequence<2, 1, 1>,
+                                                          FmhaFwdWarpTile,
+                                                          FmhaFwdVLayout>
 {
-    using predicate = ck::tile_program::block::MaskDisabledPredicate;
 };
 
 template <>
-struct CausalMaskPredicate<CausalMaskType::MaskUpperTriangleFromTopLeft>
+struct FmhaFwdShape<64> : ck::tile_program::TileFmhaShape<typename FmhaFwdBlockTile<64>::type,
+                                                          FmhaFwdBlockWarps,
+                                                          FmhaFwdWarpTile,
+                                                          FmhaFwdBlockWarps,
+                                                          FmhaFwdWarpTile,
+                                                          FmhaFwdVLayout>
 {
-    using predicate = ck::tile_program::block::MaskUpperTriangleFromTopLeftPredicate;
 };
 
 template <>
-struct CausalMaskPredicate<CausalMaskType::MaskUpperTriangleFromBottomRight>
+struct FmhaFwdShape<128> : ck::tile_program::TileFmhaShape<typename FmhaFwdBlockTile<128>::type,
+                                                           FmhaFwdBlockWarps,
+                                                           FmhaFwdWarpTile,
+                                                           FmhaFwdBlockWarps,
+                                                           FmhaFwdWarpTile,
+                                                           FmhaFwdVLayout>
 {
-    using predicate = ck::tile_program::block::MaskUpperTriangleFromBottomRightPredicate;
 };
-*/
diff --git a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_forward_kernel.h b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_forward_kernel.h
index 856e64651..a248f3525 100644
--- a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_forward_kernel.h
+++ b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_forward_kernel.h
@@ -41,6 +41,7 @@ struct FmhaFwdKernel
     static constexpr bool kIsGroupMode     = FmhaPipeline::kIsGroupMode;
     static constexpr bool kM0NeedPadding   = FmhaPipeline::kM0NeedPadding;
     static constexpr bool kN0K1NeedPadding = FmhaPipeline::kN0K1NeedPadding;
+    static constexpr bool kK0N1NeedPadding = FmhaPipeline::kK0N1NeedPadding;
     static constexpr bool kHasBias         = FmhaPipeline::kHasBias;
     using FmhaMask                         = ck::remove_cvref_t<typename FmhaPipeline::FmhaMask>;
     static constexpr bool kHasMask         = FmhaMask::IsMasking;
@@ -389,10 +390,20 @@ struct FmhaFwdKernel
                 make_tuple(kargs.stride_q, 1),
                 Number<32>{},
                 Number<1>{});
-
-            return pad_tensor_view(q_dram_naive,
-                                   make_tuple(Number<FmhaPipeline::kM0>{}, Number<1>{}),
-                                   Sequence<kM0NeedPadding, false>{});
+            if constexpr(FmhaPipeline::kQLoadOnce)
+            {
+                return pad_tensor_view(
+                    q_dram_naive,
+                    make_tuple(Number<FmhaPipeline::kM0>{}, Number<FmhaPipeline::kK0BlockLength>{}),
+                    Sequence<kM0NeedPadding, kK0N1NeedPadding>{});
+            }
+            else
+            {
+                return pad_tensor_view(
+                    q_dram_naive,
+                    make_tuple(Number<FmhaPipeline::kM0>{}, Number<FmhaPipeline::kK0>{}),
+                    Sequence<kM0NeedPadding, kK0N1NeedPadding>{});
+            }
         }();
         const auto k_dram = [&]() {
             const auto k_dram_naive = make_naive_tensor_view<AddressSpaceEnum::Global>(
@@ -402,9 +413,10 @@ struct FmhaFwdKernel
                 Number<32>{},
                 Number<1>{});
 
-            return pad_tensor_view(k_dram_naive,
-                                   make_tuple(Number<FmhaPipeline::kN0>{}, Number<1>{}),
-                                   Sequence<kN0K1NeedPadding, false>{});
+            return pad_tensor_view(
+                k_dram_naive,
+                make_tuple(Number<FmhaPipeline::kN0>{}, Number<FmhaPipeline::kK0>{}),
+                Sequence<kN0K1NeedPadding, kK0N1NeedPadding>{});
         }();
         const auto v_dram = [&]() {
             if constexpr(ck::is_same_v<VLayout, ck::tensor_layout::gemm::RowMajor>)
@@ -427,19 +439,44 @@ struct FmhaFwdKernel
                 /// same as
                 ///   v_dram_transposed.GetTensorDescriptor().GetLength(). Replace following
                 ///   if-clause by pad_tensor_view() call after fixing this issue.
-                if constexpr(kN0K1NeedPadding)
+                if constexpr(kK0N1NeedPadding || kN0K1NeedPadding)
                 {
-                    const index_t pad_length =
-                        FmhaPipeline::kK1 *
-                            ck::math::integer_divide_ceil(kargs.seqlen_k, FmhaPipeline::kK1) -
-                        kargs.seqlen_k;
-
-                    return transform_tensor_view(
-                        v_dram_transposed,
-                        make_tuple(make_pass_through_transform(kargs.hdim_v),
-                                   make_right_pad_transform(kargs.seqlen_k, pad_length)),
-                        make_tuple(Sequence<0>{}, Sequence<1>{}),
-                        make_tuple(Sequence<0>{}, Sequence<1>{}));
+                    const auto transform_n1 = [&] {
+                        if constexpr(kK0N1NeedPadding)
+                        {
+                            const index_t n1_pad_length =
+                                FmhaPipeline::kN1 *
+                                    ck::math::integer_divide_ceil(kargs.hdim_v, FmhaPipeline::kN1) -
+                                kargs.hdim_v;
+
+                            return make_right_pad_transform(kargs.hdim_v, n1_pad_length);
+                        }
+                        else
+                        {
+                            return make_pass_through_transform(kargs.hdim_v);
+                        }
+                    }();
+
+                    const auto transform_k1 = [&] {
+                        if constexpr(kN0K1NeedPadding)
+                        {
+                            const index_t k1_pad_length =
+                                FmhaPipeline::kK1 * ck::math::integer_divide_ceil(
+                                                        kargs.seqlen_k, FmhaPipeline::kK1) -
+                                kargs.seqlen_k;
+
+                            return make_right_pad_transform(kargs.seqlen_k, k1_pad_length);
+                        }
+                        else
+                        {
+                            return make_pass_through_transform(kargs.seqlen_k);
+                        }
+                    }();
+
+                    return transform_tensor_view(v_dram_transposed,
+                                                 make_tuple(transform_n1, transform_k1),
+                                                 make_tuple(Sequence<0>{}, Sequence<1>{}),
+                                                 make_tuple(Sequence<0>{}, Sequence<1>{}));
                 }
                 else
                 {
@@ -455,9 +492,10 @@ struct FmhaFwdKernel
                     Number<32>{},
                     Number<1>{});
 
-                return pad_tensor_view(v_dram_naive,
-                                       make_tuple(Number<1>{}, Number<FmhaPipeline::kK1>{}),
-                                       Sequence<false, kN0K1NeedPadding>{});
+                return pad_tensor_view(
+                    v_dram_naive,
+                    make_tuple(Number<FmhaPipeline::kN1>{}, Number<FmhaPipeline::kK1>{}),
+                    Sequence<kK0N1NeedPadding, kN0K1NeedPadding>{});
             }
         }();
 
@@ -587,9 +625,10 @@ struct FmhaFwdKernel
                 Number<32>{},
                 Number<1>{});
 
-            return pad_tensor_view(o_dram_naive,
-                                   make_tuple(Number<FmhaPipeline::kM0>{}, Number<1>{}),
-                                   Sequence<kM0NeedPadding, false>{});
+            return pad_tensor_view(
+                o_dram_naive,
+                make_tuple(Number<FmhaPipeline::kM0>{}, Number<FmhaPipeline::kN1>{}),
+                Sequence<kM0NeedPadding, kK0N1NeedPadding>{});
         }();
 
         auto o_dram_window =
diff --git a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_fwd_tile_partitioner.h b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_fwd_tile_partitioner.h
index ee385408c..1067eaf7b 100644
--- a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_fwd_tile_partitioner.h
+++ b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_fwd_tile_partitioner.h
@@ -38,7 +38,7 @@ struct FmhaFwdTilePartitioner
         using namespace ck;
 
         // const index_t num_tile_m0 = seqlen_q / kM0;
-        const index_t num_tile_n1 = hdim_v / kN1;
+        const index_t num_tile_n1 = ck::math::integer_divide_ceil(hdim_v, kN1);
 
         const index_t i_block = blockIdx.x;
         const index_t i_nhead = blockIdx.y;
diff --git a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_infer.h b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_infer.h
index 5a026dbc9..bc907c8a7 100644
--- a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_infer.h
+++ b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_infer.h
@@ -38,62 +38,52 @@
 template <typename scalar_t, bool has_causal_mask, bool has_attn_bias>
 struct grouped_infer_causalmask_attnbias_dispatched
 {
-    using QDataType           = scalar_t;
-    using KDataType           = scalar_t;
-    using VDataType           = scalar_t;
-    using BiasDataType        = scalar_t;
-    using SaccDataType        = float;    // data type for first gemm accumulation
-    using SMPLComputeDataType = float;    // data type for reduction, softmax
-    using PDataType           = scalar_t; // data type for A matrix of second gemm
-    using OaccDataType        = float;    // data type for second gemm accumulation
-    using ODataType           = scalar_t;
-
-    using VLayout = ck::tensor_layout::gemm::RowMajor;
-
-    using FmhaBlockTileHdim64  = ck::Sequence<128, 64, 32, 64, 32, 64>;
-    using FmhaBlockTileHdim128 = ck::Sequence<128, 128, 32, 128, 32, 128>;
-    using FmhaBlockWarps       = ck::Sequence<4, 1, 1>;
-    using FmhaWarpTile         = ck::Sequence<32, 32, 16>;
-    using FmhaShapeHDim64      = ck::tile_program::TileFmhaShape<FmhaBlockTileHdim64,
-                                                            FmhaBlockWarps,
-                                                            FmhaWarpTile,
-                                                            FmhaBlockWarps,
-                                                            FmhaWarpTile,
-                                                            VLayout>;
-    using FmhaShapeHDim128     = ck::tile_program::TileFmhaShape<FmhaBlockTileHdim128,
-                                                             FmhaBlockWarps,
-                                                             FmhaWarpTile,
-                                                             FmhaBlockWarps,
-                                                             FmhaWarpTile,
-                                                             VLayout>;
-
-    using FmhaEpilogue = FmhaFwdEpilogue<FmhaFwdEpilogueProblem<OaccDataType, ODataType>>;
-
-    // This is the default setting, the effective setting should be done according to M/N size of
-    // each batch
-    static constexpr bool MNeedPadding = true;
-    static constexpr bool NNeedPadding = true;
+    using FmhaEpilogue =
+        FmhaFwdEpilogue<FmhaFwdEpilogueProblem<typename FmhaFwdTypeConfig<scalar_t>::OaccDataType,
+                                               typename FmhaFwdTypeConfig<scalar_t>::ODataType>>;
 
 #ifndef GROUPED_INFER_HEADDIM_SWITCH
-#define GROUPED_INFER_HEADDIM_SWITCH(HEAD_DIM1, HEAD_DIM2, ...)        \
-    [&] {                                                              \
-        if(HEAD_DIM1 == HEAD_DIM2 && HEAD_DIM2 == 64)                  \
-        {                                                              \
-            using FmhaShape = FmhaShapeHDim64;                         \
-            __VA_ARGS__();                                             \
-        }                                                              \
-        else if(HEAD_DIM1 == HEAD_DIM2 && HEAD_DIM2 == 128)            \
-        {                                                              \
-            using FmhaShape = FmhaShapeHDim128;                        \
-            __VA_ARGS__();                                             \
-        }                                                              \
-        else                                                           \
-        {                                                              \
-            throw std::runtime_error("Head-dim sizes not supported!"); \
-        }                                                              \
+#define GROUPED_INFER_HEADDIM_SWITCH(HEAD_DIM1, HEAD_DIM2, CONST_NAME, ...) \
+    [&] {                                                                   \
+        if(HEAD_DIM1 <= 32 && HEAD_DIM2 <= 32)                              \
+        {                                                                   \
+            constexpr ck::index_t CONST_NAME = 32;                          \
+            __VA_ARGS__();                                                  \
+        }                                                                   \
+        else if(HEAD_DIM1 <= 64 && HEAD_DIM2 <= 64)                         \
+        {                                                                   \
+            constexpr ck::index_t CONST_NAME = 64;                          \
+            __VA_ARGS__();                                                  \
+        }                                                                   \
+        else if(HEAD_DIM1 <= 128 && HEAD_DIM2 <= 128)                       \
+        {                                                                   \
+            constexpr ck::index_t CONST_NAME = 128;                         \
+            __VA_ARGS__();                                                  \
+        }                                                                   \
+        else                                                                \
+        {                                                                   \
+            throw std::runtime_error("Head-dim sizes not supported!");      \
+        }                                                                   \
     }()
 #endif
 
+    template <typename FmhaTraits, ck::index_t HDim, typename FmhaMask>
+    using FmhaPipelineProblemTemp = ck::tile_program::block::BlockFmhaPipelineProblem<
+        typename FmhaFwdTypeConfig<scalar_t>::QDataType,
+        typename FmhaFwdTypeConfig<scalar_t>::KDataType,
+        typename FmhaFwdTypeConfig<scalar_t>::VDataType,
+        typename FmhaFwdTypeConfig<scalar_t>::SaccDataType,
+        typename FmhaFwdTypeConfig<scalar_t>::SMPLComputeDataType,
+        typename FmhaFwdTypeConfig<scalar_t>::BiasDataType,
+        typename FmhaFwdTypeConfig<scalar_t>::PDataType,
+        typename FmhaFwdTypeConfig<scalar_t>::OaccDataType,
+        typename FmhaFwdTypeConfig<scalar_t>::ODataType,
+        HDim == 32 ? 128 : 256, // BlockSize
+        FmhaFwdShape<HDim>,
+        true, // kIsGroupMode
+        FmhaMask,
+        FmhaTraits>;
+
     static void Run(GroupedForwardParams& param, hipStream_t stream)
     {
         const bool has_local_attention = (param.window_size > 0) ? true : false;
@@ -104,31 +94,32 @@ struct grouped_infer_causalmask_attnbias_dispatched
             using FmhaMask =
                 ck::tile_program::block::GenericAttentionMask<has_masking, USE_LOCAL_ATTENTION>;
 
-            GROUPED_INFER_HEADDIM_SWITCH(param.K, param.Kv, [&] {
-                using FmhaTilePartitioner = FmhaFwdTilePartitioner<FmhaShape>;
-                using FmhaTraits = ck::tile_program::TileFmhaTraits<true, true, has_attn_bias>;
-                using FmhaPipelineProblem =
-                    ck::tile_program::block::BlockFmhaPipelineProblem<QDataType,
-                                                                      KDataType,
-                                                                      VDataType,
-                                                                      SaccDataType,
-                                                                      SMPLComputeDataType,
-                                                                      BiasDataType,
-                                                                      PDataType,
-                                                                      OaccDataType,
-                                                                      ODataType,
-                                                                      256, // BlockSize
-                                                                      FmhaShape,
-                                                                      true, // IsGroupMode
-                                                                      FmhaMask,
-                                                                      FmhaTraits>;
-
-                using FmhaPipeline =
-                    ck::tile_program::block::BlockFmhaPipelineQRKSVS<FmhaPipelineProblem>;
-
-                using FmhaKernel = FmhaFwdKernel<FmhaTilePartitioner, FmhaPipeline, FmhaEpilogue>;
-
-                RunWithKernel<FmhaKernel>(param, stream);
+            GROUPED_INFER_HEADDIM_SWITCH(param.K, param.Kv, HDim, [&] {
+                using FmhaShape                 = FmhaFwdShape<HDim>;
+                using FmhaTilePartitioner       = FmhaFwdTilePartitioner<FmhaShape>;
+                constexpr ck::index_t occupancy = (HDim == 64) ? 3 : 2;
+
+                bool k0n1_need_padding =
+                    !(param.K % FmhaShape::kK0BlockLength == 0 && param.Kv % FmhaShape::kN1 == 0);
+
+                constexpr bool kM0NeedPadding   = true;
+                constexpr bool kN0K1NeedPadding = true;
+
+                BOOL_SWITCH(k0n1_need_padding, kK0N1NeedPadding, [&] {
+                    using FmhaTraits = ck::tile_program::TileFmhaTraits<kM0NeedPadding,
+                                                                        kN0K1NeedPadding,
+                                                                        kK0N1NeedPadding,
+                                                                        has_attn_bias,
+                                                                        occupancy>;
+
+                    using FmhaPipelineProblem = FmhaPipelineProblemTemp<FmhaTraits, HDim, FmhaMask>;
+                    using FmhaPipeline =
+                        ck::tile_program::block::BlockFmhaPipelineQRKSVS<FmhaPipelineProblem>;
+                    using FmhaKernel =
+                        FmhaFwdKernel<FmhaTilePartitioner, FmhaPipeline, FmhaEpilogue>;
+
+                    RunWithKernel<FmhaKernel>(param, stream);
+                });
             });
         });
     };

From d1ef4bc8867168f2d60b868ea50b2400a351ae89 Mon Sep 17 00:00:00 2001
From: Qianfeng Zhang <Qianfeng.Zhang@amd.com>
Date: Fri, 12 Jan 2024 17:33:08 +0000
Subject: [PATCH 349/641] Use Async pipeline when no any padding used

---
 .../hip_fmha/ck_tiled_fmha_batched_infer.h    | 27 +++++++++++++++----
 1 file changed, 22 insertions(+), 5 deletions(-)

diff --git a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_infer.h b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_infer.h
index 61786c50d..8131ae37f 100644
--- a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_infer.h
+++ b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_infer.h
@@ -123,12 +123,29 @@ struct batched_infer_causalmask_attnbias_dispatched
 
                         using FmhaPipelineProblem =
                             FmhaPipelineProblemTemp<FmhaTraits, HDim, FmhaMask>;
-                        using FmhaPipeline =
-                            ck::tile_program::block::BlockFmhaPipelineQRKSVS<FmhaPipelineProblem>;
-                        using FmhaKernel =
-                            FmhaFwdKernel<FmhaTilePartitioner, FmhaPipeline, FmhaEpilogue>;
 
-                        RunWithKernel<FmhaKernel>(param, stream);
+                        constexpr bool no_any_padding =
+                            !(kM0NeedPadding || kN0K1NeedPadding || kK0N1NeedPadding);
+
+                        if constexpr(no_any_padding)
+                        {
+                            using FmhaPipeline =
+                                ck::tile_program::block::BlockFmhaPipelineQRKSVSAsync<
+                                    FmhaPipelineProblem>;
+                            using FmhaKernel =
+                                FmhaFwdKernel<FmhaTilePartitioner, FmhaPipeline, FmhaEpilogue>;
+
+                            RunWithKernel<FmhaKernel>(param, stream);
+                        }
+                        else
+                        {
+                            using FmhaPipeline = ck::tile_program::block::BlockFmhaPipelineQRKSVS<
+                                FmhaPipelineProblem>;
+                            using FmhaKernel =
+                                FmhaFwdKernel<FmhaTilePartitioner, FmhaPipeline, FmhaEpilogue>;
+
+                            RunWithKernel<FmhaKernel>(param, stream);
+                        };
                     });
             });
         });

From 6cb0f605cf6ac698d8b31ef0b2c89dabc8fddb66 Mon Sep 17 00:00:00 2001
From: Max Podkorytov <4273004+tenpercent@users.noreply.github.com>
Date: Fri, 12 Jan 2024 20:54:56 +0000
Subject: [PATCH 350/641] implement general split-k split-attention in
 libtorch, use for testing

---
 .../hip_fmha/attention_forward_splitk.cpp     | 84 +++++++++++--------
 1 file changed, 51 insertions(+), 33 deletions(-)

diff --git a/xformers/csrc/attention/hip_fmha/attention_forward_splitk.cpp b/xformers/csrc/attention/hip_fmha/attention_forward_splitk.cpp
index cb0101d6e..cdd46b000 100644
--- a/xformers/csrc/attention/hip_fmha/attention_forward_splitk.cpp
+++ b/xformers/csrc/attention/hip_fmha/attention_forward_splitk.cpp
@@ -12,34 +12,53 @@ constexpr int32_t kWavefrontsPerBlock  = 1;
 constexpr int32_t K_MAX                = 4 * kThreadsPerWavefront;
 } // namespace
 
-static std::tuple<at::Tensor, at::Tensor, at::Tensor> split1_attention_torch(
-    const at::Tensor& Q, const at::Tensor& K, const at::Tensor& V, const at::Tensor& k_seqlens)
+static std::tuple<at::Tensor, at::Tensor, at::Tensor> split_attention_torch(
+    const at::Tensor& Q, const at::Tensor& K, const at::Tensor& V, const at::Tensor& k_seqlens, const int32_t split_k)
 {
     auto Q_scaled = at::div(Q, sqrt(Q.size(-1)));
 
-    std::vector<at::Tensor> O_batch;
-    std::vector<at::Tensor> m_batch;
-    std::vector<at::Tensor> l_batch;
-
-    for(size_t b = 0; b < k_seqlens.numel(); ++b) {
-        auto seqlen = k_seqlens[b].item<int64_t>();
-
-        auto S        = at::einsum("mghk, nghk -> mghn",
-                        {Q_scaled[b], at::slice(K[b], /*dim*/ 0, /*start*/ 0, /*end*/ seqlen)},
-                        /* einsum eval path */ at::nullopt);
-        auto m = std::get<0>(at::max(S, /* dim */ -1, /* keepdim */ true));
-        auto s = at::exp(at::sub(S, m));
-        auto l = at::sum(s, /* dim */ -1, /* keepdim */ true);
-        auto O =
-        at::einsum("mghn, nghk -> mghk", {s, at::slice(V[b], /*dim*/ 0, /*start*/ 0, /*end*/ seqlen)}, /* einsum eval path */ at::nullopt);
-        O_batch.push_back(O);
-        m_batch.push_back(m);
-        l_batch.push_back(l);
+    std::vector<at::Tensor> O_splits;
+    std::vector<at::Tensor> m_splits;
+    std::vector<at::Tensor> l_splits;
+
+    for (int32_t split_idx = 0; split_idx < split_k; ++split_idx) {
+        std::vector<at::Tensor> O_batch;
+        std::vector<at::Tensor> m_batch;
+        std::vector<at::Tensor> l_batch;
+
+        for(size_t b = 0; b < k_seqlens.numel(); ++b) {
+            auto seqlen = k_seqlens[b].item<int64_t>();
+            const int64_t t_low = split_idx * (seqlen / split_k);
+            const int64_t t_high = (split_idx + 1 < split_k) 
+                                 ? (1 + split_idx) * (seqlen / split_k)
+                                 : seqlen;
+
+            auto S        = at::einsum("mghk, nghk -> mghn",
+                            {Q_scaled[b], at::slice(K[b], /*dim*/ 0, /*start*/ t_low, /*end*/ t_high)},
+                            /* einsum eval path */ at::nullopt);
+            auto m = std::get<0>(at::max(S, /* dim */ -1, /* keepdim */ true));
+            auto s = at::exp(at::sub(S, m));
+            auto l = at::sum(s, /* dim */ -1, /* keepdim */ true);
+            auto O        = at::einsum("mghn, nghk -> mghk", 
+                            {s, at::slice(V[b], /*dim*/ 0, /*start*/ t_low, /*end*/ t_high)}, 
+                            /* einsum eval path */ at::nullopt);
+            O_batch.push_back(O);
+            m_batch.push_back(m);
+            l_batch.push_back(l);
+        }
+
+        auto O_cat = at::stack(O_batch);
+        auto m_cat = at::stack(m_batch);
+        auto l_cat = at::stack(l_batch);
+
+        O_splits.push_back(O_cat);
+        m_splits.push_back(m_cat);
+        l_splits.push_back(l_cat);
     }
 
-    auto O_cat = at::stack(O_batch);
-    auto m_cat = at::stack(m_batch);
-    auto l_cat = at::stack(l_batch);
+    auto O_cat = at::stack(O_splits);
+    auto m_cat = at::stack(m_splits);
+    auto l_cat = at::stack(l_splits);
 
     return std::make_tuple(O_cat, m_cat, l_cat);
 }
@@ -235,7 +254,7 @@ at::Tensor efficient_attention_forward_decoder_split1_torch(
     at::optional<at::Tensor> seq_kv_lens, // [B]
     double qk_scale)
 {
-    auto [O_split, m, l] = split1_attention_torch(XQ, cache_K, cache_V, *seq_kv_lens);
+    auto [O_split, m, l] = split_attention_torch(XQ, cache_K, cache_V, *seq_kv_lens, /*split_k*/ 1);
     auto O               = split1_reduce_torch(O_split, m, l);
     return O.reshape_as(XQ);
 }
@@ -248,10 +267,6 @@ at::Tensor efficient_attention_forward_decoder_splitk_ck(
     double qk_scale,
     int64_t split_k)
 {
-
-    // return efficient_attention_forward_decoder_split1_torch(XQ, cache_K, cache_V, seq_kv_lens,
-    // qk_scale);
-
     return efficient_attention_forward_decoder_splitk_ck_impl<kThreadsPerWavefront,
                                                               kWavefrontsPerBlock>(
         XQ, cache_K, cache_V, seq_kv_lens, qk_scale, split_k);
@@ -796,13 +811,13 @@ generate_inputs(const int32_t padding,
     return std::make_tuple(XQ, K, V, seqlen);
 }
 
-static void test_split1_attention(int32_t padding, int32_t batch_size, int32_t Hq, int32_t Hkv)
+static void test_split_attention(int32_t padding, int32_t batch_size, int32_t Hq, int32_t Hkv, int32_t split_k)
 {
     auto [XQ, K, V, seqlen] = generate_inputs(padding, batch_size, Hq, Hkv);
 
-    auto [O_ref, m_ref, l_ref] = split1_attention_torch(XQ, K, V, seqlen);
+    auto [O_ref, m_ref, l_ref] = split_attention_torch(XQ, K, V, seqlen, split_k);
 
-    auto [O_hip, m_hip, l_hip] = split_attention_hip(XQ, K, V, seqlen, /* split_k */ 1, /* wavefronts_per_block */ 1);
+    auto [O_hip, m_hip, l_hip] = split_attention_hip(XQ, K, V, seqlen, split_k, /* wavefronts_per_block */ 1);
 
     auto O_match_mask = at::isclose(O_ref,
                                     O_hip,
@@ -824,11 +839,12 @@ static void test_split1_attention(int32_t padding, int32_t batch_size, int32_t H
     auto m_percent_match = at::sum(m_match_mask.to(torch::kFloat32)) / m_match_mask.numel();
     auto l_percent_match = at::sum(l_match_mask.to(torch::kFloat32)) / l_match_mask.numel();
 
-    printf("Padding=%d BS=%d Hq=%d Hkv=%d Mismatched split_O elements percentage: %.2f Mismatched split_max elements percentage: %.2f Mismatched split_sumexp elements percentage: %.2f\n", 
+    printf("Padding=%d BS=%d Hq=%d Hkv=%d split_k=%d Mismatched split_O elements percentage: %.2f Mismatched split_max elements percentage: %.2f Mismatched split_sumexp elements percentage: %.2f\n", 
             padding,
             batch_size,
             Hq,
             Hkv,
+            split_k,
             1. - O_percent_match.item<float>(),
             1. - m_percent_match.item<float>(),
             1. - l_percent_match.item<float>());
@@ -869,7 +885,9 @@ int main(int argc, char** argv)
             for (auto batch_size : {1, 8}) {
                 for (auto Hq : { 16 }) {
                     for (auto Hkv : { 16 }) {
-                        test_split1_attention(padding, batch_size, Hq, Hkv);
+                        for (auto split_k : {1, 2}) {
+                            test_split_attention(padding, batch_size, Hq, Hkv, split_k);
+                        }
                     }
                 }
             }

From 0e04b174d70e6d3738a62904110367d9eef78f1e Mon Sep 17 00:00:00 2001
From: Max Podkorytov <4273004+tenpercent@users.noreply.github.com>
Date: Fri, 12 Jan 2024 23:37:44 +0000
Subject: [PATCH 351/641] fix split-max and split-sumexp shapes for split
 attention in libtorch

---
 .../csrc/attention/hip_fmha/attention_forward_splitk.cpp    | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/xformers/csrc/attention/hip_fmha/attention_forward_splitk.cpp b/xformers/csrc/attention/hip_fmha/attention_forward_splitk.cpp
index cdd46b000..2859787b2 100644
--- a/xformers/csrc/attention/hip_fmha/attention_forward_splitk.cpp
+++ b/xformers/csrc/attention/hip_fmha/attention_forward_splitk.cpp
@@ -57,8 +57,8 @@ static std::tuple<at::Tensor, at::Tensor, at::Tensor> split_attention_torch(
     }
 
     auto O_cat = at::stack(O_splits);
-    auto m_cat = at::stack(m_splits);
-    auto l_cat = at::stack(l_splits);
+    auto m_cat = at::transpose(at::stack(m_splits), 0, -1);
+    auto l_cat = at::transpose(at::stack(l_splits), 0, -1);
 
     return std::make_tuple(O_cat, m_cat, l_cat);
 }
@@ -66,7 +66,7 @@ static std::tuple<at::Tensor, at::Tensor, at::Tensor> split_attention_torch(
 static at::Tensor
 split1_reduce_torch(const at::Tensor& O_splits, const at::Tensor& m, const at::Tensor& l)
 {
-    return at::div(O_splits, l);
+    return at::div(O_splits, at::transpose(l, 0, -1));
 }
 
 namespace {

From e4d6b886fc30bdbe96bf67d5b1a2dde4f4b0bde7 Mon Sep 17 00:00:00 2001
From: Max Podkorytov <4273004+tenpercent@users.noreply.github.com>
Date: Sat, 13 Jan 2024 00:22:40 +0000
Subject: [PATCH 352/641] implement generic reduce split attention with
 libtorch

---
 .../hip_fmha/attention_forward_splitk.cpp     | 29 ++++++++++++++++---
 1 file changed, 25 insertions(+), 4 deletions(-)

diff --git a/xformers/csrc/attention/hip_fmha/attention_forward_splitk.cpp b/xformers/csrc/attention/hip_fmha/attention_forward_splitk.cpp
index 2859787b2..3a08f145d 100644
--- a/xformers/csrc/attention/hip_fmha/attention_forward_splitk.cpp
+++ b/xformers/csrc/attention/hip_fmha/attention_forward_splitk.cpp
@@ -64,9 +64,30 @@ static std::tuple<at::Tensor, at::Tensor, at::Tensor> split_attention_torch(
 }
 
 static at::Tensor
-split1_reduce_torch(const at::Tensor& O_splits, const at::Tensor& m, const at::Tensor& l)
-{
-    return at::div(O_splits, at::transpose(l, 0, -1));
+split_reduce_torch(const at::Tensor& O_splits, const at::Tensor& m_splits, const at::Tensor& l_splits, int32_t split_k)
+{   
+    auto O = at::zeros_like(at::slice(O_splits, 0, 0, 1));
+    auto m_current_max = at::empty_like(at::slice(m_splits, -1, 0, 1)).fill_(-65535.);
+    auto l_current_sum = at::zeros_like(m_current_max);
+
+    for (int32_t split_idx = 0; split_idx < split_k; ++split_idx) {
+        auto O_slice = at::slice(O_splits, 0, split_idx, split_idx + 1);
+        auto m_slice = at::slice(m_splits, -1, split_idx, split_idx + 1);
+        auto l_slice = at::slice(l_splits, -1, split_idx, split_idx + 1);
+
+        auto m_new = at::max(m_slice, m_current_max);
+
+        auto pick_new = at::less(m_slice, m_current_max);
+        auto pick_our = at::logical_not(pick_new);
+
+        auto log_alpha = at::neg(at::abs(at::sub(m_slice, m_current_max)));
+        auto alpha = at::exp(log_alpha);
+
+        O = at::add(O, at::add(O_slice, at::mul(at::add(at::mul(pick_our, O), at::mul(pick_new, O_slice)), at::sub(alpha, 1))));
+        l_current_sum = at::add(l_current_sum, at::add(l_slice, at::mul(at::add(at::mul(pick_our, l_current_sum), at::mul(pick_new, l_slice)), at::sub(alpha, 1))));
+    }
+    
+    return at::div(O, l_current_sum);
 }
 
 namespace {
@@ -255,7 +276,7 @@ at::Tensor efficient_attention_forward_decoder_split1_torch(
     double qk_scale)
 {
     auto [O_split, m, l] = split_attention_torch(XQ, cache_K, cache_V, *seq_kv_lens, /*split_k*/ 1);
-    auto O               = split1_reduce_torch(O_split, m, l);
+    auto O               = split_reduce_torch(O_split, m, l, /*split_k*/ 1);
     return O.reshape_as(XQ);
 }
 

From 17ec43051cf4504150ea1e864a26b4e466d3c078 Mon Sep 17 00:00:00 2001
From: Max Podkorytov <4273004+tenpercent@users.noreply.github.com>
Date: Sat, 13 Jan 2024 02:14:08 +0000
Subject: [PATCH 353/641] implement testing split reduce hip vs libtorch; tbd
 debug split-k=2 numerical mismatch in this test

---
 .../hip_fmha/attention_forward_splitk.cpp     | 242 +++++++++++-------
 1 file changed, 154 insertions(+), 88 deletions(-)

diff --git a/xformers/csrc/attention/hip_fmha/attention_forward_splitk.cpp b/xformers/csrc/attention/hip_fmha/attention_forward_splitk.cpp
index 3a08f145d..3d106027e 100644
--- a/xformers/csrc/attention/hip_fmha/attention_forward_splitk.cpp
+++ b/xformers/csrc/attention/hip_fmha/attention_forward_splitk.cpp
@@ -338,9 +338,9 @@ namespace tensor_operation {
 namespace device {
 
 template <typename scalar_t, typename compute_t = float>
-struct FMHADecoderSplit1DeviceOp : public BaseOperator
+struct FMHADecoderSplitAttentionDeviceOp : public BaseOperator
 {
-    using DeviceOp = FMHADecoderSplit1DeviceOp;
+    using DeviceOp = FMHADecoderSplitAttentionDeviceOp;
     struct Argument : public BaseArgument
     {
         const scalar_t* __restrict__ XQ;
@@ -548,94 +548,65 @@ struct FMHADecoderSplit1DeviceOp : public BaseOperator
 };
 
 template <typename scalar_t, typename compute_t = float>
-struct FMHADecoderReduceDeviceOp : public BaseOperator
+struct FMHADecoderSplitReduceDeviceOp : public BaseOperator
 {
-    using DeviceOp = FMHADecoderReduceDeviceOp;
+    using DeviceOp = FMHADecoderSplitReduceDeviceOp;
     struct Argument : public BaseArgument
     {
-        const scalar_t* __restrict__ XQ;
-        const scalar_t* __restrict__ cache_K;
-        const scalar_t* __restrict__ cache_V;
+        const scalar_t* __restrict__ split_O;
+        const compute_t* __restrict__ split_max;
+        const compute_t* __restrict__ split_sumexp;
         scalar_t* __restrict__ O;
-        scalar_t* __restrict__ split_O;
-        compute_t* __restrict__ split_max;
-        compute_t* __restrict__ split_sumexp;
-        const int32_t* __restrict__ seq_kv_lens;
-        const ptrdiff_t XQ_stride_b;
-        const ptrdiff_t XQ_stride_m;
-        const ptrdiff_t XQ_stride_g;
-        const ptrdiff_t XQ_stride_h;
-        const ptrdiff_t K_stride_b;
-        const ptrdiff_t K_stride_m;
-        const ptrdiff_t K_stride_g;
-        const ptrdiff_t K_stride_h;
+
+        const int32_t O_size_m;
+        const int32_t O_size_g;
+        const int32_t O_size_h;
+        const int32_t O_size_k;
+
         const ptrdiff_t O_stride_split;
-        const int32_t Q_size_m;
-        const int32_t Q_size_g;
-        const int32_t Q_size_h;
-        const int32_t Q_size_k;
-        const int32_t K_size_m;
-        const bool multiquery;
-        const float qk_scale;
+        const ptrdiff_t O_stride_b;
+        const ptrdiff_t O_stride_m;
+        const ptrdiff_t O_stride_g;
+        const ptrdiff_t O_stride_h;
+
         const int32_t split_k;
 
         const dim3 grid_dim;
         const dim3 block_dim;
         const size_t lds_bytes;
 
-        Argument(const scalar_t* __restrict__ XQ,
-                 const scalar_t* __restrict__ cache_K,
-                 const scalar_t* __restrict__ cache_V,
+        Argument(const scalar_t* __restrict__ split_O,
+                 const compute_t* __restrict__ split_max,
+                 const compute_t* __restrict__ split_sumexp,
                  scalar_t* __restrict__ O,
-                 scalar_t* __restrict__ split_O,
-                 compute_t* __restrict__ split_max,
-                 compute_t* __restrict__ split_sumexp,
-                 const int32_t* __restrict__ seq_kv_lens,
-                 const ptrdiff_t XQ_stride_b,
-                 const ptrdiff_t XQ_stride_m,
-                 const ptrdiff_t XQ_stride_g,
-                 const ptrdiff_t XQ_stride_h,
-                 const ptrdiff_t K_stride_b,
-                 const ptrdiff_t K_stride_m,
-                 const ptrdiff_t K_stride_g,
-                 const ptrdiff_t K_stride_h,
+                 const int32_t O_size_m,
+                 const int32_t O_size_g,
+                 const int32_t O_size_h,
+                 const int32_t O_size_k,
                  const ptrdiff_t O_stride_split,
-                 const int32_t Q_size_m,
-                 const int32_t Q_size_g,
-                 const int32_t Q_size_h,
-                 const int32_t Q_size_k,
-                 const int32_t K_size_m,
-                 const bool multiquery,
-                 const float qk_scale,
+                 const ptrdiff_t O_stride_b,
+                 const ptrdiff_t O_stride_m,
+                 const ptrdiff_t O_stride_g,
+                 const ptrdiff_t O_stride_h,
                  const int32_t split_k,
                  // launch params
                  const dim3 grid_dim,
                  const dim3 block_dim,
                  const size_t lds_bytes)
-            : XQ(XQ),
-              cache_K(cache_K),
-              cache_V(cache_V),
-              O(O),
+            : 
               split_O(split_O),
               split_max(split_max),
               split_sumexp(split_sumexp),
-              seq_kv_lens(seq_kv_lens),
-              XQ_stride_b(XQ_stride_b),
-              XQ_stride_m(XQ_stride_m),
-              XQ_stride_g(XQ_stride_g),
-              XQ_stride_h(XQ_stride_h),
-              K_stride_b(K_stride_b),
-              K_stride_m(K_stride_m),
-              K_stride_g(K_stride_g),
-              K_stride_h(K_stride_h),
+              O(O),              
+              O_size_m(O_size_m),
+              O_size_g(O_size_g),
+              O_size_h(O_size_h),
+              O_size_k(O_size_k),
               O_stride_split(O_stride_split),
-              Q_size_m(Q_size_m),
-              Q_size_g(Q_size_g),
-              Q_size_h(Q_size_h),
-              Q_size_k(Q_size_k),
-              K_size_m(K_size_m),
-              multiquery(multiquery),
-              qk_scale(qk_scale),
+              O_stride_b(O_stride_b),
+              O_stride_m(O_stride_m),
+              O_stride_g(O_stride_g),
+              O_stride_h(O_stride_h),
               split_k(split_k),
               // launch params
               grid_dim(grid_dim),
@@ -652,22 +623,22 @@ struct FMHADecoderReduceDeviceOp : public BaseOperator
         {
             auto threads_per_wavefront = arg.block_dim.x;
 
-            auto Q_size_k_alignment_necessary = 0;
+            auto O_size_k_alignment_necessary = 0;
 
             for(auto vec_size : {4, 2, 1})
             {
-                if(arg.Q_size_k <= vec_size * threads_per_wavefront)
+                if(arg.O_size_k <= vec_size * threads_per_wavefront)
                 {
-                    Q_size_k_alignment_necessary = vec_size;
+                    O_size_k_alignment_necessary = vec_size;
                 }
             }
 
-            if(!Q_size_k_alignment_necessary)
+            if(!O_size_k_alignment_necessary)
             {
                 throw std::runtime_error("Unsupported Q_size_k");
             }
 
-            if(arg.Q_size_k % Q_size_k_alignment_necessary)
+            if(arg.O_size_k % O_size_k_alignment_necessary)
             {
                 throw std::runtime_error("Unsupported alignment for Q_size_k");
             }
@@ -677,11 +648,11 @@ struct FMHADecoderReduceDeviceOp : public BaseOperator
             constexpr int32_t reduce_lds_bytes = 0;
             float reduce_result                = launch_and_time_kernel(
                 stream_config,
-                Q_size_k_alignment_necessary == 4
+                O_size_k_alignment_necessary == 4
                     ? efficient_attention_forward_decoder_splitk_reduce_ck_kernel<scalar_t, 4>
-                    : Q_size_k_alignment_necessary == 2
+                    : O_size_k_alignment_necessary == 2
                           ? efficient_attention_forward_decoder_splitk_reduce_ck_kernel<scalar_t, 2>
-                          : Q_size_k_alignment_necessary == 1
+                          : O_size_k_alignment_necessary == 1
                                 ? efficient_attention_forward_decoder_splitk_reduce_ck_kernel<
                                       scalar_t,
                                       1>
@@ -693,15 +664,15 @@ struct FMHADecoderReduceDeviceOp : public BaseOperator
                 arg.split_max,
                 arg.split_sumexp,
                 arg.O,
-                arg.Q_size_m,
-                arg.Q_size_g,
-                arg.Q_size_h,
-                arg.Q_size_k,
+                arg.O_size_m,
+                arg.O_size_g,
+                arg.O_size_h,
+                arg.O_size_k,
                 arg.O_stride_split,
-                arg.XQ_stride_b,
-                arg.XQ_stride_m,
-                arg.XQ_stride_g,
-                arg.XQ_stride_h,
+                arg.O_stride_b,
+                arg.O_stride_m,
+                arg.O_stride_g,
+                arg.O_stride_h,
                 arg.split_k);
             return reduce_result;
         }
@@ -752,10 +723,10 @@ static std::tuple<at::Tensor, at::Tensor, at::Tensor> split_attention_hip(const
         at::ScalarType::BFloat16,
         at::ScalarType::Float,
         XQ.scalar_type(),
-        "efficient_attention_forward_decoder_split1_ck_test",
+        "efficient_attention_forward_decoder_split_attention_ck_test",
         [&] {
             using ck_data_t   = c10_to_data_t<scalar_t>::type;
-            using device_op_t = ck::tensor_operation::device::FMHADecoderSplit1DeviceOp<ck_data_t>;
+            using device_op_t = ck::tensor_operation::device::FMHADecoderSplitAttentionDeviceOp<ck_data_t>;
             auto op           = device_op_t{};
 
             auto XQ_acc = XQ.packed_accessor32<scalar_t, rank, at::RestrictPtrTraits>();
@@ -804,6 +775,76 @@ static std::tuple<at::Tensor, at::Tensor, at::Tensor> split_attention_hip(const
     return std::make_tuple(split_O, split_max, split_sumexp);
 }
 
+static
+at::Tensor split_reduce_hip(const at::Tensor& split_O, const at::Tensor& split_max, const at::Tensor& split_sumexp, const int32_t split_k) {
+    at::OptionalDeviceGuard guard(split_O.device());
+
+    auto B = split_O.size(1);
+    auto M = split_O.size(2);
+    auto G = split_O.size(3);
+    auto H = split_O.size(4);
+    auto D = split_O.size(5);
+
+    TORCH_CHECK_EQ(split_k, split_O.size(0));
+    TORCH_CHECK_EQ(split_k, split_max.size(-1));
+    TORCH_CHECK_EQ(split_k, split_sumexp.size(-1));
+
+    constexpr auto rank       = 5;
+
+    TORCH_CHECK_EQ(split_O.dim(), 1 + rank);
+    TORCH_CHECK_EQ(split_max.dim(), rank);
+    TORCH_CHECK_EQ(split_sumexp.dim(), rank);
+
+    auto O = at::empty({B, M, G, H, D}, split_O.options());
+
+    auto stream            = at::cuda::getCurrentHIPStream().stream();
+    auto lds_bytes = 0;
+
+    dim3 blocks(B * H * M * G);
+    dim3 threads(kThreadsPerWavefront);
+
+    AT_DISPATCH_SWITCH_3(
+        at::ScalarType::Half,
+        at::ScalarType::BFloat16,
+        at::ScalarType::Float,
+        O.scalar_type(),
+        "efficient_attention_forward_decoder_split_reduce_ck_test",
+        [&] {
+            using ck_data_t   = c10_to_data_t<scalar_t>::type;
+            using device_op_t = ck::tensor_operation::device::FMHADecoderSplitReduceDeviceOp<ck_data_t>;
+            auto op           = device_op_t{};
+
+            auto split_O_acc =
+                split_O.packed_accessor32<scalar_t, 1 + rank, at::RestrictPtrTraits>();
+            auto O_acc   = O.packed_accessor32<scalar_t, rank, at::RestrictPtrTraits>();
+            auto split_max_acc = split_max.packed_accessor32<float, rank, at::RestrictPtrTraits>();
+            auto split_sumexp_acc =
+                split_sumexp.packed_accessor32<float, rank, at::RestrictPtrTraits>();
+            auto arg = device_op_t::Argument(
+                reinterpret_cast<const ck_data_t* __restrict__>(split_O_acc.data()),
+                split_max_acc.data(),
+                split_sumexp_acc.data(),
+                reinterpret_cast<ck_data_t* __restrict__>(O_acc.data()),
+                O_acc.size(1),
+                O_acc.size(2),
+                O_acc.size(3),
+                O_acc.size(4),
+                split_O_acc.stride(0),
+                split_O_acc.stride(1),
+                split_O_acc.stride(2),
+                split_O_acc.stride(3),
+                split_O_acc.stride(4),
+                split_k,
+                blocks,
+                threads,
+                lds_bytes);
+
+            auto invoker = device_op_t::Invoker{};
+            (void)invoker.Run(arg, {stream});
+        });
+    return O;
+}
+
 std::tuple<at::Tensor, at::Tensor, at::Tensor, at::Tensor>
 generate_inputs(const int32_t padding,
                 const int32_t B,
@@ -860,7 +901,7 @@ static void test_split_attention(int32_t padding, int32_t batch_size, int32_t Hq
     auto m_percent_match = at::sum(m_match_mask.to(torch::kFloat32)) / m_match_mask.numel();
     auto l_percent_match = at::sum(l_match_mask.to(torch::kFloat32)) / l_match_mask.numel();
 
-    printf("Padding=%d BS=%d Hq=%d Hkv=%d split_k=%d Mismatched split_O elements percentage: %.2f Mismatched split_max elements percentage: %.2f Mismatched split_sumexp elements percentage: %.2f\n", 
+    printf("[Test split attention] Padding=%d BS=%d Hq=%d Hkv=%d split_k=%d Mismatched split_O elements percentage: %.2f Mismatched split_max elements percentage: %.2f Mismatched split_sumexp elements percentage: %.2f\n", 
             padding,
             batch_size,
             Hq,
@@ -872,6 +913,19 @@ static void test_split_attention(int32_t padding, int32_t batch_size, int32_t Hq
 
 }
 
+static void test_split_reduce(int32_t padding, int32_t batch_size, int32_t Hq, int32_t Hkv, int32_t split_k) {
+    auto [XQ, K, V, seqlen] = generate_inputs(padding, batch_size, Hq, Hkv);
+
+    auto [O_ref, m_ref, l_ref] = split_attention_torch(XQ, K, V, seqlen, split_k);
+
+    auto O_torch = split_reduce_torch(O_ref, m_ref, l_ref, split_k);
+    auto O_hip = split_reduce_hip(O_ref, m_ref.squeeze(0), l_ref.squeeze(0), split_k);
+
+    auto mask = at::isclose(O_torch, O_hip, /*atol*/ 1e-3, /*rtol*/ 1e-5, /*equal_nan*/ false);
+    auto percent_match = at::sum(mask.to(torch::kFloat32)) / mask.numel();
+    printf("[Test split reduce] Padding=%d BS=%d Hq=%d Hkv=%d split_k=%d Mismatched elements percentage: %.2f\n", padding, batch_size, Hq, Hkv, split_k, 1. - percent_match.item<float>());
+}
+
 static void test_splitk_decoder_e2e_correctness(int32_t padding, int32_t batch_size, int32_t Hq, int32_t Hkv, int32_t split_k)
 {
     auto [XQ, K, V, seqlen] = generate_inputs(padding, batch_size, Hq, Hkv);
@@ -883,7 +937,7 @@ static void test_splitk_decoder_e2e_correctness(int32_t padding, int32_t batch_s
     auto gold_result = efficient_attention_forward_decoder_split1_torch(XQ, K, V, seqlen, qk_scale);
     auto mask = at::isclose(result, gold_result, /*atol*/ 1e-3, /*rtol*/ 1e-5, /*equal_nan*/ false);
     auto percent_match = at::sum(mask.to(torch::kFloat32)) / mask.numel();
-    printf("Padding=%d BS=%d Hq=%d Hkv=%d split_k=%d Mismatched elements percentage: %.2f\n", padding, batch_size, Hq, Hkv, split_k, 1. - percent_match.item<float>());
+    printf("[Test e2e split-k decoder] Padding=%d BS=%d Hq=%d Hkv=%d split_k=%d Mismatched elements percentage: %.2f\n", padding, batch_size, Hq, Hkv, split_k, 1. - percent_match.item<float>());
 }
 
 int main(int argc, char** argv)
@@ -913,6 +967,18 @@ int main(int argc, char** argv)
                 }
             }
         }
+
+        for (auto padding : {32, 4096}) {
+            for (auto batch_size : {1, 8}) {
+                for (auto Hq : { 16 }) {
+                    for (auto Hkv : { 16 }) {
+                        for (auto split_k : {1, 2}) {
+                            test_split_reduce(padding, batch_size, Hq, Hkv, split_k);
+                        }
+                    }
+                }
+            }
+        }
     }
     else
     {

From 69f2f0a901bbd60a0bc039f071e30ff993d130ca Mon Sep 17 00:00:00 2001
From: Max Podkorytov <4273004+tenpercent@users.noreply.github.com>
Date: Mon, 15 Jan 2024 18:49:50 +0000
Subject: [PATCH 354/641] refactor repetitive testing code

---
 .../hip_fmha/attention_forward_splitk.cpp     | 55 +++++++++----------
 1 file changed, 25 insertions(+), 30 deletions(-)

diff --git a/xformers/csrc/attention/hip_fmha/attention_forward_splitk.cpp b/xformers/csrc/attention/hip_fmha/attention_forward_splitk.cpp
index 3d106027e..cd399d0ec 100644
--- a/xformers/csrc/attention/hip_fmha/attention_forward_splitk.cpp
+++ b/xformers/csrc/attention/hip_fmha/attention_forward_splitk.cpp
@@ -82,6 +82,7 @@ split_reduce_torch(const at::Tensor& O_splits, const at::Tensor& m_splits, const
 
         auto log_alpha = at::neg(at::abs(at::sub(m_slice, m_current_max)));
         auto alpha = at::exp(log_alpha);
+        alpha.nan_to_num_(1.);
 
         O = at::add(O, at::add(O_slice, at::mul(at::add(at::mul(pick_our, O), at::mul(pick_new, O_slice)), at::sub(alpha, 1))));
         l_current_sum = at::add(l_current_sum, at::add(l_slice, at::mul(at::add(at::mul(pick_our, l_current_sum), at::mul(pick_new, l_slice)), at::sub(alpha, 1))));
@@ -795,7 +796,7 @@ at::Tensor split_reduce_hip(const at::Tensor& split_O, const at::Tensor& split_m
     TORCH_CHECK_EQ(split_max.dim(), rank);
     TORCH_CHECK_EQ(split_sumexp.dim(), rank);
 
-    auto O = at::empty({B, M, G, H, D}, split_O.options());
+    auto O = at::zeros({B, M, G, H, D}, split_O.options());
 
     auto stream            = at::cuda::getCurrentHIPStream().stream();
     auto lds_bytes = 0;
@@ -873,6 +874,12 @@ generate_inputs(const int32_t padding,
     return std::make_tuple(XQ, K, V, seqlen);
 }
 
+static float percent_mismatch(const at::Tensor& a, const at::Tensor& b) {
+    auto mask = at::isclose(a, b, /*atol*/ 1e-3, /*rtol*/ 1e-5, /*equal_nan*/ false);
+    auto percent_match = at::sum(mask.to(torch::kFloat32)) / mask.numel();
+    return 1. - percent_match.item<float>();
+}
+
 static void test_split_attention(int32_t padding, int32_t batch_size, int32_t Hq, int32_t Hkv, int32_t split_k)
 {
     auto [XQ, K, V, seqlen] = generate_inputs(padding, batch_size, Hq, Hkv);
@@ -881,25 +888,9 @@ static void test_split_attention(int32_t padding, int32_t batch_size, int32_t Hq
 
     auto [O_hip, m_hip, l_hip] = split_attention_hip(XQ, K, V, seqlen, split_k, /* wavefronts_per_block */ 1);
 
-    auto O_match_mask = at::isclose(O_ref,
-                                    O_hip,
-                                    /*atol*/ 1e-3,
-                                    /*rtol*/ 1e-5,
-                                    /*equal_nan*/ false);
-    auto m_match_mask = at::isclose(m_ref,
-                                    m_hip,
-                                    /*atol*/ 1e-3,
-                                    /*rtol*/ 1e-5,
-                                    /*equal_nan*/ false);
-    auto l_match_mask = at::isclose(l_ref,
-                                    l_hip,
-                                    /*atol*/ 1e-3,
-                                    /*rtol*/ 1e-5,
-                                    /*equal_nan*/ false);
-
-    auto O_percent_match = at::sum(O_match_mask.to(torch::kFloat32)) / O_match_mask.numel();
-    auto m_percent_match = at::sum(m_match_mask.to(torch::kFloat32)) / m_match_mask.numel();
-    auto l_percent_match = at::sum(l_match_mask.to(torch::kFloat32)) / l_match_mask.numel();
+    auto O_percent_mismatch = percent_mismatch(O_ref, O_hip);
+    auto m_percent_mismatch = percent_mismatch(m_ref, m_hip);
+    auto l_percent_mismatch = percent_mismatch(l_ref, l_hip);
 
     printf("[Test split attention] Padding=%d BS=%d Hq=%d Hkv=%d split_k=%d Mismatched split_O elements percentage: %.2f Mismatched split_max elements percentage: %.2f Mismatched split_sumexp elements percentage: %.2f\n", 
             padding,
@@ -907,10 +898,9 @@ static void test_split_attention(int32_t padding, int32_t batch_size, int32_t Hq
             Hq,
             Hkv,
             split_k,
-            1. - O_percent_match.item<float>(),
-            1. - m_percent_match.item<float>(),
-            1. - l_percent_match.item<float>());
-
+            O_percent_mismatch,
+            m_percent_mismatch,
+            l_percent_mismatch);
 }
 
 static void test_split_reduce(int32_t padding, int32_t batch_size, int32_t Hq, int32_t Hkv, int32_t split_k) {
@@ -921,9 +911,15 @@ static void test_split_reduce(int32_t padding, int32_t batch_size, int32_t Hq, i
     auto O_torch = split_reduce_torch(O_ref, m_ref, l_ref, split_k);
     auto O_hip = split_reduce_hip(O_ref, m_ref.squeeze(0), l_ref.squeeze(0), split_k);
 
-    auto mask = at::isclose(O_torch, O_hip, /*atol*/ 1e-3, /*rtol*/ 1e-5, /*equal_nan*/ false);
-    auto percent_match = at::sum(mask.to(torch::kFloat32)) / mask.numel();
-    printf("[Test split reduce] Padding=%d BS=%d Hq=%d Hkv=%d split_k=%d Mismatched elements percentage: %.2f\n", padding, batch_size, Hq, Hkv, split_k, 1. - percent_match.item<float>());
+    double qk_scale = 1. / sqrt(XQ.size(-1));
+    auto gold_result = efficient_attention_forward_decoder_splitk_ck_impl</* threads_per_wavefront */ 64, /* wavefronts_per_block */ 1>(
+        XQ, K, V, seqlen, qk_scale, split_k);
+
+    auto hip_gold_mismatch = percent_mismatch(O_hip, gold_result);
+    auto torch_gold_mismatch = percent_mismatch(O_torch, gold_result);
+    auto hip_torch_mismatch = percent_mismatch(O_hip, O_torch);
+    printf("[Test split reduce] Padding=%d BS=%d Hq=%d Hkv=%d split_k=%d Mismatched elements percentage: %.2f hip_gold: %.2f torch_gold: %.2f \n", 
+        padding, batch_size, Hq, Hkv, split_k, hip_torch_mismatch, hip_gold_mismatch, torch_gold_mismatch);
 }
 
 static void test_splitk_decoder_e2e_correctness(int32_t padding, int32_t batch_size, int32_t Hq, int32_t Hkv, int32_t split_k)
@@ -935,9 +931,8 @@ static void test_splitk_decoder_e2e_correctness(int32_t padding, int32_t batch_s
     auto result = efficient_attention_forward_decoder_splitk_ck_impl</* threads_per_wavefront */ 64, /* wavefronts_per_block */ 1>(
         XQ, K, V, seqlen, qk_scale, split_k);
     auto gold_result = efficient_attention_forward_decoder_split1_torch(XQ, K, V, seqlen, qk_scale);
-    auto mask = at::isclose(result, gold_result, /*atol*/ 1e-3, /*rtol*/ 1e-5, /*equal_nan*/ false);
-    auto percent_match = at::sum(mask.to(torch::kFloat32)) / mask.numel();
-    printf("[Test e2e split-k decoder] Padding=%d BS=%d Hq=%d Hkv=%d split_k=%d Mismatched elements percentage: %.2f\n", padding, batch_size, Hq, Hkv, split_k, 1. - percent_match.item<float>());
+    auto e2e_mismatch = percent_mismatch(result, gold_result);
+    printf("[Test e2e split-k decoder] Padding=%d BS=%d Hq=%d Hkv=%d split_k=%d Mismatched elements percentage: %.2f\n", padding, batch_size, Hq, Hkv, split_k, e2e_mismatch);
 }
 
 int main(int argc, char** argv)

From 2d54085f3499a306a6b0dc4ae79c9d888b8f50c2 Mon Sep 17 00:00:00 2001
From: Max Podkorytov <4273004+tenpercent@users.noreply.github.com>
Date: Mon, 15 Jan 2024 20:00:00 +0000
Subject: [PATCH 355/641] address code review: rearrange loops

---
 .../ck_attention_forward_decoder_splitk.h     | 19 +++++++------------
 1 file changed, 7 insertions(+), 12 deletions(-)

diff --git a/xformers/csrc/attention/hip_fmha/ck_attention_forward_decoder_splitk.h b/xformers/csrc/attention/hip_fmha/ck_attention_forward_decoder_splitk.h
index d2086405b..c2cd9345d 100644
--- a/xformers/csrc/attention/hip_fmha/ck_attention_forward_decoder_splitk.h
+++ b/xformers/csrc/attention/hip_fmha/ck_attention_forward_decoder_splitk.h
@@ -293,24 +293,19 @@ efficient_attention_forward_decoder_splitk_ck_kernel(const scalar_t* __restrict_
                 load_v<data_t, data_vec_t>(cache_K_base + t * K_stride_m, lane_idx, &k_loads[ttt]);
             }
         }
-        compute_t qk_accs[n_loop_unroll] = {};
 #pragma unroll n_loop_unroll
         for(auto ttt = 0; ttt < n_loop_unroll; ++ttt)
         {
+            compute_t qk_acc = 0;
             ck::inner_product<data_vec_t, data_vec_t, compute_t>(
-                q_thread, k_loads[ttt], qk_accs[ttt]);
-            qk_accs[ttt] *= qk_scale;
+                q_thread, k_loads[ttt], qk_acc);
+            qk_acc  *= qk_scale;
 
-            qk_accs[ttt] = wavefrontReduce(qk_accs[ttt], [](auto a, auto b) { return a + b; });
-            max_qk_acc   = ck::math::max(qk_accs[ttt], max_qk_acc);
-        }
-        if(lane_idx == 0)
-        {
-            auto* __restrict__ smem_base = smem + tt;
-#pragma unroll n_loop_unroll
-            for(auto ttt = 0; ttt < n_loop_unroll; ++ttt)
+            qk_acc = wavefrontReduce(qk_acc, [](auto a, auto b) { return a + b; });
+            max_qk_acc   = ck::math::max(qk_acc, max_qk_acc);
+            if(lane_idx == 0)
             {
-                smem_base[ttt] = qk_accs[ttt];
+                smem[tt + ttt] = qk_acc;
             }
         }
     }

From f937f064562d5c63bf94ea11411f687e8b813fa0 Mon Sep 17 00:00:00 2001
From: Max Podkorytov <4273004+tenpercent@users.noreply.github.com>
Date: Mon, 15 Jan 2024 20:03:38 +0000
Subject: [PATCH 356/641] address code review: add comment about number of
 iterations per split

---
 .../attention/hip_fmha/ck_attention_forward_decoder_splitk.h    | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/xformers/csrc/attention/hip_fmha/ck_attention_forward_decoder_splitk.h b/xformers/csrc/attention/hip_fmha/ck_attention_forward_decoder_splitk.h
index c2cd9345d..9f3c9c712 100644
--- a/xformers/csrc/attention/hip_fmha/ck_attention_forward_decoder_splitk.h
+++ b/xformers/csrc/attention/hip_fmha/ck_attention_forward_decoder_splitk.h
@@ -272,6 +272,8 @@ efficient_attention_forward_decoder_splitk_ck_kernel(const scalar_t* __restrict_
     data_vec_t k_loads[n_loop_unroll] = {};
 
     const auto dtt              = wavefronts_per_block * n_loop_unroll;
+    // only last split gets the tail.
+    // the first (split_k - 1) splits have a number of iterations divisible by `dtt`
     const auto n_unrolled_loops = t_max / dtt / split_k; // +1?
     const int32_t tt_low = wavefront_idx * n_loop_unroll + n_unrolled_loops * dtt * split_idx;
     const int32_t tt_high =

From 7f6b01f7462bd5c587274c6d028050ac852bbadd Mon Sep 17 00:00:00 2001
From: Max Podkorytov <4273004+tenpercent@users.noreply.github.com>
Date: Mon, 15 Jan 2024 22:29:51 +0000
Subject: [PATCH 357/641] address code review: remove comments

---
 .../attention/hip_fmha/ck_attention_forward_decoder_splitk.h   | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/xformers/csrc/attention/hip_fmha/ck_attention_forward_decoder_splitk.h b/xformers/csrc/attention/hip_fmha/ck_attention_forward_decoder_splitk.h
index 9f3c9c712..f58fd2732 100644
--- a/xformers/csrc/attention/hip_fmha/ck_attention_forward_decoder_splitk.h
+++ b/xformers/csrc/attention/hip_fmha/ck_attention_forward_decoder_splitk.h
@@ -312,7 +312,6 @@ efficient_attention_forward_decoder_splitk_ck_kernel(const scalar_t* __restrict_
         }
     }
 
-    // NB: the length of the tail is <= (wavefronts_per_block * n_loop_unroll)
     for(auto tt = tt_tail_low; tt < tt_tail_high; tt += dtt_tail)
     {
         if(lane_active_for_io)
@@ -465,8 +464,6 @@ efficient_attention_forward_decoder_splitk_ck_kernel(const scalar_t* __restrict_
             }
         }
     }
-    // now, each thread has partial sums. Write to smem and get accumulated
-    // results back.
     __syncthreads();
 
     // NB: needs sizeof(smem) >= `vec_size` * (sizeof(float)==4) * threadsPerBlock

From 187a4bc089a450eda1ea32f3f515781439651776 Mon Sep 17 00:00:00 2001
From: Max Podkorytov <4273004+tenpercent@users.noreply.github.com>
Date: Mon, 15 Jan 2024 22:39:15 +0000
Subject: [PATCH 358/641] address code review: possibly eliminate a bug by
 using correct timestep range for scaling sumexp in smem

---
 .../hip_fmha/ck_attention_forward_decoder_splitk.h         | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/xformers/csrc/attention/hip_fmha/ck_attention_forward_decoder_splitk.h b/xformers/csrc/attention/hip_fmha/ck_attention_forward_decoder_splitk.h
index f58fd2732..e08fe6c08 100644
--- a/xformers/csrc/attention/hip_fmha/ck_attention_forward_decoder_splitk.h
+++ b/xformers/csrc/attention/hip_fmha/ck_attention_forward_decoder_splitk.h
@@ -405,8 +405,11 @@ efficient_attention_forward_decoder_splitk_ck_kernel(const scalar_t* __restrict_
     // now, compute the normalization across all threads.
     for(int32_t t = thread_linear_idx; t < t_max; t += threads_per_block)
     {
-        // softmax scale by sumexp will happen in the reduction kernel
-        smem[t] = ck::math::exp(smem[t] - max_qk_acc);
+        if (t >= tt_low && t < tt_tail_high)
+        {
+            // softmax scale by sumexp will happen in the reduction kernel
+            smem[t] = ck::math::exp(smem[t] - max_qk_acc);
+        }
     }
     __syncthreads();
 

From b157cbae0cba3973cffeac73de474f085713bc9e Mon Sep 17 00:00:00 2001
From: Max Podkorytov <4273004+tenpercent@users.noreply.github.com>
Date: Mon, 15 Jan 2024 22:48:53 +0000
Subject: [PATCH 359/641] address code review: add todo

---
 .../attention/hip_fmha/ck_attention_forward_decoder_splitk.h    | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/xformers/csrc/attention/hip_fmha/ck_attention_forward_decoder_splitk.h b/xformers/csrc/attention/hip_fmha/ck_attention_forward_decoder_splitk.h
index e08fe6c08..419a36394 100644
--- a/xformers/csrc/attention/hip_fmha/ck_attention_forward_decoder_splitk.h
+++ b/xformers/csrc/attention/hip_fmha/ck_attention_forward_decoder_splitk.h
@@ -227,6 +227,8 @@ efficient_attention_forward_decoder_splitk_ck_kernel(const scalar_t* __restrict_
 
     const int32_t lane_idx              = threadIdx.x;
     const int32_t wavefront_idx         = threadIdx.y;
+    // TODO: `threads_per_wavefront` and `wavefronts_per_block` may be compile time constants;
+    // investigate when optimizing
     const int32_t threads_per_wavefront = blockDim.x;
     const int32_t wavefronts_per_block  = blockDim.y;
     const int32_t threads_per_block     = threads_per_wavefront * wavefronts_per_block;

From 8581811e97f54c712c27ffa2849e54e4d0a9282b Mon Sep 17 00:00:00 2001
From: Max Podkorytov <4273004+tenpercent@users.noreply.github.com>
Date: Tue, 16 Jan 2024 19:12:10 +0000
Subject: [PATCH 360/641] address code review: shift LDS access by tt_low to
 avoid smem overbooking

---
 .../hip_fmha/attention_forward_splitk.cpp     | 45 ++++++++++++++++---
 .../ck_attention_forward_decoder_splitk.h     | 15 ++++---
 2 files changed, 47 insertions(+), 13 deletions(-)

diff --git a/xformers/csrc/attention/hip_fmha/attention_forward_splitk.cpp b/xformers/csrc/attention/hip_fmha/attention_forward_splitk.cpp
index cd399d0ec..3fad4afdd 100644
--- a/xformers/csrc/attention/hip_fmha/attention_forward_splitk.cpp
+++ b/xformers/csrc/attention/hip_fmha/attention_forward_splitk.cpp
@@ -615,6 +615,34 @@ struct FMHADecoderSplitReduceDeviceOp : public BaseOperator
               lds_bytes(lds_bytes)
         {
         }
+
+        std::string str() const
+        {
+            std::ostringstream oss;
+            oss << "Argument { " << std::endl
+                << "    O: " << O << std::endl
+                << "    split_O: " << split_O << std::endl
+                << "    split_max: " << split_max << std::endl
+                << "    split_sumexp: " << split_sumexp << std::endl
+                << "    O_stride_b: " << O_stride_b << std::endl
+                << "    O_stride_m: " << O_stride_m << std::endl
+                << "    O_stride_g: " << O_stride_g << std::endl
+                << "    O_stride_h: " << O_stride_h << std::endl
+                << "    O_stride_split: " << O_stride_split << std::endl
+                << "    O_size_m: " << O_size_m << std::endl
+                << "    O_size_g: " << O_size_g << std::endl
+                << "    O_size_h: " << O_size_h << std::endl
+                << "    O_size_k: " << O_size_k << std::endl
+                << "    split_k: " << split_k << std::endl
+                << std::endl
+                << "    grid_dim: " << grid_dim.x << "." << grid_dim.y << "." << grid_dim.z
+                << std::endl
+                << "    block_dim: " << block_dim.x << "." << block_dim.y << "." << block_dim.z
+                << std::endl
+                << "    lds_bytes: " << lds_bytes << std::endl
+                << "}";
+            return oss.str();
+        }
     };
 
     struct Invoker : public BaseInvoker
@@ -624,6 +652,9 @@ struct FMHADecoderSplitReduceDeviceOp : public BaseOperator
         {
             auto threads_per_wavefront = arg.block_dim.x;
 
+            // std::cout << arg.str() << std::endl << "stream_id: " << stream_config.stream_id_ <<
+            // std::endl;
+
             auto O_size_k_alignment_necessary = 0;
 
             for(auto vec_size : {4, 2, 1})
@@ -831,10 +862,10 @@ at::Tensor split_reduce_hip(const at::Tensor& split_O, const at::Tensor& split_m
                 O_acc.size(3),
                 O_acc.size(4),
                 split_O_acc.stride(0),
-                split_O_acc.stride(1),
-                split_O_acc.stride(2),
-                split_O_acc.stride(3),
-                split_O_acc.stride(4),
+                O_acc.stride(0),
+                O_acc.stride(1),
+                O_acc.stride(2),
+                O_acc.stride(3),
                 split_k,
                 blocks,
                 threads,
@@ -914,12 +945,14 @@ static void test_split_reduce(int32_t padding, int32_t batch_size, int32_t Hq, i
     double qk_scale = 1. / sqrt(XQ.size(-1));
     auto gold_result = efficient_attention_forward_decoder_splitk_ck_impl</* threads_per_wavefront */ 64, /* wavefronts_per_block */ 1>(
         XQ, K, V, seqlen, qk_scale, split_k);
+    auto torch1_result = efficient_attention_forward_decoder_split1_torch(XQ, K, V, seqlen, qk_scale);
 
     auto hip_gold_mismatch = percent_mismatch(O_hip, gold_result);
     auto torch_gold_mismatch = percent_mismatch(O_torch, gold_result);
     auto hip_torch_mismatch = percent_mismatch(O_hip, O_torch);
-    printf("[Test split reduce] Padding=%d BS=%d Hq=%d Hkv=%d split_k=%d Mismatched elements percentage: %.2f hip_gold: %.2f torch_gold: %.2f \n", 
-        padding, batch_size, Hq, Hkv, split_k, hip_torch_mismatch, hip_gold_mismatch, torch_gold_mismatch);
+    auto gold_torch1_mismatch = percent_mismatch(gold_result, torch1_result);
+    printf("[Test split reduce] Padding=%d BS=%d Hq=%d Hkv=%d split_k=%d Mismatched elements percentage: %.2f hip_gold: %.2f torch_gold: %.2f torch1_gold: %.2f \n", 
+        padding, batch_size, Hq, Hkv, split_k, hip_torch_mismatch, hip_gold_mismatch, torch_gold_mismatch, gold_torch1_mismatch);
 }
 
 static void test_splitk_decoder_e2e_correctness(int32_t padding, int32_t batch_size, int32_t Hq, int32_t Hkv, int32_t split_k)
diff --git a/xformers/csrc/attention/hip_fmha/ck_attention_forward_decoder_splitk.h b/xformers/csrc/attention/hip_fmha/ck_attention_forward_decoder_splitk.h
index 419a36394..942d70e4a 100644
--- a/xformers/csrc/attention/hip_fmha/ck_attention_forward_decoder_splitk.h
+++ b/xformers/csrc/attention/hip_fmha/ck_attention_forward_decoder_splitk.h
@@ -309,7 +309,7 @@ efficient_attention_forward_decoder_splitk_ck_kernel(const scalar_t* __restrict_
             max_qk_acc   = ck::math::max(qk_acc, max_qk_acc);
             if(lane_idx == 0)
             {
-                smem[tt + ttt] = qk_acc;
+                smem[tt + ttt - tt_low] = qk_acc;
             }
         }
     }
@@ -347,7 +347,7 @@ efficient_attention_forward_decoder_splitk_ck_kernel(const scalar_t* __restrict_
                 // write accumulated sums to smem.
                 if(lane_idx == 0)
                 {
-                    smem[t] = qk_acc;
+                    smem[t - tt_low] = qk_acc;
                 }
             }
         }
@@ -378,7 +378,7 @@ efficient_attention_forward_decoder_splitk_ck_kernel(const scalar_t* __restrict_
     {
         if(t >= tt_low && t < tt_tail_high)
         {
-            softmax_denominator += ck::math::exp(smem[t] - max_qk_acc);
+            softmax_denominator += ck::math::exp(smem[t - tt_low] - max_qk_acc);
         }
     }
     softmax_denominator =
@@ -410,7 +410,7 @@ efficient_attention_forward_decoder_splitk_ck_kernel(const scalar_t* __restrict_
         if (t >= tt_low && t < tt_tail_high)
         {
             // softmax scale by sumexp will happen in the reduction kernel
-            smem[t] = ck::math::exp(smem[t] - max_qk_acc);
+            smem[t - tt_low] = ck::math::exp(smem[t - tt_low] - max_qk_acc);
         }
     }
     __syncthreads();
@@ -432,7 +432,7 @@ efficient_attention_forward_decoder_splitk_ck_kernel(const scalar_t* __restrict_
                 // load the V[b][t][g][h|0][:] row into registers, reusing K register
                 // storage
                 load_v<data_t, data_vec_t>(cache_V_base + t * K_stride_m, lane_idx, &k_loads[ttt]);
-                ps[ttt] = smem[t];
+                ps[ttt] = smem[t - tt_low];
             }
 
 #pragma unroll n_loop_unroll
@@ -454,7 +454,7 @@ efficient_attention_forward_decoder_splitk_ck_kernel(const scalar_t* __restrict_
                     // storage
                     load_v<data_t, data_vec_t>(
                         cache_V_base + t * K_stride_m, lane_idx, &k_loads[ttt]);
-                    ps[ttt] = smem[t];
+                    ps[ttt] = smem[t - tt_low];
                 }
             }
 
@@ -657,7 +657,8 @@ struct FMHADecoderSplitKDeviceOp : public BaseOperator
         using Argument = DeviceOp::Argument;
         float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{})
         {
-
+            // std::cout << arg.str() << std::endl << "stream_id: " << stream_config.stream_id_ << std::endl;
+            
             auto threads_per_wavefront = arg.block_dim.x;
 
             auto Q_size_k_alignment_necessary = 0;

From b1638ad988a1af1b8a33684929ee76bb39c94bbb Mon Sep 17 00:00:00 2001
From: Max Podkorytov <4273004+tenpercent@users.noreply.github.com>
Date: Tue, 16 Jan 2024 20:58:45 +0000
Subject: [PATCH 361/641] address code review: simplify reduction loops in
 split attention

---
 .../ck_attention_forward_decoder_splitk.h        | 16 +++++-----------
 1 file changed, 5 insertions(+), 11 deletions(-)

diff --git a/xformers/csrc/attention/hip_fmha/ck_attention_forward_decoder_splitk.h b/xformers/csrc/attention/hip_fmha/ck_attention_forward_decoder_splitk.h
index 942d70e4a..e655cdfe5 100644
--- a/xformers/csrc/attention/hip_fmha/ck_attention_forward_decoder_splitk.h
+++ b/xformers/csrc/attention/hip_fmha/ck_attention_forward_decoder_splitk.h
@@ -374,12 +374,9 @@ efficient_attention_forward_decoder_splitk_ck_kernel(const scalar_t* __restrict_
 
     // each wavefront computes partial sum of exp.
     compute_t softmax_denominator = 0.0f;
-    for(int32_t t = thread_linear_idx; t < t_max; t += threads_per_block)
+    for(int32_t t = tt_low + thread_linear_idx; t < tt_tail_high; t += threads_per_block)
     {
-        if(t >= tt_low && t < tt_tail_high)
-        {
-            softmax_denominator += ck::math::exp(smem[t - tt_low] - max_qk_acc);
-        }
+        softmax_denominator += ck::math::exp(smem[t - tt_low] - max_qk_acc);
     }
     softmax_denominator =
         wavefrontReduce(softmax_denominator, [](auto a, auto b) { return a + b; });
@@ -405,13 +402,10 @@ efficient_attention_forward_decoder_splitk_ck_kernel(const scalar_t* __restrict_
     }
 
     // now, compute the normalization across all threads.
-    for(int32_t t = thread_linear_idx; t < t_max; t += threads_per_block)
+    for(int32_t t = tt_low + thread_linear_idx; t < tt_tail_high; t += threads_per_block)
     {
-        if (t >= tt_low && t < tt_tail_high)
-        {
-            // softmax scale by sumexp will happen in the reduction kernel
-            smem[t - tt_low] = ck::math::exp(smem[t - tt_low] - max_qk_acc);
-        }
+        // softmax scale by sumexp will happen in the reduction kernel
+        smem[t - tt_low] = ck::math::exp(smem[t - tt_low] - max_qk_acc);
     }
     __syncthreads();
 

From 10e76ab56c8f78b88206691b596f902624b88347 Mon Sep 17 00:00:00 2001
From: Qianfeng Zhang <Qianfeng.Zhang@amd.com>
Date: Wed, 17 Jan 2024 15:39:48 +0000
Subject: [PATCH 362/641] Tiny update in ck-tiled forward kernel

---
 .../hip_fmha/ck_tiled_fmha_forward_kernel.h   | 24 ++++++++-----------
 1 file changed, 10 insertions(+), 14 deletions(-)

diff --git a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_forward_kernel.h b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_forward_kernel.h
index a248f3525..034c0178e 100644
--- a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_forward_kernel.h
+++ b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_forward_kernel.h
@@ -566,15 +566,13 @@ struct FmhaFwdKernel
                         res = ck::make_generic_attention_mask_coordinates_from_lr_window(
                             left_size, right_size, kargs.seqlen_q, kargs.seqlen_k);
                     }
-                    else if(kargs.mask_type == CausalMaskType::MaskUpperTriangleFromTopLeft)
-                    {
-                        res = ck::make_generic_attention_mask_coordinates_from_lr_window(
-                            kargs.window_size - 1, 0, kargs.seqlen_q, kargs.seqlen_k, true);
-                    }
-                    else if(kargs.mask_type == CausalMaskType::MaskUpperTriangleFromBottomRight)
+                    else
                     {
+                        bool is_topleft =
+                            (kargs.mask_type == CausalMaskType::MaskUpperTriangleFromTopLeft);
+
                         res = ck::make_generic_attention_mask_coordinates_from_lr_window(
-                            kargs.window_size - 1, 0, kargs.seqlen_q, kargs.seqlen_k, false);
+                            kargs.window_size - 1, 0, kargs.seqlen_q, kargs.seqlen_k, is_topleft);
                     }
                 }
                 else
@@ -584,15 +582,13 @@ struct FmhaFwdKernel
                         res = ck::make_generic_attention_mask_coordinates_from_lr_window(
                             -1, -1, kargs.seqlen_q, kargs.seqlen_k);
                     }
-                    else if(kargs.mask_type == CausalMaskType::MaskUpperTriangleFromTopLeft)
-                    {
-                        res = ck::make_generic_attention_mask_coordinates_from_lr_window(
-                            -1, 0, kargs.seqlen_q, kargs.seqlen_k, true);
-                    }
-                    else if(kargs.mask_type == CausalMaskType::MaskUpperTriangleFromBottomRight)
+                    else
                     {
+                        bool is_topleft =
+                            (kargs.mask_type == CausalMaskType::MaskUpperTriangleFromTopLeft);
+
                         res = ck::make_generic_attention_mask_coordinates_from_lr_window(
-                            -1, 0, kargs.seqlen_q, kargs.seqlen_k, false);
+                            -1, 0, kargs.seqlen_q, kargs.seqlen_k, is_topleft);
                     }
                 }
 

From 67009e0acee5b3f3dbabff7b087a2282aeb1ca16 Mon Sep 17 00:00:00 2001
From: Max Podkorytov <4273004+tenpercent@users.noreply.github.com>
Date: Wed, 17 Jan 2024 18:29:25 +0000
Subject: [PATCH 363/641] address code review: merge for loops

---
 .../hip_fmha/ck_attention_forward_decoder_splitk.h       | 9 ---------
 1 file changed, 9 deletions(-)

diff --git a/xformers/csrc/attention/hip_fmha/ck_attention_forward_decoder_splitk.h b/xformers/csrc/attention/hip_fmha/ck_attention_forward_decoder_splitk.h
index e655cdfe5..5fffd02ba 100644
--- a/xformers/csrc/attention/hip_fmha/ck_attention_forward_decoder_splitk.h
+++ b/xformers/csrc/attention/hip_fmha/ck_attention_forward_decoder_splitk.h
@@ -449,15 +449,6 @@ efficient_attention_forward_decoder_splitk_ck_kernel(const scalar_t* __restrict_
                     load_v<data_t, data_vec_t>(
                         cache_V_base + t * K_stride_m, lane_idx, &k_loads[ttt]);
                     ps[ttt] = smem[t - tt_low];
-                }
-            }
-
-#pragma unroll n_loop_unroll_tail
-            for(auto ttt = 0; ttt < n_loop_unroll_tail; ++ttt)
-            {
-                const int32_t t = tt + ttt;
-                if(t < t_max)
-                {
                     o_acc = scalar_scale_acc<data_t, vec_size>(o_acc, k_loads[ttt], ps[ttt]);
                 }
             }

From 8673fa9752a071d7ebe64fe43c003bfc37eaaa99 Mon Sep 17 00:00:00 2001
From: Max Podkorytov <4273004+tenpercent@users.noreply.github.com>
Date: Wed, 17 Jan 2024 19:13:16 +0000
Subject: [PATCH 364/641] address code review: simplify coefficient pick

---
 .../csrc/attention/hip_fmha/attention_forward_splitk.cpp | 9 +++++----
 .../hip_fmha/ck_attention_forward_decoder_splitk.h       | 9 ++++-----
 2 files changed, 9 insertions(+), 9 deletions(-)

diff --git a/xformers/csrc/attention/hip_fmha/attention_forward_splitk.cpp b/xformers/csrc/attention/hip_fmha/attention_forward_splitk.cpp
index 3fad4afdd..6abb09c8e 100644
--- a/xformers/csrc/attention/hip_fmha/attention_forward_splitk.cpp
+++ b/xformers/csrc/attention/hip_fmha/attention_forward_splitk.cpp
@@ -78,14 +78,15 @@ split_reduce_torch(const at::Tensor& O_splits, const at::Tensor& m_splits, const
         auto m_new = at::max(m_slice, m_current_max);
 
         auto pick_new = at::less(m_slice, m_current_max);
-        auto pick_our = at::logical_not(pick_new);
 
         auto log_alpha = at::neg(at::abs(at::sub(m_slice, m_current_max)));
         auto alpha = at::exp(log_alpha);
         alpha.nan_to_num_(1.);
-
-        O = at::add(O, at::add(O_slice, at::mul(at::add(at::mul(pick_our, O), at::mul(pick_new, O_slice)), at::sub(alpha, 1))));
-        l_current_sum = at::add(l_current_sum, at::add(l_slice, at::mul(at::add(at::mul(pick_our, l_current_sum), at::mul(pick_new, l_slice)), at::sub(alpha, 1))));
+        auto pick_current_coef = at::where(pick_new, 1., alpha);
+        auto pick_new_coef = at::where(pick_new, alpha, 1.);
+        O = at::add(at::mul(pick_current_coef, O), at::mul(pick_new_coef, O_slice));
+        l_current_sum = at::add(at::mul(pick_current_coef, l_current_sum), at::mul(pick_new_coef, l_slice));
+        m_current_max = m_new;
     }
     
     return at::div(O, l_current_sum);
diff --git a/xformers/csrc/attention/hip_fmha/ck_attention_forward_decoder_splitk.h b/xformers/csrc/attention/hip_fmha/ck_attention_forward_decoder_splitk.h
index 5fffd02ba..9f1d03b5e 100644
--- a/xformers/csrc/attention/hip_fmha/ck_attention_forward_decoder_splitk.h
+++ b/xformers/csrc/attention/hip_fmha/ck_attention_forward_decoder_splitk.h
@@ -138,7 +138,6 @@ __global__ void efficient_attention_forward_decoder_splitk_reduce_ck_kernel(
     //       l_current_sum.isnan().any(), "l acc is nan" m_current_max = m_new
     //   out /= l_current_sum
 
-    compute_t new_max       = 0;
     compute_t global_sumexp = 0;
     compute_t global_max    = ck::NumericLimits<compute_t>::Lowest();
 
@@ -155,12 +154,12 @@ __global__ void efficient_attention_forward_decoder_splitk_reduce_ck_kernel(
         }
         compute_t local_max         = *(split_max + blockIdx.x * split_k + split_idx);
         compute_t local_sumexp      = *(split_sumexp + blockIdx.x * split_k + split_idx);
-        new_max                     = ck::math::max(local_max, global_max);
+        compute_t new_max           = ck::math::max(local_max, global_max);
         bool pick_new               = local_max < global_max;
         compute_t log_alpha         = -std::abs(local_max - global_max);
-        compute_t alpha             = isnan(log_alpha) ? compute_t{1} : ck::math::exp(log_alpha);
-        compute_t pick_current_coef = (1 + (1 - pick_new) * (alpha - 1));
-        compute_t pick_new_coef     = (1 + pick_new * (alpha - 1));
+        compute_t alpha             = isnan(log_alpha) ? compute_t{1.} : ck::math::exp(log_alpha);
+        compute_t pick_current_coef = pick_new ? 1. : alpha;
+        compute_t pick_new_coef     = pick_new ? alpha : 1.;
         global_sumexp = pick_current_coef * global_sumexp + pick_new_coef * local_sumexp;
         global_O_compute.vec =
             pick_current_coef * global_O_compute.vec + pick_new_coef * O_split_compute.vec;

From 3427dccea12674da1ff7f8a8b3a973aef7785544 Mon Sep 17 00:00:00 2001
From: Max Podkorytov <4273004+tenpercent@users.noreply.github.com>
Date: Wed, 17 Jan 2024 19:24:12 +0000
Subject: [PATCH 365/641] fix runtime error message in testing code

---
 xformers/csrc/attention/hip_fmha/attention_forward_splitk.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/xformers/csrc/attention/hip_fmha/attention_forward_splitk.cpp b/xformers/csrc/attention/hip_fmha/attention_forward_splitk.cpp
index 6abb09c8e..23ec3cf6c 100644
--- a/xformers/csrc/attention/hip_fmha/attention_forward_splitk.cpp
+++ b/xformers/csrc/attention/hip_fmha/attention_forward_splitk.cpp
@@ -668,12 +668,12 @@ struct FMHADecoderSplitReduceDeviceOp : public BaseOperator
 
             if(!O_size_k_alignment_necessary)
             {
-                throw std::runtime_error("Unsupported Q_size_k");
+                throw std::runtime_error("Unsupported O_size_k");
             }
 
             if(arg.O_size_k % O_size_k_alignment_necessary)
             {
-                throw std::runtime_error("Unsupported alignment for Q_size_k");
+                throw std::runtime_error("Unsupported alignment for O_size_k");
             }
 
             const dim3 reduce_gridsize         = {arg.grid_dim.x};

From 2e11d329fc27398ff202c6f8457093b03642fc82 Mon Sep 17 00:00:00 2001
From: Max Podkorytov <4273004+tenpercent@users.noreply.github.com>
Date: Wed, 17 Jan 2024 19:32:29 +0000
Subject: [PATCH 366/641] fix split reduce test

---
 .../hip_fmha/attention_forward_splitk.cpp      | 18 +++++-------------
 1 file changed, 5 insertions(+), 13 deletions(-)

diff --git a/xformers/csrc/attention/hip_fmha/attention_forward_splitk.cpp b/xformers/csrc/attention/hip_fmha/attention_forward_splitk.cpp
index 23ec3cf6c..dd305866f 100644
--- a/xformers/csrc/attention/hip_fmha/attention_forward_splitk.cpp
+++ b/xformers/csrc/attention/hip_fmha/attention_forward_splitk.cpp
@@ -938,22 +938,14 @@ static void test_split_attention(int32_t padding, int32_t batch_size, int32_t Hq
 static void test_split_reduce(int32_t padding, int32_t batch_size, int32_t Hq, int32_t Hkv, int32_t split_k) {
     auto [XQ, K, V, seqlen] = generate_inputs(padding, batch_size, Hq, Hkv);
 
-    auto [O_ref, m_ref, l_ref] = split_attention_torch(XQ, K, V, seqlen, split_k);
-
-    auto O_torch = split_reduce_torch(O_ref, m_ref, l_ref, split_k);
-    auto O_hip = split_reduce_hip(O_ref, m_ref.squeeze(0), l_ref.squeeze(0), split_k);
+    auto [O_ref, m_ref, l_ref] = split_attention_hip(XQ, K, V, seqlen, split_k, /* wavefronts_per_block */ 1);
 
-    double qk_scale = 1. / sqrt(XQ.size(-1));
-    auto gold_result = efficient_attention_forward_decoder_splitk_ck_impl</* threads_per_wavefront */ 64, /* wavefronts_per_block */ 1>(
-        XQ, K, V, seqlen, qk_scale, split_k);
-    auto torch1_result = efficient_attention_forward_decoder_split1_torch(XQ, K, V, seqlen, qk_scale);
+    auto O_torch = split_reduce_torch(O_ref, m_ref.unsqueeze(0), l_ref.unsqueeze(0), split_k);
+    auto O_hip = split_reduce_hip(O_ref, m_ref, l_ref, split_k);
 
-    auto hip_gold_mismatch = percent_mismatch(O_hip, gold_result);
-    auto torch_gold_mismatch = percent_mismatch(O_torch, gold_result);
     auto hip_torch_mismatch = percent_mismatch(O_hip, O_torch);
-    auto gold_torch1_mismatch = percent_mismatch(gold_result, torch1_result);
-    printf("[Test split reduce] Padding=%d BS=%d Hq=%d Hkv=%d split_k=%d Mismatched elements percentage: %.2f hip_gold: %.2f torch_gold: %.2f torch1_gold: %.2f \n", 
-        padding, batch_size, Hq, Hkv, split_k, hip_torch_mismatch, hip_gold_mismatch, torch_gold_mismatch, gold_torch1_mismatch);
+    printf("[Test split reduce] Padding=%d BS=%d Hq=%d Hkv=%d split_k=%d Mismatched elements percentage: %.2f \n", 
+        padding, batch_size, Hq, Hkv, split_k, hip_torch_mismatch);
 }
 
 static void test_splitk_decoder_e2e_correctness(int32_t padding, int32_t batch_size, int32_t Hq, int32_t Hkv, int32_t split_k)

From dabc771db9f03a6afc5a4280ecbbda95ecfc071f Mon Sep 17 00:00:00 2001
From: Max Podkorytov <4273004+tenpercent@users.noreply.github.com>
Date: Wed, 17 Jan 2024 19:43:34 +0000
Subject: [PATCH 367/641] address code review: fix smem offsets

---
 .../hip_fmha/ck_attention_forward_decoder_splitk.h   | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/xformers/csrc/attention/hip_fmha/ck_attention_forward_decoder_splitk.h b/xformers/csrc/attention/hip_fmha/ck_attention_forward_decoder_splitk.h
index 9f1d03b5e..bbb0da232 100644
--- a/xformers/csrc/attention/hip_fmha/ck_attention_forward_decoder_splitk.h
+++ b/xformers/csrc/attention/hip_fmha/ck_attention_forward_decoder_splitk.h
@@ -308,7 +308,7 @@ efficient_attention_forward_decoder_splitk_ck_kernel(const scalar_t* __restrict_
             max_qk_acc   = ck::math::max(qk_acc, max_qk_acc);
             if(lane_idx == 0)
             {
-                smem[tt + ttt - tt_low] = qk_acc;
+                smem[tt + ttt - n_unrolled_loops * dtt * split_idx] = qk_acc;
             }
         }
     }
@@ -346,7 +346,7 @@ efficient_attention_forward_decoder_splitk_ck_kernel(const scalar_t* __restrict_
                 // write accumulated sums to smem.
                 if(lane_idx == 0)
                 {
-                    smem[t - tt_low] = qk_acc;
+                    smem[t - n_unrolled_loops * dtt * split_idx] = qk_acc;
                 }
             }
         }
@@ -375,7 +375,7 @@ efficient_attention_forward_decoder_splitk_ck_kernel(const scalar_t* __restrict_
     compute_t softmax_denominator = 0.0f;
     for(int32_t t = tt_low + thread_linear_idx; t < tt_tail_high; t += threads_per_block)
     {
-        softmax_denominator += ck::math::exp(smem[t - tt_low] - max_qk_acc);
+        softmax_denominator += ck::math::exp(smem[t - n_unrolled_loops * dtt * split_idx] - max_qk_acc);
     }
     softmax_denominator =
         wavefrontReduce(softmax_denominator, [](auto a, auto b) { return a + b; });
@@ -404,7 +404,7 @@ efficient_attention_forward_decoder_splitk_ck_kernel(const scalar_t* __restrict_
     for(int32_t t = tt_low + thread_linear_idx; t < tt_tail_high; t += threads_per_block)
     {
         // softmax scale by sumexp will happen in the reduction kernel
-        smem[t - tt_low] = ck::math::exp(smem[t - tt_low] - max_qk_acc);
+        smem[t - n_unrolled_loops * dtt * split_idx] = ck::math::exp(smem[t - n_unrolled_loops * dtt * split_idx] - max_qk_acc);
     }
     __syncthreads();
 
@@ -425,7 +425,7 @@ efficient_attention_forward_decoder_splitk_ck_kernel(const scalar_t* __restrict_
                 // load the V[b][t][g][h|0][:] row into registers, reusing K register
                 // storage
                 load_v<data_t, data_vec_t>(cache_V_base + t * K_stride_m, lane_idx, &k_loads[ttt]);
-                ps[ttt] = smem[t - tt_low];
+                ps[ttt] = smem[t - n_unrolled_loops * dtt * split_idx];
             }
 
 #pragma unroll n_loop_unroll
@@ -447,7 +447,7 @@ efficient_attention_forward_decoder_splitk_ck_kernel(const scalar_t* __restrict_
                     // storage
                     load_v<data_t, data_vec_t>(
                         cache_V_base + t * K_stride_m, lane_idx, &k_loads[ttt]);
-                    ps[ttt] = smem[t - tt_low];
+                    ps[ttt] = smem[t - n_unrolled_loops * dtt * split_idx];
                     o_acc = scalar_scale_acc<data_t, vec_size>(o_acc, k_loads[ttt], ps[ttt]);
                 }
             }

From 6f1d5df0bd0ea7f6fa0c5378570d221fca477c3b Mon Sep 17 00:00:00 2001
From: Max Podkorytov <4273004+tenpercent@users.noreply.github.com>
Date: Wed, 17 Jan 2024 19:47:03 +0000
Subject: [PATCH 368/641] remove redundant comment

---
 .../ck_attention_forward_decoder_splitk.h     | 19 -------------------
 1 file changed, 19 deletions(-)

diff --git a/xformers/csrc/attention/hip_fmha/ck_attention_forward_decoder_splitk.h b/xformers/csrc/attention/hip_fmha/ck_attention_forward_decoder_splitk.h
index bbb0da232..87865db98 100644
--- a/xformers/csrc/attention/hip_fmha/ck_attention_forward_decoder_splitk.h
+++ b/xformers/csrc/attention/hip_fmha/ck_attention_forward_decoder_splitk.h
@@ -119,25 +119,6 @@ __global__ void efficient_attention_forward_decoder_splitk_reduce_ck_kernel(
         return;
     }
 
-    // for s in slices:
-    //       attn_slice = s["attn_slice"]
-    //       m = s["row_max"]
-    //       l = s["row_lse"]
-    //       m_new = torch.max(m, m_current_max)
-    //       assert not m_new.isnan().any(), "m_new is nan"
-    //       pick_new = m < m_current_max
-    //       pick_our = torch.logical_not(pick_new)
-
-    //       log_alpha = -torch.abs(m - m_current_max)
-    //       log_alpha[log_alpha.isnan()] = 0
-    //       alpha = torch.exp(log_alpha)
-    //       assert not alpha.isnan().any(), "alpha is nan"
-    //       out = out + attn_slice + (pick_our * out + pick_new * attn_slice) * (torch.sub(alpha,
-    //       1)) assert not out.isnan().any(), "out acc is nan" l_current_sum = l_current_sum + l +
-    //       (pick_our * l_current_sum + pick_new * l) * (torch.sub(alpha, 1)) assert not
-    //       l_current_sum.isnan().any(), "l acc is nan" m_current_max = m_new
-    //   out /= l_current_sum
-
     compute_t global_sumexp = 0;
     compute_t global_max    = ck::NumericLimits<compute_t>::Lowest();
 

From 8ee60d7f9470c4f469641385e73804743b08aff0 Mon Sep 17 00:00:00 2001
From: Max Podkorytov <4273004+tenpercent@users.noreply.github.com>
Date: Thu, 18 Jan 2024 18:36:13 +0000
Subject: [PATCH 369/641] address code review: initialize split attention
 workspace as empty

---
 .../csrc/attention/hip_fmha/attention_forward_splitk.cpp  | 8 +++-----
 1 file changed, 3 insertions(+), 5 deletions(-)

diff --git a/xformers/csrc/attention/hip_fmha/attention_forward_splitk.cpp b/xformers/csrc/attention/hip_fmha/attention_forward_splitk.cpp
index dd305866f..02f40bd8a 100644
--- a/xformers/csrc/attention/hip_fmha/attention_forward_splitk.cpp
+++ b/xformers/csrc/attention/hip_fmha/attention_forward_splitk.cpp
@@ -258,11 +258,9 @@ at::Tensor efficient_attention_forward_decoder_splitk_ck_impl(
     auto H = XQ.size(3);
     auto K = XQ.size(4);
 
-    auto O_splits = at::zeros({split_k, B, M, G, H, K}, XQ.options());
-
-    auto split_max = at::empty({B, M, G, H, split_k}, XQ.options().dtype(at::kFloat))
-                         .fill_(ck::NumericLimits<float>::Lowest());
-    auto split_sumexp = at::zeros_like(split_max);
+    auto O_splits = at::empty({split_k, B, M, G, H, K}, XQ.options());
+    auto split_max = at::empty({B, M, G, H, split_k}, XQ.options().dtype(at::kFloat));
+    auto split_sumexp = at::empty_like(split_max);
 
     efficient_attention_forward_decoder_splitk_ck_out_impl<ThreadsPerWavefront, WavefrontsPerBlock>(
         XQ, cache_K, cache_V, seq_kv_lens, qk_scale, split_k, split_max, split_sumexp, O_splits, O);

From ff985d23aeea78d42c6c96c3a9d48c509eeaf80e Mon Sep 17 00:00:00 2001
From: Max Podkorytov <4273004+tenpercent@users.noreply.github.com>
Date: Thu, 18 Jan 2024 18:49:04 +0000
Subject: [PATCH 370/641] address code review: rename local vars

---
 .../hip_fmha/attention_forward_splitk.cpp     | 188 +++++++++---------
 1 file changed, 95 insertions(+), 93 deletions(-)

diff --git a/xformers/csrc/attention/hip_fmha/attention_forward_splitk.cpp b/xformers/csrc/attention/hip_fmha/attention_forward_splitk.cpp
index 02f40bd8a..b57110bfc 100644
--- a/xformers/csrc/attention/hip_fmha/attention_forward_splitk.cpp
+++ b/xformers/csrc/attention/hip_fmha/attention_forward_splitk.cpp
@@ -12,86 +12,6 @@ constexpr int32_t kWavefrontsPerBlock  = 1;
 constexpr int32_t K_MAX                = 4 * kThreadsPerWavefront;
 } // namespace
 
-static std::tuple<at::Tensor, at::Tensor, at::Tensor> split_attention_torch(
-    const at::Tensor& Q, const at::Tensor& K, const at::Tensor& V, const at::Tensor& k_seqlens, const int32_t split_k)
-{
-    auto Q_scaled = at::div(Q, sqrt(Q.size(-1)));
-
-    std::vector<at::Tensor> O_splits;
-    std::vector<at::Tensor> m_splits;
-    std::vector<at::Tensor> l_splits;
-
-    for (int32_t split_idx = 0; split_idx < split_k; ++split_idx) {
-        std::vector<at::Tensor> O_batch;
-        std::vector<at::Tensor> m_batch;
-        std::vector<at::Tensor> l_batch;
-
-        for(size_t b = 0; b < k_seqlens.numel(); ++b) {
-            auto seqlen = k_seqlens[b].item<int64_t>();
-            const int64_t t_low = split_idx * (seqlen / split_k);
-            const int64_t t_high = (split_idx + 1 < split_k) 
-                                 ? (1 + split_idx) * (seqlen / split_k)
-                                 : seqlen;
-
-            auto S        = at::einsum("mghk, nghk -> mghn",
-                            {Q_scaled[b], at::slice(K[b], /*dim*/ 0, /*start*/ t_low, /*end*/ t_high)},
-                            /* einsum eval path */ at::nullopt);
-            auto m = std::get<0>(at::max(S, /* dim */ -1, /* keepdim */ true));
-            auto s = at::exp(at::sub(S, m));
-            auto l = at::sum(s, /* dim */ -1, /* keepdim */ true);
-            auto O        = at::einsum("mghn, nghk -> mghk", 
-                            {s, at::slice(V[b], /*dim*/ 0, /*start*/ t_low, /*end*/ t_high)}, 
-                            /* einsum eval path */ at::nullopt);
-            O_batch.push_back(O);
-            m_batch.push_back(m);
-            l_batch.push_back(l);
-        }
-
-        auto O_cat = at::stack(O_batch);
-        auto m_cat = at::stack(m_batch);
-        auto l_cat = at::stack(l_batch);
-
-        O_splits.push_back(O_cat);
-        m_splits.push_back(m_cat);
-        l_splits.push_back(l_cat);
-    }
-
-    auto O_cat = at::stack(O_splits);
-    auto m_cat = at::transpose(at::stack(m_splits), 0, -1);
-    auto l_cat = at::transpose(at::stack(l_splits), 0, -1);
-
-    return std::make_tuple(O_cat, m_cat, l_cat);
-}
-
-static at::Tensor
-split_reduce_torch(const at::Tensor& O_splits, const at::Tensor& m_splits, const at::Tensor& l_splits, int32_t split_k)
-{   
-    auto O = at::zeros_like(at::slice(O_splits, 0, 0, 1));
-    auto m_current_max = at::empty_like(at::slice(m_splits, -1, 0, 1)).fill_(-65535.);
-    auto l_current_sum = at::zeros_like(m_current_max);
-
-    for (int32_t split_idx = 0; split_idx < split_k; ++split_idx) {
-        auto O_slice = at::slice(O_splits, 0, split_idx, split_idx + 1);
-        auto m_slice = at::slice(m_splits, -1, split_idx, split_idx + 1);
-        auto l_slice = at::slice(l_splits, -1, split_idx, split_idx + 1);
-
-        auto m_new = at::max(m_slice, m_current_max);
-
-        auto pick_new = at::less(m_slice, m_current_max);
-
-        auto log_alpha = at::neg(at::abs(at::sub(m_slice, m_current_max)));
-        auto alpha = at::exp(log_alpha);
-        alpha.nan_to_num_(1.);
-        auto pick_current_coef = at::where(pick_new, 1., alpha);
-        auto pick_new_coef = at::where(pick_new, alpha, 1.);
-        O = at::add(at::mul(pick_current_coef, O), at::mul(pick_new_coef, O_slice));
-        l_current_sum = at::add(at::mul(pick_current_coef, l_current_sum), at::mul(pick_new_coef, l_slice));
-        m_current_max = m_new;
-    }
-    
-    return at::div(O, l_current_sum);
-}
-
 namespace {
 
 template <typename c10_t>
@@ -268,18 +188,6 @@ at::Tensor efficient_attention_forward_decoder_splitk_ck_impl(
     return O;
 }
 
-at::Tensor efficient_attention_forward_decoder_split1_torch(
-    const at::Tensor& XQ,                 // [B, 1, G, H, D]
-    const at::Tensor& cache_K,            // [B, KV_M_MAX, G, H or 1, D]
-    const at::Tensor& cache_V,            // [B, KV_M_MAX, G, H or 1, D]
-    at::optional<at::Tensor> seq_kv_lens, // [B]
-    double qk_scale)
-{
-    auto [O_split, m, l] = split_attention_torch(XQ, cache_K, cache_V, *seq_kv_lens, /*split_k*/ 1);
-    auto O               = split_reduce_torch(O_split, m, l, /*split_k*/ 1);
-    return O.reshape_as(XQ);
-}
-
 at::Tensor efficient_attention_forward_decoder_splitk_ck(
     const at::Tensor& XQ,                 // [B, 1, G, H, D]
     const at::Tensor& cache_K,            // [B, KV_M_MAX, G, H or 1, D]
@@ -333,6 +241,100 @@ TORCH_LIBRARY_IMPL(xformers, CUDA, m)
 
 // clang-format on
 
+static std::tuple<at::Tensor, at::Tensor, at::Tensor> split_attention_torch(
+    const at::Tensor& Q, const at::Tensor& K, const at::Tensor& V, const at::Tensor& k_seqlens, const int32_t split_k)
+{
+    auto Q_scaled = at::div(Q, sqrt(Q.size(-1)));
+
+    std::vector<at::Tensor> O_splits;
+    std::vector<at::Tensor> m_splits;
+    std::vector<at::Tensor> l_splits;
+
+    for (int32_t split_idx = 0; split_idx < split_k; ++split_idx) {
+        std::vector<at::Tensor> O_batch;
+        std::vector<at::Tensor> m_batch;
+        std::vector<at::Tensor> l_batch;
+
+        for(size_t b = 0; b < k_seqlens.numel(); ++b) {
+            auto seqlen = k_seqlens[b].item<int64_t>();
+            const int64_t t_low = split_idx * (seqlen / split_k);
+            const int64_t t_high = (split_idx + 1 < split_k) 
+                                 ? (1 + split_idx) * (seqlen / split_k)
+                                 : seqlen;
+
+            auto S        = at::einsum("mghk, nghk -> mghn",
+                            {Q_scaled[b], at::slice(K[b], /*dim*/ 0, /*start*/ t_low, /*end*/ t_high)},
+                            /* einsum eval path */ at::nullopt);
+            auto m = std::get<0>(at::max(S, /* dim */ -1, /* keepdim */ true));
+            auto s = at::exp(at::sub(S, m));
+            auto l = at::sum(s, /* dim */ -1, /* keepdim */ true);
+            auto O        = at::einsum("mghn, nghk -> mghk", 
+                            {s, at::slice(V[b], /*dim*/ 0, /*start*/ t_low, /*end*/ t_high)}, 
+                            /* einsum eval path */ at::nullopt);
+            O_batch.push_back(O);
+            m_batch.push_back(m);
+            l_batch.push_back(l);
+        }
+
+        auto O_cat = at::stack(O_batch);
+        auto m_cat = at::stack(m_batch);
+        auto l_cat = at::stack(l_batch);
+
+        O_splits.push_back(O_cat);
+        m_splits.push_back(m_cat);
+        l_splits.push_back(l_cat);
+    }
+
+    auto O_cat = at::stack(O_splits);
+    auto m_cat = at::transpose(at::stack(m_splits), 0, -1);
+    auto l_cat = at::transpose(at::stack(l_splits), 0, -1);
+
+    return std::make_tuple(O_cat, m_cat, l_cat);
+}
+
+static at::Tensor
+split_reduce_torch(const at::Tensor& O_splits, const at::Tensor& m_splits, const at::Tensor& l_splits, int32_t split_k)
+{   
+    auto O = at::zeros_like(at::slice(O_splits, 0, 0, 1));
+    auto global_max = at::empty_like(at::slice(m_splits, -1, 0, 1)).fill_(-65535.);
+    auto global_sumexp = at::zeros_like(global_max);
+
+    for (int32_t split_idx = 0; split_idx < split_k; ++split_idx) {
+        auto local_O = at::slice(O_splits, 0, split_idx, split_idx + 1);
+        auto local_max = at::slice(m_splits, -1, split_idx, split_idx + 1);
+        auto local_sumexp = at::slice(l_splits, -1, split_idx, split_idx + 1);
+
+        auto new_max = at::max(local_max, global_max);
+
+        auto pick_new = at::less(local_max, global_max);
+
+        auto log_alpha = at::neg(at::abs(at::sub(local_max, global_max)));
+        auto alpha = at::exp(log_alpha);
+        alpha.nan_to_num_(1.);
+        auto pick_current_coef = at::where(pick_new, 1., alpha);
+        auto pick_new_coef = at::where(pick_new, alpha, 1.);
+        O = at::add(at::mul(pick_current_coef, O), at::mul(pick_new_coef, local_O));
+        global_sumexp = at::add(at::mul(pick_current_coef, global_sumexp), at::mul(pick_new_coef, local_sumexp));
+        global_max = new_max;
+    }
+    
+    return at::div(O, global_sumexp);
+}
+
+static at::Tensor 
+efficient_attention_forward_decoder_splitk_torch(
+    const at::Tensor& XQ,                 // [B, 1, G, H, D]
+    const at::Tensor& cache_K,            // [B, KV_M_MAX, G, H or 1, D]
+    const at::Tensor& cache_V,            // [B, KV_M_MAX, G, H or 1, D]
+    at::optional<at::Tensor> seq_kv_lens, // [B]
+    double qk_scale,
+    int32_t split_k)
+{
+    auto [O_split, m, l] = split_attention_torch(XQ, cache_K, cache_V, *seq_kv_lens, split_k);
+    auto O               = split_reduce_torch(O_split, m, l, split_k);
+    return O.reshape_as(XQ);
+}
+
 namespace ck {
 namespace tensor_operation {
 namespace device {
@@ -954,7 +956,7 @@ static void test_splitk_decoder_e2e_correctness(int32_t padding, int32_t batch_s
 
     auto result = efficient_attention_forward_decoder_splitk_ck_impl</* threads_per_wavefront */ 64, /* wavefronts_per_block */ 1>(
         XQ, K, V, seqlen, qk_scale, split_k);
-    auto gold_result = efficient_attention_forward_decoder_split1_torch(XQ, K, V, seqlen, qk_scale);
+    auto gold_result = efficient_attention_forward_decoder_splitk_torch(XQ, K, V, seqlen, qk_scale, /* split_k */ 1);
     auto e2e_mismatch = percent_mismatch(result, gold_result);
     printf("[Test e2e split-k decoder] Padding=%d BS=%d Hq=%d Hkv=%d split_k=%d Mismatched elements percentage: %.2f\n", padding, batch_size, Hq, Hkv, split_k, e2e_mismatch);
 }

From d7132b9425b4c81b2cadbd7cafaa1d0cda6e2fa0 Mon Sep 17 00:00:00 2001
From: Max Podkorytov <4273004+tenpercent@users.noreply.github.com>
Date: Thu, 18 Jan 2024 19:23:13 +0000
Subject: [PATCH 371/641] address code review: remove unused _rand_seqlens

---
 tests/test_mem_eff_attention.py    | 39 ------------------------------
 tests/test_mem_eff_attention_ck.py | 38 -----------------------------
 2 files changed, 77 deletions(-)

diff --git a/tests/test_mem_eff_attention.py b/tests/test_mem_eff_attention.py
index 773d8a5c8..2f4857535 100644
--- a/tests/test_mem_eff_attention.py
+++ b/tests/test_mem_eff_attention.py
@@ -379,45 +379,6 @@ def compute_attention_split(q, k_slice, v_slice, attn_bias_slice):
     return out
 
 
-def _rand_seqlens(
-    r: random.Random,
-    bs: int,
-    q_len: int,
-    kv_len: int,
-    more_keys_than_queries_per_block: bool,
-) -> Tuple[Sequence[int], Sequence[int]]:
-    """
-    Generates lists of lengths of query blocks and corresponding key blocks.
-    The total number of queries will be bs * q_len and the
-    total number of keys will be bs * kv_len.
-    """
-    if more_keys_than_queries_per_block:
-        assert kv_len >= q_len
-    q_len *= bs
-    kv_len *= bs
-    seqlens_q: List[int] = []
-    seqlens_k: List[int] = []
-
-    step_q = [max(1, q_len // 10), max(2, q_len // 2)]
-    step_k = [max(1, kv_len // 10), max(2, kv_len // 2)]
-    while sum(seqlens_q) < q_len and sum(seqlens_k) < kv_len:
-        num_queries = r.randrange(*step_q)
-        seqlens_q.append(num_queries)
-
-        if more_keys_than_queries_per_block:
-            # Must select at least `num_queries` keys
-            # But also leave enough keys for later
-            keys_left = kv_len - sum(seqlens_k, 0)
-            queries_left = q_len - sum(seqlens_q[:-1], 0)
-            assert keys_left >= queries_left
-            seqlens_k.append(num_queries + r.randrange(0, keys_left - queries_left))
-        else:
-            seqlens_k.append(r.randrange(*step_k))
-    seqlens_q[-1] = q_len - sum(seqlens_q[:-1])
-    seqlens_k[-1] = kv_len - sum(seqlens_k[:-1])
-    return seqlens_q, seqlens_k
-
-
 def _rand_partition(r: random.Random, total: int, n: int) -> List[int]:
     # returns list of n nonnegative integers summing to total
     idx = {0, total}
diff --git a/tests/test_mem_eff_attention_ck.py b/tests/test_mem_eff_attention_ck.py
index 5ee0ab2df..56311a395 100644
--- a/tests/test_mem_eff_attention_ck.py
+++ b/tests/test_mem_eff_attention_ck.py
@@ -426,44 +426,6 @@ def compute_attention_split(q_whole, k_slice, v_slice, attn_bias_slice):
     assert not out.isnan().any(), "final out is nan"
     return out
 
-def _rand_seqlens(
-    r: random.Random,
-    bs: int,
-    q_len: int,
-    kv_len: int,
-    more_keys_than_queries_per_block: bool,
-) -> Tuple[Sequence[int], Sequence[int]]:
-    """
-    Generates lists of lengths of query blocks and corresponding key blocks.
-    The total number of queries will be bs * q_len and the
-    total number of keys will be bs * kv_len.
-    """
-    if more_keys_than_queries_per_block:
-        assert kv_len >= q_len
-    q_len *= bs
-    kv_len *= bs
-    seqlens_q: List[int] = []
-    seqlens_k: List[int] = []
-
-    step_q = [max(1, q_len // 10), max(2, q_len // 2)]
-    step_k = [max(1, kv_len // 10), max(2, kv_len // 2)]
-    while sum(seqlens_q) < q_len and sum(seqlens_k) < kv_len:
-        num_queries = r.randrange(*step_q)
-        seqlens_q.append(num_queries)
-
-        if more_keys_than_queries_per_block:
-            # Must select at least `num_queries` keys
-            # But also leave enough keys for later
-            keys_left = kv_len - sum(seqlens_k, 0)
-            queries_left = q_len - sum(seqlens_q[:-1], 0)
-            assert keys_left >= queries_left
-            seqlens_k.append(num_queries + r.randrange(0, keys_left - queries_left))
-        else:
-            seqlens_k.append(r.randrange(*step_k))
-    seqlens_q[-1] = q_len - sum(seqlens_q[:-1])
-    seqlens_k[-1] = kv_len - sum(seqlens_k[:-1])
-    return seqlens_q, seqlens_k
-
 
 def _rand_partition(r: random.Random, total: int, n: int) -> List[int]:
     # returns list of n nonnegative integers summing to total

From f4d5263af126dfb6cece1f105a6ac63937c16bb0 Mon Sep 17 00:00:00 2001
From: Max Podkorytov <4273004+tenpercent@users.noreply.github.com>
Date: Thu, 18 Jan 2024 19:43:54 +0000
Subject: [PATCH 372/641] address code review: cleanup python tests

---
 tests/test_mem_eff_attention.py    | 69 ------------------------------
 tests/test_mem_eff_attention_ck.py | 49 ++++++++-------------
 2 files changed, 18 insertions(+), 100 deletions(-)

diff --git a/tests/test_mem_eff_attention.py b/tests/test_mem_eff_attention.py
index 2f4857535..a1ca3b089 100644
--- a/tests/test_mem_eff_attention.py
+++ b/tests/test_mem_eff_attention.py
@@ -310,75 +310,6 @@ def T(t):
     return out.permute((0, 2, 1, 3))
 
 
-def ref_attention_splitk(q, k, v, attn_bias, scale=None, split_k=2) -> torch.Tensor:
-    assert q.ndim == 3
-
-    q = q.float()
-    k = k.float()
-    v = v.float()
-
-    if scale is None:
-        scale = torch.rsqrt(q.shape[-1])
-    q = q * scale
-
-    if attn_bias is not None:
-        if isinstance(attn_bias, xformers.ops.AttentionBias):
-            # Always create in B,H,Mq,Mk format
-            attn_bias_tensor = attn_bias.materialize(
-                (q.shape[0], 1, q.shape[1], k.shape[1]),
-                device=q.device,
-                dtype=torch.float32,
-            )
-        else:
-            attn_bias_tensor = attn_bias
-        if attn_bias_tensor.ndim == 4:
-            assert q.shape[0] == attn_bias_tensor.shape[0] * attn_bias_tensor.shape[1]
-            attn_bias_tensor = attn_bias_tensor.reshape(
-                [-1, *attn_bias_tensor.shape[2:]]
-            )
-
-    split_config = { "dim": -1, "split_size_or_sections": k.size(-1) // split_k}
-    k_split = torch.split(k, **split_config)
-    v_split = torch.split(v, **split_config)
-    attn_bias_split = torch.split(attn_bias_tensor, **split_config)
-
-    def compute_attention_split(q, k_slice, v_slice, attn_bias_slice):
-        p_slice = q @ k_slice.transpose(-2, -1)
-        p_slice += attn_bias_slice
-        m = p_slice.max(dim = -1)
-        s = torch.exp(p_slice - m[:, :, None])
-        l = torch.sum(s, dim = -1)
-        attn_slice = s @ v_slice
-        return {
-            "attn_slice": attn_slice,
-            "row_max": m,
-            "row_lse": l, 
-        }
-    
-    slices = map(lambda k, v, b: compute_attention_split(q, k, v, b), 
-        zip(k_split, v_split, attn_bias_split))
-    slices = list(slices)
-    out = torch.zero_like(q)
-
-    m_current_max = slices[0]["row_max"]
-    l_current_sum = torch.zero_like(slices[0]["row_lse"])
-
-    for s in slices:
-        (attn_slice, m, l) = s.values()
-        m_new = torch.max(m, m_current_max)
-        pick_new = m < m_current_max
-        pick_our = torch.logical_not(pick_new)
-
-        alpha = torch.exp(-torch.abs(m - m_current_max))
-
-        out = (pick_our * out + pick_new * attn_slice) * alpha
-        l_current_sum = (pick_our * l_current_sum + pick_new * l) * alpha
-        m_current_max = m_new
-    
-    out /= l_current_sum
-    return out
-
-
 def _rand_partition(r: random.Random, total: int, n: int) -> List[int]:
     # returns list of n nonnegative integers summing to total
     idx = {0, total}
diff --git a/tests/test_mem_eff_attention_ck.py b/tests/test_mem_eff_attention_ck.py
index 56311a395..e43221dd2 100644
--- a/tests/test_mem_eff_attention_ck.py
+++ b/tests/test_mem_eff_attention_ck.py
@@ -368,25 +368,14 @@ def attn_bias_group(group: int):
     attn_bias_split = torch.split(attn_bias_tensor, dim=-1, split_size_or_sections=split_size)
     
     def compute_attention_split(q_whole, k_slice, v_slice, attn_bias_slice):
-        assert not q_whole.isnan().any(), "q_whole is nan"
-        assert not k_slice.isnan().any(), "k_slice is nan"
         p_slice = q_whole @ k_slice.transpose(-2, -1)
-        assert not p_slice.isnan().any(), "p_slice is nan"
-        assert not p_slice.isinf().any(), "p_slice is inf"
         p_slice += attn_bias_slice
-        assert not p_slice.isnan().any(), "p_slice is nan after bias add"
         m = torch.max(p_slice, dim = -1, keepdim=True).values
-        assert not m.isnan().any(), "m is nan"
         p_slice_scaled = p_slice - m
         p_slice_scaled[p_slice_scaled.isnan()] = float("-inf")
-        assert not p_slice_scaled.isnan().any(), f"p_slice_scaled is nan: {p_slice_scaled.isnan().sum()} of {p_slice_scaled.numel()} values"
         s = torch.exp(p_slice_scaled)
-        assert s.shape == p_slice.shape
-        assert not s.isnan().any(), f"s is nan: {s.isnan().sum()} of {s.numel()} values"
         l = torch.sum(s, dim=-1, keepdim=True)
-        assert not l.isnan().any(), "l is nan"
         attn_slice = s @ v_slice
-        assert not attn_slice.isnan().any(), "attn_slice is nan"
         return {
             "attn_slice": attn_slice,
             "row_max": m,
@@ -401,29 +390,27 @@ def compute_attention_split(q_whole, k_slice, v_slice, attn_bias_slice):
 
     # reduce out over split-k slices
 
-    m_current_max = torch.zeros_like(slices[0]["row_max"]).fill_(float("-inf"))
-    l_current_sum = torch.zeros_like(slices[0]["row_lse"])
+    global_max = torch.zeros_like(slices[0]["row_max"]).fill_(float("-inf"))
+    global_sumexp = torch.zeros_like(slices[0]["row_lse"])
 
     for s in slices:
-        attn_slice = s["attn_slice"]
-        m = s["row_max"]
-        l = s["row_lse"]
-        m_new = torch.max(m, m_current_max)
-        assert not m_new.isnan().any(), "m_new is nan"
-        pick_new = m < m_current_max
-        pick_our = torch.logical_not(pick_new)
-
-        log_alpha = -torch.abs(m - m_current_max)
-        log_alpha[log_alpha.isnan()] = 0
+        local_out = s["attn_slice"]
+        local_max = s["row_max"]
+        local_sumexp = s["row_lse"]
+        new_max = torch.max(local_max, global_max)
+
+        log_alpha = -torch.abs(local_max - global_max)
         alpha = torch.exp(log_alpha)
-        assert not alpha.isnan().any(), "alpha is nan"
-        out = out + attn_slice + (pick_our * out + pick_new * attn_slice) * (torch.sub(alpha, 1))
-        assert not out.isnan().any(), "out acc is nan"
-        l_current_sum = l_current_sum + l + (pick_our * l_current_sum + pick_new * l) * (torch.sub(alpha, 1))
-        assert not l_current_sum.isnan().any(), "l acc is nan"
-        m_current_max = m_new
-    out /= l_current_sum
-    assert not out.isnan().any(), "final out is nan"
+        alpha.nan_to_num_(1.)
+
+        pick_new = local_max < global_max
+        new_coef = torch.where(pick_new, alpha, 1.)
+        curr_coef = torch.where(pick_new, 1., alpha)
+
+        out = out * curr_coef + local_out * new_coef
+        global_sumexp = global_sumexp * curr_coef + local_sumexp * new_coef
+        global_max = new_max
+    out /= global_sumexp
     return out
 
 
From d81285a78e94be28e52eb6e8695372db6b23642d Mon Sep 17 00:00:00 2001
From: Max Podkorytov <4273004+tenpercent@users.noreply.github.com>
Date: Thu, 18 Jan 2024 19:51:21 +0000
Subject: [PATCH 373/641] remove redundant new_max local var

---
 tests/test_mem_eff_attention_ck.py                       | 3 +--
 .../csrc/attention/hip_fmha/attention_forward_splitk.cpp | 9 ++++-----
 .../hip_fmha/ck_attention_forward_decoder_splitk.h       | 8 +++++---
 3 files changed, 10 insertions(+), 10 deletions(-)

diff --git a/tests/test_mem_eff_attention_ck.py b/tests/test_mem_eff_attention_ck.py
index e43221dd2..8c0d07f41 100644
--- a/tests/test_mem_eff_attention_ck.py
+++ b/tests/test_mem_eff_attention_ck.py
@@ -397,7 +397,6 @@ def compute_attention_split(q_whole, k_slice, v_slice, attn_bias_slice):
         local_out = s["attn_slice"]
         local_max = s["row_max"]
         local_sumexp = s["row_lse"]
-        new_max = torch.max(local_max, global_max)
 
         log_alpha = -torch.abs(local_max - global_max)
         alpha = torch.exp(log_alpha)
@@ -409,7 +408,7 @@ def compute_attention_split(q_whole, k_slice, v_slice, attn_bias_slice):
 
         out = out * curr_coef + local_out * new_coef
         global_sumexp = global_sumexp * curr_coef + local_sumexp * new_coef
-        global_max = new_max
+        global_max = torch.max(local_max, global_max)
     out /= global_sumexp
     return out
 
diff --git a/xformers/csrc/attention/hip_fmha/attention_forward_splitk.cpp b/xformers/csrc/attention/hip_fmha/attention_forward_splitk.cpp
index b57110bfc..5f1d5cde2 100644
--- a/xformers/csrc/attention/hip_fmha/attention_forward_splitk.cpp
+++ b/xformers/csrc/attention/hip_fmha/attention_forward_splitk.cpp
@@ -304,18 +304,17 @@ split_reduce_torch(const at::Tensor& O_splits, const at::Tensor& m_splits, const
         auto local_max = at::slice(m_splits, -1, split_idx, split_idx + 1);
         auto local_sumexp = at::slice(l_splits, -1, split_idx, split_idx + 1);
 
-        auto new_max = at::max(local_max, global_max);
-
-        auto pick_new = at::less(local_max, global_max);
-
         auto log_alpha = at::neg(at::abs(at::sub(local_max, global_max)));
         auto alpha = at::exp(log_alpha);
         alpha.nan_to_num_(1.);
+
+        auto pick_new = at::less(local_max, global_max);
         auto pick_current_coef = at::where(pick_new, 1., alpha);
         auto pick_new_coef = at::where(pick_new, alpha, 1.);
+        
         O = at::add(at::mul(pick_current_coef, O), at::mul(pick_new_coef, local_O));
         global_sumexp = at::add(at::mul(pick_current_coef, global_sumexp), at::mul(pick_new_coef, local_sumexp));
-        global_max = new_max;
+        global_max = at::max(local_max, global_max);
     }
     
     return at::div(O, global_sumexp);
diff --git a/xformers/csrc/attention/hip_fmha/ck_attention_forward_decoder_splitk.h b/xformers/csrc/attention/hip_fmha/ck_attention_forward_decoder_splitk.h
index 87865db98..20d9ede81 100644
--- a/xformers/csrc/attention/hip_fmha/ck_attention_forward_decoder_splitk.h
+++ b/xformers/csrc/attention/hip_fmha/ck_attention_forward_decoder_splitk.h
@@ -135,16 +135,18 @@ __global__ void efficient_attention_forward_decoder_splitk_reduce_ck_kernel(
         }
         compute_t local_max         = *(split_max + blockIdx.x * split_k + split_idx);
         compute_t local_sumexp      = *(split_sumexp + blockIdx.x * split_k + split_idx);
-        compute_t new_max           = ck::math::max(local_max, global_max);
-        bool pick_new               = local_max < global_max;
+        
         compute_t log_alpha         = -std::abs(local_max - global_max);
         compute_t alpha             = isnan(log_alpha) ? compute_t{1.} : ck::math::exp(log_alpha);
+        
+        bool pick_new               = local_max < global_max;
         compute_t pick_current_coef = pick_new ? 1. : alpha;
         compute_t pick_new_coef     = pick_new ? alpha : 1.;
+        
         global_sumexp = pick_current_coef * global_sumexp + pick_new_coef * local_sumexp;
         global_O_compute.vec =
             pick_current_coef * global_O_compute.vec + pick_new_coef * O_split_compute.vec;
-        global_max = new_max;
+        global_max = ck::math::max(local_max, global_max);
     }
     global_O_compute.vec /= global_sumexp;
 #pragma unroll

From eba46f112083d3679ce40a00fa542be0bdf58f47 Mon Sep 17 00:00:00 2001
From: Max Podkorytov <4273004+tenpercent@users.noreply.github.com>
Date: Thu, 18 Jan 2024 19:59:01 +0000
Subject: [PATCH 374/641] address code review: rename seq_acc

---
 .../attention/hip_fmha/attention_forward_splitk.cpp    | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/xformers/csrc/attention/hip_fmha/attention_forward_splitk.cpp b/xformers/csrc/attention/hip_fmha/attention_forward_splitk.cpp
index 5f1d5cde2..6d557fb17 100644
--- a/xformers/csrc/attention/hip_fmha/attention_forward_splitk.cpp
+++ b/xformers/csrc/attention/hip_fmha/attention_forward_splitk.cpp
@@ -112,7 +112,7 @@ at::Tensor& efficient_attention_forward_decoder_splitk_ck_out_impl(
             auto split_O_acc =
                 split_O.packed_accessor32<scalar_t, 1 + rank, at::RestrictPtrTraits>();
             auto O_acc = O.packed_accessor32<scalar_t, rank, at::RestrictPtrTraits>();
-            auto seq_acc =
+            auto seq_acc_ptr =
                 seq_kv_lens
                     ? seq_kv_lens->packed_accessor32<int32_t, 1, at::RestrictPtrTraits>().data()
                     : nullptr;
@@ -127,7 +127,7 @@ at::Tensor& efficient_attention_forward_decoder_splitk_ck_out_impl(
                 reinterpret_cast<ck_data_t* __restrict__>(split_O_acc.data()),
                 split_max_acc.data(),
                 split_sumexp_acc.data(),
-                seq_acc,
+                seq_acc_ptr,
                 XQ_acc.stride(0),
                 XQ_acc.stride(1),
                 XQ_acc.stride(2),
@@ -311,7 +311,7 @@ split_reduce_torch(const at::Tensor& O_splits, const at::Tensor& m_splits, const
         auto pick_new = at::less(local_max, global_max);
         auto pick_current_coef = at::where(pick_new, 1., alpha);
         auto pick_new_coef = at::where(pick_new, alpha, 1.);
-        
+
         O = at::add(at::mul(pick_current_coef, O), at::mul(pick_new_coef, local_O));
         global_sumexp = at::add(at::mul(pick_current_coef, global_sumexp), at::mul(pick_new_coef, local_sumexp));
         global_max = at::max(local_max, global_max);
@@ -767,7 +767,7 @@ static std::tuple<at::Tensor, at::Tensor, at::Tensor> split_attention_hip(const
             auto split_O_acc =
                 split_O.packed_accessor32<scalar_t, 1 + rank, at::RestrictPtrTraits>();
             auto O_acc   = O.packed_accessor32<scalar_t, rank, at::RestrictPtrTraits>();
-            auto seq_acc = seqlen.packed_accessor32<int32_t, 1, at::RestrictPtrTraits>().data();
+            auto seq_acc = seqlen.packed_accessor32<int32_t, 1, at::RestrictPtrTraits>();
             auto split_max_acc = split_max.packed_accessor32<float, rank, at::RestrictPtrTraits>();
             auto split_sumexp_acc =
                 split_sumexp.packed_accessor32<float, rank, at::RestrictPtrTraits>();
@@ -779,7 +779,7 @@ static std::tuple<at::Tensor, at::Tensor, at::Tensor> split_attention_hip(const
                 reinterpret_cast<ck_data_t* __restrict__>(split_O_acc.data()),
                 split_max_acc.data(),
                 split_sumexp_acc.data(),
-                seq_acc,
+                seq_acc.data(),
                 XQ_acc.stride(0),
                 XQ_acc.stride(1),
                 XQ_acc.stride(2),

From 7f9ce55c3590ec1463d64598fe45ee2793417556 Mon Sep 17 00:00:00 2001
From: Max Podkorytov <4273004+tenpercent@users.noreply.github.com>
Date: Thu, 18 Jan 2024 21:24:18 +0000
Subject: [PATCH 375/641] re-enable loop unroll; adjust tests to handle splits
 with size divisible by block size; handle empty splits correctly

---
 .../hip_fmha/attention_forward_splitk.cpp     | 23 ++++++++++---------
 .../ck_attention_forward_decoder_splitk.h     |  4 ++--
 2 files changed, 14 insertions(+), 13 deletions(-)

diff --git a/xformers/csrc/attention/hip_fmha/attention_forward_splitk.cpp b/xformers/csrc/attention/hip_fmha/attention_forward_splitk.cpp
index 6d557fb17..d095a51ba 100644
--- a/xformers/csrc/attention/hip_fmha/attention_forward_splitk.cpp
+++ b/xformers/csrc/attention/hip_fmha/attention_forward_splitk.cpp
@@ -242,7 +242,7 @@ TORCH_LIBRARY_IMPL(xformers, CUDA, m)
 // clang-format on
 
 static std::tuple<at::Tensor, at::Tensor, at::Tensor> split_attention_torch(
-    const at::Tensor& Q, const at::Tensor& K, const at::Tensor& V, const at::Tensor& k_seqlens, const int32_t split_k)
+    const at::Tensor& Q, const at::Tensor& K, const at::Tensor& V, const at::Tensor& k_seqlens, const int32_t split_k, const int32_t block_size)
 {
     auto Q_scaled = at::div(Q, sqrt(Q.size(-1)));
 
@@ -257,17 +257,17 @@ static std::tuple<at::Tensor, at::Tensor, at::Tensor> split_attention_torch(
 
         for(size_t b = 0; b < k_seqlens.numel(); ++b) {
             auto seqlen = k_seqlens[b].item<int64_t>();
-            const int64_t t_low = split_idx * (seqlen / split_k);
+            const int64_t t_low = split_idx * (seqlen / split_k / block_size) * block_size;
             const int64_t t_high = (split_idx + 1 < split_k) 
-                                 ? (1 + split_idx) * (seqlen / split_k)
+                                 ? (1 + split_idx) * (seqlen / split_k / block_size) * block_size
                                  : seqlen;
 
             auto S        = at::einsum("mghk, nghk -> mghn",
                             {Q_scaled[b], at::slice(K[b], /*dim*/ 0, /*start*/ t_low, /*end*/ t_high)},
                             /* einsum eval path */ at::nullopt);
-            auto m = std::get<0>(at::max(S, /* dim */ -1, /* keepdim */ true));
+            auto m = S.numel() > 0 ? std::get<0>(at::max(S, /* dim */ -1, /* keepdim */ true)) : at::empty_like(at::slice(S, -1, 0, 1)).fill_(ck::NumericLimits<float>::Lowest());
             auto s = at::exp(at::sub(S, m));
-            auto l = at::sum(s, /* dim */ -1, /* keepdim */ true);
+            auto l = s.numel() > 0 ? at::sum(s, /* dim */ -1, /* keepdim */ true) : at::zeros_like(m);
             auto O        = at::einsum("mghn, nghk -> mghk", 
                             {s, at::slice(V[b], /*dim*/ 0, /*start*/ t_low, /*end*/ t_high)}, 
                             /* einsum eval path */ at::nullopt);
@@ -281,8 +281,8 @@ static std::tuple<at::Tensor, at::Tensor, at::Tensor> split_attention_torch(
         auto l_cat = at::stack(l_batch);
 
         O_splits.push_back(O_cat);
-        m_splits.push_back(m_cat);
-        l_splits.push_back(l_cat);
+        m_splits.push_back(m_cat.numel() > 0 ? m_cat : at::empty_like(at::slice(O_cat, -1, 0, 1)).fill_(ck::NumericLimits<float>::Lowest()));
+        l_splits.push_back(l_cat.numel() > 0 ? l_cat : at::zeros_like(at::slice(O_cat, -1, 0, 1)));
     }
 
     auto O_cat = at::stack(O_splits);
@@ -327,9 +327,10 @@ efficient_attention_forward_decoder_splitk_torch(
     const at::Tensor& cache_V,            // [B, KV_M_MAX, G, H or 1, D]
     at::optional<at::Tensor> seq_kv_lens, // [B]
     double qk_scale,
-    int32_t split_k)
+    int32_t split_k,
+    int32_t block_size)
 {
-    auto [O_split, m, l] = split_attention_torch(XQ, cache_K, cache_V, *seq_kv_lens, split_k);
+    auto [O_split, m, l] = split_attention_torch(XQ, cache_K, cache_V, *seq_kv_lens, split_k, block_size);
     auto O               = split_reduce_torch(O_split, m, l, split_k);
     return O.reshape_as(XQ);
 }
@@ -915,7 +916,7 @@ static void test_split_attention(int32_t padding, int32_t batch_size, int32_t Hq
 {
     auto [XQ, K, V, seqlen] = generate_inputs(padding, batch_size, Hq, Hkv);
 
-    auto [O_ref, m_ref, l_ref] = split_attention_torch(XQ, K, V, seqlen, split_k);
+    auto [O_ref, m_ref, l_ref] = split_attention_torch(XQ, K, V, seqlen, split_k, /* block_size */ 16);
 
     auto [O_hip, m_hip, l_hip] = split_attention_hip(XQ, K, V, seqlen, split_k, /* wavefronts_per_block */ 1);
 
@@ -955,7 +956,7 @@ static void test_splitk_decoder_e2e_correctness(int32_t padding, int32_t batch_s
 
     auto result = efficient_attention_forward_decoder_splitk_ck_impl</* threads_per_wavefront */ 64, /* wavefronts_per_block */ 1>(
         XQ, K, V, seqlen, qk_scale, split_k);
-    auto gold_result = efficient_attention_forward_decoder_splitk_torch(XQ, K, V, seqlen, qk_scale, /* split_k */ 1);
+    auto gold_result = efficient_attention_forward_decoder_splitk_torch(XQ, K, V, seqlen, qk_scale, /* split_k */ 1, /* block_size */ 1);
     auto e2e_mismatch = percent_mismatch(result, gold_result);
     printf("[Test e2e split-k decoder] Padding=%d BS=%d Hq=%d Hkv=%d split_k=%d Mismatched elements percentage: %.2f\n", padding, batch_size, Hq, Hkv, split_k, e2e_mismatch);
 }
diff --git a/xformers/csrc/attention/hip_fmha/ck_attention_forward_decoder_splitk.h b/xformers/csrc/attention/hip_fmha/ck_attention_forward_decoder_splitk.h
index 20d9ede81..a4c61f127 100644
--- a/xformers/csrc/attention/hip_fmha/ck_attention_forward_decoder_splitk.h
+++ b/xformers/csrc/attention/hip_fmha/ck_attention_forward_decoder_splitk.h
@@ -162,8 +162,8 @@ __global__ void efficient_attention_forward_decoder_splitk_reduce_ck_kernel(
 
 template <typename scalar_t,
           int32_t vec_size           = 4,
-          int32_t n_loop_unroll      = 1,
-          int32_t n_loop_unroll_tail = 1,
+          int32_t n_loop_unroll      = 16,
+          int32_t n_loop_unroll_tail = 2,
           int32_t KV_M_MAX           = 8192,
           typename compute_t         = float>
 __global__ void

From f888b88f8fe80e7fcd32c7729b543ea5a98a7205 Mon Sep 17 00:00:00 2001
From: Max Podkorytov <4273004+tenpercent@users.noreply.github.com>
Date: Thu, 18 Jan 2024 23:38:39 +0000
Subject: [PATCH 376/641] test a wider range of split-k in cpp tests; fix torch
 implementation one more time to handle empty splits

---
 .../hip_fmha/attention_forward_splitk.cpp     | 23 ++++++++++++++-----
 1 file changed, 17 insertions(+), 6 deletions(-)

diff --git a/xformers/csrc/attention/hip_fmha/attention_forward_splitk.cpp b/xformers/csrc/attention/hip_fmha/attention_forward_splitk.cpp
index d095a51ba..8ac38a440 100644
--- a/xformers/csrc/attention/hip_fmha/attention_forward_splitk.cpp
+++ b/xformers/csrc/attention/hip_fmha/attention_forward_splitk.cpp
@@ -262,15 +262,22 @@ static std::tuple<at::Tensor, at::Tensor, at::Tensor> split_attention_torch(
                                  ? (1 + split_idx) * (seqlen / split_k / block_size) * block_size
                                  : seqlen;
 
+            const bool empty = t_low == t_high;
+
             auto S        = at::einsum("mghk, nghk -> mghn",
                             {Q_scaled[b], at::slice(K[b], /*dim*/ 0, /*start*/ t_low, /*end*/ t_high)},
                             /* einsum eval path */ at::nullopt);
-            auto m = S.numel() > 0 ? std::get<0>(at::max(S, /* dim */ -1, /* keepdim */ true)) : at::empty_like(at::slice(S, -1, 0, 1)).fill_(ck::NumericLimits<float>::Lowest());
+            auto m = empty ? at::empty_like(S) : std::get<0>(at::max(S, /* dim */ -1, /* keepdim */ true));
             auto s = at::exp(at::sub(S, m));
-            auto l = s.numel() > 0 ? at::sum(s, /* dim */ -1, /* keepdim */ true) : at::zeros_like(m);
+            auto l = at::sum(s, /* dim */ -1, /* keepdim */ true);
             auto O        = at::einsum("mghn, nghk -> mghk", 
                             {s, at::slice(V[b], /*dim*/ 0, /*start*/ t_low, /*end*/ t_high)}, 
                             /* einsum eval path */ at::nullopt);
+            if (empty) {
+                m = at::empty_like(at::slice(O, -1, 0, 1));
+                l = at::zeros_like(m);
+                m.fill_(ck::NumericLimits<float>::Lowest());
+            }
             O_batch.push_back(O);
             m_batch.push_back(m);
             l_batch.push_back(l);
@@ -281,8 +288,8 @@ static std::tuple<at::Tensor, at::Tensor, at::Tensor> split_attention_torch(
         auto l_cat = at::stack(l_batch);
 
         O_splits.push_back(O_cat);
-        m_splits.push_back(m_cat.numel() > 0 ? m_cat : at::empty_like(at::slice(O_cat, -1, 0, 1)).fill_(ck::NumericLimits<float>::Lowest()));
-        l_splits.push_back(l_cat.numel() > 0 ? l_cat : at::zeros_like(at::slice(O_cat, -1, 0, 1)));
+        m_splits.push_back(m_cat);
+        l_splits.push_back(l_cat);
     }
 
     auto O_cat = at::stack(O_splits);
@@ -924,6 +931,10 @@ static void test_split_attention(int32_t padding, int32_t batch_size, int32_t Hq
     auto m_percent_mismatch = percent_mismatch(m_ref, m_hip);
     auto l_percent_mismatch = percent_mismatch(l_ref, l_hip);
 
+    // if (m_percent_mismatch > 0) {
+    //     std::cout << "ref: " << m_ref << std::endl << "hip: " << m_hip << std::endl;
+    // }
+
     printf("[Test split attention] Padding=%d BS=%d Hq=%d Hkv=%d split_k=%d Mismatched split_O elements percentage: %.2f Mismatched split_max elements percentage: %.2f Mismatched split_sumexp elements percentage: %.2f\n", 
             padding,
             batch_size,
@@ -969,7 +980,7 @@ int main(int argc, char** argv)
             for (auto batch_size : {1, 8}) {
                 for (auto Hq : { 16 }) {
                     for (auto Hkv : { 16 }) {
-                        for (auto split_k : {1, 2, 4}) {
+                        for (auto split_k : {1, 2, 4, 8, 16}) {
                             test_splitk_decoder_e2e_correctness(padding, batch_size, Hq, Hkv, split_k);
                         }
                     }
@@ -981,7 +992,7 @@ int main(int argc, char** argv)
             for (auto batch_size : {1, 8}) {
                 for (auto Hq : { 16 }) {
                     for (auto Hkv : { 16 }) {
-                        for (auto split_k : {1, 2}) {
+                        for (auto split_k : {1, 2, 4, 8, 16}) {
                             test_split_attention(padding, batch_size, Hq, Hkv, split_k);
                         }
                     }

From bad053fc1cb36204bd287bb21d6b371fc0e2e16d Mon Sep 17 00:00:00 2001
From: Qianfeng Zhang <Qianfeng.Zhang@amd.com>
Date: Fri, 19 Jan 2024 19:45:44 +0000
Subject: [PATCH 377/641] Synchronize with ck-tiled update to support
 head-dim-256 and LSE storing

---
 tests/test_forward_ck_tiled.py                |   4 +-
 tests/test_mem_eff_attention_ck.py            |   4 +-
 third_party/composable_kernel_tiled           |   2 +-
 .../hip_fmha/ck_tiled_fmha_batched_infer.h    |  57 ++++---
 .../hip_fmha/ck_tiled_fmha_definitions.h      |  22 +++
 .../hip_fmha/ck_tiled_fmha_forward_kernel.h   | 145 +++++++++++++-----
 .../hip_fmha/ck_tiled_fmha_grouped_infer.h    |  38 +++--
 7 files changed, 203 insertions(+), 69 deletions(-)

diff --git a/tests/test_forward_ck_tiled.py b/tests/test_forward_ck_tiled.py
index e76f52e09..1484deaae 100644
--- a/tests/test_forward_ck_tiled.py
+++ b/tests/test_forward_ck_tiled.py
@@ -437,8 +437,8 @@ def test_forward(opFW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv, packed, fmt, **kwargs)
         kv,
     ) = opFW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv
 
-    if k > 128 or kv > 128:
-        pytest.skip("k or kv bigger than 128 is not supported by CK-FlashAttention")
+    if k > 256 or kv > 256:
+        pytest.skip("head-dim size bigger than 256 is not supported by CK-FlashAttention")
 
     if packed and not (k == kv and q_len == kv_len):
         pytest.skip(
diff --git a/tests/test_mem_eff_attention_ck.py b/tests/test_mem_eff_attention_ck.py
index ee9c557ab..2caf187be 100644
--- a/tests/test_mem_eff_attention_ck.py
+++ b/tests/test_mem_eff_attention_ck.py
@@ -437,8 +437,8 @@ def test_forward(opFW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv, packed, fmt, **kwargs)
         kv,
     ) = opFW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv
 
-    if kv > 128:
-        pytest.skip("kv > 128 is not supported by CK-FlashAttention")
+    if k > 256 or kv > 256:
+        pytest.skip("head-dim size bigger than 256 is not supported by CK-FlashAttention")
 
     if packed and not (k == kv and q_len == kv_len):
         pytest.skip(
diff --git a/third_party/composable_kernel_tiled b/third_party/composable_kernel_tiled
index cd4c0600f..73166db69 160000
--- a/third_party/composable_kernel_tiled
+++ b/third_party/composable_kernel_tiled
@@ -1 +1 @@
-Subproject commit cd4c0600f37288f09736d910378efeb18a8c4142
+Subproject commit 73166db6920afac53189098acf4774f9fa929143
diff --git a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_infer.h b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_infer.h
index 8131ae37f..122e415ee 100644
--- a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_infer.h
+++ b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_infer.h
@@ -18,11 +18,9 @@
 #include <ck/tensor/tensor_view.hpp>
 
 #include <ck/tile_program/block_tile_pipeline/block_fmha_pipeline_problem.hpp>
-#include <ck/tile_program/block_tile_pipeline/block_fmha_pipeline_qkvs.hpp>
-#include <ck/tile_program/block_tile_pipeline/block_fmha_pipeline_qkvs_default_policy.hpp>
 #include <ck/tile_program/block_tile_pipeline/block_fmha_pipeline_qr_ks_vs.hpp>
 #include <ck/tile_program/block_tile_pipeline/block_fmha_pipeline_qr_ks_vs_async.hpp>
-#include <ck/tile_program/block_tile_pipeline/block_fmha_pipeline_qr_ks_vs_default_policy.hpp>
+#include <ck/tile_program/block_tile_pipeline/block_fmha_pipeline_qs_ks_vs.hpp>
 #include <ck/tile_program/tile/tile_fmha_shape.hpp>
 #include <ck/tile_program/tile/tile_fmha_traits.hpp>
 #include <ck/tile_program/block_tile/block_masking.hpp>
@@ -60,6 +58,11 @@ struct batched_infer_causalmask_attnbias_dispatched
             constexpr ck::index_t CONST_NAME = 128;                         \
             __VA_ARGS__();                                                  \
         }                                                                   \
+        else if(HEAD_DIM1 <= 256 && HEAD_DIM2 <= 256)                       \
+        {                                                                   \
+            constexpr ck::index_t CONST_NAME = 256;                         \
+            __VA_ARGS__();                                                  \
+        }                                                                   \
         else                                                                \
         {                                                                   \
             throw std::runtime_error("Head-dim sizes not supported!");      \
@@ -75,6 +78,7 @@ struct batched_infer_causalmask_attnbias_dispatched
         typename FmhaFwdTypeConfig<scalar_t>::SaccDataType,
         typename FmhaFwdTypeConfig<scalar_t>::SMPLComputeDataType,
         typename FmhaFwdTypeConfig<scalar_t>::BiasDataType,
+        typename FmhaFwdTypeConfig<scalar_t>::LSEDataType,
         typename FmhaFwdTypeConfig<scalar_t>::PDataType,
         typename FmhaFwdTypeConfig<scalar_t>::OaccDataType,
         typename FmhaFwdTypeConfig<scalar_t>::ODataType,
@@ -119,19 +123,16 @@ struct batched_infer_causalmask_attnbias_dispatched
                                                                             kN0K1NeedPadding,
                                                                             kK0N1NeedPadding,
                                                                             has_attn_bias,
+                                                                            false, // kStoreLSE
                                                                             occupancy>;
 
                         using FmhaPipelineProblem =
                             FmhaPipelineProblemTemp<FmhaTraits, HDim, FmhaMask>;
 
-                        constexpr bool no_any_padding =
-                            !(kM0NeedPadding || kN0K1NeedPadding || kK0N1NeedPadding);
-
-                        if constexpr(no_any_padding)
+                        if constexpr(HDim == 256)
                         {
-                            using FmhaPipeline =
-                                ck::tile_program::block::BlockFmhaPipelineQRKSVSAsync<
-                                    FmhaPipelineProblem>;
+                            using FmhaPipeline = ck::tile_program::block::BlockFmhaPipelineQSKSVS<
+                                FmhaPipelineProblem>;
                             using FmhaKernel =
                                 FmhaFwdKernel<FmhaTilePartitioner, FmhaPipeline, FmhaEpilogue>;
 
@@ -139,12 +140,29 @@ struct batched_infer_causalmask_attnbias_dispatched
                         }
                         else
                         {
-                            using FmhaPipeline = ck::tile_program::block::BlockFmhaPipelineQRKSVS<
-                                FmhaPipelineProblem>;
-                            using FmhaKernel =
-                                FmhaFwdKernel<FmhaTilePartitioner, FmhaPipeline, FmhaEpilogue>;
-
-                            RunWithKernel<FmhaKernel>(param, stream);
+                            constexpr bool no_any_padding =
+                                !(kM0NeedPadding || kN0K1NeedPadding || kK0N1NeedPadding);
+
+                            if constexpr(no_any_padding)
+                            {
+                                using FmhaPipeline =
+                                    ck::tile_program::block::BlockFmhaPipelineQRKSVSAsync<
+                                        FmhaPipelineProblem>;
+                                using FmhaKernel =
+                                    FmhaFwdKernel<FmhaTilePartitioner, FmhaPipeline, FmhaEpilogue>;
+
+                                RunWithKernel<FmhaKernel>(param, stream);
+                            }
+                            else
+                            {
+                                using FmhaPipeline =
+                                    ck::tile_program::block::BlockFmhaPipelineQRKSVS<
+                                        FmhaPipelineProblem>;
+                                using FmhaKernel =
+                                    FmhaFwdKernel<FmhaTilePartitioner, FmhaPipeline, FmhaEpilogue>;
+
+                                RunWithKernel<FmhaKernel>(param, stream);
+                            };
                         };
                     });
             });
@@ -160,6 +178,7 @@ struct batched_infer_causalmask_attnbias_dispatched
                 param.k_ptr,
                 param.v_ptr,
                 param.attn_bias_ptr,
+                nullptr, // lse_ptr
                 param.out_ptr,
                 param.M,              // seqlen_q
                 param.N,              // seqlen_k
@@ -172,15 +191,17 @@ struct batched_infer_causalmask_attnbias_dispatched
                 param.v_strides[1],
                 param.attn_bias_strides[2],
                 param.out_strides[1],
-                param.q_strides[2], // q, k, v, bias, out tensor head-dim stride
+                param.q_strides[2], // q, k, v, bias, lse, out tensor head-dim stride
                 param.k_strides[2],
                 param.v_strides[2],
                 param.attn_bias_strides[1],
+                0, // nhead_stride_lse
                 param.out_strides[2],
-                param.q_strides[0], // q, k, v, bias, out tensor batch-dim stride
+                param.q_strides[0], // q, k, v, bias, lse, out tensor batch-dim stride
                 param.k_strides[0],
                 param.v_strides[0],
                 param.attn_bias_strides[0],
+                0, // batch_stride_lse
                 param.out_strides[0],
                 static_cast<CausalMaskType>(param.custom_mask_type),
                 param.window_size);
diff --git a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_definitions.h b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_definitions.h
index 0129ac082..624efa70d 100644
--- a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_definitions.h
+++ b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_definitions.h
@@ -6,6 +6,8 @@
  */
 #pragma once
 
+#include <ck/tile_program/tile/tile_fmha_shape.hpp>
+
 enum struct CausalMaskType
 {
     MaskDisabled,
@@ -23,6 +25,7 @@ struct FmhaFwdTypeConfig<ck::half_t>
     using KDataType           = ck::half_t;
     using VDataType           = ck::half_t;
     using BiasDataType        = ck::half_t;
+    using LSEDataType         = float;      // data type for lse(logsumexp L_j = max_j + log(l_j))
     using SaccDataType        = float;      // data type for first gemm accumulation
     using SMPLComputeDataType = float;      // data type for reduction, softmax
     using PDataType           = ck::half_t; // data type for A matrix of second gemm
@@ -37,6 +40,7 @@ struct FmhaFwdTypeConfig<ck::bhalf_t>
     using KDataType           = ck::bhalf_t;
     using VDataType           = ck::bhalf_t;
     using BiasDataType        = ck::bhalf_t;
+    using LSEDataType         = float;       // data type for lse(logsumexp L_j = max_j + log(l_j))
     using SaccDataType        = float;       // data type for first gemm accumulation
     using SMPLComputeDataType = float;       // data type for reduction, softmax
     using PDataType           = ck::bhalf_t; // data type for A matrix of second gemm
@@ -54,17 +58,25 @@ struct FmhaFwdBlockTile<32>
 {
     using type = ck::Sequence<128, 64, 16, 32, 32, 32>;
 };
+
 template <>
 struct FmhaFwdBlockTile<64>
 {
     using type = ck::Sequence<128, 64, 32, 64, 32, 64>;
 };
+
 template <>
 struct FmhaFwdBlockTile<128>
 {
     using type = ck::Sequence<128, 128, 32, 128, 32, 128>;
 };
 
+template <>
+struct FmhaFwdBlockTile<256>
+{
+    using type = ck::Sequence<128, 128, 32, 256, 32, 256>;
+};
+
 using FmhaFwdBlockWarps = ck::Sequence<4, 1, 1>;
 using FmhaFwdWarpTile   = ck::Sequence<32, 32, 16>;
 
@@ -100,3 +112,13 @@ struct FmhaFwdShape<128> : ck::tile_program::TileFmhaShape<typename FmhaFwdBlock
                                                            FmhaFwdVLayout>
 {
 };
+
+template <>
+struct FmhaFwdShape<256> : ck::tile_program::TileFmhaShape<typename FmhaFwdBlockTile<256>::type,
+                                                           FmhaFwdBlockWarps,
+                                                           FmhaFwdWarpTile,
+                                                           FmhaFwdBlockWarps,
+                                                           FmhaFwdWarpTile,
+                                                           FmhaFwdVLayout>
+{
+};
diff --git a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_forward_kernel.h b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_forward_kernel.h
index 034c0178e..acabd1e7a 100644
--- a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_forward_kernel.h
+++ b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_forward_kernel.h
@@ -34,6 +34,7 @@ struct FmhaFwdKernel
     using KDataType    = ck::remove_cvref_t<typename FmhaPipeline::KDataType>;
     using VDataType    = ck::remove_cvref_t<typename FmhaPipeline::VDataType>;
     using BiasDataType = ck::remove_cvref_t<typename FmhaPipeline::BiasDataType>;
+    using LSEDataType  = ck::remove_cvref_t<typename FmhaPipeline::LSEDataType>;
     using ODataType    = ck::remove_cvref_t<typename FmhaPipeline::ODataType>;
 
     using VLayout = ck::remove_cvref_t<typename FmhaPipeline::VLayout>;
@@ -43,27 +44,24 @@ struct FmhaFwdKernel
     static constexpr bool kN0K1NeedPadding = FmhaPipeline::kN0K1NeedPadding;
     static constexpr bool kK0N1NeedPadding = FmhaPipeline::kK0N1NeedPadding;
     static constexpr bool kHasBias         = FmhaPipeline::kHasBias;
+    static constexpr bool kStoreLSE        = FmhaPipeline::kStoreLSE;
     using FmhaMask                         = ck::remove_cvref_t<typename FmhaPipeline::FmhaMask>;
     static constexpr bool kHasMask         = FmhaMask::IsMasking;
 
-    // using C0MatrixMask = ck::tile_program::block::C0MatrixMask_impl<
-    //     ck::remove_cvref_t<typename FmhaPipeline::FmhaMask>>;
-
-    private:
     template <ck::index_t I> // to avoid duplicated base class prblem, introduce an template arg
-    struct EmptyKargs
+    struct FmhaFwdEmptyKargs
     {
     };
 
     // kargs use aggregate initializer, so no constructor will provided
     // use inheritance to minimize karg size
     // user need to use MakeKargs() function to create kargs.
-    struct CommonKargs
+    struct FmhaFwdCommonKargs
     {
-        const QDataType* q_ptr;
-        const KDataType* k_ptr;
-        const VDataType* v_ptr;
-        ODataType* o_ptr;
+        const void* q_ptr;
+        const void* k_ptr;
+        const void* v_ptr;
+        void* o_ptr;
 
         ck::index_t seqlen_q;
         ck::index_t seqlen_k;
@@ -86,27 +84,40 @@ struct FmhaFwdKernel
         ck::index_t nhead_stride_o;
     };
 
-    struct CommonBiasKargs
+    struct FmhaFwdCommonBiasKargs
     {
-        const BiasDataType* bias_ptr  = nullptr;
+        const void* bias_ptr          = nullptr;
         ck::index_t stride_bias       = 0;
         ck::index_t nhead_stride_bias = 0;
     };
 
-    struct BatchModeBiasKargs : CommonBiasKargs
+    struct FmhaFwdBatchModeBiasKargs : FmhaFwdCommonBiasKargs
     {
         ck::index_t batch_stride_bias = 0;
     };
 
-    struct MaskKargs
+    struct FmhaFwdMaskKargs
     {
         CausalMaskType mask_type;
         ck::index_t window_size;
     };
 
-    struct BatchModeKargs : CommonKargs,
-                            std::conditional_t<kHasBias, BatchModeBiasKargs, EmptyKargs<0>>,
-                            std::conditional_t<kHasMask, MaskKargs, EmptyKargs<1>>
+    struct FmhaFwdCommonLSEKargs
+    {
+        void* lse_ptr                = nullptr;
+        ck::index_t nhead_stride_lse = 0;
+    };
+
+    struct FmhaFwdBatchModeLSEKargs : FmhaFwdCommonLSEKargs
+    {
+        ck::index_t batch_stride_lse = 0;
+    };
+
+    struct FmhaFwdBatchModeKargs
+        : FmhaFwdCommonKargs,
+          std::conditional_t<kHasBias, FmhaFwdBatchModeBiasKargs, FmhaFwdEmptyKargs<0>>,
+          std::conditional_t<kHasMask, FmhaFwdMaskKargs, FmhaFwdEmptyKargs<1>>,
+          std::conditional_t<kStoreLSE, FmhaFwdBatchModeLSEKargs, FmhaFwdEmptyKargs<2>>
     {
         ck::index_t batch_stride_q;
         ck::index_t batch_stride_k;
@@ -114,23 +125,25 @@ struct FmhaFwdKernel
         ck::index_t batch_stride_o;
     };
 
-    struct GroupModeKargs : CommonKargs,
-                            std::conditional_t<kHasBias, CommonBiasKargs, EmptyKargs<0>>,
-                            std::conditional_t<kHasMask, MaskKargs, EmptyKargs<1>>
+    struct FmhaFwdGroupModeKargs
+        : FmhaFwdCommonKargs,
+          std::conditional_t<kHasBias, FmhaFwdCommonBiasKargs, FmhaFwdEmptyKargs<0>>,
+          std::conditional_t<kHasMask, FmhaFwdMaskKargs, FmhaFwdEmptyKargs<1>>,
+          std::conditional_t<kStoreLSE, FmhaFwdCommonLSEKargs, FmhaFwdEmptyKargs<2>>
     {
         const int32_t* seqstart_q_ptr;
         const int32_t* seqstart_k_ptr;
         const int32_t* seqlen_k_ptr;
     };
 
-    public:
-    using Kargs = std::conditional_t<kIsGroupMode, GroupModeKargs, BatchModeKargs>;
+    using Kargs = std::conditional_t<kIsGroupMode, FmhaFwdGroupModeKargs, FmhaFwdBatchModeKargs>;
 
     template <bool Cond = !kIsGroupMode>
     __host__ static constexpr std::enable_if_t<Cond, Kargs> MakeKargs(const void* q_ptr,
                                                                       const void* k_ptr,
                                                                       const void* v_ptr,
                                                                       const void* bias_ptr,
+                                                                      void* lse_ptr,
                                                                       void* o_ptr,
                                                                       ck::index_t seqlen_q,
                                                                       ck::index_t seqlen_k,
@@ -147,19 +160,21 @@ struct FmhaFwdKernel
                                                                       ck::index_t nhead_stride_k,
                                                                       ck::index_t nhead_stride_v,
                                                                       ck::index_t nhead_stride_bias,
+                                                                      ck::index_t nhead_stride_lse,
                                                                       ck::index_t nhead_stride_o,
                                                                       ck::index_t batch_stride_q,
                                                                       ck::index_t batch_stride_k,
                                                                       ck::index_t batch_stride_v,
                                                                       ck::index_t batch_stride_bias,
+                                                                      ck::index_t batch_stride_lse,
                                                                       ck::index_t batch_stride_o,
                                                                       CausalMaskType mask_type,
                                                                       ck::index_t window_size)
     {
-        Kargs kargs{{reinterpret_cast<const QDataType*>(q_ptr),
-                     reinterpret_cast<const KDataType*>(k_ptr),
-                     reinterpret_cast<const VDataType*>(v_ptr),
-                     reinterpret_cast<ODataType*>(o_ptr),
+        Kargs kargs{{q_ptr,
+                     k_ptr,
+                     v_ptr,
+                     o_ptr,
                      seqlen_q,
                      seqlen_k,
                      hdim_q,
@@ -180,6 +195,7 @@ struct FmhaFwdKernel
                      nhead_stride_o}, // args for common karg
                     {},               // placeholder for bias
                     {},               // placeholder for mask
+                    {},               // placeholder for lse
                     batch_stride_q,
                     batch_stride_k,
                     batch_stride_v,
@@ -187,7 +203,7 @@ struct FmhaFwdKernel
 
         if constexpr(kHasBias)
         {
-            kargs.bias_ptr          = reinterpret_cast<const BiasDataType*>(bias_ptr);
+            kargs.bias_ptr          = bias_ptr;
             kargs.stride_bias       = stride_bias;
             kargs.nhead_stride_bias = nhead_stride_bias;
             kargs.batch_stride_bias = batch_stride_bias;
@@ -198,6 +214,12 @@ struct FmhaFwdKernel
             kargs.mask_type   = mask_type;
             kargs.window_size = window_size;
         }
+        if constexpr(kStoreLSE)
+        {
+            kargs.lse_ptr          = lse_ptr;
+            kargs.nhead_stride_lse = nhead_stride_lse;
+            kargs.batch_stride_lse = batch_stride_lse;
+        }
 
         return kargs;
     }
@@ -207,6 +229,7 @@ struct FmhaFwdKernel
                                                                       const void* k_ptr,
                                                                       const void* v_ptr,
                                                                       const void* bias_ptr,
+                                                                      void* lse_ptr,
                                                                       void* o_ptr,
                                                                       const void* seqstart_q_ptr,
                                                                       const void* seqstart_k_ptr,
@@ -224,14 +247,15 @@ struct FmhaFwdKernel
                                                                       ck::index_t nhead_stride_k,
                                                                       ck::index_t nhead_stride_v,
                                                                       ck::index_t nhead_stride_bias,
+                                                                      ck::index_t nhead_stride_lse,
                                                                       ck::index_t nhead_stride_o,
                                                                       CausalMaskType mask_type,
                                                                       ck::index_t window_size)
     {
-        Kargs kargs{{reinterpret_cast<const QDataType*>(q_ptr),
-                     reinterpret_cast<const KDataType*>(k_ptr),
-                     reinterpret_cast<const VDataType*>(v_ptr),
-                     reinterpret_cast<ODataType*>(o_ptr),
+        Kargs kargs{{q_ptr,
+                     k_ptr,
+                     v_ptr,
+                     o_ptr,
                      -1, // seqlen will be updated by another pointer
                      -1, //
                      hdim_q,
@@ -252,13 +276,14 @@ struct FmhaFwdKernel
                      nhead_stride_o}, // args for common karg
                     {},               // placeholder for bias
                     {},               // placeholder for mask
+                    {},               // placeholder for lse
                     reinterpret_cast<const int32_t*>(seqstart_q_ptr),
                     reinterpret_cast<const int32_t*>(seqstart_k_ptr),
                     reinterpret_cast<const int32_t*>(seqlen_k_ptr)};
 
         if constexpr(kHasBias)
         {
-            kargs.bias_ptr          = reinterpret_cast<const BiasDataType*>(bias_ptr);
+            kargs.bias_ptr          = bias_ptr;
             kargs.stride_bias       = stride_bias;
             kargs.nhead_stride_bias = nhead_stride_bias;
         }
@@ -267,6 +292,11 @@ struct FmhaFwdKernel
             kargs.mask_type   = mask_type;
             kargs.window_size = window_size;
         }
+        if constexpr(kStoreLSE)
+        {
+            kargs.lse_ptr          = lse_ptr;
+            kargs.nhead_stride_lse = nhead_stride_lse;
+        }
 
         return kargs;
     }
@@ -306,6 +336,7 @@ struct FmhaFwdKernel
         long_index_t batch_offset_k    = 0;
         long_index_t batch_offset_v    = 0;
         long_index_t batch_offset_bias = 0;
+        long_index_t batch_offset_lse  = 0;
         long_index_t batch_offset_o    = 0;
 
         if constexpr(kIsGroupMode)
@@ -332,6 +363,10 @@ struct FmhaFwdKernel
             {
                 batch_offset_bias = key_start;
             }
+            if constexpr(kStoreLSE)
+            {
+                batch_offset_lse = query_start;
+            }
             batch_offset_o = query_start * kargs.stride_o;
 
             // get real # queries & # keys under group mode
@@ -364,22 +399,27 @@ struct FmhaFwdKernel
             {
                 batch_offset_bias = static_cast<long_index_t>(i_batch) * kargs.batch_stride_bias;
             }
+            if constexpr(kStoreLSE)
+            {
+                batch_offset_lse = static_cast<long_index_t>(i_batch) * kargs.batch_stride_lse;
+            }
             batch_offset_o = static_cast<long_index_t>(i_batch) * kargs.batch_stride_o;
         }
 
         // for simplicity, batch stride we just modify the pointer
-        const QDataType* q_ptr = kargs.q_ptr +
+        const QDataType* q_ptr = reinterpret_cast<const QDataType*>(kargs.q_ptr) +
                                  static_cast<long_index_t>(i_nhead) * kargs.nhead_stride_q +
                                  batch_offset_q;
         const KDataType* k_ptr =
-            kargs.k_ptr +
+            reinterpret_cast<const KDataType*>(kargs.k_ptr) +
             static_cast<long_index_t>(i_nhead / kargs.nhead_ratio_qk) * kargs.nhead_stride_k +
             batch_offset_k;
         const VDataType* v_ptr =
-            kargs.v_ptr +
+            reinterpret_cast<const VDataType*>(kargs.v_ptr) +
             static_cast<long_index_t>(i_nhead / kargs.nhead_ratio_qk) * kargs.nhead_stride_v +
             batch_offset_v;
-        ODataType* o_ptr = kargs.o_ptr + static_cast<long_index_t>(i_nhead) * kargs.nhead_stride_o +
+        ODataType* o_ptr = reinterpret_cast<ODataType*>(kargs.o_ptr) +
+                           static_cast<long_index_t>(i_nhead) * kargs.nhead_stride_o +
                            batch_offset_o;
 
         // Q/K/V DRAM and DRAM window
@@ -526,7 +566,8 @@ struct FmhaFwdKernel
             if constexpr(kHasBias)
             {
                 const BiasDataType* bias_ptr =
-                    kargs.bias_ptr + static_cast<long_index_t>(i_nhead_) * kargs.nhead_stride_bias +
+                    reinterpret_cast<const BiasDataType*>(kargs.bias_ptr) +
+                    static_cast<long_index_t>(i_nhead_) * kargs.nhead_stride_bias +
                     batch_offset_bias;
 
                 const auto bias_dram = [&]() {
@@ -550,6 +591,35 @@ struct FmhaFwdKernel
             }
         }();
 
+        // lse
+        auto lse_dram_window = [&, i_nhead_ = i_nhead]() {
+            constexpr auto lse_dram_window_lengths = make_tuple(Number<FmhaPipeline::kM0>{});
+            if constexpr(kStoreLSE)
+            {
+                LSEDataType* lse_ptr =
+                    reinterpret_cast<LSEDataType*>(kargs.lse_ptr) +
+                    static_cast<long_index_t>(i_nhead_) * kargs.nhead_stride_lse + batch_offset_lse;
+
+                const auto lse_dram = [&]() {
+                    const auto lse_dram_naive =
+                        make_naive_tensor_view<AddressSpaceEnum::Global>(lse_ptr,
+                                                                         make_tuple(kargs.seqlen_q),
+                                                                         make_tuple(1),
+                                                                         Number<1>{},
+                                                                         Number<1>{});
+
+                    return pad_tensor_view(
+                        lse_dram_naive, lse_dram_window_lengths, Sequence<kM0NeedPadding>{});
+                }();
+
+                return make_tile_window(lse_dram, lse_dram_window_lengths, {i_m0});
+            }
+            else
+            {
+                return make_null_tile_window(lse_dram_window_lengths);
+            }
+        }();
+
         FmhaMask mask = [&]() {
             if constexpr(kHasMask)
             {
@@ -606,6 +676,7 @@ struct FmhaFwdKernel
                            k_dram_window,
                            v_dram_window,
                            bias_dram_window,
+                           lse_dram_window,
                            mask,
                            kargs.scale,
                            // ck::math::integer_divide_ceil(kargs.seqlen_k, FmhaPipeline::kN0),
diff --git a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_infer.h b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_infer.h
index bc907c8a7..a52232cf0 100644
--- a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_infer.h
+++ b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_infer.h
@@ -19,10 +19,8 @@
 #include <ck/tensor/tensor_view.hpp>
 
 #include <ck/tile_program/block_tile_pipeline/block_fmha_pipeline_problem.hpp>
-#include <ck/tile_program/block_tile_pipeline/block_fmha_pipeline_qkvs.hpp>
-#include <ck/tile_program/block_tile_pipeline/block_fmha_pipeline_qkvs_default_policy.hpp>
 #include <ck/tile_program/block_tile_pipeline/block_fmha_pipeline_qr_ks_vs.hpp>
-#include <ck/tile_program/block_tile_pipeline/block_fmha_pipeline_qr_ks_vs_default_policy.hpp>
+#include <ck/tile_program/block_tile_pipeline/block_fmha_pipeline_qs_ks_vs.hpp>
 #include <ck/tile_program/tile/tile_fmha_shape.hpp>
 #include <ck/tile_program/tile/tile_fmha_traits.hpp>
 #include <ck/tile_program/block_tile/block_masking.hpp>
@@ -60,6 +58,11 @@ struct grouped_infer_causalmask_attnbias_dispatched
             constexpr ck::index_t CONST_NAME = 128;                         \
             __VA_ARGS__();                                                  \
         }                                                                   \
+        else if(HEAD_DIM1 <= 256 && HEAD_DIM2 <= 256)                       \
+        {                                                                   \
+            constexpr ck::index_t CONST_NAME = 256;                         \
+            __VA_ARGS__();                                                  \
+        }                                                                   \
         else                                                                \
         {                                                                   \
             throw std::runtime_error("Head-dim sizes not supported!");      \
@@ -75,6 +78,7 @@ struct grouped_infer_causalmask_attnbias_dispatched
         typename FmhaFwdTypeConfig<scalar_t>::SaccDataType,
         typename FmhaFwdTypeConfig<scalar_t>::SMPLComputeDataType,
         typename FmhaFwdTypeConfig<scalar_t>::BiasDataType,
+        typename FmhaFwdTypeConfig<scalar_t>::LSEDataType,
         typename FmhaFwdTypeConfig<scalar_t>::PDataType,
         typename FmhaFwdTypeConfig<scalar_t>::OaccDataType,
         typename FmhaFwdTypeConfig<scalar_t>::ODataType,
@@ -110,15 +114,29 @@ struct grouped_infer_causalmask_attnbias_dispatched
                                                                         kN0K1NeedPadding,
                                                                         kK0N1NeedPadding,
                                                                         has_attn_bias,
+                                                                        false, // kStoreLSE
                                                                         occupancy>;
 
                     using FmhaPipelineProblem = FmhaPipelineProblemTemp<FmhaTraits, HDim, FmhaMask>;
-                    using FmhaPipeline =
-                        ck::tile_program::block::BlockFmhaPipelineQRKSVS<FmhaPipelineProblem>;
-                    using FmhaKernel =
-                        FmhaFwdKernel<FmhaTilePartitioner, FmhaPipeline, FmhaEpilogue>;
 
-                    RunWithKernel<FmhaKernel>(param, stream);
+                    if constexpr(HDim == 256)
+                    {
+                        using FmhaPipeline =
+                            ck::tile_program::block::BlockFmhaPipelineQSKSVS<FmhaPipelineProblem>;
+                        using FmhaKernel =
+                            FmhaFwdKernel<FmhaTilePartitioner, FmhaPipeline, FmhaEpilogue>;
+
+                        RunWithKernel<FmhaKernel>(param, stream);
+                    }
+                    else
+                    {
+                        using FmhaPipeline =
+                            ck::tile_program::block::BlockFmhaPipelineQRKSVS<FmhaPipelineProblem>;
+                        using FmhaKernel =
+                            FmhaFwdKernel<FmhaTilePartitioner, FmhaPipeline, FmhaEpilogue>;
+
+                        RunWithKernel<FmhaKernel>(param, stream);
+                    }
                 });
             });
         });
@@ -133,6 +151,7 @@ struct grouped_infer_causalmask_attnbias_dispatched
                 param.k_ptr,
                 param.v_ptr,
                 param.attn_bias_ptr,
+                nullptr, // lse_ptr
                 param.out_ptr,
                 param.seqstart_q_dev_ptr,
                 param.seqstart_k_dev_ptr,
@@ -146,10 +165,11 @@ struct grouped_infer_causalmask_attnbias_dispatched
                 param.v_strides[0],
                 param.attn_bias_strides[2],
                 param.out_strides[0],
-                param.q_strides[1], // q, k, v, bias, out tensor head-dim stride
+                param.q_strides[1], // q, k, v, bias, lse, out tensor head-dim stride
                 param.k_strides[1],
                 param.v_strides[1],
                 param.attn_bias_strides[1],
+                0, // nhead_stride_lse
                 param.out_strides[1],
                 static_cast<CausalMaskType>(param.custom_mask_type),
                 param.window_size);

From 391af2b4e411440d1a2d65ff22a7a6c21f6afc83 Mon Sep 17 00:00:00 2001
From: Qianfeng Zhang <Qianfeng.Zhang@amd.com>
Date: Fri, 19 Jan 2024 22:13:18 +0000
Subject: [PATCH 378/641] Add definition of FMHA_FWD_HEADDIM_SWITCH

---
 .../hip_fmha/ck_tiled_fmha_batched_infer.h    | 33 +----------------
 .../hip_fmha/ck_tiled_fmha_grouped_infer.h    | 33 +----------------
 .../hip_fmha/ck_tiled_headdim_switch.h        | 37 +++++++++++++++++++
 3 files changed, 41 insertions(+), 62 deletions(-)
 create mode 100644 xformers/csrc/attention/hip_fmha/ck_tiled_headdim_switch.h

diff --git a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_infer.h b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_infer.h
index 122e415ee..09c4ed668 100644
--- a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_infer.h
+++ b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_infer.h
@@ -32,6 +32,7 @@
 #include "ck_tiled_fmha_definitions.h"
 
 #include "ck_tiled_bool_switch.h"
+#include "ck_tiled_headdim_switch.h"
 
 template <typename scalar_t, bool has_causal_mask, bool has_attn_bias>
 struct batched_infer_causalmask_attnbias_dispatched
@@ -40,36 +41,6 @@ struct batched_infer_causalmask_attnbias_dispatched
         FmhaFwdEpilogue<FmhaFwdEpilogueProblem<typename FmhaFwdTypeConfig<scalar_t>::OaccDataType,
                                                typename FmhaFwdTypeConfig<scalar_t>::ODataType>>;
 
-#ifndef BATCHED_INFER_HEADDIM_SWITCH
-#define BATCHED_INFER_HEADDIM_SWITCH(HEAD_DIM1, HEAD_DIM2, CONST_NAME, ...) \
-    [&] {                                                                   \
-        if(HEAD_DIM1 <= 32 && HEAD_DIM2 <= 32)                              \
-        {                                                                   \
-            constexpr ck::index_t CONST_NAME = 32;                          \
-            __VA_ARGS__();                                                  \
-        }                                                                   \
-        else if(HEAD_DIM1 <= 64 && HEAD_DIM2 <= 64)                         \
-        {                                                                   \
-            constexpr ck::index_t CONST_NAME = 64;                          \
-            __VA_ARGS__();                                                  \
-        }                                                                   \
-        else if(HEAD_DIM1 <= 128 && HEAD_DIM2 <= 128)                       \
-        {                                                                   \
-            constexpr ck::index_t CONST_NAME = 128;                         \
-            __VA_ARGS__();                                                  \
-        }                                                                   \
-        else if(HEAD_DIM1 <= 256 && HEAD_DIM2 <= 256)                       \
-        {                                                                   \
-            constexpr ck::index_t CONST_NAME = 256;                         \
-            __VA_ARGS__();                                                  \
-        }                                                                   \
-        else                                                                \
-        {                                                                   \
-            throw std::runtime_error("Head-dim sizes not supported!");      \
-        }                                                                   \
-    }()
-#endif
-
     template <typename FmhaTraits, ck::index_t HDim, typename FmhaMask>
     using FmhaPipelineProblemTemp = ck::tile_program::block::BlockFmhaPipelineProblem<
         typename FmhaFwdTypeConfig<scalar_t>::QDataType,
@@ -98,7 +69,7 @@ struct batched_infer_causalmask_attnbias_dispatched
             using FmhaMask =
                 ck::tile_program::block::GenericAttentionMask<has_masking, USE_LOCAL_ATTENTION>;
 
-            BATCHED_INFER_HEADDIM_SWITCH(param.K, param.Kv, HDim, [&] {
+            FMHA_FWD_HEADDIM_SWITCH(param.K, param.Kv, HDim, [&] {
                 using FmhaShape                 = FmhaFwdShape<HDim>;
                 using FmhaTilePartitioner       = FmhaFwdTilePartitioner<FmhaShape>;
                 constexpr ck::index_t occupancy = (HDim == 64) ? 3 : 2;
diff --git a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_infer.h b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_infer.h
index a52232cf0..a996a5eea 100644
--- a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_infer.h
+++ b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_infer.h
@@ -32,6 +32,7 @@
 #include "ck_tiled_fmha_definitions.h"
 
 #include "ck_tiled_bool_switch.h"
+#include "ck_tiled_headdim_switch.h"
 
 template <typename scalar_t, bool has_causal_mask, bool has_attn_bias>
 struct grouped_infer_causalmask_attnbias_dispatched
@@ -40,36 +41,6 @@ struct grouped_infer_causalmask_attnbias_dispatched
         FmhaFwdEpilogue<FmhaFwdEpilogueProblem<typename FmhaFwdTypeConfig<scalar_t>::OaccDataType,
                                                typename FmhaFwdTypeConfig<scalar_t>::ODataType>>;
 
-#ifndef GROUPED_INFER_HEADDIM_SWITCH
-#define GROUPED_INFER_HEADDIM_SWITCH(HEAD_DIM1, HEAD_DIM2, CONST_NAME, ...) \
-    [&] {                                                                   \
-        if(HEAD_DIM1 <= 32 && HEAD_DIM2 <= 32)                              \
-        {                                                                   \
-            constexpr ck::index_t CONST_NAME = 32;                          \
-            __VA_ARGS__();                                                  \
-        }                                                                   \
-        else if(HEAD_DIM1 <= 64 && HEAD_DIM2 <= 64)                         \
-        {                                                                   \
-            constexpr ck::index_t CONST_NAME = 64;                          \
-            __VA_ARGS__();                                                  \
-        }                                                                   \
-        else if(HEAD_DIM1 <= 128 && HEAD_DIM2 <= 128)                       \
-        {                                                                   \
-            constexpr ck::index_t CONST_NAME = 128;                         \
-            __VA_ARGS__();                                                  \
-        }                                                                   \
-        else if(HEAD_DIM1 <= 256 && HEAD_DIM2 <= 256)                       \
-        {                                                                   \
-            constexpr ck::index_t CONST_NAME = 256;                         \
-            __VA_ARGS__();                                                  \
-        }                                                                   \
-        else                                                                \
-        {                                                                   \
-            throw std::runtime_error("Head-dim sizes not supported!");      \
-        }                                                                   \
-    }()
-#endif
-
     template <typename FmhaTraits, ck::index_t HDim, typename FmhaMask>
     using FmhaPipelineProblemTemp = ck::tile_program::block::BlockFmhaPipelineProblem<
         typename FmhaFwdTypeConfig<scalar_t>::QDataType,
@@ -98,7 +69,7 @@ struct grouped_infer_causalmask_attnbias_dispatched
             using FmhaMask =
                 ck::tile_program::block::GenericAttentionMask<has_masking, USE_LOCAL_ATTENTION>;
 
-            GROUPED_INFER_HEADDIM_SWITCH(param.K, param.Kv, HDim, [&] {
+            FMHA_FWD_HEADDIM_SWITCH(param.K, param.Kv, HDim, [&] {
                 using FmhaShape                 = FmhaFwdShape<HDim>;
                 using FmhaTilePartitioner       = FmhaFwdTilePartitioner<FmhaShape>;
                 constexpr ck::index_t occupancy = (HDim == 64) ? 3 : 2;
diff --git a/xformers/csrc/attention/hip_fmha/ck_tiled_headdim_switch.h b/xformers/csrc/attention/hip_fmha/ck_tiled_headdim_switch.h
new file mode 100644
index 000000000..6043ebcd0
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/ck_tiled_headdim_switch.h
@@ -0,0 +1,37 @@
+/*
+ * Copyright (c) 2023-2024, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#pragma once
+
+#include <stdexcept>
+
+#define FMHA_FWD_HEADDIM_SWITCH(HEAD_DIM1, HEAD_DIM2, CONST_NAME, ...) \
+    [&] {                                                              \
+        if(HEAD_DIM1 <= 32 && HEAD_DIM2 <= 32)                         \
+        {                                                              \
+            constexpr ck::index_t CONST_NAME = 32;                     \
+            __VA_ARGS__();                                             \
+        }                                                              \
+        else if(HEAD_DIM1 <= 64 && HEAD_DIM2 <= 64)                    \
+        {                                                              \
+            constexpr ck::index_t CONST_NAME = 64;                     \
+            __VA_ARGS__();                                             \
+        }                                                              \
+        else if(HEAD_DIM1 <= 128 && HEAD_DIM2 <= 128)                  \
+        {                                                              \
+            constexpr ck::index_t CONST_NAME = 128;                    \
+            __VA_ARGS__();                                             \
+        }                                                              \
+        else if(HEAD_DIM1 <= 256 && HEAD_DIM2 <= 256)                  \
+        {                                                              \
+            constexpr ck::index_t CONST_NAME = 256;                    \
+            __VA_ARGS__();                                             \
+        }                                                              \
+        else                                                           \
+        {                                                              \
+            throw std::runtime_error("Head-dim sizes not supported!"); \
+        }                                                              \
+    }()

From 53719f96015333e9364643b0aba5ba4374e4b276 Mon Sep 17 00:00:00 2001
From: Qianfeng Zhang <Qianfeng.Zhang@amd.com>
Date: Fri, 19 Jan 2024 23:22:29 +0000
Subject: [PATCH 379/641] Split the ck-tiled inference instances based on
 head-dim sizes to improve compiling

---
 .../hip_fmha/ck_tiled_fmha_batched_infer.h    | 124 +++++++++---------
 .../ck_tiled_fmha_batched_infer_bp16.cpp      |  59 ++++++---
 .../ck_tiled_fmha_batched_infer_fp16.cpp      |  59 ++++++---
 .../hip_fmha/ck_tiled_fmha_grouped_infer.h    |  88 ++++++-------
 .../ck_tiled_fmha_grouped_infer_bp16.cpp      |  59 ++++++---
 .../ck_tiled_fmha_grouped_infer_fp16.cpp      |  59 ++++++---
 .../attention/hip_fmha/instances_tiled/\\"    |   2 +-
 ..._no_causalmask_no_attnbias_headdim_128.cpp |  12 ++
 ..._no_causalmask_no_attnbias_headdim_256.cpp |  12 ++
 ...6_no_causalmask_no_attnbias_headdim_32.cpp |  12 ++
 ...6_no_causalmask_no_attnbias_headdim_64.cpp |  12 ++
 ...o_causalmask_with_attnbias_headdim_128.cpp |  12 ++
 ...o_causalmask_with_attnbias_headdim_256.cpp |  12 ++
 ...no_causalmask_with_attnbias_headdim_32.cpp |  12 ++
 ...no_causalmask_with_attnbias_headdim_64.cpp |  12 ++
 ...ith_causalmask_no_attnbias_headdim_128.cpp |  12 ++
 ...ith_causalmask_no_attnbias_headdim_256.cpp |  12 ++
 ...with_causalmask_no_attnbias_headdim_32.cpp |  12 ++
 ...with_causalmask_no_attnbias_headdim_64.cpp |  12 ++
 ...h_causalmask_with_attnbias_headdim_128.cpp |  12 ++
 ...h_causalmask_with_attnbias_headdim_256.cpp |  12 ++
 ...h_causalmask_with_attnbias_headdim_32.cpp} |   2 +-
 ...th_causalmask_with_attnbias_headdim_64.cpp |  12 ++
 ...d_infer_fp16_no_causalmask_no_attnbias.cpp |  12 --
 ..._no_causalmask_no_attnbias_headdim_128.cpp |  12 ++
 ..._no_causalmask_no_attnbias_headdim_256.cpp |  12 ++
 ...6_no_causalmask_no_attnbias_headdim_32.cpp |  12 ++
 ...6_no_causalmask_no_attnbias_headdim_64.cpp |  12 ++
 ...infer_fp16_no_causalmask_with_attnbias.cpp |  12 --
 ...o_causalmask_with_attnbias_headdim_128.cpp |  12 ++
 ...o_causalmask_with_attnbias_headdim_256.cpp |  12 ++
 ...no_causalmask_with_attnbias_headdim_32.cpp |  12 ++
 ...no_causalmask_with_attnbias_headdim_64.cpp |  12 ++
 ...infer_fp16_with_causalmask_no_attnbias.cpp |  12 --
 ...ith_causalmask_no_attnbias_headdim_128.cpp |  12 ++
 ...ith_causalmask_no_attnbias_headdim_256.cpp |  12 ++
 ...with_causalmask_no_attnbias_headdim_32.cpp |  12 ++
 ...with_causalmask_no_attnbias_headdim_64.cpp |  12 ++
 ...fer_fp16_with_causalmask_with_attnbias.cpp |  12 --
 ...h_causalmask_with_attnbias_headdim_128.cpp |  12 ++
 ...h_causalmask_with_attnbias_headdim_256.cpp |  12 ++
 ...h_causalmask_with_attnbias_headdim_32.cpp} |   2 +-
 ...h_causalmask_with_attnbias_headdim_64.cpp} |   2 +-
 ..._no_causalmask_no_attnbias_headdim_128.cpp |  12 ++
 ..._no_causalmask_no_attnbias_headdim_256.cpp |  12 ++
 ...6_no_causalmask_no_attnbias_headdim_32.cpp |  12 ++
 ...6_no_causalmask_no_attnbias_headdim_64.cpp |  12 ++
 ...o_causalmask_with_attnbias_headdim_128.cpp |  12 ++
 ...o_causalmask_with_attnbias_headdim_256.cpp |  12 ++
 ...no_causalmask_with_attnbias_headdim_32.cpp |  12 ++
 ...no_causalmask_with_attnbias_headdim_64.cpp |  12 ++
 ...ith_causalmask_no_attnbias_headdim_128.cpp |  12 ++
 ...ith_causalmask_no_attnbias_headdim_256.cpp |  12 ++
 ...with_causalmask_no_attnbias_headdim_32.cpp |  12 ++
 ...with_causalmask_no_attnbias_headdim_64.cpp |  12 ++
 ...h_causalmask_with_attnbias_headdim_128.cpp |  12 ++
 ...h_causalmask_with_attnbias_headdim_256.cpp |  12 ++
 ...h_causalmask_with_attnbias_headdim_32.cpp} |   2 +-
 ...h_causalmask_with_attnbias_headdim_64.cpp} |   2 +-
 ...d_infer_fp16_no_causalmask_no_attnbias.cpp |  12 --
 ..._no_causalmask_no_attnbias_headdim_128.cpp |  12 ++
 ..._no_causalmask_no_attnbias_headdim_256.cpp |  12 ++
 ...6_no_causalmask_no_attnbias_headdim_32.cpp |  12 ++
 ...6_no_causalmask_no_attnbias_headdim_64.cpp |  12 ++
 ...infer_fp16_no_causalmask_with_attnbias.cpp |  12 --
 ...o_causalmask_with_attnbias_headdim_128.cpp |  12 ++
 ...o_causalmask_with_attnbias_headdim_256.cpp |  12 ++
 ...no_causalmask_with_attnbias_headdim_32.cpp |  12 ++
 ...no_causalmask_with_attnbias_headdim_64.cpp |  12 ++
 ...infer_fp16_with_causalmask_no_attnbias.cpp |  12 --
 ...ith_causalmask_no_attnbias_headdim_128.cpp |  12 ++
 ...ith_causalmask_no_attnbias_headdim_256.cpp |  12 ++
 ...with_causalmask_no_attnbias_headdim_32.cpp |  12 ++
 ...with_causalmask_no_attnbias_headdim_64.cpp |  12 ++
 ...fer_fp16_with_causalmask_with_attnbias.cpp |  12 --
 ...h_causalmask_with_attnbias_headdim_128.cpp |  12 ++
 ...h_causalmask_with_attnbias_headdim_256.cpp |  12 ++
 ...h_causalmask_with_attnbias_headdim_32.cpp} |   2 +-
 ...h_causalmask_with_attnbias_headdim_64.cpp} |   2 +-
 79 files changed, 971 insertions(+), 273 deletions(-)
 rename xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_bp16_with_causalmask_no_attnbias.cpp => "xformers/csrc/attention/hip_fmha/instances_tiled/\\" (93%)
 create mode 100644 xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_bp16_no_causalmask_no_attnbias_headdim_128.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_bp16_no_causalmask_no_attnbias_headdim_256.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_bp16_no_causalmask_no_attnbias_headdim_32.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_bp16_no_causalmask_no_attnbias_headdim_64.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_bp16_no_causalmask_with_attnbias_headdim_128.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_bp16_no_causalmask_with_attnbias_headdim_256.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_bp16_no_causalmask_with_attnbias_headdim_32.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_bp16_no_causalmask_with_attnbias_headdim_64.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_bp16_with_causalmask_no_attnbias_headdim_128.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_bp16_with_causalmask_no_attnbias_headdim_256.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_bp16_with_causalmask_no_attnbias_headdim_32.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_bp16_with_causalmask_no_attnbias_headdim_64.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_bp16_with_causalmask_with_attnbias_headdim_128.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_bp16_with_causalmask_with_attnbias_headdim_256.cpp
 rename xformers/csrc/attention/hip_fmha/instances_tiled/{ck_tiled_fmha_batched_infer_bp16_with_causalmask_with_attnbias.cpp => ck_tiled_fmha_batched_infer_bp16_with_causalmask_with_attnbias_headdim_32.cpp} (93%)
 create mode 100644 xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_bp16_with_causalmask_with_attnbias_headdim_64.cpp
 delete mode 100644 xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_fp16_no_causalmask_no_attnbias.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_fp16_no_causalmask_no_attnbias_headdim_128.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_fp16_no_causalmask_no_attnbias_headdim_256.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_fp16_no_causalmask_no_attnbias_headdim_32.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_fp16_no_causalmask_no_attnbias_headdim_64.cpp
 delete mode 100644 xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_fp16_no_causalmask_with_attnbias.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_fp16_no_causalmask_with_attnbias_headdim_128.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_fp16_no_causalmask_with_attnbias_headdim_256.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_fp16_no_causalmask_with_attnbias_headdim_32.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_fp16_no_causalmask_with_attnbias_headdim_64.cpp
 delete mode 100644 xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_fp16_with_causalmask_no_attnbias.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_fp16_with_causalmask_no_attnbias_headdim_128.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_fp16_with_causalmask_no_attnbias_headdim_256.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_fp16_with_causalmask_no_attnbias_headdim_32.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_fp16_with_causalmask_no_attnbias_headdim_64.cpp
 delete mode 100644 xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_fp16_with_causalmask_with_attnbias.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_fp16_with_causalmask_with_attnbias_headdim_128.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_fp16_with_causalmask_with_attnbias_headdim_256.cpp
 rename xformers/csrc/attention/hip_fmha/instances_tiled/{ck_tiled_fmha_batched_infer_bp16_no_causalmask_no_attnbias.cpp => ck_tiled_fmha_batched_infer_fp16_with_causalmask_with_attnbias_headdim_32.cpp} (93%)
 rename xformers/csrc/attention/hip_fmha/instances_tiled/{ck_tiled_fmha_batched_infer_bp16_no_causalmask_with_attnbias.cpp => ck_tiled_fmha_batched_infer_fp16_with_causalmask_with_attnbias_headdim_64.cpp} (93%)
 create mode 100644 xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_bp16_no_causalmask_no_attnbias_headdim_128.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_bp16_no_causalmask_no_attnbias_headdim_256.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_bp16_no_causalmask_no_attnbias_headdim_32.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_bp16_no_causalmask_no_attnbias_headdim_64.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_bp16_no_causalmask_with_attnbias_headdim_128.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_bp16_no_causalmask_with_attnbias_headdim_256.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_bp16_no_causalmask_with_attnbias_headdim_32.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_bp16_no_causalmask_with_attnbias_headdim_64.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_bp16_with_causalmask_no_attnbias_headdim_128.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_bp16_with_causalmask_no_attnbias_headdim_256.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_bp16_with_causalmask_no_attnbias_headdim_32.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_bp16_with_causalmask_no_attnbias_headdim_64.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_bp16_with_causalmask_with_attnbias_headdim_128.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_bp16_with_causalmask_with_attnbias_headdim_256.cpp
 rename xformers/csrc/attention/hip_fmha/instances_tiled/{ck_tiled_fmha_grouped_infer_bp16_with_causalmask_no_attnbias.cpp => ck_tiled_fmha_grouped_infer_bp16_with_causalmask_with_attnbias_headdim_32.cpp} (93%)
 rename xformers/csrc/attention/hip_fmha/instances_tiled/{ck_tiled_fmha_grouped_infer_bp16_with_causalmask_with_attnbias.cpp => ck_tiled_fmha_grouped_infer_bp16_with_causalmask_with_attnbias_headdim_64.cpp} (93%)
 delete mode 100644 xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_fp16_no_causalmask_no_attnbias.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_fp16_no_causalmask_no_attnbias_headdim_128.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_fp16_no_causalmask_no_attnbias_headdim_256.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_fp16_no_causalmask_no_attnbias_headdim_32.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_fp16_no_causalmask_no_attnbias_headdim_64.cpp
 delete mode 100644 xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_fp16_no_causalmask_with_attnbias.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_fp16_no_causalmask_with_attnbias_headdim_128.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_fp16_no_causalmask_with_attnbias_headdim_256.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_fp16_no_causalmask_with_attnbias_headdim_32.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_fp16_no_causalmask_with_attnbias_headdim_64.cpp
 delete mode 100644 xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_fp16_with_causalmask_no_attnbias.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_fp16_with_causalmask_no_attnbias_headdim_128.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_fp16_with_causalmask_no_attnbias_headdim_256.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_fp16_with_causalmask_no_attnbias_headdim_32.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_fp16_with_causalmask_no_attnbias_headdim_64.cpp
 delete mode 100644 xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_fp16_with_causalmask_with_attnbias.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_fp16_with_causalmask_with_attnbias_headdim_128.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_fp16_with_causalmask_with_attnbias_headdim_256.cpp
 rename xformers/csrc/attention/hip_fmha/instances_tiled/{ck_tiled_fmha_grouped_infer_bp16_no_causalmask_no_attnbias.cpp => ck_tiled_fmha_grouped_infer_fp16_with_causalmask_with_attnbias_headdim_32.cpp} (93%)
 rename xformers/csrc/attention/hip_fmha/instances_tiled/{ck_tiled_fmha_grouped_infer_bp16_no_causalmask_with_attnbias.cpp => ck_tiled_fmha_grouped_infer_fp16_with_causalmask_with_attnbias_headdim_64.cpp} (93%)

diff --git a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_infer.h b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_infer.h
index 09c4ed668..221dd467c 100644
--- a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_infer.h
+++ b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_infer.h
@@ -34,14 +34,14 @@
 #include "ck_tiled_bool_switch.h"
 #include "ck_tiled_headdim_switch.h"
 
-template <typename scalar_t, bool has_causal_mask, bool has_attn_bias>
+template <typename scalar_t, bool has_causal_mask, bool has_attn_bias, ck::index_t HDim>
 struct batched_infer_causalmask_attnbias_dispatched
 {
     using FmhaEpilogue =
         FmhaFwdEpilogue<FmhaFwdEpilogueProblem<typename FmhaFwdTypeConfig<scalar_t>::OaccDataType,
                                                typename FmhaFwdTypeConfig<scalar_t>::ODataType>>;
 
-    template <typename FmhaTraits, ck::index_t HDim, typename FmhaMask>
+    template <typename FmhaTraits, typename FmhaMask>
     using FmhaPipelineProblemTemp = ck::tile_program::block::BlockFmhaPipelineProblem<
         typename FmhaFwdTypeConfig<scalar_t>::QDataType,
         typename FmhaFwdTypeConfig<scalar_t>::KDataType,
@@ -69,41 +69,54 @@ struct batched_infer_causalmask_attnbias_dispatched
             using FmhaMask =
                 ck::tile_program::block::GenericAttentionMask<has_masking, USE_LOCAL_ATTENTION>;
 
-            FMHA_FWD_HEADDIM_SWITCH(param.K, param.Kv, HDim, [&] {
-                using FmhaShape                 = FmhaFwdShape<HDim>;
-                using FmhaTilePartitioner       = FmhaFwdTilePartitioner<FmhaShape>;
-                constexpr ck::index_t occupancy = (HDim == 64) ? 3 : 2;
-
-                bool m0_need_padding   = !(param.M % FmhaShape::kM0 == 0);
-                bool n0k1_need_padding = !(param.N % FmhaShape::kN0 == 0);
-
-                // ToDO: current pipelines all assume kQLoadOnce, which read whole k0
-                // (kK0BlockLength)
-                bool k0n1_need_padding =
-                    !(param.K % FmhaShape::kK0BlockLength == 0 && param.Kv % FmhaShape::kN1 == 0);
-
-                BOOL_SWITCH_3(
-                    m0_need_padding,
-                    kM0NeedPadding,
-                    n0k1_need_padding,
-                    kN0K1NeedPadding,
-                    k0n1_need_padding,
-                    kK0N1NeedPadding,
-                    [&] {
-                        using FmhaTraits = ck::tile_program::TileFmhaTraits<kM0NeedPadding,
-                                                                            kN0K1NeedPadding,
-                                                                            kK0N1NeedPadding,
-                                                                            has_attn_bias,
-                                                                            false, // kStoreLSE
-                                                                            occupancy>;
-
-                        using FmhaPipelineProblem =
-                            FmhaPipelineProblemTemp<FmhaTraits, HDim, FmhaMask>;
-
-                        if constexpr(HDim == 256)
+            using FmhaShape                 = FmhaFwdShape<HDim>;
+            using FmhaTilePartitioner       = FmhaFwdTilePartitioner<FmhaShape>;
+            constexpr ck::index_t occupancy = (HDim == 64) ? 3 : 2;
+
+            bool m0_need_padding   = !(param.M % FmhaShape::kM0 == 0);
+            bool n0k1_need_padding = !(param.N % FmhaShape::kN0 == 0);
+
+            // ToDO: current pipelines all assume kQLoadOnce, which read whole k0
+            // (kK0BlockLength)
+            bool k0n1_need_padding =
+                !(param.K % FmhaShape::kK0BlockLength == 0 && param.Kv % FmhaShape::kN1 == 0);
+
+            BOOL_SWITCH_3(
+                m0_need_padding,
+                kM0NeedPadding,
+                n0k1_need_padding,
+                kN0K1NeedPadding,
+                k0n1_need_padding,
+                kK0N1NeedPadding,
+                [&] {
+                    using FmhaTraits = ck::tile_program::TileFmhaTraits<kM0NeedPadding,
+                                                                        kN0K1NeedPadding,
+                                                                        kK0N1NeedPadding,
+                                                                        has_attn_bias,
+                                                                        false, // kStoreLSE
+                                                                        occupancy>;
+
+                    using FmhaPipelineProblem = FmhaPipelineProblemTemp<FmhaTraits, FmhaMask>;
+
+                    if constexpr(HDim == 256)
+                    {
+                        using FmhaPipeline =
+                            ck::tile_program::block::BlockFmhaPipelineQSKSVS<FmhaPipelineProblem>;
+                        using FmhaKernel =
+                            FmhaFwdKernel<FmhaTilePartitioner, FmhaPipeline, FmhaEpilogue>;
+
+                        RunWithKernel<FmhaKernel>(param, stream);
+                    }
+                    else
+                    {
+                        constexpr bool no_any_padding =
+                            !(kM0NeedPadding || kN0K1NeedPadding || kK0N1NeedPadding);
+
+                        if constexpr(no_any_padding)
                         {
-                            using FmhaPipeline = ck::tile_program::block::BlockFmhaPipelineQSKSVS<
-                                FmhaPipelineProblem>;
+                            using FmhaPipeline =
+                                ck::tile_program::block::BlockFmhaPipelineQRKSVSAsync<
+                                    FmhaPipelineProblem>;
                             using FmhaKernel =
                                 FmhaFwdKernel<FmhaTilePartitioner, FmhaPipeline, FmhaEpilogue>;
 
@@ -111,32 +124,15 @@ struct batched_infer_causalmask_attnbias_dispatched
                         }
                         else
                         {
-                            constexpr bool no_any_padding =
-                                !(kM0NeedPadding || kN0K1NeedPadding || kK0N1NeedPadding);
-
-                            if constexpr(no_any_padding)
-                            {
-                                using FmhaPipeline =
-                                    ck::tile_program::block::BlockFmhaPipelineQRKSVSAsync<
-                                        FmhaPipelineProblem>;
-                                using FmhaKernel =
-                                    FmhaFwdKernel<FmhaTilePartitioner, FmhaPipeline, FmhaEpilogue>;
-
-                                RunWithKernel<FmhaKernel>(param, stream);
-                            }
-                            else
-                            {
-                                using FmhaPipeline =
-                                    ck::tile_program::block::BlockFmhaPipelineQRKSVS<
-                                        FmhaPipelineProblem>;
-                                using FmhaKernel =
-                                    FmhaFwdKernel<FmhaTilePartitioner, FmhaPipeline, FmhaEpilogue>;
-
-                                RunWithKernel<FmhaKernel>(param, stream);
-                            };
+                            using FmhaPipeline = ck::tile_program::block::BlockFmhaPipelineQRKSVS<
+                                FmhaPipelineProblem>;
+                            using FmhaKernel =
+                                FmhaFwdKernel<FmhaTilePartitioner, FmhaPipeline, FmhaEpilogue>;
+
+                            RunWithKernel<FmhaKernel>(param, stream);
                         };
-                    });
-            });
+                    };
+                });
         });
     };
 
@@ -187,10 +183,10 @@ struct batched_infer_causalmask_attnbias_dispatched
     };
 };
 
-template <typename scalar_t, bool has_causal_mask, bool has_attn_bias>
+template <typename scalar_t, bool has_causal_mask, bool has_attn_bias, ck::index_t HDim>
 void run_batched_infer_causalmask_attnbias_dispatched(BatchedForwardParams& param,
                                                       hipStream_t stream)
 {
-    batched_infer_causalmask_attnbias_dispatched<scalar_t, has_causal_mask, has_attn_bias>::Run(
-        param, stream);
+    batched_infer_causalmask_attnbias_dispatched<scalar_t, has_causal_mask, has_attn_bias, HDim>::
+        Run(param, stream);
 };
diff --git a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_infer_bp16.cpp b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_infer_bp16.cpp
index 815fee897..93b7be27a 100644
--- a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_infer_bp16.cpp
+++ b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_infer_bp16.cpp
@@ -11,31 +11,60 @@
 #include "ck_tiled_bool_switch.h"
 #include "ck_tiled_fmha_batched_infer.h"
 
-extern template void run_batched_infer_causalmask_attnbias_dispatched<ck::bhalf_t, false, true>(
+// clang-format off
+extern template void run_batched_infer_causalmask_attnbias_dispatched<ck::bhalf_t, false, true, 32>(
+    BatchedForwardParams& param, hipStream_t stream);
+extern template void run_batched_infer_causalmask_attnbias_dispatched<ck::bhalf_t, false, false, 32>(
+    BatchedForwardParams& param, hipStream_t stream);
+extern template void run_batched_infer_causalmask_attnbias_dispatched<ck::bhalf_t, true, true, 32>(
+    BatchedForwardParams& param, hipStream_t stream);
+extern template void run_batched_infer_causalmask_attnbias_dispatched<ck::bhalf_t, true, false, 32>(
     BatchedForwardParams& param, hipStream_t stream);
 
-extern template void run_batched_infer_causalmask_attnbias_dispatched<ck::bhalf_t, false, false>(
+extern template void run_batched_infer_causalmask_attnbias_dispatched<ck::bhalf_t, false, true, 64>(
+    BatchedForwardParams& param, hipStream_t stream);
+extern template void run_batched_infer_causalmask_attnbias_dispatched<ck::bhalf_t, false, false, 64>(
+    BatchedForwardParams& param, hipStream_t stream);
+extern template void run_batched_infer_causalmask_attnbias_dispatched<ck::bhalf_t, true, true, 64>(
+    BatchedForwardParams& param, hipStream_t stream);
+extern template void run_batched_infer_causalmask_attnbias_dispatched<ck::bhalf_t, true, false, 64>(
     BatchedForwardParams& param, hipStream_t stream);
 
-extern template void run_batched_infer_causalmask_attnbias_dispatched<ck::bhalf_t, true, true>(
+extern template void run_batched_infer_causalmask_attnbias_dispatched<ck::bhalf_t, false, true, 128>(
+    BatchedForwardParams& param, hipStream_t stream);
+extern template void run_batched_infer_causalmask_attnbias_dispatched<ck::bhalf_t, false, false, 128>(
+    BatchedForwardParams& param, hipStream_t stream);
+extern template void run_batched_infer_causalmask_attnbias_dispatched<ck::bhalf_t, true, true, 128>(
+    BatchedForwardParams& param, hipStream_t stream);
+extern template void run_batched_infer_causalmask_attnbias_dispatched<ck::bhalf_t, true, false, 128>(
     BatchedForwardParams& param, hipStream_t stream);
 
-extern template void run_batched_infer_causalmask_attnbias_dispatched<ck::bhalf_t, true, false>(
+extern template void run_batched_infer_causalmask_attnbias_dispatched<ck::bhalf_t, false, true, 256>(
+    BatchedForwardParams& param, hipStream_t stream);
+extern template void run_batched_infer_causalmask_attnbias_dispatched<ck::bhalf_t, false, false, 256>(
+    BatchedForwardParams& param, hipStream_t stream);
+extern template void run_batched_infer_causalmask_attnbias_dispatched<ck::bhalf_t, true, true, 256>(
+    BatchedForwardParams& param, hipStream_t stream);
+extern template void run_batched_infer_causalmask_attnbias_dispatched<ck::bhalf_t, true, false, 256>(
     BatchedForwardParams& param, hipStream_t stream);
+// clang-format on
 
 void batched_infer_bp16(BatchedForwardParams& param, hipStream_t stream)
 {
     BOOL_SWITCH(param.has_attn_bias, HAS_ATTN_BIAS, [&] {
-        if(param.custom_mask_type == 0)
-            run_batched_infer_causalmask_attnbias_dispatched<ck::bhalf_t, false, HAS_ATTN_BIAS>(
-                param, stream);
-        else if(param.custom_mask_type == 1)
-            run_batched_infer_causalmask_attnbias_dispatched<ck::bhalf_t, true, HAS_ATTN_BIAS>(
-                param, stream);
-        else if(param.custom_mask_type == 2)
-            run_batched_infer_causalmask_attnbias_dispatched<ck::bhalf_t, true, HAS_ATTN_BIAS>(
-                param, stream);
-        else
-            throw std::runtime_error("Invalid custom_mask_type value");
+        FMHA_FWD_HEADDIM_SWITCH(param.K, param.Kv, HDim, [&] {
+            if(param.custom_mask_type == 0)
+                run_batched_infer_causalmask_attnbias_dispatched<ck::bhalf_t,
+                                                                 false,
+                                                                 HAS_ATTN_BIAS,
+                                                                 HDim>(param, stream);
+            else if(param.custom_mask_type == 1 || param.custom_mask_type == 2)
+                run_batched_infer_causalmask_attnbias_dispatched<ck::bhalf_t,
+                                                                 true,
+                                                                 HAS_ATTN_BIAS,
+                                                                 HDim>(param, stream);
+            else
+                throw std::runtime_error("Invalid custom_mask_type value");
+        });
     });
 };
diff --git a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_infer_fp16.cpp b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_infer_fp16.cpp
index 3f3a61fb0..170af665d 100644
--- a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_infer_fp16.cpp
+++ b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_infer_fp16.cpp
@@ -11,31 +11,60 @@
 #include "ck_tiled_bool_switch.h"
 #include "ck_tiled_fmha_batched_infer.h"
 
-extern template void run_batched_infer_causalmask_attnbias_dispatched<ck::half_t, false, true>(
+// clang-format off
+extern template void run_batched_infer_causalmask_attnbias_dispatched<ck::half_t, false, true, 32>(
+    BatchedForwardParams& param, hipStream_t stream);
+extern template void run_batched_infer_causalmask_attnbias_dispatched<ck::half_t, false, false, 32>(
+    BatchedForwardParams& param, hipStream_t stream);
+extern template void run_batched_infer_causalmask_attnbias_dispatched<ck::half_t, true, true, 32>(
+    BatchedForwardParams& param, hipStream_t stream);
+extern template void run_batched_infer_causalmask_attnbias_dispatched<ck::half_t, true, false, 32>(
     BatchedForwardParams& param, hipStream_t stream);
 
-extern template void run_batched_infer_causalmask_attnbias_dispatched<ck::half_t, false, false>(
+extern template void run_batched_infer_causalmask_attnbias_dispatched<ck::half_t, false, true, 64>(
+    BatchedForwardParams& param, hipStream_t stream);
+extern template void run_batched_infer_causalmask_attnbias_dispatched<ck::half_t, false, false, 64>(
+    BatchedForwardParams& param, hipStream_t stream);
+extern template void run_batched_infer_causalmask_attnbias_dispatched<ck::half_t, true, true, 64>(
+    BatchedForwardParams& param, hipStream_t stream);
+extern template void run_batched_infer_causalmask_attnbias_dispatched<ck::half_t, true, false, 64>(
     BatchedForwardParams& param, hipStream_t stream);
 
-extern template void run_batched_infer_causalmask_attnbias_dispatched<ck::half_t, true, true>(
+extern template void run_batched_infer_causalmask_attnbias_dispatched<ck::half_t, false, true, 128>(
+    BatchedForwardParams& param, hipStream_t stream);
+extern template void run_batched_infer_causalmask_attnbias_dispatched<ck::half_t, false, false, 128>(
+    BatchedForwardParams& param, hipStream_t stream);
+extern template void run_batched_infer_causalmask_attnbias_dispatched<ck::half_t, true, true, 128>(
+    BatchedForwardParams& param, hipStream_t stream);
+extern template void run_batched_infer_causalmask_attnbias_dispatched<ck::half_t, true, false, 128>(
     BatchedForwardParams& param, hipStream_t stream);
 
-extern template void run_batched_infer_causalmask_attnbias_dispatched<ck::half_t, true, false>(
+extern template void run_batched_infer_causalmask_attnbias_dispatched<ck::half_t, false, true, 256>(
+    BatchedForwardParams& param, hipStream_t stream);
+extern template void run_batched_infer_causalmask_attnbias_dispatched<ck::half_t, false, false, 256>(
+    BatchedForwardParams& param, hipStream_t stream);
+extern template void run_batched_infer_causalmask_attnbias_dispatched<ck::half_t, true, true, 256>(
+    BatchedForwardParams& param, hipStream_t stream);
+extern template void run_batched_infer_causalmask_attnbias_dispatched<ck::half_t, true, false, 256>(
     BatchedForwardParams& param, hipStream_t stream);
+// clang-format on
 
 void batched_infer_fp16(BatchedForwardParams& param, hipStream_t stream)
 {
     BOOL_SWITCH(param.has_attn_bias, HAS_ATTN_BIAS, [&] {
-        if(param.custom_mask_type == 0)
-            run_batched_infer_causalmask_attnbias_dispatched<ck::half_t, false, HAS_ATTN_BIAS>(
-                param, stream);
-        else if(param.custom_mask_type == 1)
-            run_batched_infer_causalmask_attnbias_dispatched<ck::half_t, true, HAS_ATTN_BIAS>(
-                param, stream);
-        else if(param.custom_mask_type == 2)
-            run_batched_infer_causalmask_attnbias_dispatched<ck::half_t, true, HAS_ATTN_BIAS>(
-                param, stream);
-        else
-            throw std::runtime_error("Invalid custom_mask_type value");
+        FMHA_FWD_HEADDIM_SWITCH(param.K, param.Kv, HDim, [&] {
+            if(param.custom_mask_type == 0)
+                run_batched_infer_causalmask_attnbias_dispatched<ck::half_t,
+                                                                 false,
+                                                                 HAS_ATTN_BIAS,
+                                                                 HDim>(param, stream);
+            else if(param.custom_mask_type == 1 || param.custom_mask_type == 2)
+                run_batched_infer_causalmask_attnbias_dispatched<ck::half_t,
+                                                                 true,
+                                                                 HAS_ATTN_BIAS,
+                                                                 HDim>(param, stream);
+            else
+                throw std::runtime_error("Invalid custom_mask_type value");
+        });
     });
 };
diff --git a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_infer.h b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_infer.h
index a996a5eea..ce3585c09 100644
--- a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_infer.h
+++ b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_infer.h
@@ -34,14 +34,14 @@
 #include "ck_tiled_bool_switch.h"
 #include "ck_tiled_headdim_switch.h"
 
-template <typename scalar_t, bool has_causal_mask, bool has_attn_bias>
+template <typename scalar_t, bool has_causal_mask, bool has_attn_bias, ck::index_t HDim>
 struct grouped_infer_causalmask_attnbias_dispatched
 {
     using FmhaEpilogue =
         FmhaFwdEpilogue<FmhaFwdEpilogueProblem<typename FmhaFwdTypeConfig<scalar_t>::OaccDataType,
                                                typename FmhaFwdTypeConfig<scalar_t>::ODataType>>;
 
-    template <typename FmhaTraits, ck::index_t HDim, typename FmhaMask>
+    template <typename FmhaTraits, typename FmhaMask>
     using FmhaPipelineProblemTemp = ck::tile_program::block::BlockFmhaPipelineProblem<
         typename FmhaFwdTypeConfig<scalar_t>::QDataType,
         typename FmhaFwdTypeConfig<scalar_t>::KDataType,
@@ -69,46 +69,44 @@ struct grouped_infer_causalmask_attnbias_dispatched
             using FmhaMask =
                 ck::tile_program::block::GenericAttentionMask<has_masking, USE_LOCAL_ATTENTION>;
 
-            FMHA_FWD_HEADDIM_SWITCH(param.K, param.Kv, HDim, [&] {
-                using FmhaShape                 = FmhaFwdShape<HDim>;
-                using FmhaTilePartitioner       = FmhaFwdTilePartitioner<FmhaShape>;
-                constexpr ck::index_t occupancy = (HDim == 64) ? 3 : 2;
-
-                bool k0n1_need_padding =
-                    !(param.K % FmhaShape::kK0BlockLength == 0 && param.Kv % FmhaShape::kN1 == 0);
-
-                constexpr bool kM0NeedPadding   = true;
-                constexpr bool kN0K1NeedPadding = true;
-
-                BOOL_SWITCH(k0n1_need_padding, kK0N1NeedPadding, [&] {
-                    using FmhaTraits = ck::tile_program::TileFmhaTraits<kM0NeedPadding,
-                                                                        kN0K1NeedPadding,
-                                                                        kK0N1NeedPadding,
-                                                                        has_attn_bias,
-                                                                        false, // kStoreLSE
-                                                                        occupancy>;
-
-                    using FmhaPipelineProblem = FmhaPipelineProblemTemp<FmhaTraits, HDim, FmhaMask>;
-
-                    if constexpr(HDim == 256)
-                    {
-                        using FmhaPipeline =
-                            ck::tile_program::block::BlockFmhaPipelineQSKSVS<FmhaPipelineProblem>;
-                        using FmhaKernel =
-                            FmhaFwdKernel<FmhaTilePartitioner, FmhaPipeline, FmhaEpilogue>;
-
-                        RunWithKernel<FmhaKernel>(param, stream);
-                    }
-                    else
-                    {
-                        using FmhaPipeline =
-                            ck::tile_program::block::BlockFmhaPipelineQRKSVS<FmhaPipelineProblem>;
-                        using FmhaKernel =
-                            FmhaFwdKernel<FmhaTilePartitioner, FmhaPipeline, FmhaEpilogue>;
-
-                        RunWithKernel<FmhaKernel>(param, stream);
-                    }
-                });
+            using FmhaShape                 = FmhaFwdShape<HDim>;
+            using FmhaTilePartitioner       = FmhaFwdTilePartitioner<FmhaShape>;
+            constexpr ck::index_t occupancy = (HDim == 64) ? 3 : 2;
+
+            bool k0n1_need_padding =
+                !(param.K % FmhaShape::kK0BlockLength == 0 && param.Kv % FmhaShape::kN1 == 0);
+
+            constexpr bool kM0NeedPadding   = true;
+            constexpr bool kN0K1NeedPadding = true;
+
+            BOOL_SWITCH(k0n1_need_padding, kK0N1NeedPadding, [&] {
+                using FmhaTraits = ck::tile_program::TileFmhaTraits<kM0NeedPadding,
+                                                                    kN0K1NeedPadding,
+                                                                    kK0N1NeedPadding,
+                                                                    has_attn_bias,
+                                                                    false, // kStoreLSE
+                                                                    occupancy>;
+
+                using FmhaPipelineProblem = FmhaPipelineProblemTemp<FmhaTraits, FmhaMask>;
+
+                if constexpr(HDim == 256)
+                {
+                    using FmhaPipeline =
+                        ck::tile_program::block::BlockFmhaPipelineQSKSVS<FmhaPipelineProblem>;
+                    using FmhaKernel =
+                        FmhaFwdKernel<FmhaTilePartitioner, FmhaPipeline, FmhaEpilogue>;
+
+                    RunWithKernel<FmhaKernel>(param, stream);
+                }
+                else
+                {
+                    using FmhaPipeline =
+                        ck::tile_program::block::BlockFmhaPipelineQRKSVS<FmhaPipelineProblem>;
+                    using FmhaKernel =
+                        FmhaFwdKernel<FmhaTilePartitioner, FmhaPipeline, FmhaEpilogue>;
+
+                    RunWithKernel<FmhaKernel>(param, stream);
+                }
             });
         });
     };
@@ -156,10 +154,10 @@ struct grouped_infer_causalmask_attnbias_dispatched
     };
 };
 
-template <typename scalar_t, bool has_causal_mask, bool has_attn_bias>
+template <typename scalar_t, bool has_causal_mask, bool has_attn_bias, ck::index_t HDim>
 void run_grouped_infer_causalmask_attnbias_dispatched(GroupedForwardParams& param,
                                                       hipStream_t stream)
 {
-    grouped_infer_causalmask_attnbias_dispatched<scalar_t, has_causal_mask, has_attn_bias>::Run(
-        param, stream);
+    grouped_infer_causalmask_attnbias_dispatched<scalar_t, has_causal_mask, has_attn_bias, HDim>::
+        Run(param, stream);
 };
diff --git a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_infer_bp16.cpp b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_infer_bp16.cpp
index f942d1bbb..5402ac327 100644
--- a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_infer_bp16.cpp
+++ b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_infer_bp16.cpp
@@ -11,31 +11,60 @@
 #include "ck_tiled_bool_switch.h"
 #include "ck_tiled_fmha_grouped_infer.h"
 
-extern template void run_grouped_infer_causalmask_attnbias_dispatched<ck::bhalf_t, false, true>(
+// clang-format off
+extern template void run_grouped_infer_causalmask_attnbias_dispatched<ck::bhalf_t, false, true, 32>(
+    GroupedForwardParams& param, hipStream_t stream);
+extern template void run_grouped_infer_causalmask_attnbias_dispatched<ck::bhalf_t, false, false, 32>(
+    GroupedForwardParams& param, hipStream_t stream);
+extern template void run_grouped_infer_causalmask_attnbias_dispatched<ck::bhalf_t, true, true, 32>(
+    GroupedForwardParams& param, hipStream_t stream);
+extern template void run_grouped_infer_causalmask_attnbias_dispatched<ck::bhalf_t, true, false, 32>(
     GroupedForwardParams& param, hipStream_t stream);
 
-extern template void run_grouped_infer_causalmask_attnbias_dispatched<ck::bhalf_t, false, false>(
+extern template void run_grouped_infer_causalmask_attnbias_dispatched<ck::bhalf_t, false, true, 64>(
+    GroupedForwardParams& param, hipStream_t stream);
+extern template void run_grouped_infer_causalmask_attnbias_dispatched<ck::bhalf_t, false, false, 64>(
+    GroupedForwardParams& param, hipStream_t stream);
+extern template void run_grouped_infer_causalmask_attnbias_dispatched<ck::bhalf_t, true, true, 64>(
+    GroupedForwardParams& param, hipStream_t stream);
+extern template void run_grouped_infer_causalmask_attnbias_dispatched<ck::bhalf_t, true, false, 64>(
     GroupedForwardParams& param, hipStream_t stream);
 
-extern template void run_grouped_infer_causalmask_attnbias_dispatched<ck::bhalf_t, true, true>(
+extern template void run_grouped_infer_causalmask_attnbias_dispatched<ck::bhalf_t, false, true, 128>(
+    GroupedForwardParams& param, hipStream_t stream);
+extern template void run_grouped_infer_causalmask_attnbias_dispatched<ck::bhalf_t, false, false, 128>(
+    GroupedForwardParams& param, hipStream_t stream);
+extern template void run_grouped_infer_causalmask_attnbias_dispatched<ck::bhalf_t, true, true, 128>(
+    GroupedForwardParams& param, hipStream_t stream);
+extern template void run_grouped_infer_causalmask_attnbias_dispatched<ck::bhalf_t, true, false, 128>(
     GroupedForwardParams& param, hipStream_t stream);
 
-extern template void run_grouped_infer_causalmask_attnbias_dispatched<ck::bhalf_t, true, false>(
+extern template void run_grouped_infer_causalmask_attnbias_dispatched<ck::bhalf_t, false, true, 256>(
+    GroupedForwardParams& param, hipStream_t stream);
+extern template void run_grouped_infer_causalmask_attnbias_dispatched<ck::bhalf_t, false, false, 256>(
+    GroupedForwardParams& param, hipStream_t stream);
+extern template void run_grouped_infer_causalmask_attnbias_dispatched<ck::bhalf_t, true, true, 256>(
+    GroupedForwardParams& param, hipStream_t stream);
+extern template void run_grouped_infer_causalmask_attnbias_dispatched<ck::bhalf_t, true, false, 256>(
     GroupedForwardParams& param, hipStream_t stream);
+// clang-format on
 
 void grouped_infer_bp16(GroupedForwardParams& param, hipStream_t stream)
 {
     BOOL_SWITCH(param.has_attn_bias, HAS_ATTN_BIAS, [&] {
-        if(param.custom_mask_type == 0)
-            run_grouped_infer_causalmask_attnbias_dispatched<ck::bhalf_t, false, HAS_ATTN_BIAS>(
-                param, stream);
-        else if(param.custom_mask_type == 1)
-            run_grouped_infer_causalmask_attnbias_dispatched<ck::bhalf_t, true, HAS_ATTN_BIAS>(
-                param, stream);
-        else if(param.custom_mask_type == 2)
-            run_grouped_infer_causalmask_attnbias_dispatched<ck::bhalf_t, true, HAS_ATTN_BIAS>(
-                param, stream);
-        else
-            throw std::runtime_error("Invalid custom_mask_type value");
+        FMHA_FWD_HEADDIM_SWITCH(param.K, param.Kv, HDim, [&] {
+            if(param.custom_mask_type == 0)
+                run_grouped_infer_causalmask_attnbias_dispatched<ck::bhalf_t,
+                                                                 false,
+                                                                 HAS_ATTN_BIAS,
+                                                                 HDim>(param, stream);
+            else if(param.custom_mask_type == 1 || param.custom_mask_type == 2)
+                run_grouped_infer_causalmask_attnbias_dispatched<ck::bhalf_t,
+                                                                 true,
+                                                                 HAS_ATTN_BIAS,
+                                                                 HDim>(param, stream);
+            else
+                throw std::runtime_error("Invalid custom_mask_type value");
+        });
     });
 };
diff --git a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_infer_fp16.cpp b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_infer_fp16.cpp
index 288ad5f57..17623121b 100644
--- a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_infer_fp16.cpp
+++ b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_infer_fp16.cpp
@@ -11,31 +11,60 @@
 #include "ck_tiled_bool_switch.h"
 #include "ck_tiled_fmha_grouped_infer.h"
 
-extern template void run_grouped_infer_causalmask_attnbias_dispatched<ck::half_t, false, true>(
+// clang-format off
+extern template void run_grouped_infer_causalmask_attnbias_dispatched<ck::half_t, false, true, 32>(
+    GroupedForwardParams& param, hipStream_t stream);
+extern template void run_grouped_infer_causalmask_attnbias_dispatched<ck::half_t, false, false, 32>(
+    GroupedForwardParams& param, hipStream_t stream);
+extern template void run_grouped_infer_causalmask_attnbias_dispatched<ck::half_t, true, true, 32>(
+    GroupedForwardParams& param, hipStream_t stream);
+extern template void run_grouped_infer_causalmask_attnbias_dispatched<ck::half_t, true, false, 32>(
     GroupedForwardParams& param, hipStream_t stream);
 
-extern template void run_grouped_infer_causalmask_attnbias_dispatched<ck::half_t, false, false>(
+extern template void run_grouped_infer_causalmask_attnbias_dispatched<ck::half_t, false, true, 64>(
+    GroupedForwardParams& param, hipStream_t stream);
+extern template void run_grouped_infer_causalmask_attnbias_dispatched<ck::half_t, false, false, 64>(
+    GroupedForwardParams& param, hipStream_t stream);
+extern template void run_grouped_infer_causalmask_attnbias_dispatched<ck::half_t, true, true, 64>(
+    GroupedForwardParams& param, hipStream_t stream);
+extern template void run_grouped_infer_causalmask_attnbias_dispatched<ck::half_t, true, false, 64>(
     GroupedForwardParams& param, hipStream_t stream);
 
-extern template void run_grouped_infer_causalmask_attnbias_dispatched<ck::half_t, true, true>(
+extern template void run_grouped_infer_causalmask_attnbias_dispatched<ck::half_t, false, true, 128>(
+    GroupedForwardParams& param, hipStream_t stream);
+extern template void run_grouped_infer_causalmask_attnbias_dispatched<ck::half_t, false, false, 128>(
+    GroupedForwardParams& param, hipStream_t stream);
+extern template void run_grouped_infer_causalmask_attnbias_dispatched<ck::half_t, true, true, 128>(
+    GroupedForwardParams& param, hipStream_t stream);
+extern template void run_grouped_infer_causalmask_attnbias_dispatched<ck::half_t, true, false, 128>(
     GroupedForwardParams& param, hipStream_t stream);
 
-extern template void run_grouped_infer_causalmask_attnbias_dispatched<ck::half_t, true, false>(
+extern template void run_grouped_infer_causalmask_attnbias_dispatched<ck::half_t, false, true, 256>(
+    GroupedForwardParams& param, hipStream_t stream);
+extern template void run_grouped_infer_causalmask_attnbias_dispatched<ck::half_t, false, false, 256>(
+    GroupedForwardParams& param, hipStream_t stream);
+extern template void run_grouped_infer_causalmask_attnbias_dispatched<ck::half_t, true, true, 256>(
+    GroupedForwardParams& param, hipStream_t stream);
+extern template void run_grouped_infer_causalmask_attnbias_dispatched<ck::half_t, true, false, 256>(
     GroupedForwardParams& param, hipStream_t stream);
+// clang-format on
 
 void grouped_infer_fp16(GroupedForwardParams& param, hipStream_t stream)
 {
     BOOL_SWITCH(param.has_attn_bias, HAS_ATTN_BIAS, [&] {
-        if(param.custom_mask_type == 0)
-            run_grouped_infer_causalmask_attnbias_dispatched<ck::half_t, false, HAS_ATTN_BIAS>(
-                param, stream);
-        else if(param.custom_mask_type == 1)
-            run_grouped_infer_causalmask_attnbias_dispatched<ck::half_t, true, HAS_ATTN_BIAS>(
-                param, stream);
-        else if(param.custom_mask_type == 2)
-            run_grouped_infer_causalmask_attnbias_dispatched<ck::half_t, true, HAS_ATTN_BIAS>(
-                param, stream);
-        else
-            throw std::runtime_error("Invalid custom_mask_type value");
+        FMHA_FWD_HEADDIM_SWITCH(param.K, param.Kv, HDim, [&] {
+            if(param.custom_mask_type == 0)
+                run_grouped_infer_causalmask_attnbias_dispatched<ck::half_t,
+                                                                 false,
+                                                                 HAS_ATTN_BIAS,
+                                                                 HDim>(param, stream);
+            else if(param.custom_mask_type == 1 || param.custom_mask_type == 2)
+                run_grouped_infer_causalmask_attnbias_dispatched<ck::half_t,
+                                                                 true,
+                                                                 HAS_ATTN_BIAS,
+                                                                 HDim>(param, stream);
+            else
+                throw std::runtime_error("Invalid custom_mask_type value");
+        });
     });
 };
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_bp16_with_causalmask_no_attnbias.cpp "b/xformers/csrc/attention/hip_fmha/instances_tiled/\\"
similarity index 93%
rename from xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_bp16_with_causalmask_no_attnbias.cpp
rename to "xformers/csrc/attention/hip_fmha/instances_tiled/\\"
index 55100393d..e7f76cd58 100644
--- a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_bp16_with_causalmask_no_attnbias.cpp
+++ "b/xformers/csrc/attention/hip_fmha/instances_tiled/\\"
@@ -8,5 +8,5 @@
 
 #include "ck_tiled_fmha_batched_infer.h"
 
-template void run_batched_infer_causalmask_attnbias_dispatched<ck::bhalf_t, true, false>(
+template void run_batched_infer_causalmask_attnbias_dispatched<ck::half_t, true, true, 128>(
     BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_bp16_no_causalmask_no_attnbias_headdim_128.cpp b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_bp16_no_causalmask_no_attnbias_headdim_128.cpp
new file mode 100644
index 000000000..17c5ab864
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_bp16_no_causalmask_no_attnbias_headdim_128.cpp
@@ -0,0 +1,12 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_batched_infer.h"
+
+template void run_batched_infer_causalmask_attnbias_dispatched<ck::bhalf_t, false, false, 128>(
+    BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_bp16_no_causalmask_no_attnbias_headdim_256.cpp b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_bp16_no_causalmask_no_attnbias_headdim_256.cpp
new file mode 100644
index 000000000..38b8aa3b7
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_bp16_no_causalmask_no_attnbias_headdim_256.cpp
@@ -0,0 +1,12 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_batched_infer.h"
+
+template void run_batched_infer_causalmask_attnbias_dispatched<ck::bhalf_t, false, false, 256>(
+    BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_bp16_no_causalmask_no_attnbias_headdim_32.cpp b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_bp16_no_causalmask_no_attnbias_headdim_32.cpp
new file mode 100644
index 000000000..f2d976897
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_bp16_no_causalmask_no_attnbias_headdim_32.cpp
@@ -0,0 +1,12 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_batched_infer.h"
+
+template void run_batched_infer_causalmask_attnbias_dispatched<ck::bhalf_t, false, false, 32>(
+    BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_bp16_no_causalmask_no_attnbias_headdim_64.cpp b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_bp16_no_causalmask_no_attnbias_headdim_64.cpp
new file mode 100644
index 000000000..a8d2b933a
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_bp16_no_causalmask_no_attnbias_headdim_64.cpp
@@ -0,0 +1,12 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_batched_infer.h"
+
+template void run_batched_infer_causalmask_attnbias_dispatched<ck::bhalf_t, false, false, 64>(
+    BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_bp16_no_causalmask_with_attnbias_headdim_128.cpp b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_bp16_no_causalmask_with_attnbias_headdim_128.cpp
new file mode 100644
index 000000000..bcee71741
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_bp16_no_causalmask_with_attnbias_headdim_128.cpp
@@ -0,0 +1,12 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_batched_infer.h"
+
+template void run_batched_infer_causalmask_attnbias_dispatched<ck::bhalf_t, false, true, 128>(
+    BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_bp16_no_causalmask_with_attnbias_headdim_256.cpp b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_bp16_no_causalmask_with_attnbias_headdim_256.cpp
new file mode 100644
index 000000000..485ff4b64
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_bp16_no_causalmask_with_attnbias_headdim_256.cpp
@@ -0,0 +1,12 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_batched_infer.h"
+
+template void run_batched_infer_causalmask_attnbias_dispatched<ck::bhalf_t, false, true, 256>(
+    BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_bp16_no_causalmask_with_attnbias_headdim_32.cpp b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_bp16_no_causalmask_with_attnbias_headdim_32.cpp
new file mode 100644
index 000000000..496c34c61
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_bp16_no_causalmask_with_attnbias_headdim_32.cpp
@@ -0,0 +1,12 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_batched_infer.h"
+
+template void run_batched_infer_causalmask_attnbias_dispatched<ck::bhalf_t, false, true, 32>(
+    BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_bp16_no_causalmask_with_attnbias_headdim_64.cpp b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_bp16_no_causalmask_with_attnbias_headdim_64.cpp
new file mode 100644
index 000000000..f52e8fcd8
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_bp16_no_causalmask_with_attnbias_headdim_64.cpp
@@ -0,0 +1,12 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_batched_infer.h"
+
+template void run_batched_infer_causalmask_attnbias_dispatched<ck::bhalf_t, false, true, 64>(
+    BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_bp16_with_causalmask_no_attnbias_headdim_128.cpp b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_bp16_with_causalmask_no_attnbias_headdim_128.cpp
new file mode 100644
index 000000000..2b593af2b
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_bp16_with_causalmask_no_attnbias_headdim_128.cpp
@@ -0,0 +1,12 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_batched_infer.h"
+
+template void run_batched_infer_causalmask_attnbias_dispatched<ck::bhalf_t, true, false, 128>(
+    BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_bp16_with_causalmask_no_attnbias_headdim_256.cpp b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_bp16_with_causalmask_no_attnbias_headdim_256.cpp
new file mode 100644
index 000000000..54871d2ed
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_bp16_with_causalmask_no_attnbias_headdim_256.cpp
@@ -0,0 +1,12 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_batched_infer.h"
+
+template void run_batched_infer_causalmask_attnbias_dispatched<ck::bhalf_t, true, false, 256>(
+    BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_bp16_with_causalmask_no_attnbias_headdim_32.cpp b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_bp16_with_causalmask_no_attnbias_headdim_32.cpp
new file mode 100644
index 000000000..3f7d86019
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_bp16_with_causalmask_no_attnbias_headdim_32.cpp
@@ -0,0 +1,12 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_batched_infer.h"
+
+template void run_batched_infer_causalmask_attnbias_dispatched<ck::bhalf_t, true, false, 32>(
+    BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_bp16_with_causalmask_no_attnbias_headdim_64.cpp b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_bp16_with_causalmask_no_attnbias_headdim_64.cpp
new file mode 100644
index 000000000..400f0aaa4
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_bp16_with_causalmask_no_attnbias_headdim_64.cpp
@@ -0,0 +1,12 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_batched_infer.h"
+
+template void run_batched_infer_causalmask_attnbias_dispatched<ck::bhalf_t, true, false, 64>(
+    BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_bp16_with_causalmask_with_attnbias_headdim_128.cpp b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_bp16_with_causalmask_with_attnbias_headdim_128.cpp
new file mode 100644
index 000000000..f9063434c
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_bp16_with_causalmask_with_attnbias_headdim_128.cpp
@@ -0,0 +1,12 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_batched_infer.h"
+
+template void run_batched_infer_causalmask_attnbias_dispatched<ck::bhalf_t, true, true, 128>(
+    BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_bp16_with_causalmask_with_attnbias_headdim_256.cpp b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_bp16_with_causalmask_with_attnbias_headdim_256.cpp
new file mode 100644
index 000000000..31831836f
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_bp16_with_causalmask_with_attnbias_headdim_256.cpp
@@ -0,0 +1,12 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_batched_infer.h"
+
+template void run_batched_infer_causalmask_attnbias_dispatched<ck::bhalf_t, true, true, 256>(
+    BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_bp16_with_causalmask_with_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_bp16_with_causalmask_with_attnbias_headdim_32.cpp
similarity index 93%
rename from xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_bp16_with_causalmask_with_attnbias.cpp
rename to xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_bp16_with_causalmask_with_attnbias_headdim_32.cpp
index 36438844e..4866c0148 100644
--- a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_bp16_with_causalmask_with_attnbias.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_bp16_with_causalmask_with_attnbias_headdim_32.cpp
@@ -8,5 +8,5 @@
 
 #include "ck_tiled_fmha_batched_infer.h"
 
-template void run_batched_infer_causalmask_attnbias_dispatched<ck::bhalf_t, true, true>(
+template void run_batched_infer_causalmask_attnbias_dispatched<ck::bhalf_t, true, true, 32>(
     BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_bp16_with_causalmask_with_attnbias_headdim_64.cpp b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_bp16_with_causalmask_with_attnbias_headdim_64.cpp
new file mode 100644
index 000000000..c87e7d2c2
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_bp16_with_causalmask_with_attnbias_headdim_64.cpp
@@ -0,0 +1,12 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_batched_infer.h"
+
+template void run_batched_infer_causalmask_attnbias_dispatched<ck::bhalf_t, true, true, 64>(
+    BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_fp16_no_causalmask_no_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_fp16_no_causalmask_no_attnbias.cpp
deleted file mode 100644
index 06957d596..000000000
--- a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_fp16_no_causalmask_no_attnbias.cpp
+++ /dev/null
@@ -1,12 +0,0 @@
-/*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
- *
- * This source code is licensed under the BSD-style license found in the
- * LICENSE file in the root directory of this source tree.
- */
-#include <ck/ck.hpp>
-
-#include "ck_tiled_fmha_batched_infer.h"
-
-template void run_batched_infer_causalmask_attnbias_dispatched<ck::half_t, false, false>(
-    BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_fp16_no_causalmask_no_attnbias_headdim_128.cpp b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_fp16_no_causalmask_no_attnbias_headdim_128.cpp
new file mode 100644
index 000000000..d2b894e6b
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_fp16_no_causalmask_no_attnbias_headdim_128.cpp
@@ -0,0 +1,12 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_batched_infer.h"
+
+template void run_batched_infer_causalmask_attnbias_dispatched<ck::half_t, false, false, 128>(
+    BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_fp16_no_causalmask_no_attnbias_headdim_256.cpp b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_fp16_no_causalmask_no_attnbias_headdim_256.cpp
new file mode 100644
index 000000000..a55ac98be
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_fp16_no_causalmask_no_attnbias_headdim_256.cpp
@@ -0,0 +1,12 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_batched_infer.h"
+
+template void run_batched_infer_causalmask_attnbias_dispatched<ck::half_t, false, false, 256>(
+    BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_fp16_no_causalmask_no_attnbias_headdim_32.cpp b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_fp16_no_causalmask_no_attnbias_headdim_32.cpp
new file mode 100644
index 000000000..ab5c8bb2c
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_fp16_no_causalmask_no_attnbias_headdim_32.cpp
@@ -0,0 +1,12 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_batched_infer.h"
+
+template void run_batched_infer_causalmask_attnbias_dispatched<ck::half_t, false, false, 32>(
+    BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_fp16_no_causalmask_no_attnbias_headdim_64.cpp b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_fp16_no_causalmask_no_attnbias_headdim_64.cpp
new file mode 100644
index 000000000..282750da4
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_fp16_no_causalmask_no_attnbias_headdim_64.cpp
@@ -0,0 +1,12 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_batched_infer.h"
+
+template void run_batched_infer_causalmask_attnbias_dispatched<ck::half_t, false, false, 64>(
+    BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_fp16_no_causalmask_with_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_fp16_no_causalmask_with_attnbias.cpp
deleted file mode 100644
index cae5a03c1..000000000
--- a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_fp16_no_causalmask_with_attnbias.cpp
+++ /dev/null
@@ -1,12 +0,0 @@
-/*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
- *
- * This source code is licensed under the BSD-style license found in the
- * LICENSE file in the root directory of this source tree.
- */
-#include <ck/ck.hpp>
-
-#include "ck_tiled_fmha_batched_infer.h"
-
-template void run_batched_infer_causalmask_attnbias_dispatched<ck::half_t, false, true>(
-    BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_fp16_no_causalmask_with_attnbias_headdim_128.cpp b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_fp16_no_causalmask_with_attnbias_headdim_128.cpp
new file mode 100644
index 000000000..17d3a203b
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_fp16_no_causalmask_with_attnbias_headdim_128.cpp
@@ -0,0 +1,12 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_batched_infer.h"
+
+template void run_batched_infer_causalmask_attnbias_dispatched<ck::half_t, false, true, 128>(
+    BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_fp16_no_causalmask_with_attnbias_headdim_256.cpp b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_fp16_no_causalmask_with_attnbias_headdim_256.cpp
new file mode 100644
index 000000000..e4e7645e8
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_fp16_no_causalmask_with_attnbias_headdim_256.cpp
@@ -0,0 +1,12 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_batched_infer.h"
+
+template void run_batched_infer_causalmask_attnbias_dispatched<ck::half_t, false, true, 256>(
+    BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_fp16_no_causalmask_with_attnbias_headdim_32.cpp b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_fp16_no_causalmask_with_attnbias_headdim_32.cpp
new file mode 100644
index 000000000..1b3a9a7c8
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_fp16_no_causalmask_with_attnbias_headdim_32.cpp
@@ -0,0 +1,12 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_batched_infer.h"
+
+template void run_batched_infer_causalmask_attnbias_dispatched<ck::half_t, false, true, 32>(
+    BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_fp16_no_causalmask_with_attnbias_headdim_64.cpp b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_fp16_no_causalmask_with_attnbias_headdim_64.cpp
new file mode 100644
index 000000000..64c00b096
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_fp16_no_causalmask_with_attnbias_headdim_64.cpp
@@ -0,0 +1,12 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_batched_infer.h"
+
+template void run_batched_infer_causalmask_attnbias_dispatched<ck::half_t, false, true, 64>(
+    BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_fp16_with_causalmask_no_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_fp16_with_causalmask_no_attnbias.cpp
deleted file mode 100644
index f5a42d733..000000000
--- a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_fp16_with_causalmask_no_attnbias.cpp
+++ /dev/null
@@ -1,12 +0,0 @@
-/*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
- *
- * This source code is licensed under the BSD-style license found in the
- * LICENSE file in the root directory of this source tree.
- */
-#include <ck/ck.hpp>
-
-#include "ck_tiled_fmha_batched_infer.h"
-
-template void run_batched_infer_causalmask_attnbias_dispatched<ck::half_t, true, false>(
-    BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_fp16_with_causalmask_no_attnbias_headdim_128.cpp b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_fp16_with_causalmask_no_attnbias_headdim_128.cpp
new file mode 100644
index 000000000..9d24c03b9
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_fp16_with_causalmask_no_attnbias_headdim_128.cpp
@@ -0,0 +1,12 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_batched_infer.h"
+
+template void run_batched_infer_causalmask_attnbias_dispatched<ck::half_t, true, false, 128>(
+    BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_fp16_with_causalmask_no_attnbias_headdim_256.cpp b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_fp16_with_causalmask_no_attnbias_headdim_256.cpp
new file mode 100644
index 000000000..ab81e906d
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_fp16_with_causalmask_no_attnbias_headdim_256.cpp
@@ -0,0 +1,12 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_batched_infer.h"
+
+template void run_batched_infer_causalmask_attnbias_dispatched<ck::half_t, true, false, 256>(
+    BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_fp16_with_causalmask_no_attnbias_headdim_32.cpp b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_fp16_with_causalmask_no_attnbias_headdim_32.cpp
new file mode 100644
index 000000000..5417efb52
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_fp16_with_causalmask_no_attnbias_headdim_32.cpp
@@ -0,0 +1,12 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_batched_infer.h"
+
+template void run_batched_infer_causalmask_attnbias_dispatched<ck::half_t, true, false, 32>(
+    BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_fp16_with_causalmask_no_attnbias_headdim_64.cpp b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_fp16_with_causalmask_no_attnbias_headdim_64.cpp
new file mode 100644
index 000000000..3b55e45b8
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_fp16_with_causalmask_no_attnbias_headdim_64.cpp
@@ -0,0 +1,12 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_batched_infer.h"
+
+template void run_batched_infer_causalmask_attnbias_dispatched<ck::half_t, true, false, 64>(
+    BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_fp16_with_causalmask_with_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_fp16_with_causalmask_with_attnbias.cpp
deleted file mode 100644
index 9f79c2ed5..000000000
--- a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_fp16_with_causalmask_with_attnbias.cpp
+++ /dev/null
@@ -1,12 +0,0 @@
-/*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
- *
- * This source code is licensed under the BSD-style license found in the
- * LICENSE file in the root directory of this source tree.
- */
-#include <ck/ck.hpp>
-
-#include "ck_tiled_fmha_batched_infer.h"
-
-template void run_batched_infer_causalmask_attnbias_dispatched<ck::half_t, true, true>(
-    BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_fp16_with_causalmask_with_attnbias_headdim_128.cpp b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_fp16_with_causalmask_with_attnbias_headdim_128.cpp
new file mode 100644
index 000000000..e7f76cd58
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_fp16_with_causalmask_with_attnbias_headdim_128.cpp
@@ -0,0 +1,12 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_batched_infer.h"
+
+template void run_batched_infer_causalmask_attnbias_dispatched<ck::half_t, true, true, 128>(
+    BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_fp16_with_causalmask_with_attnbias_headdim_256.cpp b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_fp16_with_causalmask_with_attnbias_headdim_256.cpp
new file mode 100644
index 000000000..2d5edfc0f
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_fp16_with_causalmask_with_attnbias_headdim_256.cpp
@@ -0,0 +1,12 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_batched_infer.h"
+
+template void run_batched_infer_causalmask_attnbias_dispatched<ck::half_t, true, true, 256>(
+    BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_bp16_no_causalmask_no_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_fp16_with_causalmask_with_attnbias_headdim_32.cpp
similarity index 93%
rename from xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_bp16_no_causalmask_no_attnbias.cpp
rename to xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_fp16_with_causalmask_with_attnbias_headdim_32.cpp
index 4c06d77aa..ff21e5051 100644
--- a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_bp16_no_causalmask_no_attnbias.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_fp16_with_causalmask_with_attnbias_headdim_32.cpp
@@ -8,5 +8,5 @@
 
 #include "ck_tiled_fmha_batched_infer.h"
 
-template void run_batched_infer_causalmask_attnbias_dispatched<ck::bhalf_t, false, false>(
+template void run_batched_infer_causalmask_attnbias_dispatched<ck::half_t, true, true, 32>(
     BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_bp16_no_causalmask_with_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_fp16_with_causalmask_with_attnbias_headdim_64.cpp
similarity index 93%
rename from xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_bp16_no_causalmask_with_attnbias.cpp
rename to xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_fp16_with_causalmask_with_attnbias_headdim_64.cpp
index 407f20ab4..316457d7b 100644
--- a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_bp16_no_causalmask_with_attnbias.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_fp16_with_causalmask_with_attnbias_headdim_64.cpp
@@ -8,5 +8,5 @@
 
 #include "ck_tiled_fmha_batched_infer.h"
 
-template void run_batched_infer_causalmask_attnbias_dispatched<ck::bhalf_t, false, true>(
+template void run_batched_infer_causalmask_attnbias_dispatched<ck::half_t, true, true, 64>(
     BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_bp16_no_causalmask_no_attnbias_headdim_128.cpp b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_bp16_no_causalmask_no_attnbias_headdim_128.cpp
new file mode 100644
index 000000000..66d6ce7de
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_bp16_no_causalmask_no_attnbias_headdim_128.cpp
@@ -0,0 +1,12 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_grouped_infer.h"
+
+template void run_grouped_infer_causalmask_attnbias_dispatched<ck::bhalf_t, false, false, 128>(
+    GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_bp16_no_causalmask_no_attnbias_headdim_256.cpp b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_bp16_no_causalmask_no_attnbias_headdim_256.cpp
new file mode 100644
index 000000000..819794d6f
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_bp16_no_causalmask_no_attnbias_headdim_256.cpp
@@ -0,0 +1,12 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_grouped_infer.h"
+
+template void run_grouped_infer_causalmask_attnbias_dispatched<ck::bhalf_t, false, false, 256>(
+    GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_bp16_no_causalmask_no_attnbias_headdim_32.cpp b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_bp16_no_causalmask_no_attnbias_headdim_32.cpp
new file mode 100644
index 000000000..fa94726d7
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_bp16_no_causalmask_no_attnbias_headdim_32.cpp
@@ -0,0 +1,12 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_grouped_infer.h"
+
+template void run_grouped_infer_causalmask_attnbias_dispatched<ck::bhalf_t, false, false, 32>(
+    GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_bp16_no_causalmask_no_attnbias_headdim_64.cpp b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_bp16_no_causalmask_no_attnbias_headdim_64.cpp
new file mode 100644
index 000000000..d8f96bdb9
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_bp16_no_causalmask_no_attnbias_headdim_64.cpp
@@ -0,0 +1,12 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_grouped_infer.h"
+
+template void run_grouped_infer_causalmask_attnbias_dispatched<ck::bhalf_t, false, false, 64>(
+    GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_bp16_no_causalmask_with_attnbias_headdim_128.cpp b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_bp16_no_causalmask_with_attnbias_headdim_128.cpp
new file mode 100644
index 000000000..c42eade65
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_bp16_no_causalmask_with_attnbias_headdim_128.cpp
@@ -0,0 +1,12 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_grouped_infer.h"
+
+template void run_grouped_infer_causalmask_attnbias_dispatched<ck::bhalf_t, false, true, 128>(
+    GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_bp16_no_causalmask_with_attnbias_headdim_256.cpp b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_bp16_no_causalmask_with_attnbias_headdim_256.cpp
new file mode 100644
index 000000000..357eb57b1
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_bp16_no_causalmask_with_attnbias_headdim_256.cpp
@@ -0,0 +1,12 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_grouped_infer.h"
+
+template void run_grouped_infer_causalmask_attnbias_dispatched<ck::bhalf_t, false, true, 256>(
+    GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_bp16_no_causalmask_with_attnbias_headdim_32.cpp b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_bp16_no_causalmask_with_attnbias_headdim_32.cpp
new file mode 100644
index 000000000..6ad131cd6
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_bp16_no_causalmask_with_attnbias_headdim_32.cpp
@@ -0,0 +1,12 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_grouped_infer.h"
+
+template void run_grouped_infer_causalmask_attnbias_dispatched<ck::bhalf_t, false, true, 32>(
+    GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_bp16_no_causalmask_with_attnbias_headdim_64.cpp b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_bp16_no_causalmask_with_attnbias_headdim_64.cpp
new file mode 100644
index 000000000..f6131197a
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_bp16_no_causalmask_with_attnbias_headdim_64.cpp
@@ -0,0 +1,12 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_grouped_infer.h"
+
+template void run_grouped_infer_causalmask_attnbias_dispatched<ck::bhalf_t, false, true, 64>(
+    GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_bp16_with_causalmask_no_attnbias_headdim_128.cpp b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_bp16_with_causalmask_no_attnbias_headdim_128.cpp
new file mode 100644
index 000000000..15c6d599a
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_bp16_with_causalmask_no_attnbias_headdim_128.cpp
@@ -0,0 +1,12 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_grouped_infer.h"
+
+template void run_grouped_infer_causalmask_attnbias_dispatched<ck::bhalf_t, true, false, 128>(
+    GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_bp16_with_causalmask_no_attnbias_headdim_256.cpp b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_bp16_with_causalmask_no_attnbias_headdim_256.cpp
new file mode 100644
index 000000000..7f7229c8b
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_bp16_with_causalmask_no_attnbias_headdim_256.cpp
@@ -0,0 +1,12 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_grouped_infer.h"
+
+template void run_grouped_infer_causalmask_attnbias_dispatched<ck::bhalf_t, true, false, 256>(
+    GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_bp16_with_causalmask_no_attnbias_headdim_32.cpp b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_bp16_with_causalmask_no_attnbias_headdim_32.cpp
new file mode 100644
index 000000000..bdc6996c2
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_bp16_with_causalmask_no_attnbias_headdim_32.cpp
@@ -0,0 +1,12 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_grouped_infer.h"
+
+template void run_grouped_infer_causalmask_attnbias_dispatched<ck::bhalf_t, true, false, 32>(
+    GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_bp16_with_causalmask_no_attnbias_headdim_64.cpp b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_bp16_with_causalmask_no_attnbias_headdim_64.cpp
new file mode 100644
index 000000000..15ac95e27
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_bp16_with_causalmask_no_attnbias_headdim_64.cpp
@@ -0,0 +1,12 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_grouped_infer.h"
+
+template void run_grouped_infer_causalmask_attnbias_dispatched<ck::bhalf_t, true, false, 64>(
+    GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_bp16_with_causalmask_with_attnbias_headdim_128.cpp b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_bp16_with_causalmask_with_attnbias_headdim_128.cpp
new file mode 100644
index 000000000..4bd616c5d
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_bp16_with_causalmask_with_attnbias_headdim_128.cpp
@@ -0,0 +1,12 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_grouped_infer.h"
+
+template void run_grouped_infer_causalmask_attnbias_dispatched<ck::bhalf_t, true, true, 128>(
+    GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_bp16_with_causalmask_with_attnbias_headdim_256.cpp b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_bp16_with_causalmask_with_attnbias_headdim_256.cpp
new file mode 100644
index 000000000..05e935716
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_bp16_with_causalmask_with_attnbias_headdim_256.cpp
@@ -0,0 +1,12 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_grouped_infer.h"
+
+template void run_grouped_infer_causalmask_attnbias_dispatched<ck::bhalf_t, true, true, 256>(
+    GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_bp16_with_causalmask_no_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_bp16_with_causalmask_with_attnbias_headdim_32.cpp
similarity index 93%
rename from xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_bp16_with_causalmask_no_attnbias.cpp
rename to xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_bp16_with_causalmask_with_attnbias_headdim_32.cpp
index 716a48b9c..a72f0e811 100644
--- a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_bp16_with_causalmask_no_attnbias.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_bp16_with_causalmask_with_attnbias_headdim_32.cpp
@@ -8,5 +8,5 @@
 
 #include "ck_tiled_fmha_grouped_infer.h"
 
-template void run_grouped_infer_causalmask_attnbias_dispatched<ck::bhalf_t, true, false>(
+template void run_grouped_infer_causalmask_attnbias_dispatched<ck::bhalf_t, true, true, 32>(
     GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_bp16_with_causalmask_with_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_bp16_with_causalmask_with_attnbias_headdim_64.cpp
similarity index 93%
rename from xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_bp16_with_causalmask_with_attnbias.cpp
rename to xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_bp16_with_causalmask_with_attnbias_headdim_64.cpp
index f79e7ee14..99e86651c 100644
--- a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_bp16_with_causalmask_with_attnbias.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_bp16_with_causalmask_with_attnbias_headdim_64.cpp
@@ -8,5 +8,5 @@
 
 #include "ck_tiled_fmha_grouped_infer.h"
 
-template void run_grouped_infer_causalmask_attnbias_dispatched<ck::bhalf_t, true, true>(
+template void run_grouped_infer_causalmask_attnbias_dispatched<ck::bhalf_t, true, true, 64>(
     GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_fp16_no_causalmask_no_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_fp16_no_causalmask_no_attnbias.cpp
deleted file mode 100644
index 8a68b03d6..000000000
--- a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_fp16_no_causalmask_no_attnbias.cpp
+++ /dev/null
@@ -1,12 +0,0 @@
-/*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
- *
- * This source code is licensed under the BSD-style license found in the
- * LICENSE file in the root directory of this source tree.
- */
-#include <ck/ck.hpp>
-
-#include "ck_tiled_fmha_grouped_infer.h"
-
-template void run_grouped_infer_causalmask_attnbias_dispatched<ck::half_t, false, false>(
-    GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_fp16_no_causalmask_no_attnbias_headdim_128.cpp b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_fp16_no_causalmask_no_attnbias_headdim_128.cpp
new file mode 100644
index 000000000..18e2f8bac
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_fp16_no_causalmask_no_attnbias_headdim_128.cpp
@@ -0,0 +1,12 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_grouped_infer.h"
+
+template void run_grouped_infer_causalmask_attnbias_dispatched<ck::half_t, false, false, 128>(
+    GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_fp16_no_causalmask_no_attnbias_headdim_256.cpp b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_fp16_no_causalmask_no_attnbias_headdim_256.cpp
new file mode 100644
index 000000000..5bdf3d87e
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_fp16_no_causalmask_no_attnbias_headdim_256.cpp
@@ -0,0 +1,12 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_grouped_infer.h"
+
+template void run_grouped_infer_causalmask_attnbias_dispatched<ck::half_t, false, false, 256>(
+    GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_fp16_no_causalmask_no_attnbias_headdim_32.cpp b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_fp16_no_causalmask_no_attnbias_headdim_32.cpp
new file mode 100644
index 000000000..584be8667
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_fp16_no_causalmask_no_attnbias_headdim_32.cpp
@@ -0,0 +1,12 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_grouped_infer.h"
+
+template void run_grouped_infer_causalmask_attnbias_dispatched<ck::half_t, false, false, 32>(
+    GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_fp16_no_causalmask_no_attnbias_headdim_64.cpp b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_fp16_no_causalmask_no_attnbias_headdim_64.cpp
new file mode 100644
index 000000000..70b023ba0
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_fp16_no_causalmask_no_attnbias_headdim_64.cpp
@@ -0,0 +1,12 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_grouped_infer.h"
+
+template void run_grouped_infer_causalmask_attnbias_dispatched<ck::half_t, false, false, 64>(
+    GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_fp16_no_causalmask_with_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_fp16_no_causalmask_with_attnbias.cpp
deleted file mode 100644
index 9fb627dc1..000000000
--- a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_fp16_no_causalmask_with_attnbias.cpp
+++ /dev/null
@@ -1,12 +0,0 @@
-/*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
- *
- * This source code is licensed under the BSD-style license found in the
- * LICENSE file in the root directory of this source tree.
- */
-#include <ck/ck.hpp>
-
-#include "ck_tiled_fmha_grouped_infer.h"
-
-template void run_grouped_infer_causalmask_attnbias_dispatched<ck::half_t, false, true>(
-    GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_fp16_no_causalmask_with_attnbias_headdim_128.cpp b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_fp16_no_causalmask_with_attnbias_headdim_128.cpp
new file mode 100644
index 000000000..082912ca6
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_fp16_no_causalmask_with_attnbias_headdim_128.cpp
@@ -0,0 +1,12 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_grouped_infer.h"
+
+template void run_grouped_infer_causalmask_attnbias_dispatched<ck::half_t, false, true, 128>(
+    GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_fp16_no_causalmask_with_attnbias_headdim_256.cpp b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_fp16_no_causalmask_with_attnbias_headdim_256.cpp
new file mode 100644
index 000000000..15ccf9a44
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_fp16_no_causalmask_with_attnbias_headdim_256.cpp
@@ -0,0 +1,12 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_grouped_infer.h"
+
+template void run_grouped_infer_causalmask_attnbias_dispatched<ck::half_t, false, true, 256>(
+    GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_fp16_no_causalmask_with_attnbias_headdim_32.cpp b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_fp16_no_causalmask_with_attnbias_headdim_32.cpp
new file mode 100644
index 000000000..dbfcfa438
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_fp16_no_causalmask_with_attnbias_headdim_32.cpp
@@ -0,0 +1,12 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_grouped_infer.h"
+
+template void run_grouped_infer_causalmask_attnbias_dispatched<ck::half_t, false, true, 32>(
+    GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_fp16_no_causalmask_with_attnbias_headdim_64.cpp b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_fp16_no_causalmask_with_attnbias_headdim_64.cpp
new file mode 100644
index 000000000..c55043820
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_fp16_no_causalmask_with_attnbias_headdim_64.cpp
@@ -0,0 +1,12 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_grouped_infer.h"
+
+template void run_grouped_infer_causalmask_attnbias_dispatched<ck::half_t, false, true, 64>(
+    GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_fp16_with_causalmask_no_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_fp16_with_causalmask_no_attnbias.cpp
deleted file mode 100644
index dff263668..000000000
--- a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_fp16_with_causalmask_no_attnbias.cpp
+++ /dev/null
@@ -1,12 +0,0 @@
-/*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
- *
- * This source code is licensed under the BSD-style license found in the
- * LICENSE file in the root directory of this source tree.
- */
-#include <ck/ck.hpp>
-
-#include "ck_tiled_fmha_grouped_infer.h"
-
-template void run_grouped_infer_causalmask_attnbias_dispatched<ck::half_t, true, false>(
-    GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_fp16_with_causalmask_no_attnbias_headdim_128.cpp b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_fp16_with_causalmask_no_attnbias_headdim_128.cpp
new file mode 100644
index 000000000..616c49912
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_fp16_with_causalmask_no_attnbias_headdim_128.cpp
@@ -0,0 +1,12 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_grouped_infer.h"
+
+template void run_grouped_infer_causalmask_attnbias_dispatched<ck::half_t, true, false, 128>(
+    GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_fp16_with_causalmask_no_attnbias_headdim_256.cpp b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_fp16_with_causalmask_no_attnbias_headdim_256.cpp
new file mode 100644
index 000000000..895740585
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_fp16_with_causalmask_no_attnbias_headdim_256.cpp
@@ -0,0 +1,12 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_grouped_infer.h"
+
+template void run_grouped_infer_causalmask_attnbias_dispatched<ck::half_t, true, false, 256>(
+    GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_fp16_with_causalmask_no_attnbias_headdim_32.cpp b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_fp16_with_causalmask_no_attnbias_headdim_32.cpp
new file mode 100644
index 000000000..558f63474
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_fp16_with_causalmask_no_attnbias_headdim_32.cpp
@@ -0,0 +1,12 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_grouped_infer.h"
+
+template void run_grouped_infer_causalmask_attnbias_dispatched<ck::half_t, true, false, 32>(
+    GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_fp16_with_causalmask_no_attnbias_headdim_64.cpp b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_fp16_with_causalmask_no_attnbias_headdim_64.cpp
new file mode 100644
index 000000000..000c3f3ca
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_fp16_with_causalmask_no_attnbias_headdim_64.cpp
@@ -0,0 +1,12 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_grouped_infer.h"
+
+template void run_grouped_infer_causalmask_attnbias_dispatched<ck::half_t, true, false, 64>(
+    GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_fp16_with_causalmask_with_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_fp16_with_causalmask_with_attnbias.cpp
deleted file mode 100644
index 86cc2f3eb..000000000
--- a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_fp16_with_causalmask_with_attnbias.cpp
+++ /dev/null
@@ -1,12 +0,0 @@
-/*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
- *
- * This source code is licensed under the BSD-style license found in the
- * LICENSE file in the root directory of this source tree.
- */
-#include <ck/ck.hpp>
-
-#include "ck_tiled_fmha_grouped_infer.h"
-
-template void run_grouped_infer_causalmask_attnbias_dispatched<ck::half_t, true, true>(
-    GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_fp16_with_causalmask_with_attnbias_headdim_128.cpp b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_fp16_with_causalmask_with_attnbias_headdim_128.cpp
new file mode 100644
index 000000000..39f45768e
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_fp16_with_causalmask_with_attnbias_headdim_128.cpp
@@ -0,0 +1,12 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_grouped_infer.h"
+
+template void run_grouped_infer_causalmask_attnbias_dispatched<ck::half_t, true, true, 128>(
+    GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_fp16_with_causalmask_with_attnbias_headdim_256.cpp b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_fp16_with_causalmask_with_attnbias_headdim_256.cpp
new file mode 100644
index 000000000..6028a16df
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_fp16_with_causalmask_with_attnbias_headdim_256.cpp
@@ -0,0 +1,12 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_grouped_infer.h"
+
+template void run_grouped_infer_causalmask_attnbias_dispatched<ck::half_t, true, true, 256>(
+    GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_bp16_no_causalmask_no_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_fp16_with_causalmask_with_attnbias_headdim_32.cpp
similarity index 93%
rename from xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_bp16_no_causalmask_no_attnbias.cpp
rename to xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_fp16_with_causalmask_with_attnbias_headdim_32.cpp
index 9a16d8160..105ee9025 100644
--- a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_bp16_no_causalmask_no_attnbias.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_fp16_with_causalmask_with_attnbias_headdim_32.cpp
@@ -8,5 +8,5 @@
 
 #include "ck_tiled_fmha_grouped_infer.h"
 
-template void run_grouped_infer_causalmask_attnbias_dispatched<ck::bhalf_t, false, false>(
+template void run_grouped_infer_causalmask_attnbias_dispatched<ck::half_t, true, true, 32>(
     GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_bp16_no_causalmask_with_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_fp16_with_causalmask_with_attnbias_headdim_64.cpp
similarity index 93%
rename from xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_bp16_no_causalmask_with_attnbias.cpp
rename to xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_fp16_with_causalmask_with_attnbias_headdim_64.cpp
index 9d5260deb..f7f86a773 100644
--- a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_bp16_no_causalmask_with_attnbias.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_fp16_with_causalmask_with_attnbias_headdim_64.cpp
@@ -8,5 +8,5 @@
 
 #include "ck_tiled_fmha_grouped_infer.h"
 
-template void run_grouped_infer_causalmask_attnbias_dispatched<ck::bhalf_t, false, true>(
+template void run_grouped_infer_causalmask_attnbias_dispatched<ck::half_t, true, true, 64>(
     GroupedForwardParams& param, hipStream_t stream);

From 92e088ef6f964bcd519c34185a4b615dc3f6b3b9 Mon Sep 17 00:00:00 2001
From: Qianfeng Zhang <Qianfeng.Zhang@amd.com>
Date: Sat, 20 Jan 2024 16:17:36 +0000
Subject: [PATCH 380/641] Setting k0n1_need_padding according to pipeline
 kQLoadOnce implementation

---
 .../hip_fmha/ck_tiled_fmha_batched_infer.h    | 79 ++++++++++++-------
 .../hip_fmha/ck_tiled_fmha_grouped_infer.h    | 52 +++++++-----
 2 files changed, 84 insertions(+), 47 deletions(-)

diff --git a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_infer.h b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_infer.h
index 221dd467c..4ebe09304 100644
--- a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_infer.h
+++ b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_infer.h
@@ -76,39 +76,60 @@ struct batched_infer_causalmask_attnbias_dispatched
             bool m0_need_padding   = !(param.M % FmhaShape::kM0 == 0);
             bool n0k1_need_padding = !(param.N % FmhaShape::kN0 == 0);
 
-            // ToDO: current pipelines all assume kQLoadOnce, which read whole k0
-            // (kK0BlockLength)
-            bool k0n1_need_padding =
-                !(param.K % FmhaShape::kK0BlockLength == 0 && param.Kv % FmhaShape::kN1 == 0);
-
-            BOOL_SWITCH_3(
-                m0_need_padding,
-                kM0NeedPadding,
-                n0k1_need_padding,
-                kN0K1NeedPadding,
-                k0n1_need_padding,
-                kK0N1NeedPadding,
-                [&] {
-                    using FmhaTraits = ck::tile_program::TileFmhaTraits<kM0NeedPadding,
-                                                                        kN0K1NeedPadding,
-                                                                        kK0N1NeedPadding,
-                                                                        has_attn_bias,
-                                                                        false, // kStoreLSE
-                                                                        occupancy>;
-
-                    using FmhaPipelineProblem = FmhaPipelineProblemTemp<FmhaTraits, FmhaMask>;
-
-                    if constexpr(HDim == 256)
-                    {
+            if constexpr(HDim == 256)
+            {
+                // BlockFmhaPipelineQSKSVS uses kQLoadOnce == false
+                bool k0n1_need_padding =
+                    !(param.K % FmhaShape::kK0 == 0 && param.Kv % FmhaShape::kN1 == 0);
+
+                BOOL_SWITCH_3(
+                    m0_need_padding,
+                    kM0NeedPadding,
+                    n0k1_need_padding,
+                    kN0K1NeedPadding,
+                    k0n1_need_padding,
+                    kK0N1NeedPadding,
+                    [&] {
+                        using FmhaTraits = ck::tile_program::TileFmhaTraits<kM0NeedPadding,
+                                                                            kN0K1NeedPadding,
+                                                                            kK0N1NeedPadding,
+                                                                            has_attn_bias,
+                                                                            false, // kStoreLSE
+                                                                            occupancy>;
+
+                        using FmhaPipelineProblem = FmhaPipelineProblemTemp<FmhaTraits, FmhaMask>;
+
                         using FmhaPipeline =
                             ck::tile_program::block::BlockFmhaPipelineQSKSVS<FmhaPipelineProblem>;
                         using FmhaKernel =
                             FmhaFwdKernel<FmhaTilePartitioner, FmhaPipeline, FmhaEpilogue>;
 
                         RunWithKernel<FmhaKernel>(param, stream);
-                    }
-                    else
-                    {
+                    });
+            }
+            else
+            {
+                // BlockFmhaPipelineQRKSVS uses kQLoadOnce == true
+                bool k0n1_need_padding =
+                    !(param.K % FmhaShape::kK0BlockLength == 0 && param.Kv % FmhaShape::kN1 == 0);
+
+                BOOL_SWITCH_3(
+                    m0_need_padding,
+                    kM0NeedPadding,
+                    n0k1_need_padding,
+                    kN0K1NeedPadding,
+                    k0n1_need_padding,
+                    kK0N1NeedPadding,
+                    [&] {
+                        using FmhaTraits = ck::tile_program::TileFmhaTraits<kM0NeedPadding,
+                                                                            kN0K1NeedPadding,
+                                                                            kK0N1NeedPadding,
+                                                                            has_attn_bias,
+                                                                            false, // kStoreLSE
+                                                                            occupancy>;
+
+                        using FmhaPipelineProblem = FmhaPipelineProblemTemp<FmhaTraits, FmhaMask>;
+
                         constexpr bool no_any_padding =
                             !(kM0NeedPadding || kN0K1NeedPadding || kK0N1NeedPadding);
 
@@ -131,8 +152,8 @@ struct batched_infer_causalmask_attnbias_dispatched
 
                             RunWithKernel<FmhaKernel>(param, stream);
                         };
-                    };
-                });
+                    });
+            };
         });
     };
 
diff --git a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_infer.h b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_infer.h
index ce3585c09..2909ee5fa 100644
--- a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_infer.h
+++ b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_infer.h
@@ -73,41 +73,57 @@ struct grouped_infer_causalmask_attnbias_dispatched
             using FmhaTilePartitioner       = FmhaFwdTilePartitioner<FmhaShape>;
             constexpr ck::index_t occupancy = (HDim == 64) ? 3 : 2;
 
-            bool k0n1_need_padding =
-                !(param.K % FmhaShape::kK0BlockLength == 0 && param.Kv % FmhaShape::kN1 == 0);
-
             constexpr bool kM0NeedPadding   = true;
             constexpr bool kN0K1NeedPadding = true;
 
-            BOOL_SWITCH(k0n1_need_padding, kK0N1NeedPadding, [&] {
-                using FmhaTraits = ck::tile_program::TileFmhaTraits<kM0NeedPadding,
-                                                                    kN0K1NeedPadding,
-                                                                    kK0N1NeedPadding,
-                                                                    has_attn_bias,
-                                                                    false, // kStoreLSE
-                                                                    occupancy>;
+            if constexpr(HDim == 256)
+            {
+                // BlockFmhaPipelineQSKSVS uses kQLoadOnce == false
+                bool k0n1_need_padding =
+                    !(param.K % FmhaShape::kK0 == 0 && param.Kv % FmhaShape::kN1 == 0);
+
+                BOOL_SWITCH(k0n1_need_padding, kK0N1NeedPadding, [&] {
+                    using FmhaTraits = ck::tile_program::TileFmhaTraits<kM0NeedPadding,
+                                                                        kN0K1NeedPadding,
+                                                                        kK0N1NeedPadding,
+                                                                        has_attn_bias,
+                                                                        false, // kStoreLSE
+                                                                        occupancy>;
 
-                using FmhaPipelineProblem = FmhaPipelineProblemTemp<FmhaTraits, FmhaMask>;
+                    using FmhaPipelineProblem = FmhaPipelineProblemTemp<FmhaTraits, FmhaMask>;
 
-                if constexpr(HDim == 256)
-                {
                     using FmhaPipeline =
                         ck::tile_program::block::BlockFmhaPipelineQSKSVS<FmhaPipelineProblem>;
                     using FmhaKernel =
                         FmhaFwdKernel<FmhaTilePartitioner, FmhaPipeline, FmhaEpilogue>;
 
                     RunWithKernel<FmhaKernel>(param, stream);
-                }
-                else
-                {
+                });
+            }
+            else
+            {
+                // BlockFmhaPipelineQRKSVS uses kQLoadOnce == true
+                bool k0n1_need_padding =
+                    !(param.K % FmhaShape::kK0BlockLength == 0 && param.Kv % FmhaShape::kN1 == 0);
+
+                BOOL_SWITCH(k0n1_need_padding, kK0N1NeedPadding, [&] {
+                    using FmhaTraits = ck::tile_program::TileFmhaTraits<kM0NeedPadding,
+                                                                        kN0K1NeedPadding,
+                                                                        kK0N1NeedPadding,
+                                                                        has_attn_bias,
+                                                                        false, // kStoreLSE
+                                                                        occupancy>;
+
+                    using FmhaPipelineProblem = FmhaPipelineProblemTemp<FmhaTraits, FmhaMask>;
+
                     using FmhaPipeline =
                         ck::tile_program::block::BlockFmhaPipelineQRKSVS<FmhaPipelineProblem>;
                     using FmhaKernel =
                         FmhaFwdKernel<FmhaTilePartitioner, FmhaPipeline, FmhaEpilogue>;
 
                     RunWithKernel<FmhaKernel>(param, stream);
-                }
-            });
+                });
+            };
         });
     };
 

From 60a8e4a41e05acee630d90df848928447bca1032 Mon Sep 17 00:00:00 2001
From: Qianfeng Zhang <Qianfeng.Zhang@amd.com>
Date: Sun, 21 Jan 2024 21:03:44 +0000
Subject: [PATCH 381/641] Add fmha forward c++ extension for ck-tiled

---
 setup.py                                      |   2 +
 .../attention_forward_generic_ck_tiled.cpp    |  80 +++----
 .../hip_fmha/ck_tiled_fmha_batched_forward.h  | 213 ++++++++++++++++++
 .../ck_tiled_fmha_batched_forward_bp16.cpp    |  70 ++++++
 .../ck_tiled_fmha_batched_forward_fp16.cpp    |  70 ++++++
 .../hip_fmha/ck_tiled_fmha_grouped_forward.h  | 179 +++++++++++++++
 .../ck_tiled_fmha_grouped_forward_bp16.cpp    |  70 ++++++
 .../ck_tiled_fmha_grouped_forward_fp16.cpp    |  70 ++++++
 .../attention/hip_fmha/ck_tiled_fmha_params.h |   2 +-
 ..._no_causalmask_no_attnbias_headdim_128.cpp |  12 +
 ..._no_causalmask_no_attnbias_headdim_256.cpp |  12 +
 ...6_no_causalmask_no_attnbias_headdim_32.cpp |  12 +
 ...6_no_causalmask_no_attnbias_headdim_64.cpp |  12 +
 ...o_causalmask_with_attnbias_headdim_128.cpp |  12 +
 ...o_causalmask_with_attnbias_headdim_256.cpp |  12 +
 ...no_causalmask_with_attnbias_headdim_32.cpp |  12 +
 ...no_causalmask_with_attnbias_headdim_64.cpp |  12 +
 ...ith_causalmask_no_attnbias_headdim_128.cpp |  12 +
 ...ith_causalmask_no_attnbias_headdim_256.cpp |  12 +
 ...with_causalmask_no_attnbias_headdim_32.cpp |  12 +
 ...with_causalmask_no_attnbias_headdim_64.cpp |  12 +
 ...h_causalmask_with_attnbias_headdim_128.cpp |  12 +
 ...h_causalmask_with_attnbias_headdim_256.cpp |  12 +
 ...th_causalmask_with_attnbias_headdim_32.cpp |  12 +
 ...th_causalmask_with_attnbias_headdim_64.cpp |  12 +
 ..._no_causalmask_no_attnbias_headdim_128.cpp |  12 +
 ..._no_causalmask_no_attnbias_headdim_256.cpp |  12 +
 ...6_no_causalmask_no_attnbias_headdim_32.cpp |  12 +
 ...6_no_causalmask_no_attnbias_headdim_64.cpp |  12 +
 ...o_causalmask_with_attnbias_headdim_128.cpp |  12 +
 ...o_causalmask_with_attnbias_headdim_256.cpp |  12 +
 ...no_causalmask_with_attnbias_headdim_32.cpp |  12 +
 ...no_causalmask_with_attnbias_headdim_64.cpp |  12 +
 ...ith_causalmask_no_attnbias_headdim_128.cpp |  12 +
 ...ith_causalmask_no_attnbias_headdim_256.cpp |  12 +
 ...with_causalmask_no_attnbias_headdim_32.cpp |  12 +
 ...with_causalmask_no_attnbias_headdim_64.cpp |  12 +
 ...h_causalmask_with_attnbias_headdim_128.cpp |  12 +
 ...h_causalmask_with_attnbias_headdim_256.cpp |  12 +
 ...th_causalmask_with_attnbias_headdim_32.cpp |  12 +
 ...th_causalmask_with_attnbias_headdim_64.cpp |  12 +
 ..._no_causalmask_no_attnbias_headdim_128.cpp |  12 +
 ..._no_causalmask_no_attnbias_headdim_256.cpp |  12 +
 ...6_no_causalmask_no_attnbias_headdim_32.cpp |  12 +
 ...6_no_causalmask_no_attnbias_headdim_64.cpp |  12 +
 ...o_causalmask_with_attnbias_headdim_128.cpp |  12 +
 ...o_causalmask_with_attnbias_headdim_256.cpp |  12 +
 ...no_causalmask_with_attnbias_headdim_32.cpp |  12 +
 ...no_causalmask_with_attnbias_headdim_64.cpp |  12 +
 ...ith_causalmask_no_attnbias_headdim_128.cpp |  12 +
 ...ith_causalmask_no_attnbias_headdim_256.cpp |  12 +
 ...with_causalmask_no_attnbias_headdim_32.cpp |  12 +
 ...with_causalmask_no_attnbias_headdim_64.cpp |  12 +
 ...h_causalmask_with_attnbias_headdim_128.cpp |  12 +
 ...h_causalmask_with_attnbias_headdim_256.cpp |  12 +
 ...th_causalmask_with_attnbias_headdim_32.cpp |  12 +
 ...th_causalmask_with_attnbias_headdim_64.cpp |  12 +
 ..._no_causalmask_no_attnbias_headdim_128.cpp |  12 +
 ..._no_causalmask_no_attnbias_headdim_256.cpp |  12 +
 ...6_no_causalmask_no_attnbias_headdim_32.cpp |  12 +
 ...6_no_causalmask_no_attnbias_headdim_64.cpp |  12 +
 ...o_causalmask_with_attnbias_headdim_128.cpp |  12 +
 ...o_causalmask_with_attnbias_headdim_256.cpp |  12 +
 ...no_causalmask_with_attnbias_headdim_32.cpp |  12 +
 ...no_causalmask_with_attnbias_headdim_64.cpp |  12 +
 ...ith_causalmask_no_attnbias_headdim_128.cpp |  12 +
 ...ith_causalmask_no_attnbias_headdim_256.cpp |  12 +
 ...with_causalmask_no_attnbias_headdim_32.cpp |  12 +
 ...with_causalmask_no_attnbias_headdim_64.cpp |  12 +
 ...h_causalmask_with_attnbias_headdim_128.cpp |  12 +
 ...h_causalmask_with_attnbias_headdim_256.cpp |  12 +
 ...th_causalmask_with_attnbias_headdim_32.cpp |  12 +
 ...th_causalmask_with_attnbias_headdim_64.cpp |  12 +
 73 files changed, 1475 insertions(+), 49 deletions(-)
 create mode 100644 xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_forward.h
 create mode 100644 xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_forward_bp16.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_forward_fp16.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_forward.h
 create mode 100644 xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_forward_bp16.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_forward_fp16.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_forward_bp16_no_causalmask_no_attnbias_headdim_128.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_forward_bp16_no_causalmask_no_attnbias_headdim_256.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_forward_bp16_no_causalmask_no_attnbias_headdim_32.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_forward_bp16_no_causalmask_no_attnbias_headdim_64.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_forward_bp16_no_causalmask_with_attnbias_headdim_128.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_forward_bp16_no_causalmask_with_attnbias_headdim_256.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_forward_bp16_no_causalmask_with_attnbias_headdim_32.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_forward_bp16_no_causalmask_with_attnbias_headdim_64.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_forward_bp16_with_causalmask_no_attnbias_headdim_128.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_forward_bp16_with_causalmask_no_attnbias_headdim_256.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_forward_bp16_with_causalmask_no_attnbias_headdim_32.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_forward_bp16_with_causalmask_no_attnbias_headdim_64.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_forward_bp16_with_causalmask_with_attnbias_headdim_128.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_forward_bp16_with_causalmask_with_attnbias_headdim_256.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_forward_bp16_with_causalmask_with_attnbias_headdim_32.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_forward_bp16_with_causalmask_with_attnbias_headdim_64.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_forward_fp16_no_causalmask_no_attnbias_headdim_128.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_forward_fp16_no_causalmask_no_attnbias_headdim_256.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_forward_fp16_no_causalmask_no_attnbias_headdim_32.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_forward_fp16_no_causalmask_no_attnbias_headdim_64.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_forward_fp16_no_causalmask_with_attnbias_headdim_128.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_forward_fp16_no_causalmask_with_attnbias_headdim_256.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_forward_fp16_no_causalmask_with_attnbias_headdim_32.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_forward_fp16_no_causalmask_with_attnbias_headdim_64.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_forward_fp16_with_causalmask_no_attnbias_headdim_128.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_forward_fp16_with_causalmask_no_attnbias_headdim_256.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_forward_fp16_with_causalmask_no_attnbias_headdim_32.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_forward_fp16_with_causalmask_no_attnbias_headdim_64.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_forward_fp16_with_causalmask_with_attnbias_headdim_128.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_forward_fp16_with_causalmask_with_attnbias_headdim_256.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_forward_fp16_with_causalmask_with_attnbias_headdim_32.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_forward_fp16_with_causalmask_with_attnbias_headdim_64.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_forward_bp16_no_causalmask_no_attnbias_headdim_128.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_forward_bp16_no_causalmask_no_attnbias_headdim_256.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_forward_bp16_no_causalmask_no_attnbias_headdim_32.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_forward_bp16_no_causalmask_no_attnbias_headdim_64.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_forward_bp16_no_causalmask_with_attnbias_headdim_128.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_forward_bp16_no_causalmask_with_attnbias_headdim_256.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_forward_bp16_no_causalmask_with_attnbias_headdim_32.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_forward_bp16_no_causalmask_with_attnbias_headdim_64.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_forward_bp16_with_causalmask_no_attnbias_headdim_128.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_forward_bp16_with_causalmask_no_attnbias_headdim_256.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_forward_bp16_with_causalmask_no_attnbias_headdim_32.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_forward_bp16_with_causalmask_no_attnbias_headdim_64.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_forward_bp16_with_causalmask_with_attnbias_headdim_128.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_forward_bp16_with_causalmask_with_attnbias_headdim_256.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_forward_bp16_with_causalmask_with_attnbias_headdim_32.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_forward_bp16_with_causalmask_with_attnbias_headdim_64.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_forward_fp16_no_causalmask_no_attnbias_headdim_128.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_forward_fp16_no_causalmask_no_attnbias_headdim_256.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_forward_fp16_no_causalmask_no_attnbias_headdim_32.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_forward_fp16_no_causalmask_no_attnbias_headdim_64.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_forward_fp16_no_causalmask_with_attnbias_headdim_128.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_forward_fp16_no_causalmask_with_attnbias_headdim_256.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_forward_fp16_no_causalmask_with_attnbias_headdim_32.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_forward_fp16_no_causalmask_with_attnbias_headdim_64.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_forward_fp16_with_causalmask_no_attnbias_headdim_128.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_forward_fp16_with_causalmask_no_attnbias_headdim_256.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_forward_fp16_with_causalmask_no_attnbias_headdim_32.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_forward_fp16_with_causalmask_no_attnbias_headdim_64.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_forward_fp16_with_causalmask_with_attnbias_headdim_128.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_forward_fp16_with_causalmask_with_attnbias_headdim_256.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_forward_fp16_with_causalmask_with_attnbias_headdim_32.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_forward_fp16_with_causalmask_with_attnbias_headdim_64.cpp

diff --git a/setup.py b/setup.py
index 84629d229..bebc6c04f 100644
--- a/setup.py
+++ b/setup.py
@@ -240,6 +240,8 @@ def get_extensions():
         source_hip += glob.glob(os.path.join(extensions_dir, "attention", "hip_fmha", "attention_forward_generic_ck_tiled.cpp"), recursive=False)
         source_hip += glob.glob(os.path.join(extensions_dir, "attention", "hip_fmha", "ck_tiled_fmha_batched_infer_*.cpp"), recursive=False)
         source_hip += glob.glob(os.path.join(extensions_dir, "attention", "hip_fmha", "ck_tiled_fmha_grouped_infer_*.cpp"), recursive=False)
+        source_hip += glob.glob(os.path.join(extensions_dir, "attention", "hip_fmha", "ck_tiled_fmha_batched_forward_*.cpp"), recursive=False)
+        source_hip += glob.glob(os.path.join(extensions_dir, "attention", "hip_fmha", "ck_tiled_fmha_grouped_forward_*.cpp"), recursive=False)
         source_hip += glob.glob(os.path.join(extensions_dir, "attention", "hip_fmha", "instances_tiled", "ck_tiled_fmha_*.cpp"), recursive=False)
     else:
         source_hip += glob.glob(os.path.join(extensions_dir, "attention", "hip_fmha", "attention_forward_generic.cpp"), recursive=False)
diff --git a/xformers/csrc/attention/hip_fmha/attention_forward_generic_ck_tiled.cpp b/xformers/csrc/attention/hip_fmha/attention_forward_generic_ck_tiled.cpp
index d63f0d6bf..b27626706 100644
--- a/xformers/csrc/attention/hip_fmha/attention_forward_generic_ck_tiled.cpp
+++ b/xformers/csrc/attention/hip_fmha/attention_forward_generic_ck_tiled.cpp
@@ -21,20 +21,10 @@
 #include "ck_fmha_util.h"
 #include "ck_tiled_fmha_params.h"
 
-/*
-extern void batched_forward_fp16(
-    BatchedForwardParams& param,
-    hipStream_t stream);
-extern void batched_forward_bp16(
-    BatchedForwardParams& param,
-    hipStream_t stream);
-extern void grouped_forward_fp16(
-    GroupedForwardParams& param,
-    hipStream_t stream);
-extern void grouped_forward_bp16(
-    GroupedForwardParams& param,
-    hipStream_t stream);
-*/
+extern void batched_forward_fp16(BatchedForwardParams& param, hipStream_t stream);
+extern void batched_forward_bp16(BatchedForwardParams& param, hipStream_t stream);
+extern void grouped_forward_fp16(GroupedForwardParams& param, hipStream_t stream);
+extern void grouped_forward_bp16(GroupedForwardParams& param, hipStream_t stream);
 
 extern void batched_infer_fp16(BatchedForwardParams& param, hipStream_t stream);
 extern void batched_infer_bp16(BatchedForwardParams& param, hipStream_t stream);
@@ -225,10 +215,8 @@ std::tuple<at::Tensor, at::Tensor, int64_t, int64_t> efficient_attention_forward
 
         if(p.compute_logsumexp)
         {
-            /*
-            logsumexp = at::empty({B, Hq, M}, opts.dtype(at::kFloat));
+            logsumexp       = at::empty({B, Hq, M}, opts.dtype(at::kFloat));
             p.logsumexp_ptr = logsumexp.data_ptr();
-            */
             throw std::runtime_error("compute logsumexp is currently not implemented by ck-tiled!");
         }
         else
@@ -348,21 +336,11 @@ std::tuple<at::Tensor, at::Tensor, int64_t, int64_t> efficient_attention_forward
 
         if(p.compute_logsumexp)
         {
-            /*
-            logsumexp = at::empty(
-                {p.num_batches, Hq, p.max_seqlen_q}, opts.dtype(at::kFloat));
-            char* logsumexp_ptr = reinterpret_cast<char*>(logsumexp.data_ptr());
-
-            for (int i = 0; i < p.num_batches; i++) {
-              size_t tmp_logsumexp_offset = get_size_in_bytes(
-                  static_cast<size_t>(i) * Hq * p.max_seqlen_q,
-                  logsumexp.scalar_type());
-              p.logsumexp_ptrs.push_back(
-                  reinterpret_cast<void*>(&logsumexp_ptr[tmp_logsumexp_offset]));
-            };
-            */
-            throw std::runtime_error("compute logsumexp is currently not implemented by ck-tiled!");
-        };
+            logsumexp = at::empty({p.num_batches, Hq, p.max_seqlen_q}, opts.dtype(at::kFloat));
+            p.logsumexp_ptr = logsumexp.data_ptr();
+        }
+        else
+            p.logsumexp_ptr = nullptr;
     };
 
     auto inDataType = query.scalar_type();
@@ -388,14 +366,17 @@ std::tuple<at::Tensor, at::Tensor, int64_t, int64_t> efficient_attention_forward
         }
         else
         {
-            /*
-            if (inDataType == at::ScalarType::Half) {
-              batched_forward_fp16(batched_forward_params, stream);
-            } else if (inDataType == at::ScalarType::BFloat16) {
-              batched_forward_bp16(batched_forward_params, stream);
-            } else
-              throw std::runtime_error("input data-type is not supported!");
-            */
+            if(inDataType == at::ScalarType::Half)
+            {
+                batched_forward_fp16(batched_forward_params, stream);
+            }
+            else if(inDataType == at::ScalarType::BFloat16)
+            {
+                batched_forward_bp16(batched_forward_params, stream);
+            }
+            else
+                throw std::runtime_error("input data-type is not supported!");
+
             throw std::runtime_error(
                 "drop-out and compuate logsumexp currently not implemented by ck-tiled!");
         };
@@ -421,14 +402,17 @@ std::tuple<at::Tensor, at::Tensor, int64_t, int64_t> efficient_attention_forward
         }
         else
         {
-            /*
-            if (inDataType == at::ScalarType::Half) {
-              grouped_forward_fp16(grouped_forward_params, stream);
-            } else if (inDataType == at::ScalarType::BFloat16) {
-              grouped_forward_bp16(grouped_forward_params, stream);
-            } else
-              throw std::runtime_error("input data-type is not supported!");
-            */
+            if(inDataType == at::ScalarType::Half)
+            {
+                grouped_forward_fp16(grouped_forward_params, stream);
+            }
+            else if(inDataType == at::ScalarType::BFloat16)
+            {
+                grouped_forward_bp16(grouped_forward_params, stream);
+            }
+            else
+                throw std::runtime_error("input data-type is not supported!");
+
             throw std::runtime_error(
                 "drop-out and compuate logsumexp currently not implemented by ck-tiled!");
         };
diff --git a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_forward.h b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_forward.h
new file mode 100644
index 000000000..dd684d9f2
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_forward.h
@@ -0,0 +1,213 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#pragma once
+
+#include <optional>
+#include <sstream>
+#include <stdexcept>
+
+#include <ck/utility/common_header.hpp>
+#include <ck/host_utility/device_prop.hpp>
+#include <ck/host_utility/kernel_launch.hpp>
+#include <ck/tensor_description/cluster_descriptor.hpp>
+#include <ck/tensor_description/tensor_descriptor_helper.hpp>
+#include <ck/tensor/tensor_view.hpp>
+
+#include <ck/tile_program/block_tile_pipeline/block_fmha_pipeline_problem.hpp>
+#include <ck/tile_program/block_tile_pipeline/block_fmha_pipeline_qr_ks_vs.hpp>
+#include <ck/tile_program/block_tile_pipeline/block_fmha_pipeline_qr_ks_vs_async.hpp>
+#include <ck/tile_program/block_tile_pipeline/block_fmha_pipeline_qs_ks_vs.hpp>
+#include <ck/tile_program/tile/tile_fmha_shape.hpp>
+#include <ck/tile_program/tile/tile_fmha_traits.hpp>
+#include <ck/tile_program/block_tile/block_masking.hpp>
+
+#include "ck_tiled_fmha_forward_kernel.h"
+#include "ck_tiled_fmha_fwd_epilogue.h"
+#include "ck_tiled_fmha_fwd_tile_partitioner.h"
+#include "ck_tiled_fmha_params.h"
+#include "ck_tiled_fmha_definitions.h"
+
+#include "ck_tiled_bool_switch.h"
+#include "ck_tiled_headdim_switch.h"
+
+template <typename scalar_t, bool has_causal_mask, bool has_attn_bias, ck::index_t HDim>
+struct batched_forward_causalmask_attnbias_dispatched
+{
+    using FmhaEpilogue =
+        FmhaFwdEpilogue<FmhaFwdEpilogueProblem<typename FmhaFwdTypeConfig<scalar_t>::OaccDataType,
+                                               typename FmhaFwdTypeConfig<scalar_t>::ODataType>>;
+
+    template <typename FmhaTraits, typename FmhaMask>
+    using FmhaPipelineProblemTemp = ck::tile_program::block::BlockFmhaPipelineProblem<
+        typename FmhaFwdTypeConfig<scalar_t>::QDataType,
+        typename FmhaFwdTypeConfig<scalar_t>::KDataType,
+        typename FmhaFwdTypeConfig<scalar_t>::VDataType,
+        typename FmhaFwdTypeConfig<scalar_t>::SaccDataType,
+        typename FmhaFwdTypeConfig<scalar_t>::SMPLComputeDataType,
+        typename FmhaFwdTypeConfig<scalar_t>::BiasDataType,
+        typename FmhaFwdTypeConfig<scalar_t>::LSEDataType,
+        typename FmhaFwdTypeConfig<scalar_t>::PDataType,
+        typename FmhaFwdTypeConfig<scalar_t>::OaccDataType,
+        typename FmhaFwdTypeConfig<scalar_t>::ODataType,
+        HDim == 32 ? 128 : 256, // BlockSize
+        FmhaFwdShape<HDim>,
+        false, // kIsGroupMode
+        FmhaMask,
+        FmhaTraits>;
+
+    static void Run(BatchedForwardParams& param, hipStream_t stream)
+    {
+        const bool has_local_attention = (param.window_size > 0) ? true : false;
+
+        BOOL_SWITCH(has_local_attention, USE_LOCAL_ATTENTION, [&] {
+            constexpr bool has_masking = has_causal_mask || USE_LOCAL_ATTENTION;
+
+            using FmhaMask =
+                ck::tile_program::block::GenericAttentionMask<has_masking, USE_LOCAL_ATTENTION>;
+
+            using FmhaShape                 = FmhaFwdShape<HDim>;
+            using FmhaTilePartitioner       = FmhaFwdTilePartitioner<FmhaShape>;
+            constexpr ck::index_t occupancy = (HDim == 64) ? 3 : 2;
+
+            bool m0_need_padding   = !(param.M % FmhaShape::kM0 == 0);
+            bool n0k1_need_padding = !(param.N % FmhaShape::kN0 == 0);
+
+            if constexpr(HDim == 256)
+            {
+                // BlockFmhaPipelineQSKSVS uses kQLoadOnce == false
+                bool k0n1_need_padding =
+                    !(param.K % FmhaShape::kK0 == 0 && param.Kv % FmhaShape::kN1 == 0);
+
+                BOOL_SWITCH_3(
+                    m0_need_padding,
+                    kM0NeedPadding,
+                    n0k1_need_padding,
+                    kN0K1NeedPadding,
+                    k0n1_need_padding,
+                    kK0N1NeedPadding,
+                    [&] {
+                        using FmhaTraits = ck::tile_program::TileFmhaTraits<kM0NeedPadding,
+                                                                            kN0K1NeedPadding,
+                                                                            kK0N1NeedPadding,
+                                                                            has_attn_bias,
+                                                                            true, // kStoreLSE
+                                                                            occupancy>;
+
+                        using FmhaPipelineProblem = FmhaPipelineProblemTemp<FmhaTraits, FmhaMask>;
+
+                        using FmhaPipeline =
+                            ck::tile_program::block::BlockFmhaPipelineQSKSVS<FmhaPipelineProblem>;
+                        using FmhaKernel =
+                            FmhaFwdKernel<FmhaTilePartitioner, FmhaPipeline, FmhaEpilogue>;
+
+                        RunWithKernel<FmhaKernel>(param, stream);
+                    });
+            }
+            else
+            {
+                // BlockFmhaPipelineQRKSVS uses kQLoadOnce == true
+                bool k0n1_need_padding =
+                    !(param.K % FmhaShape::kK0BlockLength == 0 && param.Kv % FmhaShape::kN1 == 0);
+
+                BOOL_SWITCH_3(
+                    m0_need_padding,
+                    kM0NeedPadding,
+                    n0k1_need_padding,
+                    kN0K1NeedPadding,
+                    k0n1_need_padding,
+                    kK0N1NeedPadding,
+                    [&] {
+                        using FmhaTraits = ck::tile_program::TileFmhaTraits<kM0NeedPadding,
+                                                                            kN0K1NeedPadding,
+                                                                            kK0N1NeedPadding,
+                                                                            has_attn_bias,
+                                                                            true, // kStoreLSE
+                                                                            occupancy>;
+
+                        using FmhaPipelineProblem = FmhaPipelineProblemTemp<FmhaTraits, FmhaMask>;
+
+                        constexpr bool no_any_padding =
+                            !(kM0NeedPadding || kN0K1NeedPadding || kK0N1NeedPadding);
+
+                        if constexpr(no_any_padding)
+                        {
+                            using FmhaPipeline =
+                                ck::tile_program::block::BlockFmhaPipelineQRKSVSAsync<
+                                    FmhaPipelineProblem>;
+                            using FmhaKernel =
+                                FmhaFwdKernel<FmhaTilePartitioner, FmhaPipeline, FmhaEpilogue>;
+
+                            RunWithKernel<FmhaKernel>(param, stream);
+                        }
+                        else
+                        {
+                            using FmhaPipeline = ck::tile_program::block::BlockFmhaPipelineQRKSVS<
+                                FmhaPipelineProblem>;
+                            using FmhaKernel =
+                                FmhaFwdKernel<FmhaTilePartitioner, FmhaPipeline, FmhaEpilogue>;
+
+                            RunWithKernel<FmhaKernel>(param, stream);
+                        };
+                    });
+            };
+        });
+    };
+
+    template <typename FmhaKernel>
+    static void RunWithKernel(BatchedForwardParams& param, hipStream_t stream)
+    {
+        const auto kargs = [&] {
+            return FmhaKernel::MakeKargs(
+                param.q_ptr,
+                param.k_ptr,
+                param.v_ptr,
+                param.attn_bias_ptr,
+                param.logsumexp_ptr,
+                param.out_ptr,
+                param.M,              // seqlen_q
+                param.N,              // seqlen_k
+                param.K,              // hdim_q
+                param.Kv,             // hdim_v
+                param.Hq / param.Hkv, // nhead_ratio_qk
+                param.scale,
+                param.q_strides[1], // q, k, v, bias, out tensor seq-dim stride
+                param.k_strides[1],
+                param.v_strides[1],
+                param.attn_bias_strides[2],
+                param.out_strides[1],
+                param.q_strides[2], // q, k, v, bias, lse, out tensor head-dim stride
+                param.k_strides[2],
+                param.v_strides[2],
+                param.attn_bias_strides[1],
+                param.M, // nhead_stride_lse
+                param.out_strides[2],
+                param.q_strides[0], // q, k, v, bias, lse, out tensor batch-dim stride
+                param.k_strides[0],
+                param.v_strides[0],
+                param.attn_bias_strides[0],
+                param.Hq * param.M, // batch_stride_lse
+                param.out_strides[0],
+                static_cast<CausalMaskType>(param.custom_mask_type),
+                param.window_size);
+        }();
+
+        dim3 kGridSize            = FmhaKernel::GridSize(param.B, param.Hq, param.M, param.Kv);
+        constexpr dim3 kBlockSize = FmhaKernel::BlockSize();
+        constexpr ck::index_t kBlockPerCu = FmhaKernel::kBlockPerCu;
+
+        (void)launch_kernel<kBlockSize.x, kBlockPerCu>(
+            StreamConfig{stream, false}, FmhaKernel{}, kGridSize, kBlockSize, 0, kargs);
+    };
+};
+
+template <typename scalar_t, bool has_causal_mask, bool has_attn_bias, ck::index_t HDim>
+void run_batched_forward_causalmask_attnbias_dispatched(BatchedForwardParams& param,
+                                                        hipStream_t stream)
+{
+    batched_forward_causalmask_attnbias_dispatched<scalar_t, has_causal_mask, has_attn_bias, HDim>::
+        Run(param, stream);
+};
diff --git a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_forward_bp16.cpp b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_forward_bp16.cpp
new file mode 100644
index 000000000..7bdf6cfd7
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_forward_bp16.cpp
@@ -0,0 +1,70 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <ck/ck.hpp>
+#include <ck/utility/data_type.hpp>
+#include <stdexcept>
+
+#include "ck_tiled_bool_switch.h"
+#include "ck_tiled_fmha_batched_forward.h"
+
+// clang-format off
+extern template void run_batched_forward_causalmask_attnbias_dispatched<ck::bhalf_t, false, true, 32>(
+    BatchedForwardParams& param, hipStream_t stream);
+extern template void run_batched_forward_causalmask_attnbias_dispatched<ck::bhalf_t, false, false, 32>(
+    BatchedForwardParams& param, hipStream_t stream);
+extern template void run_batched_forward_causalmask_attnbias_dispatched<ck::bhalf_t, true, true, 32>(
+    BatchedForwardParams& param, hipStream_t stream);
+extern template void run_batched_forward_causalmask_attnbias_dispatched<ck::bhalf_t, true, false, 32>(
+    BatchedForwardParams& param, hipStream_t stream);
+
+extern template void run_batched_forward_causalmask_attnbias_dispatched<ck::bhalf_t, false, true, 64>(
+    BatchedForwardParams& param, hipStream_t stream);
+extern template void run_batched_forward_causalmask_attnbias_dispatched<ck::bhalf_t, false, false, 64>(
+    BatchedForwardParams& param, hipStream_t stream);
+extern template void run_batched_forward_causalmask_attnbias_dispatched<ck::bhalf_t, true, true, 64>(
+    BatchedForwardParams& param, hipStream_t stream);
+extern template void run_batched_forward_causalmask_attnbias_dispatched<ck::bhalf_t, true, false, 64>(
+    BatchedForwardParams& param, hipStream_t stream);
+
+extern template void run_batched_forward_causalmask_attnbias_dispatched<ck::bhalf_t, false, true, 128>(
+    BatchedForwardParams& param, hipStream_t stream);
+extern template void run_batched_forward_causalmask_attnbias_dispatched<ck::bhalf_t, false, false, 128>(
+    BatchedForwardParams& param, hipStream_t stream);
+extern template void run_batched_forward_causalmask_attnbias_dispatched<ck::bhalf_t, true, true, 128>(
+    BatchedForwardParams& param, hipStream_t stream);
+extern template void run_batched_forward_causalmask_attnbias_dispatched<ck::bhalf_t, true, false, 128>(
+    BatchedForwardParams& param, hipStream_t stream);
+
+extern template void run_batched_forward_causalmask_attnbias_dispatched<ck::bhalf_t, false, true, 256>(
+    BatchedForwardParams& param, hipStream_t stream);
+extern template void run_batched_forward_causalmask_attnbias_dispatched<ck::bhalf_t, false, false, 256>(
+    BatchedForwardParams& param, hipStream_t stream);
+extern template void run_batched_forward_causalmask_attnbias_dispatched<ck::bhalf_t, true, true, 256>(
+    BatchedForwardParams& param, hipStream_t stream);
+extern template void run_batched_forward_causalmask_attnbias_dispatched<ck::bhalf_t, true, false, 256>(
+    BatchedForwardParams& param, hipStream_t stream);
+// clang-format on
+
+void batched_forward_bp16(BatchedForwardParams& param, hipStream_t stream)
+{
+    BOOL_SWITCH(param.has_attn_bias, HAS_ATTN_BIAS, [&] {
+        FMHA_FWD_HEADDIM_SWITCH(param.K, param.Kv, HDim, [&] {
+            if(param.custom_mask_type == 0)
+                run_batched_forward_causalmask_attnbias_dispatched<ck::bhalf_t,
+                                                                   false,
+                                                                   HAS_ATTN_BIAS,
+                                                                   HDim>(param, stream);
+            else if(param.custom_mask_type == 1 || param.custom_mask_type == 2)
+                run_batched_forward_causalmask_attnbias_dispatched<ck::bhalf_t,
+                                                                   true,
+                                                                   HAS_ATTN_BIAS,
+                                                                   HDim>(param, stream);
+            else
+                throw std::runtime_error("Invalid custom_mask_type value");
+        });
+    });
+};
diff --git a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_forward_fp16.cpp b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_forward_fp16.cpp
new file mode 100644
index 000000000..05abf084e
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_forward_fp16.cpp
@@ -0,0 +1,70 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <ck/ck.hpp>
+#include <ck/utility/data_type.hpp>
+#include <stdexcept>
+
+#include "ck_tiled_bool_switch.h"
+#include "ck_tiled_fmha_batched_forward.h"
+
+// clang-format off
+extern template void run_batched_forward_causalmask_attnbias_dispatched<ck::half_t, false, true, 32>(
+    BatchedForwardParams& param, hipStream_t stream);
+extern template void run_batched_forward_causalmask_attnbias_dispatched<ck::half_t, false, false, 32>(
+    BatchedForwardParams& param, hipStream_t stream);
+extern template void run_batched_forward_causalmask_attnbias_dispatched<ck::half_t, true, true, 32>(
+    BatchedForwardParams& param, hipStream_t stream);
+extern template void run_batched_forward_causalmask_attnbias_dispatched<ck::half_t, true, false, 32>(
+    BatchedForwardParams& param, hipStream_t stream);
+
+extern template void run_batched_forward_causalmask_attnbias_dispatched<ck::half_t, false, true, 64>(
+    BatchedForwardParams& param, hipStream_t stream);
+extern template void run_batched_forward_causalmask_attnbias_dispatched<ck::half_t, false, false, 64>(
+    BatchedForwardParams& param, hipStream_t stream);
+extern template void run_batched_forward_causalmask_attnbias_dispatched<ck::half_t, true, true, 64>(
+    BatchedForwardParams& param, hipStream_t stream);
+extern template void run_batched_forward_causalmask_attnbias_dispatched<ck::half_t, true, false, 64>(
+    BatchedForwardParams& param, hipStream_t stream);
+
+extern template void run_batched_forward_causalmask_attnbias_dispatched<ck::half_t, false, true, 128>(
+    BatchedForwardParams& param, hipStream_t stream);
+extern template void run_batched_forward_causalmask_attnbias_dispatched<ck::half_t, false, false, 128>(
+    BatchedForwardParams& param, hipStream_t stream);
+extern template void run_batched_forward_causalmask_attnbias_dispatched<ck::half_t, true, true, 128>(
+    BatchedForwardParams& param, hipStream_t stream);
+extern template void run_batched_forward_causalmask_attnbias_dispatched<ck::half_t, true, false, 128>(
+    BatchedForwardParams& param, hipStream_t stream);
+
+extern template void run_batched_forward_causalmask_attnbias_dispatched<ck::half_t, false, true, 256>(
+    BatchedForwardParams& param, hipStream_t stream);
+extern template void run_batched_forward_causalmask_attnbias_dispatched<ck::half_t, false, false, 256>(
+    BatchedForwardParams& param, hipStream_t stream);
+extern template void run_batched_forward_causalmask_attnbias_dispatched<ck::half_t, true, true, 256>(
+    BatchedForwardParams& param, hipStream_t stream);
+extern template void run_batched_forward_causalmask_attnbias_dispatched<ck::half_t, true, false, 256>(
+    BatchedForwardParams& param, hipStream_t stream);
+// clang-format on
+
+void batched_forward_fp16(BatchedForwardParams& param, hipStream_t stream)
+{
+    BOOL_SWITCH(param.has_attn_bias, HAS_ATTN_BIAS, [&] {
+        FMHA_FWD_HEADDIM_SWITCH(param.K, param.Kv, HDim, [&] {
+            if(param.custom_mask_type == 0)
+                run_batched_forward_causalmask_attnbias_dispatched<ck::half_t,
+                                                                   false,
+                                                                   HAS_ATTN_BIAS,
+                                                                   HDim>(param, stream);
+            else if(param.custom_mask_type == 1 || param.custom_mask_type == 2)
+                run_batched_forward_causalmask_attnbias_dispatched<ck::half_t,
+                                                                   true,
+                                                                   HAS_ATTN_BIAS,
+                                                                   HDim>(param, stream);
+            else
+                throw std::runtime_error("Invalid custom_mask_type value");
+        });
+    });
+};
diff --git a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_forward.h b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_forward.h
new file mode 100644
index 000000000..9e784052c
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_forward.h
@@ -0,0 +1,179 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#pragma once
+
+#include <optional>
+#include <sstream>
+#include <stdexcept>
+#include <iostream>
+
+#include <ck/utility/common_header.hpp>
+#include <ck/host_utility/device_prop.hpp>
+#include <ck/host_utility/kernel_launch.hpp>
+#include <ck/tensor_description/cluster_descriptor.hpp>
+#include <ck/tensor_description/tensor_descriptor_helper.hpp>
+#include <ck/tensor/tensor_view.hpp>
+
+#include <ck/tile_program/block_tile_pipeline/block_fmha_pipeline_problem.hpp>
+#include <ck/tile_program/block_tile_pipeline/block_fmha_pipeline_qr_ks_vs.hpp>
+#include <ck/tile_program/block_tile_pipeline/block_fmha_pipeline_qs_ks_vs.hpp>
+#include <ck/tile_program/tile/tile_fmha_shape.hpp>
+#include <ck/tile_program/tile/tile_fmha_traits.hpp>
+#include <ck/tile_program/block_tile/block_masking.hpp>
+
+#include "ck_tiled_fmha_forward_kernel.h"
+#include "ck_tiled_fmha_fwd_epilogue.h"
+#include "ck_tiled_fmha_fwd_tile_partitioner.h"
+#include "ck_tiled_fmha_params.h"
+#include "ck_tiled_fmha_definitions.h"
+
+#include "ck_tiled_bool_switch.h"
+#include "ck_tiled_headdim_switch.h"
+
+template <typename scalar_t, bool has_causal_mask, bool has_attn_bias, ck::index_t HDim>
+struct grouped_forward_causalmask_attnbias_dispatched
+{
+    using FmhaEpilogue =
+        FmhaFwdEpilogue<FmhaFwdEpilogueProblem<typename FmhaFwdTypeConfig<scalar_t>::OaccDataType,
+                                               typename FmhaFwdTypeConfig<scalar_t>::ODataType>>;
+
+    template <typename FmhaTraits, typename FmhaMask>
+    using FmhaPipelineProblemTemp = ck::tile_program::block::BlockFmhaPipelineProblem<
+        typename FmhaFwdTypeConfig<scalar_t>::QDataType,
+        typename FmhaFwdTypeConfig<scalar_t>::KDataType,
+        typename FmhaFwdTypeConfig<scalar_t>::VDataType,
+        typename FmhaFwdTypeConfig<scalar_t>::SaccDataType,
+        typename FmhaFwdTypeConfig<scalar_t>::SMPLComputeDataType,
+        typename FmhaFwdTypeConfig<scalar_t>::BiasDataType,
+        typename FmhaFwdTypeConfig<scalar_t>::LSEDataType,
+        typename FmhaFwdTypeConfig<scalar_t>::PDataType,
+        typename FmhaFwdTypeConfig<scalar_t>::OaccDataType,
+        typename FmhaFwdTypeConfig<scalar_t>::ODataType,
+        HDim == 32 ? 128 : 256, // BlockSize
+        FmhaFwdShape<HDim>,
+        true, // kIsGroupMode
+        FmhaMask,
+        FmhaTraits>;
+
+    static void Run(GroupedForwardParams& param, hipStream_t stream)
+    {
+        const bool has_local_attention = (param.window_size > 0) ? true : false;
+
+        BOOL_SWITCH(has_local_attention, USE_LOCAL_ATTENTION, [&] {
+            constexpr bool has_masking = has_causal_mask || USE_LOCAL_ATTENTION;
+
+            using FmhaMask =
+                ck::tile_program::block::GenericAttentionMask<has_masking, USE_LOCAL_ATTENTION>;
+
+            using FmhaShape                 = FmhaFwdShape<HDim>;
+            using FmhaTilePartitioner       = FmhaFwdTilePartitioner<FmhaShape>;
+            constexpr ck::index_t occupancy = (HDim == 64) ? 3 : 2;
+
+            constexpr bool kM0NeedPadding   = true;
+            constexpr bool kN0K1NeedPadding = true;
+
+            if constexpr(HDim == 256)
+            {
+                // BlockFmhaPipelineQSKSVS uses kQLoadOnce == false
+                bool k0n1_need_padding =
+                    !(param.K % FmhaShape::kK0 == 0 && param.Kv % FmhaShape::kN1 == 0);
+
+                BOOL_SWITCH(k0n1_need_padding, kK0N1NeedPadding, [&] {
+                    using FmhaTraits = ck::tile_program::TileFmhaTraits<kM0NeedPadding,
+                                                                        kN0K1NeedPadding,
+                                                                        kK0N1NeedPadding,
+                                                                        has_attn_bias,
+                                                                        true, // kStoreLSE
+                                                                        occupancy>;
+
+                    using FmhaPipelineProblem = FmhaPipelineProblemTemp<FmhaTraits, FmhaMask>;
+
+                    using FmhaPipeline =
+                        ck::tile_program::block::BlockFmhaPipelineQSKSVS<FmhaPipelineProblem>;
+                    using FmhaKernel =
+                        FmhaFwdKernel<FmhaTilePartitioner, FmhaPipeline, FmhaEpilogue>;
+
+                    RunWithKernel<FmhaKernel>(param, stream);
+                });
+            }
+            else
+            {
+                // BlockFmhaPipelineQRKSVS uses kQLoadOnce == true
+                bool k0n1_need_padding =
+                    !(param.K % FmhaShape::kK0BlockLength == 0 && param.Kv % FmhaShape::kN1 == 0);
+
+                BOOL_SWITCH(k0n1_need_padding, kK0N1NeedPadding, [&] {
+                    using FmhaTraits = ck::tile_program::TileFmhaTraits<kM0NeedPadding,
+                                                                        kN0K1NeedPadding,
+                                                                        kK0N1NeedPadding,
+                                                                        has_attn_bias,
+                                                                        true, // kStoreLSE
+                                                                        occupancy>;
+
+                    using FmhaPipelineProblem = FmhaPipelineProblemTemp<FmhaTraits, FmhaMask>;
+
+                    using FmhaPipeline =
+                        ck::tile_program::block::BlockFmhaPipelineQRKSVS<FmhaPipelineProblem>;
+                    using FmhaKernel =
+                        FmhaFwdKernel<FmhaTilePartitioner, FmhaPipeline, FmhaEpilogue>;
+
+                    RunWithKernel<FmhaKernel>(param, stream);
+                });
+            };
+        });
+    };
+
+    template <typename FmhaKernel>
+    static void RunWithKernel(GroupedForwardParams& param, hipStream_t stream)
+    {
+        const auto kargs = [&] {
+            return FmhaKernel::MakeKargs(
+                param.q_ptr,
+                param.k_ptr,
+                param.v_ptr,
+                param.attn_bias_ptr,
+                param.logsumexp_ptr,
+                param.out_ptr,
+                param.seqstart_q_dev_ptr,
+                param.seqstart_k_dev_ptr,
+                param.seqlen_k_dev_ptr,
+                param.K,              // hdim_q
+                param.Kv,             // hdim_v
+                param.Hq / param.Hkv, // nhead_ratio_qk
+                param.scale,
+                param.q_strides[0], // q, k, v, bias, out tensor seq-dim stride
+                param.k_strides[0],
+                param.v_strides[0],
+                param.attn_bias_strides[2],
+                param.out_strides[0],
+                param.q_strides[1], // q, k, v, bias, lse, out tensor head-dim stride
+                param.k_strides[1],
+                param.v_strides[1],
+                param.attn_bias_strides[1],
+                param.max_seqlen_q, // nhead_stride_lse
+                param.out_strides[1],
+                static_cast<CausalMaskType>(param.custom_mask_type),
+                param.window_size);
+        }();
+
+        dim3 kGridSize =
+            FmhaKernel::GridSize(param.num_batches, param.Hq, param.max_seqlen_q, param.Kv);
+        constexpr dim3 kBlockSize         = FmhaKernel::BlockSize();
+        constexpr ck::index_t kBlockPerCu = FmhaKernel::kBlockPerCu;
+
+        (void)launch_kernel<kBlockSize.x, kBlockPerCu>(
+            StreamConfig{stream, false}, FmhaKernel{}, kGridSize, kBlockSize, 0, kargs);
+    };
+};
+
+template <typename scalar_t, bool has_causal_mask, bool has_attn_bias, ck::index_t HDim>
+void run_grouped_forward_causalmask_attnbias_dispatched(GroupedForwardParams& param,
+                                                        hipStream_t stream)
+{
+    grouped_forward_causalmask_attnbias_dispatched<scalar_t, has_causal_mask, has_attn_bias, HDim>::
+        Run(param, stream);
+};
diff --git a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_forward_bp16.cpp b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_forward_bp16.cpp
new file mode 100644
index 000000000..5606f13e5
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_forward_bp16.cpp
@@ -0,0 +1,70 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <ck/ck.hpp>
+#include <ck/utility/data_type.hpp>
+#include <stdexcept>
+
+#include "ck_tiled_bool_switch.h"
+#include "ck_tiled_fmha_grouped_forward.h"
+
+// clang-format off
+extern template void run_grouped_forward_causalmask_attnbias_dispatched<ck::bhalf_t, false, true, 32>(
+    GroupedForwardParams& param, hipStream_t stream);
+extern template void run_grouped_forward_causalmask_attnbias_dispatched<ck::bhalf_t, false, false, 32>(
+    GroupedForwardParams& param, hipStream_t stream);
+extern template void run_grouped_forward_causalmask_attnbias_dispatched<ck::bhalf_t, true, true, 32>(
+    GroupedForwardParams& param, hipStream_t stream);
+extern template void run_grouped_forward_causalmask_attnbias_dispatched<ck::bhalf_t, true, false, 32>(
+    GroupedForwardParams& param, hipStream_t stream);
+
+extern template void run_grouped_forward_causalmask_attnbias_dispatched<ck::bhalf_t, false, true, 64>(
+    GroupedForwardParams& param, hipStream_t stream);
+extern template void run_grouped_forward_causalmask_attnbias_dispatched<ck::bhalf_t, false, false, 64>(
+    GroupedForwardParams& param, hipStream_t stream);
+extern template void run_grouped_forward_causalmask_attnbias_dispatched<ck::bhalf_t, true, true, 64>(
+    GroupedForwardParams& param, hipStream_t stream);
+extern template void run_grouped_forward_causalmask_attnbias_dispatched<ck::bhalf_t, true, false, 64>(
+    GroupedForwardParams& param, hipStream_t stream);
+
+extern template void run_grouped_forward_causalmask_attnbias_dispatched<ck::bhalf_t, false, true, 128>(
+    GroupedForwardParams& param, hipStream_t stream);
+extern template void run_grouped_forward_causalmask_attnbias_dispatched<ck::bhalf_t, false, false, 128>(
+    GroupedForwardParams& param, hipStream_t stream);
+extern template void run_grouped_forward_causalmask_attnbias_dispatched<ck::bhalf_t, true, true, 128>(
+    GroupedForwardParams& param, hipStream_t stream);
+extern template void run_grouped_forward_causalmask_attnbias_dispatched<ck::bhalf_t, true, false, 128>(
+    GroupedForwardParams& param, hipStream_t stream);
+
+extern template void run_grouped_forward_causalmask_attnbias_dispatched<ck::bhalf_t, false, true, 256>(
+    GroupedForwardParams& param, hipStream_t stream);
+extern template void run_grouped_forward_causalmask_attnbias_dispatched<ck::bhalf_t, false, false, 256>(
+    GroupedForwardParams& param, hipStream_t stream);
+extern template void run_grouped_forward_causalmask_attnbias_dispatched<ck::bhalf_t, true, true, 256>(
+    GroupedForwardParams& param, hipStream_t stream);
+extern template void run_grouped_forward_causalmask_attnbias_dispatched<ck::bhalf_t, true, false, 256>(
+    GroupedForwardParams& param, hipStream_t stream);
+// clang-format on
+
+void grouped_forward_bp16(GroupedForwardParams& param, hipStream_t stream)
+{
+    BOOL_SWITCH(param.has_attn_bias, HAS_ATTN_BIAS, [&] {
+        FMHA_FWD_HEADDIM_SWITCH(param.K, param.Kv, HDim, [&] {
+            if(param.custom_mask_type == 0)
+                run_grouped_forward_causalmask_attnbias_dispatched<ck::bhalf_t,
+                                                                   false,
+                                                                   HAS_ATTN_BIAS,
+                                                                   HDim>(param, stream);
+            else if(param.custom_mask_type == 1 || param.custom_mask_type == 2)
+                run_grouped_forward_causalmask_attnbias_dispatched<ck::bhalf_t,
+                                                                   true,
+                                                                   HAS_ATTN_BIAS,
+                                                                   HDim>(param, stream);
+            else
+                throw std::runtime_error("Invalid custom_mask_type value");
+        });
+    });
+};
diff --git a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_forward_fp16.cpp b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_forward_fp16.cpp
new file mode 100644
index 000000000..63b3e7b96
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_forward_fp16.cpp
@@ -0,0 +1,70 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <ck/ck.hpp>
+#include <ck/utility/data_type.hpp>
+#include <stdexcept>
+
+#include "ck_tiled_bool_switch.h"
+#include "ck_tiled_fmha_grouped_forward.h"
+
+// clang-format off
+extern template void run_grouped_forward_causalmask_attnbias_dispatched<ck::half_t, false, true, 32>(
+    GroupedForwardParams& param, hipStream_t stream);
+extern template void run_grouped_forward_causalmask_attnbias_dispatched<ck::half_t, false, false, 32>(
+    GroupedForwardParams& param, hipStream_t stream);
+extern template void run_grouped_forward_causalmask_attnbias_dispatched<ck::half_t, true, true, 32>(
+    GroupedForwardParams& param, hipStream_t stream);
+extern template void run_grouped_forward_causalmask_attnbias_dispatched<ck::half_t, true, false, 32>(
+    GroupedForwardParams& param, hipStream_t stream);
+
+extern template void run_grouped_forward_causalmask_attnbias_dispatched<ck::half_t, false, true, 64>(
+    GroupedForwardParams& param, hipStream_t stream);
+extern template void run_grouped_forward_causalmask_attnbias_dispatched<ck::half_t, false, false, 64>(
+    GroupedForwardParams& param, hipStream_t stream);
+extern template void run_grouped_forward_causalmask_attnbias_dispatched<ck::half_t, true, true, 64>(
+    GroupedForwardParams& param, hipStream_t stream);
+extern template void run_grouped_forward_causalmask_attnbias_dispatched<ck::half_t, true, false, 64>(
+    GroupedForwardParams& param, hipStream_t stream);
+
+extern template void run_grouped_forward_causalmask_attnbias_dispatched<ck::half_t, false, true, 128>(
+    GroupedForwardParams& param, hipStream_t stream);
+extern template void run_grouped_forward_causalmask_attnbias_dispatched<ck::half_t, false, false, 128>(
+    GroupedForwardParams& param, hipStream_t stream);
+extern template void run_grouped_forward_causalmask_attnbias_dispatched<ck::half_t, true, true, 128>(
+    GroupedForwardParams& param, hipStream_t stream);
+extern template void run_grouped_forward_causalmask_attnbias_dispatched<ck::half_t, true, false, 128>(
+    GroupedForwardParams& param, hipStream_t stream);
+
+extern template void run_grouped_forward_causalmask_attnbias_dispatched<ck::half_t, false, true, 256>(
+    GroupedForwardParams& param, hipStream_t stream);
+extern template void run_grouped_forward_causalmask_attnbias_dispatched<ck::half_t, false, false, 256>(
+    GroupedForwardParams& param, hipStream_t stream);
+extern template void run_grouped_forward_causalmask_attnbias_dispatched<ck::half_t, true, true, 256>(
+    GroupedForwardParams& param, hipStream_t stream);
+extern template void run_grouped_forward_causalmask_attnbias_dispatched<ck::half_t, true, false, 256>(
+    GroupedForwardParams& param, hipStream_t stream);
+// clang-format on
+
+void grouped_forward_fp16(GroupedForwardParams& param, hipStream_t stream)
+{
+    BOOL_SWITCH(param.has_attn_bias, HAS_ATTN_BIAS, [&] {
+        FMHA_FWD_HEADDIM_SWITCH(param.K, param.Kv, HDim, [&] {
+            if(param.custom_mask_type == 0)
+                run_grouped_forward_causalmask_attnbias_dispatched<ck::half_t,
+                                                                   false,
+                                                                   HAS_ATTN_BIAS,
+                                                                   HDim>(param, stream);
+            else if(param.custom_mask_type == 1 || param.custom_mask_type == 2)
+                run_grouped_forward_causalmask_attnbias_dispatched<ck::half_t,
+                                                                   true,
+                                                                   HAS_ATTN_BIAS,
+                                                                   HDim>(param, stream);
+            else
+                throw std::runtime_error("Invalid custom_mask_type value");
+        });
+    });
+};
diff --git a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_params.h b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_params.h
index 11274c5c4..e518ccaaa 100644
--- a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_params.h
+++ b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_params.h
@@ -102,7 +102,7 @@ struct GroupedForwardParams : public GroupedInferParams
     int64_t philox_offset;
 
     // completely contiguous
-    std::vector<void*> logsumexp_ptrs;
+    void* logsumexp_ptr;
 
     // TODO: need remove this after dev-op fix
     std::vector<void*> randvals_ptrs;
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_forward_bp16_no_causalmask_no_attnbias_headdim_128.cpp b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_forward_bp16_no_causalmask_no_attnbias_headdim_128.cpp
new file mode 100644
index 000000000..ab8b8f270
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_forward_bp16_no_causalmask_no_attnbias_headdim_128.cpp
@@ -0,0 +1,12 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_batched_forward.h"
+
+template void run_batched_forward_causalmask_attnbias_dispatched<ck::bhalf_t, false, false, 128>(
+    BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_forward_bp16_no_causalmask_no_attnbias_headdim_256.cpp b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_forward_bp16_no_causalmask_no_attnbias_headdim_256.cpp
new file mode 100644
index 000000000..bff652986
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_forward_bp16_no_causalmask_no_attnbias_headdim_256.cpp
@@ -0,0 +1,12 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_batched_forward.h"
+
+template void run_batched_forward_causalmask_attnbias_dispatched<ck::bhalf_t, false, false, 256>(
+    BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_forward_bp16_no_causalmask_no_attnbias_headdim_32.cpp b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_forward_bp16_no_causalmask_no_attnbias_headdim_32.cpp
new file mode 100644
index 000000000..7c7e53df5
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_forward_bp16_no_causalmask_no_attnbias_headdim_32.cpp
@@ -0,0 +1,12 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_batched_forward.h"
+
+template void run_batched_forward_causalmask_attnbias_dispatched<ck::bhalf_t, false, false, 32>(
+    BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_forward_bp16_no_causalmask_no_attnbias_headdim_64.cpp b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_forward_bp16_no_causalmask_no_attnbias_headdim_64.cpp
new file mode 100644
index 000000000..a2cefd689
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_forward_bp16_no_causalmask_no_attnbias_headdim_64.cpp
@@ -0,0 +1,12 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_batched_forward.h"
+
+template void run_batched_forward_causalmask_attnbias_dispatched<ck::bhalf_t, false, false, 64>(
+    BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_forward_bp16_no_causalmask_with_attnbias_headdim_128.cpp b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_forward_bp16_no_causalmask_with_attnbias_headdim_128.cpp
new file mode 100644
index 000000000..4bce63f3d
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_forward_bp16_no_causalmask_with_attnbias_headdim_128.cpp
@@ -0,0 +1,12 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_batched_forward.h"
+
+template void run_batched_forward_causalmask_attnbias_dispatched<ck::bhalf_t, false, true, 128>(
+    BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_forward_bp16_no_causalmask_with_attnbias_headdim_256.cpp b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_forward_bp16_no_causalmask_with_attnbias_headdim_256.cpp
new file mode 100644
index 000000000..fd9fee064
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_forward_bp16_no_causalmask_with_attnbias_headdim_256.cpp
@@ -0,0 +1,12 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_batched_forward.h"
+
+template void run_batched_forward_causalmask_attnbias_dispatched<ck::bhalf_t, false, true, 256>(
+    BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_forward_bp16_no_causalmask_with_attnbias_headdim_32.cpp b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_forward_bp16_no_causalmask_with_attnbias_headdim_32.cpp
new file mode 100644
index 000000000..8a4583c6f
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_forward_bp16_no_causalmask_with_attnbias_headdim_32.cpp
@@ -0,0 +1,12 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_batched_forward.h"
+
+template void run_batched_forward_causalmask_attnbias_dispatched<ck::bhalf_t, false, true, 32>(
+    BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_forward_bp16_no_causalmask_with_attnbias_headdim_64.cpp b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_forward_bp16_no_causalmask_with_attnbias_headdim_64.cpp
new file mode 100644
index 000000000..e3ddab117
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_forward_bp16_no_causalmask_with_attnbias_headdim_64.cpp
@@ -0,0 +1,12 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_batched_forward.h"
+
+template void run_batched_forward_causalmask_attnbias_dispatched<ck::bhalf_t, false, true, 64>(
+    BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_forward_bp16_with_causalmask_no_attnbias_headdim_128.cpp b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_forward_bp16_with_causalmask_no_attnbias_headdim_128.cpp
new file mode 100644
index 000000000..2726966fa
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_forward_bp16_with_causalmask_no_attnbias_headdim_128.cpp
@@ -0,0 +1,12 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_batched_forward.h"
+
+template void run_batched_forward_causalmask_attnbias_dispatched<ck::bhalf_t, true, false, 128>(
+    BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_forward_bp16_with_causalmask_no_attnbias_headdim_256.cpp b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_forward_bp16_with_causalmask_no_attnbias_headdim_256.cpp
new file mode 100644
index 000000000..5158b5c44
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_forward_bp16_with_causalmask_no_attnbias_headdim_256.cpp
@@ -0,0 +1,12 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_batched_forward.h"
+
+template void run_batched_forward_causalmask_attnbias_dispatched<ck::bhalf_t, true, false, 256>(
+    BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_forward_bp16_with_causalmask_no_attnbias_headdim_32.cpp b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_forward_bp16_with_causalmask_no_attnbias_headdim_32.cpp
new file mode 100644
index 000000000..25a8f9316
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_forward_bp16_with_causalmask_no_attnbias_headdim_32.cpp
@@ -0,0 +1,12 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_batched_forward.h"
+
+template void run_batched_forward_causalmask_attnbias_dispatched<ck::bhalf_t, true, false, 32>(
+    BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_forward_bp16_with_causalmask_no_attnbias_headdim_64.cpp b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_forward_bp16_with_causalmask_no_attnbias_headdim_64.cpp
new file mode 100644
index 000000000..b174cd641
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_forward_bp16_with_causalmask_no_attnbias_headdim_64.cpp
@@ -0,0 +1,12 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_batched_forward.h"
+
+template void run_batched_forward_causalmask_attnbias_dispatched<ck::bhalf_t, true, false, 64>(
+    BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_forward_bp16_with_causalmask_with_attnbias_headdim_128.cpp b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_forward_bp16_with_causalmask_with_attnbias_headdim_128.cpp
new file mode 100644
index 000000000..941488b93
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_forward_bp16_with_causalmask_with_attnbias_headdim_128.cpp
@@ -0,0 +1,12 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_batched_forward.h"
+
+template void run_batched_forward_causalmask_attnbias_dispatched<ck::bhalf_t, true, true, 128>(
+    BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_forward_bp16_with_causalmask_with_attnbias_headdim_256.cpp b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_forward_bp16_with_causalmask_with_attnbias_headdim_256.cpp
new file mode 100644
index 000000000..986dfe9df
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_forward_bp16_with_causalmask_with_attnbias_headdim_256.cpp
@@ -0,0 +1,12 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_batched_forward.h"
+
+template void run_batched_forward_causalmask_attnbias_dispatched<ck::bhalf_t, true, true, 256>(
+    BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_forward_bp16_with_causalmask_with_attnbias_headdim_32.cpp b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_forward_bp16_with_causalmask_with_attnbias_headdim_32.cpp
new file mode 100644
index 000000000..d1590b38d
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_forward_bp16_with_causalmask_with_attnbias_headdim_32.cpp
@@ -0,0 +1,12 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_batched_forward.h"
+
+template void run_batched_forward_causalmask_attnbias_dispatched<ck::bhalf_t, true, true, 32>(
+    BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_forward_bp16_with_causalmask_with_attnbias_headdim_64.cpp b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_forward_bp16_with_causalmask_with_attnbias_headdim_64.cpp
new file mode 100644
index 000000000..b245f5715
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_forward_bp16_with_causalmask_with_attnbias_headdim_64.cpp
@@ -0,0 +1,12 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_batched_forward.h"
+
+template void run_batched_forward_causalmask_attnbias_dispatched<ck::bhalf_t, true, true, 64>(
+    BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_forward_fp16_no_causalmask_no_attnbias_headdim_128.cpp b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_forward_fp16_no_causalmask_no_attnbias_headdim_128.cpp
new file mode 100644
index 000000000..2bf4db3f8
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_forward_fp16_no_causalmask_no_attnbias_headdim_128.cpp
@@ -0,0 +1,12 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_batched_forward.h"
+
+template void run_batched_forward_causalmask_attnbias_dispatched<ck::half_t, false, false, 128>(
+    BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_forward_fp16_no_causalmask_no_attnbias_headdim_256.cpp b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_forward_fp16_no_causalmask_no_attnbias_headdim_256.cpp
new file mode 100644
index 000000000..41029c7dc
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_forward_fp16_no_causalmask_no_attnbias_headdim_256.cpp
@@ -0,0 +1,12 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_batched_forward.h"
+
+template void run_batched_forward_causalmask_attnbias_dispatched<ck::half_t, false, false, 256>(
+    BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_forward_fp16_no_causalmask_no_attnbias_headdim_32.cpp b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_forward_fp16_no_causalmask_no_attnbias_headdim_32.cpp
new file mode 100644
index 000000000..c0df0271a
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_forward_fp16_no_causalmask_no_attnbias_headdim_32.cpp
@@ -0,0 +1,12 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_batched_forward.h"
+
+template void run_batched_forward_causalmask_attnbias_dispatched<ck::half_t, false, false, 32>(
+    BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_forward_fp16_no_causalmask_no_attnbias_headdim_64.cpp b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_forward_fp16_no_causalmask_no_attnbias_headdim_64.cpp
new file mode 100644
index 000000000..52b129eb2
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_forward_fp16_no_causalmask_no_attnbias_headdim_64.cpp
@@ -0,0 +1,12 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_batched_forward.h"
+
+template void run_batched_forward_causalmask_attnbias_dispatched<ck::half_t, false, false, 64>(
+    BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_forward_fp16_no_causalmask_with_attnbias_headdim_128.cpp b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_forward_fp16_no_causalmask_with_attnbias_headdim_128.cpp
new file mode 100644
index 000000000..b8a496fed
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_forward_fp16_no_causalmask_with_attnbias_headdim_128.cpp
@@ -0,0 +1,12 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_batched_forward.h"
+
+template void run_batched_forward_causalmask_attnbias_dispatched<ck::half_t, false, true, 128>(
+    BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_forward_fp16_no_causalmask_with_attnbias_headdim_256.cpp b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_forward_fp16_no_causalmask_with_attnbias_headdim_256.cpp
new file mode 100644
index 000000000..53a9328c6
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_forward_fp16_no_causalmask_with_attnbias_headdim_256.cpp
@@ -0,0 +1,12 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_batched_forward.h"
+
+template void run_batched_forward_causalmask_attnbias_dispatched<ck::half_t, false, true, 256>(
+    BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_forward_fp16_no_causalmask_with_attnbias_headdim_32.cpp b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_forward_fp16_no_causalmask_with_attnbias_headdim_32.cpp
new file mode 100644
index 000000000..5ee4e29f4
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_forward_fp16_no_causalmask_with_attnbias_headdim_32.cpp
@@ -0,0 +1,12 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_batched_forward.h"
+
+template void run_batched_forward_causalmask_attnbias_dispatched<ck::half_t, false, true, 32>(
+    BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_forward_fp16_no_causalmask_with_attnbias_headdim_64.cpp b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_forward_fp16_no_causalmask_with_attnbias_headdim_64.cpp
new file mode 100644
index 000000000..3d9791d33
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_forward_fp16_no_causalmask_with_attnbias_headdim_64.cpp
@@ -0,0 +1,12 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_batched_forward.h"
+
+template void run_batched_forward_causalmask_attnbias_dispatched<ck::half_t, false, true, 64>(
+    BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_forward_fp16_with_causalmask_no_attnbias_headdim_128.cpp b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_forward_fp16_with_causalmask_no_attnbias_headdim_128.cpp
new file mode 100644
index 000000000..ef0eae81d
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_forward_fp16_with_causalmask_no_attnbias_headdim_128.cpp
@@ -0,0 +1,12 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_batched_forward.h"
+
+template void run_batched_forward_causalmask_attnbias_dispatched<ck::half_t, true, false, 128>(
+    BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_forward_fp16_with_causalmask_no_attnbias_headdim_256.cpp b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_forward_fp16_with_causalmask_no_attnbias_headdim_256.cpp
new file mode 100644
index 000000000..a5870aacf
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_forward_fp16_with_causalmask_no_attnbias_headdim_256.cpp
@@ -0,0 +1,12 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_batched_forward.h"
+
+template void run_batched_forward_causalmask_attnbias_dispatched<ck::half_t, true, false, 256>(
+    BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_forward_fp16_with_causalmask_no_attnbias_headdim_32.cpp b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_forward_fp16_with_causalmask_no_attnbias_headdim_32.cpp
new file mode 100644
index 000000000..a8cc8231a
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_forward_fp16_with_causalmask_no_attnbias_headdim_32.cpp
@@ -0,0 +1,12 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_batched_forward.h"
+
+template void run_batched_forward_causalmask_attnbias_dispatched<ck::half_t, true, false, 32>(
+    BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_forward_fp16_with_causalmask_no_attnbias_headdim_64.cpp b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_forward_fp16_with_causalmask_no_attnbias_headdim_64.cpp
new file mode 100644
index 000000000..c7b13e92e
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_forward_fp16_with_causalmask_no_attnbias_headdim_64.cpp
@@ -0,0 +1,12 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_batched_forward.h"
+
+template void run_batched_forward_causalmask_attnbias_dispatched<ck::half_t, true, false, 64>(
+    BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_forward_fp16_with_causalmask_with_attnbias_headdim_128.cpp b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_forward_fp16_with_causalmask_with_attnbias_headdim_128.cpp
new file mode 100644
index 000000000..4911aba00
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_forward_fp16_with_causalmask_with_attnbias_headdim_128.cpp
@@ -0,0 +1,12 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_batched_forward.h"
+
+template void run_batched_forward_causalmask_attnbias_dispatched<ck::half_t, true, true, 128>(
+    BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_forward_fp16_with_causalmask_with_attnbias_headdim_256.cpp b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_forward_fp16_with_causalmask_with_attnbias_headdim_256.cpp
new file mode 100644
index 000000000..42e4a7a93
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_forward_fp16_with_causalmask_with_attnbias_headdim_256.cpp
@@ -0,0 +1,12 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_batched_forward.h"
+
+template void run_batched_forward_causalmask_attnbias_dispatched<ck::half_t, true, true, 256>(
+    BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_forward_fp16_with_causalmask_with_attnbias_headdim_32.cpp b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_forward_fp16_with_causalmask_with_attnbias_headdim_32.cpp
new file mode 100644
index 000000000..d43b65227
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_forward_fp16_with_causalmask_with_attnbias_headdim_32.cpp
@@ -0,0 +1,12 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_batched_forward.h"
+
+template void run_batched_forward_causalmask_attnbias_dispatched<ck::half_t, true, true, 32>(
+    BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_forward_fp16_with_causalmask_with_attnbias_headdim_64.cpp b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_forward_fp16_with_causalmask_with_attnbias_headdim_64.cpp
new file mode 100644
index 000000000..bce8348c6
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_forward_fp16_with_causalmask_with_attnbias_headdim_64.cpp
@@ -0,0 +1,12 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_batched_forward.h"
+
+template void run_batched_forward_causalmask_attnbias_dispatched<ck::half_t, true, true, 64>(
+    BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_forward_bp16_no_causalmask_no_attnbias_headdim_128.cpp b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_forward_bp16_no_causalmask_no_attnbias_headdim_128.cpp
new file mode 100644
index 000000000..ede42cd70
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_forward_bp16_no_causalmask_no_attnbias_headdim_128.cpp
@@ -0,0 +1,12 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_grouped_forward.h"
+
+template void run_grouped_forward_causalmask_attnbias_dispatched<ck::bhalf_t, false, false, 128>(
+    GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_forward_bp16_no_causalmask_no_attnbias_headdim_256.cpp b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_forward_bp16_no_causalmask_no_attnbias_headdim_256.cpp
new file mode 100644
index 000000000..4452ef80e
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_forward_bp16_no_causalmask_no_attnbias_headdim_256.cpp
@@ -0,0 +1,12 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_grouped_forward.h"
+
+template void run_grouped_forward_causalmask_attnbias_dispatched<ck::bhalf_t, false, false, 256>(
+    GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_forward_bp16_no_causalmask_no_attnbias_headdim_32.cpp b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_forward_bp16_no_causalmask_no_attnbias_headdim_32.cpp
new file mode 100644
index 000000000..7de8d370c
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_forward_bp16_no_causalmask_no_attnbias_headdim_32.cpp
@@ -0,0 +1,12 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_grouped_forward.h"
+
+template void run_grouped_forward_causalmask_attnbias_dispatched<ck::bhalf_t, false, false, 32>(
+    GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_forward_bp16_no_causalmask_no_attnbias_headdim_64.cpp b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_forward_bp16_no_causalmask_no_attnbias_headdim_64.cpp
new file mode 100644
index 000000000..66f084dc4
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_forward_bp16_no_causalmask_no_attnbias_headdim_64.cpp
@@ -0,0 +1,12 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_grouped_forward.h"
+
+template void run_grouped_forward_causalmask_attnbias_dispatched<ck::bhalf_t, false, false, 64>(
+    GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_forward_bp16_no_causalmask_with_attnbias_headdim_128.cpp b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_forward_bp16_no_causalmask_with_attnbias_headdim_128.cpp
new file mode 100644
index 000000000..894b979d0
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_forward_bp16_no_causalmask_with_attnbias_headdim_128.cpp
@@ -0,0 +1,12 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_grouped_forward.h"
+
+template void run_grouped_forward_causalmask_attnbias_dispatched<ck::bhalf_t, false, true, 128>(
+    GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_forward_bp16_no_causalmask_with_attnbias_headdim_256.cpp b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_forward_bp16_no_causalmask_with_attnbias_headdim_256.cpp
new file mode 100644
index 000000000..53346a196
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_forward_bp16_no_causalmask_with_attnbias_headdim_256.cpp
@@ -0,0 +1,12 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_grouped_forward.h"
+
+template void run_grouped_forward_causalmask_attnbias_dispatched<ck::bhalf_t, false, true, 256>(
+    GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_forward_bp16_no_causalmask_with_attnbias_headdim_32.cpp b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_forward_bp16_no_causalmask_with_attnbias_headdim_32.cpp
new file mode 100644
index 000000000..fc0329da0
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_forward_bp16_no_causalmask_with_attnbias_headdim_32.cpp
@@ -0,0 +1,12 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_grouped_forward.h"
+
+template void run_grouped_forward_causalmask_attnbias_dispatched<ck::bhalf_t, false, true, 32>(
+    GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_forward_bp16_no_causalmask_with_attnbias_headdim_64.cpp b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_forward_bp16_no_causalmask_with_attnbias_headdim_64.cpp
new file mode 100644
index 000000000..4e169225d
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_forward_bp16_no_causalmask_with_attnbias_headdim_64.cpp
@@ -0,0 +1,12 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_grouped_forward.h"
+
+template void run_grouped_forward_causalmask_attnbias_dispatched<ck::bhalf_t, false, true, 64>(
+    GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_forward_bp16_with_causalmask_no_attnbias_headdim_128.cpp b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_forward_bp16_with_causalmask_no_attnbias_headdim_128.cpp
new file mode 100644
index 000000000..19e997418
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_forward_bp16_with_causalmask_no_attnbias_headdim_128.cpp
@@ -0,0 +1,12 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_grouped_forward.h"
+
+template void run_grouped_forward_causalmask_attnbias_dispatched<ck::bhalf_t, true, false, 128>(
+    GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_forward_bp16_with_causalmask_no_attnbias_headdim_256.cpp b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_forward_bp16_with_causalmask_no_attnbias_headdim_256.cpp
new file mode 100644
index 000000000..86cb616c3
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_forward_bp16_with_causalmask_no_attnbias_headdim_256.cpp
@@ -0,0 +1,12 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_grouped_forward.h"
+
+template void run_grouped_forward_causalmask_attnbias_dispatched<ck::bhalf_t, true, false, 256>(
+    GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_forward_bp16_with_causalmask_no_attnbias_headdim_32.cpp b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_forward_bp16_with_causalmask_no_attnbias_headdim_32.cpp
new file mode 100644
index 000000000..f9b6f38eb
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_forward_bp16_with_causalmask_no_attnbias_headdim_32.cpp
@@ -0,0 +1,12 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_grouped_forward.h"
+
+template void run_grouped_forward_causalmask_attnbias_dispatched<ck::bhalf_t, true, false, 32>(
+    GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_forward_bp16_with_causalmask_no_attnbias_headdim_64.cpp b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_forward_bp16_with_causalmask_no_attnbias_headdim_64.cpp
new file mode 100644
index 000000000..64433cc55
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_forward_bp16_with_causalmask_no_attnbias_headdim_64.cpp
@@ -0,0 +1,12 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_grouped_forward.h"
+
+template void run_grouped_forward_causalmask_attnbias_dispatched<ck::bhalf_t, true, false, 64>(
+    GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_forward_bp16_with_causalmask_with_attnbias_headdim_128.cpp b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_forward_bp16_with_causalmask_with_attnbias_headdim_128.cpp
new file mode 100644
index 000000000..b2df4367b
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_forward_bp16_with_causalmask_with_attnbias_headdim_128.cpp
@@ -0,0 +1,12 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_grouped_forward.h"
+
+template void run_grouped_forward_causalmask_attnbias_dispatched<ck::bhalf_t, true, true, 128>(
+    GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_forward_bp16_with_causalmask_with_attnbias_headdim_256.cpp b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_forward_bp16_with_causalmask_with_attnbias_headdim_256.cpp
new file mode 100644
index 000000000..de62061b5
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_forward_bp16_with_causalmask_with_attnbias_headdim_256.cpp
@@ -0,0 +1,12 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_grouped_forward.h"
+
+template void run_grouped_forward_causalmask_attnbias_dispatched<ck::bhalf_t, true, true, 256>(
+    GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_forward_bp16_with_causalmask_with_attnbias_headdim_32.cpp b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_forward_bp16_with_causalmask_with_attnbias_headdim_32.cpp
new file mode 100644
index 000000000..604a12985
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_forward_bp16_with_causalmask_with_attnbias_headdim_32.cpp
@@ -0,0 +1,12 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_grouped_forward.h"
+
+template void run_grouped_forward_causalmask_attnbias_dispatched<ck::bhalf_t, true, true, 32>(
+    GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_forward_bp16_with_causalmask_with_attnbias_headdim_64.cpp b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_forward_bp16_with_causalmask_with_attnbias_headdim_64.cpp
new file mode 100644
index 000000000..985fe0a74
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_forward_bp16_with_causalmask_with_attnbias_headdim_64.cpp
@@ -0,0 +1,12 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_grouped_forward.h"
+
+template void run_grouped_forward_causalmask_attnbias_dispatched<ck::bhalf_t, true, true, 64>(
+    GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_forward_fp16_no_causalmask_no_attnbias_headdim_128.cpp b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_forward_fp16_no_causalmask_no_attnbias_headdim_128.cpp
new file mode 100644
index 000000000..7c905fcc1
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_forward_fp16_no_causalmask_no_attnbias_headdim_128.cpp
@@ -0,0 +1,12 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_grouped_forward.h"
+
+template void run_grouped_forward_causalmask_attnbias_dispatched<ck::half_t, false, false, 128>(
+    GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_forward_fp16_no_causalmask_no_attnbias_headdim_256.cpp b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_forward_fp16_no_causalmask_no_attnbias_headdim_256.cpp
new file mode 100644
index 000000000..bcd9cbf9a
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_forward_fp16_no_causalmask_no_attnbias_headdim_256.cpp
@@ -0,0 +1,12 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_grouped_forward.h"
+
+template void run_grouped_forward_causalmask_attnbias_dispatched<ck::half_t, false, false, 256>(
+    GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_forward_fp16_no_causalmask_no_attnbias_headdim_32.cpp b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_forward_fp16_no_causalmask_no_attnbias_headdim_32.cpp
new file mode 100644
index 000000000..0be43523f
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_forward_fp16_no_causalmask_no_attnbias_headdim_32.cpp
@@ -0,0 +1,12 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_grouped_forward.h"
+
+template void run_grouped_forward_causalmask_attnbias_dispatched<ck::half_t, false, false, 32>(
+    GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_forward_fp16_no_causalmask_no_attnbias_headdim_64.cpp b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_forward_fp16_no_causalmask_no_attnbias_headdim_64.cpp
new file mode 100644
index 000000000..fd490972a
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_forward_fp16_no_causalmask_no_attnbias_headdim_64.cpp
@@ -0,0 +1,12 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_grouped_forward.h"
+
+template void run_grouped_forward_causalmask_attnbias_dispatched<ck::half_t, false, false, 64>(
+    GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_forward_fp16_no_causalmask_with_attnbias_headdim_128.cpp b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_forward_fp16_no_causalmask_with_attnbias_headdim_128.cpp
new file mode 100644
index 000000000..0722ee7df
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_forward_fp16_no_causalmask_with_attnbias_headdim_128.cpp
@@ -0,0 +1,12 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_grouped_forward.h"
+
+template void run_grouped_forward_causalmask_attnbias_dispatched<ck::half_t, false, true, 128>(
+    GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_forward_fp16_no_causalmask_with_attnbias_headdim_256.cpp b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_forward_fp16_no_causalmask_with_attnbias_headdim_256.cpp
new file mode 100644
index 000000000..9d6178ab8
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_forward_fp16_no_causalmask_with_attnbias_headdim_256.cpp
@@ -0,0 +1,12 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_grouped_forward.h"
+
+template void run_grouped_forward_causalmask_attnbias_dispatched<ck::half_t, false, true, 256>(
+    GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_forward_fp16_no_causalmask_with_attnbias_headdim_32.cpp b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_forward_fp16_no_causalmask_with_attnbias_headdim_32.cpp
new file mode 100644
index 000000000..db9e4fbd5
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_forward_fp16_no_causalmask_with_attnbias_headdim_32.cpp
@@ -0,0 +1,12 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_grouped_forward.h"
+
+template void run_grouped_forward_causalmask_attnbias_dispatched<ck::half_t, false, true, 32>(
+    GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_forward_fp16_no_causalmask_with_attnbias_headdim_64.cpp b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_forward_fp16_no_causalmask_with_attnbias_headdim_64.cpp
new file mode 100644
index 000000000..ae0842444
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_forward_fp16_no_causalmask_with_attnbias_headdim_64.cpp
@@ -0,0 +1,12 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_grouped_forward.h"
+
+template void run_grouped_forward_causalmask_attnbias_dispatched<ck::half_t, false, true, 64>(
+    GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_forward_fp16_with_causalmask_no_attnbias_headdim_128.cpp b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_forward_fp16_with_causalmask_no_attnbias_headdim_128.cpp
new file mode 100644
index 000000000..fe1c3f8c0
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_forward_fp16_with_causalmask_no_attnbias_headdim_128.cpp
@@ -0,0 +1,12 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_grouped_forward.h"
+
+template void run_grouped_forward_causalmask_attnbias_dispatched<ck::half_t, true, false, 128>(
+    GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_forward_fp16_with_causalmask_no_attnbias_headdim_256.cpp b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_forward_fp16_with_causalmask_no_attnbias_headdim_256.cpp
new file mode 100644
index 000000000..d246e0dca
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_forward_fp16_with_causalmask_no_attnbias_headdim_256.cpp
@@ -0,0 +1,12 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_grouped_forward.h"
+
+template void run_grouped_forward_causalmask_attnbias_dispatched<ck::half_t, true, false, 256>(
+    GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_forward_fp16_with_causalmask_no_attnbias_headdim_32.cpp b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_forward_fp16_with_causalmask_no_attnbias_headdim_32.cpp
new file mode 100644
index 000000000..611d7bfb8
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_forward_fp16_with_causalmask_no_attnbias_headdim_32.cpp
@@ -0,0 +1,12 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_grouped_forward.h"
+
+template void run_grouped_forward_causalmask_attnbias_dispatched<ck::half_t, true, false, 32>(
+    GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_forward_fp16_with_causalmask_no_attnbias_headdim_64.cpp b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_forward_fp16_with_causalmask_no_attnbias_headdim_64.cpp
new file mode 100644
index 000000000..2b9d7a2c6
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_forward_fp16_with_causalmask_no_attnbias_headdim_64.cpp
@@ -0,0 +1,12 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_grouped_forward.h"
+
+template void run_grouped_forward_causalmask_attnbias_dispatched<ck::half_t, true, false, 64>(
+    GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_forward_fp16_with_causalmask_with_attnbias_headdim_128.cpp b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_forward_fp16_with_causalmask_with_attnbias_headdim_128.cpp
new file mode 100644
index 000000000..165e61310
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_forward_fp16_with_causalmask_with_attnbias_headdim_128.cpp
@@ -0,0 +1,12 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_grouped_forward.h"
+
+template void run_grouped_forward_causalmask_attnbias_dispatched<ck::half_t, true, true, 128>(
+    GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_forward_fp16_with_causalmask_with_attnbias_headdim_256.cpp b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_forward_fp16_with_causalmask_with_attnbias_headdim_256.cpp
new file mode 100644
index 000000000..5496abe4c
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_forward_fp16_with_causalmask_with_attnbias_headdim_256.cpp
@@ -0,0 +1,12 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_grouped_forward.h"
+
+template void run_grouped_forward_causalmask_attnbias_dispatched<ck::half_t, true, true, 256>(
+    GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_forward_fp16_with_causalmask_with_attnbias_headdim_32.cpp b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_forward_fp16_with_causalmask_with_attnbias_headdim_32.cpp
new file mode 100644
index 000000000..deb14598a
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_forward_fp16_with_causalmask_with_attnbias_headdim_32.cpp
@@ -0,0 +1,12 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_grouped_forward.h"
+
+template void run_grouped_forward_causalmask_attnbias_dispatched<ck::half_t, true, true, 32>(
+    GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_forward_fp16_with_causalmask_with_attnbias_headdim_64.cpp b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_forward_fp16_with_causalmask_with_attnbias_headdim_64.cpp
new file mode 100644
index 000000000..f803b0f05
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_forward_fp16_with_causalmask_with_attnbias_headdim_64.cpp
@@ -0,0 +1,12 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_grouped_forward.h"
+
+template void run_grouped_forward_causalmask_attnbias_dispatched<ck::half_t, true, true, 64>(
+    GroupedForwardParams& param, hipStream_t stream);

From 9357a2405b0fcfaa839fb14fa467b8c3715c4c54 Mon Sep 17 00:00:00 2001
From: Qianfeng Zhang <Qianfeng.Zhang@amd.com>
Date: Mon, 22 Jan 2024 13:54:21 +0000
Subject: [PATCH 382/641] Set SUPPORTED_MAX_K=256 in ck.py

---
 tests/test_mem_eff_attention_ck.py | 3 ---
 xformers/ops/fmha/ck.py            | 2 +-
 2 files changed, 1 insertion(+), 4 deletions(-)

diff --git a/tests/test_mem_eff_attention_ck.py b/tests/test_mem_eff_attention_ck.py
index 2caf187be..313185cbb 100644
--- a/tests/test_mem_eff_attention_ck.py
+++ b/tests/test_mem_eff_attention_ck.py
@@ -437,9 +437,6 @@ def test_forward(opFW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv, packed, fmt, **kwargs)
         kv,
     ) = opFW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv
 
-    if k > 256 or kv > 256:
-        pytest.skip("head-dim size bigger than 256 is not supported by CK-FlashAttention")
-
     if packed and not (k == kv and q_len == kv_len):
         pytest.skip(
             f"packed incompatible with `k ({k}) != kv ({kv})` or `q_len ({q_len}) != kv_len ({kv_len})`"
diff --git a/xformers/ops/fmha/ck.py b/xformers/ops/fmha/ck.py
index 200f6a41b..0ecc7f317 100644
--- a/xformers/ops/fmha/ck.py
+++ b/xformers/ops/fmha/ck.py
@@ -157,7 +157,7 @@ class FwOp(AttentionFwOpBase):
     OPERATOR = get_xformers_operator("efficient_attention_forward_ck")
     SUPPORTED_DEVICES: Set[str] = {"cuda"}
     SUPPORTED_DTYPES: Set[torch.dtype] = {torch.half, torch.bfloat16}
-    SUPPORTED_MAX_K = 65536
+    SUPPORTED_MAX_K = 256 
     
     if use_ck_tiled: 
         SUPPORTED_ATTN_BIAS_TYPES: Set[Any] = {

From 04ddd4c3f5f306f7a883f8b3baaa191233e89ce8 Mon Sep 17 00:00:00 2001
From: Max Podkorytov <4273004+tenpercent@users.noreply.github.com>
Date: Wed, 24 Jan 2024 00:06:36 +0000
Subject: [PATCH 383/641] fix index in split-k attention

---
 .../csrc/attention/hip_fmha/attention_forward_splitk.cpp  | 8 ++++----
 .../hip_fmha/ck_attention_forward_decoder_splitk.h        | 4 ++--
 2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/xformers/csrc/attention/hip_fmha/attention_forward_splitk.cpp b/xformers/csrc/attention/hip_fmha/attention_forward_splitk.cpp
index 8ac38a440..ae514108a 100644
--- a/xformers/csrc/attention/hip_fmha/attention_forward_splitk.cpp
+++ b/xformers/csrc/attention/hip_fmha/attention_forward_splitk.cpp
@@ -923,9 +923,9 @@ static void test_split_attention(int32_t padding, int32_t batch_size, int32_t Hq
 {
     auto [XQ, K, V, seqlen] = generate_inputs(padding, batch_size, Hq, Hkv);
 
-    auto [O_ref, m_ref, l_ref] = split_attention_torch(XQ, K, V, seqlen, split_k, /* block_size */ 16);
+    auto [O_ref, m_ref, l_ref] = split_attention_torch(XQ, K, V, seqlen, split_k, /* block_size */ kWavefrontsPerBlock * 16);
 
-    auto [O_hip, m_hip, l_hip] = split_attention_hip(XQ, K, V, seqlen, split_k, /* wavefronts_per_block */ 1);
+    auto [O_hip, m_hip, l_hip] = split_attention_hip(XQ, K, V, seqlen, split_k, kWavefrontsPerBlock);
 
     auto O_percent_mismatch = percent_mismatch(O_ref, O_hip);
     auto m_percent_mismatch = percent_mismatch(m_ref, m_hip);
@@ -949,7 +949,7 @@ static void test_split_attention(int32_t padding, int32_t batch_size, int32_t Hq
 static void test_split_reduce(int32_t padding, int32_t batch_size, int32_t Hq, int32_t Hkv, int32_t split_k) {
     auto [XQ, K, V, seqlen] = generate_inputs(padding, batch_size, Hq, Hkv);
 
-    auto [O_ref, m_ref, l_ref] = split_attention_hip(XQ, K, V, seqlen, split_k, /* wavefronts_per_block */ 1);
+    auto [O_ref, m_ref, l_ref] = split_attention_hip(XQ, K, V, seqlen, split_k, kWavefrontsPerBlock);
 
     auto O_torch = split_reduce_torch(O_ref, m_ref.unsqueeze(0), l_ref.unsqueeze(0), split_k);
     auto O_hip = split_reduce_hip(O_ref, m_ref, l_ref, split_k);
@@ -965,7 +965,7 @@ static void test_splitk_decoder_e2e_correctness(int32_t padding, int32_t batch_s
 
     double qk_scale        = 1. / sqrt(XQ.size(-1));
 
-    auto result = efficient_attention_forward_decoder_splitk_ck_impl</* threads_per_wavefront */ 64, /* wavefronts_per_block */ 1>(
+    auto result = efficient_attention_forward_decoder_splitk_ck_impl<kThreadsPerWavefront, kWavefrontsPerBlock>(
         XQ, K, V, seqlen, qk_scale, split_k);
     auto gold_result = efficient_attention_forward_decoder_splitk_torch(XQ, K, V, seqlen, qk_scale, /* split_k */ 1, /* block_size */ 1);
     auto e2e_mismatch = percent_mismatch(result, gold_result);
diff --git a/xformers/csrc/attention/hip_fmha/ck_attention_forward_decoder_splitk.h b/xformers/csrc/attention/hip_fmha/ck_attention_forward_decoder_splitk.h
index a4c61f127..38ca82600 100644
--- a/xformers/csrc/attention/hip_fmha/ck_attention_forward_decoder_splitk.h
+++ b/xformers/csrc/attention/hip_fmha/ck_attention_forward_decoder_splitk.h
@@ -356,7 +356,7 @@ efficient_attention_forward_decoder_splitk_ck_kernel(const scalar_t* __restrict_
 
     // each wavefront computes partial sum of exp.
     compute_t softmax_denominator = 0.0f;
-    for(int32_t t = tt_low + thread_linear_idx; t < tt_tail_high; t += threads_per_block)
+    for(int32_t t = n_unrolled_loops * dtt * split_idx + thread_linear_idx; t < tt_tail_high; t += threads_per_block)
     {
         softmax_denominator += ck::math::exp(smem[t - n_unrolled_loops * dtt * split_idx] - max_qk_acc);
     }
@@ -384,7 +384,7 @@ efficient_attention_forward_decoder_splitk_ck_kernel(const scalar_t* __restrict_
     }
 
     // now, compute the normalization across all threads.
-    for(int32_t t = tt_low + thread_linear_idx; t < tt_tail_high; t += threads_per_block)
+    for(int32_t t = n_unrolled_loops * dtt * split_idx + thread_linear_idx; t < tt_tail_high; t += threads_per_block)
     {
         // softmax scale by sumexp will happen in the reduction kernel
         smem[t - n_unrolled_loops * dtt * split_idx] = ck::math::exp(smem[t - n_unrolled_loops * dtt * split_idx] - max_qk_acc);

From c922d7333296f5caddbbcf04445cb66417b64bf6 Mon Sep 17 00:00:00 2001
From: Max Podkorytov <4273004+tenpercent@users.noreply.github.com>
Date: Wed, 24 Jan 2024 01:33:30 +0000
Subject: [PATCH 384/641] fix index in softmax reduce and complete fixing
 wavefronts per block optimization

---
 .../hip_fmha/attention_forward_splitk.cpp        |  2 +-
 .../ck_attention_forward_decoder_splitk.h        | 16 ++++++++++++----
 2 files changed, 13 insertions(+), 5 deletions(-)

diff --git a/xformers/csrc/attention/hip_fmha/attention_forward_splitk.cpp b/xformers/csrc/attention/hip_fmha/attention_forward_splitk.cpp
index ae514108a..6a1eb8044 100644
--- a/xformers/csrc/attention/hip_fmha/attention_forward_splitk.cpp
+++ b/xformers/csrc/attention/hip_fmha/attention_forward_splitk.cpp
@@ -8,7 +8,7 @@
 
 namespace {
 constexpr int32_t kThreadsPerWavefront = 64;
-constexpr int32_t kWavefrontsPerBlock  = 1;
+constexpr int32_t kWavefrontsPerBlock  = 8;
 constexpr int32_t K_MAX                = 4 * kThreadsPerWavefront;
 } // namespace
 
diff --git a/xformers/csrc/attention/hip_fmha/ck_attention_forward_decoder_splitk.h b/xformers/csrc/attention/hip_fmha/ck_attention_forward_decoder_splitk.h
index 38ca82600..5237231ff 100644
--- a/xformers/csrc/attention/hip_fmha/ck_attention_forward_decoder_splitk.h
+++ b/xformers/csrc/attention/hip_fmha/ck_attention_forward_decoder_splitk.h
@@ -355,10 +355,15 @@ efficient_attention_forward_decoder_splitk_ck_kernel(const scalar_t* __restrict_
     }
 
     // each wavefront computes partial sum of exp.
+    { // softmax reduce begin
     compute_t softmax_denominator = 0.0f;
-    for(int32_t t = n_unrolled_loops * dtt * split_idx + thread_linear_idx; t < tt_tail_high; t += threads_per_block)
+    const int32_t t_low = n_unrolled_loops * dtt * split_idx;
+    const int32_t t_high = (split_idx + 1 < split_k) ? n_unrolled_loops * dtt * (split_idx + 1) : t_max;
+    for(int32_t t = t_low + thread_linear_idx; 
+        t < t_high; 
+        t += threads_per_block)
     {
-        softmax_denominator += ck::math::exp(smem[t - n_unrolled_loops * dtt * split_idx] - max_qk_acc);
+        softmax_denominator += ck::math::exp(smem[t - t_low] - max_qk_acc);
     }
     softmax_denominator =
         wavefrontReduce(softmax_denominator, [](auto a, auto b) { return a + b; });
@@ -384,12 +389,15 @@ efficient_attention_forward_decoder_splitk_ck_kernel(const scalar_t* __restrict_
     }
 
     // now, compute the normalization across all threads.
-    for(int32_t t = n_unrolled_loops * dtt * split_idx + thread_linear_idx; t < tt_tail_high; t += threads_per_block)
+    for(int32_t t = t_low + thread_linear_idx; 
+        t < t_high;
+        t += threads_per_block)
     {
         // softmax scale by sumexp will happen in the reduction kernel
-        smem[t - n_unrolled_loops * dtt * split_idx] = ck::math::exp(smem[t - n_unrolled_loops * dtt * split_idx] - max_qk_acc);
+        smem[t - t_low] = ck::math::exp(smem[t - t_low] - max_qk_acc);
     }
     __syncthreads();
+    } // softmax reduce end
 
     // Split T across wavefronts in a block
     // each wavefront compute sum(t_subset) P[t] * V[t_subset, d]

From f66696599b591621e9b0beeb3eb910816c5386d2 Mon Sep 17 00:00:00 2001
From: Max Podkorytov <4273004+tenpercent@users.noreply.github.com>
Date: Wed, 24 Jan 2024 01:36:54 +0000
Subject: [PATCH 385/641] clang-format-10

---
 .../hip_fmha/attention_forward_splitk.cpp     | 274 +++++++++++-------
 .../ck_attention_forward_decoder_splitk.h     | 107 ++++---
 2 files changed, 220 insertions(+), 161 deletions(-)

diff --git a/xformers/csrc/attention/hip_fmha/attention_forward_splitk.cpp b/xformers/csrc/attention/hip_fmha/attention_forward_splitk.cpp
index 6a1eb8044..5737fbfbe 100644
--- a/xformers/csrc/attention/hip_fmha/attention_forward_splitk.cpp
+++ b/xformers/csrc/attention/hip_fmha/attention_forward_splitk.cpp
@@ -178,8 +178,8 @@ at::Tensor efficient_attention_forward_decoder_splitk_ck_impl(
     auto H = XQ.size(3);
     auto K = XQ.size(4);
 
-    auto O_splits = at::empty({split_k, B, M, G, H, K}, XQ.options());
-    auto split_max = at::empty({B, M, G, H, split_k}, XQ.options().dtype(at::kFloat));
+    auto O_splits     = at::empty({split_k, B, M, G, H, K}, XQ.options());
+    auto split_max    = at::empty({B, M, G, H, split_k}, XQ.options().dtype(at::kFloat));
     auto split_sumexp = at::empty_like(split_max);
 
     efficient_attention_forward_decoder_splitk_ck_out_impl<ThreadsPerWavefront, WavefrontsPerBlock>(
@@ -241,8 +241,13 @@ TORCH_LIBRARY_IMPL(xformers, CUDA, m)
 
 // clang-format on
 
-static std::tuple<at::Tensor, at::Tensor, at::Tensor> split_attention_torch(
-    const at::Tensor& Q, const at::Tensor& K, const at::Tensor& V, const at::Tensor& k_seqlens, const int32_t split_k, const int32_t block_size)
+static std::tuple<at::Tensor, at::Tensor, at::Tensor>
+split_attention_torch(const at::Tensor& Q,
+                      const at::Tensor& K,
+                      const at::Tensor& V,
+                      const at::Tensor& k_seqlens,
+                      const int32_t split_k,
+                      const int32_t block_size)
 {
     auto Q_scaled = at::div(Q, sqrt(Q.size(-1)));
 
@@ -250,30 +255,36 @@ static std::tuple<at::Tensor, at::Tensor, at::Tensor> split_attention_torch(
     std::vector<at::Tensor> m_splits;
     std::vector<at::Tensor> l_splits;
 
-    for (int32_t split_idx = 0; split_idx < split_k; ++split_idx) {
+    for(int32_t split_idx = 0; split_idx < split_k; ++split_idx)
+    {
         std::vector<at::Tensor> O_batch;
         std::vector<at::Tensor> m_batch;
         std::vector<at::Tensor> l_batch;
 
-        for(size_t b = 0; b < k_seqlens.numel(); ++b) {
-            auto seqlen = k_seqlens[b].item<int64_t>();
+        for(size_t b = 0; b < k_seqlens.numel(); ++b)
+        {
+            auto seqlen         = k_seqlens[b].item<int64_t>();
             const int64_t t_low = split_idx * (seqlen / split_k / block_size) * block_size;
-            const int64_t t_high = (split_idx + 1 < split_k) 
-                                 ? (1 + split_idx) * (seqlen / split_k / block_size) * block_size
-                                 : seqlen;
+            const int64_t t_high =
+                (split_idx + 1 < split_k)
+                    ? (1 + split_idx) * (seqlen / split_k / block_size) * block_size
+                    : seqlen;
 
             const bool empty = t_low == t_high;
 
-            auto S        = at::einsum("mghk, nghk -> mghn",
-                            {Q_scaled[b], at::slice(K[b], /*dim*/ 0, /*start*/ t_low, /*end*/ t_high)},
-                            /* einsum eval path */ at::nullopt);
-            auto m = empty ? at::empty_like(S) : std::get<0>(at::max(S, /* dim */ -1, /* keepdim */ true));
+            auto S = at::einsum(
+                "mghk, nghk -> mghn",
+                {Q_scaled[b], at::slice(K[b], /*dim*/ 0, /*start*/ t_low, /*end*/ t_high)},
+                /* einsum eval path */ at::nullopt);
+            auto m = empty ? at::empty_like(S)
+                           : std::get<0>(at::max(S, /* dim */ -1, /* keepdim */ true));
             auto s = at::exp(at::sub(S, m));
             auto l = at::sum(s, /* dim */ -1, /* keepdim */ true);
-            auto O        = at::einsum("mghn, nghk -> mghk", 
-                            {s, at::slice(V[b], /*dim*/ 0, /*start*/ t_low, /*end*/ t_high)}, 
-                            /* einsum eval path */ at::nullopt);
-            if (empty) {
+            auto O = at::einsum("mghn, nghk -> mghk",
+                                {s, at::slice(V[b], /*dim*/ 0, /*start*/ t_low, /*end*/ t_high)},
+                                /* einsum eval path */ at::nullopt);
+            if(empty)
+            {
                 m = at::empty_like(at::slice(O, -1, 0, 1));
                 l = at::zeros_like(m);
                 m.fill_(ck::NumericLimits<float>::Lowest());
@@ -299,36 +310,39 @@ static std::tuple<at::Tensor, at::Tensor, at::Tensor> split_attention_torch(
     return std::make_tuple(O_cat, m_cat, l_cat);
 }
 
-static at::Tensor
-split_reduce_torch(const at::Tensor& O_splits, const at::Tensor& m_splits, const at::Tensor& l_splits, int32_t split_k)
-{   
-    auto O = at::zeros_like(at::slice(O_splits, 0, 0, 1));
-    auto global_max = at::empty_like(at::slice(m_splits, -1, 0, 1)).fill_(-65535.);
+static at::Tensor split_reduce_torch(const at::Tensor& O_splits,
+                                     const at::Tensor& m_splits,
+                                     const at::Tensor& l_splits,
+                                     int32_t split_k)
+{
+    auto O             = at::zeros_like(at::slice(O_splits, 0, 0, 1));
+    auto global_max    = at::empty_like(at::slice(m_splits, -1, 0, 1)).fill_(-65535.);
     auto global_sumexp = at::zeros_like(global_max);
 
-    for (int32_t split_idx = 0; split_idx < split_k; ++split_idx) {
-        auto local_O = at::slice(O_splits, 0, split_idx, split_idx + 1);
-        auto local_max = at::slice(m_splits, -1, split_idx, split_idx + 1);
+    for(int32_t split_idx = 0; split_idx < split_k; ++split_idx)
+    {
+        auto local_O      = at::slice(O_splits, 0, split_idx, split_idx + 1);
+        auto local_max    = at::slice(m_splits, -1, split_idx, split_idx + 1);
         auto local_sumexp = at::slice(l_splits, -1, split_idx, split_idx + 1);
 
         auto log_alpha = at::neg(at::abs(at::sub(local_max, global_max)));
-        auto alpha = at::exp(log_alpha);
+        auto alpha     = at::exp(log_alpha);
         alpha.nan_to_num_(1.);
 
-        auto pick_new = at::less(local_max, global_max);
+        auto pick_new          = at::less(local_max, global_max);
         auto pick_current_coef = at::where(pick_new, 1., alpha);
-        auto pick_new_coef = at::where(pick_new, alpha, 1.);
+        auto pick_new_coef     = at::where(pick_new, alpha, 1.);
 
-        O = at::add(at::mul(pick_current_coef, O), at::mul(pick_new_coef, local_O));
-        global_sumexp = at::add(at::mul(pick_current_coef, global_sumexp), at::mul(pick_new_coef, local_sumexp));
-        global_max = at::max(local_max, global_max);
+        O             = at::add(at::mul(pick_current_coef, O), at::mul(pick_new_coef, local_O));
+        global_sumexp = at::add(at::mul(pick_current_coef, global_sumexp),
+                                at::mul(pick_new_coef, local_sumexp));
+        global_max    = at::max(local_max, global_max);
     }
-    
+
     return at::div(O, global_sumexp);
 }
 
-static at::Tensor 
-efficient_attention_forward_decoder_splitk_torch(
+static at::Tensor efficient_attention_forward_decoder_splitk_torch(
     const at::Tensor& XQ,                 // [B, 1, G, H, D]
     const at::Tensor& cache_K,            // [B, KV_M_MAX, G, H or 1, D]
     const at::Tensor& cache_V,            // [B, KV_M_MAX, G, H or 1, D]
@@ -337,8 +351,9 @@ efficient_attention_forward_decoder_splitk_torch(
     int32_t split_k,
     int32_t block_size)
 {
-    auto [O_split, m, l] = split_attention_torch(XQ, cache_K, cache_V, *seq_kv_lens, split_k, block_size);
-    auto O               = split_reduce_torch(O_split, m, l, split_k);
+    auto [O_split, m, l] =
+        split_attention_torch(XQ, cache_K, cache_V, *seq_kv_lens, split_k, block_size);
+    auto O = split_reduce_torch(O_split, m, l, split_k);
     return O.reshape_as(XQ);
 }
 
@@ -602,11 +617,10 @@ struct FMHADecoderSplitReduceDeviceOp : public BaseOperator
                  const dim3 grid_dim,
                  const dim3 block_dim,
                  const size_t lds_bytes)
-            : 
-              split_O(split_O),
+            : split_O(split_O),
               split_max(split_max),
               split_sumexp(split_sumexp),
-              O(O),              
+              O(O),
               O_size_m(O_size_m),
               O_size_g(O_size_g),
               O_size_h(O_size_h),
@@ -722,12 +736,13 @@ struct FMHADecoderSplitReduceDeviceOp : public BaseOperator
 } // namespace tensor_operation
 } // namespace ck
 
-static std::tuple<at::Tensor, at::Tensor, at::Tensor> split_attention_hip(const at::Tensor& XQ,
-                                                                           const at::Tensor& K,
-                                                                           const at::Tensor& V,
-                                                                           const at::Tensor& seqlen,
-                                                                           const int32_t split_k,
-                                                                           const int32_t wavefronts_per_block)
+static std::tuple<at::Tensor, at::Tensor, at::Tensor>
+split_attention_hip(const at::Tensor& XQ,
+                    const at::Tensor& K,
+                    const at::Tensor& V,
+                    const at::Tensor& seqlen,
+                    const int32_t split_k,
+                    const int32_t wavefronts_per_block)
 {
 
     at::OptionalDeviceGuard guard(XQ.device());
@@ -738,13 +753,14 @@ static std::tuple<at::Tensor, at::Tensor, at::Tensor> split_attention_hip(const
     auto H = XQ.size(3);
     auto D = XQ.size(4);
 
-    double qk_scale        = 1. / sqrt(D);
+    double qk_scale = 1. / sqrt(D);
 
-    auto O                    = at::empty_like(XQ);
-    constexpr auto rank       = 5;
-    auto split_O              = at::zeros({split_k, B, M, G, H, D}, XQ.options());
-    auto split_max            = at::empty({B, M, G, H, split_k}, XQ.options().dtype(at::kFloat)).fill_(ck::NumericLimits<float>::Lowest());
-    auto split_sumexp         = at::zeros_like(split_max);
+    auto O              = at::empty_like(XQ);
+    constexpr auto rank = 5;
+    auto split_O        = at::zeros({split_k, B, M, G, H, D}, XQ.options());
+    auto split_max      = at::empty({B, M, G, H, split_k}, XQ.options().dtype(at::kFloat))
+                         .fill_(ck::NumericLimits<float>::Lowest());
+    auto split_sumexp = at::zeros_like(split_max);
 
     dim3 blocks(B * H * M * G, split_k);
     dim3 threads(kThreadsPerWavefront, wavefronts_per_block);
@@ -765,17 +781,18 @@ static std::tuple<at::Tensor, at::Tensor, at::Tensor> split_attention_hip(const
         XQ.scalar_type(),
         "efficient_attention_forward_decoder_split_attention_ck_test",
         [&] {
-            using ck_data_t   = c10_to_data_t<scalar_t>::type;
-            using device_op_t = ck::tensor_operation::device::FMHADecoderSplitAttentionDeviceOp<ck_data_t>;
-            auto op           = device_op_t{};
+            using ck_data_t = c10_to_data_t<scalar_t>::type;
+            using device_op_t =
+                ck::tensor_operation::device::FMHADecoderSplitAttentionDeviceOp<ck_data_t>;
+            auto op = device_op_t{};
 
             auto XQ_acc = XQ.packed_accessor32<scalar_t, rank, at::RestrictPtrTraits>();
             auto K_acc  = K.packed_accessor64<scalar_t, rank, at::RestrictPtrTraits>();
             auto V_acc  = V.packed_accessor64<scalar_t, rank, at::RestrictPtrTraits>();
             auto split_O_acc =
                 split_O.packed_accessor32<scalar_t, 1 + rank, at::RestrictPtrTraits>();
-            auto O_acc   = O.packed_accessor32<scalar_t, rank, at::RestrictPtrTraits>();
-            auto seq_acc = seqlen.packed_accessor32<int32_t, 1, at::RestrictPtrTraits>();
+            auto O_acc         = O.packed_accessor32<scalar_t, rank, at::RestrictPtrTraits>();
+            auto seq_acc       = seqlen.packed_accessor32<int32_t, 1, at::RestrictPtrTraits>();
             auto split_max_acc = split_max.packed_accessor32<float, rank, at::RestrictPtrTraits>();
             auto split_sumexp_acc =
                 split_sumexp.packed_accessor32<float, rank, at::RestrictPtrTraits>();
@@ -815,8 +832,11 @@ static std::tuple<at::Tensor, at::Tensor, at::Tensor> split_attention_hip(const
     return std::make_tuple(split_O, split_max, split_sumexp);
 }
 
-static
-at::Tensor split_reduce_hip(const at::Tensor& split_O, const at::Tensor& split_max, const at::Tensor& split_sumexp, const int32_t split_k) {
+static at::Tensor split_reduce_hip(const at::Tensor& split_O,
+                                   const at::Tensor& split_max,
+                                   const at::Tensor& split_sumexp,
+                                   const int32_t split_k)
+{
     at::OptionalDeviceGuard guard(split_O.device());
 
     auto B = split_O.size(1);
@@ -829,7 +849,7 @@ at::Tensor split_reduce_hip(const at::Tensor& split_O, const at::Tensor& split_m
     TORCH_CHECK_EQ(split_k, split_max.size(-1));
     TORCH_CHECK_EQ(split_k, split_sumexp.size(-1));
 
-    constexpr auto rank       = 5;
+    constexpr auto rank = 5;
 
     TORCH_CHECK_EQ(split_O.dim(), 1 + rank);
     TORCH_CHECK_EQ(split_max.dim(), rank);
@@ -837,7 +857,7 @@ at::Tensor split_reduce_hip(const at::Tensor& split_O, const at::Tensor& split_m
 
     auto O = at::zeros({B, M, G, H, D}, split_O.options());
 
-    auto stream            = at::cuda::getCurrentHIPStream().stream();
+    auto stream    = at::cuda::getCurrentHIPStream().stream();
     auto lds_bytes = 0;
 
     dim3 blocks(B * H * M * G);
@@ -850,13 +870,14 @@ at::Tensor split_reduce_hip(const at::Tensor& split_O, const at::Tensor& split_m
         O.scalar_type(),
         "efficient_attention_forward_decoder_split_reduce_ck_test",
         [&] {
-            using ck_data_t   = c10_to_data_t<scalar_t>::type;
-            using device_op_t = ck::tensor_operation::device::FMHADecoderSplitReduceDeviceOp<ck_data_t>;
-            auto op           = device_op_t{};
+            using ck_data_t = c10_to_data_t<scalar_t>::type;
+            using device_op_t =
+                ck::tensor_operation::device::FMHADecoderSplitReduceDeviceOp<ck_data_t>;
+            auto op = device_op_t{};
 
             auto split_O_acc =
                 split_O.packed_accessor32<scalar_t, 1 + rank, at::RestrictPtrTraits>();
-            auto O_acc   = O.packed_accessor32<scalar_t, rank, at::RestrictPtrTraits>();
+            auto O_acc         = O.packed_accessor32<scalar_t, rank, at::RestrictPtrTraits>();
             auto split_max_acc = split_max.packed_accessor32<float, rank, at::RestrictPtrTraits>();
             auto split_sumexp_acc =
                 split_sumexp.packed_accessor32<float, rank, at::RestrictPtrTraits>();
@@ -907,25 +928,29 @@ generate_inputs(const int32_t padding,
     auto XQ          = at::randn({B, num_queries, G, Hq, D}, options);
     auto K           = (G == 1) ? at::randn({B, padding, G, Hkv, D}, options)
                       : at::randn({B, padding, G, 1, D}, options).expand({B, padding, G, Hq, D});
-    auto V = at::randn_like(K);
+    auto V      = at::randn_like(K);
     auto seqlen = at::randint(num_queries, padding + 1, {B}, int_options);
 
     return std::make_tuple(XQ, K, V, seqlen);
 }
 
-static float percent_mismatch(const at::Tensor& a, const at::Tensor& b) {
-    auto mask = at::isclose(a, b, /*atol*/ 1e-3, /*rtol*/ 1e-5, /*equal_nan*/ false);
+static float percent_mismatch(const at::Tensor& a, const at::Tensor& b)
+{
+    auto mask          = at::isclose(a, b, /*atol*/ 1e-3, /*rtol*/ 1e-5, /*equal_nan*/ false);
     auto percent_match = at::sum(mask.to(torch::kFloat32)) / mask.numel();
     return 1. - percent_match.item<float>();
 }
 
-static void test_split_attention(int32_t padding, int32_t batch_size, int32_t Hq, int32_t Hkv, int32_t split_k)
+static void
+test_split_attention(int32_t padding, int32_t batch_size, int32_t Hq, int32_t Hkv, int32_t split_k)
 {
     auto [XQ, K, V, seqlen] = generate_inputs(padding, batch_size, Hq, Hkv);
 
-    auto [O_ref, m_ref, l_ref] = split_attention_torch(XQ, K, V, seqlen, split_k, /* block_size */ kWavefrontsPerBlock * 16);
+    auto [O_ref, m_ref, l_ref] =
+        split_attention_torch(XQ, K, V, seqlen, split_k, /* block_size */ kWavefrontsPerBlock * 16);
 
-    auto [O_hip, m_hip, l_hip] = split_attention_hip(XQ, K, V, seqlen, split_k, kWavefrontsPerBlock);
+    auto [O_hip, m_hip, l_hip] =
+        split_attention_hip(XQ, K, V, seqlen, split_k, kWavefrontsPerBlock);
 
     auto O_percent_mismatch = percent_mismatch(O_ref, O_hip);
     auto m_percent_mismatch = percent_mismatch(m_ref, m_hip);
@@ -935,64 +960,96 @@ static void test_split_attention(int32_t padding, int32_t batch_size, int32_t Hq
     //     std::cout << "ref: " << m_ref << std::endl << "hip: " << m_hip << std::endl;
     // }
 
-    printf("[Test split attention] Padding=%d BS=%d Hq=%d Hkv=%d split_k=%d Mismatched split_O elements percentage: %.2f Mismatched split_max elements percentage: %.2f Mismatched split_sumexp elements percentage: %.2f\n", 
-            padding,
-            batch_size,
-            Hq,
-            Hkv,
-            split_k,
-            O_percent_mismatch,
-            m_percent_mismatch,
-            l_percent_mismatch);
+    printf("[Test split attention] Padding=%d BS=%d Hq=%d Hkv=%d split_k=%d Mismatched split_O "
+           "elements percentage: %.2f Mismatched split_max elements percentage: %.2f Mismatched "
+           "split_sumexp elements percentage: %.2f\n",
+           padding,
+           batch_size,
+           Hq,
+           Hkv,
+           split_k,
+           O_percent_mismatch,
+           m_percent_mismatch,
+           l_percent_mismatch);
 }
 
-static void test_split_reduce(int32_t padding, int32_t batch_size, int32_t Hq, int32_t Hkv, int32_t split_k) {
+static void
+test_split_reduce(int32_t padding, int32_t batch_size, int32_t Hq, int32_t Hkv, int32_t split_k)
+{
     auto [XQ, K, V, seqlen] = generate_inputs(padding, batch_size, Hq, Hkv);
 
-    auto [O_ref, m_ref, l_ref] = split_attention_hip(XQ, K, V, seqlen, split_k, kWavefrontsPerBlock);
+    auto [O_ref, m_ref, l_ref] =
+        split_attention_hip(XQ, K, V, seqlen, split_k, kWavefrontsPerBlock);
 
     auto O_torch = split_reduce_torch(O_ref, m_ref.unsqueeze(0), l_ref.unsqueeze(0), split_k);
-    auto O_hip = split_reduce_hip(O_ref, m_ref, l_ref, split_k);
+    auto O_hip   = split_reduce_hip(O_ref, m_ref, l_ref, split_k);
 
     auto hip_torch_mismatch = percent_mismatch(O_hip, O_torch);
-    printf("[Test split reduce] Padding=%d BS=%d Hq=%d Hkv=%d split_k=%d Mismatched elements percentage: %.2f \n", 
-        padding, batch_size, Hq, Hkv, split_k, hip_torch_mismatch);
+    printf("[Test split reduce] Padding=%d BS=%d Hq=%d Hkv=%d split_k=%d Mismatched elements "
+           "percentage: %.2f \n",
+           padding,
+           batch_size,
+           Hq,
+           Hkv,
+           split_k,
+           hip_torch_mismatch);
 }
 
-static void test_splitk_decoder_e2e_correctness(int32_t padding, int32_t batch_size, int32_t Hq, int32_t Hkv, int32_t split_k)
+static void test_splitk_decoder_e2e_correctness(
+    int32_t padding, int32_t batch_size, int32_t Hq, int32_t Hkv, int32_t split_k)
 {
     auto [XQ, K, V, seqlen] = generate_inputs(padding, batch_size, Hq, Hkv);
 
-    double qk_scale        = 1. / sqrt(XQ.size(-1));
+    double qk_scale = 1. / sqrt(XQ.size(-1));
 
-    auto result = efficient_attention_forward_decoder_splitk_ck_impl<kThreadsPerWavefront, kWavefrontsPerBlock>(
+    auto result = efficient_attention_forward_decoder_splitk_ck_impl<kThreadsPerWavefront,
+                                                                     kWavefrontsPerBlock>(
         XQ, K, V, seqlen, qk_scale, split_k);
-    auto gold_result = efficient_attention_forward_decoder_splitk_torch(XQ, K, V, seqlen, qk_scale, /* split_k */ 1, /* block_size */ 1);
+    auto gold_result = efficient_attention_forward_decoder_splitk_torch(
+        XQ, K, V, seqlen, qk_scale, /* split_k */ 1, /* block_size */ 1);
     auto e2e_mismatch = percent_mismatch(result, gold_result);
-    printf("[Test e2e split-k decoder] Padding=%d BS=%d Hq=%d Hkv=%d split_k=%d Mismatched elements percentage: %.2f\n", padding, batch_size, Hq, Hkv, split_k, e2e_mismatch);
+    printf("[Test e2e split-k decoder] Padding=%d BS=%d Hq=%d Hkv=%d split_k=%d Mismatched "
+           "elements percentage: %.2f\n",
+           padding,
+           batch_size,
+           Hq,
+           Hkv,
+           split_k,
+           e2e_mismatch);
 }
 
 int main(int argc, char** argv)
 {
     if(argc == 1)
     {
-        for (auto padding : {32, 4096}) {
-            for (auto batch_size : {1, 8}) {
-                for (auto Hq : { 16 }) {
-                    for (auto Hkv : { 16 }) {
-                        for (auto split_k : {1, 2, 4, 8, 16}) {
-                            test_splitk_decoder_e2e_correctness(padding, batch_size, Hq, Hkv, split_k);
+        for(auto padding : {32, 4096})
+        {
+            for(auto batch_size : {1, 8})
+            {
+                for(auto Hq : {16})
+                {
+                    for(auto Hkv : {16})
+                    {
+                        for(auto split_k : {1, 2, 4, 8, 16})
+                        {
+                            test_splitk_decoder_e2e_correctness(
+                                padding, batch_size, Hq, Hkv, split_k);
                         }
                     }
                 }
             }
         }
 
-        for (auto padding : {32, 4096}) {
-            for (auto batch_size : {1, 8}) {
-                for (auto Hq : { 16 }) {
-                    for (auto Hkv : { 16 }) {
-                        for (auto split_k : {1, 2, 4, 8, 16}) {
+        for(auto padding : {32, 4096})
+        {
+            for(auto batch_size : {1, 8})
+            {
+                for(auto Hq : {16})
+                {
+                    for(auto Hkv : {16})
+                    {
+                        for(auto split_k : {1, 2, 4, 8, 16})
+                        {
                             test_split_attention(padding, batch_size, Hq, Hkv, split_k);
                         }
                     }
@@ -1000,11 +1057,16 @@ int main(int argc, char** argv)
             }
         }
 
-        for (auto padding : {32, 4096}) {
-            for (auto batch_size : {1, 8}) {
-                for (auto Hq : { 16 }) {
-                    for (auto Hkv : { 16 }) {
-                        for (auto split_k : {1, 2}) {
+        for(auto padding : {32, 4096})
+        {
+            for(auto batch_size : {1, 8})
+            {
+                for(auto Hq : {16})
+                {
+                    for(auto Hkv : {16})
+                    {
+                        for(auto split_k : {1, 2})
+                        {
                             test_split_reduce(padding, batch_size, Hq, Hkv, split_k);
                         }
                     }
diff --git a/xformers/csrc/attention/hip_fmha/ck_attention_forward_decoder_splitk.h b/xformers/csrc/attention/hip_fmha/ck_attention_forward_decoder_splitk.h
index 5237231ff..bdd51d596 100644
--- a/xformers/csrc/attention/hip_fmha/ck_attention_forward_decoder_splitk.h
+++ b/xformers/csrc/attention/hip_fmha/ck_attention_forward_decoder_splitk.h
@@ -133,16 +133,16 @@ __global__ void efficient_attention_forward_decoder_splitk_reduce_ck_kernel(
         {
             O_split_compute.arr[i] = ck::type_convert<compute_t>(O_split_data.arr[i]);
         }
-        compute_t local_max         = *(split_max + blockIdx.x * split_k + split_idx);
-        compute_t local_sumexp      = *(split_sumexp + blockIdx.x * split_k + split_idx);
-        
-        compute_t log_alpha         = -std::abs(local_max - global_max);
-        compute_t alpha             = isnan(log_alpha) ? compute_t{1.} : ck::math::exp(log_alpha);
-        
+        compute_t local_max    = *(split_max + blockIdx.x * split_k + split_idx);
+        compute_t local_sumexp = *(split_sumexp + blockIdx.x * split_k + split_idx);
+
+        compute_t log_alpha = -std::abs(local_max - global_max);
+        compute_t alpha     = isnan(log_alpha) ? compute_t{1.} : ck::math::exp(log_alpha);
+
         bool pick_new               = local_max < global_max;
         compute_t pick_current_coef = pick_new ? 1. : alpha;
         compute_t pick_new_coef     = pick_new ? alpha : 1.;
-        
+
         global_sumexp = pick_current_coef * global_sumexp + pick_new_coef * local_sumexp;
         global_O_compute.vec =
             pick_current_coef * global_O_compute.vec + pick_new_coef * O_split_compute.vec;
@@ -207,8 +207,8 @@ efficient_attention_forward_decoder_splitk_ck_kernel(const scalar_t* __restrict_
     // tokens.
     const int32_t t_max = seq_kv_lens ? seq_kv_lens[b] : K_size_m;
 
-    const int32_t lane_idx              = threadIdx.x;
-    const int32_t wavefront_idx         = threadIdx.y;
+    const int32_t lane_idx      = threadIdx.x;
+    const int32_t wavefront_idx = threadIdx.y;
     // TODO: `threads_per_wavefront` and `wavefronts_per_block` may be compile time constants;
     // investigate when optimizing
     const int32_t threads_per_wavefront = blockDim.x;
@@ -255,7 +255,7 @@ efficient_attention_forward_decoder_splitk_ck_kernel(const scalar_t* __restrict_
 
     data_vec_t k_loads[n_loop_unroll] = {};
 
-    const auto dtt              = wavefronts_per_block * n_loop_unroll;
+    const auto dtt = wavefronts_per_block * n_loop_unroll;
     // only last split gets the tail.
     // the first (split_k - 1) splits have a number of iterations divisible by `dtt`
     const auto n_unrolled_loops = t_max / dtt / split_k; // +1?
@@ -283,12 +283,11 @@ efficient_attention_forward_decoder_splitk_ck_kernel(const scalar_t* __restrict_
         for(auto ttt = 0; ttt < n_loop_unroll; ++ttt)
         {
             compute_t qk_acc = 0;
-            ck::inner_product<data_vec_t, data_vec_t, compute_t>(
-                q_thread, k_loads[ttt], qk_acc);
-            qk_acc  *= qk_scale;
+            ck::inner_product<data_vec_t, data_vec_t, compute_t>(q_thread, k_loads[ttt], qk_acc);
+            qk_acc *= qk_scale;
 
-            qk_acc = wavefrontReduce(qk_acc, [](auto a, auto b) { return a + b; });
-            max_qk_acc   = ck::math::max(qk_acc, max_qk_acc);
+            qk_acc     = wavefrontReduce(qk_acc, [](auto a, auto b) { return a + b; });
+            max_qk_acc = ck::math::max(qk_acc, max_qk_acc);
             if(lane_idx == 0)
             {
                 smem[tt + ttt - n_unrolled_loops * dtt * split_idx] = qk_acc;
@@ -356,47 +355,44 @@ efficient_attention_forward_decoder_splitk_ck_kernel(const scalar_t* __restrict_
 
     // each wavefront computes partial sum of exp.
     { // softmax reduce begin
-    compute_t softmax_denominator = 0.0f;
-    const int32_t t_low = n_unrolled_loops * dtt * split_idx;
-    const int32_t t_high = (split_idx + 1 < split_k) ? n_unrolled_loops * dtt * (split_idx + 1) : t_max;
-    for(int32_t t = t_low + thread_linear_idx; 
-        t < t_high; 
-        t += threads_per_block)
-    {
-        softmax_denominator += ck::math::exp(smem[t - t_low] - max_qk_acc);
-    }
-    softmax_denominator =
-        wavefrontReduce(softmax_denominator, [](auto a, auto b) { return a + b; });
+        compute_t softmax_denominator = 0.0f;
+        const int32_t t_low           = n_unrolled_loops * dtt * split_idx;
+        const int32_t t_high =
+            (split_idx + 1 < split_k) ? n_unrolled_loops * dtt * (split_idx + 1) : t_max;
+        for(int32_t t = t_low + thread_linear_idx; t < t_high; t += threads_per_block)
+        {
+            softmax_denominator += ck::math::exp(smem[t - t_low] - max_qk_acc);
+        }
+        softmax_denominator =
+            wavefrontReduce(softmax_denominator, [](auto a, auto b) { return a + b; });
 
-    if(lane_idx == 0)
-    {
-        smem[KV_M_MAX + wavefront_idx] = softmax_denominator;
-    }
-    __syncthreads();
+        if(lane_idx == 0)
+        {
+            smem[KV_M_MAX + wavefront_idx] = softmax_denominator;
+        }
+        __syncthreads();
 
-    // now, compute sum of exp(x - max(x)) over all intermediate results.
-    softmax_denominator = 0.0;
-    if(lane_idx < wavefronts_per_block)
-    {
-        softmax_denominator = smem[KV_M_MAX + lane_idx];
-    }
-    softmax_denominator =
-        wavefrontReduce(softmax_denominator, [](auto a, auto b) { return a + b; });
+        // now, compute sum of exp(x - max(x)) over all intermediate results.
+        softmax_denominator = 0.0;
+        if(lane_idx < wavefronts_per_block)
+        {
+            softmax_denominator = smem[KV_M_MAX + lane_idx];
+        }
+        softmax_denominator =
+            wavefrontReduce(softmax_denominator, [](auto a, auto b) { return a + b; });
 
-    if(wavefront_idx == 0 && lane_idx == 0)
-    {
-        split_sumexp[blockIdx.x * split_k + split_idx] = softmax_denominator;
-    }
+        if(wavefront_idx == 0 && lane_idx == 0)
+        {
+            split_sumexp[blockIdx.x * split_k + split_idx] = softmax_denominator;
+        }
 
-    // now, compute the normalization across all threads.
-    for(int32_t t = t_low + thread_linear_idx; 
-        t < t_high;
-        t += threads_per_block)
-    {
-        // softmax scale by sumexp will happen in the reduction kernel
-        smem[t - t_low] = ck::math::exp(smem[t - t_low] - max_qk_acc);
-    }
-    __syncthreads();
+        // now, compute the normalization across all threads.
+        for(int32_t t = t_low + thread_linear_idx; t < t_high; t += threads_per_block)
+        {
+            // softmax scale by sumexp will happen in the reduction kernel
+            smem[t - t_low] = ck::math::exp(smem[t - t_low] - max_qk_acc);
+        }
+        __syncthreads();
     } // softmax reduce end
 
     // Split T across wavefronts in a block
@@ -439,7 +435,7 @@ efficient_attention_forward_decoder_splitk_ck_kernel(const scalar_t* __restrict_
                     load_v<data_t, data_vec_t>(
                         cache_V_base + t * K_stride_m, lane_idx, &k_loads[ttt]);
                     ps[ttt] = smem[t - n_unrolled_loops * dtt * split_idx];
-                    o_acc = scalar_scale_acc<data_t, vec_size>(o_acc, k_loads[ttt], ps[ttt]);
+                    o_acc   = scalar_scale_acc<data_t, vec_size>(o_acc, k_loads[ttt], ps[ttt]);
                 }
             }
         }
@@ -632,8 +628,9 @@ struct FMHADecoderSplitKDeviceOp : public BaseOperator
         using Argument = DeviceOp::Argument;
         float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{})
         {
-            // std::cout << arg.str() << std::endl << "stream_id: " << stream_config.stream_id_ << std::endl;
-            
+            // std::cout << arg.str() << std::endl << "stream_id: " << stream_config.stream_id_ <<
+            // std::endl;
+
             auto threads_per_wavefront = arg.block_dim.x;
 
             auto Q_size_k_alignment_necessary = 0;

From ecaf6239154e98cd1ae8be3631494154942fd529 Mon Sep 17 00:00:00 2001
From: Qianfeng Zhang <Qianfeng.Zhang@amd.com>
Date: Wed, 24 Jan 2024 16:24:32 +0000
Subject: [PATCH 386/641] Fix v_dram_transposed transpose transform in the
 kernel

---
 .../hip_fmha/ck_tiled_fmha_forward_kernel.h   | 57 +++----------------
 1 file changed, 7 insertions(+), 50 deletions(-)

diff --git a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_forward_kernel.h b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_forward_kernel.h
index acabd1e7a..6240a6d6d 100644
--- a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_forward_kernel.h
+++ b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_forward_kernel.h
@@ -472,56 +472,13 @@ struct FmhaFwdKernel
                     transform_tensor_view(v_dram_naive,
                                           make_tuple(make_pass_through_transform(kargs.seqlen_k),
                                                      make_pass_through_transform(kargs.hdim_v)),
-                                          make_tuple(Sequence<1>{}, Sequence<0>{}),
-                                          make_tuple(Sequence<0>{}, Sequence<1>{}));
-
-                /// FIXME: The return value of v_dram_naive.GetTensorDescriptor().GetLength() is
-                /// same as
-                ///   v_dram_transposed.GetTensorDescriptor().GetLength(). Replace following
-                ///   if-clause by pad_tensor_view() call after fixing this issue.
-                if constexpr(kK0N1NeedPadding || kN0K1NeedPadding)
-                {
-                    const auto transform_n1 = [&] {
-                        if constexpr(kK0N1NeedPadding)
-                        {
-                            const index_t n1_pad_length =
-                                FmhaPipeline::kN1 *
-                                    ck::math::integer_divide_ceil(kargs.hdim_v, FmhaPipeline::kN1) -
-                                kargs.hdim_v;
-
-                            return make_right_pad_transform(kargs.hdim_v, n1_pad_length);
-                        }
-                        else
-                        {
-                            return make_pass_through_transform(kargs.hdim_v);
-                        }
-                    }();
-
-                    const auto transform_k1 = [&] {
-                        if constexpr(kN0K1NeedPadding)
-                        {
-                            const index_t k1_pad_length =
-                                FmhaPipeline::kK1 * ck::math::integer_divide_ceil(
-                                                        kargs.seqlen_k, FmhaPipeline::kK1) -
-                                kargs.seqlen_k;
-
-                            return make_right_pad_transform(kargs.seqlen_k, k1_pad_length);
-                        }
-                        else
-                        {
-                            return make_pass_through_transform(kargs.seqlen_k);
-                        }
-                    }();
-
-                    return transform_tensor_view(v_dram_transposed,
-                                                 make_tuple(transform_n1, transform_k1),
-                                                 make_tuple(Sequence<0>{}, Sequence<1>{}),
-                                                 make_tuple(Sequence<0>{}, Sequence<1>{}));
-                }
-                else
-                {
-                    return v_dram_transposed;
-                }
+                                          make_tuple(Sequence<0>{}, Sequence<1>{}),
+                                          make_tuple(Sequence<1>{}, Sequence<0>{}));
+
+                return pad_tensor_view(
+                    v_dram_transposed,
+                    make_tuple(Number<FmhaPipeline::kN1>{}, Number<FmhaPipeline::kK1>{}),
+                    Sequence<kK0N1NeedPadding, kN0K1NeedPadding>{});
             }
             else
             {

From 8b337bd3ce9a2b5ba20ad98ed682da8bd713e343 Mon Sep 17 00:00:00 2001
From: Qianfeng Zhang <Qianfeng.Zhang@amd.com>
Date: Wed, 24 Jan 2024 16:25:38 +0000
Subject: [PATCH 387/641] Skipe trition_splitk for test_forward in
 test_mem_eff_attention.py

---
 tests/test_mem_eff_attention.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/tests/test_mem_eff_attention.py b/tests/test_mem_eff_attention.py
index a1ca3b089..2b841e641 100644
--- a/tests/test_mem_eff_attention.py
+++ b/tests/test_mem_eff_attention.py
@@ -456,6 +456,10 @@ def test_forward(opFW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv, packed, fmt, **kwargs)
         k,
         kv,
     ) = opFW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv
+
+    if torch.version.hip and op is fmha.triton_splitk.FwOp:
+        pytest.skip("trition_splitk Fwd is not supported on ROCm!")
+
     if packed and not (k == kv and q_len == kv_len):
         pytest.skip(
             f"packed incompatible with `k ({k}) != kv ({kv})` or `q_len ({q_len}) != kv_len ({kv_len})`"

From ee577e204cd6bab6498dbf475e2e08b8b03f50fa Mon Sep 17 00:00:00 2001
From: Max Podkorytov <4273004+tenpercent@users.noreply.github.com>
Date: Wed, 24 Jan 2024 17:41:05 +0000
Subject: [PATCH 388/641] cleanup commented dead code

---
 .../attention/hip_fmha/attention_forward_splitk.cpp | 13 -------------
 .../hip_fmha/ck_attention_forward_decoder_splitk.h  |  4 ----
 2 files changed, 17 deletions(-)

diff --git a/xformers/csrc/attention/hip_fmha/attention_forward_splitk.cpp b/xformers/csrc/attention/hip_fmha/attention_forward_splitk.cpp
index 5737fbfbe..de3ed88a7 100644
--- a/xformers/csrc/attention/hip_fmha/attention_forward_splitk.cpp
+++ b/xformers/csrc/attention/hip_fmha/attention_forward_splitk.cpp
@@ -503,12 +503,7 @@ struct FMHADecoderSplitAttentionDeviceOp : public BaseOperator
         using Argument = DeviceOp::Argument;
         float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{})
         {
-
-            // std::cout << arg.str() << std::endl << "stream_id: " << stream_config.stream_id_ <<
-            // std::endl;
-
             auto threads_per_wavefront = arg.block_dim.x;
-
             auto Q_size_k_alignment_necessary = 0;
 
             for(auto vec_size : {4, 2, 1})
@@ -673,10 +668,6 @@ struct FMHADecoderSplitReduceDeviceOp : public BaseOperator
         float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{})
         {
             auto threads_per_wavefront = arg.block_dim.x;
-
-            // std::cout << arg.str() << std::endl << "stream_id: " << stream_config.stream_id_ <<
-            // std::endl;
-
             auto O_size_k_alignment_necessary = 0;
 
             for(auto vec_size : {4, 2, 1})
@@ -956,10 +947,6 @@ test_split_attention(int32_t padding, int32_t batch_size, int32_t Hq, int32_t Hk
     auto m_percent_mismatch = percent_mismatch(m_ref, m_hip);
     auto l_percent_mismatch = percent_mismatch(l_ref, l_hip);
 
-    // if (m_percent_mismatch > 0) {
-    //     std::cout << "ref: " << m_ref << std::endl << "hip: " << m_hip << std::endl;
-    // }
-
     printf("[Test split attention] Padding=%d BS=%d Hq=%d Hkv=%d split_k=%d Mismatched split_O "
            "elements percentage: %.2f Mismatched split_max elements percentage: %.2f Mismatched "
            "split_sumexp elements percentage: %.2f\n",
diff --git a/xformers/csrc/attention/hip_fmha/ck_attention_forward_decoder_splitk.h b/xformers/csrc/attention/hip_fmha/ck_attention_forward_decoder_splitk.h
index bdd51d596..316a5d497 100644
--- a/xformers/csrc/attention/hip_fmha/ck_attention_forward_decoder_splitk.h
+++ b/xformers/csrc/attention/hip_fmha/ck_attention_forward_decoder_splitk.h
@@ -628,11 +628,7 @@ struct FMHADecoderSplitKDeviceOp : public BaseOperator
         using Argument = DeviceOp::Argument;
         float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{})
         {
-            // std::cout << arg.str() << std::endl << "stream_id: " << stream_config.stream_id_ <<
-            // std::endl;
-
             auto threads_per_wavefront = arg.block_dim.x;
-
             auto Q_size_k_alignment_necessary = 0;
 
             for(auto vec_size : {4, 2, 1})

From a21ac038579195ee0f763c90400d3be48eb74d68 Mon Sep 17 00:00:00 2001
From: Max Podkorytov <4273004+tenpercent@users.noreply.github.com>
Date: Wed, 24 Jan 2024 18:10:54 +0000
Subject: [PATCH 389/641] enable ck split-k in benchmark_attn_decoding

---
 xformers/benchmarks/benchmark_attn_decoding.py               | 5 +++++
 .../csrc/attention/hip_fmha/attention_forward_splitk.cpp     | 4 ++--
 2 files changed, 7 insertions(+), 2 deletions(-)

diff --git a/xformers/benchmarks/benchmark_attn_decoding.py b/xformers/benchmarks/benchmark_attn_decoding.py
index 4174ed4fc..e56964d03 100644
--- a/xformers/benchmarks/benchmark_attn_decoding.py
+++ b/xformers/benchmarks/benchmark_attn_decoding.py
@@ -108,6 +108,10 @@ class AttentionDecodingSplitKV(AttentionDecodingFlashDecoding):
     OP = xops.fmha.triton_splitk.FwOp
 
 
+class AttentionDecodingCKSplitKV(AttentionDecodingFlashDecoding):
+    OP = xops.fmha.forward_splitk.FwOp
+ 
+
 class AttentionDecodingPyTorchRepeat(AttentionDecodingFlashDecoding):
     def fw(self) -> None:
         B, Mq, Mkv, Hq, Hkv, K = self.shapes
@@ -125,6 +129,7 @@ def fw(self) -> None:
     "ck-decoder": AttentionDecodingCKDecoder,
     "flash-decoding": AttentionDecodingFlashDecoding,
     "triton_splitK": AttentionDecodingSplitKV,
+    "ck_splitK": AttentionDecodingCKSplitKV,
 }
 
 
diff --git a/xformers/csrc/attention/hip_fmha/attention_forward_splitk.cpp b/xformers/csrc/attention/hip_fmha/attention_forward_splitk.cpp
index de3ed88a7..833b152eb 100644
--- a/xformers/csrc/attention/hip_fmha/attention_forward_splitk.cpp
+++ b/xformers/csrc/attention/hip_fmha/attention_forward_splitk.cpp
@@ -8,7 +8,7 @@
 
 namespace {
 constexpr int32_t kThreadsPerWavefront = 64;
-constexpr int32_t kWavefrontsPerBlock  = 8;
+constexpr int32_t kWavefrontsPerBlock  = 16;
 constexpr int32_t K_MAX                = 4 * kThreadsPerWavefront;
 } // namespace
 
@@ -72,7 +72,7 @@ at::Tensor& efficient_attention_forward_decoder_splitk_ck_out_impl(
 
     TORCH_CHECK(!seq_kv_lens || seq_kv_lens->is_cuda());
 
-    TORCH_CHECK(cache_K.size(1) <= KV_M_MAX);
+    TORCH_CHECK(cache_K.size(1) / split_k <= KV_M_MAX);
     TORCH_CHECK(cache_K.size(4) <= K_MAX);
 
     constexpr auto rank = 5;

From 5e3213f3c949df2c0dbba3bcaf1fb37f3c630f6d Mon Sep 17 00:00:00 2001
From: Max Podkorytov <4273004+tenpercent@users.noreply.github.com>
Date: Wed, 24 Jan 2024 21:28:35 +0000
Subject: [PATCH 390/641] add rocm_ci workflow

---
 .github/workflows/rocm_ci.yml | 71 +++++++++++++++++++++++++++++++++++
 1 file changed, 71 insertions(+)
 create mode 100644 .github/workflows/rocm_ci.yml

diff --git a/.github/workflows/rocm_ci.yml b/.github/workflows/rocm_ci.yml
new file mode 100644
index 000000000..6d36a7e97
--- /dev/null
+++ b/.github/workflows/rocm_ci.yml
@@ -0,0 +1,71 @@
+name: ROCM_CI
+
+on: 
+  pull_request:
+    types: [labeled, synchronize, reopened]
+
+jobs:
+  build:
+    if: contains(github.event.label.name, 'rocm')
+    runs-on: rocm
+
+    steps:
+    - uses: actions/checkout@v2
+    - name: Get CPU info on Ubuntu
+      if: contains(runner.os, 'linux')
+      run: |
+        cat /proc/cpuinfo
+    - name: Get env vars
+      run: |
+        echo GITHUB_WORKFLOW   = $GITHUB_WORKFLOW
+        echo HOME              = $HOME
+        echo PWD               = $PWD
+        echo GITHUB_ACTION     = $GITHUB_ACTION
+        echo GITHUB_ACTIONS    = $GITHUB_ACTIONS
+        echo GITHUB_REPOSITORY = $GITHUB_REPOSITORY
+        echo GITHUB_EVENT_NAME = $GITHUB_EVENT_NAME
+        echo GITHUB_EVENT_PATH = $GITHUB_EVENT_PATH
+        echo GITHUB_WORKSPACE  = $GITHUB_WORKSPACE
+        echo GITHUB_SHA        = $GITHUB_SHA
+        echo GITHUB_REF        = $GITHUB_REF
+
+        export GIT_BRANCH=${GITHUB_BASE_REF:-${GITHUB_REF#refs/heads/}}
+        echo GIT_BRANCH        = $GIT_BRANCH
+        
+        export ROCM_PATH=/opt/rocm
+        echo ROCM_PATH         = $ROCM_PATH
+
+        export MAX_JOBS=64
+        echo MAX_JOBS          = $MAX_JOBS
+
+        hipcc --version
+        rocm-smi
+        rocminfo | grep "gfx"
+
+    - name: Build XFormers
+      run: |
+        git clone --recursive -b $GIT_BRANCH $GITHUB_REPOSITORY
+        docker run -it --cap-add=SYS_PTRACE --security-opt seccomp=unconfined --device=/dev/kfd --device=/dev/dri --group-add video --ipc=host --shm-size 8G -v $PWD/xformers:/xformers  rocm/pytorch-nightly:latest
+
+        pip3 install --upgrade pip
+        pip3 uninstall -y xformers
+        MAX_JOBS=$MAX_JOBS pip3 install -e /xformers --verbose
+        pip3 install scipy==1.10
+
+        python3 -c "import torch; print(torch.__version__)"
+        python3 -m xformers.info
+
+    - name: Run python tests
+      run: |
+        pytest -rpfs /xformers/tests/test_mem_eff_attention_ck.py | tee test_mem_eff_attention_ck.log
+    
+    - name: Archive logs
+      uses: actions/upload-artifact@v3
+      with:
+        name: test results
+        path: test_mem_eff_attention_ck.log
+
+    - name: Process test results
+      run: |
+        echo "Processing test results TBD"
+      

From 0e47337a5456c12456fcba2bb43075632be72e92 Mon Sep 17 00:00:00 2001
From: Max Podkorytov <4273004+tenpercent@users.noreply.github.com>
Date: Thu, 25 Jan 2024 19:17:29 +0000
Subject: [PATCH 391/641] move scipy import from file level under function
 similar to _vec_binom_test

saves a few keystrokes when setting up environment
---
 tests/test_mem_eff_attention_ck.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/tests/test_mem_eff_attention_ck.py b/tests/test_mem_eff_attention_ck.py
index f569e1d63..5f2fc57cb 100644
--- a/tests/test_mem_eff_attention_ck.py
+++ b/tests/test_mem_eff_attention_ck.py
@@ -11,7 +11,6 @@
 import pytest
 import torch
 import torch.nn.functional as F
-from scipy.stats import binomtest
 from torch.utils.checkpoint import checkpoint
 
 import xformers.ops
@@ -939,6 +938,8 @@ def _get_drop_mask(op, batch_size, q_len, kv_len, p, device):
 @pytest.mark.parametrize("op", ALL_FW_OPS, ids=list(map(lambda t: t.NAME, ALL_FW_OPS)))
 @pytest.mark.parametrize("dtype", [torch.half, torch.bfloat16])
 def test_dropout(dtype, op, q_len, kv_len, batch_size, k_len, p, seed, attn_bias):
+    from scipy.stats import binomtest
+
     device = "cuda"
     scale = 0.05 
     query = torch.randn((batch_size, q_len, k_len), device=device, dtype=dtype) * scale

From 360201f1efb72200ee7ceaafff52cc68663f3093 Mon Sep 17 00:00:00 2001
From: Qianfeng Zhang <Qianfeng.Zhang@amd.com>
Date: Sun, 28 Jan 2024 22:46:11 +0000
Subject: [PATCH 392/641] Add including of math_v2.hpp to
 ck_attention_forward_decoder_splitk.h

---
 .../attention/hip_fmha/ck_attention_forward_decoder_splitk.h | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/xformers/csrc/attention/hip_fmha/ck_attention_forward_decoder_splitk.h b/xformers/csrc/attention/hip_fmha/ck_attention_forward_decoder_splitk.h
index 316a5d497..f83ab9dcc 100644
--- a/xformers/csrc/attention/hip_fmha/ck_attention_forward_decoder_splitk.h
+++ b/xformers/csrc/attention/hip_fmha/ck_attention_forward_decoder_splitk.h
@@ -6,6 +6,7 @@
 #include <ck/utility/data_type.hpp>
 #include <ck/utility/inner_product.hpp>
 #include <ck/utility/math.hpp>
+#include <ck/utility/math_v2.hpp>
 
 namespace {
 
@@ -628,7 +629,7 @@ struct FMHADecoderSplitKDeviceOp : public BaseOperator
         using Argument = DeviceOp::Argument;
         float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{})
         {
-            auto threads_per_wavefront = arg.block_dim.x;
+            auto threads_per_wavefront        = arg.block_dim.x;
             auto Q_size_k_alignment_necessary = 0;
 
             for(auto vec_size : {4, 2, 1})
@@ -723,4 +724,4 @@ struct FMHADecoderSplitKDeviceOp : public BaseOperator
 };
 } // namespace device
 } // namespace tensor_operation
-} // namespace ck
\ No newline at end of file
+} // namespace ck

From faf1b166ed391df2293852b4644f681d1a7dee51 Mon Sep 17 00:00:00 2001
From: Max Podkorytov <4273004+tenpercent@users.noreply.github.com>
Date: Mon, 29 Jan 2024 21:54:04 +0000
Subject: [PATCH 393/641] move forward_splitk to ck_splitk; make dispatch aware
 of ck_splitk and ck_decoder

---
 tests/test_mem_eff_attention_ck.py                    | 2 +-
 xformers/ops/fmha/__init__.py                         | 4 ++--
 xformers/ops/fmha/{forward_splitk.py => ck_splitk.py} | 0
 xformers/ops/fmha/dispatch.py                         | 6 ++++--
 4 files changed, 7 insertions(+), 5 deletions(-)
 rename xformers/ops/fmha/{forward_splitk.py => ck_splitk.py} (100%)

diff --git a/tests/test_mem_eff_attention_ck.py b/tests/test_mem_eff_attention_ck.py
index 5f2fc57cb..633ad761b 100644
--- a/tests/test_mem_eff_attention_ck.py
+++ b/tests/test_mem_eff_attention_ck.py
@@ -1769,7 +1769,7 @@ def test_decoder(
     )
 
 
-@pytest.mark.parametrize("op", [fmha.forward_splitk.FwOp_S1, fmha.forward_splitk.FwOp_S2, fmha.forward_splitk.FwOp_S4])
+@pytest.mark.parametrize("op", [fmha.ck_splitk.FwOp_S1, fmha.ck_splitk.FwOp_S2, fmha.ck_splitk.FwOp_S4])
 @pytest.mark.parametrize("dtype", ["f32"])
 @pytest.mark.parametrize("kv_heads", [None, 1, 2], ids=_kv_heads_label)
 @pytest.mark.parametrize("n_heads", [16])
diff --git a/xformers/ops/fmha/__init__.py b/xformers/ops/fmha/__init__.py
index 589047ce9..06b995c30 100644
--- a/xformers/ops/fmha/__init__.py
+++ b/xformers/ops/fmha/__init__.py
@@ -8,7 +8,7 @@
 import torch
 
 
-from . import attn_bias, cutlass, decoder, flash, small_k, triton, triton_splitk, forward_splitk, ck, ck_decoder
+from . import attn_bias, cutlass, decoder, flash, small_k, triton, triton_splitk, ck, ck_decoder, ck_splitk
 from .attn_bias import AttentionBias, BlockDiagonalMask, LowerTriangularMask
 from .common import (
     AttentionBwOpBase,
@@ -32,7 +32,7 @@
 TritonFlashAttentionOp = (triton.FwOp, cutlass.BwOp if torch.version.cuda else ck.BwOp)
 MemoryEfficientAttentionCkOp = (ck.FwOp, ck.BwOp)
 MemoryEfficientAttentionCkDecoderOp = (ck_decoder.FwOp, ck.BwOp)
-MemoryEfficientAttentionSplitKCkOp = (forward_splitk.FwOp, ck.BwOp) 
+MemoryEfficientAttentionSplitKCkOp = (ck_splitk.FwOp, ck.BwOp) 
 
 class _fMHA(torch.autograd.Function):
     @staticmethod
diff --git a/xformers/ops/fmha/forward_splitk.py b/xformers/ops/fmha/ck_splitk.py
similarity index 100%
rename from xformers/ops/fmha/forward_splitk.py
rename to xformers/ops/fmha/ck_splitk.py
diff --git a/xformers/ops/fmha/dispatch.py b/xformers/ops/fmha/dispatch.py
index c9708770b..7113855cb 100644
--- a/xformers/ops/fmha/dispatch.py
+++ b/xformers/ops/fmha/dispatch.py
@@ -5,10 +5,11 @@
 
 
 import textwrap
+import torch
 from collections import deque
 from typing import List, Sequence, Type, TypeVar
 
-from . import attn_bias, cutlass, decoder, flash, small_k, triton, triton_splitk
+from . import attn_bias, cutlass, decoder, flash, small_k, triton, triton_splitk, ck, ck_decoder, ck_splitk
 from .common import AttentionBwOpBase, AttentionFwOpBase, Inputs
 
 
@@ -93,7 +94,7 @@ def _dispatch_fw_priority_list(
         if not mqa_or_gqa:
             # With multiquery, cutlass is sometimes faster than decoder
             # but it's not currently clear when.
-            priority_list_ops.appendleft(decoder.FwOp)
+            priority_list_ops.appendleft(decoder.FwOp if torch.version.cuda else ck_decoder.FwOp)
         # Split-KV is useful with MQA
         # for short Q-seqlen / long K-seqlen
         if mqa_or_gqa and inp.query.shape[1] <= 32 and inp.key.shape[1] >= 256:
@@ -105,6 +106,7 @@ def _dispatch_fw_priority_list(
             elif inp.query.ndim == 5:  # BMGHK
                 parallelism_BH = inp.query.shape[0] * inp.query.shape[2]
             if parallelism_BH > 0 and parallelism_BH < 64:
+                priority_list_ops.appendleft(ck_splitk.FwOp)
                 priority_list_ops.appendleft(triton_splitk.FwOp)
                 # Without variable seqlen flash is fastest
                 if not isinstance(inp.attn_bias, attn_bias.BlockDiagonalMask):

From 323ebae0efb9f33b553d8702dbcb1f7f829f0208 Mon Sep 17 00:00:00 2001
From: Qianfeng Zhang <Qianfeng.Zhang@amd.com>
Date: Tue, 30 Jan 2024 15:44:55 +0000
Subject: [PATCH 394/641] Synchronize to latest ck-tiled and update accordingly

---
 third_party/composable_kernel_tiled           |  2 +-
 .../hip_fmha/ck_tiled_fmha_batched_forward.h  | 66 ++++++++++---------
 .../hip_fmha/ck_tiled_fmha_batched_infer.h    | 66 ++++++++++---------
 .../hip_fmha/ck_tiled_fmha_definitions.h      | 12 ++--
 .../hip_fmha/ck_tiled_fmha_forward_kernel.h   | 33 +++++-----
 .../hip_fmha/ck_tiled_fmha_grouped_forward.h  | 33 +++++-----
 .../hip_fmha/ck_tiled_fmha_grouped_infer.h    | 33 +++++-----
 7 files changed, 128 insertions(+), 117 deletions(-)

diff --git a/third_party/composable_kernel_tiled b/third_party/composable_kernel_tiled
index 73166db69..52b621ecf 160000
--- a/third_party/composable_kernel_tiled
+++ b/third_party/composable_kernel_tiled
@@ -1 +1 @@
-Subproject commit 73166db6920afac53189098acf4774f9fa929143
+Subproject commit 52b621ecf3533514031670dd99b6f2059832baaa
diff --git a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_forward.h b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_forward.h
index dd684d9f2..2f15bb2c7 100644
--- a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_forward.h
+++ b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_forward.h
@@ -53,7 +53,6 @@ struct batched_forward_causalmask_attnbias_dispatched
         typename FmhaFwdTypeConfig<scalar_t>::PDataType,
         typename FmhaFwdTypeConfig<scalar_t>::OaccDataType,
         typename FmhaFwdTypeConfig<scalar_t>::ODataType,
-        HDim == 32 ? 128 : 256, // BlockSize
         FmhaFwdShape<HDim>,
         false, // kIsGroupMode
         FmhaMask,
@@ -71,28 +70,31 @@ struct batched_forward_causalmask_attnbias_dispatched
 
             using FmhaShape                 = FmhaFwdShape<HDim>;
             using FmhaTilePartitioner       = FmhaFwdTilePartitioner<FmhaShape>;
-            constexpr ck::index_t occupancy = (HDim == 64) ? 3 : 2;
+            constexpr ck::index_t occupancy = (HDim == 64) ? 3 : ((HDim == 256) ? 1 : 2);
 
-            bool m0_need_padding   = !(param.M % FmhaShape::kM0 == 0);
-            bool n0k1_need_padding = !(param.N % FmhaShape::kN0 == 0);
+            bool pad_seqlen_q  = !(param.M % FmhaShape::kM0 == 0);
+            bool pad_seqlen_k  = !(param.N % FmhaShape::kN0 == 0);
+            bool pad_headdim_v = !(param.Kv % FmhaShape::kN1 == 0);
 
             if constexpr(HDim == 256)
             {
                 // BlockFmhaPipelineQSKSVS uses kQLoadOnce == false
-                bool k0n1_need_padding =
-                    !(param.K % FmhaShape::kK0 == 0 && param.Kv % FmhaShape::kN1 == 0);
-
-                BOOL_SWITCH_3(
-                    m0_need_padding,
-                    kM0NeedPadding,
-                    n0k1_need_padding,
-                    kN0K1NeedPadding,
-                    k0n1_need_padding,
-                    kK0N1NeedPadding,
+                bool pad_headdim_q = !(param.K % FmhaShape::kK0 == 0);
+
+                BOOL_SWITCH_4(
+                    pad_seqlen_q,
+                    kPadSeqLenQ,
+                    pad_seqlen_k,
+                    kPadSeqLenK,
+                    pad_headdim_q,
+                    kPadHeadDimQ,
+                    pad_headdim_v,
+                    kPadHeadDimV,
                     [&] {
-                        using FmhaTraits = ck::tile_program::TileFmhaTraits<kM0NeedPadding,
-                                                                            kN0K1NeedPadding,
-                                                                            kK0N1NeedPadding,
+                        using FmhaTraits = ck::tile_program::TileFmhaTraits<kPadSeqLenQ,
+                                                                            kPadSeqLenK,
+                                                                            kPadHeadDimQ,
+                                                                            kPadHeadDimV,
                                                                             has_attn_bias,
                                                                             true, // kStoreLSE
                                                                             occupancy>;
@@ -110,20 +112,22 @@ struct batched_forward_causalmask_attnbias_dispatched
             else
             {
                 // BlockFmhaPipelineQRKSVS uses kQLoadOnce == true
-                bool k0n1_need_padding =
-                    !(param.K % FmhaShape::kK0BlockLength == 0 && param.Kv % FmhaShape::kN1 == 0);
-
-                BOOL_SWITCH_3(
-                    m0_need_padding,
-                    kM0NeedPadding,
-                    n0k1_need_padding,
-                    kN0K1NeedPadding,
-                    k0n1_need_padding,
-                    kK0N1NeedPadding,
+                bool pad_headdim_q = !(param.K % FmhaShape::kK0BlockLength == 0);
+
+                BOOL_SWITCH_4(
+                    pad_seqlen_q,
+                    kPadSeqLenQ,
+                    pad_seqlen_k,
+                    kPadSeqLenK,
+                    pad_headdim_q,
+                    kPadHeadDimQ,
+                    pad_headdim_v,
+                    kPadHeadDimV,
                     [&] {
-                        using FmhaTraits = ck::tile_program::TileFmhaTraits<kM0NeedPadding,
-                                                                            kN0K1NeedPadding,
-                                                                            kK0N1NeedPadding,
+                        using FmhaTraits = ck::tile_program::TileFmhaTraits<kPadSeqLenQ,
+                                                                            kPadSeqLenK,
+                                                                            kPadHeadDimQ,
+                                                                            kPadHeadDimV,
                                                                             has_attn_bias,
                                                                             true, // kStoreLSE
                                                                             occupancy>;
@@ -131,7 +135,7 @@ struct batched_forward_causalmask_attnbias_dispatched
                         using FmhaPipelineProblem = FmhaPipelineProblemTemp<FmhaTraits, FmhaMask>;
 
                         constexpr bool no_any_padding =
-                            !(kM0NeedPadding || kN0K1NeedPadding || kK0N1NeedPadding);
+                            !(kPadSeqLenQ || kPadSeqLenK || kPadHeadDimQ || kPadHeadDimV);
 
                         if constexpr(no_any_padding)
                         {
diff --git a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_infer.h b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_infer.h
index 4ebe09304..526ef6205 100644
--- a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_infer.h
+++ b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_infer.h
@@ -53,7 +53,6 @@ struct batched_infer_causalmask_attnbias_dispatched
         typename FmhaFwdTypeConfig<scalar_t>::PDataType,
         typename FmhaFwdTypeConfig<scalar_t>::OaccDataType,
         typename FmhaFwdTypeConfig<scalar_t>::ODataType,
-        HDim == 32 ? 128 : 256, // BlockSize
         FmhaFwdShape<HDim>,
         false, // kIsGroupMode
         FmhaMask,
@@ -71,28 +70,31 @@ struct batched_infer_causalmask_attnbias_dispatched
 
             using FmhaShape                 = FmhaFwdShape<HDim>;
             using FmhaTilePartitioner       = FmhaFwdTilePartitioner<FmhaShape>;
-            constexpr ck::index_t occupancy = (HDim == 64) ? 3 : 2;
+            constexpr ck::index_t occupancy = (HDim == 64) ? 3 : ((HDim == 256) ? 1 : 2);
 
-            bool m0_need_padding   = !(param.M % FmhaShape::kM0 == 0);
-            bool n0k1_need_padding = !(param.N % FmhaShape::kN0 == 0);
+            bool pad_seqlen_q  = !(param.M % FmhaShape::kM0 == 0);
+            bool pad_seqlen_k  = !(param.N % FmhaShape::kN0 == 0);
+            bool pad_headdim_v = !(param.Kv % FmhaShape::kN1 == 0);
 
             if constexpr(HDim == 256)
             {
                 // BlockFmhaPipelineQSKSVS uses kQLoadOnce == false
-                bool k0n1_need_padding =
-                    !(param.K % FmhaShape::kK0 == 0 && param.Kv % FmhaShape::kN1 == 0);
-
-                BOOL_SWITCH_3(
-                    m0_need_padding,
-                    kM0NeedPadding,
-                    n0k1_need_padding,
-                    kN0K1NeedPadding,
-                    k0n1_need_padding,
-                    kK0N1NeedPadding,
+                bool pad_headdim_q = !(param.K % FmhaShape::kK0 == 0);
+
+                BOOL_SWITCH_4(
+                    pad_seqlen_q,
+                    kPadSeqLenQ,
+                    pad_seqlen_k,
+                    kPadSeqLenK,
+                    pad_headdim_q,
+                    kPadHeadDimQ,
+                    pad_headdim_v,
+                    kPadHeadDimV,
                     [&] {
-                        using FmhaTraits = ck::tile_program::TileFmhaTraits<kM0NeedPadding,
-                                                                            kN0K1NeedPadding,
-                                                                            kK0N1NeedPadding,
+                        using FmhaTraits = ck::tile_program::TileFmhaTraits<kPadSeqLenQ,
+                                                                            kPadSeqLenK,
+                                                                            kPadHeadDimQ,
+                                                                            kPadHeadDimV,
                                                                             has_attn_bias,
                                                                             false, // kStoreLSE
                                                                             occupancy>;
@@ -110,20 +112,22 @@ struct batched_infer_causalmask_attnbias_dispatched
             else
             {
                 // BlockFmhaPipelineQRKSVS uses kQLoadOnce == true
-                bool k0n1_need_padding =
-                    !(param.K % FmhaShape::kK0BlockLength == 0 && param.Kv % FmhaShape::kN1 == 0);
-
-                BOOL_SWITCH_3(
-                    m0_need_padding,
-                    kM0NeedPadding,
-                    n0k1_need_padding,
-                    kN0K1NeedPadding,
-                    k0n1_need_padding,
-                    kK0N1NeedPadding,
+                bool pad_headdim_q = !(param.K % FmhaShape::kK0BlockLength == 0);
+
+                BOOL_SWITCH_4(
+                    pad_seqlen_q,
+                    kPadSeqLenQ,
+                    pad_seqlen_k,
+                    kPadSeqLenK,
+                    pad_headdim_q,
+                    kPadHeadDimQ,
+                    pad_headdim_v,
+                    kPadHeadDimV,
                     [&] {
-                        using FmhaTraits = ck::tile_program::TileFmhaTraits<kM0NeedPadding,
-                                                                            kN0K1NeedPadding,
-                                                                            kK0N1NeedPadding,
+                        using FmhaTraits = ck::tile_program::TileFmhaTraits<kPadSeqLenQ,
+                                                                            kPadSeqLenK,
+                                                                            kPadHeadDimQ,
+                                                                            kPadHeadDimV,
                                                                             has_attn_bias,
                                                                             false, // kStoreLSE
                                                                             occupancy>;
@@ -131,7 +135,7 @@ struct batched_infer_causalmask_attnbias_dispatched
                         using FmhaPipelineProblem = FmhaPipelineProblemTemp<FmhaTraits, FmhaMask>;
 
                         constexpr bool no_any_padding =
-                            !(kM0NeedPadding || kN0K1NeedPadding || kK0N1NeedPadding);
+                            !(kPadSeqLenQ || kPadSeqLenK || kPadHeadDimQ || kPadHeadDimV);
 
                         if constexpr(no_any_padding)
                         {
diff --git a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_definitions.h b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_definitions.h
index 624efa70d..8444f097a 100644
--- a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_definitions.h
+++ b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_definitions.h
@@ -48,8 +48,6 @@ struct FmhaFwdTypeConfig<ck::bhalf_t>
     using ODataType           = ck::bhalf_t;
 };
 
-using FmhaFwdVLayout = ck::tensor_layout::gemm::RowMajor;
-
 template <ck::index_t HDim>
 struct FmhaFwdBlockTile;
 
@@ -80,6 +78,8 @@ struct FmhaFwdBlockTile<256>
 using FmhaFwdBlockWarps = ck::Sequence<4, 1, 1>;
 using FmhaFwdWarpTile   = ck::Sequence<32, 32, 16>;
 
+static constexpr bool IsVLayoutRowMajor = true;
+
 template <ck::index_t HDim>
 struct FmhaFwdShape;
 
@@ -89,7 +89,7 @@ struct FmhaFwdShape<32> : ck::tile_program::TileFmhaShape<typename FmhaFwdBlockT
                                                           FmhaFwdWarpTile,
                                                           ck::Sequence<2, 1, 1>,
                                                           FmhaFwdWarpTile,
-                                                          FmhaFwdVLayout>
+                                                          IsVLayoutRowMajor>
 {
 };
 
@@ -99,7 +99,7 @@ struct FmhaFwdShape<64> : ck::tile_program::TileFmhaShape<typename FmhaFwdBlockT
                                                           FmhaFwdWarpTile,
                                                           FmhaFwdBlockWarps,
                                                           FmhaFwdWarpTile,
-                                                          FmhaFwdVLayout>
+                                                          IsVLayoutRowMajor>
 {
 };
 
@@ -109,7 +109,7 @@ struct FmhaFwdShape<128> : ck::tile_program::TileFmhaShape<typename FmhaFwdBlock
                                                            FmhaFwdWarpTile,
                                                            FmhaFwdBlockWarps,
                                                            FmhaFwdWarpTile,
-                                                           FmhaFwdVLayout>
+                                                           IsVLayoutRowMajor>
 {
 };
 
@@ -119,6 +119,6 @@ struct FmhaFwdShape<256> : ck::tile_program::TileFmhaShape<typename FmhaFwdBlock
                                                            FmhaFwdWarpTile,
                                                            FmhaFwdBlockWarps,
                                                            FmhaFwdWarpTile,
-                                                           FmhaFwdVLayout>
+                                                           IsVLayoutRowMajor>
 {
 };
diff --git a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_forward_kernel.h b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_forward_kernel.h
index 6240a6d6d..542fed4f1 100644
--- a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_forward_kernel.h
+++ b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_forward_kernel.h
@@ -39,14 +39,15 @@ struct FmhaFwdKernel
 
     using VLayout = ck::remove_cvref_t<typename FmhaPipeline::VLayout>;
 
-    static constexpr bool kIsGroupMode     = FmhaPipeline::kIsGroupMode;
-    static constexpr bool kM0NeedPadding   = FmhaPipeline::kM0NeedPadding;
-    static constexpr bool kN0K1NeedPadding = FmhaPipeline::kN0K1NeedPadding;
-    static constexpr bool kK0N1NeedPadding = FmhaPipeline::kK0N1NeedPadding;
-    static constexpr bool kHasBias         = FmhaPipeline::kHasBias;
-    static constexpr bool kStoreLSE        = FmhaPipeline::kStoreLSE;
-    using FmhaMask                         = ck::remove_cvref_t<typename FmhaPipeline::FmhaMask>;
-    static constexpr bool kHasMask         = FmhaMask::IsMasking;
+    static constexpr bool kIsGroupMode = FmhaPipeline::kIsGroupMode;
+    static constexpr bool kPadSeqLenQ  = FmhaPipeline::kPadSeqLenQ;
+    static constexpr bool kPadSeqLenK  = FmhaPipeline::kPadSeqLenK;
+    static constexpr bool kPadHeadDimQ = FmhaPipeline::kPadHeadDimQ;
+    static constexpr bool kPadHeadDimV = FmhaPipeline::kPadHeadDimV;
+    static constexpr bool kHasBias     = FmhaPipeline::kHasBias;
+    static constexpr bool kStoreLSE    = FmhaPipeline::kStoreLSE;
+    using FmhaMask                     = ck::remove_cvref_t<typename FmhaPipeline::FmhaMask>;
+    static constexpr bool kHasMask     = FmhaMask::IsMasking;
 
     template <ck::index_t I> // to avoid duplicated base class prblem, introduce an template arg
     struct FmhaFwdEmptyKargs
@@ -435,14 +436,14 @@ struct FmhaFwdKernel
                 return pad_tensor_view(
                     q_dram_naive,
                     make_tuple(Number<FmhaPipeline::kM0>{}, Number<FmhaPipeline::kK0BlockLength>{}),
-                    Sequence<kM0NeedPadding, kK0N1NeedPadding>{});
+                    Sequence<kPadSeqLenQ, kPadHeadDimQ>{});
             }
             else
             {
                 return pad_tensor_view(
                     q_dram_naive,
                     make_tuple(Number<FmhaPipeline::kM0>{}, Number<FmhaPipeline::kK0>{}),
-                    Sequence<kM0NeedPadding, kK0N1NeedPadding>{});
+                    Sequence<kPadSeqLenQ, kPadHeadDimQ>{});
             }
         }();
         const auto k_dram = [&]() {
@@ -456,7 +457,7 @@ struct FmhaFwdKernel
             return pad_tensor_view(
                 k_dram_naive,
                 make_tuple(Number<FmhaPipeline::kN0>{}, Number<FmhaPipeline::kK0>{}),
-                Sequence<kN0K1NeedPadding, kK0N1NeedPadding>{});
+                Sequence<kPadSeqLenK, kPadHeadDimQ>{});
         }();
         const auto v_dram = [&]() {
             if constexpr(ck::is_same_v<VLayout, ck::tensor_layout::gemm::RowMajor>)
@@ -478,7 +479,7 @@ struct FmhaFwdKernel
                 return pad_tensor_view(
                     v_dram_transposed,
                     make_tuple(Number<FmhaPipeline::kN1>{}, Number<FmhaPipeline::kK1>{}),
-                    Sequence<kK0N1NeedPadding, kN0K1NeedPadding>{});
+                    Sequence<kPadHeadDimV, kPadSeqLenK>{});
             }
             else
             {
@@ -492,7 +493,7 @@ struct FmhaFwdKernel
                 return pad_tensor_view(
                     v_dram_naive,
                     make_tuple(Number<FmhaPipeline::kN1>{}, Number<FmhaPipeline::kK1>{}),
-                    Sequence<kK0N1NeedPadding, kN0K1NeedPadding>{});
+                    Sequence<kPadHeadDimV, kPadSeqLenK>{});
             }
         }();
 
@@ -537,7 +538,7 @@ struct FmhaFwdKernel
 
                     return pad_tensor_view(bias_dram_naive,
                                            bias_dram_window_lengths,
-                                           Sequence<kM0NeedPadding, kN0K1NeedPadding>{});
+                                           Sequence<kPadSeqLenQ, kPadSeqLenK>{});
                 }();
 
                 return make_tile_window(bias_dram, bias_dram_window_lengths, {i_m0, 0});
@@ -566,7 +567,7 @@ struct FmhaFwdKernel
                                                                          Number<1>{});
 
                     return pad_tensor_view(
-                        lse_dram_naive, lse_dram_window_lengths, Sequence<kM0NeedPadding>{});
+                        lse_dram_naive, lse_dram_window_lengths, Sequence<kPadSeqLenQ>{});
                 }();
 
                 return make_tile_window(lse_dram, lse_dram_window_lengths, {i_m0});
@@ -652,7 +653,7 @@ struct FmhaFwdKernel
             return pad_tensor_view(
                 o_dram_naive,
                 make_tuple(Number<FmhaPipeline::kM0>{}, Number<FmhaPipeline::kN1>{}),
-                Sequence<kM0NeedPadding, kK0N1NeedPadding>{});
+                Sequence<kPadSeqLenQ, kPadHeadDimV>{});
         }();
 
         auto o_dram_window =
diff --git a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_forward.h b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_forward.h
index 9e784052c..4b4eb602d 100644
--- a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_forward.h
+++ b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_forward.h
@@ -53,7 +53,6 @@ struct grouped_forward_causalmask_attnbias_dispatched
         typename FmhaFwdTypeConfig<scalar_t>::PDataType,
         typename FmhaFwdTypeConfig<scalar_t>::OaccDataType,
         typename FmhaFwdTypeConfig<scalar_t>::ODataType,
-        HDim == 32 ? 128 : 256, // BlockSize
         FmhaFwdShape<HDim>,
         true, // kIsGroupMode
         FmhaMask,
@@ -71,21 +70,23 @@ struct grouped_forward_causalmask_attnbias_dispatched
 
             using FmhaShape                 = FmhaFwdShape<HDim>;
             using FmhaTilePartitioner       = FmhaFwdTilePartitioner<FmhaShape>;
-            constexpr ck::index_t occupancy = (HDim == 64) ? 3 : 2;
+            constexpr ck::index_t occupancy = (HDim == 64) ? 3 : (HDim == 256) ? 1 : 2;
 
-            constexpr bool kM0NeedPadding   = true;
-            constexpr bool kN0K1NeedPadding = true;
+            constexpr bool kPadSeqLenQ = true;
+            constexpr bool kPadSeqLenK = true;
+
+            bool pad_headdim_v = !(param.Kv % FmhaShape::kN1 == 0);
 
             if constexpr(HDim == 256)
             {
                 // BlockFmhaPipelineQSKSVS uses kQLoadOnce == false
-                bool k0n1_need_padding =
-                    !(param.K % FmhaShape::kK0 == 0 && param.Kv % FmhaShape::kN1 == 0);
+                bool pad_headdim_q = !(param.K % FmhaShape::kK0 == 0);
 
-                BOOL_SWITCH(k0n1_need_padding, kK0N1NeedPadding, [&] {
-                    using FmhaTraits = ck::tile_program::TileFmhaTraits<kM0NeedPadding,
-                                                                        kN0K1NeedPadding,
-                                                                        kK0N1NeedPadding,
+                BOOL_SWITCH_2(pad_headdim_q, kPadHeadDimQ, pad_headdim_v, kPadHeadDimV, [&] {
+                    using FmhaTraits = ck::tile_program::TileFmhaTraits<kPadSeqLenQ,
+                                                                        kPadSeqLenK,
+                                                                        kPadHeadDimQ,
+                                                                        kPadHeadDimV,
                                                                         has_attn_bias,
                                                                         true, // kStoreLSE
                                                                         occupancy>;
@@ -103,13 +104,13 @@ struct grouped_forward_causalmask_attnbias_dispatched
             else
             {
                 // BlockFmhaPipelineQRKSVS uses kQLoadOnce == true
-                bool k0n1_need_padding =
-                    !(param.K % FmhaShape::kK0BlockLength == 0 && param.Kv % FmhaShape::kN1 == 0);
+                bool pad_headdim_q = !(param.K % FmhaShape::kK0BlockLength == 0);
 
-                BOOL_SWITCH(k0n1_need_padding, kK0N1NeedPadding, [&] {
-                    using FmhaTraits = ck::tile_program::TileFmhaTraits<kM0NeedPadding,
-                                                                        kN0K1NeedPadding,
-                                                                        kK0N1NeedPadding,
+                BOOL_SWITCH_2(pad_headdim_q, kPadHeadDimQ, pad_headdim_v, kPadHeadDimV, [&] {
+                    using FmhaTraits = ck::tile_program::TileFmhaTraits<kPadSeqLenQ,
+                                                                        kPadSeqLenK,
+                                                                        kPadHeadDimQ,
+                                                                        kPadHeadDimV,
                                                                         has_attn_bias,
                                                                         true, // kStoreLSE
                                                                         occupancy>;
diff --git a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_infer.h b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_infer.h
index 2909ee5fa..ee7713317 100644
--- a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_infer.h
+++ b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_infer.h
@@ -53,7 +53,6 @@ struct grouped_infer_causalmask_attnbias_dispatched
         typename FmhaFwdTypeConfig<scalar_t>::PDataType,
         typename FmhaFwdTypeConfig<scalar_t>::OaccDataType,
         typename FmhaFwdTypeConfig<scalar_t>::ODataType,
-        HDim == 32 ? 128 : 256, // BlockSize
         FmhaFwdShape<HDim>,
         true, // kIsGroupMode
         FmhaMask,
@@ -71,21 +70,23 @@ struct grouped_infer_causalmask_attnbias_dispatched
 
             using FmhaShape                 = FmhaFwdShape<HDim>;
             using FmhaTilePartitioner       = FmhaFwdTilePartitioner<FmhaShape>;
-            constexpr ck::index_t occupancy = (HDim == 64) ? 3 : 2;
+            constexpr ck::index_t occupancy = (HDim == 64) ? 3 : ((HDim == 256) ? 1 : 2);
 
-            constexpr bool kM0NeedPadding   = true;
-            constexpr bool kN0K1NeedPadding = true;
+            constexpr bool kPadSeqLenQ = true;
+            constexpr bool kPadSeqLenK = true;
+
+            bool pad_headdim_v = !(param.Kv % FmhaShape::kN1 == 0);
 
             if constexpr(HDim == 256)
             {
                 // BlockFmhaPipelineQSKSVS uses kQLoadOnce == false
-                bool k0n1_need_padding =
-                    !(param.K % FmhaShape::kK0 == 0 && param.Kv % FmhaShape::kN1 == 0);
+                bool pad_headdim_q = !(param.K % FmhaShape::kK0 == 0);
 
-                BOOL_SWITCH(k0n1_need_padding, kK0N1NeedPadding, [&] {
-                    using FmhaTraits = ck::tile_program::TileFmhaTraits<kM0NeedPadding,
-                                                                        kN0K1NeedPadding,
-                                                                        kK0N1NeedPadding,
+                BOOL_SWITCH_2(pad_headdim_q, kPadHeadDimQ, pad_headdim_v, kPadHeadDimV, [&] {
+                    using FmhaTraits = ck::tile_program::TileFmhaTraits<kPadSeqLenQ,
+                                                                        kPadSeqLenK,
+                                                                        kPadHeadDimQ,
+                                                                        kPadHeadDimV,
                                                                         has_attn_bias,
                                                                         false, // kStoreLSE
                                                                         occupancy>;
@@ -103,13 +104,13 @@ struct grouped_infer_causalmask_attnbias_dispatched
             else
             {
                 // BlockFmhaPipelineQRKSVS uses kQLoadOnce == true
-                bool k0n1_need_padding =
-                    !(param.K % FmhaShape::kK0BlockLength == 0 && param.Kv % FmhaShape::kN1 == 0);
+                bool pad_headdim_q = !(param.K % FmhaShape::kK0BlockLength == 0);
 
-                BOOL_SWITCH(k0n1_need_padding, kK0N1NeedPadding, [&] {
-                    using FmhaTraits = ck::tile_program::TileFmhaTraits<kM0NeedPadding,
-                                                                        kN0K1NeedPadding,
-                                                                        kK0N1NeedPadding,
+                BOOL_SWITCH_2(pad_headdim_q, kPadHeadDimQ, pad_headdim_v, kPadHeadDimV, [&] {
+                    using FmhaTraits = ck::tile_program::TileFmhaTraits<kPadSeqLenQ,
+                                                                        kPadSeqLenK,
+                                                                        kPadHeadDimQ,
+                                                                        kPadHeadDimV,
                                                                         has_attn_bias,
                                                                         false, // kStoreLSE
                                                                         occupancy>;

From 9d2be4f6c7120a02f47a6fbfde33e96f0f9d1d35 Mon Sep 17 00:00:00 2001
From: Max Podkorytov <4273004+tenpercent@users.noreply.github.com>
Date: Tue, 30 Jan 2024 16:30:46 +0000
Subject: [PATCH 395/641] fix benchmark_attn_decoding

---
 xformers/benchmarks/benchmark_attn_decoding.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/xformers/benchmarks/benchmark_attn_decoding.py b/xformers/benchmarks/benchmark_attn_decoding.py
index e56964d03..e1298592c 100644
--- a/xformers/benchmarks/benchmark_attn_decoding.py
+++ b/xformers/benchmarks/benchmark_attn_decoding.py
@@ -109,7 +109,7 @@ class AttentionDecodingSplitKV(AttentionDecodingFlashDecoding):
 
 
 class AttentionDecodingCKSplitKV(AttentionDecodingFlashDecoding):
-    OP = xops.fmha.forward_splitk.FwOp
+    OP = xops.fmha.ck_splitk.FwOp
  
 
 class AttentionDecodingPyTorchRepeat(AttentionDecodingFlashDecoding):

From 7c3c766bca79f27eaab565ec25ba0061c64b5c6a Mon Sep 17 00:00:00 2001
From: Qianfeng Zhang <Qianfeng.Zhang@amd.com>
Date: Tue, 30 Jan 2024 19:40:42 +0000
Subject: [PATCH 396/641] Remove third_party/composable_kernel_tiled

---
 third_party/composable_kernel_tiled | 1 -
 1 file changed, 1 deletion(-)
 delete mode 160000 third_party/composable_kernel_tiled

diff --git a/third_party/composable_kernel_tiled b/third_party/composable_kernel_tiled
deleted file mode 160000
index db28be6c6..000000000
--- a/third_party/composable_kernel_tiled
+++ /dev/null
@@ -1 +0,0 @@
-Subproject commit db28be6c69026f51630fa402f23464c4ffae463b

From 708c047c9a4eb1bf9c11bbfecf2bccfb4e687c4b Mon Sep 17 00:00:00 2001
From: Qianfeng Zhang <Qianfeng.Zhang@amd.com>
Date: Tue, 30 Jan 2024 23:26:00 +0000
Subject: [PATCH 397/641] [Fix] use kK0BlockLength for HeadDim256 padding
 judging

---
 .../attention/hip_fmha/ck_tiled_fmha_batched_forward.h     | 7 +------
 .../csrc/attention/hip_fmha/ck_tiled_fmha_batched_infer.h  | 7 +------
 .../attention/hip_fmha/ck_tiled_fmha_grouped_forward.h     | 7 +------
 .../csrc/attention/hip_fmha/ck_tiled_fmha_grouped_infer.h  | 7 +------
 4 files changed, 4 insertions(+), 24 deletions(-)

diff --git a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_forward.h b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_forward.h
index 2f15bb2c7..fd0f05b9d 100644
--- a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_forward.h
+++ b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_forward.h
@@ -74,13 +74,11 @@ struct batched_forward_causalmask_attnbias_dispatched
 
             bool pad_seqlen_q  = !(param.M % FmhaShape::kM0 == 0);
             bool pad_seqlen_k  = !(param.N % FmhaShape::kN0 == 0);
+            bool pad_headdim_q = !(param.K % FmhaShape::kK0BlockLength == 0);
             bool pad_headdim_v = !(param.Kv % FmhaShape::kN1 == 0);
 
             if constexpr(HDim == 256)
             {
-                // BlockFmhaPipelineQSKSVS uses kQLoadOnce == false
-                bool pad_headdim_q = !(param.K % FmhaShape::kK0 == 0);
-
                 BOOL_SWITCH_4(
                     pad_seqlen_q,
                     kPadSeqLenQ,
@@ -111,9 +109,6 @@ struct batched_forward_causalmask_attnbias_dispatched
             }
             else
             {
-                // BlockFmhaPipelineQRKSVS uses kQLoadOnce == true
-                bool pad_headdim_q = !(param.K % FmhaShape::kK0BlockLength == 0);
-
                 BOOL_SWITCH_4(
                     pad_seqlen_q,
                     kPadSeqLenQ,
diff --git a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_infer.h b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_infer.h
index 526ef6205..d7af0af43 100644
--- a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_infer.h
+++ b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_infer.h
@@ -75,12 +75,10 @@ struct batched_infer_causalmask_attnbias_dispatched
             bool pad_seqlen_q  = !(param.M % FmhaShape::kM0 == 0);
             bool pad_seqlen_k  = !(param.N % FmhaShape::kN0 == 0);
             bool pad_headdim_v = !(param.Kv % FmhaShape::kN1 == 0);
+            bool pad_headdim_q = !(param.K % FmhaShape::kK0BlockLength == 0);
 
             if constexpr(HDim == 256)
             {
-                // BlockFmhaPipelineQSKSVS uses kQLoadOnce == false
-                bool pad_headdim_q = !(param.K % FmhaShape::kK0 == 0);
-
                 BOOL_SWITCH_4(
                     pad_seqlen_q,
                     kPadSeqLenQ,
@@ -111,9 +109,6 @@ struct batched_infer_causalmask_attnbias_dispatched
             }
             else
             {
-                // BlockFmhaPipelineQRKSVS uses kQLoadOnce == true
-                bool pad_headdim_q = !(param.K % FmhaShape::kK0BlockLength == 0);
-
                 BOOL_SWITCH_4(
                     pad_seqlen_q,
                     kPadSeqLenQ,
diff --git a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_forward.h b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_forward.h
index 4b4eb602d..7b8707aa3 100644
--- a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_forward.h
+++ b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_forward.h
@@ -75,13 +75,11 @@ struct grouped_forward_causalmask_attnbias_dispatched
             constexpr bool kPadSeqLenQ = true;
             constexpr bool kPadSeqLenK = true;
 
+            bool pad_headdim_q = !(param.K % FmhaShape::kK0BlockLength == 0);
             bool pad_headdim_v = !(param.Kv % FmhaShape::kN1 == 0);
 
             if constexpr(HDim == 256)
             {
-                // BlockFmhaPipelineQSKSVS uses kQLoadOnce == false
-                bool pad_headdim_q = !(param.K % FmhaShape::kK0 == 0);
-
                 BOOL_SWITCH_2(pad_headdim_q, kPadHeadDimQ, pad_headdim_v, kPadHeadDimV, [&] {
                     using FmhaTraits = ck::tile_program::TileFmhaTraits<kPadSeqLenQ,
                                                                         kPadSeqLenK,
@@ -103,9 +101,6 @@ struct grouped_forward_causalmask_attnbias_dispatched
             }
             else
             {
-                // BlockFmhaPipelineQRKSVS uses kQLoadOnce == true
-                bool pad_headdim_q = !(param.K % FmhaShape::kK0BlockLength == 0);
-
                 BOOL_SWITCH_2(pad_headdim_q, kPadHeadDimQ, pad_headdim_v, kPadHeadDimV, [&] {
                     using FmhaTraits = ck::tile_program::TileFmhaTraits<kPadSeqLenQ,
                                                                         kPadSeqLenK,
diff --git a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_infer.h b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_infer.h
index ee7713317..31849f7b6 100644
--- a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_infer.h
+++ b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_infer.h
@@ -75,13 +75,11 @@ struct grouped_infer_causalmask_attnbias_dispatched
             constexpr bool kPadSeqLenQ = true;
             constexpr bool kPadSeqLenK = true;
 
+            bool pad_headdim_q = !(param.K % FmhaShape::kK0BlockLength == 0);
             bool pad_headdim_v = !(param.Kv % FmhaShape::kN1 == 0);
 
             if constexpr(HDim == 256)
             {
-                // BlockFmhaPipelineQSKSVS uses kQLoadOnce == false
-                bool pad_headdim_q = !(param.K % FmhaShape::kK0 == 0);
-
                 BOOL_SWITCH_2(pad_headdim_q, kPadHeadDimQ, pad_headdim_v, kPadHeadDimV, [&] {
                     using FmhaTraits = ck::tile_program::TileFmhaTraits<kPadSeqLenQ,
                                                                         kPadSeqLenK,
@@ -103,9 +101,6 @@ struct grouped_infer_causalmask_attnbias_dispatched
             }
             else
             {
-                // BlockFmhaPipelineQRKSVS uses kQLoadOnce == true
-                bool pad_headdim_q = !(param.K % FmhaShape::kK0BlockLength == 0);
-
                 BOOL_SWITCH_2(pad_headdim_q, kPadHeadDimQ, pad_headdim_v, kPadHeadDimV, [&] {
                     using FmhaTraits = ck::tile_program::TileFmhaTraits<kPadSeqLenQ,
                                                                         kPadSeqLenK,

From a0f264368b47ebabd3d6a2045f64be1b7cf68c5b Mon Sep 17 00:00:00 2001
From: Qianfeng Zhang <Qianfeng.Zhang@amd.com>
Date: Wed, 31 Jan 2024 18:22:20 +0000
Subject: [PATCH 398/641] Tiny type change for custom_mask_type in param class

---
 xformers/csrc/attention/hip_fmha/ck_tiled_fmha_params.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_params.h b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_params.h
index e518ccaaa..880434cf4 100644
--- a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_params.h
+++ b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_params.h
@@ -34,7 +34,7 @@ struct BatchedInferParams
     const void* v_ptr;
     const void* attn_bias_ptr;
 
-    uint8_t custom_mask_type;
+    int custom_mask_type;
     int window_size; // local-attention
 
     void* out_ptr;
@@ -86,7 +86,7 @@ struct GroupedInferParams
     const void* v_ptr;
     const void* attn_bias_ptr;
 
-    uint8_t custom_mask_type;
+    int custom_mask_type;
     int window_size; // local-attention
 
     void* out_ptr;

From 96f3027d35d6218190b52979fb0eb3a489b18e6b Mon Sep 17 00:00:00 2001
From: Qianfeng Zhang <Qianfeng.Zhang@amd.com>
Date: Thu, 1 Feb 2024 14:14:31 +0000
Subject: [PATCH 399/641] Change to use ROCm repo for ck-tiled submodule

---
 .gitmodules                         | 4 ++--
 third_party/composable_kernel_tiled | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/.gitmodules b/.gitmodules
index 9ab802ac3..41a2922cb 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -10,5 +10,5 @@
 	url = https://github.com/Dao-AILab/flash-attention.git
 [submodule "third_party/composable_kernel_tiled"]
 	path = third_party/composable_kernel_tiled
-	url = https://github.com/asroy/ck_tile.git
-	branch = fmha_attemp_async_copy_unify
+	url = https://github.com/ROCm/composable_kernel.git
+	branch = ck_tile/fmha_attemp_async_copy_unify
diff --git a/third_party/composable_kernel_tiled b/third_party/composable_kernel_tiled
index 52b621ecf..eb53e235c 160000
--- a/third_party/composable_kernel_tiled
+++ b/third_party/composable_kernel_tiled
@@ -1 +1 @@
-Subproject commit 52b621ecf3533514031670dd99b6f2059832baaa
+Subproject commit eb53e235c76e3da0374214221e94c45419b90bec

From f3f2be4e547fc9fb1a43b26ac23c837b27a6fe58 Mon Sep 17 00:00:00 2001
From: Qianfeng Zhang <Qianfeng.Zhang@amd.com>
Date: Thu, 1 Feb 2024 17:06:47 +0000
Subject: [PATCH 400/641] Remove tests/test_forward_ck_tiled.py

---
 tests/test_forward_ck_tiled.py | 2229 --------------------------------
 1 file changed, 2229 deletions(-)
 delete mode 100644 tests/test_forward_ck_tiled.py

diff --git a/tests/test_forward_ck_tiled.py b/tests/test_forward_ck_tiled.py
deleted file mode 100644
index 1484deaae..000000000
--- a/tests/test_forward_ck_tiled.py
+++ /dev/null
@@ -1,2229 +0,0 @@
-# Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
-#
-# This source code is licensed under the BSD license found in the
-# LICENSE file in the root directory of this source tree.
-
-import math
-import random
-from functools import partial
-from typing import List, Optional, Sequence, Tuple, Type, TypeVar
-
-import pytest
-import torch
-import torch.nn.functional as F
-from scipy.stats import binomtest
-from torch.utils.checkpoint import checkpoint
-
-import xformers.ops
-from xformers.attn_bias_utils import create_attn_bias
-from xformers.ops import fmha
-from xformers.ops.fmha import ALL_BW_OPS, ALL_FW_OPS
-from xformers.ops.fmha.common import AttentionOpBase
-from xformers.ops.fmha.dispatch import _dispatch_fw_priority_list
-
-from .utils import assert_allclose
-
-torch.backends.cuda.matmul.allow_tf32 = False
-cuda_only = pytest.mark.skipif(not torch.cuda.is_available(), reason="requires CUDA")
-_devices = ["cpu", "cuda"] if torch.cuda.is_available() else ["cpu"]
-_types = [torch.float16, torch.bfloat16]
-
-T = TypeVar(
-    "T", Type[fmha.common.AttentionFwOpBase], Type[fmha.common.AttentionBwOpBase]
-)
-
-ALL_FW_OPS: Sequence[Type[fmha.common.AttentionFwOpBase]] = [
-    fmha.ck.FwOp,
-]
-
-ALL_BW_OPS: Sequence[Type[fmha.common.AttentionBwOpBase]] = [
-    fmha.ck.BwOp,
-]
-
-def sample_random_supported_fw(
-    inp: fmha.Inputs, seed: int
-) -> Type[fmha.common.AttentionFwOpBase]:
-    r = random.Random(seed)
-    fw_ops = list(ALL_FW_OPS)
-    r.shuffle(fw_ops)
-    for op in fw_ops:
-        if op.supports(inp):
-            return op
-    raise NotImplementedError(f"Could not find a FW operator for: {inp}")
-
-
-def generate_test_shapes_B_Mq_Mkv_H_K_Kv(op):
-    shapes = []
-    for B in op._TEST_BATCH_SIZES:
-        for Mq in [32, 256]:
-            for Mkv in [32, 64, 256, 1024]:
-                for K in op._TEST_K:
-                    shapes.append((B, Mq, Mkv, 1, K, K))
-        Mq = 256
-        Mkv = 128
-        K = 32
-        H = 1
-        # Weird values of parameters
-        for M in [2, 3, 15, 31, 32, 34, 68, 72, 90, 132, 136]:
-            shapes.append((B, M, Mkv, H, K, K))
-            shapes.append((B, Mq, M, H, K, K))
-        for _K in [1, 2, 3, 31, 34, 36, 38, 40, 64, 80, 160, 256 + 2, 256 + 8, 512]:
-            if _K <= op.SUPPORTED_MAX_K:
-                shapes.append((B, Mq, Mkv, H, _K, _K))
-        # Different value for K / Kv
-        if op.SUPPORTS_DIFFERENT_VALUE_EMBED:
-            for _K in [32, 36, 64, 256 + 8]:
-                shapes.append((B, Mq, Mkv, H, K, _K))
-                shapes.append((B, Mq, Mkv, H, _K, K))
-        # Exotic sizes
-        for K in op._TEST_K:
-            shapes.append((B, 16, 1024, H, K, K))
-            shapes.append((B, 1024, 16, H, K, K))
-        # Some number of heads
-        for H in [3, 5, 12]:
-            shapes.append((max(1, B // H), Mq, Mkv, H, K, K))
-    # Filter-out not supported shapes
-    shapes = [
-        shape
-        for shape in shapes
-        if len(
-            op.shape_not_supported_reasons(
-                Mq=shape[1], Mkv=shape[2], K=shape[4], Kv=shape[5]
-            )
-        )
-        == 0
-    ]
-    # Add some random shapes
-    if op in [
-        fmha.cutlass.FwOp,
-        fmha.cutlass.BwOp,
-        fmha.flash.BwOp,
-    ]:
-        K_CHOICES = [8 * i for i in range(1, 256 // 8)]
-        r = random.Random(0)
-        found_count = 0
-        while found_count < 200:
-            B = r.randint(1, 400)
-            Mq = r.randint(1, 500)
-            Mkv = r.randint(1, 500)
-            H = r.randint(2, 11)
-            B = max(B // H, 1)
-            K = r.choice(K_CHOICES)
-            Kv = r.choice(K_CHOICES)
-            if not op.SUPPORTS_DIFFERENT_VALUE_EMBED:
-                Kv = K
-            if len(op.shape_not_supported_reasons(Mq, Mkv, K, Kv)):
-                continue
-            found_count += 1
-            shapes.append((B, Mq, Mkv, H, K, Kv))
-    return shapes
-
-
-def make_id(op, device, dtype, bias_type, *shape):
-    return (
-        f"{op.NAME}-{device}-{str(dtype)}-{bias_type.__name__}"
-        f"-{'-'.join([str(s) for s in shape])}"
-    )
-
-
-def _generate_op_device_dtype_biasT_B_Mq_Mkv_H_K_Kv(
-    ops_list: Sequence[Type[fmha.AttentionOpBase]], max_shapes_per_op: int = 65000
-):
-    r = random.Random(0)
-    combination = []
-    ids = []
-    for op in ops_list:
-        op_count = 0
-        # Sort list of masks, so it's deterministic across runs
-        LIST_MASKS = list(sorted(op.SUPPORTED_ATTN_BIAS_TYPES, key=lambda x: str(x)))
-        for shape in generate_test_shapes_B_Mq_Mkv_H_K_Kv(op):
-            has_one = False
-            for device in _devices:
-                if device not in op.SUPPORTED_DEVICES:
-                    continue
-                for dtype in op.SUPPORTED_DTYPES:
-                    bias_type = r.choice(LIST_MASKS)
-                    # Avoid using too much memory
-                    if bias_type not in [
-                        type(None),
-                        fmha.attn_bias.LowerTriangularMask,
-                    ]:
-                        B, Mq, Mkv, H, K, Kv = shape
-                        B = min(B, 12)
-
-                        if bias_type in {
-                            fmha.attn_bias.BlockDiagonalCausalFromBottomRightMask,
-                            fmha.attn_bias.BlockDiagonalCausalLocalAttentionFromBottomRightMask,
-                        }:
-                            Mq, Mkv = min(Mkv, Mq), max(Mkv, Mq) + 2
-                        elif (
-                            bias_type
-                            is fmha.attn_bias.BlockDiagonalCausalWithOffsetPaddedKeysMask
-                        ):
-                            Mq, Mkv = min(Mkv, Mq), max(Mkv, Mq)
-                        shape = (B, Mq, Mkv, H, K, Kv)
-                    combination.append((op, device, dtype, bias_type, *shape))
-                    ids.append(
-                        f"{op.NAME}-{device}-{str(dtype)}-{bias_type.__name__}"
-                        f"-{'-'.join([str(s) for s in shape])}"
-                    )
-                    has_one = True
-            if has_one:
-                op_count += 1
-            if op_count > max_shapes_per_op:
-                break
-        # Some specific shapes for which we want to run without any mask
-        bias_type = type(None)
-        for shape in (
-            # Some strides/dims don't fit on an uint16
-            (1, 128, 128, 300, 128, 128),
-            (13, 1, 67, 200, 8, 8),
-            (1, 1 + 2**16, 4, 1, 8, 8),
-            (1, 4, 1 + 2**16, 1, 8, 8),
-            # TODO: Some strides don't fit on an uint32
-            # Crashes on Flash, Errors on Cutlass
-            # (1, 1, 64000, 300, 128, 128)
-        ):
-            for device in _devices:
-                if device not in op.SUPPORTED_DEVICES:
-                    continue
-                for dtype in op.SUPPORTED_DTYPES:
-                    combination.append((op, device, dtype, bias_type, *shape))
-    return {
-        "argvalues": combination,
-        "ids": [make_id(*c) for c in combination],
-    }
-
-
-parametrize_opFW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv = pytest.mark.parametrize(
-    "opFW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv",
-    **_generate_op_device_dtype_biasT_B_Mq_Mkv_H_K_Kv(ALL_FW_OPS),
-)
-parametrize_opFW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv__xs = pytest.mark.parametrize(
-    "opFW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv",
-    **_generate_op_device_dtype_biasT_B_Mq_Mkv_H_K_Kv(ALL_FW_OPS, max_shapes_per_op=1),
-)
-parametrize_opBW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv = pytest.mark.parametrize(
-    "opBW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv",
-    **_generate_op_device_dtype_biasT_B_Mq_Mkv_H_K_Kv(ALL_BW_OPS),
-)
-parametrize_opBW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv__xs = pytest.mark.parametrize(
-    "opBW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv",
-    **_generate_op_device_dtype_biasT_B_Mq_Mkv_H_K_Kv(ALL_BW_OPS, max_shapes_per_op=1),
-)
-
-
-def ref_attention(q, k, v, attn_bias=None, drop_mask=None, p=0.0, scale=None):
-    if q.ndim == 5:
-
-        def attn_bias_group(group: int):
-            if isinstance(attn_bias, torch.Tensor):
-                return attn_bias[:, group]
-            if isinstance(attn_bias, fmha.attn_bias.LowerTriangularMaskWithTensorBias):
-                return fmha.attn_bias.LowerTriangularMaskWithTensorBias(
-                    attn_bias._bias[:, group]
-                )
-            return attn_bias
-
-        return torch.stack(
-            [
-                ref_attention_bmhk(
-                    q[:, :, g],
-                    k[:, :, g],
-                    v[:, :, g],
-                    scale=scale,
-                    attn_bias=attn_bias_group(g),
-                )
-                for g in range(q.shape[2])
-            ],
-            dim=2,
-        )
-    if q.ndim == 4:
-        assert p == 0.0
-        return ref_attention_bmhk(q, k, v, scale=scale, attn_bias=attn_bias)
-    q = q.float()
-    k = k.float()
-    v = v.float()
-
-    scale = scale if scale is not None else (1 / q.shape[-1] ** 0.5)
-    q = q * scale
-
-    attn = q @ k.transpose(-2, -1)
-    if attn_bias is not None:
-        if isinstance(attn_bias, xformers.ops.AttentionBias):
-            # Always create in B,H,Mq,Mk format
-            attn_bias_tensor = attn_bias.materialize(
-                (q.shape[0], 1, q.shape[1], k.shape[1]),
-                device=q.device,
-                dtype=torch.float32,
-            )
-        else:
-            attn_bias_tensor = attn_bias
-        if attn_bias_tensor.ndim == 4:
-            assert q.shape[0] == attn_bias_tensor.shape[0] * attn_bias_tensor.shape[1]
-            attn_bias_tensor = attn_bias_tensor.reshape(
-                [-1, *attn_bias_tensor.shape[2:]]
-            )
-        attn = attn + attn_bias_tensor.float()
-    attn = attn.softmax(-1)
-    if drop_mask is not None:
-        attn = attn * (drop_mask / (1 - p))
-    return attn @ v
-
-
-def ref_attention_bmhk(q, k, v, attn_bias, scale=None) -> torch.Tensor:
-    assert q.ndim == 4
-
-    def T(t):
-        return t.permute((0, 2, 1, 3)).reshape(
-            [t.shape[0] * t.shape[2], t.shape[1], t.shape[3]]
-        )
-
-    if isinstance(attn_bias, xformers.ops.AttentionBias):
-        attn_bias = attn_bias.materialize(
-            (q.shape[0], q.shape[2], q.shape[1], k.shape[1]),
-            device=q.device,
-            dtype=torch.float32,
-        ).reshape([q.shape[0] * q.shape[2], q.shape[1], k.shape[1]])
-    out = ref_attention(T(q), T(k), T(v), attn_bias, scale=scale)
-    out = out.reshape([q.shape[0], q.shape[2], q.shape[1], v.shape[3]])
-    return out.permute((0, 2, 1, 3))
-
-
-def _rand_partition(r: random.Random, total: int, n: int) -> List[int]:
-    # returns list of n nonnegative integers summing to total
-    idx = {0, total}
-    while len(idx) < n + 1:
-        idx.add(r.randint(1, total - 1))
-    s = sorted(idx)
-    return [e - b for b, e in zip(s[:-1], s[1:])]
-
-
-def get_bias_grad(attn_bias, clear: bool = False) -> Optional[torch.Tensor]:
-    tensor_with_grad: Optional[torch.Tensor] = None
-    if isinstance(attn_bias, torch.Tensor):
-        tensor_with_grad = attn_bias
-    if isinstance(attn_bias, fmha.attn_bias.LowerTriangularMaskWithTensorBias):
-        tensor_with_grad = attn_bias._bias
-    if tensor_with_grad is not None:
-        grad = tensor_with_grad.grad
-        if clear:
-            tensor_with_grad.grad = None
-        return grad
-    return None
-
-
-def create_tensors(
-    op: Type[AttentionOpBase],
-    device,
-    dtype,
-    attn_bias_type,
-    B,
-    q_len,
-    kv_len,
-    h,
-    k,
-    kv,
-    *,
-    attn_bias_requires_grad: bool = False,
-    fmt: str = "BMK",
-    g: int = 1,
-):
-    torch.manual_seed(B * q_len + kv_len * k + kv)
-
-    mask_is_bottom_right = attn_bias_type is not None and issubclass(
-        attn_bias_type,
-        (
-            fmha.attn_bias.LowerTriangularFromBottomRightMask,
-            fmha.attn_bias.LowerTriangularFromBottomRightLocalAttentionMask,
-            fmha.attn_bias.BlockDiagonalCausalFromBottomRightMask,
-            fmha.attn_bias.BlockDiagonalCausalLocalAttentionFromBottomRightMask,
-            fmha.attn_bias.BlockDiagonalCausalLocalAttentionMask,
-            fmha.attn_bias.LocalAttentionFromBottomRightMask,
-        ),
-    )
-    if mask_is_bottom_right and q_len > kv_len:
-        # Bottom-right attention and local-attention masks require q_len <= kv_len
-        kv_len = q_len
-    scale = 3
-    if fmt == "BMK":
-        query = torch.randn((B * h, q_len, k), device=device, dtype=dtype)
-        key = torch.randn((B * h, kv_len, k), device=device, dtype=dtype)
-        value = torch.randn((B * h, kv_len, kv), device=device, dtype=dtype)
-    elif fmt == "BMHK":
-        query = torch.randn((B, q_len, h, k), device=device, dtype=dtype)
-        key = torch.randn((B, kv_len, h, k), device=device, dtype=dtype)
-        value = torch.randn((B, kv_len, h, kv), device=device, dtype=dtype)
-    else:
-        assert fmt == "BMGHK"
-        query = torch.randn((B, q_len, g, h, k), device=device, dtype=dtype)
-        key = torch.randn((B, kv_len, g, 1, k), device=device, dtype=dtype)
-        value = torch.randn((B, kv_len, g, 1, kv), device=device, dtype=dtype)
-
-    for x in [query, key, value]:
-        x.mul_(scale)
-
-    if fmt == "BMGHK":
-        # Expand - after the in-place mul
-        key = key.expand((B, kv_len, g, h, k))
-        value = value.expand((B, kv_len, g, h, k))
-
-    if fmt == "BMK" and not fmha.common._is_bias_type_supported_in_BMK(attn_bias_type):
-        attn_bias_type = None
-    attn_bias = None
-    if attn_bias_type is not None:
-        attn_bias = create_attn_bias(
-            attn_bias_type,
-            batch_size=B,
-            num_heads=h,
-            num_heads_groups=g,
-            q_len=q_len,
-            kv_len=kv_len,
-            dtype=dtype,
-            device=device,
-            requires_grad=attn_bias_requires_grad,
-            fmt=fmt,
-            op=op,
-        )
-        if isinstance(
-            attn_bias,
-            (
-                fmha.attn_bias.BlockDiagonalMask,
-                fmha.attn_bias.BlockDiagonalCausalWithOffsetPaddedKeysMask,
-            ),
-        ):
-            query, key, value = [
-                x.reshape([1, -1, *x.shape[2:]]) for x in [query, key, value]
-            ]
-
-    inputs = fmha.Inputs(query=query, key=key, value=value, attn_bias=attn_bias)
-    reasons = op.not_supported_reasons(inputs)
-    if reasons:
-        err_msg = f"{op.NAME}: unsupported ({'/'.join(reasons)})"
-        # Ensure we free memory to avoid OOMs
-        del query, key, value, attn_bias, inputs
-        pytest.skip(err_msg)
-    return query, key, value, attn_bias
-
-
-def bmhk2bmk(tensor) -> torch.Tensor:
-    return (
-        tensor.permute((0, 2, 1, 3))
-        .contiguous()
-        .view([tensor.shape[0] * tensor.shape[2], tensor.shape[1], tensor.shape[3]])
-    )
-
-
-def bmk2bmhk(tensor, num_heads: int) -> torch.Tensor:
-    return tensor.reshape([-1, num_heads, tensor.shape[1], tensor.shape[2]]).permute(
-        (0, 2, 1, 3)
-    )
-
-
-@pytest.mark.parametrize("fmt", ["BMK", "BMHK"])
-@pytest.mark.parametrize("packed", [False, True])
-@parametrize_opFW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv
-def test_forward(opFW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv, packed, fmt, **kwargs):
-    (
-        op,
-        device,
-        dtype,
-        bias_type,
-        batch_size,
-        q_len,
-        kv_len,
-        h,
-        k,
-        kv,
-    ) = opFW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv
-
-    if k > 256 or kv > 256:
-        pytest.skip("head-dim size bigger than 256 is not supported by CK-FlashAttention")
-
-    if packed and not (k == kv and q_len == kv_len):
-        pytest.skip(
-            f"packed incompatible with `k ({k}) != kv ({kv})` or `q_len ({q_len}) != kv_len ({kv_len})`"
-        )
-    if fmt == "BMK" and not fmha.common._is_bias_type_supported_in_BMK(bias_type):
-        pytest.skip("BMK incompatible with this bias")
-
-    query, key, value, attn_bias = create_tensors(
-        *opFW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv,
-        fmt="BMHK" if packed else fmt,
-        **kwargs,
-    )
-
-    if packed:
-        c = torch.stack([query, key, value], 2)
-        if fmt == "BMK":
-            # bm3hk -> 3bhmk -> 3Bmk
-            c = c.permute(2, 0, 3, 1, 4).view([3, -1, q_len, k])
-            query, key, value = c[0], c[1], c[2]
-            # Re-create bias in the right format
-            attn_bias = create_attn_bias(
-                bias_type=bias_type,
-                batch_size=batch_size,
-                num_heads=h,
-                num_heads_groups=1,
-                q_len=q_len,
-                kv_len=kv_len,
-                device=device,
-                dtype=dtype,
-                requires_grad=False,
-                fmt=fmt,
-                op=op,
-            )
-        elif fmt == "BMHK":
-            # bm3hk -> 3 x bmhk
-            query, key, value = xformers.ops.unbind(c, 2)
-        else:
-            assert False, f"Unsupport fmt {fmt} with packing"
-        assert not query.is_contiguous()
-
-    out = xformers.ops.memory_efficient_attention_forward(
-        query, key, value, attn_bias, op=op
-    )
-    assert not out.isnan().any(), ("Output has NaNs", attn_bias)
-    out2 = xformers.ops.memory_efficient_attention_forward(
-        query, key, value, attn_bias, op=op
-    )
-    assert torch.allclose(out, out2, atol=0.0, rtol=0.0), (
-        "Non-deterministic behavior",
-        attn_bias,
-    )
-
-    ref = ref_attention(query, key, value, attn_bias)
-    assert out.shape == ref.shape, out.shape
-    assert_allclose(
-        out.float(),
-        ref,
-        atol=op.ERROR_ATOL[dtype],
-        rtol=op.ERROR_RTOL.get(dtype, 1e-5),
-    )
-
-
-@cuda_only
-@pytest.mark.parametrize("k_len", [5, 6, 32])
-@pytest.mark.parametrize("batch_size", [1, 4])
-@pytest.mark.parametrize("kv_len", [128, 512])
-@pytest.mark.parametrize("q_len", [128, 512])
-@pytest.mark.parametrize("dtype", _types)
-def test_key_query_all_ones(dtype, q_len, kv_len, batch_size, k_len):
-    device = "cuda"
-    scale = 3
-    query = torch.ones((batch_size, q_len, k_len), device=device, dtype=dtype)
-    key = torch.ones((batch_size, kv_len, k_len), device=device, dtype=dtype)
-    value = torch.randn((batch_size, kv_len, k_len), device=device, dtype=dtype) * scale
-
-    out = xformers.ops.memory_efficient_attention(query, key, value, op=(fmha.ck.FwOp, None))
-    # this should be equivalent to the average over value
-    ref = value.mean(1, keepdim=True).expand_as(query)
-
-    if dtype is torch.float16:
-        assert_allclose(out, ref, atol=1e-5)
-    else:
-        assert_allclose(out, ref, atol=1e-2)
-
-def _block_diag_reshape_lse(
-    lse: torch.Tensor, q_seqinfo: fmha.attn_bias._SeqLenInfo
-) -> torch.Tensor:
-    """LSE can be padded, let's remove the padding"""
-    parts = []
-    for slice, (start, end) in zip(lse.unbind(0), q_seqinfo.intervals()):
-        parts.append(slice[:, : end - start])
-    return torch.cat(parts, dim=1).unsqueeze(1)
-
-
-@parametrize_opFW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv
-def test_logsumexp(opFW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv):
-    (
-        op,
-        device,
-        dtype,
-        bias_type,
-        batch_size,
-        q_len,
-        kv_len,
-        h,
-        k,
-        kv,
-    ) = opFW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv
-    query, key, value, attn_bias = create_tensors(
-        *opFW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv, fmt="BMK"
-    )
-
-    _out, lse = xformers.ops.memory_efficient_attention_forward_requires_grad(
-        query,
-        key,
-        value,
-        op=op,
-        attn_bias=attn_bias,
-    )
-    attn = (query.float() / k**0.5) @ key.float().transpose(-2, -1)
-    if attn_bias is not None:
-        if isinstance(attn_bias, xformers.ops.AttentionBias):
-            tensor_bias = attn_bias.materialize(
-                (query.shape[0], 1, query.shape[1], key.shape[1]),
-                device=query.device,
-                dtype=torch.float32,
-            )
-        else:
-            assert isinstance(attn_bias, torch.Tensor)
-            tensor_bias = attn_bias
-        if tensor_bias.ndim == 4:
-            tensor_bias = tensor_bias.reshape([-1, *tensor_bias.shape[2:]])
-        attn = attn + tensor_bias.float()
-    ref_lse = attn.logsumexp(-1)
-    if isinstance(attn_bias, fmha.attn_bias.BlockDiagonalMask):
-        lse = _block_diag_reshape_lse(lse, attn_bias.q_seqinfo)
-    assert_allclose(lse[:, 0, : ref_lse.shape[1]], ref_lse, atol=2e-4)
-
-
-@cuda_only
-@pytest.mark.parametrize("op", [fmha.cutlass.FwOp, fmha.flash.FwOp])
-def test_logsumexp_mqa(op):
-    if not op.is_available():
-        pytest.skip("not available")
-
-    dtype = torch.float16
-    s = 3
-    query = torch.randn([1, 1, 32, 128], dtype=dtype, device="cuda") * s
-    key = (torch.randn([1, 16, 1, 128], dtype=dtype, device="cuda") * s).expand(
-        -1, -1, 32, -1
-    )
-    value = (torch.randn([1, 16, 1, 128], dtype=dtype, device="cuda") * s).expand(
-        -1, -1, 32, -1
-    )
-    assert key.stride(2) == 0
-
-    _, lse = xformers.ops.memory_efficient_attention_forward_requires_grad(
-        query,
-        key,
-        value,
-        op=op,
-    )
-    query, key, value = [x[0].transpose(0, 1) for x in [query, key, value]]
-    attn = (query.float() / query.shape[-1] ** 0.5) @ key.float().transpose(-2, -1)
-    ref_lse = attn.logsumexp(-1)
-    assert_allclose(lse[0, :, 0], ref_lse[:, 0], atol=2e-4)
-
-
-@pytest.mark.parametrize("fmt", ["BMK", "BMHK"])
-@pytest.mark.parametrize("grad_out_contiguous", [False, True])
-@parametrize_opBW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv
-def test_backward(
-    opBW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv,
-    grad_out_contiguous,
-    fmt,
-):
-    (
-        op_bw,
-        device,
-        dtype,
-        bias_type,
-        batch_size,
-        q_len,
-        kv_len,
-        h,
-        k,
-        kv,
-    ) = opBW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv
-
-    ## ToDo: reopen bfloat16 for testing
-    if dtype is torch.bfloat16:
-        pytest.skip("Temporarily disabled bfloat16 as we are still improving the accuracy of the results")
-
-    if k > 128 or kv > 128:
-        pytest.skip("head-dim length bigger than 128 is not supported by CK-FlashAttention")
-
-    if k % 2 != 0:
-        pytest.skip("head-dim length must be an even value for CK-FlashAttention")    
-
-    if grad_out_contiguous is False:
-        pytest.skip("CK-FlashAttention requires grad_out and out have same lengths/strides")
-
-    attn_bias_requires_grad = (
-        random.Random(q_len + kv_len * batch_size).randint(0, 1) > 0
-    )
-    query, key, value, attn_bias = create_tensors(
-        *opBW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv,
-        attn_bias_requires_grad=attn_bias_requires_grad,
-        fmt=fmt,
-    )
-
-    # To understand why we do this, check the comment on the
-    # `AttentionBwOpBase` class
-    scale = None
-    if op_bw.SUPPORTS_CUSTOM_SCALE and query.shape[-1] < 32:
-        scale = (1 / 32) ** 0.5
-    op_fw = (
-        sample_random_supported_fw(
-            fmha.Inputs(query=query, key=key, value=value, attn_bias=attn_bias),
-            seed=q_len * kv + kv_len * k,
-        )
-        if op_bw != fmha.ck.BwOp
-        else fmha.ck.FwOp
-    )
-    qkv = None
-
-    if (
-        fmt == "BMHK"
-        and query.shape[3] == value.shape[3]
-        and query.shape[1] == value.shape[1]
-    ):
-        qkv = torch.stack([query, key, value], 2)
-        qkv.requires_grad_(True)
-        # bm3hk -> 3 x bmhk
-        query, key, value = xformers.ops.unbind(qkv, 2)
-        assert not query.is_contiguous()
-
-    query.requires_grad_(True)
-    key.requires_grad_(True)
-    value.requires_grad_(True)
-
-    if not op_bw.supports(fmha.Inputs(query, key, value, attn_bias)):
-        pytest.skip("inputs not supported")
-
-    out = xformers.ops.memory_efficient_attention(
-        query, key, value, attn_bias, scale=scale, op=(op_fw, op_bw)
-    )
-
-    grad_out = torch.randn_like(out)
-    if grad_out_contiguous is False:
-        grad_out = torch.tensor([1.0], dtype=query.dtype, device=device)[
-            None, None, :
-        ].expand_as(out)
-
-    out.backward(grad_out)
-
-    if qkv is None and op_bw == fmha.cutlass.BwOp:
-        assert query.stride() == query.grad.stride()
-
-    grads = []
-    if qkv is None:
-        grads = [query.grad, key.grad, value.grad]
-        query.grad = None
-        key.grad = None
-        value.grad = None
-    else:
-        grads = [qkv.grad]
-        qkv.grad = None
-    if attn_bias_requires_grad:
-        attn_bias_grad = get_bias_grad(attn_bias, clear=True)
-        if attn_bias_grad is not None:
-            grads.append(attn_bias_grad)
-
-    ref = ref_attention(query, key, value, attn_bias, scale=scale)
-    ref.backward(grad_out)
-
-    assert_allclose(
-        out.float(),
-        ref.float(),
-        "fw pass",
-        atol=op_fw.ERROR_ATOL[dtype],
-        rtol=op_fw.ERROR_RTOL[dtype],
-    )
-
-    del out
-    del grad_out
-    del ref
-
-    atol = op_bw.ERROR_ATOL[dtype]
-    rtol = op_bw.ERROR_RTOL[dtype]
-
-    grads_ref = []
-    grads_name = []
-    if qkv is None:
-        assert isinstance(query.grad, torch.Tensor)
-        assert isinstance(key.grad, torch.Tensor)
-        assert isinstance(value.grad, torch.Tensor)
-        grads_ref = [query.grad, key.grad, value.grad]
-        grads_name = ["query", "key", "value"]
-    else:
-        assert isinstance(qkv.grad, torch.Tensor)
-        grads_ref = [qkv.grad]
-        grads_name = ["qkv"]
-
-    if attn_bias_requires_grad:
-        attn_bias_grad = get_bias_grad(attn_bias)
-        if attn_bias_grad is not None:
-            grads_ref.append(attn_bias.grad)
-            grads_name.append("bias")
-
-    del query
-    del key
-    del value
-    del qkv
-
-    assert len(grads_ref) == len(
-        grads
-    ), "Wrong number of gradients (maybe bias grad didn't backprop?)"
-    for name, calc_grad, ref_grad in zip(grads_name, grads, grads_ref):
-        assert_allclose(
-            calc_grad,
-            ref_grad,
-            msg=f"{op_fw.NAME}+{op_bw.NAME}:{name}",
-            atol=atol,
-            rtol=rtol,
-        )
-
-
-def _vec_binom_test(x, n, p):
-    """
-    vectorized implementation of scipy.stats.binom_test
-    this makes our tests much faster
-    reference: https://github.com/scipy/scipy/blob/v1.8.0/scipy/stats/_morestats.py#L2609-L2702
-    """
-    import numpy as np
-    from scipy.stats import distributions
-
-    x = np.atleast_1d(x)
-    d = distributions.binom.pmf(x, n, p)[:, None]
-    rerr = 1 + 1e-7
-    # x < p * n case
-    i = np.arange(np.ceil(p * n), n + 1)
-    y = np.sum(distributions.binom.pmf(i, n, p) <= d * rerr, axis=1)
-    pval1 = distributions.binom.cdf(x, n, p) + distributions.binom.sf(n - y, n, p)
-
-    # other case
-    i = np.arange(np.floor(p * n) + 1)
-    y = np.sum(distributions.binom.pmf(i, n, p) <= d * rerr, axis=1)
-    pval2 = distributions.binom.cdf(y - 1, n, p) + distributions.binom.sf(x - 1, n, p)
-
-    pval = np.where(x < p * n, pval1, pval2)
-    pval = np.minimum(1.0, pval)
-    return pval
-
-def _get_drop_mask(op, batch_size, q_len, kv_len, p, device):
-    if op == fmha.ck.FwOp:
-        mask = torch.empty((batch_size, 1, q_len, kv_len), device=device)
-        ## rand_uniform is an int32 tensor
-        rand_uniform = torch.ops.xformers._ck_rand_uniform(p, mask)
-        ##mask = (rand_uniform <= int((1.0-p)*65535.0)).to(torch.float32)
-        mask = (rand_uniform <= int((1.0-p)*255.0)).to(torch.float32)
-        mask = mask.reshape(batch_size, q_len, kv_len)
-    else:
-        mask = torch.empty((batch_size, q_len, kv_len), device=device)
-        mask = torch.ops.xformers._temp_dropout(mask, p)
-
-    return mask
-
-@cuda_only
-@pytest.mark.parametrize("attn_bias", [None, fmha.attn_bias.LowerTriangularMask()])
-@pytest.mark.parametrize("seed", [42, 124])
-@pytest.mark.parametrize("p", [0.3, 0.7])
-@pytest.mark.parametrize("k_len", [32])
-@pytest.mark.parametrize("batch_size", [1, 2])
-@pytest.mark.parametrize("kv_len", [3, 15, 32, 33, 65])
-@pytest.mark.parametrize("q_len", [2, 33])
-@pytest.mark.parametrize("op", ALL_FW_OPS, ids=list(map(lambda t: t.NAME, ALL_FW_OPS)))
-@pytest.mark.parametrize("dtype", [torch.half, torch.bfloat16])
-def test_dropout(dtype, op, q_len, kv_len, batch_size, k_len, p, seed, attn_bias):
-    device = "cuda"
-    scale = 0.05 
-    query = torch.randn((batch_size, q_len, k_len), device=device, dtype=dtype) * scale
-    key = torch.randn((batch_size, kv_len, k_len), device=device, dtype=dtype) * scale
-    value = torch.randn((batch_size, kv_len, k_len), device=device, dtype=dtype) * scale
-
-    inputs_for_support_check = fmha.Inputs(query, key, value, attn_bias, p, None)
-    if not op.supports(inputs_for_support_check):
-        del query, key, value, attn_bias
-        pytest.skip(f"{op.NAME}: unsupported input")
-
-    torch.manual_seed(seed)
-    out = xformers.ops.memory_efficient_attention(
-        query, key, value, attn_bias, p, op=(op, None)
-    )
-
-    torch.manual_seed(seed)
-    out2 = xformers.ops.memory_efficient_attention(
-        query, key, value, attn_bias, p, op=(op, None)
-    )
-
-    assert_allclose(out, out2, "dropout reproducibility")
-
-    torch.manual_seed(seed)
-    mask = _get_drop_mask(op, batch_size, q_len, kv_len, p, device)
-    ref = ref_attention(query, key, value, attn_bias, mask, p)
-    assert_allclose(out.float(), ref, atol=3e-3, rtol=5e-4), f"{(out - ref).abs().max()}"
-
-    num_trials = 1000
-    p_val_tol = 1e-6
-    keep_prob = 1 - p
-    masks = []
-    for i in range(num_trials):
-        mask = _get_drop_mask(op, batch_size, q_len, kv_len, p, device)
-        masks.append(mask.clone().cpu())
-    masks = torch.stack(masks, dim=0)
-    p_value = binomtest(int(masks.sum()), masks.numel(), p=keep_prob).pvalue
-    assert p_value > p_val_tol, p_value
-    masks = masks.sum(0).flatten()
-    p_values = _vec_binom_test(masks, num_trials, p=keep_prob)
-    assert all(p_values > p_val_tol)
-
-
-def _test_dropout_backward(q_len, kv_len, batch_size, k, p, op, dtype):
-    if dtype is torch.bfloat16 and compute_capability < (8, 0):
-        pytest.skip("bf16 requires Sm80")
-    if not op.is_available():
-        pytest.skip()
-
-    scale = 3 
-    device = "cuda"
-    query = torch.randn((batch_size, q_len, k), device=device, dtype=dtype) * scale
-    key = torch.randn((batch_size, kv_len, k), device=device, dtype=dtype) * scale
-    value = torch.randn((batch_size, kv_len, k), device=device, dtype=dtype) * scale
-
-    query.requires_grad_(True)
-    key.requires_grad_(True)
-    value.requires_grad_(True)
-
-    grad_out = torch.ones_like(query)
-
-    assert op.supports(fmha.Inputs(query=query, key=key, value=value, p=p))
-
-    seed = 42
-    torch.manual_seed(seed)
-    out = xformers.ops.memory_efficient_attention(query, key, value, p=p, op=(op, None))
-
-    out.backward(grad_out)
-
-    grad_q = query.grad
-    grad_k = key.grad
-    grad_v = value.grad
-
-    query.grad = None
-    key.grad = None
-    value.grad = None
-
-    torch.manual_seed(seed)
-    mask = _get_drop_mask(op, batch_size, q_len, kv_len, p, device)
-
-    ref = ref_attention(query, key, value, None, mask, p)
-    ref.backward(grad_out)
-
-    atol, rtol = (
-        fmha.AttentionBwOpBase.ERROR_ATOL[dtype],
-        fmha.AttentionBwOpBase.ERROR_RTOL[dtype],
-    )
-    assert_allclose(
-        grad_v,
-        value.grad,
-        "grad_v",
-        atol=atol,
-        rtol=rtol,
-    )
-    # TODO: Investigate why precision is worse
-    if dtype in [torch.float16, torch.bfloat16]:
-        atol = atol * 2 + 0.15
-        rtol = rtol * 2
-    assert_allclose(
-        grad_q,
-        query.grad,
-        "grad_q",
-        atol=atol,
-        rtol=rtol,
-    )
-    assert_allclose(
-        grad_k,
-        key.grad,
-        "grad_k",
-        atol=atol,
-        rtol=rtol,
-    )
-
-
-@cuda_only
-@pytest.mark.parametrize("p", [0.3, 0.7])
-@pytest.mark.parametrize("k", [5, 6, 32])
-@pytest.mark.parametrize("batch_size", [1, 2])
-@pytest.mark.parametrize("kv_len", [3, 15, 32, 33])
-@pytest.mark.parametrize("q_len", [2, 33])
-def test_dropout_backward_small_k(q_len, kv_len, batch_size, k, p):
-    _test_dropout_backward(
-        q_len, kv_len, batch_size, k, p, op=fmha.small_k.FwOp, dtype=torch.float32
-    )
-
-
-@cuda_only
-@pytest.mark.parametrize("p", [0.000001, 0.3, 0.7])
-@pytest.mark.parametrize("k", [16, 128, 256])
-@pytest.mark.parametrize("batch_size", [1, 2])
-@pytest.mark.parametrize("kv_len", [3, 248, 256])
-@pytest.mark.parametrize("q_len", [3, 248, 256])
-@pytest.mark.parametrize("dt", ["f16", "bf16", "f32"])
-def test_dropout_backward_cutlass(dt, q_len, kv_len, batch_size, k, p):
-    _test_dropout_backward(
-        q_len,
-        kv_len,
-        batch_size,
-        k,
-        p,
-        op=fmha.cutlass.FwOp,
-        dtype={"f16": torch.float16, "bf16": torch.bfloat16, "f32": torch.float32}[dt],
-    )
-
-
-@cuda_only
-@pytest.mark.parametrize("k_len", [32])
-@pytest.mark.parametrize("batch_size", [1])
-@pytest.mark.parametrize("kv_len", [3 * 32])
-@pytest.mark.parametrize("q_len", [3 * 32])
-def test_memory_efficient_attention_full_block_masked(q_len, kv_len, batch_size, k_len):
-    device = "cuda"
-    op_fw = fmha.small_k.FwOp
-    op_bw = fmha.small_k.BwOp
-
-    scale = 3
-    query = torch.randn((batch_size, q_len, k_len), device=device) * scale
-    key = torch.randn((batch_size, kv_len, k_len), device=device) * scale
-    value = torch.randn((batch_size, kv_len, k_len), device=device) * scale
-
-    # in this case, most of the blocks in a row get masked
-    attn_bias = torch.full((3, 32), float("-inf"), device=device)
-    attn_bias[:2, :4] = 0
-    attn_bias = attn_bias.flatten()[None, None, :].expand(1, q_len, -1)
-
-    out = xformers.ops.memory_efficient_attention(
-        query, key, value, attn_bias, op=(op_fw, op_bw)
-    )
-    ref = ref_attention(query, key, value, attn_bias)
-
-    assert_allclose(
-        out, ref, atol=op_fw.ERROR_ATOL[query.dtype], rtol=op_fw.ERROR_RTOL[query.dtype]
-    )
-
-    query.requires_grad_(True)
-    key.requires_grad_(True)
-    value.requires_grad_(True)
-
-    grad_out = torch.ones_like(query)
-
-    out = xformers.ops.memory_efficient_attention(query, key, value, attn_bias)
-    out.backward(grad_out)
-
-    grad_q = query.grad
-    grad_k = key.grad
-    grad_v = value.grad
-
-    query.grad = None
-    key.grad = None
-    value.grad = None
-
-    ref = ref_attention(query, key, value, attn_bias)
-    ref.backward(grad_out)
-
-    atol = op_bw.ERROR_ATOL[query.dtype]
-    rtol = op_bw.ERROR_RTOL[query.dtype]
-    assert_allclose(grad_q, query.grad, "grad_q", atol=atol, rtol=rtol)
-    assert_allclose(grad_k, key.grad, "grad_k", atol=atol, rtol=rtol)
-    assert_allclose(grad_v, value.grad, "grad_v", atol=atol, rtol=rtol)
-
-
-@pytest.mark.parametrize("fmt", ["BMK", "BMHK"])
-@parametrize_opBW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv__xs
-def test_lowlevel_api_shapes(opBW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv, fmt):
-    query, key, value, attn_bias = create_tensors(
-        *opBW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv, fmt=fmt
-    )
-    grad_out = torch.ones_like(query)
-    query.requires_grad_(True)
-    key.requires_grad_(True)
-    value.requires_grad_(True)
-
-    out, lse = xformers.ops.memory_efficient_attention_forward_requires_grad(
-        query, key, value, attn_bias
-    )
-    assert out.ndim == query.ndim
-    dq, dk, dv = xformers.ops.memory_efficient_attention_backward(
-        grad_out, out, lse, query, key, value, attn_bias
-    )
-    assert dq.shape == query.shape
-    assert dk.shape == key.shape
-    assert dv.shape == value.shape
-
-
-@parametrize_opFW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv__xs
-def test_cuda_streams(
-    opFW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv,
-):
-    (
-        op,
-        device,
-        dtype,
-        bias_type,
-        batch_size,
-        q_len,
-        kv_len,
-        h,
-        k,
-        kv,
-    ) = opFW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv
-    if device != "cuda":
-        pytest.skip("Not CUDA")
-    bias_type = None
-    opFW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv = [
-        op,
-        device,
-        dtype,
-        bias_type,
-        batch_size,
-        q_len,
-        kv_len,
-        h,
-        k,
-        kv,
-    ]
-    s_hipri = torch.cuda.Stream(priority=-1)
-    s_lopri = torch.cuda.Stream(priority=0)
-    query, key, value, attn_bias = create_tensors(
-        *opFW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv, fmt="BMHK"
-    )
-    torch.cuda.synchronize()
-    with torch.cuda.stream(s_lopri):
-        torch.cuda._sleep(100_000_000)  # wait 100m cycles
-        query *= 2
-    s_hipri.wait_stream(s_lopri)
-    with torch.cuda.stream(s_hipri):
-        # If the kernel is scheduled in the main stream
-        # `query * 2` has not been executed yet
-        out = xformers.ops.memory_efficient_attention(query, key, value, op=(op, None))
-    # Test that `s_lopri` is still sleeping
-    # and that `query *= 2` has not been executed yet
-    query2_main_stream = query * 2
-    torch.cuda.synchronize()
-    # TODO: Figure out why this is failing sometimes
-    # The sleep timer seems to be high enough already ...
-    # assert torch.allclose(query2_main_stream, query), "Need to increase sleep time"
-    del query2_main_stream
-
-    ref = ref_attention(query, key, value)
-    assert out.shape == ref.shape, out.shape
-
-    assert_allclose(
-        out.float(),
-        ref.float(),
-        atol=op.ERROR_ATOL[dtype],
-        rtol=op.ERROR_RTOL.get(dtype, 1e-5),
-    )
-
-
-@parametrize_opBW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv__xs
-def test_custom_scale(opBW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv):
-    p = 0.0
-    scale = 0.1
-
-    (
-        op_bw,
-        device,
-        dtype,
-        _,
-        B,
-        q_len,
-        kv_len,
-        H,
-        k,
-        Kv,
-    ) = opBW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv
-    torch.manual_seed(q_len + kv_len + k)
-    if device != "cuda":
-        pytest.skip("Not CUDA")
-
-    query, key, value, attn_bias = create_tensors(
-        *opBW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv, fmt="BMK"
-    )
-    inputs = fmha.Inputs(
-        query=query, key=key, value=value, attn_bias=attn_bias, scale=scale
-    )
-    op_fw = sample_random_supported_fw(inputs, seed=q_len * k + kv_len * k)
-    grad_out = query.new_ones(B * H, q_len, Kv)
-    query.requires_grad_(True)
-    key.requires_grad_(True)
-    value.requires_grad_(True)
-
-    reasons = op_fw.not_supported_reasons(inputs)
-    if reasons:
-        pytest.skip(f"{op_fw.NAME}: unsupported ({'/'.join(reasons)})")
-    reasons = op_bw.not_supported_reasons(inputs)
-    if reasons:
-        pytest.skip(f"{op_bw.NAME}: unsupported ({'/'.join(reasons)})")
-
-    # NOTE: we still need to scale the inputs to not blowup
-    # the pre-softmax values (numerical stability)
-    s = k**-0.5
-    out = xformers.ops.memory_efficient_attention(
-        query * s, key, value, attn_bias, p, scale, op=(op_fw, op_bw)
-    )
-    out.backward(grad_out)
-    grad_q, grad_k, grad_v = query.grad, key.grad, value.grad
-    query.grad = key.grad = value.grad = None
-
-    ref = ref_attention(query * s, key, value, attn_bias, None, p, scale)
-    ref.backward(grad_out)
-    ref_grad_q, ref_grad_k, ref_grad_v = query.grad, key.grad, value.grad
-    query.grad = key.grad = value.grad = None
-
-    atol = op_fw.ERROR_ATOL[dtype]
-    rtol = op_fw.ERROR_RTOL[dtype]
-    assert_allclose(out.float(), ref.float(), "out", atol=atol, rtol=rtol)
-    atol = op_bw.ERROR_ATOL[dtype]
-    rtol = op_bw.ERROR_RTOL[dtype]
-    assert_allclose(grad_q, ref_grad_q, "grad_q", atol=atol, rtol=rtol)
-    assert_allclose(grad_k, ref_grad_k, "grad_k", atol=atol, rtol=rtol)
-    assert_allclose(grad_v, ref_grad_v, "grad_v", atol=atol, rtol=rtol)
-
-
-def apply_attention(query, key, value, attn_bias, op_fw, proj):
-    x = xformers.ops.memory_efficient_attention(
-        query, key, value, attn_bias=attn_bias, op=(op_fw, None)
-    )
-    x = proj(x)
-    return x
-
-
-@pytest.mark.parametrize("use_reentrant", [False, True])
-@parametrize_opFW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv__xs
-def test_grad_checkpointing(
-    opFW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv,
-    use_reentrant,
-):
-    fmt = "BMHK"
-    (
-        op,
-        device,
-        dtype,
-        bias_type,
-        batch_size,
-        q_len,
-        kv_len,
-        h,
-        k,
-        kv,
-    ) = opFW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv
-    bias_type = None
-    opFW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv = (
-        op,
-        device,
-        dtype,
-        bias_type,
-        batch_size,
-        q_len,
-        kv_len,
-        h,
-        k,
-        kv,
-    )
-    query, key, value, attn_bias = create_tensors(
-        *opFW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv,
-        fmt=fmt,
-    )
-    qkv = None
-
-    if (
-        fmt == "BMHK"
-        and query.shape[3] == value.shape[3]
-        and query.shape[1] == value.shape[1]
-    ):
-        qkv = torch.stack([query, key, value], 2)
-        qkv.requires_grad_(True)
-        # bm3hk -> 3 x bmhk
-        query, key, value = xformers.ops.unbind(qkv, 2)
-        assert not query.is_contiguous()
-
-    query.requires_grad_(True)
-    key.requires_grad_(True)
-    value.requires_grad_(True)
-
-    proj = torch.nn.Linear(kv, k, device=device, dtype=dtype)
-
-    x = query
-    for _ in range(5):
-        x = checkpoint(
-            apply_attention,
-            x,
-            key,
-            value,
-            attn_bias,
-            op,
-            proj,
-            use_reentrant=use_reentrant,
-        )
-    x.mean().backward()
-
-
-ALL_FW_OPS_NO_SMALLK = [op for op in ALL_FW_OPS if op is not fmha.small_k.FwOp]
-
-
-@pytest.mark.parametrize(
-    "op", ALL_FW_OPS_NO_SMALLK, ids=[op.NAME for op in ALL_FW_OPS_NO_SMALLK]
-)
-def test_unsupported_cpu(op: Type[fmha.AttentionFwOpBase]):
-    q = torch.empty([1, 1, 1, 32])
-    with pytest.raises(ValueError):
-        fmha.memory_efficient_attention(q, q, q, op=(op, None))
-
-
-@cuda_only
-@pytest.mark.parametrize(
-    "op", ALL_FW_OPS_NO_SMALLK, ids=[op.NAME for op in ALL_FW_OPS_NO_SMALLK]
-)
-def test_unsupported_stride_lastdim(op: Type[fmha.AttentionFwOpBase]):
-    q = torch.empty([1, 1, 32, 4], device="cuda", dtype=torch.float16).permute(
-        0, 3, 1, 2
-    )
-    try:
-        fmha.memory_efficient_attention(q, q, q, op=(op, None))
-    except ValueError as e:
-        if "Only work on pre-MLIR triton for now" in str(e):
-            pytest.skip("Only work on pre-MLIR triton for now")
-        q = q.contiguous()
-        fmha.memory_efficient_attention(q, q, q, op=(op, None))
-
-
-@cuda_only
-@pytest.mark.parametrize(
-    "op", ALL_FW_OPS_NO_SMALLK, ids=[op.NAME for op in ALL_FW_OPS_NO_SMALLK]
-)
-def test_unsupported_stride_alignment(op: Type[fmha.AttentionFwOpBase]):
-    q = torch.empty([1, 2, 1, 33], device="cuda", dtype=torch.float16)[:, :, :, :32]
-    try:
-        fmha.memory_efficient_attention(q, q, q, op=(op, None))
-    except ValueError as e:
-        if "Only work on pre-MLIR triton for now" in str(e):
-            pytest.skip("Only work on pre-MLIR triton for now")
-        q = q.contiguous()
-        fmha.memory_efficient_attention(q, q, q, op=(op, None))
-
-def test_attn_bias_causal() -> None:
-    m = -math.inf
-    causal_mask = torch.tensor([[0, m], [0, 0], [0, 0]])
-    tensor_bias = torch.tensor([[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]])
-
-    attn_bias = fmha.attn_bias.LowerTriangularMask()
-    assert_allclose(attn_bias.materialize(causal_mask.shape), causal_mask, "causal")
-    attn_bias = attn_bias.add_bias(tensor_bias)
-    assert_allclose(
-        attn_bias.materialize(causal_mask.shape),
-        tensor_bias + causal_mask,
-        "causal+tensor_bias",
-    )
-
-
-def test_attn_bias_torch_tensor() -> None:
-    tensor_bias = torch.tensor([[1.0, 2.0, 3.0], [3.0, 4.0, 5.0]])
-    attn_bias = fmha.attn_bias.LowerTriangularMaskWithTensorBias(tensor_bias)
-    m = -math.inf
-    causal_bias = torch.tensor([[0, m, m], [0, 0, m]])
-    assert_allclose(
-        attn_bias.materialize((2, 3)), causal_bias + tensor_bias, "tensor_bias+causal"
-    )
-
-
-def test_attn_bias_blockdiag() -> None:
-    queries = [
-        torch.randn([1, 3, 1, 8]),
-        torch.randn([1, 2, 1, 8]),
-        torch.randn([1, 5, 1, 8]),
-    ]
-    attn_bias, q = fmha.BlockDiagonalMask.from_tensor_list(queries)
-
-    # Verify mask
-    as_tensor = attn_bias.materialize((10, 10))
-    assert int((as_tensor != -math.inf).sum().item()) == 3 * 3 + 2 * 2 + 5 * 5
-    assert_allclose(as_tensor[0:3, 0:3], torch.zeros([3, 3]), "batch0")
-    assert_allclose(as_tensor[3:5, 3:5], torch.zeros([2, 2]), "batch1")
-    assert_allclose(as_tensor[5:, 5:], torch.zeros([5, 5]), "batch2")
-
-    # Verify we can split it back
-    queries2 = attn_bias.split(q)
-    assert len(queries) == len(queries2)
-    for q1, q2 in zip(queries, queries2):
-        assert_allclose(q1, q2)
-
-
-def test_attn_bias_blockdiag_batched() -> None:
-    queries = [
-        torch.randn([1, 3, 1, 8]),
-        torch.randn([3, 2, 1, 8]),
-        torch.randn([1, 5, 1, 8]),
-    ]
-    attn_bias, q = fmha.BlockDiagonalMask.from_tensor_list(queries)
-
-    # Verify mask
-    as_tensor = attn_bias.materialize((14, 14))
-    assert int((as_tensor != -math.inf).sum().item()) == 3 * 3 + 3 * 2 * 2 + 5 * 5
-    assert_allclose(as_tensor[0:3, 0:3], torch.zeros([3, 3]), "batch0")
-    assert_allclose(as_tensor[3:5, 3:5], torch.zeros([2, 2]), "batch1.0")
-    assert_allclose(as_tensor[5:7, 5:7], torch.zeros([2, 2]), "batch1.1")
-    assert_allclose(as_tensor[7:9, 7:9], torch.zeros([2, 2]), "batch1.2")
-    assert_allclose(as_tensor[9:, 9:], torch.zeros([5, 5]), "batch2")
-
-    # Verify we can split it back
-    queries2 = attn_bias.split(q)
-    assert len(queries) == len(queries2)
-    for q1, q2 in zip(queries, queries2):
-        assert_allclose(q1, q2)
-
-
-def test_attn_bias_blockdiag_crossattn_causal() -> None:
-    # Q / KV have different seqlen
-    list_q = [
-        torch.randn([1, 3, 1, 8]),
-        torch.randn([2, 1, 1, 8]),
-    ]
-    list_k = [
-        torch.randn([1, 2, 1, 8]),
-        torch.randn([2, 3, 1, 8]),
-    ]
-
-    attn_bias, q, k, _ = fmha.attn_bias.BlockDiagonalMask.from_tensor_lists_qkv(
-        list_q, list_k
-    )
-
-    # Verify mask
-    as_tensor = attn_bias.materialize((q.shape[1], k.shape[1]))
-    assert int((as_tensor != -math.inf).sum().item()) == 3 * 2 + 2 * 3 * 1
-    assert_allclose(as_tensor[0:3, 0:2], torch.zeros([3, 2]), "batch0")
-    assert_allclose(as_tensor[3:4, 2:5], torch.zeros([1, 3]), "batch1.0")
-    assert_allclose(as_tensor[4:, 5:], torch.zeros([1, 3]), "batch1.1")
-
-    # Also test causal version
-    as_tensor = attn_bias.make_causal().materialize((q.shape[1], k.shape[1]))
-    assert_allclose(
-        as_tensor[3:4, 2:5],
-        fmha.attn_bias.LowerTriangularMask().materialize((1, 3)),
-        "batch1.0[causal]",
-    )
-
-    # Verify we can split it back
-    list_q2 = attn_bias.split_queries(q)
-    assert len(list_q) == len(list_q2)
-    for q1, q2 in zip(list_q, list_q2):
-        assert_allclose(q1, q2)
-    with pytest.raises(ValueError):
-        attn_bias.split_queries(k)
-    list_k2 = attn_bias.split_kv(k)
-    assert len(list_k) == len(list_k2)
-    for k1, k2 in zip(list_k, list_k2):
-        assert_allclose(k1, k2)
-
-
-def test_attn_bias_blockdiag_crossattn_causal_with_prefix_qk_cond() -> None:
-    list_q = [
-        torch.randn([1, 3, 1, 8]),
-    ]
-    list_k = [
-        torch.randn([1, 2, 1, 8]),
-    ]
-    attn_bias, q, k, _ = fmha.attn_bias.BlockDiagonalMask.from_tensor_lists_qkv(
-        list_q, list_k
-    )
-    with pytest.raises(ValueError):
-        attn_bias.make_causal_from_bottomright()
-
-
-def test_attn_bias_blockdiag_crossattn_causal_with_prefix() -> None:
-    # Q / KV have different seqlen
-    list_q = [
-        torch.randn([1, 2, 1, 8]),
-        torch.randn([2, 2, 1, 8]),
-    ]
-    list_k = [
-        torch.randn([1, 2, 1, 8]),
-        torch.randn([2, 5, 1, 8]),
-    ]
-
-    attn_bias, q, k, _ = fmha.attn_bias.BlockDiagonalMask.from_tensor_lists_qkv(
-        list_q, list_k
-    )
-    as_tensor = attn_bias.make_causal_from_bottomright().materialize(
-        (q.shape[1], k.shape[1])
-    )
-    m = -math.inf
-    assert_allclose(
-        as_tensor[0:2, 0:2],
-        torch.tensor([[0, m], [0, 0]], dtype=torch.float32),
-        "batch1.1[causal_with_prefix]",
-    )
-    assert_allclose(
-        as_tensor[2:4, 2:7],
-        torch.tensor([[0, 0, 0, 0, m], [0, 0, 0, 0, 0]], dtype=torch.float32),
-        "batch2.1[causal_with_prefix]",
-    )
-    assert_allclose(
-        as_tensor[4:6, 7:12],
-        torch.tensor([[0, 0, 0, 0, m], [0, 0, 0, 0, 0]], dtype=torch.float32),
-        "batch2.2[causal_with_prefix]",
-    )
-
-
-@cuda_only
-def test_attn_bias_padded() -> None:
-    bsize, n_heads, d, padding = 8, 3, 8, 32
-
-    # Q / KV have different seqlen
-    k = torch.randn((bsize, padding, n_heads, d), device="cuda", dtype=torch.float16)
-    k_seqlen = [5, 8, 7, 1, 9, 3, 12, 32]
-    other = bsize - 1
-    v = torch.randn((bsize, padding, n_heads, d), device="cuda", dtype=torch.float16)
-    n_q_first = 4
-    q = [
-        torch.randn((1, n_q_first, n_heads, d), device="cuda", dtype=torch.float16),
-        torch.randn((1, other, n_heads, d), device="cuda", dtype=torch.float16),
-    ]
-    q_cat = torch.cat([x.view(1, -1, n_heads, d) for x in q], dim=1)
-    q_seqlen = [n_q_first] + [1] * other
-
-    attn_bias = fmha.attn_bias.BlockDiagonalCausalWithOffsetPaddedKeysMask.from_seqlens(
-        q_seqlen=q_seqlen,
-        kv_seqlen=k_seqlen,
-        kv_padding=padding,
-    )
-
-    v = v.view(1, -1, n_heads, d)
-    k = k.view(1, -1, n_heads, d)
-
-    scores = (q_cat.transpose(1, 2) @ k.transpose(1, 2).transpose(2, 3)).float()
-    assert not scores.isnan().any()
-    mask = torch.full_like(scores, -float("inf"))
-    for i, (slen, qlen) in enumerate(zip(k_seqlen, q_seqlen)):
-        kseq_start = i * padding
-        qstart = sum(q_seqlen[:i])
-        mask[:, :, qstart : qstart + qlen, kseq_start : kseq_start + slen] = torch.triu(
-            mask[:, :, qstart : qstart + qlen, kseq_start : kseq_start + slen].float(),
-            diagonal=1 + slen - qlen,
-        ).float()
-
-    scores += mask
-    assert not scores.isnan().any()
-    # 1,3,10,8 @ 1,3,8,256 -> 1,3,10,256
-    scores = torch.nn.functional.softmax(scores, -1).half()
-    # torch.Size([1, 3, 3, 32]) @ torch.Size([1, 3, 32, 8])
-    output = scores @ v.transpose(1, 2)  # 1,3,10,256 @ 1,3,256, 8 -> 1,3,10,8
-    output = output.transpose(1, 2).contiguous()
-
-    fmha_output = fmha.memory_efficient_attention_forward(
-        q_cat, k, v, attn_bias, scale=1.0, op=fmha.ck.FwOp
-    )
-
-    # assert torch.allclose(output, fmha_output)
-    assert_allclose(
-        output,
-        fmha_output,
-        atol=fmha.cutlass.FwOp.ERROR_ATOL[torch.float16],
-        rtol=fmha.cutlass.FwOp.ERROR_RTOL[torch.float16],
-    )
-
-
-def _kv_heads_label(kv_heads: Optional[int]) -> str:
-    if kv_heads is None:
-        return ""
-    if kv_heads == 1:
-        return "mq"
-    return f"gqa{kv_heads}"
-
-@pytest.mark.parametrize("op", [fmha.ck_decoder.FwOp])
-@pytest.mark.parametrize("kv_heads", [None, 1, 2], ids=_kv_heads_label)
-@pytest.mark.parametrize("bsz,n_heads", [(1, 1), (1, 16), (1, 32), (8, 1), (4, 8)])
-@pytest.mark.parametrize("padding", [32, 4096])
-@pytest.mark.parametrize("dtype", ["f16", "bf16", "f32"])
-def test_decoder(
-    op,
-    n_heads: int,
-    kv_heads: Optional[int],
-    padding: int,
-    bsz: int,
-    dtype: str,
-    dequant: bool = False,
-    num_queries: int = 1,
-    d = 256,
-) -> None:
-    # kv_heads = 1: multiquery
-    # kv_heads = None: neither MQA nor GQA
-    # kv_heads > 1: BMGHK
-    dtype_ = {"f16": torch.float16, "bf16": torch.bfloat16, "f32": torch.float}[dtype]
-    tensor_options = {"dtype": dtype_, "device": "cuda"}
-    torch.manual_seed(1)
-    num_queries = 1
-    if kv_heads is not None and kv_heads > 1:
-        k_shape: Tuple[int, ...] = (1, bsz * padding, kv_heads, n_heads, d)
-        q_shape: Tuple[int, ...] = (
-            1,
-            bsz * num_queries,
-            kv_heads,
-            n_heads,
-            d,
-        )
-    else:
-        k_shape = (1, bsz * padding, n_heads, d)
-        q_shape = (1, bsz * num_queries, n_heads, d)
-
-    k = torch.randn(k_shape, **tensor_options)
-    k_seqlen = torch.randint(num_queries, padding + 1, (bsz,)).tolist()
-    v = torch.randn_like(k)
-    q = torch.randn(q_shape, **tensor_options)
-    causal_diagonal = torch.tensor(  # TODO: make unnecessary
-        [i - 1 for i in k_seqlen], dtype=torch.int32
-    ).cuda()
-
-    if kv_heads is not None:
-        k = k[..., :1, :].expand(k_shape)
-        v = v[..., :1, :].expand(k_shape)
-
-    attn_bias = fmha.attn_bias.BlockDiagonalCausalWithOffsetPaddedKeysMask.from_seqlens(
-        q_seqlen=[num_queries] * bsz,
-        kv_seqlen=k_seqlen,
-        causal_diagonal=causal_diagonal,
-        kv_padding=padding,
-    )
-    inp = fmha.Inputs(q, k, v, attn_bias=attn_bias)
-    if (not_supported_reasons := op.not_supported_reasons(inp)):
-        pytest.skip(f"{not_supported_reasons=}")
-
-    decoder_output = fmha.memory_efficient_attention_forward(
-        q, k, v, attn_bias, op=op
-    )
-
-    ref_output = ref_attention(q, k, v, attn_bias)
-
-    assert_allclose(
-        decoder_output.float(),
-        ref_output,
-        atol=fmha.ck_decoder.FwOp.ERROR_ATOL[dtype_] * 4,
-        rtol=fmha.ck_decoder.FwOp.ERROR_RTOL[dtype_],
-    )
-
-def test_attn_bias_from_seqlens() -> None:
-    bias = fmha.attn_bias.BlockDiagonalMask.from_seqlens([3, 5, 1])
-    out = bias.split(torch.randn([1, 3 + 5 + 1, 16]))
-    assert len(out) == 3
-    assert tuple(out[0].shape) == (1, 3, 16)
-
-
-@cuda_only
-def test_attn_bias_blockdiag_doc() -> None:
-    """IMPORTANT:
-    This is the example in the doc for `BlockDiagonalMask`.
-    If this example needs to be updated, please also update the doc
-    """
-    import torch
-
-    from xformers.ops import fmha
-
-    K = 16
-    dtype = torch.float16
-    device = "cuda"
-    list_x = [
-        torch.randn([1, 3, 1, K], dtype=dtype, device=device),
-        torch.randn([1, 6, 1, K], dtype=dtype, device=device),
-        torch.randn([1, 2, 1, K], dtype=dtype, device=device),
-    ]
-    attn_bias, x = fmha.BlockDiagonalMask.from_tensor_list(list_x)
-
-    linear = torch.nn.Linear(K, K * 3).to(device=device, dtype=dtype)  # type: ignore
-
-    q, k, v = linear(x).reshape([1, -1, 1, 3, K]).unbind(-2)
-    out = fmha.memory_efficient_attention(q, k, v, attn_bias=attn_bias, op=(fmha.ck.FwOp, None))
-    list_out = attn_bias.split(out)
-    assert tuple(list_out[0].shape) == (1, 3, 1, K)
-
-
-@cuda_only
-class TestAttnBias:
-    @staticmethod
-    def create_tensors(
-        dtype,
-        B: int = 2,
-        Mq: int = 32,
-        Mkv: int = 32,
-        H: int = 3,
-        K: int = 16,
-        Kv: int = 16,
-    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
-        return (
-            torch.randn([B, Mq, H, K], device="cuda", dtype=dtype) * 3,
-            torch.randn([B, Mkv, H, K], device="cuda", dtype=dtype) * 3,
-            torch.randn([B, Mkv, H, Kv], device="cuda", dtype=dtype) * 3,
-            torch.randn([B, H, Mq, Mkv], device="cuda", dtype=dtype) * 3,
-        )
-
-    @staticmethod
-    def pad_bias(bias: torch.Tensor) -> torch.Tensor:
-        align_to = 16
-        if (bias.shape[-1] % align_to) == 0:
-            return bias
-        pad_count = align_to - (bias.shape[-1] % align_to)
-        return torch.nn.functional.pad(bias, [0, pad_count])[:, :, :, : bias.shape[-1]]
-
-    def test_f16_biasf32(self) -> None:
-        q, k, v, bias = self.create_tensors(torch.float16)
-        fmha.memory_efficient_attention(q, k, v, attn_bias=bias)
-        bias = bias.to(torch.float32)
-        with pytest.raises((ValueError, RuntimeError)):
-            fmha.memory_efficient_attention(q, k, v, attn_bias=bias)
-
-    def test_f32_biasf16(self) -> None:
-        q, k, v, bias = self.create_tensors(torch.float32)
-        fmha.memory_efficient_attention(q, k, v, attn_bias=bias)
-        bias = bias.to(torch.float16)
-        with pytest.raises((ValueError, RuntimeError)):
-            fmha.memory_efficient_attention(q, k, v, attn_bias=bias)
-
-    @pytest.mark.parametrize("dtype", [torch.float32, torch.float16])
-    def test_wrong_alignment(self, dtype) -> None:
-        op = fmha.cutlass.FwOp
-        q, k, v, bias = self.create_tensors(dtype, Mq=7, Mkv=5)
-        try:
-            fmha.memory_efficient_attention(q, k, v, attn_bias=bias, op=(op, None))
-            return
-        except (ValueError, RuntimeError):
-            pass
-        # This case is not supported, likely due to padding issues
-        # Let's make sure it works with padding
-        assert bias.ndim == 4, bias.shape
-        bias_padded = self.pad_bias(bias)
-        out = fmha.memory_efficient_attention(
-            q, k, v, attn_bias=bias_padded, op=(op, None)
-        ).float()
-        ref_out = ref_attention_bmhk(q, k, v, bias)
-        assert_allclose(
-            out, ref_out, atol=op.ERROR_ATOL[dtype], rtol=op.ERROR_RTOL[dtype]
-        )
-
-    def test_permuted_attn_bias(self) -> None:
-        op = fmha.cutlass.FwOp
-        dtype = torch.float16
-        q, k, v, bias = self.create_tensors(dtype, Mq=7, Mkv=7)
-        bias = bias.transpose(-1, -2)  # now `stride(-1) != 1`
-        # Either it works, or it raises an exception
-        # but we should never get a CUDA error
-        try:
-            out = fmha.memory_efficient_attention(
-                q, k, v, attn_bias=bias, op=(op, None)
-            ).float()
-            ref_out = ref_attention_bmhk(q, k, v, bias)
-            assert_allclose(
-                out, ref_out, atol=op.ERROR_ATOL[dtype], rtol=op.ERROR_RTOL[dtype]
-            )
-        except (ValueError, RuntimeError):
-            pass
-
-
-SM_AND_SHMEM_KBYTES = [
-    # https://docs.nvidia.com/cuda/cuda-c-programming-guide/#features-and-technical-specifications-technical-specifications-per-compute-capability
-    (50, 64),
-    (60, 64),
-    (70, 96),
-    (75, 64),
-    (80, 163),
-    (86, 99),
-    (89, 99),
-    # (90, 227),
-]
-
-
-@cuda_only
-@pytest.mark.parametrize("dtype_str", ["f32", "f16", "bf16"])
-@pytest.mark.parametrize(
-    "sm_shmem",
-    SM_AND_SHMEM_KBYTES,
-    ids=[f"cc{sm}_shmem{shmem}kb" for sm, shmem in SM_AND_SHMEM_KBYTES],
-)
-def test_has_kernel_for(sm_shmem: Tuple[int, int], dtype_str: str) -> None:
-    dtype = {"f32": torch.float, "f16": torch.half, "bf16": torch.bfloat16}[dtype_str]
-    sm, shmem_kbytes = sm_shmem
-    if sm < 80 and dtype_str == "bf16":
-        return
-
-    for k in [16, 32, 64, 128, 256]:
-        assert torch.ops.xformers._has_cutlassF_kernel_for(
-            dtype, sm, shmem_kbytes * 1024, k
-        ), f"k={k}"
-        assert torch.ops.xformers._has_cutlassB_kernel_for(
-            dtype, sm, shmem_kbytes * 1024, k
-        ), f"k={k}"
-
-
-def test_window_size_materialize() -> None:
-    seqlens = [4, 6]
-    attn_bias = fmha.attn_bias.BlockDiagonalMask.from_seqlens(
-        q_seqlen=seqlens,
-        kv_seqlen=seqlens,
-    ).make_local_attention(2)
-    mask = attn_bias.materialize(
-        (1, 1, sum(seqlens), sum(seqlens)),
-        device="cpu",
-        dtype=torch.float32,
-    )
-    true_mask = torch.log(
-        torch.Tensor(
-            [
-                [
-                    [
-                        [1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
-                        [1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
-                        [0.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
-                        [0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
-                        [0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0],
-                        [0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0],
-                        [0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 0.0],
-                        [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 0.0, 0.0],
-                        [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 0.0],
-                        [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0],
-                    ]
-                ]
-            ]
-        )
-    )
-    assert torch.all(mask == true_mask)
-
-
-@cuda_only
-@pytest.mark.parametrize(
-    "opFW_biasT",
-    [
-        (op, biasT)
-        for op in ALL_FW_OPS
-        for biasT in op.SUPPORTED_ATTN_BIAS_TYPES
-        if op.SUPPORTS_BMGHK
-    ],
-)
-def test_forward_gqa(opFW_biasT):
-    opFW, biasT = opFW_biasT
-    B_Mq_Mkv_H_K_Kv = (3, 512, 512, 16, 128, 128)
-    test_forward(
-        (
-            opFW,
-            "cuda",
-            torch.float16,
-            biasT,
-            *B_Mq_Mkv_H_K_Kv,
-        ),
-        packed=False,
-        fmt="BMGHK",
-        g=2,
-    )
-
-
-@cuda_only
-@pytest.mark.parametrize(
-    "opBW",
-    [
-        fmha.flash.BwOp,
-        fmha.cutlass.BwOp,
-    ],
-)
-def test_backward_gqa(opBW):
-    H = 8
-    B_Mq_Mkv_H_K_Kv = (3, 512, 512, H, 128, 128)
-    dtype = torch.float16
-    query, key, value, attn_bias = create_tensors(
-        *(opBW, "cuda", dtype, type(None), *B_Mq_Mkv_H_K_Kv),
-        attn_bias_requires_grad=False,
-        fmt="BMHK",
-    )
-    op = (fmha.cutlass.FwOp, opBW)
-    key = key[:, :, :1].expand(-1, -1, H, -1)
-    value = value[:, :, :1].expand(-1, -1, H, -1)
-    key.requires_grad_(True)
-    out = fmha.memory_efficient_attention(query, key, value, attn_bias=attn_bias)
-    out_ref = ref_attention_bmhk(query, key, value, attn_bias=attn_bias)
-    assert_allclose(
-        out.float(),
-        out_ref.float(),
-        atol=op[0].ERROR_ATOL[dtype],
-        rtol=op[0].ERROR_RTOL[dtype],
-    )
-    out.backward(query)
-    dk = key.grad
-    key.grad = None
-    out_ref.backward(query)
-    assert_allclose(
-        dk.float(),
-        key.grad.float(),
-        atol=op[1].ERROR_ATOL[dtype],
-        rtol=op[1].ERROR_RTOL[dtype],
-    )
-
-
-@cuda_only
-@pytest.mark.parametrize("opFW", [op for op in ALL_FW_OPS if op.SUPPORTS_BMGHK])
-def test_forward_gqa_one_group(opFW):
-    dtype = torch.float16
-    B, Mq, Mkv, H, K = 3, 13, 16, 5, 128
-    q = torch.randn([B, Mq, 1, H, K], dtype=dtype, device="cuda") * 3
-    k = torch.randn([B, Mkv, 1, H, K], dtype=dtype, device="cuda") * 3
-    v = torch.randn([B, Mkv, 1, H, K], dtype=dtype, device="cuda") * 3
-
-    supported = opFW.supports(fmha.Inputs(q, k, v))
-    if not supported:
-        supported_bmhk = opFW.supports(fmha.Inputs(q[:, :, 0], k[:, :, 0], v[:, :, 0]))
-        assert supported == supported_bmhk
-        pytest.skip("not supported")
-    out = fmha.memory_efficient_attention_forward(q, k, v, op=opFW)
-    ref = ref_attention(q, k, v)
-    assert_allclose(
-        out.float(),
-        ref,
-        atol=opFW.ERROR_ATOL[dtype],
-        rtol=opFW.ERROR_RTOL.get(dtype, 1e-5),
-    )
-
-'''
-@sm80_or_better_only
-def test_flash_gqa_wrong_strides() -> None:
-    op = (fmha.flash.FwOp, None)
-    device = "cuda"
-    B, Mq, Mkv, G, H, K = 3, 1, 512, 2, 8, 128
-    q = torch.empty((B, Mq, G, H, K), dtype=torch.float16, device=device)
-    kv = torch.empty((B, Mkv, G, H, K), dtype=torch.float16, device=device)
-    fmha.memory_efficient_attention(q, kv, kv, op=op)
-
-    kv = torch.empty((B, Mkv, H, G, K), dtype=torch.float16, device=device).permute(
-        0, 1, 3, 2, 4
-    )
-    with pytest.raises(ValueError):
-        fmha.memory_efficient_attention(q, kv, kv, op=op)
-
-    kv = torch.empty((B, Mkv, G, 1, K), dtype=torch.float16, device=device)
-    with pytest.raises(ValueError):
-        fmha.memory_efficient_attention(q, kv, kv, op=op)
-    kv = kv.expand(-1, -1, -1, H, K)
-    fmha.memory_efficient_attention(q, kv, kv, op=op)
-
-    kv = torch.empty((B, Mkv, G, H, 2 * K), dtype=torch.float16, device=device)[
-        :, :, :, :, :K
-    ]
-    fmha.memory_efficient_attention(q, kv, kv, op=op)
-'''
-
-def _dispatches_to_splitK(q, kv):
-    return (
-        _dispatch_fw_priority_list(fmha.Inputs(q, kv, kv), False)[0]
-        is fmha.triton_splitk.FwOp
-    )
-
-
-def _dispatches_to_flash_decoding(q, kv):
-    return (
-        _dispatch_fw_priority_list(fmha.Inputs(q, kv, kv), False)[0] is fmha.flash.FwOp
-    )
-
-
-def test_dispatch_decoding_bmhk() -> None:
-    assert not _dispatches_to_splitK(
-        torch.empty([1, 8, 1, 128]), torch.empty([1, 2048, 1, 128])
-    ), "Should not use SplitK with 1 head (no tensorcores)"
-    assert _dispatches_to_flash_decoding(
-        torch.empty([1, 8, 32, 128]),
-        torch.empty([1, 2048, 1, 128]).expand(-1, -1, 32, -1),
-    ), "Should use Flash-Decoding with BMHK MQA"
-    assert not _dispatches_to_splitK(
-        torch.empty([1, 8, 32, 128]),
-        torch.empty([1, 2048, 32, 128]),
-    ), "Should not use SplitK when no TensorCores"
-    assert not _dispatches_to_splitK(
-        torch.empty([1, 128, 32, 128]),
-        torch.empty([1, 2048, 1, 128]).expand(-1, -1, 32, -1),
-    ), "Should not use SplitK if q seqlen is long"
-    assert not _dispatches_to_splitK(
-        torch.empty([128, 8, 32, 128]),
-        torch.empty([128, 2048, 1, 128]).expand(-1, -1, 32, -1),
-    ), "Should not use SplitK if B is big"
-
-
-def test_dispatch_decoding_bmghk() -> None:
-    assert not _dispatches_to_splitK(
-        torch.empty([1, 8, 1, 1, 128]), torch.empty([1, 2048, 1, 1, 128])
-    ), "Should not use SplitK with 1 head (no tensorcores)"
-    assert _dispatches_to_flash_decoding(
-        torch.empty([1, 8, 1, 32, 128]),
-        torch.empty([1, 2048, 1, 1, 128]).expand(-1, -1, -1, 32, -1),
-    ), "Should use Flash-Decoding with MQA"
-    assert _dispatches_to_flash_decoding(
-        torch.empty([1, 8, 4, 32, 128]),
-        torch.empty([1, 2048, 4, 1, 128]).expand(-1, -1, -1, 32, -1),
-    ), "Should use Flash-Decoding with GQA"
-    assert not _dispatches_to_splitK(
-        torch.empty([1, 8, 1, 32, 128]),
-        torch.empty([1, 2048, 1, 32, 128]),
-    ), "Should not use SplitK when no TensorCores"
-    assert not _dispatches_to_splitK(
-        torch.empty([1, 128, 1, 32, 128]),
-        torch.empty([1, 2048, 1, 1, 128]).expand(-1, -1, -1, 32, -1),
-    ), "Should not use SplitK if q seqlen is long"
-    assert not _dispatches_to_splitK(
-        torch.empty([128, 8, 1, 32, 128]),
-        torch.empty([128, 2048, 1, 1, 128]).expand(-1, -1, -1, 32, -1),
-    ), "Should not use SplitK if B is big"
-
-
-shapes_triton_splitk = [
-    (1, 8, 2**16, 1, 128, 128),
-    (1, 4, 2**16, 1, 128, 128),
-    (1, 16, 2**16, 1, 128, 128),
-    (1, 16, 2**16, 1, 32, 32),
-    (1, 8, 1025, 1, 128, 128),
-    (2, 8, 4096, 1, 128, 128),
-    (10, 8, 2**16, 1, 128, 128),
-    (10, 15, 2**16, 1, 128, 128),
-    (1, 3, 2**16, 1, 128, 128),
-    (1, 3, 2**16 - 10, 1, 128, 128),
-    (2, 3, 73, 1, 128, 128),
-    (2, 7, 7328, 1, 128, 128),
-    (2, 7, 7328, 1, 120, 120),
-    (2, 7, 63, 1, 120, 120),
-]
-op_device_dtype_biasT_B_Mq_Mkv_H_K_Kv_splitk = [
-    (fmha.triton_splitk.FwOp, "cuda", torch.float16, type(None), *s)
-    for s in shapes_triton_splitk
-] + [
-    (fmha.triton_splitk.FwOp, "cuda", torch.bfloat16, type(None), *s)
-    for s in shapes_triton_splitk
-]
-
-
-@pytest.mark.parametrize(
-    "opFW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv",
-    op_device_dtype_biasT_B_Mq_Mkv_H_K_Kv_splitk,
-    ids=[make_id(*c) for c in op_device_dtype_biasT_B_Mq_Mkv_H_K_Kv_splitk],
-)
-@cuda_only
-def test_forward_splitk(
-    opFW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv,
-    packed=False,
-    fmt="BMHK",
-):
-    test_forward(opFW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv, packed=packed, fmt=fmt)
-
-
-@cuda_only
-@pytest.mark.parametrize("op", [fmha.triton_splitk.FwOp])
-@pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16])
-@pytest.mark.parametrize(
-    "B_Mkv_H_K",
-    [
-        (1, 2**16, 3, 128),
-        (5, 53, 4, 64),
-    ],
-)
-def test_mqa_decoding(op: Type[fmha.AttentionFwOpBase], dtype, B_Mkv_H_K):
-    B, Mkv, H, K = B_Mkv_H_K
-    q = torch.randn([B, 1, H, K], dtype=dtype, device="cuda") * 3
-    k = torch.randn([B, Mkv, 1, K], dtype=dtype, device="cuda") * 3
-    v = torch.randn([B, Mkv, 1, K], dtype=dtype, device="cuda") * 3
-    k = k.expand(-1, -1, H, -1)
-    v = v.expand(-1, -1, H, -1)
-
-    if not op.supports(fmha.Inputs(q, k, v)):
-        pytest.skip("not supported")
-    out = fmha.memory_efficient_attention_forward(q, k, v, op=op)
-    ref = ref_attention(q, k, v)
-    assert_allclose(
-        out.float(),
-        ref,
-        atol=op.ERROR_ATOL[dtype],
-        rtol=op.ERROR_RTOL.get(dtype, 1e-5),
-    )
-
-
-@parametrize_opFW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv__xs
-def test_empty_tensors_empty_query(
-    opFW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv,
-):
-    query, key, value, attn_bias = create_tensors(
-        *opFW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv,
-        fmt="BMHK",
-    )
-    opFW = opFW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv[0]
-
-    query = query[:, :0]
-    query.requires_grad_(True)
-    key.requires_grad_(True)
-    value.requires_grad_(True)
-    out = xformers.ops.memory_efficient_attention(query, key, value, op=(opFW, None))
-    assert out.shape[1] == 0
-    out.backward(out)
-    # dK/dV should be all zeros
-    assert_allclose(key.grad, torch.zeros_like(key.grad), "key.grad")
-    assert_allclose(value.grad, torch.zeros_like(value.grad), "value.grad")
-
-
-@parametrize_opFW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv__xs
-def test_empty_tensors_empty_kv(
-    opFW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv,
-):
-    query, key, value, attn_bias = create_tensors(
-        *opFW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv,
-        fmt="BMHK",
-    )
-    opFW = opFW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv[0]
-
-    key = key[:, :0]
-    value = value[:, :0]
-    query.requires_grad_(True)
-    key.requires_grad_(True)
-    value.requires_grad_(True)
-    out = xformers.ops.memory_efficient_attention(query, key, value, op=(opFW, None))
-    assert_allclose(out, torch.zeros_like(out), "out")
-    out.backward(out)
-    # dQ should be all zeros
-    assert_allclose(query.grad, torch.zeros_like(query.grad), "query.grad")
-
-
-@parametrize_opFW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv__xs
-def test_empty_tensors_empty_b(
-    opFW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv,
-):
-    query, key, value, attn_bias = create_tensors(
-        *opFW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv,
-        fmt="BMHK",
-    )
-    opFW = opFW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv[0]
-
-    query, key, value = query[:0], key[:0], value[:0]
-    query.requires_grad_(True)
-    key.requires_grad_(True)
-    value.requires_grad_(True)
-    out = xformers.ops.memory_efficient_attention(query, key, value, op=(opFW, None))
-    out.backward(out)
-
-
-def test_local_attn_bias() -> None:
-    mask = (
-        fmha.attn_bias.LocalAttentionFromBottomRightMask(window_left=1, window_right=2)
-        .materialize(shape=(4, 4))
-        .exp()
-    )
-
-    expected = torch.tensor(
-        [[1, 1, 1, 0], [1, 1, 1, 1], [0, 1, 1, 1], [0, 0, 1, 1]], dtype=torch.float32
-    )
-    assert (mask == expected).all().item()
-
-
-@cuda_only
-@pytest.mark.parametrize("cc", [60, 70, 80])
-@pytest.mark.parametrize("maxK", [32, 64, 128, 256])
-@pytest.mark.parametrize("dtype", [torch.float32, torch.float16])
-@pytest.mark.parametrize(
-    "custom_mask_type",
-    [
-        fmha.cutlass._CustomMaskType.NoCustomMask,
-        fmha.cutlass._CustomMaskType.CausalFromTopLeft,
-        fmha.cutlass._CustomMaskType.CausalFromBottomRight,
-    ],
-)
-@pytest.mark.parametrize("window_size", [0, 3, 300])
-@pytest.mark.parametrize(
-    "num_queries,num_keys",
-    [
-        (30, 66),
-        (256, 256),
-        # Edge cases
-        (314, 320),
-        (32, 256),
-        (224, 226),
-        (5, 531),
-        (320, 332),  # for win_size=300
-        # Others
-        (256, 62),
-        (256, 63),
-        (256, 64),
-        (256, 65),
-        (256, 66),
-    ],
-)
-def test_cutlassB_iter_order(
-    dtype,
-    cc: int,
-    maxK: int,
-    num_queries: int,
-    num_keys: int,
-    custom_mask_type,
-    window_size,
-) -> None:
-    """
-    This tests some internals of the cutlassB kernel
-    We test the iteration across blocks of [queries, keys] to ensure
-    that we correctly:
-    * Iterate over all the blocks that should be iterated
-    * Do *not* iterate over blocks that are completely masked out
-    * Correctly compute the number of parallel blocks that will compute
-        the same block of dQ
-    .. and we test this across variable causal masks+local attention combinations
-    """
-    if (
-        window_size > 0
-        and custom_mask_type == fmha.cutlass._CustomMaskType.NoCustomMask
-    ):
-        pytest.skip("LocalAttention is only supported for causal")
-    get_iteration_data = partial(
-        torch.ops.xformers._cutlassB_iteration_data,
-        dtype=dtype,
-        cc=cc,
-        maxK=maxK,
-        num_queries=num_queries,
-        num_keys=num_keys,
-        custom_mask_type=custom_mask_type,
-        window_size=window_size,
-    )
-    bias = torch.zeros([num_queries, num_keys], dtype=torch.float32)
-    if custom_mask_type != fmha.cutlass._CustomMaskType.NoCustomMask:
-        bias = fmha.attn_bias._materialize_causal_mask(
-            (num_queries, num_keys),
-            dtype=torch.float32,
-            device="cpu",
-            window_size=None if window_size == 0 else window_size,
-            from_bottomright=(
-                custom_mask_type == fmha.cutlass._CustomMaskType.CausalFromBottomRight
-            ),
-        )
-
-    block_queries, block_keys = get_iteration_data()[:2]
-    mask_pooled = (
-        F.max_pool2d(bias.unsqueeze(0), (block_queries, block_keys), ceil_mode=True)
-        == 0
-    ).int()[0]
-    attn_computed = torch.zeros_like(mask_pooled)
-    for key_start in range(0, num_keys, block_keys):
-        it = 0
-        new_key_start = key_start
-        new_query_start = get_iteration_data(key_start=key_start)[2]
-        try:
-            expected_first_query = (
-                mask_pooled[:, key_start // block_keys].tolist().index(1)
-                * block_queries
-            )
-            assert (
-                new_query_start == expected_first_query
-            ), f"Wrong first query for K={key_start}: {new_query_start} (expected {expected_first_query})"
-        except ValueError:  # Nothing to compute in this column
-            pass
-
-        while new_key_start == key_start and new_query_start < num_queries:
-            query_start = new_query_start
-            attn_computed[query_start // block_queries, key_start // block_keys] += 1
-            # print(f"Compute [{query_start}, {key_start}]")
-
-            # Is there something to compute here?
-            assert mask_pooled[
-                query_start // block_queries, key_start // block_keys
-            ].item(), "Computing a block that is not needed!"
-            new_query_start, new_key_start = get_iteration_data(
-                key_start=key_start, query_start=query_start
-            )[3:5]
-            it += 1
-            assert it < num_queries, ""
-        assert (attn_computed == mask_pooled)[
-            :, key_start // block_keys
-        ].all(), "some blocks were not computed!"
-
-    # Now check that the number returned by `getNumParallelBlocksForQuery` is correct
-    for query_start in range(0, num_queries, block_queries):
-        num_parallel_blocks = get_iteration_data(
-            query_start=query_start, num_splits_key=num_keys
-        )[5]
-        num_actual = mask_pooled[query_start // block_queries].sum().item()
-        assert num_parallel_blocks == num_actual
-# end of file

From 34466be90735ce36d8ef3073bf904a3e372c1f9a Mon Sep 17 00:00:00 2001
From: Qianfeng Zhang <Qianfeng.Zhang@amd.com>
Date: Thu, 1 Feb 2024 17:12:30 +0000
Subject: [PATCH 401/641] Update to test_mqa_forward_ck_tiled.py to use common
 create_attn_bias method

---
 tests/test_mqa_forward_ck_tiled.py | 482 +----------------------------
 1 file changed, 6 insertions(+), 476 deletions(-)

diff --git a/tests/test_mqa_forward_ck_tiled.py b/tests/test_mqa_forward_ck_tiled.py
index e3c1f488c..7bdb75ae2 100644
--- a/tests/test_mqa_forward_ck_tiled.py
+++ b/tests/test_mqa_forward_ck_tiled.py
@@ -15,6 +15,7 @@
 import xformers.ops
 from xformers.ops import fmha
 from xformers.ops.fmha.common import AttentionOpBase
+from xformers.attn_bias_utils import create_attn_bias
 
 from .utils import assert_allclose
 
@@ -32,181 +33,6 @@
     fmha.ck.FwOp,
 ]
 
-ALL_BW_OPS: Sequence[Type[fmha.common.AttentionBwOpBase]] = [
-    fmha.ck.BwOp,
-]
-
-def sample_random_supported_fw(
-    inp: fmha.Inputs, seed: int
-) -> Type[fmha.common.AttentionFwOpBase]:
-    r = random.Random(seed)
-    fw_ops = list(ALL_FW_OPS)
-    r.shuffle(fw_ops)
-    for op in fw_ops:
-        if op.supports(inp):
-            return op
-    raise NotImplementedError(f"Could not find a FW operator for: {inp}")
-
-
-def generate_test_shapes_B_Mq_Mkv_H_K_Kv(op):
-    shapes = []
-    for B in op._TEST_BATCH_SIZES:
-        for Mq in [32, 256]:
-            for Mkv in [32, 64, 256, 1024]:
-                for K in op._TEST_K:
-                    shapes.append((B, Mq, Mkv, 1, K, K))
-        Mq = 256
-        Mkv = 128
-        K = 32
-        H = 1
-        # Weird values of parameters
-        for M in [2, 3, 15, 31, 32, 34, 68, 72, 90, 132, 136]:
-            shapes.append((B, M, Mkv, H, K, K))
-            shapes.append((B, Mq, M, H, K, K))
-        for _K in [1, 2, 3, 31, 34, 36, 38, 40, 64, 80, 160, 256 + 2, 256 + 8, 512]:
-            if _K <= op.SUPPORTED_MAX_K:
-                shapes.append((B, Mq, Mkv, H, _K, _K))
-        # Different value for K / Kv
-        if op.SUPPORTS_DIFFERENT_VALUE_EMBED:
-            for _K in [32, 36, 64, 256 + 8]:
-                shapes.append((B, Mq, Mkv, H, K, _K))
-                shapes.append((B, Mq, Mkv, H, _K, K))
-        # Exotic sizes
-        for K in op._TEST_K:
-            shapes.append((B, 16, 1024, H, K, K))
-            shapes.append((B, 1024, 16, H, K, K))
-        # Some number of heads
-        for H in [3, 5, 12]:
-            shapes.append((max(1, B // H), Mq, Mkv, H, K, K))
-    # Filter-out not supported shapes
-    shapes = [
-        shape
-        for shape in shapes
-        if len(
-            op.shape_not_supported_reasons(
-                Mq=shape[1], Mkv=shape[2], K=shape[4], Kv=shape[5]
-            )
-        )
-        == 0
-    ]
-    # Add some random shapes
-    if op in [
-        fmha.ck.FwOp,
-        fmha.ck.BwOp,
-    ]:
-        K_CHOICES = [8 * i for i in range(1, 256 // 8)]
-        r = random.Random(0)
-        found_count = 0
-        while found_count < 20:
-            B = r.randint(1, 400)
-            Mq = r.randint(1, 500)
-            Mkv = r.randint(1, 500)
-            H = r.randint(2, 11)
-            B = max(B // H, 1)
-            K = r.choice(K_CHOICES)
-            Kv = r.choice(K_CHOICES)
-            if not op.SUPPORTS_DIFFERENT_VALUE_EMBED:
-                Kv = K
-            if len(op.shape_not_supported_reasons(Mq, Mkv, K, Kv)):
-                continue
-            found_count += 1
-            shapes.append((B, Mq, Mkv, H, K, Kv))
-    return shapes
-
-
-def make_id(op, device, dtype, bias_type, *shape):
-    return (
-        f"{op.NAME}-{device}-{str(dtype)}-{bias_type.__name__}"
-        f"-{'-'.join([str(s) for s in shape])}"
-    )
-
-
-def _generate_op_device_dtype_biasT_B_Mq_Mkv_H_K_Kv(
-    ops_list: Sequence[Type[fmha.AttentionOpBase]], max_shapes_per_op: int = 65000
-):
-    r = random.Random(0)
-    combination = []
-    ids = []
-    for op in ops_list:
-        op_count = 0
-        # Sort list of masks, so it's deterministic across runs
-        LIST_MASKS = list(sorted(op.SUPPORTED_ATTN_BIAS_TYPES, key=lambda x: str(x)))
-        for shape in generate_test_shapes_B_Mq_Mkv_H_K_Kv(op):
-            has_one = False
-            for device in _devices:
-                if device not in op.SUPPORTED_DEVICES:
-                    continue
-                for dtype in op.SUPPORTED_DTYPES:
-                    bias_type = r.choice(LIST_MASKS)
-                    # Avoid using too much memory
-                    if bias_type not in [
-                        type(None),
-                        fmha.attn_bias.LowerTriangularMask,
-                    ]:
-                        B, Mq, Mkv, H, K, Kv = shape
-                        B = min(B, 12)
-
-                        if (
-                            bias_type
-                            is fmha.attn_bias.BlockDiagonalCausalFromBottomRightMask
-                        ):
-                            Mq, Mkv = min(Mkv, Mq), max(Mkv, Mq) + 2
-                        elif (
-                            bias_type
-                            is fmha.attn_bias.BlockDiagonalCausalWithOffsetPaddedKeysMask
-                        ):
-                            Mq, Mkv = min(Mkv, Mq), max(Mkv, Mq)
-                        shape = (B, Mq, Mkv, H, K, Kv)
-                    combination.append((op, device, dtype, bias_type, *shape))
-                    ids.append(
-                        f"{op.NAME}-{device}-{str(dtype)}-{bias_type.__name__}"
-                        f"-{'-'.join([str(s) for s in shape])}"
-                    )
-                    has_one = True
-            if has_one:
-                op_count += 1
-            if op_count > max_shapes_per_op:
-                break
-        # Some specific shapes for which we want to run without any mask
-        bias_type = type(None)
-        for shape in (
-            # Some strides/dims don't fit on an uint16
-            (1, 128, 128, 300, 128, 128),
-            (13, 1, 67, 200, 8, 8),
-            (1, 1 + 2**16, 4, 1, 8, 8),
-            (1, 4, 1 + 2**16, 1, 8, 8),
-            # TODO: Some strides don't fit on an uint32
-            # Crashes on Flash, Errors on Cutlass
-            # (1, 1, 64000, 300, 128, 128)
-        ):
-            for device in _devices:
-                if device not in op.SUPPORTED_DEVICES:
-                    continue
-                for dtype in op.SUPPORTED_DTYPES:
-                    combination.append((op, device, dtype, bias_type, *shape))
-    return {
-        "argvalues": combination,
-        "ids": [make_id(*c) for c in combination],
-    }
-
-
-parametrize_opFW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv = pytest.mark.parametrize(
-    "opFW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv",
-    **_generate_op_device_dtype_biasT_B_Mq_Mkv_H_K_Kv(ALL_FW_OPS),
-)
-parametrize_opFW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv__xs = pytest.mark.parametrize(
-    "opFW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv",
-    **_generate_op_device_dtype_biasT_B_Mq_Mkv_H_K_Kv(ALL_FW_OPS, max_shapes_per_op=1),
-)
-parametrize_opBW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv = pytest.mark.parametrize(
-    "opBW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv",
-    **_generate_op_device_dtype_biasT_B_Mq_Mkv_H_K_Kv(ALL_BW_OPS),
-)
-parametrize_opBW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv__xs = pytest.mark.parametrize(
-    "opBW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv",
-    **_generate_op_device_dtype_biasT_B_Mq_Mkv_H_K_Kv(ALL_BW_OPS, max_shapes_per_op=1),
-)
-
 def ref_attention(q, k, v, attn_bias=None, drop_mask=None, p=0.0, scale=None, dtype=None):
     if q.ndim == 4:
         B, M, Hq, K = q.shape
@@ -294,305 +120,13 @@ def T(t):
     out = out.reshape([q.shape[0], q.shape[2], q.shape[1], v.shape[3]])
     return out.permute((0, 2, 1, 3))
 
-
-def _rand_seqlens(
-    r: random.Random,
-    bs: int,
-    q_len: int,
-    kv_len: int,
-    more_keys_than_queries_per_block: bool,
-) -> Tuple[Sequence[int], Sequence[int]]:
-    """
-    Generates lists of lengths of query blocks and corresponding key blocks.
-    The total number of queries will be bs * q_len and the
-    total number of keys will be bs * kv_len.
-    """
-    if more_keys_than_queries_per_block:
-        assert kv_len >= q_len
-    q_len *= bs
-    kv_len *= bs
-    seqlens_q: List[int] = []
-    seqlens_k: List[int] = []
-
-    step_q = [max(1, q_len // 10), max(2, q_len // 2)]
-    step_k = [max(1, kv_len // 10), max(2, kv_len // 2)]
-    while sum(seqlens_q) < q_len and sum(seqlens_k) < kv_len:
-        num_queries = r.randrange(*step_q)
-        seqlens_q.append(num_queries)
-
-        if more_keys_than_queries_per_block:
-            # Must select at least `num_queries` keys
-            # But also leave enough keys for later
-            keys_left = kv_len - sum(seqlens_k, 0)
-            queries_left = q_len - sum(seqlens_q[:-1], 0)
-            assert keys_left >= queries_left
-            seqlens_k.append(num_queries + r.randrange(0, keys_left - queries_left))
-        else:
-            seqlens_k.append(r.randrange(*step_k))
-    seqlens_q[-1] = q_len - sum(seqlens_q[:-1])
-    seqlens_k[-1] = kv_len - sum(seqlens_k[:-1])
-    return seqlens_q, seqlens_k
-
-
-def _rand_partition(r: random.Random, total: int, n: int) -> List[int]:
-    # returns list of n nonnegative integers summing to total
-    idx = {0, total}
-    while len(idx) < n + 1:
-        idx.add(r.randint(1, total - 1))
-    s = sorted(idx)
-    return [e - b for b, e in zip(s[:-1], s[1:])]
-
-
-def _rand_maxed_partition(
-    r: random.Random, total: int, n: int, mx: int, positive: bool = True
-) -> List[int]:
-    # returns list of n nonnegative integers less than mx summing to total
-    # NB: This is unfortunately biased towards evenly-split bins.
-    # If `positive`, outputs are positive
-    if positive:
-        total -= n
-        mx -= 1
-    idxs = r.sample(range(n * mx), total)
-    y = torch.zeros(n, mx, dtype=torch.int32)
-    y.flatten()[idxs] = 1
-    z = y.sum(1)
-    if positive:
-        z += 1
-    return z.tolist()
-
-
-def _rand_seqlens_padded_k(
-    r: random.Random, bs: int, q_len: int, kv_len: int
-) -> Tuple[Sequence[int], Sequence[int]]:
-    # This is for BlockDiagonalCausalWithOffsetPaddedKeysMask.
-    # we need q_seqlens and k_seqlens to be of len bsz.
-    # For each "batch element" there must be more keys than queries
-    # because this bias type is "bottom right" and so any extra queries
-    # will attend to nothing and have undefined result.
-    # In addition every element of k_seqlens must be <= kv_len
-    if q_len > kv_len:
-        raise ValueError("need more keys than values")
-    if q_len == kv_len:
-        # all key slots are needed so we cannot have padding
-        q_seqlens = k_seqlens = [kv_len] * bs
-    else:
-        q_seqlens = _rand_maxed_partition(r, q_len * bs, bs, kv_len)
-        k_seqlens = [r.randint(i, kv_len) for i in q_seqlens]
-    return q_seqlens, k_seqlens
-
-
-def _create_aligned_bias(B: int, H: int, Mq: int, Mkv: int, **kwargs) -> torch.Tensor:
-    align_to = 8
-    return (
-        torch.randn(
-            (
-                B,
-                H,
-                Mq,
-                align_to * ((Mkv + align_to - 1) // align_to),
-            ),
-            **kwargs,
-        )
-        * 3
-    )[:, :, :, :Mkv]
-
-
-def create_attn_bias(
-    bias_type,
-    batch_size: int,
-    num_heads: int,
-    q_len: int,
-    kv_len: int,
-    device,
-    dtype,
-    requires_grad: bool,
-    fmt: str,
-    op: Type[AttentionOpBase],
-):
-    if bias_type is None or isinstance(None, bias_type):
-        return None
-    r = random.Random("-".join(map(str, [batch_size, q_len, kv_len, dtype, fmt])))
-    if bias_type is torch.Tensor:
-        if fmt == "BMK":
-            batch_size *= num_heads
-            num_heads = 1
-        # `small_k` only supports an expanded 1d bias
-        if op in [fmha.small_k.FwOp, fmha.small_k.BwOp]:
-            attn_bias = (
-                torch.randn(
-                    (batch_size, num_heads, 1, kv_len), device=device, dtype=dtype
-                )
-                * 3
-            )
-            attn_bias = attn_bias.expand(batch_size, num_heads, q_len, kv_len)
-        else:
-            attn_bias = _create_aligned_bias(
-                batch_size,
-                num_heads,
-                q_len,
-                kv_len,
-                device=device,
-                dtype=dtype,
-            )
-            # ToDo: need a fix in ck-flashAttn to avoid divided-by-zero when all-(-inf) occurred
-            #       with the data read by one-thread
-            # make sure it also works if the first columns are partially masked out
-            ## attn_bias[0, 0, q_len - 1 :, : num_heads - 2] = -math.inf
-
-        if requires_grad:
-            attn_bias.requires_grad_(True)
-        if fmt == "BMK":
-            attn_bias = attn_bias[:, 0]
-        return attn_bias
-    if bias_type is fmha.attn_bias.LowerTriangularMask:
-        return fmha.attn_bias.LowerTriangularMask()
-    if bias_type is fmha.attn_bias.LowerTriangularMaskWithTensorBias:
-        attn_bias = _create_aligned_bias(
-            batch_size,
-            num_heads,
-            q_len,
-            kv_len,
-            device=device,
-            dtype=dtype,
-        )
-        if requires_grad:
-            attn_bias.requires_grad_(True)
-        return fmha.attn_bias.LowerTriangularMaskWithTensorBias(attn_bias)
-    if bias_type in [
-        fmha.attn_bias.BlockDiagonalMask,
-        fmha.attn_bias.BlockDiagonalCausalMask,
-        fmha.attn_bias.BlockDiagonalCausalFromBottomRightMask,
-    ]:
-        # This bias is not supported in BMK format
-        assert fmt == "BMHK"
-        block_diag = fmha.attn_bias.BlockDiagonalMask.from_seqlens(
-            *_rand_seqlens(
-                r,
-                batch_size,
-                q_len,
-                kv_len,
-                more_keys_than_queries_per_block=bias_type
-                is fmha.attn_bias.BlockDiagonalCausalFromBottomRightMask,
-            )
-        )
-        if bias_type is fmha.attn_bias.BlockDiagonalCausalMask:
-            block_diag = block_diag.make_causal()
-        if bias_type is fmha.attn_bias.BlockDiagonalCausalFromBottomRightMask:
-            block_diag = block_diag.make_causal_from_bottomright()
-        return block_diag
-    if bias_type == fmha.attn_bias.BlockDiagonalCausalWithOffsetPaddedKeysMask:
-        assert fmt == "BMHK"
-        q, k = _rand_seqlens_padded_k(r, batch_size, q_len, kv_len)
-        g_block_diag = (
-            fmha.attn_bias.BlockDiagonalCausalWithOffsetPaddedKeysMask.from_seqlens(
-                q_seqlen=q,
-                kv_padding=kv_len,
-                kv_seqlen=k,
-            )
-        )
-        return g_block_diag
-
-    assert False, f"Unsupported bias type: {bias_type}"
-
-
-def get_bias_grad(attn_bias, clear: bool = False) -> Optional[torch.Tensor]:
-    tensor_with_grad: Optional[torch.Tensor] = None
-    if isinstance(attn_bias, torch.Tensor):
-        tensor_with_grad = attn_bias
-    if isinstance(attn_bias, fmha.attn_bias.LowerTriangularMaskWithTensorBias):
-        tensor_with_grad = attn_bias._bias
-    if tensor_with_grad is not None:
-        grad = tensor_with_grad.grad
-        if clear:
-            tensor_with_grad.grad = None
-        return grad
-    return None
-
-
-def create_tensors(
-    op: Type[AttentionOpBase],
-    device,
-    dtype,
-    attn_bias_type,
-    B,
-    q_len,
-    kv_len,
-    h,
-    k,
-    kv,
-    *,
-    attn_bias_requires_grad: bool = False,
-    fmt: str = "BMK",
-):
-    torch.manual_seed(B * q_len + kv_len * k + kv)
-    scale = 3
-    if fmt == "BMK":
-        query = torch.randn((B * h, q_len, k), device=device, dtype=dtype).mul_(scale)
-        key = torch.randn((B * h, kv_len, k), device=device, dtype=dtype).mul_(scale)
-        value = torch.randn((B * h, kv_len, kv), device=device, dtype=dtype).mul_(scale)
-    else:
-        assert fmt == "BMHK"
-        query = torch.randn((B, q_len, h, k), device=device, dtype=dtype).mul_(scale)
-        key = torch.randn((B, kv_len, h, k), device=device, dtype=dtype).mul_(scale)
-        value = torch.randn((B, kv_len, h, kv), device=device, dtype=dtype).mul_(scale)
-
-    if fmt == "BMK" and not fmha.common._is_bias_type_supported_in_BMK(attn_bias_type):
-        attn_bias_type = None
-    attn_bias = None
-    if attn_bias_type is not None:
-        attn_bias = create_attn_bias(
-            attn_bias_type,
-            batch_size=B,
-            num_heads=h,
-            q_len=q_len,
-            kv_len=kv_len,
-            dtype=dtype,
-            device=device,
-            requires_grad=attn_bias_requires_grad,
-            fmt=fmt,
-            op=op,
-        )
-        if isinstance(
-            attn_bias,
-            (
-                fmha.attn_bias.BlockDiagonalMask,
-                fmha.attn_bias.BlockDiagonalCausalWithOffsetPaddedKeysMask,
-            ),
-        ):
-            query, key, value = [
-                x.reshape([1, -1, *x.shape[2:]]) for x in [query, key, value]
-            ]
-
-    inputs = fmha.Inputs(query=query, key=key, value=value, attn_bias=attn_bias)
-    reasons = op.not_supported_reasons(inputs)
-    if reasons:
-        err_msg = f"{op.NAME}: unsupported ({'/'.join(reasons)})"
-        # Ensure we free memory to avoid OOMs
-        del query, key, value, attn_bias, inputs
-        pytest.skip(err_msg)
-    return query, key, value, attn_bias
-
-
-def bmhk2bmk(tensor) -> torch.Tensor:
-    return (
-        tensor.permute((0, 2, 1, 3))
-        .contiguous()
-        .view([tensor.shape[0] * tensor.shape[2], tensor.shape[1], tensor.shape[3]])
-    )
-
-
-def bmk2bmhk(tensor, num_heads: int) -> torch.Tensor:
-    return tensor.reshape([-1, num_heads, tensor.shape[1], tensor.shape[2]]).permute(
-        (0, 2, 1, 3)
-    )
-
 @pytest.mark.parametrize("hdim_k,hdim_v", [(64, 64), (128, 128)])
 @pytest.mark.parametrize("nhead_q,nhead_kv", [(8, 1), (8, 2), (12, 4), (4, 4)])
-@pytest.mark.parametrize("seqlen_q,seqlen_kv", [(100, 128), (128, 100), (200, 1000), (400, 300)])
+@pytest.mark.parametrize("seqlen_q,seqlen_kv", [(100, 128), (128, 100), (200, 1000)])
 @pytest.mark.parametrize("batches", [100, 64, 1])
 @pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16])
 @pytest.mark.parametrize("attn_bias_type", [type(None), torch.Tensor, fmha.attn_bias.LowerTriangularMask])
-@pytest.mark.parametrize("op", [fmha.ck.FwOp])
+@pytest.mark.parametrize("op", ALL_FW_OPS)
 def test_mqa_forward(
     op,
     attn_bias_type,
@@ -612,16 +146,11 @@ def test_mqa_forward(
     Hkv = nhead_kv
     K = hdim_k
     Kv = hdim_v
-
-    print("Hq=", Hq, "Hkv=", Hkv)
+    nhead_ratio_qk = Hq // Hkv
 
     device = torch.device("cuda")
 
-    if not (K == Kv and (Kv == 64 or Kv == 128)):
-        pytest.skip("only head-dim size 64 or 128 supported by ck-tiled!")
-
-    if Kv > 128:
-        pytest.skip("kv > 128 is not supported by CK-FlashAttention")
+    torch.manual_seed(B * M + N * K + Hq*Hkv + Kv)
 
     scale = 3
     query = torch.randn((B, M, Hq, K), device=device, dtype=dtype).mul_(scale)
@@ -634,6 +163,7 @@ def test_mqa_forward(
             attn_bias_type,
             batch_size=B,
             num_heads=Hq,
+            num_heads_groups=nhead_ratio_qk,
             q_len=M,
             kv_len=N,
             dtype=dtype,

From 351c7665a2353a612862451498364d34671d1a92 Mon Sep 17 00:00:00 2001
From: Qianfeng Zhang <Qianfeng.Zhang@amd.com>
Date: Thu, 1 Feb 2024 18:07:32 +0000
Subject: [PATCH 402/641] Add ck-tiled checking in test_mqa_forward_ck_tiled.py

---
 tests/test_mqa_forward_ck_tiled.py | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/tests/test_mqa_forward_ck_tiled.py b/tests/test_mqa_forward_ck_tiled.py
index 7bdb75ae2..5d11b8e40 100644
--- a/tests/test_mqa_forward_ck_tiled.py
+++ b/tests/test_mqa_forward_ck_tiled.py
@@ -14,6 +14,7 @@
 
 import xformers.ops
 from xformers.ops import fmha
+from xformers.ops.common import get_xformers_operator
 from xformers.ops.fmha.common import AttentionOpBase
 from xformers.attn_bias_utils import create_attn_bias
 
@@ -33,6 +34,10 @@
     fmha.ck.FwOp,
 ]
 
+### ck_check_op is temporarily used to check ck-tiled availability
+ck_check_op = get_xformers_operator("is_ck_tiled_used")
+use_ck_tiled = ck_check_op()
+
 def ref_attention(q, k, v, attn_bias=None, drop_mask=None, p=0.0, scale=None, dtype=None):
     if q.ndim == 4:
         B, M, Hq, K = q.shape
@@ -150,6 +155,9 @@ def test_mqa_forward(
 
     device = torch.device("cuda")
 
+    if not use_ck_tiled:
+        pytest.skip("mqa/gqa is only supported with ck-tiled")
+
     torch.manual_seed(B * M + N * K + Hq*Hkv + Kv)
 
     scale = 3

From b58b4ed8b04fe9440c00ce2cf00ff6d1d7f713f4 Mon Sep 17 00:00:00 2001
From: Max Podkorytov <4273004+tenpercent@users.noreply.github.com>
Date: Fri, 2 Feb 2024 01:45:17 +0000
Subject: [PATCH 403/641] rearrange smem access in softmax reduction

---
 .../hip_fmha/ck_attention_forward_decoder_splitk.h   | 12 +++---------
 1 file changed, 3 insertions(+), 9 deletions(-)

diff --git a/xformers/csrc/attention/hip_fmha/ck_attention_forward_decoder_splitk.h b/xformers/csrc/attention/hip_fmha/ck_attention_forward_decoder_splitk.h
index 316a5d497..d4becb4b5 100644
--- a/xformers/csrc/attention/hip_fmha/ck_attention_forward_decoder_splitk.h
+++ b/xformers/csrc/attention/hip_fmha/ck_attention_forward_decoder_splitk.h
@@ -361,7 +361,9 @@ efficient_attention_forward_decoder_splitk_ck_kernel(const scalar_t* __restrict_
             (split_idx + 1 < split_k) ? n_unrolled_loops * dtt * (split_idx + 1) : t_max;
         for(int32_t t = t_low + thread_linear_idx; t < t_high; t += threads_per_block)
         {
-            softmax_denominator += ck::math::exp(smem[t - t_low] - max_qk_acc);
+            const auto s = ck::math::exp(smem[t - t_low] - max_qk_acc);
+            softmax_denominator += s;
+            smem[t - t_low] = s;
         }
         softmax_denominator =
             wavefrontReduce(softmax_denominator, [](auto a, auto b) { return a + b; });
@@ -385,14 +387,6 @@ efficient_attention_forward_decoder_splitk_ck_kernel(const scalar_t* __restrict_
         {
             split_sumexp[blockIdx.x * split_k + split_idx] = softmax_denominator;
         }
-
-        // now, compute the normalization across all threads.
-        for(int32_t t = t_low + thread_linear_idx; t < t_high; t += threads_per_block)
-        {
-            // softmax scale by sumexp will happen in the reduction kernel
-            smem[t - t_low] = ck::math::exp(smem[t - t_low] - max_qk_acc);
-        }
-        __syncthreads();
     } // softmax reduce end
 
     // Split T across wavefronts in a block

From 21062d171c2ab7db48009e08aae97a70cc33f9c2 Mon Sep 17 00:00:00 2001
From: Qianfeng Zhang <Qianfeng.Zhang@amd.com>
Date: Fri, 2 Feb 2024 15:53:30 +0000
Subject: [PATCH 404/641] Add test_decoder and test_splitk_decoder for ROCM
 into test_mem_eff_attention.py

---
 tests/test_mem_eff_attention.py | 60 +++++++++++++++++++++++++++------
 1 file changed, 49 insertions(+), 11 deletions(-)

diff --git a/tests/test_mem_eff_attention.py b/tests/test_mem_eff_attention.py
index 2b841e641..a5f0b3e74 100644
--- a/tests/test_mem_eff_attention.py
+++ b/tests/test_mem_eff_attention.py
@@ -25,6 +25,7 @@
 
 torch.backends.cuda.matmul.allow_tf32 = False
 cuda_only = pytest.mark.skipif(not torch.cuda.is_available(), reason="requires CUDA")
+rocm_only = pytest.mark.skipif(not torch.cuda.is_available() or not torch.version.hip, reason="requires ROCM")
 compute_capability = (0, 0)
 if torch.cuda.is_available():
     compute_capability = torch.cuda.get_device_capability("cuda")
@@ -1549,7 +1550,7 @@ def _kv_heads_label(kv_heads: Optional[int]) -> str:
 @pytest.mark.parametrize(
     "op",
     [
-        fmha.decoder.FwOp,
+        fmha.decoder.FwOp if torch.version.cuda else fmha.ck_decoder.FwOp,
     ],
 )
 @pytest.mark.parametrize("kv_heads", [None, 1, 2], ids=_kv_heads_label)
@@ -1565,6 +1566,7 @@ def test_decoder(
     dtype: str,
     dequant: bool = False,
     num_queries: int = 1,
+    d: int = 128,
 ) -> None:
     # kv_heads = 1: multiquery
     # kv_heads = None: neither MQA nor GQA
@@ -1573,7 +1575,6 @@ def test_decoder(
         raise pytest.skip("BF16 is only supported on SM80+")
     dtype_ = {"f16": torch.float16, "bf16": torch.bfloat16, "f32": torch.float32}[dtype]
     torch.manual_seed(1)
-    d = 128
     if kv_heads is not None and kv_heads > 1:
         k_shape: Tuple[int, ...] = (1, bsz * padding, kv_heads, n_heads, d)
         q_shape: Tuple[int, ...] = (
@@ -1630,15 +1631,26 @@ def dequant_cache(x):
         k = dequant_cache(k)
         v = dequant_cache(v)
 
-    cutlass_output = fmha.memory_efficient_attention_forward(
-        q, k, v, attn_bias, op=fmha.cutlass.FwOp
-    )
-    assert_allclose(
-        decoder_output,
-        cutlass_output,
-        atol=fmha.cutlass.FwOp.ERROR_ATOL[dtype_] * 4,
-        rtol=fmha.cutlass.FwOp.ERROR_RTOL[dtype_],
-    )
+    if torch.version.cuda:
+        cutlass_output = fmha.memory_efficient_attention_forward(
+             q, k, v, attn_bias, op=fmha.cutlass.FwOp
+        )
+
+        assert_allclose(
+            decoder_output,
+            cutlass_output,
+            atol=fmha.cutlass.FwOp.ERROR_ATOL[dtype_] * 4,
+            rtol=fmha.cutlass.FwOp.ERROR_RTOL[dtype_],
+        )
+    else:
+        ref_output = ref_attention(q, k, v, attn_bias)
+
+        assert_allclose(
+            decoder_output.float(),
+            ref_output,
+            atol=fmha.cutlass.FwOp.ERROR_ATOL[dtype_] * 4,
+            rtol=fmha.cutlass.FwOp.ERROR_RTOL[dtype_],
+        )
 
 
 @sm80_or_better_only
@@ -1686,6 +1698,32 @@ def test_triton_splitk_decoder(
         dequant=dequant,
     )
 
+@rocm_only
+@pytest.mark.parametrize("op", [fmha.ck_splitk.FwOp_S1, fmha.ck_splitk.FwOp_S2, fmha.ck_splitk.FwOp_S4])
+@pytest.mark.parametrize("dtype", ["f32"])
+@pytest.mark.parametrize("kv_heads", [None, 1, 2], ids=_kv_heads_label)
+@pytest.mark.parametrize("n_heads", [16])
+@pytest.mark.parametrize("d", [128, 256])
+@pytest.mark.parametrize("padding, bsz", [(32, 8), (4096, 1), (32, 1), (4096, 8)])
+def test_splitk_decoder(
+    op,
+    kv_heads: Optional[int],
+    n_heads: int,
+    padding: int,
+    bsz: int,
+    dtype: str,
+    d: int
+) -> None:
+    # no quantized impl compared to cuda
+    test_decoder(
+        op,
+        kv_heads=kv_heads,
+        n_heads=n_heads,
+        padding=padding,
+        bsz=bsz,
+        dtype=dtype,
+        d=d,
+    )
 
 def test_attn_bias_from_seqlens() -> None:
     bias = fmha.attn_bias.BlockDiagonalMask.from_seqlens([3, 5, 1])

From df7d52339699e64e51a1fbd0f20b73b5a1447c5a Mon Sep 17 00:00:00 2001
From: Qianfeng Zhang <Qianfeng.Zhang@amd.com>
Date: Fri, 2 Feb 2024 16:14:16 +0000
Subject: [PATCH 405/641] Add ref_attention_splitk and its test to
 tests/test_mem_eff_attention.py

---
 tests/test_mem_eff_attention.py | 174 ++++++++++++++++++++++++++++++++
 1 file changed, 174 insertions(+)

diff --git a/tests/test_mem_eff_attention.py b/tests/test_mem_eff_attention.py
index a5f0b3e74..9230ee5d1 100644
--- a/tests/test_mem_eff_attention.py
+++ b/tests/test_mem_eff_attention.py
@@ -310,6 +310,127 @@ def T(t):
     out = out.reshape([q.shape[0], q.shape[2], q.shape[1], v.shape[3]])
     return out.permute((0, 2, 1, 3))
 
+def ref_attention_splitk_bmhk(q, k, v, attn_bias, scale=None, split_k=None, dtype=None) -> torch.Tensor:
+    assert q.ndim == 4
+
+    def T(t):
+        return t.permute((0, 2, 1, 3)).reshape(
+            [t.shape[0] * t.shape[2], t.shape[1], t.shape[3]]
+        )
+
+    if isinstance(attn_bias, xformers.ops.AttentionBias):
+        attn_bias = attn_bias.materialize(
+            (q.shape[0], q.shape[2], q.shape[1], k.shape[1]),
+            device=q.device,
+            dtype=torch.float32,
+        ).reshape([q.shape[0] * q.shape[2], q.shape[1], k.shape[1]])
+    out = ref_attention_splitk(T(q), T(k), T(v), attn_bias, scale=scale, split_k=split_k, dtype=dtype)
+    out = out.reshape([q.shape[0], q.shape[2], q.shape[1], v.shape[3]])
+    return out.permute((0, 2, 1, 3))
+
+def ref_attention_splitk(q, k, v, attn_bias, scale=None, split_k=2, dtype=None) -> torch.Tensor:
+    if q.ndim == 5:
+        def attn_bias_group(group: int):
+            if isinstance(attn_bias, torch.Tensor):
+                return attn_bias[:, group]
+            if isinstance(attn_bias, fmha.attn_bias.LowerTriangularMaskWithTensorBias):
+                return fmha.attn_bias.LowerTriangularMaskWithTensorBias(
+                    attn_bias._bias[:, group]
+                )
+            return attn_bias
+
+        return torch.stack(
+            [
+                ref_attention_splitk_bmhk(
+                    q[:, :, g], k[:, :, g], v[:, :, g], attn_bias=attn_bias_group(g), split_k=split_k, dtype=dtype
+                )
+                for g in range(q.shape[2])
+            ],
+            dim=2,
+        )
+
+    if q.ndim == 4:
+        return ref_attention_splitk_bmhk(q, k, v, attn_bias=attn_bias, split_k=split_k, dtype=dtype)
+    assert q.ndim == 3
+    if dtype is None:
+        dtype = torch.float32
+    q = q.to(dtype=dtype)
+    k = k.to(dtype=dtype)
+    v = v.to(dtype=dtype)
+
+    if scale is None:
+        scale = q.shape[-1] ** -.5
+    assert not q.isnan().any()
+    q = q * scale
+    assert not q.isnan().any()
+
+    if attn_bias is not None:
+        if isinstance(attn_bias, xformers.ops.AttentionBias):
+            # Always create in B,H,Mq,Mk format
+            attn_bias_tensor = attn_bias.materialize(
+                (q.shape[0], 1, q.shape[1], k.shape[1]),
+                device=q.device,
+                dtype=torch.float32,
+            )
+        else:
+            attn_bias_tensor = attn_bias
+        if attn_bias_tensor.ndim == 4:
+            assert q.shape[0] == attn_bias_tensor.shape[0] * attn_bias_tensor.shape[1]
+            attn_bias_tensor = attn_bias_tensor.reshape(
+                [-1, *attn_bias_tensor.shape[2:]]
+            )
+
+    split_size = k.size(-2) // split_k
+    split_config = { "dim": -2, "split_size_or_sections": split_size}
+    k_split = torch.split(k, **split_config)
+    v_split = torch.split(v, **split_config)
+    attn_bias_split = torch.split(attn_bias_tensor, dim=-1, split_size_or_sections=split_size)
+
+    def compute_attention_split(q_whole, k_slice, v_slice, attn_bias_slice):
+        p_slice = q_whole @ k_slice.transpose(-2, -1)
+        p_slice += attn_bias_slice
+        m = torch.max(p_slice, dim = -1, keepdim=True).values
+        p_slice_scaled = p_slice - m
+        p_slice_scaled[p_slice_scaled.isnan()] = float("-inf")
+        s = torch.exp(p_slice_scaled)
+        l = torch.sum(s, dim=-1, keepdim=True)
+        attn_slice = s @ v_slice
+        return {
+            "attn_slice": attn_slice,
+            "row_max": m,
+            "row_lse": l,
+        }
+
+    splits = list(zip(k_split, v_split, attn_bias_split))
+
+    slices = list(map(lambda s: compute_attention_split(q, s[0], s[1], s[2]),
+        splits))
+    out = torch.zeros_like(q)
+
+    # reduce out over split-k slices
+
+    global_max = torch.zeros_like(slices[0]["row_max"]).fill_(float("-inf"))
+    global_sumexp = torch.zeros_like(slices[0]["row_lse"])
+
+    for s in slices:
+        local_out = s["attn_slice"]
+        local_max = s["row_max"]
+        local_sumexp = s["row_lse"]
+
+        log_alpha = -torch.abs(local_max - global_max)
+        alpha = torch.exp(log_alpha)
+        alpha.nan_to_num_(1.)
+
+        pick_new = local_max < global_max
+        new_coef = torch.where(pick_new, alpha, 1.)
+        curr_coef = torch.where(pick_new, 1., alpha)
+
+        out = out * curr_coef + local_out * new_coef
+        global_sumexp = global_sumexp * curr_coef + local_sumexp * new_coef
+        global_max = torch.max(local_max, global_max)
+    out /= global_sumexp
+    return out
+
 
 def _rand_partition(r: random.Random, total: int, n: int) -> List[int]:
     # returns list of n nonnegative integers summing to total
@@ -1546,6 +1667,59 @@ def _kv_heads_label(kv_heads: Optional[int]) -> str:
     return f"gqa{kv_heads}"
 
 
+@pytest.mark.parametrize("dtype", ["f32"])
+@pytest.mark.parametrize("kv_heads", [None, 1, 2], ids=_kv_heads_label)
+@pytest.mark.parametrize("n_heads", [16])
+@pytest.mark.parametrize("padding, bsz", [(32, 8), (4096, 1)])
+@pytest.mark.parametrize("split_k", [1, 2, 4])
+def test_splitk_reference(
+    kv_heads: int, n_heads: int, padding: int, bsz: int, dtype: str, split_k: int
+):
+    dtype_ = {"f16": torch.float16, "bf16": torch.bfloat16, "f32": torch.float32}[dtype]
+    torch.manual_seed(1)
+    d = 256
+    num_queries = 1
+    if kv_heads is not None and kv_heads > 1:
+        k_shape: Tuple[int, ...] = (1, bsz * padding, kv_heads, n_heads, d)
+        q_shape: Tuple[int, ...] = (
+            1,
+            bsz * num_queries,
+            kv_heads,
+            n_heads,
+            d,
+        )
+    else:
+        k_shape = (1, bsz * padding, n_heads, d)
+        q_shape = (1, bsz * num_queries, n_heads, d)
+
+    k = torch.rand(k_shape, dtype=dtype_).cuda()
+    k_seqlen = torch.randint(1, padding + 1, (bsz,)).tolist()
+    v = torch.rand_like(k)
+    q = torch.rand(q_shape, dtype=dtype_).cuda()
+    causal_diagonal = torch.tensor(  # TODO: make unnecessary
+        [i - 1 for i in k_seqlen], dtype=torch.int32
+    ).cuda()
+
+    if kv_heads is not None:
+        k = k[..., :1, :].expand(k_shape)
+        v = v[..., :1, :].expand(k_shape)
+
+    attn_bias = fmha.attn_bias.BlockDiagonalCausalWithOffsetPaddedKeysMask.from_seqlens(
+        q_seqlen=[1] * bsz,
+        kv_seqlen=k_seqlen,
+        causal_diagonal=causal_diagonal,
+        kv_padding=padding,
+    )
+    ref_out = ref_attention(q, k, v, attn_bias)
+    splitk_out = ref_attention_splitk(q, k, v, attn_bias, None, split_k=split_k)
+    assert_allclose(
+        ref_out,
+        splitk_out,
+        atol=fmha.ck.FwOp.ERROR_ATOL[dtype_],
+        rtol=fmha.ck.FwOp.ERROR_RTOL[dtype_],
+    )
+
+
 @sm70_or_better_only
 @pytest.mark.parametrize(
     "op",

From ee633c8bd07fc378eef3e192de673e2bb4236c75 Mon Sep 17 00:00:00 2001
From: Qianfeng Zhang <Qianfeng.Zhang@amd.com>
Date: Fri, 2 Feb 2024 16:19:11 +0000
Subject: [PATCH 406/641] Rename test_mem_eff_attention_ck.py as discarded

---
 ...eff_attention_ck.py => test_mem_eff_attention_ck_discarded.py} | 0
 1 file changed, 0 insertions(+), 0 deletions(-)
 rename tests/{test_mem_eff_attention_ck.py => test_mem_eff_attention_ck_discarded.py} (100%)

diff --git a/tests/test_mem_eff_attention_ck.py b/tests/test_mem_eff_attention_ck_discarded.py
similarity index 100%
rename from tests/test_mem_eff_attention_ck.py
rename to tests/test_mem_eff_attention_ck_discarded.py

From 2df5ed3949808957bf6417d43c70186a69fd648c Mon Sep 17 00:00:00 2001
From: Qianfeng Zhang <Qianfeng.Zhang@amd.com>
Date: Fri, 2 Feb 2024 20:34:23 +0000
Subject: [PATCH 407/641] Add test_mqa_forward and ref_attention_mqa (for BMHK
 format mqa/gqa verification) into test_mem_eff_attention.py

---
 tests/test_mem_eff_attention.py | 126 ++++++++++++++++++++++++++++++++
 1 file changed, 126 insertions(+)

diff --git a/tests/test_mem_eff_attention.py b/tests/test_mem_eff_attention.py
index 9230ee5d1..355571ad5 100644
--- a/tests/test_mem_eff_attention.py
+++ b/tests/test_mem_eff_attention.py
@@ -17,6 +17,7 @@
 import xformers.ops
 from xformers.attn_bias_utils import create_attn_bias
 from xformers.ops import fmha
+from xformers.ops.common import get_xformers_operator
 from xformers.ops.fmha import ALL_BW_OPS, ALL_FW_OPS
 from xformers.ops.fmha.common import AttentionOpBase
 from xformers.ops.fmha.dispatch import _dispatch_fw_priority_list
@@ -431,6 +432,42 @@ def compute_attention_split(q_whole, k_slice, v_slice, attn_bias_slice):
     out /= global_sumexp
     return out
 
+## this interface assumes the tensor is in BMHK, but q and k/v might has different number of heads
+def ref_attention_mqa(q, k, v, attn_bias=None, drop_mask=None, p=0.0, scale=None):
+    assert q.ndim == 4
+
+    B, M, Hq, K = q.shape
+    _, N, Hkv, Kv = v.shape
+    nhead_ratio_qk = Hq // Hkv
+
+    def attn_bias_head(head: int):
+        if isinstance(attn_bias, torch.Tensor):
+            assert attn_bias.ndim == 4
+            _, H, _, _ = attn_bias.shape
+            assert H == Hq
+            bias_bghmn = attn_bias.reshape(B, Hkv, nhead_ratio_qk, M, N)
+            return bias_bghmn[:, :, head]
+        if isinstance(attn_bias, fmha.attn_bias.LowerTriangularMaskWithTensorBias):
+            assert attn_bias._bias.ndim == 4
+            _, H, _, _ = attn_bias._bias.shape
+            assert H == Hq
+            bias_bghmn = attn_bias._bias.reshape(B, Hkv, nhead_ratio_qk, M, N)
+            return fmha.attn_bias.LowerTriangularMaskWithTensorBias(
+                bias_bghmn[:, :, head]
+            )
+        return attn_bias
+
+    q_bmghk = q.reshape((B, M, Hkv, nhead_ratio_qk, K))
+
+    return torch.stack(
+           [
+              ref_attention_bmhk(
+                   q_bmghk[:, :, :, h], k, v, attn_bias=attn_bias_head(h),
+              )
+              for h in range(q_bmghk.shape[3])
+           ],
+           dim=3,
+       ).reshape((B, M, Hq, Kv))
 
 def _rand_partition(r: random.Random, total: int, n: int) -> List[int]:
     # returns list of n nonnegative integers summing to total
@@ -643,6 +680,95 @@ def test_forward(opFW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv, packed, fmt, **kwargs)
         rtol=op.ERROR_RTOL.get(dtype, 1e-5),
     )
 
+@rocm_only
+@pytest.mark.parametrize("hdim_k,hdim_v", [(64, 64), (128, 128)])
+@pytest.mark.parametrize("nhead_q,nhead_kv", [(8, 1), (8, 2), (12, 4), (4, 4)])
+@pytest.mark.parametrize("seqlen_q,seqlen_kv", [(100, 128), (128, 100), (200, 1000)])
+@pytest.mark.parametrize("batches", [100, 64, 1])
+@pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16])
+@pytest.mark.parametrize("attn_bias_type", [type(None), torch.Tensor, fmha.attn_bias.LowerTriangularMask])
+@pytest.mark.parametrize("op", [fmha.ck.FwOp])
+def test_mqa_forward(
+    op,
+    attn_bias_type,
+    dtype,
+    batches: int,
+    seqlen_kv: int,
+    seqlen_q: int,
+    nhead_kv: int,
+    nhead_q: int,
+    hdim_v: int,
+    hdim_k: int,
+):
+    B = batches
+    M = seqlen_q
+    N = seqlen_kv
+    Hq = nhead_q
+    Hkv = nhead_kv
+    K = hdim_k
+    Kv = hdim_v
+    nhead_ratio_qk = Hq // Hkv
+
+    device = torch.device("cuda")
+
+    ### ck_check_op is temporarily used to check ck-tiled availability
+    ck_check_op = get_xformers_operator("is_ck_tiled_used")
+    use_ck_tiled = ck_check_op()
+
+    if not use_ck_tiled:
+        pytest.skip("mqa/gqa is only supported with ck-tiled")
+
+    torch.manual_seed(B * M + N * K + Hq*Hkv + Kv)
+
+    scale = 3
+    query = torch.randn((B, M, Hq, K), device=device, dtype=dtype).mul_(scale)
+    key = torch.randn((B, N, Hkv, K), device=device, dtype=dtype).mul_(scale)
+    value = torch.randn((B, N, Hkv, Kv), device=device, dtype=dtype).mul_(scale)
+
+    attn_bias = None
+    if attn_bias_type is not None:
+        attn_bias = create_attn_bias(
+            attn_bias_type,
+            batch_size=B,
+            num_heads=Hq,
+            num_heads_groups=nhead_ratio_qk,
+            q_len=M,
+            kv_len=N,
+            dtype=dtype,
+            device=device,
+            requires_grad=False,
+            fmt="BMHK",
+            op=op,
+        )
+
+    inputs = fmha.Inputs(query=query, key=key, value=value, attn_bias=attn_bias)
+    reasons = op.not_supported_reasons(inputs)
+    if reasons:
+        err_msg = f"{op.NAME}: unsupported ({'/'.join(reasons)})"
+        # Ensure we free memory to avoid OOMs
+        del query, key, value, attn_bias, inputs
+
+    out = xformers.ops.memory_efficient_attention_forward(
+        query, key, value, attn_bias, op=op
+    )
+    assert not out.isnan().any(), ("Output has NaNs", attn_bias)
+    out2 = xformers.ops.memory_efficient_attention_forward(
+        query, key, value, attn_bias, op=op
+    )
+    assert torch.allclose(out, out2, atol=0.0, rtol=0.0), (
+        "Non-deterministic behavior",
+        attn_bias,
+    )
+
+    ref = ref_attention_mqa(query, key, value, attn_bias)
+    assert out.shape == ref.shape, out.shape
+    assert_allclose(
+        out.float(),
+        ref,
+        atol=op.ERROR_ATOL[dtype],
+        rtol=op.ERROR_RTOL.get(dtype, 1e-5),
+    )
+
 
 @cuda_only
 @pytest.mark.parametrize("k_len", [5, 6, 32])

From 7d1219b10b99508baeebe880f4eda38cb116f0af Mon Sep 17 00:00:00 2001
From: Qianfeng Zhang <Qianfeng.Zhang@amd.com>
Date: Fri, 2 Feb 2024 20:40:11 +0000
Subject: [PATCH 408/641] Rename test_mqa_forward_ck_tiled.py as discarded

---
 ...forward_ck_tiled.py => test_mqa_forward_ck_tiled_discarded.py} | 0
 1 file changed, 0 insertions(+), 0 deletions(-)
 rename tests/{test_mqa_forward_ck_tiled.py => test_mqa_forward_ck_tiled_discarded.py} (100%)

diff --git a/tests/test_mqa_forward_ck_tiled.py b/tests/test_mqa_forward_ck_tiled_discarded.py
similarity index 100%
rename from tests/test_mqa_forward_ck_tiled.py
rename to tests/test_mqa_forward_ck_tiled_discarded.py

From fe6f96e2a21cc1cd2f141d349fe608a2e5bfdfa1 Mon Sep 17 00:00:00 2001
From: Qianfeng Zhang <Qianfeng.Zhang@amd.com>
Date: Fri, 2 Feb 2024 20:49:18 +0000
Subject: [PATCH 409/641] Remove CK specific script
 benchmark_mem_eff_attn_decoder_ck.py

---
 .../benchmark_mem_eff_attn_decoder_ck.py      | 208 ------------------
 1 file changed, 208 deletions(-)
 delete mode 100644 xformers/benchmarks/benchmark_mem_eff_attn_decoder_ck.py

diff --git a/xformers/benchmarks/benchmark_mem_eff_attn_decoder_ck.py b/xformers/benchmarks/benchmark_mem_eff_attn_decoder_ck.py
deleted file mode 100644
index 86d4813cf..000000000
--- a/xformers/benchmarks/benchmark_mem_eff_attn_decoder_ck.py
+++ /dev/null
@@ -1,208 +0,0 @@
-# Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
-#
-# This source code is licensed under the BSD license found in the
-# LICENSE file in the root directory of this source tree.
-
-
-import itertools
-from functools import partial
-
-import torch
-from torch.utils import benchmark
-from utils import benchmark_main_helper
-
-import xformers.ops
-import xformers.ops.fmha as fmha
-
-torch.backends.cuda.matmul.allow_tf32 = False
-
-# Run with
-#  python xformers/benchmarks/benchmark_mem_eff_attn_decoder.py --omit-baselines --quiet
-# The baselines for these benchmarks are really slow because there is
-# so much padding in the inputs, so there is no point running them.
-
-
-def ref_attention_bmk(q, k, v, attn_bias=None):
-    if isinstance(attn_bias, xformers.ops.AttentionMask):
-        attn_bias = (
-            attn_bias.materialize((q.shape[0], 1, q.shape[1], k.shape[1]))
-            .to(q)
-            .squeeze()
-        )
-    q = q * (1.0 / q.shape[-1] ** 0.5)
-    if attn_bias is None:
-        attn = q @ k.transpose(-2, -1)
-    else:
-        # equivalent to (q @ k.transpose(-2, -1) + m).softmax(-1) @ v
-        # but faster, and is what is used in PyTorch now
-        attn = torch.baddbmm(attn_bias, q, k.transpose(-2, -1))
-    attn = attn.softmax(-1)
-    return attn @ v
-
-
-def ref_attention(q, k, v, attn_bias):
-    assert q.ndim == 4
-
-    def T(t):
-        return t.permute((0, 2, 1, 3)).reshape(
-            [t.shape[0] * t.shape[2], t.shape[1], t.shape[3]]
-        )
-
-    out = ref_attention_bmk(T(q), T(k), T(v), attn_bias)
-    out = out.reshape([q.shape[0], q.shape[2], q.shape[1], v.shape[3]])
-    return out.permute((0, 2, 1, 3))
-
-
-min_run_time = 0.5
-device = torch.device("cuda")
-
-NUM_THREADS = [1] if device.type == "cuda" else [1, 40]
-
-OPS = [
-    xformers.ops.fmha.ck.FwOp,
-    xformers.ops.fmha.ck_decoder.FwOp
-]
-
-KV_SHAPES = [
-    # list of n_keys, padding_length, batchsize
-    (2, 64, 3),
-    (32, 1024, 500),  
-    (1000, 1024, 2),
-    (8000, 8192, 1),
-    (240, 256, 32),
-    (2048, 2 * 1024, 4),
-    (4096 * 2, 8 * 1024, 1),
-]
-
-N_HEADS = [8, 16, 64]
-
-
-def product_dict(**kwargs):
-    keys = kwargs.keys()
-    vals = kwargs.values()
-    for instance in itertools.product(*vals):
-        yield dict(zip(keys, instance))
-
-
-CASES = list(
-    product_dict(
-        kv_shape=KV_SHAPES,
-        n_heads=N_HEADS,
-        num_threads=NUM_THREADS,
-        multiquery=[True, False],
-    )
-)
-
-def get_memory_traffic(op, q, k, v, bias):
-    # mem_size = ( batch_size * seq_len * 1 * dim_per_head * 2 (K/V) + 
-    #              batch_size * 1 * num_heads * dim_per_head (Q) + 
-    #              batch_size * seq_len * num_heads * dim_per_head (attn_output) ) * bytes_per_element
-    out = xformers.ops.memory_efficient_attention_forward(q, k, v, bias, op=op)
-    dtype = q.dtype
-    multiquery = k.stride(2) == 0
-    n_heads = q.shape[-2]
-    dim_per_head = q.shape[-1]
-    kv_seqlen = bias.k_seqinfo.seqlen_py
-    bytes_per_element = 4 if dtype is torch.float32 else 2 if dtype in (torch.float16, torch.bfloat16) else None
-    mem_size = 0
-    mem_size += q.numel() * bytes_per_element # Q
-    for s in kv_seqlen: # len(kv_seqlen) == batch_size
-        mem_size += s * (1 if multiquery else n_heads) * dim_per_head * bytes_per_element * 2 # K, V
-    mem_size += out.numel() * bytes_per_element # attn_output
-    return mem_size
-
-def mem_eff_attention_decoder(
-    kv_shape, n_heads: int, num_threads: int, multiquery: bool
-):
-    n_keys, padding, B = kv_shape
-    torch.manual_seed(42)
-    k_seqlen = torch.randint(1, n_keys + 1, (B,)).tolist()
-    K = 128
-    dtype = torch.bfloat16
-    q = torch.rand(1, B, n_heads, K, device=device, dtype=dtype)
-    if multiquery:
-        k = torch.rand(
-            1, B * padding, 1, K, device=device, dtype=dtype
-        ).expand(1, B * padding, n_heads, K)
-        v = torch.rand(
-            1, B * padding, 1, K, device=device, dtype=dtype
-        ).expand(1, B * padding, n_heads, K)
-    else:
-        k = torch.rand(1, B * padding, n_heads, K, device=device, dtype=dtype)
-        v = torch.rand(1, B * padding, n_heads, K, device=device, dtype=dtype)
-
-    bias = fmha.attn_bias.BlockDiagonalCausalWithOffsetPaddedKeysMask.from_seqlens(
-        q_seqlen=[1] * B,
-        kv_seqlen=k_seqlen,
-        kv_padding=padding,
-    )
-
-    sub_label = f"{B}batch-{k_seqlen[0]}keys-{n_heads}heads"
-    if multiquery:
-        sub_label += "-mq"
-
-    has_run = False
-
-    for fw_op in OPS:
-        inp = fmha.Inputs(q, k, v, attn_bias=bias)
-        if (skip_reasons := fw_op.not_supported_reasons(inp)):
-            print(f"Skip benchmark: {skip_reasons=}")
-            continue
-
-        fn = partial(xformers.ops.memory_efficient_attention_forward, op=fw_op)
-
-        mem_size = get_memory_traffic(fw_op, q, k, v, bias)
-
-        yield benchmark.Timer(
-            stmt=f"fn(q, k, v, attn_bias)",
-            globals={
-                "q": q,
-                "k": k,
-                "v": v,
-                "attn_bias": bias,
-                "fn": fn,
-            },
-            label="attention",
-            description=fw_op.NAME,
-            sub_label=f"{sub_label}_{mem_size//1024}k",
-            num_threads=num_threads,
-        )
-
-        graph = torch.cuda.CUDAGraph()
-        with torch.cuda.graph(graph):
-            fn(q, k, v, bias)
-        yield benchmark.Timer(
-            stmt="graph.replay()",
-            globals={
-                "graph": graph,
-            },
-            label="cuda graphed attention",
-            description=fw_op.NAME,
-            sub_label=f"{sub_label}_{mem_size//1024}k",
-            num_threads=num_threads,
-        )
-
-        has_run = True
-
-    if not has_run:
-        return
-
-    RUN_BASELINES = False
-    if RUN_BASELINES:
-        yield benchmark.Timer(
-            stmt="fn(q, k, v, attn_bias)",
-            globals={
-                "q": q,
-                "k": k,
-                "v": v,
-                "attn_bias": bias,
-                "fn": ref_attention,
-            },
-            label="attention",
-            description="eager",
-            sub_label=sub_label,
-            num_threads=num_threads,
-        )
-
-
-benchmark_main_helper(mem_eff_attention_decoder, CASES, min_run_time=min_run_time)

From 5af967c74ae1ff40e5d3aecceab422ef3d4fcfe8 Mon Sep 17 00:00:00 2001
From: Qianfeng Zhang <Qianfeng.Zhang@amd.com>
Date: Fri, 2 Feb 2024 21:34:59 +0000
Subject: [PATCH 410/641] Refine benchmark_mem_eff_attn_mqa_gqa_ck_tiled.py

---
 tests/test_mem_eff_attention.py               |  2 +-
 ...benchmark_mem_eff_attn_mqa_gqa_ck_tiled.py | 62 ++++++-------------
 2 files changed, 21 insertions(+), 43 deletions(-)

diff --git a/tests/test_mem_eff_attention.py b/tests/test_mem_eff_attention.py
index 355571ad5..aee582c38 100644
--- a/tests/test_mem_eff_attention.py
+++ b/tests/test_mem_eff_attention.py
@@ -432,7 +432,7 @@ def compute_attention_split(q_whole, k_slice, v_slice, attn_bias_slice):
     out /= global_sumexp
     return out
 
-## this interface assumes the tensor is in BMHK, but q and k/v might has different number of heads
+## this interface assumes the tensor is in BMHK, but q and k/v might have different number of heads
 def ref_attention_mqa(q, k, v, attn_bias=None, drop_mask=None, p=0.0, scale=None):
     assert q.ndim == 4
 
diff --git a/xformers/benchmarks/benchmark_mem_eff_attn_mqa_gqa_ck_tiled.py b/xformers/benchmarks/benchmark_mem_eff_attn_mqa_gqa_ck_tiled.py
index 69b092788..12b8f7b91 100644
--- a/xformers/benchmarks/benchmark_mem_eff_attn_mqa_gqa_ck_tiled.py
+++ b/xformers/benchmarks/benchmark_mem_eff_attn_mqa_gqa_ck_tiled.py
@@ -15,31 +15,12 @@
 import xformers.ops
 import xformers.ops.fmha as fmha
 
-torch.backends.cuda.matmul.allow_tf32 = False
+from xformers.attn_bias_utils import create_attn_bias
 
+torch.backends.cuda.matmul.allow_tf32 = False
 
-def create_attn_bias(
-    bias_type,
-    batch_size: int,
-    num_heads: int,
-    q_len: int,
-    kv_len: int,
-    device,
-    dtype,
-    bias_requires_grad: bool = False,
-):
-    NoneType = type(None)
-    if bias_type is NoneType:
-        return None
-    if bias_type is torch.Tensor:
-        attn_bias = torch.randn((1, 1, q_len, kv_len), device=device, dtype=dtype)
-        return attn_bias.expand(batch_size, num_heads, q_len, kv_len)
-    if bias_type is fmha.attn_bias.LowerTriangularMask:
-        return bias_type()
-    assert False, f"Unsupported bias type: {bias_type}"
-
-## ref_attention is completely the same as used by test_forward_ck_tiled.py
-def ref_attention(q, k, v, attn_bias=None, drop_mask=None, p=0.0, scale=None, dtype=None):
+## this interface assumes the tensor is in BMHK, but q and k/v might has different number of heads
+def ref_attention_mqa(q, k, v, attn_bias=None, drop_mask=None, p=0.0, scale=None, dtype=None):
     if q.ndim == 4:
         B, M, Hq, K = q.shape
         _, N, Hkv, Kv = v.shape
@@ -122,7 +103,7 @@ def T(t):
             device=q.device,
             dtype=torch.float32,
         ).reshape([q.shape[0] * q.shape[2], q.shape[1], k.shape[1]])
-    out = ref_attention(T(q), T(k), T(v), attn_bias, scale=scale, dtype=dtype)
+    out = ref_attention_mqa(T(q), T(k), T(v), attn_bias, scale=scale, dtype=dtype)
     out = out.reshape([q.shape[0], q.shape[2], q.shape[1], v.shape[3]])
     return out.permute((0, 2, 1, 3))
 
@@ -147,11 +128,11 @@ def T(t):
 ]
 
 OPS = [
-    (xformers.ops.fmha.ck.FwOp, xformers.ops.fmha.ck.BwOp),
-    #(xformers.ops.fmha.flash.FwOp, xformers.ops.fmha.flash.BwOp),
+    xformers.ops.fmha.ck.FwOp,
+    xformers.ops.fmha.flash.FwOp,
     # TODO: Triton is not stable: it can trigger Illegal Memory Accesses
     # and its performance varies a lot between runs.
-    # (xformers.ops.fmha.triton.FwOp, xformers.ops.fmha.triton.BwOp),
+    ##xformers.ops.fmha.triton.FwOp,
 ]
 
 
@@ -167,7 +148,7 @@ def product_dict(**kwargs):
         shape=SHAPES,
         num_threads=NUM_THREADS,
         dropout_p=[0.0],
-        attn_bias_cfg=[(type(None), False)],
+        attn_bias_type=[type(None)],
         dtype=[torch.half, torch.bfloat16],
     )
 )
@@ -178,12 +159,8 @@ def product_dict(**kwargs):
     c.update(
         random.Random(str(c["shape"])).choice(
             [
-                ##{"dropout_p": 0.3},
-                {"attn_bias_cfg": (torch.Tensor, False)},
-                ##{"attn_bias_cfg": (torch.Tensor, True)},
-                {"attn_bias_cfg": (xformers.ops.LowerTriangularMask, False)},
-                ##{"dtype": torch.bfloat16},
-                ##{"dtype": torch.float},
+                {"attn_bias_type": torch.Tensor},
+                {"attn_bias_type": xformers.ops.LowerTriangularMask},
             ]
         )
     )
@@ -197,21 +174,22 @@ def create_tensors(shape, dtype, requires_grad=False):
     v = torch.rand([B, N, Hkv, K], device=device, dtype=dtype, requires_grad=requires_grad)
     return q, k, v
 
-def mem_eff_attention_fw(shape, num_threads: int, attn_bias_cfg, dropout_p, dtype):
+def mem_eff_attention_fw(shape, num_threads: int, attn_bias_type, dropout_p, dtype):
     B, M, N, Hq, Hkv, K = shape
+    nhead_ratio_qk = Hq // Hkv
     q, k, v = create_tensors(shape, dtype)
-    attn_bias_type, attn_bias_requires_grad = attn_bias_cfg
-    if attn_bias_requires_grad:
-        return
     bias = create_attn_bias(
         attn_bias_type,
         batch_size=B,
         num_heads=Hq,
+        num_heads_groups=nhead_ratio_qk,
         q_len=M,
         kv_len=N,
         device=device,
         dtype=dtype,
-        bias_requires_grad=attn_bias_requires_grad,
+        requires_grad=False,
+        fmt="BMHK",
+        op=fmha.ck.FwOp,  ## only required as a refer op by create_attn_bias
     )
     inp = fmha.Inputs(query=q, key=k, value=v, attn_bias=bias, p=dropout_p)
 
@@ -226,7 +204,7 @@ def mem_eff_attention_fw(shape, num_threads: int, attn_bias_cfg, dropout_p, dtyp
     )
 
     has_run = False
-    for fw_op, bw_op in OPS:
+    for fw_op in OPS:
         if not fw_op.supports(inp):
             continue
 
@@ -239,7 +217,7 @@ def mem_eff_attention_fw(shape, num_threads: int, attn_bias_cfg, dropout_p, dtyp
                 "attn_bias": inp.attn_bias,
                 "p": dropout_p,
                 "fn": partial(
-                    xformers.ops.memory_efficient_attention, op=(fw_op, bw_op)
+                    xformers.ops.memory_efficient_attention_forward, op=fw_op
                 ),
             },
             label=f"attention (attn_bias={attn_bias_type})",
@@ -260,7 +238,7 @@ def mem_eff_attention_fw(shape, num_threads: int, attn_bias_cfg, dropout_p, dtyp
             "v": v,
             "attn_bias": inp.attn_bias,
             "p": dropout_p,
-            "fn": ref_attention,
+            "fn": ref_attention_mqa,
         },
         label=f"attention (attn_bias={attn_bias_type})",
         description="eager",

From 3f46c2f4ab1332fccfc1ef5a559b4a5746be3209 Mon Sep 17 00:00:00 2001
From: Qianfeng Zhang <Qianfeng.Zhang@amd.com>
Date: Fri, 2 Feb 2024 21:38:18 +0000
Subject: [PATCH 411/641] Rename benchmark_mem_eff_attn_mqa_gqa_ck_tiled.py to
 benchmark_mem_eff_attention_mqa.py

---
 ...tn_mqa_gqa_ck_tiled.py => benchmark_mem_eff_atttention_mqa.py} | 0
 1 file changed, 0 insertions(+), 0 deletions(-)
 rename xformers/benchmarks/{benchmark_mem_eff_attn_mqa_gqa_ck_tiled.py => benchmark_mem_eff_atttention_mqa.py} (100%)

diff --git a/xformers/benchmarks/benchmark_mem_eff_attn_mqa_gqa_ck_tiled.py b/xformers/benchmarks/benchmark_mem_eff_atttention_mqa.py
similarity index 100%
rename from xformers/benchmarks/benchmark_mem_eff_attn_mqa_gqa_ck_tiled.py
rename to xformers/benchmarks/benchmark_mem_eff_atttention_mqa.py

From 2c27aacbf10d8dad789669dcf466de28a3fd334c Mon Sep 17 00:00:00 2001
From: Qianfeng Zhang <Qianfeng.Zhang@amd.com>
Date: Fri, 2 Feb 2024 22:27:05 +0000
Subject: [PATCH 412/641] Remove the runtime_error with using logsumexp in
 attention_forward_generic_ck_tiled.cpp

---
 .../attention/hip_fmha/attention_forward_generic_ck_tiled.cpp    | 1 -
 1 file changed, 1 deletion(-)

diff --git a/xformers/csrc/attention/hip_fmha/attention_forward_generic_ck_tiled.cpp b/xformers/csrc/attention/hip_fmha/attention_forward_generic_ck_tiled.cpp
index b27626706..0c81dbfa9 100644
--- a/xformers/csrc/attention/hip_fmha/attention_forward_generic_ck_tiled.cpp
+++ b/xformers/csrc/attention/hip_fmha/attention_forward_generic_ck_tiled.cpp
@@ -217,7 +217,6 @@ std::tuple<at::Tensor, at::Tensor, int64_t, int64_t> efficient_attention_forward
         {
             logsumexp       = at::empty({B, Hq, M}, opts.dtype(at::kFloat));
             p.logsumexp_ptr = logsumexp.data_ptr();
-            throw std::runtime_error("compute logsumexp is currently not implemented by ck-tiled!");
         }
         else
             p.logsumexp_ptr = nullptr;

From 4b8ce7cc0c3e694ba89f0dfe32d320cdef86a4a2 Mon Sep 17 00:00:00 2001
From: Qianfeng Zhang <Qianfeng.Zhang@amd.com>
Date: Fri, 2 Feb 2024 22:47:01 +0000
Subject: [PATCH 413/641] Add ck-tiled checking in ck.py

---
 xformers/ops/fmha/ck.py | 19 ++++++++++++-------
 1 file changed, 12 insertions(+), 7 deletions(-)

diff --git a/xformers/ops/fmha/ck.py b/xformers/ops/fmha/ck.py
index 0ecc7f317..fa9ee1f74 100644
--- a/xformers/ops/fmha/ck.py
+++ b/xformers/ops/fmha/ck.py
@@ -144,22 +144,25 @@ def _custom_mask_type(bias: Optional[Union[torch.Tensor, AttentionBias]]) -> int
         return int(_CustomMaskType.CausalFromBottomRight)
     return int(_CustomMaskType.NoCustomMask)
 
+# checking the availability of ck-tiled is necessary since ck-tiled does not
+# have the same functionalities as old-CK
+def is_using_ck_tiled() -> bool:
+    ### ck_check_op is temporarily used to check ck-tiled availability
+    ck_check_op = get_xformers_operator("is_ck_tiled_used")
+    use_ck_tiled = ck_check_op()
+    return use_ck_tiled
 
 @register_operator
 class FwOp(AttentionFwOpBase):
     """xFormers' MHA kernel based on Composable Kernel.
     """
 
-    ### ck_check_op is temporarily used to check ck-tiled availability
-    ck_check_op = get_xformers_operator("is_ck_tiled_used")
-    use_ck_tiled = ck_check_op()
- 
     OPERATOR = get_xformers_operator("efficient_attention_forward_ck")
     SUPPORTED_DEVICES: Set[str] = {"cuda"}
     SUPPORTED_DTYPES: Set[torch.dtype] = {torch.half, torch.bfloat16}
     SUPPORTED_MAX_K = 256 
-    
-    if use_ck_tiled: 
+
+    if is_using_ck_tiled(): 
         SUPPORTED_ATTN_BIAS_TYPES: Set[Any] = {
             type(None),
             torch.Tensor,
@@ -186,7 +189,7 @@ class FwOp(AttentionFwOpBase):
             attn_bias.BlockDiagonalCausalFromBottomRightMask,
          }
 
-    SUPPORTS_DROPOUT = True
+    SUPPORTS_DROPOUT = False if is_using_ck_tiled() else True
     SUPPORTS_CUSTOM_SCALE = True
     SUPPORTS_DIFFERENT_VALUE_EMBED = True
     SUPPORTS_BMGHK = True
@@ -424,6 +427,8 @@ def not_supported_reasons(cls, d: Inputs) -> List[str]:
                     f"/ expected: {expected_bias_shape})"
                 )
         _check_large_shapes(reasons, d)
+        if is_using_ck_tiled():
+            reasons.append("Backward is currently not completely supported by ck-tiled!")
         return reasons
 
     @classmethod

From 0d311f50f5afe70e16c5ee0ed3e63254493c0895 Mon Sep 17 00:00:00 2001
From: Qianfeng Zhang <Qianfeng.Zhang@amd.com>
Date: Fri, 2 Feb 2024 22:49:03 +0000
Subject: [PATCH 414/641] Remove CK-specific benchmark scripts

---
 .../benchmark_mem_eff_attention_ck.py         | 343 ------------------
 .../benchmark_mem_eff_attention_ck_tiled.py   | 316 ----------------
 2 files changed, 659 deletions(-)
 delete mode 100644 xformers/benchmarks/benchmark_mem_eff_attention_ck.py
 delete mode 100644 xformers/benchmarks/benchmark_mem_eff_attention_ck_tiled.py

diff --git a/xformers/benchmarks/benchmark_mem_eff_attention_ck.py b/xformers/benchmarks/benchmark_mem_eff_attention_ck.py
deleted file mode 100644
index e683a7f06..000000000
--- a/xformers/benchmarks/benchmark_mem_eff_attention_ck.py
+++ /dev/null
@@ -1,343 +0,0 @@
-# Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
-#
-# This source code is licensed under the BSD license found in the
-# LICENSE file in the root directory of this source tree.
-
-
-import itertools
-import random
-from functools import partial
-
-import torch
-from torch.utils import benchmark
-from xformers.benchmarks.utils import benchmark_main_helper
-
-import xformers.ops
-import xformers.ops.fmha as fmha
-from xformers.attn_bias_utils import create_attn_bias
-
-torch.backends.cuda.matmul.allow_tf32 = False
-
-
-def ref_attention_bmk(q, k, v, attn_bias=None, p=0.0):
-    if isinstance(attn_bias, xformers.ops.AttentionMask):
-        attn_bias = (
-            attn_bias.materialize((q.shape[0], 1, q.shape[1], k.shape[1]))
-            .to(q)
-            .squeeze()
-        )
-    q = q * (1.0 / q.shape[-1] ** 0.5)
-    if attn_bias is None:
-        attn = q @ k.transpose(-2, -1)
-    else:
-        # equivalent to (q @ k.transpose(-2, -1) + m).softmax(-1) @ v
-        # but faster, and is what is used in PyTorch now
-        attn = torch.baddbmm(attn_bias, q, k.transpose(-2, -1))
-    attn = attn.softmax(-1)
-    if p > 0:
-        attn = torch.nn.functional.dropout(attn, p=p)
-    return attn @ v
-
-
-def ref_attention(q, k, v, attn_bias, p=0.0):
-    assert q.ndim == 4
-    B, M, H, K = q.shape
-
-    def T(t):
-        return t.permute((0, 2, 1, 3)).reshape(
-            [t.shape[0] * t.shape[2], t.shape[1], t.shape[3]]
-        )
-
-    if isinstance(attn_bias, torch.Tensor):
-        attn_bias = attn_bias.reshape(B * H, M, M)
-    out = ref_attention_bmk(T(q), T(k), T(v), attn_bias, p)
-    out = out.reshape([q.shape[0], q.shape[2], q.shape[1], v.shape[3]])
-    return out.permute((0, 2, 1, 3))
-
-
-min_run_time = 0.5
-device = torch.device("cuda")
-
-NUM_THREADS = [1] if device.type == "cuda" else [1, 40]
-SHAPES = [
-    # ViT
-    (384, 197, 1, 88),
-    (384, 197, 1, 80),
-    (384, 197, 1, 64),
-    (1024, 197, 1, 88),
-    (1024, 197, 1, 80),
-    (1024, 197, 1, 64),
-    # ViT-Huge
-    (32 * 16, 197, 1, 80),
-    (32, 197, 16, 80),
-    (32, 197, 16, 64),
-    (32, 197, 16, 128),
-    # ViT-Giant
-    (16 * 16, 197, 1, 88),
-    (16, 197, 16, 88),
-    (16, 197, 16, 64),
-    (16, 197, 16, 128),
-    # FB models
-    (1024, 82, 8, 64),
-    (150, 256, 16, 64),
-    (64, 256, 12, 64),
-    # Stable diffusion (https://github.com/huggingface/diffusers/pull/532)
-    (1, 4096, 16, 40),  # 512x512
-    (1, 16384, 16, 40),  # 1024x1024
-    (1, 4096, 16, 80),
-    #(1, 16384, 16, 80),  // disabled on MI250 due to big memory requirement
-    # + bs4
-    (4, 4096, 16, 40),
-    #(4, 16384, 16, 40),  // disabled on MI250 due to big memory requirement
-    (4, 4096, 16, 80),
-    #(4, 16384, 16, 80),  // disabled on MI250 due to big memory requirement
-    # ParlAI model
-    #(256, 4096, 16, 64), // disabled on MI250 due to big memory requirement
-    # Zetta B M H K
-    (8, 2048, 20, 128),
-    # LLaMa 70b - mp=8/16
-    *sorted(itertools.product([1, 2], [2048, 4096, 8192], [4, 8], [128])),
-    *sorted(
-        ##itertools.product([16], [128, 512, 1024], [16], [16, 32, 64, 128, 160, 256])
-        ## disabled K/Kv bigger than 128
-        itertools.product([16], [128, 512, 1024], [16], [16, 32, 64, 128])
-    ),
-]
-
-OPS = [
-    (xformers.ops.fmha.ck.FwOp, xformers.ops.fmha.ck.BwOp),
-    #(xformers.ops.fmha.flash.FwOp, xformers.ops.fmha.flash.BwOp),
-    # TODO: Triton is not stable: it can trigger Illegal Memory Accesses
-    # and its performance varies a lot between runs.
-    # (xformers.ops.fmha.triton.FwOp, xformers.ops.fmha.triton.BwOp),
-]
-
-
-def product_dict(**kwargs):
-    keys = kwargs.keys()
-    vals = kwargs.values()
-    for instance in itertools.product(*vals):
-        yield dict(zip(keys, instance))
-
-
-CASES = list(
-    product_dict(
-        shape=SHAPES,
-        num_threads=NUM_THREADS,
-        dropout_p=[0.0],
-        attn_bias_cfg=[(type(None), False)],
-        dtype=[torch.half],
-    )
-)
-
-# Add more cases with some variations
-for c in CASES.copy():
-    c = c.copy()
-    c.update(
-        random.Random(str(c["shape"])).choice(
-            [
-                {"dropout_p": 0.3},
-                {"attn_bias_cfg": (torch.Tensor, False)},
-                {"attn_bias_cfg": (torch.Tensor, True)},
-                {"attn_bias_cfg": (xformers.ops.LowerTriangularMask, False)},
-                {
-                    "attn_bias_cfg": (
-                        xformers.ops.fmha.attn_bias.BlockDiagonalCausalWithOffsetPaddedKeysMask,
-                        False,
-                    )
-                },
-                {"dtype": torch.bfloat16},
-                ##{"dtype": torch.float},
-            ]
-        )
-    )
-    CASES.append(c)
-
-
-def create_tensors(shape, dtype, requires_grad=False, packed=True, multiquery=False):
-    stacked_shape = list(shape)  # B, M, H, K
-    stacked_dim = 2 if packed else 0
-    stacked_shape.insert(stacked_dim, 3)
-    qkv = torch.rand(
-        stacked_shape, device=device, dtype=dtype, requires_grad=requires_grad
-    )
-    q = torch.rand(shape, device=device, dtype=dtype, requires_grad=requires_grad)
-    shape_kv = (shape[0], shape[1], 1 if multiquery else shape[2], shape[3])
-    k = torch.rand(
-        shape_kv, device=device, dtype=dtype, requires_grad=requires_grad
-    ).expand(shape)
-    v = torch.rand(
-        shape_kv, device=device, dtype=dtype, requires_grad=requires_grad
-    ).expand(shape)
-    return qkv, q, k, v
-
-
-def mem_eff_attention_fw(
-    shape,
-    num_threads: int,
-    attn_bias_cfg,
-    dropout_p,
-    dtype,
-    packed=True,
-    multiquery=False,
-):
-    B, M, H, K = shape
-    _, q, k, v = create_tensors(
-        shape, dtype, requires_grad=False, packed=packed, multiquery=multiquery
-    )
-    attn_bias_type, attn_bias_requires_grad = attn_bias_cfg
-    if attn_bias_requires_grad:
-        return
-
-    dtype_str = {
-        torch.bfloat16: "b16",
-        torch.half: "f16",
-        torch.float: "f32",
-    }[dtype]
-    sub_label = (
-        f"{dtype_str} {B}-{M}-{H}-{K}, p={dropout_p}, "
-        f"BiasT={attn_bias_type.__name__}"
-    )
-
-    has_run = False
-    for fw_op, bw_op in OPS:
-        bias = create_attn_bias(
-            attn_bias_type,
-            batch_size=B,
-            num_heads=H,
-            num_heads_groups=1,
-            q_len=M,
-            kv_len=M,
-            dtype=dtype,
-            device=device,
-            requires_grad=attn_bias_requires_grad,
-            fmt="BMHK",
-            op=fw_op,
-        )
-        inp = fmha.Inputs(query=q, key=k, value=v, attn_bias=bias, p=dropout_p)
-        if isinstance(
-            bias,
-            (
-                fmha.attn_bias.BlockDiagonalMask,
-                fmha.attn_bias.BlockDiagonalCausalWithOffsetPaddedKeysMask,
-            ),
-        ):
-            q, k, v = [x.reshape([1, -1, *x.shape[2:]]) for x in [q, k, v]]
-        if not fw_op.supports(inp):
-            continue
-
-        yield benchmark.Timer(
-            stmt="fn(q, k, v, attn_bias, p)",
-            globals={
-                "q": q,
-                "k": k,
-                "v": v,
-                "attn_bias": inp.attn_bias,
-                "p": dropout_p,
-                "fn": partial(
-                    xformers.ops.memory_efficient_attention, op=(fw_op, bw_op)
-                ),
-            },
-            label=f"attention (attn_bias={attn_bias_type})",
-            description=fw_op.NAME,
-            sub_label=sub_label,
-            num_threads=num_threads,
-        )
-        has_run = True
-
-    if not has_run:
-        return
-
-    yield benchmark.Timer(
-        stmt="fn(q, k, v, attn_bias, p)",
-        globals={
-            "q": q,
-            "k": k,
-            "v": v,
-            "attn_bias": inp.attn_bias,
-            "p": dropout_p,
-            "fn": ref_attention,
-        },
-        label=f"attention (attn_bias={attn_bias_type})",
-        description="eager",
-        sub_label=sub_label,
-        num_threads=num_threads,
-    )
-
-
-def mem_eff_attention_bw(shape, num_threads: int, attn_bias_cfg, dropout_p, dtype):
-    B, M, H, K = shape
-    qkv, q, k, v = create_tensors(shape, dtype, requires_grad=True)
-
-    attn_bias_type, attn_bias_requires_grad = attn_bias_cfg
-
-    dtype_str = {
-        torch.bfloat16: "b16",
-        torch.half: "f16",
-        torch.float: "f32",
-    }[dtype]
-    sub_label = (
-        f"{dtype_str} {B}-{M}-{H}-{K}, p={dropout_p}, "
-        f"BiasT={attn_bias_type.__name__}, BiasGrad={attn_bias_requires_grad}"
-    )
-
-    has_run = False
-    for fw_op, bw_op in OPS:
-        bias = create_attn_bias(
-            attn_bias_type,
-            batch_size=B,
-            num_heads=H,
-            num_heads_groups=1,
-            q_len=M,
-            kv_len=M,
-            dtype=dtype,
-            device=device,
-            requires_grad=attn_bias_requires_grad,
-            fmt="BMHK",
-            op=bw_op,
-        )
-        inp = fmha.Inputs(query=q, key=k, value=v, attn_bias=bias, p=dropout_p)
-
-        if not fw_op.supports(inp) or not bw_op.supports(inp):
-            continue
-        has_run = True
-        out = xformers.ops.memory_efficient_attention(
-            inp.query, inp.key, inp.value, inp.attn_bias, inp.p, op=(fw_op, bw_op)
-        )
-        grad_benchmark = torch.ones_like(q)
-
-        yield benchmark.Timer(
-            stmt="out.backward(grad, retain_graph=True)",
-            globals={
-                "out": out,
-                "grad": grad_benchmark,
-            },
-            label=f"attention backward (attn_bias={attn_bias_type})",
-            description=bw_op.NAME,
-            sub_label=sub_label,
-            num_threads=num_threads,
-        )
-        del out
-
-    if not has_run:
-        return
-    yield benchmark.Timer(
-        stmt="out.backward(grad, retain_graph=True)",
-        globals={
-            "out": ref_attention(q, k, v, inp.attn_bias, dropout_p),
-            "grad": grad_benchmark,
-        },
-        label=f"attention backward (attn_bias={attn_bias_type})",
-        description="vanilla",
-        sub_label=sub_label,
-        num_threads=num_threads,
-    )
-
-
-def main():
-    benchmark_main_helper(mem_eff_attention_fw, CASES, min_run_time=min_run_time)
-    benchmark_main_helper(mem_eff_attention_bw, CASES, min_run_time=min_run_time)
-
-
-if __name__ == "__main__":
-    main()
diff --git a/xformers/benchmarks/benchmark_mem_eff_attention_ck_tiled.py b/xformers/benchmarks/benchmark_mem_eff_attention_ck_tiled.py
deleted file mode 100644
index ee0c111ff..000000000
--- a/xformers/benchmarks/benchmark_mem_eff_attention_ck_tiled.py
+++ /dev/null
@@ -1,316 +0,0 @@
-# Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
-#
-# This source code is licensed under the BSD license found in the
-# LICENSE file in the root directory of this source tree.
-
-
-import itertools
-import random
-from functools import partial
-
-import torch
-from torch.utils import benchmark
-from xformers.benchmarks.utils import benchmark_main_helper
-
-import xformers.ops
-import xformers.ops.fmha as fmha
-
-torch.backends.cuda.matmul.allow_tf32 = False
-
-
-def create_attn_bias(
-    bias_type,
-    batch_size: int,
-    num_heads: int,
-    q_len: int,
-    kv_len: int,
-    device,
-    dtype,
-    bias_requires_grad: bool = False,
-):
-    NoneType = type(None)
-    if bias_type is NoneType:
-        return None
-    if bias_type is torch.Tensor:
-        attn_bias = torch.randn((1, 1, q_len, kv_len), device=device, dtype=dtype)
-        return attn_bias.expand(batch_size, num_heads, q_len, kv_len)
-    if bias_type is fmha.attn_bias.LowerTriangularMask:
-        return bias_type()
-    assert False, f"Unsupported bias type: {bias_type}"
-
-
-def ref_attention_bmk(q, k, v, attn_bias=None, p=0.0):
-    if isinstance(attn_bias, xformers.ops.AttentionMask):
-        attn_bias = (
-            attn_bias.materialize((q.shape[0], 1, q.shape[1], k.shape[1]))
-            .to(q)
-            .squeeze()
-        )
-    q = q * (1.0 / q.shape[-1] ** 0.5)
-    if attn_bias is None:
-        attn = q @ k.transpose(-2, -1)
-    else:
-        # equivalent to (q @ k.transpose(-2, -1) + m).softmax(-1) @ v
-        # but faster, and is what is used in PyTorch now
-        attn = torch.baddbmm(attn_bias, q, k.transpose(-2, -1))
-    attn = attn.softmax(-1)
-    if p > 0:
-        attn = torch.nn.functional.dropout(attn, p=p)
-    return attn @ v
-
-
-def ref_attention(q, k, v, attn_bias, p=0.0):
-    assert q.ndim == 4
-    B, M, H, K = q.shape
-
-    def T(t):
-        return t.permute((0, 2, 1, 3)).reshape(
-            [t.shape[0] * t.shape[2], t.shape[1], t.shape[3]]
-        )
-
-    if isinstance(attn_bias, torch.Tensor):
-        attn_bias = attn_bias.reshape(B * H, M, M)
-    out = ref_attention_bmk(T(q), T(k), T(v), attn_bias, p)
-    out = out.reshape([q.shape[0], q.shape[2], q.shape[1], v.shape[3]])
-    return out.permute((0, 2, 1, 3))
-
-
-min_run_time = 0.5
-device = torch.device("cuda")
-
-NUM_THREADS = [1] if device.type == "cuda" else [1, 40]
-SHAPES = [
-    # ViT
-    ##(384, 197, 1, 88),
-    ##(384, 197, 1, 80),
-    (384, 197, 1, 64),
-    ##(1024, 197, 1, 88),
-    ##(1024, 197, 1, 80),
-    (1024, 197, 1, 64),
-    # ViT-Huge
-    ##(32 * 16, 197, 1, 80),
-    ##(32, 197, 16, 80),
-    (32, 197, 16, 64),
-    (32, 197, 16, 128),
-    # ViT-Giant
-    ##(16 * 16, 197, 1, 88),
-    ##(16, 197, 16, 88),
-    (16, 197, 16, 64),
-    (16, 197, 16, 128),
-    # FB models
-    (1024, 82, 8, 64),
-    (150, 256, 16, 64),
-    (64, 256, 12, 64),
-    # Stable diffusion (https://github.com/huggingface/diffusers/pull/532)
-    ##(1, 4096, 16, 40),  # 512x512
-    ##(1, 16384, 16, 40),  # 1024x1024
-    ##(1, 4096, 16, 80),
-    #(1, 16384, 16, 80),  // disabled on MI250 due to big memory requirement
-    # + bs4
-    ##(4, 4096, 16, 40),
-    #(4, 16384, 16, 40),  // disabled on MI250 due to big memory requirement
-    ##(4, 4096, 16, 80),
-    #(4, 16384, 16, 80),  // disabled on MI250 due to big memory requirement
-    # ParlAI model
-    #(256, 4096, 16, 64), // disabled on MI250 due to big memory requirement
-    # Zetta B M H K
-    (8, 2048, 20, 128),
-    # LLaMa 70b - mp=8/16
-    *sorted(itertools.product([1, 2], [2048, 4096, 8192], [4, 8], [128])),
-    *sorted(
-        ##itertools.product([16], [128, 512, 1024], [16], [16, 32, 64, 128, 160, 256])
-        ## disabled K/Kv bigger than 128
-        itertools.product([16], [128, 512, 1024], [16], [64, 128])
-    ),
-]
-
-OPS = [
-    (xformers.ops.fmha.ck.FwOp, xformers.ops.fmha.ck.BwOp),
-    #(xformers.ops.fmha.flash.FwOp, xformers.ops.fmha.flash.BwOp),
-    # TODO: Triton is not stable: it can trigger Illegal Memory Accesses
-    # and its performance varies a lot between runs.
-    # (xformers.ops.fmha.triton.FwOp, xformers.ops.fmha.triton.BwOp),
-]
-
-
-def product_dict(**kwargs):
-    keys = kwargs.keys()
-    vals = kwargs.values()
-    for instance in itertools.product(*vals):
-        yield dict(zip(keys, instance))
-
-
-CASES = list(
-    product_dict(
-        shape=SHAPES,
-        num_threads=NUM_THREADS,
-        dropout_p=[0.0],
-        attn_bias_cfg=[(type(None), False)],
-        dtype=[torch.half],
-    )
-)
-
-# Add more cases with some variations
-for c in CASES.copy():
-    c = c.copy()
-    c.update(
-        random.Random(str(c["shape"])).choice(
-            [
-                ##{"dropout_p": 0.3},
-                {"attn_bias_cfg": (torch.Tensor, False)},
-                ##{"attn_bias_cfg": (torch.Tensor, True)},
-                {"attn_bias_cfg": (xformers.ops.LowerTriangularMask, False)},
-                ##{"dtype": torch.bfloat16},
-                ##{"dtype": torch.float},
-            ]
-        )
-    )
-    CASES.append(c)
-
-
-def create_tensors(shape, dtype, requires_grad=False):
-    B, M, H, K = shape
-    qkv = torch.rand(
-        [B, M, 3, H, K], device=device, dtype=dtype, requires_grad=requires_grad
-    )
-    q, k, v = xformers.ops.unbind(qkv, 2)
-    return qkv, q, k, v
-
-def mem_eff_attention_fw(shape, num_threads: int, attn_bias_cfg, dropout_p, dtype):
-    B, M, H, K = shape
-    _, q, k, v = create_tensors(shape, dtype)
-    attn_bias_type, attn_bias_requires_grad = attn_bias_cfg
-    if attn_bias_requires_grad:
-        return
-    bias = create_attn_bias(
-        attn_bias_type,
-        batch_size=B,
-        num_heads=H,
-        q_len=M,
-        kv_len=M,
-        device=device,
-        dtype=dtype,
-        bias_requires_grad=attn_bias_requires_grad,
-    )
-    inp = fmha.Inputs(query=q, key=k, value=v, attn_bias=bias, p=dropout_p)
-
-    dtype_str = {
-        torch.bfloat16: "b16",
-        torch.half: "f16",
-        torch.float: "f32",
-    }[dtype]
-    sub_label = (
-        f"{dtype_str} {B}-{M}-{H}-{K}, p={dropout_p}, "
-        f"BiasT={attn_bias_type.__name__}"
-    )
-
-    has_run = False
-    for fw_op, bw_op in OPS:
-        if not fw_op.supports(inp):
-            continue
-
-        yield benchmark.Timer(
-            stmt="fn(q, k, v, attn_bias, p)",
-            globals={
-                "q": q,
-                "k": k,
-                "v": v,
-                "attn_bias": inp.attn_bias,
-                "p": dropout_p,
-                "fn": partial(
-                    xformers.ops.memory_efficient_attention, op=(fw_op, bw_op)
-                ),
-            },
-            label=f"attention (attn_bias={attn_bias_type})",
-            description=fw_op.NAME,
-            sub_label=sub_label,
-            num_threads=num_threads,
-        )
-        has_run = True
-
-    if not has_run:
-        return
-
-    yield benchmark.Timer(
-        stmt="fn(q, k, v, attn_bias, p)",
-        globals={
-            "q": q,
-            "k": k,
-            "v": v,
-            "attn_bias": inp.attn_bias,
-            "p": dropout_p,
-            "fn": ref_attention,
-        },
-        label=f"attention (attn_bias={attn_bias_type})",
-        description="eager",
-        sub_label=sub_label,
-        num_threads=num_threads,
-    )
-
-
-def mem_eff_attention_bw(shape, num_threads: int, attn_bias_cfg, dropout_p, dtype):
-    B, M, H, K = shape
-    _, q, k, v = create_tensors(shape, dtype, requires_grad=True)
-
-    attn_bias_type, attn_bias_requires_grad = attn_bias_cfg
-    bias = create_attn_bias(
-        attn_bias_type,
-        batch_size=B,
-        num_heads=H,
-        q_len=M,
-        kv_len=M,
-        device=device,
-        dtype=dtype,
-        bias_requires_grad=attn_bias_requires_grad,
-    )
-    inp = fmha.Inputs(query=q, key=k, value=v, attn_bias=bias, p=dropout_p)
-
-    dtype_str = {
-        torch.bfloat16: "b16",
-        torch.half: "f16",
-        torch.float: "f32",
-    }[dtype]
-    sub_label = (
-        f"{dtype_str} {B}-{M}-{H}-{K}, p={dropout_p}, "
-        f"BiasT={attn_bias_type.__name__}, BiasGrad={attn_bias_requires_grad}"
-    )
-
-    has_run = False
-    for fw_op, bw_op in OPS:
-        if not fw_op.supports(inp) or not bw_op.supports(inp):
-            continue
-        has_run = True
-        out = xformers.ops.memory_efficient_attention(
-            inp.query, inp.key, inp.value, inp.attn_bias, inp.p, op=(fw_op, bw_op)
-        )
-        grad_benchmark = torch.ones_like(q)
-
-        yield benchmark.Timer(
-            stmt="out.backward(grad, retain_graph=True)",
-            globals={
-                "out": out,
-                "grad": grad_benchmark,
-            },
-            label=f"attention backward (attn_bias={attn_bias_type})",
-            description=bw_op.NAME,
-            sub_label=sub_label,
-            num_threads=num_threads,
-        )
-        del out
-
-    if not has_run:
-        return
-    yield benchmark.Timer(
-        stmt="out.backward(grad, retain_graph=True)",
-        globals={
-            "out": ref_attention(q, k, v, inp.attn_bias, dropout_p),
-            "grad": grad_benchmark,
-        },
-        label=f"attention backward (attn_bias={attn_bias_type})",
-        description="vanilla",
-        sub_label=sub_label,
-        num_threads=num_threads,
-    )
-
-benchmark_main_helper(mem_eff_attention_fw, CASES, min_run_time=min_run_time)
-##benchmark_main_helper(mem_eff_attention_bw, CASES, min_run_time=min_run_time)

From d57a5dba2b772ab134805c083d9de7ea3e3a1d55 Mon Sep 17 00:00:00 2001
From: Qianfeng Zhang <Qianfeng.Zhang@amd.com>
Date: Sat, 3 Feb 2024 20:24:32 +0000
Subject: [PATCH 415/641] Don't require is_cpu_tensor for
 seqstart_q/seqstart_k/seqlen_k in attention_forward_generic_ck_tiled

---
 .../attention_forward_generic_ck_tiled.cpp    | 68 ++++++++++++-------
 1 file changed, 42 insertions(+), 26 deletions(-)

diff --git a/xformers/csrc/attention/hip_fmha/attention_forward_generic_ck_tiled.cpp b/xformers/csrc/attention/hip_fmha/attention_forward_generic_ck_tiled.cpp
index 0c81dbfa9..9db1cd257 100644
--- a/xformers/csrc/attention/hip_fmha/attention_forward_generic_ck_tiled.cpp
+++ b/xformers/csrc/attention/hip_fmha/attention_forward_generic_ck_tiled.cpp
@@ -85,8 +85,6 @@ std::tuple<at::Tensor, at::Tensor, int64_t, int64_t> efficient_attention_forward
         TORCH_CHECK(seqstart_q->scalar_type() == at::ScalarType::Int);
         TORCH_CHECK(seqstart_k->scalar_type() == at::ScalarType::Int);
         TORCH_CHECK(seqstart_q->dim() == 1 && seqstart_k->dim() == 1);
-        CHECK_NOSPARSE_CONTIGUOUS_CPU((*seqstart_q));
-        CHECK_NOSPARSE_CONTIGUOUS_CPU((*seqstart_k));
         TORCH_CHECK(seqstart_q->size(0) == seqstart_k->size(0));
         TORCH_CHECK(query.size(0) == 1, "cu_seqlen only supports batch_size=1");
         TORCH_CHECK(max_seqlen_q_.has_value());
@@ -281,40 +279,58 @@ std::tuple<at::Tensor, at::Tensor, int64_t, int64_t> efficient_attention_forward
         // max_seqlen_q is used to create logsumexp tensor
         p.max_seqlen_q = *max_seqlen_q_;
 
-        at::Tensor dev_seqstart_q = at::empty({p.num_batches + 1}, opts.dtype(at::kInt));
-        at::Tensor dev_seqstart_k = at::empty({p.num_batches + 1}, opts.dtype(at::kInt));
+        // interesting: the tensors have to be defined here, moving to more local scope will
+        // cause issue
+        at::Tensor dev_seqstart_q;
+        at::Tensor dev_seqstart_k;
         at::Tensor dev_seqlen_k;
 
-        p.seqstart_q_dev_ptr = dev_seqstart_q.data_ptr();
-        HIP_CALL_CHECK(hipMemcpyAsync(p.seqstart_q_dev_ptr,
-                                      seqstart_q->data_ptr(),
-                                      (p.num_batches + 1) * sizeof(int),
-                                      hipMemcpyHostToDevice,
-                                      stream));
+        if(seqstart_q->is_cpu())
+        {
+            dev_seqstart_q       = at::empty({p.num_batches + 1}, opts.dtype(at::kInt));
+            p.seqstart_q_dev_ptr = dev_seqstart_q.data_ptr();
+            HIP_CALL_CHECK(hipMemcpyAsync(p.seqstart_q_dev_ptr,
+                                          seqstart_q->data_ptr(),
+                                          (p.num_batches + 1) * sizeof(int),
+                                          hipMemcpyHostToDevice,
+                                          stream));
+        }
+        else
+            p.seqstart_q_dev_ptr = seqstart_q->data_ptr();
 
-        p.seqstart_k_dev_ptr = dev_seqstart_k.data_ptr();
-        HIP_CALL_CHECK(hipMemcpyAsync(p.seqstart_k_dev_ptr,
-                                      seqstart_k->data_ptr(),
-                                      (p.num_batches + 1) * sizeof(int),
-                                      hipMemcpyHostToDevice,
-                                      stream));
+        if(seqstart_k->is_cpu())
+        {
+            dev_seqstart_k = at::empty({p.num_batches + 1}, opts.dtype(at::kInt));
+
+            p.seqstart_k_dev_ptr = dev_seqstart_k.data_ptr();
+            HIP_CALL_CHECK(hipMemcpyAsync(p.seqstart_k_dev_ptr,
+                                          seqstart_k->data_ptr(),
+                                          (p.num_batches + 1) * sizeof(int),
+                                          hipMemcpyHostToDevice,
+                                          stream));
+        }
+        else
+            p.seqstart_k_dev_ptr = seqstart_k->data_ptr();
 
         if(seqlen_k.has_value())
         {
             TORCH_CHECK(seqlen_k->scalar_type() == at::ScalarType::Int);
             TORCH_CHECK(seqlen_k->dim() == 1);
             TORCH_CHECK(seqlen_k->size(0) == p.num_batches)
-            CHECK_NOSPARSE_CONTIGUOUS_CPU((*seqlen_k));
-
-            dev_seqlen_k = at::empty({p.num_batches}, opts.dtype(at::kInt));
 
-            p.seqlen_k_dev_ptr = dev_seqlen_k.data_ptr();
-
-            HIP_CALL_CHECK(hipMemcpyAsync(p.seqlen_k_dev_ptr,
-                                          seqlen_k->data_ptr(),
-                                          p.num_batches * sizeof(int),
-                                          hipMemcpyHostToDevice,
-                                          stream));
+            if(seqlen_k->is_cpu())
+            {
+                dev_seqlen_k = at::empty({p.num_batches}, opts.dtype(at::kInt));
+
+                p.seqlen_k_dev_ptr = dev_seqlen_k.data_ptr();
+                HIP_CALL_CHECK(hipMemcpyAsync(p.seqlen_k_dev_ptr,
+                                              seqlen_k->data_ptr(),
+                                              p.num_batches * sizeof(int),
+                                              hipMemcpyHostToDevice,
+                                              stream));
+            }
+            else
+                p.seqlen_k_dev_ptr = seqlen_k->data_ptr();
         }
         else
             p.seqlen_k_dev_ptr = nullptr;

From b25c2391804fcc22af8f23d85239c6e0b2cd196c Mon Sep 17 00:00:00 2001
From: Qianfeng Zhang <Qianfeng.Zhang@amd.com>
Date: Sat, 3 Feb 2024 21:07:39 +0000
Subject: [PATCH 416/641] Remove seqlen_cpu from  _PaddedSeqLenInfo in
 attn_bias.py

---
 xformers/ops/fmha/attn_bias.py |  2 --
 xformers/ops/fmha/ck.py        | 15 ++++++++-------
 2 files changed, 8 insertions(+), 9 deletions(-)

diff --git a/xformers/ops/fmha/attn_bias.py b/xformers/ops/fmha/attn_bias.py
index 2fa591c30..5a453ebb5 100644
--- a/xformers/ops/fmha/attn_bias.py
+++ b/xformers/ops/fmha/attn_bias.py
@@ -408,7 +408,6 @@ class _PaddedSeqLenInfo(_SeqLenInfo):
     """
 
     seqlen: torch.Tensor
-    seqlen_cpu: torch.Tensor
     seqlen_py: Sequence[int]
     padding: int
     # From parent: seqstart[i] contains the start position
@@ -446,7 +445,6 @@ def from_seqlens_padded(
         seqlen = torch.tensor(seqlens, dtype=torch.int32)
         return cls(
             seqlen=seqlen,
-            seqlen_cpu=seqlen.to(device=torch.device("cpu")) if torch.cuda.is_available() and torch.version.hip else None,
             seqlen_py=seqlens,
             max_seqlen=max(seqlens),
             min_seqlen=min(seqlens),
diff --git a/xformers/ops/fmha/ck.py b/xformers/ops/fmha/ck.py
index fa9ee1f74..2b031f143 100644
--- a/xformers/ops/fmha/ck.py
+++ b/xformers/ops/fmha/ck.py
@@ -146,11 +146,10 @@ def _custom_mask_type(bias: Optional[Union[torch.Tensor, AttentionBias]]) -> int
 
 # checking the availability of ck-tiled is necessary since ck-tiled does not
 # have the same functionalities as old-CK
-def is_using_ck_tiled() -> bool:
+def is_ck_tiled() -> bool:
     ### ck_check_op is temporarily used to check ck-tiled availability
     ck_check_op = get_xformers_operator("is_ck_tiled_used")
-    use_ck_tiled = ck_check_op()
-    return use_ck_tiled
+    return ck_check_op()
 
 @register_operator
 class FwOp(AttentionFwOpBase):
@@ -162,7 +161,7 @@ class FwOp(AttentionFwOpBase):
     SUPPORTED_DTYPES: Set[torch.dtype] = {torch.half, torch.bfloat16}
     SUPPORTED_MAX_K = 256 
 
-    if is_using_ck_tiled(): 
+    if is_ck_tiled(): 
         SUPPORTED_ATTN_BIAS_TYPES: Set[Any] = {
             type(None),
             torch.Tensor,
@@ -189,7 +188,7 @@ class FwOp(AttentionFwOpBase):
             attn_bias.BlockDiagonalCausalFromBottomRightMask,
          }
 
-    SUPPORTS_DROPOUT = False if is_using_ck_tiled() else True
+    SUPPORTS_DROPOUT = False if is_ck_tiled() else True
     SUPPORTS_CUSTOM_SCALE = True
     SUPPORTS_DIFFERENT_VALUE_EMBED = True
     SUPPORTS_BMGHK = True
@@ -283,6 +282,8 @@ def apply_bmhk(
         if type(inp.attn_bias) not in FwOp.SUPPORTED_ATTN_BIAS_TYPES:
             raise NotImplementedError("Unsupported attn_bias type")
         seqstart_k, seqstart_q, max_seqlen_q = _get_seqlen_info(inp)
+        if isinstance(inp.attn_bias, BlockDiagonalCausalWithOffsetPaddedKeysMask):
+            seqlen_k=inp.attn_bias.k_seqinfo.seqlen if is_ck_tiled() else inp.attn_bias.k_seqinfo.seqlen.to(torch.device("cpu"))
         out, lse, rng_seed, rng_offset = cls.OPERATOR(
             query=inp.query,
             key=inp.key,
@@ -295,7 +296,7 @@ def apply_bmhk(
             compute_logsumexp=needs_gradient,
             custom_mask_type=_custom_mask_type(inp.attn_bias),
             scale=inp.scale,
-            seqlen_k=inp.attn_bias.k_seqinfo.seqlen_cpu
+            seqlen_k=seqlen_k
             if isinstance(inp.attn_bias, BlockDiagonalCausalWithOffsetPaddedKeysMask)
             else None,
             window_size=inp.attn_bias._window_size
@@ -427,7 +428,7 @@ def not_supported_reasons(cls, d: Inputs) -> List[str]:
                     f"/ expected: {expected_bias_shape})"
                 )
         _check_large_shapes(reasons, d)
-        if is_using_ck_tiled():
+        if is_ck_tiled():
             reasons.append("Backward is currently not completely supported by ck-tiled!")
         return reasons
 

From 1a3ce52424fb7d93c1cbbe92a9ae4f3bbf98288e Mon Sep 17 00:00:00 2001
From: Qianfeng Zhang <Qianfeng.Zhang@amd.com>
Date: Sun, 4 Feb 2024 15:24:11 +0000
Subject: [PATCH 417/641] Change the branch for composable_kernel_tiled
 submodule and update to latest

---
 .gitmodules                         | 2 +-
 third_party/composable_kernel_tiled | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/.gitmodules b/.gitmodules
index 41a2922cb..cbef796c7 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -11,4 +11,4 @@
 [submodule "third_party/composable_kernel_tiled"]
 	path = third_party/composable_kernel_tiled
 	url = https://github.com/ROCm/composable_kernel.git
-	branch = ck_tile/fmha_attemp_async_copy_unify
+	branch = ck_tile/dev
diff --git a/third_party/composable_kernel_tiled b/third_party/composable_kernel_tiled
index eb53e235c..3bda955fe 160000
--- a/third_party/composable_kernel_tiled
+++ b/third_party/composable_kernel_tiled
@@ -1 +1 @@
-Subproject commit eb53e235c76e3da0374214221e94c45419b90bec
+Subproject commit 3bda955fe6ca92cdd29691783ebb772ac13c857c

From f7bf9b4d0ef203234724247d4bc1bda1a03ff0c6 Mon Sep 17 00:00:00 2001
From: Qianfeng Zhang <Qianfeng.Zhang@amd.com>
Date: Sun, 4 Feb 2024 17:07:59 +0000
Subject: [PATCH 418/641] Remove the using of seqlen_cpu in BwOp of ck.py

---
 xformers/ops/fmha/ck.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/xformers/ops/fmha/ck.py b/xformers/ops/fmha/ck.py
index 2b031f143..ff899dc53 100644
--- a/xformers/ops/fmha/ck.py
+++ b/xformers/ops/fmha/ck.py
@@ -440,6 +440,9 @@ def apply(cls, ctx: Context, inp: Inputs, grad: torch.Tensor) -> Gradients:
         seqstart_k, seqstart_q, max_seqlen_q = _get_seqlen_info(inp)
         dtype = inp.query.dtype
 
+        if isinstance(inp.attn_bias, BlockDiagonalCausalWithOffsetPaddedKeysMask):
+            seqlen_k=inp.attn_bias.k_seqinfo.seqlen if is_ck_tiled() else inp.attn_bias.k_seqinfo.seqlen.to(torch.device("cpu"))
+
         rng_seed = rng_offset = 0
         if inp.p != 0.0:
             if (
@@ -460,7 +463,7 @@ def apply(cls, ctx: Context, inp: Inputs, grad: torch.Tensor) -> Gradients:
             seqstart_q=seqstart_q,
             seqstart_k=seqstart_k,
             max_seqlen_q=max_seqlen_q,
-            seqlen_k=inp.attn_bias.k_seqinfo.seqlen_cpu
+            seqlen_k=seqlen_k
             if isinstance(inp.attn_bias, BlockDiagonalCausalWithOffsetPaddedKeysMask)
             else None,
             logsumexp=ctx.lse,

From 15d2a720df2ab6414460a7255e49cff76e3a06b1 Mon Sep 17 00:00:00 2001
From: Qianfeng Zhang <Qianfeng.Zhang@amd.com>
Date: Sun, 4 Feb 2024 17:07:59 +0000
Subject: [PATCH 419/641] Remove the using of seqlen_cpu in BwOp of ck.py

---
 xformers/ops/fmha/ck.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/xformers/ops/fmha/ck.py b/xformers/ops/fmha/ck.py
index 2b031f143..ff899dc53 100644
--- a/xformers/ops/fmha/ck.py
+++ b/xformers/ops/fmha/ck.py
@@ -440,6 +440,9 @@ def apply(cls, ctx: Context, inp: Inputs, grad: torch.Tensor) -> Gradients:
         seqstart_k, seqstart_q, max_seqlen_q = _get_seqlen_info(inp)
         dtype = inp.query.dtype
 
+        if isinstance(inp.attn_bias, BlockDiagonalCausalWithOffsetPaddedKeysMask):
+            seqlen_k=inp.attn_bias.k_seqinfo.seqlen if is_ck_tiled() else inp.attn_bias.k_seqinfo.seqlen.to(torch.device("cpu"))
+
         rng_seed = rng_offset = 0
         if inp.p != 0.0:
             if (
@@ -460,7 +463,7 @@ def apply(cls, ctx: Context, inp: Inputs, grad: torch.Tensor) -> Gradients:
             seqstart_q=seqstart_q,
             seqstart_k=seqstart_k,
             max_seqlen_q=max_seqlen_q,
-            seqlen_k=inp.attn_bias.k_seqinfo.seqlen_cpu
+            seqlen_k=seqlen_k
             if isinstance(inp.attn_bias, BlockDiagonalCausalWithOffsetPaddedKeysMask)
             else None,
             logsumexp=ctx.lse,

From bcd193656ddc35932a948ccaaab33423c0d2239e Mon Sep 17 00:00:00 2001
From: Qianfeng Zhang <Qianfeng.Zhang@amd.com>
Date: Sun, 4 Feb 2024 17:30:41 +0000
Subject: [PATCH 420/641] Align .clang_format with main branch and re-format
 c++ files

---
 .clang-format                                 |   80 +-
 xformers/csrc/attention/attention.cpp         |   59 +-
 .../hip_fmha/attention_backward_generic.cpp   |  970 ++++----
 .../hip_fmha/attention_ck_rand_uniform.cpp    |  173 +-
 .../hip_fmha/attention_forward_decoder.cpp    |  464 ++--
 .../hip_fmha/attention_forward_generic.cpp    |  725 +++---
 .../attention_forward_generic_ck_tiled.cpp    |  744 +++---
 .../hip_fmha/attention_forward_splitk.cpp     | 1998 +++++++++--------
 .../csrc/attention/hip_fmha/ck_align_switch.h |  292 ++-
 .../hip_fmha/ck_attention_forward_decoder.h   |  886 ++++----
 .../ck_attention_forward_decoder_splitk.h     | 1238 +++++-----
 .../csrc/attention/hip_fmha/ck_bool_switch.h  |   44 +-
 .../ck_fmha_backward_gemm_constants.h         |  344 ++-
 .../hip_fmha/ck_fmha_batched_backward.h       |  657 +++---
 .../ck_fmha_batched_backward_bp16.cpp         |  137 +-
 .../ck_fmha_batched_backward_fp16.cpp         |  134 +-
 .../hip_fmha/ck_fmha_batched_forward.h        |  515 ++---
 .../hip_fmha/ck_fmha_batched_forward_bp16.cpp |   89 +-
 .../hip_fmha/ck_fmha_batched_forward_fp16.cpp |   89 +-
 .../hip_fmha/ck_fmha_batched_infer.h          |  483 ++--
 .../hip_fmha/ck_fmha_batched_infer_bp16.cpp   |   89 +-
 .../hip_fmha/ck_fmha_batched_infer_fp16.cpp   |   89 +-
 .../hip_fmha/ck_fmha_common_gemm_constants.h  |   27 +-
 .../hip_fmha/ck_fmha_grouped_backward.h       |  673 +++---
 .../ck_fmha_grouped_backward_bp16.cpp         |  143 +-
 .../ck_fmha_grouped_backward_fp16.cpp         |  140 +-
 .../hip_fmha/ck_fmha_grouped_forward.h        |  528 +++--
 .../hip_fmha/ck_fmha_grouped_forward_bp16.cpp |   89 +-
 .../hip_fmha/ck_fmha_grouped_forward_fp16.cpp |   89 +-
 .../hip_fmha/ck_fmha_grouped_infer.h          |  503 ++---
 .../hip_fmha/ck_fmha_grouped_infer_bp16.cpp   |   89 +-
 .../hip_fmha/ck_fmha_grouped_infer_fp16.cpp   |   89 +-
 .../attention/hip_fmha/ck_fmha_op_helper.h    |   39 +-
 .../csrc/attention/hip_fmha/ck_fmha_params.h  |  376 ++--
 .../csrc/attention/hip_fmha/ck_fmha_test.cpp  |   30 +-
 .../csrc/attention/hip_fmha/ck_fmha_util.h    |  218 +-
 .../hip_fmha/ck_tiled_fmha_batched_forward.h  |  375 ++--
 .../ck_tiled_fmha_batched_forward_bp16.cpp    |   35 +-
 .../ck_tiled_fmha_batched_forward_fp16.cpp    |   35 +-
 .../hip_fmha/ck_tiled_fmha_batched_infer.h    |  375 ++--
 .../ck_tiled_fmha_batched_infer_bp16.cpp      |   35 +-
 .../ck_tiled_fmha_batched_infer_fp16.cpp      |   35 +-
 .../hip_fmha/ck_tiled_fmha_definitions.h      |  139 +-
 .../hip_fmha/ck_tiled_fmha_forward_kernel.h   | 1238 +++++-----
 .../hip_fmha/ck_tiled_fmha_fwd_epilogue.h     |   40 +-
 .../ck_tiled_fmha_fwd_tile_partitioner.h      |   87 +-
 .../hip_fmha/ck_tiled_fmha_grouped_forward.h  |  306 +--
 .../ck_tiled_fmha_grouped_forward_bp16.cpp    |   35 +-
 .../ck_tiled_fmha_grouped_forward_fp16.cpp    |   35 +-
 .../hip_fmha/ck_tiled_fmha_grouped_infer.h    |  306 +--
 .../ck_tiled_fmha_grouped_infer_bp16.cpp      |   35 +-
 .../ck_tiled_fmha_grouped_infer_fp16.cpp      |   35 +-
 .../attention/hip_fmha/ck_tiled_fmha_params.h |  366 ++-
 .../hip_fmha/ck_tiled_headdim_switch.h        |   43 +-
 ...d_backward_bp16_masktype_0_no_attnbias.cpp |    7 +-
 ..._bp16_masktype_0_no_attnbias_fp32_grad.cpp |    7 +-
 ...backward_bp16_masktype_0_with_attnbias.cpp |    7 +-
 ...p16_masktype_0_with_attnbias_fp32_grad.cpp |    7 +-
 ...d_backward_bp16_masktype_1_no_attnbias.cpp |    7 +-
 ..._bp16_masktype_1_no_attnbias_fp32_grad.cpp |    7 +-
 ...backward_bp16_masktype_1_with_attnbias.cpp |    7 +-
 ...p16_masktype_1_with_attnbias_fp32_grad.cpp |    7 +-
 ...d_backward_bp16_masktype_2_no_attnbias.cpp |    7 +-
 ..._bp16_masktype_2_no_attnbias_fp32_grad.cpp |    7 +-
 ...backward_bp16_masktype_2_with_attnbias.cpp |    7 +-
 ...p16_masktype_2_with_attnbias_fp32_grad.cpp |    7 +-
 ...d_backward_fp16_masktype_0_no_attnbias.cpp |    7 +-
 ..._fp16_masktype_0_no_attnbias_fp32_grad.cpp |    7 +-
 ...backward_fp16_masktype_0_with_attnbias.cpp |    7 +-
 ...p16_masktype_0_with_attnbias_fp32_grad.cpp |    7 +-
 ...d_backward_fp16_masktype_1_no_attnbias.cpp |    7 +-
 ..._fp16_masktype_1_no_attnbias_fp32_grad.cpp |    7 +-
 ...backward_fp16_masktype_1_with_attnbias.cpp |    7 +-
 ...p16_masktype_1_with_attnbias_fp32_grad.cpp |    7 +-
 ...d_backward_fp16_masktype_2_no_attnbias.cpp |    7 +-
 ..._fp16_masktype_2_no_attnbias_fp32_grad.cpp |    7 +-
 ...backward_fp16_masktype_2_with_attnbias.cpp |    7 +-
 ...p16_masktype_2_with_attnbias_fp32_grad.cpp |    7 +-
 ...ed_forward_bp16_masktype_0_no_attnbias.cpp |    7 +-
 ..._forward_bp16_masktype_0_with_attnbias.cpp |    7 +-
 ...ed_forward_bp16_masktype_1_no_attnbias.cpp |    7 +-
 ..._forward_bp16_masktype_1_with_attnbias.cpp |    7 +-
 ...ed_forward_bp16_masktype_2_no_attnbias.cpp |    7 +-
 ..._forward_bp16_masktype_2_with_attnbias.cpp |    7 +-
 ...ed_forward_fp16_masktype_0_no_attnbias.cpp |    7 +-
 ..._forward_fp16_masktype_0_with_attnbias.cpp |    7 +-
 ...ed_forward_fp16_masktype_1_no_attnbias.cpp |    7 +-
 ..._forward_fp16_masktype_1_with_attnbias.cpp |    7 +-
 ...ed_forward_fp16_masktype_2_no_attnbias.cpp |    7 +-
 ..._forward_fp16_masktype_2_with_attnbias.cpp |    7 +-
 ...ched_infer_bp16_masktype_0_no_attnbias.cpp |    7 +-
 ...ed_infer_bp16_masktype_0_with_attnbias.cpp |    7 +-
 ...ched_infer_bp16_masktype_1_no_attnbias.cpp |    7 +-
 ...ed_infer_bp16_masktype_1_with_attnbias.cpp |    7 +-
 ...ched_infer_bp16_masktype_2_no_attnbias.cpp |    7 +-
 ...ed_infer_bp16_masktype_2_with_attnbias.cpp |    7 +-
 ...ched_infer_fp16_masktype_0_no_attnbias.cpp |    7 +-
 ...ed_infer_fp16_masktype_0_with_attnbias.cpp |    7 +-
 ...ched_infer_fp16_masktype_1_no_attnbias.cpp |    7 +-
 ...ed_infer_fp16_masktype_1_with_attnbias.cpp |    7 +-
 ...ched_infer_fp16_masktype_2_no_attnbias.cpp |    7 +-
 ...ed_infer_fp16_masktype_2_with_attnbias.cpp |    7 +-
 ...d_backward_bp16_masktype_0_no_attnbias.cpp |    7 +-
 ..._bp16_masktype_0_no_attnbias_fp32_grad.cpp |    7 +-
 ...backward_bp16_masktype_0_with_attnbias.cpp |    7 +-
 ...p16_masktype_0_with_attnbias_fp32_grad.cpp |    7 +-
 ...d_backward_bp16_masktype_1_no_attnbias.cpp |    7 +-
 ..._bp16_masktype_1_no_attnbias_fp32_grad.cpp |    7 +-
 ...backward_bp16_masktype_1_with_attnbias.cpp |    7 +-
 ...p16_masktype_1_with_attnbias_fp32_grad.cpp |    7 +-
 ...d_backward_bp16_masktype_2_no_attnbias.cpp |    7 +-
 ..._bp16_masktype_2_no_attnbias_fp32_grad.cpp |    7 +-
 ...backward_bp16_masktype_2_with_attnbias.cpp |    7 +-
 ...p16_masktype_2_with_attnbias_fp32_grad.cpp |    7 +-
 ...d_backward_fp16_masktype_0_no_attnbias.cpp |    7 +-
 ..._fp16_masktype_0_no_attnbias_fp32_grad.cpp |    7 +-
 ...backward_fp16_masktype_0_with_attnbias.cpp |    7 +-
 ...p16_masktype_0_with_attnbias_fp32_grad.cpp |    7 +-
 ...d_backward_fp16_masktype_1_no_attnbias.cpp |    7 +-
 ..._fp16_masktype_1_no_attnbias_fp32_grad.cpp |    7 +-
 ...backward_fp16_masktype_1_with_attnbias.cpp |    7 +-
 ...p16_masktype_1_with_attnbias_fp32_grad.cpp |    7 +-
 ...d_backward_fp16_masktype_2_no_attnbias.cpp |    7 +-
 ..._fp16_masktype_2_no_attnbias_fp32_grad.cpp |    7 +-
 ...backward_fp16_masktype_2_with_attnbias.cpp |    7 +-
 ...p16_masktype_2_with_attnbias_fp32_grad.cpp |    7 +-
 ...ed_forward_bp16_masktype_0_no_attnbias.cpp |    7 +-
 ..._forward_bp16_masktype_0_with_attnbias.cpp |    7 +-
 ...ed_forward_bp16_masktype_1_no_attnbias.cpp |    7 +-
 ..._forward_bp16_masktype_1_with_attnbias.cpp |    7 +-
 ...ed_forward_bp16_masktype_2_no_attnbias.cpp |    7 +-
 ..._forward_bp16_masktype_2_with_attnbias.cpp |    7 +-
 ...ed_forward_fp16_masktype_0_no_attnbias.cpp |    7 +-
 ..._forward_fp16_masktype_0_with_attnbias.cpp |    7 +-
 ...ed_forward_fp16_masktype_1_no_attnbias.cpp |    7 +-
 ..._forward_fp16_masktype_1_with_attnbias.cpp |    7 +-
 ...ed_forward_fp16_masktype_2_no_attnbias.cpp |    7 +-
 ..._forward_fp16_masktype_2_with_attnbias.cpp |    7 +-
 ...uped_infer_bp16_masktype_0_no_attnbias.cpp |    7 +-
 ...ed_infer_bp16_masktype_0_with_attnbias.cpp |    7 +-
 ...uped_infer_bp16_masktype_1_no_attnbias.cpp |    7 +-
 ...ed_infer_bp16_masktype_1_with_attnbias.cpp |    7 +-
 ...uped_infer_bp16_masktype_2_no_attnbias.cpp |    7 +-
 ...ed_infer_bp16_masktype_2_with_attnbias.cpp |    7 +-
 ...uped_infer_fp16_masktype_0_no_attnbias.cpp |    7 +-
 ...ed_infer_fp16_masktype_0_with_attnbias.cpp |    7 +-
 ...uped_infer_fp16_masktype_1_no_attnbias.cpp |    7 +-
 ...ed_infer_fp16_masktype_1_with_attnbias.cpp |    7 +-
 ...uped_infer_fp16_masktype_2_no_attnbias.cpp |    7 +-
 ...ed_infer_fp16_masktype_2_with_attnbias.cpp |    7 +-
 ..._no_causalmask_no_attnbias_headdim_128.cpp |    7 +-
 ..._no_causalmask_no_attnbias_headdim_256.cpp |    7 +-
 ...6_no_causalmask_no_attnbias_headdim_32.cpp |    7 +-
 ...6_no_causalmask_no_attnbias_headdim_64.cpp |    7 +-
 ...o_causalmask_with_attnbias_headdim_128.cpp |    7 +-
 ...o_causalmask_with_attnbias_headdim_256.cpp |    7 +-
 ...no_causalmask_with_attnbias_headdim_32.cpp |    7 +-
 ...no_causalmask_with_attnbias_headdim_64.cpp |    7 +-
 ...ith_causalmask_no_attnbias_headdim_128.cpp |    7 +-
 ...ith_causalmask_no_attnbias_headdim_256.cpp |    7 +-
 ...with_causalmask_no_attnbias_headdim_32.cpp |    7 +-
 ...with_causalmask_no_attnbias_headdim_64.cpp |    7 +-
 ...h_causalmask_with_attnbias_headdim_128.cpp |    7 +-
 ...h_causalmask_with_attnbias_headdim_256.cpp |    7 +-
 ...th_causalmask_with_attnbias_headdim_32.cpp |    7 +-
 ...th_causalmask_with_attnbias_headdim_64.cpp |    7 +-
 ..._no_causalmask_no_attnbias_headdim_128.cpp |    7 +-
 ..._no_causalmask_no_attnbias_headdim_256.cpp |    7 +-
 ...6_no_causalmask_no_attnbias_headdim_32.cpp |    7 +-
 ...6_no_causalmask_no_attnbias_headdim_64.cpp |    7 +-
 ...o_causalmask_with_attnbias_headdim_128.cpp |    7 +-
 ...o_causalmask_with_attnbias_headdim_256.cpp |    7 +-
 ...no_causalmask_with_attnbias_headdim_32.cpp |    7 +-
 ...no_causalmask_with_attnbias_headdim_64.cpp |    7 +-
 ...ith_causalmask_no_attnbias_headdim_128.cpp |    7 +-
 ...ith_causalmask_no_attnbias_headdim_256.cpp |    7 +-
 ...with_causalmask_no_attnbias_headdim_32.cpp |    7 +-
 ...with_causalmask_no_attnbias_headdim_64.cpp |    7 +-
 ...h_causalmask_with_attnbias_headdim_128.cpp |    7 +-
 ...h_causalmask_with_attnbias_headdim_256.cpp |    7 +-
 ...th_causalmask_with_attnbias_headdim_32.cpp |    7 +-
 ...th_causalmask_with_attnbias_headdim_64.cpp |    7 +-
 ..._no_causalmask_no_attnbias_headdim_128.cpp |    7 +-
 ..._no_causalmask_no_attnbias_headdim_256.cpp |    7 +-
 ...6_no_causalmask_no_attnbias_headdim_32.cpp |    7 +-
 ...6_no_causalmask_no_attnbias_headdim_64.cpp |    7 +-
 ...o_causalmask_with_attnbias_headdim_128.cpp |    7 +-
 ...o_causalmask_with_attnbias_headdim_256.cpp |    7 +-
 ...no_causalmask_with_attnbias_headdim_32.cpp |    7 +-
 ...no_causalmask_with_attnbias_headdim_64.cpp |    7 +-
 ...ith_causalmask_no_attnbias_headdim_128.cpp |    7 +-
 ...ith_causalmask_no_attnbias_headdim_256.cpp |    7 +-
 ...with_causalmask_no_attnbias_headdim_32.cpp |    7 +-
 ...with_causalmask_no_attnbias_headdim_64.cpp |    7 +-
 ...h_causalmask_with_attnbias_headdim_128.cpp |    7 +-
 ...h_causalmask_with_attnbias_headdim_256.cpp |    7 +-
 ...th_causalmask_with_attnbias_headdim_32.cpp |    7 +-
 ...th_causalmask_with_attnbias_headdim_64.cpp |    7 +-
 ..._no_causalmask_no_attnbias_headdim_128.cpp |    7 +-
 ..._no_causalmask_no_attnbias_headdim_256.cpp |    7 +-
 ...6_no_causalmask_no_attnbias_headdim_32.cpp |    7 +-
 ...6_no_causalmask_no_attnbias_headdim_64.cpp |    7 +-
 ...o_causalmask_with_attnbias_headdim_128.cpp |    7 +-
 ...o_causalmask_with_attnbias_headdim_256.cpp |    7 +-
 ...no_causalmask_with_attnbias_headdim_32.cpp |    7 +-
 ...no_causalmask_with_attnbias_headdim_64.cpp |    7 +-
 ...ith_causalmask_no_attnbias_headdim_128.cpp |    7 +-
 ...ith_causalmask_no_attnbias_headdim_256.cpp |    7 +-
 ...with_causalmask_no_attnbias_headdim_32.cpp |    7 +-
 ...with_causalmask_no_attnbias_headdim_64.cpp |    7 +-
 ...h_causalmask_with_attnbias_headdim_128.cpp |    7 +-
 ...h_causalmask_with_attnbias_headdim_256.cpp |    7 +-
 ...th_causalmask_with_attnbias_headdim_32.cpp |    7 +-
 ...th_causalmask_with_attnbias_headdim_64.cpp |    7 +-
 ..._no_causalmask_no_attnbias_headdim_128.cpp |    7 +-
 ..._no_causalmask_no_attnbias_headdim_256.cpp |    7 +-
 ...6_no_causalmask_no_attnbias_headdim_32.cpp |    7 +-
 ...6_no_causalmask_no_attnbias_headdim_64.cpp |    7 +-
 ...o_causalmask_with_attnbias_headdim_128.cpp |    7 +-
 ...o_causalmask_with_attnbias_headdim_256.cpp |    7 +-
 ...no_causalmask_with_attnbias_headdim_32.cpp |    7 +-
 ...no_causalmask_with_attnbias_headdim_64.cpp |    7 +-
 ...ith_causalmask_no_attnbias_headdim_128.cpp |    7 +-
 ...ith_causalmask_no_attnbias_headdim_256.cpp |    7 +-
 ...with_causalmask_no_attnbias_headdim_32.cpp |    7 +-
 ...with_causalmask_no_attnbias_headdim_64.cpp |    7 +-
 ...h_causalmask_with_attnbias_headdim_128.cpp |    7 +-
 ...h_causalmask_with_attnbias_headdim_256.cpp |    7 +-
 ...th_causalmask_with_attnbias_headdim_32.cpp |    7 +-
 ...th_causalmask_with_attnbias_headdim_64.cpp |    7 +-
 ..._no_causalmask_no_attnbias_headdim_128.cpp |    7 +-
 ..._no_causalmask_no_attnbias_headdim_256.cpp |    7 +-
 ...6_no_causalmask_no_attnbias_headdim_32.cpp |    7 +-
 ...6_no_causalmask_no_attnbias_headdim_64.cpp |    7 +-
 ...o_causalmask_with_attnbias_headdim_128.cpp |    7 +-
 ...o_causalmask_with_attnbias_headdim_256.cpp |    7 +-
 ...no_causalmask_with_attnbias_headdim_32.cpp |    7 +-
 ...no_causalmask_with_attnbias_headdim_64.cpp |    7 +-
 ...ith_causalmask_no_attnbias_headdim_128.cpp |    7 +-
 ...ith_causalmask_no_attnbias_headdim_256.cpp |    7 +-
 ...with_causalmask_no_attnbias_headdim_32.cpp |    7 +-
 ...with_causalmask_no_attnbias_headdim_64.cpp |    7 +-
 ...h_causalmask_with_attnbias_headdim_128.cpp |    7 +-
 ...h_causalmask_with_attnbias_headdim_256.cpp |    7 +-
 ...th_causalmask_with_attnbias_headdim_32.cpp |    7 +-
 ...th_causalmask_with_attnbias_headdim_64.cpp |    7 +-
 ..._no_causalmask_no_attnbias_headdim_128.cpp |    7 +-
 ..._no_causalmask_no_attnbias_headdim_256.cpp |    7 +-
 ...6_no_causalmask_no_attnbias_headdim_32.cpp |    7 +-
 ...6_no_causalmask_no_attnbias_headdim_64.cpp |    7 +-
 ...o_causalmask_with_attnbias_headdim_128.cpp |    7 +-
 ...o_causalmask_with_attnbias_headdim_256.cpp |    7 +-
 ...no_causalmask_with_attnbias_headdim_32.cpp |    7 +-
 ...no_causalmask_with_attnbias_headdim_64.cpp |    7 +-
 ...ith_causalmask_no_attnbias_headdim_128.cpp |    7 +-
 ...ith_causalmask_no_attnbias_headdim_256.cpp |    7 +-
 ...with_causalmask_no_attnbias_headdim_32.cpp |    7 +-
 ...with_causalmask_no_attnbias_headdim_64.cpp |    7 +-
 ...h_causalmask_with_attnbias_headdim_128.cpp |    7 +-
 ...h_causalmask_with_attnbias_headdim_256.cpp |    7 +-
 ...th_causalmask_with_attnbias_headdim_32.cpp |    7 +-
 ...th_causalmask_with_attnbias_headdim_64.cpp |    7 +-
 ..._no_causalmask_no_attnbias_headdim_128.cpp |    7 +-
 ..._no_causalmask_no_attnbias_headdim_256.cpp |    7 +-
 ...6_no_causalmask_no_attnbias_headdim_32.cpp |    7 +-
 ...6_no_causalmask_no_attnbias_headdim_64.cpp |    7 +-
 ...o_causalmask_with_attnbias_headdim_128.cpp |    7 +-
 ...o_causalmask_with_attnbias_headdim_256.cpp |    7 +-
 ...no_causalmask_with_attnbias_headdim_32.cpp |    7 +-
 ...no_causalmask_with_attnbias_headdim_64.cpp |    7 +-
 ...ith_causalmask_no_attnbias_headdim_128.cpp |    7 +-
 ...ith_causalmask_no_attnbias_headdim_256.cpp |    7 +-
 ...with_causalmask_no_attnbias_headdim_32.cpp |    7 +-
 ...with_causalmask_no_attnbias_headdim_64.cpp |    7 +-
 ...h_causalmask_with_attnbias_headdim_128.cpp |    7 +-
 ...h_causalmask_with_attnbias_headdim_256.cpp |    7 +-
 ...th_causalmask_with_attnbias_headdim_32.cpp |    7 +-
 ...th_causalmask_with_attnbias_headdim_64.cpp |    7 +-
 278 files changed, 9661 insertions(+), 8794 deletions(-)

diff --git a/.clang-format b/.clang-format
index 22f267496..6d0ab740d 100644
--- a/.clang-format
+++ b/.clang-format
@@ -1,81 +1,80 @@
 ---
-Language:        Cpp
-AccessModifierOffset: 0
-AlignAfterOpenBracket: Align
-AlignConsecutiveAssignments: true
+AccessModifierOffset: -1
+AlignAfterOpenBracket: AlwaysBreak
+AlignConsecutiveAssignments: false
 AlignConsecutiveDeclarations: false
 AlignEscapedNewlinesLeft: true
-AlignOperands:   true
-AlignTrailingComments: true
-AllowAllParametersOfDeclarationOnNextLine: true
-AllowShortBlocksOnASingleLine: true
-AllowShortCaseLabelsOnASingleLine: true
-AllowShortFunctionsOnASingleLine: All
+AlignOperands:   false
+AlignTrailingComments: false
+AllowAllParametersOfDeclarationOnNextLine: false
+AllowShortBlocksOnASingleLine: false
+AllowShortCaseLabelsOnASingleLine: false
+AllowShortFunctionsOnASingleLine: Empty
 AllowShortIfStatementsOnASingleLine: false
 AllowShortLoopsOnASingleLine: false
-AlwaysBreakAfterDefinitionReturnType: None
 AlwaysBreakAfterReturnType: None
-AlwaysBreakBeforeMultilineStrings: false
+AlwaysBreakBeforeMultilineStrings: true
 AlwaysBreakTemplateDeclarations: true
 BinPackArguments: false
 BinPackParameters: false
-BraceWrapping:   
-  AfterClass:      true
-  AfterControlStatement: true
-  AfterEnum:       true
-  AfterFunction:   true
+BraceWrapping:
+  AfterClass:      false
+  AfterControlStatement: false
+  AfterEnum:       false
+  AfterFunction:   false
   AfterNamespace:  false
-  AfterObjCDeclaration: true
-  AfterStruct:     true
-  AfterUnion:      true
-  BeforeCatch:     true
-  BeforeElse:      true
+  AfterObjCDeclaration: false
+  AfterStruct:     false
+  AfterUnion:      false
+  BeforeCatch:     false
+  BeforeElse:      false
   IndentBraces:    false
 BreakBeforeBinaryOperators: None
-BreakBeforeBraces: Custom
+BreakBeforeBraces: Attach
 BreakBeforeTernaryOperators: true
 BreakConstructorInitializersBeforeComma: false
-ColumnLimit:     100
+BreakAfterJavaFieldAnnotations: false
+BreakStringLiterals: false
+ColumnLimit:     80
 CommentPragmas:  '^ IWYU pragma:'
+#CompactNamespaces: false
 ConstructorInitializerAllOnOneLineOrOnePerLine: true
 ConstructorInitializerIndentWidth: 4
 ContinuationIndentWidth: 4
 Cpp11BracedListStyle: true
 DerivePointerAlignment: false
 DisableFormat:   false
-ExperimentalAutoDetectBinPacking: false
-ForEachMacros:   [ foreach, Q_FOREACH, BOOST_FOREACH ]
-IncludeCategories: 
-  - Regex:           '^"(llvm|llvm-c|clang|clang-c)/'
+ForEachMacros:   [ FOR_EACH_RANGE, FOR_EACH, ]
+IncludeCategories:
+  - Regex:           '^<.*\.h(pp)?>'
+    Priority:        1
+  - Regex:           '^<.*'
     Priority:        2
-  - Regex:           '^(<|"(gtest|isl|json)/)'
-    Priority:        3
   - Regex:           '.*'
-    Priority:        1
-IndentCaseLabels: false
-IndentWidth:     4
+    Priority:        3
+IndentCaseLabels: true
+IndentWidth:     2
 IndentWrappedFunctionNames: false
-KeepEmptyLinesAtTheStartOfBlocks: true
+KeepEmptyLinesAtTheStartOfBlocks: false
 MacroBlockBegin: ''
 MacroBlockEnd:   ''
 MaxEmptyLinesToKeep: 1
 NamespaceIndentation: None
 ObjCBlockIndentWidth: 2
 ObjCSpaceAfterProperty: false
-ObjCSpaceBeforeProtocolList: true
-PenaltyBreakBeforeFirstCallParameter: 19
+ObjCSpaceBeforeProtocolList: false
+PenaltyBreakBeforeFirstCallParameter: 1
 PenaltyBreakComment: 300
 PenaltyBreakFirstLessLess: 120
 PenaltyBreakString: 1000
 PenaltyExcessCharacter: 1000000
-PenaltyReturnTypeOnItsOwnLine: 60
+PenaltyReturnTypeOnItsOwnLine: 2000000
 PointerAlignment: Left
 ReflowComments:  true
-SortIncludes:    false
+SortIncludes:    true
 SpaceAfterCStyleCast: false
-# SpaceAfterTemplateKeyword: true
 SpaceBeforeAssignmentOperators: true
-SpaceBeforeParens: Never
+SpaceBeforeParens: ControlStatements
 SpaceInEmptyParentheses: false
 SpacesBeforeTrailingComments: 1
 SpacesInAngles:  false
@@ -87,4 +86,3 @@ Standard:        Cpp11
 TabWidth:        8
 UseTab:          Never
 ...
-
diff --git a/xformers/csrc/attention/attention.cpp b/xformers/csrc/attention/attention.cpp
index e798bc61d..36a9675e7 100644
--- a/xformers/csrc/attention/attention.cpp
+++ b/xformers/csrc/attention/attention.cpp
@@ -7,42 +7,43 @@
  */
 #include <torch/types.h>
 
-TORCH_LIBRARY_FRAGMENT(xformers, m)
-{
+TORCH_LIBRARY_FRAGMENT(xformers, m) {
 #if !defined(USE_ROCM)
-    m.def(TORCH_SELECTIVE_SCHEMA(
-        "xformers::efficient_attention_forward_small_k(Tensor query, Tensor key, Tensor value, "
-        "bool compute_logsumexp, Tensor? attn_bias, float p) -> (Tensor, Tensor, int, int)"));
-    m.def(TORCH_SELECTIVE_SCHEMA(
-        "xformers::efficient_attention_forward_cutlass(Tensor query, Tensor key, Tensor value, "
-        "Tensor? attn_bias, Tensor? seqstart_q, Tensor? seqstart_k, int? max_seqlen_q, float "
-        "dropout_p, bool compute_logsumexp, int custom_mask_type, float? scale, Tensor? seqlen_k, "
-        "int? window_size) -> (Tensor, Tensor, int, int)"));
-    m.def(
-        TORCH_SELECTIVE_SCHEMA("xformers::efficient_attention_forward_decoder(Tensor query, Tensor "
-                               "key, Tensor value, Tensor seq_positions, float scale) -> Tensor"));
-    m.def(TORCH_SELECTIVE_SCHEMA(
-        "xformers::efficient_attention_backward_small_k(Tensor grad_out, Tensor query, Tensor key, "
-        "Tensor value, Tensor logsumexp, Tensor output, Tensor? attn_bias, float p, int rng_seed, "
-        "int rng_offset) -> (Tensor, Tensor, Tensor)"));
-    m.def(TORCH_SELECTIVE_SCHEMA(
-        "xformers::efficient_attention_backward_cutlass(Tensor grad_out, Tensor query, Tensor key, "
-        "Tensor value, Tensor? bias, Tensor? cu_seqlens_q, Tensor? cu_seqlens_k, int max_seqlen_q, "
-        "int max_seqlen_k, Tensor logsumexp, Tensor output, float dropout_p, int rng_seed, int "
-        "rng_offset, int custom_mask_type, float? scale, int num_splits_key, int? window_size) -> "
-        "(Tensor, Tensor, Tensor, Tensor)"));
-    m.def(TORCH_SELECTIVE_SCHEMA("xformers::_temp_dropout(Tensor out, float p) -> Tensor"));
-    m.def(TORCH_SELECTIVE_SCHEMA("xformers::_cutlass_rand_uniform(float p, Tensor out) -> Tensor"));
+  m.def(TORCH_SELECTIVE_SCHEMA(
+      "xformers::efficient_attention_forward_small_k(Tensor query, Tensor key, Tensor value, "
+      "bool compute_logsumexp, Tensor? attn_bias, float p) -> (Tensor, Tensor, int, int)"));
+  m.def(TORCH_SELECTIVE_SCHEMA(
+      "xformers::efficient_attention_forward_cutlass(Tensor query, Tensor key, Tensor value, "
+      "Tensor? attn_bias, Tensor? seqstart_q, Tensor? seqstart_k, int? max_seqlen_q, float "
+      "dropout_p, bool compute_logsumexp, int custom_mask_type, float? scale, Tensor? seqlen_k, "
+      "int? window_size) -> (Tensor, Tensor, int, int)"));
+  m.def(TORCH_SELECTIVE_SCHEMA(
+      "xformers::efficient_attention_forward_decoder(Tensor query, Tensor "
+      "key, Tensor value, Tensor seq_positions, float scale) -> Tensor"));
+  m.def(TORCH_SELECTIVE_SCHEMA(
+      "xformers::efficient_attention_backward_small_k(Tensor grad_out, Tensor query, Tensor key, "
+      "Tensor value, Tensor logsumexp, Tensor output, Tensor? attn_bias, float p, int rng_seed, "
+      "int rng_offset) -> (Tensor, Tensor, Tensor)"));
+  m.def(TORCH_SELECTIVE_SCHEMA(
+      "xformers::efficient_attention_backward_cutlass(Tensor grad_out, Tensor query, Tensor key, "
+      "Tensor value, Tensor? bias, Tensor? cu_seqlens_q, Tensor? cu_seqlens_k, int max_seqlen_q, "
+      "int max_seqlen_k, Tensor logsumexp, Tensor output, float dropout_p, int rng_seed, int "
+      "rng_offset, int custom_mask_type, float? scale, int num_splits_key, int? window_size) -> "
+      "(Tensor, Tensor, Tensor, Tensor)"));
+  m.def(TORCH_SELECTIVE_SCHEMA(
+      "xformers::_temp_dropout(Tensor out, float p) -> Tensor"));
+  m.def(TORCH_SELECTIVE_SCHEMA(
+      "xformers::_cutlass_rand_uniform(float p, Tensor out) -> Tensor"));
 #endif
 #if defined(USE_ROCM)
   m.def(TORCH_SELECTIVE_SCHEMA(
       "xformers::efficient_attention_forward_ck(Tensor query, "
-          "Tensor key, Tensor value, Tensor? attn_bias, Tensor? seqstart_q, "
-          "Tensor? seqstart_k, int? max_seqlen_q, float dropout_p, "
-          "bool compute_logsumexp, int custom_mask_type, float? scale, Tensor? seqlen_k, int? window_size) -> (Tensor, Tensor, int, int)"));
+      "Tensor key, Tensor value, Tensor? attn_bias, Tensor? seqstart_q, "
+      "Tensor? seqstart_k, int? max_seqlen_q, float dropout_p, "
+      "bool compute_logsumexp, int custom_mask_type, float? scale, Tensor? seqlen_k, int? window_size) -> (Tensor, Tensor, int, int)"));
   m.def(TORCH_SELECTIVE_SCHEMA(
       "xformers::efficient_attention_forward_decoder_ck(Tensor query, "
-          "Tensor key, Tensor value, Tensor? seq_positions, float scale) -> Tensor"));
+      "Tensor key, Tensor value, Tensor? seq_positions, float scale) -> Tensor"));
   m.def(TORCH_SELECTIVE_SCHEMA(
       "xformers::efficient_attention_forward_decoder_splitk_ck(Tensor query, Tensor key, "
       " Tensor value, Tensor? seq_positions, float scale, int split_k) -> Tensor"));
diff --git a/xformers/csrc/attention/hip_fmha/attention_backward_generic.cpp b/xformers/csrc/attention/hip_fmha/attention_backward_generic.cpp
index 282b9aabd..4a4a06d71 100644
--- a/xformers/csrc/attention/hip_fmha/attention_backward_generic.cpp
+++ b/xformers/csrc/attention/hip_fmha/attention_backward_generic.cpp
@@ -17,14 +17,23 @@
 #include "ck_fmha_params.h"
 #include "ck_fmha_util.h"
 
-extern void batched_backward_fp16(BatchedBackwardParams& param, hipStream_t stream);
-extern void batched_backward_bp16(BatchedBackwardParams& param, hipStream_t stream);
-extern void grouped_backward_fp16(GroupedBackwardParams& param, hipStream_t stream);
-extern void grouped_backward_bp16(GroupedBackwardParams& param, hipStream_t stream);
+extern void batched_backward_fp16(
+    BatchedBackwardParams& param,
+    hipStream_t stream);
+extern void batched_backward_bp16(
+    BatchedBackwardParams& param,
+    hipStream_t stream);
+extern void grouped_backward_fp16(
+    GroupedBackwardParams& param,
+    hipStream_t stream);
+extern void grouped_backward_bp16(
+    GroupedBackwardParams& param,
+    hipStream_t stream);
 
 namespace {
 
-std::tuple<at::Tensor, at::Tensor, at::Tensor, at::Tensor> efficient_attention_backward_ck(
+std::tuple<at::Tensor, at::Tensor, at::Tensor, at::Tensor>
+efficient_attention_backward_ck(
     const at::Tensor& grad_out,
     const at::Tensor& query,
     const at::Tensor& key,
@@ -41,527 +50,524 @@ std::tuple<at::Tensor, at::Tensor, at::Tensor, at::Tensor> efficient_attention_b
     const c10::optional<at::Tensor>& seqlen_k,
     const at::Tensor& logsumexp,
     const at::Tensor& out,
-    double dropout_p,   // dropout probability
-    int64_t rng_seed,   // seed using for generating random numbers for dropout
+    double dropout_p, // dropout probability
+    int64_t rng_seed, // seed using for generating random numbers for dropout
     int64_t rng_offset, // offset into random number sequence
     int64_t custom_mask_type,
-    const c10::optional<double> scale)
-{
+    const c10::optional<double> scale) {
 #ifdef XFORMERS_MEM_EFF_ATTENTION_DISABLE_BACKWARD
-    TORCH_CHECK(false,
-                "MemoryEfficient build has been disabled at build time with "
-                "-DXFORMERS_MEM_EFF_ATTENTION_DISABLE_BACKWARD");
+  TORCH_CHECK(
+      false,
+      "MemoryEfficient build has been disabled at build time with "
+      "-DXFORMERS_MEM_EFF_ATTENTION_DISABLE_BACKWARD");
 #else
-    at::globalContext().alertNotDeterministic("mem_efficient_attention_backward_cutlass");
-
-    // ndim
-    TORCH_CHECK(query.dim() == grad_out.dim());
-    TORCH_CHECK(query.dim() == key.dim());
-    TORCH_CHECK(query.dim() == value.dim());
-    TORCH_CHECK(query.dim() == 4);
-
-    // batch size
-    TORCH_CHECK(query.size(0) == grad_out.size(0));
-    TORCH_CHECK(query.size(0) == key.size(0));
-    TORCH_CHECK(query.size(0) == value.size(0));
-
-    // seqlen
-    TORCH_CHECK(key.size(1) == value.size(1));
-    TORCH_CHECK(query.size(1) == grad_out.size(1));
-
-    // Num heads
-    TORCH_CHECK(query.size(2) % key.size(2) == 0);
-    TORCH_CHECK(key.size(2) == value.size(2));
-    TORCH_CHECK(query.size(2) == grad_out.size(2));
-
-    // Embedding per head
-    TORCH_CHECK(query.size(3) == key.size(3));
-    TORCH_CHECK(value.size(3) == grad_out.size(3));
-
-    // CK-FlashAttn requires out, grad_out to have same shapes
-    TORCH_CHECK(out.sizes() == grad_out.sizes());
-    TORCH_CHECK(out.strides() == grad_out.strides());
-
-    // last dim is contiguous, device is CUDA
-    CHECK_NOSPARSE_LASTCONTIGUOUS_CUDA(grad_out);
-    CHECK_NOSPARSE_LASTCONTIGUOUS_CUDA(query);
-    CHECK_NOSPARSE_LASTCONTIGUOUS_CUDA(key);
-    CHECK_NOSPARSE_LASTCONTIGUOUS_CUDA(value);
-
-    // logsumexp should be completely contiguous
-    CHECK_NOSPARSE_CONTIGUOUS_CUDA(logsumexp);
-
-    TORCH_CHECK(seqstart_q.has_value() == seqstart_k.has_value());
-    TORCH_CHECK(!(seqstart_q.has_value() && bias.has_value()), "seqstart_q + bias not supported");
-
-    if(seqstart_q.has_value())
-    {
-        TORCH_CHECK(seqstart_q->scalar_type() == at::ScalarType::Int);
-        TORCH_CHECK(seqstart_k->scalar_type() == at::ScalarType::Int);
-        TORCH_CHECK(seqstart_q->dim() == 1 && seqstart_k->dim() == 1);
-        CHECK_NOSPARSE_CONTIGUOUS_CPU((*seqstart_q));
-        CHECK_NOSPARSE_CONTIGUOUS_CPU((*seqstart_k));
-        TORCH_CHECK(seqstart_q->size(0) == seqstart_k->size(0));
-        TORCH_CHECK(query.size(0) == 1, "seqstart_q only supports batch_size=1");
-        TORCH_CHECK(max_seqlen_q_.has_value());
-    }
-
-    bool use_fp32_qkv_grad = false;
-
-    if(const char* env_str = std::getenv("USE_FP32_QKV_GRAD"))
-    {
-        use_fp32_qkv_grad = (std::stoi(env_str) > 0) ? true : false;
-    };
+  at::globalContext().alertNotDeterministic(
+      "mem_efficient_attention_backward_cutlass");
+
+  // ndim
+  TORCH_CHECK(query.dim() == grad_out.dim());
+  TORCH_CHECK(query.dim() == key.dim());
+  TORCH_CHECK(query.dim() == value.dim());
+  TORCH_CHECK(query.dim() == 4);
+
+  // batch size
+  TORCH_CHECK(query.size(0) == grad_out.size(0));
+  TORCH_CHECK(query.size(0) == key.size(0));
+  TORCH_CHECK(query.size(0) == value.size(0));
+
+  // seqlen
+  TORCH_CHECK(key.size(1) == value.size(1));
+  TORCH_CHECK(query.size(1) == grad_out.size(1));
+
+  // Num heads
+  TORCH_CHECK(query.size(2) % key.size(2) == 0);
+  TORCH_CHECK(key.size(2) == value.size(2));
+  TORCH_CHECK(query.size(2) == grad_out.size(2));
+
+  // Embedding per head
+  TORCH_CHECK(query.size(3) == key.size(3));
+  TORCH_CHECK(value.size(3) == grad_out.size(3));
+
+  // CK-FlashAttn requires out, grad_out to have same shapes
+  TORCH_CHECK(out.sizes() == grad_out.sizes());
+  TORCH_CHECK(out.strides() == grad_out.strides());
+
+  // last dim is contiguous, device is CUDA
+  CHECK_NOSPARSE_LASTCONTIGUOUS_CUDA(grad_out);
+  CHECK_NOSPARSE_LASTCONTIGUOUS_CUDA(query);
+  CHECK_NOSPARSE_LASTCONTIGUOUS_CUDA(key);
+  CHECK_NOSPARSE_LASTCONTIGUOUS_CUDA(value);
+
+  // logsumexp should be completely contiguous
+  CHECK_NOSPARSE_CONTIGUOUS_CUDA(logsumexp);
+
+  TORCH_CHECK(seqstart_q.has_value() == seqstart_k.has_value());
+  TORCH_CHECK(
+      !(seqstart_q.has_value() && bias.has_value()),
+      "seqstart_q + bias not supported");
+
+  if (seqstart_q.has_value()) {
+    TORCH_CHECK(seqstart_q->scalar_type() == at::ScalarType::Int);
+    TORCH_CHECK(seqstart_k->scalar_type() == at::ScalarType::Int);
+    TORCH_CHECK(seqstart_q->dim() == 1 && seqstart_k->dim() == 1);
+    CHECK_NOSPARSE_CONTIGUOUS_CPU((*seqstart_q));
+    CHECK_NOSPARSE_CONTIGUOUS_CPU((*seqstart_k));
+    TORCH_CHECK(seqstart_q->size(0) == seqstart_k->size(0));
+    TORCH_CHECK(query.size(0) == 1, "seqstart_q only supports batch_size=1");
+    TORCH_CHECK(max_seqlen_q_.has_value());
+  }
+
+  bool use_fp32_qkv_grad = false;
+
+  if (const char* env_str = std::getenv("USE_FP32_QKV_GRAD")) {
+    use_fp32_qkv_grad = (std::stoi(env_str) > 0) ? true : false;
+  };
+
+  // at::cuda::CUDAGuard device_guard(query.device());
+  hipStream_t stream = at::cuda::getCurrentHIPStream().stream();
+
+  int64_t B = query.size(0);
+  int64_t M = query.size(1);
+  int64_t N = key.size(1);
+  int64_t Hq = query.size(2);
+  int64_t Hkv = key.size(2);
+  int64_t K = query.size(3);
+  int64_t Kv = value.size(3);
+
+  auto opts = query.options();
+
+  at::Tensor grad_q, grad_k, grad_v, grad_bias;
+
+  if (query.size(1) == key.size(1) && query.size(3) == value.size(3) &&
+      query.size(2) == key.size(2) &&
+      query.storage().is_alias_of(key.storage()) &&
+      query.storage().is_alias_of(value.storage())) {
+    // Create one big contiguous chunk for grad_q, grad_k, grad_v
+    // This is because q, k and v usually come from a single
+    // output of a linear layer that is chunked.
+    // Creating the gradients with the right layout saves us
+    // a `torch.cat` call in the backward pass
+    at::Tensor chunk;
+    if (use_fp32_qkv_grad)
+      chunk = at::empty({B, M, 3, Hq, K}, opts.dtype(at::kFloat));
+    else
+      chunk = at::empty({B, M, 3, Hq, K}, opts);
+    grad_q = chunk.select(2, 0);
+    grad_k = chunk.select(2, 1);
+    grad_v = chunk.select(2, 2);
+    grad_q.fill_(0);
+  } else if (
+      key.size(3) == value.size(3) &&
+      key.storage().is_alias_of(value.storage())) {
+    // Create one big contiguous chunk for grad_k, grad_v
+    // This is because k and v usually come from a single
+    // output of a linear layer that is chunked.
+    // Creating the gradients with the right layout saves us
+    // a `torch.cat` call in the backward pass
+    at::Tensor chunk;
+    if (use_fp32_qkv_grad)
+      chunk = at::empty({B, N, 2, Hkv, Kv}, opts.dtype(at::kFloat));
+    else
+      chunk = at::empty({B, N, 2, Hkv, Kv}, opts);
+    grad_k = chunk.select(2, 0);
+    grad_v = chunk.select(2, 1);
 
-    // at::cuda::CUDAGuard device_guard(query.device());
-    hipStream_t stream = at::cuda::getCurrentHIPStream().stream();
-
-    int64_t B   = query.size(0);
-    int64_t M   = query.size(1);
-    int64_t N   = key.size(1);
-    int64_t Hq  = query.size(2);
-    int64_t Hkv = key.size(2);
-    int64_t K   = query.size(3);
-    int64_t Kv  = value.size(3);
-
-    auto opts = query.options();
-
-    at::Tensor grad_q, grad_k, grad_v, grad_bias;
-
-    if(query.size(1) == key.size(1) && query.size(3) == value.size(3) &&
-       query.size(2) == key.size(2) && query.storage().is_alias_of(key.storage()) &&
-       query.storage().is_alias_of(value.storage()))
-    {
-        // Create one big contiguous chunk for grad_q, grad_k, grad_v
-        // This is because q, k and v usually come from a single
-        // output of a linear layer that is chunked.
-        // Creating the gradients with the right layout saves us
-        // a `torch.cat` call in the backward pass
-        at::Tensor chunk;
-        if(use_fp32_qkv_grad)
-            chunk = at::empty({B, M, 3, Hq, K}, opts.dtype(at::kFloat));
-        else
-            chunk = at::empty({B, M, 3, Hq, K}, opts);
-        grad_q = chunk.select(2, 0);
-        grad_k = chunk.select(2, 1);
-        grad_v = chunk.select(2, 2);
-        grad_q.fill_(0);
+    if (use_fp32_qkv_grad)
+      grad_q = at::empty_strided(
+          query.sizes(), query.strides(), query.options().dtype(at::kFloat));
+    else
+      grad_q =
+          at::empty_strided(query.sizes(), query.strides(), query.options());
+    grad_q.fill_(0);
+  } else {
+    if (use_fp32_qkv_grad) {
+      grad_q = at::empty_strided(
+          query.sizes(), query.strides(), query.options().dtype(at::kFloat));
+      grad_k = at::empty_strided(
+          key.sizes(), key.strides(), key.options().dtype(at::kFloat));
+      grad_v = at::empty_strided(
+          value.sizes(), value.strides(), value.options().dtype(at::kFloat));
+    } else {
+      grad_q =
+          at::empty_strided(query.sizes(), query.strides(), query.options());
+      grad_k = at::empty_strided(key.sizes(), key.strides(), key.options());
+      grad_v =
+          at::empty_strided(value.sizes(), value.strides(), value.options());
     }
-    else if(key.size(3) == value.size(3) && key.storage().is_alias_of(value.storage()))
-    {
-        // Create one big contiguous chunk for grad_k, grad_v
-        // This is because k and v usually come from a single
-        // output of a linear layer that is chunked.
-        // Creating the gradients with the right layout saves us
-        // a `torch.cat` call in the backward pass
-        at::Tensor chunk;
-        if(use_fp32_qkv_grad)
-            chunk = at::empty({B, N, 2, Hkv, Kv}, opts.dtype(at::kFloat));
-        else
-            chunk = at::empty({B, N, 2, Hkv, Kv}, opts);
-        grad_k = chunk.select(2, 0);
-        grad_v = chunk.select(2, 1);
-
-        if(use_fp32_qkv_grad)
-            grad_q = at::empty_strided(
-                query.sizes(), query.strides(), query.options().dtype(at::kFloat));
-        else
-            grad_q = at::empty_strided(query.sizes(), query.strides(), query.options());
-        grad_q.fill_(0);
+    grad_q.fill_(0);
+  }
+
+  // CK-FlashAttn requires q/k/v to have same shapes with dQ/dK/dV respectively
+  TORCH_CHECK(query.sizes() == grad_q.sizes());
+  TORCH_CHECK(query.strides() == grad_q.strides());
+  TORCH_CHECK(key.sizes() == grad_k.sizes());
+  TORCH_CHECK(key.strides() == grad_k.strides());
+  TORCH_CHECK(value.sizes() == grad_v.sizes());
+  TORCH_CHECK(value.strides() == grad_v.strides());
+
+  const bool bias_requires_grad = bias.has_value() && bias->requires_grad();
+
+  // even it is an output, the grad_bias is required to use the same data-type
+  // as bias in CK-FlashAttn
+  if (bias_requires_grad)
+    grad_bias =
+        at::empty_strided(bias->sizes(), bias->strides(), bias->options());
+
+  bool is_mqa_gqa = (Hq > Hkv);
+
+  at::Tensor tmp_grad_k, tmp_grad_v;
+
+  if (is_mqa_gqa) {
+    // allocate tmp_grad_k/tmp_grad_v which will be reduce to
+    // grad_k/grad_v for returning
+    if (use_fp32_qkv_grad) {
+      tmp_grad_k = at::empty({B, N, Hq, K}, opts.dtype(at::kFloat));
+      tmp_grad_v = at::empty({B, N, Hq, Kv}, opts.dtype(at::kFloat));
+    } else {
+      tmp_grad_k = at::empty({B, N, Hq, K}, opts);
+      tmp_grad_v = at::empty({B, N, Hq, Kv}, opts);
     }
-    else
-    {
-        if(use_fp32_qkv_grad)
-        {
-            grad_q = at::empty_strided(
-                query.sizes(), query.strides(), query.options().dtype(at::kFloat));
-            grad_k = at::empty_strided(key.sizes(), key.strides(), key.options().dtype(at::kFloat));
-            grad_v = at::empty_strided(
-                value.sizes(), value.strides(), value.options().dtype(at::kFloat));
-        }
-        else
-        {
-            grad_q = at::empty_strided(query.sizes(), query.strides(), query.options());
-            grad_k = at::empty_strided(key.sizes(), key.strides(), key.options());
-            grad_v = at::empty_strided(value.sizes(), value.strides(), value.options());
-        }
-        grad_q.fill_(0);
+  }
+
+  auto set_batched_backward_params = [&](BatchedBackwardParams& p) {
+    p.B = B;
+    p.M = M;
+    p.N = N;
+    p.Hq = Hq;
+    p.Hkv = Hkv;
+    p.K = K;
+    p.Kv = Kv;
+
+    p.use_fp32_qkv_grad = use_fp32_qkv_grad;
+    p.is_mqa_gqa = is_mqa_gqa;
+
+    TORCH_CHECK(p.B == logsumexp.size(0));
+    TORCH_CHECK(p.Hq == logsumexp.size(1));
+    TORCH_CHECK(p.M == logsumexp.size(2));
+
+    if (scale.has_value()) {
+      p.scale = float(*scale);
+    } else {
+      p.scale = float(1.0 / std::sqrt(float(K)));
     }
 
-    // CK-FlashAttn requires q/k/v to have same shapes with dQ/dK/dV respectively
-    TORCH_CHECK(query.sizes() == grad_q.sizes());
-    TORCH_CHECK(query.strides() == grad_q.strides());
-    TORCH_CHECK(key.sizes() == grad_k.sizes());
-    TORCH_CHECK(key.strides() == grad_k.strides());
-    TORCH_CHECK(value.sizes() == grad_v.sizes());
-    TORCH_CHECK(value.strides() == grad_v.strides());
-
-    const bool bias_requires_grad = bias.has_value() && bias->requires_grad();
-
-    // even it is an output, the grad_bias is required to use the same data-type
-    // as bias in CK-FlashAttn
-    if(bias_requires_grad)
-        grad_bias = at::empty_strided(bias->sizes(), bias->strides(), bias->options());
-
-    bool is_mqa_gqa = (Hq > Hkv);
-
-    at::Tensor tmp_grad_k, tmp_grad_v;
-
-    if(is_mqa_gqa)
-    {
-        // allocate tmp_grad_k/tmp_grad_v which will be reduce to
-        // grad_k/grad_v for returning
-        if(use_fp32_qkv_grad)
-        {
-            tmp_grad_k = at::empty({B, N, Hq, K}, opts.dtype(at::kFloat));
-            tmp_grad_v = at::empty({B, N, Hq, Kv}, opts.dtype(at::kFloat));
-        }
-        else
-        {
-            tmp_grad_k = at::empty({B, N, Hq, K}, opts);
-            tmp_grad_v = at::empty({B, N, Hq, Kv}, opts);
-        }
+    p.q_ptr = query.data_ptr();
+    p.k_ptr = key.data_ptr();
+    p.v_ptr = value.data_ptr();
+    p.grad_out_ptr = grad_out.data_ptr();
+    p.out_ptr = out.data_ptr();
+
+    p.grad_q_ptr = grad_q.data_ptr();
+    p.grad_k_ptr = is_mqa_gqa ? tmp_grad_k.data_ptr() : grad_k.data_ptr();
+    p.grad_v_ptr = is_mqa_gqa ? tmp_grad_v.data_ptr() : grad_v.data_ptr();
+
+    p.q_strides = {
+        static_cast<int>(query.stride(0)),
+        static_cast<int>(query.stride(1)),
+        static_cast<int>(query.stride(2)),
+        static_cast<int>(query.stride(3))};
+    p.k_strides = {
+        static_cast<int>(key.stride(0)),
+        static_cast<int>(key.stride(1)),
+        static_cast<int>(key.stride(2)),
+        static_cast<int>(key.stride(3))};
+    p.v_strides = {
+        static_cast<int>(value.stride(0)),
+        static_cast<int>(value.stride(1)),
+        static_cast<int>(value.stride(2)),
+        static_cast<int>(value.stride(3))};
+    p.out_strides = {
+        static_cast<int>(out.stride(0)),
+        static_cast<int>(out.stride(1)),
+        static_cast<int>(out.stride(2)),
+        static_cast<int>(out.stride(3))};
+
+    if (is_mqa_gqa) {
+      p.tmp_grad_k_strides = {
+          static_cast<int>(tmp_grad_k.stride(0)),
+          static_cast<int>(tmp_grad_k.stride(1)),
+          static_cast<int>(tmp_grad_k.stride(2)),
+          static_cast<int>(tmp_grad_k.stride(3))};
+      p.tmp_grad_v_strides = {
+          static_cast<int>(tmp_grad_v.stride(0)),
+          static_cast<int>(tmp_grad_v.stride(1)),
+          static_cast<int>(tmp_grad_v.stride(2)),
+          static_cast<int>(tmp_grad_v.stride(3))};
     }
 
-    auto set_batched_backward_params = [&](BatchedBackwardParams& p) {
-        p.B   = B;
-        p.M   = M;
-        p.N   = N;
-        p.Hq  = Hq;
-        p.Hkv = Hkv;
-        p.K   = K;
-        p.Kv  = Kv;
-
-        p.use_fp32_qkv_grad = use_fp32_qkv_grad;
-        p.is_mqa_gqa        = is_mqa_gqa;
-
-        TORCH_CHECK(p.B == logsumexp.size(0));
-        TORCH_CHECK(p.Hq == logsumexp.size(1));
-        TORCH_CHECK(p.M == logsumexp.size(2));
-
-        if(scale.has_value())
-        {
-            p.scale = float(*scale);
-        }
-        else
-        {
-            p.scale = float(1.0 / std::sqrt(float(K)));
-        }
-
-        p.q_ptr        = query.data_ptr();
-        p.k_ptr        = key.data_ptr();
-        p.v_ptr        = value.data_ptr();
-        p.grad_out_ptr = grad_out.data_ptr();
-        p.out_ptr      = out.data_ptr();
-
-        p.grad_q_ptr = grad_q.data_ptr();
-        p.grad_k_ptr = is_mqa_gqa ? tmp_grad_k.data_ptr() : grad_k.data_ptr();
-        p.grad_v_ptr = is_mqa_gqa ? tmp_grad_v.data_ptr() : grad_v.data_ptr();
-
-        p.q_strides   = {static_cast<int>(query.stride(0)),
-                       static_cast<int>(query.stride(1)),
-                       static_cast<int>(query.stride(2)),
-                       static_cast<int>(query.stride(3))};
-        p.k_strides   = {static_cast<int>(key.stride(0)),
-                       static_cast<int>(key.stride(1)),
-                       static_cast<int>(key.stride(2)),
-                       static_cast<int>(key.stride(3))};
-        p.v_strides   = {static_cast<int>(value.stride(0)),
-                       static_cast<int>(value.stride(1)),
-                       static_cast<int>(value.stride(2)),
-                       static_cast<int>(value.stride(3))};
-        p.out_strides = {static_cast<int>(out.stride(0)),
-                         static_cast<int>(out.stride(1)),
-                         static_cast<int>(out.stride(2)),
-                         static_cast<int>(out.stride(3))};
-
-        if(is_mqa_gqa)
-        {
-            p.tmp_grad_k_strides = {static_cast<int>(tmp_grad_k.stride(0)),
-                                    static_cast<int>(tmp_grad_k.stride(1)),
-                                    static_cast<int>(tmp_grad_k.stride(2)),
-                                    static_cast<int>(tmp_grad_k.stride(3))};
-            p.tmp_grad_v_strides = {static_cast<int>(tmp_grad_v.stride(0)),
-                                    static_cast<int>(tmp_grad_v.stride(1)),
-                                    static_cast<int>(tmp_grad_v.stride(2)),
-                                    static_cast<int>(tmp_grad_v.stride(3))};
-        }
-
-        if(bias.has_value())
-        {
-            CHECK_NOSPARSE_LASTCONTIGUOUS_CUDA((*bias));
-            TORCH_CHECK(bias->scalar_type() == query.scalar_type());
-
-            p.has_attn_bias = true;
-            p.attn_bias_ptr = bias->data_ptr();
+    if (bias.has_value()) {
+      CHECK_NOSPARSE_LASTCONTIGUOUS_CUDA((*bias));
+      TORCH_CHECK(bias->scalar_type() == query.scalar_type());
 
-            const at::Tensor bias_4d_view = get_bias_4d_view(*bias, B, Hq, M, N);
+      p.has_attn_bias = true;
+      p.attn_bias_ptr = bias->data_ptr();
 
-            p.attn_bias_strides = {static_cast<int>(bias_4d_view.stride(0)),
-                                   static_cast<int>(bias_4d_view.stride(1)),
-                                   static_cast<int>(bias_4d_view.stride(2)),
-                                   static_cast<int>(bias_4d_view.stride(3))};
+      const at::Tensor bias_4d_view = get_bias_4d_view(*bias, B, Hq, M, N);
 
-            if(bias_requires_grad)
-                p.grad_bias_ptr = grad_bias.data_ptr();
-        }
-        else
-        {
-            p.has_attn_bias = true;
-            p.attn_bias_ptr = nullptr;
-            p.grad_bias_ptr = nullptr;
-        }
+      p.attn_bias_strides = {
+          static_cast<int>(bias_4d_view.stride(0)),
+          static_cast<int>(bias_4d_view.stride(1)),
+          static_cast<int>(bias_4d_view.stride(2)),
+          static_cast<int>(bias_4d_view.stride(3))};
 
-        p.bias_has_grad = bias_requires_grad;
+      if (bias_requires_grad)
+        p.grad_bias_ptr = grad_bias.data_ptr();
+    } else {
+      p.has_attn_bias = true;
+      p.attn_bias_ptr = nullptr;
+      p.grad_bias_ptr = nullptr;
+    }
 
-        p.custom_mask_type = custom_mask_type;
+    p.bias_has_grad = bias_requires_grad;
 
-        p.dropout_prob  = static_cast<float>(dropout_p);
-        p.philox_seed   = rng_seed;
-        p.philox_offset = rng_offset;
+    p.custom_mask_type = custom_mask_type;
 
-        p.logsumexp_ptr = logsumexp.data_ptr();
-    };
+    p.dropout_prob = static_cast<float>(dropout_p);
+    p.philox_seed = rng_seed;
+    p.philox_offset = rng_offset;
 
-    auto set_grouped_backward_params = [&](GroupedBackwardParams& p) {
-        p.num_batches = seqstart_q->size(0) - 1;
-        p.M           = M;
-        p.N           = N;
-        p.Hq          = Hq;
-        p.Hkv         = Hkv;
-        p.K           = K;
-        p.Kv          = Kv;
+    p.logsumexp_ptr = logsumexp.data_ptr();
+  };
 
-        p.use_fp32_qkv_grad = use_fp32_qkv_grad;
-        p.is_mqa_gqa        = is_mqa_gqa;
+  auto set_grouped_backward_params = [&](GroupedBackwardParams& p) {
+    p.num_batches = seqstart_q->size(0) - 1;
+    p.M = M;
+    p.N = N;
+    p.Hq = Hq;
+    p.Hkv = Hkv;
+    p.K = K;
+    p.Kv = Kv;
 
-        p.max_seqlen_q = *max_seqlen_q_;
+    p.use_fp32_qkv_grad = use_fp32_qkv_grad;
+    p.is_mqa_gqa = is_mqa_gqa;
 
-        TORCH_CHECK(p.num_batches == logsumexp.size(0));
-        TORCH_CHECK(p.Hq == logsumexp.size(1));
-        TORCH_CHECK(p.max_seqlen_q == logsumexp.size(2));
+    p.max_seqlen_q = *max_seqlen_q_;
 
-        if(scale.has_value())
-        {
-            p.scale = float(*scale);
-        }
-        else
-        {
-            p.scale = float(1.0 / std::sqrt(float(K)));
-        }
+    TORCH_CHECK(p.num_batches == logsumexp.size(0));
+    TORCH_CHECK(p.Hq == logsumexp.size(1));
+    TORCH_CHECK(p.max_seqlen_q == logsumexp.size(2));
 
-        p.q_strides   = {static_cast<int>(query.stride(1)),
-                       static_cast<int>(query.stride(2)),
-                       static_cast<int>(query.stride(3))};
-        p.k_strides   = {static_cast<int>(key.stride(1)),
-                       static_cast<int>(key.stride(2)),
-                       static_cast<int>(key.stride(3))};
-        p.v_strides   = {static_cast<int>(value.stride(1)),
-                       static_cast<int>(value.stride(2)),
-                       static_cast<int>(value.stride(3))};
-        p.out_strides = {static_cast<int>(out.stride(1)),
-                         static_cast<int>(out.stride(2)),
-                         static_cast<int>(out.stride(3))};
-
-        if(is_mqa_gqa)
-        {
-            p.tmp_grad_k_strides = {static_cast<int>(tmp_grad_k.stride(1)),
-                                    static_cast<int>(tmp_grad_k.stride(2)),
-                                    static_cast<int>(tmp_grad_k.stride(3))};
-            p.tmp_grad_v_strides = {static_cast<int>(tmp_grad_v.stride(1)),
-                                    static_cast<int>(tmp_grad_v.stride(2)),
-                                    static_cast<int>(tmp_grad_v.stride(3))};
-        };
-
-        if(bias.has_value())
-        {
-            CHECK_NOSPARSE_LASTCONTIGUOUS_CUDA((*bias));
-            TORCH_CHECK(bias->scalar_type() == query.scalar_type());
-
-            p.has_attn_bias               = true;
-            const at::Tensor bias_4d_view = get_bias_4d_view(*bias, B, Hq, M, N);
-            p.attn_bias_strides           = {static_cast<int>(bias_4d_view.stride(0)),
-                                   static_cast<int>(bias_4d_view.stride(1)),
-                                   static_cast<int>(bias_4d_view.stride(2)),
-                                   static_cast<int>(bias_4d_view.stride(3))};
-        }
-        else
-            p.has_attn_bias = false;
+    if (scale.has_value()) {
+      p.scale = float(*scale);
+    } else {
+      p.scale = float(1.0 / std::sqrt(float(K)));
+    }
 
-        p.bias_has_grad = bias_requires_grad;
+    p.q_strides = {
+        static_cast<int>(query.stride(1)),
+        static_cast<int>(query.stride(2)),
+        static_cast<int>(query.stride(3))};
+    p.k_strides = {
+        static_cast<int>(key.stride(1)),
+        static_cast<int>(key.stride(2)),
+        static_cast<int>(key.stride(3))};
+    p.v_strides = {
+        static_cast<int>(value.stride(1)),
+        static_cast<int>(value.stride(2)),
+        static_cast<int>(value.stride(3))};
+    p.out_strides = {
+        static_cast<int>(out.stride(1)),
+        static_cast<int>(out.stride(2)),
+        static_cast<int>(out.stride(3))};
+
+    if (is_mqa_gqa) {
+      p.tmp_grad_k_strides = {
+          static_cast<int>(tmp_grad_k.stride(1)),
+          static_cast<int>(tmp_grad_k.stride(2)),
+          static_cast<int>(tmp_grad_k.stride(3))};
+      p.tmp_grad_v_strides = {
+          static_cast<int>(tmp_grad_v.stride(1)),
+          static_cast<int>(tmp_grad_v.stride(2)),
+          static_cast<int>(tmp_grad_v.stride(3))};
+    };
 
-        p.dropout_prob  = static_cast<float>(dropout_p);
-        p.philox_seed   = rng_seed;
-        p.philox_offset = rng_offset;
+    if (bias.has_value()) {
+      CHECK_NOSPARSE_LASTCONTIGUOUS_CUDA((*bias));
+      TORCH_CHECK(bias->scalar_type() == query.scalar_type());
 
-        p.custom_mask_type = custom_mask_type;
+      p.has_attn_bias = true;
+      const at::Tensor bias_4d_view = get_bias_4d_view(*bias, B, Hq, M, N);
+      p.attn_bias_strides = {
+          static_cast<int>(bias_4d_view.stride(0)),
+          static_cast<int>(bias_4d_view.stride(1)),
+          static_cast<int>(bias_4d_view.stride(2)),
+          static_cast<int>(bias_4d_view.stride(3))};
+    } else
+      p.has_attn_bias = false;
 
-        p.host_seqstart_q.resize(p.num_batches + 1);
-        p.host_seqstart_k.resize(p.num_batches + 1);
+    p.bias_has_grad = bias_requires_grad;
 
-        for(int i = 0; i < p.host_seqstart_q.size(); i++)
-            p.host_seqstart_q[i] = *(reinterpret_cast<int*>(seqstart_q->data_ptr()) + i);
+    p.dropout_prob = static_cast<float>(dropout_p);
+    p.philox_seed = rng_seed;
+    p.philox_offset = rng_offset;
 
-        for(int i = 0; i < p.host_seqstart_k.size(); i++)
-            p.host_seqstart_k[i] = *(reinterpret_cast<int*>(seqstart_k->data_ptr()) + i);
+    p.custom_mask_type = custom_mask_type;
 
-        if(seqlen_k.has_value())
-        {
-            TORCH_CHECK(seqlen_k->scalar_type() == at::ScalarType::Int);
-            TORCH_CHECK(seqlen_k->dim() == 1);
-            TORCH_CHECK(seqlen_k->size(0) == p.num_batches)
-            CHECK_NOSPARSE_CONTIGUOUS_CPU((*seqlen_k));
+    p.host_seqstart_q.resize(p.num_batches + 1);
+    p.host_seqstart_k.resize(p.num_batches + 1);
 
-            p.host_seqlen_k.resize(p.num_batches);
+    for (int i = 0; i < p.host_seqstart_q.size(); i++)
+      p.host_seqstart_q[i] =
+          *(reinterpret_cast<int*>(seqstart_q->data_ptr()) + i);
 
-            for(int i = 0; i < p.host_seqlen_k.size(); i++)
-                p.host_seqlen_k[i] = *(reinterpret_cast<int*>(seqlen_k->data_ptr()) + i);
-        }
+    for (int i = 0; i < p.host_seqstart_k.size(); i++)
+      p.host_seqstart_k[i] =
+          *(reinterpret_cast<int*>(seqstart_k->data_ptr()) + i);
 
-        char* q_ptr = reinterpret_cast<char*>(query.data_ptr());
-        char* k_ptr = reinterpret_cast<char*>(key.data_ptr());
-        char* v_ptr = reinterpret_cast<char*>(value.data_ptr());
-
-        char* out_ptr      = reinterpret_cast<char*>(out.data_ptr());
-        char* grad_out_ptr = reinterpret_cast<char*>(grad_out.data_ptr());
-        char* attn_bias_ptr =
-            bias.has_value() ? reinterpret_cast<char*>(bias->data_ptr()) : nullptr;
-
-        char* logsumexp_ptr = reinterpret_cast<char*>(logsumexp.data_ptr());
-
-        char* grad_q_ptr = reinterpret_cast<char*>(grad_q.data_ptr());
-        char* grad_k_ptr = is_mqa_gqa ? reinterpret_cast<char*>(tmp_grad_k.data_ptr())
-                                      : reinterpret_cast<char*>(grad_k.data_ptr());
-        char* grad_v_ptr = is_mqa_gqa ? reinterpret_cast<char*>(tmp_grad_v.data_ptr())
-                                      : reinterpret_cast<char*>(grad_v.data_ptr());
-        char* grad_bias_ptr =
-            bias_requires_grad ? reinterpret_cast<char*>(grad_bias.data_ptr()) : nullptr;
-
-        size_t multiplier = 1;
-
-        if(p.use_fp32_qkv_grad)
-            multiplier = get_size_in_bytes(1, at::ScalarType::Float) /
-                         get_size_in_bytes(1, query.scalar_type());
-
-        std::cout << "qkv-grad precision multiplier is " << multiplier << std::endl;
-
-        for(int i = 0; i < p.num_batches; i++)
-        {
-            size_t tmp_q_offset = get_size_in_bytes(
-                static_cast<size_t>(p.host_seqstart_q[i]) * p.q_strides[0], query.scalar_type());
-            size_t tmp_k_offset = get_size_in_bytes(
-                static_cast<size_t>(p.host_seqstart_k[i]) * p.k_strides[0], key.scalar_type());
-            size_t tmp_v_offset = get_size_in_bytes(
-                static_cast<size_t>(p.host_seqstart_k[i]) * p.v_strides[0], value.scalar_type());
-            size_t tmp_o_offset = get_size_in_bytes(
-                static_cast<size_t>(p.host_seqstart_q[i]) * p.out_strides[0], out.scalar_type());
-            size_t tmp_logsumexp_offset = get_size_in_bytes(
-                static_cast<size_t>(i) * p.Hq * p.max_seqlen_q, logsumexp.scalar_type());
-
-            size_t tmp_grad_k_offset =
-                is_mqa_gqa ? get_size_in_bytes(static_cast<size_t>(p.host_seqstart_k[i]) *
-                                                   p.tmp_grad_k_strides[0],
-                                               tmp_grad_k.scalar_type())
-                           : tmp_k_offset;
-            size_t tmp_grad_v_offset =
-                is_mqa_gqa ? get_size_in_bytes(static_cast<size_t>(p.host_seqstart_k[i]) *
-                                                   p.tmp_grad_v_strides[0],
-                                               tmp_grad_v.scalar_type())
-                           : tmp_v_offset;
-
-            p.q_ptrs.push_back(reinterpret_cast<void*>(&q_ptr[tmp_q_offset]));
-            p.grad_q_ptrs.push_back(
-                reinterpret_cast<void*>(&grad_q_ptr[tmp_q_offset * multiplier]));
-
-            p.k_ptrs.push_back(reinterpret_cast<void*>(&k_ptr[tmp_k_offset]));
-            p.grad_k_ptrs.push_back(
-                reinterpret_cast<void*>(&grad_k_ptr[tmp_grad_k_offset * multiplier]));
-
-            p.v_ptrs.push_back(reinterpret_cast<void*>(&v_ptr[tmp_v_offset]));
-            p.grad_v_ptrs.push_back(
-                reinterpret_cast<void*>(&grad_v_ptr[tmp_grad_v_offset * multiplier]));
-
-            p.out_ptrs.push_back(reinterpret_cast<void*>(&out_ptr[tmp_o_offset]));
-            p.grad_out_ptrs.push_back(reinterpret_cast<void*>(&grad_out_ptr[tmp_o_offset]));
-
-            p.logsumexp_ptrs.push_back(
-                reinterpret_cast<void*>(&logsumexp_ptr[tmp_logsumexp_offset]));
-
-            if(bias.has_value())
-            {
-                size_t tmp_bias_offset = get_size_in_bytes(
-                    static_cast<size_t>(p.host_seqstart_q[i]) * p.attn_bias_strides[2] +
-                        static_cast<size_t>(p.host_seqstart_k[i]) * p.attn_bias_strides[3],
-                    bias->scalar_type());
-
-                p.attn_bias_ptrs.push_back(
-                    reinterpret_cast<void*>(&attn_bias_ptr[tmp_bias_offset]));
-
-                if(bias_requires_grad)
-                {
-                    p.grad_bias_ptrs.push_back(
-                        reinterpret_cast<void*>(&grad_bias_ptr[tmp_bias_offset]));
-                }
-            }
-
-            // ToDO: remove this after dev-op fix
-            p.randvals_ptrs.push_back(nullptr);
-        }
-    };
+    if (seqlen_k.has_value()) {
+      TORCH_CHECK(seqlen_k->scalar_type() == at::ScalarType::Int);
+      TORCH_CHECK(seqlen_k->dim() == 1);
+      TORCH_CHECK(seqlen_k->size(0) == p.num_batches)
+      CHECK_NOSPARSE_CONTIGUOUS_CPU((*seqlen_k));
 
-    auto inDataType = query.scalar_type();
+      p.host_seqlen_k.resize(p.num_batches);
 
-    if(!seqstart_q.has_value())
-    { // input is batched
-        BatchedBackwardParams batched_backward_params;
-
-        set_batched_backward_params(batched_backward_params);
-
-        if(inDataType == at::ScalarType::Half)
-        {
-            batched_backward_fp16(batched_backward_params, stream);
-        }
-        else if(inDataType == at::ScalarType::BFloat16)
-        {
-            batched_backward_bp16(batched_backward_params, stream);
-        }
-        else
-            throw std::runtime_error("input data-type is not supported");
+      for (int i = 0; i < p.host_seqlen_k.size(); i++)
+        p.host_seqlen_k[i] =
+            *(reinterpret_cast<int*>(seqlen_k->data_ptr()) + i);
     }
-    else
-    { // input is grouped
-        GroupedBackwardParams grouped_backward_params;
 
-        set_grouped_backward_params(grouped_backward_params);
-
-        if(inDataType == at::ScalarType::Half)
-        {
-            grouped_backward_fp16(grouped_backward_params, stream);
-        }
-        else if(inDataType == at::ScalarType::BFloat16)
-        {
-            grouped_backward_bp16(grouped_backward_params, stream);
+    char* q_ptr = reinterpret_cast<char*>(query.data_ptr());
+    char* k_ptr = reinterpret_cast<char*>(key.data_ptr());
+    char* v_ptr = reinterpret_cast<char*>(value.data_ptr());
+
+    char* out_ptr = reinterpret_cast<char*>(out.data_ptr());
+    char* grad_out_ptr = reinterpret_cast<char*>(grad_out.data_ptr());
+    char* attn_bias_ptr =
+        bias.has_value() ? reinterpret_cast<char*>(bias->data_ptr()) : nullptr;
+
+    char* logsumexp_ptr = reinterpret_cast<char*>(logsumexp.data_ptr());
+
+    char* grad_q_ptr = reinterpret_cast<char*>(grad_q.data_ptr());
+    char* grad_k_ptr = is_mqa_gqa
+        ? reinterpret_cast<char*>(tmp_grad_k.data_ptr())
+        : reinterpret_cast<char*>(grad_k.data_ptr());
+    char* grad_v_ptr = is_mqa_gqa
+        ? reinterpret_cast<char*>(tmp_grad_v.data_ptr())
+        : reinterpret_cast<char*>(grad_v.data_ptr());
+    char* grad_bias_ptr = bias_requires_grad
+        ? reinterpret_cast<char*>(grad_bias.data_ptr())
+        : nullptr;
+
+    size_t multiplier = 1;
+
+    if (p.use_fp32_qkv_grad)
+      multiplier = get_size_in_bytes(1, at::ScalarType::Float) /
+          get_size_in_bytes(1, query.scalar_type());
+
+    std::cout << "qkv-grad precision multiplier is " << multiplier << std::endl;
+
+    for (int i = 0; i < p.num_batches; i++) {
+      size_t tmp_q_offset = get_size_in_bytes(
+          static_cast<size_t>(p.host_seqstart_q[i]) * p.q_strides[0],
+          query.scalar_type());
+      size_t tmp_k_offset = get_size_in_bytes(
+          static_cast<size_t>(p.host_seqstart_k[i]) * p.k_strides[0],
+          key.scalar_type());
+      size_t tmp_v_offset = get_size_in_bytes(
+          static_cast<size_t>(p.host_seqstart_k[i]) * p.v_strides[0],
+          value.scalar_type());
+      size_t tmp_o_offset = get_size_in_bytes(
+          static_cast<size_t>(p.host_seqstart_q[i]) * p.out_strides[0],
+          out.scalar_type());
+      size_t tmp_logsumexp_offset = get_size_in_bytes(
+          static_cast<size_t>(i) * p.Hq * p.max_seqlen_q,
+          logsumexp.scalar_type());
+
+      size_t tmp_grad_k_offset = is_mqa_gqa
+          ? get_size_in_bytes(
+                static_cast<size_t>(p.host_seqstart_k[i]) *
+                    p.tmp_grad_k_strides[0],
+                tmp_grad_k.scalar_type())
+          : tmp_k_offset;
+      size_t tmp_grad_v_offset = is_mqa_gqa
+          ? get_size_in_bytes(
+                static_cast<size_t>(p.host_seqstart_k[i]) *
+                    p.tmp_grad_v_strides[0],
+                tmp_grad_v.scalar_type())
+          : tmp_v_offset;
+
+      p.q_ptrs.push_back(reinterpret_cast<void*>(&q_ptr[tmp_q_offset]));
+      p.grad_q_ptrs.push_back(
+          reinterpret_cast<void*>(&grad_q_ptr[tmp_q_offset * multiplier]));
+
+      p.k_ptrs.push_back(reinterpret_cast<void*>(&k_ptr[tmp_k_offset]));
+      p.grad_k_ptrs.push_back(
+          reinterpret_cast<void*>(&grad_k_ptr[tmp_grad_k_offset * multiplier]));
+
+      p.v_ptrs.push_back(reinterpret_cast<void*>(&v_ptr[tmp_v_offset]));
+      p.grad_v_ptrs.push_back(
+          reinterpret_cast<void*>(&grad_v_ptr[tmp_grad_v_offset * multiplier]));
+
+      p.out_ptrs.push_back(reinterpret_cast<void*>(&out_ptr[tmp_o_offset]));
+      p.grad_out_ptrs.push_back(
+          reinterpret_cast<void*>(&grad_out_ptr[tmp_o_offset]));
+
+      p.logsumexp_ptrs.push_back(
+          reinterpret_cast<void*>(&logsumexp_ptr[tmp_logsumexp_offset]));
+
+      if (bias.has_value()) {
+        size_t tmp_bias_offset = get_size_in_bytes(
+            static_cast<size_t>(p.host_seqstart_q[i]) * p.attn_bias_strides[2] +
+                static_cast<size_t>(p.host_seqstart_k[i]) *
+                    p.attn_bias_strides[3],
+            bias->scalar_type());
+
+        p.attn_bias_ptrs.push_back(
+            reinterpret_cast<void*>(&attn_bias_ptr[tmp_bias_offset]));
+
+        if (bias_requires_grad) {
+          p.grad_bias_ptrs.push_back(
+              reinterpret_cast<void*>(&grad_bias_ptr[tmp_bias_offset]));
         }
-        else
-            throw std::runtime_error("input data-type is not supported");
-    }
+      }
 
-    if(is_mqa_gqa)
-    {
-        auto tmp_grad_k_view = tmp_grad_k.unflatten(2, {Hkv, Hq / Hkv});
-        auto tmp_grad_v_view = tmp_grad_v.unflatten(2, {Hkv, Hq / Hkv});
-        grad_k               = tmp_grad_k_view.sum(3);
-        grad_v               = tmp_grad_v_view.sum(3);
+      // ToDO: remove this after dev-op fix
+      p.randvals_ptrs.push_back(nullptr);
     }
-
-    return std::make_tuple(grad_q, grad_k, grad_v, grad_bias);
+  };
+
+  auto inDataType = query.scalar_type();
+
+  if (!seqstart_q.has_value()) { // input is batched
+    BatchedBackwardParams batched_backward_params;
+
+    set_batched_backward_params(batched_backward_params);
+
+    if (inDataType == at::ScalarType::Half) {
+      batched_backward_fp16(batched_backward_params, stream);
+    } else if (inDataType == at::ScalarType::BFloat16) {
+      batched_backward_bp16(batched_backward_params, stream);
+    } else
+      throw std::runtime_error("input data-type is not supported");
+  } else { // input is grouped
+    GroupedBackwardParams grouped_backward_params;
+
+    set_grouped_backward_params(grouped_backward_params);
+
+    if (inDataType == at::ScalarType::Half) {
+      grouped_backward_fp16(grouped_backward_params, stream);
+    } else if (inDataType == at::ScalarType::BFloat16) {
+      grouped_backward_bp16(grouped_backward_params, stream);
+    } else
+      throw std::runtime_error("input data-type is not supported");
+  }
+
+  if (is_mqa_gqa) {
+    auto tmp_grad_k_view = tmp_grad_k.unflatten(2, {Hkv, Hq / Hkv});
+    auto tmp_grad_v_view = tmp_grad_v.unflatten(2, {Hkv, Hq / Hkv});
+    grad_k = tmp_grad_k_view.sum(3);
+    grad_v = tmp_grad_v_view.sum(3);
+  }
+
+  return std::make_tuple(grad_q, grad_k, grad_v, grad_bias);
 #endif
 } // namespace
 
 } // namespace
 
-TORCH_LIBRARY_IMPL(xformers, CUDA, m)
-{
-    m.impl(TORCH_SELECTIVE_NAME("xformers::efficient_attention_backward_ck"),
-           TORCH_FN(efficient_attention_backward_ck));
+TORCH_LIBRARY_IMPL(xformers, CUDA, m) {
+  m.impl(
+      TORCH_SELECTIVE_NAME("xformers::efficient_attention_backward_ck"),
+      TORCH_FN(efficient_attention_backward_ck));
 }
diff --git a/xformers/csrc/attention/hip_fmha/attention_ck_rand_uniform.cpp b/xformers/csrc/attention/hip_fmha/attention_ck_rand_uniform.cpp
index a4282834a..ecf73c09b 100644
--- a/xformers/csrc/attention/hip_fmha/attention_ck_rand_uniform.cpp
+++ b/xformers/csrc/attention/hip_fmha/attention_ck_rand_uniform.cpp
@@ -26,91 +26,100 @@ namespace {
  * generate a tensor with random uniform values. only used for testing, not much
  * attention is paid to performance
  */
-at::Tensor
-rand_uniform_int(double dropout_prob,
-                 const at::Tensor& out_pattern) // [Batches, num_head, query_len, key_len]
+at::Tensor rand_uniform_int(
+    double dropout_prob,
+    const at::Tensor& out_pattern) // [Batches, num_head, query_len, key_len]
 {
-    int B         = out_pattern.size(0);
-    int num_heads = out_pattern.size(1);
-    int M         = out_pattern.size(2);
-    int N         = out_pattern.size(3);
-
-    // at::cuda::CUDAGuard device_guard(out_pattern.device());
-    hipStream_t stream = at::cuda::getCurrentHIPStream().stream();
-
-    at::CUDAGeneratorImpl* gen = at::get_generator_or_default<at::CUDAGeneratorImpl>(
-        c10::nullopt, at::cuda::detail::getDefaultCUDAGenerator());
-
-    at::PhiloxCudaState rng_engine_inputs;
-    {
-        std::lock_guard<std::mutex> lock(gen->mutex_);
-        rng_engine_inputs = gen->philox_cuda_state(B * num_heads * M * N);
-    }
-
-    const auto seeds = at::cuda::philox::unpack(rng_engine_inputs);
-
-    int64_t philox_seed   = std::get<0>(seeds);
-    int64_t philox_offset = std::get<1>(seeds);
-
-    at::Tensor randvals;
-
-    randvals = at::empty({B, num_heads, M, N}, out_pattern.options().dtype(at::ScalarType::Int));
-
-    static constexpr auto GemmSpec = ck::tensor_operation::device::GemmSpecialization::MNKOPadding;
-
-    static constexpr auto TensorSpecA = ck::tensor_operation::device::TensorSpecialization::Default;
-    static constexpr auto TensorSpecB0 =
-        ck::tensor_operation::device::TensorSpecialization::Default;
-    static constexpr auto TensorSpecB1 =
-        ck::tensor_operation::device::TensorSpecialization::Default;
-    static constexpr auto TensorSpecC = ck::tensor_operation::device::TensorSpecialization::Default;
-
-    using DeviceOpInstance = ck::tensor_operation::device::DeviceBatchedDropout<2, // NumDimG
-                                                                                ck::half_t,
-                                                                                int,
-                                                                                ck::half_t,
-                                                                                GemmSpec,
-                                                                                TensorSpecA,
-                                                                                TensorSpecB0,
-                                                                                TensorSpecB1,
-                                                                                TensorSpecC,
-                                                                                256, // BlockSize
-                                                                                64,  // MPerBlock
-                                                                                128, // NPerBlock
-                                                                                32,  // KPerBlock
-                                                                                8,   // AK1
-                                                                                8,   // BK1
-                                                                                32,  // MPerXDL
-                                                                                32,  // NPerXDL
-                                                                                2,   // MXdlPerWave
-                                                                                1>;  // NXdlPerWave
-
-    const uint64_t seed   = 1;
-    const uint64_t offset = 0;
-
-    std::vector<ck::index_t> z_gs_ms_ns_lengths = {B, num_heads, M, N};
-    std::vector<ck::index_t> z_gs_ms_ns_strides = {static_cast<int>(randvals.stride(0)),
-                                                   static_cast<int>(randvals.stride(1)),
-                                                   static_cast<int>(randvals.stride(2)),
-                                                   static_cast<int>(randvals.stride(3))};
-
-    auto dropout_op      = DeviceOpInstance();
-    auto dropout_invoker = dropout_op.MakeInvoker();
-
-    auto dropout_arg = dropout_op.MakeArgument(static_cast<int*>(randvals.data_ptr()),
-                                               z_gs_ms_ns_lengths,
-                                               z_gs_ms_ns_strides,
-                                               {philox_seed, philox_offset});
-
-    dropout_invoker.Run(dropout_arg, StreamConfig{stream, false});
-    (void)hipStreamSynchronize(stream);
-
-    return randvals;
+  int B = out_pattern.size(0);
+  int num_heads = out_pattern.size(1);
+  int M = out_pattern.size(2);
+  int N = out_pattern.size(3);
+
+  // at::cuda::CUDAGuard device_guard(out_pattern.device());
+  hipStream_t stream = at::cuda::getCurrentHIPStream().stream();
+
+  at::CUDAGeneratorImpl* gen =
+      at::get_generator_or_default<at::CUDAGeneratorImpl>(
+          c10::nullopt, at::cuda::detail::getDefaultCUDAGenerator());
+
+  at::PhiloxCudaState rng_engine_inputs;
+  {
+    std::lock_guard<std::mutex> lock(gen->mutex_);
+    rng_engine_inputs = gen->philox_cuda_state(B * num_heads * M * N);
+  }
+
+  const auto seeds = at::cuda::philox::unpack(rng_engine_inputs);
+
+  int64_t philox_seed = std::get<0>(seeds);
+  int64_t philox_offset = std::get<1>(seeds);
+
+  at::Tensor randvals;
+
+  randvals = at::empty(
+      {B, num_heads, M, N}, out_pattern.options().dtype(at::ScalarType::Int));
+
+  static constexpr auto GemmSpec =
+      ck::tensor_operation::device::GemmSpecialization::MNKOPadding;
+
+  static constexpr auto TensorSpecA =
+      ck::tensor_operation::device::TensorSpecialization::Default;
+  static constexpr auto TensorSpecB0 =
+      ck::tensor_operation::device::TensorSpecialization::Default;
+  static constexpr auto TensorSpecB1 =
+      ck::tensor_operation::device::TensorSpecialization::Default;
+  static constexpr auto TensorSpecC =
+      ck::tensor_operation::device::TensorSpecialization::Default;
+
+  using DeviceOpInstance = ck::tensor_operation::device::DeviceBatchedDropout<
+      2, // NumDimG
+      ck::half_t,
+      int,
+      ck::half_t,
+      GemmSpec,
+      TensorSpecA,
+      TensorSpecB0,
+      TensorSpecB1,
+      TensorSpecC,
+      256, // BlockSize
+      64, // MPerBlock
+      128, // NPerBlock
+      32, // KPerBlock
+      8, // AK1
+      8, // BK1
+      32, // MPerXDL
+      32, // NPerXDL
+      2, // MXdlPerWave
+      1>; // NXdlPerWave
+
+  const uint64_t seed = 1;
+  const uint64_t offset = 0;
+
+  std::vector<ck::index_t> z_gs_ms_ns_lengths = {B, num_heads, M, N};
+  std::vector<ck::index_t> z_gs_ms_ns_strides = {
+      static_cast<int>(randvals.stride(0)),
+      static_cast<int>(randvals.stride(1)),
+      static_cast<int>(randvals.stride(2)),
+      static_cast<int>(randvals.stride(3))};
+
+  auto dropout_op = DeviceOpInstance();
+  auto dropout_invoker = dropout_op.MakeInvoker();
+
+  auto dropout_arg = dropout_op.MakeArgument(
+      static_cast<int*>(randvals.data_ptr()),
+      z_gs_ms_ns_lengths,
+      z_gs_ms_ns_strides,
+      {philox_seed, philox_offset});
+
+  dropout_invoker.Run(dropout_arg, StreamConfig{stream, false});
+  (void)hipStreamSynchronize(stream);
+
+  return randvals;
 } // namespace
 
 } // namespace
 
-TORCH_LIBRARY_IMPL(xformers, CUDA, m)
-{
-    m.impl(TORCH_SELECTIVE_NAME("xformers::_ck_rand_uniform"), TORCH_FN(rand_uniform_int));
+TORCH_LIBRARY_IMPL(xformers, CUDA, m) {
+  m.impl(
+      TORCH_SELECTIVE_NAME("xformers::_ck_rand_uniform"),
+      TORCH_FN(rand_uniform_int));
 }
diff --git a/xformers/csrc/attention/hip_fmha/attention_forward_decoder.cpp b/xformers/csrc/attention/hip_fmha/attention_forward_decoder.cpp
index 99de91741..6fe0137b0 100644
--- a/xformers/csrc/attention/hip_fmha/attention_forward_decoder.cpp
+++ b/xformers/csrc/attention/hip_fmha/attention_forward_decoder.cpp
@@ -15,8 +15,8 @@
 
 namespace {
 constexpr int32_t kThreadsPerWavefront = 64;
-constexpr int32_t kWavefrontsPerBlock  = 16;
-constexpr int32_t K_MAX                = 4 * kThreadsPerWavefront;
+constexpr int32_t kWavefrontsPerBlock = 16;
+constexpr int32_t K_MAX = 4 * kThreadsPerWavefront;
 } // namespace
 
 namespace {
@@ -24,129 +24,135 @@ namespace {
 template <typename c10_t>
 struct c10_to_data_t;
 template <>
-struct c10_to_data_t<float>
-{
-    using type = float;
+struct c10_to_data_t<float> {
+  using type = float;
 };
 
 template <>
-struct c10_to_data_t<c10::Half>
-{
-    using type = ck::half_t;
+struct c10_to_data_t<c10::Half> {
+  using type = ck::half_t;
 };
 
 template <>
-struct c10_to_data_t<c10::BFloat16>
-{
-    using type = ck::bhalf_t;
+struct c10_to_data_t<c10::BFloat16> {
+  using type = ck::bhalf_t;
 };
 } // namespace
 
 namespace {
 
 #define AT_DISPATCH_CASE_3(SCALARTYPE1, SCALARTYPE2, SCALARTYPE3, ...) \
-    AT_DISPATCH_CASE(SCALARTYPE1, __VA_ARGS__)                         \
-    AT_DISPATCH_CASE(SCALARTYPE2, __VA_ARGS__)                         \
-    AT_DISPATCH_CASE(SCALARTYPE3, __VA_ARGS__)
-
-#define AT_DISPATCH_SWITCH_3(SCALARTYPE1, SCALARTYPE2, SCALARTYPE3, TYPE, NAME, ...) \
-    AT_DISPATCH_SWITCH(                                                              \
-        TYPE, NAME, AT_DISPATCH_CASE_3(SCALARTYPE1, SCALARTYPE2, SCALARTYPE3, __VA_ARGS__))
-
-template <int32_t ThreadsPerWavefront,
-          int32_t WavefrontsPerBlock,
-          int32_t KV_M_MAX = 8192,
-          int32_t K_MAX    = 256>
+  AT_DISPATCH_CASE(SCALARTYPE1, __VA_ARGS__)                           \
+  AT_DISPATCH_CASE(SCALARTYPE2, __VA_ARGS__)                           \
+  AT_DISPATCH_CASE(SCALARTYPE3, __VA_ARGS__)
+
+#define AT_DISPATCH_SWITCH_3(                               \
+    SCALARTYPE1, SCALARTYPE2, SCALARTYPE3, TYPE, NAME, ...) \
+  AT_DISPATCH_SWITCH(                                       \
+      TYPE,                                                 \
+      NAME,                                                 \
+      AT_DISPATCH_CASE_3(SCALARTYPE1, SCALARTYPE2, SCALARTYPE3, __VA_ARGS__))
+
+template <
+    int32_t ThreadsPerWavefront,
+    int32_t WavefrontsPerBlock,
+    int32_t KV_M_MAX = 8192,
+    int32_t K_MAX = 256>
 at::Tensor& efficient_attention_forward_decoder_ck_out_impl(
-    const at::Tensor& XQ,                 // [B, 1, G, H, D]
-    const at::Tensor& cache_K,            // [B, KV_M_MAX, G, H or 1, D]
-    const at::Tensor& cache_V,            // [B, KV_M_MAX, G, H or 1, D]
+    const at::Tensor& XQ, // [B, 1, G, H, D]
+    const at::Tensor& cache_K, // [B, KV_M_MAX, G, H or 1, D]
+    const at::Tensor& cache_V, // [B, KV_M_MAX, G, H or 1, D]
     at::optional<at::Tensor> seq_kv_lens, // [B]
     double qk_scale,
-    at::Tensor& O)
-{
-    static_assert(4 * ThreadsPerWavefront == K_MAX, "");
-    static_assert(WavefrontsPerBlock <= ThreadsPerWavefront, "");
-
-    at::OptionalDeviceGuard guard(XQ.device());
-    TORCH_CHECK(XQ.is_cuda());
-    TORCH_CHECK(cache_K.is_cuda());
-    TORCH_CHECK(cache_V.is_cuda());
-
-    TORCH_CHECK(!seq_kv_lens || seq_kv_lens->is_cuda());
-
-    TORCH_CHECK(cache_K.size(1) <= KV_M_MAX);
-    TORCH_CHECK(cache_K.size(4) <= K_MAX);
-
-    constexpr auto rank = 5;
-
-    auto B = XQ.size(0);
-    auto M = XQ.size(1);
-    auto G = XQ.size(2);
-    auto H = XQ.size(3);
-
-    TORCH_CHECK(B <= 1024);
-    TORCH_CHECK(M <= 1024);
-    TORCH_CHECK(H <= 1024);
-
-    dim3 blocks(B * H * M * G);
-    dim3 threads(ThreadsPerWavefront, WavefrontsPerBlock);
-
-    int32_t smem_softmax = KV_M_MAX * sizeof(float) + threads.y * sizeof(float);
-    int32_t smem_output  = K_MAX * sizeof(float) *
-                          threads.y; // 4 * threadsPerBlock * sizeof(float) == sizeof(O[b][0][h][:])
-    const size_t lds_bytes = max(smem_softmax, smem_output);
-    auto stream            = at::cuda::getCurrentHIPStream().stream();
-
-    AT_DISPATCH_SWITCH_3(
-        at::ScalarType::Half,
-        at::ScalarType::BFloat16,
-        at::ScalarType::Float,
-        XQ.scalar_type(),
-        "efficient_attention_forward_decoder_ck",
-        [&] {
-            using ck_data_t   = c10_to_data_t<scalar_t>::type;
-            using device_op_t = ck::tensor_operation::device::FMHADecoderSeqlen1DeviceOp<ck_data_t>;
-            auto op           = device_op_t{};
-
-            auto XQ_acc = XQ.packed_accessor32<scalar_t, rank, at::RestrictPtrTraits>();
-            auto K_acc  = cache_K.packed_accessor64<scalar_t, rank, at::RestrictPtrTraits>();
-            auto V_acc  = cache_V.packed_accessor64<scalar_t, rank, at::RestrictPtrTraits>();
-            auto O_acc  = O.packed_accessor32<scalar_t, rank, at::RestrictPtrTraits>();
-            auto seq_acc =
-                seq_kv_lens
-                    ? seq_kv_lens->packed_accessor32<int32_t, 1, at::RestrictPtrTraits>().data()
-                    : nullptr;
-            auto arg = device_op_t::Argument(
-                reinterpret_cast<const ck_data_t* __restrict__>(XQ_acc.data()),
-                reinterpret_cast<const ck_data_t* __restrict__>(K_acc.data()),
-                reinterpret_cast<const ck_data_t* __restrict__>(V_acc.data()),
-                reinterpret_cast<ck_data_t* __restrict__>(O_acc.data()),
-                seq_acc,
-                XQ_acc.stride(0),
-                XQ_acc.stride(1),
-                XQ_acc.stride(2),
-                XQ_acc.stride(3),
-                K_acc.stride(0),
-                K_acc.stride(1),
-                K_acc.stride(2),
-                K_acc.stride(3),
-                XQ_acc.size(1),
-                XQ_acc.size(2),
-                XQ_acc.size(3),
-                XQ_acc.size(4),
-                K_acc.size(1),
-                K_acc.size(3) == 1,
-                qk_scale,
-                blocks,
-                threads,
-                lds_bytes);
-
-            auto invoker = device_op_t::Invoker{};
-            (void)invoker.Run(arg, {stream});
-        });
-
-    return O;
+    at::Tensor& O) {
+  static_assert(4 * ThreadsPerWavefront == K_MAX, "");
+  static_assert(WavefrontsPerBlock <= ThreadsPerWavefront, "");
+
+  at::OptionalDeviceGuard guard(XQ.device());
+  TORCH_CHECK(XQ.is_cuda());
+  TORCH_CHECK(cache_K.is_cuda());
+  TORCH_CHECK(cache_V.is_cuda());
+
+  TORCH_CHECK(!seq_kv_lens || seq_kv_lens->is_cuda());
+
+  TORCH_CHECK(cache_K.size(1) <= KV_M_MAX);
+  TORCH_CHECK(cache_K.size(4) <= K_MAX);
+
+  constexpr auto rank = 5;
+
+  auto B = XQ.size(0);
+  auto M = XQ.size(1);
+  auto G = XQ.size(2);
+  auto H = XQ.size(3);
+
+  TORCH_CHECK(B <= 1024);
+  TORCH_CHECK(M <= 1024);
+  TORCH_CHECK(H <= 1024);
+
+  dim3 blocks(B * H * M * G);
+  dim3 threads(ThreadsPerWavefront, WavefrontsPerBlock);
+
+  int32_t smem_softmax = KV_M_MAX * sizeof(float) + threads.y * sizeof(float);
+  int32_t smem_output = K_MAX * sizeof(float) *
+      threads.y; // 4 * threadsPerBlock * sizeof(float) == sizeof(O[b][0][h][:])
+  const size_t lds_bytes = max(smem_softmax, smem_output);
+  auto stream = at::cuda::getCurrentHIPStream().stream();
+
+  AT_DISPATCH_SWITCH_3(
+      at::ScalarType::Half,
+      at::ScalarType::BFloat16,
+      at::ScalarType::Float,
+      XQ.scalar_type(),
+      "efficient_attention_forward_decoder_ck",
+      [&] {
+        using ck_data_t = c10_to_data_t<scalar_t>::type;
+        using device_op_t =
+            ck::tensor_operation::device::FMHADecoderSeqlen1DeviceOp<ck_data_t>;
+        auto op = device_op_t{};
+
+        auto XQ_acc =
+            XQ.packed_accessor32<scalar_t, rank, at::RestrictPtrTraits>();
+        auto K_acc =
+            cache_K.packed_accessor64<scalar_t, rank, at::RestrictPtrTraits>();
+        auto V_acc =
+            cache_V.packed_accessor64<scalar_t, rank, at::RestrictPtrTraits>();
+        auto O_acc =
+            O.packed_accessor32<scalar_t, rank, at::RestrictPtrTraits>();
+        auto seq_acc = seq_kv_lens
+            ? seq_kv_lens
+                  ->packed_accessor32<int32_t, 1, at::RestrictPtrTraits>()
+                  .data()
+            : nullptr;
+        auto arg = device_op_t::Argument(
+            reinterpret_cast<const ck_data_t* __restrict__>(XQ_acc.data()),
+            reinterpret_cast<const ck_data_t* __restrict__>(K_acc.data()),
+            reinterpret_cast<const ck_data_t* __restrict__>(V_acc.data()),
+            reinterpret_cast<ck_data_t* __restrict__>(O_acc.data()),
+            seq_acc,
+            XQ_acc.stride(0),
+            XQ_acc.stride(1),
+            XQ_acc.stride(2),
+            XQ_acc.stride(3),
+            K_acc.stride(0),
+            K_acc.stride(1),
+            K_acc.stride(2),
+            K_acc.stride(3),
+            XQ_acc.size(1),
+            XQ_acc.size(2),
+            XQ_acc.size(3),
+            XQ_acc.size(4),
+            K_acc.size(1),
+            K_acc.size(3) == 1,
+            qk_scale,
+            blocks,
+            threads,
+            lds_bytes);
+
+        auto invoker = device_op_t::Invoker{};
+        (void)invoker.Run(arg, {stream});
+      });
+
+  return O;
 }
 
 #undef AT_DISPATCH_CASE_3
@@ -154,34 +160,34 @@ at::Tensor& efficient_attention_forward_decoder_ck_out_impl(
 
 template <int32_t ThreadsPerWavefront, int32_t WavefrontsPerBlock>
 at::Tensor efficient_attention_forward_decoder_ck_impl(
-    const at::Tensor& XQ,                 // [B, 1, G, H, D]
-    const at::Tensor& cache_K,            // [B, KV_M_MAX, G, H or 1, D]
-    const at::Tensor& cache_V,            // [B, KV_M_MAX, H or 1, D]
+    const at::Tensor& XQ, // [B, 1, G, H, D]
+    const at::Tensor& cache_K, // [B, KV_M_MAX, G, H or 1, D]
+    const at::Tensor& cache_V, // [B, KV_M_MAX, H or 1, D]
     at::optional<at::Tensor> seq_kv_lens, // [B]
-    double qk_scale)
-{
-    auto O = at::empty_like(XQ);
-    efficient_attention_forward_decoder_ck_out_impl<ThreadsPerWavefront, WavefrontsPerBlock>(
-        XQ, cache_K, cache_V, seq_kv_lens, qk_scale, O);
-    return O;
+    double qk_scale) {
+  auto O = at::empty_like(XQ);
+  efficient_attention_forward_decoder_ck_out_impl<
+      ThreadsPerWavefront,
+      WavefrontsPerBlock>(XQ, cache_K, cache_V, seq_kv_lens, qk_scale, O);
+  return O;
 }
 
-at::Tensor
-efficient_attention_forward_decoder_ck(const at::Tensor& XQ,      // [B, 1, G, H, D]
-                                       const at::Tensor& cache_K, // [B, KV_M_MAX, G, H or 1, D]
-                                       const at::Tensor& cache_V, // [B, KV_M_MAX, G, H or 1, D]
-                                       at::optional<at::Tensor> seq_kv_lens, // [B]
-                                       double qk_scale)
-{
-    return efficient_attention_forward_decoder_ck_impl<kThreadsPerWavefront, kWavefrontsPerBlock>(
-        XQ, cache_K, cache_V, seq_kv_lens, qk_scale);
+at::Tensor efficient_attention_forward_decoder_ck(
+    const at::Tensor& XQ, // [B, 1, G, H, D]
+    const at::Tensor& cache_K, // [B, KV_M_MAX, G, H or 1, D]
+    const at::Tensor& cache_V, // [B, KV_M_MAX, G, H or 1, D]
+    at::optional<at::Tensor> seq_kv_lens, // [B]
+    double qk_scale) {
+  return efficient_attention_forward_decoder_ck_impl<
+      kThreadsPerWavefront,
+      kWavefrontsPerBlock>(XQ, cache_K, cache_V, seq_kv_lens, qk_scale);
 }
 } // namespace
 
-TORCH_LIBRARY_IMPL(xformers, CUDA, m)
-{
-    m.impl(TORCH_SELECTIVE_NAME("xformers::efficient_attention_forward_decoder_ck"),
-           TORCH_FN(efficient_attention_forward_decoder_ck));
+TORCH_LIBRARY_IMPL(xformers, CUDA, m) {
+  m.impl(
+      TORCH_SELECTIVE_NAME("xformers::efficient_attention_forward_decoder_ck"),
+      TORCH_FN(efficient_attention_forward_decoder_ck));
 }
 
 #ifdef ATTN_FWD_DECODER_MAIN
@@ -217,109 +223,111 @@ TORCH_LIBRARY_IMPL(xformers, CUDA, m)
 
 // clang-format on
 
-static void do_correctness_check()
-{
-    const int32_t D = 4 * kThreadsPerWavefront;
-    const int32_t B = 1;
-    const int32_t H = 4;
-    const int32_t G = 1;
-    auto options    = torch::TensorOptions()
-                       .dtype(torch::kFloat32)
-                       .layout(torch::kStrided)
-                       .device(torch::kCUDA, 1)
-                       .requires_grad(false);
-    auto int_options = options.dtype(torch::kInt);
-    auto XQ          = at::randn({B, 1, G, H, D}, options);
-    auto K           = at::randn({B, 4096, G, H, D}, options);
-    auto V           = at::randn({B, 4096, G, H, D}, options);
-    auto seq         = at::randint(63, 128, {B}, int_options);
-    double qk_scale  = 1. / sqrt(D);
-
-    auto result      = efficient_attention_forward_decoder_ck_impl<64, 1>(XQ, K, V, seq, qk_scale);
-    auto gold_result = efficient_attention_forward_decoder_ck_impl<64, 2>(XQ, K, V, seq, qk_scale);
-    auto mask = at::isclose(result, gold_result, /*atol*/ 1e-3, /*rtol*/ 1e-5, /*equal_nan*/ false);
-    auto percent_match = at::sum(mask.to(torch::kFloat32)) / mask.numel();
-    printf("Mismatched elements percentage: %.2f\n", 1. - percent_match.item<float>());
+static void do_correctness_check() {
+  const int32_t D = 4 * kThreadsPerWavefront;
+  const int32_t B = 1;
+  const int32_t H = 4;
+  const int32_t G = 1;
+  auto options = torch::TensorOptions()
+                     .dtype(torch::kFloat32)
+                     .layout(torch::kStrided)
+                     .device(torch::kCUDA, 1)
+                     .requires_grad(false);
+  auto int_options = options.dtype(torch::kInt);
+  auto XQ = at::randn({B, 1, G, H, D}, options);
+  auto K = at::randn({B, 4096, G, H, D}, options);
+  auto V = at::randn({B, 4096, G, H, D}, options);
+  auto seq = at::randint(63, 128, {B}, int_options);
+  double qk_scale = 1. / sqrt(D);
+
+  auto result = efficient_attention_forward_decoder_ck_impl<64, 1>(
+      XQ, K, V, seq, qk_scale);
+  auto gold_result = efficient_attention_forward_decoder_ck_impl<64, 2>(
+      XQ, K, V, seq, qk_scale);
+  auto mask = at::isclose(
+      result, gold_result, /*atol*/ 1e-3, /*rtol*/ 1e-5, /*equal_nan*/ false);
+  auto percent_match = at::sum(mask.to(torch::kFloat32)) / mask.numel();
+  printf(
+      "Mismatched elements percentage: %.2f\n",
+      1. - percent_match.item<float>());
 }
 
-int main(int argc, char** argv)
-{
-    if(argc == 1)
-    {
-        do_correctness_check();
+int main(int argc, char** argv) {
+  if (argc == 1) {
+    do_correctness_check();
+  } else {
+    const auto args = std::vector<std::string>(argv + 1, argv + argc);
+    if (args.size() != 7) {
+      std::cout
+          << "Usage: ./a.out n_keys padding batch_size n_heads is_multiquery dtype "
+             "n_wavefronts_per_block"
+          << std::endl;
+      return 0;
     }
-    else
-    {
-        const auto args = std::vector<std::string>(argv + 1, argv + argc);
-        if(args.size() != 7)
-        {
-            std::cout << "Usage: ./a.out n_keys padding batch_size n_heads is_multiquery dtype "
-                         "n_wavefronts_per_block"
-                      << std::endl;
-            return 0;
-        }
-        const int32_t n_keys     = std::stoi(args[0]);
-        const int32_t padding    = std::stoi(args[1]);
-        const int32_t batch_size = std::stoi(args[2]);
-        const int32_t n_heads    = std::stoi(args[3]);
-        const int32_t n_groups   = 1;
-        const int32_t multiquery = (args[4] == "mq");
-        const auto dtype         = (args[5] == "f32")
-                               ? torch::kFloat32
-                               : (args[5] == "f16") ? torch::kFloat16 : torch::kBFloat16;
-        const int32_t n_wavefronts_per_block = std::stoi(args[6]);
-
-        const int32_t dim_per_head = 4 * kThreadsPerWavefront;
-
-        const auto options = torch::TensorOptions()
-                                 .dtype(dtype)
-                                 .layout(torch::kStrided)
-                                 .device(torch::kCUDA, 1)
-                                 .requires_grad(false);
-
-        const auto int_options = options.dtype(torch::kInt);
-        const auto Q = at::rand({batch_size, 1, n_groups, n_heads, dim_per_head}, options);
-        const auto K =
-            multiquery ? at::rand({batch_size, padding, n_groups, 1, dim_per_head}, options)
-                             .expand({batch_size, padding, n_groups, n_heads, dim_per_head})
-                       : at::rand({batch_size, padding, n_groups, n_heads, dim_per_head}, options);
-        const auto V = at::rand_like(K);
-        auto O       = at::empty_like(Q);
-
-        const auto seq        = at::randint(1, n_keys, {batch_size}, int_options);
-        const double qk_scale = 1. / sqrt(dim_per_head);
-        auto call_ptr =
-            decltype(&efficient_attention_forward_decoder_ck_out_impl<kThreadsPerWavefront,
-                                                                      kWavefrontsPerBlock>){};
-
-#define SWITCH_CASE_SET_CALLPTR(n)                                                              \
-    case(n):                                                                                    \
-        call_ptr = &efficient_attention_forward_decoder_ck_out_impl<kThreadsPerWavefront, (n)>; \
+    const int32_t n_keys = std::stoi(args[0]);
+    const int32_t padding = std::stoi(args[1]);
+    const int32_t batch_size = std::stoi(args[2]);
+    const int32_t n_heads = std::stoi(args[3]);
+    const int32_t n_groups = 1;
+    const int32_t multiquery = (args[4] == "mq");
+    const auto dtype = (args[5] == "f32")
+        ? torch::kFloat32
+        : (args[5] == "f16") ? torch::kFloat16 : torch::kBFloat16;
+    const int32_t n_wavefronts_per_block = std::stoi(args[6]);
+
+    const int32_t dim_per_head = 4 * kThreadsPerWavefront;
+
+    const auto options = torch::TensorOptions()
+                             .dtype(dtype)
+                             .layout(torch::kStrided)
+                             .device(torch::kCUDA, 1)
+                             .requires_grad(false);
+
+    const auto int_options = options.dtype(torch::kInt);
+    const auto Q =
+        at::rand({batch_size, 1, n_groups, n_heads, dim_per_head}, options);
+    const auto K = multiquery
+        ? at::rand({batch_size, padding, n_groups, 1, dim_per_head}, options)
+              .expand({batch_size, padding, n_groups, n_heads, dim_per_head})
+        : at::rand(
+              {batch_size, padding, n_groups, n_heads, dim_per_head}, options);
+    const auto V = at::rand_like(K);
+    auto O = at::empty_like(Q);
+
+    const auto seq = at::randint(1, n_keys, {batch_size}, int_options);
+    const double qk_scale = 1. / sqrt(dim_per_head);
+    auto call_ptr = decltype(&efficient_attention_forward_decoder_ck_out_impl<
+                             kThreadsPerWavefront,
+                             kWavefrontsPerBlock>){};
+
+#define SWITCH_CASE_SET_CALLPTR(n)                               \
+  case (n):                                                      \
+    call_ptr = &efficient_attention_forward_decoder_ck_out_impl< \
+        kThreadsPerWavefront,                                    \
+        (n)>;                                                    \
+    break;
+
+    switch (n_wavefronts_per_block) {
+      SWITCH_CASE_SET_CALLPTR(1);
+      SWITCH_CASE_SET_CALLPTR(2);
+      SWITCH_CASE_SET_CALLPTR(4);
+      SWITCH_CASE_SET_CALLPTR(8);
+      SWITCH_CASE_SET_CALLPTR(16);
+
+      default:
+        call_ptr = nullptr;
         break;
-
-        switch(n_wavefronts_per_block)
-        {
-            SWITCH_CASE_SET_CALLPTR(1);
-            SWITCH_CASE_SET_CALLPTR(2);
-            SWITCH_CASE_SET_CALLPTR(4);
-            SWITCH_CASE_SET_CALLPTR(8);
-            SWITCH_CASE_SET_CALLPTR(16);
-
-        default: call_ptr = nullptr; break;
-        }
+    }
 #undef SWITCH_CASE_SET_CALLPTR
 
-        if(call_ptr)
-        {
-            call_ptr(Q, K, V, seq, qk_scale, O);
-        }
-        else
-        {
-            std::cout << "Warning: no kernel was found for wavefronts_per_block="
-                      << n_wavefronts_per_block << std::endl;
-        }
+    if (call_ptr) {
+      call_ptr(Q, K, V, seq, qk_scale, O);
+    } else {
+      std::cout << "Warning: no kernel was found for wavefronts_per_block="
+                << n_wavefronts_per_block << std::endl;
     }
-    return 0;
+  }
+  return 0;
 }
 
 #endif // MAIN
\ No newline at end of file
diff --git a/xformers/csrc/attention/hip_fmha/attention_forward_generic.cpp b/xformers/csrc/attention/hip_fmha/attention_forward_generic.cpp
index c4bbc72eb..5060b03c8 100644
--- a/xformers/csrc/attention/hip_fmha/attention_forward_generic.cpp
+++ b/xformers/csrc/attention/hip_fmha/attention_forward_generic.cpp
@@ -24,10 +24,18 @@
 #include "ck_fmha_params.h"
 #include "ck_fmha_util.h"
 
-extern void batched_forward_fp16(BatchedForwardParams& param, hipStream_t stream);
-extern void batched_forward_bp16(BatchedForwardParams& param, hipStream_t stream);
-extern void grouped_forward_fp16(GroupedForwardParams& param, hipStream_t stream);
-extern void grouped_forward_bp16(GroupedForwardParams& param, hipStream_t stream);
+extern void batched_forward_fp16(
+    BatchedForwardParams& param,
+    hipStream_t stream);
+extern void batched_forward_bp16(
+    BatchedForwardParams& param,
+    hipStream_t stream);
+extern void grouped_forward_fp16(
+    GroupedForwardParams& param,
+    hipStream_t stream);
+extern void grouped_forward_bp16(
+    GroupedForwardParams& param,
+    hipStream_t stream);
 
 extern void batched_infer_fp16(BatchedForwardParams& param, hipStream_t stream);
 extern void batched_infer_bp16(BatchedForwardParams& param, hipStream_t stream);
@@ -41,10 +49,11 @@ namespace {
   (Mode BMHK) With all the heads having the same seqlen
   (Mode 1MHK) `batch=1` with all tokens across batches concatenated
 */
-std::tuple<at::Tensor, at::Tensor, int64_t, int64_t> efficient_attention_forward_ck(
-    const at::Tensor& query,               // [b, seqlen, num_heads_q, K]
-    const at::Tensor& key,                 // [b, seqlen, num_heads_kv, K]
-    const at::Tensor& value,               // [b, seqlen, num_heads_kv, Kv]
+std::tuple<at::Tensor, at::Tensor, int64_t, int64_t>
+efficient_attention_forward_ck(
+    const at::Tensor& query, // [b, seqlen, num_heads_q, K]
+    const at::Tensor& key, // [b, seqlen, num_heads_kv, K]
+    const at::Tensor& value, // [b, seqlen, num_heads_kv, Kv]
     const c10::optional<at::Tensor>& bias, // [b, num_heads_q, seqlen, seqlen]
     // (Mode 1MHK only) [b+1]: cu_seqlens_q[b] contains the
     // position of the first query token for batch $b
@@ -59,380 +68,358 @@ std::tuple<at::Tensor, at::Tensor, int64_t, int64_t> efficient_attention_forward
     int64_t custom_mask_type,
     c10::optional<double> scale,
     const c10::optional<at::Tensor>& seqlen_k,
-    const c10::optional<int64_t> window_size)
-{
-    std::ignore = window_size;
-
-    TORCH_CHECK(query.dim() == 4);
-    TORCH_CHECK(key.dim() == 4);
-    TORCH_CHECK(value.dim() == 4);
-
-    // Batch sizes
-    TORCH_CHECK(query.size(0) == key.size(0));
-    TORCH_CHECK(query.size(0) == value.size(0));
-
-    // Sequence length
-    TORCH_CHECK(key.size(1) == value.size(1));
-
-    // Num heads
-    TORCH_CHECK(query.size(2) % key.size(2) == 0);
-    TORCH_CHECK(key.size(2) == value.size(2));
-
-    // Embedding per head
-    TORCH_CHECK(query.size(3) == key.size(3));
-
-    TORCH_CHECK(query.scalar_type() == key.scalar_type());
-    TORCH_CHECK(query.scalar_type() == value.scalar_type());
-
-    TORCH_CHECK(seqstart_q.has_value() == seqstart_k.has_value());
-    if(seqstart_q.has_value())
-    {
-        TORCH_CHECK(seqstart_q->scalar_type() == at::ScalarType::Int);
-        TORCH_CHECK(seqstart_k->scalar_type() == at::ScalarType::Int);
-        TORCH_CHECK(seqstart_q->dim() == 1 && seqstart_k->dim() == 1);
-        CHECK_NOSPARSE_CONTIGUOUS_CPU((*seqstart_q));
-        CHECK_NOSPARSE_CONTIGUOUS_CPU((*seqstart_k));
-        TORCH_CHECK(seqstart_q->size(0) == seqstart_k->size(0));
-        TORCH_CHECK(query.size(0) == 1, "cu_seqlen only supports batch_size=1");
-        TORCH_CHECK(max_seqlen_q_.has_value());
-    };
-
-    // last dim is contiguous, device is kCUDA
-    CHECK_NOSPARSE_LASTCONTIGUOUS_CUDA(query);
-    CHECK_NOSPARSE_LASTCONTIGUOUS_CUDA(key);
-    CHECK_NOSPARSE_LASTCONTIGUOUS_CUDA(value);
-
-    // at::cuda::CUDAGuard device_guard(query.device());
-    hipStream_t stream = at::cuda::getCurrentHIPStream().stream();
-
-    int64_t B   = query.size(0);
-    int64_t M   = query.size(1);
-    int64_t N   = key.size(1);
-    int64_t Hq  = query.size(-2);
-    int64_t Hkv = key.size(-2);
-    int64_t K   = query.size(-1);
-    int64_t Kv  = value.size(-1);
-
-    auto opts = query.options();
-
-    at::Tensor logsumexp;
-
-    at::Tensor out = at::empty({B, M, Hq, Kv}, opts);
-
-    const bool use_dropout = std::fpclassify(dropout_p) != FP_ZERO;
-    int64_t philox_seed;
-    int64_t philox_offset;
-
-    if(use_dropout)
-    {
-        at::PhiloxCudaState rng_engine_inputs;
-        at::CUDAGeneratorImpl* gen = at::get_generator_or_default<at::CUDAGeneratorImpl>(
+    const c10::optional<int64_t> window_size) {
+  std::ignore = window_size;
+
+  TORCH_CHECK(query.dim() == 4);
+  TORCH_CHECK(key.dim() == 4);
+  TORCH_CHECK(value.dim() == 4);
+
+  // Batch sizes
+  TORCH_CHECK(query.size(0) == key.size(0));
+  TORCH_CHECK(query.size(0) == value.size(0));
+
+  // Sequence length
+  TORCH_CHECK(key.size(1) == value.size(1));
+
+  // Num heads
+  TORCH_CHECK(query.size(2) % key.size(2) == 0);
+  TORCH_CHECK(key.size(2) == value.size(2));
+
+  // Embedding per head
+  TORCH_CHECK(query.size(3) == key.size(3));
+
+  TORCH_CHECK(query.scalar_type() == key.scalar_type());
+  TORCH_CHECK(query.scalar_type() == value.scalar_type());
+
+  TORCH_CHECK(seqstart_q.has_value() == seqstart_k.has_value());
+  if (seqstart_q.has_value()) {
+    TORCH_CHECK(seqstart_q->scalar_type() == at::ScalarType::Int);
+    TORCH_CHECK(seqstart_k->scalar_type() == at::ScalarType::Int);
+    TORCH_CHECK(seqstart_q->dim() == 1 && seqstart_k->dim() == 1);
+    CHECK_NOSPARSE_CONTIGUOUS_CPU((*seqstart_q));
+    CHECK_NOSPARSE_CONTIGUOUS_CPU((*seqstart_k));
+    TORCH_CHECK(seqstart_q->size(0) == seqstart_k->size(0));
+    TORCH_CHECK(query.size(0) == 1, "cu_seqlen only supports batch_size=1");
+    TORCH_CHECK(max_seqlen_q_.has_value());
+  };
+
+  // last dim is contiguous, device is kCUDA
+  CHECK_NOSPARSE_LASTCONTIGUOUS_CUDA(query);
+  CHECK_NOSPARSE_LASTCONTIGUOUS_CUDA(key);
+  CHECK_NOSPARSE_LASTCONTIGUOUS_CUDA(value);
+
+  // at::cuda::CUDAGuard device_guard(query.device());
+  hipStream_t stream = at::cuda::getCurrentHIPStream().stream();
+
+  int64_t B = query.size(0);
+  int64_t M = query.size(1);
+  int64_t N = key.size(1);
+  int64_t Hq = query.size(-2);
+  int64_t Hkv = key.size(-2);
+  int64_t K = query.size(-1);
+  int64_t Kv = value.size(-1);
+
+  auto opts = query.options();
+
+  at::Tensor logsumexp;
+
+  at::Tensor out = at::empty({B, M, Hq, Kv}, opts);
+
+  const bool use_dropout = std::fpclassify(dropout_p) != FP_ZERO;
+  int64_t philox_seed;
+  int64_t philox_offset;
+
+  if (use_dropout) {
+    at::PhiloxCudaState rng_engine_inputs;
+    at::CUDAGeneratorImpl* gen =
+        at::get_generator_or_default<at::CUDAGeneratorImpl>(
             c10::nullopt, at::cuda::detail::getDefaultCUDAGenerator());
 
-        std::lock_guard<std::mutex> lock(gen->mutex_);
-        // if using dropout, we produce 1 random number for each element of the
-        // attention tensor
-        rng_engine_inputs = gen->philox_cuda_state(B * Hq * M * N);
+    std::lock_guard<std::mutex> lock(gen->mutex_);
+    // if using dropout, we produce 1 random number for each element of the
+    // attention tensor
+    rng_engine_inputs = gen->philox_cuda_state(B * Hq * M * N);
+
+    const auto seeds = at::cuda::philox::unpack(rng_engine_inputs);
+
+    philox_seed = std::get<0>(seeds);
+    philox_offset = std::get<1>(seeds);
+  }
+
+  auto set_batched_forward_params = [&](BatchedForwardParams& p) {
+    p.B = B;
+    p.M = M;
+    p.N = N;
+    p.Hq = Hq;
+    p.Hkv = Hkv;
+    p.K = K;
+    p.Kv = Kv;
+
+    if (scale.has_value()) {
+      p.scale = float(*scale);
+    } else {
+      p.scale = float(1.0 / std::sqrt(float(K)));
+    }
 
-        const auto seeds = at::cuda::philox::unpack(rng_engine_inputs);
+    p.q_ptr = query.data_ptr();
+    p.k_ptr = key.data_ptr();
+    p.v_ptr = value.data_ptr();
+    p.out_ptr = out.data_ptr();
+
+    p.q_strides = {
+        static_cast<int>(query.stride(0)),
+        static_cast<int>(query.stride(1)),
+        static_cast<int>(query.stride(2)),
+        static_cast<int>(query.stride(3))};
+    p.k_strides = {
+        static_cast<int>(key.stride(0)),
+        static_cast<int>(key.stride(1)),
+        static_cast<int>(key.stride(2)),
+        static_cast<int>(key.stride(3))};
+    p.v_strides = {
+        static_cast<int>(value.stride(0)),
+        static_cast<int>(value.stride(1)),
+        static_cast<int>(value.stride(2)),
+        static_cast<int>(value.stride(3))};
+    p.out_strides = {
+        static_cast<int>(out.stride(0)),
+        static_cast<int>(out.stride(1)),
+        static_cast<int>(out.stride(2)),
+        static_cast<int>(out.stride(3))};
+
+    if (bias.has_value()) {
+      CHECK_NOSPARSE_LASTCONTIGUOUS_CUDA((*bias));
+      TORCH_CHECK(bias->scalar_type() == query.scalar_type());
+
+      p.has_attn_bias = true;
+      p.attn_bias_ptr = bias->data_ptr();
+
+      const at::Tensor bias_4d_view = get_bias_4d_view(*bias, B, Hq, M, N);
+      p.attn_bias_strides = {
+          static_cast<int>(bias_4d_view.stride(0)),
+          static_cast<int>(bias_4d_view.stride(1)),
+          static_cast<int>(bias_4d_view.stride(2)),
+          static_cast<int>(bias_4d_view.stride(3))};
+    } else
+      p.has_attn_bias = false;
+
+    p.custom_mask_type = custom_mask_type;
+
+    p.use_dropout = use_dropout;
+    p.philox_seed = philox_seed;
+    p.philox_offset = philox_offset;
+    p.compute_logsumexp = compute_logsumexp;
+
+    // the following parameters are only used by training forward
+    if (p.use_dropout)
+      p.dropout_prob = static_cast<float>(dropout_p);
+    else
+      p.dropout_prob = 0.0f;
+
+    if (p.compute_logsumexp) {
+      logsumexp = at::empty({B, Hq, M}, opts.dtype(at::kFloat));
+      p.logsumexp_ptr = logsumexp.data_ptr();
+    } else
+      p.logsumexp_ptr = nullptr;
+  };
+
+  auto set_grouped_forward_params = [&](GroupedForwardParams& p) {
+    p.num_batches = seqstart_q->size(0) - 1;
+    p.M = M;
+    p.N = N;
+    p.Hq = Hq;
+    p.Hkv = Hkv;
+    p.K = K;
+    p.Kv = Kv;
+
+    if (scale.has_value()) {
+      p.scale = float(*scale);
+    } else {
+      p.scale = float(1.0 / std::sqrt(float(K)));
+    }
 
-        philox_seed   = std::get<0>(seeds);
-        philox_offset = std::get<1>(seeds);
+    p.q_strides = {
+        static_cast<int>(query.stride(1)),
+        static_cast<int>(query.stride(2)),
+        static_cast<int>(query.stride(3))};
+    p.k_strides = {
+        static_cast<int>(key.stride(1)),
+        static_cast<int>(key.stride(2)),
+        static_cast<int>(key.stride(3))};
+    p.v_strides = {
+        static_cast<int>(value.stride(1)),
+        static_cast<int>(value.stride(2)),
+        static_cast<int>(value.stride(3))};
+    p.out_strides = {
+        static_cast<int>(out.stride(1)),
+        static_cast<int>(out.stride(2)),
+        static_cast<int>(out.stride(3))};
+
+    if (bias.has_value()) {
+      CHECK_NOSPARSE_LASTCONTIGUOUS_CUDA((*bias));
+      TORCH_CHECK(bias->scalar_type() == query.scalar_type());
+
+      p.has_attn_bias = true;
+      const at::Tensor bias_4d_view = get_bias_4d_view(*bias, B, Hq, M, N);
+      p.attn_bias_strides = {
+          static_cast<int>(bias_4d_view.stride(0)),
+          static_cast<int>(bias_4d_view.stride(1)),
+          static_cast<int>(bias_4d_view.stride(2)),
+          static_cast<int>(bias_4d_view.stride(3))};
+    } else
+      p.has_attn_bias = false;
+
+    p.custom_mask_type = custom_mask_type;
+
+    // max_seqlen_q is used to create logsumexp tensor
+    p.max_seqlen_q = *max_seqlen_q_;
+
+    p.host_seqstart_q.resize(p.num_batches + 1);
+    p.host_seqstart_k.resize(p.num_batches + 1);
+
+    for (int i = 0; i < p.host_seqstart_q.size(); i++)
+      p.host_seqstart_q[i] =
+          *(reinterpret_cast<int*>(seqstart_q->data_ptr()) + i);
+
+    for (int i = 0; i < p.host_seqstart_k.size(); i++)
+      p.host_seqstart_k[i] =
+          *(reinterpret_cast<int*>(seqstart_k->data_ptr()) + i);
+
+    if (seqlen_k.has_value()) {
+      TORCH_CHECK(seqlen_k->scalar_type() == at::ScalarType::Int);
+      TORCH_CHECK(seqlen_k->dim() == 1);
+      TORCH_CHECK(seqlen_k->size(0) == p.num_batches)
+      CHECK_NOSPARSE_CONTIGUOUS_CPU((*seqlen_k));
+
+      p.host_seqlen_k.resize(p.num_batches);
+
+      for (int i = 0; i < p.host_seqlen_k.size(); i++)
+        p.host_seqlen_k[i] =
+            *(reinterpret_cast<int*>(seqlen_k->data_ptr()) + i);
     }
 
-    auto set_batched_forward_params = [&](BatchedForwardParams& p) {
-        p.B   = B;
-        p.M   = M;
-        p.N   = N;
-        p.Hq  = Hq;
-        p.Hkv = Hkv;
-        p.K   = K;
-        p.Kv  = Kv;
-
-        if(scale.has_value())
-        {
-            p.scale = float(*scale);
-        }
-        else
-        {
-            p.scale = float(1.0 / std::sqrt(float(K)));
-        }
-
-        p.q_ptr   = query.data_ptr();
-        p.k_ptr   = key.data_ptr();
-        p.v_ptr   = value.data_ptr();
-        p.out_ptr = out.data_ptr();
-
-        p.q_strides   = {static_cast<int>(query.stride(0)),
-                       static_cast<int>(query.stride(1)),
-                       static_cast<int>(query.stride(2)),
-                       static_cast<int>(query.stride(3))};
-        p.k_strides   = {static_cast<int>(key.stride(0)),
-                       static_cast<int>(key.stride(1)),
-                       static_cast<int>(key.stride(2)),
-                       static_cast<int>(key.stride(3))};
-        p.v_strides   = {static_cast<int>(value.stride(0)),
-                       static_cast<int>(value.stride(1)),
-                       static_cast<int>(value.stride(2)),
-                       static_cast<int>(value.stride(3))};
-        p.out_strides = {static_cast<int>(out.stride(0)),
-                         static_cast<int>(out.stride(1)),
-                         static_cast<int>(out.stride(2)),
-                         static_cast<int>(out.stride(3))};
-
-        if(bias.has_value())
-        {
-            CHECK_NOSPARSE_LASTCONTIGUOUS_CUDA((*bias));
-            TORCH_CHECK(bias->scalar_type() == query.scalar_type());
-
-            p.has_attn_bias = true;
-            p.attn_bias_ptr = bias->data_ptr();
-
-            const at::Tensor bias_4d_view = get_bias_4d_view(*bias, B, Hq, M, N);
-            p.attn_bias_strides           = {static_cast<int>(bias_4d_view.stride(0)),
-                                   static_cast<int>(bias_4d_view.stride(1)),
-                                   static_cast<int>(bias_4d_view.stride(2)),
-                                   static_cast<int>(bias_4d_view.stride(3))};
-        }
-        else
-            p.has_attn_bias = false;
-
-        p.custom_mask_type = custom_mask_type;
-
-        p.use_dropout       = use_dropout;
-        p.philox_seed       = philox_seed;
-        p.philox_offset     = philox_offset;
-        p.compute_logsumexp = compute_logsumexp;
-
-        // the following parameters are only used by training forward
-        if(p.use_dropout)
-            p.dropout_prob = static_cast<float>(dropout_p);
-        else
-            p.dropout_prob = 0.0f;
-
-        if(p.compute_logsumexp)
-        {
-            logsumexp       = at::empty({B, Hq, M}, opts.dtype(at::kFloat));
-            p.logsumexp_ptr = logsumexp.data_ptr();
-        }
-        else
-            p.logsumexp_ptr = nullptr;
-    };
+    char* q_ptr = reinterpret_cast<char*>(query.data_ptr());
+    char* k_ptr = reinterpret_cast<char*>(key.data_ptr());
+    char* v_ptr = reinterpret_cast<char*>(value.data_ptr());
+
+    char* out_ptr = reinterpret_cast<char*>(out.data_ptr());
+    char* attn_bias_ptr =
+        bias.has_value() ? reinterpret_cast<char*>(bias->data_ptr()) : nullptr;
+
+    for (int i = 0; i < p.num_batches; i++) {
+      size_t tmp_q_offset = get_size_in_bytes(
+          static_cast<size_t>(p.host_seqstart_q[i]) * p.q_strides[0],
+          query.scalar_type());
+      size_t tmp_k_offset = get_size_in_bytes(
+          static_cast<size_t>(p.host_seqstart_k[i]) * p.k_strides[0],
+          key.scalar_type());
+      size_t tmp_v_offset = get_size_in_bytes(
+          static_cast<size_t>(p.host_seqstart_k[i]) * p.v_strides[0],
+          value.scalar_type());
+      size_t tmp_o_offset = get_size_in_bytes(
+          static_cast<size_t>(p.host_seqstart_q[i]) * p.out_strides[0],
+          out.scalar_type());
+
+      p.q_ptrs.push_back(reinterpret_cast<void*>(&q_ptr[tmp_q_offset]));
+      p.k_ptrs.push_back(reinterpret_cast<void*>(&k_ptr[tmp_k_offset]));
+      p.v_ptrs.push_back(reinterpret_cast<void*>(&v_ptr[tmp_v_offset]));
+      p.out_ptrs.push_back(reinterpret_cast<void*>(&out_ptr[tmp_o_offset]));
+
+      if (bias.has_value()) {
+        size_t tmp_bias_offset = get_size_in_bytes(
+            static_cast<size_t>(p.host_seqstart_q[i]) * p.attn_bias_strides[2] +
+                static_cast<size_t>(p.host_seqstart_k[i]) *
+                    p.attn_bias_strides[3],
+            bias->scalar_type());
+
+        p.attn_bias_ptrs.push_back(
+            reinterpret_cast<void*>(&attn_bias_ptr[tmp_bias_offset]));
+      };
+
+      // ToDO: remove this after dev-op fix
+      p.randvals_ptrs.push_back(nullptr);
+    }
 
-    auto set_grouped_forward_params = [&](GroupedForwardParams& p) {
-        p.num_batches = seqstart_q->size(0) - 1;
-        p.M           = M;
-        p.N           = N;
-        p.Hq          = Hq;
-        p.Hkv         = Hkv;
-        p.K           = K;
-        p.Kv          = Kv;
-
-        if(scale.has_value())
-        {
-            p.scale = float(*scale);
-        }
-        else
-        {
-            p.scale = float(1.0 / std::sqrt(float(K)));
-        }
-
-        p.q_strides   = {static_cast<int>(query.stride(1)),
-                       static_cast<int>(query.stride(2)),
-                       static_cast<int>(query.stride(3))};
-        p.k_strides   = {static_cast<int>(key.stride(1)),
-                       static_cast<int>(key.stride(2)),
-                       static_cast<int>(key.stride(3))};
-        p.v_strides   = {static_cast<int>(value.stride(1)),
-                       static_cast<int>(value.stride(2)),
-                       static_cast<int>(value.stride(3))};
-        p.out_strides = {static_cast<int>(out.stride(1)),
-                         static_cast<int>(out.stride(2)),
-                         static_cast<int>(out.stride(3))};
-
-        if(bias.has_value())
-        {
-            CHECK_NOSPARSE_LASTCONTIGUOUS_CUDA((*bias));
-            TORCH_CHECK(bias->scalar_type() == query.scalar_type());
-
-            p.has_attn_bias               = true;
-            const at::Tensor bias_4d_view = get_bias_4d_view(*bias, B, Hq, M, N);
-            p.attn_bias_strides           = {static_cast<int>(bias_4d_view.stride(0)),
-                                   static_cast<int>(bias_4d_view.stride(1)),
-                                   static_cast<int>(bias_4d_view.stride(2)),
-                                   static_cast<int>(bias_4d_view.stride(3))};
-        }
-        else
-            p.has_attn_bias = false;
-
-        p.custom_mask_type = custom_mask_type;
-
-        // max_seqlen_q is used to create logsumexp tensor
-        p.max_seqlen_q = *max_seqlen_q_;
-
-        p.host_seqstart_q.resize(p.num_batches + 1);
-        p.host_seqstart_k.resize(p.num_batches + 1);
-
-        for(int i = 0; i < p.host_seqstart_q.size(); i++)
-            p.host_seqstart_q[i] = *(reinterpret_cast<int*>(seqstart_q->data_ptr()) + i);
-
-        for(int i = 0; i < p.host_seqstart_k.size(); i++)
-            p.host_seqstart_k[i] = *(reinterpret_cast<int*>(seqstart_k->data_ptr()) + i);
-
-        if(seqlen_k.has_value())
-        {
-            TORCH_CHECK(seqlen_k->scalar_type() == at::ScalarType::Int);
-            TORCH_CHECK(seqlen_k->dim() == 1);
-            TORCH_CHECK(seqlen_k->size(0) == p.num_batches)
-            CHECK_NOSPARSE_CONTIGUOUS_CPU((*seqlen_k));
-
-            p.host_seqlen_k.resize(p.num_batches);
-
-            for(int i = 0; i < p.host_seqlen_k.size(); i++)
-                p.host_seqlen_k[i] = *(reinterpret_cast<int*>(seqlen_k->data_ptr()) + i);
-        }
-
-        char* q_ptr = reinterpret_cast<char*>(query.data_ptr());
-        char* k_ptr = reinterpret_cast<char*>(key.data_ptr());
-        char* v_ptr = reinterpret_cast<char*>(value.data_ptr());
-
-        char* out_ptr = reinterpret_cast<char*>(out.data_ptr());
-        char* attn_bias_ptr =
-            bias.has_value() ? reinterpret_cast<char*>(bias->data_ptr()) : nullptr;
-
-        for(int i = 0; i < p.num_batches; i++)
-        {
-            size_t tmp_q_offset = get_size_in_bytes(
-                static_cast<size_t>(p.host_seqstart_q[i]) * p.q_strides[0], query.scalar_type());
-            size_t tmp_k_offset = get_size_in_bytes(
-                static_cast<size_t>(p.host_seqstart_k[i]) * p.k_strides[0], key.scalar_type());
-            size_t tmp_v_offset = get_size_in_bytes(
-                static_cast<size_t>(p.host_seqstart_k[i]) * p.v_strides[0], value.scalar_type());
-            size_t tmp_o_offset = get_size_in_bytes(
-                static_cast<size_t>(p.host_seqstart_q[i]) * p.out_strides[0], out.scalar_type());
-
-            p.q_ptrs.push_back(reinterpret_cast<void*>(&q_ptr[tmp_q_offset]));
-            p.k_ptrs.push_back(reinterpret_cast<void*>(&k_ptr[tmp_k_offset]));
-            p.v_ptrs.push_back(reinterpret_cast<void*>(&v_ptr[tmp_v_offset]));
-            p.out_ptrs.push_back(reinterpret_cast<void*>(&out_ptr[tmp_o_offset]));
-
-            if(bias.has_value())
-            {
-                size_t tmp_bias_offset = get_size_in_bytes(
-                    static_cast<size_t>(p.host_seqstart_q[i]) * p.attn_bias_strides[2] +
-                        static_cast<size_t>(p.host_seqstart_k[i]) * p.attn_bias_strides[3],
-                    bias->scalar_type());
-
-                p.attn_bias_ptrs.push_back(
-                    reinterpret_cast<void*>(&attn_bias_ptr[tmp_bias_offset]));
-            };
-
-            // ToDO: remove this after dev-op fix
-            p.randvals_ptrs.push_back(nullptr);
-        }
-
-        p.use_dropout       = use_dropout;
-        p.philox_seed       = philox_seed;
-        p.philox_offset     = philox_offset;
-        p.compute_logsumexp = compute_logsumexp;
-
-        // the following parameters are only used by training forward
-        if(p.use_dropout)
-            p.dropout_prob = static_cast<float>(dropout_p);
-        else
-            p.dropout_prob = 0.0f;
-
-        if(p.compute_logsumexp)
-        {
-            logsumexp = at::empty({p.num_batches, Hq, p.max_seqlen_q}, opts.dtype(at::kFloat));
-            char* logsumexp_ptr = reinterpret_cast<char*>(logsumexp.data_ptr());
-
-            for(int i = 0; i < p.num_batches; i++)
-            {
-                size_t tmp_logsumexp_offset = get_size_in_bytes(
-                    static_cast<size_t>(i) * Hq * p.max_seqlen_q, logsumexp.scalar_type());
-                p.logsumexp_ptrs.push_back(
-                    reinterpret_cast<void*>(&logsumexp_ptr[tmp_logsumexp_offset]));
-            };
-        };
-    };
+    p.use_dropout = use_dropout;
+    p.philox_seed = philox_seed;
+    p.philox_offset = philox_offset;
+    p.compute_logsumexp = compute_logsumexp;
 
-    auto inDataType = query.scalar_type();
-
-    if(!seqstart_q.has_value())
-    { // input is batched
-        BatchedForwardParams batched_forward_params;
-
-        set_batched_forward_params(batched_forward_params);
-
-        if(!batched_forward_params.use_dropout && !batched_forward_params.compute_logsumexp)
-        {
-            if(inDataType == at::ScalarType::Half)
-            {
-                batched_infer_fp16(batched_forward_params, stream);
-            }
-            else if(inDataType == at::ScalarType::BFloat16)
-            {
-                batched_infer_bp16(batched_forward_params, stream);
-            }
-            else
-                throw std::runtime_error("input data-type is not supported!");
-        }
-        else
-        {
-            if(inDataType == at::ScalarType::Half)
-            {
-                batched_forward_fp16(batched_forward_params, stream);
-            }
-            else if(inDataType == at::ScalarType::BFloat16)
-            {
-                batched_forward_bp16(batched_forward_params, stream);
-            }
-            else
-                throw std::runtime_error("input data-type is not supported!");
-        };
-    }
+    // the following parameters are only used by training forward
+    if (p.use_dropout)
+      p.dropout_prob = static_cast<float>(dropout_p);
     else
-    { // input is grouped
-        GroupedForwardParams grouped_forward_params;
-
-        set_grouped_forward_params(grouped_forward_params);
-
-        if(!grouped_forward_params.use_dropout && !grouped_forward_params.compute_logsumexp)
-        {
-            if(inDataType == at::ScalarType::Half)
-            {
-                grouped_infer_fp16(grouped_forward_params, stream);
-            }
-            else if(inDataType == at::ScalarType::BFloat16)
-            {
-                grouped_infer_bp16(grouped_forward_params, stream);
-            }
-            else
-                throw std::runtime_error("input data-type is not supported!");
-        }
-        else
-        {
-            if(inDataType == at::ScalarType::Half)
-            {
-                grouped_forward_fp16(grouped_forward_params, stream);
-            }
-            else if(inDataType == at::ScalarType::BFloat16)
-            {
-                grouped_forward_bp16(grouped_forward_params, stream);
-            }
-            else
-                throw std::runtime_error("input data-type is not supported!");
-        };
+      p.dropout_prob = 0.0f;
+
+    if (p.compute_logsumexp) {
+      logsumexp = at::empty(
+          {p.num_batches, Hq, p.max_seqlen_q}, opts.dtype(at::kFloat));
+      char* logsumexp_ptr = reinterpret_cast<char*>(logsumexp.data_ptr());
+
+      for (int i = 0; i < p.num_batches; i++) {
+        size_t tmp_logsumexp_offset = get_size_in_bytes(
+            static_cast<size_t>(i) * Hq * p.max_seqlen_q,
+            logsumexp.scalar_type());
+        p.logsumexp_ptrs.push_back(
+            reinterpret_cast<void*>(&logsumexp_ptr[tmp_logsumexp_offset]));
+      };
+    };
+  };
+
+  auto inDataType = query.scalar_type();
+
+  if (!seqstart_q.has_value()) { // input is batched
+    BatchedForwardParams batched_forward_params;
+
+    set_batched_forward_params(batched_forward_params);
+
+    if (!batched_forward_params.use_dropout &&
+        !batched_forward_params.compute_logsumexp) {
+      if (inDataType == at::ScalarType::Half) {
+        batched_infer_fp16(batched_forward_params, stream);
+      } else if (inDataType == at::ScalarType::BFloat16) {
+        batched_infer_bp16(batched_forward_params, stream);
+      } else
+        throw std::runtime_error("input data-type is not supported!");
+    } else {
+      if (inDataType == at::ScalarType::Half) {
+        batched_forward_fp16(batched_forward_params, stream);
+      } else if (inDataType == at::ScalarType::BFloat16) {
+        batched_forward_bp16(batched_forward_params, stream);
+      } else
+        throw std::runtime_error("input data-type is not supported!");
+    };
+  } else { // input is grouped
+    GroupedForwardParams grouped_forward_params;
+
+    set_grouped_forward_params(grouped_forward_params);
+
+    if (!grouped_forward_params.use_dropout &&
+        !grouped_forward_params.compute_logsumexp) {
+      if (inDataType == at::ScalarType::Half) {
+        grouped_infer_fp16(grouped_forward_params, stream);
+      } else if (inDataType == at::ScalarType::BFloat16) {
+        grouped_infer_bp16(grouped_forward_params, stream);
+      } else
+        throw std::runtime_error("input data-type is not supported!");
+    } else {
+      if (inDataType == at::ScalarType::Half) {
+        grouped_forward_fp16(grouped_forward_params, stream);
+      } else if (inDataType == at::ScalarType::BFloat16) {
+        grouped_forward_bp16(grouped_forward_params, stream);
+      } else
+        throw std::runtime_error("input data-type is not supported!");
     };
+  };
 
-    return std::make_tuple(out, logsumexp, philox_seed, philox_offset);
+  return std::make_tuple(out, logsumexp, philox_seed, philox_offset);
 }
 
 } // namespace
 
-TORCH_LIBRARY_IMPL(xformers, CUDA, m)
-{
-    m.impl(TORCH_SELECTIVE_NAME("xformers::efficient_attention_forward_ck"),
-           TORCH_FN(efficient_attention_forward_ck));
+TORCH_LIBRARY_IMPL(xformers, CUDA, m) {
+  m.impl(
+      TORCH_SELECTIVE_NAME("xformers::efficient_attention_forward_ck"),
+      TORCH_FN(efficient_attention_forward_ck));
 }
diff --git a/xformers/csrc/attention/hip_fmha/attention_forward_generic_ck_tiled.cpp b/xformers/csrc/attention/hip_fmha/attention_forward_generic_ck_tiled.cpp
index 9db1cd257..a56b87f73 100644
--- a/xformers/csrc/attention/hip_fmha/attention_forward_generic_ck_tiled.cpp
+++ b/xformers/csrc/attention/hip_fmha/attention_forward_generic_ck_tiled.cpp
@@ -21,10 +21,18 @@
 #include "ck_fmha_util.h"
 #include "ck_tiled_fmha_params.h"
 
-extern void batched_forward_fp16(BatchedForwardParams& param, hipStream_t stream);
-extern void batched_forward_bp16(BatchedForwardParams& param, hipStream_t stream);
-extern void grouped_forward_fp16(GroupedForwardParams& param, hipStream_t stream);
-extern void grouped_forward_bp16(GroupedForwardParams& param, hipStream_t stream);
+extern void batched_forward_fp16(
+    BatchedForwardParams& param,
+    hipStream_t stream);
+extern void batched_forward_bp16(
+    BatchedForwardParams& param,
+    hipStream_t stream);
+extern void grouped_forward_fp16(
+    GroupedForwardParams& param,
+    hipStream_t stream);
+extern void grouped_forward_bp16(
+    GroupedForwardParams& param,
+    hipStream_t stream);
 
 extern void batched_infer_fp16(BatchedForwardParams& param, hipStream_t stream);
 extern void batched_infer_bp16(BatchedForwardParams& param, hipStream_t stream);
@@ -38,10 +46,11 @@ namespace {
   (Mode BMHK) With all the heads having the same seqlen
   (Mode 1MHK) `batch=1` with all tokens across batches concatenated
 */
-std::tuple<at::Tensor, at::Tensor, int64_t, int64_t> efficient_attention_forward_ck(
-    const at::Tensor& query,               // [b, seqlen, num_heads_q, K]
-    const at::Tensor& key,                 // [b, seqlen, num_heads_kv, K]
-    const at::Tensor& value,               // [b, seqlen, num_heads_kv, Kv]
+std::tuple<at::Tensor, at::Tensor, int64_t, int64_t>
+efficient_attention_forward_ck(
+    const at::Tensor& query, // [b, seqlen, num_heads_q, K]
+    const at::Tensor& key, // [b, seqlen, num_heads_kv, K]
+    const at::Tensor& value, // [b, seqlen, num_heads_kv, Kv]
     const c10::optional<at::Tensor>& bias, // [b, num_heads_q, seqlen, seqlen]
     // (Mode 1MHK only) [b+1]: cu_seqlens_q[b] contains the
     // position of the first query token for batch $b
@@ -56,390 +65,357 @@ std::tuple<at::Tensor, at::Tensor, int64_t, int64_t> efficient_attention_forward
     int64_t custom_mask_type,
     c10::optional<double> scale,
     const c10::optional<at::Tensor>& seqlen_k,
-    const c10::optional<int64_t> window_size)
-{
-    TORCH_CHECK(query.dim() == 4);
-    TORCH_CHECK(key.dim() == 4);
-    TORCH_CHECK(value.dim() == 4);
-
-    // Batch sizes
-    TORCH_CHECK(query.size(0) == key.size(0));
-    TORCH_CHECK(query.size(0) == value.size(0));
-
-    // Sequence length
-    TORCH_CHECK(key.size(1) == value.size(1));
-
-    // Num heads
-    TORCH_CHECK(query.size(2) % key.size(2) == 0);
-    TORCH_CHECK(key.size(2) == value.size(2));
-
-    // Embedding per head
-    TORCH_CHECK(query.size(3) == key.size(3));
-
-    TORCH_CHECK(query.scalar_type() == key.scalar_type());
-    TORCH_CHECK(query.scalar_type() == value.scalar_type());
-
-    TORCH_CHECK(seqstart_q.has_value() == seqstart_k.has_value());
-    if(seqstart_q.has_value())
-    {
-        TORCH_CHECK(seqstart_q->scalar_type() == at::ScalarType::Int);
-        TORCH_CHECK(seqstart_k->scalar_type() == at::ScalarType::Int);
-        TORCH_CHECK(seqstart_q->dim() == 1 && seqstart_k->dim() == 1);
-        TORCH_CHECK(seqstart_q->size(0) == seqstart_k->size(0));
-        TORCH_CHECK(query.size(0) == 1, "cu_seqlen only supports batch_size=1");
-        TORCH_CHECK(max_seqlen_q_.has_value());
-    };
-
-    // last dim is contiguous, device is kCUDA
-    CHECK_NOSPARSE_LASTCONTIGUOUS_CUDA(query);
-    CHECK_NOSPARSE_LASTCONTIGUOUS_CUDA(key);
-    CHECK_NOSPARSE_LASTCONTIGUOUS_CUDA(value);
-
-    // at::cuda::CUDAGuard device_guard(query.device());
-    hipStream_t stream = at::cuda::getCurrentHIPStream().stream();
-
-    int64_t B   = query.size(0);
-    int64_t M   = query.size(1);
-    int64_t N   = key.size(1);
-    int64_t Hq  = query.size(-2);
-    int64_t Hkv = key.size(-2);
-    int64_t K   = query.size(-1);
-    int64_t Kv  = value.size(-1);
-
-    auto opts = query.options();
-
-    at::Tensor logsumexp;
-
-    at::Tensor out = at::empty({B, M, Hq, Kv}, opts);
-
-    const bool use_dropout = std::fpclassify(dropout_p) != FP_ZERO;
-    int64_t philox_seed;
-    int64_t philox_offset;
-
-    if(use_dropout)
-    {
-        /*
-        at::PhiloxCudaState rng_engine_inputs;
-        at::CUDAGeneratorImpl* gen =
-            at::get_generator_or_default<at::CUDAGeneratorImpl>(
-                c10::nullopt, at::cuda::detail::getDefaultCUDAGenerator());
-
-        std::lock_guard<std::mutex> lock(gen->mutex_);
-        // if using dropout, we produce 1 random number for each element of the
-        // attention tensor
-        rng_engine_inputs = gen->philox_cuda_state(B * Hq * M * N);
-
-        const auto seeds = at::cuda::philox::unpack(rng_engine_inputs);
-
-        philox_seed = std::get<0>(seeds);
-        philox_offset = std::get<1>(seeds);
-        */
-        throw std::runtime_error("drop-out is currently not implemented by ck-tiled!");
+    const c10::optional<int64_t> window_size) {
+  TORCH_CHECK(query.dim() == 4);
+  TORCH_CHECK(key.dim() == 4);
+  TORCH_CHECK(value.dim() == 4);
+
+  // Batch sizes
+  TORCH_CHECK(query.size(0) == key.size(0));
+  TORCH_CHECK(query.size(0) == value.size(0));
+
+  // Sequence length
+  TORCH_CHECK(key.size(1) == value.size(1));
+
+  // Num heads
+  TORCH_CHECK(query.size(2) % key.size(2) == 0);
+  TORCH_CHECK(key.size(2) == value.size(2));
+
+  // Embedding per head
+  TORCH_CHECK(query.size(3) == key.size(3));
+
+  TORCH_CHECK(query.scalar_type() == key.scalar_type());
+  TORCH_CHECK(query.scalar_type() == value.scalar_type());
+
+  TORCH_CHECK(seqstart_q.has_value() == seqstart_k.has_value());
+  if (seqstart_q.has_value()) {
+    TORCH_CHECK(seqstart_q->scalar_type() == at::ScalarType::Int);
+    TORCH_CHECK(seqstart_k->scalar_type() == at::ScalarType::Int);
+    TORCH_CHECK(seqstart_q->dim() == 1 && seqstart_k->dim() == 1);
+    TORCH_CHECK(seqstart_q->size(0) == seqstart_k->size(0));
+    TORCH_CHECK(query.size(0) == 1, "cu_seqlen only supports batch_size=1");
+    TORCH_CHECK(max_seqlen_q_.has_value());
+  };
+
+  // last dim is contiguous, device is kCUDA
+  CHECK_NOSPARSE_LASTCONTIGUOUS_CUDA(query);
+  CHECK_NOSPARSE_LASTCONTIGUOUS_CUDA(key);
+  CHECK_NOSPARSE_LASTCONTIGUOUS_CUDA(value);
+
+  // at::cuda::CUDAGuard device_guard(query.device());
+  hipStream_t stream = at::cuda::getCurrentHIPStream().stream();
+
+  int64_t B = query.size(0);
+  int64_t M = query.size(1);
+  int64_t N = key.size(1);
+  int64_t Hq = query.size(-2);
+  int64_t Hkv = key.size(-2);
+  int64_t K = query.size(-1);
+  int64_t Kv = value.size(-1);
+
+  auto opts = query.options();
+
+  at::Tensor logsumexp;
+
+  at::Tensor out = at::empty({B, M, Hq, Kv}, opts);
+
+  const bool use_dropout = std::fpclassify(dropout_p) != FP_ZERO;
+  int64_t philox_seed;
+  int64_t philox_offset;
+
+  if (use_dropout) {
+    /*
+    at::PhiloxCudaState rng_engine_inputs;
+    at::CUDAGeneratorImpl* gen =
+        at::get_generator_or_default<at::CUDAGeneratorImpl>(
+            c10::nullopt, at::cuda::detail::getDefaultCUDAGenerator());
+
+    std::lock_guard<std::mutex> lock(gen->mutex_);
+    // if using dropout, we produce 1 random number for each element of the
+    // attention tensor
+    rng_engine_inputs = gen->philox_cuda_state(B * Hq * M * N);
+
+    const auto seeds = at::cuda::philox::unpack(rng_engine_inputs);
+
+    philox_seed = std::get<0>(seeds);
+    philox_offset = std::get<1>(seeds);
+    */
+    throw std::runtime_error(
+        "drop-out is currently not implemented by ck-tiled!");
+  }
+
+  auto set_batched_forward_params = [&](BatchedForwardParams& p) {
+    p.B = B;
+    p.M = M;
+    p.N = N;
+    p.Hq = Hq;
+    p.Hkv = Hkv;
+    p.K = K;
+    p.Kv = Kv;
+
+    if (scale.has_value()) {
+      p.scale = float(*scale);
+    } else {
+      p.scale = float(1.0 / std::sqrt(float(K)));
     }
 
-    auto set_batched_forward_params = [&](BatchedForwardParams& p) {
-        p.B   = B;
-        p.M   = M;
-        p.N   = N;
-        p.Hq  = Hq;
-        p.Hkv = Hkv;
-        p.K   = K;
-        p.Kv  = Kv;
-
-        if(scale.has_value())
-        {
-            p.scale = float(*scale);
-        }
-        else
-        {
-            p.scale = float(1.0 / std::sqrt(float(K)));
-        }
-
-        p.q_ptr   = query.data_ptr();
-        p.k_ptr   = key.data_ptr();
-        p.v_ptr   = value.data_ptr();
-        p.out_ptr = out.data_ptr();
-
-        p.q_strides   = {static_cast<int>(query.stride(0)),
-                       static_cast<int>(query.stride(1)),
-                       static_cast<int>(query.stride(2)),
-                       static_cast<int>(query.stride(3))};
-        p.k_strides   = {static_cast<int>(key.stride(0)),
-                       static_cast<int>(key.stride(1)),
-                       static_cast<int>(key.stride(2)),
-                       static_cast<int>(key.stride(3))};
-        p.v_strides   = {static_cast<int>(value.stride(0)),
-                       static_cast<int>(value.stride(1)),
-                       static_cast<int>(value.stride(2)),
-                       static_cast<int>(value.stride(3))};
-        p.out_strides = {static_cast<int>(out.stride(0)),
-                         static_cast<int>(out.stride(1)),
-                         static_cast<int>(out.stride(2)),
-                         static_cast<int>(out.stride(3))};
-
-        if(bias.has_value())
-        {
-            CHECK_NOSPARSE_LASTCONTIGUOUS_CUDA((*bias));
-            TORCH_CHECK(bias->scalar_type() == query.scalar_type());
-
-            p.has_attn_bias = true;
-            p.attn_bias_ptr = bias->data_ptr();
-
-            const at::Tensor bias_4d_view = get_bias_4d_view(*bias, B, Hq, M, N);
-            p.attn_bias_strides           = {static_cast<int>(bias_4d_view.stride(0)),
-                                   static_cast<int>(bias_4d_view.stride(1)),
-                                   static_cast<int>(bias_4d_view.stride(2)),
-                                   static_cast<int>(bias_4d_view.stride(3))};
-        }
-        else
-            p.has_attn_bias = false;
-
-        p.custom_mask_type = custom_mask_type;
-        p.window_size      = window_size.has_value() ? (*window_size > 0 ? *window_size : 0) : 0;
-
-        p.use_dropout       = use_dropout;
-        p.philox_seed       = philox_seed;
-        p.philox_offset     = philox_offset;
-        p.compute_logsumexp = compute_logsumexp;
-
-        // the following parameters are only used by training forward
-        if(p.use_dropout)
-        {
-            // p.dropout_prob = static_cast<float>(dropout_p);
-            throw std::runtime_error("drop-out is currently not implemented by ck-tiled!");
-        }
-        else
-            p.dropout_prob = 0.0f;
-
-        if(p.compute_logsumexp)
-        {
-            logsumexp       = at::empty({B, Hq, M}, opts.dtype(at::kFloat));
-            p.logsumexp_ptr = logsumexp.data_ptr();
-        }
-        else
-            p.logsumexp_ptr = nullptr;
-    };
+    p.q_ptr = query.data_ptr();
+    p.k_ptr = key.data_ptr();
+    p.v_ptr = value.data_ptr();
+    p.out_ptr = out.data_ptr();
+
+    p.q_strides = {
+        static_cast<int>(query.stride(0)),
+        static_cast<int>(query.stride(1)),
+        static_cast<int>(query.stride(2)),
+        static_cast<int>(query.stride(3))};
+    p.k_strides = {
+        static_cast<int>(key.stride(0)),
+        static_cast<int>(key.stride(1)),
+        static_cast<int>(key.stride(2)),
+        static_cast<int>(key.stride(3))};
+    p.v_strides = {
+        static_cast<int>(value.stride(0)),
+        static_cast<int>(value.stride(1)),
+        static_cast<int>(value.stride(2)),
+        static_cast<int>(value.stride(3))};
+    p.out_strides = {
+        static_cast<int>(out.stride(0)),
+        static_cast<int>(out.stride(1)),
+        static_cast<int>(out.stride(2)),
+        static_cast<int>(out.stride(3))};
+
+    if (bias.has_value()) {
+      CHECK_NOSPARSE_LASTCONTIGUOUS_CUDA((*bias));
+      TORCH_CHECK(bias->scalar_type() == query.scalar_type());
+
+      p.has_attn_bias = true;
+      p.attn_bias_ptr = bias->data_ptr();
+
+      const at::Tensor bias_4d_view = get_bias_4d_view(*bias, B, Hq, M, N);
+      p.attn_bias_strides = {
+          static_cast<int>(bias_4d_view.stride(0)),
+          static_cast<int>(bias_4d_view.stride(1)),
+          static_cast<int>(bias_4d_view.stride(2)),
+          static_cast<int>(bias_4d_view.stride(3))};
+    } else
+      p.has_attn_bias = false;
+
+    p.custom_mask_type = custom_mask_type;
+    p.window_size =
+        window_size.has_value() ? (*window_size > 0 ? *window_size : 0) : 0;
+
+    p.use_dropout = use_dropout;
+    p.philox_seed = philox_seed;
+    p.philox_offset = philox_offset;
+    p.compute_logsumexp = compute_logsumexp;
+
+    // the following parameters are only used by training forward
+    if (p.use_dropout) {
+      // p.dropout_prob = static_cast<float>(dropout_p);
+      throw std::runtime_error(
+          "drop-out is currently not implemented by ck-tiled!");
+    } else
+      p.dropout_prob = 0.0f;
+
+    if (p.compute_logsumexp) {
+      logsumexp = at::empty({B, Hq, M}, opts.dtype(at::kFloat));
+      p.logsumexp_ptr = logsumexp.data_ptr();
+    } else
+      p.logsumexp_ptr = nullptr;
+  };
+
+  auto set_grouped_forward_params = [&](GroupedForwardParams& p) {
+    p.num_batches = seqstart_q->size(0) - 1;
+    p.M = M;
+    p.N = N;
+    p.Hq = Hq;
+    p.Hkv = Hkv;
+    p.K = K;
+    p.Kv = Kv;
+
+    if (scale.has_value()) {
+      p.scale = float(*scale);
+    } else {
+      p.scale = float(1.0 / std::sqrt(float(K)));
+    }
 
-    auto set_grouped_forward_params = [&](GroupedForwardParams& p) {
-        p.num_batches = seqstart_q->size(0) - 1;
-        p.M           = M;
-        p.N           = N;
-        p.Hq          = Hq;
-        p.Hkv         = Hkv;
-        p.K           = K;
-        p.Kv          = Kv;
-
-        if(scale.has_value())
-        {
-            p.scale = float(*scale);
-        }
-        else
-        {
-            p.scale = float(1.0 / std::sqrt(float(K)));
-        }
-
-        p.q_ptr   = query.data_ptr();
-        p.k_ptr   = key.data_ptr();
-        p.v_ptr   = value.data_ptr();
-        p.out_ptr = out.data_ptr();
-
-        p.q_strides   = {static_cast<int>(query.stride(1)),
-                       static_cast<int>(query.stride(2)),
-                       static_cast<int>(query.stride(3))};
-        p.k_strides   = {static_cast<int>(key.stride(1)),
-                       static_cast<int>(key.stride(2)),
-                       static_cast<int>(key.stride(3))};
-        p.v_strides   = {static_cast<int>(value.stride(1)),
-                       static_cast<int>(value.stride(2)),
-                       static_cast<int>(value.stride(3))};
-        p.out_strides = {static_cast<int>(out.stride(1)),
-                         static_cast<int>(out.stride(2)),
-                         static_cast<int>(out.stride(3))};
-
-        if(bias.has_value())
-        {
-            CHECK_NOSPARSE_LASTCONTIGUOUS_CUDA((*bias));
-            TORCH_CHECK(bias->scalar_type() == query.scalar_type());
-
-            p.has_attn_bias = true;
-            p.attn_bias_ptr = bias->data_ptr();
-
-            const at::Tensor bias_4d_view = get_bias_4d_view(*bias, B, Hq, M, N);
-            p.attn_bias_strides           = {static_cast<int>(bias_4d_view.stride(0)),
-                                   static_cast<int>(bias_4d_view.stride(1)),
-                                   static_cast<int>(bias_4d_view.stride(2)),
-                                   static_cast<int>(bias_4d_view.stride(3))};
-        }
-        else
-            p.has_attn_bias = false;
-
-        p.custom_mask_type = custom_mask_type;
-        p.window_size      = window_size.has_value() ? (*window_size > 0 ? *window_size : 0) : 0;
-
-        // max_seqlen_q is used to create logsumexp tensor
-        p.max_seqlen_q = *max_seqlen_q_;
-
-        // interesting: the tensors have to be defined here, moving to more local scope will
-        // cause issue
-        at::Tensor dev_seqstart_q;
-        at::Tensor dev_seqstart_k;
-        at::Tensor dev_seqlen_k;
-
-        if(seqstart_q->is_cpu())
-        {
-            dev_seqstart_q       = at::empty({p.num_batches + 1}, opts.dtype(at::kInt));
-            p.seqstart_q_dev_ptr = dev_seqstart_q.data_ptr();
-            HIP_CALL_CHECK(hipMemcpyAsync(p.seqstart_q_dev_ptr,
-                                          seqstart_q->data_ptr(),
-                                          (p.num_batches + 1) * sizeof(int),
-                                          hipMemcpyHostToDevice,
-                                          stream));
-        }
-        else
-            p.seqstart_q_dev_ptr = seqstart_q->data_ptr();
-
-        if(seqstart_k->is_cpu())
-        {
-            dev_seqstart_k = at::empty({p.num_batches + 1}, opts.dtype(at::kInt));
-
-            p.seqstart_k_dev_ptr = dev_seqstart_k.data_ptr();
-            HIP_CALL_CHECK(hipMemcpyAsync(p.seqstart_k_dev_ptr,
-                                          seqstart_k->data_ptr(),
-                                          (p.num_batches + 1) * sizeof(int),
-                                          hipMemcpyHostToDevice,
-                                          stream));
-        }
-        else
-            p.seqstart_k_dev_ptr = seqstart_k->data_ptr();
-
-        if(seqlen_k.has_value())
-        {
-            TORCH_CHECK(seqlen_k->scalar_type() == at::ScalarType::Int);
-            TORCH_CHECK(seqlen_k->dim() == 1);
-            TORCH_CHECK(seqlen_k->size(0) == p.num_batches)
-
-            if(seqlen_k->is_cpu())
-            {
-                dev_seqlen_k = at::empty({p.num_batches}, opts.dtype(at::kInt));
-
-                p.seqlen_k_dev_ptr = dev_seqlen_k.data_ptr();
-                HIP_CALL_CHECK(hipMemcpyAsync(p.seqlen_k_dev_ptr,
-                                              seqlen_k->data_ptr(),
-                                              p.num_batches * sizeof(int),
-                                              hipMemcpyHostToDevice,
-                                              stream));
-            }
-            else
-                p.seqlen_k_dev_ptr = seqlen_k->data_ptr();
-        }
-        else
-            p.seqlen_k_dev_ptr = nullptr;
-
-        p.use_dropout       = use_dropout;
-        p.philox_seed       = philox_seed;
-        p.philox_offset     = philox_offset;
-        p.compute_logsumexp = compute_logsumexp;
-
-        // the following parameters are only used by training forward
-        if(p.use_dropout)
-        {
-            // p.dropout_prob = static_cast<float>(dropout_p);
-            throw std::runtime_error("drop-out is currently not implemented by ck-tiled!");
-        }
-        else
-            p.dropout_prob = 0.0f;
-
-        if(p.compute_logsumexp)
-        {
-            logsumexp = at::empty({p.num_batches, Hq, p.max_seqlen_q}, opts.dtype(at::kFloat));
-            p.logsumexp_ptr = logsumexp.data_ptr();
-        }
-        else
-            p.logsumexp_ptr = nullptr;
+    p.q_ptr = query.data_ptr();
+    p.k_ptr = key.data_ptr();
+    p.v_ptr = value.data_ptr();
+    p.out_ptr = out.data_ptr();
+
+    p.q_strides = {
+        static_cast<int>(query.stride(1)),
+        static_cast<int>(query.stride(2)),
+        static_cast<int>(query.stride(3))};
+    p.k_strides = {
+        static_cast<int>(key.stride(1)),
+        static_cast<int>(key.stride(2)),
+        static_cast<int>(key.stride(3))};
+    p.v_strides = {
+        static_cast<int>(value.stride(1)),
+        static_cast<int>(value.stride(2)),
+        static_cast<int>(value.stride(3))};
+    p.out_strides = {
+        static_cast<int>(out.stride(1)),
+        static_cast<int>(out.stride(2)),
+        static_cast<int>(out.stride(3))};
+
+    if (bias.has_value()) {
+      CHECK_NOSPARSE_LASTCONTIGUOUS_CUDA((*bias));
+      TORCH_CHECK(bias->scalar_type() == query.scalar_type());
+
+      p.has_attn_bias = true;
+      p.attn_bias_ptr = bias->data_ptr();
+
+      const at::Tensor bias_4d_view = get_bias_4d_view(*bias, B, Hq, M, N);
+      p.attn_bias_strides = {
+          static_cast<int>(bias_4d_view.stride(0)),
+          static_cast<int>(bias_4d_view.stride(1)),
+          static_cast<int>(bias_4d_view.stride(2)),
+          static_cast<int>(bias_4d_view.stride(3))};
+    } else
+      p.has_attn_bias = false;
+
+    p.custom_mask_type = custom_mask_type;
+    p.window_size =
+        window_size.has_value() ? (*window_size > 0 ? *window_size : 0) : 0;
+
+    // max_seqlen_q is used to create logsumexp tensor
+    p.max_seqlen_q = *max_seqlen_q_;
+
+    // interesting: the tensors have to be defined here, moving to more local
+    // scope will cause issue
+    at::Tensor dev_seqstart_q;
+    at::Tensor dev_seqstart_k;
+    at::Tensor dev_seqlen_k;
+
+    if (seqstart_q->is_cpu()) {
+      dev_seqstart_q = at::empty({p.num_batches + 1}, opts.dtype(at::kInt));
+      p.seqstart_q_dev_ptr = dev_seqstart_q.data_ptr();
+      HIP_CALL_CHECK(hipMemcpyAsync(
+          p.seqstart_q_dev_ptr,
+          seqstart_q->data_ptr(),
+          (p.num_batches + 1) * sizeof(int),
+          hipMemcpyHostToDevice,
+          stream));
+    } else
+      p.seqstart_q_dev_ptr = seqstart_q->data_ptr();
+
+    if (seqstart_k->is_cpu()) {
+      dev_seqstart_k = at::empty({p.num_batches + 1}, opts.dtype(at::kInt));
+
+      p.seqstart_k_dev_ptr = dev_seqstart_k.data_ptr();
+      HIP_CALL_CHECK(hipMemcpyAsync(
+          p.seqstart_k_dev_ptr,
+          seqstart_k->data_ptr(),
+          (p.num_batches + 1) * sizeof(int),
+          hipMemcpyHostToDevice,
+          stream));
+    } else
+      p.seqstart_k_dev_ptr = seqstart_k->data_ptr();
+
+    if (seqlen_k.has_value()) {
+      TORCH_CHECK(seqlen_k->scalar_type() == at::ScalarType::Int);
+      TORCH_CHECK(seqlen_k->dim() == 1);
+      TORCH_CHECK(seqlen_k->size(0) == p.num_batches)
+
+      if (seqlen_k->is_cpu()) {
+        dev_seqlen_k = at::empty({p.num_batches}, opts.dtype(at::kInt));
+
+        p.seqlen_k_dev_ptr = dev_seqlen_k.data_ptr();
+        HIP_CALL_CHECK(hipMemcpyAsync(
+            p.seqlen_k_dev_ptr,
+            seqlen_k->data_ptr(),
+            p.num_batches * sizeof(int),
+            hipMemcpyHostToDevice,
+            stream));
+      } else
+        p.seqlen_k_dev_ptr = seqlen_k->data_ptr();
+    } else
+      p.seqlen_k_dev_ptr = nullptr;
+
+    p.use_dropout = use_dropout;
+    p.philox_seed = philox_seed;
+    p.philox_offset = philox_offset;
+    p.compute_logsumexp = compute_logsumexp;
+
+    // the following parameters are only used by training forward
+    if (p.use_dropout) {
+      // p.dropout_prob = static_cast<float>(dropout_p);
+      throw std::runtime_error(
+          "drop-out is currently not implemented by ck-tiled!");
+    } else
+      p.dropout_prob = 0.0f;
+
+    if (p.compute_logsumexp) {
+      logsumexp = at::empty(
+          {p.num_batches, Hq, p.max_seqlen_q}, opts.dtype(at::kFloat));
+      p.logsumexp_ptr = logsumexp.data_ptr();
+    } else
+      p.logsumexp_ptr = nullptr;
+  };
+
+  auto inDataType = query.scalar_type();
+
+  if (!seqstart_q.has_value()) { // input is batched
+    BatchedForwardParams batched_forward_params;
+
+    set_batched_forward_params(batched_forward_params);
+
+    if (!batched_forward_params.use_dropout &&
+        !batched_forward_params.compute_logsumexp) {
+      if (inDataType == at::ScalarType::Half) {
+        batched_infer_fp16(batched_forward_params, stream);
+      } else if (inDataType == at::ScalarType::BFloat16) {
+        batched_infer_bp16(batched_forward_params, stream);
+      } else
+        throw std::runtime_error("input data-type is not supported!");
+    } else {
+      if (inDataType == at::ScalarType::Half) {
+        batched_forward_fp16(batched_forward_params, stream);
+      } else if (inDataType == at::ScalarType::BFloat16) {
+        batched_forward_bp16(batched_forward_params, stream);
+      } else
+        throw std::runtime_error("input data-type is not supported!");
+
+      throw std::runtime_error(
+          "drop-out and compuate logsumexp currently not implemented by ck-tiled!");
     };
-
-    auto inDataType = query.scalar_type();
-
-    if(!seqstart_q.has_value())
-    { // input is batched
-        BatchedForwardParams batched_forward_params;
-
-        set_batched_forward_params(batched_forward_params);
-
-        if(!batched_forward_params.use_dropout && !batched_forward_params.compute_logsumexp)
-        {
-            if(inDataType == at::ScalarType::Half)
-            {
-                batched_infer_fp16(batched_forward_params, stream);
-            }
-            else if(inDataType == at::ScalarType::BFloat16)
-            {
-                batched_infer_bp16(batched_forward_params, stream);
-            }
-            else
-                throw std::runtime_error("input data-type is not supported!");
-        }
-        else
-        {
-            if(inDataType == at::ScalarType::Half)
-            {
-                batched_forward_fp16(batched_forward_params, stream);
-            }
-            else if(inDataType == at::ScalarType::BFloat16)
-            {
-                batched_forward_bp16(batched_forward_params, stream);
-            }
-            else
-                throw std::runtime_error("input data-type is not supported!");
-
-            throw std::runtime_error(
-                "drop-out and compuate logsumexp currently not implemented by ck-tiled!");
-        };
-    }
-    else
-    { // input is grouped
-        GroupedForwardParams grouped_forward_params;
-
-        set_grouped_forward_params(grouped_forward_params);
-
-        if(!grouped_forward_params.use_dropout && !grouped_forward_params.compute_logsumexp)
-        {
-            if(inDataType == at::ScalarType::Half)
-            {
-                grouped_infer_fp16(grouped_forward_params, stream);
-            }
-            else if(inDataType == at::ScalarType::BFloat16)
-            {
-                grouped_infer_bp16(grouped_forward_params, stream);
-            }
-            else
-                throw std::runtime_error("input data-type is not supported!");
-        }
-        else
-        {
-            if(inDataType == at::ScalarType::Half)
-            {
-                grouped_forward_fp16(grouped_forward_params, stream);
-            }
-            else if(inDataType == at::ScalarType::BFloat16)
-            {
-                grouped_forward_bp16(grouped_forward_params, stream);
-            }
-            else
-                throw std::runtime_error("input data-type is not supported!");
-
-            throw std::runtime_error(
-                "drop-out and compuate logsumexp currently not implemented by ck-tiled!");
-        };
+  } else { // input is grouped
+    GroupedForwardParams grouped_forward_params;
+
+    set_grouped_forward_params(grouped_forward_params);
+
+    if (!grouped_forward_params.use_dropout &&
+        !grouped_forward_params.compute_logsumexp) {
+      if (inDataType == at::ScalarType::Half) {
+        grouped_infer_fp16(grouped_forward_params, stream);
+      } else if (inDataType == at::ScalarType::BFloat16) {
+        grouped_infer_bp16(grouped_forward_params, stream);
+      } else
+        throw std::runtime_error("input data-type is not supported!");
+    } else {
+      if (inDataType == at::ScalarType::Half) {
+        grouped_forward_fp16(grouped_forward_params, stream);
+      } else if (inDataType == at::ScalarType::BFloat16) {
+        grouped_forward_bp16(grouped_forward_params, stream);
+      } else
+        throw std::runtime_error("input data-type is not supported!");
+
+      throw std::runtime_error(
+          "drop-out and compuate logsumexp currently not implemented by ck-tiled!");
     };
+  };
 
-    return std::make_tuple(out, logsumexp, philox_seed, philox_offset);
+  return std::make_tuple(out, logsumexp, philox_seed, philox_offset);
 }
 
 } // namespace
 
-TORCH_LIBRARY_IMPL(xformers, CUDA, m)
-{
-    m.impl(TORCH_SELECTIVE_NAME("xformers::efficient_attention_forward_ck"),
-           TORCH_FN(efficient_attention_forward_ck));
+TORCH_LIBRARY_IMPL(xformers, CUDA, m) {
+  m.impl(
+      TORCH_SELECTIVE_NAME("xformers::efficient_attention_forward_ck"),
+      TORCH_FN(efficient_attention_forward_ck));
 }
diff --git a/xformers/csrc/attention/hip_fmha/attention_forward_splitk.cpp b/xformers/csrc/attention/hip_fmha/attention_forward_splitk.cpp
index 833b152eb..a7ddb148c 100644
--- a/xformers/csrc/attention/hip_fmha/attention_forward_splitk.cpp
+++ b/xformers/csrc/attention/hip_fmha/attention_forward_splitk.cpp
@@ -8,8 +8,8 @@
 
 namespace {
 constexpr int32_t kThreadsPerWavefront = 64;
-constexpr int32_t kWavefrontsPerBlock  = 16;
-constexpr int32_t K_MAX                = 4 * kThreadsPerWavefront;
+constexpr int32_t kWavefrontsPerBlock = 16;
+constexpr int32_t K_MAX = 4 * kThreadsPerWavefront;
 } // namespace
 
 namespace {
@@ -17,195 +17,216 @@ namespace {
 template <typename c10_t>
 struct c10_to_data_t;
 template <>
-struct c10_to_data_t<float>
-{
-    using type = float;
+struct c10_to_data_t<float> {
+  using type = float;
 };
 
 template <>
-struct c10_to_data_t<c10::Half>
-{
-    using type = ck::half_t;
+struct c10_to_data_t<c10::Half> {
+  using type = ck::half_t;
 };
 
 template <>
-struct c10_to_data_t<c10::BFloat16>
-{
-    using type = ck::bhalf_t;
+struct c10_to_data_t<c10::BFloat16> {
+  using type = ck::bhalf_t;
 };
 } // namespace
 
 #define AT_DISPATCH_CASE_3(SCALARTYPE1, SCALARTYPE2, SCALARTYPE3, ...) \
-    AT_DISPATCH_CASE(SCALARTYPE1, __VA_ARGS__)                         \
-    AT_DISPATCH_CASE(SCALARTYPE2, __VA_ARGS__)                         \
-    AT_DISPATCH_CASE(SCALARTYPE3, __VA_ARGS__)
+  AT_DISPATCH_CASE(SCALARTYPE1, __VA_ARGS__)                           \
+  AT_DISPATCH_CASE(SCALARTYPE2, __VA_ARGS__)                           \
+  AT_DISPATCH_CASE(SCALARTYPE3, __VA_ARGS__)
 
-#define AT_DISPATCH_SWITCH_3(SCALARTYPE1, SCALARTYPE2, SCALARTYPE3, TYPE, NAME, ...) \
-    AT_DISPATCH_SWITCH(                                                              \
-        TYPE, NAME, AT_DISPATCH_CASE_3(SCALARTYPE1, SCALARTYPE2, SCALARTYPE3, __VA_ARGS__))
+#define AT_DISPATCH_SWITCH_3(                               \
+    SCALARTYPE1, SCALARTYPE2, SCALARTYPE3, TYPE, NAME, ...) \
+  AT_DISPATCH_SWITCH(                                       \
+      TYPE,                                                 \
+      NAME,                                                 \
+      AT_DISPATCH_CASE_3(SCALARTYPE1, SCALARTYPE2, SCALARTYPE3, __VA_ARGS__))
 
 namespace {
 
-template <int32_t ThreadsPerWavefront,
-          int32_t WavefrontsPerBlock,
-          int32_t KV_M_MAX = 8192,
-          int32_t K_MAX    = 256>
+template <
+    int32_t ThreadsPerWavefront,
+    int32_t WavefrontsPerBlock,
+    int32_t KV_M_MAX = 8192,
+    int32_t K_MAX = 256>
 at::Tensor& efficient_attention_forward_decoder_splitk_ck_out_impl(
-    const at::Tensor& XQ,                 // [B, 1, G, H, D]
-    const at::Tensor& cache_K,            // [B, KV_M_MAX, G, H or 1, D]
-    const at::Tensor& cache_V,            // [B, KV_M_MAX, G, H or 1, D]
+    const at::Tensor& XQ, // [B, 1, G, H, D]
+    const at::Tensor& cache_K, // [B, KV_M_MAX, G, H or 1, D]
+    const at::Tensor& cache_V, // [B, KV_M_MAX, G, H or 1, D]
     at::optional<at::Tensor> seq_kv_lens, // [B]
     double qk_scale,
     int64_t split_k,
     at::Tensor& split_max,
     at::Tensor& split_sumexp,
     at::Tensor& split_O,
-    at::Tensor& O)
-{
-    static_assert(4 * ThreadsPerWavefront == K_MAX, "");
-    static_assert(WavefrontsPerBlock <= ThreadsPerWavefront, "");
-
-    at::OptionalDeviceGuard guard(XQ.device());
-    TORCH_CHECK(XQ.is_cuda());
-    TORCH_CHECK(cache_K.is_cuda());
-    TORCH_CHECK(cache_V.is_cuda());
-
-    TORCH_CHECK(!seq_kv_lens || seq_kv_lens->is_cuda());
-
-    TORCH_CHECK(cache_K.size(1) / split_k <= KV_M_MAX);
-    TORCH_CHECK(cache_K.size(4) <= K_MAX);
-
-    constexpr auto rank = 5;
-
-    auto B = XQ.size(0);
-    auto M = XQ.size(1);
-    auto G = XQ.size(2);
-    auto H = XQ.size(3);
-
-    TORCH_CHECK(B <= 1024);
-    TORCH_CHECK(M <= 1024);
-    TORCH_CHECK(H <= 1024);
-
-    dim3 blocks(B * H * M * G, split_k);
-    dim3 threads(ThreadsPerWavefront, WavefrontsPerBlock);
-
-    int32_t smem_softmax = KV_M_MAX * sizeof(float) + threads.y * sizeof(float);
-    int32_t smem_output  = K_MAX * sizeof(float) *
-                          threads.y; // 4 * threadsPerBlock * sizeof(float) == sizeof(O[b][0][h][:])
-    const size_t lds_bytes = max(smem_softmax, smem_output);
-    auto stream            = at::cuda::getCurrentHIPStream().stream();
-
-    AT_DISPATCH_SWITCH_3(
-        at::ScalarType::Half,
-        at::ScalarType::BFloat16,
-        at::ScalarType::Float,
-        XQ.scalar_type(),
-        "efficient_attention_forward_decoder_splitk_ck",
-        [&] {
-            using ck_data_t   = c10_to_data_t<scalar_t>::type;
-            using device_op_t = ck::tensor_operation::device::FMHADecoderSplitKDeviceOp<ck_data_t>;
-            auto op           = device_op_t{};
-
-            auto XQ_acc = XQ.packed_accessor32<scalar_t, rank, at::RestrictPtrTraits>();
-            auto K_acc  = cache_K.packed_accessor64<scalar_t, rank, at::RestrictPtrTraits>();
-            auto V_acc  = cache_V.packed_accessor64<scalar_t, rank, at::RestrictPtrTraits>();
-            auto split_O_acc =
-                split_O.packed_accessor32<scalar_t, 1 + rank, at::RestrictPtrTraits>();
-            auto O_acc = O.packed_accessor32<scalar_t, rank, at::RestrictPtrTraits>();
-            auto seq_acc_ptr =
-                seq_kv_lens
-                    ? seq_kv_lens->packed_accessor32<int32_t, 1, at::RestrictPtrTraits>().data()
-                    : nullptr;
-            auto split_max_acc = split_max.packed_accessor32<float, rank, at::RestrictPtrTraits>();
-            auto split_sumexp_acc =
-                split_sumexp.packed_accessor32<float, rank, at::RestrictPtrTraits>();
-            auto arg = device_op_t::Argument(
-                reinterpret_cast<const ck_data_t* __restrict__>(XQ_acc.data()),
-                reinterpret_cast<const ck_data_t* __restrict__>(K_acc.data()),
-                reinterpret_cast<const ck_data_t* __restrict__>(V_acc.data()),
-                reinterpret_cast<ck_data_t* __restrict__>(O_acc.data()),
-                reinterpret_cast<ck_data_t* __restrict__>(split_O_acc.data()),
-                split_max_acc.data(),
-                split_sumexp_acc.data(),
-                seq_acc_ptr,
-                XQ_acc.stride(0),
-                XQ_acc.stride(1),
-                XQ_acc.stride(2),
-                XQ_acc.stride(3),
-                K_acc.stride(0),
-                K_acc.stride(1),
-                K_acc.stride(2),
-                K_acc.stride(3),
-                split_O_acc.stride(0),
-                XQ_acc.size(1),
-                XQ_acc.size(2),
-                XQ_acc.size(3),
-                XQ_acc.size(4),
-                K_acc.size(1),
-                K_acc.size(3) == 1,
-                qk_scale,
-                split_k,
-                blocks,
-                threads,
-                lds_bytes);
-
-            auto invoker = device_op_t::Invoker{};
-            (void)invoker.Run(arg, {stream});
-        });
-
-    return O;
+    at::Tensor& O) {
+  static_assert(4 * ThreadsPerWavefront == K_MAX, "");
+  static_assert(WavefrontsPerBlock <= ThreadsPerWavefront, "");
+
+  at::OptionalDeviceGuard guard(XQ.device());
+  TORCH_CHECK(XQ.is_cuda());
+  TORCH_CHECK(cache_K.is_cuda());
+  TORCH_CHECK(cache_V.is_cuda());
+
+  TORCH_CHECK(!seq_kv_lens || seq_kv_lens->is_cuda());
+
+  TORCH_CHECK(cache_K.size(1) / split_k <= KV_M_MAX);
+  TORCH_CHECK(cache_K.size(4) <= K_MAX);
+
+  constexpr auto rank = 5;
+
+  auto B = XQ.size(0);
+  auto M = XQ.size(1);
+  auto G = XQ.size(2);
+  auto H = XQ.size(3);
+
+  TORCH_CHECK(B <= 1024);
+  TORCH_CHECK(M <= 1024);
+  TORCH_CHECK(H <= 1024);
+
+  dim3 blocks(B * H * M * G, split_k);
+  dim3 threads(ThreadsPerWavefront, WavefrontsPerBlock);
+
+  int32_t smem_softmax = KV_M_MAX * sizeof(float) + threads.y * sizeof(float);
+  int32_t smem_output = K_MAX * sizeof(float) *
+      threads.y; // 4 * threadsPerBlock * sizeof(float) == sizeof(O[b][0][h][:])
+  const size_t lds_bytes = max(smem_softmax, smem_output);
+  auto stream = at::cuda::getCurrentHIPStream().stream();
+
+  AT_DISPATCH_SWITCH_3(
+      at::ScalarType::Half,
+      at::ScalarType::BFloat16,
+      at::ScalarType::Float,
+      XQ.scalar_type(),
+      "efficient_attention_forward_decoder_splitk_ck",
+      [&] {
+        using ck_data_t = c10_to_data_t<scalar_t>::type;
+        using device_op_t =
+            ck::tensor_operation::device::FMHADecoderSplitKDeviceOp<ck_data_t>;
+        auto op = device_op_t{};
+
+        auto XQ_acc =
+            XQ.packed_accessor32<scalar_t, rank, at::RestrictPtrTraits>();
+        auto K_acc =
+            cache_K.packed_accessor64<scalar_t, rank, at::RestrictPtrTraits>();
+        auto V_acc =
+            cache_V.packed_accessor64<scalar_t, rank, at::RestrictPtrTraits>();
+        auto split_O_acc =
+            split_O
+                .packed_accessor32<scalar_t, 1 + rank, at::RestrictPtrTraits>();
+        auto O_acc =
+            O.packed_accessor32<scalar_t, rank, at::RestrictPtrTraits>();
+        auto seq_acc_ptr = seq_kv_lens
+            ? seq_kv_lens
+                  ->packed_accessor32<int32_t, 1, at::RestrictPtrTraits>()
+                  .data()
+            : nullptr;
+        auto split_max_acc =
+            split_max.packed_accessor32<float, rank, at::RestrictPtrTraits>();
+        auto split_sumexp_acc =
+            split_sumexp
+                .packed_accessor32<float, rank, at::RestrictPtrTraits>();
+        auto arg = device_op_t::Argument(
+            reinterpret_cast<const ck_data_t* __restrict__>(XQ_acc.data()),
+            reinterpret_cast<const ck_data_t* __restrict__>(K_acc.data()),
+            reinterpret_cast<const ck_data_t* __restrict__>(V_acc.data()),
+            reinterpret_cast<ck_data_t* __restrict__>(O_acc.data()),
+            reinterpret_cast<ck_data_t* __restrict__>(split_O_acc.data()),
+            split_max_acc.data(),
+            split_sumexp_acc.data(),
+            seq_acc_ptr,
+            XQ_acc.stride(0),
+            XQ_acc.stride(1),
+            XQ_acc.stride(2),
+            XQ_acc.stride(3),
+            K_acc.stride(0),
+            K_acc.stride(1),
+            K_acc.stride(2),
+            K_acc.stride(3),
+            split_O_acc.stride(0),
+            XQ_acc.size(1),
+            XQ_acc.size(2),
+            XQ_acc.size(3),
+            XQ_acc.size(4),
+            K_acc.size(1),
+            K_acc.size(3) == 1,
+            qk_scale,
+            split_k,
+            blocks,
+            threads,
+            lds_bytes);
+
+        auto invoker = device_op_t::Invoker{};
+        (void)invoker.Run(arg, {stream});
+      });
+
+  return O;
 }
 
 template <int32_t ThreadsPerWavefront, int32_t WavefrontsPerBlock>
 at::Tensor efficient_attention_forward_decoder_splitk_ck_impl(
-    const at::Tensor& XQ,                 // [B, 1, G, H, D]
-    const at::Tensor& cache_K,            // [B, KV_M_MAX, G, H or 1, D]
-    const at::Tensor& cache_V,            // [B, KV_M_MAX, H or 1, D]
+    const at::Tensor& XQ, // [B, 1, G, H, D]
+    const at::Tensor& cache_K, // [B, KV_M_MAX, G, H or 1, D]
+    const at::Tensor& cache_V, // [B, KV_M_MAX, H or 1, D]
     at::optional<at::Tensor> seq_kv_lens, // [B]
     double qk_scale,
-    int64_t split_k)
-{
-    auto O              = at::empty_like(XQ);
-    constexpr auto rank = 5;
-
-    TORCH_CHECK(XQ.dim() == rank);
-    TORCH_CHECK(cache_K.dim() == rank);
-    TORCH_CHECK(cache_V.dim() == rank);
-
-    auto B = XQ.size(0);
-    auto M = XQ.size(1);
-    auto G = XQ.size(2);
-    auto H = XQ.size(3);
-    auto K = XQ.size(4);
-
-    auto O_splits     = at::empty({split_k, B, M, G, H, K}, XQ.options());
-    auto split_max    = at::empty({B, M, G, H, split_k}, XQ.options().dtype(at::kFloat));
-    auto split_sumexp = at::empty_like(split_max);
-
-    efficient_attention_forward_decoder_splitk_ck_out_impl<ThreadsPerWavefront, WavefrontsPerBlock>(
-        XQ, cache_K, cache_V, seq_kv_lens, qk_scale, split_k, split_max, split_sumexp, O_splits, O);
-
-    return O;
+    int64_t split_k) {
+  auto O = at::empty_like(XQ);
+  constexpr auto rank = 5;
+
+  TORCH_CHECK(XQ.dim() == rank);
+  TORCH_CHECK(cache_K.dim() == rank);
+  TORCH_CHECK(cache_V.dim() == rank);
+
+  auto B = XQ.size(0);
+  auto M = XQ.size(1);
+  auto G = XQ.size(2);
+  auto H = XQ.size(3);
+  auto K = XQ.size(4);
+
+  auto O_splits = at::empty({split_k, B, M, G, H, K}, XQ.options());
+  auto split_max =
+      at::empty({B, M, G, H, split_k}, XQ.options().dtype(at::kFloat));
+  auto split_sumexp = at::empty_like(split_max);
+
+  efficient_attention_forward_decoder_splitk_ck_out_impl<
+      ThreadsPerWavefront,
+      WavefrontsPerBlock>(
+      XQ,
+      cache_K,
+      cache_V,
+      seq_kv_lens,
+      qk_scale,
+      split_k,
+      split_max,
+      split_sumexp,
+      O_splits,
+      O);
+
+  return O;
 }
 
 at::Tensor efficient_attention_forward_decoder_splitk_ck(
-    const at::Tensor& XQ,                 // [B, 1, G, H, D]
-    const at::Tensor& cache_K,            // [B, KV_M_MAX, G, H or 1, D]
-    const at::Tensor& cache_V,            // [B, KV_M_MAX, G, H or 1, D]
+    const at::Tensor& XQ, // [B, 1, G, H, D]
+    const at::Tensor& cache_K, // [B, KV_M_MAX, G, H or 1, D]
+    const at::Tensor& cache_V, // [B, KV_M_MAX, G, H or 1, D]
     at::optional<at::Tensor> seq_kv_lens, // [B]
     double qk_scale,
-    int64_t split_k)
-{
-    return efficient_attention_forward_decoder_splitk_ck_impl<kThreadsPerWavefront,
-                                                              kWavefrontsPerBlock>(
-        XQ, cache_K, cache_V, seq_kv_lens, qk_scale, split_k);
+    int64_t split_k) {
+  return efficient_attention_forward_decoder_splitk_ck_impl<
+      kThreadsPerWavefront,
+      kWavefrontsPerBlock>(
+      XQ, cache_K, cache_V, seq_kv_lens, qk_scale, split_k);
 }
 } // namespace
 
-TORCH_LIBRARY_IMPL(xformers, CUDA, m)
-{
-    m.impl(TORCH_SELECTIVE_NAME("xformers::efficient_attention_forward_decoder_splitk_ck"),
-           TORCH_FN(efficient_attention_forward_decoder_splitk_ck));
+TORCH_LIBRARY_IMPL(xformers, CUDA, m) {
+  m.impl(
+      TORCH_SELECTIVE_NAME(
+          "xformers::efficient_attention_forward_decoder_splitk_ck"),
+      TORCH_FN(efficient_attention_forward_decoder_splitk_ck));
 }
 
 #ifdef ATTN_FWD_SPLITK_DECODER_MAIN
@@ -241,120 +262,119 @@ TORCH_LIBRARY_IMPL(xformers, CUDA, m)
 
 // clang-format on
 
-static std::tuple<at::Tensor, at::Tensor, at::Tensor>
-split_attention_torch(const at::Tensor& Q,
-                      const at::Tensor& K,
-                      const at::Tensor& V,
-                      const at::Tensor& k_seqlens,
-                      const int32_t split_k,
-                      const int32_t block_size)
-{
-    auto Q_scaled = at::div(Q, sqrt(Q.size(-1)));
-
-    std::vector<at::Tensor> O_splits;
-    std::vector<at::Tensor> m_splits;
-    std::vector<at::Tensor> l_splits;
-
-    for(int32_t split_idx = 0; split_idx < split_k; ++split_idx)
-    {
-        std::vector<at::Tensor> O_batch;
-        std::vector<at::Tensor> m_batch;
-        std::vector<at::Tensor> l_batch;
-
-        for(size_t b = 0; b < k_seqlens.numel(); ++b)
-        {
-            auto seqlen         = k_seqlens[b].item<int64_t>();
-            const int64_t t_low = split_idx * (seqlen / split_k / block_size) * block_size;
-            const int64_t t_high =
-                (split_idx + 1 < split_k)
-                    ? (1 + split_idx) * (seqlen / split_k / block_size) * block_size
-                    : seqlen;
-
-            const bool empty = t_low == t_high;
-
-            auto S = at::einsum(
-                "mghk, nghk -> mghn",
-                {Q_scaled[b], at::slice(K[b], /*dim*/ 0, /*start*/ t_low, /*end*/ t_high)},
-                /* einsum eval path */ at::nullopt);
-            auto m = empty ? at::empty_like(S)
-                           : std::get<0>(at::max(S, /* dim */ -1, /* keepdim */ true));
-            auto s = at::exp(at::sub(S, m));
-            auto l = at::sum(s, /* dim */ -1, /* keepdim */ true);
-            auto O = at::einsum("mghn, nghk -> mghk",
-                                {s, at::slice(V[b], /*dim*/ 0, /*start*/ t_low, /*end*/ t_high)},
-                                /* einsum eval path */ at::nullopt);
-            if(empty)
-            {
-                m = at::empty_like(at::slice(O, -1, 0, 1));
-                l = at::zeros_like(m);
-                m.fill_(ck::NumericLimits<float>::Lowest());
-            }
-            O_batch.push_back(O);
-            m_batch.push_back(m);
-            l_batch.push_back(l);
-        }
+static std::tuple<at::Tensor, at::Tensor, at::Tensor> split_attention_torch(
+    const at::Tensor& Q,
+    const at::Tensor& K,
+    const at::Tensor& V,
+    const at::Tensor& k_seqlens,
+    const int32_t split_k,
+    const int32_t block_size) {
+  auto Q_scaled = at::div(Q, sqrt(Q.size(-1)));
+
+  std::vector<at::Tensor> O_splits;
+  std::vector<at::Tensor> m_splits;
+  std::vector<at::Tensor> l_splits;
+
+  for (int32_t split_idx = 0; split_idx < split_k; ++split_idx) {
+    std::vector<at::Tensor> O_batch;
+    std::vector<at::Tensor> m_batch;
+    std::vector<at::Tensor> l_batch;
+
+    for (size_t b = 0; b < k_seqlens.numel(); ++b) {
+      auto seqlen = k_seqlens[b].item<int64_t>();
+      const int64_t t_low =
+          split_idx * (seqlen / split_k / block_size) * block_size;
+      const int64_t t_high = (split_idx + 1 < split_k)
+          ? (1 + split_idx) * (seqlen / split_k / block_size) * block_size
+          : seqlen;
+
+      const bool empty = t_low == t_high;
+
+      auto S = at::einsum(
+          "mghk, nghk -> mghn",
+          {Q_scaled[b],
+           at::slice(K[b], /*dim*/ 0, /*start*/ t_low, /*end*/ t_high)},
+          /* einsum eval path */ at::nullopt);
+      auto m = empty
+          ? at::empty_like(S)
+          : std::get<0>(at::max(S, /* dim */ -1, /* keepdim */ true));
+      auto s = at::exp(at::sub(S, m));
+      auto l = at::sum(s, /* dim */ -1, /* keepdim */ true);
+      auto O = at::einsum(
+          "mghn, nghk -> mghk",
+          {s, at::slice(V[b], /*dim*/ 0, /*start*/ t_low, /*end*/ t_high)},
+          /* einsum eval path */ at::nullopt);
+      if (empty) {
+        m = at::empty_like(at::slice(O, -1, 0, 1));
+        l = at::zeros_like(m);
+        m.fill_(ck::NumericLimits<float>::Lowest());
+      }
+      O_batch.push_back(O);
+      m_batch.push_back(m);
+      l_batch.push_back(l);
+    }
 
-        auto O_cat = at::stack(O_batch);
-        auto m_cat = at::stack(m_batch);
-        auto l_cat = at::stack(l_batch);
+    auto O_cat = at::stack(O_batch);
+    auto m_cat = at::stack(m_batch);
+    auto l_cat = at::stack(l_batch);
 
-        O_splits.push_back(O_cat);
-        m_splits.push_back(m_cat);
-        l_splits.push_back(l_cat);
-    }
+    O_splits.push_back(O_cat);
+    m_splits.push_back(m_cat);
+    l_splits.push_back(l_cat);
+  }
 
-    auto O_cat = at::stack(O_splits);
-    auto m_cat = at::transpose(at::stack(m_splits), 0, -1);
-    auto l_cat = at::transpose(at::stack(l_splits), 0, -1);
+  auto O_cat = at::stack(O_splits);
+  auto m_cat = at::transpose(at::stack(m_splits), 0, -1);
+  auto l_cat = at::transpose(at::stack(l_splits), 0, -1);
 
-    return std::make_tuple(O_cat, m_cat, l_cat);
+  return std::make_tuple(O_cat, m_cat, l_cat);
 }
 
-static at::Tensor split_reduce_torch(const at::Tensor& O_splits,
-                                     const at::Tensor& m_splits,
-                                     const at::Tensor& l_splits,
-                                     int32_t split_k)
-{
-    auto O             = at::zeros_like(at::slice(O_splits, 0, 0, 1));
-    auto global_max    = at::empty_like(at::slice(m_splits, -1, 0, 1)).fill_(-65535.);
-    auto global_sumexp = at::zeros_like(global_max);
-
-    for(int32_t split_idx = 0; split_idx < split_k; ++split_idx)
-    {
-        auto local_O      = at::slice(O_splits, 0, split_idx, split_idx + 1);
-        auto local_max    = at::slice(m_splits, -1, split_idx, split_idx + 1);
-        auto local_sumexp = at::slice(l_splits, -1, split_idx, split_idx + 1);
-
-        auto log_alpha = at::neg(at::abs(at::sub(local_max, global_max)));
-        auto alpha     = at::exp(log_alpha);
-        alpha.nan_to_num_(1.);
-
-        auto pick_new          = at::less(local_max, global_max);
-        auto pick_current_coef = at::where(pick_new, 1., alpha);
-        auto pick_new_coef     = at::where(pick_new, alpha, 1.);
-
-        O             = at::add(at::mul(pick_current_coef, O), at::mul(pick_new_coef, local_O));
-        global_sumexp = at::add(at::mul(pick_current_coef, global_sumexp),
-                                at::mul(pick_new_coef, local_sumexp));
-        global_max    = at::max(local_max, global_max);
-    }
-
-    return at::div(O, global_sumexp);
+static at::Tensor split_reduce_torch(
+    const at::Tensor& O_splits,
+    const at::Tensor& m_splits,
+    const at::Tensor& l_splits,
+    int32_t split_k) {
+  auto O = at::zeros_like(at::slice(O_splits, 0, 0, 1));
+  auto global_max =
+      at::empty_like(at::slice(m_splits, -1, 0, 1)).fill_(-65535.);
+  auto global_sumexp = at::zeros_like(global_max);
+
+  for (int32_t split_idx = 0; split_idx < split_k; ++split_idx) {
+    auto local_O = at::slice(O_splits, 0, split_idx, split_idx + 1);
+    auto local_max = at::slice(m_splits, -1, split_idx, split_idx + 1);
+    auto local_sumexp = at::slice(l_splits, -1, split_idx, split_idx + 1);
+
+    auto log_alpha = at::neg(at::abs(at::sub(local_max, global_max)));
+    auto alpha = at::exp(log_alpha);
+    alpha.nan_to_num_(1.);
+
+    auto pick_new = at::less(local_max, global_max);
+    auto pick_current_coef = at::where(pick_new, 1., alpha);
+    auto pick_new_coef = at::where(pick_new, alpha, 1.);
+
+    O = at::add(at::mul(pick_current_coef, O), at::mul(pick_new_coef, local_O));
+    global_sumexp = at::add(
+        at::mul(pick_current_coef, global_sumexp),
+        at::mul(pick_new_coef, local_sumexp));
+    global_max = at::max(local_max, global_max);
+  }
+
+  return at::div(O, global_sumexp);
 }
 
 static at::Tensor efficient_attention_forward_decoder_splitk_torch(
-    const at::Tensor& XQ,                 // [B, 1, G, H, D]
-    const at::Tensor& cache_K,            // [B, KV_M_MAX, G, H or 1, D]
-    const at::Tensor& cache_V,            // [B, KV_M_MAX, G, H or 1, D]
+    const at::Tensor& XQ, // [B, 1, G, H, D]
+    const at::Tensor& cache_K, // [B, KV_M_MAX, G, H or 1, D]
+    const at::Tensor& cache_V, // [B, KV_M_MAX, G, H or 1, D]
     at::optional<at::Tensor> seq_kv_lens, // [B]
     double qk_scale,
     int32_t split_k,
-    int32_t block_size)
-{
-    auto [O_split, m, l] =
-        split_attention_torch(XQ, cache_K, cache_V, *seq_kv_lens, split_k, block_size);
-    auto O = split_reduce_torch(O_split, m, l, split_k);
-    return O.reshape_as(XQ);
+    int32_t block_size) {
+  auto [O_split, m, l] = split_attention_torch(
+      XQ, cache_K, cache_V, *seq_kv_lens, split_k, block_size);
+  auto O = split_reduce_torch(O_split, m, l, split_k);
+  return O.reshape_as(XQ);
 }
 
 namespace ck {
@@ -362,769 +382,781 @@ namespace tensor_operation {
 namespace device {
 
 template <typename scalar_t, typename compute_t = float>
-struct FMHADecoderSplitAttentionDeviceOp : public BaseOperator
-{
-    using DeviceOp = FMHADecoderSplitAttentionDeviceOp;
-    struct Argument : public BaseArgument
-    {
-        const scalar_t* __restrict__ XQ;
-        const scalar_t* __restrict__ cache_K;
-        const scalar_t* __restrict__ cache_V;
-        scalar_t* __restrict__ O;
-        scalar_t* __restrict__ split_O;
-        compute_t* __restrict__ split_max;
-        compute_t* __restrict__ split_sumexp;
-        const int32_t* __restrict__ seq_kv_lens;
-        const ptrdiff_t XQ_stride_b;
-        const ptrdiff_t XQ_stride_m;
-        const ptrdiff_t XQ_stride_g;
-        const ptrdiff_t XQ_stride_h;
-        const ptrdiff_t K_stride_b;
-        const ptrdiff_t K_stride_m;
-        const ptrdiff_t K_stride_g;
-        const ptrdiff_t K_stride_h;
-        const ptrdiff_t O_stride_split;
-        const int32_t Q_size_m;
-        const int32_t Q_size_g;
-        const int32_t Q_size_h;
-        const int32_t Q_size_k;
-        const int32_t K_size_m;
-        const bool multiquery;
-        const float qk_scale;
-        const int32_t split_k;
-
-        const dim3 grid_dim;
-        const dim3 block_dim;
-        const size_t lds_bytes;
-
-        Argument(const scalar_t* __restrict__ XQ,
-                 const scalar_t* __restrict__ cache_K,
-                 const scalar_t* __restrict__ cache_V,
-                 scalar_t* __restrict__ O,
-                 scalar_t* __restrict__ split_O,
-                 compute_t* __restrict__ split_max,
-                 compute_t* __restrict__ split_sumexp,
-                 const int32_t* __restrict__ seq_kv_lens,
-                 const ptrdiff_t XQ_stride_b,
-                 const ptrdiff_t XQ_stride_m,
-                 const ptrdiff_t XQ_stride_g,
-                 const ptrdiff_t XQ_stride_h,
-                 const ptrdiff_t K_stride_b,
-                 const ptrdiff_t K_stride_m,
-                 const ptrdiff_t K_stride_g,
-                 const ptrdiff_t K_stride_h,
-                 const ptrdiff_t O_stride_split,
-                 const int32_t Q_size_m,
-                 const int32_t Q_size_g,
-                 const int32_t Q_size_h,
-                 const int32_t Q_size_k,
-                 const int32_t K_size_m,
-                 const bool multiquery,
-                 const float qk_scale,
-                 const int32_t split_k,
-                 // launch params
-                 const dim3 grid_dim,
-                 const dim3 block_dim,
-                 const size_t lds_bytes)
-            : XQ(XQ),
-              cache_K(cache_K),
-              cache_V(cache_V),
-              O(O),
-              split_O(split_O),
-              split_max(split_max),
-              split_sumexp(split_sumexp),
-              seq_kv_lens(seq_kv_lens),
-              XQ_stride_b(XQ_stride_b),
-              XQ_stride_m(XQ_stride_m),
-              XQ_stride_g(XQ_stride_g),
-              XQ_stride_h(XQ_stride_h),
-              K_stride_b(K_stride_b),
-              K_stride_m(K_stride_m),
-              K_stride_g(K_stride_g),
-              K_stride_h(K_stride_h),
-              O_stride_split(O_stride_split),
-              Q_size_m(Q_size_m),
-              Q_size_g(Q_size_g),
-              Q_size_h(Q_size_h),
-              Q_size_k(Q_size_k),
-              K_size_m(K_size_m),
-              multiquery(multiquery),
-              qk_scale(qk_scale),
-              split_k(split_k),
-              // launch params
-              grid_dim(grid_dim),
-              block_dim(block_dim),
-              lds_bytes(lds_bytes)
-        {
-        }
-
-        std::string str() const
-        {
-            std::ostringstream oss;
-            oss << "Argument { " << std::endl
-                << "    XQ: " << XQ << std::endl
-                << "    cache_K: " << cache_K << std::endl
-                << "    cache_V: " << cache_V << std::endl
-                << "    O: " << O << std::endl
-                << "    split_O: " << split_O << std::endl
-                << "    split_max: " << split_max << std::endl
-                << "    split_sumexp: " << split_sumexp << std::endl
-                << "    seq_kv_lens: " << seq_kv_lens << std::endl
-                << "    XQ_stride_b: " << XQ_stride_b << std::endl
-                << "    XQ_stride_m: " << XQ_stride_m << std::endl
-                << "    XQ_stride_g: " << XQ_stride_g << std::endl
-                << "    XQ_stride_h: " << XQ_stride_h << std::endl
-                << "    K_stride_b: " << K_stride_b << std::endl
-                << "    K_stride_m: " << K_stride_m << std::endl
-                << "    K_stride_g: " << K_stride_g << std::endl
-                << "    K_stride_h: " << K_stride_h << std::endl
-                << "    O_stride_split: " << O_stride_split << std::endl
-                << "    Q_size_m: " << Q_size_m << std::endl
-                << "    Q_size_g: " << Q_size_g << std::endl
-                << "    Q_size_h: " << Q_size_h << std::endl
-                << "    Q_size_k: " << Q_size_k << std::endl
-                << "    K_size_m: " << K_size_m << std::endl
-                << "    multiquery: " << multiquery << std::endl
-                << "    qk_scale: " << qk_scale << std::endl
-                << "    split_k: " << split_k << std::endl
-                << std::endl
-                << "    grid_dim: " << grid_dim.x << "." << grid_dim.y << "." << grid_dim.z
-                << std::endl
-                << "    block_dim: " << block_dim.x << "." << block_dim.y << "." << block_dim.z
-                << std::endl
-                << "    lds_bytes: " << lds_bytes << std::endl
-                << "}";
-            return oss.str();
-        }
-    };
-
-    struct Invoker : public BaseInvoker
-    {
-        using Argument = DeviceOp::Argument;
-        float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{})
-        {
-            auto threads_per_wavefront = arg.block_dim.x;
-            auto Q_size_k_alignment_necessary = 0;
-
-            for(auto vec_size : {4, 2, 1})
-            {
-                if(arg.Q_size_k <= vec_size * threads_per_wavefront)
-                {
-                    Q_size_k_alignment_necessary = vec_size;
-                }
-            }
-
-            if(!Q_size_k_alignment_necessary)
-            {
-                throw std::runtime_error("Unsupported Q_size_k");
-            }
-
-            if(arg.Q_size_k % Q_size_k_alignment_necessary)
-            {
-                throw std::runtime_error("Unsupported alignment for Q_size_k");
-            }
-
-            float split_attention_result = launch_and_time_kernel(
-                stream_config,
-                Q_size_k_alignment_necessary == 4
-                    ? efficient_attention_forward_decoder_splitk_ck_kernel<scalar_t, 4>
-                    : Q_size_k_alignment_necessary == 2
-                          ? efficient_attention_forward_decoder_splitk_ck_kernel<scalar_t, 2>
-                          : Q_size_k_alignment_necessary == 1
-                                ? efficient_attention_forward_decoder_splitk_ck_kernel<scalar_t, 1>
-                                : nullptr,
-                arg.grid_dim,
-                arg.block_dim,
-                arg.lds_bytes,
-                arg.XQ,
-                arg.cache_K,
-                arg.cache_V,
-                arg.split_O,
-                arg.split_max,
-                arg.split_sumexp,
-                arg.seq_kv_lens,
-                arg.XQ_stride_b,
-                arg.XQ_stride_m,
-                arg.XQ_stride_g,
-                arg.XQ_stride_h,
-                arg.K_stride_b,
-                arg.K_stride_m,
-                arg.K_stride_g,
-                arg.K_stride_h,
-                arg.O_stride_split,
-                arg.Q_size_m,
-                arg.Q_size_g,
-                arg.Q_size_h,
-                arg.Q_size_k,
-                arg.K_size_m,
-                arg.multiquery,
-                arg.qk_scale,
-                arg.split_k);
-
-            return split_attention_result;
+struct FMHADecoderSplitAttentionDeviceOp : public BaseOperator {
+  using DeviceOp = FMHADecoderSplitAttentionDeviceOp;
+  struct Argument : public BaseArgument {
+    const scalar_t* __restrict__ XQ;
+    const scalar_t* __restrict__ cache_K;
+    const scalar_t* __restrict__ cache_V;
+    scalar_t* __restrict__ O;
+    scalar_t* __restrict__ split_O;
+    compute_t* __restrict__ split_max;
+    compute_t* __restrict__ split_sumexp;
+    const int32_t* __restrict__ seq_kv_lens;
+    const ptrdiff_t XQ_stride_b;
+    const ptrdiff_t XQ_stride_m;
+    const ptrdiff_t XQ_stride_g;
+    const ptrdiff_t XQ_stride_h;
+    const ptrdiff_t K_stride_b;
+    const ptrdiff_t K_stride_m;
+    const ptrdiff_t K_stride_g;
+    const ptrdiff_t K_stride_h;
+    const ptrdiff_t O_stride_split;
+    const int32_t Q_size_m;
+    const int32_t Q_size_g;
+    const int32_t Q_size_h;
+    const int32_t Q_size_k;
+    const int32_t K_size_m;
+    const bool multiquery;
+    const float qk_scale;
+    const int32_t split_k;
+
+    const dim3 grid_dim;
+    const dim3 block_dim;
+    const size_t lds_bytes;
+
+    Argument(
+        const scalar_t* __restrict__ XQ,
+        const scalar_t* __restrict__ cache_K,
+        const scalar_t* __restrict__ cache_V,
+        scalar_t* __restrict__ O,
+        scalar_t* __restrict__ split_O,
+        compute_t* __restrict__ split_max,
+        compute_t* __restrict__ split_sumexp,
+        const int32_t* __restrict__ seq_kv_lens,
+        const ptrdiff_t XQ_stride_b,
+        const ptrdiff_t XQ_stride_m,
+        const ptrdiff_t XQ_stride_g,
+        const ptrdiff_t XQ_stride_h,
+        const ptrdiff_t K_stride_b,
+        const ptrdiff_t K_stride_m,
+        const ptrdiff_t K_stride_g,
+        const ptrdiff_t K_stride_h,
+        const ptrdiff_t O_stride_split,
+        const int32_t Q_size_m,
+        const int32_t Q_size_g,
+        const int32_t Q_size_h,
+        const int32_t Q_size_k,
+        const int32_t K_size_m,
+        const bool multiquery,
+        const float qk_scale,
+        const int32_t split_k,
+        // launch params
+        const dim3 grid_dim,
+        const dim3 block_dim,
+        const size_t lds_bytes)
+        : XQ(XQ),
+          cache_K(cache_K),
+          cache_V(cache_V),
+          O(O),
+          split_O(split_O),
+          split_max(split_max),
+          split_sumexp(split_sumexp),
+          seq_kv_lens(seq_kv_lens),
+          XQ_stride_b(XQ_stride_b),
+          XQ_stride_m(XQ_stride_m),
+          XQ_stride_g(XQ_stride_g),
+          XQ_stride_h(XQ_stride_h),
+          K_stride_b(K_stride_b),
+          K_stride_m(K_stride_m),
+          K_stride_g(K_stride_g),
+          K_stride_h(K_stride_h),
+          O_stride_split(O_stride_split),
+          Q_size_m(Q_size_m),
+          Q_size_g(Q_size_g),
+          Q_size_h(Q_size_h),
+          Q_size_k(Q_size_k),
+          K_size_m(K_size_m),
+          multiquery(multiquery),
+          qk_scale(qk_scale),
+          split_k(split_k),
+          // launch params
+          grid_dim(grid_dim),
+          block_dim(block_dim),
+          lds_bytes(lds_bytes) {}
+
+    std::string str() const {
+      std::ostringstream oss;
+      oss << "Argument { " << std::endl
+          << "    XQ: " << XQ << std::endl
+          << "    cache_K: " << cache_K << std::endl
+          << "    cache_V: " << cache_V << std::endl
+          << "    O: " << O << std::endl
+          << "    split_O: " << split_O << std::endl
+          << "    split_max: " << split_max << std::endl
+          << "    split_sumexp: " << split_sumexp << std::endl
+          << "    seq_kv_lens: " << seq_kv_lens << std::endl
+          << "    XQ_stride_b: " << XQ_stride_b << std::endl
+          << "    XQ_stride_m: " << XQ_stride_m << std::endl
+          << "    XQ_stride_g: " << XQ_stride_g << std::endl
+          << "    XQ_stride_h: " << XQ_stride_h << std::endl
+          << "    K_stride_b: " << K_stride_b << std::endl
+          << "    K_stride_m: " << K_stride_m << std::endl
+          << "    K_stride_g: " << K_stride_g << std::endl
+          << "    K_stride_h: " << K_stride_h << std::endl
+          << "    O_stride_split: " << O_stride_split << std::endl
+          << "    Q_size_m: " << Q_size_m << std::endl
+          << "    Q_size_g: " << Q_size_g << std::endl
+          << "    Q_size_h: " << Q_size_h << std::endl
+          << "    Q_size_k: " << Q_size_k << std::endl
+          << "    K_size_m: " << K_size_m << std::endl
+          << "    multiquery: " << multiquery << std::endl
+          << "    qk_scale: " << qk_scale << std::endl
+          << "    split_k: " << split_k << std::endl
+          << std::endl
+          << "    grid_dim: " << grid_dim.x << "." << grid_dim.y << "."
+          << grid_dim.z << std::endl
+          << "    block_dim: " << block_dim.x << "." << block_dim.y << "."
+          << block_dim.z << std::endl
+          << "    lds_bytes: " << lds_bytes << std::endl
+          << "}";
+      return oss.str();
+    }
+  };
+
+  struct Invoker : public BaseInvoker {
+    using Argument = DeviceOp::Argument;
+    float Run(
+        const Argument& arg,
+        const StreamConfig& stream_config = StreamConfig{}) {
+      auto threads_per_wavefront = arg.block_dim.x;
+      auto Q_size_k_alignment_necessary = 0;
+
+      for (auto vec_size : {4, 2, 1}) {
+        if (arg.Q_size_k <= vec_size * threads_per_wavefront) {
+          Q_size_k_alignment_necessary = vec_size;
         }
-    };
+      }
+
+      if (!Q_size_k_alignment_necessary) {
+        throw std::runtime_error("Unsupported Q_size_k");
+      }
+
+      if (arg.Q_size_k % Q_size_k_alignment_necessary) {
+        throw std::runtime_error("Unsupported alignment for Q_size_k");
+      }
+
+      float split_attention_result = launch_and_time_kernel(
+          stream_config,
+          Q_size_k_alignment_necessary == 4
+              ? efficient_attention_forward_decoder_splitk_ck_kernel<
+                    scalar_t,
+                    4>
+              : Q_size_k_alignment_necessary == 2
+                  ? efficient_attention_forward_decoder_splitk_ck_kernel<
+                        scalar_t,
+                        2>
+                  : Q_size_k_alignment_necessary == 1
+                      ? efficient_attention_forward_decoder_splitk_ck_kernel<
+                            scalar_t,
+                            1>
+                      : nullptr,
+          arg.grid_dim,
+          arg.block_dim,
+          arg.lds_bytes,
+          arg.XQ,
+          arg.cache_K,
+          arg.cache_V,
+          arg.split_O,
+          arg.split_max,
+          arg.split_sumexp,
+          arg.seq_kv_lens,
+          arg.XQ_stride_b,
+          arg.XQ_stride_m,
+          arg.XQ_stride_g,
+          arg.XQ_stride_h,
+          arg.K_stride_b,
+          arg.K_stride_m,
+          arg.K_stride_g,
+          arg.K_stride_h,
+          arg.O_stride_split,
+          arg.Q_size_m,
+          arg.Q_size_g,
+          arg.Q_size_h,
+          arg.Q_size_k,
+          arg.K_size_m,
+          arg.multiquery,
+          arg.qk_scale,
+          arg.split_k);
+
+      return split_attention_result;
+    }
+  };
 };
 
 template <typename scalar_t, typename compute_t = float>
-struct FMHADecoderSplitReduceDeviceOp : public BaseOperator
-{
-    using DeviceOp = FMHADecoderSplitReduceDeviceOp;
-    struct Argument : public BaseArgument
-    {
-        const scalar_t* __restrict__ split_O;
-        const compute_t* __restrict__ split_max;
-        const compute_t* __restrict__ split_sumexp;
-        scalar_t* __restrict__ O;
-
-        const int32_t O_size_m;
-        const int32_t O_size_g;
-        const int32_t O_size_h;
-        const int32_t O_size_k;
-
-        const ptrdiff_t O_stride_split;
-        const ptrdiff_t O_stride_b;
-        const ptrdiff_t O_stride_m;
-        const ptrdiff_t O_stride_g;
-        const ptrdiff_t O_stride_h;
-
-        const int32_t split_k;
-
-        const dim3 grid_dim;
-        const dim3 block_dim;
-        const size_t lds_bytes;
-
-        Argument(const scalar_t* __restrict__ split_O,
-                 const compute_t* __restrict__ split_max,
-                 const compute_t* __restrict__ split_sumexp,
-                 scalar_t* __restrict__ O,
-                 const int32_t O_size_m,
-                 const int32_t O_size_g,
-                 const int32_t O_size_h,
-                 const int32_t O_size_k,
-                 const ptrdiff_t O_stride_split,
-                 const ptrdiff_t O_stride_b,
-                 const ptrdiff_t O_stride_m,
-                 const ptrdiff_t O_stride_g,
-                 const ptrdiff_t O_stride_h,
-                 const int32_t split_k,
-                 // launch params
-                 const dim3 grid_dim,
-                 const dim3 block_dim,
-                 const size_t lds_bytes)
-            : split_O(split_O),
-              split_max(split_max),
-              split_sumexp(split_sumexp),
-              O(O),
-              O_size_m(O_size_m),
-              O_size_g(O_size_g),
-              O_size_h(O_size_h),
-              O_size_k(O_size_k),
-              O_stride_split(O_stride_split),
-              O_stride_b(O_stride_b),
-              O_stride_m(O_stride_m),
-              O_stride_g(O_stride_g),
-              O_stride_h(O_stride_h),
-              split_k(split_k),
-              // launch params
-              grid_dim(grid_dim),
-              block_dim(block_dim),
-              lds_bytes(lds_bytes)
-        {
-        }
-
-        std::string str() const
-        {
-            std::ostringstream oss;
-            oss << "Argument { " << std::endl
-                << "    O: " << O << std::endl
-                << "    split_O: " << split_O << std::endl
-                << "    split_max: " << split_max << std::endl
-                << "    split_sumexp: " << split_sumexp << std::endl
-                << "    O_stride_b: " << O_stride_b << std::endl
-                << "    O_stride_m: " << O_stride_m << std::endl
-                << "    O_stride_g: " << O_stride_g << std::endl
-                << "    O_stride_h: " << O_stride_h << std::endl
-                << "    O_stride_split: " << O_stride_split << std::endl
-                << "    O_size_m: " << O_size_m << std::endl
-                << "    O_size_g: " << O_size_g << std::endl
-                << "    O_size_h: " << O_size_h << std::endl
-                << "    O_size_k: " << O_size_k << std::endl
-                << "    split_k: " << split_k << std::endl
-                << std::endl
-                << "    grid_dim: " << grid_dim.x << "." << grid_dim.y << "." << grid_dim.z
-                << std::endl
-                << "    block_dim: " << block_dim.x << "." << block_dim.y << "." << block_dim.z
-                << std::endl
-                << "    lds_bytes: " << lds_bytes << std::endl
-                << "}";
-            return oss.str();
-        }
-    };
-
-    struct Invoker : public BaseInvoker
-    {
-        using Argument = DeviceOp::Argument;
-        float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{})
-        {
-            auto threads_per_wavefront = arg.block_dim.x;
-            auto O_size_k_alignment_necessary = 0;
-
-            for(auto vec_size : {4, 2, 1})
-            {
-                if(arg.O_size_k <= vec_size * threads_per_wavefront)
-                {
-                    O_size_k_alignment_necessary = vec_size;
-                }
-            }
-
-            if(!O_size_k_alignment_necessary)
-            {
-                throw std::runtime_error("Unsupported O_size_k");
-            }
-
-            if(arg.O_size_k % O_size_k_alignment_necessary)
-            {
-                throw std::runtime_error("Unsupported alignment for O_size_k");
-            }
-
-            const dim3 reduce_gridsize         = {arg.grid_dim.x};
-            const dim3 reduce_blocksize        = {arg.block_dim.x};
-            constexpr int32_t reduce_lds_bytes = 0;
-            float reduce_result                = launch_and_time_kernel(
-                stream_config,
-                O_size_k_alignment_necessary == 4
-                    ? efficient_attention_forward_decoder_splitk_reduce_ck_kernel<scalar_t, 4>
-                    : O_size_k_alignment_necessary == 2
-                          ? efficient_attention_forward_decoder_splitk_reduce_ck_kernel<scalar_t, 2>
-                          : O_size_k_alignment_necessary == 1
-                                ? efficient_attention_forward_decoder_splitk_reduce_ck_kernel<
-                                      scalar_t,
-                                      1>
-                                : nullptr,
-                reduce_gridsize,
-                reduce_blocksize,
-                reduce_lds_bytes,
-                arg.split_O,
-                arg.split_max,
-                arg.split_sumexp,
-                arg.O,
-                arg.O_size_m,
-                arg.O_size_g,
-                arg.O_size_h,
-                arg.O_size_k,
-                arg.O_stride_split,
-                arg.O_stride_b,
-                arg.O_stride_m,
-                arg.O_stride_g,
-                arg.O_stride_h,
-                arg.split_k);
-            return reduce_result;
+struct FMHADecoderSplitReduceDeviceOp : public BaseOperator {
+  using DeviceOp = FMHADecoderSplitReduceDeviceOp;
+  struct Argument : public BaseArgument {
+    const scalar_t* __restrict__ split_O;
+    const compute_t* __restrict__ split_max;
+    const compute_t* __restrict__ split_sumexp;
+    scalar_t* __restrict__ O;
+
+    const int32_t O_size_m;
+    const int32_t O_size_g;
+    const int32_t O_size_h;
+    const int32_t O_size_k;
+
+    const ptrdiff_t O_stride_split;
+    const ptrdiff_t O_stride_b;
+    const ptrdiff_t O_stride_m;
+    const ptrdiff_t O_stride_g;
+    const ptrdiff_t O_stride_h;
+
+    const int32_t split_k;
+
+    const dim3 grid_dim;
+    const dim3 block_dim;
+    const size_t lds_bytes;
+
+    Argument(
+        const scalar_t* __restrict__ split_O,
+        const compute_t* __restrict__ split_max,
+        const compute_t* __restrict__ split_sumexp,
+        scalar_t* __restrict__ O,
+        const int32_t O_size_m,
+        const int32_t O_size_g,
+        const int32_t O_size_h,
+        const int32_t O_size_k,
+        const ptrdiff_t O_stride_split,
+        const ptrdiff_t O_stride_b,
+        const ptrdiff_t O_stride_m,
+        const ptrdiff_t O_stride_g,
+        const ptrdiff_t O_stride_h,
+        const int32_t split_k,
+        // launch params
+        const dim3 grid_dim,
+        const dim3 block_dim,
+        const size_t lds_bytes)
+        : split_O(split_O),
+          split_max(split_max),
+          split_sumexp(split_sumexp),
+          O(O),
+          O_size_m(O_size_m),
+          O_size_g(O_size_g),
+          O_size_h(O_size_h),
+          O_size_k(O_size_k),
+          O_stride_split(O_stride_split),
+          O_stride_b(O_stride_b),
+          O_stride_m(O_stride_m),
+          O_stride_g(O_stride_g),
+          O_stride_h(O_stride_h),
+          split_k(split_k),
+          // launch params
+          grid_dim(grid_dim),
+          block_dim(block_dim),
+          lds_bytes(lds_bytes) {}
+
+    std::string str() const {
+      std::ostringstream oss;
+      oss << "Argument { " << std::endl
+          << "    O: " << O << std::endl
+          << "    split_O: " << split_O << std::endl
+          << "    split_max: " << split_max << std::endl
+          << "    split_sumexp: " << split_sumexp << std::endl
+          << "    O_stride_b: " << O_stride_b << std::endl
+          << "    O_stride_m: " << O_stride_m << std::endl
+          << "    O_stride_g: " << O_stride_g << std::endl
+          << "    O_stride_h: " << O_stride_h << std::endl
+          << "    O_stride_split: " << O_stride_split << std::endl
+          << "    O_size_m: " << O_size_m << std::endl
+          << "    O_size_g: " << O_size_g << std::endl
+          << "    O_size_h: " << O_size_h << std::endl
+          << "    O_size_k: " << O_size_k << std::endl
+          << "    split_k: " << split_k << std::endl
+          << std::endl
+          << "    grid_dim: " << grid_dim.x << "." << grid_dim.y << "."
+          << grid_dim.z << std::endl
+          << "    block_dim: " << block_dim.x << "." << block_dim.y << "."
+          << block_dim.z << std::endl
+          << "    lds_bytes: " << lds_bytes << std::endl
+          << "}";
+      return oss.str();
+    }
+  };
+
+  struct Invoker : public BaseInvoker {
+    using Argument = DeviceOp::Argument;
+    float Run(
+        const Argument& arg,
+        const StreamConfig& stream_config = StreamConfig{}) {
+      auto threads_per_wavefront = arg.block_dim.x;
+      auto O_size_k_alignment_necessary = 0;
+
+      for (auto vec_size : {4, 2, 1}) {
+        if (arg.O_size_k <= vec_size * threads_per_wavefront) {
+          O_size_k_alignment_necessary = vec_size;
         }
-    };
+      }
+
+      if (!O_size_k_alignment_necessary) {
+        throw std::runtime_error("Unsupported O_size_k");
+      }
+
+      if (arg.O_size_k % O_size_k_alignment_necessary) {
+        throw std::runtime_error("Unsupported alignment for O_size_k");
+      }
+
+      const dim3 reduce_gridsize = {arg.grid_dim.x};
+      const dim3 reduce_blocksize = {arg.block_dim.x};
+      constexpr int32_t reduce_lds_bytes = 0;
+      float reduce_result = launch_and_time_kernel(
+          stream_config,
+          O_size_k_alignment_necessary == 4
+              ? efficient_attention_forward_decoder_splitk_reduce_ck_kernel<
+                    scalar_t,
+                    4>
+              : O_size_k_alignment_necessary == 2
+                  ? efficient_attention_forward_decoder_splitk_reduce_ck_kernel<
+                        scalar_t,
+                        2>
+                  : O_size_k_alignment_necessary == 1
+                      ? efficient_attention_forward_decoder_splitk_reduce_ck_kernel<
+                            scalar_t,
+                            1>
+                      : nullptr,
+          reduce_gridsize,
+          reduce_blocksize,
+          reduce_lds_bytes,
+          arg.split_O,
+          arg.split_max,
+          arg.split_sumexp,
+          arg.O,
+          arg.O_size_m,
+          arg.O_size_g,
+          arg.O_size_h,
+          arg.O_size_k,
+          arg.O_stride_split,
+          arg.O_stride_b,
+          arg.O_stride_m,
+          arg.O_stride_g,
+          arg.O_stride_h,
+          arg.split_k);
+      return reduce_result;
+    }
+  };
 };
 } // namespace device
 } // namespace tensor_operation
 } // namespace ck
 
-static std::tuple<at::Tensor, at::Tensor, at::Tensor>
-split_attention_hip(const at::Tensor& XQ,
-                    const at::Tensor& K,
-                    const at::Tensor& V,
-                    const at::Tensor& seqlen,
-                    const int32_t split_k,
-                    const int32_t wavefronts_per_block)
-{
-
-    at::OptionalDeviceGuard guard(XQ.device());
-
-    auto B = XQ.size(0);
-    auto M = XQ.size(1);
-    auto G = XQ.size(2);
-    auto H = XQ.size(3);
-    auto D = XQ.size(4);
-
-    double qk_scale = 1. / sqrt(D);
-
-    auto O              = at::empty_like(XQ);
-    constexpr auto rank = 5;
-    auto split_O        = at::zeros({split_k, B, M, G, H, D}, XQ.options());
-    auto split_max      = at::empty({B, M, G, H, split_k}, XQ.options().dtype(at::kFloat))
-                         .fill_(ck::NumericLimits<float>::Lowest());
-    auto split_sumexp = at::zeros_like(split_max);
-
-    dim3 blocks(B * H * M * G, split_k);
-    dim3 threads(kThreadsPerWavefront, wavefronts_per_block);
-
-    constexpr int32_t KV_M_MAX = 8192;
-    constexpr int32_t K_MAX    = 4 * kThreadsPerWavefront;
-
-    int32_t smem_softmax = KV_M_MAX * sizeof(float) + threads.y * sizeof(float);
-    int32_t smem_output  = K_MAX * sizeof(float) *
-                          threads.y; // 4 * threadsPerBlock * sizeof(float) == sizeof(O[b][0][h][:])
-    const size_t lds_bytes = max(smem_softmax, smem_output);
-    auto stream            = at::cuda::getCurrentHIPStream().stream();
-
-    AT_DISPATCH_SWITCH_3(
-        at::ScalarType::Half,
-        at::ScalarType::BFloat16,
-        at::ScalarType::Float,
-        XQ.scalar_type(),
-        "efficient_attention_forward_decoder_split_attention_ck_test",
-        [&] {
-            using ck_data_t = c10_to_data_t<scalar_t>::type;
-            using device_op_t =
-                ck::tensor_operation::device::FMHADecoderSplitAttentionDeviceOp<ck_data_t>;
-            auto op = device_op_t{};
-
-            auto XQ_acc = XQ.packed_accessor32<scalar_t, rank, at::RestrictPtrTraits>();
-            auto K_acc  = K.packed_accessor64<scalar_t, rank, at::RestrictPtrTraits>();
-            auto V_acc  = V.packed_accessor64<scalar_t, rank, at::RestrictPtrTraits>();
-            auto split_O_acc =
-                split_O.packed_accessor32<scalar_t, 1 + rank, at::RestrictPtrTraits>();
-            auto O_acc         = O.packed_accessor32<scalar_t, rank, at::RestrictPtrTraits>();
-            auto seq_acc       = seqlen.packed_accessor32<int32_t, 1, at::RestrictPtrTraits>();
-            auto split_max_acc = split_max.packed_accessor32<float, rank, at::RestrictPtrTraits>();
-            auto split_sumexp_acc =
-                split_sumexp.packed_accessor32<float, rank, at::RestrictPtrTraits>();
-            auto arg = device_op_t::Argument(
-                reinterpret_cast<const ck_data_t* __restrict__>(XQ_acc.data()),
-                reinterpret_cast<const ck_data_t* __restrict__>(K_acc.data()),
-                reinterpret_cast<const ck_data_t* __restrict__>(V_acc.data()),
-                reinterpret_cast<ck_data_t* __restrict__>(O_acc.data()),
-                reinterpret_cast<ck_data_t* __restrict__>(split_O_acc.data()),
-                split_max_acc.data(),
-                split_sumexp_acc.data(),
-                seq_acc.data(),
-                XQ_acc.stride(0),
-                XQ_acc.stride(1),
-                XQ_acc.stride(2),
-                XQ_acc.stride(3),
-                K_acc.stride(0),
-                K_acc.stride(1),
-                K_acc.stride(2),
-                K_acc.stride(3),
-                split_O_acc.stride(0),
-                XQ_acc.size(1),
-                XQ_acc.size(2),
-                XQ_acc.size(3),
-                XQ_acc.size(4),
-                K_acc.size(1),
-                K_acc.size(3) == 1,
-                qk_scale,
-                split_k,
-                blocks,
-                threads,
-                lds_bytes);
-
-            auto invoker = device_op_t::Invoker{};
-            (void)invoker.Run(arg, {stream});
-        });
-    return std::make_tuple(split_O, split_max, split_sumexp);
+static std::tuple<at::Tensor, at::Tensor, at::Tensor> split_attention_hip(
+    const at::Tensor& XQ,
+    const at::Tensor& K,
+    const at::Tensor& V,
+    const at::Tensor& seqlen,
+    const int32_t split_k,
+    const int32_t wavefronts_per_block) {
+  at::OptionalDeviceGuard guard(XQ.device());
+
+  auto B = XQ.size(0);
+  auto M = XQ.size(1);
+  auto G = XQ.size(2);
+  auto H = XQ.size(3);
+  auto D = XQ.size(4);
+
+  double qk_scale = 1. / sqrt(D);
+
+  auto O = at::empty_like(XQ);
+  constexpr auto rank = 5;
+  auto split_O = at::zeros({split_k, B, M, G, H, D}, XQ.options());
+  auto split_max =
+      at::empty({B, M, G, H, split_k}, XQ.options().dtype(at::kFloat))
+          .fill_(ck::NumericLimits<float>::Lowest());
+  auto split_sumexp = at::zeros_like(split_max);
+
+  dim3 blocks(B * H * M * G, split_k);
+  dim3 threads(kThreadsPerWavefront, wavefronts_per_block);
+
+  constexpr int32_t KV_M_MAX = 8192;
+  constexpr int32_t K_MAX = 4 * kThreadsPerWavefront;
+
+  int32_t smem_softmax = KV_M_MAX * sizeof(float) + threads.y * sizeof(float);
+  int32_t smem_output = K_MAX * sizeof(float) *
+      threads.y; // 4 * threadsPerBlock * sizeof(float) == sizeof(O[b][0][h][:])
+  const size_t lds_bytes = max(smem_softmax, smem_output);
+  auto stream = at::cuda::getCurrentHIPStream().stream();
+
+  AT_DISPATCH_SWITCH_3(
+      at::ScalarType::Half,
+      at::ScalarType::BFloat16,
+      at::ScalarType::Float,
+      XQ.scalar_type(),
+      "efficient_attention_forward_decoder_split_attention_ck_test",
+      [&] {
+        using ck_data_t = c10_to_data_t<scalar_t>::type;
+        using device_op_t =
+            ck::tensor_operation::device::FMHADecoderSplitAttentionDeviceOp<
+                ck_data_t>;
+        auto op = device_op_t{};
+
+        auto XQ_acc =
+            XQ.packed_accessor32<scalar_t, rank, at::RestrictPtrTraits>();
+        auto K_acc =
+            K.packed_accessor64<scalar_t, rank, at::RestrictPtrTraits>();
+        auto V_acc =
+            V.packed_accessor64<scalar_t, rank, at::RestrictPtrTraits>();
+        auto split_O_acc =
+            split_O
+                .packed_accessor32<scalar_t, 1 + rank, at::RestrictPtrTraits>();
+        auto O_acc =
+            O.packed_accessor32<scalar_t, rank, at::RestrictPtrTraits>();
+        auto seq_acc =
+            seqlen.packed_accessor32<int32_t, 1, at::RestrictPtrTraits>();
+        auto split_max_acc =
+            split_max.packed_accessor32<float, rank, at::RestrictPtrTraits>();
+        auto split_sumexp_acc =
+            split_sumexp
+                .packed_accessor32<float, rank, at::RestrictPtrTraits>();
+        auto arg = device_op_t::Argument(
+            reinterpret_cast<const ck_data_t* __restrict__>(XQ_acc.data()),
+            reinterpret_cast<const ck_data_t* __restrict__>(K_acc.data()),
+            reinterpret_cast<const ck_data_t* __restrict__>(V_acc.data()),
+            reinterpret_cast<ck_data_t* __restrict__>(O_acc.data()),
+            reinterpret_cast<ck_data_t* __restrict__>(split_O_acc.data()),
+            split_max_acc.data(),
+            split_sumexp_acc.data(),
+            seq_acc.data(),
+            XQ_acc.stride(0),
+            XQ_acc.stride(1),
+            XQ_acc.stride(2),
+            XQ_acc.stride(3),
+            K_acc.stride(0),
+            K_acc.stride(1),
+            K_acc.stride(2),
+            K_acc.stride(3),
+            split_O_acc.stride(0),
+            XQ_acc.size(1),
+            XQ_acc.size(2),
+            XQ_acc.size(3),
+            XQ_acc.size(4),
+            K_acc.size(1),
+            K_acc.size(3) == 1,
+            qk_scale,
+            split_k,
+            blocks,
+            threads,
+            lds_bytes);
+
+        auto invoker = device_op_t::Invoker{};
+        (void)invoker.Run(arg, {stream});
+      });
+  return std::make_tuple(split_O, split_max, split_sumexp);
 }
 
-static at::Tensor split_reduce_hip(const at::Tensor& split_O,
-                                   const at::Tensor& split_max,
-                                   const at::Tensor& split_sumexp,
-                                   const int32_t split_k)
-{
-    at::OptionalDeviceGuard guard(split_O.device());
-
-    auto B = split_O.size(1);
-    auto M = split_O.size(2);
-    auto G = split_O.size(3);
-    auto H = split_O.size(4);
-    auto D = split_O.size(5);
-
-    TORCH_CHECK_EQ(split_k, split_O.size(0));
-    TORCH_CHECK_EQ(split_k, split_max.size(-1));
-    TORCH_CHECK_EQ(split_k, split_sumexp.size(-1));
-
-    constexpr auto rank = 5;
-
-    TORCH_CHECK_EQ(split_O.dim(), 1 + rank);
-    TORCH_CHECK_EQ(split_max.dim(), rank);
-    TORCH_CHECK_EQ(split_sumexp.dim(), rank);
-
-    auto O = at::zeros({B, M, G, H, D}, split_O.options());
-
-    auto stream    = at::cuda::getCurrentHIPStream().stream();
-    auto lds_bytes = 0;
-
-    dim3 blocks(B * H * M * G);
-    dim3 threads(kThreadsPerWavefront);
-
-    AT_DISPATCH_SWITCH_3(
-        at::ScalarType::Half,
-        at::ScalarType::BFloat16,
-        at::ScalarType::Float,
-        O.scalar_type(),
-        "efficient_attention_forward_decoder_split_reduce_ck_test",
-        [&] {
-            using ck_data_t = c10_to_data_t<scalar_t>::type;
-            using device_op_t =
-                ck::tensor_operation::device::FMHADecoderSplitReduceDeviceOp<ck_data_t>;
-            auto op = device_op_t{};
-
-            auto split_O_acc =
-                split_O.packed_accessor32<scalar_t, 1 + rank, at::RestrictPtrTraits>();
-            auto O_acc         = O.packed_accessor32<scalar_t, rank, at::RestrictPtrTraits>();
-            auto split_max_acc = split_max.packed_accessor32<float, rank, at::RestrictPtrTraits>();
-            auto split_sumexp_acc =
-                split_sumexp.packed_accessor32<float, rank, at::RestrictPtrTraits>();
-            auto arg = device_op_t::Argument(
-                reinterpret_cast<const ck_data_t* __restrict__>(split_O_acc.data()),
-                split_max_acc.data(),
-                split_sumexp_acc.data(),
-                reinterpret_cast<ck_data_t* __restrict__>(O_acc.data()),
-                O_acc.size(1),
-                O_acc.size(2),
-                O_acc.size(3),
-                O_acc.size(4),
-                split_O_acc.stride(0),
-                O_acc.stride(0),
-                O_acc.stride(1),
-                O_acc.stride(2),
-                O_acc.stride(3),
-                split_k,
-                blocks,
-                threads,
-                lds_bytes);
-
-            auto invoker = device_op_t::Invoker{};
-            (void)invoker.Run(arg, {stream});
-        });
-    return O;
+static at::Tensor split_reduce_hip(
+    const at::Tensor& split_O,
+    const at::Tensor& split_max,
+    const at::Tensor& split_sumexp,
+    const int32_t split_k) {
+  at::OptionalDeviceGuard guard(split_O.device());
+
+  auto B = split_O.size(1);
+  auto M = split_O.size(2);
+  auto G = split_O.size(3);
+  auto H = split_O.size(4);
+  auto D = split_O.size(5);
+
+  TORCH_CHECK_EQ(split_k, split_O.size(0));
+  TORCH_CHECK_EQ(split_k, split_max.size(-1));
+  TORCH_CHECK_EQ(split_k, split_sumexp.size(-1));
+
+  constexpr auto rank = 5;
+
+  TORCH_CHECK_EQ(split_O.dim(), 1 + rank);
+  TORCH_CHECK_EQ(split_max.dim(), rank);
+  TORCH_CHECK_EQ(split_sumexp.dim(), rank);
+
+  auto O = at::zeros({B, M, G, H, D}, split_O.options());
+
+  auto stream = at::cuda::getCurrentHIPStream().stream();
+  auto lds_bytes = 0;
+
+  dim3 blocks(B * H * M * G);
+  dim3 threads(kThreadsPerWavefront);
+
+  AT_DISPATCH_SWITCH_3(
+      at::ScalarType::Half,
+      at::ScalarType::BFloat16,
+      at::ScalarType::Float,
+      O.scalar_type(),
+      "efficient_attention_forward_decoder_split_reduce_ck_test",
+      [&] {
+        using ck_data_t = c10_to_data_t<scalar_t>::type;
+        using device_op_t =
+            ck::tensor_operation::device::FMHADecoderSplitReduceDeviceOp<
+                ck_data_t>;
+        auto op = device_op_t{};
+
+        auto split_O_acc =
+            split_O
+                .packed_accessor32<scalar_t, 1 + rank, at::RestrictPtrTraits>();
+        auto O_acc =
+            O.packed_accessor32<scalar_t, rank, at::RestrictPtrTraits>();
+        auto split_max_acc =
+            split_max.packed_accessor32<float, rank, at::RestrictPtrTraits>();
+        auto split_sumexp_acc =
+            split_sumexp
+                .packed_accessor32<float, rank, at::RestrictPtrTraits>();
+        auto arg = device_op_t::Argument(
+            reinterpret_cast<const ck_data_t* __restrict__>(split_O_acc.data()),
+            split_max_acc.data(),
+            split_sumexp_acc.data(),
+            reinterpret_cast<ck_data_t* __restrict__>(O_acc.data()),
+            O_acc.size(1),
+            O_acc.size(2),
+            O_acc.size(3),
+            O_acc.size(4),
+            split_O_acc.stride(0),
+            O_acc.stride(0),
+            O_acc.stride(1),
+            O_acc.stride(2),
+            O_acc.stride(3),
+            split_k,
+            blocks,
+            threads,
+            lds_bytes);
+
+        auto invoker = device_op_t::Invoker{};
+        (void)invoker.Run(arg, {stream});
+      });
+  return O;
 }
 
-std::tuple<at::Tensor, at::Tensor, at::Tensor, at::Tensor>
-generate_inputs(const int32_t padding,
-                const int32_t B,
-                const int32_t Hq,
-                const int32_t Hkv,
-                const decltype(torch::kFloat32) dtype = torch::kFloat32)
-{
-    const int32_t D           = 4 * kThreadsPerWavefront;
-    const int32_t G           = Hq / Hkv;
-    const int32_t num_queries = 1;
-
-    at::manual_seed(1);
-
-    auto options = torch::TensorOptions()
-                       .dtype(dtype)
-                       .layout(torch::kStrided)
-                       .device(torch::kCUDA, 1)
-                       .requires_grad(false);
-    auto int_options = options.dtype(torch::kInt);
-    auto XQ          = at::randn({B, num_queries, G, Hq, D}, options);
-    auto K           = (G == 1) ? at::randn({B, padding, G, Hkv, D}, options)
-                      : at::randn({B, padding, G, 1, D}, options).expand({B, padding, G, Hq, D});
-    auto V      = at::randn_like(K);
-    auto seqlen = at::randint(num_queries, padding + 1, {B}, int_options);
-
-    return std::make_tuple(XQ, K, V, seqlen);
+std::tuple<at::Tensor, at::Tensor, at::Tensor, at::Tensor> generate_inputs(
+    const int32_t padding,
+    const int32_t B,
+    const int32_t Hq,
+    const int32_t Hkv,
+    const decltype(torch::kFloat32) dtype = torch::kFloat32) {
+  const int32_t D = 4 * kThreadsPerWavefront;
+  const int32_t G = Hq / Hkv;
+  const int32_t num_queries = 1;
+
+  at::manual_seed(1);
+
+  auto options = torch::TensorOptions()
+                     .dtype(dtype)
+                     .layout(torch::kStrided)
+                     .device(torch::kCUDA, 1)
+                     .requires_grad(false);
+  auto int_options = options.dtype(torch::kInt);
+  auto XQ = at::randn({B, num_queries, G, Hq, D}, options);
+  auto K = (G == 1) ? at::randn({B, padding, G, Hkv, D}, options)
+                    : at::randn({B, padding, G, 1, D}, options)
+                          .expand({B, padding, G, Hq, D});
+  auto V = at::randn_like(K);
+  auto seqlen = at::randint(num_queries, padding + 1, {B}, int_options);
+
+  return std::make_tuple(XQ, K, V, seqlen);
 }
 
-static float percent_mismatch(const at::Tensor& a, const at::Tensor& b)
-{
-    auto mask          = at::isclose(a, b, /*atol*/ 1e-3, /*rtol*/ 1e-5, /*equal_nan*/ false);
-    auto percent_match = at::sum(mask.to(torch::kFloat32)) / mask.numel();
-    return 1. - percent_match.item<float>();
+static float percent_mismatch(const at::Tensor& a, const at::Tensor& b) {
+  auto mask =
+      at::isclose(a, b, /*atol*/ 1e-3, /*rtol*/ 1e-5, /*equal_nan*/ false);
+  auto percent_match = at::sum(mask.to(torch::kFloat32)) / mask.numel();
+  return 1. - percent_match.item<float>();
 }
 
-static void
-test_split_attention(int32_t padding, int32_t batch_size, int32_t Hq, int32_t Hkv, int32_t split_k)
-{
-    auto [XQ, K, V, seqlen] = generate_inputs(padding, batch_size, Hq, Hkv);
-
-    auto [O_ref, m_ref, l_ref] =
-        split_attention_torch(XQ, K, V, seqlen, split_k, /* block_size */ kWavefrontsPerBlock * 16);
-
-    auto [O_hip, m_hip, l_hip] =
-        split_attention_hip(XQ, K, V, seqlen, split_k, kWavefrontsPerBlock);
-
-    auto O_percent_mismatch = percent_mismatch(O_ref, O_hip);
-    auto m_percent_mismatch = percent_mismatch(m_ref, m_hip);
-    auto l_percent_mismatch = percent_mismatch(l_ref, l_hip);
-
-    printf("[Test split attention] Padding=%d BS=%d Hq=%d Hkv=%d split_k=%d Mismatched split_O "
-           "elements percentage: %.2f Mismatched split_max elements percentage: %.2f Mismatched "
-           "split_sumexp elements percentage: %.2f\n",
-           padding,
-           batch_size,
-           Hq,
-           Hkv,
-           split_k,
-           O_percent_mismatch,
-           m_percent_mismatch,
-           l_percent_mismatch);
+static void test_split_attention(
+    int32_t padding,
+    int32_t batch_size,
+    int32_t Hq,
+    int32_t Hkv,
+    int32_t split_k) {
+  auto [XQ, K, V, seqlen] = generate_inputs(padding, batch_size, Hq, Hkv);
+
+  auto [O_ref, m_ref, l_ref] = split_attention_torch(
+      XQ, K, V, seqlen, split_k, /* block_size */ kWavefrontsPerBlock * 16);
+
+  auto [O_hip, m_hip, l_hip] =
+      split_attention_hip(XQ, K, V, seqlen, split_k, kWavefrontsPerBlock);
+
+  auto O_percent_mismatch = percent_mismatch(O_ref, O_hip);
+  auto m_percent_mismatch = percent_mismatch(m_ref, m_hip);
+  auto l_percent_mismatch = percent_mismatch(l_ref, l_hip);
+
+  printf(
+      "[Test split attention] Padding=%d BS=%d Hq=%d Hkv=%d split_k=%d Mismatched split_O "
+      "elements percentage: %.2f Mismatched split_max elements percentage: %.2f Mismatched "
+      "split_sumexp elements percentage: %.2f\n",
+      padding,
+      batch_size,
+      Hq,
+      Hkv,
+      split_k,
+      O_percent_mismatch,
+      m_percent_mismatch,
+      l_percent_mismatch);
 }
 
-static void
-test_split_reduce(int32_t padding, int32_t batch_size, int32_t Hq, int32_t Hkv, int32_t split_k)
-{
-    auto [XQ, K, V, seqlen] = generate_inputs(padding, batch_size, Hq, Hkv);
-
-    auto [O_ref, m_ref, l_ref] =
-        split_attention_hip(XQ, K, V, seqlen, split_k, kWavefrontsPerBlock);
-
-    auto O_torch = split_reduce_torch(O_ref, m_ref.unsqueeze(0), l_ref.unsqueeze(0), split_k);
-    auto O_hip   = split_reduce_hip(O_ref, m_ref, l_ref, split_k);
-
-    auto hip_torch_mismatch = percent_mismatch(O_hip, O_torch);
-    printf("[Test split reduce] Padding=%d BS=%d Hq=%d Hkv=%d split_k=%d Mismatched elements "
-           "percentage: %.2f \n",
-           padding,
-           batch_size,
-           Hq,
-           Hkv,
-           split_k,
-           hip_torch_mismatch);
+static void test_split_reduce(
+    int32_t padding,
+    int32_t batch_size,
+    int32_t Hq,
+    int32_t Hkv,
+    int32_t split_k) {
+  auto [XQ, K, V, seqlen] = generate_inputs(padding, batch_size, Hq, Hkv);
+
+  auto [O_ref, m_ref, l_ref] =
+      split_attention_hip(XQ, K, V, seqlen, split_k, kWavefrontsPerBlock);
+
+  auto O_torch = split_reduce_torch(
+      O_ref, m_ref.unsqueeze(0), l_ref.unsqueeze(0), split_k);
+  auto O_hip = split_reduce_hip(O_ref, m_ref, l_ref, split_k);
+
+  auto hip_torch_mismatch = percent_mismatch(O_hip, O_torch);
+  printf(
+      "[Test split reduce] Padding=%d BS=%d Hq=%d Hkv=%d split_k=%d Mismatched elements "
+      "percentage: %.2f \n",
+      padding,
+      batch_size,
+      Hq,
+      Hkv,
+      split_k,
+      hip_torch_mismatch);
 }
 
 static void test_splitk_decoder_e2e_correctness(
-    int32_t padding, int32_t batch_size, int32_t Hq, int32_t Hkv, int32_t split_k)
-{
-    auto [XQ, K, V, seqlen] = generate_inputs(padding, batch_size, Hq, Hkv);
-
-    double qk_scale = 1. / sqrt(XQ.size(-1));
-
-    auto result = efficient_attention_forward_decoder_splitk_ck_impl<kThreadsPerWavefront,
-                                                                     kWavefrontsPerBlock>(
-        XQ, K, V, seqlen, qk_scale, split_k);
-    auto gold_result = efficient_attention_forward_decoder_splitk_torch(
-        XQ, K, V, seqlen, qk_scale, /* split_k */ 1, /* block_size */ 1);
-    auto e2e_mismatch = percent_mismatch(result, gold_result);
-    printf("[Test e2e split-k decoder] Padding=%d BS=%d Hq=%d Hkv=%d split_k=%d Mismatched "
-           "elements percentage: %.2f\n",
-           padding,
-           batch_size,
-           Hq,
-           Hkv,
-           split_k,
-           e2e_mismatch);
+    int32_t padding,
+    int32_t batch_size,
+    int32_t Hq,
+    int32_t Hkv,
+    int32_t split_k) {
+  auto [XQ, K, V, seqlen] = generate_inputs(padding, batch_size, Hq, Hkv);
+
+  double qk_scale = 1. / sqrt(XQ.size(-1));
+
+  auto result = efficient_attention_forward_decoder_splitk_ck_impl<
+      kThreadsPerWavefront,
+      kWavefrontsPerBlock>(XQ, K, V, seqlen, qk_scale, split_k);
+  auto gold_result = efficient_attention_forward_decoder_splitk_torch(
+      XQ, K, V, seqlen, qk_scale, /* split_k */ 1, /* block_size */ 1);
+  auto e2e_mismatch = percent_mismatch(result, gold_result);
+  printf(
+      "[Test e2e split-k decoder] Padding=%d BS=%d Hq=%d Hkv=%d split_k=%d Mismatched "
+      "elements percentage: %.2f\n",
+      padding,
+      batch_size,
+      Hq,
+      Hkv,
+      split_k,
+      e2e_mismatch);
 }
 
-int main(int argc, char** argv)
-{
-    if(argc == 1)
-    {
-        for(auto padding : {32, 4096})
-        {
-            for(auto batch_size : {1, 8})
-            {
-                for(auto Hq : {16})
-                {
-                    for(auto Hkv : {16})
-                    {
-                        for(auto split_k : {1, 2, 4, 8, 16})
-                        {
-                            test_splitk_decoder_e2e_correctness(
-                                padding, batch_size, Hq, Hkv, split_k);
-                        }
-                    }
-                }
+int main(int argc, char** argv) {
+  if (argc == 1) {
+    for (auto padding : {32, 4096}) {
+      for (auto batch_size : {1, 8}) {
+        for (auto Hq : {16}) {
+          for (auto Hkv : {16}) {
+            for (auto split_k : {1, 2, 4, 8, 16}) {
+              test_splitk_decoder_e2e_correctness(
+                  padding, batch_size, Hq, Hkv, split_k);
             }
+          }
         }
+      }
+    }
 
-        for(auto padding : {32, 4096})
-        {
-            for(auto batch_size : {1, 8})
-            {
-                for(auto Hq : {16})
-                {
-                    for(auto Hkv : {16})
-                    {
-                        for(auto split_k : {1, 2, 4, 8, 16})
-                        {
-                            test_split_attention(padding, batch_size, Hq, Hkv, split_k);
-                        }
-                    }
-                }
+    for (auto padding : {32, 4096}) {
+      for (auto batch_size : {1, 8}) {
+        for (auto Hq : {16}) {
+          for (auto Hkv : {16}) {
+            for (auto split_k : {1, 2, 4, 8, 16}) {
+              test_split_attention(padding, batch_size, Hq, Hkv, split_k);
             }
+          }
         }
+      }
+    }
 
-        for(auto padding : {32, 4096})
-        {
-            for(auto batch_size : {1, 8})
-            {
-                for(auto Hq : {16})
-                {
-                    for(auto Hkv : {16})
-                    {
-                        for(auto split_k : {1, 2})
-                        {
-                            test_split_reduce(padding, batch_size, Hq, Hkv, split_k);
-                        }
-                    }
-                }
+    for (auto padding : {32, 4096}) {
+      for (auto batch_size : {1, 8}) {
+        for (auto Hq : {16}) {
+          for (auto Hkv : {16}) {
+            for (auto split_k : {1, 2}) {
+              test_split_reduce(padding, batch_size, Hq, Hkv, split_k);
             }
+          }
         }
+      }
     }
-    else
-    {
-        const auto args = std::vector<std::string>(argv + 1, argv + argc);
-        if(args.size() != 6)
-        {
-            std::cout << "Usage: ./a.out padding batch_size nq_heads nkv_heads dtype "
-                         "n_wavefronts_per_block"
-                      << std::endl;
-            return 0;
-        }
-        const int32_t padding    = std::stoi(args[0]);
-        const int32_t batch_size = std::stoi(args[1]);
-        const int32_t nq_heads   = std::stoi(args[2]);
-        const int32_t nkv_heads  = std::stoi(args[3]);
-        const auto dtype         = (args[4] == "f32")
-                               ? torch::kFloat32
-                               : (args[4] == "f16") ? torch::kFloat16 : torch::kBFloat16;
-        const int32_t n_wavefronts_per_block = std::stoi(args[5]);
-
-        auto [Q, K, V, seq] = generate_inputs(padding, batch_size, nq_heads, nkv_heads, dtype);
-        auto O              = at::empty_like(Q);
-
-        constexpr auto splitk_dim = 0;
-        constexpr auto split_k    = 1;
-        auto O_splits             = at::stack(O, splitk_dim);
-
-        auto split_max    = at::empty({batch_size, padding, Q.size(2), Q.size(3), split_k},
-                                   Q.options().dtype(at::kFloat));
-        auto split_sumexp = at::empty_like(split_max);
-
-        const double qk_scale = 1. / sqrt(Q.size(-1));
-        auto call_ptr         = decltype(
-            &efficient_attention_forward_decoder_splitk_ck_out_impl<kThreadsPerWavefront,
-                                                                    kWavefrontsPerBlock>){};
-
-#define SWITCH_CASE_SET_CALLPTR(n)                                                              \
-    case(n):                                                                                    \
-        call_ptr =                                                                              \
-            &efficient_attention_forward_decoder_splitk_ck_out_impl<kThreadsPerWavefront, (n)>; \
-        break;
-
-        switch(n_wavefronts_per_block)
-        {
-            SWITCH_CASE_SET_CALLPTR(1);
-            SWITCH_CASE_SET_CALLPTR(2);
-            SWITCH_CASE_SET_CALLPTR(4);
-            SWITCH_CASE_SET_CALLPTR(8);
-            SWITCH_CASE_SET_CALLPTR(16);
+  } else {
+    const auto args = std::vector<std::string>(argv + 1, argv + argc);
+    if (args.size() != 6) {
+      std::cout << "Usage: ./a.out padding batch_size nq_heads nkv_heads dtype "
+                   "n_wavefronts_per_block"
+                << std::endl;
+      return 0;
+    }
+    const int32_t padding = std::stoi(args[0]);
+    const int32_t batch_size = std::stoi(args[1]);
+    const int32_t nq_heads = std::stoi(args[2]);
+    const int32_t nkv_heads = std::stoi(args[3]);
+    const auto dtype = (args[4] == "f32")
+        ? torch::kFloat32
+        : (args[4] == "f16") ? torch::kFloat16 : torch::kBFloat16;
+    const int32_t n_wavefronts_per_block = std::stoi(args[5]);
+
+    auto [Q, K, V, seq] =
+        generate_inputs(padding, batch_size, nq_heads, nkv_heads, dtype);
+    auto O = at::empty_like(Q);
+
+    constexpr auto splitk_dim = 0;
+    constexpr auto split_k = 1;
+    auto O_splits = at::stack(O, splitk_dim);
+
+    auto split_max = at::empty(
+        {batch_size, padding, Q.size(2), Q.size(3), split_k},
+        Q.options().dtype(at::kFloat));
+    auto split_sumexp = at::empty_like(split_max);
 
-        default: call_ptr = nullptr; break;
-        }
+    const double qk_scale = 1. / sqrt(Q.size(-1));
+    auto call_ptr =
+        decltype(&efficient_attention_forward_decoder_splitk_ck_out_impl<
+                 kThreadsPerWavefront,
+                 kWavefrontsPerBlock>){};
+
+#define SWITCH_CASE_SET_CALLPTR(n)                                      \
+  case (n):                                                             \
+    call_ptr = &efficient_attention_forward_decoder_splitk_ck_out_impl< \
+        kThreadsPerWavefront,                                           \
+        (n)>;                                                           \
+    break;
+
+    switch (n_wavefronts_per_block) {
+      SWITCH_CASE_SET_CALLPTR(1);
+      SWITCH_CASE_SET_CALLPTR(2);
+      SWITCH_CASE_SET_CALLPTR(4);
+      SWITCH_CASE_SET_CALLPTR(8);
+      SWITCH_CASE_SET_CALLPTR(16);
+
+      default:
+        call_ptr = nullptr;
+        break;
+    }
 #undef SWITCH_CASE_SET_CALLPTR
 
-        if(call_ptr)
-        {
-            call_ptr(Q, K, V, seq, qk_scale, split_k, split_max, split_sumexp, O_splits, O);
-        }
-        else
-        {
-            std::cout << "Warning: no kernel was found for wavefronts_per_block="
-                      << n_wavefronts_per_block << std::endl;
-        }
+    if (call_ptr) {
+      call_ptr(
+          Q,
+          K,
+          V,
+          seq,
+          qk_scale,
+          split_k,
+          split_max,
+          split_sumexp,
+          O_splits,
+          O);
+    } else {
+      std::cout << "Warning: no kernel was found for wavefronts_per_block="
+                << n_wavefronts_per_block << std::endl;
     }
-    return 0;
+  }
+  return 0;
 }
 
 #endif // MAIN
diff --git a/xformers/csrc/attention/hip_fmha/ck_align_switch.h b/xformers/csrc/attention/hip_fmha/ck_align_switch.h
index f3dd9dbbe..9e7228355 100644
--- a/xformers/csrc/attention/hip_fmha/ck_align_switch.h
+++ b/xformers/csrc/attention/hip_fmha/ck_align_switch.h
@@ -9,163 +9,143 @@
 #include <stdexcept>
 
 // assume the maximum alignment is 8 elements
-#define ALIGN_SWITCH_1(CONST_ALIGN_MAX1, CONST_ALIGN_NAME1, LENGTH1, ...)                       \
-    [&] {                                                                                       \
-        if constexpr(CONST_ALIGN_MAX1 > 0)                                                      \
-        {                                                                                       \
-            if(LENGTH1 % CONST_ALIGN_MAX1 == 0)                                                 \
-            {                                                                                   \
-                constexpr ck::index_t CONST_ALIGN_NAME1 = CONST_ALIGN_MAX1;                     \
-                __VA_ARGS__();                                                                  \
-            }                                                                                   \
-            else                                                                                \
-            {                                                                                   \
-                if constexpr(CONST_ALIGN_MAX1 / 2 > 0)                                          \
-                {                                                                               \
-                    if(LENGTH1 % (CONST_ALIGN_MAX1 / 2) == 0)                                   \
-                    {                                                                           \
-                        constexpr ck::index_t CONST_ALIGN_NAME1 = CONST_ALIGN_MAX1 / 2;         \
-                        __VA_ARGS__();                                                          \
-                    }                                                                           \
-                    else                                                                        \
-                    {                                                                           \
-                        if constexpr(CONST_ALIGN_MAX1 / 4 > 0)                                  \
-                        {                                                                       \
-                            if(LENGTH1 % (CONST_ALIGN_MAX1 / 4) == 0)                           \
-                            {                                                                   \
-                                constexpr ck::index_t CONST_ALIGN_NAME1 = CONST_ALIGN_MAX1 / 4; \
-                                __VA_ARGS__();                                                  \
-                            }                                                                   \
-                            else                                                                \
-                            {                                                                   \
-                                constexpr ck::index_t CONST_ALIGN_NAME1 = 1;                    \
-                                __VA_ARGS__();                                                  \
-                            };                                                                  \
-                        }                                                                       \
-                    };                                                                          \
-                }                                                                               \
-            };                                                                                  \
-        }                                                                                       \
-    }()
+#define ALIGN_SWITCH_1(CONST_ALIGN_MAX1, CONST_ALIGN_NAME1, LENGTH1, ...)   \
+  [&] {                                                                     \
+    if constexpr (CONST_ALIGN_MAX1 > 0) {                                   \
+      if (LENGTH1 % CONST_ALIGN_MAX1 == 0) {                                \
+        constexpr ck::index_t CONST_ALIGN_NAME1 = CONST_ALIGN_MAX1;         \
+        __VA_ARGS__();                                                      \
+      } else {                                                              \
+        if constexpr (CONST_ALIGN_MAX1 / 2 > 0) {                           \
+          if (LENGTH1 % (CONST_ALIGN_MAX1 / 2) == 0) {                      \
+            constexpr ck::index_t CONST_ALIGN_NAME1 = CONST_ALIGN_MAX1 / 2; \
+            __VA_ARGS__();                                                  \
+          } else {                                                          \
+            if constexpr (CONST_ALIGN_MAX1 / 4 > 0) {                       \
+              if (LENGTH1 % (CONST_ALIGN_MAX1 / 4) == 0) {                  \
+                constexpr ck::index_t CONST_ALIGN_NAME1 =                   \
+                    CONST_ALIGN_MAX1 / 4;                                   \
+                __VA_ARGS__();                                              \
+              } else {                                                      \
+                constexpr ck::index_t CONST_ALIGN_NAME1 = 1;                \
+                __VA_ARGS__();                                              \
+              };                                                            \
+            }                                                               \
+          };                                                                \
+        }                                                                   \
+      };                                                                    \
+    }                                                                       \
+  }()
 
 // assume the maximum alignment is 8 elements
-#define ALIGN_SWITCH_2(CONST_ALIGN_MAX1,                                                          \
-                       CONST_ALIGN_NAME1,                                                         \
-                       LENGTH1,                                                                   \
-                       CONST_ALIGN_MAX2,                                                          \
-                       CONST_ALIGN_NAME2,                                                         \
-                       LENGTH2,                                                                   \
-                       ...)                                                                       \
-    [&] {                                                                                         \
-        if constexpr(CONST_ALIGN_MAX1 > 0)                                                        \
-        {                                                                                         \
-            if(LENGTH1 % CONST_ALIGN_MAX1 == 0)                                                   \
-            {                                                                                     \
-                constexpr ck::index_t CONST_ALIGN_NAME1 = CONST_ALIGN_MAX1;                       \
-                ALIGN_SWITCH_1(CONST_ALIGN_MAX2, CONST_ALIGN_NAME2, LENGTH2, ##__VA_ARGS__);      \
-            }                                                                                     \
-            else                                                                                  \
-            {                                                                                     \
-                if constexpr(CONST_ALIGN_MAX1 / 2 > 0)                                            \
-                {                                                                                 \
-                    if(LENGTH1 % (CONST_ALIGN_MAX1 / 2) == 0)                                     \
-                    {                                                                             \
-                        constexpr ck::index_t CONST_ALIGN_NAME1 = CONST_ALIGN_MAX1 / 2;           \
-                        ALIGN_SWITCH_1(                                                           \
-                            CONST_ALIGN_MAX2, CONST_ALIGN_NAME2, LENGTH2, ##__VA_ARGS__);         \
-                    }                                                                             \
-                    else                                                                          \
-                    {                                                                             \
-                        if constexpr(CONST_ALIGN_MAX1 / 4 > 0)                                    \
-                        {                                                                         \
-                            if(LENGTH1 % (CONST_ALIGN_MAX1 / 4) == 0)                             \
-                            {                                                                     \
-                                constexpr ck::index_t CONST_ALIGN_NAME1 = CONST_ALIGN_MAX1 / 4;   \
-                                ALIGN_SWITCH_1(                                                   \
-                                    CONST_ALIGN_MAX2, CONST_ALIGN_NAME2, LENGTH2, ##__VA_ARGS__); \
-                            }                                                                     \
-                            else                                                                  \
-                            {                                                                     \
-                                constexpr ck::index_t CONST_ALIGN_NAME1 = 1;                      \
-                                ALIGN_SWITCH_1(                                                   \
-                                    CONST_ALIGN_MAX2, CONST_ALIGN_NAME2, LENGTH2, ##__VA_ARGS__); \
-                            };                                                                    \
-                        }                                                                         \
-                    };                                                                            \
-                }                                                                                 \
-            };                                                                                    \
-        }                                                                                         \
-    }()
+#define ALIGN_SWITCH_2(                                                       \
+    CONST_ALIGN_MAX1,                                                         \
+    CONST_ALIGN_NAME1,                                                        \
+    LENGTH1,                                                                  \
+    CONST_ALIGN_MAX2,                                                         \
+    CONST_ALIGN_NAME2,                                                        \
+    LENGTH2,                                                                  \
+    ...)                                                                      \
+  [&] {                                                                       \
+    if constexpr (CONST_ALIGN_MAX1 > 0) {                                     \
+      if (LENGTH1 % CONST_ALIGN_MAX1 == 0) {                                  \
+        constexpr ck::index_t CONST_ALIGN_NAME1 = CONST_ALIGN_MAX1;           \
+        ALIGN_SWITCH_1(                                                       \
+            CONST_ALIGN_MAX2, CONST_ALIGN_NAME2, LENGTH2, ##__VA_ARGS__);     \
+      } else {                                                                \
+        if constexpr (CONST_ALIGN_MAX1 / 2 > 0) {                             \
+          if (LENGTH1 % (CONST_ALIGN_MAX1 / 2) == 0) {                        \
+            constexpr ck::index_t CONST_ALIGN_NAME1 = CONST_ALIGN_MAX1 / 2;   \
+            ALIGN_SWITCH_1(                                                   \
+                CONST_ALIGN_MAX2, CONST_ALIGN_NAME2, LENGTH2, ##__VA_ARGS__); \
+          } else {                                                            \
+            if constexpr (CONST_ALIGN_MAX1 / 4 > 0) {                         \
+              if (LENGTH1 % (CONST_ALIGN_MAX1 / 4) == 0) {                    \
+                constexpr ck::index_t CONST_ALIGN_NAME1 =                     \
+                    CONST_ALIGN_MAX1 / 4;                                     \
+                ALIGN_SWITCH_1(                                               \
+                    CONST_ALIGN_MAX2,                                         \
+                    CONST_ALIGN_NAME2,                                        \
+                    LENGTH2,                                                  \
+                    ##__VA_ARGS__);                                           \
+              } else {                                                        \
+                constexpr ck::index_t CONST_ALIGN_NAME1 = 1;                  \
+                ALIGN_SWITCH_1(                                               \
+                    CONST_ALIGN_MAX2,                                         \
+                    CONST_ALIGN_NAME2,                                        \
+                    LENGTH2,                                                  \
+                    ##__VA_ARGS__);                                           \
+              };                                                              \
+            }                                                                 \
+          };                                                                  \
+        }                                                                     \
+      };                                                                      \
+    }                                                                         \
+  }()
 
 // assume the maximum alignment is 8 elements
-#define ALIGN_SWITCH_3(CONST_ALIGN_MAX1,                                                        \
-                       CONST_ALIGN_NAME1,                                                       \
-                       LENGTH1,                                                                 \
-                       CONST_ALIGN_MAX2,                                                        \
-                       CONST_ALIGN_NAME2,                                                       \
-                       LENGTH2,                                                                 \
-                       CONST_ALIGN_MAX3,                                                        \
-                       CONST_ALIGN_NAME3,                                                       \
-                       LENGTH3,                                                                 \
-                       ...)                                                                     \
-    [&] {                                                                                       \
-        if constexpr(CONST_ALIGN_MAX1 > 0)                                                      \
-        {                                                                                       \
-            if(LENGTH1 % CONST_ALIGN_MAX1 == 0)                                                 \
-            {                                                                                   \
-                constexpr ck::index_t CONST_ALIGN_NAME1 = CONST_ALIGN_MAX1;                     \
-                ALIGN_SWITCH_2(CONST_ALIGN_MAX2,                                                \
-                               CONST_ALIGN_NAME2,                                               \
-                               LENGTH2,                                                         \
-                               CONST_ALIGN_MAX3,                                                \
-                               CONST_ALIGN_NAME3,                                               \
-                               LENGTH3,                                                         \
-                               ##__VA_ARGS__);                                                  \
-            }                                                                                   \
-            else                                                                                \
-            {                                                                                   \
-                if constexpr(CONST_ALIGN_MAX1 / 2 > 0)                                          \
-                {                                                                               \
-                    if(LENGTH1 % (CONST_ALIGN_MAX1 / 2) == 0)                                   \
-                    {                                                                           \
-                        constexpr ck::index_t CONST_ALIGN_NAME1 = CONST_ALIGN_MAX1 / 2;         \
-                        ALIGN_SWITCH_2(CONST_ALIGN_MAX2,                                        \
-                                       CONST_ALIGN_NAME2,                                       \
-                                       LENGTH2,                                                 \
-                                       CONST_ALIGN_MAX3,                                        \
-                                       CONST_ALIGN_NAME3,                                       \
-                                       LENGTH3,                                                 \
-                                       ##__VA_ARGS__);                                          \
-                    }                                                                           \
-                    else                                                                        \
-                    {                                                                           \
-                        if constexpr(CONST_ALIGN_MAX1 / 4 > 0)                                  \
-                        {                                                                       \
-                            if(LENGTH1 % (CONST_ALIGN_MAX1 / 4) == 0)                           \
-                            {                                                                   \
-                                constexpr ck::index_t CONST_ALIGN_NAME1 = CONST_ALIGN_MAX1 / 4; \
-                                ALIGN_SWITCH_2(CONST_ALIGN_MAX2,                                \
-                                               CONST_ALIGN_NAME2,                               \
-                                               LENGTH2,                                         \
-                                               CONST_ALIGN_MAX3,                                \
-                                               CONST_ALIGN_NAME3,                               \
-                                               LENGTH3,                                         \
-                                               ##__VA_ARGS__);                                  \
-                            }                                                                   \
-                            else                                                                \
-                            {                                                                   \
-                                constexpr ck::index_t CONST_ALIGN_NAME1 = 1;                    \
-                                ALIGN_SWITCH_2(CONST_ALIGN_MAX2,                                \
-                                               CONST_ALIGN_NAME2,                               \
-                                               LENGTH2,                                         \
-                                               CONST_ALIGN_MAX3,                                \
-                                               CONST_ALIGN_NAME3,                               \
-                                               LENGTH3,                                         \
-                                               ##__VA_ARGS__);                                  \
-                            };                                                                  \
-                        }                                                                       \
-                    };                                                                          \
-                }                                                                               \
-            };                                                                                  \
-        }                                                                                       \
-    }()
+#define ALIGN_SWITCH_3(                                                     \
+    CONST_ALIGN_MAX1,                                                       \
+    CONST_ALIGN_NAME1,                                                      \
+    LENGTH1,                                                                \
+    CONST_ALIGN_MAX2,                                                       \
+    CONST_ALIGN_NAME2,                                                      \
+    LENGTH2,                                                                \
+    CONST_ALIGN_MAX3,                                                       \
+    CONST_ALIGN_NAME3,                                                      \
+    LENGTH3,                                                                \
+    ...)                                                                    \
+  [&] {                                                                     \
+    if constexpr (CONST_ALIGN_MAX1 > 0) {                                   \
+      if (LENGTH1 % CONST_ALIGN_MAX1 == 0) {                                \
+        constexpr ck::index_t CONST_ALIGN_NAME1 = CONST_ALIGN_MAX1;         \
+        ALIGN_SWITCH_2(                                                     \
+            CONST_ALIGN_MAX2,                                               \
+            CONST_ALIGN_NAME2,                                              \
+            LENGTH2,                                                        \
+            CONST_ALIGN_MAX3,                                               \
+            CONST_ALIGN_NAME3,                                              \
+            LENGTH3,                                                        \
+            ##__VA_ARGS__);                                                 \
+      } else {                                                              \
+        if constexpr (CONST_ALIGN_MAX1 / 2 > 0) {                           \
+          if (LENGTH1 % (CONST_ALIGN_MAX1 / 2) == 0) {                      \
+            constexpr ck::index_t CONST_ALIGN_NAME1 = CONST_ALIGN_MAX1 / 2; \
+            ALIGN_SWITCH_2(                                                 \
+                CONST_ALIGN_MAX2,                                           \
+                CONST_ALIGN_NAME2,                                          \
+                LENGTH2,                                                    \
+                CONST_ALIGN_MAX3,                                           \
+                CONST_ALIGN_NAME3,                                          \
+                LENGTH3,                                                    \
+                ##__VA_ARGS__);                                             \
+          } else {                                                          \
+            if constexpr (CONST_ALIGN_MAX1 / 4 > 0) {                       \
+              if (LENGTH1 % (CONST_ALIGN_MAX1 / 4) == 0) {                  \
+                constexpr ck::index_t CONST_ALIGN_NAME1 =                   \
+                    CONST_ALIGN_MAX1 / 4;                                   \
+                ALIGN_SWITCH_2(                                             \
+                    CONST_ALIGN_MAX2,                                       \
+                    CONST_ALIGN_NAME2,                                      \
+                    LENGTH2,                                                \
+                    CONST_ALIGN_MAX3,                                       \
+                    CONST_ALIGN_NAME3,                                      \
+                    LENGTH3,                                                \
+                    ##__VA_ARGS__);                                         \
+              } else {                                                      \
+                constexpr ck::index_t CONST_ALIGN_NAME1 = 1;                \
+                ALIGN_SWITCH_2(                                             \
+                    CONST_ALIGN_MAX2,                                       \
+                    CONST_ALIGN_NAME2,                                      \
+                    LENGTH2,                                                \
+                    CONST_ALIGN_MAX3,                                       \
+                    CONST_ALIGN_NAME3,                                      \
+                    LENGTH3,                                                \
+                    ##__VA_ARGS__);                                         \
+              };                                                            \
+            }                                                               \
+          };                                                                \
+        }                                                                   \
+      };                                                                    \
+    }                                                                       \
+  }()
diff --git a/xformers/csrc/attention/hip_fmha/ck_attention_forward_decoder.h b/xformers/csrc/attention/hip_fmha/ck_attention_forward_decoder.h
index 6a7c60c0a..57d54eda2 100644
--- a/xformers/csrc/attention/hip_fmha/ck_attention_forward_decoder.h
+++ b/xformers/csrc/attention/hip_fmha/ck_attention_forward_decoder.h
@@ -17,363 +17,334 @@
 namespace {
 
 template <typename data_t, int32_t vec_size>
-__device__ typename ck::vector_type<float, vec_size>::type
-scalar_scale_acc(typename ck::vector_type<float, vec_size>::type acc,
-                 typename ck::vector_type<data_t, vec_size>::type a,
-                 float b)
-{
-    union
-    {
-        decltype(acc) vec;
-        float arr[vec_size];
-    } acc_u{acc};
-    union
-    {
-        decltype(a) vec;
-        data_t arr[vec_size];
-    } a_u{a};
+__device__ typename ck::vector_type<float, vec_size>::type scalar_scale_acc(
+    typename ck::vector_type<float, vec_size>::type acc,
+    typename ck::vector_type<data_t, vec_size>::type a,
+    float b) {
+  union {
+    decltype(acc) vec;
+    float arr[vec_size];
+  } acc_u{acc};
+  union {
+    decltype(a) vec;
+    data_t arr[vec_size];
+  } a_u{a};
 
 #pragma unroll
-    for(int32_t i = 0; i < vec_size; ++i)
-    {
-        acc_u.arr[i] += ck::type_convert<float>(a_u.arr[i]) * b;
-    }
+  for (int32_t i = 0; i < vec_size; ++i) {
+    acc_u.arr[i] += ck::type_convert<float>(a_u.arr[i]) * b;
+  }
 
-    return acc_u.vec;
+  return acc_u.vec;
 }
 
 template <typename F, int32_t n_threads_per_wavefront = 64>
-float __device__ __forceinline__ wavefrontReduce(float val, F f)
-{
+float __device__ __forceinline__ wavefrontReduce(float val, F f) {
 #pragma unroll
-    for(int32_t mask = n_threads_per_wavefront >> 1; mask > 0; mask >>= 1)
-    {
-        val = f(__shfl_xor(val, mask, n_threads_per_wavefront), val);
-    }
-    return val;
+  for (int32_t mask = n_threads_per_wavefront >> 1; mask > 0; mask >>= 1) {
+    val = f(__shfl_xor(val, mask, n_threads_per_wavefront), val);
+  }
+  return val;
 }
 
 template <typename TData, typename TDataVec>
-__forceinline__ __device__ void
-load_v(const TData* __restrict__ data_ptr, int32_t vector_offset, TDataVec* __restrict__ load_to)
-{
-    *load_to = *(reinterpret_cast<const TDataVec*>(data_ptr) + vector_offset);
+__forceinline__ __device__ void load_v(
+    const TData* __restrict__ data_ptr,
+    int32_t vector_offset,
+    TDataVec* __restrict__ load_to) {
+  *load_to = *(reinterpret_cast<const TDataVec*>(data_ptr) + vector_offset);
 }
 
 template <typename TData, typename TDataVec>
-__forceinline__ __device__ void
-store_v(TData* __restrict__ data_ptr, int32_t vector_offset, TDataVec value)
-{
-    *(reinterpret_cast<TDataVec*>(data_ptr) + vector_offset) = value;
+__forceinline__ __device__ void store_v(
+    TData* __restrict__ data_ptr,
+    int32_t vector_offset,
+    TDataVec value) {
+  *(reinterpret_cast<TDataVec*>(data_ptr) + vector_offset) = value;
 }
 
-template <typename scalar_t,
-          int32_t vec_size               = 4,
-          int32_t n_loop_unroll          = 16,
-          int32_t n_loop_unroll_tail     = 2,
-          int32_t KV_M_MAX               = 8192,
-          int32_t n_wavefronts_per_block = 16>
-__global__ void
-efficient_attention_forward_decoder_ck_kernel(const scalar_t* __restrict__ XQ,
-                                              const scalar_t* __restrict__ cache_K,
-                                              const scalar_t* __restrict__ cache_V,
-                                              scalar_t* __restrict__ O,
-                                              const int32_t* __restrict__ seq_kv_lens,
-                                              const ptrdiff_t XQ_stride_b,
-                                              const ptrdiff_t XQ_stride_m,
-                                              const ptrdiff_t XQ_stride_g,
-                                              const ptrdiff_t XQ_stride_h,
-                                              const ptrdiff_t K_stride_b,
-                                              const ptrdiff_t K_stride_m,
-                                              const ptrdiff_t K_stride_g,
-                                              const ptrdiff_t K_stride_h,
-                                              const int32_t Q_size_m,
-                                              const int32_t Q_size_g,
-                                              const int32_t Q_size_h,
-                                              const int32_t Q_size_k,
-                                              const int32_t K_size_m,
-                                              const bool multiquery,
-                                              const float qk_scale)
-{
-    static_assert(n_loop_unroll_tail < n_loop_unroll, "");
-
-    // Each block handles a single batch and head and query and group
-    const int32_t b = blockIdx.x / (Q_size_m * Q_size_g * Q_size_h);
-    const int32_t m = (blockIdx.x / (Q_size_g * Q_size_h)) % Q_size_m;
-    const int32_t g = (blockIdx.x / Q_size_h) % Q_size_g;
-    const int32_t h = blockIdx.x % Q_size_h;
-
-    // Note: this is decoding case where we attend to current and all previous
-    // tokens.
-    const int32_t t_max = seq_kv_lens ? seq_kv_lens[b] : K_size_m;
-
-    const int32_t lane_idx              = threadIdx.x;
-    const int32_t wavefront_idx         = threadIdx.y;
-    const int32_t threads_per_wavefront = blockDim.x;
-    const int32_t wavefronts_per_block  = blockDim.y;
-    const int32_t threads_per_block     = threads_per_wavefront * wavefronts_per_block;
-    const int32_t thread_linear_idx     = lane_idx + wavefront_idx * threads_per_wavefront;
-    // const auto* q_ = &(XQ_acc[b][m][g][h][0]);
-    const auto XQO_base_offset =
-        b * XQ_stride_b + m * XQ_stride_m + g * XQ_stride_g + h * XQ_stride_h;
-    const auto* __restrict__ q_ = XQ + XQO_base_offset;
-
-    const auto cache_KV_base_offset =
-        b * K_stride_b + 0 * K_stride_m + g * K_stride_g + (multiquery ? 0 : h * K_stride_h);
-    const auto* __restrict__ cache_K_base = cache_K + cache_KV_base_offset;
-    const auto* __restrict__ cache_V_base = cache_V + cache_KV_base_offset;
-
-    using data_t        = scalar_t;
-    using data_vec_t    = typename ck::vector_type<data_t, vec_size>::type;
-    using compute_t     = float;
-    using compute_vec_t = typename ck::vector_type<compute_t, vec_size>::type;
-
-    const bool lane_active_for_io = lane_idx * vec_size < Q_size_k;
-
-    extern __shared__ __align__(16) compute_t smem[];
-
-    data_vec_t q_thread = 0;
-    // Load Q into registers in all wavefronts.
-    // Each thread handles `vec_size` D dimensions
-    if(lane_active_for_io)
-    {
-        load_v<data_t, data_vec_t>(q_, lane_idx, &q_thread);
-    }
-
-    compute_t max_qk_acc = ck::NumericLimits<compute_t>::Lowest();
-
-    // Compute S[0:t_max] =
-    // ```
-    // for t in range(t_max):
-    //   S[t] = dot(Q, K[t])
-    // ```
-    // Split the 0:t_max range across wavefronts in a block,
-    // unroll loads to expose more parallelism.
-    // Reduce the dot product with cross-lane operation;
-    // Q and K[t] are in the registers of threads in a single wavefront.
-
-    data_vec_t k_loads[n_loop_unroll] = {};
-
-    constexpr auto dtt         = n_wavefronts_per_block * n_loop_unroll;
-    const int32_t t_max_unroll = (t_max / dtt) * dtt;
-
-    for(auto tt = wavefront_idx * n_loop_unroll; tt < t_max_unroll; tt += dtt)
-    {
-        if(lane_active_for_io)
-        {
+template <
+    typename scalar_t,
+    int32_t vec_size = 4,
+    int32_t n_loop_unroll = 16,
+    int32_t n_loop_unroll_tail = 2,
+    int32_t KV_M_MAX = 8192,
+    int32_t n_wavefronts_per_block = 16>
+__global__ void efficient_attention_forward_decoder_ck_kernel(
+    const scalar_t* __restrict__ XQ,
+    const scalar_t* __restrict__ cache_K,
+    const scalar_t* __restrict__ cache_V,
+    scalar_t* __restrict__ O,
+    const int32_t* __restrict__ seq_kv_lens,
+    const ptrdiff_t XQ_stride_b,
+    const ptrdiff_t XQ_stride_m,
+    const ptrdiff_t XQ_stride_g,
+    const ptrdiff_t XQ_stride_h,
+    const ptrdiff_t K_stride_b,
+    const ptrdiff_t K_stride_m,
+    const ptrdiff_t K_stride_g,
+    const ptrdiff_t K_stride_h,
+    const int32_t Q_size_m,
+    const int32_t Q_size_g,
+    const int32_t Q_size_h,
+    const int32_t Q_size_k,
+    const int32_t K_size_m,
+    const bool multiquery,
+    const float qk_scale) {
+  static_assert(n_loop_unroll_tail < n_loop_unroll, "");
+
+  // Each block handles a single batch and head and query and group
+  const int32_t b = blockIdx.x / (Q_size_m * Q_size_g * Q_size_h);
+  const int32_t m = (blockIdx.x / (Q_size_g * Q_size_h)) % Q_size_m;
+  const int32_t g = (blockIdx.x / Q_size_h) % Q_size_g;
+  const int32_t h = blockIdx.x % Q_size_h;
+
+  // Note: this is decoding case where we attend to current and all previous
+  // tokens.
+  const int32_t t_max = seq_kv_lens ? seq_kv_lens[b] : K_size_m;
+
+  const int32_t lane_idx = threadIdx.x;
+  const int32_t wavefront_idx = threadIdx.y;
+  const int32_t threads_per_wavefront = blockDim.x;
+  const int32_t wavefronts_per_block = blockDim.y;
+  const int32_t threads_per_block =
+      threads_per_wavefront * wavefronts_per_block;
+  const int32_t thread_linear_idx =
+      lane_idx + wavefront_idx * threads_per_wavefront;
+  // const auto* q_ = &(XQ_acc[b][m][g][h][0]);
+  const auto XQO_base_offset =
+      b * XQ_stride_b + m * XQ_stride_m + g * XQ_stride_g + h * XQ_stride_h;
+  const auto* __restrict__ q_ = XQ + XQO_base_offset;
+
+  const auto cache_KV_base_offset = b * K_stride_b + 0 * K_stride_m +
+      g * K_stride_g + (multiquery ? 0 : h * K_stride_h);
+  const auto* __restrict__ cache_K_base = cache_K + cache_KV_base_offset;
+  const auto* __restrict__ cache_V_base = cache_V + cache_KV_base_offset;
+
+  using data_t = scalar_t;
+  using data_vec_t = typename ck::vector_type<data_t, vec_size>::type;
+  using compute_t = float;
+  using compute_vec_t = typename ck::vector_type<compute_t, vec_size>::type;
+
+  const bool lane_active_for_io = lane_idx * vec_size < Q_size_k;
+
+  extern __shared__ __align__(16) compute_t smem[];
+
+  data_vec_t q_thread = 0;
+  // Load Q into registers in all wavefronts.
+  // Each thread handles `vec_size` D dimensions
+  if (lane_active_for_io) {
+    load_v<data_t, data_vec_t>(q_, lane_idx, &q_thread);
+  }
+
+  compute_t max_qk_acc = ck::NumericLimits<compute_t>::Lowest();
+
+  // Compute S[0:t_max] =
+  // ```
+  // for t in range(t_max):
+  //   S[t] = dot(Q, K[t])
+  // ```
+  // Split the 0:t_max range across wavefronts in a block,
+  // unroll loads to expose more parallelism.
+  // Reduce the dot product with cross-lane operation;
+  // Q and K[t] are in the registers of threads in a single wavefront.
+
+  data_vec_t k_loads[n_loop_unroll] = {};
+
+  constexpr auto dtt = n_wavefronts_per_block * n_loop_unroll;
+  const int32_t t_max_unroll = (t_max / dtt) * dtt;
+
+  for (auto tt = wavefront_idx * n_loop_unroll; tt < t_max_unroll; tt += dtt) {
+    if (lane_active_for_io) {
 #pragma unroll n_loop_unroll
-            for(auto ttt = 0; ttt < n_loop_unroll; ++ttt)
-            {
-                const int32_t t = tt + ttt;
-                // load the K[b][t][g][h|0][:] row into registers
-                load_v<data_t, data_vec_t>(cache_K_base + t * K_stride_m, lane_idx, &k_loads[ttt]);
-            }
-        }
-        compute_t qk_accs[n_loop_unroll] = {};
+      for (auto ttt = 0; ttt < n_loop_unroll; ++ttt) {
+        const int32_t t = tt + ttt;
+        // load the K[b][t][g][h|0][:] row into registers
+        load_v<data_t, data_vec_t>(
+            cache_K_base + t * K_stride_m, lane_idx, &k_loads[ttt]);
+      }
+    }
+    compute_t qk_accs[n_loop_unroll] = {};
 #pragma unroll n_loop_unroll
-        for(auto ttt = 0; ttt < n_loop_unroll; ++ttt)
-        {
-            ck::inner_product<data_vec_t, data_vec_t, compute_t>(
-                q_thread, k_loads[ttt], qk_accs[ttt]);
-            qk_accs[ttt] *= qk_scale;
-
-            qk_accs[ttt] = wavefrontReduce(qk_accs[ttt], [](auto a, auto b) { return a + b; });
-            max_qk_acc   = ck::math::max(qk_accs[ttt], max_qk_acc);
-        }
-        if(lane_idx == 0)
-        {
-            auto* __restrict__ smem_base = smem + tt;
+    for (auto ttt = 0; ttt < n_loop_unroll; ++ttt) {
+      ck::inner_product<data_vec_t, data_vec_t, compute_t>(
+          q_thread, k_loads[ttt], qk_accs[ttt]);
+      qk_accs[ttt] *= qk_scale;
+
+      qk_accs[ttt] =
+          wavefrontReduce(qk_accs[ttt], [](auto a, auto b) { return a + b; });
+      max_qk_acc = ck::math::max(qk_accs[ttt], max_qk_acc);
+    }
+    if (lane_idx == 0) {
+      auto* __restrict__ smem_base = smem + tt;
 #pragma unroll n_loop_unroll
-            for(auto ttt = 0; ttt < n_loop_unroll; ++ttt)
-            {
-                smem_base[ttt] = qk_accs[ttt];
-            }
-        }
+      for (auto ttt = 0; ttt < n_loop_unroll; ++ttt) {
+        smem_base[ttt] = qk_accs[ttt];
+      }
     }
+  }
 
-    // NB: the length of the tail is <= (wavefronts_per_block * n_loop_unroll)
-    for(auto tt = t_max_unroll + wavefront_idx * n_loop_unroll_tail; tt < t_max;
-        tt += wavefronts_per_block * n_loop_unroll_tail)
-    {
-        if(lane_active_for_io)
-        {
+  // NB: the length of the tail is <= (wavefronts_per_block * n_loop_unroll)
+  for (auto tt = t_max_unroll + wavefront_idx * n_loop_unroll_tail; tt < t_max;
+       tt += wavefronts_per_block * n_loop_unroll_tail) {
+    if (lane_active_for_io) {
 #pragma unroll n_loop_unroll_tail
-            for(auto ttt = 0; ttt < n_loop_unroll_tail; ++ttt)
-            {
-                const int32_t t = tt + ttt;
-                if(t < t_max)
-                {
-                    // load the K[b][t][g][h|0][:] row into registers
-                    load_v<data_t, data_vec_t>(
-                        cache_K_base + t * K_stride_m, lane_idx, &k_loads[ttt]);
-                }
-            }
+      for (auto ttt = 0; ttt < n_loop_unroll_tail; ++ttt) {
+        const int32_t t = tt + ttt;
+        if (t < t_max) {
+          // load the K[b][t][g][h|0][:] row into registers
+          load_v<data_t, data_vec_t>(
+              cache_K_base + t * K_stride_m, lane_idx, &k_loads[ttt]);
         }
+      }
+    }
 #pragma unroll n_loop_unroll_tail
-        for(auto ttt = 0; ttt < n_loop_unroll_tail; ++ttt)
-        {
-            compute_t qk_acc = 0;
-            const int32_t t  = tt + ttt;
-            if(t < t_max)
-            {
-                ck::inner_product<data_vec_t, data_vec_t, compute_t>(
-                    q_thread, k_loads[ttt], qk_acc);
-                qk_acc *= qk_scale;
-
-                qk_acc     = wavefrontReduce(qk_acc, [](auto a, auto b) { return a + b; });
-                max_qk_acc = ck::math::max(qk_acc, max_qk_acc);
-
-                // write accumulated sums to smem.
-                if(lane_idx == 0)
-                {
-                    smem[t] = qk_acc;
-                }
-            }
+    for (auto ttt = 0; ttt < n_loop_unroll_tail; ++ttt) {
+      compute_t qk_acc = 0;
+      const int32_t t = tt + ttt;
+      if (t < t_max) {
+        ck::inner_product<data_vec_t, data_vec_t, compute_t>(
+            q_thread, k_loads[ttt], qk_acc);
+        qk_acc *= qk_scale;
+
+        qk_acc = wavefrontReduce(qk_acc, [](auto a, auto b) { return a + b; });
+        max_qk_acc = ck::math::max(qk_acc, max_qk_acc);
+
+        // write accumulated sums to smem.
+        if (lane_idx == 0) {
+          smem[t] = qk_acc;
         }
+      }
     }
-
-    // Use shared reduction to compute max and compute softmax on shared memory.
-    // write max acc
-    if(lane_idx == 0)
-    {
-        smem[KV_M_MAX + wavefront_idx] = max_qk_acc;
-    }
-    __syncthreads();
-    if(lane_idx < wavefronts_per_block)
-    {
-        max_qk_acc = ck::math::max(max_qk_acc, smem[KV_M_MAX + lane_idx]);
-    }
-    // shared across all threads in block
-    max_qk_acc = wavefrontReduce(max_qk_acc, [](auto a, auto b) { return a > b ? a : b; });
-
-    // each wavefront computes partial sum of exp.
-    compute_t softmax_denominator = 0.0f;
-    for(int32_t t = thread_linear_idx; t < t_max; t += threads_per_block)
-    {
-        softmax_denominator += ck::math::exp(smem[t] - max_qk_acc);
-    }
-    softmax_denominator =
-        wavefrontReduce(softmax_denominator, [](auto a, auto b) { return a + b; });
-
-    if(lane_idx == 0)
-    {
-        smem[KV_M_MAX + wavefront_idx] = softmax_denominator;
-    }
-    __syncthreads();
-
-    // now, compute sum of exp(x - max(x)) over all intermediate results.
-    softmax_denominator = 0.0;
-    if(lane_idx < wavefronts_per_block)
-    {
-        softmax_denominator = smem[KV_M_MAX + lane_idx];
-    }
-    softmax_denominator =
-        wavefrontReduce(softmax_denominator, [](auto a, auto b) { return a + b; });
-
-    const compute_t softmax_scale_factor = 1. / softmax_denominator;
-    // now, compute the normalization across all threads.
-    for(int32_t t = thread_linear_idx; t < t_max; t += threads_per_block)
-    {
-        smem[t] = ck::math::exp(smem[t] - max_qk_acc) * softmax_scale_factor;
-    }
-    __syncthreads();
-
-    // Split T across wavefronts in a block
-    // each wavefront compute sum(t_subset) P[t] * V[t_subset, d]
-    // outputs are of size float[D]
-
-    compute_t ps[n_loop_unroll] = {};
-    compute_vec_t o_acc         = 0;
-    if(lane_active_for_io)
-    {
-        for(auto tt = wavefront_idx * n_loop_unroll; tt < t_max_unroll; tt += dtt)
-        {
+  }
+
+  // Use shared reduction to compute max and compute softmax on shared memory.
+  // write max acc
+  if (lane_idx == 0) {
+    smem[KV_M_MAX + wavefront_idx] = max_qk_acc;
+  }
+  __syncthreads();
+  if (lane_idx < wavefronts_per_block) {
+    max_qk_acc = ck::math::max(max_qk_acc, smem[KV_M_MAX + lane_idx]);
+  }
+  // shared across all threads in block
+  max_qk_acc =
+      wavefrontReduce(max_qk_acc, [](auto a, auto b) { return a > b ? a : b; });
+
+  // each wavefront computes partial sum of exp.
+  compute_t softmax_denominator = 0.0f;
+  for (int32_t t = thread_linear_idx; t < t_max; t += threads_per_block) {
+    softmax_denominator += ck::math::exp(smem[t] - max_qk_acc);
+  }
+  softmax_denominator = wavefrontReduce(
+      softmax_denominator, [](auto a, auto b) { return a + b; });
+
+  if (lane_idx == 0) {
+    smem[KV_M_MAX + wavefront_idx] = softmax_denominator;
+  }
+  __syncthreads();
+
+  // now, compute sum of exp(x - max(x)) over all intermediate results.
+  softmax_denominator = 0.0;
+  if (lane_idx < wavefronts_per_block) {
+    softmax_denominator = smem[KV_M_MAX + lane_idx];
+  }
+  softmax_denominator = wavefrontReduce(
+      softmax_denominator, [](auto a, auto b) { return a + b; });
+
+  const compute_t softmax_scale_factor = 1. / softmax_denominator;
+  // now, compute the normalization across all threads.
+  for (int32_t t = thread_linear_idx; t < t_max; t += threads_per_block) {
+    smem[t] = ck::math::exp(smem[t] - max_qk_acc) * softmax_scale_factor;
+  }
+  __syncthreads();
+
+  // Split T across wavefronts in a block
+  // each wavefront compute sum(t_subset) P[t] * V[t_subset, d]
+  // outputs are of size float[D]
+
+  compute_t ps[n_loop_unroll] = {};
+  compute_vec_t o_acc = 0;
+  if (lane_active_for_io) {
+    for (auto tt = wavefront_idx * n_loop_unroll; tt < t_max_unroll;
+         tt += dtt) {
 #pragma unroll n_loop_unroll
-            for(auto ttt = 0; ttt < n_loop_unroll; ++ttt)
-            {
-                const int32_t t = tt + ttt;
-                // load the V[b][t][g][h|0][:] row into registers, reusing K register
-                // storage
-                load_v<data_t, data_vec_t>(cache_V_base + t * K_stride_m, lane_idx, &k_loads[ttt]);
-                ps[ttt] = smem[t];
-            }
+      for (auto ttt = 0; ttt < n_loop_unroll; ++ttt) {
+        const int32_t t = tt + ttt;
+        // load the V[b][t][g][h|0][:] row into registers, reusing K register
+        // storage
+        load_v<data_t, data_vec_t>(
+            cache_V_base + t * K_stride_m, lane_idx, &k_loads[ttt]);
+        ps[ttt] = smem[t];
+      }
 
 #pragma unroll n_loop_unroll
-            for(auto ttt = 0; ttt < n_loop_unroll; ++ttt)
-            {
-                o_acc = scalar_scale_acc<data_t, vec_size>(o_acc, k_loads[ttt], ps[ttt]);
-            }
-        }
+      for (auto ttt = 0; ttt < n_loop_unroll; ++ttt) {
+        o_acc =
+            scalar_scale_acc<data_t, vec_size>(o_acc, k_loads[ttt], ps[ttt]);
+      }
+    }
 
-        for(auto tt = t_max_unroll + wavefront_idx * n_loop_unroll_tail; tt < t_max;
-            tt += wavefronts_per_block * n_loop_unroll_tail)
-        {
+    for (auto tt = t_max_unroll + wavefront_idx * n_loop_unroll_tail;
+         tt < t_max;
+         tt += wavefronts_per_block * n_loop_unroll_tail) {
 #pragma unroll n_loop_unroll_tail
-            for(auto ttt = 0; ttt < n_loop_unroll_tail; ++ttt)
-            {
-                const int32_t t = tt + ttt;
-                if(t < t_max)
-                {
-                    // load the V[b][t][g][h|0][:] row into registers, reusing K register
-                    // storage
-                    load_v<data_t, data_vec_t>(
-                        cache_V_base + t * K_stride_m, lane_idx, &k_loads[ttt]);
-                    ps[ttt] = smem[t];
-                }
-            }
+      for (auto ttt = 0; ttt < n_loop_unroll_tail; ++ttt) {
+        const int32_t t = tt + ttt;
+        if (t < t_max) {
+          // load the V[b][t][g][h|0][:] row into registers, reusing K register
+          // storage
+          load_v<data_t, data_vec_t>(
+              cache_V_base + t * K_stride_m, lane_idx, &k_loads[ttt]);
+          ps[ttt] = smem[t];
+        }
+      }
 
 #pragma unroll n_loop_unroll_tail
-            for(auto ttt = 0; ttt < n_loop_unroll_tail; ++ttt)
-            {
-                const int32_t t = tt + ttt;
-                if(t < t_max)
-                {
-                    o_acc = scalar_scale_acc<data_t, vec_size>(o_acc, k_loads[ttt], ps[ttt]);
-                }
-            }
+      for (auto ttt = 0; ttt < n_loop_unroll_tail; ++ttt) {
+        const int32_t t = tt + ttt;
+        if (t < t_max) {
+          o_acc =
+              scalar_scale_acc<data_t, vec_size>(o_acc, k_loads[ttt], ps[ttt]);
         }
+      }
     }
-    // now, each thread has partial sums. Write to smem and get accumulated
-    // results back.
-    __syncthreads();
-
-    // NB: needs sizeof(smem) >= `vec_size` * (sizeof(float)==4) * threadsPerBlock
-    if(lane_active_for_io)
-    {
-        store_v<compute_t, compute_vec_t>(&smem[0], thread_linear_idx, o_acc);
+  }
+  // now, each thread has partial sums. Write to smem and get accumulated
+  // results back.
+  __syncthreads();
+
+  // NB: needs sizeof(smem) >= `vec_size` * (sizeof(float)==4) * threadsPerBlock
+  if (lane_active_for_io) {
+    store_v<compute_t, compute_vec_t>(&smem[0], thread_linear_idx, o_acc);
+  }
+
+  __syncthreads();
+  // sum up partial D rows from other wavefronts
+  if (wavefront_idx == 0 && lane_active_for_io) {
+    union {
+      compute_vec_t vec = 0;
+      compute_t arr[vec_size];
+    } r;
+    for (int32_t w = 0; w < wavefronts_per_block; ++w) {
+      compute_vec_t partial_r;
+      load_v<compute_t, compute_vec_t>(
+          smem, w * threads_per_wavefront + lane_idx, &partial_r);
+      r.vec += partial_r;
     }
-
-    __syncthreads();
-    // sum up partial D rows from other wavefronts
-    if(wavefront_idx == 0 && lane_active_for_io)
-    {
-        union
-        {
-            compute_vec_t vec = 0;
-            compute_t arr[vec_size];
-        } r;
-        for(int32_t w = 0; w < wavefronts_per_block; ++w)
-        {
-            compute_vec_t partial_r;
-            load_v<compute_t, compute_vec_t>(
-                smem, w * threads_per_wavefront + lane_idx, &partial_r);
-            r.vec += partial_r;
-        }
-        // elementwise convert from compute_t result to data_t out to be written
-        union
-        {
-            data_vec_t vec;
-            data_t arr[vec_size];
-        } bf_r;
+    // elementwise convert from compute_t result to data_t out to be written
+    union {
+      data_vec_t vec;
+      data_t arr[vec_size];
+    } bf_r;
 #pragma unroll
-        for(int32_t i = 0; i < vec_size; ++i)
-        {
-            bf_r.arr[i] = ck::type_convert<data_t>(r.arr[i]);
-        }
-        // write output row O[b][m][g][h][:]
-        data_t* __restrict__ o_ = O + XQO_base_offset;
-        store_v<data_t, data_vec_t>(o_, lane_idx, bf_r.vec);
+    for (int32_t i = 0; i < vec_size; ++i) {
+      bf_r.arr[i] = ck::type_convert<data_t>(r.arr[i]);
     }
+    // write output row O[b][m][g][h][:]
+    data_t* __restrict__ o_ = O + XQO_base_offset;
+    store_v<data_t, data_vec_t>(o_, lane_idx, bf_r.vec);
+  }
 }
 
 } // namespace
@@ -382,147 +353,142 @@ namespace ck {
 namespace tensor_operation {
 namespace device {
 template <typename scalar_t>
-struct FMHADecoderSeqlen1DeviceOp : public BaseOperator
-{
-    using DeviceOp = FMHADecoderSeqlen1DeviceOp;
-    struct Argument : public BaseArgument
-    {
-        const scalar_t* __restrict__ XQ;
-        const scalar_t* __restrict__ cache_K;
-        const scalar_t* __restrict__ cache_V;
-        scalar_t* __restrict__ O;
-        const int32_t* __restrict__ seq_kv_lens;
-        const ptrdiff_t XQ_stride_b;
-        const ptrdiff_t XQ_stride_m;
-        const ptrdiff_t XQ_stride_g;
-        const ptrdiff_t XQ_stride_h;
-        const ptrdiff_t K_stride_b;
-        const ptrdiff_t K_stride_m;
-        const ptrdiff_t K_stride_g;
-        const ptrdiff_t K_stride_h;
-        const int32_t Q_size_m;
-        const int32_t Q_size_g;
-        const int32_t Q_size_h;
-        const int32_t Q_size_k;
-        const int32_t K_size_m;
-        const bool multiquery;
-        const float qk_scale;
-
-        const dim3 grid_dim;
-        const dim3 block_dim;
-        const size_t lds_bytes;
-
-        Argument(const scalar_t* __restrict__ XQ,
-                 const scalar_t* __restrict__ cache_K,
-                 const scalar_t* __restrict__ cache_V,
-                 scalar_t* __restrict__ O,
-                 const int32_t* __restrict__ seq_kv_lens,
-                 const ptrdiff_t XQ_stride_b,
-                 const ptrdiff_t XQ_stride_m,
-                 const ptrdiff_t XQ_stride_g,
-                 const ptrdiff_t XQ_stride_h,
-                 const ptrdiff_t K_stride_b,
-                 const ptrdiff_t K_stride_m,
-                 const ptrdiff_t K_stride_g,
-                 const ptrdiff_t K_stride_h,
-                 const int32_t Q_size_m,
-                 const int32_t Q_size_g,
-                 const int32_t Q_size_h,
-                 const int32_t Q_size_k,
-                 const int32_t K_size_m,
-                 const bool multiquery,
-                 const float qk_scale,
-                 const dim3 grid_dim,
-                 const dim3 block_dim,
-                 const size_t lds_bytes)
-            : XQ(XQ),
-              cache_K(cache_K),
-              cache_V(cache_V),
-              O(O),
-              seq_kv_lens(seq_kv_lens),
-              XQ_stride_b(XQ_stride_b),
-              XQ_stride_m(XQ_stride_m),
-              XQ_stride_g(XQ_stride_g),
-              XQ_stride_h(XQ_stride_h),
-              K_stride_b(K_stride_b),
-              K_stride_m(K_stride_m),
-              K_stride_g(K_stride_g),
-              K_stride_h(K_stride_h),
-              Q_size_m(Q_size_m),
-              Q_size_g(Q_size_g),
-              Q_size_h(Q_size_h),
-              Q_size_k(Q_size_k),
-              K_size_m(K_size_m),
-              multiquery(multiquery),
-              qk_scale(qk_scale),
-              grid_dim(grid_dim),
-              block_dim(block_dim),
-              lds_bytes(lds_bytes)
-        {
-        }
-    };
-
-    struct Invoker : public BaseInvoker
-    {
-        using Argument = DeviceOp::Argument;
-        float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{})
-        {
-            auto threads_per_wavefront = arg.block_dim.x;
-
-            auto Q_size_k_alignment_necessary = 0;
-
-            for(auto vec_size : {4, 2, 1})
-            {
-                if(arg.Q_size_k <= vec_size * threads_per_wavefront)
-                {
-                    Q_size_k_alignment_necessary = vec_size;
-                }
-            }
-
-            if(!Q_size_k_alignment_necessary)
-            {
-                throw std::runtime_error("Unsupported Q_size_k");
-            }
-
-            if(arg.Q_size_k % Q_size_k_alignment_necessary)
-            {
-                throw std::runtime_error("Unsupported alignment for Q_size_k");
-            }
-
-            return launch_and_time_kernel(
-                stream_config,
-                Q_size_k_alignment_necessary == 4
-                    ? efficient_attention_forward_decoder_ck_kernel<scalar_t, 4>
-                    : Q_size_k_alignment_necessary == 2
-                          ? efficient_attention_forward_decoder_ck_kernel<scalar_t, 2>
-                          : Q_size_k_alignment_necessary == 1
-                                ? efficient_attention_forward_decoder_ck_kernel<scalar_t, 1>
-                                : nullptr,
-                arg.grid_dim,
-                arg.block_dim,
-                arg.lds_bytes,
-                arg.XQ,
-                arg.cache_K,
-                arg.cache_V,
-                arg.O,
-                arg.seq_kv_lens,
-                arg.XQ_stride_b,
-                arg.XQ_stride_m,
-                arg.XQ_stride_g,
-                arg.XQ_stride_h,
-                arg.K_stride_b,
-                arg.K_stride_m,
-                arg.K_stride_g,
-                arg.K_stride_h,
-                arg.Q_size_m,
-                arg.Q_size_g,
-                arg.Q_size_h,
-                arg.Q_size_k,
-                arg.K_size_m,
-                arg.multiquery,
-                arg.qk_scale);
+struct FMHADecoderSeqlen1DeviceOp : public BaseOperator {
+  using DeviceOp = FMHADecoderSeqlen1DeviceOp;
+  struct Argument : public BaseArgument {
+    const scalar_t* __restrict__ XQ;
+    const scalar_t* __restrict__ cache_K;
+    const scalar_t* __restrict__ cache_V;
+    scalar_t* __restrict__ O;
+    const int32_t* __restrict__ seq_kv_lens;
+    const ptrdiff_t XQ_stride_b;
+    const ptrdiff_t XQ_stride_m;
+    const ptrdiff_t XQ_stride_g;
+    const ptrdiff_t XQ_stride_h;
+    const ptrdiff_t K_stride_b;
+    const ptrdiff_t K_stride_m;
+    const ptrdiff_t K_stride_g;
+    const ptrdiff_t K_stride_h;
+    const int32_t Q_size_m;
+    const int32_t Q_size_g;
+    const int32_t Q_size_h;
+    const int32_t Q_size_k;
+    const int32_t K_size_m;
+    const bool multiquery;
+    const float qk_scale;
+
+    const dim3 grid_dim;
+    const dim3 block_dim;
+    const size_t lds_bytes;
+
+    Argument(
+        const scalar_t* __restrict__ XQ,
+        const scalar_t* __restrict__ cache_K,
+        const scalar_t* __restrict__ cache_V,
+        scalar_t* __restrict__ O,
+        const int32_t* __restrict__ seq_kv_lens,
+        const ptrdiff_t XQ_stride_b,
+        const ptrdiff_t XQ_stride_m,
+        const ptrdiff_t XQ_stride_g,
+        const ptrdiff_t XQ_stride_h,
+        const ptrdiff_t K_stride_b,
+        const ptrdiff_t K_stride_m,
+        const ptrdiff_t K_stride_g,
+        const ptrdiff_t K_stride_h,
+        const int32_t Q_size_m,
+        const int32_t Q_size_g,
+        const int32_t Q_size_h,
+        const int32_t Q_size_k,
+        const int32_t K_size_m,
+        const bool multiquery,
+        const float qk_scale,
+        const dim3 grid_dim,
+        const dim3 block_dim,
+        const size_t lds_bytes)
+        : XQ(XQ),
+          cache_K(cache_K),
+          cache_V(cache_V),
+          O(O),
+          seq_kv_lens(seq_kv_lens),
+          XQ_stride_b(XQ_stride_b),
+          XQ_stride_m(XQ_stride_m),
+          XQ_stride_g(XQ_stride_g),
+          XQ_stride_h(XQ_stride_h),
+          K_stride_b(K_stride_b),
+          K_stride_m(K_stride_m),
+          K_stride_g(K_stride_g),
+          K_stride_h(K_stride_h),
+          Q_size_m(Q_size_m),
+          Q_size_g(Q_size_g),
+          Q_size_h(Q_size_h),
+          Q_size_k(Q_size_k),
+          K_size_m(K_size_m),
+          multiquery(multiquery),
+          qk_scale(qk_scale),
+          grid_dim(grid_dim),
+          block_dim(block_dim),
+          lds_bytes(lds_bytes) {}
+  };
+
+  struct Invoker : public BaseInvoker {
+    using Argument = DeviceOp::Argument;
+    float Run(
+        const Argument& arg,
+        const StreamConfig& stream_config = StreamConfig{}) {
+      auto threads_per_wavefront = arg.block_dim.x;
+
+      auto Q_size_k_alignment_necessary = 0;
+
+      for (auto vec_size : {4, 2, 1}) {
+        if (arg.Q_size_k <= vec_size * threads_per_wavefront) {
+          Q_size_k_alignment_necessary = vec_size;
         }
-    };
+      }
+
+      if (!Q_size_k_alignment_necessary) {
+        throw std::runtime_error("Unsupported Q_size_k");
+      }
+
+      if (arg.Q_size_k % Q_size_k_alignment_necessary) {
+        throw std::runtime_error("Unsupported alignment for Q_size_k");
+      }
+
+      return launch_and_time_kernel(
+          stream_config,
+          Q_size_k_alignment_necessary == 4
+              ? efficient_attention_forward_decoder_ck_kernel<scalar_t, 4>
+              : Q_size_k_alignment_necessary == 2
+                  ? efficient_attention_forward_decoder_ck_kernel<scalar_t, 2>
+                  : Q_size_k_alignment_necessary == 1
+                      ? efficient_attention_forward_decoder_ck_kernel<
+                            scalar_t,
+                            1>
+                      : nullptr,
+          arg.grid_dim,
+          arg.block_dim,
+          arg.lds_bytes,
+          arg.XQ,
+          arg.cache_K,
+          arg.cache_V,
+          arg.O,
+          arg.seq_kv_lens,
+          arg.XQ_stride_b,
+          arg.XQ_stride_m,
+          arg.XQ_stride_g,
+          arg.XQ_stride_h,
+          arg.K_stride_b,
+          arg.K_stride_m,
+          arg.K_stride_g,
+          arg.K_stride_h,
+          arg.Q_size_m,
+          arg.Q_size_g,
+          arg.Q_size_h,
+          arg.Q_size_k,
+          arg.K_size_m,
+          arg.multiquery,
+          arg.qk_scale);
+    }
+  };
 };
 } // namespace device
 } // namespace tensor_operation
diff --git a/xformers/csrc/attention/hip_fmha/ck_attention_forward_decoder_splitk.h b/xformers/csrc/attention/hip_fmha/ck_attention_forward_decoder_splitk.h
index cd25f4ce6..acb1a0154 100644
--- a/xformers/csrc/attention/hip_fmha/ck_attention_forward_decoder_splitk.h
+++ b/xformers/csrc/attention/hip_fmha/ck_attention_forward_decoder_splitk.h
@@ -11,54 +11,50 @@
 namespace {
 
 template <typename data_t, int32_t vec_size>
-__device__ typename ck::vector_type<float, vec_size>::type
-scalar_scale_acc(typename ck::vector_type<float, vec_size>::type acc,
-                 typename ck::vector_type<data_t, vec_size>::type a,
-                 float b)
-{
-    union
-    {
-        decltype(acc) vec;
-        float arr[vec_size];
-    } acc_u{acc};
-    union
-    {
-        decltype(a) vec;
-        data_t arr[vec_size];
-    } a_u{a};
+__device__ typename ck::vector_type<float, vec_size>::type scalar_scale_acc(
+    typename ck::vector_type<float, vec_size>::type acc,
+    typename ck::vector_type<data_t, vec_size>::type a,
+    float b) {
+  union {
+    decltype(acc) vec;
+    float arr[vec_size];
+  } acc_u{acc};
+  union {
+    decltype(a) vec;
+    data_t arr[vec_size];
+  } a_u{a};
 
 #pragma unroll
-    for(int32_t i = 0; i < vec_size; ++i)
-    {
-        acc_u.arr[i] += ck::type_convert<float>(a_u.arr[i]) * b;
-    }
+  for (int32_t i = 0; i < vec_size; ++i) {
+    acc_u.arr[i] += ck::type_convert<float>(a_u.arr[i]) * b;
+  }
 
-    return acc_u.vec;
+  return acc_u.vec;
 }
 
 template <typename F, int32_t n_threads_per_wavefront = 64>
-float __device__ __forceinline__ wavefrontReduce(float val, F f)
-{
+float __device__ __forceinline__ wavefrontReduce(float val, F f) {
 #pragma unroll
-    for(int32_t mask = n_threads_per_wavefront >> 1; mask > 0; mask >>= 1)
-    {
-        val = f(__shfl_xor(val, mask, n_threads_per_wavefront), val);
-    }
-    return val;
+  for (int32_t mask = n_threads_per_wavefront >> 1; mask > 0; mask >>= 1) {
+    val = f(__shfl_xor(val, mask, n_threads_per_wavefront), val);
+  }
+  return val;
 }
 
 template <typename TData, typename TDataVec>
-__forceinline__ __device__ void
-load_v(const TData* __restrict__ data_ptr, int32_t vector_offset, TDataVec* __restrict__ load_to)
-{
-    *load_to = *(reinterpret_cast<const TDataVec*>(data_ptr) + vector_offset);
+__forceinline__ __device__ void load_v(
+    const TData* __restrict__ data_ptr,
+    int32_t vector_offset,
+    TDataVec* __restrict__ load_to) {
+  *load_to = *(reinterpret_cast<const TDataVec*>(data_ptr) + vector_offset);
 }
 
 template <typename TData, typename TDataVec>
-__forceinline__ __device__ void
-store_v(TData* __restrict__ data_ptr, int32_t vector_offset, TDataVec value)
-{
-    *(reinterpret_cast<TDataVec*>(data_ptr) + vector_offset) = value;
+__forceinline__ __device__ void store_v(
+    TData* __restrict__ data_ptr,
+    int32_t vector_offset,
+    TDataVec value) {
+  *(reinterpret_cast<TDataVec*>(data_ptr) + vector_offset) = value;
 }
 
 template <typename scalar_t, int32_t vec_size = 4, typename compute_t = float>
@@ -76,404 +72,378 @@ __global__ void efficient_attention_forward_decoder_splitk_reduce_ck_kernel(
     const ptrdiff_t O_stride_m,
     const ptrdiff_t O_stride_g,
     const ptrdiff_t O_stride_h,
-    const int32_t split_k)
-{
-
-    // Each block handles a single batch and head and query and group
-    const int32_t b = blockIdx.x / (Q_size_m * Q_size_g * Q_size_h);
-    const int32_t m = (blockIdx.x / (Q_size_g * Q_size_h)) % Q_size_m;
-    const int32_t g = (blockIdx.x / Q_size_h) % Q_size_g;
-    const int32_t h = blockIdx.x % Q_size_h;
-
-    using data_t        = scalar_t;
-    using data_vec_t    = typename ck::vector_type<data_t, vec_size>::type;
-    using compute_vec_t = typename ck::vector_type<compute_t, vec_size>::type;
-
-    union
-    {
-        data_vec_t vec;
-        data_t arr[vec_size];
-    } O_split_data;
-    union
-    {
-        compute_vec_t vec;
-        compute_t arr[vec_size];
-    } O_split_compute;
-    union
-    {
-        data_vec_t vec;
-        data_t arr[vec_size];
-    } global_O_data;
-    union
-    {
-        compute_vec_t vec;
-        compute_t arr[vec_size];
-    } global_O_compute;
-
-    global_O_compute.vec = 0;
-
-    const int32_t lane_idx        = threadIdx.x;
-    const bool lane_active_for_io = lane_idx * vec_size < Q_size_k;
-
-    if(!lane_active_for_io)
-    {
-        return;
-    }
-
-    compute_t global_sumexp = 0;
-    compute_t global_max    = ck::NumericLimits<compute_t>::Lowest();
-
-    for(int32_t split_idx = 0; split_idx < split_k; ++split_idx)
-    {
-        load_v<data_t, data_vec_t>(O_splits + b * O_stride_b + m * O_stride_m + g * O_stride_g +
-                                       h * O_stride_h + split_idx * O_stride_split,
-                                   lane_idx,
-                                   &O_split_data.vec);
+    const int32_t split_k) {
+  // Each block handles a single batch and head and query and group
+  const int32_t b = blockIdx.x / (Q_size_m * Q_size_g * Q_size_h);
+  const int32_t m = (blockIdx.x / (Q_size_g * Q_size_h)) % Q_size_m;
+  const int32_t g = (blockIdx.x / Q_size_h) % Q_size_g;
+  const int32_t h = blockIdx.x % Q_size_h;
+
+  using data_t = scalar_t;
+  using data_vec_t = typename ck::vector_type<data_t, vec_size>::type;
+  using compute_vec_t = typename ck::vector_type<compute_t, vec_size>::type;
+
+  union {
+    data_vec_t vec;
+    data_t arr[vec_size];
+  } O_split_data;
+  union {
+    compute_vec_t vec;
+    compute_t arr[vec_size];
+  } O_split_compute;
+  union {
+    data_vec_t vec;
+    data_t arr[vec_size];
+  } global_O_data;
+  union {
+    compute_vec_t vec;
+    compute_t arr[vec_size];
+  } global_O_compute;
+
+  global_O_compute.vec = 0;
+
+  const int32_t lane_idx = threadIdx.x;
+  const bool lane_active_for_io = lane_idx * vec_size < Q_size_k;
+
+  if (!lane_active_for_io) {
+    return;
+  }
+
+  compute_t global_sumexp = 0;
+  compute_t global_max = ck::NumericLimits<compute_t>::Lowest();
+
+  for (int32_t split_idx = 0; split_idx < split_k; ++split_idx) {
+    load_v<data_t, data_vec_t>(
+        O_splits + b * O_stride_b + m * O_stride_m + g * O_stride_g +
+            h * O_stride_h + split_idx * O_stride_split,
+        lane_idx,
+        &O_split_data.vec);
 #pragma unroll
-        for(int32_t i = 0; i < vec_size; ++i)
-        {
-            O_split_compute.arr[i] = ck::type_convert<compute_t>(O_split_data.arr[i]);
-        }
-        compute_t local_max    = *(split_max + blockIdx.x * split_k + split_idx);
-        compute_t local_sumexp = *(split_sumexp + blockIdx.x * split_k + split_idx);
-
-        compute_t log_alpha = -std::abs(local_max - global_max);
-        compute_t alpha     = isnan(log_alpha) ? compute_t{1.} : ck::math::exp(log_alpha);
-
-        bool pick_new               = local_max < global_max;
-        compute_t pick_current_coef = pick_new ? 1. : alpha;
-        compute_t pick_new_coef     = pick_new ? alpha : 1.;
-
-        global_sumexp = pick_current_coef * global_sumexp + pick_new_coef * local_sumexp;
-        global_O_compute.vec =
-            pick_current_coef * global_O_compute.vec + pick_new_coef * O_split_compute.vec;
-        global_max = ck::math::max(local_max, global_max);
+    for (int32_t i = 0; i < vec_size; ++i) {
+      O_split_compute.arr[i] = ck::type_convert<compute_t>(O_split_data.arr[i]);
     }
-    global_O_compute.vec /= global_sumexp;
+    compute_t local_max = *(split_max + blockIdx.x * split_k + split_idx);
+    compute_t local_sumexp = *(split_sumexp + blockIdx.x * split_k + split_idx);
+
+    compute_t log_alpha = -std::abs(local_max - global_max);
+    compute_t alpha =
+        isnan(log_alpha) ? compute_t{1.} : ck::math::exp(log_alpha);
+
+    bool pick_new = local_max < global_max;
+    compute_t pick_current_coef = pick_new ? 1. : alpha;
+    compute_t pick_new_coef = pick_new ? alpha : 1.;
+
+    global_sumexp =
+        pick_current_coef * global_sumexp + pick_new_coef * local_sumexp;
+    global_O_compute.vec = pick_current_coef * global_O_compute.vec +
+        pick_new_coef * O_split_compute.vec;
+    global_max = ck::math::max(local_max, global_max);
+  }
+  global_O_compute.vec /= global_sumexp;
 #pragma unroll
-    for(int32_t i = 0; i < vec_size; ++i)
-    {
-        global_O_data.arr[i] = ck::type_convert<data_t>(global_O_compute.arr[i]);
-    }
-    store_v<data_t, data_vec_t>(O + b * O_stride_b + m * O_stride_m + g * O_stride_g +
-                                    h * O_stride_h,
-                                lane_idx,
-                                global_O_data.vec);
+  for (int32_t i = 0; i < vec_size; ++i) {
+    global_O_data.arr[i] = ck::type_convert<data_t>(global_O_compute.arr[i]);
+  }
+  store_v<data_t, data_vec_t>(
+      O + b * O_stride_b + m * O_stride_m + g * O_stride_g + h * O_stride_h,
+      lane_idx,
+      global_O_data.vec);
 }
 
-template <typename scalar_t,
-          int32_t vec_size           = 4,
-          int32_t n_loop_unroll      = 16,
-          int32_t n_loop_unroll_tail = 2,
-          int32_t KV_M_MAX           = 8192,
-          typename compute_t         = float>
-__global__ void
-efficient_attention_forward_decoder_splitk_ck_kernel(const scalar_t* __restrict__ XQ,
-                                                     const scalar_t* __restrict__ cache_K,
-                                                     const scalar_t* __restrict__ cache_V,
-                                                     scalar_t* __restrict__ O_splits,
-                                                     compute_t* __restrict__ split_max,
-                                                     compute_t* __restrict__ split_sumexp,
-                                                     const int32_t* __restrict__ seq_kv_lens,
-                                                     const ptrdiff_t XQ_stride_b,
-                                                     const ptrdiff_t XQ_stride_m,
-                                                     const ptrdiff_t XQ_stride_g,
-                                                     const ptrdiff_t XQ_stride_h,
-                                                     const ptrdiff_t K_stride_b,
-                                                     const ptrdiff_t K_stride_m,
-                                                     const ptrdiff_t K_stride_g,
-                                                     const ptrdiff_t K_stride_h,
-                                                     const ptrdiff_t O_stride_split,
-                                                     const int32_t Q_size_m,
-                                                     const int32_t Q_size_g,
-                                                     const int32_t Q_size_h,
-                                                     const int32_t Q_size_k,
-                                                     const int32_t K_size_m,
-                                                     const bool multiquery,
-                                                     const float qk_scale,
-                                                     const int32_t split_k)
-{
-    static_assert(n_loop_unroll_tail < n_loop_unroll || n_loop_unroll_tail == 1,
-                  "tail unroll must be smaller than main loop untoll; pragma unroll 0 is illegal "
-                  "(and tail is no-op)");
-
-    // Each block handles a single batch and head and query and group
-    const int32_t b         = blockIdx.x / (Q_size_m * Q_size_g * Q_size_h);
-    const int32_t m         = (blockIdx.x / (Q_size_g * Q_size_h)) % Q_size_m;
-    const int32_t g         = (blockIdx.x / Q_size_h) % Q_size_g;
-    const int32_t h         = blockIdx.x % Q_size_h;
-    const int32_t split_idx = blockIdx.y;
-
-    // Note: this is decoding case where we attend to current and all previous
-    // tokens.
-    const int32_t t_max = seq_kv_lens ? seq_kv_lens[b] : K_size_m;
-
-    const int32_t lane_idx      = threadIdx.x;
-    const int32_t wavefront_idx = threadIdx.y;
-    // TODO: `threads_per_wavefront` and `wavefronts_per_block` may be compile time constants;
-    // investigate when optimizing
-    const int32_t threads_per_wavefront = blockDim.x;
-    const int32_t wavefronts_per_block  = blockDim.y;
-    const int32_t threads_per_block     = threads_per_wavefront * wavefronts_per_block;
-    const int32_t thread_linear_idx     = lane_idx + wavefront_idx * threads_per_wavefront;
-    // const auto* q_ = &(XQ_acc[b][m][g][h][0]);
-    const auto XQO_base_offset =
-        b * XQ_stride_b + m * XQ_stride_m + g * XQ_stride_g + h * XQ_stride_h;
-    const auto* __restrict__ q_ = XQ + XQO_base_offset;
-
-    const auto cache_KV_base_offset =
-        b * K_stride_b + 0 * K_stride_m + g * K_stride_g + (multiquery ? 0 : h * K_stride_h);
-    const auto* __restrict__ cache_K_base = cache_K + cache_KV_base_offset;
-    const auto* __restrict__ cache_V_base = cache_V + cache_KV_base_offset;
-
-    using data_t        = scalar_t;
-    using data_vec_t    = typename ck::vector_type<data_t, vec_size>::type;
-    using compute_vec_t = typename ck::vector_type<compute_t, vec_size>::type;
-
-    const bool lane_active_for_io = lane_idx * vec_size < Q_size_k;
-
-    extern __shared__ __align__(16) compute_t smem[];
-
-    data_vec_t q_thread = 0;
-    // Load Q into registers in all wavefronts.
-    // Each thread handles `vec_size` D dimensions
-    if(lane_active_for_io)
-    {
-        load_v<data_t, data_vec_t>(q_, lane_idx, &q_thread);
-    }
-
-    compute_t max_qk_acc = ck::NumericLimits<compute_t>::Lowest();
-
-    // Compute S[0:t_max] =
-    // ```
-    // for t in range(t_max):
-    //   S[t] = dot(Q, K[t])
-    // ```
-    // Split the 0:t_max range across wavefronts in a block,
-    // unroll loads to expose more parallelism.
-    // Reduce the dot product with cross-lane operation;
-    // Q and K[t] are in the registers of threads in a single wavefront.
-
-    data_vec_t k_loads[n_loop_unroll] = {};
-
-    const auto dtt = wavefronts_per_block * n_loop_unroll;
-    // only last split gets the tail.
-    // the first (split_k - 1) splits have a number of iterations divisible by `dtt`
-    const auto n_unrolled_loops = t_max / dtt / split_k; // +1?
-    const int32_t tt_low = wavefront_idx * n_loop_unroll + n_unrolled_loops * dtt * split_idx;
-    const int32_t tt_high =
-        wavefront_idx * n_loop_unroll + n_unrolled_loops * dtt * (split_idx + 1);
-    const int32_t dtt_tail = wavefronts_per_block * n_loop_unroll_tail;
-    const int32_t tt_tail_low =
-        wavefront_idx * n_loop_unroll_tail + n_unrolled_loops * dtt * (split_idx + 1);
-    const int32_t tt_tail_high = (split_idx == split_k - 1) ? t_max : tt_tail_low;
-
-    for(auto tt = tt_low; tt < tt_high; tt += dtt)
-    {
-        if(lane_active_for_io)
-        {
+template <
+    typename scalar_t,
+    int32_t vec_size = 4,
+    int32_t n_loop_unroll = 16,
+    int32_t n_loop_unroll_tail = 2,
+    int32_t KV_M_MAX = 8192,
+    typename compute_t = float>
+__global__ void efficient_attention_forward_decoder_splitk_ck_kernel(
+    const scalar_t* __restrict__ XQ,
+    const scalar_t* __restrict__ cache_K,
+    const scalar_t* __restrict__ cache_V,
+    scalar_t* __restrict__ O_splits,
+    compute_t* __restrict__ split_max,
+    compute_t* __restrict__ split_sumexp,
+    const int32_t* __restrict__ seq_kv_lens,
+    const ptrdiff_t XQ_stride_b,
+    const ptrdiff_t XQ_stride_m,
+    const ptrdiff_t XQ_stride_g,
+    const ptrdiff_t XQ_stride_h,
+    const ptrdiff_t K_stride_b,
+    const ptrdiff_t K_stride_m,
+    const ptrdiff_t K_stride_g,
+    const ptrdiff_t K_stride_h,
+    const ptrdiff_t O_stride_split,
+    const int32_t Q_size_m,
+    const int32_t Q_size_g,
+    const int32_t Q_size_h,
+    const int32_t Q_size_k,
+    const int32_t K_size_m,
+    const bool multiquery,
+    const float qk_scale,
+    const int32_t split_k) {
+  static_assert(
+      n_loop_unroll_tail < n_loop_unroll || n_loop_unroll_tail == 1,
+      "tail unroll must be smaller than main loop untoll; pragma unroll 0 is illegal "
+      "(and tail is no-op)");
+
+  // Each block handles a single batch and head and query and group
+  const int32_t b = blockIdx.x / (Q_size_m * Q_size_g * Q_size_h);
+  const int32_t m = (blockIdx.x / (Q_size_g * Q_size_h)) % Q_size_m;
+  const int32_t g = (blockIdx.x / Q_size_h) % Q_size_g;
+  const int32_t h = blockIdx.x % Q_size_h;
+  const int32_t split_idx = blockIdx.y;
+
+  // Note: this is decoding case where we attend to current and all previous
+  // tokens.
+  const int32_t t_max = seq_kv_lens ? seq_kv_lens[b] : K_size_m;
+
+  const int32_t lane_idx = threadIdx.x;
+  const int32_t wavefront_idx = threadIdx.y;
+  // TODO: `threads_per_wavefront` and `wavefronts_per_block` may be compile
+  // time constants; investigate when optimizing
+  const int32_t threads_per_wavefront = blockDim.x;
+  const int32_t wavefronts_per_block = blockDim.y;
+  const int32_t threads_per_block =
+      threads_per_wavefront * wavefronts_per_block;
+  const int32_t thread_linear_idx =
+      lane_idx + wavefront_idx * threads_per_wavefront;
+  // const auto* q_ = &(XQ_acc[b][m][g][h][0]);
+  const auto XQO_base_offset =
+      b * XQ_stride_b + m * XQ_stride_m + g * XQ_stride_g + h * XQ_stride_h;
+  const auto* __restrict__ q_ = XQ + XQO_base_offset;
+
+  const auto cache_KV_base_offset = b * K_stride_b + 0 * K_stride_m +
+      g * K_stride_g + (multiquery ? 0 : h * K_stride_h);
+  const auto* __restrict__ cache_K_base = cache_K + cache_KV_base_offset;
+  const auto* __restrict__ cache_V_base = cache_V + cache_KV_base_offset;
+
+  using data_t = scalar_t;
+  using data_vec_t = typename ck::vector_type<data_t, vec_size>::type;
+  using compute_vec_t = typename ck::vector_type<compute_t, vec_size>::type;
+
+  const bool lane_active_for_io = lane_idx * vec_size < Q_size_k;
+
+  extern __shared__ __align__(16) compute_t smem[];
+
+  data_vec_t q_thread = 0;
+  // Load Q into registers in all wavefronts.
+  // Each thread handles `vec_size` D dimensions
+  if (lane_active_for_io) {
+    load_v<data_t, data_vec_t>(q_, lane_idx, &q_thread);
+  }
+
+  compute_t max_qk_acc = ck::NumericLimits<compute_t>::Lowest();
+
+  // Compute S[0:t_max] =
+  // ```
+  // for t in range(t_max):
+  //   S[t] = dot(Q, K[t])
+  // ```
+  // Split the 0:t_max range across wavefronts in a block,
+  // unroll loads to expose more parallelism.
+  // Reduce the dot product with cross-lane operation;
+  // Q and K[t] are in the registers of threads in a single wavefront.
+
+  data_vec_t k_loads[n_loop_unroll] = {};
+
+  const auto dtt = wavefronts_per_block * n_loop_unroll;
+  // only last split gets the tail.
+  // the first (split_k - 1) splits have a number of iterations divisible by
+  // `dtt`
+  const auto n_unrolled_loops = t_max / dtt / split_k; // +1?
+  const int32_t tt_low =
+      wavefront_idx * n_loop_unroll + n_unrolled_loops * dtt * split_idx;
+  const int32_t tt_high =
+      wavefront_idx * n_loop_unroll + n_unrolled_loops * dtt * (split_idx + 1);
+  const int32_t dtt_tail = wavefronts_per_block * n_loop_unroll_tail;
+  const int32_t tt_tail_low = wavefront_idx * n_loop_unroll_tail +
+      n_unrolled_loops * dtt * (split_idx + 1);
+  const int32_t tt_tail_high = (split_idx == split_k - 1) ? t_max : tt_tail_low;
+
+  for (auto tt = tt_low; tt < tt_high; tt += dtt) {
+    if (lane_active_for_io) {
 #pragma unroll n_loop_unroll
-            for(auto ttt = 0; ttt < n_loop_unroll; ++ttt)
-            {
-                const int32_t t = tt + ttt;
-                // load the K[b][t][g][h|0][:] row into registers
-                load_v<data_t, data_vec_t>(cache_K_base + t * K_stride_m, lane_idx, &k_loads[ttt]);
-            }
-        }
+      for (auto ttt = 0; ttt < n_loop_unroll; ++ttt) {
+        const int32_t t = tt + ttt;
+        // load the K[b][t][g][h|0][:] row into registers
+        load_v<data_t, data_vec_t>(
+            cache_K_base + t * K_stride_m, lane_idx, &k_loads[ttt]);
+      }
+    }
 #pragma unroll n_loop_unroll
-        for(auto ttt = 0; ttt < n_loop_unroll; ++ttt)
-        {
-            compute_t qk_acc = 0;
-            ck::inner_product<data_vec_t, data_vec_t, compute_t>(q_thread, k_loads[ttt], qk_acc);
-            qk_acc *= qk_scale;
-
-            qk_acc     = wavefrontReduce(qk_acc, [](auto a, auto b) { return a + b; });
-            max_qk_acc = ck::math::max(qk_acc, max_qk_acc);
-            if(lane_idx == 0)
-            {
-                smem[tt + ttt - n_unrolled_loops * dtt * split_idx] = qk_acc;
-            }
-        }
+    for (auto ttt = 0; ttt < n_loop_unroll; ++ttt) {
+      compute_t qk_acc = 0;
+      ck::inner_product<data_vec_t, data_vec_t, compute_t>(
+          q_thread, k_loads[ttt], qk_acc);
+      qk_acc *= qk_scale;
+
+      qk_acc = wavefrontReduce(qk_acc, [](auto a, auto b) { return a + b; });
+      max_qk_acc = ck::math::max(qk_acc, max_qk_acc);
+      if (lane_idx == 0) {
+        smem[tt + ttt - n_unrolled_loops * dtt * split_idx] = qk_acc;
+      }
     }
+  }
 
-    for(auto tt = tt_tail_low; tt < tt_tail_high; tt += dtt_tail)
-    {
-        if(lane_active_for_io)
-        {
+  for (auto tt = tt_tail_low; tt < tt_tail_high; tt += dtt_tail) {
+    if (lane_active_for_io) {
 #pragma unroll n_loop_unroll_tail
-            for(auto ttt = 0; ttt < n_loop_unroll_tail; ++ttt)
-            {
-                const int32_t t = tt + ttt;
-                if(t < t_max)
-                {
-                    // load the K[b][t][g][h|0][:] row into registers
-                    load_v<data_t, data_vec_t>(
-                        cache_K_base + t * K_stride_m, lane_idx, &k_loads[ttt]);
-                }
-            }
+      for (auto ttt = 0; ttt < n_loop_unroll_tail; ++ttt) {
+        const int32_t t = tt + ttt;
+        if (t < t_max) {
+          // load the K[b][t][g][h|0][:] row into registers
+          load_v<data_t, data_vec_t>(
+              cache_K_base + t * K_stride_m, lane_idx, &k_loads[ttt]);
         }
+      }
+    }
 #pragma unroll n_loop_unroll_tail
-        for(auto ttt = 0; ttt < n_loop_unroll_tail; ++ttt)
-        {
-            compute_t qk_acc = 0;
-            const int32_t t  = tt + ttt;
-            if(t < t_max)
-            {
-                ck::inner_product<data_vec_t, data_vec_t, compute_t>(
-                    q_thread, k_loads[ttt], qk_acc);
-                qk_acc *= qk_scale;
-
-                qk_acc     = wavefrontReduce(qk_acc, [](auto a, auto b) { return a + b; });
-                max_qk_acc = ck::math::max(qk_acc, max_qk_acc);
-
-                // write accumulated sums to smem.
-                if(lane_idx == 0)
-                {
-                    smem[t - n_unrolled_loops * dtt * split_idx] = qk_acc;
-                }
-            }
+    for (auto ttt = 0; ttt < n_loop_unroll_tail; ++ttt) {
+      compute_t qk_acc = 0;
+      const int32_t t = tt + ttt;
+      if (t < t_max) {
+        ck::inner_product<data_vec_t, data_vec_t, compute_t>(
+            q_thread, k_loads[ttt], qk_acc);
+        qk_acc *= qk_scale;
+
+        qk_acc = wavefrontReduce(qk_acc, [](auto a, auto b) { return a + b; });
+        max_qk_acc = ck::math::max(qk_acc, max_qk_acc);
+
+        // write accumulated sums to smem.
+        if (lane_idx == 0) {
+          smem[t - n_unrolled_loops * dtt * split_idx] = qk_acc;
         }
+      }
     }
+  }
+
+  // Use shared reduction to compute max and compute softmax on shared memory.
+  // write max acc
+  if (lane_idx == 0) {
+    smem[KV_M_MAX + wavefront_idx] = max_qk_acc;
+  }
+  __syncthreads();
+  if (lane_idx < wavefronts_per_block) {
+    max_qk_acc = ck::math::max(max_qk_acc, smem[KV_M_MAX + lane_idx]);
+  }
+  // shared across all threads in block
+  max_qk_acc =
+      wavefrontReduce(max_qk_acc, [](auto a, auto b) { return a > b ? a : b; });
+
+  if (wavefront_idx == 0 && lane_idx == 0) {
+    split_max[blockIdx.x * split_k + split_idx] = max_qk_acc;
+  }
+
+  // each wavefront computes partial sum of exp.
+  { // softmax reduce begin
+    compute_t softmax_denominator = 0.0f;
+    const int32_t t_low = n_unrolled_loops * dtt * split_idx;
+    const int32_t t_high = (split_idx + 1 < split_k)
+        ? n_unrolled_loops * dtt * (split_idx + 1)
+        : t_max;
+    for (int32_t t = t_low + thread_linear_idx; t < t_high;
+         t += threads_per_block) {
+      const auto s = ck::math::exp(smem[t - t_low] - max_qk_acc);
+      softmax_denominator += s;
+      smem[t - t_low] = s;
+    }
+    softmax_denominator = wavefrontReduce(
+        softmax_denominator, [](auto a, auto b) { return a + b; });
 
-    // Use shared reduction to compute max and compute softmax on shared memory.
-    // write max acc
-    if(lane_idx == 0)
-    {
-        smem[KV_M_MAX + wavefront_idx] = max_qk_acc;
+    if (lane_idx == 0) {
+      smem[KV_M_MAX + wavefront_idx] = softmax_denominator;
     }
     __syncthreads();
-    if(lane_idx < wavefronts_per_block)
-    {
-        max_qk_acc = ck::math::max(max_qk_acc, smem[KV_M_MAX + lane_idx]);
-    }
-    // shared across all threads in block
-    max_qk_acc = wavefrontReduce(max_qk_acc, [](auto a, auto b) { return a > b ? a : b; });
 
-    if(wavefront_idx == 0 && lane_idx == 0)
-    {
-        split_max[blockIdx.x * split_k + split_idx] = max_qk_acc;
+    // now, compute sum of exp(x - max(x)) over all intermediate results.
+    softmax_denominator = 0.0;
+    if (lane_idx < wavefronts_per_block) {
+      softmax_denominator = smem[KV_M_MAX + lane_idx];
     }
+    softmax_denominator = wavefrontReduce(
+        softmax_denominator, [](auto a, auto b) { return a + b; });
 
-    // each wavefront computes partial sum of exp.
-    { // softmax reduce begin
-        compute_t softmax_denominator = 0.0f;
-        const int32_t t_low           = n_unrolled_loops * dtt * split_idx;
-        const int32_t t_high =
-            (split_idx + 1 < split_k) ? n_unrolled_loops * dtt * (split_idx + 1) : t_max;
-        for(int32_t t = t_low + thread_linear_idx; t < t_high; t += threads_per_block)
-        {
-            const auto s = ck::math::exp(smem[t - t_low] - max_qk_acc);
-            softmax_denominator += s;
-            smem[t - t_low] = s;
-        }
-        softmax_denominator =
-            wavefrontReduce(softmax_denominator, [](auto a, auto b) { return a + b; });
+    if (wavefront_idx == 0 && lane_idx == 0) {
+      split_sumexp[blockIdx.x * split_k + split_idx] = softmax_denominator;
+    }
+  } // softmax reduce end
 
-        if(lane_idx == 0)
-        {
-            smem[KV_M_MAX + wavefront_idx] = softmax_denominator;
-        }
-        __syncthreads();
+  // Split T across wavefronts in a block
+  // each wavefront compute sum(t_subset) P[t] * V[t_subset, d]
+  // outputs are of size float[D]
 
-        // now, compute sum of exp(x - max(x)) over all intermediate results.
-        softmax_denominator = 0.0;
-        if(lane_idx < wavefronts_per_block)
-        {
-            softmax_denominator = smem[KV_M_MAX + lane_idx];
-        }
-        softmax_denominator =
-            wavefrontReduce(softmax_denominator, [](auto a, auto b) { return a + b; });
-
-        if(wavefront_idx == 0 && lane_idx == 0)
-        {
-            split_sumexp[blockIdx.x * split_k + split_idx] = softmax_denominator;
-        }
-    } // softmax reduce end
-
-    // Split T across wavefronts in a block
-    // each wavefront compute sum(t_subset) P[t] * V[t_subset, d]
-    // outputs are of size float[D]
-
-    compute_t ps[n_loop_unroll] = {};
-    compute_vec_t o_acc         = 0;
-    if(lane_active_for_io)
-    {
-        for(auto tt = tt_low; tt < tt_high; tt += dtt)
-        {
+  compute_t ps[n_loop_unroll] = {};
+  compute_vec_t o_acc = 0;
+  if (lane_active_for_io) {
+    for (auto tt = tt_low; tt < tt_high; tt += dtt) {
 #pragma unroll n_loop_unroll
-            for(auto ttt = 0; ttt < n_loop_unroll; ++ttt)
-            {
-                const int32_t t = tt + ttt;
-                // load the V[b][t][g][h|0][:] row into registers, reusing K register
-                // storage
-                load_v<data_t, data_vec_t>(cache_V_base + t * K_stride_m, lane_idx, &k_loads[ttt]);
-                ps[ttt] = smem[t - n_unrolled_loops * dtt * split_idx];
-            }
+      for (auto ttt = 0; ttt < n_loop_unroll; ++ttt) {
+        const int32_t t = tt + ttt;
+        // load the V[b][t][g][h|0][:] row into registers, reusing K register
+        // storage
+        load_v<data_t, data_vec_t>(
+            cache_V_base + t * K_stride_m, lane_idx, &k_loads[ttt]);
+        ps[ttt] = smem[t - n_unrolled_loops * dtt * split_idx];
+      }
 
 #pragma unroll n_loop_unroll
-            for(auto ttt = 0; ttt < n_loop_unroll; ++ttt)
-            {
-                o_acc = scalar_scale_acc<data_t, vec_size>(o_acc, k_loads[ttt], ps[ttt]);
-            }
-        }
+      for (auto ttt = 0; ttt < n_loop_unroll; ++ttt) {
+        o_acc =
+            scalar_scale_acc<data_t, vec_size>(o_acc, k_loads[ttt], ps[ttt]);
+      }
+    }
 
-        for(auto tt = tt_tail_low; tt < tt_tail_high; tt += dtt_tail)
-        {
+    for (auto tt = tt_tail_low; tt < tt_tail_high; tt += dtt_tail) {
 #pragma unroll n_loop_unroll_tail
-            for(auto ttt = 0; ttt < n_loop_unroll_tail; ++ttt)
-            {
-                const int32_t t = tt + ttt;
-                if(t < t_max)
-                {
-                    // load the V[b][t][g][h|0][:] row into registers, reusing K register
-                    // storage
-                    load_v<data_t, data_vec_t>(
-                        cache_V_base + t * K_stride_m, lane_idx, &k_loads[ttt]);
-                    ps[ttt] = smem[t - n_unrolled_loops * dtt * split_idx];
-                    o_acc   = scalar_scale_acc<data_t, vec_size>(o_acc, k_loads[ttt], ps[ttt]);
-                }
-            }
+      for (auto ttt = 0; ttt < n_loop_unroll_tail; ++ttt) {
+        const int32_t t = tt + ttt;
+        if (t < t_max) {
+          // load the V[b][t][g][h|0][:] row into registers, reusing K register
+          // storage
+          load_v<data_t, data_vec_t>(
+              cache_V_base + t * K_stride_m, lane_idx, &k_loads[ttt]);
+          ps[ttt] = smem[t - n_unrolled_loops * dtt * split_idx];
+          o_acc =
+              scalar_scale_acc<data_t, vec_size>(o_acc, k_loads[ttt], ps[ttt]);
         }
+      }
     }
-    __syncthreads();
-
-    // NB: needs sizeof(smem) >= `vec_size` * (sizeof(float)==4) * threadsPerBlock
-    if(lane_active_for_io)
-    {
-        store_v<compute_t, compute_vec_t>(&smem[0], thread_linear_idx, o_acc);
+  }
+  __syncthreads();
+
+  // NB: needs sizeof(smem) >= `vec_size` * (sizeof(float)==4) * threadsPerBlock
+  if (lane_active_for_io) {
+    store_v<compute_t, compute_vec_t>(&smem[0], thread_linear_idx, o_acc);
+  }
+
+  __syncthreads();
+  // sum up partial D rows from other wavefronts
+  if (wavefront_idx == 0 && lane_active_for_io) {
+    union {
+      compute_vec_t vec = 0;
+      compute_t arr[vec_size];
+    } r;
+    for (int32_t w = 0; w < wavefronts_per_block; ++w) {
+      compute_vec_t partial_r;
+      load_v<compute_t, compute_vec_t>(
+          smem, w * threads_per_wavefront + lane_idx, &partial_r);
+      r.vec += partial_r;
     }
-
-    __syncthreads();
-    // sum up partial D rows from other wavefronts
-    if(wavefront_idx == 0 && lane_active_for_io)
-    {
-        union
-        {
-            compute_vec_t vec = 0;
-            compute_t arr[vec_size];
-        } r;
-        for(int32_t w = 0; w < wavefronts_per_block; ++w)
-        {
-            compute_vec_t partial_r;
-            load_v<compute_t, compute_vec_t>(
-                smem, w * threads_per_wavefront + lane_idx, &partial_r);
-            r.vec += partial_r;
-        }
-        // elementwise convert from compute_t result to data_t out to be written
-        union
-        {
-            data_vec_t vec;
-            data_t arr[vec_size];
-        } bf_r;
+    // elementwise convert from compute_t result to data_t out to be written
+    union {
+      data_vec_t vec;
+      data_t arr[vec_size];
+    } bf_r;
 #pragma unroll
-        for(int32_t i = 0; i < vec_size; ++i)
-        {
-            bf_r.arr[i] = ck::type_convert<data_t>(r.arr[i]);
-        }
-        // write output row O[b][m][g][h][:]
-        data_t* __restrict__ o_ = O_splits + XQO_base_offset + split_idx * O_stride_split;
-        store_v<data_t, data_vec_t>(o_, lane_idx, bf_r.vec);
+    for (int32_t i = 0; i < vec_size; ++i) {
+      bf_r.arr[i] = ck::type_convert<data_t>(r.arr[i]);
     }
+    // write output row O[b][m][g][h][:]
+    data_t* __restrict__ o_ =
+        O_splits + XQO_base_offset + split_idx * O_stride_split;
+    store_v<data_t, data_vec_t>(o_, lane_idx, bf_r.vec);
+  }
 }
 
 } // namespace
@@ -482,239 +452,241 @@ namespace ck {
 namespace tensor_operation {
 namespace device {
 template <typename scalar_t, typename compute_t = float>
-struct FMHADecoderSplitKDeviceOp : public BaseOperator
-{
-    using DeviceOp = FMHADecoderSplitKDeviceOp;
-    struct Argument : public BaseArgument
-    {
-        const scalar_t* __restrict__ XQ;
-        const scalar_t* __restrict__ cache_K;
-        const scalar_t* __restrict__ cache_V;
-        scalar_t* __restrict__ O;
-        scalar_t* __restrict__ split_O;
-        compute_t* __restrict__ split_max;
-        compute_t* __restrict__ split_sumexp;
-        const int32_t* __restrict__ seq_kv_lens;
-        const ptrdiff_t XQ_stride_b;
-        const ptrdiff_t XQ_stride_m;
-        const ptrdiff_t XQ_stride_g;
-        const ptrdiff_t XQ_stride_h;
-        const ptrdiff_t K_stride_b;
-        const ptrdiff_t K_stride_m;
-        const ptrdiff_t K_stride_g;
-        const ptrdiff_t K_stride_h;
-        const ptrdiff_t O_stride_split;
-        const int32_t Q_size_m;
-        const int32_t Q_size_g;
-        const int32_t Q_size_h;
-        const int32_t Q_size_k;
-        const int32_t K_size_m;
-        const bool multiquery;
-        const float qk_scale;
-        const int32_t split_k;
-
-        const dim3 grid_dim;
-        const dim3 block_dim;
-        const size_t lds_bytes;
-
-        Argument(const scalar_t* __restrict__ XQ,
-                 const scalar_t* __restrict__ cache_K,
-                 const scalar_t* __restrict__ cache_V,
-                 scalar_t* __restrict__ O,
-                 scalar_t* __restrict__ split_O,
-                 compute_t* __restrict__ split_max,
-                 compute_t* __restrict__ split_sumexp,
-                 const int32_t* __restrict__ seq_kv_lens,
-                 const ptrdiff_t XQ_stride_b,
-                 const ptrdiff_t XQ_stride_m,
-                 const ptrdiff_t XQ_stride_g,
-                 const ptrdiff_t XQ_stride_h,
-                 const ptrdiff_t K_stride_b,
-                 const ptrdiff_t K_stride_m,
-                 const ptrdiff_t K_stride_g,
-                 const ptrdiff_t K_stride_h,
-                 const ptrdiff_t O_stride_split,
-                 const int32_t Q_size_m,
-                 const int32_t Q_size_g,
-                 const int32_t Q_size_h,
-                 const int32_t Q_size_k,
-                 const int32_t K_size_m,
-                 const bool multiquery,
-                 const float qk_scale,
-                 const int32_t split_k,
-                 // launch params
-                 const dim3 grid_dim,
-                 const dim3 block_dim,
-                 const size_t lds_bytes)
-            : XQ(XQ),
-              cache_K(cache_K),
-              cache_V(cache_V),
-              O(O),
-              split_O(split_O),
-              split_max(split_max),
-              split_sumexp(split_sumexp),
-              seq_kv_lens(seq_kv_lens),
-              XQ_stride_b(XQ_stride_b),
-              XQ_stride_m(XQ_stride_m),
-              XQ_stride_g(XQ_stride_g),
-              XQ_stride_h(XQ_stride_h),
-              K_stride_b(K_stride_b),
-              K_stride_m(K_stride_m),
-              K_stride_g(K_stride_g),
-              K_stride_h(K_stride_h),
-              O_stride_split(O_stride_split),
-              Q_size_m(Q_size_m),
-              Q_size_g(Q_size_g),
-              Q_size_h(Q_size_h),
-              Q_size_k(Q_size_k),
-              K_size_m(K_size_m),
-              multiquery(multiquery),
-              qk_scale(qk_scale),
-              split_k(split_k),
-              // launch params
-              grid_dim(grid_dim),
-              block_dim(block_dim),
-              lds_bytes(lds_bytes)
-        {
-        }
-
-        std::string str() const
-        {
-            std::ostringstream oss;
-            oss << "Argument { " << std::endl
-                << "    XQ: " << XQ << std::endl
-                << "    cache_K: " << cache_K << std::endl
-                << "    cache_V: " << cache_V << std::endl
-                << "    O: " << O << std::endl
-                << "    split_O: " << split_O << std::endl
-                << "    split_max: " << split_max << std::endl
-                << "    split_sumexp: " << split_sumexp << std::endl
-                << "    seq_kv_lens: " << seq_kv_lens << std::endl
-                << "    XQ_stride_b: " << XQ_stride_b << std::endl
-                << "    XQ_stride_m: " << XQ_stride_m << std::endl
-                << "    XQ_stride_g: " << XQ_stride_g << std::endl
-                << "    XQ_stride_h: " << XQ_stride_h << std::endl
-                << "    K_stride_b: " << K_stride_b << std::endl
-                << "    K_stride_m: " << K_stride_m << std::endl
-                << "    K_stride_g: " << K_stride_g << std::endl
-                << "    K_stride_h: " << K_stride_h << std::endl
-                << "    O_stride_split: " << O_stride_split << std::endl
-                << "    Q_size_m: " << Q_size_m << std::endl
-                << "    Q_size_g: " << Q_size_g << std::endl
-                << "    Q_size_h: " << Q_size_h << std::endl
-                << "    Q_size_k: " << Q_size_k << std::endl
-                << "    K_size_m: " << K_size_m << std::endl
-                << "    multiquery: " << multiquery << std::endl
-                << "    qk_scale: " << qk_scale << std::endl
-                << "    split_k: " << split_k << std::endl
-                << std::endl
-                << "    grid_dim: " << grid_dim.x << "." << grid_dim.y << "." << grid_dim.z
-                << std::endl
-                << "    block_dim: " << block_dim.x << "." << block_dim.y << "." << block_dim.z
-                << std::endl
-                << "    lds_bytes: " << lds_bytes << std::endl
-                << "}";
-            return oss.str();
-        }
-    };
-
-    struct Invoker : public BaseInvoker
-    {
-        using Argument = DeviceOp::Argument;
-        float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{})
-        {
-            auto threads_per_wavefront        = arg.block_dim.x;
-            auto Q_size_k_alignment_necessary = 0;
-
-            for(auto vec_size : {4, 2, 1})
-            {
-                if(arg.Q_size_k <= vec_size * threads_per_wavefront)
-                {
-                    Q_size_k_alignment_necessary = vec_size;
-                }
-            }
-
-            if(!Q_size_k_alignment_necessary)
-            {
-                throw std::runtime_error("Unsupported Q_size_k");
-            }
-
-            if(arg.Q_size_k % Q_size_k_alignment_necessary)
-            {
-                throw std::runtime_error("Unsupported alignment for Q_size_k");
-            }
-
-            float split_attention_result = launch_and_time_kernel(
-                stream_config,
-                Q_size_k_alignment_necessary == 4
-                    ? efficient_attention_forward_decoder_splitk_ck_kernel<scalar_t, 4>
-                    : Q_size_k_alignment_necessary == 2
-                          ? efficient_attention_forward_decoder_splitk_ck_kernel<scalar_t, 2>
-                          : Q_size_k_alignment_necessary == 1
-                                ? efficient_attention_forward_decoder_splitk_ck_kernel<scalar_t, 1>
-                                : nullptr,
-                arg.grid_dim,
-                arg.block_dim,
-                arg.lds_bytes,
-                arg.XQ,
-                arg.cache_K,
-                arg.cache_V,
-                arg.split_O,
-                arg.split_max,
-                arg.split_sumexp,
-                arg.seq_kv_lens,
-                arg.XQ_stride_b,
-                arg.XQ_stride_m,
-                arg.XQ_stride_g,
-                arg.XQ_stride_h,
-                arg.K_stride_b,
-                arg.K_stride_m,
-                arg.K_stride_g,
-                arg.K_stride_h,
-                arg.O_stride_split,
-                arg.Q_size_m,
-                arg.Q_size_g,
-                arg.Q_size_h,
-                arg.Q_size_k,
-                arg.K_size_m,
-                arg.multiquery,
-                arg.qk_scale,
-                arg.split_k);
-
-            const dim3 reduce_gridsize         = {arg.grid_dim.x};
-            const dim3 reduce_blocksize        = {arg.block_dim.x};
-            constexpr int32_t reduce_lds_bytes = 0;
-            float reduce_result                = launch_and_time_kernel(
-                stream_config,
-                Q_size_k_alignment_necessary == 4
-                    ? efficient_attention_forward_decoder_splitk_reduce_ck_kernel<scalar_t, 4>
-                    : Q_size_k_alignment_necessary == 2
-                          ? efficient_attention_forward_decoder_splitk_reduce_ck_kernel<scalar_t, 2>
-                          : Q_size_k_alignment_necessary == 1
-                                ? efficient_attention_forward_decoder_splitk_reduce_ck_kernel<
-                                      scalar_t,
-                                      1>
-                                : nullptr,
-                reduce_gridsize,
-                reduce_blocksize,
-                reduce_lds_bytes,
-                arg.split_O,
-                arg.split_max,
-                arg.split_sumexp,
-                arg.O,
-                arg.Q_size_m,
-                arg.Q_size_g,
-                arg.Q_size_h,
-                arg.Q_size_k,
-                arg.O_stride_split,
-                arg.XQ_stride_b,
-                arg.XQ_stride_m,
-                arg.XQ_stride_g,
-                arg.XQ_stride_h,
-                arg.split_k);
-            return split_attention_result + reduce_result;
+struct FMHADecoderSplitKDeviceOp : public BaseOperator {
+  using DeviceOp = FMHADecoderSplitKDeviceOp;
+  struct Argument : public BaseArgument {
+    const scalar_t* __restrict__ XQ;
+    const scalar_t* __restrict__ cache_K;
+    const scalar_t* __restrict__ cache_V;
+    scalar_t* __restrict__ O;
+    scalar_t* __restrict__ split_O;
+    compute_t* __restrict__ split_max;
+    compute_t* __restrict__ split_sumexp;
+    const int32_t* __restrict__ seq_kv_lens;
+    const ptrdiff_t XQ_stride_b;
+    const ptrdiff_t XQ_stride_m;
+    const ptrdiff_t XQ_stride_g;
+    const ptrdiff_t XQ_stride_h;
+    const ptrdiff_t K_stride_b;
+    const ptrdiff_t K_stride_m;
+    const ptrdiff_t K_stride_g;
+    const ptrdiff_t K_stride_h;
+    const ptrdiff_t O_stride_split;
+    const int32_t Q_size_m;
+    const int32_t Q_size_g;
+    const int32_t Q_size_h;
+    const int32_t Q_size_k;
+    const int32_t K_size_m;
+    const bool multiquery;
+    const float qk_scale;
+    const int32_t split_k;
+
+    const dim3 grid_dim;
+    const dim3 block_dim;
+    const size_t lds_bytes;
+
+    Argument(
+        const scalar_t* __restrict__ XQ,
+        const scalar_t* __restrict__ cache_K,
+        const scalar_t* __restrict__ cache_V,
+        scalar_t* __restrict__ O,
+        scalar_t* __restrict__ split_O,
+        compute_t* __restrict__ split_max,
+        compute_t* __restrict__ split_sumexp,
+        const int32_t* __restrict__ seq_kv_lens,
+        const ptrdiff_t XQ_stride_b,
+        const ptrdiff_t XQ_stride_m,
+        const ptrdiff_t XQ_stride_g,
+        const ptrdiff_t XQ_stride_h,
+        const ptrdiff_t K_stride_b,
+        const ptrdiff_t K_stride_m,
+        const ptrdiff_t K_stride_g,
+        const ptrdiff_t K_stride_h,
+        const ptrdiff_t O_stride_split,
+        const int32_t Q_size_m,
+        const int32_t Q_size_g,
+        const int32_t Q_size_h,
+        const int32_t Q_size_k,
+        const int32_t K_size_m,
+        const bool multiquery,
+        const float qk_scale,
+        const int32_t split_k,
+        // launch params
+        const dim3 grid_dim,
+        const dim3 block_dim,
+        const size_t lds_bytes)
+        : XQ(XQ),
+          cache_K(cache_K),
+          cache_V(cache_V),
+          O(O),
+          split_O(split_O),
+          split_max(split_max),
+          split_sumexp(split_sumexp),
+          seq_kv_lens(seq_kv_lens),
+          XQ_stride_b(XQ_stride_b),
+          XQ_stride_m(XQ_stride_m),
+          XQ_stride_g(XQ_stride_g),
+          XQ_stride_h(XQ_stride_h),
+          K_stride_b(K_stride_b),
+          K_stride_m(K_stride_m),
+          K_stride_g(K_stride_g),
+          K_stride_h(K_stride_h),
+          O_stride_split(O_stride_split),
+          Q_size_m(Q_size_m),
+          Q_size_g(Q_size_g),
+          Q_size_h(Q_size_h),
+          Q_size_k(Q_size_k),
+          K_size_m(K_size_m),
+          multiquery(multiquery),
+          qk_scale(qk_scale),
+          split_k(split_k),
+          // launch params
+          grid_dim(grid_dim),
+          block_dim(block_dim),
+          lds_bytes(lds_bytes) {}
+
+    std::string str() const {
+      std::ostringstream oss;
+      oss << "Argument { " << std::endl
+          << "    XQ: " << XQ << std::endl
+          << "    cache_K: " << cache_K << std::endl
+          << "    cache_V: " << cache_V << std::endl
+          << "    O: " << O << std::endl
+          << "    split_O: " << split_O << std::endl
+          << "    split_max: " << split_max << std::endl
+          << "    split_sumexp: " << split_sumexp << std::endl
+          << "    seq_kv_lens: " << seq_kv_lens << std::endl
+          << "    XQ_stride_b: " << XQ_stride_b << std::endl
+          << "    XQ_stride_m: " << XQ_stride_m << std::endl
+          << "    XQ_stride_g: " << XQ_stride_g << std::endl
+          << "    XQ_stride_h: " << XQ_stride_h << std::endl
+          << "    K_stride_b: " << K_stride_b << std::endl
+          << "    K_stride_m: " << K_stride_m << std::endl
+          << "    K_stride_g: " << K_stride_g << std::endl
+          << "    K_stride_h: " << K_stride_h << std::endl
+          << "    O_stride_split: " << O_stride_split << std::endl
+          << "    Q_size_m: " << Q_size_m << std::endl
+          << "    Q_size_g: " << Q_size_g << std::endl
+          << "    Q_size_h: " << Q_size_h << std::endl
+          << "    Q_size_k: " << Q_size_k << std::endl
+          << "    K_size_m: " << K_size_m << std::endl
+          << "    multiquery: " << multiquery << std::endl
+          << "    qk_scale: " << qk_scale << std::endl
+          << "    split_k: " << split_k << std::endl
+          << std::endl
+          << "    grid_dim: " << grid_dim.x << "." << grid_dim.y << "."
+          << grid_dim.z << std::endl
+          << "    block_dim: " << block_dim.x << "." << block_dim.y << "."
+          << block_dim.z << std::endl
+          << "    lds_bytes: " << lds_bytes << std::endl
+          << "}";
+      return oss.str();
+    }
+  };
+
+  struct Invoker : public BaseInvoker {
+    using Argument = DeviceOp::Argument;
+    float Run(
+        const Argument& arg,
+        const StreamConfig& stream_config = StreamConfig{}) {
+      auto threads_per_wavefront = arg.block_dim.x;
+      auto Q_size_k_alignment_necessary = 0;
+
+      for (auto vec_size : {4, 2, 1}) {
+        if (arg.Q_size_k <= vec_size * threads_per_wavefront) {
+          Q_size_k_alignment_necessary = vec_size;
         }
-    };
+      }
+
+      if (!Q_size_k_alignment_necessary) {
+        throw std::runtime_error("Unsupported Q_size_k");
+      }
+
+      if (arg.Q_size_k % Q_size_k_alignment_necessary) {
+        throw std::runtime_error("Unsupported alignment for Q_size_k");
+      }
+
+      float split_attention_result = launch_and_time_kernel(
+          stream_config,
+          Q_size_k_alignment_necessary == 4
+              ? efficient_attention_forward_decoder_splitk_ck_kernel<
+                    scalar_t,
+                    4>
+              : Q_size_k_alignment_necessary == 2
+                  ? efficient_attention_forward_decoder_splitk_ck_kernel<
+                        scalar_t,
+                        2>
+                  : Q_size_k_alignment_necessary == 1
+                      ? efficient_attention_forward_decoder_splitk_ck_kernel<
+                            scalar_t,
+                            1>
+                      : nullptr,
+          arg.grid_dim,
+          arg.block_dim,
+          arg.lds_bytes,
+          arg.XQ,
+          arg.cache_K,
+          arg.cache_V,
+          arg.split_O,
+          arg.split_max,
+          arg.split_sumexp,
+          arg.seq_kv_lens,
+          arg.XQ_stride_b,
+          arg.XQ_stride_m,
+          arg.XQ_stride_g,
+          arg.XQ_stride_h,
+          arg.K_stride_b,
+          arg.K_stride_m,
+          arg.K_stride_g,
+          arg.K_stride_h,
+          arg.O_stride_split,
+          arg.Q_size_m,
+          arg.Q_size_g,
+          arg.Q_size_h,
+          arg.Q_size_k,
+          arg.K_size_m,
+          arg.multiquery,
+          arg.qk_scale,
+          arg.split_k);
+
+      const dim3 reduce_gridsize = {arg.grid_dim.x};
+      const dim3 reduce_blocksize = {arg.block_dim.x};
+      constexpr int32_t reduce_lds_bytes = 0;
+      float reduce_result = launch_and_time_kernel(
+          stream_config,
+          Q_size_k_alignment_necessary == 4
+              ? efficient_attention_forward_decoder_splitk_reduce_ck_kernel<
+                    scalar_t,
+                    4>
+              : Q_size_k_alignment_necessary == 2
+                  ? efficient_attention_forward_decoder_splitk_reduce_ck_kernel<
+                        scalar_t,
+                        2>
+                  : Q_size_k_alignment_necessary == 1
+                      ? efficient_attention_forward_decoder_splitk_reduce_ck_kernel<
+                            scalar_t,
+                            1>
+                      : nullptr,
+          reduce_gridsize,
+          reduce_blocksize,
+          reduce_lds_bytes,
+          arg.split_O,
+          arg.split_max,
+          arg.split_sumexp,
+          arg.O,
+          arg.Q_size_m,
+          arg.Q_size_g,
+          arg.Q_size_h,
+          arg.Q_size_k,
+          arg.O_stride_split,
+          arg.XQ_stride_b,
+          arg.XQ_stride_m,
+          arg.XQ_stride_g,
+          arg.XQ_stride_h,
+          arg.split_k);
+      return split_attention_result + reduce_result;
+    }
+  };
 };
 } // namespace device
 } // namespace tensor_operation
diff --git a/xformers/csrc/attention/hip_fmha/ck_bool_switch.h b/xformers/csrc/attention/hip_fmha/ck_bool_switch.h
index 4b92dd95a..1a062d3e3 100644
--- a/xformers/csrc/attention/hip_fmha/ck_bool_switch.h
+++ b/xformers/csrc/attention/hip_fmha/ck_bool_switch.h
@@ -6,30 +6,24 @@
  */
 #pragma once
 
-#define BOOL_SWITCH_1(COND1, CONST_NAME1, ...)  \
-    [&] {                                       \
-        if(COND1)                               \
-        {                                       \
-            constexpr bool CONST_NAME1 = true;  \
-            __VA_ARGS__();                      \
-        }                                       \
-        else                                    \
-        {                                       \
-            constexpr bool CONST_NAME1 = false; \
-            __VA_ARGS__();                      \
-        }                                       \
-    }()
+#define BOOL_SWITCH_1(COND1, CONST_NAME1, ...) \
+  [&] {                                        \
+    if (COND1) {                               \
+      constexpr bool CONST_NAME1 = true;       \
+      __VA_ARGS__();                           \
+    } else {                                   \
+      constexpr bool CONST_NAME1 = false;      \
+      __VA_ARGS__();                           \
+    }                                          \
+  }()
 
 #define BOOL_SWITCH_2(COND1, CONST_NAME1, COND2, CONST_NAME2, ...) \
-    [&] {                                                          \
-        if(COND1)                                                  \
-        {                                                          \
-            constexpr bool CONST_NAME1 = true;                     \
-            BOOL_SWITCH_1(COND2, CONST_NAME2, ##__VA_ARGS__);      \
-        }                                                          \
-        else                                                       \
-        {                                                          \
-            constexpr bool CONST_NAME1 = false;                    \
-            BOOL_SWITCH_1(COND2, CONST_NAME2, ##__VA_ARGS__);      \
-        }                                                          \
-    }()
+  [&] {                                                            \
+    if (COND1) {                                                   \
+      constexpr bool CONST_NAME1 = true;                           \
+      BOOL_SWITCH_1(COND2, CONST_NAME2, ##__VA_ARGS__);            \
+    } else {                                                       \
+      constexpr bool CONST_NAME1 = false;                          \
+      BOOL_SWITCH_1(COND2, CONST_NAME2, ##__VA_ARGS__);            \
+    }                                                              \
+  }()
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_backward_gemm_constants.h b/xformers/csrc/attention/hip_fmha/ck_fmha_backward_gemm_constants.h
index b7de4dbf8..49122fd74 100644
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_backward_gemm_constants.h
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_backward_gemm_constants.h
@@ -11,190 +11,186 @@
 
 // list the template parameters that will not be tuned,
 // the commented lines gives the tunable template parameters
-struct GemmOpConstantsBatchedBackward_V1
-{
-    static constexpr ck::index_t NumGemmKPrefetchStage = 1;
-    static constexpr ck::index_t BlockSize             = 256;
-    static constexpr ck::index_t MPerBlock             = 128;
-    static constexpr ck::index_t NPerBlock             = 128;
-    // static constexpr ck::index_t KPerBlock;
-    // static constexpr ck::index_t Gemm1NPerBlock;
-    static constexpr ck::index_t Gemm1KPerBlock = 32;
-    static constexpr ck::index_t Gemm2KPerBlock = 32;
-    static constexpr ck::index_t AK1            = 8;
-    static constexpr ck::index_t BK1            = 8;
-    static constexpr ck::index_t B1K1           = 2;
-    static constexpr ck::index_t MPerXDL        = 32;
-    static constexpr ck::index_t NPerXDL        = 32;
-    static constexpr ck::index_t MXdlPerWave    = 4;
-    static constexpr ck::index_t NXdlPerWave    = 1;
-    // static constexpr ck::index_t Gemm1NXdlPerWave;
-    static constexpr ck::index_t Gemm2NXdlPerWave           = 1;
-    using ABlockTransferThreadClusterLengths_AK0_M_AK1      = S<4, 64, 1>;
-    using ABlockTransferThreadClusterArrangeOrder           = S<1, 0, 2>;
-    using ABlockTransferSrcAccessOrder                      = S<1, 0, 2>;
-    static constexpr ck::index_t ABlockTransferSrcVectorDim = 2;
-    // static constexpr ck::index_t ABlockTransferSrcScalarPerVector;
-    static constexpr ck::index_t ABlockTransferDstScalarPerVector_AK1 = 8;
-    static constexpr bool ABlockLdsExtraM                             = true;
-    using BBlockTransferThreadClusterLengths_BK0_N_BK1                = S<4, 64, 1>;
-    using BBlockTransferThreadClusterArrangeOrder                     = S<1, 0, 2>;
-    using BBlockTransferSrcAccessOrder                                = S<1, 0, 2>;
-    static constexpr ck::index_t BBlockTransferSrcVectorDim           = 2;
-    // static constexpr ck::index_t BBlockTransferSrcScalarPerVector;
-    static constexpr ck::index_t BBlockTransferDstScalarPerVector_BK1 = 8;
-    static constexpr bool BBlockLdsExtraN                             = true;
-    // static constexpr ck::index_t Acc0BiasTransferSrcScalarPerVector;
-    static constexpr ck::index_t CShuffleMXdlPerWavePerShuffle = 1;
-    // static constexpr ck::index_t CShuffleNXdlPerWavePerShuffle;
-    // using
-    // CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock;
-    // static constexpr ck::index_t
-    // CShuffleBlockTransferScalarPerVector_NPerBlock;
+struct GemmOpConstantsBatchedBackward_V1 {
+  static constexpr ck::index_t NumGemmKPrefetchStage = 1;
+  static constexpr ck::index_t BlockSize = 256;
+  static constexpr ck::index_t MPerBlock = 128;
+  static constexpr ck::index_t NPerBlock = 128;
+  // static constexpr ck::index_t KPerBlock;
+  // static constexpr ck::index_t Gemm1NPerBlock;
+  static constexpr ck::index_t Gemm1KPerBlock = 32;
+  static constexpr ck::index_t Gemm2KPerBlock = 32;
+  static constexpr ck::index_t AK1 = 8;
+  static constexpr ck::index_t BK1 = 8;
+  static constexpr ck::index_t B1K1 = 2;
+  static constexpr ck::index_t MPerXDL = 32;
+  static constexpr ck::index_t NPerXDL = 32;
+  static constexpr ck::index_t MXdlPerWave = 4;
+  static constexpr ck::index_t NXdlPerWave = 1;
+  // static constexpr ck::index_t Gemm1NXdlPerWave;
+  static constexpr ck::index_t Gemm2NXdlPerWave = 1;
+  using ABlockTransferThreadClusterLengths_AK0_M_AK1 = S<4, 64, 1>;
+  using ABlockTransferThreadClusterArrangeOrder = S<1, 0, 2>;
+  using ABlockTransferSrcAccessOrder = S<1, 0, 2>;
+  static constexpr ck::index_t ABlockTransferSrcVectorDim = 2;
+  // static constexpr ck::index_t ABlockTransferSrcScalarPerVector;
+  static constexpr ck::index_t ABlockTransferDstScalarPerVector_AK1 = 8;
+  static constexpr bool ABlockLdsExtraM = true;
+  using BBlockTransferThreadClusterLengths_BK0_N_BK1 = S<4, 64, 1>;
+  using BBlockTransferThreadClusterArrangeOrder = S<1, 0, 2>;
+  using BBlockTransferSrcAccessOrder = S<1, 0, 2>;
+  static constexpr ck::index_t BBlockTransferSrcVectorDim = 2;
+  // static constexpr ck::index_t BBlockTransferSrcScalarPerVector;
+  static constexpr ck::index_t BBlockTransferDstScalarPerVector_BK1 = 8;
+  static constexpr bool BBlockLdsExtraN = true;
+  // static constexpr ck::index_t Acc0BiasTransferSrcScalarPerVector;
+  static constexpr ck::index_t CShuffleMXdlPerWavePerShuffle = 1;
+  // static constexpr ck::index_t CShuffleNXdlPerWavePerShuffle;
+  // using
+  // CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock;
+  // static constexpr ck::index_t
+  // CShuffleBlockTransferScalarPerVector_NPerBlock;
 };
 
 // list the template parameters that will not be tuned,
 // the commented lines gives the tunable template parameters
-struct GemmOpConstantsBatchedBackward_V2
-{
-    static constexpr ck::index_t NumGemmKPrefetchStage = 1;
-    static constexpr ck::index_t BlockSize             = 256;
-    static constexpr ck::index_t MPerBlock             = 64;
-    static constexpr ck::index_t NPerBlock             = 128;
-    static constexpr ck::index_t KPerBlock             = 128;
-    // static constexpr ck::index_t Gemm1NPerBlock;
-    static constexpr ck::index_t Gemm1KPerBlock = 32;
-    static constexpr ck::index_t Gemm2KPerBlock = 64;
-    static constexpr ck::index_t AK1            = 8;
-    static constexpr ck::index_t BK1            = 8;
-    static constexpr ck::index_t B1K1           = 2;
-    static constexpr ck::index_t MPerXDL        = 32;
-    static constexpr ck::index_t NPerXDL        = 32;
-    static constexpr ck::index_t MXdlPerWave    = 2;
-    static constexpr ck::index_t NXdlPerWave    = 1;
-    // static constexpr ck::index_t Gemm1NXdlPerWave;
-    static constexpr ck::index_t Gemm2NXdlPerWave           = 1;
-    using ABlockTransferThreadClusterLengths_AK0_M_AK1      = S<4, 64, 1>;
-    using ABlockTransferThreadClusterArrangeOrder           = S<1, 0, 2>;
-    using ABlockTransferSrcAccessOrder                      = S<1, 0, 2>;
-    static constexpr ck::index_t ABlockTransferSrcVectorDim = 2;
-    // static constexpr ck::index_t ABlockTransferSrcScalarPerVector;
-    static constexpr ck::index_t ABlockTransferDstScalarPerVector_AK1 = 8;
-    static constexpr bool ABlockLdsExtraM                             = true;
-    using BBlockTransferThreadClusterLengths_BK0_N_BK1                = S<4, 64, 1>;
-    using BBlockTransferThreadClusterArrangeOrder                     = S<1, 0, 2>;
-    using BBlockTransferSrcAccessOrder                                = S<1, 0, 2>;
-    static constexpr ck::index_t BBlockTransferSrcVectorDim           = 2;
-    // static constexpr ck::index_t BBlockTransferSrcScalarPerVector;
-    static constexpr ck::index_t BBlockTransferDstScalarPerVector_BK1 = 8;
-    static constexpr bool BBlockLdsExtraN                             = true;
-    // static constexpr ck::index_t Acc0BiasTransferSrcScalarPerVector;
-    using B1BlockTransferThreadClusterLengths_BK0_N_BK1      = S<8, 32, 1>;
-    using B1BlockTransferThreadClusterArrangeOrder           = S<0, 2, 1>;
-    using B1BlockTransferSrcAccessOrder                      = S<0, 2, 1>;
-    static constexpr ck::index_t B1BlockTransferSrcVectorDim = 1;
-    // static constexpr ck::index_t B1BlockTransferSrcScalarPerVector;
-    static constexpr ck::index_t B1BlockTransferDstScalarPerVector_BK1 = 2;
-    static constexpr bool B1BlockLdsExtraN                             = false;
-    static constexpr ck::index_t CShuffleMXdlPerWavePerShuffle         = 1;
-    // static constexpr ck::index_t CShuffleNXdlPerWavePerShuffle;
-    // using
-    // CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock;
-    // static constexpr ck::index_t
-    // CShuffleBlockTransferScalarPerVector_NPerBlock;
+struct GemmOpConstantsBatchedBackward_V2 {
+  static constexpr ck::index_t NumGemmKPrefetchStage = 1;
+  static constexpr ck::index_t BlockSize = 256;
+  static constexpr ck::index_t MPerBlock = 64;
+  static constexpr ck::index_t NPerBlock = 128;
+  static constexpr ck::index_t KPerBlock = 128;
+  // static constexpr ck::index_t Gemm1NPerBlock;
+  static constexpr ck::index_t Gemm1KPerBlock = 32;
+  static constexpr ck::index_t Gemm2KPerBlock = 64;
+  static constexpr ck::index_t AK1 = 8;
+  static constexpr ck::index_t BK1 = 8;
+  static constexpr ck::index_t B1K1 = 2;
+  static constexpr ck::index_t MPerXDL = 32;
+  static constexpr ck::index_t NPerXDL = 32;
+  static constexpr ck::index_t MXdlPerWave = 2;
+  static constexpr ck::index_t NXdlPerWave = 1;
+  // static constexpr ck::index_t Gemm1NXdlPerWave;
+  static constexpr ck::index_t Gemm2NXdlPerWave = 1;
+  using ABlockTransferThreadClusterLengths_AK0_M_AK1 = S<4, 64, 1>;
+  using ABlockTransferThreadClusterArrangeOrder = S<1, 0, 2>;
+  using ABlockTransferSrcAccessOrder = S<1, 0, 2>;
+  static constexpr ck::index_t ABlockTransferSrcVectorDim = 2;
+  // static constexpr ck::index_t ABlockTransferSrcScalarPerVector;
+  static constexpr ck::index_t ABlockTransferDstScalarPerVector_AK1 = 8;
+  static constexpr bool ABlockLdsExtraM = true;
+  using BBlockTransferThreadClusterLengths_BK0_N_BK1 = S<4, 64, 1>;
+  using BBlockTransferThreadClusterArrangeOrder = S<1, 0, 2>;
+  using BBlockTransferSrcAccessOrder = S<1, 0, 2>;
+  static constexpr ck::index_t BBlockTransferSrcVectorDim = 2;
+  // static constexpr ck::index_t BBlockTransferSrcScalarPerVector;
+  static constexpr ck::index_t BBlockTransferDstScalarPerVector_BK1 = 8;
+  static constexpr bool BBlockLdsExtraN = true;
+  // static constexpr ck::index_t Acc0BiasTransferSrcScalarPerVector;
+  using B1BlockTransferThreadClusterLengths_BK0_N_BK1 = S<8, 32, 1>;
+  using B1BlockTransferThreadClusterArrangeOrder = S<0, 2, 1>;
+  using B1BlockTransferSrcAccessOrder = S<0, 2, 1>;
+  static constexpr ck::index_t B1BlockTransferSrcVectorDim = 1;
+  // static constexpr ck::index_t B1BlockTransferSrcScalarPerVector;
+  static constexpr ck::index_t B1BlockTransferDstScalarPerVector_BK1 = 2;
+  static constexpr bool B1BlockLdsExtraN = false;
+  static constexpr ck::index_t CShuffleMXdlPerWavePerShuffle = 1;
+  // static constexpr ck::index_t CShuffleNXdlPerWavePerShuffle;
+  // using
+  // CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock;
+  // static constexpr ck::index_t
+  // CShuffleBlockTransferScalarPerVector_NPerBlock;
 };
 
 // list the template parameters that will not be tuned,
 // the commented lines gives the tunable template parameters
-struct GemmOpConstantsGroupedBackward_V1
-{
-    static constexpr ck::index_t NumGemmKPrefetchStage = 1;
-    static constexpr ck::index_t BlockSize             = 256;
-    static constexpr ck::index_t MPerBlock             = 128;
-    static constexpr ck::index_t NPerBlock             = 128;
-    // static constexpr ck::index_t KPerBlock;
-    // static constexpr ck::index_t Gemm1NPerBlock;
-    static constexpr ck::index_t Gemm1KPerBlock = 32;
-    static constexpr ck::index_t Gemm2KPerBlock = 32;
-    static constexpr ck::index_t AK1            = 8;
-    static constexpr ck::index_t BK1            = 8;
-    static constexpr ck::index_t B1K1           = 2;
-    static constexpr ck::index_t MPerXDL        = 32;
-    static constexpr ck::index_t NPerXDL        = 32;
-    static constexpr ck::index_t MXdlPerWave    = 4;
-    static constexpr ck::index_t NXdlPerWave    = 1;
-    // static constexpr ck::index_t Gemm1NXdlPerWave;
-    static constexpr ck::index_t Gemm2NXdlPerWave           = 1;
-    using ABlockTransferThreadClusterLengths_AK0_M_AK1      = S<4, 64, 1>;
-    using ABlockTransferThreadClusterArrangeOrder           = S<1, 0, 2>;
-    using ABlockTransferSrcAccessOrder                      = S<1, 0, 2>;
-    static constexpr ck::index_t ABlockTransferSrcVectorDim = 2;
-    // static constexpr ck::index_t ABlockTransferSrcScalarPerVector;
-    static constexpr ck::index_t ABlockTransferDstScalarPerVector_AK1 = 8;
-    static constexpr bool ABlockLdsExtraM                             = true;
-    using BBlockTransferThreadClusterLengths_BK0_N_BK1                = S<4, 64, 1>;
-    using BBlockTransferThreadClusterArrangeOrder                     = S<1, 0, 2>;
-    using BBlockTransferSrcAccessOrder                                = S<1, 0, 2>;
-    static constexpr ck::index_t BBlockTransferSrcVectorDim           = 2;
-    // static constexpr ck::index_t BBlockTransferSrcScalarPerVector;
-    static constexpr ck::index_t BBlockTransferDstScalarPerVector_BK1 = 8;
-    static constexpr bool BBlockLdsExtraN                             = true;
-    // static constexpr ck::index_t Acc0BiasTransferSrcScalarPerVector;
-    static constexpr ck::index_t CShuffleMXdlPerWavePerShuffle = 1;
-    // static constexpr ck::index_t CShuffleNXdlPerWavePerShuffle;
-    // using
-    // CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock;
-    // static constexpr ck::index_t
-    // CShuffleBlockTransferScalarPerVector_NPerBlock;
+struct GemmOpConstantsGroupedBackward_V1 {
+  static constexpr ck::index_t NumGemmKPrefetchStage = 1;
+  static constexpr ck::index_t BlockSize = 256;
+  static constexpr ck::index_t MPerBlock = 128;
+  static constexpr ck::index_t NPerBlock = 128;
+  // static constexpr ck::index_t KPerBlock;
+  // static constexpr ck::index_t Gemm1NPerBlock;
+  static constexpr ck::index_t Gemm1KPerBlock = 32;
+  static constexpr ck::index_t Gemm2KPerBlock = 32;
+  static constexpr ck::index_t AK1 = 8;
+  static constexpr ck::index_t BK1 = 8;
+  static constexpr ck::index_t B1K1 = 2;
+  static constexpr ck::index_t MPerXDL = 32;
+  static constexpr ck::index_t NPerXDL = 32;
+  static constexpr ck::index_t MXdlPerWave = 4;
+  static constexpr ck::index_t NXdlPerWave = 1;
+  // static constexpr ck::index_t Gemm1NXdlPerWave;
+  static constexpr ck::index_t Gemm2NXdlPerWave = 1;
+  using ABlockTransferThreadClusterLengths_AK0_M_AK1 = S<4, 64, 1>;
+  using ABlockTransferThreadClusterArrangeOrder = S<1, 0, 2>;
+  using ABlockTransferSrcAccessOrder = S<1, 0, 2>;
+  static constexpr ck::index_t ABlockTransferSrcVectorDim = 2;
+  // static constexpr ck::index_t ABlockTransferSrcScalarPerVector;
+  static constexpr ck::index_t ABlockTransferDstScalarPerVector_AK1 = 8;
+  static constexpr bool ABlockLdsExtraM = true;
+  using BBlockTransferThreadClusterLengths_BK0_N_BK1 = S<4, 64, 1>;
+  using BBlockTransferThreadClusterArrangeOrder = S<1, 0, 2>;
+  using BBlockTransferSrcAccessOrder = S<1, 0, 2>;
+  static constexpr ck::index_t BBlockTransferSrcVectorDim = 2;
+  // static constexpr ck::index_t BBlockTransferSrcScalarPerVector;
+  static constexpr ck::index_t BBlockTransferDstScalarPerVector_BK1 = 8;
+  static constexpr bool BBlockLdsExtraN = true;
+  // static constexpr ck::index_t Acc0BiasTransferSrcScalarPerVector;
+  static constexpr ck::index_t CShuffleMXdlPerWavePerShuffle = 1;
+  // static constexpr ck::index_t CShuffleNXdlPerWavePerShuffle;
+  // using
+  // CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock;
+  // static constexpr ck::index_t
+  // CShuffleBlockTransferScalarPerVector_NPerBlock;
 };
 
 // list the template parameters that will not be tuned,
 // the commented lines gives the tunable template parameters
-struct GemmOpConstantsGroupedBackward_V2
-{
-    static constexpr ck::index_t NumGemmKPrefetchStage = 1;
-    static constexpr ck::index_t BlockSize             = 256;
-    static constexpr ck::index_t MPerBlock             = 64;
-    static constexpr ck::index_t NPerBlock             = 128;
-    static constexpr ck::index_t KPerBlock             = 128;
-    // static constexpr ck::index_t Gemm1NPerBlock;
-    static constexpr ck::index_t Gemm1KPerBlock = 32;
-    static constexpr ck::index_t Gemm2KPerBlock = 64;
-    static constexpr ck::index_t AK1            = 8;
-    static constexpr ck::index_t BK1            = 8;
-    static constexpr ck::index_t B1K1           = 2;
-    static constexpr ck::index_t MPerXDL        = 32;
-    static constexpr ck::index_t NPerXDL        = 32;
-    static constexpr ck::index_t MXdlPerWave    = 2;
-    static constexpr ck::index_t NXdlPerWave    = 1;
-    // static constexpr ck::index_t Gemm1NXdlPerWave;
-    static constexpr ck::index_t Gemm2NXdlPerWave           = 1;
-    using ABlockTransferThreadClusterLengths_AK0_M_AK1      = S<4, 64, 1>;
-    using ABlockTransferThreadClusterArrangeOrder           = S<1, 0, 2>;
-    using ABlockTransferSrcAccessOrder                      = S<1, 0, 2>;
-    static constexpr ck::index_t ABlockTransferSrcVectorDim = 2;
-    // static constexpr ck::index_t ABlockTransferSrcScalarPerVector;
-    static constexpr ck::index_t ABlockTransferDstScalarPerVector_AK1 = 8;
-    static constexpr bool ABlockLdsExtraM                             = true;
-    using BBlockTransferThreadClusterLengths_BK0_N_BK1                = S<4, 64, 1>;
-    using BBlockTransferThreadClusterArrangeOrder                     = S<1, 0, 2>;
-    using BBlockTransferSrcAccessOrder                                = S<1, 0, 2>;
-    static constexpr ck::index_t BBlockTransferSrcVectorDim           = 2;
-    // static constexpr ck::index_t BBlockTransferSrcScalarPerVector;
-    static constexpr ck::index_t BBlockTransferDstScalarPerVector_BK1 = 8;
-    static constexpr bool BBlockLdsExtraN                             = true;
-    // static constexpr ck::index_t Acc0BiasTransferSrcScalarPerVector;
-    using B1BlockTransferThreadClusterLengths_BK0_N_BK1      = S<8, 32, 1>;
-    using B1BlockTransferThreadClusterArrangeOrder           = S<0, 2, 1>;
-    using B1BlockTransferSrcAccessOrder                      = S<0, 2, 1>;
-    static constexpr ck::index_t B1BlockTransferSrcVectorDim = 1;
-    // static constexpr ck::index_t B1BlockTransferSrcScalarPerVector;
-    static constexpr ck::index_t B1BlockTransferDstScalarPerVector_BK1 = 2;
-    static constexpr bool B1BlockLdsExtraN                             = false;
-    static constexpr ck::index_t CShuffleMXdlPerWavePerShuffle         = 1;
-    // static constexpr ck::index_t CShuffleNXdlPerWavePerShuffle;
-    // using
-    // CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock;
-    // static constexpr ck::index_t
-    // CShuffleBlockTransferScalarPerVector_NPerBlock;
+struct GemmOpConstantsGroupedBackward_V2 {
+  static constexpr ck::index_t NumGemmKPrefetchStage = 1;
+  static constexpr ck::index_t BlockSize = 256;
+  static constexpr ck::index_t MPerBlock = 64;
+  static constexpr ck::index_t NPerBlock = 128;
+  static constexpr ck::index_t KPerBlock = 128;
+  // static constexpr ck::index_t Gemm1NPerBlock;
+  static constexpr ck::index_t Gemm1KPerBlock = 32;
+  static constexpr ck::index_t Gemm2KPerBlock = 64;
+  static constexpr ck::index_t AK1 = 8;
+  static constexpr ck::index_t BK1 = 8;
+  static constexpr ck::index_t B1K1 = 2;
+  static constexpr ck::index_t MPerXDL = 32;
+  static constexpr ck::index_t NPerXDL = 32;
+  static constexpr ck::index_t MXdlPerWave = 2;
+  static constexpr ck::index_t NXdlPerWave = 1;
+  // static constexpr ck::index_t Gemm1NXdlPerWave;
+  static constexpr ck::index_t Gemm2NXdlPerWave = 1;
+  using ABlockTransferThreadClusterLengths_AK0_M_AK1 = S<4, 64, 1>;
+  using ABlockTransferThreadClusterArrangeOrder = S<1, 0, 2>;
+  using ABlockTransferSrcAccessOrder = S<1, 0, 2>;
+  static constexpr ck::index_t ABlockTransferSrcVectorDim = 2;
+  // static constexpr ck::index_t ABlockTransferSrcScalarPerVector;
+  static constexpr ck::index_t ABlockTransferDstScalarPerVector_AK1 = 8;
+  static constexpr bool ABlockLdsExtraM = true;
+  using BBlockTransferThreadClusterLengths_BK0_N_BK1 = S<4, 64, 1>;
+  using BBlockTransferThreadClusterArrangeOrder = S<1, 0, 2>;
+  using BBlockTransferSrcAccessOrder = S<1, 0, 2>;
+  static constexpr ck::index_t BBlockTransferSrcVectorDim = 2;
+  // static constexpr ck::index_t BBlockTransferSrcScalarPerVector;
+  static constexpr ck::index_t BBlockTransferDstScalarPerVector_BK1 = 8;
+  static constexpr bool BBlockLdsExtraN = true;
+  // static constexpr ck::index_t Acc0BiasTransferSrcScalarPerVector;
+  using B1BlockTransferThreadClusterLengths_BK0_N_BK1 = S<8, 32, 1>;
+  using B1BlockTransferThreadClusterArrangeOrder = S<0, 2, 1>;
+  using B1BlockTransferSrcAccessOrder = S<0, 2, 1>;
+  static constexpr ck::index_t B1BlockTransferSrcVectorDim = 1;
+  // static constexpr ck::index_t B1BlockTransferSrcScalarPerVector;
+  static constexpr ck::index_t B1BlockTransferDstScalarPerVector_BK1 = 2;
+  static constexpr bool B1BlockLdsExtraN = false;
+  static constexpr ck::index_t CShuffleMXdlPerWavePerShuffle = 1;
+  // static constexpr ck::index_t CShuffleNXdlPerWavePerShuffle;
+  // using
+  // CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock;
+  // static constexpr ck::index_t
+  // CShuffleBlockTransferScalarPerVector_NPerBlock;
 };
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_backward.h b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_backward.h
index 3c5fdffc2..d0cccf2b3 100644
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_backward.h
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_backward.h
@@ -22,56 +22,60 @@
 #include "ck_fmha_op_helper.h"
 #include "ck_fmha_params.h"
 
-template <typename scalar_t, int32_t custom_mask_type, bool has_attn_bias, bool use_fp32_qkv_grad>
-struct batched_backward_masktype_attnbias_dispatched
-{
-    using PassThrough = ck::tensor_operation::element_wise::PassThrough;
-    using Scale       = ck::tensor_operation::element_wise::Scale;
-
-    using QKVElementOp = PassThrough;
-    using YElementOp   = PassThrough;
-
-    using InputDataType    = scalar_t;
-    using OutputDataType   = typename std::conditional<use_fp32_qkv_grad, F32, scalar_t>::type;
-    using GemmDataType     = scalar_t;
-    using AccDataType      = F32;
-    using ShuffleDataType  = F32;
-    using LSEDataType      = F32;
-    using ZDataType        = unsigned short;
-    using Acc0BiasDataType = typename std::conditional<has_attn_bias, scalar_t, void>::type;
-    using Acc1BiasDataType = void;
-
-    static constexpr auto GemmSpec = ck::tensor_operation::device::GemmSpecialization::MNKOPadding;
-    static constexpr auto MaskingSpec =
-        static_cast<ck::tensor_operation::device::MaskingSpecialization>(custom_mask_type);
-
-    static constexpr bool Deterministic = true;
-
-    static constexpr ck::index_t kAcc0BiasTransferSrcScalarPerVector = 1;
+template <
+    typename scalar_t,
+    int32_t custom_mask_type,
+    bool has_attn_bias,
+    bool use_fp32_qkv_grad>
+struct batched_backward_masktype_attnbias_dispatched {
+  using PassThrough = ck::tensor_operation::element_wise::PassThrough;
+  using Scale = ck::tensor_operation::element_wise::Scale;
+
+  using QKVElementOp = PassThrough;
+  using YElementOp = PassThrough;
+
+  using InputDataType = scalar_t;
+  using OutputDataType =
+      typename std::conditional<use_fp32_qkv_grad, F32, scalar_t>::type;
+  using GemmDataType = scalar_t;
+  using AccDataType = F32;
+  using ShuffleDataType = F32;
+  using LSEDataType = F32;
+  using ZDataType = unsigned short;
+  using Acc0BiasDataType =
+      typename std::conditional<has_attn_bias, scalar_t, void>::type;
+  using Acc1BiasDataType = void;
+
+  static constexpr auto GemmSpec =
+      ck::tensor_operation::device::GemmSpecialization::MNKOPadding;
+  static constexpr auto MaskingSpec =
+      static_cast<ck::tensor_operation::device::MaskingSpecialization>(
+          custom_mask_type);
+
+  static constexpr bool Deterministic = true;
+
+  static constexpr ck::index_t kAcc0BiasTransferSrcScalarPerVector = 1;
 
 #ifndef BATCHED_BACKWARD_V1_HEADDIM_SWITCH
-#define BATCHED_BACKWARD_V1_HEADDIM_SWITCH(HEAD_DIM1, HEAD_DIM2, ...)              \
-    [&] {                                                                          \
-        if(HEAD_DIM1 <= 32 && HEAD_DIM2 <= 32)                                     \
-        {                                                                          \
-            constexpr ck::index_t kGemm1NPerBlock                = 32;             \
-            constexpr ck::index_t kGemm1NXdlPerWave              = 1;              \
-            constexpr ck::index_t kCShuffleNXdlPerWavePerShuffle = 1;              \
-            using kCShuffleBlockTransferClusterLengths           = S<1, 64, 1, 4>; \
-            __VA_ARGS__();                                                         \
-        }                                                                          \
-        else                                                                       \
-        {                                                                          \
-            constexpr ck::index_t kGemm1NPerBlock                = 64;             \
-            constexpr ck::index_t kGemm1NXdlPerWave              = 2;              \
-            constexpr ck::index_t kCShuffleNXdlPerWavePerShuffle = 2;              \
-            using kCShuffleBlockTransferClusterLengths           = S<1, 32, 1, 8>; \
-            __VA_ARGS__();                                                         \
-        };                                                                         \
-    }()
+#define BATCHED_BACKWARD_V1_HEADDIM_SWITCH(HEAD_DIM1, HEAD_DIM2, ...) \
+  [&] {                                                               \
+    if (HEAD_DIM1 <= 32 && HEAD_DIM2 <= 32) {                         \
+      constexpr ck::index_t kGemm1NPerBlock = 32;                     \
+      constexpr ck::index_t kGemm1NXdlPerWave = 1;                    \
+      constexpr ck::index_t kCShuffleNXdlPerWavePerShuffle = 1;       \
+      using kCShuffleBlockTransferClusterLengths = S<1, 64, 1, 4>;    \
+      __VA_ARGS__();                                                  \
+    } else {                                                          \
+      constexpr ck::index_t kGemm1NPerBlock = 64;                     \
+      constexpr ck::index_t kGemm1NXdlPerWave = 2;                    \
+      constexpr ck::index_t kCShuffleNXdlPerWavePerShuffle = 2;       \
+      using kCShuffleBlockTransferClusterLengths = S<1, 32, 1, 8>;    \
+      __VA_ARGS__();                                                  \
+    };                                                                \
+  }()
 #endif
 
-    // clang-format off
+  // clang-format off
   template <
       ck::index_t kGemm1NPerBlock,
       ck::index_t kGemm1NXdlPerWave,
@@ -142,9 +146,9 @@ struct batched_backward_masktype_attnbias_dispatched
           kCShuffleBlockTransferScalarPerVector,
           MaskingSpec,
           Deterministic>;
-    // clang-format on
+  // clang-format on
 
-    // clang-format off
+  // clang-format off
   template <
       ck::index_t kGemm1NPerBlock,
       ck::index_t kGemm1NXdlPerWave,
@@ -223,276 +227,299 @@ struct batched_backward_masktype_attnbias_dispatched
           kCShuffleBlockTransferScalarPerVector,
           MaskingSpec,
           Deterministic>;
-    // clang-format on
-
-    static constexpr auto I1 = ck::Number<1>{};
-    static constexpr auto I2 = ck::Number<2>{};
-    static constexpr auto I3 = ck::Number<3>{};
-
-    static void Run(BatchedBackwardParams& param, hipStream_t stream)
-    {
-        using ck::math::min;
-
-        if(param.K <= 64 && param.Kv <= 64)
-        {
-            // compile-time constants which don't depend on head-dim switching
-            constexpr ck::index_t thread_slice_length_ak1 =
-                GemmOpConstantsBatchedBackward_V1::AK1 /
-                GemmOpConstantsBatchedBackward_V1::ABlockTransferThreadClusterLengths_AK0_M_AK1::At(
-                    I2);
-            constexpr ck::index_t thread_slice_length_bk1 =
-                GemmOpConstantsBatchedBackward_V1::BK1 /
-                GemmOpConstantsBatchedBackward_V1::BBlockTransferThreadClusterLengths_BK0_N_BK1::At(
-                    I2);
-
-            static_assert(thread_slice_length_ak1 == thread_slice_length_bk1,
-                          "ABlockTransfer and BBlockTransfer should use completely same K1 sizes "
-                          "and ThreadClusterLengths!");
-
-            constexpr ck::index_t kABBlockTransferSrcScalarPerVector_max =
-                min(2, thread_slice_length_ak1);
-
-            BATCHED_BACKWARD_V1_HEADDIM_SWITCH(param.K, param.Kv, [&] {
-                constexpr ck::index_t thread_slice_length_cshuflle_n =
-                    (kCShuffleNXdlPerWavePerShuffle * kGemm1NPerBlock / kGemm1NXdlPerWave) /
-                    kCShuffleBlockTransferClusterLengths::At(I3);
-
-                constexpr ck::index_t kCShuffleBlockTransferScalarPerVector_max =
-                    min(2, thread_slice_length_cshuflle_n);
-
-                ALIGN_SWITCH_2(
-                    kABBlockTransferSrcScalarPerVector_max,
-                    kABBlockTransferSrcScalarPerVector,
-                    param.K,
-                    kCShuffleBlockTransferScalarPerVector_max,
-                    kCShuffleBlockTransferScalarPerVector,
-                    param.Kv,
-                    [&] {
-                        using DeviceOpInstance =
-                            DeviceOpInstanceTemp_V1<kGemm1NPerBlock,
-                                                    kGemm1NXdlPerWave,
-                                                    kCShuffleNXdlPerWavePerShuffle,
-                                                    kCShuffleBlockTransferClusterLengths,
-                                                    kABBlockTransferSrcScalarPerVector,
-                                                    kCShuffleBlockTransferScalarPerVector>;
-
-                        RunWithDeviceOp<DeviceOpInstance>(param, stream);
-                    });
+  // clang-format on
+
+  static constexpr auto I1 = ck::Number<1>{};
+  static constexpr auto I2 = ck::Number<2>{};
+  static constexpr auto I3 = ck::Number<3>{};
+
+  static void Run(BatchedBackwardParams& param, hipStream_t stream) {
+    using ck::math::min;
+
+    if (param.K <= 64 && param.Kv <= 64) {
+      // compile-time constants which don't depend on head-dim switching
+      constexpr ck::index_t thread_slice_length_ak1 =
+          GemmOpConstantsBatchedBackward_V1::AK1 /
+          GemmOpConstantsBatchedBackward_V1::
+              ABlockTransferThreadClusterLengths_AK0_M_AK1::At(I2);
+      constexpr ck::index_t thread_slice_length_bk1 =
+          GemmOpConstantsBatchedBackward_V1::BK1 /
+          GemmOpConstantsBatchedBackward_V1::
+              BBlockTransferThreadClusterLengths_BK0_N_BK1::At(I2);
+
+      static_assert(
+          thread_slice_length_ak1 == thread_slice_length_bk1,
+          "ABlockTransfer and BBlockTransfer should use completely same K1 sizes "
+          "and ThreadClusterLengths!");
+
+      constexpr ck::index_t kABBlockTransferSrcScalarPerVector_max =
+          min(2, thread_slice_length_ak1);
+
+      BATCHED_BACKWARD_V1_HEADDIM_SWITCH(param.K, param.Kv, [&] {
+        constexpr ck::index_t thread_slice_length_cshuflle_n =
+            (kCShuffleNXdlPerWavePerShuffle * kGemm1NPerBlock /
+             kGemm1NXdlPerWave) /
+            kCShuffleBlockTransferClusterLengths::At(I3);
+
+        constexpr ck::index_t kCShuffleBlockTransferScalarPerVector_max =
+            min(2, thread_slice_length_cshuflle_n);
+
+        ALIGN_SWITCH_2(
+            kABBlockTransferSrcScalarPerVector_max,
+            kABBlockTransferSrcScalarPerVector,
+            param.K,
+            kCShuffleBlockTransferScalarPerVector_max,
+            kCShuffleBlockTransferScalarPerVector,
+            param.Kv,
+            [&] {
+              using DeviceOpInstance = DeviceOpInstanceTemp_V1<
+                  kGemm1NPerBlock,
+                  kGemm1NXdlPerWave,
+                  kCShuffleNXdlPerWavePerShuffle,
+                  kCShuffleBlockTransferClusterLengths,
+                  kABBlockTransferSrcScalarPerVector,
+                  kCShuffleBlockTransferScalarPerVector>;
+
+              RunWithDeviceOp<DeviceOpInstance>(param, stream);
             });
-        }
-        else
-        {
-            constexpr ck::index_t kGemm1NPerBlock                = 128;
-            constexpr ck::index_t kGemm1NXdlPerWave              = 4;
-            constexpr ck::index_t kCShuffleNXdlPerWavePerShuffle = 4;
-            using kCShuffleBlockTransferClusterLengths           = S<1, 32, 1, 8>;
-
-            constexpr ck::index_t thread_slice_length_ak1 =
-                GemmOpConstantsBatchedBackward_V2::AK1 /
-                GemmOpConstantsBatchedBackward_V2::ABlockTransferThreadClusterLengths_AK0_M_AK1::At(
-                    I2);
-            constexpr ck::index_t thread_slice_length_bk1 =
-                GemmOpConstantsBatchedBackward_V2::BK1 /
-                GemmOpConstantsBatchedBackward_V2::BBlockTransferThreadClusterLengths_BK0_N_BK1::At(
-                    I2);
-
-            static_assert(thread_slice_length_ak1 == thread_slice_length_bk1,
-                          "ABlockTransfer and BBlockTransfer should use completely same K1 sizes "
-                          "and ThreadClusterLengths!");
-
-            constexpr ck::index_t kABBlockTransferSrcScalarPerVector_max =
-                min(2, thread_slice_length_ak1);
-
-            constexpr ck::index_t thread_slice_length_gemm1n =
-                kGemm1NPerBlock / GemmOpConstantsBatchedBackward_V2::
-                                      B1BlockTransferThreadClusterLengths_BK0_N_BK1::At(I1);
-            constexpr ck::index_t kB1BlockTransferSrcScalarPerVector_max =
-                min(2, thread_slice_length_gemm1n);
-
-            constexpr ck::index_t thread_slice_length_cshuflle_n =
-                (kCShuffleNXdlPerWavePerShuffle * kGemm1NPerBlock / kGemm1NXdlPerWave) /
-                kCShuffleBlockTransferClusterLengths::At(I3);
-
-            constexpr ck::index_t kCShuffleBlockTransferScalarPerVector_max =
-                min(2, thread_slice_length_cshuflle_n);
-
-            if constexpr(kB1BlockTransferSrcScalarPerVector_max >=
-                         kCShuffleBlockTransferScalarPerVector_max)
-            {
-                ALIGN_SWITCH_2(
-                    kABBlockTransferSrcScalarPerVector_max,
-                    kABBlockTransferSrcScalarPerVector,
-                    param.K,
-                    kB1BlockTransferSrcScalarPerVector_max,
-                    kB1BlockTransferSrcScalarPerVector,
-                    param.Kv,
-                    [&] {
-                        constexpr ck::index_t kCShuffleBlockTransferScalarPerVector =
-                            min(kB1BlockTransferSrcScalarPerVector,
-                                kCShuffleBlockTransferScalarPerVector_max);
-
-                        static_assert(kB1BlockTransferSrcScalarPerVector > 0,
-                                      "kB1BlockTransferSrcScalarPerVector must be positive");
-
-                        using DeviceOpInstance =
-                            DeviceOpInstanceTemp_V2<kGemm1NPerBlock,
-                                                    kGemm1NXdlPerWave,
-                                                    kCShuffleNXdlPerWavePerShuffle,
-                                                    kCShuffleBlockTransferClusterLengths,
-                                                    kABBlockTransferSrcScalarPerVector,
-                                                    kB1BlockTransferSrcScalarPerVector,
-                                                    kCShuffleBlockTransferScalarPerVector>;
-
-                        RunWithDeviceOp<DeviceOpInstance>(param, stream);
-                    });
-            }
-            else
-            {
-                ALIGN_SWITCH_2(
-                    kABBlockTransferSrcScalarPerVector_max,
-                    kABBlockTransferSrcScalarPerVector,
-                    param.K,
-                    kCShuffleBlockTransferScalarPerVector_max,
-                    kCShuffleBlockTransferScalarPerVector,
-                    param.Kv,
-                    [&] {
-                        constexpr ck::index_t kB1BlockTransferSrcScalarPerVector =
-                            min(kCShuffleBlockTransferScalarPerVector,
-                                kB1BlockTransferSrcScalarPerVector_max);
-
-                        static_assert(kB1BlockTransferSrcScalarPerVector > 0,
-                                      "kB1BlockTransferSrcScalarPerVector must be positive");
-
-                        using DeviceOpInstance =
-                            DeviceOpInstanceTemp_V2<kGemm1NPerBlock,
-                                                    kGemm1NXdlPerWave,
-                                                    kCShuffleNXdlPerWavePerShuffle,
-                                                    kCShuffleBlockTransferClusterLengths,
-                                                    kABBlockTransferSrcScalarPerVector,
-                                                    kB1BlockTransferSrcScalarPerVector,
-                                                    kCShuffleBlockTransferScalarPerVector>;
-
-                        RunWithDeviceOp<DeviceOpInstance>(param, stream);
-                    });
-            };
-        };
+      });
+    } else {
+      constexpr ck::index_t kGemm1NPerBlock = 128;
+      constexpr ck::index_t kGemm1NXdlPerWave = 4;
+      constexpr ck::index_t kCShuffleNXdlPerWavePerShuffle = 4;
+      using kCShuffleBlockTransferClusterLengths = S<1, 32, 1, 8>;
+
+      constexpr ck::index_t thread_slice_length_ak1 =
+          GemmOpConstantsBatchedBackward_V2::AK1 /
+          GemmOpConstantsBatchedBackward_V2::
+              ABlockTransferThreadClusterLengths_AK0_M_AK1::At(I2);
+      constexpr ck::index_t thread_slice_length_bk1 =
+          GemmOpConstantsBatchedBackward_V2::BK1 /
+          GemmOpConstantsBatchedBackward_V2::
+              BBlockTransferThreadClusterLengths_BK0_N_BK1::At(I2);
+
+      static_assert(
+          thread_slice_length_ak1 == thread_slice_length_bk1,
+          "ABlockTransfer and BBlockTransfer should use completely same K1 sizes "
+          "and ThreadClusterLengths!");
+
+      constexpr ck::index_t kABBlockTransferSrcScalarPerVector_max =
+          min(2, thread_slice_length_ak1);
+
+      constexpr ck::index_t thread_slice_length_gemm1n = kGemm1NPerBlock /
+          GemmOpConstantsBatchedBackward_V2::
+              B1BlockTransferThreadClusterLengths_BK0_N_BK1::At(I1);
+      constexpr ck::index_t kB1BlockTransferSrcScalarPerVector_max =
+          min(2, thread_slice_length_gemm1n);
+
+      constexpr ck::index_t thread_slice_length_cshuflle_n =
+          (kCShuffleNXdlPerWavePerShuffle * kGemm1NPerBlock /
+           kGemm1NXdlPerWave) /
+          kCShuffleBlockTransferClusterLengths::At(I3);
+
+      constexpr ck::index_t kCShuffleBlockTransferScalarPerVector_max =
+          min(2, thread_slice_length_cshuflle_n);
+
+      if constexpr (
+          kB1BlockTransferSrcScalarPerVector_max >=
+          kCShuffleBlockTransferScalarPerVector_max) {
+        ALIGN_SWITCH_2(
+            kABBlockTransferSrcScalarPerVector_max,
+            kABBlockTransferSrcScalarPerVector,
+            param.K,
+            kB1BlockTransferSrcScalarPerVector_max,
+            kB1BlockTransferSrcScalarPerVector,
+            param.Kv,
+            [&] {
+              constexpr ck::index_t kCShuffleBlockTransferScalarPerVector =
+                  min(kB1BlockTransferSrcScalarPerVector,
+                      kCShuffleBlockTransferScalarPerVector_max);
+
+              static_assert(
+                  kB1BlockTransferSrcScalarPerVector > 0,
+                  "kB1BlockTransferSrcScalarPerVector must be positive");
+
+              using DeviceOpInstance = DeviceOpInstanceTemp_V2<
+                  kGemm1NPerBlock,
+                  kGemm1NXdlPerWave,
+                  kCShuffleNXdlPerWavePerShuffle,
+                  kCShuffleBlockTransferClusterLengths,
+                  kABBlockTransferSrcScalarPerVector,
+                  kB1BlockTransferSrcScalarPerVector,
+                  kCShuffleBlockTransferScalarPerVector>;
+
+              RunWithDeviceOp<DeviceOpInstance>(param, stream);
+            });
+      } else {
+        ALIGN_SWITCH_2(
+            kABBlockTransferSrcScalarPerVector_max,
+            kABBlockTransferSrcScalarPerVector,
+            param.K,
+            kCShuffleBlockTransferScalarPerVector_max,
+            kCShuffleBlockTransferScalarPerVector,
+            param.Kv,
+            [&] {
+              constexpr ck::index_t kB1BlockTransferSrcScalarPerVector =
+                  min(kCShuffleBlockTransferScalarPerVector,
+                      kB1BlockTransferSrcScalarPerVector_max);
+
+              static_assert(
+                  kB1BlockTransferSrcScalarPerVector > 0,
+                  "kB1BlockTransferSrcScalarPerVector must be positive");
+
+              using DeviceOpInstance = DeviceOpInstanceTemp_V2<
+                  kGemm1NPerBlock,
+                  kGemm1NXdlPerWave,
+                  kCShuffleNXdlPerWavePerShuffle,
+                  kCShuffleBlockTransferClusterLengths,
+                  kABBlockTransferSrcScalarPerVector,
+                  kB1BlockTransferSrcScalarPerVector,
+                  kCShuffleBlockTransferScalarPerVector>;
+
+              RunWithDeviceOp<DeviceOpInstance>(param, stream);
+            });
+      };
     };
-
-    template <typename DeviceOpInstance>
-    static void RunWithDeviceOp(BatchedBackwardParams& param, hipStream_t stream)
-    {
-        std::vector<ck::index_t> q_gs_ms_ks_lengths{param.B, param.Hq, param.M, param.K};
-        std::vector<ck::index_t> q_gs_ms_ks_strides{
-            param.q_strides[0], param.q_strides[2], param.q_strides[1], param.q_strides[3]};
-
-        std::vector<ck::index_t> k_gs_ns_ks_lengths{param.B, param.Hkv, param.N, param.K};
-        std::vector<ck::index_t> k_gs_ns_ks_strides{
-            param.k_strides[0], param.k_strides[2], param.k_strides[1], param.k_strides[3]};
-
-        std::vector<ck::index_t> kgrad_gs_ns_ks_lengths = {param.B, param.Hq, param.N, param.K};
-        std::vector<ck::index_t> kgrad_gs_ns_ks_strides = {param.tmp_grad_k_strides[0],
-                                                           param.tmp_grad_k_strides[2],
-                                                           param.tmp_grad_k_strides[1],
-                                                           param.tmp_grad_k_strides[3]};
-
-        std::vector<ck::index_t> v_gs_os_ns_lengths{param.B, param.Hkv, param.Kv, param.N};
-        std::vector<ck::index_t> v_gs_os_ns_strides{
-            param.v_strides[0], param.v_strides[2], param.v_strides[3], param.v_strides[1]};
-
-        std::vector<ck::index_t> vgrad_gs_os_ns_lengths = {param.B, param.Hq, param.Kv, param.N};
-        std::vector<ck::index_t> vgrad_gs_os_ns_strides = {param.tmp_grad_v_strides[0],
-                                                           param.tmp_grad_v_strides[2],
-                                                           param.tmp_grad_v_strides[3],
-                                                           param.tmp_grad_v_strides[1]};
-
-        std::vector<ck::index_t> y_gs_ms_os_lengths{param.B, param.Hq, param.M, param.Kv};
-        std::vector<ck::index_t> y_gs_ms_os_strides{
-            param.out_strides[0], param.out_strides[2], param.out_strides[1], param.out_strides[3]};
-
-        std::vector<ck::index_t> lse_gs_ms_lengths{param.B, param.Hq, param.M};
-
-        std::vector<ck::index_t> d_gs_ms_ns_lengths;
-        std::vector<ck::index_t> d_gs_ms_ns_strides;
-
-        if constexpr(has_attn_bias)
-        {
-            d_gs_ms_ns_lengths = {param.B, param.Hq, param.M, param.N};
-            d_gs_ms_ns_strides = {param.attn_bias_strides[0],
-                                  param.attn_bias_strides[1],
-                                  param.attn_bias_strides[2],
-                                  param.attn_bias_strides[3]};
-        }
-        else
-        {
-            d_gs_ms_ns_lengths = {1, 1, 1, 1};
-            d_gs_ms_ns_strides = {0, 0, 0, 0};
-        };
-
-        float alpha = param.scale;
-
-        auto op      = DeviceOpInstance{};
-        auto invoker = op.MakeInvoker();
-
-        auto arg_ptr = op.MakeArgumentPointer(
-            param.q_ptr,
-            param.k_ptr,
-            nullptr, // p_z_grid
-            param.v_ptr,
-            param.out_ptr,
-            param.logsumexp_ptr,
-            param.grad_out_ptr,
-            param.grad_q_ptr,
-            param.grad_k_ptr,
-            param.grad_v_ptr,
-            param.has_attn_bias ? param.attn_bias_ptr : nullptr,
-            nullptr, // p_acc1_bias
-            param.bias_has_grad ? param.grad_bias_ptr : nullptr,
-            nullptr,
-            q_gs_ms_ks_lengths, // q, dQ should have same shape
-            q_gs_ms_ks_strides,
-            k_gs_ns_ks_lengths, // k, dK should have same shape
-            k_gs_ns_ks_strides,
-            {1, 1, 1, 1},       // z_gs_ms_ns_lengths
-            {0, 0, 0, 0},       // z_gs_ms_ns_strides
-            v_gs_os_ns_lengths, // v, dV should have same shape
-            v_gs_os_ns_strides,
-            y_gs_ms_os_lengths, // y, dY should have same shape
-            y_gs_ms_os_strides,
-            lse_gs_ms_lengths,
-            param.is_mqa_gqa ? kgrad_gs_ns_ks_lengths : k_gs_ns_ks_lengths,
-            param.is_mqa_gqa ? kgrad_gs_ns_ks_strides : k_gs_ns_ks_strides,
-            param.is_mqa_gqa ? vgrad_gs_os_ns_lengths : v_gs_os_ns_lengths,
-            param.is_mqa_gqa ? vgrad_gs_os_ns_strides : v_gs_os_ns_strides,
-            d_gs_ms_ns_lengths, // bias, grad_bias should have same shape
-            d_gs_ms_ns_strides,
-            {}, // acc1_biases_gs_ms_os_lengths
-            {}, // acc1_biases_gs_ms_os_strides
-            QKVElementOp{},
-            QKVElementOp{},
-            Scale{alpha},
-            QKVElementOp{},
-            YElementOp{},
-            param.dropout_prob,
-            std::tuple<int64_t, int64_t>(param.philox_seed, param.philox_offset));
-
-        if(!op.IsSupportedArgument(arg_ptr.get()))
-        {
-            std::ostringstream ostr;
-            ostr << op.GetTypeString() << " does not support this problem";
-
-            throw std::runtime_error(ostr.str());
-        }
-
-        (void)invoker.Run(arg_ptr.get(), StreamConfig{stream, false});
+  };
+
+  template <typename DeviceOpInstance>
+  static void RunWithDeviceOp(
+      BatchedBackwardParams& param,
+      hipStream_t stream) {
+    std::vector<ck::index_t> q_gs_ms_ks_lengths{
+        param.B, param.Hq, param.M, param.K};
+    std::vector<ck::index_t> q_gs_ms_ks_strides{
+        param.q_strides[0],
+        param.q_strides[2],
+        param.q_strides[1],
+        param.q_strides[3]};
+
+    std::vector<ck::index_t> k_gs_ns_ks_lengths{
+        param.B, param.Hkv, param.N, param.K};
+    std::vector<ck::index_t> k_gs_ns_ks_strides{
+        param.k_strides[0],
+        param.k_strides[2],
+        param.k_strides[1],
+        param.k_strides[3]};
+
+    std::vector<ck::index_t> kgrad_gs_ns_ks_lengths = {
+        param.B, param.Hq, param.N, param.K};
+    std::vector<ck::index_t> kgrad_gs_ns_ks_strides = {
+        param.tmp_grad_k_strides[0],
+        param.tmp_grad_k_strides[2],
+        param.tmp_grad_k_strides[1],
+        param.tmp_grad_k_strides[3]};
+
+    std::vector<ck::index_t> v_gs_os_ns_lengths{
+        param.B, param.Hkv, param.Kv, param.N};
+    std::vector<ck::index_t> v_gs_os_ns_strides{
+        param.v_strides[0],
+        param.v_strides[2],
+        param.v_strides[3],
+        param.v_strides[1]};
+
+    std::vector<ck::index_t> vgrad_gs_os_ns_lengths = {
+        param.B, param.Hq, param.Kv, param.N};
+    std::vector<ck::index_t> vgrad_gs_os_ns_strides = {
+        param.tmp_grad_v_strides[0],
+        param.tmp_grad_v_strides[2],
+        param.tmp_grad_v_strides[3],
+        param.tmp_grad_v_strides[1]};
+
+    std::vector<ck::index_t> y_gs_ms_os_lengths{
+        param.B, param.Hq, param.M, param.Kv};
+    std::vector<ck::index_t> y_gs_ms_os_strides{
+        param.out_strides[0],
+        param.out_strides[2],
+        param.out_strides[1],
+        param.out_strides[3]};
+
+    std::vector<ck::index_t> lse_gs_ms_lengths{param.B, param.Hq, param.M};
+
+    std::vector<ck::index_t> d_gs_ms_ns_lengths;
+    std::vector<ck::index_t> d_gs_ms_ns_strides;
+
+    if constexpr (has_attn_bias) {
+      d_gs_ms_ns_lengths = {param.B, param.Hq, param.M, param.N};
+      d_gs_ms_ns_strides = {
+          param.attn_bias_strides[0],
+          param.attn_bias_strides[1],
+          param.attn_bias_strides[2],
+          param.attn_bias_strides[3]};
+    } else {
+      d_gs_ms_ns_lengths = {1, 1, 1, 1};
+      d_gs_ms_ns_strides = {0, 0, 0, 0};
     };
+
+    float alpha = param.scale;
+
+    auto op = DeviceOpInstance{};
+    auto invoker = op.MakeInvoker();
+
+    auto arg_ptr = op.MakeArgumentPointer(
+        param.q_ptr,
+        param.k_ptr,
+        nullptr, // p_z_grid
+        param.v_ptr,
+        param.out_ptr,
+        param.logsumexp_ptr,
+        param.grad_out_ptr,
+        param.grad_q_ptr,
+        param.grad_k_ptr,
+        param.grad_v_ptr,
+        param.has_attn_bias ? param.attn_bias_ptr : nullptr,
+        nullptr, // p_acc1_bias
+        param.bias_has_grad ? param.grad_bias_ptr : nullptr,
+        nullptr,
+        q_gs_ms_ks_lengths, // q, dQ should have same shape
+        q_gs_ms_ks_strides,
+        k_gs_ns_ks_lengths, // k, dK should have same shape
+        k_gs_ns_ks_strides,
+        {1, 1, 1, 1}, // z_gs_ms_ns_lengths
+        {0, 0, 0, 0}, // z_gs_ms_ns_strides
+        v_gs_os_ns_lengths, // v, dV should have same shape
+        v_gs_os_ns_strides,
+        y_gs_ms_os_lengths, // y, dY should have same shape
+        y_gs_ms_os_strides,
+        lse_gs_ms_lengths,
+        param.is_mqa_gqa ? kgrad_gs_ns_ks_lengths : k_gs_ns_ks_lengths,
+        param.is_mqa_gqa ? kgrad_gs_ns_ks_strides : k_gs_ns_ks_strides,
+        param.is_mqa_gqa ? vgrad_gs_os_ns_lengths : v_gs_os_ns_lengths,
+        param.is_mqa_gqa ? vgrad_gs_os_ns_strides : v_gs_os_ns_strides,
+        d_gs_ms_ns_lengths, // bias, grad_bias should have same shape
+        d_gs_ms_ns_strides,
+        {}, // acc1_biases_gs_ms_os_lengths
+        {}, // acc1_biases_gs_ms_os_strides
+        QKVElementOp{},
+        QKVElementOp{},
+        Scale{alpha},
+        QKVElementOp{},
+        YElementOp{},
+        param.dropout_prob,
+        std::tuple<int64_t, int64_t>(param.philox_seed, param.philox_offset));
+
+    if (!op.IsSupportedArgument(arg_ptr.get())) {
+      std::ostringstream ostr;
+      ostr << op.GetTypeString() << " does not support this problem";
+
+      throw std::runtime_error(ostr.str());
+    }
+
+    (void)invoker.Run(arg_ptr.get(), StreamConfig{stream, false});
+  };
 };
 
-template <typename scalar_t, int32_t custom_mask_type, bool has_attn_bias, bool use_fp32_qkv_grad>
-void run_batched_backward_masktype_attnbias_dispatched(BatchedBackwardParams& param,
-                                                       hipStream_t stream)
-{
-    batched_backward_masktype_attnbias_dispatched<scalar_t,
-                                                  custom_mask_type,
-                                                  has_attn_bias,
-                                                  use_fp32_qkv_grad>::Run(param, stream);
+template <
+    typename scalar_t,
+    int32_t custom_mask_type,
+    bool has_attn_bias,
+    bool use_fp32_qkv_grad>
+void run_batched_backward_masktype_attnbias_dispatched(
+    BatchedBackwardParams& param,
+    hipStream_t stream) {
+  batched_backward_masktype_attnbias_dispatched<
+      scalar_t,
+      custom_mask_type,
+      has_attn_bias,
+      use_fp32_qkv_grad>::Run(param, stream);
 };
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_backward_bp16.cpp b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_backward_bp16.cpp
index 774c3000c..4a589ae02 100644
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_backward_bp16.cpp
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_backward_bp16.cpp
@@ -10,65 +10,104 @@
 #include "ck_bool_switch.h"
 #include "ck_fmha_batched_backward.h"
 
-extern template void run_batched_backward_masktype_attnbias_dispatched<ck::bhalf_t, 0, true, true>(
-    BatchedBackwardParams& param, hipStream_t stream);
+extern template void run_batched_backward_masktype_attnbias_dispatched<
+    ck::bhalf_t,
+    0,
+    true,
+    true>(BatchedBackwardParams& param, hipStream_t stream);
 
-extern template void run_batched_backward_masktype_attnbias_dispatched<ck::bhalf_t, 0, true, false>(
-    BatchedBackwardParams& param, hipStream_t stream);
+extern template void run_batched_backward_masktype_attnbias_dispatched<
+    ck::bhalf_t,
+    0,
+    true,
+    false>(BatchedBackwardParams& param, hipStream_t stream);
 
-extern template void run_batched_backward_masktype_attnbias_dispatched<ck::bhalf_t, 0, false, true>(
-    BatchedBackwardParams& param, hipStream_t stream);
+extern template void run_batched_backward_masktype_attnbias_dispatched<
+    ck::bhalf_t,
+    0,
+    false,
+    true>(BatchedBackwardParams& param, hipStream_t stream);
 
-extern template void
-run_batched_backward_masktype_attnbias_dispatched<ck::bhalf_t, 0, false, false>(
-    BatchedBackwardParams& param, hipStream_t stream);
+extern template void run_batched_backward_masktype_attnbias_dispatched<
+    ck::bhalf_t,
+    0,
+    false,
+    false>(BatchedBackwardParams& param, hipStream_t stream);
 
-extern template void run_batched_backward_masktype_attnbias_dispatched<ck::bhalf_t, 1, true, true>(
-    BatchedBackwardParams& param, hipStream_t stream);
+extern template void run_batched_backward_masktype_attnbias_dispatched<
+    ck::bhalf_t,
+    1,
+    true,
+    true>(BatchedBackwardParams& param, hipStream_t stream);
 
-extern template void run_batched_backward_masktype_attnbias_dispatched<ck::bhalf_t, 1, true, false>(
-    BatchedBackwardParams& param, hipStream_t stream);
+extern template void run_batched_backward_masktype_attnbias_dispatched<
+    ck::bhalf_t,
+    1,
+    true,
+    false>(BatchedBackwardParams& param, hipStream_t stream);
 
-extern template void run_batched_backward_masktype_attnbias_dispatched<ck::bhalf_t, 1, false, true>(
-    BatchedBackwardParams& param, hipStream_t stream);
+extern template void run_batched_backward_masktype_attnbias_dispatched<
+    ck::bhalf_t,
+    1,
+    false,
+    true>(BatchedBackwardParams& param, hipStream_t stream);
 
-extern template void
-run_batched_backward_masktype_attnbias_dispatched<ck::bhalf_t, 1, false, false>(
-    BatchedBackwardParams& param, hipStream_t stream);
+extern template void run_batched_backward_masktype_attnbias_dispatched<
+    ck::bhalf_t,
+    1,
+    false,
+    false>(BatchedBackwardParams& param, hipStream_t stream);
 
-extern template void run_batched_backward_masktype_attnbias_dispatched<ck::bhalf_t, 2, true, true>(
-    BatchedBackwardParams& param, hipStream_t stream);
+extern template void run_batched_backward_masktype_attnbias_dispatched<
+    ck::bhalf_t,
+    2,
+    true,
+    true>(BatchedBackwardParams& param, hipStream_t stream);
 
-extern template void run_batched_backward_masktype_attnbias_dispatched<ck::bhalf_t, 2, true, false>(
-    BatchedBackwardParams& param, hipStream_t stream);
+extern template void run_batched_backward_masktype_attnbias_dispatched<
+    ck::bhalf_t,
+    2,
+    true,
+    false>(BatchedBackwardParams& param, hipStream_t stream);
 
-extern template void run_batched_backward_masktype_attnbias_dispatched<ck::bhalf_t, 2, false, true>(
-    BatchedBackwardParams& param, hipStream_t stream);
+extern template void run_batched_backward_masktype_attnbias_dispatched<
+    ck::bhalf_t,
+    2,
+    false,
+    true>(BatchedBackwardParams& param, hipStream_t stream);
 
-extern template void
-run_batched_backward_masktype_attnbias_dispatched<ck::bhalf_t, 2, false, false>(
-    BatchedBackwardParams& param, hipStream_t stream);
+extern template void run_batched_backward_masktype_attnbias_dispatched<
+    ck::bhalf_t,
+    2,
+    false,
+    false>(BatchedBackwardParams& param, hipStream_t stream);
 
-void batched_backward_bp16(BatchedBackwardParams& param, hipStream_t stream)
-{
-    BOOL_SWITCH_2(
-        param.has_attn_bias, HAS_ATTN_BIAS, param.use_fp32_qkv_grad, USE_FP32_QKV_GRAD, [&] {
-            if(param.custom_mask_type == 0)
-                run_batched_backward_masktype_attnbias_dispatched<ck::bhalf_t,
-                                                                  0,
-                                                                  HAS_ATTN_BIAS,
-                                                                  USE_FP32_QKV_GRAD>(param, stream);
-            else if(param.custom_mask_type == 1)
-                run_batched_backward_masktype_attnbias_dispatched<ck::bhalf_t,
-                                                                  1,
-                                                                  HAS_ATTN_BIAS,
-                                                                  USE_FP32_QKV_GRAD>(param, stream);
-            else if(param.custom_mask_type == 2)
-                run_batched_backward_masktype_attnbias_dispatched<ck::bhalf_t,
-                                                                  2,
-                                                                  HAS_ATTN_BIAS,
-                                                                  USE_FP32_QKV_GRAD>(param, stream);
-            else
-                throw std::runtime_error("Invalid custom_mask_type value");
-        });
+void batched_backward_bp16(BatchedBackwardParams& param, hipStream_t stream) {
+  BOOL_SWITCH_2(
+      param.has_attn_bias,
+      HAS_ATTN_BIAS,
+      param.use_fp32_qkv_grad,
+      USE_FP32_QKV_GRAD,
+      [&] {
+        if (param.custom_mask_type == 0)
+          run_batched_backward_masktype_attnbias_dispatched<
+              ck::bhalf_t,
+              0,
+              HAS_ATTN_BIAS,
+              USE_FP32_QKV_GRAD>(param, stream);
+        else if (param.custom_mask_type == 1)
+          run_batched_backward_masktype_attnbias_dispatched<
+              ck::bhalf_t,
+              1,
+              HAS_ATTN_BIAS,
+              USE_FP32_QKV_GRAD>(param, stream);
+        else if (param.custom_mask_type == 2)
+          run_batched_backward_masktype_attnbias_dispatched<
+              ck::bhalf_t,
+              2,
+              HAS_ATTN_BIAS,
+              USE_FP32_QKV_GRAD>(param, stream);
+        else
+          throw std::runtime_error("Invalid custom_mask_type value");
+      });
 };
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_backward_fp16.cpp b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_backward_fp16.cpp
index 3ffb86250..b218809be 100644
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_backward_fp16.cpp
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_backward_fp16.cpp
@@ -10,62 +10,104 @@
 #include "ck_bool_switch.h"
 #include "ck_fmha_batched_backward.h"
 
-extern template void run_batched_backward_masktype_attnbias_dispatched<ck::half_t, 0, true, true>(
-    BatchedBackwardParams& param, hipStream_t stream);
+extern template void run_batched_backward_masktype_attnbias_dispatched<
+    ck::half_t,
+    0,
+    true,
+    true>(BatchedBackwardParams& param, hipStream_t stream);
 
-extern template void run_batched_backward_masktype_attnbias_dispatched<ck::half_t, 0, true, false>(
-    BatchedBackwardParams& param, hipStream_t stream);
+extern template void run_batched_backward_masktype_attnbias_dispatched<
+    ck::half_t,
+    0,
+    true,
+    false>(BatchedBackwardParams& param, hipStream_t stream);
 
-extern template void run_batched_backward_masktype_attnbias_dispatched<ck::half_t, 0, false, true>(
-    BatchedBackwardParams& param, hipStream_t stream);
+extern template void run_batched_backward_masktype_attnbias_dispatched<
+    ck::half_t,
+    0,
+    false,
+    true>(BatchedBackwardParams& param, hipStream_t stream);
 
-extern template void run_batched_backward_masktype_attnbias_dispatched<ck::half_t, 0, false, false>(
-    BatchedBackwardParams& param, hipStream_t stream);
+extern template void run_batched_backward_masktype_attnbias_dispatched<
+    ck::half_t,
+    0,
+    false,
+    false>(BatchedBackwardParams& param, hipStream_t stream);
 
-extern template void run_batched_backward_masktype_attnbias_dispatched<ck::half_t, 1, true, true>(
-    BatchedBackwardParams& param, hipStream_t stream);
+extern template void run_batched_backward_masktype_attnbias_dispatched<
+    ck::half_t,
+    1,
+    true,
+    true>(BatchedBackwardParams& param, hipStream_t stream);
 
-extern template void run_batched_backward_masktype_attnbias_dispatched<ck::half_t, 1, true, false>(
-    BatchedBackwardParams& param, hipStream_t stream);
+extern template void run_batched_backward_masktype_attnbias_dispatched<
+    ck::half_t,
+    1,
+    true,
+    false>(BatchedBackwardParams& param, hipStream_t stream);
 
-extern template void run_batched_backward_masktype_attnbias_dispatched<ck::half_t, 1, false, true>(
-    BatchedBackwardParams& param, hipStream_t stream);
+extern template void run_batched_backward_masktype_attnbias_dispatched<
+    ck::half_t,
+    1,
+    false,
+    true>(BatchedBackwardParams& param, hipStream_t stream);
 
-extern template void run_batched_backward_masktype_attnbias_dispatched<ck::half_t, 1, false, false>(
-    BatchedBackwardParams& param, hipStream_t stream);
+extern template void run_batched_backward_masktype_attnbias_dispatched<
+    ck::half_t,
+    1,
+    false,
+    false>(BatchedBackwardParams& param, hipStream_t stream);
 
-extern template void run_batched_backward_masktype_attnbias_dispatched<ck::half_t, 2, true, true>(
-    BatchedBackwardParams& param, hipStream_t stream);
+extern template void run_batched_backward_masktype_attnbias_dispatched<
+    ck::half_t,
+    2,
+    true,
+    true>(BatchedBackwardParams& param, hipStream_t stream);
 
-extern template void run_batched_backward_masktype_attnbias_dispatched<ck::half_t, 2, true, false>(
-    BatchedBackwardParams& param, hipStream_t stream);
+extern template void run_batched_backward_masktype_attnbias_dispatched<
+    ck::half_t,
+    2,
+    true,
+    false>(BatchedBackwardParams& param, hipStream_t stream);
 
-extern template void run_batched_backward_masktype_attnbias_dispatched<ck::half_t, 2, false, true>(
-    BatchedBackwardParams& param, hipStream_t stream);
+extern template void run_batched_backward_masktype_attnbias_dispatched<
+    ck::half_t,
+    2,
+    false,
+    true>(BatchedBackwardParams& param, hipStream_t stream);
 
-extern template void run_batched_backward_masktype_attnbias_dispatched<ck::half_t, 2, false, false>(
-    BatchedBackwardParams& param, hipStream_t stream);
+extern template void run_batched_backward_masktype_attnbias_dispatched<
+    ck::half_t,
+    2,
+    false,
+    false>(BatchedBackwardParams& param, hipStream_t stream);
 
-void batched_backward_fp16(BatchedBackwardParams& param, hipStream_t stream)
-{
-    BOOL_SWITCH_2(
-        param.has_attn_bias, HAS_ATTN_BIAS, param.use_fp32_qkv_grad, USE_FP32_QKV_GRAD, [&] {
-            if(param.custom_mask_type == 0)
-                run_batched_backward_masktype_attnbias_dispatched<ck::half_t,
-                                                                  0,
-                                                                  HAS_ATTN_BIAS,
-                                                                  USE_FP32_QKV_GRAD>(param, stream);
-            else if(param.custom_mask_type == 1)
-                run_batched_backward_masktype_attnbias_dispatched<ck::half_t,
-                                                                  1,
-                                                                  HAS_ATTN_BIAS,
-                                                                  USE_FP32_QKV_GRAD>(param, stream);
-            else if(param.custom_mask_type == 2)
-                run_batched_backward_masktype_attnbias_dispatched<ck::half_t,
-                                                                  2,
-                                                                  HAS_ATTN_BIAS,
-                                                                  USE_FP32_QKV_GRAD>(param, stream);
-            else
-                throw std::runtime_error("Invalid custom_mask_type value");
-        });
+void batched_backward_fp16(BatchedBackwardParams& param, hipStream_t stream) {
+  BOOL_SWITCH_2(
+      param.has_attn_bias,
+      HAS_ATTN_BIAS,
+      param.use_fp32_qkv_grad,
+      USE_FP32_QKV_GRAD,
+      [&] {
+        if (param.custom_mask_type == 0)
+          run_batched_backward_masktype_attnbias_dispatched<
+              ck::half_t,
+              0,
+              HAS_ATTN_BIAS,
+              USE_FP32_QKV_GRAD>(param, stream);
+        else if (param.custom_mask_type == 1)
+          run_batched_backward_masktype_attnbias_dispatched<
+              ck::half_t,
+              1,
+              HAS_ATTN_BIAS,
+              USE_FP32_QKV_GRAD>(param, stream);
+        else if (param.custom_mask_type == 2)
+          run_batched_backward_masktype_attnbias_dispatched<
+              ck::half_t,
+              2,
+              HAS_ATTN_BIAS,
+              USE_FP32_QKV_GRAD>(param, stream);
+        else
+          throw std::runtime_error("Invalid custom_mask_type value");
+      });
 };
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_forward.h b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_forward.h
index 56dbb6523..f96a52d56 100644
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_forward.h
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_forward.h
@@ -24,68 +24,65 @@
 #include "ck_fmha_params.h"
 
 template <typename scalar_t, int32_t custom_mask_type, bool has_attn_bias>
-struct batched_forward_masktype_attnbias_dispatched
-{
-    using PassThrough = ck::tensor_operation::element_wise::PassThrough;
-
-    using GemmDataType     = scalar_t;
-    using ADataType        = scalar_t;
-    using B0DataType       = scalar_t;
-    using B1DataType       = scalar_t;
-    using AccDataType      = F32;
-    using CShuffleDataType = F32;
-    using CDataType        = scalar_t;
-    using ZDataType        = unsigned short;
-    using LSEDataType      = F32;
-    using Acc0BiasDataType = typename std::conditional<has_attn_bias, scalar_t, void>::type;
-    using Acc1BiasDataType = void;
-
-    static constexpr ck::index_t NumDimG = 2;
-    static constexpr ck::index_t NumDimM = 1;
-    static constexpr ck::index_t NumDimN = 1;
-    static constexpr ck::index_t NumDimK = 1;
-    static constexpr ck::index_t NumDimO = 1;
-
-    using AElementOp    = PassThrough;
-    using B0ElementOp   = PassThrough;
-    using Acc0ElementOp = ck::tensor_operation::element_wise::Scale;
-    using B1ElementOp   = PassThrough;
-    using CElementOp    = PassThrough;
-
-    static constexpr auto GemmSpec = ck::tensor_operation::device::GemmSpecialization::MNKOPadding;
-    static constexpr auto MaskingSpec =
-        static_cast<ck::tensor_operation::device::MaskingSpecialization>(custom_mask_type);
-
-    static constexpr ck::index_t kAcc0BiasTransferSrcScalarPerVector = 1;
+struct batched_forward_masktype_attnbias_dispatched {
+  using PassThrough = ck::tensor_operation::element_wise::PassThrough;
+
+  using GemmDataType = scalar_t;
+  using ADataType = scalar_t;
+  using B0DataType = scalar_t;
+  using B1DataType = scalar_t;
+  using AccDataType = F32;
+  using CShuffleDataType = F32;
+  using CDataType = scalar_t;
+  using ZDataType = unsigned short;
+  using LSEDataType = F32;
+  using Acc0BiasDataType =
+      typename std::conditional<has_attn_bias, scalar_t, void>::type;
+  using Acc1BiasDataType = void;
+
+  static constexpr ck::index_t NumDimG = 2;
+  static constexpr ck::index_t NumDimM = 1;
+  static constexpr ck::index_t NumDimN = 1;
+  static constexpr ck::index_t NumDimK = 1;
+  static constexpr ck::index_t NumDimO = 1;
+
+  using AElementOp = PassThrough;
+  using B0ElementOp = PassThrough;
+  using Acc0ElementOp = ck::tensor_operation::element_wise::Scale;
+  using B1ElementOp = PassThrough;
+  using CElementOp = PassThrough;
+
+  static constexpr auto GemmSpec =
+      ck::tensor_operation::device::GemmSpecialization::MNKOPadding;
+  static constexpr auto MaskingSpec =
+      static_cast<ck::tensor_operation::device::MaskingSpecialization>(
+          custom_mask_type);
+
+  static constexpr ck::index_t kAcc0BiasTransferSrcScalarPerVector = 1;
 
 #ifndef BATCHED_FORWARD_HEADDIM_SWITCH
-#define BATCHED_FORWARD_HEADDIM_SWITCH(HEAD_DIM1, HEAD_DIM2, ...)       \
-    [&] {                                                               \
-        if(HEAD_DIM1 <= 32 && HEAD_DIM2 <= 32)                          \
-        {                                                               \
-            constexpr ck::index_t kGemm1NPerBlock                = 32;  \
-            constexpr ck::index_t kGemm1NXdlPerWave              = 1;   \
-            constexpr ck::index_t kCShuffleNXdlPerWavePerShuffle = 1;   \
-            __VA_ARGS__();                                              \
-        }                                                               \
-        else if(HEAD_DIM1 <= 64 && HEAD_DIM2 <= 64)                     \
-        {                                                               \
-            constexpr ck::index_t kGemm1NPerBlock                = 64;  \
-            constexpr ck::index_t kGemm1NXdlPerWave              = 2;   \
-            constexpr ck::index_t kCShuffleNXdlPerWavePerShuffle = 2;   \
-            __VA_ARGS__();                                              \
-        }                                                               \
-        else                                                            \
-        {                                                               \
-            constexpr ck::index_t kGemm1NPerBlock                = 128; \
-            constexpr ck::index_t kGemm1NXdlPerWave              = 4;   \
-            constexpr ck::index_t kCShuffleNXdlPerWavePerShuffle = 4;   \
-            __VA_ARGS__();                                              \
-        }                                                               \
-    }()
+#define BATCHED_FORWARD_HEADDIM_SWITCH(HEAD_DIM1, HEAD_DIM2, ...) \
+  [&] {                                                           \
+    if (HEAD_DIM1 <= 32 && HEAD_DIM2 <= 32) {                     \
+      constexpr ck::index_t kGemm1NPerBlock = 32;                 \
+      constexpr ck::index_t kGemm1NXdlPerWave = 1;                \
+      constexpr ck::index_t kCShuffleNXdlPerWavePerShuffle = 1;   \
+      __VA_ARGS__();                                              \
+    } else if (HEAD_DIM1 <= 64 && HEAD_DIM2 <= 64) {              \
+      constexpr ck::index_t kGemm1NPerBlock = 64;                 \
+      constexpr ck::index_t kGemm1NXdlPerWave = 2;                \
+      constexpr ck::index_t kCShuffleNXdlPerWavePerShuffle = 2;   \
+      __VA_ARGS__();                                              \
+    } else {                                                      \
+      constexpr ck::index_t kGemm1NPerBlock = 128;                \
+      constexpr ck::index_t kGemm1NXdlPerWave = 4;                \
+      constexpr ck::index_t kCShuffleNXdlPerWavePerShuffle = 4;   \
+      __VA_ARGS__();                                              \
+    }                                                             \
+  }()
 #endif
 
-    // clang-format off
+  // clang-format off
   template <
       ck::index_t kGemm1NPerBlock,
       ck::index_t kGemm1NXdlPerWave,
@@ -164,201 +161,219 @@ struct batched_forward_masktype_attnbias_dispatched
           kCShuffleBlockTransferScalarPerVector,
           GemmOpConstantsBatchedForward::Acc1BiasTransferSrcScalarPerVector,
           MaskingSpec>;
-    // clang-format on
-
-    static constexpr auto I1 = ck::Number<1>{};
-    static constexpr auto I2 = ck::Number<2>{};
-    static constexpr auto I3 = ck::Number<3>{};
-
-    static void Run(BatchedForwardParams& param, hipStream_t stream)
-    {
-        using ck::math::min;
-
-        // compile-time constants which don't depend on head-dim switching
-        constexpr ck::index_t thread_slice_length_ak1 =
-            GemmOpConstantsBatchedForward::AK1 /
-            GemmOpConstantsBatchedForward::ABlockTransferThreadClusterLengths_AK0_M_AK1::At(I2);
-        constexpr ck::index_t thread_slice_length_bk1 =
-            GemmOpConstantsBatchedForward::BK1 /
-            GemmOpConstantsBatchedForward::BBlockTransferThreadClusterLengths_BK0_N_BK1::At(I2);
-
-        static_assert(thread_slice_length_ak1 == thread_slice_length_bk1,
-                      "ABlockTransfer and BBlockTransfer should use completely same K1 sizes and "
-                      "ThreadClusterLengths!");
-
-        constexpr ck::index_t kABBlockTransferSrcScalarPerVector_max =
-            min(8, thread_slice_length_ak1);
-
-        BATCHED_FORWARD_HEADDIM_SWITCH(param.K, param.Kv, [&] {
-            constexpr ck::index_t thread_slice_length_gemm1n =
-                kGemm1NPerBlock /
-                GemmOpConstantsBatchedForward::B1BlockTransferThreadClusterLengths_BK0_N_BK1::At(
-                    I1);
-            constexpr ck::index_t kB1BlockTransferSrcScalarPerVector_max =
-                min(2, thread_slice_length_gemm1n);
-
-            constexpr ck::index_t thread_slice_length_cshuflle_n =
-                (kCShuffleNXdlPerWavePerShuffle * kGemm1NPerBlock / kGemm1NXdlPerWave) /
-                GemmOpConstantsBatchedForward::
-                    CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock ::At(I3);
-
-            constexpr ck::index_t kCShuffleBlockTransferScalarPerVector_max =
-                min(4, thread_slice_length_cshuflle_n);
-
-            if constexpr(kB1BlockTransferSrcScalarPerVector_max >=
-                         kCShuffleBlockTransferScalarPerVector_max)
-            {
-                ALIGN_SWITCH_2(kABBlockTransferSrcScalarPerVector_max,
-                               kABBlockTransferSrcScalarPerVector,
-                               param.K,
-                               kB1BlockTransferSrcScalarPerVector_max,
-                               kB1BlockTransferSrcScalarPerVector,
-                               param.Kv,
-                               [&] {
-                                   constexpr ck::index_t kCShuffleBlockTransferScalarPerVector =
-                                       min(kB1BlockTransferSrcScalarPerVector,
-                                           kCShuffleBlockTransferScalarPerVector_max);
-                                   using DeviceOpInstance =
-                                       DeviceOpInstanceTemp<kGemm1NPerBlock,
-                                                            kGemm1NXdlPerWave,
-                                                            kCShuffleNXdlPerWavePerShuffle,
-                                                            kABBlockTransferSrcScalarPerVector,
-                                                            kB1BlockTransferSrcScalarPerVector,
-                                                            kCShuffleBlockTransferScalarPerVector>;
-
-                                   RunWithDeviceOp<DeviceOpInstance>(param, stream);
-                               });
-            }
-            else
-            {
-                ALIGN_SWITCH_2(kABBlockTransferSrcScalarPerVector_max,
-                               kABBlockTransferSrcScalarPerVector,
-                               param.K,
-                               kCShuffleBlockTransferScalarPerVector_max,
-                               kCShuffleBlockTransferScalarPerVector,
-                               param.Kv,
-                               [&] {
-                                   constexpr ck::index_t kB1BlockTransferSrcScalarPerVector =
-                                       min(kCShuffleBlockTransferScalarPerVector,
-                                           kB1BlockTransferSrcScalarPerVector_max);
-                                   using DeviceOpInstance =
-                                       DeviceOpInstanceTemp<kGemm1NPerBlock,
-                                                            kGemm1NXdlPerWave,
-                                                            kCShuffleNXdlPerWavePerShuffle,
-                                                            kABBlockTransferSrcScalarPerVector,
-                                                            kB1BlockTransferSrcScalarPerVector,
-                                                            kCShuffleBlockTransferScalarPerVector>;
-
-                                   RunWithDeviceOp<DeviceOpInstance>(param, stream);
-                               });
-            };
-        });
+  // clang-format on
+
+  static constexpr auto I1 = ck::Number<1>{};
+  static constexpr auto I2 = ck::Number<2>{};
+  static constexpr auto I3 = ck::Number<3>{};
+
+  static void Run(BatchedForwardParams& param, hipStream_t stream) {
+    using ck::math::min;
+
+    // compile-time constants which don't depend on head-dim switching
+    constexpr ck::index_t thread_slice_length_ak1 =
+        GemmOpConstantsBatchedForward::AK1 /
+        GemmOpConstantsBatchedForward::
+            ABlockTransferThreadClusterLengths_AK0_M_AK1::At(I2);
+    constexpr ck::index_t thread_slice_length_bk1 =
+        GemmOpConstantsBatchedForward::BK1 /
+        GemmOpConstantsBatchedForward::
+            BBlockTransferThreadClusterLengths_BK0_N_BK1::At(I2);
+
+    static_assert(
+        thread_slice_length_ak1 == thread_slice_length_bk1,
+        "ABlockTransfer and BBlockTransfer should use completely same K1 sizes and "
+        "ThreadClusterLengths!");
+
+    constexpr ck::index_t kABBlockTransferSrcScalarPerVector_max =
+        min(8, thread_slice_length_ak1);
+
+    BATCHED_FORWARD_HEADDIM_SWITCH(param.K, param.Kv, [&] {
+      constexpr ck::index_t thread_slice_length_gemm1n = kGemm1NPerBlock /
+          GemmOpConstantsBatchedForward::
+              B1BlockTransferThreadClusterLengths_BK0_N_BK1::At(I1);
+      constexpr ck::index_t kB1BlockTransferSrcScalarPerVector_max =
+          min(2, thread_slice_length_gemm1n);
+
+      constexpr ck::index_t thread_slice_length_cshuflle_n =
+          (kCShuffleNXdlPerWavePerShuffle * kGemm1NPerBlock /
+           kGemm1NXdlPerWave) /
+          GemmOpConstantsBatchedForward::
+              CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock ::
+                  At(I3);
+
+      constexpr ck::index_t kCShuffleBlockTransferScalarPerVector_max =
+          min(4, thread_slice_length_cshuflle_n);
+
+      if constexpr (
+          kB1BlockTransferSrcScalarPerVector_max >=
+          kCShuffleBlockTransferScalarPerVector_max) {
+        ALIGN_SWITCH_2(
+            kABBlockTransferSrcScalarPerVector_max,
+            kABBlockTransferSrcScalarPerVector,
+            param.K,
+            kB1BlockTransferSrcScalarPerVector_max,
+            kB1BlockTransferSrcScalarPerVector,
+            param.Kv,
+            [&] {
+              constexpr ck::index_t kCShuffleBlockTransferScalarPerVector =
+                  min(kB1BlockTransferSrcScalarPerVector,
+                      kCShuffleBlockTransferScalarPerVector_max);
+              using DeviceOpInstance = DeviceOpInstanceTemp<
+                  kGemm1NPerBlock,
+                  kGemm1NXdlPerWave,
+                  kCShuffleNXdlPerWavePerShuffle,
+                  kABBlockTransferSrcScalarPerVector,
+                  kB1BlockTransferSrcScalarPerVector,
+                  kCShuffleBlockTransferScalarPerVector>;
+
+              RunWithDeviceOp<DeviceOpInstance>(param, stream);
+            });
+      } else {
+        ALIGN_SWITCH_2(
+            kABBlockTransferSrcScalarPerVector_max,
+            kABBlockTransferSrcScalarPerVector,
+            param.K,
+            kCShuffleBlockTransferScalarPerVector_max,
+            kCShuffleBlockTransferScalarPerVector,
+            param.Kv,
+            [&] {
+              constexpr ck::index_t kB1BlockTransferSrcScalarPerVector =
+                  min(kCShuffleBlockTransferScalarPerVector,
+                      kB1BlockTransferSrcScalarPerVector_max);
+              using DeviceOpInstance = DeviceOpInstanceTemp<
+                  kGemm1NPerBlock,
+                  kGemm1NXdlPerWave,
+                  kCShuffleNXdlPerWavePerShuffle,
+                  kABBlockTransferSrcScalarPerVector,
+                  kB1BlockTransferSrcScalarPerVector,
+                  kCShuffleBlockTransferScalarPerVector>;
+
+              RunWithDeviceOp<DeviceOpInstance>(param, stream);
+            });
+      };
+    });
+  };
+
+  template <typename DeviceOpInstance>
+  static void RunWithDeviceOp(BatchedForwardParams& param, hipStream_t stream) {
+    std::vector<ck::index_t> a_gs_ms_ks_lengths{
+        param.B, param.Hq, param.M, param.K};
+    std::vector<ck::index_t> a_gs_ms_ks_strides{
+        param.q_strides[0],
+        param.q_strides[2],
+        param.q_strides[1],
+        param.q_strides[3]};
+
+    std::vector<ck::index_t> b0_gs_ns_ks_lengths{
+        param.B, param.Hkv, param.N, param.K};
+    std::vector<ck::index_t> b0_gs_ns_ks_strides{
+        param.k_strides[0],
+        param.k_strides[2],
+        param.k_strides[1],
+        param.k_strides[3]};
+
+    // to be changed to b1_gs_ns_os_lengths
+    std::vector<ck::index_t> b1_gs_os_ns_lengths{
+        param.B, param.Hkv, param.Kv, param.N};
+    std::vector<ck::index_t> b1_gs_os_ns_strides{
+        param.v_strides[0],
+        param.v_strides[2],
+        param.v_strides[3],
+        param.v_strides[1]};
+
+    std::vector<ck::index_t> c_gs_ms_os_lengths{
+        param.B, param.Hq, param.M, param.Kv};
+    std::vector<ck::index_t> c_gs_ms_os_strides{
+        param.out_strides[0],
+        param.out_strides[2],
+        param.out_strides[1],
+        param.out_strides[3]};
+
+    std::vector<ck::index_t> lse_gs_ms_lengths{param.B, param.Hq, param.M};
+
+    std::vector<ck::index_t> d_gs_ms_ns_lengths;
+    std::vector<ck::index_t> d_gs_ms_ns_strides;
+
+    if constexpr (has_attn_bias) {
+      d_gs_ms_ns_lengths = {param.B, param.Hq, param.M, param.N};
+      d_gs_ms_ns_strides = {
+          param.attn_bias_strides[0],
+          param.attn_bias_strides[1],
+          param.attn_bias_strides[2],
+          param.attn_bias_strides[3]};
+    } else {
+      d_gs_ms_ns_lengths = {1, 1, 1, 1};
+      d_gs_ms_ns_strides = {0, 0, 0, 0};
     };
 
-    template <typename DeviceOpInstance>
-    static void RunWithDeviceOp(BatchedForwardParams& param, hipStream_t stream)
-    {
-        std::vector<ck::index_t> a_gs_ms_ks_lengths{param.B, param.Hq, param.M, param.K};
-        std::vector<ck::index_t> a_gs_ms_ks_strides{
-            param.q_strides[0], param.q_strides[2], param.q_strides[1], param.q_strides[3]};
-
-        std::vector<ck::index_t> b0_gs_ns_ks_lengths{param.B, param.Hkv, param.N, param.K};
-        std::vector<ck::index_t> b0_gs_ns_ks_strides{
-            param.k_strides[0], param.k_strides[2], param.k_strides[1], param.k_strides[3]};
-
-        // to be changed to b1_gs_ns_os_lengths
-        std::vector<ck::index_t> b1_gs_os_ns_lengths{param.B, param.Hkv, param.Kv, param.N};
-        std::vector<ck::index_t> b1_gs_os_ns_strides{
-            param.v_strides[0], param.v_strides[2], param.v_strides[3], param.v_strides[1]};
-
-        std::vector<ck::index_t> c_gs_ms_os_lengths{param.B, param.Hq, param.M, param.Kv};
-        std::vector<ck::index_t> c_gs_ms_os_strides{
-            param.out_strides[0], param.out_strides[2], param.out_strides[1], param.out_strides[3]};
-
-        std::vector<ck::index_t> lse_gs_ms_lengths{param.B, param.Hq, param.M};
-
-        std::vector<ck::index_t> d_gs_ms_ns_lengths;
-        std::vector<ck::index_t> d_gs_ms_ns_strides;
-
-        if constexpr(has_attn_bias)
-        {
-            d_gs_ms_ns_lengths = {param.B, param.Hq, param.M, param.N};
-            d_gs_ms_ns_strides = {param.attn_bias_strides[0],
-                                  param.attn_bias_strides[1],
-                                  param.attn_bias_strides[2],
-                                  param.attn_bias_strides[3]};
-        }
-        else
-        {
-            d_gs_ms_ns_lengths = {1, 1, 1, 1};
-            d_gs_ms_ns_strides = {0, 0, 0, 0};
-        };
-
-        float alpha = param.scale;
-
-        auto a_element_op    = AElementOp{};
-        auto b0_element_op   = B0ElementOp{};
-        auto acc0_element_op = Acc0ElementOp{alpha};
-        auto b1_element_op   = B1ElementOp{};
-        auto c_element_op    = CElementOp{};
-
-        auto op      = DeviceOpInstance{};
-        auto invoker = op.MakeInvoker();
-
-        auto arg_ptr = op.MakeArgumentPointer(
-            param.q_ptr,
-            param.k_ptr,
-            param.v_ptr,
-            param.out_ptr,
-            nullptr,
-            param.logsumexp_ptr,
-            param.has_attn_bias ? param.attn_bias_ptr : nullptr,
-            {}, // p_acc1_biases;
-            a_gs_ms_ks_lengths,
-            a_gs_ms_ks_strides,
-            b0_gs_ns_ks_lengths,
-            b0_gs_ns_ks_strides,
-            b1_gs_os_ns_lengths,
-            b1_gs_os_ns_strides,
-            c_gs_ms_os_lengths,
-            c_gs_ms_os_strides,
-            {1, 1, 1, 1},
-            {0, 0, 0, 0},
-            lse_gs_ms_lengths,
-            d_gs_ms_ns_lengths,
-            d_gs_ms_ns_strides,
-            {}, // acc1_biases_gs_ms_os_lengths
-            {}, // acc1_biases_gs_ms_os_strides,
-            a_element_op,
-            b0_element_op,
-            acc0_element_op,
-            b1_element_op,
-            c_element_op,
-            param.use_dropout ? param.dropout_prob : 0.0f, // dropout ratio
-            std::tuple<int64_t, int64_t>(param.philox_seed,
-                                         param.philox_offset)); // dropout random seed and offset
-
-        SimpleDeviceMem workspace(op.GetWorkSpaceSize(arg_ptr.get()));
-
-        op.SetWorkSpacePointer(arg_ptr.get(), workspace.GetDeviceBuffer());
-
-        if(!op.IsSupportedArgument(arg_ptr.get()))
-        {
-            std::ostringstream ostr;
-
-            ostr << op.GetTypeString() << " does not support this problem";
-
-            throw std::runtime_error(ostr.str());
-        }
-
-        invoker.Run(arg_ptr.get(), StreamConfig{stream, false});
-    };
+    float alpha = param.scale;
+
+    auto a_element_op = AElementOp{};
+    auto b0_element_op = B0ElementOp{};
+    auto acc0_element_op = Acc0ElementOp{alpha};
+    auto b1_element_op = B1ElementOp{};
+    auto c_element_op = CElementOp{};
+
+    auto op = DeviceOpInstance{};
+    auto invoker = op.MakeInvoker();
+
+    auto arg_ptr = op.MakeArgumentPointer(
+        param.q_ptr,
+        param.k_ptr,
+        param.v_ptr,
+        param.out_ptr,
+        nullptr,
+        param.logsumexp_ptr,
+        param.has_attn_bias ? param.attn_bias_ptr : nullptr,
+        {}, // p_acc1_biases;
+        a_gs_ms_ks_lengths,
+        a_gs_ms_ks_strides,
+        b0_gs_ns_ks_lengths,
+        b0_gs_ns_ks_strides,
+        b1_gs_os_ns_lengths,
+        b1_gs_os_ns_strides,
+        c_gs_ms_os_lengths,
+        c_gs_ms_os_strides,
+        {1, 1, 1, 1},
+        {0, 0, 0, 0},
+        lse_gs_ms_lengths,
+        d_gs_ms_ns_lengths,
+        d_gs_ms_ns_strides,
+        {}, // acc1_biases_gs_ms_os_lengths
+        {}, // acc1_biases_gs_ms_os_strides,
+        a_element_op,
+        b0_element_op,
+        acc0_element_op,
+        b1_element_op,
+        c_element_op,
+        param.use_dropout ? param.dropout_prob : 0.0f, // dropout ratio
+        std::tuple<int64_t, int64_t>(
+            param.philox_seed,
+            param.philox_offset)); // dropout random seed and offset
+
+    SimpleDeviceMem workspace(op.GetWorkSpaceSize(arg_ptr.get()));
+
+    op.SetWorkSpacePointer(arg_ptr.get(), workspace.GetDeviceBuffer());
+
+    if (!op.IsSupportedArgument(arg_ptr.get())) {
+      std::ostringstream ostr;
+
+      ostr << op.GetTypeString() << " does not support this problem";
+
+      throw std::runtime_error(ostr.str());
+    }
+
+    invoker.Run(arg_ptr.get(), StreamConfig{stream, false});
+  };
 };
 
 template <typename scalar_t, int32_t custom_mask_type, bool has_attn_bias>
-void run_batched_forward_masktype_attnbias_dispatched(BatchedForwardParams& param,
-                                                      hipStream_t stream)
-{
-    batched_forward_masktype_attnbias_dispatched<scalar_t, custom_mask_type, has_attn_bias>::Run(
-        param, stream);
+void run_batched_forward_masktype_attnbias_dispatched(
+    BatchedForwardParams& param,
+    hipStream_t stream) {
+  batched_forward_masktype_attnbias_dispatched<
+      scalar_t,
+      custom_mask_type,
+      has_attn_bias>::Run(param, stream);
 };
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_forward_bp16.cpp b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_forward_bp16.cpp
index 362379dd0..6cc45e3a2 100644
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_forward_bp16.cpp
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_forward_bp16.cpp
@@ -10,43 +10,54 @@
 #include "ck_bool_switch.h"
 #include "ck_fmha_batched_forward.h"
 
-extern template void
-run_batched_forward_masktype_attnbias_dispatched<ck::bhalf_t, 0, true>(BatchedForwardParams& param,
-                                                                       hipStream_t stream);
-
-extern template void
-run_batched_forward_masktype_attnbias_dispatched<ck::bhalf_t, 0, false>(BatchedForwardParams& param,
-                                                                        hipStream_t stream);
-
-extern template void
-run_batched_forward_masktype_attnbias_dispatched<ck::bhalf_t, 1, true>(BatchedForwardParams& param,
-                                                                       hipStream_t stream);
-
-extern template void
-run_batched_forward_masktype_attnbias_dispatched<ck::bhalf_t, 1, false>(BatchedForwardParams& param,
-                                                                        hipStream_t stream);
-
-extern template void
-run_batched_forward_masktype_attnbias_dispatched<ck::bhalf_t, 2, true>(BatchedForwardParams& param,
-                                                                       hipStream_t stream);
-
-extern template void
-run_batched_forward_masktype_attnbias_dispatched<ck::bhalf_t, 2, false>(BatchedForwardParams& param,
-                                                                        hipStream_t stream);
-
-void batched_forward_bp16(BatchedForwardParams& param, hipStream_t stream)
-{
-    BOOL_SWITCH_1(param.has_attn_bias, HAS_ATTN_BIAS, [&] {
-        if(param.custom_mask_type == 0)
-            run_batched_forward_masktype_attnbias_dispatched<ck::bhalf_t, 0, HAS_ATTN_BIAS>(param,
-                                                                                            stream);
-        else if(param.custom_mask_type == 1)
-            run_batched_forward_masktype_attnbias_dispatched<ck::bhalf_t, 1, HAS_ATTN_BIAS>(param,
-                                                                                            stream);
-        else if(param.custom_mask_type == 2)
-            run_batched_forward_masktype_attnbias_dispatched<ck::bhalf_t, 2, HAS_ATTN_BIAS>(param,
-                                                                                            stream);
-        else
-            throw std::runtime_error("Invalid custom_mask_type value");
-    });
+extern template void run_batched_forward_masktype_attnbias_dispatched<
+    ck::bhalf_t,
+    0,
+    true>(BatchedForwardParams& param, hipStream_t stream);
+
+extern template void run_batched_forward_masktype_attnbias_dispatched<
+    ck::bhalf_t,
+    0,
+    false>(BatchedForwardParams& param, hipStream_t stream);
+
+extern template void run_batched_forward_masktype_attnbias_dispatched<
+    ck::bhalf_t,
+    1,
+    true>(BatchedForwardParams& param, hipStream_t stream);
+
+extern template void run_batched_forward_masktype_attnbias_dispatched<
+    ck::bhalf_t,
+    1,
+    false>(BatchedForwardParams& param, hipStream_t stream);
+
+extern template void run_batched_forward_masktype_attnbias_dispatched<
+    ck::bhalf_t,
+    2,
+    true>(BatchedForwardParams& param, hipStream_t stream);
+
+extern template void run_batched_forward_masktype_attnbias_dispatched<
+    ck::bhalf_t,
+    2,
+    false>(BatchedForwardParams& param, hipStream_t stream);
+
+void batched_forward_bp16(BatchedForwardParams& param, hipStream_t stream) {
+  BOOL_SWITCH_1(param.has_attn_bias, HAS_ATTN_BIAS, [&] {
+    if (param.custom_mask_type == 0)
+      run_batched_forward_masktype_attnbias_dispatched<
+          ck::bhalf_t,
+          0,
+          HAS_ATTN_BIAS>(param, stream);
+    else if (param.custom_mask_type == 1)
+      run_batched_forward_masktype_attnbias_dispatched<
+          ck::bhalf_t,
+          1,
+          HAS_ATTN_BIAS>(param, stream);
+    else if (param.custom_mask_type == 2)
+      run_batched_forward_masktype_attnbias_dispatched<
+          ck::bhalf_t,
+          2,
+          HAS_ATTN_BIAS>(param, stream);
+    else
+      throw std::runtime_error("Invalid custom_mask_type value");
+  });
 };
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_forward_fp16.cpp b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_forward_fp16.cpp
index 1d42798c8..e153cfa3c 100644
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_forward_fp16.cpp
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_forward_fp16.cpp
@@ -10,43 +10,54 @@
 #include "ck_bool_switch.h"
 #include "ck_fmha_batched_forward.h"
 
-extern template void
-run_batched_forward_masktype_attnbias_dispatched<ck::half_t, 0, true>(BatchedForwardParams& param,
-                                                                      hipStream_t stream);
-
-extern template void
-run_batched_forward_masktype_attnbias_dispatched<ck::half_t, 0, false>(BatchedForwardParams& param,
-                                                                       hipStream_t stream);
-
-extern template void
-run_batched_forward_masktype_attnbias_dispatched<ck::half_t, 1, true>(BatchedForwardParams& param,
-                                                                      hipStream_t stream);
-
-extern template void
-run_batched_forward_masktype_attnbias_dispatched<ck::half_t, 1, false>(BatchedForwardParams& param,
-                                                                       hipStream_t stream);
-
-extern template void
-run_batched_forward_masktype_attnbias_dispatched<ck::half_t, 2, true>(BatchedForwardParams& param,
-                                                                      hipStream_t stream);
-
-extern template void
-run_batched_forward_masktype_attnbias_dispatched<ck::half_t, 2, false>(BatchedForwardParams& param,
-                                                                       hipStream_t stream);
-
-void batched_forward_fp16(BatchedForwardParams& param, hipStream_t stream)
-{
-    BOOL_SWITCH_1(param.has_attn_bias, HAS_ATTN_BIAS, [&] {
-        if(param.custom_mask_type == 0)
-            run_batched_forward_masktype_attnbias_dispatched<ck::half_t, 0, HAS_ATTN_BIAS>(param,
-                                                                                           stream);
-        else if(param.custom_mask_type == 1)
-            run_batched_forward_masktype_attnbias_dispatched<ck::half_t, 1, HAS_ATTN_BIAS>(param,
-                                                                                           stream);
-        else if(param.custom_mask_type == 2)
-            run_batched_forward_masktype_attnbias_dispatched<ck::half_t, 2, HAS_ATTN_BIAS>(param,
-                                                                                           stream);
-        else
-            throw std::runtime_error("Invalid custom_mask_type value");
-    });
+extern template void run_batched_forward_masktype_attnbias_dispatched<
+    ck::half_t,
+    0,
+    true>(BatchedForwardParams& param, hipStream_t stream);
+
+extern template void run_batched_forward_masktype_attnbias_dispatched<
+    ck::half_t,
+    0,
+    false>(BatchedForwardParams& param, hipStream_t stream);
+
+extern template void run_batched_forward_masktype_attnbias_dispatched<
+    ck::half_t,
+    1,
+    true>(BatchedForwardParams& param, hipStream_t stream);
+
+extern template void run_batched_forward_masktype_attnbias_dispatched<
+    ck::half_t,
+    1,
+    false>(BatchedForwardParams& param, hipStream_t stream);
+
+extern template void run_batched_forward_masktype_attnbias_dispatched<
+    ck::half_t,
+    2,
+    true>(BatchedForwardParams& param, hipStream_t stream);
+
+extern template void run_batched_forward_masktype_attnbias_dispatched<
+    ck::half_t,
+    2,
+    false>(BatchedForwardParams& param, hipStream_t stream);
+
+void batched_forward_fp16(BatchedForwardParams& param, hipStream_t stream) {
+  BOOL_SWITCH_1(param.has_attn_bias, HAS_ATTN_BIAS, [&] {
+    if (param.custom_mask_type == 0)
+      run_batched_forward_masktype_attnbias_dispatched<
+          ck::half_t,
+          0,
+          HAS_ATTN_BIAS>(param, stream);
+    else if (param.custom_mask_type == 1)
+      run_batched_forward_masktype_attnbias_dispatched<
+          ck::half_t,
+          1,
+          HAS_ATTN_BIAS>(param, stream);
+    else if (param.custom_mask_type == 2)
+      run_batched_forward_masktype_attnbias_dispatched<
+          ck::half_t,
+          2,
+          HAS_ATTN_BIAS>(param, stream);
+    else
+      throw std::runtime_error("Invalid custom_mask_type value");
+  });
 };
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_infer.h b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_infer.h
index af7c7679c..c72fce2d5 100644
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_infer.h
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_infer.h
@@ -24,62 +24,59 @@
 #include "ck_fmha_params.h"
 
 template <typename scalar_t, int32_t custom_mask_type, bool has_attn_bias>
-struct batched_infer_masktype_attnbias_dispatched
-{
-    using PassThrough = ck::tensor_operation::element_wise::PassThrough;
-
-    using GemmDataType     = scalar_t;
-    using ADataType        = scalar_t;
-    using B0DataType       = scalar_t;
-    using B1DataType       = scalar_t;
-    using AccDataType      = F32;
-    using CShuffleDataType = F32;
-    using CDataType        = scalar_t;
-    using ZDataType        = unsigned short;
-    using LSEDataType      = F32;
-    using Acc0BiasDataType = typename std::conditional<has_attn_bias, scalar_t, void>::type;
-    using Acc1BiasDataType = void;
-
-    using AElementOp    = PassThrough;
-    using B0ElementOp   = PassThrough;
-    using Acc0ElementOp = ck::tensor_operation::element_wise::Scale;
-    using B1ElementOp   = PassThrough;
-    using CElementOp    = PassThrough;
-
-    static constexpr auto GemmSpec = ck::tensor_operation::device::GemmSpecialization::MNKOPadding;
-    static constexpr auto MaskingSpec =
-        static_cast<ck::tensor_operation::device::MaskingSpecialization>(custom_mask_type);
-
-    static constexpr ck::index_t kAcc0BiasTransferSrcScalarPerVector = 1;
+struct batched_infer_masktype_attnbias_dispatched {
+  using PassThrough = ck::tensor_operation::element_wise::PassThrough;
+
+  using GemmDataType = scalar_t;
+  using ADataType = scalar_t;
+  using B0DataType = scalar_t;
+  using B1DataType = scalar_t;
+  using AccDataType = F32;
+  using CShuffleDataType = F32;
+  using CDataType = scalar_t;
+  using ZDataType = unsigned short;
+  using LSEDataType = F32;
+  using Acc0BiasDataType =
+      typename std::conditional<has_attn_bias, scalar_t, void>::type;
+  using Acc1BiasDataType = void;
+
+  using AElementOp = PassThrough;
+  using B0ElementOp = PassThrough;
+  using Acc0ElementOp = ck::tensor_operation::element_wise::Scale;
+  using B1ElementOp = PassThrough;
+  using CElementOp = PassThrough;
+
+  static constexpr auto GemmSpec =
+      ck::tensor_operation::device::GemmSpecialization::MNKOPadding;
+  static constexpr auto MaskingSpec =
+      static_cast<ck::tensor_operation::device::MaskingSpecialization>(
+          custom_mask_type);
+
+  static constexpr ck::index_t kAcc0BiasTransferSrcScalarPerVector = 1;
 
 #ifndef BATCHED_INFER_HEADDIM_SWITCH
-#define BATCHED_INFER_HEADDIM_SWITCH(HEAD_DIM1, HEAD_DIM2, ...)         \
-    [&] {                                                               \
-        if(HEAD_DIM1 <= 32 && HEAD_DIM2 <= 32)                          \
-        {                                                               \
-            constexpr ck::index_t kGemm1NPerBlock                = 32;  \
-            constexpr ck::index_t kGemm1NXdlPerWave              = 1;   \
-            constexpr ck::index_t kCShuffleNXdlPerWavePerShuffle = 1;   \
-            __VA_ARGS__();                                              \
-        }                                                               \
-        else if(HEAD_DIM1 <= 64 && HEAD_DIM2 <= 64)                     \
-        {                                                               \
-            constexpr ck::index_t kGemm1NPerBlock                = 64;  \
-            constexpr ck::index_t kGemm1NXdlPerWave              = 2;   \
-            constexpr ck::index_t kCShuffleNXdlPerWavePerShuffle = 2;   \
-            __VA_ARGS__();                                              \
-        }                                                               \
-        else                                                            \
-        {                                                               \
-            constexpr ck::index_t kGemm1NPerBlock                = 128; \
-            constexpr ck::index_t kGemm1NXdlPerWave              = 4;   \
-            constexpr ck::index_t kCShuffleNXdlPerWavePerShuffle = 4;   \
-            __VA_ARGS__();                                              \
-        }                                                               \
-    }()
+#define BATCHED_INFER_HEADDIM_SWITCH(HEAD_DIM1, HEAD_DIM2, ...) \
+  [&] {                                                         \
+    if (HEAD_DIM1 <= 32 && HEAD_DIM2 <= 32) {                   \
+      constexpr ck::index_t kGemm1NPerBlock = 32;               \
+      constexpr ck::index_t kGemm1NXdlPerWave = 1;              \
+      constexpr ck::index_t kCShuffleNXdlPerWavePerShuffle = 1; \
+      __VA_ARGS__();                                            \
+    } else if (HEAD_DIM1 <= 64 && HEAD_DIM2 <= 64) {            \
+      constexpr ck::index_t kGemm1NPerBlock = 64;               \
+      constexpr ck::index_t kGemm1NXdlPerWave = 2;              \
+      constexpr ck::index_t kCShuffleNXdlPerWavePerShuffle = 2; \
+      __VA_ARGS__();                                            \
+    } else {                                                    \
+      constexpr ck::index_t kGemm1NPerBlock = 128;              \
+      constexpr ck::index_t kGemm1NXdlPerWave = 4;              \
+      constexpr ck::index_t kCShuffleNXdlPerWavePerShuffle = 4; \
+      __VA_ARGS__();                                            \
+    }                                                           \
+  }()
 #endif
 
-    // clang-format off
+  // clang-format off
   template <
       ck::index_t kGemm1NPerBlock,
       ck::index_t kGemm1NXdlPerWave,
@@ -153,190 +150,210 @@ struct batched_infer_masktype_attnbias_dispatched
           GemmOpConstantsBatchedInfer::CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock,
           kCShuffleBlockTransferScalarPerVector,
           MaskingSpec>;
-    // clang-format on
-
-    static constexpr auto I1 = ck::Number<1>{};
-    static constexpr auto I2 = ck::Number<2>{};
-    static constexpr auto I3 = ck::Number<3>{};
-
-    static void Run(BatchedForwardParams& param, hipStream_t stream)
-    {
-        using ck::math::min;
-
-        // compile-time constants which don't depend on head-dim switching
-        constexpr ck::index_t thread_slice_length_ak1 =
-            GemmOpConstantsBatchedInfer::AK1 /
-            GemmOpConstantsBatchedInfer::ABlockTransferThreadClusterLengths_AK0_M_AK1::At(I2);
-        constexpr ck::index_t thread_slice_length_bk1 =
-            GemmOpConstantsBatchedInfer::BK1 /
-            GemmOpConstantsBatchedInfer::BBlockTransferThreadClusterLengths_BK0_N_BK1::At(I2);
-
-        static_assert(thread_slice_length_ak1 == thread_slice_length_bk1,
-                      "ABlockTransfer and BBlockTransfer should use completely same K1 sizes and "
-                      "ThreadClusterLengths!");
-
-        constexpr ck::index_t kABBlockTransferSrcScalarPerVector_max =
-            min(8, thread_slice_length_ak1);
-
-        BATCHED_INFER_HEADDIM_SWITCH(param.K, param.Kv, [&] {
-            constexpr ck::index_t thread_slice_length_gemm1n =
-                kGemm1NPerBlock /
-                GemmOpConstantsBatchedInfer::B1BlockTransferThreadClusterLengths_BK0_N_BK1::At(I1);
-            constexpr ck::index_t kB1BlockTransferSrcScalarPerVector_max =
-                min(4, thread_slice_length_gemm1n);
-
-            constexpr ck::index_t thread_slice_length_cshuflle_n =
-                (kCShuffleNXdlPerWavePerShuffle * kGemm1NPerBlock / kGemm1NXdlPerWave) /
-                GemmOpConstantsBatchedInfer::
-                    CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock ::At(I3);
-
-            constexpr ck::index_t kCShuffleBlockTransferScalarPerVector_max =
-                min(4, thread_slice_length_cshuflle_n);
-
-            if constexpr(kB1BlockTransferSrcScalarPerVector_max >=
-                         kCShuffleBlockTransferScalarPerVector_max)
-            {
-                ALIGN_SWITCH_2(kABBlockTransferSrcScalarPerVector_max,
-                               kABBlockTransferSrcScalarPerVector,
-                               param.K,
-                               kB1BlockTransferSrcScalarPerVector_max,
-                               kB1BlockTransferSrcScalarPerVector,
-                               param.Kv,
-                               [&] {
-                                   constexpr ck::index_t kCShuffleBlockTransferScalarPerVector =
-                                       min(kB1BlockTransferSrcScalarPerVector,
-                                           kCShuffleBlockTransferScalarPerVector_max);
-                                   using DeviceOpInstance =
-                                       DeviceOpInstanceTemp<kGemm1NPerBlock,
-                                                            kGemm1NXdlPerWave,
-                                                            kCShuffleNXdlPerWavePerShuffle,
-                                                            kABBlockTransferSrcScalarPerVector,
-                                                            kB1BlockTransferSrcScalarPerVector,
-                                                            kCShuffleBlockTransferScalarPerVector>;
-
-                                   RunWithDeviceOp<DeviceOpInstance>(param, stream);
-                               });
-            }
-            else
-            {
-                ALIGN_SWITCH_2(kABBlockTransferSrcScalarPerVector_max,
-                               kABBlockTransferSrcScalarPerVector,
-                               param.K,
-                               kCShuffleBlockTransferScalarPerVector_max,
-                               kCShuffleBlockTransferScalarPerVector,
-                               param.Kv,
-                               [&] {
-                                   constexpr ck::index_t kB1BlockTransferSrcScalarPerVector =
-                                       min(kCShuffleBlockTransferScalarPerVector,
-                                           kB1BlockTransferSrcScalarPerVector_max);
-                                   using DeviceOpInstance =
-                                       DeviceOpInstanceTemp<kGemm1NPerBlock,
-                                                            kGemm1NXdlPerWave,
-                                                            kCShuffleNXdlPerWavePerShuffle,
-                                                            kABBlockTransferSrcScalarPerVector,
-                                                            kB1BlockTransferSrcScalarPerVector,
-                                                            kCShuffleBlockTransferScalarPerVector>;
-
-                                   RunWithDeviceOp<DeviceOpInstance>(param, stream);
-                               });
-            };
-        });
+  // clang-format on
+
+  static constexpr auto I1 = ck::Number<1>{};
+  static constexpr auto I2 = ck::Number<2>{};
+  static constexpr auto I3 = ck::Number<3>{};
+
+  static void Run(BatchedForwardParams& param, hipStream_t stream) {
+    using ck::math::min;
+
+    // compile-time constants which don't depend on head-dim switching
+    constexpr ck::index_t thread_slice_length_ak1 =
+        GemmOpConstantsBatchedInfer::AK1 /
+        GemmOpConstantsBatchedInfer::
+            ABlockTransferThreadClusterLengths_AK0_M_AK1::At(I2);
+    constexpr ck::index_t thread_slice_length_bk1 =
+        GemmOpConstantsBatchedInfer::BK1 /
+        GemmOpConstantsBatchedInfer::
+            BBlockTransferThreadClusterLengths_BK0_N_BK1::At(I2);
+
+    static_assert(
+        thread_slice_length_ak1 == thread_slice_length_bk1,
+        "ABlockTransfer and BBlockTransfer should use completely same K1 sizes and "
+        "ThreadClusterLengths!");
+
+    constexpr ck::index_t kABBlockTransferSrcScalarPerVector_max =
+        min(8, thread_slice_length_ak1);
+
+    BATCHED_INFER_HEADDIM_SWITCH(param.K, param.Kv, [&] {
+      constexpr ck::index_t thread_slice_length_gemm1n = kGemm1NPerBlock /
+          GemmOpConstantsBatchedInfer::
+              B1BlockTransferThreadClusterLengths_BK0_N_BK1::At(I1);
+      constexpr ck::index_t kB1BlockTransferSrcScalarPerVector_max =
+          min(4, thread_slice_length_gemm1n);
+
+      constexpr ck::index_t thread_slice_length_cshuflle_n =
+          (kCShuffleNXdlPerWavePerShuffle * kGemm1NPerBlock /
+           kGemm1NXdlPerWave) /
+          GemmOpConstantsBatchedInfer::
+              CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock ::
+                  At(I3);
+
+      constexpr ck::index_t kCShuffleBlockTransferScalarPerVector_max =
+          min(4, thread_slice_length_cshuflle_n);
+
+      if constexpr (
+          kB1BlockTransferSrcScalarPerVector_max >=
+          kCShuffleBlockTransferScalarPerVector_max) {
+        ALIGN_SWITCH_2(
+            kABBlockTransferSrcScalarPerVector_max,
+            kABBlockTransferSrcScalarPerVector,
+            param.K,
+            kB1BlockTransferSrcScalarPerVector_max,
+            kB1BlockTransferSrcScalarPerVector,
+            param.Kv,
+            [&] {
+              constexpr ck::index_t kCShuffleBlockTransferScalarPerVector =
+                  min(kB1BlockTransferSrcScalarPerVector,
+                      kCShuffleBlockTransferScalarPerVector_max);
+              using DeviceOpInstance = DeviceOpInstanceTemp<
+                  kGemm1NPerBlock,
+                  kGemm1NXdlPerWave,
+                  kCShuffleNXdlPerWavePerShuffle,
+                  kABBlockTransferSrcScalarPerVector,
+                  kB1BlockTransferSrcScalarPerVector,
+                  kCShuffleBlockTransferScalarPerVector>;
+
+              RunWithDeviceOp<DeviceOpInstance>(param, stream);
+            });
+      } else {
+        ALIGN_SWITCH_2(
+            kABBlockTransferSrcScalarPerVector_max,
+            kABBlockTransferSrcScalarPerVector,
+            param.K,
+            kCShuffleBlockTransferScalarPerVector_max,
+            kCShuffleBlockTransferScalarPerVector,
+            param.Kv,
+            [&] {
+              constexpr ck::index_t kB1BlockTransferSrcScalarPerVector =
+                  min(kCShuffleBlockTransferScalarPerVector,
+                      kB1BlockTransferSrcScalarPerVector_max);
+              using DeviceOpInstance = DeviceOpInstanceTemp<
+                  kGemm1NPerBlock,
+                  kGemm1NXdlPerWave,
+                  kCShuffleNXdlPerWavePerShuffle,
+                  kABBlockTransferSrcScalarPerVector,
+                  kB1BlockTransferSrcScalarPerVector,
+                  kCShuffleBlockTransferScalarPerVector>;
+
+              RunWithDeviceOp<DeviceOpInstance>(param, stream);
+            });
+      };
+    });
+  };
+
+  template <typename DeviceOpInstance>
+  static void RunWithDeviceOp(BatchedForwardParams& param, hipStream_t stream) {
+    std::vector<ck::index_t> a_gs_ms_ks_lengths{
+        param.B, param.Hq, param.M, param.K};
+    std::vector<ck::index_t> a_gs_ms_ks_strides{
+        param.q_strides[0],
+        param.q_strides[2],
+        param.q_strides[1],
+        param.q_strides[3]};
+
+    std::vector<ck::index_t> b0_gs_ns_ks_lengths{
+        param.B, param.Hkv, param.N, param.K};
+    std::vector<ck::index_t> b0_gs_ns_ks_strides{
+        param.k_strides[0],
+        param.k_strides[2],
+        param.k_strides[1],
+        param.k_strides[3]};
+
+    // to be changed to b1_gs_ns_os_lengths
+    std::vector<ck::index_t> b1_gs_os_ns_lengths{
+        param.B, param.Hkv, param.Kv, param.N};
+    std::vector<ck::index_t> b1_gs_os_ns_strides{
+        param.v_strides[0],
+        param.v_strides[2],
+        param.v_strides[3],
+        param.v_strides[1]};
+
+    std::vector<ck::index_t> c_gs_ms_os_lengths{
+        param.B, param.Hq, param.M, param.Kv};
+    std::vector<ck::index_t> c_gs_ms_os_strides{
+        param.out_strides[0],
+        param.out_strides[2],
+        param.out_strides[1],
+        param.out_strides[3]};
+
+    std::vector<ck::index_t> lse_gs_ms_lengths{param.B, param.Hq, param.M};
+
+    std::vector<ck::index_t> d_gs_ms_ns_lengths;
+    std::vector<ck::index_t> d_gs_ms_ns_strides;
+
+    if constexpr (has_attn_bias) {
+      d_gs_ms_ns_lengths = {param.B, param.Hq, param.M, param.N};
+      d_gs_ms_ns_strides = {
+          param.attn_bias_strides[0],
+          param.attn_bias_strides[1],
+          param.attn_bias_strides[2],
+          param.attn_bias_strides[3]};
+    } else {
+      d_gs_ms_ns_lengths = {1, 1, 1, 1};
+      d_gs_ms_ns_strides = {0, 0, 0, 0};
     };
 
-    template <typename DeviceOpInstance>
-    static void RunWithDeviceOp(BatchedForwardParams& param, hipStream_t stream)
-    {
-        std::vector<ck::index_t> a_gs_ms_ks_lengths{param.B, param.Hq, param.M, param.K};
-        std::vector<ck::index_t> a_gs_ms_ks_strides{
-            param.q_strides[0], param.q_strides[2], param.q_strides[1], param.q_strides[3]};
-
-        std::vector<ck::index_t> b0_gs_ns_ks_lengths{param.B, param.Hkv, param.N, param.K};
-        std::vector<ck::index_t> b0_gs_ns_ks_strides{
-            param.k_strides[0], param.k_strides[2], param.k_strides[1], param.k_strides[3]};
-
-        // to be changed to b1_gs_ns_os_lengths
-        std::vector<ck::index_t> b1_gs_os_ns_lengths{param.B, param.Hkv, param.Kv, param.N};
-        std::vector<ck::index_t> b1_gs_os_ns_strides{
-            param.v_strides[0], param.v_strides[2], param.v_strides[3], param.v_strides[1]};
-
-        std::vector<ck::index_t> c_gs_ms_os_lengths{param.B, param.Hq, param.M, param.Kv};
-        std::vector<ck::index_t> c_gs_ms_os_strides{
-            param.out_strides[0], param.out_strides[2], param.out_strides[1], param.out_strides[3]};
-
-        std::vector<ck::index_t> lse_gs_ms_lengths{param.B, param.Hq, param.M};
-
-        std::vector<ck::index_t> d_gs_ms_ns_lengths;
-        std::vector<ck::index_t> d_gs_ms_ns_strides;
-
-        if constexpr(has_attn_bias)
-        {
-            d_gs_ms_ns_lengths = {param.B, param.Hq, param.M, param.N};
-            d_gs_ms_ns_strides = {param.attn_bias_strides[0],
-                                  param.attn_bias_strides[1],
-                                  param.attn_bias_strides[2],
-                                  param.attn_bias_strides[3]};
-        }
-        else
-        {
-            d_gs_ms_ns_lengths = {1, 1, 1, 1};
-            d_gs_ms_ns_strides = {0, 0, 0, 0};
-        };
-
-        float alpha = param.scale;
-
-        auto a_element_op    = AElementOp{};
-        auto b0_element_op   = B0ElementOp{};
-        auto acc0_element_op = Acc0ElementOp{alpha};
-        auto b1_element_op   = B1ElementOp{};
-        auto c_element_op    = CElementOp{};
-
-        auto op      = DeviceOpInstance{};
-        auto invoker = op.MakeInvoker();
-
-        auto arg_ptr = op.MakeArgumentPointer(param.q_ptr,
-                                              param.k_ptr,
-                                              param.v_ptr,
-                                              param.out_ptr,
-                                              param.has_attn_bias ? param.attn_bias_ptr : nullptr,
-                                              {}, // p_acc1_biases;
-                                              a_gs_ms_ks_lengths,
-                                              a_gs_ms_ks_strides,
-                                              b0_gs_ns_ks_lengths,
-                                              b0_gs_ns_ks_strides,
-                                              b1_gs_os_ns_lengths,
-                                              b1_gs_os_ns_strides,
-                                              c_gs_ms_os_lengths,
-                                              c_gs_ms_os_strides,
-                                              d_gs_ms_ns_lengths,
-                                              d_gs_ms_ns_strides,
-                                              {}, // acc1_biases_gs_ms_os_lengths
-                                              {}, // acc1_biases_gs_ms_os_strides,
-                                              a_element_op,
-                                              b0_element_op,
-                                              acc0_element_op,
-                                              b1_element_op,
-                                              c_element_op);
-
-        SimpleDeviceMem workspace(op.GetWorkSpaceSize(arg_ptr.get()));
-
-        op.SetWorkSpacePointer(arg_ptr.get(), workspace.GetDeviceBuffer());
-
-        if(!op.IsSupportedArgument(arg_ptr.get()))
-        {
-            std::ostringstream ostr;
-
-            ostr << op.GetTypeString() << " does not support this problem";
-
-            throw std::runtime_error(ostr.str());
-        }
-
-        invoker.Run(arg_ptr.get(), StreamConfig{stream, false});
-    };
+    float alpha = param.scale;
+
+    auto a_element_op = AElementOp{};
+    auto b0_element_op = B0ElementOp{};
+    auto acc0_element_op = Acc0ElementOp{alpha};
+    auto b1_element_op = B1ElementOp{};
+    auto c_element_op = CElementOp{};
+
+    auto op = DeviceOpInstance{};
+    auto invoker = op.MakeInvoker();
+
+    auto arg_ptr = op.MakeArgumentPointer(
+        param.q_ptr,
+        param.k_ptr,
+        param.v_ptr,
+        param.out_ptr,
+        param.has_attn_bias ? param.attn_bias_ptr : nullptr,
+        {}, // p_acc1_biases;
+        a_gs_ms_ks_lengths,
+        a_gs_ms_ks_strides,
+        b0_gs_ns_ks_lengths,
+        b0_gs_ns_ks_strides,
+        b1_gs_os_ns_lengths,
+        b1_gs_os_ns_strides,
+        c_gs_ms_os_lengths,
+        c_gs_ms_os_strides,
+        d_gs_ms_ns_lengths,
+        d_gs_ms_ns_strides,
+        {}, // acc1_biases_gs_ms_os_lengths
+        {}, // acc1_biases_gs_ms_os_strides,
+        a_element_op,
+        b0_element_op,
+        acc0_element_op,
+        b1_element_op,
+        c_element_op);
+
+    SimpleDeviceMem workspace(op.GetWorkSpaceSize(arg_ptr.get()));
+
+    op.SetWorkSpacePointer(arg_ptr.get(), workspace.GetDeviceBuffer());
+
+    if (!op.IsSupportedArgument(arg_ptr.get())) {
+      std::ostringstream ostr;
+
+      ostr << op.GetTypeString() << " does not support this problem";
+
+      throw std::runtime_error(ostr.str());
+    }
+
+    invoker.Run(arg_ptr.get(), StreamConfig{stream, false});
+  };
 };
 
 template <typename scalar_t, int32_t custom_mask_type, bool has_attn_bias>
-void run_batched_infer_masktype_attnbias_dispatched(BatchedForwardParams& param, hipStream_t stream)
-{
-    batched_infer_masktype_attnbias_dispatched<scalar_t, custom_mask_type, has_attn_bias>::Run(
-        param, stream);
+void run_batched_infer_masktype_attnbias_dispatched(
+    BatchedForwardParams& param,
+    hipStream_t stream) {
+  batched_infer_masktype_attnbias_dispatched<
+      scalar_t,
+      custom_mask_type,
+      has_attn_bias>::Run(param, stream);
 };
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_infer_bp16.cpp b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_infer_bp16.cpp
index 1530aad32..03a2e36ca 100644
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_infer_bp16.cpp
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_infer_bp16.cpp
@@ -10,43 +10,54 @@
 #include "ck_bool_switch.h"
 #include "ck_fmha_batched_infer.h"
 
-extern template void
-run_batched_infer_masktype_attnbias_dispatched<ck::bhalf_t, 0, true>(BatchedForwardParams& param,
-                                                                     hipStream_t stream);
-
-extern template void
-run_batched_infer_masktype_attnbias_dispatched<ck::bhalf_t, 0, false>(BatchedForwardParams& param,
-                                                                      hipStream_t stream);
-
-extern template void
-run_batched_infer_masktype_attnbias_dispatched<ck::bhalf_t, 1, true>(BatchedForwardParams& param,
-                                                                     hipStream_t stream);
-
-extern template void
-run_batched_infer_masktype_attnbias_dispatched<ck::bhalf_t, 1, false>(BatchedForwardParams& param,
-                                                                      hipStream_t stream);
-
-extern template void
-run_batched_infer_masktype_attnbias_dispatched<ck::bhalf_t, 2, true>(BatchedForwardParams& param,
-                                                                     hipStream_t stream);
-
-extern template void
-run_batched_infer_masktype_attnbias_dispatched<ck::bhalf_t, 2, false>(BatchedForwardParams& param,
-                                                                      hipStream_t stream);
-
-void batched_infer_bp16(BatchedForwardParams& param, hipStream_t stream)
-{
-    BOOL_SWITCH_1(param.has_attn_bias, HAS_ATTN_BIAS, [&] {
-        if(param.custom_mask_type == 0)
-            run_batched_infer_masktype_attnbias_dispatched<ck::bhalf_t, 0, HAS_ATTN_BIAS>(param,
-                                                                                          stream);
-        else if(param.custom_mask_type == 1)
-            run_batched_infer_masktype_attnbias_dispatched<ck::bhalf_t, 1, HAS_ATTN_BIAS>(param,
-                                                                                          stream);
-        else if(param.custom_mask_type == 2)
-            run_batched_infer_masktype_attnbias_dispatched<ck::bhalf_t, 2, HAS_ATTN_BIAS>(param,
-                                                                                          stream);
-        else
-            throw std::runtime_error("Invalid custom_mask_type value");
-    });
+extern template void run_batched_infer_masktype_attnbias_dispatched<
+    ck::bhalf_t,
+    0,
+    true>(BatchedForwardParams& param, hipStream_t stream);
+
+extern template void run_batched_infer_masktype_attnbias_dispatched<
+    ck::bhalf_t,
+    0,
+    false>(BatchedForwardParams& param, hipStream_t stream);
+
+extern template void run_batched_infer_masktype_attnbias_dispatched<
+    ck::bhalf_t,
+    1,
+    true>(BatchedForwardParams& param, hipStream_t stream);
+
+extern template void run_batched_infer_masktype_attnbias_dispatched<
+    ck::bhalf_t,
+    1,
+    false>(BatchedForwardParams& param, hipStream_t stream);
+
+extern template void run_batched_infer_masktype_attnbias_dispatched<
+    ck::bhalf_t,
+    2,
+    true>(BatchedForwardParams& param, hipStream_t stream);
+
+extern template void run_batched_infer_masktype_attnbias_dispatched<
+    ck::bhalf_t,
+    2,
+    false>(BatchedForwardParams& param, hipStream_t stream);
+
+void batched_infer_bp16(BatchedForwardParams& param, hipStream_t stream) {
+  BOOL_SWITCH_1(param.has_attn_bias, HAS_ATTN_BIAS, [&] {
+    if (param.custom_mask_type == 0)
+      run_batched_infer_masktype_attnbias_dispatched<
+          ck::bhalf_t,
+          0,
+          HAS_ATTN_BIAS>(param, stream);
+    else if (param.custom_mask_type == 1)
+      run_batched_infer_masktype_attnbias_dispatched<
+          ck::bhalf_t,
+          1,
+          HAS_ATTN_BIAS>(param, stream);
+    else if (param.custom_mask_type == 2)
+      run_batched_infer_masktype_attnbias_dispatched<
+          ck::bhalf_t,
+          2,
+          HAS_ATTN_BIAS>(param, stream);
+    else
+      throw std::runtime_error("Invalid custom_mask_type value");
+  });
 };
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_infer_fp16.cpp b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_infer_fp16.cpp
index 52b385aa2..4d0625a46 100644
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_infer_fp16.cpp
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_infer_fp16.cpp
@@ -10,43 +10,54 @@
 #include "ck_bool_switch.h"
 #include "ck_fmha_batched_infer.h"
 
-extern template void
-run_batched_infer_masktype_attnbias_dispatched<ck::half_t, 0, true>(BatchedForwardParams& param,
-                                                                    hipStream_t stream);
-
-extern template void
-run_batched_infer_masktype_attnbias_dispatched<ck::half_t, 0, false>(BatchedForwardParams& param,
-                                                                     hipStream_t stream);
-
-extern template void
-run_batched_infer_masktype_attnbias_dispatched<ck::half_t, 1, true>(BatchedForwardParams& param,
-                                                                    hipStream_t stream);
-
-extern template void
-run_batched_infer_masktype_attnbias_dispatched<ck::half_t, 1, false>(BatchedForwardParams& param,
-                                                                     hipStream_t stream);
-
-extern template void
-run_batched_infer_masktype_attnbias_dispatched<ck::half_t, 2, true>(BatchedForwardParams& param,
-                                                                    hipStream_t stream);
-
-extern template void
-run_batched_infer_masktype_attnbias_dispatched<ck::half_t, 2, false>(BatchedForwardParams& param,
-                                                                     hipStream_t stream);
-
-void batched_infer_fp16(BatchedForwardParams& param, hipStream_t stream)
-{
-    BOOL_SWITCH_1(param.has_attn_bias, HAS_ATTN_BIAS, [&] {
-        if(param.custom_mask_type == 0)
-            run_batched_infer_masktype_attnbias_dispatched<ck::half_t, 0, HAS_ATTN_BIAS>(param,
-                                                                                         stream);
-        else if(param.custom_mask_type == 1)
-            run_batched_infer_masktype_attnbias_dispatched<ck::half_t, 1, HAS_ATTN_BIAS>(param,
-                                                                                         stream);
-        else if(param.custom_mask_type == 2)
-            run_batched_infer_masktype_attnbias_dispatched<ck::half_t, 2, HAS_ATTN_BIAS>(param,
-                                                                                         stream);
-        else
-            throw std::runtime_error("Invalid custom_mask_type value");
-    });
+extern template void run_batched_infer_masktype_attnbias_dispatched<
+    ck::half_t,
+    0,
+    true>(BatchedForwardParams& param, hipStream_t stream);
+
+extern template void run_batched_infer_masktype_attnbias_dispatched<
+    ck::half_t,
+    0,
+    false>(BatchedForwardParams& param, hipStream_t stream);
+
+extern template void run_batched_infer_masktype_attnbias_dispatched<
+    ck::half_t,
+    1,
+    true>(BatchedForwardParams& param, hipStream_t stream);
+
+extern template void run_batched_infer_masktype_attnbias_dispatched<
+    ck::half_t,
+    1,
+    false>(BatchedForwardParams& param, hipStream_t stream);
+
+extern template void run_batched_infer_masktype_attnbias_dispatched<
+    ck::half_t,
+    2,
+    true>(BatchedForwardParams& param, hipStream_t stream);
+
+extern template void run_batched_infer_masktype_attnbias_dispatched<
+    ck::half_t,
+    2,
+    false>(BatchedForwardParams& param, hipStream_t stream);
+
+void batched_infer_fp16(BatchedForwardParams& param, hipStream_t stream) {
+  BOOL_SWITCH_1(param.has_attn_bias, HAS_ATTN_BIAS, [&] {
+    if (param.custom_mask_type == 0)
+      run_batched_infer_masktype_attnbias_dispatched<
+          ck::half_t,
+          0,
+          HAS_ATTN_BIAS>(param, stream);
+    else if (param.custom_mask_type == 1)
+      run_batched_infer_masktype_attnbias_dispatched<
+          ck::half_t,
+          1,
+          HAS_ATTN_BIAS>(param, stream);
+    else if (param.custom_mask_type == 2)
+      run_batched_infer_masktype_attnbias_dispatched<
+          ck::half_t,
+          2,
+          HAS_ATTN_BIAS>(param, stream);
+    else
+      throw std::runtime_error("Invalid custom_mask_type value");
+  });
 };
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_common_gemm_constants.h b/xformers/csrc/attention/hip_fmha/ck_fmha_common_gemm_constants.h
index 6362916ae..1fdabf29f 100644
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_common_gemm_constants.h
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_common_gemm_constants.h
@@ -10,18 +10,19 @@
 #include "ck_fmha_op_helper.h"
 
 // list the template parameters that is commonly used
-struct GemmOpConstantsCommon
-{
-    static constexpr ck::index_t NumDimG = 2;
-    static constexpr ck::index_t NumDimM = 1;
-    static constexpr ck::index_t NumDimN = 1;
-    static constexpr ck::index_t NumDimK = 1;
-    static constexpr ck::index_t NumDimO = 1;
+struct GemmOpConstantsCommon {
+  static constexpr ck::index_t NumDimG = 2;
+  static constexpr ck::index_t NumDimM = 1;
+  static constexpr ck::index_t NumDimN = 1;
+  static constexpr ck::index_t NumDimK = 1;
+  static constexpr ck::index_t NumDimO = 1;
 
-    static constexpr auto TensorSpecA = ck::tensor_operation::device::TensorSpecialization::Default;
-    static constexpr auto TensorSpecB0 =
-        ck::tensor_operation::device::TensorSpecialization::Default;
-    static constexpr auto TensorSpecB1 =
-        ck::tensor_operation::device::TensorSpecialization::Default;
-    static constexpr auto TensorSpecC = ck::tensor_operation::device::TensorSpecialization::Default;
+  static constexpr auto TensorSpecA =
+      ck::tensor_operation::device::TensorSpecialization::Default;
+  static constexpr auto TensorSpecB0 =
+      ck::tensor_operation::device::TensorSpecialization::Default;
+  static constexpr auto TensorSpecB1 =
+      ck::tensor_operation::device::TensorSpecialization::Default;
+  static constexpr auto TensorSpecC =
+      ck::tensor_operation::device::TensorSpecialization::Default;
 };
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_backward.h b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_backward.h
index 2fb06ddd8..b2866cc4f 100644
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_backward.h
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_backward.h
@@ -24,56 +24,60 @@
 #include "ck_fmha_op_helper.h"
 #include "ck_fmha_params.h"
 
-template <typename scalar_t, int32_t custom_mask_type, bool has_attn_bias, bool use_fp32_qkv_grad>
-struct grouped_backward_masktype_attnbias_dispatched
-{
-    using PassThrough = ck::tensor_operation::element_wise::PassThrough;
-    using Scale       = ck::tensor_operation::element_wise::Scale;
-
-    using QKVElementOp = PassThrough;
-    using YElementOp   = PassThrough;
-
-    using InputDataType    = scalar_t;
-    using OutputDataType   = typename std::conditional<use_fp32_qkv_grad, F32, scalar_t>::type;
-    using GemmDataType     = scalar_t;
-    using AccDataType      = F32;
-    using ShuffleDataType  = F32;
-    using LSEDataType      = F32;
-    using ZDataType        = unsigned short;
-    using Acc0BiasDataType = typename std::conditional<has_attn_bias, scalar_t, void>::type;
-    using Acc1BiasDataType = void;
-
-    static constexpr auto GemmSpec = ck::tensor_operation::device::GemmSpecialization::MNKOPadding;
-    static constexpr auto MaskingSpec =
-        static_cast<ck::tensor_operation::device::MaskingSpecialization>(custom_mask_type);
-
-    static constexpr bool Deterministic = true;
-
-    static constexpr ck::index_t kAcc0BiasTransferSrcScalarPerVector = 1;
+template <
+    typename scalar_t,
+    int32_t custom_mask_type,
+    bool has_attn_bias,
+    bool use_fp32_qkv_grad>
+struct grouped_backward_masktype_attnbias_dispatched {
+  using PassThrough = ck::tensor_operation::element_wise::PassThrough;
+  using Scale = ck::tensor_operation::element_wise::Scale;
+
+  using QKVElementOp = PassThrough;
+  using YElementOp = PassThrough;
+
+  using InputDataType = scalar_t;
+  using OutputDataType =
+      typename std::conditional<use_fp32_qkv_grad, F32, scalar_t>::type;
+  using GemmDataType = scalar_t;
+  using AccDataType = F32;
+  using ShuffleDataType = F32;
+  using LSEDataType = F32;
+  using ZDataType = unsigned short;
+  using Acc0BiasDataType =
+      typename std::conditional<has_attn_bias, scalar_t, void>::type;
+  using Acc1BiasDataType = void;
+
+  static constexpr auto GemmSpec =
+      ck::tensor_operation::device::GemmSpecialization::MNKOPadding;
+  static constexpr auto MaskingSpec =
+      static_cast<ck::tensor_operation::device::MaskingSpecialization>(
+          custom_mask_type);
+
+  static constexpr bool Deterministic = true;
+
+  static constexpr ck::index_t kAcc0BiasTransferSrcScalarPerVector = 1;
 
 #ifndef GROUPED_BACKWARD_V1_HEADDIM_SWITCH
-#define GROUPED_BACKWARD_V1_HEADDIM_SWITCH(HEAD_DIM1, HEAD_DIM2, ...)              \
-    [&] {                                                                          \
-        if(HEAD_DIM1 <= 32 && HEAD_DIM2 <= 32)                                     \
-        {                                                                          \
-            constexpr ck::index_t kGemm1NPerBlock                = 32;             \
-            constexpr ck::index_t kGemm1NXdlPerWave              = 1;              \
-            constexpr ck::index_t kCShuffleNXdlPerWavePerShuffle = 1;              \
-            using kCShuffleBlockTransferClusterLengths           = S<1, 64, 1, 4>; \
-            __VA_ARGS__();                                                         \
-        }                                                                          \
-        else                                                                       \
-        {                                                                          \
-            constexpr ck::index_t kGemm1NPerBlock                = 64;             \
-            constexpr ck::index_t kGemm1NXdlPerWave              = 2;              \
-            constexpr ck::index_t kCShuffleNXdlPerWavePerShuffle = 2;              \
-            using kCShuffleBlockTransferClusterLengths           = S<1, 32, 1, 8>; \
-            __VA_ARGS__();                                                         \
-        };                                                                         \
-    }()
+#define GROUPED_BACKWARD_V1_HEADDIM_SWITCH(HEAD_DIM1, HEAD_DIM2, ...) \
+  [&] {                                                               \
+    if (HEAD_DIM1 <= 32 && HEAD_DIM2 <= 32) {                         \
+      constexpr ck::index_t kGemm1NPerBlock = 32;                     \
+      constexpr ck::index_t kGemm1NXdlPerWave = 1;                    \
+      constexpr ck::index_t kCShuffleNXdlPerWavePerShuffle = 1;       \
+      using kCShuffleBlockTransferClusterLengths = S<1, 64, 1, 4>;    \
+      __VA_ARGS__();                                                  \
+    } else {                                                          \
+      constexpr ck::index_t kGemm1NPerBlock = 64;                     \
+      constexpr ck::index_t kGemm1NXdlPerWave = 2;                    \
+      constexpr ck::index_t kCShuffleNXdlPerWavePerShuffle = 2;       \
+      using kCShuffleBlockTransferClusterLengths = S<1, 32, 1, 8>;    \
+      __VA_ARGS__();                                                  \
+    };                                                                \
+  }()
 #endif
 
-    // clang-format off
+  // clang-format off
   template <
       ck::index_t kGemm1NPerBlock,
       ck::index_t kGemm1NXdlPerWave,
@@ -144,9 +148,9 @@ struct grouped_backward_masktype_attnbias_dispatched
           kCShuffleBlockTransferScalarPerVector,
           MaskingSpec,
           Deterministic>;
-    // clang-format on
+  // clang-format on
 
-    // clang-format off
+  // clang-format off
   template <
       ck::index_t kGemm1NPerBlock,
       ck::index_t kGemm1NXdlPerWave,
@@ -225,294 +229,297 @@ struct grouped_backward_masktype_attnbias_dispatched
           kCShuffleBlockTransferScalarPerVector,
           MaskingSpec,
           Deterministic>;
-    // clang-format on
-
-    static constexpr auto I1 = ck::Number<1>{};
-    static constexpr auto I2 = ck::Number<2>{};
-    static constexpr auto I3 = ck::Number<3>{};
-
-    static void Run(GroupedBackwardParams& param, hipStream_t stream)
-    {
-        using ck::math::min;
-
-        if(param.K <= 64 && param.Kv <= 64)
-        {
-            // compile-time constants which don't depend on head-dim switching
-            constexpr ck::index_t thread_slice_length_ak1 =
-                GemmOpConstantsGroupedBackward_V1::AK1 /
-                GemmOpConstantsGroupedBackward_V1::ABlockTransferThreadClusterLengths_AK0_M_AK1::At(
-                    I2);
-            constexpr ck::index_t thread_slice_length_bk1 =
-                GemmOpConstantsGroupedBackward_V1::BK1 /
-                GemmOpConstantsGroupedBackward_V1::BBlockTransferThreadClusterLengths_BK0_N_BK1::At(
-                    I2);
-
-            static_assert(thread_slice_length_ak1 == thread_slice_length_bk1,
-                          "ABlockTransfer and BBlockTransfer should use completely same K1 sizes "
-                          "and ThreadClusterLengths!");
-
-            constexpr ck::index_t kABBlockTransferSrcScalarPerVector_max =
-                min(2, thread_slice_length_ak1);
-
-            GROUPED_BACKWARD_V1_HEADDIM_SWITCH(param.K, param.Kv, [&] {
-                constexpr ck::index_t thread_slice_length_cshuflle_n =
-                    (kCShuffleNXdlPerWavePerShuffle * kGemm1NPerBlock / kGemm1NXdlPerWave) /
-                    kCShuffleBlockTransferClusterLengths::At(I3);
-
-                constexpr ck::index_t kCShuffleBlockTransferScalarPerVector_max =
-                    min(2, thread_slice_length_cshuflle_n);
-
-                ALIGN_SWITCH_2(
-                    kABBlockTransferSrcScalarPerVector_max,
-                    kABBlockTransferSrcScalarPerVector,
-                    param.K,
-                    kCShuffleBlockTransferScalarPerVector_max,
-                    kCShuffleBlockTransferScalarPerVector,
-                    param.Kv,
-                    [&] {
-                        using DeviceOpInstance =
-                            DeviceOpInstanceTemp_V1<kGemm1NPerBlock,
-                                                    kGemm1NXdlPerWave,
-                                                    kCShuffleNXdlPerWavePerShuffle,
-                                                    kCShuffleBlockTransferClusterLengths,
-                                                    kABBlockTransferSrcScalarPerVector,
-                                                    kCShuffleBlockTransferScalarPerVector>;
-
-                        RunWithDeviceOp<DeviceOpInstance>(param, stream);
-                    });
+  // clang-format on
+
+  static constexpr auto I1 = ck::Number<1>{};
+  static constexpr auto I2 = ck::Number<2>{};
+  static constexpr auto I3 = ck::Number<3>{};
+
+  static void Run(GroupedBackwardParams& param, hipStream_t stream) {
+    using ck::math::min;
+
+    if (param.K <= 64 && param.Kv <= 64) {
+      // compile-time constants which don't depend on head-dim switching
+      constexpr ck::index_t thread_slice_length_ak1 =
+          GemmOpConstantsGroupedBackward_V1::AK1 /
+          GemmOpConstantsGroupedBackward_V1::
+              ABlockTransferThreadClusterLengths_AK0_M_AK1::At(I2);
+      constexpr ck::index_t thread_slice_length_bk1 =
+          GemmOpConstantsGroupedBackward_V1::BK1 /
+          GemmOpConstantsGroupedBackward_V1::
+              BBlockTransferThreadClusterLengths_BK0_N_BK1::At(I2);
+
+      static_assert(
+          thread_slice_length_ak1 == thread_slice_length_bk1,
+          "ABlockTransfer and BBlockTransfer should use completely same K1 sizes "
+          "and ThreadClusterLengths!");
+
+      constexpr ck::index_t kABBlockTransferSrcScalarPerVector_max =
+          min(2, thread_slice_length_ak1);
+
+      GROUPED_BACKWARD_V1_HEADDIM_SWITCH(param.K, param.Kv, [&] {
+        constexpr ck::index_t thread_slice_length_cshuflle_n =
+            (kCShuffleNXdlPerWavePerShuffle * kGemm1NPerBlock /
+             kGemm1NXdlPerWave) /
+            kCShuffleBlockTransferClusterLengths::At(I3);
+
+        constexpr ck::index_t kCShuffleBlockTransferScalarPerVector_max =
+            min(2, thread_slice_length_cshuflle_n);
+
+        ALIGN_SWITCH_2(
+            kABBlockTransferSrcScalarPerVector_max,
+            kABBlockTransferSrcScalarPerVector,
+            param.K,
+            kCShuffleBlockTransferScalarPerVector_max,
+            kCShuffleBlockTransferScalarPerVector,
+            param.Kv,
+            [&] {
+              using DeviceOpInstance = DeviceOpInstanceTemp_V1<
+                  kGemm1NPerBlock,
+                  kGemm1NXdlPerWave,
+                  kCShuffleNXdlPerWavePerShuffle,
+                  kCShuffleBlockTransferClusterLengths,
+                  kABBlockTransferSrcScalarPerVector,
+                  kCShuffleBlockTransferScalarPerVector>;
+
+              RunWithDeviceOp<DeviceOpInstance>(param, stream);
             });
-        }
-        else
-        {
-            constexpr ck::index_t kGemm1NPerBlock                = 128;
-            constexpr ck::index_t kGemm1NXdlPerWave              = 4;
-            constexpr ck::index_t kCShuffleNXdlPerWavePerShuffle = 4;
-            using kCShuffleBlockTransferClusterLengths           = S<1, 32, 1, 8>;
-
-            constexpr ck::index_t thread_slice_length_ak1 =
-                GemmOpConstantsGroupedBackward_V2::AK1 /
-                GemmOpConstantsGroupedBackward_V2::ABlockTransferThreadClusterLengths_AK0_M_AK1::At(
-                    I2);
-            constexpr ck::index_t thread_slice_length_bk1 =
-                GemmOpConstantsGroupedBackward_V2::BK1 /
-                GemmOpConstantsGroupedBackward_V2::BBlockTransferThreadClusterLengths_BK0_N_BK1::At(
-                    I2);
-
-            static_assert(thread_slice_length_ak1 == thread_slice_length_bk1,
-                          "ABlockTransfer and BBlockTransfer should use completely same K1 sizes "
-                          "and ThreadClusterLengths!");
-
-            constexpr ck::index_t kABBlockTransferSrcScalarPerVector_max =
-                min(2, thread_slice_length_ak1);
-
-            constexpr ck::index_t thread_slice_length_gemm1n =
-                kGemm1NPerBlock / GemmOpConstantsGroupedBackward_V2::
-                                      B1BlockTransferThreadClusterLengths_BK0_N_BK1::At(I1);
-            constexpr ck::index_t kB1BlockTransferSrcScalarPerVector_max =
-                min(2, thread_slice_length_gemm1n);
-
-            constexpr ck::index_t thread_slice_length_cshuflle_n =
-                (kCShuffleNXdlPerWavePerShuffle * kGemm1NPerBlock / kGemm1NXdlPerWave) /
-                kCShuffleBlockTransferClusterLengths::At(I3);
-
-            constexpr ck::index_t kCShuffleBlockTransferScalarPerVector_max =
-                min(2, thread_slice_length_cshuflle_n);
-
-            if constexpr(kB1BlockTransferSrcScalarPerVector_max >=
-                         kCShuffleBlockTransferScalarPerVector_max)
-            {
-                ALIGN_SWITCH_2(
-                    kABBlockTransferSrcScalarPerVector_max,
-                    kABBlockTransferSrcScalarPerVector,
-                    param.K,
-                    kB1BlockTransferSrcScalarPerVector_max,
-                    kB1BlockTransferSrcScalarPerVector,
-                    param.Kv,
-                    [&] {
-                        constexpr ck::index_t kCShuffleBlockTransferScalarPerVector =
-                            min(kB1BlockTransferSrcScalarPerVector,
-                                kCShuffleBlockTransferScalarPerVector_max);
-                        using DeviceOpInstance =
-                            DeviceOpInstanceTemp_V2<kGemm1NPerBlock,
-                                                    kGemm1NXdlPerWave,
-                                                    kCShuffleNXdlPerWavePerShuffle,
-                                                    kCShuffleBlockTransferClusterLengths,
-                                                    kABBlockTransferSrcScalarPerVector,
-                                                    kB1BlockTransferSrcScalarPerVector,
-                                                    kCShuffleBlockTransferScalarPerVector>;
-
-                        RunWithDeviceOp<DeviceOpInstance>(param, stream);
-                    });
-            }
-            else
-            {
-                ALIGN_SWITCH_2(
-                    kABBlockTransferSrcScalarPerVector_max,
-                    kABBlockTransferSrcScalarPerVector,
-                    param.K,
-                    kCShuffleBlockTransferScalarPerVector_max,
-                    kCShuffleBlockTransferScalarPerVector,
-                    param.Kv,
-                    [&] {
-                        constexpr ck::index_t kB1BlockTransferSrcScalarPerVector =
-                            min(kCShuffleBlockTransferScalarPerVector,
-                                kB1BlockTransferSrcScalarPerVector_max);
-                        using DeviceOpInstance =
-                            DeviceOpInstanceTemp_V2<kGemm1NPerBlock,
-                                                    kGemm1NXdlPerWave,
-                                                    kCShuffleNXdlPerWavePerShuffle,
-                                                    kCShuffleBlockTransferClusterLengths,
-                                                    kABBlockTransferSrcScalarPerVector,
-                                                    kB1BlockTransferSrcScalarPerVector,
-                                                    kCShuffleBlockTransferScalarPerVector>;
-
-                        RunWithDeviceOp<DeviceOpInstance>(param, stream);
-                    });
-            };
-        };
-    };
-
-    template <typename DeviceOpInstance>
-    static void RunWithDeviceOp(GroupedBackwardParams& param, hipStream_t stream)
-    {
-        // Tunables
-        std::vector<typename DeviceOpInstance::ProblemDesc> problem_descs;
-
-        for(std::size_t i = 0; i < param.num_batches; i++)
-        {
-            int M = param.host_seqstart_q[i + 1] - param.host_seqstart_q[i]; // seqlen Q
-            int N = param.host_seqlen_k.empty()
-                        ? param.host_seqstart_k[i + 1] - param.host_seqstart_k[i]
-                        : param.host_seqlen_k[i];
-            int K    = param.K;
-            int Kv   = param.Kv;
-            int G1q  = param.Hq;
-            int G1kv = param.Hkv;
-
-            std::vector<ck::index_t> q_gs_ms_ks_lengths{1, G1q, M, K};
-            std::vector<ck::index_t> q_gs_ms_ks_strides{
-                0, param.q_strides[1], param.q_strides[0], param.q_strides[2]};
-
-            std::vector<ck::index_t> k_gs_ns_ks_lengths{1, G1kv, N, K};
-            std::vector<ck::index_t> k_gs_ns_ks_strides{
-                0, param.k_strides[1], param.k_strides[0], param.k_strides[2]};
-
-            std::vector<ck::index_t> kgrad_gs_ns_ks_lengths = {1, G1q, N, K};
-            std::vector<ck::index_t> kgrad_gs_ns_ks_strides = {0,
-                                                               param.tmp_grad_k_strides[1],
-                                                               param.tmp_grad_k_strides[0],
-                                                               param.tmp_grad_k_strides[2]};
-
-            // to be changed to v_gs_ns_os_lengths
-            std::vector<ck::index_t> v_gs_os_ns_lengths{1, G1kv, Kv, N};
-            std::vector<ck::index_t> v_gs_os_ns_strides{
-                0, param.v_strides[1], param.v_strides[2], param.v_strides[0]};
-
-            std::vector<ck::index_t> vgrad_gs_os_ns_lengths = {1, G1q, Kv, N};
-            std::vector<ck::index_t> vgrad_gs_os_ns_strides = {0,
-                                                               param.tmp_grad_v_strides[1],
-                                                               param.tmp_grad_v_strides[2],
-                                                               param.tmp_grad_v_strides[0]};
-
-            std::vector<ck::index_t> y_gs_ms_os_lengths{1, G1q, M, Kv};
-            std::vector<ck::index_t> y_gs_ms_os_strides{
-                0, param.out_strides[1], param.out_strides[0], param.out_strides[2]};
-
-            std::vector<ck::index_t> lse_gs_ms_lengths{1, G1q, M};
-            std::vector<ck::index_t> lse_gs_ms_strides{0, param.max_seqlen_q, 1};
-
-            std::vector<ck::index_t> d_gs_ms_ns_lengths;
-            std::vector<ck::index_t> d_gs_ms_ns_strides;
-
-            if constexpr(has_attn_bias)
-            {
-                d_gs_ms_ns_lengths = {1, G1q, M, N};
-                d_gs_ms_ns_strides = {0,
-                                      param.attn_bias_strides[0],
-                                      param.attn_bias_strides[1],
-                                      param.attn_bias_strides[2]};
-            }
-            else
-            {
-                d_gs_ms_ns_lengths = {1, 1, 1, 1};
-                d_gs_ms_ns_strides = {0, 0, 0, 0};
-            };
-
-            problem_descs.push_back({
-                q_gs_ms_ks_lengths, // q, dQ should have same shape
-                q_gs_ms_ks_strides,
-                k_gs_ns_ks_lengths, // k, dK should have same shape
-                k_gs_ns_ks_strides,
-                {1, 1, 1, 1},
-                {0, 0, 0, 0},
-                v_gs_os_ns_lengths, // v, dV should have same shape
-                v_gs_os_ns_strides,
-                y_gs_ms_os_lengths, // y, dY should have same shape
-                y_gs_ms_os_strides,
-                lse_gs_ms_lengths,
-                lse_gs_ms_strides,
-                param.is_mqa_gqa ? kgrad_gs_ns_ks_lengths : k_gs_ns_ks_lengths,
-                param.is_mqa_gqa ? kgrad_gs_ns_ks_strides : k_gs_ns_ks_strides,
-                param.is_mqa_gqa ? vgrad_gs_os_ns_lengths : v_gs_os_ns_lengths,
-                param.is_mqa_gqa ? vgrad_gs_os_ns_strides : v_gs_os_ns_strides,
-                d_gs_ms_ns_lengths, // bias, grad_bias should have same shape
-                d_gs_ms_ns_strides,
-                {}, // acc1_biases_gs_ms_os_lengths
-                {}, // acc1_biases_gs_ms_os_strides
+      });
+    } else {
+      constexpr ck::index_t kGemm1NPerBlock = 128;
+      constexpr ck::index_t kGemm1NXdlPerWave = 4;
+      constexpr ck::index_t kCShuffleNXdlPerWavePerShuffle = 4;
+      using kCShuffleBlockTransferClusterLengths = S<1, 32, 1, 8>;
+
+      constexpr ck::index_t thread_slice_length_ak1 =
+          GemmOpConstantsGroupedBackward_V2::AK1 /
+          GemmOpConstantsGroupedBackward_V2::
+              ABlockTransferThreadClusterLengths_AK0_M_AK1::At(I2);
+      constexpr ck::index_t thread_slice_length_bk1 =
+          GemmOpConstantsGroupedBackward_V2::BK1 /
+          GemmOpConstantsGroupedBackward_V2::
+              BBlockTransferThreadClusterLengths_BK0_N_BK1::At(I2);
+
+      static_assert(
+          thread_slice_length_ak1 == thread_slice_length_bk1,
+          "ABlockTransfer and BBlockTransfer should use completely same K1 sizes "
+          "and ThreadClusterLengths!");
+
+      constexpr ck::index_t kABBlockTransferSrcScalarPerVector_max =
+          min(2, thread_slice_length_ak1);
+
+      constexpr ck::index_t thread_slice_length_gemm1n = kGemm1NPerBlock /
+          GemmOpConstantsGroupedBackward_V2::
+              B1BlockTransferThreadClusterLengths_BK0_N_BK1::At(I1);
+      constexpr ck::index_t kB1BlockTransferSrcScalarPerVector_max =
+          min(2, thread_slice_length_gemm1n);
+
+      constexpr ck::index_t thread_slice_length_cshuflle_n =
+          (kCShuffleNXdlPerWavePerShuffle * kGemm1NPerBlock /
+           kGemm1NXdlPerWave) /
+          kCShuffleBlockTransferClusterLengths::At(I3);
+
+      constexpr ck::index_t kCShuffleBlockTransferScalarPerVector_max =
+          min(2, thread_slice_length_cshuflle_n);
+
+      if constexpr (
+          kB1BlockTransferSrcScalarPerVector_max >=
+          kCShuffleBlockTransferScalarPerVector_max) {
+        ALIGN_SWITCH_2(
+            kABBlockTransferSrcScalarPerVector_max,
+            kABBlockTransferSrcScalarPerVector,
+            param.K,
+            kB1BlockTransferSrcScalarPerVector_max,
+            kB1BlockTransferSrcScalarPerVector,
+            param.Kv,
+            [&] {
+              constexpr ck::index_t kCShuffleBlockTransferScalarPerVector =
+                  min(kB1BlockTransferSrcScalarPerVector,
+                      kCShuffleBlockTransferScalarPerVector_max);
+              using DeviceOpInstance = DeviceOpInstanceTemp_V2<
+                  kGemm1NPerBlock,
+                  kGemm1NXdlPerWave,
+                  kCShuffleNXdlPerWavePerShuffle,
+                  kCShuffleBlockTransferClusterLengths,
+                  kABBlockTransferSrcScalarPerVector,
+                  kB1BlockTransferSrcScalarPerVector,
+                  kCShuffleBlockTransferScalarPerVector>;
+
+              RunWithDeviceOp<DeviceOpInstance>(param, stream);
+            });
+      } else {
+        ALIGN_SWITCH_2(
+            kABBlockTransferSrcScalarPerVector_max,
+            kABBlockTransferSrcScalarPerVector,
+            param.K,
+            kCShuffleBlockTransferScalarPerVector_max,
+            kCShuffleBlockTransferScalarPerVector,
+            param.Kv,
+            [&] {
+              constexpr ck::index_t kB1BlockTransferSrcScalarPerVector =
+                  min(kCShuffleBlockTransferScalarPerVector,
+                      kB1BlockTransferSrcScalarPerVector_max);
+              using DeviceOpInstance = DeviceOpInstanceTemp_V2<
+                  kGemm1NPerBlock,
+                  kGemm1NXdlPerWave,
+                  kCShuffleNXdlPerWavePerShuffle,
+                  kCShuffleBlockTransferClusterLengths,
+                  kABBlockTransferSrcScalarPerVector,
+                  kB1BlockTransferSrcScalarPerVector,
+                  kCShuffleBlockTransferScalarPerVector>;
+
+              RunWithDeviceOp<DeviceOpInstance>(param, stream);
             });
-        }
-
-        float alpha = param.scale;
-
-        auto op      = DeviceOpInstance{};
-        auto invoker = op.MakeInvoker();
-
-        auto arg_ptr = op.MakeArgumentPointer(
-            param.q_ptrs,
-            param.k_ptrs,
-            param.randvals_ptrs,
-            param.v_ptrs,
-            param.out_ptrs,
-            param.logsumexp_ptrs,
-            param.grad_out_ptrs,
-            param.grad_q_ptrs,
-            param.grad_k_ptrs,
-            param.grad_v_ptrs,
-            param.attn_bias_ptrs,
-            {}, // p_acc1_bias_vec;
-            param.grad_bias_ptrs,
-            {},
-            problem_descs,
-            QKVElementOp{},
-            QKVElementOp{},
-            Scale{alpha},
-            QKVElementOp{},
-            YElementOp{},
-            param.dropout_prob,
-            std::tuple<int64_t, int64_t>(param.philox_seed, param.philox_offset));
-
-        SimpleDeviceMem workspace(op.GetWorkSpaceSize(arg_ptr.get()));
-
-        op.SetWorkSpacePointer(arg_ptr.get(), workspace.GetDeviceBuffer());
-
-        if(!op.IsSupportedArgument(arg_ptr.get()))
-        {
-            std::ostringstream ostr;
-            ostr << op.GetTypeString() << " does not support this problem";
-
-            throw std::runtime_error(ostr.str());
-        }
-
-        (void)invoker.Run(arg_ptr.get(), StreamConfig{stream, false});
+      };
     };
+  };
+
+  template <typename DeviceOpInstance>
+  static void RunWithDeviceOp(
+      GroupedBackwardParams& param,
+      hipStream_t stream) {
+    // Tunables
+    std::vector<typename DeviceOpInstance::ProblemDesc> problem_descs;
+
+    for (std::size_t i = 0; i < param.num_batches; i++) {
+      int M =
+          param.host_seqstart_q[i + 1] - param.host_seqstart_q[i]; // seqlen Q
+      int N = param.host_seqlen_k.empty()
+          ? param.host_seqstart_k[i + 1] - param.host_seqstart_k[i]
+          : param.host_seqlen_k[i];
+      int K = param.K;
+      int Kv = param.Kv;
+      int G1q = param.Hq;
+      int G1kv = param.Hkv;
+
+      std::vector<ck::index_t> q_gs_ms_ks_lengths{1, G1q, M, K};
+      std::vector<ck::index_t> q_gs_ms_ks_strides{
+          0, param.q_strides[1], param.q_strides[0], param.q_strides[2]};
+
+      std::vector<ck::index_t> k_gs_ns_ks_lengths{1, G1kv, N, K};
+      std::vector<ck::index_t> k_gs_ns_ks_strides{
+          0, param.k_strides[1], param.k_strides[0], param.k_strides[2]};
+
+      std::vector<ck::index_t> kgrad_gs_ns_ks_lengths = {1, G1q, N, K};
+      std::vector<ck::index_t> kgrad_gs_ns_ks_strides = {
+          0,
+          param.tmp_grad_k_strides[1],
+          param.tmp_grad_k_strides[0],
+          param.tmp_grad_k_strides[2]};
+
+      // to be changed to v_gs_ns_os_lengths
+      std::vector<ck::index_t> v_gs_os_ns_lengths{1, G1kv, Kv, N};
+      std::vector<ck::index_t> v_gs_os_ns_strides{
+          0, param.v_strides[1], param.v_strides[2], param.v_strides[0]};
+
+      std::vector<ck::index_t> vgrad_gs_os_ns_lengths = {1, G1q, Kv, N};
+      std::vector<ck::index_t> vgrad_gs_os_ns_strides = {
+          0,
+          param.tmp_grad_v_strides[1],
+          param.tmp_grad_v_strides[2],
+          param.tmp_grad_v_strides[0]};
+
+      std::vector<ck::index_t> y_gs_ms_os_lengths{1, G1q, M, Kv};
+      std::vector<ck::index_t> y_gs_ms_os_strides{
+          0, param.out_strides[1], param.out_strides[0], param.out_strides[2]};
+
+      std::vector<ck::index_t> lse_gs_ms_lengths{1, G1q, M};
+      std::vector<ck::index_t> lse_gs_ms_strides{0, param.max_seqlen_q, 1};
+
+      std::vector<ck::index_t> d_gs_ms_ns_lengths;
+      std::vector<ck::index_t> d_gs_ms_ns_strides;
+
+      if constexpr (has_attn_bias) {
+        d_gs_ms_ns_lengths = {1, G1q, M, N};
+        d_gs_ms_ns_strides = {
+            0,
+            param.attn_bias_strides[0],
+            param.attn_bias_strides[1],
+            param.attn_bias_strides[2]};
+      } else {
+        d_gs_ms_ns_lengths = {1, 1, 1, 1};
+        d_gs_ms_ns_strides = {0, 0, 0, 0};
+      };
+
+      problem_descs.push_back({
+          q_gs_ms_ks_lengths, // q, dQ should have same shape
+          q_gs_ms_ks_strides,
+          k_gs_ns_ks_lengths, // k, dK should have same shape
+          k_gs_ns_ks_strides,
+          {1, 1, 1, 1},
+          {0, 0, 0, 0},
+          v_gs_os_ns_lengths, // v, dV should have same shape
+          v_gs_os_ns_strides,
+          y_gs_ms_os_lengths, // y, dY should have same shape
+          y_gs_ms_os_strides,
+          lse_gs_ms_lengths,
+          lse_gs_ms_strides,
+          param.is_mqa_gqa ? kgrad_gs_ns_ks_lengths : k_gs_ns_ks_lengths,
+          param.is_mqa_gqa ? kgrad_gs_ns_ks_strides : k_gs_ns_ks_strides,
+          param.is_mqa_gqa ? vgrad_gs_os_ns_lengths : v_gs_os_ns_lengths,
+          param.is_mqa_gqa ? vgrad_gs_os_ns_strides : v_gs_os_ns_strides,
+          d_gs_ms_ns_lengths, // bias, grad_bias should have same shape
+          d_gs_ms_ns_strides,
+          {}, // acc1_biases_gs_ms_os_lengths
+          {}, // acc1_biases_gs_ms_os_strides
+      });
+    }
+
+    float alpha = param.scale;
+
+    auto op = DeviceOpInstance{};
+    auto invoker = op.MakeInvoker();
+
+    auto arg_ptr = op.MakeArgumentPointer(
+        param.q_ptrs,
+        param.k_ptrs,
+        param.randvals_ptrs,
+        param.v_ptrs,
+        param.out_ptrs,
+        param.logsumexp_ptrs,
+        param.grad_out_ptrs,
+        param.grad_q_ptrs,
+        param.grad_k_ptrs,
+        param.grad_v_ptrs,
+        param.attn_bias_ptrs,
+        {}, // p_acc1_bias_vec;
+        param.grad_bias_ptrs,
+        {},
+        problem_descs,
+        QKVElementOp{},
+        QKVElementOp{},
+        Scale{alpha},
+        QKVElementOp{},
+        YElementOp{},
+        param.dropout_prob,
+        std::tuple<int64_t, int64_t>(param.philox_seed, param.philox_offset));
+
+    SimpleDeviceMem workspace(op.GetWorkSpaceSize(arg_ptr.get()));
+
+    op.SetWorkSpacePointer(arg_ptr.get(), workspace.GetDeviceBuffer());
+
+    if (!op.IsSupportedArgument(arg_ptr.get())) {
+      std::ostringstream ostr;
+      ostr << op.GetTypeString() << " does not support this problem";
+
+      throw std::runtime_error(ostr.str());
+    }
+
+    (void)invoker.Run(arg_ptr.get(), StreamConfig{stream, false});
+  };
 };
 
-template <typename scalar_t, int32_t custom_mask_type, bool has_attn_bias, bool use_fp32_qkv_grad>
-void run_grouped_backward_masktype_attnbias_dispatched(GroupedBackwardParams& param,
-                                                       hipStream_t stream)
-{
-    grouped_backward_masktype_attnbias_dispatched<scalar_t,
-                                                  custom_mask_type,
-                                                  has_attn_bias,
-                                                  use_fp32_qkv_grad>::Run(param, stream);
+template <
+    typename scalar_t,
+    int32_t custom_mask_type,
+    bool has_attn_bias,
+    bool use_fp32_qkv_grad>
+void run_grouped_backward_masktype_attnbias_dispatched(
+    GroupedBackwardParams& param,
+    hipStream_t stream) {
+  grouped_backward_masktype_attnbias_dispatched<
+      scalar_t,
+      custom_mask_type,
+      has_attn_bias,
+      use_fp32_qkv_grad>::Run(param, stream);
 };
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_backward_bp16.cpp b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_backward_bp16.cpp
index 7d4458899..0e3f4f8fa 100644
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_backward_bp16.cpp
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_backward_bp16.cpp
@@ -10,71 +10,104 @@
 #include "ck_bool_switch.h"
 #include "ck_fmha_grouped_backward.h"
 
-extern template void run_grouped_backward_masktype_attnbias_dispatched<ck::bhalf_t, 0, true, true>(
-    GroupedBackwardParams& param, hipStream_t stream);
+extern template void run_grouped_backward_masktype_attnbias_dispatched<
+    ck::bhalf_t,
+    0,
+    true,
+    true>(GroupedBackwardParams& param, hipStream_t stream);
 
-extern template void run_grouped_backward_masktype_attnbias_dispatched<ck::bhalf_t, 0, true, false>(
-    GroupedBackwardParams& param, hipStream_t stream);
+extern template void run_grouped_backward_masktype_attnbias_dispatched<
+    ck::bhalf_t,
+    0,
+    true,
+    false>(GroupedBackwardParams& param, hipStream_t stream);
 
-extern template void run_grouped_backward_masktype_attnbias_dispatched<ck::bhalf_t, 0, false, true>(
-    GroupedBackwardParams& param, hipStream_t stream);
+extern template void run_grouped_backward_masktype_attnbias_dispatched<
+    ck::bhalf_t,
+    0,
+    false,
+    true>(GroupedBackwardParams& param, hipStream_t stream);
 
-extern template void
-run_grouped_backward_masktype_attnbias_dispatched<ck::bhalf_t, 0, false, false>(
-    GroupedBackwardParams& param, hipStream_t stream);
+extern template void run_grouped_backward_masktype_attnbias_dispatched<
+    ck::bhalf_t,
+    0,
+    false,
+    false>(GroupedBackwardParams& param, hipStream_t stream);
 
-extern template void run_grouped_backward_masktype_attnbias_dispatched<ck::bhalf_t, 1, true, true>(
-    GroupedBackwardParams& param, hipStream_t stream);
+extern template void run_grouped_backward_masktype_attnbias_dispatched<
+    ck::bhalf_t,
+    1,
+    true,
+    true>(GroupedBackwardParams& param, hipStream_t stream);
 
-extern template void run_grouped_backward_masktype_attnbias_dispatched<ck::bhalf_t, 1, true, false>(
-    GroupedBackwardParams& param, hipStream_t stream);
+extern template void run_grouped_backward_masktype_attnbias_dispatched<
+    ck::bhalf_t,
+    1,
+    true,
+    false>(GroupedBackwardParams& param, hipStream_t stream);
 
-extern template void run_grouped_backward_masktype_attnbias_dispatched<ck::bhalf_t, 1, false, true>(
-    GroupedBackwardParams& param, hipStream_t stream);
+extern template void run_grouped_backward_masktype_attnbias_dispatched<
+    ck::bhalf_t,
+    1,
+    false,
+    true>(GroupedBackwardParams& param, hipStream_t stream);
 
-extern template void
-run_grouped_backward_masktype_attnbias_dispatched<ck::bhalf_t, 1, false, false>(
-    GroupedBackwardParams& param, hipStream_t stream);
+extern template void run_grouped_backward_masktype_attnbias_dispatched<
+    ck::bhalf_t,
+    1,
+    false,
+    false>(GroupedBackwardParams& param, hipStream_t stream);
 
-extern template void run_grouped_backward_masktype_attnbias_dispatched<ck::bhalf_t, 2, true, true>(
-    GroupedBackwardParams& param, hipStream_t stream);
+extern template void run_grouped_backward_masktype_attnbias_dispatched<
+    ck::bhalf_t,
+    2,
+    true,
+    true>(GroupedBackwardParams& param, hipStream_t stream);
 
-extern template void run_grouped_backward_masktype_attnbias_dispatched<ck::bhalf_t, 2, true, false>(
-    GroupedBackwardParams& param, hipStream_t stream);
+extern template void run_grouped_backward_masktype_attnbias_dispatched<
+    ck::bhalf_t,
+    2,
+    true,
+    false>(GroupedBackwardParams& param, hipStream_t stream);
 
-extern template void run_grouped_backward_masktype_attnbias_dispatched<ck::bhalf_t, 2, false, true>(
-    GroupedBackwardParams& param, hipStream_t stream);
+extern template void run_grouped_backward_masktype_attnbias_dispatched<
+    ck::bhalf_t,
+    2,
+    false,
+    true>(GroupedBackwardParams& param, hipStream_t stream);
 
-extern template void
-run_grouped_backward_masktype_attnbias_dispatched<ck::bhalf_t, 2, false, false>(
-    GroupedBackwardParams& param, hipStream_t stream);
+extern template void run_grouped_backward_masktype_attnbias_dispatched<
+    ck::bhalf_t,
+    2,
+    false,
+    false>(GroupedBackwardParams& param, hipStream_t stream);
 
-void grouped_backward_bp16(GroupedBackwardParams& param, hipStream_t stream)
-{
-    BOOL_SWITCH_2(
-        param.has_attn_bias, HAS_ATTN_BIAS, param.use_fp32_qkv_grad, USE_FP32_QKV_GRAD, [&] {
-            if(param.custom_mask_type == 0)
-            {
-                run_grouped_backward_masktype_attnbias_dispatched<ck::bhalf_t,
-                                                                  0,
-                                                                  HAS_ATTN_BIAS,
-                                                                  USE_FP32_QKV_GRAD>(param, stream);
-            }
-            else if(param.custom_mask_type == 1)
-            {
-                run_grouped_backward_masktype_attnbias_dispatched<ck::bhalf_t,
-                                                                  1,
-                                                                  HAS_ATTN_BIAS,
-                                                                  USE_FP32_QKV_GRAD>(param, stream);
-            }
-            else if(param.custom_mask_type == 2)
-            {
-                run_grouped_backward_masktype_attnbias_dispatched<ck::bhalf_t,
-                                                                  2,
-                                                                  HAS_ATTN_BIAS,
-                                                                  USE_FP32_QKV_GRAD>(param, stream);
-            }
-            else
-                throw std::runtime_error("Invalid custom_mask_type value");
-        });
+void grouped_backward_bp16(GroupedBackwardParams& param, hipStream_t stream) {
+  BOOL_SWITCH_2(
+      param.has_attn_bias,
+      HAS_ATTN_BIAS,
+      param.use_fp32_qkv_grad,
+      USE_FP32_QKV_GRAD,
+      [&] {
+        if (param.custom_mask_type == 0) {
+          run_grouped_backward_masktype_attnbias_dispatched<
+              ck::bhalf_t,
+              0,
+              HAS_ATTN_BIAS,
+              USE_FP32_QKV_GRAD>(param, stream);
+        } else if (param.custom_mask_type == 1) {
+          run_grouped_backward_masktype_attnbias_dispatched<
+              ck::bhalf_t,
+              1,
+              HAS_ATTN_BIAS,
+              USE_FP32_QKV_GRAD>(param, stream);
+        } else if (param.custom_mask_type == 2) {
+          run_grouped_backward_masktype_attnbias_dispatched<
+              ck::bhalf_t,
+              2,
+              HAS_ATTN_BIAS,
+              USE_FP32_QKV_GRAD>(param, stream);
+        } else
+          throw std::runtime_error("Invalid custom_mask_type value");
+      });
 };
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_backward_fp16.cpp b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_backward_fp16.cpp
index a89291891..ca8a0a4d3 100644
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_backward_fp16.cpp
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_backward_fp16.cpp
@@ -10,68 +10,104 @@
 #include "ck_bool_switch.h"
 #include "ck_fmha_grouped_backward.h"
 
-extern template void run_grouped_backward_masktype_attnbias_dispatched<ck::half_t, 0, true, true>(
-    GroupedBackwardParams& param, hipStream_t stream);
+extern template void run_grouped_backward_masktype_attnbias_dispatched<
+    ck::half_t,
+    0,
+    true,
+    true>(GroupedBackwardParams& param, hipStream_t stream);
 
-extern template void run_grouped_backward_masktype_attnbias_dispatched<ck::half_t, 0, true, false>(
-    GroupedBackwardParams& param, hipStream_t stream);
+extern template void run_grouped_backward_masktype_attnbias_dispatched<
+    ck::half_t,
+    0,
+    true,
+    false>(GroupedBackwardParams& param, hipStream_t stream);
 
-extern template void run_grouped_backward_masktype_attnbias_dispatched<ck::half_t, 0, false, true>(
-    GroupedBackwardParams& param, hipStream_t stream);
+extern template void run_grouped_backward_masktype_attnbias_dispatched<
+    ck::half_t,
+    0,
+    false,
+    true>(GroupedBackwardParams& param, hipStream_t stream);
 
-extern template void run_grouped_backward_masktype_attnbias_dispatched<ck::half_t, 0, false, false>(
-    GroupedBackwardParams& param, hipStream_t stream);
+extern template void run_grouped_backward_masktype_attnbias_dispatched<
+    ck::half_t,
+    0,
+    false,
+    false>(GroupedBackwardParams& param, hipStream_t stream);
 
-extern template void run_grouped_backward_masktype_attnbias_dispatched<ck::half_t, 1, true, true>(
-    GroupedBackwardParams& param, hipStream_t stream);
+extern template void run_grouped_backward_masktype_attnbias_dispatched<
+    ck::half_t,
+    1,
+    true,
+    true>(GroupedBackwardParams& param, hipStream_t stream);
 
-extern template void run_grouped_backward_masktype_attnbias_dispatched<ck::half_t, 1, true, false>(
-    GroupedBackwardParams& param, hipStream_t stream);
+extern template void run_grouped_backward_masktype_attnbias_dispatched<
+    ck::half_t,
+    1,
+    true,
+    false>(GroupedBackwardParams& param, hipStream_t stream);
 
-extern template void run_grouped_backward_masktype_attnbias_dispatched<ck::half_t, 1, false, true>(
-    GroupedBackwardParams& param, hipStream_t stream);
+extern template void run_grouped_backward_masktype_attnbias_dispatched<
+    ck::half_t,
+    1,
+    false,
+    true>(GroupedBackwardParams& param, hipStream_t stream);
 
-extern template void run_grouped_backward_masktype_attnbias_dispatched<ck::half_t, 1, false, false>(
-    GroupedBackwardParams& param, hipStream_t stream);
+extern template void run_grouped_backward_masktype_attnbias_dispatched<
+    ck::half_t,
+    1,
+    false,
+    false>(GroupedBackwardParams& param, hipStream_t stream);
 
-extern template void run_grouped_backward_masktype_attnbias_dispatched<ck::half_t, 2, true, true>(
-    GroupedBackwardParams& param, hipStream_t stream);
+extern template void run_grouped_backward_masktype_attnbias_dispatched<
+    ck::half_t,
+    2,
+    true,
+    true>(GroupedBackwardParams& param, hipStream_t stream);
 
-extern template void run_grouped_backward_masktype_attnbias_dispatched<ck::half_t, 2, true, false>(
-    GroupedBackwardParams& param, hipStream_t stream);
+extern template void run_grouped_backward_masktype_attnbias_dispatched<
+    ck::half_t,
+    2,
+    true,
+    false>(GroupedBackwardParams& param, hipStream_t stream);
 
-extern template void run_grouped_backward_masktype_attnbias_dispatched<ck::half_t, 2, false, true>(
-    GroupedBackwardParams& param, hipStream_t stream);
+extern template void run_grouped_backward_masktype_attnbias_dispatched<
+    ck::half_t,
+    2,
+    false,
+    true>(GroupedBackwardParams& param, hipStream_t stream);
 
-extern template void run_grouped_backward_masktype_attnbias_dispatched<ck::half_t, 2, false, false>(
-    GroupedBackwardParams& param, hipStream_t stream);
+extern template void run_grouped_backward_masktype_attnbias_dispatched<
+    ck::half_t,
+    2,
+    false,
+    false>(GroupedBackwardParams& param, hipStream_t stream);
 
-void grouped_backward_fp16(GroupedBackwardParams& param, hipStream_t stream)
-{
-    BOOL_SWITCH_2(
-        param.has_attn_bias, HAS_ATTN_BIAS, param.use_fp32_qkv_grad, USE_FP32_QKV_GRAD, [&] {
-            if(param.custom_mask_type == 0)
-            {
-                run_grouped_backward_masktype_attnbias_dispatched<ck::half_t,
-                                                                  0,
-                                                                  HAS_ATTN_BIAS,
-                                                                  USE_FP32_QKV_GRAD>(param, stream);
-            }
-            else if(param.custom_mask_type == 1)
-            {
-                run_grouped_backward_masktype_attnbias_dispatched<ck::half_t,
-                                                                  1,
-                                                                  HAS_ATTN_BIAS,
-                                                                  USE_FP32_QKV_GRAD>(param, stream);
-            }
-            else if(param.custom_mask_type == 2)
-            {
-                run_grouped_backward_masktype_attnbias_dispatched<ck::half_t,
-                                                                  2,
-                                                                  HAS_ATTN_BIAS,
-                                                                  USE_FP32_QKV_GRAD>(param, stream);
-            }
-            else
-                throw std::runtime_error("Invalid custom_mask_type value");
-        });
+void grouped_backward_fp16(GroupedBackwardParams& param, hipStream_t stream) {
+  BOOL_SWITCH_2(
+      param.has_attn_bias,
+      HAS_ATTN_BIAS,
+      param.use_fp32_qkv_grad,
+      USE_FP32_QKV_GRAD,
+      [&] {
+        if (param.custom_mask_type == 0) {
+          run_grouped_backward_masktype_attnbias_dispatched<
+              ck::half_t,
+              0,
+              HAS_ATTN_BIAS,
+              USE_FP32_QKV_GRAD>(param, stream);
+        } else if (param.custom_mask_type == 1) {
+          run_grouped_backward_masktype_attnbias_dispatched<
+              ck::half_t,
+              1,
+              HAS_ATTN_BIAS,
+              USE_FP32_QKV_GRAD>(param, stream);
+        } else if (param.custom_mask_type == 2) {
+          run_grouped_backward_masktype_attnbias_dispatched<
+              ck::half_t,
+              2,
+              HAS_ATTN_BIAS,
+              USE_FP32_QKV_GRAD>(param, stream);
+        } else
+          throw std::runtime_error("Invalid custom_mask_type value");
+      });
 };
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_forward.h b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_forward.h
index 997b92dd6..0095ec2a7 100644
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_forward.h
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_forward.h
@@ -24,62 +24,59 @@
 #include "ck_fmha_params.h"
 
 template <typename scalar_t, int32_t custom_mask_type, bool has_attn_bias>
-struct grouped_forward_masktype_attnbias_dispatched
-{
-    using PassThrough = ck::tensor_operation::element_wise::PassThrough;
-
-    using GemmDataType     = scalar_t;
-    using ADataType        = scalar_t;
-    using B0DataType       = scalar_t;
-    using B1DataType       = scalar_t;
-    using AccDataType      = F32;
-    using CShuffleDataType = F32;
-    using CDataType        = scalar_t;
-    using ZDataType        = unsigned short;
-    using LSEDataType      = F32;
-    using Acc0BiasDataType = typename std::conditional<has_attn_bias, scalar_t, void>::type;
-    using Acc1BiasDataType = void;
-
-    using AElementOp    = PassThrough;
-    using B0ElementOp   = PassThrough;
-    using Acc0ElementOp = ck::tensor_operation::element_wise::Scale;
-    using B1ElementOp   = PassThrough;
-    using CElementOp    = PassThrough;
-
-    static constexpr auto GemmSpec = ck::tensor_operation::device::GemmSpecialization::MNKOPadding;
-    static constexpr auto MaskingSpec =
-        static_cast<ck::tensor_operation::device::MaskingSpecialization>(custom_mask_type);
-
-    static constexpr ck::index_t kAcc0BiasTransferSrcScalarPerVector = 1;
+struct grouped_forward_masktype_attnbias_dispatched {
+  using PassThrough = ck::tensor_operation::element_wise::PassThrough;
+
+  using GemmDataType = scalar_t;
+  using ADataType = scalar_t;
+  using B0DataType = scalar_t;
+  using B1DataType = scalar_t;
+  using AccDataType = F32;
+  using CShuffleDataType = F32;
+  using CDataType = scalar_t;
+  using ZDataType = unsigned short;
+  using LSEDataType = F32;
+  using Acc0BiasDataType =
+      typename std::conditional<has_attn_bias, scalar_t, void>::type;
+  using Acc1BiasDataType = void;
+
+  using AElementOp = PassThrough;
+  using B0ElementOp = PassThrough;
+  using Acc0ElementOp = ck::tensor_operation::element_wise::Scale;
+  using B1ElementOp = PassThrough;
+  using CElementOp = PassThrough;
+
+  static constexpr auto GemmSpec =
+      ck::tensor_operation::device::GemmSpecialization::MNKOPadding;
+  static constexpr auto MaskingSpec =
+      static_cast<ck::tensor_operation::device::MaskingSpecialization>(
+          custom_mask_type);
+
+  static constexpr ck::index_t kAcc0BiasTransferSrcScalarPerVector = 1;
 
 #ifndef GROUPED_FORWARD_HEADDIM_SWITCH
-#define GROUPED_FORWARD_HEADDIM_SWITCH(HEAD_DIM1, HEAD_DIM2, ...)       \
-    [&] {                                                               \
-        if(HEAD_DIM1 <= 32 && HEAD_DIM2 <= 32)                          \
-        {                                                               \
-            constexpr ck::index_t kGemm1NPerBlock                = 32;  \
-            constexpr ck::index_t kGemm1NXdlPerWave              = 1;   \
-            constexpr ck::index_t kCShuffleNXdlPerWavePerShuffle = 1;   \
-            __VA_ARGS__();                                              \
-        }                                                               \
-        else if(HEAD_DIM1 <= 64 && HEAD_DIM2 <= 64)                     \
-        {                                                               \
-            constexpr ck::index_t kGemm1NPerBlock                = 64;  \
-            constexpr ck::index_t kGemm1NXdlPerWave              = 2;   \
-            constexpr ck::index_t kCShuffleNXdlPerWavePerShuffle = 2;   \
-            __VA_ARGS__();                                              \
-        }                                                               \
-        else                                                            \
-        {                                                               \
-            constexpr ck::index_t kGemm1NPerBlock                = 128; \
-            constexpr ck::index_t kGemm1NXdlPerWave              = 4;   \
-            constexpr ck::index_t kCShuffleNXdlPerWavePerShuffle = 4;   \
-            __VA_ARGS__();                                              \
-        }                                                               \
-    }()
+#define GROUPED_FORWARD_HEADDIM_SWITCH(HEAD_DIM1, HEAD_DIM2, ...) \
+  [&] {                                                           \
+    if (HEAD_DIM1 <= 32 && HEAD_DIM2 <= 32) {                     \
+      constexpr ck::index_t kGemm1NPerBlock = 32;                 \
+      constexpr ck::index_t kGemm1NXdlPerWave = 1;                \
+      constexpr ck::index_t kCShuffleNXdlPerWavePerShuffle = 1;   \
+      __VA_ARGS__();                                              \
+    } else if (HEAD_DIM1 <= 64 && HEAD_DIM2 <= 64) {              \
+      constexpr ck::index_t kGemm1NPerBlock = 64;                 \
+      constexpr ck::index_t kGemm1NXdlPerWave = 2;                \
+      constexpr ck::index_t kCShuffleNXdlPerWavePerShuffle = 2;   \
+      __VA_ARGS__();                                              \
+    } else {                                                      \
+      constexpr ck::index_t kGemm1NPerBlock = 128;                \
+      constexpr ck::index_t kGemm1NXdlPerWave = 4;                \
+      constexpr ck::index_t kCShuffleNXdlPerWavePerShuffle = 4;   \
+      __VA_ARGS__();                                              \
+    }                                                             \
+  }()
 #endif
 
-    // clang-format off
+  // clang-format off
   template <
       ck::index_t kGemm1NPerBlock,
       ck::index_t kGemm1NXdlPerWave,
@@ -158,220 +155,221 @@ struct grouped_forward_masktype_attnbias_dispatched
           kCShuffleBlockTransferScalarPerVector,
           GemmOpConstantsGroupedForward::Acc1BiasTransferSrcScalarPerVector,
           MaskingSpec>;
-    // clang-format on
-
-    static constexpr auto I1 = ck::Number<1>{};
-    static constexpr auto I2 = ck::Number<2>{};
-    static constexpr auto I3 = ck::Number<3>{};
-
-    static void Run(GroupedForwardParams& param, hipStream_t stream)
-    {
-        using ck::math::min;
-
-        // compile-time constants which don't depend on head-dim switching
-        constexpr ck::index_t thread_slice_length_ak1 =
-            GemmOpConstantsGroupedForward::AK1 /
-            GemmOpConstantsGroupedForward::ABlockTransferThreadClusterLengths_AK0_M_AK1::At(I2);
-        constexpr ck::index_t thread_slice_length_bk1 =
-            GemmOpConstantsGroupedForward::BK1 /
-            GemmOpConstantsGroupedForward::BBlockTransferThreadClusterLengths_BK0_N_BK1::At(I2);
-
-        static_assert(thread_slice_length_ak1 == thread_slice_length_bk1,
-                      "ABlockTransfer and BBlockTransfer should use completely same K1 sizes and "
-                      "ThreadClusterLengths!");
-
-        constexpr ck::index_t kABBlockTransferSrcScalarPerVector_max =
-            min(8, thread_slice_length_ak1);
-
-        GROUPED_FORWARD_HEADDIM_SWITCH(param.K, param.Kv, [&] {
-            constexpr ck::index_t thread_slice_length_gemm1n =
-                kGemm1NPerBlock /
-                GemmOpConstantsGroupedForward::B1BlockTransferThreadClusterLengths_BK0_N_BK1::At(
-                    I1);
-            constexpr ck::index_t kB1BlockTransferSrcScalarPerVector_max =
-                min(2, thread_slice_length_gemm1n);
-
-            constexpr ck::index_t thread_slice_length_cshuflle_n =
-                (kCShuffleNXdlPerWavePerShuffle * kGemm1NPerBlock / kGemm1NXdlPerWave) /
-                GemmOpConstantsGroupedForward::
-                    CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock ::At(I3);
-
-            constexpr ck::index_t kCShuffleBlockTransferScalarPerVector_max =
-                min(4, thread_slice_length_cshuflle_n);
-
-            if constexpr(kB1BlockTransferSrcScalarPerVector_max >=
-                         kCShuffleBlockTransferScalarPerVector_max)
-            {
-                ALIGN_SWITCH_2(kABBlockTransferSrcScalarPerVector_max,
-                               kABBlockTransferSrcScalarPerVector,
-                               param.K,
-                               kB1BlockTransferSrcScalarPerVector_max,
-                               kB1BlockTransferSrcScalarPerVector,
-                               param.Kv,
-                               [&] {
-                                   constexpr ck::index_t kCShuffleBlockTransferScalarPerVector =
-                                       min(kB1BlockTransferSrcScalarPerVector,
-                                           kCShuffleBlockTransferScalarPerVector_max);
-                                   using DeviceOpInstance =
-                                       DeviceOpInstanceTemp<kGemm1NPerBlock,
-                                                            kGemm1NXdlPerWave,
-                                                            kCShuffleNXdlPerWavePerShuffle,
-                                                            kABBlockTransferSrcScalarPerVector,
-                                                            kB1BlockTransferSrcScalarPerVector,
-                                                            kCShuffleBlockTransferScalarPerVector>;
-
-                                   RunWithDeviceOp<DeviceOpInstance>(param, stream);
-                               });
-            }
-            else
-            {
-                ALIGN_SWITCH_2(kABBlockTransferSrcScalarPerVector_max,
-                               kABBlockTransferSrcScalarPerVector,
-                               param.K,
-                               kCShuffleBlockTransferScalarPerVector_max,
-                               kCShuffleBlockTransferScalarPerVector,
-                               param.Kv,
-                               [&] {
-                                   constexpr ck::index_t kB1BlockTransferSrcScalarPerVector =
-                                       min(kCShuffleBlockTransferScalarPerVector,
-                                           kB1BlockTransferSrcScalarPerVector_max);
-                                   using DeviceOpInstance =
-                                       DeviceOpInstanceTemp<kGemm1NPerBlock,
-                                                            kGemm1NXdlPerWave,
-                                                            kCShuffleNXdlPerWavePerShuffle,
-                                                            kABBlockTransferSrcScalarPerVector,
-                                                            kB1BlockTransferSrcScalarPerVector,
-                                                            kCShuffleBlockTransferScalarPerVector>;
-
-                                   RunWithDeviceOp<DeviceOpInstance>(param, stream);
-                               });
-            };
-        });
-    };
-
-    template <typename DeviceOpInstance>
-    static void RunWithDeviceOp(GroupedForwardParams& param, hipStream_t stream)
-    {
-        std::vector<typename DeviceOpInstance::ProblemDesc> problem_descs;
-
-        for(std::size_t i = 0; i < param.num_batches; i++)
-        {
-            int M = param.host_seqstart_q[i + 1] - param.host_seqstart_q[i];
-            int N = param.host_seqlen_k.empty()
-                        ? param.host_seqstart_k[i + 1] - param.host_seqstart_k[i]
-                        : param.host_seqlen_k[i];
-            int K    = param.K;
-            int Kv   = param.Kv;
-            int G1q  = param.Hq;
-            int G1kv = param.Hkv;
-
-            std::vector<ck::index_t> a_gs_ms_ks_lengths{1, G1q, M, K};
-            std::vector<ck::index_t> a_gs_ms_ks_strides{
-                0, param.q_strides[1], param.q_strides[0], param.q_strides[2]};
-
-            std::vector<ck::index_t> b0_gs_ns_ks_lengths{1, G1kv, N, K};
-            std::vector<ck::index_t> b0_gs_ns_ks_strides{
-                0, param.k_strides[1], param.k_strides[0], param.k_strides[2]};
-
-            // to be changed to b1_gs_ns_os_lengths
-            std::vector<ck::index_t> b1_gs_os_ns_lengths{1, G1kv, Kv, N};
-            std::vector<ck::index_t> b1_gs_os_ns_strides{
-                0, param.v_strides[1], param.v_strides[2], param.v_strides[0]};
-
-            std::vector<ck::index_t> c_gs_ms_os_lengths{1, G1q, M, Kv};
-            std::vector<ck::index_t> c_gs_ms_os_strides{
-                0, param.out_strides[1], param.out_strides[0], param.out_strides[2]};
-
-            std::vector<ck::index_t> lse_gs_ms_lengths{1, G1q, M};
-            std::vector<ck::index_t> lse_gs_ms_strides{0, param.max_seqlen_q, 1};
-
-            std::vector<ck::index_t> d_gs_ms_ns_lengths;
-            std::vector<ck::index_t> d_gs_ms_ns_strides;
-
-            if constexpr(has_attn_bias)
-            {
-                d_gs_ms_ns_lengths = {1, G1q, M, N};
-                d_gs_ms_ns_strides = {0,
-                                      param.attn_bias_strides[0],
-                                      param.attn_bias_strides[1],
-                                      param.attn_bias_strides[2]};
-            }
-            else
-            {
-                d_gs_ms_ns_lengths = {1, 1, 1, 1};
-                d_gs_ms_ns_strides = {0, 0, 0, 0};
-            };
-
-            problem_descs.push_back({a_gs_ms_ks_lengths,
-                                     a_gs_ms_ks_strides,
-                                     b0_gs_ns_ks_lengths,
-                                     b0_gs_ns_ks_strides,
-                                     b1_gs_os_ns_lengths,
-                                     b1_gs_os_ns_strides,
-                                     c_gs_ms_os_lengths,
-                                     c_gs_ms_os_strides,
-                                     {1, 1, 1, 1},
-                                     {0, 0, 0, 0},
-                                     lse_gs_ms_lengths,
-                                     lse_gs_ms_strides,
-                                     d_gs_ms_ns_lengths,
-                                     d_gs_ms_ns_strides,
-                                     {},   // acc1_bias_gs_ms_os_lengths
-                                     {}}); // acc1_bias_gs_ms_os_strides
-        }
-
-        float alpha = param.scale;
-
-        auto a_element_op    = AElementOp{};
-        auto b0_element_op   = B0ElementOp{};
-        auto acc0_element_op = Acc0ElementOp{alpha};
-        auto b1_element_op   = B1ElementOp{};
-        auto c_element_op    = CElementOp{};
-
-        auto op      = DeviceOpInstance{};
-        auto invoker = op.MakeInvoker();
-
-        auto arg_ptr = op.MakeArgumentPointer(
-            param.q_ptrs,
-            param.k_ptrs,
-            param.v_ptrs,
-            param.out_ptrs,
-            param.randvals_ptrs,
-            param.logsumexp_ptrs,
-            param.attn_bias_ptrs,
-            {}, // p_acc1_biases
-            problem_descs,
-            a_element_op,
-            b0_element_op,
-            acc0_element_op,
-            b1_element_op,
-            c_element_op,
-            param.use_dropout ? param.dropout_prob : 0.0f, // dropout ratio
-            std::tuple<int64_t, int64_t>(param.philox_seed, param.philox_offset));
-
-        auto sizeInBytes = op.GetWorkSpaceSize(arg_ptr.get());
-
-        SimpleDeviceMem workspace(sizeInBytes);
-
-        op.SetWorkSpacePointer(arg_ptr.get(), workspace.GetDeviceBuffer());
-
-        if(!op.IsSupportedArgument(arg_ptr.get()))
-        {
-            std::ostringstream ostr;
-
-            ostr << op.GetTypeString() << " does not support this problem";
-
-            throw std::runtime_error(ostr.str());
-        }
-
-        (void)invoker.Run(arg_ptr.get(), StreamConfig{stream, false});
-    };
+  // clang-format on
+
+  static constexpr auto I1 = ck::Number<1>{};
+  static constexpr auto I2 = ck::Number<2>{};
+  static constexpr auto I3 = ck::Number<3>{};
+
+  static void Run(GroupedForwardParams& param, hipStream_t stream) {
+    using ck::math::min;
+
+    // compile-time constants which don't depend on head-dim switching
+    constexpr ck::index_t thread_slice_length_ak1 =
+        GemmOpConstantsGroupedForward::AK1 /
+        GemmOpConstantsGroupedForward::
+            ABlockTransferThreadClusterLengths_AK0_M_AK1::At(I2);
+    constexpr ck::index_t thread_slice_length_bk1 =
+        GemmOpConstantsGroupedForward::BK1 /
+        GemmOpConstantsGroupedForward::
+            BBlockTransferThreadClusterLengths_BK0_N_BK1::At(I2);
+
+    static_assert(
+        thread_slice_length_ak1 == thread_slice_length_bk1,
+        "ABlockTransfer and BBlockTransfer should use completely same K1 sizes and "
+        "ThreadClusterLengths!");
+
+    constexpr ck::index_t kABBlockTransferSrcScalarPerVector_max =
+        min(8, thread_slice_length_ak1);
+
+    GROUPED_FORWARD_HEADDIM_SWITCH(param.K, param.Kv, [&] {
+      constexpr ck::index_t thread_slice_length_gemm1n = kGemm1NPerBlock /
+          GemmOpConstantsGroupedForward::
+              B1BlockTransferThreadClusterLengths_BK0_N_BK1::At(I1);
+      constexpr ck::index_t kB1BlockTransferSrcScalarPerVector_max =
+          min(2, thread_slice_length_gemm1n);
+
+      constexpr ck::index_t thread_slice_length_cshuflle_n =
+          (kCShuffleNXdlPerWavePerShuffle * kGemm1NPerBlock /
+           kGemm1NXdlPerWave) /
+          GemmOpConstantsGroupedForward::
+              CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock ::
+                  At(I3);
+
+      constexpr ck::index_t kCShuffleBlockTransferScalarPerVector_max =
+          min(4, thread_slice_length_cshuflle_n);
+
+      if constexpr (
+          kB1BlockTransferSrcScalarPerVector_max >=
+          kCShuffleBlockTransferScalarPerVector_max) {
+        ALIGN_SWITCH_2(
+            kABBlockTransferSrcScalarPerVector_max,
+            kABBlockTransferSrcScalarPerVector,
+            param.K,
+            kB1BlockTransferSrcScalarPerVector_max,
+            kB1BlockTransferSrcScalarPerVector,
+            param.Kv,
+            [&] {
+              constexpr ck::index_t kCShuffleBlockTransferScalarPerVector =
+                  min(kB1BlockTransferSrcScalarPerVector,
+                      kCShuffleBlockTransferScalarPerVector_max);
+              using DeviceOpInstance = DeviceOpInstanceTemp<
+                  kGemm1NPerBlock,
+                  kGemm1NXdlPerWave,
+                  kCShuffleNXdlPerWavePerShuffle,
+                  kABBlockTransferSrcScalarPerVector,
+                  kB1BlockTransferSrcScalarPerVector,
+                  kCShuffleBlockTransferScalarPerVector>;
+
+              RunWithDeviceOp<DeviceOpInstance>(param, stream);
+            });
+      } else {
+        ALIGN_SWITCH_2(
+            kABBlockTransferSrcScalarPerVector_max,
+            kABBlockTransferSrcScalarPerVector,
+            param.K,
+            kCShuffleBlockTransferScalarPerVector_max,
+            kCShuffleBlockTransferScalarPerVector,
+            param.Kv,
+            [&] {
+              constexpr ck::index_t kB1BlockTransferSrcScalarPerVector =
+                  min(kCShuffleBlockTransferScalarPerVector,
+                      kB1BlockTransferSrcScalarPerVector_max);
+              using DeviceOpInstance = DeviceOpInstanceTemp<
+                  kGemm1NPerBlock,
+                  kGemm1NXdlPerWave,
+                  kCShuffleNXdlPerWavePerShuffle,
+                  kABBlockTransferSrcScalarPerVector,
+                  kB1BlockTransferSrcScalarPerVector,
+                  kCShuffleBlockTransferScalarPerVector>;
+
+              RunWithDeviceOp<DeviceOpInstance>(param, stream);
+            });
+      };
+    });
+  };
+
+  template <typename DeviceOpInstance>
+  static void RunWithDeviceOp(GroupedForwardParams& param, hipStream_t stream) {
+    std::vector<typename DeviceOpInstance::ProblemDesc> problem_descs;
+
+    for (std::size_t i = 0; i < param.num_batches; i++) {
+      int M = param.host_seqstart_q[i + 1] - param.host_seqstart_q[i];
+      int N = param.host_seqlen_k.empty()
+          ? param.host_seqstart_k[i + 1] - param.host_seqstart_k[i]
+          : param.host_seqlen_k[i];
+      int K = param.K;
+      int Kv = param.Kv;
+      int G1q = param.Hq;
+      int G1kv = param.Hkv;
+
+      std::vector<ck::index_t> a_gs_ms_ks_lengths{1, G1q, M, K};
+      std::vector<ck::index_t> a_gs_ms_ks_strides{
+          0, param.q_strides[1], param.q_strides[0], param.q_strides[2]};
+
+      std::vector<ck::index_t> b0_gs_ns_ks_lengths{1, G1kv, N, K};
+      std::vector<ck::index_t> b0_gs_ns_ks_strides{
+          0, param.k_strides[1], param.k_strides[0], param.k_strides[2]};
+
+      // to be changed to b1_gs_ns_os_lengths
+      std::vector<ck::index_t> b1_gs_os_ns_lengths{1, G1kv, Kv, N};
+      std::vector<ck::index_t> b1_gs_os_ns_strides{
+          0, param.v_strides[1], param.v_strides[2], param.v_strides[0]};
+
+      std::vector<ck::index_t> c_gs_ms_os_lengths{1, G1q, M, Kv};
+      std::vector<ck::index_t> c_gs_ms_os_strides{
+          0, param.out_strides[1], param.out_strides[0], param.out_strides[2]};
+
+      std::vector<ck::index_t> lse_gs_ms_lengths{1, G1q, M};
+      std::vector<ck::index_t> lse_gs_ms_strides{0, param.max_seqlen_q, 1};
+
+      std::vector<ck::index_t> d_gs_ms_ns_lengths;
+      std::vector<ck::index_t> d_gs_ms_ns_strides;
+
+      if constexpr (has_attn_bias) {
+        d_gs_ms_ns_lengths = {1, G1q, M, N};
+        d_gs_ms_ns_strides = {
+            0,
+            param.attn_bias_strides[0],
+            param.attn_bias_strides[1],
+            param.attn_bias_strides[2]};
+      } else {
+        d_gs_ms_ns_lengths = {1, 1, 1, 1};
+        d_gs_ms_ns_strides = {0, 0, 0, 0};
+      };
+
+      problem_descs.push_back(
+          {a_gs_ms_ks_lengths,
+           a_gs_ms_ks_strides,
+           b0_gs_ns_ks_lengths,
+           b0_gs_ns_ks_strides,
+           b1_gs_os_ns_lengths,
+           b1_gs_os_ns_strides,
+           c_gs_ms_os_lengths,
+           c_gs_ms_os_strides,
+           {1, 1, 1, 1},
+           {0, 0, 0, 0},
+           lse_gs_ms_lengths,
+           lse_gs_ms_strides,
+           d_gs_ms_ns_lengths,
+           d_gs_ms_ns_strides,
+           {}, // acc1_bias_gs_ms_os_lengths
+           {}}); // acc1_bias_gs_ms_os_strides
+    }
+
+    float alpha = param.scale;
+
+    auto a_element_op = AElementOp{};
+    auto b0_element_op = B0ElementOp{};
+    auto acc0_element_op = Acc0ElementOp{alpha};
+    auto b1_element_op = B1ElementOp{};
+    auto c_element_op = CElementOp{};
+
+    auto op = DeviceOpInstance{};
+    auto invoker = op.MakeInvoker();
+
+    auto arg_ptr = op.MakeArgumentPointer(
+        param.q_ptrs,
+        param.k_ptrs,
+        param.v_ptrs,
+        param.out_ptrs,
+        param.randvals_ptrs,
+        param.logsumexp_ptrs,
+        param.attn_bias_ptrs,
+        {}, // p_acc1_biases
+        problem_descs,
+        a_element_op,
+        b0_element_op,
+        acc0_element_op,
+        b1_element_op,
+        c_element_op,
+        param.use_dropout ? param.dropout_prob : 0.0f, // dropout ratio
+        std::tuple<int64_t, int64_t>(param.philox_seed, param.philox_offset));
+
+    auto sizeInBytes = op.GetWorkSpaceSize(arg_ptr.get());
+
+    SimpleDeviceMem workspace(sizeInBytes);
+
+    op.SetWorkSpacePointer(arg_ptr.get(), workspace.GetDeviceBuffer());
+
+    if (!op.IsSupportedArgument(arg_ptr.get())) {
+      std::ostringstream ostr;
+
+      ostr << op.GetTypeString() << " does not support this problem";
+
+      throw std::runtime_error(ostr.str());
+    }
+
+    (void)invoker.Run(arg_ptr.get(), StreamConfig{stream, false});
+  };
 };
 
 template <typename scalar_t, int32_t custom_mask_type, bool has_attn_bias>
-void run_grouped_forward_masktype_attnbias_dispatched(GroupedForwardParams& param,
-                                                      hipStream_t stream)
-{
-    grouped_forward_masktype_attnbias_dispatched<scalar_t, custom_mask_type, has_attn_bias>::Run(
-        param, stream);
+void run_grouped_forward_masktype_attnbias_dispatched(
+    GroupedForwardParams& param,
+    hipStream_t stream) {
+  grouped_forward_masktype_attnbias_dispatched<
+      scalar_t,
+      custom_mask_type,
+      has_attn_bias>::Run(param, stream);
 };
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_forward_bp16.cpp b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_forward_bp16.cpp
index 6679f8731..72ebd715e 100644
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_forward_bp16.cpp
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_forward_bp16.cpp
@@ -10,43 +10,54 @@
 #include "ck_bool_switch.h"
 #include "ck_fmha_grouped_forward.h"
 
-extern template void
-run_grouped_forward_masktype_attnbias_dispatched<ck::bhalf_t, 0, true>(GroupedForwardParams& param,
-                                                                       hipStream_t stream);
-
-extern template void
-run_grouped_forward_masktype_attnbias_dispatched<ck::bhalf_t, 0, false>(GroupedForwardParams& param,
-                                                                        hipStream_t stream);
-
-extern template void
-run_grouped_forward_masktype_attnbias_dispatched<ck::bhalf_t, 1, true>(GroupedForwardParams& param,
-                                                                       hipStream_t stream);
-
-extern template void
-run_grouped_forward_masktype_attnbias_dispatched<ck::bhalf_t, 1, false>(GroupedForwardParams& param,
-                                                                        hipStream_t stream);
-
-extern template void
-run_grouped_forward_masktype_attnbias_dispatched<ck::bhalf_t, 2, true>(GroupedForwardParams& param,
-                                                                       hipStream_t stream);
-
-extern template void
-run_grouped_forward_masktype_attnbias_dispatched<ck::bhalf_t, 2, false>(GroupedForwardParams& param,
-                                                                        hipStream_t stream);
-
-void grouped_forward_bp16(GroupedForwardParams& param, hipStream_t stream)
-{
-    BOOL_SWITCH_1(param.has_attn_bias, HAS_ATTN_BIAS, [&] {
-        if(param.custom_mask_type == 0)
-            run_grouped_forward_masktype_attnbias_dispatched<ck::bhalf_t, 0, HAS_ATTN_BIAS>(param,
-                                                                                            stream);
-        else if(param.custom_mask_type == 1)
-            run_grouped_forward_masktype_attnbias_dispatched<ck::bhalf_t, 1, HAS_ATTN_BIAS>(param,
-                                                                                            stream);
-        else if(param.custom_mask_type == 2)
-            run_grouped_forward_masktype_attnbias_dispatched<ck::bhalf_t, 2, HAS_ATTN_BIAS>(param,
-                                                                                            stream);
-        else
-            throw std::runtime_error("Invalid custom_mask_type value");
-    });
+extern template void run_grouped_forward_masktype_attnbias_dispatched<
+    ck::bhalf_t,
+    0,
+    true>(GroupedForwardParams& param, hipStream_t stream);
+
+extern template void run_grouped_forward_masktype_attnbias_dispatched<
+    ck::bhalf_t,
+    0,
+    false>(GroupedForwardParams& param, hipStream_t stream);
+
+extern template void run_grouped_forward_masktype_attnbias_dispatched<
+    ck::bhalf_t,
+    1,
+    true>(GroupedForwardParams& param, hipStream_t stream);
+
+extern template void run_grouped_forward_masktype_attnbias_dispatched<
+    ck::bhalf_t,
+    1,
+    false>(GroupedForwardParams& param, hipStream_t stream);
+
+extern template void run_grouped_forward_masktype_attnbias_dispatched<
+    ck::bhalf_t,
+    2,
+    true>(GroupedForwardParams& param, hipStream_t stream);
+
+extern template void run_grouped_forward_masktype_attnbias_dispatched<
+    ck::bhalf_t,
+    2,
+    false>(GroupedForwardParams& param, hipStream_t stream);
+
+void grouped_forward_bp16(GroupedForwardParams& param, hipStream_t stream) {
+  BOOL_SWITCH_1(param.has_attn_bias, HAS_ATTN_BIAS, [&] {
+    if (param.custom_mask_type == 0)
+      run_grouped_forward_masktype_attnbias_dispatched<
+          ck::bhalf_t,
+          0,
+          HAS_ATTN_BIAS>(param, stream);
+    else if (param.custom_mask_type == 1)
+      run_grouped_forward_masktype_attnbias_dispatched<
+          ck::bhalf_t,
+          1,
+          HAS_ATTN_BIAS>(param, stream);
+    else if (param.custom_mask_type == 2)
+      run_grouped_forward_masktype_attnbias_dispatched<
+          ck::bhalf_t,
+          2,
+          HAS_ATTN_BIAS>(param, stream);
+    else
+      throw std::runtime_error("Invalid custom_mask_type value");
+  });
 };
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_forward_fp16.cpp b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_forward_fp16.cpp
index 70a295cec..eb53ad433 100644
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_forward_fp16.cpp
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_forward_fp16.cpp
@@ -10,43 +10,54 @@
 #include "ck_bool_switch.h"
 #include "ck_fmha_grouped_forward.h"
 
-extern template void
-run_grouped_forward_masktype_attnbias_dispatched<ck::half_t, 0, true>(GroupedForwardParams& param,
-                                                                      hipStream_t stream);
-
-extern template void
-run_grouped_forward_masktype_attnbias_dispatched<ck::half_t, 0, false>(GroupedForwardParams& param,
-                                                                       hipStream_t stream);
-
-extern template void
-run_grouped_forward_masktype_attnbias_dispatched<ck::half_t, 1, true>(GroupedForwardParams& param,
-                                                                      hipStream_t stream);
-
-extern template void
-run_grouped_forward_masktype_attnbias_dispatched<ck::half_t, 1, false>(GroupedForwardParams& param,
-                                                                       hipStream_t stream);
-
-extern template void
-run_grouped_forward_masktype_attnbias_dispatched<ck::half_t, 2, true>(GroupedForwardParams& param,
-                                                                      hipStream_t stream);
-
-extern template void
-run_grouped_forward_masktype_attnbias_dispatched<ck::half_t, 2, false>(GroupedForwardParams& param,
-                                                                       hipStream_t stream);
-
-void grouped_forward_fp16(GroupedForwardParams& param, hipStream_t stream)
-{
-    BOOL_SWITCH_1(param.has_attn_bias, HAS_ATTN_BIAS, [&] {
-        if(param.custom_mask_type == 0)
-            run_grouped_forward_masktype_attnbias_dispatched<ck::half_t, 0, HAS_ATTN_BIAS>(param,
-                                                                                           stream);
-        else if(param.custom_mask_type == 1)
-            run_grouped_forward_masktype_attnbias_dispatched<ck::half_t, 1, HAS_ATTN_BIAS>(param,
-                                                                                           stream);
-        else if(param.custom_mask_type == 2)
-            run_grouped_forward_masktype_attnbias_dispatched<ck::half_t, 2, HAS_ATTN_BIAS>(param,
-                                                                                           stream);
-        else
-            throw std::runtime_error("Invalid custom_mask_type value");
-    });
+extern template void run_grouped_forward_masktype_attnbias_dispatched<
+    ck::half_t,
+    0,
+    true>(GroupedForwardParams& param, hipStream_t stream);
+
+extern template void run_grouped_forward_masktype_attnbias_dispatched<
+    ck::half_t,
+    0,
+    false>(GroupedForwardParams& param, hipStream_t stream);
+
+extern template void run_grouped_forward_masktype_attnbias_dispatched<
+    ck::half_t,
+    1,
+    true>(GroupedForwardParams& param, hipStream_t stream);
+
+extern template void run_grouped_forward_masktype_attnbias_dispatched<
+    ck::half_t,
+    1,
+    false>(GroupedForwardParams& param, hipStream_t stream);
+
+extern template void run_grouped_forward_masktype_attnbias_dispatched<
+    ck::half_t,
+    2,
+    true>(GroupedForwardParams& param, hipStream_t stream);
+
+extern template void run_grouped_forward_masktype_attnbias_dispatched<
+    ck::half_t,
+    2,
+    false>(GroupedForwardParams& param, hipStream_t stream);
+
+void grouped_forward_fp16(GroupedForwardParams& param, hipStream_t stream) {
+  BOOL_SWITCH_1(param.has_attn_bias, HAS_ATTN_BIAS, [&] {
+    if (param.custom_mask_type == 0)
+      run_grouped_forward_masktype_attnbias_dispatched<
+          ck::half_t,
+          0,
+          HAS_ATTN_BIAS>(param, stream);
+    else if (param.custom_mask_type == 1)
+      run_grouped_forward_masktype_attnbias_dispatched<
+          ck::half_t,
+          1,
+          HAS_ATTN_BIAS>(param, stream);
+    else if (param.custom_mask_type == 2)
+      run_grouped_forward_masktype_attnbias_dispatched<
+          ck::half_t,
+          2,
+          HAS_ATTN_BIAS>(param, stream);
+    else
+      throw std::runtime_error("Invalid custom_mask_type value");
+  });
 };
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_infer.h b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_infer.h
index 08e5434d7..fbc0b2b1a 100644
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_infer.h
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_infer.h
@@ -24,62 +24,59 @@
 #include "ck_fmha_params.h"
 
 template <typename scalar_t, int32_t custom_mask_type, bool has_attn_bias>
-struct grouped_infer_masktype_attnbias_dispatched
-{
-    using PassThrough = ck::tensor_operation::element_wise::PassThrough;
-
-    using GemmDataType     = scalar_t;
-    using ADataType        = scalar_t;
-    using B0DataType       = scalar_t;
-    using B1DataType       = scalar_t;
-    using AccDataType      = F32;
-    using CShuffleDataType = F32;
-    using CDataType        = scalar_t;
-    using ZDataType        = unsigned short;
-    using LSEDataType      = F32;
-    using Acc0BiasDataType = typename std::conditional<has_attn_bias, scalar_t, void>::type;
-    using Acc1BiasDataType = void;
-
-    using AElementOp    = PassThrough;
-    using B0ElementOp   = PassThrough;
-    using Acc0ElementOp = ck::tensor_operation::element_wise::Scale;
-    using B1ElementOp   = PassThrough;
-    using CElementOp    = PassThrough;
-
-    static constexpr auto GemmSpec = ck::tensor_operation::device::GemmSpecialization::MNKOPadding;
-    static constexpr auto MaskingSpec =
-        static_cast<ck::tensor_operation::device::MaskingSpecialization>(custom_mask_type);
-
-    static constexpr ck::index_t kAcc0BiasTransferSrcScalarPerVector = 1;
+struct grouped_infer_masktype_attnbias_dispatched {
+  using PassThrough = ck::tensor_operation::element_wise::PassThrough;
+
+  using GemmDataType = scalar_t;
+  using ADataType = scalar_t;
+  using B0DataType = scalar_t;
+  using B1DataType = scalar_t;
+  using AccDataType = F32;
+  using CShuffleDataType = F32;
+  using CDataType = scalar_t;
+  using ZDataType = unsigned short;
+  using LSEDataType = F32;
+  using Acc0BiasDataType =
+      typename std::conditional<has_attn_bias, scalar_t, void>::type;
+  using Acc1BiasDataType = void;
+
+  using AElementOp = PassThrough;
+  using B0ElementOp = PassThrough;
+  using Acc0ElementOp = ck::tensor_operation::element_wise::Scale;
+  using B1ElementOp = PassThrough;
+  using CElementOp = PassThrough;
+
+  static constexpr auto GemmSpec =
+      ck::tensor_operation::device::GemmSpecialization::MNKOPadding;
+  static constexpr auto MaskingSpec =
+      static_cast<ck::tensor_operation::device::MaskingSpecialization>(
+          custom_mask_type);
+
+  static constexpr ck::index_t kAcc0BiasTransferSrcScalarPerVector = 1;
 
 #ifndef GROUPED_INFER_HEADDIM_SWITCH
-#define GROUPED_INFER_HEADDIM_SWITCH(HEAD_DIM1, HEAD_DIM2, ...)         \
-    [&] {                                                               \
-        if(HEAD_DIM1 <= 32 && HEAD_DIM2 <= 32)                          \
-        {                                                               \
-            constexpr ck::index_t kGemm1NPerBlock                = 32;  \
-            constexpr ck::index_t kGemm1NXdlPerWave              = 1;   \
-            constexpr ck::index_t kCShuffleNXdlPerWavePerShuffle = 1;   \
-            __VA_ARGS__();                                              \
-        }                                                               \
-        else if(HEAD_DIM1 <= 64 && HEAD_DIM2 <= 64)                     \
-        {                                                               \
-            constexpr ck::index_t kGemm1NPerBlock                = 64;  \
-            constexpr ck::index_t kGemm1NXdlPerWave              = 2;   \
-            constexpr ck::index_t kCShuffleNXdlPerWavePerShuffle = 2;   \
-            __VA_ARGS__();                                              \
-        }                                                               \
-        else                                                            \
-        {                                                               \
-            constexpr ck::index_t kGemm1NPerBlock                = 128; \
-            constexpr ck::index_t kGemm1NXdlPerWave              = 4;   \
-            constexpr ck::index_t kCShuffleNXdlPerWavePerShuffle = 4;   \
-            __VA_ARGS__();                                              \
-        }                                                               \
-    }()
+#define GROUPED_INFER_HEADDIM_SWITCH(HEAD_DIM1, HEAD_DIM2, ...) \
+  [&] {                                                         \
+    if (HEAD_DIM1 <= 32 && HEAD_DIM2 <= 32) {                   \
+      constexpr ck::index_t kGemm1NPerBlock = 32;               \
+      constexpr ck::index_t kGemm1NXdlPerWave = 1;              \
+      constexpr ck::index_t kCShuffleNXdlPerWavePerShuffle = 1; \
+      __VA_ARGS__();                                            \
+    } else if (HEAD_DIM1 <= 64 && HEAD_DIM2 <= 64) {            \
+      constexpr ck::index_t kGemm1NPerBlock = 64;               \
+      constexpr ck::index_t kGemm1NXdlPerWave = 2;              \
+      constexpr ck::index_t kCShuffleNXdlPerWavePerShuffle = 2; \
+      __VA_ARGS__();                                            \
+    } else {                                                    \
+      constexpr ck::index_t kGemm1NPerBlock = 128;              \
+      constexpr ck::index_t kGemm1NXdlPerWave = 4;              \
+      constexpr ck::index_t kCShuffleNXdlPerWavePerShuffle = 4; \
+      __VA_ARGS__();                                            \
+    }                                                           \
+  }()
 #endif
 
-    // clang-format off
+  // clang-format off
   template <
       ck::index_t kGemm1NPerBlock,
       ck::index_t kGemm1NXdlPerWave,
@@ -153,206 +150,210 @@ struct grouped_infer_masktype_attnbias_dispatched
           GemmOpConstantsGroupedInfer::CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock,
           kCShuffleBlockTransferScalarPerVector,
           MaskingSpec>;
-    // clang-format on
-
-    static constexpr auto I1 = ck::Number<1>{};
-    static constexpr auto I2 = ck::Number<2>{};
-    static constexpr auto I3 = ck::Number<3>{};
-
-    static void Run(GroupedForwardParams& param, hipStream_t stream)
-    {
-        using ck::math::min;
-
-        // compile-time constants which don't depend on head-dim switching
-        constexpr ck::index_t thread_slice_length_ak1 =
-            GemmOpConstantsGroupedInfer::AK1 /
-            GemmOpConstantsGroupedInfer::ABlockTransferThreadClusterLengths_AK0_M_AK1::At(I2);
-        constexpr ck::index_t thread_slice_length_bk1 =
-            GemmOpConstantsGroupedInfer::BK1 /
-            GemmOpConstantsGroupedInfer::BBlockTransferThreadClusterLengths_BK0_N_BK1::At(I2);
-
-        static_assert(thread_slice_length_ak1 == thread_slice_length_bk1,
-                      "ABlockTransfer and BBlockTransfer should use completely same K1 sizes and "
-                      "ThreadClusterLengths!");
-
-        constexpr ck::index_t kABBlockTransferSrcScalarPerVector_max =
-            min(8, thread_slice_length_ak1);
-
-        GROUPED_INFER_HEADDIM_SWITCH(param.K, param.Kv, [&] {
-            constexpr ck::index_t thread_slice_length_gemm1n =
-                kGemm1NPerBlock /
-                GemmOpConstantsGroupedInfer::B1BlockTransferThreadClusterLengths_BK0_N_BK1::At(I1);
-            constexpr ck::index_t kB1BlockTransferSrcScalarPerVector_max =
-                min(4, thread_slice_length_gemm1n);
-
-            constexpr ck::index_t thread_slice_length_cshuflle_n =
-                (kCShuffleNXdlPerWavePerShuffle * kGemm1NPerBlock / kGemm1NXdlPerWave) /
-                GemmOpConstantsGroupedInfer::
-                    CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock ::At(I3);
-
-            constexpr ck::index_t kCShuffleBlockTransferScalarPerVector_max =
-                min(4, thread_slice_length_cshuflle_n);
-
-            if constexpr(kB1BlockTransferSrcScalarPerVector_max >=
-                         kCShuffleBlockTransferScalarPerVector_max)
-            {
-                ALIGN_SWITCH_2(kABBlockTransferSrcScalarPerVector_max,
-                               kABBlockTransferSrcScalarPerVector,
-                               param.K,
-                               kB1BlockTransferSrcScalarPerVector_max,
-                               kB1BlockTransferSrcScalarPerVector,
-                               param.Kv,
-                               [&] {
-                                   constexpr ck::index_t kCShuffleBlockTransferScalarPerVector =
-                                       min(kB1BlockTransferSrcScalarPerVector,
-                                           kCShuffleBlockTransferScalarPerVector_max);
-                                   using DeviceOpInstance =
-                                       DeviceOpInstanceTemp<kGemm1NPerBlock,
-                                                            kGemm1NXdlPerWave,
-                                                            kCShuffleNXdlPerWavePerShuffle,
-                                                            kABBlockTransferSrcScalarPerVector,
-                                                            kB1BlockTransferSrcScalarPerVector,
-                                                            kCShuffleBlockTransferScalarPerVector>;
-
-                                   RunWithDeviceOp<DeviceOpInstance>(param, stream);
-                               });
-            }
-            else
-            {
-                ALIGN_SWITCH_2(kABBlockTransferSrcScalarPerVector_max,
-                               kABBlockTransferSrcScalarPerVector,
-                               param.K,
-                               kCShuffleBlockTransferScalarPerVector_max,
-                               kCShuffleBlockTransferScalarPerVector,
-                               param.Kv,
-                               [&] {
-                                   constexpr ck::index_t kB1BlockTransferSrcScalarPerVector =
-                                       min(kCShuffleBlockTransferScalarPerVector,
-                                           kB1BlockTransferSrcScalarPerVector_max);
-                                   using DeviceOpInstance =
-                                       DeviceOpInstanceTemp<kGemm1NPerBlock,
-                                                            kGemm1NXdlPerWave,
-                                                            kCShuffleNXdlPerWavePerShuffle,
-                                                            kABBlockTransferSrcScalarPerVector,
-                                                            kB1BlockTransferSrcScalarPerVector,
-                                                            kCShuffleBlockTransferScalarPerVector>;
-
-                                   RunWithDeviceOp<DeviceOpInstance>(param, stream);
-                               });
-            };
-        });
-    };
-
-    template <typename DeviceOpInstance>
-    static void RunWithDeviceOp(GroupedForwardParams& param, hipStream_t stream)
-    {
-        std::vector<typename DeviceOpInstance::ProblemDesc> problem_descs;
-
-        for(std::size_t i = 0; i < param.num_batches; i++)
-        {
-            int M = param.host_seqstart_q[i + 1] - param.host_seqstart_q[i];
-            int N = param.host_seqlen_k.empty()
-                        ? param.host_seqstart_k[i + 1] - param.host_seqstart_k[i]
-                        : param.host_seqlen_k[i];
-            int K    = param.K;
-            int Kv   = param.Kv;
-            int G1q  = param.Hq;
-            int G1kv = param.Hkv;
-
-            std::vector<ck::index_t> a_gs_ms_ks_lengths{1, G1q, M, K};
-            std::vector<ck::index_t> a_gs_ms_ks_strides{
-                0, param.q_strides[1], param.q_strides[0], param.q_strides[2]};
-
-            std::vector<ck::index_t> b0_gs_ns_ks_lengths{1, G1kv, N, K};
-            std::vector<ck::index_t> b0_gs_ns_ks_strides{
-                0, param.k_strides[1], param.k_strides[0], param.k_strides[2]};
-
-            // to be changed to b1_gs_ns_os_lengths
-            std::vector<ck::index_t> b1_gs_os_ns_lengths{1, G1kv, Kv, N};
-            std::vector<ck::index_t> b1_gs_os_ns_strides{
-                0, param.v_strides[1], param.v_strides[2], param.v_strides[0]};
-
-            std::vector<ck::index_t> c_gs_ms_os_lengths{1, G1q, M, Kv};
-            std::vector<ck::index_t> c_gs_ms_os_strides{
-                0, param.out_strides[1], param.out_strides[0], param.out_strides[2]};
-
-            std::vector<ck::index_t> d_gs_ms_ns_lengths;
-            std::vector<ck::index_t> d_gs_ms_ns_strides;
-
-            if constexpr(has_attn_bias)
-            {
-                d_gs_ms_ns_lengths = {1, G1q, M, N};
-                d_gs_ms_ns_strides = {0,
-                                      param.attn_bias_strides[0],
-                                      param.attn_bias_strides[1],
-                                      param.attn_bias_strides[2]};
-            }
-            else
-            {
-                d_gs_ms_ns_lengths = {1, 1, 1, 1};
-                d_gs_ms_ns_strides = {0, 0, 0, 0};
-            };
-
-            problem_descs.push_back({a_gs_ms_ks_lengths,
-                                     a_gs_ms_ks_strides,
-                                     b0_gs_ns_ks_lengths,
-                                     b0_gs_ns_ks_strides,
-                                     b1_gs_os_ns_lengths,
-                                     b1_gs_os_ns_strides,
-                                     c_gs_ms_os_lengths,
-                                     c_gs_ms_os_strides,
-                                     d_gs_ms_ns_lengths,
-                                     d_gs_ms_ns_strides,
-                                     {},   // acc1_bias_gs_ms_os_lengths
-                                     {}}); // acc1_bias_gs_ms_os_strides
-        }
-
-        float alpha = param.scale;
-
-        auto a_element_op    = AElementOp{};
-        auto b0_element_op   = B0ElementOp{};
-        auto acc0_element_op = Acc0ElementOp{alpha};
-        auto b1_element_op   = B1ElementOp{};
-        auto c_element_op    = CElementOp{};
-
-        auto op      = DeviceOpInstance{};
-        auto invoker = op.MakeInvoker();
-
-        auto arg_ptr = op.MakeArgumentPointer(param.q_ptrs,
-                                              param.k_ptrs,
-                                              param.v_ptrs,
-                                              param.out_ptrs,
-                                              param.attn_bias_ptrs,
-                                              {}, // p_acc1_biases
-                                              problem_descs,
-                                              a_element_op,
-                                              b0_element_op,
-                                              acc0_element_op,
-                                              b1_element_op,
-                                              c_element_op);
-
-        auto sizeInBytes = op.GetWorkSpaceSize(arg_ptr.get());
-
-        SimpleDeviceMem workspace(sizeInBytes);
-
-        op.SetWorkSpacePointer(arg_ptr.get(), workspace.GetDeviceBuffer());
-
-        if(!op.IsSupportedArgument(arg_ptr.get()))
-        {
-            std::ostringstream ostr;
-
-            ostr << op.GetTypeString() << " does not support this problem";
-
-            throw std::runtime_error(ostr.str());
-        }
-
-        (void)invoker.Run(arg_ptr.get(), StreamConfig{stream, false});
-    };
+  // clang-format on
+
+  static constexpr auto I1 = ck::Number<1>{};
+  static constexpr auto I2 = ck::Number<2>{};
+  static constexpr auto I3 = ck::Number<3>{};
+
+  static void Run(GroupedForwardParams& param, hipStream_t stream) {
+    using ck::math::min;
+
+    // compile-time constants which don't depend on head-dim switching
+    constexpr ck::index_t thread_slice_length_ak1 =
+        GemmOpConstantsGroupedInfer::AK1 /
+        GemmOpConstantsGroupedInfer::
+            ABlockTransferThreadClusterLengths_AK0_M_AK1::At(I2);
+    constexpr ck::index_t thread_slice_length_bk1 =
+        GemmOpConstantsGroupedInfer::BK1 /
+        GemmOpConstantsGroupedInfer::
+            BBlockTransferThreadClusterLengths_BK0_N_BK1::At(I2);
+
+    static_assert(
+        thread_slice_length_ak1 == thread_slice_length_bk1,
+        "ABlockTransfer and BBlockTransfer should use completely same K1 sizes and "
+        "ThreadClusterLengths!");
+
+    constexpr ck::index_t kABBlockTransferSrcScalarPerVector_max =
+        min(8, thread_slice_length_ak1);
+
+    GROUPED_INFER_HEADDIM_SWITCH(param.K, param.Kv, [&] {
+      constexpr ck::index_t thread_slice_length_gemm1n = kGemm1NPerBlock /
+          GemmOpConstantsGroupedInfer::
+              B1BlockTransferThreadClusterLengths_BK0_N_BK1::At(I1);
+      constexpr ck::index_t kB1BlockTransferSrcScalarPerVector_max =
+          min(4, thread_slice_length_gemm1n);
+
+      constexpr ck::index_t thread_slice_length_cshuflle_n =
+          (kCShuffleNXdlPerWavePerShuffle * kGemm1NPerBlock /
+           kGemm1NXdlPerWave) /
+          GemmOpConstantsGroupedInfer::
+              CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock ::
+                  At(I3);
+
+      constexpr ck::index_t kCShuffleBlockTransferScalarPerVector_max =
+          min(4, thread_slice_length_cshuflle_n);
+
+      if constexpr (
+          kB1BlockTransferSrcScalarPerVector_max >=
+          kCShuffleBlockTransferScalarPerVector_max) {
+        ALIGN_SWITCH_2(
+            kABBlockTransferSrcScalarPerVector_max,
+            kABBlockTransferSrcScalarPerVector,
+            param.K,
+            kB1BlockTransferSrcScalarPerVector_max,
+            kB1BlockTransferSrcScalarPerVector,
+            param.Kv,
+            [&] {
+              constexpr ck::index_t kCShuffleBlockTransferScalarPerVector =
+                  min(kB1BlockTransferSrcScalarPerVector,
+                      kCShuffleBlockTransferScalarPerVector_max);
+              using DeviceOpInstance = DeviceOpInstanceTemp<
+                  kGemm1NPerBlock,
+                  kGemm1NXdlPerWave,
+                  kCShuffleNXdlPerWavePerShuffle,
+                  kABBlockTransferSrcScalarPerVector,
+                  kB1BlockTransferSrcScalarPerVector,
+                  kCShuffleBlockTransferScalarPerVector>;
+
+              RunWithDeviceOp<DeviceOpInstance>(param, stream);
+            });
+      } else {
+        ALIGN_SWITCH_2(
+            kABBlockTransferSrcScalarPerVector_max,
+            kABBlockTransferSrcScalarPerVector,
+            param.K,
+            kCShuffleBlockTransferScalarPerVector_max,
+            kCShuffleBlockTransferScalarPerVector,
+            param.Kv,
+            [&] {
+              constexpr ck::index_t kB1BlockTransferSrcScalarPerVector =
+                  min(kCShuffleBlockTransferScalarPerVector,
+                      kB1BlockTransferSrcScalarPerVector_max);
+              using DeviceOpInstance = DeviceOpInstanceTemp<
+                  kGemm1NPerBlock,
+                  kGemm1NXdlPerWave,
+                  kCShuffleNXdlPerWavePerShuffle,
+                  kABBlockTransferSrcScalarPerVector,
+                  kB1BlockTransferSrcScalarPerVector,
+                  kCShuffleBlockTransferScalarPerVector>;
+
+              RunWithDeviceOp<DeviceOpInstance>(param, stream);
+            });
+      };
+    });
+  };
+
+  template <typename DeviceOpInstance>
+  static void RunWithDeviceOp(GroupedForwardParams& param, hipStream_t stream) {
+    std::vector<typename DeviceOpInstance::ProblemDesc> problem_descs;
+
+    for (std::size_t i = 0; i < param.num_batches; i++) {
+      int M = param.host_seqstart_q[i + 1] - param.host_seqstart_q[i];
+      int N = param.host_seqlen_k.empty()
+          ? param.host_seqstart_k[i + 1] - param.host_seqstart_k[i]
+          : param.host_seqlen_k[i];
+      int K = param.K;
+      int Kv = param.Kv;
+      int G1q = param.Hq;
+      int G1kv = param.Hkv;
+
+      std::vector<ck::index_t> a_gs_ms_ks_lengths{1, G1q, M, K};
+      std::vector<ck::index_t> a_gs_ms_ks_strides{
+          0, param.q_strides[1], param.q_strides[0], param.q_strides[2]};
+
+      std::vector<ck::index_t> b0_gs_ns_ks_lengths{1, G1kv, N, K};
+      std::vector<ck::index_t> b0_gs_ns_ks_strides{
+          0, param.k_strides[1], param.k_strides[0], param.k_strides[2]};
+
+      // to be changed to b1_gs_ns_os_lengths
+      std::vector<ck::index_t> b1_gs_os_ns_lengths{1, G1kv, Kv, N};
+      std::vector<ck::index_t> b1_gs_os_ns_strides{
+          0, param.v_strides[1], param.v_strides[2], param.v_strides[0]};
+
+      std::vector<ck::index_t> c_gs_ms_os_lengths{1, G1q, M, Kv};
+      std::vector<ck::index_t> c_gs_ms_os_strides{
+          0, param.out_strides[1], param.out_strides[0], param.out_strides[2]};
+
+      std::vector<ck::index_t> d_gs_ms_ns_lengths;
+      std::vector<ck::index_t> d_gs_ms_ns_strides;
+
+      if constexpr (has_attn_bias) {
+        d_gs_ms_ns_lengths = {1, G1q, M, N};
+        d_gs_ms_ns_strides = {
+            0,
+            param.attn_bias_strides[0],
+            param.attn_bias_strides[1],
+            param.attn_bias_strides[2]};
+      } else {
+        d_gs_ms_ns_lengths = {1, 1, 1, 1};
+        d_gs_ms_ns_strides = {0, 0, 0, 0};
+      };
+
+      problem_descs.push_back(
+          {a_gs_ms_ks_lengths,
+           a_gs_ms_ks_strides,
+           b0_gs_ns_ks_lengths,
+           b0_gs_ns_ks_strides,
+           b1_gs_os_ns_lengths,
+           b1_gs_os_ns_strides,
+           c_gs_ms_os_lengths,
+           c_gs_ms_os_strides,
+           d_gs_ms_ns_lengths,
+           d_gs_ms_ns_strides,
+           {}, // acc1_bias_gs_ms_os_lengths
+           {}}); // acc1_bias_gs_ms_os_strides
+    }
+
+    float alpha = param.scale;
+
+    auto a_element_op = AElementOp{};
+    auto b0_element_op = B0ElementOp{};
+    auto acc0_element_op = Acc0ElementOp{alpha};
+    auto b1_element_op = B1ElementOp{};
+    auto c_element_op = CElementOp{};
+
+    auto op = DeviceOpInstance{};
+    auto invoker = op.MakeInvoker();
+
+    auto arg_ptr = op.MakeArgumentPointer(
+        param.q_ptrs,
+        param.k_ptrs,
+        param.v_ptrs,
+        param.out_ptrs,
+        param.attn_bias_ptrs,
+        {}, // p_acc1_biases
+        problem_descs,
+        a_element_op,
+        b0_element_op,
+        acc0_element_op,
+        b1_element_op,
+        c_element_op);
+
+    auto sizeInBytes = op.GetWorkSpaceSize(arg_ptr.get());
+
+    SimpleDeviceMem workspace(sizeInBytes);
+
+    op.SetWorkSpacePointer(arg_ptr.get(), workspace.GetDeviceBuffer());
+
+    if (!op.IsSupportedArgument(arg_ptr.get())) {
+      std::ostringstream ostr;
+
+      ostr << op.GetTypeString() << " does not support this problem";
+
+      throw std::runtime_error(ostr.str());
+    }
+
+    (void)invoker.Run(arg_ptr.get(), StreamConfig{stream, false});
+  };
 };
 
 template <typename scalar_t, int32_t custom_mask_type, bool has_attn_bias>
-void run_grouped_infer_masktype_attnbias_dispatched(GroupedForwardParams& param, hipStream_t stream)
-{
-    grouped_infer_masktype_attnbias_dispatched<scalar_t, custom_mask_type, has_attn_bias>::Run(
-        param, stream);
+void run_grouped_infer_masktype_attnbias_dispatched(
+    GroupedForwardParams& param,
+    hipStream_t stream) {
+  grouped_infer_masktype_attnbias_dispatched<
+      scalar_t,
+      custom_mask_type,
+      has_attn_bias>::Run(param, stream);
 };
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_infer_bp16.cpp b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_infer_bp16.cpp
index 5d91ad4a1..ef1014398 100644
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_infer_bp16.cpp
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_infer_bp16.cpp
@@ -10,43 +10,54 @@
 #include "ck_bool_switch.h"
 #include "ck_fmha_grouped_infer.h"
 
-extern template void
-run_grouped_infer_masktype_attnbias_dispatched<ck::bhalf_t, 0, true>(GroupedForwardParams& param,
-                                                                     hipStream_t stream);
-
-extern template void
-run_grouped_infer_masktype_attnbias_dispatched<ck::bhalf_t, 0, false>(GroupedForwardParams& param,
-                                                                      hipStream_t stream);
-
-extern template void
-run_grouped_infer_masktype_attnbias_dispatched<ck::bhalf_t, 1, true>(GroupedForwardParams& param,
-                                                                     hipStream_t stream);
-
-extern template void
-run_grouped_infer_masktype_attnbias_dispatched<ck::bhalf_t, 1, false>(GroupedForwardParams& param,
-                                                                      hipStream_t stream);
-
-extern template void
-run_grouped_infer_masktype_attnbias_dispatched<ck::bhalf_t, 2, true>(GroupedForwardParams& param,
-                                                                     hipStream_t stream);
-
-extern template void
-run_grouped_infer_masktype_attnbias_dispatched<ck::bhalf_t, 2, false>(GroupedForwardParams& param,
-                                                                      hipStream_t stream);
-
-void grouped_infer_bp16(GroupedForwardParams& param, hipStream_t stream)
-{
-    BOOL_SWITCH_1(param.has_attn_bias, HAS_ATTN_BIAS, [&] {
-        if(param.custom_mask_type == 0)
-            run_grouped_infer_masktype_attnbias_dispatched<ck::bhalf_t, 0, HAS_ATTN_BIAS>(param,
-                                                                                          stream);
-        else if(param.custom_mask_type == 1)
-            run_grouped_infer_masktype_attnbias_dispatched<ck::bhalf_t, 1, HAS_ATTN_BIAS>(param,
-                                                                                          stream);
-        else if(param.custom_mask_type == 2)
-            run_grouped_infer_masktype_attnbias_dispatched<ck::bhalf_t, 2, HAS_ATTN_BIAS>(param,
-                                                                                          stream);
-        else
-            throw std::runtime_error("Invalid custom_mask_type value");
-    });
+extern template void run_grouped_infer_masktype_attnbias_dispatched<
+    ck::bhalf_t,
+    0,
+    true>(GroupedForwardParams& param, hipStream_t stream);
+
+extern template void run_grouped_infer_masktype_attnbias_dispatched<
+    ck::bhalf_t,
+    0,
+    false>(GroupedForwardParams& param, hipStream_t stream);
+
+extern template void run_grouped_infer_masktype_attnbias_dispatched<
+    ck::bhalf_t,
+    1,
+    true>(GroupedForwardParams& param, hipStream_t stream);
+
+extern template void run_grouped_infer_masktype_attnbias_dispatched<
+    ck::bhalf_t,
+    1,
+    false>(GroupedForwardParams& param, hipStream_t stream);
+
+extern template void run_grouped_infer_masktype_attnbias_dispatched<
+    ck::bhalf_t,
+    2,
+    true>(GroupedForwardParams& param, hipStream_t stream);
+
+extern template void run_grouped_infer_masktype_attnbias_dispatched<
+    ck::bhalf_t,
+    2,
+    false>(GroupedForwardParams& param, hipStream_t stream);
+
+void grouped_infer_bp16(GroupedForwardParams& param, hipStream_t stream) {
+  BOOL_SWITCH_1(param.has_attn_bias, HAS_ATTN_BIAS, [&] {
+    if (param.custom_mask_type == 0)
+      run_grouped_infer_masktype_attnbias_dispatched<
+          ck::bhalf_t,
+          0,
+          HAS_ATTN_BIAS>(param, stream);
+    else if (param.custom_mask_type == 1)
+      run_grouped_infer_masktype_attnbias_dispatched<
+          ck::bhalf_t,
+          1,
+          HAS_ATTN_BIAS>(param, stream);
+    else if (param.custom_mask_type == 2)
+      run_grouped_infer_masktype_attnbias_dispatched<
+          ck::bhalf_t,
+          2,
+          HAS_ATTN_BIAS>(param, stream);
+    else
+      throw std::runtime_error("Invalid custom_mask_type value");
+  });
 };
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_infer_fp16.cpp b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_infer_fp16.cpp
index cd7dbb977..7fa075c85 100644
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_infer_fp16.cpp
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_infer_fp16.cpp
@@ -10,43 +10,54 @@
 #include "ck_bool_switch.h"
 #include "ck_fmha_grouped_infer.h"
 
-extern template void
-run_grouped_infer_masktype_attnbias_dispatched<ck::half_t, 0, true>(GroupedForwardParams& param,
-                                                                    hipStream_t stream);
-
-extern template void
-run_grouped_infer_masktype_attnbias_dispatched<ck::half_t, 0, false>(GroupedForwardParams& param,
-                                                                     hipStream_t stream);
-
-extern template void
-run_grouped_infer_masktype_attnbias_dispatched<ck::half_t, 1, true>(GroupedForwardParams& param,
-                                                                    hipStream_t stream);
-
-extern template void
-run_grouped_infer_masktype_attnbias_dispatched<ck::half_t, 1, false>(GroupedForwardParams& param,
-                                                                     hipStream_t stream);
-
-extern template void
-run_grouped_infer_masktype_attnbias_dispatched<ck::half_t, 2, true>(GroupedForwardParams& param,
-                                                                    hipStream_t stream);
-
-extern template void
-run_grouped_infer_masktype_attnbias_dispatched<ck::half_t, 2, false>(GroupedForwardParams& param,
-                                                                     hipStream_t stream);
-
-void grouped_infer_fp16(GroupedForwardParams& param, hipStream_t stream)
-{
-    BOOL_SWITCH_1(param.has_attn_bias, HAS_ATTN_BIAS, [&] {
-        if(param.custom_mask_type == 0)
-            run_grouped_infer_masktype_attnbias_dispatched<ck::half_t, 0, HAS_ATTN_BIAS>(param,
-                                                                                         stream);
-        else if(param.custom_mask_type == 1)
-            run_grouped_infer_masktype_attnbias_dispatched<ck::half_t, 1, HAS_ATTN_BIAS>(param,
-                                                                                         stream);
-        else if(param.custom_mask_type == 2)
-            run_grouped_infer_masktype_attnbias_dispatched<ck::half_t, 2, HAS_ATTN_BIAS>(param,
-                                                                                         stream);
-        else
-            throw std::runtime_error("Invalid custom_mask_type value");
-    });
+extern template void run_grouped_infer_masktype_attnbias_dispatched<
+    ck::half_t,
+    0,
+    true>(GroupedForwardParams& param, hipStream_t stream);
+
+extern template void run_grouped_infer_masktype_attnbias_dispatched<
+    ck::half_t,
+    0,
+    false>(GroupedForwardParams& param, hipStream_t stream);
+
+extern template void run_grouped_infer_masktype_attnbias_dispatched<
+    ck::half_t,
+    1,
+    true>(GroupedForwardParams& param, hipStream_t stream);
+
+extern template void run_grouped_infer_masktype_attnbias_dispatched<
+    ck::half_t,
+    1,
+    false>(GroupedForwardParams& param, hipStream_t stream);
+
+extern template void run_grouped_infer_masktype_attnbias_dispatched<
+    ck::half_t,
+    2,
+    true>(GroupedForwardParams& param, hipStream_t stream);
+
+extern template void run_grouped_infer_masktype_attnbias_dispatched<
+    ck::half_t,
+    2,
+    false>(GroupedForwardParams& param, hipStream_t stream);
+
+void grouped_infer_fp16(GroupedForwardParams& param, hipStream_t stream) {
+  BOOL_SWITCH_1(param.has_attn_bias, HAS_ATTN_BIAS, [&] {
+    if (param.custom_mask_type == 0)
+      run_grouped_infer_masktype_attnbias_dispatched<
+          ck::half_t,
+          0,
+          HAS_ATTN_BIAS>(param, stream);
+    else if (param.custom_mask_type == 1)
+      run_grouped_infer_masktype_attnbias_dispatched<
+          ck::half_t,
+          1,
+          HAS_ATTN_BIAS>(param, stream);
+    else if (param.custom_mask_type == 2)
+      run_grouped_infer_masktype_attnbias_dispatched<
+          ck::half_t,
+          2,
+          HAS_ATTN_BIAS>(param, stream);
+    else
+      throw std::runtime_error("Invalid custom_mask_type value");
+  });
 };
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_op_helper.h b/xformers/csrc/attention/hip_fmha/ck_fmha_op_helper.h
index f9cd1a49c..24ab800e9 100644
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_op_helper.h
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_op_helper.h
@@ -13,34 +13,33 @@
 #include <ck/utility/data_type.hpp>
 
 template <typename scalar_t>
-struct MaxVectorSizeForType
-{
-    static constexpr int value = 4;
+struct MaxVectorSizeForType {
+  static constexpr int value = 4;
 };
 
 template <>
-struct MaxVectorSizeForType<ck::half_t>
-{
-    static constexpr int value = 8;
+struct MaxVectorSizeForType<ck::half_t> {
+  static constexpr int value = 8;
 };
 
 template <>
-struct MaxVectorSizeForType<ck::bhalf_t>
-{
-    static constexpr int value = 8;
+struct MaxVectorSizeForType<ck::bhalf_t> {
+  static constexpr int value = 8;
 };
 
-struct SimpleDeviceMem
-{
-    SimpleDeviceMem() = delete;
-    SimpleDeviceMem(size_t sizeInBytes)
-    {
-        pData_ = c10::hip::HIPCachingAllocator::raw_alloc(sizeInBytes);
-    }
-    void* GetDeviceBuffer() { return pData_; }
-    ~SimpleDeviceMem() { c10::cuda::HIPCachingAllocator::raw_delete(pData_); }
-
-    void* pData_;
+struct SimpleDeviceMem {
+  SimpleDeviceMem() = delete;
+  SimpleDeviceMem(size_t sizeInBytes) {
+    pData_ = c10::hip::HIPCachingAllocator::raw_alloc(sizeInBytes);
+  }
+  void* GetDeviceBuffer() {
+    return pData_;
+  }
+  ~SimpleDeviceMem() {
+    c10::cuda::HIPCachingAllocator::raw_delete(pData_);
+  }
+
+  void* pData_;
 };
 
 // useful aliasing for making the codes easy
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_params.h b/xformers/csrc/attention/hip_fmha/ck_fmha_params.h
index a741d28b9..918126591 100644
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_params.h
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_params.h
@@ -9,210 +9,204 @@
 #include <array>
 #include <cstdint>
 
-struct BatchedInferParams
-{
-    int B;   // batch size
-    int M;   // seq_len for Query
-    int N;   // seq_len for Key and Value
-    int Hq;  // number of heads for Query
-    int Hkv; // number of heads for Key and Value
-    int K;   // embed_dim for Query and Key
-    int Kv;  // embed_dim for Value
-
-    float scale;
-    bool has_attn_bias;
-
-    // BMHK mode strides
-    std::array<int, 4> q_strides;
-    std::array<int, 4> k_strides;
-    std::array<int, 4> v_strides;
-    std::array<int, 4> out_strides;
-    std::array<int, 4> attn_bias_strides; // 4d tensor_view [B, H, M, N]
-
-    const void* q_ptr;
-    const void* k_ptr;
-    const void* v_ptr;
-    const void* attn_bias_ptr;
-
-    uint8_t custom_mask_type;
-
-    void* out_ptr;
+struct BatchedInferParams {
+  int B; // batch size
+  int M; // seq_len for Query
+  int N; // seq_len for Key and Value
+  int Hq; // number of heads for Query
+  int Hkv; // number of heads for Key and Value
+  int K; // embed_dim for Query and Key
+  int Kv; // embed_dim for Value
+
+  float scale;
+  bool has_attn_bias;
+
+  // BMHK mode strides
+  std::array<int, 4> q_strides;
+  std::array<int, 4> k_strides;
+  std::array<int, 4> v_strides;
+  std::array<int, 4> out_strides;
+  std::array<int, 4> attn_bias_strides; // 4d tensor_view [B, H, M, N]
+
+  const void* q_ptr;
+  const void* k_ptr;
+  const void* v_ptr;
+  const void* attn_bias_ptr;
+
+  uint8_t custom_mask_type;
+
+  void* out_ptr;
 };
 
-struct BatchedForwardParams : public BatchedInferParams
-{
-    bool use_dropout;
-    bool compute_logsumexp;
+struct BatchedForwardParams : public BatchedInferParams {
+  bool use_dropout;
+  bool compute_logsumexp;
 
-    float dropout_prob;
-    int64_t philox_seed;
-    int64_t philox_offset;
+  float dropout_prob;
+  int64_t philox_seed;
+  int64_t philox_offset;
 
-    // completely contiguous
-    void* logsumexp_ptr;
+  // completely contiguous
+  void* logsumexp_ptr;
 };
 
-struct GroupedInferParams
-{
-    int num_batches;
-    int M;   // total seq_len for all queries in the batch
-    int N;   // total seq_len for all keys/values in the batch
-    int Hq;  // number of heads for Query
-    int Hkv; // number of heads for Key and Value
-    int K;   // embed_dim for Query and Key
-    int Kv;  // embed_dim for Value
-
-    int max_seqlen_q;
-
-    std::vector<int> host_seqstart_q;
-    std::vector<int> host_seqstart_k;
-    std::vector<int> host_seqlen_k;
-
-    float scale;
-    bool has_attn_bias;
-
-    // MHK mode strides, last-dim contiguous
-    std::array<int, 3> q_strides;
-    std::array<int, 3> k_strides;
-    std::array<int, 3> v_strides;
-    std::array<int, 3> out_strides;
-
-    // 4d tensor view [B, H, M, N]
-    std::array<int, 4> attn_bias_strides;
-
-    std::vector<const void*> q_ptrs;
-    std::vector<const void*> k_ptrs;
-    std::vector<const void*> v_ptrs;
-    std::vector<const void*> attn_bias_ptrs;
-    std::vector<void*> out_ptrs;
-
-    uint8_t custom_mask_type;
+struct GroupedInferParams {
+  int num_batches;
+  int M; // total seq_len for all queries in the batch
+  int N; // total seq_len for all keys/values in the batch
+  int Hq; // number of heads for Query
+  int Hkv; // number of heads for Key and Value
+  int K; // embed_dim for Query and Key
+  int Kv; // embed_dim for Value
+
+  int max_seqlen_q;
+
+  std::vector<int> host_seqstart_q;
+  std::vector<int> host_seqstart_k;
+  std::vector<int> host_seqlen_k;
+
+  float scale;
+  bool has_attn_bias;
+
+  // MHK mode strides, last-dim contiguous
+  std::array<int, 3> q_strides;
+  std::array<int, 3> k_strides;
+  std::array<int, 3> v_strides;
+  std::array<int, 3> out_strides;
+
+  // 4d tensor view [B, H, M, N]
+  std::array<int, 4> attn_bias_strides;
+
+  std::vector<const void*> q_ptrs;
+  std::vector<const void*> k_ptrs;
+  std::vector<const void*> v_ptrs;
+  std::vector<const void*> attn_bias_ptrs;
+  std::vector<void*> out_ptrs;
+
+  uint8_t custom_mask_type;
 };
 
-struct GroupedForwardParams : public GroupedInferParams
-{
-    bool use_dropout;
-    bool compute_logsumexp;
+struct GroupedForwardParams : public GroupedInferParams {
+  bool use_dropout;
+  bool compute_logsumexp;
 
-    float dropout_prob;
-    int64_t philox_seed;
-    int64_t philox_offset;
+  float dropout_prob;
+  int64_t philox_seed;
+  int64_t philox_offset;
 
-    // completely contiguous
-    std::vector<void*> logsumexp_ptrs;
+  // completely contiguous
+  std::vector<void*> logsumexp_ptrs;
 
-    // TODO: need remove this after dev-op fix
-    std::vector<void*> randvals_ptrs;
+  // TODO: need remove this after dev-op fix
+  std::vector<void*> randvals_ptrs;
 };
 
-struct BatchedBackwardParams
-{
-    int B;   // batch size
-    int M;   // seq_len for Query
-    int N;   // seq_len for Key and Value
-    int Hq;  // number of heads for Query
-    int Hkv; // number of heads for Key and Value
-    int K;   // embed_dim for Query and Key
-    int Kv;  // embed_dim for Value
-
-    float scale;
-    bool has_attn_bias;
-    bool bias_has_grad;
-
-    bool use_fp32_qkv_grad;
-    bool is_mqa_gqa;
-
-    // BMHK mode strides, last-dim contiguous
-    std::array<int, 4> q_strides;
-    std::array<int, 4> k_strides;
-    std::array<int, 4> v_strides;
-    std::array<int, 4> attn_bias_strides; // 4d tensor_view [B, H, M, N]
-    std::array<int, 4> out_strides;
-
-    std::array<int, 4> tmp_grad_k_strides;
-    std::array<int, 4> tmp_grad_v_strides;
-
-    const void* q_ptr;
-    const void* k_ptr;
-    const void* v_ptr;
-    const void* attn_bias_ptr;
-    const void* grad_out_ptr;
-    const void* out_ptr;
-
-    uint8_t custom_mask_type;
-
-    void* grad_q_ptr;
-    void* grad_k_ptr;
-    void* grad_v_ptr;
-    void* grad_bias_ptr;
-
-    float dropout_prob;
-    int64_t philox_seed;
-    int64_t philox_offset;
-
-    // BHM mode lengths, completely contiguous
-    const void* logsumexp_ptr;
+struct BatchedBackwardParams {
+  int B; // batch size
+  int M; // seq_len for Query
+  int N; // seq_len for Key and Value
+  int Hq; // number of heads for Query
+  int Hkv; // number of heads for Key and Value
+  int K; // embed_dim for Query and Key
+  int Kv; // embed_dim for Value
+
+  float scale;
+  bool has_attn_bias;
+  bool bias_has_grad;
+
+  bool use_fp32_qkv_grad;
+  bool is_mqa_gqa;
+
+  // BMHK mode strides, last-dim contiguous
+  std::array<int, 4> q_strides;
+  std::array<int, 4> k_strides;
+  std::array<int, 4> v_strides;
+  std::array<int, 4> attn_bias_strides; // 4d tensor_view [B, H, M, N]
+  std::array<int, 4> out_strides;
+
+  std::array<int, 4> tmp_grad_k_strides;
+  std::array<int, 4> tmp_grad_v_strides;
+
+  const void* q_ptr;
+  const void* k_ptr;
+  const void* v_ptr;
+  const void* attn_bias_ptr;
+  const void* grad_out_ptr;
+  const void* out_ptr;
+
+  uint8_t custom_mask_type;
+
+  void* grad_q_ptr;
+  void* grad_k_ptr;
+  void* grad_v_ptr;
+  void* grad_bias_ptr;
+
+  float dropout_prob;
+  int64_t philox_seed;
+  int64_t philox_offset;
+
+  // BHM mode lengths, completely contiguous
+  const void* logsumexp_ptr;
 };
 
-struct GroupedBackwardParams
-{
-    int num_batches;
-    int M;   // total seq_len for all queries in the batch
-    int N;   // total seq_len for all keys/values in the batch
-    int Hq;  // number of heads for Query
-    int Hkv; // number of heads for Key and Value
-    int K;   // embed_dim for Query and Key
-    int Kv;  // embed_dim for Value
-
-    int max_seqlen_q;
-
-    std::vector<int> host_seqstart_q;
-    std::vector<int> host_seqstart_k;
-    std::vector<int> host_seqlen_k;
-
-    float scale;
-    bool has_attn_bias;
-    bool bias_has_grad;
-
-    bool use_fp32_qkv_grad;
-    bool is_mqa_gqa;
-
-    // MHK mode strides, last-dim contiguous
-    std::array<int, 3> q_strides;
-    std::array<int, 3> k_strides;
-    std::array<int, 3> v_strides;
-    std::array<int, 3> out_strides;
-    // 4d tensor view [B, H, M, N]
-    std::array<int, 4> attn_bias_strides;
-
-    std::array<int, 3> tmp_grad_k_strides;
-    std::array<int, 3> tmp_grad_v_strides;
-
-    std::vector<const void*> q_ptrs;
-    std::vector<const void*> k_ptrs;
-    std::vector<const void*> v_ptrs;
-    std::vector<const void*> attn_bias_ptrs;
-    std::vector<const void*> grad_out_ptrs;
-    std::vector<const void*> out_ptrs;
-
-    // used by the light_v2 kernel
-    // TODO use these as workspace
-    std::vector<void*> ydotdy_ptrs;
-
-    uint8_t custom_mask_type;
-
-    std::vector<void*> grad_q_ptrs;
-    std::vector<void*> grad_k_ptrs;
-    std::vector<void*> grad_v_ptrs;
-    std::vector<void*> grad_bias_ptrs;
-
-    float dropout_prob;
-    int64_t philox_seed;
-    int64_t philox_offset;
-
-    // BHM mode lengths, completely contiguous
-    std::vector<const void*> logsumexp_ptrs;
-
-    // TODO: need remove this after dev-op fix
-    std::vector<void*> randvals_ptrs;
+struct GroupedBackwardParams {
+  int num_batches;
+  int M; // total seq_len for all queries in the batch
+  int N; // total seq_len for all keys/values in the batch
+  int Hq; // number of heads for Query
+  int Hkv; // number of heads for Key and Value
+  int K; // embed_dim for Query and Key
+  int Kv; // embed_dim for Value
+
+  int max_seqlen_q;
+
+  std::vector<int> host_seqstart_q;
+  std::vector<int> host_seqstart_k;
+  std::vector<int> host_seqlen_k;
+
+  float scale;
+  bool has_attn_bias;
+  bool bias_has_grad;
+
+  bool use_fp32_qkv_grad;
+  bool is_mqa_gqa;
+
+  // MHK mode strides, last-dim contiguous
+  std::array<int, 3> q_strides;
+  std::array<int, 3> k_strides;
+  std::array<int, 3> v_strides;
+  std::array<int, 3> out_strides;
+  // 4d tensor view [B, H, M, N]
+  std::array<int, 4> attn_bias_strides;
+
+  std::array<int, 3> tmp_grad_k_strides;
+  std::array<int, 3> tmp_grad_v_strides;
+
+  std::vector<const void*> q_ptrs;
+  std::vector<const void*> k_ptrs;
+  std::vector<const void*> v_ptrs;
+  std::vector<const void*> attn_bias_ptrs;
+  std::vector<const void*> grad_out_ptrs;
+  std::vector<const void*> out_ptrs;
+
+  // used by the light_v2 kernel
+  // TODO use these as workspace
+  std::vector<void*> ydotdy_ptrs;
+
+  uint8_t custom_mask_type;
+
+  std::vector<void*> grad_q_ptrs;
+  std::vector<void*> grad_k_ptrs;
+  std::vector<void*> grad_v_ptrs;
+  std::vector<void*> grad_bias_ptrs;
+
+  float dropout_prob;
+  int64_t philox_seed;
+  int64_t philox_offset;
+
+  // BHM mode lengths, completely contiguous
+  std::vector<const void*> logsumexp_ptrs;
+
+  // TODO: need remove this after dev-op fix
+  std::vector<void*> randvals_ptrs;
 };
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_test.cpp b/xformers/csrc/attention/hip_fmha/ck_fmha_test.cpp
index 571b206fa..f97c8dd66 100644
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_test.cpp
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_test.cpp
@@ -11,29 +11,31 @@
 namespace {
 
 // For testing xFormers building and binding
-bool is_ck_fmha_available(double val)
-{
-    std::cout << "ck fmha is really here, val=" << val << std::endl;
-    return (true);
+bool is_ck_fmha_available(double val) {
+  std::cout << "ck fmha is really here, val=" << val << std::endl;
+  return (true);
 };
 
 // For checking if ck-tiled kernel is used
-bool is_ck_tiled_used()
-{
+bool is_ck_tiled_used() {
 #if defined(USE_CK_TILED_KERNEL)
-    return (true);
+  return (true);
 #else
-    return (false);
+  return (false);
 #endif
 };
 
 } // namespace
 
-TORCH_LIBRARY_FRAGMENT(xformers, m)
-{
-    m.def(TORCH_SELECTIVE_SCHEMA("xformers::is_ck_fmha_available(float val) -> bool"));
-    m.impl(TORCH_SELECTIVE_NAME("xformers::is_ck_fmha_available"), TORCH_FN(is_ck_fmha_available));
+TORCH_LIBRARY_FRAGMENT(xformers, m) {
+  m.def(TORCH_SELECTIVE_SCHEMA(
+      "xformers::is_ck_fmha_available(float val) -> bool"));
+  m.impl(
+      TORCH_SELECTIVE_NAME("xformers::is_ck_fmha_available"),
+      TORCH_FN(is_ck_fmha_available));
 
-    m.def(TORCH_SELECTIVE_SCHEMA("xformers::is_ck_tiled_used() -> bool"));
-    m.impl(TORCH_SELECTIVE_NAME("xformers::is_ck_tiled_used"), TORCH_FN(is_ck_tiled_used));
+  m.def(TORCH_SELECTIVE_SCHEMA("xformers::is_ck_tiled_used() -> bool"));
+  m.impl(
+      TORCH_SELECTIVE_NAME("xformers::is_ck_tiled_used"),
+      TORCH_FN(is_ck_tiled_used));
 }
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_util.h b/xformers/csrc/attention/hip_fmha/ck_fmha_util.h
index 8f26e4cee..a6ea50d78 100644
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_util.h
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_util.h
@@ -17,114 +17,99 @@
 #include <ck/utility/data_type.hpp>
 #include <ck/utility/sequence.hpp>
 
-#define XFORMERS_CHECK(COND, ERR)              \
-    if(!(COND))                                \
-    {                                          \
-        std::ostringstream ostr;               \
-        ostr << "'" #COND "' failed: " << ERR; \
-        throw std::runtime_error(ostr.str());  \
-    }
-
-#define DISPATCH_TYPES(InDataType, func)                                                  \
-    {                                                                                     \
-        if(InDataType == at::ScalarType::Half)                                            \
-        {                                                                                 \
-            using scalar_t = ck::half_t;                                                  \
-            func();                                                                       \
-        }                                                                                 \
-        else if(InDataType == at::ScalarType::BFloat16)                                   \
-        {                                                                                 \
-            using scalar_t = ck::bhalf_t;                                                 \
-            func();                                                                       \
-        }                                                                                 \
-        else                                                                              \
-        {                                                                                 \
-            XFORMERS_CHECK(false, "Only half & bf16 input type supported at the moment"); \
-        }                                                                                 \
-    }
+#define XFORMERS_CHECK(COND, ERR)          \
+  if (!(COND)) {                           \
+    std::ostringstream ostr;               \
+    ostr << "'" #COND "' failed: " << ERR; \
+    throw std::runtime_error(ostr.str());  \
+  }
+
+#define DISPATCH_TYPES(InDataType, func)                                 \
+  {                                                                      \
+    if (InDataType == at::ScalarType::Half) {                            \
+      using scalar_t = ck::half_t;                                       \
+      func();                                                            \
+    } else if (InDataType == at::ScalarType::BFloat16) {                 \
+      using scalar_t = ck::bhalf_t;                                      \
+      func();                                                            \
+    } else {                                                             \
+      XFORMERS_CHECK(                                                    \
+          false, "Only half & bf16 input type supported at the moment"); \
+    }                                                                    \
+  }
 
 template <typename scalar_t>
 struct CkToAtenDtype;
 
 template <>
-struct CkToAtenDtype<ck::half_t>
-{
-    using scalar_t = ck::half_t;
+struct CkToAtenDtype<ck::half_t> {
+  using scalar_t = ck::half_t;
 
-    static constexpr __host__ at::ScalarType atScalarType() { return at::ScalarType::Half; }
+  static constexpr __host__ at::ScalarType atScalarType() {
+    return at::ScalarType::Half;
+  }
 };
 
 template <>
-struct CkToAtenDtype<ck::bhalf_t>
-{
-    using scalar_t = ck::bhalf_t;
+struct CkToAtenDtype<ck::bhalf_t> {
+  using scalar_t = ck::bhalf_t;
 
-    static constexpr __host__ at::ScalarType atScalarType() { return at::ScalarType::BFloat16; }
+  static constexpr __host__ at::ScalarType atScalarType() {
+    return at::ScalarType::BFloat16;
+  }
 };
 
 template <>
-struct CkToAtenDtype<float>
-{
-    using scalar_t = float;
+struct CkToAtenDtype<float> {
+  using scalar_t = float;
 
-    static constexpr __host__ at::ScalarType atScalarType() { return at::ScalarType::Float; }
+  static constexpr __host__ at::ScalarType atScalarType() {
+    return at::ScalarType::Float;
+  }
 };
 
-#define CHECK_NOSPARSE_CONTIGUOUS_CUDA(TENSOR)                              \
-    XFORMERS_CHECK(TENSOR.is_cuda(), #TENSOR " must be a CUDA tensor");     \
-    XFORMERS_CHECK(!TENSOR.is_sparse(), #TENSOR " must be a dense tensor"); \
-    XFORMERS_CHECK(TENSOR.is_contiguous(), #TENSOR " must be contiguous");
-
-#define CHECK_NOSPARSE_CONTIGUOUS_CPU(TENSOR)                               \
-    XFORMERS_CHECK(TENSOR.is_cpu(), #TENSOR " must be a CPU tensor");       \
-    XFORMERS_CHECK(!TENSOR.is_sparse(), #TENSOR " must be a dense tensor"); \
-    XFORMERS_CHECK(TENSOR.is_contiguous(), #TENSOR " must be contiguous");
-
-#define CHECK_NOSPARSE_LASTCONTIGUOUS_CUDA(TENSOR)                          \
-    XFORMERS_CHECK(TENSOR.is_cuda(), #TENSOR " must be a CUDA tensor");     \
-    XFORMERS_CHECK(!TENSOR.is_sparse(), #TENSOR " must be a dense tensor"); \
-    XFORMERS_CHECK(TENSOR.stride(-1) == 1, #TENSOR ": last dimension must be contiguous");
-
-#define HIP_CALL_CHECK(flag)                                                       \
-    do                                                                             \
-    {                                                                              \
-        hipError_t _tmpVal;                                                        \
-        if((_tmpVal = flag) != hipSuccess)                                         \
-        {                                                                          \
-            std::ostringstream ostr;                                               \
-            ostr << "HIP Function Failed (" << __FILE__ << "," << __LINE__ << ") " \
-                 << hipGetErrorString(_tmpVal);                                    \
-            throw std::runtime_error(ostr.str());                                  \
-        }                                                                          \
-    } while(0)
-
-static inline size_t get_size_in_bytes(size_t n, at::ScalarType dtype)
-{
-    if(dtype == at::ScalarType::Float)
-    {
-        return n * 4;
-    }
-    else if(dtype == at::ScalarType::Half)
-    {
-        return n * 2;
-    }
-    else if(dtype == at::ScalarType::BFloat16)
-    {
-        return n * 2;
-    }
-    else if(dtype == at::ScalarType::Short)
-    {
-        return n * 2;
-    }
-    else if(dtype == at::ScalarType::Int)
-    {
-        return n * 4;
-    }
-    else if(dtype == at::ScalarType::Byte)
-    {
-        return n;
-    }
-    return 0;
+#define CHECK_NOSPARSE_CONTIGUOUS_CUDA(TENSOR)                            \
+  XFORMERS_CHECK(TENSOR.is_cuda(), #TENSOR " must be a CUDA tensor");     \
+  XFORMERS_CHECK(!TENSOR.is_sparse(), #TENSOR " must be a dense tensor"); \
+  XFORMERS_CHECK(TENSOR.is_contiguous(), #TENSOR " must be contiguous");
+
+#define CHECK_NOSPARSE_CONTIGUOUS_CPU(TENSOR)                             \
+  XFORMERS_CHECK(TENSOR.is_cpu(), #TENSOR " must be a CPU tensor");       \
+  XFORMERS_CHECK(!TENSOR.is_sparse(), #TENSOR " must be a dense tensor"); \
+  XFORMERS_CHECK(TENSOR.is_contiguous(), #TENSOR " must be contiguous");
+
+#define CHECK_NOSPARSE_LASTCONTIGUOUS_CUDA(TENSOR)                        \
+  XFORMERS_CHECK(TENSOR.is_cuda(), #TENSOR " must be a CUDA tensor");     \
+  XFORMERS_CHECK(!TENSOR.is_sparse(), #TENSOR " must be a dense tensor"); \
+  XFORMERS_CHECK(                                                         \
+      TENSOR.stride(-1) == 1, #TENSOR ": last dimension must be contiguous");
+
+#define HIP_CALL_CHECK(flag)                                                 \
+  do {                                                                       \
+    hipError_t _tmpVal;                                                      \
+    if ((_tmpVal = flag) != hipSuccess) {                                    \
+      std::ostringstream ostr;                                               \
+      ostr << "HIP Function Failed (" << __FILE__ << "," << __LINE__ << ") " \
+           << hipGetErrorString(_tmpVal);                                    \
+      throw std::runtime_error(ostr.str());                                  \
+    }                                                                        \
+  } while (0)
+
+static inline size_t get_size_in_bytes(size_t n, at::ScalarType dtype) {
+  if (dtype == at::ScalarType::Float) {
+    return n * 4;
+  } else if (dtype == at::ScalarType::Half) {
+    return n * 2;
+  } else if (dtype == at::ScalarType::BFloat16) {
+    return n * 2;
+  } else if (dtype == at::ScalarType::Short) {
+    return n * 2;
+  } else if (dtype == at::ScalarType::Int) {
+    return n * 4;
+  } else if (dtype == at::ScalarType::Byte) {
+    return n;
+  }
+  return 0;
 }
 
 /**
@@ -138,27 +123,36 @@ static inline size_t get_size_in_bytes(size_t n, at::ScalarType dtype)
  * expand the bias as needed - be careful to only create a view with different
  * shape/strides, no copies allowed.
  */
-inline at::Tensor
-get_bias_4d_view(const at::Tensor& bias, int batch_sz, int n_heads, int n_queries, int n_keys)
-{
-    TORCH_CHECK(bias.size(-2) == n_queries,
-                "bias.size(-2) != n_queries: ",
-                bias.size(-2),
-                " != ",
-                n_queries);
-    TORCH_CHECK(
-        bias.size(-1) == n_keys, "bias.size(-1) != n_keys: ", bias.size(-1), " != ", n_keys);
-    switch(bias.dim())
-    {
+inline at::Tensor get_bias_4d_view(
+    const at::Tensor& bias,
+    int batch_sz,
+    int n_heads,
+    int n_queries,
+    int n_keys) {
+  TORCH_CHECK(
+      bias.size(-2) == n_queries,
+      "bias.size(-2) != n_queries: ",
+      bias.size(-2),
+      " != ",
+      n_queries);
+  TORCH_CHECK(
+      bias.size(-1) == n_keys,
+      "bias.size(-1) != n_keys: ",
+      bias.size(-1),
+      " != ",
+      n_keys);
+  switch (bias.dim()) {
     case 2: // (n_queries, n_keys) - broadcast across all batches and heads
-        return bias.unsqueeze(0).unsqueeze(0).expand({batch_sz, n_heads, n_queries, n_keys});
+      return bias.unsqueeze(0).unsqueeze(0).expand(
+          {batch_sz, n_heads, n_queries, n_keys});
     case 3: // (batch_sz * n_heads, n_queries, n_keys) - just reshape
-        TORCH_CHECK(bias.size(0) == batch_sz * n_heads);
-        return bias.view({batch_sz, n_heads, n_queries, n_keys});
+      TORCH_CHECK(bias.size(0) == batch_sz * n_heads);
+      return bias.view({batch_sz, n_heads, n_queries, n_keys});
     case 4: // (batch_sz, n_heads, n_queries, n_keys) - do nothing
-        TORCH_CHECK(bias.size(0) == batch_sz);
-        TORCH_CHECK(bias.size(1) == n_heads)
-        return bias;
-    default: TORCH_CHECK(false, "bias can only have ndims in {2, 3, 4}");
-    }
+      TORCH_CHECK(bias.size(0) == batch_sz);
+      TORCH_CHECK(bias.size(1) == n_heads)
+      return bias;
+    default:
+      TORCH_CHECK(false, "bias can only have ndims in {2, 3, 4}");
+  }
 }
diff --git a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_forward.h b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_forward.h
index fd0f05b9d..8cdba0763 100644
--- a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_forward.h
+++ b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_forward.h
@@ -10,203 +10,224 @@
 #include <sstream>
 #include <stdexcept>
 
-#include <ck/utility/common_header.hpp>
 #include <ck/host_utility/device_prop.hpp>
 #include <ck/host_utility/kernel_launch.hpp>
+#include <ck/tensor/tensor_view.hpp>
 #include <ck/tensor_description/cluster_descriptor.hpp>
 #include <ck/tensor_description/tensor_descriptor_helper.hpp>
-#include <ck/tensor/tensor_view.hpp>
+#include <ck/utility/common_header.hpp>
 
+#include <ck/tile_program/block_tile/block_masking.hpp>
 #include <ck/tile_program/block_tile_pipeline/block_fmha_pipeline_problem.hpp>
 #include <ck/tile_program/block_tile_pipeline/block_fmha_pipeline_qr_ks_vs.hpp>
 #include <ck/tile_program/block_tile_pipeline/block_fmha_pipeline_qr_ks_vs_async.hpp>
 #include <ck/tile_program/block_tile_pipeline/block_fmha_pipeline_qs_ks_vs.hpp>
 #include <ck/tile_program/tile/tile_fmha_shape.hpp>
 #include <ck/tile_program/tile/tile_fmha_traits.hpp>
-#include <ck/tile_program/block_tile/block_masking.hpp>
 
+#include "ck_tiled_fmha_definitions.h"
 #include "ck_tiled_fmha_forward_kernel.h"
 #include "ck_tiled_fmha_fwd_epilogue.h"
 #include "ck_tiled_fmha_fwd_tile_partitioner.h"
 #include "ck_tiled_fmha_params.h"
-#include "ck_tiled_fmha_definitions.h"
 
 #include "ck_tiled_bool_switch.h"
 #include "ck_tiled_headdim_switch.h"
 
-template <typename scalar_t, bool has_causal_mask, bool has_attn_bias, ck::index_t HDim>
-struct batched_forward_causalmask_attnbias_dispatched
-{
-    using FmhaEpilogue =
-        FmhaFwdEpilogue<FmhaFwdEpilogueProblem<typename FmhaFwdTypeConfig<scalar_t>::OaccDataType,
-                                               typename FmhaFwdTypeConfig<scalar_t>::ODataType>>;
-
-    template <typename FmhaTraits, typename FmhaMask>
-    using FmhaPipelineProblemTemp = ck::tile_program::block::BlockFmhaPipelineProblem<
-        typename FmhaFwdTypeConfig<scalar_t>::QDataType,
-        typename FmhaFwdTypeConfig<scalar_t>::KDataType,
-        typename FmhaFwdTypeConfig<scalar_t>::VDataType,
-        typename FmhaFwdTypeConfig<scalar_t>::SaccDataType,
-        typename FmhaFwdTypeConfig<scalar_t>::SMPLComputeDataType,
-        typename FmhaFwdTypeConfig<scalar_t>::BiasDataType,
-        typename FmhaFwdTypeConfig<scalar_t>::LSEDataType,
-        typename FmhaFwdTypeConfig<scalar_t>::PDataType,
-        typename FmhaFwdTypeConfig<scalar_t>::OaccDataType,
-        typename FmhaFwdTypeConfig<scalar_t>::ODataType,
-        FmhaFwdShape<HDim>,
-        false, // kIsGroupMode
-        FmhaMask,
-        FmhaTraits>;
-
-    static void Run(BatchedForwardParams& param, hipStream_t stream)
-    {
-        const bool has_local_attention = (param.window_size > 0) ? true : false;
-
-        BOOL_SWITCH(has_local_attention, USE_LOCAL_ATTENTION, [&] {
-            constexpr bool has_masking = has_causal_mask || USE_LOCAL_ATTENTION;
-
-            using FmhaMask =
-                ck::tile_program::block::GenericAttentionMask<has_masking, USE_LOCAL_ATTENTION>;
-
-            using FmhaShape                 = FmhaFwdShape<HDim>;
-            using FmhaTilePartitioner       = FmhaFwdTilePartitioner<FmhaShape>;
-            constexpr ck::index_t occupancy = (HDim == 64) ? 3 : ((HDim == 256) ? 1 : 2);
-
-            bool pad_seqlen_q  = !(param.M % FmhaShape::kM0 == 0);
-            bool pad_seqlen_k  = !(param.N % FmhaShape::kN0 == 0);
-            bool pad_headdim_q = !(param.K % FmhaShape::kK0BlockLength == 0);
-            bool pad_headdim_v = !(param.Kv % FmhaShape::kN1 == 0);
-
-            if constexpr(HDim == 256)
-            {
-                BOOL_SWITCH_4(
-                    pad_seqlen_q,
-                    kPadSeqLenQ,
-                    pad_seqlen_k,
-                    kPadSeqLenK,
-                    pad_headdim_q,
-                    kPadHeadDimQ,
-                    pad_headdim_v,
-                    kPadHeadDimV,
-                    [&] {
-                        using FmhaTraits = ck::tile_program::TileFmhaTraits<kPadSeqLenQ,
-                                                                            kPadSeqLenK,
-                                                                            kPadHeadDimQ,
-                                                                            kPadHeadDimV,
-                                                                            has_attn_bias,
-                                                                            true, // kStoreLSE
-                                                                            occupancy>;
-
-                        using FmhaPipelineProblem = FmhaPipelineProblemTemp<FmhaTraits, FmhaMask>;
-
-                        using FmhaPipeline =
-                            ck::tile_program::block::BlockFmhaPipelineQSKSVS<FmhaPipelineProblem>;
-                        using FmhaKernel =
-                            FmhaFwdKernel<FmhaTilePartitioner, FmhaPipeline, FmhaEpilogue>;
-
-                        RunWithKernel<FmhaKernel>(param, stream);
-                    });
-            }
-            else
-            {
-                BOOL_SWITCH_4(
-                    pad_seqlen_q,
-                    kPadSeqLenQ,
-                    pad_seqlen_k,
-                    kPadSeqLenK,
-                    pad_headdim_q,
-                    kPadHeadDimQ,
-                    pad_headdim_v,
-                    kPadHeadDimV,
-                    [&] {
-                        using FmhaTraits = ck::tile_program::TileFmhaTraits<kPadSeqLenQ,
-                                                                            kPadSeqLenK,
-                                                                            kPadHeadDimQ,
-                                                                            kPadHeadDimV,
-                                                                            has_attn_bias,
-                                                                            true, // kStoreLSE
-                                                                            occupancy>;
-
-                        using FmhaPipelineProblem = FmhaPipelineProblemTemp<FmhaTraits, FmhaMask>;
-
-                        constexpr bool no_any_padding =
-                            !(kPadSeqLenQ || kPadSeqLenK || kPadHeadDimQ || kPadHeadDimV);
-
-                        if constexpr(no_any_padding)
-                        {
-                            using FmhaPipeline =
-                                ck::tile_program::block::BlockFmhaPipelineQRKSVSAsync<
-                                    FmhaPipelineProblem>;
-                            using FmhaKernel =
-                                FmhaFwdKernel<FmhaTilePartitioner, FmhaPipeline, FmhaEpilogue>;
-
-                            RunWithKernel<FmhaKernel>(param, stream);
-                        }
-                        else
-                        {
-                            using FmhaPipeline = ck::tile_program::block::BlockFmhaPipelineQRKSVS<
-                                FmhaPipelineProblem>;
-                            using FmhaKernel =
-                                FmhaFwdKernel<FmhaTilePartitioner, FmhaPipeline, FmhaEpilogue>;
-
-                            RunWithKernel<FmhaKernel>(param, stream);
-                        };
-                    });
-            };
-        });
-    };
-
-    template <typename FmhaKernel>
-    static void RunWithKernel(BatchedForwardParams& param, hipStream_t stream)
-    {
-        const auto kargs = [&] {
-            return FmhaKernel::MakeKargs(
-                param.q_ptr,
-                param.k_ptr,
-                param.v_ptr,
-                param.attn_bias_ptr,
-                param.logsumexp_ptr,
-                param.out_ptr,
-                param.M,              // seqlen_q
-                param.N,              // seqlen_k
-                param.K,              // hdim_q
-                param.Kv,             // hdim_v
-                param.Hq / param.Hkv, // nhead_ratio_qk
-                param.scale,
-                param.q_strides[1], // q, k, v, bias, out tensor seq-dim stride
-                param.k_strides[1],
-                param.v_strides[1],
-                param.attn_bias_strides[2],
-                param.out_strides[1],
-                param.q_strides[2], // q, k, v, bias, lse, out tensor head-dim stride
-                param.k_strides[2],
-                param.v_strides[2],
-                param.attn_bias_strides[1],
-                param.M, // nhead_stride_lse
-                param.out_strides[2],
-                param.q_strides[0], // q, k, v, bias, lse, out tensor batch-dim stride
-                param.k_strides[0],
-                param.v_strides[0],
-                param.attn_bias_strides[0],
-                param.Hq * param.M, // batch_stride_lse
-                param.out_strides[0],
-                static_cast<CausalMaskType>(param.custom_mask_type),
-                param.window_size);
-        }();
-
-        dim3 kGridSize            = FmhaKernel::GridSize(param.B, param.Hq, param.M, param.Kv);
-        constexpr dim3 kBlockSize = FmhaKernel::BlockSize();
-        constexpr ck::index_t kBlockPerCu = FmhaKernel::kBlockPerCu;
-
-        (void)launch_kernel<kBlockSize.x, kBlockPerCu>(
-            StreamConfig{stream, false}, FmhaKernel{}, kGridSize, kBlockSize, 0, kargs);
-    };
+template <
+    typename scalar_t,
+    bool has_causal_mask,
+    bool has_attn_bias,
+    ck::index_t HDim>
+struct batched_forward_causalmask_attnbias_dispatched {
+  using FmhaEpilogue = FmhaFwdEpilogue<FmhaFwdEpilogueProblem<
+      typename FmhaFwdTypeConfig<scalar_t>::OaccDataType,
+      typename FmhaFwdTypeConfig<scalar_t>::ODataType>>;
+
+  template <typename FmhaTraits, typename FmhaMask>
+  using FmhaPipelineProblemTemp =
+      ck::tile_program::block::BlockFmhaPipelineProblem<
+          typename FmhaFwdTypeConfig<scalar_t>::QDataType,
+          typename FmhaFwdTypeConfig<scalar_t>::KDataType,
+          typename FmhaFwdTypeConfig<scalar_t>::VDataType,
+          typename FmhaFwdTypeConfig<scalar_t>::SaccDataType,
+          typename FmhaFwdTypeConfig<scalar_t>::SMPLComputeDataType,
+          typename FmhaFwdTypeConfig<scalar_t>::BiasDataType,
+          typename FmhaFwdTypeConfig<scalar_t>::LSEDataType,
+          typename FmhaFwdTypeConfig<scalar_t>::PDataType,
+          typename FmhaFwdTypeConfig<scalar_t>::OaccDataType,
+          typename FmhaFwdTypeConfig<scalar_t>::ODataType,
+          FmhaFwdShape<HDim>,
+          false, // kIsGroupMode
+          FmhaMask,
+          FmhaTraits>;
+
+  static void Run(BatchedForwardParams& param, hipStream_t stream) {
+    const bool has_local_attention = (param.window_size > 0) ? true : false;
+
+    BOOL_SWITCH(has_local_attention, USE_LOCAL_ATTENTION, [&] {
+      constexpr bool has_masking = has_causal_mask || USE_LOCAL_ATTENTION;
+
+      using FmhaMask = ck::tile_program::block::
+          GenericAttentionMask<has_masking, USE_LOCAL_ATTENTION>;
+
+      using FmhaShape = FmhaFwdShape<HDim>;
+      using FmhaTilePartitioner = FmhaFwdTilePartitioner<FmhaShape>;
+      constexpr ck::index_t occupancy =
+          (HDim == 64) ? 3 : ((HDim == 256) ? 1 : 2);
+
+      bool pad_seqlen_q = !(param.M % FmhaShape::kM0 == 0);
+      bool pad_seqlen_k = !(param.N % FmhaShape::kN0 == 0);
+      bool pad_headdim_q = !(param.K % FmhaShape::kK0BlockLength == 0);
+      bool pad_headdim_v = !(param.Kv % FmhaShape::kN1 == 0);
+
+      if constexpr (HDim == 256) {
+        BOOL_SWITCH_4(
+            pad_seqlen_q,
+            kPadSeqLenQ,
+            pad_seqlen_k,
+            kPadSeqLenK,
+            pad_headdim_q,
+            kPadHeadDimQ,
+            pad_headdim_v,
+            kPadHeadDimV,
+            [&] {
+              using FmhaTraits = ck::tile_program::TileFmhaTraits<
+                  kPadSeqLenQ,
+                  kPadSeqLenK,
+                  kPadHeadDimQ,
+                  kPadHeadDimV,
+                  has_attn_bias,
+                  true, // kStoreLSE
+                  occupancy>;
+
+              using FmhaPipelineProblem =
+                  FmhaPipelineProblemTemp<FmhaTraits, FmhaMask>;
+
+              using FmhaPipeline =
+                  ck::tile_program::block::BlockFmhaPipelineQSKSVS<
+                      FmhaPipelineProblem>;
+              using FmhaKernel = FmhaFwdKernel<
+                  FmhaTilePartitioner,
+                  FmhaPipeline,
+                  FmhaEpilogue>;
+
+              RunWithKernel<FmhaKernel>(param, stream);
+            });
+      } else {
+        BOOL_SWITCH_4(
+            pad_seqlen_q,
+            kPadSeqLenQ,
+            pad_seqlen_k,
+            kPadSeqLenK,
+            pad_headdim_q,
+            kPadHeadDimQ,
+            pad_headdim_v,
+            kPadHeadDimV,
+            [&] {
+              using FmhaTraits = ck::tile_program::TileFmhaTraits<
+                  kPadSeqLenQ,
+                  kPadSeqLenK,
+                  kPadHeadDimQ,
+                  kPadHeadDimV,
+                  has_attn_bias,
+                  true, // kStoreLSE
+                  occupancy>;
+
+              using FmhaPipelineProblem =
+                  FmhaPipelineProblemTemp<FmhaTraits, FmhaMask>;
+
+              constexpr bool no_any_padding =
+                  !(kPadSeqLenQ || kPadSeqLenK || kPadHeadDimQ || kPadHeadDimV);
+
+              if constexpr (no_any_padding) {
+                using FmhaPipeline =
+                    ck::tile_program::block::BlockFmhaPipelineQRKSVSAsync<
+                        FmhaPipelineProblem>;
+                using FmhaKernel = FmhaFwdKernel<
+                    FmhaTilePartitioner,
+                    FmhaPipeline,
+                    FmhaEpilogue>;
+
+                RunWithKernel<FmhaKernel>(param, stream);
+              } else {
+                using FmhaPipeline =
+                    ck::tile_program::block::BlockFmhaPipelineQRKSVS<
+                        FmhaPipelineProblem>;
+                using FmhaKernel = FmhaFwdKernel<
+                    FmhaTilePartitioner,
+                    FmhaPipeline,
+                    FmhaEpilogue>;
+
+                RunWithKernel<FmhaKernel>(param, stream);
+              };
+            });
+      };
+    });
+  };
+
+  template <typename FmhaKernel>
+  static void RunWithKernel(BatchedForwardParams& param, hipStream_t stream) {
+    const auto kargs = [&] {
+      return FmhaKernel::MakeKargs(
+          param.q_ptr,
+          param.k_ptr,
+          param.v_ptr,
+          param.attn_bias_ptr,
+          param.logsumexp_ptr,
+          param.out_ptr,
+          param.M, // seqlen_q
+          param.N, // seqlen_k
+          param.K, // hdim_q
+          param.Kv, // hdim_v
+          param.Hq / param.Hkv, // nhead_ratio_qk
+          param.scale,
+          param.q_strides[1], // q, k, v, bias, out tensor seq-dim stride
+          param.k_strides[1],
+          param.v_strides[1],
+          param.attn_bias_strides[2],
+          param.out_strides[1],
+          param.q_strides[2], // q, k, v, bias, lse, out tensor head-dim stride
+          param.k_strides[2],
+          param.v_strides[2],
+          param.attn_bias_strides[1],
+          param.M, // nhead_stride_lse
+          param.out_strides[2],
+          param.q_strides[0], // q, k, v, bias, lse, out tensor batch-dim stride
+          param.k_strides[0],
+          param.v_strides[0],
+          param.attn_bias_strides[0],
+          param.Hq * param.M, // batch_stride_lse
+          param.out_strides[0],
+          static_cast<CausalMaskType>(param.custom_mask_type),
+          param.window_size);
+    }();
+
+    dim3 kGridSize = FmhaKernel::GridSize(param.B, param.Hq, param.M, param.Kv);
+    constexpr dim3 kBlockSize = FmhaKernel::BlockSize();
+    constexpr ck::index_t kBlockPerCu = FmhaKernel::kBlockPerCu;
+
+    (void)launch_kernel<kBlockSize.x, kBlockPerCu>(
+        StreamConfig{stream, false},
+        FmhaKernel{},
+        kGridSize,
+        kBlockSize,
+        0,
+        kargs);
+  };
 };
 
-template <typename scalar_t, bool has_causal_mask, bool has_attn_bias, ck::index_t HDim>
-void run_batched_forward_causalmask_attnbias_dispatched(BatchedForwardParams& param,
-                                                        hipStream_t stream)
-{
-    batched_forward_causalmask_attnbias_dispatched<scalar_t, has_causal_mask, has_attn_bias, HDim>::
-        Run(param, stream);
+template <
+    typename scalar_t,
+    bool has_causal_mask,
+    bool has_attn_bias,
+    ck::index_t HDim>
+void run_batched_forward_causalmask_attnbias_dispatched(
+    BatchedForwardParams& param,
+    hipStream_t stream) {
+  batched_forward_causalmask_attnbias_dispatched<
+      scalar_t,
+      has_causal_mask,
+      has_attn_bias,
+      HDim>::Run(param, stream);
 };
diff --git a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_forward_bp16.cpp b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_forward_bp16.cpp
index 7bdf6cfd7..749c80a77 100644
--- a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_forward_bp16.cpp
+++ b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_forward_bp16.cpp
@@ -49,22 +49,23 @@ extern template void run_batched_forward_causalmask_attnbias_dispatched<ck::bhal
     BatchedForwardParams& param, hipStream_t stream);
 // clang-format on
 
-void batched_forward_bp16(BatchedForwardParams& param, hipStream_t stream)
-{
-    BOOL_SWITCH(param.has_attn_bias, HAS_ATTN_BIAS, [&] {
-        FMHA_FWD_HEADDIM_SWITCH(param.K, param.Kv, HDim, [&] {
-            if(param.custom_mask_type == 0)
-                run_batched_forward_causalmask_attnbias_dispatched<ck::bhalf_t,
-                                                                   false,
-                                                                   HAS_ATTN_BIAS,
-                                                                   HDim>(param, stream);
-            else if(param.custom_mask_type == 1 || param.custom_mask_type == 2)
-                run_batched_forward_causalmask_attnbias_dispatched<ck::bhalf_t,
-                                                                   true,
-                                                                   HAS_ATTN_BIAS,
-                                                                   HDim>(param, stream);
-            else
-                throw std::runtime_error("Invalid custom_mask_type value");
-        });
+void batched_forward_bp16(BatchedForwardParams& param, hipStream_t stream) {
+  BOOL_SWITCH(param.has_attn_bias, HAS_ATTN_BIAS, [&] {
+    FMHA_FWD_HEADDIM_SWITCH(param.K, param.Kv, HDim, [&] {
+      if (param.custom_mask_type == 0)
+        run_batched_forward_causalmask_attnbias_dispatched<
+            ck::bhalf_t,
+            false,
+            HAS_ATTN_BIAS,
+            HDim>(param, stream);
+      else if (param.custom_mask_type == 1 || param.custom_mask_type == 2)
+        run_batched_forward_causalmask_attnbias_dispatched<
+            ck::bhalf_t,
+            true,
+            HAS_ATTN_BIAS,
+            HDim>(param, stream);
+      else
+        throw std::runtime_error("Invalid custom_mask_type value");
     });
+  });
 };
diff --git a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_forward_fp16.cpp b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_forward_fp16.cpp
index 05abf084e..c65f7fedc 100644
--- a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_forward_fp16.cpp
+++ b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_forward_fp16.cpp
@@ -49,22 +49,23 @@ extern template void run_batched_forward_causalmask_attnbias_dispatched<ck::half
     BatchedForwardParams& param, hipStream_t stream);
 // clang-format on
 
-void batched_forward_fp16(BatchedForwardParams& param, hipStream_t stream)
-{
-    BOOL_SWITCH(param.has_attn_bias, HAS_ATTN_BIAS, [&] {
-        FMHA_FWD_HEADDIM_SWITCH(param.K, param.Kv, HDim, [&] {
-            if(param.custom_mask_type == 0)
-                run_batched_forward_causalmask_attnbias_dispatched<ck::half_t,
-                                                                   false,
-                                                                   HAS_ATTN_BIAS,
-                                                                   HDim>(param, stream);
-            else if(param.custom_mask_type == 1 || param.custom_mask_type == 2)
-                run_batched_forward_causalmask_attnbias_dispatched<ck::half_t,
-                                                                   true,
-                                                                   HAS_ATTN_BIAS,
-                                                                   HDim>(param, stream);
-            else
-                throw std::runtime_error("Invalid custom_mask_type value");
-        });
+void batched_forward_fp16(BatchedForwardParams& param, hipStream_t stream) {
+  BOOL_SWITCH(param.has_attn_bias, HAS_ATTN_BIAS, [&] {
+    FMHA_FWD_HEADDIM_SWITCH(param.K, param.Kv, HDim, [&] {
+      if (param.custom_mask_type == 0)
+        run_batched_forward_causalmask_attnbias_dispatched<
+            ck::half_t,
+            false,
+            HAS_ATTN_BIAS,
+            HDim>(param, stream);
+      else if (param.custom_mask_type == 1 || param.custom_mask_type == 2)
+        run_batched_forward_causalmask_attnbias_dispatched<
+            ck::half_t,
+            true,
+            HAS_ATTN_BIAS,
+            HDim>(param, stream);
+      else
+        throw std::runtime_error("Invalid custom_mask_type value");
     });
+  });
 };
diff --git a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_infer.h b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_infer.h
index d7af0af43..0d72fde9f 100644
--- a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_infer.h
+++ b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_infer.h
@@ -10,203 +10,224 @@
 #include <sstream>
 #include <stdexcept>
 
-#include <ck/utility/common_header.hpp>
 #include <ck/host_utility/device_prop.hpp>
 #include <ck/host_utility/kernel_launch.hpp>
+#include <ck/tensor/tensor_view.hpp>
 #include <ck/tensor_description/cluster_descriptor.hpp>
 #include <ck/tensor_description/tensor_descriptor_helper.hpp>
-#include <ck/tensor/tensor_view.hpp>
+#include <ck/utility/common_header.hpp>
 
+#include <ck/tile_program/block_tile/block_masking.hpp>
 #include <ck/tile_program/block_tile_pipeline/block_fmha_pipeline_problem.hpp>
 #include <ck/tile_program/block_tile_pipeline/block_fmha_pipeline_qr_ks_vs.hpp>
 #include <ck/tile_program/block_tile_pipeline/block_fmha_pipeline_qr_ks_vs_async.hpp>
 #include <ck/tile_program/block_tile_pipeline/block_fmha_pipeline_qs_ks_vs.hpp>
 #include <ck/tile_program/tile/tile_fmha_shape.hpp>
 #include <ck/tile_program/tile/tile_fmha_traits.hpp>
-#include <ck/tile_program/block_tile/block_masking.hpp>
 
+#include "ck_tiled_fmha_definitions.h"
 #include "ck_tiled_fmha_forward_kernel.h"
 #include "ck_tiled_fmha_fwd_epilogue.h"
 #include "ck_tiled_fmha_fwd_tile_partitioner.h"
 #include "ck_tiled_fmha_params.h"
-#include "ck_tiled_fmha_definitions.h"
 
 #include "ck_tiled_bool_switch.h"
 #include "ck_tiled_headdim_switch.h"
 
-template <typename scalar_t, bool has_causal_mask, bool has_attn_bias, ck::index_t HDim>
-struct batched_infer_causalmask_attnbias_dispatched
-{
-    using FmhaEpilogue =
-        FmhaFwdEpilogue<FmhaFwdEpilogueProblem<typename FmhaFwdTypeConfig<scalar_t>::OaccDataType,
-                                               typename FmhaFwdTypeConfig<scalar_t>::ODataType>>;
-
-    template <typename FmhaTraits, typename FmhaMask>
-    using FmhaPipelineProblemTemp = ck::tile_program::block::BlockFmhaPipelineProblem<
-        typename FmhaFwdTypeConfig<scalar_t>::QDataType,
-        typename FmhaFwdTypeConfig<scalar_t>::KDataType,
-        typename FmhaFwdTypeConfig<scalar_t>::VDataType,
-        typename FmhaFwdTypeConfig<scalar_t>::SaccDataType,
-        typename FmhaFwdTypeConfig<scalar_t>::SMPLComputeDataType,
-        typename FmhaFwdTypeConfig<scalar_t>::BiasDataType,
-        typename FmhaFwdTypeConfig<scalar_t>::LSEDataType,
-        typename FmhaFwdTypeConfig<scalar_t>::PDataType,
-        typename FmhaFwdTypeConfig<scalar_t>::OaccDataType,
-        typename FmhaFwdTypeConfig<scalar_t>::ODataType,
-        FmhaFwdShape<HDim>,
-        false, // kIsGroupMode
-        FmhaMask,
-        FmhaTraits>;
-
-    static void Run(BatchedForwardParams& param, hipStream_t stream)
-    {
-        const bool has_local_attention = (param.window_size > 0) ? true : false;
-
-        BOOL_SWITCH(has_local_attention, USE_LOCAL_ATTENTION, [&] {
-            constexpr bool has_masking = has_causal_mask || USE_LOCAL_ATTENTION;
-
-            using FmhaMask =
-                ck::tile_program::block::GenericAttentionMask<has_masking, USE_LOCAL_ATTENTION>;
-
-            using FmhaShape                 = FmhaFwdShape<HDim>;
-            using FmhaTilePartitioner       = FmhaFwdTilePartitioner<FmhaShape>;
-            constexpr ck::index_t occupancy = (HDim == 64) ? 3 : ((HDim == 256) ? 1 : 2);
-
-            bool pad_seqlen_q  = !(param.M % FmhaShape::kM0 == 0);
-            bool pad_seqlen_k  = !(param.N % FmhaShape::kN0 == 0);
-            bool pad_headdim_v = !(param.Kv % FmhaShape::kN1 == 0);
-            bool pad_headdim_q = !(param.K % FmhaShape::kK0BlockLength == 0);
-
-            if constexpr(HDim == 256)
-            {
-                BOOL_SWITCH_4(
-                    pad_seqlen_q,
-                    kPadSeqLenQ,
-                    pad_seqlen_k,
-                    kPadSeqLenK,
-                    pad_headdim_q,
-                    kPadHeadDimQ,
-                    pad_headdim_v,
-                    kPadHeadDimV,
-                    [&] {
-                        using FmhaTraits = ck::tile_program::TileFmhaTraits<kPadSeqLenQ,
-                                                                            kPadSeqLenK,
-                                                                            kPadHeadDimQ,
-                                                                            kPadHeadDimV,
-                                                                            has_attn_bias,
-                                                                            false, // kStoreLSE
-                                                                            occupancy>;
-
-                        using FmhaPipelineProblem = FmhaPipelineProblemTemp<FmhaTraits, FmhaMask>;
-
-                        using FmhaPipeline =
-                            ck::tile_program::block::BlockFmhaPipelineQSKSVS<FmhaPipelineProblem>;
-                        using FmhaKernel =
-                            FmhaFwdKernel<FmhaTilePartitioner, FmhaPipeline, FmhaEpilogue>;
-
-                        RunWithKernel<FmhaKernel>(param, stream);
-                    });
-            }
-            else
-            {
-                BOOL_SWITCH_4(
-                    pad_seqlen_q,
-                    kPadSeqLenQ,
-                    pad_seqlen_k,
-                    kPadSeqLenK,
-                    pad_headdim_q,
-                    kPadHeadDimQ,
-                    pad_headdim_v,
-                    kPadHeadDimV,
-                    [&] {
-                        using FmhaTraits = ck::tile_program::TileFmhaTraits<kPadSeqLenQ,
-                                                                            kPadSeqLenK,
-                                                                            kPadHeadDimQ,
-                                                                            kPadHeadDimV,
-                                                                            has_attn_bias,
-                                                                            false, // kStoreLSE
-                                                                            occupancy>;
-
-                        using FmhaPipelineProblem = FmhaPipelineProblemTemp<FmhaTraits, FmhaMask>;
-
-                        constexpr bool no_any_padding =
-                            !(kPadSeqLenQ || kPadSeqLenK || kPadHeadDimQ || kPadHeadDimV);
-
-                        if constexpr(no_any_padding)
-                        {
-                            using FmhaPipeline =
-                                ck::tile_program::block::BlockFmhaPipelineQRKSVSAsync<
-                                    FmhaPipelineProblem>;
-                            using FmhaKernel =
-                                FmhaFwdKernel<FmhaTilePartitioner, FmhaPipeline, FmhaEpilogue>;
-
-                            RunWithKernel<FmhaKernel>(param, stream);
-                        }
-                        else
-                        {
-                            using FmhaPipeline = ck::tile_program::block::BlockFmhaPipelineQRKSVS<
-                                FmhaPipelineProblem>;
-                            using FmhaKernel =
-                                FmhaFwdKernel<FmhaTilePartitioner, FmhaPipeline, FmhaEpilogue>;
-
-                            RunWithKernel<FmhaKernel>(param, stream);
-                        };
-                    });
-            };
-        });
-    };
-
-    template <typename FmhaKernel>
-    static void RunWithKernel(BatchedForwardParams& param, hipStream_t stream)
-    {
-        const auto kargs = [&] {
-            return FmhaKernel::MakeKargs(
-                param.q_ptr,
-                param.k_ptr,
-                param.v_ptr,
-                param.attn_bias_ptr,
-                nullptr, // lse_ptr
-                param.out_ptr,
-                param.M,              // seqlen_q
-                param.N,              // seqlen_k
-                param.K,              // hdim_q
-                param.Kv,             // hdim_v
-                param.Hq / param.Hkv, // nhead_ratio_qk
-                param.scale,
-                param.q_strides[1], // q, k, v, bias, out tensor seq-dim stride
-                param.k_strides[1],
-                param.v_strides[1],
-                param.attn_bias_strides[2],
-                param.out_strides[1],
-                param.q_strides[2], // q, k, v, bias, lse, out tensor head-dim stride
-                param.k_strides[2],
-                param.v_strides[2],
-                param.attn_bias_strides[1],
-                0, // nhead_stride_lse
-                param.out_strides[2],
-                param.q_strides[0], // q, k, v, bias, lse, out tensor batch-dim stride
-                param.k_strides[0],
-                param.v_strides[0],
-                param.attn_bias_strides[0],
-                0, // batch_stride_lse
-                param.out_strides[0],
-                static_cast<CausalMaskType>(param.custom_mask_type),
-                param.window_size);
-        }();
-
-        dim3 kGridSize            = FmhaKernel::GridSize(param.B, param.Hq, param.M, param.Kv);
-        constexpr dim3 kBlockSize = FmhaKernel::BlockSize();
-        constexpr ck::index_t kBlockPerCu = FmhaKernel::kBlockPerCu;
-
-        (void)launch_kernel<kBlockSize.x, kBlockPerCu>(
-            StreamConfig{stream, false}, FmhaKernel{}, kGridSize, kBlockSize, 0, kargs);
-    };
+template <
+    typename scalar_t,
+    bool has_causal_mask,
+    bool has_attn_bias,
+    ck::index_t HDim>
+struct batched_infer_causalmask_attnbias_dispatched {
+  using FmhaEpilogue = FmhaFwdEpilogue<FmhaFwdEpilogueProblem<
+      typename FmhaFwdTypeConfig<scalar_t>::OaccDataType,
+      typename FmhaFwdTypeConfig<scalar_t>::ODataType>>;
+
+  template <typename FmhaTraits, typename FmhaMask>
+  using FmhaPipelineProblemTemp =
+      ck::tile_program::block::BlockFmhaPipelineProblem<
+          typename FmhaFwdTypeConfig<scalar_t>::QDataType,
+          typename FmhaFwdTypeConfig<scalar_t>::KDataType,
+          typename FmhaFwdTypeConfig<scalar_t>::VDataType,
+          typename FmhaFwdTypeConfig<scalar_t>::SaccDataType,
+          typename FmhaFwdTypeConfig<scalar_t>::SMPLComputeDataType,
+          typename FmhaFwdTypeConfig<scalar_t>::BiasDataType,
+          typename FmhaFwdTypeConfig<scalar_t>::LSEDataType,
+          typename FmhaFwdTypeConfig<scalar_t>::PDataType,
+          typename FmhaFwdTypeConfig<scalar_t>::OaccDataType,
+          typename FmhaFwdTypeConfig<scalar_t>::ODataType,
+          FmhaFwdShape<HDim>,
+          false, // kIsGroupMode
+          FmhaMask,
+          FmhaTraits>;
+
+  static void Run(BatchedForwardParams& param, hipStream_t stream) {
+    const bool has_local_attention = (param.window_size > 0) ? true : false;
+
+    BOOL_SWITCH(has_local_attention, USE_LOCAL_ATTENTION, [&] {
+      constexpr bool has_masking = has_causal_mask || USE_LOCAL_ATTENTION;
+
+      using FmhaMask = ck::tile_program::block::
+          GenericAttentionMask<has_masking, USE_LOCAL_ATTENTION>;
+
+      using FmhaShape = FmhaFwdShape<HDim>;
+      using FmhaTilePartitioner = FmhaFwdTilePartitioner<FmhaShape>;
+      constexpr ck::index_t occupancy =
+          (HDim == 64) ? 3 : ((HDim == 256) ? 1 : 2);
+
+      bool pad_seqlen_q = !(param.M % FmhaShape::kM0 == 0);
+      bool pad_seqlen_k = !(param.N % FmhaShape::kN0 == 0);
+      bool pad_headdim_v = !(param.Kv % FmhaShape::kN1 == 0);
+      bool pad_headdim_q = !(param.K % FmhaShape::kK0BlockLength == 0);
+
+      if constexpr (HDim == 256) {
+        BOOL_SWITCH_4(
+            pad_seqlen_q,
+            kPadSeqLenQ,
+            pad_seqlen_k,
+            kPadSeqLenK,
+            pad_headdim_q,
+            kPadHeadDimQ,
+            pad_headdim_v,
+            kPadHeadDimV,
+            [&] {
+              using FmhaTraits = ck::tile_program::TileFmhaTraits<
+                  kPadSeqLenQ,
+                  kPadSeqLenK,
+                  kPadHeadDimQ,
+                  kPadHeadDimV,
+                  has_attn_bias,
+                  false, // kStoreLSE
+                  occupancy>;
+
+              using FmhaPipelineProblem =
+                  FmhaPipelineProblemTemp<FmhaTraits, FmhaMask>;
+
+              using FmhaPipeline =
+                  ck::tile_program::block::BlockFmhaPipelineQSKSVS<
+                      FmhaPipelineProblem>;
+              using FmhaKernel = FmhaFwdKernel<
+                  FmhaTilePartitioner,
+                  FmhaPipeline,
+                  FmhaEpilogue>;
+
+              RunWithKernel<FmhaKernel>(param, stream);
+            });
+      } else {
+        BOOL_SWITCH_4(
+            pad_seqlen_q,
+            kPadSeqLenQ,
+            pad_seqlen_k,
+            kPadSeqLenK,
+            pad_headdim_q,
+            kPadHeadDimQ,
+            pad_headdim_v,
+            kPadHeadDimV,
+            [&] {
+              using FmhaTraits = ck::tile_program::TileFmhaTraits<
+                  kPadSeqLenQ,
+                  kPadSeqLenK,
+                  kPadHeadDimQ,
+                  kPadHeadDimV,
+                  has_attn_bias,
+                  false, // kStoreLSE
+                  occupancy>;
+
+              using FmhaPipelineProblem =
+                  FmhaPipelineProblemTemp<FmhaTraits, FmhaMask>;
+
+              constexpr bool no_any_padding =
+                  !(kPadSeqLenQ || kPadSeqLenK || kPadHeadDimQ || kPadHeadDimV);
+
+              if constexpr (no_any_padding) {
+                using FmhaPipeline =
+                    ck::tile_program::block::BlockFmhaPipelineQRKSVSAsync<
+                        FmhaPipelineProblem>;
+                using FmhaKernel = FmhaFwdKernel<
+                    FmhaTilePartitioner,
+                    FmhaPipeline,
+                    FmhaEpilogue>;
+
+                RunWithKernel<FmhaKernel>(param, stream);
+              } else {
+                using FmhaPipeline =
+                    ck::tile_program::block::BlockFmhaPipelineQRKSVS<
+                        FmhaPipelineProblem>;
+                using FmhaKernel = FmhaFwdKernel<
+                    FmhaTilePartitioner,
+                    FmhaPipeline,
+                    FmhaEpilogue>;
+
+                RunWithKernel<FmhaKernel>(param, stream);
+              };
+            });
+      };
+    });
+  };
+
+  template <typename FmhaKernel>
+  static void RunWithKernel(BatchedForwardParams& param, hipStream_t stream) {
+    const auto kargs = [&] {
+      return FmhaKernel::MakeKargs(
+          param.q_ptr,
+          param.k_ptr,
+          param.v_ptr,
+          param.attn_bias_ptr,
+          nullptr, // lse_ptr
+          param.out_ptr,
+          param.M, // seqlen_q
+          param.N, // seqlen_k
+          param.K, // hdim_q
+          param.Kv, // hdim_v
+          param.Hq / param.Hkv, // nhead_ratio_qk
+          param.scale,
+          param.q_strides[1], // q, k, v, bias, out tensor seq-dim stride
+          param.k_strides[1],
+          param.v_strides[1],
+          param.attn_bias_strides[2],
+          param.out_strides[1],
+          param.q_strides[2], // q, k, v, bias, lse, out tensor head-dim stride
+          param.k_strides[2],
+          param.v_strides[2],
+          param.attn_bias_strides[1],
+          0, // nhead_stride_lse
+          param.out_strides[2],
+          param.q_strides[0], // q, k, v, bias, lse, out tensor batch-dim stride
+          param.k_strides[0],
+          param.v_strides[0],
+          param.attn_bias_strides[0],
+          0, // batch_stride_lse
+          param.out_strides[0],
+          static_cast<CausalMaskType>(param.custom_mask_type),
+          param.window_size);
+    }();
+
+    dim3 kGridSize = FmhaKernel::GridSize(param.B, param.Hq, param.M, param.Kv);
+    constexpr dim3 kBlockSize = FmhaKernel::BlockSize();
+    constexpr ck::index_t kBlockPerCu = FmhaKernel::kBlockPerCu;
+
+    (void)launch_kernel<kBlockSize.x, kBlockPerCu>(
+        StreamConfig{stream, false},
+        FmhaKernel{},
+        kGridSize,
+        kBlockSize,
+        0,
+        kargs);
+  };
 };
 
-template <typename scalar_t, bool has_causal_mask, bool has_attn_bias, ck::index_t HDim>
-void run_batched_infer_causalmask_attnbias_dispatched(BatchedForwardParams& param,
-                                                      hipStream_t stream)
-{
-    batched_infer_causalmask_attnbias_dispatched<scalar_t, has_causal_mask, has_attn_bias, HDim>::
-        Run(param, stream);
+template <
+    typename scalar_t,
+    bool has_causal_mask,
+    bool has_attn_bias,
+    ck::index_t HDim>
+void run_batched_infer_causalmask_attnbias_dispatched(
+    BatchedForwardParams& param,
+    hipStream_t stream) {
+  batched_infer_causalmask_attnbias_dispatched<
+      scalar_t,
+      has_causal_mask,
+      has_attn_bias,
+      HDim>::Run(param, stream);
 };
diff --git a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_infer_bp16.cpp b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_infer_bp16.cpp
index 93b7be27a..f0a4edd84 100644
--- a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_infer_bp16.cpp
+++ b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_infer_bp16.cpp
@@ -49,22 +49,23 @@ extern template void run_batched_infer_causalmask_attnbias_dispatched<ck::bhalf_
     BatchedForwardParams& param, hipStream_t stream);
 // clang-format on
 
-void batched_infer_bp16(BatchedForwardParams& param, hipStream_t stream)
-{
-    BOOL_SWITCH(param.has_attn_bias, HAS_ATTN_BIAS, [&] {
-        FMHA_FWD_HEADDIM_SWITCH(param.K, param.Kv, HDim, [&] {
-            if(param.custom_mask_type == 0)
-                run_batched_infer_causalmask_attnbias_dispatched<ck::bhalf_t,
-                                                                 false,
-                                                                 HAS_ATTN_BIAS,
-                                                                 HDim>(param, stream);
-            else if(param.custom_mask_type == 1 || param.custom_mask_type == 2)
-                run_batched_infer_causalmask_attnbias_dispatched<ck::bhalf_t,
-                                                                 true,
-                                                                 HAS_ATTN_BIAS,
-                                                                 HDim>(param, stream);
-            else
-                throw std::runtime_error("Invalid custom_mask_type value");
-        });
+void batched_infer_bp16(BatchedForwardParams& param, hipStream_t stream) {
+  BOOL_SWITCH(param.has_attn_bias, HAS_ATTN_BIAS, [&] {
+    FMHA_FWD_HEADDIM_SWITCH(param.K, param.Kv, HDim, [&] {
+      if (param.custom_mask_type == 0)
+        run_batched_infer_causalmask_attnbias_dispatched<
+            ck::bhalf_t,
+            false,
+            HAS_ATTN_BIAS,
+            HDim>(param, stream);
+      else if (param.custom_mask_type == 1 || param.custom_mask_type == 2)
+        run_batched_infer_causalmask_attnbias_dispatched<
+            ck::bhalf_t,
+            true,
+            HAS_ATTN_BIAS,
+            HDim>(param, stream);
+      else
+        throw std::runtime_error("Invalid custom_mask_type value");
     });
+  });
 };
diff --git a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_infer_fp16.cpp b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_infer_fp16.cpp
index 170af665d..b25041fdf 100644
--- a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_infer_fp16.cpp
+++ b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_infer_fp16.cpp
@@ -49,22 +49,23 @@ extern template void run_batched_infer_causalmask_attnbias_dispatched<ck::half_t
     BatchedForwardParams& param, hipStream_t stream);
 // clang-format on
 
-void batched_infer_fp16(BatchedForwardParams& param, hipStream_t stream)
-{
-    BOOL_SWITCH(param.has_attn_bias, HAS_ATTN_BIAS, [&] {
-        FMHA_FWD_HEADDIM_SWITCH(param.K, param.Kv, HDim, [&] {
-            if(param.custom_mask_type == 0)
-                run_batched_infer_causalmask_attnbias_dispatched<ck::half_t,
-                                                                 false,
-                                                                 HAS_ATTN_BIAS,
-                                                                 HDim>(param, stream);
-            else if(param.custom_mask_type == 1 || param.custom_mask_type == 2)
-                run_batched_infer_causalmask_attnbias_dispatched<ck::half_t,
-                                                                 true,
-                                                                 HAS_ATTN_BIAS,
-                                                                 HDim>(param, stream);
-            else
-                throw std::runtime_error("Invalid custom_mask_type value");
-        });
+void batched_infer_fp16(BatchedForwardParams& param, hipStream_t stream) {
+  BOOL_SWITCH(param.has_attn_bias, HAS_ATTN_BIAS, [&] {
+    FMHA_FWD_HEADDIM_SWITCH(param.K, param.Kv, HDim, [&] {
+      if (param.custom_mask_type == 0)
+        run_batched_infer_causalmask_attnbias_dispatched<
+            ck::half_t,
+            false,
+            HAS_ATTN_BIAS,
+            HDim>(param, stream);
+      else if (param.custom_mask_type == 1 || param.custom_mask_type == 2)
+        run_batched_infer_causalmask_attnbias_dispatched<
+            ck::half_t,
+            true,
+            HAS_ATTN_BIAS,
+            HDim>(param, stream);
+      else
+        throw std::runtime_error("Invalid custom_mask_type value");
     });
+  });
 };
diff --git a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_definitions.h b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_definitions.h
index 8444f097a..a20a8b5bd 100644
--- a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_definitions.h
+++ b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_definitions.h
@@ -8,75 +8,70 @@
 
 #include <ck/tile_program/tile/tile_fmha_shape.hpp>
 
-enum struct CausalMaskType
-{
-    MaskDisabled,
-    MaskUpperTriangleFromTopLeft,
-    MaskUpperTriangleFromBottomRight
+enum struct CausalMaskType {
+  MaskDisabled,
+  MaskUpperTriangleFromTopLeft,
+  MaskUpperTriangleFromBottomRight
 };
 
 template <typename DataType>
 struct FmhaFwdTypeConfig;
 
 template <>
-struct FmhaFwdTypeConfig<ck::half_t>
-{
-    using QDataType           = ck::half_t;
-    using KDataType           = ck::half_t;
-    using VDataType           = ck::half_t;
-    using BiasDataType        = ck::half_t;
-    using LSEDataType         = float;      // data type for lse(logsumexp L_j = max_j + log(l_j))
-    using SaccDataType        = float;      // data type for first gemm accumulation
-    using SMPLComputeDataType = float;      // data type for reduction, softmax
-    using PDataType           = ck::half_t; // data type for A matrix of second gemm
-    using OaccDataType        = float;      // data type for second gemm accumulation
-    using ODataType           = ck::half_t;
+struct FmhaFwdTypeConfig<ck::half_t> {
+  using QDataType = ck::half_t;
+  using KDataType = ck::half_t;
+  using VDataType = ck::half_t;
+  using BiasDataType = ck::half_t;
+  using LSEDataType =
+      float; // data type for lse(logsumexp L_j = max_j + log(l_j))
+  using SaccDataType = float; // data type for first gemm accumulation
+  using SMPLComputeDataType = float; // data type for reduction, softmax
+  using PDataType = ck::half_t; // data type for A matrix of second gemm
+  using OaccDataType = float; // data type for second gemm accumulation
+  using ODataType = ck::half_t;
 };
 
 template <>
-struct FmhaFwdTypeConfig<ck::bhalf_t>
-{
-    using QDataType           = ck::bhalf_t;
-    using KDataType           = ck::bhalf_t;
-    using VDataType           = ck::bhalf_t;
-    using BiasDataType        = ck::bhalf_t;
-    using LSEDataType         = float;       // data type for lse(logsumexp L_j = max_j + log(l_j))
-    using SaccDataType        = float;       // data type for first gemm accumulation
-    using SMPLComputeDataType = float;       // data type for reduction, softmax
-    using PDataType           = ck::bhalf_t; // data type for A matrix of second gemm
-    using OaccDataType        = float;       // data type for second gemm accumulation
-    using ODataType           = ck::bhalf_t;
+struct FmhaFwdTypeConfig<ck::bhalf_t> {
+  using QDataType = ck::bhalf_t;
+  using KDataType = ck::bhalf_t;
+  using VDataType = ck::bhalf_t;
+  using BiasDataType = ck::bhalf_t;
+  using LSEDataType =
+      float; // data type for lse(logsumexp L_j = max_j + log(l_j))
+  using SaccDataType = float; // data type for first gemm accumulation
+  using SMPLComputeDataType = float; // data type for reduction, softmax
+  using PDataType = ck::bhalf_t; // data type for A matrix of second gemm
+  using OaccDataType = float; // data type for second gemm accumulation
+  using ODataType = ck::bhalf_t;
 };
 
 template <ck::index_t HDim>
 struct FmhaFwdBlockTile;
 
 template <>
-struct FmhaFwdBlockTile<32>
-{
-    using type = ck::Sequence<128, 64, 16, 32, 32, 32>;
+struct FmhaFwdBlockTile<32> {
+  using type = ck::Sequence<128, 64, 16, 32, 32, 32>;
 };
 
 template <>
-struct FmhaFwdBlockTile<64>
-{
-    using type = ck::Sequence<128, 64, 32, 64, 32, 64>;
+struct FmhaFwdBlockTile<64> {
+  using type = ck::Sequence<128, 64, 32, 64, 32, 64>;
 };
 
 template <>
-struct FmhaFwdBlockTile<128>
-{
-    using type = ck::Sequence<128, 128, 32, 128, 32, 128>;
+struct FmhaFwdBlockTile<128> {
+  using type = ck::Sequence<128, 128, 32, 128, 32, 128>;
 };
 
 template <>
-struct FmhaFwdBlockTile<256>
-{
-    using type = ck::Sequence<128, 128, 32, 256, 32, 256>;
+struct FmhaFwdBlockTile<256> {
+  using type = ck::Sequence<128, 128, 32, 256, 32, 256>;
 };
 
 using FmhaFwdBlockWarps = ck::Sequence<4, 1, 1>;
-using FmhaFwdWarpTile   = ck::Sequence<32, 32, 16>;
+using FmhaFwdWarpTile = ck::Sequence<32, 32, 16>;
 
 static constexpr bool IsVLayoutRowMajor = true;
 
@@ -84,41 +79,37 @@ template <ck::index_t HDim>
 struct FmhaFwdShape;
 
 template <>
-struct FmhaFwdShape<32> : ck::tile_program::TileFmhaShape<typename FmhaFwdBlockTile<32>::type,
-                                                          ck::Sequence<2, 1, 1>,
-                                                          FmhaFwdWarpTile,
-                                                          ck::Sequence<2, 1, 1>,
-                                                          FmhaFwdWarpTile,
-                                                          IsVLayoutRowMajor>
-{
-};
+struct FmhaFwdShape<32> : ck::tile_program::TileFmhaShape<
+                              typename FmhaFwdBlockTile<32>::type,
+                              ck::Sequence<2, 1, 1>,
+                              FmhaFwdWarpTile,
+                              ck::Sequence<2, 1, 1>,
+                              FmhaFwdWarpTile,
+                              IsVLayoutRowMajor> {};
 
 template <>
-struct FmhaFwdShape<64> : ck::tile_program::TileFmhaShape<typename FmhaFwdBlockTile<64>::type,
-                                                          FmhaFwdBlockWarps,
-                                                          FmhaFwdWarpTile,
-                                                          FmhaFwdBlockWarps,
-                                                          FmhaFwdWarpTile,
-                                                          IsVLayoutRowMajor>
-{
-};
+struct FmhaFwdShape<64> : ck::tile_program::TileFmhaShape<
+                              typename FmhaFwdBlockTile<64>::type,
+                              FmhaFwdBlockWarps,
+                              FmhaFwdWarpTile,
+                              FmhaFwdBlockWarps,
+                              FmhaFwdWarpTile,
+                              IsVLayoutRowMajor> {};
 
 template <>
-struct FmhaFwdShape<128> : ck::tile_program::TileFmhaShape<typename FmhaFwdBlockTile<128>::type,
-                                                           FmhaFwdBlockWarps,
-                                                           FmhaFwdWarpTile,
-                                                           FmhaFwdBlockWarps,
-                                                           FmhaFwdWarpTile,
-                                                           IsVLayoutRowMajor>
-{
-};
+struct FmhaFwdShape<128> : ck::tile_program::TileFmhaShape<
+                               typename FmhaFwdBlockTile<128>::type,
+                               FmhaFwdBlockWarps,
+                               FmhaFwdWarpTile,
+                               FmhaFwdBlockWarps,
+                               FmhaFwdWarpTile,
+                               IsVLayoutRowMajor> {};
 
 template <>
-struct FmhaFwdShape<256> : ck::tile_program::TileFmhaShape<typename FmhaFwdBlockTile<256>::type,
-                                                           FmhaFwdBlockWarps,
-                                                           FmhaFwdWarpTile,
-                                                           FmhaFwdBlockWarps,
-                                                           FmhaFwdWarpTile,
-                                                           IsVLayoutRowMajor>
-{
-};
+struct FmhaFwdShape<256> : ck::tile_program::TileFmhaShape<
+                               typename FmhaFwdBlockTile<256>::type,
+                               FmhaFwdBlockWarps,
+                               FmhaFwdWarpTile,
+                               FmhaFwdBlockWarps,
+                               FmhaFwdWarpTile,
+                               IsVLayoutRowMajor> {};
diff --git a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_forward_kernel.h b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_forward_kernel.h
index 542fed4f1..78c62cfa3 100644
--- a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_forward_kernel.h
+++ b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_forward_kernel.h
@@ -8,10 +8,10 @@
 
 #include <type_traits>
 
-#include <ck/utility/common_header.hpp>
 #include <ck/tensor/tensor_view.hpp>
-#include <ck/tile_program/tile/tile_window.hpp>
 #include <ck/tile_program/block_tile/block_masking.hpp>
+#include <ck/tile_program/tile/tile_window.hpp>
+#include <ck/utility/common_header.hpp>
 
 #include "ck_tiled_fmha_definitions.h"
 
@@ -21,646 +21,644 @@
 // P[seqlen_q, seqlen_k] = Softmax(S[seqlen_q, seqlen_k])
 // O[seqlen_q, hdim_v] = P[seqlen_q, seqlen_k] * V[hdim_v, seqlen_k]
 
-template <typename TilePartitioner_, typename FmhaPipeline_, typename EpiloguePipeline_>
-struct FmhaFwdKernel
-{
-    using TilePartitioner                    = ck::remove_cvref_t<TilePartitioner_>;
-    using FmhaPipeline                       = ck::remove_cvref_t<FmhaPipeline_>;
-    using EpiloguePipeline                   = ck::remove_cvref_t<EpiloguePipeline_>;
-    static constexpr ck::index_t kBlockSize  = FmhaPipeline::kBlockSize;
-    static constexpr ck::index_t kBlockPerCu = FmhaPipeline::kBlockPerCu;
-
-    using QDataType    = ck::remove_cvref_t<typename FmhaPipeline::QDataType>;
-    using KDataType    = ck::remove_cvref_t<typename FmhaPipeline::KDataType>;
-    using VDataType    = ck::remove_cvref_t<typename FmhaPipeline::VDataType>;
-    using BiasDataType = ck::remove_cvref_t<typename FmhaPipeline::BiasDataType>;
-    using LSEDataType  = ck::remove_cvref_t<typename FmhaPipeline::LSEDataType>;
-    using ODataType    = ck::remove_cvref_t<typename FmhaPipeline::ODataType>;
-
-    using VLayout = ck::remove_cvref_t<typename FmhaPipeline::VLayout>;
-
-    static constexpr bool kIsGroupMode = FmhaPipeline::kIsGroupMode;
-    static constexpr bool kPadSeqLenQ  = FmhaPipeline::kPadSeqLenQ;
-    static constexpr bool kPadSeqLenK  = FmhaPipeline::kPadSeqLenK;
-    static constexpr bool kPadHeadDimQ = FmhaPipeline::kPadHeadDimQ;
-    static constexpr bool kPadHeadDimV = FmhaPipeline::kPadHeadDimV;
-    static constexpr bool kHasBias     = FmhaPipeline::kHasBias;
-    static constexpr bool kStoreLSE    = FmhaPipeline::kStoreLSE;
-    using FmhaMask                     = ck::remove_cvref_t<typename FmhaPipeline::FmhaMask>;
-    static constexpr bool kHasMask     = FmhaMask::IsMasking;
-
-    template <ck::index_t I> // to avoid duplicated base class prblem, introduce an template arg
-    struct FmhaFwdEmptyKargs
-    {
-    };
-
-    // kargs use aggregate initializer, so no constructor will provided
-    // use inheritance to minimize karg size
-    // user need to use MakeKargs() function to create kargs.
-    struct FmhaFwdCommonKargs
-    {
-        const void* q_ptr;
-        const void* k_ptr;
-        const void* v_ptr;
-        void* o_ptr;
-
-        ck::index_t seqlen_q;
-        ck::index_t seqlen_k;
-        ck::index_t hdim_q;
-        ck::index_t hdim_v;
-
-        // for MQA/GQA, nhead could be different. This parameter is nhead_q / nhead_k
-        // if this param is larger than 1, indicate MQA/GQA case
-        ck::index_t nhead_ratio_qk;
-        float scale;
-
-        ck::index_t stride_q;
-        ck::index_t stride_k;
-        ck::index_t stride_v;
-        ck::index_t stride_o;
-
-        ck::index_t nhead_stride_q;
-        ck::index_t nhead_stride_k;
-        ck::index_t nhead_stride_v;
-        ck::index_t nhead_stride_o;
-    };
-
-    struct FmhaFwdCommonBiasKargs
-    {
-        const void* bias_ptr          = nullptr;
-        ck::index_t stride_bias       = 0;
-        ck::index_t nhead_stride_bias = 0;
-    };
-
-    struct FmhaFwdBatchModeBiasKargs : FmhaFwdCommonBiasKargs
-    {
-        ck::index_t batch_stride_bias = 0;
-    };
-
-    struct FmhaFwdMaskKargs
-    {
-        CausalMaskType mask_type;
-        ck::index_t window_size;
-    };
-
-    struct FmhaFwdCommonLSEKargs
-    {
-        void* lse_ptr                = nullptr;
-        ck::index_t nhead_stride_lse = 0;
-    };
-
-    struct FmhaFwdBatchModeLSEKargs : FmhaFwdCommonLSEKargs
-    {
-        ck::index_t batch_stride_lse = 0;
-    };
-
-    struct FmhaFwdBatchModeKargs
-        : FmhaFwdCommonKargs,
-          std::conditional_t<kHasBias, FmhaFwdBatchModeBiasKargs, FmhaFwdEmptyKargs<0>>,
-          std::conditional_t<kHasMask, FmhaFwdMaskKargs, FmhaFwdEmptyKargs<1>>,
-          std::conditional_t<kStoreLSE, FmhaFwdBatchModeLSEKargs, FmhaFwdEmptyKargs<2>>
-    {
-        ck::index_t batch_stride_q;
-        ck::index_t batch_stride_k;
-        ck::index_t batch_stride_v;
-        ck::index_t batch_stride_o;
-    };
-
-    struct FmhaFwdGroupModeKargs
-        : FmhaFwdCommonKargs,
-          std::conditional_t<kHasBias, FmhaFwdCommonBiasKargs, FmhaFwdEmptyKargs<0>>,
-          std::conditional_t<kHasMask, FmhaFwdMaskKargs, FmhaFwdEmptyKargs<1>>,
-          std::conditional_t<kStoreLSE, FmhaFwdCommonLSEKargs, FmhaFwdEmptyKargs<2>>
-    {
-        const int32_t* seqstart_q_ptr;
-        const int32_t* seqstart_k_ptr;
-        const int32_t* seqlen_k_ptr;
-    };
-
-    using Kargs = std::conditional_t<kIsGroupMode, FmhaFwdGroupModeKargs, FmhaFwdBatchModeKargs>;
-
-    template <bool Cond = !kIsGroupMode>
-    __host__ static constexpr std::enable_if_t<Cond, Kargs> MakeKargs(const void* q_ptr,
-                                                                      const void* k_ptr,
-                                                                      const void* v_ptr,
-                                                                      const void* bias_ptr,
-                                                                      void* lse_ptr,
-                                                                      void* o_ptr,
-                                                                      ck::index_t seqlen_q,
-                                                                      ck::index_t seqlen_k,
-                                                                      ck::index_t hdim_q,
-                                                                      ck::index_t hdim_v,
-                                                                      ck::index_t nhead_ratio_qk,
-                                                                      float scale,
-                                                                      ck::index_t stride_q,
-                                                                      ck::index_t stride_k,
-                                                                      ck::index_t stride_v,
-                                                                      ck::index_t stride_bias,
-                                                                      ck::index_t stride_o,
-                                                                      ck::index_t nhead_stride_q,
-                                                                      ck::index_t nhead_stride_k,
-                                                                      ck::index_t nhead_stride_v,
-                                                                      ck::index_t nhead_stride_bias,
-                                                                      ck::index_t nhead_stride_lse,
-                                                                      ck::index_t nhead_stride_o,
-                                                                      ck::index_t batch_stride_q,
-                                                                      ck::index_t batch_stride_k,
-                                                                      ck::index_t batch_stride_v,
-                                                                      ck::index_t batch_stride_bias,
-                                                                      ck::index_t batch_stride_lse,
-                                                                      ck::index_t batch_stride_o,
-                                                                      CausalMaskType mask_type,
-                                                                      ck::index_t window_size)
-    {
-        Kargs kargs{{q_ptr,
-                     k_ptr,
-                     v_ptr,
-                     o_ptr,
-                     seqlen_q,
-                     seqlen_k,
-                     hdim_q,
-                     hdim_v,
-                     nhead_ratio_qk,
+template <
+    typename TilePartitioner_,
+    typename FmhaPipeline_,
+    typename EpiloguePipeline_>
+struct FmhaFwdKernel {
+  using TilePartitioner = ck::remove_cvref_t<TilePartitioner_>;
+  using FmhaPipeline = ck::remove_cvref_t<FmhaPipeline_>;
+  using EpiloguePipeline = ck::remove_cvref_t<EpiloguePipeline_>;
+  static constexpr ck::index_t kBlockSize = FmhaPipeline::kBlockSize;
+  static constexpr ck::index_t kBlockPerCu = FmhaPipeline::kBlockPerCu;
+
+  using QDataType = ck::remove_cvref_t<typename FmhaPipeline::QDataType>;
+  using KDataType = ck::remove_cvref_t<typename FmhaPipeline::KDataType>;
+  using VDataType = ck::remove_cvref_t<typename FmhaPipeline::VDataType>;
+  using BiasDataType = ck::remove_cvref_t<typename FmhaPipeline::BiasDataType>;
+  using LSEDataType = ck::remove_cvref_t<typename FmhaPipeline::LSEDataType>;
+  using ODataType = ck::remove_cvref_t<typename FmhaPipeline::ODataType>;
+
+  using VLayout = ck::remove_cvref_t<typename FmhaPipeline::VLayout>;
+
+  static constexpr bool kIsGroupMode = FmhaPipeline::kIsGroupMode;
+  static constexpr bool kPadSeqLenQ = FmhaPipeline::kPadSeqLenQ;
+  static constexpr bool kPadSeqLenK = FmhaPipeline::kPadSeqLenK;
+  static constexpr bool kPadHeadDimQ = FmhaPipeline::kPadHeadDimQ;
+  static constexpr bool kPadHeadDimV = FmhaPipeline::kPadHeadDimV;
+  static constexpr bool kHasBias = FmhaPipeline::kHasBias;
+  static constexpr bool kStoreLSE = FmhaPipeline::kStoreLSE;
+  using FmhaMask = ck::remove_cvref_t<typename FmhaPipeline::FmhaMask>;
+  static constexpr bool kHasMask = FmhaMask::IsMasking;
+
+  template <ck::index_t I> // to avoid duplicated base class prblem, introduce
+                           // an template arg
+                           struct FmhaFwdEmptyKargs {};
+
+  // kargs use aggregate initializer, so no constructor will provided
+  // use inheritance to minimize karg size
+  // user need to use MakeKargs() function to create kargs.
+  struct FmhaFwdCommonKargs {
+    const void* q_ptr;
+    const void* k_ptr;
+    const void* v_ptr;
+    void* o_ptr;
+
+    ck::index_t seqlen_q;
+    ck::index_t seqlen_k;
+    ck::index_t hdim_q;
+    ck::index_t hdim_v;
+
+    // for MQA/GQA, nhead could be different. This parameter is nhead_q /
+    // nhead_k if this param is larger than 1, indicate MQA/GQA case
+    ck::index_t nhead_ratio_qk;
+    float scale;
+
+    ck::index_t stride_q;
+    ck::index_t stride_k;
+    ck::index_t stride_v;
+    ck::index_t stride_o;
+
+    ck::index_t nhead_stride_q;
+    ck::index_t nhead_stride_k;
+    ck::index_t nhead_stride_v;
+    ck::index_t nhead_stride_o;
+  };
+
+  struct FmhaFwdCommonBiasKargs {
+    const void* bias_ptr = nullptr;
+    ck::index_t stride_bias = 0;
+    ck::index_t nhead_stride_bias = 0;
+  };
+
+  struct FmhaFwdBatchModeBiasKargs : FmhaFwdCommonBiasKargs {
+    ck::index_t batch_stride_bias = 0;
+  };
+
+  struct FmhaFwdMaskKargs {
+    CausalMaskType mask_type;
+    ck::index_t window_size;
+  };
+
+  struct FmhaFwdCommonLSEKargs {
+    void* lse_ptr = nullptr;
+    ck::index_t nhead_stride_lse = 0;
+  };
+
+  struct FmhaFwdBatchModeLSEKargs : FmhaFwdCommonLSEKargs {
+    ck::index_t batch_stride_lse = 0;
+  };
+
+  struct FmhaFwdBatchModeKargs
+      : FmhaFwdCommonKargs,
+        std::conditional_t<
+            kHasBias,
+            FmhaFwdBatchModeBiasKargs,
+            FmhaFwdEmptyKargs<0>>,
+        std::conditional_t<kHasMask, FmhaFwdMaskKargs, FmhaFwdEmptyKargs<1>>,
+        std::conditional_t<
+            kStoreLSE,
+            FmhaFwdBatchModeLSEKargs,
+            FmhaFwdEmptyKargs<2>> {
+    ck::index_t batch_stride_q;
+    ck::index_t batch_stride_k;
+    ck::index_t batch_stride_v;
+    ck::index_t batch_stride_o;
+  };
+
+  struct FmhaFwdGroupModeKargs
+      : FmhaFwdCommonKargs,
+        std::conditional_t<
+            kHasBias,
+            FmhaFwdCommonBiasKargs,
+            FmhaFwdEmptyKargs<0>>,
+        std::conditional_t<kHasMask, FmhaFwdMaskKargs, FmhaFwdEmptyKargs<1>>,
+        std::conditional_t<
+            kStoreLSE,
+            FmhaFwdCommonLSEKargs,
+            FmhaFwdEmptyKargs<2>> {
+    const int32_t* seqstart_q_ptr;
+    const int32_t* seqstart_k_ptr;
+    const int32_t* seqlen_k_ptr;
+  };
+
+  using Kargs = std::
+      conditional_t<kIsGroupMode, FmhaFwdGroupModeKargs, FmhaFwdBatchModeKargs>;
+
+  template <bool Cond = !kIsGroupMode>
+  __host__ static constexpr std::enable_if_t<Cond, Kargs> MakeKargs(
+      const void* q_ptr,
+      const void* k_ptr,
+      const void* v_ptr,
+      const void* bias_ptr,
+      void* lse_ptr,
+      void* o_ptr,
+      ck::index_t seqlen_q,
+      ck::index_t seqlen_k,
+      ck::index_t hdim_q,
+      ck::index_t hdim_v,
+      ck::index_t nhead_ratio_qk,
+      float scale,
+      ck::index_t stride_q,
+      ck::index_t stride_k,
+      ck::index_t stride_v,
+      ck::index_t stride_bias,
+      ck::index_t stride_o,
+      ck::index_t nhead_stride_q,
+      ck::index_t nhead_stride_k,
+      ck::index_t nhead_stride_v,
+      ck::index_t nhead_stride_bias,
+      ck::index_t nhead_stride_lse,
+      ck::index_t nhead_stride_o,
+      ck::index_t batch_stride_q,
+      ck::index_t batch_stride_k,
+      ck::index_t batch_stride_v,
+      ck::index_t batch_stride_bias,
+      ck::index_t batch_stride_lse,
+      ck::index_t batch_stride_o,
+      CausalMaskType mask_type,
+      ck::index_t window_size) {
+    Kargs kargs{
+        {q_ptr,
+         k_ptr,
+         v_ptr,
+         o_ptr,
+         seqlen_q,
+         seqlen_k,
+         hdim_q,
+         hdim_v,
+         nhead_ratio_qk,
 #if CK_FMHA_FWD_FAST_EXP2
-                     static_cast<float>(scale * ck::math::log2e_v<>),
+         static_cast<float>(scale * ck::math::log2e_v<>),
 #else
-                     scale,
+         scale,
 #endif
-                     stride_q,
-                     stride_k,
-                     stride_v,
-                     stride_o,
-                     nhead_stride_q,
-                     nhead_stride_k,
-                     nhead_stride_v,
-                     nhead_stride_o}, // args for common karg
-                    {},               // placeholder for bias
-                    {},               // placeholder for mask
-                    {},               // placeholder for lse
-                    batch_stride_q,
-                    batch_stride_k,
-                    batch_stride_v,
-                    batch_stride_o};
-
-        if constexpr(kHasBias)
-        {
-            kargs.bias_ptr          = bias_ptr;
-            kargs.stride_bias       = stride_bias;
-            kargs.nhead_stride_bias = nhead_stride_bias;
-            kargs.batch_stride_bias = batch_stride_bias;
-        }
-
-        if constexpr(kHasMask)
-        {
-            kargs.mask_type   = mask_type;
-            kargs.window_size = window_size;
-        }
-        if constexpr(kStoreLSE)
-        {
-            kargs.lse_ptr          = lse_ptr;
-            kargs.nhead_stride_lse = nhead_stride_lse;
-            kargs.batch_stride_lse = batch_stride_lse;
-        }
+         stride_q,
+         stride_k,
+         stride_v,
+         stride_o,
+         nhead_stride_q,
+         nhead_stride_k,
+         nhead_stride_v,
+         nhead_stride_o}, // args for common karg
+        {}, // placeholder for bias
+        {}, // placeholder for mask
+        {}, // placeholder for lse
+        batch_stride_q,
+        batch_stride_k,
+        batch_stride_v,
+        batch_stride_o};
+
+    if constexpr (kHasBias) {
+      kargs.bias_ptr = bias_ptr;
+      kargs.stride_bias = stride_bias;
+      kargs.nhead_stride_bias = nhead_stride_bias;
+      kargs.batch_stride_bias = batch_stride_bias;
+    }
 
-        return kargs;
+    if constexpr (kHasMask) {
+      kargs.mask_type = mask_type;
+      kargs.window_size = window_size;
+    }
+    if constexpr (kStoreLSE) {
+      kargs.lse_ptr = lse_ptr;
+      kargs.nhead_stride_lse = nhead_stride_lse;
+      kargs.batch_stride_lse = batch_stride_lse;
     }
 
-    template <bool Cond = kIsGroupMode>
-    __host__ static constexpr std::enable_if_t<Cond, Kargs> MakeKargs(const void* q_ptr,
-                                                                      const void* k_ptr,
-                                                                      const void* v_ptr,
-                                                                      const void* bias_ptr,
-                                                                      void* lse_ptr,
-                                                                      void* o_ptr,
-                                                                      const void* seqstart_q_ptr,
-                                                                      const void* seqstart_k_ptr,
-                                                                      const void* seqlen_k_ptr,
-                                                                      ck::index_t hdim_q,
-                                                                      ck::index_t hdim_v,
-                                                                      ck::index_t nhead_ratio_qk,
-                                                                      float scale,
-                                                                      ck::index_t stride_q,
-                                                                      ck::index_t stride_k,
-                                                                      ck::index_t stride_v,
-                                                                      ck::index_t stride_bias,
-                                                                      ck::index_t stride_o,
-                                                                      ck::index_t nhead_stride_q,
-                                                                      ck::index_t nhead_stride_k,
-                                                                      ck::index_t nhead_stride_v,
-                                                                      ck::index_t nhead_stride_bias,
-                                                                      ck::index_t nhead_stride_lse,
-                                                                      ck::index_t nhead_stride_o,
-                                                                      CausalMaskType mask_type,
-                                                                      ck::index_t window_size)
-    {
-        Kargs kargs{{q_ptr,
-                     k_ptr,
-                     v_ptr,
-                     o_ptr,
-                     -1, // seqlen will be updated by another pointer
-                     -1, //
-                     hdim_q,
-                     hdim_v,
-                     nhead_ratio_qk,
+    return kargs;
+  }
+
+  template <bool Cond = kIsGroupMode>
+  __host__ static constexpr std::enable_if_t<Cond, Kargs> MakeKargs(
+      const void* q_ptr,
+      const void* k_ptr,
+      const void* v_ptr,
+      const void* bias_ptr,
+      void* lse_ptr,
+      void* o_ptr,
+      const void* seqstart_q_ptr,
+      const void* seqstart_k_ptr,
+      const void* seqlen_k_ptr,
+      ck::index_t hdim_q,
+      ck::index_t hdim_v,
+      ck::index_t nhead_ratio_qk,
+      float scale,
+      ck::index_t stride_q,
+      ck::index_t stride_k,
+      ck::index_t stride_v,
+      ck::index_t stride_bias,
+      ck::index_t stride_o,
+      ck::index_t nhead_stride_q,
+      ck::index_t nhead_stride_k,
+      ck::index_t nhead_stride_v,
+      ck::index_t nhead_stride_bias,
+      ck::index_t nhead_stride_lse,
+      ck::index_t nhead_stride_o,
+      CausalMaskType mask_type,
+      ck::index_t window_size) {
+    Kargs kargs{
+        {q_ptr,
+         k_ptr,
+         v_ptr,
+         o_ptr,
+         -1, // seqlen will be updated by another pointer
+         -1, //
+         hdim_q,
+         hdim_v,
+         nhead_ratio_qk,
 #if CK_FMHA_FWD_FAST_EXP2
-                     static_cast<float>(scale * ck::math::log2e_v<>),
+         static_cast<float>(scale * ck::math::log2e_v<>),
 #else
-                     scale,
+         scale,
 #endif
-                     stride_q,
-                     stride_k,
-                     stride_v,
-                     stride_o,
-                     nhead_stride_q,
-                     nhead_stride_k,
-                     nhead_stride_v,
-                     nhead_stride_o}, // args for common karg
-                    {},               // placeholder for bias
-                    {},               // placeholder for mask
-                    {},               // placeholder for lse
-                    reinterpret_cast<const int32_t*>(seqstart_q_ptr),
-                    reinterpret_cast<const int32_t*>(seqstart_k_ptr),
-                    reinterpret_cast<const int32_t*>(seqlen_k_ptr)};
-
-        if constexpr(kHasBias)
-        {
-            kargs.bias_ptr          = bias_ptr;
-            kargs.stride_bias       = stride_bias;
-            kargs.nhead_stride_bias = nhead_stride_bias;
-        }
-        if constexpr(kHasMask)
-        {
-            kargs.mask_type   = mask_type;
-            kargs.window_size = window_size;
-        }
-        if constexpr(kStoreLSE)
-        {
-            kargs.lse_ptr          = lse_ptr;
-            kargs.nhead_stride_lse = nhead_stride_lse;
-        }
-
-        return kargs;
+         stride_q,
+         stride_k,
+         stride_v,
+         stride_o,
+         nhead_stride_q,
+         nhead_stride_k,
+         nhead_stride_v,
+         nhead_stride_o}, // args for common karg
+        {}, // placeholder for bias
+        {}, // placeholder for mask
+        {}, // placeholder for lse
+        reinterpret_cast<const int32_t*>(seqstart_q_ptr),
+        reinterpret_cast<const int32_t*>(seqstart_k_ptr),
+        reinterpret_cast<const int32_t*>(seqlen_k_ptr)};
+
+    if constexpr (kHasBias) {
+      kargs.bias_ptr = bias_ptr;
+      kargs.stride_bias = stride_bias;
+      kargs.nhead_stride_bias = nhead_stride_bias;
     }
-
-    __host__ static constexpr auto GridSize(ck::index_t batch_size_,
-                                            ck::index_t nhead_,
-                                            ck::index_t seqlen_q_,
-                                            ck::index_t hdim_v_)
-    {
-        return TilePartitioner::GridSize(batch_size_, nhead_, seqlen_q_, hdim_v_);
+    if constexpr (kHasMask) {
+      kargs.mask_type = mask_type;
+      kargs.window_size = window_size;
     }
-
-    __host__ static constexpr auto BlockSize() { return dim3(kBlockSize); }
-
-    __host__ __device__ static constexpr ck::index_t GetSmemSize()
-    {
-        return ck::math::max(FmhaPipeline::GetSmemSize(), EpiloguePipeline::GetSmemSize());
+    if constexpr (kStoreLSE) {
+      kargs.lse_ptr = lse_ptr;
+      kargs.nhead_stride_lse = nhead_stride_lse;
     }
 
-    __device__ void operator()(Kargs kargs) const
-    {
-        using namespace ck;
-        using namespace ck::tile_program;
-        using namespace ck::tile_program::block;
-
-        // allocate LDS
-        __shared__ char smem_ptr[GetSmemSize()];
-
-        // divide problem
-        const auto [i_tile_m, i_tile_n, i_nhead, i_batch] =
-            TilePartitioner{}(kargs.seqlen_q, kargs.hdim_v);
-
-        const index_t i_m0 = __builtin_amdgcn_readfirstlane(i_tile_m * FmhaPipeline::kM0);
-        const index_t i_n1 = __builtin_amdgcn_readfirstlane(i_tile_n * FmhaPipeline::kN1);
-
-        long_index_t batch_offset_q    = 0;
-        long_index_t batch_offset_k    = 0;
-        long_index_t batch_offset_v    = 0;
-        long_index_t batch_offset_bias = 0;
-        long_index_t batch_offset_lse  = 0;
-        long_index_t batch_offset_o    = 0;
-
-        if constexpr(kIsGroupMode)
-        {
-            // get starting offset for each batch
-            const long_index_t query_start = kargs.seqstart_q_ptr[i_batch];
-            const long_index_t key_start   = kargs.seqstart_k_ptr[i_batch];
-
-            batch_offset_q = query_start * kargs.stride_q;
-            batch_offset_k = key_start * kargs.stride_k;
-            if constexpr(ck::is_same_v<VLayout, ck::tensor_layout::gemm::RowMajor>)
-            {
-                batch_offset_v = key_start * kargs.stride_v;
-            }
-            else
-            {
-                batch_offset_v = key_start;
-            }
-            if constexpr(kHasBias)
-            {
-                batch_offset_bias = query_start * kargs.stride_bias + key_start;
-            }
-            else
-            {
-                batch_offset_bias = key_start;
-            }
-            if constexpr(kStoreLSE)
-            {
-                batch_offset_lse = query_start;
-            }
-            batch_offset_o = query_start * kargs.stride_o;
-
-            // get real # queries & # keys under group mode
-            const auto adjusted_seqstart_q_ptr = kargs.seqstart_q_ptr + i_batch;
-            kargs.seqlen_q = adjusted_seqstart_q_ptr[1] - adjusted_seqstart_q_ptr[0];
-
-            // # of required blocks is different in each groups, terminate unnecessary blocks
-            // earlier
-            if(kargs.seqlen_q <= i_m0)
-            {
-                return;
-            }
-
-            if(kargs.seqlen_k_ptr != nullptr)
-            {
-                kargs.seqlen_k = kargs.seqlen_k_ptr[i_batch];
-            }
-            else
-            {
-                const auto adjusted_seqstart_k_ptr = kargs.seqstart_k_ptr + i_batch;
-                kargs.seqlen_k = adjusted_seqstart_k_ptr[1] - adjusted_seqstart_k_ptr[0];
-            }
-        }
-        else
-        {
-            batch_offset_q = static_cast<long_index_t>(i_batch) * kargs.batch_stride_q;
-            batch_offset_k = static_cast<long_index_t>(i_batch) * kargs.batch_stride_k;
-            batch_offset_v = static_cast<long_index_t>(i_batch) * kargs.batch_stride_v;
-            if constexpr(kHasBias)
-            {
-                batch_offset_bias = static_cast<long_index_t>(i_batch) * kargs.batch_stride_bias;
-            }
-            if constexpr(kStoreLSE)
-            {
-                batch_offset_lse = static_cast<long_index_t>(i_batch) * kargs.batch_stride_lse;
-            }
-            batch_offset_o = static_cast<long_index_t>(i_batch) * kargs.batch_stride_o;
-        }
+    return kargs;
+  }
+
+  __host__ static constexpr auto GridSize(
+      ck::index_t batch_size_,
+      ck::index_t nhead_,
+      ck::index_t seqlen_q_,
+      ck::index_t hdim_v_) {
+    return TilePartitioner::GridSize(batch_size_, nhead_, seqlen_q_, hdim_v_);
+  }
+
+  __host__ static constexpr auto BlockSize() {
+    return dim3(kBlockSize);
+  }
+
+  __host__ __device__ static constexpr ck::index_t GetSmemSize() {
+    return ck::math::max(
+        FmhaPipeline::GetSmemSize(), EpiloguePipeline::GetSmemSize());
+  }
+
+  __device__ void operator()(Kargs kargs) const {
+    using namespace ck;
+    using namespace ck::tile_program;
+    using namespace ck::tile_program::block;
+
+    // allocate LDS
+    __shared__ char smem_ptr[GetSmemSize()];
+
+    // divide problem
+    const auto [i_tile_m, i_tile_n, i_nhead, i_batch] =
+        TilePartitioner{}(kargs.seqlen_q, kargs.hdim_v);
+
+    const index_t i_m0 =
+        __builtin_amdgcn_readfirstlane(i_tile_m * FmhaPipeline::kM0);
+    const index_t i_n1 =
+        __builtin_amdgcn_readfirstlane(i_tile_n * FmhaPipeline::kN1);
+
+    long_index_t batch_offset_q = 0;
+    long_index_t batch_offset_k = 0;
+    long_index_t batch_offset_v = 0;
+    long_index_t batch_offset_bias = 0;
+    long_index_t batch_offset_lse = 0;
+    long_index_t batch_offset_o = 0;
+
+    if constexpr (kIsGroupMode) {
+      // get starting offset for each batch
+      const long_index_t query_start = kargs.seqstart_q_ptr[i_batch];
+      const long_index_t key_start = kargs.seqstart_k_ptr[i_batch];
+
+      batch_offset_q = query_start * kargs.stride_q;
+      batch_offset_k = key_start * kargs.stride_k;
+      if constexpr (ck::is_same_v<VLayout, ck::tensor_layout::gemm::RowMajor>) {
+        batch_offset_v = key_start * kargs.stride_v;
+      } else {
+        batch_offset_v = key_start;
+      }
+      if constexpr (kHasBias) {
+        batch_offset_bias = query_start * kargs.stride_bias + key_start;
+      } else {
+        batch_offset_bias = key_start;
+      }
+      if constexpr (kStoreLSE) {
+        batch_offset_lse = query_start;
+      }
+      batch_offset_o = query_start * kargs.stride_o;
+
+      // get real # queries & # keys under group mode
+      const auto adjusted_seqstart_q_ptr = kargs.seqstart_q_ptr + i_batch;
+      kargs.seqlen_q = adjusted_seqstart_q_ptr[1] - adjusted_seqstart_q_ptr[0];
+
+      // # of required blocks is different in each groups, terminate unnecessary
+      // blocks earlier
+      if (kargs.seqlen_q <= i_m0) {
+        return;
+      }
+
+      if (kargs.seqlen_k_ptr != nullptr) {
+        kargs.seqlen_k = kargs.seqlen_k_ptr[i_batch];
+      } else {
+        const auto adjusted_seqstart_k_ptr = kargs.seqstart_k_ptr + i_batch;
+        kargs.seqlen_k =
+            adjusted_seqstart_k_ptr[1] - adjusted_seqstart_k_ptr[0];
+      }
+    } else {
+      batch_offset_q =
+          static_cast<long_index_t>(i_batch) * kargs.batch_stride_q;
+      batch_offset_k =
+          static_cast<long_index_t>(i_batch) * kargs.batch_stride_k;
+      batch_offset_v =
+          static_cast<long_index_t>(i_batch) * kargs.batch_stride_v;
+      if constexpr (kHasBias) {
+        batch_offset_bias =
+            static_cast<long_index_t>(i_batch) * kargs.batch_stride_bias;
+      }
+      if constexpr (kStoreLSE) {
+        batch_offset_lse =
+            static_cast<long_index_t>(i_batch) * kargs.batch_stride_lse;
+      }
+      batch_offset_o =
+          static_cast<long_index_t>(i_batch) * kargs.batch_stride_o;
+    }
 
-        // for simplicity, batch stride we just modify the pointer
-        const QDataType* q_ptr = reinterpret_cast<const QDataType*>(kargs.q_ptr) +
-                                 static_cast<long_index_t>(i_nhead) * kargs.nhead_stride_q +
-                                 batch_offset_q;
-        const KDataType* k_ptr =
-            reinterpret_cast<const KDataType*>(kargs.k_ptr) +
-            static_cast<long_index_t>(i_nhead / kargs.nhead_ratio_qk) * kargs.nhead_stride_k +
-            batch_offset_k;
-        const VDataType* v_ptr =
-            reinterpret_cast<const VDataType*>(kargs.v_ptr) +
-            static_cast<long_index_t>(i_nhead / kargs.nhead_ratio_qk) * kargs.nhead_stride_v +
-            batch_offset_v;
-        ODataType* o_ptr = reinterpret_cast<ODataType*>(kargs.o_ptr) +
-                           static_cast<long_index_t>(i_nhead) * kargs.nhead_stride_o +
-                           batch_offset_o;
-
-        // Q/K/V DRAM and DRAM window
-        const auto q_dram = [&]() {
-            const auto q_dram_naive = make_naive_tensor_view<AddressSpaceEnum::Global>(
-                q_ptr,
-                make_tuple(kargs.seqlen_q, kargs.hdim_q),
-                make_tuple(kargs.stride_q, 1),
+    // for simplicity, batch stride we just modify the pointer
+    const QDataType* q_ptr = reinterpret_cast<const QDataType*>(kargs.q_ptr) +
+        static_cast<long_index_t>(i_nhead) * kargs.nhead_stride_q +
+        batch_offset_q;
+    const KDataType* k_ptr = reinterpret_cast<const KDataType*>(kargs.k_ptr) +
+        static_cast<long_index_t>(i_nhead / kargs.nhead_ratio_qk) *
+            kargs.nhead_stride_k +
+        batch_offset_k;
+    const VDataType* v_ptr = reinterpret_cast<const VDataType*>(kargs.v_ptr) +
+        static_cast<long_index_t>(i_nhead / kargs.nhead_ratio_qk) *
+            kargs.nhead_stride_v +
+        batch_offset_v;
+    ODataType* o_ptr = reinterpret_cast<ODataType*>(kargs.o_ptr) +
+        static_cast<long_index_t>(i_nhead) * kargs.nhead_stride_o +
+        batch_offset_o;
+
+    // Q/K/V DRAM and DRAM window
+    const auto q_dram = [&]() {
+      const auto q_dram_naive =
+          make_naive_tensor_view<AddressSpaceEnum::Global>(
+              q_ptr,
+              make_tuple(kargs.seqlen_q, kargs.hdim_q),
+              make_tuple(kargs.stride_q, 1),
+              Number<32>{},
+              Number<1>{});
+      if constexpr (FmhaPipeline::kQLoadOnce) {
+        return pad_tensor_view(
+            q_dram_naive,
+            make_tuple(
+                Number<FmhaPipeline::kM0>{},
+                Number<FmhaPipeline::kK0BlockLength>{}),
+            Sequence<kPadSeqLenQ, kPadHeadDimQ>{});
+      } else {
+        return pad_tensor_view(
+            q_dram_naive,
+            make_tuple(
+                Number<FmhaPipeline::kM0>{}, Number<FmhaPipeline::kK0>{}),
+            Sequence<kPadSeqLenQ, kPadHeadDimQ>{});
+      }
+    }();
+    const auto k_dram = [&]() {
+      const auto k_dram_naive =
+          make_naive_tensor_view<AddressSpaceEnum::Global>(
+              k_ptr,
+              make_tuple(kargs.seqlen_k, kargs.hdim_q),
+              make_tuple(kargs.stride_k, 1),
+              Number<32>{},
+              Number<1>{});
+
+      return pad_tensor_view(
+          k_dram_naive,
+          make_tuple(Number<FmhaPipeline::kN0>{}, Number<FmhaPipeline::kK0>{}),
+          Sequence<kPadSeqLenK, kPadHeadDimQ>{});
+    }();
+    const auto v_dram = [&]() {
+      if constexpr (ck::is_same_v<VLayout, ck::tensor_layout::gemm::RowMajor>) {
+        const auto v_dram_naive =
+            make_naive_tensor_view<AddressSpaceEnum::Global>(
+                v_ptr,
+                make_tuple(kargs.seqlen_k, kargs.hdim_v),
+                make_tuple(kargs.stride_v, 1),
                 Number<32>{},
                 Number<1>{});
-            if constexpr(FmhaPipeline::kQLoadOnce)
-            {
-                return pad_tensor_view(
-                    q_dram_naive,
-                    make_tuple(Number<FmhaPipeline::kM0>{}, Number<FmhaPipeline::kK0BlockLength>{}),
-                    Sequence<kPadSeqLenQ, kPadHeadDimQ>{});
-            }
-            else
-            {
-                return pad_tensor_view(
-                    q_dram_naive,
-                    make_tuple(Number<FmhaPipeline::kM0>{}, Number<FmhaPipeline::kK0>{}),
-                    Sequence<kPadSeqLenQ, kPadHeadDimQ>{});
-            }
-        }();
-        const auto k_dram = [&]() {
-            const auto k_dram_naive = make_naive_tensor_view<AddressSpaceEnum::Global>(
-                k_ptr,
-                make_tuple(kargs.seqlen_k, kargs.hdim_q),
-                make_tuple(kargs.stride_k, 1),
+
+        const auto v_dram_transposed = transform_tensor_view(
+            v_dram_naive,
+            make_tuple(
+                make_pass_through_transform(kargs.seqlen_k),
+                make_pass_through_transform(kargs.hdim_v)),
+            make_tuple(Sequence<0>{}, Sequence<1>{}),
+            make_tuple(Sequence<1>{}, Sequence<0>{}));
+
+        return pad_tensor_view(
+            v_dram_transposed,
+            make_tuple(
+                Number<FmhaPipeline::kN1>{}, Number<FmhaPipeline::kK1>{}),
+            Sequence<kPadHeadDimV, kPadSeqLenK>{});
+      } else {
+        const auto v_dram_naive =
+            make_naive_tensor_view<AddressSpaceEnum::Global>(
+                v_ptr,
+                make_tuple(kargs.hdim_v, kargs.seqlen_k),
+                make_tuple(kargs.stride_v, 1),
                 Number<32>{},
                 Number<1>{});
 
-            return pad_tensor_view(
-                k_dram_naive,
-                make_tuple(Number<FmhaPipeline::kN0>{}, Number<FmhaPipeline::kK0>{}),
-                Sequence<kPadSeqLenK, kPadHeadDimQ>{});
-        }();
-        const auto v_dram = [&]() {
-            if constexpr(ck::is_same_v<VLayout, ck::tensor_layout::gemm::RowMajor>)
-            {
-                const auto v_dram_naive = make_naive_tensor_view<AddressSpaceEnum::Global>(
-                    v_ptr,
-                    make_tuple(kargs.seqlen_k, kargs.hdim_v),
-                    make_tuple(kargs.stride_v, 1),
-                    Number<32>{},
-                    Number<1>{});
-
-                const auto v_dram_transposed =
-                    transform_tensor_view(v_dram_naive,
-                                          make_tuple(make_pass_through_transform(kargs.seqlen_k),
-                                                     make_pass_through_transform(kargs.hdim_v)),
-                                          make_tuple(Sequence<0>{}, Sequence<1>{}),
-                                          make_tuple(Sequence<1>{}, Sequence<0>{}));
-
-                return pad_tensor_view(
-                    v_dram_transposed,
-                    make_tuple(Number<FmhaPipeline::kN1>{}, Number<FmhaPipeline::kK1>{}),
-                    Sequence<kPadHeadDimV, kPadSeqLenK>{});
-            }
-            else
-            {
-                const auto v_dram_naive = make_naive_tensor_view<AddressSpaceEnum::Global>(
-                    v_ptr,
-                    make_tuple(kargs.hdim_v, kargs.seqlen_k),
-                    make_tuple(kargs.stride_v, 1),
-                    Number<32>{},
-                    Number<1>{});
-
-                return pad_tensor_view(
-                    v_dram_naive,
-                    make_tuple(Number<FmhaPipeline::kN1>{}, Number<FmhaPipeline::kK1>{}),
-                    Sequence<kPadHeadDimV, kPadSeqLenK>{});
-            }
-        }();
-
-        auto q_dram_window = make_tile_window(
-            q_dram,
-            [&]() {
-                if constexpr(FmhaPipeline::kQLoadOnce)
-                    return make_tuple(Number<FmhaPipeline::kM0>{},
-                                      Number<FmhaPipeline::kK0BlockLength>{});
-                else
-                    return make_tuple(Number<FmhaPipeline::kM0>{}, Number<FmhaPipeline::kK0>{});
-            }(),
-            {i_m0, 0});
-
-        auto k_dram_window = make_tile_window(
-            k_dram, make_tuple(Number<FmhaPipeline::kN0>{}, Number<FmhaPipeline::kK0>{}), {0, 0});
-
-        auto v_dram_window =
-            make_tile_window(v_dram,
-                             make_tuple(Number<FmhaPipeline::kN1>{}, Number<FmhaPipeline::kK1>{}),
-                             {i_n1, 0});
-        /// FIXME: Before C++20, capturing structured binding variables is not supported. Remove
-        /// following copy capture of the 'i_nhead'
-        ///        if compiled in C++20
-        const auto bias_dram_window = [&, i_nhead_ = i_nhead]() {
-            constexpr auto bias_dram_window_lengths =
-                make_tuple(Number<FmhaPipeline::kM0>{}, Number<FmhaPipeline::kN0>{});
-            if constexpr(kHasBias)
-            {
-                const BiasDataType* bias_ptr =
-                    reinterpret_cast<const BiasDataType*>(kargs.bias_ptr) +
-                    static_cast<long_index_t>(i_nhead_) * kargs.nhead_stride_bias +
-                    batch_offset_bias;
-
-                const auto bias_dram = [&]() {
-                    const auto bias_dram_naive = make_naive_tensor_view<AddressSpaceEnum::Global>(
-                        bias_ptr,
-                        make_tuple(kargs.seqlen_q, kargs.seqlen_k),
-                        make_tuple(kargs.stride_bias, 1),
-                        Number<32>{},
-                        Number<1>{});
-
-                    return pad_tensor_view(bias_dram_naive,
-                                           bias_dram_window_lengths,
-                                           Sequence<kPadSeqLenQ, kPadSeqLenK>{});
-                }();
-
-                return make_tile_window(bias_dram, bias_dram_window_lengths, {i_m0, 0});
-            }
-            else
-            {
-                return make_null_tile_window(bias_dram_window_lengths);
-            }
+        return pad_tensor_view(
+            v_dram_naive,
+            make_tuple(
+                Number<FmhaPipeline::kN1>{}, Number<FmhaPipeline::kK1>{}),
+            Sequence<kPadHeadDimV, kPadSeqLenK>{});
+      }
+    }();
+
+    auto q_dram_window = make_tile_window(
+        q_dram,
+        [&]() {
+          if constexpr (FmhaPipeline::kQLoadOnce)
+            return make_tuple(
+                Number<FmhaPipeline::kM0>{},
+                Number<FmhaPipeline::kK0BlockLength>{});
+          else
+            return make_tuple(
+                Number<FmhaPipeline::kM0>{}, Number<FmhaPipeline::kK0>{});
+        }(),
+        {i_m0, 0});
+
+    auto k_dram_window = make_tile_window(
+        k_dram,
+        make_tuple(Number<FmhaPipeline::kN0>{}, Number<FmhaPipeline::kK0>{}),
+        {0, 0});
+
+    auto v_dram_window = make_tile_window(
+        v_dram,
+        make_tuple(Number<FmhaPipeline::kN1>{}, Number<FmhaPipeline::kK1>{}),
+        {i_n1, 0});
+    /// FIXME: Before C++20, capturing structured binding variables is not
+    /// supported. Remove following copy capture of the 'i_nhead'
+    ///        if compiled in C++20
+    const auto bias_dram_window = [&, i_nhead_ = i_nhead]() {
+      constexpr auto bias_dram_window_lengths =
+          make_tuple(Number<FmhaPipeline::kM0>{}, Number<FmhaPipeline::kN0>{});
+      if constexpr (kHasBias) {
+        const BiasDataType* bias_ptr =
+            reinterpret_cast<const BiasDataType*>(kargs.bias_ptr) +
+            static_cast<long_index_t>(i_nhead_) * kargs.nhead_stride_bias +
+            batch_offset_bias;
+
+        const auto bias_dram = [&]() {
+          const auto bias_dram_naive =
+              make_naive_tensor_view<AddressSpaceEnum::Global>(
+                  bias_ptr,
+                  make_tuple(kargs.seqlen_q, kargs.seqlen_k),
+                  make_tuple(kargs.stride_bias, 1),
+                  Number<32>{},
+                  Number<1>{});
+
+          return pad_tensor_view(
+              bias_dram_naive,
+              bias_dram_window_lengths,
+              Sequence<kPadSeqLenQ, kPadSeqLenK>{});
         }();
 
-        // lse
-        auto lse_dram_window = [&, i_nhead_ = i_nhead]() {
-            constexpr auto lse_dram_window_lengths = make_tuple(Number<FmhaPipeline::kM0>{});
-            if constexpr(kStoreLSE)
-            {
-                LSEDataType* lse_ptr =
-                    reinterpret_cast<LSEDataType*>(kargs.lse_ptr) +
-                    static_cast<long_index_t>(i_nhead_) * kargs.nhead_stride_lse + batch_offset_lse;
-
-                const auto lse_dram = [&]() {
-                    const auto lse_dram_naive =
-                        make_naive_tensor_view<AddressSpaceEnum::Global>(lse_ptr,
-                                                                         make_tuple(kargs.seqlen_q),
-                                                                         make_tuple(1),
-                                                                         Number<1>{},
-                                                                         Number<1>{});
-
-                    return pad_tensor_view(
-                        lse_dram_naive, lse_dram_window_lengths, Sequence<kPadSeqLenQ>{});
-                }();
-
-                return make_tile_window(lse_dram, lse_dram_window_lengths, {i_m0});
-            }
-            else
-            {
-                return make_null_tile_window(lse_dram_window_lengths);
-            }
+        return make_tile_window(bias_dram, bias_dram_window_lengths, {i_m0, 0});
+      } else {
+        return make_null_tile_window(bias_dram_window_lengths);
+      }
+    }();
+
+    // lse
+    auto lse_dram_window = [&, i_nhead_ = i_nhead]() {
+      constexpr auto lse_dram_window_lengths =
+          make_tuple(Number<FmhaPipeline::kM0>{});
+      if constexpr (kStoreLSE) {
+        LSEDataType* lse_ptr = reinterpret_cast<LSEDataType*>(kargs.lse_ptr) +
+            static_cast<long_index_t>(i_nhead_) * kargs.nhead_stride_lse +
+            batch_offset_lse;
+
+        const auto lse_dram = [&]() {
+          const auto lse_dram_naive =
+              make_naive_tensor_view<AddressSpaceEnum::Global>(
+                  lse_ptr,
+                  make_tuple(kargs.seqlen_q),
+                  make_tuple(1),
+                  Number<1>{},
+                  Number<1>{});
+
+          return pad_tensor_view(
+              lse_dram_naive, lse_dram_window_lengths, Sequence<kPadSeqLenQ>{});
         }();
 
-        FmhaMask mask = [&]() {
-            if constexpr(kHasMask)
-            {
-                auto res =
-                    ck::make_tuple(ck::index_t{0}, ck::index_t{0}, ck::index_t{0}, ck::index_t{0});
-
-                if(kargs.window_size > 0)
-                {
-                    if(kargs.mask_type == CausalMaskType::MaskDisabled)
-                    {
-                        ck::index_t left_size  = kargs.window_size / 2;
-                        ck::index_t right_size = kargs.window_size - 1 - left_size;
-
-                        res = ck::make_generic_attention_mask_coordinates_from_lr_window(
-                            left_size, right_size, kargs.seqlen_q, kargs.seqlen_k);
-                    }
-                    else
-                    {
-                        bool is_topleft =
-                            (kargs.mask_type == CausalMaskType::MaskUpperTriangleFromTopLeft);
-
-                        res = ck::make_generic_attention_mask_coordinates_from_lr_window(
-                            kargs.window_size - 1, 0, kargs.seqlen_q, kargs.seqlen_k, is_topleft);
-                    }
-                }
-                else
-                {
-                    if(kargs.mask_type == CausalMaskType::MaskDisabled)
-                    {
-                        res = ck::make_generic_attention_mask_coordinates_from_lr_window(
-                            -1, -1, kargs.seqlen_q, kargs.seqlen_k);
-                    }
-                    else
-                    {
-                        bool is_topleft =
-                            (kargs.mask_type == CausalMaskType::MaskUpperTriangleFromTopLeft);
-
-                        res = ck::make_generic_attention_mask_coordinates_from_lr_window(
-                            -1, 0, kargs.seqlen_q, kargs.seqlen_k, is_topleft);
-                    }
-                }
-
-                auto y = res.At(ck::Number<0>{});
-                auto x = res.At(ck::Number<1>{});
-
-                return FmhaMask{y, x, kargs.seqlen_q, kargs.seqlen_k};
-            }
-            else
-                return FmhaMask{0, 0, kargs.seqlen_q, kargs.seqlen_k};
-        }();
-
-        auto o_acc_tile =
-            FmhaPipeline{}(q_dram_window,
-                           k_dram_window,
-                           v_dram_window,
-                           bias_dram_window,
-                           lse_dram_window,
-                           mask,
-                           kargs.scale,
-                           // ck::math::integer_divide_ceil(kargs.seqlen_k, FmhaPipeline::kN0),
-                           // ck::math::integer_divide_ceil(kargs.hdim_q, FmhaPipeline::kK0),
-                           smem_ptr);
-
-        // O DRAM and O DRAM window
-        auto o_dram = [&]() {
-            const auto o_dram_naive = make_naive_tensor_view<AddressSpaceEnum::Global>(
-                o_ptr,
-                make_tuple(kargs.seqlen_q, kargs.hdim_v),
-                make_tuple(kargs.stride_o, 1),
-                Number<32>{},
-                Number<1>{});
-
-            return pad_tensor_view(
-                o_dram_naive,
-                make_tuple(Number<FmhaPipeline::kM0>{}, Number<FmhaPipeline::kN1>{}),
-                Sequence<kPadSeqLenQ, kPadHeadDimV>{});
-        }();
-
-        auto o_dram_window =
-            make_tile_window(o_dram,
-                             make_tuple(Number<FmhaPipeline::kM0>{}, Number<FmhaPipeline::kN1>{}),
-                             {i_m0, i_n1});
+        return make_tile_window(lse_dram, lse_dram_window_lengths, {i_m0});
+      } else {
+        return make_null_tile_window(lse_dram_window_lengths);
+      }
+    }();
+
+    FmhaMask mask = [&]() {
+      if constexpr (kHasMask) {
+        auto res = ck::make_tuple(
+            ck::index_t{0}, ck::index_t{0}, ck::index_t{0}, ck::index_t{0});
+
+        if (kargs.window_size > 0) {
+          if (kargs.mask_type == CausalMaskType::MaskDisabled) {
+            ck::index_t left_size = kargs.window_size / 2;
+            ck::index_t right_size = kargs.window_size - 1 - left_size;
+
+            res = ck::make_generic_attention_mask_coordinates_from_lr_window(
+                left_size, right_size, kargs.seqlen_q, kargs.seqlen_k);
+          } else {
+            bool is_topleft =
+                (kargs.mask_type ==
+                 CausalMaskType::MaskUpperTriangleFromTopLeft);
+
+            res = ck::make_generic_attention_mask_coordinates_from_lr_window(
+                kargs.window_size - 1,
+                0,
+                kargs.seqlen_q,
+                kargs.seqlen_k,
+                is_topleft);
+          }
+        } else {
+          if (kargs.mask_type == CausalMaskType::MaskDisabled) {
+            res = ck::make_generic_attention_mask_coordinates_from_lr_window(
+                -1, -1, kargs.seqlen_q, kargs.seqlen_k);
+          } else {
+            bool is_topleft =
+                (kargs.mask_type ==
+                 CausalMaskType::MaskUpperTriangleFromTopLeft);
+
+            res = ck::make_generic_attention_mask_coordinates_from_lr_window(
+                -1, 0, kargs.seqlen_q, kargs.seqlen_k, is_topleft);
+          }
+        }
 
-        EpiloguePipeline{}(o_dram_window, o_acc_tile);
-    }
+        auto y = res.At(ck::Number<0>{});
+        auto x = res.At(ck::Number<1>{});
+
+        return FmhaMask{y, x, kargs.seqlen_q, kargs.seqlen_k};
+      } else
+        return FmhaMask{0, 0, kargs.seqlen_q, kargs.seqlen_k};
+    }();
+
+    auto o_acc_tile = FmhaPipeline{}(
+        q_dram_window,
+        k_dram_window,
+        v_dram_window,
+        bias_dram_window,
+        lse_dram_window,
+        mask,
+        kargs.scale,
+        // ck::math::integer_divide_ceil(kargs.seqlen_k, FmhaPipeline::kN0),
+        // ck::math::integer_divide_ceil(kargs.hdim_q, FmhaPipeline::kK0),
+        smem_ptr);
+
+    // O DRAM and O DRAM window
+    auto o_dram = [&]() {
+      const auto o_dram_naive =
+          make_naive_tensor_view<AddressSpaceEnum::Global>(
+              o_ptr,
+              make_tuple(kargs.seqlen_q, kargs.hdim_v),
+              make_tuple(kargs.stride_o, 1),
+              Number<32>{},
+              Number<1>{});
+
+      return pad_tensor_view(
+          o_dram_naive,
+          make_tuple(Number<FmhaPipeline::kM0>{}, Number<FmhaPipeline::kN1>{}),
+          Sequence<kPadSeqLenQ, kPadHeadDimV>{});
+    }();
+
+    auto o_dram_window = make_tile_window(
+        o_dram,
+        make_tuple(Number<FmhaPipeline::kM0>{}, Number<FmhaPipeline::kN1>{}),
+        {i_m0, i_n1});
+
+    EpiloguePipeline{}(o_dram_window, o_acc_tile);
+  }
 };
diff --git a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_fwd_epilogue.h b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_fwd_epilogue.h
index 72c1c4a9b..9dde0c97c 100644
--- a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_fwd_epilogue.h
+++ b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_fwd_epilogue.h
@@ -6,33 +6,35 @@
  */
 #pragma once
 
-#include "ck/utility/common_header.hpp"
 #include "ck/tile_program/tile/store_tile.hpp"
 #include "ck/tile_program/tile/tile_elementwise.hpp"
+#include "ck/utility/common_header.hpp"
 
 template <typename OaccDataType_, typename ODataType_>
-struct FmhaFwdEpilogueProblem
-{
-    using OaccDataType = ck::remove_cvref_t<OaccDataType_>;
-    using ODataType    = ck::remove_cvref_t<ODataType_>;
+struct FmhaFwdEpilogueProblem {
+  using OaccDataType = ck::remove_cvref_t<OaccDataType_>;
+  using ODataType = ck::remove_cvref_t<ODataType_>;
 };
 
 template <typename Problem_, typename Policy_ = void>
-struct FmhaFwdEpilogue
-{
-    using Problem      = ck::remove_cvref_t<Problem_>;
-    using OaccDataType = ck::remove_cvref_t<typename Problem::OaccDataType>;
-    using ODataType    = ck::remove_cvref_t<typename Problem::ODataType>;
+struct FmhaFwdEpilogue {
+  using Problem = ck::remove_cvref_t<Problem_>;
+  using OaccDataType = ck::remove_cvref_t<typename Problem::OaccDataType>;
+  using ODataType = ck::remove_cvref_t<typename Problem::ODataType>;
 
-    __host__ __device__ static constexpr ck::index_t GetSmemSize() { return 0; }
+  __host__ __device__ static constexpr ck::index_t GetSmemSize() {
+    return 0;
+  }
 
-    template <typename ODramWindowTmp, typename OAccTile>
-    __device__ auto operator()(ODramWindowTmp& o_dram_window_tmp, const OAccTile& o_acc_tile)
-    {
-        using namespace ck;
-        using namespace ck::tile_program;
+  template <typename ODramWindowTmp, typename OAccTile>
+  __device__ auto operator()(
+      ODramWindowTmp& o_dram_window_tmp,
+      const OAccTile& o_acc_tile) {
+    using namespace ck;
+    using namespace ck::tile_program;
 
-        const auto o = tile_elementwise_in(type_convert<ODataType, OaccDataType>, o_acc_tile);
-        store_tile(o_dram_window_tmp, o);
-    }
+    const auto o =
+        tile_elementwise_in(type_convert<ODataType, OaccDataType>, o_acc_tile);
+    store_tile(o_dram_window_tmp, o);
+  }
 };
diff --git a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_fwd_tile_partitioner.h b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_fwd_tile_partitioner.h
index 1067eaf7b..34537d707 100644
--- a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_fwd_tile_partitioner.h
+++ b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_fwd_tile_partitioner.h
@@ -6,52 +6,51 @@
  */
 #pragma once
 
-#include "ck/utility/common_header.hpp"
 #include "ck/tile_program/tile/store_tile.hpp"
 #include "ck/tile_program/tile/tile_elementwise.hpp"
+#include "ck/utility/common_header.hpp"
 
 template <typename BlockFmhaShape_>
-struct FmhaFwdTilePartitioner
-{
-    using BlockFmhaShape = ck::remove_cvref_t<BlockFmhaShape_>;
-
-    static constexpr ck::index_t kM0 = BlockFmhaShape::kM0;
-    static constexpr ck::index_t kN0 = BlockFmhaShape::kN0;
-    static constexpr ck::index_t kK0 = BlockFmhaShape::kK0;
-    static constexpr ck::index_t kN1 = BlockFmhaShape::kN1;
-    static constexpr ck::index_t kK1 = BlockFmhaShape::kK1;
-
-    __host__ static constexpr auto GridSize(ck::index_t batch_size_,
-                                            ck::index_t nhead_,
-                                            ck::index_t seqlen_q_,
-                                            ck::index_t hdim_v_)
-    {
-        // TODO: this may need tuning
-        return dim3(ck::math::integer_divide_ceil(seqlen_q_, kM0) *
-                        ck::math::integer_divide_ceil(hdim_v_, kN1),
-                    nhead_,
-                    batch_size_);
-    }
-
-    __device__ auto operator()(ck::index_t /*seqlen_q*/, ck::index_t hdim_v)
-    {
-        using namespace ck;
-
-        // const index_t num_tile_m0 = seqlen_q / kM0;
-        const index_t num_tile_n1 = ck::math::integer_divide_ceil(hdim_v, kN1);
-
-        const index_t i_block = blockIdx.x;
-        const index_t i_nhead = blockIdx.y;
-        const index_t i_batch = blockIdx.z;
-
-        const auto f = [](index_t dividend, index_t divisor) {
-            index_t quotient = dividend / divisor;
-            index_t modulus  = dividend - quotient * divisor;
-            return ck::make_tuple(quotient, modulus);
-        };
-
-        const auto [i_tile_m, i_tile_n] = f(i_block, num_tile_n1);
-
-        return ck::make_tuple(i_tile_m, i_tile_n, i_nhead, i_batch);
-    }
+struct FmhaFwdTilePartitioner {
+  using BlockFmhaShape = ck::remove_cvref_t<BlockFmhaShape_>;
+
+  static constexpr ck::index_t kM0 = BlockFmhaShape::kM0;
+  static constexpr ck::index_t kN0 = BlockFmhaShape::kN0;
+  static constexpr ck::index_t kK0 = BlockFmhaShape::kK0;
+  static constexpr ck::index_t kN1 = BlockFmhaShape::kN1;
+  static constexpr ck::index_t kK1 = BlockFmhaShape::kK1;
+
+  __host__ static constexpr auto GridSize(
+      ck::index_t batch_size_,
+      ck::index_t nhead_,
+      ck::index_t seqlen_q_,
+      ck::index_t hdim_v_) {
+    // TODO: this may need tuning
+    return dim3(
+        ck::math::integer_divide_ceil(seqlen_q_, kM0) *
+            ck::math::integer_divide_ceil(hdim_v_, kN1),
+        nhead_,
+        batch_size_);
+  }
+
+  __device__ auto operator()(ck::index_t /*seqlen_q*/, ck::index_t hdim_v) {
+    using namespace ck;
+
+    // const index_t num_tile_m0 = seqlen_q / kM0;
+    const index_t num_tile_n1 = ck::math::integer_divide_ceil(hdim_v, kN1);
+
+    const index_t i_block = blockIdx.x;
+    const index_t i_nhead = blockIdx.y;
+    const index_t i_batch = blockIdx.z;
+
+    const auto f = [](index_t dividend, index_t divisor) {
+      index_t quotient = dividend / divisor;
+      index_t modulus = dividend - quotient * divisor;
+      return ck::make_tuple(quotient, modulus);
+    };
+
+    const auto [i_tile_m, i_tile_n] = f(i_block, num_tile_n1);
+
+    return ck::make_tuple(i_tile_m, i_tile_n, i_nhead, i_batch);
+  }
 };
diff --git a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_forward.h b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_forward.h
index 7b8707aa3..33eb580c1 100644
--- a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_forward.h
+++ b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_forward.h
@@ -6,170 +6,194 @@
  */
 #pragma once
 
+#include <iostream>
 #include <optional>
 #include <sstream>
 #include <stdexcept>
-#include <iostream>
 
-#include <ck/utility/common_header.hpp>
 #include <ck/host_utility/device_prop.hpp>
 #include <ck/host_utility/kernel_launch.hpp>
+#include <ck/tensor/tensor_view.hpp>
 #include <ck/tensor_description/cluster_descriptor.hpp>
 #include <ck/tensor_description/tensor_descriptor_helper.hpp>
-#include <ck/tensor/tensor_view.hpp>
+#include <ck/utility/common_header.hpp>
 
+#include <ck/tile_program/block_tile/block_masking.hpp>
 #include <ck/tile_program/block_tile_pipeline/block_fmha_pipeline_problem.hpp>
 #include <ck/tile_program/block_tile_pipeline/block_fmha_pipeline_qr_ks_vs.hpp>
 #include <ck/tile_program/block_tile_pipeline/block_fmha_pipeline_qs_ks_vs.hpp>
 #include <ck/tile_program/tile/tile_fmha_shape.hpp>
 #include <ck/tile_program/tile/tile_fmha_traits.hpp>
-#include <ck/tile_program/block_tile/block_masking.hpp>
 
+#include "ck_tiled_fmha_definitions.h"
 #include "ck_tiled_fmha_forward_kernel.h"
 #include "ck_tiled_fmha_fwd_epilogue.h"
 #include "ck_tiled_fmha_fwd_tile_partitioner.h"
 #include "ck_tiled_fmha_params.h"
-#include "ck_tiled_fmha_definitions.h"
 
 #include "ck_tiled_bool_switch.h"
 #include "ck_tiled_headdim_switch.h"
 
-template <typename scalar_t, bool has_causal_mask, bool has_attn_bias, ck::index_t HDim>
-struct grouped_forward_causalmask_attnbias_dispatched
-{
-    using FmhaEpilogue =
-        FmhaFwdEpilogue<FmhaFwdEpilogueProblem<typename FmhaFwdTypeConfig<scalar_t>::OaccDataType,
-                                               typename FmhaFwdTypeConfig<scalar_t>::ODataType>>;
-
-    template <typename FmhaTraits, typename FmhaMask>
-    using FmhaPipelineProblemTemp = ck::tile_program::block::BlockFmhaPipelineProblem<
-        typename FmhaFwdTypeConfig<scalar_t>::QDataType,
-        typename FmhaFwdTypeConfig<scalar_t>::KDataType,
-        typename FmhaFwdTypeConfig<scalar_t>::VDataType,
-        typename FmhaFwdTypeConfig<scalar_t>::SaccDataType,
-        typename FmhaFwdTypeConfig<scalar_t>::SMPLComputeDataType,
-        typename FmhaFwdTypeConfig<scalar_t>::BiasDataType,
-        typename FmhaFwdTypeConfig<scalar_t>::LSEDataType,
-        typename FmhaFwdTypeConfig<scalar_t>::PDataType,
-        typename FmhaFwdTypeConfig<scalar_t>::OaccDataType,
-        typename FmhaFwdTypeConfig<scalar_t>::ODataType,
-        FmhaFwdShape<HDim>,
-        true, // kIsGroupMode
-        FmhaMask,
-        FmhaTraits>;
-
-    static void Run(GroupedForwardParams& param, hipStream_t stream)
-    {
-        const bool has_local_attention = (param.window_size > 0) ? true : false;
-
-        BOOL_SWITCH(has_local_attention, USE_LOCAL_ATTENTION, [&] {
-            constexpr bool has_masking = has_causal_mask || USE_LOCAL_ATTENTION;
-
-            using FmhaMask =
-                ck::tile_program::block::GenericAttentionMask<has_masking, USE_LOCAL_ATTENTION>;
-
-            using FmhaShape                 = FmhaFwdShape<HDim>;
-            using FmhaTilePartitioner       = FmhaFwdTilePartitioner<FmhaShape>;
-            constexpr ck::index_t occupancy = (HDim == 64) ? 3 : (HDim == 256) ? 1 : 2;
-
-            constexpr bool kPadSeqLenQ = true;
-            constexpr bool kPadSeqLenK = true;
-
-            bool pad_headdim_q = !(param.K % FmhaShape::kK0BlockLength == 0);
-            bool pad_headdim_v = !(param.Kv % FmhaShape::kN1 == 0);
-
-            if constexpr(HDim == 256)
-            {
-                BOOL_SWITCH_2(pad_headdim_q, kPadHeadDimQ, pad_headdim_v, kPadHeadDimV, [&] {
-                    using FmhaTraits = ck::tile_program::TileFmhaTraits<kPadSeqLenQ,
-                                                                        kPadSeqLenK,
-                                                                        kPadHeadDimQ,
-                                                                        kPadHeadDimV,
-                                                                        has_attn_bias,
-                                                                        true, // kStoreLSE
-                                                                        occupancy>;
-
-                    using FmhaPipelineProblem = FmhaPipelineProblemTemp<FmhaTraits, FmhaMask>;
-
-                    using FmhaPipeline =
-                        ck::tile_program::block::BlockFmhaPipelineQSKSVS<FmhaPipelineProblem>;
-                    using FmhaKernel =
-                        FmhaFwdKernel<FmhaTilePartitioner, FmhaPipeline, FmhaEpilogue>;
-
-                    RunWithKernel<FmhaKernel>(param, stream);
-                });
-            }
-            else
-            {
-                BOOL_SWITCH_2(pad_headdim_q, kPadHeadDimQ, pad_headdim_v, kPadHeadDimV, [&] {
-                    using FmhaTraits = ck::tile_program::TileFmhaTraits<kPadSeqLenQ,
-                                                                        kPadSeqLenK,
-                                                                        kPadHeadDimQ,
-                                                                        kPadHeadDimV,
-                                                                        has_attn_bias,
-                                                                        true, // kStoreLSE
-                                                                        occupancy>;
-
-                    using FmhaPipelineProblem = FmhaPipelineProblemTemp<FmhaTraits, FmhaMask>;
-
-                    using FmhaPipeline =
-                        ck::tile_program::block::BlockFmhaPipelineQRKSVS<FmhaPipelineProblem>;
-                    using FmhaKernel =
-                        FmhaFwdKernel<FmhaTilePartitioner, FmhaPipeline, FmhaEpilogue>;
-
-                    RunWithKernel<FmhaKernel>(param, stream);
-                });
-            };
-        });
-    };
-
-    template <typename FmhaKernel>
-    static void RunWithKernel(GroupedForwardParams& param, hipStream_t stream)
-    {
-        const auto kargs = [&] {
-            return FmhaKernel::MakeKargs(
-                param.q_ptr,
-                param.k_ptr,
-                param.v_ptr,
-                param.attn_bias_ptr,
-                param.logsumexp_ptr,
-                param.out_ptr,
-                param.seqstart_q_dev_ptr,
-                param.seqstart_k_dev_ptr,
-                param.seqlen_k_dev_ptr,
-                param.K,              // hdim_q
-                param.Kv,             // hdim_v
-                param.Hq / param.Hkv, // nhead_ratio_qk
-                param.scale,
-                param.q_strides[0], // q, k, v, bias, out tensor seq-dim stride
-                param.k_strides[0],
-                param.v_strides[0],
-                param.attn_bias_strides[2],
-                param.out_strides[0],
-                param.q_strides[1], // q, k, v, bias, lse, out tensor head-dim stride
-                param.k_strides[1],
-                param.v_strides[1],
-                param.attn_bias_strides[1],
-                param.max_seqlen_q, // nhead_stride_lse
-                param.out_strides[1],
-                static_cast<CausalMaskType>(param.custom_mask_type),
-                param.window_size);
-        }();
-
-        dim3 kGridSize =
-            FmhaKernel::GridSize(param.num_batches, param.Hq, param.max_seqlen_q, param.Kv);
-        constexpr dim3 kBlockSize         = FmhaKernel::BlockSize();
-        constexpr ck::index_t kBlockPerCu = FmhaKernel::kBlockPerCu;
-
-        (void)launch_kernel<kBlockSize.x, kBlockPerCu>(
-            StreamConfig{stream, false}, FmhaKernel{}, kGridSize, kBlockSize, 0, kargs);
-    };
+template <
+    typename scalar_t,
+    bool has_causal_mask,
+    bool has_attn_bias,
+    ck::index_t HDim>
+struct grouped_forward_causalmask_attnbias_dispatched {
+  using FmhaEpilogue = FmhaFwdEpilogue<FmhaFwdEpilogueProblem<
+      typename FmhaFwdTypeConfig<scalar_t>::OaccDataType,
+      typename FmhaFwdTypeConfig<scalar_t>::ODataType>>;
+
+  template <typename FmhaTraits, typename FmhaMask>
+  using FmhaPipelineProblemTemp =
+      ck::tile_program::block::BlockFmhaPipelineProblem<
+          typename FmhaFwdTypeConfig<scalar_t>::QDataType,
+          typename FmhaFwdTypeConfig<scalar_t>::KDataType,
+          typename FmhaFwdTypeConfig<scalar_t>::VDataType,
+          typename FmhaFwdTypeConfig<scalar_t>::SaccDataType,
+          typename FmhaFwdTypeConfig<scalar_t>::SMPLComputeDataType,
+          typename FmhaFwdTypeConfig<scalar_t>::BiasDataType,
+          typename FmhaFwdTypeConfig<scalar_t>::LSEDataType,
+          typename FmhaFwdTypeConfig<scalar_t>::PDataType,
+          typename FmhaFwdTypeConfig<scalar_t>::OaccDataType,
+          typename FmhaFwdTypeConfig<scalar_t>::ODataType,
+          FmhaFwdShape<HDim>,
+          true, // kIsGroupMode
+          FmhaMask,
+          FmhaTraits>;
+
+  static void Run(GroupedForwardParams& param, hipStream_t stream) {
+    const bool has_local_attention = (param.window_size > 0) ? true : false;
+
+    BOOL_SWITCH(has_local_attention, USE_LOCAL_ATTENTION, [&] {
+      constexpr bool has_masking = has_causal_mask || USE_LOCAL_ATTENTION;
+
+      using FmhaMask = ck::tile_program::block::
+          GenericAttentionMask<has_masking, USE_LOCAL_ATTENTION>;
+
+      using FmhaShape = FmhaFwdShape<HDim>;
+      using FmhaTilePartitioner = FmhaFwdTilePartitioner<FmhaShape>;
+      constexpr ck::index_t occupancy =
+          (HDim == 64) ? 3 : (HDim == 256) ? 1 : 2;
+
+      constexpr bool kPadSeqLenQ = true;
+      constexpr bool kPadSeqLenK = true;
+
+      bool pad_headdim_q = !(param.K % FmhaShape::kK0BlockLength == 0);
+      bool pad_headdim_v = !(param.Kv % FmhaShape::kN1 == 0);
+
+      if constexpr (HDim == 256) {
+        BOOL_SWITCH_2(
+            pad_headdim_q, kPadHeadDimQ, pad_headdim_v, kPadHeadDimV, [&] {
+              using FmhaTraits = ck::tile_program::TileFmhaTraits<
+                  kPadSeqLenQ,
+                  kPadSeqLenK,
+                  kPadHeadDimQ,
+                  kPadHeadDimV,
+                  has_attn_bias,
+                  true, // kStoreLSE
+                  occupancy>;
+
+              using FmhaPipelineProblem =
+                  FmhaPipelineProblemTemp<FmhaTraits, FmhaMask>;
+
+              using FmhaPipeline =
+                  ck::tile_program::block::BlockFmhaPipelineQSKSVS<
+                      FmhaPipelineProblem>;
+              using FmhaKernel = FmhaFwdKernel<
+                  FmhaTilePartitioner,
+                  FmhaPipeline,
+                  FmhaEpilogue>;
+
+              RunWithKernel<FmhaKernel>(param, stream);
+            });
+      } else {
+        BOOL_SWITCH_2(
+            pad_headdim_q, kPadHeadDimQ, pad_headdim_v, kPadHeadDimV, [&] {
+              using FmhaTraits = ck::tile_program::TileFmhaTraits<
+                  kPadSeqLenQ,
+                  kPadSeqLenK,
+                  kPadHeadDimQ,
+                  kPadHeadDimV,
+                  has_attn_bias,
+                  true, // kStoreLSE
+                  occupancy>;
+
+              using FmhaPipelineProblem =
+                  FmhaPipelineProblemTemp<FmhaTraits, FmhaMask>;
+
+              using FmhaPipeline =
+                  ck::tile_program::block::BlockFmhaPipelineQRKSVS<
+                      FmhaPipelineProblem>;
+              using FmhaKernel = FmhaFwdKernel<
+                  FmhaTilePartitioner,
+                  FmhaPipeline,
+                  FmhaEpilogue>;
+
+              RunWithKernel<FmhaKernel>(param, stream);
+            });
+      };
+    });
+  };
+
+  template <typename FmhaKernel>
+  static void RunWithKernel(GroupedForwardParams& param, hipStream_t stream) {
+    const auto kargs = [&] {
+      return FmhaKernel::MakeKargs(
+          param.q_ptr,
+          param.k_ptr,
+          param.v_ptr,
+          param.attn_bias_ptr,
+          param.logsumexp_ptr,
+          param.out_ptr,
+          param.seqstart_q_dev_ptr,
+          param.seqstart_k_dev_ptr,
+          param.seqlen_k_dev_ptr,
+          param.K, // hdim_q
+          param.Kv, // hdim_v
+          param.Hq / param.Hkv, // nhead_ratio_qk
+          param.scale,
+          param.q_strides[0], // q, k, v, bias, out tensor seq-dim stride
+          param.k_strides[0],
+          param.v_strides[0],
+          param.attn_bias_strides[2],
+          param.out_strides[0],
+          param.q_strides[1], // q, k, v, bias, lse, out tensor head-dim stride
+          param.k_strides[1],
+          param.v_strides[1],
+          param.attn_bias_strides[1],
+          param.max_seqlen_q, // nhead_stride_lse
+          param.out_strides[1],
+          static_cast<CausalMaskType>(param.custom_mask_type),
+          param.window_size);
+    }();
+
+    dim3 kGridSize = FmhaKernel::GridSize(
+        param.num_batches, param.Hq, param.max_seqlen_q, param.Kv);
+    constexpr dim3 kBlockSize = FmhaKernel::BlockSize();
+    constexpr ck::index_t kBlockPerCu = FmhaKernel::kBlockPerCu;
+
+    (void)launch_kernel<kBlockSize.x, kBlockPerCu>(
+        StreamConfig{stream, false},
+        FmhaKernel{},
+        kGridSize,
+        kBlockSize,
+        0,
+        kargs);
+  };
 };
 
-template <typename scalar_t, bool has_causal_mask, bool has_attn_bias, ck::index_t HDim>
-void run_grouped_forward_causalmask_attnbias_dispatched(GroupedForwardParams& param,
-                                                        hipStream_t stream)
-{
-    grouped_forward_causalmask_attnbias_dispatched<scalar_t, has_causal_mask, has_attn_bias, HDim>::
-        Run(param, stream);
+template <
+    typename scalar_t,
+    bool has_causal_mask,
+    bool has_attn_bias,
+    ck::index_t HDim>
+void run_grouped_forward_causalmask_attnbias_dispatched(
+    GroupedForwardParams& param,
+    hipStream_t stream) {
+  grouped_forward_causalmask_attnbias_dispatched<
+      scalar_t,
+      has_causal_mask,
+      has_attn_bias,
+      HDim>::Run(param, stream);
 };
diff --git a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_forward_bp16.cpp b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_forward_bp16.cpp
index 5606f13e5..db313f3ef 100644
--- a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_forward_bp16.cpp
+++ b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_forward_bp16.cpp
@@ -49,22 +49,23 @@ extern template void run_grouped_forward_causalmask_attnbias_dispatched<ck::bhal
     GroupedForwardParams& param, hipStream_t stream);
 // clang-format on
 
-void grouped_forward_bp16(GroupedForwardParams& param, hipStream_t stream)
-{
-    BOOL_SWITCH(param.has_attn_bias, HAS_ATTN_BIAS, [&] {
-        FMHA_FWD_HEADDIM_SWITCH(param.K, param.Kv, HDim, [&] {
-            if(param.custom_mask_type == 0)
-                run_grouped_forward_causalmask_attnbias_dispatched<ck::bhalf_t,
-                                                                   false,
-                                                                   HAS_ATTN_BIAS,
-                                                                   HDim>(param, stream);
-            else if(param.custom_mask_type == 1 || param.custom_mask_type == 2)
-                run_grouped_forward_causalmask_attnbias_dispatched<ck::bhalf_t,
-                                                                   true,
-                                                                   HAS_ATTN_BIAS,
-                                                                   HDim>(param, stream);
-            else
-                throw std::runtime_error("Invalid custom_mask_type value");
-        });
+void grouped_forward_bp16(GroupedForwardParams& param, hipStream_t stream) {
+  BOOL_SWITCH(param.has_attn_bias, HAS_ATTN_BIAS, [&] {
+    FMHA_FWD_HEADDIM_SWITCH(param.K, param.Kv, HDim, [&] {
+      if (param.custom_mask_type == 0)
+        run_grouped_forward_causalmask_attnbias_dispatched<
+            ck::bhalf_t,
+            false,
+            HAS_ATTN_BIAS,
+            HDim>(param, stream);
+      else if (param.custom_mask_type == 1 || param.custom_mask_type == 2)
+        run_grouped_forward_causalmask_attnbias_dispatched<
+            ck::bhalf_t,
+            true,
+            HAS_ATTN_BIAS,
+            HDim>(param, stream);
+      else
+        throw std::runtime_error("Invalid custom_mask_type value");
     });
+  });
 };
diff --git a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_forward_fp16.cpp b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_forward_fp16.cpp
index 63b3e7b96..2e807d3a5 100644
--- a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_forward_fp16.cpp
+++ b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_forward_fp16.cpp
@@ -49,22 +49,23 @@ extern template void run_grouped_forward_causalmask_attnbias_dispatched<ck::half
     GroupedForwardParams& param, hipStream_t stream);
 // clang-format on
 
-void grouped_forward_fp16(GroupedForwardParams& param, hipStream_t stream)
-{
-    BOOL_SWITCH(param.has_attn_bias, HAS_ATTN_BIAS, [&] {
-        FMHA_FWD_HEADDIM_SWITCH(param.K, param.Kv, HDim, [&] {
-            if(param.custom_mask_type == 0)
-                run_grouped_forward_causalmask_attnbias_dispatched<ck::half_t,
-                                                                   false,
-                                                                   HAS_ATTN_BIAS,
-                                                                   HDim>(param, stream);
-            else if(param.custom_mask_type == 1 || param.custom_mask_type == 2)
-                run_grouped_forward_causalmask_attnbias_dispatched<ck::half_t,
-                                                                   true,
-                                                                   HAS_ATTN_BIAS,
-                                                                   HDim>(param, stream);
-            else
-                throw std::runtime_error("Invalid custom_mask_type value");
-        });
+void grouped_forward_fp16(GroupedForwardParams& param, hipStream_t stream) {
+  BOOL_SWITCH(param.has_attn_bias, HAS_ATTN_BIAS, [&] {
+    FMHA_FWD_HEADDIM_SWITCH(param.K, param.Kv, HDim, [&] {
+      if (param.custom_mask_type == 0)
+        run_grouped_forward_causalmask_attnbias_dispatched<
+            ck::half_t,
+            false,
+            HAS_ATTN_BIAS,
+            HDim>(param, stream);
+      else if (param.custom_mask_type == 1 || param.custom_mask_type == 2)
+        run_grouped_forward_causalmask_attnbias_dispatched<
+            ck::half_t,
+            true,
+            HAS_ATTN_BIAS,
+            HDim>(param, stream);
+      else
+        throw std::runtime_error("Invalid custom_mask_type value");
     });
+  });
 };
diff --git a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_infer.h b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_infer.h
index 31849f7b6..11b2857fd 100644
--- a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_infer.h
+++ b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_infer.h
@@ -6,170 +6,194 @@
  */
 #pragma once
 
+#include <iostream>
 #include <optional>
 #include <sstream>
 #include <stdexcept>
-#include <iostream>
 
-#include <ck/utility/common_header.hpp>
 #include <ck/host_utility/device_prop.hpp>
 #include <ck/host_utility/kernel_launch.hpp>
+#include <ck/tensor/tensor_view.hpp>
 #include <ck/tensor_description/cluster_descriptor.hpp>
 #include <ck/tensor_description/tensor_descriptor_helper.hpp>
-#include <ck/tensor/tensor_view.hpp>
+#include <ck/utility/common_header.hpp>
 
+#include <ck/tile_program/block_tile/block_masking.hpp>
 #include <ck/tile_program/block_tile_pipeline/block_fmha_pipeline_problem.hpp>
 #include <ck/tile_program/block_tile_pipeline/block_fmha_pipeline_qr_ks_vs.hpp>
 #include <ck/tile_program/block_tile_pipeline/block_fmha_pipeline_qs_ks_vs.hpp>
 #include <ck/tile_program/tile/tile_fmha_shape.hpp>
 #include <ck/tile_program/tile/tile_fmha_traits.hpp>
-#include <ck/tile_program/block_tile/block_masking.hpp>
 
+#include "ck_tiled_fmha_definitions.h"
 #include "ck_tiled_fmha_forward_kernel.h"
 #include "ck_tiled_fmha_fwd_epilogue.h"
 #include "ck_tiled_fmha_fwd_tile_partitioner.h"
 #include "ck_tiled_fmha_params.h"
-#include "ck_tiled_fmha_definitions.h"
 
 #include "ck_tiled_bool_switch.h"
 #include "ck_tiled_headdim_switch.h"
 
-template <typename scalar_t, bool has_causal_mask, bool has_attn_bias, ck::index_t HDim>
-struct grouped_infer_causalmask_attnbias_dispatched
-{
-    using FmhaEpilogue =
-        FmhaFwdEpilogue<FmhaFwdEpilogueProblem<typename FmhaFwdTypeConfig<scalar_t>::OaccDataType,
-                                               typename FmhaFwdTypeConfig<scalar_t>::ODataType>>;
-
-    template <typename FmhaTraits, typename FmhaMask>
-    using FmhaPipelineProblemTemp = ck::tile_program::block::BlockFmhaPipelineProblem<
-        typename FmhaFwdTypeConfig<scalar_t>::QDataType,
-        typename FmhaFwdTypeConfig<scalar_t>::KDataType,
-        typename FmhaFwdTypeConfig<scalar_t>::VDataType,
-        typename FmhaFwdTypeConfig<scalar_t>::SaccDataType,
-        typename FmhaFwdTypeConfig<scalar_t>::SMPLComputeDataType,
-        typename FmhaFwdTypeConfig<scalar_t>::BiasDataType,
-        typename FmhaFwdTypeConfig<scalar_t>::LSEDataType,
-        typename FmhaFwdTypeConfig<scalar_t>::PDataType,
-        typename FmhaFwdTypeConfig<scalar_t>::OaccDataType,
-        typename FmhaFwdTypeConfig<scalar_t>::ODataType,
-        FmhaFwdShape<HDim>,
-        true, // kIsGroupMode
-        FmhaMask,
-        FmhaTraits>;
-
-    static void Run(GroupedForwardParams& param, hipStream_t stream)
-    {
-        const bool has_local_attention = (param.window_size > 0) ? true : false;
-
-        BOOL_SWITCH(has_local_attention, USE_LOCAL_ATTENTION, [&] {
-            constexpr bool has_masking = has_causal_mask || USE_LOCAL_ATTENTION;
-
-            using FmhaMask =
-                ck::tile_program::block::GenericAttentionMask<has_masking, USE_LOCAL_ATTENTION>;
-
-            using FmhaShape                 = FmhaFwdShape<HDim>;
-            using FmhaTilePartitioner       = FmhaFwdTilePartitioner<FmhaShape>;
-            constexpr ck::index_t occupancy = (HDim == 64) ? 3 : ((HDim == 256) ? 1 : 2);
-
-            constexpr bool kPadSeqLenQ = true;
-            constexpr bool kPadSeqLenK = true;
-
-            bool pad_headdim_q = !(param.K % FmhaShape::kK0BlockLength == 0);
-            bool pad_headdim_v = !(param.Kv % FmhaShape::kN1 == 0);
-
-            if constexpr(HDim == 256)
-            {
-                BOOL_SWITCH_2(pad_headdim_q, kPadHeadDimQ, pad_headdim_v, kPadHeadDimV, [&] {
-                    using FmhaTraits = ck::tile_program::TileFmhaTraits<kPadSeqLenQ,
-                                                                        kPadSeqLenK,
-                                                                        kPadHeadDimQ,
-                                                                        kPadHeadDimV,
-                                                                        has_attn_bias,
-                                                                        false, // kStoreLSE
-                                                                        occupancy>;
-
-                    using FmhaPipelineProblem = FmhaPipelineProblemTemp<FmhaTraits, FmhaMask>;
-
-                    using FmhaPipeline =
-                        ck::tile_program::block::BlockFmhaPipelineQSKSVS<FmhaPipelineProblem>;
-                    using FmhaKernel =
-                        FmhaFwdKernel<FmhaTilePartitioner, FmhaPipeline, FmhaEpilogue>;
-
-                    RunWithKernel<FmhaKernel>(param, stream);
-                });
-            }
-            else
-            {
-                BOOL_SWITCH_2(pad_headdim_q, kPadHeadDimQ, pad_headdim_v, kPadHeadDimV, [&] {
-                    using FmhaTraits = ck::tile_program::TileFmhaTraits<kPadSeqLenQ,
-                                                                        kPadSeqLenK,
-                                                                        kPadHeadDimQ,
-                                                                        kPadHeadDimV,
-                                                                        has_attn_bias,
-                                                                        false, // kStoreLSE
-                                                                        occupancy>;
-
-                    using FmhaPipelineProblem = FmhaPipelineProblemTemp<FmhaTraits, FmhaMask>;
-
-                    using FmhaPipeline =
-                        ck::tile_program::block::BlockFmhaPipelineQRKSVS<FmhaPipelineProblem>;
-                    using FmhaKernel =
-                        FmhaFwdKernel<FmhaTilePartitioner, FmhaPipeline, FmhaEpilogue>;
-
-                    RunWithKernel<FmhaKernel>(param, stream);
-                });
-            };
-        });
-    };
-
-    template <typename FmhaKernel>
-    static void RunWithKernel(GroupedForwardParams& param, hipStream_t stream)
-    {
-        const auto kargs = [&] {
-            return FmhaKernel::MakeKargs(
-                param.q_ptr,
-                param.k_ptr,
-                param.v_ptr,
-                param.attn_bias_ptr,
-                nullptr, // lse_ptr
-                param.out_ptr,
-                param.seqstart_q_dev_ptr,
-                param.seqstart_k_dev_ptr,
-                param.seqlen_k_dev_ptr,
-                param.K,              // hdim_q
-                param.Kv,             // hdim_v
-                param.Hq / param.Hkv, // nhead_ratio_qk
-                param.scale,
-                param.q_strides[0], // q, k, v, bias, out tensor seq-dim stride
-                param.k_strides[0],
-                param.v_strides[0],
-                param.attn_bias_strides[2],
-                param.out_strides[0],
-                param.q_strides[1], // q, k, v, bias, lse, out tensor head-dim stride
-                param.k_strides[1],
-                param.v_strides[1],
-                param.attn_bias_strides[1],
-                0, // nhead_stride_lse
-                param.out_strides[1],
-                static_cast<CausalMaskType>(param.custom_mask_type),
-                param.window_size);
-        }();
-
-        dim3 kGridSize =
-            FmhaKernel::GridSize(param.num_batches, param.Hq, param.max_seqlen_q, param.Kv);
-        constexpr dim3 kBlockSize         = FmhaKernel::BlockSize();
-        constexpr ck::index_t kBlockPerCu = FmhaKernel::kBlockPerCu;
-
-        (void)launch_kernel<kBlockSize.x, kBlockPerCu>(
-            StreamConfig{stream, false}, FmhaKernel{}, kGridSize, kBlockSize, 0, kargs);
-    };
+template <
+    typename scalar_t,
+    bool has_causal_mask,
+    bool has_attn_bias,
+    ck::index_t HDim>
+struct grouped_infer_causalmask_attnbias_dispatched {
+  using FmhaEpilogue = FmhaFwdEpilogue<FmhaFwdEpilogueProblem<
+      typename FmhaFwdTypeConfig<scalar_t>::OaccDataType,
+      typename FmhaFwdTypeConfig<scalar_t>::ODataType>>;
+
+  template <typename FmhaTraits, typename FmhaMask>
+  using FmhaPipelineProblemTemp =
+      ck::tile_program::block::BlockFmhaPipelineProblem<
+          typename FmhaFwdTypeConfig<scalar_t>::QDataType,
+          typename FmhaFwdTypeConfig<scalar_t>::KDataType,
+          typename FmhaFwdTypeConfig<scalar_t>::VDataType,
+          typename FmhaFwdTypeConfig<scalar_t>::SaccDataType,
+          typename FmhaFwdTypeConfig<scalar_t>::SMPLComputeDataType,
+          typename FmhaFwdTypeConfig<scalar_t>::BiasDataType,
+          typename FmhaFwdTypeConfig<scalar_t>::LSEDataType,
+          typename FmhaFwdTypeConfig<scalar_t>::PDataType,
+          typename FmhaFwdTypeConfig<scalar_t>::OaccDataType,
+          typename FmhaFwdTypeConfig<scalar_t>::ODataType,
+          FmhaFwdShape<HDim>,
+          true, // kIsGroupMode
+          FmhaMask,
+          FmhaTraits>;
+
+  static void Run(GroupedForwardParams& param, hipStream_t stream) {
+    const bool has_local_attention = (param.window_size > 0) ? true : false;
+
+    BOOL_SWITCH(has_local_attention, USE_LOCAL_ATTENTION, [&] {
+      constexpr bool has_masking = has_causal_mask || USE_LOCAL_ATTENTION;
+
+      using FmhaMask = ck::tile_program::block::
+          GenericAttentionMask<has_masking, USE_LOCAL_ATTENTION>;
+
+      using FmhaShape = FmhaFwdShape<HDim>;
+      using FmhaTilePartitioner = FmhaFwdTilePartitioner<FmhaShape>;
+      constexpr ck::index_t occupancy =
+          (HDim == 64) ? 3 : ((HDim == 256) ? 1 : 2);
+
+      constexpr bool kPadSeqLenQ = true;
+      constexpr bool kPadSeqLenK = true;
+
+      bool pad_headdim_q = !(param.K % FmhaShape::kK0BlockLength == 0);
+      bool pad_headdim_v = !(param.Kv % FmhaShape::kN1 == 0);
+
+      if constexpr (HDim == 256) {
+        BOOL_SWITCH_2(
+            pad_headdim_q, kPadHeadDimQ, pad_headdim_v, kPadHeadDimV, [&] {
+              using FmhaTraits = ck::tile_program::TileFmhaTraits<
+                  kPadSeqLenQ,
+                  kPadSeqLenK,
+                  kPadHeadDimQ,
+                  kPadHeadDimV,
+                  has_attn_bias,
+                  false, // kStoreLSE
+                  occupancy>;
+
+              using FmhaPipelineProblem =
+                  FmhaPipelineProblemTemp<FmhaTraits, FmhaMask>;
+
+              using FmhaPipeline =
+                  ck::tile_program::block::BlockFmhaPipelineQSKSVS<
+                      FmhaPipelineProblem>;
+              using FmhaKernel = FmhaFwdKernel<
+                  FmhaTilePartitioner,
+                  FmhaPipeline,
+                  FmhaEpilogue>;
+
+              RunWithKernel<FmhaKernel>(param, stream);
+            });
+      } else {
+        BOOL_SWITCH_2(
+            pad_headdim_q, kPadHeadDimQ, pad_headdim_v, kPadHeadDimV, [&] {
+              using FmhaTraits = ck::tile_program::TileFmhaTraits<
+                  kPadSeqLenQ,
+                  kPadSeqLenK,
+                  kPadHeadDimQ,
+                  kPadHeadDimV,
+                  has_attn_bias,
+                  false, // kStoreLSE
+                  occupancy>;
+
+              using FmhaPipelineProblem =
+                  FmhaPipelineProblemTemp<FmhaTraits, FmhaMask>;
+
+              using FmhaPipeline =
+                  ck::tile_program::block::BlockFmhaPipelineQRKSVS<
+                      FmhaPipelineProblem>;
+              using FmhaKernel = FmhaFwdKernel<
+                  FmhaTilePartitioner,
+                  FmhaPipeline,
+                  FmhaEpilogue>;
+
+              RunWithKernel<FmhaKernel>(param, stream);
+            });
+      };
+    });
+  };
+
+  template <typename FmhaKernel>
+  static void RunWithKernel(GroupedForwardParams& param, hipStream_t stream) {
+    const auto kargs = [&] {
+      return FmhaKernel::MakeKargs(
+          param.q_ptr,
+          param.k_ptr,
+          param.v_ptr,
+          param.attn_bias_ptr,
+          nullptr, // lse_ptr
+          param.out_ptr,
+          param.seqstart_q_dev_ptr,
+          param.seqstart_k_dev_ptr,
+          param.seqlen_k_dev_ptr,
+          param.K, // hdim_q
+          param.Kv, // hdim_v
+          param.Hq / param.Hkv, // nhead_ratio_qk
+          param.scale,
+          param.q_strides[0], // q, k, v, bias, out tensor seq-dim stride
+          param.k_strides[0],
+          param.v_strides[0],
+          param.attn_bias_strides[2],
+          param.out_strides[0],
+          param.q_strides[1], // q, k, v, bias, lse, out tensor head-dim stride
+          param.k_strides[1],
+          param.v_strides[1],
+          param.attn_bias_strides[1],
+          0, // nhead_stride_lse
+          param.out_strides[1],
+          static_cast<CausalMaskType>(param.custom_mask_type),
+          param.window_size);
+    }();
+
+    dim3 kGridSize = FmhaKernel::GridSize(
+        param.num_batches, param.Hq, param.max_seqlen_q, param.Kv);
+    constexpr dim3 kBlockSize = FmhaKernel::BlockSize();
+    constexpr ck::index_t kBlockPerCu = FmhaKernel::kBlockPerCu;
+
+    (void)launch_kernel<kBlockSize.x, kBlockPerCu>(
+        StreamConfig{stream, false},
+        FmhaKernel{},
+        kGridSize,
+        kBlockSize,
+        0,
+        kargs);
+  };
 };
 
-template <typename scalar_t, bool has_causal_mask, bool has_attn_bias, ck::index_t HDim>
-void run_grouped_infer_causalmask_attnbias_dispatched(GroupedForwardParams& param,
-                                                      hipStream_t stream)
-{
-    grouped_infer_causalmask_attnbias_dispatched<scalar_t, has_causal_mask, has_attn_bias, HDim>::
-        Run(param, stream);
+template <
+    typename scalar_t,
+    bool has_causal_mask,
+    bool has_attn_bias,
+    ck::index_t HDim>
+void run_grouped_infer_causalmask_attnbias_dispatched(
+    GroupedForwardParams& param,
+    hipStream_t stream) {
+  grouped_infer_causalmask_attnbias_dispatched<
+      scalar_t,
+      has_causal_mask,
+      has_attn_bias,
+      HDim>::Run(param, stream);
 };
diff --git a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_infer_bp16.cpp b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_infer_bp16.cpp
index 5402ac327..ce95de00c 100644
--- a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_infer_bp16.cpp
+++ b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_infer_bp16.cpp
@@ -49,22 +49,23 @@ extern template void run_grouped_infer_causalmask_attnbias_dispatched<ck::bhalf_
     GroupedForwardParams& param, hipStream_t stream);
 // clang-format on
 
-void grouped_infer_bp16(GroupedForwardParams& param, hipStream_t stream)
-{
-    BOOL_SWITCH(param.has_attn_bias, HAS_ATTN_BIAS, [&] {
-        FMHA_FWD_HEADDIM_SWITCH(param.K, param.Kv, HDim, [&] {
-            if(param.custom_mask_type == 0)
-                run_grouped_infer_causalmask_attnbias_dispatched<ck::bhalf_t,
-                                                                 false,
-                                                                 HAS_ATTN_BIAS,
-                                                                 HDim>(param, stream);
-            else if(param.custom_mask_type == 1 || param.custom_mask_type == 2)
-                run_grouped_infer_causalmask_attnbias_dispatched<ck::bhalf_t,
-                                                                 true,
-                                                                 HAS_ATTN_BIAS,
-                                                                 HDim>(param, stream);
-            else
-                throw std::runtime_error("Invalid custom_mask_type value");
-        });
+void grouped_infer_bp16(GroupedForwardParams& param, hipStream_t stream) {
+  BOOL_SWITCH(param.has_attn_bias, HAS_ATTN_BIAS, [&] {
+    FMHA_FWD_HEADDIM_SWITCH(param.K, param.Kv, HDim, [&] {
+      if (param.custom_mask_type == 0)
+        run_grouped_infer_causalmask_attnbias_dispatched<
+            ck::bhalf_t,
+            false,
+            HAS_ATTN_BIAS,
+            HDim>(param, stream);
+      else if (param.custom_mask_type == 1 || param.custom_mask_type == 2)
+        run_grouped_infer_causalmask_attnbias_dispatched<
+            ck::bhalf_t,
+            true,
+            HAS_ATTN_BIAS,
+            HDim>(param, stream);
+      else
+        throw std::runtime_error("Invalid custom_mask_type value");
     });
+  });
 };
diff --git a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_infer_fp16.cpp b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_infer_fp16.cpp
index 17623121b..830176e68 100644
--- a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_infer_fp16.cpp
+++ b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_infer_fp16.cpp
@@ -49,22 +49,23 @@ extern template void run_grouped_infer_causalmask_attnbias_dispatched<ck::half_t
     GroupedForwardParams& param, hipStream_t stream);
 // clang-format on
 
-void grouped_infer_fp16(GroupedForwardParams& param, hipStream_t stream)
-{
-    BOOL_SWITCH(param.has_attn_bias, HAS_ATTN_BIAS, [&] {
-        FMHA_FWD_HEADDIM_SWITCH(param.K, param.Kv, HDim, [&] {
-            if(param.custom_mask_type == 0)
-                run_grouped_infer_causalmask_attnbias_dispatched<ck::half_t,
-                                                                 false,
-                                                                 HAS_ATTN_BIAS,
-                                                                 HDim>(param, stream);
-            else if(param.custom_mask_type == 1 || param.custom_mask_type == 2)
-                run_grouped_infer_causalmask_attnbias_dispatched<ck::half_t,
-                                                                 true,
-                                                                 HAS_ATTN_BIAS,
-                                                                 HDim>(param, stream);
-            else
-                throw std::runtime_error("Invalid custom_mask_type value");
-        });
+void grouped_infer_fp16(GroupedForwardParams& param, hipStream_t stream) {
+  BOOL_SWITCH(param.has_attn_bias, HAS_ATTN_BIAS, [&] {
+    FMHA_FWD_HEADDIM_SWITCH(param.K, param.Kv, HDim, [&] {
+      if (param.custom_mask_type == 0)
+        run_grouped_infer_causalmask_attnbias_dispatched<
+            ck::half_t,
+            false,
+            HAS_ATTN_BIAS,
+            HDim>(param, stream);
+      else if (param.custom_mask_type == 1 || param.custom_mask_type == 2)
+        run_grouped_infer_causalmask_attnbias_dispatched<
+            ck::half_t,
+            true,
+            HAS_ATTN_BIAS,
+            HDim>(param, stream);
+      else
+        throw std::runtime_error("Invalid custom_mask_type value");
     });
+  });
 };
diff --git a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_params.h b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_params.h
index 880434cf4..5d2c232ba 100644
--- a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_params.h
+++ b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_params.h
@@ -9,213 +9,207 @@
 #include <array>
 #include <cstdint>
 
-struct BatchedInferParams
-{
-    int B;   // batch size
-    int M;   // seq_len for Query
-    int N;   // seq_len for Key and Value
-    int Hq;  // number of heads for Query
-    int Hkv; // number of heads for Key and Value
-    int K;   // embed_dim for Query and Key
-    int Kv;  // embed_dim for Value
-
-    float scale;
-    bool has_attn_bias;
-
-    // BMHK mode strides
-    std::array<int, 4> q_strides;
-    std::array<int, 4> k_strides;
-    std::array<int, 4> v_strides;
-    std::array<int, 4> out_strides;
-    std::array<int, 4> attn_bias_strides; // 4d tensor_view [B, H, M, N]
-
-    const void* q_ptr;
-    const void* k_ptr;
-    const void* v_ptr;
-    const void* attn_bias_ptr;
-
-    int custom_mask_type;
-    int window_size; // local-attention
-
-    void* out_ptr;
+struct BatchedInferParams {
+  int B; // batch size
+  int M; // seq_len for Query
+  int N; // seq_len for Key and Value
+  int Hq; // number of heads for Query
+  int Hkv; // number of heads for Key and Value
+  int K; // embed_dim for Query and Key
+  int Kv; // embed_dim for Value
+
+  float scale;
+  bool has_attn_bias;
+
+  // BMHK mode strides
+  std::array<int, 4> q_strides;
+  std::array<int, 4> k_strides;
+  std::array<int, 4> v_strides;
+  std::array<int, 4> out_strides;
+  std::array<int, 4> attn_bias_strides; // 4d tensor_view [B, H, M, N]
+
+  const void* q_ptr;
+  const void* k_ptr;
+  const void* v_ptr;
+  const void* attn_bias_ptr;
+
+  int custom_mask_type;
+  int window_size; // local-attention
+
+  void* out_ptr;
 };
 
-struct BatchedForwardParams : public BatchedInferParams
-{
-    bool use_dropout;
-    bool compute_logsumexp;
+struct BatchedForwardParams : public BatchedInferParams {
+  bool use_dropout;
+  bool compute_logsumexp;
 
-    float dropout_prob;
-    int64_t philox_seed;
-    int64_t philox_offset;
+  float dropout_prob;
+  int64_t philox_seed;
+  int64_t philox_offset;
 
-    // completely contiguous
-    void* logsumexp_ptr;
+  // completely contiguous
+  void* logsumexp_ptr;
 };
 
-struct GroupedInferParams
-{
-    int num_batches;
-    int M;   // total seq_len for all queries in the batch
-    int N;   // total seq_len for all keys/values in the batch
-    int Hq;  // number of heads for Query
-    int Hkv; // number of heads for Key and Value
-    int K;   // embed_dim for Query and Key
-    int Kv;  // embed_dim for Value
+struct GroupedInferParams {
+  int num_batches;
+  int M; // total seq_len for all queries in the batch
+  int N; // total seq_len for all keys/values in the batch
+  int Hq; // number of heads for Query
+  int Hkv; // number of heads for Key and Value
+  int K; // embed_dim for Query and Key
+  int Kv; // embed_dim for Value
 
-    int max_seqlen_q;
+  int max_seqlen_q;
 
-    void* seqstart_q_dev_ptr;
-    void* seqstart_k_dev_ptr;
-    void* seqlen_k_dev_ptr;
+  void* seqstart_q_dev_ptr;
+  void* seqstart_k_dev_ptr;
+  void* seqlen_k_dev_ptr;
 
-    float scale;
-    bool has_attn_bias;
+  float scale;
+  bool has_attn_bias;
 
-    // MHK mode strides, last-dim contiguous
-    std::array<int, 3> q_strides;
-    std::array<int, 3> k_strides;
-    std::array<int, 3> v_strides;
-    std::array<int, 3> out_strides;
+  // MHK mode strides, last-dim contiguous
+  std::array<int, 3> q_strides;
+  std::array<int, 3> k_strides;
+  std::array<int, 3> v_strides;
+  std::array<int, 3> out_strides;
 
-    // 4d tensor view [B, H, M, N]
-    std::array<int, 4> attn_bias_strides;
+  // 4d tensor view [B, H, M, N]
+  std::array<int, 4> attn_bias_strides;
 
-    const void* q_ptr;
-    const void* k_ptr;
-    const void* v_ptr;
-    const void* attn_bias_ptr;
+  const void* q_ptr;
+  const void* k_ptr;
+  const void* v_ptr;
+  const void* attn_bias_ptr;
 
-    int custom_mask_type;
-    int window_size; // local-attention
+  int custom_mask_type;
+  int window_size; // local-attention
 
-    void* out_ptr;
+  void* out_ptr;
 };
 
-struct GroupedForwardParams : public GroupedInferParams
-{
-    bool use_dropout;
-    bool compute_logsumexp;
+struct GroupedForwardParams : public GroupedInferParams {
+  bool use_dropout;
+  bool compute_logsumexp;
 
-    float dropout_prob;
-    int64_t philox_seed;
-    int64_t philox_offset;
+  float dropout_prob;
+  int64_t philox_seed;
+  int64_t philox_offset;
 
-    // completely contiguous
-    void* logsumexp_ptr;
+  // completely contiguous
+  void* logsumexp_ptr;
 
-    // TODO: need remove this after dev-op fix
-    std::vector<void*> randvals_ptrs;
+  // TODO: need remove this after dev-op fix
+  std::vector<void*> randvals_ptrs;
 };
 
-struct BatchedBackwardParams
-{
-    int B;   // batch size
-    int M;   // seq_len for Query
-    int N;   // seq_len for Key and Value
-    int Hq;  // number of heads for Query
-    int Hkv; // number of heads for Key and Value
-    int K;   // embed_dim for Query and Key
-    int Kv;  // embed_dim for Value
-
-    float scale;
-    bool has_attn_bias;
-    bool bias_has_grad;
-
-    bool use_fp32_qkv_grad;
-    bool is_mqa_gqa;
-
-    // BMHK mode strides, last-dim contiguous
-    std::array<int, 4> q_strides;
-    std::array<int, 4> k_strides;
-    std::array<int, 4> v_strides;
-    std::array<int, 4> attn_bias_strides; // 4d tensor_view [B, H, M, N]
-    std::array<int, 4> out_strides;
-
-    std::array<int, 4> tmp_grad_k_strides;
-    std::array<int, 4> tmp_grad_v_strides;
-
-    const void* q_ptr;
-    const void* k_ptr;
-    const void* v_ptr;
-    const void* attn_bias_ptr;
-    const void* grad_out_ptr;
-    const void* out_ptr;
-
-    uint8_t custom_mask_type;
-
-    void* grad_q_ptr;
-    void* grad_k_ptr;
-    void* grad_v_ptr;
-    void* grad_bias_ptr;
-
-    float dropout_prob;
-    int64_t philox_seed;
-    int64_t philox_offset;
-
-    // BHM mode lengths, completely contiguous
-    const void* logsumexp_ptr;
+struct BatchedBackwardParams {
+  int B; // batch size
+  int M; // seq_len for Query
+  int N; // seq_len for Key and Value
+  int Hq; // number of heads for Query
+  int Hkv; // number of heads for Key and Value
+  int K; // embed_dim for Query and Key
+  int Kv; // embed_dim for Value
+
+  float scale;
+  bool has_attn_bias;
+  bool bias_has_grad;
+
+  bool use_fp32_qkv_grad;
+  bool is_mqa_gqa;
+
+  // BMHK mode strides, last-dim contiguous
+  std::array<int, 4> q_strides;
+  std::array<int, 4> k_strides;
+  std::array<int, 4> v_strides;
+  std::array<int, 4> attn_bias_strides; // 4d tensor_view [B, H, M, N]
+  std::array<int, 4> out_strides;
+
+  std::array<int, 4> tmp_grad_k_strides;
+  std::array<int, 4> tmp_grad_v_strides;
+
+  const void* q_ptr;
+  const void* k_ptr;
+  const void* v_ptr;
+  const void* attn_bias_ptr;
+  const void* grad_out_ptr;
+  const void* out_ptr;
+
+  uint8_t custom_mask_type;
+
+  void* grad_q_ptr;
+  void* grad_k_ptr;
+  void* grad_v_ptr;
+  void* grad_bias_ptr;
+
+  float dropout_prob;
+  int64_t philox_seed;
+  int64_t philox_offset;
+
+  // BHM mode lengths, completely contiguous
+  const void* logsumexp_ptr;
 };
 
-struct GroupedBackwardParams
-{
-    int num_batches;
-    int M;   // total seq_len for all queries in the batch
-    int N;   // total seq_len for all keys/values in the batch
-    int Hq;  // number of heads for Query
-    int Hkv; // number of heads for Key and Value
-    int K;   // embed_dim for Query and Key
-    int Kv;  // embed_dim for Value
-
-    int max_seqlen_q;
-
-    std::vector<int> host_seqstart_q;
-    std::vector<int> host_seqstart_k;
-    std::vector<int> host_seqlen_k;
-
-    float scale;
-    bool has_attn_bias;
-    bool bias_has_grad;
-
-    bool use_fp32_qkv_grad;
-    bool is_mqa_gqa;
-
-    // MHK mode strides, last-dim contiguous
-    std::array<int, 3> q_strides;
-    std::array<int, 3> k_strides;
-    std::array<int, 3> v_strides;
-    std::array<int, 3> out_strides;
-    // 4d tensor view [B, H, M, N]
-    std::array<int, 4> attn_bias_strides;
-
-    std::array<int, 3> tmp_grad_k_strides;
-    std::array<int, 3> tmp_grad_v_strides;
-
-    std::vector<const void*> q_ptrs;
-    std::vector<const void*> k_ptrs;
-    std::vector<const void*> v_ptrs;
-    std::vector<const void*> attn_bias_ptrs;
-    std::vector<const void*> grad_out_ptrs;
-    std::vector<const void*> out_ptrs;
-
-    // used by the light_v2 kernel
-    // TODO use these as workspace
-    std::vector<void*> ydotdy_ptrs;
-
-    uint8_t custom_mask_type;
-
-    std::vector<void*> grad_q_ptrs;
-    std::vector<void*> grad_k_ptrs;
-    std::vector<void*> grad_v_ptrs;
-    std::vector<void*> grad_bias_ptrs;
-
-    float dropout_prob;
-    int64_t philox_seed;
-    int64_t philox_offset;
-
-    // BHM mode lengths, completely contiguous
-    std::vector<const void*> logsumexp_ptrs;
-
-    // TODO: need remove this after dev-op fix
-    std::vector<void*> randvals_ptrs;
+struct GroupedBackwardParams {
+  int num_batches;
+  int M; // total seq_len for all queries in the batch
+  int N; // total seq_len for all keys/values in the batch
+  int Hq; // number of heads for Query
+  int Hkv; // number of heads for Key and Value
+  int K; // embed_dim for Query and Key
+  int Kv; // embed_dim for Value
+
+  int max_seqlen_q;
+
+  std::vector<int> host_seqstart_q;
+  std::vector<int> host_seqstart_k;
+  std::vector<int> host_seqlen_k;
+
+  float scale;
+  bool has_attn_bias;
+  bool bias_has_grad;
+
+  bool use_fp32_qkv_grad;
+  bool is_mqa_gqa;
+
+  // MHK mode strides, last-dim contiguous
+  std::array<int, 3> q_strides;
+  std::array<int, 3> k_strides;
+  std::array<int, 3> v_strides;
+  std::array<int, 3> out_strides;
+  // 4d tensor view [B, H, M, N]
+  std::array<int, 4> attn_bias_strides;
+
+  std::array<int, 3> tmp_grad_k_strides;
+  std::array<int, 3> tmp_grad_v_strides;
+
+  std::vector<const void*> q_ptrs;
+  std::vector<const void*> k_ptrs;
+  std::vector<const void*> v_ptrs;
+  std::vector<const void*> attn_bias_ptrs;
+  std::vector<const void*> grad_out_ptrs;
+  std::vector<const void*> out_ptrs;
+
+  // used by the light_v2 kernel
+  // TODO use these as workspace
+  std::vector<void*> ydotdy_ptrs;
+
+  uint8_t custom_mask_type;
+
+  std::vector<void*> grad_q_ptrs;
+  std::vector<void*> grad_k_ptrs;
+  std::vector<void*> grad_v_ptrs;
+  std::vector<void*> grad_bias_ptrs;
+
+  float dropout_prob;
+  int64_t philox_seed;
+  int64_t philox_offset;
+
+  // BHM mode lengths, completely contiguous
+  std::vector<const void*> logsumexp_ptrs;
+
+  // TODO: need remove this after dev-op fix
+  std::vector<void*> randvals_ptrs;
 };
diff --git a/xformers/csrc/attention/hip_fmha/ck_tiled_headdim_switch.h b/xformers/csrc/attention/hip_fmha/ck_tiled_headdim_switch.h
index 6043ebcd0..6de737c80 100644
--- a/xformers/csrc/attention/hip_fmha/ck_tiled_headdim_switch.h
+++ b/xformers/csrc/attention/hip_fmha/ck_tiled_headdim_switch.h
@@ -9,29 +9,20 @@
 #include <stdexcept>
 
 #define FMHA_FWD_HEADDIM_SWITCH(HEAD_DIM1, HEAD_DIM2, CONST_NAME, ...) \
-    [&] {                                                              \
-        if(HEAD_DIM1 <= 32 && HEAD_DIM2 <= 32)                         \
-        {                                                              \
-            constexpr ck::index_t CONST_NAME = 32;                     \
-            __VA_ARGS__();                                             \
-        }                                                              \
-        else if(HEAD_DIM1 <= 64 && HEAD_DIM2 <= 64)                    \
-        {                                                              \
-            constexpr ck::index_t CONST_NAME = 64;                     \
-            __VA_ARGS__();                                             \
-        }                                                              \
-        else if(HEAD_DIM1 <= 128 && HEAD_DIM2 <= 128)                  \
-        {                                                              \
-            constexpr ck::index_t CONST_NAME = 128;                    \
-            __VA_ARGS__();                                             \
-        }                                                              \
-        else if(HEAD_DIM1 <= 256 && HEAD_DIM2 <= 256)                  \
-        {                                                              \
-            constexpr ck::index_t CONST_NAME = 256;                    \
-            __VA_ARGS__();                                             \
-        }                                                              \
-        else                                                           \
-        {                                                              \
-            throw std::runtime_error("Head-dim sizes not supported!"); \
-        }                                                              \
-    }()
+  [&] {                                                                \
+    if (HEAD_DIM1 <= 32 && HEAD_DIM2 <= 32) {                          \
+      constexpr ck::index_t CONST_NAME = 32;                           \
+      __VA_ARGS__();                                                   \
+    } else if (HEAD_DIM1 <= 64 && HEAD_DIM2 <= 64) {                   \
+      constexpr ck::index_t CONST_NAME = 64;                           \
+      __VA_ARGS__();                                                   \
+    } else if (HEAD_DIM1 <= 128 && HEAD_DIM2 <= 128) {                 \
+      constexpr ck::index_t CONST_NAME = 128;                          \
+      __VA_ARGS__();                                                   \
+    } else if (HEAD_DIM1 <= 256 && HEAD_DIM2 <= 256) {                 \
+      constexpr ck::index_t CONST_NAME = 256;                          \
+      __VA_ARGS__();                                                   \
+    } else {                                                           \
+      throw std::runtime_error("Head-dim sizes not supported!");       \
+    }                                                                  \
+  }()
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_bp16_masktype_0_no_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_bp16_masktype_0_no_attnbias.cpp
index 36e9cf24d..509f83827 100644
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_bp16_masktype_0_no_attnbias.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_bp16_masktype_0_no_attnbias.cpp
@@ -7,5 +7,8 @@
 #include <ck/ck.hpp>
 #include "ck_fmha_batched_backward.h"
 
-template void run_batched_backward_masktype_attnbias_dispatched<ck::bhalf_t, 0, false, false>(
-    BatchedBackwardParams& param, hipStream_t stream);
+template void run_batched_backward_masktype_attnbias_dispatched<
+    ck::bhalf_t,
+    0,
+    false,
+    false>(BatchedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_bp16_masktype_0_no_attnbias_fp32_grad.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_bp16_masktype_0_no_attnbias_fp32_grad.cpp
index a44c7f83a..239204ad2 100644
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_bp16_masktype_0_no_attnbias_fp32_grad.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_bp16_masktype_0_no_attnbias_fp32_grad.cpp
@@ -7,5 +7,8 @@
 #include <ck/ck.hpp>
 #include "ck_fmha_batched_backward.h"
 
-template void run_batched_backward_masktype_attnbias_dispatched<ck::bhalf_t, 0, false, true>(
-    BatchedBackwardParams& param, hipStream_t stream);
+template void run_batched_backward_masktype_attnbias_dispatched<
+    ck::bhalf_t,
+    0,
+    false,
+    true>(BatchedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_bp16_masktype_0_with_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_bp16_masktype_0_with_attnbias.cpp
index 2c6fa3f58..06c4370ff 100644
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_bp16_masktype_0_with_attnbias.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_bp16_masktype_0_with_attnbias.cpp
@@ -7,5 +7,8 @@
 #include <ck/ck.hpp>
 #include "ck_fmha_batched_backward.h"
 
-template void run_batched_backward_masktype_attnbias_dispatched<ck::bhalf_t, 0, true, false>(
-    BatchedBackwardParams& param, hipStream_t stream);
+template void run_batched_backward_masktype_attnbias_dispatched<
+    ck::bhalf_t,
+    0,
+    true,
+    false>(BatchedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_bp16_masktype_0_with_attnbias_fp32_grad.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_bp16_masktype_0_with_attnbias_fp32_grad.cpp
index 8ea38c8b6..c5263f167 100644
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_bp16_masktype_0_with_attnbias_fp32_grad.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_bp16_masktype_0_with_attnbias_fp32_grad.cpp
@@ -7,5 +7,8 @@
 #include <ck/ck.hpp>
 #include "ck_fmha_batched_backward.h"
 
-template void run_batched_backward_masktype_attnbias_dispatched<ck::bhalf_t, 0, true, true>(
-    BatchedBackwardParams& param, hipStream_t stream);
+template void run_batched_backward_masktype_attnbias_dispatched<
+    ck::bhalf_t,
+    0,
+    true,
+    true>(BatchedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_bp16_masktype_1_no_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_bp16_masktype_1_no_attnbias.cpp
index 8dfa5aaae..706bf4146 100644
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_bp16_masktype_1_no_attnbias.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_bp16_masktype_1_no_attnbias.cpp
@@ -7,5 +7,8 @@
 #include <ck/ck.hpp>
 #include "ck_fmha_batched_backward.h"
 
-template void run_batched_backward_masktype_attnbias_dispatched<ck::bhalf_t, 1, false, false>(
-    BatchedBackwardParams& param, hipStream_t stream);
+template void run_batched_backward_masktype_attnbias_dispatched<
+    ck::bhalf_t,
+    1,
+    false,
+    false>(BatchedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_bp16_masktype_1_no_attnbias_fp32_grad.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_bp16_masktype_1_no_attnbias_fp32_grad.cpp
index fbbbc2d61..91aac31d9 100644
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_bp16_masktype_1_no_attnbias_fp32_grad.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_bp16_masktype_1_no_attnbias_fp32_grad.cpp
@@ -7,5 +7,8 @@
 #include <ck/ck.hpp>
 #include "ck_fmha_batched_backward.h"
 
-template void run_batched_backward_masktype_attnbias_dispatched<ck::bhalf_t, 1, false, true>(
-    BatchedBackwardParams& param, hipStream_t stream);
+template void run_batched_backward_masktype_attnbias_dispatched<
+    ck::bhalf_t,
+    1,
+    false,
+    true>(BatchedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_bp16_masktype_1_with_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_bp16_masktype_1_with_attnbias.cpp
index 66a2acb12..c882648e5 100644
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_bp16_masktype_1_with_attnbias.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_bp16_masktype_1_with_attnbias.cpp
@@ -7,5 +7,8 @@
 #include <ck/ck.hpp>
 #include "ck_fmha_batched_backward.h"
 
-template void run_batched_backward_masktype_attnbias_dispatched<ck::bhalf_t, 1, true, false>(
-    BatchedBackwardParams& param, hipStream_t stream);
+template void run_batched_backward_masktype_attnbias_dispatched<
+    ck::bhalf_t,
+    1,
+    true,
+    false>(BatchedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_bp16_masktype_1_with_attnbias_fp32_grad.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_bp16_masktype_1_with_attnbias_fp32_grad.cpp
index 59dcd373b..5ce517a80 100644
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_bp16_masktype_1_with_attnbias_fp32_grad.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_bp16_masktype_1_with_attnbias_fp32_grad.cpp
@@ -7,5 +7,8 @@
 #include <ck/ck.hpp>
 #include "ck_fmha_batched_backward.h"
 
-template void run_batched_backward_masktype_attnbias_dispatched<ck::bhalf_t, 1, true, true>(
-    BatchedBackwardParams& param, hipStream_t stream);
+template void run_batched_backward_masktype_attnbias_dispatched<
+    ck::bhalf_t,
+    1,
+    true,
+    true>(BatchedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_bp16_masktype_2_no_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_bp16_masktype_2_no_attnbias.cpp
index 29f9ea02d..983538314 100644
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_bp16_masktype_2_no_attnbias.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_bp16_masktype_2_no_attnbias.cpp
@@ -7,5 +7,8 @@
 #include <ck/ck.hpp>
 #include "ck_fmha_batched_backward.h"
 
-template void run_batched_backward_masktype_attnbias_dispatched<ck::bhalf_t, 2, false, false>(
-    BatchedBackwardParams& param, hipStream_t stream);
+template void run_batched_backward_masktype_attnbias_dispatched<
+    ck::bhalf_t,
+    2,
+    false,
+    false>(BatchedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_bp16_masktype_2_no_attnbias_fp32_grad.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_bp16_masktype_2_no_attnbias_fp32_grad.cpp
index 4bf813296..3202979ac 100644
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_bp16_masktype_2_no_attnbias_fp32_grad.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_bp16_masktype_2_no_attnbias_fp32_grad.cpp
@@ -7,5 +7,8 @@
 #include <ck/ck.hpp>
 #include "ck_fmha_batched_backward.h"
 
-template void run_batched_backward_masktype_attnbias_dispatched<ck::bhalf_t, 2, false, true>(
-    BatchedBackwardParams& param, hipStream_t stream);
+template void run_batched_backward_masktype_attnbias_dispatched<
+    ck::bhalf_t,
+    2,
+    false,
+    true>(BatchedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_bp16_masktype_2_with_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_bp16_masktype_2_with_attnbias.cpp
index ec12b66c7..68b4d782a 100644
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_bp16_masktype_2_with_attnbias.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_bp16_masktype_2_with_attnbias.cpp
@@ -7,5 +7,8 @@
 #include <ck/ck.hpp>
 #include "ck_fmha_batched_backward.h"
 
-template void run_batched_backward_masktype_attnbias_dispatched<ck::bhalf_t, 2, true, false>(
-    BatchedBackwardParams& param, hipStream_t stream);
+template void run_batched_backward_masktype_attnbias_dispatched<
+    ck::bhalf_t,
+    2,
+    true,
+    false>(BatchedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_bp16_masktype_2_with_attnbias_fp32_grad.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_bp16_masktype_2_with_attnbias_fp32_grad.cpp
index 947faaa83..a7786f596 100644
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_bp16_masktype_2_with_attnbias_fp32_grad.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_bp16_masktype_2_with_attnbias_fp32_grad.cpp
@@ -7,5 +7,8 @@
 #include <ck/ck.hpp>
 #include "ck_fmha_batched_backward.h"
 
-template void run_batched_backward_masktype_attnbias_dispatched<ck::bhalf_t, 2, true, true>(
-    BatchedBackwardParams& param, hipStream_t stream);
+template void run_batched_backward_masktype_attnbias_dispatched<
+    ck::bhalf_t,
+    2,
+    true,
+    true>(BatchedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_fp16_masktype_0_no_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_fp16_masktype_0_no_attnbias.cpp
index a1e22812a..8205af6fa 100644
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_fp16_masktype_0_no_attnbias.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_fp16_masktype_0_no_attnbias.cpp
@@ -7,5 +7,8 @@
 #include <ck/ck.hpp>
 #include "ck_fmha_batched_backward.h"
 
-template void run_batched_backward_masktype_attnbias_dispatched<ck::half_t, 0, false, false>(
-    BatchedBackwardParams& param, hipStream_t stream);
+template void run_batched_backward_masktype_attnbias_dispatched<
+    ck::half_t,
+    0,
+    false,
+    false>(BatchedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_fp16_masktype_0_no_attnbias_fp32_grad.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_fp16_masktype_0_no_attnbias_fp32_grad.cpp
index de7ee388b..b69fdda9b 100644
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_fp16_masktype_0_no_attnbias_fp32_grad.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_fp16_masktype_0_no_attnbias_fp32_grad.cpp
@@ -7,5 +7,8 @@
 #include <ck/ck.hpp>
 #include "ck_fmha_batched_backward.h"
 
-template void run_batched_backward_masktype_attnbias_dispatched<ck::half_t, 0, false, true>(
-    BatchedBackwardParams& param, hipStream_t stream);
+template void run_batched_backward_masktype_attnbias_dispatched<
+    ck::half_t,
+    0,
+    false,
+    true>(BatchedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_fp16_masktype_0_with_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_fp16_masktype_0_with_attnbias.cpp
index de45cee54..786b294ee 100644
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_fp16_masktype_0_with_attnbias.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_fp16_masktype_0_with_attnbias.cpp
@@ -7,5 +7,8 @@
 #include <ck/ck.hpp>
 #include "ck_fmha_batched_backward.h"
 
-template void run_batched_backward_masktype_attnbias_dispatched<ck::half_t, 0, true, false>(
-    BatchedBackwardParams& param, hipStream_t stream);
+template void run_batched_backward_masktype_attnbias_dispatched<
+    ck::half_t,
+    0,
+    true,
+    false>(BatchedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_fp16_masktype_0_with_attnbias_fp32_grad.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_fp16_masktype_0_with_attnbias_fp32_grad.cpp
index d0e3c83c8..8bebad6d1 100644
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_fp16_masktype_0_with_attnbias_fp32_grad.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_fp16_masktype_0_with_attnbias_fp32_grad.cpp
@@ -7,5 +7,8 @@
 #include <ck/ck.hpp>
 #include "ck_fmha_batched_backward.h"
 
-template void run_batched_backward_masktype_attnbias_dispatched<ck::half_t, 0, true, true>(
-    BatchedBackwardParams& param, hipStream_t stream);
+template void run_batched_backward_masktype_attnbias_dispatched<
+    ck::half_t,
+    0,
+    true,
+    true>(BatchedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_fp16_masktype_1_no_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_fp16_masktype_1_no_attnbias.cpp
index 0a125b480..47bfbb6ba 100644
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_fp16_masktype_1_no_attnbias.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_fp16_masktype_1_no_attnbias.cpp
@@ -7,5 +7,8 @@
 #include <ck/ck.hpp>
 #include "ck_fmha_batched_backward.h"
 
-template void run_batched_backward_masktype_attnbias_dispatched<ck::half_t, 1, false, false>(
-    BatchedBackwardParams& param, hipStream_t stream);
+template void run_batched_backward_masktype_attnbias_dispatched<
+    ck::half_t,
+    1,
+    false,
+    false>(BatchedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_fp16_masktype_1_no_attnbias_fp32_grad.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_fp16_masktype_1_no_attnbias_fp32_grad.cpp
index 511598a23..b3efcb0f6 100644
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_fp16_masktype_1_no_attnbias_fp32_grad.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_fp16_masktype_1_no_attnbias_fp32_grad.cpp
@@ -7,5 +7,8 @@
 #include <ck/ck.hpp>
 #include "ck_fmha_batched_backward.h"
 
-template void run_batched_backward_masktype_attnbias_dispatched<ck::half_t, 1, false, true>(
-    BatchedBackwardParams& param, hipStream_t stream);
+template void run_batched_backward_masktype_attnbias_dispatched<
+    ck::half_t,
+    1,
+    false,
+    true>(BatchedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_fp16_masktype_1_with_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_fp16_masktype_1_with_attnbias.cpp
index bb6ba7b58..366a1be0b 100644
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_fp16_masktype_1_with_attnbias.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_fp16_masktype_1_with_attnbias.cpp
@@ -7,5 +7,8 @@
 #include <ck/ck.hpp>
 #include "ck_fmha_batched_backward.h"
 
-template void run_batched_backward_masktype_attnbias_dispatched<ck::half_t, 1, true, false>(
-    BatchedBackwardParams& param, hipStream_t stream);
+template void run_batched_backward_masktype_attnbias_dispatched<
+    ck::half_t,
+    1,
+    true,
+    false>(BatchedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_fp16_masktype_1_with_attnbias_fp32_grad.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_fp16_masktype_1_with_attnbias_fp32_grad.cpp
index e260e288c..a1b19853c 100644
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_fp16_masktype_1_with_attnbias_fp32_grad.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_fp16_masktype_1_with_attnbias_fp32_grad.cpp
@@ -9,5 +9,8 @@
 
 #include "ck_fmha_batched_backward.h"
 
-template void run_batched_backward_masktype_attnbias_dispatched<ck::half_t, 1, true, true>(
-    BatchedBackwardParams& param, hipStream_t stream);
+template void run_batched_backward_masktype_attnbias_dispatched<
+    ck::half_t,
+    1,
+    true,
+    true>(BatchedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_fp16_masktype_2_no_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_fp16_masktype_2_no_attnbias.cpp
index 8f7501252..c764522f3 100644
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_fp16_masktype_2_no_attnbias.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_fp16_masktype_2_no_attnbias.cpp
@@ -7,5 +7,8 @@
 #include <ck/ck.hpp>
 #include "ck_fmha_batched_backward.h"
 
-template void run_batched_backward_masktype_attnbias_dispatched<ck::half_t, 2, false, false>(
-    BatchedBackwardParams& param, hipStream_t stream);
+template void run_batched_backward_masktype_attnbias_dispatched<
+    ck::half_t,
+    2,
+    false,
+    false>(BatchedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_fp16_masktype_2_no_attnbias_fp32_grad.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_fp16_masktype_2_no_attnbias_fp32_grad.cpp
index 47cb68b98..53e93ab40 100644
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_fp16_masktype_2_no_attnbias_fp32_grad.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_fp16_masktype_2_no_attnbias_fp32_grad.cpp
@@ -7,5 +7,8 @@
 #include <ck/ck.hpp>
 #include "ck_fmha_batched_backward.h"
 
-template void run_batched_backward_masktype_attnbias_dispatched<ck::half_t, 2, false, true>(
-    BatchedBackwardParams& param, hipStream_t stream);
+template void run_batched_backward_masktype_attnbias_dispatched<
+    ck::half_t,
+    2,
+    false,
+    true>(BatchedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_fp16_masktype_2_with_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_fp16_masktype_2_with_attnbias.cpp
index 34b331814..135932bb6 100644
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_fp16_masktype_2_with_attnbias.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_fp16_masktype_2_with_attnbias.cpp
@@ -7,5 +7,8 @@
 #include <ck/ck.hpp>
 #include "ck_fmha_batched_backward.h"
 
-template void run_batched_backward_masktype_attnbias_dispatched<ck::half_t, 2, true, false>(
-    BatchedBackwardParams& param, hipStream_t stream);
+template void run_batched_backward_masktype_attnbias_dispatched<
+    ck::half_t,
+    2,
+    true,
+    false>(BatchedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_fp16_masktype_2_with_attnbias_fp32_grad.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_fp16_masktype_2_with_attnbias_fp32_grad.cpp
index 9a46d6678..b36435a56 100644
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_fp16_masktype_2_with_attnbias_fp32_grad.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_fp16_masktype_2_with_attnbias_fp32_grad.cpp
@@ -7,5 +7,8 @@
 #include <ck/ck.hpp>
 #include "ck_fmha_batched_backward.h"
 
-template void run_batched_backward_masktype_attnbias_dispatched<ck::half_t, 2, true, true>(
-    BatchedBackwardParams& param, hipStream_t stream);
+template void run_batched_backward_masktype_attnbias_dispatched<
+    ck::half_t,
+    2,
+    true,
+    true>(BatchedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_forward_bp16_masktype_0_no_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_forward_bp16_masktype_0_no_attnbias.cpp
index 0027e6fa6..61a34f3bd 100644
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_forward_bp16_masktype_0_no_attnbias.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_forward_bp16_masktype_0_no_attnbias.cpp
@@ -7,6 +7,7 @@
 #include <ck/ck.hpp>
 #include "ck_fmha_batched_forward.h"
 
-template void
-run_batched_forward_masktype_attnbias_dispatched<ck::bhalf_t, 0, false>(BatchedForwardParams& param,
-                                                                        hipStream_t stream);
+template void run_batched_forward_masktype_attnbias_dispatched<
+    ck::bhalf_t,
+    0,
+    false>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_forward_bp16_masktype_0_with_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_forward_bp16_masktype_0_with_attnbias.cpp
index 01b4ab6a1..99ef697c7 100644
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_forward_bp16_masktype_0_with_attnbias.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_forward_bp16_masktype_0_with_attnbias.cpp
@@ -7,6 +7,7 @@
 #include <ck/ck.hpp>
 #include "ck_fmha_batched_forward.h"
 
-template void
-run_batched_forward_masktype_attnbias_dispatched<ck::bhalf_t, 0, true>(BatchedForwardParams& param,
-                                                                       hipStream_t stream);
+template void run_batched_forward_masktype_attnbias_dispatched<
+    ck::bhalf_t,
+    0,
+    true>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_forward_bp16_masktype_1_no_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_forward_bp16_masktype_1_no_attnbias.cpp
index fee6af685..27d8f3389 100644
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_forward_bp16_masktype_1_no_attnbias.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_forward_bp16_masktype_1_no_attnbias.cpp
@@ -7,6 +7,7 @@
 #include <ck/ck.hpp>
 #include "ck_fmha_batched_forward.h"
 
-template void
-run_batched_forward_masktype_attnbias_dispatched<ck::bhalf_t, 1, false>(BatchedForwardParams& param,
-                                                                        hipStream_t stream);
+template void run_batched_forward_masktype_attnbias_dispatched<
+    ck::bhalf_t,
+    1,
+    false>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_forward_bp16_masktype_1_with_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_forward_bp16_masktype_1_with_attnbias.cpp
index 3b22467b8..9b81f64c1 100644
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_forward_bp16_masktype_1_with_attnbias.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_forward_bp16_masktype_1_with_attnbias.cpp
@@ -7,6 +7,7 @@
 #include <ck/ck.hpp>
 #include "ck_fmha_batched_forward.h"
 
-template void
-run_batched_forward_masktype_attnbias_dispatched<ck::bhalf_t, 1, true>(BatchedForwardParams& param,
-                                                                       hipStream_t stream);
+template void run_batched_forward_masktype_attnbias_dispatched<
+    ck::bhalf_t,
+    1,
+    true>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_forward_bp16_masktype_2_no_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_forward_bp16_masktype_2_no_attnbias.cpp
index 0964fea9a..014b077e3 100644
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_forward_bp16_masktype_2_no_attnbias.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_forward_bp16_masktype_2_no_attnbias.cpp
@@ -7,6 +7,7 @@
 #include <ck/ck.hpp>
 #include "ck_fmha_batched_forward.h"
 
-template void
-run_batched_forward_masktype_attnbias_dispatched<ck::bhalf_t, 2, false>(BatchedForwardParams& param,
-                                                                        hipStream_t stream);
+template void run_batched_forward_masktype_attnbias_dispatched<
+    ck::bhalf_t,
+    2,
+    false>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_forward_bp16_masktype_2_with_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_forward_bp16_masktype_2_with_attnbias.cpp
index 9ddde1484..9a5b10848 100644
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_forward_bp16_masktype_2_with_attnbias.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_forward_bp16_masktype_2_with_attnbias.cpp
@@ -7,6 +7,7 @@
 #include <ck/ck.hpp>
 #include "ck_fmha_batched_forward.h"
 
-template void
-run_batched_forward_masktype_attnbias_dispatched<ck::bhalf_t, 2, true>(BatchedForwardParams& param,
-                                                                       hipStream_t stream);
+template void run_batched_forward_masktype_attnbias_dispatched<
+    ck::bhalf_t,
+    2,
+    true>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_forward_fp16_masktype_0_no_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_forward_fp16_masktype_0_no_attnbias.cpp
index 4e47a02b8..52a38e71f 100644
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_forward_fp16_masktype_0_no_attnbias.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_forward_fp16_masktype_0_no_attnbias.cpp
@@ -7,6 +7,7 @@
 #include <ck/ck.hpp>
 #include "ck_fmha_batched_forward.h"
 
-template void
-run_batched_forward_masktype_attnbias_dispatched<ck::half_t, 0, false>(BatchedForwardParams& param,
-                                                                       hipStream_t stream);
+template void run_batched_forward_masktype_attnbias_dispatched<
+    ck::half_t,
+    0,
+    false>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_forward_fp16_masktype_0_with_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_forward_fp16_masktype_0_with_attnbias.cpp
index a99e2cf17..b96463d83 100644
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_forward_fp16_masktype_0_with_attnbias.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_forward_fp16_masktype_0_with_attnbias.cpp
@@ -7,6 +7,7 @@
 #include <ck/ck.hpp>
 #include "ck_fmha_batched_forward.h"
 
-template void
-run_batched_forward_masktype_attnbias_dispatched<ck::half_t, 0, true>(BatchedForwardParams& param,
-                                                                      hipStream_t stream);
+template void run_batched_forward_masktype_attnbias_dispatched<
+    ck::half_t,
+    0,
+    true>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_forward_fp16_masktype_1_no_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_forward_fp16_masktype_1_no_attnbias.cpp
index b0617fe73..dd4a8d4e2 100644
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_forward_fp16_masktype_1_no_attnbias.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_forward_fp16_masktype_1_no_attnbias.cpp
@@ -7,6 +7,7 @@
 #include <ck/ck.hpp>
 #include "ck_fmha_batched_forward.h"
 
-template void
-run_batched_forward_masktype_attnbias_dispatched<ck::half_t, 1, false>(BatchedForwardParams& param,
-                                                                       hipStream_t stream);
+template void run_batched_forward_masktype_attnbias_dispatched<
+    ck::half_t,
+    1,
+    false>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_forward_fp16_masktype_1_with_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_forward_fp16_masktype_1_with_attnbias.cpp
index d00e4e2ac..6fd666459 100644
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_forward_fp16_masktype_1_with_attnbias.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_forward_fp16_masktype_1_with_attnbias.cpp
@@ -7,6 +7,7 @@
 #include <ck/ck.hpp>
 #include "ck_fmha_batched_forward.h"
 
-template void
-run_batched_forward_masktype_attnbias_dispatched<ck::half_t, 1, true>(BatchedForwardParams& param,
-                                                                      hipStream_t stream);
+template void run_batched_forward_masktype_attnbias_dispatched<
+    ck::half_t,
+    1,
+    true>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_forward_fp16_masktype_2_no_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_forward_fp16_masktype_2_no_attnbias.cpp
index 6a2215ae0..e2c25b131 100644
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_forward_fp16_masktype_2_no_attnbias.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_forward_fp16_masktype_2_no_attnbias.cpp
@@ -7,6 +7,7 @@
 #include <ck/ck.hpp>
 #include "ck_fmha_batched_forward.h"
 
-template void
-run_batched_forward_masktype_attnbias_dispatched<ck::half_t, 2, false>(BatchedForwardParams& param,
-                                                                       hipStream_t stream);
+template void run_batched_forward_masktype_attnbias_dispatched<
+    ck::half_t,
+    2,
+    false>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_forward_fp16_masktype_2_with_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_forward_fp16_masktype_2_with_attnbias.cpp
index 43dc7c78f..daee90785 100644
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_forward_fp16_masktype_2_with_attnbias.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_forward_fp16_masktype_2_with_attnbias.cpp
@@ -7,6 +7,7 @@
 #include <ck/ck.hpp>
 #include "ck_fmha_batched_forward.h"
 
-template void
-run_batched_forward_masktype_attnbias_dispatched<ck::half_t, 2, true>(BatchedForwardParams& param,
-                                                                      hipStream_t stream);
+template void run_batched_forward_masktype_attnbias_dispatched<
+    ck::half_t,
+    2,
+    true>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_infer_bp16_masktype_0_no_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_infer_bp16_masktype_0_no_attnbias.cpp
index 11c575371..fae4e95db 100644
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_infer_bp16_masktype_0_no_attnbias.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_infer_bp16_masktype_0_no_attnbias.cpp
@@ -8,6 +8,7 @@
 
 #include "ck_fmha_batched_infer.h"
 
-template void
-run_batched_infer_masktype_attnbias_dispatched<ck::bhalf_t, 0, false>(BatchedForwardParams& param,
-                                                                      hipStream_t stream);
+template void run_batched_infer_masktype_attnbias_dispatched<
+    ck::bhalf_t,
+    0,
+    false>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_infer_bp16_masktype_0_with_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_infer_bp16_masktype_0_with_attnbias.cpp
index 6ed03ba3b..3ea61a46a 100644
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_infer_bp16_masktype_0_with_attnbias.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_infer_bp16_masktype_0_with_attnbias.cpp
@@ -8,6 +8,7 @@
 
 #include "ck_fmha_batched_infer.h"
 
-template void
-run_batched_infer_masktype_attnbias_dispatched<ck::bhalf_t, 0, true>(BatchedForwardParams& param,
-                                                                     hipStream_t stream);
+template void run_batched_infer_masktype_attnbias_dispatched<
+    ck::bhalf_t,
+    0,
+    true>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_infer_bp16_masktype_1_no_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_infer_bp16_masktype_1_no_attnbias.cpp
index cbb2f1e37..aa01129f8 100644
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_infer_bp16_masktype_1_no_attnbias.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_infer_bp16_masktype_1_no_attnbias.cpp
@@ -8,6 +8,7 @@
 
 #include "ck_fmha_batched_infer.h"
 
-template void
-run_batched_infer_masktype_attnbias_dispatched<ck::bhalf_t, 1, false>(BatchedForwardParams& param,
-                                                                      hipStream_t stream);
+template void run_batched_infer_masktype_attnbias_dispatched<
+    ck::bhalf_t,
+    1,
+    false>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_infer_bp16_masktype_1_with_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_infer_bp16_masktype_1_with_attnbias.cpp
index e53d44ff4..1596dbea9 100644
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_infer_bp16_masktype_1_with_attnbias.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_infer_bp16_masktype_1_with_attnbias.cpp
@@ -8,6 +8,7 @@
 
 #include "ck_fmha_batched_infer.h"
 
-template void
-run_batched_infer_masktype_attnbias_dispatched<ck::bhalf_t, 1, true>(BatchedForwardParams& param,
-                                                                     hipStream_t stream);
+template void run_batched_infer_masktype_attnbias_dispatched<
+    ck::bhalf_t,
+    1,
+    true>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_infer_bp16_masktype_2_no_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_infer_bp16_masktype_2_no_attnbias.cpp
index 96454b7d8..d5a27c62a 100644
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_infer_bp16_masktype_2_no_attnbias.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_infer_bp16_masktype_2_no_attnbias.cpp
@@ -8,6 +8,7 @@
 
 #include "ck_fmha_batched_infer.h"
 
-template void
-run_batched_infer_masktype_attnbias_dispatched<ck::bhalf_t, 2, false>(BatchedForwardParams& param,
-                                                                      hipStream_t stream);
+template void run_batched_infer_masktype_attnbias_dispatched<
+    ck::bhalf_t,
+    2,
+    false>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_infer_bp16_masktype_2_with_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_infer_bp16_masktype_2_with_attnbias.cpp
index ecfd4bd2e..b47dcb485 100644
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_infer_bp16_masktype_2_with_attnbias.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_infer_bp16_masktype_2_with_attnbias.cpp
@@ -8,6 +8,7 @@
 
 #include "ck_fmha_batched_infer.h"
 
-template void
-run_batched_infer_masktype_attnbias_dispatched<ck::bhalf_t, 2, true>(BatchedForwardParams& param,
-                                                                     hipStream_t stream);
+template void run_batched_infer_masktype_attnbias_dispatched<
+    ck::bhalf_t,
+    2,
+    true>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_infer_fp16_masktype_0_no_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_infer_fp16_masktype_0_no_attnbias.cpp
index b73d06a5c..2144a980e 100644
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_infer_fp16_masktype_0_no_attnbias.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_infer_fp16_masktype_0_no_attnbias.cpp
@@ -8,6 +8,7 @@
 
 #include "ck_fmha_batched_infer.h"
 
-template void
-run_batched_infer_masktype_attnbias_dispatched<ck::half_t, 0, false>(BatchedForwardParams& param,
-                                                                     hipStream_t stream);
+template void run_batched_infer_masktype_attnbias_dispatched<
+    ck::half_t,
+    0,
+    false>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_infer_fp16_masktype_0_with_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_infer_fp16_masktype_0_with_attnbias.cpp
index 3ebf195d7..961a5b8f9 100644
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_infer_fp16_masktype_0_with_attnbias.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_infer_fp16_masktype_0_with_attnbias.cpp
@@ -8,6 +8,7 @@
 
 #include "ck_fmha_batched_infer.h"
 
-template void
-run_batched_infer_masktype_attnbias_dispatched<ck::half_t, 0, true>(BatchedForwardParams& param,
-                                                                    hipStream_t stream);
+template void run_batched_infer_masktype_attnbias_dispatched<
+    ck::half_t,
+    0,
+    true>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_infer_fp16_masktype_1_no_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_infer_fp16_masktype_1_no_attnbias.cpp
index 1f56500ce..308adb597 100644
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_infer_fp16_masktype_1_no_attnbias.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_infer_fp16_masktype_1_no_attnbias.cpp
@@ -8,6 +8,7 @@
 
 #include "ck_fmha_batched_infer.h"
 
-template void
-run_batched_infer_masktype_attnbias_dispatched<ck::half_t, 1, false>(BatchedForwardParams& param,
-                                                                     hipStream_t stream);
+template void run_batched_infer_masktype_attnbias_dispatched<
+    ck::half_t,
+    1,
+    false>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_infer_fp16_masktype_1_with_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_infer_fp16_masktype_1_with_attnbias.cpp
index 2cbb237cc..dd24e182b 100644
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_infer_fp16_masktype_1_with_attnbias.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_infer_fp16_masktype_1_with_attnbias.cpp
@@ -8,6 +8,7 @@
 
 #include "ck_fmha_batched_infer.h"
 
-template void
-run_batched_infer_masktype_attnbias_dispatched<ck::half_t, 1, true>(BatchedForwardParams& param,
-                                                                    hipStream_t stream);
+template void run_batched_infer_masktype_attnbias_dispatched<
+    ck::half_t,
+    1,
+    true>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_infer_fp16_masktype_2_no_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_infer_fp16_masktype_2_no_attnbias.cpp
index 441520157..590d032f1 100644
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_infer_fp16_masktype_2_no_attnbias.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_infer_fp16_masktype_2_no_attnbias.cpp
@@ -8,6 +8,7 @@
 
 #include "ck_fmha_batched_infer.h"
 
-template void
-run_batched_infer_masktype_attnbias_dispatched<ck::half_t, 2, false>(BatchedForwardParams& param,
-                                                                     hipStream_t stream);
+template void run_batched_infer_masktype_attnbias_dispatched<
+    ck::half_t,
+    2,
+    false>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_infer_fp16_masktype_2_with_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_infer_fp16_masktype_2_with_attnbias.cpp
index 5e9d21dac..1440164c7 100644
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_infer_fp16_masktype_2_with_attnbias.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_infer_fp16_masktype_2_with_attnbias.cpp
@@ -8,6 +8,7 @@
 
 #include "ck_fmha_batched_infer.h"
 
-template void
-run_batched_infer_masktype_attnbias_dispatched<ck::half_t, 2, true>(BatchedForwardParams& param,
-                                                                    hipStream_t stream);
+template void run_batched_infer_masktype_attnbias_dispatched<
+    ck::half_t,
+    2,
+    true>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_bp16_masktype_0_no_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_bp16_masktype_0_no_attnbias.cpp
index 517b6ab08..ced06186a 100644
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_bp16_masktype_0_no_attnbias.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_bp16_masktype_0_no_attnbias.cpp
@@ -7,5 +7,8 @@
 #include <ck/ck.hpp>
 #include "ck_fmha_grouped_backward.h"
 
-template void run_grouped_backward_masktype_attnbias_dispatched<ck::bhalf_t, 0, false, false>(
-    GroupedBackwardParams& param, hipStream_t stream);
+template void run_grouped_backward_masktype_attnbias_dispatched<
+    ck::bhalf_t,
+    0,
+    false,
+    false>(GroupedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_bp16_masktype_0_no_attnbias_fp32_grad.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_bp16_masktype_0_no_attnbias_fp32_grad.cpp
index eeb4ba125..9f61adfc9 100644
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_bp16_masktype_0_no_attnbias_fp32_grad.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_bp16_masktype_0_no_attnbias_fp32_grad.cpp
@@ -7,5 +7,8 @@
 #include <ck/ck.hpp>
 #include "ck_fmha_grouped_backward.h"
 
-template void run_grouped_backward_masktype_attnbias_dispatched<ck::bhalf_t, 0, false, true>(
-    GroupedBackwardParams& param, hipStream_t stream);
+template void run_grouped_backward_masktype_attnbias_dispatched<
+    ck::bhalf_t,
+    0,
+    false,
+    true>(GroupedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_bp16_masktype_0_with_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_bp16_masktype_0_with_attnbias.cpp
index 179dadebc..2d4b51888 100644
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_bp16_masktype_0_with_attnbias.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_bp16_masktype_0_with_attnbias.cpp
@@ -7,5 +7,8 @@
 #include <ck/ck.hpp>
 #include "ck_fmha_grouped_backward.h"
 
-template void run_grouped_backward_masktype_attnbias_dispatched<ck::bhalf_t, 0, true, false>(
-    GroupedBackwardParams& param, hipStream_t stream);
+template void run_grouped_backward_masktype_attnbias_dispatched<
+    ck::bhalf_t,
+    0,
+    true,
+    false>(GroupedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_bp16_masktype_0_with_attnbias_fp32_grad.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_bp16_masktype_0_with_attnbias_fp32_grad.cpp
index 3b604cd00..a49a8704c 100644
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_bp16_masktype_0_with_attnbias_fp32_grad.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_bp16_masktype_0_with_attnbias_fp32_grad.cpp
@@ -7,5 +7,8 @@
 #include <ck/ck.hpp>
 #include "ck_fmha_grouped_backward.h"
 
-template void run_grouped_backward_masktype_attnbias_dispatched<ck::bhalf_t, 0, true, true>(
-    GroupedBackwardParams& param, hipStream_t stream);
+template void run_grouped_backward_masktype_attnbias_dispatched<
+    ck::bhalf_t,
+    0,
+    true,
+    true>(GroupedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_bp16_masktype_1_no_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_bp16_masktype_1_no_attnbias.cpp
index 07ec9e671..c2279d835 100644
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_bp16_masktype_1_no_attnbias.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_bp16_masktype_1_no_attnbias.cpp
@@ -7,5 +7,8 @@
 #include <ck/ck.hpp>
 #include "ck_fmha_grouped_backward.h"
 
-template void run_grouped_backward_masktype_attnbias_dispatched<ck::bhalf_t, 1, false, false>(
-    GroupedBackwardParams& param, hipStream_t stream);
+template void run_grouped_backward_masktype_attnbias_dispatched<
+    ck::bhalf_t,
+    1,
+    false,
+    false>(GroupedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_bp16_masktype_1_no_attnbias_fp32_grad.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_bp16_masktype_1_no_attnbias_fp32_grad.cpp
index b23b68e21..382bf0143 100644
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_bp16_masktype_1_no_attnbias_fp32_grad.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_bp16_masktype_1_no_attnbias_fp32_grad.cpp
@@ -7,5 +7,8 @@
 #include <ck/ck.hpp>
 #include "ck_fmha_grouped_backward.h"
 
-template void run_grouped_backward_masktype_attnbias_dispatched<ck::bhalf_t, 1, false, true>(
-    GroupedBackwardParams& param, hipStream_t stream);
+template void run_grouped_backward_masktype_attnbias_dispatched<
+    ck::bhalf_t,
+    1,
+    false,
+    true>(GroupedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_bp16_masktype_1_with_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_bp16_masktype_1_with_attnbias.cpp
index 2c5cf0189..1b7549e3e 100644
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_bp16_masktype_1_with_attnbias.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_bp16_masktype_1_with_attnbias.cpp
@@ -7,5 +7,8 @@
 #include <ck/ck.hpp>
 #include "ck_fmha_grouped_backward.h"
 
-template void run_grouped_backward_masktype_attnbias_dispatched<ck::bhalf_t, 1, true, false>(
-    GroupedBackwardParams& param, hipStream_t stream);
+template void run_grouped_backward_masktype_attnbias_dispatched<
+    ck::bhalf_t,
+    1,
+    true,
+    false>(GroupedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_bp16_masktype_1_with_attnbias_fp32_grad.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_bp16_masktype_1_with_attnbias_fp32_grad.cpp
index 3dbf05b04..f06694955 100644
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_bp16_masktype_1_with_attnbias_fp32_grad.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_bp16_masktype_1_with_attnbias_fp32_grad.cpp
@@ -7,5 +7,8 @@
 #include <ck/ck.hpp>
 #include "ck_fmha_grouped_backward.h"
 
-template void run_grouped_backward_masktype_attnbias_dispatched<ck::bhalf_t, 1, true, true>(
-    GroupedBackwardParams& param, hipStream_t stream);
+template void run_grouped_backward_masktype_attnbias_dispatched<
+    ck::bhalf_t,
+    1,
+    true,
+    true>(GroupedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_bp16_masktype_2_no_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_bp16_masktype_2_no_attnbias.cpp
index 765eb7fd2..3a86c12f8 100644
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_bp16_masktype_2_no_attnbias.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_bp16_masktype_2_no_attnbias.cpp
@@ -7,5 +7,8 @@
 #include <ck/ck.hpp>
 #include "ck_fmha_grouped_backward.h"
 
-template void run_grouped_backward_masktype_attnbias_dispatched<ck::bhalf_t, 2, false, false>(
-    GroupedBackwardParams& param, hipStream_t stream);
+template void run_grouped_backward_masktype_attnbias_dispatched<
+    ck::bhalf_t,
+    2,
+    false,
+    false>(GroupedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_bp16_masktype_2_no_attnbias_fp32_grad.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_bp16_masktype_2_no_attnbias_fp32_grad.cpp
index 9eae79997..c287a283d 100644
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_bp16_masktype_2_no_attnbias_fp32_grad.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_bp16_masktype_2_no_attnbias_fp32_grad.cpp
@@ -7,5 +7,8 @@
 #include <ck/ck.hpp>
 #include "ck_fmha_grouped_backward.h"
 
-template void run_grouped_backward_masktype_attnbias_dispatched<ck::bhalf_t, 2, false, true>(
-    GroupedBackwardParams& param, hipStream_t stream);
+template void run_grouped_backward_masktype_attnbias_dispatched<
+    ck::bhalf_t,
+    2,
+    false,
+    true>(GroupedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_bp16_masktype_2_with_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_bp16_masktype_2_with_attnbias.cpp
index 2d85adcdc..6b06378dd 100644
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_bp16_masktype_2_with_attnbias.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_bp16_masktype_2_with_attnbias.cpp
@@ -7,5 +7,8 @@
 #include <ck/ck.hpp>
 #include "ck_fmha_grouped_backward.h"
 
-template void run_grouped_backward_masktype_attnbias_dispatched<ck::bhalf_t, 2, true, false>(
-    GroupedBackwardParams& param, hipStream_t stream);
+template void run_grouped_backward_masktype_attnbias_dispatched<
+    ck::bhalf_t,
+    2,
+    true,
+    false>(GroupedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_bp16_masktype_2_with_attnbias_fp32_grad.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_bp16_masktype_2_with_attnbias_fp32_grad.cpp
index 325adcf28..13d1bc553 100644
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_bp16_masktype_2_with_attnbias_fp32_grad.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_bp16_masktype_2_with_attnbias_fp32_grad.cpp
@@ -7,5 +7,8 @@
 #include <ck/ck.hpp>
 #include "ck_fmha_grouped_backward.h"
 
-template void run_grouped_backward_masktype_attnbias_dispatched<ck::bhalf_t, 2, true, true>(
-    GroupedBackwardParams& param, hipStream_t stream);
+template void run_grouped_backward_masktype_attnbias_dispatched<
+    ck::bhalf_t,
+    2,
+    true,
+    true>(GroupedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_fp16_masktype_0_no_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_fp16_masktype_0_no_attnbias.cpp
index 23c7f7360..71cdf5b35 100644
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_fp16_masktype_0_no_attnbias.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_fp16_masktype_0_no_attnbias.cpp
@@ -7,5 +7,8 @@
 #include <ck/ck.hpp>
 #include "ck_fmha_grouped_backward.h"
 
-template void run_grouped_backward_masktype_attnbias_dispatched<ck::half_t, 0, false, false>(
-    GroupedBackwardParams& param, hipStream_t stream);
+template void run_grouped_backward_masktype_attnbias_dispatched<
+    ck::half_t,
+    0,
+    false,
+    false>(GroupedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_fp16_masktype_0_no_attnbias_fp32_grad.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_fp16_masktype_0_no_attnbias_fp32_grad.cpp
index f5095f9e0..792f55e4d 100644
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_fp16_masktype_0_no_attnbias_fp32_grad.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_fp16_masktype_0_no_attnbias_fp32_grad.cpp
@@ -7,5 +7,8 @@
 #include <ck/ck.hpp>
 #include "ck_fmha_grouped_backward.h"
 
-template void run_grouped_backward_masktype_attnbias_dispatched<ck::half_t, 0, false, true>(
-    GroupedBackwardParams& param, hipStream_t stream);
+template void run_grouped_backward_masktype_attnbias_dispatched<
+    ck::half_t,
+    0,
+    false,
+    true>(GroupedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_fp16_masktype_0_with_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_fp16_masktype_0_with_attnbias.cpp
index d893d066c..5776e856d 100644
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_fp16_masktype_0_with_attnbias.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_fp16_masktype_0_with_attnbias.cpp
@@ -7,5 +7,8 @@
 #include <ck/ck.hpp>
 #include "ck_fmha_grouped_backward.h"
 
-template void run_grouped_backward_masktype_attnbias_dispatched<ck::half_t, 0, true, false>(
-    GroupedBackwardParams& param, hipStream_t stream);
+template void run_grouped_backward_masktype_attnbias_dispatched<
+    ck::half_t,
+    0,
+    true,
+    false>(GroupedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_fp16_masktype_0_with_attnbias_fp32_grad.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_fp16_masktype_0_with_attnbias_fp32_grad.cpp
index b81c731c6..d3f2eec10 100644
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_fp16_masktype_0_with_attnbias_fp32_grad.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_fp16_masktype_0_with_attnbias_fp32_grad.cpp
@@ -7,5 +7,8 @@
 #include <ck/ck.hpp>
 #include "ck_fmha_grouped_backward.h"
 
-template void run_grouped_backward_masktype_attnbias_dispatched<ck::half_t, 0, true, true>(
-    GroupedBackwardParams& param, hipStream_t stream);
+template void run_grouped_backward_masktype_attnbias_dispatched<
+    ck::half_t,
+    0,
+    true,
+    true>(GroupedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_fp16_masktype_1_no_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_fp16_masktype_1_no_attnbias.cpp
index 5d79dc7a9..27962589e 100644
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_fp16_masktype_1_no_attnbias.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_fp16_masktype_1_no_attnbias.cpp
@@ -7,5 +7,8 @@
 #include <ck/ck.hpp>
 #include "ck_fmha_grouped_backward.h"
 
-template void run_grouped_backward_masktype_attnbias_dispatched<ck::half_t, 1, false, false>(
-    GroupedBackwardParams& param, hipStream_t stream);
+template void run_grouped_backward_masktype_attnbias_dispatched<
+    ck::half_t,
+    1,
+    false,
+    false>(GroupedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_fp16_masktype_1_no_attnbias_fp32_grad.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_fp16_masktype_1_no_attnbias_fp32_grad.cpp
index 8ca3fc15b..fa837a65c 100644
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_fp16_masktype_1_no_attnbias_fp32_grad.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_fp16_masktype_1_no_attnbias_fp32_grad.cpp
@@ -7,5 +7,8 @@
 #include <ck/ck.hpp>
 #include "ck_fmha_grouped_backward.h"
 
-template void run_grouped_backward_masktype_attnbias_dispatched<ck::half_t, 1, false, true>(
-    GroupedBackwardParams& param, hipStream_t stream);
+template void run_grouped_backward_masktype_attnbias_dispatched<
+    ck::half_t,
+    1,
+    false,
+    true>(GroupedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_fp16_masktype_1_with_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_fp16_masktype_1_with_attnbias.cpp
index 28cfd91f0..7a83d4655 100644
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_fp16_masktype_1_with_attnbias.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_fp16_masktype_1_with_attnbias.cpp
@@ -7,5 +7,8 @@
 #include <ck/ck.hpp>
 #include "ck_fmha_grouped_backward.h"
 
-template void run_grouped_backward_masktype_attnbias_dispatched<ck::half_t, 1, true, false>(
-    GroupedBackwardParams& param, hipStream_t stream);
+template void run_grouped_backward_masktype_attnbias_dispatched<
+    ck::half_t,
+    1,
+    true,
+    false>(GroupedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_fp16_masktype_1_with_attnbias_fp32_grad.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_fp16_masktype_1_with_attnbias_fp32_grad.cpp
index e7974599b..807d23156 100644
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_fp16_masktype_1_with_attnbias_fp32_grad.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_fp16_masktype_1_with_attnbias_fp32_grad.cpp
@@ -7,5 +7,8 @@
 #include <ck/ck.hpp>
 #include "ck_fmha_grouped_backward.h"
 
-template void run_grouped_backward_masktype_attnbias_dispatched<ck::half_t, 1, true, true>(
-    GroupedBackwardParams& param, hipStream_t stream);
+template void run_grouped_backward_masktype_attnbias_dispatched<
+    ck::half_t,
+    1,
+    true,
+    true>(GroupedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_fp16_masktype_2_no_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_fp16_masktype_2_no_attnbias.cpp
index f7c6bab6b..508d01882 100644
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_fp16_masktype_2_no_attnbias.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_fp16_masktype_2_no_attnbias.cpp
@@ -7,5 +7,8 @@
 #include <ck/ck.hpp>
 #include "ck_fmha_grouped_backward.h"
 
-template void run_grouped_backward_masktype_attnbias_dispatched<ck::half_t, 2, false, false>(
-    GroupedBackwardParams& param, hipStream_t stream);
+template void run_grouped_backward_masktype_attnbias_dispatched<
+    ck::half_t,
+    2,
+    false,
+    false>(GroupedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_fp16_masktype_2_no_attnbias_fp32_grad.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_fp16_masktype_2_no_attnbias_fp32_grad.cpp
index 389b8ef6b..5954578f2 100644
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_fp16_masktype_2_no_attnbias_fp32_grad.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_fp16_masktype_2_no_attnbias_fp32_grad.cpp
@@ -7,5 +7,8 @@
 #include <ck/ck.hpp>
 #include "ck_fmha_grouped_backward.h"
 
-template void run_grouped_backward_masktype_attnbias_dispatched<ck::half_t, 2, false, true>(
-    GroupedBackwardParams& param, hipStream_t stream);
+template void run_grouped_backward_masktype_attnbias_dispatched<
+    ck::half_t,
+    2,
+    false,
+    true>(GroupedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_fp16_masktype_2_with_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_fp16_masktype_2_with_attnbias.cpp
index cf6edccb5..78482f931 100644
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_fp16_masktype_2_with_attnbias.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_fp16_masktype_2_with_attnbias.cpp
@@ -7,5 +7,8 @@
 #include <ck/ck.hpp>
 #include "ck_fmha_grouped_backward.h"
 
-template void run_grouped_backward_masktype_attnbias_dispatched<ck::half_t, 2, true, false>(
-    GroupedBackwardParams& param, hipStream_t stream);
+template void run_grouped_backward_masktype_attnbias_dispatched<
+    ck::half_t,
+    2,
+    true,
+    false>(GroupedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_fp16_masktype_2_with_attnbias_fp32_grad.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_fp16_masktype_2_with_attnbias_fp32_grad.cpp
index fc2e60a47..f38ea2ab2 100644
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_fp16_masktype_2_with_attnbias_fp32_grad.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_fp16_masktype_2_with_attnbias_fp32_grad.cpp
@@ -7,5 +7,8 @@
 #include <ck/ck.hpp>
 #include "ck_fmha_grouped_backward.h"
 
-template void run_grouped_backward_masktype_attnbias_dispatched<ck::half_t, 2, true, true>(
-    GroupedBackwardParams& param, hipStream_t stream);
+template void run_grouped_backward_masktype_attnbias_dispatched<
+    ck::half_t,
+    2,
+    true,
+    true>(GroupedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_forward_bp16_masktype_0_no_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_forward_bp16_masktype_0_no_attnbias.cpp
index 4d473f7b9..3f6f0025b 100644
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_forward_bp16_masktype_0_no_attnbias.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_forward_bp16_masktype_0_no_attnbias.cpp
@@ -7,6 +7,7 @@
 #include <ck/ck.hpp>
 #include "ck_fmha_grouped_forward.h"
 
-template void
-run_grouped_forward_masktype_attnbias_dispatched<ck::bhalf_t, 0, false>(GroupedForwardParams& param,
-                                                                        hipStream_t stream);
+template void run_grouped_forward_masktype_attnbias_dispatched<
+    ck::bhalf_t,
+    0,
+    false>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_forward_bp16_masktype_0_with_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_forward_bp16_masktype_0_with_attnbias.cpp
index 4b64703b2..22918197f 100644
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_forward_bp16_masktype_0_with_attnbias.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_forward_bp16_masktype_0_with_attnbias.cpp
@@ -7,6 +7,7 @@
 #include <ck/ck.hpp>
 #include "ck_fmha_grouped_forward.h"
 
-template void
-run_grouped_forward_masktype_attnbias_dispatched<ck::bhalf_t, 0, true>(GroupedForwardParams& param,
-                                                                       hipStream_t stream);
+template void run_grouped_forward_masktype_attnbias_dispatched<
+    ck::bhalf_t,
+    0,
+    true>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_forward_bp16_masktype_1_no_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_forward_bp16_masktype_1_no_attnbias.cpp
index ed5a11c66..fffe1b188 100644
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_forward_bp16_masktype_1_no_attnbias.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_forward_bp16_masktype_1_no_attnbias.cpp
@@ -7,6 +7,7 @@
 #include <ck/ck.hpp>
 #include "ck_fmha_grouped_forward.h"
 
-template void
-run_grouped_forward_masktype_attnbias_dispatched<ck::bhalf_t, 1, false>(GroupedForwardParams& param,
-                                                                        hipStream_t stream);
+template void run_grouped_forward_masktype_attnbias_dispatched<
+    ck::bhalf_t,
+    1,
+    false>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_forward_bp16_masktype_1_with_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_forward_bp16_masktype_1_with_attnbias.cpp
index 4ecf75691..b6020c099 100644
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_forward_bp16_masktype_1_with_attnbias.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_forward_bp16_masktype_1_with_attnbias.cpp
@@ -7,6 +7,7 @@
 #include <ck/ck.hpp>
 #include "ck_fmha_grouped_forward.h"
 
-template void
-run_grouped_forward_masktype_attnbias_dispatched<ck::bhalf_t, 1, true>(GroupedForwardParams& param,
-                                                                       hipStream_t stream);
+template void run_grouped_forward_masktype_attnbias_dispatched<
+    ck::bhalf_t,
+    1,
+    true>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_forward_bp16_masktype_2_no_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_forward_bp16_masktype_2_no_attnbias.cpp
index af22c6c13..16f780c9e 100644
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_forward_bp16_masktype_2_no_attnbias.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_forward_bp16_masktype_2_no_attnbias.cpp
@@ -7,6 +7,7 @@
 #include <ck/ck.hpp>
 #include "ck_fmha_grouped_forward.h"
 
-template void
-run_grouped_forward_masktype_attnbias_dispatched<ck::bhalf_t, 2, false>(GroupedForwardParams& param,
-                                                                        hipStream_t stream);
+template void run_grouped_forward_masktype_attnbias_dispatched<
+    ck::bhalf_t,
+    2,
+    false>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_forward_bp16_masktype_2_with_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_forward_bp16_masktype_2_with_attnbias.cpp
index 2aa5b9431..28c1f0832 100644
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_forward_bp16_masktype_2_with_attnbias.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_forward_bp16_masktype_2_with_attnbias.cpp
@@ -7,6 +7,7 @@
 #include <ck/ck.hpp>
 #include "ck_fmha_grouped_forward.h"
 
-template void
-run_grouped_forward_masktype_attnbias_dispatched<ck::bhalf_t, 2, true>(GroupedForwardParams& param,
-                                                                       hipStream_t stream);
+template void run_grouped_forward_masktype_attnbias_dispatched<
+    ck::bhalf_t,
+    2,
+    true>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_forward_fp16_masktype_0_no_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_forward_fp16_masktype_0_no_attnbias.cpp
index efaa2ee52..428b1b9ec 100644
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_forward_fp16_masktype_0_no_attnbias.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_forward_fp16_masktype_0_no_attnbias.cpp
@@ -7,6 +7,7 @@
 #include <ck/ck.hpp>
 #include "ck_fmha_grouped_forward.h"
 
-template void
-run_grouped_forward_masktype_attnbias_dispatched<ck::half_t, 0, false>(GroupedForwardParams& param,
-                                                                       hipStream_t stream);
+template void run_grouped_forward_masktype_attnbias_dispatched<
+    ck::half_t,
+    0,
+    false>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_forward_fp16_masktype_0_with_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_forward_fp16_masktype_0_with_attnbias.cpp
index 7394b8b72..442e54a28 100644
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_forward_fp16_masktype_0_with_attnbias.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_forward_fp16_masktype_0_with_attnbias.cpp
@@ -7,6 +7,7 @@
 #include <ck/ck.hpp>
 #include "ck_fmha_grouped_forward.h"
 
-template void
-run_grouped_forward_masktype_attnbias_dispatched<ck::half_t, 0, true>(GroupedForwardParams& param,
-                                                                      hipStream_t stream);
+template void run_grouped_forward_masktype_attnbias_dispatched<
+    ck::half_t,
+    0,
+    true>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_forward_fp16_masktype_1_no_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_forward_fp16_masktype_1_no_attnbias.cpp
index 3b7732cb0..a8520501d 100644
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_forward_fp16_masktype_1_no_attnbias.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_forward_fp16_masktype_1_no_attnbias.cpp
@@ -7,6 +7,7 @@
 #include <ck/ck.hpp>
 #include "ck_fmha_grouped_forward.h"
 
-template void
-run_grouped_forward_masktype_attnbias_dispatched<ck::half_t, 1, false>(GroupedForwardParams& param,
-                                                                       hipStream_t stream);
+template void run_grouped_forward_masktype_attnbias_dispatched<
+    ck::half_t,
+    1,
+    false>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_forward_fp16_masktype_1_with_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_forward_fp16_masktype_1_with_attnbias.cpp
index a4db70fcf..7a6075ab5 100644
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_forward_fp16_masktype_1_with_attnbias.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_forward_fp16_masktype_1_with_attnbias.cpp
@@ -7,6 +7,7 @@
 #include <ck/ck.hpp>
 #include "ck_fmha_grouped_forward.h"
 
-template void
-run_grouped_forward_masktype_attnbias_dispatched<ck::half_t, 1, true>(GroupedForwardParams& param,
-                                                                      hipStream_t stream);
+template void run_grouped_forward_masktype_attnbias_dispatched<
+    ck::half_t,
+    1,
+    true>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_forward_fp16_masktype_2_no_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_forward_fp16_masktype_2_no_attnbias.cpp
index c19f683b6..c93563491 100644
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_forward_fp16_masktype_2_no_attnbias.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_forward_fp16_masktype_2_no_attnbias.cpp
@@ -7,6 +7,7 @@
 #include <ck/ck.hpp>
 #include "ck_fmha_grouped_forward.h"
 
-template void
-run_grouped_forward_masktype_attnbias_dispatched<ck::half_t, 2, false>(GroupedForwardParams& param,
-                                                                       hipStream_t stream);
+template void run_grouped_forward_masktype_attnbias_dispatched<
+    ck::half_t,
+    2,
+    false>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_forward_fp16_masktype_2_with_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_forward_fp16_masktype_2_with_attnbias.cpp
index 2e10db88a..dc1fbc96b 100644
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_forward_fp16_masktype_2_with_attnbias.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_forward_fp16_masktype_2_with_attnbias.cpp
@@ -7,6 +7,7 @@
 #include <ck/ck.hpp>
 #include "ck_fmha_grouped_forward.h"
 
-template void
-run_grouped_forward_masktype_attnbias_dispatched<ck::half_t, 2, true>(GroupedForwardParams& param,
-                                                                      hipStream_t stream);
+template void run_grouped_forward_masktype_attnbias_dispatched<
+    ck::half_t,
+    2,
+    true>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_infer_bp16_masktype_0_no_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_infer_bp16_masktype_0_no_attnbias.cpp
index 3c012adbf..62ff93032 100644
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_infer_bp16_masktype_0_no_attnbias.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_infer_bp16_masktype_0_no_attnbias.cpp
@@ -8,6 +8,7 @@
 
 #include "ck_fmha_grouped_infer.h"
 
-template void
-run_grouped_infer_masktype_attnbias_dispatched<ck::bhalf_t, 0, false>(GroupedForwardParams& param,
-                                                                      hipStream_t stream);
+template void run_grouped_infer_masktype_attnbias_dispatched<
+    ck::bhalf_t,
+    0,
+    false>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_infer_bp16_masktype_0_with_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_infer_bp16_masktype_0_with_attnbias.cpp
index f19c5a4e9..e3d2da2cc 100644
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_infer_bp16_masktype_0_with_attnbias.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_infer_bp16_masktype_0_with_attnbias.cpp
@@ -8,6 +8,7 @@
 
 #include "ck_fmha_grouped_infer.h"
 
-template void
-run_grouped_infer_masktype_attnbias_dispatched<ck::bhalf_t, 0, true>(GroupedForwardParams& param,
-                                                                     hipStream_t stream);
+template void run_grouped_infer_masktype_attnbias_dispatched<
+    ck::bhalf_t,
+    0,
+    true>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_infer_bp16_masktype_1_no_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_infer_bp16_masktype_1_no_attnbias.cpp
index b12476dad..4d1f3c7f0 100644
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_infer_bp16_masktype_1_no_attnbias.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_infer_bp16_masktype_1_no_attnbias.cpp
@@ -8,6 +8,7 @@
 
 #include "ck_fmha_grouped_infer.h"
 
-template void
-run_grouped_infer_masktype_attnbias_dispatched<ck::bhalf_t, 1, false>(GroupedForwardParams& param,
-                                                                      hipStream_t stream);
+template void run_grouped_infer_masktype_attnbias_dispatched<
+    ck::bhalf_t,
+    1,
+    false>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_infer_bp16_masktype_1_with_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_infer_bp16_masktype_1_with_attnbias.cpp
index ab0141e0d..170e8a56f 100644
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_infer_bp16_masktype_1_with_attnbias.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_infer_bp16_masktype_1_with_attnbias.cpp
@@ -8,6 +8,7 @@
 
 #include "ck_fmha_grouped_infer.h"
 
-template void
-run_grouped_infer_masktype_attnbias_dispatched<ck::bhalf_t, 1, true>(GroupedForwardParams& param,
-                                                                     hipStream_t stream);
+template void run_grouped_infer_masktype_attnbias_dispatched<
+    ck::bhalf_t,
+    1,
+    true>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_infer_bp16_masktype_2_no_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_infer_bp16_masktype_2_no_attnbias.cpp
index 546074138..b615233aa 100644
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_infer_bp16_masktype_2_no_attnbias.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_infer_bp16_masktype_2_no_attnbias.cpp
@@ -8,6 +8,7 @@
 
 #include "ck_fmha_grouped_infer.h"
 
-template void
-run_grouped_infer_masktype_attnbias_dispatched<ck::bhalf_t, 2, false>(GroupedForwardParams& param,
-                                                                      hipStream_t stream);
+template void run_grouped_infer_masktype_attnbias_dispatched<
+    ck::bhalf_t,
+    2,
+    false>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_infer_bp16_masktype_2_with_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_infer_bp16_masktype_2_with_attnbias.cpp
index 9b65ff186..2f1227b87 100644
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_infer_bp16_masktype_2_with_attnbias.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_infer_bp16_masktype_2_with_attnbias.cpp
@@ -8,6 +8,7 @@
 
 #include "ck_fmha_grouped_infer.h"
 
-template void
-run_grouped_infer_masktype_attnbias_dispatched<ck::bhalf_t, 2, true>(GroupedForwardParams& param,
-                                                                     hipStream_t stream);
+template void run_grouped_infer_masktype_attnbias_dispatched<
+    ck::bhalf_t,
+    2,
+    true>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_infer_fp16_masktype_0_no_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_infer_fp16_masktype_0_no_attnbias.cpp
index 3e8a0eb75..bb20cf780 100644
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_infer_fp16_masktype_0_no_attnbias.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_infer_fp16_masktype_0_no_attnbias.cpp
@@ -8,6 +8,7 @@
 
 #include "ck_fmha_grouped_infer.h"
 
-template void
-run_grouped_infer_masktype_attnbias_dispatched<ck::half_t, 0, false>(GroupedForwardParams& param,
-                                                                     hipStream_t stream);
+template void run_grouped_infer_masktype_attnbias_dispatched<
+    ck::half_t,
+    0,
+    false>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_infer_fp16_masktype_0_with_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_infer_fp16_masktype_0_with_attnbias.cpp
index 92879082c..509986e1c 100644
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_infer_fp16_masktype_0_with_attnbias.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_infer_fp16_masktype_0_with_attnbias.cpp
@@ -8,6 +8,7 @@
 
 #include "ck_fmha_grouped_infer.h"
 
-template void
-run_grouped_infer_masktype_attnbias_dispatched<ck::half_t, 0, true>(GroupedForwardParams& param,
-                                                                    hipStream_t stream);
+template void run_grouped_infer_masktype_attnbias_dispatched<
+    ck::half_t,
+    0,
+    true>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_infer_fp16_masktype_1_no_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_infer_fp16_masktype_1_no_attnbias.cpp
index 37137dc97..a53a0f485 100644
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_infer_fp16_masktype_1_no_attnbias.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_infer_fp16_masktype_1_no_attnbias.cpp
@@ -8,6 +8,7 @@
 
 #include "ck_fmha_grouped_infer.h"
 
-template void
-run_grouped_infer_masktype_attnbias_dispatched<ck::half_t, 1, false>(GroupedForwardParams& param,
-                                                                     hipStream_t stream);
+template void run_grouped_infer_masktype_attnbias_dispatched<
+    ck::half_t,
+    1,
+    false>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_infer_fp16_masktype_1_with_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_infer_fp16_masktype_1_with_attnbias.cpp
index 3ea5affe8..b35c58526 100644
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_infer_fp16_masktype_1_with_attnbias.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_infer_fp16_masktype_1_with_attnbias.cpp
@@ -8,6 +8,7 @@
 
 #include "ck_fmha_grouped_infer.h"
 
-template void
-run_grouped_infer_masktype_attnbias_dispatched<ck::half_t, 1, true>(GroupedForwardParams& param,
-                                                                    hipStream_t stream);
+template void run_grouped_infer_masktype_attnbias_dispatched<
+    ck::half_t,
+    1,
+    true>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_infer_fp16_masktype_2_no_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_infer_fp16_masktype_2_no_attnbias.cpp
index 33f2bc7f9..53e30115a 100644
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_infer_fp16_masktype_2_no_attnbias.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_infer_fp16_masktype_2_no_attnbias.cpp
@@ -8,6 +8,7 @@
 
 #include "ck_fmha_grouped_infer.h"
 
-template void
-run_grouped_infer_masktype_attnbias_dispatched<ck::half_t, 2, false>(GroupedForwardParams& param,
-                                                                     hipStream_t stream);
+template void run_grouped_infer_masktype_attnbias_dispatched<
+    ck::half_t,
+    2,
+    false>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_infer_fp16_masktype_2_with_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_infer_fp16_masktype_2_with_attnbias.cpp
index 27eea7bac..d25650c8e 100644
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_infer_fp16_masktype_2_with_attnbias.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_infer_fp16_masktype_2_with_attnbias.cpp
@@ -8,6 +8,7 @@
 
 #include "ck_fmha_grouped_infer.h"
 
-template void
-run_grouped_infer_masktype_attnbias_dispatched<ck::half_t, 2, true>(GroupedForwardParams& param,
-                                                                    hipStream_t stream);
+template void run_grouped_infer_masktype_attnbias_dispatched<
+    ck::half_t,
+    2,
+    true>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_forward_bp16_no_causalmask_no_attnbias_headdim_128.cpp b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_forward_bp16_no_causalmask_no_attnbias_headdim_128.cpp
index ab8b8f270..1482336ab 100644
--- a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_forward_bp16_no_causalmask_no_attnbias_headdim_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_forward_bp16_no_causalmask_no_attnbias_headdim_128.cpp
@@ -8,5 +8,8 @@
 
 #include "ck_tiled_fmha_batched_forward.h"
 
-template void run_batched_forward_causalmask_attnbias_dispatched<ck::bhalf_t, false, false, 128>(
-    BatchedForwardParams& param, hipStream_t stream);
+template void run_batched_forward_causalmask_attnbias_dispatched<
+    ck::bhalf_t,
+    false,
+    false,
+    128>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_forward_bp16_no_causalmask_no_attnbias_headdim_256.cpp b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_forward_bp16_no_causalmask_no_attnbias_headdim_256.cpp
index bff652986..f1ba383da 100644
--- a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_forward_bp16_no_causalmask_no_attnbias_headdim_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_forward_bp16_no_causalmask_no_attnbias_headdim_256.cpp
@@ -8,5 +8,8 @@
 
 #include "ck_tiled_fmha_batched_forward.h"
 
-template void run_batched_forward_causalmask_attnbias_dispatched<ck::bhalf_t, false, false, 256>(
-    BatchedForwardParams& param, hipStream_t stream);
+template void run_batched_forward_causalmask_attnbias_dispatched<
+    ck::bhalf_t,
+    false,
+    false,
+    256>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_forward_bp16_no_causalmask_no_attnbias_headdim_32.cpp b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_forward_bp16_no_causalmask_no_attnbias_headdim_32.cpp
index 7c7e53df5..3b9f3026b 100644
--- a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_forward_bp16_no_causalmask_no_attnbias_headdim_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_forward_bp16_no_causalmask_no_attnbias_headdim_32.cpp
@@ -8,5 +8,8 @@
 
 #include "ck_tiled_fmha_batched_forward.h"
 
-template void run_batched_forward_causalmask_attnbias_dispatched<ck::bhalf_t, false, false, 32>(
-    BatchedForwardParams& param, hipStream_t stream);
+template void run_batched_forward_causalmask_attnbias_dispatched<
+    ck::bhalf_t,
+    false,
+    false,
+    32>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_forward_bp16_no_causalmask_no_attnbias_headdim_64.cpp b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_forward_bp16_no_causalmask_no_attnbias_headdim_64.cpp
index a2cefd689..c38716ce2 100644
--- a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_forward_bp16_no_causalmask_no_attnbias_headdim_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_forward_bp16_no_causalmask_no_attnbias_headdim_64.cpp
@@ -8,5 +8,8 @@
 
 #include "ck_tiled_fmha_batched_forward.h"
 
-template void run_batched_forward_causalmask_attnbias_dispatched<ck::bhalf_t, false, false, 64>(
-    BatchedForwardParams& param, hipStream_t stream);
+template void run_batched_forward_causalmask_attnbias_dispatched<
+    ck::bhalf_t,
+    false,
+    false,
+    64>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_forward_bp16_no_causalmask_with_attnbias_headdim_128.cpp b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_forward_bp16_no_causalmask_with_attnbias_headdim_128.cpp
index 4bce63f3d..ed91bf4bf 100644
--- a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_forward_bp16_no_causalmask_with_attnbias_headdim_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_forward_bp16_no_causalmask_with_attnbias_headdim_128.cpp
@@ -8,5 +8,8 @@
 
 #include "ck_tiled_fmha_batched_forward.h"
 
-template void run_batched_forward_causalmask_attnbias_dispatched<ck::bhalf_t, false, true, 128>(
-    BatchedForwardParams& param, hipStream_t stream);
+template void run_batched_forward_causalmask_attnbias_dispatched<
+    ck::bhalf_t,
+    false,
+    true,
+    128>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_forward_bp16_no_causalmask_with_attnbias_headdim_256.cpp b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_forward_bp16_no_causalmask_with_attnbias_headdim_256.cpp
index fd9fee064..eca859229 100644
--- a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_forward_bp16_no_causalmask_with_attnbias_headdim_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_forward_bp16_no_causalmask_with_attnbias_headdim_256.cpp
@@ -8,5 +8,8 @@
 
 #include "ck_tiled_fmha_batched_forward.h"
 
-template void run_batched_forward_causalmask_attnbias_dispatched<ck::bhalf_t, false, true, 256>(
-    BatchedForwardParams& param, hipStream_t stream);
+template void run_batched_forward_causalmask_attnbias_dispatched<
+    ck::bhalf_t,
+    false,
+    true,
+    256>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_forward_bp16_no_causalmask_with_attnbias_headdim_32.cpp b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_forward_bp16_no_causalmask_with_attnbias_headdim_32.cpp
index 8a4583c6f..ec258aeda 100644
--- a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_forward_bp16_no_causalmask_with_attnbias_headdim_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_forward_bp16_no_causalmask_with_attnbias_headdim_32.cpp
@@ -8,5 +8,8 @@
 
 #include "ck_tiled_fmha_batched_forward.h"
 
-template void run_batched_forward_causalmask_attnbias_dispatched<ck::bhalf_t, false, true, 32>(
-    BatchedForwardParams& param, hipStream_t stream);
+template void run_batched_forward_causalmask_attnbias_dispatched<
+    ck::bhalf_t,
+    false,
+    true,
+    32>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_forward_bp16_no_causalmask_with_attnbias_headdim_64.cpp b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_forward_bp16_no_causalmask_with_attnbias_headdim_64.cpp
index e3ddab117..feb78a115 100644
--- a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_forward_bp16_no_causalmask_with_attnbias_headdim_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_forward_bp16_no_causalmask_with_attnbias_headdim_64.cpp
@@ -8,5 +8,8 @@
 
 #include "ck_tiled_fmha_batched_forward.h"
 
-template void run_batched_forward_causalmask_attnbias_dispatched<ck::bhalf_t, false, true, 64>(
-    BatchedForwardParams& param, hipStream_t stream);
+template void run_batched_forward_causalmask_attnbias_dispatched<
+    ck::bhalf_t,
+    false,
+    true,
+    64>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_forward_bp16_with_causalmask_no_attnbias_headdim_128.cpp b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_forward_bp16_with_causalmask_no_attnbias_headdim_128.cpp
index 2726966fa..59c6550f4 100644
--- a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_forward_bp16_with_causalmask_no_attnbias_headdim_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_forward_bp16_with_causalmask_no_attnbias_headdim_128.cpp
@@ -8,5 +8,8 @@
 
 #include "ck_tiled_fmha_batched_forward.h"
 
-template void run_batched_forward_causalmask_attnbias_dispatched<ck::bhalf_t, true, false, 128>(
-    BatchedForwardParams& param, hipStream_t stream);
+template void run_batched_forward_causalmask_attnbias_dispatched<
+    ck::bhalf_t,
+    true,
+    false,
+    128>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_forward_bp16_with_causalmask_no_attnbias_headdim_256.cpp b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_forward_bp16_with_causalmask_no_attnbias_headdim_256.cpp
index 5158b5c44..a30775e77 100644
--- a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_forward_bp16_with_causalmask_no_attnbias_headdim_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_forward_bp16_with_causalmask_no_attnbias_headdim_256.cpp
@@ -8,5 +8,8 @@
 
 #include "ck_tiled_fmha_batched_forward.h"
 
-template void run_batched_forward_causalmask_attnbias_dispatched<ck::bhalf_t, true, false, 256>(
-    BatchedForwardParams& param, hipStream_t stream);
+template void run_batched_forward_causalmask_attnbias_dispatched<
+    ck::bhalf_t,
+    true,
+    false,
+    256>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_forward_bp16_with_causalmask_no_attnbias_headdim_32.cpp b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_forward_bp16_with_causalmask_no_attnbias_headdim_32.cpp
index 25a8f9316..594c4a68c 100644
--- a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_forward_bp16_with_causalmask_no_attnbias_headdim_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_forward_bp16_with_causalmask_no_attnbias_headdim_32.cpp
@@ -8,5 +8,8 @@
 
 #include "ck_tiled_fmha_batched_forward.h"
 
-template void run_batched_forward_causalmask_attnbias_dispatched<ck::bhalf_t, true, false, 32>(
-    BatchedForwardParams& param, hipStream_t stream);
+template void run_batched_forward_causalmask_attnbias_dispatched<
+    ck::bhalf_t,
+    true,
+    false,
+    32>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_forward_bp16_with_causalmask_no_attnbias_headdim_64.cpp b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_forward_bp16_with_causalmask_no_attnbias_headdim_64.cpp
index b174cd641..39ea42913 100644
--- a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_forward_bp16_with_causalmask_no_attnbias_headdim_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_forward_bp16_with_causalmask_no_attnbias_headdim_64.cpp
@@ -8,5 +8,8 @@
 
 #include "ck_tiled_fmha_batched_forward.h"
 
-template void run_batched_forward_causalmask_attnbias_dispatched<ck::bhalf_t, true, false, 64>(
-    BatchedForwardParams& param, hipStream_t stream);
+template void run_batched_forward_causalmask_attnbias_dispatched<
+    ck::bhalf_t,
+    true,
+    false,
+    64>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_forward_bp16_with_causalmask_with_attnbias_headdim_128.cpp b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_forward_bp16_with_causalmask_with_attnbias_headdim_128.cpp
index 941488b93..6ea24c5ca 100644
--- a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_forward_bp16_with_causalmask_with_attnbias_headdim_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_forward_bp16_with_causalmask_with_attnbias_headdim_128.cpp
@@ -8,5 +8,8 @@
 
 #include "ck_tiled_fmha_batched_forward.h"
 
-template void run_batched_forward_causalmask_attnbias_dispatched<ck::bhalf_t, true, true, 128>(
-    BatchedForwardParams& param, hipStream_t stream);
+template void run_batched_forward_causalmask_attnbias_dispatched<
+    ck::bhalf_t,
+    true,
+    true,
+    128>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_forward_bp16_with_causalmask_with_attnbias_headdim_256.cpp b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_forward_bp16_with_causalmask_with_attnbias_headdim_256.cpp
index 986dfe9df..a675c95be 100644
--- a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_forward_bp16_with_causalmask_with_attnbias_headdim_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_forward_bp16_with_causalmask_with_attnbias_headdim_256.cpp
@@ -8,5 +8,8 @@
 
 #include "ck_tiled_fmha_batched_forward.h"
 
-template void run_batched_forward_causalmask_attnbias_dispatched<ck::bhalf_t, true, true, 256>(
-    BatchedForwardParams& param, hipStream_t stream);
+template void run_batched_forward_causalmask_attnbias_dispatched<
+    ck::bhalf_t,
+    true,
+    true,
+    256>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_forward_bp16_with_causalmask_with_attnbias_headdim_32.cpp b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_forward_bp16_with_causalmask_with_attnbias_headdim_32.cpp
index d1590b38d..dc4bb0ea0 100644
--- a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_forward_bp16_with_causalmask_with_attnbias_headdim_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_forward_bp16_with_causalmask_with_attnbias_headdim_32.cpp
@@ -8,5 +8,8 @@
 
 #include "ck_tiled_fmha_batched_forward.h"
 
-template void run_batched_forward_causalmask_attnbias_dispatched<ck::bhalf_t, true, true, 32>(
-    BatchedForwardParams& param, hipStream_t stream);
+template void run_batched_forward_causalmask_attnbias_dispatched<
+    ck::bhalf_t,
+    true,
+    true,
+    32>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_forward_bp16_with_causalmask_with_attnbias_headdim_64.cpp b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_forward_bp16_with_causalmask_with_attnbias_headdim_64.cpp
index b245f5715..334eb891f 100644
--- a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_forward_bp16_with_causalmask_with_attnbias_headdim_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_forward_bp16_with_causalmask_with_attnbias_headdim_64.cpp
@@ -8,5 +8,8 @@
 
 #include "ck_tiled_fmha_batched_forward.h"
 
-template void run_batched_forward_causalmask_attnbias_dispatched<ck::bhalf_t, true, true, 64>(
-    BatchedForwardParams& param, hipStream_t stream);
+template void run_batched_forward_causalmask_attnbias_dispatched<
+    ck::bhalf_t,
+    true,
+    true,
+    64>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_forward_fp16_no_causalmask_no_attnbias_headdim_128.cpp b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_forward_fp16_no_causalmask_no_attnbias_headdim_128.cpp
index 2bf4db3f8..606d9db86 100644
--- a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_forward_fp16_no_causalmask_no_attnbias_headdim_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_forward_fp16_no_causalmask_no_attnbias_headdim_128.cpp
@@ -8,5 +8,8 @@
 
 #include "ck_tiled_fmha_batched_forward.h"
 
-template void run_batched_forward_causalmask_attnbias_dispatched<ck::half_t, false, false, 128>(
-    BatchedForwardParams& param, hipStream_t stream);
+template void run_batched_forward_causalmask_attnbias_dispatched<
+    ck::half_t,
+    false,
+    false,
+    128>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_forward_fp16_no_causalmask_no_attnbias_headdim_256.cpp b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_forward_fp16_no_causalmask_no_attnbias_headdim_256.cpp
index 41029c7dc..7dc799605 100644
--- a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_forward_fp16_no_causalmask_no_attnbias_headdim_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_forward_fp16_no_causalmask_no_attnbias_headdim_256.cpp
@@ -8,5 +8,8 @@
 
 #include "ck_tiled_fmha_batched_forward.h"
 
-template void run_batched_forward_causalmask_attnbias_dispatched<ck::half_t, false, false, 256>(
-    BatchedForwardParams& param, hipStream_t stream);
+template void run_batched_forward_causalmask_attnbias_dispatched<
+    ck::half_t,
+    false,
+    false,
+    256>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_forward_fp16_no_causalmask_no_attnbias_headdim_32.cpp b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_forward_fp16_no_causalmask_no_attnbias_headdim_32.cpp
index c0df0271a..566b1bf6a 100644
--- a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_forward_fp16_no_causalmask_no_attnbias_headdim_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_forward_fp16_no_causalmask_no_attnbias_headdim_32.cpp
@@ -8,5 +8,8 @@
 
 #include "ck_tiled_fmha_batched_forward.h"
 
-template void run_batched_forward_causalmask_attnbias_dispatched<ck::half_t, false, false, 32>(
-    BatchedForwardParams& param, hipStream_t stream);
+template void run_batched_forward_causalmask_attnbias_dispatched<
+    ck::half_t,
+    false,
+    false,
+    32>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_forward_fp16_no_causalmask_no_attnbias_headdim_64.cpp b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_forward_fp16_no_causalmask_no_attnbias_headdim_64.cpp
index 52b129eb2..3b72b97d1 100644
--- a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_forward_fp16_no_causalmask_no_attnbias_headdim_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_forward_fp16_no_causalmask_no_attnbias_headdim_64.cpp
@@ -8,5 +8,8 @@
 
 #include "ck_tiled_fmha_batched_forward.h"
 
-template void run_batched_forward_causalmask_attnbias_dispatched<ck::half_t, false, false, 64>(
-    BatchedForwardParams& param, hipStream_t stream);
+template void run_batched_forward_causalmask_attnbias_dispatched<
+    ck::half_t,
+    false,
+    false,
+    64>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_forward_fp16_no_causalmask_with_attnbias_headdim_128.cpp b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_forward_fp16_no_causalmask_with_attnbias_headdim_128.cpp
index b8a496fed..c2c124dbe 100644
--- a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_forward_fp16_no_causalmask_with_attnbias_headdim_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_forward_fp16_no_causalmask_with_attnbias_headdim_128.cpp
@@ -8,5 +8,8 @@
 
 #include "ck_tiled_fmha_batched_forward.h"
 
-template void run_batched_forward_causalmask_attnbias_dispatched<ck::half_t, false, true, 128>(
-    BatchedForwardParams& param, hipStream_t stream);
+template void run_batched_forward_causalmask_attnbias_dispatched<
+    ck::half_t,
+    false,
+    true,
+    128>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_forward_fp16_no_causalmask_with_attnbias_headdim_256.cpp b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_forward_fp16_no_causalmask_with_attnbias_headdim_256.cpp
index 53a9328c6..1cdd7e078 100644
--- a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_forward_fp16_no_causalmask_with_attnbias_headdim_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_forward_fp16_no_causalmask_with_attnbias_headdim_256.cpp
@@ -8,5 +8,8 @@
 
 #include "ck_tiled_fmha_batched_forward.h"
 
-template void run_batched_forward_causalmask_attnbias_dispatched<ck::half_t, false, true, 256>(
-    BatchedForwardParams& param, hipStream_t stream);
+template void run_batched_forward_causalmask_attnbias_dispatched<
+    ck::half_t,
+    false,
+    true,
+    256>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_forward_fp16_no_causalmask_with_attnbias_headdim_32.cpp b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_forward_fp16_no_causalmask_with_attnbias_headdim_32.cpp
index 5ee4e29f4..50ea22659 100644
--- a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_forward_fp16_no_causalmask_with_attnbias_headdim_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_forward_fp16_no_causalmask_with_attnbias_headdim_32.cpp
@@ -8,5 +8,8 @@
 
 #include "ck_tiled_fmha_batched_forward.h"
 
-template void run_batched_forward_causalmask_attnbias_dispatched<ck::half_t, false, true, 32>(
-    BatchedForwardParams& param, hipStream_t stream);
+template void run_batched_forward_causalmask_attnbias_dispatched<
+    ck::half_t,
+    false,
+    true,
+    32>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_forward_fp16_no_causalmask_with_attnbias_headdim_64.cpp b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_forward_fp16_no_causalmask_with_attnbias_headdim_64.cpp
index 3d9791d33..58ac17e39 100644
--- a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_forward_fp16_no_causalmask_with_attnbias_headdim_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_forward_fp16_no_causalmask_with_attnbias_headdim_64.cpp
@@ -8,5 +8,8 @@
 
 #include "ck_tiled_fmha_batched_forward.h"
 
-template void run_batched_forward_causalmask_attnbias_dispatched<ck::half_t, false, true, 64>(
-    BatchedForwardParams& param, hipStream_t stream);
+template void run_batched_forward_causalmask_attnbias_dispatched<
+    ck::half_t,
+    false,
+    true,
+    64>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_forward_fp16_with_causalmask_no_attnbias_headdim_128.cpp b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_forward_fp16_with_causalmask_no_attnbias_headdim_128.cpp
index ef0eae81d..070ed44ef 100644
--- a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_forward_fp16_with_causalmask_no_attnbias_headdim_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_forward_fp16_with_causalmask_no_attnbias_headdim_128.cpp
@@ -8,5 +8,8 @@
 
 #include "ck_tiled_fmha_batched_forward.h"
 
-template void run_batched_forward_causalmask_attnbias_dispatched<ck::half_t, true, false, 128>(
-    BatchedForwardParams& param, hipStream_t stream);
+template void run_batched_forward_causalmask_attnbias_dispatched<
+    ck::half_t,
+    true,
+    false,
+    128>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_forward_fp16_with_causalmask_no_attnbias_headdim_256.cpp b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_forward_fp16_with_causalmask_no_attnbias_headdim_256.cpp
index a5870aacf..e535f40f3 100644
--- a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_forward_fp16_with_causalmask_no_attnbias_headdim_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_forward_fp16_with_causalmask_no_attnbias_headdim_256.cpp
@@ -8,5 +8,8 @@
 
 #include "ck_tiled_fmha_batched_forward.h"
 
-template void run_batched_forward_causalmask_attnbias_dispatched<ck::half_t, true, false, 256>(
-    BatchedForwardParams& param, hipStream_t stream);
+template void run_batched_forward_causalmask_attnbias_dispatched<
+    ck::half_t,
+    true,
+    false,
+    256>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_forward_fp16_with_causalmask_no_attnbias_headdim_32.cpp b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_forward_fp16_with_causalmask_no_attnbias_headdim_32.cpp
index a8cc8231a..a24884bff 100644
--- a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_forward_fp16_with_causalmask_no_attnbias_headdim_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_forward_fp16_with_causalmask_no_attnbias_headdim_32.cpp
@@ -8,5 +8,8 @@
 
 #include "ck_tiled_fmha_batched_forward.h"
 
-template void run_batched_forward_causalmask_attnbias_dispatched<ck::half_t, true, false, 32>(
-    BatchedForwardParams& param, hipStream_t stream);
+template void run_batched_forward_causalmask_attnbias_dispatched<
+    ck::half_t,
+    true,
+    false,
+    32>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_forward_fp16_with_causalmask_no_attnbias_headdim_64.cpp b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_forward_fp16_with_causalmask_no_attnbias_headdim_64.cpp
index c7b13e92e..524e1ab86 100644
--- a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_forward_fp16_with_causalmask_no_attnbias_headdim_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_forward_fp16_with_causalmask_no_attnbias_headdim_64.cpp
@@ -8,5 +8,8 @@
 
 #include "ck_tiled_fmha_batched_forward.h"
 
-template void run_batched_forward_causalmask_attnbias_dispatched<ck::half_t, true, false, 64>(
-    BatchedForwardParams& param, hipStream_t stream);
+template void run_batched_forward_causalmask_attnbias_dispatched<
+    ck::half_t,
+    true,
+    false,
+    64>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_forward_fp16_with_causalmask_with_attnbias_headdim_128.cpp b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_forward_fp16_with_causalmask_with_attnbias_headdim_128.cpp
index 4911aba00..58013ca64 100644
--- a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_forward_fp16_with_causalmask_with_attnbias_headdim_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_forward_fp16_with_causalmask_with_attnbias_headdim_128.cpp
@@ -8,5 +8,8 @@
 
 #include "ck_tiled_fmha_batched_forward.h"
 
-template void run_batched_forward_causalmask_attnbias_dispatched<ck::half_t, true, true, 128>(
-    BatchedForwardParams& param, hipStream_t stream);
+template void run_batched_forward_causalmask_attnbias_dispatched<
+    ck::half_t,
+    true,
+    true,
+    128>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_forward_fp16_with_causalmask_with_attnbias_headdim_256.cpp b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_forward_fp16_with_causalmask_with_attnbias_headdim_256.cpp
index 42e4a7a93..fcb6d8b54 100644
--- a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_forward_fp16_with_causalmask_with_attnbias_headdim_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_forward_fp16_with_causalmask_with_attnbias_headdim_256.cpp
@@ -8,5 +8,8 @@
 
 #include "ck_tiled_fmha_batched_forward.h"
 
-template void run_batched_forward_causalmask_attnbias_dispatched<ck::half_t, true, true, 256>(
-    BatchedForwardParams& param, hipStream_t stream);
+template void run_batched_forward_causalmask_attnbias_dispatched<
+    ck::half_t,
+    true,
+    true,
+    256>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_forward_fp16_with_causalmask_with_attnbias_headdim_32.cpp b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_forward_fp16_with_causalmask_with_attnbias_headdim_32.cpp
index d43b65227..38e7fb026 100644
--- a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_forward_fp16_with_causalmask_with_attnbias_headdim_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_forward_fp16_with_causalmask_with_attnbias_headdim_32.cpp
@@ -8,5 +8,8 @@
 
 #include "ck_tiled_fmha_batched_forward.h"
 
-template void run_batched_forward_causalmask_attnbias_dispatched<ck::half_t, true, true, 32>(
-    BatchedForwardParams& param, hipStream_t stream);
+template void run_batched_forward_causalmask_attnbias_dispatched<
+    ck::half_t,
+    true,
+    true,
+    32>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_forward_fp16_with_causalmask_with_attnbias_headdim_64.cpp b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_forward_fp16_with_causalmask_with_attnbias_headdim_64.cpp
index bce8348c6..1c0b277b7 100644
--- a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_forward_fp16_with_causalmask_with_attnbias_headdim_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_forward_fp16_with_causalmask_with_attnbias_headdim_64.cpp
@@ -8,5 +8,8 @@
 
 #include "ck_tiled_fmha_batched_forward.h"
 
-template void run_batched_forward_causalmask_attnbias_dispatched<ck::half_t, true, true, 64>(
-    BatchedForwardParams& param, hipStream_t stream);
+template void run_batched_forward_causalmask_attnbias_dispatched<
+    ck::half_t,
+    true,
+    true,
+    64>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_bp16_no_causalmask_no_attnbias_headdim_128.cpp b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_bp16_no_causalmask_no_attnbias_headdim_128.cpp
index 17c5ab864..b95c3fdb9 100644
--- a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_bp16_no_causalmask_no_attnbias_headdim_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_bp16_no_causalmask_no_attnbias_headdim_128.cpp
@@ -8,5 +8,8 @@
 
 #include "ck_tiled_fmha_batched_infer.h"
 
-template void run_batched_infer_causalmask_attnbias_dispatched<ck::bhalf_t, false, false, 128>(
-    BatchedForwardParams& param, hipStream_t stream);
+template void run_batched_infer_causalmask_attnbias_dispatched<
+    ck::bhalf_t,
+    false,
+    false,
+    128>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_bp16_no_causalmask_no_attnbias_headdim_256.cpp b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_bp16_no_causalmask_no_attnbias_headdim_256.cpp
index 38b8aa3b7..dce1496ea 100644
--- a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_bp16_no_causalmask_no_attnbias_headdim_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_bp16_no_causalmask_no_attnbias_headdim_256.cpp
@@ -8,5 +8,8 @@
 
 #include "ck_tiled_fmha_batched_infer.h"
 
-template void run_batched_infer_causalmask_attnbias_dispatched<ck::bhalf_t, false, false, 256>(
-    BatchedForwardParams& param, hipStream_t stream);
+template void run_batched_infer_causalmask_attnbias_dispatched<
+    ck::bhalf_t,
+    false,
+    false,
+    256>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_bp16_no_causalmask_no_attnbias_headdim_32.cpp b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_bp16_no_causalmask_no_attnbias_headdim_32.cpp
index f2d976897..fa81f80c1 100644
--- a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_bp16_no_causalmask_no_attnbias_headdim_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_bp16_no_causalmask_no_attnbias_headdim_32.cpp
@@ -8,5 +8,8 @@
 
 #include "ck_tiled_fmha_batched_infer.h"
 
-template void run_batched_infer_causalmask_attnbias_dispatched<ck::bhalf_t, false, false, 32>(
-    BatchedForwardParams& param, hipStream_t stream);
+template void run_batched_infer_causalmask_attnbias_dispatched<
+    ck::bhalf_t,
+    false,
+    false,
+    32>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_bp16_no_causalmask_no_attnbias_headdim_64.cpp b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_bp16_no_causalmask_no_attnbias_headdim_64.cpp
index a8d2b933a..fd118cd22 100644
--- a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_bp16_no_causalmask_no_attnbias_headdim_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_bp16_no_causalmask_no_attnbias_headdim_64.cpp
@@ -8,5 +8,8 @@
 
 #include "ck_tiled_fmha_batched_infer.h"
 
-template void run_batched_infer_causalmask_attnbias_dispatched<ck::bhalf_t, false, false, 64>(
-    BatchedForwardParams& param, hipStream_t stream);
+template void run_batched_infer_causalmask_attnbias_dispatched<
+    ck::bhalf_t,
+    false,
+    false,
+    64>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_bp16_no_causalmask_with_attnbias_headdim_128.cpp b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_bp16_no_causalmask_with_attnbias_headdim_128.cpp
index bcee71741..4772d56ab 100644
--- a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_bp16_no_causalmask_with_attnbias_headdim_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_bp16_no_causalmask_with_attnbias_headdim_128.cpp
@@ -8,5 +8,8 @@
 
 #include "ck_tiled_fmha_batched_infer.h"
 
-template void run_batched_infer_causalmask_attnbias_dispatched<ck::bhalf_t, false, true, 128>(
-    BatchedForwardParams& param, hipStream_t stream);
+template void run_batched_infer_causalmask_attnbias_dispatched<
+    ck::bhalf_t,
+    false,
+    true,
+    128>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_bp16_no_causalmask_with_attnbias_headdim_256.cpp b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_bp16_no_causalmask_with_attnbias_headdim_256.cpp
index 485ff4b64..b95f0d5ae 100644
--- a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_bp16_no_causalmask_with_attnbias_headdim_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_bp16_no_causalmask_with_attnbias_headdim_256.cpp
@@ -8,5 +8,8 @@
 
 #include "ck_tiled_fmha_batched_infer.h"
 
-template void run_batched_infer_causalmask_attnbias_dispatched<ck::bhalf_t, false, true, 256>(
-    BatchedForwardParams& param, hipStream_t stream);
+template void run_batched_infer_causalmask_attnbias_dispatched<
+    ck::bhalf_t,
+    false,
+    true,
+    256>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_bp16_no_causalmask_with_attnbias_headdim_32.cpp b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_bp16_no_causalmask_with_attnbias_headdim_32.cpp
index 496c34c61..7fe7a3f69 100644
--- a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_bp16_no_causalmask_with_attnbias_headdim_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_bp16_no_causalmask_with_attnbias_headdim_32.cpp
@@ -8,5 +8,8 @@
 
 #include "ck_tiled_fmha_batched_infer.h"
 
-template void run_batched_infer_causalmask_attnbias_dispatched<ck::bhalf_t, false, true, 32>(
-    BatchedForwardParams& param, hipStream_t stream);
+template void run_batched_infer_causalmask_attnbias_dispatched<
+    ck::bhalf_t,
+    false,
+    true,
+    32>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_bp16_no_causalmask_with_attnbias_headdim_64.cpp b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_bp16_no_causalmask_with_attnbias_headdim_64.cpp
index f52e8fcd8..3ae773369 100644
--- a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_bp16_no_causalmask_with_attnbias_headdim_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_bp16_no_causalmask_with_attnbias_headdim_64.cpp
@@ -8,5 +8,8 @@
 
 #include "ck_tiled_fmha_batched_infer.h"
 
-template void run_batched_infer_causalmask_attnbias_dispatched<ck::bhalf_t, false, true, 64>(
-    BatchedForwardParams& param, hipStream_t stream);
+template void run_batched_infer_causalmask_attnbias_dispatched<
+    ck::bhalf_t,
+    false,
+    true,
+    64>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_bp16_with_causalmask_no_attnbias_headdim_128.cpp b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_bp16_with_causalmask_no_attnbias_headdim_128.cpp
index 2b593af2b..9757278db 100644
--- a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_bp16_with_causalmask_no_attnbias_headdim_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_bp16_with_causalmask_no_attnbias_headdim_128.cpp
@@ -8,5 +8,8 @@
 
 #include "ck_tiled_fmha_batched_infer.h"
 
-template void run_batched_infer_causalmask_attnbias_dispatched<ck::bhalf_t, true, false, 128>(
-    BatchedForwardParams& param, hipStream_t stream);
+template void run_batched_infer_causalmask_attnbias_dispatched<
+    ck::bhalf_t,
+    true,
+    false,
+    128>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_bp16_with_causalmask_no_attnbias_headdim_256.cpp b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_bp16_with_causalmask_no_attnbias_headdim_256.cpp
index 54871d2ed..6caed9563 100644
--- a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_bp16_with_causalmask_no_attnbias_headdim_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_bp16_with_causalmask_no_attnbias_headdim_256.cpp
@@ -8,5 +8,8 @@
 
 #include "ck_tiled_fmha_batched_infer.h"
 
-template void run_batched_infer_causalmask_attnbias_dispatched<ck::bhalf_t, true, false, 256>(
-    BatchedForwardParams& param, hipStream_t stream);
+template void run_batched_infer_causalmask_attnbias_dispatched<
+    ck::bhalf_t,
+    true,
+    false,
+    256>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_bp16_with_causalmask_no_attnbias_headdim_32.cpp b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_bp16_with_causalmask_no_attnbias_headdim_32.cpp
index 3f7d86019..4dfaa3678 100644
--- a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_bp16_with_causalmask_no_attnbias_headdim_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_bp16_with_causalmask_no_attnbias_headdim_32.cpp
@@ -8,5 +8,8 @@
 
 #include "ck_tiled_fmha_batched_infer.h"
 
-template void run_batched_infer_causalmask_attnbias_dispatched<ck::bhalf_t, true, false, 32>(
-    BatchedForwardParams& param, hipStream_t stream);
+template void run_batched_infer_causalmask_attnbias_dispatched<
+    ck::bhalf_t,
+    true,
+    false,
+    32>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_bp16_with_causalmask_no_attnbias_headdim_64.cpp b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_bp16_with_causalmask_no_attnbias_headdim_64.cpp
index 400f0aaa4..fa0416c5c 100644
--- a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_bp16_with_causalmask_no_attnbias_headdim_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_bp16_with_causalmask_no_attnbias_headdim_64.cpp
@@ -8,5 +8,8 @@
 
 #include "ck_tiled_fmha_batched_infer.h"
 
-template void run_batched_infer_causalmask_attnbias_dispatched<ck::bhalf_t, true, false, 64>(
-    BatchedForwardParams& param, hipStream_t stream);
+template void run_batched_infer_causalmask_attnbias_dispatched<
+    ck::bhalf_t,
+    true,
+    false,
+    64>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_bp16_with_causalmask_with_attnbias_headdim_128.cpp b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_bp16_with_causalmask_with_attnbias_headdim_128.cpp
index f9063434c..ecc90b366 100644
--- a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_bp16_with_causalmask_with_attnbias_headdim_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_bp16_with_causalmask_with_attnbias_headdim_128.cpp
@@ -8,5 +8,8 @@
 
 #include "ck_tiled_fmha_batched_infer.h"
 
-template void run_batched_infer_causalmask_attnbias_dispatched<ck::bhalf_t, true, true, 128>(
-    BatchedForwardParams& param, hipStream_t stream);
+template void run_batched_infer_causalmask_attnbias_dispatched<
+    ck::bhalf_t,
+    true,
+    true,
+    128>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_bp16_with_causalmask_with_attnbias_headdim_256.cpp b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_bp16_with_causalmask_with_attnbias_headdim_256.cpp
index 31831836f..dff3a317a 100644
--- a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_bp16_with_causalmask_with_attnbias_headdim_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_bp16_with_causalmask_with_attnbias_headdim_256.cpp
@@ -8,5 +8,8 @@
 
 #include "ck_tiled_fmha_batched_infer.h"
 
-template void run_batched_infer_causalmask_attnbias_dispatched<ck::bhalf_t, true, true, 256>(
-    BatchedForwardParams& param, hipStream_t stream);
+template void run_batched_infer_causalmask_attnbias_dispatched<
+    ck::bhalf_t,
+    true,
+    true,
+    256>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_bp16_with_causalmask_with_attnbias_headdim_32.cpp b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_bp16_with_causalmask_with_attnbias_headdim_32.cpp
index 4866c0148..fa084941b 100644
--- a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_bp16_with_causalmask_with_attnbias_headdim_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_bp16_with_causalmask_with_attnbias_headdim_32.cpp
@@ -8,5 +8,8 @@
 
 #include "ck_tiled_fmha_batched_infer.h"
 
-template void run_batched_infer_causalmask_attnbias_dispatched<ck::bhalf_t, true, true, 32>(
-    BatchedForwardParams& param, hipStream_t stream);
+template void run_batched_infer_causalmask_attnbias_dispatched<
+    ck::bhalf_t,
+    true,
+    true,
+    32>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_bp16_with_causalmask_with_attnbias_headdim_64.cpp b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_bp16_with_causalmask_with_attnbias_headdim_64.cpp
index c87e7d2c2..d0ece69d0 100644
--- a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_bp16_with_causalmask_with_attnbias_headdim_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_bp16_with_causalmask_with_attnbias_headdim_64.cpp
@@ -8,5 +8,8 @@
 
 #include "ck_tiled_fmha_batched_infer.h"
 
-template void run_batched_infer_causalmask_attnbias_dispatched<ck::bhalf_t, true, true, 64>(
-    BatchedForwardParams& param, hipStream_t stream);
+template void run_batched_infer_causalmask_attnbias_dispatched<
+    ck::bhalf_t,
+    true,
+    true,
+    64>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_fp16_no_causalmask_no_attnbias_headdim_128.cpp b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_fp16_no_causalmask_no_attnbias_headdim_128.cpp
index d2b894e6b..8e9843a5e 100644
--- a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_fp16_no_causalmask_no_attnbias_headdim_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_fp16_no_causalmask_no_attnbias_headdim_128.cpp
@@ -8,5 +8,8 @@
 
 #include "ck_tiled_fmha_batched_infer.h"
 
-template void run_batched_infer_causalmask_attnbias_dispatched<ck::half_t, false, false, 128>(
-    BatchedForwardParams& param, hipStream_t stream);
+template void run_batched_infer_causalmask_attnbias_dispatched<
+    ck::half_t,
+    false,
+    false,
+    128>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_fp16_no_causalmask_no_attnbias_headdim_256.cpp b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_fp16_no_causalmask_no_attnbias_headdim_256.cpp
index a55ac98be..20580c11e 100644
--- a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_fp16_no_causalmask_no_attnbias_headdim_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_fp16_no_causalmask_no_attnbias_headdim_256.cpp
@@ -8,5 +8,8 @@
 
 #include "ck_tiled_fmha_batched_infer.h"
 
-template void run_batched_infer_causalmask_attnbias_dispatched<ck::half_t, false, false, 256>(
-    BatchedForwardParams& param, hipStream_t stream);
+template void run_batched_infer_causalmask_attnbias_dispatched<
+    ck::half_t,
+    false,
+    false,
+    256>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_fp16_no_causalmask_no_attnbias_headdim_32.cpp b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_fp16_no_causalmask_no_attnbias_headdim_32.cpp
index ab5c8bb2c..4e4d90f82 100644
--- a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_fp16_no_causalmask_no_attnbias_headdim_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_fp16_no_causalmask_no_attnbias_headdim_32.cpp
@@ -8,5 +8,8 @@
 
 #include "ck_tiled_fmha_batched_infer.h"
 
-template void run_batched_infer_causalmask_attnbias_dispatched<ck::half_t, false, false, 32>(
-    BatchedForwardParams& param, hipStream_t stream);
+template void run_batched_infer_causalmask_attnbias_dispatched<
+    ck::half_t,
+    false,
+    false,
+    32>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_fp16_no_causalmask_no_attnbias_headdim_64.cpp b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_fp16_no_causalmask_no_attnbias_headdim_64.cpp
index 282750da4..b36864534 100644
--- a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_fp16_no_causalmask_no_attnbias_headdim_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_fp16_no_causalmask_no_attnbias_headdim_64.cpp
@@ -8,5 +8,8 @@
 
 #include "ck_tiled_fmha_batched_infer.h"
 
-template void run_batched_infer_causalmask_attnbias_dispatched<ck::half_t, false, false, 64>(
-    BatchedForwardParams& param, hipStream_t stream);
+template void run_batched_infer_causalmask_attnbias_dispatched<
+    ck::half_t,
+    false,
+    false,
+    64>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_fp16_no_causalmask_with_attnbias_headdim_128.cpp b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_fp16_no_causalmask_with_attnbias_headdim_128.cpp
index 17d3a203b..2f16639ed 100644
--- a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_fp16_no_causalmask_with_attnbias_headdim_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_fp16_no_causalmask_with_attnbias_headdim_128.cpp
@@ -8,5 +8,8 @@
 
 #include "ck_tiled_fmha_batched_infer.h"
 
-template void run_batched_infer_causalmask_attnbias_dispatched<ck::half_t, false, true, 128>(
-    BatchedForwardParams& param, hipStream_t stream);
+template void run_batched_infer_causalmask_attnbias_dispatched<
+    ck::half_t,
+    false,
+    true,
+    128>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_fp16_no_causalmask_with_attnbias_headdim_256.cpp b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_fp16_no_causalmask_with_attnbias_headdim_256.cpp
index e4e7645e8..41f8249e9 100644
--- a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_fp16_no_causalmask_with_attnbias_headdim_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_fp16_no_causalmask_with_attnbias_headdim_256.cpp
@@ -8,5 +8,8 @@
 
 #include "ck_tiled_fmha_batched_infer.h"
 
-template void run_batched_infer_causalmask_attnbias_dispatched<ck::half_t, false, true, 256>(
-    BatchedForwardParams& param, hipStream_t stream);
+template void run_batched_infer_causalmask_attnbias_dispatched<
+    ck::half_t,
+    false,
+    true,
+    256>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_fp16_no_causalmask_with_attnbias_headdim_32.cpp b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_fp16_no_causalmask_with_attnbias_headdim_32.cpp
index 1b3a9a7c8..bfdf01423 100644
--- a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_fp16_no_causalmask_with_attnbias_headdim_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_fp16_no_causalmask_with_attnbias_headdim_32.cpp
@@ -8,5 +8,8 @@
 
 #include "ck_tiled_fmha_batched_infer.h"
 
-template void run_batched_infer_causalmask_attnbias_dispatched<ck::half_t, false, true, 32>(
-    BatchedForwardParams& param, hipStream_t stream);
+template void run_batched_infer_causalmask_attnbias_dispatched<
+    ck::half_t,
+    false,
+    true,
+    32>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_fp16_no_causalmask_with_attnbias_headdim_64.cpp b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_fp16_no_causalmask_with_attnbias_headdim_64.cpp
index 64c00b096..550831036 100644
--- a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_fp16_no_causalmask_with_attnbias_headdim_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_fp16_no_causalmask_with_attnbias_headdim_64.cpp
@@ -8,5 +8,8 @@
 
 #include "ck_tiled_fmha_batched_infer.h"
 
-template void run_batched_infer_causalmask_attnbias_dispatched<ck::half_t, false, true, 64>(
-    BatchedForwardParams& param, hipStream_t stream);
+template void run_batched_infer_causalmask_attnbias_dispatched<
+    ck::half_t,
+    false,
+    true,
+    64>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_fp16_with_causalmask_no_attnbias_headdim_128.cpp b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_fp16_with_causalmask_no_attnbias_headdim_128.cpp
index 9d24c03b9..8caa116d8 100644
--- a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_fp16_with_causalmask_no_attnbias_headdim_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_fp16_with_causalmask_no_attnbias_headdim_128.cpp
@@ -8,5 +8,8 @@
 
 #include "ck_tiled_fmha_batched_infer.h"
 
-template void run_batched_infer_causalmask_attnbias_dispatched<ck::half_t, true, false, 128>(
-    BatchedForwardParams& param, hipStream_t stream);
+template void run_batched_infer_causalmask_attnbias_dispatched<
+    ck::half_t,
+    true,
+    false,
+    128>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_fp16_with_causalmask_no_attnbias_headdim_256.cpp b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_fp16_with_causalmask_no_attnbias_headdim_256.cpp
index ab81e906d..0468ba8af 100644
--- a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_fp16_with_causalmask_no_attnbias_headdim_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_fp16_with_causalmask_no_attnbias_headdim_256.cpp
@@ -8,5 +8,8 @@
 
 #include "ck_tiled_fmha_batched_infer.h"
 
-template void run_batched_infer_causalmask_attnbias_dispatched<ck::half_t, true, false, 256>(
-    BatchedForwardParams& param, hipStream_t stream);
+template void run_batched_infer_causalmask_attnbias_dispatched<
+    ck::half_t,
+    true,
+    false,
+    256>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_fp16_with_causalmask_no_attnbias_headdim_32.cpp b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_fp16_with_causalmask_no_attnbias_headdim_32.cpp
index 5417efb52..cd8077b51 100644
--- a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_fp16_with_causalmask_no_attnbias_headdim_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_fp16_with_causalmask_no_attnbias_headdim_32.cpp
@@ -8,5 +8,8 @@
 
 #include "ck_tiled_fmha_batched_infer.h"
 
-template void run_batched_infer_causalmask_attnbias_dispatched<ck::half_t, true, false, 32>(
-    BatchedForwardParams& param, hipStream_t stream);
+template void run_batched_infer_causalmask_attnbias_dispatched<
+    ck::half_t,
+    true,
+    false,
+    32>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_fp16_with_causalmask_no_attnbias_headdim_64.cpp b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_fp16_with_causalmask_no_attnbias_headdim_64.cpp
index 3b55e45b8..ed22d8fc5 100644
--- a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_fp16_with_causalmask_no_attnbias_headdim_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_fp16_with_causalmask_no_attnbias_headdim_64.cpp
@@ -8,5 +8,8 @@
 
 #include "ck_tiled_fmha_batched_infer.h"
 
-template void run_batched_infer_causalmask_attnbias_dispatched<ck::half_t, true, false, 64>(
-    BatchedForwardParams& param, hipStream_t stream);
+template void run_batched_infer_causalmask_attnbias_dispatched<
+    ck::half_t,
+    true,
+    false,
+    64>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_fp16_with_causalmask_with_attnbias_headdim_128.cpp b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_fp16_with_causalmask_with_attnbias_headdim_128.cpp
index e7f76cd58..1ae833e7d 100644
--- a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_fp16_with_causalmask_with_attnbias_headdim_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_fp16_with_causalmask_with_attnbias_headdim_128.cpp
@@ -8,5 +8,8 @@
 
 #include "ck_tiled_fmha_batched_infer.h"
 
-template void run_batched_infer_causalmask_attnbias_dispatched<ck::half_t, true, true, 128>(
-    BatchedForwardParams& param, hipStream_t stream);
+template void run_batched_infer_causalmask_attnbias_dispatched<
+    ck::half_t,
+    true,
+    true,
+    128>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_fp16_with_causalmask_with_attnbias_headdim_256.cpp b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_fp16_with_causalmask_with_attnbias_headdim_256.cpp
index 2d5edfc0f..bb9a177b5 100644
--- a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_fp16_with_causalmask_with_attnbias_headdim_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_fp16_with_causalmask_with_attnbias_headdim_256.cpp
@@ -8,5 +8,8 @@
 
 #include "ck_tiled_fmha_batched_infer.h"
 
-template void run_batched_infer_causalmask_attnbias_dispatched<ck::half_t, true, true, 256>(
-    BatchedForwardParams& param, hipStream_t stream);
+template void run_batched_infer_causalmask_attnbias_dispatched<
+    ck::half_t,
+    true,
+    true,
+    256>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_fp16_with_causalmask_with_attnbias_headdim_32.cpp b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_fp16_with_causalmask_with_attnbias_headdim_32.cpp
index ff21e5051..88945231f 100644
--- a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_fp16_with_causalmask_with_attnbias_headdim_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_fp16_with_causalmask_with_attnbias_headdim_32.cpp
@@ -8,5 +8,8 @@
 
 #include "ck_tiled_fmha_batched_infer.h"
 
-template void run_batched_infer_causalmask_attnbias_dispatched<ck::half_t, true, true, 32>(
-    BatchedForwardParams& param, hipStream_t stream);
+template void run_batched_infer_causalmask_attnbias_dispatched<
+    ck::half_t,
+    true,
+    true,
+    32>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_fp16_with_causalmask_with_attnbias_headdim_64.cpp b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_fp16_with_causalmask_with_attnbias_headdim_64.cpp
index 316457d7b..330e0dfbc 100644
--- a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_fp16_with_causalmask_with_attnbias_headdim_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_fp16_with_causalmask_with_attnbias_headdim_64.cpp
@@ -8,5 +8,8 @@
 
 #include "ck_tiled_fmha_batched_infer.h"
 
-template void run_batched_infer_causalmask_attnbias_dispatched<ck::half_t, true, true, 64>(
-    BatchedForwardParams& param, hipStream_t stream);
+template void run_batched_infer_causalmask_attnbias_dispatched<
+    ck::half_t,
+    true,
+    true,
+    64>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_forward_bp16_no_causalmask_no_attnbias_headdim_128.cpp b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_forward_bp16_no_causalmask_no_attnbias_headdim_128.cpp
index ede42cd70..d278e2b0b 100644
--- a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_forward_bp16_no_causalmask_no_attnbias_headdim_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_forward_bp16_no_causalmask_no_attnbias_headdim_128.cpp
@@ -8,5 +8,8 @@
 
 #include "ck_tiled_fmha_grouped_forward.h"
 
-template void run_grouped_forward_causalmask_attnbias_dispatched<ck::bhalf_t, false, false, 128>(
-    GroupedForwardParams& param, hipStream_t stream);
+template void run_grouped_forward_causalmask_attnbias_dispatched<
+    ck::bhalf_t,
+    false,
+    false,
+    128>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_forward_bp16_no_causalmask_no_attnbias_headdim_256.cpp b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_forward_bp16_no_causalmask_no_attnbias_headdim_256.cpp
index 4452ef80e..2bd6d042a 100644
--- a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_forward_bp16_no_causalmask_no_attnbias_headdim_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_forward_bp16_no_causalmask_no_attnbias_headdim_256.cpp
@@ -8,5 +8,8 @@
 
 #include "ck_tiled_fmha_grouped_forward.h"
 
-template void run_grouped_forward_causalmask_attnbias_dispatched<ck::bhalf_t, false, false, 256>(
-    GroupedForwardParams& param, hipStream_t stream);
+template void run_grouped_forward_causalmask_attnbias_dispatched<
+    ck::bhalf_t,
+    false,
+    false,
+    256>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_forward_bp16_no_causalmask_no_attnbias_headdim_32.cpp b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_forward_bp16_no_causalmask_no_attnbias_headdim_32.cpp
index 7de8d370c..732381a8a 100644
--- a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_forward_bp16_no_causalmask_no_attnbias_headdim_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_forward_bp16_no_causalmask_no_attnbias_headdim_32.cpp
@@ -8,5 +8,8 @@
 
 #include "ck_tiled_fmha_grouped_forward.h"
 
-template void run_grouped_forward_causalmask_attnbias_dispatched<ck::bhalf_t, false, false, 32>(
-    GroupedForwardParams& param, hipStream_t stream);
+template void run_grouped_forward_causalmask_attnbias_dispatched<
+    ck::bhalf_t,
+    false,
+    false,
+    32>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_forward_bp16_no_causalmask_no_attnbias_headdim_64.cpp b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_forward_bp16_no_causalmask_no_attnbias_headdim_64.cpp
index 66f084dc4..352d94bb4 100644
--- a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_forward_bp16_no_causalmask_no_attnbias_headdim_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_forward_bp16_no_causalmask_no_attnbias_headdim_64.cpp
@@ -8,5 +8,8 @@
 
 #include "ck_tiled_fmha_grouped_forward.h"
 
-template void run_grouped_forward_causalmask_attnbias_dispatched<ck::bhalf_t, false, false, 64>(
-    GroupedForwardParams& param, hipStream_t stream);
+template void run_grouped_forward_causalmask_attnbias_dispatched<
+    ck::bhalf_t,
+    false,
+    false,
+    64>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_forward_bp16_no_causalmask_with_attnbias_headdim_128.cpp b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_forward_bp16_no_causalmask_with_attnbias_headdim_128.cpp
index 894b979d0..ebd002ef4 100644
--- a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_forward_bp16_no_causalmask_with_attnbias_headdim_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_forward_bp16_no_causalmask_with_attnbias_headdim_128.cpp
@@ -8,5 +8,8 @@
 
 #include "ck_tiled_fmha_grouped_forward.h"
 
-template void run_grouped_forward_causalmask_attnbias_dispatched<ck::bhalf_t, false, true, 128>(
-    GroupedForwardParams& param, hipStream_t stream);
+template void run_grouped_forward_causalmask_attnbias_dispatched<
+    ck::bhalf_t,
+    false,
+    true,
+    128>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_forward_bp16_no_causalmask_with_attnbias_headdim_256.cpp b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_forward_bp16_no_causalmask_with_attnbias_headdim_256.cpp
index 53346a196..844444629 100644
--- a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_forward_bp16_no_causalmask_with_attnbias_headdim_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_forward_bp16_no_causalmask_with_attnbias_headdim_256.cpp
@@ -8,5 +8,8 @@
 
 #include "ck_tiled_fmha_grouped_forward.h"
 
-template void run_grouped_forward_causalmask_attnbias_dispatched<ck::bhalf_t, false, true, 256>(
-    GroupedForwardParams& param, hipStream_t stream);
+template void run_grouped_forward_causalmask_attnbias_dispatched<
+    ck::bhalf_t,
+    false,
+    true,
+    256>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_forward_bp16_no_causalmask_with_attnbias_headdim_32.cpp b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_forward_bp16_no_causalmask_with_attnbias_headdim_32.cpp
index fc0329da0..52b5cb895 100644
--- a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_forward_bp16_no_causalmask_with_attnbias_headdim_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_forward_bp16_no_causalmask_with_attnbias_headdim_32.cpp
@@ -8,5 +8,8 @@
 
 #include "ck_tiled_fmha_grouped_forward.h"
 
-template void run_grouped_forward_causalmask_attnbias_dispatched<ck::bhalf_t, false, true, 32>(
-    GroupedForwardParams& param, hipStream_t stream);
+template void run_grouped_forward_causalmask_attnbias_dispatched<
+    ck::bhalf_t,
+    false,
+    true,
+    32>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_forward_bp16_no_causalmask_with_attnbias_headdim_64.cpp b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_forward_bp16_no_causalmask_with_attnbias_headdim_64.cpp
index 4e169225d..35a058368 100644
--- a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_forward_bp16_no_causalmask_with_attnbias_headdim_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_forward_bp16_no_causalmask_with_attnbias_headdim_64.cpp
@@ -8,5 +8,8 @@
 
 #include "ck_tiled_fmha_grouped_forward.h"
 
-template void run_grouped_forward_causalmask_attnbias_dispatched<ck::bhalf_t, false, true, 64>(
-    GroupedForwardParams& param, hipStream_t stream);
+template void run_grouped_forward_causalmask_attnbias_dispatched<
+    ck::bhalf_t,
+    false,
+    true,
+    64>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_forward_bp16_with_causalmask_no_attnbias_headdim_128.cpp b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_forward_bp16_with_causalmask_no_attnbias_headdim_128.cpp
index 19e997418..697ce6345 100644
--- a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_forward_bp16_with_causalmask_no_attnbias_headdim_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_forward_bp16_with_causalmask_no_attnbias_headdim_128.cpp
@@ -8,5 +8,8 @@
 
 #include "ck_tiled_fmha_grouped_forward.h"
 
-template void run_grouped_forward_causalmask_attnbias_dispatched<ck::bhalf_t, true, false, 128>(
-    GroupedForwardParams& param, hipStream_t stream);
+template void run_grouped_forward_causalmask_attnbias_dispatched<
+    ck::bhalf_t,
+    true,
+    false,
+    128>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_forward_bp16_with_causalmask_no_attnbias_headdim_256.cpp b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_forward_bp16_with_causalmask_no_attnbias_headdim_256.cpp
index 86cb616c3..cc24c03c0 100644
--- a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_forward_bp16_with_causalmask_no_attnbias_headdim_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_forward_bp16_with_causalmask_no_attnbias_headdim_256.cpp
@@ -8,5 +8,8 @@
 
 #include "ck_tiled_fmha_grouped_forward.h"
 
-template void run_grouped_forward_causalmask_attnbias_dispatched<ck::bhalf_t, true, false, 256>(
-    GroupedForwardParams& param, hipStream_t stream);
+template void run_grouped_forward_causalmask_attnbias_dispatched<
+    ck::bhalf_t,
+    true,
+    false,
+    256>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_forward_bp16_with_causalmask_no_attnbias_headdim_32.cpp b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_forward_bp16_with_causalmask_no_attnbias_headdim_32.cpp
index f9b6f38eb..e0d0f9e03 100644
--- a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_forward_bp16_with_causalmask_no_attnbias_headdim_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_forward_bp16_with_causalmask_no_attnbias_headdim_32.cpp
@@ -8,5 +8,8 @@
 
 #include "ck_tiled_fmha_grouped_forward.h"
 
-template void run_grouped_forward_causalmask_attnbias_dispatched<ck::bhalf_t, true, false, 32>(
-    GroupedForwardParams& param, hipStream_t stream);
+template void run_grouped_forward_causalmask_attnbias_dispatched<
+    ck::bhalf_t,
+    true,
+    false,
+    32>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_forward_bp16_with_causalmask_no_attnbias_headdim_64.cpp b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_forward_bp16_with_causalmask_no_attnbias_headdim_64.cpp
index 64433cc55..c658c89f2 100644
--- a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_forward_bp16_with_causalmask_no_attnbias_headdim_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_forward_bp16_with_causalmask_no_attnbias_headdim_64.cpp
@@ -8,5 +8,8 @@
 
 #include "ck_tiled_fmha_grouped_forward.h"
 
-template void run_grouped_forward_causalmask_attnbias_dispatched<ck::bhalf_t, true, false, 64>(
-    GroupedForwardParams& param, hipStream_t stream);
+template void run_grouped_forward_causalmask_attnbias_dispatched<
+    ck::bhalf_t,
+    true,
+    false,
+    64>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_forward_bp16_with_causalmask_with_attnbias_headdim_128.cpp b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_forward_bp16_with_causalmask_with_attnbias_headdim_128.cpp
index b2df4367b..785e62d78 100644
--- a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_forward_bp16_with_causalmask_with_attnbias_headdim_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_forward_bp16_with_causalmask_with_attnbias_headdim_128.cpp
@@ -8,5 +8,8 @@
 
 #include "ck_tiled_fmha_grouped_forward.h"
 
-template void run_grouped_forward_causalmask_attnbias_dispatched<ck::bhalf_t, true, true, 128>(
-    GroupedForwardParams& param, hipStream_t stream);
+template void run_grouped_forward_causalmask_attnbias_dispatched<
+    ck::bhalf_t,
+    true,
+    true,
+    128>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_forward_bp16_with_causalmask_with_attnbias_headdim_256.cpp b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_forward_bp16_with_causalmask_with_attnbias_headdim_256.cpp
index de62061b5..83001360b 100644
--- a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_forward_bp16_with_causalmask_with_attnbias_headdim_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_forward_bp16_with_causalmask_with_attnbias_headdim_256.cpp
@@ -8,5 +8,8 @@
 
 #include "ck_tiled_fmha_grouped_forward.h"
 
-template void run_grouped_forward_causalmask_attnbias_dispatched<ck::bhalf_t, true, true, 256>(
-    GroupedForwardParams& param, hipStream_t stream);
+template void run_grouped_forward_causalmask_attnbias_dispatched<
+    ck::bhalf_t,
+    true,
+    true,
+    256>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_forward_bp16_with_causalmask_with_attnbias_headdim_32.cpp b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_forward_bp16_with_causalmask_with_attnbias_headdim_32.cpp
index 604a12985..ed45ccf36 100644
--- a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_forward_bp16_with_causalmask_with_attnbias_headdim_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_forward_bp16_with_causalmask_with_attnbias_headdim_32.cpp
@@ -8,5 +8,8 @@
 
 #include "ck_tiled_fmha_grouped_forward.h"
 
-template void run_grouped_forward_causalmask_attnbias_dispatched<ck::bhalf_t, true, true, 32>(
-    GroupedForwardParams& param, hipStream_t stream);
+template void run_grouped_forward_causalmask_attnbias_dispatched<
+    ck::bhalf_t,
+    true,
+    true,
+    32>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_forward_bp16_with_causalmask_with_attnbias_headdim_64.cpp b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_forward_bp16_with_causalmask_with_attnbias_headdim_64.cpp
index 985fe0a74..f0b639ef6 100644
--- a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_forward_bp16_with_causalmask_with_attnbias_headdim_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_forward_bp16_with_causalmask_with_attnbias_headdim_64.cpp
@@ -8,5 +8,8 @@
 
 #include "ck_tiled_fmha_grouped_forward.h"
 
-template void run_grouped_forward_causalmask_attnbias_dispatched<ck::bhalf_t, true, true, 64>(
-    GroupedForwardParams& param, hipStream_t stream);
+template void run_grouped_forward_causalmask_attnbias_dispatched<
+    ck::bhalf_t,
+    true,
+    true,
+    64>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_forward_fp16_no_causalmask_no_attnbias_headdim_128.cpp b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_forward_fp16_no_causalmask_no_attnbias_headdim_128.cpp
index 7c905fcc1..08bf47cd5 100644
--- a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_forward_fp16_no_causalmask_no_attnbias_headdim_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_forward_fp16_no_causalmask_no_attnbias_headdim_128.cpp
@@ -8,5 +8,8 @@
 
 #include "ck_tiled_fmha_grouped_forward.h"
 
-template void run_grouped_forward_causalmask_attnbias_dispatched<ck::half_t, false, false, 128>(
-    GroupedForwardParams& param, hipStream_t stream);
+template void run_grouped_forward_causalmask_attnbias_dispatched<
+    ck::half_t,
+    false,
+    false,
+    128>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_forward_fp16_no_causalmask_no_attnbias_headdim_256.cpp b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_forward_fp16_no_causalmask_no_attnbias_headdim_256.cpp
index bcd9cbf9a..8c4c0c440 100644
--- a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_forward_fp16_no_causalmask_no_attnbias_headdim_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_forward_fp16_no_causalmask_no_attnbias_headdim_256.cpp
@@ -8,5 +8,8 @@
 
 #include "ck_tiled_fmha_grouped_forward.h"
 
-template void run_grouped_forward_causalmask_attnbias_dispatched<ck::half_t, false, false, 256>(
-    GroupedForwardParams& param, hipStream_t stream);
+template void run_grouped_forward_causalmask_attnbias_dispatched<
+    ck::half_t,
+    false,
+    false,
+    256>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_forward_fp16_no_causalmask_no_attnbias_headdim_32.cpp b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_forward_fp16_no_causalmask_no_attnbias_headdim_32.cpp
index 0be43523f..2ff6c73e7 100644
--- a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_forward_fp16_no_causalmask_no_attnbias_headdim_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_forward_fp16_no_causalmask_no_attnbias_headdim_32.cpp
@@ -8,5 +8,8 @@
 
 #include "ck_tiled_fmha_grouped_forward.h"
 
-template void run_grouped_forward_causalmask_attnbias_dispatched<ck::half_t, false, false, 32>(
-    GroupedForwardParams& param, hipStream_t stream);
+template void run_grouped_forward_causalmask_attnbias_dispatched<
+    ck::half_t,
+    false,
+    false,
+    32>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_forward_fp16_no_causalmask_no_attnbias_headdim_64.cpp b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_forward_fp16_no_causalmask_no_attnbias_headdim_64.cpp
index fd490972a..b5ec1a781 100644
--- a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_forward_fp16_no_causalmask_no_attnbias_headdim_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_forward_fp16_no_causalmask_no_attnbias_headdim_64.cpp
@@ -8,5 +8,8 @@
 
 #include "ck_tiled_fmha_grouped_forward.h"
 
-template void run_grouped_forward_causalmask_attnbias_dispatched<ck::half_t, false, false, 64>(
-    GroupedForwardParams& param, hipStream_t stream);
+template void run_grouped_forward_causalmask_attnbias_dispatched<
+    ck::half_t,
+    false,
+    false,
+    64>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_forward_fp16_no_causalmask_with_attnbias_headdim_128.cpp b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_forward_fp16_no_causalmask_with_attnbias_headdim_128.cpp
index 0722ee7df..c7ba7f09e 100644
--- a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_forward_fp16_no_causalmask_with_attnbias_headdim_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_forward_fp16_no_causalmask_with_attnbias_headdim_128.cpp
@@ -8,5 +8,8 @@
 
 #include "ck_tiled_fmha_grouped_forward.h"
 
-template void run_grouped_forward_causalmask_attnbias_dispatched<ck::half_t, false, true, 128>(
-    GroupedForwardParams& param, hipStream_t stream);
+template void run_grouped_forward_causalmask_attnbias_dispatched<
+    ck::half_t,
+    false,
+    true,
+    128>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_forward_fp16_no_causalmask_with_attnbias_headdim_256.cpp b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_forward_fp16_no_causalmask_with_attnbias_headdim_256.cpp
index 9d6178ab8..577f1a1ae 100644
--- a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_forward_fp16_no_causalmask_with_attnbias_headdim_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_forward_fp16_no_causalmask_with_attnbias_headdim_256.cpp
@@ -8,5 +8,8 @@
 
 #include "ck_tiled_fmha_grouped_forward.h"
 
-template void run_grouped_forward_causalmask_attnbias_dispatched<ck::half_t, false, true, 256>(
-    GroupedForwardParams& param, hipStream_t stream);
+template void run_grouped_forward_causalmask_attnbias_dispatched<
+    ck::half_t,
+    false,
+    true,
+    256>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_forward_fp16_no_causalmask_with_attnbias_headdim_32.cpp b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_forward_fp16_no_causalmask_with_attnbias_headdim_32.cpp
index db9e4fbd5..cd1bda5d1 100644
--- a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_forward_fp16_no_causalmask_with_attnbias_headdim_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_forward_fp16_no_causalmask_with_attnbias_headdim_32.cpp
@@ -8,5 +8,8 @@
 
 #include "ck_tiled_fmha_grouped_forward.h"
 
-template void run_grouped_forward_causalmask_attnbias_dispatched<ck::half_t, false, true, 32>(
-    GroupedForwardParams& param, hipStream_t stream);
+template void run_grouped_forward_causalmask_attnbias_dispatched<
+    ck::half_t,
+    false,
+    true,
+    32>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_forward_fp16_no_causalmask_with_attnbias_headdim_64.cpp b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_forward_fp16_no_causalmask_with_attnbias_headdim_64.cpp
index ae0842444..caa6f0d16 100644
--- a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_forward_fp16_no_causalmask_with_attnbias_headdim_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_forward_fp16_no_causalmask_with_attnbias_headdim_64.cpp
@@ -8,5 +8,8 @@
 
 #include "ck_tiled_fmha_grouped_forward.h"
 
-template void run_grouped_forward_causalmask_attnbias_dispatched<ck::half_t, false, true, 64>(
-    GroupedForwardParams& param, hipStream_t stream);
+template void run_grouped_forward_causalmask_attnbias_dispatched<
+    ck::half_t,
+    false,
+    true,
+    64>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_forward_fp16_with_causalmask_no_attnbias_headdim_128.cpp b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_forward_fp16_with_causalmask_no_attnbias_headdim_128.cpp
index fe1c3f8c0..e0349f471 100644
--- a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_forward_fp16_with_causalmask_no_attnbias_headdim_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_forward_fp16_with_causalmask_no_attnbias_headdim_128.cpp
@@ -8,5 +8,8 @@
 
 #include "ck_tiled_fmha_grouped_forward.h"
 
-template void run_grouped_forward_causalmask_attnbias_dispatched<ck::half_t, true, false, 128>(
-    GroupedForwardParams& param, hipStream_t stream);
+template void run_grouped_forward_causalmask_attnbias_dispatched<
+    ck::half_t,
+    true,
+    false,
+    128>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_forward_fp16_with_causalmask_no_attnbias_headdim_256.cpp b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_forward_fp16_with_causalmask_no_attnbias_headdim_256.cpp
index d246e0dca..58d7cec79 100644
--- a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_forward_fp16_with_causalmask_no_attnbias_headdim_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_forward_fp16_with_causalmask_no_attnbias_headdim_256.cpp
@@ -8,5 +8,8 @@
 
 #include "ck_tiled_fmha_grouped_forward.h"
 
-template void run_grouped_forward_causalmask_attnbias_dispatched<ck::half_t, true, false, 256>(
-    GroupedForwardParams& param, hipStream_t stream);
+template void run_grouped_forward_causalmask_attnbias_dispatched<
+    ck::half_t,
+    true,
+    false,
+    256>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_forward_fp16_with_causalmask_no_attnbias_headdim_32.cpp b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_forward_fp16_with_causalmask_no_attnbias_headdim_32.cpp
index 611d7bfb8..a9a2a191e 100644
--- a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_forward_fp16_with_causalmask_no_attnbias_headdim_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_forward_fp16_with_causalmask_no_attnbias_headdim_32.cpp
@@ -8,5 +8,8 @@
 
 #include "ck_tiled_fmha_grouped_forward.h"
 
-template void run_grouped_forward_causalmask_attnbias_dispatched<ck::half_t, true, false, 32>(
-    GroupedForwardParams& param, hipStream_t stream);
+template void run_grouped_forward_causalmask_attnbias_dispatched<
+    ck::half_t,
+    true,
+    false,
+    32>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_forward_fp16_with_causalmask_no_attnbias_headdim_64.cpp b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_forward_fp16_with_causalmask_no_attnbias_headdim_64.cpp
index 2b9d7a2c6..8eb2447a8 100644
--- a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_forward_fp16_with_causalmask_no_attnbias_headdim_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_forward_fp16_with_causalmask_no_attnbias_headdim_64.cpp
@@ -8,5 +8,8 @@
 
 #include "ck_tiled_fmha_grouped_forward.h"
 
-template void run_grouped_forward_causalmask_attnbias_dispatched<ck::half_t, true, false, 64>(
-    GroupedForwardParams& param, hipStream_t stream);
+template void run_grouped_forward_causalmask_attnbias_dispatched<
+    ck::half_t,
+    true,
+    false,
+    64>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_forward_fp16_with_causalmask_with_attnbias_headdim_128.cpp b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_forward_fp16_with_causalmask_with_attnbias_headdim_128.cpp
index 165e61310..c83769098 100644
--- a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_forward_fp16_with_causalmask_with_attnbias_headdim_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_forward_fp16_with_causalmask_with_attnbias_headdim_128.cpp
@@ -8,5 +8,8 @@
 
 #include "ck_tiled_fmha_grouped_forward.h"
 
-template void run_grouped_forward_causalmask_attnbias_dispatched<ck::half_t, true, true, 128>(
-    GroupedForwardParams& param, hipStream_t stream);
+template void run_grouped_forward_causalmask_attnbias_dispatched<
+    ck::half_t,
+    true,
+    true,
+    128>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_forward_fp16_with_causalmask_with_attnbias_headdim_256.cpp b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_forward_fp16_with_causalmask_with_attnbias_headdim_256.cpp
index 5496abe4c..fe21d52fe 100644
--- a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_forward_fp16_with_causalmask_with_attnbias_headdim_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_forward_fp16_with_causalmask_with_attnbias_headdim_256.cpp
@@ -8,5 +8,8 @@
 
 #include "ck_tiled_fmha_grouped_forward.h"
 
-template void run_grouped_forward_causalmask_attnbias_dispatched<ck::half_t, true, true, 256>(
-    GroupedForwardParams& param, hipStream_t stream);
+template void run_grouped_forward_causalmask_attnbias_dispatched<
+    ck::half_t,
+    true,
+    true,
+    256>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_forward_fp16_with_causalmask_with_attnbias_headdim_32.cpp b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_forward_fp16_with_causalmask_with_attnbias_headdim_32.cpp
index deb14598a..6bedae2d2 100644
--- a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_forward_fp16_with_causalmask_with_attnbias_headdim_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_forward_fp16_with_causalmask_with_attnbias_headdim_32.cpp
@@ -8,5 +8,8 @@
 
 #include "ck_tiled_fmha_grouped_forward.h"
 
-template void run_grouped_forward_causalmask_attnbias_dispatched<ck::half_t, true, true, 32>(
-    GroupedForwardParams& param, hipStream_t stream);
+template void run_grouped_forward_causalmask_attnbias_dispatched<
+    ck::half_t,
+    true,
+    true,
+    32>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_forward_fp16_with_causalmask_with_attnbias_headdim_64.cpp b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_forward_fp16_with_causalmask_with_attnbias_headdim_64.cpp
index f803b0f05..a45a99b80 100644
--- a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_forward_fp16_with_causalmask_with_attnbias_headdim_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_forward_fp16_with_causalmask_with_attnbias_headdim_64.cpp
@@ -8,5 +8,8 @@
 
 #include "ck_tiled_fmha_grouped_forward.h"
 
-template void run_grouped_forward_causalmask_attnbias_dispatched<ck::half_t, true, true, 64>(
-    GroupedForwardParams& param, hipStream_t stream);
+template void run_grouped_forward_causalmask_attnbias_dispatched<
+    ck::half_t,
+    true,
+    true,
+    64>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_bp16_no_causalmask_no_attnbias_headdim_128.cpp b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_bp16_no_causalmask_no_attnbias_headdim_128.cpp
index 66d6ce7de..54cbec7ec 100644
--- a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_bp16_no_causalmask_no_attnbias_headdim_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_bp16_no_causalmask_no_attnbias_headdim_128.cpp
@@ -8,5 +8,8 @@
 
 #include "ck_tiled_fmha_grouped_infer.h"
 
-template void run_grouped_infer_causalmask_attnbias_dispatched<ck::bhalf_t, false, false, 128>(
-    GroupedForwardParams& param, hipStream_t stream);
+template void run_grouped_infer_causalmask_attnbias_dispatched<
+    ck::bhalf_t,
+    false,
+    false,
+    128>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_bp16_no_causalmask_no_attnbias_headdim_256.cpp b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_bp16_no_causalmask_no_attnbias_headdim_256.cpp
index 819794d6f..12b67ea45 100644
--- a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_bp16_no_causalmask_no_attnbias_headdim_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_bp16_no_causalmask_no_attnbias_headdim_256.cpp
@@ -8,5 +8,8 @@
 
 #include "ck_tiled_fmha_grouped_infer.h"
 
-template void run_grouped_infer_causalmask_attnbias_dispatched<ck::bhalf_t, false, false, 256>(
-    GroupedForwardParams& param, hipStream_t stream);
+template void run_grouped_infer_causalmask_attnbias_dispatched<
+    ck::bhalf_t,
+    false,
+    false,
+    256>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_bp16_no_causalmask_no_attnbias_headdim_32.cpp b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_bp16_no_causalmask_no_attnbias_headdim_32.cpp
index fa94726d7..d6c6c1a5d 100644
--- a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_bp16_no_causalmask_no_attnbias_headdim_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_bp16_no_causalmask_no_attnbias_headdim_32.cpp
@@ -8,5 +8,8 @@
 
 #include "ck_tiled_fmha_grouped_infer.h"
 
-template void run_grouped_infer_causalmask_attnbias_dispatched<ck::bhalf_t, false, false, 32>(
-    GroupedForwardParams& param, hipStream_t stream);
+template void run_grouped_infer_causalmask_attnbias_dispatched<
+    ck::bhalf_t,
+    false,
+    false,
+    32>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_bp16_no_causalmask_no_attnbias_headdim_64.cpp b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_bp16_no_causalmask_no_attnbias_headdim_64.cpp
index d8f96bdb9..c74dbe200 100644
--- a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_bp16_no_causalmask_no_attnbias_headdim_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_bp16_no_causalmask_no_attnbias_headdim_64.cpp
@@ -8,5 +8,8 @@
 
 #include "ck_tiled_fmha_grouped_infer.h"
 
-template void run_grouped_infer_causalmask_attnbias_dispatched<ck::bhalf_t, false, false, 64>(
-    GroupedForwardParams& param, hipStream_t stream);
+template void run_grouped_infer_causalmask_attnbias_dispatched<
+    ck::bhalf_t,
+    false,
+    false,
+    64>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_bp16_no_causalmask_with_attnbias_headdim_128.cpp b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_bp16_no_causalmask_with_attnbias_headdim_128.cpp
index c42eade65..35b522a6a 100644
--- a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_bp16_no_causalmask_with_attnbias_headdim_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_bp16_no_causalmask_with_attnbias_headdim_128.cpp
@@ -8,5 +8,8 @@
 
 #include "ck_tiled_fmha_grouped_infer.h"
 
-template void run_grouped_infer_causalmask_attnbias_dispatched<ck::bhalf_t, false, true, 128>(
-    GroupedForwardParams& param, hipStream_t stream);
+template void run_grouped_infer_causalmask_attnbias_dispatched<
+    ck::bhalf_t,
+    false,
+    true,
+    128>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_bp16_no_causalmask_with_attnbias_headdim_256.cpp b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_bp16_no_causalmask_with_attnbias_headdim_256.cpp
index 357eb57b1..4fb8bdd59 100644
--- a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_bp16_no_causalmask_with_attnbias_headdim_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_bp16_no_causalmask_with_attnbias_headdim_256.cpp
@@ -8,5 +8,8 @@
 
 #include "ck_tiled_fmha_grouped_infer.h"
 
-template void run_grouped_infer_causalmask_attnbias_dispatched<ck::bhalf_t, false, true, 256>(
-    GroupedForwardParams& param, hipStream_t stream);
+template void run_grouped_infer_causalmask_attnbias_dispatched<
+    ck::bhalf_t,
+    false,
+    true,
+    256>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_bp16_no_causalmask_with_attnbias_headdim_32.cpp b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_bp16_no_causalmask_with_attnbias_headdim_32.cpp
index 6ad131cd6..1d2cd2656 100644
--- a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_bp16_no_causalmask_with_attnbias_headdim_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_bp16_no_causalmask_with_attnbias_headdim_32.cpp
@@ -8,5 +8,8 @@
 
 #include "ck_tiled_fmha_grouped_infer.h"
 
-template void run_grouped_infer_causalmask_attnbias_dispatched<ck::bhalf_t, false, true, 32>(
-    GroupedForwardParams& param, hipStream_t stream);
+template void run_grouped_infer_causalmask_attnbias_dispatched<
+    ck::bhalf_t,
+    false,
+    true,
+    32>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_bp16_no_causalmask_with_attnbias_headdim_64.cpp b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_bp16_no_causalmask_with_attnbias_headdim_64.cpp
index f6131197a..2ccb25769 100644
--- a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_bp16_no_causalmask_with_attnbias_headdim_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_bp16_no_causalmask_with_attnbias_headdim_64.cpp
@@ -8,5 +8,8 @@
 
 #include "ck_tiled_fmha_grouped_infer.h"
 
-template void run_grouped_infer_causalmask_attnbias_dispatched<ck::bhalf_t, false, true, 64>(
-    GroupedForwardParams& param, hipStream_t stream);
+template void run_grouped_infer_causalmask_attnbias_dispatched<
+    ck::bhalf_t,
+    false,
+    true,
+    64>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_bp16_with_causalmask_no_attnbias_headdim_128.cpp b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_bp16_with_causalmask_no_attnbias_headdim_128.cpp
index 15c6d599a..2f8ea04e7 100644
--- a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_bp16_with_causalmask_no_attnbias_headdim_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_bp16_with_causalmask_no_attnbias_headdim_128.cpp
@@ -8,5 +8,8 @@
 
 #include "ck_tiled_fmha_grouped_infer.h"
 
-template void run_grouped_infer_causalmask_attnbias_dispatched<ck::bhalf_t, true, false, 128>(
-    GroupedForwardParams& param, hipStream_t stream);
+template void run_grouped_infer_causalmask_attnbias_dispatched<
+    ck::bhalf_t,
+    true,
+    false,
+    128>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_bp16_with_causalmask_no_attnbias_headdim_256.cpp b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_bp16_with_causalmask_no_attnbias_headdim_256.cpp
index 7f7229c8b..f10999c7c 100644
--- a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_bp16_with_causalmask_no_attnbias_headdim_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_bp16_with_causalmask_no_attnbias_headdim_256.cpp
@@ -8,5 +8,8 @@
 
 #include "ck_tiled_fmha_grouped_infer.h"
 
-template void run_grouped_infer_causalmask_attnbias_dispatched<ck::bhalf_t, true, false, 256>(
-    GroupedForwardParams& param, hipStream_t stream);
+template void run_grouped_infer_causalmask_attnbias_dispatched<
+    ck::bhalf_t,
+    true,
+    false,
+    256>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_bp16_with_causalmask_no_attnbias_headdim_32.cpp b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_bp16_with_causalmask_no_attnbias_headdim_32.cpp
index bdc6996c2..f87772024 100644
--- a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_bp16_with_causalmask_no_attnbias_headdim_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_bp16_with_causalmask_no_attnbias_headdim_32.cpp
@@ -8,5 +8,8 @@
 
 #include "ck_tiled_fmha_grouped_infer.h"
 
-template void run_grouped_infer_causalmask_attnbias_dispatched<ck::bhalf_t, true, false, 32>(
-    GroupedForwardParams& param, hipStream_t stream);
+template void run_grouped_infer_causalmask_attnbias_dispatched<
+    ck::bhalf_t,
+    true,
+    false,
+    32>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_bp16_with_causalmask_no_attnbias_headdim_64.cpp b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_bp16_with_causalmask_no_attnbias_headdim_64.cpp
index 15ac95e27..d2b85141c 100644
--- a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_bp16_with_causalmask_no_attnbias_headdim_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_bp16_with_causalmask_no_attnbias_headdim_64.cpp
@@ -8,5 +8,8 @@
 
 #include "ck_tiled_fmha_grouped_infer.h"
 
-template void run_grouped_infer_causalmask_attnbias_dispatched<ck::bhalf_t, true, false, 64>(
-    GroupedForwardParams& param, hipStream_t stream);
+template void run_grouped_infer_causalmask_attnbias_dispatched<
+    ck::bhalf_t,
+    true,
+    false,
+    64>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_bp16_with_causalmask_with_attnbias_headdim_128.cpp b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_bp16_with_causalmask_with_attnbias_headdim_128.cpp
index 4bd616c5d..fe5b8db51 100644
--- a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_bp16_with_causalmask_with_attnbias_headdim_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_bp16_with_causalmask_with_attnbias_headdim_128.cpp
@@ -8,5 +8,8 @@
 
 #include "ck_tiled_fmha_grouped_infer.h"
 
-template void run_grouped_infer_causalmask_attnbias_dispatched<ck::bhalf_t, true, true, 128>(
-    GroupedForwardParams& param, hipStream_t stream);
+template void run_grouped_infer_causalmask_attnbias_dispatched<
+    ck::bhalf_t,
+    true,
+    true,
+    128>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_bp16_with_causalmask_with_attnbias_headdim_256.cpp b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_bp16_with_causalmask_with_attnbias_headdim_256.cpp
index 05e935716..593d4fda1 100644
--- a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_bp16_with_causalmask_with_attnbias_headdim_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_bp16_with_causalmask_with_attnbias_headdim_256.cpp
@@ -8,5 +8,8 @@
 
 #include "ck_tiled_fmha_grouped_infer.h"
 
-template void run_grouped_infer_causalmask_attnbias_dispatched<ck::bhalf_t, true, true, 256>(
-    GroupedForwardParams& param, hipStream_t stream);
+template void run_grouped_infer_causalmask_attnbias_dispatched<
+    ck::bhalf_t,
+    true,
+    true,
+    256>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_bp16_with_causalmask_with_attnbias_headdim_32.cpp b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_bp16_with_causalmask_with_attnbias_headdim_32.cpp
index a72f0e811..941dcd50e 100644
--- a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_bp16_with_causalmask_with_attnbias_headdim_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_bp16_with_causalmask_with_attnbias_headdim_32.cpp
@@ -8,5 +8,8 @@
 
 #include "ck_tiled_fmha_grouped_infer.h"
 
-template void run_grouped_infer_causalmask_attnbias_dispatched<ck::bhalf_t, true, true, 32>(
-    GroupedForwardParams& param, hipStream_t stream);
+template void run_grouped_infer_causalmask_attnbias_dispatched<
+    ck::bhalf_t,
+    true,
+    true,
+    32>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_bp16_with_causalmask_with_attnbias_headdim_64.cpp b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_bp16_with_causalmask_with_attnbias_headdim_64.cpp
index 99e86651c..82183313a 100644
--- a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_bp16_with_causalmask_with_attnbias_headdim_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_bp16_with_causalmask_with_attnbias_headdim_64.cpp
@@ -8,5 +8,8 @@
 
 #include "ck_tiled_fmha_grouped_infer.h"
 
-template void run_grouped_infer_causalmask_attnbias_dispatched<ck::bhalf_t, true, true, 64>(
-    GroupedForwardParams& param, hipStream_t stream);
+template void run_grouped_infer_causalmask_attnbias_dispatched<
+    ck::bhalf_t,
+    true,
+    true,
+    64>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_fp16_no_causalmask_no_attnbias_headdim_128.cpp b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_fp16_no_causalmask_no_attnbias_headdim_128.cpp
index 18e2f8bac..c3f52f074 100644
--- a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_fp16_no_causalmask_no_attnbias_headdim_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_fp16_no_causalmask_no_attnbias_headdim_128.cpp
@@ -8,5 +8,8 @@
 
 #include "ck_tiled_fmha_grouped_infer.h"
 
-template void run_grouped_infer_causalmask_attnbias_dispatched<ck::half_t, false, false, 128>(
-    GroupedForwardParams& param, hipStream_t stream);
+template void run_grouped_infer_causalmask_attnbias_dispatched<
+    ck::half_t,
+    false,
+    false,
+    128>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_fp16_no_causalmask_no_attnbias_headdim_256.cpp b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_fp16_no_causalmask_no_attnbias_headdim_256.cpp
index 5bdf3d87e..5d4882d2b 100644
--- a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_fp16_no_causalmask_no_attnbias_headdim_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_fp16_no_causalmask_no_attnbias_headdim_256.cpp
@@ -8,5 +8,8 @@
 
 #include "ck_tiled_fmha_grouped_infer.h"
 
-template void run_grouped_infer_causalmask_attnbias_dispatched<ck::half_t, false, false, 256>(
-    GroupedForwardParams& param, hipStream_t stream);
+template void run_grouped_infer_causalmask_attnbias_dispatched<
+    ck::half_t,
+    false,
+    false,
+    256>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_fp16_no_causalmask_no_attnbias_headdim_32.cpp b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_fp16_no_causalmask_no_attnbias_headdim_32.cpp
index 584be8667..6e0b2914d 100644
--- a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_fp16_no_causalmask_no_attnbias_headdim_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_fp16_no_causalmask_no_attnbias_headdim_32.cpp
@@ -8,5 +8,8 @@
 
 #include "ck_tiled_fmha_grouped_infer.h"
 
-template void run_grouped_infer_causalmask_attnbias_dispatched<ck::half_t, false, false, 32>(
-    GroupedForwardParams& param, hipStream_t stream);
+template void run_grouped_infer_causalmask_attnbias_dispatched<
+    ck::half_t,
+    false,
+    false,
+    32>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_fp16_no_causalmask_no_attnbias_headdim_64.cpp b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_fp16_no_causalmask_no_attnbias_headdim_64.cpp
index 70b023ba0..b49d09908 100644
--- a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_fp16_no_causalmask_no_attnbias_headdim_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_fp16_no_causalmask_no_attnbias_headdim_64.cpp
@@ -8,5 +8,8 @@
 
 #include "ck_tiled_fmha_grouped_infer.h"
 
-template void run_grouped_infer_causalmask_attnbias_dispatched<ck::half_t, false, false, 64>(
-    GroupedForwardParams& param, hipStream_t stream);
+template void run_grouped_infer_causalmask_attnbias_dispatched<
+    ck::half_t,
+    false,
+    false,
+    64>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_fp16_no_causalmask_with_attnbias_headdim_128.cpp b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_fp16_no_causalmask_with_attnbias_headdim_128.cpp
index 082912ca6..1741265b2 100644
--- a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_fp16_no_causalmask_with_attnbias_headdim_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_fp16_no_causalmask_with_attnbias_headdim_128.cpp
@@ -8,5 +8,8 @@
 
 #include "ck_tiled_fmha_grouped_infer.h"
 
-template void run_grouped_infer_causalmask_attnbias_dispatched<ck::half_t, false, true, 128>(
-    GroupedForwardParams& param, hipStream_t stream);
+template void run_grouped_infer_causalmask_attnbias_dispatched<
+    ck::half_t,
+    false,
+    true,
+    128>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_fp16_no_causalmask_with_attnbias_headdim_256.cpp b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_fp16_no_causalmask_with_attnbias_headdim_256.cpp
index 15ccf9a44..4197ba831 100644
--- a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_fp16_no_causalmask_with_attnbias_headdim_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_fp16_no_causalmask_with_attnbias_headdim_256.cpp
@@ -8,5 +8,8 @@
 
 #include "ck_tiled_fmha_grouped_infer.h"
 
-template void run_grouped_infer_causalmask_attnbias_dispatched<ck::half_t, false, true, 256>(
-    GroupedForwardParams& param, hipStream_t stream);
+template void run_grouped_infer_causalmask_attnbias_dispatched<
+    ck::half_t,
+    false,
+    true,
+    256>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_fp16_no_causalmask_with_attnbias_headdim_32.cpp b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_fp16_no_causalmask_with_attnbias_headdim_32.cpp
index dbfcfa438..88ac7b42c 100644
--- a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_fp16_no_causalmask_with_attnbias_headdim_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_fp16_no_causalmask_with_attnbias_headdim_32.cpp
@@ -8,5 +8,8 @@
 
 #include "ck_tiled_fmha_grouped_infer.h"
 
-template void run_grouped_infer_causalmask_attnbias_dispatched<ck::half_t, false, true, 32>(
-    GroupedForwardParams& param, hipStream_t stream);
+template void run_grouped_infer_causalmask_attnbias_dispatched<
+    ck::half_t,
+    false,
+    true,
+    32>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_fp16_no_causalmask_with_attnbias_headdim_64.cpp b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_fp16_no_causalmask_with_attnbias_headdim_64.cpp
index c55043820..c717aed64 100644
--- a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_fp16_no_causalmask_with_attnbias_headdim_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_fp16_no_causalmask_with_attnbias_headdim_64.cpp
@@ -8,5 +8,8 @@
 
 #include "ck_tiled_fmha_grouped_infer.h"
 
-template void run_grouped_infer_causalmask_attnbias_dispatched<ck::half_t, false, true, 64>(
-    GroupedForwardParams& param, hipStream_t stream);
+template void run_grouped_infer_causalmask_attnbias_dispatched<
+    ck::half_t,
+    false,
+    true,
+    64>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_fp16_with_causalmask_no_attnbias_headdim_128.cpp b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_fp16_with_causalmask_no_attnbias_headdim_128.cpp
index 616c49912..5449dfd32 100644
--- a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_fp16_with_causalmask_no_attnbias_headdim_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_fp16_with_causalmask_no_attnbias_headdim_128.cpp
@@ -8,5 +8,8 @@
 
 #include "ck_tiled_fmha_grouped_infer.h"
 
-template void run_grouped_infer_causalmask_attnbias_dispatched<ck::half_t, true, false, 128>(
-    GroupedForwardParams& param, hipStream_t stream);
+template void run_grouped_infer_causalmask_attnbias_dispatched<
+    ck::half_t,
+    true,
+    false,
+    128>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_fp16_with_causalmask_no_attnbias_headdim_256.cpp b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_fp16_with_causalmask_no_attnbias_headdim_256.cpp
index 895740585..73bf0e6d6 100644
--- a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_fp16_with_causalmask_no_attnbias_headdim_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_fp16_with_causalmask_no_attnbias_headdim_256.cpp
@@ -8,5 +8,8 @@
 
 #include "ck_tiled_fmha_grouped_infer.h"
 
-template void run_grouped_infer_causalmask_attnbias_dispatched<ck::half_t, true, false, 256>(
-    GroupedForwardParams& param, hipStream_t stream);
+template void run_grouped_infer_causalmask_attnbias_dispatched<
+    ck::half_t,
+    true,
+    false,
+    256>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_fp16_with_causalmask_no_attnbias_headdim_32.cpp b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_fp16_with_causalmask_no_attnbias_headdim_32.cpp
index 558f63474..55c80b4c9 100644
--- a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_fp16_with_causalmask_no_attnbias_headdim_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_fp16_with_causalmask_no_attnbias_headdim_32.cpp
@@ -8,5 +8,8 @@
 
 #include "ck_tiled_fmha_grouped_infer.h"
 
-template void run_grouped_infer_causalmask_attnbias_dispatched<ck::half_t, true, false, 32>(
-    GroupedForwardParams& param, hipStream_t stream);
+template void run_grouped_infer_causalmask_attnbias_dispatched<
+    ck::half_t,
+    true,
+    false,
+    32>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_fp16_with_causalmask_no_attnbias_headdim_64.cpp b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_fp16_with_causalmask_no_attnbias_headdim_64.cpp
index 000c3f3ca..76cafe4e0 100644
--- a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_fp16_with_causalmask_no_attnbias_headdim_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_fp16_with_causalmask_no_attnbias_headdim_64.cpp
@@ -8,5 +8,8 @@
 
 #include "ck_tiled_fmha_grouped_infer.h"
 
-template void run_grouped_infer_causalmask_attnbias_dispatched<ck::half_t, true, false, 64>(
-    GroupedForwardParams& param, hipStream_t stream);
+template void run_grouped_infer_causalmask_attnbias_dispatched<
+    ck::half_t,
+    true,
+    false,
+    64>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_fp16_with_causalmask_with_attnbias_headdim_128.cpp b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_fp16_with_causalmask_with_attnbias_headdim_128.cpp
index 39f45768e..8fe0d31e7 100644
--- a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_fp16_with_causalmask_with_attnbias_headdim_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_fp16_with_causalmask_with_attnbias_headdim_128.cpp
@@ -8,5 +8,8 @@
 
 #include "ck_tiled_fmha_grouped_infer.h"
 
-template void run_grouped_infer_causalmask_attnbias_dispatched<ck::half_t, true, true, 128>(
-    GroupedForwardParams& param, hipStream_t stream);
+template void run_grouped_infer_causalmask_attnbias_dispatched<
+    ck::half_t,
+    true,
+    true,
+    128>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_fp16_with_causalmask_with_attnbias_headdim_256.cpp b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_fp16_with_causalmask_with_attnbias_headdim_256.cpp
index 6028a16df..aeff1e2c6 100644
--- a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_fp16_with_causalmask_with_attnbias_headdim_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_fp16_with_causalmask_with_attnbias_headdim_256.cpp
@@ -8,5 +8,8 @@
 
 #include "ck_tiled_fmha_grouped_infer.h"
 
-template void run_grouped_infer_causalmask_attnbias_dispatched<ck::half_t, true, true, 256>(
-    GroupedForwardParams& param, hipStream_t stream);
+template void run_grouped_infer_causalmask_attnbias_dispatched<
+    ck::half_t,
+    true,
+    true,
+    256>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_fp16_with_causalmask_with_attnbias_headdim_32.cpp b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_fp16_with_causalmask_with_attnbias_headdim_32.cpp
index 105ee9025..f8fed7106 100644
--- a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_fp16_with_causalmask_with_attnbias_headdim_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_fp16_with_causalmask_with_attnbias_headdim_32.cpp
@@ -8,5 +8,8 @@
 
 #include "ck_tiled_fmha_grouped_infer.h"
 
-template void run_grouped_infer_causalmask_attnbias_dispatched<ck::half_t, true, true, 32>(
-    GroupedForwardParams& param, hipStream_t stream);
+template void run_grouped_infer_causalmask_attnbias_dispatched<
+    ck::half_t,
+    true,
+    true,
+    32>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_fp16_with_causalmask_with_attnbias_headdim_64.cpp b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_fp16_with_causalmask_with_attnbias_headdim_64.cpp
index f7f86a773..ec5f029d7 100644
--- a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_fp16_with_causalmask_with_attnbias_headdim_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_fp16_with_causalmask_with_attnbias_headdim_64.cpp
@@ -8,5 +8,8 @@
 
 #include "ck_tiled_fmha_grouped_infer.h"
 
-template void run_grouped_infer_causalmask_attnbias_dispatched<ck::half_t, true, true, 64>(
-    GroupedForwardParams& param, hipStream_t stream);
+template void run_grouped_infer_causalmask_attnbias_dispatched<
+    ck::half_t,
+    true,
+    true,
+    64>(GroupedForwardParams& param, hipStream_t stream);

From 52ae8a31e92d67af7614ee3496e232db285b5f27 Mon Sep 17 00:00:00 2001
From: Qianfeng Zhang <Qianfeng.Zhang@amd.com>
Date: Sun, 4 Feb 2024 18:27:32 +0000
Subject: [PATCH 421/641] Synchronize to latest ck-tiled commit

---
 third_party/composable_kernel_tiled | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/third_party/composable_kernel_tiled b/third_party/composable_kernel_tiled
index 3bda955fe..03d1d1ad9 160000
--- a/third_party/composable_kernel_tiled
+++ b/third_party/composable_kernel_tiled
@@ -1 +1 @@
-Subproject commit 3bda955fe6ca92cdd29691783ebb772ac13c857c
+Subproject commit 03d1d1ad9e0cc3c8e5d800d106bbdebe877e6e88

From 7dd3aeef885ddab4b8f6a55b5b54f9132b25b991 Mon Sep 17 00:00:00 2001
From: Qianfeng Zhang <Qianfeng.Zhang@amd.com>
Date: Sun, 4 Feb 2024 21:29:18 +0000
Subject: [PATCH 422/641] Add checking of IS_CK_TILED into some testing scripts

---
 tests/test_mem_eff_attention.py | 18 +++++++++++-------
 xformers/ops/fmha/ck.py         |  4 ++++
 2 files changed, 15 insertions(+), 7 deletions(-)

diff --git a/tests/test_mem_eff_attention.py b/tests/test_mem_eff_attention.py
index aee582c38..058d18d89 100644
--- a/tests/test_mem_eff_attention.py
+++ b/tests/test_mem_eff_attention.py
@@ -17,7 +17,6 @@
 import xformers.ops
 from xformers.attn_bias_utils import create_attn_bias
 from xformers.ops import fmha
-from xformers.ops.common import get_xformers_operator
 from xformers.ops.fmha import ALL_BW_OPS, ALL_FW_OPS
 from xformers.ops.fmha.common import AttentionOpBase
 from xformers.ops.fmha.dispatch import _dispatch_fw_priority_list
@@ -711,12 +710,8 @@ def test_mqa_forward(
 
     device = torch.device("cuda")
 
-    ### ck_check_op is temporarily used to check ck-tiled availability
-    ck_check_op = get_xformers_operator("is_ck_tiled_used")
-    use_ck_tiled = ck_check_op()
-
-    if not use_ck_tiled:
-        pytest.skip("mqa/gqa is only supported with ck-tiled")
+    if op is fmha.ck.FwOp and not op.IS_CK_TILED:
+        pytest.skip("mqa/gqa is only supported with ck-tiled fmha")
 
     torch.manual_seed(B * M + N * K + Hq*Hkv + Kv)
 
@@ -813,6 +808,10 @@ def test_logsumexp(opFW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv):
         k,
         kv,
     ) = opFW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv
+
+    if op is fmha.ck.FwOp and op.IS_CK_TILED:
+        pytest.skip("logsumexp is not yet supported by ck-tiled fmha!")
+
     query, key, value, attn_bias = create_tensors(
         *opFW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv, fmt="BMK"
     )
@@ -1452,6 +1451,8 @@ def test_grad_checkpointing(
     ) = opFW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv
     if op is fmha.triton.FwOp:
         pytest.skip("Triton Flash Attention 2 doesn't support backward pass yet")
+    if op is fmha.ck.FwOp and op.IS_CK_TILED:
+        pytest.skip("ck-tiled FMHA doesn't supported backward pass yet")
     bias_type = None
     opFW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv = (
         op,
@@ -2469,6 +2470,9 @@ def test_empty_tensors_empty_query(
     )
     opFW = opFW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv[0]
 
+    if op is fmha.ck.FwOp and op.IS_CK_TILED:
+        pytest.skip("backward pass/gradience is not yet supported by ck-tiled fmha!")
+
     query = query[:, :0]
     query.requires_grad_(True)
     key.requires_grad_(True)
diff --git a/xformers/ops/fmha/ck.py b/xformers/ops/fmha/ck.py
index ff899dc53..b6faf83c9 100644
--- a/xformers/ops/fmha/ck.py
+++ b/xformers/ops/fmha/ck.py
@@ -211,6 +211,8 @@ class FwOp(AttentionFwOpBase):
         256,  # 64x128 with accumulation in gmem
     ]
 
+    IS_CK_TILED = is_ck_tiled()
+
     @classmethod
     def apply(
         cls, inp: Inputs, needs_gradient: bool
@@ -397,6 +399,8 @@ class BwOp(AttentionBwOpBase):
         256,  # 64x128 with accumulation in gmem
     ]
 
+    IS_CK_TILED = is_ck_tiled()
+
     @classmethod
     def not_supported_reasons(cls, d: Inputs) -> List[str]:
         reasons = super(BwOp, cls).not_supported_reasons(d)

From 5eb1235f69cf571b4b086b1ac8cea2f66dac2506 Mon Sep 17 00:00:00 2001
From: Qianfeng Zhang <Qianfeng.Zhang@amd.com>
Date: Mon, 5 Feb 2024 17:56:43 +0000
Subject: [PATCH 423/641] Update to test_mem_eff_attention.py and ck.py

---
 tests/test_mem_eff_attention.py | 72 +++++++++++++++++++++++++++++++--
 xformers/ops/fmha/ck.py         |  5 ++-
 xformers/ops/fmha/dispatch.py   |  6 +--
 3 files changed, 75 insertions(+), 8 deletions(-)

diff --git a/tests/test_mem_eff_attention.py b/tests/test_mem_eff_attention.py
index 058d18d89..ee59e7295 100644
--- a/tests/test_mem_eff_attention.py
+++ b/tests/test_mem_eff_attention.py
@@ -5,6 +5,7 @@
 
 import math
 import random
+import sys
 from functools import partial
 from typing import List, Optional, Sequence, Tuple, Type, TypeVar
 
@@ -615,8 +616,8 @@ def test_forward(opFW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv, packed, fmt, **kwargs)
         kv,
     ) = opFW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv
 
-    if torch.version.hip and op is fmha.triton_splitk.FwOp:
-        pytest.skip("trition_splitk Fwd is not supported on ROCm!")
+    if op is fmha.triton_splitk.FwOp and (sys.version_info.major, sys.version_info.minor) <= (3, 8):
+        pytest.skip("triton_splitk requires python 3.9 or above!")
 
     if packed and not (k == kv and q_len == kv_len):
         pytest.skip(
@@ -812,6 +813,9 @@ def test_logsumexp(opFW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv):
     if op is fmha.ck.FwOp and op.IS_CK_TILED:
         pytest.skip("logsumexp is not yet supported by ck-tiled fmha!")
 
+    if op is fmha.triton_splitk.FwOp and (sys.version_info.major, sys.version_info.minor) <= (3, 8):
+        pytest.skip("triton_splitk requires python 3.9 or above!")
+
     query, key, value, attn_bias = create_tensors(
         *opFW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv, fmt="BMK"
     )
@@ -1223,6 +1227,9 @@ def test_memory_efficient_attention_full_block_masked(q_len, kv_len, batch_size,
     op_fw = fmha.small_k.FwOp
     op_bw = fmha.small_k.BwOp
 
+    if torch.version.hip:
+        pytest.skip("fmha.small_k is not supported on ROCM")
+
     scale = 3
     query = torch.randn((batch_size, q_len, k_len), device=device) * scale
     key = torch.randn((batch_size, kv_len, k_len), device=device) * scale
@@ -1310,6 +1317,9 @@ def test_cuda_streams(
     ) = opFW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv
     if device != "cuda":
         pytest.skip("Not CUDA")
+    if op is fmha.triton_splitk.FwOp and (sys.version_info.major, sys.version_info.minor) <= (3, 8):
+        pytest.skip("triton_splitk requires python 3.9 or above!")
+
     bias_type = None
     opFW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv = [
         op,
@@ -1453,6 +1463,9 @@ def test_grad_checkpointing(
         pytest.skip("Triton Flash Attention 2 doesn't support backward pass yet")
     if op is fmha.ck.FwOp and op.IS_CK_TILED:
         pytest.skip("ck-tiled FMHA doesn't supported backward pass yet")
+    if op is fmha.triton_splitk.FwOp and (sys.version_info.major, sys.version_info.minor) <= (3, 8):
+        pytest.skip("triton_splitk requires python 3.9 or above!")
+
     bias_type = None
     opFW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv = (
         op,
@@ -1524,6 +1537,10 @@ def test_unsupported_stride_lastdim(op: Type[fmha.AttentionFwOpBase]):
     q = torch.empty([1, 1, 32, 4], device="cuda", dtype=torch.float16).permute(
         0, 3, 1, 2
     )
+
+    if op is fmha.triton_splitk.FwOp and (sys.version_info.major, sys.version_info.minor) <= (3, 8):
+        pytest.skip("triton_splitk requires python 3.9 or above!")
+
     try:
         fmha.memory_efficient_attention(q, q, q, op=(op, None))
     except ValueError as e:
@@ -1539,6 +1556,10 @@ def test_unsupported_stride_lastdim(op: Type[fmha.AttentionFwOpBase]):
 )
 def test_unsupported_stride_alignment(op: Type[fmha.AttentionFwOpBase]):
     q = torch.empty([1, 2, 1, 33], device="cuda", dtype=torch.float16)[:, :, :, :32]
+
+    if op is fmha.triton_splitk.FwOp and (sys.version_info.major, sys.version_info.minor) <= (3, 8):
+        pytest.skip("triton_splitk requires python 3.9 or above!")
+
     try:
         fmha.memory_efficient_attention(q, q, q, op=(op, None))
     except ValueError as e:
@@ -1988,6 +2009,9 @@ def test_triton_splitk_decoder(
     if dequant:
         pytest.skip("dequant is not supported")
 
+    if (sys.version_info.major, sys.version_info.minor) <= (3, 8):
+        pytest.skip("triton_splitk requires python 3.9 or above!")
+
     # We omit dequant with f16: it needs a very high tol
     test_decoder(
         op,
@@ -2096,6 +2120,8 @@ def test_f16_biasf32(self) -> None:
             fmha.memory_efficient_attention(q, k, v, attn_bias=bias)
 
     def test_f32_biasf16(self) -> None:
+        if torch.version.hip:
+            pytest.skip("float32 is not supported by ck.FwOp/ck.BwOp currently, skipped")
         q, k, v, bias = self.create_tensors(torch.float32)
         fmha.memory_efficient_attention(q, k, v, attn_bias=bias)
         bias = bias.to(torch.float16)
@@ -2104,7 +2130,10 @@ def test_f32_biasf16(self) -> None:
 
     @pytest.mark.parametrize("dtype", [torch.float32, torch.float16])
     def test_wrong_alignment(self, dtype) -> None:
-        op = fmha.cutlass.FwOp
+        op = fmha.cutlass.FwOp if torch.version.cuda else fmha.ck.FwOp
+        if torch.version.hip and dtype is torch.float32:
+            pytest.skip("float32 is not supported by fmha.ck.FwOp!")
+
         q, k, v, bias = self.create_tensors(dtype, Mq=7, Mkv=5)
         try:
             fmha.memory_efficient_attention(q, k, v, attn_bias=bias, op=(op, None))
@@ -2168,6 +2197,9 @@ def test_has_kernel_for(sm_shmem: Tuple[int, int], dtype_str: str) -> None:
     if sm < 80 and dtype_str == "bf16":
         return
 
+    if torch.version.hip:
+        pytest.skip("_has_cutlassF_kernel is not supported on ROCM")
+
     for k in [16, 32, 64, 128, 256]:
         assert torch.ops.xformers._has_cutlassF_kernel_for(
             dtype, sm, shmem_kbytes * 1024, k
@@ -2288,6 +2320,9 @@ def test_forward_gqa_one_group(opFW):
     k = torch.randn([B, Mkv, 1, H, K], dtype=dtype, device="cuda") * 3
     v = torch.randn([B, Mkv, 1, H, K], dtype=dtype, device="cuda") * 3
 
+    if opFW is fmha.triton_splitk.FwOp and (sys.version_info.major, sys.version_info.minor) <= (3, 8):
+        pytest.skip("triton_splitk requires python 3.9 or above!")
+
     supported = opFW.supports(fmha.Inputs(q, k, v))
     if not supported:
         supported_bmhk = opFW.supports(fmha.Inputs(q[:, :, 0], k[:, :, 0], v[:, :, 0]))
@@ -2306,6 +2341,10 @@ def test_forward_gqa_one_group(opFW):
 @sm80_or_better_only
 def test_flash_gqa_wrong_strides() -> None:
     op = (fmha.flash.FwOp, None)
+
+    if torch.version.hip:
+        pytest.skip("flash operation is not supported on ROCM!")
+
     device = "cuda"
     B, Mq, Mkv, G, H, K = 3, 1, 512, 2, 8, 128
     q = torch.empty((B, Mq, G, H, K), dtype=torch.float16, device=device)
@@ -2344,6 +2383,8 @@ def _dispatches_to_flash_decoding(q, kv):
 
 
 def test_dispatch_decoding_bmhk() -> None:
+    if torch.version.hip:
+        pytest.skip("dispatch testing currently ignored on ROCM")
     assert not _dispatches_to_splitK(
         torch.empty([1, 8, 1, 128]), torch.empty([1, 2048, 1, 128])
     ), "Should not use SplitK with 1 head (no tensorcores)"
@@ -2366,6 +2407,8 @@ def test_dispatch_decoding_bmhk() -> None:
 
 
 def test_dispatch_decoding_bmghk() -> None:
+    if torch.version.hip:
+        pytest.skip("dispatch testing currently ignored on ROCM")
     assert not _dispatches_to_splitK(
         torch.empty([1, 8, 1, 1, 128]), torch.empty([1, 2048, 1, 1, 128])
     ), "Should not use SplitK with 1 head (no tensorcores)"
@@ -2448,6 +2491,9 @@ def test_mqa_decoding(op: Type[fmha.AttentionFwOpBase], dtype, B_Mkv_H_K):
     k = k.expand(-1, -1, H, -1)
     v = v.expand(-1, -1, H, -1)
 
+    if (sys.version_info.major, sys.version_info.minor) <= (3, 8): 
+        pytest.skip("triton_splitk requires python 3.9 or above!")
+
     if not op.supports(fmha.Inputs(q, k, v)):
         pytest.skip("not supported")
     out = fmha.memory_efficient_attention_forward(q, k, v, op=op)
@@ -2470,9 +2516,12 @@ def test_empty_tensors_empty_query(
     )
     opFW = opFW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv[0]
 
-    if op is fmha.ck.FwOp and op.IS_CK_TILED:
+    if opFW is fmha.ck.FwOp and opFW.IS_CK_TILED:
         pytest.skip("backward pass/gradience is not yet supported by ck-tiled fmha!")
 
+    if opFW is fmha.triton_splitk.FwOp and (sys.version_info.major, sys.version_info.minor) <= (3, 8):
+        pytest.skip("triton_splitk requires python 3.9 or above!")
+
     query = query[:, :0]
     query.requires_grad_(True)
     key.requires_grad_(True)
@@ -2495,6 +2544,12 @@ def test_empty_tensors_empty_kv(
     )
     opFW = opFW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv[0]
 
+    if opFW is fmha.ck.FwOp and opFW.IS_CK_TILED:
+        pytest.skip("backward pass/gradience is not yet supported by ck-tiled fmha!")
+
+    if opFW is fmha.triton_splitk.FwOp and (sys.version_info.major, sys.version_info.minor) <= (3, 8):
+        pytest.skip("triton_splitk requires python 3.9 or above!")
+
     key = key[:, :0]
     value = value[:, :0]
     query.requires_grad_(True)
@@ -2517,6 +2572,12 @@ def test_empty_tensors_empty_b(
     )
     opFW = opFW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv[0]
 
+    if opFW is fmha.ck.FwOp and opFW.IS_CK_TILED:
+        pytest.skip("backward pass/gradience is not yet supported by ck-tiled fmha!")
+
+    if opFW is fmha.triton_splitk.FwOp and (sys.version_info.major, sys.version_info.minor) <= (3, 8):
+        pytest.skip("triton_splitk requires python 3.9 or above!")
+
     query, key, value = query[:0], key[:0], value[:0]
     query.requires_grad_(True)
     key.requires_grad_(True)
@@ -2589,6 +2650,9 @@ def test_cutlassB_iter_order(
         the same block of dQ
     .. and we test this across variable causal masks+local attention combinations
     """
+    if torch.version.hip:
+        pytest.skip("this test is only for cutlass/cuda environment")
+
     if (
         window_size > 0
         and custom_mask_type == fmha.cutlass._CustomMaskType.NoCustomMask
diff --git a/xformers/ops/fmha/ck.py b/xformers/ops/fmha/ck.py
index b6faf83c9..000a07e56 100644
--- a/xformers/ops/fmha/ck.py
+++ b/xformers/ops/fmha/ck.py
@@ -337,6 +337,9 @@ def not_supported_reasons(cls, d: Inputs) -> List[str]:
         check_lastdim_alignment_stride1(reasons, "value", d.value, matmul_alignment_mn)
         _check_bias_alignment(reasons, d.attn_bias)
         _check_large_shapes(reasons, d)
+        requires_grad = d.query.requires_grad or d.key.requires_grad or d.value.requires_grad
+        if is_ck_tiled() and requires_grad:
+            reasons.append("Gradience is currently not supported by ck-tiled!")
         return reasons
 
     @classmethod
@@ -433,7 +436,7 @@ def not_supported_reasons(cls, d: Inputs) -> List[str]:
                 )
         _check_large_shapes(reasons, d)
         if is_ck_tiled():
-            reasons.append("Backward is currently not completely supported by ck-tiled!")
+            reasons.append("Backward is currently not supported by ck-tiled!")
         return reasons
 
     @classmethod
diff --git a/xformers/ops/fmha/dispatch.py b/xformers/ops/fmha/dispatch.py
index 7113855cb..0acb7eb35 100644
--- a/xformers/ops/fmha/dispatch.py
+++ b/xformers/ops/fmha/dispatch.py
@@ -75,15 +75,15 @@ def _dispatch_fw_priority_list(
               cutlass.FwOp,
               small_k.FwOp,
            ])
+        if _is_cutlass_fwd_faster_than_flash(inp):
+            priority_list_ops.remove(cutlass.FwOp)
+            priority_list_ops.appendleft(cutlass.FwOp)
     else:
         priority_list_ops = deque(
            [
               triton.FwOp,
               ck.FwOp,
            ])
-    if _is_cutlass_fwd_faster_than_flash(inp):
-        priority_list_ops.remove(cutlass.FwOp)
-        priority_list_ops.appendleft(cutlass.FwOp)
     if _is_triton_fwd_fastest(inp):
         priority_list_ops.remove(triton.FwOp)
         priority_list_ops.appendleft(triton.FwOp)

From 58e6101f2e33338d433151a7a1b88ba496bef5a0 Mon Sep 17 00:00:00 2001
From: Qianfeng Zhang <Qianfeng.Zhang@amd.com>
Date: Mon, 5 Feb 2024 18:54:23 +0000
Subject: [PATCH 424/641] Building xformers using ck-tiled as default

---
 setup.py | 30 +++++++++++++++---------------
 1 file changed, 15 insertions(+), 15 deletions(-)

diff --git a/setup.py b/setup.py
index 84056c6e9..f56dbeca7 100644
--- a/setup.py
+++ b/setup.py
@@ -241,14 +241,7 @@ def get_extensions():
         *glob.glob(os.path.join(extensions_dir, "attention", "hip_fmha", "attention_forward_splitk.cpp"), recursive=False)
     ]
 
-    if os.getenv("FORCE_CK_TILED_KERNEL", "0") == "1":
-        source_hip += glob.glob(os.path.join(extensions_dir, "attention", "hip_fmha", "attention_forward_generic_ck_tiled.cpp"), recursive=False)
-        source_hip += glob.glob(os.path.join(extensions_dir, "attention", "hip_fmha", "ck_tiled_fmha_batched_infer_*.cpp"), recursive=False)
-        source_hip += glob.glob(os.path.join(extensions_dir, "attention", "hip_fmha", "ck_tiled_fmha_grouped_infer_*.cpp"), recursive=False)
-        source_hip += glob.glob(os.path.join(extensions_dir, "attention", "hip_fmha", "ck_tiled_fmha_batched_forward_*.cpp"), recursive=False)
-        source_hip += glob.glob(os.path.join(extensions_dir, "attention", "hip_fmha", "ck_tiled_fmha_grouped_forward_*.cpp"), recursive=False)
-        source_hip += glob.glob(os.path.join(extensions_dir, "attention", "hip_fmha", "instances_tiled", "ck_tiled_fmha_*.cpp"), recursive=False)
-    else:
+    if os.getenv("FORCE_OLD_CK_KERNEL", "0") == "1":
         source_hip += glob.glob(os.path.join(extensions_dir, "attention", "hip_fmha", "attention_forward_generic.cpp"), recursive=False)
         source_hip += glob.glob(os.path.join(extensions_dir, "attention", "hip_fmha", "attention_backward_generic.cpp"), recursive=False)
         source_hip += glob.glob(os.path.join(extensions_dir, "attention", "hip_fmha", "attention_ck_rand_uniform.cpp"), recursive=False)
@@ -259,7 +252,14 @@ def get_extensions():
         source_hip += glob.glob(os.path.join(extensions_dir, "attention", "hip_fmha", "ck_fmha_batched_backward_*.cpp"), recursive=False)
         source_hip += glob.glob(os.path.join(extensions_dir, "attention", "hip_fmha", "ck_fmha_grouped_backward_*.cpp"), recursive=False)
         source_hip += glob.glob(os.path.join(extensions_dir, "attention", "hip_fmha", "instances", "ck_fmha_*.cpp"), recursive=False)
-        
+    else:
+        source_hip += glob.glob(os.path.join(extensions_dir, "attention", "hip_fmha", "attention_forward_generic_ck_tiled.cpp"), recursive=False)
+        source_hip += glob.glob(os.path.join(extensions_dir, "attention", "hip_fmha", "ck_tiled_fmha_batched_infer_*.cpp"), recursive=False)
+        source_hip += glob.glob(os.path.join(extensions_dir, "attention", "hip_fmha", "ck_tiled_fmha_grouped_infer_*.cpp"), recursive=False)
+        source_hip += glob.glob(os.path.join(extensions_dir, "attention", "hip_fmha", "ck_tiled_fmha_batched_forward_*.cpp"), recursive=False)
+        source_hip += glob.glob(os.path.join(extensions_dir, "attention", "hip_fmha", "ck_tiled_fmha_grouped_forward_*.cpp"), recursive=False)
+        source_hip += glob.glob(os.path.join(extensions_dir, "attention", "hip_fmha", "instances_tiled", "ck_tiled_fmha_*.cpp"), recursive=False)
+
     source_hip += source_hip_decoder
     
     sputnik_dir = os.path.join(this_dir, "third_party", "sputnik")
@@ -350,15 +350,15 @@ def get_extensions():
        sources += source_hip_cu
        include_dirs += [ Path(this_dir) / 'xformers' / 'csrc' / 'attention' / 'hip_fmha' ]
 
-       if os.getenv("FORCE_CK_TILED_KERNEL", "0") == "1":
-           include_dirs += [ Path(this_dir) / 'third_party' / 'composable_kernel_tiled' / 'include']
-       else:
+       if os.getenv("FORCE_OLD_CK_KERNEL", "0") == "1":
            include_dirs += [ Path(this_dir) / 'third_party' / 'composable_kernel' / 'include']
-           
-       if os.getenv("FORCE_CK_TILED_KERNEL", "0") == "1":
-           generator_flag = ["-DUSE_CK_TILED_KERNEL"]
        else:
+           include_dirs += [ Path(this_dir) / 'third_party' / 'composable_kernel_tiled' / 'include']
+           
+       if os.getenv("FORCE_OLD_CK_KERNEL", "0") == "1":
            generator_flag = []
+       else:
+           generator_flag = ["-DUSE_CK_TILED_KERNEL"]
        cc_flag = ["-DBUILD_PYTHON_PACKAGE"]
        extra_compile_args={
             "cxx": ["-O3", "-std=c++17"] + generator_flag,

From 389dfb46045eaf7ff58496f6f04a5f0edbcba213 Mon Sep 17 00:00:00 2001
From: Max Podkorytov <4273004+tenpercent@users.noreply.github.com>
Date: Mon, 5 Feb 2024 19:27:36 +0000
Subject: [PATCH 425/641] ensure ck_decoder does not dispatch

---
 xformers/ops/fmha/ck_decoder.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/xformers/ops/fmha/ck_decoder.py b/xformers/ops/fmha/ck_decoder.py
index daa4689b8..3579a3f0a 100644
--- a/xformers/ops/fmha/ck_decoder.py
+++ b/xformers/ops/fmha/ck_decoder.py
@@ -57,6 +57,9 @@ def not_supported_reasons(cls, d: Inputs) -> List[str]:
             padding = attn_bias.k_seqinfo.padding
             bsz = d.key.shape[1] // padding
             num_queries = d.query.shape[1] // bsz
+            
+            if q_starts != list(range(0, 1 + bsz, num_queries)):
+                reasons.append("expect to have same num_queries in each batch")
             if bsz != len(q_starts) - 1:
                 reasons.append("empty lanes not supported yet")
 

From f8d904328f9af34b098cc8068ce578521fd6547e Mon Sep 17 00:00:00 2001
From: Qianfeng Zhang <Qianfeng.Zhang@amd.com>
Date: Mon, 5 Feb 2024 20:23:44 +0000
Subject: [PATCH 426/641] Add disable_on_rocm on some test scripts

---
 tests/test_attentions.py         |  8 +++++---
 tests/test_checkpoint.py         | 11 +++++++++--
 tests/test_core_attention.py     |  8 ++++++--
 tests/test_custom_ops.py         | 16 +++++++++++++---
 tests/test_mem_eff_attention.py  | 29 +++++++++--------------------
 tests/test_sparse_tensors.py     |  7 ++++---
 tests/test_swiglu.py             |  3 ++-
 tests/test_triton_blocksparse.py |  9 +++++----
 tests/test_triton_layernorm.py   |  6 ++++--
 9 files changed, 57 insertions(+), 40 deletions(-)

diff --git a/tests/test_attentions.py b/tests/test_attentions.py
index cf70bbea7..038c55baa 100644
--- a/tests/test_attentions.py
+++ b/tests/test_attentions.py
@@ -22,6 +22,8 @@
     build_attention,
 )
 
+disable_on_rocm = pytest.mark.skipif(not not torch.version.hip, reason="could not be done on ROCM")
+
 DEVICES = (
     [torch.device("cpu")] if not torch.cuda.is_available() else [torch.device("cuda")]
 )
@@ -90,7 +92,7 @@ def noop(x):
 
     return multi_head
 
-
+@disable_on_rocm
 @pytest.mark.parametrize("attn_dropout", [0.0, 0.3])
 @pytest.mark.parametrize("residual_dropout", [0.0, 0.1])
 @pytest.mark.parametrize("causal", [True, False])
@@ -160,7 +162,7 @@ def test_order_invariance(
             with torch.cuda.amp.autocast(enabled=True):
                 _ = multi_head(inputs, inputs_shuffled, inputs)
 
-
+@disable_on_rocm
 @pytest.mark.parametrize("heads", [1, 4])
 @pytest.mark.parametrize("attention_name", ["scaled_dot_product"])
 @pytest.mark.parametrize("device", DEVICES)
@@ -203,7 +205,7 @@ def test_kqv_ordering(
     res_false = multi_head(query=v, key=k, value=q)
     assert torch.allclose(res_false[0, :, :], res_false[1, :, :])
 
-
+@disable_on_rocm
 @pytest.mark.parametrize("heads", [1, 4])
 @pytest.mark.parametrize("attention_name", ["scaled_dot_product"])
 @pytest.mark.parametrize("device", DEVICES)
diff --git a/tests/test_checkpoint.py b/tests/test_checkpoint.py
index 20ab750c9..eab74a172 100644
--- a/tests/test_checkpoint.py
+++ b/tests/test_checkpoint.py
@@ -14,6 +14,7 @@
 from xformers import checkpoint, list_operators
 
 cuda_only = pytest.mark.skipif(not torch.cuda.is_available(), reason="requires CUDA")
+disable_on_rocm = pytest.mark.skipif(not not torch.version.hip, reason="could not be done on ROCM")
 _devices = ["cpu"]
 cuda_cap = (0, 0)
 
@@ -29,7 +30,7 @@ def _relu_policy(func, *args, **kwargs):
 def _all_policy(func, *args, **kwargs):
     return True
 
-
+@disable_on_rocm
 @pytest.mark.parametrize("policy_fn", [None, [], _relu_policy, _all_policy])
 @pytest.mark.parametrize("input_requires_grad", [True, False])
 @pytest.mark.parametrize("device", _devices)
@@ -102,7 +103,7 @@ def test_checkpoint_with_grad(policy_fn, input_requires_grad, grad_mode):
     "op",
     [
         xformers.ops.MemoryEfficientAttentionFlashAttentionOp,
-        xformers.ops.MemoryEfficientAttentionCutlassOp,
+        xformers.ops.MemoryEfficientAttentionCutlassOp if torch.version.cuda else xformers.ops.MemoryEfficientAttentionCkOp,
     ],
 )
 def test_checkpoint_attention(policy_fn, input_requires_grad, device, autocast, op):
@@ -112,6 +113,12 @@ def test_checkpoint_attention(policy_fn, input_requires_grad, device, autocast,
     ):
         pytest.skip("skipping operator not supported in this arch")
 
+    if op is xformers.ops.MemoryEfficientAttentionFlashAttentionOp and torch.version.hip:
+        pytest.skip("FlashAttentionOp is not supported on ROCM!")
+
+    if op is xformers.ops.MemoryEfficientAttentionCkOp and op[0].IS_CK_TILED:
+        pytest.skip("Gradience is currently not supported by ck-tiled!")
+
     class Attn(nn.Module):
         def forward(self, x):
             out = xformers.ops.memory_efficient_attention(x, x, x, op=op)
diff --git a/tests/test_core_attention.py b/tests/test_core_attention.py
index 0beace442..81a403e59 100644
--- a/tests/test_core_attention.py
+++ b/tests/test_core_attention.py
@@ -21,6 +21,7 @@
     _is_triton_available() and not gpu_capabilities_older_than_70()
 )
 
+disable_on_rocm = pytest.mark.skipif(not not torch.version.hip, reason="could not be done on ROCM")
 
 def catch_oor(fn):
     @functools.wraps(fn)
@@ -86,6 +87,7 @@ def test_core_attention_mask_types():
     r_dense_add = scaled_dot_product_attention(a, a, a, float_mask_add)
 
 
+@disable_on_rocm
 @pytest.mark.parametrize("device", _devices)
 def test_amp_attention_dense_no_mask(device):
     b, s, d = 8, 64, 32
@@ -99,6 +101,7 @@ def test_amp_attention_dense_no_mask(device):
     assert r.dtype == expected_device
 
 
+@disable_on_rocm
 @pytest.mark.parametrize("device", _devices)
 def test_amp_attention_dense(device):
     b, s, d = 8, 64, 32
@@ -114,6 +117,7 @@ def test_amp_attention_dense(device):
     assert r.dtype == expected_device
 
 
+@disable_on_rocm
 @pytest.mark.parametrize("device", _devices)
 def test_amp_attention_sparse(device):
     b, s, d = 8, 64, 32
@@ -129,7 +133,7 @@ def test_amp_attention_sparse(device):
     expected_device = torch.float32
     assert r.dtype == expected_device
 
-
+@disable_on_rocm
 @pytest.mark.parametrize("device", _devices)
 def test_amp_attention_sparsecs(device):
     b, s, d = 8, 64, 32
@@ -145,7 +149,7 @@ def test_amp_attention_sparsecs(device):
     expected_device = torch.float32
     assert r.dtype == expected_device
 
-
+@disable_on_rocm
 @pytest.mark.skipif(
     not _is_blocksparse_available, reason="Blocksparse is not available"
 )
diff --git a/tests/test_custom_ops.py b/tests/test_custom_ops.py
index bef8b4102..0a8f053d3 100644
--- a/tests/test_custom_ops.py
+++ b/tests/test_custom_ops.py
@@ -17,6 +17,8 @@
 )
 
 cuda_only = pytest.mark.skipif(not torch.cuda.is_available(), reason="requires CUDA")
+disable_on_rocm = pytest.mark.skipif(not not torch.version.hip, reason="could not be done on ROCM")
+
 _devices = ["cpu", "cuda"] if torch.cuda.is_available() else ["cpu"]
 
 
@@ -58,6 +60,7 @@ def _baseline_sparse_bmm(a: torch.Tensor, b: torch.Tensor) -> torch.Tensor:
     return torch.stack(out, dim=0)
 
 
+@disable_on_rocm
 @pytest.mark.parametrize("is_sparse", [True, False])
 @pytest.mark.parametrize("contiguous", [True, False])
 @pytest.mark.parametrize("device", _devices)
@@ -89,6 +92,7 @@ def test_matmul_with_mask(device, contiguous, is_sparse):
     assert torch.allclose(res, res_gt)
 
 
+@disable_on_rocm
 @pytest.mark.parametrize("is_sparse", [True, False])
 @pytest.mark.parametrize("contiguous", [True, False])
 @pytest.mark.parametrize("device", _devices)
@@ -130,7 +134,7 @@ def compute_grads(f):
     assert torch.allclose(grad_a, a.grad)
     assert torch.allclose(grad_b, b.grad)
 
-
+@disable_on_rocm
 @pytest.mark.parametrize("device", _devices)
 def test_sddmm_sputnik(device):
     B, L, M, K = 8, 30, 16, 32
@@ -158,6 +162,7 @@ def test_sddmm_sputnik(device):
 
 
 @cuda_only
+@disable_on_rocm
 @pytest.mark.parametrize("prob", [0.5, 1])
 @pytest.mark.parametrize("K", [32, 17])
 @pytest.mark.parametrize("M", [30, 17])
@@ -188,6 +193,7 @@ def test_sddmm_csr(L, M, K, prob):
 
 
 @cuda_only
+@disable_on_rocm
 @pytest.mark.parametrize("nnz", [0, 4, 16, 20, 36])
 def test_sddmm_csr_per_nnz(nnz):
     device = torch.device("cuda")
@@ -215,6 +221,7 @@ def test_sddmm_csr_per_nnz(nnz):
 
 
 @cuda_only
+@disable_on_rocm
 @pytest.mark.parametrize("prob", [0.5, 1])
 @pytest.mark.parametrize("K", [32, 17])
 @pytest.mark.parametrize("M", [30, 17])
@@ -246,7 +253,7 @@ def test_sddmm_coo(L, M, K, prob):
     assert res.dtype == res_gt.dtype
     assert torch.allclose(res, res_gt, atol=1e-6)
 
-
+@disable_on_rocm
 @pytest.mark.parametrize("device", _devices)
 def test_sddmm_sputnik_backward(device):
     contiguous = True
@@ -280,6 +287,7 @@ def test_sddmm_sputnik_backward(device):
     assert torch.allclose(grad_b, b.grad, atol=1e-7)
 
 
+@disable_on_rocm
 @pytest.mark.parametrize("device", _devices)
 def test_sparse_softmax_sputnik(device):
     B, L = 8, 30
@@ -302,6 +310,7 @@ def test_sparse_softmax_sputnik(device):
     assert torch.allclose(res, res_gt)
 
 
+@disable_on_rocm
 @pytest.mark.parametrize("device", _devices)
 def test_sparse_softmax_sputnik_backward(device):
     B, L = 8, 30
@@ -323,7 +332,7 @@ def test_sparse_softmax_sputnik_backward(device):
         grad_a, a.grad.coalesce().values().reshape_as(grad_a), atol=1e-7
     )
 
-
+@disable_on_rocm
 @pytest.mark.parametrize("device", _devices)
 def test_spmm_sputnik(device):
     B, L, K = 8, 30, 32
@@ -349,6 +358,7 @@ def test_spmm_sputnik(device):
     assert torch.allclose(res, res_gt)
 
 
+@disable_on_rocm
 @pytest.mark.parametrize("device", _devices)
 def test_spmm_sputnik_backward(device):
     B, M, L, K = 8, 16, 30, 32
diff --git a/tests/test_mem_eff_attention.py b/tests/test_mem_eff_attention.py
index ee59e7295..c86952877 100644
--- a/tests/test_mem_eff_attention.py
+++ b/tests/test_mem_eff_attention.py
@@ -27,6 +27,8 @@
 torch.backends.cuda.matmul.allow_tf32 = False
 cuda_only = pytest.mark.skipif(not torch.cuda.is_available(), reason="requires CUDA")
 rocm_only = pytest.mark.skipif(not torch.cuda.is_available() or not torch.version.hip, reason="requires ROCM")
+disable_on_rocm = pytest.mark.skipif(not not torch.version.hip, reason="could not be done on ROCM")
+
 compute_capability = (0, 0)
 if torch.cuda.is_available():
     compute_capability = torch.cuda.get_device_capability("cuda")
@@ -1218,6 +1220,7 @@ def test_dropout_backward_cutlass(dt, q_len, kv_len, batch_size, k, p):
 
 
 @cuda_only
+@disable_on_rocm
 @pytest.mark.parametrize("k_len", [32])
 @pytest.mark.parametrize("batch_size", [1])
 @pytest.mark.parametrize("kv_len", [3 * 32])
@@ -1227,9 +1230,6 @@ def test_memory_efficient_attention_full_block_masked(q_len, kv_len, batch_size,
     op_fw = fmha.small_k.FwOp
     op_bw = fmha.small_k.BwOp
 
-    if torch.version.hip:
-        pytest.skip("fmha.small_k is not supported on ROCM")
-
     scale = 3
     query = torch.randn((batch_size, q_len, k_len), device=device) * scale
     key = torch.randn((batch_size, kv_len, k_len), device=device) * scale
@@ -2119,9 +2119,8 @@ def test_f16_biasf32(self) -> None:
         with pytest.raises((ValueError, RuntimeError)):
             fmha.memory_efficient_attention(q, k, v, attn_bias=bias)
 
+    @disable_on_rocm
     def test_f32_biasf16(self) -> None:
-        if torch.version.hip:
-            pytest.skip("float32 is not supported by ck.FwOp/ck.BwOp currently, skipped")
         q, k, v, bias = self.create_tensors(torch.float32)
         fmha.memory_efficient_attention(q, k, v, attn_bias=bias)
         bias = bias.to(torch.float16)
@@ -2185,6 +2184,7 @@ def test_permuted_attn_bias(self) -> None:
 
 
 @cuda_only
+@disable_on_rocm
 @pytest.mark.parametrize("dtype_str", ["f32", "f16", "bf16"])
 @pytest.mark.parametrize(
     "sm_shmem",
@@ -2197,9 +2197,6 @@ def test_has_kernel_for(sm_shmem: Tuple[int, int], dtype_str: str) -> None:
     if sm < 80 and dtype_str == "bf16":
         return
 
-    if torch.version.hip:
-        pytest.skip("_has_cutlassF_kernel is not supported on ROCM")
-
     for k in [16, 32, 64, 128, 256]:
         assert torch.ops.xformers._has_cutlassF_kernel_for(
             dtype, sm, shmem_kbytes * 1024, k
@@ -2339,12 +2336,10 @@ def test_forward_gqa_one_group(opFW):
 
 
 @sm80_or_better_only
+@disable_on_rocm
 def test_flash_gqa_wrong_strides() -> None:
     op = (fmha.flash.FwOp, None)
 
-    if torch.version.hip:
-        pytest.skip("flash operation is not supported on ROCM!")
-
     device = "cuda"
     B, Mq, Mkv, G, H, K = 3, 1, 512, 2, 8, 128
     q = torch.empty((B, Mq, G, H, K), dtype=torch.float16, device=device)
@@ -2381,10 +2376,8 @@ def _dispatches_to_flash_decoding(q, kv):
         _dispatch_fw_priority_list(fmha.Inputs(q, kv, kv), False)[0] is fmha.flash.FwOp
     )
 
-
+@disable_on_rocm
 def test_dispatch_decoding_bmhk() -> None:
-    if torch.version.hip:
-        pytest.skip("dispatch testing currently ignored on ROCM")
     assert not _dispatches_to_splitK(
         torch.empty([1, 8, 1, 128]), torch.empty([1, 2048, 1, 128])
     ), "Should not use SplitK with 1 head (no tensorcores)"
@@ -2405,10 +2398,8 @@ def test_dispatch_decoding_bmhk() -> None:
         torch.empty([128, 2048, 1, 128]).expand(-1, -1, 32, -1),
     ), "Should not use SplitK if B is big"
 
-
+@disable_on_rocm
 def test_dispatch_decoding_bmghk() -> None:
-    if torch.version.hip:
-        pytest.skip("dispatch testing currently ignored on ROCM")
     assert not _dispatches_to_splitK(
         torch.empty([1, 8, 1, 1, 128]), torch.empty([1, 2048, 1, 1, 128])
     ), "Should not use SplitK with 1 head (no tensorcores)"
@@ -2600,6 +2591,7 @@ def test_local_attn_bias() -> None:
 
 
 @cuda_only
+@disable_on_rocm
 @pytest.mark.parametrize("cc", [60, 70, 80])
 @pytest.mark.parametrize("maxK", [32, 64, 128, 256])
 @pytest.mark.parametrize("dtype", [torch.float32, torch.float16])
@@ -2650,9 +2642,6 @@ def test_cutlassB_iter_order(
         the same block of dQ
     .. and we test this across variable causal masks+local attention combinations
     """
-    if torch.version.hip:
-        pytest.skip("this test is only for cutlass/cuda environment")
-
     if (
         window_size > 0
         and custom_mask_type == fmha.cutlass._CustomMaskType.NoCustomMask
diff --git a/tests/test_sparse_tensors.py b/tests/test_sparse_tensors.py
index 283498738..e32cb8b37 100644
--- a/tests/test_sparse_tensors.py
+++ b/tests/test_sparse_tensors.py
@@ -15,6 +15,7 @@
 _devices = ["cpu", "cuda:0"] if torch.cuda.is_available() else ["cpu"]
 _tensor_types = [BlockSparseTensor, SparseCSRTensor]
 
+disable_on_rocm = pytest.mark.skipif(not not torch.version.hip, reason="could not be done on ROCM")
 
 def _create_blocksparse_tensor(
     device, block_size=32, Z=8, C=2, H=64, W=64, dtype=torch.float32
@@ -100,7 +101,7 @@ def test_sparse_binary_ops(func, device):
 
     assert torch.allclose(res, res_gt)
 
-
+@disable_on_rocm
 @pytest.mark.parametrize("tensor_type", _tensor_types)
 @pytest.mark.parametrize("device", _devices)
 def test_masked_matmul(tensor_type, device):
@@ -152,7 +153,7 @@ def test_masked_matmul(tensor_type, device):
     assert torch.allclose(a.grad, aa.grad, atol=atol)
     assert torch.allclose(b.grad, bb.grad, atol=atol)
 
-
+@disable_on_rocm
 @pytest.mark.parametrize("tensor_type", _tensor_types)
 @pytest.mark.parametrize("device", _devices)
 def test_bmm(tensor_type, device):
@@ -201,7 +202,7 @@ def test_bmm(tensor_type, device):
         a_grad, a_sparse.grad.to_dense(), atol=atol
     ), f"{torch.max(torch.abs(a_grad-a_sparse.grad.to_dense()))}"
 
-
+@disable_on_rocm
 @pytest.mark.parametrize("tensor_type", _tensor_types)
 @pytest.mark.parametrize("device", _devices)
 def test_sparse_softmax(tensor_type, device):
diff --git a/tests/test_swiglu.py b/tests/test_swiglu.py
index f662ab4be..78112a6ed 100644
--- a/tests/test_swiglu.py
+++ b/tests/test_swiglu.py
@@ -24,6 +24,7 @@
     _is_sm80 = False
 sm80_only = pytest.mark.skipif(not _is_sm80, reason="requires sm80")
 
+disable_on_rocm = pytest.mark.skipif(not not torch.version.hip, reason="could not be done on ROCM")
 
 def assert_allclose(
     # The output of the tested function
@@ -112,7 +113,7 @@ def generate_test_shapes():
 def create_module_cached(**kwargs) -> xsw.SwiGLU:
     return xsw.SwiGLU(**kwargs)
 
-
+@disable_on_rocm
 @pytest.mark.parametrize("autocast", [False, True], ids=["regular", "autocast"])
 @pytest.mark.parametrize("op", _ops, ids=[x.NAME for x in _ops])
 @pytest.mark.parametrize("dtype", _dtypes, ids=[str(x) for x in _dtypes])
diff --git a/tests/test_triton_blocksparse.py b/tests/test_triton_blocksparse.py
index e8e4a4dbe..5bf19aa97 100644
--- a/tests/test_triton_blocksparse.py
+++ b/tests/test_triton_blocksparse.py
@@ -14,6 +14,7 @@
 from xformers.components.attention.attention_patterns import block_sparsify_tensor
 from xformers.triton.utils import get_current_cuda_device
 
+disable_on_rocm = pytest.mark.skipif(not not torch.version.hip, reason="could not be done on ROCM")
 
 def catch_oor(fn):
     @functools.wraps(fn)
@@ -62,7 +63,7 @@ def mask_tensor(x, mask, block, value=0):
         ret[:, h, i * block : (i + 1) * block, j * block : (j + 1) * block] = value
     return ret
 
-
+@disable_on_rocm
 @pytest.mark.skipif(not _triton_available, reason="Triton requires a recent CUDA gpu")
 @pytest.mark.skipif(
     not _triton_available or get_current_cuda_device() == "T4",
@@ -117,7 +118,7 @@ def test_matmul(MODE, TRANS_A, TRANS_B, BLOCK, DTYPE, Z=32, H=2, M=512, N=384, K
     # compare
     torch.testing.assert_close(rc, tc)
 
-
+@disable_on_rocm
 @pytest.mark.skipif(not _triton_available, reason="Triton requires a recent CUDA gpu")
 @pytest.mark.parametrize("BLOCK", [32, 128])
 @pytest.mark.parametrize("WIDTH", [256, 576, 1024, 1792])
@@ -147,7 +148,7 @@ def test_softmax(BLOCK, WIDTH, DTYPE):
     # compare
     torch.testing.assert_close(ry, ty)
 
-
+@disable_on_rocm
 @pytest.mark.skipif(not _triton_available, reason="Triton requires a recent CUDA gpu")
 @pytest.mark.parametrize("block", [32, 43, 128])  # 16, 32,
 @pytest.mark.parametrize("dtype", [torch.float16])
@@ -220,7 +221,7 @@ def loss_fn(x):
                 msg=f"Triton grad {torch.norm(g1).item()} and torch grad {torch.norm(g2).item()}",
             )
 
-
+@disable_on_rocm
 @pytest.mark.skipif(not _triton_available, reason="Triton requires a recent CUDA gpu")
 @pytest.mark.parametrize("dtype", [torch.float16])
 def test_blocksparse_attention_parity(dtype):
diff --git a/tests/test_triton_layernorm.py b/tests/test_triton_layernorm.py
index e89a40196..3946061ee 100644
--- a/tests/test_triton_layernorm.py
+++ b/tests/test_triton_layernorm.py
@@ -12,6 +12,8 @@
 
 import xformers
 
+disable_on_rocm = pytest.mark.skipif(not not torch.version.hip, reason="could not be done on ROCM")
+
 try:
     from xformers.triton import FusedLayerNorm
     from xformers.triton.utils import gpu_capabilities_older_than_70
@@ -34,7 +36,7 @@
     (1, 2048, 12288),
 ]
 
-
+@disable_on_rocm
 @pytest.mark.skipif(not _triton_available, reason="Triton is not available")
 @pytest.mark.skipif(
     not _triton_available or gpu_capabilities_older_than_70(),
@@ -102,7 +104,7 @@ def test_layernorm_parity(shape, amp):
             + f" {torch.norm(triton_layernorm.bias.grad)}"
         )
 
-
+@disable_on_rocm
 @pytest.mark.skipif(not _triton_available, reason="Triton is not available")
 @pytest.mark.parametrize("dtype", [torch.float16, torch.float32, torch.bfloat16])
 def test_no_contiguous(dtype):

From 6dae63c059a35061bd67e338c788f5067e2ce4d5 Mon Sep 17 00:00:00 2001
From: Qianfeng Zhang <Qianfeng.Zhang@amd.com>
Date: Mon, 5 Feb 2024 23:25:16 +0000
Subject: [PATCH 427/641] Update to test_mem_eff_attention.py

---
 tests/test_mem_eff_attention.py | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

diff --git a/tests/test_mem_eff_attention.py b/tests/test_mem_eff_attention.py
index c86952877..183627d0b 100644
--- a/tests/test_mem_eff_attention.py
+++ b/tests/test_mem_eff_attention.py
@@ -2067,6 +2067,9 @@ def test_attn_bias_blockdiag_doc() -> None:
 
     from xformers.ops import fmha
 
+    if torch.version.hip and fmha.ck.FwOp.IS_CK_TILED:
+        pytest.skip("backward pass/gradience is not yet supported by ck-tiled fmha!")
+
     K = 16
     dtype = torch.float16
     device = "cuda"
@@ -2507,7 +2510,7 @@ def test_empty_tensors_empty_query(
     )
     opFW = opFW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv[0]
 
-    if opFW is fmha.ck.FwOp and opFW.IS_CK_TILED:
+    if torch.version.hip and fmha.ck.FwOp.IS_CK_TILED:
         pytest.skip("backward pass/gradience is not yet supported by ck-tiled fmha!")
 
     if opFW is fmha.triton_splitk.FwOp and (sys.version_info.major, sys.version_info.minor) <= (3, 8):
@@ -2535,9 +2538,9 @@ def test_empty_tensors_empty_kv(
     )
     opFW = opFW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv[0]
 
-    if opFW is fmha.ck.FwOp and opFW.IS_CK_TILED:
+    if torch.version.hip and fmha.ck.FwOp.IS_CK_TILED:
         pytest.skip("backward pass/gradience is not yet supported by ck-tiled fmha!")
-
+        
     if opFW is fmha.triton_splitk.FwOp and (sys.version_info.major, sys.version_info.minor) <= (3, 8):
         pytest.skip("triton_splitk requires python 3.9 or above!")
 
@@ -2563,7 +2566,7 @@ def test_empty_tensors_empty_b(
     )
     opFW = opFW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv[0]
 
-    if opFW is fmha.ck.FwOp and opFW.IS_CK_TILED:
+    if torch.version.hip and fmha.ck.FwOp.IS_CK_TILED:
         pytest.skip("backward pass/gradience is not yet supported by ck-tiled fmha!")
 
     if opFW is fmha.triton_splitk.FwOp and (sys.version_info.major, sys.version_info.minor) <= (3, 8):

From 0624c92a23d7962123cacd418d95301a54f0485e Mon Sep 17 00:00:00 2001
From: Max Podkorytov <4273004+tenpercent@users.noreply.github.com>
Date: Tue, 6 Feb 2024 01:20:45 +0000
Subject: [PATCH 428/641] apply isort

---
 tests/test_mqa_forward_ck_tiled_discarded.py     |  2 +-
 .../benchmarks/benchmark_mem_eff_attention.py    |  2 +-
 .../benchmark_mem_eff_atttention_mqa.py          |  3 +--
 xformers/benchmarks/benchmark_swiglu.py          |  2 +-
 xformers/benchmarks/benchmark_transformer.py     |  2 +-
 xformers/ops/__init__.py                         |  4 ++--
 xformers/ops/fmha/__init__.py                    | 14 ++++++++++++--
 xformers/ops/fmha/ck.py                          |  3 ++-
 xformers/ops/fmha/ck_decoder.py                  |  6 ++++--
 xformers/ops/fmha/ck_splitk.py                   | 12 ++++++++++--
 xformers/ops/fmha/common.py                      |  1 -
 xformers/ops/fmha/dispatch.py                    | 16 ++++++++++++++--
 xformers/ops/fmha/triton.py                      |  4 +---
 13 files changed, 50 insertions(+), 21 deletions(-)

diff --git a/tests/test_mqa_forward_ck_tiled_discarded.py b/tests/test_mqa_forward_ck_tiled_discarded.py
index 5d11b8e40..fc91f0dcc 100644
--- a/tests/test_mqa_forward_ck_tiled_discarded.py
+++ b/tests/test_mqa_forward_ck_tiled_discarded.py
@@ -13,10 +13,10 @@
 from torch.utils.checkpoint import checkpoint
 
 import xformers.ops
+from xformers.attn_bias_utils import create_attn_bias
 from xformers.ops import fmha
 from xformers.ops.common import get_xformers_operator
 from xformers.ops.fmha.common import AttentionOpBase
-from xformers.attn_bias_utils import create_attn_bias
 
 from .utils import assert_allclose
 
diff --git a/xformers/benchmarks/benchmark_mem_eff_attention.py b/xformers/benchmarks/benchmark_mem_eff_attention.py
index baaa7d2c8..5c5305a16 100644
--- a/xformers/benchmarks/benchmark_mem_eff_attention.py
+++ b/xformers/benchmarks/benchmark_mem_eff_attention.py
@@ -10,11 +10,11 @@
 
 import torch
 from torch.utils import benchmark
-from xformers.benchmarks.utils import benchmark_main_helper
 
 import xformers.ops
 import xformers.ops.fmha as fmha
 from xformers.attn_bias_utils import create_attn_bias
+from xformers.benchmarks.utils import benchmark_main_helper
 
 torch.backends.cuda.matmul.allow_tf32 = False
 
diff --git a/xformers/benchmarks/benchmark_mem_eff_atttention_mqa.py b/xformers/benchmarks/benchmark_mem_eff_atttention_mqa.py
index 12b8f7b91..14e1700bd 100644
--- a/xformers/benchmarks/benchmark_mem_eff_atttention_mqa.py
+++ b/xformers/benchmarks/benchmark_mem_eff_atttention_mqa.py
@@ -10,12 +10,11 @@
 
 import torch
 from torch.utils import benchmark
-from xformers.benchmarks.utils import benchmark_main_helper
 
 import xformers.ops
 import xformers.ops.fmha as fmha
-
 from xformers.attn_bias_utils import create_attn_bias
+from xformers.benchmarks.utils import benchmark_main_helper
 
 torch.backends.cuda.matmul.allow_tf32 = False
 
diff --git a/xformers/benchmarks/benchmark_swiglu.py b/xformers/benchmarks/benchmark_swiglu.py
index fc59ac45d..b268d3f19 100644
--- a/xformers/benchmarks/benchmark_swiglu.py
+++ b/xformers/benchmarks/benchmark_swiglu.py
@@ -11,9 +11,9 @@
 
 import torch
 from torch.utils import benchmark
-from xformers.benchmarks.utils import benchmark_main_helper
 
 import xformers.ops.swiglu_op as xsw
+from xformers.benchmarks.utils import benchmark_main_helper
 
 min_run_time = 0.5
 device = torch.device("cuda")
diff --git a/xformers/benchmarks/benchmark_transformer.py b/xformers/benchmarks/benchmark_transformer.py
index dad518331..2a6070b62 100644
--- a/xformers/benchmarks/benchmark_transformer.py
+++ b/xformers/benchmarks/benchmark_transformer.py
@@ -15,9 +15,9 @@
 from timm.models.vision_transformer import Attention as TimmAttention
 from timm.models.vision_transformer import Block as TimmBlock
 from torch.utils import benchmark
-from xformers.benchmarks.utils import benchmark_main_helper
 
 import xformers.ops as xops
+from xformers.benchmarks.utils import benchmark_main_helper
 
 
 def replace_module(module: nn.Module, replace_class, factory):
diff --git a/xformers/ops/__init__.py b/xformers/ops/__init__.py
index 9d1ef2608..25bbbfc4d 100644
--- a/xformers/ops/__init__.py
+++ b/xformers/ops/__init__.py
@@ -11,14 +11,14 @@
     AttentionOpBase,
     AttentionOpDispatch,
     LowerTriangularMask,
+    MemoryEfficientAttentionCkOp,
     MemoryEfficientAttentionCutlassFwdFlashBwOp,
     MemoryEfficientAttentionCutlassOp,
     MemoryEfficientAttentionFlashAttentionOp,
     MemoryEfficientAttentionOp,
+    MemoryEfficientAttentionSplitKCkOp,
     MemoryEfficientAttentionTritonFwdFlashBwOp,
     TritonFlashAttentionOp,
-    MemoryEfficientAttentionCkOp,
-    MemoryEfficientAttentionSplitKCkOp,
     memory_efficient_attention,
     memory_efficient_attention_backward,
     memory_efficient_attention_forward,
diff --git a/xformers/ops/fmha/__init__.py b/xformers/ops/fmha/__init__.py
index 06b995c30..b1da96542 100644
--- a/xformers/ops/fmha/__init__.py
+++ b/xformers/ops/fmha/__init__.py
@@ -7,8 +7,18 @@
 
 import torch
 
-
-from . import attn_bias, cutlass, decoder, flash, small_k, triton, triton_splitk, ck, ck_decoder, ck_splitk
+from . import (
+    attn_bias,
+    ck,
+    ck_decoder,
+    ck_splitk,
+    cutlass,
+    decoder,
+    flash,
+    small_k,
+    triton,
+    triton_splitk,
+)
 from .attn_bias import AttentionBias, BlockDiagonalMask, LowerTriangularMask
 from .common import (
     AttentionBwOpBase,
diff --git a/xformers/ops/fmha/ck.py b/xformers/ops/fmha/ck.py
index 000a07e56..268b0dd1f 100644
--- a/xformers/ops/fmha/ck.py
+++ b/xformers/ops/fmha/ck.py
@@ -7,7 +7,7 @@
 from dataclasses import replace
 from enum import Enum
 from functools import partial
-from typing import Any, List, Optional, Set, Tuple, Union, Mapping
+from typing import Any, List, Mapping, Optional, Set, Tuple, Union
 
 import torch
 
@@ -35,6 +35,7 @@
     check_lastdim_alignment_stride1,
 )
 
+
 def _minimum_gemm_alignment(inp: Inputs) -> int:
     return 1
 
diff --git a/xformers/ops/fmha/ck_decoder.py b/xformers/ops/fmha/ck_decoder.py
index 3579a3f0a..6b1d76f9c 100644
--- a/xformers/ops/fmha/ck_decoder.py
+++ b/xformers/ops/fmha/ck_decoder.py
@@ -1,10 +1,12 @@
 # TODO(max): add a proper copyright header
+from typing import Any, List, Optional, Set, Tuple
+
 import torch
 
-from typing import Any, Set, List, Tuple, Optional
+from ..common import get_xformers_operator, register_operator
 from .attn_bias import BlockDiagonalCausalWithOffsetPaddedKeysMask
 from .common import AttentionFwOpBase, Context, Inputs
-from ..common import get_xformers_operator, register_operator
+
 
 @register_operator
 class FwOp(AttentionFwOpBase):
diff --git a/xformers/ops/fmha/ck_splitk.py b/xformers/ops/fmha/ck_splitk.py
index 49238f83d..3dd2fd7c7 100644
--- a/xformers/ops/fmha/ck_splitk.py
+++ b/xformers/ops/fmha/ck_splitk.py
@@ -1,8 +1,16 @@
+from typing import Any, List, Optional, Set, Tuple
+
 import torch
-from typing import Any, List, Set, Tuple, Optional
+
 from xformers.ops.common import get_xformers_operator, register_operator
 from xformers.ops.fmha.attn_bias import BlockDiagonalCausalWithOffsetPaddedKeysMask
-from xformers.ops.fmha.common import AttentionFwOpBase, Context, Inputs, check_lastdim_alignment_stride1
+from xformers.ops.fmha.common import (
+    AttentionFwOpBase,
+    Context,
+    Inputs,
+    check_lastdim_alignment_stride1,
+)
+
 
 @register_operator
 class FwOp(AttentionFwOpBase):
diff --git a/xformers/ops/fmha/common.py b/xformers/ops/fmha/common.py
index 9808b5934..18ad70be4 100644
--- a/xformers/ops/fmha/common.py
+++ b/xformers/ops/fmha/common.py
@@ -3,7 +3,6 @@
 # This source code is licensed under the BSD license found in the
 # LICENSE file in the root directory of this source tree.
 
-from functools import partial
 import math
 from dataclasses import dataclass
 from functools import partial
diff --git a/xformers/ops/fmha/dispatch.py b/xformers/ops/fmha/dispatch.py
index 0acb7eb35..0af07b3e9 100644
--- a/xformers/ops/fmha/dispatch.py
+++ b/xformers/ops/fmha/dispatch.py
@@ -5,11 +5,23 @@
 
 
 import textwrap
-import torch
 from collections import deque
 from typing import List, Sequence, Type, TypeVar
 
-from . import attn_bias, cutlass, decoder, flash, small_k, triton, triton_splitk, ck, ck_decoder, ck_splitk
+import torch
+
+from . import (
+    attn_bias,
+    ck,
+    ck_decoder,
+    ck_splitk,
+    cutlass,
+    decoder,
+    flash,
+    small_k,
+    triton,
+    triton_splitk,
+)
 from .common import AttentionBwOpBase, AttentionFwOpBase, Inputs
 
 
diff --git a/xformers/ops/fmha/triton.py b/xformers/ops/fmha/triton.py
index 6dccc1cb9..08018f56f 100644
--- a/xformers/ops/fmha/triton.py
+++ b/xformers/ops/fmha/triton.py
@@ -16,18 +16,16 @@
 from typing import Any, List, Mapping, Optional, Set, Tuple
 
 import torch
-
 import triton
 import triton.language as tl
 
 from ..common import register_operator
-
 from .attn_bias import (
     BlockDiagonalCausalMask,
     BlockDiagonalCausalWithOffsetPaddedKeysMask,
     LowerTriangularMask,
 )
-from .common import AttentionFwOpBase, check_lastdim_alignment_stride1, Context, Inputs
+from .common import AttentionFwOpBase, Context, Inputs, check_lastdim_alignment_stride1
 
 
 @triton.jit

From b8ebf080d247447a0199228c0045c81c0d60b45e Mon Sep 17 00:00:00 2001
From: Max Podkorytov <4273004+tenpercent@users.noreply.github.com>
Date: Tue, 6 Feb 2024 01:27:40 +0000
Subject: [PATCH 429/641] apply black

---
 setup.py                                      | 284 +++++++++++++-----
 tests/test_attentions.py                      |   7 +-
 tests/test_checkpoint.py                      |  14 +-
 tests/test_ck_7.py                            |  21 +-
 tests/test_core_attention.py                  |   7 +-
 tests/test_custom_ops.py                      |   7 +-
 tests/test_mem_eff_attention.py               | 147 ++++++---
 tests/test_mem_eff_attention_ck_discarded.py  | 105 ++++---
 tests/test_mqa_forward_ck_tiled_discarded.py  |  35 ++-
 tests/test_sparse_tensors.py                  |   8 +-
 tests/test_swiglu.py                          |   6 +-
 tests/test_triton_blocksparse.py              |   9 +-
 tests/test_triton_layernorm.py                |   6 +-
 .../benchmarks/benchmark_attn_decoding.py     |   5 +-
 .../benchmark_mem_eff_attn_decoder.py         |   4 +-
 .../benchmark_mem_eff_atttention_mqa.py       |  22 +-
 xformers/benchmarks/utils.py                  |   6 +-
 xformers/ops/common.py                        |   5 +-
 xformers/ops/fmha/__init__.py                 |   3 +-
 xformers/ops/fmha/ck.py                       |  45 ++-
 xformers/ops/fmha/ck_decoder.py               |  26 +-
 xformers/ops/fmha/ck_splitk.py                |  19 +-
 xformers/ops/fmha/common.py                   |   6 +-
 xformers/ops/fmha/dispatch.py                 |  26 +-
 24 files changed, 598 insertions(+), 225 deletions(-)

diff --git a/setup.py b/setup.py
index f56dbeca7..59867a805 100644
--- a/setup.py
+++ b/setup.py
@@ -214,54 +214,199 @@ def get_flash_attention_extensions(cuda_version: int, extra_compile_args):
         )
     ]
 
+
 def rename_cpp_cu(cpp_files):
     for entry in cpp_files:
-        shutil.copy(entry, os.path.splitext(entry)[0] + '.cu')
+        shutil.copy(entry, os.path.splitext(entry)[0] + ".cu")
+
 
 def get_extensions():
     extensions_dir = os.path.join("xformers", "csrc")
 
-    sources = glob.glob(os.path.join(extensions_dir, "attention", "*.cpp"), recursive=False)
-    sources += glob.glob(os.path.join(extensions_dir, "attention", "autograd", "**", "*.cpp"), recursive=True)
-    sources += glob.glob(os.path.join(extensions_dir, "attention", "cpu", "**", "*.cpp"), recursive=True)
-    sources += glob.glob(os.path.join(extensions_dir, "indexing", "**", "*.cpp"), recursive=True)
-    sources += glob.glob(os.path.join(extensions_dir, "swiglu", "**", "*.cpp"), recursive=True)
-    
+    sources = glob.glob(
+        os.path.join(extensions_dir, "attention", "*.cpp"), recursive=False
+    )
+    sources += glob.glob(
+        os.path.join(extensions_dir, "attention", "autograd", "**", "*.cpp"),
+        recursive=True,
+    )
+    sources += glob.glob(
+        os.path.join(extensions_dir, "attention", "cpu", "**", "*.cpp"), recursive=True
+    )
+    sources += glob.glob(
+        os.path.join(extensions_dir, "indexing", "**", "*.cpp"), recursive=True
+    )
+    sources += glob.glob(
+        os.path.join(extensions_dir, "swiglu", "**", "*.cpp"), recursive=True
+    )
+
     ## avoid the temporary .cu file under xformers/csrc/attention/hip_fmha are included
     source_cuda = glob.glob(os.path.join(extensions_dir, "*.cu"), recursive=False)
-    source_cuda += glob.glob(os.path.join(extensions_dir, "attention", "cuda", "**", "*.cu"), recursive=True)
-    source_cuda += glob.glob(os.path.join(extensions_dir, "indexing", "**", "*.cu"), recursive=True)
-    source_cuda += glob.glob(os.path.join(extensions_dir, "swiglu", "**", "*.cu"), recursive=True)
+    source_cuda += glob.glob(
+        os.path.join(extensions_dir, "attention", "cuda", "**", "*.cu"), recursive=True
+    )
+    source_cuda += glob.glob(
+        os.path.join(extensions_dir, "indexing", "**", "*.cu"), recursive=True
+    )
+    source_cuda += glob.glob(
+        os.path.join(extensions_dir, "swiglu", "**", "*.cu"), recursive=True
+    )
+
+    source_hip = glob.glob(
+        os.path.join(extensions_dir, "attention", "hip_fmha", "ck_fmha_test.cpp"),
+        recursive=False,
+    )
+    source_hip += glob.glob(
+        os.path.join(
+            extensions_dir, "attention", "hip_fmha", "attention_forward_decoder.cpp"
+        ),
+        recursive=False,
+    )
 
-    source_hip = glob.glob(os.path.join(extensions_dir, "attention", "hip_fmha", "ck_fmha_test.cpp"), recursive=False)
-    source_hip += glob.glob(os.path.join(extensions_dir, "attention", "hip_fmha", "attention_forward_decoder.cpp"), recursive=False)
-    
     source_hip_decoder = [
-        *glob.glob(os.path.join(extensions_dir, "attention", "hip_fmha", "attention_forward_decoder.cpp"), recursive=False),
-        *glob.glob(os.path.join(extensions_dir, "attention", "hip_fmha", "attention_forward_splitk.cpp"), recursive=False)
+        *glob.glob(
+            os.path.join(
+                extensions_dir, "attention", "hip_fmha", "attention_forward_decoder.cpp"
+            ),
+            recursive=False,
+        ),
+        *glob.glob(
+            os.path.join(
+                extensions_dir, "attention", "hip_fmha", "attention_forward_splitk.cpp"
+            ),
+            recursive=False,
+        ),
     ]
 
     if os.getenv("FORCE_OLD_CK_KERNEL", "0") == "1":
-        source_hip += glob.glob(os.path.join(extensions_dir, "attention", "hip_fmha", "attention_forward_generic.cpp"), recursive=False)
-        source_hip += glob.glob(os.path.join(extensions_dir, "attention", "hip_fmha", "attention_backward_generic.cpp"), recursive=False)
-        source_hip += glob.glob(os.path.join(extensions_dir, "attention", "hip_fmha", "attention_ck_rand_uniform.cpp"), recursive=False)
-        source_hip += glob.glob(os.path.join(extensions_dir, "attention", "hip_fmha", "ck_fmha_batched_infer_*.cpp"), recursive=False)
-        source_hip += glob.glob(os.path.join(extensions_dir, "attention", "hip_fmha", "ck_fmha_grouped_infer_*.cpp"), recursive=False)
-        source_hip += glob.glob(os.path.join(extensions_dir, "attention", "hip_fmha", "ck_fmha_batched_forward_*.cpp"), recursive=False)
-        source_hip += glob.glob(os.path.join(extensions_dir, "attention", "hip_fmha", "ck_fmha_grouped_forward_*.cpp"), recursive=False)
-        source_hip += glob.glob(os.path.join(extensions_dir, "attention", "hip_fmha", "ck_fmha_batched_backward_*.cpp"), recursive=False)
-        source_hip += glob.glob(os.path.join(extensions_dir, "attention", "hip_fmha", "ck_fmha_grouped_backward_*.cpp"), recursive=False)
-        source_hip += glob.glob(os.path.join(extensions_dir, "attention", "hip_fmha", "instances", "ck_fmha_*.cpp"), recursive=False)
+        source_hip += glob.glob(
+            os.path.join(
+                extensions_dir, "attention", "hip_fmha", "attention_forward_generic.cpp"
+            ),
+            recursive=False,
+        )
+        source_hip += glob.glob(
+            os.path.join(
+                extensions_dir,
+                "attention",
+                "hip_fmha",
+                "attention_backward_generic.cpp",
+            ),
+            recursive=False,
+        )
+        source_hip += glob.glob(
+            os.path.join(
+                extensions_dir, "attention", "hip_fmha", "attention_ck_rand_uniform.cpp"
+            ),
+            recursive=False,
+        )
+        source_hip += glob.glob(
+            os.path.join(
+                extensions_dir, "attention", "hip_fmha", "ck_fmha_batched_infer_*.cpp"
+            ),
+            recursive=False,
+        )
+        source_hip += glob.glob(
+            os.path.join(
+                extensions_dir, "attention", "hip_fmha", "ck_fmha_grouped_infer_*.cpp"
+            ),
+            recursive=False,
+        )
+        source_hip += glob.glob(
+            os.path.join(
+                extensions_dir, "attention", "hip_fmha", "ck_fmha_batched_forward_*.cpp"
+            ),
+            recursive=False,
+        )
+        source_hip += glob.glob(
+            os.path.join(
+                extensions_dir, "attention", "hip_fmha", "ck_fmha_grouped_forward_*.cpp"
+            ),
+            recursive=False,
+        )
+        source_hip += glob.glob(
+            os.path.join(
+                extensions_dir,
+                "attention",
+                "hip_fmha",
+                "ck_fmha_batched_backward_*.cpp",
+            ),
+            recursive=False,
+        )
+        source_hip += glob.glob(
+            os.path.join(
+                extensions_dir,
+                "attention",
+                "hip_fmha",
+                "ck_fmha_grouped_backward_*.cpp",
+            ),
+            recursive=False,
+        )
+        source_hip += glob.glob(
+            os.path.join(
+                extensions_dir, "attention", "hip_fmha", "instances", "ck_fmha_*.cpp"
+            ),
+            recursive=False,
+        )
     else:
-        source_hip += glob.glob(os.path.join(extensions_dir, "attention", "hip_fmha", "attention_forward_generic_ck_tiled.cpp"), recursive=False)
-        source_hip += glob.glob(os.path.join(extensions_dir, "attention", "hip_fmha", "ck_tiled_fmha_batched_infer_*.cpp"), recursive=False)
-        source_hip += glob.glob(os.path.join(extensions_dir, "attention", "hip_fmha", "ck_tiled_fmha_grouped_infer_*.cpp"), recursive=False)
-        source_hip += glob.glob(os.path.join(extensions_dir, "attention", "hip_fmha", "ck_tiled_fmha_batched_forward_*.cpp"), recursive=False)
-        source_hip += glob.glob(os.path.join(extensions_dir, "attention", "hip_fmha", "ck_tiled_fmha_grouped_forward_*.cpp"), recursive=False)
-        source_hip += glob.glob(os.path.join(extensions_dir, "attention", "hip_fmha", "instances_tiled", "ck_tiled_fmha_*.cpp"), recursive=False)
+        source_hip += glob.glob(
+            os.path.join(
+                extensions_dir,
+                "attention",
+                "hip_fmha",
+                "attention_forward_generic_ck_tiled.cpp",
+            ),
+            recursive=False,
+        )
+        source_hip += glob.glob(
+            os.path.join(
+                extensions_dir,
+                "attention",
+                "hip_fmha",
+                "ck_tiled_fmha_batched_infer_*.cpp",
+            ),
+            recursive=False,
+        )
+        source_hip += glob.glob(
+            os.path.join(
+                extensions_dir,
+                "attention",
+                "hip_fmha",
+                "ck_tiled_fmha_grouped_infer_*.cpp",
+            ),
+            recursive=False,
+        )
+        source_hip += glob.glob(
+            os.path.join(
+                extensions_dir,
+                "attention",
+                "hip_fmha",
+                "ck_tiled_fmha_batched_forward_*.cpp",
+            ),
+            recursive=False,
+        )
+        source_hip += glob.glob(
+            os.path.join(
+                extensions_dir,
+                "attention",
+                "hip_fmha",
+                "ck_tiled_fmha_grouped_forward_*.cpp",
+            ),
+            recursive=False,
+        )
+        source_hip += glob.glob(
+            os.path.join(
+                extensions_dir,
+                "attention",
+                "hip_fmha",
+                "instances_tiled",
+                "ck_tiled_fmha_*.cpp",
+            ),
+            recursive=False,
+        )
 
     source_hip += source_hip_decoder
-    
+
     sputnik_dir = os.path.join(this_dir, "third_party", "sputnik")
     cutlass_dir = os.path.join(this_dir, "third_party", "cutlass", "include")
     cutlass_examples_dir = os.path.join(this_dir, "third_party", "cutlass", "examples")
@@ -340,42 +485,46 @@ def get_extensions():
             "--ptxas-options=-O2",
             "--ptxas-options=-allow-expensive-optimizations=true",
         ]
-    elif torch.cuda.is_available() and torch.version.hip: 
-       rename_cpp_cu(source_hip)
-       source_hip_cu = []
-       for ff in source_hip:
-           source_hip_cu += [ff.replace(".cpp", ".cu")]
-
-       extension = CUDAExtension
-       sources += source_hip_cu
-       include_dirs += [ Path(this_dir) / 'xformers' / 'csrc' / 'attention' / 'hip_fmha' ]
-
-       if os.getenv("FORCE_OLD_CK_KERNEL", "0") == "1":
-           include_dirs += [ Path(this_dir) / 'third_party' / 'composable_kernel' / 'include']
-       else:
-           include_dirs += [ Path(this_dir) / 'third_party' / 'composable_kernel_tiled' / 'include']
-           
-       if os.getenv("FORCE_OLD_CK_KERNEL", "0") == "1":
-           generator_flag = []
-       else:
-           generator_flag = ["-DUSE_CK_TILED_KERNEL"]
-       cc_flag = ["-DBUILD_PYTHON_PACKAGE"]
-       extra_compile_args={
+    elif torch.cuda.is_available() and torch.version.hip:
+        rename_cpp_cu(source_hip)
+        source_hip_cu = []
+        for ff in source_hip:
+            source_hip_cu += [ff.replace(".cpp", ".cu")]
+
+        extension = CUDAExtension
+        sources += source_hip_cu
+        include_dirs += [
+            Path(this_dir) / "xformers" / "csrc" / "attention" / "hip_fmha"
+        ]
+
+        if os.getenv("FORCE_OLD_CK_KERNEL", "0") == "1":
+            include_dirs += [
+                Path(this_dir) / "third_party" / "composable_kernel" / "include"
+            ]
+        else:
+            include_dirs += [
+                Path(this_dir) / "third_party" / "composable_kernel_tiled" / "include"
+            ]
+
+        if os.getenv("FORCE_OLD_CK_KERNEL", "0") == "1":
+            generator_flag = []
+        else:
+            generator_flag = ["-DUSE_CK_TILED_KERNEL"]
+        cc_flag = ["-DBUILD_PYTHON_PACKAGE"]
+        extra_compile_args = {
             "cxx": ["-O3", "-std=c++17"] + generator_flag,
-            "nvcc":
-                [
-                    "-O3",
-                    "-std=c++17",
-                    f"--offload-arch={os.getenv('HIP_ARCHITECTURES', 'native')}",
-                    "-U__CUDA_NO_HALF_OPERATORS__",
-                    "-U__CUDA_NO_HALF_CONVERSIONS__",
-                    "-DCK_FMHA_FWD_FAST_EXP2=1",
-                    "-fgpu-flush-denormals-to-zero",
-                ]
-                + generator_flag
-                + cc_flag
-            ,
-       } 
+            "nvcc": [
+                "-O3",
+                "-std=c++17",
+                f"--offload-arch={os.getenv('HIP_ARCHITECTURES', 'native')}",
+                "-U__CUDA_NO_HALF_OPERATORS__",
+                "-U__CUDA_NO_HALF_CONVERSIONS__",
+                "-DCK_FMHA_FWD_FAST_EXP2=1",
+                "-fgpu-flush-denormals-to-zero",
+            ]
+            + generator_flag
+            + cc_flag,
+        }
 
     ext_modules.append(
         extension(
@@ -406,6 +555,7 @@ def get_extensions():
         },
     }
 
+
 class clean(distutils.command.clean.clean):  # type: ignore
     def run(self):
         if os.path.exists(".gitignore"):
diff --git a/tests/test_attentions.py b/tests/test_attentions.py
index 038c55baa..31f7721fb 100644
--- a/tests/test_attentions.py
+++ b/tests/test_attentions.py
@@ -22,7 +22,9 @@
     build_attention,
 )
 
-disable_on_rocm = pytest.mark.skipif(not not torch.version.hip, reason="could not be done on ROCM")
+disable_on_rocm = pytest.mark.skipif(
+    not not torch.version.hip, reason="could not be done on ROCM"
+)
 
 DEVICES = (
     [torch.device("cpu")] if not torch.cuda.is_available() else [torch.device("cuda")]
@@ -92,6 +94,7 @@ def noop(x):
 
     return multi_head
 
+
 @disable_on_rocm
 @pytest.mark.parametrize("attn_dropout", [0.0, 0.3])
 @pytest.mark.parametrize("residual_dropout", [0.0, 0.1])
@@ -162,6 +165,7 @@ def test_order_invariance(
             with torch.cuda.amp.autocast(enabled=True):
                 _ = multi_head(inputs, inputs_shuffled, inputs)
 
+
 @disable_on_rocm
 @pytest.mark.parametrize("heads", [1, 4])
 @pytest.mark.parametrize("attention_name", ["scaled_dot_product"])
@@ -205,6 +209,7 @@ def test_kqv_ordering(
     res_false = multi_head(query=v, key=k, value=q)
     assert torch.allclose(res_false[0, :, :], res_false[1, :, :])
 
+
 @disable_on_rocm
 @pytest.mark.parametrize("heads", [1, 4])
 @pytest.mark.parametrize("attention_name", ["scaled_dot_product"])
diff --git a/tests/test_checkpoint.py b/tests/test_checkpoint.py
index eab74a172..8e456d345 100644
--- a/tests/test_checkpoint.py
+++ b/tests/test_checkpoint.py
@@ -14,7 +14,9 @@
 from xformers import checkpoint, list_operators
 
 cuda_only = pytest.mark.skipif(not torch.cuda.is_available(), reason="requires CUDA")
-disable_on_rocm = pytest.mark.skipif(not not torch.version.hip, reason="could not be done on ROCM")
+disable_on_rocm = pytest.mark.skipif(
+    not not torch.version.hip, reason="could not be done on ROCM"
+)
 _devices = ["cpu"]
 cuda_cap = (0, 0)
 
@@ -30,6 +32,7 @@ def _relu_policy(func, *args, **kwargs):
 def _all_policy(func, *args, **kwargs):
     return True
 
+
 @disable_on_rocm
 @pytest.mark.parametrize("policy_fn", [None, [], _relu_policy, _all_policy])
 @pytest.mark.parametrize("input_requires_grad", [True, False])
@@ -103,7 +106,9 @@ def test_checkpoint_with_grad(policy_fn, input_requires_grad, grad_mode):
     "op",
     [
         xformers.ops.MemoryEfficientAttentionFlashAttentionOp,
-        xformers.ops.MemoryEfficientAttentionCutlassOp if torch.version.cuda else xformers.ops.MemoryEfficientAttentionCkOp,
+        xformers.ops.MemoryEfficientAttentionCutlassOp
+        if torch.version.cuda
+        else xformers.ops.MemoryEfficientAttentionCkOp,
     ],
 )
 def test_checkpoint_attention(policy_fn, input_requires_grad, device, autocast, op):
@@ -113,7 +118,10 @@ def test_checkpoint_attention(policy_fn, input_requires_grad, device, autocast,
     ):
         pytest.skip("skipping operator not supported in this arch")
 
-    if op is xformers.ops.MemoryEfficientAttentionFlashAttentionOp and torch.version.hip:
+    if (
+        op is xformers.ops.MemoryEfficientAttentionFlashAttentionOp
+        and torch.version.hip
+    ):
         pytest.skip("FlashAttentionOp is not supported on ROCM!")
 
     if op is xformers.ops.MemoryEfficientAttentionCkOp and op[0].IS_CK_TILED:
diff --git a/tests/test_ck_7.py b/tests/test_ck_7.py
index 00a42ead0..6f6124945 100644
--- a/tests/test_ck_7.py
+++ b/tests/test_ck_7.py
@@ -36,6 +36,7 @@
     fmha.ck.BwOp,
 ]
 
+
 def sample_random_supported_fw(
     inp: fmha.Inputs, seed: int
 ) -> Type[fmha.common.AttentionFwOpBase]:
@@ -646,7 +647,9 @@ def test_key_query_all_ones(dtype, device, q_len, kv_len, batch_size, k_len):
     key = torch.ones((batch_size, kv_len, k_len), device=device, dtype=dtype)
     value = torch.randn((batch_size, kv_len, k_len), device=device, dtype=dtype) * scale
 
-    out = xformers.ops.memory_efficient_attention(query, key, value, op=(fmha.ck.FwOp, None))
+    out = xformers.ops.memory_efficient_attention(
+        query, key, value, op=(fmha.ck.FwOp, None)
+    )
     # this should be equivalent to the average over value
     ref = value.mean(1, keepdim=True).expand_as(query)
 
@@ -655,6 +658,7 @@ def test_key_query_all_ones(dtype, device, q_len, kv_len, batch_size, k_len):
     else:
         assert_allclose(out, ref, atol=1e-2)
 
+
 def _block_diag_reshape_lse(
     lse: torch.Tensor, q_seqinfo: fmha.attn_bias._SeqLenInfo
 ) -> torch.Tensor:
@@ -732,14 +736,21 @@ def test_backward(
     ) = opBW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv
 
     if k > 128 or kv > 128:
-        pytest.skip("head-dim length bigger than 128 is not supported by CK-FlashAttention-1")
+        pytest.skip(
+            "head-dim length bigger than 128 is not supported by CK-FlashAttention-1"
+        )
 
     if k % 8 != 0 or kv % 8 != 0:
         pytest.skip("head-dim length must be an even value for CK-FlashAttention-1")
 
     ## BottomRightMask requires generate {m0,m1,...}, {n0,n1,...} where mi <= ni
-    if bias_type is fmha.attn_bias.BlockDiagonalCausalFromBottomRightMask and q_len <= kv_len:
-        pytest.skip("BlockDiagonalCausalFromBottomRightMask requires kv_len bigger than q_len")
+    if (
+        bias_type is fmha.attn_bias.BlockDiagonalCausalFromBottomRightMask
+        and q_len <= kv_len
+    ):
+        pytest.skip(
+            "BlockDiagonalCausalFromBottomRightMask requires kv_len bigger than q_len"
+        )
 
     if k != kv:
         pytest.skip("k same as kv is not well tested by CK-FlashAttention-1")
@@ -864,5 +875,3 @@ def test_backward(
             atol=atol,
             rtol=rtol,
         )
-
-
diff --git a/tests/test_core_attention.py b/tests/test_core_attention.py
index 81a403e59..ba8433da4 100644
--- a/tests/test_core_attention.py
+++ b/tests/test_core_attention.py
@@ -21,7 +21,10 @@
     _is_triton_available() and not gpu_capabilities_older_than_70()
 )
 
-disable_on_rocm = pytest.mark.skipif(not not torch.version.hip, reason="could not be done on ROCM")
+disable_on_rocm = pytest.mark.skipif(
+    not not torch.version.hip, reason="could not be done on ROCM"
+)
+
 
 def catch_oor(fn):
     @functools.wraps(fn)
@@ -133,6 +136,7 @@ def test_amp_attention_sparse(device):
     expected_device = torch.float32
     assert r.dtype == expected_device
 
+
 @disable_on_rocm
 @pytest.mark.parametrize("device", _devices)
 def test_amp_attention_sparsecs(device):
@@ -149,6 +153,7 @@ def test_amp_attention_sparsecs(device):
     expected_device = torch.float32
     assert r.dtype == expected_device
 
+
 @disable_on_rocm
 @pytest.mark.skipif(
     not _is_blocksparse_available, reason="Blocksparse is not available"
diff --git a/tests/test_custom_ops.py b/tests/test_custom_ops.py
index 0a8f053d3..676952df7 100644
--- a/tests/test_custom_ops.py
+++ b/tests/test_custom_ops.py
@@ -17,7 +17,9 @@
 )
 
 cuda_only = pytest.mark.skipif(not torch.cuda.is_available(), reason="requires CUDA")
-disable_on_rocm = pytest.mark.skipif(not not torch.version.hip, reason="could not be done on ROCM")
+disable_on_rocm = pytest.mark.skipif(
+    not not torch.version.hip, reason="could not be done on ROCM"
+)
 
 _devices = ["cpu", "cuda"] if torch.cuda.is_available() else ["cpu"]
 
@@ -134,6 +136,7 @@ def compute_grads(f):
     assert torch.allclose(grad_a, a.grad)
     assert torch.allclose(grad_b, b.grad)
 
+
 @disable_on_rocm
 @pytest.mark.parametrize("device", _devices)
 def test_sddmm_sputnik(device):
@@ -253,6 +256,7 @@ def test_sddmm_coo(L, M, K, prob):
     assert res.dtype == res_gt.dtype
     assert torch.allclose(res, res_gt, atol=1e-6)
 
+
 @disable_on_rocm
 @pytest.mark.parametrize("device", _devices)
 def test_sddmm_sputnik_backward(device):
@@ -332,6 +336,7 @@ def test_sparse_softmax_sputnik_backward(device):
         grad_a, a.grad.coalesce().values().reshape_as(grad_a), atol=1e-7
     )
 
+
 @disable_on_rocm
 @pytest.mark.parametrize("device", _devices)
 def test_spmm_sputnik(device):
diff --git a/tests/test_mem_eff_attention.py b/tests/test_mem_eff_attention.py
index 183627d0b..ab4442f77 100644
--- a/tests/test_mem_eff_attention.py
+++ b/tests/test_mem_eff_attention.py
@@ -26,8 +26,12 @@
 
 torch.backends.cuda.matmul.allow_tf32 = False
 cuda_only = pytest.mark.skipif(not torch.cuda.is_available(), reason="requires CUDA")
-rocm_only = pytest.mark.skipif(not torch.cuda.is_available() or not torch.version.hip, reason="requires ROCM")
-disable_on_rocm = pytest.mark.skipif(not not torch.version.hip, reason="could not be done on ROCM")
+rocm_only = pytest.mark.skipif(
+    not torch.cuda.is_available() or not torch.version.hip, reason="requires ROCM"
+)
+disable_on_rocm = pytest.mark.skipif(
+    not not torch.version.hip, reason="could not be done on ROCM"
+)
 
 compute_capability = (0, 0)
 if torch.cuda.is_available():
@@ -313,7 +317,10 @@ def T(t):
     out = out.reshape([q.shape[0], q.shape[2], q.shape[1], v.shape[3]])
     return out.permute((0, 2, 1, 3))
 
-def ref_attention_splitk_bmhk(q, k, v, attn_bias, scale=None, split_k=None, dtype=None) -> torch.Tensor:
+
+def ref_attention_splitk_bmhk(
+    q, k, v, attn_bias, scale=None, split_k=None, dtype=None
+) -> torch.Tensor:
     assert q.ndim == 4
 
     def T(t):
@@ -327,12 +334,18 @@ def T(t):
             device=q.device,
             dtype=torch.float32,
         ).reshape([q.shape[0] * q.shape[2], q.shape[1], k.shape[1]])
-    out = ref_attention_splitk(T(q), T(k), T(v), attn_bias, scale=scale, split_k=split_k, dtype=dtype)
+    out = ref_attention_splitk(
+        T(q), T(k), T(v), attn_bias, scale=scale, split_k=split_k, dtype=dtype
+    )
     out = out.reshape([q.shape[0], q.shape[2], q.shape[1], v.shape[3]])
     return out.permute((0, 2, 1, 3))
 
-def ref_attention_splitk(q, k, v, attn_bias, scale=None, split_k=2, dtype=None) -> torch.Tensor:
+
+def ref_attention_splitk(
+    q, k, v, attn_bias, scale=None, split_k=2, dtype=None
+) -> torch.Tensor:
     if q.ndim == 5:
+
         def attn_bias_group(group: int):
             if isinstance(attn_bias, torch.Tensor):
                 return attn_bias[:, group]
@@ -345,7 +358,12 @@ def attn_bias_group(group: int):
         return torch.stack(
             [
                 ref_attention_splitk_bmhk(
-                    q[:, :, g], k[:, :, g], v[:, :, g], attn_bias=attn_bias_group(g), split_k=split_k, dtype=dtype
+                    q[:, :, g],
+                    k[:, :, g],
+                    v[:, :, g],
+                    attn_bias=attn_bias_group(g),
+                    split_k=split_k,
+                    dtype=dtype,
                 )
                 for g in range(q.shape[2])
             ],
@@ -353,7 +371,9 @@ def attn_bias_group(group: int):
         )
 
     if q.ndim == 4:
-        return ref_attention_splitk_bmhk(q, k, v, attn_bias=attn_bias, split_k=split_k, dtype=dtype)
+        return ref_attention_splitk_bmhk(
+            q, k, v, attn_bias=attn_bias, split_k=split_k, dtype=dtype
+        )
     assert q.ndim == 3
     if dtype is None:
         dtype = torch.float32
@@ -362,7 +382,7 @@ def attn_bias_group(group: int):
     v = v.to(dtype=dtype)
 
     if scale is None:
-        scale = q.shape[-1] ** -.5
+        scale = q.shape[-1] ** -0.5
     assert not q.isnan().any()
     q = q * scale
     assert not q.isnan().any()
@@ -384,15 +404,17 @@ def attn_bias_group(group: int):
             )
 
     split_size = k.size(-2) // split_k
-    split_config = { "dim": -2, "split_size_or_sections": split_size}
+    split_config = {"dim": -2, "split_size_or_sections": split_size}
     k_split = torch.split(k, **split_config)
     v_split = torch.split(v, **split_config)
-    attn_bias_split = torch.split(attn_bias_tensor, dim=-1, split_size_or_sections=split_size)
+    attn_bias_split = torch.split(
+        attn_bias_tensor, dim=-1, split_size_or_sections=split_size
+    )
 
     def compute_attention_split(q_whole, k_slice, v_slice, attn_bias_slice):
         p_slice = q_whole @ k_slice.transpose(-2, -1)
         p_slice += attn_bias_slice
-        m = torch.max(p_slice, dim = -1, keepdim=True).values
+        m = torch.max(p_slice, dim=-1, keepdim=True).values
         p_slice_scaled = p_slice - m
         p_slice_scaled[p_slice_scaled.isnan()] = float("-inf")
         s = torch.exp(p_slice_scaled)
@@ -406,8 +428,7 @@ def compute_attention_split(q_whole, k_slice, v_slice, attn_bias_slice):
 
     splits = list(zip(k_split, v_split, attn_bias_split))
 
-    slices = list(map(lambda s: compute_attention_split(q, s[0], s[1], s[2]),
-        splits))
+    slices = list(map(lambda s: compute_attention_split(q, s[0], s[1], s[2]), splits))
     out = torch.zeros_like(q)
 
     # reduce out over split-k slices
@@ -422,11 +443,11 @@ def compute_attention_split(q_whole, k_slice, v_slice, attn_bias_slice):
 
         log_alpha = -torch.abs(local_max - global_max)
         alpha = torch.exp(log_alpha)
-        alpha.nan_to_num_(1.)
+        alpha.nan_to_num_(1.0)
 
         pick_new = local_max < global_max
-        new_coef = torch.where(pick_new, alpha, 1.)
-        curr_coef = torch.where(pick_new, 1., alpha)
+        new_coef = torch.where(pick_new, alpha, 1.0)
+        curr_coef = torch.where(pick_new, 1.0, alpha)
 
         out = out * curr_coef + local_out * new_coef
         global_sumexp = global_sumexp * curr_coef + local_sumexp * new_coef
@@ -434,6 +455,7 @@ def compute_attention_split(q_whole, k_slice, v_slice, attn_bias_slice):
     out /= global_sumexp
     return out
 
+
 ## this interface assumes the tensor is in BMHK, but q and k/v might have different number of heads
 def ref_attention_mqa(q, k, v, attn_bias=None, drop_mask=None, p=0.0, scale=None):
     assert q.ndim == 4
@@ -462,14 +484,18 @@ def attn_bias_head(head: int):
     q_bmghk = q.reshape((B, M, Hkv, nhead_ratio_qk, K))
 
     return torch.stack(
-           [
-              ref_attention_bmhk(
-                   q_bmghk[:, :, :, h], k, v, attn_bias=attn_bias_head(h),
-              )
-              for h in range(q_bmghk.shape[3])
-           ],
-           dim=3,
-       ).reshape((B, M, Hq, Kv))
+        [
+            ref_attention_bmhk(
+                q_bmghk[:, :, :, h],
+                k,
+                v,
+                attn_bias=attn_bias_head(h),
+            )
+            for h in range(q_bmghk.shape[3])
+        ],
+        dim=3,
+    ).reshape((B, M, Hq, Kv))
+
 
 def _rand_partition(r: random.Random, total: int, n: int) -> List[int]:
     # returns list of n nonnegative integers summing to total
@@ -618,7 +644,10 @@ def test_forward(opFW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv, packed, fmt, **kwargs)
         kv,
     ) = opFW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv
 
-    if op is fmha.triton_splitk.FwOp and (sys.version_info.major, sys.version_info.minor) <= (3, 8):
+    if op is fmha.triton_splitk.FwOp and (
+        sys.version_info.major,
+        sys.version_info.minor,
+    ) <= (3, 8):
         pytest.skip("triton_splitk requires python 3.9 or above!")
 
     if packed and not (k == kv and q_len == kv_len):
@@ -682,13 +711,16 @@ def test_forward(opFW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv, packed, fmt, **kwargs)
         rtol=op.ERROR_RTOL.get(dtype, 1e-5),
     )
 
+
 @rocm_only
 @pytest.mark.parametrize("hdim_k,hdim_v", [(64, 64), (128, 128)])
 @pytest.mark.parametrize("nhead_q,nhead_kv", [(8, 1), (8, 2), (12, 4), (4, 4)])
 @pytest.mark.parametrize("seqlen_q,seqlen_kv", [(100, 128), (128, 100), (200, 1000)])
 @pytest.mark.parametrize("batches", [100, 64, 1])
 @pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16])
-@pytest.mark.parametrize("attn_bias_type", [type(None), torch.Tensor, fmha.attn_bias.LowerTriangularMask])
+@pytest.mark.parametrize(
+    "attn_bias_type", [type(None), torch.Tensor, fmha.attn_bias.LowerTriangularMask]
+)
 @pytest.mark.parametrize("op", [fmha.ck.FwOp])
 def test_mqa_forward(
     op,
@@ -716,7 +748,7 @@ def test_mqa_forward(
     if op is fmha.ck.FwOp and not op.IS_CK_TILED:
         pytest.skip("mqa/gqa is only supported with ck-tiled fmha")
 
-    torch.manual_seed(B * M + N * K + Hq*Hkv + Kv)
+    torch.manual_seed(B * M + N * K + Hq * Hkv + Kv)
 
     scale = 3
     query = torch.randn((B, M, Hq, K), device=device, dtype=dtype).mul_(scale)
@@ -815,7 +847,10 @@ def test_logsumexp(opFW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv):
     if op is fmha.ck.FwOp and op.IS_CK_TILED:
         pytest.skip("logsumexp is not yet supported by ck-tiled fmha!")
 
-    if op is fmha.triton_splitk.FwOp and (sys.version_info.major, sys.version_info.minor) <= (3, 8):
+    if op is fmha.triton_splitk.FwOp and (
+        sys.version_info.major,
+        sys.version_info.minor,
+    ) <= (3, 8):
         pytest.skip("triton_splitk requires python 3.9 or above!")
 
     query, key, value, attn_bias = create_tensors(
@@ -1317,7 +1352,10 @@ def test_cuda_streams(
     ) = opFW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv
     if device != "cuda":
         pytest.skip("Not CUDA")
-    if op is fmha.triton_splitk.FwOp and (sys.version_info.major, sys.version_info.minor) <= (3, 8):
+    if op is fmha.triton_splitk.FwOp and (
+        sys.version_info.major,
+        sys.version_info.minor,
+    ) <= (3, 8):
         pytest.skip("triton_splitk requires python 3.9 or above!")
 
     bias_type = None
@@ -1463,7 +1501,10 @@ def test_grad_checkpointing(
         pytest.skip("Triton Flash Attention 2 doesn't support backward pass yet")
     if op is fmha.ck.FwOp and op.IS_CK_TILED:
         pytest.skip("ck-tiled FMHA doesn't supported backward pass yet")
-    if op is fmha.triton_splitk.FwOp and (sys.version_info.major, sys.version_info.minor) <= (3, 8):
+    if op is fmha.triton_splitk.FwOp and (
+        sys.version_info.major,
+        sys.version_info.minor,
+    ) <= (3, 8):
         pytest.skip("triton_splitk requires python 3.9 or above!")
 
     bias_type = None
@@ -1538,7 +1579,10 @@ def test_unsupported_stride_lastdim(op: Type[fmha.AttentionFwOpBase]):
         0, 3, 1, 2
     )
 
-    if op is fmha.triton_splitk.FwOp and (sys.version_info.major, sys.version_info.minor) <= (3, 8):
+    if op is fmha.triton_splitk.FwOp and (
+        sys.version_info.major,
+        sys.version_info.minor,
+    ) <= (3, 8):
         pytest.skip("triton_splitk requires python 3.9 or above!")
 
     try:
@@ -1557,7 +1601,10 @@ def test_unsupported_stride_lastdim(op: Type[fmha.AttentionFwOpBase]):
 def test_unsupported_stride_alignment(op: Type[fmha.AttentionFwOpBase]):
     q = torch.empty([1, 2, 1, 33], device="cuda", dtype=torch.float16)[:, :, :, :32]
 
-    if op is fmha.triton_splitk.FwOp and (sys.version_info.major, sys.version_info.minor) <= (3, 8):
+    if op is fmha.triton_splitk.FwOp and (
+        sys.version_info.major,
+        sys.version_info.minor,
+    ) <= (3, 8):
         pytest.skip("triton_splitk requires python 3.9 or above!")
 
     try:
@@ -1955,7 +2002,7 @@ def dequant_cache(x):
 
     if torch.version.cuda:
         cutlass_output = fmha.memory_efficient_attention_forward(
-             q, k, v, attn_bias, op=fmha.cutlass.FwOp
+            q, k, v, attn_bias, op=fmha.cutlass.FwOp
         )
 
         assert_allclose(
@@ -2023,8 +2070,11 @@ def test_triton_splitk_decoder(
         dequant=dequant,
     )
 
+
 @rocm_only
-@pytest.mark.parametrize("op", [fmha.ck_splitk.FwOp_S1, fmha.ck_splitk.FwOp_S2, fmha.ck_splitk.FwOp_S4])
+@pytest.mark.parametrize(
+    "op", [fmha.ck_splitk.FwOp_S1, fmha.ck_splitk.FwOp_S2, fmha.ck_splitk.FwOp_S4]
+)
 @pytest.mark.parametrize("dtype", ["f32"])
 @pytest.mark.parametrize("kv_heads", [None, 1, 2], ids=_kv_heads_label)
 @pytest.mark.parametrize("n_heads", [16])
@@ -2037,7 +2087,7 @@ def test_splitk_decoder(
     padding: int,
     bsz: int,
     dtype: str,
-    d: int
+    d: int,
 ) -> None:
     # no quantized impl compared to cuda
     test_decoder(
@@ -2050,6 +2100,7 @@ def test_splitk_decoder(
         d=d,
     )
 
+
 def test_attn_bias_from_seqlens() -> None:
     bias = fmha.attn_bias.BlockDiagonalMask.from_seqlens([3, 5, 1])
     out = bias.split(torch.randn([1, 3 + 5 + 1, 16]))
@@ -2320,7 +2371,10 @@ def test_forward_gqa_one_group(opFW):
     k = torch.randn([B, Mkv, 1, H, K], dtype=dtype, device="cuda") * 3
     v = torch.randn([B, Mkv, 1, H, K], dtype=dtype, device="cuda") * 3
 
-    if opFW is fmha.triton_splitk.FwOp and (sys.version_info.major, sys.version_info.minor) <= (3, 8):
+    if opFW is fmha.triton_splitk.FwOp and (
+        sys.version_info.major,
+        sys.version_info.minor,
+    ) <= (3, 8):
         pytest.skip("triton_splitk requires python 3.9 or above!")
 
     supported = opFW.supports(fmha.Inputs(q, k, v))
@@ -2379,6 +2433,7 @@ def _dispatches_to_flash_decoding(q, kv):
         _dispatch_fw_priority_list(fmha.Inputs(q, kv, kv), False)[0] is fmha.flash.FwOp
     )
 
+
 @disable_on_rocm
 def test_dispatch_decoding_bmhk() -> None:
     assert not _dispatches_to_splitK(
@@ -2401,6 +2456,7 @@ def test_dispatch_decoding_bmhk() -> None:
         torch.empty([128, 2048, 1, 128]).expand(-1, -1, 32, -1),
     ), "Should not use SplitK if B is big"
 
+
 @disable_on_rocm
 def test_dispatch_decoding_bmghk() -> None:
     assert not _dispatches_to_splitK(
@@ -2485,7 +2541,7 @@ def test_mqa_decoding(op: Type[fmha.AttentionFwOpBase], dtype, B_Mkv_H_K):
     k = k.expand(-1, -1, H, -1)
     v = v.expand(-1, -1, H, -1)
 
-    if (sys.version_info.major, sys.version_info.minor) <= (3, 8): 
+    if (sys.version_info.major, sys.version_info.minor) <= (3, 8):
         pytest.skip("triton_splitk requires python 3.9 or above!")
 
     if not op.supports(fmha.Inputs(q, k, v)):
@@ -2513,7 +2569,10 @@ def test_empty_tensors_empty_query(
     if torch.version.hip and fmha.ck.FwOp.IS_CK_TILED:
         pytest.skip("backward pass/gradience is not yet supported by ck-tiled fmha!")
 
-    if opFW is fmha.triton_splitk.FwOp and (sys.version_info.major, sys.version_info.minor) <= (3, 8):
+    if opFW is fmha.triton_splitk.FwOp and (
+        sys.version_info.major,
+        sys.version_info.minor,
+    ) <= (3, 8):
         pytest.skip("triton_splitk requires python 3.9 or above!")
 
     query = query[:, :0]
@@ -2540,8 +2599,11 @@ def test_empty_tensors_empty_kv(
 
     if torch.version.hip and fmha.ck.FwOp.IS_CK_TILED:
         pytest.skip("backward pass/gradience is not yet supported by ck-tiled fmha!")
-        
-    if opFW is fmha.triton_splitk.FwOp and (sys.version_info.major, sys.version_info.minor) <= (3, 8):
+
+    if opFW is fmha.triton_splitk.FwOp and (
+        sys.version_info.major,
+        sys.version_info.minor,
+    ) <= (3, 8):
         pytest.skip("triton_splitk requires python 3.9 or above!")
 
     key = key[:, :0]
@@ -2569,7 +2631,10 @@ def test_empty_tensors_empty_b(
     if torch.version.hip and fmha.ck.FwOp.IS_CK_TILED:
         pytest.skip("backward pass/gradience is not yet supported by ck-tiled fmha!")
 
-    if opFW is fmha.triton_splitk.FwOp and (sys.version_info.major, sys.version_info.minor) <= (3, 8):
+    if opFW is fmha.triton_splitk.FwOp and (
+        sys.version_info.major,
+        sys.version_info.minor,
+    ) <= (3, 8):
         pytest.skip("triton_splitk requires python 3.9 or above!")
 
     query, key, value = query[:0], key[:0], value[:0]
diff --git a/tests/test_mem_eff_attention_ck_discarded.py b/tests/test_mem_eff_attention_ck_discarded.py
index 633ad761b..2c91ad1d9 100644
--- a/tests/test_mem_eff_attention_ck_discarded.py
+++ b/tests/test_mem_eff_attention_ck_discarded.py
@@ -39,6 +39,7 @@
     fmha.ck.BwOp,
 ]
 
+
 def sample_random_supported_fw(
     inp: fmha.Inputs, seed: int
 ) -> Type[fmha.common.AttentionFwOpBase]:
@@ -289,7 +290,9 @@ def T(t):
     return out.permute((0, 2, 1, 3))
 
 
-def ref_attention_splitk_bmhk(q, k, v, attn_bias, scale=None, split_k=None, dtype=None) -> torch.Tensor:
+def ref_attention_splitk_bmhk(
+    q, k, v, attn_bias, scale=None, split_k=None, dtype=None
+) -> torch.Tensor:
     assert q.ndim == 4
 
     def T(t):
@@ -303,13 +306,18 @@ def T(t):
             device=q.device,
             dtype=torch.float32,
         ).reshape([q.shape[0] * q.shape[2], q.shape[1], k.shape[1]])
-    out = ref_attention_splitk(T(q), T(k), T(v), attn_bias, scale=scale, split_k=split_k, dtype=dtype)
+    out = ref_attention_splitk(
+        T(q), T(k), T(v), attn_bias, scale=scale, split_k=split_k, dtype=dtype
+    )
     out = out.reshape([q.shape[0], q.shape[2], q.shape[1], v.shape[3]])
     return out.permute((0, 2, 1, 3))
 
 
-def ref_attention_splitk(q, k, v, attn_bias, scale=None, split_k=2, dtype=None) -> torch.Tensor:
+def ref_attention_splitk(
+    q, k, v, attn_bias, scale=None, split_k=2, dtype=None
+) -> torch.Tensor:
     if q.ndim == 5:
+
         def attn_bias_group(group: int):
             if isinstance(attn_bias, torch.Tensor):
                 return attn_bias[:, group]
@@ -322,7 +330,12 @@ def attn_bias_group(group: int):
         return torch.stack(
             [
                 ref_attention_splitk_bmhk(
-                    q[:, :, g], k[:, :, g], v[:, :, g], attn_bias=attn_bias_group(g), split_k=split_k, dtype=dtype
+                    q[:, :, g],
+                    k[:, :, g],
+                    v[:, :, g],
+                    attn_bias=attn_bias_group(g),
+                    split_k=split_k,
+                    dtype=dtype,
                 )
                 for g in range(q.shape[2])
             ],
@@ -330,7 +343,9 @@ def attn_bias_group(group: int):
         )
 
     if q.ndim == 4:
-        return ref_attention_splitk_bmhk(q, k, v, attn_bias=attn_bias, split_k=split_k, dtype=dtype)
+        return ref_attention_splitk_bmhk(
+            q, k, v, attn_bias=attn_bias, split_k=split_k, dtype=dtype
+        )
     assert q.ndim == 3
     if dtype is None:
         dtype = torch.float32
@@ -339,7 +354,7 @@ def attn_bias_group(group: int):
     v = v.to(dtype=dtype)
 
     if scale is None:
-        scale = q.shape[-1] ** -.5
+        scale = q.shape[-1] ** -0.5
     assert not q.isnan().any()
     q = q * scale
     assert not q.isnan().any()
@@ -361,15 +376,17 @@ def attn_bias_group(group: int):
             )
 
     split_size = k.size(-2) // split_k
-    split_config = { "dim": -2, "split_size_or_sections": split_size}
+    split_config = {"dim": -2, "split_size_or_sections": split_size}
     k_split = torch.split(k, **split_config)
     v_split = torch.split(v, **split_config)
-    attn_bias_split = torch.split(attn_bias_tensor, dim=-1, split_size_or_sections=split_size)
-    
+    attn_bias_split = torch.split(
+        attn_bias_tensor, dim=-1, split_size_or_sections=split_size
+    )
+
     def compute_attention_split(q_whole, k_slice, v_slice, attn_bias_slice):
         p_slice = q_whole @ k_slice.transpose(-2, -1)
         p_slice += attn_bias_slice
-        m = torch.max(p_slice, dim = -1, keepdim=True).values
+        m = torch.max(p_slice, dim=-1, keepdim=True).values
         p_slice_scaled = p_slice - m
         p_slice_scaled[p_slice_scaled.isnan()] = float("-inf")
         s = torch.exp(p_slice_scaled)
@@ -378,13 +395,12 @@ def compute_attention_split(q_whole, k_slice, v_slice, attn_bias_slice):
         return {
             "attn_slice": attn_slice,
             "row_max": m,
-            "row_lse": l, 
+            "row_lse": l,
         }
-    
+
     splits = list(zip(k_split, v_split, attn_bias_split))
 
-    slices = list(map(lambda s: compute_attention_split(q, s[0], s[1], s[2]), 
-        splits))
+    slices = list(map(lambda s: compute_attention_split(q, s[0], s[1], s[2]), splits))
     out = torch.zeros_like(q)
 
     # reduce out over split-k slices
@@ -399,11 +415,11 @@ def compute_attention_split(q_whole, k_slice, v_slice, attn_bias_slice):
 
         log_alpha = -torch.abs(local_max - global_max)
         alpha = torch.exp(log_alpha)
-        alpha.nan_to_num_(1.)
+        alpha.nan_to_num_(1.0)
 
         pick_new = local_max < global_max
-        new_coef = torch.where(pick_new, alpha, 1.)
-        curr_coef = torch.where(pick_new, 1., alpha)
+        new_coef = torch.where(pick_new, alpha, 1.0)
+        curr_coef = torch.where(pick_new, 1.0, alpha)
 
         out = out * curr_coef + local_out * new_coef
         global_sumexp = global_sumexp * curr_coef + local_sumexp * new_coef
@@ -634,7 +650,9 @@ def test_key_query_all_ones(dtype, q_len, kv_len, batch_size, k_len):
     key = torch.ones((batch_size, kv_len, k_len), device=device, dtype=dtype)
     value = torch.randn((batch_size, kv_len, k_len), device=device, dtype=dtype) * scale
 
-    out = xformers.ops.memory_efficient_attention(query, key, value, op=(fmha.ck.FwOp, None))
+    out = xformers.ops.memory_efficient_attention(
+        query, key, value, op=(fmha.ck.FwOp, None)
+    )
     # this should be equivalent to the average over value
     ref = value.mean(1, keepdim=True).expand_as(query)
 
@@ -643,6 +661,7 @@ def test_key_query_all_ones(dtype, q_len, kv_len, batch_size, k_len):
     else:
         assert_allclose(out, ref, atol=1e-2)
 
+
 def _block_diag_reshape_lse(
     lse: torch.Tensor, q_seqinfo: fmha.attn_bias._SeqLenInfo
 ) -> torch.Tensor:
@@ -750,16 +769,22 @@ def test_backward(
 
     ## ToDo: reopen bfloat16 for testing
     if dtype is torch.bfloat16:
-        pytest.skip("Temporarily disabled bfloat16 as we are still improving the accuracy of the results")
+        pytest.skip(
+            "Temporarily disabled bfloat16 as we are still improving the accuracy of the results"
+        )
 
     if k > 128 or kv > 128:
-        pytest.skip("head-dim length bigger than 128 is not supported by CK-FlashAttention")
+        pytest.skip(
+            "head-dim length bigger than 128 is not supported by CK-FlashAttention"
+        )
 
     if k % 2 != 0:
-        pytest.skip("head-dim length must be an even value for CK-FlashAttention")    
+        pytest.skip("head-dim length must be an even value for CK-FlashAttention")
 
     if grad_out_contiguous is False:
-        pytest.skip("CK-FlashAttention requires grad_out and out have same lengths/strides")
+        pytest.skip(
+            "CK-FlashAttention requires grad_out and out have same lengths/strides"
+        )
 
     attn_bias_requires_grad = (
         random.Random(q_len + kv_len * batch_size).randint(0, 1) > 0
@@ -913,13 +938,14 @@ def _vec_binom_test(x, n, p):
     pval = np.minimum(1.0, pval)
     return pval
 
+
 def _get_drop_mask(op, batch_size, q_len, kv_len, p, device):
     if op == fmha.ck.FwOp:
         mask = torch.empty((batch_size, 1, q_len, kv_len), device=device)
         ## rand_uniform is an int32 tensor
         rand_uniform = torch.ops.xformers._ck_rand_uniform(p, mask)
         ##mask = (rand_uniform <= int((1.0-p)*65535.0)).to(torch.float32)
-        mask = (rand_uniform <= int((1.0-p)*255.0)).to(torch.float32)
+        mask = (rand_uniform <= int((1.0 - p) * 255.0)).to(torch.float32)
         mask = mask.reshape(batch_size, q_len, kv_len)
     else:
         mask = torch.empty((batch_size, q_len, kv_len), device=device)
@@ -927,6 +953,7 @@ def _get_drop_mask(op, batch_size, q_len, kv_len, p, device):
 
     return mask
 
+
 @cuda_only
 @pytest.mark.parametrize("attn_bias", [None, fmha.attn_bias.LowerTriangularMask()])
 @pytest.mark.parametrize("seed", [42, 124])
@@ -941,7 +968,7 @@ def test_dropout(dtype, op, q_len, kv_len, batch_size, k_len, p, seed, attn_bias
     from scipy.stats import binomtest
 
     device = "cuda"
-    scale = 0.05 
+    scale = 0.05
     query = torch.randn((batch_size, q_len, k_len), device=device, dtype=dtype) * scale
     key = torch.randn((batch_size, kv_len, k_len), device=device, dtype=dtype) * scale
     value = torch.randn((batch_size, kv_len, k_len), device=device, dtype=dtype) * scale
@@ -966,7 +993,9 @@ def test_dropout(dtype, op, q_len, kv_len, batch_size, k_len, p, seed, attn_bias
     torch.manual_seed(seed)
     mask = _get_drop_mask(op, batch_size, q_len, kv_len, p, device)
     ref = ref_attention(query, key, value, attn_bias, mask, p)
-    assert_allclose(out.float(), ref, atol=3e-3, rtol=5e-4), f"{(out - ref).abs().max()}"
+    assert_allclose(
+        out.float(), ref, atol=3e-3, rtol=5e-4
+    ), f"{(out - ref).abs().max()}"
 
     num_trials = 1000
     p_val_tol = 1e-6
@@ -989,7 +1018,7 @@ def _test_dropout_backward(q_len, kv_len, batch_size, k, p, op, dtype):
     if not op.is_available():
         pytest.skip()
 
-    scale = 3 
+    scale = 3
     device = "cuda"
     query = torch.randn((batch_size, q_len, k), device=device, dtype=dtype) * scale
     key = torch.randn((batch_size, kv_len, k), device=device, dtype=dtype) * scale
@@ -1415,6 +1444,7 @@ def test_unsupported_stride_alignment(op: Type[fmha.AttentionFwOpBase]):
         q = q.contiguous()
         fmha.memory_efficient_attention(q, q, q, op=(op, None))
 
+
 def test_attn_bias_causal() -> None:
     m = -math.inf
     causal_mask = torch.tensor([[0, m], [0, 0], [0, 0]])
@@ -1643,6 +1673,7 @@ def _kv_heads_label(kv_heads: Optional[int]) -> str:
         return "mq"
     return f"gqa{kv_heads}"
 
+
 @pytest.mark.parametrize("dtype", ["f32"])
 @pytest.mark.parametrize("kv_heads", [None, 1, 2], ids=_kv_heads_label)
 @pytest.mark.parametrize("n_heads", [16])
@@ -1752,12 +1783,10 @@ def test_decoder(
         kv_padding=padding,
     )
     inp = fmha.Inputs(q, k, v, attn_bias=attn_bias)
-    if (not_supported_reasons := op.not_supported_reasons(inp)):
+    if not_supported_reasons := op.not_supported_reasons(inp):
         pytest.skip(f"{not_supported_reasons=}")
 
-    decoder_output = fmha.memory_efficient_attention_forward(
-        q, k, v, attn_bias, op=op
-    )
+    decoder_output = fmha.memory_efficient_attention_forward(q, k, v, attn_bias, op=op)
 
     ref_output = ref_attention(q, k, v, attn_bias)
 
@@ -1769,7 +1798,9 @@ def test_decoder(
     )
 
 
-@pytest.mark.parametrize("op", [fmha.ck_splitk.FwOp_S1, fmha.ck_splitk.FwOp_S2, fmha.ck_splitk.FwOp_S4])
+@pytest.mark.parametrize(
+    "op", [fmha.ck_splitk.FwOp_S1, fmha.ck_splitk.FwOp_S2, fmha.ck_splitk.FwOp_S4]
+)
 @pytest.mark.parametrize("dtype", ["f32"])
 @pytest.mark.parametrize("kv_heads", [None, 1, 2], ids=_kv_heads_label)
 @pytest.mark.parametrize("n_heads", [16])
@@ -1782,7 +1813,7 @@ def test_splitk_decoder(
     padding: int,
     bsz: int,
     dtype: str,
-    d: int
+    d: int,
 ) -> None:
     # no quantized impl compared to cuda
     test_decoder(
@@ -1826,7 +1857,9 @@ def test_attn_bias_blockdiag_doc() -> None:
     linear = torch.nn.Linear(K, K * 3).to(device=device, dtype=dtype)  # type: ignore
 
     q, k, v = linear(x).reshape([1, -1, 1, 3, K]).unbind(-2)
-    out = fmha.memory_efficient_attention(q, k, v, attn_bias=attn_bias, op=(fmha.ck.FwOp, None))
+    out = fmha.memory_efficient_attention(
+        q, k, v, attn_bias=attn_bias, op=(fmha.ck.FwOp, None)
+    )
     list_out = attn_bias.split(out)
     assert tuple(list_out[0].shape) == (1, 3, 1, K)
 
@@ -2072,7 +2105,8 @@ def test_forward_gqa_one_group(opFW):
         rtol=opFW.ERROR_RTOL.get(dtype, 1e-5),
     )
 
-'''
+
+"""
 @sm80_or_better_only
 def test_flash_gqa_wrong_strides() -> None:
     op = (fmha.flash.FwOp, None)
@@ -2098,7 +2132,8 @@ def test_flash_gqa_wrong_strides() -> None:
         :, :, :, :, :K
     ]
     fmha.memory_efficient_attention(q, kv, kv, op=op)
-'''
+"""
+
 
 def _dispatches_to_splitK(q, kv):
     return (
diff --git a/tests/test_mqa_forward_ck_tiled_discarded.py b/tests/test_mqa_forward_ck_tiled_discarded.py
index fc91f0dcc..a1823dfd6 100644
--- a/tests/test_mqa_forward_ck_tiled_discarded.py
+++ b/tests/test_mqa_forward_ck_tiled_discarded.py
@@ -38,7 +38,10 @@
 ck_check_op = get_xformers_operator("is_ck_tiled_used")
 use_ck_tiled = ck_check_op()
 
-def ref_attention(q, k, v, attn_bias=None, drop_mask=None, p=0.0, scale=None, dtype=None):
+
+def ref_attention(
+    q, k, v, attn_bias=None, drop_mask=None, p=0.0, scale=None, dtype=None
+):
     if q.ndim == 4:
         B, M, Hq, K = q.shape
         _, N, Hkv, Kv = v.shape
@@ -47,13 +50,13 @@ def ref_attention(q, k, v, attn_bias=None, drop_mask=None, p=0.0, scale=None, dt
         def attn_bias_head(head: int):
             if isinstance(attn_bias, torch.Tensor):
                 assert attn_bias.ndim == 4
-                _, H, _, _ = attn_bias.shape        
+                _, H, _, _ = attn_bias.shape
                 assert H == Hq
                 bias_bghmn = attn_bias.reshape(B, Hkv, nhead_ratio_qk, M, N)
                 return bias_bghmn[:, :, head]
             if isinstance(attn_bias, fmha.attn_bias.LowerTriangularMaskWithTensorBias):
                 assert attn_bias._bias.ndim == 4
-                _, H, _, _ = attn_bias._bias.shape        
+                _, H, _, _ = attn_bias._bias.shape
                 assert H == Hq
                 bias_bghmn = attn_bias._bias.reshape(B, Hkv, nhead_ratio_qk, M, N)
 
@@ -73,7 +76,7 @@ def attn_bias_head(head: int):
             ],
             dim=3,
         ).reshape((B, M, Hq, Kv))
-     
+
     assert q.ndim == 3
     if dtype is None:
         dtype = torch.float32
@@ -125,24 +128,27 @@ def T(t):
     out = out.reshape([q.shape[0], q.shape[2], q.shape[1], v.shape[3]])
     return out.permute((0, 2, 1, 3))
 
+
 @pytest.mark.parametrize("hdim_k,hdim_v", [(64, 64), (128, 128)])
 @pytest.mark.parametrize("nhead_q,nhead_kv", [(8, 1), (8, 2), (12, 4), (4, 4)])
 @pytest.mark.parametrize("seqlen_q,seqlen_kv", [(100, 128), (128, 100), (200, 1000)])
 @pytest.mark.parametrize("batches", [100, 64, 1])
 @pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16])
-@pytest.mark.parametrize("attn_bias_type", [type(None), torch.Tensor, fmha.attn_bias.LowerTriangularMask])
+@pytest.mark.parametrize(
+    "attn_bias_type", [type(None), torch.Tensor, fmha.attn_bias.LowerTriangularMask]
+)
 @pytest.mark.parametrize("op", ALL_FW_OPS)
 def test_mqa_forward(
     op,
     attn_bias_type,
-    dtype, 
-    batches: int, 
-    seqlen_kv: int, 
-    seqlen_q: int, 
-    nhead_kv: int, 
-    nhead_q: int, 
-    hdim_v: int, 
-    hdim_k: int, 
+    dtype,
+    batches: int,
+    seqlen_kv: int,
+    seqlen_q: int,
+    nhead_kv: int,
+    nhead_q: int,
+    hdim_v: int,
+    hdim_k: int,
 ):
     B = batches
     M = seqlen_q
@@ -158,7 +164,7 @@ def test_mqa_forward(
     if not use_ck_tiled:
         pytest.skip("mqa/gqa is only supported with ck-tiled")
 
-    torch.manual_seed(B * M + N * K + Hq*Hkv + Kv)
+    torch.manual_seed(B * M + N * K + Hq * Hkv + Kv)
 
     scale = 3
     query = torch.randn((B, M, Hq, K), device=device, dtype=dtype).mul_(scale)
@@ -208,4 +214,3 @@ def test_mqa_forward(
         atol=op.ERROR_ATOL[dtype],
         rtol=op.ERROR_RTOL.get(dtype, 1e-5),
     )
-
diff --git a/tests/test_sparse_tensors.py b/tests/test_sparse_tensors.py
index e32cb8b37..21246c175 100644
--- a/tests/test_sparse_tensors.py
+++ b/tests/test_sparse_tensors.py
@@ -15,7 +15,10 @@
 _devices = ["cpu", "cuda:0"] if torch.cuda.is_available() else ["cpu"]
 _tensor_types = [BlockSparseTensor, SparseCSRTensor]
 
-disable_on_rocm = pytest.mark.skipif(not not torch.version.hip, reason="could not be done on ROCM")
+disable_on_rocm = pytest.mark.skipif(
+    not not torch.version.hip, reason="could not be done on ROCM"
+)
+
 
 def _create_blocksparse_tensor(
     device, block_size=32, Z=8, C=2, H=64, W=64, dtype=torch.float32
@@ -101,6 +104,7 @@ def test_sparse_binary_ops(func, device):
 
     assert torch.allclose(res, res_gt)
 
+
 @disable_on_rocm
 @pytest.mark.parametrize("tensor_type", _tensor_types)
 @pytest.mark.parametrize("device", _devices)
@@ -153,6 +157,7 @@ def test_masked_matmul(tensor_type, device):
     assert torch.allclose(a.grad, aa.grad, atol=atol)
     assert torch.allclose(b.grad, bb.grad, atol=atol)
 
+
 @disable_on_rocm
 @pytest.mark.parametrize("tensor_type", _tensor_types)
 @pytest.mark.parametrize("device", _devices)
@@ -202,6 +207,7 @@ def test_bmm(tensor_type, device):
         a_grad, a_sparse.grad.to_dense(), atol=atol
     ), f"{torch.max(torch.abs(a_grad-a_sparse.grad.to_dense()))}"
 
+
 @disable_on_rocm
 @pytest.mark.parametrize("tensor_type", _tensor_types)
 @pytest.mark.parametrize("device", _devices)
diff --git a/tests/test_swiglu.py b/tests/test_swiglu.py
index 78112a6ed..97468a6a2 100644
--- a/tests/test_swiglu.py
+++ b/tests/test_swiglu.py
@@ -24,7 +24,10 @@
     _is_sm80 = False
 sm80_only = pytest.mark.skipif(not _is_sm80, reason="requires sm80")
 
-disable_on_rocm = pytest.mark.skipif(not not torch.version.hip, reason="could not be done on ROCM")
+disable_on_rocm = pytest.mark.skipif(
+    not not torch.version.hip, reason="could not be done on ROCM"
+)
+
 
 def assert_allclose(
     # The output of the tested function
@@ -113,6 +116,7 @@ def generate_test_shapes():
 def create_module_cached(**kwargs) -> xsw.SwiGLU:
     return xsw.SwiGLU(**kwargs)
 
+
 @disable_on_rocm
 @pytest.mark.parametrize("autocast", [False, True], ids=["regular", "autocast"])
 @pytest.mark.parametrize("op", _ops, ids=[x.NAME for x in _ops])
diff --git a/tests/test_triton_blocksparse.py b/tests/test_triton_blocksparse.py
index 5bf19aa97..8d8330f04 100644
--- a/tests/test_triton_blocksparse.py
+++ b/tests/test_triton_blocksparse.py
@@ -14,7 +14,10 @@
 from xformers.components.attention.attention_patterns import block_sparsify_tensor
 from xformers.triton.utils import get_current_cuda_device
 
-disable_on_rocm = pytest.mark.skipif(not not torch.version.hip, reason="could not be done on ROCM")
+disable_on_rocm = pytest.mark.skipif(
+    not not torch.version.hip, reason="could not be done on ROCM"
+)
+
 
 def catch_oor(fn):
     @functools.wraps(fn)
@@ -63,6 +66,7 @@ def mask_tensor(x, mask, block, value=0):
         ret[:, h, i * block : (i + 1) * block, j * block : (j + 1) * block] = value
     return ret
 
+
 @disable_on_rocm
 @pytest.mark.skipif(not _triton_available, reason="Triton requires a recent CUDA gpu")
 @pytest.mark.skipif(
@@ -118,6 +122,7 @@ def test_matmul(MODE, TRANS_A, TRANS_B, BLOCK, DTYPE, Z=32, H=2, M=512, N=384, K
     # compare
     torch.testing.assert_close(rc, tc)
 
+
 @disable_on_rocm
 @pytest.mark.skipif(not _triton_available, reason="Triton requires a recent CUDA gpu")
 @pytest.mark.parametrize("BLOCK", [32, 128])
@@ -148,6 +153,7 @@ def test_softmax(BLOCK, WIDTH, DTYPE):
     # compare
     torch.testing.assert_close(ry, ty)
 
+
 @disable_on_rocm
 @pytest.mark.skipif(not _triton_available, reason="Triton requires a recent CUDA gpu")
 @pytest.mark.parametrize("block", [32, 43, 128])  # 16, 32,
@@ -221,6 +227,7 @@ def loss_fn(x):
                 msg=f"Triton grad {torch.norm(g1).item()} and torch grad {torch.norm(g2).item()}",
             )
 
+
 @disable_on_rocm
 @pytest.mark.skipif(not _triton_available, reason="Triton requires a recent CUDA gpu")
 @pytest.mark.parametrize("dtype", [torch.float16])
diff --git a/tests/test_triton_layernorm.py b/tests/test_triton_layernorm.py
index 3946061ee..c7a8e06b4 100644
--- a/tests/test_triton_layernorm.py
+++ b/tests/test_triton_layernorm.py
@@ -12,7 +12,9 @@
 
 import xformers
 
-disable_on_rocm = pytest.mark.skipif(not not torch.version.hip, reason="could not be done on ROCM")
+disable_on_rocm = pytest.mark.skipif(
+    not not torch.version.hip, reason="could not be done on ROCM"
+)
 
 try:
     from xformers.triton import FusedLayerNorm
@@ -36,6 +38,7 @@
     (1, 2048, 12288),
 ]
 
+
 @disable_on_rocm
 @pytest.mark.skipif(not _triton_available, reason="Triton is not available")
 @pytest.mark.skipif(
@@ -104,6 +107,7 @@ def test_layernorm_parity(shape, amp):
             + f" {torch.norm(triton_layernorm.bias.grad)}"
         )
 
+
 @disable_on_rocm
 @pytest.mark.skipif(not _triton_available, reason="Triton is not available")
 @pytest.mark.parametrize("dtype", [torch.float16, torch.float32, torch.bfloat16])
diff --git a/xformers/benchmarks/benchmark_attn_decoding.py b/xformers/benchmarks/benchmark_attn_decoding.py
index e1298592c..31883008b 100644
--- a/xformers/benchmarks/benchmark_attn_decoding.py
+++ b/xformers/benchmarks/benchmark_attn_decoding.py
@@ -18,7 +18,8 @@
 
 CASES = [
     dict(B=max(1, 2 ** (16 - i)), Mq=1, Mkv=2**i, Hq=16, Hkv=hkv, K=128)
-    for i in range(8, 18) for hkv in (1, 2)
+    for i in range(8, 18)
+    for hkv in (1, 2)
 ]
 
 
@@ -110,7 +111,7 @@ class AttentionDecodingSplitKV(AttentionDecodingFlashDecoding):
 
 class AttentionDecodingCKSplitKV(AttentionDecodingFlashDecoding):
     OP = xops.fmha.ck_splitk.FwOp
- 
+
 
 class AttentionDecodingPyTorchRepeat(AttentionDecodingFlashDecoding):
     def fw(self) -> None:
diff --git a/xformers/benchmarks/benchmark_mem_eff_attn_decoder.py b/xformers/benchmarks/benchmark_mem_eff_attn_decoder.py
index 9fa58e7dd..7616d702d 100644
--- a/xformers/benchmarks/benchmark_mem_eff_attn_decoder.py
+++ b/xformers/benchmarks/benchmark_mem_eff_attn_decoder.py
@@ -60,7 +60,9 @@ def T(t):
 
 OPS = [
     xformers.ops.fmha.cutlass.FwOp if torch.version.cuda else xformers.ops.fmha.ck.FwOp,
-    xformers.ops.fmha.decoder.FwOp if torch.version.cuda else xformers.ops.fmha.ck_decoder.FwOp,
+    xformers.ops.fmha.decoder.FwOp
+    if torch.version.cuda
+    else xformers.ops.fmha.ck_decoder.FwOp,
 ]
 
 KV_SHAPES = [
diff --git a/xformers/benchmarks/benchmark_mem_eff_atttention_mqa.py b/xformers/benchmarks/benchmark_mem_eff_atttention_mqa.py
index 14e1700bd..ae6f11b15 100644
--- a/xformers/benchmarks/benchmark_mem_eff_atttention_mqa.py
+++ b/xformers/benchmarks/benchmark_mem_eff_atttention_mqa.py
@@ -19,7 +19,9 @@
 torch.backends.cuda.matmul.allow_tf32 = False
 
 ## this interface assumes the tensor is in BMHK, but q and k/v might has different number of heads
-def ref_attention_mqa(q, k, v, attn_bias=None, drop_mask=None, p=0.0, scale=None, dtype=None):
+def ref_attention_mqa(
+    q, k, v, attn_bias=None, drop_mask=None, p=0.0, scale=None, dtype=None
+):
     if q.ndim == 4:
         B, M, Hq, K = q.shape
         _, N, Hkv, Kv = v.shape
@@ -87,6 +89,7 @@ def attn_bias_head(head: int):
         attn = attn * (drop_mask / (1 - p))
     return attn @ v
 
+
 ## ref_attention_bmhk is completely the same as used by test_forward_ck_tiled.py
 def ref_attention_bmhk(q, k, v, attn_bias, scale=None, dtype=None) -> torch.Tensor:
     assert q.ndim == 4
@@ -106,6 +109,7 @@ def T(t):
     out = out.reshape([q.shape[0], q.shape[2], q.shape[1], v.shape[3]])
     return out.permute((0, 2, 1, 3))
 
+
 min_run_time = 0.5
 device = torch.device("cuda")
 
@@ -123,7 +127,7 @@ def T(t):
     ##*sorted(itertools.product([1, 2], [2048, 4096], [2048, 4096], [4, 8], [1, 2], [128])),
     ##*sorted(
     ##    itertools.product([16], [128, 512], [512, 1024], [16], [2, 4], [64, 128])
-    #),
+    # ),
 ]
 
 OPS = [
@@ -168,11 +172,18 @@ def product_dict(**kwargs):
 
 def create_tensors(shape, dtype, requires_grad=False):
     B, M, N, Hq, Hkv, K = shape
-    q = torch.rand([B, M, Hq, K], device=device, dtype=dtype, requires_grad=requires_grad)
-    k = torch.rand([B, N, Hkv, K], device=device, dtype=dtype, requires_grad=requires_grad)
-    v = torch.rand([B, N, Hkv, K], device=device, dtype=dtype, requires_grad=requires_grad)
+    q = torch.rand(
+        [B, M, Hq, K], device=device, dtype=dtype, requires_grad=requires_grad
+    )
+    k = torch.rand(
+        [B, N, Hkv, K], device=device, dtype=dtype, requires_grad=requires_grad
+    )
+    v = torch.rand(
+        [B, N, Hkv, K], device=device, dtype=dtype, requires_grad=requires_grad
+    )
     return q, k, v
 
+
 def mem_eff_attention_fw(shape, num_threads: int, attn_bias_type, dropout_p, dtype):
     B, M, N, Hq, Hkv, K = shape
     nhead_ratio_qk = Hq // Hkv
@@ -245,4 +256,5 @@ def mem_eff_attention_fw(shape, num_threads: int, attn_bias_type, dropout_p, dty
         num_threads=num_threads,
     )
 
+
 benchmark_main_helper(mem_eff_attention_fw, CASES, min_run_time=min_run_time)
diff --git a/xformers/benchmarks/utils.py b/xformers/benchmarks/utils.py
index 5e18a84ef..0c94df1b6 100644
--- a/xformers/benchmarks/utils.py
+++ b/xformers/benchmarks/utils.py
@@ -662,8 +662,12 @@ def matches_current(r):
             results, reference=results_compare_to, atol_s=atol_s, rtol=rtol
         )
 
+
 def _is_oom_error(e):
-    return isinstance(e, (torch.cuda.OutOfMemoryError, triton.runtime.autotuner.OutOfResources))
+    return isinstance(
+        e, (torch.cuda.OutOfMemoryError, triton.runtime.autotuner.OutOfResources)
+    )
+
 
 def _fail_if_regressions(
     results: List[Any], reference: List[Any], atol_s: float, rtol: float
diff --git a/xformers/ops/common.py b/xformers/ops/common.py
index 2dad20691..e24b0dda5 100644
--- a/xformers/ops/common.py
+++ b/xformers/ops/common.py
@@ -38,7 +38,10 @@ class BaseOperator:
     @classmethod
     def is_available(cls) -> bool:
         # cls.OPERATOR can be either a kernel or a Triton Autotuner object, which doesn't have __name__
-        if cls.OPERATOR is None or getattr(cls.OPERATOR, "__name__", "") == "no_such_operator":
+        if (
+            cls.OPERATOR is None
+            or getattr(cls.OPERATOR, "__name__", "") == "no_such_operator"
+        ):
             return False
         return True
 
diff --git a/xformers/ops/fmha/__init__.py b/xformers/ops/fmha/__init__.py
index b1da96542..15712fe47 100644
--- a/xformers/ops/fmha/__init__.py
+++ b/xformers/ops/fmha/__init__.py
@@ -42,7 +42,8 @@
 TritonFlashAttentionOp = (triton.FwOp, cutlass.BwOp if torch.version.cuda else ck.BwOp)
 MemoryEfficientAttentionCkOp = (ck.FwOp, ck.BwOp)
 MemoryEfficientAttentionCkDecoderOp = (ck_decoder.FwOp, ck.BwOp)
-MemoryEfficientAttentionSplitKCkOp = (ck_splitk.FwOp, ck.BwOp) 
+MemoryEfficientAttentionSplitKCkOp = (ck_splitk.FwOp, ck.BwOp)
+
 
 class _fMHA(torch.autograd.Function):
     @staticmethod
diff --git a/xformers/ops/fmha/ck.py b/xformers/ops/fmha/ck.py
index 268b0dd1f..e6750e88e 100644
--- a/xformers/ops/fmha/ck.py
+++ b/xformers/ops/fmha/ck.py
@@ -39,6 +39,7 @@
 def _minimum_gemm_alignment(inp: Inputs) -> int:
     return 1
 
+
 def _get_seqlen_info(
     inp: Inputs,
 ) -> Tuple[Optional[torch.Tensor], Optional[torch.Tensor], int, int]:
@@ -58,7 +59,11 @@ def _get_seqlen_info(
         max_seqlen_q = -1
         ##max_seqlen_k = -1
 
-    return seqstart_k, seqstart_q, max_seqlen_q,
+    return (
+        seqstart_k,
+        seqstart_q,
+        max_seqlen_q,
+    )
 
 
 def _get_tensor_bias(
@@ -98,20 +103,22 @@ def _check_bias_alignment(
                 "you should call `.contiguous()` on the bias"
             )
 
+
 def _check_large_shapes(reasons: List[str], inp: Inputs) -> None:
     """CK kernel throws "Memory access fault by GPU node-2" when B * T >= 2**20, might be some index overflow.
     To reproduce, remove this function and run benchmark_mem_eff_attention with ParlAI model shape (256, 4096, 16, 64).
     This needs further debugging, for now let's not support such shapes.
     """
-    b_t_limit = 1024 ** 2
-    q_too_large = inp.query.shape[0] *  inp.query.shape[1] >= b_t_limit
-    k_too_large = inp.key.shape[0] *  inp.key.shape[1] >= b_t_limit
-    v_too_large = inp.value.shape[0] *  inp.value.shape[1] >= b_t_limit
+    b_t_limit = 1024**2
+    q_too_large = inp.query.shape[0] * inp.query.shape[1] >= b_t_limit
+    k_too_large = inp.key.shape[0] * inp.key.shape[1] >= b_t_limit
+    v_too_large = inp.value.shape[0] * inp.value.shape[1] >= b_t_limit
     if q_too_large or k_too_large or v_too_large:
         reasons.append(
             "Input is too large: product of first two dimensions of q/k/v must be < 2**20"
         )
 
+
 class _CustomMaskType(int, Enum):
     """
     (Matches CustomMaskType in C++.)
@@ -145,6 +152,7 @@ def _custom_mask_type(bias: Optional[Union[torch.Tensor, AttentionBias]]) -> int
         return int(_CustomMaskType.CausalFromBottomRight)
     return int(_CustomMaskType.NoCustomMask)
 
+
 # checking the availability of ck-tiled is necessary since ck-tiled does not
 # have the same functionalities as old-CK
 def is_ck_tiled() -> bool:
@@ -152,17 +160,17 @@ def is_ck_tiled() -> bool:
     ck_check_op = get_xformers_operator("is_ck_tiled_used")
     return ck_check_op()
 
+
 @register_operator
 class FwOp(AttentionFwOpBase):
-    """xFormers' MHA kernel based on Composable Kernel.
-    """
+    """xFormers' MHA kernel based on Composable Kernel."""
 
     OPERATOR = get_xformers_operator("efficient_attention_forward_ck")
     SUPPORTED_DEVICES: Set[str] = {"cuda"}
     SUPPORTED_DTYPES: Set[torch.dtype] = {torch.half, torch.bfloat16}
-    SUPPORTED_MAX_K = 256 
+    SUPPORTED_MAX_K = 256
 
-    if is_ck_tiled(): 
+    if is_ck_tiled():
         SUPPORTED_ATTN_BIAS_TYPES: Set[Any] = {
             type(None),
             torch.Tensor,
@@ -187,7 +195,7 @@ class FwOp(AttentionFwOpBase):
             BlockDiagonalCausalMask,
             BlockDiagonalCausalWithOffsetPaddedKeysMask,
             attn_bias.BlockDiagonalCausalFromBottomRightMask,
-         }
+        }
 
     SUPPORTS_DROPOUT = False if is_ck_tiled() else True
     SUPPORTS_CUSTOM_SCALE = True
@@ -286,7 +294,11 @@ def apply_bmhk(
             raise NotImplementedError("Unsupported attn_bias type")
         seqstart_k, seqstart_q, max_seqlen_q = _get_seqlen_info(inp)
         if isinstance(inp.attn_bias, BlockDiagonalCausalWithOffsetPaddedKeysMask):
-            seqlen_k=inp.attn_bias.k_seqinfo.seqlen if is_ck_tiled() else inp.attn_bias.k_seqinfo.seqlen.to(torch.device("cpu"))
+            seqlen_k = (
+                inp.attn_bias.k_seqinfo.seqlen
+                if is_ck_tiled()
+                else inp.attn_bias.k_seqinfo.seqlen.to(torch.device("cpu"))
+            )
         out, lse, rng_seed, rng_offset = cls.OPERATOR(
             query=inp.query,
             key=inp.key,
@@ -338,7 +350,9 @@ def not_supported_reasons(cls, d: Inputs) -> List[str]:
         check_lastdim_alignment_stride1(reasons, "value", d.value, matmul_alignment_mn)
         _check_bias_alignment(reasons, d.attn_bias)
         _check_large_shapes(reasons, d)
-        requires_grad = d.query.requires_grad or d.key.requires_grad or d.value.requires_grad
+        requires_grad = (
+            d.query.requires_grad or d.key.requires_grad or d.value.requires_grad
+        )
         if is_ck_tiled() and requires_grad:
             reasons.append("Gradience is currently not supported by ck-tiled!")
         return reasons
@@ -449,7 +463,11 @@ def apply(cls, ctx: Context, inp: Inputs, grad: torch.Tensor) -> Gradients:
         dtype = inp.query.dtype
 
         if isinstance(inp.attn_bias, BlockDiagonalCausalWithOffsetPaddedKeysMask):
-            seqlen_k=inp.attn_bias.k_seqinfo.seqlen if is_ck_tiled() else inp.attn_bias.k_seqinfo.seqlen.to(torch.device("cpu"))
+            seqlen_k = (
+                inp.attn_bias.k_seqinfo.seqlen
+                if is_ck_tiled()
+                else inp.attn_bias.k_seqinfo.seqlen.to(torch.device("cpu"))
+            )
 
         rng_seed = rng_offset = 0
         if inp.p != 0.0:
@@ -486,7 +504,6 @@ def apply(cls, ctx: Context, inp: Inputs, grad: torch.Tensor) -> Gradients:
             custom_mask_type=_custom_mask_type(inp.attn_bias),
             scale=inp.scale,
         )
-        
 
         # c++/CUDA implementation returns an uninitialized tensor if bias doesn't
         # require grad
diff --git a/xformers/ops/fmha/ck_decoder.py b/xformers/ops/fmha/ck_decoder.py
index 6b1d76f9c..14e6ba09a 100644
--- a/xformers/ops/fmha/ck_decoder.py
+++ b/xformers/ops/fmha/ck_decoder.py
@@ -14,11 +14,15 @@ class FwOp(AttentionFwOpBase):
     An operator optimized for K=256 (so the contiguous dim fits into registers).
     Tested to work on MI250x.
     """
+
     OPERATOR = get_xformers_operator("efficient_attention_forward_decoder_ck")
     SUPPORTED_DEVICES: Set[str] = {"cuda"}
     SUPPORTED_DTYPES: Set[torch.dtype] = {torch.half, torch.bfloat16, torch.float}
     SUPPORTED_MAX_K: int = 256
-    SUPPORTED_ATTN_BIAS_TYPES: Set[Any] = {type(None), BlockDiagonalCausalWithOffsetPaddedKeysMask}
+    SUPPORTED_ATTN_BIAS_TYPES: Set[Any] = {
+        type(None),
+        BlockDiagonalCausalWithOffsetPaddedKeysMask,
+    }
     SUPPORTS_DROPOUT = False
     SUPPORTS_CUSTOM_SCALE = True
     SUPPORTS_BMGHK = True
@@ -31,23 +35,29 @@ def not_supported_reasons(cls, d: Inputs) -> List[str]:
         attn_bias = d.attn_bias
         if isinstance(attn_bias, BlockDiagonalCausalWithOffsetPaddedKeysMask):
             if d.query.shape[0] != 1:
-                reasons.append(f"One formal batch element expected; got {d.query.shape[0]}")
+                reasons.append(
+                    f"One formal batch element expected; got {d.query.shape[0]}"
+                )
 
             if d.query.shape[-1] > cls.SUPPORTED_MAX_K:
-                reasons.append(f"Got head_dim={d.query.shape[-1]}; only head_dim<={cls.SUPPORTED_MAX_K} is supported for now.")
+                reasons.append(
+                    f"Got head_dim={d.query.shape[-1]}; only head_dim<={cls.SUPPORTED_MAX_K} is supported for now."
+                )
 
-            threads_per_warp = 64 # TODO: ideally query the platform here
+            threads_per_warp = 64  # TODO: ideally query the platform here
             required_alignment = 0
             head_dim = d.query.shape[-1]
             for vec_size in (4, 2, 1):
                 if head_dim <= vec_size * threads_per_warp:
                     required_alignment = vec_size
-            
+
             if not required_alignment:
                 reasons.append(f"Got head_dim={head_dim} which is too large")
-            
+
             if head_dim % required_alignment != 0:
-                reasons.append(f"Got head_dim={head_dim}; it needs to be divisible by {required_alignment}")
+                reasons.append(
+                    f"Got head_dim={head_dim}; it needs to be divisible by {required_alignment}"
+                )
 
             if d.key.stride(-1) != 1:
                 reasons.append("expect keys to have last dim contiguous")
@@ -98,7 +108,7 @@ def apply(
             else:
                 key = k[0].unflatten(0, (-1, padding))
                 value = v[0].unflatten(0, (-1, padding))
-            query = q[0].unflatten(0, (key.shape[0], -1))    
+            query = q[0].unflatten(0, (key.shape[0], -1))
         else:
             # key: (B, padding, G, 1 if multiquery else Hkv, D)
             # value: like key
diff --git a/xformers/ops/fmha/ck_splitk.py b/xformers/ops/fmha/ck_splitk.py
index 3dd2fd7c7..63bdb1528 100644
--- a/xformers/ops/fmha/ck_splitk.py
+++ b/xformers/ops/fmha/ck_splitk.py
@@ -14,13 +14,13 @@
 
 @register_operator
 class FwOp(AttentionFwOpBase):
-    
+
     OPERATOR = get_xformers_operator("efficient_attention_forward_decoder_splitk_ck")
     SUPPORTED_DEVICES = {"cuda"}
     SUPPORTED_DTYPES = {
         torch.half,
         torch.bfloat16,
-        torch.float
+        torch.float,
     }  # Those are dtypes of Q. In the quantized case K/V has dtype int32
     SUPPORTED_MAX_K = 256
     SUPPORTED_ATTN_BIAS_TYPES: Set[Any] = {
@@ -105,7 +105,7 @@ def apply(
         attn_bias = inp.attn_bias
         seq_len = None
         q, k, v = inp.get_qkv_in_bmghk()
-        
+
         if attn_bias is not None:
             attn_bias.k_seqinfo.to(k.device)
             attn_bias.q_seqinfo.to(q.device)
@@ -126,7 +126,7 @@ def apply(
             else:
                 key = k[0].unflatten(0, (-1, padding))
                 value = v[0].unflatten(0, (-1, padding))
-            query = q[0].unflatten(0, (key.shape[0], -1))    
+            query = q[0].unflatten(0, (key.shape[0], -1))
         else:
             # key: (B, padding, G, 1 if multiquery else Hkv, D)
             # value: like key
@@ -149,8 +149,15 @@ def apply(
         else:
             qk_scale = torch.rsqrt(torch.tensor(k.shape[-1], dtype=torch.float32))
 
-        out = cls.OPERATOR(query=query, key=key, value=value, seq_positions=seq_positions_gpu, scale=qk_scale, split_k=split_k)
-        
+        out = cls.OPERATOR(
+            query=query,
+            key=key,
+            value=value,
+            seq_positions=seq_positions_gpu,
+            scale=qk_scale,
+            split_k=split_k,
+        )
+
         return out, None
 
 
diff --git a/xformers/ops/fmha/common.py b/xformers/ops/fmha/common.py
index 18ad70be4..de38f6423 100644
--- a/xformers/ops/fmha/common.py
+++ b/xformers/ops/fmha/common.py
@@ -300,7 +300,11 @@ def not_supported_reasons(cls, d: Inputs) -> List[str]:
         dtype = d.query.dtype
         if device_type not in cls.SUPPORTED_DEVICES:
             reasons.append(f"device={device_type} (supported: {cls.SUPPORTED_DEVICES})")
-        if device_type == "cuda" and not _built_with_cuda and (torch.version.hip is None):
+        if (
+            device_type == "cuda"
+            and not _built_with_cuda
+            and (torch.version.hip is None)
+        ):
             reasons.append("xFormers wasn't build with CUDA support")
         if device_type == "cuda":
             device_capability = torch.cuda.get_device_capability(d.device)
diff --git a/xformers/ops/fmha/dispatch.py b/xformers/ops/fmha/dispatch.py
index 0af07b3e9..aaabe5c8c 100644
--- a/xformers/ops/fmha/dispatch.py
+++ b/xformers/ops/fmha/dispatch.py
@@ -81,21 +81,23 @@ def _dispatch_fw_priority_list(
 ) -> Sequence[Type[AttentionFwOpBase]]:
     if torch.version.cuda:
         priority_list_ops = deque(
-           [
-              flash.FwOp,
-              triton.FwOp,
-              cutlass.FwOp,
-              small_k.FwOp,
-           ])
+            [
+                flash.FwOp,
+                triton.FwOp,
+                cutlass.FwOp,
+                small_k.FwOp,
+            ]
+        )
         if _is_cutlass_fwd_faster_than_flash(inp):
             priority_list_ops.remove(cutlass.FwOp)
             priority_list_ops.appendleft(cutlass.FwOp)
     else:
         priority_list_ops = deque(
-           [
-              triton.FwOp,
-              ck.FwOp,
-           ])
+            [
+                triton.FwOp,
+                ck.FwOp,
+            ]
+        )
     if _is_triton_fwd_fastest(inp):
         priority_list_ops.remove(triton.FwOp)
         priority_list_ops.appendleft(triton.FwOp)
@@ -106,7 +108,9 @@ def _dispatch_fw_priority_list(
         if not mqa_or_gqa:
             # With multiquery, cutlass is sometimes faster than decoder
             # but it's not currently clear when.
-            priority_list_ops.appendleft(decoder.FwOp if torch.version.cuda else ck_decoder.FwOp)
+            priority_list_ops.appendleft(
+                decoder.FwOp if torch.version.cuda else ck_decoder.FwOp
+            )
         # Split-KV is useful with MQA
         # for short Q-seqlen / long K-seqlen
         if mqa_or_gqa and inp.query.shape[1] <= 32 and inp.key.shape[1] >= 256:

From 3b33c5d5dfc0957c15d083b698d093b905b91ff0 Mon Sep 17 00:00:00 2001
From: Max Podkorytov <4273004+tenpercent@users.noreply.github.com>
Date: Tue, 6 Feb 2024 02:00:09 +0000
Subject: [PATCH 430/641] fix flake8 suggestions

---
 setup.py                                      |  2 +-
 tests/test_ck_7.py                            | 22 ++++-----
 tests/test_mem_eff_attention.py               | 17 +++----
 tests/test_mem_eff_attention_ck_discarded.py  | 13 ++---
 tests/test_mqa_forward_ck_tiled_discarded.py  | 10 ++--
 .../benchmark_mem_eff_atttention_mqa.py       | 16 ++++---
 xformers/benchmarks/utils.py                  | 47 -------------------
 xformers/ops/fmha/ck.py                       | 14 +++---
 xformers/ops/fmha/ck_splitk.py                |  1 -
 9 files changed, 44 insertions(+), 98 deletions(-)

diff --git a/setup.py b/setup.py
index 59867a805..14462cf74 100644
--- a/setup.py
+++ b/setup.py
@@ -240,7 +240,7 @@ def get_extensions():
         os.path.join(extensions_dir, "swiglu", "**", "*.cpp"), recursive=True
     )
 
-    ## avoid the temporary .cu file under xformers/csrc/attention/hip_fmha are included
+    # avoid the temporary .cu file under xformers/csrc/attention/hip_fmha are included
     source_cuda = glob.glob(os.path.join(extensions_dir, "*.cu"), recursive=False)
     source_cuda += glob.glob(
         os.path.join(extensions_dir, "attention", "cuda", "**", "*.cu"), recursive=True
diff --git a/tests/test_ck_7.py b/tests/test_ck_7.py
index 6f6124945..7477c3f70 100644
--- a/tests/test_ck_7.py
+++ b/tests/test_ck_7.py
@@ -3,14 +3,11 @@
 # This source code is licensed under the BSD license found in the
 # LICENSE file in the root directory of this source tree.
 
-import math
 import random
 from typing import List, Optional, Sequence, Tuple, Type, TypeVar
 
 import pytest
 import torch
-from scipy.stats import binomtest
-from torch.utils.checkpoint import checkpoint
 
 import xformers.ops
 from xformers.ops import fmha
@@ -404,7 +401,8 @@ def create_attn_bias(
             # ToDo: need a fix in ck-flashAttn to avoid divided-by-zero when all-(-inf) occurred
             #       with the data read by one-thread
             # make sure it also works if the first columns are partially masked out
-            ## attn_bias[0, 0, q_len - 1 :, : num_heads - 2] = -math.inf
+            #
+            # attn_bias[0, 0, q_len - 1 :, : num_heads - 2] = -math.inf
 
         if requires_grad:
             attn_bias.requires_grad_(True)
@@ -743,7 +741,7 @@ def test_backward(
     if k % 8 != 0 or kv % 8 != 0:
         pytest.skip("head-dim length must be an even value for CK-FlashAttention-1")
 
-    ## BottomRightMask requires generate {m0,m1,...}, {n0,n1,...} where mi <= ni
+    # BottomRightMask requires generate {m0,m1,...}, {n0,n1,...} where mi <= ni
     if (
         bias_type is fmha.attn_bias.BlockDiagonalCausalFromBottomRightMask
         and q_len <= kv_len
@@ -755,9 +753,9 @@ def test_backward(
     if k != kv:
         pytest.skip("k same as kv is not well tested by CK-FlashAttention-1")
 
-    ## attn_bias_requires_grad = (
-    ##    random.Random(q_len + kv_len * batch_size).randint(0, 1) > 0
-    ##)
+    # attn_bias_requires_grad = (
+    #    random.Random(q_len + kv_len * batch_size).randint(0, 1) > 0
+    # )
     attn_bias_requires_grad = False
 
     query, key, value, attn_bias = create_tensors(
@@ -798,10 +796,10 @@ def test_backward(
     )
 
     grad_out = torch.ones_like(out)
-    ##if grad_out_contiguous is False:
-    ##    grad_out = torch.tensor([1.0], dtype=query.dtype, device=device)[
-    ##        None, None, :
-    ##    ].expand_as(out)
+    # if grad_out_contiguous is False:
+    #    grad_out = torch.tensor([1.0], dtype=query.dtype, device=device)[
+    #        None, None, :
+    #    ].expand_as(out)
 
     out.backward(grad_out)
 
diff --git a/tests/test_mem_eff_attention.py b/tests/test_mem_eff_attention.py
index ab4442f77..4a460ca3c 100644
--- a/tests/test_mem_eff_attention.py
+++ b/tests/test_mem_eff_attention.py
@@ -414,16 +414,16 @@ def attn_bias_group(group: int):
     def compute_attention_split(q_whole, k_slice, v_slice, attn_bias_slice):
         p_slice = q_whole @ k_slice.transpose(-2, -1)
         p_slice += attn_bias_slice
-        m = torch.max(p_slice, dim=-1, keepdim=True).values
-        p_slice_scaled = p_slice - m
+        row_max = torch.max(p_slice, dim=-1, keepdim=True).values
+        p_slice_scaled = p_slice - row_max
         p_slice_scaled[p_slice_scaled.isnan()] = float("-inf")
         s = torch.exp(p_slice_scaled)
-        l = torch.sum(s, dim=-1, keepdim=True)
+        row_sumexp = torch.sum(s, dim=-1, keepdim=True)
         attn_slice = s @ v_slice
         return {
             "attn_slice": attn_slice,
-            "row_max": m,
-            "row_lse": l,
+            "row_max": row_max,
+            "row_sumexp": row_sumexp,
         }
 
     splits = list(zip(k_split, v_split, attn_bias_split))
@@ -434,12 +434,12 @@ def compute_attention_split(q_whole, k_slice, v_slice, attn_bias_slice):
     # reduce out over split-k slices
 
     global_max = torch.zeros_like(slices[0]["row_max"]).fill_(float("-inf"))
-    global_sumexp = torch.zeros_like(slices[0]["row_lse"])
+    global_sumexp = torch.zeros_like(slices[0]["row_sumexp"])
 
     for s in slices:
         local_out = s["attn_slice"]
         local_max = s["row_max"]
-        local_sumexp = s["row_lse"]
+        local_sumexp = s["row_sumexp"]
 
         log_alpha = -torch.abs(local_max - global_max)
         alpha = torch.exp(log_alpha)
@@ -456,7 +456,7 @@ def compute_attention_split(q_whole, k_slice, v_slice, attn_bias_slice):
     return out
 
 
-## this interface assumes the tensor is in BMHK, but q and k/v might have different number of heads
+# this interface assumes the tensor is in BMHK, but q and k/v might have different number of heads
 def ref_attention_mqa(q, k, v, attn_bias=None, drop_mask=None, p=0.0, scale=None):
     assert q.ndim == 4
 
@@ -777,6 +777,7 @@ def test_mqa_forward(
         err_msg = f"{op.NAME}: unsupported ({'/'.join(reasons)})"
         # Ensure we free memory to avoid OOMs
         del query, key, value, attn_bias, inputs
+        assert False, err_msg
 
     out = xformers.ops.memory_efficient_attention_forward(
         query, key, value, attn_bias, op=op
diff --git a/tests/test_mem_eff_attention_ck_discarded.py b/tests/test_mem_eff_attention_ck_discarded.py
index 2c91ad1d9..2879e6946 100644
--- a/tests/test_mem_eff_attention_ck_discarded.py
+++ b/tests/test_mem_eff_attention_ck_discarded.py
@@ -16,7 +16,6 @@
 import xformers.ops
 from xformers.attn_bias_utils import create_attn_bias
 from xformers.ops import fmha
-from xformers.ops.fmha import ALL_BW_OPS, ALL_FW_OPS
 from xformers.ops.fmha.common import AttentionOpBase
 from xformers.ops.fmha.dispatch import _dispatch_fw_priority_list
 
@@ -390,12 +389,12 @@ def compute_attention_split(q_whole, k_slice, v_slice, attn_bias_slice):
         p_slice_scaled = p_slice - m
         p_slice_scaled[p_slice_scaled.isnan()] = float("-inf")
         s = torch.exp(p_slice_scaled)
-        l = torch.sum(s, dim=-1, keepdim=True)
+        l1 = torch.sum(s, dim=-1, keepdim=True)
         attn_slice = s @ v_slice
         return {
             "attn_slice": attn_slice,
             "row_max": m,
-            "row_lse": l,
+            "row_lse": l1,
         }
 
     splits = list(zip(k_split, v_split, attn_bias_split))
@@ -767,7 +766,7 @@ def test_backward(
         kv,
     ) = opBW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv
 
-    ## ToDo: reopen bfloat16 for testing
+    # ToDo: reopen bfloat16 for testing
     if dtype is torch.bfloat16:
         pytest.skip(
             "Temporarily disabled bfloat16 as we are still improving the accuracy of the results"
@@ -942,9 +941,9 @@ def _vec_binom_test(x, n, p):
 def _get_drop_mask(op, batch_size, q_len, kv_len, p, device):
     if op == fmha.ck.FwOp:
         mask = torch.empty((batch_size, 1, q_len, kv_len), device=device)
-        ## rand_uniform is an int32 tensor
+        # rand_uniform is an int32 tensor
         rand_uniform = torch.ops.xformers._ck_rand_uniform(p, mask)
-        ##mask = (rand_uniform <= int((1.0-p)*65535.0)).to(torch.float32)
+        # mask = (rand_uniform <= int((1.0-p)*65535.0)).to(torch.float32)
         mask = (rand_uniform <= int((1.0 - p) * 255.0)).to(torch.float32)
         mask = mask.reshape(batch_size, q_len, kv_len)
     else:
@@ -1013,8 +1012,6 @@ def test_dropout(dtype, op, q_len, kv_len, batch_size, k_len, p, seed, attn_bias
 
 
 def _test_dropout_backward(q_len, kv_len, batch_size, k, p, op, dtype):
-    if dtype is torch.bfloat16 and compute_capability < (8, 0):
-        pytest.skip("bf16 requires Sm80")
     if not op.is_available():
         pytest.skip()
 
diff --git a/tests/test_mqa_forward_ck_tiled_discarded.py b/tests/test_mqa_forward_ck_tiled_discarded.py
index a1823dfd6..c40bd5708 100644
--- a/tests/test_mqa_forward_ck_tiled_discarded.py
+++ b/tests/test_mqa_forward_ck_tiled_discarded.py
@@ -3,20 +3,15 @@
 # This source code is licensed under the BSD license found in the
 # LICENSE file in the root directory of this source tree.
 
-import math
-import random
-from typing import List, Optional, Sequence, Tuple, Type, TypeVar
+from typing import Sequence, Type, TypeVar
 
 import pytest
 import torch
-from scipy.stats import binomtest
-from torch.utils.checkpoint import checkpoint
 
 import xformers.ops
 from xformers.attn_bias_utils import create_attn_bias
 from xformers.ops import fmha
 from xformers.ops.common import get_xformers_operator
-from xformers.ops.fmha.common import AttentionOpBase
 
 from .utils import assert_allclose
 
@@ -34,7 +29,7 @@
     fmha.ck.FwOp,
 ]
 
-### ck_check_op is temporarily used to check ck-tiled availability
+# ck_check_op is temporarily used to check ck-tiled availability
 ck_check_op = get_xformers_operator("is_ck_tiled_used")
 use_ck_tiled = ck_check_op()
 
@@ -193,6 +188,7 @@ def test_mqa_forward(
         err_msg = f"{op.NAME}: unsupported ({'/'.join(reasons)})"
         # Ensure we free memory to avoid OOMs
         del query, key, value, attn_bias, inputs
+        assert False, err_msg
 
     out = xformers.ops.memory_efficient_attention_forward(
         query, key, value, attn_bias, op=op
diff --git a/xformers/benchmarks/benchmark_mem_eff_atttention_mqa.py b/xformers/benchmarks/benchmark_mem_eff_atttention_mqa.py
index ae6f11b15..4e4c47e38 100644
--- a/xformers/benchmarks/benchmark_mem_eff_atttention_mqa.py
+++ b/xformers/benchmarks/benchmark_mem_eff_atttention_mqa.py
@@ -18,7 +18,8 @@
 
 torch.backends.cuda.matmul.allow_tf32 = False
 
-## this interface assumes the tensor is in BMHK, but q and k/v might has different number of heads
+
+# this interface assumes the tensor is in BMHK, but q and k/v might has different number of heads
 def ref_attention_mqa(
     q, k, v, attn_bias=None, drop_mask=None, p=0.0, scale=None, dtype=None
 ):
@@ -90,7 +91,7 @@ def attn_bias_head(head: int):
     return attn @ v
 
 
-## ref_attention_bmhk is completely the same as used by test_forward_ck_tiled.py
+# ref_attention_bmhk is completely the same as used by test_forward_ck_tiled.py
 def ref_attention_bmhk(q, k, v, attn_bias, scale=None, dtype=None) -> torch.Tensor:
     assert q.ndim == 4
 
@@ -124,9 +125,9 @@ def T(t):
     (1, 1024, 1024, 64, 8, 64),
     (1, 1024, 1024, 8, 1, 64),
     (1, 1024, 1024, 4, 4, 64),
-    ##*sorted(itertools.product([1, 2], [2048, 4096], [2048, 4096], [4, 8], [1, 2], [128])),
-    ##*sorted(
-    ##    itertools.product([16], [128, 512], [512, 1024], [16], [2, 4], [64, 128])
+    # *sorted(itertools.product([1, 2], [2048, 4096], [2048, 4096], [4, 8], [1, 2], [128])),
+    # *sorted(
+    #    itertools.product([16], [128, 512], [512, 1024], [16], [2, 4], [64, 128])
     # ),
 ]
 
@@ -135,7 +136,8 @@ def T(t):
     xformers.ops.fmha.flash.FwOp,
     # TODO: Triton is not stable: it can trigger Illegal Memory Accesses
     # and its performance varies a lot between runs.
-    ##xformers.ops.fmha.triton.FwOp,
+    #
+    # xformers.ops.fmha.triton.FwOp,
 ]
 
 
@@ -199,7 +201,7 @@ def mem_eff_attention_fw(shape, num_threads: int, attn_bias_type, dropout_p, dty
         dtype=dtype,
         requires_grad=False,
         fmt="BMHK",
-        op=fmha.ck.FwOp,  ## only required as a refer op by create_attn_bias
+        op=fmha.ck.FwOp,  # only required as a refer op by create_attn_bias
     )
     inp = fmha.Inputs(query=q, key=k, value=v, attn_bias=bias, p=dropout_p)
 
diff --git a/xformers/benchmarks/utils.py b/xformers/benchmarks/utils.py
index 0c94df1b6..31c6eb688 100644
--- a/xformers/benchmarks/utils.py
+++ b/xformers/benchmarks/utils.py
@@ -445,53 +445,6 @@ def benchmark_main_helper(benchmark_fn, cases: List[Dict[str, Any]], **kwargs) -
     )
 
 
-def benchmark_main_helper2(
-    name: str,
-    functions,
-    fw: bool = False,
-    bw: bool = False,
-    cuda_graph: bool = True,
-    **kwargs,
-) -> None:
-    assert fw or bw
-
-    def handle_case(**case) -> Iterator[benchmark.Timer]:
-        for k, benchmark_cls in functions.items():
-            benchmark_object = benchmark_cls(**case, bw=bw)
-            label = benchmark_object.label
-            label += "fw" if fw else ""
-            label += "bw" if bw else ""
-
-            def run_one():
-                if fw:
-                    benchmark_object.fw()
-                if bw:
-                    benchmark_object.bw()
-
-            if cuda_graph:
-                run_one()
-                benchmark_object = benchmark_cls(**case, bw=bw)
-                g = torch.cuda.CUDAGraph()
-                with torch.cuda.graph(g):
-                    run_one()
-
-                def run_one():
-                    g.replay()
-
-            yield benchmark.Timer(
-                stmt="fn()",
-                globals={
-                    "fn": run_one,
-                },
-                label=label,
-                description=k,
-                sub_label=benchmark_object.sub_label,
-            )
-
-    handle_case.__name__ = name
-    benchmark_main_helper(handle_case, **kwargs)
-
-
 def benchmark_run_and_compare(
     benchmark_fn,
     cases: List[Dict[str, Any]],
diff --git a/xformers/ops/fmha/ck.py b/xformers/ops/fmha/ck.py
index e6750e88e..625caa7e6 100644
--- a/xformers/ops/fmha/ck.py
+++ b/xformers/ops/fmha/ck.py
@@ -47,17 +47,17 @@ def _get_seqlen_info(
     if isinstance(
         attn_bias, (BlockDiagonalMask, BlockDiagonalCausalWithOffsetPaddedKeysMask)
     ):
-        ##attn_bias.k_seqinfo.to(inp.query.device)
-        ##attn_bias.q_seqinfo.to(inp.query.device)
+        # attn_bias.k_seqinfo.to(inp.query.device)
+        # attn_bias.q_seqinfo.to(inp.query.device)
         seqstart_k = attn_bias.k_seqinfo.seqstart
         seqstart_q = attn_bias.q_seqinfo.seqstart
         max_seqlen_q = attn_bias.q_seqinfo.max_seqlen
-        ##max_seqlen_k = attn_bias.k_seqinfo.max_seqlen
+        # max_seqlen_k = attn_bias.k_seqinfo.max_seqlen
     else:
         seqstart_k = None
         seqstart_q = None
         max_seqlen_q = -1
-        ##max_seqlen_k = -1
+        # max_seqlen_k = -1
 
     return (
         seqstart_k,
@@ -156,7 +156,7 @@ def _custom_mask_type(bias: Optional[Union[torch.Tensor, AttentionBias]]) -> int
 # checking the availability of ck-tiled is necessary since ck-tiled does not
 # have the same functionalities as old-CK
 def is_ck_tiled() -> bool:
-    ### ck_check_op is temporarily used to check ck-tiled availability
+    # ck_check_op is temporarily used to check ck-tiled availability
     ck_check_op = get_xformers_operator("is_ck_tiled_used")
     return ck_check_op()
 
@@ -394,7 +394,7 @@ class BwOp(AttentionBwOpBase):
         type(None),
         torch.Tensor,
         LowerTriangularMask,
-        ##LowerTriangularFromBottomRightMask,
+        # LowerTriangularFromBottomRightMask,
         # TODO: Still some infs/nans in the BW pass for
         # local + causal
         # LowerTriangularFromBottomRightLocalAttentionMask,
@@ -403,7 +403,7 @@ class BwOp(AttentionBwOpBase):
         BlockDiagonalMask,
         BlockDiagonalCausalMask,
         attn_bias.BlockDiagonalCausalFromBottomRightMask,
-        ##attn_bias.BlockDiagonalCausalLocalAttentionMask,
+        # attn_bias.BlockDiagonalCausalLocalAttentionMask,
     }
     SUPPORTS_ATTN_BIAS_GRAD = True
     SUPPORTS_DROPOUT = FwOp.SUPPORTS_DROPOUT
diff --git a/xformers/ops/fmha/ck_splitk.py b/xformers/ops/fmha/ck_splitk.py
index 63bdb1528..87db094b2 100644
--- a/xformers/ops/fmha/ck_splitk.py
+++ b/xformers/ops/fmha/ck_splitk.py
@@ -103,7 +103,6 @@ def apply(
         cls, inp: Inputs, needs_gradient: bool
     ) -> Tuple[torch.Tensor, Optional[Context]]:
         attn_bias = inp.attn_bias
-        seq_len = None
         q, k, v = inp.get_qkv_in_bmghk()
 
         if attn_bias is not None:

From 0a9c933f4896053fb7e2c8e23c5cf07739a1a779 Mon Sep 17 00:00:00 2001
From: Max Podkorytov <4273004+tenpercent@users.noreply.github.com>
Date: Tue, 6 Feb 2024 02:10:11 +0000
Subject: [PATCH 431/641] add license headers and reapply black

---
 xformers/ops/fmha/ck_decoder.py | 8 ++++++--
 xformers/ops/fmha/ck_splitk.py  | 5 +++++
 2 files changed, 11 insertions(+), 2 deletions(-)

diff --git a/xformers/ops/fmha/ck_decoder.py b/xformers/ops/fmha/ck_decoder.py
index 14e6ba09a..0da84d441 100644
--- a/xformers/ops/fmha/ck_decoder.py
+++ b/xformers/ops/fmha/ck_decoder.py
@@ -1,4 +1,8 @@
-# TODO(max): add a proper copyright header
+# Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
+#
+# This source code is licensed under the BSD license found in the
+# LICENSE file in the root directory of this source tree.
+
 from typing import Any, List, Optional, Set, Tuple
 
 import torch
@@ -69,7 +73,7 @@ def not_supported_reasons(cls, d: Inputs) -> List[str]:
             padding = attn_bias.k_seqinfo.padding
             bsz = d.key.shape[1] // padding
             num_queries = d.query.shape[1] // bsz
-            
+
             if q_starts != list(range(0, 1 + bsz, num_queries)):
                 reasons.append("expect to have same num_queries in each batch")
             if bsz != len(q_starts) - 1:
diff --git a/xformers/ops/fmha/ck_splitk.py b/xformers/ops/fmha/ck_splitk.py
index 87db094b2..249edd533 100644
--- a/xformers/ops/fmha/ck_splitk.py
+++ b/xformers/ops/fmha/ck_splitk.py
@@ -1,3 +1,8 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
+#
+# This source code is licensed under the BSD license found in the
+# LICENSE file in the root directory of this source tree.
+
 from typing import Any, List, Optional, Set, Tuple
 
 import torch

From 28d3672973f7e7778237246531ca861243cdbbef Mon Sep 17 00:00:00 2001
From: Qianfeng Zhang <Qianfeng.Zhang@amd.com>
Date: Tue, 6 Feb 2024 16:05:44 +0000
Subject: [PATCH 432/641] Tiny update to rocm_ci.yml

---
 .github/workflows/rocm_ci.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/rocm_ci.yml b/.github/workflows/rocm_ci.yml
index 6d36a7e97..f2593d53a 100644
--- a/.github/workflows/rocm_ci.yml
+++ b/.github/workflows/rocm_ci.yml
@@ -57,7 +57,7 @@ jobs:
 
     - name: Run python tests
       run: |
-        pytest -rpfs /xformers/tests/test_mem_eff_attention_ck.py | tee test_mem_eff_attention_ck.log
+        pytest -rpfs /xformers/tests/test_mem_eff_attention.py | tee test_mem_eff_attention.log
     
     - name: Archive logs
       uses: actions/upload-artifact@v3

From 12fb41c2460909285102426ca9ab52162725d64b Mon Sep 17 00:00:00 2001
From: Qianfeng Zhang <Qianfeng.Zhang@amd.com>
Date: Tue, 6 Feb 2024 20:08:59 +0000
Subject: [PATCH 433/641] Add conditional compiling for cuda-depending codes in
 ROCM

---
 xformers/csrc/attention/matmul.cpp         | 2 ++
 xformers/csrc/attention/sddmm.cpp          | 2 ++
 xformers/csrc/attention/sparse_softmax.cpp | 2 ++
 xformers/csrc/attention/spmm.cpp           | 2 ++
 xformers/csrc/swiglu/swiglu_op.cpp         | 2 ++
 xformers/csrc/swiglu/swiglu_packedw.cpp    | 2 ++
 6 files changed, 12 insertions(+)

diff --git a/xformers/csrc/attention/matmul.cpp b/xformers/csrc/attention/matmul.cpp
index 284191263..e5c7deb1d 100644
--- a/xformers/csrc/attention/matmul.cpp
+++ b/xformers/csrc/attention/matmul.cpp
@@ -35,8 +35,10 @@ at::Tensor matmul_with_mask(
 }
 
 TORCH_LIBRARY_FRAGMENT(xformers, m) {
+#if !defined(USE_ROCM)
   m.def(TORCH_SELECTIVE_SCHEMA(
       "xformers::matmul_with_mask(Tensor a, Tensor b, Tensor mask) -> Tensor"));
+#endif
 }
 
 TORCH_LIBRARY_IMPL(xformers, CPU, m) {
diff --git a/xformers/csrc/attention/sddmm.cpp b/xformers/csrc/attention/sddmm.cpp
index 7b5e7e330..f4b810b0a 100644
--- a/xformers/csrc/attention/sddmm.cpp
+++ b/xformers/csrc/attention/sddmm.cpp
@@ -9,6 +9,8 @@
 #include <torch/types.h>
 
 TORCH_LIBRARY_FRAGMENT(xformers, m) {
+#if !defined(USE_ROCM)
   m.def(TORCH_SELECTIVE_SCHEMA(
       "xformers::sddmm_sputnik(Tensor a, Tensor b, Tensor row_indices, Tensor row_offsets, Tensor column_indices) -> Tensor"));
+#endif
 }
diff --git a/xformers/csrc/attention/sparse_softmax.cpp b/xformers/csrc/attention/sparse_softmax.cpp
index 826e3641e..074e670e3 100644
--- a/xformers/csrc/attention/sparse_softmax.cpp
+++ b/xformers/csrc/attention/sparse_softmax.cpp
@@ -9,8 +9,10 @@
 #include <torch/types.h>
 
 TORCH_LIBRARY_FRAGMENT(xformers, m) {
+#if !defined(USE_ROCM)
   m.def(TORCH_SELECTIVE_SCHEMA(
       "xformers::sparse_softmax_sputnik(int m, int n, Tensor row_indices, Tensor values, Tensor row_offsets, Tensor column_indices) -> Tensor"));
   m.def(TORCH_SELECTIVE_SCHEMA(
       "xformers::sparse_softmax_backward_sputnik(int m, int n, Tensor row_indices, Tensor values, Tensor gradient, Tensor row_offsets, Tensor column_indices) -> Tensor"));
+#endif
 }
diff --git a/xformers/csrc/attention/spmm.cpp b/xformers/csrc/attention/spmm.cpp
index fbe7e1bf9..06271e6c0 100644
--- a/xformers/csrc/attention/spmm.cpp
+++ b/xformers/csrc/attention/spmm.cpp
@@ -9,6 +9,8 @@
 #include <torch/types.h>
 
 TORCH_LIBRARY_FRAGMENT(xformers, m) {
+#if !defined(USE_ROCM)
   m.def(TORCH_SELECTIVE_SCHEMA(
       "xformers::spmm_sputnik(Tensor b, Tensor row_indices, Tensor values, Tensor row_offsets, Tensor column_indices, int m) -> Tensor"));
+#endif
 }
diff --git a/xformers/csrc/swiglu/swiglu_op.cpp b/xformers/csrc/swiglu/swiglu_op.cpp
index a8880acf6..6f1ef4d7a 100644
--- a/xformers/csrc/swiglu/swiglu_op.cpp
+++ b/xformers/csrc/swiglu/swiglu_op.cpp
@@ -8,10 +8,12 @@
 #include <torch/types.h>
 
 TORCH_LIBRARY_FRAGMENT(xformers, m) {
+#if !defined(USE_ROCM)
   m.def(TORCH_SELECTIVE_SCHEMA(
       "xformers::dual_gemm_silu_identity_mul(Tensor x, Tensor w1, Tensor? b1, Tensor w2, Tensor? b2) -> (Tensor, Tensor, Tensor)"));
   m.def(TORCH_SELECTIVE_SCHEMA(
       "xformers::silu_bw_fused(Tensor x1, Tensor x2, Tensor dx4) -> (Tensor, Tensor)"));
   m.def(TORCH_SELECTIVE_SCHEMA(
       "xformers::gemm_fused_operand_sum(Tensor a, Tensor b, Tensor out_mm, Tensor out_sum) -> (Tensor, Tensor)"));
+#endif
 }
diff --git a/xformers/csrc/swiglu/swiglu_packedw.cpp b/xformers/csrc/swiglu/swiglu_packedw.cpp
index 00fbef12a..65e3e22a8 100644
--- a/xformers/csrc/swiglu/swiglu_packedw.cpp
+++ b/xformers/csrc/swiglu/swiglu_packedw.cpp
@@ -221,8 +221,10 @@ at::Tensor swiglu_packedw_cuda(
 } // namespace
 
 TORCH_LIBRARY(xformers, m) {
+#if !defined(USE_ROCM)
   m.def(
       "swiglu_packedw(Tensor x, Tensor w1w2, Tensor? b1b2, Tensor w3, Tensor? b3) -> Tensor");
+#endif
 }
 
 TORCH_LIBRARY_IMPL(xformers, Autograd, m) {

From a9d83c6cc0267ba3bfd0777fc1821e13db1a7aca Mon Sep 17 00:00:00 2001
From: Qianfeng Zhang <Qianfeng.Zhang@amd.com>
Date: Wed, 7 Feb 2024 00:21:28 +0000
Subject: [PATCH 434/641] Update to benchmark scripts

---
 xformers/benchmarks/benchmark_attn_decoding.py |  8 ++++++--
 xformers/benchmarks/benchmark_core.py          | 12 +++++++-----
 xformers/benchmarks/benchmark_nystrom_utils.py |  4 +++-
 xformers/benchmarks/benchmark_sddmm.py         | 15 +++++++++------
 xformers/benchmarks/benchmark_swiglu.py        |  8 +++++---
 xformers/benchmarks/benchmark_transformer.py   |  6 ++++--
 6 files changed, 34 insertions(+), 19 deletions(-)

diff --git a/xformers/benchmarks/benchmark_attn_decoding.py b/xformers/benchmarks/benchmark_attn_decoding.py
index 31883008b..abfb6ae62 100644
--- a/xformers/benchmarks/benchmark_attn_decoding.py
+++ b/xformers/benchmarks/benchmark_attn_decoding.py
@@ -3,6 +3,7 @@
 # This source code is licensed under the BSD license found in the
 # LICENSE file in the root directory of this source tree.
 
+import sys
 
 from typing import Any
 
@@ -128,11 +129,14 @@ def fw(self) -> None:
     "pytorch": AttentionDecodingPyTorchRepeat,
     "ck": AttentionDecodingCK,
     "ck-decoder": AttentionDecodingCKDecoder,
-    "flash-decoding": AttentionDecodingFlashDecoding,
-    "triton_splitK": AttentionDecodingSplitKV,
     "ck_splitK": AttentionDecodingCKSplitKV,
 }
 
+if torch.version.cuda:
+    BENCHMARKS["flash-decoding"] = AttentionDecodingFlashDecoding
+
+if (sys.version_info.major, sys.version_info.minor) >= (3, 9): 
+    BENCHMARKS["triton_splitK"] = AttentionDecodingSplitKV
 
 try:
     import flash_attn
diff --git a/xformers/benchmarks/benchmark_core.py b/xformers/benchmarks/benchmark_core.py
index 97cdefa09..ee14c4cb4 100644
--- a/xformers/benchmarks/benchmark_core.py
+++ b/xformers/benchmarks/benchmark_core.py
@@ -251,8 +251,10 @@ def bench_bmm():
     compare = benchmark.Compare(results)
     compare.print()
 
-
-bench_sddmm()
-bench_matmul_with_mask()
-bench_softmax()
-bench_bmm()
+if torch.version.hip:
+    print("This benchmark could not be done on ROCM!")
+else:
+    bench_sddmm()
+    bench_matmul_with_mask()
+    bench_softmax()
+    bench_bmm()
diff --git a/xformers/benchmarks/benchmark_nystrom_utils.py b/xformers/benchmarks/benchmark_nystrom_utils.py
index 6f4b38c84..c85b03456 100644
--- a/xformers/benchmarks/benchmark_nystrom_utils.py
+++ b/xformers/benchmarks/benchmark_nystrom_utils.py
@@ -93,7 +93,9 @@ def iterative_pinv_analysis(
                 break
 
 
-if __name__ == "__main__":
+if torch.version.hip:
+    print("This benchmark could not be done on ROCM!")
+else:
     iterative_pinv_analysis()
     bench_inverse(iterative_pinv)
     bench_inverse(torch.linalg.pinv)
diff --git a/xformers/benchmarks/benchmark_sddmm.py b/xformers/benchmarks/benchmark_sddmm.py
index 693e4a623..536fc5ef8 100644
--- a/xformers/benchmarks/benchmark_sddmm.py
+++ b/xformers/benchmarks/benchmark_sddmm.py
@@ -109,9 +109,12 @@ def bench_sddmm(configs):
 
 results = []
 
-print("Swin Transformer")
-results += bench_sddmm(swin_t_config)
-print("ViT")
-results += bench_sddmm(vit_config)
-print("Basic cases")
-results += bench_sddmm(basic_config)
+if torch.version.hip:
+    print("This benchmark could not be done on ROCM!")
+else:
+    print("Swin Transformer")
+    results += bench_sddmm(swin_t_config)
+    print("ViT")
+    results += bench_sddmm(vit_config)
+    print("Basic cases")
+    results += bench_sddmm(basic_config)
diff --git a/xformers/benchmarks/benchmark_swiglu.py b/xformers/benchmarks/benchmark_swiglu.py
index b268d3f19..a0c026fd5 100644
--- a/xformers/benchmarks/benchmark_swiglu.py
+++ b/xformers/benchmarks/benchmark_swiglu.py
@@ -155,6 +155,8 @@ def benchmark_swiglu_bw(shape, dtype, bias: bool):
         sub_label=sub_label,
     )
 
-
-benchmark_main_helper(benchmark_swiglu, CASES, min_run_time=min_run_time)
-benchmark_main_helper(benchmark_swiglu_bw, CASES, min_run_time=min_run_time)
+if torch.version.hip:
+    print("This benchmark could not be done on ROCM!")
+else:
+    benchmark_main_helper(benchmark_swiglu, CASES, min_run_time=min_run_time)
+    benchmark_main_helper(benchmark_swiglu_bw, CASES, min_run_time=min_run_time)
diff --git a/xformers/benchmarks/benchmark_transformer.py b/xformers/benchmarks/benchmark_transformer.py
index 2a6070b62..2243cacf4 100644
--- a/xformers/benchmarks/benchmark_transformer.py
+++ b/xformers/benchmarks/benchmark_transformer.py
@@ -152,5 +152,7 @@ def benchmark_transformer(model_info, dtype) -> Iterator[benchmark.Timer]:
             sub_label=model_name,
         )
 
-
-benchmark_main_helper(benchmark_transformer, CASES)
+if torch.version.hip:
+    print("This benchmark could not be done on ROCM!")
+else:
+    benchmark_main_helper(benchmark_transformer, CASES)

From 9ab383110e660b653faf018f49d623f6f3146d17 Mon Sep 17 00:00:00 2001
From: Qianfeng Zhang <Qianfeng.Zhang@amd.com>
Date: Wed, 7 Feb 2024 14:28:41 +0000
Subject: [PATCH 435/641] Rename the one script file

---
 ...m_eff_atttention_mqa.py => benchmark_mem_eff_attention_mqa.py} | 0
 1 file changed, 0 insertions(+), 0 deletions(-)
 rename xformers/benchmarks/{benchmark_mem_eff_atttention_mqa.py => benchmark_mem_eff_attention_mqa.py} (100%)

diff --git a/xformers/benchmarks/benchmark_mem_eff_atttention_mqa.py b/xformers/benchmarks/benchmark_mem_eff_attention_mqa.py
similarity index 100%
rename from xformers/benchmarks/benchmark_mem_eff_atttention_mqa.py
rename to xformers/benchmarks/benchmark_mem_eff_attention_mqa.py

From 243dc6a0ef3907ab1903ca84f91ce72b36c70e41 Mon Sep 17 00:00:00 2001
From: Qianfeng Zhang <Qianfeng.Zhang@amd.com>
Date: Wed, 7 Feb 2024 15:07:21 +0000
Subject: [PATCH 436/641] Revert "Add conditional compiling for cuda-depending
 codes in ROCM"

This reverts commit 12fb41c2460909285102426ca9ab52162725d64b.
---
 xformers/csrc/attention/matmul.cpp         | 2 --
 xformers/csrc/attention/sddmm.cpp          | 2 --
 xformers/csrc/attention/sparse_softmax.cpp | 2 --
 xformers/csrc/attention/spmm.cpp           | 2 --
 xformers/csrc/swiglu/swiglu_op.cpp         | 2 --
 xformers/csrc/swiglu/swiglu_packedw.cpp    | 2 --
 6 files changed, 12 deletions(-)

diff --git a/xformers/csrc/attention/matmul.cpp b/xformers/csrc/attention/matmul.cpp
index e5c7deb1d..284191263 100644
--- a/xformers/csrc/attention/matmul.cpp
+++ b/xformers/csrc/attention/matmul.cpp
@@ -35,10 +35,8 @@ at::Tensor matmul_with_mask(
 }
 
 TORCH_LIBRARY_FRAGMENT(xformers, m) {
-#if !defined(USE_ROCM)
   m.def(TORCH_SELECTIVE_SCHEMA(
       "xformers::matmul_with_mask(Tensor a, Tensor b, Tensor mask) -> Tensor"));
-#endif
 }
 
 TORCH_LIBRARY_IMPL(xformers, CPU, m) {
diff --git a/xformers/csrc/attention/sddmm.cpp b/xformers/csrc/attention/sddmm.cpp
index f4b810b0a..7b5e7e330 100644
--- a/xformers/csrc/attention/sddmm.cpp
+++ b/xformers/csrc/attention/sddmm.cpp
@@ -9,8 +9,6 @@
 #include <torch/types.h>
 
 TORCH_LIBRARY_FRAGMENT(xformers, m) {
-#if !defined(USE_ROCM)
   m.def(TORCH_SELECTIVE_SCHEMA(
       "xformers::sddmm_sputnik(Tensor a, Tensor b, Tensor row_indices, Tensor row_offsets, Tensor column_indices) -> Tensor"));
-#endif
 }
diff --git a/xformers/csrc/attention/sparse_softmax.cpp b/xformers/csrc/attention/sparse_softmax.cpp
index 074e670e3..826e3641e 100644
--- a/xformers/csrc/attention/sparse_softmax.cpp
+++ b/xformers/csrc/attention/sparse_softmax.cpp
@@ -9,10 +9,8 @@
 #include <torch/types.h>
 
 TORCH_LIBRARY_FRAGMENT(xformers, m) {
-#if !defined(USE_ROCM)
   m.def(TORCH_SELECTIVE_SCHEMA(
       "xformers::sparse_softmax_sputnik(int m, int n, Tensor row_indices, Tensor values, Tensor row_offsets, Tensor column_indices) -> Tensor"));
   m.def(TORCH_SELECTIVE_SCHEMA(
       "xformers::sparse_softmax_backward_sputnik(int m, int n, Tensor row_indices, Tensor values, Tensor gradient, Tensor row_offsets, Tensor column_indices) -> Tensor"));
-#endif
 }
diff --git a/xformers/csrc/attention/spmm.cpp b/xformers/csrc/attention/spmm.cpp
index 06271e6c0..fbe7e1bf9 100644
--- a/xformers/csrc/attention/spmm.cpp
+++ b/xformers/csrc/attention/spmm.cpp
@@ -9,8 +9,6 @@
 #include <torch/types.h>
 
 TORCH_LIBRARY_FRAGMENT(xformers, m) {
-#if !defined(USE_ROCM)
   m.def(TORCH_SELECTIVE_SCHEMA(
       "xformers::spmm_sputnik(Tensor b, Tensor row_indices, Tensor values, Tensor row_offsets, Tensor column_indices, int m) -> Tensor"));
-#endif
 }
diff --git a/xformers/csrc/swiglu/swiglu_op.cpp b/xformers/csrc/swiglu/swiglu_op.cpp
index 6f1ef4d7a..a8880acf6 100644
--- a/xformers/csrc/swiglu/swiglu_op.cpp
+++ b/xformers/csrc/swiglu/swiglu_op.cpp
@@ -8,12 +8,10 @@
 #include <torch/types.h>
 
 TORCH_LIBRARY_FRAGMENT(xformers, m) {
-#if !defined(USE_ROCM)
   m.def(TORCH_SELECTIVE_SCHEMA(
       "xformers::dual_gemm_silu_identity_mul(Tensor x, Tensor w1, Tensor? b1, Tensor w2, Tensor? b2) -> (Tensor, Tensor, Tensor)"));
   m.def(TORCH_SELECTIVE_SCHEMA(
       "xformers::silu_bw_fused(Tensor x1, Tensor x2, Tensor dx4) -> (Tensor, Tensor)"));
   m.def(TORCH_SELECTIVE_SCHEMA(
       "xformers::gemm_fused_operand_sum(Tensor a, Tensor b, Tensor out_mm, Tensor out_sum) -> (Tensor, Tensor)"));
-#endif
 }
diff --git a/xformers/csrc/swiglu/swiglu_packedw.cpp b/xformers/csrc/swiglu/swiglu_packedw.cpp
index 65e3e22a8..00fbef12a 100644
--- a/xformers/csrc/swiglu/swiglu_packedw.cpp
+++ b/xformers/csrc/swiglu/swiglu_packedw.cpp
@@ -221,10 +221,8 @@ at::Tensor swiglu_packedw_cuda(
 } // namespace
 
 TORCH_LIBRARY(xformers, m) {
-#if !defined(USE_ROCM)
   m.def(
       "swiglu_packedw(Tensor x, Tensor w1w2, Tensor? b1b2, Tensor w3, Tensor? b3) -> Tensor");
-#endif
 }
 
 TORCH_LIBRARY_IMPL(xformers, Autograd, m) {

From 3240ba19f2fb086ab51ebfc280e66bcb66b28416 Mon Sep 17 00:00:00 2001
From: Qianfeng Zhang <Qianfeng.Zhang@amd.com>
Date: Wed, 7 Feb 2024 16:05:57 +0000
Subject: [PATCH 437/641] Update to scripts

---
 tests/test_checkpoint.py                         |  8 +++++---
 xformers/benchmarks/LRA/run_tasks.py             | 16 ++++++++++------
 xformers/benchmarks/benchmark_attn_decoding.py   |  9 ++++-----
 .../benchmark_blocksparse_transformers.py        |  4 ++--
 xformers/benchmarks/benchmark_core.py            |  1 +
 xformers/benchmarks/benchmark_indexing.py        |  2 +-
 .../benchmarks/benchmark_mem_eff_attention.py    |  8 +++++---
 .../benchmarks/benchmark_mem_eff_attn_decoder.py |  8 +++++---
 xformers/benchmarks/benchmark_swiglu.py          |  1 +
 xformers/benchmarks/benchmark_transformer.py     |  1 +
 xformers/benchmarks/utils.py                     | 14 ++++++++------
 xformers/ops/fmha/ck.py                          |  6 +-----
 12 files changed, 44 insertions(+), 34 deletions(-)

diff --git a/tests/test_checkpoint.py b/tests/test_checkpoint.py
index 8e456d345..722a3eb8c 100644
--- a/tests/test_checkpoint.py
+++ b/tests/test_checkpoint.py
@@ -106,9 +106,11 @@ def test_checkpoint_with_grad(policy_fn, input_requires_grad, grad_mode):
     "op",
     [
         xformers.ops.MemoryEfficientAttentionFlashAttentionOp,
-        xformers.ops.MemoryEfficientAttentionCutlassOp
-        if torch.version.cuda
-        else xformers.ops.MemoryEfficientAttentionCkOp,
+        (
+            xformers.ops.MemoryEfficientAttentionCutlassOp
+            if torch.version.cuda
+            else xformers.ops.MemoryEfficientAttentionCkOp
+        ),
     ],
 )
 def test_checkpoint_attention(policy_fn, input_requires_grad, device, autocast, op):
diff --git a/xformers/benchmarks/LRA/run_tasks.py b/xformers/benchmarks/LRA/run_tasks.py
index e9d1f7284..41c5fbe55 100644
--- a/xformers/benchmarks/LRA/run_tasks.py
+++ b/xformers/benchmarks/LRA/run_tasks.py
@@ -53,9 +53,11 @@ def build_model(args: argparse.Namespace, config: Dict) -> nn.Module:
 
     model = cast(
         pl.LightningModule,
-        ModelForSCDual(config[f"{task}"], attention_name)
-        if task == Task.Retrieval
-        else ModelForSC(config[f"{task}"], attention_name),
+        (
+            ModelForSCDual(config[f"{task}"], attention_name)
+            if task == Task.Retrieval
+            else ModelForSC(config[f"{task}"], attention_name)
+        ),
     )
 
     logging.info(model)
@@ -252,9 +254,11 @@ def benchmark(args):
 
     trainer = pl.Trainer(
         accelerator="gpu",
-        strategy=DDPStrategy(find_unused_parameters=args.debug)
-        if not args.skip_train
-        else None,
+        strategy=(
+            DDPStrategy(find_unused_parameters=args.debug)
+            if not args.skip_train
+            else None
+        ),
         accumulate_grad_batches=config_training["gradient_accumulation"],
         callbacks=[progress_bar, checkpoint_callback],
         detect_anomaly=args.debug,
diff --git a/xformers/benchmarks/benchmark_attn_decoding.py b/xformers/benchmarks/benchmark_attn_decoding.py
index abfb6ae62..3c30e5702 100644
--- a/xformers/benchmarks/benchmark_attn_decoding.py
+++ b/xformers/benchmarks/benchmark_attn_decoding.py
@@ -4,7 +4,6 @@
 # LICENSE file in the root directory of this source tree.
 
 import sys
-
 from typing import Any
 
 import torch
@@ -135,7 +134,7 @@ def fw(self) -> None:
 if torch.version.cuda:
     BENCHMARKS["flash-decoding"] = AttentionDecodingFlashDecoding
 
-if (sys.version_info.major, sys.version_info.minor) >= (3, 9): 
+if (sys.version_info.major, sys.version_info.minor) >= (3, 9):
     BENCHMARKS["triton_splitK"] = AttentionDecodingSplitKV
 
 try:
@@ -152,9 +151,9 @@ def fw(self) -> None:
                 v = v[:, :, :, 0]
             return flash_attn.flash_attn_func(q, k, v)
 
-    BENCHMARKS[
-        f"flash-attention@{flash_attn.__version__}"
-    ] = AttentionDecodingFlashAttention
+    BENCHMARKS[f"flash-attention@{flash_attn.__version__}"] = (
+        AttentionDecodingFlashAttention
+    )
 except ImportError:
     pass
 
diff --git a/xformers/benchmarks/benchmark_blocksparse_transformers.py b/xformers/benchmarks/benchmark_blocksparse_transformers.py
index f9cb72a15..3cdd9a369 100644
--- a/xformers/benchmarks/benchmark_blocksparse_transformers.py
+++ b/xformers/benchmarks/benchmark_blocksparse_transformers.py
@@ -60,7 +60,7 @@ def get_mask(MaskGenType, config, config_setter=[]):
 
     # Get the mask
     mask_generator = MaskGenType(mask_config)
-    for (key, value) in config_setter:
+    for key, value in config_setter:
         mask_generator.set_config_attr(key, value)
     if not mask_generator.is_valid_config():
         return None
@@ -73,7 +73,7 @@ def densify_mask(mask, config):
     seq_length = config.seq_length
     block_size = config.block_size
     dense_mask = torch.zeros(num_heads, seq_length, seq_length)
-    for (h, i, j) in zip(*mask.nonzero(as_tuple=True)):
+    for h, i, j in zip(*mask.nonzero(as_tuple=True)):
         dense_mask[
             h,
             i * block_size : (i + 1) * block_size,
diff --git a/xformers/benchmarks/benchmark_core.py b/xformers/benchmarks/benchmark_core.py
index ee14c4cb4..2a4d67560 100644
--- a/xformers/benchmarks/benchmark_core.py
+++ b/xformers/benchmarks/benchmark_core.py
@@ -251,6 +251,7 @@ def bench_bmm():
     compare = benchmark.Compare(results)
     compare.print()
 
+
 if torch.version.hip:
     print("This benchmark could not be done on ROCM!")
 else:
diff --git a/xformers/benchmarks/benchmark_indexing.py b/xformers/benchmarks/benchmark_indexing.py
index ed1e71001..353b9dba7 100644
--- a/xformers/benchmarks/benchmark_indexing.py
+++ b/xformers/benchmarks/benchmark_indexing.py
@@ -111,7 +111,7 @@ def __init__(self, dtype, batches, D, keep_ratio, bw: bool) -> None:
 
         indices = []
         sources = []
-        for (B, seqlen) in batches:
+        for B, seqlen in batches:
             index = [i for i in range(B)]
             random.Random(B).shuffle(index)
             indices.append(
diff --git a/xformers/benchmarks/benchmark_mem_eff_attention.py b/xformers/benchmarks/benchmark_mem_eff_attention.py
index 5c5305a16..bbeb22264 100644
--- a/xformers/benchmarks/benchmark_mem_eff_attention.py
+++ b/xformers/benchmarks/benchmark_mem_eff_attention.py
@@ -113,9 +113,11 @@ class TritonFlashAttentionFwAutotuned(xformers.ops.fmha.triton.FwOp):
     (xformers.ops.fmha.ck.FwOp, xformers.ops.fmha.ck.BwOp),
     (
         TritonFlashAttentionFwAutotuned,
-        xformers.ops.fmha.cutlass.BwOp
-        if torch.version.cuda
-        else xformers.ops.fmha.ck.BwOp,
+        (
+            xformers.ops.fmha.cutlass.BwOp
+            if torch.version.cuda
+            else xformers.ops.fmha.ck.BwOp
+        ),
     ),
 ]
 
diff --git a/xformers/benchmarks/benchmark_mem_eff_attn_decoder.py b/xformers/benchmarks/benchmark_mem_eff_attn_decoder.py
index 7616d702d..67698c87c 100644
--- a/xformers/benchmarks/benchmark_mem_eff_attn_decoder.py
+++ b/xformers/benchmarks/benchmark_mem_eff_attn_decoder.py
@@ -60,9 +60,11 @@ def T(t):
 
 OPS = [
     xformers.ops.fmha.cutlass.FwOp if torch.version.cuda else xformers.ops.fmha.ck.FwOp,
-    xformers.ops.fmha.decoder.FwOp
-    if torch.version.cuda
-    else xformers.ops.fmha.ck_decoder.FwOp,
+    (
+        xformers.ops.fmha.decoder.FwOp
+        if torch.version.cuda
+        else xformers.ops.fmha.ck_decoder.FwOp
+    ),
 ]
 
 KV_SHAPES = [
diff --git a/xformers/benchmarks/benchmark_swiglu.py b/xformers/benchmarks/benchmark_swiglu.py
index a0c026fd5..b28367334 100644
--- a/xformers/benchmarks/benchmark_swiglu.py
+++ b/xformers/benchmarks/benchmark_swiglu.py
@@ -155,6 +155,7 @@ def benchmark_swiglu_bw(shape, dtype, bias: bool):
         sub_label=sub_label,
     )
 
+
 if torch.version.hip:
     print("This benchmark could not be done on ROCM!")
 else:
diff --git a/xformers/benchmarks/benchmark_transformer.py b/xformers/benchmarks/benchmark_transformer.py
index 2243cacf4..4346af9c1 100644
--- a/xformers/benchmarks/benchmark_transformer.py
+++ b/xformers/benchmarks/benchmark_transformer.py
@@ -152,6 +152,7 @@ def benchmark_transformer(model_info, dtype) -> Iterator[benchmark.Timer]:
             sub_label=model_name,
         )
 
+
 if torch.version.hip:
     print("This benchmark could not be done on ROCM!")
 else:
diff --git a/xformers/benchmarks/utils.py b/xformers/benchmarks/utils.py
index 31c6eb688..ef508661a 100644
--- a/xformers/benchmarks/utils.py
+++ b/xformers/benchmarks/utils.py
@@ -263,9 +263,9 @@ def _benchmark_results_from_csv(filename: str) -> List[Tuple[Dict[str, Any], Any
             data.append(
                 (
                     {
-                        META_ALGORITHM: row["algorithm"]
-                        if row["algorithm"] != ""
-                        else None,
+                        META_ALGORITHM: (
+                            row["algorithm"] if row["algorithm"] != "" else None
+                        ),
                     },
                     measurement,
                 )
@@ -282,9 +282,11 @@ def _benchmark_results_to_csv(
             "label": r.task_spec.label,
             "num_threads": r.task_spec.num_threads,
             "algorithm": metadata.get(META_ALGORITHM, ""),
-            "description": r.task_spec.description
-            if r.task_spec.description in BASELINE_DESCRIPTIONS
-            else "",
+            "description": (
+                r.task_spec.description
+                if r.task_spec.description in BASELINE_DESCRIPTIONS
+                else ""
+            ),
             "runtime_us": int(1000 * 1000 * r.mean),
             "mem_use_mb": r.mem_use,
         }
diff --git a/xformers/ops/fmha/ck.py b/xformers/ops/fmha/ck.py
index 625caa7e6..f43cb7905 100644
--- a/xformers/ops/fmha/ck.py
+++ b/xformers/ops/fmha/ck.py
@@ -42,22 +42,18 @@ def _minimum_gemm_alignment(inp: Inputs) -> int:
 
 def _get_seqlen_info(
     inp: Inputs,
-) -> Tuple[Optional[torch.Tensor], Optional[torch.Tensor], int, int]:
+) -> Tuple[Optional[torch.Tensor], Optional[torch.Tensor], int]:
     attn_bias = inp.attn_bias
     if isinstance(
         attn_bias, (BlockDiagonalMask, BlockDiagonalCausalWithOffsetPaddedKeysMask)
     ):
-        # attn_bias.k_seqinfo.to(inp.query.device)
-        # attn_bias.q_seqinfo.to(inp.query.device)
         seqstart_k = attn_bias.k_seqinfo.seqstart
         seqstart_q = attn_bias.q_seqinfo.seqstart
         max_seqlen_q = attn_bias.q_seqinfo.max_seqlen
-        # max_seqlen_k = attn_bias.k_seqinfo.max_seqlen
     else:
         seqstart_k = None
         seqstart_q = None
         max_seqlen_q = -1
-        # max_seqlen_k = -1
 
     return (
         seqstart_k,

From 0c51af1953dcdd99763223cf838e2ea7c82b50bf Mon Sep 17 00:00:00 2001
From: Qianfeng Zhang <Qianfeng.Zhang@amd.com>
Date: Wed, 7 Feb 2024 16:19:58 +0000
Subject: [PATCH 438/641] Change and add readme for tests and benchmarks

---
 tests/readme_test_on_rocm.txt                 | 35 ++++---------------
 .../benchmarks/readme_benchmark_on_rocm.txt   | 17 +++++++++
 2 files changed, 23 insertions(+), 29 deletions(-)
 create mode 100644 xformers/benchmarks/readme_benchmark_on_rocm.txt

diff --git a/tests/readme_test_on_rocm.txt b/tests/readme_test_on_rocm.txt
index 129bf3df0..c21fd0d58 100644
--- a/tests/readme_test_on_rocm.txt
+++ b/tests/readme_test_on_rocm.txt
@@ -1,36 +1,13 @@
 
-   1. pip install -e ./
+   1. #> pip install -e ./
 
-   2. verify testing for memory_efficient_attention inference 
+   2. verify testing for generic fmha inference on ROCM
 
-      pytest tests/test_mem_eff_attention_ck.py::test_forward
-      pytest tests/test_mem_eff_attention.py::test_forward -k ckF 
+      #> pytest tests/test_mem_eff_attention.py::test_forward  
 
-   3. The following tests in tests/memory_eff_attention_ck.py have passed
+   3. verify testing for decoder fmha inference on ROCM
 
-      * test_forward
-      * test_key_query_all_ones
-      * test_logsumexp
-      * test_attn_bias 
-        - test_attn_bias_causal
-        - test_attn_bias_torch_tensor 
-        - test_attn_bias_blockdiag
-        - test_attn_bias_blockdiag_batched
-        - test_attn_bias_blockdiag_crossattn_causal
-        - test_attn_bias_blockdiag_crossattn_causal_with_prefix_qk_cond
-        - test_attn_bias_blockdiag_crossattn_causal_with_prefix()
-        - test_attn_bias_padded
-        - test_attn_bias_from_seqlens
-        - test_attn_bias_blockdiag_doc
-      * test_unsupported_cpu
-      * test_unsupported_stride_lastdim
-      * test_unsupported_stride_alignment
-      * test_cuda_streams
-      * test_dropout
-      * test_backward
-      * test_decoder
+      #> pytest tests/test_mem_eff_attention.py::test_decoder
+      #> pytest tests/test_mem_eff_attention.py::test_splitk_decoder
 
-   4. verify testing for memory_efficient_attention forward (with dropout)
-
-      pytest tests/test_mem_eff_attention_ck.py::test_dropout
 
diff --git a/xformers/benchmarks/readme_benchmark_on_rocm.txt b/xformers/benchmarks/readme_benchmark_on_rocm.txt
new file mode 100644
index 000000000..9ae61f529
--- /dev/null
+++ b/xformers/benchmarks/readme_benchmark_on_rocm.txt
@@ -0,0 +1,17 @@
+
+
+    1. #> pip install -e ./
+
+    2. Benchmark for generic fmha inference on ROCM
+
+       #> python xformers/benchmarks/benchmark_mem_eff_attention.py
+
+    3. Benchmark for decoder fmha inference on ROCM
+
+       #> python xformers/benchmarks/benchmark_mem_eff_attn_decoder.py  
+
+    4. Other Benchmarks for fmha inference on ROCM
+
+       #> python xformers/benchmarks/benchmark_attn_decoding.py
+       #> python xformers/benchmarks/benchmark_mem_eff_attention_mqa.py
+

From f36c93be9d7d61346e331b9e63d3ee8dfa35c36c Mon Sep 17 00:00:00 2001
From: Qianfeng Zhang <Qianfeng.Zhang@amd.com>
Date: Wed, 7 Feb 2024 17:33:04 +0000
Subject: [PATCH 439/641] Remove the stuffs for supporting old ck

---
 setup.py                                      |  198 +-
 tests/test_checkpoint.py                      |    2 +-
 tests/test_mem_eff_attention.py               |   14 +-
 tests/test_mem_eff_attention_ck_discarded.py  | 2466 -----------------
 tests/test_mqa_forward_ck_tiled_discarded.py  |  212 --
 .../hip_fmha/attention_backward_generic.cpp   |  573 ----
 .../hip_fmha/attention_ck_rand_uniform.cpp    |  125 -
 .../hip_fmha/attention_forward_generic.cpp    |  425 ---
 .../csrc/attention/hip_fmha/ck_align_switch.h |  151 -
 .../csrc/attention/hip_fmha/ck_bool_switch.h  |   29 -
 .../ck_fmha_backward_gemm_constants.h         |  196 --
 .../hip_fmha/ck_fmha_batched_backward.h       |  525 ----
 .../ck_fmha_batched_backward_bp16.cpp         |  113 -
 .../ck_fmha_batched_backward_fp16.cpp         |  113 -
 .../hip_fmha/ck_fmha_batched_forward.h        |  379 ---
 .../hip_fmha/ck_fmha_batched_forward_bp16.cpp |   63 -
 .../hip_fmha/ck_fmha_batched_forward_fp16.cpp |   63 -
 .../hip_fmha/ck_fmha_batched_infer.h          |  359 ---
 .../hip_fmha/ck_fmha_batched_infer_bp16.cpp   |   63 -
 .../hip_fmha/ck_fmha_batched_infer_fp16.cpp   |   63 -
 .../hip_fmha/ck_fmha_common_gemm_constants.h  |   28 -
 .../hip_fmha/ck_fmha_forward_gemm_constants.h |  110 -
 .../hip_fmha/ck_fmha_grouped_backward.h       |  525 ----
 .../ck_fmha_grouped_backward_bp16.cpp         |  113 -
 .../ck_fmha_grouped_backward_fp16.cpp         |  113 -
 .../hip_fmha/ck_fmha_grouped_forward.h        |  375 ---
 .../hip_fmha/ck_fmha_grouped_forward_bp16.cpp |   63 -
 .../hip_fmha/ck_fmha_grouped_forward_fp16.cpp |   63 -
 .../hip_fmha/ck_fmha_grouped_infer.h          |  359 ---
 .../hip_fmha/ck_fmha_grouped_infer_bp16.cpp   |   63 -
 .../hip_fmha/ck_fmha_grouped_infer_fp16.cpp   |   63 -
 .../hip_fmha/ck_fmha_infer_gemm_constants.h   |  106 -
 .../attention/hip_fmha/ck_fmha_op_helper.h    |   49 -
 .../csrc/attention/hip_fmha/ck_fmha_params.h  |  212 --
 .../csrc/attention/hip_fmha/ck_fmha_test.cpp  |   14 -
 ...d_backward_bp16_masktype_0_no_attnbias.cpp |   14 -
 ..._bp16_masktype_0_no_attnbias_fp32_grad.cpp |   14 -
 ...backward_bp16_masktype_0_with_attnbias.cpp |   14 -
 ...p16_masktype_0_with_attnbias_fp32_grad.cpp |   14 -
 ...d_backward_bp16_masktype_1_no_attnbias.cpp |   14 -
 ..._bp16_masktype_1_no_attnbias_fp32_grad.cpp |   14 -
 ...backward_bp16_masktype_1_with_attnbias.cpp |   14 -
 ...p16_masktype_1_with_attnbias_fp32_grad.cpp |   14 -
 ...d_backward_bp16_masktype_2_no_attnbias.cpp |   14 -
 ..._bp16_masktype_2_no_attnbias_fp32_grad.cpp |   14 -
 ...backward_bp16_masktype_2_with_attnbias.cpp |   14 -
 ...p16_masktype_2_with_attnbias_fp32_grad.cpp |   14 -
 ...d_backward_fp16_masktype_0_no_attnbias.cpp |   14 -
 ..._fp16_masktype_0_no_attnbias_fp32_grad.cpp |   14 -
 ...backward_fp16_masktype_0_with_attnbias.cpp |   14 -
 ...p16_masktype_0_with_attnbias_fp32_grad.cpp |   14 -
 ...d_backward_fp16_masktype_1_no_attnbias.cpp |   14 -
 ..._fp16_masktype_1_no_attnbias_fp32_grad.cpp |   14 -
 ...backward_fp16_masktype_1_with_attnbias.cpp |   14 -
 ...p16_masktype_1_with_attnbias_fp32_grad.cpp |   16 -
 ...d_backward_fp16_masktype_2_no_attnbias.cpp |   14 -
 ..._fp16_masktype_2_no_attnbias_fp32_grad.cpp |   14 -
 ...backward_fp16_masktype_2_with_attnbias.cpp |   14 -
 ...p16_masktype_2_with_attnbias_fp32_grad.cpp |   14 -
 ...ed_forward_bp16_masktype_0_no_attnbias.cpp |   13 -
 ..._forward_bp16_masktype_0_with_attnbias.cpp |   13 -
 ...ed_forward_bp16_masktype_1_no_attnbias.cpp |   13 -
 ..._forward_bp16_masktype_1_with_attnbias.cpp |   13 -
 ...ed_forward_bp16_masktype_2_no_attnbias.cpp |   13 -
 ..._forward_bp16_masktype_2_with_attnbias.cpp |   13 -
 ...ed_forward_fp16_masktype_0_no_attnbias.cpp |   13 -
 ..._forward_fp16_masktype_0_with_attnbias.cpp |   13 -
 ...ed_forward_fp16_masktype_1_no_attnbias.cpp |   13 -
 ..._forward_fp16_masktype_1_with_attnbias.cpp |   13 -
 ...ed_forward_fp16_masktype_2_no_attnbias.cpp |   13 -
 ..._forward_fp16_masktype_2_with_attnbias.cpp |   13 -
 ...ched_infer_bp16_masktype_0_no_attnbias.cpp |   14 -
 ...ed_infer_bp16_masktype_0_with_attnbias.cpp |   14 -
 ...ched_infer_bp16_masktype_1_no_attnbias.cpp |   14 -
 ...ed_infer_bp16_masktype_1_with_attnbias.cpp |   14 -
 ...ched_infer_bp16_masktype_2_no_attnbias.cpp |   14 -
 ...ed_infer_bp16_masktype_2_with_attnbias.cpp |   14 -
 ...ched_infer_fp16_masktype_0_no_attnbias.cpp |   14 -
 ...ed_infer_fp16_masktype_0_with_attnbias.cpp |   14 -
 ...ched_infer_fp16_masktype_1_no_attnbias.cpp |   14 -
 ...ed_infer_fp16_masktype_1_with_attnbias.cpp |   14 -
 ...ched_infer_fp16_masktype_2_no_attnbias.cpp |   14 -
 ...ed_infer_fp16_masktype_2_with_attnbias.cpp |   14 -
 ...d_backward_bp16_masktype_0_no_attnbias.cpp |   14 -
 ..._bp16_masktype_0_no_attnbias_fp32_grad.cpp |   14 -
 ...backward_bp16_masktype_0_with_attnbias.cpp |   14 -
 ...p16_masktype_0_with_attnbias_fp32_grad.cpp |   14 -
 ...d_backward_bp16_masktype_1_no_attnbias.cpp |   14 -
 ..._bp16_masktype_1_no_attnbias_fp32_grad.cpp |   14 -
 ...backward_bp16_masktype_1_with_attnbias.cpp |   14 -
 ...p16_masktype_1_with_attnbias_fp32_grad.cpp |   14 -
 ...d_backward_bp16_masktype_2_no_attnbias.cpp |   14 -
 ..._bp16_masktype_2_no_attnbias_fp32_grad.cpp |   14 -
 ...backward_bp16_masktype_2_with_attnbias.cpp |   14 -
 ...p16_masktype_2_with_attnbias_fp32_grad.cpp |   14 -
 ...d_backward_fp16_masktype_0_no_attnbias.cpp |   14 -
 ..._fp16_masktype_0_no_attnbias_fp32_grad.cpp |   14 -
 ...backward_fp16_masktype_0_with_attnbias.cpp |   14 -
 ...p16_masktype_0_with_attnbias_fp32_grad.cpp |   14 -
 ...d_backward_fp16_masktype_1_no_attnbias.cpp |   14 -
 ..._fp16_masktype_1_no_attnbias_fp32_grad.cpp |   14 -
 ...backward_fp16_masktype_1_with_attnbias.cpp |   14 -
 ...p16_masktype_1_with_attnbias_fp32_grad.cpp |   14 -
 ...d_backward_fp16_masktype_2_no_attnbias.cpp |   14 -
 ..._fp16_masktype_2_no_attnbias_fp32_grad.cpp |   14 -
 ...backward_fp16_masktype_2_with_attnbias.cpp |   14 -
 ...p16_masktype_2_with_attnbias_fp32_grad.cpp |   14 -
 ...ed_forward_bp16_masktype_0_no_attnbias.cpp |   13 -
 ..._forward_bp16_masktype_0_with_attnbias.cpp |   13 -
 ...ed_forward_bp16_masktype_1_no_attnbias.cpp |   13 -
 ..._forward_bp16_masktype_1_with_attnbias.cpp |   13 -
 ...ed_forward_bp16_masktype_2_no_attnbias.cpp |   13 -
 ..._forward_bp16_masktype_2_with_attnbias.cpp |   13 -
 ...ed_forward_fp16_masktype_0_no_attnbias.cpp |   13 -
 ..._forward_fp16_masktype_0_with_attnbias.cpp |   13 -
 ...ed_forward_fp16_masktype_1_no_attnbias.cpp |   13 -
 ..._forward_fp16_masktype_1_with_attnbias.cpp |   13 -
 ...ed_forward_fp16_masktype_2_no_attnbias.cpp |   13 -
 ..._forward_fp16_masktype_2_with_attnbias.cpp |   13 -
 ...uped_infer_bp16_masktype_0_no_attnbias.cpp |   14 -
 ...ed_infer_bp16_masktype_0_with_attnbias.cpp |   14 -
 ...uped_infer_bp16_masktype_1_no_attnbias.cpp |   14 -
 ...ed_infer_bp16_masktype_1_with_attnbias.cpp |   14 -
 ...uped_infer_bp16_masktype_2_no_attnbias.cpp |   14 -
 ...ed_infer_bp16_masktype_2_with_attnbias.cpp |   14 -
 ...uped_infer_fp16_masktype_0_no_attnbias.cpp |   14 -
 ...ed_infer_fp16_masktype_0_with_attnbias.cpp |   14 -
 ...uped_infer_fp16_masktype_1_no_attnbias.cpp |   14 -
 ...ed_infer_fp16_masktype_1_with_attnbias.cpp |   14 -
 ...uped_infer_fp16_masktype_2_no_attnbias.cpp |   14 -
 ...ed_infer_fp16_masktype_2_with_attnbias.cpp |   14 -
 xformers/ops/fmha/ck.py                       |  115 +-
 132 files changed, 112 insertions(+), 9713 deletions(-)
 delete mode 100644 tests/test_mem_eff_attention_ck_discarded.py
 delete mode 100644 tests/test_mqa_forward_ck_tiled_discarded.py
 delete mode 100644 xformers/csrc/attention/hip_fmha/attention_backward_generic.cpp
 delete mode 100644 xformers/csrc/attention/hip_fmha/attention_ck_rand_uniform.cpp
 delete mode 100644 xformers/csrc/attention/hip_fmha/attention_forward_generic.cpp
 delete mode 100644 xformers/csrc/attention/hip_fmha/ck_align_switch.h
 delete mode 100644 xformers/csrc/attention/hip_fmha/ck_bool_switch.h
 delete mode 100644 xformers/csrc/attention/hip_fmha/ck_fmha_backward_gemm_constants.h
 delete mode 100644 xformers/csrc/attention/hip_fmha/ck_fmha_batched_backward.h
 delete mode 100644 xformers/csrc/attention/hip_fmha/ck_fmha_batched_backward_bp16.cpp
 delete mode 100644 xformers/csrc/attention/hip_fmha/ck_fmha_batched_backward_fp16.cpp
 delete mode 100644 xformers/csrc/attention/hip_fmha/ck_fmha_batched_forward.h
 delete mode 100644 xformers/csrc/attention/hip_fmha/ck_fmha_batched_forward_bp16.cpp
 delete mode 100644 xformers/csrc/attention/hip_fmha/ck_fmha_batched_forward_fp16.cpp
 delete mode 100644 xformers/csrc/attention/hip_fmha/ck_fmha_batched_infer.h
 delete mode 100644 xformers/csrc/attention/hip_fmha/ck_fmha_batched_infer_bp16.cpp
 delete mode 100644 xformers/csrc/attention/hip_fmha/ck_fmha_batched_infer_fp16.cpp
 delete mode 100644 xformers/csrc/attention/hip_fmha/ck_fmha_common_gemm_constants.h
 delete mode 100644 xformers/csrc/attention/hip_fmha/ck_fmha_forward_gemm_constants.h
 delete mode 100644 xformers/csrc/attention/hip_fmha/ck_fmha_grouped_backward.h
 delete mode 100644 xformers/csrc/attention/hip_fmha/ck_fmha_grouped_backward_bp16.cpp
 delete mode 100644 xformers/csrc/attention/hip_fmha/ck_fmha_grouped_backward_fp16.cpp
 delete mode 100644 xformers/csrc/attention/hip_fmha/ck_fmha_grouped_forward.h
 delete mode 100644 xformers/csrc/attention/hip_fmha/ck_fmha_grouped_forward_bp16.cpp
 delete mode 100644 xformers/csrc/attention/hip_fmha/ck_fmha_grouped_forward_fp16.cpp
 delete mode 100644 xformers/csrc/attention/hip_fmha/ck_fmha_grouped_infer.h
 delete mode 100644 xformers/csrc/attention/hip_fmha/ck_fmha_grouped_infer_bp16.cpp
 delete mode 100644 xformers/csrc/attention/hip_fmha/ck_fmha_grouped_infer_fp16.cpp
 delete mode 100644 xformers/csrc/attention/hip_fmha/ck_fmha_infer_gemm_constants.h
 delete mode 100644 xformers/csrc/attention/hip_fmha/ck_fmha_op_helper.h
 delete mode 100644 xformers/csrc/attention/hip_fmha/ck_fmha_params.h
 delete mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_bp16_masktype_0_no_attnbias.cpp
 delete mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_bp16_masktype_0_no_attnbias_fp32_grad.cpp
 delete mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_bp16_masktype_0_with_attnbias.cpp
 delete mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_bp16_masktype_0_with_attnbias_fp32_grad.cpp
 delete mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_bp16_masktype_1_no_attnbias.cpp
 delete mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_bp16_masktype_1_no_attnbias_fp32_grad.cpp
 delete mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_bp16_masktype_1_with_attnbias.cpp
 delete mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_bp16_masktype_1_with_attnbias_fp32_grad.cpp
 delete mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_bp16_masktype_2_no_attnbias.cpp
 delete mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_bp16_masktype_2_no_attnbias_fp32_grad.cpp
 delete mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_bp16_masktype_2_with_attnbias.cpp
 delete mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_bp16_masktype_2_with_attnbias_fp32_grad.cpp
 delete mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_fp16_masktype_0_no_attnbias.cpp
 delete mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_fp16_masktype_0_no_attnbias_fp32_grad.cpp
 delete mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_fp16_masktype_0_with_attnbias.cpp
 delete mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_fp16_masktype_0_with_attnbias_fp32_grad.cpp
 delete mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_fp16_masktype_1_no_attnbias.cpp
 delete mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_fp16_masktype_1_no_attnbias_fp32_grad.cpp
 delete mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_fp16_masktype_1_with_attnbias.cpp
 delete mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_fp16_masktype_1_with_attnbias_fp32_grad.cpp
 delete mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_fp16_masktype_2_no_attnbias.cpp
 delete mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_fp16_masktype_2_no_attnbias_fp32_grad.cpp
 delete mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_fp16_masktype_2_with_attnbias.cpp
 delete mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_fp16_masktype_2_with_attnbias_fp32_grad.cpp
 delete mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_forward_bp16_masktype_0_no_attnbias.cpp
 delete mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_forward_bp16_masktype_0_with_attnbias.cpp
 delete mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_forward_bp16_masktype_1_no_attnbias.cpp
 delete mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_forward_bp16_masktype_1_with_attnbias.cpp
 delete mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_forward_bp16_masktype_2_no_attnbias.cpp
 delete mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_forward_bp16_masktype_2_with_attnbias.cpp
 delete mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_forward_fp16_masktype_0_no_attnbias.cpp
 delete mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_forward_fp16_masktype_0_with_attnbias.cpp
 delete mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_forward_fp16_masktype_1_no_attnbias.cpp
 delete mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_forward_fp16_masktype_1_with_attnbias.cpp
 delete mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_forward_fp16_masktype_2_no_attnbias.cpp
 delete mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_forward_fp16_masktype_2_with_attnbias.cpp
 delete mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_infer_bp16_masktype_0_no_attnbias.cpp
 delete mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_infer_bp16_masktype_0_with_attnbias.cpp
 delete mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_infer_bp16_masktype_1_no_attnbias.cpp
 delete mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_infer_bp16_masktype_1_with_attnbias.cpp
 delete mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_infer_bp16_masktype_2_no_attnbias.cpp
 delete mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_infer_bp16_masktype_2_with_attnbias.cpp
 delete mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_infer_fp16_masktype_0_no_attnbias.cpp
 delete mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_infer_fp16_masktype_0_with_attnbias.cpp
 delete mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_infer_fp16_masktype_1_no_attnbias.cpp
 delete mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_infer_fp16_masktype_1_with_attnbias.cpp
 delete mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_infer_fp16_masktype_2_no_attnbias.cpp
 delete mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_infer_fp16_masktype_2_with_attnbias.cpp
 delete mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_bp16_masktype_0_no_attnbias.cpp
 delete mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_bp16_masktype_0_no_attnbias_fp32_grad.cpp
 delete mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_bp16_masktype_0_with_attnbias.cpp
 delete mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_bp16_masktype_0_with_attnbias_fp32_grad.cpp
 delete mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_bp16_masktype_1_no_attnbias.cpp
 delete mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_bp16_masktype_1_no_attnbias_fp32_grad.cpp
 delete mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_bp16_masktype_1_with_attnbias.cpp
 delete mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_bp16_masktype_1_with_attnbias_fp32_grad.cpp
 delete mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_bp16_masktype_2_no_attnbias.cpp
 delete mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_bp16_masktype_2_no_attnbias_fp32_grad.cpp
 delete mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_bp16_masktype_2_with_attnbias.cpp
 delete mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_bp16_masktype_2_with_attnbias_fp32_grad.cpp
 delete mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_fp16_masktype_0_no_attnbias.cpp
 delete mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_fp16_masktype_0_no_attnbias_fp32_grad.cpp
 delete mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_fp16_masktype_0_with_attnbias.cpp
 delete mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_fp16_masktype_0_with_attnbias_fp32_grad.cpp
 delete mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_fp16_masktype_1_no_attnbias.cpp
 delete mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_fp16_masktype_1_no_attnbias_fp32_grad.cpp
 delete mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_fp16_masktype_1_with_attnbias.cpp
 delete mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_fp16_masktype_1_with_attnbias_fp32_grad.cpp
 delete mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_fp16_masktype_2_no_attnbias.cpp
 delete mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_fp16_masktype_2_no_attnbias_fp32_grad.cpp
 delete mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_fp16_masktype_2_with_attnbias.cpp
 delete mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_fp16_masktype_2_with_attnbias_fp32_grad.cpp
 delete mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_forward_bp16_masktype_0_no_attnbias.cpp
 delete mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_forward_bp16_masktype_0_with_attnbias.cpp
 delete mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_forward_bp16_masktype_1_no_attnbias.cpp
 delete mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_forward_bp16_masktype_1_with_attnbias.cpp
 delete mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_forward_bp16_masktype_2_no_attnbias.cpp
 delete mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_forward_bp16_masktype_2_with_attnbias.cpp
 delete mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_forward_fp16_masktype_0_no_attnbias.cpp
 delete mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_forward_fp16_masktype_0_with_attnbias.cpp
 delete mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_forward_fp16_masktype_1_no_attnbias.cpp
 delete mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_forward_fp16_masktype_1_with_attnbias.cpp
 delete mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_forward_fp16_masktype_2_no_attnbias.cpp
 delete mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_forward_fp16_masktype_2_with_attnbias.cpp
 delete mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_infer_bp16_masktype_0_no_attnbias.cpp
 delete mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_infer_bp16_masktype_0_with_attnbias.cpp
 delete mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_infer_bp16_masktype_1_no_attnbias.cpp
 delete mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_infer_bp16_masktype_1_with_attnbias.cpp
 delete mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_infer_bp16_masktype_2_no_attnbias.cpp
 delete mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_infer_bp16_masktype_2_with_attnbias.cpp
 delete mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_infer_fp16_masktype_0_no_attnbias.cpp
 delete mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_infer_fp16_masktype_0_with_attnbias.cpp
 delete mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_infer_fp16_masktype_1_no_attnbias.cpp
 delete mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_infer_fp16_masktype_1_with_attnbias.cpp
 delete mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_infer_fp16_masktype_2_no_attnbias.cpp
 delete mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_infer_fp16_masktype_2_with_attnbias.cpp

diff --git a/setup.py b/setup.py
index 14462cf74..312bf4d2d 100644
--- a/setup.py
+++ b/setup.py
@@ -278,132 +278,61 @@ def get_extensions():
         ),
     ]
 
-    if os.getenv("FORCE_OLD_CK_KERNEL", "0") == "1":
-        source_hip += glob.glob(
-            os.path.join(
-                extensions_dir, "attention", "hip_fmha", "attention_forward_generic.cpp"
-            ),
-            recursive=False,
-        )
-        source_hip += glob.glob(
-            os.path.join(
-                extensions_dir,
-                "attention",
-                "hip_fmha",
-                "attention_backward_generic.cpp",
-            ),
-            recursive=False,
-        )
-        source_hip += glob.glob(
-            os.path.join(
-                extensions_dir, "attention", "hip_fmha", "attention_ck_rand_uniform.cpp"
-            ),
-            recursive=False,
-        )
-        source_hip += glob.glob(
-            os.path.join(
-                extensions_dir, "attention", "hip_fmha", "ck_fmha_batched_infer_*.cpp"
-            ),
-            recursive=False,
-        )
-        source_hip += glob.glob(
-            os.path.join(
-                extensions_dir, "attention", "hip_fmha", "ck_fmha_grouped_infer_*.cpp"
-            ),
-            recursive=False,
-        )
-        source_hip += glob.glob(
-            os.path.join(
-                extensions_dir, "attention", "hip_fmha", "ck_fmha_batched_forward_*.cpp"
-            ),
-            recursive=False,
-        )
-        source_hip += glob.glob(
-            os.path.join(
-                extensions_dir, "attention", "hip_fmha", "ck_fmha_grouped_forward_*.cpp"
-            ),
-            recursive=False,
-        )
-        source_hip += glob.glob(
-            os.path.join(
-                extensions_dir,
-                "attention",
-                "hip_fmha",
-                "ck_fmha_batched_backward_*.cpp",
-            ),
-            recursive=False,
-        )
-        source_hip += glob.glob(
-            os.path.join(
-                extensions_dir,
-                "attention",
-                "hip_fmha",
-                "ck_fmha_grouped_backward_*.cpp",
-            ),
-            recursive=False,
-        )
-        source_hip += glob.glob(
-            os.path.join(
-                extensions_dir, "attention", "hip_fmha", "instances", "ck_fmha_*.cpp"
-            ),
-            recursive=False,
-        )
-    else:
-        source_hip += glob.glob(
-            os.path.join(
-                extensions_dir,
-                "attention",
-                "hip_fmha",
-                "attention_forward_generic_ck_tiled.cpp",
-            ),
-            recursive=False,
-        )
-        source_hip += glob.glob(
-            os.path.join(
-                extensions_dir,
-                "attention",
-                "hip_fmha",
-                "ck_tiled_fmha_batched_infer_*.cpp",
-            ),
-            recursive=False,
-        )
-        source_hip += glob.glob(
-            os.path.join(
-                extensions_dir,
-                "attention",
-                "hip_fmha",
-                "ck_tiled_fmha_grouped_infer_*.cpp",
-            ),
-            recursive=False,
-        )
-        source_hip += glob.glob(
-            os.path.join(
-                extensions_dir,
-                "attention",
-                "hip_fmha",
-                "ck_tiled_fmha_batched_forward_*.cpp",
-            ),
-            recursive=False,
-        )
-        source_hip += glob.glob(
-            os.path.join(
-                extensions_dir,
-                "attention",
-                "hip_fmha",
-                "ck_tiled_fmha_grouped_forward_*.cpp",
-            ),
-            recursive=False,
-        )
-        source_hip += glob.glob(
-            os.path.join(
-                extensions_dir,
-                "attention",
-                "hip_fmha",
-                "instances_tiled",
-                "ck_tiled_fmha_*.cpp",
-            ),
-            recursive=False,
-        )
+    source_hip += glob.glob(
+        os.path.join(
+            extensions_dir,
+            "attention",
+            "hip_fmha",
+            "attention_forward_generic_ck_tiled.cpp",
+        ),
+        recursive=False,
+    )
+    source_hip += glob.glob(
+        os.path.join(
+            extensions_dir,
+            "attention",
+            "hip_fmha",
+            "ck_tiled_fmha_batched_infer_*.cpp",
+        ),
+        recursive=False,
+    )
+    source_hip += glob.glob(
+        os.path.join(
+            extensions_dir,
+            "attention",
+            "hip_fmha",
+            "ck_tiled_fmha_grouped_infer_*.cpp",
+        ),
+        recursive=False,
+    )
+    source_hip += glob.glob(
+        os.path.join(
+            extensions_dir,
+            "attention",
+            "hip_fmha",
+            "ck_tiled_fmha_batched_forward_*.cpp",
+        ),
+        recursive=False,
+    )
+    source_hip += glob.glob(
+        os.path.join(
+            extensions_dir,
+            "attention",
+            "hip_fmha",
+            "ck_tiled_fmha_grouped_forward_*.cpp",
+        ),
+        recursive=False,
+    )
+    source_hip += glob.glob(
+        os.path.join(
+            extensions_dir,
+            "attention",
+            "hip_fmha",
+            "instances_tiled",
+            "ck_tiled_fmha_*.cpp",
+        ),
+        recursive=False,
+    )
 
     source_hip += source_hip_decoder
 
@@ -497,19 +426,12 @@ def get_extensions():
             Path(this_dir) / "xformers" / "csrc" / "attention" / "hip_fmha"
         ]
 
-        if os.getenv("FORCE_OLD_CK_KERNEL", "0") == "1":
-            include_dirs += [
-                Path(this_dir) / "third_party" / "composable_kernel" / "include"
-            ]
-        else:
-            include_dirs += [
-                Path(this_dir) / "third_party" / "composable_kernel_tiled" / "include"
-            ]
+        include_dirs += [
+            Path(this_dir) / "third_party" / "composable_kernel_tiled" / "include"
+        ]
+
+        generator_flag = []
 
-        if os.getenv("FORCE_OLD_CK_KERNEL", "0") == "1":
-            generator_flag = []
-        else:
-            generator_flag = ["-DUSE_CK_TILED_KERNEL"]
         cc_flag = ["-DBUILD_PYTHON_PACKAGE"]
         extra_compile_args = {
             "cxx": ["-O3", "-std=c++17"] + generator_flag,
diff --git a/tests/test_checkpoint.py b/tests/test_checkpoint.py
index 722a3eb8c..d01abee67 100644
--- a/tests/test_checkpoint.py
+++ b/tests/test_checkpoint.py
@@ -126,7 +126,7 @@ def test_checkpoint_attention(policy_fn, input_requires_grad, device, autocast,
     ):
         pytest.skip("FlashAttentionOp is not supported on ROCM!")
 
-    if op is xformers.ops.MemoryEfficientAttentionCkOp and op[0].IS_CK_TILED:
+    if op is xformers.ops.MemoryEfficientAttentionCkOp:
         pytest.skip("Gradience is currently not supported by ck-tiled!")
 
     class Attn(nn.Module):
diff --git a/tests/test_mem_eff_attention.py b/tests/test_mem_eff_attention.py
index 4a460ca3c..72d7db48a 100644
--- a/tests/test_mem_eff_attention.py
+++ b/tests/test_mem_eff_attention.py
@@ -745,7 +745,7 @@ def test_mqa_forward(
 
     device = torch.device("cuda")
 
-    if op is fmha.ck.FwOp and not op.IS_CK_TILED:
+    if op is fmha.ck.FwOp:
         pytest.skip("mqa/gqa is only supported with ck-tiled fmha")
 
     torch.manual_seed(B * M + N * K + Hq * Hkv + Kv)
@@ -845,7 +845,7 @@ def test_logsumexp(opFW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv):
         kv,
     ) = opFW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv
 
-    if op is fmha.ck.FwOp and op.IS_CK_TILED:
+    if op is fmha.ck.FwOp:
         pytest.skip("logsumexp is not yet supported by ck-tiled fmha!")
 
     if op is fmha.triton_splitk.FwOp and (
@@ -1500,7 +1500,7 @@ def test_grad_checkpointing(
     ) = opFW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv
     if op is fmha.triton.FwOp:
         pytest.skip("Triton Flash Attention 2 doesn't support backward pass yet")
-    if op is fmha.ck.FwOp and op.IS_CK_TILED:
+    if op is fmha.ck.FwOp:
         pytest.skip("ck-tiled FMHA doesn't supported backward pass yet")
     if op is fmha.triton_splitk.FwOp and (
         sys.version_info.major,
@@ -2119,7 +2119,7 @@ def test_attn_bias_blockdiag_doc() -> None:
 
     from xformers.ops import fmha
 
-    if torch.version.hip and fmha.ck.FwOp.IS_CK_TILED:
+    if torch.version.hip:
         pytest.skip("backward pass/gradience is not yet supported by ck-tiled fmha!")
 
     K = 16
@@ -2567,7 +2567,7 @@ def test_empty_tensors_empty_query(
     )
     opFW = opFW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv[0]
 
-    if torch.version.hip and fmha.ck.FwOp.IS_CK_TILED:
+    if torch.version.hip:
         pytest.skip("backward pass/gradience is not yet supported by ck-tiled fmha!")
 
     if opFW is fmha.triton_splitk.FwOp and (
@@ -2598,7 +2598,7 @@ def test_empty_tensors_empty_kv(
     )
     opFW = opFW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv[0]
 
-    if torch.version.hip and fmha.ck.FwOp.IS_CK_TILED:
+    if torch.version.hip:
         pytest.skip("backward pass/gradience is not yet supported by ck-tiled fmha!")
 
     if opFW is fmha.triton_splitk.FwOp and (
@@ -2629,7 +2629,7 @@ def test_empty_tensors_empty_b(
     )
     opFW = opFW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv[0]
 
-    if torch.version.hip and fmha.ck.FwOp.IS_CK_TILED:
+    if torch.version.hip:
         pytest.skip("backward pass/gradience is not yet supported by ck-tiled fmha!")
 
     if opFW is fmha.triton_splitk.FwOp and (
diff --git a/tests/test_mem_eff_attention_ck_discarded.py b/tests/test_mem_eff_attention_ck_discarded.py
deleted file mode 100644
index 2879e6946..000000000
--- a/tests/test_mem_eff_attention_ck_discarded.py
+++ /dev/null
@@ -1,2466 +0,0 @@
-# Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
-#
-# This source code is licensed under the BSD license found in the
-# LICENSE file in the root directory of this source tree.
-
-import math
-import random
-from functools import partial
-from typing import List, Optional, Sequence, Tuple, Type, TypeVar
-
-import pytest
-import torch
-import torch.nn.functional as F
-from torch.utils.checkpoint import checkpoint
-
-import xformers.ops
-from xformers.attn_bias_utils import create_attn_bias
-from xformers.ops import fmha
-from xformers.ops.fmha.common import AttentionOpBase
-from xformers.ops.fmha.dispatch import _dispatch_fw_priority_list
-
-from .utils import assert_allclose
-
-torch.backends.cuda.matmul.allow_tf32 = False
-cuda_only = pytest.mark.skipif(not torch.cuda.is_available(), reason="requires CUDA")
-_devices = ["cpu", "cuda"] if torch.cuda.is_available() else ["cpu"]
-_types = [torch.float16, torch.bfloat16]
-
-T = TypeVar(
-    "T", Type[fmha.common.AttentionFwOpBase], Type[fmha.common.AttentionBwOpBase]
-)
-
-ALL_FW_OPS: Sequence[Type[fmha.common.AttentionFwOpBase]] = [
-    fmha.ck.FwOp,
-]
-
-ALL_BW_OPS: Sequence[Type[fmha.common.AttentionBwOpBase]] = [
-    fmha.ck.BwOp,
-]
-
-
-def sample_random_supported_fw(
-    inp: fmha.Inputs, seed: int
-) -> Type[fmha.common.AttentionFwOpBase]:
-    r = random.Random(seed)
-    fw_ops = list(ALL_FW_OPS)
-    r.shuffle(fw_ops)
-    for op in fw_ops:
-        if op.supports(inp):
-            return op
-    raise NotImplementedError(f"Could not find a FW operator for: {inp}")
-
-
-def generate_test_shapes_B_Mq_Mkv_H_K_Kv(op):
-    shapes = []
-    for B in op._TEST_BATCH_SIZES:
-        for Mq in [32, 256]:
-            for Mkv in [32, 64, 256, 1024]:
-                for K in op._TEST_K:
-                    shapes.append((B, Mq, Mkv, 1, K, K))
-        Mq = 256
-        Mkv = 128
-        K = 32
-        H = 1
-        # Weird values of parameters
-        for M in [2, 3, 15, 31, 32, 34, 68, 72, 90, 132, 136]:
-            shapes.append((B, M, Mkv, H, K, K))
-            shapes.append((B, Mq, M, H, K, K))
-        for _K in [1, 2, 3, 31, 34, 36, 38, 40, 64, 80, 160, 256 + 2, 256 + 8, 512]:
-            if _K <= op.SUPPORTED_MAX_K:
-                shapes.append((B, Mq, Mkv, H, _K, _K))
-        # Different value for K / Kv
-        if op.SUPPORTS_DIFFERENT_VALUE_EMBED:
-            for _K in [32, 36, 64, 256 + 8]:
-                shapes.append((B, Mq, Mkv, H, K, _K))
-                shapes.append((B, Mq, Mkv, H, _K, K))
-        # Exotic sizes
-        for K in op._TEST_K:
-            shapes.append((B, 16, 1024, H, K, K))
-            shapes.append((B, 1024, 16, H, K, K))
-        # Some number of heads
-        for H in [3, 5, 12]:
-            shapes.append((max(1, B // H), Mq, Mkv, H, K, K))
-    # Filter-out not supported shapes
-    shapes = [
-        shape
-        for shape in shapes
-        if len(
-            op.shape_not_supported_reasons(
-                Mq=shape[1], Mkv=shape[2], K=shape[4], Kv=shape[5]
-            )
-        )
-        == 0
-    ]
-    # Add some random shapes
-    if op in [
-        fmha.cutlass.FwOp,
-        fmha.cutlass.BwOp,
-        fmha.flash.BwOp,
-    ]:
-        K_CHOICES = [8 * i for i in range(1, 256 // 8)]
-        r = random.Random(0)
-        found_count = 0
-        while found_count < 200:
-            B = r.randint(1, 400)
-            Mq = r.randint(1, 500)
-            Mkv = r.randint(1, 500)
-            H = r.randint(2, 11)
-            B = max(B // H, 1)
-            K = r.choice(K_CHOICES)
-            Kv = r.choice(K_CHOICES)
-            if not op.SUPPORTS_DIFFERENT_VALUE_EMBED:
-                Kv = K
-            if len(op.shape_not_supported_reasons(Mq, Mkv, K, Kv)):
-                continue
-            found_count += 1
-            shapes.append((B, Mq, Mkv, H, K, Kv))
-    return shapes
-
-
-def make_id(op, device, dtype, bias_type, *shape):
-    return (
-        f"{op.NAME}-{device}-{str(dtype)}-{bias_type.__name__}"
-        f"-{'-'.join([str(s) for s in shape])}"
-    )
-
-
-def _generate_op_device_dtype_biasT_B_Mq_Mkv_H_K_Kv(
-    ops_list: Sequence[Type[fmha.AttentionOpBase]], max_shapes_per_op: int = 65000
-):
-    r = random.Random(0)
-    combination = []
-    ids = []
-    for op in ops_list:
-        op_count = 0
-        # Sort list of masks, so it's deterministic across runs
-        LIST_MASKS = list(sorted(op.SUPPORTED_ATTN_BIAS_TYPES, key=lambda x: str(x)))
-        for shape in generate_test_shapes_B_Mq_Mkv_H_K_Kv(op):
-            has_one = False
-            for device in _devices:
-                if device not in op.SUPPORTED_DEVICES:
-                    continue
-                for dtype in op.SUPPORTED_DTYPES:
-                    bias_type = r.choice(LIST_MASKS)
-                    # Avoid using too much memory
-                    if bias_type not in [
-                        type(None),
-                        fmha.attn_bias.LowerTriangularMask,
-                    ]:
-                        B, Mq, Mkv, H, K, Kv = shape
-                        B = min(B, 12)
-
-                        if bias_type in {
-                            fmha.attn_bias.BlockDiagonalCausalFromBottomRightMask,
-                            fmha.attn_bias.BlockDiagonalCausalLocalAttentionFromBottomRightMask,
-                        }:
-                            Mq, Mkv = min(Mkv, Mq), max(Mkv, Mq) + 2
-                        elif (
-                            bias_type
-                            is fmha.attn_bias.BlockDiagonalCausalWithOffsetPaddedKeysMask
-                        ):
-                            Mq, Mkv = min(Mkv, Mq), max(Mkv, Mq)
-                        shape = (B, Mq, Mkv, H, K, Kv)
-                    combination.append((op, device, dtype, bias_type, *shape))
-                    ids.append(
-                        f"{op.NAME}-{device}-{str(dtype)}-{bias_type.__name__}"
-                        f"-{'-'.join([str(s) for s in shape])}"
-                    )
-                    has_one = True
-            if has_one:
-                op_count += 1
-            if op_count > max_shapes_per_op:
-                break
-        # Some specific shapes for which we want to run without any mask
-        bias_type = type(None)
-        for shape in (
-            # Some strides/dims don't fit on an uint16
-            (1, 128, 128, 300, 128, 128),
-            (13, 1, 67, 200, 8, 8),
-            (1, 1 + 2**16, 4, 1, 8, 8),
-            (1, 4, 1 + 2**16, 1, 8, 8),
-            # TODO: Some strides don't fit on an uint32
-            # Crashes on Flash, Errors on Cutlass
-            # (1, 1, 64000, 300, 128, 128)
-        ):
-            for device in _devices:
-                if device not in op.SUPPORTED_DEVICES:
-                    continue
-                for dtype in op.SUPPORTED_DTYPES:
-                    combination.append((op, device, dtype, bias_type, *shape))
-    return {
-        "argvalues": combination,
-        "ids": [make_id(*c) for c in combination],
-    }
-
-
-parametrize_opFW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv = pytest.mark.parametrize(
-    "opFW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv",
-    **_generate_op_device_dtype_biasT_B_Mq_Mkv_H_K_Kv(ALL_FW_OPS),
-)
-parametrize_opFW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv__xs = pytest.mark.parametrize(
-    "opFW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv",
-    **_generate_op_device_dtype_biasT_B_Mq_Mkv_H_K_Kv(ALL_FW_OPS, max_shapes_per_op=1),
-)
-parametrize_opBW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv = pytest.mark.parametrize(
-    "opBW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv",
-    **_generate_op_device_dtype_biasT_B_Mq_Mkv_H_K_Kv(ALL_BW_OPS),
-)
-parametrize_opBW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv__xs = pytest.mark.parametrize(
-    "opBW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv",
-    **_generate_op_device_dtype_biasT_B_Mq_Mkv_H_K_Kv(ALL_BW_OPS, max_shapes_per_op=1),
-)
-
-
-def ref_attention(q, k, v, attn_bias=None, drop_mask=None, p=0.0, scale=None):
-    if q.ndim == 5:
-
-        def attn_bias_group(group: int):
-            if isinstance(attn_bias, torch.Tensor):
-                return attn_bias[:, group]
-            if isinstance(attn_bias, fmha.attn_bias.LowerTriangularMaskWithTensorBias):
-                return fmha.attn_bias.LowerTriangularMaskWithTensorBias(
-                    attn_bias._bias[:, group]
-                )
-            return attn_bias
-
-        return torch.stack(
-            [
-                ref_attention_bmhk(
-                    q[:, :, g],
-                    k[:, :, g],
-                    v[:, :, g],
-                    scale=scale,
-                    attn_bias=attn_bias_group(g),
-                )
-                for g in range(q.shape[2])
-            ],
-            dim=2,
-        )
-    if q.ndim == 4:
-        assert p == 0.0
-        return ref_attention_bmhk(q, k, v, scale=scale, attn_bias=attn_bias)
-    q = q.float()
-    k = k.float()
-    v = v.float()
-
-    scale = scale if scale is not None else (1 / q.shape[-1] ** 0.5)
-    q = q * scale
-
-    attn = q @ k.transpose(-2, -1)
-    if attn_bias is not None:
-        if isinstance(attn_bias, xformers.ops.AttentionBias):
-            # Always create in B,H,Mq,Mk format
-            attn_bias_tensor = attn_bias.materialize(
-                (q.shape[0], 1, q.shape[1], k.shape[1]),
-                device=q.device,
-                dtype=torch.float32,
-            )
-        else:
-            attn_bias_tensor = attn_bias
-        if attn_bias_tensor.ndim == 4:
-            assert q.shape[0] == attn_bias_tensor.shape[0] * attn_bias_tensor.shape[1]
-            attn_bias_tensor = attn_bias_tensor.reshape(
-                [-1, *attn_bias_tensor.shape[2:]]
-            )
-        attn = attn + attn_bias_tensor.float()
-    attn = attn.softmax(-1)
-    if drop_mask is not None:
-        attn = attn * (drop_mask / (1 - p))
-    return attn @ v
-
-
-def ref_attention_bmhk(q, k, v, attn_bias, scale=None) -> torch.Tensor:
-    assert q.ndim == 4
-
-    def T(t):
-        return t.permute((0, 2, 1, 3)).reshape(
-            [t.shape[0] * t.shape[2], t.shape[1], t.shape[3]]
-        )
-
-    if isinstance(attn_bias, xformers.ops.AttentionBias):
-        attn_bias = attn_bias.materialize(
-            (q.shape[0], q.shape[2], q.shape[1], k.shape[1]),
-            device=q.device,
-            dtype=torch.float32,
-        ).reshape([q.shape[0] * q.shape[2], q.shape[1], k.shape[1]])
-    out = ref_attention(T(q), T(k), T(v), attn_bias, scale=scale)
-    out = out.reshape([q.shape[0], q.shape[2], q.shape[1], v.shape[3]])
-    return out.permute((0, 2, 1, 3))
-
-
-def ref_attention_splitk_bmhk(
-    q, k, v, attn_bias, scale=None, split_k=None, dtype=None
-) -> torch.Tensor:
-    assert q.ndim == 4
-
-    def T(t):
-        return t.permute((0, 2, 1, 3)).reshape(
-            [t.shape[0] * t.shape[2], t.shape[1], t.shape[3]]
-        )
-
-    if isinstance(attn_bias, xformers.ops.AttentionBias):
-        attn_bias = attn_bias.materialize(
-            (q.shape[0], q.shape[2], q.shape[1], k.shape[1]),
-            device=q.device,
-            dtype=torch.float32,
-        ).reshape([q.shape[0] * q.shape[2], q.shape[1], k.shape[1]])
-    out = ref_attention_splitk(
-        T(q), T(k), T(v), attn_bias, scale=scale, split_k=split_k, dtype=dtype
-    )
-    out = out.reshape([q.shape[0], q.shape[2], q.shape[1], v.shape[3]])
-    return out.permute((0, 2, 1, 3))
-
-
-def ref_attention_splitk(
-    q, k, v, attn_bias, scale=None, split_k=2, dtype=None
-) -> torch.Tensor:
-    if q.ndim == 5:
-
-        def attn_bias_group(group: int):
-            if isinstance(attn_bias, torch.Tensor):
-                return attn_bias[:, group]
-            if isinstance(attn_bias, fmha.attn_bias.LowerTriangularMaskWithTensorBias):
-                return fmha.attn_bias.LowerTriangularMaskWithTensorBias(
-                    attn_bias._bias[:, group]
-                )
-            return attn_bias
-
-        return torch.stack(
-            [
-                ref_attention_splitk_bmhk(
-                    q[:, :, g],
-                    k[:, :, g],
-                    v[:, :, g],
-                    attn_bias=attn_bias_group(g),
-                    split_k=split_k,
-                    dtype=dtype,
-                )
-                for g in range(q.shape[2])
-            ],
-            dim=2,
-        )
-
-    if q.ndim == 4:
-        return ref_attention_splitk_bmhk(
-            q, k, v, attn_bias=attn_bias, split_k=split_k, dtype=dtype
-        )
-    assert q.ndim == 3
-    if dtype is None:
-        dtype = torch.float32
-    q = q.to(dtype=dtype)
-    k = k.to(dtype=dtype)
-    v = v.to(dtype=dtype)
-
-    if scale is None:
-        scale = q.shape[-1] ** -0.5
-    assert not q.isnan().any()
-    q = q * scale
-    assert not q.isnan().any()
-
-    if attn_bias is not None:
-        if isinstance(attn_bias, xformers.ops.AttentionBias):
-            # Always create in B,H,Mq,Mk format
-            attn_bias_tensor = attn_bias.materialize(
-                (q.shape[0], 1, q.shape[1], k.shape[1]),
-                device=q.device,
-                dtype=torch.float32,
-            )
-        else:
-            attn_bias_tensor = attn_bias
-        if attn_bias_tensor.ndim == 4:
-            assert q.shape[0] == attn_bias_tensor.shape[0] * attn_bias_tensor.shape[1]
-            attn_bias_tensor = attn_bias_tensor.reshape(
-                [-1, *attn_bias_tensor.shape[2:]]
-            )
-
-    split_size = k.size(-2) // split_k
-    split_config = {"dim": -2, "split_size_or_sections": split_size}
-    k_split = torch.split(k, **split_config)
-    v_split = torch.split(v, **split_config)
-    attn_bias_split = torch.split(
-        attn_bias_tensor, dim=-1, split_size_or_sections=split_size
-    )
-
-    def compute_attention_split(q_whole, k_slice, v_slice, attn_bias_slice):
-        p_slice = q_whole @ k_slice.transpose(-2, -1)
-        p_slice += attn_bias_slice
-        m = torch.max(p_slice, dim=-1, keepdim=True).values
-        p_slice_scaled = p_slice - m
-        p_slice_scaled[p_slice_scaled.isnan()] = float("-inf")
-        s = torch.exp(p_slice_scaled)
-        l1 = torch.sum(s, dim=-1, keepdim=True)
-        attn_slice = s @ v_slice
-        return {
-            "attn_slice": attn_slice,
-            "row_max": m,
-            "row_lse": l1,
-        }
-
-    splits = list(zip(k_split, v_split, attn_bias_split))
-
-    slices = list(map(lambda s: compute_attention_split(q, s[0], s[1], s[2]), splits))
-    out = torch.zeros_like(q)
-
-    # reduce out over split-k slices
-
-    global_max = torch.zeros_like(slices[0]["row_max"]).fill_(float("-inf"))
-    global_sumexp = torch.zeros_like(slices[0]["row_lse"])
-
-    for s in slices:
-        local_out = s["attn_slice"]
-        local_max = s["row_max"]
-        local_sumexp = s["row_lse"]
-
-        log_alpha = -torch.abs(local_max - global_max)
-        alpha = torch.exp(log_alpha)
-        alpha.nan_to_num_(1.0)
-
-        pick_new = local_max < global_max
-        new_coef = torch.where(pick_new, alpha, 1.0)
-        curr_coef = torch.where(pick_new, 1.0, alpha)
-
-        out = out * curr_coef + local_out * new_coef
-        global_sumexp = global_sumexp * curr_coef + local_sumexp * new_coef
-        global_max = torch.max(local_max, global_max)
-    out /= global_sumexp
-    return out
-
-
-def _rand_partition(r: random.Random, total: int, n: int) -> List[int]:
-    # returns list of n nonnegative integers summing to total
-    idx = {0, total}
-    while len(idx) < n + 1:
-        idx.add(r.randint(1, total - 1))
-    s = sorted(idx)
-    return [e - b for b, e in zip(s[:-1], s[1:])]
-
-
-def get_bias_grad(attn_bias, clear: bool = False) -> Optional[torch.Tensor]:
-    tensor_with_grad: Optional[torch.Tensor] = None
-    if isinstance(attn_bias, torch.Tensor):
-        tensor_with_grad = attn_bias
-    if isinstance(attn_bias, fmha.attn_bias.LowerTriangularMaskWithTensorBias):
-        tensor_with_grad = attn_bias._bias
-    if tensor_with_grad is not None:
-        grad = tensor_with_grad.grad
-        if clear:
-            tensor_with_grad.grad = None
-        return grad
-    return None
-
-
-def create_tensors(
-    op: Type[AttentionOpBase],
-    device,
-    dtype,
-    attn_bias_type,
-    B,
-    q_len,
-    kv_len,
-    h,
-    k,
-    kv,
-    *,
-    attn_bias_requires_grad: bool = False,
-    fmt: str = "BMK",
-    g: int = 1,
-):
-    torch.manual_seed(B * q_len + kv_len * k + kv)
-
-    mask_is_bottom_right = attn_bias_type is not None and issubclass(
-        attn_bias_type,
-        (
-            fmha.attn_bias.LowerTriangularFromBottomRightMask,
-            fmha.attn_bias.LowerTriangularFromBottomRightLocalAttentionMask,
-            fmha.attn_bias.BlockDiagonalCausalFromBottomRightMask,
-            fmha.attn_bias.BlockDiagonalCausalLocalAttentionFromBottomRightMask,
-            fmha.attn_bias.BlockDiagonalCausalLocalAttentionMask,
-            fmha.attn_bias.LocalAttentionFromBottomRightMask,
-        ),
-    )
-    if mask_is_bottom_right and q_len > kv_len:
-        # Bottom-right attention and local-attention masks require q_len <= kv_len
-        kv_len = q_len
-    scale = 3
-    if fmt == "BMK":
-        query = torch.randn((B * h, q_len, k), device=device, dtype=dtype)
-        key = torch.randn((B * h, kv_len, k), device=device, dtype=dtype)
-        value = torch.randn((B * h, kv_len, kv), device=device, dtype=dtype)
-    elif fmt == "BMHK":
-        query = torch.randn((B, q_len, h, k), device=device, dtype=dtype)
-        key = torch.randn((B, kv_len, h, k), device=device, dtype=dtype)
-        value = torch.randn((B, kv_len, h, kv), device=device, dtype=dtype)
-    else:
-        assert fmt == "BMGHK"
-        query = torch.randn((B, q_len, g, h, k), device=device, dtype=dtype)
-        key = torch.randn((B, kv_len, g, 1, k), device=device, dtype=dtype)
-        value = torch.randn((B, kv_len, g, 1, kv), device=device, dtype=dtype)
-
-    for x in [query, key, value]:
-        x.mul_(scale)
-
-    if fmt == "BMGHK":
-        # Expand - after the in-place mul
-        key = key.expand((B, kv_len, g, h, k))
-        value = value.expand((B, kv_len, g, h, k))
-
-    if fmt == "BMK" and not fmha.common._is_bias_type_supported_in_BMK(attn_bias_type):
-        attn_bias_type = None
-    attn_bias = None
-    if attn_bias_type is not None:
-        attn_bias = create_attn_bias(
-            attn_bias_type,
-            batch_size=B,
-            num_heads=h,
-            num_heads_groups=g,
-            q_len=q_len,
-            kv_len=kv_len,
-            dtype=dtype,
-            device=device,
-            requires_grad=attn_bias_requires_grad,
-            fmt=fmt,
-            op=op,
-        )
-        if isinstance(
-            attn_bias,
-            (
-                fmha.attn_bias.BlockDiagonalMask,
-                fmha.attn_bias.BlockDiagonalCausalWithOffsetPaddedKeysMask,
-            ),
-        ):
-            query, key, value = [
-                x.reshape([1, -1, *x.shape[2:]]) for x in [query, key, value]
-            ]
-
-    inputs = fmha.Inputs(query=query, key=key, value=value, attn_bias=attn_bias)
-    reasons = op.not_supported_reasons(inputs)
-    if reasons:
-        err_msg = f"{op.NAME}: unsupported ({'/'.join(reasons)})"
-        # Ensure we free memory to avoid OOMs
-        del query, key, value, attn_bias, inputs
-        pytest.skip(err_msg)
-    return query, key, value, attn_bias
-
-
-def bmhk2bmk(tensor) -> torch.Tensor:
-    return (
-        tensor.permute((0, 2, 1, 3))
-        .contiguous()
-        .view([tensor.shape[0] * tensor.shape[2], tensor.shape[1], tensor.shape[3]])
-    )
-
-
-def bmk2bmhk(tensor, num_heads: int) -> torch.Tensor:
-    return tensor.reshape([-1, num_heads, tensor.shape[1], tensor.shape[2]]).permute(
-        (0, 2, 1, 3)
-    )
-
-
-@pytest.mark.parametrize("fmt", ["BMK", "BMHK"])
-@pytest.mark.parametrize("packed", [False, True])
-@parametrize_opFW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv
-def test_forward(opFW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv, packed, fmt, **kwargs):
-    (
-        op,
-        device,
-        dtype,
-        bias_type,
-        batch_size,
-        q_len,
-        kv_len,
-        h,
-        k,
-        kv,
-    ) = opFW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv
-
-    if packed and not (k == kv and q_len == kv_len):
-        pytest.skip(
-            f"packed incompatible with `k ({k}) != kv ({kv})` or `q_len ({q_len}) != kv_len ({kv_len})`"
-        )
-    if fmt == "BMK" and not fmha.common._is_bias_type_supported_in_BMK(bias_type):
-        pytest.skip("BMK incompatible with this bias")
-
-    query, key, value, attn_bias = create_tensors(
-        *opFW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv,
-        fmt="BMHK" if packed else fmt,
-        **kwargs,
-    )
-
-    if packed:
-        c = torch.stack([query, key, value], 2)
-        if fmt == "BMK":
-            # bm3hk -> 3bhmk -> 3Bmk
-            c = c.permute(2, 0, 3, 1, 4).view([3, -1, q_len, k])
-            query, key, value = c[0], c[1], c[2]
-            # Re-create bias in the right format
-            attn_bias = create_attn_bias(
-                bias_type=bias_type,
-                batch_size=batch_size,
-                num_heads=h,
-                num_heads_groups=1,
-                q_len=q_len,
-                kv_len=kv_len,
-                device=device,
-                dtype=dtype,
-                requires_grad=False,
-                fmt=fmt,
-                op=op,
-            )
-        elif fmt == "BMHK":
-            # bm3hk -> 3 x bmhk
-            query, key, value = xformers.ops.unbind(c, 2)
-        else:
-            assert False, f"Unsupport fmt {fmt} with packing"
-        assert not query.is_contiguous()
-
-    out = xformers.ops.memory_efficient_attention_forward(
-        query, key, value, attn_bias, op=op
-    )
-    assert not out.isnan().any(), ("Output has NaNs", attn_bias)
-    out2 = xformers.ops.memory_efficient_attention_forward(
-        query, key, value, attn_bias, op=op
-    )
-    assert torch.allclose(out, out2, atol=0.0, rtol=0.0), (
-        "Non-deterministic behavior",
-        attn_bias,
-    )
-
-    ref = ref_attention(query, key, value, attn_bias)
-    assert out.shape == ref.shape, out.shape
-    assert_allclose(
-        out.float(),
-        ref,
-        atol=op.ERROR_ATOL[dtype],
-        rtol=op.ERROR_RTOL.get(dtype, 1e-5),
-    )
-
-
-@cuda_only
-@pytest.mark.parametrize("k_len", [5, 6, 32])
-@pytest.mark.parametrize("batch_size", [1, 4])
-@pytest.mark.parametrize("kv_len", [128, 512])
-@pytest.mark.parametrize("q_len", [128, 512])
-@pytest.mark.parametrize("dtype", _types)
-def test_key_query_all_ones(dtype, q_len, kv_len, batch_size, k_len):
-    device = "cuda"
-    scale = 3
-    query = torch.ones((batch_size, q_len, k_len), device=device, dtype=dtype)
-    key = torch.ones((batch_size, kv_len, k_len), device=device, dtype=dtype)
-    value = torch.randn((batch_size, kv_len, k_len), device=device, dtype=dtype) * scale
-
-    out = xformers.ops.memory_efficient_attention(
-        query, key, value, op=(fmha.ck.FwOp, None)
-    )
-    # this should be equivalent to the average over value
-    ref = value.mean(1, keepdim=True).expand_as(query)
-
-    if dtype is torch.float16:
-        assert_allclose(out, ref, atol=1e-5)
-    else:
-        assert_allclose(out, ref, atol=1e-2)
-
-
-def _block_diag_reshape_lse(
-    lse: torch.Tensor, q_seqinfo: fmha.attn_bias._SeqLenInfo
-) -> torch.Tensor:
-    """LSE can be padded, let's remove the padding"""
-    parts = []
-    for slice, (start, end) in zip(lse.unbind(0), q_seqinfo.intervals()):
-        parts.append(slice[:, : end - start])
-    return torch.cat(parts, dim=1).unsqueeze(1)
-
-
-@parametrize_opFW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv
-def test_logsumexp(opFW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv):
-    (
-        op,
-        device,
-        dtype,
-        bias_type,
-        batch_size,
-        q_len,
-        kv_len,
-        h,
-        k,
-        kv,
-    ) = opFW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv
-    query, key, value, attn_bias = create_tensors(
-        *opFW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv, fmt="BMK"
-    )
-
-    _out, lse = xformers.ops.memory_efficient_attention_forward_requires_grad(
-        query,
-        key,
-        value,
-        op=op,
-        attn_bias=attn_bias,
-    )
-    attn = (query.float() / k**0.5) @ key.float().transpose(-2, -1)
-    if attn_bias is not None:
-        if isinstance(attn_bias, xformers.ops.AttentionBias):
-            tensor_bias = attn_bias.materialize(
-                (query.shape[0], 1, query.shape[1], key.shape[1]),
-                device=query.device,
-                dtype=torch.float32,
-            )
-        else:
-            assert isinstance(attn_bias, torch.Tensor)
-            tensor_bias = attn_bias
-        if tensor_bias.ndim == 4:
-            tensor_bias = tensor_bias.reshape([-1, *tensor_bias.shape[2:]])
-        attn = attn + tensor_bias.float()
-    ref_lse = attn.logsumexp(-1)
-    if isinstance(attn_bias, fmha.attn_bias.BlockDiagonalMask):
-        lse = _block_diag_reshape_lse(lse, attn_bias.q_seqinfo)
-    assert_allclose(lse[:, 0, : ref_lse.shape[1]], ref_lse, atol=2e-4)
-
-
-@cuda_only
-@pytest.mark.parametrize("op", [fmha.cutlass.FwOp, fmha.flash.FwOp])
-def test_logsumexp_mqa(op):
-    if not op.is_available():
-        pytest.skip("not available")
-
-    dtype = torch.float16
-    s = 3
-    query = torch.randn([1, 1, 32, 128], dtype=dtype, device="cuda") * s
-    key = (torch.randn([1, 16, 1, 128], dtype=dtype, device="cuda") * s).expand(
-        -1, -1, 32, -1
-    )
-    value = (torch.randn([1, 16, 1, 128], dtype=dtype, device="cuda") * s).expand(
-        -1, -1, 32, -1
-    )
-    assert key.stride(2) == 0
-
-    _, lse = xformers.ops.memory_efficient_attention_forward_requires_grad(
-        query,
-        key,
-        value,
-        op=op,
-    )
-    query, key, value = [x[0].transpose(0, 1) for x in [query, key, value]]
-    attn = (query.float() / query.shape[-1] ** 0.5) @ key.float().transpose(-2, -1)
-    ref_lse = attn.logsumexp(-1)
-    assert_allclose(lse[0, :, 0], ref_lse[:, 0], atol=2e-4)
-
-
-@pytest.mark.parametrize("fmt", ["BMK", "BMHK"])
-@pytest.mark.parametrize("grad_out_contiguous", [False, True])
-@parametrize_opBW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv
-def test_backward(
-    opBW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv,
-    grad_out_contiguous,
-    fmt,
-):
-    (
-        op_bw,
-        device,
-        dtype,
-        bias_type,
-        batch_size,
-        q_len,
-        kv_len,
-        h,
-        k,
-        kv,
-    ) = opBW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv
-
-    # ToDo: reopen bfloat16 for testing
-    if dtype is torch.bfloat16:
-        pytest.skip(
-            "Temporarily disabled bfloat16 as we are still improving the accuracy of the results"
-        )
-
-    if k > 128 or kv > 128:
-        pytest.skip(
-            "head-dim length bigger than 128 is not supported by CK-FlashAttention"
-        )
-
-    if k % 2 != 0:
-        pytest.skip("head-dim length must be an even value for CK-FlashAttention")
-
-    if grad_out_contiguous is False:
-        pytest.skip(
-            "CK-FlashAttention requires grad_out and out have same lengths/strides"
-        )
-
-    attn_bias_requires_grad = (
-        random.Random(q_len + kv_len * batch_size).randint(0, 1) > 0
-    )
-    query, key, value, attn_bias = create_tensors(
-        *opBW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv,
-        attn_bias_requires_grad=attn_bias_requires_grad,
-        fmt=fmt,
-    )
-
-    # To understand why we do this, check the comment on the
-    # `AttentionBwOpBase` class
-    scale = None
-    if op_bw.SUPPORTS_CUSTOM_SCALE and query.shape[-1] < 32:
-        scale = (1 / 32) ** 0.5
-    op_fw = (
-        sample_random_supported_fw(
-            fmha.Inputs(query=query, key=key, value=value, attn_bias=attn_bias),
-            seed=q_len * kv + kv_len * k,
-        )
-        if op_bw != fmha.ck.BwOp
-        else fmha.ck.FwOp
-    )
-    qkv = None
-
-    if (
-        fmt == "BMHK"
-        and query.shape[3] == value.shape[3]
-        and query.shape[1] == value.shape[1]
-    ):
-        qkv = torch.stack([query, key, value], 2)
-        qkv.requires_grad_(True)
-        # bm3hk -> 3 x bmhk
-        query, key, value = xformers.ops.unbind(qkv, 2)
-        assert not query.is_contiguous()
-
-    query.requires_grad_(True)
-    key.requires_grad_(True)
-    value.requires_grad_(True)
-
-    if not op_bw.supports(fmha.Inputs(query, key, value, attn_bias)):
-        pytest.skip("inputs not supported")
-
-    out = xformers.ops.memory_efficient_attention(
-        query, key, value, attn_bias, scale=scale, op=(op_fw, op_bw)
-    )
-
-    grad_out = torch.randn_like(out)
-    if grad_out_contiguous is False:
-        grad_out = torch.tensor([1.0], dtype=query.dtype, device=device)[
-            None, None, :
-        ].expand_as(out)
-
-    out.backward(grad_out)
-
-    if qkv is None and op_bw == fmha.cutlass.BwOp:
-        assert query.stride() == query.grad.stride()
-
-    grads = []
-    if qkv is None:
-        grads = [query.grad, key.grad, value.grad]
-        query.grad = None
-        key.grad = None
-        value.grad = None
-    else:
-        grads = [qkv.grad]
-        qkv.grad = None
-    if attn_bias_requires_grad:
-        attn_bias_grad = get_bias_grad(attn_bias, clear=True)
-        if attn_bias_grad is not None:
-            grads.append(attn_bias_grad)
-
-    ref = ref_attention(query, key, value, attn_bias, scale=scale)
-    ref.backward(grad_out)
-
-    assert_allclose(
-        out.float(),
-        ref.float(),
-        "fw pass",
-        atol=op_fw.ERROR_ATOL[dtype],
-        rtol=op_fw.ERROR_RTOL[dtype],
-    )
-
-    del out
-    del grad_out
-    del ref
-
-    atol = op_bw.ERROR_ATOL[dtype]
-    rtol = op_bw.ERROR_RTOL[dtype]
-
-    grads_ref = []
-    grads_name = []
-    if qkv is None:
-        assert isinstance(query.grad, torch.Tensor)
-        assert isinstance(key.grad, torch.Tensor)
-        assert isinstance(value.grad, torch.Tensor)
-        grads_ref = [query.grad, key.grad, value.grad]
-        grads_name = ["query", "key", "value"]
-    else:
-        assert isinstance(qkv.grad, torch.Tensor)
-        grads_ref = [qkv.grad]
-        grads_name = ["qkv"]
-
-    if attn_bias_requires_grad:
-        attn_bias_grad = get_bias_grad(attn_bias)
-        if attn_bias_grad is not None:
-            grads_ref.append(attn_bias.grad)
-            grads_name.append("bias")
-
-    del query
-    del key
-    del value
-    del qkv
-
-    assert len(grads_ref) == len(
-        grads
-    ), "Wrong number of gradients (maybe bias grad didn't backprop?)"
-    for name, calc_grad, ref_grad in zip(grads_name, grads, grads_ref):
-        assert_allclose(
-            calc_grad,
-            ref_grad,
-            msg=f"{op_fw.NAME}+{op_bw.NAME}:{name}",
-            atol=atol,
-            rtol=rtol,
-        )
-
-
-def _vec_binom_test(x, n, p):
-    """
-    vectorized implementation of scipy.stats.binom_test
-    this makes our tests much faster
-    reference: https://github.com/scipy/scipy/blob/v1.8.0/scipy/stats/_morestats.py#L2609-L2702
-    """
-    import numpy as np
-    from scipy.stats import distributions
-
-    x = np.atleast_1d(x)
-    d = distributions.binom.pmf(x, n, p)[:, None]
-    rerr = 1 + 1e-7
-    # x < p * n case
-    i = np.arange(np.ceil(p * n), n + 1)
-    y = np.sum(distributions.binom.pmf(i, n, p) <= d * rerr, axis=1)
-    pval1 = distributions.binom.cdf(x, n, p) + distributions.binom.sf(n - y, n, p)
-
-    # other case
-    i = np.arange(np.floor(p * n) + 1)
-    y = np.sum(distributions.binom.pmf(i, n, p) <= d * rerr, axis=1)
-    pval2 = distributions.binom.cdf(y - 1, n, p) + distributions.binom.sf(x - 1, n, p)
-
-    pval = np.where(x < p * n, pval1, pval2)
-    pval = np.minimum(1.0, pval)
-    return pval
-
-
-def _get_drop_mask(op, batch_size, q_len, kv_len, p, device):
-    if op == fmha.ck.FwOp:
-        mask = torch.empty((batch_size, 1, q_len, kv_len), device=device)
-        # rand_uniform is an int32 tensor
-        rand_uniform = torch.ops.xformers._ck_rand_uniform(p, mask)
-        # mask = (rand_uniform <= int((1.0-p)*65535.0)).to(torch.float32)
-        mask = (rand_uniform <= int((1.0 - p) * 255.0)).to(torch.float32)
-        mask = mask.reshape(batch_size, q_len, kv_len)
-    else:
-        mask = torch.empty((batch_size, q_len, kv_len), device=device)
-        mask = torch.ops.xformers._temp_dropout(mask, p)
-
-    return mask
-
-
-@cuda_only
-@pytest.mark.parametrize("attn_bias", [None, fmha.attn_bias.LowerTriangularMask()])
-@pytest.mark.parametrize("seed", [42, 124])
-@pytest.mark.parametrize("p", [0.3, 0.7])
-@pytest.mark.parametrize("k_len", [32])
-@pytest.mark.parametrize("batch_size", [1, 2])
-@pytest.mark.parametrize("kv_len", [3, 15, 32, 33, 65])
-@pytest.mark.parametrize("q_len", [2, 33])
-@pytest.mark.parametrize("op", ALL_FW_OPS, ids=list(map(lambda t: t.NAME, ALL_FW_OPS)))
-@pytest.mark.parametrize("dtype", [torch.half, torch.bfloat16])
-def test_dropout(dtype, op, q_len, kv_len, batch_size, k_len, p, seed, attn_bias):
-    from scipy.stats import binomtest
-
-    device = "cuda"
-    scale = 0.05
-    query = torch.randn((batch_size, q_len, k_len), device=device, dtype=dtype) * scale
-    key = torch.randn((batch_size, kv_len, k_len), device=device, dtype=dtype) * scale
-    value = torch.randn((batch_size, kv_len, k_len), device=device, dtype=dtype) * scale
-
-    inputs_for_support_check = fmha.Inputs(query, key, value, attn_bias, p, None)
-    if not op.supports(inputs_for_support_check):
-        del query, key, value, attn_bias
-        pytest.skip(f"{op.NAME}: unsupported input")
-
-    torch.manual_seed(seed)
-    out = xformers.ops.memory_efficient_attention(
-        query, key, value, attn_bias, p, op=(op, None)
-    )
-
-    torch.manual_seed(seed)
-    out2 = xformers.ops.memory_efficient_attention(
-        query, key, value, attn_bias, p, op=(op, None)
-    )
-
-    assert_allclose(out, out2, "dropout reproducibility")
-
-    torch.manual_seed(seed)
-    mask = _get_drop_mask(op, batch_size, q_len, kv_len, p, device)
-    ref = ref_attention(query, key, value, attn_bias, mask, p)
-    assert_allclose(
-        out.float(), ref, atol=3e-3, rtol=5e-4
-    ), f"{(out - ref).abs().max()}"
-
-    num_trials = 1000
-    p_val_tol = 1e-6
-    keep_prob = 1 - p
-    masks = []
-    for i in range(num_trials):
-        mask = _get_drop_mask(op, batch_size, q_len, kv_len, p, device)
-        masks.append(mask.clone().cpu())
-    masks = torch.stack(masks, dim=0)
-    p_value = binomtest(int(masks.sum()), masks.numel(), p=keep_prob).pvalue
-    assert p_value > p_val_tol, p_value
-    masks = masks.sum(0).flatten()
-    p_values = _vec_binom_test(masks, num_trials, p=keep_prob)
-    assert all(p_values > p_val_tol)
-
-
-def _test_dropout_backward(q_len, kv_len, batch_size, k, p, op, dtype):
-    if not op.is_available():
-        pytest.skip()
-
-    scale = 3
-    device = "cuda"
-    query = torch.randn((batch_size, q_len, k), device=device, dtype=dtype) * scale
-    key = torch.randn((batch_size, kv_len, k), device=device, dtype=dtype) * scale
-    value = torch.randn((batch_size, kv_len, k), device=device, dtype=dtype) * scale
-
-    query.requires_grad_(True)
-    key.requires_grad_(True)
-    value.requires_grad_(True)
-
-    grad_out = torch.ones_like(query)
-
-    assert op.supports(fmha.Inputs(query=query, key=key, value=value, p=p))
-
-    seed = 42
-    torch.manual_seed(seed)
-    out = xformers.ops.memory_efficient_attention(query, key, value, p=p, op=(op, None))
-
-    out.backward(grad_out)
-
-    grad_q = query.grad
-    grad_k = key.grad
-    grad_v = value.grad
-
-    query.grad = None
-    key.grad = None
-    value.grad = None
-
-    torch.manual_seed(seed)
-    mask = _get_drop_mask(op, batch_size, q_len, kv_len, p, device)
-
-    ref = ref_attention(query, key, value, None, mask, p)
-    ref.backward(grad_out)
-
-    atol, rtol = (
-        fmha.AttentionBwOpBase.ERROR_ATOL[dtype],
-        fmha.AttentionBwOpBase.ERROR_RTOL[dtype],
-    )
-    assert_allclose(
-        grad_v,
-        value.grad,
-        "grad_v",
-        atol=atol,
-        rtol=rtol,
-    )
-    # TODO: Investigate why precision is worse
-    if dtype in [torch.float16, torch.bfloat16]:
-        atol = atol * 2 + 0.15
-        rtol = rtol * 2
-    assert_allclose(
-        grad_q,
-        query.grad,
-        "grad_q",
-        atol=atol,
-        rtol=rtol,
-    )
-    assert_allclose(
-        grad_k,
-        key.grad,
-        "grad_k",
-        atol=atol,
-        rtol=rtol,
-    )
-
-
-@cuda_only
-@pytest.mark.parametrize("p", [0.3, 0.7])
-@pytest.mark.parametrize("k", [5, 6, 32])
-@pytest.mark.parametrize("batch_size", [1, 2])
-@pytest.mark.parametrize("kv_len", [3, 15, 32, 33])
-@pytest.mark.parametrize("q_len", [2, 33])
-def test_dropout_backward_small_k(q_len, kv_len, batch_size, k, p):
-    _test_dropout_backward(
-        q_len, kv_len, batch_size, k, p, op=fmha.small_k.FwOp, dtype=torch.float32
-    )
-
-
-@cuda_only
-@pytest.mark.parametrize("p", [0.000001, 0.3, 0.7])
-@pytest.mark.parametrize("k", [16, 128, 256])
-@pytest.mark.parametrize("batch_size", [1, 2])
-@pytest.mark.parametrize("kv_len", [3, 248, 256])
-@pytest.mark.parametrize("q_len", [3, 248, 256])
-@pytest.mark.parametrize("dt", ["f16", "bf16", "f32"])
-def test_dropout_backward_cutlass(dt, q_len, kv_len, batch_size, k, p):
-    _test_dropout_backward(
-        q_len,
-        kv_len,
-        batch_size,
-        k,
-        p,
-        op=fmha.cutlass.FwOp,
-        dtype={"f16": torch.float16, "bf16": torch.bfloat16, "f32": torch.float32}[dt],
-    )
-
-
-@cuda_only
-@pytest.mark.parametrize("k_len", [32])
-@pytest.mark.parametrize("batch_size", [1])
-@pytest.mark.parametrize("kv_len", [3 * 32])
-@pytest.mark.parametrize("q_len", [3 * 32])
-def test_memory_efficient_attention_full_block_masked(q_len, kv_len, batch_size, k_len):
-    device = "cuda"
-    op_fw = fmha.small_k.FwOp
-    op_bw = fmha.small_k.BwOp
-
-    scale = 3
-    query = torch.randn((batch_size, q_len, k_len), device=device) * scale
-    key = torch.randn((batch_size, kv_len, k_len), device=device) * scale
-    value = torch.randn((batch_size, kv_len, k_len), device=device) * scale
-
-    # in this case, most of the blocks in a row get masked
-    attn_bias = torch.full((3, 32), float("-inf"), device=device)
-    attn_bias[:2, :4] = 0
-    attn_bias = attn_bias.flatten()[None, None, :].expand(1, q_len, -1)
-
-    out = xformers.ops.memory_efficient_attention(
-        query, key, value, attn_bias, op=(op_fw, op_bw)
-    )
-    ref = ref_attention(query, key, value, attn_bias)
-
-    assert_allclose(
-        out, ref, atol=op_fw.ERROR_ATOL[query.dtype], rtol=op_fw.ERROR_RTOL[query.dtype]
-    )
-
-    query.requires_grad_(True)
-    key.requires_grad_(True)
-    value.requires_grad_(True)
-
-    grad_out = torch.ones_like(query)
-
-    out = xformers.ops.memory_efficient_attention(query, key, value, attn_bias)
-    out.backward(grad_out)
-
-    grad_q = query.grad
-    grad_k = key.grad
-    grad_v = value.grad
-
-    query.grad = None
-    key.grad = None
-    value.grad = None
-
-    ref = ref_attention(query, key, value, attn_bias)
-    ref.backward(grad_out)
-
-    atol = op_bw.ERROR_ATOL[query.dtype]
-    rtol = op_bw.ERROR_RTOL[query.dtype]
-    assert_allclose(grad_q, query.grad, "grad_q", atol=atol, rtol=rtol)
-    assert_allclose(grad_k, key.grad, "grad_k", atol=atol, rtol=rtol)
-    assert_allclose(grad_v, value.grad, "grad_v", atol=atol, rtol=rtol)
-
-
-@pytest.mark.parametrize("fmt", ["BMK", "BMHK"])
-@parametrize_opBW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv__xs
-def test_lowlevel_api_shapes(opBW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv, fmt):
-    query, key, value, attn_bias = create_tensors(
-        *opBW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv, fmt=fmt
-    )
-    grad_out = torch.ones_like(query)
-    query.requires_grad_(True)
-    key.requires_grad_(True)
-    value.requires_grad_(True)
-
-    out, lse = xformers.ops.memory_efficient_attention_forward_requires_grad(
-        query, key, value, attn_bias
-    )
-    assert out.ndim == query.ndim
-    dq, dk, dv = xformers.ops.memory_efficient_attention_backward(
-        grad_out, out, lse, query, key, value, attn_bias
-    )
-    assert dq.shape == query.shape
-    assert dk.shape == key.shape
-    assert dv.shape == value.shape
-
-
-@parametrize_opFW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv__xs
-def test_cuda_streams(
-    opFW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv,
-):
-    (
-        op,
-        device,
-        dtype,
-        bias_type,
-        batch_size,
-        q_len,
-        kv_len,
-        h,
-        k,
-        kv,
-    ) = opFW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv
-    if device != "cuda":
-        pytest.skip("Not CUDA")
-    bias_type = None
-    opFW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv = [
-        op,
-        device,
-        dtype,
-        bias_type,
-        batch_size,
-        q_len,
-        kv_len,
-        h,
-        k,
-        kv,
-    ]
-    s_hipri = torch.cuda.Stream(priority=-1)
-    s_lopri = torch.cuda.Stream(priority=0)
-    query, key, value, attn_bias = create_tensors(
-        *opFW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv, fmt="BMHK"
-    )
-    torch.cuda.synchronize()
-    with torch.cuda.stream(s_lopri):
-        torch.cuda._sleep(100_000_000)  # wait 100m cycles
-        query *= 2
-    s_hipri.wait_stream(s_lopri)
-    with torch.cuda.stream(s_hipri):
-        # If the kernel is scheduled in the main stream
-        # `query * 2` has not been executed yet
-        out = xformers.ops.memory_efficient_attention(query, key, value, op=(op, None))
-    # Test that `s_lopri` is still sleeping
-    # and that `query *= 2` has not been executed yet
-    query2_main_stream = query * 2
-    torch.cuda.synchronize()
-    # TODO: Figure out why this is failing sometimes
-    # The sleep timer seems to be high enough already ...
-    # assert torch.allclose(query2_main_stream, query), "Need to increase sleep time"
-    del query2_main_stream
-
-    ref = ref_attention(query, key, value)
-    assert out.shape == ref.shape, out.shape
-
-    assert_allclose(
-        out.float(),
-        ref.float(),
-        atol=op.ERROR_ATOL[dtype],
-        rtol=op.ERROR_RTOL.get(dtype, 1e-5),
-    )
-
-
-@parametrize_opBW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv__xs
-def test_custom_scale(opBW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv):
-    p = 0.0
-    scale = 0.1
-
-    (
-        op_bw,
-        device,
-        dtype,
-        _,
-        B,
-        q_len,
-        kv_len,
-        H,
-        k,
-        Kv,
-    ) = opBW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv
-    torch.manual_seed(q_len + kv_len + k)
-    if device != "cuda":
-        pytest.skip("Not CUDA")
-
-    query, key, value, attn_bias = create_tensors(
-        *opBW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv, fmt="BMK"
-    )
-    inputs = fmha.Inputs(
-        query=query, key=key, value=value, attn_bias=attn_bias, scale=scale
-    )
-    op_fw = sample_random_supported_fw(inputs, seed=q_len * k + kv_len * k)
-    grad_out = query.new_ones(B * H, q_len, Kv)
-    query.requires_grad_(True)
-    key.requires_grad_(True)
-    value.requires_grad_(True)
-
-    reasons = op_fw.not_supported_reasons(inputs)
-    if reasons:
-        pytest.skip(f"{op_fw.NAME}: unsupported ({'/'.join(reasons)})")
-    reasons = op_bw.not_supported_reasons(inputs)
-    if reasons:
-        pytest.skip(f"{op_bw.NAME}: unsupported ({'/'.join(reasons)})")
-
-    # NOTE: we still need to scale the inputs to not blowup
-    # the pre-softmax values (numerical stability)
-    s = k**-0.5
-    out = xformers.ops.memory_efficient_attention(
-        query * s, key, value, attn_bias, p, scale, op=(op_fw, op_bw)
-    )
-    out.backward(grad_out)
-    grad_q, grad_k, grad_v = query.grad, key.grad, value.grad
-    query.grad = key.grad = value.grad = None
-
-    ref = ref_attention(query * s, key, value, attn_bias, None, p, scale)
-    ref.backward(grad_out)
-    ref_grad_q, ref_grad_k, ref_grad_v = query.grad, key.grad, value.grad
-    query.grad = key.grad = value.grad = None
-
-    atol = op_fw.ERROR_ATOL[dtype]
-    rtol = op_fw.ERROR_RTOL[dtype]
-    assert_allclose(out.float(), ref.float(), "out", atol=atol, rtol=rtol)
-    atol = op_bw.ERROR_ATOL[dtype]
-    rtol = op_bw.ERROR_RTOL[dtype]
-    assert_allclose(grad_q, ref_grad_q, "grad_q", atol=atol, rtol=rtol)
-    assert_allclose(grad_k, ref_grad_k, "grad_k", atol=atol, rtol=rtol)
-    assert_allclose(grad_v, ref_grad_v, "grad_v", atol=atol, rtol=rtol)
-
-
-def apply_attention(query, key, value, attn_bias, op_fw, proj):
-    x = xformers.ops.memory_efficient_attention(
-        query, key, value, attn_bias=attn_bias, op=(op_fw, None)
-    )
-    x = proj(x)
-    return x
-
-
-@pytest.mark.parametrize("use_reentrant", [False, True])
-@parametrize_opFW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv__xs
-def test_grad_checkpointing(
-    opFW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv,
-    use_reentrant,
-):
-    fmt = "BMHK"
-    (
-        op,
-        device,
-        dtype,
-        bias_type,
-        batch_size,
-        q_len,
-        kv_len,
-        h,
-        k,
-        kv,
-    ) = opFW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv
-    bias_type = None
-    opFW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv = (
-        op,
-        device,
-        dtype,
-        bias_type,
-        batch_size,
-        q_len,
-        kv_len,
-        h,
-        k,
-        kv,
-    )
-    query, key, value, attn_bias = create_tensors(
-        *opFW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv,
-        fmt=fmt,
-    )
-    qkv = None
-
-    if (
-        fmt == "BMHK"
-        and query.shape[3] == value.shape[3]
-        and query.shape[1] == value.shape[1]
-    ):
-        qkv = torch.stack([query, key, value], 2)
-        qkv.requires_grad_(True)
-        # bm3hk -> 3 x bmhk
-        query, key, value = xformers.ops.unbind(qkv, 2)
-        assert not query.is_contiguous()
-
-    query.requires_grad_(True)
-    key.requires_grad_(True)
-    value.requires_grad_(True)
-
-    proj = torch.nn.Linear(kv, k, device=device, dtype=dtype)
-
-    x = query
-    for _ in range(5):
-        x = checkpoint(
-            apply_attention,
-            x,
-            key,
-            value,
-            attn_bias,
-            op,
-            proj,
-            use_reentrant=use_reentrant,
-        )
-    x.mean().backward()
-
-
-ALL_FW_OPS_NO_SMALLK = [op for op in ALL_FW_OPS if op is not fmha.small_k.FwOp]
-
-
-@pytest.mark.parametrize(
-    "op", ALL_FW_OPS_NO_SMALLK, ids=[op.NAME for op in ALL_FW_OPS_NO_SMALLK]
-)
-def test_unsupported_cpu(op: Type[fmha.AttentionFwOpBase]):
-    q = torch.empty([1, 1, 1, 32])
-    with pytest.raises(ValueError):
-        fmha.memory_efficient_attention(q, q, q, op=(op, None))
-
-
-@cuda_only
-@pytest.mark.parametrize(
-    "op", ALL_FW_OPS_NO_SMALLK, ids=[op.NAME for op in ALL_FW_OPS_NO_SMALLK]
-)
-def test_unsupported_stride_lastdim(op: Type[fmha.AttentionFwOpBase]):
-    q = torch.empty([1, 1, 32, 4], device="cuda", dtype=torch.float16).permute(
-        0, 3, 1, 2
-    )
-    try:
-        fmha.memory_efficient_attention(q, q, q, op=(op, None))
-    except ValueError as e:
-        if "Only work on pre-MLIR triton for now" in str(e):
-            pytest.skip("Only work on pre-MLIR triton for now")
-        q = q.contiguous()
-        fmha.memory_efficient_attention(q, q, q, op=(op, None))
-
-
-@cuda_only
-@pytest.mark.parametrize(
-    "op", ALL_FW_OPS_NO_SMALLK, ids=[op.NAME for op in ALL_FW_OPS_NO_SMALLK]
-)
-def test_unsupported_stride_alignment(op: Type[fmha.AttentionFwOpBase]):
-    q = torch.empty([1, 2, 1, 33], device="cuda", dtype=torch.float16)[:, :, :, :32]
-    try:
-        fmha.memory_efficient_attention(q, q, q, op=(op, None))
-    except ValueError as e:
-        if "Only work on pre-MLIR triton for now" in str(e):
-            pytest.skip("Only work on pre-MLIR triton for now")
-        q = q.contiguous()
-        fmha.memory_efficient_attention(q, q, q, op=(op, None))
-
-
-def test_attn_bias_causal() -> None:
-    m = -math.inf
-    causal_mask = torch.tensor([[0, m], [0, 0], [0, 0]])
-    tensor_bias = torch.tensor([[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]])
-
-    attn_bias = fmha.attn_bias.LowerTriangularMask()
-    assert_allclose(attn_bias.materialize(causal_mask.shape), causal_mask, "causal")
-    attn_bias = attn_bias.add_bias(tensor_bias)
-    assert_allclose(
-        attn_bias.materialize(causal_mask.shape),
-        tensor_bias + causal_mask,
-        "causal+tensor_bias",
-    )
-
-
-def test_attn_bias_torch_tensor() -> None:
-    tensor_bias = torch.tensor([[1.0, 2.0, 3.0], [3.0, 4.0, 5.0]])
-    attn_bias = fmha.attn_bias.LowerTriangularMaskWithTensorBias(tensor_bias)
-    m = -math.inf
-    causal_bias = torch.tensor([[0, m, m], [0, 0, m]])
-    assert_allclose(
-        attn_bias.materialize((2, 3)), causal_bias + tensor_bias, "tensor_bias+causal"
-    )
-
-
-def test_attn_bias_blockdiag() -> None:
-    queries = [
-        torch.randn([1, 3, 1, 8]),
-        torch.randn([1, 2, 1, 8]),
-        torch.randn([1, 5, 1, 8]),
-    ]
-    attn_bias, q = fmha.BlockDiagonalMask.from_tensor_list(queries)
-
-    # Verify mask
-    as_tensor = attn_bias.materialize((10, 10))
-    assert int((as_tensor != -math.inf).sum().item()) == 3 * 3 + 2 * 2 + 5 * 5
-    assert_allclose(as_tensor[0:3, 0:3], torch.zeros([3, 3]), "batch0")
-    assert_allclose(as_tensor[3:5, 3:5], torch.zeros([2, 2]), "batch1")
-    assert_allclose(as_tensor[5:, 5:], torch.zeros([5, 5]), "batch2")
-
-    # Verify we can split it back
-    queries2 = attn_bias.split(q)
-    assert len(queries) == len(queries2)
-    for q1, q2 in zip(queries, queries2):
-        assert_allclose(q1, q2)
-
-
-def test_attn_bias_blockdiag_batched() -> None:
-    queries = [
-        torch.randn([1, 3, 1, 8]),
-        torch.randn([3, 2, 1, 8]),
-        torch.randn([1, 5, 1, 8]),
-    ]
-    attn_bias, q = fmha.BlockDiagonalMask.from_tensor_list(queries)
-
-    # Verify mask
-    as_tensor = attn_bias.materialize((14, 14))
-    assert int((as_tensor != -math.inf).sum().item()) == 3 * 3 + 3 * 2 * 2 + 5 * 5
-    assert_allclose(as_tensor[0:3, 0:3], torch.zeros([3, 3]), "batch0")
-    assert_allclose(as_tensor[3:5, 3:5], torch.zeros([2, 2]), "batch1.0")
-    assert_allclose(as_tensor[5:7, 5:7], torch.zeros([2, 2]), "batch1.1")
-    assert_allclose(as_tensor[7:9, 7:9], torch.zeros([2, 2]), "batch1.2")
-    assert_allclose(as_tensor[9:, 9:], torch.zeros([5, 5]), "batch2")
-
-    # Verify we can split it back
-    queries2 = attn_bias.split(q)
-    assert len(queries) == len(queries2)
-    for q1, q2 in zip(queries, queries2):
-        assert_allclose(q1, q2)
-
-
-def test_attn_bias_blockdiag_crossattn_causal() -> None:
-    # Q / KV have different seqlen
-    list_q = [
-        torch.randn([1, 3, 1, 8]),
-        torch.randn([2, 1, 1, 8]),
-    ]
-    list_k = [
-        torch.randn([1, 2, 1, 8]),
-        torch.randn([2, 3, 1, 8]),
-    ]
-
-    attn_bias, q, k, _ = fmha.attn_bias.BlockDiagonalMask.from_tensor_lists_qkv(
-        list_q, list_k
-    )
-
-    # Verify mask
-    as_tensor = attn_bias.materialize((q.shape[1], k.shape[1]))
-    assert int((as_tensor != -math.inf).sum().item()) == 3 * 2 + 2 * 3 * 1
-    assert_allclose(as_tensor[0:3, 0:2], torch.zeros([3, 2]), "batch0")
-    assert_allclose(as_tensor[3:4, 2:5], torch.zeros([1, 3]), "batch1.0")
-    assert_allclose(as_tensor[4:, 5:], torch.zeros([1, 3]), "batch1.1")
-
-    # Also test causal version
-    as_tensor = attn_bias.make_causal().materialize((q.shape[1], k.shape[1]))
-    assert_allclose(
-        as_tensor[3:4, 2:5],
-        fmha.attn_bias.LowerTriangularMask().materialize((1, 3)),
-        "batch1.0[causal]",
-    )
-
-    # Verify we can split it back
-    list_q2 = attn_bias.split_queries(q)
-    assert len(list_q) == len(list_q2)
-    for q1, q2 in zip(list_q, list_q2):
-        assert_allclose(q1, q2)
-    with pytest.raises(ValueError):
-        attn_bias.split_queries(k)
-    list_k2 = attn_bias.split_kv(k)
-    assert len(list_k) == len(list_k2)
-    for k1, k2 in zip(list_k, list_k2):
-        assert_allclose(k1, k2)
-
-
-def test_attn_bias_blockdiag_crossattn_causal_with_prefix_qk_cond() -> None:
-    list_q = [
-        torch.randn([1, 3, 1, 8]),
-    ]
-    list_k = [
-        torch.randn([1, 2, 1, 8]),
-    ]
-    attn_bias, q, k, _ = fmha.attn_bias.BlockDiagonalMask.from_tensor_lists_qkv(
-        list_q, list_k
-    )
-    with pytest.raises(ValueError):
-        attn_bias.make_causal_from_bottomright()
-
-
-def test_attn_bias_blockdiag_crossattn_causal_with_prefix() -> None:
-    # Q / KV have different seqlen
-    list_q = [
-        torch.randn([1, 2, 1, 8]),
-        torch.randn([2, 2, 1, 8]),
-    ]
-    list_k = [
-        torch.randn([1, 2, 1, 8]),
-        torch.randn([2, 5, 1, 8]),
-    ]
-
-    attn_bias, q, k, _ = fmha.attn_bias.BlockDiagonalMask.from_tensor_lists_qkv(
-        list_q, list_k
-    )
-    as_tensor = attn_bias.make_causal_from_bottomright().materialize(
-        (q.shape[1], k.shape[1])
-    )
-    m = -math.inf
-    assert_allclose(
-        as_tensor[0:2, 0:2],
-        torch.tensor([[0, m], [0, 0]], dtype=torch.float32),
-        "batch1.1[causal_with_prefix]",
-    )
-    assert_allclose(
-        as_tensor[2:4, 2:7],
-        torch.tensor([[0, 0, 0, 0, m], [0, 0, 0, 0, 0]], dtype=torch.float32),
-        "batch2.1[causal_with_prefix]",
-    )
-    assert_allclose(
-        as_tensor[4:6, 7:12],
-        torch.tensor([[0, 0, 0, 0, m], [0, 0, 0, 0, 0]], dtype=torch.float32),
-        "batch2.2[causal_with_prefix]",
-    )
-
-
-@cuda_only
-def test_attn_bias_padded() -> None:
-    bsize, n_heads, d, padding = 8, 3, 8, 32
-
-    # Q / KV have different seqlen
-    k = torch.randn((bsize, padding, n_heads, d), device="cuda", dtype=torch.float16)
-    k_seqlen = [5, 8, 7, 1, 9, 3, 12, 32]
-    other = bsize - 1
-    v = torch.randn((bsize, padding, n_heads, d), device="cuda", dtype=torch.float16)
-    n_q_first = 4
-    q = [
-        torch.randn((1, n_q_first, n_heads, d), device="cuda", dtype=torch.float16),
-        torch.randn((1, other, n_heads, d), device="cuda", dtype=torch.float16),
-    ]
-    q_cat = torch.cat([x.view(1, -1, n_heads, d) for x in q], dim=1)
-    q_seqlen = [n_q_first] + [1] * other
-
-    attn_bias = fmha.attn_bias.BlockDiagonalCausalWithOffsetPaddedKeysMask.from_seqlens(
-        q_seqlen=q_seqlen,
-        kv_seqlen=k_seqlen,
-        kv_padding=padding,
-    )
-
-    v = v.view(1, -1, n_heads, d)
-    k = k.view(1, -1, n_heads, d)
-
-    scores = (q_cat.transpose(1, 2) @ k.transpose(1, 2).transpose(2, 3)).float()
-    assert not scores.isnan().any()
-    mask = torch.full_like(scores, -float("inf"))
-    for i, (slen, qlen) in enumerate(zip(k_seqlen, q_seqlen)):
-        kseq_start = i * padding
-        qstart = sum(q_seqlen[:i])
-        mask[:, :, qstart : qstart + qlen, kseq_start : kseq_start + slen] = torch.triu(
-            mask[:, :, qstart : qstart + qlen, kseq_start : kseq_start + slen].float(),
-            diagonal=1 + slen - qlen,
-        ).float()
-
-    scores += mask
-    assert not scores.isnan().any()
-    # 1,3,10,8 @ 1,3,8,256 -> 1,3,10,256
-    scores = torch.nn.functional.softmax(scores, -1).half()
-    # torch.Size([1, 3, 3, 32]) @ torch.Size([1, 3, 32, 8])
-    output = scores @ v.transpose(1, 2)  # 1,3,10,256 @ 1,3,256, 8 -> 1,3,10,8
-    output = output.transpose(1, 2).contiguous()
-
-    fmha_output = fmha.memory_efficient_attention_forward(
-        q_cat, k, v, attn_bias, scale=1.0, op=fmha.ck.FwOp
-    )
-
-    # assert torch.allclose(output, fmha_output)
-    assert_allclose(
-        output,
-        fmha_output,
-        atol=fmha.cutlass.FwOp.ERROR_ATOL[torch.float16],
-        rtol=fmha.cutlass.FwOp.ERROR_RTOL[torch.float16],
-    )
-
-
-def _kv_heads_label(kv_heads: Optional[int]) -> str:
-    if kv_heads is None:
-        return ""
-    if kv_heads == 1:
-        return "mq"
-    return f"gqa{kv_heads}"
-
-
-@pytest.mark.parametrize("dtype", ["f32"])
-@pytest.mark.parametrize("kv_heads", [None, 1, 2], ids=_kv_heads_label)
-@pytest.mark.parametrize("n_heads", [16])
-@pytest.mark.parametrize("padding, bsz", [(32, 8), (4096, 1)])
-@pytest.mark.parametrize("split_k", [1, 2, 4])
-def test_splitk_reference(
-    kv_heads: int, n_heads: int, padding: int, bsz: int, dtype: str, split_k: int
-):
-    dtype_ = {"f16": torch.float16, "bf16": torch.bfloat16, "f32": torch.float32}[dtype]
-    torch.manual_seed(1)
-    d = 256
-    num_queries = 1
-    if kv_heads is not None and kv_heads > 1:
-        k_shape: Tuple[int, ...] = (1, bsz * padding, kv_heads, n_heads, d)
-        q_shape: Tuple[int, ...] = (
-            1,
-            bsz * num_queries,
-            kv_heads,
-            n_heads,
-            d,
-        )
-    else:
-        k_shape = (1, bsz * padding, n_heads, d)
-        q_shape = (1, bsz * num_queries, n_heads, d)
-
-    k = torch.rand(k_shape, dtype=dtype_).cuda()
-    k_seqlen = torch.randint(1, padding + 1, (bsz,)).tolist()
-    v = torch.rand_like(k)
-    q = torch.rand(q_shape, dtype=dtype_).cuda()
-    causal_diagonal = torch.tensor(  # TODO: make unnecessary
-        [i - 1 for i in k_seqlen], dtype=torch.int32
-    ).cuda()
-
-    if kv_heads is not None:
-        k = k[..., :1, :].expand(k_shape)
-        v = v[..., :1, :].expand(k_shape)
-
-    attn_bias = fmha.attn_bias.BlockDiagonalCausalWithOffsetPaddedKeysMask.from_seqlens(
-        q_seqlen=[1] * bsz,
-        kv_seqlen=k_seqlen,
-        causal_diagonal=causal_diagonal,
-        kv_padding=padding,
-    )
-    ref_out = ref_attention(q, k, v, attn_bias)
-    splitk_out = ref_attention_splitk(q, k, v, attn_bias, None, split_k=split_k)
-    assert_allclose(
-        ref_out,
-        splitk_out,
-        atol=fmha.ck.FwOp.ERROR_ATOL[dtype_],
-        rtol=fmha.ck.FwOp.ERROR_RTOL[dtype_],
-    )
-
-
-@pytest.mark.parametrize("op", [fmha.ck_decoder.FwOp])
-@pytest.mark.parametrize("kv_heads", [None, 1, 2], ids=_kv_heads_label)
-@pytest.mark.parametrize("bsz,n_heads", [(1, 1), (1, 16), (1, 32), (8, 1), (4, 8)])
-@pytest.mark.parametrize("padding", [32, 4096])
-@pytest.mark.parametrize("dtype", ["f16", "bf16", "f32"])
-@pytest.mark.parametrize("d", [256])
-def test_decoder(
-    op,
-    n_heads: int,
-    kv_heads: Optional[int],
-    padding: int,
-    bsz: int,
-    dtype: str,
-    d: int,
-    dequant: bool = False,
-    num_queries: int = 1,
-) -> None:
-    # kv_heads = 1: multiquery
-    # kv_heads = None: neither MQA nor GQA
-    # kv_heads > 1: BMGHK
-    dtype_ = {"f16": torch.float16, "bf16": torch.bfloat16, "f32": torch.float}[dtype]
-    tensor_options = {"dtype": dtype_, "device": "cuda"}
-    torch.manual_seed(1)
-    num_queries = 1
-    if kv_heads is not None and kv_heads > 1:
-        k_shape: Tuple[int, ...] = (1, bsz * padding, kv_heads, n_heads, d)
-        q_shape: Tuple[int, ...] = (
-            1,
-            bsz * num_queries,
-            kv_heads,
-            n_heads,
-            d,
-        )
-    else:
-        k_shape = (1, bsz * padding, n_heads, d)
-        q_shape = (1, bsz * num_queries, n_heads, d)
-
-    k = torch.randn(k_shape, **tensor_options)
-    k_seqlen = torch.randint(num_queries, padding + 1, (bsz,)).tolist()
-    v = torch.randn_like(k)
-    q = torch.randn(q_shape, **tensor_options)
-    causal_diagonal = torch.tensor(  # TODO: make unnecessary
-        [i - 1 for i in k_seqlen], dtype=torch.int32
-    ).cuda()
-
-    if kv_heads is not None:
-        k = k[..., :1, :].expand(k_shape)
-        v = v[..., :1, :].expand(k_shape)
-
-    attn_bias = fmha.attn_bias.BlockDiagonalCausalWithOffsetPaddedKeysMask.from_seqlens(
-        q_seqlen=[num_queries] * bsz,
-        kv_seqlen=k_seqlen,
-        causal_diagonal=causal_diagonal,
-        kv_padding=padding,
-    )
-    inp = fmha.Inputs(q, k, v, attn_bias=attn_bias)
-    if not_supported_reasons := op.not_supported_reasons(inp):
-        pytest.skip(f"{not_supported_reasons=}")
-
-    decoder_output = fmha.memory_efficient_attention_forward(q, k, v, attn_bias, op=op)
-
-    ref_output = ref_attention(q, k, v, attn_bias)
-
-    assert_allclose(
-        decoder_output.float(),
-        ref_output,
-        atol=fmha.ck_decoder.FwOp.ERROR_ATOL[dtype_] * 4,
-        rtol=fmha.ck_decoder.FwOp.ERROR_RTOL[dtype_],
-    )
-
-
-@pytest.mark.parametrize(
-    "op", [fmha.ck_splitk.FwOp_S1, fmha.ck_splitk.FwOp_S2, fmha.ck_splitk.FwOp_S4]
-)
-@pytest.mark.parametrize("dtype", ["f32"])
-@pytest.mark.parametrize("kv_heads", [None, 1, 2], ids=_kv_heads_label)
-@pytest.mark.parametrize("n_heads", [16])
-@pytest.mark.parametrize("d", [256])
-@pytest.mark.parametrize("padding, bsz", [(32, 8), (4096, 1), (32, 1), (4096, 8)])
-def test_splitk_decoder(
-    op,
-    kv_heads: Optional[int],
-    n_heads: int,
-    padding: int,
-    bsz: int,
-    dtype: str,
-    d: int,
-) -> None:
-    # no quantized impl compared to cuda
-    test_decoder(
-        op,
-        kv_heads=kv_heads,
-        n_heads=n_heads,
-        padding=padding,
-        bsz=bsz,
-        dtype=dtype,
-        d=d,
-    )
-
-
-def test_attn_bias_from_seqlens() -> None:
-    bias = fmha.attn_bias.BlockDiagonalMask.from_seqlens([3, 5, 1])
-    out = bias.split(torch.randn([1, 3 + 5 + 1, 16]))
-    assert len(out) == 3
-    assert tuple(out[0].shape) == (1, 3, 16)
-
-
-@cuda_only
-def test_attn_bias_blockdiag_doc() -> None:
-    """IMPORTANT:
-    This is the example in the doc for `BlockDiagonalMask`.
-    If this example needs to be updated, please also update the doc
-    """
-    import torch
-
-    from xformers.ops import fmha
-
-    K = 16
-    dtype = torch.float16
-    device = "cuda"
-    list_x = [
-        torch.randn([1, 3, 1, K], dtype=dtype, device=device),
-        torch.randn([1, 6, 1, K], dtype=dtype, device=device),
-        torch.randn([1, 2, 1, K], dtype=dtype, device=device),
-    ]
-    attn_bias, x = fmha.BlockDiagonalMask.from_tensor_list(list_x)
-
-    linear = torch.nn.Linear(K, K * 3).to(device=device, dtype=dtype)  # type: ignore
-
-    q, k, v = linear(x).reshape([1, -1, 1, 3, K]).unbind(-2)
-    out = fmha.memory_efficient_attention(
-        q, k, v, attn_bias=attn_bias, op=(fmha.ck.FwOp, None)
-    )
-    list_out = attn_bias.split(out)
-    assert tuple(list_out[0].shape) == (1, 3, 1, K)
-
-
-@cuda_only
-class TestAttnBias:
-    @staticmethod
-    def create_tensors(
-        dtype,
-        B: int = 2,
-        Mq: int = 32,
-        Mkv: int = 32,
-        H: int = 3,
-        K: int = 16,
-        Kv: int = 16,
-    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
-        return (
-            torch.randn([B, Mq, H, K], device="cuda", dtype=dtype) * 3,
-            torch.randn([B, Mkv, H, K], device="cuda", dtype=dtype) * 3,
-            torch.randn([B, Mkv, H, Kv], device="cuda", dtype=dtype) * 3,
-            torch.randn([B, H, Mq, Mkv], device="cuda", dtype=dtype) * 3,
-        )
-
-    @staticmethod
-    def pad_bias(bias: torch.Tensor) -> torch.Tensor:
-        align_to = 16
-        if (bias.shape[-1] % align_to) == 0:
-            return bias
-        pad_count = align_to - (bias.shape[-1] % align_to)
-        return torch.nn.functional.pad(bias, [0, pad_count])[:, :, :, : bias.shape[-1]]
-
-    def test_f16_biasf32(self) -> None:
-        q, k, v, bias = self.create_tensors(torch.float16)
-        fmha.memory_efficient_attention(q, k, v, attn_bias=bias)
-        bias = bias.to(torch.float32)
-        with pytest.raises((ValueError, RuntimeError)):
-            fmha.memory_efficient_attention(q, k, v, attn_bias=bias)
-
-    def test_f32_biasf16(self) -> None:
-        q, k, v, bias = self.create_tensors(torch.float32)
-        fmha.memory_efficient_attention(q, k, v, attn_bias=bias)
-        bias = bias.to(torch.float16)
-        with pytest.raises((ValueError, RuntimeError)):
-            fmha.memory_efficient_attention(q, k, v, attn_bias=bias)
-
-    @pytest.mark.parametrize("dtype", [torch.float32, torch.float16])
-    def test_wrong_alignment(self, dtype) -> None:
-        op = fmha.cutlass.FwOp
-        q, k, v, bias = self.create_tensors(dtype, Mq=7, Mkv=5)
-        try:
-            fmha.memory_efficient_attention(q, k, v, attn_bias=bias, op=(op, None))
-            return
-        except (ValueError, RuntimeError):
-            pass
-        # This case is not supported, likely due to padding issues
-        # Let's make sure it works with padding
-        assert bias.ndim == 4, bias.shape
-        bias_padded = self.pad_bias(bias)
-        out = fmha.memory_efficient_attention(
-            q, k, v, attn_bias=bias_padded, op=(op, None)
-        ).float()
-        ref_out = ref_attention_bmhk(q, k, v, bias)
-        assert_allclose(
-            out, ref_out, atol=op.ERROR_ATOL[dtype], rtol=op.ERROR_RTOL[dtype]
-        )
-
-    def test_permuted_attn_bias(self) -> None:
-        op = fmha.cutlass.FwOp
-        dtype = torch.float16
-        q, k, v, bias = self.create_tensors(dtype, Mq=7, Mkv=7)
-        bias = bias.transpose(-1, -2)  # now `stride(-1) != 1`
-        # Either it works, or it raises an exception
-        # but we should never get a CUDA error
-        try:
-            out = fmha.memory_efficient_attention(
-                q, k, v, attn_bias=bias, op=(op, None)
-            ).float()
-            ref_out = ref_attention_bmhk(q, k, v, bias)
-            assert_allclose(
-                out, ref_out, atol=op.ERROR_ATOL[dtype], rtol=op.ERROR_RTOL[dtype]
-            )
-        except (ValueError, RuntimeError):
-            pass
-
-
-SM_AND_SHMEM_KBYTES = [
-    # https://docs.nvidia.com/cuda/cuda-c-programming-guide/#features-and-technical-specifications-technical-specifications-per-compute-capability
-    (50, 64),
-    (60, 64),
-    (70, 96),
-    (75, 64),
-    (80, 163),
-    (86, 99),
-    (89, 99),
-    # (90, 227),
-]
-
-
-@cuda_only
-@pytest.mark.parametrize("dtype_str", ["f32", "f16", "bf16"])
-@pytest.mark.parametrize(
-    "sm_shmem",
-    SM_AND_SHMEM_KBYTES,
-    ids=[f"cc{sm}_shmem{shmem}kb" for sm, shmem in SM_AND_SHMEM_KBYTES],
-)
-def test_has_kernel_for(sm_shmem: Tuple[int, int], dtype_str: str) -> None:
-    dtype = {"f32": torch.float, "f16": torch.half, "bf16": torch.bfloat16}[dtype_str]
-    sm, shmem_kbytes = sm_shmem
-    if sm < 80 and dtype_str == "bf16":
-        return
-
-    for k in [16, 32, 64, 128, 256]:
-        assert torch.ops.xformers._has_cutlassF_kernel_for(
-            dtype, sm, shmem_kbytes * 1024, k
-        ), f"k={k}"
-        assert torch.ops.xformers._has_cutlassB_kernel_for(
-            dtype, sm, shmem_kbytes * 1024, k
-        ), f"k={k}"
-
-
-def test_window_size_materialize() -> None:
-    seqlens = [4, 6]
-    attn_bias = fmha.attn_bias.BlockDiagonalMask.from_seqlens(
-        q_seqlen=seqlens,
-        kv_seqlen=seqlens,
-    ).make_local_attention(2)
-    mask = attn_bias.materialize(
-        (1, 1, sum(seqlens), sum(seqlens)),
-        device="cpu",
-        dtype=torch.float32,
-    )
-    true_mask = torch.log(
-        torch.Tensor(
-            [
-                [
-                    [
-                        [1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
-                        [1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
-                        [0.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
-                        [0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
-                        [0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0],
-                        [0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0],
-                        [0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 0.0],
-                        [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 0.0, 0.0],
-                        [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 0.0],
-                        [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0],
-                    ]
-                ]
-            ]
-        )
-    )
-    assert torch.all(mask == true_mask)
-
-
-@cuda_only
-@pytest.mark.parametrize(
-    "opFW_biasT",
-    [
-        (op, biasT)
-        for op in ALL_FW_OPS
-        for biasT in op.SUPPORTED_ATTN_BIAS_TYPES
-        if op.SUPPORTS_BMGHK
-    ],
-)
-def test_forward_gqa(opFW_biasT):
-    opFW, biasT = opFW_biasT
-    B_Mq_Mkv_H_K_Kv = (3, 512, 512, 16, 128, 128)
-    test_forward(
-        (
-            opFW,
-            "cuda",
-            torch.float16,
-            biasT,
-            *B_Mq_Mkv_H_K_Kv,
-        ),
-        packed=False,
-        fmt="BMGHK",
-        g=2,
-    )
-
-
-@cuda_only
-@pytest.mark.parametrize(
-    "opBW",
-    [
-        fmha.flash.BwOp,
-        fmha.cutlass.BwOp,
-    ],
-)
-def test_backward_gqa(opBW):
-    H = 8
-    B_Mq_Mkv_H_K_Kv = (3, 512, 512, H, 128, 128)
-    dtype = torch.float16
-    query, key, value, attn_bias = create_tensors(
-        *(opBW, "cuda", dtype, type(None), *B_Mq_Mkv_H_K_Kv),
-        attn_bias_requires_grad=False,
-        fmt="BMHK",
-    )
-    op = (fmha.cutlass.FwOp, opBW)
-    key = key[:, :, :1].expand(-1, -1, H, -1)
-    value = value[:, :, :1].expand(-1, -1, H, -1)
-    key.requires_grad_(True)
-    out = fmha.memory_efficient_attention(query, key, value, attn_bias=attn_bias)
-    out_ref = ref_attention_bmhk(query, key, value, attn_bias=attn_bias)
-    assert_allclose(
-        out.float(),
-        out_ref.float(),
-        atol=op[0].ERROR_ATOL[dtype],
-        rtol=op[0].ERROR_RTOL[dtype],
-    )
-    out.backward(query)
-    dk = key.grad
-    key.grad = None
-    out_ref.backward(query)
-    assert_allclose(
-        dk.float(),
-        key.grad.float(),
-        atol=op[1].ERROR_ATOL[dtype],
-        rtol=op[1].ERROR_RTOL[dtype],
-    )
-
-
-@cuda_only
-@pytest.mark.parametrize("opFW", [op for op in ALL_FW_OPS if op.SUPPORTS_BMGHK])
-def test_forward_gqa_one_group(opFW):
-    dtype = torch.float16
-    B, Mq, Mkv, H, K = 3, 13, 16, 5, 128
-    q = torch.randn([B, Mq, 1, H, K], dtype=dtype, device="cuda") * 3
-    k = torch.randn([B, Mkv, 1, H, K], dtype=dtype, device="cuda") * 3
-    v = torch.randn([B, Mkv, 1, H, K], dtype=dtype, device="cuda") * 3
-
-    supported = opFW.supports(fmha.Inputs(q, k, v))
-    if not supported:
-        supported_bmhk = opFW.supports(fmha.Inputs(q[:, :, 0], k[:, :, 0], v[:, :, 0]))
-        assert supported == supported_bmhk
-        pytest.skip("not supported")
-    out = fmha.memory_efficient_attention_forward(q, k, v, op=opFW)
-    ref = ref_attention(q, k, v)
-    assert_allclose(
-        out.float(),
-        ref,
-        atol=opFW.ERROR_ATOL[dtype],
-        rtol=opFW.ERROR_RTOL.get(dtype, 1e-5),
-    )
-
-
-"""
-@sm80_or_better_only
-def test_flash_gqa_wrong_strides() -> None:
-    op = (fmha.flash.FwOp, None)
-    device = "cuda"
-    B, Mq, Mkv, G, H, K = 3, 1, 512, 2, 8, 128
-    q = torch.empty((B, Mq, G, H, K), dtype=torch.float16, device=device)
-    kv = torch.empty((B, Mkv, G, H, K), dtype=torch.float16, device=device)
-    fmha.memory_efficient_attention(q, kv, kv, op=op)
-
-    kv = torch.empty((B, Mkv, H, G, K), dtype=torch.float16, device=device).permute(
-        0, 1, 3, 2, 4
-    )
-    with pytest.raises(ValueError):
-        fmha.memory_efficient_attention(q, kv, kv, op=op)
-
-    kv = torch.empty((B, Mkv, G, 1, K), dtype=torch.float16, device=device)
-    with pytest.raises(ValueError):
-        fmha.memory_efficient_attention(q, kv, kv, op=op)
-    kv = kv.expand(-1, -1, -1, H, K)
-    fmha.memory_efficient_attention(q, kv, kv, op=op)
-
-    kv = torch.empty((B, Mkv, G, H, 2 * K), dtype=torch.float16, device=device)[
-        :, :, :, :, :K
-    ]
-    fmha.memory_efficient_attention(q, kv, kv, op=op)
-"""
-
-
-def _dispatches_to_splitK(q, kv):
-    return (
-        _dispatch_fw_priority_list(fmha.Inputs(q, kv, kv), False)[0]
-        is fmha.triton_splitk.FwOp
-    )
-
-
-def _dispatches_to_flash_decoding(q, kv):
-    return (
-        _dispatch_fw_priority_list(fmha.Inputs(q, kv, kv), False)[0] is fmha.flash.FwOp
-    )
-
-
-def test_dispatch_decoding_bmhk() -> None:
-    assert not _dispatches_to_splitK(
-        torch.empty([1, 8, 1, 128]), torch.empty([1, 2048, 1, 128])
-    ), "Should not use SplitK with 1 head (no tensorcores)"
-    assert _dispatches_to_flash_decoding(
-        torch.empty([1, 8, 32, 128]),
-        torch.empty([1, 2048, 1, 128]).expand(-1, -1, 32, -1),
-    ), "Should use Flash-Decoding with BMHK MQA"
-    assert not _dispatches_to_splitK(
-        torch.empty([1, 8, 32, 128]),
-        torch.empty([1, 2048, 32, 128]),
-    ), "Should not use SplitK when no TensorCores"
-    assert not _dispatches_to_splitK(
-        torch.empty([1, 128, 32, 128]),
-        torch.empty([1, 2048, 1, 128]).expand(-1, -1, 32, -1),
-    ), "Should not use SplitK if q seqlen is long"
-    assert not _dispatches_to_splitK(
-        torch.empty([128, 8, 32, 128]),
-        torch.empty([128, 2048, 1, 128]).expand(-1, -1, 32, -1),
-    ), "Should not use SplitK if B is big"
-
-
-def test_dispatch_decoding_bmghk() -> None:
-    assert not _dispatches_to_splitK(
-        torch.empty([1, 8, 1, 1, 128]), torch.empty([1, 2048, 1, 1, 128])
-    ), "Should not use SplitK with 1 head (no tensorcores)"
-    assert _dispatches_to_flash_decoding(
-        torch.empty([1, 8, 1, 32, 128]),
-        torch.empty([1, 2048, 1, 1, 128]).expand(-1, -1, -1, 32, -1),
-    ), "Should use Flash-Decoding with MQA"
-    assert _dispatches_to_flash_decoding(
-        torch.empty([1, 8, 4, 32, 128]),
-        torch.empty([1, 2048, 4, 1, 128]).expand(-1, -1, -1, 32, -1),
-    ), "Should use Flash-Decoding with GQA"
-    assert not _dispatches_to_splitK(
-        torch.empty([1, 8, 1, 32, 128]),
-        torch.empty([1, 2048, 1, 32, 128]),
-    ), "Should not use SplitK when no TensorCores"
-    assert not _dispatches_to_splitK(
-        torch.empty([1, 128, 1, 32, 128]),
-        torch.empty([1, 2048, 1, 1, 128]).expand(-1, -1, -1, 32, -1),
-    ), "Should not use SplitK if q seqlen is long"
-    assert not _dispatches_to_splitK(
-        torch.empty([128, 8, 1, 32, 128]),
-        torch.empty([128, 2048, 1, 1, 128]).expand(-1, -1, -1, 32, -1),
-    ), "Should not use SplitK if B is big"
-
-
-shapes_triton_splitk = [
-    (1, 8, 2**16, 1, 128, 128),
-    (1, 4, 2**16, 1, 128, 128),
-    (1, 16, 2**16, 1, 128, 128),
-    (1, 16, 2**16, 1, 32, 32),
-    (1, 8, 1025, 1, 128, 128),
-    (2, 8, 4096, 1, 128, 128),
-    (10, 8, 2**16, 1, 128, 128),
-    (10, 15, 2**16, 1, 128, 128),
-    (1, 3, 2**16, 1, 128, 128),
-    (1, 3, 2**16 - 10, 1, 128, 128),
-    (2, 3, 73, 1, 128, 128),
-    (2, 7, 7328, 1, 128, 128),
-    (2, 7, 7328, 1, 120, 120),
-    (2, 7, 63, 1, 120, 120),
-]
-op_device_dtype_biasT_B_Mq_Mkv_H_K_Kv_splitk = [
-    (fmha.triton_splitk.FwOp, "cuda", torch.float16, type(None), *s)
-    for s in shapes_triton_splitk
-] + [
-    (fmha.triton_splitk.FwOp, "cuda", torch.bfloat16, type(None), *s)
-    for s in shapes_triton_splitk
-]
-
-
-@pytest.mark.parametrize(
-    "opFW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv",
-    op_device_dtype_biasT_B_Mq_Mkv_H_K_Kv_splitk,
-    ids=[make_id(*c) for c in op_device_dtype_biasT_B_Mq_Mkv_H_K_Kv_splitk],
-)
-@cuda_only
-def test_forward_splitk(
-    opFW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv,
-    packed=False,
-    fmt="BMHK",
-):
-    test_forward(opFW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv, packed=packed, fmt=fmt)
-
-
-@cuda_only
-@pytest.mark.parametrize("op", [fmha.triton_splitk.FwOp])
-@pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16])
-@pytest.mark.parametrize(
-    "B_Mkv_H_K",
-    [
-        (1, 2**16, 3, 128),
-        (5, 53, 4, 64),
-    ],
-)
-def test_mqa_decoding(op: Type[fmha.AttentionFwOpBase], dtype, B_Mkv_H_K):
-    B, Mkv, H, K = B_Mkv_H_K
-    q = torch.randn([B, 1, H, K], dtype=dtype, device="cuda") * 3
-    k = torch.randn([B, Mkv, 1, K], dtype=dtype, device="cuda") * 3
-    v = torch.randn([B, Mkv, 1, K], dtype=dtype, device="cuda") * 3
-    k = k.expand(-1, -1, H, -1)
-    v = v.expand(-1, -1, H, -1)
-
-    if not op.supports(fmha.Inputs(q, k, v)):
-        pytest.skip("not supported")
-    out = fmha.memory_efficient_attention_forward(q, k, v, op=op)
-    ref = ref_attention(q, k, v)
-    assert_allclose(
-        out.float(),
-        ref,
-        atol=op.ERROR_ATOL[dtype],
-        rtol=op.ERROR_RTOL.get(dtype, 1e-5),
-    )
-
-
-@parametrize_opFW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv__xs
-def test_empty_tensors_empty_query(
-    opFW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv,
-):
-    query, key, value, attn_bias = create_tensors(
-        *opFW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv,
-        fmt="BMHK",
-    )
-    opFW = opFW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv[0]
-
-    query = query[:, :0]
-    query.requires_grad_(True)
-    key.requires_grad_(True)
-    value.requires_grad_(True)
-    out = xformers.ops.memory_efficient_attention(query, key, value, op=(opFW, None))
-    assert out.shape[1] == 0
-    out.backward(out)
-    # dK/dV should be all zeros
-    assert_allclose(key.grad, torch.zeros_like(key.grad), "key.grad")
-    assert_allclose(value.grad, torch.zeros_like(value.grad), "value.grad")
-
-
-@parametrize_opFW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv__xs
-def test_empty_tensors_empty_kv(
-    opFW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv,
-):
-    query, key, value, attn_bias = create_tensors(
-        *opFW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv,
-        fmt="BMHK",
-    )
-    opFW = opFW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv[0]
-
-    key = key[:, :0]
-    value = value[:, :0]
-    query.requires_grad_(True)
-    key.requires_grad_(True)
-    value.requires_grad_(True)
-    out = xformers.ops.memory_efficient_attention(query, key, value, op=(opFW, None))
-    assert_allclose(out, torch.zeros_like(out), "out")
-    out.backward(out)
-    # dQ should be all zeros
-    assert_allclose(query.grad, torch.zeros_like(query.grad), "query.grad")
-
-
-@parametrize_opFW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv__xs
-def test_empty_tensors_empty_b(
-    opFW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv,
-):
-    query, key, value, attn_bias = create_tensors(
-        *opFW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv,
-        fmt="BMHK",
-    )
-    opFW = opFW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv[0]
-
-    query, key, value = query[:0], key[:0], value[:0]
-    query.requires_grad_(True)
-    key.requires_grad_(True)
-    value.requires_grad_(True)
-    out = xformers.ops.memory_efficient_attention(query, key, value, op=(opFW, None))
-    out.backward(out)
-
-
-def test_local_attn_bias() -> None:
-    mask = (
-        fmha.attn_bias.LocalAttentionFromBottomRightMask(window_left=1, window_right=2)
-        .materialize(shape=(4, 4))
-        .exp()
-    )
-
-    expected = torch.tensor(
-        [[1, 1, 1, 0], [1, 1, 1, 1], [0, 1, 1, 1], [0, 0, 1, 1]], dtype=torch.float32
-    )
-    assert (mask == expected).all().item()
-
-
-@cuda_only
-@pytest.mark.parametrize("cc", [60, 70, 80])
-@pytest.mark.parametrize("maxK", [32, 64, 128, 256])
-@pytest.mark.parametrize("dtype", [torch.float32, torch.float16])
-@pytest.mark.parametrize(
-    "custom_mask_type",
-    [
-        fmha.cutlass._CustomMaskType.NoCustomMask,
-        fmha.cutlass._CustomMaskType.CausalFromTopLeft,
-        fmha.cutlass._CustomMaskType.CausalFromBottomRight,
-    ],
-)
-@pytest.mark.parametrize("window_size", [0, 3, 300])
-@pytest.mark.parametrize(
-    "num_queries,num_keys",
-    [
-        (30, 66),
-        (256, 256),
-        # Edge cases
-        (314, 320),
-        (32, 256),
-        (224, 226),
-        (5, 531),
-        (320, 332),  # for win_size=300
-        # Others
-        (256, 62),
-        (256, 63),
-        (256, 64),
-        (256, 65),
-        (256, 66),
-    ],
-)
-def test_cutlassB_iter_order(
-    dtype,
-    cc: int,
-    maxK: int,
-    num_queries: int,
-    num_keys: int,
-    custom_mask_type,
-    window_size,
-) -> None:
-    """
-    This tests some internals of the cutlassB kernel
-    We test the iteration across blocks of [queries, keys] to ensure
-    that we correctly:
-    * Iterate over all the blocks that should be iterated
-    * Do *not* iterate over blocks that are completely masked out
-    * Correctly compute the number of parallel blocks that will compute
-        the same block of dQ
-    .. and we test this across variable causal masks+local attention combinations
-    """
-    if (
-        window_size > 0
-        and custom_mask_type == fmha.cutlass._CustomMaskType.NoCustomMask
-    ):
-        pytest.skip("LocalAttention is only supported for causal")
-    get_iteration_data = partial(
-        torch.ops.xformers._cutlassB_iteration_data,
-        dtype=dtype,
-        cc=cc,
-        maxK=maxK,
-        num_queries=num_queries,
-        num_keys=num_keys,
-        custom_mask_type=custom_mask_type,
-        window_size=window_size,
-    )
-    bias = torch.zeros([num_queries, num_keys], dtype=torch.float32)
-    if custom_mask_type != fmha.cutlass._CustomMaskType.NoCustomMask:
-        bias = fmha.attn_bias._materialize_causal_mask(
-            (num_queries, num_keys),
-            dtype=torch.float32,
-            device="cpu",
-            window_size=None if window_size == 0 else window_size,
-            from_bottomright=(
-                custom_mask_type == fmha.cutlass._CustomMaskType.CausalFromBottomRight
-            ),
-        )
-
-    block_queries, block_keys = get_iteration_data()[:2]
-    mask_pooled = (
-        F.max_pool2d(bias.unsqueeze(0), (block_queries, block_keys), ceil_mode=True)
-        == 0
-    ).int()[0]
-    attn_computed = torch.zeros_like(mask_pooled)
-    for key_start in range(0, num_keys, block_keys):
-        it = 0
-        new_key_start = key_start
-        new_query_start = get_iteration_data(key_start=key_start)[2]
-        try:
-            expected_first_query = (
-                mask_pooled[:, key_start // block_keys].tolist().index(1)
-                * block_queries
-            )
-            assert (
-                new_query_start == expected_first_query
-            ), f"Wrong first query for K={key_start}: {new_query_start} (expected {expected_first_query})"
-        except ValueError:  # Nothing to compute in this column
-            pass
-
-        while new_key_start == key_start and new_query_start < num_queries:
-            query_start = new_query_start
-            attn_computed[query_start // block_queries, key_start // block_keys] += 1
-            # print(f"Compute [{query_start}, {key_start}]")
-
-            # Is there something to compute here?
-            assert mask_pooled[
-                query_start // block_queries, key_start // block_keys
-            ].item(), "Computing a block that is not needed!"
-            new_query_start, new_key_start = get_iteration_data(
-                key_start=key_start, query_start=query_start
-            )[3:5]
-            it += 1
-            assert it < num_queries, ""
-        assert (attn_computed == mask_pooled)[
-            :, key_start // block_keys
-        ].all(), "some blocks were not computed!"
-
-    # Now check that the number returned by `getNumParallelBlocksForQuery` is correct
-    for query_start in range(0, num_queries, block_queries):
-        num_parallel_blocks = get_iteration_data(
-            query_start=query_start, num_splits_key=num_keys
-        )[5]
-        num_actual = mask_pooled[query_start // block_queries].sum().item()
-        assert num_parallel_blocks == num_actual
-
-
-# end of file
diff --git a/tests/test_mqa_forward_ck_tiled_discarded.py b/tests/test_mqa_forward_ck_tiled_discarded.py
deleted file mode 100644
index c40bd5708..000000000
--- a/tests/test_mqa_forward_ck_tiled_discarded.py
+++ /dev/null
@@ -1,212 +0,0 @@
-# Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
-#
-# This source code is licensed under the BSD license found in the
-# LICENSE file in the root directory of this source tree.
-
-from typing import Sequence, Type, TypeVar
-
-import pytest
-import torch
-
-import xformers.ops
-from xformers.attn_bias_utils import create_attn_bias
-from xformers.ops import fmha
-from xformers.ops.common import get_xformers_operator
-
-from .utils import assert_allclose
-
-torch.backends.cuda.matmul.allow_tf32 = False
-cuda_only = pytest.mark.skipif(not torch.cuda.is_available(), reason="requires CUDA")
-
-_devices = ["cpu", "cuda"] if torch.cuda.is_available() else ["cpu"]
-_types = [torch.float16, torch.bfloat16]
-
-T = TypeVar(
-    "T", Type[fmha.common.AttentionFwOpBase], Type[fmha.common.AttentionBwOpBase]
-)
-
-ALL_FW_OPS: Sequence[Type[fmha.common.AttentionFwOpBase]] = [
-    fmha.ck.FwOp,
-]
-
-# ck_check_op is temporarily used to check ck-tiled availability
-ck_check_op = get_xformers_operator("is_ck_tiled_used")
-use_ck_tiled = ck_check_op()
-
-
-def ref_attention(
-    q, k, v, attn_bias=None, drop_mask=None, p=0.0, scale=None, dtype=None
-):
-    if q.ndim == 4:
-        B, M, Hq, K = q.shape
-        _, N, Hkv, Kv = v.shape
-        nhead_ratio_qk = Hq // Hkv
-
-        def attn_bias_head(head: int):
-            if isinstance(attn_bias, torch.Tensor):
-                assert attn_bias.ndim == 4
-                _, H, _, _ = attn_bias.shape
-                assert H == Hq
-                bias_bghmn = attn_bias.reshape(B, Hkv, nhead_ratio_qk, M, N)
-                return bias_bghmn[:, :, head]
-            if isinstance(attn_bias, fmha.attn_bias.LowerTriangularMaskWithTensorBias):
-                assert attn_bias._bias.ndim == 4
-                _, H, _, _ = attn_bias._bias.shape
-                assert H == Hq
-                bias_bghmn = attn_bias._bias.reshape(B, Hkv, nhead_ratio_qk, M, N)
-
-                return fmha.attn_bias.LowerTriangularMaskWithTensorBias(
-                    bias_bghmn[:, :, head]
-                )
-            return attn_bias
-
-        q_bmghk = q.reshape((B, M, Hkv, nhead_ratio_qk, K))
-
-        return torch.stack(
-            [
-                ref_attention_bmhk(
-                    q_bmghk[:, :, :, h], k, v, attn_bias=attn_bias_head(h), dtype=dtype
-                )
-                for h in range(q_bmghk.shape[3])
-            ],
-            dim=3,
-        ).reshape((B, M, Hq, Kv))
-
-    assert q.ndim == 3
-    if dtype is None:
-        dtype = torch.float32
-    q = q.to(dtype=dtype)
-    k = k.to(dtype=dtype)
-    v = v.to(dtype=dtype)
-
-    scale = scale if scale is not None else (q.shape[-1] ** -0.5)
-    q = q * scale
-
-    attn = q @ k.transpose(-2, -1)
-    if attn_bias is not None:
-        if isinstance(attn_bias, xformers.ops.AttentionBias):
-            # Always create in B,H,Mq,Mk format
-            attn_bias_tensor = attn_bias.materialize(
-                (q.shape[0], 1, q.shape[1], k.shape[1]),
-                device=q.device,
-                dtype=dtype,
-            )
-        else:
-            attn_bias_tensor = attn_bias.to(dtype=dtype)
-        if attn_bias_tensor.ndim == 4:
-            assert q.shape[0] == attn_bias_tensor.shape[0] * attn_bias_tensor.shape[1]
-            attn_bias_tensor = attn_bias_tensor.reshape(
-                [-1, *attn_bias_tensor.shape[2:]]
-            )
-        attn = attn + attn_bias_tensor
-    attn = attn.softmax(-1)
-    if drop_mask is not None:
-        attn = attn * (drop_mask / (1 - p))
-    return attn @ v
-
-
-def ref_attention_bmhk(q, k, v, attn_bias, scale=None, dtype=None) -> torch.Tensor:
-    assert q.ndim == 4
-
-    def T(t):
-        return t.permute((0, 2, 1, 3)).reshape(
-            [t.shape[0] * t.shape[2], t.shape[1], t.shape[3]]
-        )
-
-    if isinstance(attn_bias, xformers.ops.AttentionBias):
-        attn_bias = attn_bias.materialize(
-            (q.shape[0], q.shape[2], q.shape[1], k.shape[1]),
-            device=q.device,
-            dtype=torch.float32,
-        ).reshape([q.shape[0] * q.shape[2], q.shape[1], k.shape[1]])
-    out = ref_attention(T(q), T(k), T(v), attn_bias, scale=scale, dtype=dtype)
-    out = out.reshape([q.shape[0], q.shape[2], q.shape[1], v.shape[3]])
-    return out.permute((0, 2, 1, 3))
-
-
-@pytest.mark.parametrize("hdim_k,hdim_v", [(64, 64), (128, 128)])
-@pytest.mark.parametrize("nhead_q,nhead_kv", [(8, 1), (8, 2), (12, 4), (4, 4)])
-@pytest.mark.parametrize("seqlen_q,seqlen_kv", [(100, 128), (128, 100), (200, 1000)])
-@pytest.mark.parametrize("batches", [100, 64, 1])
-@pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16])
-@pytest.mark.parametrize(
-    "attn_bias_type", [type(None), torch.Tensor, fmha.attn_bias.LowerTriangularMask]
-)
-@pytest.mark.parametrize("op", ALL_FW_OPS)
-def test_mqa_forward(
-    op,
-    attn_bias_type,
-    dtype,
-    batches: int,
-    seqlen_kv: int,
-    seqlen_q: int,
-    nhead_kv: int,
-    nhead_q: int,
-    hdim_v: int,
-    hdim_k: int,
-):
-    B = batches
-    M = seqlen_q
-    N = seqlen_kv
-    Hq = nhead_q
-    Hkv = nhead_kv
-    K = hdim_k
-    Kv = hdim_v
-    nhead_ratio_qk = Hq // Hkv
-
-    device = torch.device("cuda")
-
-    if not use_ck_tiled:
-        pytest.skip("mqa/gqa is only supported with ck-tiled")
-
-    torch.manual_seed(B * M + N * K + Hq * Hkv + Kv)
-
-    scale = 3
-    query = torch.randn((B, M, Hq, K), device=device, dtype=dtype).mul_(scale)
-    key = torch.randn((B, N, Hkv, K), device=device, dtype=dtype).mul_(scale)
-    value = torch.randn((B, N, Hkv, Kv), device=device, dtype=dtype).mul_(scale)
-
-    attn_bias = None
-    if attn_bias_type is not None:
-        attn_bias = create_attn_bias(
-            attn_bias_type,
-            batch_size=B,
-            num_heads=Hq,
-            num_heads_groups=nhead_ratio_qk,
-            q_len=M,
-            kv_len=N,
-            dtype=dtype,
-            device=device,
-            requires_grad=False,
-            fmt="BMHK",
-            op=op,
-        )
-
-    inputs = fmha.Inputs(query=query, key=key, value=value, attn_bias=attn_bias)
-    reasons = op.not_supported_reasons(inputs)
-    if reasons:
-        err_msg = f"{op.NAME}: unsupported ({'/'.join(reasons)})"
-        # Ensure we free memory to avoid OOMs
-        del query, key, value, attn_bias, inputs
-        assert False, err_msg
-
-    out = xformers.ops.memory_efficient_attention_forward(
-        query, key, value, attn_bias, op=op
-    )
-    assert not out.isnan().any(), ("Output has NaNs", attn_bias)
-    out2 = xformers.ops.memory_efficient_attention_forward(
-        query, key, value, attn_bias, op=op
-    )
-    assert torch.allclose(out, out2, atol=0.0, rtol=0.0), (
-        "Non-deterministic behavior",
-        attn_bias,
-    )
-
-    ref = ref_attention(query, key, value, attn_bias)
-    assert out.shape == ref.shape, out.shape
-    assert_allclose(
-        out.float(),
-        ref,
-        atol=op.ERROR_ATOL[dtype],
-        rtol=op.ERROR_RTOL.get(dtype, 1e-5),
-    )
diff --git a/xformers/csrc/attention/hip_fmha/attention_backward_generic.cpp b/xformers/csrc/attention/hip_fmha/attention_backward_generic.cpp
deleted file mode 100644
index 4a4a06d71..000000000
--- a/xformers/csrc/attention/hip_fmha/attention_backward_generic.cpp
+++ /dev/null
@@ -1,573 +0,0 @@
-/*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
- *
- * This source code is licensed under the BSD-style license found in the
- * LICENSE file in the root directory of this source tree.
- */
-#include <cmath>
-#include <cstdlib>
-
-#include <ATen/Context.h>
-#include <ATen/ScalarOps.h>
-#include <ATen/Tensor.h>
-#include <ATen/TensorOperators.h>
-#include <c10/cuda/CUDAGuard.h>
-#include <torch/library.h>
-
-#include "ck_fmha_params.h"
-#include "ck_fmha_util.h"
-
-extern void batched_backward_fp16(
-    BatchedBackwardParams& param,
-    hipStream_t stream);
-extern void batched_backward_bp16(
-    BatchedBackwardParams& param,
-    hipStream_t stream);
-extern void grouped_backward_fp16(
-    GroupedBackwardParams& param,
-    hipStream_t stream);
-extern void grouped_backward_bp16(
-    GroupedBackwardParams& param,
-    hipStream_t stream);
-
-namespace {
-
-std::tuple<at::Tensor, at::Tensor, at::Tensor, at::Tensor>
-efficient_attention_backward_ck(
-    const at::Tensor& grad_out,
-    const at::Tensor& query,
-    const at::Tensor& key,
-    const at::Tensor& value,
-    const c10::optional<at::Tensor>& bias, // additive attention bias
-    // (Mode 1MHK only) [b+1]: cu_seqlens_q[b] contains the
-    // position of the first query token for batch $b
-    const c10::optional<at::Tensor>& seqstart_q,
-    // (Mode 1MHK only) [b+1]: cu_seqlens_k[b] contains the
-    // position of the first key token for batch $b
-    const c10::optional<at::Tensor>& seqstart_k,
-    // (Mode 1MHK only) Maximum sequence length across batches
-    const c10::optional<int64_t> max_seqlen_q_,
-    const c10::optional<at::Tensor>& seqlen_k,
-    const at::Tensor& logsumexp,
-    const at::Tensor& out,
-    double dropout_p, // dropout probability
-    int64_t rng_seed, // seed using for generating random numbers for dropout
-    int64_t rng_offset, // offset into random number sequence
-    int64_t custom_mask_type,
-    const c10::optional<double> scale) {
-#ifdef XFORMERS_MEM_EFF_ATTENTION_DISABLE_BACKWARD
-  TORCH_CHECK(
-      false,
-      "MemoryEfficient build has been disabled at build time with "
-      "-DXFORMERS_MEM_EFF_ATTENTION_DISABLE_BACKWARD");
-#else
-  at::globalContext().alertNotDeterministic(
-      "mem_efficient_attention_backward_cutlass");
-
-  // ndim
-  TORCH_CHECK(query.dim() == grad_out.dim());
-  TORCH_CHECK(query.dim() == key.dim());
-  TORCH_CHECK(query.dim() == value.dim());
-  TORCH_CHECK(query.dim() == 4);
-
-  // batch size
-  TORCH_CHECK(query.size(0) == grad_out.size(0));
-  TORCH_CHECK(query.size(0) == key.size(0));
-  TORCH_CHECK(query.size(0) == value.size(0));
-
-  // seqlen
-  TORCH_CHECK(key.size(1) == value.size(1));
-  TORCH_CHECK(query.size(1) == grad_out.size(1));
-
-  // Num heads
-  TORCH_CHECK(query.size(2) % key.size(2) == 0);
-  TORCH_CHECK(key.size(2) == value.size(2));
-  TORCH_CHECK(query.size(2) == grad_out.size(2));
-
-  // Embedding per head
-  TORCH_CHECK(query.size(3) == key.size(3));
-  TORCH_CHECK(value.size(3) == grad_out.size(3));
-
-  // CK-FlashAttn requires out, grad_out to have same shapes
-  TORCH_CHECK(out.sizes() == grad_out.sizes());
-  TORCH_CHECK(out.strides() == grad_out.strides());
-
-  // last dim is contiguous, device is CUDA
-  CHECK_NOSPARSE_LASTCONTIGUOUS_CUDA(grad_out);
-  CHECK_NOSPARSE_LASTCONTIGUOUS_CUDA(query);
-  CHECK_NOSPARSE_LASTCONTIGUOUS_CUDA(key);
-  CHECK_NOSPARSE_LASTCONTIGUOUS_CUDA(value);
-
-  // logsumexp should be completely contiguous
-  CHECK_NOSPARSE_CONTIGUOUS_CUDA(logsumexp);
-
-  TORCH_CHECK(seqstart_q.has_value() == seqstart_k.has_value());
-  TORCH_CHECK(
-      !(seqstart_q.has_value() && bias.has_value()),
-      "seqstart_q + bias not supported");
-
-  if (seqstart_q.has_value()) {
-    TORCH_CHECK(seqstart_q->scalar_type() == at::ScalarType::Int);
-    TORCH_CHECK(seqstart_k->scalar_type() == at::ScalarType::Int);
-    TORCH_CHECK(seqstart_q->dim() == 1 && seqstart_k->dim() == 1);
-    CHECK_NOSPARSE_CONTIGUOUS_CPU((*seqstart_q));
-    CHECK_NOSPARSE_CONTIGUOUS_CPU((*seqstart_k));
-    TORCH_CHECK(seqstart_q->size(0) == seqstart_k->size(0));
-    TORCH_CHECK(query.size(0) == 1, "seqstart_q only supports batch_size=1");
-    TORCH_CHECK(max_seqlen_q_.has_value());
-  }
-
-  bool use_fp32_qkv_grad = false;
-
-  if (const char* env_str = std::getenv("USE_FP32_QKV_GRAD")) {
-    use_fp32_qkv_grad = (std::stoi(env_str) > 0) ? true : false;
-  };
-
-  // at::cuda::CUDAGuard device_guard(query.device());
-  hipStream_t stream = at::cuda::getCurrentHIPStream().stream();
-
-  int64_t B = query.size(0);
-  int64_t M = query.size(1);
-  int64_t N = key.size(1);
-  int64_t Hq = query.size(2);
-  int64_t Hkv = key.size(2);
-  int64_t K = query.size(3);
-  int64_t Kv = value.size(3);
-
-  auto opts = query.options();
-
-  at::Tensor grad_q, grad_k, grad_v, grad_bias;
-
-  if (query.size(1) == key.size(1) && query.size(3) == value.size(3) &&
-      query.size(2) == key.size(2) &&
-      query.storage().is_alias_of(key.storage()) &&
-      query.storage().is_alias_of(value.storage())) {
-    // Create one big contiguous chunk for grad_q, grad_k, grad_v
-    // This is because q, k and v usually come from a single
-    // output of a linear layer that is chunked.
-    // Creating the gradients with the right layout saves us
-    // a `torch.cat` call in the backward pass
-    at::Tensor chunk;
-    if (use_fp32_qkv_grad)
-      chunk = at::empty({B, M, 3, Hq, K}, opts.dtype(at::kFloat));
-    else
-      chunk = at::empty({B, M, 3, Hq, K}, opts);
-    grad_q = chunk.select(2, 0);
-    grad_k = chunk.select(2, 1);
-    grad_v = chunk.select(2, 2);
-    grad_q.fill_(0);
-  } else if (
-      key.size(3) == value.size(3) &&
-      key.storage().is_alias_of(value.storage())) {
-    // Create one big contiguous chunk for grad_k, grad_v
-    // This is because k and v usually come from a single
-    // output of a linear layer that is chunked.
-    // Creating the gradients with the right layout saves us
-    // a `torch.cat` call in the backward pass
-    at::Tensor chunk;
-    if (use_fp32_qkv_grad)
-      chunk = at::empty({B, N, 2, Hkv, Kv}, opts.dtype(at::kFloat));
-    else
-      chunk = at::empty({B, N, 2, Hkv, Kv}, opts);
-    grad_k = chunk.select(2, 0);
-    grad_v = chunk.select(2, 1);
-
-    if (use_fp32_qkv_grad)
-      grad_q = at::empty_strided(
-          query.sizes(), query.strides(), query.options().dtype(at::kFloat));
-    else
-      grad_q =
-          at::empty_strided(query.sizes(), query.strides(), query.options());
-    grad_q.fill_(0);
-  } else {
-    if (use_fp32_qkv_grad) {
-      grad_q = at::empty_strided(
-          query.sizes(), query.strides(), query.options().dtype(at::kFloat));
-      grad_k = at::empty_strided(
-          key.sizes(), key.strides(), key.options().dtype(at::kFloat));
-      grad_v = at::empty_strided(
-          value.sizes(), value.strides(), value.options().dtype(at::kFloat));
-    } else {
-      grad_q =
-          at::empty_strided(query.sizes(), query.strides(), query.options());
-      grad_k = at::empty_strided(key.sizes(), key.strides(), key.options());
-      grad_v =
-          at::empty_strided(value.sizes(), value.strides(), value.options());
-    }
-    grad_q.fill_(0);
-  }
-
-  // CK-FlashAttn requires q/k/v to have same shapes with dQ/dK/dV respectively
-  TORCH_CHECK(query.sizes() == grad_q.sizes());
-  TORCH_CHECK(query.strides() == grad_q.strides());
-  TORCH_CHECK(key.sizes() == grad_k.sizes());
-  TORCH_CHECK(key.strides() == grad_k.strides());
-  TORCH_CHECK(value.sizes() == grad_v.sizes());
-  TORCH_CHECK(value.strides() == grad_v.strides());
-
-  const bool bias_requires_grad = bias.has_value() && bias->requires_grad();
-
-  // even it is an output, the grad_bias is required to use the same data-type
-  // as bias in CK-FlashAttn
-  if (bias_requires_grad)
-    grad_bias =
-        at::empty_strided(bias->sizes(), bias->strides(), bias->options());
-
-  bool is_mqa_gqa = (Hq > Hkv);
-
-  at::Tensor tmp_grad_k, tmp_grad_v;
-
-  if (is_mqa_gqa) {
-    // allocate tmp_grad_k/tmp_grad_v which will be reduce to
-    // grad_k/grad_v for returning
-    if (use_fp32_qkv_grad) {
-      tmp_grad_k = at::empty({B, N, Hq, K}, opts.dtype(at::kFloat));
-      tmp_grad_v = at::empty({B, N, Hq, Kv}, opts.dtype(at::kFloat));
-    } else {
-      tmp_grad_k = at::empty({B, N, Hq, K}, opts);
-      tmp_grad_v = at::empty({B, N, Hq, Kv}, opts);
-    }
-  }
-
-  auto set_batched_backward_params = [&](BatchedBackwardParams& p) {
-    p.B = B;
-    p.M = M;
-    p.N = N;
-    p.Hq = Hq;
-    p.Hkv = Hkv;
-    p.K = K;
-    p.Kv = Kv;
-
-    p.use_fp32_qkv_grad = use_fp32_qkv_grad;
-    p.is_mqa_gqa = is_mqa_gqa;
-
-    TORCH_CHECK(p.B == logsumexp.size(0));
-    TORCH_CHECK(p.Hq == logsumexp.size(1));
-    TORCH_CHECK(p.M == logsumexp.size(2));
-
-    if (scale.has_value()) {
-      p.scale = float(*scale);
-    } else {
-      p.scale = float(1.0 / std::sqrt(float(K)));
-    }
-
-    p.q_ptr = query.data_ptr();
-    p.k_ptr = key.data_ptr();
-    p.v_ptr = value.data_ptr();
-    p.grad_out_ptr = grad_out.data_ptr();
-    p.out_ptr = out.data_ptr();
-
-    p.grad_q_ptr = grad_q.data_ptr();
-    p.grad_k_ptr = is_mqa_gqa ? tmp_grad_k.data_ptr() : grad_k.data_ptr();
-    p.grad_v_ptr = is_mqa_gqa ? tmp_grad_v.data_ptr() : grad_v.data_ptr();
-
-    p.q_strides = {
-        static_cast<int>(query.stride(0)),
-        static_cast<int>(query.stride(1)),
-        static_cast<int>(query.stride(2)),
-        static_cast<int>(query.stride(3))};
-    p.k_strides = {
-        static_cast<int>(key.stride(0)),
-        static_cast<int>(key.stride(1)),
-        static_cast<int>(key.stride(2)),
-        static_cast<int>(key.stride(3))};
-    p.v_strides = {
-        static_cast<int>(value.stride(0)),
-        static_cast<int>(value.stride(1)),
-        static_cast<int>(value.stride(2)),
-        static_cast<int>(value.stride(3))};
-    p.out_strides = {
-        static_cast<int>(out.stride(0)),
-        static_cast<int>(out.stride(1)),
-        static_cast<int>(out.stride(2)),
-        static_cast<int>(out.stride(3))};
-
-    if (is_mqa_gqa) {
-      p.tmp_grad_k_strides = {
-          static_cast<int>(tmp_grad_k.stride(0)),
-          static_cast<int>(tmp_grad_k.stride(1)),
-          static_cast<int>(tmp_grad_k.stride(2)),
-          static_cast<int>(tmp_grad_k.stride(3))};
-      p.tmp_grad_v_strides = {
-          static_cast<int>(tmp_grad_v.stride(0)),
-          static_cast<int>(tmp_grad_v.stride(1)),
-          static_cast<int>(tmp_grad_v.stride(2)),
-          static_cast<int>(tmp_grad_v.stride(3))};
-    }
-
-    if (bias.has_value()) {
-      CHECK_NOSPARSE_LASTCONTIGUOUS_CUDA((*bias));
-      TORCH_CHECK(bias->scalar_type() == query.scalar_type());
-
-      p.has_attn_bias = true;
-      p.attn_bias_ptr = bias->data_ptr();
-
-      const at::Tensor bias_4d_view = get_bias_4d_view(*bias, B, Hq, M, N);
-
-      p.attn_bias_strides = {
-          static_cast<int>(bias_4d_view.stride(0)),
-          static_cast<int>(bias_4d_view.stride(1)),
-          static_cast<int>(bias_4d_view.stride(2)),
-          static_cast<int>(bias_4d_view.stride(3))};
-
-      if (bias_requires_grad)
-        p.grad_bias_ptr = grad_bias.data_ptr();
-    } else {
-      p.has_attn_bias = true;
-      p.attn_bias_ptr = nullptr;
-      p.grad_bias_ptr = nullptr;
-    }
-
-    p.bias_has_grad = bias_requires_grad;
-
-    p.custom_mask_type = custom_mask_type;
-
-    p.dropout_prob = static_cast<float>(dropout_p);
-    p.philox_seed = rng_seed;
-    p.philox_offset = rng_offset;
-
-    p.logsumexp_ptr = logsumexp.data_ptr();
-  };
-
-  auto set_grouped_backward_params = [&](GroupedBackwardParams& p) {
-    p.num_batches = seqstart_q->size(0) - 1;
-    p.M = M;
-    p.N = N;
-    p.Hq = Hq;
-    p.Hkv = Hkv;
-    p.K = K;
-    p.Kv = Kv;
-
-    p.use_fp32_qkv_grad = use_fp32_qkv_grad;
-    p.is_mqa_gqa = is_mqa_gqa;
-
-    p.max_seqlen_q = *max_seqlen_q_;
-
-    TORCH_CHECK(p.num_batches == logsumexp.size(0));
-    TORCH_CHECK(p.Hq == logsumexp.size(1));
-    TORCH_CHECK(p.max_seqlen_q == logsumexp.size(2));
-
-    if (scale.has_value()) {
-      p.scale = float(*scale);
-    } else {
-      p.scale = float(1.0 / std::sqrt(float(K)));
-    }
-
-    p.q_strides = {
-        static_cast<int>(query.stride(1)),
-        static_cast<int>(query.stride(2)),
-        static_cast<int>(query.stride(3))};
-    p.k_strides = {
-        static_cast<int>(key.stride(1)),
-        static_cast<int>(key.stride(2)),
-        static_cast<int>(key.stride(3))};
-    p.v_strides = {
-        static_cast<int>(value.stride(1)),
-        static_cast<int>(value.stride(2)),
-        static_cast<int>(value.stride(3))};
-    p.out_strides = {
-        static_cast<int>(out.stride(1)),
-        static_cast<int>(out.stride(2)),
-        static_cast<int>(out.stride(3))};
-
-    if (is_mqa_gqa) {
-      p.tmp_grad_k_strides = {
-          static_cast<int>(tmp_grad_k.stride(1)),
-          static_cast<int>(tmp_grad_k.stride(2)),
-          static_cast<int>(tmp_grad_k.stride(3))};
-      p.tmp_grad_v_strides = {
-          static_cast<int>(tmp_grad_v.stride(1)),
-          static_cast<int>(tmp_grad_v.stride(2)),
-          static_cast<int>(tmp_grad_v.stride(3))};
-    };
-
-    if (bias.has_value()) {
-      CHECK_NOSPARSE_LASTCONTIGUOUS_CUDA((*bias));
-      TORCH_CHECK(bias->scalar_type() == query.scalar_type());
-
-      p.has_attn_bias = true;
-      const at::Tensor bias_4d_view = get_bias_4d_view(*bias, B, Hq, M, N);
-      p.attn_bias_strides = {
-          static_cast<int>(bias_4d_view.stride(0)),
-          static_cast<int>(bias_4d_view.stride(1)),
-          static_cast<int>(bias_4d_view.stride(2)),
-          static_cast<int>(bias_4d_view.stride(3))};
-    } else
-      p.has_attn_bias = false;
-
-    p.bias_has_grad = bias_requires_grad;
-
-    p.dropout_prob = static_cast<float>(dropout_p);
-    p.philox_seed = rng_seed;
-    p.philox_offset = rng_offset;
-
-    p.custom_mask_type = custom_mask_type;
-
-    p.host_seqstart_q.resize(p.num_batches + 1);
-    p.host_seqstart_k.resize(p.num_batches + 1);
-
-    for (int i = 0; i < p.host_seqstart_q.size(); i++)
-      p.host_seqstart_q[i] =
-          *(reinterpret_cast<int*>(seqstart_q->data_ptr()) + i);
-
-    for (int i = 0; i < p.host_seqstart_k.size(); i++)
-      p.host_seqstart_k[i] =
-          *(reinterpret_cast<int*>(seqstart_k->data_ptr()) + i);
-
-    if (seqlen_k.has_value()) {
-      TORCH_CHECK(seqlen_k->scalar_type() == at::ScalarType::Int);
-      TORCH_CHECK(seqlen_k->dim() == 1);
-      TORCH_CHECK(seqlen_k->size(0) == p.num_batches)
-      CHECK_NOSPARSE_CONTIGUOUS_CPU((*seqlen_k));
-
-      p.host_seqlen_k.resize(p.num_batches);
-
-      for (int i = 0; i < p.host_seqlen_k.size(); i++)
-        p.host_seqlen_k[i] =
-            *(reinterpret_cast<int*>(seqlen_k->data_ptr()) + i);
-    }
-
-    char* q_ptr = reinterpret_cast<char*>(query.data_ptr());
-    char* k_ptr = reinterpret_cast<char*>(key.data_ptr());
-    char* v_ptr = reinterpret_cast<char*>(value.data_ptr());
-
-    char* out_ptr = reinterpret_cast<char*>(out.data_ptr());
-    char* grad_out_ptr = reinterpret_cast<char*>(grad_out.data_ptr());
-    char* attn_bias_ptr =
-        bias.has_value() ? reinterpret_cast<char*>(bias->data_ptr()) : nullptr;
-
-    char* logsumexp_ptr = reinterpret_cast<char*>(logsumexp.data_ptr());
-
-    char* grad_q_ptr = reinterpret_cast<char*>(grad_q.data_ptr());
-    char* grad_k_ptr = is_mqa_gqa
-        ? reinterpret_cast<char*>(tmp_grad_k.data_ptr())
-        : reinterpret_cast<char*>(grad_k.data_ptr());
-    char* grad_v_ptr = is_mqa_gqa
-        ? reinterpret_cast<char*>(tmp_grad_v.data_ptr())
-        : reinterpret_cast<char*>(grad_v.data_ptr());
-    char* grad_bias_ptr = bias_requires_grad
-        ? reinterpret_cast<char*>(grad_bias.data_ptr())
-        : nullptr;
-
-    size_t multiplier = 1;
-
-    if (p.use_fp32_qkv_grad)
-      multiplier = get_size_in_bytes(1, at::ScalarType::Float) /
-          get_size_in_bytes(1, query.scalar_type());
-
-    std::cout << "qkv-grad precision multiplier is " << multiplier << std::endl;
-
-    for (int i = 0; i < p.num_batches; i++) {
-      size_t tmp_q_offset = get_size_in_bytes(
-          static_cast<size_t>(p.host_seqstart_q[i]) * p.q_strides[0],
-          query.scalar_type());
-      size_t tmp_k_offset = get_size_in_bytes(
-          static_cast<size_t>(p.host_seqstart_k[i]) * p.k_strides[0],
-          key.scalar_type());
-      size_t tmp_v_offset = get_size_in_bytes(
-          static_cast<size_t>(p.host_seqstart_k[i]) * p.v_strides[0],
-          value.scalar_type());
-      size_t tmp_o_offset = get_size_in_bytes(
-          static_cast<size_t>(p.host_seqstart_q[i]) * p.out_strides[0],
-          out.scalar_type());
-      size_t tmp_logsumexp_offset = get_size_in_bytes(
-          static_cast<size_t>(i) * p.Hq * p.max_seqlen_q,
-          logsumexp.scalar_type());
-
-      size_t tmp_grad_k_offset = is_mqa_gqa
-          ? get_size_in_bytes(
-                static_cast<size_t>(p.host_seqstart_k[i]) *
-                    p.tmp_grad_k_strides[0],
-                tmp_grad_k.scalar_type())
-          : tmp_k_offset;
-      size_t tmp_grad_v_offset = is_mqa_gqa
-          ? get_size_in_bytes(
-                static_cast<size_t>(p.host_seqstart_k[i]) *
-                    p.tmp_grad_v_strides[0],
-                tmp_grad_v.scalar_type())
-          : tmp_v_offset;
-
-      p.q_ptrs.push_back(reinterpret_cast<void*>(&q_ptr[tmp_q_offset]));
-      p.grad_q_ptrs.push_back(
-          reinterpret_cast<void*>(&grad_q_ptr[tmp_q_offset * multiplier]));
-
-      p.k_ptrs.push_back(reinterpret_cast<void*>(&k_ptr[tmp_k_offset]));
-      p.grad_k_ptrs.push_back(
-          reinterpret_cast<void*>(&grad_k_ptr[tmp_grad_k_offset * multiplier]));
-
-      p.v_ptrs.push_back(reinterpret_cast<void*>(&v_ptr[tmp_v_offset]));
-      p.grad_v_ptrs.push_back(
-          reinterpret_cast<void*>(&grad_v_ptr[tmp_grad_v_offset * multiplier]));
-
-      p.out_ptrs.push_back(reinterpret_cast<void*>(&out_ptr[tmp_o_offset]));
-      p.grad_out_ptrs.push_back(
-          reinterpret_cast<void*>(&grad_out_ptr[tmp_o_offset]));
-
-      p.logsumexp_ptrs.push_back(
-          reinterpret_cast<void*>(&logsumexp_ptr[tmp_logsumexp_offset]));
-
-      if (bias.has_value()) {
-        size_t tmp_bias_offset = get_size_in_bytes(
-            static_cast<size_t>(p.host_seqstart_q[i]) * p.attn_bias_strides[2] +
-                static_cast<size_t>(p.host_seqstart_k[i]) *
-                    p.attn_bias_strides[3],
-            bias->scalar_type());
-
-        p.attn_bias_ptrs.push_back(
-            reinterpret_cast<void*>(&attn_bias_ptr[tmp_bias_offset]));
-
-        if (bias_requires_grad) {
-          p.grad_bias_ptrs.push_back(
-              reinterpret_cast<void*>(&grad_bias_ptr[tmp_bias_offset]));
-        }
-      }
-
-      // ToDO: remove this after dev-op fix
-      p.randvals_ptrs.push_back(nullptr);
-    }
-  };
-
-  auto inDataType = query.scalar_type();
-
-  if (!seqstart_q.has_value()) { // input is batched
-    BatchedBackwardParams batched_backward_params;
-
-    set_batched_backward_params(batched_backward_params);
-
-    if (inDataType == at::ScalarType::Half) {
-      batched_backward_fp16(batched_backward_params, stream);
-    } else if (inDataType == at::ScalarType::BFloat16) {
-      batched_backward_bp16(batched_backward_params, stream);
-    } else
-      throw std::runtime_error("input data-type is not supported");
-  } else { // input is grouped
-    GroupedBackwardParams grouped_backward_params;
-
-    set_grouped_backward_params(grouped_backward_params);
-
-    if (inDataType == at::ScalarType::Half) {
-      grouped_backward_fp16(grouped_backward_params, stream);
-    } else if (inDataType == at::ScalarType::BFloat16) {
-      grouped_backward_bp16(grouped_backward_params, stream);
-    } else
-      throw std::runtime_error("input data-type is not supported");
-  }
-
-  if (is_mqa_gqa) {
-    auto tmp_grad_k_view = tmp_grad_k.unflatten(2, {Hkv, Hq / Hkv});
-    auto tmp_grad_v_view = tmp_grad_v.unflatten(2, {Hkv, Hq / Hkv});
-    grad_k = tmp_grad_k_view.sum(3);
-    grad_v = tmp_grad_v_view.sum(3);
-  }
-
-  return std::make_tuple(grad_q, grad_k, grad_v, grad_bias);
-#endif
-} // namespace
-
-} // namespace
-
-TORCH_LIBRARY_IMPL(xformers, CUDA, m) {
-  m.impl(
-      TORCH_SELECTIVE_NAME("xformers::efficient_attention_backward_ck"),
-      TORCH_FN(efficient_attention_backward_ck));
-}
diff --git a/xformers/csrc/attention/hip_fmha/attention_ck_rand_uniform.cpp b/xformers/csrc/attention/hip_fmha/attention_ck_rand_uniform.cpp
deleted file mode 100644
index ecf73c09b..000000000
--- a/xformers/csrc/attention/hip_fmha/attention_ck_rand_uniform.cpp
+++ /dev/null
@@ -1,125 +0,0 @@
-/*
- * Copyright (c) Meta Platforms, Inc. and affiliates.
- * All rights reserved.
- *
- * This source code is licensed under the BSD-style license found in the
- * LICENSE file in the root directory of this source tree.
- */
-#include <ATen/ATen.h>
-#include <ATen/cuda/CUDAContext.h>
-#include <ATen/cuda/CUDAGeneratorImpl.h>
-#include <c10/core/TensorOptions.h>
-#include <torch/library.h>
-#include <torch/types.h>
-#include <ATen/cuda/CUDAGraphsUtils.cuh>
-
-#include <ck/ck.hpp>
-#include <ck/tensor_operation/gpu/device/gemm_specialization.hpp>
-#include <ck/tensor_operation/gpu/device/tensor_specialization.hpp>
-#include "ck/tensor_operation/gpu/device/impl/device_batched_dropout.hpp"
-
-#include "ck_fmha_util.h"
-
-namespace {
-
-/**
- * generate a tensor with random uniform values. only used for testing, not much
- * attention is paid to performance
- */
-at::Tensor rand_uniform_int(
-    double dropout_prob,
-    const at::Tensor& out_pattern) // [Batches, num_head, query_len, key_len]
-{
-  int B = out_pattern.size(0);
-  int num_heads = out_pattern.size(1);
-  int M = out_pattern.size(2);
-  int N = out_pattern.size(3);
-
-  // at::cuda::CUDAGuard device_guard(out_pattern.device());
-  hipStream_t stream = at::cuda::getCurrentHIPStream().stream();
-
-  at::CUDAGeneratorImpl* gen =
-      at::get_generator_or_default<at::CUDAGeneratorImpl>(
-          c10::nullopt, at::cuda::detail::getDefaultCUDAGenerator());
-
-  at::PhiloxCudaState rng_engine_inputs;
-  {
-    std::lock_guard<std::mutex> lock(gen->mutex_);
-    rng_engine_inputs = gen->philox_cuda_state(B * num_heads * M * N);
-  }
-
-  const auto seeds = at::cuda::philox::unpack(rng_engine_inputs);
-
-  int64_t philox_seed = std::get<0>(seeds);
-  int64_t philox_offset = std::get<1>(seeds);
-
-  at::Tensor randvals;
-
-  randvals = at::empty(
-      {B, num_heads, M, N}, out_pattern.options().dtype(at::ScalarType::Int));
-
-  static constexpr auto GemmSpec =
-      ck::tensor_operation::device::GemmSpecialization::MNKOPadding;
-
-  static constexpr auto TensorSpecA =
-      ck::tensor_operation::device::TensorSpecialization::Default;
-  static constexpr auto TensorSpecB0 =
-      ck::tensor_operation::device::TensorSpecialization::Default;
-  static constexpr auto TensorSpecB1 =
-      ck::tensor_operation::device::TensorSpecialization::Default;
-  static constexpr auto TensorSpecC =
-      ck::tensor_operation::device::TensorSpecialization::Default;
-
-  using DeviceOpInstance = ck::tensor_operation::device::DeviceBatchedDropout<
-      2, // NumDimG
-      ck::half_t,
-      int,
-      ck::half_t,
-      GemmSpec,
-      TensorSpecA,
-      TensorSpecB0,
-      TensorSpecB1,
-      TensorSpecC,
-      256, // BlockSize
-      64, // MPerBlock
-      128, // NPerBlock
-      32, // KPerBlock
-      8, // AK1
-      8, // BK1
-      32, // MPerXDL
-      32, // NPerXDL
-      2, // MXdlPerWave
-      1>; // NXdlPerWave
-
-  const uint64_t seed = 1;
-  const uint64_t offset = 0;
-
-  std::vector<ck::index_t> z_gs_ms_ns_lengths = {B, num_heads, M, N};
-  std::vector<ck::index_t> z_gs_ms_ns_strides = {
-      static_cast<int>(randvals.stride(0)),
-      static_cast<int>(randvals.stride(1)),
-      static_cast<int>(randvals.stride(2)),
-      static_cast<int>(randvals.stride(3))};
-
-  auto dropout_op = DeviceOpInstance();
-  auto dropout_invoker = dropout_op.MakeInvoker();
-
-  auto dropout_arg = dropout_op.MakeArgument(
-      static_cast<int*>(randvals.data_ptr()),
-      z_gs_ms_ns_lengths,
-      z_gs_ms_ns_strides,
-      {philox_seed, philox_offset});
-
-  dropout_invoker.Run(dropout_arg, StreamConfig{stream, false});
-  (void)hipStreamSynchronize(stream);
-
-  return randvals;
-} // namespace
-
-} // namespace
-
-TORCH_LIBRARY_IMPL(xformers, CUDA, m) {
-  m.impl(
-      TORCH_SELECTIVE_NAME("xformers::_ck_rand_uniform"),
-      TORCH_FN(rand_uniform_int));
-}
diff --git a/xformers/csrc/attention/hip_fmha/attention_forward_generic.cpp b/xformers/csrc/attention/hip_fmha/attention_forward_generic.cpp
deleted file mode 100644
index 5060b03c8..000000000
--- a/xformers/csrc/attention/hip_fmha/attention_forward_generic.cpp
+++ /dev/null
@@ -1,425 +0,0 @@
-/*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
- *
- * This source code is licensed under the BSD-style license found in the
- * LICENSE file in the root directory of this source tree.
- */
-#include <cmath>
-#include <mutex>
-#include <tuple>
-
-#include <ATen/Context.h>
-#include <ATen/ScalarOps.h>
-#include <ATen/Tensor.h>
-#include <ATen/core/Generator.h>
-#include <ATen/cuda/CUDAContext.h>
-#include <ATen/cuda/CUDAGeneratorImpl.h>
-#include <c10/cuda/CUDAGuard.h>
-#include <c10/util/Optional.h>
-#include <torch/library.h>
-#include <ATen/cuda/CUDAGraphsUtils.cuh>
-
-#include <ck/host_utility/hip_check_error.hpp>
-
-#include "ck_fmha_params.h"
-#include "ck_fmha_util.h"
-
-extern void batched_forward_fp16(
-    BatchedForwardParams& param,
-    hipStream_t stream);
-extern void batched_forward_bp16(
-    BatchedForwardParams& param,
-    hipStream_t stream);
-extern void grouped_forward_fp16(
-    GroupedForwardParams& param,
-    hipStream_t stream);
-extern void grouped_forward_bp16(
-    GroupedForwardParams& param,
-    hipStream_t stream);
-
-extern void batched_infer_fp16(BatchedForwardParams& param, hipStream_t stream);
-extern void batched_infer_bp16(BatchedForwardParams& param, hipStream_t stream);
-extern void grouped_infer_fp16(GroupedForwardParams& param, hipStream_t stream);
-extern void grouped_infer_bp16(GroupedForwardParams& param, hipStream_t stream);
-
-namespace {
-
-/*
-  There are 2 modes for using this function.
-  (Mode BMHK) With all the heads having the same seqlen
-  (Mode 1MHK) `batch=1` with all tokens across batches concatenated
-*/
-std::tuple<at::Tensor, at::Tensor, int64_t, int64_t>
-efficient_attention_forward_ck(
-    const at::Tensor& query, // [b, seqlen, num_heads_q, K]
-    const at::Tensor& key, // [b, seqlen, num_heads_kv, K]
-    const at::Tensor& value, // [b, seqlen, num_heads_kv, Kv]
-    const c10::optional<at::Tensor>& bias, // [b, num_heads_q, seqlen, seqlen]
-    // (Mode 1MHK only) [b+1]: cu_seqlens_q[b] contains the
-    // position of the first query token for batch $b
-    const c10::optional<at::Tensor>& seqstart_q,
-    // (Mode 1MHK only) [b+1]: cu_seqlen_k[b] contains the
-    // position of the first key token for batch $b
-    const c10::optional<at::Tensor>& seqstart_k,
-    // (Mode 1MHK only) Maximum sequence length across batches
-    const c10::optional<int64_t> max_seqlen_q_,
-    double dropout_p, // attention matrix dropout probability
-    bool compute_logsumexp,
-    int64_t custom_mask_type,
-    c10::optional<double> scale,
-    const c10::optional<at::Tensor>& seqlen_k,
-    const c10::optional<int64_t> window_size) {
-  std::ignore = window_size;
-
-  TORCH_CHECK(query.dim() == 4);
-  TORCH_CHECK(key.dim() == 4);
-  TORCH_CHECK(value.dim() == 4);
-
-  // Batch sizes
-  TORCH_CHECK(query.size(0) == key.size(0));
-  TORCH_CHECK(query.size(0) == value.size(0));
-
-  // Sequence length
-  TORCH_CHECK(key.size(1) == value.size(1));
-
-  // Num heads
-  TORCH_CHECK(query.size(2) % key.size(2) == 0);
-  TORCH_CHECK(key.size(2) == value.size(2));
-
-  // Embedding per head
-  TORCH_CHECK(query.size(3) == key.size(3));
-
-  TORCH_CHECK(query.scalar_type() == key.scalar_type());
-  TORCH_CHECK(query.scalar_type() == value.scalar_type());
-
-  TORCH_CHECK(seqstart_q.has_value() == seqstart_k.has_value());
-  if (seqstart_q.has_value()) {
-    TORCH_CHECK(seqstart_q->scalar_type() == at::ScalarType::Int);
-    TORCH_CHECK(seqstart_k->scalar_type() == at::ScalarType::Int);
-    TORCH_CHECK(seqstart_q->dim() == 1 && seqstart_k->dim() == 1);
-    CHECK_NOSPARSE_CONTIGUOUS_CPU((*seqstart_q));
-    CHECK_NOSPARSE_CONTIGUOUS_CPU((*seqstart_k));
-    TORCH_CHECK(seqstart_q->size(0) == seqstart_k->size(0));
-    TORCH_CHECK(query.size(0) == 1, "cu_seqlen only supports batch_size=1");
-    TORCH_CHECK(max_seqlen_q_.has_value());
-  };
-
-  // last dim is contiguous, device is kCUDA
-  CHECK_NOSPARSE_LASTCONTIGUOUS_CUDA(query);
-  CHECK_NOSPARSE_LASTCONTIGUOUS_CUDA(key);
-  CHECK_NOSPARSE_LASTCONTIGUOUS_CUDA(value);
-
-  // at::cuda::CUDAGuard device_guard(query.device());
-  hipStream_t stream = at::cuda::getCurrentHIPStream().stream();
-
-  int64_t B = query.size(0);
-  int64_t M = query.size(1);
-  int64_t N = key.size(1);
-  int64_t Hq = query.size(-2);
-  int64_t Hkv = key.size(-2);
-  int64_t K = query.size(-1);
-  int64_t Kv = value.size(-1);
-
-  auto opts = query.options();
-
-  at::Tensor logsumexp;
-
-  at::Tensor out = at::empty({B, M, Hq, Kv}, opts);
-
-  const bool use_dropout = std::fpclassify(dropout_p) != FP_ZERO;
-  int64_t philox_seed;
-  int64_t philox_offset;
-
-  if (use_dropout) {
-    at::PhiloxCudaState rng_engine_inputs;
-    at::CUDAGeneratorImpl* gen =
-        at::get_generator_or_default<at::CUDAGeneratorImpl>(
-            c10::nullopt, at::cuda::detail::getDefaultCUDAGenerator());
-
-    std::lock_guard<std::mutex> lock(gen->mutex_);
-    // if using dropout, we produce 1 random number for each element of the
-    // attention tensor
-    rng_engine_inputs = gen->philox_cuda_state(B * Hq * M * N);
-
-    const auto seeds = at::cuda::philox::unpack(rng_engine_inputs);
-
-    philox_seed = std::get<0>(seeds);
-    philox_offset = std::get<1>(seeds);
-  }
-
-  auto set_batched_forward_params = [&](BatchedForwardParams& p) {
-    p.B = B;
-    p.M = M;
-    p.N = N;
-    p.Hq = Hq;
-    p.Hkv = Hkv;
-    p.K = K;
-    p.Kv = Kv;
-
-    if (scale.has_value()) {
-      p.scale = float(*scale);
-    } else {
-      p.scale = float(1.0 / std::sqrt(float(K)));
-    }
-
-    p.q_ptr = query.data_ptr();
-    p.k_ptr = key.data_ptr();
-    p.v_ptr = value.data_ptr();
-    p.out_ptr = out.data_ptr();
-
-    p.q_strides = {
-        static_cast<int>(query.stride(0)),
-        static_cast<int>(query.stride(1)),
-        static_cast<int>(query.stride(2)),
-        static_cast<int>(query.stride(3))};
-    p.k_strides = {
-        static_cast<int>(key.stride(0)),
-        static_cast<int>(key.stride(1)),
-        static_cast<int>(key.stride(2)),
-        static_cast<int>(key.stride(3))};
-    p.v_strides = {
-        static_cast<int>(value.stride(0)),
-        static_cast<int>(value.stride(1)),
-        static_cast<int>(value.stride(2)),
-        static_cast<int>(value.stride(3))};
-    p.out_strides = {
-        static_cast<int>(out.stride(0)),
-        static_cast<int>(out.stride(1)),
-        static_cast<int>(out.stride(2)),
-        static_cast<int>(out.stride(3))};
-
-    if (bias.has_value()) {
-      CHECK_NOSPARSE_LASTCONTIGUOUS_CUDA((*bias));
-      TORCH_CHECK(bias->scalar_type() == query.scalar_type());
-
-      p.has_attn_bias = true;
-      p.attn_bias_ptr = bias->data_ptr();
-
-      const at::Tensor bias_4d_view = get_bias_4d_view(*bias, B, Hq, M, N);
-      p.attn_bias_strides = {
-          static_cast<int>(bias_4d_view.stride(0)),
-          static_cast<int>(bias_4d_view.stride(1)),
-          static_cast<int>(bias_4d_view.stride(2)),
-          static_cast<int>(bias_4d_view.stride(3))};
-    } else
-      p.has_attn_bias = false;
-
-    p.custom_mask_type = custom_mask_type;
-
-    p.use_dropout = use_dropout;
-    p.philox_seed = philox_seed;
-    p.philox_offset = philox_offset;
-    p.compute_logsumexp = compute_logsumexp;
-
-    // the following parameters are only used by training forward
-    if (p.use_dropout)
-      p.dropout_prob = static_cast<float>(dropout_p);
-    else
-      p.dropout_prob = 0.0f;
-
-    if (p.compute_logsumexp) {
-      logsumexp = at::empty({B, Hq, M}, opts.dtype(at::kFloat));
-      p.logsumexp_ptr = logsumexp.data_ptr();
-    } else
-      p.logsumexp_ptr = nullptr;
-  };
-
-  auto set_grouped_forward_params = [&](GroupedForwardParams& p) {
-    p.num_batches = seqstart_q->size(0) - 1;
-    p.M = M;
-    p.N = N;
-    p.Hq = Hq;
-    p.Hkv = Hkv;
-    p.K = K;
-    p.Kv = Kv;
-
-    if (scale.has_value()) {
-      p.scale = float(*scale);
-    } else {
-      p.scale = float(1.0 / std::sqrt(float(K)));
-    }
-
-    p.q_strides = {
-        static_cast<int>(query.stride(1)),
-        static_cast<int>(query.stride(2)),
-        static_cast<int>(query.stride(3))};
-    p.k_strides = {
-        static_cast<int>(key.stride(1)),
-        static_cast<int>(key.stride(2)),
-        static_cast<int>(key.stride(3))};
-    p.v_strides = {
-        static_cast<int>(value.stride(1)),
-        static_cast<int>(value.stride(2)),
-        static_cast<int>(value.stride(3))};
-    p.out_strides = {
-        static_cast<int>(out.stride(1)),
-        static_cast<int>(out.stride(2)),
-        static_cast<int>(out.stride(3))};
-
-    if (bias.has_value()) {
-      CHECK_NOSPARSE_LASTCONTIGUOUS_CUDA((*bias));
-      TORCH_CHECK(bias->scalar_type() == query.scalar_type());
-
-      p.has_attn_bias = true;
-      const at::Tensor bias_4d_view = get_bias_4d_view(*bias, B, Hq, M, N);
-      p.attn_bias_strides = {
-          static_cast<int>(bias_4d_view.stride(0)),
-          static_cast<int>(bias_4d_view.stride(1)),
-          static_cast<int>(bias_4d_view.stride(2)),
-          static_cast<int>(bias_4d_view.stride(3))};
-    } else
-      p.has_attn_bias = false;
-
-    p.custom_mask_type = custom_mask_type;
-
-    // max_seqlen_q is used to create logsumexp tensor
-    p.max_seqlen_q = *max_seqlen_q_;
-
-    p.host_seqstart_q.resize(p.num_batches + 1);
-    p.host_seqstart_k.resize(p.num_batches + 1);
-
-    for (int i = 0; i < p.host_seqstart_q.size(); i++)
-      p.host_seqstart_q[i] =
-          *(reinterpret_cast<int*>(seqstart_q->data_ptr()) + i);
-
-    for (int i = 0; i < p.host_seqstart_k.size(); i++)
-      p.host_seqstart_k[i] =
-          *(reinterpret_cast<int*>(seqstart_k->data_ptr()) + i);
-
-    if (seqlen_k.has_value()) {
-      TORCH_CHECK(seqlen_k->scalar_type() == at::ScalarType::Int);
-      TORCH_CHECK(seqlen_k->dim() == 1);
-      TORCH_CHECK(seqlen_k->size(0) == p.num_batches)
-      CHECK_NOSPARSE_CONTIGUOUS_CPU((*seqlen_k));
-
-      p.host_seqlen_k.resize(p.num_batches);
-
-      for (int i = 0; i < p.host_seqlen_k.size(); i++)
-        p.host_seqlen_k[i] =
-            *(reinterpret_cast<int*>(seqlen_k->data_ptr()) + i);
-    }
-
-    char* q_ptr = reinterpret_cast<char*>(query.data_ptr());
-    char* k_ptr = reinterpret_cast<char*>(key.data_ptr());
-    char* v_ptr = reinterpret_cast<char*>(value.data_ptr());
-
-    char* out_ptr = reinterpret_cast<char*>(out.data_ptr());
-    char* attn_bias_ptr =
-        bias.has_value() ? reinterpret_cast<char*>(bias->data_ptr()) : nullptr;
-
-    for (int i = 0; i < p.num_batches; i++) {
-      size_t tmp_q_offset = get_size_in_bytes(
-          static_cast<size_t>(p.host_seqstart_q[i]) * p.q_strides[0],
-          query.scalar_type());
-      size_t tmp_k_offset = get_size_in_bytes(
-          static_cast<size_t>(p.host_seqstart_k[i]) * p.k_strides[0],
-          key.scalar_type());
-      size_t tmp_v_offset = get_size_in_bytes(
-          static_cast<size_t>(p.host_seqstart_k[i]) * p.v_strides[0],
-          value.scalar_type());
-      size_t tmp_o_offset = get_size_in_bytes(
-          static_cast<size_t>(p.host_seqstart_q[i]) * p.out_strides[0],
-          out.scalar_type());
-
-      p.q_ptrs.push_back(reinterpret_cast<void*>(&q_ptr[tmp_q_offset]));
-      p.k_ptrs.push_back(reinterpret_cast<void*>(&k_ptr[tmp_k_offset]));
-      p.v_ptrs.push_back(reinterpret_cast<void*>(&v_ptr[tmp_v_offset]));
-      p.out_ptrs.push_back(reinterpret_cast<void*>(&out_ptr[tmp_o_offset]));
-
-      if (bias.has_value()) {
-        size_t tmp_bias_offset = get_size_in_bytes(
-            static_cast<size_t>(p.host_seqstart_q[i]) * p.attn_bias_strides[2] +
-                static_cast<size_t>(p.host_seqstart_k[i]) *
-                    p.attn_bias_strides[3],
-            bias->scalar_type());
-
-        p.attn_bias_ptrs.push_back(
-            reinterpret_cast<void*>(&attn_bias_ptr[tmp_bias_offset]));
-      };
-
-      // ToDO: remove this after dev-op fix
-      p.randvals_ptrs.push_back(nullptr);
-    }
-
-    p.use_dropout = use_dropout;
-    p.philox_seed = philox_seed;
-    p.philox_offset = philox_offset;
-    p.compute_logsumexp = compute_logsumexp;
-
-    // the following parameters are only used by training forward
-    if (p.use_dropout)
-      p.dropout_prob = static_cast<float>(dropout_p);
-    else
-      p.dropout_prob = 0.0f;
-
-    if (p.compute_logsumexp) {
-      logsumexp = at::empty(
-          {p.num_batches, Hq, p.max_seqlen_q}, opts.dtype(at::kFloat));
-      char* logsumexp_ptr = reinterpret_cast<char*>(logsumexp.data_ptr());
-
-      for (int i = 0; i < p.num_batches; i++) {
-        size_t tmp_logsumexp_offset = get_size_in_bytes(
-            static_cast<size_t>(i) * Hq * p.max_seqlen_q,
-            logsumexp.scalar_type());
-        p.logsumexp_ptrs.push_back(
-            reinterpret_cast<void*>(&logsumexp_ptr[tmp_logsumexp_offset]));
-      };
-    };
-  };
-
-  auto inDataType = query.scalar_type();
-
-  if (!seqstart_q.has_value()) { // input is batched
-    BatchedForwardParams batched_forward_params;
-
-    set_batched_forward_params(batched_forward_params);
-
-    if (!batched_forward_params.use_dropout &&
-        !batched_forward_params.compute_logsumexp) {
-      if (inDataType == at::ScalarType::Half) {
-        batched_infer_fp16(batched_forward_params, stream);
-      } else if (inDataType == at::ScalarType::BFloat16) {
-        batched_infer_bp16(batched_forward_params, stream);
-      } else
-        throw std::runtime_error("input data-type is not supported!");
-    } else {
-      if (inDataType == at::ScalarType::Half) {
-        batched_forward_fp16(batched_forward_params, stream);
-      } else if (inDataType == at::ScalarType::BFloat16) {
-        batched_forward_bp16(batched_forward_params, stream);
-      } else
-        throw std::runtime_error("input data-type is not supported!");
-    };
-  } else { // input is grouped
-    GroupedForwardParams grouped_forward_params;
-
-    set_grouped_forward_params(grouped_forward_params);
-
-    if (!grouped_forward_params.use_dropout &&
-        !grouped_forward_params.compute_logsumexp) {
-      if (inDataType == at::ScalarType::Half) {
-        grouped_infer_fp16(grouped_forward_params, stream);
-      } else if (inDataType == at::ScalarType::BFloat16) {
-        grouped_infer_bp16(grouped_forward_params, stream);
-      } else
-        throw std::runtime_error("input data-type is not supported!");
-    } else {
-      if (inDataType == at::ScalarType::Half) {
-        grouped_forward_fp16(grouped_forward_params, stream);
-      } else if (inDataType == at::ScalarType::BFloat16) {
-        grouped_forward_bp16(grouped_forward_params, stream);
-      } else
-        throw std::runtime_error("input data-type is not supported!");
-    };
-  };
-
-  return std::make_tuple(out, logsumexp, philox_seed, philox_offset);
-}
-
-} // namespace
-
-TORCH_LIBRARY_IMPL(xformers, CUDA, m) {
-  m.impl(
-      TORCH_SELECTIVE_NAME("xformers::efficient_attention_forward_ck"),
-      TORCH_FN(efficient_attention_forward_ck));
-}
diff --git a/xformers/csrc/attention/hip_fmha/ck_align_switch.h b/xformers/csrc/attention/hip_fmha/ck_align_switch.h
deleted file mode 100644
index 9e7228355..000000000
--- a/xformers/csrc/attention/hip_fmha/ck_align_switch.h
+++ /dev/null
@@ -1,151 +0,0 @@
-/*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
- *
- * This source code is licensed under the BSD-style license found in the
- * LICENSE file in the root directory of this source tree.
- */
-#pragma once
-
-#include <stdexcept>
-
-// assume the maximum alignment is 8 elements
-#define ALIGN_SWITCH_1(CONST_ALIGN_MAX1, CONST_ALIGN_NAME1, LENGTH1, ...)   \
-  [&] {                                                                     \
-    if constexpr (CONST_ALIGN_MAX1 > 0) {                                   \
-      if (LENGTH1 % CONST_ALIGN_MAX1 == 0) {                                \
-        constexpr ck::index_t CONST_ALIGN_NAME1 = CONST_ALIGN_MAX1;         \
-        __VA_ARGS__();                                                      \
-      } else {                                                              \
-        if constexpr (CONST_ALIGN_MAX1 / 2 > 0) {                           \
-          if (LENGTH1 % (CONST_ALIGN_MAX1 / 2) == 0) {                      \
-            constexpr ck::index_t CONST_ALIGN_NAME1 = CONST_ALIGN_MAX1 / 2; \
-            __VA_ARGS__();                                                  \
-          } else {                                                          \
-            if constexpr (CONST_ALIGN_MAX1 / 4 > 0) {                       \
-              if (LENGTH1 % (CONST_ALIGN_MAX1 / 4) == 0) {                  \
-                constexpr ck::index_t CONST_ALIGN_NAME1 =                   \
-                    CONST_ALIGN_MAX1 / 4;                                   \
-                __VA_ARGS__();                                              \
-              } else {                                                      \
-                constexpr ck::index_t CONST_ALIGN_NAME1 = 1;                \
-                __VA_ARGS__();                                              \
-              };                                                            \
-            }                                                               \
-          };                                                                \
-        }                                                                   \
-      };                                                                    \
-    }                                                                       \
-  }()
-
-// assume the maximum alignment is 8 elements
-#define ALIGN_SWITCH_2(                                                       \
-    CONST_ALIGN_MAX1,                                                         \
-    CONST_ALIGN_NAME1,                                                        \
-    LENGTH1,                                                                  \
-    CONST_ALIGN_MAX2,                                                         \
-    CONST_ALIGN_NAME2,                                                        \
-    LENGTH2,                                                                  \
-    ...)                                                                      \
-  [&] {                                                                       \
-    if constexpr (CONST_ALIGN_MAX1 > 0) {                                     \
-      if (LENGTH1 % CONST_ALIGN_MAX1 == 0) {                                  \
-        constexpr ck::index_t CONST_ALIGN_NAME1 = CONST_ALIGN_MAX1;           \
-        ALIGN_SWITCH_1(                                                       \
-            CONST_ALIGN_MAX2, CONST_ALIGN_NAME2, LENGTH2, ##__VA_ARGS__);     \
-      } else {                                                                \
-        if constexpr (CONST_ALIGN_MAX1 / 2 > 0) {                             \
-          if (LENGTH1 % (CONST_ALIGN_MAX1 / 2) == 0) {                        \
-            constexpr ck::index_t CONST_ALIGN_NAME1 = CONST_ALIGN_MAX1 / 2;   \
-            ALIGN_SWITCH_1(                                                   \
-                CONST_ALIGN_MAX2, CONST_ALIGN_NAME2, LENGTH2, ##__VA_ARGS__); \
-          } else {                                                            \
-            if constexpr (CONST_ALIGN_MAX1 / 4 > 0) {                         \
-              if (LENGTH1 % (CONST_ALIGN_MAX1 / 4) == 0) {                    \
-                constexpr ck::index_t CONST_ALIGN_NAME1 =                     \
-                    CONST_ALIGN_MAX1 / 4;                                     \
-                ALIGN_SWITCH_1(                                               \
-                    CONST_ALIGN_MAX2,                                         \
-                    CONST_ALIGN_NAME2,                                        \
-                    LENGTH2,                                                  \
-                    ##__VA_ARGS__);                                           \
-              } else {                                                        \
-                constexpr ck::index_t CONST_ALIGN_NAME1 = 1;                  \
-                ALIGN_SWITCH_1(                                               \
-                    CONST_ALIGN_MAX2,                                         \
-                    CONST_ALIGN_NAME2,                                        \
-                    LENGTH2,                                                  \
-                    ##__VA_ARGS__);                                           \
-              };                                                              \
-            }                                                                 \
-          };                                                                  \
-        }                                                                     \
-      };                                                                      \
-    }                                                                         \
-  }()
-
-// assume the maximum alignment is 8 elements
-#define ALIGN_SWITCH_3(                                                     \
-    CONST_ALIGN_MAX1,                                                       \
-    CONST_ALIGN_NAME1,                                                      \
-    LENGTH1,                                                                \
-    CONST_ALIGN_MAX2,                                                       \
-    CONST_ALIGN_NAME2,                                                      \
-    LENGTH2,                                                                \
-    CONST_ALIGN_MAX3,                                                       \
-    CONST_ALIGN_NAME3,                                                      \
-    LENGTH3,                                                                \
-    ...)                                                                    \
-  [&] {                                                                     \
-    if constexpr (CONST_ALIGN_MAX1 > 0) {                                   \
-      if (LENGTH1 % CONST_ALIGN_MAX1 == 0) {                                \
-        constexpr ck::index_t CONST_ALIGN_NAME1 = CONST_ALIGN_MAX1;         \
-        ALIGN_SWITCH_2(                                                     \
-            CONST_ALIGN_MAX2,                                               \
-            CONST_ALIGN_NAME2,                                              \
-            LENGTH2,                                                        \
-            CONST_ALIGN_MAX3,                                               \
-            CONST_ALIGN_NAME3,                                              \
-            LENGTH3,                                                        \
-            ##__VA_ARGS__);                                                 \
-      } else {                                                              \
-        if constexpr (CONST_ALIGN_MAX1 / 2 > 0) {                           \
-          if (LENGTH1 % (CONST_ALIGN_MAX1 / 2) == 0) {                      \
-            constexpr ck::index_t CONST_ALIGN_NAME1 = CONST_ALIGN_MAX1 / 2; \
-            ALIGN_SWITCH_2(                                                 \
-                CONST_ALIGN_MAX2,                                           \
-                CONST_ALIGN_NAME2,                                          \
-                LENGTH2,                                                    \
-                CONST_ALIGN_MAX3,                                           \
-                CONST_ALIGN_NAME3,                                          \
-                LENGTH3,                                                    \
-                ##__VA_ARGS__);                                             \
-          } else {                                                          \
-            if constexpr (CONST_ALIGN_MAX1 / 4 > 0) {                       \
-              if (LENGTH1 % (CONST_ALIGN_MAX1 / 4) == 0) {                  \
-                constexpr ck::index_t CONST_ALIGN_NAME1 =                   \
-                    CONST_ALIGN_MAX1 / 4;                                   \
-                ALIGN_SWITCH_2(                                             \
-                    CONST_ALIGN_MAX2,                                       \
-                    CONST_ALIGN_NAME2,                                      \
-                    LENGTH2,                                                \
-                    CONST_ALIGN_MAX3,                                       \
-                    CONST_ALIGN_NAME3,                                      \
-                    LENGTH3,                                                \
-                    ##__VA_ARGS__);                                         \
-              } else {                                                      \
-                constexpr ck::index_t CONST_ALIGN_NAME1 = 1;                \
-                ALIGN_SWITCH_2(                                             \
-                    CONST_ALIGN_MAX2,                                       \
-                    CONST_ALIGN_NAME2,                                      \
-                    LENGTH2,                                                \
-                    CONST_ALIGN_MAX3,                                       \
-                    CONST_ALIGN_NAME3,                                      \
-                    LENGTH3,                                                \
-                    ##__VA_ARGS__);                                         \
-              };                                                            \
-            }                                                               \
-          };                                                                \
-        }                                                                   \
-      };                                                                    \
-    }                                                                       \
-  }()
diff --git a/xformers/csrc/attention/hip_fmha/ck_bool_switch.h b/xformers/csrc/attention/hip_fmha/ck_bool_switch.h
deleted file mode 100644
index 1a062d3e3..000000000
--- a/xformers/csrc/attention/hip_fmha/ck_bool_switch.h
+++ /dev/null
@@ -1,29 +0,0 @@
-/*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
- *
- * This source code is licensed under the BSD-style license found in the
- * LICENSE file in the root directory of this source tree.
- */
-#pragma once
-
-#define BOOL_SWITCH_1(COND1, CONST_NAME1, ...) \
-  [&] {                                        \
-    if (COND1) {                               \
-      constexpr bool CONST_NAME1 = true;       \
-      __VA_ARGS__();                           \
-    } else {                                   \
-      constexpr bool CONST_NAME1 = false;      \
-      __VA_ARGS__();                           \
-    }                                          \
-  }()
-
-#define BOOL_SWITCH_2(COND1, CONST_NAME1, COND2, CONST_NAME2, ...) \
-  [&] {                                                            \
-    if (COND1) {                                                   \
-      constexpr bool CONST_NAME1 = true;                           \
-      BOOL_SWITCH_1(COND2, CONST_NAME2, ##__VA_ARGS__);            \
-    } else {                                                       \
-      constexpr bool CONST_NAME1 = false;                          \
-      BOOL_SWITCH_1(COND2, CONST_NAME2, ##__VA_ARGS__);            \
-    }                                                              \
-  }()
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_backward_gemm_constants.h b/xformers/csrc/attention/hip_fmha/ck_fmha_backward_gemm_constants.h
deleted file mode 100644
index 49122fd74..000000000
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_backward_gemm_constants.h
+++ /dev/null
@@ -1,196 +0,0 @@
-/*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
- *
- * This source code is licensed under the BSD-style license found in the
- * LICENSE file in the root directory of this source tree.
- */
-#pragma once
-
-#include <ck/ck.hpp>
-#include "ck_fmha_op_helper.h"
-
-// list the template parameters that will not be tuned,
-// the commented lines gives the tunable template parameters
-struct GemmOpConstantsBatchedBackward_V1 {
-  static constexpr ck::index_t NumGemmKPrefetchStage = 1;
-  static constexpr ck::index_t BlockSize = 256;
-  static constexpr ck::index_t MPerBlock = 128;
-  static constexpr ck::index_t NPerBlock = 128;
-  // static constexpr ck::index_t KPerBlock;
-  // static constexpr ck::index_t Gemm1NPerBlock;
-  static constexpr ck::index_t Gemm1KPerBlock = 32;
-  static constexpr ck::index_t Gemm2KPerBlock = 32;
-  static constexpr ck::index_t AK1 = 8;
-  static constexpr ck::index_t BK1 = 8;
-  static constexpr ck::index_t B1K1 = 2;
-  static constexpr ck::index_t MPerXDL = 32;
-  static constexpr ck::index_t NPerXDL = 32;
-  static constexpr ck::index_t MXdlPerWave = 4;
-  static constexpr ck::index_t NXdlPerWave = 1;
-  // static constexpr ck::index_t Gemm1NXdlPerWave;
-  static constexpr ck::index_t Gemm2NXdlPerWave = 1;
-  using ABlockTransferThreadClusterLengths_AK0_M_AK1 = S<4, 64, 1>;
-  using ABlockTransferThreadClusterArrangeOrder = S<1, 0, 2>;
-  using ABlockTransferSrcAccessOrder = S<1, 0, 2>;
-  static constexpr ck::index_t ABlockTransferSrcVectorDim = 2;
-  // static constexpr ck::index_t ABlockTransferSrcScalarPerVector;
-  static constexpr ck::index_t ABlockTransferDstScalarPerVector_AK1 = 8;
-  static constexpr bool ABlockLdsExtraM = true;
-  using BBlockTransferThreadClusterLengths_BK0_N_BK1 = S<4, 64, 1>;
-  using BBlockTransferThreadClusterArrangeOrder = S<1, 0, 2>;
-  using BBlockTransferSrcAccessOrder = S<1, 0, 2>;
-  static constexpr ck::index_t BBlockTransferSrcVectorDim = 2;
-  // static constexpr ck::index_t BBlockTransferSrcScalarPerVector;
-  static constexpr ck::index_t BBlockTransferDstScalarPerVector_BK1 = 8;
-  static constexpr bool BBlockLdsExtraN = true;
-  // static constexpr ck::index_t Acc0BiasTransferSrcScalarPerVector;
-  static constexpr ck::index_t CShuffleMXdlPerWavePerShuffle = 1;
-  // static constexpr ck::index_t CShuffleNXdlPerWavePerShuffle;
-  // using
-  // CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock;
-  // static constexpr ck::index_t
-  // CShuffleBlockTransferScalarPerVector_NPerBlock;
-};
-
-// list the template parameters that will not be tuned,
-// the commented lines gives the tunable template parameters
-struct GemmOpConstantsBatchedBackward_V2 {
-  static constexpr ck::index_t NumGemmKPrefetchStage = 1;
-  static constexpr ck::index_t BlockSize = 256;
-  static constexpr ck::index_t MPerBlock = 64;
-  static constexpr ck::index_t NPerBlock = 128;
-  static constexpr ck::index_t KPerBlock = 128;
-  // static constexpr ck::index_t Gemm1NPerBlock;
-  static constexpr ck::index_t Gemm1KPerBlock = 32;
-  static constexpr ck::index_t Gemm2KPerBlock = 64;
-  static constexpr ck::index_t AK1 = 8;
-  static constexpr ck::index_t BK1 = 8;
-  static constexpr ck::index_t B1K1 = 2;
-  static constexpr ck::index_t MPerXDL = 32;
-  static constexpr ck::index_t NPerXDL = 32;
-  static constexpr ck::index_t MXdlPerWave = 2;
-  static constexpr ck::index_t NXdlPerWave = 1;
-  // static constexpr ck::index_t Gemm1NXdlPerWave;
-  static constexpr ck::index_t Gemm2NXdlPerWave = 1;
-  using ABlockTransferThreadClusterLengths_AK0_M_AK1 = S<4, 64, 1>;
-  using ABlockTransferThreadClusterArrangeOrder = S<1, 0, 2>;
-  using ABlockTransferSrcAccessOrder = S<1, 0, 2>;
-  static constexpr ck::index_t ABlockTransferSrcVectorDim = 2;
-  // static constexpr ck::index_t ABlockTransferSrcScalarPerVector;
-  static constexpr ck::index_t ABlockTransferDstScalarPerVector_AK1 = 8;
-  static constexpr bool ABlockLdsExtraM = true;
-  using BBlockTransferThreadClusterLengths_BK0_N_BK1 = S<4, 64, 1>;
-  using BBlockTransferThreadClusterArrangeOrder = S<1, 0, 2>;
-  using BBlockTransferSrcAccessOrder = S<1, 0, 2>;
-  static constexpr ck::index_t BBlockTransferSrcVectorDim = 2;
-  // static constexpr ck::index_t BBlockTransferSrcScalarPerVector;
-  static constexpr ck::index_t BBlockTransferDstScalarPerVector_BK1 = 8;
-  static constexpr bool BBlockLdsExtraN = true;
-  // static constexpr ck::index_t Acc0BiasTransferSrcScalarPerVector;
-  using B1BlockTransferThreadClusterLengths_BK0_N_BK1 = S<8, 32, 1>;
-  using B1BlockTransferThreadClusterArrangeOrder = S<0, 2, 1>;
-  using B1BlockTransferSrcAccessOrder = S<0, 2, 1>;
-  static constexpr ck::index_t B1BlockTransferSrcVectorDim = 1;
-  // static constexpr ck::index_t B1BlockTransferSrcScalarPerVector;
-  static constexpr ck::index_t B1BlockTransferDstScalarPerVector_BK1 = 2;
-  static constexpr bool B1BlockLdsExtraN = false;
-  static constexpr ck::index_t CShuffleMXdlPerWavePerShuffle = 1;
-  // static constexpr ck::index_t CShuffleNXdlPerWavePerShuffle;
-  // using
-  // CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock;
-  // static constexpr ck::index_t
-  // CShuffleBlockTransferScalarPerVector_NPerBlock;
-};
-
-// list the template parameters that will not be tuned,
-// the commented lines gives the tunable template parameters
-struct GemmOpConstantsGroupedBackward_V1 {
-  static constexpr ck::index_t NumGemmKPrefetchStage = 1;
-  static constexpr ck::index_t BlockSize = 256;
-  static constexpr ck::index_t MPerBlock = 128;
-  static constexpr ck::index_t NPerBlock = 128;
-  // static constexpr ck::index_t KPerBlock;
-  // static constexpr ck::index_t Gemm1NPerBlock;
-  static constexpr ck::index_t Gemm1KPerBlock = 32;
-  static constexpr ck::index_t Gemm2KPerBlock = 32;
-  static constexpr ck::index_t AK1 = 8;
-  static constexpr ck::index_t BK1 = 8;
-  static constexpr ck::index_t B1K1 = 2;
-  static constexpr ck::index_t MPerXDL = 32;
-  static constexpr ck::index_t NPerXDL = 32;
-  static constexpr ck::index_t MXdlPerWave = 4;
-  static constexpr ck::index_t NXdlPerWave = 1;
-  // static constexpr ck::index_t Gemm1NXdlPerWave;
-  static constexpr ck::index_t Gemm2NXdlPerWave = 1;
-  using ABlockTransferThreadClusterLengths_AK0_M_AK1 = S<4, 64, 1>;
-  using ABlockTransferThreadClusterArrangeOrder = S<1, 0, 2>;
-  using ABlockTransferSrcAccessOrder = S<1, 0, 2>;
-  static constexpr ck::index_t ABlockTransferSrcVectorDim = 2;
-  // static constexpr ck::index_t ABlockTransferSrcScalarPerVector;
-  static constexpr ck::index_t ABlockTransferDstScalarPerVector_AK1 = 8;
-  static constexpr bool ABlockLdsExtraM = true;
-  using BBlockTransferThreadClusterLengths_BK0_N_BK1 = S<4, 64, 1>;
-  using BBlockTransferThreadClusterArrangeOrder = S<1, 0, 2>;
-  using BBlockTransferSrcAccessOrder = S<1, 0, 2>;
-  static constexpr ck::index_t BBlockTransferSrcVectorDim = 2;
-  // static constexpr ck::index_t BBlockTransferSrcScalarPerVector;
-  static constexpr ck::index_t BBlockTransferDstScalarPerVector_BK1 = 8;
-  static constexpr bool BBlockLdsExtraN = true;
-  // static constexpr ck::index_t Acc0BiasTransferSrcScalarPerVector;
-  static constexpr ck::index_t CShuffleMXdlPerWavePerShuffle = 1;
-  // static constexpr ck::index_t CShuffleNXdlPerWavePerShuffle;
-  // using
-  // CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock;
-  // static constexpr ck::index_t
-  // CShuffleBlockTransferScalarPerVector_NPerBlock;
-};
-
-// list the template parameters that will not be tuned,
-// the commented lines gives the tunable template parameters
-struct GemmOpConstantsGroupedBackward_V2 {
-  static constexpr ck::index_t NumGemmKPrefetchStage = 1;
-  static constexpr ck::index_t BlockSize = 256;
-  static constexpr ck::index_t MPerBlock = 64;
-  static constexpr ck::index_t NPerBlock = 128;
-  static constexpr ck::index_t KPerBlock = 128;
-  // static constexpr ck::index_t Gemm1NPerBlock;
-  static constexpr ck::index_t Gemm1KPerBlock = 32;
-  static constexpr ck::index_t Gemm2KPerBlock = 64;
-  static constexpr ck::index_t AK1 = 8;
-  static constexpr ck::index_t BK1 = 8;
-  static constexpr ck::index_t B1K1 = 2;
-  static constexpr ck::index_t MPerXDL = 32;
-  static constexpr ck::index_t NPerXDL = 32;
-  static constexpr ck::index_t MXdlPerWave = 2;
-  static constexpr ck::index_t NXdlPerWave = 1;
-  // static constexpr ck::index_t Gemm1NXdlPerWave;
-  static constexpr ck::index_t Gemm2NXdlPerWave = 1;
-  using ABlockTransferThreadClusterLengths_AK0_M_AK1 = S<4, 64, 1>;
-  using ABlockTransferThreadClusterArrangeOrder = S<1, 0, 2>;
-  using ABlockTransferSrcAccessOrder = S<1, 0, 2>;
-  static constexpr ck::index_t ABlockTransferSrcVectorDim = 2;
-  // static constexpr ck::index_t ABlockTransferSrcScalarPerVector;
-  static constexpr ck::index_t ABlockTransferDstScalarPerVector_AK1 = 8;
-  static constexpr bool ABlockLdsExtraM = true;
-  using BBlockTransferThreadClusterLengths_BK0_N_BK1 = S<4, 64, 1>;
-  using BBlockTransferThreadClusterArrangeOrder = S<1, 0, 2>;
-  using BBlockTransferSrcAccessOrder = S<1, 0, 2>;
-  static constexpr ck::index_t BBlockTransferSrcVectorDim = 2;
-  // static constexpr ck::index_t BBlockTransferSrcScalarPerVector;
-  static constexpr ck::index_t BBlockTransferDstScalarPerVector_BK1 = 8;
-  static constexpr bool BBlockLdsExtraN = true;
-  // static constexpr ck::index_t Acc0BiasTransferSrcScalarPerVector;
-  using B1BlockTransferThreadClusterLengths_BK0_N_BK1 = S<8, 32, 1>;
-  using B1BlockTransferThreadClusterArrangeOrder = S<0, 2, 1>;
-  using B1BlockTransferSrcAccessOrder = S<0, 2, 1>;
-  static constexpr ck::index_t B1BlockTransferSrcVectorDim = 1;
-  // static constexpr ck::index_t B1BlockTransferSrcScalarPerVector;
-  static constexpr ck::index_t B1BlockTransferDstScalarPerVector_BK1 = 2;
-  static constexpr bool B1BlockLdsExtraN = false;
-  static constexpr ck::index_t CShuffleMXdlPerWavePerShuffle = 1;
-  // static constexpr ck::index_t CShuffleNXdlPerWavePerShuffle;
-  // using
-  // CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock;
-  // static constexpr ck::index_t
-  // CShuffleBlockTransferScalarPerVector_NPerBlock;
-};
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_backward.h b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_backward.h
deleted file mode 100644
index d0cccf2b3..000000000
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_backward.h
+++ /dev/null
@@ -1,525 +0,0 @@
-/*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
- *
- * This source code is licensed under the BSD-style license found in the
- * LICENSE file in the root directory of this source tree.
- */
-#pragma once
-
-#include <sstream>
-#include <stdexcept>
-
-#include <ck/ck.hpp>
-#include <ck/tensor_operation/gpu/device/gemm_specialization.hpp>
-#include <ck/tensor_operation/gpu/device/impl/device_batched_mha_bwd_xdl_cshuffle_qloop_v1.hpp>
-#include <ck/tensor_operation/gpu/device/impl/device_batched_mha_bwd_xdl_cshuffle_qloop_v2.hpp>
-#include <ck/tensor_operation/gpu/device/tensor_specialization.hpp>
-#include <ck/tensor_operation/gpu/element/element_wise_operation.hpp>
-
-#include "ck_align_switch.h"
-#include "ck_fmha_backward_gemm_constants.h"
-#include "ck_fmha_common_gemm_constants.h"
-#include "ck_fmha_op_helper.h"
-#include "ck_fmha_params.h"
-
-template <
-    typename scalar_t,
-    int32_t custom_mask_type,
-    bool has_attn_bias,
-    bool use_fp32_qkv_grad>
-struct batched_backward_masktype_attnbias_dispatched {
-  using PassThrough = ck::tensor_operation::element_wise::PassThrough;
-  using Scale = ck::tensor_operation::element_wise::Scale;
-
-  using QKVElementOp = PassThrough;
-  using YElementOp = PassThrough;
-
-  using InputDataType = scalar_t;
-  using OutputDataType =
-      typename std::conditional<use_fp32_qkv_grad, F32, scalar_t>::type;
-  using GemmDataType = scalar_t;
-  using AccDataType = F32;
-  using ShuffleDataType = F32;
-  using LSEDataType = F32;
-  using ZDataType = unsigned short;
-  using Acc0BiasDataType =
-      typename std::conditional<has_attn_bias, scalar_t, void>::type;
-  using Acc1BiasDataType = void;
-
-  static constexpr auto GemmSpec =
-      ck::tensor_operation::device::GemmSpecialization::MNKOPadding;
-  static constexpr auto MaskingSpec =
-      static_cast<ck::tensor_operation::device::MaskingSpecialization>(
-          custom_mask_type);
-
-  static constexpr bool Deterministic = true;
-
-  static constexpr ck::index_t kAcc0BiasTransferSrcScalarPerVector = 1;
-
-#ifndef BATCHED_BACKWARD_V1_HEADDIM_SWITCH
-#define BATCHED_BACKWARD_V1_HEADDIM_SWITCH(HEAD_DIM1, HEAD_DIM2, ...) \
-  [&] {                                                               \
-    if (HEAD_DIM1 <= 32 && HEAD_DIM2 <= 32) {                         \
-      constexpr ck::index_t kGemm1NPerBlock = 32;                     \
-      constexpr ck::index_t kGemm1NXdlPerWave = 1;                    \
-      constexpr ck::index_t kCShuffleNXdlPerWavePerShuffle = 1;       \
-      using kCShuffleBlockTransferClusterLengths = S<1, 64, 1, 4>;    \
-      __VA_ARGS__();                                                  \
-    } else {                                                          \
-      constexpr ck::index_t kGemm1NPerBlock = 64;                     \
-      constexpr ck::index_t kGemm1NXdlPerWave = 2;                    \
-      constexpr ck::index_t kCShuffleNXdlPerWavePerShuffle = 2;       \
-      using kCShuffleBlockTransferClusterLengths = S<1, 32, 1, 8>;    \
-      __VA_ARGS__();                                                  \
-    };                                                                \
-  }()
-#endif
-
-  // clang-format off
-  template <
-      ck::index_t kGemm1NPerBlock,
-      ck::index_t kGemm1NXdlPerWave,
-      ck::index_t kCShuffleNXdlPerWavePerShuffle,
-      typename kCShuffleBlockTransferClusterLengths,
-      ck::index_t kABBlockTransferSrcScalarPerVector,
-      ck::index_t kCShuffleBlockTransferScalarPerVector>
-  using DeviceOpInstanceTemp_V1 = ck::tensor_operation::device::DeviceBatchedMultiheadAttentionBackward_Qloop_Xdl_CShuffle_V1<
-          GemmOpConstantsCommon::NumDimG,
-          GemmOpConstantsCommon::NumDimM,
-          GemmOpConstantsCommon::NumDimN,
-          GemmOpConstantsCommon::NumDimK,
-          GemmOpConstantsCommon::NumDimO,
-          InputDataType,
-          OutputDataType,
-          GemmDataType,
-          ZDataType,
-          LSEDataType,
-          Acc0BiasDataType,
-          Acc1BiasDataType,
-          AccDataType,
-          ShuffleDataType,
-          QKVElementOp,
-          QKVElementOp,
-          Scale,
-          QKVElementOp,
-          YElementOp,
-          GemmSpec,
-          GemmOpConstantsCommon::TensorSpecA,
-          GemmOpConstantsCommon::TensorSpecB0,
-          GemmOpConstantsCommon::TensorSpecB1,
-          GemmOpConstantsCommon::TensorSpecC,
-          GemmOpConstantsBatchedBackward_V1::NumGemmKPrefetchStage,
-          GemmOpConstantsBatchedBackward_V1::BlockSize,
-          GemmOpConstantsBatchedBackward_V1::MPerBlock,
-          GemmOpConstantsBatchedBackward_V1::NPerBlock,
-          kGemm1NPerBlock, // KPerBlock == kGemm1NPerBlock required
-          kGemm1NPerBlock,
-          GemmOpConstantsBatchedBackward_V1::Gemm1KPerBlock,
-          GemmOpConstantsBatchedBackward_V1::Gemm2KPerBlock,
-          GemmOpConstantsBatchedBackward_V1::AK1,
-          GemmOpConstantsBatchedBackward_V1::BK1,
-          GemmOpConstantsBatchedBackward_V1::B1K1,
-          GemmOpConstantsBatchedBackward_V1::MPerXDL,
-          GemmOpConstantsBatchedBackward_V1::NPerXDL,
-          GemmOpConstantsBatchedBackward_V1::MXdlPerWave,
-          GemmOpConstantsBatchedBackward_V1::NXdlPerWave,
-          kGemm1NXdlPerWave,
-          GemmOpConstantsBatchedBackward_V1::Gemm2NXdlPerWave,
-          GemmOpConstantsBatchedBackward_V1::ABlockTransferThreadClusterLengths_AK0_M_AK1,
-          GemmOpConstantsBatchedBackward_V1::ABlockTransferThreadClusterArrangeOrder,
-          GemmOpConstantsBatchedBackward_V1::ABlockTransferSrcAccessOrder,
-          GemmOpConstantsBatchedBackward_V1::ABlockTransferSrcVectorDim,
-          kABBlockTransferSrcScalarPerVector,
-          GemmOpConstantsBatchedBackward_V1::ABlockTransferDstScalarPerVector_AK1,
-          GemmOpConstantsBatchedBackward_V1::ABlockLdsExtraM,
-          GemmOpConstantsBatchedBackward_V1::BBlockTransferThreadClusterLengths_BK0_N_BK1,
-          GemmOpConstantsBatchedBackward_V1::BBlockTransferThreadClusterArrangeOrder,
-          GemmOpConstantsBatchedBackward_V1::BBlockTransferSrcAccessOrder,
-          GemmOpConstantsBatchedBackward_V1::BBlockTransferSrcVectorDim,
-          kABBlockTransferSrcScalarPerVector,
-          GemmOpConstantsBatchedBackward_V1::BBlockTransferDstScalarPerVector_BK1,
-          GemmOpConstantsBatchedBackward_V1::BBlockLdsExtraN,
-          kAcc0BiasTransferSrcScalarPerVector,
-          GemmOpConstantsBatchedBackward_V1::CShuffleMXdlPerWavePerShuffle,
-          kCShuffleNXdlPerWavePerShuffle,
-          kCShuffleBlockTransferClusterLengths,
-          kCShuffleBlockTransferScalarPerVector,
-          MaskingSpec,
-          Deterministic>;
-  // clang-format on
-
-  // clang-format off
-  template <
-      ck::index_t kGemm1NPerBlock,
-      ck::index_t kGemm1NXdlPerWave,
-      ck::index_t kCShuffleNXdlPerWavePerShuffle,
-      typename kCShuffleBlockTransferClusterLengths,
-      ck::index_t kABBlockTransferSrcScalarPerVector,
-      ck::index_t kB1BlockTransferSrcScalarPerVector,
-      ck::index_t kCShuffleBlockTransferScalarPerVector>
-  using DeviceOpInstanceTemp_V2 = ck::tensor_operation::device::DeviceBatchedMultiheadAttentionBackward_Qloop_Xdl_CShuffle_V2<
-          GemmOpConstantsCommon::NumDimG,
-          GemmOpConstantsCommon::NumDimM,
-          GemmOpConstantsCommon::NumDimN,
-          GemmOpConstantsCommon::NumDimK,
-          GemmOpConstantsCommon::NumDimO,
-          InputDataType,
-          OutputDataType,
-          GemmDataType,
-          ZDataType,
-          LSEDataType,
-          Acc0BiasDataType,
-          Acc1BiasDataType,
-          AccDataType,
-          ShuffleDataType,
-          QKVElementOp,
-          QKVElementOp,
-          Scale,
-          QKVElementOp,
-          YElementOp,
-          GemmSpec,
-          GemmOpConstantsCommon::TensorSpecA,
-          GemmOpConstantsCommon::TensorSpecB0,
-          GemmOpConstantsCommon::TensorSpecB1,
-          GemmOpConstantsCommon::TensorSpecC,
-          GemmOpConstantsBatchedBackward_V2::NumGemmKPrefetchStage,
-          GemmOpConstantsBatchedBackward_V2::BlockSize,
-          GemmOpConstantsBatchedBackward_V2::MPerBlock,
-          GemmOpConstantsBatchedBackward_V2::NPerBlock,
-          GemmOpConstantsBatchedBackward_V2::KPerBlock,
-          kGemm1NPerBlock,
-          GemmOpConstantsBatchedBackward_V2::Gemm1KPerBlock,
-          GemmOpConstantsBatchedBackward_V2::Gemm2KPerBlock,
-          GemmOpConstantsBatchedBackward_V2::AK1,
-          GemmOpConstantsBatchedBackward_V2::BK1,
-          GemmOpConstantsBatchedBackward_V2::B1K1,
-          GemmOpConstantsBatchedBackward_V2::MPerXDL,
-          GemmOpConstantsBatchedBackward_V2::NPerXDL,
-          GemmOpConstantsBatchedBackward_V2::MXdlPerWave,
-          GemmOpConstantsBatchedBackward_V2::NXdlPerWave,
-          kGemm1NXdlPerWave,
-          GemmOpConstantsBatchedBackward_V2::Gemm2NXdlPerWave,
-          GemmOpConstantsBatchedBackward_V2::ABlockTransferThreadClusterLengths_AK0_M_AK1,
-          GemmOpConstantsBatchedBackward_V2::ABlockTransferThreadClusterArrangeOrder,
-          GemmOpConstantsBatchedBackward_V2::ABlockTransferSrcAccessOrder,
-          GemmOpConstantsBatchedBackward_V2::ABlockTransferSrcVectorDim,
-          kABBlockTransferSrcScalarPerVector,
-          GemmOpConstantsBatchedBackward_V2::ABlockTransferDstScalarPerVector_AK1,
-          GemmOpConstantsBatchedBackward_V2::ABlockLdsExtraM,
-          GemmOpConstantsBatchedBackward_V2::BBlockTransferThreadClusterLengths_BK0_N_BK1,
-          GemmOpConstantsBatchedBackward_V2::BBlockTransferThreadClusterArrangeOrder,
-          GemmOpConstantsBatchedBackward_V2::BBlockTransferSrcAccessOrder,
-          GemmOpConstantsBatchedBackward_V2::BBlockTransferSrcVectorDim,
-          kABBlockTransferSrcScalarPerVector,
-          GemmOpConstantsBatchedBackward_V2::BBlockTransferDstScalarPerVector_BK1,
-          GemmOpConstantsBatchedBackward_V2::BBlockLdsExtraN,
-          kAcc0BiasTransferSrcScalarPerVector,
-          GemmOpConstantsBatchedBackward_V2::B1BlockTransferThreadClusterLengths_BK0_N_BK1,
-          GemmOpConstantsBatchedBackward_V2::B1BlockTransferThreadClusterArrangeOrder,
-          GemmOpConstantsBatchedBackward_V2::B1BlockTransferSrcAccessOrder,
-          GemmOpConstantsBatchedBackward_V2::B1BlockTransferSrcVectorDim,
-          kB1BlockTransferSrcScalarPerVector,
-          GemmOpConstantsBatchedBackward_V2::B1BlockTransferDstScalarPerVector_BK1,
-          GemmOpConstantsBatchedBackward_V2::B1BlockLdsExtraN,
-          GemmOpConstantsBatchedBackward_V2::CShuffleMXdlPerWavePerShuffle,
-          kCShuffleNXdlPerWavePerShuffle,
-          kCShuffleBlockTransferClusterLengths,
-          kCShuffleBlockTransferScalarPerVector,
-          MaskingSpec,
-          Deterministic>;
-  // clang-format on
-
-  static constexpr auto I1 = ck::Number<1>{};
-  static constexpr auto I2 = ck::Number<2>{};
-  static constexpr auto I3 = ck::Number<3>{};
-
-  static void Run(BatchedBackwardParams& param, hipStream_t stream) {
-    using ck::math::min;
-
-    if (param.K <= 64 && param.Kv <= 64) {
-      // compile-time constants which don't depend on head-dim switching
-      constexpr ck::index_t thread_slice_length_ak1 =
-          GemmOpConstantsBatchedBackward_V1::AK1 /
-          GemmOpConstantsBatchedBackward_V1::
-              ABlockTransferThreadClusterLengths_AK0_M_AK1::At(I2);
-      constexpr ck::index_t thread_slice_length_bk1 =
-          GemmOpConstantsBatchedBackward_V1::BK1 /
-          GemmOpConstantsBatchedBackward_V1::
-              BBlockTransferThreadClusterLengths_BK0_N_BK1::At(I2);
-
-      static_assert(
-          thread_slice_length_ak1 == thread_slice_length_bk1,
-          "ABlockTransfer and BBlockTransfer should use completely same K1 sizes "
-          "and ThreadClusterLengths!");
-
-      constexpr ck::index_t kABBlockTransferSrcScalarPerVector_max =
-          min(2, thread_slice_length_ak1);
-
-      BATCHED_BACKWARD_V1_HEADDIM_SWITCH(param.K, param.Kv, [&] {
-        constexpr ck::index_t thread_slice_length_cshuflle_n =
-            (kCShuffleNXdlPerWavePerShuffle * kGemm1NPerBlock /
-             kGemm1NXdlPerWave) /
-            kCShuffleBlockTransferClusterLengths::At(I3);
-
-        constexpr ck::index_t kCShuffleBlockTransferScalarPerVector_max =
-            min(2, thread_slice_length_cshuflle_n);
-
-        ALIGN_SWITCH_2(
-            kABBlockTransferSrcScalarPerVector_max,
-            kABBlockTransferSrcScalarPerVector,
-            param.K,
-            kCShuffleBlockTransferScalarPerVector_max,
-            kCShuffleBlockTransferScalarPerVector,
-            param.Kv,
-            [&] {
-              using DeviceOpInstance = DeviceOpInstanceTemp_V1<
-                  kGemm1NPerBlock,
-                  kGemm1NXdlPerWave,
-                  kCShuffleNXdlPerWavePerShuffle,
-                  kCShuffleBlockTransferClusterLengths,
-                  kABBlockTransferSrcScalarPerVector,
-                  kCShuffleBlockTransferScalarPerVector>;
-
-              RunWithDeviceOp<DeviceOpInstance>(param, stream);
-            });
-      });
-    } else {
-      constexpr ck::index_t kGemm1NPerBlock = 128;
-      constexpr ck::index_t kGemm1NXdlPerWave = 4;
-      constexpr ck::index_t kCShuffleNXdlPerWavePerShuffle = 4;
-      using kCShuffleBlockTransferClusterLengths = S<1, 32, 1, 8>;
-
-      constexpr ck::index_t thread_slice_length_ak1 =
-          GemmOpConstantsBatchedBackward_V2::AK1 /
-          GemmOpConstantsBatchedBackward_V2::
-              ABlockTransferThreadClusterLengths_AK0_M_AK1::At(I2);
-      constexpr ck::index_t thread_slice_length_bk1 =
-          GemmOpConstantsBatchedBackward_V2::BK1 /
-          GemmOpConstantsBatchedBackward_V2::
-              BBlockTransferThreadClusterLengths_BK0_N_BK1::At(I2);
-
-      static_assert(
-          thread_slice_length_ak1 == thread_slice_length_bk1,
-          "ABlockTransfer and BBlockTransfer should use completely same K1 sizes "
-          "and ThreadClusterLengths!");
-
-      constexpr ck::index_t kABBlockTransferSrcScalarPerVector_max =
-          min(2, thread_slice_length_ak1);
-
-      constexpr ck::index_t thread_slice_length_gemm1n = kGemm1NPerBlock /
-          GemmOpConstantsBatchedBackward_V2::
-              B1BlockTransferThreadClusterLengths_BK0_N_BK1::At(I1);
-      constexpr ck::index_t kB1BlockTransferSrcScalarPerVector_max =
-          min(2, thread_slice_length_gemm1n);
-
-      constexpr ck::index_t thread_slice_length_cshuflle_n =
-          (kCShuffleNXdlPerWavePerShuffle * kGemm1NPerBlock /
-           kGemm1NXdlPerWave) /
-          kCShuffleBlockTransferClusterLengths::At(I3);
-
-      constexpr ck::index_t kCShuffleBlockTransferScalarPerVector_max =
-          min(2, thread_slice_length_cshuflle_n);
-
-      if constexpr (
-          kB1BlockTransferSrcScalarPerVector_max >=
-          kCShuffleBlockTransferScalarPerVector_max) {
-        ALIGN_SWITCH_2(
-            kABBlockTransferSrcScalarPerVector_max,
-            kABBlockTransferSrcScalarPerVector,
-            param.K,
-            kB1BlockTransferSrcScalarPerVector_max,
-            kB1BlockTransferSrcScalarPerVector,
-            param.Kv,
-            [&] {
-              constexpr ck::index_t kCShuffleBlockTransferScalarPerVector =
-                  min(kB1BlockTransferSrcScalarPerVector,
-                      kCShuffleBlockTransferScalarPerVector_max);
-
-              static_assert(
-                  kB1BlockTransferSrcScalarPerVector > 0,
-                  "kB1BlockTransferSrcScalarPerVector must be positive");
-
-              using DeviceOpInstance = DeviceOpInstanceTemp_V2<
-                  kGemm1NPerBlock,
-                  kGemm1NXdlPerWave,
-                  kCShuffleNXdlPerWavePerShuffle,
-                  kCShuffleBlockTransferClusterLengths,
-                  kABBlockTransferSrcScalarPerVector,
-                  kB1BlockTransferSrcScalarPerVector,
-                  kCShuffleBlockTransferScalarPerVector>;
-
-              RunWithDeviceOp<DeviceOpInstance>(param, stream);
-            });
-      } else {
-        ALIGN_SWITCH_2(
-            kABBlockTransferSrcScalarPerVector_max,
-            kABBlockTransferSrcScalarPerVector,
-            param.K,
-            kCShuffleBlockTransferScalarPerVector_max,
-            kCShuffleBlockTransferScalarPerVector,
-            param.Kv,
-            [&] {
-              constexpr ck::index_t kB1BlockTransferSrcScalarPerVector =
-                  min(kCShuffleBlockTransferScalarPerVector,
-                      kB1BlockTransferSrcScalarPerVector_max);
-
-              static_assert(
-                  kB1BlockTransferSrcScalarPerVector > 0,
-                  "kB1BlockTransferSrcScalarPerVector must be positive");
-
-              using DeviceOpInstance = DeviceOpInstanceTemp_V2<
-                  kGemm1NPerBlock,
-                  kGemm1NXdlPerWave,
-                  kCShuffleNXdlPerWavePerShuffle,
-                  kCShuffleBlockTransferClusterLengths,
-                  kABBlockTransferSrcScalarPerVector,
-                  kB1BlockTransferSrcScalarPerVector,
-                  kCShuffleBlockTransferScalarPerVector>;
-
-              RunWithDeviceOp<DeviceOpInstance>(param, stream);
-            });
-      };
-    };
-  };
-
-  template <typename DeviceOpInstance>
-  static void RunWithDeviceOp(
-      BatchedBackwardParams& param,
-      hipStream_t stream) {
-    std::vector<ck::index_t> q_gs_ms_ks_lengths{
-        param.B, param.Hq, param.M, param.K};
-    std::vector<ck::index_t> q_gs_ms_ks_strides{
-        param.q_strides[0],
-        param.q_strides[2],
-        param.q_strides[1],
-        param.q_strides[3]};
-
-    std::vector<ck::index_t> k_gs_ns_ks_lengths{
-        param.B, param.Hkv, param.N, param.K};
-    std::vector<ck::index_t> k_gs_ns_ks_strides{
-        param.k_strides[0],
-        param.k_strides[2],
-        param.k_strides[1],
-        param.k_strides[3]};
-
-    std::vector<ck::index_t> kgrad_gs_ns_ks_lengths = {
-        param.B, param.Hq, param.N, param.K};
-    std::vector<ck::index_t> kgrad_gs_ns_ks_strides = {
-        param.tmp_grad_k_strides[0],
-        param.tmp_grad_k_strides[2],
-        param.tmp_grad_k_strides[1],
-        param.tmp_grad_k_strides[3]};
-
-    std::vector<ck::index_t> v_gs_os_ns_lengths{
-        param.B, param.Hkv, param.Kv, param.N};
-    std::vector<ck::index_t> v_gs_os_ns_strides{
-        param.v_strides[0],
-        param.v_strides[2],
-        param.v_strides[3],
-        param.v_strides[1]};
-
-    std::vector<ck::index_t> vgrad_gs_os_ns_lengths = {
-        param.B, param.Hq, param.Kv, param.N};
-    std::vector<ck::index_t> vgrad_gs_os_ns_strides = {
-        param.tmp_grad_v_strides[0],
-        param.tmp_grad_v_strides[2],
-        param.tmp_grad_v_strides[3],
-        param.tmp_grad_v_strides[1]};
-
-    std::vector<ck::index_t> y_gs_ms_os_lengths{
-        param.B, param.Hq, param.M, param.Kv};
-    std::vector<ck::index_t> y_gs_ms_os_strides{
-        param.out_strides[0],
-        param.out_strides[2],
-        param.out_strides[1],
-        param.out_strides[3]};
-
-    std::vector<ck::index_t> lse_gs_ms_lengths{param.B, param.Hq, param.M};
-
-    std::vector<ck::index_t> d_gs_ms_ns_lengths;
-    std::vector<ck::index_t> d_gs_ms_ns_strides;
-
-    if constexpr (has_attn_bias) {
-      d_gs_ms_ns_lengths = {param.B, param.Hq, param.M, param.N};
-      d_gs_ms_ns_strides = {
-          param.attn_bias_strides[0],
-          param.attn_bias_strides[1],
-          param.attn_bias_strides[2],
-          param.attn_bias_strides[3]};
-    } else {
-      d_gs_ms_ns_lengths = {1, 1, 1, 1};
-      d_gs_ms_ns_strides = {0, 0, 0, 0};
-    };
-
-    float alpha = param.scale;
-
-    auto op = DeviceOpInstance{};
-    auto invoker = op.MakeInvoker();
-
-    auto arg_ptr = op.MakeArgumentPointer(
-        param.q_ptr,
-        param.k_ptr,
-        nullptr, // p_z_grid
-        param.v_ptr,
-        param.out_ptr,
-        param.logsumexp_ptr,
-        param.grad_out_ptr,
-        param.grad_q_ptr,
-        param.grad_k_ptr,
-        param.grad_v_ptr,
-        param.has_attn_bias ? param.attn_bias_ptr : nullptr,
-        nullptr, // p_acc1_bias
-        param.bias_has_grad ? param.grad_bias_ptr : nullptr,
-        nullptr,
-        q_gs_ms_ks_lengths, // q, dQ should have same shape
-        q_gs_ms_ks_strides,
-        k_gs_ns_ks_lengths, // k, dK should have same shape
-        k_gs_ns_ks_strides,
-        {1, 1, 1, 1}, // z_gs_ms_ns_lengths
-        {0, 0, 0, 0}, // z_gs_ms_ns_strides
-        v_gs_os_ns_lengths, // v, dV should have same shape
-        v_gs_os_ns_strides,
-        y_gs_ms_os_lengths, // y, dY should have same shape
-        y_gs_ms_os_strides,
-        lse_gs_ms_lengths,
-        param.is_mqa_gqa ? kgrad_gs_ns_ks_lengths : k_gs_ns_ks_lengths,
-        param.is_mqa_gqa ? kgrad_gs_ns_ks_strides : k_gs_ns_ks_strides,
-        param.is_mqa_gqa ? vgrad_gs_os_ns_lengths : v_gs_os_ns_lengths,
-        param.is_mqa_gqa ? vgrad_gs_os_ns_strides : v_gs_os_ns_strides,
-        d_gs_ms_ns_lengths, // bias, grad_bias should have same shape
-        d_gs_ms_ns_strides,
-        {}, // acc1_biases_gs_ms_os_lengths
-        {}, // acc1_biases_gs_ms_os_strides
-        QKVElementOp{},
-        QKVElementOp{},
-        Scale{alpha},
-        QKVElementOp{},
-        YElementOp{},
-        param.dropout_prob,
-        std::tuple<int64_t, int64_t>(param.philox_seed, param.philox_offset));
-
-    if (!op.IsSupportedArgument(arg_ptr.get())) {
-      std::ostringstream ostr;
-      ostr << op.GetTypeString() << " does not support this problem";
-
-      throw std::runtime_error(ostr.str());
-    }
-
-    (void)invoker.Run(arg_ptr.get(), StreamConfig{stream, false});
-  };
-};
-
-template <
-    typename scalar_t,
-    int32_t custom_mask_type,
-    bool has_attn_bias,
-    bool use_fp32_qkv_grad>
-void run_batched_backward_masktype_attnbias_dispatched(
-    BatchedBackwardParams& param,
-    hipStream_t stream) {
-  batched_backward_masktype_attnbias_dispatched<
-      scalar_t,
-      custom_mask_type,
-      has_attn_bias,
-      use_fp32_qkv_grad>::Run(param, stream);
-};
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_backward_bp16.cpp b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_backward_bp16.cpp
deleted file mode 100644
index 4a589ae02..000000000
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_backward_bp16.cpp
+++ /dev/null
@@ -1,113 +0,0 @@
-/*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
- *
- * This source code is licensed under the BSD-style license found in the
- * LICENSE file in the root directory of this source tree.
- */
-#include <ck/ck.hpp>
-#include <stdexcept>
-
-#include "ck_bool_switch.h"
-#include "ck_fmha_batched_backward.h"
-
-extern template void run_batched_backward_masktype_attnbias_dispatched<
-    ck::bhalf_t,
-    0,
-    true,
-    true>(BatchedBackwardParams& param, hipStream_t stream);
-
-extern template void run_batched_backward_masktype_attnbias_dispatched<
-    ck::bhalf_t,
-    0,
-    true,
-    false>(BatchedBackwardParams& param, hipStream_t stream);
-
-extern template void run_batched_backward_masktype_attnbias_dispatched<
-    ck::bhalf_t,
-    0,
-    false,
-    true>(BatchedBackwardParams& param, hipStream_t stream);
-
-extern template void run_batched_backward_masktype_attnbias_dispatched<
-    ck::bhalf_t,
-    0,
-    false,
-    false>(BatchedBackwardParams& param, hipStream_t stream);
-
-extern template void run_batched_backward_masktype_attnbias_dispatched<
-    ck::bhalf_t,
-    1,
-    true,
-    true>(BatchedBackwardParams& param, hipStream_t stream);
-
-extern template void run_batched_backward_masktype_attnbias_dispatched<
-    ck::bhalf_t,
-    1,
-    true,
-    false>(BatchedBackwardParams& param, hipStream_t stream);
-
-extern template void run_batched_backward_masktype_attnbias_dispatched<
-    ck::bhalf_t,
-    1,
-    false,
-    true>(BatchedBackwardParams& param, hipStream_t stream);
-
-extern template void run_batched_backward_masktype_attnbias_dispatched<
-    ck::bhalf_t,
-    1,
-    false,
-    false>(BatchedBackwardParams& param, hipStream_t stream);
-
-extern template void run_batched_backward_masktype_attnbias_dispatched<
-    ck::bhalf_t,
-    2,
-    true,
-    true>(BatchedBackwardParams& param, hipStream_t stream);
-
-extern template void run_batched_backward_masktype_attnbias_dispatched<
-    ck::bhalf_t,
-    2,
-    true,
-    false>(BatchedBackwardParams& param, hipStream_t stream);
-
-extern template void run_batched_backward_masktype_attnbias_dispatched<
-    ck::bhalf_t,
-    2,
-    false,
-    true>(BatchedBackwardParams& param, hipStream_t stream);
-
-extern template void run_batched_backward_masktype_attnbias_dispatched<
-    ck::bhalf_t,
-    2,
-    false,
-    false>(BatchedBackwardParams& param, hipStream_t stream);
-
-void batched_backward_bp16(BatchedBackwardParams& param, hipStream_t stream) {
-  BOOL_SWITCH_2(
-      param.has_attn_bias,
-      HAS_ATTN_BIAS,
-      param.use_fp32_qkv_grad,
-      USE_FP32_QKV_GRAD,
-      [&] {
-        if (param.custom_mask_type == 0)
-          run_batched_backward_masktype_attnbias_dispatched<
-              ck::bhalf_t,
-              0,
-              HAS_ATTN_BIAS,
-              USE_FP32_QKV_GRAD>(param, stream);
-        else if (param.custom_mask_type == 1)
-          run_batched_backward_masktype_attnbias_dispatched<
-              ck::bhalf_t,
-              1,
-              HAS_ATTN_BIAS,
-              USE_FP32_QKV_GRAD>(param, stream);
-        else if (param.custom_mask_type == 2)
-          run_batched_backward_masktype_attnbias_dispatched<
-              ck::bhalf_t,
-              2,
-              HAS_ATTN_BIAS,
-              USE_FP32_QKV_GRAD>(param, stream);
-        else
-          throw std::runtime_error("Invalid custom_mask_type value");
-      });
-};
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_backward_fp16.cpp b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_backward_fp16.cpp
deleted file mode 100644
index b218809be..000000000
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_backward_fp16.cpp
+++ /dev/null
@@ -1,113 +0,0 @@
-/*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
- *
- * This source code is licensed under the BSD-style license found in the
- * LICENSE file in the root directory of this source tree.
- */
-#include <ck/ck.hpp>
-#include <stdexcept>
-
-#include "ck_bool_switch.h"
-#include "ck_fmha_batched_backward.h"
-
-extern template void run_batched_backward_masktype_attnbias_dispatched<
-    ck::half_t,
-    0,
-    true,
-    true>(BatchedBackwardParams& param, hipStream_t stream);
-
-extern template void run_batched_backward_masktype_attnbias_dispatched<
-    ck::half_t,
-    0,
-    true,
-    false>(BatchedBackwardParams& param, hipStream_t stream);
-
-extern template void run_batched_backward_masktype_attnbias_dispatched<
-    ck::half_t,
-    0,
-    false,
-    true>(BatchedBackwardParams& param, hipStream_t stream);
-
-extern template void run_batched_backward_masktype_attnbias_dispatched<
-    ck::half_t,
-    0,
-    false,
-    false>(BatchedBackwardParams& param, hipStream_t stream);
-
-extern template void run_batched_backward_masktype_attnbias_dispatched<
-    ck::half_t,
-    1,
-    true,
-    true>(BatchedBackwardParams& param, hipStream_t stream);
-
-extern template void run_batched_backward_masktype_attnbias_dispatched<
-    ck::half_t,
-    1,
-    true,
-    false>(BatchedBackwardParams& param, hipStream_t stream);
-
-extern template void run_batched_backward_masktype_attnbias_dispatched<
-    ck::half_t,
-    1,
-    false,
-    true>(BatchedBackwardParams& param, hipStream_t stream);
-
-extern template void run_batched_backward_masktype_attnbias_dispatched<
-    ck::half_t,
-    1,
-    false,
-    false>(BatchedBackwardParams& param, hipStream_t stream);
-
-extern template void run_batched_backward_masktype_attnbias_dispatched<
-    ck::half_t,
-    2,
-    true,
-    true>(BatchedBackwardParams& param, hipStream_t stream);
-
-extern template void run_batched_backward_masktype_attnbias_dispatched<
-    ck::half_t,
-    2,
-    true,
-    false>(BatchedBackwardParams& param, hipStream_t stream);
-
-extern template void run_batched_backward_masktype_attnbias_dispatched<
-    ck::half_t,
-    2,
-    false,
-    true>(BatchedBackwardParams& param, hipStream_t stream);
-
-extern template void run_batched_backward_masktype_attnbias_dispatched<
-    ck::half_t,
-    2,
-    false,
-    false>(BatchedBackwardParams& param, hipStream_t stream);
-
-void batched_backward_fp16(BatchedBackwardParams& param, hipStream_t stream) {
-  BOOL_SWITCH_2(
-      param.has_attn_bias,
-      HAS_ATTN_BIAS,
-      param.use_fp32_qkv_grad,
-      USE_FP32_QKV_GRAD,
-      [&] {
-        if (param.custom_mask_type == 0)
-          run_batched_backward_masktype_attnbias_dispatched<
-              ck::half_t,
-              0,
-              HAS_ATTN_BIAS,
-              USE_FP32_QKV_GRAD>(param, stream);
-        else if (param.custom_mask_type == 1)
-          run_batched_backward_masktype_attnbias_dispatched<
-              ck::half_t,
-              1,
-              HAS_ATTN_BIAS,
-              USE_FP32_QKV_GRAD>(param, stream);
-        else if (param.custom_mask_type == 2)
-          run_batched_backward_masktype_attnbias_dispatched<
-              ck::half_t,
-              2,
-              HAS_ATTN_BIAS,
-              USE_FP32_QKV_GRAD>(param, stream);
-        else
-          throw std::runtime_error("Invalid custom_mask_type value");
-      });
-};
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_forward.h b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_forward.h
deleted file mode 100644
index f96a52d56..000000000
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_forward.h
+++ /dev/null
@@ -1,379 +0,0 @@
-/*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
- *
- * This source code is licensed under the BSD-style license found in the
- * LICENSE file in the root directory of this source tree.
- */
-#pragma once
-
-#include <sstream>
-#include <stdexcept>
-
-#include <ck/ck.hpp>
-#include <ck/tensor_operation/gpu/device/gemm_specialization.hpp>
-#include <ck/tensor_operation/gpu/device/impl/device_batched_mha_fwd_xdl_cshuffle_v2.hpp>
-#include <ck/tensor_operation/gpu/device/tensor_specialization.hpp>
-#include <ck/tensor_operation/gpu/element/element_wise_operation.hpp>
-#include <ck/utility/math.hpp>
-#include <ck/utility/number.hpp>
-
-#include "ck_align_switch.h"
-#include "ck_fmha_common_gemm_constants.h"
-#include "ck_fmha_forward_gemm_constants.h"
-#include "ck_fmha_op_helper.h"
-#include "ck_fmha_params.h"
-
-template <typename scalar_t, int32_t custom_mask_type, bool has_attn_bias>
-struct batched_forward_masktype_attnbias_dispatched {
-  using PassThrough = ck::tensor_operation::element_wise::PassThrough;
-
-  using GemmDataType = scalar_t;
-  using ADataType = scalar_t;
-  using B0DataType = scalar_t;
-  using B1DataType = scalar_t;
-  using AccDataType = F32;
-  using CShuffleDataType = F32;
-  using CDataType = scalar_t;
-  using ZDataType = unsigned short;
-  using LSEDataType = F32;
-  using Acc0BiasDataType =
-      typename std::conditional<has_attn_bias, scalar_t, void>::type;
-  using Acc1BiasDataType = void;
-
-  static constexpr ck::index_t NumDimG = 2;
-  static constexpr ck::index_t NumDimM = 1;
-  static constexpr ck::index_t NumDimN = 1;
-  static constexpr ck::index_t NumDimK = 1;
-  static constexpr ck::index_t NumDimO = 1;
-
-  using AElementOp = PassThrough;
-  using B0ElementOp = PassThrough;
-  using Acc0ElementOp = ck::tensor_operation::element_wise::Scale;
-  using B1ElementOp = PassThrough;
-  using CElementOp = PassThrough;
-
-  static constexpr auto GemmSpec =
-      ck::tensor_operation::device::GemmSpecialization::MNKOPadding;
-  static constexpr auto MaskingSpec =
-      static_cast<ck::tensor_operation::device::MaskingSpecialization>(
-          custom_mask_type);
-
-  static constexpr ck::index_t kAcc0BiasTransferSrcScalarPerVector = 1;
-
-#ifndef BATCHED_FORWARD_HEADDIM_SWITCH
-#define BATCHED_FORWARD_HEADDIM_SWITCH(HEAD_DIM1, HEAD_DIM2, ...) \
-  [&] {                                                           \
-    if (HEAD_DIM1 <= 32 && HEAD_DIM2 <= 32) {                     \
-      constexpr ck::index_t kGemm1NPerBlock = 32;                 \
-      constexpr ck::index_t kGemm1NXdlPerWave = 1;                \
-      constexpr ck::index_t kCShuffleNXdlPerWavePerShuffle = 1;   \
-      __VA_ARGS__();                                              \
-    } else if (HEAD_DIM1 <= 64 && HEAD_DIM2 <= 64) {              \
-      constexpr ck::index_t kGemm1NPerBlock = 64;                 \
-      constexpr ck::index_t kGemm1NXdlPerWave = 2;                \
-      constexpr ck::index_t kCShuffleNXdlPerWavePerShuffle = 2;   \
-      __VA_ARGS__();                                              \
-    } else {                                                      \
-      constexpr ck::index_t kGemm1NPerBlock = 128;                \
-      constexpr ck::index_t kGemm1NXdlPerWave = 4;                \
-      constexpr ck::index_t kCShuffleNXdlPerWavePerShuffle = 4;   \
-      __VA_ARGS__();                                              \
-    }                                                             \
-  }()
-#endif
-
-  // clang-format off
-  template <
-      ck::index_t kGemm1NPerBlock,
-      ck::index_t kGemm1NXdlPerWave,
-      ck::index_t kCShuffleNXdlPerWavePerShuffle,
-      ck::index_t kABBlockTransferSrcScalarPerVector,
-      ck::index_t kB1BlockTransferSrcScalarPerVector,
-      ck::index_t kCShuffleBlockTransferScalarPerVector>
-  using DeviceOpInstanceTemp = ck::tensor_operation::device::DeviceBatchedMultiheadAttentionForward_Xdl_CShuffle_V2<
-          GemmOpConstantsCommon::NumDimG,
-          GemmOpConstantsCommon::NumDimM,
-          GemmOpConstantsCommon::NumDimN,
-          GemmOpConstantsCommon::NumDimK,
-          GemmOpConstantsCommon::NumDimO,
-          ADataType,
-          B0DataType,
-          B1DataType,
-          CDataType,
-          GemmDataType,
-          ZDataType,
-          LSEDataType,
-          Acc0BiasDataType,
-          Acc1BiasDataType,
-          AccDataType,
-          CShuffleDataType,
-          AElementOp,
-          B0ElementOp,
-          Acc0ElementOp,
-          B1ElementOp,
-          CElementOp,
-          GemmSpec,
-          GemmOpConstantsCommon::TensorSpecA,
-          GemmOpConstantsCommon::TensorSpecB0,
-          GemmOpConstantsCommon::TensorSpecB1,
-          GemmOpConstantsCommon::TensorSpecC,
-          GemmOpConstantsBatchedForward::NumGemmKPrefetchStage,
-          GemmOpConstantsBatchedForward::BlockSize,
-          GemmOpConstantsBatchedForward::MPerBlock,
-          GemmOpConstantsBatchedForward::NPerBlock,
-          GemmOpConstantsBatchedForward::KPerBlock,
-          kGemm1NPerBlock,
-          GemmOpConstantsBatchedForward::Gemm1KPerBlock,
-          GemmOpConstantsBatchedForward::AK1,
-          GemmOpConstantsBatchedForward::BK1,
-          GemmOpConstantsBatchedForward::B1K1,
-          GemmOpConstantsBatchedForward::MPerXDL,
-          GemmOpConstantsBatchedForward::NPerXDL,
-          GemmOpConstantsBatchedForward::MXdlPerWave,
-          GemmOpConstantsBatchedForward::NXdlPerWave,
-          kGemm1NXdlPerWave,
-          GemmOpConstantsBatchedForward::DropoutStep,
-          GemmOpConstantsBatchedForward::ABlockTransferThreadClusterLengths_AK0_M_AK1,
-          GemmOpConstantsBatchedForward::ABlockTransferThreadClusterArrangeOrder,
-          GemmOpConstantsBatchedForward::ABlockTransferSrcAccessOrder,
-          GemmOpConstantsBatchedForward::ABlockTransferSrcVectorDim,
-          kABBlockTransferSrcScalarPerVector,
-          GemmOpConstantsBatchedForward::ABlockTransferDstScalarPerVector_AK1,
-          GemmOpConstantsBatchedForward::ABlockLdsExtraM,
-          GemmOpConstantsBatchedForward::BBlockTransferThreadClusterLengths_BK0_N_BK1,
-          GemmOpConstantsBatchedForward::BBlockTransferThreadClusterArrangeOrder,
-          GemmOpConstantsBatchedForward::BBlockTransferSrcAccessOrder,
-          GemmOpConstantsBatchedForward::BBlockTransferSrcVectorDim,
-          kABBlockTransferSrcScalarPerVector,
-          GemmOpConstantsBatchedForward::BBlockTransferDstScalarPerVector_BK1,
-          GemmOpConstantsBatchedForward::BBlockLdsExtraN,
-          kAcc0BiasTransferSrcScalarPerVector,
-          GemmOpConstantsBatchedForward::B1BlockTransferThreadClusterLengths_BK0_N_BK1,
-          GemmOpConstantsBatchedForward::B1BlockTransferThreadClusterArrangeOrder,
-          GemmOpConstantsBatchedForward::B1BlockTransferSrcAccessOrder,
-          GemmOpConstantsBatchedForward::B1BlockTransferSrcVectorDim,
-          kB1BlockTransferSrcScalarPerVector,
-          GemmOpConstantsBatchedForward::B1BlockTransferDstScalarPerVector_BK1,
-          GemmOpConstantsBatchedForward::B1BlockLdsExtraN,
-          GemmOpConstantsBatchedForward::CShuffleMXdlPerWavePerShuffle,
-          kCShuffleNXdlPerWavePerShuffle,
-          GemmOpConstantsBatchedForward::CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock,
-          kCShuffleBlockTransferScalarPerVector,
-          GemmOpConstantsBatchedForward::Acc1BiasTransferSrcScalarPerVector,
-          MaskingSpec>;
-  // clang-format on
-
-  static constexpr auto I1 = ck::Number<1>{};
-  static constexpr auto I2 = ck::Number<2>{};
-  static constexpr auto I3 = ck::Number<3>{};
-
-  static void Run(BatchedForwardParams& param, hipStream_t stream) {
-    using ck::math::min;
-
-    // compile-time constants which don't depend on head-dim switching
-    constexpr ck::index_t thread_slice_length_ak1 =
-        GemmOpConstantsBatchedForward::AK1 /
-        GemmOpConstantsBatchedForward::
-            ABlockTransferThreadClusterLengths_AK0_M_AK1::At(I2);
-    constexpr ck::index_t thread_slice_length_bk1 =
-        GemmOpConstantsBatchedForward::BK1 /
-        GemmOpConstantsBatchedForward::
-            BBlockTransferThreadClusterLengths_BK0_N_BK1::At(I2);
-
-    static_assert(
-        thread_slice_length_ak1 == thread_slice_length_bk1,
-        "ABlockTransfer and BBlockTransfer should use completely same K1 sizes and "
-        "ThreadClusterLengths!");
-
-    constexpr ck::index_t kABBlockTransferSrcScalarPerVector_max =
-        min(8, thread_slice_length_ak1);
-
-    BATCHED_FORWARD_HEADDIM_SWITCH(param.K, param.Kv, [&] {
-      constexpr ck::index_t thread_slice_length_gemm1n = kGemm1NPerBlock /
-          GemmOpConstantsBatchedForward::
-              B1BlockTransferThreadClusterLengths_BK0_N_BK1::At(I1);
-      constexpr ck::index_t kB1BlockTransferSrcScalarPerVector_max =
-          min(2, thread_slice_length_gemm1n);
-
-      constexpr ck::index_t thread_slice_length_cshuflle_n =
-          (kCShuffleNXdlPerWavePerShuffle * kGemm1NPerBlock /
-           kGemm1NXdlPerWave) /
-          GemmOpConstantsBatchedForward::
-              CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock ::
-                  At(I3);
-
-      constexpr ck::index_t kCShuffleBlockTransferScalarPerVector_max =
-          min(4, thread_slice_length_cshuflle_n);
-
-      if constexpr (
-          kB1BlockTransferSrcScalarPerVector_max >=
-          kCShuffleBlockTransferScalarPerVector_max) {
-        ALIGN_SWITCH_2(
-            kABBlockTransferSrcScalarPerVector_max,
-            kABBlockTransferSrcScalarPerVector,
-            param.K,
-            kB1BlockTransferSrcScalarPerVector_max,
-            kB1BlockTransferSrcScalarPerVector,
-            param.Kv,
-            [&] {
-              constexpr ck::index_t kCShuffleBlockTransferScalarPerVector =
-                  min(kB1BlockTransferSrcScalarPerVector,
-                      kCShuffleBlockTransferScalarPerVector_max);
-              using DeviceOpInstance = DeviceOpInstanceTemp<
-                  kGemm1NPerBlock,
-                  kGemm1NXdlPerWave,
-                  kCShuffleNXdlPerWavePerShuffle,
-                  kABBlockTransferSrcScalarPerVector,
-                  kB1BlockTransferSrcScalarPerVector,
-                  kCShuffleBlockTransferScalarPerVector>;
-
-              RunWithDeviceOp<DeviceOpInstance>(param, stream);
-            });
-      } else {
-        ALIGN_SWITCH_2(
-            kABBlockTransferSrcScalarPerVector_max,
-            kABBlockTransferSrcScalarPerVector,
-            param.K,
-            kCShuffleBlockTransferScalarPerVector_max,
-            kCShuffleBlockTransferScalarPerVector,
-            param.Kv,
-            [&] {
-              constexpr ck::index_t kB1BlockTransferSrcScalarPerVector =
-                  min(kCShuffleBlockTransferScalarPerVector,
-                      kB1BlockTransferSrcScalarPerVector_max);
-              using DeviceOpInstance = DeviceOpInstanceTemp<
-                  kGemm1NPerBlock,
-                  kGemm1NXdlPerWave,
-                  kCShuffleNXdlPerWavePerShuffle,
-                  kABBlockTransferSrcScalarPerVector,
-                  kB1BlockTransferSrcScalarPerVector,
-                  kCShuffleBlockTransferScalarPerVector>;
-
-              RunWithDeviceOp<DeviceOpInstance>(param, stream);
-            });
-      };
-    });
-  };
-
-  template <typename DeviceOpInstance>
-  static void RunWithDeviceOp(BatchedForwardParams& param, hipStream_t stream) {
-    std::vector<ck::index_t> a_gs_ms_ks_lengths{
-        param.B, param.Hq, param.M, param.K};
-    std::vector<ck::index_t> a_gs_ms_ks_strides{
-        param.q_strides[0],
-        param.q_strides[2],
-        param.q_strides[1],
-        param.q_strides[3]};
-
-    std::vector<ck::index_t> b0_gs_ns_ks_lengths{
-        param.B, param.Hkv, param.N, param.K};
-    std::vector<ck::index_t> b0_gs_ns_ks_strides{
-        param.k_strides[0],
-        param.k_strides[2],
-        param.k_strides[1],
-        param.k_strides[3]};
-
-    // to be changed to b1_gs_ns_os_lengths
-    std::vector<ck::index_t> b1_gs_os_ns_lengths{
-        param.B, param.Hkv, param.Kv, param.N};
-    std::vector<ck::index_t> b1_gs_os_ns_strides{
-        param.v_strides[0],
-        param.v_strides[2],
-        param.v_strides[3],
-        param.v_strides[1]};
-
-    std::vector<ck::index_t> c_gs_ms_os_lengths{
-        param.B, param.Hq, param.M, param.Kv};
-    std::vector<ck::index_t> c_gs_ms_os_strides{
-        param.out_strides[0],
-        param.out_strides[2],
-        param.out_strides[1],
-        param.out_strides[3]};
-
-    std::vector<ck::index_t> lse_gs_ms_lengths{param.B, param.Hq, param.M};
-
-    std::vector<ck::index_t> d_gs_ms_ns_lengths;
-    std::vector<ck::index_t> d_gs_ms_ns_strides;
-
-    if constexpr (has_attn_bias) {
-      d_gs_ms_ns_lengths = {param.B, param.Hq, param.M, param.N};
-      d_gs_ms_ns_strides = {
-          param.attn_bias_strides[0],
-          param.attn_bias_strides[1],
-          param.attn_bias_strides[2],
-          param.attn_bias_strides[3]};
-    } else {
-      d_gs_ms_ns_lengths = {1, 1, 1, 1};
-      d_gs_ms_ns_strides = {0, 0, 0, 0};
-    };
-
-    float alpha = param.scale;
-
-    auto a_element_op = AElementOp{};
-    auto b0_element_op = B0ElementOp{};
-    auto acc0_element_op = Acc0ElementOp{alpha};
-    auto b1_element_op = B1ElementOp{};
-    auto c_element_op = CElementOp{};
-
-    auto op = DeviceOpInstance{};
-    auto invoker = op.MakeInvoker();
-
-    auto arg_ptr = op.MakeArgumentPointer(
-        param.q_ptr,
-        param.k_ptr,
-        param.v_ptr,
-        param.out_ptr,
-        nullptr,
-        param.logsumexp_ptr,
-        param.has_attn_bias ? param.attn_bias_ptr : nullptr,
-        {}, // p_acc1_biases;
-        a_gs_ms_ks_lengths,
-        a_gs_ms_ks_strides,
-        b0_gs_ns_ks_lengths,
-        b0_gs_ns_ks_strides,
-        b1_gs_os_ns_lengths,
-        b1_gs_os_ns_strides,
-        c_gs_ms_os_lengths,
-        c_gs_ms_os_strides,
-        {1, 1, 1, 1},
-        {0, 0, 0, 0},
-        lse_gs_ms_lengths,
-        d_gs_ms_ns_lengths,
-        d_gs_ms_ns_strides,
-        {}, // acc1_biases_gs_ms_os_lengths
-        {}, // acc1_biases_gs_ms_os_strides,
-        a_element_op,
-        b0_element_op,
-        acc0_element_op,
-        b1_element_op,
-        c_element_op,
-        param.use_dropout ? param.dropout_prob : 0.0f, // dropout ratio
-        std::tuple<int64_t, int64_t>(
-            param.philox_seed,
-            param.philox_offset)); // dropout random seed and offset
-
-    SimpleDeviceMem workspace(op.GetWorkSpaceSize(arg_ptr.get()));
-
-    op.SetWorkSpacePointer(arg_ptr.get(), workspace.GetDeviceBuffer());
-
-    if (!op.IsSupportedArgument(arg_ptr.get())) {
-      std::ostringstream ostr;
-
-      ostr << op.GetTypeString() << " does not support this problem";
-
-      throw std::runtime_error(ostr.str());
-    }
-
-    invoker.Run(arg_ptr.get(), StreamConfig{stream, false});
-  };
-};
-
-template <typename scalar_t, int32_t custom_mask_type, bool has_attn_bias>
-void run_batched_forward_masktype_attnbias_dispatched(
-    BatchedForwardParams& param,
-    hipStream_t stream) {
-  batched_forward_masktype_attnbias_dispatched<
-      scalar_t,
-      custom_mask_type,
-      has_attn_bias>::Run(param, stream);
-};
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_forward_bp16.cpp b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_forward_bp16.cpp
deleted file mode 100644
index 6cc45e3a2..000000000
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_forward_bp16.cpp
+++ /dev/null
@@ -1,63 +0,0 @@
-/*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
- *
- * This source code is licensed under the BSD-style license found in the
- * LICENSE file in the root directory of this source tree.
- */
-#include <ck/ck.hpp>
-#include <stdexcept>
-
-#include "ck_bool_switch.h"
-#include "ck_fmha_batched_forward.h"
-
-extern template void run_batched_forward_masktype_attnbias_dispatched<
-    ck::bhalf_t,
-    0,
-    true>(BatchedForwardParams& param, hipStream_t stream);
-
-extern template void run_batched_forward_masktype_attnbias_dispatched<
-    ck::bhalf_t,
-    0,
-    false>(BatchedForwardParams& param, hipStream_t stream);
-
-extern template void run_batched_forward_masktype_attnbias_dispatched<
-    ck::bhalf_t,
-    1,
-    true>(BatchedForwardParams& param, hipStream_t stream);
-
-extern template void run_batched_forward_masktype_attnbias_dispatched<
-    ck::bhalf_t,
-    1,
-    false>(BatchedForwardParams& param, hipStream_t stream);
-
-extern template void run_batched_forward_masktype_attnbias_dispatched<
-    ck::bhalf_t,
-    2,
-    true>(BatchedForwardParams& param, hipStream_t stream);
-
-extern template void run_batched_forward_masktype_attnbias_dispatched<
-    ck::bhalf_t,
-    2,
-    false>(BatchedForwardParams& param, hipStream_t stream);
-
-void batched_forward_bp16(BatchedForwardParams& param, hipStream_t stream) {
-  BOOL_SWITCH_1(param.has_attn_bias, HAS_ATTN_BIAS, [&] {
-    if (param.custom_mask_type == 0)
-      run_batched_forward_masktype_attnbias_dispatched<
-          ck::bhalf_t,
-          0,
-          HAS_ATTN_BIAS>(param, stream);
-    else if (param.custom_mask_type == 1)
-      run_batched_forward_masktype_attnbias_dispatched<
-          ck::bhalf_t,
-          1,
-          HAS_ATTN_BIAS>(param, stream);
-    else if (param.custom_mask_type == 2)
-      run_batched_forward_masktype_attnbias_dispatched<
-          ck::bhalf_t,
-          2,
-          HAS_ATTN_BIAS>(param, stream);
-    else
-      throw std::runtime_error("Invalid custom_mask_type value");
-  });
-};
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_forward_fp16.cpp b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_forward_fp16.cpp
deleted file mode 100644
index e153cfa3c..000000000
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_forward_fp16.cpp
+++ /dev/null
@@ -1,63 +0,0 @@
-/*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
- *
- * This source code is licensed under the BSD-style license found in the
- * LICENSE file in the root directory of this source tree.
- */
-#include <ck/ck.hpp>
-#include <stdexcept>
-
-#include "ck_bool_switch.h"
-#include "ck_fmha_batched_forward.h"
-
-extern template void run_batched_forward_masktype_attnbias_dispatched<
-    ck::half_t,
-    0,
-    true>(BatchedForwardParams& param, hipStream_t stream);
-
-extern template void run_batched_forward_masktype_attnbias_dispatched<
-    ck::half_t,
-    0,
-    false>(BatchedForwardParams& param, hipStream_t stream);
-
-extern template void run_batched_forward_masktype_attnbias_dispatched<
-    ck::half_t,
-    1,
-    true>(BatchedForwardParams& param, hipStream_t stream);
-
-extern template void run_batched_forward_masktype_attnbias_dispatched<
-    ck::half_t,
-    1,
-    false>(BatchedForwardParams& param, hipStream_t stream);
-
-extern template void run_batched_forward_masktype_attnbias_dispatched<
-    ck::half_t,
-    2,
-    true>(BatchedForwardParams& param, hipStream_t stream);
-
-extern template void run_batched_forward_masktype_attnbias_dispatched<
-    ck::half_t,
-    2,
-    false>(BatchedForwardParams& param, hipStream_t stream);
-
-void batched_forward_fp16(BatchedForwardParams& param, hipStream_t stream) {
-  BOOL_SWITCH_1(param.has_attn_bias, HAS_ATTN_BIAS, [&] {
-    if (param.custom_mask_type == 0)
-      run_batched_forward_masktype_attnbias_dispatched<
-          ck::half_t,
-          0,
-          HAS_ATTN_BIAS>(param, stream);
-    else if (param.custom_mask_type == 1)
-      run_batched_forward_masktype_attnbias_dispatched<
-          ck::half_t,
-          1,
-          HAS_ATTN_BIAS>(param, stream);
-    else if (param.custom_mask_type == 2)
-      run_batched_forward_masktype_attnbias_dispatched<
-          ck::half_t,
-          2,
-          HAS_ATTN_BIAS>(param, stream);
-    else
-      throw std::runtime_error("Invalid custom_mask_type value");
-  });
-};
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_infer.h b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_infer.h
deleted file mode 100644
index c72fce2d5..000000000
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_infer.h
+++ /dev/null
@@ -1,359 +0,0 @@
-/*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
- *
- * This source code is licensed under the BSD-style license found in the
- * LICENSE file in the root directory of this source tree.
- */
-#pragma once
-
-#include <sstream>
-#include <stdexcept>
-
-#include <ck/ck.hpp>
-#include <ck/tensor_operation/gpu/device/gemm_specialization.hpp>
-#include <ck/tensor_operation/gpu/device/impl/device_batched_mha_infer_xdl_cshuffle.hpp>
-#include <ck/tensor_operation/gpu/device/tensor_specialization.hpp>
-#include <ck/tensor_operation/gpu/element/element_wise_operation.hpp>
-#include <ck/utility/math.hpp>
-#include <ck/utility/number.hpp>
-
-#include "ck_align_switch.h"
-#include "ck_fmha_common_gemm_constants.h"
-#include "ck_fmha_infer_gemm_constants.h"
-#include "ck_fmha_op_helper.h"
-#include "ck_fmha_params.h"
-
-template <typename scalar_t, int32_t custom_mask_type, bool has_attn_bias>
-struct batched_infer_masktype_attnbias_dispatched {
-  using PassThrough = ck::tensor_operation::element_wise::PassThrough;
-
-  using GemmDataType = scalar_t;
-  using ADataType = scalar_t;
-  using B0DataType = scalar_t;
-  using B1DataType = scalar_t;
-  using AccDataType = F32;
-  using CShuffleDataType = F32;
-  using CDataType = scalar_t;
-  using ZDataType = unsigned short;
-  using LSEDataType = F32;
-  using Acc0BiasDataType =
-      typename std::conditional<has_attn_bias, scalar_t, void>::type;
-  using Acc1BiasDataType = void;
-
-  using AElementOp = PassThrough;
-  using B0ElementOp = PassThrough;
-  using Acc0ElementOp = ck::tensor_operation::element_wise::Scale;
-  using B1ElementOp = PassThrough;
-  using CElementOp = PassThrough;
-
-  static constexpr auto GemmSpec =
-      ck::tensor_operation::device::GemmSpecialization::MNKOPadding;
-  static constexpr auto MaskingSpec =
-      static_cast<ck::tensor_operation::device::MaskingSpecialization>(
-          custom_mask_type);
-
-  static constexpr ck::index_t kAcc0BiasTransferSrcScalarPerVector = 1;
-
-#ifndef BATCHED_INFER_HEADDIM_SWITCH
-#define BATCHED_INFER_HEADDIM_SWITCH(HEAD_DIM1, HEAD_DIM2, ...) \
-  [&] {                                                         \
-    if (HEAD_DIM1 <= 32 && HEAD_DIM2 <= 32) {                   \
-      constexpr ck::index_t kGemm1NPerBlock = 32;               \
-      constexpr ck::index_t kGemm1NXdlPerWave = 1;              \
-      constexpr ck::index_t kCShuffleNXdlPerWavePerShuffle = 1; \
-      __VA_ARGS__();                                            \
-    } else if (HEAD_DIM1 <= 64 && HEAD_DIM2 <= 64) {            \
-      constexpr ck::index_t kGemm1NPerBlock = 64;               \
-      constexpr ck::index_t kGemm1NXdlPerWave = 2;              \
-      constexpr ck::index_t kCShuffleNXdlPerWavePerShuffle = 2; \
-      __VA_ARGS__();                                            \
-    } else {                                                    \
-      constexpr ck::index_t kGemm1NPerBlock = 128;              \
-      constexpr ck::index_t kGemm1NXdlPerWave = 4;              \
-      constexpr ck::index_t kCShuffleNXdlPerWavePerShuffle = 4; \
-      __VA_ARGS__();                                            \
-    }                                                           \
-  }()
-#endif
-
-  // clang-format off
-  template <
-      ck::index_t kGemm1NPerBlock,
-      ck::index_t kGemm1NXdlPerWave,
-      ck::index_t kCShuffleNXdlPerWavePerShuffle,
-      ck::index_t kABBlockTransferSrcScalarPerVector,
-      ck::index_t kB1BlockTransferSrcScalarPerVector,
-      ck::index_t kCShuffleBlockTransferScalarPerVector>
-  using DeviceOpInstanceTemp = ck::tensor_operation::device::DeviceBatchedMultiheadAttentionInfer_Xdl_CShuffle<
-          GemmOpConstantsCommon::NumDimG,
-          GemmOpConstantsCommon::NumDimM,
-          GemmOpConstantsCommon::NumDimN,
-          GemmOpConstantsCommon::NumDimK,
-          GemmOpConstantsCommon::NumDimO,
-          ADataType,
-          B0DataType,
-          B1DataType,
-          CDataType,
-          Acc0BiasDataType,
-          Acc1BiasDataType,
-          AccDataType,
-          CShuffleDataType,
-          AElementOp,
-          B0ElementOp,
-          Acc0ElementOp,
-          B1ElementOp,
-          CElementOp,
-          GemmSpec,
-          GemmOpConstantsCommon::TensorSpecA,
-          GemmOpConstantsCommon::TensorSpecB0,
-          GemmOpConstantsCommon::TensorSpecB1,
-          GemmOpConstantsCommon::TensorSpecC,
-          GemmOpConstantsBatchedInfer::NumGemmKPrefetchStage,
-          GemmOpConstantsBatchedInfer::BlockSize,
-          GemmOpConstantsBatchedInfer::MPerBlock,
-          GemmOpConstantsBatchedInfer::NPerBlock,
-          GemmOpConstantsBatchedInfer::KPerBlock,
-          kGemm1NPerBlock,
-          GemmOpConstantsBatchedInfer::Gemm1KPerBlock,
-          GemmOpConstantsBatchedInfer::AK1,
-          GemmOpConstantsBatchedInfer::BK1,
-          GemmOpConstantsBatchedInfer::B1K1,
-          GemmOpConstantsBatchedInfer::MPerXDL,
-          GemmOpConstantsBatchedInfer::NPerXDL,
-          GemmOpConstantsBatchedInfer::MXdlPerWave,
-          GemmOpConstantsBatchedInfer::NXdlPerWave,
-          kGemm1NXdlPerWave,
-          GemmOpConstantsBatchedInfer::ABlockTransferThreadClusterLengths_AK0_M_AK1,
-          GemmOpConstantsBatchedInfer::ABlockTransferThreadClusterArrangeOrder,
-          GemmOpConstantsBatchedInfer::ABlockTransferSrcAccessOrder,
-          GemmOpConstantsBatchedInfer::ABlockTransferSrcVectorDim,
-          kABBlockTransferSrcScalarPerVector,
-          GemmOpConstantsBatchedInfer::ABlockTransferDstScalarPerVector_AK1,
-          GemmOpConstantsBatchedInfer::ABlockLdsExtraM,
-          GemmOpConstantsBatchedInfer::BBlockTransferThreadClusterLengths_BK0_N_BK1,
-          GemmOpConstantsBatchedInfer::BBlockTransferThreadClusterArrangeOrder,
-          GemmOpConstantsBatchedInfer::BBlockTransferSrcAccessOrder,
-          GemmOpConstantsBatchedInfer::BBlockTransferSrcVectorDim,
-          kABBlockTransferSrcScalarPerVector,
-          GemmOpConstantsBatchedInfer::BBlockTransferDstScalarPerVector_BK1,
-          GemmOpConstantsBatchedInfer::BBlockLdsExtraN,
-          kAcc0BiasTransferSrcScalarPerVector,
-          GemmOpConstantsBatchedInfer::B1BlockTransferThreadClusterLengths_BK0_N_BK1,
-          GemmOpConstantsBatchedInfer::B1BlockTransferThreadClusterArrangeOrder,
-          GemmOpConstantsBatchedInfer::B1BlockTransferSrcAccessOrder,
-          GemmOpConstantsBatchedInfer::B1BlockTransferSrcVectorDim,
-          kB1BlockTransferSrcScalarPerVector,
-          GemmOpConstantsBatchedInfer::B1BlockTransferDstScalarPerVector_BK1,
-          GemmOpConstantsBatchedInfer::B1BlockLdsExtraN,
-          GemmOpConstantsBatchedInfer::CShuffleMXdlPerWavePerShuffle,
-          kCShuffleNXdlPerWavePerShuffle,
-          GemmOpConstantsBatchedInfer::CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock,
-          kCShuffleBlockTransferScalarPerVector,
-          MaskingSpec>;
-  // clang-format on
-
-  static constexpr auto I1 = ck::Number<1>{};
-  static constexpr auto I2 = ck::Number<2>{};
-  static constexpr auto I3 = ck::Number<3>{};
-
-  static void Run(BatchedForwardParams& param, hipStream_t stream) {
-    using ck::math::min;
-
-    // compile-time constants which don't depend on head-dim switching
-    constexpr ck::index_t thread_slice_length_ak1 =
-        GemmOpConstantsBatchedInfer::AK1 /
-        GemmOpConstantsBatchedInfer::
-            ABlockTransferThreadClusterLengths_AK0_M_AK1::At(I2);
-    constexpr ck::index_t thread_slice_length_bk1 =
-        GemmOpConstantsBatchedInfer::BK1 /
-        GemmOpConstantsBatchedInfer::
-            BBlockTransferThreadClusterLengths_BK0_N_BK1::At(I2);
-
-    static_assert(
-        thread_slice_length_ak1 == thread_slice_length_bk1,
-        "ABlockTransfer and BBlockTransfer should use completely same K1 sizes and "
-        "ThreadClusterLengths!");
-
-    constexpr ck::index_t kABBlockTransferSrcScalarPerVector_max =
-        min(8, thread_slice_length_ak1);
-
-    BATCHED_INFER_HEADDIM_SWITCH(param.K, param.Kv, [&] {
-      constexpr ck::index_t thread_slice_length_gemm1n = kGemm1NPerBlock /
-          GemmOpConstantsBatchedInfer::
-              B1BlockTransferThreadClusterLengths_BK0_N_BK1::At(I1);
-      constexpr ck::index_t kB1BlockTransferSrcScalarPerVector_max =
-          min(4, thread_slice_length_gemm1n);
-
-      constexpr ck::index_t thread_slice_length_cshuflle_n =
-          (kCShuffleNXdlPerWavePerShuffle * kGemm1NPerBlock /
-           kGemm1NXdlPerWave) /
-          GemmOpConstantsBatchedInfer::
-              CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock ::
-                  At(I3);
-
-      constexpr ck::index_t kCShuffleBlockTransferScalarPerVector_max =
-          min(4, thread_slice_length_cshuflle_n);
-
-      if constexpr (
-          kB1BlockTransferSrcScalarPerVector_max >=
-          kCShuffleBlockTransferScalarPerVector_max) {
-        ALIGN_SWITCH_2(
-            kABBlockTransferSrcScalarPerVector_max,
-            kABBlockTransferSrcScalarPerVector,
-            param.K,
-            kB1BlockTransferSrcScalarPerVector_max,
-            kB1BlockTransferSrcScalarPerVector,
-            param.Kv,
-            [&] {
-              constexpr ck::index_t kCShuffleBlockTransferScalarPerVector =
-                  min(kB1BlockTransferSrcScalarPerVector,
-                      kCShuffleBlockTransferScalarPerVector_max);
-              using DeviceOpInstance = DeviceOpInstanceTemp<
-                  kGemm1NPerBlock,
-                  kGemm1NXdlPerWave,
-                  kCShuffleNXdlPerWavePerShuffle,
-                  kABBlockTransferSrcScalarPerVector,
-                  kB1BlockTransferSrcScalarPerVector,
-                  kCShuffleBlockTransferScalarPerVector>;
-
-              RunWithDeviceOp<DeviceOpInstance>(param, stream);
-            });
-      } else {
-        ALIGN_SWITCH_2(
-            kABBlockTransferSrcScalarPerVector_max,
-            kABBlockTransferSrcScalarPerVector,
-            param.K,
-            kCShuffleBlockTransferScalarPerVector_max,
-            kCShuffleBlockTransferScalarPerVector,
-            param.Kv,
-            [&] {
-              constexpr ck::index_t kB1BlockTransferSrcScalarPerVector =
-                  min(kCShuffleBlockTransferScalarPerVector,
-                      kB1BlockTransferSrcScalarPerVector_max);
-              using DeviceOpInstance = DeviceOpInstanceTemp<
-                  kGemm1NPerBlock,
-                  kGemm1NXdlPerWave,
-                  kCShuffleNXdlPerWavePerShuffle,
-                  kABBlockTransferSrcScalarPerVector,
-                  kB1BlockTransferSrcScalarPerVector,
-                  kCShuffleBlockTransferScalarPerVector>;
-
-              RunWithDeviceOp<DeviceOpInstance>(param, stream);
-            });
-      };
-    });
-  };
-
-  template <typename DeviceOpInstance>
-  static void RunWithDeviceOp(BatchedForwardParams& param, hipStream_t stream) {
-    std::vector<ck::index_t> a_gs_ms_ks_lengths{
-        param.B, param.Hq, param.M, param.K};
-    std::vector<ck::index_t> a_gs_ms_ks_strides{
-        param.q_strides[0],
-        param.q_strides[2],
-        param.q_strides[1],
-        param.q_strides[3]};
-
-    std::vector<ck::index_t> b0_gs_ns_ks_lengths{
-        param.B, param.Hkv, param.N, param.K};
-    std::vector<ck::index_t> b0_gs_ns_ks_strides{
-        param.k_strides[0],
-        param.k_strides[2],
-        param.k_strides[1],
-        param.k_strides[3]};
-
-    // to be changed to b1_gs_ns_os_lengths
-    std::vector<ck::index_t> b1_gs_os_ns_lengths{
-        param.B, param.Hkv, param.Kv, param.N};
-    std::vector<ck::index_t> b1_gs_os_ns_strides{
-        param.v_strides[0],
-        param.v_strides[2],
-        param.v_strides[3],
-        param.v_strides[1]};
-
-    std::vector<ck::index_t> c_gs_ms_os_lengths{
-        param.B, param.Hq, param.M, param.Kv};
-    std::vector<ck::index_t> c_gs_ms_os_strides{
-        param.out_strides[0],
-        param.out_strides[2],
-        param.out_strides[1],
-        param.out_strides[3]};
-
-    std::vector<ck::index_t> lse_gs_ms_lengths{param.B, param.Hq, param.M};
-
-    std::vector<ck::index_t> d_gs_ms_ns_lengths;
-    std::vector<ck::index_t> d_gs_ms_ns_strides;
-
-    if constexpr (has_attn_bias) {
-      d_gs_ms_ns_lengths = {param.B, param.Hq, param.M, param.N};
-      d_gs_ms_ns_strides = {
-          param.attn_bias_strides[0],
-          param.attn_bias_strides[1],
-          param.attn_bias_strides[2],
-          param.attn_bias_strides[3]};
-    } else {
-      d_gs_ms_ns_lengths = {1, 1, 1, 1};
-      d_gs_ms_ns_strides = {0, 0, 0, 0};
-    };
-
-    float alpha = param.scale;
-
-    auto a_element_op = AElementOp{};
-    auto b0_element_op = B0ElementOp{};
-    auto acc0_element_op = Acc0ElementOp{alpha};
-    auto b1_element_op = B1ElementOp{};
-    auto c_element_op = CElementOp{};
-
-    auto op = DeviceOpInstance{};
-    auto invoker = op.MakeInvoker();
-
-    auto arg_ptr = op.MakeArgumentPointer(
-        param.q_ptr,
-        param.k_ptr,
-        param.v_ptr,
-        param.out_ptr,
-        param.has_attn_bias ? param.attn_bias_ptr : nullptr,
-        {}, // p_acc1_biases;
-        a_gs_ms_ks_lengths,
-        a_gs_ms_ks_strides,
-        b0_gs_ns_ks_lengths,
-        b0_gs_ns_ks_strides,
-        b1_gs_os_ns_lengths,
-        b1_gs_os_ns_strides,
-        c_gs_ms_os_lengths,
-        c_gs_ms_os_strides,
-        d_gs_ms_ns_lengths,
-        d_gs_ms_ns_strides,
-        {}, // acc1_biases_gs_ms_os_lengths
-        {}, // acc1_biases_gs_ms_os_strides,
-        a_element_op,
-        b0_element_op,
-        acc0_element_op,
-        b1_element_op,
-        c_element_op);
-
-    SimpleDeviceMem workspace(op.GetWorkSpaceSize(arg_ptr.get()));
-
-    op.SetWorkSpacePointer(arg_ptr.get(), workspace.GetDeviceBuffer());
-
-    if (!op.IsSupportedArgument(arg_ptr.get())) {
-      std::ostringstream ostr;
-
-      ostr << op.GetTypeString() << " does not support this problem";
-
-      throw std::runtime_error(ostr.str());
-    }
-
-    invoker.Run(arg_ptr.get(), StreamConfig{stream, false});
-  };
-};
-
-template <typename scalar_t, int32_t custom_mask_type, bool has_attn_bias>
-void run_batched_infer_masktype_attnbias_dispatched(
-    BatchedForwardParams& param,
-    hipStream_t stream) {
-  batched_infer_masktype_attnbias_dispatched<
-      scalar_t,
-      custom_mask_type,
-      has_attn_bias>::Run(param, stream);
-};
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_infer_bp16.cpp b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_infer_bp16.cpp
deleted file mode 100644
index 03a2e36ca..000000000
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_infer_bp16.cpp
+++ /dev/null
@@ -1,63 +0,0 @@
-/*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
- *
- * This source code is licensed under the BSD-style license found in the
- * LICENSE file in the root directory of this source tree.
- */
-#include <ck/ck.hpp>
-#include <stdexcept>
-
-#include "ck_bool_switch.h"
-#include "ck_fmha_batched_infer.h"
-
-extern template void run_batched_infer_masktype_attnbias_dispatched<
-    ck::bhalf_t,
-    0,
-    true>(BatchedForwardParams& param, hipStream_t stream);
-
-extern template void run_batched_infer_masktype_attnbias_dispatched<
-    ck::bhalf_t,
-    0,
-    false>(BatchedForwardParams& param, hipStream_t stream);
-
-extern template void run_batched_infer_masktype_attnbias_dispatched<
-    ck::bhalf_t,
-    1,
-    true>(BatchedForwardParams& param, hipStream_t stream);
-
-extern template void run_batched_infer_masktype_attnbias_dispatched<
-    ck::bhalf_t,
-    1,
-    false>(BatchedForwardParams& param, hipStream_t stream);
-
-extern template void run_batched_infer_masktype_attnbias_dispatched<
-    ck::bhalf_t,
-    2,
-    true>(BatchedForwardParams& param, hipStream_t stream);
-
-extern template void run_batched_infer_masktype_attnbias_dispatched<
-    ck::bhalf_t,
-    2,
-    false>(BatchedForwardParams& param, hipStream_t stream);
-
-void batched_infer_bp16(BatchedForwardParams& param, hipStream_t stream) {
-  BOOL_SWITCH_1(param.has_attn_bias, HAS_ATTN_BIAS, [&] {
-    if (param.custom_mask_type == 0)
-      run_batched_infer_masktype_attnbias_dispatched<
-          ck::bhalf_t,
-          0,
-          HAS_ATTN_BIAS>(param, stream);
-    else if (param.custom_mask_type == 1)
-      run_batched_infer_masktype_attnbias_dispatched<
-          ck::bhalf_t,
-          1,
-          HAS_ATTN_BIAS>(param, stream);
-    else if (param.custom_mask_type == 2)
-      run_batched_infer_masktype_attnbias_dispatched<
-          ck::bhalf_t,
-          2,
-          HAS_ATTN_BIAS>(param, stream);
-    else
-      throw std::runtime_error("Invalid custom_mask_type value");
-  });
-};
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_infer_fp16.cpp b/xformers/csrc/attention/hip_fmha/ck_fmha_batched_infer_fp16.cpp
deleted file mode 100644
index 4d0625a46..000000000
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_batched_infer_fp16.cpp
+++ /dev/null
@@ -1,63 +0,0 @@
-/*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
- *
- * This source code is licensed under the BSD-style license found in the
- * LICENSE file in the root directory of this source tree.
- */
-#include <ck/ck.hpp>
-#include <stdexcept>
-
-#include "ck_bool_switch.h"
-#include "ck_fmha_batched_infer.h"
-
-extern template void run_batched_infer_masktype_attnbias_dispatched<
-    ck::half_t,
-    0,
-    true>(BatchedForwardParams& param, hipStream_t stream);
-
-extern template void run_batched_infer_masktype_attnbias_dispatched<
-    ck::half_t,
-    0,
-    false>(BatchedForwardParams& param, hipStream_t stream);
-
-extern template void run_batched_infer_masktype_attnbias_dispatched<
-    ck::half_t,
-    1,
-    true>(BatchedForwardParams& param, hipStream_t stream);
-
-extern template void run_batched_infer_masktype_attnbias_dispatched<
-    ck::half_t,
-    1,
-    false>(BatchedForwardParams& param, hipStream_t stream);
-
-extern template void run_batched_infer_masktype_attnbias_dispatched<
-    ck::half_t,
-    2,
-    true>(BatchedForwardParams& param, hipStream_t stream);
-
-extern template void run_batched_infer_masktype_attnbias_dispatched<
-    ck::half_t,
-    2,
-    false>(BatchedForwardParams& param, hipStream_t stream);
-
-void batched_infer_fp16(BatchedForwardParams& param, hipStream_t stream) {
-  BOOL_SWITCH_1(param.has_attn_bias, HAS_ATTN_BIAS, [&] {
-    if (param.custom_mask_type == 0)
-      run_batched_infer_masktype_attnbias_dispatched<
-          ck::half_t,
-          0,
-          HAS_ATTN_BIAS>(param, stream);
-    else if (param.custom_mask_type == 1)
-      run_batched_infer_masktype_attnbias_dispatched<
-          ck::half_t,
-          1,
-          HAS_ATTN_BIAS>(param, stream);
-    else if (param.custom_mask_type == 2)
-      run_batched_infer_masktype_attnbias_dispatched<
-          ck::half_t,
-          2,
-          HAS_ATTN_BIAS>(param, stream);
-    else
-      throw std::runtime_error("Invalid custom_mask_type value");
-  });
-};
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_common_gemm_constants.h b/xformers/csrc/attention/hip_fmha/ck_fmha_common_gemm_constants.h
deleted file mode 100644
index 1fdabf29f..000000000
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_common_gemm_constants.h
+++ /dev/null
@@ -1,28 +0,0 @@
-/*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
- *
- * This source code is licensed under the BSD-style license found in the
- * LICENSE file in the root directory of this source tree.
- */
-#pragma once
-
-#include <ck/ck.hpp>
-#include "ck_fmha_op_helper.h"
-
-// list the template parameters that is commonly used
-struct GemmOpConstantsCommon {
-  static constexpr ck::index_t NumDimG = 2;
-  static constexpr ck::index_t NumDimM = 1;
-  static constexpr ck::index_t NumDimN = 1;
-  static constexpr ck::index_t NumDimK = 1;
-  static constexpr ck::index_t NumDimO = 1;
-
-  static constexpr auto TensorSpecA =
-      ck::tensor_operation::device::TensorSpecialization::Default;
-  static constexpr auto TensorSpecB0 =
-      ck::tensor_operation::device::TensorSpecialization::Default;
-  static constexpr auto TensorSpecB1 =
-      ck::tensor_operation::device::TensorSpecialization::Default;
-  static constexpr auto TensorSpecC =
-      ck::tensor_operation::device::TensorSpecialization::Default;
-};
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_forward_gemm_constants.h b/xformers/csrc/attention/hip_fmha/ck_fmha_forward_gemm_constants.h
deleted file mode 100644
index ab3c159b7..000000000
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_forward_gemm_constants.h
+++ /dev/null
@@ -1,110 +0,0 @@
-/*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
- *
- * This source code is licensed under the BSD-style license found in the
- * LICENSE file in the root directory of this source tree.
- */
-#pragma once
-
-#include <ck/ck.hpp>
-#include "ck_fmha_op_helper.h"
-
-// list the template parameters that will not be tuned,
-// the commented lines gives the tunable template parameters
-// clang-format off
-struct GemmOpConstantsBatchedForward {
-  static constexpr ck::index_t NumGemmKPrefetchStage = 1;
-  static constexpr ck::index_t BlockSize = 256;
-  static constexpr ck::index_t MPerBlock = 128;
-  static constexpr ck::index_t NPerBlock = 128;
-  static constexpr ck::index_t KPerBlock = 32;
-  // static constexpr ck::index_t Gemm1NPerBlock;
-  static constexpr ck::index_t Gemm1KPerBlock = 32;
-  static constexpr ck::index_t AK1 = 8;
-  static constexpr ck::index_t BK1 = 8;
-  static constexpr ck::index_t B1K1 = 2;
-  static constexpr ck::index_t MPerXDL = 32;
-  static constexpr ck::index_t NPerXDL = 32;
-  static constexpr ck::index_t MXdlPerWave = 1;
-  static constexpr ck::index_t NXdlPerWave = 4;
-  // static constexpr ck::index_t Gemm1NXdlPerWave;
-  static constexpr ck::index_t DropoutStep = 1;
-  using ABlockTransferThreadClusterLengths_AK0_M_AK1 = S<4, 64, 1>;
-  using ABlockTransferThreadClusterArrangeOrder = S<1, 0, 2>;
-  using ABlockTransferSrcAccessOrder = S<1, 0, 2>;
-  static constexpr ck::index_t ABlockTransferSrcVectorDim = 2;
-  // static constexpr ck::index_t ABlockTransferSrcScalarPerVector;
-  static constexpr ck::index_t ABlockTransferDstScalarPerVector_AK1 = 4;
-  static constexpr bool ABlockLdsExtraM = true;
-  using BBlockTransferThreadClusterLengths_BK0_N_BK1 = S<4, 64, 1>;
-  using BBlockTransferThreadClusterArrangeOrder = S<1, 0, 2>;
-  using BBlockTransferSrcAccessOrder = S<1, 0, 2>;
-  static constexpr ck::index_t BBlockTransferSrcVectorDim = 2;
-  // static constexpr ck::index_t BBlockTransferSrcScalarPerVector;
-  static constexpr ck::index_t BBlockTransferDstScalarPerVector_BK1 = 4;
-  static constexpr bool BBlockLdsExtraN = true;
-  // static constexpr ck::index_t Acc0BiasTransferSrcScalarPerVector;
-  using B1BlockTransferThreadClusterLengths_BK0_N_BK1 = S<16, 16, 1>;
-  using B1BlockTransferThreadClusterArrangeOrder = S<0, 2, 1>;
-  using B1BlockTransferSrcAccessOrder = S<0, 2, 1>;
-  static constexpr ck::index_t B1BlockTransferSrcVectorDim = 1;
-  // static constexpr ck::index_t B1BlockTransferSrcScalarPerVector;
-  static constexpr ck::index_t B1BlockTransferDstScalarPerVector_BK1 = 2;
-  static constexpr bool B1BlockLdsExtraN = false;
-  static constexpr ck::index_t CShuffleMXdlPerWavePerShuffle = 1;
-  // static constexpr ck::index_t CShuffleNXdlPerWavePerShuffle;
-  using CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock = S<1, 8, 1, 32>;
-  // static constexpr ck::index_t CShuffleBlockTransferScalarPerVector_NPerBlock;
-  static constexpr ck::index_t Acc1BiasTransferSrcScalarPerVector = 1; // not actually used by the kernel
-};
-// clang-format on
-
-// list the template parameters that will not be tuned,
-// the commented lines gives the tunable template parameters
-// clang-format off
-struct GemmOpConstantsGroupedForward {
-  static constexpr ck::index_t NumGemmKPrefetchStage = 1;
-  static constexpr ck::index_t BlockSize = 256;
-  static constexpr ck::index_t MPerBlock = 128;
-  static constexpr ck::index_t NPerBlock = 128;
-  static constexpr ck::index_t KPerBlock = 32;
-  // static constexpr ck::index_t Gemm1NPerBlock;
-  static constexpr ck::index_t Gemm1KPerBlock = 32;
-  static constexpr ck::index_t AK1 = 8;
-  static constexpr ck::index_t BK1 = 8;
-  static constexpr ck::index_t B1K1 = 2;
-  static constexpr ck::index_t MPerXDL = 32;
-  static constexpr ck::index_t NPerXDL = 32;
-  static constexpr ck::index_t MXdlPerWave = 1;
-  static constexpr ck::index_t NXdlPerWave = 4;
-  // static constexpr ck::index_t Gemm1NXdlPerWave;
-  static constexpr ck::index_t DropoutStep = 1;
-  using ABlockTransferThreadClusterLengths_AK0_M_AK1 = S<4, 64, 1>;
-  using ABlockTransferThreadClusterArrangeOrder = S<1, 0, 2>;
-  using ABlockTransferSrcAccessOrder = S<1, 0, 2>;
-  static constexpr ck::index_t ABlockTransferSrcVectorDim = 2;
-  // static constexpr ck::index_t ABlockTransferSrcScalarPerVector;
-  static constexpr ck::index_t ABlockTransferDstScalarPerVector_AK1 = 4;
-  static constexpr bool ABlockLdsExtraM = true;
-  using BBlockTransferThreadClusterLengths_BK0_N_BK1 = S<4, 64, 1>;
-  using BBlockTransferThreadClusterArrangeOrder = S<1, 0, 2>;
-  using BBlockTransferSrcAccessOrder = S<1, 0, 2>;
-  static constexpr ck::index_t BBlockTransferSrcVectorDim = 2;
-  // static constexpr ck::index_t BBlockTransferSrcScalarPerVector;
-  static constexpr ck::index_t BBlockTransferDstScalarPerVector_BK1 = 4;
-  static constexpr bool BBlockLdsExtraN = true;
-  // static constexpr ck::index_t Acc0BiasTransferSrcScalarPerVector;
-  using B1BlockTransferThreadClusterLengths_BK0_N_BK1 = S<16, 16, 1>;
-  using B1BlockTransferThreadClusterArrangeOrder = S<0, 2, 1>;
-  using B1BlockTransferSrcAccessOrder = S<0, 2, 1>;
-  static constexpr ck::index_t B1BlockTransferSrcVectorDim = 1;
-  // static constexpr ck::index_t B1BlockTransferSrcScalarPerVector;
-  static constexpr ck::index_t B1BlockTransferDstScalarPerVector_BK1 = 2;
-  static constexpr bool B1BlockLdsExtraN = false;
-  static constexpr ck::index_t CShuffleMXdlPerWavePerShuffle = 1;
-  // static constexpr ck::index_t CShuffleNXdlPerWavePerShuffle;
-  using CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock = S<1, 8, 1, 32>;
-  // static constexpr ck::index_t CShuffleBlockTransferScalarPerVector_NPerBlock;
-  static constexpr ck::index_t Acc1BiasTransferSrcScalarPerVector = 1; // not actually used by the kernel
-};
-// clang-format on
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_backward.h b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_backward.h
deleted file mode 100644
index b2866cc4f..000000000
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_backward.h
+++ /dev/null
@@ -1,525 +0,0 @@
-/*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
- *
- * This source code is licensed under the BSD-style license found in the
- * LICENSE file in the root directory of this source tree.
- */
-#pragma once
-
-#include <sstream>
-#include <stdexcept>
-
-#include <ck/ck.hpp>
-#include <ck/tensor_operation/gpu/device/gemm_specialization.hpp>
-#include <ck/tensor_operation/gpu/device/impl/device_grouped_mha_bwd_xdl_cshuffle_qloop_v1.hpp>
-#include <ck/tensor_operation/gpu/device/impl/device_grouped_mha_bwd_xdl_cshuffle_qloop_v2.hpp>
-#include <ck/tensor_operation/gpu/device/tensor_specialization.hpp>
-#include <ck/tensor_operation/gpu/element/element_wise_operation.hpp>
-#include <ck/utility/math.hpp>
-#include <ck/utility/number.hpp>
-
-#include "ck_align_switch.h"
-#include "ck_fmha_backward_gemm_constants.h"
-#include "ck_fmha_common_gemm_constants.h"
-#include "ck_fmha_op_helper.h"
-#include "ck_fmha_params.h"
-
-template <
-    typename scalar_t,
-    int32_t custom_mask_type,
-    bool has_attn_bias,
-    bool use_fp32_qkv_grad>
-struct grouped_backward_masktype_attnbias_dispatched {
-  using PassThrough = ck::tensor_operation::element_wise::PassThrough;
-  using Scale = ck::tensor_operation::element_wise::Scale;
-
-  using QKVElementOp = PassThrough;
-  using YElementOp = PassThrough;
-
-  using InputDataType = scalar_t;
-  using OutputDataType =
-      typename std::conditional<use_fp32_qkv_grad, F32, scalar_t>::type;
-  using GemmDataType = scalar_t;
-  using AccDataType = F32;
-  using ShuffleDataType = F32;
-  using LSEDataType = F32;
-  using ZDataType = unsigned short;
-  using Acc0BiasDataType =
-      typename std::conditional<has_attn_bias, scalar_t, void>::type;
-  using Acc1BiasDataType = void;
-
-  static constexpr auto GemmSpec =
-      ck::tensor_operation::device::GemmSpecialization::MNKOPadding;
-  static constexpr auto MaskingSpec =
-      static_cast<ck::tensor_operation::device::MaskingSpecialization>(
-          custom_mask_type);
-
-  static constexpr bool Deterministic = true;
-
-  static constexpr ck::index_t kAcc0BiasTransferSrcScalarPerVector = 1;
-
-#ifndef GROUPED_BACKWARD_V1_HEADDIM_SWITCH
-#define GROUPED_BACKWARD_V1_HEADDIM_SWITCH(HEAD_DIM1, HEAD_DIM2, ...) \
-  [&] {                                                               \
-    if (HEAD_DIM1 <= 32 && HEAD_DIM2 <= 32) {                         \
-      constexpr ck::index_t kGemm1NPerBlock = 32;                     \
-      constexpr ck::index_t kGemm1NXdlPerWave = 1;                    \
-      constexpr ck::index_t kCShuffleNXdlPerWavePerShuffle = 1;       \
-      using kCShuffleBlockTransferClusterLengths = S<1, 64, 1, 4>;    \
-      __VA_ARGS__();                                                  \
-    } else {                                                          \
-      constexpr ck::index_t kGemm1NPerBlock = 64;                     \
-      constexpr ck::index_t kGemm1NXdlPerWave = 2;                    \
-      constexpr ck::index_t kCShuffleNXdlPerWavePerShuffle = 2;       \
-      using kCShuffleBlockTransferClusterLengths = S<1, 32, 1, 8>;    \
-      __VA_ARGS__();                                                  \
-    };                                                                \
-  }()
-#endif
-
-  // clang-format off
-  template <
-      ck::index_t kGemm1NPerBlock,
-      ck::index_t kGemm1NXdlPerWave,
-      ck::index_t kCShuffleNXdlPerWavePerShuffle,
-      typename kCShuffleBlockTransferClusterLengths,
-      ck::index_t kABBlockTransferSrcScalarPerVector,
-      ck::index_t kCShuffleBlockTransferScalarPerVector>
-  using DeviceOpInstanceTemp_V1 = ck::tensor_operation::device::DeviceGroupedMultiheadAttentionBackward_Qloop_Xdl_CShuffle_V1<
-          GemmOpConstantsCommon::NumDimG,
-          GemmOpConstantsCommon::NumDimM,
-          GemmOpConstantsCommon::NumDimN,
-          GemmOpConstantsCommon::NumDimK,
-          GemmOpConstantsCommon::NumDimO,
-          InputDataType,
-          OutputDataType,
-          GemmDataType,
-          ZDataType,
-          LSEDataType,
-          Acc0BiasDataType,
-          Acc1BiasDataType,
-          AccDataType,
-          ShuffleDataType,
-          QKVElementOp,
-          QKVElementOp,
-          Scale,
-          QKVElementOp,
-          YElementOp,
-          GemmSpec,
-          GemmOpConstantsCommon::TensorSpecA,
-          GemmOpConstantsCommon::TensorSpecB0,
-          GemmOpConstantsCommon::TensorSpecB1,
-          GemmOpConstantsCommon::TensorSpecC,
-          GemmOpConstantsGroupedBackward_V1::NumGemmKPrefetchStage,
-          GemmOpConstantsGroupedBackward_V1::BlockSize,
-          GemmOpConstantsGroupedBackward_V1::MPerBlock,
-          GemmOpConstantsGroupedBackward_V1::NPerBlock,
-          kGemm1NPerBlock, // KPerBlock = kGemm1NerBlock
-          kGemm1NPerBlock,
-          GemmOpConstantsGroupedBackward_V1::Gemm1KPerBlock,
-          GemmOpConstantsGroupedBackward_V1::Gemm2KPerBlock,
-          GemmOpConstantsGroupedBackward_V1::AK1,
-          GemmOpConstantsGroupedBackward_V1::BK1,
-          GemmOpConstantsGroupedBackward_V1::B1K1,
-          GemmOpConstantsGroupedBackward_V1::MPerXDL,
-          GemmOpConstantsGroupedBackward_V1::NPerXDL,
-          GemmOpConstantsGroupedBackward_V1::MXdlPerWave,
-          GemmOpConstantsGroupedBackward_V1::NXdlPerWave,
-          kGemm1NXdlPerWave,
-          GemmOpConstantsGroupedBackward_V1::Gemm2NXdlPerWave,
-          GemmOpConstantsGroupedBackward_V1::ABlockTransferThreadClusterLengths_AK0_M_AK1,
-          GemmOpConstantsGroupedBackward_V1::ABlockTransferThreadClusterArrangeOrder,
-          GemmOpConstantsGroupedBackward_V1::ABlockTransferSrcAccessOrder,
-          GemmOpConstantsGroupedBackward_V1::ABlockTransferSrcVectorDim,
-          kABBlockTransferSrcScalarPerVector,
-          GemmOpConstantsGroupedBackward_V1::ABlockTransferDstScalarPerVector_AK1,
-          GemmOpConstantsGroupedBackward_V1::ABlockLdsExtraM,
-          GemmOpConstantsGroupedBackward_V1::BBlockTransferThreadClusterLengths_BK0_N_BK1,
-          GemmOpConstantsGroupedBackward_V1::BBlockTransferThreadClusterArrangeOrder,
-          GemmOpConstantsGroupedBackward_V1::BBlockTransferSrcAccessOrder,
-          GemmOpConstantsGroupedBackward_V1::BBlockTransferSrcVectorDim,
-          kABBlockTransferSrcScalarPerVector,
-          GemmOpConstantsGroupedBackward_V1::BBlockTransferDstScalarPerVector_BK1,
-          GemmOpConstantsGroupedBackward_V1::BBlockLdsExtraN,
-          kAcc0BiasTransferSrcScalarPerVector,
-          GemmOpConstantsGroupedBackward_V2::CShuffleMXdlPerWavePerShuffle,
-          kCShuffleNXdlPerWavePerShuffle,
-          kCShuffleBlockTransferClusterLengths,
-          kCShuffleBlockTransferScalarPerVector,
-          MaskingSpec,
-          Deterministic>;
-  // clang-format on
-
-  // clang-format off
-  template <
-      ck::index_t kGemm1NPerBlock,
-      ck::index_t kGemm1NXdlPerWave,
-      ck::index_t kCShuffleNXdlPerWavePerShuffle,
-      typename kCShuffleBlockTransferClusterLengths,
-      ck::index_t kABBlockTransferSrcScalarPerVector,
-      ck::index_t kB1BlockTransferSrcScalarPerVector,
-      ck::index_t kCShuffleBlockTransferScalarPerVector>
-  using DeviceOpInstanceTemp_V2 = ck::tensor_operation::device::DeviceGroupedMultiheadAttentionBackward_Qloop_Xdl_CShuffle_V2<
-          GemmOpConstantsCommon::NumDimG,
-          GemmOpConstantsCommon::NumDimM,
-          GemmOpConstantsCommon::NumDimN,
-          GemmOpConstantsCommon::NumDimK,
-          GemmOpConstantsCommon::NumDimO,
-          InputDataType,
-          OutputDataType,
-          GemmDataType,
-          ZDataType,
-          LSEDataType,
-          Acc0BiasDataType,
-          Acc1BiasDataType,
-          AccDataType,
-          ShuffleDataType,
-          QKVElementOp,
-          QKVElementOp,
-          Scale,
-          QKVElementOp,
-          YElementOp,
-          GemmSpec,
-          GemmOpConstantsCommon::TensorSpecA,
-          GemmOpConstantsCommon::TensorSpecB0,
-          GemmOpConstantsCommon::TensorSpecB1,
-          GemmOpConstantsCommon::TensorSpecC,
-          GemmOpConstantsGroupedBackward_V2::NumGemmKPrefetchStage,
-          GemmOpConstantsGroupedBackward_V2::BlockSize,
-          GemmOpConstantsGroupedBackward_V2::MPerBlock,
-          GemmOpConstantsGroupedBackward_V2::NPerBlock,
-          GemmOpConstantsGroupedBackward_V2::KPerBlock,
-          kGemm1NPerBlock,
-          GemmOpConstantsGroupedBackward_V2::Gemm1KPerBlock,
-          GemmOpConstantsGroupedBackward_V2::Gemm2KPerBlock,
-          GemmOpConstantsGroupedBackward_V2::AK1,
-          GemmOpConstantsGroupedBackward_V2::BK1,
-          GemmOpConstantsGroupedBackward_V2::B1K1,
-          GemmOpConstantsGroupedBackward_V2::MPerXDL,
-          GemmOpConstantsGroupedBackward_V2::NPerXDL,
-          GemmOpConstantsGroupedBackward_V2::MXdlPerWave,
-          GemmOpConstantsGroupedBackward_V2::NXdlPerWave,
-          kGemm1NXdlPerWave,
-          GemmOpConstantsBatchedBackward_V2::Gemm2NXdlPerWave,
-          GemmOpConstantsGroupedBackward_V2::ABlockTransferThreadClusterLengths_AK0_M_AK1,
-          GemmOpConstantsGroupedBackward_V2::ABlockTransferThreadClusterArrangeOrder,
-          GemmOpConstantsGroupedBackward_V2::ABlockTransferSrcAccessOrder,
-          GemmOpConstantsGroupedBackward_V2::ABlockTransferSrcVectorDim,
-          kABBlockTransferSrcScalarPerVector,
-          GemmOpConstantsGroupedBackward_V2::ABlockTransferDstScalarPerVector_AK1,
-          GemmOpConstantsGroupedBackward_V2::ABlockLdsExtraM,
-          GemmOpConstantsGroupedBackward_V2::BBlockTransferThreadClusterLengths_BK0_N_BK1,
-          GemmOpConstantsGroupedBackward_V2::BBlockTransferThreadClusterArrangeOrder,
-          GemmOpConstantsGroupedBackward_V2::BBlockTransferSrcAccessOrder,
-          GemmOpConstantsGroupedBackward_V2::BBlockTransferSrcVectorDim,
-          kABBlockTransferSrcScalarPerVector,
-          GemmOpConstantsGroupedBackward_V2::BBlockTransferDstScalarPerVector_BK1,
-          GemmOpConstantsGroupedBackward_V2::BBlockLdsExtraN,
-          kAcc0BiasTransferSrcScalarPerVector,
-          GemmOpConstantsGroupedBackward_V2::B1BlockTransferThreadClusterLengths_BK0_N_BK1,
-          GemmOpConstantsGroupedBackward_V2::B1BlockTransferThreadClusterArrangeOrder,
-          GemmOpConstantsGroupedBackward_V2::B1BlockTransferSrcAccessOrder,
-          GemmOpConstantsGroupedBackward_V2::B1BlockTransferSrcVectorDim,
-          kB1BlockTransferSrcScalarPerVector,
-          GemmOpConstantsGroupedBackward_V2::B1BlockTransferDstScalarPerVector_BK1,
-          GemmOpConstantsGroupedBackward_V2::B1BlockLdsExtraN,
-          GemmOpConstantsGroupedBackward_V2::CShuffleMXdlPerWavePerShuffle,
-          kCShuffleNXdlPerWavePerShuffle,
-          kCShuffleBlockTransferClusterLengths,
-          kCShuffleBlockTransferScalarPerVector,
-          MaskingSpec,
-          Deterministic>;
-  // clang-format on
-
-  static constexpr auto I1 = ck::Number<1>{};
-  static constexpr auto I2 = ck::Number<2>{};
-  static constexpr auto I3 = ck::Number<3>{};
-
-  static void Run(GroupedBackwardParams& param, hipStream_t stream) {
-    using ck::math::min;
-
-    if (param.K <= 64 && param.Kv <= 64) {
-      // compile-time constants which don't depend on head-dim switching
-      constexpr ck::index_t thread_slice_length_ak1 =
-          GemmOpConstantsGroupedBackward_V1::AK1 /
-          GemmOpConstantsGroupedBackward_V1::
-              ABlockTransferThreadClusterLengths_AK0_M_AK1::At(I2);
-      constexpr ck::index_t thread_slice_length_bk1 =
-          GemmOpConstantsGroupedBackward_V1::BK1 /
-          GemmOpConstantsGroupedBackward_V1::
-              BBlockTransferThreadClusterLengths_BK0_N_BK1::At(I2);
-
-      static_assert(
-          thread_slice_length_ak1 == thread_slice_length_bk1,
-          "ABlockTransfer and BBlockTransfer should use completely same K1 sizes "
-          "and ThreadClusterLengths!");
-
-      constexpr ck::index_t kABBlockTransferSrcScalarPerVector_max =
-          min(2, thread_slice_length_ak1);
-
-      GROUPED_BACKWARD_V1_HEADDIM_SWITCH(param.K, param.Kv, [&] {
-        constexpr ck::index_t thread_slice_length_cshuflle_n =
-            (kCShuffleNXdlPerWavePerShuffle * kGemm1NPerBlock /
-             kGemm1NXdlPerWave) /
-            kCShuffleBlockTransferClusterLengths::At(I3);
-
-        constexpr ck::index_t kCShuffleBlockTransferScalarPerVector_max =
-            min(2, thread_slice_length_cshuflle_n);
-
-        ALIGN_SWITCH_2(
-            kABBlockTransferSrcScalarPerVector_max,
-            kABBlockTransferSrcScalarPerVector,
-            param.K,
-            kCShuffleBlockTransferScalarPerVector_max,
-            kCShuffleBlockTransferScalarPerVector,
-            param.Kv,
-            [&] {
-              using DeviceOpInstance = DeviceOpInstanceTemp_V1<
-                  kGemm1NPerBlock,
-                  kGemm1NXdlPerWave,
-                  kCShuffleNXdlPerWavePerShuffle,
-                  kCShuffleBlockTransferClusterLengths,
-                  kABBlockTransferSrcScalarPerVector,
-                  kCShuffleBlockTransferScalarPerVector>;
-
-              RunWithDeviceOp<DeviceOpInstance>(param, stream);
-            });
-      });
-    } else {
-      constexpr ck::index_t kGemm1NPerBlock = 128;
-      constexpr ck::index_t kGemm1NXdlPerWave = 4;
-      constexpr ck::index_t kCShuffleNXdlPerWavePerShuffle = 4;
-      using kCShuffleBlockTransferClusterLengths = S<1, 32, 1, 8>;
-
-      constexpr ck::index_t thread_slice_length_ak1 =
-          GemmOpConstantsGroupedBackward_V2::AK1 /
-          GemmOpConstantsGroupedBackward_V2::
-              ABlockTransferThreadClusterLengths_AK0_M_AK1::At(I2);
-      constexpr ck::index_t thread_slice_length_bk1 =
-          GemmOpConstantsGroupedBackward_V2::BK1 /
-          GemmOpConstantsGroupedBackward_V2::
-              BBlockTransferThreadClusterLengths_BK0_N_BK1::At(I2);
-
-      static_assert(
-          thread_slice_length_ak1 == thread_slice_length_bk1,
-          "ABlockTransfer and BBlockTransfer should use completely same K1 sizes "
-          "and ThreadClusterLengths!");
-
-      constexpr ck::index_t kABBlockTransferSrcScalarPerVector_max =
-          min(2, thread_slice_length_ak1);
-
-      constexpr ck::index_t thread_slice_length_gemm1n = kGemm1NPerBlock /
-          GemmOpConstantsGroupedBackward_V2::
-              B1BlockTransferThreadClusterLengths_BK0_N_BK1::At(I1);
-      constexpr ck::index_t kB1BlockTransferSrcScalarPerVector_max =
-          min(2, thread_slice_length_gemm1n);
-
-      constexpr ck::index_t thread_slice_length_cshuflle_n =
-          (kCShuffleNXdlPerWavePerShuffle * kGemm1NPerBlock /
-           kGemm1NXdlPerWave) /
-          kCShuffleBlockTransferClusterLengths::At(I3);
-
-      constexpr ck::index_t kCShuffleBlockTransferScalarPerVector_max =
-          min(2, thread_slice_length_cshuflle_n);
-
-      if constexpr (
-          kB1BlockTransferSrcScalarPerVector_max >=
-          kCShuffleBlockTransferScalarPerVector_max) {
-        ALIGN_SWITCH_2(
-            kABBlockTransferSrcScalarPerVector_max,
-            kABBlockTransferSrcScalarPerVector,
-            param.K,
-            kB1BlockTransferSrcScalarPerVector_max,
-            kB1BlockTransferSrcScalarPerVector,
-            param.Kv,
-            [&] {
-              constexpr ck::index_t kCShuffleBlockTransferScalarPerVector =
-                  min(kB1BlockTransferSrcScalarPerVector,
-                      kCShuffleBlockTransferScalarPerVector_max);
-              using DeviceOpInstance = DeviceOpInstanceTemp_V2<
-                  kGemm1NPerBlock,
-                  kGemm1NXdlPerWave,
-                  kCShuffleNXdlPerWavePerShuffle,
-                  kCShuffleBlockTransferClusterLengths,
-                  kABBlockTransferSrcScalarPerVector,
-                  kB1BlockTransferSrcScalarPerVector,
-                  kCShuffleBlockTransferScalarPerVector>;
-
-              RunWithDeviceOp<DeviceOpInstance>(param, stream);
-            });
-      } else {
-        ALIGN_SWITCH_2(
-            kABBlockTransferSrcScalarPerVector_max,
-            kABBlockTransferSrcScalarPerVector,
-            param.K,
-            kCShuffleBlockTransferScalarPerVector_max,
-            kCShuffleBlockTransferScalarPerVector,
-            param.Kv,
-            [&] {
-              constexpr ck::index_t kB1BlockTransferSrcScalarPerVector =
-                  min(kCShuffleBlockTransferScalarPerVector,
-                      kB1BlockTransferSrcScalarPerVector_max);
-              using DeviceOpInstance = DeviceOpInstanceTemp_V2<
-                  kGemm1NPerBlock,
-                  kGemm1NXdlPerWave,
-                  kCShuffleNXdlPerWavePerShuffle,
-                  kCShuffleBlockTransferClusterLengths,
-                  kABBlockTransferSrcScalarPerVector,
-                  kB1BlockTransferSrcScalarPerVector,
-                  kCShuffleBlockTransferScalarPerVector>;
-
-              RunWithDeviceOp<DeviceOpInstance>(param, stream);
-            });
-      };
-    };
-  };
-
-  template <typename DeviceOpInstance>
-  static void RunWithDeviceOp(
-      GroupedBackwardParams& param,
-      hipStream_t stream) {
-    // Tunables
-    std::vector<typename DeviceOpInstance::ProblemDesc> problem_descs;
-
-    for (std::size_t i = 0; i < param.num_batches; i++) {
-      int M =
-          param.host_seqstart_q[i + 1] - param.host_seqstart_q[i]; // seqlen Q
-      int N = param.host_seqlen_k.empty()
-          ? param.host_seqstart_k[i + 1] - param.host_seqstart_k[i]
-          : param.host_seqlen_k[i];
-      int K = param.K;
-      int Kv = param.Kv;
-      int G1q = param.Hq;
-      int G1kv = param.Hkv;
-
-      std::vector<ck::index_t> q_gs_ms_ks_lengths{1, G1q, M, K};
-      std::vector<ck::index_t> q_gs_ms_ks_strides{
-          0, param.q_strides[1], param.q_strides[0], param.q_strides[2]};
-
-      std::vector<ck::index_t> k_gs_ns_ks_lengths{1, G1kv, N, K};
-      std::vector<ck::index_t> k_gs_ns_ks_strides{
-          0, param.k_strides[1], param.k_strides[0], param.k_strides[2]};
-
-      std::vector<ck::index_t> kgrad_gs_ns_ks_lengths = {1, G1q, N, K};
-      std::vector<ck::index_t> kgrad_gs_ns_ks_strides = {
-          0,
-          param.tmp_grad_k_strides[1],
-          param.tmp_grad_k_strides[0],
-          param.tmp_grad_k_strides[2]};
-
-      // to be changed to v_gs_ns_os_lengths
-      std::vector<ck::index_t> v_gs_os_ns_lengths{1, G1kv, Kv, N};
-      std::vector<ck::index_t> v_gs_os_ns_strides{
-          0, param.v_strides[1], param.v_strides[2], param.v_strides[0]};
-
-      std::vector<ck::index_t> vgrad_gs_os_ns_lengths = {1, G1q, Kv, N};
-      std::vector<ck::index_t> vgrad_gs_os_ns_strides = {
-          0,
-          param.tmp_grad_v_strides[1],
-          param.tmp_grad_v_strides[2],
-          param.tmp_grad_v_strides[0]};
-
-      std::vector<ck::index_t> y_gs_ms_os_lengths{1, G1q, M, Kv};
-      std::vector<ck::index_t> y_gs_ms_os_strides{
-          0, param.out_strides[1], param.out_strides[0], param.out_strides[2]};
-
-      std::vector<ck::index_t> lse_gs_ms_lengths{1, G1q, M};
-      std::vector<ck::index_t> lse_gs_ms_strides{0, param.max_seqlen_q, 1};
-
-      std::vector<ck::index_t> d_gs_ms_ns_lengths;
-      std::vector<ck::index_t> d_gs_ms_ns_strides;
-
-      if constexpr (has_attn_bias) {
-        d_gs_ms_ns_lengths = {1, G1q, M, N};
-        d_gs_ms_ns_strides = {
-            0,
-            param.attn_bias_strides[0],
-            param.attn_bias_strides[1],
-            param.attn_bias_strides[2]};
-      } else {
-        d_gs_ms_ns_lengths = {1, 1, 1, 1};
-        d_gs_ms_ns_strides = {0, 0, 0, 0};
-      };
-
-      problem_descs.push_back({
-          q_gs_ms_ks_lengths, // q, dQ should have same shape
-          q_gs_ms_ks_strides,
-          k_gs_ns_ks_lengths, // k, dK should have same shape
-          k_gs_ns_ks_strides,
-          {1, 1, 1, 1},
-          {0, 0, 0, 0},
-          v_gs_os_ns_lengths, // v, dV should have same shape
-          v_gs_os_ns_strides,
-          y_gs_ms_os_lengths, // y, dY should have same shape
-          y_gs_ms_os_strides,
-          lse_gs_ms_lengths,
-          lse_gs_ms_strides,
-          param.is_mqa_gqa ? kgrad_gs_ns_ks_lengths : k_gs_ns_ks_lengths,
-          param.is_mqa_gqa ? kgrad_gs_ns_ks_strides : k_gs_ns_ks_strides,
-          param.is_mqa_gqa ? vgrad_gs_os_ns_lengths : v_gs_os_ns_lengths,
-          param.is_mqa_gqa ? vgrad_gs_os_ns_strides : v_gs_os_ns_strides,
-          d_gs_ms_ns_lengths, // bias, grad_bias should have same shape
-          d_gs_ms_ns_strides,
-          {}, // acc1_biases_gs_ms_os_lengths
-          {}, // acc1_biases_gs_ms_os_strides
-      });
-    }
-
-    float alpha = param.scale;
-
-    auto op = DeviceOpInstance{};
-    auto invoker = op.MakeInvoker();
-
-    auto arg_ptr = op.MakeArgumentPointer(
-        param.q_ptrs,
-        param.k_ptrs,
-        param.randvals_ptrs,
-        param.v_ptrs,
-        param.out_ptrs,
-        param.logsumexp_ptrs,
-        param.grad_out_ptrs,
-        param.grad_q_ptrs,
-        param.grad_k_ptrs,
-        param.grad_v_ptrs,
-        param.attn_bias_ptrs,
-        {}, // p_acc1_bias_vec;
-        param.grad_bias_ptrs,
-        {},
-        problem_descs,
-        QKVElementOp{},
-        QKVElementOp{},
-        Scale{alpha},
-        QKVElementOp{},
-        YElementOp{},
-        param.dropout_prob,
-        std::tuple<int64_t, int64_t>(param.philox_seed, param.philox_offset));
-
-    SimpleDeviceMem workspace(op.GetWorkSpaceSize(arg_ptr.get()));
-
-    op.SetWorkSpacePointer(arg_ptr.get(), workspace.GetDeviceBuffer());
-
-    if (!op.IsSupportedArgument(arg_ptr.get())) {
-      std::ostringstream ostr;
-      ostr << op.GetTypeString() << " does not support this problem";
-
-      throw std::runtime_error(ostr.str());
-    }
-
-    (void)invoker.Run(arg_ptr.get(), StreamConfig{stream, false});
-  };
-};
-
-template <
-    typename scalar_t,
-    int32_t custom_mask_type,
-    bool has_attn_bias,
-    bool use_fp32_qkv_grad>
-void run_grouped_backward_masktype_attnbias_dispatched(
-    GroupedBackwardParams& param,
-    hipStream_t stream) {
-  grouped_backward_masktype_attnbias_dispatched<
-      scalar_t,
-      custom_mask_type,
-      has_attn_bias,
-      use_fp32_qkv_grad>::Run(param, stream);
-};
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_backward_bp16.cpp b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_backward_bp16.cpp
deleted file mode 100644
index 0e3f4f8fa..000000000
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_backward_bp16.cpp
+++ /dev/null
@@ -1,113 +0,0 @@
-/*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
- *
- * This source code is licensed under the BSD-style license found in the
- * LICENSE file in the root directory of this source tree.
- */
-#include <ck/ck.hpp>
-#include <stdexcept>
-
-#include "ck_bool_switch.h"
-#include "ck_fmha_grouped_backward.h"
-
-extern template void run_grouped_backward_masktype_attnbias_dispatched<
-    ck::bhalf_t,
-    0,
-    true,
-    true>(GroupedBackwardParams& param, hipStream_t stream);
-
-extern template void run_grouped_backward_masktype_attnbias_dispatched<
-    ck::bhalf_t,
-    0,
-    true,
-    false>(GroupedBackwardParams& param, hipStream_t stream);
-
-extern template void run_grouped_backward_masktype_attnbias_dispatched<
-    ck::bhalf_t,
-    0,
-    false,
-    true>(GroupedBackwardParams& param, hipStream_t stream);
-
-extern template void run_grouped_backward_masktype_attnbias_dispatched<
-    ck::bhalf_t,
-    0,
-    false,
-    false>(GroupedBackwardParams& param, hipStream_t stream);
-
-extern template void run_grouped_backward_masktype_attnbias_dispatched<
-    ck::bhalf_t,
-    1,
-    true,
-    true>(GroupedBackwardParams& param, hipStream_t stream);
-
-extern template void run_grouped_backward_masktype_attnbias_dispatched<
-    ck::bhalf_t,
-    1,
-    true,
-    false>(GroupedBackwardParams& param, hipStream_t stream);
-
-extern template void run_grouped_backward_masktype_attnbias_dispatched<
-    ck::bhalf_t,
-    1,
-    false,
-    true>(GroupedBackwardParams& param, hipStream_t stream);
-
-extern template void run_grouped_backward_masktype_attnbias_dispatched<
-    ck::bhalf_t,
-    1,
-    false,
-    false>(GroupedBackwardParams& param, hipStream_t stream);
-
-extern template void run_grouped_backward_masktype_attnbias_dispatched<
-    ck::bhalf_t,
-    2,
-    true,
-    true>(GroupedBackwardParams& param, hipStream_t stream);
-
-extern template void run_grouped_backward_masktype_attnbias_dispatched<
-    ck::bhalf_t,
-    2,
-    true,
-    false>(GroupedBackwardParams& param, hipStream_t stream);
-
-extern template void run_grouped_backward_masktype_attnbias_dispatched<
-    ck::bhalf_t,
-    2,
-    false,
-    true>(GroupedBackwardParams& param, hipStream_t stream);
-
-extern template void run_grouped_backward_masktype_attnbias_dispatched<
-    ck::bhalf_t,
-    2,
-    false,
-    false>(GroupedBackwardParams& param, hipStream_t stream);
-
-void grouped_backward_bp16(GroupedBackwardParams& param, hipStream_t stream) {
-  BOOL_SWITCH_2(
-      param.has_attn_bias,
-      HAS_ATTN_BIAS,
-      param.use_fp32_qkv_grad,
-      USE_FP32_QKV_GRAD,
-      [&] {
-        if (param.custom_mask_type == 0) {
-          run_grouped_backward_masktype_attnbias_dispatched<
-              ck::bhalf_t,
-              0,
-              HAS_ATTN_BIAS,
-              USE_FP32_QKV_GRAD>(param, stream);
-        } else if (param.custom_mask_type == 1) {
-          run_grouped_backward_masktype_attnbias_dispatched<
-              ck::bhalf_t,
-              1,
-              HAS_ATTN_BIAS,
-              USE_FP32_QKV_GRAD>(param, stream);
-        } else if (param.custom_mask_type == 2) {
-          run_grouped_backward_masktype_attnbias_dispatched<
-              ck::bhalf_t,
-              2,
-              HAS_ATTN_BIAS,
-              USE_FP32_QKV_GRAD>(param, stream);
-        } else
-          throw std::runtime_error("Invalid custom_mask_type value");
-      });
-};
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_backward_fp16.cpp b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_backward_fp16.cpp
deleted file mode 100644
index ca8a0a4d3..000000000
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_backward_fp16.cpp
+++ /dev/null
@@ -1,113 +0,0 @@
-/*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
- *
- * This source code is licensed under the BSD-style license found in the
- * LICENSE file in the root directory of this source tree.
- */
-#include <ck/ck.hpp>
-#include <stdexcept>
-
-#include "ck_bool_switch.h"
-#include "ck_fmha_grouped_backward.h"
-
-extern template void run_grouped_backward_masktype_attnbias_dispatched<
-    ck::half_t,
-    0,
-    true,
-    true>(GroupedBackwardParams& param, hipStream_t stream);
-
-extern template void run_grouped_backward_masktype_attnbias_dispatched<
-    ck::half_t,
-    0,
-    true,
-    false>(GroupedBackwardParams& param, hipStream_t stream);
-
-extern template void run_grouped_backward_masktype_attnbias_dispatched<
-    ck::half_t,
-    0,
-    false,
-    true>(GroupedBackwardParams& param, hipStream_t stream);
-
-extern template void run_grouped_backward_masktype_attnbias_dispatched<
-    ck::half_t,
-    0,
-    false,
-    false>(GroupedBackwardParams& param, hipStream_t stream);
-
-extern template void run_grouped_backward_masktype_attnbias_dispatched<
-    ck::half_t,
-    1,
-    true,
-    true>(GroupedBackwardParams& param, hipStream_t stream);
-
-extern template void run_grouped_backward_masktype_attnbias_dispatched<
-    ck::half_t,
-    1,
-    true,
-    false>(GroupedBackwardParams& param, hipStream_t stream);
-
-extern template void run_grouped_backward_masktype_attnbias_dispatched<
-    ck::half_t,
-    1,
-    false,
-    true>(GroupedBackwardParams& param, hipStream_t stream);
-
-extern template void run_grouped_backward_masktype_attnbias_dispatched<
-    ck::half_t,
-    1,
-    false,
-    false>(GroupedBackwardParams& param, hipStream_t stream);
-
-extern template void run_grouped_backward_masktype_attnbias_dispatched<
-    ck::half_t,
-    2,
-    true,
-    true>(GroupedBackwardParams& param, hipStream_t stream);
-
-extern template void run_grouped_backward_masktype_attnbias_dispatched<
-    ck::half_t,
-    2,
-    true,
-    false>(GroupedBackwardParams& param, hipStream_t stream);
-
-extern template void run_grouped_backward_masktype_attnbias_dispatched<
-    ck::half_t,
-    2,
-    false,
-    true>(GroupedBackwardParams& param, hipStream_t stream);
-
-extern template void run_grouped_backward_masktype_attnbias_dispatched<
-    ck::half_t,
-    2,
-    false,
-    false>(GroupedBackwardParams& param, hipStream_t stream);
-
-void grouped_backward_fp16(GroupedBackwardParams& param, hipStream_t stream) {
-  BOOL_SWITCH_2(
-      param.has_attn_bias,
-      HAS_ATTN_BIAS,
-      param.use_fp32_qkv_grad,
-      USE_FP32_QKV_GRAD,
-      [&] {
-        if (param.custom_mask_type == 0) {
-          run_grouped_backward_masktype_attnbias_dispatched<
-              ck::half_t,
-              0,
-              HAS_ATTN_BIAS,
-              USE_FP32_QKV_GRAD>(param, stream);
-        } else if (param.custom_mask_type == 1) {
-          run_grouped_backward_masktype_attnbias_dispatched<
-              ck::half_t,
-              1,
-              HAS_ATTN_BIAS,
-              USE_FP32_QKV_GRAD>(param, stream);
-        } else if (param.custom_mask_type == 2) {
-          run_grouped_backward_masktype_attnbias_dispatched<
-              ck::half_t,
-              2,
-              HAS_ATTN_BIAS,
-              USE_FP32_QKV_GRAD>(param, stream);
-        } else
-          throw std::runtime_error("Invalid custom_mask_type value");
-      });
-};
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_forward.h b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_forward.h
deleted file mode 100644
index 0095ec2a7..000000000
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_forward.h
+++ /dev/null
@@ -1,375 +0,0 @@
-/*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
- *
- * This source code is licensed under the BSD-style license found in the
- * LICENSE file in the root directory of this source tree.
- */
-#pragma once
-
-#include <sstream>
-#include <stdexcept>
-
-#include <ck/ck.hpp>
-#include <ck/tensor_operation/gpu/device/gemm_specialization.hpp>
-#include <ck/tensor_operation/gpu/device/impl/device_grouped_mha_fwd_xdl_cshuffle_v2.hpp>
-#include <ck/tensor_operation/gpu/device/tensor_specialization.hpp>
-#include <ck/tensor_operation/gpu/element/element_wise_operation.hpp>
-#include <ck/utility/math.hpp>
-#include <ck/utility/number.hpp>
-
-#include "ck_align_switch.h"
-#include "ck_fmha_common_gemm_constants.h"
-#include "ck_fmha_forward_gemm_constants.h"
-#include "ck_fmha_op_helper.h"
-#include "ck_fmha_params.h"
-
-template <typename scalar_t, int32_t custom_mask_type, bool has_attn_bias>
-struct grouped_forward_masktype_attnbias_dispatched {
-  using PassThrough = ck::tensor_operation::element_wise::PassThrough;
-
-  using GemmDataType = scalar_t;
-  using ADataType = scalar_t;
-  using B0DataType = scalar_t;
-  using B1DataType = scalar_t;
-  using AccDataType = F32;
-  using CShuffleDataType = F32;
-  using CDataType = scalar_t;
-  using ZDataType = unsigned short;
-  using LSEDataType = F32;
-  using Acc0BiasDataType =
-      typename std::conditional<has_attn_bias, scalar_t, void>::type;
-  using Acc1BiasDataType = void;
-
-  using AElementOp = PassThrough;
-  using B0ElementOp = PassThrough;
-  using Acc0ElementOp = ck::tensor_operation::element_wise::Scale;
-  using B1ElementOp = PassThrough;
-  using CElementOp = PassThrough;
-
-  static constexpr auto GemmSpec =
-      ck::tensor_operation::device::GemmSpecialization::MNKOPadding;
-  static constexpr auto MaskingSpec =
-      static_cast<ck::tensor_operation::device::MaskingSpecialization>(
-          custom_mask_type);
-
-  static constexpr ck::index_t kAcc0BiasTransferSrcScalarPerVector = 1;
-
-#ifndef GROUPED_FORWARD_HEADDIM_SWITCH
-#define GROUPED_FORWARD_HEADDIM_SWITCH(HEAD_DIM1, HEAD_DIM2, ...) \
-  [&] {                                                           \
-    if (HEAD_DIM1 <= 32 && HEAD_DIM2 <= 32) {                     \
-      constexpr ck::index_t kGemm1NPerBlock = 32;                 \
-      constexpr ck::index_t kGemm1NXdlPerWave = 1;                \
-      constexpr ck::index_t kCShuffleNXdlPerWavePerShuffle = 1;   \
-      __VA_ARGS__();                                              \
-    } else if (HEAD_DIM1 <= 64 && HEAD_DIM2 <= 64) {              \
-      constexpr ck::index_t kGemm1NPerBlock = 64;                 \
-      constexpr ck::index_t kGemm1NXdlPerWave = 2;                \
-      constexpr ck::index_t kCShuffleNXdlPerWavePerShuffle = 2;   \
-      __VA_ARGS__();                                              \
-    } else {                                                      \
-      constexpr ck::index_t kGemm1NPerBlock = 128;                \
-      constexpr ck::index_t kGemm1NXdlPerWave = 4;                \
-      constexpr ck::index_t kCShuffleNXdlPerWavePerShuffle = 4;   \
-      __VA_ARGS__();                                              \
-    }                                                             \
-  }()
-#endif
-
-  // clang-format off
-  template <
-      ck::index_t kGemm1NPerBlock,
-      ck::index_t kGemm1NXdlPerWave,
-      ck::index_t kCShuffleNXdlPerWavePerShuffle,
-      ck::index_t kABBlockTransferSrcScalarPerVector,
-      ck::index_t kB1BlockTransferSrcScalarPerVector,
-      ck::index_t kCShuffleBlockTransferScalarPerVector>
-  using DeviceOpInstanceTemp = ck::tensor_operation::device::DeviceGroupedMultiheadAttentionForward_Xdl_CShuffle_V2<
-          GemmOpConstantsCommon::NumDimG,
-          GemmOpConstantsCommon::NumDimM,
-          GemmOpConstantsCommon::NumDimN,
-          GemmOpConstantsCommon::NumDimK,
-          GemmOpConstantsCommon::NumDimO,
-          ADataType,
-          B0DataType,
-          B1DataType,
-          CDataType,
-          GemmDataType,
-          ZDataType,
-          LSEDataType,
-          Acc0BiasDataType,
-          Acc1BiasDataType,
-          AccDataType,
-          CShuffleDataType,
-          AElementOp,
-          B0ElementOp,
-          Acc0ElementOp,
-          B1ElementOp,
-          CElementOp,
-          GemmSpec,
-          GemmOpConstantsCommon::TensorSpecA,
-          GemmOpConstantsCommon::TensorSpecB0,
-          GemmOpConstantsCommon::TensorSpecB1,
-          GemmOpConstantsCommon::TensorSpecC,
-          GemmOpConstantsGroupedForward::NumGemmKPrefetchStage,
-          GemmOpConstantsGroupedForward::BlockSize,
-          GemmOpConstantsGroupedForward::MPerBlock,
-          GemmOpConstantsGroupedForward::NPerBlock,
-          GemmOpConstantsGroupedForward::KPerBlock,
-          kGemm1NPerBlock,
-          GemmOpConstantsGroupedForward::Gemm1KPerBlock,
-          GemmOpConstantsGroupedForward::AK1,
-          GemmOpConstantsGroupedForward::BK1,
-          GemmOpConstantsGroupedForward::B1K1,
-          GemmOpConstantsGroupedForward::MPerXDL,
-          GemmOpConstantsGroupedForward::NPerXDL,
-          GemmOpConstantsGroupedForward::MXdlPerWave,
-          GemmOpConstantsGroupedForward::NXdlPerWave,
-          kGemm1NXdlPerWave,
-          GemmOpConstantsGroupedForward::DropoutStep,
-          GemmOpConstantsGroupedForward::ABlockTransferThreadClusterLengths_AK0_M_AK1,
-          GemmOpConstantsGroupedForward::ABlockTransferThreadClusterArrangeOrder,
-          GemmOpConstantsGroupedForward::ABlockTransferSrcAccessOrder,
-          GemmOpConstantsGroupedForward::ABlockTransferSrcVectorDim,
-          kABBlockTransferSrcScalarPerVector,
-          GemmOpConstantsGroupedForward::ABlockTransferDstScalarPerVector_AK1,
-          GemmOpConstantsGroupedForward::ABlockLdsExtraM,
-          GemmOpConstantsGroupedForward::BBlockTransferThreadClusterLengths_BK0_N_BK1,
-          GemmOpConstantsGroupedForward::BBlockTransferThreadClusterArrangeOrder,
-          GemmOpConstantsGroupedForward::BBlockTransferSrcAccessOrder,
-          GemmOpConstantsGroupedForward::BBlockTransferSrcVectorDim,
-          kABBlockTransferSrcScalarPerVector,
-          GemmOpConstantsGroupedForward::BBlockTransferDstScalarPerVector_BK1,
-          GemmOpConstantsGroupedForward::BBlockLdsExtraN,
-          kAcc0BiasTransferSrcScalarPerVector,
-          GemmOpConstantsGroupedForward::B1BlockTransferThreadClusterLengths_BK0_N_BK1,
-          GemmOpConstantsGroupedForward::B1BlockTransferThreadClusterArrangeOrder,
-          GemmOpConstantsGroupedForward::B1BlockTransferSrcAccessOrder,
-          GemmOpConstantsGroupedForward::B1BlockTransferSrcVectorDim,
-          kB1BlockTransferSrcScalarPerVector,
-          GemmOpConstantsGroupedForward::B1BlockTransferDstScalarPerVector_BK1,
-          GemmOpConstantsGroupedForward::B1BlockLdsExtraN,
-          GemmOpConstantsGroupedForward::CShuffleMXdlPerWavePerShuffle,
-          kCShuffleNXdlPerWavePerShuffle,
-          GemmOpConstantsGroupedForward::CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock,
-          kCShuffleBlockTransferScalarPerVector,
-          GemmOpConstantsGroupedForward::Acc1BiasTransferSrcScalarPerVector,
-          MaskingSpec>;
-  // clang-format on
-
-  static constexpr auto I1 = ck::Number<1>{};
-  static constexpr auto I2 = ck::Number<2>{};
-  static constexpr auto I3 = ck::Number<3>{};
-
-  static void Run(GroupedForwardParams& param, hipStream_t stream) {
-    using ck::math::min;
-
-    // compile-time constants which don't depend on head-dim switching
-    constexpr ck::index_t thread_slice_length_ak1 =
-        GemmOpConstantsGroupedForward::AK1 /
-        GemmOpConstantsGroupedForward::
-            ABlockTransferThreadClusterLengths_AK0_M_AK1::At(I2);
-    constexpr ck::index_t thread_slice_length_bk1 =
-        GemmOpConstantsGroupedForward::BK1 /
-        GemmOpConstantsGroupedForward::
-            BBlockTransferThreadClusterLengths_BK0_N_BK1::At(I2);
-
-    static_assert(
-        thread_slice_length_ak1 == thread_slice_length_bk1,
-        "ABlockTransfer and BBlockTransfer should use completely same K1 sizes and "
-        "ThreadClusterLengths!");
-
-    constexpr ck::index_t kABBlockTransferSrcScalarPerVector_max =
-        min(8, thread_slice_length_ak1);
-
-    GROUPED_FORWARD_HEADDIM_SWITCH(param.K, param.Kv, [&] {
-      constexpr ck::index_t thread_slice_length_gemm1n = kGemm1NPerBlock /
-          GemmOpConstantsGroupedForward::
-              B1BlockTransferThreadClusterLengths_BK0_N_BK1::At(I1);
-      constexpr ck::index_t kB1BlockTransferSrcScalarPerVector_max =
-          min(2, thread_slice_length_gemm1n);
-
-      constexpr ck::index_t thread_slice_length_cshuflle_n =
-          (kCShuffleNXdlPerWavePerShuffle * kGemm1NPerBlock /
-           kGemm1NXdlPerWave) /
-          GemmOpConstantsGroupedForward::
-              CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock ::
-                  At(I3);
-
-      constexpr ck::index_t kCShuffleBlockTransferScalarPerVector_max =
-          min(4, thread_slice_length_cshuflle_n);
-
-      if constexpr (
-          kB1BlockTransferSrcScalarPerVector_max >=
-          kCShuffleBlockTransferScalarPerVector_max) {
-        ALIGN_SWITCH_2(
-            kABBlockTransferSrcScalarPerVector_max,
-            kABBlockTransferSrcScalarPerVector,
-            param.K,
-            kB1BlockTransferSrcScalarPerVector_max,
-            kB1BlockTransferSrcScalarPerVector,
-            param.Kv,
-            [&] {
-              constexpr ck::index_t kCShuffleBlockTransferScalarPerVector =
-                  min(kB1BlockTransferSrcScalarPerVector,
-                      kCShuffleBlockTransferScalarPerVector_max);
-              using DeviceOpInstance = DeviceOpInstanceTemp<
-                  kGemm1NPerBlock,
-                  kGemm1NXdlPerWave,
-                  kCShuffleNXdlPerWavePerShuffle,
-                  kABBlockTransferSrcScalarPerVector,
-                  kB1BlockTransferSrcScalarPerVector,
-                  kCShuffleBlockTransferScalarPerVector>;
-
-              RunWithDeviceOp<DeviceOpInstance>(param, stream);
-            });
-      } else {
-        ALIGN_SWITCH_2(
-            kABBlockTransferSrcScalarPerVector_max,
-            kABBlockTransferSrcScalarPerVector,
-            param.K,
-            kCShuffleBlockTransferScalarPerVector_max,
-            kCShuffleBlockTransferScalarPerVector,
-            param.Kv,
-            [&] {
-              constexpr ck::index_t kB1BlockTransferSrcScalarPerVector =
-                  min(kCShuffleBlockTransferScalarPerVector,
-                      kB1BlockTransferSrcScalarPerVector_max);
-              using DeviceOpInstance = DeviceOpInstanceTemp<
-                  kGemm1NPerBlock,
-                  kGemm1NXdlPerWave,
-                  kCShuffleNXdlPerWavePerShuffle,
-                  kABBlockTransferSrcScalarPerVector,
-                  kB1BlockTransferSrcScalarPerVector,
-                  kCShuffleBlockTransferScalarPerVector>;
-
-              RunWithDeviceOp<DeviceOpInstance>(param, stream);
-            });
-      };
-    });
-  };
-
-  template <typename DeviceOpInstance>
-  static void RunWithDeviceOp(GroupedForwardParams& param, hipStream_t stream) {
-    std::vector<typename DeviceOpInstance::ProblemDesc> problem_descs;
-
-    for (std::size_t i = 0; i < param.num_batches; i++) {
-      int M = param.host_seqstart_q[i + 1] - param.host_seqstart_q[i];
-      int N = param.host_seqlen_k.empty()
-          ? param.host_seqstart_k[i + 1] - param.host_seqstart_k[i]
-          : param.host_seqlen_k[i];
-      int K = param.K;
-      int Kv = param.Kv;
-      int G1q = param.Hq;
-      int G1kv = param.Hkv;
-
-      std::vector<ck::index_t> a_gs_ms_ks_lengths{1, G1q, M, K};
-      std::vector<ck::index_t> a_gs_ms_ks_strides{
-          0, param.q_strides[1], param.q_strides[0], param.q_strides[2]};
-
-      std::vector<ck::index_t> b0_gs_ns_ks_lengths{1, G1kv, N, K};
-      std::vector<ck::index_t> b0_gs_ns_ks_strides{
-          0, param.k_strides[1], param.k_strides[0], param.k_strides[2]};
-
-      // to be changed to b1_gs_ns_os_lengths
-      std::vector<ck::index_t> b1_gs_os_ns_lengths{1, G1kv, Kv, N};
-      std::vector<ck::index_t> b1_gs_os_ns_strides{
-          0, param.v_strides[1], param.v_strides[2], param.v_strides[0]};
-
-      std::vector<ck::index_t> c_gs_ms_os_lengths{1, G1q, M, Kv};
-      std::vector<ck::index_t> c_gs_ms_os_strides{
-          0, param.out_strides[1], param.out_strides[0], param.out_strides[2]};
-
-      std::vector<ck::index_t> lse_gs_ms_lengths{1, G1q, M};
-      std::vector<ck::index_t> lse_gs_ms_strides{0, param.max_seqlen_q, 1};
-
-      std::vector<ck::index_t> d_gs_ms_ns_lengths;
-      std::vector<ck::index_t> d_gs_ms_ns_strides;
-
-      if constexpr (has_attn_bias) {
-        d_gs_ms_ns_lengths = {1, G1q, M, N};
-        d_gs_ms_ns_strides = {
-            0,
-            param.attn_bias_strides[0],
-            param.attn_bias_strides[1],
-            param.attn_bias_strides[2]};
-      } else {
-        d_gs_ms_ns_lengths = {1, 1, 1, 1};
-        d_gs_ms_ns_strides = {0, 0, 0, 0};
-      };
-
-      problem_descs.push_back(
-          {a_gs_ms_ks_lengths,
-           a_gs_ms_ks_strides,
-           b0_gs_ns_ks_lengths,
-           b0_gs_ns_ks_strides,
-           b1_gs_os_ns_lengths,
-           b1_gs_os_ns_strides,
-           c_gs_ms_os_lengths,
-           c_gs_ms_os_strides,
-           {1, 1, 1, 1},
-           {0, 0, 0, 0},
-           lse_gs_ms_lengths,
-           lse_gs_ms_strides,
-           d_gs_ms_ns_lengths,
-           d_gs_ms_ns_strides,
-           {}, // acc1_bias_gs_ms_os_lengths
-           {}}); // acc1_bias_gs_ms_os_strides
-    }
-
-    float alpha = param.scale;
-
-    auto a_element_op = AElementOp{};
-    auto b0_element_op = B0ElementOp{};
-    auto acc0_element_op = Acc0ElementOp{alpha};
-    auto b1_element_op = B1ElementOp{};
-    auto c_element_op = CElementOp{};
-
-    auto op = DeviceOpInstance{};
-    auto invoker = op.MakeInvoker();
-
-    auto arg_ptr = op.MakeArgumentPointer(
-        param.q_ptrs,
-        param.k_ptrs,
-        param.v_ptrs,
-        param.out_ptrs,
-        param.randvals_ptrs,
-        param.logsumexp_ptrs,
-        param.attn_bias_ptrs,
-        {}, // p_acc1_biases
-        problem_descs,
-        a_element_op,
-        b0_element_op,
-        acc0_element_op,
-        b1_element_op,
-        c_element_op,
-        param.use_dropout ? param.dropout_prob : 0.0f, // dropout ratio
-        std::tuple<int64_t, int64_t>(param.philox_seed, param.philox_offset));
-
-    auto sizeInBytes = op.GetWorkSpaceSize(arg_ptr.get());
-
-    SimpleDeviceMem workspace(sizeInBytes);
-
-    op.SetWorkSpacePointer(arg_ptr.get(), workspace.GetDeviceBuffer());
-
-    if (!op.IsSupportedArgument(arg_ptr.get())) {
-      std::ostringstream ostr;
-
-      ostr << op.GetTypeString() << " does not support this problem";
-
-      throw std::runtime_error(ostr.str());
-    }
-
-    (void)invoker.Run(arg_ptr.get(), StreamConfig{stream, false});
-  };
-};
-
-template <typename scalar_t, int32_t custom_mask_type, bool has_attn_bias>
-void run_grouped_forward_masktype_attnbias_dispatched(
-    GroupedForwardParams& param,
-    hipStream_t stream) {
-  grouped_forward_masktype_attnbias_dispatched<
-      scalar_t,
-      custom_mask_type,
-      has_attn_bias>::Run(param, stream);
-};
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_forward_bp16.cpp b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_forward_bp16.cpp
deleted file mode 100644
index 72ebd715e..000000000
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_forward_bp16.cpp
+++ /dev/null
@@ -1,63 +0,0 @@
-/*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
- *
- * This source code is licensed under the BSD-style license found in the
- * LICENSE file in the root directory of this source tree.
- */
-#include <ck/ck.hpp>
-#include <stdexcept>
-
-#include "ck_bool_switch.h"
-#include "ck_fmha_grouped_forward.h"
-
-extern template void run_grouped_forward_masktype_attnbias_dispatched<
-    ck::bhalf_t,
-    0,
-    true>(GroupedForwardParams& param, hipStream_t stream);
-
-extern template void run_grouped_forward_masktype_attnbias_dispatched<
-    ck::bhalf_t,
-    0,
-    false>(GroupedForwardParams& param, hipStream_t stream);
-
-extern template void run_grouped_forward_masktype_attnbias_dispatched<
-    ck::bhalf_t,
-    1,
-    true>(GroupedForwardParams& param, hipStream_t stream);
-
-extern template void run_grouped_forward_masktype_attnbias_dispatched<
-    ck::bhalf_t,
-    1,
-    false>(GroupedForwardParams& param, hipStream_t stream);
-
-extern template void run_grouped_forward_masktype_attnbias_dispatched<
-    ck::bhalf_t,
-    2,
-    true>(GroupedForwardParams& param, hipStream_t stream);
-
-extern template void run_grouped_forward_masktype_attnbias_dispatched<
-    ck::bhalf_t,
-    2,
-    false>(GroupedForwardParams& param, hipStream_t stream);
-
-void grouped_forward_bp16(GroupedForwardParams& param, hipStream_t stream) {
-  BOOL_SWITCH_1(param.has_attn_bias, HAS_ATTN_BIAS, [&] {
-    if (param.custom_mask_type == 0)
-      run_grouped_forward_masktype_attnbias_dispatched<
-          ck::bhalf_t,
-          0,
-          HAS_ATTN_BIAS>(param, stream);
-    else if (param.custom_mask_type == 1)
-      run_grouped_forward_masktype_attnbias_dispatched<
-          ck::bhalf_t,
-          1,
-          HAS_ATTN_BIAS>(param, stream);
-    else if (param.custom_mask_type == 2)
-      run_grouped_forward_masktype_attnbias_dispatched<
-          ck::bhalf_t,
-          2,
-          HAS_ATTN_BIAS>(param, stream);
-    else
-      throw std::runtime_error("Invalid custom_mask_type value");
-  });
-};
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_forward_fp16.cpp b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_forward_fp16.cpp
deleted file mode 100644
index eb53ad433..000000000
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_forward_fp16.cpp
+++ /dev/null
@@ -1,63 +0,0 @@
-/*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
- *
- * This source code is licensed under the BSD-style license found in the
- * LICENSE file in the root directory of this source tree.
- */
-#include <ck/ck.hpp>
-#include <stdexcept>
-
-#include "ck_bool_switch.h"
-#include "ck_fmha_grouped_forward.h"
-
-extern template void run_grouped_forward_masktype_attnbias_dispatched<
-    ck::half_t,
-    0,
-    true>(GroupedForwardParams& param, hipStream_t stream);
-
-extern template void run_grouped_forward_masktype_attnbias_dispatched<
-    ck::half_t,
-    0,
-    false>(GroupedForwardParams& param, hipStream_t stream);
-
-extern template void run_grouped_forward_masktype_attnbias_dispatched<
-    ck::half_t,
-    1,
-    true>(GroupedForwardParams& param, hipStream_t stream);
-
-extern template void run_grouped_forward_masktype_attnbias_dispatched<
-    ck::half_t,
-    1,
-    false>(GroupedForwardParams& param, hipStream_t stream);
-
-extern template void run_grouped_forward_masktype_attnbias_dispatched<
-    ck::half_t,
-    2,
-    true>(GroupedForwardParams& param, hipStream_t stream);
-
-extern template void run_grouped_forward_masktype_attnbias_dispatched<
-    ck::half_t,
-    2,
-    false>(GroupedForwardParams& param, hipStream_t stream);
-
-void grouped_forward_fp16(GroupedForwardParams& param, hipStream_t stream) {
-  BOOL_SWITCH_1(param.has_attn_bias, HAS_ATTN_BIAS, [&] {
-    if (param.custom_mask_type == 0)
-      run_grouped_forward_masktype_attnbias_dispatched<
-          ck::half_t,
-          0,
-          HAS_ATTN_BIAS>(param, stream);
-    else if (param.custom_mask_type == 1)
-      run_grouped_forward_masktype_attnbias_dispatched<
-          ck::half_t,
-          1,
-          HAS_ATTN_BIAS>(param, stream);
-    else if (param.custom_mask_type == 2)
-      run_grouped_forward_masktype_attnbias_dispatched<
-          ck::half_t,
-          2,
-          HAS_ATTN_BIAS>(param, stream);
-    else
-      throw std::runtime_error("Invalid custom_mask_type value");
-  });
-};
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_infer.h b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_infer.h
deleted file mode 100644
index fbc0b2b1a..000000000
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_infer.h
+++ /dev/null
@@ -1,359 +0,0 @@
-/*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
- *
- * This source code is licensed under the BSD-style license found in the
- * LICENSE file in the root directory of this source tree.
- */
-#pragma once
-
-#include <sstream>
-#include <stdexcept>
-
-#include <ck/ck.hpp>
-#include <ck/tensor_operation/gpu/device/gemm_specialization.hpp>
-#include <ck/tensor_operation/gpu/device/impl/device_grouped_mha_infer_xdl_cshuffle.hpp>
-#include <ck/tensor_operation/gpu/device/tensor_specialization.hpp>
-#include <ck/tensor_operation/gpu/element/element_wise_operation.hpp>
-#include <ck/utility/math.hpp>
-#include <ck/utility/number.hpp>
-
-#include "ck_align_switch.h"
-#include "ck_fmha_common_gemm_constants.h"
-#include "ck_fmha_infer_gemm_constants.h"
-#include "ck_fmha_op_helper.h"
-#include "ck_fmha_params.h"
-
-template <typename scalar_t, int32_t custom_mask_type, bool has_attn_bias>
-struct grouped_infer_masktype_attnbias_dispatched {
-  using PassThrough = ck::tensor_operation::element_wise::PassThrough;
-
-  using GemmDataType = scalar_t;
-  using ADataType = scalar_t;
-  using B0DataType = scalar_t;
-  using B1DataType = scalar_t;
-  using AccDataType = F32;
-  using CShuffleDataType = F32;
-  using CDataType = scalar_t;
-  using ZDataType = unsigned short;
-  using LSEDataType = F32;
-  using Acc0BiasDataType =
-      typename std::conditional<has_attn_bias, scalar_t, void>::type;
-  using Acc1BiasDataType = void;
-
-  using AElementOp = PassThrough;
-  using B0ElementOp = PassThrough;
-  using Acc0ElementOp = ck::tensor_operation::element_wise::Scale;
-  using B1ElementOp = PassThrough;
-  using CElementOp = PassThrough;
-
-  static constexpr auto GemmSpec =
-      ck::tensor_operation::device::GemmSpecialization::MNKOPadding;
-  static constexpr auto MaskingSpec =
-      static_cast<ck::tensor_operation::device::MaskingSpecialization>(
-          custom_mask_type);
-
-  static constexpr ck::index_t kAcc0BiasTransferSrcScalarPerVector = 1;
-
-#ifndef GROUPED_INFER_HEADDIM_SWITCH
-#define GROUPED_INFER_HEADDIM_SWITCH(HEAD_DIM1, HEAD_DIM2, ...) \
-  [&] {                                                         \
-    if (HEAD_DIM1 <= 32 && HEAD_DIM2 <= 32) {                   \
-      constexpr ck::index_t kGemm1NPerBlock = 32;               \
-      constexpr ck::index_t kGemm1NXdlPerWave = 1;              \
-      constexpr ck::index_t kCShuffleNXdlPerWavePerShuffle = 1; \
-      __VA_ARGS__();                                            \
-    } else if (HEAD_DIM1 <= 64 && HEAD_DIM2 <= 64) {            \
-      constexpr ck::index_t kGemm1NPerBlock = 64;               \
-      constexpr ck::index_t kGemm1NXdlPerWave = 2;              \
-      constexpr ck::index_t kCShuffleNXdlPerWavePerShuffle = 2; \
-      __VA_ARGS__();                                            \
-    } else {                                                    \
-      constexpr ck::index_t kGemm1NPerBlock = 128;              \
-      constexpr ck::index_t kGemm1NXdlPerWave = 4;              \
-      constexpr ck::index_t kCShuffleNXdlPerWavePerShuffle = 4; \
-      __VA_ARGS__();                                            \
-    }                                                           \
-  }()
-#endif
-
-  // clang-format off
-  template <
-      ck::index_t kGemm1NPerBlock,
-      ck::index_t kGemm1NXdlPerWave,
-      ck::index_t kCShuffleNXdlPerWavePerShuffle,
-      ck::index_t kABBlockTransferSrcScalarPerVector,
-      ck::index_t kB1BlockTransferSrcScalarPerVector,
-      ck::index_t kCShuffleBlockTransferScalarPerVector>
-  using DeviceOpInstanceTemp = ck::tensor_operation::device::DeviceGroupedMultiheadAttentionInfer_Xdl_CShuffle<
-          GemmOpConstantsCommon::NumDimG,
-          GemmOpConstantsCommon::NumDimM,
-          GemmOpConstantsCommon::NumDimN,
-          GemmOpConstantsCommon::NumDimK,
-          GemmOpConstantsCommon::NumDimO,
-          ADataType,
-          B0DataType,
-          B1DataType,
-          CDataType,
-          Acc0BiasDataType,
-          Acc1BiasDataType,
-          AccDataType,
-          CShuffleDataType,
-          AElementOp,
-          B0ElementOp,
-          Acc0ElementOp,
-          B1ElementOp,
-          CElementOp,
-          GemmSpec,
-          GemmOpConstantsCommon::TensorSpecA,
-          GemmOpConstantsCommon::TensorSpecB0,
-          GemmOpConstantsCommon::TensorSpecB1,
-          GemmOpConstantsCommon::TensorSpecC,
-          GemmOpConstantsBatchedInfer::NumGemmKPrefetchStage,
-          GemmOpConstantsGroupedInfer::BlockSize,
-          GemmOpConstantsGroupedInfer::MPerBlock,
-          GemmOpConstantsGroupedInfer::NPerBlock,
-          GemmOpConstantsGroupedInfer::KPerBlock,
-          kGemm1NPerBlock,
-          GemmOpConstantsGroupedInfer::Gemm1KPerBlock,
-          GemmOpConstantsGroupedInfer::AK1,
-          GemmOpConstantsGroupedInfer::BK1,
-          GemmOpConstantsGroupedInfer::B1K1,
-          GemmOpConstantsGroupedInfer::MPerXDL,
-          GemmOpConstantsGroupedInfer::NPerXDL,
-          GemmOpConstantsGroupedInfer::MXdlPerWave,
-          GemmOpConstantsGroupedInfer::NXdlPerWave,
-          kGemm1NXdlPerWave,
-          GemmOpConstantsGroupedInfer::ABlockTransferThreadClusterLengths_AK0_M_AK1,
-          GemmOpConstantsGroupedInfer::ABlockTransferThreadClusterArrangeOrder,
-          GemmOpConstantsGroupedInfer::ABlockTransferSrcAccessOrder,
-          GemmOpConstantsGroupedInfer::ABlockTransferSrcVectorDim,
-          kABBlockTransferSrcScalarPerVector, 
-          GemmOpConstantsGroupedInfer::ABlockTransferDstScalarPerVector_AK1,
-          GemmOpConstantsGroupedInfer::ABlockLdsExtraM,
-          GemmOpConstantsGroupedInfer::BBlockTransferThreadClusterLengths_BK0_N_BK1,
-          GemmOpConstantsGroupedInfer::BBlockTransferThreadClusterArrangeOrder,
-          GemmOpConstantsGroupedInfer::BBlockTransferSrcAccessOrder,
-          GemmOpConstantsGroupedInfer::BBlockTransferSrcVectorDim,
-          kABBlockTransferSrcScalarPerVector,
-          GemmOpConstantsGroupedInfer::BBlockTransferDstScalarPerVector_BK1,
-          GemmOpConstantsGroupedInfer::BBlockLdsExtraN,
-          kAcc0BiasTransferSrcScalarPerVector,
-          GemmOpConstantsGroupedInfer::B1BlockTransferThreadClusterLengths_BK0_N_BK1,
-          GemmOpConstantsGroupedInfer::B1BlockTransferThreadClusterArrangeOrder,
-          GemmOpConstantsGroupedInfer::B1BlockTransferSrcAccessOrder,
-          GemmOpConstantsGroupedInfer::B1BlockTransferSrcVectorDim,
-          kB1BlockTransferSrcScalarPerVector,
-          GemmOpConstantsGroupedInfer::B1BlockTransferDstScalarPerVector_BK1,
-          GemmOpConstantsGroupedInfer::B1BlockLdsExtraN,
-          GemmOpConstantsGroupedInfer::CShuffleMXdlPerWavePerShuffle,
-          kCShuffleNXdlPerWavePerShuffle,
-          GemmOpConstantsGroupedInfer::CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock,
-          kCShuffleBlockTransferScalarPerVector,
-          MaskingSpec>;
-  // clang-format on
-
-  static constexpr auto I1 = ck::Number<1>{};
-  static constexpr auto I2 = ck::Number<2>{};
-  static constexpr auto I3 = ck::Number<3>{};
-
-  static void Run(GroupedForwardParams& param, hipStream_t stream) {
-    using ck::math::min;
-
-    // compile-time constants which don't depend on head-dim switching
-    constexpr ck::index_t thread_slice_length_ak1 =
-        GemmOpConstantsGroupedInfer::AK1 /
-        GemmOpConstantsGroupedInfer::
-            ABlockTransferThreadClusterLengths_AK0_M_AK1::At(I2);
-    constexpr ck::index_t thread_slice_length_bk1 =
-        GemmOpConstantsGroupedInfer::BK1 /
-        GemmOpConstantsGroupedInfer::
-            BBlockTransferThreadClusterLengths_BK0_N_BK1::At(I2);
-
-    static_assert(
-        thread_slice_length_ak1 == thread_slice_length_bk1,
-        "ABlockTransfer and BBlockTransfer should use completely same K1 sizes and "
-        "ThreadClusterLengths!");
-
-    constexpr ck::index_t kABBlockTransferSrcScalarPerVector_max =
-        min(8, thread_slice_length_ak1);
-
-    GROUPED_INFER_HEADDIM_SWITCH(param.K, param.Kv, [&] {
-      constexpr ck::index_t thread_slice_length_gemm1n = kGemm1NPerBlock /
-          GemmOpConstantsGroupedInfer::
-              B1BlockTransferThreadClusterLengths_BK0_N_BK1::At(I1);
-      constexpr ck::index_t kB1BlockTransferSrcScalarPerVector_max =
-          min(4, thread_slice_length_gemm1n);
-
-      constexpr ck::index_t thread_slice_length_cshuflle_n =
-          (kCShuffleNXdlPerWavePerShuffle * kGemm1NPerBlock /
-           kGemm1NXdlPerWave) /
-          GemmOpConstantsGroupedInfer::
-              CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock ::
-                  At(I3);
-
-      constexpr ck::index_t kCShuffleBlockTransferScalarPerVector_max =
-          min(4, thread_slice_length_cshuflle_n);
-
-      if constexpr (
-          kB1BlockTransferSrcScalarPerVector_max >=
-          kCShuffleBlockTransferScalarPerVector_max) {
-        ALIGN_SWITCH_2(
-            kABBlockTransferSrcScalarPerVector_max,
-            kABBlockTransferSrcScalarPerVector,
-            param.K,
-            kB1BlockTransferSrcScalarPerVector_max,
-            kB1BlockTransferSrcScalarPerVector,
-            param.Kv,
-            [&] {
-              constexpr ck::index_t kCShuffleBlockTransferScalarPerVector =
-                  min(kB1BlockTransferSrcScalarPerVector,
-                      kCShuffleBlockTransferScalarPerVector_max);
-              using DeviceOpInstance = DeviceOpInstanceTemp<
-                  kGemm1NPerBlock,
-                  kGemm1NXdlPerWave,
-                  kCShuffleNXdlPerWavePerShuffle,
-                  kABBlockTransferSrcScalarPerVector,
-                  kB1BlockTransferSrcScalarPerVector,
-                  kCShuffleBlockTransferScalarPerVector>;
-
-              RunWithDeviceOp<DeviceOpInstance>(param, stream);
-            });
-      } else {
-        ALIGN_SWITCH_2(
-            kABBlockTransferSrcScalarPerVector_max,
-            kABBlockTransferSrcScalarPerVector,
-            param.K,
-            kCShuffleBlockTransferScalarPerVector_max,
-            kCShuffleBlockTransferScalarPerVector,
-            param.Kv,
-            [&] {
-              constexpr ck::index_t kB1BlockTransferSrcScalarPerVector =
-                  min(kCShuffleBlockTransferScalarPerVector,
-                      kB1BlockTransferSrcScalarPerVector_max);
-              using DeviceOpInstance = DeviceOpInstanceTemp<
-                  kGemm1NPerBlock,
-                  kGemm1NXdlPerWave,
-                  kCShuffleNXdlPerWavePerShuffle,
-                  kABBlockTransferSrcScalarPerVector,
-                  kB1BlockTransferSrcScalarPerVector,
-                  kCShuffleBlockTransferScalarPerVector>;
-
-              RunWithDeviceOp<DeviceOpInstance>(param, stream);
-            });
-      };
-    });
-  };
-
-  template <typename DeviceOpInstance>
-  static void RunWithDeviceOp(GroupedForwardParams& param, hipStream_t stream) {
-    std::vector<typename DeviceOpInstance::ProblemDesc> problem_descs;
-
-    for (std::size_t i = 0; i < param.num_batches; i++) {
-      int M = param.host_seqstart_q[i + 1] - param.host_seqstart_q[i];
-      int N = param.host_seqlen_k.empty()
-          ? param.host_seqstart_k[i + 1] - param.host_seqstart_k[i]
-          : param.host_seqlen_k[i];
-      int K = param.K;
-      int Kv = param.Kv;
-      int G1q = param.Hq;
-      int G1kv = param.Hkv;
-
-      std::vector<ck::index_t> a_gs_ms_ks_lengths{1, G1q, M, K};
-      std::vector<ck::index_t> a_gs_ms_ks_strides{
-          0, param.q_strides[1], param.q_strides[0], param.q_strides[2]};
-
-      std::vector<ck::index_t> b0_gs_ns_ks_lengths{1, G1kv, N, K};
-      std::vector<ck::index_t> b0_gs_ns_ks_strides{
-          0, param.k_strides[1], param.k_strides[0], param.k_strides[2]};
-
-      // to be changed to b1_gs_ns_os_lengths
-      std::vector<ck::index_t> b1_gs_os_ns_lengths{1, G1kv, Kv, N};
-      std::vector<ck::index_t> b1_gs_os_ns_strides{
-          0, param.v_strides[1], param.v_strides[2], param.v_strides[0]};
-
-      std::vector<ck::index_t> c_gs_ms_os_lengths{1, G1q, M, Kv};
-      std::vector<ck::index_t> c_gs_ms_os_strides{
-          0, param.out_strides[1], param.out_strides[0], param.out_strides[2]};
-
-      std::vector<ck::index_t> d_gs_ms_ns_lengths;
-      std::vector<ck::index_t> d_gs_ms_ns_strides;
-
-      if constexpr (has_attn_bias) {
-        d_gs_ms_ns_lengths = {1, G1q, M, N};
-        d_gs_ms_ns_strides = {
-            0,
-            param.attn_bias_strides[0],
-            param.attn_bias_strides[1],
-            param.attn_bias_strides[2]};
-      } else {
-        d_gs_ms_ns_lengths = {1, 1, 1, 1};
-        d_gs_ms_ns_strides = {0, 0, 0, 0};
-      };
-
-      problem_descs.push_back(
-          {a_gs_ms_ks_lengths,
-           a_gs_ms_ks_strides,
-           b0_gs_ns_ks_lengths,
-           b0_gs_ns_ks_strides,
-           b1_gs_os_ns_lengths,
-           b1_gs_os_ns_strides,
-           c_gs_ms_os_lengths,
-           c_gs_ms_os_strides,
-           d_gs_ms_ns_lengths,
-           d_gs_ms_ns_strides,
-           {}, // acc1_bias_gs_ms_os_lengths
-           {}}); // acc1_bias_gs_ms_os_strides
-    }
-
-    float alpha = param.scale;
-
-    auto a_element_op = AElementOp{};
-    auto b0_element_op = B0ElementOp{};
-    auto acc0_element_op = Acc0ElementOp{alpha};
-    auto b1_element_op = B1ElementOp{};
-    auto c_element_op = CElementOp{};
-
-    auto op = DeviceOpInstance{};
-    auto invoker = op.MakeInvoker();
-
-    auto arg_ptr = op.MakeArgumentPointer(
-        param.q_ptrs,
-        param.k_ptrs,
-        param.v_ptrs,
-        param.out_ptrs,
-        param.attn_bias_ptrs,
-        {}, // p_acc1_biases
-        problem_descs,
-        a_element_op,
-        b0_element_op,
-        acc0_element_op,
-        b1_element_op,
-        c_element_op);
-
-    auto sizeInBytes = op.GetWorkSpaceSize(arg_ptr.get());
-
-    SimpleDeviceMem workspace(sizeInBytes);
-
-    op.SetWorkSpacePointer(arg_ptr.get(), workspace.GetDeviceBuffer());
-
-    if (!op.IsSupportedArgument(arg_ptr.get())) {
-      std::ostringstream ostr;
-
-      ostr << op.GetTypeString() << " does not support this problem";
-
-      throw std::runtime_error(ostr.str());
-    }
-
-    (void)invoker.Run(arg_ptr.get(), StreamConfig{stream, false});
-  };
-};
-
-template <typename scalar_t, int32_t custom_mask_type, bool has_attn_bias>
-void run_grouped_infer_masktype_attnbias_dispatched(
-    GroupedForwardParams& param,
-    hipStream_t stream) {
-  grouped_infer_masktype_attnbias_dispatched<
-      scalar_t,
-      custom_mask_type,
-      has_attn_bias>::Run(param, stream);
-};
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_infer_bp16.cpp b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_infer_bp16.cpp
deleted file mode 100644
index ef1014398..000000000
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_infer_bp16.cpp
+++ /dev/null
@@ -1,63 +0,0 @@
-/*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
- *
- * This source code is licensed under the BSD-style license found in the
- * LICENSE file in the root directory of this source tree.
- */
-#include <ck/ck.hpp>
-#include <stdexcept>
-
-#include "ck_bool_switch.h"
-#include "ck_fmha_grouped_infer.h"
-
-extern template void run_grouped_infer_masktype_attnbias_dispatched<
-    ck::bhalf_t,
-    0,
-    true>(GroupedForwardParams& param, hipStream_t stream);
-
-extern template void run_grouped_infer_masktype_attnbias_dispatched<
-    ck::bhalf_t,
-    0,
-    false>(GroupedForwardParams& param, hipStream_t stream);
-
-extern template void run_grouped_infer_masktype_attnbias_dispatched<
-    ck::bhalf_t,
-    1,
-    true>(GroupedForwardParams& param, hipStream_t stream);
-
-extern template void run_grouped_infer_masktype_attnbias_dispatched<
-    ck::bhalf_t,
-    1,
-    false>(GroupedForwardParams& param, hipStream_t stream);
-
-extern template void run_grouped_infer_masktype_attnbias_dispatched<
-    ck::bhalf_t,
-    2,
-    true>(GroupedForwardParams& param, hipStream_t stream);
-
-extern template void run_grouped_infer_masktype_attnbias_dispatched<
-    ck::bhalf_t,
-    2,
-    false>(GroupedForwardParams& param, hipStream_t stream);
-
-void grouped_infer_bp16(GroupedForwardParams& param, hipStream_t stream) {
-  BOOL_SWITCH_1(param.has_attn_bias, HAS_ATTN_BIAS, [&] {
-    if (param.custom_mask_type == 0)
-      run_grouped_infer_masktype_attnbias_dispatched<
-          ck::bhalf_t,
-          0,
-          HAS_ATTN_BIAS>(param, stream);
-    else if (param.custom_mask_type == 1)
-      run_grouped_infer_masktype_attnbias_dispatched<
-          ck::bhalf_t,
-          1,
-          HAS_ATTN_BIAS>(param, stream);
-    else if (param.custom_mask_type == 2)
-      run_grouped_infer_masktype_attnbias_dispatched<
-          ck::bhalf_t,
-          2,
-          HAS_ATTN_BIAS>(param, stream);
-    else
-      throw std::runtime_error("Invalid custom_mask_type value");
-  });
-};
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_infer_fp16.cpp b/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_infer_fp16.cpp
deleted file mode 100644
index 7fa075c85..000000000
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_grouped_infer_fp16.cpp
+++ /dev/null
@@ -1,63 +0,0 @@
-/*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
- *
- * This source code is licensed under the BSD-style license found in the
- * LICENSE file in the root directory of this source tree.
- */
-#include <ck/ck.hpp>
-#include <stdexcept>
-
-#include "ck_bool_switch.h"
-#include "ck_fmha_grouped_infer.h"
-
-extern template void run_grouped_infer_masktype_attnbias_dispatched<
-    ck::half_t,
-    0,
-    true>(GroupedForwardParams& param, hipStream_t stream);
-
-extern template void run_grouped_infer_masktype_attnbias_dispatched<
-    ck::half_t,
-    0,
-    false>(GroupedForwardParams& param, hipStream_t stream);
-
-extern template void run_grouped_infer_masktype_attnbias_dispatched<
-    ck::half_t,
-    1,
-    true>(GroupedForwardParams& param, hipStream_t stream);
-
-extern template void run_grouped_infer_masktype_attnbias_dispatched<
-    ck::half_t,
-    1,
-    false>(GroupedForwardParams& param, hipStream_t stream);
-
-extern template void run_grouped_infer_masktype_attnbias_dispatched<
-    ck::half_t,
-    2,
-    true>(GroupedForwardParams& param, hipStream_t stream);
-
-extern template void run_grouped_infer_masktype_attnbias_dispatched<
-    ck::half_t,
-    2,
-    false>(GroupedForwardParams& param, hipStream_t stream);
-
-void grouped_infer_fp16(GroupedForwardParams& param, hipStream_t stream) {
-  BOOL_SWITCH_1(param.has_attn_bias, HAS_ATTN_BIAS, [&] {
-    if (param.custom_mask_type == 0)
-      run_grouped_infer_masktype_attnbias_dispatched<
-          ck::half_t,
-          0,
-          HAS_ATTN_BIAS>(param, stream);
-    else if (param.custom_mask_type == 1)
-      run_grouped_infer_masktype_attnbias_dispatched<
-          ck::half_t,
-          1,
-          HAS_ATTN_BIAS>(param, stream);
-    else if (param.custom_mask_type == 2)
-      run_grouped_infer_masktype_attnbias_dispatched<
-          ck::half_t,
-          2,
-          HAS_ATTN_BIAS>(param, stream);
-    else
-      throw std::runtime_error("Invalid custom_mask_type value");
-  });
-};
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_infer_gemm_constants.h b/xformers/csrc/attention/hip_fmha/ck_fmha_infer_gemm_constants.h
deleted file mode 100644
index 0b7708fe0..000000000
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_infer_gemm_constants.h
+++ /dev/null
@@ -1,106 +0,0 @@
-/*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
- *
- * This source code is licensed under the BSD-style license found in the
- * LICENSE file in the root directory of this source tree.
- */
-#pragma once
-
-#include <ck/ck.hpp>
-#include "ck_fmha_op_helper.h"
-
-// list the template parameters that will not be tuned,
-// the commented lines gives the tunable template parameters
-// clang-format off
-struct GemmOpConstantsBatchedInfer {
-  static constexpr ck::index_t NumGemmKPrefetchStage = 1;
-  static constexpr ck::index_t BlockSize = 256;
-  static constexpr ck::index_t MPerBlock = 128;
-  static constexpr ck::index_t NPerBlock = 128;
-  static constexpr ck::index_t KPerBlock = 32;
-  // static constexpr ck::index_t Gemm1NPerBlock;
-  static constexpr ck::index_t Gemm1KPerBlock = 32;
-  static constexpr ck::index_t AK1 = 8;
-  static constexpr ck::index_t BK1 = 8;
-  static constexpr ck::index_t B1K1 = 2;
-  static constexpr ck::index_t MPerXDL = 32;
-  static constexpr ck::index_t NPerXDL = 32;
-  static constexpr ck::index_t MXdlPerWave = 1;
-  static constexpr ck::index_t NXdlPerWave = 4;
-  // static constexpr ck::index_t Gemm1NXdlPerWave;
-  using ABlockTransferThreadClusterLengths_AK0_M_AK1 = S<4, 64, 1>;
-  using ABlockTransferThreadClusterArrangeOrder = S<1, 0, 2>;
-  using ABlockTransferSrcAccessOrder = S<1, 0, 2>;
-  static constexpr ck::index_t ABlockTransferSrcVectorDim = 2;
-  // static constexpr ck::index_t ABlockTransferSrcScalarPerVector;
-  static constexpr ck::index_t ABlockTransferDstScalarPerVector_AK1 = 4;
-  static constexpr bool ABlockLdsExtraM = true;
-  using BBlockTransferThreadClusterLengths_BK0_N_BK1 = S<4, 64, 1>;
-  using BBlockTransferThreadClusterArrangeOrder = S<1, 0, 2>;
-  using BBlockTransferSrcAccessOrder = S<1, 0, 2>;
-  static constexpr ck::index_t BBlockTransferSrcVectorDim = 2;
-  // static constexpr ck::index_t BBlockTransferSrcScalarPerVector;
-  static constexpr ck::index_t BBlockTransferDstScalarPerVector_BK1 = 4;
-  static constexpr bool BBlockLdsExtraN = true;
-  // static constexpr ck::index_t Acc0BiasTransferSrcScalarPerVector;
-  using B1BlockTransferThreadClusterLengths_BK0_N_BK1 = S<16, 16, 1>;
-  using B1BlockTransferThreadClusterArrangeOrder = S<0, 2, 1>;
-  using B1BlockTransferSrcAccessOrder = S<0, 2, 1>;
-  static constexpr ck::index_t B1BlockTransferSrcVectorDim = 1;
-  // static constexpr ck::index_t B1BlockTransferSrcScalarPerVector;
-  static constexpr ck::index_t B1BlockTransferDstScalarPerVector_BK1 = 2;
-  static constexpr bool B1BlockLdsExtraN = false;
-  static constexpr ck::index_t CShuffleMXdlPerWavePerShuffle = 1;
-  // static constexpr ck::index_t CShuffleNXdlPerWavePerShuffle;
-  using CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock = S<1, 8, 1, 32>;
-  // static constexpr ck::index_t CShuffleBlockTransferScalarPerVector_NPerBlock;
-};
-//clang-format on
-
-// list the template parameters that will not be tuned,
-// the commented lines gives the tunable template parameters
-// clang-format off
-struct GemmOpConstantsGroupedInfer {
-  static constexpr ck::index_t NumGemmKPrefetchStage = 1;
-  static constexpr ck::index_t BlockSize = 256;
-  static constexpr ck::index_t MPerBlock = 128;
-  static constexpr ck::index_t NPerBlock = 128;
-  static constexpr ck::index_t KPerBlock = 32;
-  // static constexpr ck::index_t Gemm1NPerBlock;
-  static constexpr ck::index_t Gemm1KPerBlock = 32;
-  static constexpr ck::index_t AK1 = 8;
-  static constexpr ck::index_t BK1 = 8;
-  static constexpr ck::index_t B1K1 = 2;
-  static constexpr ck::index_t MPerXDL = 32;
-  static constexpr ck::index_t NPerXDL = 32;
-  static constexpr ck::index_t MXdlPerWave = 1;
-  static constexpr ck::index_t NXdlPerWave = 4;
-  // static constexpr ck::index_t Gemm1NXdlPerWave;
-  using ABlockTransferThreadClusterLengths_AK0_M_AK1 = S<4, 64, 1>;
-  using ABlockTransferThreadClusterArrangeOrder = S<1, 0, 2>;
-  using ABlockTransferSrcAccessOrder = S<1, 0, 2>;
-  static constexpr ck::index_t ABlockTransferSrcVectorDim = 2;
-  // static constexpr ck::index_t ABlockTransferSrcScalarPerVector,
-  static constexpr ck::index_t ABlockTransferDstScalarPerVector_AK1 = 4;
-  static constexpr bool ABlockLdsExtraM = true;
-  using BBlockTransferThreadClusterLengths_BK0_N_BK1 = S<4, 64, 1>;
-  using BBlockTransferThreadClusterArrangeOrder = S<1, 0, 2>;
-  using BBlockTransferSrcAccessOrder = S<1, 0, 2>;
-  static constexpr ck::index_t BBlockTransferSrcVectorDim = 2;
-  // static constexpr ck::index_t BBlockTransferSrcScalarPerVector;
-  static constexpr ck::index_t BBlockTransferDstScalarPerVector_BK1 = 4;
-  static constexpr bool BBlockLdsExtraN = true;
-  // static constexpr ck::index_t Acc0BiasTransferSrcScalarPerVector;
-  using B1BlockTransferThreadClusterLengths_BK0_N_BK1 = S<16, 16, 1>;
-  using B1BlockTransferThreadClusterArrangeOrder = S<0, 2, 1>;
-  using B1BlockTransferSrcAccessOrder = S<0, 2, 1>;
-  static constexpr ck::index_t B1BlockTransferSrcVectorDim = 1;
-  // static constexpr ck::index_t B1BlockTransferSrcScalarPerVector;
-  static constexpr ck::index_t B1BlockTransferDstScalarPerVector_BK1 = 2;
-  static constexpr bool B1BlockLdsExtraN = false;
-  static constexpr ck::index_t CShuffleMXdlPerWavePerShuffle = 1;
-  // static constexpr ck::index_t CShuffleNXdlPerWavePerShuffle;
-  using CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock = S<1, 8, 1, 32>;
-  // static constexpr ck::index_t CShuffleBlockTransferScalarPerVector_NPerBlock;
-};
-// clang-format on
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_op_helper.h b/xformers/csrc/attention/hip_fmha/ck_fmha_op_helper.h
deleted file mode 100644
index 24ab800e9..000000000
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_op_helper.h
+++ /dev/null
@@ -1,49 +0,0 @@
-/*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
- *
- * This source code is licensed under the BSD-style license found in the
- * LICENSE file in the root directory of this source tree.
- */
-#pragma once
-
-#include <map>
-#include <mutex>
-
-#include <c10/hip/HIPCachingAllocator.h>
-#include <ck/utility/data_type.hpp>
-
-template <typename scalar_t>
-struct MaxVectorSizeForType {
-  static constexpr int value = 4;
-};
-
-template <>
-struct MaxVectorSizeForType<ck::half_t> {
-  static constexpr int value = 8;
-};
-
-template <>
-struct MaxVectorSizeForType<ck::bhalf_t> {
-  static constexpr int value = 8;
-};
-
-struct SimpleDeviceMem {
-  SimpleDeviceMem() = delete;
-  SimpleDeviceMem(size_t sizeInBytes) {
-    pData_ = c10::hip::HIPCachingAllocator::raw_alloc(sizeInBytes);
-  }
-  void* GetDeviceBuffer() {
-    return pData_;
-  }
-  ~SimpleDeviceMem() {
-    c10::cuda::HIPCachingAllocator::raw_delete(pData_);
-  }
-
-  void* pData_;
-};
-
-// useful aliasing for making the codes easy
-template <ck::index_t... Is>
-using S = ck::Sequence<Is...>;
-
-using F32 = float;
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_params.h b/xformers/csrc/attention/hip_fmha/ck_fmha_params.h
deleted file mode 100644
index 918126591..000000000
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_params.h
+++ /dev/null
@@ -1,212 +0,0 @@
-/*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
- *
- * This source code is licensed under the BSD-style license found in the
- * LICENSE file in the root directory of this source tree.
- */
-#pragma once
-
-#include <array>
-#include <cstdint>
-
-struct BatchedInferParams {
-  int B; // batch size
-  int M; // seq_len for Query
-  int N; // seq_len for Key and Value
-  int Hq; // number of heads for Query
-  int Hkv; // number of heads for Key and Value
-  int K; // embed_dim for Query and Key
-  int Kv; // embed_dim for Value
-
-  float scale;
-  bool has_attn_bias;
-
-  // BMHK mode strides
-  std::array<int, 4> q_strides;
-  std::array<int, 4> k_strides;
-  std::array<int, 4> v_strides;
-  std::array<int, 4> out_strides;
-  std::array<int, 4> attn_bias_strides; // 4d tensor_view [B, H, M, N]
-
-  const void* q_ptr;
-  const void* k_ptr;
-  const void* v_ptr;
-  const void* attn_bias_ptr;
-
-  uint8_t custom_mask_type;
-
-  void* out_ptr;
-};
-
-struct BatchedForwardParams : public BatchedInferParams {
-  bool use_dropout;
-  bool compute_logsumexp;
-
-  float dropout_prob;
-  int64_t philox_seed;
-  int64_t philox_offset;
-
-  // completely contiguous
-  void* logsumexp_ptr;
-};
-
-struct GroupedInferParams {
-  int num_batches;
-  int M; // total seq_len for all queries in the batch
-  int N; // total seq_len for all keys/values in the batch
-  int Hq; // number of heads for Query
-  int Hkv; // number of heads for Key and Value
-  int K; // embed_dim for Query and Key
-  int Kv; // embed_dim for Value
-
-  int max_seqlen_q;
-
-  std::vector<int> host_seqstart_q;
-  std::vector<int> host_seqstart_k;
-  std::vector<int> host_seqlen_k;
-
-  float scale;
-  bool has_attn_bias;
-
-  // MHK mode strides, last-dim contiguous
-  std::array<int, 3> q_strides;
-  std::array<int, 3> k_strides;
-  std::array<int, 3> v_strides;
-  std::array<int, 3> out_strides;
-
-  // 4d tensor view [B, H, M, N]
-  std::array<int, 4> attn_bias_strides;
-
-  std::vector<const void*> q_ptrs;
-  std::vector<const void*> k_ptrs;
-  std::vector<const void*> v_ptrs;
-  std::vector<const void*> attn_bias_ptrs;
-  std::vector<void*> out_ptrs;
-
-  uint8_t custom_mask_type;
-};
-
-struct GroupedForwardParams : public GroupedInferParams {
-  bool use_dropout;
-  bool compute_logsumexp;
-
-  float dropout_prob;
-  int64_t philox_seed;
-  int64_t philox_offset;
-
-  // completely contiguous
-  std::vector<void*> logsumexp_ptrs;
-
-  // TODO: need remove this after dev-op fix
-  std::vector<void*> randvals_ptrs;
-};
-
-struct BatchedBackwardParams {
-  int B; // batch size
-  int M; // seq_len for Query
-  int N; // seq_len for Key and Value
-  int Hq; // number of heads for Query
-  int Hkv; // number of heads for Key and Value
-  int K; // embed_dim for Query and Key
-  int Kv; // embed_dim for Value
-
-  float scale;
-  bool has_attn_bias;
-  bool bias_has_grad;
-
-  bool use_fp32_qkv_grad;
-  bool is_mqa_gqa;
-
-  // BMHK mode strides, last-dim contiguous
-  std::array<int, 4> q_strides;
-  std::array<int, 4> k_strides;
-  std::array<int, 4> v_strides;
-  std::array<int, 4> attn_bias_strides; // 4d tensor_view [B, H, M, N]
-  std::array<int, 4> out_strides;
-
-  std::array<int, 4> tmp_grad_k_strides;
-  std::array<int, 4> tmp_grad_v_strides;
-
-  const void* q_ptr;
-  const void* k_ptr;
-  const void* v_ptr;
-  const void* attn_bias_ptr;
-  const void* grad_out_ptr;
-  const void* out_ptr;
-
-  uint8_t custom_mask_type;
-
-  void* grad_q_ptr;
-  void* grad_k_ptr;
-  void* grad_v_ptr;
-  void* grad_bias_ptr;
-
-  float dropout_prob;
-  int64_t philox_seed;
-  int64_t philox_offset;
-
-  // BHM mode lengths, completely contiguous
-  const void* logsumexp_ptr;
-};
-
-struct GroupedBackwardParams {
-  int num_batches;
-  int M; // total seq_len for all queries in the batch
-  int N; // total seq_len for all keys/values in the batch
-  int Hq; // number of heads for Query
-  int Hkv; // number of heads for Key and Value
-  int K; // embed_dim for Query and Key
-  int Kv; // embed_dim for Value
-
-  int max_seqlen_q;
-
-  std::vector<int> host_seqstart_q;
-  std::vector<int> host_seqstart_k;
-  std::vector<int> host_seqlen_k;
-
-  float scale;
-  bool has_attn_bias;
-  bool bias_has_grad;
-
-  bool use_fp32_qkv_grad;
-  bool is_mqa_gqa;
-
-  // MHK mode strides, last-dim contiguous
-  std::array<int, 3> q_strides;
-  std::array<int, 3> k_strides;
-  std::array<int, 3> v_strides;
-  std::array<int, 3> out_strides;
-  // 4d tensor view [B, H, M, N]
-  std::array<int, 4> attn_bias_strides;
-
-  std::array<int, 3> tmp_grad_k_strides;
-  std::array<int, 3> tmp_grad_v_strides;
-
-  std::vector<const void*> q_ptrs;
-  std::vector<const void*> k_ptrs;
-  std::vector<const void*> v_ptrs;
-  std::vector<const void*> attn_bias_ptrs;
-  std::vector<const void*> grad_out_ptrs;
-  std::vector<const void*> out_ptrs;
-
-  // used by the light_v2 kernel
-  // TODO use these as workspace
-  std::vector<void*> ydotdy_ptrs;
-
-  uint8_t custom_mask_type;
-
-  std::vector<void*> grad_q_ptrs;
-  std::vector<void*> grad_k_ptrs;
-  std::vector<void*> grad_v_ptrs;
-  std::vector<void*> grad_bias_ptrs;
-
-  float dropout_prob;
-  int64_t philox_seed;
-  int64_t philox_offset;
-
-  // BHM mode lengths, completely contiguous
-  std::vector<const void*> logsumexp_ptrs;
-
-  // TODO: need remove this after dev-op fix
-  std::vector<void*> randvals_ptrs;
-};
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_test.cpp b/xformers/csrc/attention/hip_fmha/ck_fmha_test.cpp
index f97c8dd66..08825f1a8 100644
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_test.cpp
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_test.cpp
@@ -16,15 +16,6 @@ bool is_ck_fmha_available(double val) {
   return (true);
 };
 
-// For checking if ck-tiled kernel is used
-bool is_ck_tiled_used() {
-#if defined(USE_CK_TILED_KERNEL)
-  return (true);
-#else
-  return (false);
-#endif
-};
-
 } // namespace
 
 TORCH_LIBRARY_FRAGMENT(xformers, m) {
@@ -33,9 +24,4 @@ TORCH_LIBRARY_FRAGMENT(xformers, m) {
   m.impl(
       TORCH_SELECTIVE_NAME("xformers::is_ck_fmha_available"),
       TORCH_FN(is_ck_fmha_available));
-
-  m.def(TORCH_SELECTIVE_SCHEMA("xformers::is_ck_tiled_used() -> bool"));
-  m.impl(
-      TORCH_SELECTIVE_NAME("xformers::is_ck_tiled_used"),
-      TORCH_FN(is_ck_tiled_used));
 }
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_bp16_masktype_0_no_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_bp16_masktype_0_no_attnbias.cpp
deleted file mode 100644
index 509f83827..000000000
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_bp16_masktype_0_no_attnbias.cpp
+++ /dev/null
@@ -1,14 +0,0 @@
-/*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
- *
- * This source code is licensed under the BSD-style license found in the
- * LICENSE file in the root directory of this source tree.
- */
-#include <ck/ck.hpp>
-#include "ck_fmha_batched_backward.h"
-
-template void run_batched_backward_masktype_attnbias_dispatched<
-    ck::bhalf_t,
-    0,
-    false,
-    false>(BatchedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_bp16_masktype_0_no_attnbias_fp32_grad.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_bp16_masktype_0_no_attnbias_fp32_grad.cpp
deleted file mode 100644
index 239204ad2..000000000
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_bp16_masktype_0_no_attnbias_fp32_grad.cpp
+++ /dev/null
@@ -1,14 +0,0 @@
-/*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
- *
- * This source code is licensed under the BSD-style license found in the
- * LICENSE file in the root directory of this source tree.
- */
-#include <ck/ck.hpp>
-#include "ck_fmha_batched_backward.h"
-
-template void run_batched_backward_masktype_attnbias_dispatched<
-    ck::bhalf_t,
-    0,
-    false,
-    true>(BatchedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_bp16_masktype_0_with_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_bp16_masktype_0_with_attnbias.cpp
deleted file mode 100644
index 06c4370ff..000000000
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_bp16_masktype_0_with_attnbias.cpp
+++ /dev/null
@@ -1,14 +0,0 @@
-/*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
- *
- * This source code is licensed under the BSD-style license found in the
- * LICENSE file in the root directory of this source tree.
- */
-#include <ck/ck.hpp>
-#include "ck_fmha_batched_backward.h"
-
-template void run_batched_backward_masktype_attnbias_dispatched<
-    ck::bhalf_t,
-    0,
-    true,
-    false>(BatchedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_bp16_masktype_0_with_attnbias_fp32_grad.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_bp16_masktype_0_with_attnbias_fp32_grad.cpp
deleted file mode 100644
index c5263f167..000000000
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_bp16_masktype_0_with_attnbias_fp32_grad.cpp
+++ /dev/null
@@ -1,14 +0,0 @@
-/*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
- *
- * This source code is licensed under the BSD-style license found in the
- * LICENSE file in the root directory of this source tree.
- */
-#include <ck/ck.hpp>
-#include "ck_fmha_batched_backward.h"
-
-template void run_batched_backward_masktype_attnbias_dispatched<
-    ck::bhalf_t,
-    0,
-    true,
-    true>(BatchedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_bp16_masktype_1_no_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_bp16_masktype_1_no_attnbias.cpp
deleted file mode 100644
index 706bf4146..000000000
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_bp16_masktype_1_no_attnbias.cpp
+++ /dev/null
@@ -1,14 +0,0 @@
-/*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
- *
- * This source code is licensed under the BSD-style license found in the
- * LICENSE file in the root directory of this source tree.
- */
-#include <ck/ck.hpp>
-#include "ck_fmha_batched_backward.h"
-
-template void run_batched_backward_masktype_attnbias_dispatched<
-    ck::bhalf_t,
-    1,
-    false,
-    false>(BatchedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_bp16_masktype_1_no_attnbias_fp32_grad.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_bp16_masktype_1_no_attnbias_fp32_grad.cpp
deleted file mode 100644
index 91aac31d9..000000000
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_bp16_masktype_1_no_attnbias_fp32_grad.cpp
+++ /dev/null
@@ -1,14 +0,0 @@
-/*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
- *
- * This source code is licensed under the BSD-style license found in the
- * LICENSE file in the root directory of this source tree.
- */
-#include <ck/ck.hpp>
-#include "ck_fmha_batched_backward.h"
-
-template void run_batched_backward_masktype_attnbias_dispatched<
-    ck::bhalf_t,
-    1,
-    false,
-    true>(BatchedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_bp16_masktype_1_with_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_bp16_masktype_1_with_attnbias.cpp
deleted file mode 100644
index c882648e5..000000000
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_bp16_masktype_1_with_attnbias.cpp
+++ /dev/null
@@ -1,14 +0,0 @@
-/*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
- *
- * This source code is licensed under the BSD-style license found in the
- * LICENSE file in the root directory of this source tree.
- */
-#include <ck/ck.hpp>
-#include "ck_fmha_batched_backward.h"
-
-template void run_batched_backward_masktype_attnbias_dispatched<
-    ck::bhalf_t,
-    1,
-    true,
-    false>(BatchedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_bp16_masktype_1_with_attnbias_fp32_grad.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_bp16_masktype_1_with_attnbias_fp32_grad.cpp
deleted file mode 100644
index 5ce517a80..000000000
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_bp16_masktype_1_with_attnbias_fp32_grad.cpp
+++ /dev/null
@@ -1,14 +0,0 @@
-/*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
- *
- * This source code is licensed under the BSD-style license found in the
- * LICENSE file in the root directory of this source tree.
- */
-#include <ck/ck.hpp>
-#include "ck_fmha_batched_backward.h"
-
-template void run_batched_backward_masktype_attnbias_dispatched<
-    ck::bhalf_t,
-    1,
-    true,
-    true>(BatchedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_bp16_masktype_2_no_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_bp16_masktype_2_no_attnbias.cpp
deleted file mode 100644
index 983538314..000000000
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_bp16_masktype_2_no_attnbias.cpp
+++ /dev/null
@@ -1,14 +0,0 @@
-/*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
- *
- * This source code is licensed under the BSD-style license found in the
- * LICENSE file in the root directory of this source tree.
- */
-#include <ck/ck.hpp>
-#include "ck_fmha_batched_backward.h"
-
-template void run_batched_backward_masktype_attnbias_dispatched<
-    ck::bhalf_t,
-    2,
-    false,
-    false>(BatchedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_bp16_masktype_2_no_attnbias_fp32_grad.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_bp16_masktype_2_no_attnbias_fp32_grad.cpp
deleted file mode 100644
index 3202979ac..000000000
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_bp16_masktype_2_no_attnbias_fp32_grad.cpp
+++ /dev/null
@@ -1,14 +0,0 @@
-/*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
- *
- * This source code is licensed under the BSD-style license found in the
- * LICENSE file in the root directory of this source tree.
- */
-#include <ck/ck.hpp>
-#include "ck_fmha_batched_backward.h"
-
-template void run_batched_backward_masktype_attnbias_dispatched<
-    ck::bhalf_t,
-    2,
-    false,
-    true>(BatchedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_bp16_masktype_2_with_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_bp16_masktype_2_with_attnbias.cpp
deleted file mode 100644
index 68b4d782a..000000000
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_bp16_masktype_2_with_attnbias.cpp
+++ /dev/null
@@ -1,14 +0,0 @@
-/*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
- *
- * This source code is licensed under the BSD-style license found in the
- * LICENSE file in the root directory of this source tree.
- */
-#include <ck/ck.hpp>
-#include "ck_fmha_batched_backward.h"
-
-template void run_batched_backward_masktype_attnbias_dispatched<
-    ck::bhalf_t,
-    2,
-    true,
-    false>(BatchedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_bp16_masktype_2_with_attnbias_fp32_grad.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_bp16_masktype_2_with_attnbias_fp32_grad.cpp
deleted file mode 100644
index a7786f596..000000000
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_bp16_masktype_2_with_attnbias_fp32_grad.cpp
+++ /dev/null
@@ -1,14 +0,0 @@
-/*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
- *
- * This source code is licensed under the BSD-style license found in the
- * LICENSE file in the root directory of this source tree.
- */
-#include <ck/ck.hpp>
-#include "ck_fmha_batched_backward.h"
-
-template void run_batched_backward_masktype_attnbias_dispatched<
-    ck::bhalf_t,
-    2,
-    true,
-    true>(BatchedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_fp16_masktype_0_no_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_fp16_masktype_0_no_attnbias.cpp
deleted file mode 100644
index 8205af6fa..000000000
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_fp16_masktype_0_no_attnbias.cpp
+++ /dev/null
@@ -1,14 +0,0 @@
-/*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
- *
- * This source code is licensed under the BSD-style license found in the
- * LICENSE file in the root directory of this source tree.
- */
-#include <ck/ck.hpp>
-#include "ck_fmha_batched_backward.h"
-
-template void run_batched_backward_masktype_attnbias_dispatched<
-    ck::half_t,
-    0,
-    false,
-    false>(BatchedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_fp16_masktype_0_no_attnbias_fp32_grad.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_fp16_masktype_0_no_attnbias_fp32_grad.cpp
deleted file mode 100644
index b69fdda9b..000000000
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_fp16_masktype_0_no_attnbias_fp32_grad.cpp
+++ /dev/null
@@ -1,14 +0,0 @@
-/*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
- *
- * This source code is licensed under the BSD-style license found in the
- * LICENSE file in the root directory of this source tree.
- */
-#include <ck/ck.hpp>
-#include "ck_fmha_batched_backward.h"
-
-template void run_batched_backward_masktype_attnbias_dispatched<
-    ck::half_t,
-    0,
-    false,
-    true>(BatchedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_fp16_masktype_0_with_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_fp16_masktype_0_with_attnbias.cpp
deleted file mode 100644
index 786b294ee..000000000
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_fp16_masktype_0_with_attnbias.cpp
+++ /dev/null
@@ -1,14 +0,0 @@
-/*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
- *
- * This source code is licensed under the BSD-style license found in the
- * LICENSE file in the root directory of this source tree.
- */
-#include <ck/ck.hpp>
-#include "ck_fmha_batched_backward.h"
-
-template void run_batched_backward_masktype_attnbias_dispatched<
-    ck::half_t,
-    0,
-    true,
-    false>(BatchedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_fp16_masktype_0_with_attnbias_fp32_grad.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_fp16_masktype_0_with_attnbias_fp32_grad.cpp
deleted file mode 100644
index 8bebad6d1..000000000
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_fp16_masktype_0_with_attnbias_fp32_grad.cpp
+++ /dev/null
@@ -1,14 +0,0 @@
-/*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
- *
- * This source code is licensed under the BSD-style license found in the
- * LICENSE file in the root directory of this source tree.
- */
-#include <ck/ck.hpp>
-#include "ck_fmha_batched_backward.h"
-
-template void run_batched_backward_masktype_attnbias_dispatched<
-    ck::half_t,
-    0,
-    true,
-    true>(BatchedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_fp16_masktype_1_no_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_fp16_masktype_1_no_attnbias.cpp
deleted file mode 100644
index 47bfbb6ba..000000000
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_fp16_masktype_1_no_attnbias.cpp
+++ /dev/null
@@ -1,14 +0,0 @@
-/*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
- *
- * This source code is licensed under the BSD-style license found in the
- * LICENSE file in the root directory of this source tree.
- */
-#include <ck/ck.hpp>
-#include "ck_fmha_batched_backward.h"
-
-template void run_batched_backward_masktype_attnbias_dispatched<
-    ck::half_t,
-    1,
-    false,
-    false>(BatchedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_fp16_masktype_1_no_attnbias_fp32_grad.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_fp16_masktype_1_no_attnbias_fp32_grad.cpp
deleted file mode 100644
index b3efcb0f6..000000000
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_fp16_masktype_1_no_attnbias_fp32_grad.cpp
+++ /dev/null
@@ -1,14 +0,0 @@
-/*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
- *
- * This source code is licensed under the BSD-style license found in the
- * LICENSE file in the root directory of this source tree.
- */
-#include <ck/ck.hpp>
-#include "ck_fmha_batched_backward.h"
-
-template void run_batched_backward_masktype_attnbias_dispatched<
-    ck::half_t,
-    1,
-    false,
-    true>(BatchedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_fp16_masktype_1_with_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_fp16_masktype_1_with_attnbias.cpp
deleted file mode 100644
index 366a1be0b..000000000
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_fp16_masktype_1_with_attnbias.cpp
+++ /dev/null
@@ -1,14 +0,0 @@
-/*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
- *
- * This source code is licensed under the BSD-style license found in the
- * LICENSE file in the root directory of this source tree.
- */
-#include <ck/ck.hpp>
-#include "ck_fmha_batched_backward.h"
-
-template void run_batched_backward_masktype_attnbias_dispatched<
-    ck::half_t,
-    1,
-    true,
-    false>(BatchedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_fp16_masktype_1_with_attnbias_fp32_grad.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_fp16_masktype_1_with_attnbias_fp32_grad.cpp
deleted file mode 100644
index a1b19853c..000000000
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_fp16_masktype_1_with_attnbias_fp32_grad.cpp
+++ /dev/null
@@ -1,16 +0,0 @@
-/*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
- *
- * This source code is licensed under the BSD-style license found in the
- * LICENSE file in the root directory of this source tree.
- */
-#include <ck/ck.hpp>
-#include <stdexcept>
-
-#include "ck_fmha_batched_backward.h"
-
-template void run_batched_backward_masktype_attnbias_dispatched<
-    ck::half_t,
-    1,
-    true,
-    true>(BatchedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_fp16_masktype_2_no_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_fp16_masktype_2_no_attnbias.cpp
deleted file mode 100644
index c764522f3..000000000
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_fp16_masktype_2_no_attnbias.cpp
+++ /dev/null
@@ -1,14 +0,0 @@
-/*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
- *
- * This source code is licensed under the BSD-style license found in the
- * LICENSE file in the root directory of this source tree.
- */
-#include <ck/ck.hpp>
-#include "ck_fmha_batched_backward.h"
-
-template void run_batched_backward_masktype_attnbias_dispatched<
-    ck::half_t,
-    2,
-    false,
-    false>(BatchedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_fp16_masktype_2_no_attnbias_fp32_grad.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_fp16_masktype_2_no_attnbias_fp32_grad.cpp
deleted file mode 100644
index 53e93ab40..000000000
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_fp16_masktype_2_no_attnbias_fp32_grad.cpp
+++ /dev/null
@@ -1,14 +0,0 @@
-/*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
- *
- * This source code is licensed under the BSD-style license found in the
- * LICENSE file in the root directory of this source tree.
- */
-#include <ck/ck.hpp>
-#include "ck_fmha_batched_backward.h"
-
-template void run_batched_backward_masktype_attnbias_dispatched<
-    ck::half_t,
-    2,
-    false,
-    true>(BatchedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_fp16_masktype_2_with_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_fp16_masktype_2_with_attnbias.cpp
deleted file mode 100644
index 135932bb6..000000000
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_fp16_masktype_2_with_attnbias.cpp
+++ /dev/null
@@ -1,14 +0,0 @@
-/*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
- *
- * This source code is licensed under the BSD-style license found in the
- * LICENSE file in the root directory of this source tree.
- */
-#include <ck/ck.hpp>
-#include "ck_fmha_batched_backward.h"
-
-template void run_batched_backward_masktype_attnbias_dispatched<
-    ck::half_t,
-    2,
-    true,
-    false>(BatchedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_fp16_masktype_2_with_attnbias_fp32_grad.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_fp16_masktype_2_with_attnbias_fp32_grad.cpp
deleted file mode 100644
index b36435a56..000000000
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_backward_fp16_masktype_2_with_attnbias_fp32_grad.cpp
+++ /dev/null
@@ -1,14 +0,0 @@
-/*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
- *
- * This source code is licensed under the BSD-style license found in the
- * LICENSE file in the root directory of this source tree.
- */
-#include <ck/ck.hpp>
-#include "ck_fmha_batched_backward.h"
-
-template void run_batched_backward_masktype_attnbias_dispatched<
-    ck::half_t,
-    2,
-    true,
-    true>(BatchedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_forward_bp16_masktype_0_no_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_forward_bp16_masktype_0_no_attnbias.cpp
deleted file mode 100644
index 61a34f3bd..000000000
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_forward_bp16_masktype_0_no_attnbias.cpp
+++ /dev/null
@@ -1,13 +0,0 @@
-/*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
- *
- * This source code is licensed under the BSD-style license found in the
- * LICENSE file in the root directory of this source tree.
- */
-#include <ck/ck.hpp>
-#include "ck_fmha_batched_forward.h"
-
-template void run_batched_forward_masktype_attnbias_dispatched<
-    ck::bhalf_t,
-    0,
-    false>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_forward_bp16_masktype_0_with_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_forward_bp16_masktype_0_with_attnbias.cpp
deleted file mode 100644
index 99ef697c7..000000000
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_forward_bp16_masktype_0_with_attnbias.cpp
+++ /dev/null
@@ -1,13 +0,0 @@
-/*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
- *
- * This source code is licensed under the BSD-style license found in the
- * LICENSE file in the root directory of this source tree.
- */
-#include <ck/ck.hpp>
-#include "ck_fmha_batched_forward.h"
-
-template void run_batched_forward_masktype_attnbias_dispatched<
-    ck::bhalf_t,
-    0,
-    true>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_forward_bp16_masktype_1_no_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_forward_bp16_masktype_1_no_attnbias.cpp
deleted file mode 100644
index 27d8f3389..000000000
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_forward_bp16_masktype_1_no_attnbias.cpp
+++ /dev/null
@@ -1,13 +0,0 @@
-/*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
- *
- * This source code is licensed under the BSD-style license found in the
- * LICENSE file in the root directory of this source tree.
- */
-#include <ck/ck.hpp>
-#include "ck_fmha_batched_forward.h"
-
-template void run_batched_forward_masktype_attnbias_dispatched<
-    ck::bhalf_t,
-    1,
-    false>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_forward_bp16_masktype_1_with_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_forward_bp16_masktype_1_with_attnbias.cpp
deleted file mode 100644
index 9b81f64c1..000000000
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_forward_bp16_masktype_1_with_attnbias.cpp
+++ /dev/null
@@ -1,13 +0,0 @@
-/*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
- *
- * This source code is licensed under the BSD-style license found in the
- * LICENSE file in the root directory of this source tree.
- */
-#include <ck/ck.hpp>
-#include "ck_fmha_batched_forward.h"
-
-template void run_batched_forward_masktype_attnbias_dispatched<
-    ck::bhalf_t,
-    1,
-    true>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_forward_bp16_masktype_2_no_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_forward_bp16_masktype_2_no_attnbias.cpp
deleted file mode 100644
index 014b077e3..000000000
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_forward_bp16_masktype_2_no_attnbias.cpp
+++ /dev/null
@@ -1,13 +0,0 @@
-/*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
- *
- * This source code is licensed under the BSD-style license found in the
- * LICENSE file in the root directory of this source tree.
- */
-#include <ck/ck.hpp>
-#include "ck_fmha_batched_forward.h"
-
-template void run_batched_forward_masktype_attnbias_dispatched<
-    ck::bhalf_t,
-    2,
-    false>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_forward_bp16_masktype_2_with_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_forward_bp16_masktype_2_with_attnbias.cpp
deleted file mode 100644
index 9a5b10848..000000000
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_forward_bp16_masktype_2_with_attnbias.cpp
+++ /dev/null
@@ -1,13 +0,0 @@
-/*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
- *
- * This source code is licensed under the BSD-style license found in the
- * LICENSE file in the root directory of this source tree.
- */
-#include <ck/ck.hpp>
-#include "ck_fmha_batched_forward.h"
-
-template void run_batched_forward_masktype_attnbias_dispatched<
-    ck::bhalf_t,
-    2,
-    true>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_forward_fp16_masktype_0_no_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_forward_fp16_masktype_0_no_attnbias.cpp
deleted file mode 100644
index 52a38e71f..000000000
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_forward_fp16_masktype_0_no_attnbias.cpp
+++ /dev/null
@@ -1,13 +0,0 @@
-/*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
- *
- * This source code is licensed under the BSD-style license found in the
- * LICENSE file in the root directory of this source tree.
- */
-#include <ck/ck.hpp>
-#include "ck_fmha_batched_forward.h"
-
-template void run_batched_forward_masktype_attnbias_dispatched<
-    ck::half_t,
-    0,
-    false>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_forward_fp16_masktype_0_with_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_forward_fp16_masktype_0_with_attnbias.cpp
deleted file mode 100644
index b96463d83..000000000
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_forward_fp16_masktype_0_with_attnbias.cpp
+++ /dev/null
@@ -1,13 +0,0 @@
-/*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
- *
- * This source code is licensed under the BSD-style license found in the
- * LICENSE file in the root directory of this source tree.
- */
-#include <ck/ck.hpp>
-#include "ck_fmha_batched_forward.h"
-
-template void run_batched_forward_masktype_attnbias_dispatched<
-    ck::half_t,
-    0,
-    true>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_forward_fp16_masktype_1_no_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_forward_fp16_masktype_1_no_attnbias.cpp
deleted file mode 100644
index dd4a8d4e2..000000000
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_forward_fp16_masktype_1_no_attnbias.cpp
+++ /dev/null
@@ -1,13 +0,0 @@
-/*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
- *
- * This source code is licensed under the BSD-style license found in the
- * LICENSE file in the root directory of this source tree.
- */
-#include <ck/ck.hpp>
-#include "ck_fmha_batched_forward.h"
-
-template void run_batched_forward_masktype_attnbias_dispatched<
-    ck::half_t,
-    1,
-    false>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_forward_fp16_masktype_1_with_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_forward_fp16_masktype_1_with_attnbias.cpp
deleted file mode 100644
index 6fd666459..000000000
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_forward_fp16_masktype_1_with_attnbias.cpp
+++ /dev/null
@@ -1,13 +0,0 @@
-/*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
- *
- * This source code is licensed under the BSD-style license found in the
- * LICENSE file in the root directory of this source tree.
- */
-#include <ck/ck.hpp>
-#include "ck_fmha_batched_forward.h"
-
-template void run_batched_forward_masktype_attnbias_dispatched<
-    ck::half_t,
-    1,
-    true>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_forward_fp16_masktype_2_no_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_forward_fp16_masktype_2_no_attnbias.cpp
deleted file mode 100644
index e2c25b131..000000000
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_forward_fp16_masktype_2_no_attnbias.cpp
+++ /dev/null
@@ -1,13 +0,0 @@
-/*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
- *
- * This source code is licensed under the BSD-style license found in the
- * LICENSE file in the root directory of this source tree.
- */
-#include <ck/ck.hpp>
-#include "ck_fmha_batched_forward.h"
-
-template void run_batched_forward_masktype_attnbias_dispatched<
-    ck::half_t,
-    2,
-    false>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_forward_fp16_masktype_2_with_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_forward_fp16_masktype_2_with_attnbias.cpp
deleted file mode 100644
index daee90785..000000000
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_forward_fp16_masktype_2_with_attnbias.cpp
+++ /dev/null
@@ -1,13 +0,0 @@
-/*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
- *
- * This source code is licensed under the BSD-style license found in the
- * LICENSE file in the root directory of this source tree.
- */
-#include <ck/ck.hpp>
-#include "ck_fmha_batched_forward.h"
-
-template void run_batched_forward_masktype_attnbias_dispatched<
-    ck::half_t,
-    2,
-    true>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_infer_bp16_masktype_0_no_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_infer_bp16_masktype_0_no_attnbias.cpp
deleted file mode 100644
index fae4e95db..000000000
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_infer_bp16_masktype_0_no_attnbias.cpp
+++ /dev/null
@@ -1,14 +0,0 @@
-/*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
- *
- * This source code is licensed under the BSD-style license found in the
- * LICENSE file in the root directory of this source tree.
- */
-#include <ck/ck.hpp>
-
-#include "ck_fmha_batched_infer.h"
-
-template void run_batched_infer_masktype_attnbias_dispatched<
-    ck::bhalf_t,
-    0,
-    false>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_infer_bp16_masktype_0_with_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_infer_bp16_masktype_0_with_attnbias.cpp
deleted file mode 100644
index 3ea61a46a..000000000
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_infer_bp16_masktype_0_with_attnbias.cpp
+++ /dev/null
@@ -1,14 +0,0 @@
-/*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
- *
- * This source code is licensed under the BSD-style license found in the
- * LICENSE file in the root directory of this source tree.
- */
-#include <ck/ck.hpp>
-
-#include "ck_fmha_batched_infer.h"
-
-template void run_batched_infer_masktype_attnbias_dispatched<
-    ck::bhalf_t,
-    0,
-    true>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_infer_bp16_masktype_1_no_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_infer_bp16_masktype_1_no_attnbias.cpp
deleted file mode 100644
index aa01129f8..000000000
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_infer_bp16_masktype_1_no_attnbias.cpp
+++ /dev/null
@@ -1,14 +0,0 @@
-/*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
- *
- * This source code is licensed under the BSD-style license found in the
- * LICENSE file in the root directory of this source tree.
- */
-#include <ck/ck.hpp>
-
-#include "ck_fmha_batched_infer.h"
-
-template void run_batched_infer_masktype_attnbias_dispatched<
-    ck::bhalf_t,
-    1,
-    false>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_infer_bp16_masktype_1_with_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_infer_bp16_masktype_1_with_attnbias.cpp
deleted file mode 100644
index 1596dbea9..000000000
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_infer_bp16_masktype_1_with_attnbias.cpp
+++ /dev/null
@@ -1,14 +0,0 @@
-/*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
- *
- * This source code is licensed under the BSD-style license found in the
- * LICENSE file in the root directory of this source tree.
- */
-#include <ck/ck.hpp>
-
-#include "ck_fmha_batched_infer.h"
-
-template void run_batched_infer_masktype_attnbias_dispatched<
-    ck::bhalf_t,
-    1,
-    true>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_infer_bp16_masktype_2_no_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_infer_bp16_masktype_2_no_attnbias.cpp
deleted file mode 100644
index d5a27c62a..000000000
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_infer_bp16_masktype_2_no_attnbias.cpp
+++ /dev/null
@@ -1,14 +0,0 @@
-/*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
- *
- * This source code is licensed under the BSD-style license found in the
- * LICENSE file in the root directory of this source tree.
- */
-#include <ck/ck.hpp>
-
-#include "ck_fmha_batched_infer.h"
-
-template void run_batched_infer_masktype_attnbias_dispatched<
-    ck::bhalf_t,
-    2,
-    false>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_infer_bp16_masktype_2_with_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_infer_bp16_masktype_2_with_attnbias.cpp
deleted file mode 100644
index b47dcb485..000000000
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_infer_bp16_masktype_2_with_attnbias.cpp
+++ /dev/null
@@ -1,14 +0,0 @@
-/*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
- *
- * This source code is licensed under the BSD-style license found in the
- * LICENSE file in the root directory of this source tree.
- */
-#include <ck/ck.hpp>
-
-#include "ck_fmha_batched_infer.h"
-
-template void run_batched_infer_masktype_attnbias_dispatched<
-    ck::bhalf_t,
-    2,
-    true>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_infer_fp16_masktype_0_no_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_infer_fp16_masktype_0_no_attnbias.cpp
deleted file mode 100644
index 2144a980e..000000000
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_infer_fp16_masktype_0_no_attnbias.cpp
+++ /dev/null
@@ -1,14 +0,0 @@
-/*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
- *
- * This source code is licensed under the BSD-style license found in the
- * LICENSE file in the root directory of this source tree.
- */
-#include <ck/ck.hpp>
-
-#include "ck_fmha_batched_infer.h"
-
-template void run_batched_infer_masktype_attnbias_dispatched<
-    ck::half_t,
-    0,
-    false>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_infer_fp16_masktype_0_with_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_infer_fp16_masktype_0_with_attnbias.cpp
deleted file mode 100644
index 961a5b8f9..000000000
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_infer_fp16_masktype_0_with_attnbias.cpp
+++ /dev/null
@@ -1,14 +0,0 @@
-/*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
- *
- * This source code is licensed under the BSD-style license found in the
- * LICENSE file in the root directory of this source tree.
- */
-#include <ck/ck.hpp>
-
-#include "ck_fmha_batched_infer.h"
-
-template void run_batched_infer_masktype_attnbias_dispatched<
-    ck::half_t,
-    0,
-    true>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_infer_fp16_masktype_1_no_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_infer_fp16_masktype_1_no_attnbias.cpp
deleted file mode 100644
index 308adb597..000000000
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_infer_fp16_masktype_1_no_attnbias.cpp
+++ /dev/null
@@ -1,14 +0,0 @@
-/*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
- *
- * This source code is licensed under the BSD-style license found in the
- * LICENSE file in the root directory of this source tree.
- */
-#include <ck/ck.hpp>
-
-#include "ck_fmha_batched_infer.h"
-
-template void run_batched_infer_masktype_attnbias_dispatched<
-    ck::half_t,
-    1,
-    false>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_infer_fp16_masktype_1_with_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_infer_fp16_masktype_1_with_attnbias.cpp
deleted file mode 100644
index dd24e182b..000000000
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_infer_fp16_masktype_1_with_attnbias.cpp
+++ /dev/null
@@ -1,14 +0,0 @@
-/*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
- *
- * This source code is licensed under the BSD-style license found in the
- * LICENSE file in the root directory of this source tree.
- */
-#include <ck/ck.hpp>
-
-#include "ck_fmha_batched_infer.h"
-
-template void run_batched_infer_masktype_attnbias_dispatched<
-    ck::half_t,
-    1,
-    true>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_infer_fp16_masktype_2_no_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_infer_fp16_masktype_2_no_attnbias.cpp
deleted file mode 100644
index 590d032f1..000000000
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_infer_fp16_masktype_2_no_attnbias.cpp
+++ /dev/null
@@ -1,14 +0,0 @@
-/*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
- *
- * This source code is licensed under the BSD-style license found in the
- * LICENSE file in the root directory of this source tree.
- */
-#include <ck/ck.hpp>
-
-#include "ck_fmha_batched_infer.h"
-
-template void run_batched_infer_masktype_attnbias_dispatched<
-    ck::half_t,
-    2,
-    false>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_infer_fp16_masktype_2_with_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_infer_fp16_masktype_2_with_attnbias.cpp
deleted file mode 100644
index 1440164c7..000000000
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_batched_infer_fp16_masktype_2_with_attnbias.cpp
+++ /dev/null
@@ -1,14 +0,0 @@
-/*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
- *
- * This source code is licensed under the BSD-style license found in the
- * LICENSE file in the root directory of this source tree.
- */
-#include <ck/ck.hpp>
-
-#include "ck_fmha_batched_infer.h"
-
-template void run_batched_infer_masktype_attnbias_dispatched<
-    ck::half_t,
-    2,
-    true>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_bp16_masktype_0_no_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_bp16_masktype_0_no_attnbias.cpp
deleted file mode 100644
index ced06186a..000000000
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_bp16_masktype_0_no_attnbias.cpp
+++ /dev/null
@@ -1,14 +0,0 @@
-/*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
- *
- * This source code is licensed under the BSD-style license found in the
- * LICENSE file in the root directory of this source tree.
- */
-#include <ck/ck.hpp>
-#include "ck_fmha_grouped_backward.h"
-
-template void run_grouped_backward_masktype_attnbias_dispatched<
-    ck::bhalf_t,
-    0,
-    false,
-    false>(GroupedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_bp16_masktype_0_no_attnbias_fp32_grad.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_bp16_masktype_0_no_attnbias_fp32_grad.cpp
deleted file mode 100644
index 9f61adfc9..000000000
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_bp16_masktype_0_no_attnbias_fp32_grad.cpp
+++ /dev/null
@@ -1,14 +0,0 @@
-/*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
- *
- * This source code is licensed under the BSD-style license found in the
- * LICENSE file in the root directory of this source tree.
- */
-#include <ck/ck.hpp>
-#include "ck_fmha_grouped_backward.h"
-
-template void run_grouped_backward_masktype_attnbias_dispatched<
-    ck::bhalf_t,
-    0,
-    false,
-    true>(GroupedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_bp16_masktype_0_with_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_bp16_masktype_0_with_attnbias.cpp
deleted file mode 100644
index 2d4b51888..000000000
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_bp16_masktype_0_with_attnbias.cpp
+++ /dev/null
@@ -1,14 +0,0 @@
-/*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
- *
- * This source code is licensed under the BSD-style license found in the
- * LICENSE file in the root directory of this source tree.
- */
-#include <ck/ck.hpp>
-#include "ck_fmha_grouped_backward.h"
-
-template void run_grouped_backward_masktype_attnbias_dispatched<
-    ck::bhalf_t,
-    0,
-    true,
-    false>(GroupedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_bp16_masktype_0_with_attnbias_fp32_grad.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_bp16_masktype_0_with_attnbias_fp32_grad.cpp
deleted file mode 100644
index a49a8704c..000000000
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_bp16_masktype_0_with_attnbias_fp32_grad.cpp
+++ /dev/null
@@ -1,14 +0,0 @@
-/*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
- *
- * This source code is licensed under the BSD-style license found in the
- * LICENSE file in the root directory of this source tree.
- */
-#include <ck/ck.hpp>
-#include "ck_fmha_grouped_backward.h"
-
-template void run_grouped_backward_masktype_attnbias_dispatched<
-    ck::bhalf_t,
-    0,
-    true,
-    true>(GroupedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_bp16_masktype_1_no_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_bp16_masktype_1_no_attnbias.cpp
deleted file mode 100644
index c2279d835..000000000
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_bp16_masktype_1_no_attnbias.cpp
+++ /dev/null
@@ -1,14 +0,0 @@
-/*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
- *
- * This source code is licensed under the BSD-style license found in the
- * LICENSE file in the root directory of this source tree.
- */
-#include <ck/ck.hpp>
-#include "ck_fmha_grouped_backward.h"
-
-template void run_grouped_backward_masktype_attnbias_dispatched<
-    ck::bhalf_t,
-    1,
-    false,
-    false>(GroupedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_bp16_masktype_1_no_attnbias_fp32_grad.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_bp16_masktype_1_no_attnbias_fp32_grad.cpp
deleted file mode 100644
index 382bf0143..000000000
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_bp16_masktype_1_no_attnbias_fp32_grad.cpp
+++ /dev/null
@@ -1,14 +0,0 @@
-/*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
- *
- * This source code is licensed under the BSD-style license found in the
- * LICENSE file in the root directory of this source tree.
- */
-#include <ck/ck.hpp>
-#include "ck_fmha_grouped_backward.h"
-
-template void run_grouped_backward_masktype_attnbias_dispatched<
-    ck::bhalf_t,
-    1,
-    false,
-    true>(GroupedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_bp16_masktype_1_with_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_bp16_masktype_1_with_attnbias.cpp
deleted file mode 100644
index 1b7549e3e..000000000
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_bp16_masktype_1_with_attnbias.cpp
+++ /dev/null
@@ -1,14 +0,0 @@
-/*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
- *
- * This source code is licensed under the BSD-style license found in the
- * LICENSE file in the root directory of this source tree.
- */
-#include <ck/ck.hpp>
-#include "ck_fmha_grouped_backward.h"
-
-template void run_grouped_backward_masktype_attnbias_dispatched<
-    ck::bhalf_t,
-    1,
-    true,
-    false>(GroupedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_bp16_masktype_1_with_attnbias_fp32_grad.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_bp16_masktype_1_with_attnbias_fp32_grad.cpp
deleted file mode 100644
index f06694955..000000000
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_bp16_masktype_1_with_attnbias_fp32_grad.cpp
+++ /dev/null
@@ -1,14 +0,0 @@
-/*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
- *
- * This source code is licensed under the BSD-style license found in the
- * LICENSE file in the root directory of this source tree.
- */
-#include <ck/ck.hpp>
-#include "ck_fmha_grouped_backward.h"
-
-template void run_grouped_backward_masktype_attnbias_dispatched<
-    ck::bhalf_t,
-    1,
-    true,
-    true>(GroupedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_bp16_masktype_2_no_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_bp16_masktype_2_no_attnbias.cpp
deleted file mode 100644
index 3a86c12f8..000000000
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_bp16_masktype_2_no_attnbias.cpp
+++ /dev/null
@@ -1,14 +0,0 @@
-/*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
- *
- * This source code is licensed under the BSD-style license found in the
- * LICENSE file in the root directory of this source tree.
- */
-#include <ck/ck.hpp>
-#include "ck_fmha_grouped_backward.h"
-
-template void run_grouped_backward_masktype_attnbias_dispatched<
-    ck::bhalf_t,
-    2,
-    false,
-    false>(GroupedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_bp16_masktype_2_no_attnbias_fp32_grad.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_bp16_masktype_2_no_attnbias_fp32_grad.cpp
deleted file mode 100644
index c287a283d..000000000
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_bp16_masktype_2_no_attnbias_fp32_grad.cpp
+++ /dev/null
@@ -1,14 +0,0 @@
-/*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
- *
- * This source code is licensed under the BSD-style license found in the
- * LICENSE file in the root directory of this source tree.
- */
-#include <ck/ck.hpp>
-#include "ck_fmha_grouped_backward.h"
-
-template void run_grouped_backward_masktype_attnbias_dispatched<
-    ck::bhalf_t,
-    2,
-    false,
-    true>(GroupedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_bp16_masktype_2_with_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_bp16_masktype_2_with_attnbias.cpp
deleted file mode 100644
index 6b06378dd..000000000
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_bp16_masktype_2_with_attnbias.cpp
+++ /dev/null
@@ -1,14 +0,0 @@
-/*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
- *
- * This source code is licensed under the BSD-style license found in the
- * LICENSE file in the root directory of this source tree.
- */
-#include <ck/ck.hpp>
-#include "ck_fmha_grouped_backward.h"
-
-template void run_grouped_backward_masktype_attnbias_dispatched<
-    ck::bhalf_t,
-    2,
-    true,
-    false>(GroupedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_bp16_masktype_2_with_attnbias_fp32_grad.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_bp16_masktype_2_with_attnbias_fp32_grad.cpp
deleted file mode 100644
index 13d1bc553..000000000
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_bp16_masktype_2_with_attnbias_fp32_grad.cpp
+++ /dev/null
@@ -1,14 +0,0 @@
-/*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
- *
- * This source code is licensed under the BSD-style license found in the
- * LICENSE file in the root directory of this source tree.
- */
-#include <ck/ck.hpp>
-#include "ck_fmha_grouped_backward.h"
-
-template void run_grouped_backward_masktype_attnbias_dispatched<
-    ck::bhalf_t,
-    2,
-    true,
-    true>(GroupedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_fp16_masktype_0_no_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_fp16_masktype_0_no_attnbias.cpp
deleted file mode 100644
index 71cdf5b35..000000000
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_fp16_masktype_0_no_attnbias.cpp
+++ /dev/null
@@ -1,14 +0,0 @@
-/*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
- *
- * This source code is licensed under the BSD-style license found in the
- * LICENSE file in the root directory of this source tree.
- */
-#include <ck/ck.hpp>
-#include "ck_fmha_grouped_backward.h"
-
-template void run_grouped_backward_masktype_attnbias_dispatched<
-    ck::half_t,
-    0,
-    false,
-    false>(GroupedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_fp16_masktype_0_no_attnbias_fp32_grad.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_fp16_masktype_0_no_attnbias_fp32_grad.cpp
deleted file mode 100644
index 792f55e4d..000000000
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_fp16_masktype_0_no_attnbias_fp32_grad.cpp
+++ /dev/null
@@ -1,14 +0,0 @@
-/*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
- *
- * This source code is licensed under the BSD-style license found in the
- * LICENSE file in the root directory of this source tree.
- */
-#include <ck/ck.hpp>
-#include "ck_fmha_grouped_backward.h"
-
-template void run_grouped_backward_masktype_attnbias_dispatched<
-    ck::half_t,
-    0,
-    false,
-    true>(GroupedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_fp16_masktype_0_with_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_fp16_masktype_0_with_attnbias.cpp
deleted file mode 100644
index 5776e856d..000000000
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_fp16_masktype_0_with_attnbias.cpp
+++ /dev/null
@@ -1,14 +0,0 @@
-/*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
- *
- * This source code is licensed under the BSD-style license found in the
- * LICENSE file in the root directory of this source tree.
- */
-#include <ck/ck.hpp>
-#include "ck_fmha_grouped_backward.h"
-
-template void run_grouped_backward_masktype_attnbias_dispatched<
-    ck::half_t,
-    0,
-    true,
-    false>(GroupedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_fp16_masktype_0_with_attnbias_fp32_grad.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_fp16_masktype_0_with_attnbias_fp32_grad.cpp
deleted file mode 100644
index d3f2eec10..000000000
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_fp16_masktype_0_with_attnbias_fp32_grad.cpp
+++ /dev/null
@@ -1,14 +0,0 @@
-/*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
- *
- * This source code is licensed under the BSD-style license found in the
- * LICENSE file in the root directory of this source tree.
- */
-#include <ck/ck.hpp>
-#include "ck_fmha_grouped_backward.h"
-
-template void run_grouped_backward_masktype_attnbias_dispatched<
-    ck::half_t,
-    0,
-    true,
-    true>(GroupedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_fp16_masktype_1_no_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_fp16_masktype_1_no_attnbias.cpp
deleted file mode 100644
index 27962589e..000000000
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_fp16_masktype_1_no_attnbias.cpp
+++ /dev/null
@@ -1,14 +0,0 @@
-/*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
- *
- * This source code is licensed under the BSD-style license found in the
- * LICENSE file in the root directory of this source tree.
- */
-#include <ck/ck.hpp>
-#include "ck_fmha_grouped_backward.h"
-
-template void run_grouped_backward_masktype_attnbias_dispatched<
-    ck::half_t,
-    1,
-    false,
-    false>(GroupedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_fp16_masktype_1_no_attnbias_fp32_grad.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_fp16_masktype_1_no_attnbias_fp32_grad.cpp
deleted file mode 100644
index fa837a65c..000000000
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_fp16_masktype_1_no_attnbias_fp32_grad.cpp
+++ /dev/null
@@ -1,14 +0,0 @@
-/*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
- *
- * This source code is licensed under the BSD-style license found in the
- * LICENSE file in the root directory of this source tree.
- */
-#include <ck/ck.hpp>
-#include "ck_fmha_grouped_backward.h"
-
-template void run_grouped_backward_masktype_attnbias_dispatched<
-    ck::half_t,
-    1,
-    false,
-    true>(GroupedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_fp16_masktype_1_with_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_fp16_masktype_1_with_attnbias.cpp
deleted file mode 100644
index 7a83d4655..000000000
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_fp16_masktype_1_with_attnbias.cpp
+++ /dev/null
@@ -1,14 +0,0 @@
-/*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
- *
- * This source code is licensed under the BSD-style license found in the
- * LICENSE file in the root directory of this source tree.
- */
-#include <ck/ck.hpp>
-#include "ck_fmha_grouped_backward.h"
-
-template void run_grouped_backward_masktype_attnbias_dispatched<
-    ck::half_t,
-    1,
-    true,
-    false>(GroupedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_fp16_masktype_1_with_attnbias_fp32_grad.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_fp16_masktype_1_with_attnbias_fp32_grad.cpp
deleted file mode 100644
index 807d23156..000000000
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_fp16_masktype_1_with_attnbias_fp32_grad.cpp
+++ /dev/null
@@ -1,14 +0,0 @@
-/*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
- *
- * This source code is licensed under the BSD-style license found in the
- * LICENSE file in the root directory of this source tree.
- */
-#include <ck/ck.hpp>
-#include "ck_fmha_grouped_backward.h"
-
-template void run_grouped_backward_masktype_attnbias_dispatched<
-    ck::half_t,
-    1,
-    true,
-    true>(GroupedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_fp16_masktype_2_no_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_fp16_masktype_2_no_attnbias.cpp
deleted file mode 100644
index 508d01882..000000000
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_fp16_masktype_2_no_attnbias.cpp
+++ /dev/null
@@ -1,14 +0,0 @@
-/*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
- *
- * This source code is licensed under the BSD-style license found in the
- * LICENSE file in the root directory of this source tree.
- */
-#include <ck/ck.hpp>
-#include "ck_fmha_grouped_backward.h"
-
-template void run_grouped_backward_masktype_attnbias_dispatched<
-    ck::half_t,
-    2,
-    false,
-    false>(GroupedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_fp16_masktype_2_no_attnbias_fp32_grad.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_fp16_masktype_2_no_attnbias_fp32_grad.cpp
deleted file mode 100644
index 5954578f2..000000000
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_fp16_masktype_2_no_attnbias_fp32_grad.cpp
+++ /dev/null
@@ -1,14 +0,0 @@
-/*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
- *
- * This source code is licensed under the BSD-style license found in the
- * LICENSE file in the root directory of this source tree.
- */
-#include <ck/ck.hpp>
-#include "ck_fmha_grouped_backward.h"
-
-template void run_grouped_backward_masktype_attnbias_dispatched<
-    ck::half_t,
-    2,
-    false,
-    true>(GroupedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_fp16_masktype_2_with_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_fp16_masktype_2_with_attnbias.cpp
deleted file mode 100644
index 78482f931..000000000
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_fp16_masktype_2_with_attnbias.cpp
+++ /dev/null
@@ -1,14 +0,0 @@
-/*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
- *
- * This source code is licensed under the BSD-style license found in the
- * LICENSE file in the root directory of this source tree.
- */
-#include <ck/ck.hpp>
-#include "ck_fmha_grouped_backward.h"
-
-template void run_grouped_backward_masktype_attnbias_dispatched<
-    ck::half_t,
-    2,
-    true,
-    false>(GroupedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_fp16_masktype_2_with_attnbias_fp32_grad.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_fp16_masktype_2_with_attnbias_fp32_grad.cpp
deleted file mode 100644
index f38ea2ab2..000000000
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_backward_fp16_masktype_2_with_attnbias_fp32_grad.cpp
+++ /dev/null
@@ -1,14 +0,0 @@
-/*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
- *
- * This source code is licensed under the BSD-style license found in the
- * LICENSE file in the root directory of this source tree.
- */
-#include <ck/ck.hpp>
-#include "ck_fmha_grouped_backward.h"
-
-template void run_grouped_backward_masktype_attnbias_dispatched<
-    ck::half_t,
-    2,
-    true,
-    true>(GroupedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_forward_bp16_masktype_0_no_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_forward_bp16_masktype_0_no_attnbias.cpp
deleted file mode 100644
index 3f6f0025b..000000000
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_forward_bp16_masktype_0_no_attnbias.cpp
+++ /dev/null
@@ -1,13 +0,0 @@
-/*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
- *
- * This source code is licensed under the BSD-style license found in the
- * LICENSE file in the root directory of this source tree.
- */
-#include <ck/ck.hpp>
-#include "ck_fmha_grouped_forward.h"
-
-template void run_grouped_forward_masktype_attnbias_dispatched<
-    ck::bhalf_t,
-    0,
-    false>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_forward_bp16_masktype_0_with_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_forward_bp16_masktype_0_with_attnbias.cpp
deleted file mode 100644
index 22918197f..000000000
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_forward_bp16_masktype_0_with_attnbias.cpp
+++ /dev/null
@@ -1,13 +0,0 @@
-/*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
- *
- * This source code is licensed under the BSD-style license found in the
- * LICENSE file in the root directory of this source tree.
- */
-#include <ck/ck.hpp>
-#include "ck_fmha_grouped_forward.h"
-
-template void run_grouped_forward_masktype_attnbias_dispatched<
-    ck::bhalf_t,
-    0,
-    true>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_forward_bp16_masktype_1_no_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_forward_bp16_masktype_1_no_attnbias.cpp
deleted file mode 100644
index fffe1b188..000000000
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_forward_bp16_masktype_1_no_attnbias.cpp
+++ /dev/null
@@ -1,13 +0,0 @@
-/*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
- *
- * This source code is licensed under the BSD-style license found in the
- * LICENSE file in the root directory of this source tree.
- */
-#include <ck/ck.hpp>
-#include "ck_fmha_grouped_forward.h"
-
-template void run_grouped_forward_masktype_attnbias_dispatched<
-    ck::bhalf_t,
-    1,
-    false>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_forward_bp16_masktype_1_with_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_forward_bp16_masktype_1_with_attnbias.cpp
deleted file mode 100644
index b6020c099..000000000
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_forward_bp16_masktype_1_with_attnbias.cpp
+++ /dev/null
@@ -1,13 +0,0 @@
-/*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
- *
- * This source code is licensed under the BSD-style license found in the
- * LICENSE file in the root directory of this source tree.
- */
-#include <ck/ck.hpp>
-#include "ck_fmha_grouped_forward.h"
-
-template void run_grouped_forward_masktype_attnbias_dispatched<
-    ck::bhalf_t,
-    1,
-    true>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_forward_bp16_masktype_2_no_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_forward_bp16_masktype_2_no_attnbias.cpp
deleted file mode 100644
index 16f780c9e..000000000
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_forward_bp16_masktype_2_no_attnbias.cpp
+++ /dev/null
@@ -1,13 +0,0 @@
-/*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
- *
- * This source code is licensed under the BSD-style license found in the
- * LICENSE file in the root directory of this source tree.
- */
-#include <ck/ck.hpp>
-#include "ck_fmha_grouped_forward.h"
-
-template void run_grouped_forward_masktype_attnbias_dispatched<
-    ck::bhalf_t,
-    2,
-    false>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_forward_bp16_masktype_2_with_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_forward_bp16_masktype_2_with_attnbias.cpp
deleted file mode 100644
index 28c1f0832..000000000
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_forward_bp16_masktype_2_with_attnbias.cpp
+++ /dev/null
@@ -1,13 +0,0 @@
-/*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
- *
- * This source code is licensed under the BSD-style license found in the
- * LICENSE file in the root directory of this source tree.
- */
-#include <ck/ck.hpp>
-#include "ck_fmha_grouped_forward.h"
-
-template void run_grouped_forward_masktype_attnbias_dispatched<
-    ck::bhalf_t,
-    2,
-    true>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_forward_fp16_masktype_0_no_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_forward_fp16_masktype_0_no_attnbias.cpp
deleted file mode 100644
index 428b1b9ec..000000000
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_forward_fp16_masktype_0_no_attnbias.cpp
+++ /dev/null
@@ -1,13 +0,0 @@
-/*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
- *
- * This source code is licensed under the BSD-style license found in the
- * LICENSE file in the root directory of this source tree.
- */
-#include <ck/ck.hpp>
-#include "ck_fmha_grouped_forward.h"
-
-template void run_grouped_forward_masktype_attnbias_dispatched<
-    ck::half_t,
-    0,
-    false>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_forward_fp16_masktype_0_with_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_forward_fp16_masktype_0_with_attnbias.cpp
deleted file mode 100644
index 442e54a28..000000000
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_forward_fp16_masktype_0_with_attnbias.cpp
+++ /dev/null
@@ -1,13 +0,0 @@
-/*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
- *
- * This source code is licensed under the BSD-style license found in the
- * LICENSE file in the root directory of this source tree.
- */
-#include <ck/ck.hpp>
-#include "ck_fmha_grouped_forward.h"
-
-template void run_grouped_forward_masktype_attnbias_dispatched<
-    ck::half_t,
-    0,
-    true>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_forward_fp16_masktype_1_no_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_forward_fp16_masktype_1_no_attnbias.cpp
deleted file mode 100644
index a8520501d..000000000
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_forward_fp16_masktype_1_no_attnbias.cpp
+++ /dev/null
@@ -1,13 +0,0 @@
-/*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
- *
- * This source code is licensed under the BSD-style license found in the
- * LICENSE file in the root directory of this source tree.
- */
-#include <ck/ck.hpp>
-#include "ck_fmha_grouped_forward.h"
-
-template void run_grouped_forward_masktype_attnbias_dispatched<
-    ck::half_t,
-    1,
-    false>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_forward_fp16_masktype_1_with_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_forward_fp16_masktype_1_with_attnbias.cpp
deleted file mode 100644
index 7a6075ab5..000000000
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_forward_fp16_masktype_1_with_attnbias.cpp
+++ /dev/null
@@ -1,13 +0,0 @@
-/*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
- *
- * This source code is licensed under the BSD-style license found in the
- * LICENSE file in the root directory of this source tree.
- */
-#include <ck/ck.hpp>
-#include "ck_fmha_grouped_forward.h"
-
-template void run_grouped_forward_masktype_attnbias_dispatched<
-    ck::half_t,
-    1,
-    true>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_forward_fp16_masktype_2_no_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_forward_fp16_masktype_2_no_attnbias.cpp
deleted file mode 100644
index c93563491..000000000
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_forward_fp16_masktype_2_no_attnbias.cpp
+++ /dev/null
@@ -1,13 +0,0 @@
-/*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
- *
- * This source code is licensed under the BSD-style license found in the
- * LICENSE file in the root directory of this source tree.
- */
-#include <ck/ck.hpp>
-#include "ck_fmha_grouped_forward.h"
-
-template void run_grouped_forward_masktype_attnbias_dispatched<
-    ck::half_t,
-    2,
-    false>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_forward_fp16_masktype_2_with_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_forward_fp16_masktype_2_with_attnbias.cpp
deleted file mode 100644
index dc1fbc96b..000000000
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_forward_fp16_masktype_2_with_attnbias.cpp
+++ /dev/null
@@ -1,13 +0,0 @@
-/*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
- *
- * This source code is licensed under the BSD-style license found in the
- * LICENSE file in the root directory of this source tree.
- */
-#include <ck/ck.hpp>
-#include "ck_fmha_grouped_forward.h"
-
-template void run_grouped_forward_masktype_attnbias_dispatched<
-    ck::half_t,
-    2,
-    true>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_infer_bp16_masktype_0_no_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_infer_bp16_masktype_0_no_attnbias.cpp
deleted file mode 100644
index 62ff93032..000000000
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_infer_bp16_masktype_0_no_attnbias.cpp
+++ /dev/null
@@ -1,14 +0,0 @@
-/*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
- *
- * This source code is licensed under the BSD-style license found in the
- * LICENSE file in the root directory of this source tree.
- */
-#include <ck/ck.hpp>
-
-#include "ck_fmha_grouped_infer.h"
-
-template void run_grouped_infer_masktype_attnbias_dispatched<
-    ck::bhalf_t,
-    0,
-    false>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_infer_bp16_masktype_0_with_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_infer_bp16_masktype_0_with_attnbias.cpp
deleted file mode 100644
index e3d2da2cc..000000000
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_infer_bp16_masktype_0_with_attnbias.cpp
+++ /dev/null
@@ -1,14 +0,0 @@
-/*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
- *
- * This source code is licensed under the BSD-style license found in the
- * LICENSE file in the root directory of this source tree.
- */
-#include <ck/ck.hpp>
-
-#include "ck_fmha_grouped_infer.h"
-
-template void run_grouped_infer_masktype_attnbias_dispatched<
-    ck::bhalf_t,
-    0,
-    true>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_infer_bp16_masktype_1_no_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_infer_bp16_masktype_1_no_attnbias.cpp
deleted file mode 100644
index 4d1f3c7f0..000000000
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_infer_bp16_masktype_1_no_attnbias.cpp
+++ /dev/null
@@ -1,14 +0,0 @@
-/*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
- *
- * This source code is licensed under the BSD-style license found in the
- * LICENSE file in the root directory of this source tree.
- */
-#include <ck/ck.hpp>
-
-#include "ck_fmha_grouped_infer.h"
-
-template void run_grouped_infer_masktype_attnbias_dispatched<
-    ck::bhalf_t,
-    1,
-    false>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_infer_bp16_masktype_1_with_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_infer_bp16_masktype_1_with_attnbias.cpp
deleted file mode 100644
index 170e8a56f..000000000
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_infer_bp16_masktype_1_with_attnbias.cpp
+++ /dev/null
@@ -1,14 +0,0 @@
-/*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
- *
- * This source code is licensed under the BSD-style license found in the
- * LICENSE file in the root directory of this source tree.
- */
-#include <ck/ck.hpp>
-
-#include "ck_fmha_grouped_infer.h"
-
-template void run_grouped_infer_masktype_attnbias_dispatched<
-    ck::bhalf_t,
-    1,
-    true>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_infer_bp16_masktype_2_no_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_infer_bp16_masktype_2_no_attnbias.cpp
deleted file mode 100644
index b615233aa..000000000
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_infer_bp16_masktype_2_no_attnbias.cpp
+++ /dev/null
@@ -1,14 +0,0 @@
-/*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
- *
- * This source code is licensed under the BSD-style license found in the
- * LICENSE file in the root directory of this source tree.
- */
-#include <ck/ck.hpp>
-
-#include "ck_fmha_grouped_infer.h"
-
-template void run_grouped_infer_masktype_attnbias_dispatched<
-    ck::bhalf_t,
-    2,
-    false>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_infer_bp16_masktype_2_with_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_infer_bp16_masktype_2_with_attnbias.cpp
deleted file mode 100644
index 2f1227b87..000000000
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_infer_bp16_masktype_2_with_attnbias.cpp
+++ /dev/null
@@ -1,14 +0,0 @@
-/*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
- *
- * This source code is licensed under the BSD-style license found in the
- * LICENSE file in the root directory of this source tree.
- */
-#include <ck/ck.hpp>
-
-#include "ck_fmha_grouped_infer.h"
-
-template void run_grouped_infer_masktype_attnbias_dispatched<
-    ck::bhalf_t,
-    2,
-    true>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_infer_fp16_masktype_0_no_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_infer_fp16_masktype_0_no_attnbias.cpp
deleted file mode 100644
index bb20cf780..000000000
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_infer_fp16_masktype_0_no_attnbias.cpp
+++ /dev/null
@@ -1,14 +0,0 @@
-/*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
- *
- * This source code is licensed under the BSD-style license found in the
- * LICENSE file in the root directory of this source tree.
- */
-#include <ck/ck.hpp>
-
-#include "ck_fmha_grouped_infer.h"
-
-template void run_grouped_infer_masktype_attnbias_dispatched<
-    ck::half_t,
-    0,
-    false>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_infer_fp16_masktype_0_with_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_infer_fp16_masktype_0_with_attnbias.cpp
deleted file mode 100644
index 509986e1c..000000000
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_infer_fp16_masktype_0_with_attnbias.cpp
+++ /dev/null
@@ -1,14 +0,0 @@
-/*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
- *
- * This source code is licensed under the BSD-style license found in the
- * LICENSE file in the root directory of this source tree.
- */
-#include <ck/ck.hpp>
-
-#include "ck_fmha_grouped_infer.h"
-
-template void run_grouped_infer_masktype_attnbias_dispatched<
-    ck::half_t,
-    0,
-    true>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_infer_fp16_masktype_1_no_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_infer_fp16_masktype_1_no_attnbias.cpp
deleted file mode 100644
index a53a0f485..000000000
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_infer_fp16_masktype_1_no_attnbias.cpp
+++ /dev/null
@@ -1,14 +0,0 @@
-/*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
- *
- * This source code is licensed under the BSD-style license found in the
- * LICENSE file in the root directory of this source tree.
- */
-#include <ck/ck.hpp>
-
-#include "ck_fmha_grouped_infer.h"
-
-template void run_grouped_infer_masktype_attnbias_dispatched<
-    ck::half_t,
-    1,
-    false>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_infer_fp16_masktype_1_with_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_infer_fp16_masktype_1_with_attnbias.cpp
deleted file mode 100644
index b35c58526..000000000
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_infer_fp16_masktype_1_with_attnbias.cpp
+++ /dev/null
@@ -1,14 +0,0 @@
-/*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
- *
- * This source code is licensed under the BSD-style license found in the
- * LICENSE file in the root directory of this source tree.
- */
-#include <ck/ck.hpp>
-
-#include "ck_fmha_grouped_infer.h"
-
-template void run_grouped_infer_masktype_attnbias_dispatched<
-    ck::half_t,
-    1,
-    true>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_infer_fp16_masktype_2_no_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_infer_fp16_masktype_2_no_attnbias.cpp
deleted file mode 100644
index 53e30115a..000000000
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_infer_fp16_masktype_2_no_attnbias.cpp
+++ /dev/null
@@ -1,14 +0,0 @@
-/*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
- *
- * This source code is licensed under the BSD-style license found in the
- * LICENSE file in the root directory of this source tree.
- */
-#include <ck/ck.hpp>
-
-#include "ck_fmha_grouped_infer.h"
-
-template void run_grouped_infer_masktype_attnbias_dispatched<
-    ck::half_t,
-    2,
-    false>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_infer_fp16_masktype_2_with_attnbias.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_infer_fp16_masktype_2_with_attnbias.cpp
deleted file mode 100644
index d25650c8e..000000000
--- a/xformers/csrc/attention/hip_fmha/instances/ck_fmha_grouped_infer_fp16_masktype_2_with_attnbias.cpp
+++ /dev/null
@@ -1,14 +0,0 @@
-/*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
- *
- * This source code is licensed under the BSD-style license found in the
- * LICENSE file in the root directory of this source tree.
- */
-#include <ck/ck.hpp>
-
-#include "ck_fmha_grouped_infer.h"
-
-template void run_grouped_infer_masktype_attnbias_dispatched<
-    ck::half_t,
-    2,
-    true>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/ops/fmha/ck.py b/xformers/ops/fmha/ck.py
index f43cb7905..aaca59113 100644
--- a/xformers/ops/fmha/ck.py
+++ b/xformers/ops/fmha/ck.py
@@ -149,14 +149,6 @@ def _custom_mask_type(bias: Optional[Union[torch.Tensor, AttentionBias]]) -> int
     return int(_CustomMaskType.NoCustomMask)
 
 
-# checking the availability of ck-tiled is necessary since ck-tiled does not
-# have the same functionalities as old-CK
-def is_ck_tiled() -> bool:
-    # ck_check_op is temporarily used to check ck-tiled availability
-    ck_check_op = get_xformers_operator("is_ck_tiled_used")
-    return ck_check_op()
-
-
 @register_operator
 class FwOp(AttentionFwOpBase):
     """xFormers' MHA kernel based on Composable Kernel."""
@@ -166,34 +158,22 @@ class FwOp(AttentionFwOpBase):
     SUPPORTED_DTYPES: Set[torch.dtype] = {torch.half, torch.bfloat16}
     SUPPORTED_MAX_K = 256
 
-    if is_ck_tiled():
-        SUPPORTED_ATTN_BIAS_TYPES: Set[Any] = {
-            type(None),
-            torch.Tensor,
-            LowerTriangularMask,
-            LowerTriangularFromBottomRightMask,
-            LowerTriangularFromBottomRightLocalAttentionMask,
-            LowerTriangularMaskWithTensorBias,
-            BlockDiagonalMask,
-            BlockDiagonalCausalMask,
-            BlockDiagonalCausalWithOffsetPaddedKeysMask,
-            attn_bias.BlockDiagonalCausalFromBottomRightMask,
-            attn_bias.BlockDiagonalCausalLocalAttentionMask,
-            BlockDiagonalCausalLocalAttentionFromBottomRightMask,
-        }
-    else:
-        SUPPORTED_ATTN_BIAS_TYPES: Set[Any] = {
-            type(None),
-            torch.Tensor,
-            LowerTriangularMask,
-            LowerTriangularMaskWithTensorBias,
-            BlockDiagonalMask,
-            BlockDiagonalCausalMask,
-            BlockDiagonalCausalWithOffsetPaddedKeysMask,
-            attn_bias.BlockDiagonalCausalFromBottomRightMask,
-        }
+    SUPPORTED_ATTN_BIAS_TYPES: Set[Any] = {
+        type(None),
+        torch.Tensor,
+        LowerTriangularMask,
+        LowerTriangularFromBottomRightMask,
+        LowerTriangularFromBottomRightLocalAttentionMask,
+        LowerTriangularMaskWithTensorBias,
+        BlockDiagonalMask,
+        BlockDiagonalCausalMask,
+        BlockDiagonalCausalWithOffsetPaddedKeysMask,
+        attn_bias.BlockDiagonalCausalFromBottomRightMask,
+        attn_bias.BlockDiagonalCausalLocalAttentionMask,
+        BlockDiagonalCausalLocalAttentionFromBottomRightMask,
+    }
 
-    SUPPORTS_DROPOUT = False if is_ck_tiled() else True
+    SUPPORTS_DROPOUT = False
     SUPPORTS_CUSTOM_SCALE = True
     SUPPORTS_DIFFERENT_VALUE_EMBED = True
     SUPPORTS_BMGHK = True
@@ -216,8 +196,6 @@ class FwOp(AttentionFwOpBase):
         256,  # 64x128 with accumulation in gmem
     ]
 
-    IS_CK_TILED = is_ck_tiled()
-
     @classmethod
     def apply(
         cls, inp: Inputs, needs_gradient: bool
@@ -289,12 +267,6 @@ def apply_bmhk(
         if type(inp.attn_bias) not in FwOp.SUPPORTED_ATTN_BIAS_TYPES:
             raise NotImplementedError("Unsupported attn_bias type")
         seqstart_k, seqstart_q, max_seqlen_q = _get_seqlen_info(inp)
-        if isinstance(inp.attn_bias, BlockDiagonalCausalWithOffsetPaddedKeysMask):
-            seqlen_k = (
-                inp.attn_bias.k_seqinfo.seqlen
-                if is_ck_tiled()
-                else inp.attn_bias.k_seqinfo.seqlen.to(torch.device("cpu"))
-            )
         out, lse, rng_seed, rng_offset = cls.OPERATOR(
             query=inp.query,
             key=inp.key,
@@ -307,19 +279,25 @@ def apply_bmhk(
             compute_logsumexp=needs_gradient,
             custom_mask_type=_custom_mask_type(inp.attn_bias),
             scale=inp.scale,
-            seqlen_k=seqlen_k
-            if isinstance(inp.attn_bias, BlockDiagonalCausalWithOffsetPaddedKeysMask)
-            else None,
-            window_size=inp.attn_bias._window_size
-            if isinstance(
-                inp.attn_bias,
-                (
-                    BlockDiagonalCausalLocalAttentionMask,
-                    BlockDiagonalCausalLocalAttentionFromBottomRightMask,
-                    LowerTriangularFromBottomRightLocalAttentionMask,
-                ),
-            )
-            else None,
+            seqlen_k=(
+                inp.attn_bias.k_seqinfo.seqlen
+                if isinstance(
+                    inp.attn_bias, BlockDiagonalCausalWithOffsetPaddedKeysMask
+                )
+                else None
+            ),
+            window_size=(
+                inp.attn_bias._window_size
+                if isinstance(
+                    inp.attn_bias,
+                    (
+                        BlockDiagonalCausalLocalAttentionMask,
+                        BlockDiagonalCausalLocalAttentionFromBottomRightMask,
+                        LowerTriangularFromBottomRightLocalAttentionMask,
+                    ),
+                )
+                else None
+            ),
         )
 
         ctx: Optional[Context] = None
@@ -349,7 +327,7 @@ def not_supported_reasons(cls, d: Inputs) -> List[str]:
         requires_grad = (
             d.query.requires_grad or d.key.requires_grad or d.value.requires_grad
         )
-        if is_ck_tiled() and requires_grad:
+        if requires_grad:
             reasons.append("Gradience is currently not supported by ck-tiled!")
         return reasons
 
@@ -413,8 +391,6 @@ class BwOp(AttentionBwOpBase):
         256,  # 64x128 with accumulation in gmem
     ]
 
-    IS_CK_TILED = is_ck_tiled()
-
     @classmethod
     def not_supported_reasons(cls, d: Inputs) -> List[str]:
         reasons = super(BwOp, cls).not_supported_reasons(d)
@@ -446,8 +422,8 @@ def not_supported_reasons(cls, d: Inputs) -> List[str]:
                     f"/ expected: {expected_bias_shape})"
                 )
         _check_large_shapes(reasons, d)
-        if is_ck_tiled():
-            reasons.append("Backward is currently not supported by ck-tiled!")
+
+        reasons.append("Backward is currently not supported by ck-tiled!")
         return reasons
 
     @classmethod
@@ -458,13 +434,6 @@ def apply(cls, ctx: Context, inp: Inputs, grad: torch.Tensor) -> Gradients:
         seqstart_k, seqstart_q, max_seqlen_q = _get_seqlen_info(inp)
         dtype = inp.query.dtype
 
-        if isinstance(inp.attn_bias, BlockDiagonalCausalWithOffsetPaddedKeysMask):
-            seqlen_k = (
-                inp.attn_bias.k_seqinfo.seqlen
-                if is_ck_tiled()
-                else inp.attn_bias.k_seqinfo.seqlen.to(torch.device("cpu"))
-            )
-
         rng_seed = rng_offset = 0
         if inp.p != 0.0:
             if (
@@ -485,9 +454,13 @@ def apply(cls, ctx: Context, inp: Inputs, grad: torch.Tensor) -> Gradients:
             seqstart_q=seqstart_q,
             seqstart_k=seqstart_k,
             max_seqlen_q=max_seqlen_q,
-            seqlen_k=seqlen_k
-            if isinstance(inp.attn_bias, BlockDiagonalCausalWithOffsetPaddedKeysMask)
-            else None,
+            seqlen_k=(
+                inp.attn_bias.k_seqinfo.seqlen
+                if isinstance(
+                    inp.attn_bias, BlockDiagonalCausalWithOffsetPaddedKeysMask
+                )
+                else None
+            ),
             logsumexp=ctx.lse,
             output=ctx.out.to(dtype),
             dropout_p=inp.p,

From 9e4582d653d32cb27125b55cab02915308af322a Mon Sep 17 00:00:00 2001
From: Qianfeng Zhang <Qianfeng.Zhang@amd.com>
Date: Wed, 7 Feb 2024 17:38:52 +0000
Subject: [PATCH 440/641] Remove old composable_kernel from submodule list

---
 .gitmodules | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/.gitmodules b/.gitmodules
index cbef796c7..635811410 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -1,10 +1,6 @@
 [submodule "third_party/cutlass"]
 	path = third_party/cutlass
 	url = https://github.com/NVIDIA/cutlass.git
-[submodule "third_party/composable_kernel"]
-	path = third_party/composable_kernel
-	url = https://github.com/ROCm/composable_kernel.git
-	branch = mha-train-develop
 [submodule "third_party/flash-attention"]
 	path = third_party/flash-attention
 	url = https://github.com/Dao-AILab/flash-attention.git

From 356cafd6a330567631e1fe881c3ff36296de619f Mon Sep 17 00:00:00 2001
From: Qianfeng Zhang <Qianfeng.Zhang@amd.com>
Date: Wed, 7 Feb 2024 17:45:43 +0000
Subject: [PATCH 441/641] Remove folder third_party/composable_kernel

---
 third_party/composable_kernel | 1 -
 1 file changed, 1 deletion(-)
 delete mode 160000 third_party/composable_kernel

diff --git a/third_party/composable_kernel b/third_party/composable_kernel
deleted file mode 160000
index 719219b9f..000000000
--- a/third_party/composable_kernel
+++ /dev/null
@@ -1 +0,0 @@
-Subproject commit 719219b9f1f4143e5fdd657dd16b704a22821766

From 79c554cdc3d1a0950ee98a5c0053b05c5ffa7466 Mon Sep 17 00:00:00 2001
From: Qianfeng Zhang <Qianfeng.Zhang@amd.com>
Date: Thu, 8 Feb 2024 13:17:13 +0000
Subject: [PATCH 442/641] Rename the folder

---
 setup.py                                                        | 2 +-
 .../\\" => "xformers/csrc/attention/hip_fmha/instances/\\"      | 0
 ...tched_forward_bp16_no_causalmask_no_attnbias_headdim_128.cpp | 0
 ...tched_forward_bp16_no_causalmask_no_attnbias_headdim_256.cpp | 0
 ...atched_forward_bp16_no_causalmask_no_attnbias_headdim_32.cpp | 0
 ...atched_forward_bp16_no_causalmask_no_attnbias_headdim_64.cpp | 0
 ...hed_forward_bp16_no_causalmask_with_attnbias_headdim_128.cpp | 0
 ...hed_forward_bp16_no_causalmask_with_attnbias_headdim_256.cpp | 0
 ...ched_forward_bp16_no_causalmask_with_attnbias_headdim_32.cpp | 0
 ...ched_forward_bp16_no_causalmask_with_attnbias_headdim_64.cpp | 0
 ...hed_forward_bp16_with_causalmask_no_attnbias_headdim_128.cpp | 0
 ...hed_forward_bp16_with_causalmask_no_attnbias_headdim_256.cpp | 0
 ...ched_forward_bp16_with_causalmask_no_attnbias_headdim_32.cpp | 0
 ...ched_forward_bp16_with_causalmask_no_attnbias_headdim_64.cpp | 0
 ...d_forward_bp16_with_causalmask_with_attnbias_headdim_128.cpp | 0
 ...d_forward_bp16_with_causalmask_with_attnbias_headdim_256.cpp | 0
 ...ed_forward_bp16_with_causalmask_with_attnbias_headdim_32.cpp | 0
 ...ed_forward_bp16_with_causalmask_with_attnbias_headdim_64.cpp | 0
 ...tched_forward_fp16_no_causalmask_no_attnbias_headdim_128.cpp | 0
 ...tched_forward_fp16_no_causalmask_no_attnbias_headdim_256.cpp | 0
 ...atched_forward_fp16_no_causalmask_no_attnbias_headdim_32.cpp | 0
 ...atched_forward_fp16_no_causalmask_no_attnbias_headdim_64.cpp | 0
 ...hed_forward_fp16_no_causalmask_with_attnbias_headdim_128.cpp | 0
 ...hed_forward_fp16_no_causalmask_with_attnbias_headdim_256.cpp | 0
 ...ched_forward_fp16_no_causalmask_with_attnbias_headdim_32.cpp | 0
 ...ched_forward_fp16_no_causalmask_with_attnbias_headdim_64.cpp | 0
 ...hed_forward_fp16_with_causalmask_no_attnbias_headdim_128.cpp | 0
 ...hed_forward_fp16_with_causalmask_no_attnbias_headdim_256.cpp | 0
 ...ched_forward_fp16_with_causalmask_no_attnbias_headdim_32.cpp | 0
 ...ched_forward_fp16_with_causalmask_no_attnbias_headdim_64.cpp | 0
 ...d_forward_fp16_with_causalmask_with_attnbias_headdim_128.cpp | 0
 ...d_forward_fp16_with_causalmask_with_attnbias_headdim_256.cpp | 0
 ...ed_forward_fp16_with_causalmask_with_attnbias_headdim_32.cpp | 0
 ...ed_forward_fp16_with_causalmask_with_attnbias_headdim_64.cpp | 0
 ...batched_infer_bp16_no_causalmask_no_attnbias_headdim_128.cpp | 0
 ...batched_infer_bp16_no_causalmask_no_attnbias_headdim_256.cpp | 0
 ..._batched_infer_bp16_no_causalmask_no_attnbias_headdim_32.cpp | 0
 ..._batched_infer_bp16_no_causalmask_no_attnbias_headdim_64.cpp | 0
 ...tched_infer_bp16_no_causalmask_with_attnbias_headdim_128.cpp | 0
 ...tched_infer_bp16_no_causalmask_with_attnbias_headdim_256.cpp | 0
 ...atched_infer_bp16_no_causalmask_with_attnbias_headdim_32.cpp | 0
 ...atched_infer_bp16_no_causalmask_with_attnbias_headdim_64.cpp | 0
 ...tched_infer_bp16_with_causalmask_no_attnbias_headdim_128.cpp | 0
 ...tched_infer_bp16_with_causalmask_no_attnbias_headdim_256.cpp | 0
 ...atched_infer_bp16_with_causalmask_no_attnbias_headdim_32.cpp | 0
 ...atched_infer_bp16_with_causalmask_no_attnbias_headdim_64.cpp | 0
 ...hed_infer_bp16_with_causalmask_with_attnbias_headdim_128.cpp | 0
 ...hed_infer_bp16_with_causalmask_with_attnbias_headdim_256.cpp | 0
 ...ched_infer_bp16_with_causalmask_with_attnbias_headdim_32.cpp | 0
 ...ched_infer_bp16_with_causalmask_with_attnbias_headdim_64.cpp | 0
 ...batched_infer_fp16_no_causalmask_no_attnbias_headdim_128.cpp | 0
 ...batched_infer_fp16_no_causalmask_no_attnbias_headdim_256.cpp | 0
 ..._batched_infer_fp16_no_causalmask_no_attnbias_headdim_32.cpp | 0
 ..._batched_infer_fp16_no_causalmask_no_attnbias_headdim_64.cpp | 0
 ...tched_infer_fp16_no_causalmask_with_attnbias_headdim_128.cpp | 0
 ...tched_infer_fp16_no_causalmask_with_attnbias_headdim_256.cpp | 0
 ...atched_infer_fp16_no_causalmask_with_attnbias_headdim_32.cpp | 0
 ...atched_infer_fp16_no_causalmask_with_attnbias_headdim_64.cpp | 0
 ...tched_infer_fp16_with_causalmask_no_attnbias_headdim_128.cpp | 0
 ...tched_infer_fp16_with_causalmask_no_attnbias_headdim_256.cpp | 0
 ...atched_infer_fp16_with_causalmask_no_attnbias_headdim_32.cpp | 0
 ...atched_infer_fp16_with_causalmask_no_attnbias_headdim_64.cpp | 0
 ...hed_infer_fp16_with_causalmask_with_attnbias_headdim_128.cpp | 0
 ...hed_infer_fp16_with_causalmask_with_attnbias_headdim_256.cpp | 0
 ...ched_infer_fp16_with_causalmask_with_attnbias_headdim_32.cpp | 0
 ...ched_infer_fp16_with_causalmask_with_attnbias_headdim_64.cpp | 0
 ...ouped_forward_bp16_no_causalmask_no_attnbias_headdim_128.cpp | 0
 ...ouped_forward_bp16_no_causalmask_no_attnbias_headdim_256.cpp | 0
 ...rouped_forward_bp16_no_causalmask_no_attnbias_headdim_32.cpp | 0
 ...rouped_forward_bp16_no_causalmask_no_attnbias_headdim_64.cpp | 0
 ...ped_forward_bp16_no_causalmask_with_attnbias_headdim_128.cpp | 0
 ...ped_forward_bp16_no_causalmask_with_attnbias_headdim_256.cpp | 0
 ...uped_forward_bp16_no_causalmask_with_attnbias_headdim_32.cpp | 0
 ...uped_forward_bp16_no_causalmask_with_attnbias_headdim_64.cpp | 0
 ...ped_forward_bp16_with_causalmask_no_attnbias_headdim_128.cpp | 0
 ...ped_forward_bp16_with_causalmask_no_attnbias_headdim_256.cpp | 0
 ...uped_forward_bp16_with_causalmask_no_attnbias_headdim_32.cpp | 0
 ...uped_forward_bp16_with_causalmask_no_attnbias_headdim_64.cpp | 0
 ...d_forward_bp16_with_causalmask_with_attnbias_headdim_128.cpp | 0
 ...d_forward_bp16_with_causalmask_with_attnbias_headdim_256.cpp | 0
 ...ed_forward_bp16_with_causalmask_with_attnbias_headdim_32.cpp | 0
 ...ed_forward_bp16_with_causalmask_with_attnbias_headdim_64.cpp | 0
 ...ouped_forward_fp16_no_causalmask_no_attnbias_headdim_128.cpp | 0
 ...ouped_forward_fp16_no_causalmask_no_attnbias_headdim_256.cpp | 0
 ...rouped_forward_fp16_no_causalmask_no_attnbias_headdim_32.cpp | 0
 ...rouped_forward_fp16_no_causalmask_no_attnbias_headdim_64.cpp | 0
 ...ped_forward_fp16_no_causalmask_with_attnbias_headdim_128.cpp | 0
 ...ped_forward_fp16_no_causalmask_with_attnbias_headdim_256.cpp | 0
 ...uped_forward_fp16_no_causalmask_with_attnbias_headdim_32.cpp | 0
 ...uped_forward_fp16_no_causalmask_with_attnbias_headdim_64.cpp | 0
 ...ped_forward_fp16_with_causalmask_no_attnbias_headdim_128.cpp | 0
 ...ped_forward_fp16_with_causalmask_no_attnbias_headdim_256.cpp | 0
 ...uped_forward_fp16_with_causalmask_no_attnbias_headdim_32.cpp | 0
 ...uped_forward_fp16_with_causalmask_no_attnbias_headdim_64.cpp | 0
 ...d_forward_fp16_with_causalmask_with_attnbias_headdim_128.cpp | 0
 ...d_forward_fp16_with_causalmask_with_attnbias_headdim_256.cpp | 0
 ...ed_forward_fp16_with_causalmask_with_attnbias_headdim_32.cpp | 0
 ...ed_forward_fp16_with_causalmask_with_attnbias_headdim_64.cpp | 0
 ...grouped_infer_bp16_no_causalmask_no_attnbias_headdim_128.cpp | 0
 ...grouped_infer_bp16_no_causalmask_no_attnbias_headdim_256.cpp | 0
 ..._grouped_infer_bp16_no_causalmask_no_attnbias_headdim_32.cpp | 0
 ..._grouped_infer_bp16_no_causalmask_no_attnbias_headdim_64.cpp | 0
 ...ouped_infer_bp16_no_causalmask_with_attnbias_headdim_128.cpp | 0
 ...ouped_infer_bp16_no_causalmask_with_attnbias_headdim_256.cpp | 0
 ...rouped_infer_bp16_no_causalmask_with_attnbias_headdim_32.cpp | 0
 ...rouped_infer_bp16_no_causalmask_with_attnbias_headdim_64.cpp | 0
 ...ouped_infer_bp16_with_causalmask_no_attnbias_headdim_128.cpp | 0
 ...ouped_infer_bp16_with_causalmask_no_attnbias_headdim_256.cpp | 0
 ...rouped_infer_bp16_with_causalmask_no_attnbias_headdim_32.cpp | 0
 ...rouped_infer_bp16_with_causalmask_no_attnbias_headdim_64.cpp | 0
 ...ped_infer_bp16_with_causalmask_with_attnbias_headdim_128.cpp | 0
 ...ped_infer_bp16_with_causalmask_with_attnbias_headdim_256.cpp | 0
 ...uped_infer_bp16_with_causalmask_with_attnbias_headdim_32.cpp | 0
 ...uped_infer_bp16_with_causalmask_with_attnbias_headdim_64.cpp | 0
 ...grouped_infer_fp16_no_causalmask_no_attnbias_headdim_128.cpp | 0
 ...grouped_infer_fp16_no_causalmask_no_attnbias_headdim_256.cpp | 0
 ..._grouped_infer_fp16_no_causalmask_no_attnbias_headdim_32.cpp | 0
 ..._grouped_infer_fp16_no_causalmask_no_attnbias_headdim_64.cpp | 0
 ...ouped_infer_fp16_no_causalmask_with_attnbias_headdim_128.cpp | 0
 ...ouped_infer_fp16_no_causalmask_with_attnbias_headdim_256.cpp | 0
 ...rouped_infer_fp16_no_causalmask_with_attnbias_headdim_32.cpp | 0
 ...rouped_infer_fp16_no_causalmask_with_attnbias_headdim_64.cpp | 0
 ...ouped_infer_fp16_with_causalmask_no_attnbias_headdim_128.cpp | 0
 ...ouped_infer_fp16_with_causalmask_no_attnbias_headdim_256.cpp | 0
 ...rouped_infer_fp16_with_causalmask_no_attnbias_headdim_32.cpp | 0
 ...rouped_infer_fp16_with_causalmask_no_attnbias_headdim_64.cpp | 0
 ...ped_infer_fp16_with_causalmask_with_attnbias_headdim_128.cpp | 0
 ...ped_infer_fp16_with_causalmask_with_attnbias_headdim_256.cpp | 0
 ...uped_infer_fp16_with_causalmask_with_attnbias_headdim_32.cpp | 0
 ...uped_infer_fp16_with_causalmask_with_attnbias_headdim_64.cpp | 0
 130 files changed, 1 insertion(+), 1 deletion(-)
 rename "xformers/csrc/attention/hip_fmha/instances_tiled/\\" => "xformers/csrc/attention/hip_fmha/instances/\\" (100%)
 rename xformers/csrc/attention/hip_fmha/{instances_tiled => instances}/ck_tiled_fmha_batched_forward_bp16_no_causalmask_no_attnbias_headdim_128.cpp (100%)
 rename xformers/csrc/attention/hip_fmha/{instances_tiled => instances}/ck_tiled_fmha_batched_forward_bp16_no_causalmask_no_attnbias_headdim_256.cpp (100%)
 rename xformers/csrc/attention/hip_fmha/{instances_tiled => instances}/ck_tiled_fmha_batched_forward_bp16_no_causalmask_no_attnbias_headdim_32.cpp (100%)
 rename xformers/csrc/attention/hip_fmha/{instances_tiled => instances}/ck_tiled_fmha_batched_forward_bp16_no_causalmask_no_attnbias_headdim_64.cpp (100%)
 rename xformers/csrc/attention/hip_fmha/{instances_tiled => instances}/ck_tiled_fmha_batched_forward_bp16_no_causalmask_with_attnbias_headdim_128.cpp (100%)
 rename xformers/csrc/attention/hip_fmha/{instances_tiled => instances}/ck_tiled_fmha_batched_forward_bp16_no_causalmask_with_attnbias_headdim_256.cpp (100%)
 rename xformers/csrc/attention/hip_fmha/{instances_tiled => instances}/ck_tiled_fmha_batched_forward_bp16_no_causalmask_with_attnbias_headdim_32.cpp (100%)
 rename xformers/csrc/attention/hip_fmha/{instances_tiled => instances}/ck_tiled_fmha_batched_forward_bp16_no_causalmask_with_attnbias_headdim_64.cpp (100%)
 rename xformers/csrc/attention/hip_fmha/{instances_tiled => instances}/ck_tiled_fmha_batched_forward_bp16_with_causalmask_no_attnbias_headdim_128.cpp (100%)
 rename xformers/csrc/attention/hip_fmha/{instances_tiled => instances}/ck_tiled_fmha_batched_forward_bp16_with_causalmask_no_attnbias_headdim_256.cpp (100%)
 rename xformers/csrc/attention/hip_fmha/{instances_tiled => instances}/ck_tiled_fmha_batched_forward_bp16_with_causalmask_no_attnbias_headdim_32.cpp (100%)
 rename xformers/csrc/attention/hip_fmha/{instances_tiled => instances}/ck_tiled_fmha_batched_forward_bp16_with_causalmask_no_attnbias_headdim_64.cpp (100%)
 rename xformers/csrc/attention/hip_fmha/{instances_tiled => instances}/ck_tiled_fmha_batched_forward_bp16_with_causalmask_with_attnbias_headdim_128.cpp (100%)
 rename xformers/csrc/attention/hip_fmha/{instances_tiled => instances}/ck_tiled_fmha_batched_forward_bp16_with_causalmask_with_attnbias_headdim_256.cpp (100%)
 rename xformers/csrc/attention/hip_fmha/{instances_tiled => instances}/ck_tiled_fmha_batched_forward_bp16_with_causalmask_with_attnbias_headdim_32.cpp (100%)
 rename xformers/csrc/attention/hip_fmha/{instances_tiled => instances}/ck_tiled_fmha_batched_forward_bp16_with_causalmask_with_attnbias_headdim_64.cpp (100%)
 rename xformers/csrc/attention/hip_fmha/{instances_tiled => instances}/ck_tiled_fmha_batched_forward_fp16_no_causalmask_no_attnbias_headdim_128.cpp (100%)
 rename xformers/csrc/attention/hip_fmha/{instances_tiled => instances}/ck_tiled_fmha_batched_forward_fp16_no_causalmask_no_attnbias_headdim_256.cpp (100%)
 rename xformers/csrc/attention/hip_fmha/{instances_tiled => instances}/ck_tiled_fmha_batched_forward_fp16_no_causalmask_no_attnbias_headdim_32.cpp (100%)
 rename xformers/csrc/attention/hip_fmha/{instances_tiled => instances}/ck_tiled_fmha_batched_forward_fp16_no_causalmask_no_attnbias_headdim_64.cpp (100%)
 rename xformers/csrc/attention/hip_fmha/{instances_tiled => instances}/ck_tiled_fmha_batched_forward_fp16_no_causalmask_with_attnbias_headdim_128.cpp (100%)
 rename xformers/csrc/attention/hip_fmha/{instances_tiled => instances}/ck_tiled_fmha_batched_forward_fp16_no_causalmask_with_attnbias_headdim_256.cpp (100%)
 rename xformers/csrc/attention/hip_fmha/{instances_tiled => instances}/ck_tiled_fmha_batched_forward_fp16_no_causalmask_with_attnbias_headdim_32.cpp (100%)
 rename xformers/csrc/attention/hip_fmha/{instances_tiled => instances}/ck_tiled_fmha_batched_forward_fp16_no_causalmask_with_attnbias_headdim_64.cpp (100%)
 rename xformers/csrc/attention/hip_fmha/{instances_tiled => instances}/ck_tiled_fmha_batched_forward_fp16_with_causalmask_no_attnbias_headdim_128.cpp (100%)
 rename xformers/csrc/attention/hip_fmha/{instances_tiled => instances}/ck_tiled_fmha_batched_forward_fp16_with_causalmask_no_attnbias_headdim_256.cpp (100%)
 rename xformers/csrc/attention/hip_fmha/{instances_tiled => instances}/ck_tiled_fmha_batched_forward_fp16_with_causalmask_no_attnbias_headdim_32.cpp (100%)
 rename xformers/csrc/attention/hip_fmha/{instances_tiled => instances}/ck_tiled_fmha_batched_forward_fp16_with_causalmask_no_attnbias_headdim_64.cpp (100%)
 rename xformers/csrc/attention/hip_fmha/{instances_tiled => instances}/ck_tiled_fmha_batched_forward_fp16_with_causalmask_with_attnbias_headdim_128.cpp (100%)
 rename xformers/csrc/attention/hip_fmha/{instances_tiled => instances}/ck_tiled_fmha_batched_forward_fp16_with_causalmask_with_attnbias_headdim_256.cpp (100%)
 rename xformers/csrc/attention/hip_fmha/{instances_tiled => instances}/ck_tiled_fmha_batched_forward_fp16_with_causalmask_with_attnbias_headdim_32.cpp (100%)
 rename xformers/csrc/attention/hip_fmha/{instances_tiled => instances}/ck_tiled_fmha_batched_forward_fp16_with_causalmask_with_attnbias_headdim_64.cpp (100%)
 rename xformers/csrc/attention/hip_fmha/{instances_tiled => instances}/ck_tiled_fmha_batched_infer_bp16_no_causalmask_no_attnbias_headdim_128.cpp (100%)
 rename xformers/csrc/attention/hip_fmha/{instances_tiled => instances}/ck_tiled_fmha_batched_infer_bp16_no_causalmask_no_attnbias_headdim_256.cpp (100%)
 rename xformers/csrc/attention/hip_fmha/{instances_tiled => instances}/ck_tiled_fmha_batched_infer_bp16_no_causalmask_no_attnbias_headdim_32.cpp (100%)
 rename xformers/csrc/attention/hip_fmha/{instances_tiled => instances}/ck_tiled_fmha_batched_infer_bp16_no_causalmask_no_attnbias_headdim_64.cpp (100%)
 rename xformers/csrc/attention/hip_fmha/{instances_tiled => instances}/ck_tiled_fmha_batched_infer_bp16_no_causalmask_with_attnbias_headdim_128.cpp (100%)
 rename xformers/csrc/attention/hip_fmha/{instances_tiled => instances}/ck_tiled_fmha_batched_infer_bp16_no_causalmask_with_attnbias_headdim_256.cpp (100%)
 rename xformers/csrc/attention/hip_fmha/{instances_tiled => instances}/ck_tiled_fmha_batched_infer_bp16_no_causalmask_with_attnbias_headdim_32.cpp (100%)
 rename xformers/csrc/attention/hip_fmha/{instances_tiled => instances}/ck_tiled_fmha_batched_infer_bp16_no_causalmask_with_attnbias_headdim_64.cpp (100%)
 rename xformers/csrc/attention/hip_fmha/{instances_tiled => instances}/ck_tiled_fmha_batched_infer_bp16_with_causalmask_no_attnbias_headdim_128.cpp (100%)
 rename xformers/csrc/attention/hip_fmha/{instances_tiled => instances}/ck_tiled_fmha_batched_infer_bp16_with_causalmask_no_attnbias_headdim_256.cpp (100%)
 rename xformers/csrc/attention/hip_fmha/{instances_tiled => instances}/ck_tiled_fmha_batched_infer_bp16_with_causalmask_no_attnbias_headdim_32.cpp (100%)
 rename xformers/csrc/attention/hip_fmha/{instances_tiled => instances}/ck_tiled_fmha_batched_infer_bp16_with_causalmask_no_attnbias_headdim_64.cpp (100%)
 rename xformers/csrc/attention/hip_fmha/{instances_tiled => instances}/ck_tiled_fmha_batched_infer_bp16_with_causalmask_with_attnbias_headdim_128.cpp (100%)
 rename xformers/csrc/attention/hip_fmha/{instances_tiled => instances}/ck_tiled_fmha_batched_infer_bp16_with_causalmask_with_attnbias_headdim_256.cpp (100%)
 rename xformers/csrc/attention/hip_fmha/{instances_tiled => instances}/ck_tiled_fmha_batched_infer_bp16_with_causalmask_with_attnbias_headdim_32.cpp (100%)
 rename xformers/csrc/attention/hip_fmha/{instances_tiled => instances}/ck_tiled_fmha_batched_infer_bp16_with_causalmask_with_attnbias_headdim_64.cpp (100%)
 rename xformers/csrc/attention/hip_fmha/{instances_tiled => instances}/ck_tiled_fmha_batched_infer_fp16_no_causalmask_no_attnbias_headdim_128.cpp (100%)
 rename xformers/csrc/attention/hip_fmha/{instances_tiled => instances}/ck_tiled_fmha_batched_infer_fp16_no_causalmask_no_attnbias_headdim_256.cpp (100%)
 rename xformers/csrc/attention/hip_fmha/{instances_tiled => instances}/ck_tiled_fmha_batched_infer_fp16_no_causalmask_no_attnbias_headdim_32.cpp (100%)
 rename xformers/csrc/attention/hip_fmha/{instances_tiled => instances}/ck_tiled_fmha_batched_infer_fp16_no_causalmask_no_attnbias_headdim_64.cpp (100%)
 rename xformers/csrc/attention/hip_fmha/{instances_tiled => instances}/ck_tiled_fmha_batched_infer_fp16_no_causalmask_with_attnbias_headdim_128.cpp (100%)
 rename xformers/csrc/attention/hip_fmha/{instances_tiled => instances}/ck_tiled_fmha_batched_infer_fp16_no_causalmask_with_attnbias_headdim_256.cpp (100%)
 rename xformers/csrc/attention/hip_fmha/{instances_tiled => instances}/ck_tiled_fmha_batched_infer_fp16_no_causalmask_with_attnbias_headdim_32.cpp (100%)
 rename xformers/csrc/attention/hip_fmha/{instances_tiled => instances}/ck_tiled_fmha_batched_infer_fp16_no_causalmask_with_attnbias_headdim_64.cpp (100%)
 rename xformers/csrc/attention/hip_fmha/{instances_tiled => instances}/ck_tiled_fmha_batched_infer_fp16_with_causalmask_no_attnbias_headdim_128.cpp (100%)
 rename xformers/csrc/attention/hip_fmha/{instances_tiled => instances}/ck_tiled_fmha_batched_infer_fp16_with_causalmask_no_attnbias_headdim_256.cpp (100%)
 rename xformers/csrc/attention/hip_fmha/{instances_tiled => instances}/ck_tiled_fmha_batched_infer_fp16_with_causalmask_no_attnbias_headdim_32.cpp (100%)
 rename xformers/csrc/attention/hip_fmha/{instances_tiled => instances}/ck_tiled_fmha_batched_infer_fp16_with_causalmask_no_attnbias_headdim_64.cpp (100%)
 rename xformers/csrc/attention/hip_fmha/{instances_tiled => instances}/ck_tiled_fmha_batched_infer_fp16_with_causalmask_with_attnbias_headdim_128.cpp (100%)
 rename xformers/csrc/attention/hip_fmha/{instances_tiled => instances}/ck_tiled_fmha_batched_infer_fp16_with_causalmask_with_attnbias_headdim_256.cpp (100%)
 rename xformers/csrc/attention/hip_fmha/{instances_tiled => instances}/ck_tiled_fmha_batched_infer_fp16_with_causalmask_with_attnbias_headdim_32.cpp (100%)
 rename xformers/csrc/attention/hip_fmha/{instances_tiled => instances}/ck_tiled_fmha_batched_infer_fp16_with_causalmask_with_attnbias_headdim_64.cpp (100%)
 rename xformers/csrc/attention/hip_fmha/{instances_tiled => instances}/ck_tiled_fmha_grouped_forward_bp16_no_causalmask_no_attnbias_headdim_128.cpp (100%)
 rename xformers/csrc/attention/hip_fmha/{instances_tiled => instances}/ck_tiled_fmha_grouped_forward_bp16_no_causalmask_no_attnbias_headdim_256.cpp (100%)
 rename xformers/csrc/attention/hip_fmha/{instances_tiled => instances}/ck_tiled_fmha_grouped_forward_bp16_no_causalmask_no_attnbias_headdim_32.cpp (100%)
 rename xformers/csrc/attention/hip_fmha/{instances_tiled => instances}/ck_tiled_fmha_grouped_forward_bp16_no_causalmask_no_attnbias_headdim_64.cpp (100%)
 rename xformers/csrc/attention/hip_fmha/{instances_tiled => instances}/ck_tiled_fmha_grouped_forward_bp16_no_causalmask_with_attnbias_headdim_128.cpp (100%)
 rename xformers/csrc/attention/hip_fmha/{instances_tiled => instances}/ck_tiled_fmha_grouped_forward_bp16_no_causalmask_with_attnbias_headdim_256.cpp (100%)
 rename xformers/csrc/attention/hip_fmha/{instances_tiled => instances}/ck_tiled_fmha_grouped_forward_bp16_no_causalmask_with_attnbias_headdim_32.cpp (100%)
 rename xformers/csrc/attention/hip_fmha/{instances_tiled => instances}/ck_tiled_fmha_grouped_forward_bp16_no_causalmask_with_attnbias_headdim_64.cpp (100%)
 rename xformers/csrc/attention/hip_fmha/{instances_tiled => instances}/ck_tiled_fmha_grouped_forward_bp16_with_causalmask_no_attnbias_headdim_128.cpp (100%)
 rename xformers/csrc/attention/hip_fmha/{instances_tiled => instances}/ck_tiled_fmha_grouped_forward_bp16_with_causalmask_no_attnbias_headdim_256.cpp (100%)
 rename xformers/csrc/attention/hip_fmha/{instances_tiled => instances}/ck_tiled_fmha_grouped_forward_bp16_with_causalmask_no_attnbias_headdim_32.cpp (100%)
 rename xformers/csrc/attention/hip_fmha/{instances_tiled => instances}/ck_tiled_fmha_grouped_forward_bp16_with_causalmask_no_attnbias_headdim_64.cpp (100%)
 rename xformers/csrc/attention/hip_fmha/{instances_tiled => instances}/ck_tiled_fmha_grouped_forward_bp16_with_causalmask_with_attnbias_headdim_128.cpp (100%)
 rename xformers/csrc/attention/hip_fmha/{instances_tiled => instances}/ck_tiled_fmha_grouped_forward_bp16_with_causalmask_with_attnbias_headdim_256.cpp (100%)
 rename xformers/csrc/attention/hip_fmha/{instances_tiled => instances}/ck_tiled_fmha_grouped_forward_bp16_with_causalmask_with_attnbias_headdim_32.cpp (100%)
 rename xformers/csrc/attention/hip_fmha/{instances_tiled => instances}/ck_tiled_fmha_grouped_forward_bp16_with_causalmask_with_attnbias_headdim_64.cpp (100%)
 rename xformers/csrc/attention/hip_fmha/{instances_tiled => instances}/ck_tiled_fmha_grouped_forward_fp16_no_causalmask_no_attnbias_headdim_128.cpp (100%)
 rename xformers/csrc/attention/hip_fmha/{instances_tiled => instances}/ck_tiled_fmha_grouped_forward_fp16_no_causalmask_no_attnbias_headdim_256.cpp (100%)
 rename xformers/csrc/attention/hip_fmha/{instances_tiled => instances}/ck_tiled_fmha_grouped_forward_fp16_no_causalmask_no_attnbias_headdim_32.cpp (100%)
 rename xformers/csrc/attention/hip_fmha/{instances_tiled => instances}/ck_tiled_fmha_grouped_forward_fp16_no_causalmask_no_attnbias_headdim_64.cpp (100%)
 rename xformers/csrc/attention/hip_fmha/{instances_tiled => instances}/ck_tiled_fmha_grouped_forward_fp16_no_causalmask_with_attnbias_headdim_128.cpp (100%)
 rename xformers/csrc/attention/hip_fmha/{instances_tiled => instances}/ck_tiled_fmha_grouped_forward_fp16_no_causalmask_with_attnbias_headdim_256.cpp (100%)
 rename xformers/csrc/attention/hip_fmha/{instances_tiled => instances}/ck_tiled_fmha_grouped_forward_fp16_no_causalmask_with_attnbias_headdim_32.cpp (100%)
 rename xformers/csrc/attention/hip_fmha/{instances_tiled => instances}/ck_tiled_fmha_grouped_forward_fp16_no_causalmask_with_attnbias_headdim_64.cpp (100%)
 rename xformers/csrc/attention/hip_fmha/{instances_tiled => instances}/ck_tiled_fmha_grouped_forward_fp16_with_causalmask_no_attnbias_headdim_128.cpp (100%)
 rename xformers/csrc/attention/hip_fmha/{instances_tiled => instances}/ck_tiled_fmha_grouped_forward_fp16_with_causalmask_no_attnbias_headdim_256.cpp (100%)
 rename xformers/csrc/attention/hip_fmha/{instances_tiled => instances}/ck_tiled_fmha_grouped_forward_fp16_with_causalmask_no_attnbias_headdim_32.cpp (100%)
 rename xformers/csrc/attention/hip_fmha/{instances_tiled => instances}/ck_tiled_fmha_grouped_forward_fp16_with_causalmask_no_attnbias_headdim_64.cpp (100%)
 rename xformers/csrc/attention/hip_fmha/{instances_tiled => instances}/ck_tiled_fmha_grouped_forward_fp16_with_causalmask_with_attnbias_headdim_128.cpp (100%)
 rename xformers/csrc/attention/hip_fmha/{instances_tiled => instances}/ck_tiled_fmha_grouped_forward_fp16_with_causalmask_with_attnbias_headdim_256.cpp (100%)
 rename xformers/csrc/attention/hip_fmha/{instances_tiled => instances}/ck_tiled_fmha_grouped_forward_fp16_with_causalmask_with_attnbias_headdim_32.cpp (100%)
 rename xformers/csrc/attention/hip_fmha/{instances_tiled => instances}/ck_tiled_fmha_grouped_forward_fp16_with_causalmask_with_attnbias_headdim_64.cpp (100%)
 rename xformers/csrc/attention/hip_fmha/{instances_tiled => instances}/ck_tiled_fmha_grouped_infer_bp16_no_causalmask_no_attnbias_headdim_128.cpp (100%)
 rename xformers/csrc/attention/hip_fmha/{instances_tiled => instances}/ck_tiled_fmha_grouped_infer_bp16_no_causalmask_no_attnbias_headdim_256.cpp (100%)
 rename xformers/csrc/attention/hip_fmha/{instances_tiled => instances}/ck_tiled_fmha_grouped_infer_bp16_no_causalmask_no_attnbias_headdim_32.cpp (100%)
 rename xformers/csrc/attention/hip_fmha/{instances_tiled => instances}/ck_tiled_fmha_grouped_infer_bp16_no_causalmask_no_attnbias_headdim_64.cpp (100%)
 rename xformers/csrc/attention/hip_fmha/{instances_tiled => instances}/ck_tiled_fmha_grouped_infer_bp16_no_causalmask_with_attnbias_headdim_128.cpp (100%)
 rename xformers/csrc/attention/hip_fmha/{instances_tiled => instances}/ck_tiled_fmha_grouped_infer_bp16_no_causalmask_with_attnbias_headdim_256.cpp (100%)
 rename xformers/csrc/attention/hip_fmha/{instances_tiled => instances}/ck_tiled_fmha_grouped_infer_bp16_no_causalmask_with_attnbias_headdim_32.cpp (100%)
 rename xformers/csrc/attention/hip_fmha/{instances_tiled => instances}/ck_tiled_fmha_grouped_infer_bp16_no_causalmask_with_attnbias_headdim_64.cpp (100%)
 rename xformers/csrc/attention/hip_fmha/{instances_tiled => instances}/ck_tiled_fmha_grouped_infer_bp16_with_causalmask_no_attnbias_headdim_128.cpp (100%)
 rename xformers/csrc/attention/hip_fmha/{instances_tiled => instances}/ck_tiled_fmha_grouped_infer_bp16_with_causalmask_no_attnbias_headdim_256.cpp (100%)
 rename xformers/csrc/attention/hip_fmha/{instances_tiled => instances}/ck_tiled_fmha_grouped_infer_bp16_with_causalmask_no_attnbias_headdim_32.cpp (100%)
 rename xformers/csrc/attention/hip_fmha/{instances_tiled => instances}/ck_tiled_fmha_grouped_infer_bp16_with_causalmask_no_attnbias_headdim_64.cpp (100%)
 rename xformers/csrc/attention/hip_fmha/{instances_tiled => instances}/ck_tiled_fmha_grouped_infer_bp16_with_causalmask_with_attnbias_headdim_128.cpp (100%)
 rename xformers/csrc/attention/hip_fmha/{instances_tiled => instances}/ck_tiled_fmha_grouped_infer_bp16_with_causalmask_with_attnbias_headdim_256.cpp (100%)
 rename xformers/csrc/attention/hip_fmha/{instances_tiled => instances}/ck_tiled_fmha_grouped_infer_bp16_with_causalmask_with_attnbias_headdim_32.cpp (100%)
 rename xformers/csrc/attention/hip_fmha/{instances_tiled => instances}/ck_tiled_fmha_grouped_infer_bp16_with_causalmask_with_attnbias_headdim_64.cpp (100%)
 rename xformers/csrc/attention/hip_fmha/{instances_tiled => instances}/ck_tiled_fmha_grouped_infer_fp16_no_causalmask_no_attnbias_headdim_128.cpp (100%)
 rename xformers/csrc/attention/hip_fmha/{instances_tiled => instances}/ck_tiled_fmha_grouped_infer_fp16_no_causalmask_no_attnbias_headdim_256.cpp (100%)
 rename xformers/csrc/attention/hip_fmha/{instances_tiled => instances}/ck_tiled_fmha_grouped_infer_fp16_no_causalmask_no_attnbias_headdim_32.cpp (100%)
 rename xformers/csrc/attention/hip_fmha/{instances_tiled => instances}/ck_tiled_fmha_grouped_infer_fp16_no_causalmask_no_attnbias_headdim_64.cpp (100%)
 rename xformers/csrc/attention/hip_fmha/{instances_tiled => instances}/ck_tiled_fmha_grouped_infer_fp16_no_causalmask_with_attnbias_headdim_128.cpp (100%)
 rename xformers/csrc/attention/hip_fmha/{instances_tiled => instances}/ck_tiled_fmha_grouped_infer_fp16_no_causalmask_with_attnbias_headdim_256.cpp (100%)
 rename xformers/csrc/attention/hip_fmha/{instances_tiled => instances}/ck_tiled_fmha_grouped_infer_fp16_no_causalmask_with_attnbias_headdim_32.cpp (100%)
 rename xformers/csrc/attention/hip_fmha/{instances_tiled => instances}/ck_tiled_fmha_grouped_infer_fp16_no_causalmask_with_attnbias_headdim_64.cpp (100%)
 rename xformers/csrc/attention/hip_fmha/{instances_tiled => instances}/ck_tiled_fmha_grouped_infer_fp16_with_causalmask_no_attnbias_headdim_128.cpp (100%)
 rename xformers/csrc/attention/hip_fmha/{instances_tiled => instances}/ck_tiled_fmha_grouped_infer_fp16_with_causalmask_no_attnbias_headdim_256.cpp (100%)
 rename xformers/csrc/attention/hip_fmha/{instances_tiled => instances}/ck_tiled_fmha_grouped_infer_fp16_with_causalmask_no_attnbias_headdim_32.cpp (100%)
 rename xformers/csrc/attention/hip_fmha/{instances_tiled => instances}/ck_tiled_fmha_grouped_infer_fp16_with_causalmask_no_attnbias_headdim_64.cpp (100%)
 rename xformers/csrc/attention/hip_fmha/{instances_tiled => instances}/ck_tiled_fmha_grouped_infer_fp16_with_causalmask_with_attnbias_headdim_128.cpp (100%)
 rename xformers/csrc/attention/hip_fmha/{instances_tiled => instances}/ck_tiled_fmha_grouped_infer_fp16_with_causalmask_with_attnbias_headdim_256.cpp (100%)
 rename xformers/csrc/attention/hip_fmha/{instances_tiled => instances}/ck_tiled_fmha_grouped_infer_fp16_with_causalmask_with_attnbias_headdim_32.cpp (100%)
 rename xformers/csrc/attention/hip_fmha/{instances_tiled => instances}/ck_tiled_fmha_grouped_infer_fp16_with_causalmask_with_attnbias_headdim_64.cpp (100%)

diff --git a/setup.py b/setup.py
index e1875123a..997853700 100644
--- a/setup.py
+++ b/setup.py
@@ -334,7 +334,7 @@ def get_extensions():
             extensions_dir,
             "attention",
             "hip_fmha",
-            "instances_tiled",
+            "instances",
             "ck_tiled_fmha_*.cpp",
         ),
         recursive=False,
diff --git "a/xformers/csrc/attention/hip_fmha/instances_tiled/\\" "b/xformers/csrc/attention/hip_fmha/instances/\\"
similarity index 100%
rename from "xformers/csrc/attention/hip_fmha/instances_tiled/\\"
rename to "xformers/csrc/attention/hip_fmha/instances/\\"
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_forward_bp16_no_causalmask_no_attnbias_headdim_128.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_forward_bp16_no_causalmask_no_attnbias_headdim_128.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_forward_bp16_no_causalmask_no_attnbias_headdim_128.cpp
rename to xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_forward_bp16_no_causalmask_no_attnbias_headdim_128.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_forward_bp16_no_causalmask_no_attnbias_headdim_256.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_forward_bp16_no_causalmask_no_attnbias_headdim_256.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_forward_bp16_no_causalmask_no_attnbias_headdim_256.cpp
rename to xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_forward_bp16_no_causalmask_no_attnbias_headdim_256.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_forward_bp16_no_causalmask_no_attnbias_headdim_32.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_forward_bp16_no_causalmask_no_attnbias_headdim_32.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_forward_bp16_no_causalmask_no_attnbias_headdim_32.cpp
rename to xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_forward_bp16_no_causalmask_no_attnbias_headdim_32.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_forward_bp16_no_causalmask_no_attnbias_headdim_64.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_forward_bp16_no_causalmask_no_attnbias_headdim_64.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_forward_bp16_no_causalmask_no_attnbias_headdim_64.cpp
rename to xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_forward_bp16_no_causalmask_no_attnbias_headdim_64.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_forward_bp16_no_causalmask_with_attnbias_headdim_128.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_forward_bp16_no_causalmask_with_attnbias_headdim_128.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_forward_bp16_no_causalmask_with_attnbias_headdim_128.cpp
rename to xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_forward_bp16_no_causalmask_with_attnbias_headdim_128.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_forward_bp16_no_causalmask_with_attnbias_headdim_256.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_forward_bp16_no_causalmask_with_attnbias_headdim_256.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_forward_bp16_no_causalmask_with_attnbias_headdim_256.cpp
rename to xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_forward_bp16_no_causalmask_with_attnbias_headdim_256.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_forward_bp16_no_causalmask_with_attnbias_headdim_32.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_forward_bp16_no_causalmask_with_attnbias_headdim_32.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_forward_bp16_no_causalmask_with_attnbias_headdim_32.cpp
rename to xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_forward_bp16_no_causalmask_with_attnbias_headdim_32.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_forward_bp16_no_causalmask_with_attnbias_headdim_64.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_forward_bp16_no_causalmask_with_attnbias_headdim_64.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_forward_bp16_no_causalmask_with_attnbias_headdim_64.cpp
rename to xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_forward_bp16_no_causalmask_with_attnbias_headdim_64.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_forward_bp16_with_causalmask_no_attnbias_headdim_128.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_forward_bp16_with_causalmask_no_attnbias_headdim_128.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_forward_bp16_with_causalmask_no_attnbias_headdim_128.cpp
rename to xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_forward_bp16_with_causalmask_no_attnbias_headdim_128.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_forward_bp16_with_causalmask_no_attnbias_headdim_256.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_forward_bp16_with_causalmask_no_attnbias_headdim_256.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_forward_bp16_with_causalmask_no_attnbias_headdim_256.cpp
rename to xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_forward_bp16_with_causalmask_no_attnbias_headdim_256.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_forward_bp16_with_causalmask_no_attnbias_headdim_32.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_forward_bp16_with_causalmask_no_attnbias_headdim_32.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_forward_bp16_with_causalmask_no_attnbias_headdim_32.cpp
rename to xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_forward_bp16_with_causalmask_no_attnbias_headdim_32.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_forward_bp16_with_causalmask_no_attnbias_headdim_64.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_forward_bp16_with_causalmask_no_attnbias_headdim_64.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_forward_bp16_with_causalmask_no_attnbias_headdim_64.cpp
rename to xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_forward_bp16_with_causalmask_no_attnbias_headdim_64.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_forward_bp16_with_causalmask_with_attnbias_headdim_128.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_forward_bp16_with_causalmask_with_attnbias_headdim_128.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_forward_bp16_with_causalmask_with_attnbias_headdim_128.cpp
rename to xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_forward_bp16_with_causalmask_with_attnbias_headdim_128.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_forward_bp16_with_causalmask_with_attnbias_headdim_256.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_forward_bp16_with_causalmask_with_attnbias_headdim_256.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_forward_bp16_with_causalmask_with_attnbias_headdim_256.cpp
rename to xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_forward_bp16_with_causalmask_with_attnbias_headdim_256.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_forward_bp16_with_causalmask_with_attnbias_headdim_32.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_forward_bp16_with_causalmask_with_attnbias_headdim_32.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_forward_bp16_with_causalmask_with_attnbias_headdim_32.cpp
rename to xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_forward_bp16_with_causalmask_with_attnbias_headdim_32.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_forward_bp16_with_causalmask_with_attnbias_headdim_64.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_forward_bp16_with_causalmask_with_attnbias_headdim_64.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_forward_bp16_with_causalmask_with_attnbias_headdim_64.cpp
rename to xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_forward_bp16_with_causalmask_with_attnbias_headdim_64.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_forward_fp16_no_causalmask_no_attnbias_headdim_128.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_forward_fp16_no_causalmask_no_attnbias_headdim_128.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_forward_fp16_no_causalmask_no_attnbias_headdim_128.cpp
rename to xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_forward_fp16_no_causalmask_no_attnbias_headdim_128.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_forward_fp16_no_causalmask_no_attnbias_headdim_256.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_forward_fp16_no_causalmask_no_attnbias_headdim_256.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_forward_fp16_no_causalmask_no_attnbias_headdim_256.cpp
rename to xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_forward_fp16_no_causalmask_no_attnbias_headdim_256.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_forward_fp16_no_causalmask_no_attnbias_headdim_32.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_forward_fp16_no_causalmask_no_attnbias_headdim_32.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_forward_fp16_no_causalmask_no_attnbias_headdim_32.cpp
rename to xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_forward_fp16_no_causalmask_no_attnbias_headdim_32.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_forward_fp16_no_causalmask_no_attnbias_headdim_64.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_forward_fp16_no_causalmask_no_attnbias_headdim_64.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_forward_fp16_no_causalmask_no_attnbias_headdim_64.cpp
rename to xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_forward_fp16_no_causalmask_no_attnbias_headdim_64.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_forward_fp16_no_causalmask_with_attnbias_headdim_128.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_forward_fp16_no_causalmask_with_attnbias_headdim_128.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_forward_fp16_no_causalmask_with_attnbias_headdim_128.cpp
rename to xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_forward_fp16_no_causalmask_with_attnbias_headdim_128.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_forward_fp16_no_causalmask_with_attnbias_headdim_256.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_forward_fp16_no_causalmask_with_attnbias_headdim_256.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_forward_fp16_no_causalmask_with_attnbias_headdim_256.cpp
rename to xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_forward_fp16_no_causalmask_with_attnbias_headdim_256.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_forward_fp16_no_causalmask_with_attnbias_headdim_32.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_forward_fp16_no_causalmask_with_attnbias_headdim_32.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_forward_fp16_no_causalmask_with_attnbias_headdim_32.cpp
rename to xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_forward_fp16_no_causalmask_with_attnbias_headdim_32.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_forward_fp16_no_causalmask_with_attnbias_headdim_64.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_forward_fp16_no_causalmask_with_attnbias_headdim_64.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_forward_fp16_no_causalmask_with_attnbias_headdim_64.cpp
rename to xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_forward_fp16_no_causalmask_with_attnbias_headdim_64.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_forward_fp16_with_causalmask_no_attnbias_headdim_128.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_forward_fp16_with_causalmask_no_attnbias_headdim_128.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_forward_fp16_with_causalmask_no_attnbias_headdim_128.cpp
rename to xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_forward_fp16_with_causalmask_no_attnbias_headdim_128.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_forward_fp16_with_causalmask_no_attnbias_headdim_256.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_forward_fp16_with_causalmask_no_attnbias_headdim_256.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_forward_fp16_with_causalmask_no_attnbias_headdim_256.cpp
rename to xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_forward_fp16_with_causalmask_no_attnbias_headdim_256.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_forward_fp16_with_causalmask_no_attnbias_headdim_32.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_forward_fp16_with_causalmask_no_attnbias_headdim_32.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_forward_fp16_with_causalmask_no_attnbias_headdim_32.cpp
rename to xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_forward_fp16_with_causalmask_no_attnbias_headdim_32.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_forward_fp16_with_causalmask_no_attnbias_headdim_64.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_forward_fp16_with_causalmask_no_attnbias_headdim_64.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_forward_fp16_with_causalmask_no_attnbias_headdim_64.cpp
rename to xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_forward_fp16_with_causalmask_no_attnbias_headdim_64.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_forward_fp16_with_causalmask_with_attnbias_headdim_128.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_forward_fp16_with_causalmask_with_attnbias_headdim_128.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_forward_fp16_with_causalmask_with_attnbias_headdim_128.cpp
rename to xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_forward_fp16_with_causalmask_with_attnbias_headdim_128.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_forward_fp16_with_causalmask_with_attnbias_headdim_256.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_forward_fp16_with_causalmask_with_attnbias_headdim_256.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_forward_fp16_with_causalmask_with_attnbias_headdim_256.cpp
rename to xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_forward_fp16_with_causalmask_with_attnbias_headdim_256.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_forward_fp16_with_causalmask_with_attnbias_headdim_32.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_forward_fp16_with_causalmask_with_attnbias_headdim_32.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_forward_fp16_with_causalmask_with_attnbias_headdim_32.cpp
rename to xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_forward_fp16_with_causalmask_with_attnbias_headdim_32.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_forward_fp16_with_causalmask_with_attnbias_headdim_64.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_forward_fp16_with_causalmask_with_attnbias_headdim_64.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_forward_fp16_with_causalmask_with_attnbias_headdim_64.cpp
rename to xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_forward_fp16_with_causalmask_with_attnbias_headdim_64.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_bp16_no_causalmask_no_attnbias_headdim_128.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_infer_bp16_no_causalmask_no_attnbias_headdim_128.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_bp16_no_causalmask_no_attnbias_headdim_128.cpp
rename to xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_infer_bp16_no_causalmask_no_attnbias_headdim_128.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_bp16_no_causalmask_no_attnbias_headdim_256.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_infer_bp16_no_causalmask_no_attnbias_headdim_256.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_bp16_no_causalmask_no_attnbias_headdim_256.cpp
rename to xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_infer_bp16_no_causalmask_no_attnbias_headdim_256.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_bp16_no_causalmask_no_attnbias_headdim_32.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_infer_bp16_no_causalmask_no_attnbias_headdim_32.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_bp16_no_causalmask_no_attnbias_headdim_32.cpp
rename to xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_infer_bp16_no_causalmask_no_attnbias_headdim_32.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_bp16_no_causalmask_no_attnbias_headdim_64.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_infer_bp16_no_causalmask_no_attnbias_headdim_64.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_bp16_no_causalmask_no_attnbias_headdim_64.cpp
rename to xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_infer_bp16_no_causalmask_no_attnbias_headdim_64.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_bp16_no_causalmask_with_attnbias_headdim_128.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_infer_bp16_no_causalmask_with_attnbias_headdim_128.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_bp16_no_causalmask_with_attnbias_headdim_128.cpp
rename to xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_infer_bp16_no_causalmask_with_attnbias_headdim_128.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_bp16_no_causalmask_with_attnbias_headdim_256.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_infer_bp16_no_causalmask_with_attnbias_headdim_256.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_bp16_no_causalmask_with_attnbias_headdim_256.cpp
rename to xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_infer_bp16_no_causalmask_with_attnbias_headdim_256.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_bp16_no_causalmask_with_attnbias_headdim_32.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_infer_bp16_no_causalmask_with_attnbias_headdim_32.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_bp16_no_causalmask_with_attnbias_headdim_32.cpp
rename to xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_infer_bp16_no_causalmask_with_attnbias_headdim_32.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_bp16_no_causalmask_with_attnbias_headdim_64.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_infer_bp16_no_causalmask_with_attnbias_headdim_64.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_bp16_no_causalmask_with_attnbias_headdim_64.cpp
rename to xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_infer_bp16_no_causalmask_with_attnbias_headdim_64.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_bp16_with_causalmask_no_attnbias_headdim_128.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_infer_bp16_with_causalmask_no_attnbias_headdim_128.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_bp16_with_causalmask_no_attnbias_headdim_128.cpp
rename to xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_infer_bp16_with_causalmask_no_attnbias_headdim_128.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_bp16_with_causalmask_no_attnbias_headdim_256.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_infer_bp16_with_causalmask_no_attnbias_headdim_256.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_bp16_with_causalmask_no_attnbias_headdim_256.cpp
rename to xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_infer_bp16_with_causalmask_no_attnbias_headdim_256.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_bp16_with_causalmask_no_attnbias_headdim_32.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_infer_bp16_with_causalmask_no_attnbias_headdim_32.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_bp16_with_causalmask_no_attnbias_headdim_32.cpp
rename to xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_infer_bp16_with_causalmask_no_attnbias_headdim_32.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_bp16_with_causalmask_no_attnbias_headdim_64.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_infer_bp16_with_causalmask_no_attnbias_headdim_64.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_bp16_with_causalmask_no_attnbias_headdim_64.cpp
rename to xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_infer_bp16_with_causalmask_no_attnbias_headdim_64.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_bp16_with_causalmask_with_attnbias_headdim_128.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_infer_bp16_with_causalmask_with_attnbias_headdim_128.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_bp16_with_causalmask_with_attnbias_headdim_128.cpp
rename to xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_infer_bp16_with_causalmask_with_attnbias_headdim_128.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_bp16_with_causalmask_with_attnbias_headdim_256.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_infer_bp16_with_causalmask_with_attnbias_headdim_256.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_bp16_with_causalmask_with_attnbias_headdim_256.cpp
rename to xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_infer_bp16_with_causalmask_with_attnbias_headdim_256.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_bp16_with_causalmask_with_attnbias_headdim_32.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_infer_bp16_with_causalmask_with_attnbias_headdim_32.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_bp16_with_causalmask_with_attnbias_headdim_32.cpp
rename to xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_infer_bp16_with_causalmask_with_attnbias_headdim_32.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_bp16_with_causalmask_with_attnbias_headdim_64.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_infer_bp16_with_causalmask_with_attnbias_headdim_64.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_bp16_with_causalmask_with_attnbias_headdim_64.cpp
rename to xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_infer_bp16_with_causalmask_with_attnbias_headdim_64.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_fp16_no_causalmask_no_attnbias_headdim_128.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_infer_fp16_no_causalmask_no_attnbias_headdim_128.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_fp16_no_causalmask_no_attnbias_headdim_128.cpp
rename to xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_infer_fp16_no_causalmask_no_attnbias_headdim_128.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_fp16_no_causalmask_no_attnbias_headdim_256.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_infer_fp16_no_causalmask_no_attnbias_headdim_256.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_fp16_no_causalmask_no_attnbias_headdim_256.cpp
rename to xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_infer_fp16_no_causalmask_no_attnbias_headdim_256.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_fp16_no_causalmask_no_attnbias_headdim_32.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_infer_fp16_no_causalmask_no_attnbias_headdim_32.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_fp16_no_causalmask_no_attnbias_headdim_32.cpp
rename to xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_infer_fp16_no_causalmask_no_attnbias_headdim_32.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_fp16_no_causalmask_no_attnbias_headdim_64.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_infer_fp16_no_causalmask_no_attnbias_headdim_64.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_fp16_no_causalmask_no_attnbias_headdim_64.cpp
rename to xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_infer_fp16_no_causalmask_no_attnbias_headdim_64.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_fp16_no_causalmask_with_attnbias_headdim_128.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_infer_fp16_no_causalmask_with_attnbias_headdim_128.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_fp16_no_causalmask_with_attnbias_headdim_128.cpp
rename to xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_infer_fp16_no_causalmask_with_attnbias_headdim_128.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_fp16_no_causalmask_with_attnbias_headdim_256.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_infer_fp16_no_causalmask_with_attnbias_headdim_256.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_fp16_no_causalmask_with_attnbias_headdim_256.cpp
rename to xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_infer_fp16_no_causalmask_with_attnbias_headdim_256.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_fp16_no_causalmask_with_attnbias_headdim_32.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_infer_fp16_no_causalmask_with_attnbias_headdim_32.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_fp16_no_causalmask_with_attnbias_headdim_32.cpp
rename to xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_infer_fp16_no_causalmask_with_attnbias_headdim_32.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_fp16_no_causalmask_with_attnbias_headdim_64.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_infer_fp16_no_causalmask_with_attnbias_headdim_64.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_fp16_no_causalmask_with_attnbias_headdim_64.cpp
rename to xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_infer_fp16_no_causalmask_with_attnbias_headdim_64.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_fp16_with_causalmask_no_attnbias_headdim_128.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_infer_fp16_with_causalmask_no_attnbias_headdim_128.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_fp16_with_causalmask_no_attnbias_headdim_128.cpp
rename to xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_infer_fp16_with_causalmask_no_attnbias_headdim_128.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_fp16_with_causalmask_no_attnbias_headdim_256.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_infer_fp16_with_causalmask_no_attnbias_headdim_256.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_fp16_with_causalmask_no_attnbias_headdim_256.cpp
rename to xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_infer_fp16_with_causalmask_no_attnbias_headdim_256.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_fp16_with_causalmask_no_attnbias_headdim_32.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_infer_fp16_with_causalmask_no_attnbias_headdim_32.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_fp16_with_causalmask_no_attnbias_headdim_32.cpp
rename to xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_infer_fp16_with_causalmask_no_attnbias_headdim_32.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_fp16_with_causalmask_no_attnbias_headdim_64.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_infer_fp16_with_causalmask_no_attnbias_headdim_64.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_fp16_with_causalmask_no_attnbias_headdim_64.cpp
rename to xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_infer_fp16_with_causalmask_no_attnbias_headdim_64.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_fp16_with_causalmask_with_attnbias_headdim_128.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_infer_fp16_with_causalmask_with_attnbias_headdim_128.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_fp16_with_causalmask_with_attnbias_headdim_128.cpp
rename to xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_infer_fp16_with_causalmask_with_attnbias_headdim_128.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_fp16_with_causalmask_with_attnbias_headdim_256.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_infer_fp16_with_causalmask_with_attnbias_headdim_256.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_fp16_with_causalmask_with_attnbias_headdim_256.cpp
rename to xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_infer_fp16_with_causalmask_with_attnbias_headdim_256.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_fp16_with_causalmask_with_attnbias_headdim_32.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_infer_fp16_with_causalmask_with_attnbias_headdim_32.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_fp16_with_causalmask_with_attnbias_headdim_32.cpp
rename to xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_infer_fp16_with_causalmask_with_attnbias_headdim_32.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_fp16_with_causalmask_with_attnbias_headdim_64.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_infer_fp16_with_causalmask_with_attnbias_headdim_64.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_batched_infer_fp16_with_causalmask_with_attnbias_headdim_64.cpp
rename to xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_infer_fp16_with_causalmask_with_attnbias_headdim_64.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_forward_bp16_no_causalmask_no_attnbias_headdim_128.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_forward_bp16_no_causalmask_no_attnbias_headdim_128.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_forward_bp16_no_causalmask_no_attnbias_headdim_128.cpp
rename to xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_forward_bp16_no_causalmask_no_attnbias_headdim_128.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_forward_bp16_no_causalmask_no_attnbias_headdim_256.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_forward_bp16_no_causalmask_no_attnbias_headdim_256.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_forward_bp16_no_causalmask_no_attnbias_headdim_256.cpp
rename to xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_forward_bp16_no_causalmask_no_attnbias_headdim_256.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_forward_bp16_no_causalmask_no_attnbias_headdim_32.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_forward_bp16_no_causalmask_no_attnbias_headdim_32.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_forward_bp16_no_causalmask_no_attnbias_headdim_32.cpp
rename to xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_forward_bp16_no_causalmask_no_attnbias_headdim_32.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_forward_bp16_no_causalmask_no_attnbias_headdim_64.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_forward_bp16_no_causalmask_no_attnbias_headdim_64.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_forward_bp16_no_causalmask_no_attnbias_headdim_64.cpp
rename to xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_forward_bp16_no_causalmask_no_attnbias_headdim_64.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_forward_bp16_no_causalmask_with_attnbias_headdim_128.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_forward_bp16_no_causalmask_with_attnbias_headdim_128.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_forward_bp16_no_causalmask_with_attnbias_headdim_128.cpp
rename to xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_forward_bp16_no_causalmask_with_attnbias_headdim_128.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_forward_bp16_no_causalmask_with_attnbias_headdim_256.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_forward_bp16_no_causalmask_with_attnbias_headdim_256.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_forward_bp16_no_causalmask_with_attnbias_headdim_256.cpp
rename to xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_forward_bp16_no_causalmask_with_attnbias_headdim_256.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_forward_bp16_no_causalmask_with_attnbias_headdim_32.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_forward_bp16_no_causalmask_with_attnbias_headdim_32.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_forward_bp16_no_causalmask_with_attnbias_headdim_32.cpp
rename to xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_forward_bp16_no_causalmask_with_attnbias_headdim_32.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_forward_bp16_no_causalmask_with_attnbias_headdim_64.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_forward_bp16_no_causalmask_with_attnbias_headdim_64.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_forward_bp16_no_causalmask_with_attnbias_headdim_64.cpp
rename to xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_forward_bp16_no_causalmask_with_attnbias_headdim_64.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_forward_bp16_with_causalmask_no_attnbias_headdim_128.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_forward_bp16_with_causalmask_no_attnbias_headdim_128.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_forward_bp16_with_causalmask_no_attnbias_headdim_128.cpp
rename to xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_forward_bp16_with_causalmask_no_attnbias_headdim_128.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_forward_bp16_with_causalmask_no_attnbias_headdim_256.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_forward_bp16_with_causalmask_no_attnbias_headdim_256.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_forward_bp16_with_causalmask_no_attnbias_headdim_256.cpp
rename to xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_forward_bp16_with_causalmask_no_attnbias_headdim_256.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_forward_bp16_with_causalmask_no_attnbias_headdim_32.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_forward_bp16_with_causalmask_no_attnbias_headdim_32.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_forward_bp16_with_causalmask_no_attnbias_headdim_32.cpp
rename to xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_forward_bp16_with_causalmask_no_attnbias_headdim_32.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_forward_bp16_with_causalmask_no_attnbias_headdim_64.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_forward_bp16_with_causalmask_no_attnbias_headdim_64.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_forward_bp16_with_causalmask_no_attnbias_headdim_64.cpp
rename to xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_forward_bp16_with_causalmask_no_attnbias_headdim_64.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_forward_bp16_with_causalmask_with_attnbias_headdim_128.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_forward_bp16_with_causalmask_with_attnbias_headdim_128.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_forward_bp16_with_causalmask_with_attnbias_headdim_128.cpp
rename to xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_forward_bp16_with_causalmask_with_attnbias_headdim_128.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_forward_bp16_with_causalmask_with_attnbias_headdim_256.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_forward_bp16_with_causalmask_with_attnbias_headdim_256.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_forward_bp16_with_causalmask_with_attnbias_headdim_256.cpp
rename to xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_forward_bp16_with_causalmask_with_attnbias_headdim_256.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_forward_bp16_with_causalmask_with_attnbias_headdim_32.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_forward_bp16_with_causalmask_with_attnbias_headdim_32.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_forward_bp16_with_causalmask_with_attnbias_headdim_32.cpp
rename to xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_forward_bp16_with_causalmask_with_attnbias_headdim_32.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_forward_bp16_with_causalmask_with_attnbias_headdim_64.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_forward_bp16_with_causalmask_with_attnbias_headdim_64.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_forward_bp16_with_causalmask_with_attnbias_headdim_64.cpp
rename to xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_forward_bp16_with_causalmask_with_attnbias_headdim_64.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_forward_fp16_no_causalmask_no_attnbias_headdim_128.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_forward_fp16_no_causalmask_no_attnbias_headdim_128.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_forward_fp16_no_causalmask_no_attnbias_headdim_128.cpp
rename to xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_forward_fp16_no_causalmask_no_attnbias_headdim_128.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_forward_fp16_no_causalmask_no_attnbias_headdim_256.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_forward_fp16_no_causalmask_no_attnbias_headdim_256.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_forward_fp16_no_causalmask_no_attnbias_headdim_256.cpp
rename to xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_forward_fp16_no_causalmask_no_attnbias_headdim_256.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_forward_fp16_no_causalmask_no_attnbias_headdim_32.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_forward_fp16_no_causalmask_no_attnbias_headdim_32.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_forward_fp16_no_causalmask_no_attnbias_headdim_32.cpp
rename to xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_forward_fp16_no_causalmask_no_attnbias_headdim_32.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_forward_fp16_no_causalmask_no_attnbias_headdim_64.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_forward_fp16_no_causalmask_no_attnbias_headdim_64.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_forward_fp16_no_causalmask_no_attnbias_headdim_64.cpp
rename to xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_forward_fp16_no_causalmask_no_attnbias_headdim_64.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_forward_fp16_no_causalmask_with_attnbias_headdim_128.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_forward_fp16_no_causalmask_with_attnbias_headdim_128.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_forward_fp16_no_causalmask_with_attnbias_headdim_128.cpp
rename to xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_forward_fp16_no_causalmask_with_attnbias_headdim_128.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_forward_fp16_no_causalmask_with_attnbias_headdim_256.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_forward_fp16_no_causalmask_with_attnbias_headdim_256.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_forward_fp16_no_causalmask_with_attnbias_headdim_256.cpp
rename to xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_forward_fp16_no_causalmask_with_attnbias_headdim_256.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_forward_fp16_no_causalmask_with_attnbias_headdim_32.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_forward_fp16_no_causalmask_with_attnbias_headdim_32.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_forward_fp16_no_causalmask_with_attnbias_headdim_32.cpp
rename to xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_forward_fp16_no_causalmask_with_attnbias_headdim_32.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_forward_fp16_no_causalmask_with_attnbias_headdim_64.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_forward_fp16_no_causalmask_with_attnbias_headdim_64.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_forward_fp16_no_causalmask_with_attnbias_headdim_64.cpp
rename to xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_forward_fp16_no_causalmask_with_attnbias_headdim_64.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_forward_fp16_with_causalmask_no_attnbias_headdim_128.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_forward_fp16_with_causalmask_no_attnbias_headdim_128.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_forward_fp16_with_causalmask_no_attnbias_headdim_128.cpp
rename to xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_forward_fp16_with_causalmask_no_attnbias_headdim_128.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_forward_fp16_with_causalmask_no_attnbias_headdim_256.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_forward_fp16_with_causalmask_no_attnbias_headdim_256.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_forward_fp16_with_causalmask_no_attnbias_headdim_256.cpp
rename to xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_forward_fp16_with_causalmask_no_attnbias_headdim_256.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_forward_fp16_with_causalmask_no_attnbias_headdim_32.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_forward_fp16_with_causalmask_no_attnbias_headdim_32.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_forward_fp16_with_causalmask_no_attnbias_headdim_32.cpp
rename to xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_forward_fp16_with_causalmask_no_attnbias_headdim_32.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_forward_fp16_with_causalmask_no_attnbias_headdim_64.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_forward_fp16_with_causalmask_no_attnbias_headdim_64.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_forward_fp16_with_causalmask_no_attnbias_headdim_64.cpp
rename to xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_forward_fp16_with_causalmask_no_attnbias_headdim_64.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_forward_fp16_with_causalmask_with_attnbias_headdim_128.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_forward_fp16_with_causalmask_with_attnbias_headdim_128.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_forward_fp16_with_causalmask_with_attnbias_headdim_128.cpp
rename to xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_forward_fp16_with_causalmask_with_attnbias_headdim_128.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_forward_fp16_with_causalmask_with_attnbias_headdim_256.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_forward_fp16_with_causalmask_with_attnbias_headdim_256.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_forward_fp16_with_causalmask_with_attnbias_headdim_256.cpp
rename to xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_forward_fp16_with_causalmask_with_attnbias_headdim_256.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_forward_fp16_with_causalmask_with_attnbias_headdim_32.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_forward_fp16_with_causalmask_with_attnbias_headdim_32.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_forward_fp16_with_causalmask_with_attnbias_headdim_32.cpp
rename to xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_forward_fp16_with_causalmask_with_attnbias_headdim_32.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_forward_fp16_with_causalmask_with_attnbias_headdim_64.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_forward_fp16_with_causalmask_with_attnbias_headdim_64.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_forward_fp16_with_causalmask_with_attnbias_headdim_64.cpp
rename to xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_forward_fp16_with_causalmask_with_attnbias_headdim_64.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_bp16_no_causalmask_no_attnbias_headdim_128.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_infer_bp16_no_causalmask_no_attnbias_headdim_128.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_bp16_no_causalmask_no_attnbias_headdim_128.cpp
rename to xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_infer_bp16_no_causalmask_no_attnbias_headdim_128.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_bp16_no_causalmask_no_attnbias_headdim_256.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_infer_bp16_no_causalmask_no_attnbias_headdim_256.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_bp16_no_causalmask_no_attnbias_headdim_256.cpp
rename to xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_infer_bp16_no_causalmask_no_attnbias_headdim_256.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_bp16_no_causalmask_no_attnbias_headdim_32.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_infer_bp16_no_causalmask_no_attnbias_headdim_32.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_bp16_no_causalmask_no_attnbias_headdim_32.cpp
rename to xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_infer_bp16_no_causalmask_no_attnbias_headdim_32.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_bp16_no_causalmask_no_attnbias_headdim_64.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_infer_bp16_no_causalmask_no_attnbias_headdim_64.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_bp16_no_causalmask_no_attnbias_headdim_64.cpp
rename to xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_infer_bp16_no_causalmask_no_attnbias_headdim_64.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_bp16_no_causalmask_with_attnbias_headdim_128.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_infer_bp16_no_causalmask_with_attnbias_headdim_128.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_bp16_no_causalmask_with_attnbias_headdim_128.cpp
rename to xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_infer_bp16_no_causalmask_with_attnbias_headdim_128.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_bp16_no_causalmask_with_attnbias_headdim_256.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_infer_bp16_no_causalmask_with_attnbias_headdim_256.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_bp16_no_causalmask_with_attnbias_headdim_256.cpp
rename to xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_infer_bp16_no_causalmask_with_attnbias_headdim_256.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_bp16_no_causalmask_with_attnbias_headdim_32.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_infer_bp16_no_causalmask_with_attnbias_headdim_32.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_bp16_no_causalmask_with_attnbias_headdim_32.cpp
rename to xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_infer_bp16_no_causalmask_with_attnbias_headdim_32.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_bp16_no_causalmask_with_attnbias_headdim_64.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_infer_bp16_no_causalmask_with_attnbias_headdim_64.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_bp16_no_causalmask_with_attnbias_headdim_64.cpp
rename to xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_infer_bp16_no_causalmask_with_attnbias_headdim_64.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_bp16_with_causalmask_no_attnbias_headdim_128.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_infer_bp16_with_causalmask_no_attnbias_headdim_128.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_bp16_with_causalmask_no_attnbias_headdim_128.cpp
rename to xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_infer_bp16_with_causalmask_no_attnbias_headdim_128.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_bp16_with_causalmask_no_attnbias_headdim_256.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_infer_bp16_with_causalmask_no_attnbias_headdim_256.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_bp16_with_causalmask_no_attnbias_headdim_256.cpp
rename to xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_infer_bp16_with_causalmask_no_attnbias_headdim_256.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_bp16_with_causalmask_no_attnbias_headdim_32.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_infer_bp16_with_causalmask_no_attnbias_headdim_32.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_bp16_with_causalmask_no_attnbias_headdim_32.cpp
rename to xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_infer_bp16_with_causalmask_no_attnbias_headdim_32.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_bp16_with_causalmask_no_attnbias_headdim_64.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_infer_bp16_with_causalmask_no_attnbias_headdim_64.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_bp16_with_causalmask_no_attnbias_headdim_64.cpp
rename to xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_infer_bp16_with_causalmask_no_attnbias_headdim_64.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_bp16_with_causalmask_with_attnbias_headdim_128.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_infer_bp16_with_causalmask_with_attnbias_headdim_128.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_bp16_with_causalmask_with_attnbias_headdim_128.cpp
rename to xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_infer_bp16_with_causalmask_with_attnbias_headdim_128.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_bp16_with_causalmask_with_attnbias_headdim_256.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_infer_bp16_with_causalmask_with_attnbias_headdim_256.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_bp16_with_causalmask_with_attnbias_headdim_256.cpp
rename to xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_infer_bp16_with_causalmask_with_attnbias_headdim_256.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_bp16_with_causalmask_with_attnbias_headdim_32.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_infer_bp16_with_causalmask_with_attnbias_headdim_32.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_bp16_with_causalmask_with_attnbias_headdim_32.cpp
rename to xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_infer_bp16_with_causalmask_with_attnbias_headdim_32.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_bp16_with_causalmask_with_attnbias_headdim_64.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_infer_bp16_with_causalmask_with_attnbias_headdim_64.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_bp16_with_causalmask_with_attnbias_headdim_64.cpp
rename to xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_infer_bp16_with_causalmask_with_attnbias_headdim_64.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_fp16_no_causalmask_no_attnbias_headdim_128.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_infer_fp16_no_causalmask_no_attnbias_headdim_128.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_fp16_no_causalmask_no_attnbias_headdim_128.cpp
rename to xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_infer_fp16_no_causalmask_no_attnbias_headdim_128.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_fp16_no_causalmask_no_attnbias_headdim_256.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_infer_fp16_no_causalmask_no_attnbias_headdim_256.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_fp16_no_causalmask_no_attnbias_headdim_256.cpp
rename to xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_infer_fp16_no_causalmask_no_attnbias_headdim_256.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_fp16_no_causalmask_no_attnbias_headdim_32.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_infer_fp16_no_causalmask_no_attnbias_headdim_32.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_fp16_no_causalmask_no_attnbias_headdim_32.cpp
rename to xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_infer_fp16_no_causalmask_no_attnbias_headdim_32.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_fp16_no_causalmask_no_attnbias_headdim_64.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_infer_fp16_no_causalmask_no_attnbias_headdim_64.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_fp16_no_causalmask_no_attnbias_headdim_64.cpp
rename to xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_infer_fp16_no_causalmask_no_attnbias_headdim_64.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_fp16_no_causalmask_with_attnbias_headdim_128.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_infer_fp16_no_causalmask_with_attnbias_headdim_128.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_fp16_no_causalmask_with_attnbias_headdim_128.cpp
rename to xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_infer_fp16_no_causalmask_with_attnbias_headdim_128.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_fp16_no_causalmask_with_attnbias_headdim_256.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_infer_fp16_no_causalmask_with_attnbias_headdim_256.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_fp16_no_causalmask_with_attnbias_headdim_256.cpp
rename to xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_infer_fp16_no_causalmask_with_attnbias_headdim_256.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_fp16_no_causalmask_with_attnbias_headdim_32.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_infer_fp16_no_causalmask_with_attnbias_headdim_32.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_fp16_no_causalmask_with_attnbias_headdim_32.cpp
rename to xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_infer_fp16_no_causalmask_with_attnbias_headdim_32.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_fp16_no_causalmask_with_attnbias_headdim_64.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_infer_fp16_no_causalmask_with_attnbias_headdim_64.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_fp16_no_causalmask_with_attnbias_headdim_64.cpp
rename to xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_infer_fp16_no_causalmask_with_attnbias_headdim_64.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_fp16_with_causalmask_no_attnbias_headdim_128.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_infer_fp16_with_causalmask_no_attnbias_headdim_128.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_fp16_with_causalmask_no_attnbias_headdim_128.cpp
rename to xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_infer_fp16_with_causalmask_no_attnbias_headdim_128.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_fp16_with_causalmask_no_attnbias_headdim_256.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_infer_fp16_with_causalmask_no_attnbias_headdim_256.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_fp16_with_causalmask_no_attnbias_headdim_256.cpp
rename to xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_infer_fp16_with_causalmask_no_attnbias_headdim_256.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_fp16_with_causalmask_no_attnbias_headdim_32.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_infer_fp16_with_causalmask_no_attnbias_headdim_32.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_fp16_with_causalmask_no_attnbias_headdim_32.cpp
rename to xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_infer_fp16_with_causalmask_no_attnbias_headdim_32.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_fp16_with_causalmask_no_attnbias_headdim_64.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_infer_fp16_with_causalmask_no_attnbias_headdim_64.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_fp16_with_causalmask_no_attnbias_headdim_64.cpp
rename to xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_infer_fp16_with_causalmask_no_attnbias_headdim_64.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_fp16_with_causalmask_with_attnbias_headdim_128.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_infer_fp16_with_causalmask_with_attnbias_headdim_128.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_fp16_with_causalmask_with_attnbias_headdim_128.cpp
rename to xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_infer_fp16_with_causalmask_with_attnbias_headdim_128.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_fp16_with_causalmask_with_attnbias_headdim_256.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_infer_fp16_with_causalmask_with_attnbias_headdim_256.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_fp16_with_causalmask_with_attnbias_headdim_256.cpp
rename to xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_infer_fp16_with_causalmask_with_attnbias_headdim_256.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_fp16_with_causalmask_with_attnbias_headdim_32.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_infer_fp16_with_causalmask_with_attnbias_headdim_32.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_fp16_with_causalmask_with_attnbias_headdim_32.cpp
rename to xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_infer_fp16_with_causalmask_with_attnbias_headdim_32.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_fp16_with_causalmask_with_attnbias_headdim_64.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_infer_fp16_with_causalmask_with_attnbias_headdim_64.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances_tiled/ck_tiled_fmha_grouped_infer_fp16_with_causalmask_with_attnbias_headdim_64.cpp
rename to xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_infer_fp16_with_causalmask_with_attnbias_headdim_64.cpp

From 2be6c04d80e1d6d9f875d3b27ad5059c9afbcb28 Mon Sep 17 00:00:00 2001
From: Qianfeng Zhang <Qianfeng.Zhang@amd.com>
Date: Thu, 8 Feb 2024 21:36:17 +0000
Subject: [PATCH 443/641] Remove unused script file

---
 tests/test_ck_7.py | 875 ---------------------------------------------
 1 file changed, 875 deletions(-)
 delete mode 100644 tests/test_ck_7.py

diff --git a/tests/test_ck_7.py b/tests/test_ck_7.py
deleted file mode 100644
index 7477c3f70..000000000
--- a/tests/test_ck_7.py
+++ /dev/null
@@ -1,875 +0,0 @@
-# Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
-#
-# This source code is licensed under the BSD license found in the
-# LICENSE file in the root directory of this source tree.
-
-import random
-from typing import List, Optional, Sequence, Tuple, Type, TypeVar
-
-import pytest
-import torch
-
-import xformers.ops
-from xformers.ops import fmha
-from xformers.ops.fmha.common import AttentionOpBase
-
-from .utils import assert_allclose
-
-torch.backends.cuda.matmul.allow_tf32 = False
-cuda_only = pytest.mark.skipif(not torch.cuda.is_available(), reason="requires CUDA")
-
-_devices = ["cpu", "cuda"] if torch.cuda.is_available() else ["cpu"]
-_types = [torch.float16, torch.bfloat16]
-
-T = TypeVar(
-    "T", Type[fmha.common.AttentionFwOpBase], Type[fmha.common.AttentionBwOpBase]
-)
-
-ALL_FW_OPS: Sequence[Type[fmha.common.AttentionFwOpBase]] = [
-    fmha.ck.FwOp,
-]
-
-ALL_BW_OPS: Sequence[Type[fmha.common.AttentionBwOpBase]] = [
-    fmha.ck.BwOp,
-]
-
-
-def sample_random_supported_fw(
-    inp: fmha.Inputs, seed: int
-) -> Type[fmha.common.AttentionFwOpBase]:
-    r = random.Random(seed)
-    fw_ops = list(ALL_FW_OPS)
-    r.shuffle(fw_ops)
-    for op in fw_ops:
-        if op.supports(inp):
-            return op
-    raise NotImplementedError(f"Could not find a FW operator for: {inp}")
-
-
-def generate_test_shapes_B_Mq_Mkv_H_K_Kv(op):
-    shapes = []
-    for B in op._TEST_BATCH_SIZES:
-        for Mq in [32, 256]:
-            for Mkv in [32, 64, 256, 1024]:
-                for K in op._TEST_K:
-                    shapes.append((B, Mq, Mkv, 1, K, K))
-        Mq = 256
-        Mkv = 128
-        K = 32
-        H = 1
-        # Weird values of parameters
-        for M in [2, 3, 15, 31, 32, 34, 68, 72, 90, 132, 136]:
-            shapes.append((B, M, Mkv, H, K, K))
-            shapes.append((B, Mq, M, H, K, K))
-        for _K in [1, 2, 3, 31, 34, 36, 38, 40, 64, 80, 160, 256 + 2, 256 + 8, 512]:
-            if _K <= op.SUPPORTED_MAX_K:
-                shapes.append((B, Mq, Mkv, H, _K, _K))
-        # Different value for K / Kv
-        if op.SUPPORTS_DIFFERENT_VALUE_EMBED:
-            for _K in [32, 36, 64, 256 + 8]:
-                shapes.append((B, Mq, Mkv, H, K, _K))
-                shapes.append((B, Mq, Mkv, H, _K, K))
-        # Exotic sizes
-        for K in op._TEST_K:
-            shapes.append((B, 16, 1024, H, K, K))
-            shapes.append((B, 1024, 16, H, K, K))
-        # Some number of heads
-        for H in [3, 5, 12]:
-            shapes.append((max(1, B // H), Mq, Mkv, H, K, K))
-    # Filter-out not supported shapes
-    shapes = [
-        shape
-        for shape in shapes
-        if len(
-            op.shape_not_supported_reasons(
-                Mq=shape[1], Mkv=shape[2], K=shape[4], Kv=shape[5]
-            )
-        )
-        == 0
-    ]
-    # Add some random shapes
-    if op in [
-        fmha.ck.FwOp,
-        fmha.ck.BwOp,
-    ]:
-        K_CHOICES = [8 * i for i in range(1, 256 // 8)]
-        r = random.Random(0)
-        found_count = 0
-        while found_count < 20:
-            B = r.randint(1, 400)
-            Mq = r.randint(1, 500)
-            Mkv = r.randint(1, 500)
-            H = r.randint(2, 11)
-            B = max(B // H, 1)
-            K = r.choice(K_CHOICES)
-            Kv = r.choice(K_CHOICES)
-            if not op.SUPPORTS_DIFFERENT_VALUE_EMBED:
-                Kv = K
-            if len(op.shape_not_supported_reasons(Mq, Mkv, K, Kv)):
-                continue
-            found_count += 1
-            shapes.append((B, Mq, Mkv, H, K, Kv))
-    return shapes
-
-
-def make_id(op, device, dtype, bias_type, *shape):
-    return (
-        f"{op.NAME}-{device}-{str(dtype)}-{bias_type.__name__}"
-        f"-{'-'.join([str(s) for s in shape])}"
-    )
-
-
-def _generate_op_device_dtype_biasT_B_Mq_Mkv_H_K_Kv(
-    ops_list: Sequence[Type[fmha.AttentionOpBase]], max_shapes_per_op: int = 65000
-):
-    r = random.Random(0)
-    combination = []
-    ids = []
-    for op in ops_list:
-        op_count = 0
-        # Sort list of masks, so it's deterministic across runs
-        LIST_MASKS = list(sorted(op.SUPPORTED_ATTN_BIAS_TYPES, key=lambda x: str(x)))
-        for shape in generate_test_shapes_B_Mq_Mkv_H_K_Kv(op):
-            has_one = False
-            for device in _devices:
-                if device not in op.SUPPORTED_DEVICES:
-                    continue
-                for dtype in op.SUPPORTED_DTYPES:
-                    bias_type = r.choice(LIST_MASKS)
-                    # Avoid using too much memory
-                    if bias_type not in [
-                        type(None),
-                        fmha.attn_bias.LowerTriangularMask,
-                    ]:
-                        B, Mq, Mkv, H, K, Kv = shape
-                        B = min(B, 12)
-
-                        if (
-                            bias_type
-                            is fmha.attn_bias.BlockDiagonalCausalFromBottomRightMask
-                        ):
-                            Mq, Mkv = min(Mkv, Mq), max(Mkv, Mq) + 2
-                        elif (
-                            bias_type
-                            is fmha.attn_bias.BlockDiagonalCausalWithOffsetPaddedKeysMask
-                        ):
-                            Mq, Mkv = min(Mkv, Mq), max(Mkv, Mq)
-                        shape = (B, Mq, Mkv, H, K, Kv)
-                    combination.append((op, device, dtype, bias_type, *shape))
-                    ids.append(
-                        f"{op.NAME}-{device}-{str(dtype)}-{bias_type.__name__}"
-                        f"-{'-'.join([str(s) for s in shape])}"
-                    )
-                    has_one = True
-            if has_one:
-                op_count += 1
-            if op_count > max_shapes_per_op:
-                break
-        # Some specific shapes for which we want to run without any mask
-        bias_type = type(None)
-        for shape in (
-            # Some strides/dims don't fit on an uint16
-            (1, 128, 128, 300, 128, 128),
-            (13, 1, 67, 200, 8, 8),
-            (1, 1 + 2**16, 4, 1, 8, 8),
-            (1, 4, 1 + 2**16, 1, 8, 8),
-            # TODO: Some strides don't fit on an uint32
-            # Crashes on Flash, Errors on Cutlass
-            # (1, 1, 64000, 300, 128, 128)
-        ):
-            for device in _devices:
-                if device not in op.SUPPORTED_DEVICES:
-                    continue
-                for dtype in op.SUPPORTED_DTYPES:
-                    combination.append((op, device, dtype, bias_type, *shape))
-    return {
-        "argvalues": combination,
-        "ids": [make_id(*c) for c in combination],
-    }
-
-
-parametrize_opFW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv = pytest.mark.parametrize(
-    "opFW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv",
-    **_generate_op_device_dtype_biasT_B_Mq_Mkv_H_K_Kv(ALL_FW_OPS),
-)
-parametrize_opFW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv__xs = pytest.mark.parametrize(
-    "opFW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv",
-    **_generate_op_device_dtype_biasT_B_Mq_Mkv_H_K_Kv(ALL_FW_OPS, max_shapes_per_op=1),
-)
-parametrize_opBW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv = pytest.mark.parametrize(
-    "opBW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv",
-    **_generate_op_device_dtype_biasT_B_Mq_Mkv_H_K_Kv(ALL_BW_OPS),
-)
-parametrize_opBW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv__xs = pytest.mark.parametrize(
-    "opBW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv",
-    **_generate_op_device_dtype_biasT_B_Mq_Mkv_H_K_Kv(ALL_BW_OPS, max_shapes_per_op=1),
-)
-
-
-def ref_attention(q, k, v, attn_bias=None, drop_mask=None, p=0.0, scale=None):
-    if q.ndim == 4:
-        assert p == 0.0
-        return ref_attention_bmhk(q, k, v, attn_bias=attn_bias)
-    q = q.float()
-    k = k.float()
-    v = v.float()
-
-    scale = scale if scale is not None else (1 / q.shape[-1] ** 0.5)
-    q = q * scale
-
-    attn = q @ k.transpose(-2, -1)
-    if attn_bias is not None:
-        if isinstance(attn_bias, xformers.ops.AttentionBias):
-            # Always create in B,H,Mq,Mk format
-            attn_bias_tensor = attn_bias.materialize(
-                (q.shape[0], 1, q.shape[1], k.shape[1]),
-                device=q.device,
-                dtype=torch.float32,
-            )
-        else:
-            attn_bias_tensor = attn_bias
-        if attn_bias_tensor.ndim == 4:
-            assert q.shape[0] == attn_bias_tensor.shape[0] * attn_bias_tensor.shape[1]
-            attn_bias_tensor = attn_bias_tensor.reshape(
-                [-1, *attn_bias_tensor.shape[2:]]
-            )
-        attn = attn + attn_bias_tensor.float()
-    attn = attn.softmax(-1)
-    if drop_mask is not None:
-        attn = attn * (drop_mask / (1 - p))
-    return attn @ v
-
-
-def ref_attention_bmhk(q, k, v, attn_bias, scale=None) -> torch.Tensor:
-    assert q.ndim == 4
-
-    def T(t):
-        return t.permute((0, 2, 1, 3)).reshape(
-            [t.shape[0] * t.shape[2], t.shape[1], t.shape[3]]
-        )
-
-    if isinstance(attn_bias, xformers.ops.AttentionBias):
-        attn_bias = attn_bias.materialize(
-            (q.shape[0], q.shape[2], q.shape[1], k.shape[1]),
-            device=q.device,
-            dtype=torch.float32,
-        ).reshape([q.shape[0] * q.shape[2], q.shape[1], k.shape[1]])
-    out = ref_attention(T(q), T(k), T(v), attn_bias, scale=scale)
-    out = out.reshape([q.shape[0], q.shape[2], q.shape[1], v.shape[3]])
-    return out.permute((0, 2, 1, 3))
-
-
-def _rand_seqlens(
-    r: random.Random,
-    bs: int,
-    q_len: int,
-    kv_len: int,
-    more_keys_than_queries_per_block: bool,
-) -> Tuple[Sequence[int], Sequence[int]]:
-    """
-    Generates lists of lengths of query blocks and corresponding key blocks.
-    The total number of queries will be bs * q_len and the
-    total number of keys will be bs * kv_len.
-    """
-    if more_keys_than_queries_per_block:
-        assert kv_len >= q_len
-    q_len *= bs
-    kv_len *= bs
-    seqlens_q: List[int] = []
-    seqlens_k: List[int] = []
-
-    step_q = [max(1, q_len // 10), max(2, q_len // 2)]
-    step_k = [max(1, kv_len // 10), max(2, kv_len // 2)]
-    while sum(seqlens_q) < q_len and sum(seqlens_k) < kv_len:
-        num_queries = r.randrange(*step_q)
-        seqlens_q.append(num_queries)
-
-        if more_keys_than_queries_per_block:
-            # Must select at least `num_queries` keys
-            # But also leave enough keys for later
-            keys_left = kv_len - sum(seqlens_k, 0)
-            queries_left = q_len - sum(seqlens_q[:-1], 0)
-            assert keys_left >= queries_left
-            seqlens_k.append(num_queries + r.randrange(0, keys_left - queries_left))
-        else:
-            seqlens_k.append(r.randrange(*step_k))
-    seqlens_q[-1] = q_len - sum(seqlens_q[:-1])
-    seqlens_k[-1] = kv_len - sum(seqlens_k[:-1])
-    return seqlens_q, seqlens_k
-
-
-def _rand_partition(r: random.Random, total: int, n: int) -> List[int]:
-    # returns list of n nonnegative integers summing to total
-    idx = {0, total}
-    while len(idx) < n + 1:
-        idx.add(r.randint(1, total - 1))
-    s = sorted(idx)
-    return [e - b for b, e in zip(s[:-1], s[1:])]
-
-
-def _rand_maxed_partition(
-    r: random.Random, total: int, n: int, mx: int, positive: bool = True
-) -> List[int]:
-    # returns list of n nonnegative integers less than mx summing to total
-    # NB: This is unfortunately biased towards evenly-split bins.
-    # If `positive`, outputs are positive
-    if positive:
-        total -= n
-        mx -= 1
-    idxs = r.sample(range(n * mx), total)
-    y = torch.zeros(n, mx, dtype=torch.int32)
-    y.flatten()[idxs] = 1
-    z = y.sum(1)
-    if positive:
-        z += 1
-    return z.tolist()
-
-
-def _rand_seqlens_padded_k(
-    r: random.Random, bs: int, q_len: int, kv_len: int
-) -> Tuple[Sequence[int], Sequence[int]]:
-    # This is for BlockDiagonalCausalWithOffsetPaddedKeysMask.
-    # we need q_seqlens and k_seqlens to be of len bsz.
-    # For each "batch element" there must be more keys than queries
-    # because this bias type is "bottom right" and so any extra queries
-    # will attend to nothing and have undefined result.
-    # In addition every element of k_seqlens must be <= kv_len
-    if q_len > kv_len:
-        raise ValueError("need more keys than values")
-    if q_len == kv_len:
-        # all key slots are needed so we cannot have padding
-        q_seqlens = k_seqlens = [kv_len] * bs
-    else:
-        q_seqlens = _rand_maxed_partition(r, q_len * bs, bs, kv_len)
-        k_seqlens = [r.randint(i, kv_len) for i in q_seqlens]
-    return q_seqlens, k_seqlens
-
-
-def _create_aligned_bias(B: int, H: int, Mq: int, Mkv: int, **kwargs) -> torch.Tensor:
-    align_to = 8
-    return (
-        torch.randn(
-            (
-                B,
-                H,
-                Mq,
-                align_to * ((Mkv + align_to - 1) // align_to),
-            ),
-            **kwargs,
-        )
-        * 3
-    )[:, :, :, :Mkv]
-
-
-def create_attn_bias(
-    bias_type,
-    batch_size: int,
-    num_heads: int,
-    q_len: int,
-    kv_len: int,
-    device,
-    dtype,
-    requires_grad: bool,
-    fmt: str,
-    op: Type[AttentionOpBase],
-):
-    if bias_type is None or isinstance(None, bias_type):
-        return None
-    r = random.Random("-".join(map(str, [batch_size, q_len, kv_len, dtype, fmt])))
-    if bias_type is torch.Tensor:
-        if fmt == "BMK":
-            batch_size *= num_heads
-            num_heads = 1
-        # `small_k` only supports an expanded 1d bias
-        if op in [fmha.small_k.FwOp, fmha.small_k.BwOp]:
-            attn_bias = (
-                torch.randn(
-                    (batch_size, num_heads, 1, kv_len), device=device, dtype=dtype
-                )
-                * 3
-            )
-            attn_bias = attn_bias.expand(batch_size, num_heads, q_len, kv_len)
-        else:
-            attn_bias = _create_aligned_bias(
-                batch_size,
-                num_heads,
-                q_len,
-                kv_len,
-                device=device,
-                dtype=dtype,
-            )
-            # ToDo: need a fix in ck-flashAttn to avoid divided-by-zero when all-(-inf) occurred
-            #       with the data read by one-thread
-            # make sure it also works if the first columns are partially masked out
-            #
-            # attn_bias[0, 0, q_len - 1 :, : num_heads - 2] = -math.inf
-
-        if requires_grad:
-            attn_bias.requires_grad_(True)
-        if fmt == "BMK":
-            attn_bias = attn_bias[:, 0]
-        return attn_bias
-    if bias_type is fmha.attn_bias.LowerTriangularMask:
-        return fmha.attn_bias.LowerTriangularMask()
-    if bias_type is fmha.attn_bias.LowerTriangularMaskWithTensorBias:
-        attn_bias = _create_aligned_bias(
-            batch_size,
-            num_heads,
-            q_len,
-            kv_len,
-            device=device,
-            dtype=dtype,
-        )
-        if requires_grad:
-            attn_bias.requires_grad_(True)
-        return fmha.attn_bias.LowerTriangularMaskWithTensorBias(attn_bias)
-    if bias_type in [
-        fmha.attn_bias.BlockDiagonalMask,
-        fmha.attn_bias.BlockDiagonalCausalMask,
-        fmha.attn_bias.BlockDiagonalCausalFromBottomRightMask,
-    ]:
-        # This bias is not supported in BMK format
-        assert fmt == "BMHK"
-        block_diag = fmha.attn_bias.BlockDiagonalMask.from_seqlens(
-            *_rand_seqlens(
-                r,
-                batch_size,
-                q_len,
-                kv_len,
-                more_keys_than_queries_per_block=bias_type
-                is fmha.attn_bias.BlockDiagonalCausalFromBottomRightMask,
-            )
-        )
-        if bias_type is fmha.attn_bias.BlockDiagonalCausalMask:
-            block_diag = block_diag.make_causal()
-        if bias_type is fmha.attn_bias.BlockDiagonalCausalFromBottomRightMask:
-            block_diag = block_diag.make_causal_from_bottomright()
-        return block_diag
-    if bias_type == fmha.attn_bias.BlockDiagonalCausalWithOffsetPaddedKeysMask:
-        assert fmt == "BMHK"
-        q, k = _rand_seqlens_padded_k(r, batch_size, q_len, kv_len)
-        g_block_diag = (
-            fmha.attn_bias.BlockDiagonalCausalWithOffsetPaddedKeysMask.from_seqlens(
-                q_seqlen=q,
-                kv_padding=kv_len,
-                kv_seqlen=k,
-            )
-        )
-        return g_block_diag
-
-    assert False, f"Unsupported bias type: {bias_type}"
-
-
-def get_bias_grad(attn_bias, clear: bool = False) -> Optional[torch.Tensor]:
-    tensor_with_grad: Optional[torch.Tensor] = None
-    if isinstance(attn_bias, torch.Tensor):
-        tensor_with_grad = attn_bias
-    if isinstance(attn_bias, fmha.attn_bias.LowerTriangularMaskWithTensorBias):
-        tensor_with_grad = attn_bias._bias
-    if tensor_with_grad is not None:
-        grad = tensor_with_grad.grad
-        if clear:
-            tensor_with_grad.grad = None
-        return grad
-    return None
-
-
-def create_tensors(
-    op: Type[AttentionOpBase],
-    device,
-    dtype,
-    attn_bias_type,
-    B,
-    q_len,
-    kv_len,
-    h,
-    k,
-    kv,
-    *,
-    attn_bias_requires_grad: bool = False,
-    fmt: str = "BMK",
-):
-    torch.manual_seed(B * q_len + kv_len * k + kv)
-    scale = 3
-    if fmt == "BMK":
-        query = torch.randn((B * h, q_len, k), device=device, dtype=dtype).mul_(scale)
-        key = torch.randn((B * h, kv_len, k), device=device, dtype=dtype).mul_(scale)
-        value = torch.randn((B * h, kv_len, kv), device=device, dtype=dtype).mul_(scale)
-    else:
-        assert fmt == "BMHK"
-        query = torch.randn((B, q_len, h, k), device=device, dtype=dtype).mul_(scale)
-        key = torch.randn((B, kv_len, h, k), device=device, dtype=dtype).mul_(scale)
-        value = torch.randn((B, kv_len, h, kv), device=device, dtype=dtype).mul_(scale)
-
-    if fmt == "BMK" and not fmha.common._is_bias_type_supported_in_BMK(attn_bias_type):
-        attn_bias_type = None
-    attn_bias = None
-    if attn_bias_type is not None:
-        attn_bias = create_attn_bias(
-            attn_bias_type,
-            batch_size=B,
-            num_heads=h,
-            q_len=q_len,
-            kv_len=kv_len,
-            dtype=dtype,
-            device=device,
-            requires_grad=attn_bias_requires_grad,
-            fmt=fmt,
-            op=op,
-        )
-        if isinstance(
-            attn_bias,
-            (
-                fmha.attn_bias.BlockDiagonalMask,
-                fmha.attn_bias.BlockDiagonalCausalWithOffsetPaddedKeysMask,
-            ),
-        ):
-            query, key, value = [
-                x.reshape([1, -1, *x.shape[2:]]) for x in [query, key, value]
-            ]
-
-    inputs = fmha.Inputs(query=query, key=key, value=value, attn_bias=attn_bias)
-    reasons = op.not_supported_reasons(inputs)
-    if reasons:
-        err_msg = f"{op.NAME}: unsupported ({'/'.join(reasons)})"
-        # Ensure we free memory to avoid OOMs
-        del query, key, value, attn_bias, inputs
-        pytest.skip(err_msg)
-    return query, key, value, attn_bias
-
-
-def bmhk2bmk(tensor) -> torch.Tensor:
-    return (
-        tensor.permute((0, 2, 1, 3))
-        .contiguous()
-        .view([tensor.shape[0] * tensor.shape[2], tensor.shape[1], tensor.shape[3]])
-    )
-
-
-def bmk2bmhk(tensor, num_heads: int) -> torch.Tensor:
-    return tensor.reshape([-1, num_heads, tensor.shape[1], tensor.shape[2]]).permute(
-        (0, 2, 1, 3)
-    )
-
-
-@pytest.mark.parametrize("fmt", ["BMK", "BMHK"])
-@pytest.mark.parametrize("packed", [False, True])
-@parametrize_opFW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv
-def test_forward(
-    opFW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv,
-    packed,
-    fmt,
-):
-    (
-        op,
-        device,
-        dtype,
-        bias_type,
-        batch_size,
-        q_len,
-        kv_len,
-        h,
-        k,
-        kv,
-    ) = opFW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv
-
-    if kv > 128:
-        pytest.skip("kv > 128 is not supported by CK-FlashAttention-1")
-
-    if packed and not (k == kv and q_len == kv_len):
-        pytest.skip(
-            f"packed incompatible with `k ({k}) != kv ({kv})` or `q_len ({q_len}) != kv_len ({kv_len})`"
-        )
-    if fmt == "BMK" and not fmha.common._is_bias_type_supported_in_BMK(bias_type):
-        pytest.skip("BMK incompatible with this bias")
-
-    query, key, value, attn_bias = create_tensors(
-        *opFW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv, fmt="BMHK" if packed else fmt
-    )
-
-    if packed:
-        c = torch.stack([query, key, value], 2)
-        if fmt == "BMK":
-            # bm3hk -> 3bhmk -> 3Bmk
-            c = c.permute(2, 0, 3, 1, 4).view([3, -1, q_len, k])
-            query, key, value = c[0], c[1], c[2]
-            # Re-create bias in the right format
-            attn_bias = create_attn_bias(
-                bias_type=bias_type,
-                batch_size=batch_size,
-                num_heads=h,
-                q_len=q_len,
-                kv_len=kv_len,
-                device=device,
-                dtype=dtype,
-                requires_grad=False,
-                fmt=fmt,
-                op=op,
-            )
-        else:
-            # bm3hk -> 3 x bmhk
-            query, key, value = xformers.ops.unbind(c, 2)
-        assert not query.is_contiguous()
-
-    out = xformers.ops.memory_efficient_attention_forward(
-        query, key, value, attn_bias, op=op
-    )
-    assert not out.isnan().any(), ("Output has NaNs", attn_bias)
-    out2 = xformers.ops.memory_efficient_attention_forward(
-        query, key, value, attn_bias, op=op
-    )
-    assert torch.allclose(out, out2, atol=0.0, rtol=0.0), (
-        "Non-deterministic behavior",
-        attn_bias,
-    )
-
-    ref = ref_attention(query, key, value, attn_bias)
-    assert out.shape == ref.shape, out.shape
-    assert_allclose(
-        out.float(),
-        ref,
-        atol=op.ERROR_ATOL[dtype],
-        rtol=op.ERROR_RTOL.get(dtype, 1e-5),
-    )
-
-
-@pytest.mark.parametrize("k_len", [5, 6, 32])
-@pytest.mark.parametrize("batch_size", [1, 4])
-@pytest.mark.parametrize("kv_len", [128, 512])
-@pytest.mark.parametrize("q_len", [128, 512])
-@pytest.mark.parametrize("device", [torch.device("cuda")])
-@pytest.mark.parametrize("dtype", _types)
-def test_key_query_all_ones(dtype, device, q_len, kv_len, batch_size, k_len):
-    scale = 3
-    query = torch.ones((batch_size, q_len, k_len), device=device, dtype=dtype)
-    key = torch.ones((batch_size, kv_len, k_len), device=device, dtype=dtype)
-    value = torch.randn((batch_size, kv_len, k_len), device=device, dtype=dtype) * scale
-
-    out = xformers.ops.memory_efficient_attention(
-        query, key, value, op=(fmha.ck.FwOp, None)
-    )
-    # this should be equivalent to the average over value
-    ref = value.mean(1, keepdim=True).expand_as(query)
-
-    if dtype is torch.float16:
-        assert_allclose(out, ref, atol=1e-5)
-    else:
-        assert_allclose(out, ref, atol=1e-2)
-
-
-def _block_diag_reshape_lse(
-    lse: torch.Tensor, q_seqinfo: fmha.attn_bias._SeqLenInfo
-) -> torch.Tensor:
-    """LSE can be padded, let's remove the padding"""
-    parts = []
-    for slice, (start, end) in zip(lse.unbind(0), q_seqinfo.intervals()):
-        parts.append(slice[:, : end - start])
-    return torch.cat(parts, dim=1).unsqueeze(1)
-
-
-@parametrize_opFW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv
-def test_logsumexp(opFW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv):
-    (
-        op,
-        device,
-        dtype,
-        bias_type,
-        batch_size,
-        q_len,
-        kv_len,
-        h,
-        k,
-        kv,
-    ) = opFW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv
-    query, key, value, attn_bias = create_tensors(
-        *opFW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv, fmt="BMK"
-    )
-
-    _out, lse = xformers.ops.memory_efficient_attention_forward_requires_grad(
-        query,
-        key,
-        value,
-        op=op,
-        attn_bias=attn_bias,
-    )
-    attn = (query.float() / k**0.5) @ key.float().transpose(-2, -1)
-    if attn_bias is not None:
-        if isinstance(attn_bias, xformers.ops.AttentionBias):
-            tensor_bias = attn_bias.materialize(
-                (query.shape[0], 1, query.shape[1], key.shape[1]),
-                device=query.device,
-                dtype=torch.float32,
-            )
-        else:
-            assert isinstance(attn_bias, torch.Tensor)
-            tensor_bias = attn_bias
-        if tensor_bias.ndim == 4:
-            tensor_bias = tensor_bias.reshape([-1, *tensor_bias.shape[2:]])
-        attn = attn + tensor_bias.float()
-    ref_lse = attn.logsumexp(-1)
-    if isinstance(attn_bias, fmha.attn_bias.BlockDiagonalMask):
-        lse = _block_diag_reshape_lse(lse, attn_bias.q_seqinfo)
-    assert_allclose(lse[:, 0, : ref_lse.shape[1]], ref_lse, atol=2e-4)
-
-
-@pytest.mark.parametrize("fmt", ["BMK", "BMHK"])
-@pytest.mark.parametrize("grad_out_contiguous", [True])
-@parametrize_opBW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv
-def test_backward(
-    opBW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv,
-    grad_out_contiguous,
-    fmt,
-):
-    (
-        op_bw,
-        device,
-        dtype,
-        bias_type,
-        batch_size,
-        q_len,
-        kv_len,
-        h,
-        k,
-        kv,
-    ) = opBW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv
-
-    if k > 128 or kv > 128:
-        pytest.skip(
-            "head-dim length bigger than 128 is not supported by CK-FlashAttention-1"
-        )
-
-    if k % 8 != 0 or kv % 8 != 0:
-        pytest.skip("head-dim length must be an even value for CK-FlashAttention-1")
-
-    # BottomRightMask requires generate {m0,m1,...}, {n0,n1,...} where mi <= ni
-    if (
-        bias_type is fmha.attn_bias.BlockDiagonalCausalFromBottomRightMask
-        and q_len <= kv_len
-    ):
-        pytest.skip(
-            "BlockDiagonalCausalFromBottomRightMask requires kv_len bigger than q_len"
-        )
-
-    if k != kv:
-        pytest.skip("k same as kv is not well tested by CK-FlashAttention-1")
-
-    # attn_bias_requires_grad = (
-    #    random.Random(q_len + kv_len * batch_size).randint(0, 1) > 0
-    # )
-    attn_bias_requires_grad = False
-
-    query, key, value, attn_bias = create_tensors(
-        *opBW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv,
-        attn_bias_requires_grad=attn_bias_requires_grad,
-        fmt=fmt,
-    )
-    op_fw = (
-        sample_random_supported_fw(
-            fmha.Inputs(query=query, key=key, value=value, attn_bias=attn_bias),
-            seed=q_len * kv + kv_len * k,
-        )
-        if op_bw != fmha.ck.BwOp
-        else fmha.ck.FwOp
-    )
-    qkv = None
-
-    if (
-        fmt == "BMHK"
-        and query.shape[3] == value.shape[3]
-        and query.shape[1] == value.shape[1]
-    ):
-        qkv = torch.stack([query, key, value], 2)
-        qkv.requires_grad_(True)
-        # bm3hk -> 3 x bmhk
-        query, key, value = xformers.ops.unbind(qkv, 2)
-        assert not query.is_contiguous()
-
-    query.requires_grad_(True)
-    key.requires_grad_(True)
-    value.requires_grad_(True)
-
-    if not op_bw.supports(fmha.Inputs(query, key, value, attn_bias)):
-        pytest.skip("inputs not supported")
-
-    out = xformers.ops.memory_efficient_attention(
-        query, key, value, attn_bias, op=(op_fw, op_bw)
-    )
-
-    grad_out = torch.ones_like(out)
-    # if grad_out_contiguous is False:
-    #    grad_out = torch.tensor([1.0], dtype=query.dtype, device=device)[
-    #        None, None, :
-    #    ].expand_as(out)
-
-    out.backward(grad_out)
-
-    if qkv is None and op_bw == fmha.ck.BwOp:
-        assert query.stride() == query.grad.stride()
-
-    grads = []
-    if qkv is None:
-        grads = [query.grad, key.grad, value.grad]
-        query.grad = None
-        key.grad = None
-        value.grad = None
-    else:
-        grads = [qkv.grad]
-        qkv.grad = None
-    if attn_bias_requires_grad:
-        attn_bias_grad = get_bias_grad(attn_bias, clear=True)
-        if attn_bias_grad is not None:
-            grads.append(attn_bias_grad)
-
-    ref = ref_attention(query, key, value, attn_bias)
-    ref.backward(grad_out)
-
-    assert_allclose(
-        out.float(),
-        ref.float(),
-        "fw pass",
-        atol=op_fw.ERROR_ATOL[dtype],
-        rtol=op_fw.ERROR_RTOL.get(dtype, 1e-5),
-    )
-
-    del out
-    del grad_out
-    del ref
-
-    atol = op_bw.ERROR_ATOL[dtype]
-    rtol = op_bw.ERROR_RTOL[dtype]
-
-    grads_ref = []
-    grads_name = []
-    if qkv is None:
-        assert isinstance(query.grad, torch.Tensor)
-        assert isinstance(key.grad, torch.Tensor)
-        assert isinstance(value.grad, torch.Tensor)
-        grads_ref = [query.grad, key.grad, value.grad]
-        grads_name = ["query", "key", "value"]
-    else:
-        assert isinstance(qkv.grad, torch.Tensor)
-        grads_ref = [qkv.grad]
-        grads_name = ["qkv"]
-
-    if attn_bias_requires_grad:
-        attn_bias_grad = get_bias_grad(attn_bias)
-        if attn_bias_grad is not None:
-            grads_ref.append(attn_bias.grad)
-            grads_name.append("bias")
-
-    del query
-    del key
-    del value
-    del qkv
-
-    assert len(grads_ref) == len(
-        grads
-    ), "Wrong number of gradients (maybe bias grad didn't backprop?)"
-    for name, calc_grad, ref_grad in zip(grads_name, grads, grads_ref):
-        assert_allclose(
-            calc_grad,
-            ref_grad,
-            msg=f"{op_fw.NAME}+{op_bw.NAME}:{name}",
-            atol=atol,
-            rtol=rtol,
-        )

From 61d875afbb1224b17a586b63ca6d5631dc875e97 Mon Sep 17 00:00:00 2001
From: Max Podkorytov <4273004+tenpercent@users.noreply.github.com>
Date: Fri, 9 Feb 2024 00:01:59 +0000
Subject: [PATCH 444/641] apply black

---
 xformers/benchmarks/benchmark_attn_decoding.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/xformers/benchmarks/benchmark_attn_decoding.py b/xformers/benchmarks/benchmark_attn_decoding.py
index 3c30e5702..19c34bb8f 100644
--- a/xformers/benchmarks/benchmark_attn_decoding.py
+++ b/xformers/benchmarks/benchmark_attn_decoding.py
@@ -151,9 +151,9 @@ def fw(self) -> None:
                 v = v[:, :, :, 0]
             return flash_attn.flash_attn_func(q, k, v)
 
-    BENCHMARKS[f"flash-attention@{flash_attn.__version__}"] = (
-        AttentionDecodingFlashAttention
-    )
+    BENCHMARKS[
+        f"flash-attention@{flash_attn.__version__}"
+    ] = AttentionDecodingFlashAttention
 except ImportError:
     pass
 

From 4616121bddf77b183c78b3d8b7bbdf17a58285a9 Mon Sep 17 00:00:00 2001
From: Max Podkorytov <4273004+tenpercent@users.noreply.github.com>
Date: Fri, 9 Feb 2024 00:08:30 +0000
Subject: [PATCH 445/641] pacify mypy

---
 xformers/ops/fmha/ck_decoder.py | 3 ++-
 xformers/ops/fmha/ck_splitk.py  | 3 ++-
 xformers/ops/fmha/triton.py     | 4 ++++
 3 files changed, 8 insertions(+), 2 deletions(-)

diff --git a/xformers/ops/fmha/ck_decoder.py b/xformers/ops/fmha/ck_decoder.py
index 0da84d441..cd61f18a7 100644
--- a/xformers/ops/fmha/ck_decoder.py
+++ b/xformers/ops/fmha/ck_decoder.py
@@ -93,6 +93,7 @@ def apply(
         attn_bias = inp.attn_bias
         q, k, v = inp.get_qkv_in_bmghk()
         if attn_bias is not None:
+            assert isinstance(attn_bias, BlockDiagonalCausalWithOffsetPaddedKeysMask)
             attn_bias.k_seqinfo.to(k.device)
             attn_bias.q_seqinfo.to(q.device)
             padding = attn_bias.k_seqinfo.padding
@@ -124,7 +125,7 @@ def apply(
         if inp.scale is not None:
             qk_scale = inp.scale
         else:
-            qk_scale = torch.rsqrt(torch.tensor(key.shape[-1], dtype=torch.float32))
+            qk_scale = torch.rsqrt(torch.tensor(key.shape[-1], dtype=torch.float32)).item()
 
         out = cls.OPERATOR(
             query=query,
diff --git a/xformers/ops/fmha/ck_splitk.py b/xformers/ops/fmha/ck_splitk.py
index 249edd533..6d0fce22e 100644
--- a/xformers/ops/fmha/ck_splitk.py
+++ b/xformers/ops/fmha/ck_splitk.py
@@ -111,6 +111,7 @@ def apply(
         q, k, v = inp.get_qkv_in_bmghk()
 
         if attn_bias is not None:
+            assert isinstance(attn_bias, BlockDiagonalCausalWithOffsetPaddedKeysMask)
             attn_bias.k_seqinfo.to(k.device)
             attn_bias.q_seqinfo.to(q.device)
             padding = attn_bias.k_seqinfo.padding
@@ -151,7 +152,7 @@ def apply(
         if inp.scale is not None:
             qk_scale = inp.scale
         else:
-            qk_scale = torch.rsqrt(torch.tensor(k.shape[-1], dtype=torch.float32))
+            qk_scale = torch.rsqrt(torch.tensor(k.shape[-1], dtype=torch.float32)).item()
 
         out = cls.OPERATOR(
             query=query,
diff --git a/xformers/ops/fmha/triton.py b/xformers/ops/fmha/triton.py
index 08018f56f..a8995c94c 100644
--- a/xformers/ops/fmha/triton.py
+++ b/xformers/ops/fmha/triton.py
@@ -565,6 +565,10 @@ def apply(
             # q ~ [1, B*T, H, K]
             # TODO: do we really need to do this cast? seems fishy but
             # I just copied it from the split-k kernel
+            assert isinstance(
+                attn_bias,
+                (BlockDiagonalCausalWithOffsetPaddedKeysMask, BlockDiagonalCausalMask),
+            )
             attn_bias.k_seqinfo.to(inp.query.device)
             attn_bias.q_seqinfo.to(inp.query.device)
             seqstart_q = attn_bias.q_seqinfo.seqstart

From 832e223d2e85910d2068566f30083e6729bf7cea Mon Sep 17 00:00:00 2001
From: Max Podkorytov <4273004+tenpercent@users.noreply.github.com>
Date: Fri, 9 Feb 2024 00:10:05 +0000
Subject: [PATCH 446/641] fix clang-format

---
 .../hip_fmha/attention_forward_decoder.cpp    |  6 +--
 .../hip_fmha/attention_forward_splitk.cpp     | 38 +++++++++----------
 .../hip_fmha/ck_attention_forward_decoder.h   | 10 ++---
 .../ck_attention_forward_decoder_splitk.h     | 32 ++++++++--------
 .../hip_fmha/ck_tiled_fmha_forward_kernel.h   |  2 +-
 .../hip_fmha/ck_tiled_fmha_grouped_forward.h  |  5 ++-
 6 files changed, 46 insertions(+), 47 deletions(-)

diff --git a/xformers/csrc/attention/hip_fmha/attention_forward_decoder.cpp b/xformers/csrc/attention/hip_fmha/attention_forward_decoder.cpp
index 6fe0137b0..786dfec0b 100644
--- a/xformers/csrc/attention/hip_fmha/attention_forward_decoder.cpp
+++ b/xformers/csrc/attention/hip_fmha/attention_forward_decoder.cpp
@@ -270,9 +270,9 @@ int main(int argc, char** argv) {
     const int32_t n_heads = std::stoi(args[3]);
     const int32_t n_groups = 1;
     const int32_t multiquery = (args[4] == "mq");
-    const auto dtype = (args[5] == "f32")
-        ? torch::kFloat32
-        : (args[5] == "f16") ? torch::kFloat16 : torch::kBFloat16;
+    const auto dtype = (args[5] == "f32") ? torch::kFloat32
+        : (args[5] == "f16")              ? torch::kFloat16
+                                          : torch::kBFloat16;
     const int32_t n_wavefronts_per_block = std::stoi(args[6]);
 
     const int32_t dim_per_head = 4 * kThreadsPerWavefront;
diff --git a/xformers/csrc/attention/hip_fmha/attention_forward_splitk.cpp b/xformers/csrc/attention/hip_fmha/attention_forward_splitk.cpp
index a7ddb148c..06fbbe0f6 100644
--- a/xformers/csrc/attention/hip_fmha/attention_forward_splitk.cpp
+++ b/xformers/csrc/attention/hip_fmha/attention_forward_splitk.cpp
@@ -543,14 +543,14 @@ struct FMHADecoderSplitAttentionDeviceOp : public BaseOperator {
                     scalar_t,
                     4>
               : Q_size_k_alignment_necessary == 2
-                  ? efficient_attention_forward_decoder_splitk_ck_kernel<
-                        scalar_t,
-                        2>
-                  : Q_size_k_alignment_necessary == 1
-                      ? efficient_attention_forward_decoder_splitk_ck_kernel<
-                            scalar_t,
-                            1>
-                      : nullptr,
+              ? efficient_attention_forward_decoder_splitk_ck_kernel<
+                    scalar_t,
+                    2>
+              : Q_size_k_alignment_necessary == 1
+              ? efficient_attention_forward_decoder_splitk_ck_kernel<
+                    scalar_t,
+                    1>
+              : nullptr,
           arg.grid_dim,
           arg.block_dim,
           arg.lds_bytes,
@@ -708,14 +708,14 @@ struct FMHADecoderSplitReduceDeviceOp : public BaseOperator {
                     scalar_t,
                     4>
               : O_size_k_alignment_necessary == 2
-                  ? efficient_attention_forward_decoder_splitk_reduce_ck_kernel<
-                        scalar_t,
-                        2>
-                  : O_size_k_alignment_necessary == 1
-                      ? efficient_attention_forward_decoder_splitk_reduce_ck_kernel<
-                            scalar_t,
-                            1>
-                      : nullptr,
+              ? efficient_attention_forward_decoder_splitk_reduce_ck_kernel<
+                    scalar_t,
+                    2>
+              : O_size_k_alignment_necessary == 1
+              ? efficient_attention_forward_decoder_splitk_reduce_ck_kernel<
+                    scalar_t,
+                    1>
+              : nullptr,
           reduce_gridsize,
           reduce_blocksize,
           reduce_lds_bytes,
@@ -1095,9 +1095,9 @@ int main(int argc, char** argv) {
     const int32_t batch_size = std::stoi(args[1]);
     const int32_t nq_heads = std::stoi(args[2]);
     const int32_t nkv_heads = std::stoi(args[3]);
-    const auto dtype = (args[4] == "f32")
-        ? torch::kFloat32
-        : (args[4] == "f16") ? torch::kFloat16 : torch::kBFloat16;
+    const auto dtype = (args[4] == "f32") ? torch::kFloat32
+        : (args[4] == "f16")              ? torch::kFloat16
+                                          : torch::kBFloat16;
     const int32_t n_wavefronts_per_block = std::stoi(args[5]);
 
     auto [Q, K, V, seq] =
diff --git a/xformers/csrc/attention/hip_fmha/ck_attention_forward_decoder.h b/xformers/csrc/attention/hip_fmha/ck_attention_forward_decoder.h
index 57d54eda2..20b3b8979 100644
--- a/xformers/csrc/attention/hip_fmha/ck_attention_forward_decoder.h
+++ b/xformers/csrc/attention/hip_fmha/ck_attention_forward_decoder.h
@@ -458,12 +458,10 @@ struct FMHADecoderSeqlen1DeviceOp : public BaseOperator {
           Q_size_k_alignment_necessary == 4
               ? efficient_attention_forward_decoder_ck_kernel<scalar_t, 4>
               : Q_size_k_alignment_necessary == 2
-                  ? efficient_attention_forward_decoder_ck_kernel<scalar_t, 2>
-                  : Q_size_k_alignment_necessary == 1
-                      ? efficient_attention_forward_decoder_ck_kernel<
-                            scalar_t,
-                            1>
-                      : nullptr,
+              ? efficient_attention_forward_decoder_ck_kernel<scalar_t, 2>
+              : Q_size_k_alignment_necessary == 1
+              ? efficient_attention_forward_decoder_ck_kernel<scalar_t, 1>
+              : nullptr,
           arg.grid_dim,
           arg.block_dim,
           arg.lds_bytes,
diff --git a/xformers/csrc/attention/hip_fmha/ck_attention_forward_decoder_splitk.h b/xformers/csrc/attention/hip_fmha/ck_attention_forward_decoder_splitk.h
index acb1a0154..9eed4f001 100644
--- a/xformers/csrc/attention/hip_fmha/ck_attention_forward_decoder_splitk.h
+++ b/xformers/csrc/attention/hip_fmha/ck_attention_forward_decoder_splitk.h
@@ -613,14 +613,14 @@ struct FMHADecoderSplitKDeviceOp : public BaseOperator {
                     scalar_t,
                     4>
               : Q_size_k_alignment_necessary == 2
-                  ? efficient_attention_forward_decoder_splitk_ck_kernel<
-                        scalar_t,
-                        2>
-                  : Q_size_k_alignment_necessary == 1
-                      ? efficient_attention_forward_decoder_splitk_ck_kernel<
-                            scalar_t,
-                            1>
-                      : nullptr,
+              ? efficient_attention_forward_decoder_splitk_ck_kernel<
+                    scalar_t,
+                    2>
+              : Q_size_k_alignment_necessary == 1
+              ? efficient_attention_forward_decoder_splitk_ck_kernel<
+                    scalar_t,
+                    1>
+              : nullptr,
           arg.grid_dim,
           arg.block_dim,
           arg.lds_bytes,
@@ -659,14 +659,14 @@ struct FMHADecoderSplitKDeviceOp : public BaseOperator {
                     scalar_t,
                     4>
               : Q_size_k_alignment_necessary == 2
-                  ? efficient_attention_forward_decoder_splitk_reduce_ck_kernel<
-                        scalar_t,
-                        2>
-                  : Q_size_k_alignment_necessary == 1
-                      ? efficient_attention_forward_decoder_splitk_reduce_ck_kernel<
-                            scalar_t,
-                            1>
-                      : nullptr,
+              ? efficient_attention_forward_decoder_splitk_reduce_ck_kernel<
+                    scalar_t,
+                    2>
+              : Q_size_k_alignment_necessary == 1
+              ? efficient_attention_forward_decoder_splitk_reduce_ck_kernel<
+                    scalar_t,
+                    1>
+              : nullptr,
           reduce_gridsize,
           reduce_blocksize,
           reduce_lds_bytes,
diff --git a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_forward_kernel.h b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_forward_kernel.h
index 78c62cfa3..58abc9efa 100644
--- a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_forward_kernel.h
+++ b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_forward_kernel.h
@@ -53,7 +53,7 @@ struct FmhaFwdKernel {
 
   template <ck::index_t I> // to avoid duplicated base class prblem, introduce
                            // an template arg
-                           struct FmhaFwdEmptyKargs {};
+  struct FmhaFwdEmptyKargs {};
 
   // kargs use aggregate initializer, so no constructor will provided
   // use inheritance to minimize karg size
diff --git a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_forward.h b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_forward.h
index 33eb580c1..626857121 100644
--- a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_forward.h
+++ b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_forward.h
@@ -73,8 +73,9 @@ struct grouped_forward_causalmask_attnbias_dispatched {
 
       using FmhaShape = FmhaFwdShape<HDim>;
       using FmhaTilePartitioner = FmhaFwdTilePartitioner<FmhaShape>;
-      constexpr ck::index_t occupancy =
-          (HDim == 64) ? 3 : (HDim == 256) ? 1 : 2;
+      constexpr ck::index_t occupancy = (HDim == 64) ? 3
+          : (HDim == 256)                            ? 1
+                                                     : 2;
 
       constexpr bool kPadSeqLenQ = true;
       constexpr bool kPadSeqLenK = true;

From 2b2967ed3d0f6acc1dc034d2328a8a2eae31b4c8 Mon Sep 17 00:00:00 2001
From: Max Podkorytov <4273004+tenpercent@users.noreply.github.com>
Date: Fri, 9 Feb 2024 00:14:22 +0000
Subject: [PATCH 447/641] reapply black

---
 xformers/ops/fmha/ck_decoder.py | 4 +++-
 xformers/ops/fmha/ck_splitk.py  | 4 +++-
 2 files changed, 6 insertions(+), 2 deletions(-)

diff --git a/xformers/ops/fmha/ck_decoder.py b/xformers/ops/fmha/ck_decoder.py
index cd61f18a7..dfbbd581f 100644
--- a/xformers/ops/fmha/ck_decoder.py
+++ b/xformers/ops/fmha/ck_decoder.py
@@ -125,7 +125,9 @@ def apply(
         if inp.scale is not None:
             qk_scale = inp.scale
         else:
-            qk_scale = torch.rsqrt(torch.tensor(key.shape[-1], dtype=torch.float32)).item()
+            qk_scale = torch.rsqrt(
+                torch.tensor(key.shape[-1], dtype=torch.float32)
+            ).item()
 
         out = cls.OPERATOR(
             query=query,
diff --git a/xformers/ops/fmha/ck_splitk.py b/xformers/ops/fmha/ck_splitk.py
index 6d0fce22e..3d37dcdf1 100644
--- a/xformers/ops/fmha/ck_splitk.py
+++ b/xformers/ops/fmha/ck_splitk.py
@@ -152,7 +152,9 @@ def apply(
         if inp.scale is not None:
             qk_scale = inp.scale
         else:
-            qk_scale = torch.rsqrt(torch.tensor(k.shape[-1], dtype=torch.float32)).item()
+            qk_scale = torch.rsqrt(
+                torch.tensor(k.shape[-1], dtype=torch.float32)
+            ).item()
 
         out = cls.OPERATOR(
             query=query,

From 3c9d4e51282d71998ac94c771a3a6cd0c57b4581 Mon Sep 17 00:00:00 2001
From: Max Podkorytov <4273004+tenpercent@users.noreply.github.com>
Date: Tue, 13 Feb 2024 01:15:20 +0000
Subject: [PATCH 448/641] fix lints

---
 .../hip_fmha/attention_forward_decoder.cpp    |  6 +--
 .../hip_fmha/attention_forward_splitk.cpp     | 38 +++++++++----------
 .../hip_fmha/ck_attention_forward_decoder.h   | 10 ++---
 .../ck_attention_forward_decoder_splitk.h     | 32 ++++++++--------
 .../hip_fmha/ck_tiled_fmha_forward_kernel.h   |  2 +-
 .../hip_fmha/ck_tiled_fmha_grouped_forward.h  |  5 ++-
 xformers/ops/fmha/ck_decoder.py               |  5 ++-
 xformers/ops/fmha/ck_splitk.py                |  5 ++-
 xformers/ops/fmha/triton.py                   |  5 +--
 9 files changed, 56 insertions(+), 52 deletions(-)

diff --git a/xformers/csrc/attention/hip_fmha/attention_forward_decoder.cpp b/xformers/csrc/attention/hip_fmha/attention_forward_decoder.cpp
index 6fe0137b0..786dfec0b 100644
--- a/xformers/csrc/attention/hip_fmha/attention_forward_decoder.cpp
+++ b/xformers/csrc/attention/hip_fmha/attention_forward_decoder.cpp
@@ -270,9 +270,9 @@ int main(int argc, char** argv) {
     const int32_t n_heads = std::stoi(args[3]);
     const int32_t n_groups = 1;
     const int32_t multiquery = (args[4] == "mq");
-    const auto dtype = (args[5] == "f32")
-        ? torch::kFloat32
-        : (args[5] == "f16") ? torch::kFloat16 : torch::kBFloat16;
+    const auto dtype = (args[5] == "f32") ? torch::kFloat32
+        : (args[5] == "f16")              ? torch::kFloat16
+                                          : torch::kBFloat16;
     const int32_t n_wavefronts_per_block = std::stoi(args[6]);
 
     const int32_t dim_per_head = 4 * kThreadsPerWavefront;
diff --git a/xformers/csrc/attention/hip_fmha/attention_forward_splitk.cpp b/xformers/csrc/attention/hip_fmha/attention_forward_splitk.cpp
index a7ddb148c..06fbbe0f6 100644
--- a/xformers/csrc/attention/hip_fmha/attention_forward_splitk.cpp
+++ b/xformers/csrc/attention/hip_fmha/attention_forward_splitk.cpp
@@ -543,14 +543,14 @@ struct FMHADecoderSplitAttentionDeviceOp : public BaseOperator {
                     scalar_t,
                     4>
               : Q_size_k_alignment_necessary == 2
-                  ? efficient_attention_forward_decoder_splitk_ck_kernel<
-                        scalar_t,
-                        2>
-                  : Q_size_k_alignment_necessary == 1
-                      ? efficient_attention_forward_decoder_splitk_ck_kernel<
-                            scalar_t,
-                            1>
-                      : nullptr,
+              ? efficient_attention_forward_decoder_splitk_ck_kernel<
+                    scalar_t,
+                    2>
+              : Q_size_k_alignment_necessary == 1
+              ? efficient_attention_forward_decoder_splitk_ck_kernel<
+                    scalar_t,
+                    1>
+              : nullptr,
           arg.grid_dim,
           arg.block_dim,
           arg.lds_bytes,
@@ -708,14 +708,14 @@ struct FMHADecoderSplitReduceDeviceOp : public BaseOperator {
                     scalar_t,
                     4>
               : O_size_k_alignment_necessary == 2
-                  ? efficient_attention_forward_decoder_splitk_reduce_ck_kernel<
-                        scalar_t,
-                        2>
-                  : O_size_k_alignment_necessary == 1
-                      ? efficient_attention_forward_decoder_splitk_reduce_ck_kernel<
-                            scalar_t,
-                            1>
-                      : nullptr,
+              ? efficient_attention_forward_decoder_splitk_reduce_ck_kernel<
+                    scalar_t,
+                    2>
+              : O_size_k_alignment_necessary == 1
+              ? efficient_attention_forward_decoder_splitk_reduce_ck_kernel<
+                    scalar_t,
+                    1>
+              : nullptr,
           reduce_gridsize,
           reduce_blocksize,
           reduce_lds_bytes,
@@ -1095,9 +1095,9 @@ int main(int argc, char** argv) {
     const int32_t batch_size = std::stoi(args[1]);
     const int32_t nq_heads = std::stoi(args[2]);
     const int32_t nkv_heads = std::stoi(args[3]);
-    const auto dtype = (args[4] == "f32")
-        ? torch::kFloat32
-        : (args[4] == "f16") ? torch::kFloat16 : torch::kBFloat16;
+    const auto dtype = (args[4] == "f32") ? torch::kFloat32
+        : (args[4] == "f16")              ? torch::kFloat16
+                                          : torch::kBFloat16;
     const int32_t n_wavefronts_per_block = std::stoi(args[5]);
 
     auto [Q, K, V, seq] =
diff --git a/xformers/csrc/attention/hip_fmha/ck_attention_forward_decoder.h b/xformers/csrc/attention/hip_fmha/ck_attention_forward_decoder.h
index 57d54eda2..20b3b8979 100644
--- a/xformers/csrc/attention/hip_fmha/ck_attention_forward_decoder.h
+++ b/xformers/csrc/attention/hip_fmha/ck_attention_forward_decoder.h
@@ -458,12 +458,10 @@ struct FMHADecoderSeqlen1DeviceOp : public BaseOperator {
           Q_size_k_alignment_necessary == 4
               ? efficient_attention_forward_decoder_ck_kernel<scalar_t, 4>
               : Q_size_k_alignment_necessary == 2
-                  ? efficient_attention_forward_decoder_ck_kernel<scalar_t, 2>
-                  : Q_size_k_alignment_necessary == 1
-                      ? efficient_attention_forward_decoder_ck_kernel<
-                            scalar_t,
-                            1>
-                      : nullptr,
+              ? efficient_attention_forward_decoder_ck_kernel<scalar_t, 2>
+              : Q_size_k_alignment_necessary == 1
+              ? efficient_attention_forward_decoder_ck_kernel<scalar_t, 1>
+              : nullptr,
           arg.grid_dim,
           arg.block_dim,
           arg.lds_bytes,
diff --git a/xformers/csrc/attention/hip_fmha/ck_attention_forward_decoder_splitk.h b/xformers/csrc/attention/hip_fmha/ck_attention_forward_decoder_splitk.h
index acb1a0154..9eed4f001 100644
--- a/xformers/csrc/attention/hip_fmha/ck_attention_forward_decoder_splitk.h
+++ b/xformers/csrc/attention/hip_fmha/ck_attention_forward_decoder_splitk.h
@@ -613,14 +613,14 @@ struct FMHADecoderSplitKDeviceOp : public BaseOperator {
                     scalar_t,
                     4>
               : Q_size_k_alignment_necessary == 2
-                  ? efficient_attention_forward_decoder_splitk_ck_kernel<
-                        scalar_t,
-                        2>
-                  : Q_size_k_alignment_necessary == 1
-                      ? efficient_attention_forward_decoder_splitk_ck_kernel<
-                            scalar_t,
-                            1>
-                      : nullptr,
+              ? efficient_attention_forward_decoder_splitk_ck_kernel<
+                    scalar_t,
+                    2>
+              : Q_size_k_alignment_necessary == 1
+              ? efficient_attention_forward_decoder_splitk_ck_kernel<
+                    scalar_t,
+                    1>
+              : nullptr,
           arg.grid_dim,
           arg.block_dim,
           arg.lds_bytes,
@@ -659,14 +659,14 @@ struct FMHADecoderSplitKDeviceOp : public BaseOperator {
                     scalar_t,
                     4>
               : Q_size_k_alignment_necessary == 2
-                  ? efficient_attention_forward_decoder_splitk_reduce_ck_kernel<
-                        scalar_t,
-                        2>
-                  : Q_size_k_alignment_necessary == 1
-                      ? efficient_attention_forward_decoder_splitk_reduce_ck_kernel<
-                            scalar_t,
-                            1>
-                      : nullptr,
+              ? efficient_attention_forward_decoder_splitk_reduce_ck_kernel<
+                    scalar_t,
+                    2>
+              : Q_size_k_alignment_necessary == 1
+              ? efficient_attention_forward_decoder_splitk_reduce_ck_kernel<
+                    scalar_t,
+                    1>
+              : nullptr,
           reduce_gridsize,
           reduce_blocksize,
           reduce_lds_bytes,
diff --git a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_forward_kernel.h b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_forward_kernel.h
index 78c62cfa3..58abc9efa 100644
--- a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_forward_kernel.h
+++ b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_forward_kernel.h
@@ -53,7 +53,7 @@ struct FmhaFwdKernel {
 
   template <ck::index_t I> // to avoid duplicated base class prblem, introduce
                            // an template arg
-                           struct FmhaFwdEmptyKargs {};
+  struct FmhaFwdEmptyKargs {};
 
   // kargs use aggregate initializer, so no constructor will provided
   // use inheritance to minimize karg size
diff --git a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_forward.h b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_forward.h
index 33eb580c1..626857121 100644
--- a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_forward.h
+++ b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_forward.h
@@ -73,8 +73,9 @@ struct grouped_forward_causalmask_attnbias_dispatched {
 
       using FmhaShape = FmhaFwdShape<HDim>;
       using FmhaTilePartitioner = FmhaFwdTilePartitioner<FmhaShape>;
-      constexpr ck::index_t occupancy =
-          (HDim == 64) ? 3 : (HDim == 256) ? 1 : 2;
+      constexpr ck::index_t occupancy = (HDim == 64) ? 3
+          : (HDim == 256)                            ? 1
+                                                     : 2;
 
       constexpr bool kPadSeqLenQ = true;
       constexpr bool kPadSeqLenK = true;
diff --git a/xformers/ops/fmha/ck_decoder.py b/xformers/ops/fmha/ck_decoder.py
index 0da84d441..dfbbd581f 100644
--- a/xformers/ops/fmha/ck_decoder.py
+++ b/xformers/ops/fmha/ck_decoder.py
@@ -93,6 +93,7 @@ def apply(
         attn_bias = inp.attn_bias
         q, k, v = inp.get_qkv_in_bmghk()
         if attn_bias is not None:
+            assert isinstance(attn_bias, BlockDiagonalCausalWithOffsetPaddedKeysMask)
             attn_bias.k_seqinfo.to(k.device)
             attn_bias.q_seqinfo.to(q.device)
             padding = attn_bias.k_seqinfo.padding
@@ -124,7 +125,9 @@ def apply(
         if inp.scale is not None:
             qk_scale = inp.scale
         else:
-            qk_scale = torch.rsqrt(torch.tensor(key.shape[-1], dtype=torch.float32))
+            qk_scale = torch.rsqrt(
+                torch.tensor(key.shape[-1], dtype=torch.float32)
+            ).item()
 
         out = cls.OPERATOR(
             query=query,
diff --git a/xformers/ops/fmha/ck_splitk.py b/xformers/ops/fmha/ck_splitk.py
index 249edd533..3d37dcdf1 100644
--- a/xformers/ops/fmha/ck_splitk.py
+++ b/xformers/ops/fmha/ck_splitk.py
@@ -111,6 +111,7 @@ def apply(
         q, k, v = inp.get_qkv_in_bmghk()
 
         if attn_bias is not None:
+            assert isinstance(attn_bias, BlockDiagonalCausalWithOffsetPaddedKeysMask)
             attn_bias.k_seqinfo.to(k.device)
             attn_bias.q_seqinfo.to(q.device)
             padding = attn_bias.k_seqinfo.padding
@@ -151,7 +152,9 @@ def apply(
         if inp.scale is not None:
             qk_scale = inp.scale
         else:
-            qk_scale = torch.rsqrt(torch.tensor(k.shape[-1], dtype=torch.float32))
+            qk_scale = torch.rsqrt(
+                torch.tensor(k.shape[-1], dtype=torch.float32)
+            ).item()
 
         out = cls.OPERATOR(
             query=query,
diff --git a/xformers/ops/fmha/triton.py b/xformers/ops/fmha/triton.py
index 08018f56f..f2a538ac4 100644
--- a/xformers/ops/fmha/triton.py
+++ b/xformers/ops/fmha/triton.py
@@ -557,11 +557,10 @@ def apply(
         k = inp.key
         v = inp.value
 
-        is_bt_h_m = isinstance(
+        if isinstance(
             attn_bias,
             (BlockDiagonalCausalWithOffsetPaddedKeysMask, BlockDiagonalCausalMask),
-        )
-        if is_bt_h_m:
+        ):
             # q ~ [1, B*T, H, K]
             # TODO: do we really need to do this cast? seems fishy but
             # I just copied it from the split-k kernel

From 1d474c527b4ab73bdca645e0524a0efe2a4d15f8 Mon Sep 17 00:00:00 2001
From: Max Podkorytov <4273004+tenpercent@users.noreply.github.com>
Date: Tue, 13 Feb 2024 03:26:00 +0000
Subject: [PATCH 449/641] make test_splitk_reference run on cpu

---
 tests/test_mem_eff_attention.py                | 17 ++++++++++++-----
 xformers/benchmarks/benchmark_attn_decoding.py |  6 +++---
 2 files changed, 15 insertions(+), 8 deletions(-)

diff --git a/tests/test_mem_eff_attention.py b/tests/test_mem_eff_attention.py
index a77cc43af..13a168795 100644
--- a/tests/test_mem_eff_attention.py
+++ b/tests/test_mem_eff_attention.py
@@ -1868,8 +1868,15 @@ def _kv_heads_label(kv_heads: Optional[int]) -> str:
 @pytest.mark.parametrize("n_heads", [16])
 @pytest.mark.parametrize("padding, bsz", [(32, 8), (4096, 1)])
 @pytest.mark.parametrize("split_k", [1, 2, 4])
+@pytest.mark.parametrize("device", ["cpu"])
 def test_splitk_reference(
-    kv_heads: int, n_heads: int, padding: int, bsz: int, dtype: str, split_k: int
+    kv_heads: int,
+    n_heads: int,
+    padding: int,
+    bsz: int,
+    dtype: str,
+    device: str,
+    split_k: int,
 ):
     dtype_ = {"f16": torch.float16, "bf16": torch.bfloat16, "f32": torch.float32}[dtype]
     torch.manual_seed(1)
@@ -1888,13 +1895,13 @@ def test_splitk_reference(
         k_shape = (1, bsz * padding, n_heads, d)
         q_shape = (1, bsz * num_queries, n_heads, d)
 
-    k = torch.rand(k_shape, dtype=dtype_).cuda()
+    k = torch.rand(k_shape, dtype=dtype_, device=device)
     k_seqlen = torch.randint(1, padding + 1, (bsz,)).tolist()
     v = torch.rand_like(k)
-    q = torch.rand(q_shape, dtype=dtype_).cuda()
+    q = torch.rand(q_shape, dtype=dtype_, device=device)
     causal_diagonal = torch.tensor(  # TODO: make unnecessary
-        [i - 1 for i in k_seqlen], dtype=torch.int32
-    ).cuda()
+        [i - 1 for i in k_seqlen], dtype=torch.int32, device=device
+    )
 
     if kv_heads is not None:
         k = k[..., :1, :].expand(k_shape)
diff --git a/xformers/benchmarks/benchmark_attn_decoding.py b/xformers/benchmarks/benchmark_attn_decoding.py
index 3c30e5702..19c34bb8f 100644
--- a/xformers/benchmarks/benchmark_attn_decoding.py
+++ b/xformers/benchmarks/benchmark_attn_decoding.py
@@ -151,9 +151,9 @@ def fw(self) -> None:
                 v = v[:, :, :, 0]
             return flash_attn.flash_attn_func(q, k, v)
 
-    BENCHMARKS[f"flash-attention@{flash_attn.__version__}"] = (
-        AttentionDecodingFlashAttention
-    )
+    BENCHMARKS[
+        f"flash-attention@{flash_attn.__version__}"
+    ] = AttentionDecodingFlashAttention
 except ImportError:
     pass
 

From d38a6843ce3bd5a3d7cdab38cc556747c9804011 Mon Sep 17 00:00:00 2001
From: Max Podkorytov <4273004+tenpercent@users.noreply.github.com>
Date: Tue, 13 Feb 2024 04:20:17 +0000
Subject: [PATCH 450/641] add ck modules to docs

---
 docs/source/components/ops.rst | 14 +++++++++++++-
 1 file changed, 13 insertions(+), 1 deletion(-)

diff --git a/docs/source/components/ops.rst b/docs/source/components/ops.rst
index 5f98fdcb5..09dc0d25c 100644
--- a/docs/source/components/ops.rst
+++ b/docs/source/components/ops.rst
@@ -22,13 +22,25 @@ Available implementations
     :member-order: bysource
 
 .. automodule:: xformers.ops.fmha.triton
-    :members: FwOp, BwOp
+    :members: FwOp
     :member-order: bysource
 
 .. automodule:: xformers.ops.fmha.small_k
     :members: FwOp, BwOp
     :member-order: bysource
 
+.. automodule:: xformers.ops.fmha.ck
+    :members: FwOp, BwOp
+    :member-order: bysource
+
+.. automodule:: xformers.ops.fmha.ck_decoder
+    :members: FwOp
+    :member-order: bysource
+
+.. automodule:: xformers.ops.fmha.ck_splitk
+    :members: FwOp
+    :member-order: bysource
+
 Attention biases
 ~~~~~~~~~~~~~~~~~~~~
 

From eccbf5450192a9113816b11a46d5d172cfcf9ded Mon Sep 17 00:00:00 2001
From: Max Podkorytov <4273004+tenpercent@users.noreply.github.com>
Date: Tue, 13 Feb 2024 21:09:42 +0000
Subject: [PATCH 451/641] try fixing nvidia build by re-including sparse24 cpp
 folder into extension sources

---
 setup.py | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/setup.py b/setup.py
index 997853700..6b4ba8b19 100644
--- a/setup.py
+++ b/setup.py
@@ -245,6 +245,9 @@ def get_extensions():
     sources += glob.glob(
         os.path.join(extensions_dir, "swiglu", "**", "*.cpp"), recursive=True
     )
+    sources += glob.glob(
+        os.path.join(extensions_dir, "sparse24", "**", "*.cpp"), recursive=True
+    )
 
     # avoid the temporary .cu file under xformers/csrc/attention/hip_fmha are included
     source_cuda = glob.glob(os.path.join(extensions_dir, "*.cu"), recursive=False)
@@ -257,6 +260,9 @@ def get_extensions():
     source_cuda += glob.glob(
         os.path.join(extensions_dir, "swiglu", "**", "*.cu"), recursive=True
     )
+    source_cuda += glob.glob(
+        os.path.join(extensions_dir, "sparse24", "**", "*.cu"), recursive=True
+    )
 
     source_hip = glob.glob(
         os.path.join(extensions_dir, "attention", "hip_fmha", "ck_fmha_test.cpp"),

From 1ef6c20c6219b3d0e3c29930917e04cb0d3663f5 Mon Sep 17 00:00:00 2001
From: Max Podkorytov <4273004+tenpercent@users.noreply.github.com>
Date: Tue, 13 Feb 2024 21:46:29 +0000
Subject: [PATCH 452/641] update cutlass to upstream commit

---
 third_party/cutlass | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/third_party/cutlass b/third_party/cutlass
index 66d9cddc8..e0aaa3c3b 160000
--- a/third_party/cutlass
+++ b/third_party/cutlass
@@ -1 +1 @@
-Subproject commit 66d9cddc832c1cdc2b30a8755274f7f74640cfe6
+Subproject commit e0aaa3c3b38db9a89c31f04fef91e92123ad5e2e

From 9dfec0de65e93957553793104f17832e6ba47987 Mon Sep 17 00:00:00 2001
From: Max Podkorytov <4273004+tenpercent@users.noreply.github.com>
Date: Tue, 13 Feb 2024 22:12:39 +0000
Subject: [PATCH 453/641] update flash-attention to upstream commit

---
 third_party/flash-attention | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/third_party/flash-attention b/third_party/flash-attention
index 9e5e8bc91..92dd5703e 160000
--- a/third_party/flash-attention
+++ b/third_party/flash-attention
@@ -1 +1 @@
-Subproject commit 9e5e8bc91e30af5cdc321362b553f6c0da332e30
+Subproject commit 92dd5703ecdb99aa4a4aee9817f28557907403a2

From 9fcda18d96cc38be34eea0c55ceacb6a06ab9e7a Mon Sep 17 00:00:00 2001
From: Max Podkorytov <4273004+tenpercent@users.noreply.github.com>
Date: Tue, 13 Feb 2024 22:20:53 +0000
Subject: [PATCH 454/641] simplify setup.py

---
 setup.py | 125 +++++--------------------------------------------------
 1 file changed, 10 insertions(+), 115 deletions(-)

diff --git a/setup.py b/setup.py
index 6b4ba8b19..9a59f5fd1 100644
--- a/setup.py
+++ b/setup.py
@@ -229,124 +229,19 @@ def rename_cpp_cu(cpp_files):
 def get_extensions():
     extensions_dir = os.path.join("xformers", "csrc")
 
-    sources = glob.glob(
-        os.path.join(extensions_dir, "attention", "*.cpp"), recursive=False
-    )
-    sources += glob.glob(
-        os.path.join(extensions_dir, "attention", "autograd", "**", "*.cpp"),
-        recursive=True,
-    )
-    sources += glob.glob(
-        os.path.join(extensions_dir, "attention", "cpu", "**", "*.cpp"), recursive=True
-    )
-    sources += glob.glob(
-        os.path.join(extensions_dir, "indexing", "**", "*.cpp"), recursive=True
-    )
-    sources += glob.glob(
-        os.path.join(extensions_dir, "swiglu", "**", "*.cpp"), recursive=True
-    )
-    sources += glob.glob(
-        os.path.join(extensions_dir, "sparse24", "**", "*.cpp"), recursive=True
-    )
-
-    # avoid the temporary .cu file under xformers/csrc/attention/hip_fmha are included
-    source_cuda = glob.glob(os.path.join(extensions_dir, "*.cu"), recursive=False)
-    source_cuda += glob.glob(
-        os.path.join(extensions_dir, "attention", "cuda", "**", "*.cu"), recursive=True
-    )
-    source_cuda += glob.glob(
-        os.path.join(extensions_dir, "indexing", "**", "*.cu"), recursive=True
-    )
-    source_cuda += glob.glob(
-        os.path.join(extensions_dir, "swiglu", "**", "*.cu"), recursive=True
-    )
-    source_cuda += glob.glob(
-        os.path.join(extensions_dir, "sparse24", "**", "*.cu"), recursive=True
-    )
-
+    sources = glob.glob(os.path.join(extensions_dir, "**", "*.cpp"), recursive=True)
+    source_cuda = glob.glob(os.path.join(extensions_dir, "**", "*.cu"), recursive=True)
     source_hip = glob.glob(
-        os.path.join(extensions_dir, "attention", "hip_fmha", "ck_fmha_test.cpp"),
-        recursive=False,
-    )
-    source_hip += glob.glob(
-        os.path.join(
-            extensions_dir, "attention", "hip_fmha", "attention_forward_decoder.cpp"
-        ),
-        recursive=False,
-    )
-
-    source_hip_decoder = [
-        *glob.glob(
-            os.path.join(
-                extensions_dir, "attention", "hip_fmha", "attention_forward_decoder.cpp"
-            ),
-            recursive=False,
-        ),
-        *glob.glob(
-            os.path.join(
-                extensions_dir, "attention", "hip_fmha", "attention_forward_splitk.cpp"
-            ),
-            recursive=False,
-        ),
-    ]
-
-    source_hip += glob.glob(
-        os.path.join(
-            extensions_dir,
-            "attention",
-            "hip_fmha",
-            "attention_forward_generic_ck_tiled.cpp",
-        ),
-        recursive=False,
-    )
-    source_hip += glob.glob(
-        os.path.join(
-            extensions_dir,
-            "attention",
-            "hip_fmha",
-            "ck_tiled_fmha_batched_infer_*.cpp",
-        ),
-        recursive=False,
-    )
-    source_hip += glob.glob(
-        os.path.join(
-            extensions_dir,
-            "attention",
-            "hip_fmha",
-            "ck_tiled_fmha_grouped_infer_*.cpp",
-        ),
-        recursive=False,
-    )
-    source_hip += glob.glob(
-        os.path.join(
-            extensions_dir,
-            "attention",
-            "hip_fmha",
-            "ck_tiled_fmha_batched_forward_*.cpp",
-        ),
-        recursive=False,
-    )
-    source_hip += glob.glob(
-        os.path.join(
-            extensions_dir,
-            "attention",
-            "hip_fmha",
-            "ck_tiled_fmha_grouped_forward_*.cpp",
-        ),
-        recursive=False,
+        os.path.join(extensions_dir, "attention", "hip_fmha", "**", "*.cpp"),
+        recursive=True,
     )
-    source_hip += glob.glob(
-        os.path.join(
-            extensions_dir,
-            "attention",
-            "hip_fmha",
-            "instances",
-            "ck_tiled_fmha_*.cpp",
-        ),
-        recursive=False,
+    source_hip_generated = glob.glob(
+        os.path.join(extensions_dir, "attention", "hip_fmha", "**", "*.cu"),
+        recursive=True,
     )
-
-    source_hip += source_hip_decoder
+    # avoid the temporary .cu files generated under xformers/csrc/attention/hip_fmha
+    source_cuda = list(set(source_cuda) - set(source_hip_generated))
+    sources = list(set(sources) - set(source_hip))
 
     sputnik_dir = os.path.join(this_dir, "third_party", "sputnik")
     cutlass_dir = os.path.join(this_dir, "third_party", "cutlass", "include")

From 58d38d411070bd716fb46605c5b44bed33abfcd0 Mon Sep 17 00:00:00 2001
From: Max Podkorytov <4273004+tenpercent@users.noreply.github.com>
Date: Tue, 13 Feb 2024 23:52:27 +0000
Subject: [PATCH 455/641] remove duplicate
 run_batched_infer_causalmask_attnbias_dispatched<f16, true, true, 128>

---
 "xformers/csrc/attention/hip_fmha/instances/\\" | 12 ------------
 1 file changed, 12 deletions(-)
 delete mode 100644 "xformers/csrc/attention/hip_fmha/instances/\\"

diff --git "a/xformers/csrc/attention/hip_fmha/instances/\\" "b/xformers/csrc/attention/hip_fmha/instances/\\"
deleted file mode 100644
index e7f76cd58..000000000
--- "a/xformers/csrc/attention/hip_fmha/instances/\\"
+++ /dev/null
@@ -1,12 +0,0 @@
-/*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
- *
- * This source code is licensed under the BSD-style license found in the
- * LICENSE file in the root directory of this source tree.
- */
-#include <ck/ck.hpp>
-
-#include "ck_tiled_fmha_batched_infer.h"
-
-template void run_batched_infer_causalmask_attnbias_dispatched<ck::half_t, true, true, 128>(
-    BatchedForwardParams& param, hipStream_t stream);

From 07183f0c7516e9a80aa51d504c5ff59287f0f6ab Mon Sep 17 00:00:00 2001
From: Max Podkorytov <4273004+tenpercent@users.noreply.github.com>
Date: Wed, 14 Feb 2024 00:52:39 +0000
Subject: [PATCH 456/641] add hip version and pytorch hip arch list to xformers
 build info

---
 setup.py             | 16 ++++++++++++++++
 xformers/_cpp_lib.py |  4 ++++
 xformers/info.py     |  1 +
 3 files changed, 21 insertions(+)

diff --git a/setup.py b/setup.py
index 9a59f5fd1..0fad35ad1 100644
--- a/setup.py
+++ b/setup.py
@@ -125,6 +125,17 @@ def get_cuda_version(cuda_dir) -> int:
     return bare_metal_major * 100 + bare_metal_minor
 
 
+def get_hip_version(rocm_dir) -> str:
+    hipcc_bin = "hipcc" if rocm_dir is None else os.path.join(rocm_dir, "bin", "hipcc")
+    raw_output = subprocess.check_output(
+        [hipcc_bin, "--version"], universal_newlines=True
+    )
+    for line in raw_output.split("\n"):
+        if "HIP version" in line:
+            return line.split()[-1]
+    return None
+
+
 def get_flash_attention_extensions(cuda_version: int, extra_compile_args):
     # XXX: Not supported on windows for cuda<12
     # https://github.com/Dao-AILab/flash-attention/issues/345
@@ -323,6 +334,9 @@ def get_extensions():
         ]
     elif torch.cuda.is_available() and torch.version.hip:
         rename_cpp_cu(source_hip)
+        rocm_home = os.getenv("ROCM_PATH")
+        hip_version = get_hip_version(rocm_home)
+
         source_hip_cu = []
         for ff in source_hip:
             source_hip_cu += [ff.replace(".cpp", ".cu")]
@@ -368,6 +382,7 @@ def get_extensions():
     return ext_modules, {
         "version": {
             "cuda": cuda_version,
+            "hip": hip_version,
             "torch": torch.__version__,
             "python": platform.python_version(),
             "flash": flash_version,
@@ -376,6 +391,7 @@ def get_extensions():
             k: os.environ.get(k)
             for k in [
                 "TORCH_CUDA_ARCH_LIST",
+                "PYTORCH_ROCM_ARCH",
                 "XFORMERS_BUILD_TYPE",
                 "XFORMERS_ENABLE_DEBUG_ASSERTIONS",
                 "NVCC_FLAGS",
diff --git a/xformers/_cpp_lib.py b/xformers/_cpp_lib.py
index 4eb6fd981..d5d011700 100644
--- a/xformers/_cpp_lib.py
+++ b/xformers/_cpp_lib.py
@@ -27,6 +27,10 @@ class _BuildInfo:
     def cuda_version(self) -> Optional[int]:
         return self.metadata["version"]["cuda"]
 
+    @property
+    def hip_version(self) -> Optional[int]:
+        return self.metadata["version"]["hip"]
+
     @property
     def torch_version(self) -> str:
         return self.metadata["version"]["torch"]
diff --git a/xformers/info.py b/xformers/info.py
index 1a17586e6..af0fa5b2f 100644
--- a/xformers/info.py
+++ b/xformers/info.py
@@ -49,6 +49,7 @@ def print_info():
     if build_info is not None:
         features["build.info"] = "available"
         features["build.cuda_version"] = build_info.cuda_version
+        features["build.hip_version"] = build_info.hip_version
         features["build.python_version"] = build_info.python_version
         features["build.torch_version"] = build_info.torch_version
         for k, v in build_info.build_env.items():

From 993a90c5d7ac54446b5cf702673e2056c3a4831c Mon Sep 17 00:00:00 2001
From: Max Podkorytov <4273004+tenpercent@users.noreply.github.com>
Date: Wed, 14 Feb 2024 01:05:48 +0000
Subject: [PATCH 457/641] fix build

---
 setup.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/setup.py b/setup.py
index 0fad35ad1..d5ca4af69 100644
--- a/setup.py
+++ b/setup.py
@@ -278,6 +278,7 @@ def get_extensions():
     include_dirs = [extensions_dir]
     ext_modules = []
     cuda_version = None
+    hip_version = None
     flash_version = "0.0.0"
 
     if (

From d4a374bd6ad4256cf27dd9fe2b979ffc13d75673 Mon Sep 17 00:00:00 2001
From: Max Podkorytov <4273004+tenpercent@users.noreply.github.com>
Date: Wed, 14 Feb 2024 01:58:37 +0000
Subject: [PATCH 458/641] patch around the unhappy path in get_hip_version

---
 setup.py | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/setup.py b/setup.py
index d5ca4af69..e44d58509 100644
--- a/setup.py
+++ b/setup.py
@@ -127,9 +127,13 @@ def get_cuda_version(cuda_dir) -> int:
 
 def get_hip_version(rocm_dir) -> str:
     hipcc_bin = "hipcc" if rocm_dir is None else os.path.join(rocm_dir, "bin", "hipcc")
-    raw_output = subprocess.check_output(
-        [hipcc_bin, "--version"], universal_newlines=True
-    )
+    try:
+        raw_output = subprocess.check_output(
+            [hipcc_bin, "--version"], universal_newlines=True
+        )
+    except Exception as e:
+        print(f"hip installation not found: {e} ROCM_PATH={os.environ.get('ROCM_PATH')}")
+        return None
     for line in raw_output.split("\n"):
         if "HIP version" in line:
             return line.split()[-1]

From ff59f1933c52327da4e5178b68948beca2159c92 Mon Sep 17 00:00:00 2001
From: Max Podkorytov <4273004+tenpercent@users.noreply.github.com>
Date: Thu, 15 Feb 2024 19:09:17 +0000
Subject: [PATCH 459/641] skip test_grad_checkpointing for triton_splitk since
 it doesn't have bwop

---
 tests/test_mem_eff_attention.py | 7 ++-----
 1 file changed, 2 insertions(+), 5 deletions(-)

diff --git a/tests/test_mem_eff_attention.py b/tests/test_mem_eff_attention.py
index 13a168795..cf49f58b0 100644
--- a/tests/test_mem_eff_attention.py
+++ b/tests/test_mem_eff_attention.py
@@ -1500,13 +1500,10 @@ def test_grad_checkpointing(
     ) = opFW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv
     if op is fmha.triton.FwOp:
         pytest.skip("Triton Flash Attention 2 doesn't support backward pass yet")
+    if op is fmha.triton_splitk.FwOp:
+        pytest.skip("Triton Flash Decoding doesn't support backward pass yet")
     if op is fmha.ck.FwOp:
         pytest.skip("ck-tiled FMHA doesn't supported backward pass yet")
-    if op is fmha.triton_splitk.FwOp and (
-        sys.version_info.major,
-        sys.version_info.minor,
-    ) <= (3, 8):
-        pytest.skip("triton_splitk requires python 3.9 or above!")
 
     bias_type = None
     opFW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv = (

From 81bcfd5357fc799b8dfd67878f2bcfde372a6742 Mon Sep 17 00:00:00 2001
From: Max Podkorytov <4273004+tenpercent@users.noreply.github.com>
Date: Thu, 15 Feb 2024 19:15:22 +0000
Subject: [PATCH 460/641] re-enable test_mqa_forward since ck tiled is the
 current implementation

---
 tests/test_mem_eff_attention.py | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/tests/test_mem_eff_attention.py b/tests/test_mem_eff_attention.py
index cf49f58b0..8c7c10fba 100644
--- a/tests/test_mem_eff_attention.py
+++ b/tests/test_mem_eff_attention.py
@@ -745,9 +745,6 @@ def test_mqa_forward(
 
     device = torch.device("cuda")
 
-    if op is fmha.ck.FwOp:
-        pytest.skip("mqa/gqa is only supported with ck-tiled fmha")
-
     torch.manual_seed(B * M + N * K + Hq * Hkv + Kv)
 
     scale = 3

From a0f7f2788781b4aeb2d464ca63bd2b560fb14a24 Mon Sep 17 00:00:00 2001
From: Max Podkorytov <4273004+tenpercent@users.noreply.github.com>
Date: Thu, 15 Feb 2024 19:45:31 +0000
Subject: [PATCH 461/641] make skip test_wrong_alignment more generic

---
 tests/test_mem_eff_attention.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tests/test_mem_eff_attention.py b/tests/test_mem_eff_attention.py
index 8c7c10fba..2faf9f0be 100644
--- a/tests/test_mem_eff_attention.py
+++ b/tests/test_mem_eff_attention.py
@@ -2186,8 +2186,8 @@ def test_f32_biasf16(self) -> None:
     @pytest.mark.parametrize("dtype", [torch.float32, torch.float16])
     def test_wrong_alignment(self, dtype) -> None:
         op = fmha.cutlass.FwOp if torch.version.cuda else fmha.ck.FwOp
-        if torch.version.hip and dtype is torch.float32:
-            pytest.skip("float32 is not supported by fmha.ck.FwOp!")
+        if dtype not in op.SUPPORTED_DTYPES:
+            pytest.skip(f"{dtype=} is not supported by {op.__module__}.{op.__qualname__}")
 
         q, k, v, bias = self.create_tensors(dtype, Mq=7, Mkv=5)
         try:

From a0d8dccb735ca81f40f3e0f21e7f518be6fcdba8 Mon Sep 17 00:00:00 2001
From: Max Podkorytov <4273004+tenpercent@users.noreply.github.com>
Date: Thu, 15 Feb 2024 19:46:35 +0000
Subject: [PATCH 462/641] reapply black

---
 setup.py                        | 4 +++-
 tests/test_mem_eff_attention.py | 4 +++-
 2 files changed, 6 insertions(+), 2 deletions(-)

diff --git a/setup.py b/setup.py
index e44d58509..ce8242203 100644
--- a/setup.py
+++ b/setup.py
@@ -132,7 +132,9 @@ def get_hip_version(rocm_dir) -> str:
             [hipcc_bin, "--version"], universal_newlines=True
         )
     except Exception as e:
-        print(f"hip installation not found: {e} ROCM_PATH={os.environ.get('ROCM_PATH')}")
+        print(
+            f"hip installation not found: {e} ROCM_PATH={os.environ.get('ROCM_PATH')}"
+        )
         return None
     for line in raw_output.split("\n"):
         if "HIP version" in line:
diff --git a/tests/test_mem_eff_attention.py b/tests/test_mem_eff_attention.py
index 2faf9f0be..c89435f80 100644
--- a/tests/test_mem_eff_attention.py
+++ b/tests/test_mem_eff_attention.py
@@ -2187,7 +2187,9 @@ def test_f32_biasf16(self) -> None:
     def test_wrong_alignment(self, dtype) -> None:
         op = fmha.cutlass.FwOp if torch.version.cuda else fmha.ck.FwOp
         if dtype not in op.SUPPORTED_DTYPES:
-            pytest.skip(f"{dtype=} is not supported by {op.__module__}.{op.__qualname__}")
+            pytest.skip(
+                f"{dtype=} is not supported by {op.__module__}.{op.__qualname__}"
+            )
 
         q, k, v, bias = self.create_tensors(dtype, Mq=7, Mkv=5)
         try:

From bc7035cb256b99fbc8bbbd1dc9ce51f62369d795 Mon Sep 17 00:00:00 2001
From: Max Podkorytov <4273004+tenpercent@users.noreply.github.com>
Date: Thu, 15 Feb 2024 19:52:52 +0000
Subject: [PATCH 463/641] simplify test_decoder

---
 tests/test_mem_eff_attention.py | 26 +++++++-------------------
 1 file changed, 7 insertions(+), 19 deletions(-)

diff --git a/tests/test_mem_eff_attention.py b/tests/test_mem_eff_attention.py
index c89435f80..d7fb1e4ed 100644
--- a/tests/test_mem_eff_attention.py
+++ b/tests/test_mem_eff_attention.py
@@ -2002,26 +2002,14 @@ def dequant_cache(x):
         k = dequant_cache(k)
         v = dequant_cache(v)
 
-    if torch.version.cuda:
-        cutlass_output = fmha.memory_efficient_attention_forward(
-            q, k, v, attn_bias, op=fmha.cutlass.FwOp
-        )
-
-        assert_allclose(
-            decoder_output,
-            cutlass_output,
-            atol=fmha.cutlass.FwOp.ERROR_ATOL[dtype_] * 4,
-            rtol=fmha.cutlass.FwOp.ERROR_RTOL[dtype_],
-        )
-    else:
-        ref_output = ref_attention(q, k, v, attn_bias)
+    ref_output = ref_attention(q, k, v, attn_bias)
 
-        assert_allclose(
-            decoder_output.float(),
-            ref_output,
-            atol=fmha.cutlass.FwOp.ERROR_ATOL[dtype_] * 4,
-            rtol=fmha.cutlass.FwOp.ERROR_RTOL[dtype_],
-        )
+    assert_allclose(
+        decoder_output.to(ref_output.dtype),
+        ref_output,
+        atol=op.ERROR_ATOL[dtype_] * 4,
+        rtol=op.ERROR_RTOL[dtype_],
+    )
 
 
 @sm80_or_better_only

From f02d0d44a235f5a92b893e9eb0482e30c7a12486 Mon Sep 17 00:00:00 2001
From: Max Podkorytov <4273004+tenpercent@users.noreply.github.com>
Date: Thu, 15 Feb 2024 20:26:54 +0000
Subject: [PATCH 464/641] put python version check inside triton_splitk op

---
 tests/test_mem_eff_attention.py    |  7 ++-----
 xformers/ops/fmha/triton_splitk.py | 15 +++++++++++++--
 2 files changed, 15 insertions(+), 7 deletions(-)

diff --git a/tests/test_mem_eff_attention.py b/tests/test_mem_eff_attention.py
index d7fb1e4ed..1676eb440 100644
--- a/tests/test_mem_eff_attention.py
+++ b/tests/test_mem_eff_attention.py
@@ -2540,11 +2540,8 @@ def test_mqa_decoding(op: Type[fmha.AttentionFwOpBase], dtype, B_Mkv_H_K):
     k = k.expand(-1, -1, H, -1)
     v = v.expand(-1, -1, H, -1)
 
-    if (sys.version_info.major, sys.version_info.minor) <= (3, 8):
-        pytest.skip("triton_splitk requires python 3.9 or above!")
-
-    if not op.supports(fmha.Inputs(q, k, v)):
-        pytest.skip("not supported")
+    if skip_reasons := op.not_supported_reasons(fmha.Inputs(q, k, v)):
+        pytest.skip("; ".join(skip_reasons))
     out = fmha.memory_efficient_attention_forward(q, k, v, op=op)
     ref = ref_attention(q, k, v)
     assert_allclose(
diff --git a/xformers/ops/fmha/triton_splitk.py b/xformers/ops/fmha/triton_splitk.py
index 1c4f6d942..59c2cdac1 100644
--- a/xformers/ops/fmha/triton_splitk.py
+++ b/xformers/ops/fmha/triton_splitk.py
@@ -4,6 +4,7 @@
 # LICENSE file in the root directory of this source tree.
 
 
+import sys
 from typing import TYPE_CHECKING, Any, List, Optional, Set, Tuple
 
 import torch
@@ -454,6 +455,13 @@ def _splitK_reduce(
     _splitK_reduce = None
 
 
+def _is_cuda_at_least_sm80(device: torch.device) -> bool:
+    return torch.version.cuda and torch.cuda.get_device_capability(device) >= (
+        8,
+        0,
+    )
+
+
 @register_operator
 class FwOp(AttentionFwOpBase):
     """Flash-Attention with Split-K. Supports fused int-4 K/V quantization.
@@ -512,6 +520,8 @@ def shape_not_supported_reasons(
     @classmethod
     def not_supported_reasons(cls, d: Inputs) -> List[str]:
         reasons = super(FwOp, cls).not_supported_reasons(d)
+        if (sys.version_info.major, sys.version_info.minor) < (3, 9):
+            reasons.append("triton_splitk requires python 3.9 or above!")
         check_lastdim_alignment_stride1(reasons, "query", d.query, 8)
         if d.key.dtype != torch.int32:
             check_lastdim_alignment_stride1(reasons, "key", d.key, 8)
@@ -520,10 +530,11 @@ def not_supported_reasons(cls, d: Inputs) -> List[str]:
             reasons.append("triton is not available")
         if d.device.type == "cuda":
             # Has only been tested on 8.0 / 9.0.
-            if torch.cuda.get_device_capability(d.device) < (8, 0):
+            if not _is_cuda_at_least_sm80(d.device):
                 reasons.append(
-                    "requires GPU with sm80 minimum compute capacity, e.g., A100/H100/L4"
+                    "requires NVidia GPU with sm80 minimum compute capacity, e.g., A100/H100/L4"
                 )
+            # TODO: AMD GPU support matrix needs to be figured out. MI300X is tested to work.
 
         q_len = d.query.shape[1]
         if isinstance(d.attn_bias, BlockDiagonalCausalWithOffsetPaddedKeysMask):

From 77a6c13be895a4e95fa06c1977baa85ba91387ad Mon Sep 17 00:00:00 2001
From: Max Podkorytov <4273004+tenpercent@users.noreply.github.com>
Date: Thu, 15 Feb 2024 20:40:05 +0000
Subject: [PATCH 465/641] fix logic

---
 xformers/ops/fmha/triton_splitk.py | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/xformers/ops/fmha/triton_splitk.py b/xformers/ops/fmha/triton_splitk.py
index 59c2cdac1..f4f1c7bab 100644
--- a/xformers/ops/fmha/triton_splitk.py
+++ b/xformers/ops/fmha/triton_splitk.py
@@ -455,8 +455,12 @@ def _splitK_reduce(
     _splitK_reduce = None
 
 
+def _is_cuda() -> bool:
+    return torch.version.cuda
+
+
 def _is_cuda_at_least_sm80(device: torch.device) -> bool:
-    return torch.version.cuda and torch.cuda.get_device_capability(device) >= (
+    return _is_cuda() and torch.cuda.get_device_capability(device) >= (
         8,
         0,
     )
@@ -530,7 +534,7 @@ def not_supported_reasons(cls, d: Inputs) -> List[str]:
             reasons.append("triton is not available")
         if d.device.type == "cuda":
             # Has only been tested on 8.0 / 9.0.
-            if not _is_cuda_at_least_sm80(d.device):
+            if _is_cuda() and not _is_cuda_at_least_sm80(d.device):
                 reasons.append(
                     "requires NVidia GPU with sm80 minimum compute capacity, e.g., A100/H100/L4"
                 )

From a7cd6788a677992a8dee80add83d0403e7986414 Mon Sep 17 00:00:00 2001
From: Max Podkorytov <4273004+tenpercent@users.noreply.github.com>
Date: Thu, 15 Feb 2024 21:01:22 +0000
Subject: [PATCH 466/641] cleanup python3.9 checks in tests

---
 tests/test_mem_eff_attention.py | 61 ++++-----------------------------
 1 file changed, 7 insertions(+), 54 deletions(-)

diff --git a/tests/test_mem_eff_attention.py b/tests/test_mem_eff_attention.py
index 1676eb440..00c33f048 100644
--- a/tests/test_mem_eff_attention.py
+++ b/tests/test_mem_eff_attention.py
@@ -644,12 +644,6 @@ def test_forward(opFW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv, packed, fmt, **kwargs)
         kv,
     ) = opFW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv
 
-    if op is fmha.triton_splitk.FwOp and (
-        sys.version_info.major,
-        sys.version_info.minor,
-    ) <= (3, 8):
-        pytest.skip("triton_splitk requires python 3.9 or above!")
-
     if packed and not (k == kv and q_len == kv_len):
         pytest.skip(
             f"packed incompatible with `k ({k}) != kv ({kv})` or `q_len ({q_len}) != kv_len ({kv_len})`"
@@ -845,12 +839,6 @@ def test_logsumexp(opFW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv):
     if op is fmha.ck.FwOp:
         pytest.skip("logsumexp is not yet supported by ck-tiled fmha!")
 
-    if op is fmha.triton_splitk.FwOp and (
-        sys.version_info.major,
-        sys.version_info.minor,
-    ) <= (3, 8):
-        pytest.skip("triton_splitk requires python 3.9 or above!")
-
     query, key, value, attn_bias = create_tensors(
         *opFW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv, fmt="BMK"
     )
@@ -1350,11 +1338,6 @@ def test_cuda_streams(
     ) = opFW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv
     if device != "cuda":
         pytest.skip("Not CUDA")
-    if op is fmha.triton_splitk.FwOp and (
-        sys.version_info.major,
-        sys.version_info.minor,
-    ) <= (3, 8):
-        pytest.skip("triton_splitk requires python 3.9 or above!")
 
     bias_type = None
     opFW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv = [
@@ -1574,11 +1557,8 @@ def test_unsupported_stride_lastdim(op: Type[fmha.AttentionFwOpBase]):
         0, 3, 1, 2
     )
 
-    if op is fmha.triton_splitk.FwOp and (
-        sys.version_info.major,
-        sys.version_info.minor,
-    ) <= (3, 8):
-        pytest.skip("triton_splitk requires python 3.9 or above!")
+    if skip_reasons := op.not_supported_reasons(fmha.Inputs(q, q, q)):
+        pytest.skip("; ".join(skip_reasons))
 
     try:
         fmha.memory_efficient_attention(q, q, q, op=(op, None))
@@ -1596,11 +1576,8 @@ def test_unsupported_stride_lastdim(op: Type[fmha.AttentionFwOpBase]):
 def test_unsupported_stride_alignment(op: Type[fmha.AttentionFwOpBase]):
     q = torch.empty([1, 2, 1, 33], device="cuda", dtype=torch.float16)[:, :, :, :32]
 
-    if op is fmha.triton_splitk.FwOp and (
-        sys.version_info.major,
-        sys.version_info.minor,
-    ) <= (3, 8):
-        pytest.skip("triton_splitk requires python 3.9 or above!")
+    if skip_reasons := op.not_supported_reasons(fmha.Inputs(q, q, q)):
+        pytest.skip("; ".join(skip_reasons))
 
     try:
         fmha.memory_efficient_attention(q, q, q, op=(op, None))
@@ -1978,6 +1955,9 @@ def test_decoder(
         k = k[..., :1, :].expand(k_shape)
         v = v[..., :1, :].expand(k_shape)
 
+    if skip_reasons := op.not_supported_reasons(fmha.Inputs(q, k, v)):
+        pytest.skip("; ".join(skip_reasons))
+
     attn_bias = fmha.attn_bias.BlockDiagonalCausalWithOffsetPaddedKeysMask.from_seqlens(
         q_seqlen=[num_queries] * bsz,
         kv_seqlen=k_seqlen,
@@ -2046,9 +2026,6 @@ def test_triton_splitk_decoder(
     if dequant:
         pytest.skip("dequant is not supported")
 
-    if (sys.version_info.major, sys.version_info.minor) <= (3, 8):
-        pytest.skip("triton_splitk requires python 3.9 or above!")
-
     # We omit dequant with f16: it needs a very high tol
     test_decoder(
         op,
@@ -2370,12 +2347,6 @@ def test_forward_gqa_one_group(opFW):
     k = torch.randn([B, Mkv, 1, H, K], dtype=dtype, device="cuda") * 3
     v = torch.randn([B, Mkv, 1, H, K], dtype=dtype, device="cuda") * 3
 
-    if opFW is fmha.triton_splitk.FwOp and (
-        sys.version_info.major,
-        sys.version_info.minor,
-    ) <= (3, 8):
-        pytest.skip("triton_splitk requires python 3.9 or above!")
-
     supported = opFW.supports(fmha.Inputs(q, k, v))
     if not supported:
         supported_bmhk = opFW.supports(fmha.Inputs(q[:, :, 0], k[:, :, 0], v[:, :, 0]))
@@ -2565,12 +2536,6 @@ def test_empty_tensors_empty_query(
     if torch.version.hip:
         pytest.skip("backward pass/gradience is not yet supported by ck-tiled fmha!")
 
-    if opFW is fmha.triton_splitk.FwOp and (
-        sys.version_info.major,
-        sys.version_info.minor,
-    ) <= (3, 8):
-        pytest.skip("triton_splitk requires python 3.9 or above!")
-
     query = query[:, :0]
     query.requires_grad_(True)
     key.requires_grad_(True)
@@ -2596,12 +2561,6 @@ def test_empty_tensors_empty_kv(
     if torch.version.hip:
         pytest.skip("backward pass/gradience is not yet supported by ck-tiled fmha!")
 
-    if opFW is fmha.triton_splitk.FwOp and (
-        sys.version_info.major,
-        sys.version_info.minor,
-    ) <= (3, 8):
-        pytest.skip("triton_splitk requires python 3.9 or above!")
-
     key = key[:, :0]
     value = value[:, :0]
     query.requires_grad_(True)
@@ -2627,12 +2586,6 @@ def test_empty_tensors_empty_b(
     if torch.version.hip:
         pytest.skip("backward pass/gradience is not yet supported by ck-tiled fmha!")
 
-    if opFW is fmha.triton_splitk.FwOp and (
-        sys.version_info.major,
-        sys.version_info.minor,
-    ) <= (3, 8):
-        pytest.skip("triton_splitk requires python 3.9 or above!")
-
     query, key, value = query[:0], key[:0], value[:0]
     query.requires_grad_(True)
     key.requires_grad_(True)

From dea783d30f80563adf4ba4cdd33b7abe79e556dc Mon Sep 17 00:00:00 2001
From: Max Podkorytov <4273004+tenpercent@users.noreply.github.com>
Date: Thu, 15 Feb 2024 21:52:53 +0000
Subject: [PATCH 467/641] cleanup test_attentions

---
 tests/test_attentions.py | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/tests/test_attentions.py b/tests/test_attentions.py
index 31f7721fb..2bdbb2d1f 100644
--- a/tests/test_attentions.py
+++ b/tests/test_attentions.py
@@ -22,10 +22,6 @@
     build_attention,
 )
 
-disable_on_rocm = pytest.mark.skipif(
-    not not torch.version.hip, reason="could not be done on ROCM"
-)
-
 DEVICES = (
     [torch.device("cpu")] if not torch.cuda.is_available() else [torch.device("cuda")]
 )
@@ -95,7 +91,6 @@ def noop(x):
     return multi_head
 
 
-@disable_on_rocm
 @pytest.mark.parametrize("attn_dropout", [0.0, 0.3])
 @pytest.mark.parametrize("residual_dropout", [0.0, 0.1])
 @pytest.mark.parametrize("causal", [True, False])
@@ -112,6 +107,13 @@ def test_order_invariance(
     causal: bool,
     device: torch.device,
 ):
+    if (
+        torch.version.hip
+        and device == torch.device("cuda")
+        and attention_name == "local"
+    ):
+        # Backend calls into Sputnik library which isn't built on ROCm
+        device = torch.device("cpu")
 
     torch.manual_seed(42)
     torch.cuda.manual_seed_all(42)
@@ -166,7 +168,6 @@ def test_order_invariance(
                 _ = multi_head(inputs, inputs_shuffled, inputs)
 
 
-@disable_on_rocm
 @pytest.mark.parametrize("heads", [1, 4])
 @pytest.mark.parametrize("attention_name", ["scaled_dot_product"])
 @pytest.mark.parametrize("device", DEVICES)
@@ -210,7 +211,6 @@ def test_kqv_ordering(
     assert torch.allclose(res_false[0, :, :], res_false[1, :, :])
 
 
-@disable_on_rocm
 @pytest.mark.parametrize("heads", [1, 4])
 @pytest.mark.parametrize("attention_name", ["scaled_dot_product"])
 @pytest.mark.parametrize("device", DEVICES)

From acd6b7aaf676bb63b6816035b5bd5eeae7012053 Mon Sep 17 00:00:00 2001
From: Max Podkorytov <4273004+tenpercent@users.noreply.github.com>
Date: Fri, 16 Feb 2024 01:11:06 +0000
Subject: [PATCH 468/641] cleanup test_checkpoint as test running on cpu does
 not depend on gpu platform

---
 tests/test_checkpoint.py | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/tests/test_checkpoint.py b/tests/test_checkpoint.py
index 81ba73013..d3a831ce4 100644
--- a/tests/test_checkpoint.py
+++ b/tests/test_checkpoint.py
@@ -20,9 +20,6 @@
 )
 
 cuda_only = pytest.mark.skipif(not torch.cuda.is_available(), reason="requires CUDA")
-disable_on_rocm = pytest.mark.skipif(
-    not not torch.version.hip, reason="could not be done on ROCM"
-)
 _devices = ["cpu"]
 cuda_cap = (0, 0)
 
@@ -39,7 +36,6 @@ def _all_policy(func, *args, **kwargs):
     return True
 
 
-@disable_on_rocm
 @pytest.mark.skipif(torch.__version__ < "2.2", reason="Only new PyTorch supported")
 @pytest.mark.parametrize("policy_fn", [None, [], _relu_policy, _all_policy])
 @pytest.mark.parametrize("input_requires_grad", [True, False])

From f467a1dd5e614c6b2e37828f310c83d5242f37da Mon Sep 17 00:00:00 2001
From: Max Podkorytov <4273004+tenpercent@users.noreply.github.com>
Date: Fri, 16 Feb 2024 18:26:52 +0000
Subject: [PATCH 469/641] fix lints

---
 tests/test_mem_eff_attention.py    | 1 -
 xformers/ops/fmha/triton_splitk.py | 2 +-
 2 files changed, 1 insertion(+), 2 deletions(-)

diff --git a/tests/test_mem_eff_attention.py b/tests/test_mem_eff_attention.py
index 00c33f048..e76b7a0c9 100644
--- a/tests/test_mem_eff_attention.py
+++ b/tests/test_mem_eff_attention.py
@@ -5,7 +5,6 @@
 
 import math
 import random
-import sys
 from functools import partial
 from typing import List, Optional, Sequence, Tuple, Type, TypeVar
 
diff --git a/xformers/ops/fmha/triton_splitk.py b/xformers/ops/fmha/triton_splitk.py
index f4f1c7bab..1b6039db0 100644
--- a/xformers/ops/fmha/triton_splitk.py
+++ b/xformers/ops/fmha/triton_splitk.py
@@ -456,7 +456,7 @@ def _splitK_reduce(
 
 
 def _is_cuda() -> bool:
-    return torch.version.cuda
+    return torch.version.cuda is not None
 
 
 def _is_cuda_at_least_sm80(device: torch.device) -> bool:

From d758eac0223e8cf24f80f9557202186ac0fc2838 Mon Sep 17 00:00:00 2001
From: Max Podkorytov <4273004+tenpercent@users.noreply.github.com>
Date: Fri, 16 Feb 2024 19:27:09 +0000
Subject: [PATCH 470/641] try fixing win build by conditional import of triton
 in triton op

---
 xformers/ops/fmha/triton.py | 741 ++++++++++++++++++------------------
 1 file changed, 376 insertions(+), 365 deletions(-)

diff --git a/xformers/ops/fmha/triton.py b/xformers/ops/fmha/triton.py
index f2a538ac4..46ae836dc 100644
--- a/xformers/ops/fmha/triton.py
+++ b/xformers/ops/fmha/triton.py
@@ -16,8 +16,8 @@
 from typing import Any, List, Mapping, Optional, Set, Tuple
 
 import torch
-import triton
-import triton.language as tl
+
+from xformers import _is_triton_available
 
 from ..common import register_operator
 from .attn_bias import (
@@ -27,251 +27,12 @@
 )
 from .common import AttentionFwOpBase, Context, Inputs, check_lastdim_alignment_stride1
 
+if _is_triton_available():
+    import triton
+    import triton.language as tl
 
-@triton.jit
-def _fwd_kernel_triton_flash_inner(
-    acc,
-    l_i,
-    m_i,
-    q,
-    K_block_ptr,
-    V_block_ptr,
-    q_seq_start,
-    lo,
-    hi,
-    start_m,
-    qk_scale,
-    kv_len,
-    offs_m,
-    offs_n,
-    BLOCK_M: tl.constexpr,
-    BLOCK_N: tl.constexpr,
-    IS_CAUSAL: tl.constexpr,
-    BOUNDS_CHECKS_N: tl.constexpr,
-    CAST_BEFORE_MATMUL: tl.constexpr,
-    ALLOW_TF32: tl.constexpr,
-    STAGE: tl.constexpr,
-    pre_load_v: tl.constexpr,
-):
-    BOUNDS_CHECKS_STAGE: tl.constexpr = BOUNDS_CHECKS_N and STAGE == 2
-    # Doesn't seem to make a difference
-    if STAGE == 1:
-        lo = 0
-    else:
-        lo = tl.multiple_of(lo, BLOCK_N)
-        K_block_ptr = tl.advance(K_block_ptr, (0, lo))
-        V_block_ptr = tl.advance(V_block_ptr, (lo, 0))
-
-    # loop over k, v and update accumulator
-    for start_n in range(lo, hi, BLOCK_N):
-        start_n = tl.multiple_of(start_n, BLOCK_N)  # doesn't seem to make a difference
-        # -- load k, v --
-        k = tl.load(K_block_ptr, boundary_check=(1,) if BOUNDS_CHECKS_STAGE else ())
-        # Moving masking here seems to introduce num errors,
-        # e.g. in test_forward[tritonflashattF-cuda-torch.bfloat16-NoneType-1-256-15-1-32-32-False-BMHK]
-        # if BOUNDS_CHECKS_N or USE_SEQ_LEN:
-        #     k = tl.where(hi - tl.arange(0, BLOCK_N) > start_n, k, float("-inf"))
-        if pre_load_v:
-            v = tl.load(V_block_ptr, boundary_check=(0,) if BOUNDS_CHECKS_STAGE else ())
-        # -- compute qk ---
-        qk = tl.zeros([BLOCK_M, BLOCK_N], dtype=tl.float32)
-        qk += tl.dot(q.to(k.dtype), k, allow_tf32=ALLOW_TF32) * qk_scale
-        if CAST_BEFORE_MATMUL:
-            k = k.to(tl.float32)
-        if STAGE == 2:
-            if IS_CAUSAL:
-                # For some reason this is faster than start_n <= q_seq_start + offs_m[:, None] - offs_n[None, :]
-                qk = tl.where(
-                    q_seq_start + offs_m[:, None] >= (start_n + offs_n[None, :]),
-                    qk,
-                    float("-inf"),
-                )
-            if BOUNDS_CHECKS_N:
-                qk = tl.where(tl.arange(0, BLOCK_N) < hi - start_n, qk, float("-inf"))
-
-        # -- compute scaling constant ---
-        m_i_new = tl.maximum(m_i, tl.max(qk, 1))
-        qk = qk - m_i_new[:, None]
-        alpha = tl.math.exp2(m_i - m_i_new)
-        p = tl.math.exp2(qk)
-
-        # -- scale and update acc --
-        acc *= alpha[:, None]
-        if not pre_load_v:
-            v = tl.load(V_block_ptr, boundary_check=(0,) if BOUNDS_CHECKS_STAGE else ())
-        if CAST_BEFORE_MATMUL:
-            v = v.to(tl.float32)
-        acc += tl.dot(p.to(v.dtype), v, allow_tf32=ALLOW_TF32)
-        # -- update m_i and l_i --
-        l_i = l_i * alpha + tl.sum(p, 1)
-        m_i = m_i_new
-        # update pointers
-        K_block_ptr = tl.advance(K_block_ptr, (0, BLOCK_N))
-        V_block_ptr = tl.advance(V_block_ptr, (BLOCK_N, 0))
-    return acc, l_i, m_i
-
-
-@triton.jit
-def _fwd_kernel_triton_flash(
-    Q,
-    K,
-    V,
-    sm_scale,
-    L,
-    Out,
-    Seq_len,
-    Seq_pos_q,
-    stride_qz,
-    stride_qh,
-    stride_qm,
-    stride_qk,
-    stride_kz,
-    stride_kh,
-    stride_kn,
-    stride_kk,
-    stride_vz,
-    stride_vh,
-    stride_vk,
-    stride_vn,
-    stride_oz,
-    stride_oh,
-    stride_om,
-    stride_on,
-    Z,
-    H,
-    N_CTX,
-    Mkv,
-    BLOCK_M: tl.constexpr,
-    BLOCK_DMODEL: tl.constexpr,
-    BLOCK_N: tl.constexpr,
-    IS_CAUSAL: tl.constexpr,
-    BOUNDS_CHECKS_N: tl.constexpr,
-    BOUNDS_CHECKS_M: tl.constexpr,
-    ALLOW_TF32: tl.constexpr,
-    CAST_BEFORE_MATMUL: tl.constexpr,
-    USE_SEQ_LEN_KV: tl.constexpr,
-    USE_SEQ_POS_Q: tl.constexpr,
-    IS_KV_PADDED: tl.constexpr,  # Switch between padded and non-padded block-diagonal causal masks
-    pre_load_v: tl.constexpr,  # TODO: understand if that matters
-):
-    start_m = tl.program_id(0).to(tl.int64)
-    off_hz = tl.program_id(1).to(tl.int64)
-
-    tl.static_assert((IS_KV_PADDED and USE_SEQ_POS_Q) or not IS_KV_PADDED)
-
-    off_z = off_hz // H
-    off_h = off_hz % H
-    if USE_SEQ_POS_Q:
-        seqpos = tl.load(Seq_pos_q + off_z)
-        seqpos_next = tl.load(Seq_pos_q + off_z + 1)
-        q_len = seqpos_next - seqpos
-        q_offset = seqpos * stride_qm + off_h * stride_qh
-        out_offset = seqpos * stride_om + off_h * stride_oh
-        if not IS_KV_PADDED:
-            # BlockDiagonalCausalMask, no padding, use same sequence positions as for Q
-            kv_offset = seqpos * stride_kn + off_h * stride_kh
-            kv_len = q_len
-            q_seq_start = 0
-        else:
-            # BlockDiagonalCausalWithOffsetPaddedKeysMask
-            kv_offset = off_z * stride_kz + off_h * stride_kh
-            if USE_SEQ_LEN_KV:
-                kv_len = tl.load(Seq_len + off_z)
-                q_seq_start = kv_len - q_len
-            else:
-                # if no variable K/V seqlens are provided, assume full length
-                kv_len = Mkv
-                q_seq_start = 0
-    else:
-        # No mask or simple causal mask
-        q_len = N_CTX
-        q_offset = off_z * stride_qz + off_h * stride_qh
-        out_offset = off_z * stride_oz + off_h * stride_oh
-
-        kv_len = Mkv
-        q_seq_start = 0
-        kv_offset = off_z * stride_kz + off_h * stride_kh
-
-    Q_block_ptr = tl.make_block_ptr(
-        base=Q + q_offset,
-        shape=(q_len, BLOCK_DMODEL),
-        strides=(stride_qm, stride_qk),
-        offsets=(start_m * BLOCK_M, 0),
-        block_shape=(BLOCK_M, BLOCK_DMODEL),
-        order=(1, 0),
-    )
-    K_block_ptr = tl.make_block_ptr(
-        base=K + kv_offset,
-        shape=(BLOCK_DMODEL, kv_len),
-        strides=(stride_kk, stride_kn),
-        offsets=(0, 0),
-        block_shape=(BLOCK_DMODEL, BLOCK_N),
-        order=(0, 1),
-    )
-    V_block_ptr = tl.make_block_ptr(
-        base=V + kv_offset,
-        shape=(kv_len, BLOCK_DMODEL),
-        strides=(stride_vk, stride_vn),
-        offsets=(0, 0),
-        block_shape=(BLOCK_N, BLOCK_DMODEL),
-        order=(0, 1),
-    )
-    # initialize offsets
-    offs_m = start_m * BLOCK_M + tl.arange(0, BLOCK_M)  # For Q
-    offs_n = tl.arange(0, BLOCK_N)  # For K/V
-    # initialize pointer to m and l
-    m_i = tl.zeros([BLOCK_M], dtype=tl.float32) - float("inf")
-    l_i = tl.zeros([BLOCK_M], dtype=tl.float32) + 1.0
-    acc = tl.zeros([BLOCK_M, BLOCK_DMODEL], dtype=tl.float32)
-    # scale sm_scale by log_2(e) and use
-    # 2^x instead of exp in the loop because CSE and LICM
-    # don't work as expected with `exp` in the loop
-    qk_scale = sm_scale * 1.44269504
-    # load q: it will stay in SRAM throughout on NV GPUs but in VGPRs on AMD GPUs
-    q = tl.load(
-        Q_block_ptr, boundary_check=(0,) if BOUNDS_CHECKS_M or USE_SEQ_POS_Q else ()
-    )
-
-    # The loop over K/V sequence blocks is divided into two stages:
-    # Stage 1: (many) blocks which don't need boundary conditions checks - not touching sequence end or diagonal
-    # Stage 2: (few) blocks which need boundary conditions checks
-    # Following https://github.com/openai/triton/blob/293b7fd592a1602f2305c1bd0bc978bbd97337d6/python/tutorials/06-fused-attention.py  # noqa: E501
-
-    """
-    Iteration doesn't need masking if
-        - 1) block doesn't cross the diagonal: max(kv_pos) <= min(q_pos)
-        - 2) block doesn't cross the end of the sequence: max(kv_pos) < kv_len
-    Find maximum start_n for which condition 1 is satisifed.
-    Remember that
-        q_pos = q_seq_start + offs_m[:, None]
-        kv_pos = start_n + offs_n[None, :]
-        offs_m = start_m * BLOCK_M + tl.arange(0, BLOCK_M)
-        offs_n = tl.arange(0, BLOCK_N)
-    min(q_pos) = q_seq_start + start_m * BLOCK_M
-    max(kv_pos) = start_n + BLOCK_N - 1
-    So the condition becomes
-        q_seq_start + start_m * BLOCK_M >= start_n + BLOCK_N - 1
-    So:
-    1) start_n <= q_seq_start + start_m * BLOCK_M - BLOCK_N + 1
-    2) start_n <= kv_len - BLOCK_N
-
-    So the last allowed start_n without masking is min(q_seq_start + start_m * BLOCK_M + 1, kv_len) - BLOCK_N
-    """
-    # Second stage can only be skipped if no mask is used and K/V length is divisible by the tile size
-    TWO_STAGES: tl.constexpr = BOUNDS_CHECKS_N or (
-        IS_CAUSAL or (USE_SEQ_LEN_KV or (USE_SEQ_POS_Q and not IS_KV_PADDED))
-    )
-    if TWO_STAGES:
-        # Border between two stages
-        hi_stage_1 = min(q_seq_start + start_m * BLOCK_M + 1, kv_len) - BLOCK_N
-        hi_stage_1 = (
-            hi_stage_1 // BLOCK_N
-        ) * BLOCK_N  # Don't understand why it doesn't work without this
-    else:
-        hi_stage_1 = kv_len
-
-    # Stage 1 - no boundary conditions
-    acc, l_i, m_i = _fwd_kernel_triton_flash_inner(
+    @triton.jit
+    def _fwd_kernel_triton_flash_inner(
         acc,
         l_i,
         m_i,
@@ -279,31 +40,247 @@ def _fwd_kernel_triton_flash(
         K_block_ptr,
         V_block_ptr,
         q_seq_start,
-        0,
-        hi_stage_1,
+        lo,
+        hi,
         start_m,
         qk_scale,
         kv_len,
         offs_m,
         offs_n,
-        BLOCK_M,
-        BLOCK_N,
-        IS_CAUSAL,
-        BOUNDS_CHECKS_N,
-        CAST_BEFORE_MATMUL,
-        ALLOW_TF32,
-        STAGE=1,
-        pre_load_v=pre_load_v,
-    )
-    if TWO_STAGES:
-        hi = (
-            tl.minimum(kv_len, q_seq_start + (start_m + 1) * BLOCK_M)
-            if IS_CAUSAL
-            else kv_len
+        BLOCK_M: tl.constexpr,
+        BLOCK_N: tl.constexpr,
+        IS_CAUSAL: tl.constexpr,
+        BOUNDS_CHECKS_N: tl.constexpr,
+        CAST_BEFORE_MATMUL: tl.constexpr,
+        ALLOW_TF32: tl.constexpr,
+        STAGE: tl.constexpr,
+        pre_load_v: tl.constexpr,
+    ):
+        BOUNDS_CHECKS_STAGE: tl.constexpr = BOUNDS_CHECKS_N and STAGE == 2
+        # Doesn't seem to make a difference
+        if STAGE == 1:
+            lo = 0
+        else:
+            lo = tl.multiple_of(lo, BLOCK_N)
+            K_block_ptr = tl.advance(K_block_ptr, (0, lo))
+            V_block_ptr = tl.advance(V_block_ptr, (lo, 0))
+
+        # loop over k, v and update accumulator
+        for start_n in range(lo, hi, BLOCK_N):
+            start_n = tl.multiple_of(
+                start_n, BLOCK_N
+            )  # doesn't seem to make a difference
+            # -- load k, v --
+            k = tl.load(K_block_ptr, boundary_check=(1,) if BOUNDS_CHECKS_STAGE else ())
+            # Moving masking here seems to introduce num errors,
+            # e.g. in test_forward[tritonflashattF-cuda-torch.bfloat16-NoneType-1-256-15-1-32-32-False-BMHK]
+            # if BOUNDS_CHECKS_N or USE_SEQ_LEN:
+            #     k = tl.where(hi - tl.arange(0, BLOCK_N) > start_n, k, float("-inf"))
+            if pre_load_v:
+                v = tl.load(
+                    V_block_ptr, boundary_check=(0,) if BOUNDS_CHECKS_STAGE else ()
+                )
+            # -- compute qk ---
+            qk = tl.zeros([BLOCK_M, BLOCK_N], dtype=tl.float32)
+            qk += tl.dot(q.to(k.dtype), k, allow_tf32=ALLOW_TF32) * qk_scale
+            if CAST_BEFORE_MATMUL:
+                k = k.to(tl.float32)
+            if STAGE == 2:
+                if IS_CAUSAL:
+                    # For some reason this is faster than start_n <= q_seq_start + offs_m[:, None] - offs_n[None, :]
+                    qk = tl.where(
+                        q_seq_start + offs_m[:, None] >= (start_n + offs_n[None, :]),
+                        qk,
+                        float("-inf"),
+                    )
+                if BOUNDS_CHECKS_N:
+                    qk = tl.where(
+                        tl.arange(0, BLOCK_N) < hi - start_n, qk, float("-inf")
+                    )
+
+            # -- compute scaling constant ---
+            m_i_new = tl.maximum(m_i, tl.max(qk, 1))
+            qk = qk - m_i_new[:, None]
+            alpha = tl.math.exp2(m_i - m_i_new)
+            p = tl.math.exp2(qk)
+
+            # -- scale and update acc --
+            acc *= alpha[:, None]
+            if not pre_load_v:
+                v = tl.load(
+                    V_block_ptr, boundary_check=(0,) if BOUNDS_CHECKS_STAGE else ()
+                )
+            if CAST_BEFORE_MATMUL:
+                v = v.to(tl.float32)
+            acc += tl.dot(p.to(v.dtype), v, allow_tf32=ALLOW_TF32)
+            # -- update m_i and l_i --
+            l_i = l_i * alpha + tl.sum(p, 1)
+            m_i = m_i_new
+            # update pointers
+            K_block_ptr = tl.advance(K_block_ptr, (0, BLOCK_N))
+            V_block_ptr = tl.advance(V_block_ptr, (BLOCK_N, 0))
+        return acc, l_i, m_i
+
+    @triton.jit
+    def _fwd_kernel_triton_flash(
+        Q,
+        K,
+        V,
+        sm_scale,
+        L,
+        Out,
+        Seq_len,
+        Seq_pos_q,
+        stride_qz,
+        stride_qh,
+        stride_qm,
+        stride_qk,
+        stride_kz,
+        stride_kh,
+        stride_kn,
+        stride_kk,
+        stride_vz,
+        stride_vh,
+        stride_vk,
+        stride_vn,
+        stride_oz,
+        stride_oh,
+        stride_om,
+        stride_on,
+        Z,
+        H,
+        N_CTX,
+        Mkv,
+        BLOCK_M: tl.constexpr,
+        BLOCK_DMODEL: tl.constexpr,
+        BLOCK_N: tl.constexpr,
+        IS_CAUSAL: tl.constexpr,
+        BOUNDS_CHECKS_N: tl.constexpr,
+        BOUNDS_CHECKS_M: tl.constexpr,
+        ALLOW_TF32: tl.constexpr,
+        CAST_BEFORE_MATMUL: tl.constexpr,
+        USE_SEQ_LEN_KV: tl.constexpr,
+        USE_SEQ_POS_Q: tl.constexpr,
+        IS_KV_PADDED: tl.constexpr,  # Switch between padded and non-padded block-diagonal causal masks
+        pre_load_v: tl.constexpr,  # TODO: understand if that matters
+    ):
+        start_m = tl.program_id(0).to(tl.int64)
+        off_hz = tl.program_id(1).to(tl.int64)
+
+        tl.static_assert((IS_KV_PADDED and USE_SEQ_POS_Q) or not IS_KV_PADDED)
+
+        off_z = off_hz // H
+        off_h = off_hz % H
+        if USE_SEQ_POS_Q:
+            seqpos = tl.load(Seq_pos_q + off_z)
+            seqpos_next = tl.load(Seq_pos_q + off_z + 1)
+            q_len = seqpos_next - seqpos
+            q_offset = seqpos * stride_qm + off_h * stride_qh
+            out_offset = seqpos * stride_om + off_h * stride_oh
+            if not IS_KV_PADDED:
+                # BlockDiagonalCausalMask, no padding, use same sequence positions as for Q
+                kv_offset = seqpos * stride_kn + off_h * stride_kh
+                kv_len = q_len
+                q_seq_start = 0
+            else:
+                # BlockDiagonalCausalWithOffsetPaddedKeysMask
+                kv_offset = off_z * stride_kz + off_h * stride_kh
+                if USE_SEQ_LEN_KV:
+                    kv_len = tl.load(Seq_len + off_z)
+                    q_seq_start = kv_len - q_len
+                else:
+                    # if no variable K/V seqlens are provided, assume full length
+                    kv_len = Mkv
+                    q_seq_start = 0
+        else:
+            # No mask or simple causal mask
+            q_len = N_CTX
+            q_offset = off_z * stride_qz + off_h * stride_qh
+            out_offset = off_z * stride_oz + off_h * stride_oh
+
+            kv_len = Mkv
+            q_seq_start = 0
+            kv_offset = off_z * stride_kz + off_h * stride_kh
+
+        Q_block_ptr = tl.make_block_ptr(
+            base=Q + q_offset,
+            shape=(q_len, BLOCK_DMODEL),
+            strides=(stride_qm, stride_qk),
+            offsets=(start_m * BLOCK_M, 0),
+            block_shape=(BLOCK_M, BLOCK_DMODEL),
+            order=(1, 0),
         )
-        # Do we need this barrier?
-        # tl.debug_barrier()
-        # Stage 2 - with boundary conditions
+        K_block_ptr = tl.make_block_ptr(
+            base=K + kv_offset,
+            shape=(BLOCK_DMODEL, kv_len),
+            strides=(stride_kk, stride_kn),
+            offsets=(0, 0),
+            block_shape=(BLOCK_DMODEL, BLOCK_N),
+            order=(0, 1),
+        )
+        V_block_ptr = tl.make_block_ptr(
+            base=V + kv_offset,
+            shape=(kv_len, BLOCK_DMODEL),
+            strides=(stride_vk, stride_vn),
+            offsets=(0, 0),
+            block_shape=(BLOCK_N, BLOCK_DMODEL),
+            order=(0, 1),
+        )
+        # initialize offsets
+        offs_m = start_m * BLOCK_M + tl.arange(0, BLOCK_M)  # For Q
+        offs_n = tl.arange(0, BLOCK_N)  # For K/V
+        # initialize pointer to m and l
+        m_i = tl.zeros([BLOCK_M], dtype=tl.float32) - float("inf")
+        l_i = tl.zeros([BLOCK_M], dtype=tl.float32) + 1.0
+        acc = tl.zeros([BLOCK_M, BLOCK_DMODEL], dtype=tl.float32)
+        # scale sm_scale by log_2(e) and use
+        # 2^x instead of exp in the loop because CSE and LICM
+        # don't work as expected with `exp` in the loop
+        qk_scale = sm_scale * 1.44269504
+        # load q: it will stay in SRAM throughout on NV GPUs but in VGPRs on AMD GPUs
+        q = tl.load(
+            Q_block_ptr, boundary_check=(0,) if BOUNDS_CHECKS_M or USE_SEQ_POS_Q else ()
+        )
+
+        # The loop over K/V sequence blocks is divided into two stages:
+        # Stage 1: (many) blocks which don't need boundary conditions checks - not touching sequence end or diagonal
+        # Stage 2: (few) blocks which need boundary conditions checks
+        # Following https://github.com/openai/triton/blob/293b7fd592a1602f2305c1bd0bc978bbd97337d6/python/tutorials/06-fused-attention.py  # noqa: E501
+
+        """
+        Iteration doesn't need masking if
+            - 1) block doesn't cross the diagonal: max(kv_pos) <= min(q_pos)
+            - 2) block doesn't cross the end of the sequence: max(kv_pos) < kv_len
+        Find maximum start_n for which condition 1 is satisifed.
+        Remember that
+            q_pos = q_seq_start + offs_m[:, None]
+            kv_pos = start_n + offs_n[None, :]
+            offs_m = start_m * BLOCK_M + tl.arange(0, BLOCK_M)
+            offs_n = tl.arange(0, BLOCK_N)
+        min(q_pos) = q_seq_start + start_m * BLOCK_M
+        max(kv_pos) = start_n + BLOCK_N - 1
+        So the condition becomes
+            q_seq_start + start_m * BLOCK_M >= start_n + BLOCK_N - 1
+        So:
+        1) start_n <= q_seq_start + start_m * BLOCK_M - BLOCK_N + 1
+        2) start_n <= kv_len - BLOCK_N
+
+        So the last allowed start_n without masking is min(q_seq_start + start_m * BLOCK_M + 1, kv_len) - BLOCK_N
+        """
+        # Second stage can only be skipped if no mask is used and K/V length is divisible by the tile size
+        TWO_STAGES: tl.constexpr = BOUNDS_CHECKS_N or (
+            IS_CAUSAL or (USE_SEQ_LEN_KV or (USE_SEQ_POS_Q and not IS_KV_PADDED))
+        )
+        if TWO_STAGES:
+            # Border between two stages
+            hi_stage_1 = min(q_seq_start + start_m * BLOCK_M + 1, kv_len) - BLOCK_N
+            hi_stage_1 = (
+                hi_stage_1 // BLOCK_N
+            ) * BLOCK_N  # Don't understand why it doesn't work without this
+        else:
+            hi_stage_1 = kv_len
+
+        # Stage 1 - no boundary conditions
         acc, l_i, m_i = _fwd_kernel_triton_flash_inner(
             acc,
             l_i,
@@ -312,8 +289,8 @@ def _fwd_kernel_triton_flash(
             K_block_ptr,
             V_block_ptr,
             q_seq_start,
+            0,
             hi_stage_1,
-            hi,
             start_m,
             qk_scale,
             kv_len,
@@ -325,108 +302,142 @@ def _fwd_kernel_triton_flash(
             BOUNDS_CHECKS_N,
             CAST_BEFORE_MATMUL,
             ALLOW_TF32,
-            STAGE=2,
+            STAGE=1,
             pre_load_v=pre_load_v,
         )
+        if TWO_STAGES:
+            hi = (
+                tl.minimum(kv_len, q_seq_start + (start_m + 1) * BLOCK_M)
+                if IS_CAUSAL
+                else kv_len
+            )
+            # Do we need this barrier?
+            # tl.debug_barrier()
+            # Stage 2 - with boundary conditions
+            acc, l_i, m_i = _fwd_kernel_triton_flash_inner(
+                acc,
+                l_i,
+                m_i,
+                q,
+                K_block_ptr,
+                V_block_ptr,
+                q_seq_start,
+                hi_stage_1,
+                hi,
+                start_m,
+                qk_scale,
+                kv_len,
+                offs_m,
+                offs_n,
+                BLOCK_M,
+                BLOCK_N,
+                IS_CAUSAL,
+                BOUNDS_CHECKS_N,
+                CAST_BEFORE_MATMUL,
+                ALLOW_TF32,
+                STAGE=2,
+                pre_load_v=pre_load_v,
+            )
+
+        # write back l and m
+        acc1 = acc / l_i[:, None]
+        l_ptrs = L + off_hz * N_CTX + offs_m
+        # Save LSE, converting from log2 to natural logarithm
+        l_mask = (
+            start_m * BLOCK_M + tl.arange(0, BLOCK_M) < q_len
+            if BOUNDS_CHECKS_M
+            else None
+        )
+        tl.store(l_ptrs, (m_i + tl.math.log2(l_i)) / 1.44269504, mask=l_mask)
+        # write back O
+        O_block_ptr = tl.make_block_ptr(
+            base=Out + out_offset,
+            shape=(q_len, BLOCK_DMODEL),
+            strides=(stride_om, stride_on),
+            offsets=(start_m * BLOCK_M, 0),
+            block_shape=(BLOCK_M, BLOCK_DMODEL),
+            order=(1, 0),
+        )
+        tl.store(
+            O_block_ptr,
+            acc1.to(Out.dtype.element_ty),
+            boundary_check=(0,) if BOUNDS_CHECKS_M or USE_SEQ_POS_Q else (),
+        )
+
+    _autotuner_config_amd_full = [
+        triton.Config(
+            {"BLOCK_M": 256, "BLOCK_N": 64, "waves_per_eu": 2, "pre_load_v": False},
+            num_stages=1,
+            num_warps=8,
+        ),
+        triton.Config(
+            {"BLOCK_M": 128, "BLOCK_N": 128, "waves_per_eu": 2, "pre_load_v": False},
+            num_stages=1,
+            num_warps=4,
+        ),
+        triton.Config(
+            {"BLOCK_M": 256, "BLOCK_N": 128, "waves_per_eu": 2, "pre_load_v": False},
+            num_stages=1,
+            num_warps=8,
+        ),
+        triton.Config(
+            {"BLOCK_M": 128, "BLOCK_N": 64, "waves_per_eu": 3, "pre_load_v": True},
+            num_stages=1,
+            num_warps=4,
+        ),  # d64-False
+        triton.Config(
+            {"BLOCK_M": 128, "BLOCK_N": 64, "waves_per_eu": 3, "pre_load_v": False},
+            num_stages=1,
+            num_warps=4,
+        ),  # d64-True
+    ]
+
+    _autotuner_config_amd_dummy = [
+        triton.Config(
+            {"BLOCK_M": 128, "BLOCK_N": 64, "waves_per_eu": 2, "pre_load_v": False},
+            num_stages=1,
+            num_warps=8,
+        ),
+    ]
+
+    _autotuner_config_nvidia_dummy = [
+        triton.Config(
+            {"BLOCK_M": 128, "BLOCK_N": 64, "pre_load_v": False},
+            num_stages=1,
+            num_warps=8,
+        ),
+    ]
+
+    def autotune_kernel(kernel, autotune):
 
-    # write back l and m
-    acc1 = acc / l_i[:, None]
-    l_ptrs = L + off_hz * N_CTX + offs_m
-    # Save LSE, converting from log2 to natural logarithm
-    l_mask = (
-        start_m * BLOCK_M + tl.arange(0, BLOCK_M) < q_len if BOUNDS_CHECKS_M else None
-    )
-    tl.store(l_ptrs, (m_i + tl.math.log2(l_i)) / 1.44269504, mask=l_mask)
-    # write back O
-    O_block_ptr = tl.make_block_ptr(
-        base=Out + out_offset,
-        shape=(q_len, BLOCK_DMODEL),
-        strides=(stride_om, stride_on),
-        offsets=(start_m * BLOCK_M, 0),
-        block_shape=(BLOCK_M, BLOCK_DMODEL),
-        order=(1, 0),
-    )
-    tl.store(
-        O_block_ptr,
-        acc1.to(Out.dtype.element_ty),
-        boundary_check=(0,) if BOUNDS_CHECKS_M or USE_SEQ_POS_Q else (),
-    )
-
-
-_autotuner_config_amd_full = [
-    triton.Config(
-        {"BLOCK_M": 256, "BLOCK_N": 64, "waves_per_eu": 2, "pre_load_v": False},
-        num_stages=1,
-        num_warps=8,
-    ),
-    triton.Config(
-        {"BLOCK_M": 128, "BLOCK_N": 128, "waves_per_eu": 2, "pre_load_v": False},
-        num_stages=1,
-        num_warps=4,
-    ),
-    triton.Config(
-        {"BLOCK_M": 256, "BLOCK_N": 128, "waves_per_eu": 2, "pre_load_v": False},
-        num_stages=1,
-        num_warps=8,
-    ),
-    triton.Config(
-        {"BLOCK_M": 128, "BLOCK_N": 64, "waves_per_eu": 3, "pre_load_v": True},
-        num_stages=1,
-        num_warps=4,
-    ),  # d64-False
-    triton.Config(
-        {"BLOCK_M": 128, "BLOCK_N": 64, "waves_per_eu": 3, "pre_load_v": False},
-        num_stages=1,
-        num_warps=4,
-    ),  # d64-True
-]
-
-
-_autotuner_config_amd_dummy = [
-    triton.Config(
-        {"BLOCK_M": 128, "BLOCK_N": 64, "waves_per_eu": 2, "pre_load_v": False},
-        num_stages=1,
-        num_warps=8,
-    ),
-]
-
-_autotuner_config_nvidia_dummy = [
-    triton.Config(
-        {"BLOCK_M": 128, "BLOCK_N": 64, "pre_load_v": False},
-        num_stages=1,
-        num_warps=8,
-    ),
-]
-
-
-def autotune_kernel(kernel, autotune):
-
-    kernel = triton.heuristics(
-        values={
-            "BOUNDS_CHECKS_N": lambda args: ((args["Mkv"] % args["BLOCK_N"]) != 0)
-            or (args["USE_SEQ_POS_Q"] and not args["IS_KV_PADDED"]),
-            "BOUNDS_CHECKS_M": lambda args: (args["N_CTX"] % args["BLOCK_M"]) != 0,
-        }
-    )(kernel)
-
-    if torch.version.cuda:
-        configs = _autotuner_config_nvidia_dummy
-    elif autotune:
-        configs = _autotuner_config_amd_full
-    else:
-        configs = _autotuner_config_amd_dummy
-
-    kernel = triton.autotune(
-        configs=configs,
-        key=["Z", "H", "N_CTX", "IS_CAUSAL", "BLOCK_DMODEL"],
-    )(kernel)
-    return kernel
-
-
-_fwd_kernel_triton_flash_maybe_autotuned = {
-    True: autotune_kernel(_fwd_kernel_triton_flash, True),
-    False: autotune_kernel(_fwd_kernel_triton_flash, False),
-}
+        kernel = triton.heuristics(
+            values={
+                "BOUNDS_CHECKS_N": lambda args: ((args["Mkv"] % args["BLOCK_N"]) != 0)
+                or (args["USE_SEQ_POS_Q"] and not args["IS_KV_PADDED"]),
+                "BOUNDS_CHECKS_M": lambda args: (args["N_CTX"] % args["BLOCK_M"]) != 0,
+            }
+        )(kernel)
+
+        if torch.version.cuda:
+            configs = _autotuner_config_nvidia_dummy
+        elif autotune:
+            configs = _autotuner_config_amd_full
+        else:
+            configs = _autotuner_config_amd_dummy
+
+        kernel = triton.autotune(
+            configs=configs,
+            key=["Z", "H", "N_CTX", "IS_CAUSAL", "BLOCK_DMODEL"],
+        )(kernel)
+        return kernel
+
+    _fwd_kernel_triton_flash_maybe_autotuned = {
+        True: autotune_kernel(_fwd_kernel_triton_flash, True),
+        False: autotune_kernel(_fwd_kernel_triton_flash, False),
+    }
+else:
+    _fwd_kernel_triton_flash = None
+    _fwd_kernel_triton_flash_maybe_autotuned = dict()
 
 
 def _prepare_inputs(inp: Inputs) -> Inputs:

From 21f190455a2f17d8d28fe6880f32cfce1ced97ca Mon Sep 17 00:00:00 2001
From: Max Podkorytov <4273004+tenpercent@users.noreply.github.com>
Date: Sat, 17 Feb 2024 00:50:12 +0000
Subject: [PATCH 471/641] re-enable test_triton_layernorm as it passes

---
 tests/test_triton_layernorm.py | 6 ------
 1 file changed, 6 deletions(-)

diff --git a/tests/test_triton_layernorm.py b/tests/test_triton_layernorm.py
index 50dde39bb..954dca4f1 100644
--- a/tests/test_triton_layernorm.py
+++ b/tests/test_triton_layernorm.py
@@ -12,10 +12,6 @@
 
 import xformers
 
-disable_on_rocm = pytest.mark.skipif(
-    not not torch.version.hip, reason="could not be done on ROCM"
-)
-
 try:
     from xformers.triton import FusedLayerNorm
 
@@ -38,7 +34,6 @@
 ]
 
 
-@disable_on_rocm
 @pytest.mark.skipif(not _triton_available, reason="Triton is not available")
 @pytest.mark.parametrize("shape", SHAPES)
 @pytest.mark.parametrize("amp", [True, False])
@@ -103,7 +98,6 @@ def test_layernorm_parity(shape, amp):
         )
 
 
-@disable_on_rocm
 @pytest.mark.skipif(not _triton_available, reason="Triton is not available")
 @pytest.mark.parametrize("dtype", [torch.float16, torch.float32, torch.bfloat16])
 def test_no_contiguous(dtype):

From d880c365aef3d5a953ca06b2d0bbf33cf59f6682 Mon Sep 17 00:00:00 2001
From: Max Podkorytov <4273004+tenpercent@users.noreply.github.com>
Date: Sat, 17 Feb 2024 00:53:33 +0000
Subject: [PATCH 472/641] re-enable test_triton_blocksparse as it passes

---
 tests/test_triton_blocksparse.py | 8 --------
 1 file changed, 8 deletions(-)

diff --git a/tests/test_triton_blocksparse.py b/tests/test_triton_blocksparse.py
index 8c458f457..a56386bd4 100644
--- a/tests/test_triton_blocksparse.py
+++ b/tests/test_triton_blocksparse.py
@@ -13,10 +13,6 @@
 from xformers.components.attention import build_attention
 from xformers.components.attention.attention_patterns import block_sparsify_tensor
 
-disable_on_rocm = pytest.mark.skipif(
-    not not torch.version.hip, reason="could not be done on ROCM"
-)
-
 
 def catch_oor(fn):
     @functools.wraps(fn)
@@ -64,7 +60,6 @@ def mask_tensor(x, mask, block, value=0):
     return ret
 
 
-@disable_on_rocm
 @pytest.mark.skipif(not _triton_available, reason="Triton requires a recent CUDA gpu")
 @pytest.mark.parametrize("MODE", _matmul_types)
 @pytest.mark.parametrize("TRANS_A", [False, True])
@@ -116,7 +111,6 @@ def test_matmul(MODE, TRANS_A, TRANS_B, BLOCK, DTYPE, Z=32, H=2, M=512, N=384, K
     torch.testing.assert_close(rc, tc)
 
 
-@disable_on_rocm
 @pytest.mark.skipif(not _triton_available, reason="Triton requires a recent CUDA gpu")
 @pytest.mark.parametrize("BLOCK", [32, 128])
 @pytest.mark.parametrize("WIDTH", [256, 576, 1024, 1792])
@@ -147,7 +141,6 @@ def test_softmax(BLOCK, WIDTH, DTYPE):
     torch.testing.assert_close(ry, ty)
 
 
-@disable_on_rocm
 @pytest.mark.skipif(not _triton_available, reason="Triton requires a recent CUDA gpu")
 @pytest.mark.parametrize("block", [32, 43, 128])  # 16, 32,
 @pytest.mark.parametrize("dtype", [torch.float16])
@@ -221,7 +214,6 @@ def loss_fn(x):
             )
 
 
-@disable_on_rocm
 @pytest.mark.skipif(not _triton_available, reason="Triton requires a recent CUDA gpu")
 @pytest.mark.parametrize("dtype", [torch.float16])
 def test_blocksparse_attention_parity(dtype):

From 059c84fa7594a2d6f49c7c914e2975aee877c548 Mon Sep 17 00:00:00 2001
From: Max Podkorytov <4273004+tenpercent@users.noreply.github.com>
Date: Sat, 17 Feb 2024 01:05:27 +0000
Subject: [PATCH 473/641] cleanup test_sparse_tensors

---
 tests/test_sparse_tensors.py | 9 +--------
 1 file changed, 1 insertion(+), 8 deletions(-)

diff --git a/tests/test_sparse_tensors.py b/tests/test_sparse_tensors.py
index 21246c175..d4ab76002 100644
--- a/tests/test_sparse_tensors.py
+++ b/tests/test_sparse_tensors.py
@@ -12,13 +12,9 @@
 from xformers.sparse import BlockSparseTensor, SparseCSRTensor
 
 cuda_only = pytest.mark.skipif(not torch.cuda.is_available(), reason="requires CUDA")
-_devices = ["cpu", "cuda:0"] if torch.cuda.is_available() else ["cpu"]
+_devices = ["cpu", "cuda:0"] if torch.cuda.is_available() and torch.version.cuda else ["cpu"]
 _tensor_types = [BlockSparseTensor, SparseCSRTensor]
 
-disable_on_rocm = pytest.mark.skipif(
-    not not torch.version.hip, reason="could not be done on ROCM"
-)
-
 
 def _create_blocksparse_tensor(
     device, block_size=32, Z=8, C=2, H=64, W=64, dtype=torch.float32
@@ -105,7 +101,6 @@ def test_sparse_binary_ops(func, device):
     assert torch.allclose(res, res_gt)
 
 
-@disable_on_rocm
 @pytest.mark.parametrize("tensor_type", _tensor_types)
 @pytest.mark.parametrize("device", _devices)
 def test_masked_matmul(tensor_type, device):
@@ -158,7 +153,6 @@ def test_masked_matmul(tensor_type, device):
     assert torch.allclose(b.grad, bb.grad, atol=atol)
 
 
-@disable_on_rocm
 @pytest.mark.parametrize("tensor_type", _tensor_types)
 @pytest.mark.parametrize("device", _devices)
 def test_bmm(tensor_type, device):
@@ -208,7 +202,6 @@ def test_bmm(tensor_type, device):
     ), f"{torch.max(torch.abs(a_grad-a_sparse.grad.to_dense()))}"
 
 
-@disable_on_rocm
 @pytest.mark.parametrize("tensor_type", _tensor_types)
 @pytest.mark.parametrize("device", _devices)
 def test_sparse_softmax(tensor_type, device):

From 8aa0bdc52312dbcd1bbe49a0dd52dbe417e6ad26 Mon Sep 17 00:00:00 2001
From: Max Podkorytov <4273004+tenpercent@users.noreply.github.com>
Date: Sat, 17 Feb 2024 01:10:38 +0000
Subject: [PATCH 474/641] cleanup test_custom_ops

---
 tests/test_custom_ops.py | 18 ++----------------
 1 file changed, 2 insertions(+), 16 deletions(-)

diff --git a/tests/test_custom_ops.py b/tests/test_custom_ops.py
index 676952df7..4d9e61890 100644
--- a/tests/test_custom_ops.py
+++ b/tests/test_custom_ops.py
@@ -16,12 +16,9 @@
     _sparse_bmm,
 )
 
-cuda_only = pytest.mark.skipif(not torch.cuda.is_available(), reason="requires CUDA")
-disable_on_rocm = pytest.mark.skipif(
-    not not torch.version.hip, reason="could not be done on ROCM"
-)
+cuda_only = pytest.mark.skipif(not torch.cuda.is_available() or not torch.version.cuda, reason="requires CUDA")
 
-_devices = ["cpu", "cuda"] if torch.cuda.is_available() else ["cpu"]
+_devices = ["cpu", "cuda"] if torch.cuda.is_available() and torch.version.cuda else ["cpu"]
 
 
 def _baseline_matmul_with_sparse_mask(
@@ -62,7 +59,6 @@ def _baseline_sparse_bmm(a: torch.Tensor, b: torch.Tensor) -> torch.Tensor:
     return torch.stack(out, dim=0)
 
 
-@disable_on_rocm
 @pytest.mark.parametrize("is_sparse", [True, False])
 @pytest.mark.parametrize("contiguous", [True, False])
 @pytest.mark.parametrize("device", _devices)
@@ -94,7 +90,6 @@ def test_matmul_with_mask(device, contiguous, is_sparse):
     assert torch.allclose(res, res_gt)
 
 
-@disable_on_rocm
 @pytest.mark.parametrize("is_sparse", [True, False])
 @pytest.mark.parametrize("contiguous", [True, False])
 @pytest.mark.parametrize("device", _devices)
@@ -137,7 +132,6 @@ def compute_grads(f):
     assert torch.allclose(grad_b, b.grad)
 
 
-@disable_on_rocm
 @pytest.mark.parametrize("device", _devices)
 def test_sddmm_sputnik(device):
     B, L, M, K = 8, 30, 16, 32
@@ -165,7 +159,6 @@ def test_sddmm_sputnik(device):
 
 
 @cuda_only
-@disable_on_rocm
 @pytest.mark.parametrize("prob", [0.5, 1])
 @pytest.mark.parametrize("K", [32, 17])
 @pytest.mark.parametrize("M", [30, 17])
@@ -196,7 +189,6 @@ def test_sddmm_csr(L, M, K, prob):
 
 
 @cuda_only
-@disable_on_rocm
 @pytest.mark.parametrize("nnz", [0, 4, 16, 20, 36])
 def test_sddmm_csr_per_nnz(nnz):
     device = torch.device("cuda")
@@ -224,7 +216,6 @@ def test_sddmm_csr_per_nnz(nnz):
 
 
 @cuda_only
-@disable_on_rocm
 @pytest.mark.parametrize("prob", [0.5, 1])
 @pytest.mark.parametrize("K", [32, 17])
 @pytest.mark.parametrize("M", [30, 17])
@@ -257,7 +248,6 @@ def test_sddmm_coo(L, M, K, prob):
     assert torch.allclose(res, res_gt, atol=1e-6)
 
 
-@disable_on_rocm
 @pytest.mark.parametrize("device", _devices)
 def test_sddmm_sputnik_backward(device):
     contiguous = True
@@ -291,7 +281,6 @@ def test_sddmm_sputnik_backward(device):
     assert torch.allclose(grad_b, b.grad, atol=1e-7)
 
 
-@disable_on_rocm
 @pytest.mark.parametrize("device", _devices)
 def test_sparse_softmax_sputnik(device):
     B, L = 8, 30
@@ -314,7 +303,6 @@ def test_sparse_softmax_sputnik(device):
     assert torch.allclose(res, res_gt)
 
 
-@disable_on_rocm
 @pytest.mark.parametrize("device", _devices)
 def test_sparse_softmax_sputnik_backward(device):
     B, L = 8, 30
@@ -337,7 +325,6 @@ def test_sparse_softmax_sputnik_backward(device):
     )
 
 
-@disable_on_rocm
 @pytest.mark.parametrize("device", _devices)
 def test_spmm_sputnik(device):
     B, L, K = 8, 30, 32
@@ -363,7 +350,6 @@ def test_spmm_sputnik(device):
     assert torch.allclose(res, res_gt)
 
 
-@disable_on_rocm
 @pytest.mark.parametrize("device", _devices)
 def test_spmm_sputnik_backward(device):
     B, M, L, K = 8, 16, 30, 32

From 5bc7bbef9cb831f3189bc6aaf7ad04237ddf2ff7 Mon Sep 17 00:00:00 2001
From: Max Podkorytov <4273004+tenpercent@users.noreply.github.com>
Date: Sat, 17 Feb 2024 01:11:17 +0000
Subject: [PATCH 475/641] reapply black

---
 tests/test_custom_ops.py     | 8 ++++++--
 tests/test_sparse_tensors.py | 4 +++-
 2 files changed, 9 insertions(+), 3 deletions(-)

diff --git a/tests/test_custom_ops.py b/tests/test_custom_ops.py
index 4d9e61890..7e8a78593 100644
--- a/tests/test_custom_ops.py
+++ b/tests/test_custom_ops.py
@@ -16,9 +16,13 @@
     _sparse_bmm,
 )
 
-cuda_only = pytest.mark.skipif(not torch.cuda.is_available() or not torch.version.cuda, reason="requires CUDA")
+cuda_only = pytest.mark.skipif(
+    not torch.cuda.is_available() or not torch.version.cuda, reason="requires CUDA"
+)
 
-_devices = ["cpu", "cuda"] if torch.cuda.is_available() and torch.version.cuda else ["cpu"]
+_devices = (
+    ["cpu", "cuda"] if torch.cuda.is_available() and torch.version.cuda else ["cpu"]
+)
 
 
 def _baseline_matmul_with_sparse_mask(
diff --git a/tests/test_sparse_tensors.py b/tests/test_sparse_tensors.py
index d4ab76002..641f2ffc7 100644
--- a/tests/test_sparse_tensors.py
+++ b/tests/test_sparse_tensors.py
@@ -12,7 +12,9 @@
 from xformers.sparse import BlockSparseTensor, SparseCSRTensor
 
 cuda_only = pytest.mark.skipif(not torch.cuda.is_available(), reason="requires CUDA")
-_devices = ["cpu", "cuda:0"] if torch.cuda.is_available() and torch.version.cuda else ["cpu"]
+_devices = (
+    ["cpu", "cuda:0"] if torch.cuda.is_available() and torch.version.cuda else ["cpu"]
+)
 _tensor_types = [BlockSparseTensor, SparseCSRTensor]
 
 
From 5b4ebe4d4c12017d10c3a29f8dfdfd1c6e2a1c86 Mon Sep 17 00:00:00 2001
From: Max Podkorytov <4273004+tenpercent@users.noreply.github.com>
Date: Sat, 17 Feb 2024 01:22:04 +0000
Subject: [PATCH 476/641] cleanup test_core_attention

---
 tests/test_core_attention.py | 12 ++----------
 1 file changed, 2 insertions(+), 10 deletions(-)

diff --git a/tests/test_core_attention.py b/tests/test_core_attention.py
index e80b0d5fe..87ad8dd5b 100644
--- a/tests/test_core_attention.py
+++ b/tests/test_core_attention.py
@@ -16,10 +16,6 @@
 
 _is_blocksparse_available = _is_triton_available()
 
-disable_on_rocm = pytest.mark.skipif(
-    not not torch.version.hip, reason="could not be done on ROCM"
-)
-
 
 def catch_oor(fn):
     @functools.wraps(fn)
@@ -35,7 +31,7 @@ def fn_and_catch_oor(*args, **kwargs):
     return fn_and_catch_oor
 
 
-_devices = ["cpu", "cuda"] if torch.cuda.is_available() else ["cpu"]
+_devices = ["cpu", "cuda"] if torch.cuda.is_available() and torch.version.cuda else ["cpu"]
 
 
 def test_core_attention():
@@ -85,7 +81,6 @@ def test_core_attention_mask_types():
     r_dense_add = scaled_dot_product_attention(a, a, a, float_mask_add)
 
 
-@disable_on_rocm
 @pytest.mark.parametrize("device", _devices)
 def test_amp_attention_dense_no_mask(device):
     b, s, d = 8, 64, 32
@@ -99,7 +94,6 @@ def test_amp_attention_dense_no_mask(device):
     assert r.dtype == expected_device
 
 
-@disable_on_rocm
 @pytest.mark.parametrize("device", _devices)
 def test_amp_attention_dense(device):
     b, s, d = 8, 64, 32
@@ -115,7 +109,6 @@ def test_amp_attention_dense(device):
     assert r.dtype == expected_device
 
 
-@disable_on_rocm
 @pytest.mark.parametrize("device", _devices)
 def test_amp_attention_sparse(device):
     b, s, d = 8, 64, 32
@@ -132,7 +125,6 @@ def test_amp_attention_sparse(device):
     assert r.dtype == expected_device
 
 
-@disable_on_rocm
 @pytest.mark.parametrize("device", _devices)
 def test_amp_attention_sparsecs(device):
     b, s, d = 8, 64, 32
@@ -149,10 +141,10 @@ def test_amp_attention_sparsecs(device):
     assert r.dtype == expected_device
 
 
-@disable_on_rocm
 @pytest.mark.skipif(
     not _is_blocksparse_available, reason="Blocksparse is not available"
 )
+@pytest.mark.skipif(not torch.version.cuda, reason="Sparse ops not supported on ROCm")
 @pytest.mark.parametrize("device", ["cuda"])
 @pytest.mark.parametrize("data_type", [torch.float16, torch.float32])
 @catch_oor

From 473ebc7fb8bcee879e60f64cb4c6ad8355a1aec2 Mon Sep 17 00:00:00 2001
From: Max Podkorytov <4273004+tenpercent@users.noreply.github.com>
Date: Sat, 17 Feb 2024 01:27:06 +0000
Subject: [PATCH 477/641] benchmark ck ops on rocm only

---
 xformers/benchmarks/benchmark_attn_decoding.py | 13 ++++++++++---
 1 file changed, 10 insertions(+), 3 deletions(-)

diff --git a/xformers/benchmarks/benchmark_attn_decoding.py b/xformers/benchmarks/benchmark_attn_decoding.py
index 19c34bb8f..7ca1a99f3 100644
--- a/xformers/benchmarks/benchmark_attn_decoding.py
+++ b/xformers/benchmarks/benchmark_attn_decoding.py
@@ -126,14 +126,21 @@ def fw(self) -> None:
 
 BENCHMARKS = {
     "pytorch": AttentionDecodingPyTorchRepeat,
-    "ck": AttentionDecodingCK,
-    "ck-decoder": AttentionDecodingCKDecoder,
-    "ck_splitK": AttentionDecodingCKSplitKV,
 }
 
 if torch.version.cuda:
     BENCHMARKS["flash-decoding"] = AttentionDecodingFlashDecoding
 
+if torch.version.hip:
+    BENCHMARKS.update(
+        {
+            "ck": AttentionDecodingCK,
+            "ck-decoder": AttentionDecodingCKDecoder,
+            "ck_splitK": AttentionDecodingCKSplitKV,
+        }
+    )
+
+
 if (sys.version_info.major, sys.version_info.minor) >= (3, 9):
     BENCHMARKS["triton_splitK"] = AttentionDecodingSplitKV
 

From 5d3247fb63187ac325931036f0b4ca0da4384434 Mon Sep 17 00:00:00 2001
From: Max Podkorytov <4273004+tenpercent@users.noreply.github.com>
Date: Mon, 19 Feb 2024 20:02:56 +0000
Subject: [PATCH 478/641] fix mypy

---
 xformers/benchmarks/benchmark_attn_decoding.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/xformers/benchmarks/benchmark_attn_decoding.py b/xformers/benchmarks/benchmark_attn_decoding.py
index 5025d40ce..f7f4ddf9f 100644
--- a/xformers/benchmarks/benchmark_attn_decoding.py
+++ b/xformers/benchmarks/benchmark_attn_decoding.py
@@ -4,7 +4,7 @@
 # LICENSE file in the root directory of this source tree.
 
 import sys
-from typing import Any
+from typing import Any, Type
 
 import torch
 from torch.utils import benchmark
@@ -127,7 +127,7 @@ def fw(self) -> None:
         return attn @ v
 
 
-BENCHMARKS = {
+BENCHMARKS : dict[str, Type[AttentionDecodingFlashDecoding]] = {
     "pytorch": AttentionDecodingPyTorchRepeat,
 }
 

From 58b0f755468054e3141b2cc0f06176648a934b1b Mon Sep 17 00:00:00 2001
From: Max Podkorytov <4273004+tenpercent@users.noreply.github.com>
Date: Wed, 21 Feb 2024 22:26:37 +0000
Subject: [PATCH 479/641] fix lint: black

---
 xformers/benchmarks/benchmark_attn_decoding.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/xformers/benchmarks/benchmark_attn_decoding.py b/xformers/benchmarks/benchmark_attn_decoding.py
index f7f4ddf9f..e313d36cc 100644
--- a/xformers/benchmarks/benchmark_attn_decoding.py
+++ b/xformers/benchmarks/benchmark_attn_decoding.py
@@ -127,7 +127,7 @@ def fw(self) -> None:
         return attn @ v
 
 
-BENCHMARKS : dict[str, Type[AttentionDecodingFlashDecoding]] = {
+BENCHMARKS: dict[str, Type[AttentionDecodingFlashDecoding]] = {
     "pytorch": AttentionDecodingPyTorchRepeat,
 }
 

From 03b72945b2e78ed856827f01902513c217e0930d Mon Sep 17 00:00:00 2001
From: Max Podkorytov <4273004+tenpercent@users.noreply.github.com>
Date: Wed, 21 Feb 2024 22:29:44 +0000
Subject: [PATCH 480/641] fix lints: mypy

---
 xformers/benchmarks/benchmark_attn_decoding.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/xformers/benchmarks/benchmark_attn_decoding.py b/xformers/benchmarks/benchmark_attn_decoding.py
index e313d36cc..ed457757f 100644
--- a/xformers/benchmarks/benchmark_attn_decoding.py
+++ b/xformers/benchmarks/benchmark_attn_decoding.py
@@ -4,7 +4,7 @@
 # LICENSE file in the root directory of this source tree.
 
 import sys
-from typing import Any, Type
+from typing import Any, Dict, Type
 
 import torch
 from torch.utils import benchmark
@@ -127,7 +127,7 @@ def fw(self) -> None:
         return attn @ v
 
 
-BENCHMARKS: dict[str, Type[AttentionDecodingFlashDecoding]] = {
+BENCHMARKS: Dict[str, Type[AttentionDecodingFlashDecoding]] = {
     "pytorch": AttentionDecodingPyTorchRepeat,
 }
 

From 0666088ce16745c02ad9cded907495343b3df695 Mon Sep 17 00:00:00 2001
From: Max Podkorytov <4273004+tenpercent@users.noreply.github.com>
Date: Thu, 8 Feb 2024 19:20:58 +0000
Subject: [PATCH 481/641] split-k decoder: move all tunable parameters to the
 top of cpp file

---
 .../csrc/attention/hip_fmha/CMakeLists.txt    |  2 +-
 .../hip_fmha/attention_forward_splitk.cpp     | 79 +++++++++++--------
 .../ck_attention_forward_decoder_splitk.h     | 47 +++++++----
 3 files changed, 78 insertions(+), 50 deletions(-)

diff --git a/xformers/csrc/attention/hip_fmha/CMakeLists.txt b/xformers/csrc/attention/hip_fmha/CMakeLists.txt
index 2bf65f305..97e2ab0b2 100644
--- a/xformers/csrc/attention/hip_fmha/CMakeLists.txt
+++ b/xformers/csrc/attention/hip_fmha/CMakeLists.txt
@@ -19,7 +19,7 @@ set(project_root_dir /xformers)
 set(xformers_csrc ${project_root_dir}/xformers/csrc)
 set(sources ${xformers_csrc}/attention/hip_fmha/attention_forward_decoder.hip)
 set(splitk_sources ${xformers_csrc}/attention/hip_fmha/attention_forward_splitk.hip)
-set(ck_include ${project_root_dir}/third_party/composable_kernel/include/)
+set(ck_include ${project_root_dir}/third_party/composable_kernel_tiled/include/)
 set(torch_include /opt/conda/envs/py_${py_version}/lib/python${py_version}/site-packages/torch/include)
 
 set_source_files_properties(${sources} ${splitk_sources} PROPERTIES LANGUAGE HIP)
diff --git a/xformers/csrc/attention/hip_fmha/attention_forward_splitk.cpp b/xformers/csrc/attention/hip_fmha/attention_forward_splitk.cpp
index 06fbbe0f6..0e9648453 100644
--- a/xformers/csrc/attention/hip_fmha/attention_forward_splitk.cpp
+++ b/xformers/csrc/attention/hip_fmha/attention_forward_splitk.cpp
@@ -8,8 +8,12 @@
 
 namespace {
 constexpr int32_t kThreadsPerWavefront = 64;
-constexpr int32_t kWavefrontsPerBlock = 16;
-constexpr int32_t K_MAX = 4 * kThreadsPerWavefront;
+constexpr int32_t kWavefrontsPerBlock = 4;
+constexpr int32_t kMaxHeadDimension = 4 * kThreadsPerWavefront;
+constexpr int32_t kMaxKVSequenceLength = 4096;
+constexpr int32_t kLoopUnroll = 16;
+constexpr int32_t kLoopUnrollTail = 2;
+using compute_t = float;
 } // namespace
 
 namespace {
@@ -48,13 +52,11 @@ namespace {
 
 template <
     int32_t ThreadsPerWavefront,
-    int32_t WavefrontsPerBlock,
-    int32_t KV_M_MAX = 8192,
-    int32_t K_MAX = 256>
+    int32_t WavefrontsPerBlock>
 at::Tensor& efficient_attention_forward_decoder_splitk_ck_out_impl(
     const at::Tensor& XQ, // [B, 1, G, H, D]
-    const at::Tensor& cache_K, // [B, KV_M_MAX, G, H or 1, D]
-    const at::Tensor& cache_V, // [B, KV_M_MAX, G, H or 1, D]
+    const at::Tensor& cache_K, // [B, kMaxKVSequenceLength, G, H or 1, D]
+    const at::Tensor& cache_V, // [B, kMaxKVSequenceLength, G, H or 1, D]
     at::optional<at::Tensor> seq_kv_lens, // [B]
     double qk_scale,
     int64_t split_k,
@@ -62,7 +64,7 @@ at::Tensor& efficient_attention_forward_decoder_splitk_ck_out_impl(
     at::Tensor& split_sumexp,
     at::Tensor& split_O,
     at::Tensor& O) {
-  static_assert(4 * ThreadsPerWavefront == K_MAX, "");
+  static_assert(4 * ThreadsPerWavefront == kMaxHeadDimension, "");
   static_assert(WavefrontsPerBlock <= ThreadsPerWavefront, "");
 
   at::OptionalDeviceGuard guard(XQ.device());
@@ -72,8 +74,8 @@ at::Tensor& efficient_attention_forward_decoder_splitk_ck_out_impl(
 
   TORCH_CHECK(!seq_kv_lens || seq_kv_lens->is_cuda());
 
-  TORCH_CHECK(cache_K.size(1) / split_k <= KV_M_MAX);
-  TORCH_CHECK(cache_K.size(4) <= K_MAX);
+  TORCH_CHECK(cache_K.size(1) / split_k <= kMaxKVSequenceLength);
+  TORCH_CHECK(cache_K.size(4) <= kMaxHeadDimension);
 
   constexpr auto rank = 5;
 
@@ -89,8 +91,8 @@ at::Tensor& efficient_attention_forward_decoder_splitk_ck_out_impl(
   dim3 blocks(B * H * M * G, split_k);
   dim3 threads(ThreadsPerWavefront, WavefrontsPerBlock);
 
-  int32_t smem_softmax = KV_M_MAX * sizeof(float) + threads.y * sizeof(float);
-  int32_t smem_output = K_MAX * sizeof(float) *
+  int32_t smem_softmax = kMaxKVSequenceLength * sizeof(compute_t) + WavefrontsPerBlock * sizeof(compute_t);
+  int32_t smem_output = kMaxHeadDimension * sizeof(compute_t) *
       threads.y; // 4 * threadsPerBlock * sizeof(float) == sizeof(O[b][0][h][:])
   const size_t lds_bytes = max(smem_softmax, smem_output);
   auto stream = at::cuda::getCurrentHIPStream().stream();
@@ -104,7 +106,7 @@ at::Tensor& efficient_attention_forward_decoder_splitk_ck_out_impl(
       [&] {
         using ck_data_t = c10_to_data_t<scalar_t>::type;
         using device_op_t =
-            ck::tensor_operation::device::FMHADecoderSplitKDeviceOp<ck_data_t>;
+            ck::tensor_operation::device::FMHADecoderSplitKDeviceOp<ck_data_t, kMaxKVSequenceLength, kLoopUnroll, kLoopUnrollTail, compute_t>;
         auto op = device_op_t{};
 
         auto XQ_acc =
@@ -168,8 +170,8 @@ at::Tensor& efficient_attention_forward_decoder_splitk_ck_out_impl(
 template <int32_t ThreadsPerWavefront, int32_t WavefrontsPerBlock>
 at::Tensor efficient_attention_forward_decoder_splitk_ck_impl(
     const at::Tensor& XQ, // [B, 1, G, H, D]
-    const at::Tensor& cache_K, // [B, KV_M_MAX, G, H or 1, D]
-    const at::Tensor& cache_V, // [B, KV_M_MAX, H or 1, D]
+    const at::Tensor& cache_K, // [B, kMaxKVSequenceLength, G, H or 1, D]
+    const at::Tensor& cache_V, // [B, kMaxKVSequenceLength, H or 1, D]
     at::optional<at::Tensor> seq_kv_lens, // [B]
     double qk_scale,
     int64_t split_k) {
@@ -210,8 +212,8 @@ at::Tensor efficient_attention_forward_decoder_splitk_ck_impl(
 
 at::Tensor efficient_attention_forward_decoder_splitk_ck(
     const at::Tensor& XQ, // [B, 1, G, H, D]
-    const at::Tensor& cache_K, // [B, KV_M_MAX, G, H or 1, D]
-    const at::Tensor& cache_V, // [B, KV_M_MAX, G, H or 1, D]
+    const at::Tensor& cache_K, // [B, kMaxKVSequenceLength, G, H or 1, D]
+    const at::Tensor& cache_V, // [B, kMaxKVSequenceLength, G, H or 1, D]
     at::optional<at::Tensor> seq_kv_lens, // [B]
     double qk_scale,
     int64_t split_k) {
@@ -365,8 +367,8 @@ static at::Tensor split_reduce_torch(
 
 static at::Tensor efficient_attention_forward_decoder_splitk_torch(
     const at::Tensor& XQ, // [B, 1, G, H, D]
-    const at::Tensor& cache_K, // [B, KV_M_MAX, G, H or 1, D]
-    const at::Tensor& cache_V, // [B, KV_M_MAX, G, H or 1, D]
+    const at::Tensor& cache_K, // [B, kMaxKVSequenceLength, G, H or 1, D]
+    const at::Tensor& cache_V, // [B, kMaxKVSequenceLength, G, H or 1, D]
     at::optional<at::Tensor> seq_kv_lens, // [B]
     double qk_scale,
     int32_t split_k,
@@ -541,16 +543,28 @@ struct FMHADecoderSplitAttentionDeviceOp : public BaseOperator {
           Q_size_k_alignment_necessary == 4
               ? efficient_attention_forward_decoder_splitk_ck_kernel<
                     scalar_t,
-                    4>
+                    4,
+                    kLoopUnroll,
+                    kLoopUnrollTail,
+                    kMaxKVSequenceLength,
+                    compute_t>
               : Q_size_k_alignment_necessary == 2
-              ? efficient_attention_forward_decoder_splitk_ck_kernel<
-                    scalar_t,
-                    2>
-              : Q_size_k_alignment_necessary == 1
-              ? efficient_attention_forward_decoder_splitk_ck_kernel<
-                    scalar_t,
-                    1>
-              : nullptr,
+                  ? efficient_attention_forward_decoder_splitk_ck_kernel<
+                        scalar_t,
+                        2,
+                        kLoopUnroll,
+                        kLoopUnrollTail,
+                        kMaxKVSequenceLength,
+                        compute_t>
+                  : Q_size_k_alignment_necessary == 1
+                      ? efficient_attention_forward_decoder_splitk_ck_kernel<
+                            scalar_t,
+                            1,
+                            kLoopUnroll,
+                            kLoopUnrollTail,
+                            kMaxKVSequenceLength,
+                            compute_t>
+                      : nullptr,
           arg.grid_dim,
           arg.block_dim,
           arg.lds_bytes,
@@ -769,12 +783,9 @@ static std::tuple<at::Tensor, at::Tensor, at::Tensor> split_attention_hip(
   dim3 blocks(B * H * M * G, split_k);
   dim3 threads(kThreadsPerWavefront, wavefronts_per_block);
 
-  constexpr int32_t KV_M_MAX = 8192;
-  constexpr int32_t K_MAX = 4 * kThreadsPerWavefront;
-
-  int32_t smem_softmax = KV_M_MAX * sizeof(float) + threads.y * sizeof(float);
-  int32_t smem_output = K_MAX * sizeof(float) *
-      threads.y; // 4 * threadsPerBlock * sizeof(float) == sizeof(O[b][0][h][:])
+  int32_t smem_softmax = kMaxKVSequenceLength * sizeof(float) + threads.y * sizeof(float);
+  int32_t smem_output = kMaxHeadDimension * sizeof(float) *
+      wavefronts_per_block; // 4 * threadsPerBlock * sizeof(float) == sizeof(O[b][0][h][:])
   const size_t lds_bytes = max(smem_softmax, smem_output);
   auto stream = at::cuda::getCurrentHIPStream().stream();
 
diff --git a/xformers/csrc/attention/hip_fmha/ck_attention_forward_decoder_splitk.h b/xformers/csrc/attention/hip_fmha/ck_attention_forward_decoder_splitk.h
index 9eed4f001..182876e60 100644
--- a/xformers/csrc/attention/hip_fmha/ck_attention_forward_decoder_splitk.h
+++ b/xformers/csrc/attention/hip_fmha/ck_attention_forward_decoder_splitk.h
@@ -152,11 +152,11 @@ __global__ void efficient_attention_forward_decoder_splitk_reduce_ck_kernel(
 
 template <
     typename scalar_t,
-    int32_t vec_size = 4,
-    int32_t n_loop_unroll = 16,
-    int32_t n_loop_unroll_tail = 2,
-    int32_t KV_M_MAX = 8192,
-    typename compute_t = float>
+    int32_t vec_size,
+    int32_t n_loop_unroll,
+    int32_t n_loop_unroll_tail,
+    int32_t KV_M_MAX,
+    typename compute_t>
 __global__ void efficient_attention_forward_decoder_splitk_ck_kernel(
     const scalar_t* __restrict__ XQ,
     const scalar_t* __restrict__ cache_K,
@@ -451,7 +451,12 @@ __global__ void efficient_attention_forward_decoder_splitk_ck_kernel(
 namespace ck {
 namespace tensor_operation {
 namespace device {
-template <typename scalar_t, typename compute_t = float>
+template <
+    typename scalar_t,
+    int32_t KV_M_MAX,
+    int32_t n_loop_unroll,
+    int32_t n_loop_unroll_tail,
+    typename compute_t>
 struct FMHADecoderSplitKDeviceOp : public BaseOperator {
   using DeviceOp = FMHADecoderSplitKDeviceOp;
   struct Argument : public BaseArgument {
@@ -611,16 +616,28 @@ struct FMHADecoderSplitKDeviceOp : public BaseOperator {
           Q_size_k_alignment_necessary == 4
               ? efficient_attention_forward_decoder_splitk_ck_kernel<
                     scalar_t,
-                    4>
+                    /* vec_size */ 4,
+                    n_loop_unroll,
+                    n_loop_unroll_tail,
+                    KV_M_MAX,
+                    compute_t>
               : Q_size_k_alignment_necessary == 2
-              ? efficient_attention_forward_decoder_splitk_ck_kernel<
-                    scalar_t,
-                    2>
-              : Q_size_k_alignment_necessary == 1
-              ? efficient_attention_forward_decoder_splitk_ck_kernel<
-                    scalar_t,
-                    1>
-              : nullptr,
+                  ? efficient_attention_forward_decoder_splitk_ck_kernel<
+                        scalar_t,
+                        /* vec_size */ 2,
+                        n_loop_unroll,
+                        n_loop_unroll_tail,
+                        KV_M_MAX,
+                        compute_t>
+                  : Q_size_k_alignment_necessary == 1
+                      ? efficient_attention_forward_decoder_splitk_ck_kernel<
+                            scalar_t,
+                            /* vec_size */ 1,
+                            n_loop_unroll,
+                            n_loop_unroll_tail,
+                            KV_M_MAX,
+                            compute_t>
+                      : nullptr,
           arg.grid_dim,
           arg.block_dim,
           arg.lds_bytes,

From 04eec8d85be9772b904d78f5e66af96ef8b0bf76 Mon Sep 17 00:00:00 2001
From: Max Podkorytov <4273004+tenpercent@users.noreply.github.com>
Date: Wed, 21 Feb 2024 22:18:02 +0000
Subject: [PATCH 482/641] apply clang-format

---
 .../hip_fmha/attention_forward_splitk.cpp     | 48 +++++++++++--------
 .../ck_attention_forward_decoder_splitk.h     | 32 ++++++-------
 2 files changed, 44 insertions(+), 36 deletions(-)

diff --git a/xformers/csrc/attention/hip_fmha/attention_forward_splitk.cpp b/xformers/csrc/attention/hip_fmha/attention_forward_splitk.cpp
index 0e9648453..ea4e3505f 100644
--- a/xformers/csrc/attention/hip_fmha/attention_forward_splitk.cpp
+++ b/xformers/csrc/attention/hip_fmha/attention_forward_splitk.cpp
@@ -91,7 +91,8 @@ at::Tensor& efficient_attention_forward_decoder_splitk_ck_out_impl(
   dim3 blocks(B * H * M * G, split_k);
   dim3 threads(ThreadsPerWavefront, WavefrontsPerBlock);
 
-  int32_t smem_softmax = kMaxKVSequenceLength * sizeof(compute_t) + WavefrontsPerBlock * sizeof(compute_t);
+  int32_t smem_softmax = kMaxKVSequenceLength * sizeof(compute_t) +
+      WavefrontsPerBlock * sizeof(compute_t);
   int32_t smem_output = kMaxHeadDimension * sizeof(compute_t) *
       threads.y; // 4 * threadsPerBlock * sizeof(float) == sizeof(O[b][0][h][:])
   const size_t lds_bytes = max(smem_softmax, smem_output);
@@ -106,7 +107,12 @@ at::Tensor& efficient_attention_forward_decoder_splitk_ck_out_impl(
       [&] {
         using ck_data_t = c10_to_data_t<scalar_t>::type;
         using device_op_t =
-            ck::tensor_operation::device::FMHADecoderSplitKDeviceOp<ck_data_t, kMaxKVSequenceLength, kLoopUnroll, kLoopUnrollTail, compute_t>;
+            ck::tensor_operation::device::FMHADecoderSplitKDeviceOp<
+                ck_data_t,
+                kMaxKVSequenceLength,
+                kLoopUnroll,
+                kLoopUnrollTail,
+                compute_t>;
         auto op = device_op_t{};
 
         auto XQ_acc =
@@ -549,22 +555,22 @@ struct FMHADecoderSplitAttentionDeviceOp : public BaseOperator {
                     kMaxKVSequenceLength,
                     compute_t>
               : Q_size_k_alignment_necessary == 2
-                  ? efficient_attention_forward_decoder_splitk_ck_kernel<
-                        scalar_t,
-                        2,
-                        kLoopUnroll,
-                        kLoopUnrollTail,
-                        kMaxKVSequenceLength,
-                        compute_t>
-                  : Q_size_k_alignment_necessary == 1
-                      ? efficient_attention_forward_decoder_splitk_ck_kernel<
-                            scalar_t,
-                            1,
-                            kLoopUnroll,
-                            kLoopUnrollTail,
-                            kMaxKVSequenceLength,
-                            compute_t>
-                      : nullptr,
+              ? efficient_attention_forward_decoder_splitk_ck_kernel<
+                    scalar_t,
+                    2,
+                    kLoopUnroll,
+                    kLoopUnrollTail,
+                    kMaxKVSequenceLength,
+                    compute_t>
+              : Q_size_k_alignment_necessary == 1
+              ? efficient_attention_forward_decoder_splitk_ck_kernel<
+                    scalar_t,
+                    1,
+                    kLoopUnroll,
+                    kLoopUnrollTail,
+                    kMaxKVSequenceLength,
+                    compute_t>
+              : nullptr,
           arg.grid_dim,
           arg.block_dim,
           arg.lds_bytes,
@@ -783,9 +789,11 @@ static std::tuple<at::Tensor, at::Tensor, at::Tensor> split_attention_hip(
   dim3 blocks(B * H * M * G, split_k);
   dim3 threads(kThreadsPerWavefront, wavefronts_per_block);
 
-  int32_t smem_softmax = kMaxKVSequenceLength * sizeof(float) + threads.y * sizeof(float);
+  int32_t smem_softmax =
+      kMaxKVSequenceLength * sizeof(float) + threads.y * sizeof(float);
   int32_t smem_output = kMaxHeadDimension * sizeof(float) *
-      wavefronts_per_block; // 4 * threadsPerBlock * sizeof(float) == sizeof(O[b][0][h][:])
+      wavefronts_per_block; // 4 * threadsPerBlock * sizeof(float) ==
+                            // sizeof(O[b][0][h][:])
   const size_t lds_bytes = max(smem_softmax, smem_output);
   auto stream = at::cuda::getCurrentHIPStream().stream();
 
diff --git a/xformers/csrc/attention/hip_fmha/ck_attention_forward_decoder_splitk.h b/xformers/csrc/attention/hip_fmha/ck_attention_forward_decoder_splitk.h
index 182876e60..65c27603d 100644
--- a/xformers/csrc/attention/hip_fmha/ck_attention_forward_decoder_splitk.h
+++ b/xformers/csrc/attention/hip_fmha/ck_attention_forward_decoder_splitk.h
@@ -622,22 +622,22 @@ struct FMHADecoderSplitKDeviceOp : public BaseOperator {
                     KV_M_MAX,
                     compute_t>
               : Q_size_k_alignment_necessary == 2
-                  ? efficient_attention_forward_decoder_splitk_ck_kernel<
-                        scalar_t,
-                        /* vec_size */ 2,
-                        n_loop_unroll,
-                        n_loop_unroll_tail,
-                        KV_M_MAX,
-                        compute_t>
-                  : Q_size_k_alignment_necessary == 1
-                      ? efficient_attention_forward_decoder_splitk_ck_kernel<
-                            scalar_t,
-                            /* vec_size */ 1,
-                            n_loop_unroll,
-                            n_loop_unroll_tail,
-                            KV_M_MAX,
-                            compute_t>
-                      : nullptr,
+              ? efficient_attention_forward_decoder_splitk_ck_kernel<
+                    scalar_t,
+                    /* vec_size */ 2,
+                    n_loop_unroll,
+                    n_loop_unroll_tail,
+                    KV_M_MAX,
+                    compute_t>
+              : Q_size_k_alignment_necessary == 1
+              ? efficient_attention_forward_decoder_splitk_ck_kernel<
+                    scalar_t,
+                    /* vec_size */ 1,
+                    n_loop_unroll,
+                    n_loop_unroll_tail,
+                    KV_M_MAX,
+                    compute_t>
+              : nullptr,
           arg.grid_dim,
           arg.block_dim,
           arg.lds_bytes,

From a02ab9b9e5b81d732dac52f334d142247c7f085e Mon Sep 17 00:00:00 2001
From: Qianfeng Zhang <Qianfeng.Zhang@amd.com>
Date: Thu, 22 Feb 2024 14:50:28 +0000
Subject: [PATCH 483/641] Rename HDim/headdim to MaxK/maxk

---
 .../hip_fmha/ck_tiled_fmha_batched_forward.h    | 14 +++++++-------
 .../ck_tiled_fmha_batched_forward_bp16.cpp      |  6 +++---
 .../ck_tiled_fmha_batched_forward_fp16.cpp      |  6 +++---
 .../hip_fmha/ck_tiled_fmha_batched_infer.h      | 14 +++++++-------
 .../ck_tiled_fmha_batched_infer_bp16.cpp        |  6 +++---
 .../ck_tiled_fmha_batched_infer_fp16.cpp        |  6 +++---
 .../hip_fmha/ck_tiled_fmha_definitions.h        |  4 ++--
 .../hip_fmha/ck_tiled_fmha_grouped_forward.h    | 17 ++++++++---------
 .../ck_tiled_fmha_grouped_forward_bp16.cpp      |  6 +++---
 .../ck_tiled_fmha_grouped_forward_fp16.cpp      |  6 +++---
 .../hip_fmha/ck_tiled_fmha_grouped_infer.h      | 14 +++++++-------
 .../ck_tiled_fmha_grouped_infer_bp16.cpp        |  6 +++---
 .../ck_tiled_fmha_grouped_infer_fp16.cpp        |  6 +++---
 ...bp16_no_causalmask_no_attnbias_maxk_128.cpp} |  0
 ...bp16_no_causalmask_no_attnbias_maxk_256.cpp} |  0
 ..._bp16_no_causalmask_no_attnbias_maxk_32.cpp} |  0
 ..._bp16_no_causalmask_no_attnbias_maxk_64.cpp} |  0
 ...16_no_causalmask_with_attnbias_maxk_128.cpp} |  0
 ...16_no_causalmask_with_attnbias_maxk_256.cpp} |  0
 ...p16_no_causalmask_with_attnbias_maxk_32.cpp} |  0
 ...p16_no_causalmask_with_attnbias_maxk_64.cpp} |  0
 ...16_with_causalmask_no_attnbias_maxk_128.cpp} |  0
 ...16_with_causalmask_no_attnbias_maxk_256.cpp} |  0
 ...p16_with_causalmask_no_attnbias_maxk_32.cpp} |  0
 ...p16_with_causalmask_no_attnbias_maxk_64.cpp} |  0
 ..._with_causalmask_with_attnbias_maxk_128.cpp} |  0
 ..._with_causalmask_with_attnbias_maxk_256.cpp} |  0
 ...6_with_causalmask_with_attnbias_maxk_32.cpp} |  0
 ...6_with_causalmask_with_attnbias_maxk_64.cpp} |  0
 ...fp16_no_causalmask_no_attnbias_maxk_128.cpp} |  0
 ...fp16_no_causalmask_no_attnbias_maxk_256.cpp} |  0
 ..._fp16_no_causalmask_no_attnbias_maxk_32.cpp} |  0
 ..._fp16_no_causalmask_no_attnbias_maxk_64.cpp} |  0
 ...16_no_causalmask_with_attnbias_maxk_128.cpp} |  0
 ...16_no_causalmask_with_attnbias_maxk_256.cpp} |  0
 ...p16_no_causalmask_with_attnbias_maxk_32.cpp} |  0
 ...p16_no_causalmask_with_attnbias_maxk_64.cpp} |  0
 ...16_with_causalmask_no_attnbias_maxk_128.cpp} |  0
 ...16_with_causalmask_no_attnbias_maxk_256.cpp} |  0
 ...p16_with_causalmask_no_attnbias_maxk_32.cpp} |  0
 ...p16_with_causalmask_no_attnbias_maxk_64.cpp} |  0
 ..._with_causalmask_with_attnbias_maxk_128.cpp} |  0
 ..._with_causalmask_with_attnbias_maxk_256.cpp} |  0
 ...6_with_causalmask_with_attnbias_maxk_32.cpp} |  0
 ...6_with_causalmask_with_attnbias_maxk_64.cpp} |  0
 ...bp16_no_causalmask_no_attnbias_maxk_128.cpp} |  0
 ...bp16_no_causalmask_no_attnbias_maxk_256.cpp} |  0
 ..._bp16_no_causalmask_no_attnbias_maxk_32.cpp} |  0
 ..._bp16_no_causalmask_no_attnbias_maxk_64.cpp} |  0
 ...16_no_causalmask_with_attnbias_maxk_128.cpp} |  0
 ...16_no_causalmask_with_attnbias_maxk_256.cpp} |  0
 ...p16_no_causalmask_with_attnbias_maxk_32.cpp} |  0
 ...p16_no_causalmask_with_attnbias_maxk_64.cpp} |  0
 ...16_with_causalmask_no_attnbias_maxk_128.cpp} |  0
 ...16_with_causalmask_no_attnbias_maxk_256.cpp} |  0
 ...p16_with_causalmask_no_attnbias_maxk_32.cpp} |  0
 ...p16_with_causalmask_no_attnbias_maxk_64.cpp} |  0
 ..._with_causalmask_with_attnbias_maxk_128.cpp} |  0
 ..._with_causalmask_with_attnbias_maxk_256.cpp} |  0
 ...6_with_causalmask_with_attnbias_maxk_32.cpp} |  0
 ...6_with_causalmask_with_attnbias_maxk_64.cpp} |  0
 ...fp16_no_causalmask_no_attnbias_maxk_128.cpp} |  0
 ...fp16_no_causalmask_no_attnbias_maxk_256.cpp} |  0
 ..._fp16_no_causalmask_no_attnbias_maxk_32.cpp} |  0
 ..._fp16_no_causalmask_no_attnbias_maxk_64.cpp} |  0
 ...16_no_causalmask_with_attnbias_maxk_128.cpp} |  0
 ...16_no_causalmask_with_attnbias_maxk_256.cpp} |  0
 ...p16_no_causalmask_with_attnbias_maxk_32.cpp} |  0
 ...p16_no_causalmask_with_attnbias_maxk_64.cpp} |  0
 ...16_with_causalmask_no_attnbias_maxk_128.cpp} |  0
 ...16_with_causalmask_no_attnbias_maxk_256.cpp} |  0
 ...p16_with_causalmask_no_attnbias_maxk_32.cpp} |  0
 ...p16_with_causalmask_no_attnbias_maxk_64.cpp} |  0
 ..._with_causalmask_with_attnbias_maxk_128.cpp} |  0
 ..._with_causalmask_with_attnbias_maxk_256.cpp} |  0
 ...6_with_causalmask_with_attnbias_maxk_32.cpp} |  0
 ...6_with_causalmask_with_attnbias_maxk_64.cpp} |  0
 ...bp16_no_causalmask_no_attnbias_maxk_128.cpp} |  0
 ...bp16_no_causalmask_no_attnbias_maxk_256.cpp} |  0
 ..._bp16_no_causalmask_no_attnbias_maxk_32.cpp} |  0
 ..._bp16_no_causalmask_no_attnbias_maxk_64.cpp} |  0
 ...16_no_causalmask_with_attnbias_maxk_128.cpp} |  0
 ...16_no_causalmask_with_attnbias_maxk_256.cpp} |  0
 ...p16_no_causalmask_with_attnbias_maxk_32.cpp} |  0
 ...p16_no_causalmask_with_attnbias_maxk_64.cpp} |  0
 ...16_with_causalmask_no_attnbias_maxk_128.cpp} |  0
 ...16_with_causalmask_no_attnbias_maxk_256.cpp} |  0
 ...p16_with_causalmask_no_attnbias_maxk_32.cpp} |  0
 ...p16_with_causalmask_no_attnbias_maxk_64.cpp} |  0
 ..._with_causalmask_with_attnbias_maxk_128.cpp} |  0
 ..._with_causalmask_with_attnbias_maxk_256.cpp} |  0
 ...6_with_causalmask_with_attnbias_maxk_32.cpp} |  0
 ...6_with_causalmask_with_attnbias_maxk_64.cpp} |  0
 ...fp16_no_causalmask_no_attnbias_maxk_128.cpp} |  0
 ...fp16_no_causalmask_no_attnbias_maxk_256.cpp} |  0
 ..._fp16_no_causalmask_no_attnbias_maxk_32.cpp} |  0
 ..._fp16_no_causalmask_no_attnbias_maxk_64.cpp} |  0
 ...16_no_causalmask_with_attnbias_maxk_128.cpp} |  0
 ...16_no_causalmask_with_attnbias_maxk_256.cpp} |  0
 ...p16_no_causalmask_with_attnbias_maxk_32.cpp} |  0
 ...p16_no_causalmask_with_attnbias_maxk_64.cpp} |  0
 ...16_with_causalmask_no_attnbias_maxk_128.cpp} |  0
 ...16_with_causalmask_no_attnbias_maxk_256.cpp} |  0
 ...p16_with_causalmask_no_attnbias_maxk_32.cpp} |  0
 ...p16_with_causalmask_no_attnbias_maxk_64.cpp} |  0
 ..._with_causalmask_with_attnbias_maxk_128.cpp} |  0
 ..._with_causalmask_with_attnbias_maxk_256.cpp} |  0
 ...6_with_causalmask_with_attnbias_maxk_32.cpp} |  0
 ...6_with_causalmask_with_attnbias_maxk_64.cpp} |  0
 ...bp16_no_causalmask_no_attnbias_maxk_128.cpp} |  0
 ...bp16_no_causalmask_no_attnbias_maxk_256.cpp} |  0
 ..._bp16_no_causalmask_no_attnbias_maxk_32.cpp} |  0
 ..._bp16_no_causalmask_no_attnbias_maxk_64.cpp} |  0
 ...16_no_causalmask_with_attnbias_maxk_128.cpp} |  0
 ...16_no_causalmask_with_attnbias_maxk_256.cpp} |  0
 ...p16_no_causalmask_with_attnbias_maxk_32.cpp} |  0
 ...p16_no_causalmask_with_attnbias_maxk_64.cpp} |  0
 ...16_with_causalmask_no_attnbias_maxk_128.cpp} |  0
 ...16_with_causalmask_no_attnbias_maxk_256.cpp} |  0
 ...p16_with_causalmask_no_attnbias_maxk_32.cpp} |  0
 ...p16_with_causalmask_no_attnbias_maxk_64.cpp} |  0
 ..._with_causalmask_with_attnbias_maxk_128.cpp} |  0
 ..._with_causalmask_with_attnbias_maxk_256.cpp} |  0
 ...6_with_causalmask_with_attnbias_maxk_32.cpp} |  0
 ...6_with_causalmask_with_attnbias_maxk_64.cpp} |  0
 ...fp16_no_causalmask_no_attnbias_maxk_128.cpp} |  0
 ...fp16_no_causalmask_no_attnbias_maxk_256.cpp} |  0
 ..._fp16_no_causalmask_no_attnbias_maxk_32.cpp} |  0
 ..._fp16_no_causalmask_no_attnbias_maxk_64.cpp} |  0
 ...16_no_causalmask_with_attnbias_maxk_128.cpp} |  0
 ...16_no_causalmask_with_attnbias_maxk_256.cpp} |  0
 ...p16_no_causalmask_with_attnbias_maxk_32.cpp} |  0
 ...p16_no_causalmask_with_attnbias_maxk_64.cpp} |  0
 ...16_with_causalmask_no_attnbias_maxk_128.cpp} |  0
 ...16_with_causalmask_no_attnbias_maxk_256.cpp} |  0
 ...p16_with_causalmask_no_attnbias_maxk_32.cpp} |  0
 ...p16_with_causalmask_no_attnbias_maxk_64.cpp} |  0
 ..._with_causalmask_with_attnbias_maxk_128.cpp} |  0
 ..._with_causalmask_with_attnbias_maxk_256.cpp} |  0
 ...6_with_causalmask_with_attnbias_maxk_32.cpp} |  0
 ...6_with_causalmask_with_attnbias_maxk_64.cpp} |  0
 141 files changed, 55 insertions(+), 56 deletions(-)
 rename xformers/csrc/attention/hip_fmha/instances/{ck_tiled_fmha_batched_forward_bp16_no_causalmask_no_attnbias_headdim_128.cpp => ck_tiled_fmha_batched_forward_bp16_no_causalmask_no_attnbias_maxk_128.cpp} (100%)
 rename xformers/csrc/attention/hip_fmha/instances/{ck_tiled_fmha_batched_forward_bp16_no_causalmask_no_attnbias_headdim_256.cpp => ck_tiled_fmha_batched_forward_bp16_no_causalmask_no_attnbias_maxk_256.cpp} (100%)
 rename xformers/csrc/attention/hip_fmha/instances/{ck_tiled_fmha_batched_forward_bp16_no_causalmask_no_attnbias_headdim_32.cpp => ck_tiled_fmha_batched_forward_bp16_no_causalmask_no_attnbias_maxk_32.cpp} (100%)
 rename xformers/csrc/attention/hip_fmha/instances/{ck_tiled_fmha_batched_forward_bp16_no_causalmask_no_attnbias_headdim_64.cpp => ck_tiled_fmha_batched_forward_bp16_no_causalmask_no_attnbias_maxk_64.cpp} (100%)
 rename xformers/csrc/attention/hip_fmha/instances/{ck_tiled_fmha_batched_forward_bp16_no_causalmask_with_attnbias_headdim_128.cpp => ck_tiled_fmha_batched_forward_bp16_no_causalmask_with_attnbias_maxk_128.cpp} (100%)
 rename xformers/csrc/attention/hip_fmha/instances/{ck_tiled_fmha_batched_forward_bp16_no_causalmask_with_attnbias_headdim_256.cpp => ck_tiled_fmha_batched_forward_bp16_no_causalmask_with_attnbias_maxk_256.cpp} (100%)
 rename xformers/csrc/attention/hip_fmha/instances/{ck_tiled_fmha_batched_forward_bp16_no_causalmask_with_attnbias_headdim_32.cpp => ck_tiled_fmha_batched_forward_bp16_no_causalmask_with_attnbias_maxk_32.cpp} (100%)
 rename xformers/csrc/attention/hip_fmha/instances/{ck_tiled_fmha_batched_forward_bp16_no_causalmask_with_attnbias_headdim_64.cpp => ck_tiled_fmha_batched_forward_bp16_no_causalmask_with_attnbias_maxk_64.cpp} (100%)
 rename xformers/csrc/attention/hip_fmha/instances/{ck_tiled_fmha_batched_forward_bp16_with_causalmask_no_attnbias_headdim_128.cpp => ck_tiled_fmha_batched_forward_bp16_with_causalmask_no_attnbias_maxk_128.cpp} (100%)
 rename xformers/csrc/attention/hip_fmha/instances/{ck_tiled_fmha_batched_forward_bp16_with_causalmask_no_attnbias_headdim_256.cpp => ck_tiled_fmha_batched_forward_bp16_with_causalmask_no_attnbias_maxk_256.cpp} (100%)
 rename xformers/csrc/attention/hip_fmha/instances/{ck_tiled_fmha_batched_forward_bp16_with_causalmask_no_attnbias_headdim_32.cpp => ck_tiled_fmha_batched_forward_bp16_with_causalmask_no_attnbias_maxk_32.cpp} (100%)
 rename xformers/csrc/attention/hip_fmha/instances/{ck_tiled_fmha_batched_forward_bp16_with_causalmask_no_attnbias_headdim_64.cpp => ck_tiled_fmha_batched_forward_bp16_with_causalmask_no_attnbias_maxk_64.cpp} (100%)
 rename xformers/csrc/attention/hip_fmha/instances/{ck_tiled_fmha_batched_forward_bp16_with_causalmask_with_attnbias_headdim_128.cpp => ck_tiled_fmha_batched_forward_bp16_with_causalmask_with_attnbias_maxk_128.cpp} (100%)
 rename xformers/csrc/attention/hip_fmha/instances/{ck_tiled_fmha_batched_forward_bp16_with_causalmask_with_attnbias_headdim_256.cpp => ck_tiled_fmha_batched_forward_bp16_with_causalmask_with_attnbias_maxk_256.cpp} (100%)
 rename xformers/csrc/attention/hip_fmha/instances/{ck_tiled_fmha_batched_forward_bp16_with_causalmask_with_attnbias_headdim_32.cpp => ck_tiled_fmha_batched_forward_bp16_with_causalmask_with_attnbias_maxk_32.cpp} (100%)
 rename xformers/csrc/attention/hip_fmha/instances/{ck_tiled_fmha_batched_forward_bp16_with_causalmask_with_attnbias_headdim_64.cpp => ck_tiled_fmha_batched_forward_bp16_with_causalmask_with_attnbias_maxk_64.cpp} (100%)
 rename xformers/csrc/attention/hip_fmha/instances/{ck_tiled_fmha_batched_forward_fp16_no_causalmask_no_attnbias_headdim_128.cpp => ck_tiled_fmha_batched_forward_fp16_no_causalmask_no_attnbias_maxk_128.cpp} (100%)
 rename xformers/csrc/attention/hip_fmha/instances/{ck_tiled_fmha_batched_forward_fp16_no_causalmask_no_attnbias_headdim_256.cpp => ck_tiled_fmha_batched_forward_fp16_no_causalmask_no_attnbias_maxk_256.cpp} (100%)
 rename xformers/csrc/attention/hip_fmha/instances/{ck_tiled_fmha_batched_forward_fp16_no_causalmask_no_attnbias_headdim_32.cpp => ck_tiled_fmha_batched_forward_fp16_no_causalmask_no_attnbias_maxk_32.cpp} (100%)
 rename xformers/csrc/attention/hip_fmha/instances/{ck_tiled_fmha_batched_forward_fp16_no_causalmask_no_attnbias_headdim_64.cpp => ck_tiled_fmha_batched_forward_fp16_no_causalmask_no_attnbias_maxk_64.cpp} (100%)
 rename xformers/csrc/attention/hip_fmha/instances/{ck_tiled_fmha_batched_forward_fp16_no_causalmask_with_attnbias_headdim_128.cpp => ck_tiled_fmha_batched_forward_fp16_no_causalmask_with_attnbias_maxk_128.cpp} (100%)
 rename xformers/csrc/attention/hip_fmha/instances/{ck_tiled_fmha_batched_forward_fp16_no_causalmask_with_attnbias_headdim_256.cpp => ck_tiled_fmha_batched_forward_fp16_no_causalmask_with_attnbias_maxk_256.cpp} (100%)
 rename xformers/csrc/attention/hip_fmha/instances/{ck_tiled_fmha_batched_forward_fp16_no_causalmask_with_attnbias_headdim_32.cpp => ck_tiled_fmha_batched_forward_fp16_no_causalmask_with_attnbias_maxk_32.cpp} (100%)
 rename xformers/csrc/attention/hip_fmha/instances/{ck_tiled_fmha_batched_forward_fp16_no_causalmask_with_attnbias_headdim_64.cpp => ck_tiled_fmha_batched_forward_fp16_no_causalmask_with_attnbias_maxk_64.cpp} (100%)
 rename xformers/csrc/attention/hip_fmha/instances/{ck_tiled_fmha_batched_forward_fp16_with_causalmask_no_attnbias_headdim_128.cpp => ck_tiled_fmha_batched_forward_fp16_with_causalmask_no_attnbias_maxk_128.cpp} (100%)
 rename xformers/csrc/attention/hip_fmha/instances/{ck_tiled_fmha_batched_forward_fp16_with_causalmask_no_attnbias_headdim_256.cpp => ck_tiled_fmha_batched_forward_fp16_with_causalmask_no_attnbias_maxk_256.cpp} (100%)
 rename xformers/csrc/attention/hip_fmha/instances/{ck_tiled_fmha_batched_forward_fp16_with_causalmask_no_attnbias_headdim_32.cpp => ck_tiled_fmha_batched_forward_fp16_with_causalmask_no_attnbias_maxk_32.cpp} (100%)
 rename xformers/csrc/attention/hip_fmha/instances/{ck_tiled_fmha_batched_forward_fp16_with_causalmask_no_attnbias_headdim_64.cpp => ck_tiled_fmha_batched_forward_fp16_with_causalmask_no_attnbias_maxk_64.cpp} (100%)
 rename xformers/csrc/attention/hip_fmha/instances/{ck_tiled_fmha_batched_forward_fp16_with_causalmask_with_attnbias_headdim_128.cpp => ck_tiled_fmha_batched_forward_fp16_with_causalmask_with_attnbias_maxk_128.cpp} (100%)
 rename xformers/csrc/attention/hip_fmha/instances/{ck_tiled_fmha_batched_forward_fp16_with_causalmask_with_attnbias_headdim_256.cpp => ck_tiled_fmha_batched_forward_fp16_with_causalmask_with_attnbias_maxk_256.cpp} (100%)
 rename xformers/csrc/attention/hip_fmha/instances/{ck_tiled_fmha_batched_forward_fp16_with_causalmask_with_attnbias_headdim_32.cpp => ck_tiled_fmha_batched_forward_fp16_with_causalmask_with_attnbias_maxk_32.cpp} (100%)
 rename xformers/csrc/attention/hip_fmha/instances/{ck_tiled_fmha_batched_forward_fp16_with_causalmask_with_attnbias_headdim_64.cpp => ck_tiled_fmha_batched_forward_fp16_with_causalmask_with_attnbias_maxk_64.cpp} (100%)
 rename xformers/csrc/attention/hip_fmha/instances/{ck_tiled_fmha_batched_infer_bp16_no_causalmask_no_attnbias_headdim_128.cpp => ck_tiled_fmha_batched_infer_bp16_no_causalmask_no_attnbias_maxk_128.cpp} (100%)
 rename xformers/csrc/attention/hip_fmha/instances/{ck_tiled_fmha_batched_infer_bp16_no_causalmask_no_attnbias_headdim_256.cpp => ck_tiled_fmha_batched_infer_bp16_no_causalmask_no_attnbias_maxk_256.cpp} (100%)
 rename xformers/csrc/attention/hip_fmha/instances/{ck_tiled_fmha_batched_infer_bp16_no_causalmask_no_attnbias_headdim_32.cpp => ck_tiled_fmha_batched_infer_bp16_no_causalmask_no_attnbias_maxk_32.cpp} (100%)
 rename xformers/csrc/attention/hip_fmha/instances/{ck_tiled_fmha_batched_infer_bp16_no_causalmask_no_attnbias_headdim_64.cpp => ck_tiled_fmha_batched_infer_bp16_no_causalmask_no_attnbias_maxk_64.cpp} (100%)
 rename xformers/csrc/attention/hip_fmha/instances/{ck_tiled_fmha_batched_infer_bp16_no_causalmask_with_attnbias_headdim_128.cpp => ck_tiled_fmha_batched_infer_bp16_no_causalmask_with_attnbias_maxk_128.cpp} (100%)
 rename xformers/csrc/attention/hip_fmha/instances/{ck_tiled_fmha_batched_infer_bp16_no_causalmask_with_attnbias_headdim_256.cpp => ck_tiled_fmha_batched_infer_bp16_no_causalmask_with_attnbias_maxk_256.cpp} (100%)
 rename xformers/csrc/attention/hip_fmha/instances/{ck_tiled_fmha_batched_infer_bp16_no_causalmask_with_attnbias_headdim_32.cpp => ck_tiled_fmha_batched_infer_bp16_no_causalmask_with_attnbias_maxk_32.cpp} (100%)
 rename xformers/csrc/attention/hip_fmha/instances/{ck_tiled_fmha_batched_infer_bp16_no_causalmask_with_attnbias_headdim_64.cpp => ck_tiled_fmha_batched_infer_bp16_no_causalmask_with_attnbias_maxk_64.cpp} (100%)
 rename xformers/csrc/attention/hip_fmha/instances/{ck_tiled_fmha_batched_infer_bp16_with_causalmask_no_attnbias_headdim_128.cpp => ck_tiled_fmha_batched_infer_bp16_with_causalmask_no_attnbias_maxk_128.cpp} (100%)
 rename xformers/csrc/attention/hip_fmha/instances/{ck_tiled_fmha_batched_infer_bp16_with_causalmask_no_attnbias_headdim_256.cpp => ck_tiled_fmha_batched_infer_bp16_with_causalmask_no_attnbias_maxk_256.cpp} (100%)
 rename xformers/csrc/attention/hip_fmha/instances/{ck_tiled_fmha_batched_infer_bp16_with_causalmask_no_attnbias_headdim_32.cpp => ck_tiled_fmha_batched_infer_bp16_with_causalmask_no_attnbias_maxk_32.cpp} (100%)
 rename xformers/csrc/attention/hip_fmha/instances/{ck_tiled_fmha_batched_infer_bp16_with_causalmask_no_attnbias_headdim_64.cpp => ck_tiled_fmha_batched_infer_bp16_with_causalmask_no_attnbias_maxk_64.cpp} (100%)
 rename xformers/csrc/attention/hip_fmha/instances/{ck_tiled_fmha_batched_infer_bp16_with_causalmask_with_attnbias_headdim_128.cpp => ck_tiled_fmha_batched_infer_bp16_with_causalmask_with_attnbias_maxk_128.cpp} (100%)
 rename xformers/csrc/attention/hip_fmha/instances/{ck_tiled_fmha_batched_infer_bp16_with_causalmask_with_attnbias_headdim_256.cpp => ck_tiled_fmha_batched_infer_bp16_with_causalmask_with_attnbias_maxk_256.cpp} (100%)
 rename xformers/csrc/attention/hip_fmha/instances/{ck_tiled_fmha_batched_infer_bp16_with_causalmask_with_attnbias_headdim_32.cpp => ck_tiled_fmha_batched_infer_bp16_with_causalmask_with_attnbias_maxk_32.cpp} (100%)
 rename xformers/csrc/attention/hip_fmha/instances/{ck_tiled_fmha_batched_infer_bp16_with_causalmask_with_attnbias_headdim_64.cpp => ck_tiled_fmha_batched_infer_bp16_with_causalmask_with_attnbias_maxk_64.cpp} (100%)
 rename xformers/csrc/attention/hip_fmha/instances/{ck_tiled_fmha_batched_infer_fp16_no_causalmask_no_attnbias_headdim_128.cpp => ck_tiled_fmha_batched_infer_fp16_no_causalmask_no_attnbias_maxk_128.cpp} (100%)
 rename xformers/csrc/attention/hip_fmha/instances/{ck_tiled_fmha_batched_infer_fp16_no_causalmask_no_attnbias_headdim_256.cpp => ck_tiled_fmha_batched_infer_fp16_no_causalmask_no_attnbias_maxk_256.cpp} (100%)
 rename xformers/csrc/attention/hip_fmha/instances/{ck_tiled_fmha_batched_infer_fp16_no_causalmask_no_attnbias_headdim_32.cpp => ck_tiled_fmha_batched_infer_fp16_no_causalmask_no_attnbias_maxk_32.cpp} (100%)
 rename xformers/csrc/attention/hip_fmha/instances/{ck_tiled_fmha_batched_infer_fp16_no_causalmask_no_attnbias_headdim_64.cpp => ck_tiled_fmha_batched_infer_fp16_no_causalmask_no_attnbias_maxk_64.cpp} (100%)
 rename xformers/csrc/attention/hip_fmha/instances/{ck_tiled_fmha_batched_infer_fp16_no_causalmask_with_attnbias_headdim_128.cpp => ck_tiled_fmha_batched_infer_fp16_no_causalmask_with_attnbias_maxk_128.cpp} (100%)
 rename xformers/csrc/attention/hip_fmha/instances/{ck_tiled_fmha_batched_infer_fp16_no_causalmask_with_attnbias_headdim_256.cpp => ck_tiled_fmha_batched_infer_fp16_no_causalmask_with_attnbias_maxk_256.cpp} (100%)
 rename xformers/csrc/attention/hip_fmha/instances/{ck_tiled_fmha_batched_infer_fp16_no_causalmask_with_attnbias_headdim_32.cpp => ck_tiled_fmha_batched_infer_fp16_no_causalmask_with_attnbias_maxk_32.cpp} (100%)
 rename xformers/csrc/attention/hip_fmha/instances/{ck_tiled_fmha_batched_infer_fp16_no_causalmask_with_attnbias_headdim_64.cpp => ck_tiled_fmha_batched_infer_fp16_no_causalmask_with_attnbias_maxk_64.cpp} (100%)
 rename xformers/csrc/attention/hip_fmha/instances/{ck_tiled_fmha_batched_infer_fp16_with_causalmask_no_attnbias_headdim_128.cpp => ck_tiled_fmha_batched_infer_fp16_with_causalmask_no_attnbias_maxk_128.cpp} (100%)
 rename xformers/csrc/attention/hip_fmha/instances/{ck_tiled_fmha_batched_infer_fp16_with_causalmask_no_attnbias_headdim_256.cpp => ck_tiled_fmha_batched_infer_fp16_with_causalmask_no_attnbias_maxk_256.cpp} (100%)
 rename xformers/csrc/attention/hip_fmha/instances/{ck_tiled_fmha_batched_infer_fp16_with_causalmask_no_attnbias_headdim_32.cpp => ck_tiled_fmha_batched_infer_fp16_with_causalmask_no_attnbias_maxk_32.cpp} (100%)
 rename xformers/csrc/attention/hip_fmha/instances/{ck_tiled_fmha_batched_infer_fp16_with_causalmask_no_attnbias_headdim_64.cpp => ck_tiled_fmha_batched_infer_fp16_with_causalmask_no_attnbias_maxk_64.cpp} (100%)
 rename xformers/csrc/attention/hip_fmha/instances/{ck_tiled_fmha_batched_infer_fp16_with_causalmask_with_attnbias_headdim_128.cpp => ck_tiled_fmha_batched_infer_fp16_with_causalmask_with_attnbias_maxk_128.cpp} (100%)
 rename xformers/csrc/attention/hip_fmha/instances/{ck_tiled_fmha_batched_infer_fp16_with_causalmask_with_attnbias_headdim_256.cpp => ck_tiled_fmha_batched_infer_fp16_with_causalmask_with_attnbias_maxk_256.cpp} (100%)
 rename xformers/csrc/attention/hip_fmha/instances/{ck_tiled_fmha_batched_infer_fp16_with_causalmask_with_attnbias_headdim_32.cpp => ck_tiled_fmha_batched_infer_fp16_with_causalmask_with_attnbias_maxk_32.cpp} (100%)
 rename xformers/csrc/attention/hip_fmha/instances/{ck_tiled_fmha_batched_infer_fp16_with_causalmask_with_attnbias_headdim_64.cpp => ck_tiled_fmha_batched_infer_fp16_with_causalmask_with_attnbias_maxk_64.cpp} (100%)
 rename xformers/csrc/attention/hip_fmha/instances/{ck_tiled_fmha_grouped_forward_bp16_no_causalmask_no_attnbias_headdim_128.cpp => ck_tiled_fmha_grouped_forward_bp16_no_causalmask_no_attnbias_maxk_128.cpp} (100%)
 rename xformers/csrc/attention/hip_fmha/instances/{ck_tiled_fmha_grouped_forward_bp16_no_causalmask_no_attnbias_headdim_256.cpp => ck_tiled_fmha_grouped_forward_bp16_no_causalmask_no_attnbias_maxk_256.cpp} (100%)
 rename xformers/csrc/attention/hip_fmha/instances/{ck_tiled_fmha_grouped_forward_bp16_no_causalmask_no_attnbias_headdim_32.cpp => ck_tiled_fmha_grouped_forward_bp16_no_causalmask_no_attnbias_maxk_32.cpp} (100%)
 rename xformers/csrc/attention/hip_fmha/instances/{ck_tiled_fmha_grouped_forward_bp16_no_causalmask_no_attnbias_headdim_64.cpp => ck_tiled_fmha_grouped_forward_bp16_no_causalmask_no_attnbias_maxk_64.cpp} (100%)
 rename xformers/csrc/attention/hip_fmha/instances/{ck_tiled_fmha_grouped_forward_bp16_no_causalmask_with_attnbias_headdim_128.cpp => ck_tiled_fmha_grouped_forward_bp16_no_causalmask_with_attnbias_maxk_128.cpp} (100%)
 rename xformers/csrc/attention/hip_fmha/instances/{ck_tiled_fmha_grouped_forward_bp16_no_causalmask_with_attnbias_headdim_256.cpp => ck_tiled_fmha_grouped_forward_bp16_no_causalmask_with_attnbias_maxk_256.cpp} (100%)
 rename xformers/csrc/attention/hip_fmha/instances/{ck_tiled_fmha_grouped_forward_bp16_no_causalmask_with_attnbias_headdim_32.cpp => ck_tiled_fmha_grouped_forward_bp16_no_causalmask_with_attnbias_maxk_32.cpp} (100%)
 rename xformers/csrc/attention/hip_fmha/instances/{ck_tiled_fmha_grouped_forward_bp16_no_causalmask_with_attnbias_headdim_64.cpp => ck_tiled_fmha_grouped_forward_bp16_no_causalmask_with_attnbias_maxk_64.cpp} (100%)
 rename xformers/csrc/attention/hip_fmha/instances/{ck_tiled_fmha_grouped_forward_bp16_with_causalmask_no_attnbias_headdim_128.cpp => ck_tiled_fmha_grouped_forward_bp16_with_causalmask_no_attnbias_maxk_128.cpp} (100%)
 rename xformers/csrc/attention/hip_fmha/instances/{ck_tiled_fmha_grouped_forward_bp16_with_causalmask_no_attnbias_headdim_256.cpp => ck_tiled_fmha_grouped_forward_bp16_with_causalmask_no_attnbias_maxk_256.cpp} (100%)
 rename xformers/csrc/attention/hip_fmha/instances/{ck_tiled_fmha_grouped_forward_bp16_with_causalmask_no_attnbias_headdim_32.cpp => ck_tiled_fmha_grouped_forward_bp16_with_causalmask_no_attnbias_maxk_32.cpp} (100%)
 rename xformers/csrc/attention/hip_fmha/instances/{ck_tiled_fmha_grouped_forward_bp16_with_causalmask_no_attnbias_headdim_64.cpp => ck_tiled_fmha_grouped_forward_bp16_with_causalmask_no_attnbias_maxk_64.cpp} (100%)
 rename xformers/csrc/attention/hip_fmha/instances/{ck_tiled_fmha_grouped_forward_bp16_with_causalmask_with_attnbias_headdim_128.cpp => ck_tiled_fmha_grouped_forward_bp16_with_causalmask_with_attnbias_maxk_128.cpp} (100%)
 rename xformers/csrc/attention/hip_fmha/instances/{ck_tiled_fmha_grouped_forward_bp16_with_causalmask_with_attnbias_headdim_256.cpp => ck_tiled_fmha_grouped_forward_bp16_with_causalmask_with_attnbias_maxk_256.cpp} (100%)
 rename xformers/csrc/attention/hip_fmha/instances/{ck_tiled_fmha_grouped_forward_bp16_with_causalmask_with_attnbias_headdim_32.cpp => ck_tiled_fmha_grouped_forward_bp16_with_causalmask_with_attnbias_maxk_32.cpp} (100%)
 rename xformers/csrc/attention/hip_fmha/instances/{ck_tiled_fmha_grouped_forward_bp16_with_causalmask_with_attnbias_headdim_64.cpp => ck_tiled_fmha_grouped_forward_bp16_with_causalmask_with_attnbias_maxk_64.cpp} (100%)
 rename xformers/csrc/attention/hip_fmha/instances/{ck_tiled_fmha_grouped_forward_fp16_no_causalmask_no_attnbias_headdim_128.cpp => ck_tiled_fmha_grouped_forward_fp16_no_causalmask_no_attnbias_maxk_128.cpp} (100%)
 rename xformers/csrc/attention/hip_fmha/instances/{ck_tiled_fmha_grouped_forward_fp16_no_causalmask_no_attnbias_headdim_256.cpp => ck_tiled_fmha_grouped_forward_fp16_no_causalmask_no_attnbias_maxk_256.cpp} (100%)
 rename xformers/csrc/attention/hip_fmha/instances/{ck_tiled_fmha_grouped_forward_fp16_no_causalmask_no_attnbias_headdim_32.cpp => ck_tiled_fmha_grouped_forward_fp16_no_causalmask_no_attnbias_maxk_32.cpp} (100%)
 rename xformers/csrc/attention/hip_fmha/instances/{ck_tiled_fmha_grouped_forward_fp16_no_causalmask_no_attnbias_headdim_64.cpp => ck_tiled_fmha_grouped_forward_fp16_no_causalmask_no_attnbias_maxk_64.cpp} (100%)
 rename xformers/csrc/attention/hip_fmha/instances/{ck_tiled_fmha_grouped_forward_fp16_no_causalmask_with_attnbias_headdim_128.cpp => ck_tiled_fmha_grouped_forward_fp16_no_causalmask_with_attnbias_maxk_128.cpp} (100%)
 rename xformers/csrc/attention/hip_fmha/instances/{ck_tiled_fmha_grouped_forward_fp16_no_causalmask_with_attnbias_headdim_256.cpp => ck_tiled_fmha_grouped_forward_fp16_no_causalmask_with_attnbias_maxk_256.cpp} (100%)
 rename xformers/csrc/attention/hip_fmha/instances/{ck_tiled_fmha_grouped_forward_fp16_no_causalmask_with_attnbias_headdim_32.cpp => ck_tiled_fmha_grouped_forward_fp16_no_causalmask_with_attnbias_maxk_32.cpp} (100%)
 rename xformers/csrc/attention/hip_fmha/instances/{ck_tiled_fmha_grouped_forward_fp16_no_causalmask_with_attnbias_headdim_64.cpp => ck_tiled_fmha_grouped_forward_fp16_no_causalmask_with_attnbias_maxk_64.cpp} (100%)
 rename xformers/csrc/attention/hip_fmha/instances/{ck_tiled_fmha_grouped_forward_fp16_with_causalmask_no_attnbias_headdim_128.cpp => ck_tiled_fmha_grouped_forward_fp16_with_causalmask_no_attnbias_maxk_128.cpp} (100%)
 rename xformers/csrc/attention/hip_fmha/instances/{ck_tiled_fmha_grouped_forward_fp16_with_causalmask_no_attnbias_headdim_256.cpp => ck_tiled_fmha_grouped_forward_fp16_with_causalmask_no_attnbias_maxk_256.cpp} (100%)
 rename xformers/csrc/attention/hip_fmha/instances/{ck_tiled_fmha_grouped_forward_fp16_with_causalmask_no_attnbias_headdim_32.cpp => ck_tiled_fmha_grouped_forward_fp16_with_causalmask_no_attnbias_maxk_32.cpp} (100%)
 rename xformers/csrc/attention/hip_fmha/instances/{ck_tiled_fmha_grouped_forward_fp16_with_causalmask_no_attnbias_headdim_64.cpp => ck_tiled_fmha_grouped_forward_fp16_with_causalmask_no_attnbias_maxk_64.cpp} (100%)
 rename xformers/csrc/attention/hip_fmha/instances/{ck_tiled_fmha_grouped_forward_fp16_with_causalmask_with_attnbias_headdim_128.cpp => ck_tiled_fmha_grouped_forward_fp16_with_causalmask_with_attnbias_maxk_128.cpp} (100%)
 rename xformers/csrc/attention/hip_fmha/instances/{ck_tiled_fmha_grouped_forward_fp16_with_causalmask_with_attnbias_headdim_256.cpp => ck_tiled_fmha_grouped_forward_fp16_with_causalmask_with_attnbias_maxk_256.cpp} (100%)
 rename xformers/csrc/attention/hip_fmha/instances/{ck_tiled_fmha_grouped_forward_fp16_with_causalmask_with_attnbias_headdim_32.cpp => ck_tiled_fmha_grouped_forward_fp16_with_causalmask_with_attnbias_maxk_32.cpp} (100%)
 rename xformers/csrc/attention/hip_fmha/instances/{ck_tiled_fmha_grouped_forward_fp16_with_causalmask_with_attnbias_headdim_64.cpp => ck_tiled_fmha_grouped_forward_fp16_with_causalmask_with_attnbias_maxk_64.cpp} (100%)
 rename xformers/csrc/attention/hip_fmha/instances/{ck_tiled_fmha_grouped_infer_bp16_no_causalmask_no_attnbias_headdim_128.cpp => ck_tiled_fmha_grouped_infer_bp16_no_causalmask_no_attnbias_maxk_128.cpp} (100%)
 rename xformers/csrc/attention/hip_fmha/instances/{ck_tiled_fmha_grouped_infer_bp16_no_causalmask_no_attnbias_headdim_256.cpp => ck_tiled_fmha_grouped_infer_bp16_no_causalmask_no_attnbias_maxk_256.cpp} (100%)
 rename xformers/csrc/attention/hip_fmha/instances/{ck_tiled_fmha_grouped_infer_bp16_no_causalmask_no_attnbias_headdim_32.cpp => ck_tiled_fmha_grouped_infer_bp16_no_causalmask_no_attnbias_maxk_32.cpp} (100%)
 rename xformers/csrc/attention/hip_fmha/instances/{ck_tiled_fmha_grouped_infer_bp16_no_causalmask_no_attnbias_headdim_64.cpp => ck_tiled_fmha_grouped_infer_bp16_no_causalmask_no_attnbias_maxk_64.cpp} (100%)
 rename xformers/csrc/attention/hip_fmha/instances/{ck_tiled_fmha_grouped_infer_bp16_no_causalmask_with_attnbias_headdim_128.cpp => ck_tiled_fmha_grouped_infer_bp16_no_causalmask_with_attnbias_maxk_128.cpp} (100%)
 rename xformers/csrc/attention/hip_fmha/instances/{ck_tiled_fmha_grouped_infer_bp16_no_causalmask_with_attnbias_headdim_256.cpp => ck_tiled_fmha_grouped_infer_bp16_no_causalmask_with_attnbias_maxk_256.cpp} (100%)
 rename xformers/csrc/attention/hip_fmha/instances/{ck_tiled_fmha_grouped_infer_bp16_no_causalmask_with_attnbias_headdim_32.cpp => ck_tiled_fmha_grouped_infer_bp16_no_causalmask_with_attnbias_maxk_32.cpp} (100%)
 rename xformers/csrc/attention/hip_fmha/instances/{ck_tiled_fmha_grouped_infer_bp16_no_causalmask_with_attnbias_headdim_64.cpp => ck_tiled_fmha_grouped_infer_bp16_no_causalmask_with_attnbias_maxk_64.cpp} (100%)
 rename xformers/csrc/attention/hip_fmha/instances/{ck_tiled_fmha_grouped_infer_bp16_with_causalmask_no_attnbias_headdim_128.cpp => ck_tiled_fmha_grouped_infer_bp16_with_causalmask_no_attnbias_maxk_128.cpp} (100%)
 rename xformers/csrc/attention/hip_fmha/instances/{ck_tiled_fmha_grouped_infer_bp16_with_causalmask_no_attnbias_headdim_256.cpp => ck_tiled_fmha_grouped_infer_bp16_with_causalmask_no_attnbias_maxk_256.cpp} (100%)
 rename xformers/csrc/attention/hip_fmha/instances/{ck_tiled_fmha_grouped_infer_bp16_with_causalmask_no_attnbias_headdim_32.cpp => ck_tiled_fmha_grouped_infer_bp16_with_causalmask_no_attnbias_maxk_32.cpp} (100%)
 rename xformers/csrc/attention/hip_fmha/instances/{ck_tiled_fmha_grouped_infer_bp16_with_causalmask_no_attnbias_headdim_64.cpp => ck_tiled_fmha_grouped_infer_bp16_with_causalmask_no_attnbias_maxk_64.cpp} (100%)
 rename xformers/csrc/attention/hip_fmha/instances/{ck_tiled_fmha_grouped_infer_bp16_with_causalmask_with_attnbias_headdim_128.cpp => ck_tiled_fmha_grouped_infer_bp16_with_causalmask_with_attnbias_maxk_128.cpp} (100%)
 rename xformers/csrc/attention/hip_fmha/instances/{ck_tiled_fmha_grouped_infer_bp16_with_causalmask_with_attnbias_headdim_256.cpp => ck_tiled_fmha_grouped_infer_bp16_with_causalmask_with_attnbias_maxk_256.cpp} (100%)
 rename xformers/csrc/attention/hip_fmha/instances/{ck_tiled_fmha_grouped_infer_bp16_with_causalmask_with_attnbias_headdim_32.cpp => ck_tiled_fmha_grouped_infer_bp16_with_causalmask_with_attnbias_maxk_32.cpp} (100%)
 rename xformers/csrc/attention/hip_fmha/instances/{ck_tiled_fmha_grouped_infer_bp16_with_causalmask_with_attnbias_headdim_64.cpp => ck_tiled_fmha_grouped_infer_bp16_with_causalmask_with_attnbias_maxk_64.cpp} (100%)
 rename xformers/csrc/attention/hip_fmha/instances/{ck_tiled_fmha_grouped_infer_fp16_no_causalmask_no_attnbias_headdim_128.cpp => ck_tiled_fmha_grouped_infer_fp16_no_causalmask_no_attnbias_maxk_128.cpp} (100%)
 rename xformers/csrc/attention/hip_fmha/instances/{ck_tiled_fmha_grouped_infer_fp16_no_causalmask_no_attnbias_headdim_256.cpp => ck_tiled_fmha_grouped_infer_fp16_no_causalmask_no_attnbias_maxk_256.cpp} (100%)
 rename xformers/csrc/attention/hip_fmha/instances/{ck_tiled_fmha_grouped_infer_fp16_no_causalmask_no_attnbias_headdim_32.cpp => ck_tiled_fmha_grouped_infer_fp16_no_causalmask_no_attnbias_maxk_32.cpp} (100%)
 rename xformers/csrc/attention/hip_fmha/instances/{ck_tiled_fmha_grouped_infer_fp16_no_causalmask_no_attnbias_headdim_64.cpp => ck_tiled_fmha_grouped_infer_fp16_no_causalmask_no_attnbias_maxk_64.cpp} (100%)
 rename xformers/csrc/attention/hip_fmha/instances/{ck_tiled_fmha_grouped_infer_fp16_no_causalmask_with_attnbias_headdim_128.cpp => ck_tiled_fmha_grouped_infer_fp16_no_causalmask_with_attnbias_maxk_128.cpp} (100%)
 rename xformers/csrc/attention/hip_fmha/instances/{ck_tiled_fmha_grouped_infer_fp16_no_causalmask_with_attnbias_headdim_256.cpp => ck_tiled_fmha_grouped_infer_fp16_no_causalmask_with_attnbias_maxk_256.cpp} (100%)
 rename xformers/csrc/attention/hip_fmha/instances/{ck_tiled_fmha_grouped_infer_fp16_no_causalmask_with_attnbias_headdim_32.cpp => ck_tiled_fmha_grouped_infer_fp16_no_causalmask_with_attnbias_maxk_32.cpp} (100%)
 rename xformers/csrc/attention/hip_fmha/instances/{ck_tiled_fmha_grouped_infer_fp16_no_causalmask_with_attnbias_headdim_64.cpp => ck_tiled_fmha_grouped_infer_fp16_no_causalmask_with_attnbias_maxk_64.cpp} (100%)
 rename xformers/csrc/attention/hip_fmha/instances/{ck_tiled_fmha_grouped_infer_fp16_with_causalmask_no_attnbias_headdim_128.cpp => ck_tiled_fmha_grouped_infer_fp16_with_causalmask_no_attnbias_maxk_128.cpp} (100%)
 rename xformers/csrc/attention/hip_fmha/instances/{ck_tiled_fmha_grouped_infer_fp16_with_causalmask_no_attnbias_headdim_256.cpp => ck_tiled_fmha_grouped_infer_fp16_with_causalmask_no_attnbias_maxk_256.cpp} (100%)
 rename xformers/csrc/attention/hip_fmha/instances/{ck_tiled_fmha_grouped_infer_fp16_with_causalmask_no_attnbias_headdim_32.cpp => ck_tiled_fmha_grouped_infer_fp16_with_causalmask_no_attnbias_maxk_32.cpp} (100%)
 rename xformers/csrc/attention/hip_fmha/instances/{ck_tiled_fmha_grouped_infer_fp16_with_causalmask_no_attnbias_headdim_64.cpp => ck_tiled_fmha_grouped_infer_fp16_with_causalmask_no_attnbias_maxk_64.cpp} (100%)
 rename xformers/csrc/attention/hip_fmha/instances/{ck_tiled_fmha_grouped_infer_fp16_with_causalmask_with_attnbias_headdim_128.cpp => ck_tiled_fmha_grouped_infer_fp16_with_causalmask_with_attnbias_maxk_128.cpp} (100%)
 rename xformers/csrc/attention/hip_fmha/instances/{ck_tiled_fmha_grouped_infer_fp16_with_causalmask_with_attnbias_headdim_256.cpp => ck_tiled_fmha_grouped_infer_fp16_with_causalmask_with_attnbias_maxk_256.cpp} (100%)
 rename xformers/csrc/attention/hip_fmha/instances/{ck_tiled_fmha_grouped_infer_fp16_with_causalmask_with_attnbias_headdim_32.cpp => ck_tiled_fmha_grouped_infer_fp16_with_causalmask_with_attnbias_maxk_32.cpp} (100%)
 rename xformers/csrc/attention/hip_fmha/instances/{ck_tiled_fmha_grouped_infer_fp16_with_causalmask_with_attnbias_headdim_64.cpp => ck_tiled_fmha_grouped_infer_fp16_with_causalmask_with_attnbias_maxk_64.cpp} (100%)

diff --git a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_forward.h b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_forward.h
index 8cdba0763..ccbfd2d86 100644
--- a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_forward.h
+++ b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_forward.h
@@ -38,7 +38,7 @@ template <
     typename scalar_t,
     bool has_causal_mask,
     bool has_attn_bias,
-    ck::index_t HDim>
+    ck::index_t MaxK>
 struct batched_forward_causalmask_attnbias_dispatched {
   using FmhaEpilogue = FmhaFwdEpilogue<FmhaFwdEpilogueProblem<
       typename FmhaFwdTypeConfig<scalar_t>::OaccDataType,
@@ -57,7 +57,7 @@ struct batched_forward_causalmask_attnbias_dispatched {
           typename FmhaFwdTypeConfig<scalar_t>::PDataType,
           typename FmhaFwdTypeConfig<scalar_t>::OaccDataType,
           typename FmhaFwdTypeConfig<scalar_t>::ODataType,
-          FmhaFwdShape<HDim>,
+          FmhaFwdShape<MaxK>,
           false, // kIsGroupMode
           FmhaMask,
           FmhaTraits>;
@@ -71,17 +71,17 @@ struct batched_forward_causalmask_attnbias_dispatched {
       using FmhaMask = ck::tile_program::block::
           GenericAttentionMask<has_masking, USE_LOCAL_ATTENTION>;
 
-      using FmhaShape = FmhaFwdShape<HDim>;
+      using FmhaShape = FmhaFwdShape<MaxK>;
       using FmhaTilePartitioner = FmhaFwdTilePartitioner<FmhaShape>;
       constexpr ck::index_t occupancy =
-          (HDim == 64) ? 3 : ((HDim == 256) ? 1 : 2);
+          (MaxK == 64) ? 3 : ((MaxK == 256) ? 1 : 2);
 
       bool pad_seqlen_q = !(param.M % FmhaShape::kM0 == 0);
       bool pad_seqlen_k = !(param.N % FmhaShape::kN0 == 0);
       bool pad_headdim_q = !(param.K % FmhaShape::kK0BlockLength == 0);
       bool pad_headdim_v = !(param.Kv % FmhaShape::kN1 == 0);
 
-      if constexpr (HDim == 256) {
+      if constexpr (MaxK == 256) {
         BOOL_SWITCH_4(
             pad_seqlen_q,
             kPadSeqLenQ,
@@ -221,7 +221,7 @@ template <
     typename scalar_t,
     bool has_causal_mask,
     bool has_attn_bias,
-    ck::index_t HDim>
+    ck::index_t MaxK>
 void run_batched_forward_causalmask_attnbias_dispatched(
     BatchedForwardParams& param,
     hipStream_t stream) {
@@ -229,5 +229,5 @@ void run_batched_forward_causalmask_attnbias_dispatched(
       scalar_t,
       has_causal_mask,
       has_attn_bias,
-      HDim>::Run(param, stream);
+      MaxK>::Run(param, stream);
 };
diff --git a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_forward_bp16.cpp b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_forward_bp16.cpp
index 749c80a77..8d90c7cd5 100644
--- a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_forward_bp16.cpp
+++ b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_forward_bp16.cpp
@@ -51,19 +51,19 @@ extern template void run_batched_forward_causalmask_attnbias_dispatched<ck::bhal
 
 void batched_forward_bp16(BatchedForwardParams& param, hipStream_t stream) {
   BOOL_SWITCH(param.has_attn_bias, HAS_ATTN_BIAS, [&] {
-    FMHA_FWD_HEADDIM_SWITCH(param.K, param.Kv, HDim, [&] {
+    FMHA_FWD_HEADDIM_SWITCH(param.K, param.Kv, MaxK, [&] {
       if (param.custom_mask_type == 0)
         run_batched_forward_causalmask_attnbias_dispatched<
             ck::bhalf_t,
             false,
             HAS_ATTN_BIAS,
-            HDim>(param, stream);
+            MaxK>(param, stream);
       else if (param.custom_mask_type == 1 || param.custom_mask_type == 2)
         run_batched_forward_causalmask_attnbias_dispatched<
             ck::bhalf_t,
             true,
             HAS_ATTN_BIAS,
-            HDim>(param, stream);
+            MaxK>(param, stream);
       else
         throw std::runtime_error("Invalid custom_mask_type value");
     });
diff --git a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_forward_fp16.cpp b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_forward_fp16.cpp
index c65f7fedc..3e6584971 100644
--- a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_forward_fp16.cpp
+++ b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_forward_fp16.cpp
@@ -51,19 +51,19 @@ extern template void run_batched_forward_causalmask_attnbias_dispatched<ck::half
 
 void batched_forward_fp16(BatchedForwardParams& param, hipStream_t stream) {
   BOOL_SWITCH(param.has_attn_bias, HAS_ATTN_BIAS, [&] {
-    FMHA_FWD_HEADDIM_SWITCH(param.K, param.Kv, HDim, [&] {
+    FMHA_FWD_HEADDIM_SWITCH(param.K, param.Kv, MaxK, [&] {
       if (param.custom_mask_type == 0)
         run_batched_forward_causalmask_attnbias_dispatched<
             ck::half_t,
             false,
             HAS_ATTN_BIAS,
-            HDim>(param, stream);
+            MaxK>(param, stream);
       else if (param.custom_mask_type == 1 || param.custom_mask_type == 2)
         run_batched_forward_causalmask_attnbias_dispatched<
             ck::half_t,
             true,
             HAS_ATTN_BIAS,
-            HDim>(param, stream);
+            MaxK>(param, stream);
       else
         throw std::runtime_error("Invalid custom_mask_type value");
     });
diff --git a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_infer.h b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_infer.h
index 0d72fde9f..af3ded107 100644
--- a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_infer.h
+++ b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_infer.h
@@ -38,7 +38,7 @@ template <
     typename scalar_t,
     bool has_causal_mask,
     bool has_attn_bias,
-    ck::index_t HDim>
+    ck::index_t MaxK>
 struct batched_infer_causalmask_attnbias_dispatched {
   using FmhaEpilogue = FmhaFwdEpilogue<FmhaFwdEpilogueProblem<
       typename FmhaFwdTypeConfig<scalar_t>::OaccDataType,
@@ -57,7 +57,7 @@ struct batched_infer_causalmask_attnbias_dispatched {
           typename FmhaFwdTypeConfig<scalar_t>::PDataType,
           typename FmhaFwdTypeConfig<scalar_t>::OaccDataType,
           typename FmhaFwdTypeConfig<scalar_t>::ODataType,
-          FmhaFwdShape<HDim>,
+          FmhaFwdShape<MaxK>,
           false, // kIsGroupMode
           FmhaMask,
           FmhaTraits>;
@@ -71,17 +71,17 @@ struct batched_infer_causalmask_attnbias_dispatched {
       using FmhaMask = ck::tile_program::block::
           GenericAttentionMask<has_masking, USE_LOCAL_ATTENTION>;
 
-      using FmhaShape = FmhaFwdShape<HDim>;
+      using FmhaShape = FmhaFwdShape<MaxK>;
       using FmhaTilePartitioner = FmhaFwdTilePartitioner<FmhaShape>;
       constexpr ck::index_t occupancy =
-          (HDim == 64) ? 3 : ((HDim == 256) ? 1 : 2);
+          (MaxK == 64) ? 3 : ((MaxK == 256) ? 1 : 2);
 
       bool pad_seqlen_q = !(param.M % FmhaShape::kM0 == 0);
       bool pad_seqlen_k = !(param.N % FmhaShape::kN0 == 0);
       bool pad_headdim_v = !(param.Kv % FmhaShape::kN1 == 0);
       bool pad_headdim_q = !(param.K % FmhaShape::kK0BlockLength == 0);
 
-      if constexpr (HDim == 256) {
+      if constexpr (MaxK == 256) {
         BOOL_SWITCH_4(
             pad_seqlen_q,
             kPadSeqLenQ,
@@ -221,7 +221,7 @@ template <
     typename scalar_t,
     bool has_causal_mask,
     bool has_attn_bias,
-    ck::index_t HDim>
+    ck::index_t MaxK>
 void run_batched_infer_causalmask_attnbias_dispatched(
     BatchedForwardParams& param,
     hipStream_t stream) {
@@ -229,5 +229,5 @@ void run_batched_infer_causalmask_attnbias_dispatched(
       scalar_t,
       has_causal_mask,
       has_attn_bias,
-      HDim>::Run(param, stream);
+      MaxK>::Run(param, stream);
 };
diff --git a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_infer_bp16.cpp b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_infer_bp16.cpp
index f0a4edd84..f4a2e064e 100644
--- a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_infer_bp16.cpp
+++ b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_infer_bp16.cpp
@@ -51,19 +51,19 @@ extern template void run_batched_infer_causalmask_attnbias_dispatched<ck::bhalf_
 
 void batched_infer_bp16(BatchedForwardParams& param, hipStream_t stream) {
   BOOL_SWITCH(param.has_attn_bias, HAS_ATTN_BIAS, [&] {
-    FMHA_FWD_HEADDIM_SWITCH(param.K, param.Kv, HDim, [&] {
+    FMHA_FWD_HEADDIM_SWITCH(param.K, param.Kv, MaxK, [&] {
       if (param.custom_mask_type == 0)
         run_batched_infer_causalmask_attnbias_dispatched<
             ck::bhalf_t,
             false,
             HAS_ATTN_BIAS,
-            HDim>(param, stream);
+            MaxK>(param, stream);
       else if (param.custom_mask_type == 1 || param.custom_mask_type == 2)
         run_batched_infer_causalmask_attnbias_dispatched<
             ck::bhalf_t,
             true,
             HAS_ATTN_BIAS,
-            HDim>(param, stream);
+            MaxK>(param, stream);
       else
         throw std::runtime_error("Invalid custom_mask_type value");
     });
diff --git a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_infer_fp16.cpp b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_infer_fp16.cpp
index b25041fdf..653cfacbd 100644
--- a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_infer_fp16.cpp
+++ b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_infer_fp16.cpp
@@ -51,19 +51,19 @@ extern template void run_batched_infer_causalmask_attnbias_dispatched<ck::half_t
 
 void batched_infer_fp16(BatchedForwardParams& param, hipStream_t stream) {
   BOOL_SWITCH(param.has_attn_bias, HAS_ATTN_BIAS, [&] {
-    FMHA_FWD_HEADDIM_SWITCH(param.K, param.Kv, HDim, [&] {
+    FMHA_FWD_HEADDIM_SWITCH(param.K, param.Kv, MaxK, [&] {
       if (param.custom_mask_type == 0)
         run_batched_infer_causalmask_attnbias_dispatched<
             ck::half_t,
             false,
             HAS_ATTN_BIAS,
-            HDim>(param, stream);
+            MaxK>(param, stream);
       else if (param.custom_mask_type == 1 || param.custom_mask_type == 2)
         run_batched_infer_causalmask_attnbias_dispatched<
             ck::half_t,
             true,
             HAS_ATTN_BIAS,
-            HDim>(param, stream);
+            MaxK>(param, stream);
       else
         throw std::runtime_error("Invalid custom_mask_type value");
     });
diff --git a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_definitions.h b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_definitions.h
index a20a8b5bd..4e3767fd2 100644
--- a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_definitions.h
+++ b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_definitions.h
@@ -47,7 +47,7 @@ struct FmhaFwdTypeConfig<ck::bhalf_t> {
   using ODataType = ck::bhalf_t;
 };
 
-template <ck::index_t HDim>
+template <ck::index_t MaxK>
 struct FmhaFwdBlockTile;
 
 template <>
@@ -75,7 +75,7 @@ using FmhaFwdWarpTile = ck::Sequence<32, 32, 16>;
 
 static constexpr bool IsVLayoutRowMajor = true;
 
-template <ck::index_t HDim>
+template <ck::index_t MaxK>
 struct FmhaFwdShape;
 
 template <>
diff --git a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_forward.h b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_forward.h
index 626857121..a79b3c1ef 100644
--- a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_forward.h
+++ b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_forward.h
@@ -38,7 +38,7 @@ template <
     typename scalar_t,
     bool has_causal_mask,
     bool has_attn_bias,
-    ck::index_t HDim>
+    ck::index_t MaxK>
 struct grouped_forward_causalmask_attnbias_dispatched {
   using FmhaEpilogue = FmhaFwdEpilogue<FmhaFwdEpilogueProblem<
       typename FmhaFwdTypeConfig<scalar_t>::OaccDataType,
@@ -57,7 +57,7 @@ struct grouped_forward_causalmask_attnbias_dispatched {
           typename FmhaFwdTypeConfig<scalar_t>::PDataType,
           typename FmhaFwdTypeConfig<scalar_t>::OaccDataType,
           typename FmhaFwdTypeConfig<scalar_t>::ODataType,
-          FmhaFwdShape<HDim>,
+          FmhaFwdShape<MaxK>,
           true, // kIsGroupMode
           FmhaMask,
           FmhaTraits>;
@@ -71,11 +71,10 @@ struct grouped_forward_causalmask_attnbias_dispatched {
       using FmhaMask = ck::tile_program::block::
           GenericAttentionMask<has_masking, USE_LOCAL_ATTENTION>;
 
-      using FmhaShape = FmhaFwdShape<HDim>;
+      using FmhaShape = FmhaFwdShape<MaxK>;
       using FmhaTilePartitioner = FmhaFwdTilePartitioner<FmhaShape>;
-      constexpr ck::index_t occupancy = (HDim == 64) ? 3
-          : (HDim == 256)                            ? 1
-                                                     : 2;
+      constexpr ck::index_t occupancy =
+          (MaxK == 64) ? 3 : (MaxK == 256) ? 1 : 2;
 
       constexpr bool kPadSeqLenQ = true;
       constexpr bool kPadSeqLenK = true;
@@ -83,7 +82,7 @@ struct grouped_forward_causalmask_attnbias_dispatched {
       bool pad_headdim_q = !(param.K % FmhaShape::kK0BlockLength == 0);
       bool pad_headdim_v = !(param.Kv % FmhaShape::kN1 == 0);
 
-      if constexpr (HDim == 256) {
+      if constexpr (MaxK == 256) {
         BOOL_SWITCH_2(
             pad_headdim_q, kPadHeadDimQ, pad_headdim_v, kPadHeadDimV, [&] {
               using FmhaTraits = ck::tile_program::TileFmhaTraits<
@@ -188,7 +187,7 @@ template <
     typename scalar_t,
     bool has_causal_mask,
     bool has_attn_bias,
-    ck::index_t HDim>
+    ck::index_t MaxK>
 void run_grouped_forward_causalmask_attnbias_dispatched(
     GroupedForwardParams& param,
     hipStream_t stream) {
@@ -196,5 +195,5 @@ void run_grouped_forward_causalmask_attnbias_dispatched(
       scalar_t,
       has_causal_mask,
       has_attn_bias,
-      HDim>::Run(param, stream);
+      MaxK>::Run(param, stream);
 };
diff --git a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_forward_bp16.cpp b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_forward_bp16.cpp
index db313f3ef..b417156f5 100644
--- a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_forward_bp16.cpp
+++ b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_forward_bp16.cpp
@@ -51,19 +51,19 @@ extern template void run_grouped_forward_causalmask_attnbias_dispatched<ck::bhal
 
 void grouped_forward_bp16(GroupedForwardParams& param, hipStream_t stream) {
   BOOL_SWITCH(param.has_attn_bias, HAS_ATTN_BIAS, [&] {
-    FMHA_FWD_HEADDIM_SWITCH(param.K, param.Kv, HDim, [&] {
+    FMHA_FWD_HEADDIM_SWITCH(param.K, param.Kv, MaxK, [&] {
       if (param.custom_mask_type == 0)
         run_grouped_forward_causalmask_attnbias_dispatched<
             ck::bhalf_t,
             false,
             HAS_ATTN_BIAS,
-            HDim>(param, stream);
+            MaxK>(param, stream);
       else if (param.custom_mask_type == 1 || param.custom_mask_type == 2)
         run_grouped_forward_causalmask_attnbias_dispatched<
             ck::bhalf_t,
             true,
             HAS_ATTN_BIAS,
-            HDim>(param, stream);
+            MaxK>(param, stream);
       else
         throw std::runtime_error("Invalid custom_mask_type value");
     });
diff --git a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_forward_fp16.cpp b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_forward_fp16.cpp
index 2e807d3a5..b7c278c53 100644
--- a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_forward_fp16.cpp
+++ b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_forward_fp16.cpp
@@ -51,19 +51,19 @@ extern template void run_grouped_forward_causalmask_attnbias_dispatched<ck::half
 
 void grouped_forward_fp16(GroupedForwardParams& param, hipStream_t stream) {
   BOOL_SWITCH(param.has_attn_bias, HAS_ATTN_BIAS, [&] {
-    FMHA_FWD_HEADDIM_SWITCH(param.K, param.Kv, HDim, [&] {
+    FMHA_FWD_HEADDIM_SWITCH(param.K, param.Kv, MaxK, [&] {
       if (param.custom_mask_type == 0)
         run_grouped_forward_causalmask_attnbias_dispatched<
             ck::half_t,
             false,
             HAS_ATTN_BIAS,
-            HDim>(param, stream);
+            MaxK>(param, stream);
       else if (param.custom_mask_type == 1 || param.custom_mask_type == 2)
         run_grouped_forward_causalmask_attnbias_dispatched<
             ck::half_t,
             true,
             HAS_ATTN_BIAS,
-            HDim>(param, stream);
+            MaxK>(param, stream);
       else
         throw std::runtime_error("Invalid custom_mask_type value");
     });
diff --git a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_infer.h b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_infer.h
index 11b2857fd..37be384c7 100644
--- a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_infer.h
+++ b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_infer.h
@@ -38,7 +38,7 @@ template <
     typename scalar_t,
     bool has_causal_mask,
     bool has_attn_bias,
-    ck::index_t HDim>
+    ck::index_t MaxK>
 struct grouped_infer_causalmask_attnbias_dispatched {
   using FmhaEpilogue = FmhaFwdEpilogue<FmhaFwdEpilogueProblem<
       typename FmhaFwdTypeConfig<scalar_t>::OaccDataType,
@@ -57,7 +57,7 @@ struct grouped_infer_causalmask_attnbias_dispatched {
           typename FmhaFwdTypeConfig<scalar_t>::PDataType,
           typename FmhaFwdTypeConfig<scalar_t>::OaccDataType,
           typename FmhaFwdTypeConfig<scalar_t>::ODataType,
-          FmhaFwdShape<HDim>,
+          FmhaFwdShape<MaxK>,
           true, // kIsGroupMode
           FmhaMask,
           FmhaTraits>;
@@ -71,10 +71,10 @@ struct grouped_infer_causalmask_attnbias_dispatched {
       using FmhaMask = ck::tile_program::block::
           GenericAttentionMask<has_masking, USE_LOCAL_ATTENTION>;
 
-      using FmhaShape = FmhaFwdShape<HDim>;
+      using FmhaShape = FmhaFwdShape<MaxK>;
       using FmhaTilePartitioner = FmhaFwdTilePartitioner<FmhaShape>;
       constexpr ck::index_t occupancy =
-          (HDim == 64) ? 3 : ((HDim == 256) ? 1 : 2);
+          (MaxK == 64) ? 3 : ((MaxK == 256) ? 1 : 2);
 
       constexpr bool kPadSeqLenQ = true;
       constexpr bool kPadSeqLenK = true;
@@ -82,7 +82,7 @@ struct grouped_infer_causalmask_attnbias_dispatched {
       bool pad_headdim_q = !(param.K % FmhaShape::kK0BlockLength == 0);
       bool pad_headdim_v = !(param.Kv % FmhaShape::kN1 == 0);
 
-      if constexpr (HDim == 256) {
+      if constexpr (MaxK == 256) {
         BOOL_SWITCH_2(
             pad_headdim_q, kPadHeadDimQ, pad_headdim_v, kPadHeadDimV, [&] {
               using FmhaTraits = ck::tile_program::TileFmhaTraits<
@@ -187,7 +187,7 @@ template <
     typename scalar_t,
     bool has_causal_mask,
     bool has_attn_bias,
-    ck::index_t HDim>
+    ck::index_t MaxK>
 void run_grouped_infer_causalmask_attnbias_dispatched(
     GroupedForwardParams& param,
     hipStream_t stream) {
@@ -195,5 +195,5 @@ void run_grouped_infer_causalmask_attnbias_dispatched(
       scalar_t,
       has_causal_mask,
       has_attn_bias,
-      HDim>::Run(param, stream);
+      MaxK>::Run(param, stream);
 };
diff --git a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_infer_bp16.cpp b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_infer_bp16.cpp
index ce95de00c..7ee53261d 100644
--- a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_infer_bp16.cpp
+++ b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_infer_bp16.cpp
@@ -51,19 +51,19 @@ extern template void run_grouped_infer_causalmask_attnbias_dispatched<ck::bhalf_
 
 void grouped_infer_bp16(GroupedForwardParams& param, hipStream_t stream) {
   BOOL_SWITCH(param.has_attn_bias, HAS_ATTN_BIAS, [&] {
-    FMHA_FWD_HEADDIM_SWITCH(param.K, param.Kv, HDim, [&] {
+    FMHA_FWD_HEADDIM_SWITCH(param.K, param.Kv, MaxK, [&] {
       if (param.custom_mask_type == 0)
         run_grouped_infer_causalmask_attnbias_dispatched<
             ck::bhalf_t,
             false,
             HAS_ATTN_BIAS,
-            HDim>(param, stream);
+            MaxK>(param, stream);
       else if (param.custom_mask_type == 1 || param.custom_mask_type == 2)
         run_grouped_infer_causalmask_attnbias_dispatched<
             ck::bhalf_t,
             true,
             HAS_ATTN_BIAS,
-            HDim>(param, stream);
+            MaxK>(param, stream);
       else
         throw std::runtime_error("Invalid custom_mask_type value");
     });
diff --git a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_infer_fp16.cpp b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_infer_fp16.cpp
index 830176e68..2d03119db 100644
--- a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_infer_fp16.cpp
+++ b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_infer_fp16.cpp
@@ -51,19 +51,19 @@ extern template void run_grouped_infer_causalmask_attnbias_dispatched<ck::half_t
 
 void grouped_infer_fp16(GroupedForwardParams& param, hipStream_t stream) {
   BOOL_SWITCH(param.has_attn_bias, HAS_ATTN_BIAS, [&] {
-    FMHA_FWD_HEADDIM_SWITCH(param.K, param.Kv, HDim, [&] {
+    FMHA_FWD_HEADDIM_SWITCH(param.K, param.Kv, MaxK, [&] {
       if (param.custom_mask_type == 0)
         run_grouped_infer_causalmask_attnbias_dispatched<
             ck::half_t,
             false,
             HAS_ATTN_BIAS,
-            HDim>(param, stream);
+            MaxK>(param, stream);
       else if (param.custom_mask_type == 1 || param.custom_mask_type == 2)
         run_grouped_infer_causalmask_attnbias_dispatched<
             ck::half_t,
             true,
             HAS_ATTN_BIAS,
-            HDim>(param, stream);
+            MaxK>(param, stream);
       else
         throw std::runtime_error("Invalid custom_mask_type value");
     });
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_forward_bp16_no_causalmask_no_attnbias_headdim_128.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_forward_bp16_no_causalmask_no_attnbias_maxk_128.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_forward_bp16_no_causalmask_no_attnbias_headdim_128.cpp
rename to xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_forward_bp16_no_causalmask_no_attnbias_maxk_128.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_forward_bp16_no_causalmask_no_attnbias_headdim_256.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_forward_bp16_no_causalmask_no_attnbias_maxk_256.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_forward_bp16_no_causalmask_no_attnbias_headdim_256.cpp
rename to xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_forward_bp16_no_causalmask_no_attnbias_maxk_256.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_forward_bp16_no_causalmask_no_attnbias_headdim_32.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_forward_bp16_no_causalmask_no_attnbias_maxk_32.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_forward_bp16_no_causalmask_no_attnbias_headdim_32.cpp
rename to xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_forward_bp16_no_causalmask_no_attnbias_maxk_32.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_forward_bp16_no_causalmask_no_attnbias_headdim_64.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_forward_bp16_no_causalmask_no_attnbias_maxk_64.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_forward_bp16_no_causalmask_no_attnbias_headdim_64.cpp
rename to xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_forward_bp16_no_causalmask_no_attnbias_maxk_64.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_forward_bp16_no_causalmask_with_attnbias_headdim_128.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_forward_bp16_no_causalmask_with_attnbias_maxk_128.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_forward_bp16_no_causalmask_with_attnbias_headdim_128.cpp
rename to xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_forward_bp16_no_causalmask_with_attnbias_maxk_128.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_forward_bp16_no_causalmask_with_attnbias_headdim_256.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_forward_bp16_no_causalmask_with_attnbias_maxk_256.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_forward_bp16_no_causalmask_with_attnbias_headdim_256.cpp
rename to xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_forward_bp16_no_causalmask_with_attnbias_maxk_256.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_forward_bp16_no_causalmask_with_attnbias_headdim_32.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_forward_bp16_no_causalmask_with_attnbias_maxk_32.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_forward_bp16_no_causalmask_with_attnbias_headdim_32.cpp
rename to xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_forward_bp16_no_causalmask_with_attnbias_maxk_32.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_forward_bp16_no_causalmask_with_attnbias_headdim_64.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_forward_bp16_no_causalmask_with_attnbias_maxk_64.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_forward_bp16_no_causalmask_with_attnbias_headdim_64.cpp
rename to xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_forward_bp16_no_causalmask_with_attnbias_maxk_64.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_forward_bp16_with_causalmask_no_attnbias_headdim_128.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_forward_bp16_with_causalmask_no_attnbias_maxk_128.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_forward_bp16_with_causalmask_no_attnbias_headdim_128.cpp
rename to xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_forward_bp16_with_causalmask_no_attnbias_maxk_128.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_forward_bp16_with_causalmask_no_attnbias_headdim_256.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_forward_bp16_with_causalmask_no_attnbias_maxk_256.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_forward_bp16_with_causalmask_no_attnbias_headdim_256.cpp
rename to xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_forward_bp16_with_causalmask_no_attnbias_maxk_256.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_forward_bp16_with_causalmask_no_attnbias_headdim_32.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_forward_bp16_with_causalmask_no_attnbias_maxk_32.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_forward_bp16_with_causalmask_no_attnbias_headdim_32.cpp
rename to xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_forward_bp16_with_causalmask_no_attnbias_maxk_32.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_forward_bp16_with_causalmask_no_attnbias_headdim_64.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_forward_bp16_with_causalmask_no_attnbias_maxk_64.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_forward_bp16_with_causalmask_no_attnbias_headdim_64.cpp
rename to xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_forward_bp16_with_causalmask_no_attnbias_maxk_64.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_forward_bp16_with_causalmask_with_attnbias_headdim_128.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_forward_bp16_with_causalmask_with_attnbias_maxk_128.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_forward_bp16_with_causalmask_with_attnbias_headdim_128.cpp
rename to xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_forward_bp16_with_causalmask_with_attnbias_maxk_128.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_forward_bp16_with_causalmask_with_attnbias_headdim_256.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_forward_bp16_with_causalmask_with_attnbias_maxk_256.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_forward_bp16_with_causalmask_with_attnbias_headdim_256.cpp
rename to xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_forward_bp16_with_causalmask_with_attnbias_maxk_256.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_forward_bp16_with_causalmask_with_attnbias_headdim_32.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_forward_bp16_with_causalmask_with_attnbias_maxk_32.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_forward_bp16_with_causalmask_with_attnbias_headdim_32.cpp
rename to xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_forward_bp16_with_causalmask_with_attnbias_maxk_32.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_forward_bp16_with_causalmask_with_attnbias_headdim_64.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_forward_bp16_with_causalmask_with_attnbias_maxk_64.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_forward_bp16_with_causalmask_with_attnbias_headdim_64.cpp
rename to xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_forward_bp16_with_causalmask_with_attnbias_maxk_64.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_forward_fp16_no_causalmask_no_attnbias_headdim_128.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_forward_fp16_no_causalmask_no_attnbias_maxk_128.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_forward_fp16_no_causalmask_no_attnbias_headdim_128.cpp
rename to xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_forward_fp16_no_causalmask_no_attnbias_maxk_128.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_forward_fp16_no_causalmask_no_attnbias_headdim_256.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_forward_fp16_no_causalmask_no_attnbias_maxk_256.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_forward_fp16_no_causalmask_no_attnbias_headdim_256.cpp
rename to xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_forward_fp16_no_causalmask_no_attnbias_maxk_256.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_forward_fp16_no_causalmask_no_attnbias_headdim_32.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_forward_fp16_no_causalmask_no_attnbias_maxk_32.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_forward_fp16_no_causalmask_no_attnbias_headdim_32.cpp
rename to xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_forward_fp16_no_causalmask_no_attnbias_maxk_32.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_forward_fp16_no_causalmask_no_attnbias_headdim_64.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_forward_fp16_no_causalmask_no_attnbias_maxk_64.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_forward_fp16_no_causalmask_no_attnbias_headdim_64.cpp
rename to xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_forward_fp16_no_causalmask_no_attnbias_maxk_64.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_forward_fp16_no_causalmask_with_attnbias_headdim_128.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_forward_fp16_no_causalmask_with_attnbias_maxk_128.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_forward_fp16_no_causalmask_with_attnbias_headdim_128.cpp
rename to xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_forward_fp16_no_causalmask_with_attnbias_maxk_128.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_forward_fp16_no_causalmask_with_attnbias_headdim_256.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_forward_fp16_no_causalmask_with_attnbias_maxk_256.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_forward_fp16_no_causalmask_with_attnbias_headdim_256.cpp
rename to xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_forward_fp16_no_causalmask_with_attnbias_maxk_256.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_forward_fp16_no_causalmask_with_attnbias_headdim_32.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_forward_fp16_no_causalmask_with_attnbias_maxk_32.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_forward_fp16_no_causalmask_with_attnbias_headdim_32.cpp
rename to xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_forward_fp16_no_causalmask_with_attnbias_maxk_32.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_forward_fp16_no_causalmask_with_attnbias_headdim_64.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_forward_fp16_no_causalmask_with_attnbias_maxk_64.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_forward_fp16_no_causalmask_with_attnbias_headdim_64.cpp
rename to xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_forward_fp16_no_causalmask_with_attnbias_maxk_64.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_forward_fp16_with_causalmask_no_attnbias_headdim_128.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_forward_fp16_with_causalmask_no_attnbias_maxk_128.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_forward_fp16_with_causalmask_no_attnbias_headdim_128.cpp
rename to xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_forward_fp16_with_causalmask_no_attnbias_maxk_128.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_forward_fp16_with_causalmask_no_attnbias_headdim_256.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_forward_fp16_with_causalmask_no_attnbias_maxk_256.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_forward_fp16_with_causalmask_no_attnbias_headdim_256.cpp
rename to xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_forward_fp16_with_causalmask_no_attnbias_maxk_256.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_forward_fp16_with_causalmask_no_attnbias_headdim_32.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_forward_fp16_with_causalmask_no_attnbias_maxk_32.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_forward_fp16_with_causalmask_no_attnbias_headdim_32.cpp
rename to xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_forward_fp16_with_causalmask_no_attnbias_maxk_32.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_forward_fp16_with_causalmask_no_attnbias_headdim_64.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_forward_fp16_with_causalmask_no_attnbias_maxk_64.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_forward_fp16_with_causalmask_no_attnbias_headdim_64.cpp
rename to xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_forward_fp16_with_causalmask_no_attnbias_maxk_64.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_forward_fp16_with_causalmask_with_attnbias_headdim_128.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_forward_fp16_with_causalmask_with_attnbias_maxk_128.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_forward_fp16_with_causalmask_with_attnbias_headdim_128.cpp
rename to xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_forward_fp16_with_causalmask_with_attnbias_maxk_128.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_forward_fp16_with_causalmask_with_attnbias_headdim_256.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_forward_fp16_with_causalmask_with_attnbias_maxk_256.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_forward_fp16_with_causalmask_with_attnbias_headdim_256.cpp
rename to xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_forward_fp16_with_causalmask_with_attnbias_maxk_256.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_forward_fp16_with_causalmask_with_attnbias_headdim_32.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_forward_fp16_with_causalmask_with_attnbias_maxk_32.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_forward_fp16_with_causalmask_with_attnbias_headdim_32.cpp
rename to xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_forward_fp16_with_causalmask_with_attnbias_maxk_32.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_forward_fp16_with_causalmask_with_attnbias_headdim_64.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_forward_fp16_with_causalmask_with_attnbias_maxk_64.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_forward_fp16_with_causalmask_with_attnbias_headdim_64.cpp
rename to xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_forward_fp16_with_causalmask_with_attnbias_maxk_64.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_infer_bp16_no_causalmask_no_attnbias_headdim_128.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_infer_bp16_no_causalmask_no_attnbias_maxk_128.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_infer_bp16_no_causalmask_no_attnbias_headdim_128.cpp
rename to xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_infer_bp16_no_causalmask_no_attnbias_maxk_128.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_infer_bp16_no_causalmask_no_attnbias_headdim_256.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_infer_bp16_no_causalmask_no_attnbias_maxk_256.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_infer_bp16_no_causalmask_no_attnbias_headdim_256.cpp
rename to xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_infer_bp16_no_causalmask_no_attnbias_maxk_256.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_infer_bp16_no_causalmask_no_attnbias_headdim_32.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_infer_bp16_no_causalmask_no_attnbias_maxk_32.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_infer_bp16_no_causalmask_no_attnbias_headdim_32.cpp
rename to xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_infer_bp16_no_causalmask_no_attnbias_maxk_32.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_infer_bp16_no_causalmask_no_attnbias_headdim_64.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_infer_bp16_no_causalmask_no_attnbias_maxk_64.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_infer_bp16_no_causalmask_no_attnbias_headdim_64.cpp
rename to xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_infer_bp16_no_causalmask_no_attnbias_maxk_64.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_infer_bp16_no_causalmask_with_attnbias_headdim_128.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_infer_bp16_no_causalmask_with_attnbias_maxk_128.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_infer_bp16_no_causalmask_with_attnbias_headdim_128.cpp
rename to xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_infer_bp16_no_causalmask_with_attnbias_maxk_128.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_infer_bp16_no_causalmask_with_attnbias_headdim_256.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_infer_bp16_no_causalmask_with_attnbias_maxk_256.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_infer_bp16_no_causalmask_with_attnbias_headdim_256.cpp
rename to xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_infer_bp16_no_causalmask_with_attnbias_maxk_256.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_infer_bp16_no_causalmask_with_attnbias_headdim_32.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_infer_bp16_no_causalmask_with_attnbias_maxk_32.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_infer_bp16_no_causalmask_with_attnbias_headdim_32.cpp
rename to xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_infer_bp16_no_causalmask_with_attnbias_maxk_32.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_infer_bp16_no_causalmask_with_attnbias_headdim_64.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_infer_bp16_no_causalmask_with_attnbias_maxk_64.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_infer_bp16_no_causalmask_with_attnbias_headdim_64.cpp
rename to xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_infer_bp16_no_causalmask_with_attnbias_maxk_64.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_infer_bp16_with_causalmask_no_attnbias_headdim_128.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_infer_bp16_with_causalmask_no_attnbias_maxk_128.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_infer_bp16_with_causalmask_no_attnbias_headdim_128.cpp
rename to xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_infer_bp16_with_causalmask_no_attnbias_maxk_128.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_infer_bp16_with_causalmask_no_attnbias_headdim_256.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_infer_bp16_with_causalmask_no_attnbias_maxk_256.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_infer_bp16_with_causalmask_no_attnbias_headdim_256.cpp
rename to xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_infer_bp16_with_causalmask_no_attnbias_maxk_256.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_infer_bp16_with_causalmask_no_attnbias_headdim_32.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_infer_bp16_with_causalmask_no_attnbias_maxk_32.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_infer_bp16_with_causalmask_no_attnbias_headdim_32.cpp
rename to xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_infer_bp16_with_causalmask_no_attnbias_maxk_32.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_infer_bp16_with_causalmask_no_attnbias_headdim_64.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_infer_bp16_with_causalmask_no_attnbias_maxk_64.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_infer_bp16_with_causalmask_no_attnbias_headdim_64.cpp
rename to xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_infer_bp16_with_causalmask_no_attnbias_maxk_64.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_infer_bp16_with_causalmask_with_attnbias_headdim_128.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_infer_bp16_with_causalmask_with_attnbias_maxk_128.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_infer_bp16_with_causalmask_with_attnbias_headdim_128.cpp
rename to xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_infer_bp16_with_causalmask_with_attnbias_maxk_128.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_infer_bp16_with_causalmask_with_attnbias_headdim_256.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_infer_bp16_with_causalmask_with_attnbias_maxk_256.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_infer_bp16_with_causalmask_with_attnbias_headdim_256.cpp
rename to xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_infer_bp16_with_causalmask_with_attnbias_maxk_256.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_infer_bp16_with_causalmask_with_attnbias_headdim_32.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_infer_bp16_with_causalmask_with_attnbias_maxk_32.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_infer_bp16_with_causalmask_with_attnbias_headdim_32.cpp
rename to xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_infer_bp16_with_causalmask_with_attnbias_maxk_32.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_infer_bp16_with_causalmask_with_attnbias_headdim_64.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_infer_bp16_with_causalmask_with_attnbias_maxk_64.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_infer_bp16_with_causalmask_with_attnbias_headdim_64.cpp
rename to xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_infer_bp16_with_causalmask_with_attnbias_maxk_64.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_infer_fp16_no_causalmask_no_attnbias_headdim_128.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_infer_fp16_no_causalmask_no_attnbias_maxk_128.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_infer_fp16_no_causalmask_no_attnbias_headdim_128.cpp
rename to xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_infer_fp16_no_causalmask_no_attnbias_maxk_128.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_infer_fp16_no_causalmask_no_attnbias_headdim_256.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_infer_fp16_no_causalmask_no_attnbias_maxk_256.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_infer_fp16_no_causalmask_no_attnbias_headdim_256.cpp
rename to xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_infer_fp16_no_causalmask_no_attnbias_maxk_256.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_infer_fp16_no_causalmask_no_attnbias_headdim_32.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_infer_fp16_no_causalmask_no_attnbias_maxk_32.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_infer_fp16_no_causalmask_no_attnbias_headdim_32.cpp
rename to xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_infer_fp16_no_causalmask_no_attnbias_maxk_32.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_infer_fp16_no_causalmask_no_attnbias_headdim_64.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_infer_fp16_no_causalmask_no_attnbias_maxk_64.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_infer_fp16_no_causalmask_no_attnbias_headdim_64.cpp
rename to xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_infer_fp16_no_causalmask_no_attnbias_maxk_64.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_infer_fp16_no_causalmask_with_attnbias_headdim_128.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_infer_fp16_no_causalmask_with_attnbias_maxk_128.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_infer_fp16_no_causalmask_with_attnbias_headdim_128.cpp
rename to xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_infer_fp16_no_causalmask_with_attnbias_maxk_128.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_infer_fp16_no_causalmask_with_attnbias_headdim_256.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_infer_fp16_no_causalmask_with_attnbias_maxk_256.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_infer_fp16_no_causalmask_with_attnbias_headdim_256.cpp
rename to xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_infer_fp16_no_causalmask_with_attnbias_maxk_256.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_infer_fp16_no_causalmask_with_attnbias_headdim_32.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_infer_fp16_no_causalmask_with_attnbias_maxk_32.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_infer_fp16_no_causalmask_with_attnbias_headdim_32.cpp
rename to xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_infer_fp16_no_causalmask_with_attnbias_maxk_32.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_infer_fp16_no_causalmask_with_attnbias_headdim_64.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_infer_fp16_no_causalmask_with_attnbias_maxk_64.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_infer_fp16_no_causalmask_with_attnbias_headdim_64.cpp
rename to xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_infer_fp16_no_causalmask_with_attnbias_maxk_64.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_infer_fp16_with_causalmask_no_attnbias_headdim_128.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_infer_fp16_with_causalmask_no_attnbias_maxk_128.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_infer_fp16_with_causalmask_no_attnbias_headdim_128.cpp
rename to xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_infer_fp16_with_causalmask_no_attnbias_maxk_128.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_infer_fp16_with_causalmask_no_attnbias_headdim_256.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_infer_fp16_with_causalmask_no_attnbias_maxk_256.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_infer_fp16_with_causalmask_no_attnbias_headdim_256.cpp
rename to xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_infer_fp16_with_causalmask_no_attnbias_maxk_256.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_infer_fp16_with_causalmask_no_attnbias_headdim_32.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_infer_fp16_with_causalmask_no_attnbias_maxk_32.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_infer_fp16_with_causalmask_no_attnbias_headdim_32.cpp
rename to xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_infer_fp16_with_causalmask_no_attnbias_maxk_32.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_infer_fp16_with_causalmask_no_attnbias_headdim_64.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_infer_fp16_with_causalmask_no_attnbias_maxk_64.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_infer_fp16_with_causalmask_no_attnbias_headdim_64.cpp
rename to xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_infer_fp16_with_causalmask_no_attnbias_maxk_64.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_infer_fp16_with_causalmask_with_attnbias_headdim_128.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_infer_fp16_with_causalmask_with_attnbias_maxk_128.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_infer_fp16_with_causalmask_with_attnbias_headdim_128.cpp
rename to xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_infer_fp16_with_causalmask_with_attnbias_maxk_128.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_infer_fp16_with_causalmask_with_attnbias_headdim_256.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_infer_fp16_with_causalmask_with_attnbias_maxk_256.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_infer_fp16_with_causalmask_with_attnbias_headdim_256.cpp
rename to xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_infer_fp16_with_causalmask_with_attnbias_maxk_256.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_infer_fp16_with_causalmask_with_attnbias_headdim_32.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_infer_fp16_with_causalmask_with_attnbias_maxk_32.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_infer_fp16_with_causalmask_with_attnbias_headdim_32.cpp
rename to xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_infer_fp16_with_causalmask_with_attnbias_maxk_32.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_infer_fp16_with_causalmask_with_attnbias_headdim_64.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_infer_fp16_with_causalmask_with_attnbias_maxk_64.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_infer_fp16_with_causalmask_with_attnbias_headdim_64.cpp
rename to xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_infer_fp16_with_causalmask_with_attnbias_maxk_64.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_forward_bp16_no_causalmask_no_attnbias_headdim_128.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_forward_bp16_no_causalmask_no_attnbias_maxk_128.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_forward_bp16_no_causalmask_no_attnbias_headdim_128.cpp
rename to xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_forward_bp16_no_causalmask_no_attnbias_maxk_128.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_forward_bp16_no_causalmask_no_attnbias_headdim_256.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_forward_bp16_no_causalmask_no_attnbias_maxk_256.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_forward_bp16_no_causalmask_no_attnbias_headdim_256.cpp
rename to xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_forward_bp16_no_causalmask_no_attnbias_maxk_256.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_forward_bp16_no_causalmask_no_attnbias_headdim_32.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_forward_bp16_no_causalmask_no_attnbias_maxk_32.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_forward_bp16_no_causalmask_no_attnbias_headdim_32.cpp
rename to xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_forward_bp16_no_causalmask_no_attnbias_maxk_32.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_forward_bp16_no_causalmask_no_attnbias_headdim_64.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_forward_bp16_no_causalmask_no_attnbias_maxk_64.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_forward_bp16_no_causalmask_no_attnbias_headdim_64.cpp
rename to xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_forward_bp16_no_causalmask_no_attnbias_maxk_64.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_forward_bp16_no_causalmask_with_attnbias_headdim_128.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_forward_bp16_no_causalmask_with_attnbias_maxk_128.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_forward_bp16_no_causalmask_with_attnbias_headdim_128.cpp
rename to xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_forward_bp16_no_causalmask_with_attnbias_maxk_128.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_forward_bp16_no_causalmask_with_attnbias_headdim_256.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_forward_bp16_no_causalmask_with_attnbias_maxk_256.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_forward_bp16_no_causalmask_with_attnbias_headdim_256.cpp
rename to xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_forward_bp16_no_causalmask_with_attnbias_maxk_256.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_forward_bp16_no_causalmask_with_attnbias_headdim_32.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_forward_bp16_no_causalmask_with_attnbias_maxk_32.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_forward_bp16_no_causalmask_with_attnbias_headdim_32.cpp
rename to xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_forward_bp16_no_causalmask_with_attnbias_maxk_32.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_forward_bp16_no_causalmask_with_attnbias_headdim_64.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_forward_bp16_no_causalmask_with_attnbias_maxk_64.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_forward_bp16_no_causalmask_with_attnbias_headdim_64.cpp
rename to xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_forward_bp16_no_causalmask_with_attnbias_maxk_64.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_forward_bp16_with_causalmask_no_attnbias_headdim_128.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_forward_bp16_with_causalmask_no_attnbias_maxk_128.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_forward_bp16_with_causalmask_no_attnbias_headdim_128.cpp
rename to xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_forward_bp16_with_causalmask_no_attnbias_maxk_128.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_forward_bp16_with_causalmask_no_attnbias_headdim_256.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_forward_bp16_with_causalmask_no_attnbias_maxk_256.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_forward_bp16_with_causalmask_no_attnbias_headdim_256.cpp
rename to xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_forward_bp16_with_causalmask_no_attnbias_maxk_256.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_forward_bp16_with_causalmask_no_attnbias_headdim_32.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_forward_bp16_with_causalmask_no_attnbias_maxk_32.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_forward_bp16_with_causalmask_no_attnbias_headdim_32.cpp
rename to xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_forward_bp16_with_causalmask_no_attnbias_maxk_32.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_forward_bp16_with_causalmask_no_attnbias_headdim_64.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_forward_bp16_with_causalmask_no_attnbias_maxk_64.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_forward_bp16_with_causalmask_no_attnbias_headdim_64.cpp
rename to xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_forward_bp16_with_causalmask_no_attnbias_maxk_64.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_forward_bp16_with_causalmask_with_attnbias_headdim_128.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_forward_bp16_with_causalmask_with_attnbias_maxk_128.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_forward_bp16_with_causalmask_with_attnbias_headdim_128.cpp
rename to xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_forward_bp16_with_causalmask_with_attnbias_maxk_128.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_forward_bp16_with_causalmask_with_attnbias_headdim_256.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_forward_bp16_with_causalmask_with_attnbias_maxk_256.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_forward_bp16_with_causalmask_with_attnbias_headdim_256.cpp
rename to xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_forward_bp16_with_causalmask_with_attnbias_maxk_256.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_forward_bp16_with_causalmask_with_attnbias_headdim_32.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_forward_bp16_with_causalmask_with_attnbias_maxk_32.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_forward_bp16_with_causalmask_with_attnbias_headdim_32.cpp
rename to xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_forward_bp16_with_causalmask_with_attnbias_maxk_32.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_forward_bp16_with_causalmask_with_attnbias_headdim_64.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_forward_bp16_with_causalmask_with_attnbias_maxk_64.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_forward_bp16_with_causalmask_with_attnbias_headdim_64.cpp
rename to xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_forward_bp16_with_causalmask_with_attnbias_maxk_64.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_forward_fp16_no_causalmask_no_attnbias_headdim_128.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_forward_fp16_no_causalmask_no_attnbias_maxk_128.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_forward_fp16_no_causalmask_no_attnbias_headdim_128.cpp
rename to xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_forward_fp16_no_causalmask_no_attnbias_maxk_128.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_forward_fp16_no_causalmask_no_attnbias_headdim_256.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_forward_fp16_no_causalmask_no_attnbias_maxk_256.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_forward_fp16_no_causalmask_no_attnbias_headdim_256.cpp
rename to xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_forward_fp16_no_causalmask_no_attnbias_maxk_256.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_forward_fp16_no_causalmask_no_attnbias_headdim_32.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_forward_fp16_no_causalmask_no_attnbias_maxk_32.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_forward_fp16_no_causalmask_no_attnbias_headdim_32.cpp
rename to xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_forward_fp16_no_causalmask_no_attnbias_maxk_32.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_forward_fp16_no_causalmask_no_attnbias_headdim_64.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_forward_fp16_no_causalmask_no_attnbias_maxk_64.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_forward_fp16_no_causalmask_no_attnbias_headdim_64.cpp
rename to xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_forward_fp16_no_causalmask_no_attnbias_maxk_64.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_forward_fp16_no_causalmask_with_attnbias_headdim_128.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_forward_fp16_no_causalmask_with_attnbias_maxk_128.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_forward_fp16_no_causalmask_with_attnbias_headdim_128.cpp
rename to xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_forward_fp16_no_causalmask_with_attnbias_maxk_128.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_forward_fp16_no_causalmask_with_attnbias_headdim_256.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_forward_fp16_no_causalmask_with_attnbias_maxk_256.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_forward_fp16_no_causalmask_with_attnbias_headdim_256.cpp
rename to xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_forward_fp16_no_causalmask_with_attnbias_maxk_256.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_forward_fp16_no_causalmask_with_attnbias_headdim_32.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_forward_fp16_no_causalmask_with_attnbias_maxk_32.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_forward_fp16_no_causalmask_with_attnbias_headdim_32.cpp
rename to xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_forward_fp16_no_causalmask_with_attnbias_maxk_32.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_forward_fp16_no_causalmask_with_attnbias_headdim_64.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_forward_fp16_no_causalmask_with_attnbias_maxk_64.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_forward_fp16_no_causalmask_with_attnbias_headdim_64.cpp
rename to xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_forward_fp16_no_causalmask_with_attnbias_maxk_64.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_forward_fp16_with_causalmask_no_attnbias_headdim_128.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_forward_fp16_with_causalmask_no_attnbias_maxk_128.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_forward_fp16_with_causalmask_no_attnbias_headdim_128.cpp
rename to xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_forward_fp16_with_causalmask_no_attnbias_maxk_128.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_forward_fp16_with_causalmask_no_attnbias_headdim_256.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_forward_fp16_with_causalmask_no_attnbias_maxk_256.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_forward_fp16_with_causalmask_no_attnbias_headdim_256.cpp
rename to xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_forward_fp16_with_causalmask_no_attnbias_maxk_256.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_forward_fp16_with_causalmask_no_attnbias_headdim_32.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_forward_fp16_with_causalmask_no_attnbias_maxk_32.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_forward_fp16_with_causalmask_no_attnbias_headdim_32.cpp
rename to xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_forward_fp16_with_causalmask_no_attnbias_maxk_32.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_forward_fp16_with_causalmask_no_attnbias_headdim_64.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_forward_fp16_with_causalmask_no_attnbias_maxk_64.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_forward_fp16_with_causalmask_no_attnbias_headdim_64.cpp
rename to xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_forward_fp16_with_causalmask_no_attnbias_maxk_64.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_forward_fp16_with_causalmask_with_attnbias_headdim_128.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_forward_fp16_with_causalmask_with_attnbias_maxk_128.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_forward_fp16_with_causalmask_with_attnbias_headdim_128.cpp
rename to xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_forward_fp16_with_causalmask_with_attnbias_maxk_128.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_forward_fp16_with_causalmask_with_attnbias_headdim_256.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_forward_fp16_with_causalmask_with_attnbias_maxk_256.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_forward_fp16_with_causalmask_with_attnbias_headdim_256.cpp
rename to xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_forward_fp16_with_causalmask_with_attnbias_maxk_256.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_forward_fp16_with_causalmask_with_attnbias_headdim_32.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_forward_fp16_with_causalmask_with_attnbias_maxk_32.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_forward_fp16_with_causalmask_with_attnbias_headdim_32.cpp
rename to xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_forward_fp16_with_causalmask_with_attnbias_maxk_32.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_forward_fp16_with_causalmask_with_attnbias_headdim_64.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_forward_fp16_with_causalmask_with_attnbias_maxk_64.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_forward_fp16_with_causalmask_with_attnbias_headdim_64.cpp
rename to xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_forward_fp16_with_causalmask_with_attnbias_maxk_64.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_infer_bp16_no_causalmask_no_attnbias_headdim_128.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_infer_bp16_no_causalmask_no_attnbias_maxk_128.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_infer_bp16_no_causalmask_no_attnbias_headdim_128.cpp
rename to xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_infer_bp16_no_causalmask_no_attnbias_maxk_128.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_infer_bp16_no_causalmask_no_attnbias_headdim_256.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_infer_bp16_no_causalmask_no_attnbias_maxk_256.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_infer_bp16_no_causalmask_no_attnbias_headdim_256.cpp
rename to xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_infer_bp16_no_causalmask_no_attnbias_maxk_256.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_infer_bp16_no_causalmask_no_attnbias_headdim_32.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_infer_bp16_no_causalmask_no_attnbias_maxk_32.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_infer_bp16_no_causalmask_no_attnbias_headdim_32.cpp
rename to xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_infer_bp16_no_causalmask_no_attnbias_maxk_32.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_infer_bp16_no_causalmask_no_attnbias_headdim_64.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_infer_bp16_no_causalmask_no_attnbias_maxk_64.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_infer_bp16_no_causalmask_no_attnbias_headdim_64.cpp
rename to xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_infer_bp16_no_causalmask_no_attnbias_maxk_64.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_infer_bp16_no_causalmask_with_attnbias_headdim_128.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_infer_bp16_no_causalmask_with_attnbias_maxk_128.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_infer_bp16_no_causalmask_with_attnbias_headdim_128.cpp
rename to xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_infer_bp16_no_causalmask_with_attnbias_maxk_128.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_infer_bp16_no_causalmask_with_attnbias_headdim_256.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_infer_bp16_no_causalmask_with_attnbias_maxk_256.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_infer_bp16_no_causalmask_with_attnbias_headdim_256.cpp
rename to xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_infer_bp16_no_causalmask_with_attnbias_maxk_256.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_infer_bp16_no_causalmask_with_attnbias_headdim_32.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_infer_bp16_no_causalmask_with_attnbias_maxk_32.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_infer_bp16_no_causalmask_with_attnbias_headdim_32.cpp
rename to xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_infer_bp16_no_causalmask_with_attnbias_maxk_32.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_infer_bp16_no_causalmask_with_attnbias_headdim_64.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_infer_bp16_no_causalmask_with_attnbias_maxk_64.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_infer_bp16_no_causalmask_with_attnbias_headdim_64.cpp
rename to xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_infer_bp16_no_causalmask_with_attnbias_maxk_64.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_infer_bp16_with_causalmask_no_attnbias_headdim_128.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_infer_bp16_with_causalmask_no_attnbias_maxk_128.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_infer_bp16_with_causalmask_no_attnbias_headdim_128.cpp
rename to xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_infer_bp16_with_causalmask_no_attnbias_maxk_128.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_infer_bp16_with_causalmask_no_attnbias_headdim_256.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_infer_bp16_with_causalmask_no_attnbias_maxk_256.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_infer_bp16_with_causalmask_no_attnbias_headdim_256.cpp
rename to xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_infer_bp16_with_causalmask_no_attnbias_maxk_256.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_infer_bp16_with_causalmask_no_attnbias_headdim_32.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_infer_bp16_with_causalmask_no_attnbias_maxk_32.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_infer_bp16_with_causalmask_no_attnbias_headdim_32.cpp
rename to xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_infer_bp16_with_causalmask_no_attnbias_maxk_32.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_infer_bp16_with_causalmask_no_attnbias_headdim_64.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_infer_bp16_with_causalmask_no_attnbias_maxk_64.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_infer_bp16_with_causalmask_no_attnbias_headdim_64.cpp
rename to xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_infer_bp16_with_causalmask_no_attnbias_maxk_64.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_infer_bp16_with_causalmask_with_attnbias_headdim_128.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_infer_bp16_with_causalmask_with_attnbias_maxk_128.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_infer_bp16_with_causalmask_with_attnbias_headdim_128.cpp
rename to xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_infer_bp16_with_causalmask_with_attnbias_maxk_128.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_infer_bp16_with_causalmask_with_attnbias_headdim_256.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_infer_bp16_with_causalmask_with_attnbias_maxk_256.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_infer_bp16_with_causalmask_with_attnbias_headdim_256.cpp
rename to xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_infer_bp16_with_causalmask_with_attnbias_maxk_256.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_infer_bp16_with_causalmask_with_attnbias_headdim_32.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_infer_bp16_with_causalmask_with_attnbias_maxk_32.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_infer_bp16_with_causalmask_with_attnbias_headdim_32.cpp
rename to xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_infer_bp16_with_causalmask_with_attnbias_maxk_32.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_infer_bp16_with_causalmask_with_attnbias_headdim_64.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_infer_bp16_with_causalmask_with_attnbias_maxk_64.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_infer_bp16_with_causalmask_with_attnbias_headdim_64.cpp
rename to xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_infer_bp16_with_causalmask_with_attnbias_maxk_64.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_infer_fp16_no_causalmask_no_attnbias_headdim_128.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_infer_fp16_no_causalmask_no_attnbias_maxk_128.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_infer_fp16_no_causalmask_no_attnbias_headdim_128.cpp
rename to xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_infer_fp16_no_causalmask_no_attnbias_maxk_128.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_infer_fp16_no_causalmask_no_attnbias_headdim_256.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_infer_fp16_no_causalmask_no_attnbias_maxk_256.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_infer_fp16_no_causalmask_no_attnbias_headdim_256.cpp
rename to xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_infer_fp16_no_causalmask_no_attnbias_maxk_256.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_infer_fp16_no_causalmask_no_attnbias_headdim_32.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_infer_fp16_no_causalmask_no_attnbias_maxk_32.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_infer_fp16_no_causalmask_no_attnbias_headdim_32.cpp
rename to xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_infer_fp16_no_causalmask_no_attnbias_maxk_32.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_infer_fp16_no_causalmask_no_attnbias_headdim_64.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_infer_fp16_no_causalmask_no_attnbias_maxk_64.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_infer_fp16_no_causalmask_no_attnbias_headdim_64.cpp
rename to xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_infer_fp16_no_causalmask_no_attnbias_maxk_64.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_infer_fp16_no_causalmask_with_attnbias_headdim_128.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_infer_fp16_no_causalmask_with_attnbias_maxk_128.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_infer_fp16_no_causalmask_with_attnbias_headdim_128.cpp
rename to xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_infer_fp16_no_causalmask_with_attnbias_maxk_128.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_infer_fp16_no_causalmask_with_attnbias_headdim_256.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_infer_fp16_no_causalmask_with_attnbias_maxk_256.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_infer_fp16_no_causalmask_with_attnbias_headdim_256.cpp
rename to xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_infer_fp16_no_causalmask_with_attnbias_maxk_256.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_infer_fp16_no_causalmask_with_attnbias_headdim_32.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_infer_fp16_no_causalmask_with_attnbias_maxk_32.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_infer_fp16_no_causalmask_with_attnbias_headdim_32.cpp
rename to xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_infer_fp16_no_causalmask_with_attnbias_maxk_32.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_infer_fp16_no_causalmask_with_attnbias_headdim_64.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_infer_fp16_no_causalmask_with_attnbias_maxk_64.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_infer_fp16_no_causalmask_with_attnbias_headdim_64.cpp
rename to xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_infer_fp16_no_causalmask_with_attnbias_maxk_64.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_infer_fp16_with_causalmask_no_attnbias_headdim_128.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_infer_fp16_with_causalmask_no_attnbias_maxk_128.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_infer_fp16_with_causalmask_no_attnbias_headdim_128.cpp
rename to xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_infer_fp16_with_causalmask_no_attnbias_maxk_128.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_infer_fp16_with_causalmask_no_attnbias_headdim_256.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_infer_fp16_with_causalmask_no_attnbias_maxk_256.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_infer_fp16_with_causalmask_no_attnbias_headdim_256.cpp
rename to xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_infer_fp16_with_causalmask_no_attnbias_maxk_256.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_infer_fp16_with_causalmask_no_attnbias_headdim_32.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_infer_fp16_with_causalmask_no_attnbias_maxk_32.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_infer_fp16_with_causalmask_no_attnbias_headdim_32.cpp
rename to xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_infer_fp16_with_causalmask_no_attnbias_maxk_32.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_infer_fp16_with_causalmask_no_attnbias_headdim_64.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_infer_fp16_with_causalmask_no_attnbias_maxk_64.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_infer_fp16_with_causalmask_no_attnbias_headdim_64.cpp
rename to xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_infer_fp16_with_causalmask_no_attnbias_maxk_64.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_infer_fp16_with_causalmask_with_attnbias_headdim_128.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_infer_fp16_with_causalmask_with_attnbias_maxk_128.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_infer_fp16_with_causalmask_with_attnbias_headdim_128.cpp
rename to xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_infer_fp16_with_causalmask_with_attnbias_maxk_128.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_infer_fp16_with_causalmask_with_attnbias_headdim_256.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_infer_fp16_with_causalmask_with_attnbias_maxk_256.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_infer_fp16_with_causalmask_with_attnbias_headdim_256.cpp
rename to xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_infer_fp16_with_causalmask_with_attnbias_maxk_256.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_infer_fp16_with_causalmask_with_attnbias_headdim_32.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_infer_fp16_with_causalmask_with_attnbias_maxk_32.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_infer_fp16_with_causalmask_with_attnbias_headdim_32.cpp
rename to xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_infer_fp16_with_causalmask_with_attnbias_maxk_32.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_infer_fp16_with_causalmask_with_attnbias_headdim_64.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_infer_fp16_with_causalmask_with_attnbias_maxk_64.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_infer_fp16_with_causalmask_with_attnbias_headdim_64.cpp
rename to xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_infer_fp16_with_causalmask_with_attnbias_maxk_64.cpp

From fd3672539b49a9f3ce540edf92d929c241f44749 Mon Sep 17 00:00:00 2001
From: Qianfeng Zhang <Qianfeng.Zhang@amd.com>
Date: Thu, 22 Feb 2024 15:38:57 +0000
Subject: [PATCH 484/641] Move some headers files to ck examples for later
 reusing

---
 setup.py                                      |   4 +
 third_party/composable_kernel_tiled           |   2 +-
 .../hip_fmha/ck_tiled_fmha_forward_kernel.h   | 664 ------------------
 .../hip_fmha/ck_tiled_fmha_fwd_epilogue.h     |  40 --
 .../ck_tiled_fmha_fwd_tile_partitioner.h      |  56 --
 5 files changed, 5 insertions(+), 761 deletions(-)
 delete mode 100644 xformers/csrc/attention/hip_fmha/ck_tiled_fmha_forward_kernel.h
 delete mode 100644 xformers/csrc/attention/hip_fmha/ck_tiled_fmha_fwd_epilogue.h
 delete mode 100644 xformers/csrc/attention/hip_fmha/ck_tiled_fmha_fwd_tile_partitioner.h

diff --git a/setup.py b/setup.py
index a2f15b020..73582fa86 100644
--- a/setup.py
+++ b/setup.py
@@ -356,6 +356,10 @@ def get_extensions():
             Path(this_dir) / "xformers" / "csrc" / "attention" / "hip_fmha"
         ]
 
+        include_dirs += [
+            Path(this_dir) / "third_party" / "composable_kernel_tiled" / "example" / "91_tile_program" / "xformers_fmha"
+        ]
+
         include_dirs += [
             Path(this_dir) / "third_party" / "composable_kernel_tiled" / "include"
         ]
diff --git a/third_party/composable_kernel_tiled b/third_party/composable_kernel_tiled
index 03d1d1ad9..b34434327 160000
--- a/third_party/composable_kernel_tiled
+++ b/third_party/composable_kernel_tiled
@@ -1 +1 @@
-Subproject commit 03d1d1ad9e0cc3c8e5d800d106bbdebe877e6e88
+Subproject commit b344343273cf6731ba0a47e061629890a8014af5
diff --git a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_forward_kernel.h b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_forward_kernel.h
deleted file mode 100644
index 58abc9efa..000000000
--- a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_forward_kernel.h
+++ /dev/null
@@ -1,664 +0,0 @@
-/*
- * Copyright (c) 2023-2024, Advanced Micro Devices, Inc. All rights reserved.
- *
- * This source code is licensed under the BSD-style license found in the
- * LICENSE file in the root directory of this source tree.
- */
-#pragma once
-
-#include <type_traits>
-
-#include <ck/tensor/tensor_view.hpp>
-#include <ck/tile_program/block_tile/block_masking.hpp>
-#include <ck/tile_program/tile/tile_window.hpp>
-#include <ck/utility/common_header.hpp>
-
-#include "ck_tiled_fmha_definitions.h"
-
-// S[seqlen_q, seqlen_k] = Q[seqlen_q, hdim_q] * K[seqlen_k, hdim_q]
-// S'[seqlen_q, seqlen_k] = S[seqlen_q, seqlen_k] * Scale[1]
-// S''[seqlen_q, seqlen_k] = S'[seqlen_q, seqlen_k] + Bias[seqlen_q, seqlen_k]
-// P[seqlen_q, seqlen_k] = Softmax(S[seqlen_q, seqlen_k])
-// O[seqlen_q, hdim_v] = P[seqlen_q, seqlen_k] * V[hdim_v, seqlen_k]
-
-template <
-    typename TilePartitioner_,
-    typename FmhaPipeline_,
-    typename EpiloguePipeline_>
-struct FmhaFwdKernel {
-  using TilePartitioner = ck::remove_cvref_t<TilePartitioner_>;
-  using FmhaPipeline = ck::remove_cvref_t<FmhaPipeline_>;
-  using EpiloguePipeline = ck::remove_cvref_t<EpiloguePipeline_>;
-  static constexpr ck::index_t kBlockSize = FmhaPipeline::kBlockSize;
-  static constexpr ck::index_t kBlockPerCu = FmhaPipeline::kBlockPerCu;
-
-  using QDataType = ck::remove_cvref_t<typename FmhaPipeline::QDataType>;
-  using KDataType = ck::remove_cvref_t<typename FmhaPipeline::KDataType>;
-  using VDataType = ck::remove_cvref_t<typename FmhaPipeline::VDataType>;
-  using BiasDataType = ck::remove_cvref_t<typename FmhaPipeline::BiasDataType>;
-  using LSEDataType = ck::remove_cvref_t<typename FmhaPipeline::LSEDataType>;
-  using ODataType = ck::remove_cvref_t<typename FmhaPipeline::ODataType>;
-
-  using VLayout = ck::remove_cvref_t<typename FmhaPipeline::VLayout>;
-
-  static constexpr bool kIsGroupMode = FmhaPipeline::kIsGroupMode;
-  static constexpr bool kPadSeqLenQ = FmhaPipeline::kPadSeqLenQ;
-  static constexpr bool kPadSeqLenK = FmhaPipeline::kPadSeqLenK;
-  static constexpr bool kPadHeadDimQ = FmhaPipeline::kPadHeadDimQ;
-  static constexpr bool kPadHeadDimV = FmhaPipeline::kPadHeadDimV;
-  static constexpr bool kHasBias = FmhaPipeline::kHasBias;
-  static constexpr bool kStoreLSE = FmhaPipeline::kStoreLSE;
-  using FmhaMask = ck::remove_cvref_t<typename FmhaPipeline::FmhaMask>;
-  static constexpr bool kHasMask = FmhaMask::IsMasking;
-
-  template <ck::index_t I> // to avoid duplicated base class prblem, introduce
-                           // an template arg
-  struct FmhaFwdEmptyKargs {};
-
-  // kargs use aggregate initializer, so no constructor will provided
-  // use inheritance to minimize karg size
-  // user need to use MakeKargs() function to create kargs.
-  struct FmhaFwdCommonKargs {
-    const void* q_ptr;
-    const void* k_ptr;
-    const void* v_ptr;
-    void* o_ptr;
-
-    ck::index_t seqlen_q;
-    ck::index_t seqlen_k;
-    ck::index_t hdim_q;
-    ck::index_t hdim_v;
-
-    // for MQA/GQA, nhead could be different. This parameter is nhead_q /
-    // nhead_k if this param is larger than 1, indicate MQA/GQA case
-    ck::index_t nhead_ratio_qk;
-    float scale;
-
-    ck::index_t stride_q;
-    ck::index_t stride_k;
-    ck::index_t stride_v;
-    ck::index_t stride_o;
-
-    ck::index_t nhead_stride_q;
-    ck::index_t nhead_stride_k;
-    ck::index_t nhead_stride_v;
-    ck::index_t nhead_stride_o;
-  };
-
-  struct FmhaFwdCommonBiasKargs {
-    const void* bias_ptr = nullptr;
-    ck::index_t stride_bias = 0;
-    ck::index_t nhead_stride_bias = 0;
-  };
-
-  struct FmhaFwdBatchModeBiasKargs : FmhaFwdCommonBiasKargs {
-    ck::index_t batch_stride_bias = 0;
-  };
-
-  struct FmhaFwdMaskKargs {
-    CausalMaskType mask_type;
-    ck::index_t window_size;
-  };
-
-  struct FmhaFwdCommonLSEKargs {
-    void* lse_ptr = nullptr;
-    ck::index_t nhead_stride_lse = 0;
-  };
-
-  struct FmhaFwdBatchModeLSEKargs : FmhaFwdCommonLSEKargs {
-    ck::index_t batch_stride_lse = 0;
-  };
-
-  struct FmhaFwdBatchModeKargs
-      : FmhaFwdCommonKargs,
-        std::conditional_t<
-            kHasBias,
-            FmhaFwdBatchModeBiasKargs,
-            FmhaFwdEmptyKargs<0>>,
-        std::conditional_t<kHasMask, FmhaFwdMaskKargs, FmhaFwdEmptyKargs<1>>,
-        std::conditional_t<
-            kStoreLSE,
-            FmhaFwdBatchModeLSEKargs,
-            FmhaFwdEmptyKargs<2>> {
-    ck::index_t batch_stride_q;
-    ck::index_t batch_stride_k;
-    ck::index_t batch_stride_v;
-    ck::index_t batch_stride_o;
-  };
-
-  struct FmhaFwdGroupModeKargs
-      : FmhaFwdCommonKargs,
-        std::conditional_t<
-            kHasBias,
-            FmhaFwdCommonBiasKargs,
-            FmhaFwdEmptyKargs<0>>,
-        std::conditional_t<kHasMask, FmhaFwdMaskKargs, FmhaFwdEmptyKargs<1>>,
-        std::conditional_t<
-            kStoreLSE,
-            FmhaFwdCommonLSEKargs,
-            FmhaFwdEmptyKargs<2>> {
-    const int32_t* seqstart_q_ptr;
-    const int32_t* seqstart_k_ptr;
-    const int32_t* seqlen_k_ptr;
-  };
-
-  using Kargs = std::
-      conditional_t<kIsGroupMode, FmhaFwdGroupModeKargs, FmhaFwdBatchModeKargs>;
-
-  template <bool Cond = !kIsGroupMode>
-  __host__ static constexpr std::enable_if_t<Cond, Kargs> MakeKargs(
-      const void* q_ptr,
-      const void* k_ptr,
-      const void* v_ptr,
-      const void* bias_ptr,
-      void* lse_ptr,
-      void* o_ptr,
-      ck::index_t seqlen_q,
-      ck::index_t seqlen_k,
-      ck::index_t hdim_q,
-      ck::index_t hdim_v,
-      ck::index_t nhead_ratio_qk,
-      float scale,
-      ck::index_t stride_q,
-      ck::index_t stride_k,
-      ck::index_t stride_v,
-      ck::index_t stride_bias,
-      ck::index_t stride_o,
-      ck::index_t nhead_stride_q,
-      ck::index_t nhead_stride_k,
-      ck::index_t nhead_stride_v,
-      ck::index_t nhead_stride_bias,
-      ck::index_t nhead_stride_lse,
-      ck::index_t nhead_stride_o,
-      ck::index_t batch_stride_q,
-      ck::index_t batch_stride_k,
-      ck::index_t batch_stride_v,
-      ck::index_t batch_stride_bias,
-      ck::index_t batch_stride_lse,
-      ck::index_t batch_stride_o,
-      CausalMaskType mask_type,
-      ck::index_t window_size) {
-    Kargs kargs{
-        {q_ptr,
-         k_ptr,
-         v_ptr,
-         o_ptr,
-         seqlen_q,
-         seqlen_k,
-         hdim_q,
-         hdim_v,
-         nhead_ratio_qk,
-#if CK_FMHA_FWD_FAST_EXP2
-         static_cast<float>(scale * ck::math::log2e_v<>),
-#else
-         scale,
-#endif
-         stride_q,
-         stride_k,
-         stride_v,
-         stride_o,
-         nhead_stride_q,
-         nhead_stride_k,
-         nhead_stride_v,
-         nhead_stride_o}, // args for common karg
-        {}, // placeholder for bias
-        {}, // placeholder for mask
-        {}, // placeholder for lse
-        batch_stride_q,
-        batch_stride_k,
-        batch_stride_v,
-        batch_stride_o};
-
-    if constexpr (kHasBias) {
-      kargs.bias_ptr = bias_ptr;
-      kargs.stride_bias = stride_bias;
-      kargs.nhead_stride_bias = nhead_stride_bias;
-      kargs.batch_stride_bias = batch_stride_bias;
-    }
-
-    if constexpr (kHasMask) {
-      kargs.mask_type = mask_type;
-      kargs.window_size = window_size;
-    }
-    if constexpr (kStoreLSE) {
-      kargs.lse_ptr = lse_ptr;
-      kargs.nhead_stride_lse = nhead_stride_lse;
-      kargs.batch_stride_lse = batch_stride_lse;
-    }
-
-    return kargs;
-  }
-
-  template <bool Cond = kIsGroupMode>
-  __host__ static constexpr std::enable_if_t<Cond, Kargs> MakeKargs(
-      const void* q_ptr,
-      const void* k_ptr,
-      const void* v_ptr,
-      const void* bias_ptr,
-      void* lse_ptr,
-      void* o_ptr,
-      const void* seqstart_q_ptr,
-      const void* seqstart_k_ptr,
-      const void* seqlen_k_ptr,
-      ck::index_t hdim_q,
-      ck::index_t hdim_v,
-      ck::index_t nhead_ratio_qk,
-      float scale,
-      ck::index_t stride_q,
-      ck::index_t stride_k,
-      ck::index_t stride_v,
-      ck::index_t stride_bias,
-      ck::index_t stride_o,
-      ck::index_t nhead_stride_q,
-      ck::index_t nhead_stride_k,
-      ck::index_t nhead_stride_v,
-      ck::index_t nhead_stride_bias,
-      ck::index_t nhead_stride_lse,
-      ck::index_t nhead_stride_o,
-      CausalMaskType mask_type,
-      ck::index_t window_size) {
-    Kargs kargs{
-        {q_ptr,
-         k_ptr,
-         v_ptr,
-         o_ptr,
-         -1, // seqlen will be updated by another pointer
-         -1, //
-         hdim_q,
-         hdim_v,
-         nhead_ratio_qk,
-#if CK_FMHA_FWD_FAST_EXP2
-         static_cast<float>(scale * ck::math::log2e_v<>),
-#else
-         scale,
-#endif
-         stride_q,
-         stride_k,
-         stride_v,
-         stride_o,
-         nhead_stride_q,
-         nhead_stride_k,
-         nhead_stride_v,
-         nhead_stride_o}, // args for common karg
-        {}, // placeholder for bias
-        {}, // placeholder for mask
-        {}, // placeholder for lse
-        reinterpret_cast<const int32_t*>(seqstart_q_ptr),
-        reinterpret_cast<const int32_t*>(seqstart_k_ptr),
-        reinterpret_cast<const int32_t*>(seqlen_k_ptr)};
-
-    if constexpr (kHasBias) {
-      kargs.bias_ptr = bias_ptr;
-      kargs.stride_bias = stride_bias;
-      kargs.nhead_stride_bias = nhead_stride_bias;
-    }
-    if constexpr (kHasMask) {
-      kargs.mask_type = mask_type;
-      kargs.window_size = window_size;
-    }
-    if constexpr (kStoreLSE) {
-      kargs.lse_ptr = lse_ptr;
-      kargs.nhead_stride_lse = nhead_stride_lse;
-    }
-
-    return kargs;
-  }
-
-  __host__ static constexpr auto GridSize(
-      ck::index_t batch_size_,
-      ck::index_t nhead_,
-      ck::index_t seqlen_q_,
-      ck::index_t hdim_v_) {
-    return TilePartitioner::GridSize(batch_size_, nhead_, seqlen_q_, hdim_v_);
-  }
-
-  __host__ static constexpr auto BlockSize() {
-    return dim3(kBlockSize);
-  }
-
-  __host__ __device__ static constexpr ck::index_t GetSmemSize() {
-    return ck::math::max(
-        FmhaPipeline::GetSmemSize(), EpiloguePipeline::GetSmemSize());
-  }
-
-  __device__ void operator()(Kargs kargs) const {
-    using namespace ck;
-    using namespace ck::tile_program;
-    using namespace ck::tile_program::block;
-
-    // allocate LDS
-    __shared__ char smem_ptr[GetSmemSize()];
-
-    // divide problem
-    const auto [i_tile_m, i_tile_n, i_nhead, i_batch] =
-        TilePartitioner{}(kargs.seqlen_q, kargs.hdim_v);
-
-    const index_t i_m0 =
-        __builtin_amdgcn_readfirstlane(i_tile_m * FmhaPipeline::kM0);
-    const index_t i_n1 =
-        __builtin_amdgcn_readfirstlane(i_tile_n * FmhaPipeline::kN1);
-
-    long_index_t batch_offset_q = 0;
-    long_index_t batch_offset_k = 0;
-    long_index_t batch_offset_v = 0;
-    long_index_t batch_offset_bias = 0;
-    long_index_t batch_offset_lse = 0;
-    long_index_t batch_offset_o = 0;
-
-    if constexpr (kIsGroupMode) {
-      // get starting offset for each batch
-      const long_index_t query_start = kargs.seqstart_q_ptr[i_batch];
-      const long_index_t key_start = kargs.seqstart_k_ptr[i_batch];
-
-      batch_offset_q = query_start * kargs.stride_q;
-      batch_offset_k = key_start * kargs.stride_k;
-      if constexpr (ck::is_same_v<VLayout, ck::tensor_layout::gemm::RowMajor>) {
-        batch_offset_v = key_start * kargs.stride_v;
-      } else {
-        batch_offset_v = key_start;
-      }
-      if constexpr (kHasBias) {
-        batch_offset_bias = query_start * kargs.stride_bias + key_start;
-      } else {
-        batch_offset_bias = key_start;
-      }
-      if constexpr (kStoreLSE) {
-        batch_offset_lse = query_start;
-      }
-      batch_offset_o = query_start * kargs.stride_o;
-
-      // get real # queries & # keys under group mode
-      const auto adjusted_seqstart_q_ptr = kargs.seqstart_q_ptr + i_batch;
-      kargs.seqlen_q = adjusted_seqstart_q_ptr[1] - adjusted_seqstart_q_ptr[0];
-
-      // # of required blocks is different in each groups, terminate unnecessary
-      // blocks earlier
-      if (kargs.seqlen_q <= i_m0) {
-        return;
-      }
-
-      if (kargs.seqlen_k_ptr != nullptr) {
-        kargs.seqlen_k = kargs.seqlen_k_ptr[i_batch];
-      } else {
-        const auto adjusted_seqstart_k_ptr = kargs.seqstart_k_ptr + i_batch;
-        kargs.seqlen_k =
-            adjusted_seqstart_k_ptr[1] - adjusted_seqstart_k_ptr[0];
-      }
-    } else {
-      batch_offset_q =
-          static_cast<long_index_t>(i_batch) * kargs.batch_stride_q;
-      batch_offset_k =
-          static_cast<long_index_t>(i_batch) * kargs.batch_stride_k;
-      batch_offset_v =
-          static_cast<long_index_t>(i_batch) * kargs.batch_stride_v;
-      if constexpr (kHasBias) {
-        batch_offset_bias =
-            static_cast<long_index_t>(i_batch) * kargs.batch_stride_bias;
-      }
-      if constexpr (kStoreLSE) {
-        batch_offset_lse =
-            static_cast<long_index_t>(i_batch) * kargs.batch_stride_lse;
-      }
-      batch_offset_o =
-          static_cast<long_index_t>(i_batch) * kargs.batch_stride_o;
-    }
-
-    // for simplicity, batch stride we just modify the pointer
-    const QDataType* q_ptr = reinterpret_cast<const QDataType*>(kargs.q_ptr) +
-        static_cast<long_index_t>(i_nhead) * kargs.nhead_stride_q +
-        batch_offset_q;
-    const KDataType* k_ptr = reinterpret_cast<const KDataType*>(kargs.k_ptr) +
-        static_cast<long_index_t>(i_nhead / kargs.nhead_ratio_qk) *
-            kargs.nhead_stride_k +
-        batch_offset_k;
-    const VDataType* v_ptr = reinterpret_cast<const VDataType*>(kargs.v_ptr) +
-        static_cast<long_index_t>(i_nhead / kargs.nhead_ratio_qk) *
-            kargs.nhead_stride_v +
-        batch_offset_v;
-    ODataType* o_ptr = reinterpret_cast<ODataType*>(kargs.o_ptr) +
-        static_cast<long_index_t>(i_nhead) * kargs.nhead_stride_o +
-        batch_offset_o;
-
-    // Q/K/V DRAM and DRAM window
-    const auto q_dram = [&]() {
-      const auto q_dram_naive =
-          make_naive_tensor_view<AddressSpaceEnum::Global>(
-              q_ptr,
-              make_tuple(kargs.seqlen_q, kargs.hdim_q),
-              make_tuple(kargs.stride_q, 1),
-              Number<32>{},
-              Number<1>{});
-      if constexpr (FmhaPipeline::kQLoadOnce) {
-        return pad_tensor_view(
-            q_dram_naive,
-            make_tuple(
-                Number<FmhaPipeline::kM0>{},
-                Number<FmhaPipeline::kK0BlockLength>{}),
-            Sequence<kPadSeqLenQ, kPadHeadDimQ>{});
-      } else {
-        return pad_tensor_view(
-            q_dram_naive,
-            make_tuple(
-                Number<FmhaPipeline::kM0>{}, Number<FmhaPipeline::kK0>{}),
-            Sequence<kPadSeqLenQ, kPadHeadDimQ>{});
-      }
-    }();
-    const auto k_dram = [&]() {
-      const auto k_dram_naive =
-          make_naive_tensor_view<AddressSpaceEnum::Global>(
-              k_ptr,
-              make_tuple(kargs.seqlen_k, kargs.hdim_q),
-              make_tuple(kargs.stride_k, 1),
-              Number<32>{},
-              Number<1>{});
-
-      return pad_tensor_view(
-          k_dram_naive,
-          make_tuple(Number<FmhaPipeline::kN0>{}, Number<FmhaPipeline::kK0>{}),
-          Sequence<kPadSeqLenK, kPadHeadDimQ>{});
-    }();
-    const auto v_dram = [&]() {
-      if constexpr (ck::is_same_v<VLayout, ck::tensor_layout::gemm::RowMajor>) {
-        const auto v_dram_naive =
-            make_naive_tensor_view<AddressSpaceEnum::Global>(
-                v_ptr,
-                make_tuple(kargs.seqlen_k, kargs.hdim_v),
-                make_tuple(kargs.stride_v, 1),
-                Number<32>{},
-                Number<1>{});
-
-        const auto v_dram_transposed = transform_tensor_view(
-            v_dram_naive,
-            make_tuple(
-                make_pass_through_transform(kargs.seqlen_k),
-                make_pass_through_transform(kargs.hdim_v)),
-            make_tuple(Sequence<0>{}, Sequence<1>{}),
-            make_tuple(Sequence<1>{}, Sequence<0>{}));
-
-        return pad_tensor_view(
-            v_dram_transposed,
-            make_tuple(
-                Number<FmhaPipeline::kN1>{}, Number<FmhaPipeline::kK1>{}),
-            Sequence<kPadHeadDimV, kPadSeqLenK>{});
-      } else {
-        const auto v_dram_naive =
-            make_naive_tensor_view<AddressSpaceEnum::Global>(
-                v_ptr,
-                make_tuple(kargs.hdim_v, kargs.seqlen_k),
-                make_tuple(kargs.stride_v, 1),
-                Number<32>{},
-                Number<1>{});
-
-        return pad_tensor_view(
-            v_dram_naive,
-            make_tuple(
-                Number<FmhaPipeline::kN1>{}, Number<FmhaPipeline::kK1>{}),
-            Sequence<kPadHeadDimV, kPadSeqLenK>{});
-      }
-    }();
-
-    auto q_dram_window = make_tile_window(
-        q_dram,
-        [&]() {
-          if constexpr (FmhaPipeline::kQLoadOnce)
-            return make_tuple(
-                Number<FmhaPipeline::kM0>{},
-                Number<FmhaPipeline::kK0BlockLength>{});
-          else
-            return make_tuple(
-                Number<FmhaPipeline::kM0>{}, Number<FmhaPipeline::kK0>{});
-        }(),
-        {i_m0, 0});
-
-    auto k_dram_window = make_tile_window(
-        k_dram,
-        make_tuple(Number<FmhaPipeline::kN0>{}, Number<FmhaPipeline::kK0>{}),
-        {0, 0});
-
-    auto v_dram_window = make_tile_window(
-        v_dram,
-        make_tuple(Number<FmhaPipeline::kN1>{}, Number<FmhaPipeline::kK1>{}),
-        {i_n1, 0});
-    /// FIXME: Before C++20, capturing structured binding variables is not
-    /// supported. Remove following copy capture of the 'i_nhead'
-    ///        if compiled in C++20
-    const auto bias_dram_window = [&, i_nhead_ = i_nhead]() {
-      constexpr auto bias_dram_window_lengths =
-          make_tuple(Number<FmhaPipeline::kM0>{}, Number<FmhaPipeline::kN0>{});
-      if constexpr (kHasBias) {
-        const BiasDataType* bias_ptr =
-            reinterpret_cast<const BiasDataType*>(kargs.bias_ptr) +
-            static_cast<long_index_t>(i_nhead_) * kargs.nhead_stride_bias +
-            batch_offset_bias;
-
-        const auto bias_dram = [&]() {
-          const auto bias_dram_naive =
-              make_naive_tensor_view<AddressSpaceEnum::Global>(
-                  bias_ptr,
-                  make_tuple(kargs.seqlen_q, kargs.seqlen_k),
-                  make_tuple(kargs.stride_bias, 1),
-                  Number<32>{},
-                  Number<1>{});
-
-          return pad_tensor_view(
-              bias_dram_naive,
-              bias_dram_window_lengths,
-              Sequence<kPadSeqLenQ, kPadSeqLenK>{});
-        }();
-
-        return make_tile_window(bias_dram, bias_dram_window_lengths, {i_m0, 0});
-      } else {
-        return make_null_tile_window(bias_dram_window_lengths);
-      }
-    }();
-
-    // lse
-    auto lse_dram_window = [&, i_nhead_ = i_nhead]() {
-      constexpr auto lse_dram_window_lengths =
-          make_tuple(Number<FmhaPipeline::kM0>{});
-      if constexpr (kStoreLSE) {
-        LSEDataType* lse_ptr = reinterpret_cast<LSEDataType*>(kargs.lse_ptr) +
-            static_cast<long_index_t>(i_nhead_) * kargs.nhead_stride_lse +
-            batch_offset_lse;
-
-        const auto lse_dram = [&]() {
-          const auto lse_dram_naive =
-              make_naive_tensor_view<AddressSpaceEnum::Global>(
-                  lse_ptr,
-                  make_tuple(kargs.seqlen_q),
-                  make_tuple(1),
-                  Number<1>{},
-                  Number<1>{});
-
-          return pad_tensor_view(
-              lse_dram_naive, lse_dram_window_lengths, Sequence<kPadSeqLenQ>{});
-        }();
-
-        return make_tile_window(lse_dram, lse_dram_window_lengths, {i_m0});
-      } else {
-        return make_null_tile_window(lse_dram_window_lengths);
-      }
-    }();
-
-    FmhaMask mask = [&]() {
-      if constexpr (kHasMask) {
-        auto res = ck::make_tuple(
-            ck::index_t{0}, ck::index_t{0}, ck::index_t{0}, ck::index_t{0});
-
-        if (kargs.window_size > 0) {
-          if (kargs.mask_type == CausalMaskType::MaskDisabled) {
-            ck::index_t left_size = kargs.window_size / 2;
-            ck::index_t right_size = kargs.window_size - 1 - left_size;
-
-            res = ck::make_generic_attention_mask_coordinates_from_lr_window(
-                left_size, right_size, kargs.seqlen_q, kargs.seqlen_k);
-          } else {
-            bool is_topleft =
-                (kargs.mask_type ==
-                 CausalMaskType::MaskUpperTriangleFromTopLeft);
-
-            res = ck::make_generic_attention_mask_coordinates_from_lr_window(
-                kargs.window_size - 1,
-                0,
-                kargs.seqlen_q,
-                kargs.seqlen_k,
-                is_topleft);
-          }
-        } else {
-          if (kargs.mask_type == CausalMaskType::MaskDisabled) {
-            res = ck::make_generic_attention_mask_coordinates_from_lr_window(
-                -1, -1, kargs.seqlen_q, kargs.seqlen_k);
-          } else {
-            bool is_topleft =
-                (kargs.mask_type ==
-                 CausalMaskType::MaskUpperTriangleFromTopLeft);
-
-            res = ck::make_generic_attention_mask_coordinates_from_lr_window(
-                -1, 0, kargs.seqlen_q, kargs.seqlen_k, is_topleft);
-          }
-        }
-
-        auto y = res.At(ck::Number<0>{});
-        auto x = res.At(ck::Number<1>{});
-
-        return FmhaMask{y, x, kargs.seqlen_q, kargs.seqlen_k};
-      } else
-        return FmhaMask{0, 0, kargs.seqlen_q, kargs.seqlen_k};
-    }();
-
-    auto o_acc_tile = FmhaPipeline{}(
-        q_dram_window,
-        k_dram_window,
-        v_dram_window,
-        bias_dram_window,
-        lse_dram_window,
-        mask,
-        kargs.scale,
-        // ck::math::integer_divide_ceil(kargs.seqlen_k, FmhaPipeline::kN0),
-        // ck::math::integer_divide_ceil(kargs.hdim_q, FmhaPipeline::kK0),
-        smem_ptr);
-
-    // O DRAM and O DRAM window
-    auto o_dram = [&]() {
-      const auto o_dram_naive =
-          make_naive_tensor_view<AddressSpaceEnum::Global>(
-              o_ptr,
-              make_tuple(kargs.seqlen_q, kargs.hdim_v),
-              make_tuple(kargs.stride_o, 1),
-              Number<32>{},
-              Number<1>{});
-
-      return pad_tensor_view(
-          o_dram_naive,
-          make_tuple(Number<FmhaPipeline::kM0>{}, Number<FmhaPipeline::kN1>{}),
-          Sequence<kPadSeqLenQ, kPadHeadDimV>{});
-    }();
-
-    auto o_dram_window = make_tile_window(
-        o_dram,
-        make_tuple(Number<FmhaPipeline::kM0>{}, Number<FmhaPipeline::kN1>{}),
-        {i_m0, i_n1});
-
-    EpiloguePipeline{}(o_dram_window, o_acc_tile);
-  }
-};
diff --git a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_fwd_epilogue.h b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_fwd_epilogue.h
deleted file mode 100644
index 9dde0c97c..000000000
--- a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_fwd_epilogue.h
+++ /dev/null
@@ -1,40 +0,0 @@
-/*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
- *
- * This source code is licensed under the BSD-style license found in the
- * LICENSE file in the root directory of this source tree.
- */
-#pragma once
-
-#include "ck/tile_program/tile/store_tile.hpp"
-#include "ck/tile_program/tile/tile_elementwise.hpp"
-#include "ck/utility/common_header.hpp"
-
-template <typename OaccDataType_, typename ODataType_>
-struct FmhaFwdEpilogueProblem {
-  using OaccDataType = ck::remove_cvref_t<OaccDataType_>;
-  using ODataType = ck::remove_cvref_t<ODataType_>;
-};
-
-template <typename Problem_, typename Policy_ = void>
-struct FmhaFwdEpilogue {
-  using Problem = ck::remove_cvref_t<Problem_>;
-  using OaccDataType = ck::remove_cvref_t<typename Problem::OaccDataType>;
-  using ODataType = ck::remove_cvref_t<typename Problem::ODataType>;
-
-  __host__ __device__ static constexpr ck::index_t GetSmemSize() {
-    return 0;
-  }
-
-  template <typename ODramWindowTmp, typename OAccTile>
-  __device__ auto operator()(
-      ODramWindowTmp& o_dram_window_tmp,
-      const OAccTile& o_acc_tile) {
-    using namespace ck;
-    using namespace ck::tile_program;
-
-    const auto o =
-        tile_elementwise_in(type_convert<ODataType, OaccDataType>, o_acc_tile);
-    store_tile(o_dram_window_tmp, o);
-  }
-};
diff --git a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_fwd_tile_partitioner.h b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_fwd_tile_partitioner.h
deleted file mode 100644
index 34537d707..000000000
--- a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_fwd_tile_partitioner.h
+++ /dev/null
@@ -1,56 +0,0 @@
-/*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
- *
- * This source code is licensed under the BSD-style license found in the
- * LICENSE file in the root directory of this source tree.
- */
-#pragma once
-
-#include "ck/tile_program/tile/store_tile.hpp"
-#include "ck/tile_program/tile/tile_elementwise.hpp"
-#include "ck/utility/common_header.hpp"
-
-template <typename BlockFmhaShape_>
-struct FmhaFwdTilePartitioner {
-  using BlockFmhaShape = ck::remove_cvref_t<BlockFmhaShape_>;
-
-  static constexpr ck::index_t kM0 = BlockFmhaShape::kM0;
-  static constexpr ck::index_t kN0 = BlockFmhaShape::kN0;
-  static constexpr ck::index_t kK0 = BlockFmhaShape::kK0;
-  static constexpr ck::index_t kN1 = BlockFmhaShape::kN1;
-  static constexpr ck::index_t kK1 = BlockFmhaShape::kK1;
-
-  __host__ static constexpr auto GridSize(
-      ck::index_t batch_size_,
-      ck::index_t nhead_,
-      ck::index_t seqlen_q_,
-      ck::index_t hdim_v_) {
-    // TODO: this may need tuning
-    return dim3(
-        ck::math::integer_divide_ceil(seqlen_q_, kM0) *
-            ck::math::integer_divide_ceil(hdim_v_, kN1),
-        nhead_,
-        batch_size_);
-  }
-
-  __device__ auto operator()(ck::index_t /*seqlen_q*/, ck::index_t hdim_v) {
-    using namespace ck;
-
-    // const index_t num_tile_m0 = seqlen_q / kM0;
-    const index_t num_tile_n1 = ck::math::integer_divide_ceil(hdim_v, kN1);
-
-    const index_t i_block = blockIdx.x;
-    const index_t i_nhead = blockIdx.y;
-    const index_t i_batch = blockIdx.z;
-
-    const auto f = [](index_t dividend, index_t divisor) {
-      index_t quotient = dividend / divisor;
-      index_t modulus = dividend - quotient * divisor;
-      return ck::make_tuple(quotient, modulus);
-    };
-
-    const auto [i_tile_m, i_tile_n] = f(i_block, num_tile_n1);
-
-    return ck::make_tuple(i_tile_m, i_tile_n, i_nhead, i_batch);
-  }
-};

From d8384c13270ed2fc0bd06fc55c5a8b10bdf81e57 Mon Sep 17 00:00:00 2001
From: Qianfeng Zhang <Qianfeng.Zhang@amd.com>
Date: Thu, 22 Feb 2024 17:38:43 +0000
Subject: [PATCH 485/641] Replace using qs_ks_vs pipeline by qr_ks_vs pipeline
 while HeadDim is 256 for better performance

---
 .../csrc/attention/hip_fmha/ck_tiled_fmha_batched_forward.h    | 3 +--
 xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_infer.h | 3 +--
 .../csrc/attention/hip_fmha/ck_tiled_fmha_grouped_forward.h    | 3 +--
 xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_infer.h | 3 +--
 4 files changed, 4 insertions(+), 8 deletions(-)

diff --git a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_forward.h b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_forward.h
index ccbfd2d86..3dc0c4717 100644
--- a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_forward.h
+++ b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_forward.h
@@ -21,7 +21,6 @@
 #include <ck/tile_program/block_tile_pipeline/block_fmha_pipeline_problem.hpp>
 #include <ck/tile_program/block_tile_pipeline/block_fmha_pipeline_qr_ks_vs.hpp>
 #include <ck/tile_program/block_tile_pipeline/block_fmha_pipeline_qr_ks_vs_async.hpp>
-#include <ck/tile_program/block_tile_pipeline/block_fmha_pipeline_qs_ks_vs.hpp>
 #include <ck/tile_program/tile/tile_fmha_shape.hpp>
 #include <ck/tile_program/tile/tile_fmha_traits.hpp>
 
@@ -105,7 +104,7 @@ struct batched_forward_causalmask_attnbias_dispatched {
                   FmhaPipelineProblemTemp<FmhaTraits, FmhaMask>;
 
               using FmhaPipeline =
-                  ck::tile_program::block::BlockFmhaPipelineQSKSVS<
+                  ck::tile_program::block::BlockFmhaPipelineQRKSVS<
                       FmhaPipelineProblem>;
               using FmhaKernel = FmhaFwdKernel<
                   FmhaTilePartitioner,
diff --git a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_infer.h b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_infer.h
index af3ded107..8696e0437 100644
--- a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_infer.h
+++ b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_infer.h
@@ -21,7 +21,6 @@
 #include <ck/tile_program/block_tile_pipeline/block_fmha_pipeline_problem.hpp>
 #include <ck/tile_program/block_tile_pipeline/block_fmha_pipeline_qr_ks_vs.hpp>
 #include <ck/tile_program/block_tile_pipeline/block_fmha_pipeline_qr_ks_vs_async.hpp>
-#include <ck/tile_program/block_tile_pipeline/block_fmha_pipeline_qs_ks_vs.hpp>
 #include <ck/tile_program/tile/tile_fmha_shape.hpp>
 #include <ck/tile_program/tile/tile_fmha_traits.hpp>
 
@@ -105,7 +104,7 @@ struct batched_infer_causalmask_attnbias_dispatched {
                   FmhaPipelineProblemTemp<FmhaTraits, FmhaMask>;
 
               using FmhaPipeline =
-                  ck::tile_program::block::BlockFmhaPipelineQSKSVS<
+                  ck::tile_program::block::BlockFmhaPipelineQRKSVS<
                       FmhaPipelineProblem>;
               using FmhaKernel = FmhaFwdKernel<
                   FmhaTilePartitioner,
diff --git a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_forward.h b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_forward.h
index a79b3c1ef..ed0df2ba5 100644
--- a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_forward.h
+++ b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_forward.h
@@ -21,7 +21,6 @@
 #include <ck/tile_program/block_tile/block_masking.hpp>
 #include <ck/tile_program/block_tile_pipeline/block_fmha_pipeline_problem.hpp>
 #include <ck/tile_program/block_tile_pipeline/block_fmha_pipeline_qr_ks_vs.hpp>
-#include <ck/tile_program/block_tile_pipeline/block_fmha_pipeline_qs_ks_vs.hpp>
 #include <ck/tile_program/tile/tile_fmha_shape.hpp>
 #include <ck/tile_program/tile/tile_fmha_traits.hpp>
 
@@ -98,7 +97,7 @@ struct grouped_forward_causalmask_attnbias_dispatched {
                   FmhaPipelineProblemTemp<FmhaTraits, FmhaMask>;
 
               using FmhaPipeline =
-                  ck::tile_program::block::BlockFmhaPipelineQSKSVS<
+                  ck::tile_program::block::BlockFmhaPipelineQRKSVS<
                       FmhaPipelineProblem>;
               using FmhaKernel = FmhaFwdKernel<
                   FmhaTilePartitioner,
diff --git a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_infer.h b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_infer.h
index 37be384c7..c371b0aa1 100644
--- a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_infer.h
+++ b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_infer.h
@@ -21,7 +21,6 @@
 #include <ck/tile_program/block_tile/block_masking.hpp>
 #include <ck/tile_program/block_tile_pipeline/block_fmha_pipeline_problem.hpp>
 #include <ck/tile_program/block_tile_pipeline/block_fmha_pipeline_qr_ks_vs.hpp>
-#include <ck/tile_program/block_tile_pipeline/block_fmha_pipeline_qs_ks_vs.hpp>
 #include <ck/tile_program/tile/tile_fmha_shape.hpp>
 #include <ck/tile_program/tile/tile_fmha_traits.hpp>
 
@@ -98,7 +97,7 @@ struct grouped_infer_causalmask_attnbias_dispatched {
                   FmhaPipelineProblemTemp<FmhaTraits, FmhaMask>;
 
               using FmhaPipeline =
-                  ck::tile_program::block::BlockFmhaPipelineQSKSVS<
+                  ck::tile_program::block::BlockFmhaPipelineQRKSVS<
                       FmhaPipelineProblem>;
               using FmhaKernel = FmhaFwdKernel<
                   FmhaTilePartitioner,

From 10346dfc64aed5661c0b93aeddf5aed5f99c3266 Mon Sep 17 00:00:00 2001
From: Max Podkorytov <4273004+tenpercent@users.noreply.github.com>
Date: Thu, 22 Feb 2024 18:59:18 +0000
Subject: [PATCH 486/641] rm test_ck_7

---
 tests/test_ck_7.py | 875 ---------------------------------------------
 1 file changed, 875 deletions(-)
 delete mode 100644 tests/test_ck_7.py

diff --git a/tests/test_ck_7.py b/tests/test_ck_7.py
deleted file mode 100644
index 7477c3f70..000000000
--- a/tests/test_ck_7.py
+++ /dev/null
@@ -1,875 +0,0 @@
-# Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
-#
-# This source code is licensed under the BSD license found in the
-# LICENSE file in the root directory of this source tree.
-
-import random
-from typing import List, Optional, Sequence, Tuple, Type, TypeVar
-
-import pytest
-import torch
-
-import xformers.ops
-from xformers.ops import fmha
-from xformers.ops.fmha.common import AttentionOpBase
-
-from .utils import assert_allclose
-
-torch.backends.cuda.matmul.allow_tf32 = False
-cuda_only = pytest.mark.skipif(not torch.cuda.is_available(), reason="requires CUDA")
-
-_devices = ["cpu", "cuda"] if torch.cuda.is_available() else ["cpu"]
-_types = [torch.float16, torch.bfloat16]
-
-T = TypeVar(
-    "T", Type[fmha.common.AttentionFwOpBase], Type[fmha.common.AttentionBwOpBase]
-)
-
-ALL_FW_OPS: Sequence[Type[fmha.common.AttentionFwOpBase]] = [
-    fmha.ck.FwOp,
-]
-
-ALL_BW_OPS: Sequence[Type[fmha.common.AttentionBwOpBase]] = [
-    fmha.ck.BwOp,
-]
-
-
-def sample_random_supported_fw(
-    inp: fmha.Inputs, seed: int
-) -> Type[fmha.common.AttentionFwOpBase]:
-    r = random.Random(seed)
-    fw_ops = list(ALL_FW_OPS)
-    r.shuffle(fw_ops)
-    for op in fw_ops:
-        if op.supports(inp):
-            return op
-    raise NotImplementedError(f"Could not find a FW operator for: {inp}")
-
-
-def generate_test_shapes_B_Mq_Mkv_H_K_Kv(op):
-    shapes = []
-    for B in op._TEST_BATCH_SIZES:
-        for Mq in [32, 256]:
-            for Mkv in [32, 64, 256, 1024]:
-                for K in op._TEST_K:
-                    shapes.append((B, Mq, Mkv, 1, K, K))
-        Mq = 256
-        Mkv = 128
-        K = 32
-        H = 1
-        # Weird values of parameters
-        for M in [2, 3, 15, 31, 32, 34, 68, 72, 90, 132, 136]:
-            shapes.append((B, M, Mkv, H, K, K))
-            shapes.append((B, Mq, M, H, K, K))
-        for _K in [1, 2, 3, 31, 34, 36, 38, 40, 64, 80, 160, 256 + 2, 256 + 8, 512]:
-            if _K <= op.SUPPORTED_MAX_K:
-                shapes.append((B, Mq, Mkv, H, _K, _K))
-        # Different value for K / Kv
-        if op.SUPPORTS_DIFFERENT_VALUE_EMBED:
-            for _K in [32, 36, 64, 256 + 8]:
-                shapes.append((B, Mq, Mkv, H, K, _K))
-                shapes.append((B, Mq, Mkv, H, _K, K))
-        # Exotic sizes
-        for K in op._TEST_K:
-            shapes.append((B, 16, 1024, H, K, K))
-            shapes.append((B, 1024, 16, H, K, K))
-        # Some number of heads
-        for H in [3, 5, 12]:
-            shapes.append((max(1, B // H), Mq, Mkv, H, K, K))
-    # Filter-out not supported shapes
-    shapes = [
-        shape
-        for shape in shapes
-        if len(
-            op.shape_not_supported_reasons(
-                Mq=shape[1], Mkv=shape[2], K=shape[4], Kv=shape[5]
-            )
-        )
-        == 0
-    ]
-    # Add some random shapes
-    if op in [
-        fmha.ck.FwOp,
-        fmha.ck.BwOp,
-    ]:
-        K_CHOICES = [8 * i for i in range(1, 256 // 8)]
-        r = random.Random(0)
-        found_count = 0
-        while found_count < 20:
-            B = r.randint(1, 400)
-            Mq = r.randint(1, 500)
-            Mkv = r.randint(1, 500)
-            H = r.randint(2, 11)
-            B = max(B // H, 1)
-            K = r.choice(K_CHOICES)
-            Kv = r.choice(K_CHOICES)
-            if not op.SUPPORTS_DIFFERENT_VALUE_EMBED:
-                Kv = K
-            if len(op.shape_not_supported_reasons(Mq, Mkv, K, Kv)):
-                continue
-            found_count += 1
-            shapes.append((B, Mq, Mkv, H, K, Kv))
-    return shapes
-
-
-def make_id(op, device, dtype, bias_type, *shape):
-    return (
-        f"{op.NAME}-{device}-{str(dtype)}-{bias_type.__name__}"
-        f"-{'-'.join([str(s) for s in shape])}"
-    )
-
-
-def _generate_op_device_dtype_biasT_B_Mq_Mkv_H_K_Kv(
-    ops_list: Sequence[Type[fmha.AttentionOpBase]], max_shapes_per_op: int = 65000
-):
-    r = random.Random(0)
-    combination = []
-    ids = []
-    for op in ops_list:
-        op_count = 0
-        # Sort list of masks, so it's deterministic across runs
-        LIST_MASKS = list(sorted(op.SUPPORTED_ATTN_BIAS_TYPES, key=lambda x: str(x)))
-        for shape in generate_test_shapes_B_Mq_Mkv_H_K_Kv(op):
-            has_one = False
-            for device in _devices:
-                if device not in op.SUPPORTED_DEVICES:
-                    continue
-                for dtype in op.SUPPORTED_DTYPES:
-                    bias_type = r.choice(LIST_MASKS)
-                    # Avoid using too much memory
-                    if bias_type not in [
-                        type(None),
-                        fmha.attn_bias.LowerTriangularMask,
-                    ]:
-                        B, Mq, Mkv, H, K, Kv = shape
-                        B = min(B, 12)
-
-                        if (
-                            bias_type
-                            is fmha.attn_bias.BlockDiagonalCausalFromBottomRightMask
-                        ):
-                            Mq, Mkv = min(Mkv, Mq), max(Mkv, Mq) + 2
-                        elif (
-                            bias_type
-                            is fmha.attn_bias.BlockDiagonalCausalWithOffsetPaddedKeysMask
-                        ):
-                            Mq, Mkv = min(Mkv, Mq), max(Mkv, Mq)
-                        shape = (B, Mq, Mkv, H, K, Kv)
-                    combination.append((op, device, dtype, bias_type, *shape))
-                    ids.append(
-                        f"{op.NAME}-{device}-{str(dtype)}-{bias_type.__name__}"
-                        f"-{'-'.join([str(s) for s in shape])}"
-                    )
-                    has_one = True
-            if has_one:
-                op_count += 1
-            if op_count > max_shapes_per_op:
-                break
-        # Some specific shapes for which we want to run without any mask
-        bias_type = type(None)
-        for shape in (
-            # Some strides/dims don't fit on an uint16
-            (1, 128, 128, 300, 128, 128),
-            (13, 1, 67, 200, 8, 8),
-            (1, 1 + 2**16, 4, 1, 8, 8),
-            (1, 4, 1 + 2**16, 1, 8, 8),
-            # TODO: Some strides don't fit on an uint32
-            # Crashes on Flash, Errors on Cutlass
-            # (1, 1, 64000, 300, 128, 128)
-        ):
-            for device in _devices:
-                if device not in op.SUPPORTED_DEVICES:
-                    continue
-                for dtype in op.SUPPORTED_DTYPES:
-                    combination.append((op, device, dtype, bias_type, *shape))
-    return {
-        "argvalues": combination,
-        "ids": [make_id(*c) for c in combination],
-    }
-
-
-parametrize_opFW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv = pytest.mark.parametrize(
-    "opFW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv",
-    **_generate_op_device_dtype_biasT_B_Mq_Mkv_H_K_Kv(ALL_FW_OPS),
-)
-parametrize_opFW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv__xs = pytest.mark.parametrize(
-    "opFW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv",
-    **_generate_op_device_dtype_biasT_B_Mq_Mkv_H_K_Kv(ALL_FW_OPS, max_shapes_per_op=1),
-)
-parametrize_opBW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv = pytest.mark.parametrize(
-    "opBW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv",
-    **_generate_op_device_dtype_biasT_B_Mq_Mkv_H_K_Kv(ALL_BW_OPS),
-)
-parametrize_opBW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv__xs = pytest.mark.parametrize(
-    "opBW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv",
-    **_generate_op_device_dtype_biasT_B_Mq_Mkv_H_K_Kv(ALL_BW_OPS, max_shapes_per_op=1),
-)
-
-
-def ref_attention(q, k, v, attn_bias=None, drop_mask=None, p=0.0, scale=None):
-    if q.ndim == 4:
-        assert p == 0.0
-        return ref_attention_bmhk(q, k, v, attn_bias=attn_bias)
-    q = q.float()
-    k = k.float()
-    v = v.float()
-
-    scale = scale if scale is not None else (1 / q.shape[-1] ** 0.5)
-    q = q * scale
-
-    attn = q @ k.transpose(-2, -1)
-    if attn_bias is not None:
-        if isinstance(attn_bias, xformers.ops.AttentionBias):
-            # Always create in B,H,Mq,Mk format
-            attn_bias_tensor = attn_bias.materialize(
-                (q.shape[0], 1, q.shape[1], k.shape[1]),
-                device=q.device,
-                dtype=torch.float32,
-            )
-        else:
-            attn_bias_tensor = attn_bias
-        if attn_bias_tensor.ndim == 4:
-            assert q.shape[0] == attn_bias_tensor.shape[0] * attn_bias_tensor.shape[1]
-            attn_bias_tensor = attn_bias_tensor.reshape(
-                [-1, *attn_bias_tensor.shape[2:]]
-            )
-        attn = attn + attn_bias_tensor.float()
-    attn = attn.softmax(-1)
-    if drop_mask is not None:
-        attn = attn * (drop_mask / (1 - p))
-    return attn @ v
-
-
-def ref_attention_bmhk(q, k, v, attn_bias, scale=None) -> torch.Tensor:
-    assert q.ndim == 4
-
-    def T(t):
-        return t.permute((0, 2, 1, 3)).reshape(
-            [t.shape[0] * t.shape[2], t.shape[1], t.shape[3]]
-        )
-
-    if isinstance(attn_bias, xformers.ops.AttentionBias):
-        attn_bias = attn_bias.materialize(
-            (q.shape[0], q.shape[2], q.shape[1], k.shape[1]),
-            device=q.device,
-            dtype=torch.float32,
-        ).reshape([q.shape[0] * q.shape[2], q.shape[1], k.shape[1]])
-    out = ref_attention(T(q), T(k), T(v), attn_bias, scale=scale)
-    out = out.reshape([q.shape[0], q.shape[2], q.shape[1], v.shape[3]])
-    return out.permute((0, 2, 1, 3))
-
-
-def _rand_seqlens(
-    r: random.Random,
-    bs: int,
-    q_len: int,
-    kv_len: int,
-    more_keys_than_queries_per_block: bool,
-) -> Tuple[Sequence[int], Sequence[int]]:
-    """
-    Generates lists of lengths of query blocks and corresponding key blocks.
-    The total number of queries will be bs * q_len and the
-    total number of keys will be bs * kv_len.
-    """
-    if more_keys_than_queries_per_block:
-        assert kv_len >= q_len
-    q_len *= bs
-    kv_len *= bs
-    seqlens_q: List[int] = []
-    seqlens_k: List[int] = []
-
-    step_q = [max(1, q_len // 10), max(2, q_len // 2)]
-    step_k = [max(1, kv_len // 10), max(2, kv_len // 2)]
-    while sum(seqlens_q) < q_len and sum(seqlens_k) < kv_len:
-        num_queries = r.randrange(*step_q)
-        seqlens_q.append(num_queries)
-
-        if more_keys_than_queries_per_block:
-            # Must select at least `num_queries` keys
-            # But also leave enough keys for later
-            keys_left = kv_len - sum(seqlens_k, 0)
-            queries_left = q_len - sum(seqlens_q[:-1], 0)
-            assert keys_left >= queries_left
-            seqlens_k.append(num_queries + r.randrange(0, keys_left - queries_left))
-        else:
-            seqlens_k.append(r.randrange(*step_k))
-    seqlens_q[-1] = q_len - sum(seqlens_q[:-1])
-    seqlens_k[-1] = kv_len - sum(seqlens_k[:-1])
-    return seqlens_q, seqlens_k
-
-
-def _rand_partition(r: random.Random, total: int, n: int) -> List[int]:
-    # returns list of n nonnegative integers summing to total
-    idx = {0, total}
-    while len(idx) < n + 1:
-        idx.add(r.randint(1, total - 1))
-    s = sorted(idx)
-    return [e - b for b, e in zip(s[:-1], s[1:])]
-
-
-def _rand_maxed_partition(
-    r: random.Random, total: int, n: int, mx: int, positive: bool = True
-) -> List[int]:
-    # returns list of n nonnegative integers less than mx summing to total
-    # NB: This is unfortunately biased towards evenly-split bins.
-    # If `positive`, outputs are positive
-    if positive:
-        total -= n
-        mx -= 1
-    idxs = r.sample(range(n * mx), total)
-    y = torch.zeros(n, mx, dtype=torch.int32)
-    y.flatten()[idxs] = 1
-    z = y.sum(1)
-    if positive:
-        z += 1
-    return z.tolist()
-
-
-def _rand_seqlens_padded_k(
-    r: random.Random, bs: int, q_len: int, kv_len: int
-) -> Tuple[Sequence[int], Sequence[int]]:
-    # This is for BlockDiagonalCausalWithOffsetPaddedKeysMask.
-    # we need q_seqlens and k_seqlens to be of len bsz.
-    # For each "batch element" there must be more keys than queries
-    # because this bias type is "bottom right" and so any extra queries
-    # will attend to nothing and have undefined result.
-    # In addition every element of k_seqlens must be <= kv_len
-    if q_len > kv_len:
-        raise ValueError("need more keys than values")
-    if q_len == kv_len:
-        # all key slots are needed so we cannot have padding
-        q_seqlens = k_seqlens = [kv_len] * bs
-    else:
-        q_seqlens = _rand_maxed_partition(r, q_len * bs, bs, kv_len)
-        k_seqlens = [r.randint(i, kv_len) for i in q_seqlens]
-    return q_seqlens, k_seqlens
-
-
-def _create_aligned_bias(B: int, H: int, Mq: int, Mkv: int, **kwargs) -> torch.Tensor:
-    align_to = 8
-    return (
-        torch.randn(
-            (
-                B,
-                H,
-                Mq,
-                align_to * ((Mkv + align_to - 1) // align_to),
-            ),
-            **kwargs,
-        )
-        * 3
-    )[:, :, :, :Mkv]
-
-
-def create_attn_bias(
-    bias_type,
-    batch_size: int,
-    num_heads: int,
-    q_len: int,
-    kv_len: int,
-    device,
-    dtype,
-    requires_grad: bool,
-    fmt: str,
-    op: Type[AttentionOpBase],
-):
-    if bias_type is None or isinstance(None, bias_type):
-        return None
-    r = random.Random("-".join(map(str, [batch_size, q_len, kv_len, dtype, fmt])))
-    if bias_type is torch.Tensor:
-        if fmt == "BMK":
-            batch_size *= num_heads
-            num_heads = 1
-        # `small_k` only supports an expanded 1d bias
-        if op in [fmha.small_k.FwOp, fmha.small_k.BwOp]:
-            attn_bias = (
-                torch.randn(
-                    (batch_size, num_heads, 1, kv_len), device=device, dtype=dtype
-                )
-                * 3
-            )
-            attn_bias = attn_bias.expand(batch_size, num_heads, q_len, kv_len)
-        else:
-            attn_bias = _create_aligned_bias(
-                batch_size,
-                num_heads,
-                q_len,
-                kv_len,
-                device=device,
-                dtype=dtype,
-            )
-            # ToDo: need a fix in ck-flashAttn to avoid divided-by-zero when all-(-inf) occurred
-            #       with the data read by one-thread
-            # make sure it also works if the first columns are partially masked out
-            #
-            # attn_bias[0, 0, q_len - 1 :, : num_heads - 2] = -math.inf
-
-        if requires_grad:
-            attn_bias.requires_grad_(True)
-        if fmt == "BMK":
-            attn_bias = attn_bias[:, 0]
-        return attn_bias
-    if bias_type is fmha.attn_bias.LowerTriangularMask:
-        return fmha.attn_bias.LowerTriangularMask()
-    if bias_type is fmha.attn_bias.LowerTriangularMaskWithTensorBias:
-        attn_bias = _create_aligned_bias(
-            batch_size,
-            num_heads,
-            q_len,
-            kv_len,
-            device=device,
-            dtype=dtype,
-        )
-        if requires_grad:
-            attn_bias.requires_grad_(True)
-        return fmha.attn_bias.LowerTriangularMaskWithTensorBias(attn_bias)
-    if bias_type in [
-        fmha.attn_bias.BlockDiagonalMask,
-        fmha.attn_bias.BlockDiagonalCausalMask,
-        fmha.attn_bias.BlockDiagonalCausalFromBottomRightMask,
-    ]:
-        # This bias is not supported in BMK format
-        assert fmt == "BMHK"
-        block_diag = fmha.attn_bias.BlockDiagonalMask.from_seqlens(
-            *_rand_seqlens(
-                r,
-                batch_size,
-                q_len,
-                kv_len,
-                more_keys_than_queries_per_block=bias_type
-                is fmha.attn_bias.BlockDiagonalCausalFromBottomRightMask,
-            )
-        )
-        if bias_type is fmha.attn_bias.BlockDiagonalCausalMask:
-            block_diag = block_diag.make_causal()
-        if bias_type is fmha.attn_bias.BlockDiagonalCausalFromBottomRightMask:
-            block_diag = block_diag.make_causal_from_bottomright()
-        return block_diag
-    if bias_type == fmha.attn_bias.BlockDiagonalCausalWithOffsetPaddedKeysMask:
-        assert fmt == "BMHK"
-        q, k = _rand_seqlens_padded_k(r, batch_size, q_len, kv_len)
-        g_block_diag = (
-            fmha.attn_bias.BlockDiagonalCausalWithOffsetPaddedKeysMask.from_seqlens(
-                q_seqlen=q,
-                kv_padding=kv_len,
-                kv_seqlen=k,
-            )
-        )
-        return g_block_diag
-
-    assert False, f"Unsupported bias type: {bias_type}"
-
-
-def get_bias_grad(attn_bias, clear: bool = False) -> Optional[torch.Tensor]:
-    tensor_with_grad: Optional[torch.Tensor] = None
-    if isinstance(attn_bias, torch.Tensor):
-        tensor_with_grad = attn_bias
-    if isinstance(attn_bias, fmha.attn_bias.LowerTriangularMaskWithTensorBias):
-        tensor_with_grad = attn_bias._bias
-    if tensor_with_grad is not None:
-        grad = tensor_with_grad.grad
-        if clear:
-            tensor_with_grad.grad = None
-        return grad
-    return None
-
-
-def create_tensors(
-    op: Type[AttentionOpBase],
-    device,
-    dtype,
-    attn_bias_type,
-    B,
-    q_len,
-    kv_len,
-    h,
-    k,
-    kv,
-    *,
-    attn_bias_requires_grad: bool = False,
-    fmt: str = "BMK",
-):
-    torch.manual_seed(B * q_len + kv_len * k + kv)
-    scale = 3
-    if fmt == "BMK":
-        query = torch.randn((B * h, q_len, k), device=device, dtype=dtype).mul_(scale)
-        key = torch.randn((B * h, kv_len, k), device=device, dtype=dtype).mul_(scale)
-        value = torch.randn((B * h, kv_len, kv), device=device, dtype=dtype).mul_(scale)
-    else:
-        assert fmt == "BMHK"
-        query = torch.randn((B, q_len, h, k), device=device, dtype=dtype).mul_(scale)
-        key = torch.randn((B, kv_len, h, k), device=device, dtype=dtype).mul_(scale)
-        value = torch.randn((B, kv_len, h, kv), device=device, dtype=dtype).mul_(scale)
-
-    if fmt == "BMK" and not fmha.common._is_bias_type_supported_in_BMK(attn_bias_type):
-        attn_bias_type = None
-    attn_bias = None
-    if attn_bias_type is not None:
-        attn_bias = create_attn_bias(
-            attn_bias_type,
-            batch_size=B,
-            num_heads=h,
-            q_len=q_len,
-            kv_len=kv_len,
-            dtype=dtype,
-            device=device,
-            requires_grad=attn_bias_requires_grad,
-            fmt=fmt,
-            op=op,
-        )
-        if isinstance(
-            attn_bias,
-            (
-                fmha.attn_bias.BlockDiagonalMask,
-                fmha.attn_bias.BlockDiagonalCausalWithOffsetPaddedKeysMask,
-            ),
-        ):
-            query, key, value = [
-                x.reshape([1, -1, *x.shape[2:]]) for x in [query, key, value]
-            ]
-
-    inputs = fmha.Inputs(query=query, key=key, value=value, attn_bias=attn_bias)
-    reasons = op.not_supported_reasons(inputs)
-    if reasons:
-        err_msg = f"{op.NAME}: unsupported ({'/'.join(reasons)})"
-        # Ensure we free memory to avoid OOMs
-        del query, key, value, attn_bias, inputs
-        pytest.skip(err_msg)
-    return query, key, value, attn_bias
-
-
-def bmhk2bmk(tensor) -> torch.Tensor:
-    return (
-        tensor.permute((0, 2, 1, 3))
-        .contiguous()
-        .view([tensor.shape[0] * tensor.shape[2], tensor.shape[1], tensor.shape[3]])
-    )
-
-
-def bmk2bmhk(tensor, num_heads: int) -> torch.Tensor:
-    return tensor.reshape([-1, num_heads, tensor.shape[1], tensor.shape[2]]).permute(
-        (0, 2, 1, 3)
-    )
-
-
-@pytest.mark.parametrize("fmt", ["BMK", "BMHK"])
-@pytest.mark.parametrize("packed", [False, True])
-@parametrize_opFW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv
-def test_forward(
-    opFW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv,
-    packed,
-    fmt,
-):
-    (
-        op,
-        device,
-        dtype,
-        bias_type,
-        batch_size,
-        q_len,
-        kv_len,
-        h,
-        k,
-        kv,
-    ) = opFW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv
-
-    if kv > 128:
-        pytest.skip("kv > 128 is not supported by CK-FlashAttention-1")
-
-    if packed and not (k == kv and q_len == kv_len):
-        pytest.skip(
-            f"packed incompatible with `k ({k}) != kv ({kv})` or `q_len ({q_len}) != kv_len ({kv_len})`"
-        )
-    if fmt == "BMK" and not fmha.common._is_bias_type_supported_in_BMK(bias_type):
-        pytest.skip("BMK incompatible with this bias")
-
-    query, key, value, attn_bias = create_tensors(
-        *opFW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv, fmt="BMHK" if packed else fmt
-    )
-
-    if packed:
-        c = torch.stack([query, key, value], 2)
-        if fmt == "BMK":
-            # bm3hk -> 3bhmk -> 3Bmk
-            c = c.permute(2, 0, 3, 1, 4).view([3, -1, q_len, k])
-            query, key, value = c[0], c[1], c[2]
-            # Re-create bias in the right format
-            attn_bias = create_attn_bias(
-                bias_type=bias_type,
-                batch_size=batch_size,
-                num_heads=h,
-                q_len=q_len,
-                kv_len=kv_len,
-                device=device,
-                dtype=dtype,
-                requires_grad=False,
-                fmt=fmt,
-                op=op,
-            )
-        else:
-            # bm3hk -> 3 x bmhk
-            query, key, value = xformers.ops.unbind(c, 2)
-        assert not query.is_contiguous()
-
-    out = xformers.ops.memory_efficient_attention_forward(
-        query, key, value, attn_bias, op=op
-    )
-    assert not out.isnan().any(), ("Output has NaNs", attn_bias)
-    out2 = xformers.ops.memory_efficient_attention_forward(
-        query, key, value, attn_bias, op=op
-    )
-    assert torch.allclose(out, out2, atol=0.0, rtol=0.0), (
-        "Non-deterministic behavior",
-        attn_bias,
-    )
-
-    ref = ref_attention(query, key, value, attn_bias)
-    assert out.shape == ref.shape, out.shape
-    assert_allclose(
-        out.float(),
-        ref,
-        atol=op.ERROR_ATOL[dtype],
-        rtol=op.ERROR_RTOL.get(dtype, 1e-5),
-    )
-
-
-@pytest.mark.parametrize("k_len", [5, 6, 32])
-@pytest.mark.parametrize("batch_size", [1, 4])
-@pytest.mark.parametrize("kv_len", [128, 512])
-@pytest.mark.parametrize("q_len", [128, 512])
-@pytest.mark.parametrize("device", [torch.device("cuda")])
-@pytest.mark.parametrize("dtype", _types)
-def test_key_query_all_ones(dtype, device, q_len, kv_len, batch_size, k_len):
-    scale = 3
-    query = torch.ones((batch_size, q_len, k_len), device=device, dtype=dtype)
-    key = torch.ones((batch_size, kv_len, k_len), device=device, dtype=dtype)
-    value = torch.randn((batch_size, kv_len, k_len), device=device, dtype=dtype) * scale
-
-    out = xformers.ops.memory_efficient_attention(
-        query, key, value, op=(fmha.ck.FwOp, None)
-    )
-    # this should be equivalent to the average over value
-    ref = value.mean(1, keepdim=True).expand_as(query)
-
-    if dtype is torch.float16:
-        assert_allclose(out, ref, atol=1e-5)
-    else:
-        assert_allclose(out, ref, atol=1e-2)
-
-
-def _block_diag_reshape_lse(
-    lse: torch.Tensor, q_seqinfo: fmha.attn_bias._SeqLenInfo
-) -> torch.Tensor:
-    """LSE can be padded, let's remove the padding"""
-    parts = []
-    for slice, (start, end) in zip(lse.unbind(0), q_seqinfo.intervals()):
-        parts.append(slice[:, : end - start])
-    return torch.cat(parts, dim=1).unsqueeze(1)
-
-
-@parametrize_opFW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv
-def test_logsumexp(opFW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv):
-    (
-        op,
-        device,
-        dtype,
-        bias_type,
-        batch_size,
-        q_len,
-        kv_len,
-        h,
-        k,
-        kv,
-    ) = opFW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv
-    query, key, value, attn_bias = create_tensors(
-        *opFW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv, fmt="BMK"
-    )
-
-    _out, lse = xformers.ops.memory_efficient_attention_forward_requires_grad(
-        query,
-        key,
-        value,
-        op=op,
-        attn_bias=attn_bias,
-    )
-    attn = (query.float() / k**0.5) @ key.float().transpose(-2, -1)
-    if attn_bias is not None:
-        if isinstance(attn_bias, xformers.ops.AttentionBias):
-            tensor_bias = attn_bias.materialize(
-                (query.shape[0], 1, query.shape[1], key.shape[1]),
-                device=query.device,
-                dtype=torch.float32,
-            )
-        else:
-            assert isinstance(attn_bias, torch.Tensor)
-            tensor_bias = attn_bias
-        if tensor_bias.ndim == 4:
-            tensor_bias = tensor_bias.reshape([-1, *tensor_bias.shape[2:]])
-        attn = attn + tensor_bias.float()
-    ref_lse = attn.logsumexp(-1)
-    if isinstance(attn_bias, fmha.attn_bias.BlockDiagonalMask):
-        lse = _block_diag_reshape_lse(lse, attn_bias.q_seqinfo)
-    assert_allclose(lse[:, 0, : ref_lse.shape[1]], ref_lse, atol=2e-4)
-
-
-@pytest.mark.parametrize("fmt", ["BMK", "BMHK"])
-@pytest.mark.parametrize("grad_out_contiguous", [True])
-@parametrize_opBW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv
-def test_backward(
-    opBW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv,
-    grad_out_contiguous,
-    fmt,
-):
-    (
-        op_bw,
-        device,
-        dtype,
-        bias_type,
-        batch_size,
-        q_len,
-        kv_len,
-        h,
-        k,
-        kv,
-    ) = opBW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv
-
-    if k > 128 or kv > 128:
-        pytest.skip(
-            "head-dim length bigger than 128 is not supported by CK-FlashAttention-1"
-        )
-
-    if k % 8 != 0 or kv % 8 != 0:
-        pytest.skip("head-dim length must be an even value for CK-FlashAttention-1")
-
-    # BottomRightMask requires generate {m0,m1,...}, {n0,n1,...} where mi <= ni
-    if (
-        bias_type is fmha.attn_bias.BlockDiagonalCausalFromBottomRightMask
-        and q_len <= kv_len
-    ):
-        pytest.skip(
-            "BlockDiagonalCausalFromBottomRightMask requires kv_len bigger than q_len"
-        )
-
-    if k != kv:
-        pytest.skip("k same as kv is not well tested by CK-FlashAttention-1")
-
-    # attn_bias_requires_grad = (
-    #    random.Random(q_len + kv_len * batch_size).randint(0, 1) > 0
-    # )
-    attn_bias_requires_grad = False
-
-    query, key, value, attn_bias = create_tensors(
-        *opBW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv,
-        attn_bias_requires_grad=attn_bias_requires_grad,
-        fmt=fmt,
-    )
-    op_fw = (
-        sample_random_supported_fw(
-            fmha.Inputs(query=query, key=key, value=value, attn_bias=attn_bias),
-            seed=q_len * kv + kv_len * k,
-        )
-        if op_bw != fmha.ck.BwOp
-        else fmha.ck.FwOp
-    )
-    qkv = None
-
-    if (
-        fmt == "BMHK"
-        and query.shape[3] == value.shape[3]
-        and query.shape[1] == value.shape[1]
-    ):
-        qkv = torch.stack([query, key, value], 2)
-        qkv.requires_grad_(True)
-        # bm3hk -> 3 x bmhk
-        query, key, value = xformers.ops.unbind(qkv, 2)
-        assert not query.is_contiguous()
-
-    query.requires_grad_(True)
-    key.requires_grad_(True)
-    value.requires_grad_(True)
-
-    if not op_bw.supports(fmha.Inputs(query, key, value, attn_bias)):
-        pytest.skip("inputs not supported")
-
-    out = xformers.ops.memory_efficient_attention(
-        query, key, value, attn_bias, op=(op_fw, op_bw)
-    )
-
-    grad_out = torch.ones_like(out)
-    # if grad_out_contiguous is False:
-    #    grad_out = torch.tensor([1.0], dtype=query.dtype, device=device)[
-    #        None, None, :
-    #    ].expand_as(out)
-
-    out.backward(grad_out)
-
-    if qkv is None and op_bw == fmha.ck.BwOp:
-        assert query.stride() == query.grad.stride()
-
-    grads = []
-    if qkv is None:
-        grads = [query.grad, key.grad, value.grad]
-        query.grad = None
-        key.grad = None
-        value.grad = None
-    else:
-        grads = [qkv.grad]
-        qkv.grad = None
-    if attn_bias_requires_grad:
-        attn_bias_grad = get_bias_grad(attn_bias, clear=True)
-        if attn_bias_grad is not None:
-            grads.append(attn_bias_grad)
-
-    ref = ref_attention(query, key, value, attn_bias)
-    ref.backward(grad_out)
-
-    assert_allclose(
-        out.float(),
-        ref.float(),
-        "fw pass",
-        atol=op_fw.ERROR_ATOL[dtype],
-        rtol=op_fw.ERROR_RTOL.get(dtype, 1e-5),
-    )
-
-    del out
-    del grad_out
-    del ref
-
-    atol = op_bw.ERROR_ATOL[dtype]
-    rtol = op_bw.ERROR_RTOL[dtype]
-
-    grads_ref = []
-    grads_name = []
-    if qkv is None:
-        assert isinstance(query.grad, torch.Tensor)
-        assert isinstance(key.grad, torch.Tensor)
-        assert isinstance(value.grad, torch.Tensor)
-        grads_ref = [query.grad, key.grad, value.grad]
-        grads_name = ["query", "key", "value"]
-    else:
-        assert isinstance(qkv.grad, torch.Tensor)
-        grads_ref = [qkv.grad]
-        grads_name = ["qkv"]
-
-    if attn_bias_requires_grad:
-        attn_bias_grad = get_bias_grad(attn_bias)
-        if attn_bias_grad is not None:
-            grads_ref.append(attn_bias.grad)
-            grads_name.append("bias")
-
-    del query
-    del key
-    del value
-    del qkv
-
-    assert len(grads_ref) == len(
-        grads
-    ), "Wrong number of gradients (maybe bias grad didn't backprop?)"
-    for name, calc_grad, ref_grad in zip(grads_name, grads, grads_ref):
-        assert_allclose(
-            calc_grad,
-            ref_grad,
-            msg=f"{op_fw.NAME}+{op_bw.NAME}:{name}",
-            atol=atol,
-            rtol=rtol,
-        )

From 08b4159d666e43f54fd42c223dea7722aa057b5e Mon Sep 17 00:00:00 2001
From: Max Podkorytov <4273004+tenpercent@users.noreply.github.com>
Date: Tue, 12 Mar 2024 23:38:45 +0000
Subject: [PATCH 487/641] dump kernel resource usage to compilation logs
 similar to nv

---
 setup.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/setup.py b/setup.py
index 163344bb5..e909188c8 100644
--- a/setup.py
+++ b/setup.py
@@ -377,6 +377,7 @@ def get_extensions():
                 "-U__CUDA_NO_HALF_CONVERSIONS__",
                 "-DCK_FMHA_FWD_FAST_EXP2=1",
                 "-fgpu-flush-denormals-to-zero",
+                "-Rpass-analysis=kernel-resource-usage",
             ]
             + generator_flag
             + cc_flag,

From 2da292719fd301a5bd57df074c78e64ef189d597 Mon Sep 17 00:00:00 2001
From: Qianfeng Zhang <Qianfeng.Zhang@amd.com>
Date: Wed, 20 Mar 2024 21:50:59 +0000
Subject: [PATCH 488/641] Add the c++ extension to the latest change of
 ck_tile/dev fwd kernel (added droppout)

---
 .gitmodules                                   |  2 +-
 third_party/composable_kernel_tiled           |  2 +-
 .../hip_fmha/ck_tiled_fmha_batched_forward.h  | 41 ++++++++++++------
 .../hip_fmha/ck_tiled_fmha_batched_infer.h    | 41 ++++++++++++------
 ...initions.h => ck_tiled_fmha_fwd_setting.h} | 10 ++---
 .../hip_fmha/ck_tiled_fmha_grouped_forward.h  | 43 +++++++++++--------
 .../hip_fmha/ck_tiled_fmha_grouped_infer.h    | 38 ++++++++++------
 7 files changed, 109 insertions(+), 68 deletions(-)
 rename xformers/csrc/attention/hip_fmha/{ck_tiled_fmha_definitions.h => ck_tiled_fmha_fwd_setting.h} (95%)

diff --git a/.gitmodules b/.gitmodules
index 635811410..7b6cfaab8 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -6,5 +6,5 @@
 	url = https://github.com/Dao-AILab/flash-attention.git
 [submodule "third_party/composable_kernel_tiled"]
 	path = third_party/composable_kernel_tiled
-	url = https://github.com/ROCm/composable_kernel.git
+	url = https://github.com/ROCm/composable_kernel-internal.git
 	branch = ck_tile/dev
diff --git a/third_party/composable_kernel_tiled b/third_party/composable_kernel_tiled
index b34434327..0e533488d 160000
--- a/third_party/composable_kernel_tiled
+++ b/third_party/composable_kernel_tiled
@@ -1 +1 @@
-Subproject commit b344343273cf6731ba0a47e061629890a8014af5
+Subproject commit 0e533488daa13cceb4c61dfa150aad9fd895fa63
diff --git a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_forward.h b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_forward.h
index 3dc0c4717..61cdcd124 100644
--- a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_forward.h
+++ b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_forward.h
@@ -6,10 +6,6 @@
  */
 #pragma once
 
-#include <optional>
-#include <sstream>
-#include <stdexcept>
-
 #include <ck/host_utility/device_prop.hpp>
 #include <ck/host_utility/kernel_launch.hpp>
 #include <ck/tensor/tensor_view.hpp>
@@ -24,15 +20,16 @@
 #include <ck/tile_program/tile/tile_fmha_shape.hpp>
 #include <ck/tile_program/tile/tile_fmha_traits.hpp>
 
-#include "ck_tiled_fmha_definitions.h"
-#include "ck_tiled_fmha_forward_kernel.h"
-#include "ck_tiled_fmha_fwd_epilogue.h"
-#include "ck_tiled_fmha_fwd_tile_partitioner.h"
-#include "ck_tiled_fmha_params.h"
-
 #include "ck_tiled_bool_switch.h"
+#include "ck_tiled_fmha_fwd_setting.h"
+#include "ck_tiled_fmha_params.h"
 #include "ck_tiled_headdim_switch.h"
 
+#include "ck_tiled_fmha_definitions.hpp"
+#include "ck_tiled_fmha_forward_kernel.hpp"
+#include "ck_tiled_fmha_fwd_epilogue.hpp"
+#include "ck_tiled_fmha_fwd_tile_partitioner.hpp"
+
 template <
     typename scalar_t,
     bool has_causal_mask,
@@ -52,6 +49,7 @@ struct batched_forward_causalmask_attnbias_dispatched {
           typename FmhaFwdTypeConfig<scalar_t>::SaccDataType,
           typename FmhaFwdTypeConfig<scalar_t>::SMPLComputeDataType,
           typename FmhaFwdTypeConfig<scalar_t>::BiasDataType,
+          typename FmhaFwdTypeConfig<scalar_t>::RandValOutputDataType,
           typename FmhaFwdTypeConfig<scalar_t>::LSEDataType,
           typename FmhaFwdTypeConfig<scalar_t>::PDataType,
           typename FmhaFwdTypeConfig<scalar_t>::OaccDataType,
@@ -98,6 +96,7 @@ struct batched_forward_causalmask_attnbias_dispatched {
                   kPadHeadDimV,
                   has_attn_bias,
                   true, // kStoreLSE
+                  false, // kHadDropout, to be changed
                   occupancy>;
 
               using FmhaPipelineProblem =
@@ -131,6 +130,7 @@ struct batched_forward_causalmask_attnbias_dispatched {
                   kPadHeadDimV,
                   has_attn_bias,
                   true, // kStoreLSE
+                  false, // kHadDropout, to be changed
                   occupancy>;
 
               using FmhaPipelineProblem =
@@ -173,33 +173,46 @@ struct batched_forward_causalmask_attnbias_dispatched {
           param.k_ptr,
           param.v_ptr,
           param.attn_bias_ptr,
+          nullptr, // rand_val_ptr
           param.logsumexp_ptr,
           param.out_ptr,
           param.M, // seqlen_q
           param.N, // seqlen_k
           param.K, // hdim_q
           param.Kv, // hdim_v
+          param.Hq, // nhead_q
           param.Hq / param.Hkv, // nhead_ratio_qk
           param.scale,
-          param.q_strides[1], // q, k, v, bias, out tensor seq-dim stride
+          param.q_strides[1], // q, k, v, bias, randval, out tensor seq-dim
+                              // stride
           param.k_strides[1],
           param.v_strides[1],
           param.attn_bias_strides[2],
+          0, // stride_randval
           param.out_strides[1],
-          param.q_strides[2], // q, k, v, bias, lse, out tensor head-dim stride
+          param.q_strides[2], // q, k, v, bias, randval, lse, out tensor
+                              // head-dim stride
           param.k_strides[2],
           param.v_strides[2],
           param.attn_bias_strides[1],
+          0, // nhead_randval
           param.M, // nhead_stride_lse
           param.out_strides[2],
-          param.q_strides[0], // q, k, v, bias, lse, out tensor batch-dim stride
+          param.q_strides[0], // q, k, v, bias, randval, lse, out tensor
+                              // batch-dim stride
           param.k_strides[0],
           param.v_strides[0],
           param.attn_bias_strides[0],
+          0, // batch_stride_randval
           param.Hq * param.M, // batch_stride_lse
           param.out_strides[0],
           static_cast<CausalMaskType>(param.custom_mask_type),
-          param.window_size);
+          param.window_size,
+          1.0f, // descale_qk, not used
+          1.0f, // descale_sv, not used
+          param.use_dropout ? param.dropout_prob : 0.0f, // dropout ratio
+          false, // is_store_randval
+          {param.philox_seed, param.philox_offset});
     }();
 
     dim3 kGridSize = FmhaKernel::GridSize(param.B, param.Hq, param.M, param.Kv);
diff --git a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_infer.h b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_infer.h
index 8696e0437..4e9286a75 100644
--- a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_infer.h
+++ b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_infer.h
@@ -6,10 +6,6 @@
  */
 #pragma once
 
-#include <optional>
-#include <sstream>
-#include <stdexcept>
-
 #include <ck/host_utility/device_prop.hpp>
 #include <ck/host_utility/kernel_launch.hpp>
 #include <ck/tensor/tensor_view.hpp>
@@ -24,15 +20,16 @@
 #include <ck/tile_program/tile/tile_fmha_shape.hpp>
 #include <ck/tile_program/tile/tile_fmha_traits.hpp>
 
-#include "ck_tiled_fmha_definitions.h"
-#include "ck_tiled_fmha_forward_kernel.h"
-#include "ck_tiled_fmha_fwd_epilogue.h"
-#include "ck_tiled_fmha_fwd_tile_partitioner.h"
-#include "ck_tiled_fmha_params.h"
-
 #include "ck_tiled_bool_switch.h"
+#include "ck_tiled_fmha_fwd_setting.h"
+#include "ck_tiled_fmha_params.h"
 #include "ck_tiled_headdim_switch.h"
 
+#include "ck_tiled_fmha_definitions.hpp"
+#include "ck_tiled_fmha_forward_kernel.hpp"
+#include "ck_tiled_fmha_fwd_epilogue.hpp"
+#include "ck_tiled_fmha_fwd_tile_partitioner.hpp"
+
 template <
     typename scalar_t,
     bool has_causal_mask,
@@ -52,6 +49,7 @@ struct batched_infer_causalmask_attnbias_dispatched {
           typename FmhaFwdTypeConfig<scalar_t>::SaccDataType,
           typename FmhaFwdTypeConfig<scalar_t>::SMPLComputeDataType,
           typename FmhaFwdTypeConfig<scalar_t>::BiasDataType,
+          typename FmhaFwdTypeConfig<scalar_t>::RandValOutputDataType,
           typename FmhaFwdTypeConfig<scalar_t>::LSEDataType,
           typename FmhaFwdTypeConfig<scalar_t>::PDataType,
           typename FmhaFwdTypeConfig<scalar_t>::OaccDataType,
@@ -98,6 +96,7 @@ struct batched_infer_causalmask_attnbias_dispatched {
                   kPadHeadDimV,
                   has_attn_bias,
                   false, // kStoreLSE
+                  false, // kHasDropout
                   occupancy>;
 
               using FmhaPipelineProblem =
@@ -131,6 +130,7 @@ struct batched_infer_causalmask_attnbias_dispatched {
                   kPadHeadDimV,
                   has_attn_bias,
                   false, // kStoreLSE
+                  false, // kHasDropout
                   occupancy>;
 
               using FmhaPipelineProblem =
@@ -173,33 +173,46 @@ struct batched_infer_causalmask_attnbias_dispatched {
           param.k_ptr,
           param.v_ptr,
           param.attn_bias_ptr,
+          nullptr, // rand_val_ptr
           nullptr, // lse_ptr
           param.out_ptr,
           param.M, // seqlen_q
           param.N, // seqlen_k
           param.K, // hdim_q
           param.Kv, // hdim_v
+          param.Hq, // nhead_q
           param.Hq / param.Hkv, // nhead_ratio_qk
           param.scale,
-          param.q_strides[1], // q, k, v, bias, out tensor seq-dim stride
+          param.q_strides[1], // q, k, v, bias, randval, out tensor seq-dim
+                              // stride
           param.k_strides[1],
           param.v_strides[1],
           param.attn_bias_strides[2],
+          0, // stride_randval
           param.out_strides[1],
-          param.q_strides[2], // q, k, v, bias, lse, out tensor head-dim stride
+          param.q_strides[2], // q, k, v, bias, randval, lse, out tensor
+                              // head-dim stride
           param.k_strides[2],
           param.v_strides[2],
           param.attn_bias_strides[1],
+          0, // nhead_stride_randval
           0, // nhead_stride_lse
           param.out_strides[2],
-          param.q_strides[0], // q, k, v, bias, lse, out tensor batch-dim stride
+          param.q_strides[0], // q, k, v, bias, randval, lse, out tensor
+                              // batch-dim stride
           param.k_strides[0],
           param.v_strides[0],
           param.attn_bias_strides[0],
+          0, // batch_stride_randval
           0, // batch_stride_lse
           param.out_strides[0],
           static_cast<CausalMaskType>(param.custom_mask_type),
-          param.window_size);
+          param.window_size,
+          1.0f, // descale_qk, not used
+          1.0f, // descale_sv, not used
+          0.0f, // p_dropout
+          false, // is_store_randval
+          {0, 0});
     }();
 
     dim3 kGridSize = FmhaKernel::GridSize(param.B, param.Hq, param.M, param.Kv);
diff --git a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_definitions.h b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_fwd_setting.h
similarity index 95%
rename from xformers/csrc/attention/hip_fmha/ck_tiled_fmha_definitions.h
rename to xformers/csrc/attention/hip_fmha/ck_tiled_fmha_fwd_setting.h
index 4e3767fd2..3810bd3d0 100644
--- a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_definitions.h
+++ b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_fwd_setting.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ * Copyright (c) 2023-2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -8,12 +8,6 @@
 
 #include <ck/tile_program/tile/tile_fmha_shape.hpp>
 
-enum struct CausalMaskType {
-  MaskDisabled,
-  MaskUpperTriangleFromTopLeft,
-  MaskUpperTriangleFromBottomRight
-};
-
 template <typename DataType>
 struct FmhaFwdTypeConfig;
 
@@ -23,6 +17,7 @@ struct FmhaFwdTypeConfig<ck::half_t> {
   using KDataType = ck::half_t;
   using VDataType = ck::half_t;
   using BiasDataType = ck::half_t;
+  using RandValOutputDataType = unsigned short;
   using LSEDataType =
       float; // data type for lse(logsumexp L_j = max_j + log(l_j))
   using SaccDataType = float; // data type for first gemm accumulation
@@ -38,6 +33,7 @@ struct FmhaFwdTypeConfig<ck::bhalf_t> {
   using KDataType = ck::bhalf_t;
   using VDataType = ck::bhalf_t;
   using BiasDataType = ck::bhalf_t;
+  using RandValOutputDataType = unsigned short;
   using LSEDataType =
       float; // data type for lse(logsumexp L_j = max_j + log(l_j))
   using SaccDataType = float; // data type for first gemm accumulation
diff --git a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_forward.h b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_forward.h
index bb4d43d5f..78ed74316 100644
--- a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_forward.h
+++ b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_forward.h
@@ -6,11 +6,6 @@
  */
 #pragma once
 
-#include <iostream>
-#include <optional>
-#include <sstream>
-#include <stdexcept>
-
 #include <ck/host_utility/device_prop.hpp>
 #include <ck/host_utility/kernel_launch.hpp>
 #include <ck/tensor/tensor_view.hpp>
@@ -24,15 +19,16 @@
 #include <ck/tile_program/tile/tile_fmha_shape.hpp>
 #include <ck/tile_program/tile/tile_fmha_traits.hpp>
 
-#include "ck_tiled_fmha_definitions.h"
-#include "ck_tiled_fmha_forward_kernel.h"
-#include "ck_tiled_fmha_fwd_epilogue.h"
-#include "ck_tiled_fmha_fwd_tile_partitioner.h"
-#include "ck_tiled_fmha_params.h"
-
 #include "ck_tiled_bool_switch.h"
+#include "ck_tiled_fmha_fwd_setting.h"
+#include "ck_tiled_fmha_params.h"
 #include "ck_tiled_headdim_switch.h"
 
+#include "ck_tiled_fmha_definitions.hpp"
+#include "ck_tiled_fmha_forward_kernel.hpp"
+#include "ck_tiled_fmha_fwd_epilogue.hpp"
+#include "ck_tiled_fmha_fwd_tile_partitioner.hpp"
+
 template <
     typename scalar_t,
     bool has_causal_mask,
@@ -52,6 +48,7 @@ struct grouped_forward_causalmask_attnbias_dispatched {
           typename FmhaFwdTypeConfig<scalar_t>::SaccDataType,
           typename FmhaFwdTypeConfig<scalar_t>::SMPLComputeDataType,
           typename FmhaFwdTypeConfig<scalar_t>::BiasDataType,
+          typename FmhaFwdTypeConfig<scalar_t>::RandValOutputDataType,
           typename FmhaFwdTypeConfig<scalar_t>::LSEDataType,
           typename FmhaFwdTypeConfig<scalar_t>::PDataType,
           typename FmhaFwdTypeConfig<scalar_t>::OaccDataType,
@@ -72,9 +69,8 @@ struct grouped_forward_causalmask_attnbias_dispatched {
 
       using FmhaShape = FmhaFwdShape<MaxK>;
       using FmhaTilePartitioner = FmhaFwdTilePartitioner<FmhaShape>;
-      constexpr ck::index_t occupancy = (MaxK == 64) ? 3
-          : (MaxK == 256)                            ? 1
-                                                     : 2;
+      constexpr ck::index_t occupancy =
+          (MaxK == 64) ? 3 : (MaxK == 256) ? 1 : 2;
 
       constexpr bool kPadSeqLenQ = true;
       constexpr bool kPadSeqLenK = true;
@@ -92,6 +88,7 @@ struct grouped_forward_causalmask_attnbias_dispatched {
                   kPadHeadDimV,
                   has_attn_bias,
                   true, // kStoreLSE
+                  false, // kHadDropout, to be changed
                   occupancy>;
 
               using FmhaPipelineProblem =
@@ -117,6 +114,7 @@ struct grouped_forward_causalmask_attnbias_dispatched {
                   kPadHeadDimV,
                   has_attn_bias,
                   true, // kStoreLSE
+                  false, // kHasDropout
                   occupancy>;
 
               using FmhaPipelineProblem =
@@ -144,6 +142,7 @@ struct grouped_forward_causalmask_attnbias_dispatched {
           param.k_ptr,
           param.v_ptr,
           param.attn_bias_ptr,
+          nullptr, // rand_val_ptr
           param.logsumexp_ptr,
           param.out_ptr,
           param.seqstart_q_dev_ptr,
@@ -151,21 +150,31 @@ struct grouped_forward_causalmask_attnbias_dispatched {
           param.seqlen_k_dev_ptr,
           param.K, // hdim_q
           param.Kv, // hdim_v
+          param.Hq, // nhead_q
           param.Hq / param.Hkv, // nhead_ratio_qk
           param.scale,
-          param.q_strides[0], // q, k, v, bias, out tensor seq-dim stride
+          param.q_strides[0], // q, k, v, bias, randval, out tensor seq-dim
+                              // stride
           param.k_strides[0],
           param.v_strides[0],
           param.attn_bias_strides[2],
+          0, // stride_randval
           param.out_strides[0],
-          param.q_strides[1], // q, k, v, bias, lse, out tensor head-dim stride
+          param.q_strides[1], // q, k, v, bias, randval, lse, out tensor
+                              // head-dim stride
           param.k_strides[1],
           param.v_strides[1],
           param.attn_bias_strides[1],
+          0, // nhead_stride_randval
           param.max_seqlen_q, // nhead_stride_lse
           param.out_strides[1],
           static_cast<CausalMaskType>(param.custom_mask_type),
-          param.window_size);
+          param.window_size,
+          1.0f, // descale_qk, not used
+          1.0f, // descale_sv, not used
+          param.use_dropout ? param.dropout_prob : 0.0f, // dropout ratio
+          false, // is_store_randval
+          {param.philox_seed, param.philox_offset});
     }();
 
     dim3 kGridSize = FmhaKernel::GridSize(
diff --git a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_infer.h b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_infer.h
index c371b0aa1..05975f84f 100644
--- a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_infer.h
+++ b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_infer.h
@@ -6,11 +6,6 @@
  */
 #pragma once
 
-#include <iostream>
-#include <optional>
-#include <sstream>
-#include <stdexcept>
-
 #include <ck/host_utility/device_prop.hpp>
 #include <ck/host_utility/kernel_launch.hpp>
 #include <ck/tensor/tensor_view.hpp>
@@ -24,15 +19,16 @@
 #include <ck/tile_program/tile/tile_fmha_shape.hpp>
 #include <ck/tile_program/tile/tile_fmha_traits.hpp>
 
-#include "ck_tiled_fmha_definitions.h"
-#include "ck_tiled_fmha_forward_kernel.h"
-#include "ck_tiled_fmha_fwd_epilogue.h"
-#include "ck_tiled_fmha_fwd_tile_partitioner.h"
-#include "ck_tiled_fmha_params.h"
-
 #include "ck_tiled_bool_switch.h"
+#include "ck_tiled_fmha_fwd_setting.h"
+#include "ck_tiled_fmha_params.h"
 #include "ck_tiled_headdim_switch.h"
 
+#include "ck_tiled_fmha_definitions.hpp"
+#include "ck_tiled_fmha_forward_kernel.hpp"
+#include "ck_tiled_fmha_fwd_epilogue.hpp"
+#include "ck_tiled_fmha_fwd_tile_partitioner.hpp"
+
 template <
     typename scalar_t,
     bool has_causal_mask,
@@ -52,6 +48,7 @@ struct grouped_infer_causalmask_attnbias_dispatched {
           typename FmhaFwdTypeConfig<scalar_t>::SaccDataType,
           typename FmhaFwdTypeConfig<scalar_t>::SMPLComputeDataType,
           typename FmhaFwdTypeConfig<scalar_t>::BiasDataType,
+          typename FmhaFwdTypeConfig<scalar_t>::RandValOutputDataType,
           typename FmhaFwdTypeConfig<scalar_t>::LSEDataType,
           typename FmhaFwdTypeConfig<scalar_t>::PDataType,
           typename FmhaFwdTypeConfig<scalar_t>::OaccDataType,
@@ -91,6 +88,7 @@ struct grouped_infer_causalmask_attnbias_dispatched {
                   kPadHeadDimV,
                   has_attn_bias,
                   false, // kStoreLSE
+                  false, // kHasDropout
                   occupancy>;
 
               using FmhaPipelineProblem =
@@ -116,6 +114,7 @@ struct grouped_infer_causalmask_attnbias_dispatched {
                   kPadHeadDimV,
                   has_attn_bias,
                   false, // kStoreLSE
+                  false, // kHasDropout
                   occupancy>;
 
               using FmhaPipelineProblem =
@@ -143,6 +142,7 @@ struct grouped_infer_causalmask_attnbias_dispatched {
           param.k_ptr,
           param.v_ptr,
           param.attn_bias_ptr,
+          nullptr, // rand_val_ptr
           nullptr, // lse_ptr
           param.out_ptr,
           param.seqstart_q_dev_ptr,
@@ -150,21 +150,31 @@ struct grouped_infer_causalmask_attnbias_dispatched {
           param.seqlen_k_dev_ptr,
           param.K, // hdim_q
           param.Kv, // hdim_v
+          param.Hq, // nhead_q
           param.Hq / param.Hkv, // nhead_ratio_qk
           param.scale,
-          param.q_strides[0], // q, k, v, bias, out tensor seq-dim stride
+          param.q_strides[0], // q, k, v, bias, randval, out tensor seq-dim
+                              // stride
           param.k_strides[0],
           param.v_strides[0],
           param.attn_bias_strides[2],
+          0, // stride_randval
           param.out_strides[0],
-          param.q_strides[1], // q, k, v, bias, lse, out tensor head-dim stride
+          param.q_strides[1], // q, k, v, bias, randval, lse, out tensor
+                              // head-dim stride
           param.k_strides[1],
           param.v_strides[1],
           param.attn_bias_strides[1],
+          0, // nhead_stride_randval
           0, // nhead_stride_lse
           param.out_strides[1],
           static_cast<CausalMaskType>(param.custom_mask_type),
-          param.window_size);
+          param.window_size,
+          1.0f, // descale_qk, not used
+          1.0f, // descale_sv, not used
+          0.0f, // p_dropout
+          false, // is_store_randval
+          {0, 0});
     }();
 
     dim3 kGridSize = FmhaKernel::GridSize(

From 9189e453bb9bdbd923157b2ff4dcbe861791f1e5 Mon Sep 17 00:00:00 2001
From: Qianfeng Zhang <Qianfeng.Zhang@amd.com>
Date: Wed, 27 Mar 2024 00:01:02 +0000
Subject: [PATCH 489/641] Add the c++ extension to use ck_tile/dev/ fmha bwd
 kernel

---
 .../attention_backward_generic_ck_tiled.cpp   | 520 ++++++++++++++++++
 .../hip_fmha/attention_forward_decoder.cpp    |   6 +-
 .../attention_forward_generic_ck_tiled.cpp    |  39 +-
 .../hip_fmha/attention_forward_splitk.cpp     |  54 +-
 .../hip_fmha/ck_attention_forward_decoder.h   |  10 +-
 .../ck_attention_forward_decoder_splitk.h     |  48 +-
 .../hip_fmha/ck_tiled_fmha_batched_backward.h | 293 ++++++++++
 .../ck_tiled_fmha_batched_backward_bp16.cpp   |  63 +++
 .../ck_tiled_fmha_batched_backward_fp16.cpp   |  63 +++
 .../hip_fmha/ck_tiled_fmha_batched_forward.h  | 113 ++--
 .../ck_tiled_fmha_batched_forward_bp16.cpp    |   1 +
 .../ck_tiled_fmha_batched_forward_fp16.cpp    |   1 +
 .../hip_fmha/ck_tiled_fmha_bwd_setting.h      | 139 +++++
 .../hip_fmha/ck_tiled_fmha_grouped_backward.h | 268 +++++++++
 .../ck_tiled_fmha_grouped_backward_bp16.cpp   |  63 +++
 .../ck_tiled_fmha_grouped_backward_fp16.cpp   |  63 +++
 .../hip_fmha/ck_tiled_fmha_grouped_forward.h  |  84 +--
 .../ck_tiled_fmha_grouped_forward_bp16.cpp    |   1 +
 .../ck_tiled_fmha_grouped_forward_fp16.cpp    |   1 +
 .../attention/hip_fmha/ck_tiled_fmha_params.h |  65 ++-
 .../hip_fmha/ck_tiled_headdim_switch.h        |  16 +
 ...p16_no_causalmask_no_attnbias_maxk_128.cpp |  15 +
 ...bp16_no_causalmask_no_attnbias_maxk_32.cpp |  15 +
 ...bp16_no_causalmask_no_attnbias_maxk_64.cpp |  15 +
 ...6_no_causalmask_with_attnbias_maxk_128.cpp |  15 +
 ...16_no_causalmask_with_attnbias_maxk_32.cpp |  15 +
 ...16_no_causalmask_with_attnbias_maxk_64.cpp |  15 +
 ...6_with_causalmask_no_attnbias_maxk_128.cpp |  15 +
 ...16_with_causalmask_no_attnbias_maxk_32.cpp |  15 +
 ...16_with_causalmask_no_attnbias_maxk_64.cpp |  15 +
 ...with_causalmask_with_attnbias_maxk_128.cpp |  15 +
 ..._with_causalmask_with_attnbias_maxk_32.cpp |  15 +
 ..._with_causalmask_with_attnbias_maxk_64.cpp |  15 +
 ...p16_no_causalmask_no_attnbias_maxk_128.cpp |  15 +
 ...fp16_no_causalmask_no_attnbias_maxk_32.cpp |  15 +
 ...fp16_no_causalmask_no_attnbias_maxk_64.cpp |  15 +
 ...6_no_causalmask_with_attnbias_maxk_128.cpp |  15 +
 ...16_no_causalmask_with_attnbias_maxk_32.cpp |  15 +
 ...16_no_causalmask_with_attnbias_maxk_64.cpp |  15 +
 ...6_with_causalmask_no_attnbias_maxk_128.cpp |  15 +
 ...16_with_causalmask_no_attnbias_maxk_32.cpp |  15 +
 ...16_with_causalmask_no_attnbias_maxk_64.cpp |  15 +
 ...with_causalmask_with_attnbias_maxk_128.cpp |  15 +
 ..._with_causalmask_with_attnbias_maxk_32.cpp |  15 +
 ..._with_causalmask_with_attnbias_maxk_64.cpp |  15 +
 ...p16_no_causalmask_no_attnbias_maxk_128.cpp |  15 +
 ...bp16_no_causalmask_no_attnbias_maxk_32.cpp |  15 +
 ...bp16_no_causalmask_no_attnbias_maxk_64.cpp |  15 +
 ...6_no_causalmask_with_attnbias_maxk_128.cpp |  15 +
 ...16_no_causalmask_with_attnbias_maxk_32.cpp |  15 +
 ...16_no_causalmask_with_attnbias_maxk_64.cpp |  15 +
 ...6_with_causalmask_no_attnbias_maxk_128.cpp |  15 +
 ...16_with_causalmask_no_attnbias_maxk_32.cpp |  15 +
 ...16_with_causalmask_no_attnbias_maxk_64.cpp |  15 +
 ...with_causalmask_with_attnbias_maxk_128.cpp |  15 +
 ..._with_causalmask_with_attnbias_maxk_32.cpp |  15 +
 ..._with_causalmask_with_attnbias_maxk_64.cpp |  15 +
 ...p16_no_causalmask_no_attnbias_maxk_128.cpp |  15 +
 ...fp16_no_causalmask_no_attnbias_maxk_32.cpp |  15 +
 ...fp16_no_causalmask_no_attnbias_maxk_64.cpp |  15 +
 ...6_no_causalmask_with_attnbias_maxk_128.cpp |  15 +
 ...16_no_causalmask_with_attnbias_maxk_32.cpp |  15 +
 ...16_no_causalmask_with_attnbias_maxk_64.cpp |  15 +
 ...6_with_causalmask_no_attnbias_maxk_128.cpp |  15 +
 ...16_with_causalmask_no_attnbias_maxk_32.cpp |  15 +
 ...16_with_causalmask_no_attnbias_maxk_64.cpp |  15 +
 ...with_causalmask_with_attnbias_maxk_128.cpp |  15 +
 ..._with_causalmask_with_attnbias_maxk_32.cpp |  15 +
 ..._with_causalmask_with_attnbias_maxk_64.cpp |  15 +
 69 files changed, 2435 insertions(+), 196 deletions(-)
 create mode 100644 xformers/csrc/attention/hip_fmha/attention_backward_generic_ck_tiled.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_backward.h
 create mode 100644 xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_backward_bp16.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_backward_fp16.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/ck_tiled_fmha_bwd_setting.h
 create mode 100644 xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_backward.h
 create mode 100644 xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_backward_bp16.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_backward_fp16.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_backward_bp16_no_causalmask_no_attnbias_maxk_128.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_backward_bp16_no_causalmask_no_attnbias_maxk_32.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_backward_bp16_no_causalmask_no_attnbias_maxk_64.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_backward_bp16_no_causalmask_with_attnbias_maxk_128.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_backward_bp16_no_causalmask_with_attnbias_maxk_32.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_backward_bp16_no_causalmask_with_attnbias_maxk_64.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_backward_bp16_with_causalmask_no_attnbias_maxk_128.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_backward_bp16_with_causalmask_no_attnbias_maxk_32.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_backward_bp16_with_causalmask_no_attnbias_maxk_64.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_backward_bp16_with_causalmask_with_attnbias_maxk_128.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_backward_bp16_with_causalmask_with_attnbias_maxk_32.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_backward_bp16_with_causalmask_with_attnbias_maxk_64.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_backward_fp16_no_causalmask_no_attnbias_maxk_128.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_backward_fp16_no_causalmask_no_attnbias_maxk_32.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_backward_fp16_no_causalmask_no_attnbias_maxk_64.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_backward_fp16_no_causalmask_with_attnbias_maxk_128.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_backward_fp16_no_causalmask_with_attnbias_maxk_32.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_backward_fp16_no_causalmask_with_attnbias_maxk_64.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_backward_fp16_with_causalmask_no_attnbias_maxk_128.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_backward_fp16_with_causalmask_no_attnbias_maxk_32.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_backward_fp16_with_causalmask_no_attnbias_maxk_64.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_backward_fp16_with_causalmask_with_attnbias_maxk_128.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_backward_fp16_with_causalmask_with_attnbias_maxk_32.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_backward_fp16_with_causalmask_with_attnbias_maxk_64.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_backward_bp16_no_causalmask_no_attnbias_maxk_128.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_backward_bp16_no_causalmask_no_attnbias_maxk_32.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_backward_bp16_no_causalmask_no_attnbias_maxk_64.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_backward_bp16_no_causalmask_with_attnbias_maxk_128.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_backward_bp16_no_causalmask_with_attnbias_maxk_32.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_backward_bp16_no_causalmask_with_attnbias_maxk_64.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_backward_bp16_with_causalmask_no_attnbias_maxk_128.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_backward_bp16_with_causalmask_no_attnbias_maxk_32.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_backward_bp16_with_causalmask_no_attnbias_maxk_64.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_backward_bp16_with_causalmask_with_attnbias_maxk_128.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_backward_bp16_with_causalmask_with_attnbias_maxk_32.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_backward_bp16_with_causalmask_with_attnbias_maxk_64.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_backward_fp16_no_causalmask_no_attnbias_maxk_128.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_backward_fp16_no_causalmask_no_attnbias_maxk_32.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_backward_fp16_no_causalmask_no_attnbias_maxk_64.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_backward_fp16_no_causalmask_with_attnbias_maxk_128.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_backward_fp16_no_causalmask_with_attnbias_maxk_32.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_backward_fp16_no_causalmask_with_attnbias_maxk_64.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_backward_fp16_with_causalmask_no_attnbias_maxk_128.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_backward_fp16_with_causalmask_no_attnbias_maxk_32.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_backward_fp16_with_causalmask_no_attnbias_maxk_64.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_backward_fp16_with_causalmask_with_attnbias_maxk_128.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_backward_fp16_with_causalmask_with_attnbias_maxk_32.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_backward_fp16_with_causalmask_with_attnbias_maxk_64.cpp

diff --git a/xformers/csrc/attention/hip_fmha/attention_backward_generic_ck_tiled.cpp b/xformers/csrc/attention/hip_fmha/attention_backward_generic_ck_tiled.cpp
new file mode 100644
index 000000000..8f93269c6
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/attention_backward_generic_ck_tiled.cpp
@@ -0,0 +1,520 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <cmath>
+#include <cstdlib>
+
+#include <ATen/Context.h>
+#include <ATen/ScalarOps.h>
+#include <ATen/Tensor.h>
+#include <ATen/TensorOperators.h>
+#include <c10/cuda/CUDAGuard.h>
+#include <torch/library.h>
+
+#include "ck_fmha_util.h"
+#include "ck_tiled_fmha_params.h"
+
+extern void batched_backward_fp16(
+    BatchedBackwardParams& param,
+    hipStream_t stream);
+extern void batched_backward_bp16(
+    BatchedBackwardParams& param,
+    hipStream_t stream);
+extern void grouped_backward_fp16(
+    GroupedBackwardParams& param,
+    hipStream_t stream);
+extern void grouped_backward_bp16(
+    GroupedBackwardParams& param,
+    hipStream_t stream);
+
+namespace {
+
+std::tuple<at::Tensor, at::Tensor, at::Tensor, at::Tensor>
+efficient_attention_backward_ck(
+    const at::Tensor& grad_out,
+    const at::Tensor& query,
+    const at::Tensor& key,
+    const at::Tensor& value,
+    const c10::optional<at::Tensor>& bias, // additive attention bias
+    // (Mode 1MHK only) [b+1]: cu_seqlens_q[b] contains the
+    // position of the first query token for batch $b
+    const c10::optional<at::Tensor>& seqstart_q,
+    // (Mode 1MHK only) [b+1]: cu_seqlens_k[b] contains the
+    // position of the first key token for batch $b
+    const c10::optional<at::Tensor>& seqstart_k,
+    // (Mode 1MHK only) Maximum sequence length across batches
+    const c10::optional<int64_t> max_seqlen_q_,
+    // (Mode 1MHK only) Maximum sequence length across batches
+    const c10::optional<int64_t> max_seqlen_k_,
+    const c10::optional<at::Tensor>& seqlen_k,
+    const at::Tensor& logsumexp,
+    const at::Tensor& out,
+    double dropout_p, // dropout probability
+    int64_t rng_seed, // seed using for generating random numbers for dropout
+    int64_t rng_offset, // offset into random number sequence
+    int64_t custom_mask_type,
+    const c10::optional<double> scale,
+    const c10::optional<int64_t> window_size) {
+  // ndim
+  TORCH_CHECK(query.dim() == grad_out.dim());
+  TORCH_CHECK(query.dim() == key.dim());
+  TORCH_CHECK(query.dim() == value.dim());
+  TORCH_CHECK(query.dim() == 4);
+
+  // batch size
+  TORCH_CHECK(query.size(0) == grad_out.size(0));
+  TORCH_CHECK(query.size(0) == key.size(0));
+  TORCH_CHECK(query.size(0) == value.size(0));
+
+  // seqlen
+  TORCH_CHECK(key.size(1) == value.size(1));
+  TORCH_CHECK(query.size(1) == grad_out.size(1));
+
+  // Num heads
+  TORCH_CHECK(query.size(2) % key.size(2) == 0);
+  TORCH_CHECK(key.size(2) == value.size(2));
+  TORCH_CHECK(query.size(2) == grad_out.size(2));
+
+  // Embedding per head
+  TORCH_CHECK(query.size(3) == key.size(3));
+  TORCH_CHECK(value.size(3) == grad_out.size(3));
+
+  // CK-FlashAttn requires out, grad_out to have same shapes
+  TORCH_CHECK(out.sizes() == grad_out.sizes());
+
+  // last dim is contiguous, device is CUDA
+  CHECK_NOSPARSE_LASTCONTIGUOUS_CUDA(out);
+  CHECK_NOSPARSE_LASTCONTIGUOUS_CUDA(grad_out);
+  CHECK_NOSPARSE_LASTCONTIGUOUS_CUDA(query);
+  CHECK_NOSPARSE_LASTCONTIGUOUS_CUDA(key);
+  CHECK_NOSPARSE_LASTCONTIGUOUS_CUDA(value);
+
+  // logsumexp should be completely contiguous
+  CHECK_NOSPARSE_CONTIGUOUS_CUDA(logsumexp);
+
+  TORCH_CHECK(seqstart_q.has_value() == seqstart_k.has_value());
+  TORCH_CHECK(
+      !(seqstart_q.has_value() && bias.has_value()),
+      "seqstart_q + bias not supported");
+
+  if (seqstart_q.has_value()) {
+    TORCH_CHECK(seqstart_q->scalar_type() == at::ScalarType::Int);
+    TORCH_CHECK(seqstart_k->scalar_type() == at::ScalarType::Int);
+    TORCH_CHECK(seqstart_q->dim() == 1 && seqstart_k->dim() == 1);
+    CHECK_NOSPARSE_CONTIGUOUS_CUDA((*seqstart_q));
+    CHECK_NOSPARSE_CONTIGUOUS_CUDA((*seqstart_k));
+    TORCH_CHECK(seqstart_q->size(0) == seqstart_k->size(0));
+    TORCH_CHECK(query.size(0) == 1, "seqstart_q only supports batch_size=1");
+    TORCH_CHECK(max_seqlen_q_.has_value());
+    TORCH_CHECK(max_seqlen_k_.has_value());
+  }
+
+  // at::cuda::CUDAGuard device_guard(query.device());
+  hipStream_t stream = at::cuda::getCurrentHIPStream().stream();
+
+  int64_t B = query.size(0);
+  int64_t M = query.size(1);
+  int64_t N = key.size(1);
+  int64_t Hq = query.size(2);
+  int64_t Hkv = key.size(2);
+  int64_t K = query.size(3);
+  int64_t Kv = value.size(3);
+
+  auto opts = query.options();
+
+  at::Tensor grad_q, grad_k, grad_v, grad_bias;
+
+  if (query.size(1) == key.size(1) && query.size(3) == value.size(3) &&
+      query.size(2) == key.size(2) &&
+      query.storage().is_alias_of(key.storage()) &&
+      query.storage().is_alias_of(value.storage())) {
+    // Create one big contiguous chunk for grad_q, grad_k, grad_v
+    // This is because q, k and v usually come from a single
+    // output of a linear layer that is chunked.
+    // Creating the gradients with the right layout saves us
+    // a `torch.cat` call in the backward pass
+    at::Tensor chunk = at::empty({B, M, 3, Hq, K}, opts);
+    grad_q = chunk.select(2, 0);
+    grad_k = chunk.select(2, 1);
+    grad_v = chunk.select(2, 2);
+    grad_q.fill_(0);
+  } else if (
+      key.size(3) == value.size(3) &&
+      key.storage().is_alias_of(value.storage())) {
+    // Create one big contiguous chunk for grad_k, grad_v
+    // This is because k and v usually come from a single
+    // output of a linear layer that is chunked.
+    // Creating the gradients with the right layout saves us
+    // a `torch.cat` call in the backward pass
+    at::Tensor chunk = at::empty({B, N, 2, Hkv, Kv}, opts);
+    grad_k = chunk.select(2, 0);
+    grad_v = chunk.select(2, 1);
+
+    grad_q = at::empty_strided(query.sizes(), query.strides(), query.options());
+    grad_q.fill_(0);
+  } else {
+    grad_q = at::empty_strided(query.sizes(), query.strides(), query.options());
+    grad_k = at::empty_strided(key.sizes(), key.strides(), key.options());
+    grad_v = at::empty_strided(value.sizes(), value.strides(), value.options());
+    grad_q.fill_(0);
+  }
+
+  // CK-FlashAttn requires q/k/v to have same shapes with dQ/dK/dV respectively
+  TORCH_CHECK(query.sizes() == grad_q.sizes());
+  TORCH_CHECK(query.strides() == grad_q.strides());
+  TORCH_CHECK(key.sizes() == grad_k.sizes());
+  TORCH_CHECK(key.strides() == grad_k.strides());
+  TORCH_CHECK(value.sizes() == grad_v.sizes());
+  TORCH_CHECK(value.strides() == grad_v.strides());
+
+  const bool bias_requires_grad = bias.has_value() && bias->requires_grad();
+
+  // even it is an output, the grad_bias is required to use the same data-type
+  // as bias in CK-FlashAttn
+  if (bias_requires_grad)
+    grad_bias =
+        at::empty_strided(bias->sizes(), bias->strides(), bias->options());
+
+  bool is_mqa_gqa = (Hq > Hkv);
+
+  at::Tensor tmp_grad_k, tmp_grad_v;
+
+  if (is_mqa_gqa) {
+    // allocate tmp_grad_k/tmp_grad_v which will be reduce to
+    // grad_k/grad_v for returning
+    tmp_grad_k = at::empty({B, N, Hq, K}, opts);
+    tmp_grad_v = at::empty({B, N, Hq, Kv}, opts);
+  }
+
+  auto dot_out = at::empty_like(logsumexp);
+
+  auto set_batched_backward_params = [&](BatchedBackwardParams& p) {
+    p.B = B;
+    p.M = M;
+    p.N = N;
+    p.Hq = Hq;
+    p.Hkv = Hkv;
+    p.K = K;
+    p.Kv = Kv;
+
+    p.is_mqa_gqa = is_mqa_gqa;
+
+    TORCH_CHECK(p.B == logsumexp.size(0));
+    TORCH_CHECK(p.Hq == logsumexp.size(1));
+    TORCH_CHECK(p.M == logsumexp.size(2));
+
+    if (scale.has_value()) {
+      p.scale = float(*scale);
+    } else {
+      p.scale = float(1.0 / std::sqrt(float(K)));
+    }
+
+    p.q_ptr = query.data_ptr();
+    p.k_ptr = key.data_ptr();
+    p.v_ptr = value.data_ptr();
+    p.grad_out_ptr = grad_out.data_ptr();
+    p.out_ptr = out.data_ptr();
+
+    p.grad_q_ptr = grad_q.data_ptr();
+    p.grad_k_ptr = is_mqa_gqa ? tmp_grad_k.data_ptr() : grad_k.data_ptr();
+    p.grad_v_ptr = is_mqa_gqa ? tmp_grad_v.data_ptr() : grad_v.data_ptr();
+
+    p.q_strides = {
+        static_cast<int>(query.stride(0)),
+        static_cast<int>(query.stride(1)),
+        static_cast<int>(query.stride(2)),
+        static_cast<int>(query.stride(3))};
+    p.k_strides = {
+        static_cast<int>(key.stride(0)),
+        static_cast<int>(key.stride(1)),
+        static_cast<int>(key.stride(2)),
+        static_cast<int>(key.stride(3))};
+    p.v_strides = {
+        static_cast<int>(value.stride(0)),
+        static_cast<int>(value.stride(1)),
+        static_cast<int>(value.stride(2)),
+        static_cast<int>(value.stride(3))};
+    p.out_strides = {
+        static_cast<int>(out.stride(0)),
+        static_cast<int>(out.stride(1)),
+        static_cast<int>(out.stride(2)),
+        static_cast<int>(out.stride(3))};
+    p.grad_out_strides = {
+        static_cast<int>(grad_out.stride(0)),
+        static_cast<int>(grad_out.stride(1)),
+        static_cast<int>(grad_out.stride(2)),
+        static_cast<int>(grad_out.stride(3))};
+
+    p.lsed_strides = {
+        static_cast<int>(logsumexp.stride(0)),
+        static_cast<int>(logsumexp.stride(1)),
+        static_cast<int>(logsumexp.stride(2))};
+
+    if (is_mqa_gqa) {
+      p.grad_k_strides = {
+          static_cast<int>(tmp_grad_k.stride(0)),
+          static_cast<int>(tmp_grad_k.stride(1)),
+          static_cast<int>(tmp_grad_k.stride(2)),
+          static_cast<int>(tmp_grad_k.stride(3))};
+      p.grad_v_strides = {
+          static_cast<int>(tmp_grad_v.stride(0)),
+          static_cast<int>(tmp_grad_v.stride(1)),
+          static_cast<int>(tmp_grad_v.stride(2)),
+          static_cast<int>(tmp_grad_v.stride(3))};
+    } else {
+      p.grad_k_strides = {
+          static_cast<int>(grad_k.stride(0)),
+          static_cast<int>(grad_k.stride(1)),
+          static_cast<int>(grad_k.stride(2)),
+          static_cast<int>(grad_k.stride(3))};
+      p.grad_v_strides = {
+          static_cast<int>(grad_v.stride(0)),
+          static_cast<int>(grad_v.stride(1)),
+          static_cast<int>(grad_v.stride(2)),
+          static_cast<int>(grad_v.stride(3))};
+    };
+
+    if (bias.has_value()) {
+      CHECK_NOSPARSE_LASTCONTIGUOUS_CUDA((*bias));
+      TORCH_CHECK(bias->scalar_type() == query.scalar_type());
+
+      p.has_attn_bias = true;
+      p.attn_bias_ptr = bias->data_ptr();
+
+      const at::Tensor bias_4d_view = get_bias_4d_view(*bias, B, Hq, M, N);
+
+      p.attn_bias_strides = {
+          static_cast<int>(bias_4d_view.stride(0)),
+          static_cast<int>(bias_4d_view.stride(1)),
+          static_cast<int>(bias_4d_view.stride(2)),
+          static_cast<int>(bias_4d_view.stride(3))};
+
+      if (bias_requires_grad)
+        p.grad_bias_ptr = grad_bias.data_ptr();
+    } else {
+      p.has_attn_bias = true;
+      p.attn_bias_ptr = nullptr;
+      p.grad_bias_ptr = nullptr;
+    }
+
+    p.bias_has_grad = bias_requires_grad;
+
+    p.custom_mask_type = custom_mask_type;
+    p.window_size =
+        window_size.has_value() ? (*window_size > 0 ? *window_size : 0) : 0;
+
+    p.dropout_prob = static_cast<float>(dropout_p);
+    p.philox_seed = rng_seed;
+    p.philox_offset = rng_offset;
+
+    p.logsumexp_ptr = logsumexp.data_ptr();
+    p.dot_out_ptr = dot_out.data_ptr();
+  };
+
+  auto set_grouped_backward_params = [&](GroupedBackwardParams& p) {
+    p.num_batches = seqstart_q->size(0) - 1;
+    p.M = M;
+    p.N = N;
+    p.Hq = Hq;
+    p.Hkv = Hkv;
+    p.K = K;
+    p.Kv = Kv;
+
+    p.is_mqa_gqa = is_mqa_gqa;
+
+    p.max_seqlen_q = *max_seqlen_q_;
+    p.max_seqlen_k = *max_seqlen_k_;
+
+    TORCH_CHECK(p.num_batches == logsumexp.size(0));
+    TORCH_CHECK(p.Hq == logsumexp.size(1));
+    TORCH_CHECK(p.max_seqlen_q == logsumexp.size(2));
+
+    if (scale.has_value())
+      p.scale = float(*scale);
+    else
+      p.scale = float(1.0 / std::sqrt(float(K)));
+
+    p.q_strides = {
+        static_cast<int>(query.stride(1)),
+        static_cast<int>(query.stride(2)),
+        static_cast<int>(query.stride(3))};
+    p.k_strides = {
+        static_cast<int>(key.stride(1)),
+        static_cast<int>(key.stride(2)),
+        static_cast<int>(key.stride(3))};
+    p.v_strides = {
+        static_cast<int>(value.stride(1)),
+        static_cast<int>(value.stride(2)),
+        static_cast<int>(value.stride(3))};
+    p.out_strides = {
+        static_cast<int>(out.stride(1)),
+        static_cast<int>(out.stride(2)),
+        static_cast<int>(out.stride(3))};
+    p.grad_out_strides = {
+        static_cast<int>(grad_out.stride(1)),
+        static_cast<int>(grad_out.stride(2)),
+        static_cast<int>(grad_out.stride(3))};
+
+    p.lsed_strides = {
+        static_cast<int>(logsumexp.stride(0)),
+        static_cast<int>(logsumexp.stride(1)),
+        static_cast<int>(logsumexp.stride(2))};
+
+    if (is_mqa_gqa) {
+      p.grad_k_strides = {
+          static_cast<int>(tmp_grad_k.stride(1)),
+          static_cast<int>(tmp_grad_k.stride(2)),
+          static_cast<int>(tmp_grad_k.stride(3))};
+      p.grad_v_strides = {
+          static_cast<int>(tmp_grad_v.stride(1)),
+          static_cast<int>(tmp_grad_v.stride(2)),
+          static_cast<int>(tmp_grad_v.stride(3))};
+    } else {
+      p.grad_k_strides = {
+          static_cast<int>(grad_k.stride(1)),
+          static_cast<int>(grad_k.stride(2)),
+          static_cast<int>(grad_k.stride(3))};
+      p.grad_v_strides = {
+          static_cast<int>(grad_v.stride(1)),
+          static_cast<int>(grad_v.stride(2)),
+          static_cast<int>(grad_v.stride(3))};
+    };
+
+    if (bias.has_value()) {
+      CHECK_NOSPARSE_LASTCONTIGUOUS_CUDA((*bias));
+      TORCH_CHECK(bias->scalar_type() == query.scalar_type());
+
+      p.has_attn_bias = true;
+      const at::Tensor bias_4d_view = get_bias_4d_view(*bias, B, Hq, M, N);
+      p.attn_bias_strides = {
+          static_cast<int>(bias_4d_view.stride(0)),
+          static_cast<int>(bias_4d_view.stride(1)),
+          static_cast<int>(bias_4d_view.stride(2)),
+          static_cast<int>(bias_4d_view.stride(3))};
+    } else
+      p.has_attn_bias = false;
+
+    p.bias_has_grad = bias_requires_grad;
+
+    p.custom_mask_type = custom_mask_type;
+    p.window_size =
+        window_size.has_value() ? (*window_size > 0 ? *window_size : 0) : 0;
+
+    // interesting: the tensors have to be defined here, moving to more local
+    // scope will cause issue
+    at::Tensor dev_seqstart_q;
+    at::Tensor dev_seqstart_k;
+    at::Tensor dev_seqlen_k;
+
+    if (seqstart_q->is_cpu()) {
+      dev_seqstart_q = at::empty({p.num_batches + 1}, opts.dtype(at::kInt));
+      p.seqstart_q_dev_ptr = dev_seqstart_q.data_ptr();
+      HIP_CALL_CHECK(hipMemcpyAsync(
+          p.seqstart_q_dev_ptr,
+          seqstart_q->data_ptr(),
+          (p.num_batches + 1) * sizeof(int),
+          hipMemcpyHostToDevice,
+          stream));
+    } else
+      p.seqstart_q_dev_ptr = seqstart_q->data_ptr();
+
+    if (seqstart_k->is_cpu()) {
+      dev_seqstart_k = at::empty({p.num_batches + 1}, opts.dtype(at::kInt));
+
+      p.seqstart_k_dev_ptr = dev_seqstart_k.data_ptr();
+      HIP_CALL_CHECK(hipMemcpyAsync(
+          p.seqstart_k_dev_ptr,
+          seqstart_k->data_ptr(),
+          (p.num_batches + 1) * sizeof(int),
+          hipMemcpyHostToDevice,
+          stream));
+    } else
+      p.seqstart_k_dev_ptr = seqstart_k->data_ptr();
+
+    if (seqlen_k.has_value()) {
+      TORCH_CHECK(seqlen_k->scalar_type() == at::ScalarType::Int);
+      TORCH_CHECK(seqlen_k->dim() == 1);
+      TORCH_CHECK(seqlen_k->size(0) == p.num_batches)
+
+      if (seqlen_k->is_cpu()) {
+        dev_seqlen_k = at::empty({p.num_batches}, opts.dtype(at::kInt));
+
+        p.seqlen_k_dev_ptr = dev_seqlen_k.data_ptr();
+        HIP_CALL_CHECK(hipMemcpyAsync(
+            p.seqlen_k_dev_ptr,
+            seqlen_k->data_ptr(),
+            p.num_batches * sizeof(int),
+            hipMemcpyHostToDevice,
+            stream));
+      } else
+        p.seqlen_k_dev_ptr = seqlen_k->data_ptr();
+    } else
+      p.seqlen_k_dev_ptr = nullptr;
+
+    p.dropout_prob = static_cast<float>(dropout_p);
+    p.philox_seed = rng_seed;
+    p.philox_offset = rng_offset;
+
+    p.q_ptr = query.data_ptr();
+    p.k_ptr = key.data_ptr();
+    p.v_ptr = value.data_ptr();
+
+    p.out_ptr = out.data_ptr();
+    p.grad_out_ptr = grad_out.data_ptr();
+    p.attn_bias_ptr = bias.has_value() ? bias->data_ptr() : nullptr;
+
+    p.logsumexp_ptr = logsumexp.data_ptr();
+    p.dot_out_ptr = dot_out.data_ptr();
+
+    p.grad_q_ptr = grad_q.data_ptr();
+    p.grad_k_ptr = is_mqa_gqa ? tmp_grad_k.data_ptr() : grad_k.data_ptr();
+    p.grad_v_ptr = is_mqa_gqa ? tmp_grad_v.data_ptr() : grad_v.data_ptr();
+    p.grad_bias_ptr = bias_requires_grad ? grad_bias.data_ptr() : nullptr;
+  };
+
+  auto inDataType = query.scalar_type();
+
+  if (!seqstart_q.has_value()) { // input is batched
+    BatchedBackwardParams batched_backward_params;
+
+    set_batched_backward_params(batched_backward_params);
+
+    if (inDataType == at::ScalarType::Half) {
+      batched_backward_fp16(batched_backward_params, stream);
+    } else if (inDataType == at::ScalarType::BFloat16) {
+      batched_backward_bp16(batched_backward_params, stream);
+    } else
+      throw std::runtime_error("input data-type is not supported");
+  } else { // input is grouped
+    GroupedBackwardParams grouped_backward_params;
+
+    set_grouped_backward_params(grouped_backward_params);
+
+    if (inDataType == at::ScalarType::Half) {
+      grouped_backward_fp16(grouped_backward_params, stream);
+    } else if (inDataType == at::ScalarType::BFloat16) {
+      grouped_backward_bp16(grouped_backward_params, stream);
+    } else
+      throw std::runtime_error("input data-type is not supported");
+  }
+
+  if (is_mqa_gqa) {
+    auto tmp_grad_k_view = tmp_grad_k.unflatten(2, {Hkv, Hq / Hkv});
+    auto tmp_grad_v_view = tmp_grad_v.unflatten(2, {Hkv, Hq / Hkv});
+    grad_k = tmp_grad_k_view.sum(3);
+    grad_v = tmp_grad_v_view.sum(3);
+  }
+
+  return std::make_tuple(grad_q, grad_k, grad_v, grad_bias);
+}
+
+} // namespace
+
+TORCH_LIBRARY_IMPL(xformers, CUDA, m) {
+  m.impl(
+      TORCH_SELECTIVE_NAME("xformers::efficient_attention_backward_ck"),
+      TORCH_FN(efficient_attention_backward_ck));
+}
diff --git a/xformers/csrc/attention/hip_fmha/attention_forward_decoder.cpp b/xformers/csrc/attention/hip_fmha/attention_forward_decoder.cpp
index 786dfec0b..6fe0137b0 100644
--- a/xformers/csrc/attention/hip_fmha/attention_forward_decoder.cpp
+++ b/xformers/csrc/attention/hip_fmha/attention_forward_decoder.cpp
@@ -270,9 +270,9 @@ int main(int argc, char** argv) {
     const int32_t n_heads = std::stoi(args[3]);
     const int32_t n_groups = 1;
     const int32_t multiquery = (args[4] == "mq");
-    const auto dtype = (args[5] == "f32") ? torch::kFloat32
-        : (args[5] == "f16")              ? torch::kFloat16
-                                          : torch::kBFloat16;
+    const auto dtype = (args[5] == "f32")
+        ? torch::kFloat32
+        : (args[5] == "f16") ? torch::kFloat16 : torch::kBFloat16;
     const int32_t n_wavefronts_per_block = std::stoi(args[6]);
 
     const int32_t dim_per_head = 4 * kThreadsPerWavefront;
diff --git a/xformers/csrc/attention/hip_fmha/attention_forward_generic_ck_tiled.cpp b/xformers/csrc/attention/hip_fmha/attention_forward_generic_ck_tiled.cpp
index a56b87f73..88e195c2d 100644
--- a/xformers/csrc/attention/hip_fmha/attention_forward_generic_ck_tiled.cpp
+++ b/xformers/csrc/attention/hip_fmha/attention_forward_generic_ck_tiled.cpp
@@ -124,7 +124,6 @@ efficient_attention_forward_ck(
   int64_t philox_offset;
 
   if (use_dropout) {
-    /*
     at::PhiloxCudaState rng_engine_inputs;
     at::CUDAGeneratorImpl* gen =
         at::get_generator_or_default<at::CUDAGeneratorImpl>(
@@ -139,9 +138,6 @@ efficient_attention_forward_ck(
 
     philox_seed = std::get<0>(seeds);
     philox_offset = std::get<1>(seeds);
-    */
-    throw std::runtime_error(
-        "drop-out is currently not implemented by ck-tiled!");
   }
 
   auto set_batched_forward_params = [&](BatchedForwardParams& p) {
@@ -212,17 +208,21 @@ efficient_attention_forward_ck(
 
     // the following parameters are only used by training forward
     if (p.use_dropout) {
-      // p.dropout_prob = static_cast<float>(dropout_p);
-      throw std::runtime_error(
-          "drop-out is currently not implemented by ck-tiled!");
+      p.dropout_prob = static_cast<float>(dropout_p);
     } else
       p.dropout_prob = 0.0f;
 
     if (p.compute_logsumexp) {
       logsumexp = at::empty({B, Hq, M}, opts.dtype(at::kFloat));
       p.logsumexp_ptr = logsumexp.data_ptr();
-    } else
+      p.lse_strides = {
+          static_cast<int>(logsumexp.stride(0)),
+          static_cast<int>(logsumexp.stride(1)),
+          static_cast<int>(logsumexp.stride(2))};
+    } else {
       p.logsumexp_ptr = nullptr;
+      p.lse_strides = {0, 0, 0};
+    }
   };
 
   auto set_grouped_forward_params = [&](GroupedForwardParams& p) {
@@ -234,6 +234,8 @@ efficient_attention_forward_ck(
     p.K = K;
     p.Kv = Kv;
 
+    p.max_seqlen_q = *max_seqlen_q_;
+
     if (scale.has_value()) {
       p.scale = float(*scale);
     } else {
@@ -282,9 +284,6 @@ efficient_attention_forward_ck(
     p.window_size =
         window_size.has_value() ? (*window_size > 0 ? *window_size : 0) : 0;
 
-    // max_seqlen_q is used to create logsumexp tensor
-    p.max_seqlen_q = *max_seqlen_q_;
-
     // interesting: the tensors have to be defined here, moving to more local
     // scope will cause issue
     at::Tensor dev_seqstart_q;
@@ -343,9 +342,7 @@ efficient_attention_forward_ck(
 
     // the following parameters are only used by training forward
     if (p.use_dropout) {
-      // p.dropout_prob = static_cast<float>(dropout_p);
-      throw std::runtime_error(
-          "drop-out is currently not implemented by ck-tiled!");
+      p.dropout_prob = static_cast<float>(dropout_p);
     } else
       p.dropout_prob = 0.0f;
 
@@ -353,8 +350,14 @@ efficient_attention_forward_ck(
       logsumexp = at::empty(
           {p.num_batches, Hq, p.max_seqlen_q}, opts.dtype(at::kFloat));
       p.logsumexp_ptr = logsumexp.data_ptr();
-    } else
+      p.lse_strides = {
+          static_cast<int>(logsumexp.stride(0)),
+          static_cast<int>(logsumexp.stride(1)),
+          static_cast<int>(logsumexp.stride(2))};
+    } else {
       p.logsumexp_ptr = nullptr;
+      p.lse_strides = {0, 0, 0};
+    }
   };
 
   auto inDataType = query.scalar_type();
@@ -379,9 +382,6 @@ efficient_attention_forward_ck(
         batched_forward_bp16(batched_forward_params, stream);
       } else
         throw std::runtime_error("input data-type is not supported!");
-
-      throw std::runtime_error(
-          "drop-out and compuate logsumexp currently not implemented by ck-tiled!");
     };
   } else { // input is grouped
     GroupedForwardParams grouped_forward_params;
@@ -403,9 +403,6 @@ efficient_attention_forward_ck(
         grouped_forward_bp16(grouped_forward_params, stream);
       } else
         throw std::runtime_error("input data-type is not supported!");
-
-      throw std::runtime_error(
-          "drop-out and compuate logsumexp currently not implemented by ck-tiled!");
     };
   };
 
diff --git a/xformers/csrc/attention/hip_fmha/attention_forward_splitk.cpp b/xformers/csrc/attention/hip_fmha/attention_forward_splitk.cpp
index ea4e3505f..0c2740063 100644
--- a/xformers/csrc/attention/hip_fmha/attention_forward_splitk.cpp
+++ b/xformers/csrc/attention/hip_fmha/attention_forward_splitk.cpp
@@ -555,22 +555,22 @@ struct FMHADecoderSplitAttentionDeviceOp : public BaseOperator {
                     kMaxKVSequenceLength,
                     compute_t>
               : Q_size_k_alignment_necessary == 2
-              ? efficient_attention_forward_decoder_splitk_ck_kernel<
-                    scalar_t,
-                    2,
-                    kLoopUnroll,
-                    kLoopUnrollTail,
-                    kMaxKVSequenceLength,
-                    compute_t>
-              : Q_size_k_alignment_necessary == 1
-              ? efficient_attention_forward_decoder_splitk_ck_kernel<
-                    scalar_t,
-                    1,
-                    kLoopUnroll,
-                    kLoopUnrollTail,
-                    kMaxKVSequenceLength,
-                    compute_t>
-              : nullptr,
+                  ? efficient_attention_forward_decoder_splitk_ck_kernel<
+                        scalar_t,
+                        2,
+                        kLoopUnroll,
+                        kLoopUnrollTail,
+                        kMaxKVSequenceLength,
+                        compute_t>
+                  : Q_size_k_alignment_necessary == 1
+                      ? efficient_attention_forward_decoder_splitk_ck_kernel<
+                            scalar_t,
+                            1,
+                            kLoopUnroll,
+                            kLoopUnrollTail,
+                            kMaxKVSequenceLength,
+                            compute_t>
+                      : nullptr,
           arg.grid_dim,
           arg.block_dim,
           arg.lds_bytes,
@@ -728,14 +728,14 @@ struct FMHADecoderSplitReduceDeviceOp : public BaseOperator {
                     scalar_t,
                     4>
               : O_size_k_alignment_necessary == 2
-              ? efficient_attention_forward_decoder_splitk_reduce_ck_kernel<
-                    scalar_t,
-                    2>
-              : O_size_k_alignment_necessary == 1
-              ? efficient_attention_forward_decoder_splitk_reduce_ck_kernel<
-                    scalar_t,
-                    1>
-              : nullptr,
+                  ? efficient_attention_forward_decoder_splitk_reduce_ck_kernel<
+                        scalar_t,
+                        2>
+                  : O_size_k_alignment_necessary == 1
+                      ? efficient_attention_forward_decoder_splitk_reduce_ck_kernel<
+                            scalar_t,
+                            1>
+                      : nullptr,
           reduce_gridsize,
           reduce_blocksize,
           reduce_lds_bytes,
@@ -1114,9 +1114,9 @@ int main(int argc, char** argv) {
     const int32_t batch_size = std::stoi(args[1]);
     const int32_t nq_heads = std::stoi(args[2]);
     const int32_t nkv_heads = std::stoi(args[3]);
-    const auto dtype = (args[4] == "f32") ? torch::kFloat32
-        : (args[4] == "f16")              ? torch::kFloat16
-                                          : torch::kBFloat16;
+    const auto dtype = (args[4] == "f32")
+        ? torch::kFloat32
+        : (args[4] == "f16") ? torch::kFloat16 : torch::kBFloat16;
     const int32_t n_wavefronts_per_block = std::stoi(args[5]);
 
     auto [Q, K, V, seq] =
diff --git a/xformers/csrc/attention/hip_fmha/ck_attention_forward_decoder.h b/xformers/csrc/attention/hip_fmha/ck_attention_forward_decoder.h
index 20b3b8979..57d54eda2 100644
--- a/xformers/csrc/attention/hip_fmha/ck_attention_forward_decoder.h
+++ b/xformers/csrc/attention/hip_fmha/ck_attention_forward_decoder.h
@@ -458,10 +458,12 @@ struct FMHADecoderSeqlen1DeviceOp : public BaseOperator {
           Q_size_k_alignment_necessary == 4
               ? efficient_attention_forward_decoder_ck_kernel<scalar_t, 4>
               : Q_size_k_alignment_necessary == 2
-              ? efficient_attention_forward_decoder_ck_kernel<scalar_t, 2>
-              : Q_size_k_alignment_necessary == 1
-              ? efficient_attention_forward_decoder_ck_kernel<scalar_t, 1>
-              : nullptr,
+                  ? efficient_attention_forward_decoder_ck_kernel<scalar_t, 2>
+                  : Q_size_k_alignment_necessary == 1
+                      ? efficient_attention_forward_decoder_ck_kernel<
+                            scalar_t,
+                            1>
+                      : nullptr,
           arg.grid_dim,
           arg.block_dim,
           arg.lds_bytes,
diff --git a/xformers/csrc/attention/hip_fmha/ck_attention_forward_decoder_splitk.h b/xformers/csrc/attention/hip_fmha/ck_attention_forward_decoder_splitk.h
index 65c27603d..3efe1385c 100644
--- a/xformers/csrc/attention/hip_fmha/ck_attention_forward_decoder_splitk.h
+++ b/xformers/csrc/attention/hip_fmha/ck_attention_forward_decoder_splitk.h
@@ -622,22 +622,22 @@ struct FMHADecoderSplitKDeviceOp : public BaseOperator {
                     KV_M_MAX,
                     compute_t>
               : Q_size_k_alignment_necessary == 2
-              ? efficient_attention_forward_decoder_splitk_ck_kernel<
-                    scalar_t,
-                    /* vec_size */ 2,
-                    n_loop_unroll,
-                    n_loop_unroll_tail,
-                    KV_M_MAX,
-                    compute_t>
-              : Q_size_k_alignment_necessary == 1
-              ? efficient_attention_forward_decoder_splitk_ck_kernel<
-                    scalar_t,
-                    /* vec_size */ 1,
-                    n_loop_unroll,
-                    n_loop_unroll_tail,
-                    KV_M_MAX,
-                    compute_t>
-              : nullptr,
+                  ? efficient_attention_forward_decoder_splitk_ck_kernel<
+                        scalar_t,
+                        /* vec_size */ 2,
+                        n_loop_unroll,
+                        n_loop_unroll_tail,
+                        KV_M_MAX,
+                        compute_t>
+                  : Q_size_k_alignment_necessary == 1
+                      ? efficient_attention_forward_decoder_splitk_ck_kernel<
+                            scalar_t,
+                            /* vec_size */ 1,
+                            n_loop_unroll,
+                            n_loop_unroll_tail,
+                            KV_M_MAX,
+                            compute_t>
+                      : nullptr,
           arg.grid_dim,
           arg.block_dim,
           arg.lds_bytes,
@@ -676,14 +676,14 @@ struct FMHADecoderSplitKDeviceOp : public BaseOperator {
                     scalar_t,
                     4>
               : Q_size_k_alignment_necessary == 2
-              ? efficient_attention_forward_decoder_splitk_reduce_ck_kernel<
-                    scalar_t,
-                    2>
-              : Q_size_k_alignment_necessary == 1
-              ? efficient_attention_forward_decoder_splitk_reduce_ck_kernel<
-                    scalar_t,
-                    1>
-              : nullptr,
+                  ? efficient_attention_forward_decoder_splitk_reduce_ck_kernel<
+                        scalar_t,
+                        2>
+                  : Q_size_k_alignment_necessary == 1
+                      ? efficient_attention_forward_decoder_splitk_reduce_ck_kernel<
+                            scalar_t,
+                            1>
+                      : nullptr,
           reduce_gridsize,
           reduce_blocksize,
           reduce_lds_bytes,
diff --git a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_backward.h b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_backward.h
new file mode 100644
index 000000000..84ea5f423
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_backward.h
@@ -0,0 +1,293 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#pragma once
+
+#include <ck/host_utility/device_prop.hpp>
+#include <ck/host_utility/kernel_launch.hpp>
+#include <ck/tensor/tensor_view.hpp>
+#include <ck/tensor_description/cluster_descriptor.hpp>
+#include <ck/tensor_description/tensor_descriptor_helper.hpp>
+#include <ck/utility/common_header.hpp>
+
+#include <ck/tile_program/block_tile/block_masking.hpp>
+#include <ck/tile_program/block_tile_pipeline/block_fmha_bwd_dot_do_o.hpp>
+#include <ck/tile_program/block_tile_pipeline/block_fmha_bwd_pipeline_dispatcher.hpp>
+#include <ck/tile_program/block_tile_pipeline/block_fmha_bwd_pipeline_problem.hpp>
+#include <ck/tile_program/tile/tile_fmha_shape.hpp>
+#include <ck/tile_program/tile/tile_fmha_traits.hpp>
+
+#include "ck_tiled_bool_switch.h"
+#include "ck_tiled_fmha_bwd_setting.h"
+#include "ck_tiled_fmha_params.h"
+
+#include "ck_tiled_fmha_backward_kernel.hpp"
+#include "ck_tiled_fmha_bwd_epilogue.hpp"
+#include "ck_tiled_fmha_bwd_tile_partitioner.hpp"
+#include "ck_tiled_fmha_definitions.hpp"
+
+template <
+    typename scalar_t,
+    bool has_causal_mask,
+    bool has_attn_bias,
+    ck::index_t MaxK>
+struct batched_backward_causalmask_attnbias_dispatched {
+  using FmhaBwdEpilogue_ = FmhaBwdEpilogue<FmhaBwdEpilogueProblem<
+      typename FmhaBwdTypeConfig<scalar_t>::AccDataType,
+      typename FmhaBwdTypeConfig<scalar_t>::KGradDataType,
+      typename FmhaBwdTypeConfig<scalar_t>::VGradDataType>>;
+
+  using FmhaBwdLoadStrategy_ = typename FmhaBwdLoadStrategy<MaxK>::type;
+
+  template <typename FmhaTraits, typename FmhaMask>
+  using FmhaBwdPipelineProblemTemp =
+      ck::tile_program::block::BlockFmhaBwdPipelineProblem<
+          typename FmhaBwdTypeConfig<scalar_t>::QDataType,
+          typename FmhaBwdTypeConfig<scalar_t>::KDataType,
+          typename FmhaBwdTypeConfig<scalar_t>::VDataType,
+          typename FmhaBwdTypeConfig<scalar_t>::GemmDataType,
+          typename FmhaBwdTypeConfig<scalar_t>::LSEDataType,
+          typename FmhaBwdTypeConfig<scalar_t>::AccDataType,
+          typename FmhaBwdTypeConfig<scalar_t>::DDataType,
+          typename FmhaBwdTypeConfig<scalar_t>::RandValOutputDataType,
+          typename FmhaBwdTypeConfig<scalar_t>::BiasDataType,
+          typename FmhaBwdTypeConfig<scalar_t>::ODataType,
+          typename FmhaBwdTypeConfig<scalar_t>::OGradDataType,
+          typename FmhaBwdTypeConfig<scalar_t>::QGradDataType,
+          typename FmhaBwdTypeConfig<scalar_t>::KGradDataType,
+          typename FmhaBwdTypeConfig<scalar_t>::VGradDataType,
+          typename FmhaBwdTypeConfig<scalar_t>::BiasGradDataType,
+          FmhaBwdShape<MaxK>,
+          false, // kIsGroupMode
+          FmhaMask,
+          FmhaTraits>;
+
+  static void Run(BatchedBackwardParams& param, hipStream_t stream) {
+    {
+      constexpr ck::index_t kBlockSize = 256;
+
+      const bool pad_seqlen_q = !(param.M % kBlockSize == 0);
+      const bool pad_headdim_v =
+          !(param.Kv % FmhaBwdShape<MaxK>::kVHeaddim == 0);
+
+      BOOL_SWITCH_2(
+          pad_seqlen_q, kPadSeqLenQ, pad_headdim_v, kPadHeadDimV, [&] {
+            constexpr ck::index_t occupancy = 2;
+
+            using FmhaOGradDotOTraits_ =
+                ck::tile_program::TileFmhaBwdOGradDotOTraits<
+                    kPadSeqLenQ,
+                    kPadHeadDimV,
+                    occupancy>;
+
+            using FmhaBwdOGradDotOPipelineProblem =
+                ck::tile_program::block::BlockFmhaBwdOGradDotOPipelineProblem<
+                    typename FmhaBwdTypeConfig<scalar_t>::ODataType,
+                    typename FmhaBwdTypeConfig<scalar_t>::OGradDataType,
+                    typename FmhaBwdTypeConfig<scalar_t>::DDataType,
+                    kBlockSize,
+                    FmhaBwdShape<MaxK>::kVHeaddim,
+                    false, // kIsGroupMode
+                    FmhaOGradDotOTraits_>;
+
+            using FmhaBwdOGradDotOPipeline =
+                typename ck::tile_program::block::BlockFmhaBwdOGradDotO<
+                    FmhaBwdOGradDotOPipelineProblem>;
+
+            using FmhaBwdOGradDotOKernel_ = FmhaBwdOGradDotOKernel<
+                FmhaBwdOGradDotOTilePartitioner<kBlockSize>,
+                FmhaBwdOGradDotOPipeline>;
+
+            RunWithBwdOGradDotOKernel<FmhaBwdOGradDotOKernel_>(param, stream);
+          });
+    }
+
+    {
+      const bool has_local_attention = (param.window_size > 0) ? true : false;
+
+      BOOL_SWITCH(has_local_attention, USE_LOCAL_ATTENTION, [&] {
+        constexpr ck::index_t occupancy = 1;
+        constexpr bool has_masking = has_causal_mask || USE_LOCAL_ATTENTION;
+        const bool has_dropout = (param.dropout_prob > 0.0f);
+
+        using FmhaMask = ck::tile_program::block::
+            GenericAttentionMask<has_masking, USE_LOCAL_ATTENTION>;
+
+        using FmhaBwdShape_ = FmhaBwdShape<MaxK>;
+        using FmhaBwdTilePartitioner_ = FmhaBwdTilePartitioner<FmhaBwdShape_>;
+
+        const bool pad_seqlen_q = !(param.M % FmhaBwdShape_::kM0 == 0);
+        const bool pad_seqlen_k = !(param.N % FmhaBwdShape_::kN0 == 0);
+        // const bool pad_headdim_q = !(param.K % FmhaBwdShape_::kK0 == 0);
+        const bool pad_headdim_v = !(param.Kv % FmhaBwdShape_::kK2 == 0);
+
+        // usually headdim_q and headdim_v are same, consider them together
+        // to determine whether to do padding saving some compiling time
+        // bool pad_headdim = (pad_headdim_q || pad_headdim_v);
+
+        // currently headdim padding is not supported due to some atomic_add
+        // issue with bhalf_t
+        constexpr bool kPadHeadDimQ = false;
+
+        BOOL_SWITCH_4(
+            has_dropout,
+            kHasDropout,
+            pad_seqlen_q,
+            kPadSeqLenQ,
+            pad_seqlen_k,
+            kPadSeqLenK,
+            pad_headdim_v,
+            kPadHeadDimV,
+            [&] {
+              using FmhaBwdTraits_ = ck::tile_program::TileFmhaTraits<
+                  kPadSeqLenQ,
+                  kPadSeqLenK,
+                  kPadHeadDimQ,
+                  kPadHeadDimV,
+                  has_attn_bias,
+                  false, // kStoreLSE
+                  kHasDropout,
+                  occupancy>;
+
+              using FmhaBwdPipelineProblem =
+                  FmhaBwdPipelineProblemTemp<FmhaBwdTraits_, FmhaMask>;
+
+              using FmhaBwdPipeline_ = typename ck::tile_program::block::
+                  BlockFmhaBwdPipelineDispatcher<
+                      FmhaBwdLoadStrategy_,
+                      FmhaBwdPipelineProblem>::BlockPipeline;
+
+              using FmhaBwdKernel_ = FmhaBwdKernel<
+                  FmhaBwdTilePartitioner_,
+                  FmhaBwdPipeline_,
+                  FmhaBwdEpilogue_>;
+
+              RunWithBwdKernel<FmhaBwdKernel_>(param, stream);
+            });
+      });
+    };
+  }
+
+  template <typename FmhaBwdOGradDotOKernel>
+  static void RunWithBwdOGradDotOKernel(
+      BatchedBackwardParams& param,
+      hipStream_t stream) {
+    const auto kargs = [&] {
+      return FmhaBwdOGradDotOKernel::MakeKargs(
+          param.out_ptr,
+          param.grad_out_ptr,
+          param.dot_out_ptr,
+          param.M,
+          param.Kv,
+          param.grad_out_strides[1], // stride_do
+          param.out_strides[1], // stride_o
+          param.out_strides[2], // nhead_stride_do
+          param.out_strides[2], // nhead_stride_o
+          param.lsed_strides[1], // nhead_stride_d
+          param.out_strides[0], // batch_stride_do
+          param.out_strides[0], // batch_stride_o
+          param.lsed_strides[0]); // batch_stride_d
+    }();
+
+    dim3 kGridSize =
+        FmhaBwdOGradDotOKernel::GridSize(param.B, param.Hq, param.M);
+    constexpr dim3 kBlockSize = FmhaBwdOGradDotOKernel::BlockSize();
+    constexpr ck::index_t kBlockPerCu = FmhaBwdOGradDotOKernel::kBlockPerCu;
+
+    (void)launch_kernel<kBlockSize.x, kBlockPerCu>(
+        StreamConfig{stream, false},
+        FmhaBwdOGradDotOKernel{},
+        kGridSize,
+        kBlockSize,
+        0,
+        kargs);
+  }
+
+  template <typename FmhaBwdKernel>
+  static void RunWithBwdKernel(
+      BatchedBackwardParams& param,
+      hipStream_t stream) {
+    const auto kargs = [&] {
+      return FmhaBwdKernel::MakeKargs(
+          param.q_ptr,
+          param.k_ptr,
+          param.v_ptr,
+          param.attn_bias_ptr,
+          param.logsumexp_ptr,
+          param.grad_out_ptr,
+          param.dot_out_ptr,
+          param.grad_q_ptr,
+          param.grad_k_ptr,
+          param.grad_v_ptr,
+          param.grad_bias_ptr,
+          param.M, // seqlen_q
+          param.N, // seqlen_k
+          param.Hq, // nhead_q
+          param.Hkv, // nhead_v
+          param.Hq / param.Hkv,
+          param.scale,
+          param.q_strides[1], // q, k, v, bias, do, o, dk, dv, dbias seq-dim
+                              // stride
+          param.k_strides[1],
+          param.v_strides[1],
+          param.attn_bias_strides[2],
+          param.grad_out_strides[1],
+          param.grad_k_strides[1],
+          param.grad_v_strides[1],
+          param.attn_bias_strides[2], // assume grad_bias has same strides as
+                                      // bias
+          param.q_strides[2], // q, k, v, bias, do, o, lse/dot, dbias
+                              // nhead-dim strides
+          param.k_strides[2],
+          param.v_strides[2],
+          param.attn_bias_strides[1],
+          param.grad_out_strides[2],
+          param.lsed_strides[1],
+          param.attn_bias_strides[1], // assume grad_bias has same strides as
+                                      // bias
+          param.q_strides[0], // q, k, v, bias, do, o, lse/dot, dk, dv, dbias,
+                              // batch-dim strides
+          param.k_strides[0],
+          param.v_strides[0],
+          param.attn_bias_strides[0],
+          param.grad_out_strides[0],
+          param.lsed_strides[0], // lse/dot is in BHM contiguous layout
+          param.grad_k_strides[0],
+          param.grad_v_strides[0],
+          param.attn_bias_strides[0], // assume grad_bias has same strides as
+                                      // bias
+          static_cast<CausalMaskType>(param.custom_mask_type),
+          param.window_size);
+    }();
+
+    dim3 kGridSize = FmhaBwdKernel::GridSize(param.B, param.Hq, param.N);
+    constexpr dim3 kBlockSize = FmhaBwdKernel::BlockSize();
+    constexpr ck::index_t kBlockPerCu = FmhaBwdKernel::kBlockPerCu;
+
+    (void)launch_kernel<kBlockSize.x, kBlockPerCu>(
+        StreamConfig{stream, false},
+        FmhaBwdKernel{},
+        kGridSize,
+        kBlockSize,
+        0,
+        kargs);
+  }
+};
+
+template <
+    typename scalar_t,
+    bool has_causal_mask,
+    bool has_attn_bias,
+    ck::index_t MaxK>
+void run_batched_backward_causalmask_attnbias_dispatched(
+    BatchedBackwardParams& param,
+    hipStream_t stream) {
+  batched_backward_causalmask_attnbias_dispatched<
+      scalar_t,
+      has_causal_mask,
+      has_attn_bias,
+      MaxK>::Run(param, stream);
+};
diff --git a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_backward_bp16.cpp b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_backward_bp16.cpp
new file mode 100644
index 000000000..bbcbe8784
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_backward_bp16.cpp
@@ -0,0 +1,63 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <ck/ck.hpp>
+#include <ck/utility/data_type.hpp>
+#include <stdexcept>
+
+#include "ck_tiled_bool_switch.h"
+#include "ck_tiled_fmha_batched_backward.h"
+#include "ck_tiled_headdim_switch.h"
+
+// clang-format off
+extern template void run_batched_backward_causalmask_attnbias_dispatched<ck::bhalf_t, false, true, 32>(
+    BatchedBackwardParams& param, hipStream_t stream);
+extern template void run_batched_backward_causalmask_attnbias_dispatched<ck::bhalf_t, false, false, 32>(
+    BatchedBackwardParams& param, hipStream_t stream);
+extern template void run_batched_backward_causalmask_attnbias_dispatched<ck::bhalf_t, true, true, 32>(
+    BatchedBackwardParams& param, hipStream_t stream);
+extern template void run_batched_backward_causalmask_attnbias_dispatched<ck::bhalf_t, true, false, 32>(
+    BatchedBackwardParams& param, hipStream_t stream);
+
+extern template void run_batched_backward_causalmask_attnbias_dispatched<ck::bhalf_t, false, true, 64>(
+    BatchedBackwardParams& param, hipStream_t stream);
+extern template void run_batched_backward_causalmask_attnbias_dispatched<ck::bhalf_t, false, false, 64>(
+    BatchedBackwardParams& param, hipStream_t stream);
+extern template void run_batched_backward_causalmask_attnbias_dispatched<ck::bhalf_t, true, true, 64>(
+    BatchedBackwardParams& param, hipStream_t stream);
+extern template void run_batched_backward_causalmask_attnbias_dispatched<ck::bhalf_t, true, false, 64>(
+    BatchedBackwardParams& param, hipStream_t stream);
+
+extern template void run_batched_backward_causalmask_attnbias_dispatched<ck::bhalf_t, false, true, 128>(
+    BatchedBackwardParams& param, hipStream_t stream);
+extern template void run_batched_backward_causalmask_attnbias_dispatched<ck::bhalf_t, false, false, 128>(
+    BatchedBackwardParams& param, hipStream_t stream);
+extern template void run_batched_backward_causalmask_attnbias_dispatched<ck::bhalf_t, true, true, 128>(
+    BatchedBackwardParams& param, hipStream_t stream);
+extern template void run_batched_backward_causalmask_attnbias_dispatched<ck::bhalf_t, true, false, 128>(
+    BatchedBackwardParams& param, hipStream_t stream);
+// clang-format on
+
+void batched_backward_bp16(BatchedBackwardParams& param, hipStream_t stream) {
+  BOOL_SWITCH(param.has_attn_bias, HAS_ATTN_BIAS, [&] {
+    FMHA_BWD_HEADDIM_SWITCH(param.K, param.Kv, MaxK, [&] {
+      if (param.custom_mask_type == 0)
+        run_batched_backward_causalmask_attnbias_dispatched<
+            ck::bhalf_t,
+            false,
+            HAS_ATTN_BIAS,
+            MaxK>(param, stream);
+      else if (param.custom_mask_type == 1 || param.custom_mask_type == 2)
+        run_batched_backward_causalmask_attnbias_dispatched<
+            ck::bhalf_t,
+            true,
+            HAS_ATTN_BIAS,
+            MaxK>(param, stream);
+      else
+        throw std::runtime_error("Invalid custom_mask_type value");
+    });
+  });
+};
diff --git a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_backward_fp16.cpp b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_backward_fp16.cpp
new file mode 100644
index 000000000..35df8c293
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_backward_fp16.cpp
@@ -0,0 +1,63 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <ck/ck.hpp>
+#include <ck/utility/data_type.hpp>
+#include <stdexcept>
+
+#include "ck_tiled_bool_switch.h"
+#include "ck_tiled_fmha_batched_backward.h"
+#include "ck_tiled_headdim_switch.h"
+
+// clang-format off
+extern template void run_batched_backward_causalmask_attnbias_dispatched<ck::half_t, false, true, 32>(
+    BatchedBackwardParams& param, hipStream_t stream);
+extern template void run_batched_backward_causalmask_attnbias_dispatched<ck::half_t, false, false, 32>(
+    BatchedBackwardParams& param, hipStream_t stream);
+extern template void run_batched_backward_causalmask_attnbias_dispatched<ck::half_t, true, true, 32>(
+    BatchedBackwardParams& param, hipStream_t stream);
+extern template void run_batched_backward_causalmask_attnbias_dispatched<ck::half_t, true, false, 32>(
+    BatchedBackwardParams& param, hipStream_t stream);
+
+extern template void run_batched_backward_causalmask_attnbias_dispatched<ck::half_t, false, true, 64>(
+    BatchedBackwardParams& param, hipStream_t stream);
+extern template void run_batched_backward_causalmask_attnbias_dispatched<ck::half_t, false, false, 64>(
+    BatchedBackwardParams& param, hipStream_t stream);
+extern template void run_batched_backward_causalmask_attnbias_dispatched<ck::half_t, true, true, 64>(
+    BatchedBackwardParams& param, hipStream_t stream);
+extern template void run_batched_backward_causalmask_attnbias_dispatched<ck::half_t, true, false, 64>(
+    BatchedBackwardParams& param, hipStream_t stream);
+
+extern template void run_batched_backward_causalmask_attnbias_dispatched<ck::half_t, false, true, 128>(
+    BatchedBackwardParams& param, hipStream_t stream);
+extern template void run_batched_backward_causalmask_attnbias_dispatched<ck::half_t, false, false, 128>(
+    BatchedBackwardParams& param, hipStream_t stream);
+extern template void run_batched_backward_causalmask_attnbias_dispatched<ck::half_t, true, true, 128>(
+    BatchedBackwardParams& param, hipStream_t stream);
+extern template void run_batched_backward_causalmask_attnbias_dispatched<ck::half_t, true, false, 128>(
+    BatchedBackwardParams& param, hipStream_t stream);
+// clang-format on
+
+void batched_backward_fp16(BatchedBackwardParams& param, hipStream_t stream) {
+  BOOL_SWITCH(param.has_attn_bias, HAS_ATTN_BIAS, [&] {
+    FMHA_BWD_HEADDIM_SWITCH(param.K, param.Kv, MaxK, [&] {
+      if (param.custom_mask_type == 0)
+        run_batched_backward_causalmask_attnbias_dispatched<
+            ck::half_t,
+            false,
+            HAS_ATTN_BIAS,
+            MaxK>(param, stream);
+      else if (param.custom_mask_type == 1 || param.custom_mask_type == 2)
+        run_batched_backward_causalmask_attnbias_dispatched<
+            ck::half_t,
+            true,
+            HAS_ATTN_BIAS,
+            MaxK>(param, stream);
+      else
+        throw std::runtime_error("Invalid custom_mask_type value");
+    });
+  });
+};
diff --git a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_forward.h b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_forward.h
index 61cdcd124..617ebd762 100644
--- a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_forward.h
+++ b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_forward.h
@@ -23,7 +23,6 @@
 #include "ck_tiled_bool_switch.h"
 #include "ck_tiled_fmha_fwd_setting.h"
 #include "ck_tiled_fmha_params.h"
-#include "ck_tiled_headdim_switch.h"
 
 #include "ck_tiled_fmha_definitions.hpp"
 #include "ck_tiled_fmha_forward_kernel.hpp"
@@ -36,7 +35,7 @@ template <
     bool has_attn_bias,
     ck::index_t MaxK>
 struct batched_forward_causalmask_attnbias_dispatched {
-  using FmhaEpilogue = FmhaFwdEpilogue<FmhaFwdEpilogueProblem<
+  using FmhaFwdEpilogue_ = FmhaFwdEpilogue<FmhaFwdEpilogueProblem<
       typename FmhaFwdTypeConfig<scalar_t>::OaccDataType,
       typename FmhaFwdTypeConfig<scalar_t>::ODataType>>;
 
@@ -64,111 +63,118 @@ struct batched_forward_causalmask_attnbias_dispatched {
 
     BOOL_SWITCH(has_local_attention, USE_LOCAL_ATTENTION, [&] {
       constexpr bool has_masking = has_causal_mask || USE_LOCAL_ATTENTION;
+      const bool has_dropout = (param.dropout_prob > 0.0f);
 
       using FmhaMask = ck::tile_program::block::
           GenericAttentionMask<has_masking, USE_LOCAL_ATTENTION>;
 
-      using FmhaShape = FmhaFwdShape<MaxK>;
-      using FmhaTilePartitioner = FmhaFwdTilePartitioner<FmhaShape>;
+      using FmhaFwdShape_ = FmhaFwdShape<MaxK>;
+      using FmhaFwdTilePartitioner_ = FmhaFwdTilePartitioner<FmhaFwdShape_>;
       constexpr ck::index_t occupancy =
           (MaxK == 64) ? 3 : ((MaxK == 256) ? 1 : 2);
 
-      bool pad_seqlen_q = !(param.M % FmhaShape::kM0 == 0);
-      bool pad_seqlen_k = !(param.N % FmhaShape::kN0 == 0);
-      bool pad_headdim_q = !(param.K % FmhaShape::kK0BlockLength == 0);
-      bool pad_headdim_v = !(param.Kv % FmhaShape::kN1 == 0);
+      const bool pad_seqlen_q = !(param.M % FmhaFwdShape_::kM0 == 0);
+      const bool pad_seqlen_k = !(param.N % FmhaFwdShape_::kN0 == 0);
+      const bool pad_headdim_q =
+          !(param.K % FmhaFwdShape_::kK0BlockLength == 0);
+      const bool pad_headdim_v = !(param.Kv % FmhaFwdShape_::kN1 == 0);
+
+      // usually headdim_q and headdim_v are same, consider them together to
+      // determine whether to do padding saving some compiling time
+      bool pad_headdim = (pad_headdim_q || pad_headdim_v);
 
       if constexpr (MaxK == 256) {
         BOOL_SWITCH_4(
+            has_dropout,
+            kHasDropout,
             pad_seqlen_q,
             kPadSeqLenQ,
             pad_seqlen_k,
             kPadSeqLenK,
-            pad_headdim_q,
-            kPadHeadDimQ,
-            pad_headdim_v,
-            kPadHeadDimV,
+            pad_headdim,
+            kPadHeadDim,
             [&] {
-              using FmhaTraits = ck::tile_program::TileFmhaTraits<
+              using FmhaFwdTraits_ = ck::tile_program::TileFmhaTraits<
                   kPadSeqLenQ,
                   kPadSeqLenK,
-                  kPadHeadDimQ,
-                  kPadHeadDimV,
+                  kPadHeadDim, // kPadHeadDimQ
+                  kPadHeadDim, // kPadHeadDimV
                   has_attn_bias,
                   true, // kStoreLSE
-                  false, // kHadDropout, to be changed
+                  kHasDropout,
                   occupancy>;
 
               using FmhaPipelineProblem =
-                  FmhaPipelineProblemTemp<FmhaTraits, FmhaMask>;
+                  FmhaPipelineProblemTemp<FmhaFwdTraits_, FmhaMask>;
 
-              using FmhaPipeline =
+              using FmhaFwdPipeline_ =
                   ck::tile_program::block::BlockFmhaPipelineQRKSVS<
                       FmhaPipelineProblem>;
-              using FmhaKernel = FmhaFwdKernel<
-                  FmhaTilePartitioner,
-                  FmhaPipeline,
-                  FmhaEpilogue>;
 
-              RunWithKernel<FmhaKernel>(param, stream);
+              using FmhaFwdKernel_ = FmhaFwdKernel<
+                  FmhaFwdTilePartitioner_,
+                  FmhaFwdPipeline_,
+                  FmhaFwdEpilogue_>;
+
+              RunWithKernel<FmhaFwdKernel_>(param, stream);
             });
       } else {
         BOOL_SWITCH_4(
+            has_dropout,
+            kHasDropout,
             pad_seqlen_q,
             kPadSeqLenQ,
             pad_seqlen_k,
             kPadSeqLenK,
-            pad_headdim_q,
-            kPadHeadDimQ,
-            pad_headdim_v,
-            kPadHeadDimV,
+            pad_headdim,
+            kPadHeadDim,
             [&] {
-              using FmhaTraits = ck::tile_program::TileFmhaTraits<
+              using FmhaFwdTraits_ = ck::tile_program::TileFmhaTraits<
                   kPadSeqLenQ,
                   kPadSeqLenK,
-                  kPadHeadDimQ,
-                  kPadHeadDimV,
+                  kPadHeadDim, // kPadHeadDimQ
+                  kPadHeadDim, // kPadHeadDimV
                   has_attn_bias,
                   true, // kStoreLSE
-                  false, // kHadDropout, to be changed
+                  kHasDropout,
                   occupancy>;
 
               using FmhaPipelineProblem =
-                  FmhaPipelineProblemTemp<FmhaTraits, FmhaMask>;
+                  FmhaPipelineProblemTemp<FmhaFwdTraits_, FmhaMask>;
 
               constexpr bool no_any_padding =
-                  !(kPadSeqLenQ || kPadSeqLenK || kPadHeadDimQ || kPadHeadDimV);
+                  !(kPadSeqLenQ || kPadSeqLenK || kPadHeadDim);
 
               if constexpr (no_any_padding) {
-                using FmhaPipeline =
+                using FmhaFwdPipeline_ =
                     ck::tile_program::block::BlockFmhaPipelineQRKSVSAsync<
                         FmhaPipelineProblem>;
-                using FmhaKernel = FmhaFwdKernel<
-                    FmhaTilePartitioner,
-                    FmhaPipeline,
-                    FmhaEpilogue>;
+                using FmhaFwdKernel_ = FmhaFwdKernel<
+                    FmhaFwdTilePartitioner_,
+                    FmhaFwdPipeline_,
+                    FmhaFwdEpilogue_>;
 
-                RunWithKernel<FmhaKernel>(param, stream);
+                RunWithKernel<FmhaFwdKernel_>(param, stream);
               } else {
-                using FmhaPipeline =
+                using FmhaFwdPipeline_ =
                     ck::tile_program::block::BlockFmhaPipelineQRKSVS<
                         FmhaPipelineProblem>;
-                using FmhaKernel = FmhaFwdKernel<
-                    FmhaTilePartitioner,
-                    FmhaPipeline,
-                    FmhaEpilogue>;
+                using FmhaFwdKernel_ = FmhaFwdKernel<
+                    FmhaFwdTilePartitioner_,
+                    FmhaFwdPipeline_,
+                    FmhaFwdEpilogue_>;
 
-                RunWithKernel<FmhaKernel>(param, stream);
+                RunWithKernel<FmhaFwdKernel_>(param, stream);
               };
             });
       };
     });
   };
 
-  template <typename FmhaKernel>
+  template <typename FmhaFwdKernel>
   static void RunWithKernel(BatchedForwardParams& param, hipStream_t stream) {
     const auto kargs = [&] {
-      return FmhaKernel::MakeKargs(
+      return FmhaFwdKernel::MakeKargs(
           param.q_ptr,
           param.k_ptr,
           param.v_ptr,
@@ -196,7 +202,7 @@ struct batched_forward_causalmask_attnbias_dispatched {
           param.v_strides[2],
           param.attn_bias_strides[1],
           0, // nhead_randval
-          param.M, // nhead_stride_lse
+          param.lse_strides[1], // nhead_stride_lse
           param.out_strides[2],
           param.q_strides[0], // q, k, v, bias, randval, lse, out tensor
                               // batch-dim stride
@@ -204,7 +210,7 @@ struct batched_forward_causalmask_attnbias_dispatched {
           param.v_strides[0],
           param.attn_bias_strides[0],
           0, // batch_stride_randval
-          param.Hq * param.M, // batch_stride_lse
+          param.lse_strides[0], // batch_stride_lse
           param.out_strides[0],
           static_cast<CausalMaskType>(param.custom_mask_type),
           param.window_size,
@@ -215,13 +221,14 @@ struct batched_forward_causalmask_attnbias_dispatched {
           {param.philox_seed, param.philox_offset});
     }();
 
-    dim3 kGridSize = FmhaKernel::GridSize(param.B, param.Hq, param.M, param.Kv);
-    constexpr dim3 kBlockSize = FmhaKernel::BlockSize();
-    constexpr ck::index_t kBlockPerCu = FmhaKernel::kBlockPerCu;
+    dim3 kGridSize =
+        FmhaFwdKernel::GridSize(param.B, param.Hq, param.M, param.Kv);
+    constexpr dim3 kBlockSize = FmhaFwdKernel::BlockSize();
+    constexpr ck::index_t kBlockPerCu = FmhaFwdKernel::kBlockPerCu;
 
     (void)launch_kernel<kBlockSize.x, kBlockPerCu>(
         StreamConfig{stream, false},
-        FmhaKernel{},
+        FmhaFwdKernel{},
         kGridSize,
         kBlockSize,
         0,
diff --git a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_forward_bp16.cpp b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_forward_bp16.cpp
index 8d90c7cd5..774e2974c 100644
--- a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_forward_bp16.cpp
+++ b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_forward_bp16.cpp
@@ -10,6 +10,7 @@
 
 #include "ck_tiled_bool_switch.h"
 #include "ck_tiled_fmha_batched_forward.h"
+#include "ck_tiled_headdim_switch.h"
 
 // clang-format off
 extern template void run_batched_forward_causalmask_attnbias_dispatched<ck::bhalf_t, false, true, 32>(
diff --git a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_forward_fp16.cpp b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_forward_fp16.cpp
index 3e6584971..4e194c3e7 100644
--- a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_forward_fp16.cpp
+++ b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_forward_fp16.cpp
@@ -10,6 +10,7 @@
 
 #include "ck_tiled_bool_switch.h"
 #include "ck_tiled_fmha_batched_forward.h"
+#include "ck_tiled_headdim_switch.h"
 
 // clang-format off
 extern template void run_batched_forward_causalmask_attnbias_dispatched<ck::half_t, false, true, 32>(
diff --git a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_bwd_setting.h b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_bwd_setting.h
new file mode 100644
index 000000000..1d004dc8a
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_bwd_setting.h
@@ -0,0 +1,139 @@
+/*
+ * Copyright (c) 2023-2024, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#pragma once
+
+#include <ck/tile_program/tile/tile_fmha_shape.hpp>
+
+template <typename DataType>
+struct FmhaBwdTypeConfig;
+
+template <>
+struct FmhaBwdTypeConfig<ck::half_t> {
+  using QDataType = ck::half_t;
+  using KDataType = ck::half_t;
+  using VDataType = ck::half_t;
+  using GemmDataType = ck::half_t;
+  using BiasDataType = ck::half_t;
+  using RandValOutputDataType = unsigned short;
+  using LSEDataType = float;
+  using AccDataType = float; // data type for gemm accumulation
+  using DDataType = float;
+  using ODataType = ck::half_t;
+  using OGradDataType = ck::half_t;
+  using QGradDataType = ck::half_t;
+  using KGradDataType = ck::half_t;
+  using VGradDataType = ck::half_t;
+  using BiasGradDataType = ck::half_t;
+};
+
+template <>
+struct FmhaBwdTypeConfig<ck::bhalf_t> {
+  using QDataType = ck::bhalf_t;
+  using KDataType = ck::bhalf_t;
+  using VDataType = ck::bhalf_t;
+  using GemmDataType = ck::bhalf_t;
+  using BiasDataType = ck::bhalf_t;
+  using RandValOutputDataType = unsigned short;
+  using LSEDataType = float;
+  using AccDataType = float; // data type for gemm accumulation
+  using DDataType = float;
+  using ODataType = ck::bhalf_t;
+  using OGradDataType = ck::bhalf_t;
+  using QGradDataType = ck::bhalf_t;
+  using KGradDataType = ck::bhalf_t;
+  using VGradDataType = ck::bhalf_t;
+  using BiasGradDataType = ck::bhalf_t;
+};
+
+template <ck::index_t MaxK>
+struct FmhaBwdLoadStrategy;
+
+template <>
+struct FmhaBwdLoadStrategy<32> {
+  using type = ck::Sequence<true, false, true, false, true, true, false>;
+};
+
+template <>
+struct FmhaBwdLoadStrategy<64> {
+  using type = ck::Sequence<false, false, true, true, true, false, false>;
+};
+
+template <>
+struct FmhaBwdLoadStrategy<128> {
+  using type = ck::Sequence<false, false, true, false, true, false, false>;
+};
+
+template <ck::index_t MaxK>
+struct FmhaBwdBlockTile;
+
+template <>
+struct FmhaBwdBlockTile<32> {
+  using type = ck::Sequence<128, 128, 32, 32, 32, 32, 32, 32, 32>;
+};
+
+template <>
+struct FmhaBwdBlockTile<64> {
+  using type = ck::Sequence<64, 128, 32, 32, 32, 32, 32, 64, 64>;
+};
+
+template <>
+struct FmhaBwdBlockTile<128> {
+  using type = ck::Sequence<64, 128, 32, 32, 32, 32, 32, 128, 128>;
+};
+
+using FmhaBwdBlockWarps0 = ck::Sequence<1, 4, 1>; // default for gemm0/gemm2
+using FmhaBwdBlockWarps1 = ck::Sequence<4, 1, 1>; // default for gemm1/gemm3
+using FmhaBwdBlockWarps2 = ck::Sequence<2, 2, 1>; // default for gemm4
+using FmhaBwdWarpTile = ck::Sequence<32, 32, 16>;
+
+template <ck::index_t MaxK>
+struct FmhaBwdShape;
+
+template <>
+struct FmhaBwdShape<32> : ck::tile_program::TileFmhaBwdShape<
+                              typename FmhaBwdBlockTile<32>::type,
+                              typename FmhaBwdLoadStrategy<32>::type,
+                              FmhaBwdBlockWarps0,
+                              FmhaBwdWarpTile,
+                              FmhaBwdBlockWarps1,
+                              FmhaBwdWarpTile,
+                              FmhaBwdBlockWarps0,
+                              FmhaBwdWarpTile,
+                              FmhaBwdBlockWarps1,
+                              FmhaBwdWarpTile,
+                              ck::Sequence<4, 1, 1>,
+                              FmhaBwdWarpTile> {};
+
+template <>
+struct FmhaBwdShape<64> : ck::tile_program::TileFmhaBwdShape<
+                              typename FmhaBwdBlockTile<64>::type,
+                              typename FmhaBwdLoadStrategy<64>::type,
+                              FmhaBwdBlockWarps0,
+                              FmhaBwdWarpTile,
+                              FmhaBwdBlockWarps1,
+                              FmhaBwdWarpTile,
+                              FmhaBwdBlockWarps0,
+                              FmhaBwdWarpTile,
+                              FmhaBwdBlockWarps1,
+                              FmhaBwdWarpTile,
+                              FmhaBwdBlockWarps2,
+                              FmhaBwdWarpTile> {};
+
+template <>
+struct FmhaBwdShape<128> : ck::tile_program::TileFmhaBwdShape<
+                               typename FmhaBwdBlockTile<128>::type,
+                               typename FmhaBwdLoadStrategy<128>::type,
+                               FmhaBwdBlockWarps0,
+                               FmhaBwdWarpTile,
+                               FmhaBwdBlockWarps1,
+                               FmhaBwdWarpTile,
+                               FmhaBwdBlockWarps0,
+                               FmhaBwdWarpTile,
+                               FmhaBwdBlockWarps1,
+                               FmhaBwdWarpTile,
+                               FmhaBwdBlockWarps2,
+                               FmhaBwdWarpTile> {};
diff --git a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_backward.h b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_backward.h
new file mode 100644
index 000000000..7fab9f2c8
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_backward.h
@@ -0,0 +1,268 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#pragma once
+
+#include <ck/host_utility/device_prop.hpp>
+#include <ck/host_utility/kernel_launch.hpp>
+#include <ck/tensor/tensor_view.hpp>
+#include <ck/tensor_description/cluster_descriptor.hpp>
+#include <ck/tensor_description/tensor_descriptor_helper.hpp>
+#include <ck/utility/common_header.hpp>
+
+#include <ck/tile_program/block_tile/block_masking.hpp>
+#include <ck/tile_program/block_tile_pipeline/block_fmha_bwd_dot_do_o.hpp>
+#include <ck/tile_program/block_tile_pipeline/block_fmha_bwd_pipeline_dispatcher.hpp>
+#include <ck/tile_program/block_tile_pipeline/block_fmha_bwd_pipeline_problem.hpp>
+#include <ck/tile_program/tile/tile_fmha_shape.hpp>
+#include <ck/tile_program/tile/tile_fmha_traits.hpp>
+
+#include "ck_tiled_bool_switch.h"
+#include "ck_tiled_fmha_bwd_setting.h"
+#include "ck_tiled_fmha_params.h"
+
+#include "ck_tiled_fmha_backward_kernel.hpp"
+#include "ck_tiled_fmha_bwd_epilogue.hpp"
+#include "ck_tiled_fmha_bwd_tile_partitioner.hpp"
+#include "ck_tiled_fmha_definitions.hpp"
+
+template <
+    typename scalar_t,
+    bool has_causal_mask,
+    bool has_attn_bias,
+    ck::index_t MaxK>
+struct grouped_backward_causalmask_attnbias_dispatched {
+  using FmhaBwdEpilogue_ = FmhaBwdEpilogue<FmhaBwdEpilogueProblem<
+      typename FmhaBwdTypeConfig<scalar_t>::AccDataType,
+      typename FmhaBwdTypeConfig<scalar_t>::KGradDataType,
+      typename FmhaBwdTypeConfig<scalar_t>::VGradDataType>>;
+
+  using FmhaBwdLoadStrategy_ = typename FmhaBwdLoadStrategy<MaxK>::type;
+
+  template <typename FmhaTraits, typename FmhaMask>
+  using FmhaBwdPipelineProblemTemp =
+      ck::tile_program::block::BlockFmhaBwdPipelineProblem<
+          typename FmhaBwdTypeConfig<scalar_t>::QDataType,
+          typename FmhaBwdTypeConfig<scalar_t>::KDataType,
+          typename FmhaBwdTypeConfig<scalar_t>::VDataType,
+          typename FmhaBwdTypeConfig<scalar_t>::GemmDataType,
+          typename FmhaBwdTypeConfig<scalar_t>::LSEDataType,
+          typename FmhaBwdTypeConfig<scalar_t>::AccDataType,
+          typename FmhaBwdTypeConfig<scalar_t>::DDataType,
+          typename FmhaBwdTypeConfig<scalar_t>::RandValOutputDataType,
+          typename FmhaBwdTypeConfig<scalar_t>::BiasDataType,
+          typename FmhaBwdTypeConfig<scalar_t>::ODataType,
+          typename FmhaBwdTypeConfig<scalar_t>::OGradDataType,
+          typename FmhaBwdTypeConfig<scalar_t>::QGradDataType,
+          typename FmhaBwdTypeConfig<scalar_t>::KGradDataType,
+          typename FmhaBwdTypeConfig<scalar_t>::VGradDataType,
+          typename FmhaBwdTypeConfig<scalar_t>::BiasGradDataType,
+          FmhaBwdShape<MaxK>,
+          true, // kIsGroupMode
+          FmhaMask,
+          FmhaTraits>;
+
+  static void Run(GroupedBackwardParams& param, hipStream_t stream) {
+    {
+      constexpr ck::index_t kBlockSize = 256;
+      bool pad_seqlen_q = !(param.M % kBlockSize == 0);
+      bool pad_headdim_v = !(param.Kv % FmhaBwdShape<MaxK>::kVHeaddim == 0);
+
+      BOOL_SWITCH_2(
+          pad_seqlen_q, kPadSeqLenQ, pad_headdim_v, kPadHeadDimV, [&] {
+            constexpr ck::index_t occupancy = 2;
+
+            using FmhaOGradDotOTraits_ =
+                ck::tile_program::TileFmhaBwdOGradDotOTraits<
+                    kPadSeqLenQ,
+                    kPadHeadDimV,
+                    occupancy>;
+
+            using FmhaBwdOGradDotOPipelineProblem =
+                ck::tile_program::block::BlockFmhaBwdOGradDotOPipelineProblem<
+                    typename FmhaBwdTypeConfig<scalar_t>::ODataType,
+                    typename FmhaBwdTypeConfig<scalar_t>::OGradDataType,
+                    typename FmhaBwdTypeConfig<scalar_t>::DDataType,
+                    kBlockSize,
+                    FmhaBwdShape<MaxK>::kVHeaddim,
+                    true, // kIsGroupMode
+                    FmhaOGradDotOTraits_>;
+
+            using FmhaBwdOGradDotOPipeline_ =
+                typename ck::tile_program::block::BlockFmhaBwdOGradDotO<
+                    FmhaBwdOGradDotOPipelineProblem>;
+
+            using FmhaBwdOGradDotOKernel_ = FmhaBwdOGradDotOKernel<
+                FmhaBwdOGradDotOTilePartitioner<kBlockSize>,
+                FmhaBwdOGradDotOPipeline_>;
+
+            RunWithBwdOGradDotOKernel<FmhaBwdOGradDotOKernel_>(param, stream);
+          });
+    };
+
+    {
+      const bool has_local_attention = (param.window_size > 0) ? true : false;
+
+      BOOL_SWITCH(has_local_attention, USE_LOCAL_ATTENTION, [&] {
+        constexpr ck::index_t occupancy = 1;
+        constexpr bool has_masking = has_causal_mask || USE_LOCAL_ATTENTION;
+        const bool has_dropout = (param.dropout_prob > 0.0f);
+
+        using FmhaMask = ck::tile_program::block::
+            GenericAttentionMask<has_masking, USE_LOCAL_ATTENTION>;
+
+        using FmhaBwdShape_ = FmhaBwdShape<MaxK>;
+        using FmhaBwdTilePartitioner_ = FmhaBwdTilePartitioner<FmhaBwdShape_>;
+
+        constexpr bool kPadSeqLenQ = true;
+        constexpr bool kPadSeqLenK = true;
+
+        // const bool pad_headdim_q = !(param.K % FmhaBwdShape_::kK0 == 0);
+        const bool pad_headdim_v = !(param.Kv % FmhaBwdShape_::kK2 == 0);
+
+        // currently headdim padding is not supported due to some atomic_add
+        // issue with bhalf_t
+        constexpr bool kPadHeadDimQ = false;
+
+        BOOL_SWITCH_2(
+            has_dropout, kHasDropout, pad_headdim_v, kPadHeadDimV, [&] {
+              using FmhaBwdTraits_ = ck::tile_program::TileFmhaTraits<
+                  kPadSeqLenQ,
+                  kPadSeqLenK,
+                  kPadHeadDimQ,
+                  kPadHeadDimV,
+                  has_attn_bias,
+                  false, // kStoreLSE
+                  kHasDropout,
+                  occupancy>;
+
+              using FmhaBwdPipelineProblem =
+                  FmhaBwdPipelineProblemTemp<FmhaBwdTraits_, FmhaMask>;
+
+              using FmhaBwdPipeline_ = typename ck::tile_program::block::
+                  BlockFmhaBwdPipelineDispatcher<
+                      FmhaBwdLoadStrategy_,
+                      FmhaBwdPipelineProblem>::BlockPipeline;
+
+              using FmhaBwdKernel_ = FmhaBwdKernel<
+                  FmhaBwdTilePartitioner_,
+                  FmhaBwdPipeline_,
+                  FmhaBwdEpilogue_>;
+
+              RunWithBwdKernel<FmhaBwdKernel_>(param, stream);
+            });
+      });
+    };
+  }
+
+  template <typename FmhaBwdOGradDotOKernel>
+  static void RunWithBwdOGradDotOKernel(
+      GroupedBackwardParams& param,
+      hipStream_t stream) {
+    const auto kargs = [&] {
+      return FmhaBwdOGradDotOKernel::MakeKargs(
+          param.out_ptr,
+          param.grad_out_ptr,
+          param.dot_out_ptr,
+          param.seqstart_q_dev_ptr,
+          param.Kv,
+          param.grad_out_strides[0], // stride_do
+          param.out_strides[0], // stride_o
+          param.grad_out_strides[1], // nhead_stride_do
+          param.out_strides[1], // nhead_stride_o
+          param.lsed_strides[1]);
+    }();
+
+    dim3 kGridSize = FmhaBwdOGradDotOKernel::GridSize(
+        param.num_batches, param.Hq, param.max_seqlen_q);
+    constexpr dim3 kBlockSize = FmhaBwdOGradDotOKernel::BlockSize();
+    constexpr ck::index_t kBlockPerCu = FmhaBwdOGradDotOKernel::kBlockPerCu;
+
+    (void)launch_kernel<kBlockSize.x, kBlockPerCu>(
+        StreamConfig{stream, false},
+        FmhaBwdOGradDotOKernel{},
+        kGridSize,
+        kBlockSize,
+        0,
+        kargs);
+  }
+
+  template <typename FmhaBwdKernel>
+  static void RunWithBwdKernel(
+      GroupedBackwardParams& param,
+      hipStream_t stream) {
+    const auto kargs = [&] {
+      return FmhaBwdKernel::MakeKargs(
+          param.q_ptr,
+          param.k_ptr,
+          param.v_ptr,
+          param.attn_bias_ptr,
+          param.logsumexp_ptr,
+          param.grad_out_ptr,
+          param.dot_out_ptr,
+          param.grad_q_ptr,
+          param.grad_k_ptr,
+          param.grad_v_ptr,
+          param.grad_bias_ptr,
+          param.seqstart_q_dev_ptr,
+          param.seqstart_k_dev_ptr,
+          param.seqlen_k_dev_ptr,
+          param.Hq, // nhead_q
+          param.Hkv, // nhead_v
+          param.Hq / param.Hkv,
+          param.scale,
+          param.q_strides[0], // q, k, v, bias, do, o, dk, dv, dbias seq-dim
+                              // stride
+          param.k_strides[0],
+          param.v_strides[0],
+          param.attn_bias_strides[1],
+          param.grad_out_strides[0],
+          param.grad_k_strides[0],
+          param.grad_v_strides[0],
+          param.attn_bias_strides[1], // assume grad_bias has same strides as
+                                      // bias
+          param.q_strides[1], // q, k, v, bias, do, o, lse/dot, dbias
+                              // nhead-dim strides
+          param.k_strides[1],
+          param.v_strides[1],
+          param.attn_bias_strides[0],
+          param.grad_out_strides[1],
+          param.lsed_strides[1], // assume lse/dot is in BHM contiguous layout
+          param.attn_bias_strides[0], // assume grad_bias has same strides as
+                                      // bias
+          static_cast<CausalMaskType>(param.custom_mask_type),
+          param.window_size);
+    }();
+
+    dim3 kGridSize = FmhaBwdKernel::GridSize(
+        param.num_batches, param.Hq, param.max_seqlen_k);
+    constexpr dim3 kBlockSize = FmhaBwdKernel::BlockSize();
+    constexpr ck::index_t kBlockPerCu = FmhaBwdKernel::kBlockPerCu;
+
+    (void)launch_kernel<kBlockSize.x, kBlockPerCu>(
+        StreamConfig{stream, false},
+        FmhaBwdKernel{},
+        kGridSize,
+        kBlockSize,
+        0,
+        kargs);
+  }
+};
+
+template <
+    typename scalar_t,
+    bool has_causal_mask,
+    bool has_attn_bias,
+    ck::index_t MaxK>
+void run_grouped_backward_causalmask_attnbias_dispatched(
+    GroupedBackwardParams& param,
+    hipStream_t stream) {
+  grouped_backward_causalmask_attnbias_dispatched<
+      scalar_t,
+      has_causal_mask,
+      has_attn_bias,
+      MaxK>::Run(param, stream);
+};
diff --git a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_backward_bp16.cpp b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_backward_bp16.cpp
new file mode 100644
index 000000000..0553bbcb1
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_backward_bp16.cpp
@@ -0,0 +1,63 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <ck/ck.hpp>
+#include <ck/utility/data_type.hpp>
+#include <stdexcept>
+
+#include "ck_tiled_bool_switch.h"
+#include "ck_tiled_fmha_grouped_backward.h"
+#include "ck_tiled_headdim_switch.h"
+
+// clang-format off
+extern template void run_grouped_backward_causalmask_attnbias_dispatched<ck::bhalf_t, false, true, 32>(
+    GroupedBackwardParams& param, hipStream_t stream);
+extern template void run_grouped_backward_causalmask_attnbias_dispatched<ck::bhalf_t, false, false, 32>(
+    GroupedBackwardParams& param, hipStream_t stream);
+extern template void run_grouped_backward_causalmask_attnbias_dispatched<ck::bhalf_t, true, true, 32>(
+    GroupedBackwardParams& param, hipStream_t stream);
+extern template void run_grouped_backward_causalmask_attnbias_dispatched<ck::bhalf_t, true, false, 32>(
+    GroupedBackwardParams& param, hipStream_t stream);
+
+extern template void run_grouped_backward_causalmask_attnbias_dispatched<ck::bhalf_t, false, true, 64>(
+    GroupedBackwardParams& param, hipStream_t stream);
+extern template void run_grouped_backward_causalmask_attnbias_dispatched<ck::bhalf_t, false, false, 64>(
+    GroupedBackwardParams& param, hipStream_t stream);
+extern template void run_grouped_backward_causalmask_attnbias_dispatched<ck::bhalf_t, true, true, 64>(
+    GroupedBackwardParams& param, hipStream_t stream);
+extern template void run_grouped_backward_causalmask_attnbias_dispatched<ck::bhalf_t, true, false, 64>(
+    GroupedBackwardParams& param, hipStream_t stream);
+
+extern template void run_grouped_backward_causalmask_attnbias_dispatched<ck::bhalf_t, false, true, 128>(
+    GroupedBackwardParams& param, hipStream_t stream);
+extern template void run_grouped_backward_causalmask_attnbias_dispatched<ck::bhalf_t, false, false, 128>(
+    GroupedBackwardParams& param, hipStream_t stream);
+extern template void run_grouped_backward_causalmask_attnbias_dispatched<ck::bhalf_t, true, true, 128>(
+    GroupedBackwardParams& param, hipStream_t stream);
+extern template void run_grouped_backward_causalmask_attnbias_dispatched<ck::bhalf_t, true, false, 128>(
+    GroupedBackwardParams& param, hipStream_t stream);
+// clang-format on
+
+void grouped_backward_bp16(GroupedBackwardParams& param, hipStream_t stream) {
+  BOOL_SWITCH(param.has_attn_bias, HAS_ATTN_BIAS, [&] {
+    FMHA_BWD_HEADDIM_SWITCH(param.K, param.Kv, MaxK, [&] {
+      if (param.custom_mask_type == 0)
+        run_grouped_backward_causalmask_attnbias_dispatched<
+            ck::bhalf_t,
+            false,
+            HAS_ATTN_BIAS,
+            MaxK>(param, stream);
+      else if (param.custom_mask_type == 1 || param.custom_mask_type == 2)
+        run_grouped_backward_causalmask_attnbias_dispatched<
+            ck::bhalf_t,
+            true,
+            HAS_ATTN_BIAS,
+            MaxK>(param, stream);
+      else
+        throw std::runtime_error("Invalid custom_mask_type value");
+    });
+  });
+};
diff --git a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_backward_fp16.cpp b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_backward_fp16.cpp
new file mode 100644
index 000000000..e4522de89
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_backward_fp16.cpp
@@ -0,0 +1,63 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <ck/ck.hpp>
+#include <ck/utility/data_type.hpp>
+#include <stdexcept>
+
+#include "ck_tiled_bool_switch.h"
+#include "ck_tiled_fmha_grouped_backward.h"
+#include "ck_tiled_headdim_switch.h"
+
+// clang-format off
+extern template void run_grouped_backward_causalmask_attnbias_dispatched<ck::half_t, false, true, 32>(
+    GroupedBackwardParams& param, hipStream_t stream);
+extern template void run_grouped_backward_causalmask_attnbias_dispatched<ck::half_t, false, false, 32>(
+    GroupedBackwardParams& param, hipStream_t stream);
+extern template void run_grouped_backward_causalmask_attnbias_dispatched<ck::half_t, true, true, 32>(
+    GroupedBackwardParams& param, hipStream_t stream);
+extern template void run_grouped_backward_causalmask_attnbias_dispatched<ck::half_t, true, false, 32>(
+    GroupedBackwardParams& param, hipStream_t stream);
+
+extern template void run_grouped_backward_causalmask_attnbias_dispatched<ck::half_t, false, true, 64>(
+    GroupedBackwardParams& param, hipStream_t stream);
+extern template void run_grouped_backward_causalmask_attnbias_dispatched<ck::half_t, false, false, 64>(
+    GroupedBackwardParams& param, hipStream_t stream);
+extern template void run_grouped_backward_causalmask_attnbias_dispatched<ck::half_t, true, true, 64>(
+    GroupedBackwardParams& param, hipStream_t stream);
+extern template void run_grouped_backward_causalmask_attnbias_dispatched<ck::half_t, true, false, 64>(
+    GroupedBackwardParams& param, hipStream_t stream);
+
+extern template void run_grouped_backward_causalmask_attnbias_dispatched<ck::half_t, false, true, 128>(
+    GroupedBackwardParams& param, hipStream_t stream);
+extern template void run_grouped_backward_causalmask_attnbias_dispatched<ck::half_t, false, false, 128>(
+    GroupedBackwardParams& param, hipStream_t stream);
+extern template void run_grouped_backward_causalmask_attnbias_dispatched<ck::half_t, true, true, 128>(
+    GroupedBackwardParams& param, hipStream_t stream);
+extern template void run_grouped_backward_causalmask_attnbias_dispatched<ck::half_t, true, false, 128>(
+    GroupedBackwardParams& param, hipStream_t stream);
+// clang-format on
+
+void grouped_backward_fp16(GroupedBackwardParams& param, hipStream_t stream) {
+  BOOL_SWITCH(param.has_attn_bias, HAS_ATTN_BIAS, [&] {
+    FMHA_BWD_HEADDIM_SWITCH(param.K, param.Kv, MaxK, [&] {
+      if (param.custom_mask_type == 0)
+        run_grouped_backward_causalmask_attnbias_dispatched<
+            ck::half_t,
+            false,
+            HAS_ATTN_BIAS,
+            MaxK>(param, stream);
+      else if (param.custom_mask_type == 1 || param.custom_mask_type == 2)
+        run_grouped_backward_causalmask_attnbias_dispatched<
+            ck::half_t,
+            true,
+            HAS_ATTN_BIAS,
+            MaxK>(param, stream);
+      else
+        throw std::runtime_error("Invalid custom_mask_type value");
+    });
+  });
+};
diff --git a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_forward.h b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_forward.h
index 78ed74316..548cd013d 100644
--- a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_forward.h
+++ b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_forward.h
@@ -22,7 +22,6 @@
 #include "ck_tiled_bool_switch.h"
 #include "ck_tiled_fmha_fwd_setting.h"
 #include "ck_tiled_fmha_params.h"
-#include "ck_tiled_headdim_switch.h"
 
 #include "ck_tiled_fmha_definitions.hpp"
 #include "ck_tiled_fmha_forward_kernel.hpp"
@@ -35,7 +34,7 @@ template <
     bool has_attn_bias,
     ck::index_t MaxK>
 struct grouped_forward_causalmask_attnbias_dispatched {
-  using FmhaEpilogue = FmhaFwdEpilogue<FmhaFwdEpilogueProblem<
+  using FmhaFwdEpilogue_ = FmhaFwdEpilogue<FmhaFwdEpilogueProblem<
       typename FmhaFwdTypeConfig<scalar_t>::OaccDataType,
       typename FmhaFwdTypeConfig<scalar_t>::ODataType>>;
 
@@ -63,81 +62,96 @@ struct grouped_forward_causalmask_attnbias_dispatched {
 
     BOOL_SWITCH(has_local_attention, USE_LOCAL_ATTENTION, [&] {
       constexpr bool has_masking = has_causal_mask || USE_LOCAL_ATTENTION;
+      const bool has_dropout = (param.dropout_prob > 0.0f);
 
       using FmhaMask = ck::tile_program::block::
           GenericAttentionMask<has_masking, USE_LOCAL_ATTENTION>;
 
-      using FmhaShape = FmhaFwdShape<MaxK>;
-      using FmhaTilePartitioner = FmhaFwdTilePartitioner<FmhaShape>;
+      using FmhaFwdShape_ = FmhaFwdShape<MaxK>;
+      using FmhaFwdTilePartitioner_ = FmhaFwdTilePartitioner<FmhaFwdShape_>;
       constexpr ck::index_t occupancy =
           (MaxK == 64) ? 3 : (MaxK == 256) ? 1 : 2;
 
       constexpr bool kPadSeqLenQ = true;
       constexpr bool kPadSeqLenK = true;
 
-      bool pad_headdim_q = !(param.K % FmhaShape::kK0BlockLength == 0);
-      bool pad_headdim_v = !(param.Kv % FmhaShape::kN1 == 0);
+      const bool pad_headdim_q =
+          !(param.K % FmhaFwdShape_::kK0BlockLength == 0);
+      const bool pad_headdim_v = !(param.Kv % FmhaFwdShape_::kN1 == 0);
 
       if constexpr (MaxK == 256) {
-        BOOL_SWITCH_2(
-            pad_headdim_q, kPadHeadDimQ, pad_headdim_v, kPadHeadDimV, [&] {
-              using FmhaTraits = ck::tile_program::TileFmhaTraits<
+        BOOL_SWITCH_3(
+            has_dropout,
+            kHasDropout,
+            pad_headdim_q,
+            kPadHeadDimQ,
+            pad_headdim_v,
+            kPadHeadDimV,
+            [&] {
+              using FmhaFwdTraits_ = ck::tile_program::TileFmhaTraits<
                   kPadSeqLenQ,
                   kPadSeqLenK,
                   kPadHeadDimQ,
                   kPadHeadDimV,
                   has_attn_bias,
                   true, // kStoreLSE
-                  false, // kHadDropout, to be changed
+                  kHasDropout,
                   occupancy>;
 
               using FmhaPipelineProblem =
-                  FmhaPipelineProblemTemp<FmhaTraits, FmhaMask>;
+                  FmhaPipelineProblemTemp<FmhaFwdTraits_, FmhaMask>;
 
-              using FmhaPipeline =
+              using FmhaFwdPipeline_ =
                   ck::tile_program::block::BlockFmhaPipelineQRKSVS<
                       FmhaPipelineProblem>;
-              using FmhaKernel = FmhaFwdKernel<
-                  FmhaTilePartitioner,
-                  FmhaPipeline,
-                  FmhaEpilogue>;
 
-              RunWithKernel<FmhaKernel>(param, stream);
+              using FmhaFwdKernel_ = FmhaFwdKernel<
+                  FmhaFwdTilePartitioner_,
+                  FmhaFwdPipeline_,
+                  FmhaFwdEpilogue_>;
+
+              RunWithKernel<FmhaFwdKernel_>(param, stream);
             });
       } else {
-        BOOL_SWITCH_2(
-            pad_headdim_q, kPadHeadDimQ, pad_headdim_v, kPadHeadDimV, [&] {
-              using FmhaTraits = ck::tile_program::TileFmhaTraits<
+        BOOL_SWITCH_3(
+            has_dropout,
+            kHasDropout,
+            pad_headdim_q,
+            kPadHeadDimQ,
+            pad_headdim_v,
+            kPadHeadDimV,
+            [&] {
+              using FmhaFwdTraits_ = ck::tile_program::TileFmhaTraits<
                   kPadSeqLenQ,
                   kPadSeqLenK,
                   kPadHeadDimQ,
                   kPadHeadDimV,
                   has_attn_bias,
                   true, // kStoreLSE
-                  false, // kHasDropout
+                  kHasDropout,
                   occupancy>;
 
               using FmhaPipelineProblem =
-                  FmhaPipelineProblemTemp<FmhaTraits, FmhaMask>;
+                  FmhaPipelineProblemTemp<FmhaFwdTraits_, FmhaMask>;
 
-              using FmhaPipeline =
+              using FmhaFwdPipeline_ =
                   ck::tile_program::block::BlockFmhaPipelineQRKSVS<
                       FmhaPipelineProblem>;
-              using FmhaKernel = FmhaFwdKernel<
-                  FmhaTilePartitioner,
-                  FmhaPipeline,
-                  FmhaEpilogue>;
+              using FmhaFwdKernel_ = FmhaFwdKernel<
+                  FmhaFwdTilePartitioner_,
+                  FmhaFwdPipeline_,
+                  FmhaFwdEpilogue_>;
 
-              RunWithKernel<FmhaKernel>(param, stream);
+              RunWithKernel<FmhaFwdKernel_>(param, stream);
             });
       };
     });
   };
 
-  template <typename FmhaKernel>
+  template <typename FmhaFwdKernel>
   static void RunWithKernel(GroupedForwardParams& param, hipStream_t stream) {
     const auto kargs = [&] {
-      return FmhaKernel::MakeKargs(
+      return FmhaFwdKernel::MakeKargs(
           param.q_ptr,
           param.k_ptr,
           param.v_ptr,
@@ -166,7 +180,7 @@ struct grouped_forward_causalmask_attnbias_dispatched {
           param.v_strides[1],
           param.attn_bias_strides[1],
           0, // nhead_stride_randval
-          param.max_seqlen_q, // nhead_stride_lse
+          param.lse_strides[1],
           param.out_strides[1],
           static_cast<CausalMaskType>(param.custom_mask_type),
           param.window_size,
@@ -177,14 +191,14 @@ struct grouped_forward_causalmask_attnbias_dispatched {
           {param.philox_seed, param.philox_offset});
     }();
 
-    dim3 kGridSize = FmhaKernel::GridSize(
+    dim3 kGridSize = FmhaFwdKernel::GridSize(
         param.num_batches, param.Hq, param.max_seqlen_q, param.Kv);
-    constexpr dim3 kBlockSize = FmhaKernel::BlockSize();
-    constexpr ck::index_t kBlockPerCu = FmhaKernel::kBlockPerCu;
+    constexpr dim3 kBlockSize = FmhaFwdKernel::BlockSize();
+    constexpr ck::index_t kBlockPerCu = FmhaFwdKernel::kBlockPerCu;
 
     (void)launch_kernel<kBlockSize.x, kBlockPerCu>(
         StreamConfig{stream, false},
-        FmhaKernel{},
+        FmhaFwdKernel{},
         kGridSize,
         kBlockSize,
         0,
diff --git a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_forward_bp16.cpp b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_forward_bp16.cpp
index b417156f5..9789cee29 100644
--- a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_forward_bp16.cpp
+++ b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_forward_bp16.cpp
@@ -10,6 +10,7 @@
 
 #include "ck_tiled_bool_switch.h"
 #include "ck_tiled_fmha_grouped_forward.h"
+#include "ck_tiled_headdim_switch.h"
 
 // clang-format off
 extern template void run_grouped_forward_causalmask_attnbias_dispatched<ck::bhalf_t, false, true, 32>(
diff --git a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_forward_fp16.cpp b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_forward_fp16.cpp
index b7c278c53..d49eaa5cc 100644
--- a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_forward_fp16.cpp
+++ b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_forward_fp16.cpp
@@ -10,6 +10,7 @@
 
 #include "ck_tiled_bool_switch.h"
 #include "ck_tiled_fmha_grouped_forward.h"
+#include "ck_tiled_headdim_switch.h"
 
 // clang-format off
 extern template void run_grouped_forward_causalmask_attnbias_dispatched<ck::half_t, false, true, 32>(
diff --git a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_params.h b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_params.h
index 5d2c232ba..7f2878487 100644
--- a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_params.h
+++ b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_params.h
@@ -28,6 +28,9 @@ struct BatchedInferParams {
   std::array<int, 4> out_strides;
   std::array<int, 4> attn_bias_strides; // 4d tensor_view [B, H, M, N]
 
+  // BHM mode strides, completely contiguous
+  std::array<int, 3> lse_strides;
+
   const void* q_ptr;
   const void* k_ptr;
   const void* v_ptr;
@@ -78,6 +81,9 @@ struct GroupedInferParams {
   // 4d tensor view [B, H, M, N]
   std::array<int, 4> attn_bias_strides;
 
+  // BHM mode strides, completely contiguous
+  std::array<int, 3> lse_strides;
+
   const void* q_ptr;
   const void* k_ptr;
   const void* v_ptr;
@@ -99,9 +105,6 @@ struct GroupedForwardParams : public GroupedInferParams {
 
   // completely contiguous
   void* logsumexp_ptr;
-
-  // TODO: need remove this after dev-op fix
-  std::vector<void*> randvals_ptrs;
 };
 
 struct BatchedBackwardParams {
@@ -117,7 +120,6 @@ struct BatchedBackwardParams {
   bool has_attn_bias;
   bool bias_has_grad;
 
-  bool use_fp32_qkv_grad;
   bool is_mqa_gqa;
 
   // BMHK mode strides, last-dim contiguous
@@ -126,9 +128,13 @@ struct BatchedBackwardParams {
   std::array<int, 4> v_strides;
   std::array<int, 4> attn_bias_strides; // 4d tensor_view [B, H, M, N]
   std::array<int, 4> out_strides;
+  std::array<int, 4> grad_out_strides;
 
-  std::array<int, 4> tmp_grad_k_strides;
-  std::array<int, 4> tmp_grad_v_strides;
+  std::array<int, 4> grad_k_strides;
+  std::array<int, 4> grad_v_strides;
+
+  // BHM mode strides, completely contiguous
+  std::array<int, 3> lsed_strides;
 
   const void* q_ptr;
   const void* k_ptr;
@@ -138,6 +144,7 @@ struct BatchedBackwardParams {
   const void* out_ptr;
 
   uint8_t custom_mask_type;
+  int window_size; // local-attention
 
   void* grad_q_ptr;
   void* grad_k_ptr;
@@ -150,6 +157,7 @@ struct BatchedBackwardParams {
 
   // BHM mode lengths, completely contiguous
   const void* logsumexp_ptr;
+  void* dot_out_ptr;
 };
 
 struct GroupedBackwardParams {
@@ -162,16 +170,16 @@ struct GroupedBackwardParams {
   int Kv; // embed_dim for Value
 
   int max_seqlen_q;
+  int max_seqlen_k;
 
-  std::vector<int> host_seqstart_q;
-  std::vector<int> host_seqstart_k;
-  std::vector<int> host_seqlen_k;
+  void* seqstart_q_dev_ptr;
+  void* seqstart_k_dev_ptr;
+  void* seqlen_k_dev_ptr;
 
   float scale;
   bool has_attn_bias;
   bool bias_has_grad;
 
-  bool use_fp32_qkv_grad;
   bool is_mqa_gqa;
 
   // MHK mode strides, last-dim contiguous
@@ -179,37 +187,36 @@ struct GroupedBackwardParams {
   std::array<int, 3> k_strides;
   std::array<int, 3> v_strides;
   std::array<int, 3> out_strides;
+  std::array<int, 3> grad_out_strides;
   // 4d tensor view [B, H, M, N]
   std::array<int, 4> attn_bias_strides;
 
-  std::array<int, 3> tmp_grad_k_strides;
-  std::array<int, 3> tmp_grad_v_strides;
+  std::array<int, 3> grad_k_strides;
+  std::array<int, 3> grad_v_strides;
 
-  std::vector<const void*> q_ptrs;
-  std::vector<const void*> k_ptrs;
-  std::vector<const void*> v_ptrs;
-  std::vector<const void*> attn_bias_ptrs;
-  std::vector<const void*> grad_out_ptrs;
-  std::vector<const void*> out_ptrs;
+  // BHM mode strides, completely contiguous
+  std::array<int, 3> lsed_strides;
 
-  // used by the light_v2 kernel
-  // TODO use these as workspace
-  std::vector<void*> ydotdy_ptrs;
+  const void* q_ptr;
+  const void* k_ptr;
+  const void* v_ptr;
+  const void* attn_bias_ptr;
+  const void* grad_out_ptr;
+  const void* out_ptr;
 
   uint8_t custom_mask_type;
+  int window_size; // local-attention
 
-  std::vector<void*> grad_q_ptrs;
-  std::vector<void*> grad_k_ptrs;
-  std::vector<void*> grad_v_ptrs;
-  std::vector<void*> grad_bias_ptrs;
+  void* grad_q_ptr;
+  void* grad_k_ptr;
+  void* grad_v_ptr;
+  void* grad_bias_ptr;
 
   float dropout_prob;
   int64_t philox_seed;
   int64_t philox_offset;
 
   // BHM mode lengths, completely contiguous
-  std::vector<const void*> logsumexp_ptrs;
-
-  // TODO: need remove this after dev-op fix
-  std::vector<void*> randvals_ptrs;
+  const void* logsumexp_ptr;
+  void* dot_out_ptr;
 };
diff --git a/xformers/csrc/attention/hip_fmha/ck_tiled_headdim_switch.h b/xformers/csrc/attention/hip_fmha/ck_tiled_headdim_switch.h
index 6de737c80..ccc8ae0ca 100644
--- a/xformers/csrc/attention/hip_fmha/ck_tiled_headdim_switch.h
+++ b/xformers/csrc/attention/hip_fmha/ck_tiled_headdim_switch.h
@@ -26,3 +26,19 @@
       throw std::runtime_error("Head-dim sizes not supported!");       \
     }                                                                  \
   }()
+
+#define FMHA_BWD_HEADDIM_SWITCH(HEAD_DIM1, HEAD_DIM2, CONST_NAME, ...) \
+  [&] {                                                                \
+    if (HEAD_DIM1 <= 32 && HEAD_DIM2 <= 32) {                          \
+      constexpr ck::index_t CONST_NAME = 32;                           \
+      __VA_ARGS__();                                                   \
+    } else if (HEAD_DIM1 <= 64 && HEAD_DIM2 <= 64) {                   \
+      constexpr ck::index_t CONST_NAME = 64;                           \
+      __VA_ARGS__();                                                   \
+    } else if (HEAD_DIM1 <= 128 && HEAD_DIM2 <= 128) {                 \
+      constexpr ck::index_t CONST_NAME = 128;                          \
+      __VA_ARGS__();                                                   \
+    } else {                                                           \
+      throw std::runtime_error("Head-dim sizes not supported!");       \
+    }                                                                  \
+  }()
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_backward_bp16_no_causalmask_no_attnbias_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_backward_bp16_no_causalmask_no_attnbias_maxk_128.cpp
new file mode 100644
index 000000000..67c5b042f
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_backward_bp16_no_causalmask_no_attnbias_maxk_128.cpp
@@ -0,0 +1,15 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_batched_backward.h"
+
+template void run_batched_backward_causalmask_attnbias_dispatched<
+    ck::bhalf_t,
+    false,
+    false,
+    128>(BatchedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_backward_bp16_no_causalmask_no_attnbias_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_backward_bp16_no_causalmask_no_attnbias_maxk_32.cpp
new file mode 100644
index 000000000..7842cc14e
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_backward_bp16_no_causalmask_no_attnbias_maxk_32.cpp
@@ -0,0 +1,15 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_batched_backward.h"
+
+template void run_batched_backward_causalmask_attnbias_dispatched<
+    ck::bhalf_t,
+    false,
+    false,
+    32>(BatchedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_backward_bp16_no_causalmask_no_attnbias_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_backward_bp16_no_causalmask_no_attnbias_maxk_64.cpp
new file mode 100644
index 000000000..f357331c7
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_backward_bp16_no_causalmask_no_attnbias_maxk_64.cpp
@@ -0,0 +1,15 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_batched_backward.h"
+
+template void run_batched_backward_causalmask_attnbias_dispatched<
+    ck::bhalf_t,
+    false,
+    false,
+    64>(BatchedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_backward_bp16_no_causalmask_with_attnbias_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_backward_bp16_no_causalmask_with_attnbias_maxk_128.cpp
new file mode 100644
index 000000000..ae87f436d
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_backward_bp16_no_causalmask_with_attnbias_maxk_128.cpp
@@ -0,0 +1,15 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_batched_backward.h"
+
+template void run_batched_backward_causalmask_attnbias_dispatched<
+    ck::bhalf_t,
+    false,
+    true,
+    128>(BatchedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_backward_bp16_no_causalmask_with_attnbias_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_backward_bp16_no_causalmask_with_attnbias_maxk_32.cpp
new file mode 100644
index 000000000..27b50a8a6
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_backward_bp16_no_causalmask_with_attnbias_maxk_32.cpp
@@ -0,0 +1,15 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_batched_backward.h"
+
+template void run_batched_backward_causalmask_attnbias_dispatched<
+    ck::bhalf_t,
+    false,
+    true,
+    32>(BatchedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_backward_bp16_no_causalmask_with_attnbias_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_backward_bp16_no_causalmask_with_attnbias_maxk_64.cpp
new file mode 100644
index 000000000..c0944682c
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_backward_bp16_no_causalmask_with_attnbias_maxk_64.cpp
@@ -0,0 +1,15 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_batched_backward.h"
+
+template void run_batched_backward_causalmask_attnbias_dispatched<
+    ck::bhalf_t,
+    false,
+    true,
+    64>(BatchedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_backward_bp16_with_causalmask_no_attnbias_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_backward_bp16_with_causalmask_no_attnbias_maxk_128.cpp
new file mode 100644
index 000000000..3329e61b6
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_backward_bp16_with_causalmask_no_attnbias_maxk_128.cpp
@@ -0,0 +1,15 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_batched_backward.h"
+
+template void run_batched_backward_causalmask_attnbias_dispatched<
+    ck::bhalf_t,
+    true,
+    false,
+    128>(BatchedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_backward_bp16_with_causalmask_no_attnbias_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_backward_bp16_with_causalmask_no_attnbias_maxk_32.cpp
new file mode 100644
index 000000000..2affa3ff9
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_backward_bp16_with_causalmask_no_attnbias_maxk_32.cpp
@@ -0,0 +1,15 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_batched_backward.h"
+
+template void run_batched_backward_causalmask_attnbias_dispatched<
+    ck::bhalf_t,
+    true,
+    false,
+    32>(BatchedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_backward_bp16_with_causalmask_no_attnbias_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_backward_bp16_with_causalmask_no_attnbias_maxk_64.cpp
new file mode 100644
index 000000000..7b3c001fe
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_backward_bp16_with_causalmask_no_attnbias_maxk_64.cpp
@@ -0,0 +1,15 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_batched_backward.h"
+
+template void run_batched_backward_causalmask_attnbias_dispatched<
+    ck::bhalf_t,
+    true,
+    false,
+    64>(BatchedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_backward_bp16_with_causalmask_with_attnbias_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_backward_bp16_with_causalmask_with_attnbias_maxk_128.cpp
new file mode 100644
index 000000000..15b46c6e9
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_backward_bp16_with_causalmask_with_attnbias_maxk_128.cpp
@@ -0,0 +1,15 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_batched_backward.h"
+
+template void run_batched_backward_causalmask_attnbias_dispatched<
+    ck::bhalf_t,
+    true,
+    true,
+    128>(BatchedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_backward_bp16_with_causalmask_with_attnbias_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_backward_bp16_with_causalmask_with_attnbias_maxk_32.cpp
new file mode 100644
index 000000000..29cb04307
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_backward_bp16_with_causalmask_with_attnbias_maxk_32.cpp
@@ -0,0 +1,15 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_batched_backward.h"
+
+template void run_batched_backward_causalmask_attnbias_dispatched<
+    ck::bhalf_t,
+    true,
+    true,
+    32>(BatchedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_backward_bp16_with_causalmask_with_attnbias_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_backward_bp16_with_causalmask_with_attnbias_maxk_64.cpp
new file mode 100644
index 000000000..9c28e4a53
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_backward_bp16_with_causalmask_with_attnbias_maxk_64.cpp
@@ -0,0 +1,15 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_batched_backward.h"
+
+template void run_batched_backward_causalmask_attnbias_dispatched<
+    ck::bhalf_t,
+    true,
+    true,
+    64>(BatchedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_backward_fp16_no_causalmask_no_attnbias_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_backward_fp16_no_causalmask_no_attnbias_maxk_128.cpp
new file mode 100644
index 000000000..24a39ad28
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_backward_fp16_no_causalmask_no_attnbias_maxk_128.cpp
@@ -0,0 +1,15 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_batched_backward.h"
+
+template void run_batched_backward_causalmask_attnbias_dispatched<
+    ck::half_t,
+    false,
+    false,
+    128>(BatchedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_backward_fp16_no_causalmask_no_attnbias_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_backward_fp16_no_causalmask_no_attnbias_maxk_32.cpp
new file mode 100644
index 000000000..ebf7765ac
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_backward_fp16_no_causalmask_no_attnbias_maxk_32.cpp
@@ -0,0 +1,15 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_batched_backward.h"
+
+template void run_batched_backward_causalmask_attnbias_dispatched<
+    ck::half_t,
+    false,
+    false,
+    32>(BatchedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_backward_fp16_no_causalmask_no_attnbias_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_backward_fp16_no_causalmask_no_attnbias_maxk_64.cpp
new file mode 100644
index 000000000..03418ee58
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_backward_fp16_no_causalmask_no_attnbias_maxk_64.cpp
@@ -0,0 +1,15 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_batched_backward.h"
+
+template void run_batched_backward_causalmask_attnbias_dispatched<
+    ck::half_t,
+    false,
+    false,
+    64>(BatchedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_backward_fp16_no_causalmask_with_attnbias_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_backward_fp16_no_causalmask_with_attnbias_maxk_128.cpp
new file mode 100644
index 000000000..315950620
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_backward_fp16_no_causalmask_with_attnbias_maxk_128.cpp
@@ -0,0 +1,15 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_batched_backward.h"
+
+template void run_batched_backward_causalmask_attnbias_dispatched<
+    ck::half_t,
+    false,
+    true,
+    128>(BatchedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_backward_fp16_no_causalmask_with_attnbias_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_backward_fp16_no_causalmask_with_attnbias_maxk_32.cpp
new file mode 100644
index 000000000..1ddf23a3b
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_backward_fp16_no_causalmask_with_attnbias_maxk_32.cpp
@@ -0,0 +1,15 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_batched_backward.h"
+
+template void run_batched_backward_causalmask_attnbias_dispatched<
+    ck::half_t,
+    false,
+    true,
+    32>(BatchedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_backward_fp16_no_causalmask_with_attnbias_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_backward_fp16_no_causalmask_with_attnbias_maxk_64.cpp
new file mode 100644
index 000000000..4f09b8fe1
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_backward_fp16_no_causalmask_with_attnbias_maxk_64.cpp
@@ -0,0 +1,15 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_batched_backward.h"
+
+template void run_batched_backward_causalmask_attnbias_dispatched<
+    ck::half_t,
+    false,
+    true,
+    64>(BatchedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_backward_fp16_with_causalmask_no_attnbias_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_backward_fp16_with_causalmask_no_attnbias_maxk_128.cpp
new file mode 100644
index 000000000..89066e511
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_backward_fp16_with_causalmask_no_attnbias_maxk_128.cpp
@@ -0,0 +1,15 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_batched_backward.h"
+
+template void run_batched_backward_causalmask_attnbias_dispatched<
+    ck::half_t,
+    true,
+    false,
+    128>(BatchedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_backward_fp16_with_causalmask_no_attnbias_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_backward_fp16_with_causalmask_no_attnbias_maxk_32.cpp
new file mode 100644
index 000000000..bc7c12971
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_backward_fp16_with_causalmask_no_attnbias_maxk_32.cpp
@@ -0,0 +1,15 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_batched_backward.h"
+
+template void run_batched_backward_causalmask_attnbias_dispatched<
+    ck::half_t,
+    true,
+    false,
+    32>(BatchedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_backward_fp16_with_causalmask_no_attnbias_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_backward_fp16_with_causalmask_no_attnbias_maxk_64.cpp
new file mode 100644
index 000000000..d53fa0dbe
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_backward_fp16_with_causalmask_no_attnbias_maxk_64.cpp
@@ -0,0 +1,15 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_batched_backward.h"
+
+template void run_batched_backward_causalmask_attnbias_dispatched<
+    ck::half_t,
+    true,
+    false,
+    64>(BatchedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_backward_fp16_with_causalmask_with_attnbias_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_backward_fp16_with_causalmask_with_attnbias_maxk_128.cpp
new file mode 100644
index 000000000..8d2535cfb
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_backward_fp16_with_causalmask_with_attnbias_maxk_128.cpp
@@ -0,0 +1,15 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_batched_backward.h"
+
+template void run_batched_backward_causalmask_attnbias_dispatched<
+    ck::half_t,
+    true,
+    true,
+    128>(BatchedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_backward_fp16_with_causalmask_with_attnbias_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_backward_fp16_with_causalmask_with_attnbias_maxk_32.cpp
new file mode 100644
index 000000000..3754898df
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_backward_fp16_with_causalmask_with_attnbias_maxk_32.cpp
@@ -0,0 +1,15 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_batched_backward.h"
+
+template void run_batched_backward_causalmask_attnbias_dispatched<
+    ck::half_t,
+    true,
+    true,
+    32>(BatchedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_backward_fp16_with_causalmask_with_attnbias_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_backward_fp16_with_causalmask_with_attnbias_maxk_64.cpp
new file mode 100644
index 000000000..991a285c9
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_backward_fp16_with_causalmask_with_attnbias_maxk_64.cpp
@@ -0,0 +1,15 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_batched_backward.h"
+
+template void run_batched_backward_causalmask_attnbias_dispatched<
+    ck::half_t,
+    true,
+    true,
+    64>(BatchedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_backward_bp16_no_causalmask_no_attnbias_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_backward_bp16_no_causalmask_no_attnbias_maxk_128.cpp
new file mode 100644
index 000000000..343cbfcba
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_backward_bp16_no_causalmask_no_attnbias_maxk_128.cpp
@@ -0,0 +1,15 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_grouped_backward.h"
+
+template void run_grouped_backward_causalmask_attnbias_dispatched<
+    ck::bhalf_t,
+    false,
+    false,
+    128>(GroupedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_backward_bp16_no_causalmask_no_attnbias_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_backward_bp16_no_causalmask_no_attnbias_maxk_32.cpp
new file mode 100644
index 000000000..484edc279
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_backward_bp16_no_causalmask_no_attnbias_maxk_32.cpp
@@ -0,0 +1,15 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_grouped_backward.h"
+
+template void run_grouped_backward_causalmask_attnbias_dispatched<
+    ck::bhalf_t,
+    false,
+    false,
+    32>(GroupedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_backward_bp16_no_causalmask_no_attnbias_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_backward_bp16_no_causalmask_no_attnbias_maxk_64.cpp
new file mode 100644
index 000000000..5e1a6bba0
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_backward_bp16_no_causalmask_no_attnbias_maxk_64.cpp
@@ -0,0 +1,15 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_grouped_backward.h"
+
+template void run_grouped_backward_causalmask_attnbias_dispatched<
+    ck::bhalf_t,
+    false,
+    false,
+    64>(GroupedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_backward_bp16_no_causalmask_with_attnbias_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_backward_bp16_no_causalmask_with_attnbias_maxk_128.cpp
new file mode 100644
index 000000000..9e93e28ea
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_backward_bp16_no_causalmask_with_attnbias_maxk_128.cpp
@@ -0,0 +1,15 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_grouped_backward.h"
+
+template void run_grouped_backward_causalmask_attnbias_dispatched<
+    ck::bhalf_t,
+    false,
+    true,
+    128>(GroupedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_backward_bp16_no_causalmask_with_attnbias_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_backward_bp16_no_causalmask_with_attnbias_maxk_32.cpp
new file mode 100644
index 000000000..84d0377ed
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_backward_bp16_no_causalmask_with_attnbias_maxk_32.cpp
@@ -0,0 +1,15 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_grouped_backward.h"
+
+template void run_grouped_backward_causalmask_attnbias_dispatched<
+    ck::bhalf_t,
+    false,
+    true,
+    32>(GroupedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_backward_bp16_no_causalmask_with_attnbias_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_backward_bp16_no_causalmask_with_attnbias_maxk_64.cpp
new file mode 100644
index 000000000..7fc71497e
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_backward_bp16_no_causalmask_with_attnbias_maxk_64.cpp
@@ -0,0 +1,15 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_grouped_backward.h"
+
+template void run_grouped_backward_causalmask_attnbias_dispatched<
+    ck::bhalf_t,
+    false,
+    true,
+    64>(GroupedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_backward_bp16_with_causalmask_no_attnbias_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_backward_bp16_with_causalmask_no_attnbias_maxk_128.cpp
new file mode 100644
index 000000000..1bed5bed0
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_backward_bp16_with_causalmask_no_attnbias_maxk_128.cpp
@@ -0,0 +1,15 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_grouped_backward.h"
+
+template void run_grouped_backward_causalmask_attnbias_dispatched<
+    ck::bhalf_t,
+    true,
+    false,
+    128>(GroupedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_backward_bp16_with_causalmask_no_attnbias_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_backward_bp16_with_causalmask_no_attnbias_maxk_32.cpp
new file mode 100644
index 000000000..635e9c390
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_backward_bp16_with_causalmask_no_attnbias_maxk_32.cpp
@@ -0,0 +1,15 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_grouped_backward.h"
+
+template void run_grouped_backward_causalmask_attnbias_dispatched<
+    ck::bhalf_t,
+    true,
+    false,
+    32>(GroupedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_backward_bp16_with_causalmask_no_attnbias_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_backward_bp16_with_causalmask_no_attnbias_maxk_64.cpp
new file mode 100644
index 000000000..af52c955f
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_backward_bp16_with_causalmask_no_attnbias_maxk_64.cpp
@@ -0,0 +1,15 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_grouped_backward.h"
+
+template void run_grouped_backward_causalmask_attnbias_dispatched<
+    ck::bhalf_t,
+    true,
+    false,
+    64>(GroupedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_backward_bp16_with_causalmask_with_attnbias_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_backward_bp16_with_causalmask_with_attnbias_maxk_128.cpp
new file mode 100644
index 000000000..495ad8580
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_backward_bp16_with_causalmask_with_attnbias_maxk_128.cpp
@@ -0,0 +1,15 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_grouped_backward.h"
+
+template void run_grouped_backward_causalmask_attnbias_dispatched<
+    ck::bhalf_t,
+    true,
+    true,
+    128>(GroupedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_backward_bp16_with_causalmask_with_attnbias_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_backward_bp16_with_causalmask_with_attnbias_maxk_32.cpp
new file mode 100644
index 000000000..a487c5db2
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_backward_bp16_with_causalmask_with_attnbias_maxk_32.cpp
@@ -0,0 +1,15 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_grouped_backward.h"
+
+template void run_grouped_backward_causalmask_attnbias_dispatched<
+    ck::bhalf_t,
+    true,
+    true,
+    32>(GroupedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_backward_bp16_with_causalmask_with_attnbias_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_backward_bp16_with_causalmask_with_attnbias_maxk_64.cpp
new file mode 100644
index 000000000..360970962
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_backward_bp16_with_causalmask_with_attnbias_maxk_64.cpp
@@ -0,0 +1,15 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_grouped_backward.h"
+
+template void run_grouped_backward_causalmask_attnbias_dispatched<
+    ck::bhalf_t,
+    true,
+    true,
+    64>(GroupedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_backward_fp16_no_causalmask_no_attnbias_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_backward_fp16_no_causalmask_no_attnbias_maxk_128.cpp
new file mode 100644
index 000000000..3547d310f
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_backward_fp16_no_causalmask_no_attnbias_maxk_128.cpp
@@ -0,0 +1,15 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_grouped_backward.h"
+
+template void run_grouped_backward_causalmask_attnbias_dispatched<
+    ck::half_t,
+    false,
+    false,
+    128>(GroupedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_backward_fp16_no_causalmask_no_attnbias_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_backward_fp16_no_causalmask_no_attnbias_maxk_32.cpp
new file mode 100644
index 000000000..24aeb3aee
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_backward_fp16_no_causalmask_no_attnbias_maxk_32.cpp
@@ -0,0 +1,15 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_grouped_backward.h"
+
+template void run_grouped_backward_causalmask_attnbias_dispatched<
+    ck::half_t,
+    false,
+    false,
+    32>(GroupedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_backward_fp16_no_causalmask_no_attnbias_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_backward_fp16_no_causalmask_no_attnbias_maxk_64.cpp
new file mode 100644
index 000000000..e3e51ae4a
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_backward_fp16_no_causalmask_no_attnbias_maxk_64.cpp
@@ -0,0 +1,15 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_grouped_backward.h"
+
+template void run_grouped_backward_causalmask_attnbias_dispatched<
+    ck::half_t,
+    false,
+    false,
+    64>(GroupedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_backward_fp16_no_causalmask_with_attnbias_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_backward_fp16_no_causalmask_with_attnbias_maxk_128.cpp
new file mode 100644
index 000000000..67e153ffc
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_backward_fp16_no_causalmask_with_attnbias_maxk_128.cpp
@@ -0,0 +1,15 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_grouped_backward.h"
+
+template void run_grouped_backward_causalmask_attnbias_dispatched<
+    ck::half_t,
+    false,
+    true,
+    128>(GroupedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_backward_fp16_no_causalmask_with_attnbias_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_backward_fp16_no_causalmask_with_attnbias_maxk_32.cpp
new file mode 100644
index 000000000..ec7336a51
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_backward_fp16_no_causalmask_with_attnbias_maxk_32.cpp
@@ -0,0 +1,15 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_grouped_backward.h"
+
+template void run_grouped_backward_causalmask_attnbias_dispatched<
+    ck::half_t,
+    false,
+    true,
+    32>(GroupedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_backward_fp16_no_causalmask_with_attnbias_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_backward_fp16_no_causalmask_with_attnbias_maxk_64.cpp
new file mode 100644
index 000000000..13a5d40eb
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_backward_fp16_no_causalmask_with_attnbias_maxk_64.cpp
@@ -0,0 +1,15 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_grouped_backward.h"
+
+template void run_grouped_backward_causalmask_attnbias_dispatched<
+    ck::half_t,
+    false,
+    true,
+    64>(GroupedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_backward_fp16_with_causalmask_no_attnbias_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_backward_fp16_with_causalmask_no_attnbias_maxk_128.cpp
new file mode 100644
index 000000000..058f08c65
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_backward_fp16_with_causalmask_no_attnbias_maxk_128.cpp
@@ -0,0 +1,15 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_grouped_backward.h"
+
+template void run_grouped_backward_causalmask_attnbias_dispatched<
+    ck::half_t,
+    true,
+    false,
+    128>(GroupedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_backward_fp16_with_causalmask_no_attnbias_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_backward_fp16_with_causalmask_no_attnbias_maxk_32.cpp
new file mode 100644
index 000000000..469b2d2e4
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_backward_fp16_with_causalmask_no_attnbias_maxk_32.cpp
@@ -0,0 +1,15 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_grouped_backward.h"
+
+template void run_grouped_backward_causalmask_attnbias_dispatched<
+    ck::half_t,
+    true,
+    false,
+    32>(GroupedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_backward_fp16_with_causalmask_no_attnbias_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_backward_fp16_with_causalmask_no_attnbias_maxk_64.cpp
new file mode 100644
index 000000000..3675cd20a
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_backward_fp16_with_causalmask_no_attnbias_maxk_64.cpp
@@ -0,0 +1,15 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_grouped_backward.h"
+
+template void run_grouped_backward_causalmask_attnbias_dispatched<
+    ck::half_t,
+    true,
+    false,
+    64>(GroupedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_backward_fp16_with_causalmask_with_attnbias_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_backward_fp16_with_causalmask_with_attnbias_maxk_128.cpp
new file mode 100644
index 000000000..0433020e0
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_backward_fp16_with_causalmask_with_attnbias_maxk_128.cpp
@@ -0,0 +1,15 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_grouped_backward.h"
+
+template void run_grouped_backward_causalmask_attnbias_dispatched<
+    ck::half_t,
+    true,
+    true,
+    128>(GroupedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_backward_fp16_with_causalmask_with_attnbias_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_backward_fp16_with_causalmask_with_attnbias_maxk_32.cpp
new file mode 100644
index 000000000..322c41f15
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_backward_fp16_with_causalmask_with_attnbias_maxk_32.cpp
@@ -0,0 +1,15 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_grouped_backward.h"
+
+template void run_grouped_backward_causalmask_attnbias_dispatched<
+    ck::half_t,
+    true,
+    true,
+    32>(GroupedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_backward_fp16_with_causalmask_with_attnbias_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_backward_fp16_with_causalmask_with_attnbias_maxk_64.cpp
new file mode 100644
index 000000000..885e757c8
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_backward_fp16_with_causalmask_with_attnbias_maxk_64.cpp
@@ -0,0 +1,15 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_grouped_backward.h"
+
+template void run_grouped_backward_causalmask_attnbias_dispatched<
+    ck::half_t,
+    true,
+    true,
+    64>(GroupedBackwardParams& param, hipStream_t stream);

From 28e713d03e1a49f5154f2239514909d0067bedc6 Mon Sep 17 00:00:00 2001
From: Qianfeng Zhang <Qianfeng.Zhang@amd.com>
Date: Wed, 27 Mar 2024 16:26:40 +0000
Subject: [PATCH 490/641] Update to add dropout for fmah backward

---
 .../hip_fmha/ck_tiled_fmha_batched_backward.h   | 17 +++++++++++++----
 .../hip_fmha/ck_tiled_fmha_grouped_backward.h   | 16 ++++++++++++----
 2 files changed, 25 insertions(+), 8 deletions(-)

diff --git a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_backward.h b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_backward.h
index 84ea5f423..a51be2f41 100644
--- a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_backward.h
+++ b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_backward.h
@@ -52,8 +52,8 @@ struct batched_backward_causalmask_attnbias_dispatched {
           typename FmhaBwdTypeConfig<scalar_t>::LSEDataType,
           typename FmhaBwdTypeConfig<scalar_t>::AccDataType,
           typename FmhaBwdTypeConfig<scalar_t>::DDataType,
-          typename FmhaBwdTypeConfig<scalar_t>::RandValOutputDataType,
           typename FmhaBwdTypeConfig<scalar_t>::BiasDataType,
+          typename FmhaBwdTypeConfig<scalar_t>::RandValOutputDataType,
           typename FmhaBwdTypeConfig<scalar_t>::ODataType,
           typename FmhaBwdTypeConfig<scalar_t>::OGradDataType,
           typename FmhaBwdTypeConfig<scalar_t>::QGradDataType,
@@ -180,6 +180,7 @@ struct batched_backward_causalmask_attnbias_dispatched {
           param.out_ptr,
           param.grad_out_ptr,
           param.dot_out_ptr,
+          1.0f - param.dropout_prob,
           param.M,
           param.Kv,
           param.grad_out_strides[1], // stride_do
@@ -219,14 +220,16 @@ struct batched_backward_causalmask_attnbias_dispatched {
           param.logsumexp_ptr,
           param.grad_out_ptr,
           param.dot_out_ptr,
+          nullptr, // rand_val_ptr
           param.grad_q_ptr,
           param.grad_k_ptr,
           param.grad_v_ptr,
           param.grad_bias_ptr,
           param.M, // seqlen_q
           param.N, // seqlen_k
-          param.Hq, // nhead_q
-          param.Hkv, // nhead_v
+          param.K,
+          param.Kv,
+          param.Hq,
           param.Hq / param.Hkv,
           param.scale,
           param.q_strides[1], // q, k, v, bias, do, o, dk, dv, dbias seq-dim
@@ -234,6 +237,7 @@ struct batched_backward_causalmask_attnbias_dispatched {
           param.k_strides[1],
           param.v_strides[1],
           param.attn_bias_strides[2],
+          0, // stride_randval
           param.grad_out_strides[1],
           param.grad_k_strides[1],
           param.grad_v_strides[1],
@@ -244,6 +248,7 @@ struct batched_backward_causalmask_attnbias_dispatched {
           param.k_strides[2],
           param.v_strides[2],
           param.attn_bias_strides[1],
+          0, // nhead_stride_randval
           param.grad_out_strides[2],
           param.lsed_strides[1],
           param.attn_bias_strides[1], // assume grad_bias has same strides as
@@ -253,6 +258,7 @@ struct batched_backward_causalmask_attnbias_dispatched {
           param.k_strides[0],
           param.v_strides[0],
           param.attn_bias_strides[0],
+          0, // batch_stride_randval
           param.grad_out_strides[0],
           param.lsed_strides[0], // lse/dot is in BHM contiguous layout
           param.grad_k_strides[0],
@@ -260,7 +266,10 @@ struct batched_backward_causalmask_attnbias_dispatched {
           param.attn_bias_strides[0], // assume grad_bias has same strides as
                                       // bias
           static_cast<CausalMaskType>(param.custom_mask_type),
-          param.window_size);
+          param.window_size,
+          param.dropout_prob, // dropout ratio
+          false, // is_store_randval
+          {param.philox_seed, param.philox_offset});
     }();
 
     dim3 kGridSize = FmhaBwdKernel::GridSize(param.B, param.Hq, param.N);
diff --git a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_backward.h b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_backward.h
index 7fab9f2c8..5220071bd 100644
--- a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_backward.h
+++ b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_backward.h
@@ -52,8 +52,8 @@ struct grouped_backward_causalmask_attnbias_dispatched {
           typename FmhaBwdTypeConfig<scalar_t>::LSEDataType,
           typename FmhaBwdTypeConfig<scalar_t>::AccDataType,
           typename FmhaBwdTypeConfig<scalar_t>::DDataType,
-          typename FmhaBwdTypeConfig<scalar_t>::RandValOutputDataType,
           typename FmhaBwdTypeConfig<scalar_t>::BiasDataType,
+          typename FmhaBwdTypeConfig<scalar_t>::RandValOutputDataType,
           typename FmhaBwdTypeConfig<scalar_t>::ODataType,
           typename FmhaBwdTypeConfig<scalar_t>::OGradDataType,
           typename FmhaBwdTypeConfig<scalar_t>::QGradDataType,
@@ -167,6 +167,7 @@ struct grouped_backward_causalmask_attnbias_dispatched {
           param.out_ptr,
           param.grad_out_ptr,
           param.dot_out_ptr,
+          1.0f - param.dropout_prob,
           param.seqstart_q_dev_ptr,
           param.Kv,
           param.grad_out_strides[0], // stride_do
@@ -203,6 +204,7 @@ struct grouped_backward_causalmask_attnbias_dispatched {
           param.logsumexp_ptr,
           param.grad_out_ptr,
           param.dot_out_ptr,
+          nullptr, // randval_ptr
           param.grad_q_ptr,
           param.grad_k_ptr,
           param.grad_v_ptr,
@@ -210,8 +212,9 @@ struct grouped_backward_causalmask_attnbias_dispatched {
           param.seqstart_q_dev_ptr,
           param.seqstart_k_dev_ptr,
           param.seqlen_k_dev_ptr,
-          param.Hq, // nhead_q
-          param.Hkv, // nhead_v
+          param.K,
+          param.Kv,
+          param.Hq,
           param.Hq / param.Hkv,
           param.scale,
           param.q_strides[0], // q, k, v, bias, do, o, dk, dv, dbias seq-dim
@@ -219,6 +222,7 @@ struct grouped_backward_causalmask_attnbias_dispatched {
           param.k_strides[0],
           param.v_strides[0],
           param.attn_bias_strides[1],
+          0, // stride_randval
           param.grad_out_strides[0],
           param.grad_k_strides[0],
           param.grad_v_strides[0],
@@ -229,12 +233,16 @@ struct grouped_backward_causalmask_attnbias_dispatched {
           param.k_strides[1],
           param.v_strides[1],
           param.attn_bias_strides[0],
+          0, // nhead_stride_randval
           param.grad_out_strides[1],
           param.lsed_strides[1], // assume lse/dot is in BHM contiguous layout
           param.attn_bias_strides[0], // assume grad_bias has same strides as
                                       // bias
           static_cast<CausalMaskType>(param.custom_mask_type),
-          param.window_size);
+          param.window_size,
+          param.dropout_prob, // dropout ratio
+          false, // is_store_randval
+          {param.philox_seed, param.philox_offset});
     }();
 
     dim3 kGridSize = FmhaBwdKernel::GridSize(

From 4ef7eba711f8f0f136f39d781a92cc7a88ea35bc Mon Sep 17 00:00:00 2001
From: Qianfeng Zhang <Qianfeng.Zhang@amd.com>
Date: Wed, 27 Mar 2024 17:23:42 +0000
Subject: [PATCH 491/641] Update in attention.cpp to align
 efficient_attention_backward_ck interface parameters

---
 xformers/csrc/attention/attention.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/xformers/csrc/attention/attention.cpp b/xformers/csrc/attention/attention.cpp
index 36a9675e7..e5998de5b 100644
--- a/xformers/csrc/attention/attention.cpp
+++ b/xformers/csrc/attention/attention.cpp
@@ -48,7 +48,7 @@ TORCH_LIBRARY_FRAGMENT(xformers, m) {
       "xformers::efficient_attention_forward_decoder_splitk_ck(Tensor query, Tensor key, "
       " Tensor value, Tensor? seq_positions, float scale, int split_k) -> Tensor"));
   m.def(TORCH_SELECTIVE_SCHEMA(
-      "xformers::efficient_attention_backward_ck(Tensor grad_out, Tensor query, Tensor key, Tensor value, Tensor? attn_bias, Tensor? seqstart_q, Tensor? seqstart_k, int? max_seqlen_q, Tensor? seqlen_k, Tensor logsumexp, Tensor output, float dropout_p, int rng_seed, int rng_offset, int custom_mask_type, float? scale) -> (Tensor, Tensor, Tensor, Tensor)"));
+      "xformers::efficient_attention_backward_ck(Tensor grad_out, Tensor query, Tensor key, Tensor value, Tensor? attn_bias, Tensor? seqstart_q, Tensor? seqstart_k, int? max_seqlen_q, int? max_seqlen_k, Tensor? seqlen_k, Tensor logsumexp, Tensor output, float dropout_p, int rng_seed, int rng_offset, int custom_mask_type, float? scale, int? window_size) -> (Tensor, Tensor, Tensor, Tensor)"));
   m.def(TORCH_SELECTIVE_SCHEMA(
       "xformers::_ck_rand_uniform(float p, Tensor out) -> Tensor"));
 #endif

From 48a5f3e757b984d046d688700947664437c48b7e Mon Sep 17 00:00:00 2001
From: Qianfeng Zhang <Qianfeng.Zhang@amd.com>
Date: Wed, 27 Mar 2024 23:58:47 +0000
Subject: [PATCH 492/641] Enable BwdOp in ck.py

---
 xformers/ops/fmha/ck.py | 35 +++++++++++++++++++++--------------
 1 file changed, 21 insertions(+), 14 deletions(-)

diff --git a/xformers/ops/fmha/ck.py b/xformers/ops/fmha/ck.py
index aaca59113..819e9d85e 100644
--- a/xformers/ops/fmha/ck.py
+++ b/xformers/ops/fmha/ck.py
@@ -50,18 +50,15 @@ def _get_seqlen_info(
         seqstart_k = attn_bias.k_seqinfo.seqstart
         seqstart_q = attn_bias.q_seqinfo.seqstart
         max_seqlen_q = attn_bias.q_seqinfo.max_seqlen
+        max_seqlen_k = attn_bias.k_seqinfo.max_seqlen
     else:
         seqstart_k = None
         seqstart_q = None
         max_seqlen_q = -1
+        max_seqlen_k = -1
 
-    return (
-        seqstart_k,
-        seqstart_q,
-        max_seqlen_q,
-    )
-
-
+    return seqstart_k, seqstart_q, max_seqlen_q, max_seqlen_k
+    
 def _get_tensor_bias(
     attn_bias: Optional[Union[torch.Tensor, AttentionBias]]
 ) -> Optional[torch.Tensor]:
@@ -266,7 +263,7 @@ def apply_bmhk(
     ) -> Tuple[torch.Tensor, Optional[Context]]:
         if type(inp.attn_bias) not in FwOp.SUPPORTED_ATTN_BIAS_TYPES:
             raise NotImplementedError("Unsupported attn_bias type")
-        seqstart_k, seqstart_q, max_seqlen_q = _get_seqlen_info(inp)
+        seqstart_k, seqstart_q, max_seqlen_q, _ = _get_seqlen_info(inp)
         out, lse, rng_seed, rng_offset = cls.OPERATOR(
             query=inp.query,
             key=inp.key,
@@ -327,8 +324,6 @@ def not_supported_reasons(cls, d: Inputs) -> List[str]:
         requires_grad = (
             d.query.requires_grad or d.key.requires_grad or d.value.requires_grad
         )
-        if requires_grad:
-            reasons.append("Gradience is currently not supported by ck-tiled!")
         return reasons
 
     @classmethod
@@ -363,7 +358,7 @@ class BwOp(AttentionBwOpBase):
     OPERATOR = get_xformers_operator("efficient_attention_backward_ck")
     SUPPORTED_DEVICES = FwOp.SUPPORTED_DEVICES
     SUPPORTED_DTYPES = FwOp.SUPPORTED_DTYPES
-    SUPPORTED_MAX_K = FwOp.SUPPORTED_MAX_K
+    SUPPORTED_MAX_K = 128 
     SUPPORTED_ATTN_BIAS_TYPES: Set[Any] = {
         type(None),
         torch.Tensor,
@@ -387,8 +382,8 @@ class BwOp(AttentionBwOpBase):
 
     _TEST_K: List[int] = [
         32,  # 64x64 kernel
+        64, 
         128,  # 64x128/128x128 kernel
-        256,  # 64x128 with accumulation in gmem
     ]
 
     @classmethod
@@ -423,7 +418,6 @@ def not_supported_reasons(cls, d: Inputs) -> List[str]:
                 )
         _check_large_shapes(reasons, d)
 
-        reasons.append("Backward is currently not supported by ck-tiled!")
         return reasons
 
     @classmethod
@@ -431,7 +425,7 @@ def apply(cls, ctx: Context, inp: Inputs, grad: torch.Tensor) -> Gradients:
         if type(inp.attn_bias) not in BwOp.SUPPORTED_ATTN_BIAS_TYPES:
             raise NotImplementedError("Unsupported attn_bias type")
 
-        seqstart_k, seqstart_q, max_seqlen_q = _get_seqlen_info(inp)
+        seqstart_k, seqstart_q, max_seqlen_q, max_seqlen_k = _get_seqlen_info(inp)
         dtype = inp.query.dtype
 
         rng_seed = rng_offset = 0
@@ -454,6 +448,7 @@ def apply(cls, ctx: Context, inp: Inputs, grad: torch.Tensor) -> Gradients:
             seqstart_q=seqstart_q,
             seqstart_k=seqstart_k,
             max_seqlen_q=max_seqlen_q,
+            max_seqlen_k=max_seqlen_k,
             seqlen_k=(
                 inp.attn_bias.k_seqinfo.seqlen
                 if isinstance(
@@ -472,6 +467,18 @@ def apply(cls, ctx: Context, inp: Inputs, grad: torch.Tensor) -> Gradients:
             rng_offset=rng_offset,
             custom_mask_type=_custom_mask_type(inp.attn_bias),
             scale=inp.scale,
+            window_size=(
+                inp.attn_bias._window_size
+                if isinstance(
+                    inp.attn_bias,
+                    (
+                        BlockDiagonalCausalLocalAttentionMask,
+                        BlockDiagonalCausalLocalAttentionFromBottomRightMask,
+                        LowerTriangularFromBottomRightLocalAttentionMask,
+                    ),
+                )
+                else None
+            ),
         )
 
         # c++/CUDA implementation returns an uninitialized tensor if bias doesn't

From 2e45012be83dcfdc54582ae8854fd9ea7b7adbbe Mon Sep 17 00:00:00 2001
From: Qianfeng Zhang <Qianfeng.Zhang@amd.com>
Date: Thu, 28 Mar 2024 00:01:40 +0000
Subject: [PATCH 493/641] Support grad_out to have different strides as out

---
 .../attention_backward_generic_ck_tiled.cpp        |  5 ++---
 .../hip_fmha/ck_tiled_fmha_batched_backward.h      | 14 ++++++++------
 .../hip_fmha/ck_tiled_fmha_grouped_backward.h      |  8 +++++---
 3 files changed, 15 insertions(+), 12 deletions(-)

diff --git a/xformers/csrc/attention/hip_fmha/attention_backward_generic_ck_tiled.cpp b/xformers/csrc/attention/hip_fmha/attention_backward_generic_ck_tiled.cpp
index 8f93269c6..065cd6484 100644
--- a/xformers/csrc/attention/hip_fmha/attention_backward_generic_ck_tiled.cpp
+++ b/xformers/csrc/attention/hip_fmha/attention_backward_generic_ck_tiled.cpp
@@ -82,12 +82,11 @@ efficient_attention_backward_ck(
   TORCH_CHECK(query.size(3) == key.size(3));
   TORCH_CHECK(value.size(3) == grad_out.size(3));
 
-  // CK-FlashAttn requires out, grad_out to have same shapes
   TORCH_CHECK(out.sizes() == grad_out.sizes());
 
   // last dim is contiguous, device is CUDA
   CHECK_NOSPARSE_LASTCONTIGUOUS_CUDA(out);
-  CHECK_NOSPARSE_LASTCONTIGUOUS_CUDA(grad_out);
+  // CHECK_NOSPARSE_LASTCONTIGUOUS_CUDA(grad_out);
   CHECK_NOSPARSE_LASTCONTIGUOUS_CUDA(query);
   CHECK_NOSPARSE_LASTCONTIGUOUS_CUDA(key);
   CHECK_NOSPARSE_LASTCONTIGUOUS_CUDA(value);
@@ -295,7 +294,7 @@ efficient_attention_backward_ck(
       if (bias_requires_grad)
         p.grad_bias_ptr = grad_bias.data_ptr();
     } else {
-      p.has_attn_bias = true;
+      p.has_attn_bias = false;
       p.attn_bias_ptr = nullptr;
       p.grad_bias_ptr = nullptr;
     }
diff --git a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_backward.h b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_backward.h
index a51be2f41..a104ce4c7 100644
--- a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_backward.h
+++ b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_backward.h
@@ -185,12 +185,13 @@ struct batched_backward_causalmask_attnbias_dispatched {
           param.Kv,
           param.grad_out_strides[1], // stride_do
           param.out_strides[1], // stride_o
-          param.out_strides[2], // nhead_stride_do
+          param.grad_out_strides[2], // nhead_stride_do
           param.out_strides[2], // nhead_stride_o
           param.lsed_strides[1], // nhead_stride_d
-          param.out_strides[0], // batch_stride_do
+          param.grad_out_strides[0], // batch_stride_do
           param.out_strides[0], // batch_stride_o
-          param.lsed_strides[0]); // batch_stride_d
+          param.lsed_strides[0], // batch_stride_d
+          param.grad_out_strides[3]); // hdim_stride_do
     }();
 
     dim3 kGridSize =
@@ -232,7 +233,7 @@ struct batched_backward_causalmask_attnbias_dispatched {
           param.Hq,
           param.Hq / param.Hkv,
           param.scale,
-          param.q_strides[1], // q, k, v, bias, do, o, dk, dv, dbias seq-dim
+          param.q_strides[1], // q, k, v, bias, do, dk, dv, dbias seq-dim
                               // stride
           param.k_strides[1],
           param.v_strides[1],
@@ -243,7 +244,7 @@ struct batched_backward_causalmask_attnbias_dispatched {
           param.grad_v_strides[1],
           param.attn_bias_strides[2], // assume grad_bias has same strides as
                                       // bias
-          param.q_strides[2], // q, k, v, bias, do, o, lse/dot, dbias
+          param.q_strides[2], // q, k, v, bias, do, lse/dot, dbias
                               // nhead-dim strides
           param.k_strides[2],
           param.v_strides[2],
@@ -253,7 +254,7 @@ struct batched_backward_causalmask_attnbias_dispatched {
           param.lsed_strides[1],
           param.attn_bias_strides[1], // assume grad_bias has same strides as
                                       // bias
-          param.q_strides[0], // q, k, v, bias, do, o, lse/dot, dk, dv, dbias,
+          param.q_strides[0], // q, k, v, bias, do, lse/dot, dk, dv, dbias,
                               // batch-dim strides
           param.k_strides[0],
           param.v_strides[0],
@@ -265,6 +266,7 @@ struct batched_backward_causalmask_attnbias_dispatched {
           param.grad_v_strides[0],
           param.attn_bias_strides[0], // assume grad_bias has same strides as
                                       // bias
+          param.grad_out_strides[3], // hdim_stride_do
           static_cast<CausalMaskType>(param.custom_mask_type),
           param.window_size,
           param.dropout_prob, // dropout ratio
diff --git a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_backward.h b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_backward.h
index 5220071bd..9587f2d17 100644
--- a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_backward.h
+++ b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_backward.h
@@ -174,7 +174,8 @@ struct grouped_backward_causalmask_attnbias_dispatched {
           param.out_strides[0], // stride_o
           param.grad_out_strides[1], // nhead_stride_do
           param.out_strides[1], // nhead_stride_o
-          param.lsed_strides[1]);
+          param.lsed_strides[1],
+          param.grad_out_strides[2]); // hdim_stride_do
     }();
 
     dim3 kGridSize = FmhaBwdOGradDotOKernel::GridSize(
@@ -217,7 +218,7 @@ struct grouped_backward_causalmask_attnbias_dispatched {
           param.Hq,
           param.Hq / param.Hkv,
           param.scale,
-          param.q_strides[0], // q, k, v, bias, do, o, dk, dv, dbias seq-dim
+          param.q_strides[0], // q, k, v, bias, do, dk, dv, dbias seq-dim
                               // stride
           param.k_strides[0],
           param.v_strides[0],
@@ -228,7 +229,7 @@ struct grouped_backward_causalmask_attnbias_dispatched {
           param.grad_v_strides[0],
           param.attn_bias_strides[1], // assume grad_bias has same strides as
                                       // bias
-          param.q_strides[1], // q, k, v, bias, do, o, lse/dot, dbias
+          param.q_strides[1], // q, k, v, bias, do, lse/dot, dbias
                               // nhead-dim strides
           param.k_strides[1],
           param.v_strides[1],
@@ -238,6 +239,7 @@ struct grouped_backward_causalmask_attnbias_dispatched {
           param.lsed_strides[1], // assume lse/dot is in BHM contiguous layout
           param.attn_bias_strides[0], // assume grad_bias has same strides as
                                       // bias
+          param.grad_out_strides[2], // hdim_stride_do
           static_cast<CausalMaskType>(param.custom_mask_type),
           param.window_size,
           param.dropout_prob, // dropout ratio

From 566d26ff8009bf27535fa0798763fd1fdb271087 Mon Sep 17 00:00:00 2001
From: Qianfeng Zhang <Qianfeng.Zhang@amd.com>
Date: Fri, 29 Mar 2024 16:38:21 +0000
Subject: [PATCH 494/641] Force seqstart_q/seqstart_k to be in device memory in
 ck.py

---
 xformers/ops/fmha/ck.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/xformers/ops/fmha/ck.py b/xformers/ops/fmha/ck.py
index 819e9d85e..00aa1b02b 100644
--- a/xformers/ops/fmha/ck.py
+++ b/xformers/ops/fmha/ck.py
@@ -47,6 +47,8 @@ def _get_seqlen_info(
     if isinstance(
         attn_bias, (BlockDiagonalMask, BlockDiagonalCausalWithOffsetPaddedKeysMask)
     ):
+        attn_bias.k_seqinfo.to(inp.query.device)
+        attn_bias.q_seqinfo.to(inp.query.device)
         seqstart_k = attn_bias.k_seqinfo.seqstart
         seqstart_q = attn_bias.q_seqinfo.seqstart
         max_seqlen_q = attn_bias.q_seqinfo.max_seqlen

From fc6c4a678319181de4f8b7ef91747aabd22d89e8 Mon Sep 17 00:00:00 2001
From: Qianfeng Zhang <Qianfeng.Zhang@amd.com>
Date: Fri, 29 Mar 2024 16:59:28 +0000
Subject: [PATCH 495/641] Remove duplicated codes in
 ck_tiled_fmha_grouped_forward.h/infer.h

---
 .../hip_fmha/ck_tiled_fmha_grouped_forward.h  | 97 ++++++-------------
 .../hip_fmha/ck_tiled_fmha_grouped_infer.h    | 76 +++++----------
 2 files changed, 54 insertions(+), 119 deletions(-)

diff --git a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_forward.h b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_forward.h
index 548cd013d..43c9d0cc4 100644
--- a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_forward.h
+++ b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_forward.h
@@ -79,72 +79,37 @@ struct grouped_forward_causalmask_attnbias_dispatched {
           !(param.K % FmhaFwdShape_::kK0BlockLength == 0);
       const bool pad_headdim_v = !(param.Kv % FmhaFwdShape_::kN1 == 0);
 
-      if constexpr (MaxK == 256) {
-        BOOL_SWITCH_3(
-            has_dropout,
-            kHasDropout,
-            pad_headdim_q,
-            kPadHeadDimQ,
-            pad_headdim_v,
-            kPadHeadDimV,
-            [&] {
-              using FmhaFwdTraits_ = ck::tile_program::TileFmhaTraits<
-                  kPadSeqLenQ,
-                  kPadSeqLenK,
-                  kPadHeadDimQ,
-                  kPadHeadDimV,
-                  has_attn_bias,
-                  true, // kStoreLSE
-                  kHasDropout,
-                  occupancy>;
-
-              using FmhaPipelineProblem =
-                  FmhaPipelineProblemTemp<FmhaFwdTraits_, FmhaMask>;
-
-              using FmhaFwdPipeline_ =
-                  ck::tile_program::block::BlockFmhaPipelineQRKSVS<
-                      FmhaPipelineProblem>;
-
-              using FmhaFwdKernel_ = FmhaFwdKernel<
-                  FmhaFwdTilePartitioner_,
-                  FmhaFwdPipeline_,
-                  FmhaFwdEpilogue_>;
-
-              RunWithKernel<FmhaFwdKernel_>(param, stream);
-            });
-      } else {
-        BOOL_SWITCH_3(
-            has_dropout,
-            kHasDropout,
-            pad_headdim_q,
-            kPadHeadDimQ,
-            pad_headdim_v,
-            kPadHeadDimV,
-            [&] {
-              using FmhaFwdTraits_ = ck::tile_program::TileFmhaTraits<
-                  kPadSeqLenQ,
-                  kPadSeqLenK,
-                  kPadHeadDimQ,
-                  kPadHeadDimV,
-                  has_attn_bias,
-                  true, // kStoreLSE
-                  kHasDropout,
-                  occupancy>;
-
-              using FmhaPipelineProblem =
-                  FmhaPipelineProblemTemp<FmhaFwdTraits_, FmhaMask>;
-
-              using FmhaFwdPipeline_ =
-                  ck::tile_program::block::BlockFmhaPipelineQRKSVS<
-                      FmhaPipelineProblem>;
-              using FmhaFwdKernel_ = FmhaFwdKernel<
-                  FmhaFwdTilePartitioner_,
-                  FmhaFwdPipeline_,
-                  FmhaFwdEpilogue_>;
-
-              RunWithKernel<FmhaFwdKernel_>(param, stream);
-            });
-      };
+      BOOL_SWITCH_3(
+          has_dropout,
+          kHasDropout,
+          pad_headdim_q,
+          kPadHeadDimQ,
+          pad_headdim_v,
+          kPadHeadDimV,
+          [&] {
+            using FmhaFwdTraits_ = ck::tile_program::TileFmhaTraits<
+                kPadSeqLenQ,
+                kPadSeqLenK,
+                kPadHeadDimQ,
+                kPadHeadDimV,
+                has_attn_bias,
+                true, // kStoreLSE
+                kHasDropout,
+                occupancy>;
+
+            using FmhaPipelineProblem =
+                FmhaPipelineProblemTemp<FmhaFwdTraits_, FmhaMask>;
+
+            using FmhaFwdPipeline_ =
+                ck::tile_program::block::BlockFmhaPipelineQRKSVS<
+                    FmhaPipelineProblem>;
+            using FmhaFwdKernel_ = FmhaFwdKernel<
+                FmhaFwdTilePartitioner_,
+                FmhaFwdPipeline_,
+                FmhaFwdEpilogue_>;
+
+            RunWithKernel<FmhaFwdKernel_>(param, stream);
+          });
     });
   };
 
diff --git a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_infer.h b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_infer.h
index 05975f84f..deb2c1bd7 100644
--- a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_infer.h
+++ b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_infer.h
@@ -78,59 +78,29 @@ struct grouped_infer_causalmask_attnbias_dispatched {
       bool pad_headdim_q = !(param.K % FmhaShape::kK0BlockLength == 0);
       bool pad_headdim_v = !(param.Kv % FmhaShape::kN1 == 0);
 
-      if constexpr (MaxK == 256) {
-        BOOL_SWITCH_2(
-            pad_headdim_q, kPadHeadDimQ, pad_headdim_v, kPadHeadDimV, [&] {
-              using FmhaTraits = ck::tile_program::TileFmhaTraits<
-                  kPadSeqLenQ,
-                  kPadSeqLenK,
-                  kPadHeadDimQ,
-                  kPadHeadDimV,
-                  has_attn_bias,
-                  false, // kStoreLSE
-                  false, // kHasDropout
-                  occupancy>;
-
-              using FmhaPipelineProblem =
-                  FmhaPipelineProblemTemp<FmhaTraits, FmhaMask>;
-
-              using FmhaPipeline =
-                  ck::tile_program::block::BlockFmhaPipelineQRKSVS<
-                      FmhaPipelineProblem>;
-              using FmhaKernel = FmhaFwdKernel<
-                  FmhaTilePartitioner,
-                  FmhaPipeline,
-                  FmhaEpilogue>;
-
-              RunWithKernel<FmhaKernel>(param, stream);
-            });
-      } else {
-        BOOL_SWITCH_2(
-            pad_headdim_q, kPadHeadDimQ, pad_headdim_v, kPadHeadDimV, [&] {
-              using FmhaTraits = ck::tile_program::TileFmhaTraits<
-                  kPadSeqLenQ,
-                  kPadSeqLenK,
-                  kPadHeadDimQ,
-                  kPadHeadDimV,
-                  has_attn_bias,
-                  false, // kStoreLSE
-                  false, // kHasDropout
-                  occupancy>;
-
-              using FmhaPipelineProblem =
-                  FmhaPipelineProblemTemp<FmhaTraits, FmhaMask>;
-
-              using FmhaPipeline =
-                  ck::tile_program::block::BlockFmhaPipelineQRKSVS<
-                      FmhaPipelineProblem>;
-              using FmhaKernel = FmhaFwdKernel<
-                  FmhaTilePartitioner,
-                  FmhaPipeline,
-                  FmhaEpilogue>;
-
-              RunWithKernel<FmhaKernel>(param, stream);
-            });
-      };
+      BOOL_SWITCH_2(
+          pad_headdim_q, kPadHeadDimQ, pad_headdim_v, kPadHeadDimV, [&] {
+            using FmhaTraits = ck::tile_program::TileFmhaTraits<
+                kPadSeqLenQ,
+                kPadSeqLenK,
+                kPadHeadDimQ,
+                kPadHeadDimV,
+                has_attn_bias,
+                false, // kStoreLSE
+                false, // kHasDropout
+                occupancy>;
+
+            using FmhaPipelineProblem =
+                FmhaPipelineProblemTemp<FmhaTraits, FmhaMask>;
+
+            using FmhaPipeline =
+                ck::tile_program::block::BlockFmhaPipelineQRKSVS<
+                    FmhaPipelineProblem>;
+            using FmhaKernel =
+                FmhaFwdKernel<FmhaTilePartitioner, FmhaPipeline, FmhaEpilogue>;
+
+            RunWithKernel<FmhaKernel>(param, stream);
+          });
     });
   };
 

From ff0db0736ebbcfee5bec09f30ac992eacb930347 Mon Sep 17 00:00:00 2001
From: Qianfeng Zhang <Qianfeng.Zhang@amd.com>
Date: Fri, 29 Mar 2024 22:24:51 +0000
Subject: [PATCH 496/641] Use optimized async pipeline where 8x headdim length
 is assumed

---
 .../hip_fmha/ck_tiled_fmha_batched_forward.h  | 79 +++++++-----------
 .../hip_fmha/ck_tiled_fmha_batched_infer.h    | 82 +++++++------------
 2 files changed, 58 insertions(+), 103 deletions(-)

diff --git a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_forward.h b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_forward.h
index 617ebd762..3a7427993 100644
--- a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_forward.h
+++ b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_forward.h
@@ -81,9 +81,12 @@ struct batched_forward_causalmask_attnbias_dispatched {
 
       // usually headdim_q and headdim_v are same, consider them together to
       // determine whether to do padding saving some compiling time
-      bool pad_headdim = (pad_headdim_q || pad_headdim_v);
+      const bool pad_headdim = (pad_headdim_q || pad_headdim_v);
 
-      if constexpr (MaxK == 256) {
+      const bool use_async_pipeline =
+          ((param.K % 8 == 0) && (param.Kv % 8 == 0) && (MaxK <= 128));
+
+      if (!use_async_pipeline) {
         BOOL_SWITCH_4(
             has_dropout,
             kHasDropout,
@@ -119,54 +122,30 @@ struct batched_forward_causalmask_attnbias_dispatched {
               RunWithKernel<FmhaFwdKernel_>(param, stream);
             });
       } else {
-        BOOL_SWITCH_4(
-            has_dropout,
-            kHasDropout,
-            pad_seqlen_q,
-            kPadSeqLenQ,
-            pad_seqlen_k,
-            kPadSeqLenK,
-            pad_headdim,
-            kPadHeadDim,
-            [&] {
-              using FmhaFwdTraits_ = ck::tile_program::TileFmhaTraits<
-                  kPadSeqLenQ,
-                  kPadSeqLenK,
-                  kPadHeadDim, // kPadHeadDimQ
-                  kPadHeadDim, // kPadHeadDimV
-                  has_attn_bias,
-                  true, // kStoreLSE
-                  kHasDropout,
-                  occupancy>;
-
-              using FmhaPipelineProblem =
-                  FmhaPipelineProblemTemp<FmhaFwdTraits_, FmhaMask>;
-
-              constexpr bool no_any_padding =
-                  !(kPadSeqLenQ || kPadSeqLenK || kPadHeadDim);
-
-              if constexpr (no_any_padding) {
-                using FmhaFwdPipeline_ =
-                    ck::tile_program::block::BlockFmhaPipelineQRKSVSAsync<
-                        FmhaPipelineProblem>;
-                using FmhaFwdKernel_ = FmhaFwdKernel<
-                    FmhaFwdTilePartitioner_,
-                    FmhaFwdPipeline_,
-                    FmhaFwdEpilogue_>;
-
-                RunWithKernel<FmhaFwdKernel_>(param, stream);
-              } else {
-                using FmhaFwdPipeline_ =
-                    ck::tile_program::block::BlockFmhaPipelineQRKSVS<
-                        FmhaPipelineProblem>;
-                using FmhaFwdKernel_ = FmhaFwdKernel<
-                    FmhaFwdTilePartitioner_,
-                    FmhaFwdPipeline_,
-                    FmhaFwdEpilogue_>;
-
-                RunWithKernel<FmhaFwdKernel_>(param, stream);
-              };
-            });
+        BOOL_SWITCH_2(has_dropout, kHasDropout, pad_seqlen_k, kPadSeqLenK, [&] {
+          using FmhaFwdTraits_ = ck::tile_program::TileFmhaTraits<
+              true, // kPadSeqLenQ,
+              kPadSeqLenK,
+              true, // kPadHeadDimQ
+              true, // kPadHeadDimV
+              has_attn_bias,
+              true, // kStoreLSE
+              kHasDropout,
+              occupancy>;
+
+          using FmhaPipelineProblem =
+              FmhaPipelineProblemTemp<FmhaFwdTraits_, FmhaMask>;
+
+          using FmhaFwdPipeline_ =
+              ck::tile_program::block::BlockFmhaPipelineQRKSVSAsync<
+                  FmhaPipelineProblem>;
+          using FmhaFwdKernel_ = FmhaFwdKernel<
+              FmhaFwdTilePartitioner_,
+              FmhaFwdPipeline_,
+              FmhaFwdEpilogue_>;
+
+          RunWithKernel<FmhaFwdKernel_>(param, stream);
+        });
       };
     });
   };
diff --git a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_infer.h b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_infer.h
index 4e9286a75..bc94ce6e2 100644
--- a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_infer.h
+++ b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_infer.h
@@ -73,12 +73,15 @@ struct batched_infer_causalmask_attnbias_dispatched {
       constexpr ck::index_t occupancy =
           (MaxK == 64) ? 3 : ((MaxK == 256) ? 1 : 2);
 
-      bool pad_seqlen_q = !(param.M % FmhaShape::kM0 == 0);
-      bool pad_seqlen_k = !(param.N % FmhaShape::kN0 == 0);
-      bool pad_headdim_v = !(param.Kv % FmhaShape::kN1 == 0);
-      bool pad_headdim_q = !(param.K % FmhaShape::kK0BlockLength == 0);
+      const bool pad_seqlen_q = !(param.M % FmhaShape::kM0 == 0);
+      const bool pad_seqlen_k = !(param.N % FmhaShape::kN0 == 0);
+      const bool pad_headdim_v = !(param.Kv % FmhaShape::kN1 == 0);
+      const bool pad_headdim_q = !(param.K % FmhaShape::kK0BlockLength == 0);
 
-      if constexpr (MaxK == 256) {
+      const bool use_async_pipeline =
+          ((param.K % 8 == 0) && (param.Kv % 8 == 0) && (MaxK <= 128));
+
+      if (!use_async_pipeline) {
         BOOL_SWITCH_4(
             pad_seqlen_q,
             kPadSeqLenQ,
@@ -113,54 +116,27 @@ struct batched_infer_causalmask_attnbias_dispatched {
               RunWithKernel<FmhaKernel>(param, stream);
             });
       } else {
-        BOOL_SWITCH_4(
-            pad_seqlen_q,
-            kPadSeqLenQ,
-            pad_seqlen_k,
-            kPadSeqLenK,
-            pad_headdim_q,
-            kPadHeadDimQ,
-            pad_headdim_v,
-            kPadHeadDimV,
-            [&] {
-              using FmhaTraits = ck::tile_program::TileFmhaTraits<
-                  kPadSeqLenQ,
-                  kPadSeqLenK,
-                  kPadHeadDimQ,
-                  kPadHeadDimV,
-                  has_attn_bias,
-                  false, // kStoreLSE
-                  false, // kHasDropout
-                  occupancy>;
-
-              using FmhaPipelineProblem =
-                  FmhaPipelineProblemTemp<FmhaTraits, FmhaMask>;
-
-              constexpr bool no_any_padding =
-                  !(kPadSeqLenQ || kPadSeqLenK || kPadHeadDimQ || kPadHeadDimV);
-
-              if constexpr (no_any_padding) {
-                using FmhaPipeline =
-                    ck::tile_program::block::BlockFmhaPipelineQRKSVSAsync<
-                        FmhaPipelineProblem>;
-                using FmhaKernel = FmhaFwdKernel<
-                    FmhaTilePartitioner,
-                    FmhaPipeline,
-                    FmhaEpilogue>;
-
-                RunWithKernel<FmhaKernel>(param, stream);
-              } else {
-                using FmhaPipeline =
-                    ck::tile_program::block::BlockFmhaPipelineQRKSVS<
-                        FmhaPipelineProblem>;
-                using FmhaKernel = FmhaFwdKernel<
-                    FmhaTilePartitioner,
-                    FmhaPipeline,
-                    FmhaEpilogue>;
-
-                RunWithKernel<FmhaKernel>(param, stream);
-              };
-            });
+        BOOL_SWITCH(pad_seqlen_k, kPadSeqLenK, [&] {
+          using FmhaTraits = ck::tile_program::TileFmhaTraits<
+              true, // kPadSeqLenQ,
+              kPadSeqLenK,
+              true, // kPadHeadDimQ,
+              true, // kPadHeadDimV,
+              has_attn_bias,
+              false, // kStoreLSE
+              false, // kHasDropout
+              occupancy>;
+
+          using FmhaPipelineProblem =
+              FmhaPipelineProblemTemp<FmhaTraits, FmhaMask>;
+
+          using FmhaPipeline = ck::tile_program::block::BlockFmhaPipelineQRKSVS<
+              FmhaPipelineProblem>;
+          using FmhaKernel =
+              FmhaFwdKernel<FmhaTilePartitioner, FmhaPipeline, FmhaEpilogue>;
+
+          RunWithKernel<FmhaKernel>(param, stream);
+        });
       };
     });
   };

From 0f4a1712422686ccf57536f927b9fa2d4f0629ee Mon Sep 17 00:00:00 2001
From: Qianfeng Zhang <Qianfeng.Zhang@amd.com>
Date: Sat, 30 Mar 2024 13:35:56 +0000
Subject: [PATCH 497/641] Fix in batched_infer

---
 xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_infer.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_infer.h b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_infer.h
index bc94ce6e2..294e04483 100644
--- a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_infer.h
+++ b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_infer.h
@@ -130,7 +130,7 @@ struct batched_infer_causalmask_attnbias_dispatched {
           using FmhaPipelineProblem =
               FmhaPipelineProblemTemp<FmhaTraits, FmhaMask>;
 
-          using FmhaPipeline = ck::tile_program::block::BlockFmhaPipelineQRKSVS<
+          using FmhaPipeline = ck::tile_program::block::BlockFmhaPipelineQRKSVSAsync<
               FmhaPipelineProblem>;
           using FmhaKernel =
               FmhaFwdKernel<FmhaTilePartitioner, FmhaPipeline, FmhaEpilogue>;

From 0d6b915822b7cbf080c38d52eae9164398a7ff8d Mon Sep 17 00:00:00 2001
From: Qianfeng Zhang <Qianfeng.Zhang@amd.com>
Date: Mon, 1 Apr 2024 15:39:59 +0000
Subject: [PATCH 498/641] Update to track ck_tile/opt_padding_fa_train_xformers
 branch

---
 .gitmodules                         | 2 +-
 third_party/composable_kernel_tiled | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/.gitmodules b/.gitmodules
index 7b6cfaab8..8d80ded0b 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -7,4 +7,4 @@
 [submodule "third_party/composable_kernel_tiled"]
 	path = third_party/composable_kernel_tiled
 	url = https://github.com/ROCm/composable_kernel-internal.git
-	branch = ck_tile/dev
+	branch = ck_tile/opt_padding_fa_train_xformers
diff --git a/third_party/composable_kernel_tiled b/third_party/composable_kernel_tiled
index 0e533488d..b9cb68ea5 160000
--- a/third_party/composable_kernel_tiled
+++ b/third_party/composable_kernel_tiled
@@ -1 +1 @@
-Subproject commit 0e533488daa13cceb4c61dfa150aad9fd895fa63
+Subproject commit b9cb68ea5f7a0869a6c6be86723f2fe44d35568d

From df435593343d2d0ef99ee2a1b26abf67b04c2d86 Mon Sep 17 00:00:00 2001
From: Max Podkorytov <4273004+tenpercent@users.noreply.github.com>
Date: Mon, 1 Apr 2024 09:25:34 -0700
Subject: [PATCH 499/641] Update rocm_ci.yml

configuring the self-hosted runner
---
 .github/workflows/rocm_ci.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/rocm_ci.yml b/.github/workflows/rocm_ci.yml
index f2593d53a..03f3d3d87 100644
--- a/.github/workflows/rocm_ci.yml
+++ b/.github/workflows/rocm_ci.yml
@@ -7,7 +7,7 @@ on:
 jobs:
   build:
     if: contains(github.event.label.name, 'rocm')
-    runs-on: rocm
+    runs-on: self-hosted
 
     steps:
     - uses: actions/checkout@v2

From 47135760eb1017bc05c18552594b60a5f0af40ff Mon Sep 17 00:00:00 2001
From: Qianfeng Zhang <Qianfeng.Zhang@amd.com>
Date: Mon, 1 Apr 2024 16:43:59 +0000
Subject: [PATCH 500/641] Update to use the newer FmhaFwdEpilogue

---
 third_party/composable_kernel_tiled           |  2 +-
 .../hip_fmha/ck_tiled_fmha_batched_forward.h  | 17 ++++++++++----
 .../hip_fmha/ck_tiled_fmha_batched_infer.h    | 23 ++++++++++++++-----
 3 files changed, 31 insertions(+), 11 deletions(-)

diff --git a/third_party/composable_kernel_tiled b/third_party/composable_kernel_tiled
index b9cb68ea5..ea5cc2b6f 160000
--- a/third_party/composable_kernel_tiled
+++ b/third_party/composable_kernel_tiled
@@ -1 +1 @@
-Subproject commit b9cb68ea5f7a0869a6c6be86723f2fe44d35568d
+Subproject commit ea5cc2b6f7225ca25b970d21463f5dfc7b561c0e
diff --git a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_forward.h b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_forward.h
index 3a7427993..60d18440f 100644
--- a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_forward.h
+++ b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_forward.h
@@ -35,10 +35,6 @@ template <
     bool has_attn_bias,
     ck::index_t MaxK>
 struct batched_forward_causalmask_attnbias_dispatched {
-  using FmhaFwdEpilogue_ = FmhaFwdEpilogue<FmhaFwdEpilogueProblem<
-      typename FmhaFwdTypeConfig<scalar_t>::OaccDataType,
-      typename FmhaFwdTypeConfig<scalar_t>::ODataType>>;
-
   template <typename FmhaTraits, typename FmhaMask>
   using FmhaPipelineProblemTemp =
       ck::tile_program::block::BlockFmhaPipelineProblem<
@@ -114,6 +110,12 @@ struct batched_forward_causalmask_attnbias_dispatched {
                   ck::tile_program::block::BlockFmhaPipelineQRKSVS<
                       FmhaPipelineProblem>;
 
+              using FmhaFwdEpilogue_ = FmhaFwdEpilogue<FmhaFwdEpilogueProblem<
+                  typename FmhaFwdTypeConfig<scalar_t>::OaccDataType,
+                  typename FmhaFwdTypeConfig<scalar_t>::ODataType,
+                  kPadSeqLenQ,
+                  kPadHeadDim>>;
+
               using FmhaFwdKernel_ = FmhaFwdKernel<
                   FmhaFwdTilePartitioner_,
                   FmhaFwdPipeline_,
@@ -139,6 +141,13 @@ struct batched_forward_causalmask_attnbias_dispatched {
           using FmhaFwdPipeline_ =
               ck::tile_program::block::BlockFmhaPipelineQRKSVSAsync<
                   FmhaPipelineProblem>;
+
+          using FmhaFwdEpilogue_ = FmhaFwdEpilogue<FmhaFwdEpilogueProblem<
+              typename FmhaFwdTypeConfig<scalar_t>::OaccDataType,
+              typename FmhaFwdTypeConfig<scalar_t>::ODataType,
+              true,
+              true>>;
+
           using FmhaFwdKernel_ = FmhaFwdKernel<
               FmhaFwdTilePartitioner_,
               FmhaFwdPipeline_,
diff --git a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_infer.h b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_infer.h
index 294e04483..edb132db1 100644
--- a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_infer.h
+++ b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_infer.h
@@ -36,10 +36,6 @@ template <
     bool has_attn_bias,
     ck::index_t MaxK>
 struct batched_infer_causalmask_attnbias_dispatched {
-  using FmhaEpilogue = FmhaFwdEpilogue<FmhaFwdEpilogueProblem<
-      typename FmhaFwdTypeConfig<scalar_t>::OaccDataType,
-      typename FmhaFwdTypeConfig<scalar_t>::ODataType>>;
-
   template <typename FmhaTraits, typename FmhaMask>
   using FmhaPipelineProblemTemp =
       ck::tile_program::block::BlockFmhaPipelineProblem<
@@ -108,6 +104,13 @@ struct batched_infer_causalmask_attnbias_dispatched {
               using FmhaPipeline =
                   ck::tile_program::block::BlockFmhaPipelineQRKSVS<
                       FmhaPipelineProblem>;
+
+              using FmhaEpilogue = FmhaFwdEpilogue<FmhaFwdEpilogueProblem<
+                  typename FmhaFwdTypeConfig<scalar_t>::OaccDataType,
+                  typename FmhaFwdTypeConfig<scalar_t>::ODataType,
+                  kPadSeqLenQ,
+                  kPadHeadDimV>>;
+
               using FmhaKernel = FmhaFwdKernel<
                   FmhaTilePartitioner,
                   FmhaPipeline,
@@ -130,8 +133,16 @@ struct batched_infer_causalmask_attnbias_dispatched {
           using FmhaPipelineProblem =
               FmhaPipelineProblemTemp<FmhaTraits, FmhaMask>;
 
-          using FmhaPipeline = ck::tile_program::block::BlockFmhaPipelineQRKSVSAsync<
-              FmhaPipelineProblem>;
+          using FmhaPipeline =
+              ck::tile_program::block::BlockFmhaPipelineQRKSVSAsync<
+                  FmhaPipelineProblem>;
+
+          using FmhaEpilogue = FmhaFwdEpilogue<FmhaFwdEpilogueProblem<
+              typename FmhaFwdTypeConfig<scalar_t>::OaccDataType,
+              typename FmhaFwdTypeConfig<scalar_t>::ODataType,
+              true,
+              true>>;
+
           using FmhaKernel =
               FmhaFwdKernel<FmhaTilePartitioner, FmhaPipeline, FmhaEpilogue>;
 

From a745c45f134a8b73355711a7c2eef18655edb100 Mon Sep 17 00:00:00 2001
From: Max Podkorytov <4273004+tenpercent@users.noreply.github.com>
Date: Mon, 1 Apr 2024 10:08:05 -0700
Subject: [PATCH 501/641] Update rocm_ci.yml

add option to manually trigger workflow
---
 .github/workflows/rocm_ci.yml | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/.github/workflows/rocm_ci.yml b/.github/workflows/rocm_ci.yml
index 03f3d3d87..894c36a8d 100644
--- a/.github/workflows/rocm_ci.yml
+++ b/.github/workflows/rocm_ci.yml
@@ -3,6 +3,12 @@ name: ROCM_CI
 on: 
   pull_request:
     types: [labeled, synchronize, reopened]
+  workflow_dispatch:
+    inputs:
+      logLevel:
+        description: 'Log level'     
+        required: true
+        default: 'warning'
 
 jobs:
   build:

From 95d0260a3a353d7ec5cd7aff4e6391307d05aad4 Mon Sep 17 00:00:00 2001
From: Max Podkorytov <4273004+tenpercent@users.noreply.github.com>
Date: Mon, 1 Apr 2024 10:27:00 -0700
Subject: [PATCH 502/641] Update rocm_ci.yml

remove condition which skips ci unless github event contains string 'rocm'
---
 .github/workflows/rocm_ci.yml | 1 -
 1 file changed, 1 deletion(-)

diff --git a/.github/workflows/rocm_ci.yml b/.github/workflows/rocm_ci.yml
index 894c36a8d..eb5d406c7 100644
--- a/.github/workflows/rocm_ci.yml
+++ b/.github/workflows/rocm_ci.yml
@@ -12,7 +12,6 @@ on:
 
 jobs:
   build:
-    if: contains(github.event.label.name, 'rocm')
     runs-on: self-hosted
 
     steps:

From 4069efe3252a15af9b875e037dc8ef4e34cbe234 Mon Sep 17 00:00:00 2001
From: Max Podkorytov <4273004+tenpercent@users.noreply.github.com>
Date: Mon, 1 Apr 2024 21:14:41 +0000
Subject: [PATCH 503/641] copy rocm_ci workflow from main branch

---
 .github/workflows/rocm_ci.yml | 24 +++++++++++++++---------
 1 file changed, 15 insertions(+), 9 deletions(-)

diff --git a/.github/workflows/rocm_ci.yml b/.github/workflows/rocm_ci.yml
index eb5d406c7..5a883e8c8 100644
--- a/.github/workflows/rocm_ci.yml
+++ b/.github/workflows/rocm_ci.yml
@@ -13,9 +13,16 @@ on:
 jobs:
   build:
     runs-on: self-hosted
-
+    container:
+      image: 'rocm/pytorch-nightly:latest'
+      options: ' --cap-add=SYS_PTRACE --security-opt seccomp=unconfined --device=/dev/kfd --device=/dev/dri --group-add video --ipc=host --shm-size 8G '
     steps:
-    - uses: actions/checkout@v2
+    - uses: actions/checkout@v4
+      with:
+        path: '_xformers'
+        submodules: 'recursive'
+        set-safe-directory: true
+        fetch-depth: 0
     - name: Get CPU info on Ubuntu
       if: contains(runner.os, 'linux')
       run: |
@@ -47,28 +54,27 @@ jobs:
         rocm-smi
         rocminfo | grep "gfx"
 
+        python3 -VV
+
     - name: Build XFormers
       run: |
-        git clone --recursive -b $GIT_BRANCH $GITHUB_REPOSITORY
-        docker run -it --cap-add=SYS_PTRACE --security-opt seccomp=unconfined --device=/dev/kfd --device=/dev/dri --group-add video --ipc=host --shm-size 8G -v $PWD/xformers:/xformers  rocm/pytorch-nightly:latest
-
         pip3 install --upgrade pip
         pip3 uninstall -y xformers
-        MAX_JOBS=$MAX_JOBS pip3 install -e /xformers --verbose
+        MAX_JOBS=$MAX_JOBS pip3 install -e ./_xformers --verbose
         pip3 install scipy==1.10
 
-        python3 -c "import torch; print(torch.__version__)"
+        python3 -c "import torch; print(f'PyTorch version {torch.__version__}')"
         python3 -m xformers.info
 
     - name: Run python tests
       run: |
-        pytest -rpfs /xformers/tests/test_mem_eff_attention.py | tee test_mem_eff_attention.log
+        pytest -rpfs ./_xformers/tests/test_mem_eff_attention.py | tee test_mem_eff_attention.log
     
     - name: Archive logs
       uses: actions/upload-artifact@v3
       with:
         name: test results
-        path: test_mem_eff_attention_ck.log
+        path: test_mem_eff_attention.log
 
     - name: Process test results
       run: |

From 724354cc70f557eb37fa268adce0b8743735aef5 Mon Sep 17 00:00:00 2001
From: Max Podkorytov <4273004+tenpercent@users.noreply.github.com>
Date: Mon, 1 Apr 2024 15:45:27 -0700
Subject: [PATCH 504/641] Update rocm_ci.yml

Bump upload-artifact version
---
 .github/workflows/rocm_ci.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/rocm_ci.yml b/.github/workflows/rocm_ci.yml
index 5a883e8c8..8e3965777 100644
--- a/.github/workflows/rocm_ci.yml
+++ b/.github/workflows/rocm_ci.yml
@@ -71,7 +71,7 @@ jobs:
         pytest -rpfs ./_xformers/tests/test_mem_eff_attention.py | tee test_mem_eff_attention.log
     
     - name: Archive logs
-      uses: actions/upload-artifact@v3
+      uses: actions/upload-artifact@v4
       with:
         name: test results
         path: test_mem_eff_attention.log

From b1a1009e95481835e25295c470e6763752ccab5a Mon Sep 17 00:00:00 2001
From: Qianfeng Zhang <Qianfeng.Zhang@amd.com>
Date: Tue, 2 Apr 2024 00:00:24 +0000
Subject: [PATCH 505/641] Update to use the newer FmhaFwdEpilogue for grouped
 infer/forward

---
 .../hip_fmha/ck_tiled_fmha_grouped_forward.h          | 11 +++++++----
 .../attention/hip_fmha/ck_tiled_fmha_grouped_infer.h  | 11 +++++++----
 2 files changed, 14 insertions(+), 8 deletions(-)

diff --git a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_forward.h b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_forward.h
index 43c9d0cc4..37e9210c9 100644
--- a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_forward.h
+++ b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_forward.h
@@ -34,10 +34,6 @@ template <
     bool has_attn_bias,
     ck::index_t MaxK>
 struct grouped_forward_causalmask_attnbias_dispatched {
-  using FmhaFwdEpilogue_ = FmhaFwdEpilogue<FmhaFwdEpilogueProblem<
-      typename FmhaFwdTypeConfig<scalar_t>::OaccDataType,
-      typename FmhaFwdTypeConfig<scalar_t>::ODataType>>;
-
   template <typename FmhaTraits, typename FmhaMask>
   using FmhaPipelineProblemTemp =
       ck::tile_program::block::BlockFmhaPipelineProblem<
@@ -103,6 +99,13 @@ struct grouped_forward_causalmask_attnbias_dispatched {
             using FmhaFwdPipeline_ =
                 ck::tile_program::block::BlockFmhaPipelineQRKSVS<
                     FmhaPipelineProblem>;
+
+            using FmhaFwdEpilogue_ = FmhaFwdEpilogue<FmhaFwdEpilogueProblem<
+                typename FmhaFwdTypeConfig<scalar_t>::OaccDataType,
+                typename FmhaFwdTypeConfig<scalar_t>::ODataType,
+                kPadSeqLenQ,
+                kPadHeadDimV>>;
+
             using FmhaFwdKernel_ = FmhaFwdKernel<
                 FmhaFwdTilePartitioner_,
                 FmhaFwdPipeline_,
diff --git a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_infer.h b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_infer.h
index deb2c1bd7..7c09e2659 100644
--- a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_infer.h
+++ b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_infer.h
@@ -35,10 +35,6 @@ template <
     bool has_attn_bias,
     ck::index_t MaxK>
 struct grouped_infer_causalmask_attnbias_dispatched {
-  using FmhaEpilogue = FmhaFwdEpilogue<FmhaFwdEpilogueProblem<
-      typename FmhaFwdTypeConfig<scalar_t>::OaccDataType,
-      typename FmhaFwdTypeConfig<scalar_t>::ODataType>>;
-
   template <typename FmhaTraits, typename FmhaMask>
   using FmhaPipelineProblemTemp =
       ck::tile_program::block::BlockFmhaPipelineProblem<
@@ -96,6 +92,13 @@ struct grouped_infer_causalmask_attnbias_dispatched {
             using FmhaPipeline =
                 ck::tile_program::block::BlockFmhaPipelineQRKSVS<
                     FmhaPipelineProblem>;
+
+            using FmhaEpilogue = FmhaFwdEpilogue<FmhaFwdEpilogueProblem<
+                typename FmhaFwdTypeConfig<scalar_t>::OaccDataType,
+                typename FmhaFwdTypeConfig<scalar_t>::ODataType,
+                kPadSeqLenQ,
+                kPadHeadDimV>>;
+
             using FmhaKernel =
                 FmhaFwdKernel<FmhaTilePartitioner, FmhaPipeline, FmhaEpilogue>;
 

From 97e4e20d5ee30f02774bf27c2f998e05583f491c Mon Sep 17 00:00:00 2001
From: Qianfeng Zhang <Qianfeng.Zhang@amd.com>
Date: Wed, 3 Apr 2024 18:41:58 +0000
Subject: [PATCH 506/641] Temporarily disable the using of QRKSVSAsync()
 pipeline

---
 third_party/composable_kernel_tiled           |   2 +-
 .../hip_fmha/ck_tiled_fmha_batched_forward.h  | 147 +++++++++---------
 .../hip_fmha/ck_tiled_fmha_batched_infer.h    | 145 ++++++++---------
 3 files changed, 147 insertions(+), 147 deletions(-)

diff --git a/third_party/composable_kernel_tiled b/third_party/composable_kernel_tiled
index ea5cc2b6f..bf1fa3c9f 160000
--- a/third_party/composable_kernel_tiled
+++ b/third_party/composable_kernel_tiled
@@ -1 +1 @@
-Subproject commit ea5cc2b6f7225ca25b970d21463f5dfc7b561c0e
+Subproject commit bf1fa3c9feb9bf196f27308c76a855adc47fc5e2
diff --git a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_forward.h b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_forward.h
index 60d18440f..1ee6178ff 100644
--- a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_forward.h
+++ b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_forward.h
@@ -82,80 +82,79 @@ struct batched_forward_causalmask_attnbias_dispatched {
       const bool use_async_pipeline =
           ((param.K % 8 == 0) && (param.Kv % 8 == 0) && (MaxK <= 128));
 
-      if (!use_async_pipeline) {
-        BOOL_SWITCH_4(
-            has_dropout,
-            kHasDropout,
-            pad_seqlen_q,
-            kPadSeqLenQ,
-            pad_seqlen_k,
-            kPadSeqLenK,
-            pad_headdim,
-            kPadHeadDim,
-            [&] {
-              using FmhaFwdTraits_ = ck::tile_program::TileFmhaTraits<
-                  kPadSeqLenQ,
-                  kPadSeqLenK,
-                  kPadHeadDim, // kPadHeadDimQ
-                  kPadHeadDim, // kPadHeadDimV
-                  has_attn_bias,
-                  true, // kStoreLSE
-                  kHasDropout,
-                  occupancy>;
-
-              using FmhaPipelineProblem =
-                  FmhaPipelineProblemTemp<FmhaFwdTraits_, FmhaMask>;
-
-              using FmhaFwdPipeline_ =
-                  ck::tile_program::block::BlockFmhaPipelineQRKSVS<
-                      FmhaPipelineProblem>;
-
-              using FmhaFwdEpilogue_ = FmhaFwdEpilogue<FmhaFwdEpilogueProblem<
-                  typename FmhaFwdTypeConfig<scalar_t>::OaccDataType,
-                  typename FmhaFwdTypeConfig<scalar_t>::ODataType,
-                  kPadSeqLenQ,
-                  kPadHeadDim>>;
-
-              using FmhaFwdKernel_ = FmhaFwdKernel<
-                  FmhaFwdTilePartitioner_,
-                  FmhaFwdPipeline_,
-                  FmhaFwdEpilogue_>;
-
-              RunWithKernel<FmhaFwdKernel_>(param, stream);
-            });
-      } else {
-        BOOL_SWITCH_2(has_dropout, kHasDropout, pad_seqlen_k, kPadSeqLenK, [&] {
-          using FmhaFwdTraits_ = ck::tile_program::TileFmhaTraits<
-              true, // kPadSeqLenQ,
-              kPadSeqLenK,
-              true, // kPadHeadDimQ
-              true, // kPadHeadDimV
-              has_attn_bias,
-              true, // kStoreLSE
-              kHasDropout,
-              occupancy>;
-
-          using FmhaPipelineProblem =
-              FmhaPipelineProblemTemp<FmhaFwdTraits_, FmhaMask>;
-
-          using FmhaFwdPipeline_ =
-              ck::tile_program::block::BlockFmhaPipelineQRKSVSAsync<
-                  FmhaPipelineProblem>;
-
-          using FmhaFwdEpilogue_ = FmhaFwdEpilogue<FmhaFwdEpilogueProblem<
-              typename FmhaFwdTypeConfig<scalar_t>::OaccDataType,
-              typename FmhaFwdTypeConfig<scalar_t>::ODataType,
-              true,
-              true>>;
-
-          using FmhaFwdKernel_ = FmhaFwdKernel<
-              FmhaFwdTilePartitioner_,
-              FmhaFwdPipeline_,
-              FmhaFwdEpilogue_>;
-
-          RunWithKernel<FmhaFwdKernel_>(param, stream);
-        });
-      };
+      /*      if (!use_async_pipeline) { */
+      BOOL_SWITCH_4(
+          has_dropout,
+          kHasDropout,
+          pad_seqlen_q,
+          kPadSeqLenQ,
+          pad_seqlen_k,
+          kPadSeqLenK,
+          pad_headdim,
+          kPadHeadDim,
+          [&] {
+            using FmhaFwdTraits_ = ck::tile_program::TileFmhaTraits<
+                kPadSeqLenQ,
+                kPadSeqLenK,
+                kPadHeadDim, // kPadHeadDimQ
+                kPadHeadDim, // kPadHeadDimV
+                has_attn_bias,
+                true, // kStoreLSE
+                kHasDropout,
+                occupancy>;
+
+            using FmhaPipelineProblem =
+                FmhaPipelineProblemTemp<FmhaFwdTraits_, FmhaMask>;
+
+            using FmhaFwdPipeline_ =
+                ck::tile_program::block::BlockFmhaPipelineQRKSVS<
+                    FmhaPipelineProblem>;
+
+            using FmhaFwdEpilogue_ = FmhaFwdEpilogue<FmhaFwdEpilogueProblem<
+                typename FmhaFwdTypeConfig<scalar_t>::OaccDataType,
+                typename FmhaFwdTypeConfig<scalar_t>::ODataType,
+                kPadSeqLenQ,
+                kPadHeadDim>>;
+
+            using FmhaFwdKernel_ = FmhaFwdKernel<
+                FmhaFwdTilePartitioner_,
+                FmhaFwdPipeline_,
+                FmhaFwdEpilogue_>;
+
+            RunWithKernel<FmhaFwdKernel_>(param, stream);
+          });
+      /*
+            } else {
+              BOOL_SWITCH_2(has_dropout, kHasDropout, pad_seqlen_k, kPadSeqLenK,
+         [&] { using FmhaFwdTraits_ = ck::tile_program::TileFmhaTraits< true, //
+         kPadSeqLenQ, kPadSeqLenK, true, // kPadHeadDimQ true, // kPadHeadDimV
+                    has_attn_bias,
+                    true, // kStoreLSE
+                    kHasDropout,
+                    occupancy>;
+
+                using FmhaPipelineProblem =
+                    FmhaPipelineProblemTemp<FmhaFwdTraits_, FmhaMask>;
+
+                using FmhaFwdPipeline_ =
+                    ck::tile_program::block::BlockFmhaPipelineQRKSVSAsync<
+                        FmhaPipelineProblem>;
+
+                using FmhaFwdEpilogue_ = FmhaFwdEpilogue<FmhaFwdEpilogueProblem<
+                    typename FmhaFwdTypeConfig<scalar_t>::OaccDataType,
+                    typename FmhaFwdTypeConfig<scalar_t>::ODataType,
+                    true,
+                    true>>;
+
+                using FmhaFwdKernel_ = FmhaFwdKernel<
+                    FmhaFwdTilePartitioner_,
+                    FmhaFwdPipeline_,
+                    FmhaFwdEpilogue_>;
+
+                RunWithKernel<FmhaFwdKernel_>(param, stream);
+              });
+            };
+      */
     });
   };
 
diff --git a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_infer.h b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_infer.h
index edb132db1..840cd349d 100644
--- a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_infer.h
+++ b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_infer.h
@@ -77,78 +77,79 @@ struct batched_infer_causalmask_attnbias_dispatched {
       const bool use_async_pipeline =
           ((param.K % 8 == 0) && (param.Kv % 8 == 0) && (MaxK <= 128));
 
-      if (!use_async_pipeline) {
-        BOOL_SWITCH_4(
-            pad_seqlen_q,
-            kPadSeqLenQ,
-            pad_seqlen_k,
-            kPadSeqLenK,
-            pad_headdim_q,
-            kPadHeadDimQ,
-            pad_headdim_v,
-            kPadHeadDimV,
-            [&] {
-              using FmhaTraits = ck::tile_program::TileFmhaTraits<
-                  kPadSeqLenQ,
-                  kPadSeqLenK,
-                  kPadHeadDimQ,
-                  kPadHeadDimV,
-                  has_attn_bias,
-                  false, // kStoreLSE
-                  false, // kHasDropout
-                  occupancy>;
-
-              using FmhaPipelineProblem =
-                  FmhaPipelineProblemTemp<FmhaTraits, FmhaMask>;
-
-              using FmhaPipeline =
-                  ck::tile_program::block::BlockFmhaPipelineQRKSVS<
-                      FmhaPipelineProblem>;
-
-              using FmhaEpilogue = FmhaFwdEpilogue<FmhaFwdEpilogueProblem<
-                  typename FmhaFwdTypeConfig<scalar_t>::OaccDataType,
-                  typename FmhaFwdTypeConfig<scalar_t>::ODataType,
-                  kPadSeqLenQ,
-                  kPadHeadDimV>>;
-
-              using FmhaKernel = FmhaFwdKernel<
-                  FmhaTilePartitioner,
-                  FmhaPipeline,
-                  FmhaEpilogue>;
-
-              RunWithKernel<FmhaKernel>(param, stream);
-            });
-      } else {
-        BOOL_SWITCH(pad_seqlen_k, kPadSeqLenK, [&] {
-          using FmhaTraits = ck::tile_program::TileFmhaTraits<
-              true, // kPadSeqLenQ,
-              kPadSeqLenK,
-              true, // kPadHeadDimQ,
-              true, // kPadHeadDimV,
-              has_attn_bias,
-              false, // kStoreLSE
-              false, // kHasDropout
-              occupancy>;
-
-          using FmhaPipelineProblem =
-              FmhaPipelineProblemTemp<FmhaTraits, FmhaMask>;
-
-          using FmhaPipeline =
-              ck::tile_program::block::BlockFmhaPipelineQRKSVSAsync<
-                  FmhaPipelineProblem>;
-
-          using FmhaEpilogue = FmhaFwdEpilogue<FmhaFwdEpilogueProblem<
-              typename FmhaFwdTypeConfig<scalar_t>::OaccDataType,
-              typename FmhaFwdTypeConfig<scalar_t>::ODataType,
-              true,
-              true>>;
-
-          using FmhaKernel =
-              FmhaFwdKernel<FmhaTilePartitioner, FmhaPipeline, FmhaEpilogue>;
-
-          RunWithKernel<FmhaKernel>(param, stream);
-        });
-      };
+      /*     if (!use_async_pipeline) { */
+      BOOL_SWITCH_4(
+          pad_seqlen_q,
+          kPadSeqLenQ,
+          pad_seqlen_k,
+          kPadSeqLenK,
+          pad_headdim_q,
+          kPadHeadDimQ,
+          pad_headdim_v,
+          kPadHeadDimV,
+          [&] {
+            using FmhaTraits = ck::tile_program::TileFmhaTraits<
+                kPadSeqLenQ,
+                kPadSeqLenK,
+                kPadHeadDimQ,
+                kPadHeadDimV,
+                has_attn_bias,
+                false, // kStoreLSE
+                false, // kHasDropout
+                occupancy>;
+
+            using FmhaPipelineProblem =
+                FmhaPipelineProblemTemp<FmhaTraits, FmhaMask>;
+
+            using FmhaPipeline =
+                ck::tile_program::block::BlockFmhaPipelineQRKSVS<
+                    FmhaPipelineProblem>;
+
+            using FmhaEpilogue = FmhaFwdEpilogue<FmhaFwdEpilogueProblem<
+                typename FmhaFwdTypeConfig<scalar_t>::OaccDataType,
+                typename FmhaFwdTypeConfig<scalar_t>::ODataType,
+                kPadSeqLenQ,
+                kPadHeadDimV>>;
+
+            using FmhaKernel =
+                FmhaFwdKernel<FmhaTilePartitioner, FmhaPipeline, FmhaEpilogue>;
+
+            RunWithKernel<FmhaKernel>(param, stream);
+          });
+      /*
+            } else {
+              BOOL_SWITCH(pad_seqlen_k, kPadSeqLenK, [&] {
+                using FmhaTraits = ck::tile_program::TileFmhaTraits<
+                    true, // kPadSeqLenQ,
+                    kPadSeqLenK,
+                    true, // kPadHeadDimQ,
+                    true, // kPadHeadDimV,
+                    has_attn_bias,
+                    false, // kStoreLSE
+                    false, // kHasDropout
+                    occupancy>;
+
+                using FmhaPipelineProblem =
+                    FmhaPipelineProblemTemp<FmhaTraits, FmhaMask>;
+
+                using FmhaPipeline =
+                    ck::tile_program::block::BlockFmhaPipelineQRKSVSAsync<
+                        FmhaPipelineProblem>;
+
+                using FmhaEpilogue = FmhaFwdEpilogue<FmhaFwdEpilogueProblem<
+                    typename FmhaFwdTypeConfig<scalar_t>::OaccDataType,
+                    typename FmhaFwdTypeConfig<scalar_t>::ODataType,
+                    true,
+                    true>>;
+
+                using FmhaKernel =
+                    FmhaFwdKernel<FmhaTilePartitioner, FmhaPipeline,
+         FmhaEpilogue>;
+
+                RunWithKernel<FmhaKernel>(param, stream);
+              });
+            };
+      */
     });
   };
 

From e98877add282d8bc410a34936ee0027f6b418f6f Mon Sep 17 00:00:00 2001
From: Max Podkorytov <4273004+tenpercent@users.noreply.github.com>
Date: Wed, 3 Apr 2024 15:19:46 -0700
Subject: [PATCH 507/641] Update rocm_ci.yml

add a daily run
---
 .github/workflows/rocm_ci.yml | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/.github/workflows/rocm_ci.yml b/.github/workflows/rocm_ci.yml
index 8e3965777..fc6946a9c 100644
--- a/.github/workflows/rocm_ci.yml
+++ b/.github/workflows/rocm_ci.yml
@@ -9,6 +9,8 @@ on:
         description: 'Log level'     
         required: true
         default: 'warning'
+  schedule:
+    - cron: "15 1 * * *"
 
 jobs:
   build:

From 6fbd05ddd4f277a4722b76280d46256cd49c7ab3 Mon Sep 17 00:00:00 2001
From: Qianfeng Zhang <Qianfeng.Zhang@amd.com>
Date: Wed, 3 Apr 2024 23:45:43 +0000
Subject: [PATCH 508/641] Implement the ck_rand_uniform interface for
 generating random number tensor

---
 third_party/composable_kernel_tiled           |  2 +-
 .../hip_fmha/attention_ck_rand_uniform.cpp    | 99 +++++++++++++++++++
 2 files changed, 100 insertions(+), 1 deletion(-)
 create mode 100644 xformers/csrc/attention/hip_fmha/attention_ck_rand_uniform.cpp

diff --git a/third_party/composable_kernel_tiled b/third_party/composable_kernel_tiled
index bf1fa3c9f..132bd39f0 160000
--- a/third_party/composable_kernel_tiled
+++ b/third_party/composable_kernel_tiled
@@ -1 +1 @@
-Subproject commit bf1fa3c9feb9bf196f27308c76a855adc47fc5e2
+Subproject commit 132bd39f02b7f5a04f9619c7dfd28efe9931971c
diff --git a/xformers/csrc/attention/hip_fmha/attention_ck_rand_uniform.cpp b/xformers/csrc/attention/hip_fmha/attention_ck_rand_uniform.cpp
new file mode 100644
index 000000000..3933b6c5e
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/attention_ck_rand_uniform.cpp
@@ -0,0 +1,99 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <ATen/ATen.h>
+#include <ATen/cuda/CUDAContext.h>
+#include <ATen/cuda/CUDAGeneratorImpl.h>
+#include <c10/core/TensorOptions.h>
+#include <torch/library.h>
+#include <torch/types.h>
+#include <ATen/cuda/CUDAGraphsUtils.cuh>
+
+#include <ck/ck.hpp>
+#include <ck/host_utility/kernel_launch.hpp>
+
+#include "ck_tiled_fmha_rand_uniform_kernel.hpp"
+
+namespace {
+
+/**
+ * generate a tensor with random uniform values. only used for testing, not much
+ * attention is paid to performance
+ */
+at::Tensor rand_uniform_int(
+    double dropout_prob,
+    const at::Tensor& out_pattern) // [Batches, num_head, query_len, key_len]
+{
+  int B = out_pattern.size(0);
+  int num_heads = out_pattern.size(1);
+  int M = out_pattern.size(2);
+  int N = out_pattern.size(3);
+
+  // at::cuda::CUDAGuard device_guard(out_pattern.device());
+  hipStream_t stream = at::cuda::getCurrentHIPStream().stream();
+
+  at::CUDAGeneratorImpl* gen =
+      at::get_generator_or_default<at::CUDAGeneratorImpl>(
+          c10::nullopt, at::cuda::detail::getDefaultCUDAGenerator());
+
+  at::PhiloxCudaState rng_engine_inputs;
+  {
+    std::lock_guard<std::mutex> lock(gen->mutex_);
+    rng_engine_inputs = gen->philox_cuda_state(B * num_heads * M * N);
+  }
+
+  const auto seeds = at::cuda::philox::unpack(rng_engine_inputs);
+
+  int64_t philox_seed = std::get<0>(seeds);
+  int64_t philox_offset = std::get<1>(seeds);
+
+  at::Tensor randvals;
+
+  randvals = at::empty(
+      {B, num_heads, M, N}, out_pattern.options().dtype(at::ScalarType::Int));
+
+  {
+    using FmhaRandUniformKernel_ =
+        FmhaRandUniformKernel<128, 64, 32, int32_t, false>;
+
+    const auto kargs = FmhaRandUniformKernel_::MakeKargs(
+        randvals.data_ptr(),
+        M,
+        N,
+        num_heads,
+        B,
+        static_cast<int>(randvals.stride(2)),
+        static_cast<int>(randvals.stride(3)),
+        static_cast<int>(randvals.stride(1)),
+        static_cast<int>(randvals.stride(0)),
+        {philox_seed, philox_offset});
+
+    dim3 kGridSize = FmhaRandUniformKernel_::GridSize(B, num_heads, M, N);
+    constexpr dim3 kBlockSize = FmhaRandUniformKernel_::BlockSize();
+    constexpr ck::index_t kBlockPerCu = FmhaRandUniformKernel_::kBlockPerCu;
+
+    (void)launch_kernel<kBlockSize.x, kBlockPerCu>(
+        StreamConfig{stream, false},
+        FmhaRandUniformKernel_{},
+        kGridSize,
+        kBlockSize,
+        0,
+        kargs);
+  }
+
+  (void)hipStreamSynchronize(stream);
+
+  return randvals;
+} // namespace
+
+} // namespace
+
+TORCH_LIBRARY_IMPL(xformers, CUDA, m) {
+  m.impl(
+      TORCH_SELECTIVE_NAME("xformers::_ck_rand_uniform"),
+      TORCH_FN(rand_uniform_int));
+}

From 2ef3b3fb45314b9533546ca7491f45f0978e21ee Mon Sep 17 00:00:00 2001
From: Qianfeng Zhang <Qianfeng.Zhang@amd.com>
Date: Sun, 7 Apr 2024 14:11:13 +0000
Subject: [PATCH 509/641] Add dropout to the infer path (needed by xformers
 test_dropout)

---
 .../attention_forward_generic_ck_tiled.cpp    | 12 +++------
 .../hip_fmha/ck_tiled_fmha_batched_forward.h  |  2 +-
 .../hip_fmha/ck_tiled_fmha_batched_infer.h    | 25 +++++++++++--------
 .../hip_fmha/ck_tiled_fmha_grouped_forward.h  |  2 +-
 .../hip_fmha/ck_tiled_fmha_grouped_infer.h    | 15 ++++++++---
 .../attention/hip_fmha/ck_tiled_fmha_params.h |  1 -
 xformers/ops/fmha/ck.py                       |  2 +-
 7 files changed, 33 insertions(+), 26 deletions(-)

diff --git a/xformers/csrc/attention/hip_fmha/attention_forward_generic_ck_tiled.cpp b/xformers/csrc/attention/hip_fmha/attention_forward_generic_ck_tiled.cpp
index 88e195c2d..48d37357b 100644
--- a/xformers/csrc/attention/hip_fmha/attention_forward_generic_ck_tiled.cpp
+++ b/xformers/csrc/attention/hip_fmha/attention_forward_generic_ck_tiled.cpp
@@ -201,13 +201,12 @@ efficient_attention_forward_ck(
     p.window_size =
         window_size.has_value() ? (*window_size > 0 ? *window_size : 0) : 0;
 
-    p.use_dropout = use_dropout;
     p.philox_seed = philox_seed;
     p.philox_offset = philox_offset;
     p.compute_logsumexp = compute_logsumexp;
 
     // the following parameters are only used by training forward
-    if (p.use_dropout) {
+    if (use_dropout) {
       p.dropout_prob = static_cast<float>(dropout_p);
     } else
       p.dropout_prob = 0.0f;
@@ -335,13 +334,12 @@ efficient_attention_forward_ck(
     } else
       p.seqlen_k_dev_ptr = nullptr;
 
-    p.use_dropout = use_dropout;
     p.philox_seed = philox_seed;
     p.philox_offset = philox_offset;
     p.compute_logsumexp = compute_logsumexp;
 
     // the following parameters are only used by training forward
-    if (p.use_dropout) {
+    if (use_dropout) {
       p.dropout_prob = static_cast<float>(dropout_p);
     } else
       p.dropout_prob = 0.0f;
@@ -367,8 +365,7 @@ efficient_attention_forward_ck(
 
     set_batched_forward_params(batched_forward_params);
 
-    if (!batched_forward_params.use_dropout &&
-        !batched_forward_params.compute_logsumexp) {
+    if (!batched_forward_params.compute_logsumexp) {
       if (inDataType == at::ScalarType::Half) {
         batched_infer_fp16(batched_forward_params, stream);
       } else if (inDataType == at::ScalarType::BFloat16) {
@@ -388,8 +385,7 @@ efficient_attention_forward_ck(
 
     set_grouped_forward_params(grouped_forward_params);
 
-    if (!grouped_forward_params.use_dropout &&
-        !grouped_forward_params.compute_logsumexp) {
+    if (!grouped_forward_params.compute_logsumexp) {
       if (inDataType == at::ScalarType::Half) {
         grouped_infer_fp16(grouped_forward_params, stream);
       } else if (inDataType == at::ScalarType::BFloat16) {
diff --git a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_forward.h b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_forward.h
index 1ee6178ff..251ee9fbf 100644
--- a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_forward.h
+++ b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_forward.h
@@ -203,7 +203,7 @@ struct batched_forward_causalmask_attnbias_dispatched {
           param.window_size,
           1.0f, // descale_qk, not used
           1.0f, // descale_sv, not used
-          param.use_dropout ? param.dropout_prob : 0.0f, // dropout ratio
+          param.dropout_prob, // dropout ratio
           false, // is_store_randval
           {param.philox_seed, param.philox_offset});
     }();
diff --git a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_infer.h b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_infer.h
index 840cd349d..6e448fd3f 100644
--- a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_infer.h
+++ b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_infer.h
@@ -60,6 +60,7 @@ struct batched_infer_causalmask_attnbias_dispatched {
 
     BOOL_SWITCH(has_local_attention, USE_LOCAL_ATTENTION, [&] {
       constexpr bool has_masking = has_causal_mask || USE_LOCAL_ATTENTION;
+      const bool has_dropout = (param.dropout_prob > 0.0f);
 
       using FmhaMask = ck::tile_program::block::
           GenericAttentionMask<has_masking, USE_LOCAL_ATTENTION>;
@@ -74,28 +75,32 @@ struct batched_infer_causalmask_attnbias_dispatched {
       const bool pad_headdim_v = !(param.Kv % FmhaShape::kN1 == 0);
       const bool pad_headdim_q = !(param.K % FmhaShape::kK0BlockLength == 0);
 
+      // usually headdim_q and headdim_v are same, consider them together to
+      // determine whether to do padding saving some compiling time
+      const bool pad_headdim = (pad_headdim_q || pad_headdim_v);
+
       const bool use_async_pipeline =
           ((param.K % 8 == 0) && (param.Kv % 8 == 0) && (MaxK <= 128));
 
       /*     if (!use_async_pipeline) { */
       BOOL_SWITCH_4(
+          has_dropout,
+          kHasDropout,
           pad_seqlen_q,
           kPadSeqLenQ,
           pad_seqlen_k,
           kPadSeqLenK,
-          pad_headdim_q,
-          kPadHeadDimQ,
-          pad_headdim_v,
-          kPadHeadDimV,
+          pad_headdim,
+          kPadHeadDim,
           [&] {
             using FmhaTraits = ck::tile_program::TileFmhaTraits<
                 kPadSeqLenQ,
                 kPadSeqLenK,
-                kPadHeadDimQ,
-                kPadHeadDimV,
+                kPadHeadDim, // kPadHeadDimQ,
+                kPadHeadDim, // kPadHeadDimV,
                 has_attn_bias,
                 false, // kStoreLSE
-                false, // kHasDropout
+                kHasDropout,
                 occupancy>;
 
             using FmhaPipelineProblem =
@@ -109,7 +114,7 @@ struct batched_infer_causalmask_attnbias_dispatched {
                 typename FmhaFwdTypeConfig<scalar_t>::OaccDataType,
                 typename FmhaFwdTypeConfig<scalar_t>::ODataType,
                 kPadSeqLenQ,
-                kPadHeadDimV>>;
+                kPadHeadDim>>;
 
             using FmhaKernel =
                 FmhaFwdKernel<FmhaTilePartitioner, FmhaPipeline, FmhaEpilogue>;
@@ -126,7 +131,7 @@ struct batched_infer_causalmask_attnbias_dispatched {
                     true, // kPadHeadDimV,
                     has_attn_bias,
                     false, // kStoreLSE
-                    false, // kHasDropout
+                    kHasDropout,
                     occupancy>;
 
                 using FmhaPipelineProblem =
@@ -198,7 +203,7 @@ struct batched_infer_causalmask_attnbias_dispatched {
           param.window_size,
           1.0f, // descale_qk, not used
           1.0f, // descale_sv, not used
-          0.0f, // p_dropout
+          param.dropout_prob, // dropout ratio
           false, // is_store_randval
           {0, 0});
     }();
diff --git a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_forward.h b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_forward.h
index 37e9210c9..897b1f2b6 100644
--- a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_forward.h
+++ b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_forward.h
@@ -154,7 +154,7 @@ struct grouped_forward_causalmask_attnbias_dispatched {
           param.window_size,
           1.0f, // descale_qk, not used
           1.0f, // descale_sv, not used
-          param.use_dropout ? param.dropout_prob : 0.0f, // dropout ratio
+          param.dropout_prob,
           false, // is_store_randval
           {param.philox_seed, param.philox_offset});
     }();
diff --git a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_infer.h b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_infer.h
index 7c09e2659..87a87d134 100644
--- a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_infer.h
+++ b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_infer.h
@@ -59,6 +59,7 @@ struct grouped_infer_causalmask_attnbias_dispatched {
 
     BOOL_SWITCH(has_local_attention, USE_LOCAL_ATTENTION, [&] {
       constexpr bool has_masking = has_causal_mask || USE_LOCAL_ATTENTION;
+      const bool has_dropout = (param.dropout_prob > 0.0f);
 
       using FmhaMask = ck::tile_program::block::
           GenericAttentionMask<has_masking, USE_LOCAL_ATTENTION>;
@@ -74,8 +75,14 @@ struct grouped_infer_causalmask_attnbias_dispatched {
       bool pad_headdim_q = !(param.K % FmhaShape::kK0BlockLength == 0);
       bool pad_headdim_v = !(param.Kv % FmhaShape::kN1 == 0);
 
-      BOOL_SWITCH_2(
-          pad_headdim_q, kPadHeadDimQ, pad_headdim_v, kPadHeadDimV, [&] {
+      BOOL_SWITCH_3(
+          has_dropout,
+          kHasDropout,
+          pad_headdim_q,
+          kPadHeadDimQ,
+          pad_headdim_v,
+          kPadHeadDimV,
+          [&] {
             using FmhaTraits = ck::tile_program::TileFmhaTraits<
                 kPadSeqLenQ,
                 kPadSeqLenK,
@@ -83,7 +90,7 @@ struct grouped_infer_causalmask_attnbias_dispatched {
                 kPadHeadDimV,
                 has_attn_bias,
                 false, // kStoreLSE
-                false, // kHasDropout
+                kHasDropout,
                 occupancy>;
 
             using FmhaPipelineProblem =
@@ -145,7 +152,7 @@ struct grouped_infer_causalmask_attnbias_dispatched {
           param.window_size,
           1.0f, // descale_qk, not used
           1.0f, // descale_sv, not used
-          0.0f, // p_dropout
+          param.dropout_prob,
           false, // is_store_randval
           {0, 0});
     }();
diff --git a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_params.h b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_params.h
index 7f2878487..e97db1e86 100644
--- a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_params.h
+++ b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_params.h
@@ -43,7 +43,6 @@ struct BatchedInferParams {
 };
 
 struct BatchedForwardParams : public BatchedInferParams {
-  bool use_dropout;
   bool compute_logsumexp;
 
   float dropout_prob;
diff --git a/xformers/ops/fmha/ck.py b/xformers/ops/fmha/ck.py
index 00aa1b02b..acc06f438 100644
--- a/xformers/ops/fmha/ck.py
+++ b/xformers/ops/fmha/ck.py
@@ -172,7 +172,7 @@ class FwOp(AttentionFwOpBase):
         BlockDiagonalCausalLocalAttentionFromBottomRightMask,
     }
 
-    SUPPORTS_DROPOUT = False
+    SUPPORTS_DROPOUT = True
     SUPPORTS_CUSTOM_SCALE = True
     SUPPORTS_DIFFERENT_VALUE_EMBED = True
     SUPPORTS_BMGHK = True

From 930bb257453f083e1fd63f491aed50bb95f5b5a3 Mon Sep 17 00:00:00 2001
From: Qianfeng Zhang <Qianfeng.Zhang@amd.com>
Date: Mon, 8 Apr 2024 14:24:46 +0000
Subject: [PATCH 510/641] Update to support test_dropout and
 test_dropout_backward tests

---
 .../hip_fmha/attention_ck_rand_uniform.cpp    |  5 ++--
 xformers/ops/fmha/dispatch.py                 | 24 ++++++++++++-------
 2 files changed, 18 insertions(+), 11 deletions(-)

diff --git a/xformers/csrc/attention/hip_fmha/attention_ck_rand_uniform.cpp b/xformers/csrc/attention/hip_fmha/attention_ck_rand_uniform.cpp
index 3933b6c5e..2f55d425a 100644
--- a/xformers/csrc/attention/hip_fmha/attention_ck_rand_uniform.cpp
+++ b/xformers/csrc/attention/hip_fmha/attention_ck_rand_uniform.cpp
@@ -54,11 +54,12 @@ at::Tensor rand_uniform_int(
   at::Tensor randvals;
 
   randvals = at::empty(
-      {B, num_heads, M, N}, out_pattern.options().dtype(at::ScalarType::Int));
+      {B, num_heads, M, N}, out_pattern.options().dtype(at::ScalarType::Byte));
 
   {
+    // only work for batched mode
     using FmhaRandUniformKernel_ =
-        FmhaRandUniformKernel<128, 64, 32, int32_t, false>;
+        FmhaRandUniformKernel<128, 64, 32, uint8_t, false>;
 
     const auto kargs = FmhaRandUniformKernel_::MakeKargs(
         randvals.data_ptr(),
diff --git a/xformers/ops/fmha/dispatch.py b/xformers/ops/fmha/dispatch.py
index 5bd343eb7..b65708395 100644
--- a/xformers/ops/fmha/dispatch.py
+++ b/xformers/ops/fmha/dispatch.py
@@ -134,15 +134,21 @@ def _is_cutlassB_faster_than_flash(inp: Inputs) -> bool:
 
 
 def _dispatch_bw(inp: Inputs) -> Type[AttentionBwOpBase]:
-    priority_list_ops: List[Type[AttentionBwOpBase]] = [
-        flash.BwOp,
-        cutlass.BwOp,
-        # CUDA illegal memory issues, race conditions etc..
-        # triton.BwOp,
-        # Deprecated
-        small_k.BwOp,
-    ]
-    if _is_cutlassB_faster_than_flash(inp):
+    if torch.version.cuda:
+        priority_list_ops: List[Type[AttentionBwOpBase]] = [
+            flash.BwOp,
+            cutlass.BwOp,
+            # CUDA illegal memory issues, race conditions etc..
+            # triton.BwOp,
+            # Deprecated
+            small_k.BwOp,
+        ]
+    else:
+        priority_list_ops = [
+                ck.BwOp,
+            ]
+
+    if torch.version.cuda and _is_cutlassB_faster_than_flash(inp):
         priority_list_ops.remove(cutlass.BwOp)
         priority_list_ops.insert(0, cutlass.BwOp)
     return _run_priority_list(

From bdbc956c91d3380c870b284758e4ef6aac1b2098 Mon Sep 17 00:00:00 2001
From: Qianfeng Zhang <Qianfeng.Zhang@amd.com>
Date: Tue, 9 Apr 2024 18:44:52 +0000
Subject: [PATCH 511/641] Update the padding method in batched_backward.h

---
 .../hip_fmha/ck_tiled_fmha_batched_backward.h    | 16 ++++++----------
 1 file changed, 6 insertions(+), 10 deletions(-)

diff --git a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_backward.h b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_backward.h
index a104ce4c7..85f1abb80 100644
--- a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_backward.h
+++ b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_backward.h
@@ -121,16 +121,12 @@ struct batched_backward_causalmask_attnbias_dispatched {
 
         const bool pad_seqlen_q = !(param.M % FmhaBwdShape_::kM0 == 0);
         const bool pad_seqlen_k = !(param.N % FmhaBwdShape_::kN0 == 0);
-        // const bool pad_headdim_q = !(param.K % FmhaBwdShape_::kK0 == 0);
+        const bool pad_headdim_q = !(param.K % FmhaBwdShape_::kK0 == 0);
         const bool pad_headdim_v = !(param.Kv % FmhaBwdShape_::kK2 == 0);
 
         // usually headdim_q and headdim_v are same, consider them together
         // to determine whether to do padding saving some compiling time
-        // bool pad_headdim = (pad_headdim_q || pad_headdim_v);
-
-        // currently headdim padding is not supported due to some atomic_add
-        // issue with bhalf_t
-        constexpr bool kPadHeadDimQ = false;
+        const bool pad_headdim = (pad_headdim_q || pad_headdim_v);
 
         BOOL_SWITCH_4(
             has_dropout,
@@ -139,14 +135,14 @@ struct batched_backward_causalmask_attnbias_dispatched {
             kPadSeqLenQ,
             pad_seqlen_k,
             kPadSeqLenK,
-            pad_headdim_v,
-            kPadHeadDimV,
+            pad_headdim,
+            kPadHeadDim,
             [&] {
               using FmhaBwdTraits_ = ck::tile_program::TileFmhaTraits<
                   kPadSeqLenQ,
                   kPadSeqLenK,
-                  kPadHeadDimQ,
-                  kPadHeadDimV,
+                  kPadHeadDim, // kPadHeadDimQ,
+                  kPadHeadDim, // kPadHeadDimV,
                   has_attn_bias,
                   false, // kStoreLSE
                   kHasDropout,

From 44fff2984277696cbc402eb7bd77549bb5fa0788 Mon Sep 17 00:00:00 2001
From: Qianfeng Zhang <Qianfeng.Zhang@amd.com>
Date: Tue, 9 Apr 2024 19:06:27 +0000
Subject: [PATCH 512/641] Update the OGradDotO kernel padding method

---
 .../hip_fmha/ck_tiled_fmha_grouped_backward.h | 63 +++++++++----------
 1 file changed, 31 insertions(+), 32 deletions(-)

diff --git a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_backward.h b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_backward.h
index 9587f2d17..7100fbe13 100644
--- a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_backward.h
+++ b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_backward.h
@@ -120,40 +120,39 @@ struct grouped_backward_causalmask_attnbias_dispatched {
         constexpr bool kPadSeqLenQ = true;
         constexpr bool kPadSeqLenK = true;
 
-        // const bool pad_headdim_q = !(param.K % FmhaBwdShape_::kK0 == 0);
+        const bool pad_headdim_q = !(param.K % FmhaBwdShape_::kK0 == 0);
         const bool pad_headdim_v = !(param.Kv % FmhaBwdShape_::kK2 == 0);
 
-        // currently headdim padding is not supported due to some atomic_add
-        // issue with bhalf_t
-        constexpr bool kPadHeadDimQ = false;
-
-        BOOL_SWITCH_2(
-            has_dropout, kHasDropout, pad_headdim_v, kPadHeadDimV, [&] {
-              using FmhaBwdTraits_ = ck::tile_program::TileFmhaTraits<
-                  kPadSeqLenQ,
-                  kPadSeqLenK,
-                  kPadHeadDimQ,
-                  kPadHeadDimV,
-                  has_attn_bias,
-                  false, // kStoreLSE
-                  kHasDropout,
-                  occupancy>;
-
-              using FmhaBwdPipelineProblem =
-                  FmhaBwdPipelineProblemTemp<FmhaBwdTraits_, FmhaMask>;
-
-              using FmhaBwdPipeline_ = typename ck::tile_program::block::
-                  BlockFmhaBwdPipelineDispatcher<
-                      FmhaBwdLoadStrategy_,
-                      FmhaBwdPipelineProblem>::BlockPipeline;
-
-              using FmhaBwdKernel_ = FmhaBwdKernel<
-                  FmhaBwdTilePartitioner_,
-                  FmhaBwdPipeline_,
-                  FmhaBwdEpilogue_>;
-
-              RunWithBwdKernel<FmhaBwdKernel_>(param, stream);
-            });
+        // usually headdim_q and headdim_v are same, consider them together
+        // to determine whether to do padding saving some compiling time
+        const bool pad_headdim = (pad_headdim_q || pad_headdim_v);
+
+        BOOL_SWITCH_2(has_dropout, kHasDropout, pad_headdim, kPadHeadDim, [&] {
+          using FmhaBwdTraits_ = ck::tile_program::TileFmhaTraits<
+              kPadSeqLenQ,
+              kPadSeqLenK,
+              kPadHeadDim, // kPadHeadDimQ,
+              kPadHeadDim, // kPadHeadDimV,
+              has_attn_bias,
+              false, // kStoreLSE
+              kHasDropout,
+              occupancy>;
+
+          using FmhaBwdPipelineProblem =
+              FmhaBwdPipelineProblemTemp<FmhaBwdTraits_, FmhaMask>;
+
+          using FmhaBwdPipeline_ =
+              typename ck::tile_program::block::BlockFmhaBwdPipelineDispatcher<
+                  FmhaBwdLoadStrategy_,
+                  FmhaBwdPipelineProblem>::BlockPipeline;
+
+          using FmhaBwdKernel_ = FmhaBwdKernel<
+              FmhaBwdTilePartitioner_,
+              FmhaBwdPipeline_,
+              FmhaBwdEpilogue_>;
+
+          RunWithBwdKernel<FmhaBwdKernel_>(param, stream);
+        });
       });
     };
   }

From d5c2d88e04f5b188962299913175e53958a0d68f Mon Sep 17 00:00:00 2001
From: Qianfeng Zhang <Qianfeng.Zhang@amd.com>
Date: Tue, 9 Apr 2024 21:28:46 +0000
Subject: [PATCH 513/641] Change the backward padding checking condition

---
 .../csrc/attention/hip_fmha/ck_tiled_fmha_batched_backward.h  | 4 ++--
 .../csrc/attention/hip_fmha/ck_tiled_fmha_grouped_backward.h  | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_backward.h b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_backward.h
index 85f1abb80..5b871628f 100644
--- a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_backward.h
+++ b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_backward.h
@@ -121,8 +121,8 @@ struct batched_backward_causalmask_attnbias_dispatched {
 
         const bool pad_seqlen_q = !(param.M % FmhaBwdShape_::kM0 == 0);
         const bool pad_seqlen_k = !(param.N % FmhaBwdShape_::kN0 == 0);
-        const bool pad_headdim_q = !(param.K % FmhaBwdShape_::kK0 == 0);
-        const bool pad_headdim_v = !(param.Kv % FmhaBwdShape_::kK2 == 0);
+        const bool pad_headdim_q = !(param.K % FmhaBwdShape_::kQKHeaddim == 0);
+        const bool pad_headdim_v = !(param.Kv % FmhaBwdShape_::kVHeaddim == 0);
 
         // usually headdim_q and headdim_v are same, consider them together
         // to determine whether to do padding saving some compiling time
diff --git a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_backward.h b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_backward.h
index 7100fbe13..2e7f73cef 100644
--- a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_backward.h
+++ b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_backward.h
@@ -120,8 +120,8 @@ struct grouped_backward_causalmask_attnbias_dispatched {
         constexpr bool kPadSeqLenQ = true;
         constexpr bool kPadSeqLenK = true;
 
-        const bool pad_headdim_q = !(param.K % FmhaBwdShape_::kK0 == 0);
-        const bool pad_headdim_v = !(param.Kv % FmhaBwdShape_::kK2 == 0);
+        const bool pad_headdim_q = !(param.K % FmhaBwdShape_::kQKHeaddim == 0);
+        const bool pad_headdim_v = !(param.Kv % FmhaBwdShape_::kVHeaddim == 0);
 
         // usually headdim_q and headdim_v are same, consider them together
         // to determine whether to do padding saving some compiling time

From ce9c23c8c030f3de7927af9e93cdf49bd8ae2457 Mon Sep 17 00:00:00 2001
From: Qianfeng Zhang <Qianfeng.Zhang@amd.com>
Date: Wed, 10 Apr 2024 15:00:05 +0000
Subject: [PATCH 514/641] Add batch_stride_lse/d parameters to adapt grouped
 mode forward/backward to [num_batches, H, MaxSeqlenQ] layout

---
 third_party/composable_kernel_tiled                             | 2 +-
 .../csrc/attention/hip_fmha/ck_tiled_fmha_grouped_backward.h    | 2 ++
 .../csrc/attention/hip_fmha/ck_tiled_fmha_grouped_forward.h     | 1 +
 xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_infer.h  | 1 +
 4 files changed, 5 insertions(+), 1 deletion(-)

diff --git a/third_party/composable_kernel_tiled b/third_party/composable_kernel_tiled
index 132bd39f0..6bb26d084 160000
--- a/third_party/composable_kernel_tiled
+++ b/third_party/composable_kernel_tiled
@@ -1 +1 @@
-Subproject commit 132bd39f02b7f5a04f9619c7dfd28efe9931971c
+Subproject commit 6bb26d084d4201531797c7b79f7ece723687352d
diff --git a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_backward.h b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_backward.h
index 2e7f73cef..c44440485 100644
--- a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_backward.h
+++ b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_backward.h
@@ -174,6 +174,7 @@ struct grouped_backward_causalmask_attnbias_dispatched {
           param.grad_out_strides[1], // nhead_stride_do
           param.out_strides[1], // nhead_stride_o
           param.lsed_strides[1],
+          param.lsed_strides[0], // batch_stride_d
           param.grad_out_strides[2]); // hdim_stride_do
     }();
 
@@ -238,6 +239,7 @@ struct grouped_backward_causalmask_attnbias_dispatched {
           param.lsed_strides[1], // assume lse/dot is in BHM contiguous layout
           param.attn_bias_strides[0], // assume grad_bias has same strides as
                                       // bias
+          param.lsed_strides[0], // batch_stride_lse
           param.grad_out_strides[2], // hdim_stride_do
           static_cast<CausalMaskType>(param.custom_mask_type),
           param.window_size,
diff --git a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_forward.h b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_forward.h
index 897b1f2b6..d50e18431 100644
--- a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_forward.h
+++ b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_forward.h
@@ -150,6 +150,7 @@ struct grouped_forward_causalmask_attnbias_dispatched {
           0, // nhead_stride_randval
           param.lse_strides[1],
           param.out_strides[1],
+          param.lse_strides[0], // batch_stride_lse
           static_cast<CausalMaskType>(param.custom_mask_type),
           param.window_size,
           1.0f, // descale_qk, not used
diff --git a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_infer.h b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_infer.h
index 87a87d134..b710d464c 100644
--- a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_infer.h
+++ b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_infer.h
@@ -148,6 +148,7 @@ struct grouped_infer_causalmask_attnbias_dispatched {
           0, // nhead_stride_randval
           0, // nhead_stride_lse
           param.out_strides[1],
+          0, // batch_stride_lse
           static_cast<CausalMaskType>(param.custom_mask_type),
           param.window_size,
           1.0f, // descale_qk, not used

From dafea78de208f74a23523adaaf5b16c96047fb40 Mon Sep 17 00:00:00 2001
From: Qianfeng Zhang <Qianfeng.Zhang@amd.com>
Date: Wed, 10 Apr 2024 16:42:09 +0000
Subject: [PATCH 515/641] Fill the grad_bias in advance

---
 .../hip_fmha/attention_backward_generic_ck_tiled.cpp        | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/xformers/csrc/attention/hip_fmha/attention_backward_generic_ck_tiled.cpp b/xformers/csrc/attention/hip_fmha/attention_backward_generic_ck_tiled.cpp
index 065cd6484..ac4bceeef 100644
--- a/xformers/csrc/attention/hip_fmha/attention_backward_generic_ck_tiled.cpp
+++ b/xformers/csrc/attention/hip_fmha/attention_backward_generic_ck_tiled.cpp
@@ -173,9 +173,13 @@ efficient_attention_backward_ck(
 
   // even it is an output, the grad_bias is required to use the same data-type
   // as bias in CK-FlashAttn
-  if (bias_requires_grad)
+  if (bias_requires_grad) {
     grad_bias =
         at::empty_strided(bias->sizes(), bias->strides(), bias->options());
+    // cleaning is needed since masked tile does no outputting in our
+    // implementation
+    grad_bias.fill_(0);
+  }
 
   bool is_mqa_gqa = (Hq > Hkv);
 

From 06ad68975b073bbbb36e641468b549b4c4e00ebc Mon Sep 17 00:00:00 2001
From: Qianfeng Zhang <Qianfeng.Zhang@amd.com>
Date: Thu, 11 Apr 2024 07:41:26 +0000
Subject: [PATCH 516/641] Add support for kHasBiasGrad as instance template

---
 third_party/composable_kernel_tiled           |  2 +-
 .../hip_fmha/ck_tiled_fmha_batched_backward.h |  4 +
 .../ck_tiled_fmha_batched_backward_bp16.cpp   | 83 ++++++++++++-------
 .../ck_tiled_fmha_batched_backward_fp16.cpp   | 83 ++++++++++++-------
 .../hip_fmha/ck_tiled_fmha_batched_forward.h  |  1 +
 .../hip_fmha/ck_tiled_fmha_batched_infer.h    |  1 +
 .../hip_fmha/ck_tiled_fmha_grouped_backward.h |  4 +
 .../ck_tiled_fmha_grouped_backward_bp16.cpp   | 83 ++++++++++++-------
 .../ck_tiled_fmha_grouped_backward_fp16.cpp   | 83 ++++++++++++-------
 .../hip_fmha/ck_tiled_fmha_grouped_forward.h  |  1 +
 .../hip_fmha/ck_tiled_fmha_grouped_infer.h    |  1 +
 ...p16_no_causalmask_no_attnbias_maxk_128.cpp |  1 +
 ...bp16_no_causalmask_no_attnbias_maxk_32.cpp |  1 +
 ...bp16_no_causalmask_no_attnbias_maxk_64.cpp |  1 +
 ...ask_with_attnbias_no_biasgrad_maxk_128.cpp | 16 ++++
 ...mask_with_attnbias_no_biasgrad_maxk_32.cpp | 16 ++++
 ...mask_with_attnbias_no_biasgrad_maxk_64.cpp | 16 ++++
 ..._with_attnbias_with_biasgrad_maxk_128.cpp} |  1 +
 ...k_with_attnbias_with_biasgrad_maxk_32.cpp} |  1 +
 ...k_with_attnbias_with_biasgrad_maxk_64.cpp} |  1 +
 ...6_with_causalmask_no_attnbias_maxk_128.cpp |  1 +
 ...16_with_causalmask_no_attnbias_maxk_32.cpp |  1 +
 ...16_with_causalmask_no_attnbias_maxk_64.cpp |  1 +
 ...ask_with_attnbias_no_biasgrad_maxk_128.cpp | 16 ++++
 ...mask_with_attnbias_no_biasgrad_maxk_32.cpp | 16 ++++
 ...mask_with_attnbias_no_biasgrad_maxk_64.cpp | 16 ++++
 ..._with_attnbias_with_biasgrad_maxk_128.cpp} |  1 +
 ...k_with_attnbias_with_biasgrad_maxk_32.cpp} |  1 +
 ...k_with_attnbias_with_biasgrad_maxk_64.cpp} |  1 +
 ...p16_no_causalmask_no_attnbias_maxk_128.cpp |  1 +
 ...fp16_no_causalmask_no_attnbias_maxk_32.cpp |  1 +
 ...fp16_no_causalmask_no_attnbias_maxk_64.cpp |  1 +
 ...ask_with_attnbias_no_biasgrad_maxk_128.cpp | 16 ++++
 ...mask_with_attnbias_no_biasgrad_maxk_32.cpp | 16 ++++
 ...mask_with_attnbias_no_biasgrad_maxk_64.cpp | 16 ++++
 ..._with_attnbias_with_biasgrad_maxk_128.cpp} |  1 +
 ...k_with_attnbias_with_biasgrad_maxk_32.cpp} |  1 +
 ...k_with_attnbias_with_biasgrad_maxk_64.cpp} |  1 +
 ...6_with_causalmask_no_attnbias_maxk_128.cpp |  1 +
 ...16_with_causalmask_no_attnbias_maxk_32.cpp |  1 +
 ...16_with_causalmask_no_attnbias_maxk_64.cpp |  1 +
 ...ask_with_attnbias_no_biasgrad_maxk_128.cpp | 16 ++++
 ...mask_with_attnbias_no_biasgrad_maxk_32.cpp | 16 ++++
 ...mask_with_attnbias_no_biasgrad_maxk_64.cpp | 16 ++++
 ..._with_attnbias_with_biasgrad_maxk_128.cpp} |  1 +
 ...k_with_attnbias_with_biasgrad_maxk_32.cpp} |  1 +
 ...k_with_attnbias_with_biasgrad_maxk_64.cpp} |  1 +
 ...p16_no_causalmask_no_attnbias_maxk_128.cpp |  1 +
 ...bp16_no_causalmask_no_attnbias_maxk_32.cpp |  1 +
 ...bp16_no_causalmask_no_attnbias_maxk_64.cpp |  1 +
 ...ask_with_attnbias_no_biasgrad_maxk_128.cpp | 16 ++++
 ...mask_with_attnbias_no_biasgrad_maxk_32.cpp | 16 ++++
 ...mask_with_attnbias_no_biasgrad_maxk_64.cpp | 16 ++++
 ..._with_attnbias_with_biasgrad_maxk_128.cpp} |  1 +
 ...k_with_attnbias_with_biasgrad_maxk_32.cpp} |  1 +
 ...k_with_attnbias_with_biasgrad_maxk_64.cpp} |  1 +
 ...6_with_causalmask_no_attnbias_maxk_128.cpp |  1 +
 ...16_with_causalmask_no_attnbias_maxk_32.cpp |  1 +
 ...16_with_causalmask_no_attnbias_maxk_64.cpp |  1 +
 ...ask_with_attnbias_no_biasgrad_maxk_128.cpp | 16 ++++
 ...mask_with_attnbias_no_biasgrad_maxk_32.cpp | 16 ++++
 ...mask_with_attnbias_no_biasgrad_maxk_64.cpp | 16 ++++
 ..._with_attnbias_with_biasgrad_maxk_128.cpp} |  1 +
 ...k_with_attnbias_with_biasgrad_maxk_32.cpp} |  1 +
 ...k_with_attnbias_with_biasgrad_maxk_64.cpp} |  1 +
 ...p16_no_causalmask_no_attnbias_maxk_128.cpp |  1 +
 ...fp16_no_causalmask_no_attnbias_maxk_32.cpp |  1 +
 ...fp16_no_causalmask_no_attnbias_maxk_64.cpp |  1 +
 ...ask_with_attnbias_no_biasgrad_maxk_128.cpp | 16 ++++
 ...mask_with_attnbias_no_biasgrad_maxk_32.cpp | 16 ++++
 ...mask_with_attnbias_no_biasgrad_maxk_64.cpp | 16 ++++
 ..._with_attnbias_with_biasgrad_maxk_128.cpp} |  1 +
 ...k_with_attnbias_with_biasgrad_maxk_32.cpp} |  1 +
 ...k_with_attnbias_with_biasgrad_maxk_64.cpp} |  1 +
 ...6_with_causalmask_no_attnbias_maxk_128.cpp |  1 +
 ...16_with_causalmask_no_attnbias_maxk_32.cpp |  1 +
 ...16_with_causalmask_no_attnbias_maxk_64.cpp |  1 +
 ...ask_with_attnbias_no_biasgrad_maxk_128.cpp | 16 ++++
 ...mask_with_attnbias_no_biasgrad_maxk_32.cpp | 16 ++++
 ...mask_with_attnbias_no_biasgrad_maxk_64.cpp | 16 ++++
 ..._with_attnbias_with_biasgrad_maxk_128.cpp} |  1 +
 ...k_with_attnbias_with_biasgrad_maxk_32.cpp} |  1 +
 ...k_with_attnbias_with_biasgrad_maxk_64.cpp} |  1 +
 83 files changed, 657 insertions(+), 121 deletions(-)
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_backward_bp16_no_causalmask_with_attnbias_no_biasgrad_maxk_128.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_backward_bp16_no_causalmask_with_attnbias_no_biasgrad_maxk_32.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_backward_bp16_no_causalmask_with_attnbias_no_biasgrad_maxk_64.cpp
 rename xformers/csrc/attention/hip_fmha/instances/{ck_tiled_fmha_batched_backward_bp16_no_causalmask_with_attnbias_maxk_128.cpp => ck_tiled_fmha_batched_backward_bp16_no_causalmask_with_attnbias_with_biasgrad_maxk_128.cpp} (97%)
 rename xformers/csrc/attention/hip_fmha/instances/{ck_tiled_fmha_batched_backward_bp16_no_causalmask_with_attnbias_maxk_32.cpp => ck_tiled_fmha_batched_backward_bp16_no_causalmask_with_attnbias_with_biasgrad_maxk_32.cpp} (97%)
 rename xformers/csrc/attention/hip_fmha/instances/{ck_tiled_fmha_batched_backward_bp16_no_causalmask_with_attnbias_maxk_64.cpp => ck_tiled_fmha_batched_backward_bp16_no_causalmask_with_attnbias_with_biasgrad_maxk_64.cpp} (97%)
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_backward_bp16_with_causalmask_with_attnbias_no_biasgrad_maxk_128.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_backward_bp16_with_causalmask_with_attnbias_no_biasgrad_maxk_32.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_backward_bp16_with_causalmask_with_attnbias_no_biasgrad_maxk_64.cpp
 rename xformers/csrc/attention/hip_fmha/instances/{ck_tiled_fmha_batched_backward_bp16_with_causalmask_with_attnbias_maxk_128.cpp => ck_tiled_fmha_batched_backward_bp16_with_causalmask_with_attnbias_with_biasgrad_maxk_128.cpp} (97%)
 rename xformers/csrc/attention/hip_fmha/instances/{ck_tiled_fmha_batched_backward_bp16_with_causalmask_with_attnbias_maxk_32.cpp => ck_tiled_fmha_batched_backward_bp16_with_causalmask_with_attnbias_with_biasgrad_maxk_32.cpp} (97%)
 rename xformers/csrc/attention/hip_fmha/instances/{ck_tiled_fmha_batched_backward_bp16_with_causalmask_with_attnbias_maxk_64.cpp => ck_tiled_fmha_batched_backward_bp16_with_causalmask_with_attnbias_with_biasgrad_maxk_64.cpp} (97%)
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_backward_fp16_no_causalmask_with_attnbias_no_biasgrad_maxk_128.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_backward_fp16_no_causalmask_with_attnbias_no_biasgrad_maxk_32.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_backward_fp16_no_causalmask_with_attnbias_no_biasgrad_maxk_64.cpp
 rename xformers/csrc/attention/hip_fmha/instances/{ck_tiled_fmha_batched_backward_fp16_no_causalmask_with_attnbias_maxk_128.cpp => ck_tiled_fmha_batched_backward_fp16_no_causalmask_with_attnbias_with_biasgrad_maxk_128.cpp} (97%)
 rename xformers/csrc/attention/hip_fmha/instances/{ck_tiled_fmha_batched_backward_fp16_no_causalmask_with_attnbias_maxk_32.cpp => ck_tiled_fmha_batched_backward_fp16_no_causalmask_with_attnbias_with_biasgrad_maxk_32.cpp} (97%)
 rename xformers/csrc/attention/hip_fmha/instances/{ck_tiled_fmha_batched_backward_fp16_no_causalmask_with_attnbias_maxk_64.cpp => ck_tiled_fmha_batched_backward_fp16_no_causalmask_with_attnbias_with_biasgrad_maxk_64.cpp} (97%)
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_backward_fp16_with_causalmask_with_attnbias_no_biasgrad_maxk_128.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_backward_fp16_with_causalmask_with_attnbias_no_biasgrad_maxk_32.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_backward_fp16_with_causalmask_with_attnbias_no_biasgrad_maxk_64.cpp
 rename xformers/csrc/attention/hip_fmha/instances/{ck_tiled_fmha_batched_backward_fp16_with_causalmask_with_attnbias_maxk_128.cpp => ck_tiled_fmha_batched_backward_fp16_with_causalmask_with_attnbias_with_biasgrad_maxk_128.cpp} (97%)
 rename xformers/csrc/attention/hip_fmha/instances/{ck_tiled_fmha_batched_backward_fp16_with_causalmask_with_attnbias_maxk_32.cpp => ck_tiled_fmha_batched_backward_fp16_with_causalmask_with_attnbias_with_biasgrad_maxk_32.cpp} (97%)
 rename xformers/csrc/attention/hip_fmha/instances/{ck_tiled_fmha_batched_backward_fp16_with_causalmask_with_attnbias_maxk_64.cpp => ck_tiled_fmha_batched_backward_fp16_with_causalmask_with_attnbias_with_biasgrad_maxk_64.cpp} (97%)
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_backward_bp16_no_causalmask_with_attnbias_no_biasgrad_maxk_128.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_backward_bp16_no_causalmask_with_attnbias_no_biasgrad_maxk_32.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_backward_bp16_no_causalmask_with_attnbias_no_biasgrad_maxk_64.cpp
 rename xformers/csrc/attention/hip_fmha/instances/{ck_tiled_fmha_grouped_backward_bp16_no_causalmask_with_attnbias_maxk_128.cpp => ck_tiled_fmha_grouped_backward_bp16_no_causalmask_with_attnbias_with_biasgrad_maxk_128.cpp} (97%)
 rename xformers/csrc/attention/hip_fmha/instances/{ck_tiled_fmha_grouped_backward_bp16_no_causalmask_with_attnbias_maxk_32.cpp => ck_tiled_fmha_grouped_backward_bp16_no_causalmask_with_attnbias_with_biasgrad_maxk_32.cpp} (97%)
 rename xformers/csrc/attention/hip_fmha/instances/{ck_tiled_fmha_grouped_backward_bp16_no_causalmask_with_attnbias_maxk_64.cpp => ck_tiled_fmha_grouped_backward_bp16_no_causalmask_with_attnbias_with_biasgrad_maxk_64.cpp} (97%)
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_backward_bp16_with_causalmask_with_attnbias_no_biasgrad_maxk_128.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_backward_bp16_with_causalmask_with_attnbias_no_biasgrad_maxk_32.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_backward_bp16_with_causalmask_with_attnbias_no_biasgrad_maxk_64.cpp
 rename xformers/csrc/attention/hip_fmha/instances/{ck_tiled_fmha_grouped_backward_bp16_with_causalmask_with_attnbias_maxk_128.cpp => ck_tiled_fmha_grouped_backward_bp16_with_causalmask_with_attnbias_with_biasgrad_maxk_128.cpp} (97%)
 rename xformers/csrc/attention/hip_fmha/instances/{ck_tiled_fmha_grouped_backward_bp16_with_causalmask_with_attnbias_maxk_32.cpp => ck_tiled_fmha_grouped_backward_bp16_with_causalmask_with_attnbias_with_biasgrad_maxk_32.cpp} (97%)
 rename xformers/csrc/attention/hip_fmha/instances/{ck_tiled_fmha_grouped_backward_bp16_with_causalmask_with_attnbias_maxk_64.cpp => ck_tiled_fmha_grouped_backward_bp16_with_causalmask_with_attnbias_with_biasgrad_maxk_64.cpp} (97%)
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_backward_fp16_no_causalmask_with_attnbias_no_biasgrad_maxk_128.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_backward_fp16_no_causalmask_with_attnbias_no_biasgrad_maxk_32.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_backward_fp16_no_causalmask_with_attnbias_no_biasgrad_maxk_64.cpp
 rename xformers/csrc/attention/hip_fmha/instances/{ck_tiled_fmha_grouped_backward_fp16_no_causalmask_with_attnbias_maxk_128.cpp => ck_tiled_fmha_grouped_backward_fp16_no_causalmask_with_attnbias_with_biasgrad_maxk_128.cpp} (97%)
 rename xformers/csrc/attention/hip_fmha/instances/{ck_tiled_fmha_grouped_backward_fp16_no_causalmask_with_attnbias_maxk_32.cpp => ck_tiled_fmha_grouped_backward_fp16_no_causalmask_with_attnbias_with_biasgrad_maxk_32.cpp} (97%)
 rename xformers/csrc/attention/hip_fmha/instances/{ck_tiled_fmha_grouped_backward_fp16_no_causalmask_with_attnbias_maxk_64.cpp => ck_tiled_fmha_grouped_backward_fp16_no_causalmask_with_attnbias_with_biasgrad_maxk_64.cpp} (97%)
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_backward_fp16_with_causalmask_with_attnbias_no_biasgrad_maxk_128.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_backward_fp16_with_causalmask_with_attnbias_no_biasgrad_maxk_32.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_backward_fp16_with_causalmask_with_attnbias_no_biasgrad_maxk_64.cpp
 rename xformers/csrc/attention/hip_fmha/instances/{ck_tiled_fmha_grouped_backward_fp16_with_causalmask_with_attnbias_maxk_128.cpp => ck_tiled_fmha_grouped_backward_fp16_with_causalmask_with_attnbias_with_biasgrad_maxk_128.cpp} (97%)
 rename xformers/csrc/attention/hip_fmha/instances/{ck_tiled_fmha_grouped_backward_fp16_with_causalmask_with_attnbias_maxk_32.cpp => ck_tiled_fmha_grouped_backward_fp16_with_causalmask_with_attnbias_with_biasgrad_maxk_32.cpp} (97%)
 rename xformers/csrc/attention/hip_fmha/instances/{ck_tiled_fmha_grouped_backward_fp16_with_causalmask_with_attnbias_maxk_64.cpp => ck_tiled_fmha_grouped_backward_fp16_with_causalmask_with_attnbias_with_biasgrad_maxk_64.cpp} (97%)

diff --git a/third_party/composable_kernel_tiled b/third_party/composable_kernel_tiled
index 6bb26d084..617dd51bb 160000
--- a/third_party/composable_kernel_tiled
+++ b/third_party/composable_kernel_tiled
@@ -1 +1 @@
-Subproject commit 6bb26d084d4201531797c7b79f7ece723687352d
+Subproject commit 617dd51bb8f85488e9c73c498cd6fc7b6b002b42
diff --git a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_backward.h b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_backward.h
index 5b871628f..688acc70b 100644
--- a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_backward.h
+++ b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_backward.h
@@ -33,6 +33,7 @@ template <
     typename scalar_t,
     bool has_causal_mask,
     bool has_attn_bias,
+    bool has_bias_grad,
     ck::index_t MaxK>
 struct batched_backward_causalmask_attnbias_dispatched {
   using FmhaBwdEpilogue_ = FmhaBwdEpilogue<FmhaBwdEpilogueProblem<
@@ -144,6 +145,7 @@ struct batched_backward_causalmask_attnbias_dispatched {
                   kPadHeadDim, // kPadHeadDimQ,
                   kPadHeadDim, // kPadHeadDimV,
                   has_attn_bias,
+                  has_bias_grad,
                   false, // kStoreLSE
                   kHasDropout,
                   occupancy>;
@@ -288,6 +290,7 @@ template <
     typename scalar_t,
     bool has_causal_mask,
     bool has_attn_bias,
+    bool has_bias_grad,
     ck::index_t MaxK>
 void run_batched_backward_causalmask_attnbias_dispatched(
     BatchedBackwardParams& param,
@@ -296,5 +299,6 @@ void run_batched_backward_causalmask_attnbias_dispatched(
       scalar_t,
       has_causal_mask,
       has_attn_bias,
+      has_bias_grad,
       MaxK>::Run(param, stream);
 };
diff --git a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_backward_bp16.cpp b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_backward_bp16.cpp
index bbcbe8784..f82fdc061 100644
--- a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_backward_bp16.cpp
+++ b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_backward_bp16.cpp
@@ -13,51 +13,74 @@
 #include "ck_tiled_headdim_switch.h"
 
 // clang-format off
-extern template void run_batched_backward_causalmask_attnbias_dispatched<ck::bhalf_t, false, true, 32>(
+extern template void run_batched_backward_causalmask_attnbias_dispatched<ck::bhalf_t, false, true, true, 32>(
     BatchedBackwardParams& param, hipStream_t stream);
-extern template void run_batched_backward_causalmask_attnbias_dispatched<ck::bhalf_t, false, false, 32>(
+extern template void run_batched_backward_causalmask_attnbias_dispatched<ck::bhalf_t, false, true, false, 32>(
     BatchedBackwardParams& param, hipStream_t stream);
-extern template void run_batched_backward_causalmask_attnbias_dispatched<ck::bhalf_t, true, true, 32>(
+extern template void run_batched_backward_causalmask_attnbias_dispatched<ck::bhalf_t, false, false, false, 32>(
     BatchedBackwardParams& param, hipStream_t stream);
-extern template void run_batched_backward_causalmask_attnbias_dispatched<ck::bhalf_t, true, false, 32>(
+extern template void run_batched_backward_causalmask_attnbias_dispatched<ck::bhalf_t, true, true, true, 32>(
+    BatchedBackwardParams& param, hipStream_t stream);
+extern template void run_batched_backward_causalmask_attnbias_dispatched<ck::bhalf_t, true, true, false, 32>(
+    BatchedBackwardParams& param, hipStream_t stream);
+extern template void run_batched_backward_causalmask_attnbias_dispatched<ck::bhalf_t, true, false, false, 32>(
     BatchedBackwardParams& param, hipStream_t stream);
 
-extern template void run_batched_backward_causalmask_attnbias_dispatched<ck::bhalf_t, false, true, 64>(
+extern template void run_batched_backward_causalmask_attnbias_dispatched<ck::bhalf_t, false, true, true, 64>(
+    BatchedBackwardParams& param, hipStream_t stream);
+extern template void run_batched_backward_causalmask_attnbias_dispatched<ck::bhalf_t, false, true, false, 64>(
     BatchedBackwardParams& param, hipStream_t stream);
-extern template void run_batched_backward_causalmask_attnbias_dispatched<ck::bhalf_t, false, false, 64>(
+extern template void run_batched_backward_causalmask_attnbias_dispatched<ck::bhalf_t, false, false, false, 64>(
     BatchedBackwardParams& param, hipStream_t stream);
-extern template void run_batched_backward_causalmask_attnbias_dispatched<ck::bhalf_t, true, true, 64>(
+extern template void run_batched_backward_causalmask_attnbias_dispatched<ck::bhalf_t, true, true, true, 64>(
     BatchedBackwardParams& param, hipStream_t stream);
-extern template void run_batched_backward_causalmask_attnbias_dispatched<ck::bhalf_t, true, false, 64>(
+extern template void run_batched_backward_causalmask_attnbias_dispatched<ck::bhalf_t, true, true, false, 64>(
+    BatchedBackwardParams& param, hipStream_t stream);
+extern template void run_batched_backward_causalmask_attnbias_dispatched<ck::bhalf_t, true, false, false, 64>(
     BatchedBackwardParams& param, hipStream_t stream);
 
-extern template void run_batched_backward_causalmask_attnbias_dispatched<ck::bhalf_t, false, true, 128>(
+extern template void run_batched_backward_causalmask_attnbias_dispatched<ck::bhalf_t, false, true, true, 128>(
+    BatchedBackwardParams& param, hipStream_t stream);
+extern template void run_batched_backward_causalmask_attnbias_dispatched<ck::bhalf_t, false, true, false, 128>(
+    BatchedBackwardParams& param, hipStream_t stream);
+extern template void run_batched_backward_causalmask_attnbias_dispatched<ck::bhalf_t, false, false, false, 128>(
     BatchedBackwardParams& param, hipStream_t stream);
-extern template void run_batched_backward_causalmask_attnbias_dispatched<ck::bhalf_t, false, false, 128>(
+extern template void run_batched_backward_causalmask_attnbias_dispatched<ck::bhalf_t, true, true, true, 128>(
     BatchedBackwardParams& param, hipStream_t stream);
-extern template void run_batched_backward_causalmask_attnbias_dispatched<ck::bhalf_t, true, true, 128>(
+extern template void run_batched_backward_causalmask_attnbias_dispatched<ck::bhalf_t, true, true, false, 128>(
     BatchedBackwardParams& param, hipStream_t stream);
-extern template void run_batched_backward_causalmask_attnbias_dispatched<ck::bhalf_t, true, false, 128>(
+extern template void run_batched_backward_causalmask_attnbias_dispatched<ck::bhalf_t, true, false, false, 128>(
     BatchedBackwardParams& param, hipStream_t stream);
 // clang-format on
 
 void batched_backward_bp16(BatchedBackwardParams& param, hipStream_t stream) {
-  BOOL_SWITCH(param.has_attn_bias, HAS_ATTN_BIAS, [&] {
-    FMHA_BWD_HEADDIM_SWITCH(param.K, param.Kv, MaxK, [&] {
-      if (param.custom_mask_type == 0)
-        run_batched_backward_causalmask_attnbias_dispatched<
-            ck::bhalf_t,
-            false,
-            HAS_ATTN_BIAS,
-            MaxK>(param, stream);
-      else if (param.custom_mask_type == 1 || param.custom_mask_type == 2)
-        run_batched_backward_causalmask_attnbias_dispatched<
-            ck::bhalf_t,
-            true,
-            HAS_ATTN_BIAS,
-            MaxK>(param, stream);
-      else
-        throw std::runtime_error("Invalid custom_mask_type value");
-    });
-  });
+  BOOL_SWITCH_2(
+      param.has_attn_bias,
+      HAS_ATTN_BIAS,
+      param.bias_has_grad,
+      HAS_BIAS_GRAD,
+      [&] {
+        if constexpr (HAS_ATTN_BIAS || !HAS_BIAS_GRAD) {
+          FMHA_BWD_HEADDIM_SWITCH(param.K, param.Kv, MaxK, [&] {
+            if (param.custom_mask_type == 0)
+              run_batched_backward_causalmask_attnbias_dispatched<
+                  ck::bhalf_t,
+                  false,
+                  HAS_ATTN_BIAS,
+                  HAS_BIAS_GRAD,
+                  MaxK>(param, stream);
+            else if (param.custom_mask_type == 1 || param.custom_mask_type == 2)
+              run_batched_backward_causalmask_attnbias_dispatched<
+                  ck::bhalf_t,
+                  true,
+                  HAS_ATTN_BIAS,
+                  HAS_BIAS_GRAD,
+                  MaxK>(param, stream);
+            else
+              throw std::runtime_error("Invalid custom_mask_type value");
+          });
+        } else
+          throw std::runtime_error(
+              "bias_has_grad should be false when has_attn_bias is false!");
+      });
 };
diff --git a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_backward_fp16.cpp b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_backward_fp16.cpp
index 35df8c293..f8395acdb 100644
--- a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_backward_fp16.cpp
+++ b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_backward_fp16.cpp
@@ -13,51 +13,74 @@
 #include "ck_tiled_headdim_switch.h"
 
 // clang-format off
-extern template void run_batched_backward_causalmask_attnbias_dispatched<ck::half_t, false, true, 32>(
+extern template void run_batched_backward_causalmask_attnbias_dispatched<ck::half_t, false, true, true, 32>(
     BatchedBackwardParams& param, hipStream_t stream);
-extern template void run_batched_backward_causalmask_attnbias_dispatched<ck::half_t, false, false, 32>(
+extern template void run_batched_backward_causalmask_attnbias_dispatched<ck::half_t, false, true, false, 32>(
     BatchedBackwardParams& param, hipStream_t stream);
-extern template void run_batched_backward_causalmask_attnbias_dispatched<ck::half_t, true, true, 32>(
+extern template void run_batched_backward_causalmask_attnbias_dispatched<ck::half_t, false, false, false, 32>(
     BatchedBackwardParams& param, hipStream_t stream);
-extern template void run_batched_backward_causalmask_attnbias_dispatched<ck::half_t, true, false, 32>(
+extern template void run_batched_backward_causalmask_attnbias_dispatched<ck::half_t, true, true, true, 32>(
+    BatchedBackwardParams& param, hipStream_t stream);
+extern template void run_batched_backward_causalmask_attnbias_dispatched<ck::half_t, true, true, false, 32>(
+    BatchedBackwardParams& param, hipStream_t stream);
+extern template void run_batched_backward_causalmask_attnbias_dispatched<ck::half_t, true, false, false, 32>(
     BatchedBackwardParams& param, hipStream_t stream);
 
-extern template void run_batched_backward_causalmask_attnbias_dispatched<ck::half_t, false, true, 64>(
+extern template void run_batched_backward_causalmask_attnbias_dispatched<ck::half_t, false, true, true, 64>(
+    BatchedBackwardParams& param, hipStream_t stream);
+extern template void run_batched_backward_causalmask_attnbias_dispatched<ck::half_t, false, true, false, 64>(
     BatchedBackwardParams& param, hipStream_t stream);
-extern template void run_batched_backward_causalmask_attnbias_dispatched<ck::half_t, false, false, 64>(
+extern template void run_batched_backward_causalmask_attnbias_dispatched<ck::half_t, false, false, false, 64>(
     BatchedBackwardParams& param, hipStream_t stream);
-extern template void run_batched_backward_causalmask_attnbias_dispatched<ck::half_t, true, true, 64>(
+extern template void run_batched_backward_causalmask_attnbias_dispatched<ck::half_t, true, true, true, 64>(
     BatchedBackwardParams& param, hipStream_t stream);
-extern template void run_batched_backward_causalmask_attnbias_dispatched<ck::half_t, true, false, 64>(
+extern template void run_batched_backward_causalmask_attnbias_dispatched<ck::half_t, true, true, false, 64>(
+    BatchedBackwardParams& param, hipStream_t stream);
+extern template void run_batched_backward_causalmask_attnbias_dispatched<ck::half_t, true, false, false, 64>(
     BatchedBackwardParams& param, hipStream_t stream);
 
-extern template void run_batched_backward_causalmask_attnbias_dispatched<ck::half_t, false, true, 128>(
+extern template void run_batched_backward_causalmask_attnbias_dispatched<ck::half_t, false, true, true, 128>(
+    BatchedBackwardParams& param, hipStream_t stream);
+extern template void run_batched_backward_causalmask_attnbias_dispatched<ck::half_t, false, true, false, 128>(
+    BatchedBackwardParams& param, hipStream_t stream);
+extern template void run_batched_backward_causalmask_attnbias_dispatched<ck::half_t, false, false, false, 128>(
     BatchedBackwardParams& param, hipStream_t stream);
-extern template void run_batched_backward_causalmask_attnbias_dispatched<ck::half_t, false, false, 128>(
+extern template void run_batched_backward_causalmask_attnbias_dispatched<ck::half_t, true, true, true, 128>(
     BatchedBackwardParams& param, hipStream_t stream);
-extern template void run_batched_backward_causalmask_attnbias_dispatched<ck::half_t, true, true, 128>(
+extern template void run_batched_backward_causalmask_attnbias_dispatched<ck::half_t, true, true, false, 128>(
     BatchedBackwardParams& param, hipStream_t stream);
-extern template void run_batched_backward_causalmask_attnbias_dispatched<ck::half_t, true, false, 128>(
+extern template void run_batched_backward_causalmask_attnbias_dispatched<ck::half_t, true, false, false, 128>(
     BatchedBackwardParams& param, hipStream_t stream);
 // clang-format on
 
 void batched_backward_fp16(BatchedBackwardParams& param, hipStream_t stream) {
-  BOOL_SWITCH(param.has_attn_bias, HAS_ATTN_BIAS, [&] {
-    FMHA_BWD_HEADDIM_SWITCH(param.K, param.Kv, MaxK, [&] {
-      if (param.custom_mask_type == 0)
-        run_batched_backward_causalmask_attnbias_dispatched<
-            ck::half_t,
-            false,
-            HAS_ATTN_BIAS,
-            MaxK>(param, stream);
-      else if (param.custom_mask_type == 1 || param.custom_mask_type == 2)
-        run_batched_backward_causalmask_attnbias_dispatched<
-            ck::half_t,
-            true,
-            HAS_ATTN_BIAS,
-            MaxK>(param, stream);
-      else
-        throw std::runtime_error("Invalid custom_mask_type value");
-    });
-  });
+  BOOL_SWITCH_2(
+      param.has_attn_bias,
+      HAS_ATTN_BIAS,
+      param.bias_has_grad,
+      HAS_BIAS_GRAD,
+      [&] {
+        if constexpr (HAS_ATTN_BIAS || !HAS_BIAS_GRAD) {
+          FMHA_BWD_HEADDIM_SWITCH(param.K, param.Kv, MaxK, [&] {
+            if (param.custom_mask_type == 0)
+              run_batched_backward_causalmask_attnbias_dispatched<
+                  ck::half_t,
+                  false,
+                  HAS_ATTN_BIAS,
+                  HAS_BIAS_GRAD,
+                  MaxK>(param, stream);
+            else if (param.custom_mask_type == 1 || param.custom_mask_type == 2)
+              run_batched_backward_causalmask_attnbias_dispatched<
+                  ck::half_t,
+                  true,
+                  HAS_ATTN_BIAS,
+                  HAS_BIAS_GRAD,
+                  MaxK>(param, stream);
+            else
+              throw std::runtime_error("Invalid custom_mask_type value");
+          });
+        } else
+          throw std::runtime_error(
+              "bias_has_grad should be false when has_attn_bias is false!");
+      });
 };
diff --git a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_forward.h b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_forward.h
index 251ee9fbf..6a0ef0a43 100644
--- a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_forward.h
+++ b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_forward.h
@@ -99,6 +99,7 @@ struct batched_forward_causalmask_attnbias_dispatched {
                 kPadHeadDim, // kPadHeadDimQ
                 kPadHeadDim, // kPadHeadDimV
                 has_attn_bias,
+                false, // kHasBiasGrad place-holder
                 true, // kStoreLSE
                 kHasDropout,
                 occupancy>;
diff --git a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_infer.h b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_infer.h
index 6e448fd3f..107f2628e 100644
--- a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_infer.h
+++ b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_infer.h
@@ -99,6 +99,7 @@ struct batched_infer_causalmask_attnbias_dispatched {
                 kPadHeadDim, // kPadHeadDimQ,
                 kPadHeadDim, // kPadHeadDimV,
                 has_attn_bias,
+                false, // kHasBiasGrad place-holder
                 false, // kStoreLSE
                 kHasDropout,
                 occupancy>;
diff --git a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_backward.h b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_backward.h
index c44440485..278053038 100644
--- a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_backward.h
+++ b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_backward.h
@@ -33,6 +33,7 @@ template <
     typename scalar_t,
     bool has_causal_mask,
     bool has_attn_bias,
+    bool has_bias_grad,
     ck::index_t MaxK>
 struct grouped_backward_causalmask_attnbias_dispatched {
   using FmhaBwdEpilogue_ = FmhaBwdEpilogue<FmhaBwdEpilogueProblem<
@@ -134,6 +135,7 @@ struct grouped_backward_causalmask_attnbias_dispatched {
               kPadHeadDim, // kPadHeadDimQ,
               kPadHeadDim, // kPadHeadDimV,
               has_attn_bias,
+              has_bias_grad,
               false, // kStoreLSE
               kHasDropout,
               occupancy>;
@@ -267,6 +269,7 @@ template <
     typename scalar_t,
     bool has_causal_mask,
     bool has_attn_bias,
+    bool has_bias_grad,
     ck::index_t MaxK>
 void run_grouped_backward_causalmask_attnbias_dispatched(
     GroupedBackwardParams& param,
@@ -275,5 +278,6 @@ void run_grouped_backward_causalmask_attnbias_dispatched(
       scalar_t,
       has_causal_mask,
       has_attn_bias,
+      has_bias_grad,
       MaxK>::Run(param, stream);
 };
diff --git a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_backward_bp16.cpp b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_backward_bp16.cpp
index 0553bbcb1..10337fcd2 100644
--- a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_backward_bp16.cpp
+++ b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_backward_bp16.cpp
@@ -13,51 +13,74 @@
 #include "ck_tiled_headdim_switch.h"
 
 // clang-format off
-extern template void run_grouped_backward_causalmask_attnbias_dispatched<ck::bhalf_t, false, true, 32>(
+extern template void run_grouped_backward_causalmask_attnbias_dispatched<ck::bhalf_t, false, true, true, 32>(
     GroupedBackwardParams& param, hipStream_t stream);
-extern template void run_grouped_backward_causalmask_attnbias_dispatched<ck::bhalf_t, false, false, 32>(
+extern template void run_grouped_backward_causalmask_attnbias_dispatched<ck::bhalf_t, false, true, false, 32>(
     GroupedBackwardParams& param, hipStream_t stream);
-extern template void run_grouped_backward_causalmask_attnbias_dispatched<ck::bhalf_t, true, true, 32>(
+extern template void run_grouped_backward_causalmask_attnbias_dispatched<ck::bhalf_t, false, false, false, 32>(
     GroupedBackwardParams& param, hipStream_t stream);
-extern template void run_grouped_backward_causalmask_attnbias_dispatched<ck::bhalf_t, true, false, 32>(
+extern template void run_grouped_backward_causalmask_attnbias_dispatched<ck::bhalf_t, true, true, true, 32>(
+    GroupedBackwardParams& param, hipStream_t stream);
+extern template void run_grouped_backward_causalmask_attnbias_dispatched<ck::bhalf_t, true, true, false, 32>(
+    GroupedBackwardParams& param, hipStream_t stream);
+extern template void run_grouped_backward_causalmask_attnbias_dispatched<ck::bhalf_t, true, false, false, 32>(
     GroupedBackwardParams& param, hipStream_t stream);
 
-extern template void run_grouped_backward_causalmask_attnbias_dispatched<ck::bhalf_t, false, true, 64>(
+extern template void run_grouped_backward_causalmask_attnbias_dispatched<ck::bhalf_t, false, true, true, 64>(
+    GroupedBackwardParams& param, hipStream_t stream);
+extern template void run_grouped_backward_causalmask_attnbias_dispatched<ck::bhalf_t, false, true, false, 64>(
     GroupedBackwardParams& param, hipStream_t stream);
-extern template void run_grouped_backward_causalmask_attnbias_dispatched<ck::bhalf_t, false, false, 64>(
+extern template void run_grouped_backward_causalmask_attnbias_dispatched<ck::bhalf_t, false, false, false, 64>(
     GroupedBackwardParams& param, hipStream_t stream);
-extern template void run_grouped_backward_causalmask_attnbias_dispatched<ck::bhalf_t, true, true, 64>(
+extern template void run_grouped_backward_causalmask_attnbias_dispatched<ck::bhalf_t, true, true, true, 64>(
     GroupedBackwardParams& param, hipStream_t stream);
-extern template void run_grouped_backward_causalmask_attnbias_dispatched<ck::bhalf_t, true, false, 64>(
+extern template void run_grouped_backward_causalmask_attnbias_dispatched<ck::bhalf_t, true, true, false, 64>(
+    GroupedBackwardParams& param, hipStream_t stream);
+extern template void run_grouped_backward_causalmask_attnbias_dispatched<ck::bhalf_t, true, false, false, 64>(
     GroupedBackwardParams& param, hipStream_t stream);
 
-extern template void run_grouped_backward_causalmask_attnbias_dispatched<ck::bhalf_t, false, true, 128>(
+extern template void run_grouped_backward_causalmask_attnbias_dispatched<ck::bhalf_t, false, true, true, 128>(
+    GroupedBackwardParams& param, hipStream_t stream);
+extern template void run_grouped_backward_causalmask_attnbias_dispatched<ck::bhalf_t, false, true, false, 128>(
+    GroupedBackwardParams& param, hipStream_t stream);
+extern template void run_grouped_backward_causalmask_attnbias_dispatched<ck::bhalf_t, false, false, false, 128>(
     GroupedBackwardParams& param, hipStream_t stream);
-extern template void run_grouped_backward_causalmask_attnbias_dispatched<ck::bhalf_t, false, false, 128>(
+extern template void run_grouped_backward_causalmask_attnbias_dispatched<ck::bhalf_t, true, true, true, 128>(
     GroupedBackwardParams& param, hipStream_t stream);
-extern template void run_grouped_backward_causalmask_attnbias_dispatched<ck::bhalf_t, true, true, 128>(
+extern template void run_grouped_backward_causalmask_attnbias_dispatched<ck::bhalf_t, true, true, false, 128>(
     GroupedBackwardParams& param, hipStream_t stream);
-extern template void run_grouped_backward_causalmask_attnbias_dispatched<ck::bhalf_t, true, false, 128>(
+extern template void run_grouped_backward_causalmask_attnbias_dispatched<ck::bhalf_t, true, false, false, 128>(
     GroupedBackwardParams& param, hipStream_t stream);
 // clang-format on
 
 void grouped_backward_bp16(GroupedBackwardParams& param, hipStream_t stream) {
-  BOOL_SWITCH(param.has_attn_bias, HAS_ATTN_BIAS, [&] {
-    FMHA_BWD_HEADDIM_SWITCH(param.K, param.Kv, MaxK, [&] {
-      if (param.custom_mask_type == 0)
-        run_grouped_backward_causalmask_attnbias_dispatched<
-            ck::bhalf_t,
-            false,
-            HAS_ATTN_BIAS,
-            MaxK>(param, stream);
-      else if (param.custom_mask_type == 1 || param.custom_mask_type == 2)
-        run_grouped_backward_causalmask_attnbias_dispatched<
-            ck::bhalf_t,
-            true,
-            HAS_ATTN_BIAS,
-            MaxK>(param, stream);
-      else
-        throw std::runtime_error("Invalid custom_mask_type value");
-    });
-  });
+  BOOL_SWITCH_2(
+      param.has_attn_bias,
+      HAS_ATTN_BIAS,
+      param.bias_has_grad,
+      HAS_BIAS_GRAD,
+      [&] {
+        if constexpr (HAS_ATTN_BIAS || !HAS_BIAS_GRAD) {
+          FMHA_BWD_HEADDIM_SWITCH(param.K, param.Kv, MaxK, [&] {
+            if (param.custom_mask_type == 0)
+              run_grouped_backward_causalmask_attnbias_dispatched<
+                  ck::bhalf_t,
+                  false,
+                  HAS_ATTN_BIAS,
+                  HAS_BIAS_GRAD,
+                  MaxK>(param, stream);
+            else if (param.custom_mask_type == 1 || param.custom_mask_type == 2)
+              run_grouped_backward_causalmask_attnbias_dispatched<
+                  ck::bhalf_t,
+                  true,
+                  HAS_ATTN_BIAS,
+                  HAS_BIAS_GRAD,
+                  MaxK>(param, stream);
+            else
+              throw std::runtime_error("Invalid custom_mask_type value");
+          });
+        } else
+          throw std::runtime_error(
+              "bias_has_grad should be false when has_attn_bias is false!");
+      });
 };
diff --git a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_backward_fp16.cpp b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_backward_fp16.cpp
index e4522de89..ef2e0bb8b 100644
--- a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_backward_fp16.cpp
+++ b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_backward_fp16.cpp
@@ -13,51 +13,74 @@
 #include "ck_tiled_headdim_switch.h"
 
 // clang-format off
-extern template void run_grouped_backward_causalmask_attnbias_dispatched<ck::half_t, false, true, 32>(
+extern template void run_grouped_backward_causalmask_attnbias_dispatched<ck::half_t, false, true, true, 32>(
     GroupedBackwardParams& param, hipStream_t stream);
-extern template void run_grouped_backward_causalmask_attnbias_dispatched<ck::half_t, false, false, 32>(
+extern template void run_grouped_backward_causalmask_attnbias_dispatched<ck::half_t, false, true, false, 32>(
     GroupedBackwardParams& param, hipStream_t stream);
-extern template void run_grouped_backward_causalmask_attnbias_dispatched<ck::half_t, true, true, 32>(
+extern template void run_grouped_backward_causalmask_attnbias_dispatched<ck::half_t, false, false, false, 32>(
     GroupedBackwardParams& param, hipStream_t stream);
-extern template void run_grouped_backward_causalmask_attnbias_dispatched<ck::half_t, true, false, 32>(
+extern template void run_grouped_backward_causalmask_attnbias_dispatched<ck::half_t, true, true, true, 32>(
+    GroupedBackwardParams& param, hipStream_t stream);
+extern template void run_grouped_backward_causalmask_attnbias_dispatched<ck::half_t, true, true, false, 32>(
+    GroupedBackwardParams& param, hipStream_t stream);
+extern template void run_grouped_backward_causalmask_attnbias_dispatched<ck::half_t, true, false, false, 32>(
     GroupedBackwardParams& param, hipStream_t stream);
 
-extern template void run_grouped_backward_causalmask_attnbias_dispatched<ck::half_t, false, true, 64>(
+extern template void run_grouped_backward_causalmask_attnbias_dispatched<ck::half_t, false, true, true, 64>(
+    GroupedBackwardParams& param, hipStream_t stream);
+extern template void run_grouped_backward_causalmask_attnbias_dispatched<ck::half_t, false, true, false, 64>(
     GroupedBackwardParams& param, hipStream_t stream);
-extern template void run_grouped_backward_causalmask_attnbias_dispatched<ck::half_t, false, false, 64>(
+extern template void run_grouped_backward_causalmask_attnbias_dispatched<ck::half_t, false, false, false, 64>(
     GroupedBackwardParams& param, hipStream_t stream);
-extern template void run_grouped_backward_causalmask_attnbias_dispatched<ck::half_t, true, true, 64>(
+extern template void run_grouped_backward_causalmask_attnbias_dispatched<ck::half_t, true, true, true, 64>(
     GroupedBackwardParams& param, hipStream_t stream);
-extern template void run_grouped_backward_causalmask_attnbias_dispatched<ck::half_t, true, false, 64>(
+extern template void run_grouped_backward_causalmask_attnbias_dispatched<ck::half_t, true, true, false, 64>(
+    GroupedBackwardParams& param, hipStream_t stream);
+extern template void run_grouped_backward_causalmask_attnbias_dispatched<ck::half_t, true, false, false, 64>(
     GroupedBackwardParams& param, hipStream_t stream);
 
-extern template void run_grouped_backward_causalmask_attnbias_dispatched<ck::half_t, false, true, 128>(
+extern template void run_grouped_backward_causalmask_attnbias_dispatched<ck::half_t, false, true, true, 128>(
+    GroupedBackwardParams& param, hipStream_t stream);
+extern template void run_grouped_backward_causalmask_attnbias_dispatched<ck::half_t, false, true, false, 128>(
+    GroupedBackwardParams& param, hipStream_t stream);
+extern template void run_grouped_backward_causalmask_attnbias_dispatched<ck::half_t, false, false, false, 128>(
     GroupedBackwardParams& param, hipStream_t stream);
-extern template void run_grouped_backward_causalmask_attnbias_dispatched<ck::half_t, false, false, 128>(
+extern template void run_grouped_backward_causalmask_attnbias_dispatched<ck::half_t, true, true, true, 128>(
     GroupedBackwardParams& param, hipStream_t stream);
-extern template void run_grouped_backward_causalmask_attnbias_dispatched<ck::half_t, true, true, 128>(
+extern template void run_grouped_backward_causalmask_attnbias_dispatched<ck::half_t, true, true, false, 128>(
     GroupedBackwardParams& param, hipStream_t stream);
-extern template void run_grouped_backward_causalmask_attnbias_dispatched<ck::half_t, true, false, 128>(
+extern template void run_grouped_backward_causalmask_attnbias_dispatched<ck::half_t, true, false, false, 128>(
     GroupedBackwardParams& param, hipStream_t stream);
 // clang-format on
 
 void grouped_backward_fp16(GroupedBackwardParams& param, hipStream_t stream) {
-  BOOL_SWITCH(param.has_attn_bias, HAS_ATTN_BIAS, [&] {
-    FMHA_BWD_HEADDIM_SWITCH(param.K, param.Kv, MaxK, [&] {
-      if (param.custom_mask_type == 0)
-        run_grouped_backward_causalmask_attnbias_dispatched<
-            ck::half_t,
-            false,
-            HAS_ATTN_BIAS,
-            MaxK>(param, stream);
-      else if (param.custom_mask_type == 1 || param.custom_mask_type == 2)
-        run_grouped_backward_causalmask_attnbias_dispatched<
-            ck::half_t,
-            true,
-            HAS_ATTN_BIAS,
-            MaxK>(param, stream);
-      else
-        throw std::runtime_error("Invalid custom_mask_type value");
-    });
-  });
+  BOOL_SWITCH_2(
+      param.has_attn_bias,
+      HAS_ATTN_BIAS,
+      param.bias_has_grad,
+      HAS_BIAS_GRAD,
+      [&] {
+        if constexpr (HAS_ATTN_BIAS || !HAS_BIAS_GRAD) {
+          FMHA_BWD_HEADDIM_SWITCH(param.K, param.Kv, MaxK, [&] {
+            if (param.custom_mask_type == 0)
+              run_grouped_backward_causalmask_attnbias_dispatched<
+                  ck::half_t,
+                  false,
+                  HAS_ATTN_BIAS,
+                  HAS_BIAS_GRAD,
+                  MaxK>(param, stream);
+            else if (param.custom_mask_type == 1 || param.custom_mask_type == 2)
+              run_grouped_backward_causalmask_attnbias_dispatched<
+                  ck::half_t,
+                  true,
+                  HAS_ATTN_BIAS,
+                  HAS_BIAS_GRAD,
+                  MaxK>(param, stream);
+            else
+              throw std::runtime_error("Invalid custom_mask_type value");
+          });
+        } else
+          throw std::runtime_error(
+              "bias_has_grad should be false when has_attn_bias is false!");
+      });
 };
diff --git a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_forward.h b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_forward.h
index d50e18431..360c9c9c1 100644
--- a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_forward.h
+++ b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_forward.h
@@ -89,6 +89,7 @@ struct grouped_forward_causalmask_attnbias_dispatched {
                 kPadHeadDimQ,
                 kPadHeadDimV,
                 has_attn_bias,
+                false, // kHasBiasGrad place-holder
                 true, // kStoreLSE
                 kHasDropout,
                 occupancy>;
diff --git a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_infer.h b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_infer.h
index b710d464c..347f0de16 100644
--- a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_infer.h
+++ b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_infer.h
@@ -89,6 +89,7 @@ struct grouped_infer_causalmask_attnbias_dispatched {
                 kPadHeadDimQ,
                 kPadHeadDimV,
                 has_attn_bias,
+                false, // kHasBiasGrad place-holder
                 false, // kStoreLSE
                 kHasDropout,
                 occupancy>;
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_backward_bp16_no_causalmask_no_attnbias_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_backward_bp16_no_causalmask_no_attnbias_maxk_128.cpp
index 67c5b042f..fd19dba04 100644
--- a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_backward_bp16_no_causalmask_no_attnbias_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_backward_bp16_no_causalmask_no_attnbias_maxk_128.cpp
@@ -12,4 +12,5 @@ template void run_batched_backward_causalmask_attnbias_dispatched<
     ck::bhalf_t,
     false,
     false,
+    false,
     128>(BatchedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_backward_bp16_no_causalmask_no_attnbias_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_backward_bp16_no_causalmask_no_attnbias_maxk_32.cpp
index 7842cc14e..2abde7a13 100644
--- a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_backward_bp16_no_causalmask_no_attnbias_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_backward_bp16_no_causalmask_no_attnbias_maxk_32.cpp
@@ -12,4 +12,5 @@ template void run_batched_backward_causalmask_attnbias_dispatched<
     ck::bhalf_t,
     false,
     false,
+    false,
     32>(BatchedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_backward_bp16_no_causalmask_no_attnbias_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_backward_bp16_no_causalmask_no_attnbias_maxk_64.cpp
index f357331c7..392e0df61 100644
--- a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_backward_bp16_no_causalmask_no_attnbias_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_backward_bp16_no_causalmask_no_attnbias_maxk_64.cpp
@@ -12,4 +12,5 @@ template void run_batched_backward_causalmask_attnbias_dispatched<
     ck::bhalf_t,
     false,
     false,
+    false,
     64>(BatchedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_backward_bp16_no_causalmask_with_attnbias_no_biasgrad_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_backward_bp16_no_causalmask_with_attnbias_no_biasgrad_maxk_128.cpp
new file mode 100644
index 000000000..2dc4036ab
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_backward_bp16_no_causalmask_with_attnbias_no_biasgrad_maxk_128.cpp
@@ -0,0 +1,16 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_batched_backward.h"
+
+template void run_batched_backward_causalmask_attnbias_dispatched<
+    ck::bhalf_t,
+    false,
+    true,
+    false,
+    128>(BatchedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_backward_bp16_no_causalmask_with_attnbias_no_biasgrad_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_backward_bp16_no_causalmask_with_attnbias_no_biasgrad_maxk_32.cpp
new file mode 100644
index 000000000..b634ec861
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_backward_bp16_no_causalmask_with_attnbias_no_biasgrad_maxk_32.cpp
@@ -0,0 +1,16 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_batched_backward.h"
+
+template void run_batched_backward_causalmask_attnbias_dispatched<
+    ck::bhalf_t,
+    false,
+    true,
+    false,
+    32>(BatchedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_backward_bp16_no_causalmask_with_attnbias_no_biasgrad_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_backward_bp16_no_causalmask_with_attnbias_no_biasgrad_maxk_64.cpp
new file mode 100644
index 000000000..572667e05
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_backward_bp16_no_causalmask_with_attnbias_no_biasgrad_maxk_64.cpp
@@ -0,0 +1,16 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_batched_backward.h"
+
+template void run_batched_backward_causalmask_attnbias_dispatched<
+    ck::bhalf_t,
+    false,
+    true,
+    false,
+    64>(BatchedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_backward_bp16_no_causalmask_with_attnbias_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_backward_bp16_no_causalmask_with_attnbias_with_biasgrad_maxk_128.cpp
similarity index 97%
rename from xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_backward_bp16_no_causalmask_with_attnbias_maxk_128.cpp
rename to xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_backward_bp16_no_causalmask_with_attnbias_with_biasgrad_maxk_128.cpp
index ae87f436d..410a00133 100644
--- a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_backward_bp16_no_causalmask_with_attnbias_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_backward_bp16_no_causalmask_with_attnbias_with_biasgrad_maxk_128.cpp
@@ -12,4 +12,5 @@ template void run_batched_backward_causalmask_attnbias_dispatched<
     ck::bhalf_t,
     false,
     true,
+    true,
     128>(BatchedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_backward_bp16_no_causalmask_with_attnbias_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_backward_bp16_no_causalmask_with_attnbias_with_biasgrad_maxk_32.cpp
similarity index 97%
rename from xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_backward_bp16_no_causalmask_with_attnbias_maxk_32.cpp
rename to xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_backward_bp16_no_causalmask_with_attnbias_with_biasgrad_maxk_32.cpp
index 27b50a8a6..0eb83776e 100644
--- a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_backward_bp16_no_causalmask_with_attnbias_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_backward_bp16_no_causalmask_with_attnbias_with_biasgrad_maxk_32.cpp
@@ -12,4 +12,5 @@ template void run_batched_backward_causalmask_attnbias_dispatched<
     ck::bhalf_t,
     false,
     true,
+    true,
     32>(BatchedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_backward_bp16_no_causalmask_with_attnbias_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_backward_bp16_no_causalmask_with_attnbias_with_biasgrad_maxk_64.cpp
similarity index 97%
rename from xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_backward_bp16_no_causalmask_with_attnbias_maxk_64.cpp
rename to xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_backward_bp16_no_causalmask_with_attnbias_with_biasgrad_maxk_64.cpp
index c0944682c..30a9d3e06 100644
--- a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_backward_bp16_no_causalmask_with_attnbias_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_backward_bp16_no_causalmask_with_attnbias_with_biasgrad_maxk_64.cpp
@@ -12,4 +12,5 @@ template void run_batched_backward_causalmask_attnbias_dispatched<
     ck::bhalf_t,
     false,
     true,
+    true,
     64>(BatchedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_backward_bp16_with_causalmask_no_attnbias_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_backward_bp16_with_causalmask_no_attnbias_maxk_128.cpp
index 3329e61b6..390c057a2 100644
--- a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_backward_bp16_with_causalmask_no_attnbias_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_backward_bp16_with_causalmask_no_attnbias_maxk_128.cpp
@@ -12,4 +12,5 @@ template void run_batched_backward_causalmask_attnbias_dispatched<
     ck::bhalf_t,
     true,
     false,
+    false,
     128>(BatchedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_backward_bp16_with_causalmask_no_attnbias_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_backward_bp16_with_causalmask_no_attnbias_maxk_32.cpp
index 2affa3ff9..6d9e8db05 100644
--- a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_backward_bp16_with_causalmask_no_attnbias_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_backward_bp16_with_causalmask_no_attnbias_maxk_32.cpp
@@ -12,4 +12,5 @@ template void run_batched_backward_causalmask_attnbias_dispatched<
     ck::bhalf_t,
     true,
     false,
+    false,
     32>(BatchedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_backward_bp16_with_causalmask_no_attnbias_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_backward_bp16_with_causalmask_no_attnbias_maxk_64.cpp
index 7b3c001fe..f37923f72 100644
--- a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_backward_bp16_with_causalmask_no_attnbias_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_backward_bp16_with_causalmask_no_attnbias_maxk_64.cpp
@@ -12,4 +12,5 @@ template void run_batched_backward_causalmask_attnbias_dispatched<
     ck::bhalf_t,
     true,
     false,
+    false,
     64>(BatchedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_backward_bp16_with_causalmask_with_attnbias_no_biasgrad_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_backward_bp16_with_causalmask_with_attnbias_no_biasgrad_maxk_128.cpp
new file mode 100644
index 000000000..4154b0e51
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_backward_bp16_with_causalmask_with_attnbias_no_biasgrad_maxk_128.cpp
@@ -0,0 +1,16 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_batched_backward.h"
+
+template void run_batched_backward_causalmask_attnbias_dispatched<
+    ck::bhalf_t,
+    true,
+    true,
+    false,
+    128>(BatchedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_backward_bp16_with_causalmask_with_attnbias_no_biasgrad_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_backward_bp16_with_causalmask_with_attnbias_no_biasgrad_maxk_32.cpp
new file mode 100644
index 000000000..c6ef4a6ad
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_backward_bp16_with_causalmask_with_attnbias_no_biasgrad_maxk_32.cpp
@@ -0,0 +1,16 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_batched_backward.h"
+
+template void run_batched_backward_causalmask_attnbias_dispatched<
+    ck::bhalf_t,
+    true,
+    true,
+    false,
+    32>(BatchedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_backward_bp16_with_causalmask_with_attnbias_no_biasgrad_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_backward_bp16_with_causalmask_with_attnbias_no_biasgrad_maxk_64.cpp
new file mode 100644
index 000000000..5ea0440a9
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_backward_bp16_with_causalmask_with_attnbias_no_biasgrad_maxk_64.cpp
@@ -0,0 +1,16 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_batched_backward.h"
+
+template void run_batched_backward_causalmask_attnbias_dispatched<
+    ck::bhalf_t,
+    true,
+    true,
+    false,
+    64>(BatchedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_backward_bp16_with_causalmask_with_attnbias_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_backward_bp16_with_causalmask_with_attnbias_with_biasgrad_maxk_128.cpp
similarity index 97%
rename from xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_backward_bp16_with_causalmask_with_attnbias_maxk_128.cpp
rename to xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_backward_bp16_with_causalmask_with_attnbias_with_biasgrad_maxk_128.cpp
index 15b46c6e9..23dcdbd74 100644
--- a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_backward_bp16_with_causalmask_with_attnbias_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_backward_bp16_with_causalmask_with_attnbias_with_biasgrad_maxk_128.cpp
@@ -12,4 +12,5 @@ template void run_batched_backward_causalmask_attnbias_dispatched<
     ck::bhalf_t,
     true,
     true,
+    true,
     128>(BatchedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_backward_bp16_with_causalmask_with_attnbias_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_backward_bp16_with_causalmask_with_attnbias_with_biasgrad_maxk_32.cpp
similarity index 97%
rename from xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_backward_bp16_with_causalmask_with_attnbias_maxk_32.cpp
rename to xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_backward_bp16_with_causalmask_with_attnbias_with_biasgrad_maxk_32.cpp
index 29cb04307..cea2dc49f 100644
--- a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_backward_bp16_with_causalmask_with_attnbias_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_backward_bp16_with_causalmask_with_attnbias_with_biasgrad_maxk_32.cpp
@@ -12,4 +12,5 @@ template void run_batched_backward_causalmask_attnbias_dispatched<
     ck::bhalf_t,
     true,
     true,
+    true,
     32>(BatchedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_backward_bp16_with_causalmask_with_attnbias_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_backward_bp16_with_causalmask_with_attnbias_with_biasgrad_maxk_64.cpp
similarity index 97%
rename from xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_backward_bp16_with_causalmask_with_attnbias_maxk_64.cpp
rename to xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_backward_bp16_with_causalmask_with_attnbias_with_biasgrad_maxk_64.cpp
index 9c28e4a53..ebf213e77 100644
--- a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_backward_bp16_with_causalmask_with_attnbias_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_backward_bp16_with_causalmask_with_attnbias_with_biasgrad_maxk_64.cpp
@@ -12,4 +12,5 @@ template void run_batched_backward_causalmask_attnbias_dispatched<
     ck::bhalf_t,
     true,
     true,
+    true,
     64>(BatchedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_backward_fp16_no_causalmask_no_attnbias_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_backward_fp16_no_causalmask_no_attnbias_maxk_128.cpp
index 24a39ad28..ad1018234 100644
--- a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_backward_fp16_no_causalmask_no_attnbias_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_backward_fp16_no_causalmask_no_attnbias_maxk_128.cpp
@@ -12,4 +12,5 @@ template void run_batched_backward_causalmask_attnbias_dispatched<
     ck::half_t,
     false,
     false,
+    false,
     128>(BatchedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_backward_fp16_no_causalmask_no_attnbias_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_backward_fp16_no_causalmask_no_attnbias_maxk_32.cpp
index ebf7765ac..ed71783b8 100644
--- a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_backward_fp16_no_causalmask_no_attnbias_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_backward_fp16_no_causalmask_no_attnbias_maxk_32.cpp
@@ -12,4 +12,5 @@ template void run_batched_backward_causalmask_attnbias_dispatched<
     ck::half_t,
     false,
     false,
+    false,
     32>(BatchedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_backward_fp16_no_causalmask_no_attnbias_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_backward_fp16_no_causalmask_no_attnbias_maxk_64.cpp
index 03418ee58..35bb6ac5f 100644
--- a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_backward_fp16_no_causalmask_no_attnbias_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_backward_fp16_no_causalmask_no_attnbias_maxk_64.cpp
@@ -12,4 +12,5 @@ template void run_batched_backward_causalmask_attnbias_dispatched<
     ck::half_t,
     false,
     false,
+    false,
     64>(BatchedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_backward_fp16_no_causalmask_with_attnbias_no_biasgrad_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_backward_fp16_no_causalmask_with_attnbias_no_biasgrad_maxk_128.cpp
new file mode 100644
index 000000000..0d8369353
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_backward_fp16_no_causalmask_with_attnbias_no_biasgrad_maxk_128.cpp
@@ -0,0 +1,16 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_batched_backward.h"
+
+template void run_batched_backward_causalmask_attnbias_dispatched<
+    ck::half_t,
+    false,
+    true,
+    false,
+    128>(BatchedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_backward_fp16_no_causalmask_with_attnbias_no_biasgrad_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_backward_fp16_no_causalmask_with_attnbias_no_biasgrad_maxk_32.cpp
new file mode 100644
index 000000000..043d4357c
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_backward_fp16_no_causalmask_with_attnbias_no_biasgrad_maxk_32.cpp
@@ -0,0 +1,16 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_batched_backward.h"
+
+template void run_batched_backward_causalmask_attnbias_dispatched<
+    ck::half_t,
+    false,
+    true,
+    false,
+    32>(BatchedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_backward_fp16_no_causalmask_with_attnbias_no_biasgrad_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_backward_fp16_no_causalmask_with_attnbias_no_biasgrad_maxk_64.cpp
new file mode 100644
index 000000000..48013f08d
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_backward_fp16_no_causalmask_with_attnbias_no_biasgrad_maxk_64.cpp
@@ -0,0 +1,16 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_batched_backward.h"
+
+template void run_batched_backward_causalmask_attnbias_dispatched<
+    ck::half_t,
+    false,
+    true,
+    false,
+    64>(BatchedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_backward_fp16_no_causalmask_with_attnbias_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_backward_fp16_no_causalmask_with_attnbias_with_biasgrad_maxk_128.cpp
similarity index 97%
rename from xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_backward_fp16_no_causalmask_with_attnbias_maxk_128.cpp
rename to xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_backward_fp16_no_causalmask_with_attnbias_with_biasgrad_maxk_128.cpp
index 315950620..d6e30d22a 100644
--- a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_backward_fp16_no_causalmask_with_attnbias_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_backward_fp16_no_causalmask_with_attnbias_with_biasgrad_maxk_128.cpp
@@ -12,4 +12,5 @@ template void run_batched_backward_causalmask_attnbias_dispatched<
     ck::half_t,
     false,
     true,
+    true,
     128>(BatchedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_backward_fp16_no_causalmask_with_attnbias_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_backward_fp16_no_causalmask_with_attnbias_with_biasgrad_maxk_32.cpp
similarity index 97%
rename from xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_backward_fp16_no_causalmask_with_attnbias_maxk_32.cpp
rename to xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_backward_fp16_no_causalmask_with_attnbias_with_biasgrad_maxk_32.cpp
index 1ddf23a3b..f46573924 100644
--- a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_backward_fp16_no_causalmask_with_attnbias_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_backward_fp16_no_causalmask_with_attnbias_with_biasgrad_maxk_32.cpp
@@ -12,4 +12,5 @@ template void run_batched_backward_causalmask_attnbias_dispatched<
     ck::half_t,
     false,
     true,
+    true,
     32>(BatchedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_backward_fp16_no_causalmask_with_attnbias_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_backward_fp16_no_causalmask_with_attnbias_with_biasgrad_maxk_64.cpp
similarity index 97%
rename from xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_backward_fp16_no_causalmask_with_attnbias_maxk_64.cpp
rename to xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_backward_fp16_no_causalmask_with_attnbias_with_biasgrad_maxk_64.cpp
index 4f09b8fe1..fc7974038 100644
--- a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_backward_fp16_no_causalmask_with_attnbias_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_backward_fp16_no_causalmask_with_attnbias_with_biasgrad_maxk_64.cpp
@@ -12,4 +12,5 @@ template void run_batched_backward_causalmask_attnbias_dispatched<
     ck::half_t,
     false,
     true,
+    true,
     64>(BatchedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_backward_fp16_with_causalmask_no_attnbias_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_backward_fp16_with_causalmask_no_attnbias_maxk_128.cpp
index 89066e511..b2b0d96f9 100644
--- a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_backward_fp16_with_causalmask_no_attnbias_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_backward_fp16_with_causalmask_no_attnbias_maxk_128.cpp
@@ -12,4 +12,5 @@ template void run_batched_backward_causalmask_attnbias_dispatched<
     ck::half_t,
     true,
     false,
+    false,
     128>(BatchedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_backward_fp16_with_causalmask_no_attnbias_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_backward_fp16_with_causalmask_no_attnbias_maxk_32.cpp
index bc7c12971..4b63b34e4 100644
--- a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_backward_fp16_with_causalmask_no_attnbias_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_backward_fp16_with_causalmask_no_attnbias_maxk_32.cpp
@@ -12,4 +12,5 @@ template void run_batched_backward_causalmask_attnbias_dispatched<
     ck::half_t,
     true,
     false,
+    false,
     32>(BatchedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_backward_fp16_with_causalmask_no_attnbias_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_backward_fp16_with_causalmask_no_attnbias_maxk_64.cpp
index d53fa0dbe..c7e2c84b3 100644
--- a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_backward_fp16_with_causalmask_no_attnbias_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_backward_fp16_with_causalmask_no_attnbias_maxk_64.cpp
@@ -12,4 +12,5 @@ template void run_batched_backward_causalmask_attnbias_dispatched<
     ck::half_t,
     true,
     false,
+    false,
     64>(BatchedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_backward_fp16_with_causalmask_with_attnbias_no_biasgrad_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_backward_fp16_with_causalmask_with_attnbias_no_biasgrad_maxk_128.cpp
new file mode 100644
index 000000000..b611084db
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_backward_fp16_with_causalmask_with_attnbias_no_biasgrad_maxk_128.cpp
@@ -0,0 +1,16 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_batched_backward.h"
+
+template void run_batched_backward_causalmask_attnbias_dispatched<
+    ck::half_t,
+    true,
+    true,
+    false,
+    128>(BatchedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_backward_fp16_with_causalmask_with_attnbias_no_biasgrad_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_backward_fp16_with_causalmask_with_attnbias_no_biasgrad_maxk_32.cpp
new file mode 100644
index 000000000..a0156e2c4
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_backward_fp16_with_causalmask_with_attnbias_no_biasgrad_maxk_32.cpp
@@ -0,0 +1,16 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_batched_backward.h"
+
+template void run_batched_backward_causalmask_attnbias_dispatched<
+    ck::half_t,
+    true,
+    true,
+    false,
+    32>(BatchedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_backward_fp16_with_causalmask_with_attnbias_no_biasgrad_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_backward_fp16_with_causalmask_with_attnbias_no_biasgrad_maxk_64.cpp
new file mode 100644
index 000000000..2685736f4
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_backward_fp16_with_causalmask_with_attnbias_no_biasgrad_maxk_64.cpp
@@ -0,0 +1,16 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_batched_backward.h"
+
+template void run_batched_backward_causalmask_attnbias_dispatched<
+    ck::half_t,
+    true,
+    true,
+    false,
+    64>(BatchedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_backward_fp16_with_causalmask_with_attnbias_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_backward_fp16_with_causalmask_with_attnbias_with_biasgrad_maxk_128.cpp
similarity index 97%
rename from xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_backward_fp16_with_causalmask_with_attnbias_maxk_128.cpp
rename to xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_backward_fp16_with_causalmask_with_attnbias_with_biasgrad_maxk_128.cpp
index 8d2535cfb..3d03144e1 100644
--- a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_backward_fp16_with_causalmask_with_attnbias_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_backward_fp16_with_causalmask_with_attnbias_with_biasgrad_maxk_128.cpp
@@ -12,4 +12,5 @@ template void run_batched_backward_causalmask_attnbias_dispatched<
     ck::half_t,
     true,
     true,
+    true,
     128>(BatchedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_backward_fp16_with_causalmask_with_attnbias_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_backward_fp16_with_causalmask_with_attnbias_with_biasgrad_maxk_32.cpp
similarity index 97%
rename from xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_backward_fp16_with_causalmask_with_attnbias_maxk_32.cpp
rename to xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_backward_fp16_with_causalmask_with_attnbias_with_biasgrad_maxk_32.cpp
index 3754898df..130922e0d 100644
--- a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_backward_fp16_with_causalmask_with_attnbias_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_backward_fp16_with_causalmask_with_attnbias_with_biasgrad_maxk_32.cpp
@@ -12,4 +12,5 @@ template void run_batched_backward_causalmask_attnbias_dispatched<
     ck::half_t,
     true,
     true,
+    true,
     32>(BatchedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_backward_fp16_with_causalmask_with_attnbias_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_backward_fp16_with_causalmask_with_attnbias_with_biasgrad_maxk_64.cpp
similarity index 97%
rename from xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_backward_fp16_with_causalmask_with_attnbias_maxk_64.cpp
rename to xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_backward_fp16_with_causalmask_with_attnbias_with_biasgrad_maxk_64.cpp
index 991a285c9..974fe1752 100644
--- a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_backward_fp16_with_causalmask_with_attnbias_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_backward_fp16_with_causalmask_with_attnbias_with_biasgrad_maxk_64.cpp
@@ -12,4 +12,5 @@ template void run_batched_backward_causalmask_attnbias_dispatched<
     ck::half_t,
     true,
     true,
+    true,
     64>(BatchedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_backward_bp16_no_causalmask_no_attnbias_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_backward_bp16_no_causalmask_no_attnbias_maxk_128.cpp
index 343cbfcba..7e92e2be5 100644
--- a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_backward_bp16_no_causalmask_no_attnbias_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_backward_bp16_no_causalmask_no_attnbias_maxk_128.cpp
@@ -12,4 +12,5 @@ template void run_grouped_backward_causalmask_attnbias_dispatched<
     ck::bhalf_t,
     false,
     false,
+    false,
     128>(GroupedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_backward_bp16_no_causalmask_no_attnbias_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_backward_bp16_no_causalmask_no_attnbias_maxk_32.cpp
index 484edc279..27e119c5c 100644
--- a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_backward_bp16_no_causalmask_no_attnbias_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_backward_bp16_no_causalmask_no_attnbias_maxk_32.cpp
@@ -12,4 +12,5 @@ template void run_grouped_backward_causalmask_attnbias_dispatched<
     ck::bhalf_t,
     false,
     false,
+    false,
     32>(GroupedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_backward_bp16_no_causalmask_no_attnbias_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_backward_bp16_no_causalmask_no_attnbias_maxk_64.cpp
index 5e1a6bba0..b2149eafb 100644
--- a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_backward_bp16_no_causalmask_no_attnbias_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_backward_bp16_no_causalmask_no_attnbias_maxk_64.cpp
@@ -12,4 +12,5 @@ template void run_grouped_backward_causalmask_attnbias_dispatched<
     ck::bhalf_t,
     false,
     false,
+    false,
     64>(GroupedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_backward_bp16_no_causalmask_with_attnbias_no_biasgrad_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_backward_bp16_no_causalmask_with_attnbias_no_biasgrad_maxk_128.cpp
new file mode 100644
index 000000000..a703e7b1b
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_backward_bp16_no_causalmask_with_attnbias_no_biasgrad_maxk_128.cpp
@@ -0,0 +1,16 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_grouped_backward.h"
+
+template void run_grouped_backward_causalmask_attnbias_dispatched<
+    ck::bhalf_t,
+    false,
+    true,
+    false,
+    128>(GroupedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_backward_bp16_no_causalmask_with_attnbias_no_biasgrad_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_backward_bp16_no_causalmask_with_attnbias_no_biasgrad_maxk_32.cpp
new file mode 100644
index 000000000..a57d05f37
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_backward_bp16_no_causalmask_with_attnbias_no_biasgrad_maxk_32.cpp
@@ -0,0 +1,16 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_grouped_backward.h"
+
+template void run_grouped_backward_causalmask_attnbias_dispatched<
+    ck::bhalf_t,
+    false,
+    true,
+    false,
+    32>(GroupedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_backward_bp16_no_causalmask_with_attnbias_no_biasgrad_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_backward_bp16_no_causalmask_with_attnbias_no_biasgrad_maxk_64.cpp
new file mode 100644
index 000000000..4dd74235e
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_backward_bp16_no_causalmask_with_attnbias_no_biasgrad_maxk_64.cpp
@@ -0,0 +1,16 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_grouped_backward.h"
+
+template void run_grouped_backward_causalmask_attnbias_dispatched<
+    ck::bhalf_t,
+    false,
+    true,
+    false,
+    64>(GroupedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_backward_bp16_no_causalmask_with_attnbias_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_backward_bp16_no_causalmask_with_attnbias_with_biasgrad_maxk_128.cpp
similarity index 97%
rename from xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_backward_bp16_no_causalmask_with_attnbias_maxk_128.cpp
rename to xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_backward_bp16_no_causalmask_with_attnbias_with_biasgrad_maxk_128.cpp
index 9e93e28ea..9ab625aed 100644
--- a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_backward_bp16_no_causalmask_with_attnbias_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_backward_bp16_no_causalmask_with_attnbias_with_biasgrad_maxk_128.cpp
@@ -12,4 +12,5 @@ template void run_grouped_backward_causalmask_attnbias_dispatched<
     ck::bhalf_t,
     false,
     true,
+    true,
     128>(GroupedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_backward_bp16_no_causalmask_with_attnbias_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_backward_bp16_no_causalmask_with_attnbias_with_biasgrad_maxk_32.cpp
similarity index 97%
rename from xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_backward_bp16_no_causalmask_with_attnbias_maxk_32.cpp
rename to xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_backward_bp16_no_causalmask_with_attnbias_with_biasgrad_maxk_32.cpp
index 84d0377ed..a8a3c66fd 100644
--- a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_backward_bp16_no_causalmask_with_attnbias_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_backward_bp16_no_causalmask_with_attnbias_with_biasgrad_maxk_32.cpp
@@ -12,4 +12,5 @@ template void run_grouped_backward_causalmask_attnbias_dispatched<
     ck::bhalf_t,
     false,
     true,
+    true,
     32>(GroupedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_backward_bp16_no_causalmask_with_attnbias_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_backward_bp16_no_causalmask_with_attnbias_with_biasgrad_maxk_64.cpp
similarity index 97%
rename from xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_backward_bp16_no_causalmask_with_attnbias_maxk_64.cpp
rename to xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_backward_bp16_no_causalmask_with_attnbias_with_biasgrad_maxk_64.cpp
index 7fc71497e..29ec58440 100644
--- a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_backward_bp16_no_causalmask_with_attnbias_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_backward_bp16_no_causalmask_with_attnbias_with_biasgrad_maxk_64.cpp
@@ -12,4 +12,5 @@ template void run_grouped_backward_causalmask_attnbias_dispatched<
     ck::bhalf_t,
     false,
     true,
+    true,
     64>(GroupedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_backward_bp16_with_causalmask_no_attnbias_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_backward_bp16_with_causalmask_no_attnbias_maxk_128.cpp
index 1bed5bed0..26146e7b9 100644
--- a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_backward_bp16_with_causalmask_no_attnbias_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_backward_bp16_with_causalmask_no_attnbias_maxk_128.cpp
@@ -12,4 +12,5 @@ template void run_grouped_backward_causalmask_attnbias_dispatched<
     ck::bhalf_t,
     true,
     false,
+    false,
     128>(GroupedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_backward_bp16_with_causalmask_no_attnbias_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_backward_bp16_with_causalmask_no_attnbias_maxk_32.cpp
index 635e9c390..eec45177f 100644
--- a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_backward_bp16_with_causalmask_no_attnbias_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_backward_bp16_with_causalmask_no_attnbias_maxk_32.cpp
@@ -12,4 +12,5 @@ template void run_grouped_backward_causalmask_attnbias_dispatched<
     ck::bhalf_t,
     true,
     false,
+    false,
     32>(GroupedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_backward_bp16_with_causalmask_no_attnbias_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_backward_bp16_with_causalmask_no_attnbias_maxk_64.cpp
index af52c955f..f55ada6a4 100644
--- a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_backward_bp16_with_causalmask_no_attnbias_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_backward_bp16_with_causalmask_no_attnbias_maxk_64.cpp
@@ -12,4 +12,5 @@ template void run_grouped_backward_causalmask_attnbias_dispatched<
     ck::bhalf_t,
     true,
     false,
+    false,
     64>(GroupedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_backward_bp16_with_causalmask_with_attnbias_no_biasgrad_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_backward_bp16_with_causalmask_with_attnbias_no_biasgrad_maxk_128.cpp
new file mode 100644
index 000000000..1b045b39b
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_backward_bp16_with_causalmask_with_attnbias_no_biasgrad_maxk_128.cpp
@@ -0,0 +1,16 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_grouped_backward.h"
+
+template void run_grouped_backward_causalmask_attnbias_dispatched<
+    ck::bhalf_t,
+    true,
+    true,
+    false,
+    128>(GroupedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_backward_bp16_with_causalmask_with_attnbias_no_biasgrad_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_backward_bp16_with_causalmask_with_attnbias_no_biasgrad_maxk_32.cpp
new file mode 100644
index 000000000..68bb20d86
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_backward_bp16_with_causalmask_with_attnbias_no_biasgrad_maxk_32.cpp
@@ -0,0 +1,16 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_grouped_backward.h"
+
+template void run_grouped_backward_causalmask_attnbias_dispatched<
+    ck::bhalf_t,
+    true,
+    true,
+    false,
+    32>(GroupedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_backward_bp16_with_causalmask_with_attnbias_no_biasgrad_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_backward_bp16_with_causalmask_with_attnbias_no_biasgrad_maxk_64.cpp
new file mode 100644
index 000000000..6fab84344
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_backward_bp16_with_causalmask_with_attnbias_no_biasgrad_maxk_64.cpp
@@ -0,0 +1,16 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_grouped_backward.h"
+
+template void run_grouped_backward_causalmask_attnbias_dispatched<
+    ck::bhalf_t,
+    true,
+    true,
+    false,
+    64>(GroupedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_backward_bp16_with_causalmask_with_attnbias_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_backward_bp16_with_causalmask_with_attnbias_with_biasgrad_maxk_128.cpp
similarity index 97%
rename from xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_backward_bp16_with_causalmask_with_attnbias_maxk_128.cpp
rename to xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_backward_bp16_with_causalmask_with_attnbias_with_biasgrad_maxk_128.cpp
index 495ad8580..ccf93c6eb 100644
--- a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_backward_bp16_with_causalmask_with_attnbias_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_backward_bp16_with_causalmask_with_attnbias_with_biasgrad_maxk_128.cpp
@@ -12,4 +12,5 @@ template void run_grouped_backward_causalmask_attnbias_dispatched<
     ck::bhalf_t,
     true,
     true,
+    true,
     128>(GroupedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_backward_bp16_with_causalmask_with_attnbias_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_backward_bp16_with_causalmask_with_attnbias_with_biasgrad_maxk_32.cpp
similarity index 97%
rename from xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_backward_bp16_with_causalmask_with_attnbias_maxk_32.cpp
rename to xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_backward_bp16_with_causalmask_with_attnbias_with_biasgrad_maxk_32.cpp
index a487c5db2..571012eba 100644
--- a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_backward_bp16_with_causalmask_with_attnbias_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_backward_bp16_with_causalmask_with_attnbias_with_biasgrad_maxk_32.cpp
@@ -12,4 +12,5 @@ template void run_grouped_backward_causalmask_attnbias_dispatched<
     ck::bhalf_t,
     true,
     true,
+    true,
     32>(GroupedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_backward_bp16_with_causalmask_with_attnbias_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_backward_bp16_with_causalmask_with_attnbias_with_biasgrad_maxk_64.cpp
similarity index 97%
rename from xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_backward_bp16_with_causalmask_with_attnbias_maxk_64.cpp
rename to xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_backward_bp16_with_causalmask_with_attnbias_with_biasgrad_maxk_64.cpp
index 360970962..7f4c7a6c0 100644
--- a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_backward_bp16_with_causalmask_with_attnbias_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_backward_bp16_with_causalmask_with_attnbias_with_biasgrad_maxk_64.cpp
@@ -12,4 +12,5 @@ template void run_grouped_backward_causalmask_attnbias_dispatched<
     ck::bhalf_t,
     true,
     true,
+    true,
     64>(GroupedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_backward_fp16_no_causalmask_no_attnbias_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_backward_fp16_no_causalmask_no_attnbias_maxk_128.cpp
index 3547d310f..1a59b5a0a 100644
--- a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_backward_fp16_no_causalmask_no_attnbias_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_backward_fp16_no_causalmask_no_attnbias_maxk_128.cpp
@@ -12,4 +12,5 @@ template void run_grouped_backward_causalmask_attnbias_dispatched<
     ck::half_t,
     false,
     false,
+    false,
     128>(GroupedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_backward_fp16_no_causalmask_no_attnbias_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_backward_fp16_no_causalmask_no_attnbias_maxk_32.cpp
index 24aeb3aee..7689feaac 100644
--- a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_backward_fp16_no_causalmask_no_attnbias_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_backward_fp16_no_causalmask_no_attnbias_maxk_32.cpp
@@ -12,4 +12,5 @@ template void run_grouped_backward_causalmask_attnbias_dispatched<
     ck::half_t,
     false,
     false,
+    false,
     32>(GroupedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_backward_fp16_no_causalmask_no_attnbias_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_backward_fp16_no_causalmask_no_attnbias_maxk_64.cpp
index e3e51ae4a..89b2ab475 100644
--- a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_backward_fp16_no_causalmask_no_attnbias_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_backward_fp16_no_causalmask_no_attnbias_maxk_64.cpp
@@ -12,4 +12,5 @@ template void run_grouped_backward_causalmask_attnbias_dispatched<
     ck::half_t,
     false,
     false,
+    false,
     64>(GroupedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_backward_fp16_no_causalmask_with_attnbias_no_biasgrad_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_backward_fp16_no_causalmask_with_attnbias_no_biasgrad_maxk_128.cpp
new file mode 100644
index 000000000..e25e0c755
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_backward_fp16_no_causalmask_with_attnbias_no_biasgrad_maxk_128.cpp
@@ -0,0 +1,16 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_grouped_backward.h"
+
+template void run_grouped_backward_causalmask_attnbias_dispatched<
+    ck::half_t,
+    false,
+    true,
+    false,
+    128>(GroupedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_backward_fp16_no_causalmask_with_attnbias_no_biasgrad_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_backward_fp16_no_causalmask_with_attnbias_no_biasgrad_maxk_32.cpp
new file mode 100644
index 000000000..18e9ea80d
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_backward_fp16_no_causalmask_with_attnbias_no_biasgrad_maxk_32.cpp
@@ -0,0 +1,16 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_grouped_backward.h"
+
+template void run_grouped_backward_causalmask_attnbias_dispatched<
+    ck::half_t,
+    false,
+    true,
+    false,
+    32>(GroupedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_backward_fp16_no_causalmask_with_attnbias_no_biasgrad_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_backward_fp16_no_causalmask_with_attnbias_no_biasgrad_maxk_64.cpp
new file mode 100644
index 000000000..23e7cd1e5
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_backward_fp16_no_causalmask_with_attnbias_no_biasgrad_maxk_64.cpp
@@ -0,0 +1,16 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_grouped_backward.h"
+
+template void run_grouped_backward_causalmask_attnbias_dispatched<
+    ck::half_t,
+    false,
+    true,
+    false,
+    64>(GroupedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_backward_fp16_no_causalmask_with_attnbias_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_backward_fp16_no_causalmask_with_attnbias_with_biasgrad_maxk_128.cpp
similarity index 97%
rename from xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_backward_fp16_no_causalmask_with_attnbias_maxk_128.cpp
rename to xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_backward_fp16_no_causalmask_with_attnbias_with_biasgrad_maxk_128.cpp
index 67e153ffc..2904aa886 100644
--- a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_backward_fp16_no_causalmask_with_attnbias_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_backward_fp16_no_causalmask_with_attnbias_with_biasgrad_maxk_128.cpp
@@ -12,4 +12,5 @@ template void run_grouped_backward_causalmask_attnbias_dispatched<
     ck::half_t,
     false,
     true,
+    true,
     128>(GroupedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_backward_fp16_no_causalmask_with_attnbias_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_backward_fp16_no_causalmask_with_attnbias_with_biasgrad_maxk_32.cpp
similarity index 97%
rename from xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_backward_fp16_no_causalmask_with_attnbias_maxk_32.cpp
rename to xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_backward_fp16_no_causalmask_with_attnbias_with_biasgrad_maxk_32.cpp
index ec7336a51..75680aad1 100644
--- a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_backward_fp16_no_causalmask_with_attnbias_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_backward_fp16_no_causalmask_with_attnbias_with_biasgrad_maxk_32.cpp
@@ -12,4 +12,5 @@ template void run_grouped_backward_causalmask_attnbias_dispatched<
     ck::half_t,
     false,
     true,
+    true,
     32>(GroupedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_backward_fp16_no_causalmask_with_attnbias_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_backward_fp16_no_causalmask_with_attnbias_with_biasgrad_maxk_64.cpp
similarity index 97%
rename from xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_backward_fp16_no_causalmask_with_attnbias_maxk_64.cpp
rename to xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_backward_fp16_no_causalmask_with_attnbias_with_biasgrad_maxk_64.cpp
index 13a5d40eb..d7625e4dc 100644
--- a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_backward_fp16_no_causalmask_with_attnbias_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_backward_fp16_no_causalmask_with_attnbias_with_biasgrad_maxk_64.cpp
@@ -12,4 +12,5 @@ template void run_grouped_backward_causalmask_attnbias_dispatched<
     ck::half_t,
     false,
     true,
+    true,
     64>(GroupedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_backward_fp16_with_causalmask_no_attnbias_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_backward_fp16_with_causalmask_no_attnbias_maxk_128.cpp
index 058f08c65..3b0cd4b76 100644
--- a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_backward_fp16_with_causalmask_no_attnbias_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_backward_fp16_with_causalmask_no_attnbias_maxk_128.cpp
@@ -12,4 +12,5 @@ template void run_grouped_backward_causalmask_attnbias_dispatched<
     ck::half_t,
     true,
     false,
+    false,
     128>(GroupedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_backward_fp16_with_causalmask_no_attnbias_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_backward_fp16_with_causalmask_no_attnbias_maxk_32.cpp
index 469b2d2e4..e3055cffe 100644
--- a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_backward_fp16_with_causalmask_no_attnbias_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_backward_fp16_with_causalmask_no_attnbias_maxk_32.cpp
@@ -12,4 +12,5 @@ template void run_grouped_backward_causalmask_attnbias_dispatched<
     ck::half_t,
     true,
     false,
+    false,
     32>(GroupedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_backward_fp16_with_causalmask_no_attnbias_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_backward_fp16_with_causalmask_no_attnbias_maxk_64.cpp
index 3675cd20a..1d2ae1a98 100644
--- a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_backward_fp16_with_causalmask_no_attnbias_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_backward_fp16_with_causalmask_no_attnbias_maxk_64.cpp
@@ -12,4 +12,5 @@ template void run_grouped_backward_causalmask_attnbias_dispatched<
     ck::half_t,
     true,
     false,
+    false,
     64>(GroupedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_backward_fp16_with_causalmask_with_attnbias_no_biasgrad_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_backward_fp16_with_causalmask_with_attnbias_no_biasgrad_maxk_128.cpp
new file mode 100644
index 000000000..a082bcb80
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_backward_fp16_with_causalmask_with_attnbias_no_biasgrad_maxk_128.cpp
@@ -0,0 +1,16 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_grouped_backward.h"
+
+template void run_grouped_backward_causalmask_attnbias_dispatched<
+    ck::half_t,
+    true,
+    true,
+    false,
+    128>(GroupedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_backward_fp16_with_causalmask_with_attnbias_no_biasgrad_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_backward_fp16_with_causalmask_with_attnbias_no_biasgrad_maxk_32.cpp
new file mode 100644
index 000000000..59165bbe8
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_backward_fp16_with_causalmask_with_attnbias_no_biasgrad_maxk_32.cpp
@@ -0,0 +1,16 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_grouped_backward.h"
+
+template void run_grouped_backward_causalmask_attnbias_dispatched<
+    ck::half_t,
+    true,
+    true,
+    false,
+    32>(GroupedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_backward_fp16_with_causalmask_with_attnbias_no_biasgrad_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_backward_fp16_with_causalmask_with_attnbias_no_biasgrad_maxk_64.cpp
new file mode 100644
index 000000000..cbf262e7a
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_backward_fp16_with_causalmask_with_attnbias_no_biasgrad_maxk_64.cpp
@@ -0,0 +1,16 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_grouped_backward.h"
+
+template void run_grouped_backward_causalmask_attnbias_dispatched<
+    ck::half_t,
+    true,
+    true,
+    false,
+    64>(GroupedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_backward_fp16_with_causalmask_with_attnbias_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_backward_fp16_with_causalmask_with_attnbias_with_biasgrad_maxk_128.cpp
similarity index 97%
rename from xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_backward_fp16_with_causalmask_with_attnbias_maxk_128.cpp
rename to xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_backward_fp16_with_causalmask_with_attnbias_with_biasgrad_maxk_128.cpp
index 0433020e0..d32f76ef3 100644
--- a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_backward_fp16_with_causalmask_with_attnbias_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_backward_fp16_with_causalmask_with_attnbias_with_biasgrad_maxk_128.cpp
@@ -12,4 +12,5 @@ template void run_grouped_backward_causalmask_attnbias_dispatched<
     ck::half_t,
     true,
     true,
+    true,
     128>(GroupedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_backward_fp16_with_causalmask_with_attnbias_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_backward_fp16_with_causalmask_with_attnbias_with_biasgrad_maxk_32.cpp
similarity index 97%
rename from xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_backward_fp16_with_causalmask_with_attnbias_maxk_32.cpp
rename to xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_backward_fp16_with_causalmask_with_attnbias_with_biasgrad_maxk_32.cpp
index 322c41f15..b3cf3fa5c 100644
--- a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_backward_fp16_with_causalmask_with_attnbias_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_backward_fp16_with_causalmask_with_attnbias_with_biasgrad_maxk_32.cpp
@@ -12,4 +12,5 @@ template void run_grouped_backward_causalmask_attnbias_dispatched<
     ck::half_t,
     true,
     true,
+    true,
     32>(GroupedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_backward_fp16_with_causalmask_with_attnbias_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_backward_fp16_with_causalmask_with_attnbias_with_biasgrad_maxk_64.cpp
similarity index 97%
rename from xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_backward_fp16_with_causalmask_with_attnbias_maxk_64.cpp
rename to xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_backward_fp16_with_causalmask_with_attnbias_with_biasgrad_maxk_64.cpp
index 885e757c8..6b6fe1383 100644
--- a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_backward_fp16_with_causalmask_with_attnbias_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_backward_fp16_with_causalmask_with_attnbias_with_biasgrad_maxk_64.cpp
@@ -12,4 +12,5 @@ template void run_grouped_backward_causalmask_attnbias_dispatched<
     ck::half_t,
     true,
     true,
+    true,
     64>(GroupedBackwardParams& param, hipStream_t stream);

From bdd6291a1bc316fc82d2c41449577d122ce135ec Mon Sep 17 00:00:00 2001
From: Qianfeng Zhang <Qianfeng.Zhang@amd.com>
Date: Thu, 11 Apr 2024 15:00:48 +0000
Subject: [PATCH 517/641] Remove using hdim_stride_do in fmha backward

---
 .../csrc/attention/hip_fmha/ck_tiled_fmha_batched_backward.h  | 4 +---
 .../csrc/attention/hip_fmha/ck_tiled_fmha_grouped_backward.h  | 4 +---
 2 files changed, 2 insertions(+), 6 deletions(-)

diff --git a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_backward.h b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_backward.h
index 688acc70b..9f2fcf8b1 100644
--- a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_backward.h
+++ b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_backward.h
@@ -188,8 +188,7 @@ struct batched_backward_causalmask_attnbias_dispatched {
           param.lsed_strides[1], // nhead_stride_d
           param.grad_out_strides[0], // batch_stride_do
           param.out_strides[0], // batch_stride_o
-          param.lsed_strides[0], // batch_stride_d
-          param.grad_out_strides[3]); // hdim_stride_do
+          param.lsed_strides[0]); // batch_stride_d
     }();
 
     dim3 kGridSize =
@@ -264,7 +263,6 @@ struct batched_backward_causalmask_attnbias_dispatched {
           param.grad_v_strides[0],
           param.attn_bias_strides[0], // assume grad_bias has same strides as
                                       // bias
-          param.grad_out_strides[3], // hdim_stride_do
           static_cast<CausalMaskType>(param.custom_mask_type),
           param.window_size,
           param.dropout_prob, // dropout ratio
diff --git a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_backward.h b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_backward.h
index 278053038..31ed265fa 100644
--- a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_backward.h
+++ b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_backward.h
@@ -176,8 +176,7 @@ struct grouped_backward_causalmask_attnbias_dispatched {
           param.grad_out_strides[1], // nhead_stride_do
           param.out_strides[1], // nhead_stride_o
           param.lsed_strides[1],
-          param.lsed_strides[0], // batch_stride_d
-          param.grad_out_strides[2]); // hdim_stride_do
+          param.lsed_strides[0]); // batch_stride_d
     }();
 
     dim3 kGridSize = FmhaBwdOGradDotOKernel::GridSize(
@@ -242,7 +241,6 @@ struct grouped_backward_causalmask_attnbias_dispatched {
           param.attn_bias_strides[0], // assume grad_bias has same strides as
                                       // bias
           param.lsed_strides[0], // batch_stride_lse
-          param.grad_out_strides[2], // hdim_stride_do
           static_cast<CausalMaskType>(param.custom_mask_type),
           param.window_size,
           param.dropout_prob, // dropout ratio

From 410f814a35fdf37c6ea3b185cb99c76dd1e495a8 Mon Sep 17 00:00:00 2001
From: Qianfeng Zhang <Qianfeng.Zhang@amd.com>
Date: Thu, 11 Apr 2024 16:18:55 +0000
Subject: [PATCH 518/641] Force kPadSeqLenQ/kPadSeqLenK to be true in
 batched-backward to save compiling time

---
 third_party/composable_kernel_tiled           |  2 +-
 .../hip_fmha/ck_tiled_fmha_batched_backward.h | 68 ++++++++-----------
 2 files changed, 31 insertions(+), 39 deletions(-)

diff --git a/third_party/composable_kernel_tiled b/third_party/composable_kernel_tiled
index 617dd51bb..de0f8161b 160000
--- a/third_party/composable_kernel_tiled
+++ b/third_party/composable_kernel_tiled
@@ -1 +1 @@
-Subproject commit 617dd51bb8f85488e9c73c498cd6fc7b6b002b42
+Subproject commit de0f8161bf7533f650dbbd47be941a1ffff53e76
diff --git a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_backward.h b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_backward.h
index 9f2fcf8b1..b7c40de7f 100644
--- a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_backward.h
+++ b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_backward.h
@@ -120,8 +120,9 @@ struct batched_backward_causalmask_attnbias_dispatched {
         using FmhaBwdShape_ = FmhaBwdShape<MaxK>;
         using FmhaBwdTilePartitioner_ = FmhaBwdTilePartitioner<FmhaBwdShape_>;
 
-        const bool pad_seqlen_q = !(param.M % FmhaBwdShape_::kM0 == 0);
-        const bool pad_seqlen_k = !(param.N % FmhaBwdShape_::kN0 == 0);
+        constexpr bool kPadSeqLenQ = true;
+        constexpr bool kPadSeqLenK = true;
+
         const bool pad_headdim_q = !(param.K % FmhaBwdShape_::kQKHeaddim == 0);
         const bool pad_headdim_v = !(param.Kv % FmhaBwdShape_::kVHeaddim == 0);
 
@@ -129,42 +130,33 @@ struct batched_backward_causalmask_attnbias_dispatched {
         // to determine whether to do padding saving some compiling time
         const bool pad_headdim = (pad_headdim_q || pad_headdim_v);
 
-        BOOL_SWITCH_4(
-            has_dropout,
-            kHasDropout,
-            pad_seqlen_q,
-            kPadSeqLenQ,
-            pad_seqlen_k,
-            kPadSeqLenK,
-            pad_headdim,
-            kPadHeadDim,
-            [&] {
-              using FmhaBwdTraits_ = ck::tile_program::TileFmhaTraits<
-                  kPadSeqLenQ,
-                  kPadSeqLenK,
-                  kPadHeadDim, // kPadHeadDimQ,
-                  kPadHeadDim, // kPadHeadDimV,
-                  has_attn_bias,
-                  has_bias_grad,
-                  false, // kStoreLSE
-                  kHasDropout,
-                  occupancy>;
-
-              using FmhaBwdPipelineProblem =
-                  FmhaBwdPipelineProblemTemp<FmhaBwdTraits_, FmhaMask>;
-
-              using FmhaBwdPipeline_ = typename ck::tile_program::block::
-                  BlockFmhaBwdPipelineDispatcher<
-                      FmhaBwdLoadStrategy_,
-                      FmhaBwdPipelineProblem>::BlockPipeline;
-
-              using FmhaBwdKernel_ = FmhaBwdKernel<
-                  FmhaBwdTilePartitioner_,
-                  FmhaBwdPipeline_,
-                  FmhaBwdEpilogue_>;
-
-              RunWithBwdKernel<FmhaBwdKernel_>(param, stream);
-            });
+        BOOL_SWITCH_2(has_dropout, kHasDropout, pad_headdim, kPadHeadDim, [&] {
+          using FmhaBwdTraits_ = ck::tile_program::TileFmhaTraits<
+              kPadSeqLenQ,
+              kPadSeqLenK,
+              kPadHeadDim, // kPadHeadDimQ,
+              kPadHeadDim, // kPadHeadDimV,
+              has_attn_bias,
+              has_bias_grad,
+              false, // kStoreLSE
+              kHasDropout,
+              occupancy>;
+
+          using FmhaBwdPipelineProblem =
+              FmhaBwdPipelineProblemTemp<FmhaBwdTraits_, FmhaMask>;
+
+          using FmhaBwdPipeline_ =
+              typename ck::tile_program::block::BlockFmhaBwdPipelineDispatcher<
+                  FmhaBwdLoadStrategy_,
+                  FmhaBwdPipelineProblem>::BlockPipeline;
+
+          using FmhaBwdKernel_ = FmhaBwdKernel<
+              FmhaBwdTilePartitioner_,
+              FmhaBwdPipeline_,
+              FmhaBwdEpilogue_>;
+
+          RunWithBwdKernel<FmhaBwdKernel_>(param, stream);
+        });
       });
     };
   }

From 2712dff109043025eb1284a7c1c6236aa1e26f36 Mon Sep 17 00:00:00 2001
From: Qianfeng Zhang <Qianfeng.Zhang@amd.com>
Date: Fri, 12 Apr 2024 23:27:37 +0000
Subject: [PATCH 519/641] Fix missing passing of {philox_seed, philox_offset}
 in inference path

---
 third_party/composable_kernel_tiled                            | 2 +-
 xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_infer.h | 2 +-
 xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_infer.h | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/third_party/composable_kernel_tiled b/third_party/composable_kernel_tiled
index de0f8161b..bb57f31fd 160000
--- a/third_party/composable_kernel_tiled
+++ b/third_party/composable_kernel_tiled
@@ -1 +1 @@
-Subproject commit de0f8161bf7533f650dbbd47be941a1ffff53e76
+Subproject commit bb57f31fdc290bb7bc4df6af35c736b7c00f2a3c
diff --git a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_infer.h b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_infer.h
index 107f2628e..96585c13d 100644
--- a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_infer.h
+++ b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_infer.h
@@ -206,7 +206,7 @@ struct batched_infer_causalmask_attnbias_dispatched {
           1.0f, // descale_sv, not used
           param.dropout_prob, // dropout ratio
           false, // is_store_randval
-          {0, 0});
+          {param.philox_seed, param.philox_offset});
     }();
 
     dim3 kGridSize = FmhaKernel::GridSize(param.B, param.Hq, param.M, param.Kv);
diff --git a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_infer.h b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_infer.h
index 347f0de16..bfaa55c32 100644
--- a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_infer.h
+++ b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_infer.h
@@ -156,7 +156,7 @@ struct grouped_infer_causalmask_attnbias_dispatched {
           1.0f, // descale_sv, not used
           param.dropout_prob,
           false, // is_store_randval
-          {0, 0});
+          {param.philox_seed, param.philox_offset});
     }();
 
     dim3 kGridSize = FmhaKernel::GridSize(

From 7c27a820966b276ad73c91f5736501d6d7375677 Mon Sep 17 00:00:00 2001
From: Qianfeng Zhang <Qianfeng.Zhang@amd.com>
Date: Sun, 14 Apr 2024 17:58:48 +0000
Subject: [PATCH 520/641] Use SimplifiedGenericAttentionMask to replace
 GenericAttentionMask

---
 .../hip_fmha/ck_tiled_fmha_batched_backward.h        | 12 +++++++-----
 .../hip_fmha/ck_tiled_fmha_batched_forward.h         | 11 ++++++-----
 .../attention/hip_fmha/ck_tiled_fmha_batched_infer.h | 11 ++++++-----
 .../hip_fmha/ck_tiled_fmha_grouped_backward.h        | 12 +++++++-----
 .../hip_fmha/ck_tiled_fmha_grouped_forward.h         | 11 ++++++-----
 .../attention/hip_fmha/ck_tiled_fmha_grouped_infer.h | 11 ++++++-----
 6 files changed, 38 insertions(+), 30 deletions(-)

diff --git a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_backward.h b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_backward.h
index b7c40de7f..ccc7e7d3a 100644
--- a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_backward.h
+++ b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_backward.h
@@ -27,7 +27,6 @@
 #include "ck_tiled_fmha_backward_kernel.hpp"
 #include "ck_tiled_fmha_bwd_epilogue.hpp"
 #include "ck_tiled_fmha_bwd_tile_partitioner.hpp"
-#include "ck_tiled_fmha_definitions.hpp"
 
 template <
     typename scalar_t,
@@ -114,8 +113,9 @@ struct batched_backward_causalmask_attnbias_dispatched {
         constexpr bool has_masking = has_causal_mask || USE_LOCAL_ATTENTION;
         const bool has_dropout = (param.dropout_prob > 0.0f);
 
-        using FmhaMask = ck::tile_program::block::
-            GenericAttentionMask<has_masking, USE_LOCAL_ATTENTION>;
+        using FmhaMask =
+            ck::tile_program::block::SimplifiedGenericAttentionMask<
+                has_masking>;
 
         using FmhaBwdShape_ = FmhaBwdShape<MaxK>;
         using FmhaBwdTilePartitioner_ = FmhaBwdTilePartitioner<FmhaBwdShape_>;
@@ -255,8 +255,10 @@ struct batched_backward_causalmask_attnbias_dispatched {
           param.grad_v_strides[0],
           param.attn_bias_strides[0], // assume grad_bias has same strides as
                                       // bias
-          static_cast<CausalMaskType>(param.custom_mask_type),
-          param.window_size,
+          (param.window_size > 0) ? param.window_size - 1
+                                  : -1, // window_left_size
+          (param.custom_mask_type == 0) ? -1 : 0, // window_right_size
+          param.custom_mask_type,
           param.dropout_prob, // dropout ratio
           false, // is_store_randval
           {param.philox_seed, param.philox_offset});
diff --git a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_forward.h b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_forward.h
index 6a0ef0a43..bce607f91 100644
--- a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_forward.h
+++ b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_forward.h
@@ -24,7 +24,6 @@
 #include "ck_tiled_fmha_fwd_setting.h"
 #include "ck_tiled_fmha_params.h"
 
-#include "ck_tiled_fmha_definitions.hpp"
 #include "ck_tiled_fmha_forward_kernel.hpp"
 #include "ck_tiled_fmha_fwd_epilogue.hpp"
 #include "ck_tiled_fmha_fwd_tile_partitioner.hpp"
@@ -61,8 +60,8 @@ struct batched_forward_causalmask_attnbias_dispatched {
       constexpr bool has_masking = has_causal_mask || USE_LOCAL_ATTENTION;
       const bool has_dropout = (param.dropout_prob > 0.0f);
 
-      using FmhaMask = ck::tile_program::block::
-          GenericAttentionMask<has_masking, USE_LOCAL_ATTENTION>;
+      using FmhaMask =
+          ck::tile_program::block::SimplifiedGenericAttentionMask<has_masking>;
 
       using FmhaFwdShape_ = FmhaFwdShape<MaxK>;
       using FmhaFwdTilePartitioner_ = FmhaFwdTilePartitioner<FmhaFwdShape_>;
@@ -200,8 +199,10 @@ struct batched_forward_causalmask_attnbias_dispatched {
           0, // batch_stride_randval
           param.lse_strides[0], // batch_stride_lse
           param.out_strides[0],
-          static_cast<CausalMaskType>(param.custom_mask_type),
-          param.window_size,
+          (param.window_size > 0) ? param.window_size - 1
+                                  : -1, // window_left_size
+          (param.custom_mask_type == 0) ? -1 : 0, // window_right_size
+          param.custom_mask_type,
           1.0f, // descale_qk, not used
           1.0f, // descale_sv, not used
           param.dropout_prob, // dropout ratio
diff --git a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_infer.h b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_infer.h
index 96585c13d..4da93e6d3 100644
--- a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_infer.h
+++ b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_infer.h
@@ -25,7 +25,6 @@
 #include "ck_tiled_fmha_params.h"
 #include "ck_tiled_headdim_switch.h"
 
-#include "ck_tiled_fmha_definitions.hpp"
 #include "ck_tiled_fmha_forward_kernel.hpp"
 #include "ck_tiled_fmha_fwd_epilogue.hpp"
 #include "ck_tiled_fmha_fwd_tile_partitioner.hpp"
@@ -62,8 +61,8 @@ struct batched_infer_causalmask_attnbias_dispatched {
       constexpr bool has_masking = has_causal_mask || USE_LOCAL_ATTENTION;
       const bool has_dropout = (param.dropout_prob > 0.0f);
 
-      using FmhaMask = ck::tile_program::block::
-          GenericAttentionMask<has_masking, USE_LOCAL_ATTENTION>;
+      using FmhaMask =
+          ck::tile_program::block::SimplifiedGenericAttentionMask<has_masking>;
 
       using FmhaShape = FmhaFwdShape<MaxK>;
       using FmhaTilePartitioner = FmhaFwdTilePartitioner<FmhaShape>;
@@ -200,8 +199,10 @@ struct batched_infer_causalmask_attnbias_dispatched {
           0, // batch_stride_randval
           0, // batch_stride_lse
           param.out_strides[0],
-          static_cast<CausalMaskType>(param.custom_mask_type),
-          param.window_size,
+          (param.window_size > 0) ? param.window_size - 1
+                                  : -1, // window_left_size
+          (param.custom_mask_type == 0) ? -1 : 0, // window_right_size
+          param.custom_mask_type,
           1.0f, // descale_qk, not used
           1.0f, // descale_sv, not used
           param.dropout_prob, // dropout ratio
diff --git a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_backward.h b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_backward.h
index 31ed265fa..0adda65cf 100644
--- a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_backward.h
+++ b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_backward.h
@@ -27,7 +27,6 @@
 #include "ck_tiled_fmha_backward_kernel.hpp"
 #include "ck_tiled_fmha_bwd_epilogue.hpp"
 #include "ck_tiled_fmha_bwd_tile_partitioner.hpp"
-#include "ck_tiled_fmha_definitions.hpp"
 
 template <
     typename scalar_t,
@@ -112,8 +111,9 @@ struct grouped_backward_causalmask_attnbias_dispatched {
         constexpr bool has_masking = has_causal_mask || USE_LOCAL_ATTENTION;
         const bool has_dropout = (param.dropout_prob > 0.0f);
 
-        using FmhaMask = ck::tile_program::block::
-            GenericAttentionMask<has_masking, USE_LOCAL_ATTENTION>;
+        using FmhaMask =
+            ck::tile_program::block::SimplifiedGenericAttentionMask<
+                has_masking>;
 
         using FmhaBwdShape_ = FmhaBwdShape<MaxK>;
         using FmhaBwdTilePartitioner_ = FmhaBwdTilePartitioner<FmhaBwdShape_>;
@@ -241,8 +241,10 @@ struct grouped_backward_causalmask_attnbias_dispatched {
           param.attn_bias_strides[0], // assume grad_bias has same strides as
                                       // bias
           param.lsed_strides[0], // batch_stride_lse
-          static_cast<CausalMaskType>(param.custom_mask_type),
-          param.window_size,
+          (param.window_size > 0) ? param.window_size - 1
+                                  : -1, // window_left_size
+          (param.custom_mask_type == 0) ? -1 : 0, // window_right_size
+          param.custom_mask_type,
           param.dropout_prob, // dropout ratio
           false, // is_store_randval
           {param.philox_seed, param.philox_offset});
diff --git a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_forward.h b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_forward.h
index 360c9c9c1..2e4458d7f 100644
--- a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_forward.h
+++ b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_forward.h
@@ -23,7 +23,6 @@
 #include "ck_tiled_fmha_fwd_setting.h"
 #include "ck_tiled_fmha_params.h"
 
-#include "ck_tiled_fmha_definitions.hpp"
 #include "ck_tiled_fmha_forward_kernel.hpp"
 #include "ck_tiled_fmha_fwd_epilogue.hpp"
 #include "ck_tiled_fmha_fwd_tile_partitioner.hpp"
@@ -60,8 +59,8 @@ struct grouped_forward_causalmask_attnbias_dispatched {
       constexpr bool has_masking = has_causal_mask || USE_LOCAL_ATTENTION;
       const bool has_dropout = (param.dropout_prob > 0.0f);
 
-      using FmhaMask = ck::tile_program::block::
-          GenericAttentionMask<has_masking, USE_LOCAL_ATTENTION>;
+      using FmhaMask =
+          ck::tile_program::block::SimplifiedGenericAttentionMask<has_masking>;
 
       using FmhaFwdShape_ = FmhaFwdShape<MaxK>;
       using FmhaFwdTilePartitioner_ = FmhaFwdTilePartitioner<FmhaFwdShape_>;
@@ -152,8 +151,10 @@ struct grouped_forward_causalmask_attnbias_dispatched {
           param.lse_strides[1],
           param.out_strides[1],
           param.lse_strides[0], // batch_stride_lse
-          static_cast<CausalMaskType>(param.custom_mask_type),
-          param.window_size,
+          (param.window_size > 0) ? param.window_size - 1
+                                  : -1, // window_left_size
+          (param.custom_mask_type == 0) ? -1 : 0, // window_right_size
+          param.custom_mask_type,
           1.0f, // descale_qk, not used
           1.0f, // descale_sv, not used
           param.dropout_prob,
diff --git a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_infer.h b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_infer.h
index bfaa55c32..5c44c772c 100644
--- a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_infer.h
+++ b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_infer.h
@@ -24,7 +24,6 @@
 #include "ck_tiled_fmha_params.h"
 #include "ck_tiled_headdim_switch.h"
 
-#include "ck_tiled_fmha_definitions.hpp"
 #include "ck_tiled_fmha_forward_kernel.hpp"
 #include "ck_tiled_fmha_fwd_epilogue.hpp"
 #include "ck_tiled_fmha_fwd_tile_partitioner.hpp"
@@ -61,8 +60,8 @@ struct grouped_infer_causalmask_attnbias_dispatched {
       constexpr bool has_masking = has_causal_mask || USE_LOCAL_ATTENTION;
       const bool has_dropout = (param.dropout_prob > 0.0f);
 
-      using FmhaMask = ck::tile_program::block::
-          GenericAttentionMask<has_masking, USE_LOCAL_ATTENTION>;
+      using FmhaMask =
+          ck::tile_program::block::SimplifiedGenericAttentionMask<has_masking>;
 
       using FmhaShape = FmhaFwdShape<MaxK>;
       using FmhaTilePartitioner = FmhaFwdTilePartitioner<FmhaShape>;
@@ -150,8 +149,10 @@ struct grouped_infer_causalmask_attnbias_dispatched {
           0, // nhead_stride_lse
           param.out_strides[1],
           0, // batch_stride_lse
-          static_cast<CausalMaskType>(param.custom_mask_type),
-          param.window_size,
+          (param.window_size > 0) ? param.window_size - 1
+                                  : -1, // window_left_size
+          (param.custom_mask_type == 0) ? -1 : 0, // window_right_size
+          param.custom_mask_type,
           1.0f, // descale_qk, not used
           1.0f, // descale_sv, not used
           param.dropout_prob,

From 46c491ee3a680b720b94fc67e729bca99b74fa9f Mon Sep 17 00:00:00 2001
From: Qianfeng Zhang <Qianfeng.Zhang@amd.com>
Date: Sun, 14 Apr 2024 23:20:07 +0000
Subject: [PATCH 521/641] Shorten the instance file names

---
 third_party/composable_kernel_tiled                             | 2 +-
 ..._bp16_has_causalmask_has_attnbias_has_biasgrad_maxk_128.cpp} | 0
 ...d_bp16_has_causalmask_has_attnbias_has_biasgrad_maxk_32.cpp} | 0
 ...d_bp16_has_causalmask_has_attnbias_has_biasgrad_maxk_64.cpp} | 0
 ...d_bp16_has_causalmask_has_attnbias_no_biasgrad_maxk_128.cpp} | 0
 ...rd_bp16_has_causalmask_has_attnbias_no_biasgrad_maxk_32.cpp} | 0
 ...rd_bp16_has_causalmask_has_attnbias_no_biasgrad_maxk_64.cpp} | 0
 ...tched_backward_bp16_has_causalmask_no_attnbias_maxk_128.cpp} | 0
 ...atched_backward_bp16_has_causalmask_no_attnbias_maxk_32.cpp} | 0
 ...atched_backward_bp16_has_causalmask_no_attnbias_maxk_64.cpp} | 0
 ...d_bp16_no_causalmask_has_attnbias_has_biasgrad_maxk_128.cpp} | 0
 ...rd_bp16_no_causalmask_has_attnbias_has_biasgrad_maxk_32.cpp} | 0
 ...rd_bp16_no_causalmask_has_attnbias_has_biasgrad_maxk_64.cpp} | 0
 ...rd_bp16_no_causalmask_has_attnbias_no_biasgrad_maxk_128.cpp} | 0
 ...ard_bp16_no_causalmask_has_attnbias_no_biasgrad_maxk_32.cpp} | 0
 ...ard_bp16_no_causalmask_has_attnbias_no_biasgrad_maxk_64.cpp} | 0
 ...atched_backward_bp16_no_causalmask_no_attnbias_maxk_128.cpp} | 0
 ...batched_backward_bp16_no_causalmask_no_attnbias_maxk_32.cpp} | 0
 ...batched_backward_bp16_no_causalmask_no_attnbias_maxk_64.cpp} | 0
 ..._fp16_has_causalmask_has_attnbias_has_biasgrad_maxk_128.cpp} | 0
 ...d_fp16_has_causalmask_has_attnbias_has_biasgrad_maxk_32.cpp} | 0
 ...d_fp16_has_causalmask_has_attnbias_has_biasgrad_maxk_64.cpp} | 0
 ...d_fp16_has_causalmask_has_attnbias_no_biasgrad_maxk_128.cpp} | 0
 ...rd_fp16_has_causalmask_has_attnbias_no_biasgrad_maxk_32.cpp} | 0
 ...rd_fp16_has_causalmask_has_attnbias_no_biasgrad_maxk_64.cpp} | 0
 ...tched_backward_fp16_has_causalmask_no_attnbias_maxk_128.cpp} | 0
 ...atched_backward_fp16_has_causalmask_no_attnbias_maxk_32.cpp} | 0
 ...atched_backward_fp16_has_causalmask_no_attnbias_maxk_64.cpp} | 0
 ...d_fp16_no_causalmask_has_attnbias_has_biasgrad_maxk_128.cpp} | 0
 ...rd_fp16_no_causalmask_has_attnbias_has_biasgrad_maxk_32.cpp} | 0
 ...rd_fp16_no_causalmask_has_attnbias_has_biasgrad_maxk_64.cpp} | 0
 ...rd_fp16_no_causalmask_has_attnbias_no_biasgrad_maxk_128.cpp} | 0
 ...ard_fp16_no_causalmask_has_attnbias_no_biasgrad_maxk_32.cpp} | 0
 ...ard_fp16_no_causalmask_has_attnbias_no_biasgrad_maxk_64.cpp} | 0
 ...atched_backward_fp16_no_causalmask_no_attnbias_maxk_128.cpp} | 0
 ...batched_backward_fp16_no_causalmask_no_attnbias_maxk_32.cpp} | 0
 ...batched_backward_fp16_no_causalmask_no_attnbias_maxk_64.cpp} | 0
 ...tched_forward_bp16_has_causalmask_has_attnbias_maxk_128.cpp} | 0
 ...tched_forward_bp16_has_causalmask_has_attnbias_maxk_256.cpp} | 0
 ...atched_forward_bp16_has_causalmask_has_attnbias_maxk_32.cpp} | 0
 ...atched_forward_bp16_has_causalmask_has_attnbias_maxk_64.cpp} | 0
 ...atched_forward_bp16_has_causalmask_no_attnbias_maxk_128.cpp} | 0
 ...atched_forward_bp16_has_causalmask_no_attnbias_maxk_256.cpp} | 0
 ...batched_forward_bp16_has_causalmask_no_attnbias_maxk_32.cpp} | 0
 ...batched_forward_bp16_has_causalmask_no_attnbias_maxk_64.cpp} | 0
 ...atched_forward_bp16_no_causalmask_has_attnbias_maxk_128.cpp} | 0
 ...atched_forward_bp16_no_causalmask_has_attnbias_maxk_256.cpp} | 0
 ...batched_forward_bp16_no_causalmask_has_attnbias_maxk_32.cpp} | 0
 ...batched_forward_bp16_no_causalmask_has_attnbias_maxk_64.cpp} | 0
 ...batched_forward_bp16_no_causalmask_no_attnbias_maxk_128.cpp} | 0
 ...batched_forward_bp16_no_causalmask_no_attnbias_maxk_256.cpp} | 0
 ..._batched_forward_bp16_no_causalmask_no_attnbias_maxk_32.cpp} | 0
 ..._batched_forward_bp16_no_causalmask_no_attnbias_maxk_64.cpp} | 0
 ...tched_forward_fp16_has_causalmask_has_attnbias_maxk_128.cpp} | 0
 ...tched_forward_fp16_has_causalmask_has_attnbias_maxk_256.cpp} | 0
 ...atched_forward_fp16_has_causalmask_has_attnbias_maxk_32.cpp} | 0
 ...atched_forward_fp16_has_causalmask_has_attnbias_maxk_64.cpp} | 0
 ...atched_forward_fp16_has_causalmask_no_attnbias_maxk_128.cpp} | 0
 ...atched_forward_fp16_has_causalmask_no_attnbias_maxk_256.cpp} | 0
 ...batched_forward_fp16_has_causalmask_no_attnbias_maxk_32.cpp} | 0
 ...batched_forward_fp16_has_causalmask_no_attnbias_maxk_64.cpp} | 0
 ...atched_forward_fp16_no_causalmask_has_attnbias_maxk_128.cpp} | 0
 ...atched_forward_fp16_no_causalmask_has_attnbias_maxk_256.cpp} | 0
 ...batched_forward_fp16_no_causalmask_has_attnbias_maxk_32.cpp} | 0
 ...batched_forward_fp16_no_causalmask_has_attnbias_maxk_64.cpp} | 0
 ...batched_forward_fp16_no_causalmask_no_attnbias_maxk_128.cpp} | 0
 ...batched_forward_fp16_no_causalmask_no_attnbias_maxk_256.cpp} | 0
 ..._batched_forward_fp16_no_causalmask_no_attnbias_maxk_32.cpp} | 0
 ..._batched_forward_fp16_no_causalmask_no_attnbias_maxk_64.cpp} | 0
 ...batched_infer_bp16_has_causalmask_has_attnbias_maxk_128.cpp} | 0
 ...batched_infer_bp16_has_causalmask_has_attnbias_maxk_256.cpp} | 0
 ..._batched_infer_bp16_has_causalmask_has_attnbias_maxk_32.cpp} | 0
 ..._batched_infer_bp16_has_causalmask_has_attnbias_maxk_64.cpp} | 0
 ..._batched_infer_bp16_has_causalmask_no_attnbias_maxk_128.cpp} | 0
 ..._batched_infer_bp16_has_causalmask_no_attnbias_maxk_256.cpp} | 0
 ...a_batched_infer_bp16_has_causalmask_no_attnbias_maxk_32.cpp} | 0
 ...a_batched_infer_bp16_has_causalmask_no_attnbias_maxk_64.cpp} | 0
 ..._batched_infer_bp16_no_causalmask_has_attnbias_maxk_128.cpp} | 0
 ..._batched_infer_bp16_no_causalmask_has_attnbias_maxk_256.cpp} | 0
 ...a_batched_infer_bp16_no_causalmask_has_attnbias_maxk_32.cpp} | 0
 ...a_batched_infer_bp16_no_causalmask_has_attnbias_maxk_64.cpp} | 0
 ...a_batched_infer_bp16_no_causalmask_no_attnbias_maxk_128.cpp} | 0
 ...a_batched_infer_bp16_no_causalmask_no_attnbias_maxk_256.cpp} | 0
 ...ha_batched_infer_bp16_no_causalmask_no_attnbias_maxk_32.cpp} | 0
 ...ha_batched_infer_bp16_no_causalmask_no_attnbias_maxk_64.cpp} | 0
 ...batched_infer_fp16_has_causalmask_has_attnbias_maxk_128.cpp} | 0
 ...batched_infer_fp16_has_causalmask_has_attnbias_maxk_256.cpp} | 0
 ..._batched_infer_fp16_has_causalmask_has_attnbias_maxk_32.cpp} | 0
 ..._batched_infer_fp16_has_causalmask_has_attnbias_maxk_64.cpp} | 0
 ..._batched_infer_fp16_has_causalmask_no_attnbias_maxk_128.cpp} | 0
 ..._batched_infer_fp16_has_causalmask_no_attnbias_maxk_256.cpp} | 0
 ...a_batched_infer_fp16_has_causalmask_no_attnbias_maxk_32.cpp} | 0
 ...a_batched_infer_fp16_has_causalmask_no_attnbias_maxk_64.cpp} | 0
 ..._batched_infer_fp16_no_causalmask_has_attnbias_maxk_128.cpp} | 0
 ..._batched_infer_fp16_no_causalmask_has_attnbias_maxk_256.cpp} | 0
 ...a_batched_infer_fp16_no_causalmask_has_attnbias_maxk_32.cpp} | 0
 ...a_batched_infer_fp16_no_causalmask_has_attnbias_maxk_64.cpp} | 0
 ...a_batched_infer_fp16_no_causalmask_no_attnbias_maxk_128.cpp} | 0
 ...a_batched_infer_fp16_no_causalmask_no_attnbias_maxk_256.cpp} | 0
 ...ha_batched_infer_fp16_no_causalmask_no_attnbias_maxk_32.cpp} | 0
 ...ha_batched_infer_fp16_no_causalmask_no_attnbias_maxk_64.cpp} | 0
 ..._bp16_has_causalmask_has_attnbias_has_biasgrad_maxk_128.cpp} | 0
 ...d_bp16_has_causalmask_has_attnbias_has_biasgrad_maxk_32.cpp} | 0
 ...d_bp16_has_causalmask_has_attnbias_has_biasgrad_maxk_64.cpp} | 0
 ...d_bp16_has_causalmask_has_attnbias_no_biasgrad_maxk_128.cpp} | 0
 ...rd_bp16_has_causalmask_has_attnbias_no_biasgrad_maxk_32.cpp} | 0
 ...rd_bp16_has_causalmask_has_attnbias_no_biasgrad_maxk_64.cpp} | 0
 ...ouped_backward_bp16_has_causalmask_no_attnbias_maxk_128.cpp} | 0
 ...rouped_backward_bp16_has_causalmask_no_attnbias_maxk_32.cpp} | 0
 ...rouped_backward_bp16_has_causalmask_no_attnbias_maxk_64.cpp} | 0
 ...d_bp16_no_causalmask_has_attnbias_has_biasgrad_maxk_128.cpp} | 0
 ...rd_bp16_no_causalmask_has_attnbias_has_biasgrad_maxk_32.cpp} | 0
 ...rd_bp16_no_causalmask_has_attnbias_has_biasgrad_maxk_64.cpp} | 0
 ...rd_bp16_no_causalmask_has_attnbias_no_biasgrad_maxk_128.cpp} | 0
 ...ard_bp16_no_causalmask_has_attnbias_no_biasgrad_maxk_32.cpp} | 0
 ...ard_bp16_no_causalmask_has_attnbias_no_biasgrad_maxk_64.cpp} | 0
 ...rouped_backward_bp16_no_causalmask_no_attnbias_maxk_128.cpp} | 0
 ...grouped_backward_bp16_no_causalmask_no_attnbias_maxk_32.cpp} | 0
 ...grouped_backward_bp16_no_causalmask_no_attnbias_maxk_64.cpp} | 0
 ..._fp16_has_causalmask_has_attnbias_has_biasgrad_maxk_128.cpp} | 0
 ...d_fp16_has_causalmask_has_attnbias_has_biasgrad_maxk_32.cpp} | 0
 ...d_fp16_has_causalmask_has_attnbias_has_biasgrad_maxk_64.cpp} | 0
 ...d_fp16_has_causalmask_has_attnbias_no_biasgrad_maxk_128.cpp} | 0
 ...rd_fp16_has_causalmask_has_attnbias_no_biasgrad_maxk_32.cpp} | 0
 ...rd_fp16_has_causalmask_has_attnbias_no_biasgrad_maxk_64.cpp} | 0
 ...ouped_backward_fp16_has_causalmask_no_attnbias_maxk_128.cpp} | 0
 ...rouped_backward_fp16_has_causalmask_no_attnbias_maxk_32.cpp} | 0
 ...rouped_backward_fp16_has_causalmask_no_attnbias_maxk_64.cpp} | 0
 ...d_fp16_no_causalmask_has_attnbias_has_biasgrad_maxk_128.cpp} | 0
 ...rd_fp16_no_causalmask_has_attnbias_has_biasgrad_maxk_32.cpp} | 0
 ...rd_fp16_no_causalmask_has_attnbias_has_biasgrad_maxk_64.cpp} | 0
 ...rd_fp16_no_causalmask_has_attnbias_no_biasgrad_maxk_128.cpp} | 0
 ...ard_fp16_no_causalmask_has_attnbias_no_biasgrad_maxk_32.cpp} | 0
 ...ard_fp16_no_causalmask_has_attnbias_no_biasgrad_maxk_64.cpp} | 0
 ...rouped_backward_fp16_no_causalmask_no_attnbias_maxk_128.cpp} | 0
 ...grouped_backward_fp16_no_causalmask_no_attnbias_maxk_32.cpp} | 0
 ...grouped_backward_fp16_no_causalmask_no_attnbias_maxk_64.cpp} | 0
 ...ouped_forward_bp16_has_causalmask_has_attnbias_maxk_128.cpp} | 0
 ...ouped_forward_bp16_has_causalmask_has_attnbias_maxk_256.cpp} | 0
 ...rouped_forward_bp16_has_causalmask_has_attnbias_maxk_32.cpp} | 0
 ...rouped_forward_bp16_has_causalmask_has_attnbias_maxk_64.cpp} | 0
 ...rouped_forward_bp16_has_causalmask_no_attnbias_maxk_128.cpp} | 0
 ...rouped_forward_bp16_has_causalmask_no_attnbias_maxk_256.cpp} | 0
 ...grouped_forward_bp16_has_causalmask_no_attnbias_maxk_32.cpp} | 0
 ...grouped_forward_bp16_has_causalmask_no_attnbias_maxk_64.cpp} | 0
 ...rouped_forward_bp16_no_causalmask_has_attnbias_maxk_128.cpp} | 0
 ...rouped_forward_bp16_no_causalmask_has_attnbias_maxk_256.cpp} | 0
 ...grouped_forward_bp16_no_causalmask_has_attnbias_maxk_32.cpp} | 0
 ...grouped_forward_bp16_no_causalmask_has_attnbias_maxk_64.cpp} | 0
 ...grouped_forward_bp16_no_causalmask_no_attnbias_maxk_128.cpp} | 0
 ...grouped_forward_bp16_no_causalmask_no_attnbias_maxk_256.cpp} | 0
 ..._grouped_forward_bp16_no_causalmask_no_attnbias_maxk_32.cpp} | 0
 ..._grouped_forward_bp16_no_causalmask_no_attnbias_maxk_64.cpp} | 0
 ...ouped_forward_fp16_has_causalmask_has_attnbias_maxk_128.cpp} | 0
 ...ouped_forward_fp16_has_causalmask_has_attnbias_maxk_256.cpp} | 0
 ...rouped_forward_fp16_has_causalmask_has_attnbias_maxk_32.cpp} | 0
 ...rouped_forward_fp16_has_causalmask_has_attnbias_maxk_64.cpp} | 0
 ...rouped_forward_fp16_has_causalmask_no_attnbias_maxk_128.cpp} | 0
 ...rouped_forward_fp16_has_causalmask_no_attnbias_maxk_256.cpp} | 0
 ...grouped_forward_fp16_has_causalmask_no_attnbias_maxk_32.cpp} | 0
 ...grouped_forward_fp16_has_causalmask_no_attnbias_maxk_64.cpp} | 0
 ...rouped_forward_fp16_no_causalmask_has_attnbias_maxk_128.cpp} | 0
 ...rouped_forward_fp16_no_causalmask_has_attnbias_maxk_256.cpp} | 0
 ...grouped_forward_fp16_no_causalmask_has_attnbias_maxk_32.cpp} | 0
 ...grouped_forward_fp16_no_causalmask_has_attnbias_maxk_64.cpp} | 0
 ...grouped_forward_fp16_no_causalmask_no_attnbias_maxk_128.cpp} | 0
 ...grouped_forward_fp16_no_causalmask_no_attnbias_maxk_256.cpp} | 0
 ..._grouped_forward_fp16_no_causalmask_no_attnbias_maxk_32.cpp} | 0
 ..._grouped_forward_fp16_no_causalmask_no_attnbias_maxk_64.cpp} | 0
 ...grouped_infer_bp16_has_causalmask_has_attnbias_maxk_128.cpp} | 0
 ...grouped_infer_bp16_has_causalmask_has_attnbias_maxk_256.cpp} | 0
 ..._grouped_infer_bp16_has_causalmask_has_attnbias_maxk_32.cpp} | 0
 ..._grouped_infer_bp16_has_causalmask_has_attnbias_maxk_64.cpp} | 0
 ..._grouped_infer_bp16_has_causalmask_no_attnbias_maxk_128.cpp} | 0
 ..._grouped_infer_bp16_has_causalmask_no_attnbias_maxk_256.cpp} | 0
 ...a_grouped_infer_bp16_has_causalmask_no_attnbias_maxk_32.cpp} | 0
 ...a_grouped_infer_bp16_has_causalmask_no_attnbias_maxk_64.cpp} | 0
 ..._grouped_infer_bp16_no_causalmask_has_attnbias_maxk_128.cpp} | 0
 ..._grouped_infer_bp16_no_causalmask_has_attnbias_maxk_256.cpp} | 0
 ...a_grouped_infer_bp16_no_causalmask_has_attnbias_maxk_32.cpp} | 0
 ...a_grouped_infer_bp16_no_causalmask_has_attnbias_maxk_64.cpp} | 0
 ...a_grouped_infer_bp16_no_causalmask_no_attnbias_maxk_128.cpp} | 0
 ...a_grouped_infer_bp16_no_causalmask_no_attnbias_maxk_256.cpp} | 0
 ...ha_grouped_infer_bp16_no_causalmask_no_attnbias_maxk_32.cpp} | 0
 ...ha_grouped_infer_bp16_no_causalmask_no_attnbias_maxk_64.cpp} | 0
 ...grouped_infer_fp16_has_causalmask_has_attnbias_maxk_128.cpp} | 0
 ...grouped_infer_fp16_has_causalmask_has_attnbias_maxk_256.cpp} | 0
 ..._grouped_infer_fp16_has_causalmask_has_attnbias_maxk_32.cpp} | 0
 ..._grouped_infer_fp16_has_causalmask_has_attnbias_maxk_64.cpp} | 0
 ..._grouped_infer_fp16_has_causalmask_no_attnbias_maxk_128.cpp} | 0
 ..._grouped_infer_fp16_has_causalmask_no_attnbias_maxk_256.cpp} | 0
 ...a_grouped_infer_fp16_has_causalmask_no_attnbias_maxk_32.cpp} | 0
 ...a_grouped_infer_fp16_has_causalmask_no_attnbias_maxk_64.cpp} | 0
 ..._grouped_infer_fp16_no_causalmask_has_attnbias_maxk_128.cpp} | 0
 ..._grouped_infer_fp16_no_causalmask_has_attnbias_maxk_256.cpp} | 0
 ...a_grouped_infer_fp16_no_causalmask_has_attnbias_maxk_32.cpp} | 0
 ...a_grouped_infer_fp16_no_causalmask_has_attnbias_maxk_64.cpp} | 0
 ...a_grouped_infer_fp16_no_causalmask_no_attnbias_maxk_128.cpp} | 0
 ...a_grouped_infer_fp16_no_causalmask_no_attnbias_maxk_256.cpp} | 0
 ...ha_grouped_infer_fp16_no_causalmask_no_attnbias_maxk_32.cpp} | 0
 ...ha_grouped_infer_fp16_no_causalmask_no_attnbias_maxk_64.cpp} | 0
 201 files changed, 1 insertion(+), 1 deletion(-)
 rename xformers/csrc/attention/hip_fmha/instances/{ck_tiled_fmha_batched_backward_bp16_with_causalmask_with_attnbias_with_biasgrad_maxk_128.cpp => fmha_batched_backward_bp16_has_causalmask_has_attnbias_has_biasgrad_maxk_128.cpp} (100%)
 rename xformers/csrc/attention/hip_fmha/instances/{ck_tiled_fmha_batched_backward_bp16_with_causalmask_with_attnbias_with_biasgrad_maxk_32.cpp => fmha_batched_backward_bp16_has_causalmask_has_attnbias_has_biasgrad_maxk_32.cpp} (100%)
 rename xformers/csrc/attention/hip_fmha/instances/{ck_tiled_fmha_batched_backward_bp16_with_causalmask_with_attnbias_with_biasgrad_maxk_64.cpp => fmha_batched_backward_bp16_has_causalmask_has_attnbias_has_biasgrad_maxk_64.cpp} (100%)
 rename xformers/csrc/attention/hip_fmha/instances/{ck_tiled_fmha_batched_backward_bp16_with_causalmask_with_attnbias_no_biasgrad_maxk_128.cpp => fmha_batched_backward_bp16_has_causalmask_has_attnbias_no_biasgrad_maxk_128.cpp} (100%)
 rename xformers/csrc/attention/hip_fmha/instances/{ck_tiled_fmha_batched_backward_bp16_with_causalmask_with_attnbias_no_biasgrad_maxk_32.cpp => fmha_batched_backward_bp16_has_causalmask_has_attnbias_no_biasgrad_maxk_32.cpp} (100%)
 rename xformers/csrc/attention/hip_fmha/instances/{ck_tiled_fmha_batched_backward_bp16_with_causalmask_with_attnbias_no_biasgrad_maxk_64.cpp => fmha_batched_backward_bp16_has_causalmask_has_attnbias_no_biasgrad_maxk_64.cpp} (100%)
 rename xformers/csrc/attention/hip_fmha/instances/{ck_tiled_fmha_batched_backward_bp16_with_causalmask_no_attnbias_maxk_128.cpp => fmha_batched_backward_bp16_has_causalmask_no_attnbias_maxk_128.cpp} (100%)
 rename xformers/csrc/attention/hip_fmha/instances/{ck_tiled_fmha_batched_backward_bp16_with_causalmask_no_attnbias_maxk_32.cpp => fmha_batched_backward_bp16_has_causalmask_no_attnbias_maxk_32.cpp} (100%)
 rename xformers/csrc/attention/hip_fmha/instances/{ck_tiled_fmha_batched_backward_bp16_with_causalmask_no_attnbias_maxk_64.cpp => fmha_batched_backward_bp16_has_causalmask_no_attnbias_maxk_64.cpp} (100%)
 rename xformers/csrc/attention/hip_fmha/instances/{ck_tiled_fmha_batched_backward_bp16_no_causalmask_with_attnbias_with_biasgrad_maxk_128.cpp => fmha_batched_backward_bp16_no_causalmask_has_attnbias_has_biasgrad_maxk_128.cpp} (100%)
 rename xformers/csrc/attention/hip_fmha/instances/{ck_tiled_fmha_batched_backward_bp16_no_causalmask_with_attnbias_with_biasgrad_maxk_32.cpp => fmha_batched_backward_bp16_no_causalmask_has_attnbias_has_biasgrad_maxk_32.cpp} (100%)
 rename xformers/csrc/attention/hip_fmha/instances/{ck_tiled_fmha_batched_backward_bp16_no_causalmask_with_attnbias_with_biasgrad_maxk_64.cpp => fmha_batched_backward_bp16_no_causalmask_has_attnbias_has_biasgrad_maxk_64.cpp} (100%)
 rename xformers/csrc/attention/hip_fmha/instances/{ck_tiled_fmha_batched_backward_bp16_no_causalmask_with_attnbias_no_biasgrad_maxk_128.cpp => fmha_batched_backward_bp16_no_causalmask_has_attnbias_no_biasgrad_maxk_128.cpp} (100%)
 rename xformers/csrc/attention/hip_fmha/instances/{ck_tiled_fmha_batched_backward_bp16_no_causalmask_with_attnbias_no_biasgrad_maxk_32.cpp => fmha_batched_backward_bp16_no_causalmask_has_attnbias_no_biasgrad_maxk_32.cpp} (100%)
 rename xformers/csrc/attention/hip_fmha/instances/{ck_tiled_fmha_batched_backward_bp16_no_causalmask_with_attnbias_no_biasgrad_maxk_64.cpp => fmha_batched_backward_bp16_no_causalmask_has_attnbias_no_biasgrad_maxk_64.cpp} (100%)
 rename xformers/csrc/attention/hip_fmha/instances/{ck_tiled_fmha_batched_backward_bp16_no_causalmask_no_attnbias_maxk_128.cpp => fmha_batched_backward_bp16_no_causalmask_no_attnbias_maxk_128.cpp} (100%)
 rename xformers/csrc/attention/hip_fmha/instances/{ck_tiled_fmha_batched_backward_bp16_no_causalmask_no_attnbias_maxk_32.cpp => fmha_batched_backward_bp16_no_causalmask_no_attnbias_maxk_32.cpp} (100%)
 rename xformers/csrc/attention/hip_fmha/instances/{ck_tiled_fmha_batched_backward_bp16_no_causalmask_no_attnbias_maxk_64.cpp => fmha_batched_backward_bp16_no_causalmask_no_attnbias_maxk_64.cpp} (100%)
 rename xformers/csrc/attention/hip_fmha/instances/{ck_tiled_fmha_batched_backward_fp16_with_causalmask_with_attnbias_with_biasgrad_maxk_128.cpp => fmha_batched_backward_fp16_has_causalmask_has_attnbias_has_biasgrad_maxk_128.cpp} (100%)
 rename xformers/csrc/attention/hip_fmha/instances/{ck_tiled_fmha_batched_backward_fp16_with_causalmask_with_attnbias_with_biasgrad_maxk_32.cpp => fmha_batched_backward_fp16_has_causalmask_has_attnbias_has_biasgrad_maxk_32.cpp} (100%)
 rename xformers/csrc/attention/hip_fmha/instances/{ck_tiled_fmha_batched_backward_fp16_with_causalmask_with_attnbias_with_biasgrad_maxk_64.cpp => fmha_batched_backward_fp16_has_causalmask_has_attnbias_has_biasgrad_maxk_64.cpp} (100%)
 rename xformers/csrc/attention/hip_fmha/instances/{ck_tiled_fmha_batched_backward_fp16_with_causalmask_with_attnbias_no_biasgrad_maxk_128.cpp => fmha_batched_backward_fp16_has_causalmask_has_attnbias_no_biasgrad_maxk_128.cpp} (100%)
 rename xformers/csrc/attention/hip_fmha/instances/{ck_tiled_fmha_batched_backward_fp16_with_causalmask_with_attnbias_no_biasgrad_maxk_32.cpp => fmha_batched_backward_fp16_has_causalmask_has_attnbias_no_biasgrad_maxk_32.cpp} (100%)
 rename xformers/csrc/attention/hip_fmha/instances/{ck_tiled_fmha_batched_backward_fp16_with_causalmask_with_attnbias_no_biasgrad_maxk_64.cpp => fmha_batched_backward_fp16_has_causalmask_has_attnbias_no_biasgrad_maxk_64.cpp} (100%)
 rename xformers/csrc/attention/hip_fmha/instances/{ck_tiled_fmha_batched_backward_fp16_with_causalmask_no_attnbias_maxk_128.cpp => fmha_batched_backward_fp16_has_causalmask_no_attnbias_maxk_128.cpp} (100%)
 rename xformers/csrc/attention/hip_fmha/instances/{ck_tiled_fmha_batched_backward_fp16_with_causalmask_no_attnbias_maxk_32.cpp => fmha_batched_backward_fp16_has_causalmask_no_attnbias_maxk_32.cpp} (100%)
 rename xformers/csrc/attention/hip_fmha/instances/{ck_tiled_fmha_batched_backward_fp16_with_causalmask_no_attnbias_maxk_64.cpp => fmha_batched_backward_fp16_has_causalmask_no_attnbias_maxk_64.cpp} (100%)
 rename xformers/csrc/attention/hip_fmha/instances/{ck_tiled_fmha_batched_backward_fp16_no_causalmask_with_attnbias_with_biasgrad_maxk_128.cpp => fmha_batched_backward_fp16_no_causalmask_has_attnbias_has_biasgrad_maxk_128.cpp} (100%)
 rename xformers/csrc/attention/hip_fmha/instances/{ck_tiled_fmha_batched_backward_fp16_no_causalmask_with_attnbias_with_biasgrad_maxk_32.cpp => fmha_batched_backward_fp16_no_causalmask_has_attnbias_has_biasgrad_maxk_32.cpp} (100%)
 rename xformers/csrc/attention/hip_fmha/instances/{ck_tiled_fmha_batched_backward_fp16_no_causalmask_with_attnbias_with_biasgrad_maxk_64.cpp => fmha_batched_backward_fp16_no_causalmask_has_attnbias_has_biasgrad_maxk_64.cpp} (100%)
 rename xformers/csrc/attention/hip_fmha/instances/{ck_tiled_fmha_batched_backward_fp16_no_causalmask_with_attnbias_no_biasgrad_maxk_128.cpp => fmha_batched_backward_fp16_no_causalmask_has_attnbias_no_biasgrad_maxk_128.cpp} (100%)
 rename xformers/csrc/attention/hip_fmha/instances/{ck_tiled_fmha_batched_backward_fp16_no_causalmask_with_attnbias_no_biasgrad_maxk_32.cpp => fmha_batched_backward_fp16_no_causalmask_has_attnbias_no_biasgrad_maxk_32.cpp} (100%)
 rename xformers/csrc/attention/hip_fmha/instances/{ck_tiled_fmha_batched_backward_fp16_no_causalmask_with_attnbias_no_biasgrad_maxk_64.cpp => fmha_batched_backward_fp16_no_causalmask_has_attnbias_no_biasgrad_maxk_64.cpp} (100%)
 rename xformers/csrc/attention/hip_fmha/instances/{ck_tiled_fmha_batched_backward_fp16_no_causalmask_no_attnbias_maxk_128.cpp => fmha_batched_backward_fp16_no_causalmask_no_attnbias_maxk_128.cpp} (100%)
 rename xformers/csrc/attention/hip_fmha/instances/{ck_tiled_fmha_batched_backward_fp16_no_causalmask_no_attnbias_maxk_32.cpp => fmha_batched_backward_fp16_no_causalmask_no_attnbias_maxk_32.cpp} (100%)
 rename xformers/csrc/attention/hip_fmha/instances/{ck_tiled_fmha_batched_backward_fp16_no_causalmask_no_attnbias_maxk_64.cpp => fmha_batched_backward_fp16_no_causalmask_no_attnbias_maxk_64.cpp} (100%)
 rename xformers/csrc/attention/hip_fmha/instances/{ck_tiled_fmha_batched_forward_bp16_with_causalmask_with_attnbias_maxk_128.cpp => fmha_batched_forward_bp16_has_causalmask_has_attnbias_maxk_128.cpp} (100%)
 rename xformers/csrc/attention/hip_fmha/instances/{ck_tiled_fmha_batched_forward_bp16_with_causalmask_with_attnbias_maxk_256.cpp => fmha_batched_forward_bp16_has_causalmask_has_attnbias_maxk_256.cpp} (100%)
 rename xformers/csrc/attention/hip_fmha/instances/{ck_tiled_fmha_batched_forward_bp16_with_causalmask_with_attnbias_maxk_32.cpp => fmha_batched_forward_bp16_has_causalmask_has_attnbias_maxk_32.cpp} (100%)
 rename xformers/csrc/attention/hip_fmha/instances/{ck_tiled_fmha_batched_forward_bp16_with_causalmask_with_attnbias_maxk_64.cpp => fmha_batched_forward_bp16_has_causalmask_has_attnbias_maxk_64.cpp} (100%)
 rename xformers/csrc/attention/hip_fmha/instances/{ck_tiled_fmha_batched_forward_bp16_with_causalmask_no_attnbias_maxk_128.cpp => fmha_batched_forward_bp16_has_causalmask_no_attnbias_maxk_128.cpp} (100%)
 rename xformers/csrc/attention/hip_fmha/instances/{ck_tiled_fmha_batched_forward_bp16_with_causalmask_no_attnbias_maxk_256.cpp => fmha_batched_forward_bp16_has_causalmask_no_attnbias_maxk_256.cpp} (100%)
 rename xformers/csrc/attention/hip_fmha/instances/{ck_tiled_fmha_batched_forward_bp16_with_causalmask_no_attnbias_maxk_32.cpp => fmha_batched_forward_bp16_has_causalmask_no_attnbias_maxk_32.cpp} (100%)
 rename xformers/csrc/attention/hip_fmha/instances/{ck_tiled_fmha_batched_forward_bp16_with_causalmask_no_attnbias_maxk_64.cpp => fmha_batched_forward_bp16_has_causalmask_no_attnbias_maxk_64.cpp} (100%)
 rename xformers/csrc/attention/hip_fmha/instances/{ck_tiled_fmha_batched_forward_bp16_no_causalmask_with_attnbias_maxk_128.cpp => fmha_batched_forward_bp16_no_causalmask_has_attnbias_maxk_128.cpp} (100%)
 rename xformers/csrc/attention/hip_fmha/instances/{ck_tiled_fmha_batched_forward_bp16_no_causalmask_with_attnbias_maxk_256.cpp => fmha_batched_forward_bp16_no_causalmask_has_attnbias_maxk_256.cpp} (100%)
 rename xformers/csrc/attention/hip_fmha/instances/{ck_tiled_fmha_batched_forward_bp16_no_causalmask_with_attnbias_maxk_32.cpp => fmha_batched_forward_bp16_no_causalmask_has_attnbias_maxk_32.cpp} (100%)
 rename xformers/csrc/attention/hip_fmha/instances/{ck_tiled_fmha_batched_forward_bp16_no_causalmask_with_attnbias_maxk_64.cpp => fmha_batched_forward_bp16_no_causalmask_has_attnbias_maxk_64.cpp} (100%)
 rename xformers/csrc/attention/hip_fmha/instances/{ck_tiled_fmha_batched_forward_bp16_no_causalmask_no_attnbias_maxk_128.cpp => fmha_batched_forward_bp16_no_causalmask_no_attnbias_maxk_128.cpp} (100%)
 rename xformers/csrc/attention/hip_fmha/instances/{ck_tiled_fmha_batched_forward_bp16_no_causalmask_no_attnbias_maxk_256.cpp => fmha_batched_forward_bp16_no_causalmask_no_attnbias_maxk_256.cpp} (100%)
 rename xformers/csrc/attention/hip_fmha/instances/{ck_tiled_fmha_batched_forward_bp16_no_causalmask_no_attnbias_maxk_32.cpp => fmha_batched_forward_bp16_no_causalmask_no_attnbias_maxk_32.cpp} (100%)
 rename xformers/csrc/attention/hip_fmha/instances/{ck_tiled_fmha_batched_forward_bp16_no_causalmask_no_attnbias_maxk_64.cpp => fmha_batched_forward_bp16_no_causalmask_no_attnbias_maxk_64.cpp} (100%)
 rename xformers/csrc/attention/hip_fmha/instances/{ck_tiled_fmha_batched_forward_fp16_with_causalmask_with_attnbias_maxk_128.cpp => fmha_batched_forward_fp16_has_causalmask_has_attnbias_maxk_128.cpp} (100%)
 rename xformers/csrc/attention/hip_fmha/instances/{ck_tiled_fmha_batched_forward_fp16_with_causalmask_with_attnbias_maxk_256.cpp => fmha_batched_forward_fp16_has_causalmask_has_attnbias_maxk_256.cpp} (100%)
 rename xformers/csrc/attention/hip_fmha/instances/{ck_tiled_fmha_batched_forward_fp16_with_causalmask_with_attnbias_maxk_32.cpp => fmha_batched_forward_fp16_has_causalmask_has_attnbias_maxk_32.cpp} (100%)
 rename xformers/csrc/attention/hip_fmha/instances/{ck_tiled_fmha_batched_forward_fp16_with_causalmask_with_attnbias_maxk_64.cpp => fmha_batched_forward_fp16_has_causalmask_has_attnbias_maxk_64.cpp} (100%)
 rename xformers/csrc/attention/hip_fmha/instances/{ck_tiled_fmha_batched_forward_fp16_with_causalmask_no_attnbias_maxk_128.cpp => fmha_batched_forward_fp16_has_causalmask_no_attnbias_maxk_128.cpp} (100%)
 rename xformers/csrc/attention/hip_fmha/instances/{ck_tiled_fmha_batched_forward_fp16_with_causalmask_no_attnbias_maxk_256.cpp => fmha_batched_forward_fp16_has_causalmask_no_attnbias_maxk_256.cpp} (100%)
 rename xformers/csrc/attention/hip_fmha/instances/{ck_tiled_fmha_batched_forward_fp16_with_causalmask_no_attnbias_maxk_32.cpp => fmha_batched_forward_fp16_has_causalmask_no_attnbias_maxk_32.cpp} (100%)
 rename xformers/csrc/attention/hip_fmha/instances/{ck_tiled_fmha_batched_forward_fp16_with_causalmask_no_attnbias_maxk_64.cpp => fmha_batched_forward_fp16_has_causalmask_no_attnbias_maxk_64.cpp} (100%)
 rename xformers/csrc/attention/hip_fmha/instances/{ck_tiled_fmha_batched_forward_fp16_no_causalmask_with_attnbias_maxk_128.cpp => fmha_batched_forward_fp16_no_causalmask_has_attnbias_maxk_128.cpp} (100%)
 rename xformers/csrc/attention/hip_fmha/instances/{ck_tiled_fmha_batched_forward_fp16_no_causalmask_with_attnbias_maxk_256.cpp => fmha_batched_forward_fp16_no_causalmask_has_attnbias_maxk_256.cpp} (100%)
 rename xformers/csrc/attention/hip_fmha/instances/{ck_tiled_fmha_batched_forward_fp16_no_causalmask_with_attnbias_maxk_32.cpp => fmha_batched_forward_fp16_no_causalmask_has_attnbias_maxk_32.cpp} (100%)
 rename xformers/csrc/attention/hip_fmha/instances/{ck_tiled_fmha_batched_forward_fp16_no_causalmask_with_attnbias_maxk_64.cpp => fmha_batched_forward_fp16_no_causalmask_has_attnbias_maxk_64.cpp} (100%)
 rename xformers/csrc/attention/hip_fmha/instances/{ck_tiled_fmha_batched_forward_fp16_no_causalmask_no_attnbias_maxk_128.cpp => fmha_batched_forward_fp16_no_causalmask_no_attnbias_maxk_128.cpp} (100%)
 rename xformers/csrc/attention/hip_fmha/instances/{ck_tiled_fmha_batched_forward_fp16_no_causalmask_no_attnbias_maxk_256.cpp => fmha_batched_forward_fp16_no_causalmask_no_attnbias_maxk_256.cpp} (100%)
 rename xformers/csrc/attention/hip_fmha/instances/{ck_tiled_fmha_batched_forward_fp16_no_causalmask_no_attnbias_maxk_32.cpp => fmha_batched_forward_fp16_no_causalmask_no_attnbias_maxk_32.cpp} (100%)
 rename xformers/csrc/attention/hip_fmha/instances/{ck_tiled_fmha_batched_forward_fp16_no_causalmask_no_attnbias_maxk_64.cpp => fmha_batched_forward_fp16_no_causalmask_no_attnbias_maxk_64.cpp} (100%)
 rename xformers/csrc/attention/hip_fmha/instances/{ck_tiled_fmha_batched_infer_bp16_with_causalmask_with_attnbias_maxk_128.cpp => fmha_batched_infer_bp16_has_causalmask_has_attnbias_maxk_128.cpp} (100%)
 rename xformers/csrc/attention/hip_fmha/instances/{ck_tiled_fmha_batched_infer_bp16_with_causalmask_with_attnbias_maxk_256.cpp => fmha_batched_infer_bp16_has_causalmask_has_attnbias_maxk_256.cpp} (100%)
 rename xformers/csrc/attention/hip_fmha/instances/{ck_tiled_fmha_batched_infer_bp16_with_causalmask_with_attnbias_maxk_32.cpp => fmha_batched_infer_bp16_has_causalmask_has_attnbias_maxk_32.cpp} (100%)
 rename xformers/csrc/attention/hip_fmha/instances/{ck_tiled_fmha_batched_infer_bp16_with_causalmask_with_attnbias_maxk_64.cpp => fmha_batched_infer_bp16_has_causalmask_has_attnbias_maxk_64.cpp} (100%)
 rename xformers/csrc/attention/hip_fmha/instances/{ck_tiled_fmha_batched_infer_bp16_with_causalmask_no_attnbias_maxk_128.cpp => fmha_batched_infer_bp16_has_causalmask_no_attnbias_maxk_128.cpp} (100%)
 rename xformers/csrc/attention/hip_fmha/instances/{ck_tiled_fmha_batched_infer_bp16_with_causalmask_no_attnbias_maxk_256.cpp => fmha_batched_infer_bp16_has_causalmask_no_attnbias_maxk_256.cpp} (100%)
 rename xformers/csrc/attention/hip_fmha/instances/{ck_tiled_fmha_batched_infer_bp16_with_causalmask_no_attnbias_maxk_32.cpp => fmha_batched_infer_bp16_has_causalmask_no_attnbias_maxk_32.cpp} (100%)
 rename xformers/csrc/attention/hip_fmha/instances/{ck_tiled_fmha_batched_infer_bp16_with_causalmask_no_attnbias_maxk_64.cpp => fmha_batched_infer_bp16_has_causalmask_no_attnbias_maxk_64.cpp} (100%)
 rename xformers/csrc/attention/hip_fmha/instances/{ck_tiled_fmha_batched_infer_bp16_no_causalmask_with_attnbias_maxk_128.cpp => fmha_batched_infer_bp16_no_causalmask_has_attnbias_maxk_128.cpp} (100%)
 rename xformers/csrc/attention/hip_fmha/instances/{ck_tiled_fmha_batched_infer_bp16_no_causalmask_with_attnbias_maxk_256.cpp => fmha_batched_infer_bp16_no_causalmask_has_attnbias_maxk_256.cpp} (100%)
 rename xformers/csrc/attention/hip_fmha/instances/{ck_tiled_fmha_batched_infer_bp16_no_causalmask_with_attnbias_maxk_32.cpp => fmha_batched_infer_bp16_no_causalmask_has_attnbias_maxk_32.cpp} (100%)
 rename xformers/csrc/attention/hip_fmha/instances/{ck_tiled_fmha_batched_infer_bp16_no_causalmask_with_attnbias_maxk_64.cpp => fmha_batched_infer_bp16_no_causalmask_has_attnbias_maxk_64.cpp} (100%)
 rename xformers/csrc/attention/hip_fmha/instances/{ck_tiled_fmha_batched_infer_bp16_no_causalmask_no_attnbias_maxk_128.cpp => fmha_batched_infer_bp16_no_causalmask_no_attnbias_maxk_128.cpp} (100%)
 rename xformers/csrc/attention/hip_fmha/instances/{ck_tiled_fmha_batched_infer_bp16_no_causalmask_no_attnbias_maxk_256.cpp => fmha_batched_infer_bp16_no_causalmask_no_attnbias_maxk_256.cpp} (100%)
 rename xformers/csrc/attention/hip_fmha/instances/{ck_tiled_fmha_batched_infer_bp16_no_causalmask_no_attnbias_maxk_32.cpp => fmha_batched_infer_bp16_no_causalmask_no_attnbias_maxk_32.cpp} (100%)
 rename xformers/csrc/attention/hip_fmha/instances/{ck_tiled_fmha_batched_infer_bp16_no_causalmask_no_attnbias_maxk_64.cpp => fmha_batched_infer_bp16_no_causalmask_no_attnbias_maxk_64.cpp} (100%)
 rename xformers/csrc/attention/hip_fmha/instances/{ck_tiled_fmha_batched_infer_fp16_with_causalmask_with_attnbias_maxk_128.cpp => fmha_batched_infer_fp16_has_causalmask_has_attnbias_maxk_128.cpp} (100%)
 rename xformers/csrc/attention/hip_fmha/instances/{ck_tiled_fmha_batched_infer_fp16_with_causalmask_with_attnbias_maxk_256.cpp => fmha_batched_infer_fp16_has_causalmask_has_attnbias_maxk_256.cpp} (100%)
 rename xformers/csrc/attention/hip_fmha/instances/{ck_tiled_fmha_batched_infer_fp16_with_causalmask_with_attnbias_maxk_32.cpp => fmha_batched_infer_fp16_has_causalmask_has_attnbias_maxk_32.cpp} (100%)
 rename xformers/csrc/attention/hip_fmha/instances/{ck_tiled_fmha_batched_infer_fp16_with_causalmask_with_attnbias_maxk_64.cpp => fmha_batched_infer_fp16_has_causalmask_has_attnbias_maxk_64.cpp} (100%)
 rename xformers/csrc/attention/hip_fmha/instances/{ck_tiled_fmha_batched_infer_fp16_with_causalmask_no_attnbias_maxk_128.cpp => fmha_batched_infer_fp16_has_causalmask_no_attnbias_maxk_128.cpp} (100%)
 rename xformers/csrc/attention/hip_fmha/instances/{ck_tiled_fmha_batched_infer_fp16_with_causalmask_no_attnbias_maxk_256.cpp => fmha_batched_infer_fp16_has_causalmask_no_attnbias_maxk_256.cpp} (100%)
 rename xformers/csrc/attention/hip_fmha/instances/{ck_tiled_fmha_batched_infer_fp16_with_causalmask_no_attnbias_maxk_32.cpp => fmha_batched_infer_fp16_has_causalmask_no_attnbias_maxk_32.cpp} (100%)
 rename xformers/csrc/attention/hip_fmha/instances/{ck_tiled_fmha_batched_infer_fp16_with_causalmask_no_attnbias_maxk_64.cpp => fmha_batched_infer_fp16_has_causalmask_no_attnbias_maxk_64.cpp} (100%)
 rename xformers/csrc/attention/hip_fmha/instances/{ck_tiled_fmha_batched_infer_fp16_no_causalmask_with_attnbias_maxk_128.cpp => fmha_batched_infer_fp16_no_causalmask_has_attnbias_maxk_128.cpp} (100%)
 rename xformers/csrc/attention/hip_fmha/instances/{ck_tiled_fmha_batched_infer_fp16_no_causalmask_with_attnbias_maxk_256.cpp => fmha_batched_infer_fp16_no_causalmask_has_attnbias_maxk_256.cpp} (100%)
 rename xformers/csrc/attention/hip_fmha/instances/{ck_tiled_fmha_batched_infer_fp16_no_causalmask_with_attnbias_maxk_32.cpp => fmha_batched_infer_fp16_no_causalmask_has_attnbias_maxk_32.cpp} (100%)
 rename xformers/csrc/attention/hip_fmha/instances/{ck_tiled_fmha_batched_infer_fp16_no_causalmask_with_attnbias_maxk_64.cpp => fmha_batched_infer_fp16_no_causalmask_has_attnbias_maxk_64.cpp} (100%)
 rename xformers/csrc/attention/hip_fmha/instances/{ck_tiled_fmha_batched_infer_fp16_no_causalmask_no_attnbias_maxk_128.cpp => fmha_batched_infer_fp16_no_causalmask_no_attnbias_maxk_128.cpp} (100%)
 rename xformers/csrc/attention/hip_fmha/instances/{ck_tiled_fmha_batched_infer_fp16_no_causalmask_no_attnbias_maxk_256.cpp => fmha_batched_infer_fp16_no_causalmask_no_attnbias_maxk_256.cpp} (100%)
 rename xformers/csrc/attention/hip_fmha/instances/{ck_tiled_fmha_batched_infer_fp16_no_causalmask_no_attnbias_maxk_32.cpp => fmha_batched_infer_fp16_no_causalmask_no_attnbias_maxk_32.cpp} (100%)
 rename xformers/csrc/attention/hip_fmha/instances/{ck_tiled_fmha_batched_infer_fp16_no_causalmask_no_attnbias_maxk_64.cpp => fmha_batched_infer_fp16_no_causalmask_no_attnbias_maxk_64.cpp} (100%)
 rename xformers/csrc/attention/hip_fmha/instances/{ck_tiled_fmha_grouped_backward_bp16_with_causalmask_with_attnbias_with_biasgrad_maxk_128.cpp => fmha_grouped_backward_bp16_has_causalmask_has_attnbias_has_biasgrad_maxk_128.cpp} (100%)
 rename xformers/csrc/attention/hip_fmha/instances/{ck_tiled_fmha_grouped_backward_bp16_with_causalmask_with_attnbias_with_biasgrad_maxk_32.cpp => fmha_grouped_backward_bp16_has_causalmask_has_attnbias_has_biasgrad_maxk_32.cpp} (100%)
 rename xformers/csrc/attention/hip_fmha/instances/{ck_tiled_fmha_grouped_backward_bp16_with_causalmask_with_attnbias_with_biasgrad_maxk_64.cpp => fmha_grouped_backward_bp16_has_causalmask_has_attnbias_has_biasgrad_maxk_64.cpp} (100%)
 rename xformers/csrc/attention/hip_fmha/instances/{ck_tiled_fmha_grouped_backward_bp16_with_causalmask_with_attnbias_no_biasgrad_maxk_128.cpp => fmha_grouped_backward_bp16_has_causalmask_has_attnbias_no_biasgrad_maxk_128.cpp} (100%)
 rename xformers/csrc/attention/hip_fmha/instances/{ck_tiled_fmha_grouped_backward_bp16_with_causalmask_with_attnbias_no_biasgrad_maxk_32.cpp => fmha_grouped_backward_bp16_has_causalmask_has_attnbias_no_biasgrad_maxk_32.cpp} (100%)
 rename xformers/csrc/attention/hip_fmha/instances/{ck_tiled_fmha_grouped_backward_bp16_with_causalmask_with_attnbias_no_biasgrad_maxk_64.cpp => fmha_grouped_backward_bp16_has_causalmask_has_attnbias_no_biasgrad_maxk_64.cpp} (100%)
 rename xformers/csrc/attention/hip_fmha/instances/{ck_tiled_fmha_grouped_backward_bp16_with_causalmask_no_attnbias_maxk_128.cpp => fmha_grouped_backward_bp16_has_causalmask_no_attnbias_maxk_128.cpp} (100%)
 rename xformers/csrc/attention/hip_fmha/instances/{ck_tiled_fmha_grouped_backward_bp16_with_causalmask_no_attnbias_maxk_32.cpp => fmha_grouped_backward_bp16_has_causalmask_no_attnbias_maxk_32.cpp} (100%)
 rename xformers/csrc/attention/hip_fmha/instances/{ck_tiled_fmha_grouped_backward_bp16_with_causalmask_no_attnbias_maxk_64.cpp => fmha_grouped_backward_bp16_has_causalmask_no_attnbias_maxk_64.cpp} (100%)
 rename xformers/csrc/attention/hip_fmha/instances/{ck_tiled_fmha_grouped_backward_bp16_no_causalmask_with_attnbias_with_biasgrad_maxk_128.cpp => fmha_grouped_backward_bp16_no_causalmask_has_attnbias_has_biasgrad_maxk_128.cpp} (100%)
 rename xformers/csrc/attention/hip_fmha/instances/{ck_tiled_fmha_grouped_backward_bp16_no_causalmask_with_attnbias_with_biasgrad_maxk_32.cpp => fmha_grouped_backward_bp16_no_causalmask_has_attnbias_has_biasgrad_maxk_32.cpp} (100%)
 rename xformers/csrc/attention/hip_fmha/instances/{ck_tiled_fmha_grouped_backward_bp16_no_causalmask_with_attnbias_with_biasgrad_maxk_64.cpp => fmha_grouped_backward_bp16_no_causalmask_has_attnbias_has_biasgrad_maxk_64.cpp} (100%)
 rename xformers/csrc/attention/hip_fmha/instances/{ck_tiled_fmha_grouped_backward_bp16_no_causalmask_with_attnbias_no_biasgrad_maxk_128.cpp => fmha_grouped_backward_bp16_no_causalmask_has_attnbias_no_biasgrad_maxk_128.cpp} (100%)
 rename xformers/csrc/attention/hip_fmha/instances/{ck_tiled_fmha_grouped_backward_bp16_no_causalmask_with_attnbias_no_biasgrad_maxk_32.cpp => fmha_grouped_backward_bp16_no_causalmask_has_attnbias_no_biasgrad_maxk_32.cpp} (100%)
 rename xformers/csrc/attention/hip_fmha/instances/{ck_tiled_fmha_grouped_backward_bp16_no_causalmask_with_attnbias_no_biasgrad_maxk_64.cpp => fmha_grouped_backward_bp16_no_causalmask_has_attnbias_no_biasgrad_maxk_64.cpp} (100%)
 rename xformers/csrc/attention/hip_fmha/instances/{ck_tiled_fmha_grouped_backward_bp16_no_causalmask_no_attnbias_maxk_128.cpp => fmha_grouped_backward_bp16_no_causalmask_no_attnbias_maxk_128.cpp} (100%)
 rename xformers/csrc/attention/hip_fmha/instances/{ck_tiled_fmha_grouped_backward_bp16_no_causalmask_no_attnbias_maxk_32.cpp => fmha_grouped_backward_bp16_no_causalmask_no_attnbias_maxk_32.cpp} (100%)
 rename xformers/csrc/attention/hip_fmha/instances/{ck_tiled_fmha_grouped_backward_bp16_no_causalmask_no_attnbias_maxk_64.cpp => fmha_grouped_backward_bp16_no_causalmask_no_attnbias_maxk_64.cpp} (100%)
 rename xformers/csrc/attention/hip_fmha/instances/{ck_tiled_fmha_grouped_backward_fp16_with_causalmask_with_attnbias_with_biasgrad_maxk_128.cpp => fmha_grouped_backward_fp16_has_causalmask_has_attnbias_has_biasgrad_maxk_128.cpp} (100%)
 rename xformers/csrc/attention/hip_fmha/instances/{ck_tiled_fmha_grouped_backward_fp16_with_causalmask_with_attnbias_with_biasgrad_maxk_32.cpp => fmha_grouped_backward_fp16_has_causalmask_has_attnbias_has_biasgrad_maxk_32.cpp} (100%)
 rename xformers/csrc/attention/hip_fmha/instances/{ck_tiled_fmha_grouped_backward_fp16_with_causalmask_with_attnbias_with_biasgrad_maxk_64.cpp => fmha_grouped_backward_fp16_has_causalmask_has_attnbias_has_biasgrad_maxk_64.cpp} (100%)
 rename xformers/csrc/attention/hip_fmha/instances/{ck_tiled_fmha_grouped_backward_fp16_with_causalmask_with_attnbias_no_biasgrad_maxk_128.cpp => fmha_grouped_backward_fp16_has_causalmask_has_attnbias_no_biasgrad_maxk_128.cpp} (100%)
 rename xformers/csrc/attention/hip_fmha/instances/{ck_tiled_fmha_grouped_backward_fp16_with_causalmask_with_attnbias_no_biasgrad_maxk_32.cpp => fmha_grouped_backward_fp16_has_causalmask_has_attnbias_no_biasgrad_maxk_32.cpp} (100%)
 rename xformers/csrc/attention/hip_fmha/instances/{ck_tiled_fmha_grouped_backward_fp16_with_causalmask_with_attnbias_no_biasgrad_maxk_64.cpp => fmha_grouped_backward_fp16_has_causalmask_has_attnbias_no_biasgrad_maxk_64.cpp} (100%)
 rename xformers/csrc/attention/hip_fmha/instances/{ck_tiled_fmha_grouped_backward_fp16_with_causalmask_no_attnbias_maxk_128.cpp => fmha_grouped_backward_fp16_has_causalmask_no_attnbias_maxk_128.cpp} (100%)
 rename xformers/csrc/attention/hip_fmha/instances/{ck_tiled_fmha_grouped_backward_fp16_with_causalmask_no_attnbias_maxk_32.cpp => fmha_grouped_backward_fp16_has_causalmask_no_attnbias_maxk_32.cpp} (100%)
 rename xformers/csrc/attention/hip_fmha/instances/{ck_tiled_fmha_grouped_backward_fp16_with_causalmask_no_attnbias_maxk_64.cpp => fmha_grouped_backward_fp16_has_causalmask_no_attnbias_maxk_64.cpp} (100%)
 rename xformers/csrc/attention/hip_fmha/instances/{ck_tiled_fmha_grouped_backward_fp16_no_causalmask_with_attnbias_with_biasgrad_maxk_128.cpp => fmha_grouped_backward_fp16_no_causalmask_has_attnbias_has_biasgrad_maxk_128.cpp} (100%)
 rename xformers/csrc/attention/hip_fmha/instances/{ck_tiled_fmha_grouped_backward_fp16_no_causalmask_with_attnbias_with_biasgrad_maxk_32.cpp => fmha_grouped_backward_fp16_no_causalmask_has_attnbias_has_biasgrad_maxk_32.cpp} (100%)
 rename xformers/csrc/attention/hip_fmha/instances/{ck_tiled_fmha_grouped_backward_fp16_no_causalmask_with_attnbias_with_biasgrad_maxk_64.cpp => fmha_grouped_backward_fp16_no_causalmask_has_attnbias_has_biasgrad_maxk_64.cpp} (100%)
 rename xformers/csrc/attention/hip_fmha/instances/{ck_tiled_fmha_grouped_backward_fp16_no_causalmask_with_attnbias_no_biasgrad_maxk_128.cpp => fmha_grouped_backward_fp16_no_causalmask_has_attnbias_no_biasgrad_maxk_128.cpp} (100%)
 rename xformers/csrc/attention/hip_fmha/instances/{ck_tiled_fmha_grouped_backward_fp16_no_causalmask_with_attnbias_no_biasgrad_maxk_32.cpp => fmha_grouped_backward_fp16_no_causalmask_has_attnbias_no_biasgrad_maxk_32.cpp} (100%)
 rename xformers/csrc/attention/hip_fmha/instances/{ck_tiled_fmha_grouped_backward_fp16_no_causalmask_with_attnbias_no_biasgrad_maxk_64.cpp => fmha_grouped_backward_fp16_no_causalmask_has_attnbias_no_biasgrad_maxk_64.cpp} (100%)
 rename xformers/csrc/attention/hip_fmha/instances/{ck_tiled_fmha_grouped_backward_fp16_no_causalmask_no_attnbias_maxk_128.cpp => fmha_grouped_backward_fp16_no_causalmask_no_attnbias_maxk_128.cpp} (100%)
 rename xformers/csrc/attention/hip_fmha/instances/{ck_tiled_fmha_grouped_backward_fp16_no_causalmask_no_attnbias_maxk_32.cpp => fmha_grouped_backward_fp16_no_causalmask_no_attnbias_maxk_32.cpp} (100%)
 rename xformers/csrc/attention/hip_fmha/instances/{ck_tiled_fmha_grouped_backward_fp16_no_causalmask_no_attnbias_maxk_64.cpp => fmha_grouped_backward_fp16_no_causalmask_no_attnbias_maxk_64.cpp} (100%)
 rename xformers/csrc/attention/hip_fmha/instances/{ck_tiled_fmha_grouped_forward_bp16_with_causalmask_with_attnbias_maxk_128.cpp => fmha_grouped_forward_bp16_has_causalmask_has_attnbias_maxk_128.cpp} (100%)
 rename xformers/csrc/attention/hip_fmha/instances/{ck_tiled_fmha_grouped_forward_bp16_with_causalmask_with_attnbias_maxk_256.cpp => fmha_grouped_forward_bp16_has_causalmask_has_attnbias_maxk_256.cpp} (100%)
 rename xformers/csrc/attention/hip_fmha/instances/{ck_tiled_fmha_grouped_forward_bp16_with_causalmask_with_attnbias_maxk_32.cpp => fmha_grouped_forward_bp16_has_causalmask_has_attnbias_maxk_32.cpp} (100%)
 rename xformers/csrc/attention/hip_fmha/instances/{ck_tiled_fmha_grouped_forward_bp16_with_causalmask_with_attnbias_maxk_64.cpp => fmha_grouped_forward_bp16_has_causalmask_has_attnbias_maxk_64.cpp} (100%)
 rename xformers/csrc/attention/hip_fmha/instances/{ck_tiled_fmha_grouped_forward_bp16_with_causalmask_no_attnbias_maxk_128.cpp => fmha_grouped_forward_bp16_has_causalmask_no_attnbias_maxk_128.cpp} (100%)
 rename xformers/csrc/attention/hip_fmha/instances/{ck_tiled_fmha_grouped_forward_bp16_with_causalmask_no_attnbias_maxk_256.cpp => fmha_grouped_forward_bp16_has_causalmask_no_attnbias_maxk_256.cpp} (100%)
 rename xformers/csrc/attention/hip_fmha/instances/{ck_tiled_fmha_grouped_forward_bp16_with_causalmask_no_attnbias_maxk_32.cpp => fmha_grouped_forward_bp16_has_causalmask_no_attnbias_maxk_32.cpp} (100%)
 rename xformers/csrc/attention/hip_fmha/instances/{ck_tiled_fmha_grouped_forward_bp16_with_causalmask_no_attnbias_maxk_64.cpp => fmha_grouped_forward_bp16_has_causalmask_no_attnbias_maxk_64.cpp} (100%)
 rename xformers/csrc/attention/hip_fmha/instances/{ck_tiled_fmha_grouped_forward_bp16_no_causalmask_with_attnbias_maxk_128.cpp => fmha_grouped_forward_bp16_no_causalmask_has_attnbias_maxk_128.cpp} (100%)
 rename xformers/csrc/attention/hip_fmha/instances/{ck_tiled_fmha_grouped_forward_bp16_no_causalmask_with_attnbias_maxk_256.cpp => fmha_grouped_forward_bp16_no_causalmask_has_attnbias_maxk_256.cpp} (100%)
 rename xformers/csrc/attention/hip_fmha/instances/{ck_tiled_fmha_grouped_forward_bp16_no_causalmask_with_attnbias_maxk_32.cpp => fmha_grouped_forward_bp16_no_causalmask_has_attnbias_maxk_32.cpp} (100%)
 rename xformers/csrc/attention/hip_fmha/instances/{ck_tiled_fmha_grouped_forward_bp16_no_causalmask_with_attnbias_maxk_64.cpp => fmha_grouped_forward_bp16_no_causalmask_has_attnbias_maxk_64.cpp} (100%)
 rename xformers/csrc/attention/hip_fmha/instances/{ck_tiled_fmha_grouped_forward_bp16_no_causalmask_no_attnbias_maxk_128.cpp => fmha_grouped_forward_bp16_no_causalmask_no_attnbias_maxk_128.cpp} (100%)
 rename xformers/csrc/attention/hip_fmha/instances/{ck_tiled_fmha_grouped_forward_bp16_no_causalmask_no_attnbias_maxk_256.cpp => fmha_grouped_forward_bp16_no_causalmask_no_attnbias_maxk_256.cpp} (100%)
 rename xformers/csrc/attention/hip_fmha/instances/{ck_tiled_fmha_grouped_forward_bp16_no_causalmask_no_attnbias_maxk_32.cpp => fmha_grouped_forward_bp16_no_causalmask_no_attnbias_maxk_32.cpp} (100%)
 rename xformers/csrc/attention/hip_fmha/instances/{ck_tiled_fmha_grouped_forward_bp16_no_causalmask_no_attnbias_maxk_64.cpp => fmha_grouped_forward_bp16_no_causalmask_no_attnbias_maxk_64.cpp} (100%)
 rename xformers/csrc/attention/hip_fmha/instances/{ck_tiled_fmha_grouped_forward_fp16_with_causalmask_with_attnbias_maxk_128.cpp => fmha_grouped_forward_fp16_has_causalmask_has_attnbias_maxk_128.cpp} (100%)
 rename xformers/csrc/attention/hip_fmha/instances/{ck_tiled_fmha_grouped_forward_fp16_with_causalmask_with_attnbias_maxk_256.cpp => fmha_grouped_forward_fp16_has_causalmask_has_attnbias_maxk_256.cpp} (100%)
 rename xformers/csrc/attention/hip_fmha/instances/{ck_tiled_fmha_grouped_forward_fp16_with_causalmask_with_attnbias_maxk_32.cpp => fmha_grouped_forward_fp16_has_causalmask_has_attnbias_maxk_32.cpp} (100%)
 rename xformers/csrc/attention/hip_fmha/instances/{ck_tiled_fmha_grouped_forward_fp16_with_causalmask_with_attnbias_maxk_64.cpp => fmha_grouped_forward_fp16_has_causalmask_has_attnbias_maxk_64.cpp} (100%)
 rename xformers/csrc/attention/hip_fmha/instances/{ck_tiled_fmha_grouped_forward_fp16_with_causalmask_no_attnbias_maxk_128.cpp => fmha_grouped_forward_fp16_has_causalmask_no_attnbias_maxk_128.cpp} (100%)
 rename xformers/csrc/attention/hip_fmha/instances/{ck_tiled_fmha_grouped_forward_fp16_with_causalmask_no_attnbias_maxk_256.cpp => fmha_grouped_forward_fp16_has_causalmask_no_attnbias_maxk_256.cpp} (100%)
 rename xformers/csrc/attention/hip_fmha/instances/{ck_tiled_fmha_grouped_forward_fp16_with_causalmask_no_attnbias_maxk_32.cpp => fmha_grouped_forward_fp16_has_causalmask_no_attnbias_maxk_32.cpp} (100%)
 rename xformers/csrc/attention/hip_fmha/instances/{ck_tiled_fmha_grouped_forward_fp16_with_causalmask_no_attnbias_maxk_64.cpp => fmha_grouped_forward_fp16_has_causalmask_no_attnbias_maxk_64.cpp} (100%)
 rename xformers/csrc/attention/hip_fmha/instances/{ck_tiled_fmha_grouped_forward_fp16_no_causalmask_with_attnbias_maxk_128.cpp => fmha_grouped_forward_fp16_no_causalmask_has_attnbias_maxk_128.cpp} (100%)
 rename xformers/csrc/attention/hip_fmha/instances/{ck_tiled_fmha_grouped_forward_fp16_no_causalmask_with_attnbias_maxk_256.cpp => fmha_grouped_forward_fp16_no_causalmask_has_attnbias_maxk_256.cpp} (100%)
 rename xformers/csrc/attention/hip_fmha/instances/{ck_tiled_fmha_grouped_forward_fp16_no_causalmask_with_attnbias_maxk_32.cpp => fmha_grouped_forward_fp16_no_causalmask_has_attnbias_maxk_32.cpp} (100%)
 rename xformers/csrc/attention/hip_fmha/instances/{ck_tiled_fmha_grouped_forward_fp16_no_causalmask_with_attnbias_maxk_64.cpp => fmha_grouped_forward_fp16_no_causalmask_has_attnbias_maxk_64.cpp} (100%)
 rename xformers/csrc/attention/hip_fmha/instances/{ck_tiled_fmha_grouped_forward_fp16_no_causalmask_no_attnbias_maxk_128.cpp => fmha_grouped_forward_fp16_no_causalmask_no_attnbias_maxk_128.cpp} (100%)
 rename xformers/csrc/attention/hip_fmha/instances/{ck_tiled_fmha_grouped_forward_fp16_no_causalmask_no_attnbias_maxk_256.cpp => fmha_grouped_forward_fp16_no_causalmask_no_attnbias_maxk_256.cpp} (100%)
 rename xformers/csrc/attention/hip_fmha/instances/{ck_tiled_fmha_grouped_forward_fp16_no_causalmask_no_attnbias_maxk_32.cpp => fmha_grouped_forward_fp16_no_causalmask_no_attnbias_maxk_32.cpp} (100%)
 rename xformers/csrc/attention/hip_fmha/instances/{ck_tiled_fmha_grouped_forward_fp16_no_causalmask_no_attnbias_maxk_64.cpp => fmha_grouped_forward_fp16_no_causalmask_no_attnbias_maxk_64.cpp} (100%)
 rename xformers/csrc/attention/hip_fmha/instances/{ck_tiled_fmha_grouped_infer_bp16_with_causalmask_with_attnbias_maxk_128.cpp => fmha_grouped_infer_bp16_has_causalmask_has_attnbias_maxk_128.cpp} (100%)
 rename xformers/csrc/attention/hip_fmha/instances/{ck_tiled_fmha_grouped_infer_bp16_with_causalmask_with_attnbias_maxk_256.cpp => fmha_grouped_infer_bp16_has_causalmask_has_attnbias_maxk_256.cpp} (100%)
 rename xformers/csrc/attention/hip_fmha/instances/{ck_tiled_fmha_grouped_infer_bp16_with_causalmask_with_attnbias_maxk_32.cpp => fmha_grouped_infer_bp16_has_causalmask_has_attnbias_maxk_32.cpp} (100%)
 rename xformers/csrc/attention/hip_fmha/instances/{ck_tiled_fmha_grouped_infer_bp16_with_causalmask_with_attnbias_maxk_64.cpp => fmha_grouped_infer_bp16_has_causalmask_has_attnbias_maxk_64.cpp} (100%)
 rename xformers/csrc/attention/hip_fmha/instances/{ck_tiled_fmha_grouped_infer_bp16_with_causalmask_no_attnbias_maxk_128.cpp => fmha_grouped_infer_bp16_has_causalmask_no_attnbias_maxk_128.cpp} (100%)
 rename xformers/csrc/attention/hip_fmha/instances/{ck_tiled_fmha_grouped_infer_bp16_with_causalmask_no_attnbias_maxk_256.cpp => fmha_grouped_infer_bp16_has_causalmask_no_attnbias_maxk_256.cpp} (100%)
 rename xformers/csrc/attention/hip_fmha/instances/{ck_tiled_fmha_grouped_infer_bp16_with_causalmask_no_attnbias_maxk_32.cpp => fmha_grouped_infer_bp16_has_causalmask_no_attnbias_maxk_32.cpp} (100%)
 rename xformers/csrc/attention/hip_fmha/instances/{ck_tiled_fmha_grouped_infer_bp16_with_causalmask_no_attnbias_maxk_64.cpp => fmha_grouped_infer_bp16_has_causalmask_no_attnbias_maxk_64.cpp} (100%)
 rename xformers/csrc/attention/hip_fmha/instances/{ck_tiled_fmha_grouped_infer_bp16_no_causalmask_with_attnbias_maxk_128.cpp => fmha_grouped_infer_bp16_no_causalmask_has_attnbias_maxk_128.cpp} (100%)
 rename xformers/csrc/attention/hip_fmha/instances/{ck_tiled_fmha_grouped_infer_bp16_no_causalmask_with_attnbias_maxk_256.cpp => fmha_grouped_infer_bp16_no_causalmask_has_attnbias_maxk_256.cpp} (100%)
 rename xformers/csrc/attention/hip_fmha/instances/{ck_tiled_fmha_grouped_infer_bp16_no_causalmask_with_attnbias_maxk_32.cpp => fmha_grouped_infer_bp16_no_causalmask_has_attnbias_maxk_32.cpp} (100%)
 rename xformers/csrc/attention/hip_fmha/instances/{ck_tiled_fmha_grouped_infer_bp16_no_causalmask_with_attnbias_maxk_64.cpp => fmha_grouped_infer_bp16_no_causalmask_has_attnbias_maxk_64.cpp} (100%)
 rename xformers/csrc/attention/hip_fmha/instances/{ck_tiled_fmha_grouped_infer_bp16_no_causalmask_no_attnbias_maxk_128.cpp => fmha_grouped_infer_bp16_no_causalmask_no_attnbias_maxk_128.cpp} (100%)
 rename xformers/csrc/attention/hip_fmha/instances/{ck_tiled_fmha_grouped_infer_bp16_no_causalmask_no_attnbias_maxk_256.cpp => fmha_grouped_infer_bp16_no_causalmask_no_attnbias_maxk_256.cpp} (100%)
 rename xformers/csrc/attention/hip_fmha/instances/{ck_tiled_fmha_grouped_infer_bp16_no_causalmask_no_attnbias_maxk_32.cpp => fmha_grouped_infer_bp16_no_causalmask_no_attnbias_maxk_32.cpp} (100%)
 rename xformers/csrc/attention/hip_fmha/instances/{ck_tiled_fmha_grouped_infer_bp16_no_causalmask_no_attnbias_maxk_64.cpp => fmha_grouped_infer_bp16_no_causalmask_no_attnbias_maxk_64.cpp} (100%)
 rename xformers/csrc/attention/hip_fmha/instances/{ck_tiled_fmha_grouped_infer_fp16_with_causalmask_with_attnbias_maxk_128.cpp => fmha_grouped_infer_fp16_has_causalmask_has_attnbias_maxk_128.cpp} (100%)
 rename xformers/csrc/attention/hip_fmha/instances/{ck_tiled_fmha_grouped_infer_fp16_with_causalmask_with_attnbias_maxk_256.cpp => fmha_grouped_infer_fp16_has_causalmask_has_attnbias_maxk_256.cpp} (100%)
 rename xformers/csrc/attention/hip_fmha/instances/{ck_tiled_fmha_grouped_infer_fp16_with_causalmask_with_attnbias_maxk_32.cpp => fmha_grouped_infer_fp16_has_causalmask_has_attnbias_maxk_32.cpp} (100%)
 rename xformers/csrc/attention/hip_fmha/instances/{ck_tiled_fmha_grouped_infer_fp16_with_causalmask_with_attnbias_maxk_64.cpp => fmha_grouped_infer_fp16_has_causalmask_has_attnbias_maxk_64.cpp} (100%)
 rename xformers/csrc/attention/hip_fmha/instances/{ck_tiled_fmha_grouped_infer_fp16_with_causalmask_no_attnbias_maxk_128.cpp => fmha_grouped_infer_fp16_has_causalmask_no_attnbias_maxk_128.cpp} (100%)
 rename xformers/csrc/attention/hip_fmha/instances/{ck_tiled_fmha_grouped_infer_fp16_with_causalmask_no_attnbias_maxk_256.cpp => fmha_grouped_infer_fp16_has_causalmask_no_attnbias_maxk_256.cpp} (100%)
 rename xformers/csrc/attention/hip_fmha/instances/{ck_tiled_fmha_grouped_infer_fp16_with_causalmask_no_attnbias_maxk_32.cpp => fmha_grouped_infer_fp16_has_causalmask_no_attnbias_maxk_32.cpp} (100%)
 rename xformers/csrc/attention/hip_fmha/instances/{ck_tiled_fmha_grouped_infer_fp16_with_causalmask_no_attnbias_maxk_64.cpp => fmha_grouped_infer_fp16_has_causalmask_no_attnbias_maxk_64.cpp} (100%)
 rename xformers/csrc/attention/hip_fmha/instances/{ck_tiled_fmha_grouped_infer_fp16_no_causalmask_with_attnbias_maxk_128.cpp => fmha_grouped_infer_fp16_no_causalmask_has_attnbias_maxk_128.cpp} (100%)
 rename xformers/csrc/attention/hip_fmha/instances/{ck_tiled_fmha_grouped_infer_fp16_no_causalmask_with_attnbias_maxk_256.cpp => fmha_grouped_infer_fp16_no_causalmask_has_attnbias_maxk_256.cpp} (100%)
 rename xformers/csrc/attention/hip_fmha/instances/{ck_tiled_fmha_grouped_infer_fp16_no_causalmask_with_attnbias_maxk_32.cpp => fmha_grouped_infer_fp16_no_causalmask_has_attnbias_maxk_32.cpp} (100%)
 rename xformers/csrc/attention/hip_fmha/instances/{ck_tiled_fmha_grouped_infer_fp16_no_causalmask_with_attnbias_maxk_64.cpp => fmha_grouped_infer_fp16_no_causalmask_has_attnbias_maxk_64.cpp} (100%)
 rename xformers/csrc/attention/hip_fmha/instances/{ck_tiled_fmha_grouped_infer_fp16_no_causalmask_no_attnbias_maxk_128.cpp => fmha_grouped_infer_fp16_no_causalmask_no_attnbias_maxk_128.cpp} (100%)
 rename xformers/csrc/attention/hip_fmha/instances/{ck_tiled_fmha_grouped_infer_fp16_no_causalmask_no_attnbias_maxk_256.cpp => fmha_grouped_infer_fp16_no_causalmask_no_attnbias_maxk_256.cpp} (100%)
 rename xformers/csrc/attention/hip_fmha/instances/{ck_tiled_fmha_grouped_infer_fp16_no_causalmask_no_attnbias_maxk_32.cpp => fmha_grouped_infer_fp16_no_causalmask_no_attnbias_maxk_32.cpp} (100%)
 rename xformers/csrc/attention/hip_fmha/instances/{ck_tiled_fmha_grouped_infer_fp16_no_causalmask_no_attnbias_maxk_64.cpp => fmha_grouped_infer_fp16_no_causalmask_no_attnbias_maxk_64.cpp} (100%)

diff --git a/third_party/composable_kernel_tiled b/third_party/composable_kernel_tiled
index bb57f31fd..131f660b2 160000
--- a/third_party/composable_kernel_tiled
+++ b/third_party/composable_kernel_tiled
@@ -1 +1 @@
-Subproject commit bb57f31fdc290bb7bc4df6af35c736b7c00f2a3c
+Subproject commit 131f660b24c450f819f1ebe4698afcbe6155d9b9
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_backward_bp16_with_causalmask_with_attnbias_with_biasgrad_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_has_causalmask_has_attnbias_has_biasgrad_maxk_128.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_backward_bp16_with_causalmask_with_attnbias_with_biasgrad_maxk_128.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_has_causalmask_has_attnbias_has_biasgrad_maxk_128.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_backward_bp16_with_causalmask_with_attnbias_with_biasgrad_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_has_causalmask_has_attnbias_has_biasgrad_maxk_32.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_backward_bp16_with_causalmask_with_attnbias_with_biasgrad_maxk_32.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_has_causalmask_has_attnbias_has_biasgrad_maxk_32.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_backward_bp16_with_causalmask_with_attnbias_with_biasgrad_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_has_causalmask_has_attnbias_has_biasgrad_maxk_64.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_backward_bp16_with_causalmask_with_attnbias_with_biasgrad_maxk_64.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_has_causalmask_has_attnbias_has_biasgrad_maxk_64.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_backward_bp16_with_causalmask_with_attnbias_no_biasgrad_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_has_causalmask_has_attnbias_no_biasgrad_maxk_128.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_backward_bp16_with_causalmask_with_attnbias_no_biasgrad_maxk_128.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_has_causalmask_has_attnbias_no_biasgrad_maxk_128.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_backward_bp16_with_causalmask_with_attnbias_no_biasgrad_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_has_causalmask_has_attnbias_no_biasgrad_maxk_32.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_backward_bp16_with_causalmask_with_attnbias_no_biasgrad_maxk_32.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_has_causalmask_has_attnbias_no_biasgrad_maxk_32.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_backward_bp16_with_causalmask_with_attnbias_no_biasgrad_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_has_causalmask_has_attnbias_no_biasgrad_maxk_64.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_backward_bp16_with_causalmask_with_attnbias_no_biasgrad_maxk_64.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_has_causalmask_has_attnbias_no_biasgrad_maxk_64.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_backward_bp16_with_causalmask_no_attnbias_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_has_causalmask_no_attnbias_maxk_128.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_backward_bp16_with_causalmask_no_attnbias_maxk_128.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_has_causalmask_no_attnbias_maxk_128.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_backward_bp16_with_causalmask_no_attnbias_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_has_causalmask_no_attnbias_maxk_32.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_backward_bp16_with_causalmask_no_attnbias_maxk_32.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_has_causalmask_no_attnbias_maxk_32.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_backward_bp16_with_causalmask_no_attnbias_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_has_causalmask_no_attnbias_maxk_64.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_backward_bp16_with_causalmask_no_attnbias_maxk_64.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_has_causalmask_no_attnbias_maxk_64.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_backward_bp16_no_causalmask_with_attnbias_with_biasgrad_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_no_causalmask_has_attnbias_has_biasgrad_maxk_128.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_backward_bp16_no_causalmask_with_attnbias_with_biasgrad_maxk_128.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_no_causalmask_has_attnbias_has_biasgrad_maxk_128.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_backward_bp16_no_causalmask_with_attnbias_with_biasgrad_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_no_causalmask_has_attnbias_has_biasgrad_maxk_32.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_backward_bp16_no_causalmask_with_attnbias_with_biasgrad_maxk_32.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_no_causalmask_has_attnbias_has_biasgrad_maxk_32.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_backward_bp16_no_causalmask_with_attnbias_with_biasgrad_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_no_causalmask_has_attnbias_has_biasgrad_maxk_64.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_backward_bp16_no_causalmask_with_attnbias_with_biasgrad_maxk_64.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_no_causalmask_has_attnbias_has_biasgrad_maxk_64.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_backward_bp16_no_causalmask_with_attnbias_no_biasgrad_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_no_causalmask_has_attnbias_no_biasgrad_maxk_128.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_backward_bp16_no_causalmask_with_attnbias_no_biasgrad_maxk_128.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_no_causalmask_has_attnbias_no_biasgrad_maxk_128.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_backward_bp16_no_causalmask_with_attnbias_no_biasgrad_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_no_causalmask_has_attnbias_no_biasgrad_maxk_32.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_backward_bp16_no_causalmask_with_attnbias_no_biasgrad_maxk_32.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_no_causalmask_has_attnbias_no_biasgrad_maxk_32.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_backward_bp16_no_causalmask_with_attnbias_no_biasgrad_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_no_causalmask_has_attnbias_no_biasgrad_maxk_64.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_backward_bp16_no_causalmask_with_attnbias_no_biasgrad_maxk_64.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_no_causalmask_has_attnbias_no_biasgrad_maxk_64.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_backward_bp16_no_causalmask_no_attnbias_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_no_causalmask_no_attnbias_maxk_128.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_backward_bp16_no_causalmask_no_attnbias_maxk_128.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_no_causalmask_no_attnbias_maxk_128.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_backward_bp16_no_causalmask_no_attnbias_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_no_causalmask_no_attnbias_maxk_32.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_backward_bp16_no_causalmask_no_attnbias_maxk_32.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_no_causalmask_no_attnbias_maxk_32.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_backward_bp16_no_causalmask_no_attnbias_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_no_causalmask_no_attnbias_maxk_64.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_backward_bp16_no_causalmask_no_attnbias_maxk_64.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_no_causalmask_no_attnbias_maxk_64.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_backward_fp16_with_causalmask_with_attnbias_with_biasgrad_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_has_attnbias_has_biasgrad_maxk_128.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_backward_fp16_with_causalmask_with_attnbias_with_biasgrad_maxk_128.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_has_attnbias_has_biasgrad_maxk_128.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_backward_fp16_with_causalmask_with_attnbias_with_biasgrad_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_has_attnbias_has_biasgrad_maxk_32.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_backward_fp16_with_causalmask_with_attnbias_with_biasgrad_maxk_32.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_has_attnbias_has_biasgrad_maxk_32.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_backward_fp16_with_causalmask_with_attnbias_with_biasgrad_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_has_attnbias_has_biasgrad_maxk_64.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_backward_fp16_with_causalmask_with_attnbias_with_biasgrad_maxk_64.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_has_attnbias_has_biasgrad_maxk_64.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_backward_fp16_with_causalmask_with_attnbias_no_biasgrad_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_has_attnbias_no_biasgrad_maxk_128.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_backward_fp16_with_causalmask_with_attnbias_no_biasgrad_maxk_128.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_has_attnbias_no_biasgrad_maxk_128.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_backward_fp16_with_causalmask_with_attnbias_no_biasgrad_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_has_attnbias_no_biasgrad_maxk_32.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_backward_fp16_with_causalmask_with_attnbias_no_biasgrad_maxk_32.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_has_attnbias_no_biasgrad_maxk_32.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_backward_fp16_with_causalmask_with_attnbias_no_biasgrad_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_has_attnbias_no_biasgrad_maxk_64.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_backward_fp16_with_causalmask_with_attnbias_no_biasgrad_maxk_64.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_has_attnbias_no_biasgrad_maxk_64.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_backward_fp16_with_causalmask_no_attnbias_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_no_attnbias_maxk_128.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_backward_fp16_with_causalmask_no_attnbias_maxk_128.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_no_attnbias_maxk_128.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_backward_fp16_with_causalmask_no_attnbias_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_no_attnbias_maxk_32.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_backward_fp16_with_causalmask_no_attnbias_maxk_32.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_no_attnbias_maxk_32.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_backward_fp16_with_causalmask_no_attnbias_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_no_attnbias_maxk_64.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_backward_fp16_with_causalmask_no_attnbias_maxk_64.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_no_attnbias_maxk_64.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_backward_fp16_no_causalmask_with_attnbias_with_biasgrad_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_has_attnbias_has_biasgrad_maxk_128.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_backward_fp16_no_causalmask_with_attnbias_with_biasgrad_maxk_128.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_has_attnbias_has_biasgrad_maxk_128.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_backward_fp16_no_causalmask_with_attnbias_with_biasgrad_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_has_attnbias_has_biasgrad_maxk_32.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_backward_fp16_no_causalmask_with_attnbias_with_biasgrad_maxk_32.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_has_attnbias_has_biasgrad_maxk_32.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_backward_fp16_no_causalmask_with_attnbias_with_biasgrad_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_has_attnbias_has_biasgrad_maxk_64.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_backward_fp16_no_causalmask_with_attnbias_with_biasgrad_maxk_64.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_has_attnbias_has_biasgrad_maxk_64.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_backward_fp16_no_causalmask_with_attnbias_no_biasgrad_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_has_attnbias_no_biasgrad_maxk_128.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_backward_fp16_no_causalmask_with_attnbias_no_biasgrad_maxk_128.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_has_attnbias_no_biasgrad_maxk_128.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_backward_fp16_no_causalmask_with_attnbias_no_biasgrad_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_has_attnbias_no_biasgrad_maxk_32.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_backward_fp16_no_causalmask_with_attnbias_no_biasgrad_maxk_32.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_has_attnbias_no_biasgrad_maxk_32.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_backward_fp16_no_causalmask_with_attnbias_no_biasgrad_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_has_attnbias_no_biasgrad_maxk_64.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_backward_fp16_no_causalmask_with_attnbias_no_biasgrad_maxk_64.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_has_attnbias_no_biasgrad_maxk_64.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_backward_fp16_no_causalmask_no_attnbias_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_no_attnbias_maxk_128.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_backward_fp16_no_causalmask_no_attnbias_maxk_128.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_no_attnbias_maxk_128.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_backward_fp16_no_causalmask_no_attnbias_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_no_attnbias_maxk_32.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_backward_fp16_no_causalmask_no_attnbias_maxk_32.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_no_attnbias_maxk_32.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_backward_fp16_no_causalmask_no_attnbias_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_no_attnbias_maxk_64.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_backward_fp16_no_causalmask_no_attnbias_maxk_64.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_no_attnbias_maxk_64.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_forward_bp16_with_causalmask_with_attnbias_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_has_causalmask_has_attnbias_maxk_128.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_forward_bp16_with_causalmask_with_attnbias_maxk_128.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_has_causalmask_has_attnbias_maxk_128.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_forward_bp16_with_causalmask_with_attnbias_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_has_causalmask_has_attnbias_maxk_256.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_forward_bp16_with_causalmask_with_attnbias_maxk_256.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_has_causalmask_has_attnbias_maxk_256.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_forward_bp16_with_causalmask_with_attnbias_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_has_causalmask_has_attnbias_maxk_32.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_forward_bp16_with_causalmask_with_attnbias_maxk_32.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_has_causalmask_has_attnbias_maxk_32.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_forward_bp16_with_causalmask_with_attnbias_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_has_causalmask_has_attnbias_maxk_64.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_forward_bp16_with_causalmask_with_attnbias_maxk_64.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_has_causalmask_has_attnbias_maxk_64.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_forward_bp16_with_causalmask_no_attnbias_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_has_causalmask_no_attnbias_maxk_128.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_forward_bp16_with_causalmask_no_attnbias_maxk_128.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_has_causalmask_no_attnbias_maxk_128.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_forward_bp16_with_causalmask_no_attnbias_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_has_causalmask_no_attnbias_maxk_256.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_forward_bp16_with_causalmask_no_attnbias_maxk_256.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_has_causalmask_no_attnbias_maxk_256.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_forward_bp16_with_causalmask_no_attnbias_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_has_causalmask_no_attnbias_maxk_32.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_forward_bp16_with_causalmask_no_attnbias_maxk_32.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_has_causalmask_no_attnbias_maxk_32.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_forward_bp16_with_causalmask_no_attnbias_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_has_causalmask_no_attnbias_maxk_64.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_forward_bp16_with_causalmask_no_attnbias_maxk_64.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_has_causalmask_no_attnbias_maxk_64.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_forward_bp16_no_causalmask_with_attnbias_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_no_causalmask_has_attnbias_maxk_128.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_forward_bp16_no_causalmask_with_attnbias_maxk_128.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_no_causalmask_has_attnbias_maxk_128.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_forward_bp16_no_causalmask_with_attnbias_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_no_causalmask_has_attnbias_maxk_256.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_forward_bp16_no_causalmask_with_attnbias_maxk_256.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_no_causalmask_has_attnbias_maxk_256.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_forward_bp16_no_causalmask_with_attnbias_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_no_causalmask_has_attnbias_maxk_32.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_forward_bp16_no_causalmask_with_attnbias_maxk_32.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_no_causalmask_has_attnbias_maxk_32.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_forward_bp16_no_causalmask_with_attnbias_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_no_causalmask_has_attnbias_maxk_64.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_forward_bp16_no_causalmask_with_attnbias_maxk_64.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_no_causalmask_has_attnbias_maxk_64.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_forward_bp16_no_causalmask_no_attnbias_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_no_causalmask_no_attnbias_maxk_128.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_forward_bp16_no_causalmask_no_attnbias_maxk_128.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_no_causalmask_no_attnbias_maxk_128.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_forward_bp16_no_causalmask_no_attnbias_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_no_causalmask_no_attnbias_maxk_256.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_forward_bp16_no_causalmask_no_attnbias_maxk_256.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_no_causalmask_no_attnbias_maxk_256.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_forward_bp16_no_causalmask_no_attnbias_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_no_causalmask_no_attnbias_maxk_32.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_forward_bp16_no_causalmask_no_attnbias_maxk_32.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_no_causalmask_no_attnbias_maxk_32.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_forward_bp16_no_causalmask_no_attnbias_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_no_causalmask_no_attnbias_maxk_64.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_forward_bp16_no_causalmask_no_attnbias_maxk_64.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_no_causalmask_no_attnbias_maxk_64.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_forward_fp16_with_causalmask_with_attnbias_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_has_causalmask_has_attnbias_maxk_128.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_forward_fp16_with_causalmask_with_attnbias_maxk_128.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_has_causalmask_has_attnbias_maxk_128.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_forward_fp16_with_causalmask_with_attnbias_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_has_causalmask_has_attnbias_maxk_256.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_forward_fp16_with_causalmask_with_attnbias_maxk_256.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_has_causalmask_has_attnbias_maxk_256.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_forward_fp16_with_causalmask_with_attnbias_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_has_causalmask_has_attnbias_maxk_32.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_forward_fp16_with_causalmask_with_attnbias_maxk_32.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_has_causalmask_has_attnbias_maxk_32.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_forward_fp16_with_causalmask_with_attnbias_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_has_causalmask_has_attnbias_maxk_64.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_forward_fp16_with_causalmask_with_attnbias_maxk_64.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_has_causalmask_has_attnbias_maxk_64.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_forward_fp16_with_causalmask_no_attnbias_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_has_causalmask_no_attnbias_maxk_128.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_forward_fp16_with_causalmask_no_attnbias_maxk_128.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_has_causalmask_no_attnbias_maxk_128.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_forward_fp16_with_causalmask_no_attnbias_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_has_causalmask_no_attnbias_maxk_256.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_forward_fp16_with_causalmask_no_attnbias_maxk_256.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_has_causalmask_no_attnbias_maxk_256.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_forward_fp16_with_causalmask_no_attnbias_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_has_causalmask_no_attnbias_maxk_32.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_forward_fp16_with_causalmask_no_attnbias_maxk_32.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_has_causalmask_no_attnbias_maxk_32.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_forward_fp16_with_causalmask_no_attnbias_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_has_causalmask_no_attnbias_maxk_64.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_forward_fp16_with_causalmask_no_attnbias_maxk_64.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_has_causalmask_no_attnbias_maxk_64.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_forward_fp16_no_causalmask_with_attnbias_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_no_causalmask_has_attnbias_maxk_128.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_forward_fp16_no_causalmask_with_attnbias_maxk_128.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_no_causalmask_has_attnbias_maxk_128.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_forward_fp16_no_causalmask_with_attnbias_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_no_causalmask_has_attnbias_maxk_256.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_forward_fp16_no_causalmask_with_attnbias_maxk_256.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_no_causalmask_has_attnbias_maxk_256.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_forward_fp16_no_causalmask_with_attnbias_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_no_causalmask_has_attnbias_maxk_32.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_forward_fp16_no_causalmask_with_attnbias_maxk_32.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_no_causalmask_has_attnbias_maxk_32.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_forward_fp16_no_causalmask_with_attnbias_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_no_causalmask_has_attnbias_maxk_64.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_forward_fp16_no_causalmask_with_attnbias_maxk_64.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_no_causalmask_has_attnbias_maxk_64.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_forward_fp16_no_causalmask_no_attnbias_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_no_causalmask_no_attnbias_maxk_128.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_forward_fp16_no_causalmask_no_attnbias_maxk_128.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_no_causalmask_no_attnbias_maxk_128.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_forward_fp16_no_causalmask_no_attnbias_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_no_causalmask_no_attnbias_maxk_256.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_forward_fp16_no_causalmask_no_attnbias_maxk_256.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_no_causalmask_no_attnbias_maxk_256.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_forward_fp16_no_causalmask_no_attnbias_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_no_causalmask_no_attnbias_maxk_32.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_forward_fp16_no_causalmask_no_attnbias_maxk_32.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_no_causalmask_no_attnbias_maxk_32.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_forward_fp16_no_causalmask_no_attnbias_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_no_causalmask_no_attnbias_maxk_64.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_forward_fp16_no_causalmask_no_attnbias_maxk_64.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_no_causalmask_no_attnbias_maxk_64.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_infer_bp16_with_causalmask_with_attnbias_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_has_causalmask_has_attnbias_maxk_128.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_infer_bp16_with_causalmask_with_attnbias_maxk_128.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_has_causalmask_has_attnbias_maxk_128.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_infer_bp16_with_causalmask_with_attnbias_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_has_causalmask_has_attnbias_maxk_256.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_infer_bp16_with_causalmask_with_attnbias_maxk_256.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_has_causalmask_has_attnbias_maxk_256.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_infer_bp16_with_causalmask_with_attnbias_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_has_causalmask_has_attnbias_maxk_32.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_infer_bp16_with_causalmask_with_attnbias_maxk_32.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_has_causalmask_has_attnbias_maxk_32.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_infer_bp16_with_causalmask_with_attnbias_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_has_causalmask_has_attnbias_maxk_64.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_infer_bp16_with_causalmask_with_attnbias_maxk_64.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_has_causalmask_has_attnbias_maxk_64.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_infer_bp16_with_causalmask_no_attnbias_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_has_causalmask_no_attnbias_maxk_128.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_infer_bp16_with_causalmask_no_attnbias_maxk_128.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_has_causalmask_no_attnbias_maxk_128.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_infer_bp16_with_causalmask_no_attnbias_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_has_causalmask_no_attnbias_maxk_256.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_infer_bp16_with_causalmask_no_attnbias_maxk_256.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_has_causalmask_no_attnbias_maxk_256.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_infer_bp16_with_causalmask_no_attnbias_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_has_causalmask_no_attnbias_maxk_32.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_infer_bp16_with_causalmask_no_attnbias_maxk_32.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_has_causalmask_no_attnbias_maxk_32.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_infer_bp16_with_causalmask_no_attnbias_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_has_causalmask_no_attnbias_maxk_64.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_infer_bp16_with_causalmask_no_attnbias_maxk_64.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_has_causalmask_no_attnbias_maxk_64.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_infer_bp16_no_causalmask_with_attnbias_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_no_causalmask_has_attnbias_maxk_128.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_infer_bp16_no_causalmask_with_attnbias_maxk_128.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_no_causalmask_has_attnbias_maxk_128.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_infer_bp16_no_causalmask_with_attnbias_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_no_causalmask_has_attnbias_maxk_256.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_infer_bp16_no_causalmask_with_attnbias_maxk_256.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_no_causalmask_has_attnbias_maxk_256.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_infer_bp16_no_causalmask_with_attnbias_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_no_causalmask_has_attnbias_maxk_32.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_infer_bp16_no_causalmask_with_attnbias_maxk_32.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_no_causalmask_has_attnbias_maxk_32.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_infer_bp16_no_causalmask_with_attnbias_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_no_causalmask_has_attnbias_maxk_64.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_infer_bp16_no_causalmask_with_attnbias_maxk_64.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_no_causalmask_has_attnbias_maxk_64.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_infer_bp16_no_causalmask_no_attnbias_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_no_causalmask_no_attnbias_maxk_128.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_infer_bp16_no_causalmask_no_attnbias_maxk_128.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_no_causalmask_no_attnbias_maxk_128.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_infer_bp16_no_causalmask_no_attnbias_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_no_causalmask_no_attnbias_maxk_256.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_infer_bp16_no_causalmask_no_attnbias_maxk_256.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_no_causalmask_no_attnbias_maxk_256.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_infer_bp16_no_causalmask_no_attnbias_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_no_causalmask_no_attnbias_maxk_32.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_infer_bp16_no_causalmask_no_attnbias_maxk_32.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_no_causalmask_no_attnbias_maxk_32.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_infer_bp16_no_causalmask_no_attnbias_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_no_causalmask_no_attnbias_maxk_64.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_infer_bp16_no_causalmask_no_attnbias_maxk_64.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_no_causalmask_no_attnbias_maxk_64.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_infer_fp16_with_causalmask_with_attnbias_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_has_causalmask_has_attnbias_maxk_128.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_infer_fp16_with_causalmask_with_attnbias_maxk_128.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_has_causalmask_has_attnbias_maxk_128.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_infer_fp16_with_causalmask_with_attnbias_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_has_causalmask_has_attnbias_maxk_256.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_infer_fp16_with_causalmask_with_attnbias_maxk_256.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_has_causalmask_has_attnbias_maxk_256.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_infer_fp16_with_causalmask_with_attnbias_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_has_causalmask_has_attnbias_maxk_32.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_infer_fp16_with_causalmask_with_attnbias_maxk_32.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_has_causalmask_has_attnbias_maxk_32.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_infer_fp16_with_causalmask_with_attnbias_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_has_causalmask_has_attnbias_maxk_64.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_infer_fp16_with_causalmask_with_attnbias_maxk_64.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_has_causalmask_has_attnbias_maxk_64.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_infer_fp16_with_causalmask_no_attnbias_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_has_causalmask_no_attnbias_maxk_128.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_infer_fp16_with_causalmask_no_attnbias_maxk_128.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_has_causalmask_no_attnbias_maxk_128.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_infer_fp16_with_causalmask_no_attnbias_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_has_causalmask_no_attnbias_maxk_256.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_infer_fp16_with_causalmask_no_attnbias_maxk_256.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_has_causalmask_no_attnbias_maxk_256.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_infer_fp16_with_causalmask_no_attnbias_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_has_causalmask_no_attnbias_maxk_32.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_infer_fp16_with_causalmask_no_attnbias_maxk_32.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_has_causalmask_no_attnbias_maxk_32.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_infer_fp16_with_causalmask_no_attnbias_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_has_causalmask_no_attnbias_maxk_64.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_infer_fp16_with_causalmask_no_attnbias_maxk_64.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_has_causalmask_no_attnbias_maxk_64.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_infer_fp16_no_causalmask_with_attnbias_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_no_causalmask_has_attnbias_maxk_128.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_infer_fp16_no_causalmask_with_attnbias_maxk_128.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_no_causalmask_has_attnbias_maxk_128.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_infer_fp16_no_causalmask_with_attnbias_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_no_causalmask_has_attnbias_maxk_256.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_infer_fp16_no_causalmask_with_attnbias_maxk_256.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_no_causalmask_has_attnbias_maxk_256.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_infer_fp16_no_causalmask_with_attnbias_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_no_causalmask_has_attnbias_maxk_32.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_infer_fp16_no_causalmask_with_attnbias_maxk_32.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_no_causalmask_has_attnbias_maxk_32.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_infer_fp16_no_causalmask_with_attnbias_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_no_causalmask_has_attnbias_maxk_64.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_infer_fp16_no_causalmask_with_attnbias_maxk_64.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_no_causalmask_has_attnbias_maxk_64.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_infer_fp16_no_causalmask_no_attnbias_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_no_causalmask_no_attnbias_maxk_128.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_infer_fp16_no_causalmask_no_attnbias_maxk_128.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_no_causalmask_no_attnbias_maxk_128.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_infer_fp16_no_causalmask_no_attnbias_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_no_causalmask_no_attnbias_maxk_256.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_infer_fp16_no_causalmask_no_attnbias_maxk_256.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_no_causalmask_no_attnbias_maxk_256.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_infer_fp16_no_causalmask_no_attnbias_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_no_causalmask_no_attnbias_maxk_32.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_infer_fp16_no_causalmask_no_attnbias_maxk_32.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_no_causalmask_no_attnbias_maxk_32.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_infer_fp16_no_causalmask_no_attnbias_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_no_causalmask_no_attnbias_maxk_64.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_batched_infer_fp16_no_causalmask_no_attnbias_maxk_64.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_no_causalmask_no_attnbias_maxk_64.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_backward_bp16_with_causalmask_with_attnbias_with_biasgrad_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_has_causalmask_has_attnbias_has_biasgrad_maxk_128.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_backward_bp16_with_causalmask_with_attnbias_with_biasgrad_maxk_128.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_has_causalmask_has_attnbias_has_biasgrad_maxk_128.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_backward_bp16_with_causalmask_with_attnbias_with_biasgrad_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_has_causalmask_has_attnbias_has_biasgrad_maxk_32.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_backward_bp16_with_causalmask_with_attnbias_with_biasgrad_maxk_32.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_has_causalmask_has_attnbias_has_biasgrad_maxk_32.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_backward_bp16_with_causalmask_with_attnbias_with_biasgrad_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_has_causalmask_has_attnbias_has_biasgrad_maxk_64.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_backward_bp16_with_causalmask_with_attnbias_with_biasgrad_maxk_64.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_has_causalmask_has_attnbias_has_biasgrad_maxk_64.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_backward_bp16_with_causalmask_with_attnbias_no_biasgrad_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_has_causalmask_has_attnbias_no_biasgrad_maxk_128.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_backward_bp16_with_causalmask_with_attnbias_no_biasgrad_maxk_128.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_has_causalmask_has_attnbias_no_biasgrad_maxk_128.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_backward_bp16_with_causalmask_with_attnbias_no_biasgrad_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_has_causalmask_has_attnbias_no_biasgrad_maxk_32.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_backward_bp16_with_causalmask_with_attnbias_no_biasgrad_maxk_32.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_has_causalmask_has_attnbias_no_biasgrad_maxk_32.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_backward_bp16_with_causalmask_with_attnbias_no_biasgrad_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_has_causalmask_has_attnbias_no_biasgrad_maxk_64.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_backward_bp16_with_causalmask_with_attnbias_no_biasgrad_maxk_64.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_has_causalmask_has_attnbias_no_biasgrad_maxk_64.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_backward_bp16_with_causalmask_no_attnbias_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_has_causalmask_no_attnbias_maxk_128.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_backward_bp16_with_causalmask_no_attnbias_maxk_128.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_has_causalmask_no_attnbias_maxk_128.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_backward_bp16_with_causalmask_no_attnbias_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_has_causalmask_no_attnbias_maxk_32.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_backward_bp16_with_causalmask_no_attnbias_maxk_32.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_has_causalmask_no_attnbias_maxk_32.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_backward_bp16_with_causalmask_no_attnbias_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_has_causalmask_no_attnbias_maxk_64.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_backward_bp16_with_causalmask_no_attnbias_maxk_64.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_has_causalmask_no_attnbias_maxk_64.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_backward_bp16_no_causalmask_with_attnbias_with_biasgrad_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_no_causalmask_has_attnbias_has_biasgrad_maxk_128.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_backward_bp16_no_causalmask_with_attnbias_with_biasgrad_maxk_128.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_no_causalmask_has_attnbias_has_biasgrad_maxk_128.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_backward_bp16_no_causalmask_with_attnbias_with_biasgrad_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_no_causalmask_has_attnbias_has_biasgrad_maxk_32.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_backward_bp16_no_causalmask_with_attnbias_with_biasgrad_maxk_32.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_no_causalmask_has_attnbias_has_biasgrad_maxk_32.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_backward_bp16_no_causalmask_with_attnbias_with_biasgrad_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_no_causalmask_has_attnbias_has_biasgrad_maxk_64.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_backward_bp16_no_causalmask_with_attnbias_with_biasgrad_maxk_64.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_no_causalmask_has_attnbias_has_biasgrad_maxk_64.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_backward_bp16_no_causalmask_with_attnbias_no_biasgrad_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_no_causalmask_has_attnbias_no_biasgrad_maxk_128.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_backward_bp16_no_causalmask_with_attnbias_no_biasgrad_maxk_128.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_no_causalmask_has_attnbias_no_biasgrad_maxk_128.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_backward_bp16_no_causalmask_with_attnbias_no_biasgrad_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_no_causalmask_has_attnbias_no_biasgrad_maxk_32.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_backward_bp16_no_causalmask_with_attnbias_no_biasgrad_maxk_32.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_no_causalmask_has_attnbias_no_biasgrad_maxk_32.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_backward_bp16_no_causalmask_with_attnbias_no_biasgrad_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_no_causalmask_has_attnbias_no_biasgrad_maxk_64.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_backward_bp16_no_causalmask_with_attnbias_no_biasgrad_maxk_64.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_no_causalmask_has_attnbias_no_biasgrad_maxk_64.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_backward_bp16_no_causalmask_no_attnbias_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_no_causalmask_no_attnbias_maxk_128.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_backward_bp16_no_causalmask_no_attnbias_maxk_128.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_no_causalmask_no_attnbias_maxk_128.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_backward_bp16_no_causalmask_no_attnbias_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_no_causalmask_no_attnbias_maxk_32.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_backward_bp16_no_causalmask_no_attnbias_maxk_32.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_no_causalmask_no_attnbias_maxk_32.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_backward_bp16_no_causalmask_no_attnbias_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_no_causalmask_no_attnbias_maxk_64.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_backward_bp16_no_causalmask_no_attnbias_maxk_64.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_no_causalmask_no_attnbias_maxk_64.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_backward_fp16_with_causalmask_with_attnbias_with_biasgrad_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_has_attnbias_has_biasgrad_maxk_128.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_backward_fp16_with_causalmask_with_attnbias_with_biasgrad_maxk_128.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_has_attnbias_has_biasgrad_maxk_128.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_backward_fp16_with_causalmask_with_attnbias_with_biasgrad_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_has_attnbias_has_biasgrad_maxk_32.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_backward_fp16_with_causalmask_with_attnbias_with_biasgrad_maxk_32.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_has_attnbias_has_biasgrad_maxk_32.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_backward_fp16_with_causalmask_with_attnbias_with_biasgrad_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_has_attnbias_has_biasgrad_maxk_64.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_backward_fp16_with_causalmask_with_attnbias_with_biasgrad_maxk_64.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_has_attnbias_has_biasgrad_maxk_64.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_backward_fp16_with_causalmask_with_attnbias_no_biasgrad_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_has_attnbias_no_biasgrad_maxk_128.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_backward_fp16_with_causalmask_with_attnbias_no_biasgrad_maxk_128.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_has_attnbias_no_biasgrad_maxk_128.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_backward_fp16_with_causalmask_with_attnbias_no_biasgrad_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_has_attnbias_no_biasgrad_maxk_32.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_backward_fp16_with_causalmask_with_attnbias_no_biasgrad_maxk_32.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_has_attnbias_no_biasgrad_maxk_32.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_backward_fp16_with_causalmask_with_attnbias_no_biasgrad_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_has_attnbias_no_biasgrad_maxk_64.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_backward_fp16_with_causalmask_with_attnbias_no_biasgrad_maxk_64.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_has_attnbias_no_biasgrad_maxk_64.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_backward_fp16_with_causalmask_no_attnbias_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_no_attnbias_maxk_128.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_backward_fp16_with_causalmask_no_attnbias_maxk_128.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_no_attnbias_maxk_128.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_backward_fp16_with_causalmask_no_attnbias_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_no_attnbias_maxk_32.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_backward_fp16_with_causalmask_no_attnbias_maxk_32.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_no_attnbias_maxk_32.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_backward_fp16_with_causalmask_no_attnbias_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_no_attnbias_maxk_64.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_backward_fp16_with_causalmask_no_attnbias_maxk_64.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_no_attnbias_maxk_64.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_backward_fp16_no_causalmask_with_attnbias_with_biasgrad_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_has_attnbias_has_biasgrad_maxk_128.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_backward_fp16_no_causalmask_with_attnbias_with_biasgrad_maxk_128.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_has_attnbias_has_biasgrad_maxk_128.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_backward_fp16_no_causalmask_with_attnbias_with_biasgrad_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_has_attnbias_has_biasgrad_maxk_32.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_backward_fp16_no_causalmask_with_attnbias_with_biasgrad_maxk_32.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_has_attnbias_has_biasgrad_maxk_32.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_backward_fp16_no_causalmask_with_attnbias_with_biasgrad_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_has_attnbias_has_biasgrad_maxk_64.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_backward_fp16_no_causalmask_with_attnbias_with_biasgrad_maxk_64.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_has_attnbias_has_biasgrad_maxk_64.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_backward_fp16_no_causalmask_with_attnbias_no_biasgrad_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_has_attnbias_no_biasgrad_maxk_128.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_backward_fp16_no_causalmask_with_attnbias_no_biasgrad_maxk_128.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_has_attnbias_no_biasgrad_maxk_128.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_backward_fp16_no_causalmask_with_attnbias_no_biasgrad_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_has_attnbias_no_biasgrad_maxk_32.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_backward_fp16_no_causalmask_with_attnbias_no_biasgrad_maxk_32.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_has_attnbias_no_biasgrad_maxk_32.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_backward_fp16_no_causalmask_with_attnbias_no_biasgrad_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_has_attnbias_no_biasgrad_maxk_64.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_backward_fp16_no_causalmask_with_attnbias_no_biasgrad_maxk_64.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_has_attnbias_no_biasgrad_maxk_64.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_backward_fp16_no_causalmask_no_attnbias_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_no_attnbias_maxk_128.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_backward_fp16_no_causalmask_no_attnbias_maxk_128.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_no_attnbias_maxk_128.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_backward_fp16_no_causalmask_no_attnbias_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_no_attnbias_maxk_32.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_backward_fp16_no_causalmask_no_attnbias_maxk_32.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_no_attnbias_maxk_32.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_backward_fp16_no_causalmask_no_attnbias_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_no_attnbias_maxk_64.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_backward_fp16_no_causalmask_no_attnbias_maxk_64.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_no_attnbias_maxk_64.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_forward_bp16_with_causalmask_with_attnbias_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_has_causalmask_has_attnbias_maxk_128.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_forward_bp16_with_causalmask_with_attnbias_maxk_128.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_has_causalmask_has_attnbias_maxk_128.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_forward_bp16_with_causalmask_with_attnbias_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_has_causalmask_has_attnbias_maxk_256.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_forward_bp16_with_causalmask_with_attnbias_maxk_256.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_has_causalmask_has_attnbias_maxk_256.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_forward_bp16_with_causalmask_with_attnbias_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_has_causalmask_has_attnbias_maxk_32.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_forward_bp16_with_causalmask_with_attnbias_maxk_32.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_has_causalmask_has_attnbias_maxk_32.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_forward_bp16_with_causalmask_with_attnbias_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_has_causalmask_has_attnbias_maxk_64.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_forward_bp16_with_causalmask_with_attnbias_maxk_64.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_has_causalmask_has_attnbias_maxk_64.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_forward_bp16_with_causalmask_no_attnbias_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_has_causalmask_no_attnbias_maxk_128.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_forward_bp16_with_causalmask_no_attnbias_maxk_128.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_has_causalmask_no_attnbias_maxk_128.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_forward_bp16_with_causalmask_no_attnbias_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_has_causalmask_no_attnbias_maxk_256.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_forward_bp16_with_causalmask_no_attnbias_maxk_256.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_has_causalmask_no_attnbias_maxk_256.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_forward_bp16_with_causalmask_no_attnbias_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_has_causalmask_no_attnbias_maxk_32.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_forward_bp16_with_causalmask_no_attnbias_maxk_32.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_has_causalmask_no_attnbias_maxk_32.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_forward_bp16_with_causalmask_no_attnbias_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_has_causalmask_no_attnbias_maxk_64.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_forward_bp16_with_causalmask_no_attnbias_maxk_64.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_has_causalmask_no_attnbias_maxk_64.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_forward_bp16_no_causalmask_with_attnbias_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_no_causalmask_has_attnbias_maxk_128.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_forward_bp16_no_causalmask_with_attnbias_maxk_128.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_no_causalmask_has_attnbias_maxk_128.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_forward_bp16_no_causalmask_with_attnbias_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_no_causalmask_has_attnbias_maxk_256.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_forward_bp16_no_causalmask_with_attnbias_maxk_256.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_no_causalmask_has_attnbias_maxk_256.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_forward_bp16_no_causalmask_with_attnbias_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_no_causalmask_has_attnbias_maxk_32.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_forward_bp16_no_causalmask_with_attnbias_maxk_32.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_no_causalmask_has_attnbias_maxk_32.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_forward_bp16_no_causalmask_with_attnbias_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_no_causalmask_has_attnbias_maxk_64.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_forward_bp16_no_causalmask_with_attnbias_maxk_64.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_no_causalmask_has_attnbias_maxk_64.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_forward_bp16_no_causalmask_no_attnbias_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_no_causalmask_no_attnbias_maxk_128.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_forward_bp16_no_causalmask_no_attnbias_maxk_128.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_no_causalmask_no_attnbias_maxk_128.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_forward_bp16_no_causalmask_no_attnbias_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_no_causalmask_no_attnbias_maxk_256.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_forward_bp16_no_causalmask_no_attnbias_maxk_256.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_no_causalmask_no_attnbias_maxk_256.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_forward_bp16_no_causalmask_no_attnbias_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_no_causalmask_no_attnbias_maxk_32.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_forward_bp16_no_causalmask_no_attnbias_maxk_32.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_no_causalmask_no_attnbias_maxk_32.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_forward_bp16_no_causalmask_no_attnbias_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_no_causalmask_no_attnbias_maxk_64.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_forward_bp16_no_causalmask_no_attnbias_maxk_64.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_no_causalmask_no_attnbias_maxk_64.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_forward_fp16_with_causalmask_with_attnbias_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_has_causalmask_has_attnbias_maxk_128.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_forward_fp16_with_causalmask_with_attnbias_maxk_128.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_has_causalmask_has_attnbias_maxk_128.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_forward_fp16_with_causalmask_with_attnbias_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_has_causalmask_has_attnbias_maxk_256.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_forward_fp16_with_causalmask_with_attnbias_maxk_256.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_has_causalmask_has_attnbias_maxk_256.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_forward_fp16_with_causalmask_with_attnbias_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_has_causalmask_has_attnbias_maxk_32.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_forward_fp16_with_causalmask_with_attnbias_maxk_32.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_has_causalmask_has_attnbias_maxk_32.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_forward_fp16_with_causalmask_with_attnbias_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_has_causalmask_has_attnbias_maxk_64.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_forward_fp16_with_causalmask_with_attnbias_maxk_64.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_has_causalmask_has_attnbias_maxk_64.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_forward_fp16_with_causalmask_no_attnbias_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_has_causalmask_no_attnbias_maxk_128.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_forward_fp16_with_causalmask_no_attnbias_maxk_128.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_has_causalmask_no_attnbias_maxk_128.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_forward_fp16_with_causalmask_no_attnbias_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_has_causalmask_no_attnbias_maxk_256.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_forward_fp16_with_causalmask_no_attnbias_maxk_256.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_has_causalmask_no_attnbias_maxk_256.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_forward_fp16_with_causalmask_no_attnbias_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_has_causalmask_no_attnbias_maxk_32.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_forward_fp16_with_causalmask_no_attnbias_maxk_32.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_has_causalmask_no_attnbias_maxk_32.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_forward_fp16_with_causalmask_no_attnbias_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_has_causalmask_no_attnbias_maxk_64.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_forward_fp16_with_causalmask_no_attnbias_maxk_64.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_has_causalmask_no_attnbias_maxk_64.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_forward_fp16_no_causalmask_with_attnbias_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_no_causalmask_has_attnbias_maxk_128.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_forward_fp16_no_causalmask_with_attnbias_maxk_128.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_no_causalmask_has_attnbias_maxk_128.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_forward_fp16_no_causalmask_with_attnbias_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_no_causalmask_has_attnbias_maxk_256.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_forward_fp16_no_causalmask_with_attnbias_maxk_256.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_no_causalmask_has_attnbias_maxk_256.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_forward_fp16_no_causalmask_with_attnbias_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_no_causalmask_has_attnbias_maxk_32.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_forward_fp16_no_causalmask_with_attnbias_maxk_32.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_no_causalmask_has_attnbias_maxk_32.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_forward_fp16_no_causalmask_with_attnbias_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_no_causalmask_has_attnbias_maxk_64.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_forward_fp16_no_causalmask_with_attnbias_maxk_64.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_no_causalmask_has_attnbias_maxk_64.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_forward_fp16_no_causalmask_no_attnbias_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_no_causalmask_no_attnbias_maxk_128.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_forward_fp16_no_causalmask_no_attnbias_maxk_128.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_no_causalmask_no_attnbias_maxk_128.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_forward_fp16_no_causalmask_no_attnbias_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_no_causalmask_no_attnbias_maxk_256.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_forward_fp16_no_causalmask_no_attnbias_maxk_256.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_no_causalmask_no_attnbias_maxk_256.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_forward_fp16_no_causalmask_no_attnbias_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_no_causalmask_no_attnbias_maxk_32.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_forward_fp16_no_causalmask_no_attnbias_maxk_32.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_no_causalmask_no_attnbias_maxk_32.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_forward_fp16_no_causalmask_no_attnbias_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_no_causalmask_no_attnbias_maxk_64.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_forward_fp16_no_causalmask_no_attnbias_maxk_64.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_no_causalmask_no_attnbias_maxk_64.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_infer_bp16_with_causalmask_with_attnbias_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_has_causalmask_has_attnbias_maxk_128.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_infer_bp16_with_causalmask_with_attnbias_maxk_128.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_has_causalmask_has_attnbias_maxk_128.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_infer_bp16_with_causalmask_with_attnbias_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_has_causalmask_has_attnbias_maxk_256.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_infer_bp16_with_causalmask_with_attnbias_maxk_256.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_has_causalmask_has_attnbias_maxk_256.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_infer_bp16_with_causalmask_with_attnbias_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_has_causalmask_has_attnbias_maxk_32.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_infer_bp16_with_causalmask_with_attnbias_maxk_32.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_has_causalmask_has_attnbias_maxk_32.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_infer_bp16_with_causalmask_with_attnbias_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_has_causalmask_has_attnbias_maxk_64.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_infer_bp16_with_causalmask_with_attnbias_maxk_64.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_has_causalmask_has_attnbias_maxk_64.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_infer_bp16_with_causalmask_no_attnbias_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_has_causalmask_no_attnbias_maxk_128.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_infer_bp16_with_causalmask_no_attnbias_maxk_128.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_has_causalmask_no_attnbias_maxk_128.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_infer_bp16_with_causalmask_no_attnbias_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_has_causalmask_no_attnbias_maxk_256.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_infer_bp16_with_causalmask_no_attnbias_maxk_256.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_has_causalmask_no_attnbias_maxk_256.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_infer_bp16_with_causalmask_no_attnbias_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_has_causalmask_no_attnbias_maxk_32.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_infer_bp16_with_causalmask_no_attnbias_maxk_32.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_has_causalmask_no_attnbias_maxk_32.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_infer_bp16_with_causalmask_no_attnbias_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_has_causalmask_no_attnbias_maxk_64.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_infer_bp16_with_causalmask_no_attnbias_maxk_64.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_has_causalmask_no_attnbias_maxk_64.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_infer_bp16_no_causalmask_with_attnbias_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_no_causalmask_has_attnbias_maxk_128.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_infer_bp16_no_causalmask_with_attnbias_maxk_128.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_no_causalmask_has_attnbias_maxk_128.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_infer_bp16_no_causalmask_with_attnbias_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_no_causalmask_has_attnbias_maxk_256.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_infer_bp16_no_causalmask_with_attnbias_maxk_256.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_no_causalmask_has_attnbias_maxk_256.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_infer_bp16_no_causalmask_with_attnbias_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_no_causalmask_has_attnbias_maxk_32.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_infer_bp16_no_causalmask_with_attnbias_maxk_32.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_no_causalmask_has_attnbias_maxk_32.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_infer_bp16_no_causalmask_with_attnbias_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_no_causalmask_has_attnbias_maxk_64.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_infer_bp16_no_causalmask_with_attnbias_maxk_64.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_no_causalmask_has_attnbias_maxk_64.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_infer_bp16_no_causalmask_no_attnbias_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_no_causalmask_no_attnbias_maxk_128.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_infer_bp16_no_causalmask_no_attnbias_maxk_128.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_no_causalmask_no_attnbias_maxk_128.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_infer_bp16_no_causalmask_no_attnbias_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_no_causalmask_no_attnbias_maxk_256.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_infer_bp16_no_causalmask_no_attnbias_maxk_256.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_no_causalmask_no_attnbias_maxk_256.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_infer_bp16_no_causalmask_no_attnbias_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_no_causalmask_no_attnbias_maxk_32.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_infer_bp16_no_causalmask_no_attnbias_maxk_32.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_no_causalmask_no_attnbias_maxk_32.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_infer_bp16_no_causalmask_no_attnbias_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_no_causalmask_no_attnbias_maxk_64.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_infer_bp16_no_causalmask_no_attnbias_maxk_64.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_no_causalmask_no_attnbias_maxk_64.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_infer_fp16_with_causalmask_with_attnbias_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_has_causalmask_has_attnbias_maxk_128.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_infer_fp16_with_causalmask_with_attnbias_maxk_128.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_has_causalmask_has_attnbias_maxk_128.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_infer_fp16_with_causalmask_with_attnbias_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_has_causalmask_has_attnbias_maxk_256.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_infer_fp16_with_causalmask_with_attnbias_maxk_256.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_has_causalmask_has_attnbias_maxk_256.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_infer_fp16_with_causalmask_with_attnbias_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_has_causalmask_has_attnbias_maxk_32.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_infer_fp16_with_causalmask_with_attnbias_maxk_32.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_has_causalmask_has_attnbias_maxk_32.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_infer_fp16_with_causalmask_with_attnbias_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_has_causalmask_has_attnbias_maxk_64.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_infer_fp16_with_causalmask_with_attnbias_maxk_64.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_has_causalmask_has_attnbias_maxk_64.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_infer_fp16_with_causalmask_no_attnbias_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_has_causalmask_no_attnbias_maxk_128.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_infer_fp16_with_causalmask_no_attnbias_maxk_128.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_has_causalmask_no_attnbias_maxk_128.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_infer_fp16_with_causalmask_no_attnbias_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_has_causalmask_no_attnbias_maxk_256.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_infer_fp16_with_causalmask_no_attnbias_maxk_256.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_has_causalmask_no_attnbias_maxk_256.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_infer_fp16_with_causalmask_no_attnbias_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_has_causalmask_no_attnbias_maxk_32.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_infer_fp16_with_causalmask_no_attnbias_maxk_32.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_has_causalmask_no_attnbias_maxk_32.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_infer_fp16_with_causalmask_no_attnbias_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_has_causalmask_no_attnbias_maxk_64.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_infer_fp16_with_causalmask_no_attnbias_maxk_64.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_has_causalmask_no_attnbias_maxk_64.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_infer_fp16_no_causalmask_with_attnbias_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_no_causalmask_has_attnbias_maxk_128.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_infer_fp16_no_causalmask_with_attnbias_maxk_128.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_no_causalmask_has_attnbias_maxk_128.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_infer_fp16_no_causalmask_with_attnbias_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_no_causalmask_has_attnbias_maxk_256.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_infer_fp16_no_causalmask_with_attnbias_maxk_256.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_no_causalmask_has_attnbias_maxk_256.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_infer_fp16_no_causalmask_with_attnbias_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_no_causalmask_has_attnbias_maxk_32.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_infer_fp16_no_causalmask_with_attnbias_maxk_32.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_no_causalmask_has_attnbias_maxk_32.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_infer_fp16_no_causalmask_with_attnbias_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_no_causalmask_has_attnbias_maxk_64.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_infer_fp16_no_causalmask_with_attnbias_maxk_64.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_no_causalmask_has_attnbias_maxk_64.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_infer_fp16_no_causalmask_no_attnbias_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_no_causalmask_no_attnbias_maxk_128.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_infer_fp16_no_causalmask_no_attnbias_maxk_128.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_no_causalmask_no_attnbias_maxk_128.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_infer_fp16_no_causalmask_no_attnbias_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_no_causalmask_no_attnbias_maxk_256.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_infer_fp16_no_causalmask_no_attnbias_maxk_256.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_no_causalmask_no_attnbias_maxk_256.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_infer_fp16_no_causalmask_no_attnbias_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_no_causalmask_no_attnbias_maxk_32.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_infer_fp16_no_causalmask_no_attnbias_maxk_32.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_no_causalmask_no_attnbias_maxk_32.cpp
diff --git a/xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_infer_fp16_no_causalmask_no_attnbias_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_no_causalmask_no_attnbias_maxk_64.cpp
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances/ck_tiled_fmha_grouped_infer_fp16_no_causalmask_no_attnbias_maxk_64.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_no_causalmask_no_attnbias_maxk_64.cpp

From 4c6c08d470434ea5a89ea93caef5e328b7bef32e Mon Sep 17 00:00:00 2001
From: Qianfeng Zhang <Qianfeng.Zhang@amd.com>
Date: Sun, 14 Apr 2024 23:47:44 +0000
Subject: [PATCH 522/641] Rename the template parameters

---
 .../hip_fmha/ck_tiled_fmha_batched_backward.h | 72 +++++++++----------
 .../ck_tiled_fmha_batched_backward_bp16.cpp   | 16 ++---
 .../ck_tiled_fmha_batched_backward_fp16.cpp   | 16 ++---
 .../hip_fmha/ck_tiled_fmha_batched_forward.h  | 54 +++++++-------
 .../ck_tiled_fmha_batched_forward_bp16.cpp    |  6 +-
 .../ck_tiled_fmha_batched_forward_fp16.cpp    |  6 +-
 .../hip_fmha/ck_tiled_fmha_batched_infer.h    | 54 +++++++-------
 .../ck_tiled_fmha_batched_infer_bp16.cpp      |  6 +-
 .../ck_tiled_fmha_batched_infer_fp16.cpp      |  6 +-
 .../hip_fmha/ck_tiled_fmha_grouped_backward.h | 72 +++++++++----------
 .../ck_tiled_fmha_grouped_backward_fp16.cpp   | 16 ++---
 .../hip_fmha/ck_tiled_fmha_grouped_forward.h  | 48 ++++++-------
 .../ck_tiled_fmha_grouped_forward_bp16.cpp    |  6 +-
 .../ck_tiled_fmha_grouped_forward_fp16.cpp    |  6 +-
 .../hip_fmha/ck_tiled_fmha_grouped_infer.h    | 48 ++++++-------
 .../ck_tiled_fmha_grouped_infer_bp16.cpp      |  6 +-
 .../ck_tiled_fmha_grouped_infer_fp16.cpp      |  6 +-
 17 files changed, 216 insertions(+), 228 deletions(-)

diff --git a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_backward.h b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_backward.h
index ccc7e7d3a..9af5bf1c3 100644
--- a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_backward.h
+++ b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_backward.h
@@ -29,37 +29,37 @@
 #include "ck_tiled_fmha_bwd_tile_partitioner.hpp"
 
 template <
-    typename scalar_t,
-    bool has_causal_mask,
-    bool has_attn_bias,
-    bool has_bias_grad,
+    typename ScalarType,
+    bool kHasCausalMask,
+    bool kHasBias,
+    bool kHasBiasGrad,
     ck::index_t MaxK>
 struct batched_backward_causalmask_attnbias_dispatched {
   using FmhaBwdEpilogue_ = FmhaBwdEpilogue<FmhaBwdEpilogueProblem<
-      typename FmhaBwdTypeConfig<scalar_t>::AccDataType,
-      typename FmhaBwdTypeConfig<scalar_t>::KGradDataType,
-      typename FmhaBwdTypeConfig<scalar_t>::VGradDataType>>;
+      typename FmhaBwdTypeConfig<ScalarType>::AccDataType,
+      typename FmhaBwdTypeConfig<ScalarType>::KGradDataType,
+      typename FmhaBwdTypeConfig<ScalarType>::VGradDataType>>;
 
   using FmhaBwdLoadStrategy_ = typename FmhaBwdLoadStrategy<MaxK>::type;
 
   template <typename FmhaTraits, typename FmhaMask>
   using FmhaBwdPipelineProblemTemp =
       ck::tile_program::block::BlockFmhaBwdPipelineProblem<
-          typename FmhaBwdTypeConfig<scalar_t>::QDataType,
-          typename FmhaBwdTypeConfig<scalar_t>::KDataType,
-          typename FmhaBwdTypeConfig<scalar_t>::VDataType,
-          typename FmhaBwdTypeConfig<scalar_t>::GemmDataType,
-          typename FmhaBwdTypeConfig<scalar_t>::LSEDataType,
-          typename FmhaBwdTypeConfig<scalar_t>::AccDataType,
-          typename FmhaBwdTypeConfig<scalar_t>::DDataType,
-          typename FmhaBwdTypeConfig<scalar_t>::BiasDataType,
-          typename FmhaBwdTypeConfig<scalar_t>::RandValOutputDataType,
-          typename FmhaBwdTypeConfig<scalar_t>::ODataType,
-          typename FmhaBwdTypeConfig<scalar_t>::OGradDataType,
-          typename FmhaBwdTypeConfig<scalar_t>::QGradDataType,
-          typename FmhaBwdTypeConfig<scalar_t>::KGradDataType,
-          typename FmhaBwdTypeConfig<scalar_t>::VGradDataType,
-          typename FmhaBwdTypeConfig<scalar_t>::BiasGradDataType,
+          typename FmhaBwdTypeConfig<ScalarType>::QDataType,
+          typename FmhaBwdTypeConfig<ScalarType>::KDataType,
+          typename FmhaBwdTypeConfig<ScalarType>::VDataType,
+          typename FmhaBwdTypeConfig<ScalarType>::GemmDataType,
+          typename FmhaBwdTypeConfig<ScalarType>::LSEDataType,
+          typename FmhaBwdTypeConfig<ScalarType>::AccDataType,
+          typename FmhaBwdTypeConfig<ScalarType>::DDataType,
+          typename FmhaBwdTypeConfig<ScalarType>::BiasDataType,
+          typename FmhaBwdTypeConfig<ScalarType>::RandValOutputDataType,
+          typename FmhaBwdTypeConfig<ScalarType>::ODataType,
+          typename FmhaBwdTypeConfig<ScalarType>::OGradDataType,
+          typename FmhaBwdTypeConfig<ScalarType>::QGradDataType,
+          typename FmhaBwdTypeConfig<ScalarType>::KGradDataType,
+          typename FmhaBwdTypeConfig<ScalarType>::VGradDataType,
+          typename FmhaBwdTypeConfig<ScalarType>::BiasGradDataType,
           FmhaBwdShape<MaxK>,
           false, // kIsGroupMode
           FmhaMask,
@@ -85,9 +85,9 @@ struct batched_backward_causalmask_attnbias_dispatched {
 
             using FmhaBwdOGradDotOPipelineProblem =
                 ck::tile_program::block::BlockFmhaBwdOGradDotOPipelineProblem<
-                    typename FmhaBwdTypeConfig<scalar_t>::ODataType,
-                    typename FmhaBwdTypeConfig<scalar_t>::OGradDataType,
-                    typename FmhaBwdTypeConfig<scalar_t>::DDataType,
+                    typename FmhaBwdTypeConfig<ScalarType>::ODataType,
+                    typename FmhaBwdTypeConfig<ScalarType>::OGradDataType,
+                    typename FmhaBwdTypeConfig<ScalarType>::DDataType,
                     kBlockSize,
                     FmhaBwdShape<MaxK>::kVHeaddim,
                     false, // kIsGroupMode
@@ -110,7 +110,7 @@ struct batched_backward_causalmask_attnbias_dispatched {
 
       BOOL_SWITCH(has_local_attention, USE_LOCAL_ATTENTION, [&] {
         constexpr ck::index_t occupancy = 1;
-        constexpr bool has_masking = has_causal_mask || USE_LOCAL_ATTENTION;
+        constexpr bool has_masking = kHasCausalMask || USE_LOCAL_ATTENTION;
         const bool has_dropout = (param.dropout_prob > 0.0f);
 
         using FmhaMask =
@@ -136,8 +136,8 @@ struct batched_backward_causalmask_attnbias_dispatched {
               kPadSeqLenK,
               kPadHeadDim, // kPadHeadDimQ,
               kPadHeadDim, // kPadHeadDimV,
-              has_attn_bias,
-              has_bias_grad,
+              kHasBias,
+              kHasBiasGrad,
               false, // kStoreLSE
               kHasDropout,
               occupancy>;
@@ -279,18 +279,18 @@ struct batched_backward_causalmask_attnbias_dispatched {
 };
 
 template <
-    typename scalar_t,
-    bool has_causal_mask,
-    bool has_attn_bias,
-    bool has_bias_grad,
+    typename ScalarType,
+    bool kHasCausalMask,
+    bool kHasBias,
+    bool kHasBiasGrad,
     ck::index_t MaxK>
 void run_batched_backward_causalmask_attnbias_dispatched(
     BatchedBackwardParams& param,
     hipStream_t stream) {
   batched_backward_causalmask_attnbias_dispatched<
-      scalar_t,
-      has_causal_mask,
-      has_attn_bias,
-      has_bias_grad,
+      ScalarType,
+      kHasCausalMask,
+      kHasBias,
+      kHasBiasGrad,
       MaxK>::Run(param, stream);
 };
diff --git a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_backward_bp16.cpp b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_backward_bp16.cpp
index f82fdc061..8d0445ddf 100644
--- a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_backward_bp16.cpp
+++ b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_backward_bp16.cpp
@@ -55,26 +55,22 @@ extern template void run_batched_backward_causalmask_attnbias_dispatched<ck::bha
 
 void batched_backward_bp16(BatchedBackwardParams& param, hipStream_t stream) {
   BOOL_SWITCH_2(
-      param.has_attn_bias,
-      HAS_ATTN_BIAS,
-      param.bias_has_grad,
-      HAS_BIAS_GRAD,
-      [&] {
-        if constexpr (HAS_ATTN_BIAS || !HAS_BIAS_GRAD) {
+      param.has_attn_bias, kHasBias, param.bias_has_grad, kHasBiasGrad, [&] {
+        if constexpr (kHasBias || !kHasBiasGrad) {
           FMHA_BWD_HEADDIM_SWITCH(param.K, param.Kv, MaxK, [&] {
             if (param.custom_mask_type == 0)
               run_batched_backward_causalmask_attnbias_dispatched<
                   ck::bhalf_t,
                   false,
-                  HAS_ATTN_BIAS,
-                  HAS_BIAS_GRAD,
+                  kHasBias,
+                  kHasBiasGrad,
                   MaxK>(param, stream);
             else if (param.custom_mask_type == 1 || param.custom_mask_type == 2)
               run_batched_backward_causalmask_attnbias_dispatched<
                   ck::bhalf_t,
                   true,
-                  HAS_ATTN_BIAS,
-                  HAS_BIAS_GRAD,
+                  kHasBias,
+                  kHasBiasGrad,
                   MaxK>(param, stream);
             else
               throw std::runtime_error("Invalid custom_mask_type value");
diff --git a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_backward_fp16.cpp b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_backward_fp16.cpp
index f8395acdb..a0d0cca7d 100644
--- a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_backward_fp16.cpp
+++ b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_backward_fp16.cpp
@@ -55,26 +55,22 @@ extern template void run_batched_backward_causalmask_attnbias_dispatched<ck::hal
 
 void batched_backward_fp16(BatchedBackwardParams& param, hipStream_t stream) {
   BOOL_SWITCH_2(
-      param.has_attn_bias,
-      HAS_ATTN_BIAS,
-      param.bias_has_grad,
-      HAS_BIAS_GRAD,
-      [&] {
-        if constexpr (HAS_ATTN_BIAS || !HAS_BIAS_GRAD) {
+      param.has_attn_bias, kHasBias, param.bias_has_grad, kHasBiasGrad, [&] {
+        if constexpr (kHasBias || !kHasBiasGrad) {
           FMHA_BWD_HEADDIM_SWITCH(param.K, param.Kv, MaxK, [&] {
             if (param.custom_mask_type == 0)
               run_batched_backward_causalmask_attnbias_dispatched<
                   ck::half_t,
                   false,
-                  HAS_ATTN_BIAS,
-                  HAS_BIAS_GRAD,
+                  kHasBias,
+                  kHasBiasGrad,
                   MaxK>(param, stream);
             else if (param.custom_mask_type == 1 || param.custom_mask_type == 2)
               run_batched_backward_causalmask_attnbias_dispatched<
                   ck::half_t,
                   true,
-                  HAS_ATTN_BIAS,
-                  HAS_BIAS_GRAD,
+                  kHasBias,
+                  kHasBiasGrad,
                   MaxK>(param, stream);
             else
               throw std::runtime_error("Invalid custom_mask_type value");
diff --git a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_forward.h b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_forward.h
index bce607f91..ee45f3631 100644
--- a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_forward.h
+++ b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_forward.h
@@ -29,25 +29,25 @@
 #include "ck_tiled_fmha_fwd_tile_partitioner.hpp"
 
 template <
-    typename scalar_t,
-    bool has_causal_mask,
-    bool has_attn_bias,
+    typename ScalarType,
+    bool kHasCausalMask,
+    bool kHasBias,
     ck::index_t MaxK>
 struct batched_forward_causalmask_attnbias_dispatched {
   template <typename FmhaTraits, typename FmhaMask>
   using FmhaPipelineProblemTemp =
       ck::tile_program::block::BlockFmhaPipelineProblem<
-          typename FmhaFwdTypeConfig<scalar_t>::QDataType,
-          typename FmhaFwdTypeConfig<scalar_t>::KDataType,
-          typename FmhaFwdTypeConfig<scalar_t>::VDataType,
-          typename FmhaFwdTypeConfig<scalar_t>::SaccDataType,
-          typename FmhaFwdTypeConfig<scalar_t>::SMPLComputeDataType,
-          typename FmhaFwdTypeConfig<scalar_t>::BiasDataType,
-          typename FmhaFwdTypeConfig<scalar_t>::RandValOutputDataType,
-          typename FmhaFwdTypeConfig<scalar_t>::LSEDataType,
-          typename FmhaFwdTypeConfig<scalar_t>::PDataType,
-          typename FmhaFwdTypeConfig<scalar_t>::OaccDataType,
-          typename FmhaFwdTypeConfig<scalar_t>::ODataType,
+          typename FmhaFwdTypeConfig<ScalarType>::QDataType,
+          typename FmhaFwdTypeConfig<ScalarType>::KDataType,
+          typename FmhaFwdTypeConfig<ScalarType>::VDataType,
+          typename FmhaFwdTypeConfig<ScalarType>::SaccDataType,
+          typename FmhaFwdTypeConfig<ScalarType>::SMPLComputeDataType,
+          typename FmhaFwdTypeConfig<ScalarType>::BiasDataType,
+          typename FmhaFwdTypeConfig<ScalarType>::RandValOutputDataType,
+          typename FmhaFwdTypeConfig<ScalarType>::LSEDataType,
+          typename FmhaFwdTypeConfig<ScalarType>::PDataType,
+          typename FmhaFwdTypeConfig<ScalarType>::OaccDataType,
+          typename FmhaFwdTypeConfig<ScalarType>::ODataType,
           FmhaFwdShape<MaxK>,
           false, // kIsGroupMode
           FmhaMask,
@@ -57,7 +57,7 @@ struct batched_forward_causalmask_attnbias_dispatched {
     const bool has_local_attention = (param.window_size > 0) ? true : false;
 
     BOOL_SWITCH(has_local_attention, USE_LOCAL_ATTENTION, [&] {
-      constexpr bool has_masking = has_causal_mask || USE_LOCAL_ATTENTION;
+      constexpr bool has_masking = kHasCausalMask || USE_LOCAL_ATTENTION;
       const bool has_dropout = (param.dropout_prob > 0.0f);
 
       using FmhaMask =
@@ -97,7 +97,7 @@ struct batched_forward_causalmask_attnbias_dispatched {
                 kPadSeqLenK,
                 kPadHeadDim, // kPadHeadDimQ
                 kPadHeadDim, // kPadHeadDimV
-                has_attn_bias,
+                kHasBias,
                 false, // kHasBiasGrad place-holder
                 true, // kStoreLSE
                 kHasDropout,
@@ -111,8 +111,8 @@ struct batched_forward_causalmask_attnbias_dispatched {
                     FmhaPipelineProblem>;
 
             using FmhaFwdEpilogue_ = FmhaFwdEpilogue<FmhaFwdEpilogueProblem<
-                typename FmhaFwdTypeConfig<scalar_t>::OaccDataType,
-                typename FmhaFwdTypeConfig<scalar_t>::ODataType,
+                typename FmhaFwdTypeConfig<ScalarType>::OaccDataType,
+                typename FmhaFwdTypeConfig<ScalarType>::ODataType,
                 kPadSeqLenQ,
                 kPadHeadDim>>;
 
@@ -128,7 +128,7 @@ struct batched_forward_causalmask_attnbias_dispatched {
               BOOL_SWITCH_2(has_dropout, kHasDropout, pad_seqlen_k, kPadSeqLenK,
          [&] { using FmhaFwdTraits_ = ck::tile_program::TileFmhaTraits< true, //
          kPadSeqLenQ, kPadSeqLenK, true, // kPadHeadDimQ true, // kPadHeadDimV
-                    has_attn_bias,
+                    kHasBias,
                     true, // kStoreLSE
                     kHasDropout,
                     occupancy>;
@@ -141,8 +141,8 @@ struct batched_forward_causalmask_attnbias_dispatched {
                         FmhaPipelineProblem>;
 
                 using FmhaFwdEpilogue_ = FmhaFwdEpilogue<FmhaFwdEpilogueProblem<
-                    typename FmhaFwdTypeConfig<scalar_t>::OaccDataType,
-                    typename FmhaFwdTypeConfig<scalar_t>::ODataType,
+                    typename FmhaFwdTypeConfig<ScalarType>::OaccDataType,
+                    typename FmhaFwdTypeConfig<ScalarType>::ODataType,
                     true,
                     true>>;
 
@@ -226,16 +226,16 @@ struct batched_forward_causalmask_attnbias_dispatched {
 };
 
 template <
-    typename scalar_t,
-    bool has_causal_mask,
-    bool has_attn_bias,
+    typename ScalarType,
+    bool kHasCausalMask,
+    bool kHasBias,
     ck::index_t MaxK>
 void run_batched_forward_causalmask_attnbias_dispatched(
     BatchedForwardParams& param,
     hipStream_t stream) {
   batched_forward_causalmask_attnbias_dispatched<
-      scalar_t,
-      has_causal_mask,
-      has_attn_bias,
+      ScalarType,
+      kHasCausalMask,
+      kHasBias,
       MaxK>::Run(param, stream);
 };
diff --git a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_forward_bp16.cpp b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_forward_bp16.cpp
index 774e2974c..90a8b2c59 100644
--- a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_forward_bp16.cpp
+++ b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_forward_bp16.cpp
@@ -51,19 +51,19 @@ extern template void run_batched_forward_causalmask_attnbias_dispatched<ck::bhal
 // clang-format on
 
 void batched_forward_bp16(BatchedForwardParams& param, hipStream_t stream) {
-  BOOL_SWITCH(param.has_attn_bias, HAS_ATTN_BIAS, [&] {
+  BOOL_SWITCH(param.has_attn_bias, kHasBias, [&] {
     FMHA_FWD_HEADDIM_SWITCH(param.K, param.Kv, MaxK, [&] {
       if (param.custom_mask_type == 0)
         run_batched_forward_causalmask_attnbias_dispatched<
             ck::bhalf_t,
             false,
-            HAS_ATTN_BIAS,
+            kHasBias,
             MaxK>(param, stream);
       else if (param.custom_mask_type == 1 || param.custom_mask_type == 2)
         run_batched_forward_causalmask_attnbias_dispatched<
             ck::bhalf_t,
             true,
-            HAS_ATTN_BIAS,
+            kHasBias,
             MaxK>(param, stream);
       else
         throw std::runtime_error("Invalid custom_mask_type value");
diff --git a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_forward_fp16.cpp b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_forward_fp16.cpp
index 4e194c3e7..469de6c79 100644
--- a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_forward_fp16.cpp
+++ b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_forward_fp16.cpp
@@ -51,19 +51,19 @@ extern template void run_batched_forward_causalmask_attnbias_dispatched<ck::half
 // clang-format on
 
 void batched_forward_fp16(BatchedForwardParams& param, hipStream_t stream) {
-  BOOL_SWITCH(param.has_attn_bias, HAS_ATTN_BIAS, [&] {
+  BOOL_SWITCH(param.has_attn_bias, kHasBias, [&] {
     FMHA_FWD_HEADDIM_SWITCH(param.K, param.Kv, MaxK, [&] {
       if (param.custom_mask_type == 0)
         run_batched_forward_causalmask_attnbias_dispatched<
             ck::half_t,
             false,
-            HAS_ATTN_BIAS,
+            kHasBias,
             MaxK>(param, stream);
       else if (param.custom_mask_type == 1 || param.custom_mask_type == 2)
         run_batched_forward_causalmask_attnbias_dispatched<
             ck::half_t,
             true,
-            HAS_ATTN_BIAS,
+            kHasBias,
             MaxK>(param, stream);
       else
         throw std::runtime_error("Invalid custom_mask_type value");
diff --git a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_infer.h b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_infer.h
index 4da93e6d3..4b53877f3 100644
--- a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_infer.h
+++ b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_infer.h
@@ -30,25 +30,25 @@
 #include "ck_tiled_fmha_fwd_tile_partitioner.hpp"
 
 template <
-    typename scalar_t,
-    bool has_causal_mask,
-    bool has_attn_bias,
+    typename ScalarType,
+    bool kHasCausalMask,
+    bool kHasBias,
     ck::index_t MaxK>
 struct batched_infer_causalmask_attnbias_dispatched {
   template <typename FmhaTraits, typename FmhaMask>
   using FmhaPipelineProblemTemp =
       ck::tile_program::block::BlockFmhaPipelineProblem<
-          typename FmhaFwdTypeConfig<scalar_t>::QDataType,
-          typename FmhaFwdTypeConfig<scalar_t>::KDataType,
-          typename FmhaFwdTypeConfig<scalar_t>::VDataType,
-          typename FmhaFwdTypeConfig<scalar_t>::SaccDataType,
-          typename FmhaFwdTypeConfig<scalar_t>::SMPLComputeDataType,
-          typename FmhaFwdTypeConfig<scalar_t>::BiasDataType,
-          typename FmhaFwdTypeConfig<scalar_t>::RandValOutputDataType,
-          typename FmhaFwdTypeConfig<scalar_t>::LSEDataType,
-          typename FmhaFwdTypeConfig<scalar_t>::PDataType,
-          typename FmhaFwdTypeConfig<scalar_t>::OaccDataType,
-          typename FmhaFwdTypeConfig<scalar_t>::ODataType,
+          typename FmhaFwdTypeConfig<ScalarType>::QDataType,
+          typename FmhaFwdTypeConfig<ScalarType>::KDataType,
+          typename FmhaFwdTypeConfig<ScalarType>::VDataType,
+          typename FmhaFwdTypeConfig<ScalarType>::SaccDataType,
+          typename FmhaFwdTypeConfig<ScalarType>::SMPLComputeDataType,
+          typename FmhaFwdTypeConfig<ScalarType>::BiasDataType,
+          typename FmhaFwdTypeConfig<ScalarType>::RandValOutputDataType,
+          typename FmhaFwdTypeConfig<ScalarType>::LSEDataType,
+          typename FmhaFwdTypeConfig<ScalarType>::PDataType,
+          typename FmhaFwdTypeConfig<ScalarType>::OaccDataType,
+          typename FmhaFwdTypeConfig<ScalarType>::ODataType,
           FmhaFwdShape<MaxK>,
           false, // kIsGroupMode
           FmhaMask,
@@ -58,7 +58,7 @@ struct batched_infer_causalmask_attnbias_dispatched {
     const bool has_local_attention = (param.window_size > 0) ? true : false;
 
     BOOL_SWITCH(has_local_attention, USE_LOCAL_ATTENTION, [&] {
-      constexpr bool has_masking = has_causal_mask || USE_LOCAL_ATTENTION;
+      constexpr bool has_masking = kHasCausalMask || USE_LOCAL_ATTENTION;
       const bool has_dropout = (param.dropout_prob > 0.0f);
 
       using FmhaMask =
@@ -97,7 +97,7 @@ struct batched_infer_causalmask_attnbias_dispatched {
                 kPadSeqLenK,
                 kPadHeadDim, // kPadHeadDimQ,
                 kPadHeadDim, // kPadHeadDimV,
-                has_attn_bias,
+                kHasBias,
                 false, // kHasBiasGrad place-holder
                 false, // kStoreLSE
                 kHasDropout,
@@ -111,8 +111,8 @@ struct batched_infer_causalmask_attnbias_dispatched {
                     FmhaPipelineProblem>;
 
             using FmhaEpilogue = FmhaFwdEpilogue<FmhaFwdEpilogueProblem<
-                typename FmhaFwdTypeConfig<scalar_t>::OaccDataType,
-                typename FmhaFwdTypeConfig<scalar_t>::ODataType,
+                typename FmhaFwdTypeConfig<ScalarType>::OaccDataType,
+                typename FmhaFwdTypeConfig<ScalarType>::ODataType,
                 kPadSeqLenQ,
                 kPadHeadDim>>;
 
@@ -129,7 +129,7 @@ struct batched_infer_causalmask_attnbias_dispatched {
                     kPadSeqLenK,
                     true, // kPadHeadDimQ,
                     true, // kPadHeadDimV,
-                    has_attn_bias,
+                    kHasBias,
                     false, // kStoreLSE
                     kHasDropout,
                     occupancy>;
@@ -142,8 +142,8 @@ struct batched_infer_causalmask_attnbias_dispatched {
                         FmhaPipelineProblem>;
 
                 using FmhaEpilogue = FmhaFwdEpilogue<FmhaFwdEpilogueProblem<
-                    typename FmhaFwdTypeConfig<scalar_t>::OaccDataType,
-                    typename FmhaFwdTypeConfig<scalar_t>::ODataType,
+                    typename FmhaFwdTypeConfig<ScalarType>::OaccDataType,
+                    typename FmhaFwdTypeConfig<ScalarType>::ODataType,
                     true,
                     true>>;
 
@@ -225,16 +225,16 @@ struct batched_infer_causalmask_attnbias_dispatched {
 };
 
 template <
-    typename scalar_t,
-    bool has_causal_mask,
-    bool has_attn_bias,
+    typename ScalarType,
+    bool kHasCausalMask,
+    bool kHasBias,
     ck::index_t MaxK>
 void run_batched_infer_causalmask_attnbias_dispatched(
     BatchedForwardParams& param,
     hipStream_t stream) {
   batched_infer_causalmask_attnbias_dispatched<
-      scalar_t,
-      has_causal_mask,
-      has_attn_bias,
+      ScalarType,
+      kHasCausalMask,
+      kHasBias,
       MaxK>::Run(param, stream);
 };
diff --git a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_infer_bp16.cpp b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_infer_bp16.cpp
index f4a2e064e..0bb91bc52 100644
--- a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_infer_bp16.cpp
+++ b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_infer_bp16.cpp
@@ -50,19 +50,19 @@ extern template void run_batched_infer_causalmask_attnbias_dispatched<ck::bhalf_
 // clang-format on
 
 void batched_infer_bp16(BatchedForwardParams& param, hipStream_t stream) {
-  BOOL_SWITCH(param.has_attn_bias, HAS_ATTN_BIAS, [&] {
+  BOOL_SWITCH(param.has_attn_bias, kHasBias, [&] {
     FMHA_FWD_HEADDIM_SWITCH(param.K, param.Kv, MaxK, [&] {
       if (param.custom_mask_type == 0)
         run_batched_infer_causalmask_attnbias_dispatched<
             ck::bhalf_t,
             false,
-            HAS_ATTN_BIAS,
+            kHasBias,
             MaxK>(param, stream);
       else if (param.custom_mask_type == 1 || param.custom_mask_type == 2)
         run_batched_infer_causalmask_attnbias_dispatched<
             ck::bhalf_t,
             true,
-            HAS_ATTN_BIAS,
+            kHasBias,
             MaxK>(param, stream);
       else
         throw std::runtime_error("Invalid custom_mask_type value");
diff --git a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_infer_fp16.cpp b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_infer_fp16.cpp
index 653cfacbd..9e5ebe808 100644
--- a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_infer_fp16.cpp
+++ b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_infer_fp16.cpp
@@ -50,19 +50,19 @@ extern template void run_batched_infer_causalmask_attnbias_dispatched<ck::half_t
 // clang-format on
 
 void batched_infer_fp16(BatchedForwardParams& param, hipStream_t stream) {
-  BOOL_SWITCH(param.has_attn_bias, HAS_ATTN_BIAS, [&] {
+  BOOL_SWITCH(param.has_attn_bias, kHasBias, [&] {
     FMHA_FWD_HEADDIM_SWITCH(param.K, param.Kv, MaxK, [&] {
       if (param.custom_mask_type == 0)
         run_batched_infer_causalmask_attnbias_dispatched<
             ck::half_t,
             false,
-            HAS_ATTN_BIAS,
+            kHasBias,
             MaxK>(param, stream);
       else if (param.custom_mask_type == 1 || param.custom_mask_type == 2)
         run_batched_infer_causalmask_attnbias_dispatched<
             ck::half_t,
             true,
-            HAS_ATTN_BIAS,
+            kHasBias,
             MaxK>(param, stream);
       else
         throw std::runtime_error("Invalid custom_mask_type value");
diff --git a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_backward.h b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_backward.h
index 0adda65cf..9a77d4f10 100644
--- a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_backward.h
+++ b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_backward.h
@@ -29,37 +29,37 @@
 #include "ck_tiled_fmha_bwd_tile_partitioner.hpp"
 
 template <
-    typename scalar_t,
-    bool has_causal_mask,
-    bool has_attn_bias,
-    bool has_bias_grad,
+    typename ScalarType,
+    bool kHasCausalMask,
+    bool kHasBias,
+    bool kHasBiasGrad,
     ck::index_t MaxK>
 struct grouped_backward_causalmask_attnbias_dispatched {
   using FmhaBwdEpilogue_ = FmhaBwdEpilogue<FmhaBwdEpilogueProblem<
-      typename FmhaBwdTypeConfig<scalar_t>::AccDataType,
-      typename FmhaBwdTypeConfig<scalar_t>::KGradDataType,
-      typename FmhaBwdTypeConfig<scalar_t>::VGradDataType>>;
+      typename FmhaBwdTypeConfig<ScalarType>::AccDataType,
+      typename FmhaBwdTypeConfig<ScalarType>::KGradDataType,
+      typename FmhaBwdTypeConfig<ScalarType>::VGradDataType>>;
 
   using FmhaBwdLoadStrategy_ = typename FmhaBwdLoadStrategy<MaxK>::type;
 
   template <typename FmhaTraits, typename FmhaMask>
   using FmhaBwdPipelineProblemTemp =
       ck::tile_program::block::BlockFmhaBwdPipelineProblem<
-          typename FmhaBwdTypeConfig<scalar_t>::QDataType,
-          typename FmhaBwdTypeConfig<scalar_t>::KDataType,
-          typename FmhaBwdTypeConfig<scalar_t>::VDataType,
-          typename FmhaBwdTypeConfig<scalar_t>::GemmDataType,
-          typename FmhaBwdTypeConfig<scalar_t>::LSEDataType,
-          typename FmhaBwdTypeConfig<scalar_t>::AccDataType,
-          typename FmhaBwdTypeConfig<scalar_t>::DDataType,
-          typename FmhaBwdTypeConfig<scalar_t>::BiasDataType,
-          typename FmhaBwdTypeConfig<scalar_t>::RandValOutputDataType,
-          typename FmhaBwdTypeConfig<scalar_t>::ODataType,
-          typename FmhaBwdTypeConfig<scalar_t>::OGradDataType,
-          typename FmhaBwdTypeConfig<scalar_t>::QGradDataType,
-          typename FmhaBwdTypeConfig<scalar_t>::KGradDataType,
-          typename FmhaBwdTypeConfig<scalar_t>::VGradDataType,
-          typename FmhaBwdTypeConfig<scalar_t>::BiasGradDataType,
+          typename FmhaBwdTypeConfig<ScalarType>::QDataType,
+          typename FmhaBwdTypeConfig<ScalarType>::KDataType,
+          typename FmhaBwdTypeConfig<ScalarType>::VDataType,
+          typename FmhaBwdTypeConfig<ScalarType>::GemmDataType,
+          typename FmhaBwdTypeConfig<ScalarType>::LSEDataType,
+          typename FmhaBwdTypeConfig<ScalarType>::AccDataType,
+          typename FmhaBwdTypeConfig<ScalarType>::DDataType,
+          typename FmhaBwdTypeConfig<ScalarType>::BiasDataType,
+          typename FmhaBwdTypeConfig<ScalarType>::RandValOutputDataType,
+          typename FmhaBwdTypeConfig<ScalarType>::ODataType,
+          typename FmhaBwdTypeConfig<ScalarType>::OGradDataType,
+          typename FmhaBwdTypeConfig<ScalarType>::QGradDataType,
+          typename FmhaBwdTypeConfig<ScalarType>::KGradDataType,
+          typename FmhaBwdTypeConfig<ScalarType>::VGradDataType,
+          typename FmhaBwdTypeConfig<ScalarType>::BiasGradDataType,
           FmhaBwdShape<MaxK>,
           true, // kIsGroupMode
           FmhaMask,
@@ -83,9 +83,9 @@ struct grouped_backward_causalmask_attnbias_dispatched {
 
             using FmhaBwdOGradDotOPipelineProblem =
                 ck::tile_program::block::BlockFmhaBwdOGradDotOPipelineProblem<
-                    typename FmhaBwdTypeConfig<scalar_t>::ODataType,
-                    typename FmhaBwdTypeConfig<scalar_t>::OGradDataType,
-                    typename FmhaBwdTypeConfig<scalar_t>::DDataType,
+                    typename FmhaBwdTypeConfig<ScalarType>::ODataType,
+                    typename FmhaBwdTypeConfig<ScalarType>::OGradDataType,
+                    typename FmhaBwdTypeConfig<ScalarType>::DDataType,
                     kBlockSize,
                     FmhaBwdShape<MaxK>::kVHeaddim,
                     true, // kIsGroupMode
@@ -108,7 +108,7 @@ struct grouped_backward_causalmask_attnbias_dispatched {
 
       BOOL_SWITCH(has_local_attention, USE_LOCAL_ATTENTION, [&] {
         constexpr ck::index_t occupancy = 1;
-        constexpr bool has_masking = has_causal_mask || USE_LOCAL_ATTENTION;
+        constexpr bool has_masking = kHasCausalMask || USE_LOCAL_ATTENTION;
         const bool has_dropout = (param.dropout_prob > 0.0f);
 
         using FmhaMask =
@@ -134,8 +134,8 @@ struct grouped_backward_causalmask_attnbias_dispatched {
               kPadSeqLenK,
               kPadHeadDim, // kPadHeadDimQ,
               kPadHeadDim, // kPadHeadDimV,
-              has_attn_bias,
-              has_bias_grad,
+              kHasBias,
+              kHasBiasGrad,
               false, // kStoreLSE
               kHasDropout,
               occupancy>;
@@ -266,18 +266,18 @@ struct grouped_backward_causalmask_attnbias_dispatched {
 };
 
 template <
-    typename scalar_t,
-    bool has_causal_mask,
-    bool has_attn_bias,
-    bool has_bias_grad,
+    typename ScalarType,
+    bool kHasCausalMask,
+    bool kHasBias,
+    bool kHasBiasGrad,
     ck::index_t MaxK>
 void run_grouped_backward_causalmask_attnbias_dispatched(
     GroupedBackwardParams& param,
     hipStream_t stream) {
   grouped_backward_causalmask_attnbias_dispatched<
-      scalar_t,
-      has_causal_mask,
-      has_attn_bias,
-      has_bias_grad,
+      ScalarType,
+      kHasCausalMask,
+      kHasBias,
+      kHasBiasGrad,
       MaxK>::Run(param, stream);
 };
diff --git a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_backward_fp16.cpp b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_backward_fp16.cpp
index ef2e0bb8b..8707ef38f 100644
--- a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_backward_fp16.cpp
+++ b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_backward_fp16.cpp
@@ -55,26 +55,22 @@ extern template void run_grouped_backward_causalmask_attnbias_dispatched<ck::hal
 
 void grouped_backward_fp16(GroupedBackwardParams& param, hipStream_t stream) {
   BOOL_SWITCH_2(
-      param.has_attn_bias,
-      HAS_ATTN_BIAS,
-      param.bias_has_grad,
-      HAS_BIAS_GRAD,
-      [&] {
-        if constexpr (HAS_ATTN_BIAS || !HAS_BIAS_GRAD) {
+      param.has_attn_bias, kHasBias, param.bias_has_grad, kHasBiasGrad, [&] {
+        if constexpr (kHasBias || !kHasBiasGrad) {
           FMHA_BWD_HEADDIM_SWITCH(param.K, param.Kv, MaxK, [&] {
             if (param.custom_mask_type == 0)
               run_grouped_backward_causalmask_attnbias_dispatched<
                   ck::half_t,
                   false,
-                  HAS_ATTN_BIAS,
-                  HAS_BIAS_GRAD,
+                  kHasBias,
+                  kHasBiasGrad,
                   MaxK>(param, stream);
             else if (param.custom_mask_type == 1 || param.custom_mask_type == 2)
               run_grouped_backward_causalmask_attnbias_dispatched<
                   ck::half_t,
                   true,
-                  HAS_ATTN_BIAS,
-                  HAS_BIAS_GRAD,
+                  kHasBias,
+                  kHasBiasGrad,
                   MaxK>(param, stream);
             else
               throw std::runtime_error("Invalid custom_mask_type value");
diff --git a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_forward.h b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_forward.h
index 2e4458d7f..70beb6ff2 100644
--- a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_forward.h
+++ b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_forward.h
@@ -28,25 +28,25 @@
 #include "ck_tiled_fmha_fwd_tile_partitioner.hpp"
 
 template <
-    typename scalar_t,
-    bool has_causal_mask,
-    bool has_attn_bias,
+    typename ScalarType,
+    bool kHasCausalMask,
+    bool kHasBias,
     ck::index_t MaxK>
 struct grouped_forward_causalmask_attnbias_dispatched {
   template <typename FmhaTraits, typename FmhaMask>
   using FmhaPipelineProblemTemp =
       ck::tile_program::block::BlockFmhaPipelineProblem<
-          typename FmhaFwdTypeConfig<scalar_t>::QDataType,
-          typename FmhaFwdTypeConfig<scalar_t>::KDataType,
-          typename FmhaFwdTypeConfig<scalar_t>::VDataType,
-          typename FmhaFwdTypeConfig<scalar_t>::SaccDataType,
-          typename FmhaFwdTypeConfig<scalar_t>::SMPLComputeDataType,
-          typename FmhaFwdTypeConfig<scalar_t>::BiasDataType,
-          typename FmhaFwdTypeConfig<scalar_t>::RandValOutputDataType,
-          typename FmhaFwdTypeConfig<scalar_t>::LSEDataType,
-          typename FmhaFwdTypeConfig<scalar_t>::PDataType,
-          typename FmhaFwdTypeConfig<scalar_t>::OaccDataType,
-          typename FmhaFwdTypeConfig<scalar_t>::ODataType,
+          typename FmhaFwdTypeConfig<ScalarType>::QDataType,
+          typename FmhaFwdTypeConfig<ScalarType>::KDataType,
+          typename FmhaFwdTypeConfig<ScalarType>::VDataType,
+          typename FmhaFwdTypeConfig<ScalarType>::SaccDataType,
+          typename FmhaFwdTypeConfig<ScalarType>::SMPLComputeDataType,
+          typename FmhaFwdTypeConfig<ScalarType>::BiasDataType,
+          typename FmhaFwdTypeConfig<ScalarType>::RandValOutputDataType,
+          typename FmhaFwdTypeConfig<ScalarType>::LSEDataType,
+          typename FmhaFwdTypeConfig<ScalarType>::PDataType,
+          typename FmhaFwdTypeConfig<ScalarType>::OaccDataType,
+          typename FmhaFwdTypeConfig<ScalarType>::ODataType,
           FmhaFwdShape<MaxK>,
           true, // kIsGroupMode
           FmhaMask,
@@ -56,7 +56,7 @@ struct grouped_forward_causalmask_attnbias_dispatched {
     const bool has_local_attention = (param.window_size > 0) ? true : false;
 
     BOOL_SWITCH(has_local_attention, USE_LOCAL_ATTENTION, [&] {
-      constexpr bool has_masking = has_causal_mask || USE_LOCAL_ATTENTION;
+      constexpr bool has_masking = kHasCausalMask || USE_LOCAL_ATTENTION;
       const bool has_dropout = (param.dropout_prob > 0.0f);
 
       using FmhaMask =
@@ -87,7 +87,7 @@ struct grouped_forward_causalmask_attnbias_dispatched {
                 kPadSeqLenK,
                 kPadHeadDimQ,
                 kPadHeadDimV,
-                has_attn_bias,
+                kHasBias,
                 false, // kHasBiasGrad place-holder
                 true, // kStoreLSE
                 kHasDropout,
@@ -101,8 +101,8 @@ struct grouped_forward_causalmask_attnbias_dispatched {
                     FmhaPipelineProblem>;
 
             using FmhaFwdEpilogue_ = FmhaFwdEpilogue<FmhaFwdEpilogueProblem<
-                typename FmhaFwdTypeConfig<scalar_t>::OaccDataType,
-                typename FmhaFwdTypeConfig<scalar_t>::ODataType,
+                typename FmhaFwdTypeConfig<ScalarType>::OaccDataType,
+                typename FmhaFwdTypeConfig<ScalarType>::ODataType,
                 kPadSeqLenQ,
                 kPadHeadDimV>>;
 
@@ -178,16 +178,16 @@ struct grouped_forward_causalmask_attnbias_dispatched {
 };
 
 template <
-    typename scalar_t,
-    bool has_causal_mask,
-    bool has_attn_bias,
+    typename ScalarType,
+    bool kHasCausalMask,
+    bool kHasBias,
     ck::index_t MaxK>
 void run_grouped_forward_causalmask_attnbias_dispatched(
     GroupedForwardParams& param,
     hipStream_t stream) {
   grouped_forward_causalmask_attnbias_dispatched<
-      scalar_t,
-      has_causal_mask,
-      has_attn_bias,
+      ScalarType,
+      kHasCausalMask,
+      kHasBias,
       MaxK>::Run(param, stream);
 };
diff --git a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_forward_bp16.cpp b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_forward_bp16.cpp
index 9789cee29..d49d7ccf6 100644
--- a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_forward_bp16.cpp
+++ b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_forward_bp16.cpp
@@ -51,19 +51,19 @@ extern template void run_grouped_forward_causalmask_attnbias_dispatched<ck::bhal
 // clang-format on
 
 void grouped_forward_bp16(GroupedForwardParams& param, hipStream_t stream) {
-  BOOL_SWITCH(param.has_attn_bias, HAS_ATTN_BIAS, [&] {
+  BOOL_SWITCH(param.has_attn_bias, kHasBias, [&] {
     FMHA_FWD_HEADDIM_SWITCH(param.K, param.Kv, MaxK, [&] {
       if (param.custom_mask_type == 0)
         run_grouped_forward_causalmask_attnbias_dispatched<
             ck::bhalf_t,
             false,
-            HAS_ATTN_BIAS,
+            kHasBias,
             MaxK>(param, stream);
       else if (param.custom_mask_type == 1 || param.custom_mask_type == 2)
         run_grouped_forward_causalmask_attnbias_dispatched<
             ck::bhalf_t,
             true,
-            HAS_ATTN_BIAS,
+            kHasBias,
             MaxK>(param, stream);
       else
         throw std::runtime_error("Invalid custom_mask_type value");
diff --git a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_forward_fp16.cpp b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_forward_fp16.cpp
index d49eaa5cc..f0ca8a102 100644
--- a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_forward_fp16.cpp
+++ b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_forward_fp16.cpp
@@ -51,19 +51,19 @@ extern template void run_grouped_forward_causalmask_attnbias_dispatched<ck::half
 // clang-format on
 
 void grouped_forward_fp16(GroupedForwardParams& param, hipStream_t stream) {
-  BOOL_SWITCH(param.has_attn_bias, HAS_ATTN_BIAS, [&] {
+  BOOL_SWITCH(param.has_attn_bias, kHasBias, [&] {
     FMHA_FWD_HEADDIM_SWITCH(param.K, param.Kv, MaxK, [&] {
       if (param.custom_mask_type == 0)
         run_grouped_forward_causalmask_attnbias_dispatched<
             ck::half_t,
             false,
-            HAS_ATTN_BIAS,
+            kHasBias,
             MaxK>(param, stream);
       else if (param.custom_mask_type == 1 || param.custom_mask_type == 2)
         run_grouped_forward_causalmask_attnbias_dispatched<
             ck::half_t,
             true,
-            HAS_ATTN_BIAS,
+            kHasBias,
             MaxK>(param, stream);
       else
         throw std::runtime_error("Invalid custom_mask_type value");
diff --git a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_infer.h b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_infer.h
index 5c44c772c..53e70420c 100644
--- a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_infer.h
+++ b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_infer.h
@@ -29,25 +29,25 @@
 #include "ck_tiled_fmha_fwd_tile_partitioner.hpp"
 
 template <
-    typename scalar_t,
-    bool has_causal_mask,
-    bool has_attn_bias,
+    typename ScalarType,
+    bool kHasCausalMask,
+    bool kHasBias,
     ck::index_t MaxK>
 struct grouped_infer_causalmask_attnbias_dispatched {
   template <typename FmhaTraits, typename FmhaMask>
   using FmhaPipelineProblemTemp =
       ck::tile_program::block::BlockFmhaPipelineProblem<
-          typename FmhaFwdTypeConfig<scalar_t>::QDataType,
-          typename FmhaFwdTypeConfig<scalar_t>::KDataType,
-          typename FmhaFwdTypeConfig<scalar_t>::VDataType,
-          typename FmhaFwdTypeConfig<scalar_t>::SaccDataType,
-          typename FmhaFwdTypeConfig<scalar_t>::SMPLComputeDataType,
-          typename FmhaFwdTypeConfig<scalar_t>::BiasDataType,
-          typename FmhaFwdTypeConfig<scalar_t>::RandValOutputDataType,
-          typename FmhaFwdTypeConfig<scalar_t>::LSEDataType,
-          typename FmhaFwdTypeConfig<scalar_t>::PDataType,
-          typename FmhaFwdTypeConfig<scalar_t>::OaccDataType,
-          typename FmhaFwdTypeConfig<scalar_t>::ODataType,
+          typename FmhaFwdTypeConfig<ScalarType>::QDataType,
+          typename FmhaFwdTypeConfig<ScalarType>::KDataType,
+          typename FmhaFwdTypeConfig<ScalarType>::VDataType,
+          typename FmhaFwdTypeConfig<ScalarType>::SaccDataType,
+          typename FmhaFwdTypeConfig<ScalarType>::SMPLComputeDataType,
+          typename FmhaFwdTypeConfig<ScalarType>::BiasDataType,
+          typename FmhaFwdTypeConfig<ScalarType>::RandValOutputDataType,
+          typename FmhaFwdTypeConfig<ScalarType>::LSEDataType,
+          typename FmhaFwdTypeConfig<ScalarType>::PDataType,
+          typename FmhaFwdTypeConfig<ScalarType>::OaccDataType,
+          typename FmhaFwdTypeConfig<ScalarType>::ODataType,
           FmhaFwdShape<MaxK>,
           true, // kIsGroupMode
           FmhaMask,
@@ -57,7 +57,7 @@ struct grouped_infer_causalmask_attnbias_dispatched {
     const bool has_local_attention = (param.window_size > 0) ? true : false;
 
     BOOL_SWITCH(has_local_attention, USE_LOCAL_ATTENTION, [&] {
-      constexpr bool has_masking = has_causal_mask || USE_LOCAL_ATTENTION;
+      constexpr bool has_masking = kHasCausalMask || USE_LOCAL_ATTENTION;
       const bool has_dropout = (param.dropout_prob > 0.0f);
 
       using FmhaMask =
@@ -87,7 +87,7 @@ struct grouped_infer_causalmask_attnbias_dispatched {
                 kPadSeqLenK,
                 kPadHeadDimQ,
                 kPadHeadDimV,
-                has_attn_bias,
+                kHasBias,
                 false, // kHasBiasGrad place-holder
                 false, // kStoreLSE
                 kHasDropout,
@@ -101,8 +101,8 @@ struct grouped_infer_causalmask_attnbias_dispatched {
                     FmhaPipelineProblem>;
 
             using FmhaEpilogue = FmhaFwdEpilogue<FmhaFwdEpilogueProblem<
-                typename FmhaFwdTypeConfig<scalar_t>::OaccDataType,
-                typename FmhaFwdTypeConfig<scalar_t>::ODataType,
+                typename FmhaFwdTypeConfig<ScalarType>::OaccDataType,
+                typename FmhaFwdTypeConfig<ScalarType>::ODataType,
                 kPadSeqLenQ,
                 kPadHeadDimV>>;
 
@@ -176,16 +176,16 @@ struct grouped_infer_causalmask_attnbias_dispatched {
 };
 
 template <
-    typename scalar_t,
-    bool has_causal_mask,
-    bool has_attn_bias,
+    typename ScalarType,
+    bool kHasCausalMask,
+    bool kHasBias,
     ck::index_t MaxK>
 void run_grouped_infer_causalmask_attnbias_dispatched(
     GroupedForwardParams& param,
     hipStream_t stream) {
   grouped_infer_causalmask_attnbias_dispatched<
-      scalar_t,
-      has_causal_mask,
-      has_attn_bias,
+      ScalarType,
+      kHasCausalMask,
+      kHasBias,
       MaxK>::Run(param, stream);
 };
diff --git a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_infer_bp16.cpp b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_infer_bp16.cpp
index 7ee53261d..ccb7e0e6f 100644
--- a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_infer_bp16.cpp
+++ b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_infer_bp16.cpp
@@ -50,19 +50,19 @@ extern template void run_grouped_infer_causalmask_attnbias_dispatched<ck::bhalf_
 // clang-format on
 
 void grouped_infer_bp16(GroupedForwardParams& param, hipStream_t stream) {
-  BOOL_SWITCH(param.has_attn_bias, HAS_ATTN_BIAS, [&] {
+  BOOL_SWITCH(param.has_attn_bias, kHasBias, [&] {
     FMHA_FWD_HEADDIM_SWITCH(param.K, param.Kv, MaxK, [&] {
       if (param.custom_mask_type == 0)
         run_grouped_infer_causalmask_attnbias_dispatched<
             ck::bhalf_t,
             false,
-            HAS_ATTN_BIAS,
+            kHasBias,
             MaxK>(param, stream);
       else if (param.custom_mask_type == 1 || param.custom_mask_type == 2)
         run_grouped_infer_causalmask_attnbias_dispatched<
             ck::bhalf_t,
             true,
-            HAS_ATTN_BIAS,
+            kHasBias,
             MaxK>(param, stream);
       else
         throw std::runtime_error("Invalid custom_mask_type value");
diff --git a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_infer_fp16.cpp b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_infer_fp16.cpp
index 2d03119db..881810868 100644
--- a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_infer_fp16.cpp
+++ b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_infer_fp16.cpp
@@ -50,19 +50,19 @@ extern template void run_grouped_infer_causalmask_attnbias_dispatched<ck::half_t
 // clang-format on
 
 void grouped_infer_fp16(GroupedForwardParams& param, hipStream_t stream) {
-  BOOL_SWITCH(param.has_attn_bias, HAS_ATTN_BIAS, [&] {
+  BOOL_SWITCH(param.has_attn_bias, kHasBias, [&] {
     FMHA_FWD_HEADDIM_SWITCH(param.K, param.Kv, MaxK, [&] {
       if (param.custom_mask_type == 0)
         run_grouped_infer_causalmask_attnbias_dispatched<
             ck::half_t,
             false,
-            HAS_ATTN_BIAS,
+            kHasBias,
             MaxK>(param, stream);
       else if (param.custom_mask_type == 1 || param.custom_mask_type == 2)
         run_grouped_infer_causalmask_attnbias_dispatched<
             ck::half_t,
             true,
-            HAS_ATTN_BIAS,
+            kHasBias,
             MaxK>(param, stream);
       else
         throw std::runtime_error("Invalid custom_mask_type value");

From 411ccd63bfb32a0f0437a2f123078e7cd48dcae3 Mon Sep 17 00:00:00 2001
From: Qianfeng Zhang <Qianfeng.Zhang@amd.com>
Date: Mon, 15 Apr 2024 00:11:38 +0000
Subject: [PATCH 523/641] Simplify the names of the dispatch class and
 interfaces

---
 .../hip_fmha/ck_tiled_fmha_batched_backward.h |  6 +--
 .../ck_tiled_fmha_batched_backward_bp16.cpp   | 40 +++++++++----------
 .../ck_tiled_fmha_batched_backward_fp16.cpp   | 40 +++++++++----------
 .../hip_fmha/ck_tiled_fmha_batched_forward.h  |  6 +--
 .../ck_tiled_fmha_batched_forward_bp16.cpp    | 36 ++++++++---------
 .../ck_tiled_fmha_batched_forward_fp16.cpp    | 36 ++++++++---------
 .../hip_fmha/ck_tiled_fmha_batched_infer.h    |  6 +--
 .../ck_tiled_fmha_batched_infer_bp16.cpp      | 36 ++++++++---------
 .../ck_tiled_fmha_batched_infer_fp16.cpp      | 36 ++++++++---------
 .../hip_fmha/ck_tiled_fmha_grouped_backward.h |  6 +--
 .../ck_tiled_fmha_grouped_backward_bp16.cpp   | 40 +++++++++----------
 .../ck_tiled_fmha_grouped_backward_fp16.cpp   | 40 +++++++++----------
 .../hip_fmha/ck_tiled_fmha_grouped_forward.h  |  6 +--
 .../ck_tiled_fmha_grouped_forward_bp16.cpp    | 36 ++++++++---------
 .../ck_tiled_fmha_grouped_forward_fp16.cpp    | 36 ++++++++---------
 .../hip_fmha/ck_tiled_fmha_grouped_infer.h    |  6 +--
 .../ck_tiled_fmha_grouped_infer_bp16.cpp      | 36 ++++++++---------
 .../ck_tiled_fmha_grouped_infer_fp16.cpp      | 36 ++++++++---------
 ...ask_has_attnbias_has_biasgrad_maxk_128.cpp |  2 +-
 ...mask_has_attnbias_has_biasgrad_maxk_32.cpp |  2 +-
 ...mask_has_attnbias_has_biasgrad_maxk_64.cpp |  2 +-
 ...mask_has_attnbias_no_biasgrad_maxk_128.cpp |  2 +-
 ...lmask_has_attnbias_no_biasgrad_maxk_32.cpp |  2 +-
 ...lmask_has_attnbias_no_biasgrad_maxk_64.cpp |  2 +-
 ...16_has_causalmask_no_attnbias_maxk_128.cpp |  2 +-
 ...p16_has_causalmask_no_attnbias_maxk_32.cpp |  2 +-
 ...p16_has_causalmask_no_attnbias_maxk_64.cpp |  2 +-
 ...ask_has_attnbias_has_biasgrad_maxk_128.cpp |  2 +-
 ...mask_has_attnbias_has_biasgrad_maxk_32.cpp |  2 +-
 ...mask_has_attnbias_has_biasgrad_maxk_64.cpp |  2 +-
 ...mask_has_attnbias_no_biasgrad_maxk_128.cpp |  2 +-
 ...lmask_has_attnbias_no_biasgrad_maxk_32.cpp |  2 +-
 ...lmask_has_attnbias_no_biasgrad_maxk_64.cpp |  2 +-
 ...p16_no_causalmask_no_attnbias_maxk_128.cpp |  2 +-
 ...bp16_no_causalmask_no_attnbias_maxk_32.cpp |  2 +-
 ...bp16_no_causalmask_no_attnbias_maxk_64.cpp |  2 +-
 ...ask_has_attnbias_has_biasgrad_maxk_128.cpp |  2 +-
 ...mask_has_attnbias_has_biasgrad_maxk_32.cpp |  2 +-
 ...mask_has_attnbias_has_biasgrad_maxk_64.cpp |  2 +-
 ...mask_has_attnbias_no_biasgrad_maxk_128.cpp |  2 +-
 ...lmask_has_attnbias_no_biasgrad_maxk_32.cpp |  2 +-
 ...lmask_has_attnbias_no_biasgrad_maxk_64.cpp |  2 +-
 ...16_has_causalmask_no_attnbias_maxk_128.cpp |  2 +-
 ...p16_has_causalmask_no_attnbias_maxk_32.cpp |  2 +-
 ...p16_has_causalmask_no_attnbias_maxk_64.cpp |  2 +-
 ...ask_has_attnbias_has_biasgrad_maxk_128.cpp |  2 +-
 ...mask_has_attnbias_has_biasgrad_maxk_32.cpp |  2 +-
 ...mask_has_attnbias_has_biasgrad_maxk_64.cpp |  2 +-
 ...mask_has_attnbias_no_biasgrad_maxk_128.cpp |  2 +-
 ...lmask_has_attnbias_no_biasgrad_maxk_32.cpp |  2 +-
 ...lmask_has_attnbias_no_biasgrad_maxk_64.cpp |  2 +-
 ...p16_no_causalmask_no_attnbias_maxk_128.cpp |  2 +-
 ...fp16_no_causalmask_no_attnbias_maxk_32.cpp |  2 +-
 ...fp16_no_causalmask_no_attnbias_maxk_64.cpp |  2 +-
 ...6_has_causalmask_has_attnbias_maxk_128.cpp |  2 +-
 ...6_has_causalmask_has_attnbias_maxk_256.cpp |  2 +-
 ...16_has_causalmask_has_attnbias_maxk_32.cpp |  2 +-
 ...16_has_causalmask_has_attnbias_maxk_64.cpp |  2 +-
 ...16_has_causalmask_no_attnbias_maxk_128.cpp |  2 +-
 ...16_has_causalmask_no_attnbias_maxk_256.cpp |  2 +-
 ...p16_has_causalmask_no_attnbias_maxk_32.cpp |  2 +-
 ...p16_has_causalmask_no_attnbias_maxk_64.cpp |  2 +-
 ...16_no_causalmask_has_attnbias_maxk_128.cpp |  2 +-
 ...16_no_causalmask_has_attnbias_maxk_256.cpp |  2 +-
 ...p16_no_causalmask_has_attnbias_maxk_32.cpp |  2 +-
 ...p16_no_causalmask_has_attnbias_maxk_64.cpp |  2 +-
 ...p16_no_causalmask_no_attnbias_maxk_128.cpp |  2 +-
 ...p16_no_causalmask_no_attnbias_maxk_256.cpp |  2 +-
 ...bp16_no_causalmask_no_attnbias_maxk_32.cpp |  2 +-
 ...bp16_no_causalmask_no_attnbias_maxk_64.cpp |  2 +-
 ...6_has_causalmask_has_attnbias_maxk_128.cpp |  2 +-
 ...6_has_causalmask_has_attnbias_maxk_256.cpp |  2 +-
 ...16_has_causalmask_has_attnbias_maxk_32.cpp |  2 +-
 ...16_has_causalmask_has_attnbias_maxk_64.cpp |  2 +-
 ...16_has_causalmask_no_attnbias_maxk_128.cpp |  2 +-
 ...16_has_causalmask_no_attnbias_maxk_256.cpp |  2 +-
 ...p16_has_causalmask_no_attnbias_maxk_32.cpp |  2 +-
 ...p16_has_causalmask_no_attnbias_maxk_64.cpp |  2 +-
 ...16_no_causalmask_has_attnbias_maxk_128.cpp |  2 +-
 ...16_no_causalmask_has_attnbias_maxk_256.cpp |  2 +-
 ...p16_no_causalmask_has_attnbias_maxk_32.cpp |  2 +-
 ...p16_no_causalmask_has_attnbias_maxk_64.cpp |  2 +-
 ...p16_no_causalmask_no_attnbias_maxk_128.cpp |  2 +-
 ...p16_no_causalmask_no_attnbias_maxk_256.cpp |  2 +-
 ...fp16_no_causalmask_no_attnbias_maxk_32.cpp |  2 +-
 ...fp16_no_causalmask_no_attnbias_maxk_64.cpp |  2 +-
 ...6_has_causalmask_has_attnbias_maxk_128.cpp |  2 +-
 ...6_has_causalmask_has_attnbias_maxk_256.cpp |  2 +-
 ...16_has_causalmask_has_attnbias_maxk_32.cpp |  2 +-
 ...16_has_causalmask_has_attnbias_maxk_64.cpp |  2 +-
 ...16_has_causalmask_no_attnbias_maxk_128.cpp |  2 +-
 ...16_has_causalmask_no_attnbias_maxk_256.cpp |  2 +-
 ...p16_has_causalmask_no_attnbias_maxk_32.cpp |  2 +-
 ...p16_has_causalmask_no_attnbias_maxk_64.cpp |  2 +-
 ...16_no_causalmask_has_attnbias_maxk_128.cpp |  2 +-
 ...16_no_causalmask_has_attnbias_maxk_256.cpp |  2 +-
 ...p16_no_causalmask_has_attnbias_maxk_32.cpp |  2 +-
 ...p16_no_causalmask_has_attnbias_maxk_64.cpp |  2 +-
 ...p16_no_causalmask_no_attnbias_maxk_128.cpp |  2 +-
 ...p16_no_causalmask_no_attnbias_maxk_256.cpp |  2 +-
 ...bp16_no_causalmask_no_attnbias_maxk_32.cpp |  2 +-
 ...bp16_no_causalmask_no_attnbias_maxk_64.cpp |  2 +-
 ...6_has_causalmask_has_attnbias_maxk_128.cpp |  2 +-
 ...6_has_causalmask_has_attnbias_maxk_256.cpp |  2 +-
 ...16_has_causalmask_has_attnbias_maxk_32.cpp |  2 +-
 ...16_has_causalmask_has_attnbias_maxk_64.cpp |  2 +-
 ...16_has_causalmask_no_attnbias_maxk_128.cpp |  2 +-
 ...16_has_causalmask_no_attnbias_maxk_256.cpp |  2 +-
 ...p16_has_causalmask_no_attnbias_maxk_32.cpp |  2 +-
 ...p16_has_causalmask_no_attnbias_maxk_64.cpp |  2 +-
 ...16_no_causalmask_has_attnbias_maxk_128.cpp |  2 +-
 ...16_no_causalmask_has_attnbias_maxk_256.cpp |  2 +-
 ...p16_no_causalmask_has_attnbias_maxk_32.cpp |  2 +-
 ...p16_no_causalmask_has_attnbias_maxk_64.cpp |  2 +-
 ...p16_no_causalmask_no_attnbias_maxk_128.cpp |  2 +-
 ...p16_no_causalmask_no_attnbias_maxk_256.cpp |  2 +-
 ...fp16_no_causalmask_no_attnbias_maxk_32.cpp |  2 +-
 ...fp16_no_causalmask_no_attnbias_maxk_64.cpp |  2 +-
 ...ask_has_attnbias_has_biasgrad_maxk_128.cpp |  2 +-
 ...mask_has_attnbias_has_biasgrad_maxk_32.cpp |  2 +-
 ...mask_has_attnbias_has_biasgrad_maxk_64.cpp |  2 +-
 ...mask_has_attnbias_no_biasgrad_maxk_128.cpp |  2 +-
 ...lmask_has_attnbias_no_biasgrad_maxk_32.cpp |  2 +-
 ...lmask_has_attnbias_no_biasgrad_maxk_64.cpp |  2 +-
 ...16_has_causalmask_no_attnbias_maxk_128.cpp |  2 +-
 ...p16_has_causalmask_no_attnbias_maxk_32.cpp |  2 +-
 ...p16_has_causalmask_no_attnbias_maxk_64.cpp |  2 +-
 ...ask_has_attnbias_has_biasgrad_maxk_128.cpp |  2 +-
 ...mask_has_attnbias_has_biasgrad_maxk_32.cpp |  2 +-
 ...mask_has_attnbias_has_biasgrad_maxk_64.cpp |  2 +-
 ...mask_has_attnbias_no_biasgrad_maxk_128.cpp |  2 +-
 ...lmask_has_attnbias_no_biasgrad_maxk_32.cpp |  2 +-
 ...lmask_has_attnbias_no_biasgrad_maxk_64.cpp |  2 +-
 ...p16_no_causalmask_no_attnbias_maxk_128.cpp |  2 +-
 ...bp16_no_causalmask_no_attnbias_maxk_32.cpp |  2 +-
 ...bp16_no_causalmask_no_attnbias_maxk_64.cpp |  2 +-
 ...ask_has_attnbias_has_biasgrad_maxk_128.cpp |  2 +-
 ...mask_has_attnbias_has_biasgrad_maxk_32.cpp |  2 +-
 ...mask_has_attnbias_has_biasgrad_maxk_64.cpp |  2 +-
 ...mask_has_attnbias_no_biasgrad_maxk_128.cpp |  2 +-
 ...lmask_has_attnbias_no_biasgrad_maxk_32.cpp |  2 +-
 ...lmask_has_attnbias_no_biasgrad_maxk_64.cpp |  2 +-
 ...16_has_causalmask_no_attnbias_maxk_128.cpp |  2 +-
 ...p16_has_causalmask_no_attnbias_maxk_32.cpp |  2 +-
 ...p16_has_causalmask_no_attnbias_maxk_64.cpp |  2 +-
 ...ask_has_attnbias_has_biasgrad_maxk_128.cpp |  2 +-
 ...mask_has_attnbias_has_biasgrad_maxk_32.cpp |  2 +-
 ...mask_has_attnbias_has_biasgrad_maxk_64.cpp |  2 +-
 ...mask_has_attnbias_no_biasgrad_maxk_128.cpp |  2 +-
 ...lmask_has_attnbias_no_biasgrad_maxk_32.cpp |  2 +-
 ...lmask_has_attnbias_no_biasgrad_maxk_64.cpp |  2 +-
 ...p16_no_causalmask_no_attnbias_maxk_128.cpp |  2 +-
 ...fp16_no_causalmask_no_attnbias_maxk_32.cpp |  2 +-
 ...fp16_no_causalmask_no_attnbias_maxk_64.cpp |  2 +-
 ...6_has_causalmask_has_attnbias_maxk_128.cpp |  2 +-
 ...6_has_causalmask_has_attnbias_maxk_256.cpp |  2 +-
 ...16_has_causalmask_has_attnbias_maxk_32.cpp |  2 +-
 ...16_has_causalmask_has_attnbias_maxk_64.cpp |  2 +-
 ...16_has_causalmask_no_attnbias_maxk_128.cpp |  2 +-
 ...16_has_causalmask_no_attnbias_maxk_256.cpp |  2 +-
 ...p16_has_causalmask_no_attnbias_maxk_32.cpp |  2 +-
 ...p16_has_causalmask_no_attnbias_maxk_64.cpp |  2 +-
 ...16_no_causalmask_has_attnbias_maxk_128.cpp |  2 +-
 ...16_no_causalmask_has_attnbias_maxk_256.cpp |  2 +-
 ...p16_no_causalmask_has_attnbias_maxk_32.cpp |  2 +-
 ...p16_no_causalmask_has_attnbias_maxk_64.cpp |  2 +-
 ...p16_no_causalmask_no_attnbias_maxk_128.cpp |  2 +-
 ...p16_no_causalmask_no_attnbias_maxk_256.cpp |  2 +-
 ...bp16_no_causalmask_no_attnbias_maxk_32.cpp |  2 +-
 ...bp16_no_causalmask_no_attnbias_maxk_64.cpp |  2 +-
 ...6_has_causalmask_has_attnbias_maxk_128.cpp |  2 +-
 ...6_has_causalmask_has_attnbias_maxk_256.cpp |  2 +-
 ...16_has_causalmask_has_attnbias_maxk_32.cpp |  2 +-
 ...16_has_causalmask_has_attnbias_maxk_64.cpp |  2 +-
 ...16_has_causalmask_no_attnbias_maxk_128.cpp |  2 +-
 ...16_has_causalmask_no_attnbias_maxk_256.cpp |  2 +-
 ...p16_has_causalmask_no_attnbias_maxk_32.cpp |  2 +-
 ...p16_has_causalmask_no_attnbias_maxk_64.cpp |  2 +-
 ...16_no_causalmask_has_attnbias_maxk_128.cpp |  2 +-
 ...16_no_causalmask_has_attnbias_maxk_256.cpp |  2 +-
 ...p16_no_causalmask_has_attnbias_maxk_32.cpp |  2 +-
 ...p16_no_causalmask_has_attnbias_maxk_64.cpp |  2 +-
 ...p16_no_causalmask_no_attnbias_maxk_128.cpp |  2 +-
 ...p16_no_causalmask_no_attnbias_maxk_256.cpp |  2 +-
 ...fp16_no_causalmask_no_attnbias_maxk_32.cpp |  2 +-
 ...fp16_no_causalmask_no_attnbias_maxk_64.cpp |  2 +-
 ...6_has_causalmask_has_attnbias_maxk_128.cpp |  2 +-
 ...6_has_causalmask_has_attnbias_maxk_256.cpp |  2 +-
 ...16_has_causalmask_has_attnbias_maxk_32.cpp |  2 +-
 ...16_has_causalmask_has_attnbias_maxk_64.cpp |  2 +-
 ...16_has_causalmask_no_attnbias_maxk_128.cpp |  2 +-
 ...16_has_causalmask_no_attnbias_maxk_256.cpp |  2 +-
 ...p16_has_causalmask_no_attnbias_maxk_32.cpp |  2 +-
 ...p16_has_causalmask_no_attnbias_maxk_64.cpp |  2 +-
 ...16_no_causalmask_has_attnbias_maxk_128.cpp |  2 +-
 ...16_no_causalmask_has_attnbias_maxk_256.cpp |  2 +-
 ...p16_no_causalmask_has_attnbias_maxk_32.cpp |  2 +-
 ...p16_no_causalmask_has_attnbias_maxk_64.cpp |  2 +-
 ...p16_no_causalmask_no_attnbias_maxk_128.cpp |  2 +-
 ...p16_no_causalmask_no_attnbias_maxk_256.cpp |  2 +-
 ...bp16_no_causalmask_no_attnbias_maxk_32.cpp |  2 +-
 ...bp16_no_causalmask_no_attnbias_maxk_64.cpp |  2 +-
 ...6_has_causalmask_has_attnbias_maxk_128.cpp |  2 +-
 ...6_has_causalmask_has_attnbias_maxk_256.cpp |  2 +-
 ...16_has_causalmask_has_attnbias_maxk_32.cpp |  2 +-
 ...16_has_causalmask_has_attnbias_maxk_64.cpp |  2 +-
 ...16_has_causalmask_no_attnbias_maxk_128.cpp |  2 +-
 ...16_has_causalmask_no_attnbias_maxk_256.cpp |  2 +-
 ...p16_has_causalmask_no_attnbias_maxk_32.cpp |  2 +-
 ...p16_has_causalmask_no_attnbias_maxk_64.cpp |  2 +-
 ...16_no_causalmask_has_attnbias_maxk_128.cpp |  2 +-
 ...16_no_causalmask_has_attnbias_maxk_256.cpp |  2 +-
 ...p16_no_causalmask_has_attnbias_maxk_32.cpp |  2 +-
 ...p16_no_causalmask_has_attnbias_maxk_64.cpp |  2 +-
 ...p16_no_causalmask_no_attnbias_maxk_128.cpp |  2 +-
 ...p16_no_causalmask_no_attnbias_maxk_256.cpp |  2 +-
 ...fp16_no_causalmask_no_attnbias_maxk_32.cpp |  2 +-
 ...fp16_no_causalmask_no_attnbias_maxk_64.cpp |  2 +-
 218 files changed, 442 insertions(+), 442 deletions(-)

diff --git a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_backward.h b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_backward.h
index 9af5bf1c3..0316907ae 100644
--- a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_backward.h
+++ b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_backward.h
@@ -34,7 +34,7 @@ template <
     bool kHasBias,
     bool kHasBiasGrad,
     ck::index_t MaxK>
-struct batched_backward_causalmask_attnbias_dispatched {
+struct batched_backward_causalmask_bias_dispatch {
   using FmhaBwdEpilogue_ = FmhaBwdEpilogue<FmhaBwdEpilogueProblem<
       typename FmhaBwdTypeConfig<ScalarType>::AccDataType,
       typename FmhaBwdTypeConfig<ScalarType>::KGradDataType,
@@ -284,10 +284,10 @@ template <
     bool kHasBias,
     bool kHasBiasGrad,
     ck::index_t MaxK>
-void run_batched_backward_causalmask_attnbias_dispatched(
+void run_batched_backward_causalmask_bias_dispatch(
     BatchedBackwardParams& param,
     hipStream_t stream) {
-  batched_backward_causalmask_attnbias_dispatched<
+  batched_backward_causalmask_bias_dispatch<
       ScalarType,
       kHasCausalMask,
       kHasBias,
diff --git a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_backward_bp16.cpp b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_backward_bp16.cpp
index 8d0445ddf..db2b56742 100644
--- a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_backward_bp16.cpp
+++ b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_backward_bp16.cpp
@@ -13,43 +13,43 @@
 #include "ck_tiled_headdim_switch.h"
 
 // clang-format off
-extern template void run_batched_backward_causalmask_attnbias_dispatched<ck::bhalf_t, false, true, true, 32>(
+extern template void run_batched_backward_causalmask_bias_dispatch<ck::bhalf_t, false, true, true, 32>(
     BatchedBackwardParams& param, hipStream_t stream);
-extern template void run_batched_backward_causalmask_attnbias_dispatched<ck::bhalf_t, false, true, false, 32>(
+extern template void run_batched_backward_causalmask_bias_dispatch<ck::bhalf_t, false, true, false, 32>(
     BatchedBackwardParams& param, hipStream_t stream);
-extern template void run_batched_backward_causalmask_attnbias_dispatched<ck::bhalf_t, false, false, false, 32>(
+extern template void run_batched_backward_causalmask_bias_dispatch<ck::bhalf_t, false, false, false, 32>(
     BatchedBackwardParams& param, hipStream_t stream);
-extern template void run_batched_backward_causalmask_attnbias_dispatched<ck::bhalf_t, true, true, true, 32>(
+extern template void run_batched_backward_causalmask_bias_dispatch<ck::bhalf_t, true, true, true, 32>(
     BatchedBackwardParams& param, hipStream_t stream);
-extern template void run_batched_backward_causalmask_attnbias_dispatched<ck::bhalf_t, true, true, false, 32>(
+extern template void run_batched_backward_causalmask_bias_dispatch<ck::bhalf_t, true, true, false, 32>(
     BatchedBackwardParams& param, hipStream_t stream);
-extern template void run_batched_backward_causalmask_attnbias_dispatched<ck::bhalf_t, true, false, false, 32>(
+extern template void run_batched_backward_causalmask_bias_dispatch<ck::bhalf_t, true, false, false, 32>(
     BatchedBackwardParams& param, hipStream_t stream);
 
-extern template void run_batched_backward_causalmask_attnbias_dispatched<ck::bhalf_t, false, true, true, 64>(
+extern template void run_batched_backward_causalmask_bias_dispatch<ck::bhalf_t, false, true, true, 64>(
     BatchedBackwardParams& param, hipStream_t stream);
-extern template void run_batched_backward_causalmask_attnbias_dispatched<ck::bhalf_t, false, true, false, 64>(
+extern template void run_batched_backward_causalmask_bias_dispatch<ck::bhalf_t, false, true, false, 64>(
     BatchedBackwardParams& param, hipStream_t stream);
-extern template void run_batched_backward_causalmask_attnbias_dispatched<ck::bhalf_t, false, false, false, 64>(
+extern template void run_batched_backward_causalmask_bias_dispatch<ck::bhalf_t, false, false, false, 64>(
     BatchedBackwardParams& param, hipStream_t stream);
-extern template void run_batched_backward_causalmask_attnbias_dispatched<ck::bhalf_t, true, true, true, 64>(
+extern template void run_batched_backward_causalmask_bias_dispatch<ck::bhalf_t, true, true, true, 64>(
     BatchedBackwardParams& param, hipStream_t stream);
-extern template void run_batched_backward_causalmask_attnbias_dispatched<ck::bhalf_t, true, true, false, 64>(
+extern template void run_batched_backward_causalmask_bias_dispatch<ck::bhalf_t, true, true, false, 64>(
     BatchedBackwardParams& param, hipStream_t stream);
-extern template void run_batched_backward_causalmask_attnbias_dispatched<ck::bhalf_t, true, false, false, 64>(
+extern template void run_batched_backward_causalmask_bias_dispatch<ck::bhalf_t, true, false, false, 64>(
     BatchedBackwardParams& param, hipStream_t stream);
 
-extern template void run_batched_backward_causalmask_attnbias_dispatched<ck::bhalf_t, false, true, true, 128>(
+extern template void run_batched_backward_causalmask_bias_dispatch<ck::bhalf_t, false, true, true, 128>(
     BatchedBackwardParams& param, hipStream_t stream);
-extern template void run_batched_backward_causalmask_attnbias_dispatched<ck::bhalf_t, false, true, false, 128>(
+extern template void run_batched_backward_causalmask_bias_dispatch<ck::bhalf_t, false, true, false, 128>(
     BatchedBackwardParams& param, hipStream_t stream);
-extern template void run_batched_backward_causalmask_attnbias_dispatched<ck::bhalf_t, false, false, false, 128>(
+extern template void run_batched_backward_causalmask_bias_dispatch<ck::bhalf_t, false, false, false, 128>(
     BatchedBackwardParams& param, hipStream_t stream);
-extern template void run_batched_backward_causalmask_attnbias_dispatched<ck::bhalf_t, true, true, true, 128>(
+extern template void run_batched_backward_causalmask_bias_dispatch<ck::bhalf_t, true, true, true, 128>(
     BatchedBackwardParams& param, hipStream_t stream);
-extern template void run_batched_backward_causalmask_attnbias_dispatched<ck::bhalf_t, true, true, false, 128>(
+extern template void run_batched_backward_causalmask_bias_dispatch<ck::bhalf_t, true, true, false, 128>(
     BatchedBackwardParams& param, hipStream_t stream);
-extern template void run_batched_backward_causalmask_attnbias_dispatched<ck::bhalf_t, true, false, false, 128>(
+extern template void run_batched_backward_causalmask_bias_dispatch<ck::bhalf_t, true, false, false, 128>(
     BatchedBackwardParams& param, hipStream_t stream);
 // clang-format on
 
@@ -59,14 +59,14 @@ void batched_backward_bp16(BatchedBackwardParams& param, hipStream_t stream) {
         if constexpr (kHasBias || !kHasBiasGrad) {
           FMHA_BWD_HEADDIM_SWITCH(param.K, param.Kv, MaxK, [&] {
             if (param.custom_mask_type == 0)
-              run_batched_backward_causalmask_attnbias_dispatched<
+              run_batched_backward_causalmask_bias_dispatch<
                   ck::bhalf_t,
                   false,
                   kHasBias,
                   kHasBiasGrad,
                   MaxK>(param, stream);
             else if (param.custom_mask_type == 1 || param.custom_mask_type == 2)
-              run_batched_backward_causalmask_attnbias_dispatched<
+              run_batched_backward_causalmask_bias_dispatch<
                   ck::bhalf_t,
                   true,
                   kHasBias,
diff --git a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_backward_fp16.cpp b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_backward_fp16.cpp
index a0d0cca7d..462309435 100644
--- a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_backward_fp16.cpp
+++ b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_backward_fp16.cpp
@@ -13,43 +13,43 @@
 #include "ck_tiled_headdim_switch.h"
 
 // clang-format off
-extern template void run_batched_backward_causalmask_attnbias_dispatched<ck::half_t, false, true, true, 32>(
+extern template void run_batched_backward_causalmask_bias_dispatch<ck::half_t, false, true, true, 32>(
     BatchedBackwardParams& param, hipStream_t stream);
-extern template void run_batched_backward_causalmask_attnbias_dispatched<ck::half_t, false, true, false, 32>(
+extern template void run_batched_backward_causalmask_bias_dispatch<ck::half_t, false, true, false, 32>(
     BatchedBackwardParams& param, hipStream_t stream);
-extern template void run_batched_backward_causalmask_attnbias_dispatched<ck::half_t, false, false, false, 32>(
+extern template void run_batched_backward_causalmask_bias_dispatch<ck::half_t, false, false, false, 32>(
     BatchedBackwardParams& param, hipStream_t stream);
-extern template void run_batched_backward_causalmask_attnbias_dispatched<ck::half_t, true, true, true, 32>(
+extern template void run_batched_backward_causalmask_bias_dispatch<ck::half_t, true, true, true, 32>(
     BatchedBackwardParams& param, hipStream_t stream);
-extern template void run_batched_backward_causalmask_attnbias_dispatched<ck::half_t, true, true, false, 32>(
+extern template void run_batched_backward_causalmask_bias_dispatch<ck::half_t, true, true, false, 32>(
     BatchedBackwardParams& param, hipStream_t stream);
-extern template void run_batched_backward_causalmask_attnbias_dispatched<ck::half_t, true, false, false, 32>(
+extern template void run_batched_backward_causalmask_bias_dispatch<ck::half_t, true, false, false, 32>(
     BatchedBackwardParams& param, hipStream_t stream);
 
-extern template void run_batched_backward_causalmask_attnbias_dispatched<ck::half_t, false, true, true, 64>(
+extern template void run_batched_backward_causalmask_bias_dispatch<ck::half_t, false, true, true, 64>(
     BatchedBackwardParams& param, hipStream_t stream);
-extern template void run_batched_backward_causalmask_attnbias_dispatched<ck::half_t, false, true, false, 64>(
+extern template void run_batched_backward_causalmask_bias_dispatch<ck::half_t, false, true, false, 64>(
     BatchedBackwardParams& param, hipStream_t stream);
-extern template void run_batched_backward_causalmask_attnbias_dispatched<ck::half_t, false, false, false, 64>(
+extern template void run_batched_backward_causalmask_bias_dispatch<ck::half_t, false, false, false, 64>(
     BatchedBackwardParams& param, hipStream_t stream);
-extern template void run_batched_backward_causalmask_attnbias_dispatched<ck::half_t, true, true, true, 64>(
+extern template void run_batched_backward_causalmask_bias_dispatch<ck::half_t, true, true, true, 64>(
     BatchedBackwardParams& param, hipStream_t stream);
-extern template void run_batched_backward_causalmask_attnbias_dispatched<ck::half_t, true, true, false, 64>(
+extern template void run_batched_backward_causalmask_bias_dispatch<ck::half_t, true, true, false, 64>(
     BatchedBackwardParams& param, hipStream_t stream);
-extern template void run_batched_backward_causalmask_attnbias_dispatched<ck::half_t, true, false, false, 64>(
+extern template void run_batched_backward_causalmask_bias_dispatch<ck::half_t, true, false, false, 64>(
     BatchedBackwardParams& param, hipStream_t stream);
 
-extern template void run_batched_backward_causalmask_attnbias_dispatched<ck::half_t, false, true, true, 128>(
+extern template void run_batched_backward_causalmask_bias_dispatch<ck::half_t, false, true, true, 128>(
     BatchedBackwardParams& param, hipStream_t stream);
-extern template void run_batched_backward_causalmask_attnbias_dispatched<ck::half_t, false, true, false, 128>(
+extern template void run_batched_backward_causalmask_bias_dispatch<ck::half_t, false, true, false, 128>(
     BatchedBackwardParams& param, hipStream_t stream);
-extern template void run_batched_backward_causalmask_attnbias_dispatched<ck::half_t, false, false, false, 128>(
+extern template void run_batched_backward_causalmask_bias_dispatch<ck::half_t, false, false, false, 128>(
     BatchedBackwardParams& param, hipStream_t stream);
-extern template void run_batched_backward_causalmask_attnbias_dispatched<ck::half_t, true, true, true, 128>(
+extern template void run_batched_backward_causalmask_bias_dispatch<ck::half_t, true, true, true, 128>(
     BatchedBackwardParams& param, hipStream_t stream);
-extern template void run_batched_backward_causalmask_attnbias_dispatched<ck::half_t, true, true, false, 128>(
+extern template void run_batched_backward_causalmask_bias_dispatch<ck::half_t, true, true, false, 128>(
     BatchedBackwardParams& param, hipStream_t stream);
-extern template void run_batched_backward_causalmask_attnbias_dispatched<ck::half_t, true, false, false, 128>(
+extern template void run_batched_backward_causalmask_bias_dispatch<ck::half_t, true, false, false, 128>(
     BatchedBackwardParams& param, hipStream_t stream);
 // clang-format on
 
@@ -59,14 +59,14 @@ void batched_backward_fp16(BatchedBackwardParams& param, hipStream_t stream) {
         if constexpr (kHasBias || !kHasBiasGrad) {
           FMHA_BWD_HEADDIM_SWITCH(param.K, param.Kv, MaxK, [&] {
             if (param.custom_mask_type == 0)
-              run_batched_backward_causalmask_attnbias_dispatched<
+              run_batched_backward_causalmask_bias_dispatch<
                   ck::half_t,
                   false,
                   kHasBias,
                   kHasBiasGrad,
                   MaxK>(param, stream);
             else if (param.custom_mask_type == 1 || param.custom_mask_type == 2)
-              run_batched_backward_causalmask_attnbias_dispatched<
+              run_batched_backward_causalmask_bias_dispatch<
                   ck::half_t,
                   true,
                   kHasBias,
diff --git a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_forward.h b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_forward.h
index ee45f3631..79f6eceb6 100644
--- a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_forward.h
+++ b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_forward.h
@@ -33,7 +33,7 @@ template <
     bool kHasCausalMask,
     bool kHasBias,
     ck::index_t MaxK>
-struct batched_forward_causalmask_attnbias_dispatched {
+struct batched_forward_causalmask_bias_dispatch {
   template <typename FmhaTraits, typename FmhaMask>
   using FmhaPipelineProblemTemp =
       ck::tile_program::block::BlockFmhaPipelineProblem<
@@ -230,10 +230,10 @@ template <
     bool kHasCausalMask,
     bool kHasBias,
     ck::index_t MaxK>
-void run_batched_forward_causalmask_attnbias_dispatched(
+void run_batched_forward_causalmask_bias_dispatch(
     BatchedForwardParams& param,
     hipStream_t stream) {
-  batched_forward_causalmask_attnbias_dispatched<
+  batched_forward_causalmask_bias_dispatch<
       ScalarType,
       kHasCausalMask,
       kHasBias,
diff --git a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_forward_bp16.cpp b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_forward_bp16.cpp
index 90a8b2c59..6dad19459 100644
--- a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_forward_bp16.cpp
+++ b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_forward_bp16.cpp
@@ -13,40 +13,40 @@
 #include "ck_tiled_headdim_switch.h"
 
 // clang-format off
-extern template void run_batched_forward_causalmask_attnbias_dispatched<ck::bhalf_t, false, true, 32>(
+extern template void run_batched_forward_causalmask_bias_dispatch<ck::bhalf_t, false, true, 32>(
     BatchedForwardParams& param, hipStream_t stream);
-extern template void run_batched_forward_causalmask_attnbias_dispatched<ck::bhalf_t, false, false, 32>(
+extern template void run_batched_forward_causalmask_bias_dispatch<ck::bhalf_t, false, false, 32>(
     BatchedForwardParams& param, hipStream_t stream);
-extern template void run_batched_forward_causalmask_attnbias_dispatched<ck::bhalf_t, true, true, 32>(
+extern template void run_batched_forward_causalmask_bias_dispatch<ck::bhalf_t, true, true, 32>(
     BatchedForwardParams& param, hipStream_t stream);
-extern template void run_batched_forward_causalmask_attnbias_dispatched<ck::bhalf_t, true, false, 32>(
+extern template void run_batched_forward_causalmask_bias_dispatch<ck::bhalf_t, true, false, 32>(
     BatchedForwardParams& param, hipStream_t stream);
 
-extern template void run_batched_forward_causalmask_attnbias_dispatched<ck::bhalf_t, false, true, 64>(
+extern template void run_batched_forward_causalmask_bias_dispatch<ck::bhalf_t, false, true, 64>(
     BatchedForwardParams& param, hipStream_t stream);
-extern template void run_batched_forward_causalmask_attnbias_dispatched<ck::bhalf_t, false, false, 64>(
+extern template void run_batched_forward_causalmask_bias_dispatch<ck::bhalf_t, false, false, 64>(
     BatchedForwardParams& param, hipStream_t stream);
-extern template void run_batched_forward_causalmask_attnbias_dispatched<ck::bhalf_t, true, true, 64>(
+extern template void run_batched_forward_causalmask_bias_dispatch<ck::bhalf_t, true, true, 64>(
     BatchedForwardParams& param, hipStream_t stream);
-extern template void run_batched_forward_causalmask_attnbias_dispatched<ck::bhalf_t, true, false, 64>(
+extern template void run_batched_forward_causalmask_bias_dispatch<ck::bhalf_t, true, false, 64>(
     BatchedForwardParams& param, hipStream_t stream);
 
-extern template void run_batched_forward_causalmask_attnbias_dispatched<ck::bhalf_t, false, true, 128>(
+extern template void run_batched_forward_causalmask_bias_dispatch<ck::bhalf_t, false, true, 128>(
     BatchedForwardParams& param, hipStream_t stream);
-extern template void run_batched_forward_causalmask_attnbias_dispatched<ck::bhalf_t, false, false, 128>(
+extern template void run_batched_forward_causalmask_bias_dispatch<ck::bhalf_t, false, false, 128>(
     BatchedForwardParams& param, hipStream_t stream);
-extern template void run_batched_forward_causalmask_attnbias_dispatched<ck::bhalf_t, true, true, 128>(
+extern template void run_batched_forward_causalmask_bias_dispatch<ck::bhalf_t, true, true, 128>(
     BatchedForwardParams& param, hipStream_t stream);
-extern template void run_batched_forward_causalmask_attnbias_dispatched<ck::bhalf_t, true, false, 128>(
+extern template void run_batched_forward_causalmask_bias_dispatch<ck::bhalf_t, true, false, 128>(
     BatchedForwardParams& param, hipStream_t stream);
 
-extern template void run_batched_forward_causalmask_attnbias_dispatched<ck::bhalf_t, false, true, 256>(
+extern template void run_batched_forward_causalmask_bias_dispatch<ck::bhalf_t, false, true, 256>(
     BatchedForwardParams& param, hipStream_t stream);
-extern template void run_batched_forward_causalmask_attnbias_dispatched<ck::bhalf_t, false, false, 256>(
+extern template void run_batched_forward_causalmask_bias_dispatch<ck::bhalf_t, false, false, 256>(
     BatchedForwardParams& param, hipStream_t stream);
-extern template void run_batched_forward_causalmask_attnbias_dispatched<ck::bhalf_t, true, true, 256>(
+extern template void run_batched_forward_causalmask_bias_dispatch<ck::bhalf_t, true, true, 256>(
     BatchedForwardParams& param, hipStream_t stream);
-extern template void run_batched_forward_causalmask_attnbias_dispatched<ck::bhalf_t, true, false, 256>(
+extern template void run_batched_forward_causalmask_bias_dispatch<ck::bhalf_t, true, false, 256>(
     BatchedForwardParams& param, hipStream_t stream);
 // clang-format on
 
@@ -54,13 +54,13 @@ void batched_forward_bp16(BatchedForwardParams& param, hipStream_t stream) {
   BOOL_SWITCH(param.has_attn_bias, kHasBias, [&] {
     FMHA_FWD_HEADDIM_SWITCH(param.K, param.Kv, MaxK, [&] {
       if (param.custom_mask_type == 0)
-        run_batched_forward_causalmask_attnbias_dispatched<
+        run_batched_forward_causalmask_bias_dispatch<
             ck::bhalf_t,
             false,
             kHasBias,
             MaxK>(param, stream);
       else if (param.custom_mask_type == 1 || param.custom_mask_type == 2)
-        run_batched_forward_causalmask_attnbias_dispatched<
+        run_batched_forward_causalmask_bias_dispatch<
             ck::bhalf_t,
             true,
             kHasBias,
diff --git a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_forward_fp16.cpp b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_forward_fp16.cpp
index 469de6c79..73cd2e7fe 100644
--- a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_forward_fp16.cpp
+++ b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_forward_fp16.cpp
@@ -13,40 +13,40 @@
 #include "ck_tiled_headdim_switch.h"
 
 // clang-format off
-extern template void run_batched_forward_causalmask_attnbias_dispatched<ck::half_t, false, true, 32>(
+extern template void run_batched_forward_causalmask_bias_dispatch<ck::half_t, false, true, 32>(
     BatchedForwardParams& param, hipStream_t stream);
-extern template void run_batched_forward_causalmask_attnbias_dispatched<ck::half_t, false, false, 32>(
+extern template void run_batched_forward_causalmask_bias_dispatch<ck::half_t, false, false, 32>(
     BatchedForwardParams& param, hipStream_t stream);
-extern template void run_batched_forward_causalmask_attnbias_dispatched<ck::half_t, true, true, 32>(
+extern template void run_batched_forward_causalmask_bias_dispatch<ck::half_t, true, true, 32>(
     BatchedForwardParams& param, hipStream_t stream);
-extern template void run_batched_forward_causalmask_attnbias_dispatched<ck::half_t, true, false, 32>(
+extern template void run_batched_forward_causalmask_bias_dispatch<ck::half_t, true, false, 32>(
     BatchedForwardParams& param, hipStream_t stream);
 
-extern template void run_batched_forward_causalmask_attnbias_dispatched<ck::half_t, false, true, 64>(
+extern template void run_batched_forward_causalmask_bias_dispatch<ck::half_t, false, true, 64>(
     BatchedForwardParams& param, hipStream_t stream);
-extern template void run_batched_forward_causalmask_attnbias_dispatched<ck::half_t, false, false, 64>(
+extern template void run_batched_forward_causalmask_bias_dispatch<ck::half_t, false, false, 64>(
     BatchedForwardParams& param, hipStream_t stream);
-extern template void run_batched_forward_causalmask_attnbias_dispatched<ck::half_t, true, true, 64>(
+extern template void run_batched_forward_causalmask_bias_dispatch<ck::half_t, true, true, 64>(
     BatchedForwardParams& param, hipStream_t stream);
-extern template void run_batched_forward_causalmask_attnbias_dispatched<ck::half_t, true, false, 64>(
+extern template void run_batched_forward_causalmask_bias_dispatch<ck::half_t, true, false, 64>(
     BatchedForwardParams& param, hipStream_t stream);
 
-extern template void run_batched_forward_causalmask_attnbias_dispatched<ck::half_t, false, true, 128>(
+extern template void run_batched_forward_causalmask_bias_dispatch<ck::half_t, false, true, 128>(
     BatchedForwardParams& param, hipStream_t stream);
-extern template void run_batched_forward_causalmask_attnbias_dispatched<ck::half_t, false, false, 128>(
+extern template void run_batched_forward_causalmask_bias_dispatch<ck::half_t, false, false, 128>(
     BatchedForwardParams& param, hipStream_t stream);
-extern template void run_batched_forward_causalmask_attnbias_dispatched<ck::half_t, true, true, 128>(
+extern template void run_batched_forward_causalmask_bias_dispatch<ck::half_t, true, true, 128>(
     BatchedForwardParams& param, hipStream_t stream);
-extern template void run_batched_forward_causalmask_attnbias_dispatched<ck::half_t, true, false, 128>(
+extern template void run_batched_forward_causalmask_bias_dispatch<ck::half_t, true, false, 128>(
     BatchedForwardParams& param, hipStream_t stream);
 
-extern template void run_batched_forward_causalmask_attnbias_dispatched<ck::half_t, false, true, 256>(
+extern template void run_batched_forward_causalmask_bias_dispatch<ck::half_t, false, true, 256>(
     BatchedForwardParams& param, hipStream_t stream);
-extern template void run_batched_forward_causalmask_attnbias_dispatched<ck::half_t, false, false, 256>(
+extern template void run_batched_forward_causalmask_bias_dispatch<ck::half_t, false, false, 256>(
     BatchedForwardParams& param, hipStream_t stream);
-extern template void run_batched_forward_causalmask_attnbias_dispatched<ck::half_t, true, true, 256>(
+extern template void run_batched_forward_causalmask_bias_dispatch<ck::half_t, true, true, 256>(
     BatchedForwardParams& param, hipStream_t stream);
-extern template void run_batched_forward_causalmask_attnbias_dispatched<ck::half_t, true, false, 256>(
+extern template void run_batched_forward_causalmask_bias_dispatch<ck::half_t, true, false, 256>(
     BatchedForwardParams& param, hipStream_t stream);
 // clang-format on
 
@@ -54,13 +54,13 @@ void batched_forward_fp16(BatchedForwardParams& param, hipStream_t stream) {
   BOOL_SWITCH(param.has_attn_bias, kHasBias, [&] {
     FMHA_FWD_HEADDIM_SWITCH(param.K, param.Kv, MaxK, [&] {
       if (param.custom_mask_type == 0)
-        run_batched_forward_causalmask_attnbias_dispatched<
+        run_batched_forward_causalmask_bias_dispatch<
             ck::half_t,
             false,
             kHasBias,
             MaxK>(param, stream);
       else if (param.custom_mask_type == 1 || param.custom_mask_type == 2)
-        run_batched_forward_causalmask_attnbias_dispatched<
+        run_batched_forward_causalmask_bias_dispatch<
             ck::half_t,
             true,
             kHasBias,
diff --git a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_infer.h b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_infer.h
index 4b53877f3..eb65e7aba 100644
--- a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_infer.h
+++ b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_infer.h
@@ -34,7 +34,7 @@ template <
     bool kHasCausalMask,
     bool kHasBias,
     ck::index_t MaxK>
-struct batched_infer_causalmask_attnbias_dispatched {
+struct batched_infer_causalmask_bias_dispatch {
   template <typename FmhaTraits, typename FmhaMask>
   using FmhaPipelineProblemTemp =
       ck::tile_program::block::BlockFmhaPipelineProblem<
@@ -229,10 +229,10 @@ template <
     bool kHasCausalMask,
     bool kHasBias,
     ck::index_t MaxK>
-void run_batched_infer_causalmask_attnbias_dispatched(
+void run_batched_infer_causalmask_bias_dispatch(
     BatchedForwardParams& param,
     hipStream_t stream) {
-  batched_infer_causalmask_attnbias_dispatched<
+  batched_infer_causalmask_bias_dispatch<
       ScalarType,
       kHasCausalMask,
       kHasBias,
diff --git a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_infer_bp16.cpp b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_infer_bp16.cpp
index 0bb91bc52..9a14373ad 100644
--- a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_infer_bp16.cpp
+++ b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_infer_bp16.cpp
@@ -12,40 +12,40 @@
 #include "ck_tiled_fmha_batched_infer.h"
 
 // clang-format off
-extern template void run_batched_infer_causalmask_attnbias_dispatched<ck::bhalf_t, false, true, 32>(
+extern template void run_batched_infer_causalmask_bias_dispatch<ck::bhalf_t, false, true, 32>(
     BatchedForwardParams& param, hipStream_t stream);
-extern template void run_batched_infer_causalmask_attnbias_dispatched<ck::bhalf_t, false, false, 32>(
+extern template void run_batched_infer_causalmask_bias_dispatch<ck::bhalf_t, false, false, 32>(
     BatchedForwardParams& param, hipStream_t stream);
-extern template void run_batched_infer_causalmask_attnbias_dispatched<ck::bhalf_t, true, true, 32>(
+extern template void run_batched_infer_causalmask_bias_dispatch<ck::bhalf_t, true, true, 32>(
     BatchedForwardParams& param, hipStream_t stream);
-extern template void run_batched_infer_causalmask_attnbias_dispatched<ck::bhalf_t, true, false, 32>(
+extern template void run_batched_infer_causalmask_bias_dispatch<ck::bhalf_t, true, false, 32>(
     BatchedForwardParams& param, hipStream_t stream);
 
-extern template void run_batched_infer_causalmask_attnbias_dispatched<ck::bhalf_t, false, true, 64>(
+extern template void run_batched_infer_causalmask_bias_dispatch<ck::bhalf_t, false, true, 64>(
     BatchedForwardParams& param, hipStream_t stream);
-extern template void run_batched_infer_causalmask_attnbias_dispatched<ck::bhalf_t, false, false, 64>(
+extern template void run_batched_infer_causalmask_bias_dispatch<ck::bhalf_t, false, false, 64>(
     BatchedForwardParams& param, hipStream_t stream);
-extern template void run_batched_infer_causalmask_attnbias_dispatched<ck::bhalf_t, true, true, 64>(
+extern template void run_batched_infer_causalmask_bias_dispatch<ck::bhalf_t, true, true, 64>(
     BatchedForwardParams& param, hipStream_t stream);
-extern template void run_batched_infer_causalmask_attnbias_dispatched<ck::bhalf_t, true, false, 64>(
+extern template void run_batched_infer_causalmask_bias_dispatch<ck::bhalf_t, true, false, 64>(
     BatchedForwardParams& param, hipStream_t stream);
 
-extern template void run_batched_infer_causalmask_attnbias_dispatched<ck::bhalf_t, false, true, 128>(
+extern template void run_batched_infer_causalmask_bias_dispatch<ck::bhalf_t, false, true, 128>(
     BatchedForwardParams& param, hipStream_t stream);
-extern template void run_batched_infer_causalmask_attnbias_dispatched<ck::bhalf_t, false, false, 128>(
+extern template void run_batched_infer_causalmask_bias_dispatch<ck::bhalf_t, false, false, 128>(
     BatchedForwardParams& param, hipStream_t stream);
-extern template void run_batched_infer_causalmask_attnbias_dispatched<ck::bhalf_t, true, true, 128>(
+extern template void run_batched_infer_causalmask_bias_dispatch<ck::bhalf_t, true, true, 128>(
     BatchedForwardParams& param, hipStream_t stream);
-extern template void run_batched_infer_causalmask_attnbias_dispatched<ck::bhalf_t, true, false, 128>(
+extern template void run_batched_infer_causalmask_bias_dispatch<ck::bhalf_t, true, false, 128>(
     BatchedForwardParams& param, hipStream_t stream);
 
-extern template void run_batched_infer_causalmask_attnbias_dispatched<ck::bhalf_t, false, true, 256>(
+extern template void run_batched_infer_causalmask_bias_dispatch<ck::bhalf_t, false, true, 256>(
     BatchedForwardParams& param, hipStream_t stream);
-extern template void run_batched_infer_causalmask_attnbias_dispatched<ck::bhalf_t, false, false, 256>(
+extern template void run_batched_infer_causalmask_bias_dispatch<ck::bhalf_t, false, false, 256>(
     BatchedForwardParams& param, hipStream_t stream);
-extern template void run_batched_infer_causalmask_attnbias_dispatched<ck::bhalf_t, true, true, 256>(
+extern template void run_batched_infer_causalmask_bias_dispatch<ck::bhalf_t, true, true, 256>(
     BatchedForwardParams& param, hipStream_t stream);
-extern template void run_batched_infer_causalmask_attnbias_dispatched<ck::bhalf_t, true, false, 256>(
+extern template void run_batched_infer_causalmask_bias_dispatch<ck::bhalf_t, true, false, 256>(
     BatchedForwardParams& param, hipStream_t stream);
 // clang-format on
 
@@ -53,13 +53,13 @@ void batched_infer_bp16(BatchedForwardParams& param, hipStream_t stream) {
   BOOL_SWITCH(param.has_attn_bias, kHasBias, [&] {
     FMHA_FWD_HEADDIM_SWITCH(param.K, param.Kv, MaxK, [&] {
       if (param.custom_mask_type == 0)
-        run_batched_infer_causalmask_attnbias_dispatched<
+        run_batched_infer_causalmask_bias_dispatch<
             ck::bhalf_t,
             false,
             kHasBias,
             MaxK>(param, stream);
       else if (param.custom_mask_type == 1 || param.custom_mask_type == 2)
-        run_batched_infer_causalmask_attnbias_dispatched<
+        run_batched_infer_causalmask_bias_dispatch<
             ck::bhalf_t,
             true,
             kHasBias,
diff --git a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_infer_fp16.cpp b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_infer_fp16.cpp
index 9e5ebe808..d2f8e7bfe 100644
--- a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_infer_fp16.cpp
+++ b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_infer_fp16.cpp
@@ -12,40 +12,40 @@
 #include "ck_tiled_fmha_batched_infer.h"
 
 // clang-format off
-extern template void run_batched_infer_causalmask_attnbias_dispatched<ck::half_t, false, true, 32>(
+extern template void run_batched_infer_causalmask_bias_dispatch<ck::half_t, false, true, 32>(
     BatchedForwardParams& param, hipStream_t stream);
-extern template void run_batched_infer_causalmask_attnbias_dispatched<ck::half_t, false, false, 32>(
+extern template void run_batched_infer_causalmask_bias_dispatch<ck::half_t, false, false, 32>(
     BatchedForwardParams& param, hipStream_t stream);
-extern template void run_batched_infer_causalmask_attnbias_dispatched<ck::half_t, true, true, 32>(
+extern template void run_batched_infer_causalmask_bias_dispatch<ck::half_t, true, true, 32>(
     BatchedForwardParams& param, hipStream_t stream);
-extern template void run_batched_infer_causalmask_attnbias_dispatched<ck::half_t, true, false, 32>(
+extern template void run_batched_infer_causalmask_bias_dispatch<ck::half_t, true, false, 32>(
     BatchedForwardParams& param, hipStream_t stream);
 
-extern template void run_batched_infer_causalmask_attnbias_dispatched<ck::half_t, false, true, 64>(
+extern template void run_batched_infer_causalmask_bias_dispatch<ck::half_t, false, true, 64>(
     BatchedForwardParams& param, hipStream_t stream);
-extern template void run_batched_infer_causalmask_attnbias_dispatched<ck::half_t, false, false, 64>(
+extern template void run_batched_infer_causalmask_bias_dispatch<ck::half_t, false, false, 64>(
     BatchedForwardParams& param, hipStream_t stream);
-extern template void run_batched_infer_causalmask_attnbias_dispatched<ck::half_t, true, true, 64>(
+extern template void run_batched_infer_causalmask_bias_dispatch<ck::half_t, true, true, 64>(
     BatchedForwardParams& param, hipStream_t stream);
-extern template void run_batched_infer_causalmask_attnbias_dispatched<ck::half_t, true, false, 64>(
+extern template void run_batched_infer_causalmask_bias_dispatch<ck::half_t, true, false, 64>(
     BatchedForwardParams& param, hipStream_t stream);
 
-extern template void run_batched_infer_causalmask_attnbias_dispatched<ck::half_t, false, true, 128>(
+extern template void run_batched_infer_causalmask_bias_dispatch<ck::half_t, false, true, 128>(
     BatchedForwardParams& param, hipStream_t stream);
-extern template void run_batched_infer_causalmask_attnbias_dispatched<ck::half_t, false, false, 128>(
+extern template void run_batched_infer_causalmask_bias_dispatch<ck::half_t, false, false, 128>(
     BatchedForwardParams& param, hipStream_t stream);
-extern template void run_batched_infer_causalmask_attnbias_dispatched<ck::half_t, true, true, 128>(
+extern template void run_batched_infer_causalmask_bias_dispatch<ck::half_t, true, true, 128>(
     BatchedForwardParams& param, hipStream_t stream);
-extern template void run_batched_infer_causalmask_attnbias_dispatched<ck::half_t, true, false, 128>(
+extern template void run_batched_infer_causalmask_bias_dispatch<ck::half_t, true, false, 128>(
     BatchedForwardParams& param, hipStream_t stream);
 
-extern template void run_batched_infer_causalmask_attnbias_dispatched<ck::half_t, false, true, 256>(
+extern template void run_batched_infer_causalmask_bias_dispatch<ck::half_t, false, true, 256>(
     BatchedForwardParams& param, hipStream_t stream);
-extern template void run_batched_infer_causalmask_attnbias_dispatched<ck::half_t, false, false, 256>(
+extern template void run_batched_infer_causalmask_bias_dispatch<ck::half_t, false, false, 256>(
     BatchedForwardParams& param, hipStream_t stream);
-extern template void run_batched_infer_causalmask_attnbias_dispatched<ck::half_t, true, true, 256>(
+extern template void run_batched_infer_causalmask_bias_dispatch<ck::half_t, true, true, 256>(
     BatchedForwardParams& param, hipStream_t stream);
-extern template void run_batched_infer_causalmask_attnbias_dispatched<ck::half_t, true, false, 256>(
+extern template void run_batched_infer_causalmask_bias_dispatch<ck::half_t, true, false, 256>(
     BatchedForwardParams& param, hipStream_t stream);
 // clang-format on
 
@@ -53,13 +53,13 @@ void batched_infer_fp16(BatchedForwardParams& param, hipStream_t stream) {
   BOOL_SWITCH(param.has_attn_bias, kHasBias, [&] {
     FMHA_FWD_HEADDIM_SWITCH(param.K, param.Kv, MaxK, [&] {
       if (param.custom_mask_type == 0)
-        run_batched_infer_causalmask_attnbias_dispatched<
+        run_batched_infer_causalmask_bias_dispatch<
             ck::half_t,
             false,
             kHasBias,
             MaxK>(param, stream);
       else if (param.custom_mask_type == 1 || param.custom_mask_type == 2)
-        run_batched_infer_causalmask_attnbias_dispatched<
+        run_batched_infer_causalmask_bias_dispatch<
             ck::half_t,
             true,
             kHasBias,
diff --git a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_backward.h b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_backward.h
index 9a77d4f10..264cafa1c 100644
--- a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_backward.h
+++ b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_backward.h
@@ -34,7 +34,7 @@ template <
     bool kHasBias,
     bool kHasBiasGrad,
     ck::index_t MaxK>
-struct grouped_backward_causalmask_attnbias_dispatched {
+struct grouped_backward_causalmask_bias_dispatch {
   using FmhaBwdEpilogue_ = FmhaBwdEpilogue<FmhaBwdEpilogueProblem<
       typename FmhaBwdTypeConfig<ScalarType>::AccDataType,
       typename FmhaBwdTypeConfig<ScalarType>::KGradDataType,
@@ -271,10 +271,10 @@ template <
     bool kHasBias,
     bool kHasBiasGrad,
     ck::index_t MaxK>
-void run_grouped_backward_causalmask_attnbias_dispatched(
+void run_grouped_backward_causalmask_bias_dispatch(
     GroupedBackwardParams& param,
     hipStream_t stream) {
-  grouped_backward_causalmask_attnbias_dispatched<
+  grouped_backward_causalmask_bias_dispatch<
       ScalarType,
       kHasCausalMask,
       kHasBias,
diff --git a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_backward_bp16.cpp b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_backward_bp16.cpp
index 10337fcd2..f0164e470 100644
--- a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_backward_bp16.cpp
+++ b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_backward_bp16.cpp
@@ -13,43 +13,43 @@
 #include "ck_tiled_headdim_switch.h"
 
 // clang-format off
-extern template void run_grouped_backward_causalmask_attnbias_dispatched<ck::bhalf_t, false, true, true, 32>(
+extern template void run_grouped_backward_causalmask_bias_dispatch<ck::bhalf_t, false, true, true, 32>(
     GroupedBackwardParams& param, hipStream_t stream);
-extern template void run_grouped_backward_causalmask_attnbias_dispatched<ck::bhalf_t, false, true, false, 32>(
+extern template void run_grouped_backward_causalmask_bias_dispatch<ck::bhalf_t, false, true, false, 32>(
     GroupedBackwardParams& param, hipStream_t stream);
-extern template void run_grouped_backward_causalmask_attnbias_dispatched<ck::bhalf_t, false, false, false, 32>(
+extern template void run_grouped_backward_causalmask_bias_dispatch<ck::bhalf_t, false, false, false, 32>(
     GroupedBackwardParams& param, hipStream_t stream);
-extern template void run_grouped_backward_causalmask_attnbias_dispatched<ck::bhalf_t, true, true, true, 32>(
+extern template void run_grouped_backward_causalmask_bias_dispatch<ck::bhalf_t, true, true, true, 32>(
     GroupedBackwardParams& param, hipStream_t stream);
-extern template void run_grouped_backward_causalmask_attnbias_dispatched<ck::bhalf_t, true, true, false, 32>(
+extern template void run_grouped_backward_causalmask_bias_dispatch<ck::bhalf_t, true, true, false, 32>(
     GroupedBackwardParams& param, hipStream_t stream);
-extern template void run_grouped_backward_causalmask_attnbias_dispatched<ck::bhalf_t, true, false, false, 32>(
+extern template void run_grouped_backward_causalmask_bias_dispatch<ck::bhalf_t, true, false, false, 32>(
     GroupedBackwardParams& param, hipStream_t stream);
 
-extern template void run_grouped_backward_causalmask_attnbias_dispatched<ck::bhalf_t, false, true, true, 64>(
+extern template void run_grouped_backward_causalmask_bias_dispatch<ck::bhalf_t, false, true, true, 64>(
     GroupedBackwardParams& param, hipStream_t stream);
-extern template void run_grouped_backward_causalmask_attnbias_dispatched<ck::bhalf_t, false, true, false, 64>(
+extern template void run_grouped_backward_causalmask_bias_dispatch<ck::bhalf_t, false, true, false, 64>(
     GroupedBackwardParams& param, hipStream_t stream);
-extern template void run_grouped_backward_causalmask_attnbias_dispatched<ck::bhalf_t, false, false, false, 64>(
+extern template void run_grouped_backward_causalmask_bias_dispatch<ck::bhalf_t, false, false, false, 64>(
     GroupedBackwardParams& param, hipStream_t stream);
-extern template void run_grouped_backward_causalmask_attnbias_dispatched<ck::bhalf_t, true, true, true, 64>(
+extern template void run_grouped_backward_causalmask_bias_dispatch<ck::bhalf_t, true, true, true, 64>(
     GroupedBackwardParams& param, hipStream_t stream);
-extern template void run_grouped_backward_causalmask_attnbias_dispatched<ck::bhalf_t, true, true, false, 64>(
+extern template void run_grouped_backward_causalmask_bias_dispatch<ck::bhalf_t, true, true, false, 64>(
     GroupedBackwardParams& param, hipStream_t stream);
-extern template void run_grouped_backward_causalmask_attnbias_dispatched<ck::bhalf_t, true, false, false, 64>(
+extern template void run_grouped_backward_causalmask_bias_dispatch<ck::bhalf_t, true, false, false, 64>(
     GroupedBackwardParams& param, hipStream_t stream);
 
-extern template void run_grouped_backward_causalmask_attnbias_dispatched<ck::bhalf_t, false, true, true, 128>(
+extern template void run_grouped_backward_causalmask_bias_dispatch<ck::bhalf_t, false, true, true, 128>(
     GroupedBackwardParams& param, hipStream_t stream);
-extern template void run_grouped_backward_causalmask_attnbias_dispatched<ck::bhalf_t, false, true, false, 128>(
+extern template void run_grouped_backward_causalmask_bias_dispatch<ck::bhalf_t, false, true, false, 128>(
     GroupedBackwardParams& param, hipStream_t stream);
-extern template void run_grouped_backward_causalmask_attnbias_dispatched<ck::bhalf_t, false, false, false, 128>(
+extern template void run_grouped_backward_causalmask_bias_dispatch<ck::bhalf_t, false, false, false, 128>(
     GroupedBackwardParams& param, hipStream_t stream);
-extern template void run_grouped_backward_causalmask_attnbias_dispatched<ck::bhalf_t, true, true, true, 128>(
+extern template void run_grouped_backward_causalmask_bias_dispatch<ck::bhalf_t, true, true, true, 128>(
     GroupedBackwardParams& param, hipStream_t stream);
-extern template void run_grouped_backward_causalmask_attnbias_dispatched<ck::bhalf_t, true, true, false, 128>(
+extern template void run_grouped_backward_causalmask_bias_dispatch<ck::bhalf_t, true, true, false, 128>(
     GroupedBackwardParams& param, hipStream_t stream);
-extern template void run_grouped_backward_causalmask_attnbias_dispatched<ck::bhalf_t, true, false, false, 128>(
+extern template void run_grouped_backward_causalmask_bias_dispatch<ck::bhalf_t, true, false, false, 128>(
     GroupedBackwardParams& param, hipStream_t stream);
 // clang-format on
 
@@ -63,14 +63,14 @@ void grouped_backward_bp16(GroupedBackwardParams& param, hipStream_t stream) {
         if constexpr (HAS_ATTN_BIAS || !HAS_BIAS_GRAD) {
           FMHA_BWD_HEADDIM_SWITCH(param.K, param.Kv, MaxK, [&] {
             if (param.custom_mask_type == 0)
-              run_grouped_backward_causalmask_attnbias_dispatched<
+              run_grouped_backward_causalmask_bias_dispatch<
                   ck::bhalf_t,
                   false,
                   HAS_ATTN_BIAS,
                   HAS_BIAS_GRAD,
                   MaxK>(param, stream);
             else if (param.custom_mask_type == 1 || param.custom_mask_type == 2)
-              run_grouped_backward_causalmask_attnbias_dispatched<
+              run_grouped_backward_causalmask_bias_dispatch<
                   ck::bhalf_t,
                   true,
                   HAS_ATTN_BIAS,
diff --git a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_backward_fp16.cpp b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_backward_fp16.cpp
index 8707ef38f..7703b742c 100644
--- a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_backward_fp16.cpp
+++ b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_backward_fp16.cpp
@@ -13,43 +13,43 @@
 #include "ck_tiled_headdim_switch.h"
 
 // clang-format off
-extern template void run_grouped_backward_causalmask_attnbias_dispatched<ck::half_t, false, true, true, 32>(
+extern template void run_grouped_backward_causalmask_bias_dispatch<ck::half_t, false, true, true, 32>(
     GroupedBackwardParams& param, hipStream_t stream);
-extern template void run_grouped_backward_causalmask_attnbias_dispatched<ck::half_t, false, true, false, 32>(
+extern template void run_grouped_backward_causalmask_bias_dispatch<ck::half_t, false, true, false, 32>(
     GroupedBackwardParams& param, hipStream_t stream);
-extern template void run_grouped_backward_causalmask_attnbias_dispatched<ck::half_t, false, false, false, 32>(
+extern template void run_grouped_backward_causalmask_bias_dispatch<ck::half_t, false, false, false, 32>(
     GroupedBackwardParams& param, hipStream_t stream);
-extern template void run_grouped_backward_causalmask_attnbias_dispatched<ck::half_t, true, true, true, 32>(
+extern template void run_grouped_backward_causalmask_bias_dispatch<ck::half_t, true, true, true, 32>(
     GroupedBackwardParams& param, hipStream_t stream);
-extern template void run_grouped_backward_causalmask_attnbias_dispatched<ck::half_t, true, true, false, 32>(
+extern template void run_grouped_backward_causalmask_bias_dispatch<ck::half_t, true, true, false, 32>(
     GroupedBackwardParams& param, hipStream_t stream);
-extern template void run_grouped_backward_causalmask_attnbias_dispatched<ck::half_t, true, false, false, 32>(
+extern template void run_grouped_backward_causalmask_bias_dispatch<ck::half_t, true, false, false, 32>(
     GroupedBackwardParams& param, hipStream_t stream);
 
-extern template void run_grouped_backward_causalmask_attnbias_dispatched<ck::half_t, false, true, true, 64>(
+extern template void run_grouped_backward_causalmask_bias_dispatch<ck::half_t, false, true, true, 64>(
     GroupedBackwardParams& param, hipStream_t stream);
-extern template void run_grouped_backward_causalmask_attnbias_dispatched<ck::half_t, false, true, false, 64>(
+extern template void run_grouped_backward_causalmask_bias_dispatch<ck::half_t, false, true, false, 64>(
     GroupedBackwardParams& param, hipStream_t stream);
-extern template void run_grouped_backward_causalmask_attnbias_dispatched<ck::half_t, false, false, false, 64>(
+extern template void run_grouped_backward_causalmask_bias_dispatch<ck::half_t, false, false, false, 64>(
     GroupedBackwardParams& param, hipStream_t stream);
-extern template void run_grouped_backward_causalmask_attnbias_dispatched<ck::half_t, true, true, true, 64>(
+extern template void run_grouped_backward_causalmask_bias_dispatch<ck::half_t, true, true, true, 64>(
     GroupedBackwardParams& param, hipStream_t stream);
-extern template void run_grouped_backward_causalmask_attnbias_dispatched<ck::half_t, true, true, false, 64>(
+extern template void run_grouped_backward_causalmask_bias_dispatch<ck::half_t, true, true, false, 64>(
     GroupedBackwardParams& param, hipStream_t stream);
-extern template void run_grouped_backward_causalmask_attnbias_dispatched<ck::half_t, true, false, false, 64>(
+extern template void run_grouped_backward_causalmask_bias_dispatch<ck::half_t, true, false, false, 64>(
     GroupedBackwardParams& param, hipStream_t stream);
 
-extern template void run_grouped_backward_causalmask_attnbias_dispatched<ck::half_t, false, true, true, 128>(
+extern template void run_grouped_backward_causalmask_bias_dispatch<ck::half_t, false, true, true, 128>(
     GroupedBackwardParams& param, hipStream_t stream);
-extern template void run_grouped_backward_causalmask_attnbias_dispatched<ck::half_t, false, true, false, 128>(
+extern template void run_grouped_backward_causalmask_bias_dispatch<ck::half_t, false, true, false, 128>(
     GroupedBackwardParams& param, hipStream_t stream);
-extern template void run_grouped_backward_causalmask_attnbias_dispatched<ck::half_t, false, false, false, 128>(
+extern template void run_grouped_backward_causalmask_bias_dispatch<ck::half_t, false, false, false, 128>(
     GroupedBackwardParams& param, hipStream_t stream);
-extern template void run_grouped_backward_causalmask_attnbias_dispatched<ck::half_t, true, true, true, 128>(
+extern template void run_grouped_backward_causalmask_bias_dispatch<ck::half_t, true, true, true, 128>(
     GroupedBackwardParams& param, hipStream_t stream);
-extern template void run_grouped_backward_causalmask_attnbias_dispatched<ck::half_t, true, true, false, 128>(
+extern template void run_grouped_backward_causalmask_bias_dispatch<ck::half_t, true, true, false, 128>(
     GroupedBackwardParams& param, hipStream_t stream);
-extern template void run_grouped_backward_causalmask_attnbias_dispatched<ck::half_t, true, false, false, 128>(
+extern template void run_grouped_backward_causalmask_bias_dispatch<ck::half_t, true, false, false, 128>(
     GroupedBackwardParams& param, hipStream_t stream);
 // clang-format on
 
@@ -59,14 +59,14 @@ void grouped_backward_fp16(GroupedBackwardParams& param, hipStream_t stream) {
         if constexpr (kHasBias || !kHasBiasGrad) {
           FMHA_BWD_HEADDIM_SWITCH(param.K, param.Kv, MaxK, [&] {
             if (param.custom_mask_type == 0)
-              run_grouped_backward_causalmask_attnbias_dispatched<
+              run_grouped_backward_causalmask_bias_dispatch<
                   ck::half_t,
                   false,
                   kHasBias,
                   kHasBiasGrad,
                   MaxK>(param, stream);
             else if (param.custom_mask_type == 1 || param.custom_mask_type == 2)
-              run_grouped_backward_causalmask_attnbias_dispatched<
+              run_grouped_backward_causalmask_bias_dispatch<
                   ck::half_t,
                   true,
                   kHasBias,
diff --git a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_forward.h b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_forward.h
index 70beb6ff2..345c8fe35 100644
--- a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_forward.h
+++ b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_forward.h
@@ -32,7 +32,7 @@ template <
     bool kHasCausalMask,
     bool kHasBias,
     ck::index_t MaxK>
-struct grouped_forward_causalmask_attnbias_dispatched {
+struct grouped_forward_causalmask_bias_dispatch {
   template <typename FmhaTraits, typename FmhaMask>
   using FmhaPipelineProblemTemp =
       ck::tile_program::block::BlockFmhaPipelineProblem<
@@ -182,10 +182,10 @@ template <
     bool kHasCausalMask,
     bool kHasBias,
     ck::index_t MaxK>
-void run_grouped_forward_causalmask_attnbias_dispatched(
+void run_grouped_forward_causalmask_bias_dispatch(
     GroupedForwardParams& param,
     hipStream_t stream) {
-  grouped_forward_causalmask_attnbias_dispatched<
+  grouped_forward_causalmask_bias_dispatch<
       ScalarType,
       kHasCausalMask,
       kHasBias,
diff --git a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_forward_bp16.cpp b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_forward_bp16.cpp
index d49d7ccf6..50e3bac62 100644
--- a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_forward_bp16.cpp
+++ b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_forward_bp16.cpp
@@ -13,40 +13,40 @@
 #include "ck_tiled_headdim_switch.h"
 
 // clang-format off
-extern template void run_grouped_forward_causalmask_attnbias_dispatched<ck::bhalf_t, false, true, 32>(
+extern template void run_grouped_forward_causalmask_bias_dispatch<ck::bhalf_t, false, true, 32>(
     GroupedForwardParams& param, hipStream_t stream);
-extern template void run_grouped_forward_causalmask_attnbias_dispatched<ck::bhalf_t, false, false, 32>(
+extern template void run_grouped_forward_causalmask_bias_dispatch<ck::bhalf_t, false, false, 32>(
     GroupedForwardParams& param, hipStream_t stream);
-extern template void run_grouped_forward_causalmask_attnbias_dispatched<ck::bhalf_t, true, true, 32>(
+extern template void run_grouped_forward_causalmask_bias_dispatch<ck::bhalf_t, true, true, 32>(
     GroupedForwardParams& param, hipStream_t stream);
-extern template void run_grouped_forward_causalmask_attnbias_dispatched<ck::bhalf_t, true, false, 32>(
+extern template void run_grouped_forward_causalmask_bias_dispatch<ck::bhalf_t, true, false, 32>(
     GroupedForwardParams& param, hipStream_t stream);
 
-extern template void run_grouped_forward_causalmask_attnbias_dispatched<ck::bhalf_t, false, true, 64>(
+extern template void run_grouped_forward_causalmask_bias_dispatch<ck::bhalf_t, false, true, 64>(
     GroupedForwardParams& param, hipStream_t stream);
-extern template void run_grouped_forward_causalmask_attnbias_dispatched<ck::bhalf_t, false, false, 64>(
+extern template void run_grouped_forward_causalmask_bias_dispatch<ck::bhalf_t, false, false, 64>(
     GroupedForwardParams& param, hipStream_t stream);
-extern template void run_grouped_forward_causalmask_attnbias_dispatched<ck::bhalf_t, true, true, 64>(
+extern template void run_grouped_forward_causalmask_bias_dispatch<ck::bhalf_t, true, true, 64>(
     GroupedForwardParams& param, hipStream_t stream);
-extern template void run_grouped_forward_causalmask_attnbias_dispatched<ck::bhalf_t, true, false, 64>(
+extern template void run_grouped_forward_causalmask_bias_dispatch<ck::bhalf_t, true, false, 64>(
     GroupedForwardParams& param, hipStream_t stream);
 
-extern template void run_grouped_forward_causalmask_attnbias_dispatched<ck::bhalf_t, false, true, 128>(
+extern template void run_grouped_forward_causalmask_bias_dispatch<ck::bhalf_t, false, true, 128>(
     GroupedForwardParams& param, hipStream_t stream);
-extern template void run_grouped_forward_causalmask_attnbias_dispatched<ck::bhalf_t, false, false, 128>(
+extern template void run_grouped_forward_causalmask_bias_dispatch<ck::bhalf_t, false, false, 128>(
     GroupedForwardParams& param, hipStream_t stream);
-extern template void run_grouped_forward_causalmask_attnbias_dispatched<ck::bhalf_t, true, true, 128>(
+extern template void run_grouped_forward_causalmask_bias_dispatch<ck::bhalf_t, true, true, 128>(
     GroupedForwardParams& param, hipStream_t stream);
-extern template void run_grouped_forward_causalmask_attnbias_dispatched<ck::bhalf_t, true, false, 128>(
+extern template void run_grouped_forward_causalmask_bias_dispatch<ck::bhalf_t, true, false, 128>(
     GroupedForwardParams& param, hipStream_t stream);
 
-extern template void run_grouped_forward_causalmask_attnbias_dispatched<ck::bhalf_t, false, true, 256>(
+extern template void run_grouped_forward_causalmask_bias_dispatch<ck::bhalf_t, false, true, 256>(
     GroupedForwardParams& param, hipStream_t stream);
-extern template void run_grouped_forward_causalmask_attnbias_dispatched<ck::bhalf_t, false, false, 256>(
+extern template void run_grouped_forward_causalmask_bias_dispatch<ck::bhalf_t, false, false, 256>(
     GroupedForwardParams& param, hipStream_t stream);
-extern template void run_grouped_forward_causalmask_attnbias_dispatched<ck::bhalf_t, true, true, 256>(
+extern template void run_grouped_forward_causalmask_bias_dispatch<ck::bhalf_t, true, true, 256>(
     GroupedForwardParams& param, hipStream_t stream);
-extern template void run_grouped_forward_causalmask_attnbias_dispatched<ck::bhalf_t, true, false, 256>(
+extern template void run_grouped_forward_causalmask_bias_dispatch<ck::bhalf_t, true, false, 256>(
     GroupedForwardParams& param, hipStream_t stream);
 // clang-format on
 
@@ -54,13 +54,13 @@ void grouped_forward_bp16(GroupedForwardParams& param, hipStream_t stream) {
   BOOL_SWITCH(param.has_attn_bias, kHasBias, [&] {
     FMHA_FWD_HEADDIM_SWITCH(param.K, param.Kv, MaxK, [&] {
       if (param.custom_mask_type == 0)
-        run_grouped_forward_causalmask_attnbias_dispatched<
+        run_grouped_forward_causalmask_bias_dispatch<
             ck::bhalf_t,
             false,
             kHasBias,
             MaxK>(param, stream);
       else if (param.custom_mask_type == 1 || param.custom_mask_type == 2)
-        run_grouped_forward_causalmask_attnbias_dispatched<
+        run_grouped_forward_causalmask_bias_dispatch<
             ck::bhalf_t,
             true,
             kHasBias,
diff --git a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_forward_fp16.cpp b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_forward_fp16.cpp
index f0ca8a102..f566a6d2c 100644
--- a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_forward_fp16.cpp
+++ b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_forward_fp16.cpp
@@ -13,40 +13,40 @@
 #include "ck_tiled_headdim_switch.h"
 
 // clang-format off
-extern template void run_grouped_forward_causalmask_attnbias_dispatched<ck::half_t, false, true, 32>(
+extern template void run_grouped_forward_causalmask_bias_dispatch<ck::half_t, false, true, 32>(
     GroupedForwardParams& param, hipStream_t stream);
-extern template void run_grouped_forward_causalmask_attnbias_dispatched<ck::half_t, false, false, 32>(
+extern template void run_grouped_forward_causalmask_bias_dispatch<ck::half_t, false, false, 32>(
     GroupedForwardParams& param, hipStream_t stream);
-extern template void run_grouped_forward_causalmask_attnbias_dispatched<ck::half_t, true, true, 32>(
+extern template void run_grouped_forward_causalmask_bias_dispatch<ck::half_t, true, true, 32>(
     GroupedForwardParams& param, hipStream_t stream);
-extern template void run_grouped_forward_causalmask_attnbias_dispatched<ck::half_t, true, false, 32>(
+extern template void run_grouped_forward_causalmask_bias_dispatch<ck::half_t, true, false, 32>(
     GroupedForwardParams& param, hipStream_t stream);
 
-extern template void run_grouped_forward_causalmask_attnbias_dispatched<ck::half_t, false, true, 64>(
+extern template void run_grouped_forward_causalmask_bias_dispatch<ck::half_t, false, true, 64>(
     GroupedForwardParams& param, hipStream_t stream);
-extern template void run_grouped_forward_causalmask_attnbias_dispatched<ck::half_t, false, false, 64>(
+extern template void run_grouped_forward_causalmask_bias_dispatch<ck::half_t, false, false, 64>(
     GroupedForwardParams& param, hipStream_t stream);
-extern template void run_grouped_forward_causalmask_attnbias_dispatched<ck::half_t, true, true, 64>(
+extern template void run_grouped_forward_causalmask_bias_dispatch<ck::half_t, true, true, 64>(
     GroupedForwardParams& param, hipStream_t stream);
-extern template void run_grouped_forward_causalmask_attnbias_dispatched<ck::half_t, true, false, 64>(
+extern template void run_grouped_forward_causalmask_bias_dispatch<ck::half_t, true, false, 64>(
     GroupedForwardParams& param, hipStream_t stream);
 
-extern template void run_grouped_forward_causalmask_attnbias_dispatched<ck::half_t, false, true, 128>(
+extern template void run_grouped_forward_causalmask_bias_dispatch<ck::half_t, false, true, 128>(
     GroupedForwardParams& param, hipStream_t stream);
-extern template void run_grouped_forward_causalmask_attnbias_dispatched<ck::half_t, false, false, 128>(
+extern template void run_grouped_forward_causalmask_bias_dispatch<ck::half_t, false, false, 128>(
     GroupedForwardParams& param, hipStream_t stream);
-extern template void run_grouped_forward_causalmask_attnbias_dispatched<ck::half_t, true, true, 128>(
+extern template void run_grouped_forward_causalmask_bias_dispatch<ck::half_t, true, true, 128>(
     GroupedForwardParams& param, hipStream_t stream);
-extern template void run_grouped_forward_causalmask_attnbias_dispatched<ck::half_t, true, false, 128>(
+extern template void run_grouped_forward_causalmask_bias_dispatch<ck::half_t, true, false, 128>(
     GroupedForwardParams& param, hipStream_t stream);
 
-extern template void run_grouped_forward_causalmask_attnbias_dispatched<ck::half_t, false, true, 256>(
+extern template void run_grouped_forward_causalmask_bias_dispatch<ck::half_t, false, true, 256>(
     GroupedForwardParams& param, hipStream_t stream);
-extern template void run_grouped_forward_causalmask_attnbias_dispatched<ck::half_t, false, false, 256>(
+extern template void run_grouped_forward_causalmask_bias_dispatch<ck::half_t, false, false, 256>(
     GroupedForwardParams& param, hipStream_t stream);
-extern template void run_grouped_forward_causalmask_attnbias_dispatched<ck::half_t, true, true, 256>(
+extern template void run_grouped_forward_causalmask_bias_dispatch<ck::half_t, true, true, 256>(
     GroupedForwardParams& param, hipStream_t stream);
-extern template void run_grouped_forward_causalmask_attnbias_dispatched<ck::half_t, true, false, 256>(
+extern template void run_grouped_forward_causalmask_bias_dispatch<ck::half_t, true, false, 256>(
     GroupedForwardParams& param, hipStream_t stream);
 // clang-format on
 
@@ -54,13 +54,13 @@ void grouped_forward_fp16(GroupedForwardParams& param, hipStream_t stream) {
   BOOL_SWITCH(param.has_attn_bias, kHasBias, [&] {
     FMHA_FWD_HEADDIM_SWITCH(param.K, param.Kv, MaxK, [&] {
       if (param.custom_mask_type == 0)
-        run_grouped_forward_causalmask_attnbias_dispatched<
+        run_grouped_forward_causalmask_bias_dispatch<
             ck::half_t,
             false,
             kHasBias,
             MaxK>(param, stream);
       else if (param.custom_mask_type == 1 || param.custom_mask_type == 2)
-        run_grouped_forward_causalmask_attnbias_dispatched<
+        run_grouped_forward_causalmask_bias_dispatch<
             ck::half_t,
             true,
             kHasBias,
diff --git a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_infer.h b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_infer.h
index 53e70420c..0d976de97 100644
--- a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_infer.h
+++ b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_infer.h
@@ -33,7 +33,7 @@ template <
     bool kHasCausalMask,
     bool kHasBias,
     ck::index_t MaxK>
-struct grouped_infer_causalmask_attnbias_dispatched {
+struct grouped_infer_causalmask_bias_dispatch {
   template <typename FmhaTraits, typename FmhaMask>
   using FmhaPipelineProblemTemp =
       ck::tile_program::block::BlockFmhaPipelineProblem<
@@ -180,10 +180,10 @@ template <
     bool kHasCausalMask,
     bool kHasBias,
     ck::index_t MaxK>
-void run_grouped_infer_causalmask_attnbias_dispatched(
+void run_grouped_infer_causalmask_bias_dispatch(
     GroupedForwardParams& param,
     hipStream_t stream) {
-  grouped_infer_causalmask_attnbias_dispatched<
+  grouped_infer_causalmask_bias_dispatch<
       ScalarType,
       kHasCausalMask,
       kHasBias,
diff --git a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_infer_bp16.cpp b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_infer_bp16.cpp
index ccb7e0e6f..c76c6e6f8 100644
--- a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_infer_bp16.cpp
+++ b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_infer_bp16.cpp
@@ -12,40 +12,40 @@
 #include "ck_tiled_fmha_grouped_infer.h"
 
 // clang-format off
-extern template void run_grouped_infer_causalmask_attnbias_dispatched<ck::bhalf_t, false, true, 32>(
+extern template void run_grouped_infer_causalmask_bias_dispatch<ck::bhalf_t, false, true, 32>(
     GroupedForwardParams& param, hipStream_t stream);
-extern template void run_grouped_infer_causalmask_attnbias_dispatched<ck::bhalf_t, false, false, 32>(
+extern template void run_grouped_infer_causalmask_bias_dispatch<ck::bhalf_t, false, false, 32>(
     GroupedForwardParams& param, hipStream_t stream);
-extern template void run_grouped_infer_causalmask_attnbias_dispatched<ck::bhalf_t, true, true, 32>(
+extern template void run_grouped_infer_causalmask_bias_dispatch<ck::bhalf_t, true, true, 32>(
     GroupedForwardParams& param, hipStream_t stream);
-extern template void run_grouped_infer_causalmask_attnbias_dispatched<ck::bhalf_t, true, false, 32>(
+extern template void run_grouped_infer_causalmask_bias_dispatch<ck::bhalf_t, true, false, 32>(
     GroupedForwardParams& param, hipStream_t stream);
 
-extern template void run_grouped_infer_causalmask_attnbias_dispatched<ck::bhalf_t, false, true, 64>(
+extern template void run_grouped_infer_causalmask_bias_dispatch<ck::bhalf_t, false, true, 64>(
     GroupedForwardParams& param, hipStream_t stream);
-extern template void run_grouped_infer_causalmask_attnbias_dispatched<ck::bhalf_t, false, false, 64>(
+extern template void run_grouped_infer_causalmask_bias_dispatch<ck::bhalf_t, false, false, 64>(
     GroupedForwardParams& param, hipStream_t stream);
-extern template void run_grouped_infer_causalmask_attnbias_dispatched<ck::bhalf_t, true, true, 64>(
+extern template void run_grouped_infer_causalmask_bias_dispatch<ck::bhalf_t, true, true, 64>(
     GroupedForwardParams& param, hipStream_t stream);
-extern template void run_grouped_infer_causalmask_attnbias_dispatched<ck::bhalf_t, true, false, 64>(
+extern template void run_grouped_infer_causalmask_bias_dispatch<ck::bhalf_t, true, false, 64>(
     GroupedForwardParams& param, hipStream_t stream);
 
-extern template void run_grouped_infer_causalmask_attnbias_dispatched<ck::bhalf_t, false, true, 128>(
+extern template void run_grouped_infer_causalmask_bias_dispatch<ck::bhalf_t, false, true, 128>(
     GroupedForwardParams& param, hipStream_t stream);
-extern template void run_grouped_infer_causalmask_attnbias_dispatched<ck::bhalf_t, false, false, 128>(
+extern template void run_grouped_infer_causalmask_bias_dispatch<ck::bhalf_t, false, false, 128>(
     GroupedForwardParams& param, hipStream_t stream);
-extern template void run_grouped_infer_causalmask_attnbias_dispatched<ck::bhalf_t, true, true, 128>(
+extern template void run_grouped_infer_causalmask_bias_dispatch<ck::bhalf_t, true, true, 128>(
     GroupedForwardParams& param, hipStream_t stream);
-extern template void run_grouped_infer_causalmask_attnbias_dispatched<ck::bhalf_t, true, false, 128>(
+extern template void run_grouped_infer_causalmask_bias_dispatch<ck::bhalf_t, true, false, 128>(
     GroupedForwardParams& param, hipStream_t stream);
 
-extern template void run_grouped_infer_causalmask_attnbias_dispatched<ck::bhalf_t, false, true, 256>(
+extern template void run_grouped_infer_causalmask_bias_dispatch<ck::bhalf_t, false, true, 256>(
     GroupedForwardParams& param, hipStream_t stream);
-extern template void run_grouped_infer_causalmask_attnbias_dispatched<ck::bhalf_t, false, false, 256>(
+extern template void run_grouped_infer_causalmask_bias_dispatch<ck::bhalf_t, false, false, 256>(
     GroupedForwardParams& param, hipStream_t stream);
-extern template void run_grouped_infer_causalmask_attnbias_dispatched<ck::bhalf_t, true, true, 256>(
+extern template void run_grouped_infer_causalmask_bias_dispatch<ck::bhalf_t, true, true, 256>(
     GroupedForwardParams& param, hipStream_t stream);
-extern template void run_grouped_infer_causalmask_attnbias_dispatched<ck::bhalf_t, true, false, 256>(
+extern template void run_grouped_infer_causalmask_bias_dispatch<ck::bhalf_t, true, false, 256>(
     GroupedForwardParams& param, hipStream_t stream);
 // clang-format on
 
@@ -53,13 +53,13 @@ void grouped_infer_bp16(GroupedForwardParams& param, hipStream_t stream) {
   BOOL_SWITCH(param.has_attn_bias, kHasBias, [&] {
     FMHA_FWD_HEADDIM_SWITCH(param.K, param.Kv, MaxK, [&] {
       if (param.custom_mask_type == 0)
-        run_grouped_infer_causalmask_attnbias_dispatched<
+        run_grouped_infer_causalmask_bias_dispatch<
             ck::bhalf_t,
             false,
             kHasBias,
             MaxK>(param, stream);
       else if (param.custom_mask_type == 1 || param.custom_mask_type == 2)
-        run_grouped_infer_causalmask_attnbias_dispatched<
+        run_grouped_infer_causalmask_bias_dispatch<
             ck::bhalf_t,
             true,
             kHasBias,
diff --git a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_infer_fp16.cpp b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_infer_fp16.cpp
index 881810868..4e4a1c101 100644
--- a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_infer_fp16.cpp
+++ b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_infer_fp16.cpp
@@ -12,40 +12,40 @@
 #include "ck_tiled_fmha_grouped_infer.h"
 
 // clang-format off
-extern template void run_grouped_infer_causalmask_attnbias_dispatched<ck::half_t, false, true, 32>(
+extern template void run_grouped_infer_causalmask_bias_dispatch<ck::half_t, false, true, 32>(
     GroupedForwardParams& param, hipStream_t stream);
-extern template void run_grouped_infer_causalmask_attnbias_dispatched<ck::half_t, false, false, 32>(
+extern template void run_grouped_infer_causalmask_bias_dispatch<ck::half_t, false, false, 32>(
     GroupedForwardParams& param, hipStream_t stream);
-extern template void run_grouped_infer_causalmask_attnbias_dispatched<ck::half_t, true, true, 32>(
+extern template void run_grouped_infer_causalmask_bias_dispatch<ck::half_t, true, true, 32>(
     GroupedForwardParams& param, hipStream_t stream);
-extern template void run_grouped_infer_causalmask_attnbias_dispatched<ck::half_t, true, false, 32>(
+extern template void run_grouped_infer_causalmask_bias_dispatch<ck::half_t, true, false, 32>(
     GroupedForwardParams& param, hipStream_t stream);
 
-extern template void run_grouped_infer_causalmask_attnbias_dispatched<ck::half_t, false, true, 64>(
+extern template void run_grouped_infer_causalmask_bias_dispatch<ck::half_t, false, true, 64>(
     GroupedForwardParams& param, hipStream_t stream);
-extern template void run_grouped_infer_causalmask_attnbias_dispatched<ck::half_t, false, false, 64>(
+extern template void run_grouped_infer_causalmask_bias_dispatch<ck::half_t, false, false, 64>(
     GroupedForwardParams& param, hipStream_t stream);
-extern template void run_grouped_infer_causalmask_attnbias_dispatched<ck::half_t, true, true, 64>(
+extern template void run_grouped_infer_causalmask_bias_dispatch<ck::half_t, true, true, 64>(
     GroupedForwardParams& param, hipStream_t stream);
-extern template void run_grouped_infer_causalmask_attnbias_dispatched<ck::half_t, true, false, 64>(
+extern template void run_grouped_infer_causalmask_bias_dispatch<ck::half_t, true, false, 64>(
     GroupedForwardParams& param, hipStream_t stream);
 
-extern template void run_grouped_infer_causalmask_attnbias_dispatched<ck::half_t, false, true, 128>(
+extern template void run_grouped_infer_causalmask_bias_dispatch<ck::half_t, false, true, 128>(
     GroupedForwardParams& param, hipStream_t stream);
-extern template void run_grouped_infer_causalmask_attnbias_dispatched<ck::half_t, false, false, 128>(
+extern template void run_grouped_infer_causalmask_bias_dispatch<ck::half_t, false, false, 128>(
     GroupedForwardParams& param, hipStream_t stream);
-extern template void run_grouped_infer_causalmask_attnbias_dispatched<ck::half_t, true, true, 128>(
+extern template void run_grouped_infer_causalmask_bias_dispatch<ck::half_t, true, true, 128>(
     GroupedForwardParams& param, hipStream_t stream);
-extern template void run_grouped_infer_causalmask_attnbias_dispatched<ck::half_t, true, false, 128>(
+extern template void run_grouped_infer_causalmask_bias_dispatch<ck::half_t, true, false, 128>(
     GroupedForwardParams& param, hipStream_t stream);
 
-extern template void run_grouped_infer_causalmask_attnbias_dispatched<ck::half_t, false, true, 256>(
+extern template void run_grouped_infer_causalmask_bias_dispatch<ck::half_t, false, true, 256>(
     GroupedForwardParams& param, hipStream_t stream);
-extern template void run_grouped_infer_causalmask_attnbias_dispatched<ck::half_t, false, false, 256>(
+extern template void run_grouped_infer_causalmask_bias_dispatch<ck::half_t, false, false, 256>(
     GroupedForwardParams& param, hipStream_t stream);
-extern template void run_grouped_infer_causalmask_attnbias_dispatched<ck::half_t, true, true, 256>(
+extern template void run_grouped_infer_causalmask_bias_dispatch<ck::half_t, true, true, 256>(
     GroupedForwardParams& param, hipStream_t stream);
-extern template void run_grouped_infer_causalmask_attnbias_dispatched<ck::half_t, true, false, 256>(
+extern template void run_grouped_infer_causalmask_bias_dispatch<ck::half_t, true, false, 256>(
     GroupedForwardParams& param, hipStream_t stream);
 // clang-format on
 
@@ -53,13 +53,13 @@ void grouped_infer_fp16(GroupedForwardParams& param, hipStream_t stream) {
   BOOL_SWITCH(param.has_attn_bias, kHasBias, [&] {
     FMHA_FWD_HEADDIM_SWITCH(param.K, param.Kv, MaxK, [&] {
       if (param.custom_mask_type == 0)
-        run_grouped_infer_causalmask_attnbias_dispatched<
+        run_grouped_infer_causalmask_bias_dispatch<
             ck::half_t,
             false,
             kHasBias,
             MaxK>(param, stream);
       else if (param.custom_mask_type == 1 || param.custom_mask_type == 2)
-        run_grouped_infer_causalmask_attnbias_dispatched<
+        run_grouped_infer_causalmask_bias_dispatch<
             ck::half_t,
             true,
             kHasBias,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_has_causalmask_has_attnbias_has_biasgrad_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_has_causalmask_has_attnbias_has_biasgrad_maxk_128.cpp
index 23dcdbd74..f6bf4bd6f 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_has_causalmask_has_attnbias_has_biasgrad_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_has_causalmask_has_attnbias_has_biasgrad_maxk_128.cpp
@@ -8,7 +8,7 @@
 
 #include "ck_tiled_fmha_batched_backward.h"
 
-template void run_batched_backward_causalmask_attnbias_dispatched<
+template void run_batched_backward_causalmask_bias_dispatch<
     ck::bhalf_t,
     true,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_has_causalmask_has_attnbias_has_biasgrad_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_has_causalmask_has_attnbias_has_biasgrad_maxk_32.cpp
index cea2dc49f..0514bf28a 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_has_causalmask_has_attnbias_has_biasgrad_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_has_causalmask_has_attnbias_has_biasgrad_maxk_32.cpp
@@ -8,7 +8,7 @@
 
 #include "ck_tiled_fmha_batched_backward.h"
 
-template void run_batched_backward_causalmask_attnbias_dispatched<
+template void run_batched_backward_causalmask_bias_dispatch<
     ck::bhalf_t,
     true,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_has_causalmask_has_attnbias_has_biasgrad_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_has_causalmask_has_attnbias_has_biasgrad_maxk_64.cpp
index ebf213e77..ee19b37de 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_has_causalmask_has_attnbias_has_biasgrad_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_has_causalmask_has_attnbias_has_biasgrad_maxk_64.cpp
@@ -8,7 +8,7 @@
 
 #include "ck_tiled_fmha_batched_backward.h"
 
-template void run_batched_backward_causalmask_attnbias_dispatched<
+template void run_batched_backward_causalmask_bias_dispatch<
     ck::bhalf_t,
     true,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_has_causalmask_has_attnbias_no_biasgrad_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_has_causalmask_has_attnbias_no_biasgrad_maxk_128.cpp
index 4154b0e51..8ab4f4229 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_has_causalmask_has_attnbias_no_biasgrad_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_has_causalmask_has_attnbias_no_biasgrad_maxk_128.cpp
@@ -8,7 +8,7 @@
 
 #include "ck_tiled_fmha_batched_backward.h"
 
-template void run_batched_backward_causalmask_attnbias_dispatched<
+template void run_batched_backward_causalmask_bias_dispatch<
     ck::bhalf_t,
     true,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_has_causalmask_has_attnbias_no_biasgrad_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_has_causalmask_has_attnbias_no_biasgrad_maxk_32.cpp
index c6ef4a6ad..75966fb73 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_has_causalmask_has_attnbias_no_biasgrad_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_has_causalmask_has_attnbias_no_biasgrad_maxk_32.cpp
@@ -8,7 +8,7 @@
 
 #include "ck_tiled_fmha_batched_backward.h"
 
-template void run_batched_backward_causalmask_attnbias_dispatched<
+template void run_batched_backward_causalmask_bias_dispatch<
     ck::bhalf_t,
     true,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_has_causalmask_has_attnbias_no_biasgrad_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_has_causalmask_has_attnbias_no_biasgrad_maxk_64.cpp
index 5ea0440a9..07dc496fd 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_has_causalmask_has_attnbias_no_biasgrad_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_has_causalmask_has_attnbias_no_biasgrad_maxk_64.cpp
@@ -8,7 +8,7 @@
 
 #include "ck_tiled_fmha_batched_backward.h"
 
-template void run_batched_backward_causalmask_attnbias_dispatched<
+template void run_batched_backward_causalmask_bias_dispatch<
     ck::bhalf_t,
     true,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_has_causalmask_no_attnbias_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_has_causalmask_no_attnbias_maxk_128.cpp
index 390c057a2..736256e63 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_has_causalmask_no_attnbias_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_has_causalmask_no_attnbias_maxk_128.cpp
@@ -8,7 +8,7 @@
 
 #include "ck_tiled_fmha_batched_backward.h"
 
-template void run_batched_backward_causalmask_attnbias_dispatched<
+template void run_batched_backward_causalmask_bias_dispatch<
     ck::bhalf_t,
     true,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_has_causalmask_no_attnbias_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_has_causalmask_no_attnbias_maxk_32.cpp
index 6d9e8db05..c44a2f99e 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_has_causalmask_no_attnbias_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_has_causalmask_no_attnbias_maxk_32.cpp
@@ -8,7 +8,7 @@
 
 #include "ck_tiled_fmha_batched_backward.h"
 
-template void run_batched_backward_causalmask_attnbias_dispatched<
+template void run_batched_backward_causalmask_bias_dispatch<
     ck::bhalf_t,
     true,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_has_causalmask_no_attnbias_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_has_causalmask_no_attnbias_maxk_64.cpp
index f37923f72..3d9272061 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_has_causalmask_no_attnbias_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_has_causalmask_no_attnbias_maxk_64.cpp
@@ -8,7 +8,7 @@
 
 #include "ck_tiled_fmha_batched_backward.h"
 
-template void run_batched_backward_causalmask_attnbias_dispatched<
+template void run_batched_backward_causalmask_bias_dispatch<
     ck::bhalf_t,
     true,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_no_causalmask_has_attnbias_has_biasgrad_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_no_causalmask_has_attnbias_has_biasgrad_maxk_128.cpp
index 410a00133..484d96a41 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_no_causalmask_has_attnbias_has_biasgrad_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_no_causalmask_has_attnbias_has_biasgrad_maxk_128.cpp
@@ -8,7 +8,7 @@
 
 #include "ck_tiled_fmha_batched_backward.h"
 
-template void run_batched_backward_causalmask_attnbias_dispatched<
+template void run_batched_backward_causalmask_bias_dispatch<
     ck::bhalf_t,
     false,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_no_causalmask_has_attnbias_has_biasgrad_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_no_causalmask_has_attnbias_has_biasgrad_maxk_32.cpp
index 0eb83776e..8f22808ad 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_no_causalmask_has_attnbias_has_biasgrad_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_no_causalmask_has_attnbias_has_biasgrad_maxk_32.cpp
@@ -8,7 +8,7 @@
 
 #include "ck_tiled_fmha_batched_backward.h"
 
-template void run_batched_backward_causalmask_attnbias_dispatched<
+template void run_batched_backward_causalmask_bias_dispatch<
     ck::bhalf_t,
     false,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_no_causalmask_has_attnbias_has_biasgrad_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_no_causalmask_has_attnbias_has_biasgrad_maxk_64.cpp
index 30a9d3e06..e173fd0cb 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_no_causalmask_has_attnbias_has_biasgrad_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_no_causalmask_has_attnbias_has_biasgrad_maxk_64.cpp
@@ -8,7 +8,7 @@
 
 #include "ck_tiled_fmha_batched_backward.h"
 
-template void run_batched_backward_causalmask_attnbias_dispatched<
+template void run_batched_backward_causalmask_bias_dispatch<
     ck::bhalf_t,
     false,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_no_causalmask_has_attnbias_no_biasgrad_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_no_causalmask_has_attnbias_no_biasgrad_maxk_128.cpp
index 2dc4036ab..395d187a7 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_no_causalmask_has_attnbias_no_biasgrad_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_no_causalmask_has_attnbias_no_biasgrad_maxk_128.cpp
@@ -8,7 +8,7 @@
 
 #include "ck_tiled_fmha_batched_backward.h"
 
-template void run_batched_backward_causalmask_attnbias_dispatched<
+template void run_batched_backward_causalmask_bias_dispatch<
     ck::bhalf_t,
     false,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_no_causalmask_has_attnbias_no_biasgrad_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_no_causalmask_has_attnbias_no_biasgrad_maxk_32.cpp
index b634ec861..89a5c0624 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_no_causalmask_has_attnbias_no_biasgrad_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_no_causalmask_has_attnbias_no_biasgrad_maxk_32.cpp
@@ -8,7 +8,7 @@
 
 #include "ck_tiled_fmha_batched_backward.h"
 
-template void run_batched_backward_causalmask_attnbias_dispatched<
+template void run_batched_backward_causalmask_bias_dispatch<
     ck::bhalf_t,
     false,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_no_causalmask_has_attnbias_no_biasgrad_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_no_causalmask_has_attnbias_no_biasgrad_maxk_64.cpp
index 572667e05..09f17fb59 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_no_causalmask_has_attnbias_no_biasgrad_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_no_causalmask_has_attnbias_no_biasgrad_maxk_64.cpp
@@ -8,7 +8,7 @@
 
 #include "ck_tiled_fmha_batched_backward.h"
 
-template void run_batched_backward_causalmask_attnbias_dispatched<
+template void run_batched_backward_causalmask_bias_dispatch<
     ck::bhalf_t,
     false,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_no_causalmask_no_attnbias_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_no_causalmask_no_attnbias_maxk_128.cpp
index fd19dba04..11023b667 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_no_causalmask_no_attnbias_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_no_causalmask_no_attnbias_maxk_128.cpp
@@ -8,7 +8,7 @@
 
 #include "ck_tiled_fmha_batched_backward.h"
 
-template void run_batched_backward_causalmask_attnbias_dispatched<
+template void run_batched_backward_causalmask_bias_dispatch<
     ck::bhalf_t,
     false,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_no_causalmask_no_attnbias_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_no_causalmask_no_attnbias_maxk_32.cpp
index 2abde7a13..1ca23fead 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_no_causalmask_no_attnbias_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_no_causalmask_no_attnbias_maxk_32.cpp
@@ -8,7 +8,7 @@
 
 #include "ck_tiled_fmha_batched_backward.h"
 
-template void run_batched_backward_causalmask_attnbias_dispatched<
+template void run_batched_backward_causalmask_bias_dispatch<
     ck::bhalf_t,
     false,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_no_causalmask_no_attnbias_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_no_causalmask_no_attnbias_maxk_64.cpp
index 392e0df61..f71dedaaf 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_no_causalmask_no_attnbias_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_no_causalmask_no_attnbias_maxk_64.cpp
@@ -8,7 +8,7 @@
 
 #include "ck_tiled_fmha_batched_backward.h"
 
-template void run_batched_backward_causalmask_attnbias_dispatched<
+template void run_batched_backward_causalmask_bias_dispatch<
     ck::bhalf_t,
     false,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_has_attnbias_has_biasgrad_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_has_attnbias_has_biasgrad_maxk_128.cpp
index 3d03144e1..cb146d6c5 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_has_attnbias_has_biasgrad_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_has_attnbias_has_biasgrad_maxk_128.cpp
@@ -8,7 +8,7 @@
 
 #include "ck_tiled_fmha_batched_backward.h"
 
-template void run_batched_backward_causalmask_attnbias_dispatched<
+template void run_batched_backward_causalmask_bias_dispatch<
     ck::half_t,
     true,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_has_attnbias_has_biasgrad_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_has_attnbias_has_biasgrad_maxk_32.cpp
index 130922e0d..32b7d5373 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_has_attnbias_has_biasgrad_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_has_attnbias_has_biasgrad_maxk_32.cpp
@@ -8,7 +8,7 @@
 
 #include "ck_tiled_fmha_batched_backward.h"
 
-template void run_batched_backward_causalmask_attnbias_dispatched<
+template void run_batched_backward_causalmask_bias_dispatch<
     ck::half_t,
     true,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_has_attnbias_has_biasgrad_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_has_attnbias_has_biasgrad_maxk_64.cpp
index 974fe1752..42e57c6a8 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_has_attnbias_has_biasgrad_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_has_attnbias_has_biasgrad_maxk_64.cpp
@@ -8,7 +8,7 @@
 
 #include "ck_tiled_fmha_batched_backward.h"
 
-template void run_batched_backward_causalmask_attnbias_dispatched<
+template void run_batched_backward_causalmask_bias_dispatch<
     ck::half_t,
     true,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_has_attnbias_no_biasgrad_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_has_attnbias_no_biasgrad_maxk_128.cpp
index b611084db..442263f0c 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_has_attnbias_no_biasgrad_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_has_attnbias_no_biasgrad_maxk_128.cpp
@@ -8,7 +8,7 @@
 
 #include "ck_tiled_fmha_batched_backward.h"
 
-template void run_batched_backward_causalmask_attnbias_dispatched<
+template void run_batched_backward_causalmask_bias_dispatch<
     ck::half_t,
     true,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_has_attnbias_no_biasgrad_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_has_attnbias_no_biasgrad_maxk_32.cpp
index a0156e2c4..9d20c01c5 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_has_attnbias_no_biasgrad_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_has_attnbias_no_biasgrad_maxk_32.cpp
@@ -8,7 +8,7 @@
 
 #include "ck_tiled_fmha_batched_backward.h"
 
-template void run_batched_backward_causalmask_attnbias_dispatched<
+template void run_batched_backward_causalmask_bias_dispatch<
     ck::half_t,
     true,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_has_attnbias_no_biasgrad_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_has_attnbias_no_biasgrad_maxk_64.cpp
index 2685736f4..95d62e3da 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_has_attnbias_no_biasgrad_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_has_attnbias_no_biasgrad_maxk_64.cpp
@@ -8,7 +8,7 @@
 
 #include "ck_tiled_fmha_batched_backward.h"
 
-template void run_batched_backward_causalmask_attnbias_dispatched<
+template void run_batched_backward_causalmask_bias_dispatch<
     ck::half_t,
     true,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_no_attnbias_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_no_attnbias_maxk_128.cpp
index b2b0d96f9..074f41cdb 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_no_attnbias_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_no_attnbias_maxk_128.cpp
@@ -8,7 +8,7 @@
 
 #include "ck_tiled_fmha_batched_backward.h"
 
-template void run_batched_backward_causalmask_attnbias_dispatched<
+template void run_batched_backward_causalmask_bias_dispatch<
     ck::half_t,
     true,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_no_attnbias_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_no_attnbias_maxk_32.cpp
index 4b63b34e4..cea3242f4 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_no_attnbias_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_no_attnbias_maxk_32.cpp
@@ -8,7 +8,7 @@
 
 #include "ck_tiled_fmha_batched_backward.h"
 
-template void run_batched_backward_causalmask_attnbias_dispatched<
+template void run_batched_backward_causalmask_bias_dispatch<
     ck::half_t,
     true,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_no_attnbias_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_no_attnbias_maxk_64.cpp
index c7e2c84b3..50687e28b 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_no_attnbias_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_no_attnbias_maxk_64.cpp
@@ -8,7 +8,7 @@
 
 #include "ck_tiled_fmha_batched_backward.h"
 
-template void run_batched_backward_causalmask_attnbias_dispatched<
+template void run_batched_backward_causalmask_bias_dispatch<
     ck::half_t,
     true,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_has_attnbias_has_biasgrad_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_has_attnbias_has_biasgrad_maxk_128.cpp
index d6e30d22a..94477c6a6 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_has_attnbias_has_biasgrad_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_has_attnbias_has_biasgrad_maxk_128.cpp
@@ -8,7 +8,7 @@
 
 #include "ck_tiled_fmha_batched_backward.h"
 
-template void run_batched_backward_causalmask_attnbias_dispatched<
+template void run_batched_backward_causalmask_bias_dispatch<
     ck::half_t,
     false,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_has_attnbias_has_biasgrad_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_has_attnbias_has_biasgrad_maxk_32.cpp
index f46573924..2dc072271 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_has_attnbias_has_biasgrad_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_has_attnbias_has_biasgrad_maxk_32.cpp
@@ -8,7 +8,7 @@
 
 #include "ck_tiled_fmha_batched_backward.h"
 
-template void run_batched_backward_causalmask_attnbias_dispatched<
+template void run_batched_backward_causalmask_bias_dispatch<
     ck::half_t,
     false,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_has_attnbias_has_biasgrad_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_has_attnbias_has_biasgrad_maxk_64.cpp
index fc7974038..abb6f7933 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_has_attnbias_has_biasgrad_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_has_attnbias_has_biasgrad_maxk_64.cpp
@@ -8,7 +8,7 @@
 
 #include "ck_tiled_fmha_batched_backward.h"
 
-template void run_batched_backward_causalmask_attnbias_dispatched<
+template void run_batched_backward_causalmask_bias_dispatch<
     ck::half_t,
     false,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_has_attnbias_no_biasgrad_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_has_attnbias_no_biasgrad_maxk_128.cpp
index 0d8369353..3f2b9ddec 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_has_attnbias_no_biasgrad_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_has_attnbias_no_biasgrad_maxk_128.cpp
@@ -8,7 +8,7 @@
 
 #include "ck_tiled_fmha_batched_backward.h"
 
-template void run_batched_backward_causalmask_attnbias_dispatched<
+template void run_batched_backward_causalmask_bias_dispatch<
     ck::half_t,
     false,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_has_attnbias_no_biasgrad_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_has_attnbias_no_biasgrad_maxk_32.cpp
index 043d4357c..77395133b 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_has_attnbias_no_biasgrad_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_has_attnbias_no_biasgrad_maxk_32.cpp
@@ -8,7 +8,7 @@
 
 #include "ck_tiled_fmha_batched_backward.h"
 
-template void run_batched_backward_causalmask_attnbias_dispatched<
+template void run_batched_backward_causalmask_bias_dispatch<
     ck::half_t,
     false,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_has_attnbias_no_biasgrad_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_has_attnbias_no_biasgrad_maxk_64.cpp
index 48013f08d..1bb5433e9 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_has_attnbias_no_biasgrad_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_has_attnbias_no_biasgrad_maxk_64.cpp
@@ -8,7 +8,7 @@
 
 #include "ck_tiled_fmha_batched_backward.h"
 
-template void run_batched_backward_causalmask_attnbias_dispatched<
+template void run_batched_backward_causalmask_bias_dispatch<
     ck::half_t,
     false,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_no_attnbias_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_no_attnbias_maxk_128.cpp
index ad1018234..f0e4e22af 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_no_attnbias_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_no_attnbias_maxk_128.cpp
@@ -8,7 +8,7 @@
 
 #include "ck_tiled_fmha_batched_backward.h"
 
-template void run_batched_backward_causalmask_attnbias_dispatched<
+template void run_batched_backward_causalmask_bias_dispatch<
     ck::half_t,
     false,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_no_attnbias_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_no_attnbias_maxk_32.cpp
index ed71783b8..fc49a0182 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_no_attnbias_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_no_attnbias_maxk_32.cpp
@@ -8,7 +8,7 @@
 
 #include "ck_tiled_fmha_batched_backward.h"
 
-template void run_batched_backward_causalmask_attnbias_dispatched<
+template void run_batched_backward_causalmask_bias_dispatch<
     ck::half_t,
     false,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_no_attnbias_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_no_attnbias_maxk_64.cpp
index 35bb6ac5f..8deba9920 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_no_attnbias_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_no_attnbias_maxk_64.cpp
@@ -8,7 +8,7 @@
 
 #include "ck_tiled_fmha_batched_backward.h"
 
-template void run_batched_backward_causalmask_attnbias_dispatched<
+template void run_batched_backward_causalmask_bias_dispatch<
     ck::half_t,
     false,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_has_causalmask_has_attnbias_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_has_causalmask_has_attnbias_maxk_128.cpp
index 6ea24c5ca..0e2eced3c 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_has_causalmask_has_attnbias_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_has_causalmask_has_attnbias_maxk_128.cpp
@@ -8,7 +8,7 @@
 
 #include "ck_tiled_fmha_batched_forward.h"
 
-template void run_batched_forward_causalmask_attnbias_dispatched<
+template void run_batched_forward_causalmask_bias_dispatch<
     ck::bhalf_t,
     true,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_has_causalmask_has_attnbias_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_has_causalmask_has_attnbias_maxk_256.cpp
index a675c95be..0ee352e55 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_has_causalmask_has_attnbias_maxk_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_has_causalmask_has_attnbias_maxk_256.cpp
@@ -8,7 +8,7 @@
 
 #include "ck_tiled_fmha_batched_forward.h"
 
-template void run_batched_forward_causalmask_attnbias_dispatched<
+template void run_batched_forward_causalmask_bias_dispatch<
     ck::bhalf_t,
     true,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_has_causalmask_has_attnbias_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_has_causalmask_has_attnbias_maxk_32.cpp
index dc4bb0ea0..3ce3f2fd3 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_has_causalmask_has_attnbias_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_has_causalmask_has_attnbias_maxk_32.cpp
@@ -8,7 +8,7 @@
 
 #include "ck_tiled_fmha_batched_forward.h"
 
-template void run_batched_forward_causalmask_attnbias_dispatched<
+template void run_batched_forward_causalmask_bias_dispatch<
     ck::bhalf_t,
     true,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_has_causalmask_has_attnbias_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_has_causalmask_has_attnbias_maxk_64.cpp
index 334eb891f..11674e05d 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_has_causalmask_has_attnbias_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_has_causalmask_has_attnbias_maxk_64.cpp
@@ -8,7 +8,7 @@
 
 #include "ck_tiled_fmha_batched_forward.h"
 
-template void run_batched_forward_causalmask_attnbias_dispatched<
+template void run_batched_forward_causalmask_bias_dispatch<
     ck::bhalf_t,
     true,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_has_causalmask_no_attnbias_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_has_causalmask_no_attnbias_maxk_128.cpp
index 59c6550f4..51996f5f8 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_has_causalmask_no_attnbias_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_has_causalmask_no_attnbias_maxk_128.cpp
@@ -8,7 +8,7 @@
 
 #include "ck_tiled_fmha_batched_forward.h"
 
-template void run_batched_forward_causalmask_attnbias_dispatched<
+template void run_batched_forward_causalmask_bias_dispatch<
     ck::bhalf_t,
     true,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_has_causalmask_no_attnbias_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_has_causalmask_no_attnbias_maxk_256.cpp
index a30775e77..078bfe33f 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_has_causalmask_no_attnbias_maxk_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_has_causalmask_no_attnbias_maxk_256.cpp
@@ -8,7 +8,7 @@
 
 #include "ck_tiled_fmha_batched_forward.h"
 
-template void run_batched_forward_causalmask_attnbias_dispatched<
+template void run_batched_forward_causalmask_bias_dispatch<
     ck::bhalf_t,
     true,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_has_causalmask_no_attnbias_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_has_causalmask_no_attnbias_maxk_32.cpp
index 594c4a68c..c6c070287 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_has_causalmask_no_attnbias_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_has_causalmask_no_attnbias_maxk_32.cpp
@@ -8,7 +8,7 @@
 
 #include "ck_tiled_fmha_batched_forward.h"
 
-template void run_batched_forward_causalmask_attnbias_dispatched<
+template void run_batched_forward_causalmask_bias_dispatch<
     ck::bhalf_t,
     true,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_has_causalmask_no_attnbias_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_has_causalmask_no_attnbias_maxk_64.cpp
index 39ea42913..235c706f3 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_has_causalmask_no_attnbias_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_has_causalmask_no_attnbias_maxk_64.cpp
@@ -8,7 +8,7 @@
 
 #include "ck_tiled_fmha_batched_forward.h"
 
-template void run_batched_forward_causalmask_attnbias_dispatched<
+template void run_batched_forward_causalmask_bias_dispatch<
     ck::bhalf_t,
     true,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_no_causalmask_has_attnbias_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_no_causalmask_has_attnbias_maxk_128.cpp
index ed91bf4bf..99f9d3dc4 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_no_causalmask_has_attnbias_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_no_causalmask_has_attnbias_maxk_128.cpp
@@ -8,7 +8,7 @@
 
 #include "ck_tiled_fmha_batched_forward.h"
 
-template void run_batched_forward_causalmask_attnbias_dispatched<
+template void run_batched_forward_causalmask_bias_dispatch<
     ck::bhalf_t,
     false,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_no_causalmask_has_attnbias_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_no_causalmask_has_attnbias_maxk_256.cpp
index eca859229..2edf0c9b7 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_no_causalmask_has_attnbias_maxk_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_no_causalmask_has_attnbias_maxk_256.cpp
@@ -8,7 +8,7 @@
 
 #include "ck_tiled_fmha_batched_forward.h"
 
-template void run_batched_forward_causalmask_attnbias_dispatched<
+template void run_batched_forward_causalmask_bias_dispatch<
     ck::bhalf_t,
     false,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_no_causalmask_has_attnbias_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_no_causalmask_has_attnbias_maxk_32.cpp
index ec258aeda..00e19f71d 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_no_causalmask_has_attnbias_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_no_causalmask_has_attnbias_maxk_32.cpp
@@ -8,7 +8,7 @@
 
 #include "ck_tiled_fmha_batched_forward.h"
 
-template void run_batched_forward_causalmask_attnbias_dispatched<
+template void run_batched_forward_causalmask_bias_dispatch<
     ck::bhalf_t,
     false,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_no_causalmask_has_attnbias_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_no_causalmask_has_attnbias_maxk_64.cpp
index feb78a115..529837a27 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_no_causalmask_has_attnbias_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_no_causalmask_has_attnbias_maxk_64.cpp
@@ -8,7 +8,7 @@
 
 #include "ck_tiled_fmha_batched_forward.h"
 
-template void run_batched_forward_causalmask_attnbias_dispatched<
+template void run_batched_forward_causalmask_bias_dispatch<
     ck::bhalf_t,
     false,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_no_causalmask_no_attnbias_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_no_causalmask_no_attnbias_maxk_128.cpp
index 1482336ab..a7aec2f1a 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_no_causalmask_no_attnbias_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_no_causalmask_no_attnbias_maxk_128.cpp
@@ -8,7 +8,7 @@
 
 #include "ck_tiled_fmha_batched_forward.h"
 
-template void run_batched_forward_causalmask_attnbias_dispatched<
+template void run_batched_forward_causalmask_bias_dispatch<
     ck::bhalf_t,
     false,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_no_causalmask_no_attnbias_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_no_causalmask_no_attnbias_maxk_256.cpp
index f1ba383da..d99707cf3 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_no_causalmask_no_attnbias_maxk_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_no_causalmask_no_attnbias_maxk_256.cpp
@@ -8,7 +8,7 @@
 
 #include "ck_tiled_fmha_batched_forward.h"
 
-template void run_batched_forward_causalmask_attnbias_dispatched<
+template void run_batched_forward_causalmask_bias_dispatch<
     ck::bhalf_t,
     false,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_no_causalmask_no_attnbias_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_no_causalmask_no_attnbias_maxk_32.cpp
index 3b9f3026b..f723ed872 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_no_causalmask_no_attnbias_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_no_causalmask_no_attnbias_maxk_32.cpp
@@ -8,7 +8,7 @@
 
 #include "ck_tiled_fmha_batched_forward.h"
 
-template void run_batched_forward_causalmask_attnbias_dispatched<
+template void run_batched_forward_causalmask_bias_dispatch<
     ck::bhalf_t,
     false,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_no_causalmask_no_attnbias_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_no_causalmask_no_attnbias_maxk_64.cpp
index c38716ce2..5d0095c4f 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_no_causalmask_no_attnbias_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_no_causalmask_no_attnbias_maxk_64.cpp
@@ -8,7 +8,7 @@
 
 #include "ck_tiled_fmha_batched_forward.h"
 
-template void run_batched_forward_causalmask_attnbias_dispatched<
+template void run_batched_forward_causalmask_bias_dispatch<
     ck::bhalf_t,
     false,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_has_causalmask_has_attnbias_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_has_causalmask_has_attnbias_maxk_128.cpp
index 58013ca64..c8b985564 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_has_causalmask_has_attnbias_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_has_causalmask_has_attnbias_maxk_128.cpp
@@ -8,7 +8,7 @@
 
 #include "ck_tiled_fmha_batched_forward.h"
 
-template void run_batched_forward_causalmask_attnbias_dispatched<
+template void run_batched_forward_causalmask_bias_dispatch<
     ck::half_t,
     true,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_has_causalmask_has_attnbias_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_has_causalmask_has_attnbias_maxk_256.cpp
index fcb6d8b54..e0beb8f59 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_has_causalmask_has_attnbias_maxk_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_has_causalmask_has_attnbias_maxk_256.cpp
@@ -8,7 +8,7 @@
 
 #include "ck_tiled_fmha_batched_forward.h"
 
-template void run_batched_forward_causalmask_attnbias_dispatched<
+template void run_batched_forward_causalmask_bias_dispatch<
     ck::half_t,
     true,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_has_causalmask_has_attnbias_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_has_causalmask_has_attnbias_maxk_32.cpp
index 38e7fb026..a58be730c 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_has_causalmask_has_attnbias_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_has_causalmask_has_attnbias_maxk_32.cpp
@@ -8,7 +8,7 @@
 
 #include "ck_tiled_fmha_batched_forward.h"
 
-template void run_batched_forward_causalmask_attnbias_dispatched<
+template void run_batched_forward_causalmask_bias_dispatch<
     ck::half_t,
     true,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_has_causalmask_has_attnbias_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_has_causalmask_has_attnbias_maxk_64.cpp
index 1c0b277b7..5ef660d35 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_has_causalmask_has_attnbias_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_has_causalmask_has_attnbias_maxk_64.cpp
@@ -8,7 +8,7 @@
 
 #include "ck_tiled_fmha_batched_forward.h"
 
-template void run_batched_forward_causalmask_attnbias_dispatched<
+template void run_batched_forward_causalmask_bias_dispatch<
     ck::half_t,
     true,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_has_causalmask_no_attnbias_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_has_causalmask_no_attnbias_maxk_128.cpp
index 070ed44ef..c12bcafdb 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_has_causalmask_no_attnbias_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_has_causalmask_no_attnbias_maxk_128.cpp
@@ -8,7 +8,7 @@
 
 #include "ck_tiled_fmha_batched_forward.h"
 
-template void run_batched_forward_causalmask_attnbias_dispatched<
+template void run_batched_forward_causalmask_bias_dispatch<
     ck::half_t,
     true,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_has_causalmask_no_attnbias_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_has_causalmask_no_attnbias_maxk_256.cpp
index e535f40f3..00aacf534 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_has_causalmask_no_attnbias_maxk_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_has_causalmask_no_attnbias_maxk_256.cpp
@@ -8,7 +8,7 @@
 
 #include "ck_tiled_fmha_batched_forward.h"
 
-template void run_batched_forward_causalmask_attnbias_dispatched<
+template void run_batched_forward_causalmask_bias_dispatch<
     ck::half_t,
     true,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_has_causalmask_no_attnbias_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_has_causalmask_no_attnbias_maxk_32.cpp
index a24884bff..9e2963e42 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_has_causalmask_no_attnbias_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_has_causalmask_no_attnbias_maxk_32.cpp
@@ -8,7 +8,7 @@
 
 #include "ck_tiled_fmha_batched_forward.h"
 
-template void run_batched_forward_causalmask_attnbias_dispatched<
+template void run_batched_forward_causalmask_bias_dispatch<
     ck::half_t,
     true,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_has_causalmask_no_attnbias_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_has_causalmask_no_attnbias_maxk_64.cpp
index 524e1ab86..93972071b 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_has_causalmask_no_attnbias_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_has_causalmask_no_attnbias_maxk_64.cpp
@@ -8,7 +8,7 @@
 
 #include "ck_tiled_fmha_batched_forward.h"
 
-template void run_batched_forward_causalmask_attnbias_dispatched<
+template void run_batched_forward_causalmask_bias_dispatch<
     ck::half_t,
     true,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_no_causalmask_has_attnbias_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_no_causalmask_has_attnbias_maxk_128.cpp
index c2c124dbe..3c6aa04c5 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_no_causalmask_has_attnbias_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_no_causalmask_has_attnbias_maxk_128.cpp
@@ -8,7 +8,7 @@
 
 #include "ck_tiled_fmha_batched_forward.h"
 
-template void run_batched_forward_causalmask_attnbias_dispatched<
+template void run_batched_forward_causalmask_bias_dispatch<
     ck::half_t,
     false,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_no_causalmask_has_attnbias_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_no_causalmask_has_attnbias_maxk_256.cpp
index 1cdd7e078..bb1126829 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_no_causalmask_has_attnbias_maxk_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_no_causalmask_has_attnbias_maxk_256.cpp
@@ -8,7 +8,7 @@
 
 #include "ck_tiled_fmha_batched_forward.h"
 
-template void run_batched_forward_causalmask_attnbias_dispatched<
+template void run_batched_forward_causalmask_bias_dispatch<
     ck::half_t,
     false,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_no_causalmask_has_attnbias_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_no_causalmask_has_attnbias_maxk_32.cpp
index 50ea22659..6911476cb 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_no_causalmask_has_attnbias_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_no_causalmask_has_attnbias_maxk_32.cpp
@@ -8,7 +8,7 @@
 
 #include "ck_tiled_fmha_batched_forward.h"
 
-template void run_batched_forward_causalmask_attnbias_dispatched<
+template void run_batched_forward_causalmask_bias_dispatch<
     ck::half_t,
     false,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_no_causalmask_has_attnbias_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_no_causalmask_has_attnbias_maxk_64.cpp
index 58ac17e39..f9aaf8a71 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_no_causalmask_has_attnbias_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_no_causalmask_has_attnbias_maxk_64.cpp
@@ -8,7 +8,7 @@
 
 #include "ck_tiled_fmha_batched_forward.h"
 
-template void run_batched_forward_causalmask_attnbias_dispatched<
+template void run_batched_forward_causalmask_bias_dispatch<
     ck::half_t,
     false,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_no_causalmask_no_attnbias_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_no_causalmask_no_attnbias_maxk_128.cpp
index 606d9db86..c1d701e6e 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_no_causalmask_no_attnbias_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_no_causalmask_no_attnbias_maxk_128.cpp
@@ -8,7 +8,7 @@
 
 #include "ck_tiled_fmha_batched_forward.h"
 
-template void run_batched_forward_causalmask_attnbias_dispatched<
+template void run_batched_forward_causalmask_bias_dispatch<
     ck::half_t,
     false,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_no_causalmask_no_attnbias_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_no_causalmask_no_attnbias_maxk_256.cpp
index 7dc799605..01435a301 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_no_causalmask_no_attnbias_maxk_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_no_causalmask_no_attnbias_maxk_256.cpp
@@ -8,7 +8,7 @@
 
 #include "ck_tiled_fmha_batched_forward.h"
 
-template void run_batched_forward_causalmask_attnbias_dispatched<
+template void run_batched_forward_causalmask_bias_dispatch<
     ck::half_t,
     false,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_no_causalmask_no_attnbias_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_no_causalmask_no_attnbias_maxk_32.cpp
index 566b1bf6a..e499377ba 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_no_causalmask_no_attnbias_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_no_causalmask_no_attnbias_maxk_32.cpp
@@ -8,7 +8,7 @@
 
 #include "ck_tiled_fmha_batched_forward.h"
 
-template void run_batched_forward_causalmask_attnbias_dispatched<
+template void run_batched_forward_causalmask_bias_dispatch<
     ck::half_t,
     false,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_no_causalmask_no_attnbias_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_no_causalmask_no_attnbias_maxk_64.cpp
index 3b72b97d1..8cf6fe551 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_no_causalmask_no_attnbias_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_no_causalmask_no_attnbias_maxk_64.cpp
@@ -8,7 +8,7 @@
 
 #include "ck_tiled_fmha_batched_forward.h"
 
-template void run_batched_forward_causalmask_attnbias_dispatched<
+template void run_batched_forward_causalmask_bias_dispatch<
     ck::half_t,
     false,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_has_causalmask_has_attnbias_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_has_causalmask_has_attnbias_maxk_128.cpp
index ecc90b366..e5fab05c4 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_has_causalmask_has_attnbias_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_has_causalmask_has_attnbias_maxk_128.cpp
@@ -8,7 +8,7 @@
 
 #include "ck_tiled_fmha_batched_infer.h"
 
-template void run_batched_infer_causalmask_attnbias_dispatched<
+template void run_batched_infer_causalmask_bias_dispatch<
     ck::bhalf_t,
     true,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_has_causalmask_has_attnbias_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_has_causalmask_has_attnbias_maxk_256.cpp
index dff3a317a..a3c8f6bca 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_has_causalmask_has_attnbias_maxk_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_has_causalmask_has_attnbias_maxk_256.cpp
@@ -8,7 +8,7 @@
 
 #include "ck_tiled_fmha_batched_infer.h"
 
-template void run_batched_infer_causalmask_attnbias_dispatched<
+template void run_batched_infer_causalmask_bias_dispatch<
     ck::bhalf_t,
     true,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_has_causalmask_has_attnbias_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_has_causalmask_has_attnbias_maxk_32.cpp
index fa084941b..3fc855dbb 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_has_causalmask_has_attnbias_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_has_causalmask_has_attnbias_maxk_32.cpp
@@ -8,7 +8,7 @@
 
 #include "ck_tiled_fmha_batched_infer.h"
 
-template void run_batched_infer_causalmask_attnbias_dispatched<
+template void run_batched_infer_causalmask_bias_dispatch<
     ck::bhalf_t,
     true,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_has_causalmask_has_attnbias_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_has_causalmask_has_attnbias_maxk_64.cpp
index d0ece69d0..5573d58b7 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_has_causalmask_has_attnbias_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_has_causalmask_has_attnbias_maxk_64.cpp
@@ -8,7 +8,7 @@
 
 #include "ck_tiled_fmha_batched_infer.h"
 
-template void run_batched_infer_causalmask_attnbias_dispatched<
+template void run_batched_infer_causalmask_bias_dispatch<
     ck::bhalf_t,
     true,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_has_causalmask_no_attnbias_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_has_causalmask_no_attnbias_maxk_128.cpp
index 9757278db..87f5c89ff 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_has_causalmask_no_attnbias_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_has_causalmask_no_attnbias_maxk_128.cpp
@@ -8,7 +8,7 @@
 
 #include "ck_tiled_fmha_batched_infer.h"
 
-template void run_batched_infer_causalmask_attnbias_dispatched<
+template void run_batched_infer_causalmask_bias_dispatch<
     ck::bhalf_t,
     true,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_has_causalmask_no_attnbias_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_has_causalmask_no_attnbias_maxk_256.cpp
index 6caed9563..3935893ca 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_has_causalmask_no_attnbias_maxk_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_has_causalmask_no_attnbias_maxk_256.cpp
@@ -8,7 +8,7 @@
 
 #include "ck_tiled_fmha_batched_infer.h"
 
-template void run_batched_infer_causalmask_attnbias_dispatched<
+template void run_batched_infer_causalmask_bias_dispatch<
     ck::bhalf_t,
     true,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_has_causalmask_no_attnbias_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_has_causalmask_no_attnbias_maxk_32.cpp
index 4dfaa3678..b4a4a9fa7 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_has_causalmask_no_attnbias_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_has_causalmask_no_attnbias_maxk_32.cpp
@@ -8,7 +8,7 @@
 
 #include "ck_tiled_fmha_batched_infer.h"
 
-template void run_batched_infer_causalmask_attnbias_dispatched<
+template void run_batched_infer_causalmask_bias_dispatch<
     ck::bhalf_t,
     true,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_has_causalmask_no_attnbias_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_has_causalmask_no_attnbias_maxk_64.cpp
index fa0416c5c..e05151545 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_has_causalmask_no_attnbias_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_has_causalmask_no_attnbias_maxk_64.cpp
@@ -8,7 +8,7 @@
 
 #include "ck_tiled_fmha_batched_infer.h"
 
-template void run_batched_infer_causalmask_attnbias_dispatched<
+template void run_batched_infer_causalmask_bias_dispatch<
     ck::bhalf_t,
     true,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_no_causalmask_has_attnbias_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_no_causalmask_has_attnbias_maxk_128.cpp
index 4772d56ab..2f7ff1124 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_no_causalmask_has_attnbias_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_no_causalmask_has_attnbias_maxk_128.cpp
@@ -8,7 +8,7 @@
 
 #include "ck_tiled_fmha_batched_infer.h"
 
-template void run_batched_infer_causalmask_attnbias_dispatched<
+template void run_batched_infer_causalmask_bias_dispatch<
     ck::bhalf_t,
     false,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_no_causalmask_has_attnbias_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_no_causalmask_has_attnbias_maxk_256.cpp
index b95f0d5ae..a17c6fabb 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_no_causalmask_has_attnbias_maxk_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_no_causalmask_has_attnbias_maxk_256.cpp
@@ -8,7 +8,7 @@
 
 #include "ck_tiled_fmha_batched_infer.h"
 
-template void run_batched_infer_causalmask_attnbias_dispatched<
+template void run_batched_infer_causalmask_bias_dispatch<
     ck::bhalf_t,
     false,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_no_causalmask_has_attnbias_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_no_causalmask_has_attnbias_maxk_32.cpp
index 7fe7a3f69..d4021ed8d 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_no_causalmask_has_attnbias_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_no_causalmask_has_attnbias_maxk_32.cpp
@@ -8,7 +8,7 @@
 
 #include "ck_tiled_fmha_batched_infer.h"
 
-template void run_batched_infer_causalmask_attnbias_dispatched<
+template void run_batched_infer_causalmask_bias_dispatch<
     ck::bhalf_t,
     false,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_no_causalmask_has_attnbias_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_no_causalmask_has_attnbias_maxk_64.cpp
index 3ae773369..035923269 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_no_causalmask_has_attnbias_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_no_causalmask_has_attnbias_maxk_64.cpp
@@ -8,7 +8,7 @@
 
 #include "ck_tiled_fmha_batched_infer.h"
 
-template void run_batched_infer_causalmask_attnbias_dispatched<
+template void run_batched_infer_causalmask_bias_dispatch<
     ck::bhalf_t,
     false,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_no_causalmask_no_attnbias_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_no_causalmask_no_attnbias_maxk_128.cpp
index b95c3fdb9..9251bccc5 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_no_causalmask_no_attnbias_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_no_causalmask_no_attnbias_maxk_128.cpp
@@ -8,7 +8,7 @@
 
 #include "ck_tiled_fmha_batched_infer.h"
 
-template void run_batched_infer_causalmask_attnbias_dispatched<
+template void run_batched_infer_causalmask_bias_dispatch<
     ck::bhalf_t,
     false,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_no_causalmask_no_attnbias_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_no_causalmask_no_attnbias_maxk_256.cpp
index dce1496ea..e113097a6 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_no_causalmask_no_attnbias_maxk_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_no_causalmask_no_attnbias_maxk_256.cpp
@@ -8,7 +8,7 @@
 
 #include "ck_tiled_fmha_batched_infer.h"
 
-template void run_batched_infer_causalmask_attnbias_dispatched<
+template void run_batched_infer_causalmask_bias_dispatch<
     ck::bhalf_t,
     false,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_no_causalmask_no_attnbias_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_no_causalmask_no_attnbias_maxk_32.cpp
index fa81f80c1..0241586ac 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_no_causalmask_no_attnbias_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_no_causalmask_no_attnbias_maxk_32.cpp
@@ -8,7 +8,7 @@
 
 #include "ck_tiled_fmha_batched_infer.h"
 
-template void run_batched_infer_causalmask_attnbias_dispatched<
+template void run_batched_infer_causalmask_bias_dispatch<
     ck::bhalf_t,
     false,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_no_causalmask_no_attnbias_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_no_causalmask_no_attnbias_maxk_64.cpp
index fd118cd22..290d6b145 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_no_causalmask_no_attnbias_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_no_causalmask_no_attnbias_maxk_64.cpp
@@ -8,7 +8,7 @@
 
 #include "ck_tiled_fmha_batched_infer.h"
 
-template void run_batched_infer_causalmask_attnbias_dispatched<
+template void run_batched_infer_causalmask_bias_dispatch<
     ck::bhalf_t,
     false,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_has_causalmask_has_attnbias_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_has_causalmask_has_attnbias_maxk_128.cpp
index 1ae833e7d..5aba53e37 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_has_causalmask_has_attnbias_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_has_causalmask_has_attnbias_maxk_128.cpp
@@ -8,7 +8,7 @@
 
 #include "ck_tiled_fmha_batched_infer.h"
 
-template void run_batched_infer_causalmask_attnbias_dispatched<
+template void run_batched_infer_causalmask_bias_dispatch<
     ck::half_t,
     true,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_has_causalmask_has_attnbias_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_has_causalmask_has_attnbias_maxk_256.cpp
index bb9a177b5..0d653b4e5 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_has_causalmask_has_attnbias_maxk_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_has_causalmask_has_attnbias_maxk_256.cpp
@@ -8,7 +8,7 @@
 
 #include "ck_tiled_fmha_batched_infer.h"
 
-template void run_batched_infer_causalmask_attnbias_dispatched<
+template void run_batched_infer_causalmask_bias_dispatch<
     ck::half_t,
     true,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_has_causalmask_has_attnbias_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_has_causalmask_has_attnbias_maxk_32.cpp
index 88945231f..657708501 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_has_causalmask_has_attnbias_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_has_causalmask_has_attnbias_maxk_32.cpp
@@ -8,7 +8,7 @@
 
 #include "ck_tiled_fmha_batched_infer.h"
 
-template void run_batched_infer_causalmask_attnbias_dispatched<
+template void run_batched_infer_causalmask_bias_dispatch<
     ck::half_t,
     true,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_has_causalmask_has_attnbias_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_has_causalmask_has_attnbias_maxk_64.cpp
index 330e0dfbc..666488a9f 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_has_causalmask_has_attnbias_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_has_causalmask_has_attnbias_maxk_64.cpp
@@ -8,7 +8,7 @@
 
 #include "ck_tiled_fmha_batched_infer.h"
 
-template void run_batched_infer_causalmask_attnbias_dispatched<
+template void run_batched_infer_causalmask_bias_dispatch<
     ck::half_t,
     true,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_has_causalmask_no_attnbias_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_has_causalmask_no_attnbias_maxk_128.cpp
index 8caa116d8..47d1f4e51 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_has_causalmask_no_attnbias_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_has_causalmask_no_attnbias_maxk_128.cpp
@@ -8,7 +8,7 @@
 
 #include "ck_tiled_fmha_batched_infer.h"
 
-template void run_batched_infer_causalmask_attnbias_dispatched<
+template void run_batched_infer_causalmask_bias_dispatch<
     ck::half_t,
     true,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_has_causalmask_no_attnbias_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_has_causalmask_no_attnbias_maxk_256.cpp
index 0468ba8af..2c1779293 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_has_causalmask_no_attnbias_maxk_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_has_causalmask_no_attnbias_maxk_256.cpp
@@ -8,7 +8,7 @@
 
 #include "ck_tiled_fmha_batched_infer.h"
 
-template void run_batched_infer_causalmask_attnbias_dispatched<
+template void run_batched_infer_causalmask_bias_dispatch<
     ck::half_t,
     true,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_has_causalmask_no_attnbias_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_has_causalmask_no_attnbias_maxk_32.cpp
index cd8077b51..90138a271 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_has_causalmask_no_attnbias_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_has_causalmask_no_attnbias_maxk_32.cpp
@@ -8,7 +8,7 @@
 
 #include "ck_tiled_fmha_batched_infer.h"
 
-template void run_batched_infer_causalmask_attnbias_dispatched<
+template void run_batched_infer_causalmask_bias_dispatch<
     ck::half_t,
     true,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_has_causalmask_no_attnbias_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_has_causalmask_no_attnbias_maxk_64.cpp
index ed22d8fc5..10396a224 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_has_causalmask_no_attnbias_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_has_causalmask_no_attnbias_maxk_64.cpp
@@ -8,7 +8,7 @@
 
 #include "ck_tiled_fmha_batched_infer.h"
 
-template void run_batched_infer_causalmask_attnbias_dispatched<
+template void run_batched_infer_causalmask_bias_dispatch<
     ck::half_t,
     true,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_no_causalmask_has_attnbias_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_no_causalmask_has_attnbias_maxk_128.cpp
index 2f16639ed..21d46d793 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_no_causalmask_has_attnbias_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_no_causalmask_has_attnbias_maxk_128.cpp
@@ -8,7 +8,7 @@
 
 #include "ck_tiled_fmha_batched_infer.h"
 
-template void run_batched_infer_causalmask_attnbias_dispatched<
+template void run_batched_infer_causalmask_bias_dispatch<
     ck::half_t,
     false,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_no_causalmask_has_attnbias_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_no_causalmask_has_attnbias_maxk_256.cpp
index 41f8249e9..14d14ce8c 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_no_causalmask_has_attnbias_maxk_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_no_causalmask_has_attnbias_maxk_256.cpp
@@ -8,7 +8,7 @@
 
 #include "ck_tiled_fmha_batched_infer.h"
 
-template void run_batched_infer_causalmask_attnbias_dispatched<
+template void run_batched_infer_causalmask_bias_dispatch<
     ck::half_t,
     false,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_no_causalmask_has_attnbias_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_no_causalmask_has_attnbias_maxk_32.cpp
index bfdf01423..85eed5de9 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_no_causalmask_has_attnbias_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_no_causalmask_has_attnbias_maxk_32.cpp
@@ -8,7 +8,7 @@
 
 #include "ck_tiled_fmha_batched_infer.h"
 
-template void run_batched_infer_causalmask_attnbias_dispatched<
+template void run_batched_infer_causalmask_bias_dispatch<
     ck::half_t,
     false,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_no_causalmask_has_attnbias_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_no_causalmask_has_attnbias_maxk_64.cpp
index 550831036..00de9f3ee 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_no_causalmask_has_attnbias_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_no_causalmask_has_attnbias_maxk_64.cpp
@@ -8,7 +8,7 @@
 
 #include "ck_tiled_fmha_batched_infer.h"
 
-template void run_batched_infer_causalmask_attnbias_dispatched<
+template void run_batched_infer_causalmask_bias_dispatch<
     ck::half_t,
     false,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_no_causalmask_no_attnbias_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_no_causalmask_no_attnbias_maxk_128.cpp
index 8e9843a5e..42edd42a5 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_no_causalmask_no_attnbias_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_no_causalmask_no_attnbias_maxk_128.cpp
@@ -8,7 +8,7 @@
 
 #include "ck_tiled_fmha_batched_infer.h"
 
-template void run_batched_infer_causalmask_attnbias_dispatched<
+template void run_batched_infer_causalmask_bias_dispatch<
     ck::half_t,
     false,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_no_causalmask_no_attnbias_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_no_causalmask_no_attnbias_maxk_256.cpp
index 20580c11e..078a28eb7 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_no_causalmask_no_attnbias_maxk_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_no_causalmask_no_attnbias_maxk_256.cpp
@@ -8,7 +8,7 @@
 
 #include "ck_tiled_fmha_batched_infer.h"
 
-template void run_batched_infer_causalmask_attnbias_dispatched<
+template void run_batched_infer_causalmask_bias_dispatch<
     ck::half_t,
     false,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_no_causalmask_no_attnbias_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_no_causalmask_no_attnbias_maxk_32.cpp
index 4e4d90f82..f3791d766 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_no_causalmask_no_attnbias_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_no_causalmask_no_attnbias_maxk_32.cpp
@@ -8,7 +8,7 @@
 
 #include "ck_tiled_fmha_batched_infer.h"
 
-template void run_batched_infer_causalmask_attnbias_dispatched<
+template void run_batched_infer_causalmask_bias_dispatch<
     ck::half_t,
     false,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_no_causalmask_no_attnbias_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_no_causalmask_no_attnbias_maxk_64.cpp
index b36864534..23b8796ce 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_no_causalmask_no_attnbias_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_no_causalmask_no_attnbias_maxk_64.cpp
@@ -8,7 +8,7 @@
 
 #include "ck_tiled_fmha_batched_infer.h"
 
-template void run_batched_infer_causalmask_attnbias_dispatched<
+template void run_batched_infer_causalmask_bias_dispatch<
     ck::half_t,
     false,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_has_causalmask_has_attnbias_has_biasgrad_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_has_causalmask_has_attnbias_has_biasgrad_maxk_128.cpp
index ccf93c6eb..06974cabb 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_has_causalmask_has_attnbias_has_biasgrad_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_has_causalmask_has_attnbias_has_biasgrad_maxk_128.cpp
@@ -8,7 +8,7 @@
 
 #include "ck_tiled_fmha_grouped_backward.h"
 
-template void run_grouped_backward_causalmask_attnbias_dispatched<
+template void run_grouped_backward_causalmask_bias_dispatch<
     ck::bhalf_t,
     true,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_has_causalmask_has_attnbias_has_biasgrad_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_has_causalmask_has_attnbias_has_biasgrad_maxk_32.cpp
index 571012eba..7bc1dafae 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_has_causalmask_has_attnbias_has_biasgrad_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_has_causalmask_has_attnbias_has_biasgrad_maxk_32.cpp
@@ -8,7 +8,7 @@
 
 #include "ck_tiled_fmha_grouped_backward.h"
 
-template void run_grouped_backward_causalmask_attnbias_dispatched<
+template void run_grouped_backward_causalmask_bias_dispatch<
     ck::bhalf_t,
     true,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_has_causalmask_has_attnbias_has_biasgrad_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_has_causalmask_has_attnbias_has_biasgrad_maxk_64.cpp
index 7f4c7a6c0..e08c2d2a0 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_has_causalmask_has_attnbias_has_biasgrad_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_has_causalmask_has_attnbias_has_biasgrad_maxk_64.cpp
@@ -8,7 +8,7 @@
 
 #include "ck_tiled_fmha_grouped_backward.h"
 
-template void run_grouped_backward_causalmask_attnbias_dispatched<
+template void run_grouped_backward_causalmask_bias_dispatch<
     ck::bhalf_t,
     true,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_has_causalmask_has_attnbias_no_biasgrad_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_has_causalmask_has_attnbias_no_biasgrad_maxk_128.cpp
index 1b045b39b..7f745b005 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_has_causalmask_has_attnbias_no_biasgrad_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_has_causalmask_has_attnbias_no_biasgrad_maxk_128.cpp
@@ -8,7 +8,7 @@
 
 #include "ck_tiled_fmha_grouped_backward.h"
 
-template void run_grouped_backward_causalmask_attnbias_dispatched<
+template void run_grouped_backward_causalmask_bias_dispatch<
     ck::bhalf_t,
     true,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_has_causalmask_has_attnbias_no_biasgrad_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_has_causalmask_has_attnbias_no_biasgrad_maxk_32.cpp
index 68bb20d86..8bdecd02e 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_has_causalmask_has_attnbias_no_biasgrad_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_has_causalmask_has_attnbias_no_biasgrad_maxk_32.cpp
@@ -8,7 +8,7 @@
 
 #include "ck_tiled_fmha_grouped_backward.h"
 
-template void run_grouped_backward_causalmask_attnbias_dispatched<
+template void run_grouped_backward_causalmask_bias_dispatch<
     ck::bhalf_t,
     true,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_has_causalmask_has_attnbias_no_biasgrad_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_has_causalmask_has_attnbias_no_biasgrad_maxk_64.cpp
index 6fab84344..b68b7f0f1 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_has_causalmask_has_attnbias_no_biasgrad_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_has_causalmask_has_attnbias_no_biasgrad_maxk_64.cpp
@@ -8,7 +8,7 @@
 
 #include "ck_tiled_fmha_grouped_backward.h"
 
-template void run_grouped_backward_causalmask_attnbias_dispatched<
+template void run_grouped_backward_causalmask_bias_dispatch<
     ck::bhalf_t,
     true,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_has_causalmask_no_attnbias_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_has_causalmask_no_attnbias_maxk_128.cpp
index 26146e7b9..bd728f967 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_has_causalmask_no_attnbias_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_has_causalmask_no_attnbias_maxk_128.cpp
@@ -8,7 +8,7 @@
 
 #include "ck_tiled_fmha_grouped_backward.h"
 
-template void run_grouped_backward_causalmask_attnbias_dispatched<
+template void run_grouped_backward_causalmask_bias_dispatch<
     ck::bhalf_t,
     true,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_has_causalmask_no_attnbias_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_has_causalmask_no_attnbias_maxk_32.cpp
index eec45177f..1daa01062 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_has_causalmask_no_attnbias_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_has_causalmask_no_attnbias_maxk_32.cpp
@@ -8,7 +8,7 @@
 
 #include "ck_tiled_fmha_grouped_backward.h"
 
-template void run_grouped_backward_causalmask_attnbias_dispatched<
+template void run_grouped_backward_causalmask_bias_dispatch<
     ck::bhalf_t,
     true,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_has_causalmask_no_attnbias_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_has_causalmask_no_attnbias_maxk_64.cpp
index f55ada6a4..42f675373 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_has_causalmask_no_attnbias_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_has_causalmask_no_attnbias_maxk_64.cpp
@@ -8,7 +8,7 @@
 
 #include "ck_tiled_fmha_grouped_backward.h"
 
-template void run_grouped_backward_causalmask_attnbias_dispatched<
+template void run_grouped_backward_causalmask_bias_dispatch<
     ck::bhalf_t,
     true,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_no_causalmask_has_attnbias_has_biasgrad_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_no_causalmask_has_attnbias_has_biasgrad_maxk_128.cpp
index 9ab625aed..fcd672d9e 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_no_causalmask_has_attnbias_has_biasgrad_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_no_causalmask_has_attnbias_has_biasgrad_maxk_128.cpp
@@ -8,7 +8,7 @@
 
 #include "ck_tiled_fmha_grouped_backward.h"
 
-template void run_grouped_backward_causalmask_attnbias_dispatched<
+template void run_grouped_backward_causalmask_bias_dispatch<
     ck::bhalf_t,
     false,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_no_causalmask_has_attnbias_has_biasgrad_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_no_causalmask_has_attnbias_has_biasgrad_maxk_32.cpp
index a8a3c66fd..18151b2ce 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_no_causalmask_has_attnbias_has_biasgrad_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_no_causalmask_has_attnbias_has_biasgrad_maxk_32.cpp
@@ -8,7 +8,7 @@
 
 #include "ck_tiled_fmha_grouped_backward.h"
 
-template void run_grouped_backward_causalmask_attnbias_dispatched<
+template void run_grouped_backward_causalmask_bias_dispatch<
     ck::bhalf_t,
     false,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_no_causalmask_has_attnbias_has_biasgrad_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_no_causalmask_has_attnbias_has_biasgrad_maxk_64.cpp
index 29ec58440..f7f164720 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_no_causalmask_has_attnbias_has_biasgrad_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_no_causalmask_has_attnbias_has_biasgrad_maxk_64.cpp
@@ -8,7 +8,7 @@
 
 #include "ck_tiled_fmha_grouped_backward.h"
 
-template void run_grouped_backward_causalmask_attnbias_dispatched<
+template void run_grouped_backward_causalmask_bias_dispatch<
     ck::bhalf_t,
     false,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_no_causalmask_has_attnbias_no_biasgrad_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_no_causalmask_has_attnbias_no_biasgrad_maxk_128.cpp
index a703e7b1b..4c81b91d1 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_no_causalmask_has_attnbias_no_biasgrad_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_no_causalmask_has_attnbias_no_biasgrad_maxk_128.cpp
@@ -8,7 +8,7 @@
 
 #include "ck_tiled_fmha_grouped_backward.h"
 
-template void run_grouped_backward_causalmask_attnbias_dispatched<
+template void run_grouped_backward_causalmask_bias_dispatch<
     ck::bhalf_t,
     false,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_no_causalmask_has_attnbias_no_biasgrad_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_no_causalmask_has_attnbias_no_biasgrad_maxk_32.cpp
index a57d05f37..4ea3986a5 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_no_causalmask_has_attnbias_no_biasgrad_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_no_causalmask_has_attnbias_no_biasgrad_maxk_32.cpp
@@ -8,7 +8,7 @@
 
 #include "ck_tiled_fmha_grouped_backward.h"
 
-template void run_grouped_backward_causalmask_attnbias_dispatched<
+template void run_grouped_backward_causalmask_bias_dispatch<
     ck::bhalf_t,
     false,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_no_causalmask_has_attnbias_no_biasgrad_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_no_causalmask_has_attnbias_no_biasgrad_maxk_64.cpp
index 4dd74235e..67caf36b2 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_no_causalmask_has_attnbias_no_biasgrad_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_no_causalmask_has_attnbias_no_biasgrad_maxk_64.cpp
@@ -8,7 +8,7 @@
 
 #include "ck_tiled_fmha_grouped_backward.h"
 
-template void run_grouped_backward_causalmask_attnbias_dispatched<
+template void run_grouped_backward_causalmask_bias_dispatch<
     ck::bhalf_t,
     false,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_no_causalmask_no_attnbias_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_no_causalmask_no_attnbias_maxk_128.cpp
index 7e92e2be5..44e53a806 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_no_causalmask_no_attnbias_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_no_causalmask_no_attnbias_maxk_128.cpp
@@ -8,7 +8,7 @@
 
 #include "ck_tiled_fmha_grouped_backward.h"
 
-template void run_grouped_backward_causalmask_attnbias_dispatched<
+template void run_grouped_backward_causalmask_bias_dispatch<
     ck::bhalf_t,
     false,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_no_causalmask_no_attnbias_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_no_causalmask_no_attnbias_maxk_32.cpp
index 27e119c5c..9034115fe 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_no_causalmask_no_attnbias_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_no_causalmask_no_attnbias_maxk_32.cpp
@@ -8,7 +8,7 @@
 
 #include "ck_tiled_fmha_grouped_backward.h"
 
-template void run_grouped_backward_causalmask_attnbias_dispatched<
+template void run_grouped_backward_causalmask_bias_dispatch<
     ck::bhalf_t,
     false,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_no_causalmask_no_attnbias_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_no_causalmask_no_attnbias_maxk_64.cpp
index b2149eafb..25e2ba32a 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_no_causalmask_no_attnbias_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_no_causalmask_no_attnbias_maxk_64.cpp
@@ -8,7 +8,7 @@
 
 #include "ck_tiled_fmha_grouped_backward.h"
 
-template void run_grouped_backward_causalmask_attnbias_dispatched<
+template void run_grouped_backward_causalmask_bias_dispatch<
     ck::bhalf_t,
     false,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_has_attnbias_has_biasgrad_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_has_attnbias_has_biasgrad_maxk_128.cpp
index d32f76ef3..fb50648f4 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_has_attnbias_has_biasgrad_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_has_attnbias_has_biasgrad_maxk_128.cpp
@@ -8,7 +8,7 @@
 
 #include "ck_tiled_fmha_grouped_backward.h"
 
-template void run_grouped_backward_causalmask_attnbias_dispatched<
+template void run_grouped_backward_causalmask_bias_dispatch<
     ck::half_t,
     true,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_has_attnbias_has_biasgrad_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_has_attnbias_has_biasgrad_maxk_32.cpp
index b3cf3fa5c..a3e58ba19 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_has_attnbias_has_biasgrad_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_has_attnbias_has_biasgrad_maxk_32.cpp
@@ -8,7 +8,7 @@
 
 #include "ck_tiled_fmha_grouped_backward.h"
 
-template void run_grouped_backward_causalmask_attnbias_dispatched<
+template void run_grouped_backward_causalmask_bias_dispatch<
     ck::half_t,
     true,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_has_attnbias_has_biasgrad_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_has_attnbias_has_biasgrad_maxk_64.cpp
index 6b6fe1383..445f59fb5 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_has_attnbias_has_biasgrad_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_has_attnbias_has_biasgrad_maxk_64.cpp
@@ -8,7 +8,7 @@
 
 #include "ck_tiled_fmha_grouped_backward.h"
 
-template void run_grouped_backward_causalmask_attnbias_dispatched<
+template void run_grouped_backward_causalmask_bias_dispatch<
     ck::half_t,
     true,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_has_attnbias_no_biasgrad_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_has_attnbias_no_biasgrad_maxk_128.cpp
index a082bcb80..0e6209988 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_has_attnbias_no_biasgrad_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_has_attnbias_no_biasgrad_maxk_128.cpp
@@ -8,7 +8,7 @@
 
 #include "ck_tiled_fmha_grouped_backward.h"
 
-template void run_grouped_backward_causalmask_attnbias_dispatched<
+template void run_grouped_backward_causalmask_bias_dispatch<
     ck::half_t,
     true,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_has_attnbias_no_biasgrad_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_has_attnbias_no_biasgrad_maxk_32.cpp
index 59165bbe8..01d441c5f 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_has_attnbias_no_biasgrad_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_has_attnbias_no_biasgrad_maxk_32.cpp
@@ -8,7 +8,7 @@
 
 #include "ck_tiled_fmha_grouped_backward.h"
 
-template void run_grouped_backward_causalmask_attnbias_dispatched<
+template void run_grouped_backward_causalmask_bias_dispatch<
     ck::half_t,
     true,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_has_attnbias_no_biasgrad_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_has_attnbias_no_biasgrad_maxk_64.cpp
index cbf262e7a..c332b580a 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_has_attnbias_no_biasgrad_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_has_attnbias_no_biasgrad_maxk_64.cpp
@@ -8,7 +8,7 @@
 
 #include "ck_tiled_fmha_grouped_backward.h"
 
-template void run_grouped_backward_causalmask_attnbias_dispatched<
+template void run_grouped_backward_causalmask_bias_dispatch<
     ck::half_t,
     true,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_no_attnbias_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_no_attnbias_maxk_128.cpp
index 3b0cd4b76..1b61d184a 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_no_attnbias_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_no_attnbias_maxk_128.cpp
@@ -8,7 +8,7 @@
 
 #include "ck_tiled_fmha_grouped_backward.h"
 
-template void run_grouped_backward_causalmask_attnbias_dispatched<
+template void run_grouped_backward_causalmask_bias_dispatch<
     ck::half_t,
     true,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_no_attnbias_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_no_attnbias_maxk_32.cpp
index e3055cffe..d8ddfbb5d 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_no_attnbias_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_no_attnbias_maxk_32.cpp
@@ -8,7 +8,7 @@
 
 #include "ck_tiled_fmha_grouped_backward.h"
 
-template void run_grouped_backward_causalmask_attnbias_dispatched<
+template void run_grouped_backward_causalmask_bias_dispatch<
     ck::half_t,
     true,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_no_attnbias_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_no_attnbias_maxk_64.cpp
index 1d2ae1a98..4664327ca 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_no_attnbias_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_no_attnbias_maxk_64.cpp
@@ -8,7 +8,7 @@
 
 #include "ck_tiled_fmha_grouped_backward.h"
 
-template void run_grouped_backward_causalmask_attnbias_dispatched<
+template void run_grouped_backward_causalmask_bias_dispatch<
     ck::half_t,
     true,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_has_attnbias_has_biasgrad_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_has_attnbias_has_biasgrad_maxk_128.cpp
index 2904aa886..bbfe4fc48 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_has_attnbias_has_biasgrad_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_has_attnbias_has_biasgrad_maxk_128.cpp
@@ -8,7 +8,7 @@
 
 #include "ck_tiled_fmha_grouped_backward.h"
 
-template void run_grouped_backward_causalmask_attnbias_dispatched<
+template void run_grouped_backward_causalmask_bias_dispatch<
     ck::half_t,
     false,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_has_attnbias_has_biasgrad_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_has_attnbias_has_biasgrad_maxk_32.cpp
index 75680aad1..b0eea03c4 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_has_attnbias_has_biasgrad_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_has_attnbias_has_biasgrad_maxk_32.cpp
@@ -8,7 +8,7 @@
 
 #include "ck_tiled_fmha_grouped_backward.h"
 
-template void run_grouped_backward_causalmask_attnbias_dispatched<
+template void run_grouped_backward_causalmask_bias_dispatch<
     ck::half_t,
     false,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_has_attnbias_has_biasgrad_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_has_attnbias_has_biasgrad_maxk_64.cpp
index d7625e4dc..035e4c43e 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_has_attnbias_has_biasgrad_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_has_attnbias_has_biasgrad_maxk_64.cpp
@@ -8,7 +8,7 @@
 
 #include "ck_tiled_fmha_grouped_backward.h"
 
-template void run_grouped_backward_causalmask_attnbias_dispatched<
+template void run_grouped_backward_causalmask_bias_dispatch<
     ck::half_t,
     false,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_has_attnbias_no_biasgrad_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_has_attnbias_no_biasgrad_maxk_128.cpp
index e25e0c755..f4a38dab8 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_has_attnbias_no_biasgrad_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_has_attnbias_no_biasgrad_maxk_128.cpp
@@ -8,7 +8,7 @@
 
 #include "ck_tiled_fmha_grouped_backward.h"
 
-template void run_grouped_backward_causalmask_attnbias_dispatched<
+template void run_grouped_backward_causalmask_bias_dispatch<
     ck::half_t,
     false,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_has_attnbias_no_biasgrad_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_has_attnbias_no_biasgrad_maxk_32.cpp
index 18e9ea80d..a6c364146 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_has_attnbias_no_biasgrad_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_has_attnbias_no_biasgrad_maxk_32.cpp
@@ -8,7 +8,7 @@
 
 #include "ck_tiled_fmha_grouped_backward.h"
 
-template void run_grouped_backward_causalmask_attnbias_dispatched<
+template void run_grouped_backward_causalmask_bias_dispatch<
     ck::half_t,
     false,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_has_attnbias_no_biasgrad_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_has_attnbias_no_biasgrad_maxk_64.cpp
index 23e7cd1e5..f45d7495f 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_has_attnbias_no_biasgrad_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_has_attnbias_no_biasgrad_maxk_64.cpp
@@ -8,7 +8,7 @@
 
 #include "ck_tiled_fmha_grouped_backward.h"
 
-template void run_grouped_backward_causalmask_attnbias_dispatched<
+template void run_grouped_backward_causalmask_bias_dispatch<
     ck::half_t,
     false,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_no_attnbias_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_no_attnbias_maxk_128.cpp
index 1a59b5a0a..440c1b41a 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_no_attnbias_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_no_attnbias_maxk_128.cpp
@@ -8,7 +8,7 @@
 
 #include "ck_tiled_fmha_grouped_backward.h"
 
-template void run_grouped_backward_causalmask_attnbias_dispatched<
+template void run_grouped_backward_causalmask_bias_dispatch<
     ck::half_t,
     false,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_no_attnbias_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_no_attnbias_maxk_32.cpp
index 7689feaac..cc2945436 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_no_attnbias_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_no_attnbias_maxk_32.cpp
@@ -8,7 +8,7 @@
 
 #include "ck_tiled_fmha_grouped_backward.h"
 
-template void run_grouped_backward_causalmask_attnbias_dispatched<
+template void run_grouped_backward_causalmask_bias_dispatch<
     ck::half_t,
     false,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_no_attnbias_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_no_attnbias_maxk_64.cpp
index 89b2ab475..00b2f08d6 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_no_attnbias_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_no_attnbias_maxk_64.cpp
@@ -8,7 +8,7 @@
 
 #include "ck_tiled_fmha_grouped_backward.h"
 
-template void run_grouped_backward_causalmask_attnbias_dispatched<
+template void run_grouped_backward_causalmask_bias_dispatch<
     ck::half_t,
     false,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_has_causalmask_has_attnbias_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_has_causalmask_has_attnbias_maxk_128.cpp
index 785e62d78..6b74ac612 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_has_causalmask_has_attnbias_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_has_causalmask_has_attnbias_maxk_128.cpp
@@ -8,7 +8,7 @@
 
 #include "ck_tiled_fmha_grouped_forward.h"
 
-template void run_grouped_forward_causalmask_attnbias_dispatched<
+template void run_grouped_forward_causalmask_bias_dispatch<
     ck::bhalf_t,
     true,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_has_causalmask_has_attnbias_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_has_causalmask_has_attnbias_maxk_256.cpp
index 83001360b..d973d299b 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_has_causalmask_has_attnbias_maxk_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_has_causalmask_has_attnbias_maxk_256.cpp
@@ -8,7 +8,7 @@
 
 #include "ck_tiled_fmha_grouped_forward.h"
 
-template void run_grouped_forward_causalmask_attnbias_dispatched<
+template void run_grouped_forward_causalmask_bias_dispatch<
     ck::bhalf_t,
     true,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_has_causalmask_has_attnbias_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_has_causalmask_has_attnbias_maxk_32.cpp
index ed45ccf36..3ff1b2901 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_has_causalmask_has_attnbias_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_has_causalmask_has_attnbias_maxk_32.cpp
@@ -8,7 +8,7 @@
 
 #include "ck_tiled_fmha_grouped_forward.h"
 
-template void run_grouped_forward_causalmask_attnbias_dispatched<
+template void run_grouped_forward_causalmask_bias_dispatch<
     ck::bhalf_t,
     true,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_has_causalmask_has_attnbias_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_has_causalmask_has_attnbias_maxk_64.cpp
index f0b639ef6..1347d1da8 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_has_causalmask_has_attnbias_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_has_causalmask_has_attnbias_maxk_64.cpp
@@ -8,7 +8,7 @@
 
 #include "ck_tiled_fmha_grouped_forward.h"
 
-template void run_grouped_forward_causalmask_attnbias_dispatched<
+template void run_grouped_forward_causalmask_bias_dispatch<
     ck::bhalf_t,
     true,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_has_causalmask_no_attnbias_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_has_causalmask_no_attnbias_maxk_128.cpp
index 697ce6345..b4320968e 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_has_causalmask_no_attnbias_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_has_causalmask_no_attnbias_maxk_128.cpp
@@ -8,7 +8,7 @@
 
 #include "ck_tiled_fmha_grouped_forward.h"
 
-template void run_grouped_forward_causalmask_attnbias_dispatched<
+template void run_grouped_forward_causalmask_bias_dispatch<
     ck::bhalf_t,
     true,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_has_causalmask_no_attnbias_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_has_causalmask_no_attnbias_maxk_256.cpp
index cc24c03c0..7654c11cd 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_has_causalmask_no_attnbias_maxk_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_has_causalmask_no_attnbias_maxk_256.cpp
@@ -8,7 +8,7 @@
 
 #include "ck_tiled_fmha_grouped_forward.h"
 
-template void run_grouped_forward_causalmask_attnbias_dispatched<
+template void run_grouped_forward_causalmask_bias_dispatch<
     ck::bhalf_t,
     true,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_has_causalmask_no_attnbias_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_has_causalmask_no_attnbias_maxk_32.cpp
index e0d0f9e03..dd8ee2879 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_has_causalmask_no_attnbias_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_has_causalmask_no_attnbias_maxk_32.cpp
@@ -8,7 +8,7 @@
 
 #include "ck_tiled_fmha_grouped_forward.h"
 
-template void run_grouped_forward_causalmask_attnbias_dispatched<
+template void run_grouped_forward_causalmask_bias_dispatch<
     ck::bhalf_t,
     true,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_has_causalmask_no_attnbias_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_has_causalmask_no_attnbias_maxk_64.cpp
index c658c89f2..0da1dbf1a 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_has_causalmask_no_attnbias_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_has_causalmask_no_attnbias_maxk_64.cpp
@@ -8,7 +8,7 @@
 
 #include "ck_tiled_fmha_grouped_forward.h"
 
-template void run_grouped_forward_causalmask_attnbias_dispatched<
+template void run_grouped_forward_causalmask_bias_dispatch<
     ck::bhalf_t,
     true,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_no_causalmask_has_attnbias_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_no_causalmask_has_attnbias_maxk_128.cpp
index ebd002ef4..5a078f4ad 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_no_causalmask_has_attnbias_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_no_causalmask_has_attnbias_maxk_128.cpp
@@ -8,7 +8,7 @@
 
 #include "ck_tiled_fmha_grouped_forward.h"
 
-template void run_grouped_forward_causalmask_attnbias_dispatched<
+template void run_grouped_forward_causalmask_bias_dispatch<
     ck::bhalf_t,
     false,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_no_causalmask_has_attnbias_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_no_causalmask_has_attnbias_maxk_256.cpp
index 844444629..cdb13030e 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_no_causalmask_has_attnbias_maxk_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_no_causalmask_has_attnbias_maxk_256.cpp
@@ -8,7 +8,7 @@
 
 #include "ck_tiled_fmha_grouped_forward.h"
 
-template void run_grouped_forward_causalmask_attnbias_dispatched<
+template void run_grouped_forward_causalmask_bias_dispatch<
     ck::bhalf_t,
     false,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_no_causalmask_has_attnbias_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_no_causalmask_has_attnbias_maxk_32.cpp
index 52b5cb895..344307c4c 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_no_causalmask_has_attnbias_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_no_causalmask_has_attnbias_maxk_32.cpp
@@ -8,7 +8,7 @@
 
 #include "ck_tiled_fmha_grouped_forward.h"
 
-template void run_grouped_forward_causalmask_attnbias_dispatched<
+template void run_grouped_forward_causalmask_bias_dispatch<
     ck::bhalf_t,
     false,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_no_causalmask_has_attnbias_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_no_causalmask_has_attnbias_maxk_64.cpp
index 35a058368..a8604cd7c 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_no_causalmask_has_attnbias_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_no_causalmask_has_attnbias_maxk_64.cpp
@@ -8,7 +8,7 @@
 
 #include "ck_tiled_fmha_grouped_forward.h"
 
-template void run_grouped_forward_causalmask_attnbias_dispatched<
+template void run_grouped_forward_causalmask_bias_dispatch<
     ck::bhalf_t,
     false,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_no_causalmask_no_attnbias_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_no_causalmask_no_attnbias_maxk_128.cpp
index d278e2b0b..d9e339266 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_no_causalmask_no_attnbias_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_no_causalmask_no_attnbias_maxk_128.cpp
@@ -8,7 +8,7 @@
 
 #include "ck_tiled_fmha_grouped_forward.h"
 
-template void run_grouped_forward_causalmask_attnbias_dispatched<
+template void run_grouped_forward_causalmask_bias_dispatch<
     ck::bhalf_t,
     false,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_no_causalmask_no_attnbias_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_no_causalmask_no_attnbias_maxk_256.cpp
index 2bd6d042a..339c05c01 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_no_causalmask_no_attnbias_maxk_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_no_causalmask_no_attnbias_maxk_256.cpp
@@ -8,7 +8,7 @@
 
 #include "ck_tiled_fmha_grouped_forward.h"
 
-template void run_grouped_forward_causalmask_attnbias_dispatched<
+template void run_grouped_forward_causalmask_bias_dispatch<
     ck::bhalf_t,
     false,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_no_causalmask_no_attnbias_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_no_causalmask_no_attnbias_maxk_32.cpp
index 732381a8a..9c600d90e 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_no_causalmask_no_attnbias_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_no_causalmask_no_attnbias_maxk_32.cpp
@@ -8,7 +8,7 @@
 
 #include "ck_tiled_fmha_grouped_forward.h"
 
-template void run_grouped_forward_causalmask_attnbias_dispatched<
+template void run_grouped_forward_causalmask_bias_dispatch<
     ck::bhalf_t,
     false,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_no_causalmask_no_attnbias_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_no_causalmask_no_attnbias_maxk_64.cpp
index 352d94bb4..c2ecf7d9d 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_no_causalmask_no_attnbias_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_no_causalmask_no_attnbias_maxk_64.cpp
@@ -8,7 +8,7 @@
 
 #include "ck_tiled_fmha_grouped_forward.h"
 
-template void run_grouped_forward_causalmask_attnbias_dispatched<
+template void run_grouped_forward_causalmask_bias_dispatch<
     ck::bhalf_t,
     false,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_has_causalmask_has_attnbias_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_has_causalmask_has_attnbias_maxk_128.cpp
index c83769098..6f1a866bd 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_has_causalmask_has_attnbias_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_has_causalmask_has_attnbias_maxk_128.cpp
@@ -8,7 +8,7 @@
 
 #include "ck_tiled_fmha_grouped_forward.h"
 
-template void run_grouped_forward_causalmask_attnbias_dispatched<
+template void run_grouped_forward_causalmask_bias_dispatch<
     ck::half_t,
     true,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_has_causalmask_has_attnbias_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_has_causalmask_has_attnbias_maxk_256.cpp
index fe21d52fe..60a5f9444 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_has_causalmask_has_attnbias_maxk_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_has_causalmask_has_attnbias_maxk_256.cpp
@@ -8,7 +8,7 @@
 
 #include "ck_tiled_fmha_grouped_forward.h"
 
-template void run_grouped_forward_causalmask_attnbias_dispatched<
+template void run_grouped_forward_causalmask_bias_dispatch<
     ck::half_t,
     true,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_has_causalmask_has_attnbias_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_has_causalmask_has_attnbias_maxk_32.cpp
index 6bedae2d2..c549aec61 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_has_causalmask_has_attnbias_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_has_causalmask_has_attnbias_maxk_32.cpp
@@ -8,7 +8,7 @@
 
 #include "ck_tiled_fmha_grouped_forward.h"
 
-template void run_grouped_forward_causalmask_attnbias_dispatched<
+template void run_grouped_forward_causalmask_bias_dispatch<
     ck::half_t,
     true,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_has_causalmask_has_attnbias_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_has_causalmask_has_attnbias_maxk_64.cpp
index a45a99b80..8198f3beb 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_has_causalmask_has_attnbias_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_has_causalmask_has_attnbias_maxk_64.cpp
@@ -8,7 +8,7 @@
 
 #include "ck_tiled_fmha_grouped_forward.h"
 
-template void run_grouped_forward_causalmask_attnbias_dispatched<
+template void run_grouped_forward_causalmask_bias_dispatch<
     ck::half_t,
     true,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_has_causalmask_no_attnbias_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_has_causalmask_no_attnbias_maxk_128.cpp
index e0349f471..d5fa2c40c 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_has_causalmask_no_attnbias_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_has_causalmask_no_attnbias_maxk_128.cpp
@@ -8,7 +8,7 @@
 
 #include "ck_tiled_fmha_grouped_forward.h"
 
-template void run_grouped_forward_causalmask_attnbias_dispatched<
+template void run_grouped_forward_causalmask_bias_dispatch<
     ck::half_t,
     true,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_has_causalmask_no_attnbias_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_has_causalmask_no_attnbias_maxk_256.cpp
index 58d7cec79..ecb005898 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_has_causalmask_no_attnbias_maxk_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_has_causalmask_no_attnbias_maxk_256.cpp
@@ -8,7 +8,7 @@
 
 #include "ck_tiled_fmha_grouped_forward.h"
 
-template void run_grouped_forward_causalmask_attnbias_dispatched<
+template void run_grouped_forward_causalmask_bias_dispatch<
     ck::half_t,
     true,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_has_causalmask_no_attnbias_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_has_causalmask_no_attnbias_maxk_32.cpp
index a9a2a191e..53ce1e962 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_has_causalmask_no_attnbias_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_has_causalmask_no_attnbias_maxk_32.cpp
@@ -8,7 +8,7 @@
 
 #include "ck_tiled_fmha_grouped_forward.h"
 
-template void run_grouped_forward_causalmask_attnbias_dispatched<
+template void run_grouped_forward_causalmask_bias_dispatch<
     ck::half_t,
     true,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_has_causalmask_no_attnbias_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_has_causalmask_no_attnbias_maxk_64.cpp
index 8eb2447a8..80a645aa8 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_has_causalmask_no_attnbias_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_has_causalmask_no_attnbias_maxk_64.cpp
@@ -8,7 +8,7 @@
 
 #include "ck_tiled_fmha_grouped_forward.h"
 
-template void run_grouped_forward_causalmask_attnbias_dispatched<
+template void run_grouped_forward_causalmask_bias_dispatch<
     ck::half_t,
     true,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_no_causalmask_has_attnbias_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_no_causalmask_has_attnbias_maxk_128.cpp
index c7ba7f09e..3bbfba9ba 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_no_causalmask_has_attnbias_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_no_causalmask_has_attnbias_maxk_128.cpp
@@ -8,7 +8,7 @@
 
 #include "ck_tiled_fmha_grouped_forward.h"
 
-template void run_grouped_forward_causalmask_attnbias_dispatched<
+template void run_grouped_forward_causalmask_bias_dispatch<
     ck::half_t,
     false,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_no_causalmask_has_attnbias_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_no_causalmask_has_attnbias_maxk_256.cpp
index 577f1a1ae..59d0142be 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_no_causalmask_has_attnbias_maxk_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_no_causalmask_has_attnbias_maxk_256.cpp
@@ -8,7 +8,7 @@
 
 #include "ck_tiled_fmha_grouped_forward.h"
 
-template void run_grouped_forward_causalmask_attnbias_dispatched<
+template void run_grouped_forward_causalmask_bias_dispatch<
     ck::half_t,
     false,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_no_causalmask_has_attnbias_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_no_causalmask_has_attnbias_maxk_32.cpp
index cd1bda5d1..503b4c245 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_no_causalmask_has_attnbias_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_no_causalmask_has_attnbias_maxk_32.cpp
@@ -8,7 +8,7 @@
 
 #include "ck_tiled_fmha_grouped_forward.h"
 
-template void run_grouped_forward_causalmask_attnbias_dispatched<
+template void run_grouped_forward_causalmask_bias_dispatch<
     ck::half_t,
     false,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_no_causalmask_has_attnbias_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_no_causalmask_has_attnbias_maxk_64.cpp
index caa6f0d16..f63f6b44d 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_no_causalmask_has_attnbias_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_no_causalmask_has_attnbias_maxk_64.cpp
@@ -8,7 +8,7 @@
 
 #include "ck_tiled_fmha_grouped_forward.h"
 
-template void run_grouped_forward_causalmask_attnbias_dispatched<
+template void run_grouped_forward_causalmask_bias_dispatch<
     ck::half_t,
     false,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_no_causalmask_no_attnbias_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_no_causalmask_no_attnbias_maxk_128.cpp
index 08bf47cd5..dd27d65c0 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_no_causalmask_no_attnbias_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_no_causalmask_no_attnbias_maxk_128.cpp
@@ -8,7 +8,7 @@
 
 #include "ck_tiled_fmha_grouped_forward.h"
 
-template void run_grouped_forward_causalmask_attnbias_dispatched<
+template void run_grouped_forward_causalmask_bias_dispatch<
     ck::half_t,
     false,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_no_causalmask_no_attnbias_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_no_causalmask_no_attnbias_maxk_256.cpp
index 8c4c0c440..a945f4190 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_no_causalmask_no_attnbias_maxk_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_no_causalmask_no_attnbias_maxk_256.cpp
@@ -8,7 +8,7 @@
 
 #include "ck_tiled_fmha_grouped_forward.h"
 
-template void run_grouped_forward_causalmask_attnbias_dispatched<
+template void run_grouped_forward_causalmask_bias_dispatch<
     ck::half_t,
     false,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_no_causalmask_no_attnbias_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_no_causalmask_no_attnbias_maxk_32.cpp
index 2ff6c73e7..03c98bd6f 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_no_causalmask_no_attnbias_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_no_causalmask_no_attnbias_maxk_32.cpp
@@ -8,7 +8,7 @@
 
 #include "ck_tiled_fmha_grouped_forward.h"
 
-template void run_grouped_forward_causalmask_attnbias_dispatched<
+template void run_grouped_forward_causalmask_bias_dispatch<
     ck::half_t,
     false,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_no_causalmask_no_attnbias_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_no_causalmask_no_attnbias_maxk_64.cpp
index b5ec1a781..451004e3b 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_no_causalmask_no_attnbias_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_no_causalmask_no_attnbias_maxk_64.cpp
@@ -8,7 +8,7 @@
 
 #include "ck_tiled_fmha_grouped_forward.h"
 
-template void run_grouped_forward_causalmask_attnbias_dispatched<
+template void run_grouped_forward_causalmask_bias_dispatch<
     ck::half_t,
     false,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_has_causalmask_has_attnbias_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_has_causalmask_has_attnbias_maxk_128.cpp
index fe5b8db51..dacb2b5ff 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_has_causalmask_has_attnbias_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_has_causalmask_has_attnbias_maxk_128.cpp
@@ -8,7 +8,7 @@
 
 #include "ck_tiled_fmha_grouped_infer.h"
 
-template void run_grouped_infer_causalmask_attnbias_dispatched<
+template void run_grouped_infer_causalmask_bias_dispatch<
     ck::bhalf_t,
     true,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_has_causalmask_has_attnbias_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_has_causalmask_has_attnbias_maxk_256.cpp
index 593d4fda1..49faae0f1 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_has_causalmask_has_attnbias_maxk_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_has_causalmask_has_attnbias_maxk_256.cpp
@@ -8,7 +8,7 @@
 
 #include "ck_tiled_fmha_grouped_infer.h"
 
-template void run_grouped_infer_causalmask_attnbias_dispatched<
+template void run_grouped_infer_causalmask_bias_dispatch<
     ck::bhalf_t,
     true,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_has_causalmask_has_attnbias_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_has_causalmask_has_attnbias_maxk_32.cpp
index 941dcd50e..79f83bbd6 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_has_causalmask_has_attnbias_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_has_causalmask_has_attnbias_maxk_32.cpp
@@ -8,7 +8,7 @@
 
 #include "ck_tiled_fmha_grouped_infer.h"
 
-template void run_grouped_infer_causalmask_attnbias_dispatched<
+template void run_grouped_infer_causalmask_bias_dispatch<
     ck::bhalf_t,
     true,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_has_causalmask_has_attnbias_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_has_causalmask_has_attnbias_maxk_64.cpp
index 82183313a..965428a41 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_has_causalmask_has_attnbias_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_has_causalmask_has_attnbias_maxk_64.cpp
@@ -8,7 +8,7 @@
 
 #include "ck_tiled_fmha_grouped_infer.h"
 
-template void run_grouped_infer_causalmask_attnbias_dispatched<
+template void run_grouped_infer_causalmask_bias_dispatch<
     ck::bhalf_t,
     true,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_has_causalmask_no_attnbias_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_has_causalmask_no_attnbias_maxk_128.cpp
index 2f8ea04e7..ffd8ac153 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_has_causalmask_no_attnbias_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_has_causalmask_no_attnbias_maxk_128.cpp
@@ -8,7 +8,7 @@
 
 #include "ck_tiled_fmha_grouped_infer.h"
 
-template void run_grouped_infer_causalmask_attnbias_dispatched<
+template void run_grouped_infer_causalmask_bias_dispatch<
     ck::bhalf_t,
     true,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_has_causalmask_no_attnbias_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_has_causalmask_no_attnbias_maxk_256.cpp
index f10999c7c..46495e5f0 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_has_causalmask_no_attnbias_maxk_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_has_causalmask_no_attnbias_maxk_256.cpp
@@ -8,7 +8,7 @@
 
 #include "ck_tiled_fmha_grouped_infer.h"
 
-template void run_grouped_infer_causalmask_attnbias_dispatched<
+template void run_grouped_infer_causalmask_bias_dispatch<
     ck::bhalf_t,
     true,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_has_causalmask_no_attnbias_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_has_causalmask_no_attnbias_maxk_32.cpp
index f87772024..c52e17f7b 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_has_causalmask_no_attnbias_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_has_causalmask_no_attnbias_maxk_32.cpp
@@ -8,7 +8,7 @@
 
 #include "ck_tiled_fmha_grouped_infer.h"
 
-template void run_grouped_infer_causalmask_attnbias_dispatched<
+template void run_grouped_infer_causalmask_bias_dispatch<
     ck::bhalf_t,
     true,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_has_causalmask_no_attnbias_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_has_causalmask_no_attnbias_maxk_64.cpp
index d2b85141c..5bf323e37 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_has_causalmask_no_attnbias_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_has_causalmask_no_attnbias_maxk_64.cpp
@@ -8,7 +8,7 @@
 
 #include "ck_tiled_fmha_grouped_infer.h"
 
-template void run_grouped_infer_causalmask_attnbias_dispatched<
+template void run_grouped_infer_causalmask_bias_dispatch<
     ck::bhalf_t,
     true,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_no_causalmask_has_attnbias_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_no_causalmask_has_attnbias_maxk_128.cpp
index 35b522a6a..bfb0a8aab 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_no_causalmask_has_attnbias_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_no_causalmask_has_attnbias_maxk_128.cpp
@@ -8,7 +8,7 @@
 
 #include "ck_tiled_fmha_grouped_infer.h"
 
-template void run_grouped_infer_causalmask_attnbias_dispatched<
+template void run_grouped_infer_causalmask_bias_dispatch<
     ck::bhalf_t,
     false,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_no_causalmask_has_attnbias_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_no_causalmask_has_attnbias_maxk_256.cpp
index 4fb8bdd59..4a4298a24 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_no_causalmask_has_attnbias_maxk_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_no_causalmask_has_attnbias_maxk_256.cpp
@@ -8,7 +8,7 @@
 
 #include "ck_tiled_fmha_grouped_infer.h"
 
-template void run_grouped_infer_causalmask_attnbias_dispatched<
+template void run_grouped_infer_causalmask_bias_dispatch<
     ck::bhalf_t,
     false,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_no_causalmask_has_attnbias_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_no_causalmask_has_attnbias_maxk_32.cpp
index 1d2cd2656..2584fcf0b 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_no_causalmask_has_attnbias_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_no_causalmask_has_attnbias_maxk_32.cpp
@@ -8,7 +8,7 @@
 
 #include "ck_tiled_fmha_grouped_infer.h"
 
-template void run_grouped_infer_causalmask_attnbias_dispatched<
+template void run_grouped_infer_causalmask_bias_dispatch<
     ck::bhalf_t,
     false,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_no_causalmask_has_attnbias_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_no_causalmask_has_attnbias_maxk_64.cpp
index 2ccb25769..a8197825f 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_no_causalmask_has_attnbias_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_no_causalmask_has_attnbias_maxk_64.cpp
@@ -8,7 +8,7 @@
 
 #include "ck_tiled_fmha_grouped_infer.h"
 
-template void run_grouped_infer_causalmask_attnbias_dispatched<
+template void run_grouped_infer_causalmask_bias_dispatch<
     ck::bhalf_t,
     false,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_no_causalmask_no_attnbias_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_no_causalmask_no_attnbias_maxk_128.cpp
index 54cbec7ec..d409b257b 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_no_causalmask_no_attnbias_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_no_causalmask_no_attnbias_maxk_128.cpp
@@ -8,7 +8,7 @@
 
 #include "ck_tiled_fmha_grouped_infer.h"
 
-template void run_grouped_infer_causalmask_attnbias_dispatched<
+template void run_grouped_infer_causalmask_bias_dispatch<
     ck::bhalf_t,
     false,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_no_causalmask_no_attnbias_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_no_causalmask_no_attnbias_maxk_256.cpp
index 12b67ea45..8022f3e25 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_no_causalmask_no_attnbias_maxk_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_no_causalmask_no_attnbias_maxk_256.cpp
@@ -8,7 +8,7 @@
 
 #include "ck_tiled_fmha_grouped_infer.h"
 
-template void run_grouped_infer_causalmask_attnbias_dispatched<
+template void run_grouped_infer_causalmask_bias_dispatch<
     ck::bhalf_t,
     false,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_no_causalmask_no_attnbias_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_no_causalmask_no_attnbias_maxk_32.cpp
index d6c6c1a5d..a8ab2616c 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_no_causalmask_no_attnbias_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_no_causalmask_no_attnbias_maxk_32.cpp
@@ -8,7 +8,7 @@
 
 #include "ck_tiled_fmha_grouped_infer.h"
 
-template void run_grouped_infer_causalmask_attnbias_dispatched<
+template void run_grouped_infer_causalmask_bias_dispatch<
     ck::bhalf_t,
     false,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_no_causalmask_no_attnbias_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_no_causalmask_no_attnbias_maxk_64.cpp
index c74dbe200..d0c3b76d6 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_no_causalmask_no_attnbias_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_no_causalmask_no_attnbias_maxk_64.cpp
@@ -8,7 +8,7 @@
 
 #include "ck_tiled_fmha_grouped_infer.h"
 
-template void run_grouped_infer_causalmask_attnbias_dispatched<
+template void run_grouped_infer_causalmask_bias_dispatch<
     ck::bhalf_t,
     false,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_has_causalmask_has_attnbias_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_has_causalmask_has_attnbias_maxk_128.cpp
index 8fe0d31e7..3e0acc63a 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_has_causalmask_has_attnbias_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_has_causalmask_has_attnbias_maxk_128.cpp
@@ -8,7 +8,7 @@
 
 #include "ck_tiled_fmha_grouped_infer.h"
 
-template void run_grouped_infer_causalmask_attnbias_dispatched<
+template void run_grouped_infer_causalmask_bias_dispatch<
     ck::half_t,
     true,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_has_causalmask_has_attnbias_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_has_causalmask_has_attnbias_maxk_256.cpp
index aeff1e2c6..f17c72caa 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_has_causalmask_has_attnbias_maxk_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_has_causalmask_has_attnbias_maxk_256.cpp
@@ -8,7 +8,7 @@
 
 #include "ck_tiled_fmha_grouped_infer.h"
 
-template void run_grouped_infer_causalmask_attnbias_dispatched<
+template void run_grouped_infer_causalmask_bias_dispatch<
     ck::half_t,
     true,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_has_causalmask_has_attnbias_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_has_causalmask_has_attnbias_maxk_32.cpp
index f8fed7106..be812c79f 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_has_causalmask_has_attnbias_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_has_causalmask_has_attnbias_maxk_32.cpp
@@ -8,7 +8,7 @@
 
 #include "ck_tiled_fmha_grouped_infer.h"
 
-template void run_grouped_infer_causalmask_attnbias_dispatched<
+template void run_grouped_infer_causalmask_bias_dispatch<
     ck::half_t,
     true,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_has_causalmask_has_attnbias_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_has_causalmask_has_attnbias_maxk_64.cpp
index ec5f029d7..360180c84 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_has_causalmask_has_attnbias_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_has_causalmask_has_attnbias_maxk_64.cpp
@@ -8,7 +8,7 @@
 
 #include "ck_tiled_fmha_grouped_infer.h"
 
-template void run_grouped_infer_causalmask_attnbias_dispatched<
+template void run_grouped_infer_causalmask_bias_dispatch<
     ck::half_t,
     true,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_has_causalmask_no_attnbias_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_has_causalmask_no_attnbias_maxk_128.cpp
index 5449dfd32..ea0f83842 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_has_causalmask_no_attnbias_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_has_causalmask_no_attnbias_maxk_128.cpp
@@ -8,7 +8,7 @@
 
 #include "ck_tiled_fmha_grouped_infer.h"
 
-template void run_grouped_infer_causalmask_attnbias_dispatched<
+template void run_grouped_infer_causalmask_bias_dispatch<
     ck::half_t,
     true,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_has_causalmask_no_attnbias_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_has_causalmask_no_attnbias_maxk_256.cpp
index 73bf0e6d6..8647f8273 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_has_causalmask_no_attnbias_maxk_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_has_causalmask_no_attnbias_maxk_256.cpp
@@ -8,7 +8,7 @@
 
 #include "ck_tiled_fmha_grouped_infer.h"
 
-template void run_grouped_infer_causalmask_attnbias_dispatched<
+template void run_grouped_infer_causalmask_bias_dispatch<
     ck::half_t,
     true,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_has_causalmask_no_attnbias_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_has_causalmask_no_attnbias_maxk_32.cpp
index 55c80b4c9..28a808522 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_has_causalmask_no_attnbias_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_has_causalmask_no_attnbias_maxk_32.cpp
@@ -8,7 +8,7 @@
 
 #include "ck_tiled_fmha_grouped_infer.h"
 
-template void run_grouped_infer_causalmask_attnbias_dispatched<
+template void run_grouped_infer_causalmask_bias_dispatch<
     ck::half_t,
     true,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_has_causalmask_no_attnbias_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_has_causalmask_no_attnbias_maxk_64.cpp
index 76cafe4e0..888f0f8e7 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_has_causalmask_no_attnbias_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_has_causalmask_no_attnbias_maxk_64.cpp
@@ -8,7 +8,7 @@
 
 #include "ck_tiled_fmha_grouped_infer.h"
 
-template void run_grouped_infer_causalmask_attnbias_dispatched<
+template void run_grouped_infer_causalmask_bias_dispatch<
     ck::half_t,
     true,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_no_causalmask_has_attnbias_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_no_causalmask_has_attnbias_maxk_128.cpp
index 1741265b2..238ef6acb 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_no_causalmask_has_attnbias_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_no_causalmask_has_attnbias_maxk_128.cpp
@@ -8,7 +8,7 @@
 
 #include "ck_tiled_fmha_grouped_infer.h"
 
-template void run_grouped_infer_causalmask_attnbias_dispatched<
+template void run_grouped_infer_causalmask_bias_dispatch<
     ck::half_t,
     false,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_no_causalmask_has_attnbias_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_no_causalmask_has_attnbias_maxk_256.cpp
index 4197ba831..6819de0a3 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_no_causalmask_has_attnbias_maxk_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_no_causalmask_has_attnbias_maxk_256.cpp
@@ -8,7 +8,7 @@
 
 #include "ck_tiled_fmha_grouped_infer.h"
 
-template void run_grouped_infer_causalmask_attnbias_dispatched<
+template void run_grouped_infer_causalmask_bias_dispatch<
     ck::half_t,
     false,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_no_causalmask_has_attnbias_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_no_causalmask_has_attnbias_maxk_32.cpp
index 88ac7b42c..3ab3cc5c2 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_no_causalmask_has_attnbias_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_no_causalmask_has_attnbias_maxk_32.cpp
@@ -8,7 +8,7 @@
 
 #include "ck_tiled_fmha_grouped_infer.h"
 
-template void run_grouped_infer_causalmask_attnbias_dispatched<
+template void run_grouped_infer_causalmask_bias_dispatch<
     ck::half_t,
     false,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_no_causalmask_has_attnbias_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_no_causalmask_has_attnbias_maxk_64.cpp
index c717aed64..7470f5b12 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_no_causalmask_has_attnbias_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_no_causalmask_has_attnbias_maxk_64.cpp
@@ -8,7 +8,7 @@
 
 #include "ck_tiled_fmha_grouped_infer.h"
 
-template void run_grouped_infer_causalmask_attnbias_dispatched<
+template void run_grouped_infer_causalmask_bias_dispatch<
     ck::half_t,
     false,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_no_causalmask_no_attnbias_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_no_causalmask_no_attnbias_maxk_128.cpp
index c3f52f074..7226ed616 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_no_causalmask_no_attnbias_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_no_causalmask_no_attnbias_maxk_128.cpp
@@ -8,7 +8,7 @@
 
 #include "ck_tiled_fmha_grouped_infer.h"
 
-template void run_grouped_infer_causalmask_attnbias_dispatched<
+template void run_grouped_infer_causalmask_bias_dispatch<
     ck::half_t,
     false,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_no_causalmask_no_attnbias_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_no_causalmask_no_attnbias_maxk_256.cpp
index 5d4882d2b..c8ec9fcdd 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_no_causalmask_no_attnbias_maxk_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_no_causalmask_no_attnbias_maxk_256.cpp
@@ -8,7 +8,7 @@
 
 #include "ck_tiled_fmha_grouped_infer.h"
 
-template void run_grouped_infer_causalmask_attnbias_dispatched<
+template void run_grouped_infer_causalmask_bias_dispatch<
     ck::half_t,
     false,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_no_causalmask_no_attnbias_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_no_causalmask_no_attnbias_maxk_32.cpp
index 6e0b2914d..80e7378e9 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_no_causalmask_no_attnbias_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_no_causalmask_no_attnbias_maxk_32.cpp
@@ -8,7 +8,7 @@
 
 #include "ck_tiled_fmha_grouped_infer.h"
 
-template void run_grouped_infer_causalmask_attnbias_dispatched<
+template void run_grouped_infer_causalmask_bias_dispatch<
     ck::half_t,
     false,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_no_causalmask_no_attnbias_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_no_causalmask_no_attnbias_maxk_64.cpp
index b49d09908..826b22356 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_no_causalmask_no_attnbias_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_no_causalmask_no_attnbias_maxk_64.cpp
@@ -8,7 +8,7 @@
 
 #include "ck_tiled_fmha_grouped_infer.h"
 
-template void run_grouped_infer_causalmask_attnbias_dispatched<
+template void run_grouped_infer_causalmask_bias_dispatch<
     ck::half_t,
     false,
     false,

From 812a529ac8cc2b36ca5383727127217aaf66ae2b Mon Sep 17 00:00:00 2001
From: "Qianfeng.Zhang" <Qianfeng.Zhang@amd.com>
Date: Tue, 16 Apr 2024 06:28:08 +0000
Subject: [PATCH 524/641] Changes to reuse the kernel files under ck_tile
 examples/91_tile_program/fmha folder

---
 .gitmodules                                                 | 2 +-
 setup.py                                                    | 2 +-
 third_party/composable_kernel_tiled                         | 2 +-
 .../csrc/attention/hip_fmha/attention_ck_rand_uniform.cpp   | 2 +-
 .../attention/hip_fmha/ck_tiled_fmha_batched_backward.h     | 6 +++---
 .../csrc/attention/hip_fmha/ck_tiled_fmha_batched_forward.h | 6 +++---
 .../csrc/attention/hip_fmha/ck_tiled_fmha_batched_infer.h   | 6 +++---
 .../attention/hip_fmha/ck_tiled_fmha_grouped_backward.h     | 6 +++---
 .../csrc/attention/hip_fmha/ck_tiled_fmha_grouped_forward.h | 6 +++---
 .../csrc/attention/hip_fmha/ck_tiled_fmha_grouped_infer.h   | 6 +++---
 10 files changed, 22 insertions(+), 22 deletions(-)

diff --git a/.gitmodules b/.gitmodules
index 8d80ded0b..6a58ce8c2 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -7,4 +7,4 @@
 [submodule "third_party/composable_kernel_tiled"]
 	path = third_party/composable_kernel_tiled
 	url = https://github.com/ROCm/composable_kernel-internal.git
-	branch = ck_tile/opt_padding_fa_train_xformers
+	branch = ck_tile/opt_padding_fa_train_pr
diff --git a/setup.py b/setup.py
index e909188c8..9053e6dd2 100644
--- a/setup.py
+++ b/setup.py
@@ -357,7 +357,7 @@ def get_extensions():
             / "composable_kernel_tiled"
             / "example"
             / "91_tile_program"
-            / "xformers_fmha"
+            / "fmha"
         ]
 
         include_dirs += [
diff --git a/third_party/composable_kernel_tiled b/third_party/composable_kernel_tiled
index 131f660b2..bbf7e3d0a 160000
--- a/third_party/composable_kernel_tiled
+++ b/third_party/composable_kernel_tiled
@@ -1 +1 @@
-Subproject commit 131f660b24c450f819f1ebe4698afcbe6155d9b9
+Subproject commit bbf7e3d0a4c550e54d383d8214c087d2fc184205
diff --git a/xformers/csrc/attention/hip_fmha/attention_ck_rand_uniform.cpp b/xformers/csrc/attention/hip_fmha/attention_ck_rand_uniform.cpp
index 2f55d425a..f751e751e 100644
--- a/xformers/csrc/attention/hip_fmha/attention_ck_rand_uniform.cpp
+++ b/xformers/csrc/attention/hip_fmha/attention_ck_rand_uniform.cpp
@@ -16,7 +16,7 @@
 #include <ck/ck.hpp>
 #include <ck/host_utility/kernel_launch.hpp>
 
-#include "ck_tiled_fmha_rand_uniform_kernel.hpp"
+#include "fmha_rand_uniform_kernel.hpp"
 
 namespace {
 
diff --git a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_backward.h b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_backward.h
index 0316907ae..f84eb306b 100644
--- a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_backward.h
+++ b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_backward.h
@@ -24,9 +24,9 @@
 #include "ck_tiled_fmha_bwd_setting.h"
 #include "ck_tiled_fmha_params.h"
 
-#include "ck_tiled_fmha_backward_kernel.hpp"
-#include "ck_tiled_fmha_bwd_epilogue.hpp"
-#include "ck_tiled_fmha_bwd_tile_partitioner.hpp"
+#include "fmha_bwd_kernel.hpp"
+#include "fmha_bwd_epilogue.hpp"
+#include "fmha_bwd_tile_partitioner.hpp"
 
 template <
     typename ScalarType,
diff --git a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_forward.h b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_forward.h
index 79f6eceb6..de7631449 100644
--- a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_forward.h
+++ b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_forward.h
@@ -24,9 +24,9 @@
 #include "ck_tiled_fmha_fwd_setting.h"
 #include "ck_tiled_fmha_params.h"
 
-#include "ck_tiled_fmha_forward_kernel.hpp"
-#include "ck_tiled_fmha_fwd_epilogue.hpp"
-#include "ck_tiled_fmha_fwd_tile_partitioner.hpp"
+#include "fmha_fwd_kernel.hpp"
+#include "fmha_fwd_epilogue.hpp"
+#include "fmha_fwd_tile_partitioner.hpp"
 
 template <
     typename ScalarType,
diff --git a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_infer.h b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_infer.h
index eb65e7aba..b99fb7afc 100644
--- a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_infer.h
+++ b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_infer.h
@@ -25,9 +25,9 @@
 #include "ck_tiled_fmha_params.h"
 #include "ck_tiled_headdim_switch.h"
 
-#include "ck_tiled_fmha_forward_kernel.hpp"
-#include "ck_tiled_fmha_fwd_epilogue.hpp"
-#include "ck_tiled_fmha_fwd_tile_partitioner.hpp"
+#include "fmha_fwd_kernel.hpp"
+#include "fmha_fwd_epilogue.hpp"
+#include "fmha_fwd_tile_partitioner.hpp"
 
 template <
     typename ScalarType,
diff --git a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_backward.h b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_backward.h
index 264cafa1c..c0b54ece8 100644
--- a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_backward.h
+++ b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_backward.h
@@ -24,9 +24,9 @@
 #include "ck_tiled_fmha_bwd_setting.h"
 #include "ck_tiled_fmha_params.h"
 
-#include "ck_tiled_fmha_backward_kernel.hpp"
-#include "ck_tiled_fmha_bwd_epilogue.hpp"
-#include "ck_tiled_fmha_bwd_tile_partitioner.hpp"
+#include "fmha_bwd_kernel.hpp"
+#include "fmha_bwd_epilogue.hpp"
+#include "fmha_bwd_tile_partitioner.hpp"
 
 template <
     typename ScalarType,
diff --git a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_forward.h b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_forward.h
index 345c8fe35..c50f50e7c 100644
--- a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_forward.h
+++ b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_forward.h
@@ -23,9 +23,9 @@
 #include "ck_tiled_fmha_fwd_setting.h"
 #include "ck_tiled_fmha_params.h"
 
-#include "ck_tiled_fmha_forward_kernel.hpp"
-#include "ck_tiled_fmha_fwd_epilogue.hpp"
-#include "ck_tiled_fmha_fwd_tile_partitioner.hpp"
+#include "fmha_fwd_kernel.hpp"
+#include "fmha_fwd_epilogue.hpp"
+#include "fmha_fwd_tile_partitioner.hpp"
 
 template <
     typename ScalarType,
diff --git a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_infer.h b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_infer.h
index 0d976de97..af5d9588b 100644
--- a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_infer.h
+++ b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_infer.h
@@ -24,9 +24,9 @@
 #include "ck_tiled_fmha_params.h"
 #include "ck_tiled_headdim_switch.h"
 
-#include "ck_tiled_fmha_forward_kernel.hpp"
-#include "ck_tiled_fmha_fwd_epilogue.hpp"
-#include "ck_tiled_fmha_fwd_tile_partitioner.hpp"
+#include "fmha_fwd_kernel.hpp"
+#include "fmha_fwd_epilogue.hpp"
+#include "fmha_fwd_tile_partitioner.hpp"
 
 template <
     typename ScalarType,

From 51b4223749320c6ff39060e59917e2388bbb3ff7 Mon Sep 17 00:00:00 2001
From: Qianfeng Zhang <Qianfeng.Zhang@amd.com>
Date: Tue, 16 Apr 2024 15:55:55 +0000
Subject: [PATCH 525/641] Update test_mem_eff_attention.py for
 test_dropout/test_dropout_backward/test_backward on rocm

---
 tests/test_mem_eff_attention.py | 50 ++++++++++++++++++++++++++++++---
 1 file changed, 46 insertions(+), 4 deletions(-)

diff --git a/tests/test_mem_eff_attention.py b/tests/test_mem_eff_attention.py
index 1d166b336..0c623e8eb 100644
--- a/tests/test_mem_eff_attention.py
+++ b/tests/test_mem_eff_attention.py
@@ -969,6 +969,16 @@ def test_backward(
         if op_bw != fmha.cutlass.BwOp
         else fmha.cutlass.FwOp
     )
+
+    if op_bw == fmha.ck.BwOp:
+        op_fwd = fmha.ck.FwOp
+        if dtype == torch.bfloat16:
+            pytest.skip("CK Fmha backward for bfloat16 currently is not very accurate for some cases!")
+        if grad_out_contiguous == False:
+            pytest.skip("CK Fmha does not support contiguous layout for grad_out!")
+        if k % 2 != 0:
+            pytest.skip("CK Fmha currently requires the headdim size of query input be an even value!")
+
     qkv = None
 
     if (
@@ -1106,6 +1116,12 @@ def _get_drop_mask(op, batch_size, q_len, kv_len, p, device):
         rand_uniform = torch.ops.xformers._cutlass_rand_uniform(p, mask)
         mask = (rand_uniform > p).to(torch.float32)
         mask = mask.reshape(batch_size, q_len, kv_len)
+    elif op == fmha.ck.FwOp:
+        mask = torch.empty((batch_size, 1, q_len, kv_len), device=device)
+        # rand_uniform is an int8_t tensor
+        rand_uniform = torch.ops.xformers._ck_rand_uniform(p, mask)
+        mask = (rand_uniform <= int((1.0 - p) * 255.0)).to(torch.float32)
+        mask = mask.reshape(batch_size, q_len, kv_len)
     else:
         mask = torch.empty((batch_size, q_len, kv_len), device=device)
         mask = torch.ops.xformers._temp_dropout(mask, p)
@@ -1125,9 +1141,14 @@ def _get_drop_mask(op, batch_size, q_len, kv_len, p, device):
 def test_dropout(op, q_len, kv_len, batch_size, k_len, p, seed, attn_bias):
     device = "cuda"
     scale = 3
-    query = torch.randn((batch_size, q_len, k_len), device=device) * scale
-    key = torch.randn((batch_size, kv_len, k_len), device=device) * scale
-    value = torch.randn((batch_size, kv_len, k_len), device=device) * scale
+
+    dtype=torch.float
+    if torch.version.hip and op == fmha.ck.FwOp:
+        dtype=torch.float16
+
+    query = torch.randn((batch_size, q_len, k_len), device=device, dtype=dtype) * scale
+    key = torch.randn((batch_size, kv_len, k_len), device=device, dtype=dtype) * scale
+    value = torch.randn((batch_size, kv_len, k_len), device=device, dtype=dtype) * scale
 
     inputs_for_support_check = fmha.Inputs(query, key, value, attn_bias, p, None)
     if not op.supports(inputs_for_support_check):
@@ -1149,7 +1170,11 @@ def test_dropout(op, q_len, kv_len, batch_size, k_len, p, seed, attn_bias):
     torch.manual_seed(seed)
     mask = _get_drop_mask(op, batch_size, q_len, kv_len, p, device)
     ref = ref_attention(query, key, value, attn_bias, mask, p)
-    assert_allclose(out, ref, atol=2e-4), f"{(out - ref).abs().max()}"
+
+    if dtype is torch.float:
+        assert_allclose(out, ref, atol=2e-4), f"{(out - ref).abs().max()}"
+    else:
+        assert_allclose(out.float(), ref, atol=2.2e-2), f"{(out - ref).abs().max()}"
 
     num_trials = 1000
     p_val_tol = 1e-6
@@ -1267,6 +1292,23 @@ def test_dropout_backward_cutlass(dt, q_len, kv_len, batch_size, k, p):
         dtype={"f16": torch.float16, "bf16": torch.bfloat16, "f32": torch.float32}[dt],
     )
 
+cuda_only
+@pytest.mark.parametrize("p", [0.000001, 0.3, 0.7])
+@pytest.mark.parametrize("k", [16, 64, 128])
+@pytest.mark.parametrize("batch_size", [1, 2])
+@pytest.mark.parametrize("kv_len", [3, 248, 256])
+@pytest.mark.parametrize("q_len", [3, 248, 256])
+@pytest.mark.parametrize("dt", ["f16"])
+def test_dropout_backward_ck(dt, q_len, kv_len, batch_size, k, p):
+    _test_dropout_backward(
+        q_len,
+        kv_len,
+        batch_size,
+        k,
+        p,
+        op=fmha.ck.FwOp,
+        dtype={"f16": torch.float16, "bf16": torch.bfloat16, "f32": torch.float32}[dt],
+    )
 
 @cuda_only
 @disable_on_rocm

From d10ef791f131ce179e37f554862539943e882768 Mon Sep 17 00:00:00 2001
From: Qianfeng Zhang <Qianfeng.Zhang@amd.com>
Date: Tue, 16 Apr 2024 16:32:34 +0000
Subject: [PATCH 526/641] Tiny change to the philox_cuda_state input setting

---
 xformers/csrc/attention/hip_fmha/attention_ck_rand_uniform.cpp | 3 ++-
 .../attention/hip_fmha/attention_forward_generic_ck_tiled.cpp  | 3 ++-
 2 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/xformers/csrc/attention/hip_fmha/attention_ck_rand_uniform.cpp b/xformers/csrc/attention/hip_fmha/attention_ck_rand_uniform.cpp
index f751e751e..b3e241844 100644
--- a/xformers/csrc/attention/hip_fmha/attention_ck_rand_uniform.cpp
+++ b/xformers/csrc/attention/hip_fmha/attention_ck_rand_uniform.cpp
@@ -43,7 +43,8 @@ at::Tensor rand_uniform_int(
   at::PhiloxCudaState rng_engine_inputs;
   {
     std::lock_guard<std::mutex> lock(gen->mutex_);
-    rng_engine_inputs = gen->philox_cuda_state(B * num_heads * M * N);
+    rng_engine_inputs =
+        gen->philox_cuda_state((B + 3) * (num_heads + 1) * (M + 1) * (N + 1));
   }
 
   const auto seeds = at::cuda::philox::unpack(rng_engine_inputs);
diff --git a/xformers/csrc/attention/hip_fmha/attention_forward_generic_ck_tiled.cpp b/xformers/csrc/attention/hip_fmha/attention_forward_generic_ck_tiled.cpp
index 48d37357b..ba2fb56b7 100644
--- a/xformers/csrc/attention/hip_fmha/attention_forward_generic_ck_tiled.cpp
+++ b/xformers/csrc/attention/hip_fmha/attention_forward_generic_ck_tiled.cpp
@@ -132,7 +132,8 @@ efficient_attention_forward_ck(
     std::lock_guard<std::mutex> lock(gen->mutex_);
     // if using dropout, we produce 1 random number for each element of the
     // attention tensor
-    rng_engine_inputs = gen->philox_cuda_state(B * Hq * M * N);
+    rng_engine_inputs =
+        gen->philox_cuda_state((B + 3) * (Hq + 1) * (M + 1) * (N + 1));
 
     const auto seeds = at::cuda::philox::unpack(rng_engine_inputs);
 

From 25bd72046d64ef7a241799bb2350c49501caca7e Mon Sep 17 00:00:00 2001
From: Qianfeng Zhang <Qianfeng.Zhang@amd.com>
Date: Tue, 16 Apr 2024 18:00:43 +0000
Subject: [PATCH 527/641] Allocate logsumexp to ensure aligned access by each
 thread-group

---
 .../hip_fmha/attention_backward_generic_ck_tiled.cpp          | 4 ++--
 .../attention/hip_fmha/attention_forward_generic_ck_tiled.cpp | 4 +++-
 2 files changed, 5 insertions(+), 3 deletions(-)

diff --git a/xformers/csrc/attention/hip_fmha/attention_backward_generic_ck_tiled.cpp b/xformers/csrc/attention/hip_fmha/attention_backward_generic_ck_tiled.cpp
index ac4bceeef..01d9ba0a8 100644
--- a/xformers/csrc/attention/hip_fmha/attention_backward_generic_ck_tiled.cpp
+++ b/xformers/csrc/attention/hip_fmha/attention_backward_generic_ck_tiled.cpp
@@ -207,7 +207,7 @@ efficient_attention_backward_ck(
 
     TORCH_CHECK(p.B == logsumexp.size(0));
     TORCH_CHECK(p.Hq == logsumexp.size(1));
-    TORCH_CHECK(p.M == logsumexp.size(2));
+    TORCH_CHECK(p.M <= logsumexp.size(2));
 
     if (scale.has_value()) {
       p.scale = float(*scale);
@@ -333,7 +333,7 @@ efficient_attention_backward_ck(
 
     TORCH_CHECK(p.num_batches == logsumexp.size(0));
     TORCH_CHECK(p.Hq == logsumexp.size(1));
-    TORCH_CHECK(p.max_seqlen_q == logsumexp.size(2));
+    TORCH_CHECK(p.max_seqlen_q <= logsumexp.size(2));
 
     if (scale.has_value())
       p.scale = float(*scale);
diff --git a/xformers/csrc/attention/hip_fmha/attention_forward_generic_ck_tiled.cpp b/xformers/csrc/attention/hip_fmha/attention_forward_generic_ck_tiled.cpp
index ba2fb56b7..de1e65dc2 100644
--- a/xformers/csrc/attention/hip_fmha/attention_forward_generic_ck_tiled.cpp
+++ b/xformers/csrc/attention/hip_fmha/attention_forward_generic_ck_tiled.cpp
@@ -346,8 +346,10 @@ efficient_attention_forward_ck(
       p.dropout_prob = 0.0f;
 
     if (p.compute_logsumexp) {
+      // align the access of logsumexp by each thread-group in cache-line size
+      int aligned_seqlen_q = (p.max_seqlen_q + 15) / 16 * 16;
       logsumexp = at::empty(
-          {p.num_batches, Hq, p.max_seqlen_q}, opts.dtype(at::kFloat));
+          {p.num_batches, Hq, aligned_seqlen_q}, opts.dtype(at::kFloat));
       p.logsumexp_ptr = logsumexp.data_ptr();
       p.lse_strides = {
           static_cast<int>(logsumexp.stride(0)),

From abfdc27c212d4c9d48ff65db9e2c74c364cae344 Mon Sep 17 00:00:00 2001
From: Qianfeng Zhang <Qianfeng.Zhang@amd.com>
Date: Tue, 16 Apr 2024 18:06:21 +0000
Subject: [PATCH 528/641] Add checking for query/key headdim size
 attention_backward_generic

---
 .../hip_fmha/attention_backward_generic_ck_tiled.cpp          | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/xformers/csrc/attention/hip_fmha/attention_backward_generic_ck_tiled.cpp b/xformers/csrc/attention/hip_fmha/attention_backward_generic_ck_tiled.cpp
index 01d9ba0a8..2fe1150dc 100644
--- a/xformers/csrc/attention/hip_fmha/attention_backward_generic_ck_tiled.cpp
+++ b/xformers/csrc/attention/hip_fmha/attention_backward_generic_ck_tiled.cpp
@@ -122,6 +122,10 @@ efficient_attention_backward_ck(
   int64_t K = query.size(3);
   int64_t Kv = value.size(3);
 
+  if (K % 2 != 0)
+    throw std::runtime_error(
+        "Currently CK Fmha requires the headdim of query/key be an even value!");
+
   auto opts = query.options();
 
   at::Tensor grad_q, grad_k, grad_v, grad_bias;

From ff953674421e8097dc3a1dd2c55a2dbd8440f100 Mon Sep 17 00:00:00 2001
From: Qianfeng Zhang <Qianfeng.Zhang@amd.com>
Date: Mon, 22 Apr 2024 15:21:46 +0000
Subject: [PATCH 529/641] Using ck_tile/opt_padding_fa_train_pr2 and
 synchronize the backward codes with the changes

---
 .gitmodules                                   |  2 +-
 third_party/composable_kernel_tiled           |  2 +-
 .../hip_fmha/ck_tiled_fmha_batched_backward.h | 20 +++++++++----------
 .../hip_fmha/ck_tiled_fmha_grouped_backward.h | 20 +++++++++----------
 4 files changed, 22 insertions(+), 22 deletions(-)

diff --git a/.gitmodules b/.gitmodules
index 6a58ce8c2..325ca5fbf 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -7,4 +7,4 @@
 [submodule "third_party/composable_kernel_tiled"]
 	path = third_party/composable_kernel_tiled
 	url = https://github.com/ROCm/composable_kernel-internal.git
-	branch = ck_tile/opt_padding_fa_train_pr
+	branch = ck_tile/opt_padding_fa_train_pr2
diff --git a/third_party/composable_kernel_tiled b/third_party/composable_kernel_tiled
index bbf7e3d0a..f949afaea 160000
--- a/third_party/composable_kernel_tiled
+++ b/third_party/composable_kernel_tiled
@@ -1 +1 @@
-Subproject commit bbf7e3d0a4c550e54d383d8214c087d2fc184205
+Subproject commit f949afaea4abfc426676b7b9cb7e931664f9b5e8
diff --git a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_backward.h b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_backward.h
index f84eb306b..904cd930e 100644
--- a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_backward.h
+++ b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_backward.h
@@ -24,8 +24,8 @@
 #include "ck_tiled_fmha_bwd_setting.h"
 #include "ck_tiled_fmha_params.h"
 
-#include "fmha_bwd_kernel.hpp"
 #include "fmha_bwd_epilogue.hpp"
+#include "fmha_bwd_kernel.hpp"
 #include "fmha_bwd_tile_partitioner.hpp"
 
 template <
@@ -150,12 +150,12 @@ struct batched_backward_causalmask_bias_dispatch {
                   FmhaBwdLoadStrategy_,
                   FmhaBwdPipelineProblem>::BlockPipeline;
 
-          using FmhaBwdKernel_ = FmhaBwdKernel<
+          using FmhaBwdQKVGradKernel_ = FmhaBwdQKVGradKernel<
               FmhaBwdTilePartitioner_,
               FmhaBwdPipeline_,
               FmhaBwdEpilogue_>;
 
-          RunWithBwdKernel<FmhaBwdKernel_>(param, stream);
+          RunWithBwdQKVGradKernel<FmhaBwdQKVGradKernel_>(param, stream);
         });
       });
     };
@@ -197,12 +197,12 @@ struct batched_backward_causalmask_bias_dispatch {
         kargs);
   }
 
-  template <typename FmhaBwdKernel>
-  static void RunWithBwdKernel(
+  template <typename FmhaBwdQKVGradKernel>
+  static void RunWithBwdQKVGradKernel(
       BatchedBackwardParams& param,
       hipStream_t stream) {
     const auto kargs = [&] {
-      return FmhaBwdKernel::MakeKargs(
+      return FmhaBwdQKVGradKernel::MakeKargs(
           param.q_ptr,
           param.k_ptr,
           param.v_ptr,
@@ -264,13 +264,13 @@ struct batched_backward_causalmask_bias_dispatch {
           {param.philox_seed, param.philox_offset});
     }();
 
-    dim3 kGridSize = FmhaBwdKernel::GridSize(param.B, param.Hq, param.N);
-    constexpr dim3 kBlockSize = FmhaBwdKernel::BlockSize();
-    constexpr ck::index_t kBlockPerCu = FmhaBwdKernel::kBlockPerCu;
+    dim3 kGridSize = FmhaBwdQKVGradKernel::GridSize(param.B, param.Hq, param.N);
+    constexpr dim3 kBlockSize = FmhaBwdQKVGradKernel::BlockSize();
+    constexpr ck::index_t kBlockPerCu = FmhaBwdQKVGradKernel::kBlockPerCu;
 
     (void)launch_kernel<kBlockSize.x, kBlockPerCu>(
         StreamConfig{stream, false},
-        FmhaBwdKernel{},
+        FmhaBwdQKVGradKernel{},
         kGridSize,
         kBlockSize,
         0,
diff --git a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_backward.h b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_backward.h
index c0b54ece8..c61cf11bc 100644
--- a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_backward.h
+++ b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_backward.h
@@ -24,8 +24,8 @@
 #include "ck_tiled_fmha_bwd_setting.h"
 #include "ck_tiled_fmha_params.h"
 
-#include "fmha_bwd_kernel.hpp"
 #include "fmha_bwd_epilogue.hpp"
+#include "fmha_bwd_kernel.hpp"
 #include "fmha_bwd_tile_partitioner.hpp"
 
 template <
@@ -148,12 +148,12 @@ struct grouped_backward_causalmask_bias_dispatch {
                   FmhaBwdLoadStrategy_,
                   FmhaBwdPipelineProblem>::BlockPipeline;
 
-          using FmhaBwdKernel_ = FmhaBwdKernel<
+          using FmhaBwdQKVGradKernel_ = FmhaBwdQKVGradKernel<
               FmhaBwdTilePartitioner_,
               FmhaBwdPipeline_,
               FmhaBwdEpilogue_>;
 
-          RunWithBwdKernel<FmhaBwdKernel_>(param, stream);
+          RunWithBwdQKVGradKernel<FmhaBwdQKVGradKernel_>(param, stream);
         });
       });
     };
@@ -193,12 +193,12 @@ struct grouped_backward_causalmask_bias_dispatch {
         kargs);
   }
 
-  template <typename FmhaBwdKernel>
-  static void RunWithBwdKernel(
+  template <typename FmhaBwdQKVGradKernel>
+  static void RunWithBwdQKVGradKernel(
       GroupedBackwardParams& param,
       hipStream_t stream) {
     const auto kargs = [&] {
-      return FmhaBwdKernel::MakeKargs(
+      return FmhaBwdQKVGradKernel::MakeKargs(
           param.q_ptr,
           param.k_ptr,
           param.v_ptr,
@@ -250,14 +250,14 @@ struct grouped_backward_causalmask_bias_dispatch {
           {param.philox_seed, param.philox_offset});
     }();
 
-    dim3 kGridSize = FmhaBwdKernel::GridSize(
+    dim3 kGridSize = FmhaBwdQKVGradKernel::GridSize(
         param.num_batches, param.Hq, param.max_seqlen_k);
-    constexpr dim3 kBlockSize = FmhaBwdKernel::BlockSize();
-    constexpr ck::index_t kBlockPerCu = FmhaBwdKernel::kBlockPerCu;
+    constexpr dim3 kBlockSize = FmhaBwdQKVGradKernel::BlockSize();
+    constexpr ck::index_t kBlockPerCu = FmhaBwdQKVGradKernel::kBlockPerCu;
 
     (void)launch_kernel<kBlockSize.x, kBlockPerCu>(
         StreamConfig{stream, false},
-        FmhaBwdKernel{},
+        FmhaBwdQKVGradKernel{},
         kGridSize,
         kBlockSize,
         0,

From 93469ab1c10afd6ef6851b8d36cb5807706b103b Mon Sep 17 00:00:00 2001
From: Qianfeng Zhang <Qianfeng.Zhang@amd.com>
Date: Mon, 22 Apr 2024 15:23:38 +0000
Subject: [PATCH 530/641] Enable using async pipeline in the batched inference
 path for performance

---
 .../hip_fmha/ck_tiled_fmha_batched_infer.h    | 150 +++++++++---------
 1 file changed, 75 insertions(+), 75 deletions(-)

diff --git a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_infer.h b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_infer.h
index b99fb7afc..2b43cb677 100644
--- a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_infer.h
+++ b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_infer.h
@@ -25,8 +25,8 @@
 #include "ck_tiled_fmha_params.h"
 #include "ck_tiled_headdim_switch.h"
 
-#include "fmha_fwd_kernel.hpp"
 #include "fmha_fwd_epilogue.hpp"
+#include "fmha_fwd_kernel.hpp"
 #include "fmha_fwd_tile_partitioner.hpp"
 
 template <
@@ -81,80 +81,80 @@ struct batched_infer_causalmask_bias_dispatch {
       const bool use_async_pipeline =
           ((param.K % 8 == 0) && (param.Kv % 8 == 0) && (MaxK <= 128));
 
-      /*     if (!use_async_pipeline) { */
-      BOOL_SWITCH_4(
-          has_dropout,
-          kHasDropout,
-          pad_seqlen_q,
-          kPadSeqLenQ,
-          pad_seqlen_k,
-          kPadSeqLenK,
-          pad_headdim,
-          kPadHeadDim,
-          [&] {
-            using FmhaTraits = ck::tile_program::TileFmhaTraits<
-                kPadSeqLenQ,
-                kPadSeqLenK,
-                kPadHeadDim, // kPadHeadDimQ,
-                kPadHeadDim, // kPadHeadDimV,
-                kHasBias,
-                false, // kHasBiasGrad place-holder
-                false, // kStoreLSE
-                kHasDropout,
-                occupancy>;
-
-            using FmhaPipelineProblem =
-                FmhaPipelineProblemTemp<FmhaTraits, FmhaMask>;
-
-            using FmhaPipeline =
-                ck::tile_program::block::BlockFmhaPipelineQRKSVS<
-                    FmhaPipelineProblem>;
-
-            using FmhaEpilogue = FmhaFwdEpilogue<FmhaFwdEpilogueProblem<
-                typename FmhaFwdTypeConfig<ScalarType>::OaccDataType,
-                typename FmhaFwdTypeConfig<ScalarType>::ODataType,
-                kPadSeqLenQ,
-                kPadHeadDim>>;
-
-            using FmhaKernel =
-                FmhaFwdKernel<FmhaTilePartitioner, FmhaPipeline, FmhaEpilogue>;
-
-            RunWithKernel<FmhaKernel>(param, stream);
-          });
-      /*
-            } else {
-              BOOL_SWITCH(pad_seqlen_k, kPadSeqLenK, [&] {
-                using FmhaTraits = ck::tile_program::TileFmhaTraits<
-                    true, // kPadSeqLenQ,
-                    kPadSeqLenK,
-                    true, // kPadHeadDimQ,
-                    true, // kPadHeadDimV,
-                    kHasBias,
-                    false, // kStoreLSE
-                    kHasDropout,
-                    occupancy>;
-
-                using FmhaPipelineProblem =
-                    FmhaPipelineProblemTemp<FmhaTraits, FmhaMask>;
-
-                using FmhaPipeline =
-                    ck::tile_program::block::BlockFmhaPipelineQRKSVSAsync<
-                        FmhaPipelineProblem>;
-
-                using FmhaEpilogue = FmhaFwdEpilogue<FmhaFwdEpilogueProblem<
-                    typename FmhaFwdTypeConfig<ScalarType>::OaccDataType,
-                    typename FmhaFwdTypeConfig<ScalarType>::ODataType,
-                    true,
-                    true>>;
-
-                using FmhaKernel =
-                    FmhaFwdKernel<FmhaTilePartitioner, FmhaPipeline,
-         FmhaEpilogue>;
-
-                RunWithKernel<FmhaKernel>(param, stream);
-              });
-            };
-      */
+      if (!use_async_pipeline) {
+        BOOL_SWITCH_4(
+            has_dropout,
+            kHasDropout,
+            pad_seqlen_q,
+            kPadSeqLenQ,
+            pad_seqlen_k,
+            kPadSeqLenK,
+            pad_headdim,
+            kPadHeadDim,
+            [&] {
+              using FmhaTraits = ck::tile_program::TileFmhaTraits<
+                  kPadSeqLenQ,
+                  kPadSeqLenK,
+                  kPadHeadDim, // kPadHeadDimQ,
+                  kPadHeadDim, // kPadHeadDimV,
+                  kHasBias,
+                  false, // kHasBiasGrad place-holder
+                  false, // kStoreLSE
+                  kHasDropout,
+                  occupancy>;
+
+              using FmhaPipelineProblem =
+                  FmhaPipelineProblemTemp<FmhaTraits, FmhaMask>;
+
+              using FmhaPipeline =
+                  ck::tile_program::block::BlockFmhaPipelineQRKSVS<
+                      FmhaPipelineProblem>;
+
+              using FmhaEpilogue = FmhaFwdEpilogue<FmhaFwdEpilogueProblem<
+                  typename FmhaFwdTypeConfig<ScalarType>::OaccDataType,
+                  typename FmhaFwdTypeConfig<ScalarType>::ODataType,
+                  kPadSeqLenQ,
+                  kPadHeadDim>>;
+
+              using FmhaKernel = FmhaFwdKernel<
+                  FmhaTilePartitioner,
+                  FmhaPipeline,
+                  FmhaEpilogue>;
+
+              RunWithKernel<FmhaKernel>(param, stream);
+            });
+      } else {
+        BOOL_SWITCH_2(has_dropout, kHasDropout, pad_seqlen_k, kPadSeqLenK, [&] {
+          using FmhaTraits = ck::tile_program::TileFmhaTraits<
+              true, // kPadSeqLenQ,
+              kPadSeqLenK,
+              true, // kPadHeadDimQ,
+              true, // kPadHeadDimV,
+              kHasBias,
+              false, // kHasBiasGrad place-holder
+              false, // kStoreLSE
+              kHasDropout,
+              occupancy>;
+
+          using FmhaPipelineProblem =
+              FmhaPipelineProblemTemp<FmhaTraits, FmhaMask>;
+
+          using FmhaPipeline =
+              ck::tile_program::block::BlockFmhaPipelineQRKSVSAsync<
+                  FmhaPipelineProblem>;
+
+          using FmhaEpilogue = FmhaFwdEpilogue<FmhaFwdEpilogueProblem<
+              typename FmhaFwdTypeConfig<ScalarType>::OaccDataType,
+              typename FmhaFwdTypeConfig<ScalarType>::ODataType,
+              true,
+              true>>;
+
+          using FmhaKernel =
+              FmhaFwdKernel<FmhaTilePartitioner, FmhaPipeline, FmhaEpilogue>;
+
+          RunWithKernel<FmhaKernel>(param, stream);
+        });
+      };
     });
   };
 

From 2c8626be546f457b3f7acca1328e777a6442c9c1 Mon Sep 17 00:00:00 2001
From: Qianfeng Zhang <Qianfeng.Zhang@amd.com>
Date: Tue, 23 Apr 2024 07:08:15 +0000
Subject: [PATCH 531/641] Re-organize cpp instances for calling fmha infer
 kernel

---
 .../hip_fmha/ck_tiled_fmha_batched_infer.h    | 16 ++--
 .../ck_tiled_fmha_batched_infer_bp16.cpp      | 77 ++++++++++++++-----
 .../ck_tiled_fmha_batched_infer_fp16.cpp      | 77 ++++++++++++++-----
 .../hip_fmha/ck_tiled_fmha_grouped_infer.h    | 22 +++---
 .../ck_tiled_fmha_grouped_infer_bp16.cpp      | 77 ++++++++++++++-----
 .../ck_tiled_fmha_grouped_infer_fp16.cpp      | 77 ++++++++++++++-----
 ...ask_has_attnbias_has_dropout_maxk_128.cpp} |  3 +-
 ...ask_has_attnbias_has_dropout_maxk_256.cpp} |  3 +-
 ...mask_has_attnbias_has_dropout_maxk_32.cpp} |  3 +-
 ...mask_has_attnbias_has_dropout_maxk_64.cpp} |  3 +-
 ...mask_has_attnbias_no_dropout_maxk_128.cpp} |  3 +-
 ...mask_has_attnbias_no_dropout_maxk_256.cpp} |  5 +-
 ...lmask_has_attnbias_no_dropout_maxk_32.cpp} |  5 +-
 ...lmask_has_attnbias_no_dropout_maxk_64.cpp} |  3 +-
 ...mask_no_attnbias_has_dropout_maxk_128.cpp} |  3 +-
 ...mask_no_attnbias_has_dropout_maxk_256.cpp} |  3 +-
 ...lmask_no_attnbias_has_dropout_maxk_32.cpp} |  3 +-
 ...lmask_no_attnbias_has_dropout_maxk_64.cpp} |  3 +-
 ...lmask_no_attnbias_no_dropout_maxk_128.cpp} |  3 +-
 ...lmask_no_attnbias_no_dropout_maxk_256.cpp} |  3 +-
 ...almask_no_attnbias_no_dropout_maxk_32.cpp} |  3 +-
 ...almask_no_attnbias_no_dropout_maxk_64.cpp} |  3 +-
 ...mask_has_attnbias_has_dropout_maxk_128.cpp | 16 ++++
 ...mask_has_attnbias_has_dropout_maxk_256.cpp | 16 ++++
 ...lmask_has_attnbias_has_dropout_maxk_32.cpp | 16 ++++
 ...lmask_has_attnbias_has_dropout_maxk_64.cpp | 16 ++++
 ...lmask_has_attnbias_no_dropout_maxk_128.cpp | 16 ++++
 ...lmask_has_attnbias_no_dropout_maxk_256.cpp | 16 ++++
 ...almask_has_attnbias_no_dropout_maxk_32.cpp | 16 ++++
 ...almask_has_attnbias_no_dropout_maxk_64.cpp | 16 ++++
 ...lmask_no_attnbias_has_dropout_maxk_128.cpp | 16 ++++
 ...lmask_no_attnbias_has_dropout_maxk_256.cpp | 16 ++++
 ...almask_no_attnbias_has_dropout_maxk_32.cpp | 16 ++++
 ...almask_no_attnbias_has_dropout_maxk_64.cpp | 16 ++++
 ...almask_no_attnbias_no_dropout_maxk_128.cpp | 16 ++++
 ...almask_no_attnbias_no_dropout_maxk_256.cpp | 16 ++++
 ...salmask_no_attnbias_no_dropout_maxk_32.cpp | 16 ++++
 ...salmask_no_attnbias_no_dropout_maxk_64.cpp | 16 ++++
 ...ask_has_attnbias_has_dropout_maxk_128.cpp} |  3 +-
 ...ask_has_attnbias_has_dropout_maxk_256.cpp} |  3 +-
 ...mask_has_attnbias_has_dropout_maxk_32.cpp} |  3 +-
 ...mask_has_attnbias_has_dropout_maxk_64.cpp} |  3 +-
 ...mask_has_attnbias_no_dropout_maxk_128.cpp} |  3 +-
 ...mask_has_attnbias_no_dropout_maxk_256.cpp} |  5 +-
 ...lmask_has_attnbias_no_dropout_maxk_32.cpp} |  5 +-
 ...lmask_has_attnbias_no_dropout_maxk_64.cpp} |  5 +-
 ...mask_no_attnbias_has_dropout_maxk_128.cpp} |  3 +-
 ...mask_no_attnbias_has_dropout_maxk_256.cpp} |  3 +-
 ...lmask_no_attnbias_has_dropout_maxk_32.cpp} |  3 +-
 ...lmask_no_attnbias_has_dropout_maxk_64.cpp} |  3 +-
 ...lmask_no_attnbias_no_dropout_maxk_128.cpp} |  3 +-
 ...lmask_no_attnbias_no_dropout_maxk_256.cpp} |  3 +-
 ...almask_no_attnbias_no_dropout_maxk_32.cpp} |  3 +-
 ...almask_no_attnbias_no_dropout_maxk_64.cpp} |  3 +-
 ...mask_has_attnbias_has_dropout_maxk_128.cpp | 16 ++++
 ...mask_has_attnbias_has_dropout_maxk_256.cpp | 16 ++++
 ...lmask_has_attnbias_has_dropout_maxk_32.cpp | 16 ++++
 ...lmask_has_attnbias_has_dropout_maxk_64.cpp | 16 ++++
 ...lmask_has_attnbias_no_dropout_maxk_128.cpp | 16 ++++
 ...lmask_has_attnbias_no_dropout_maxk_256.cpp | 16 ++++
 ...almask_has_attnbias_no_dropout_maxk_32.cpp | 16 ++++
 ...almask_has_attnbias_no_dropout_maxk_64.cpp | 16 ++++
 ...lmask_no_attnbias_has_dropout_maxk_128.cpp | 16 ++++
 ...lmask_no_attnbias_has_dropout_maxk_256.cpp | 16 ++++
 ...almask_no_attnbias_has_dropout_maxk_32.cpp | 16 ++++
 ...almask_no_attnbias_has_dropout_maxk_64.cpp | 16 ++++
 ...almask_no_attnbias_no_dropout_maxk_128.cpp | 16 ++++
 ...almask_no_attnbias_no_dropout_maxk_256.cpp | 16 ++++
 ...salmask_no_attnbias_no_dropout_maxk_32.cpp | 16 ++++
 ...salmask_no_attnbias_no_dropout_maxk_64.cpp | 16 ++++
 ...ask_has_attnbias_has_dropout_maxk_128.cpp} |  3 +-
 ...ask_has_attnbias_has_dropout_maxk_256.cpp} |  3 +-
 ...mask_has_attnbias_has_dropout_maxk_32.cpp} |  3 +-
 ...mask_has_attnbias_has_dropout_maxk_64.cpp} |  3 +-
 ...mask_has_attnbias_no_dropout_maxk_128.cpp} |  3 +-
 ...mask_has_attnbias_no_dropout_maxk_256.cpp} |  5 +-
 ...lmask_has_attnbias_no_dropout_maxk_32.cpp} |  5 +-
 ...lmask_has_attnbias_no_dropout_maxk_64.cpp} |  3 +-
 ...mask_no_attnbias_has_dropout_maxk_128.cpp} |  3 +-
 ...mask_no_attnbias_has_dropout_maxk_256.cpp} |  3 +-
 ...lmask_no_attnbias_has_dropout_maxk_32.cpp} |  3 +-
 ...lmask_no_attnbias_has_dropout_maxk_64.cpp} |  3 +-
 ...lmask_no_attnbias_no_dropout_maxk_128.cpp} |  3 +-
 ...lmask_no_attnbias_no_dropout_maxk_256.cpp} |  3 +-
 ...almask_no_attnbias_no_dropout_maxk_32.cpp} |  3 +-
 ...almask_no_attnbias_no_dropout_maxk_64.cpp} |  3 +-
 ...mask_has_attnbias_has_dropout_maxk_128.cpp | 16 ++++
 ...mask_has_attnbias_has_dropout_maxk_256.cpp | 16 ++++
 ...lmask_has_attnbias_has_dropout_maxk_32.cpp | 16 ++++
 ...lmask_has_attnbias_has_dropout_maxk_64.cpp | 16 ++++
 ...lmask_has_attnbias_no_dropout_maxk_128.cpp | 16 ++++
 ...lmask_has_attnbias_no_dropout_maxk_256.cpp | 16 ++++
 ...almask_has_attnbias_no_dropout_maxk_32.cpp | 16 ++++
 ...almask_has_attnbias_no_dropout_maxk_64.cpp | 16 ++++
 ...lmask_no_attnbias_has_dropout_maxk_128.cpp | 16 ++++
 ...lmask_no_attnbias_has_dropout_maxk_256.cpp | 16 ++++
 ...almask_no_attnbias_has_dropout_maxk_32.cpp | 16 ++++
 ...almask_no_attnbias_has_dropout_maxk_64.cpp | 16 ++++
 ...almask_no_attnbias_no_dropout_maxk_128.cpp | 16 ++++
 ...almask_no_attnbias_no_dropout_maxk_256.cpp | 16 ++++
 ...salmask_no_attnbias_no_dropout_maxk_32.cpp | 16 ++++
 ...salmask_no_attnbias_no_dropout_maxk_64.cpp | 16 ++++
 ...ask_has_attnbias_has_dropout_maxk_128.cpp} |  3 +-
 ...ask_has_attnbias_has_dropout_maxk_256.cpp} |  3 +-
 ...mask_has_attnbias_has_dropout_maxk_32.cpp} |  3 +-
 ...mask_has_attnbias_has_dropout_maxk_64.cpp} |  3 +-
 ...mask_has_attnbias_no_dropout_maxk_128.cpp} |  3 +-
 ...mask_has_attnbias_no_dropout_maxk_256.cpp} |  5 +-
 ...lmask_has_attnbias_no_dropout_maxk_32.cpp} |  5 +-
 ...lmask_has_attnbias_no_dropout_maxk_64.cpp} |  5 +-
 ...mask_no_attnbias_has_dropout_maxk_128.cpp} |  3 +-
 ...mask_no_attnbias_has_dropout_maxk_256.cpp} |  3 +-
 ...lmask_no_attnbias_has_dropout_maxk_32.cpp} |  3 +-
 ...lmask_no_attnbias_has_dropout_maxk_64.cpp} |  3 +-
 ...lmask_no_attnbias_no_dropout_maxk_128.cpp} |  3 +-
 ...lmask_no_attnbias_no_dropout_maxk_256.cpp} |  3 +-
 ...almask_no_attnbias_no_dropout_maxk_32.cpp} |  3 +-
 ...almask_no_attnbias_no_dropout_maxk_64.cpp} |  3 +-
 ...mask_has_attnbias_has_dropout_maxk_128.cpp | 16 ++++
 ...mask_has_attnbias_has_dropout_maxk_256.cpp | 16 ++++
 ...lmask_has_attnbias_has_dropout_maxk_32.cpp | 16 ++++
 ...lmask_has_attnbias_has_dropout_maxk_64.cpp | 16 ++++
 ...lmask_has_attnbias_no_dropout_maxk_128.cpp | 16 ++++
 ...lmask_has_attnbias_no_dropout_maxk_256.cpp | 16 ++++
 ...almask_has_attnbias_no_dropout_maxk_32.cpp | 16 ++++
 ...almask_has_attnbias_no_dropout_maxk_64.cpp | 16 ++++
 ...lmask_no_attnbias_has_dropout_maxk_128.cpp | 16 ++++
 ...lmask_no_attnbias_has_dropout_maxk_256.cpp | 16 ++++
 ...almask_no_attnbias_has_dropout_maxk_32.cpp | 16 ++++
 ...almask_no_attnbias_has_dropout_maxk_64.cpp | 16 ++++
 ...almask_no_attnbias_no_dropout_maxk_128.cpp | 16 ++++
 ...almask_no_attnbias_no_dropout_maxk_256.cpp | 16 ++++
 ...salmask_no_attnbias_no_dropout_maxk_32.cpp | 16 ++++
 ...salmask_no_attnbias_no_dropout_maxk_64.cpp | 16 ++++
 134 files changed, 1411 insertions(+), 171 deletions(-)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_batched_infer_bp16_has_causalmask_has_attnbias_maxk_128.cpp => fmha_batched_infer_bp16_has_causalmask_has_attnbias_has_dropout_maxk_128.cpp} (83%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_batched_infer_bp16_has_causalmask_has_attnbias_maxk_256.cpp => fmha_batched_infer_bp16_has_causalmask_has_attnbias_has_dropout_maxk_256.cpp} (83%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_batched_infer_bp16_has_causalmask_has_attnbias_maxk_32.cpp => fmha_batched_infer_bp16_has_causalmask_has_attnbias_has_dropout_maxk_32.cpp} (83%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_batched_infer_bp16_has_causalmask_has_attnbias_maxk_64.cpp => fmha_batched_infer_bp16_has_causalmask_has_attnbias_has_dropout_maxk_64.cpp} (83%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_batched_infer_bp16_has_causalmask_no_attnbias_maxk_128.cpp => fmha_batched_infer_bp16_has_causalmask_has_attnbias_no_dropout_maxk_128.cpp} (83%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_batched_infer_bp16_no_causalmask_has_attnbias_maxk_256.cpp => fmha_batched_infer_bp16_has_causalmask_has_attnbias_no_dropout_maxk_256.cpp} (83%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_batched_infer_bp16_no_causalmask_has_attnbias_maxk_32.cpp => fmha_batched_infer_bp16_has_causalmask_has_attnbias_no_dropout_maxk_32.cpp} (83%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_batched_infer_bp16_has_causalmask_no_attnbias_maxk_64.cpp => fmha_batched_infer_bp16_has_causalmask_has_attnbias_no_dropout_maxk_64.cpp} (83%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_batched_infer_bp16_no_causalmask_has_attnbias_maxk_128.cpp => fmha_batched_infer_bp16_has_causalmask_no_attnbias_has_dropout_maxk_128.cpp} (83%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_batched_infer_bp16_has_causalmask_no_attnbias_maxk_256.cpp => fmha_batched_infer_bp16_has_causalmask_no_attnbias_has_dropout_maxk_256.cpp} (83%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_batched_infer_bp16_has_causalmask_no_attnbias_maxk_32.cpp => fmha_batched_infer_bp16_has_causalmask_no_attnbias_has_dropout_maxk_32.cpp} (83%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_batched_infer_bp16_no_causalmask_has_attnbias_maxk_64.cpp => fmha_batched_infer_bp16_has_causalmask_no_attnbias_has_dropout_maxk_64.cpp} (83%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_batched_infer_bp16_no_causalmask_no_attnbias_maxk_128.cpp => fmha_batched_infer_bp16_has_causalmask_no_attnbias_no_dropout_maxk_128.cpp} (83%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_batched_infer_bp16_no_causalmask_no_attnbias_maxk_256.cpp => fmha_batched_infer_bp16_has_causalmask_no_attnbias_no_dropout_maxk_256.cpp} (83%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_batched_infer_bp16_no_causalmask_no_attnbias_maxk_32.cpp => fmha_batched_infer_bp16_has_causalmask_no_attnbias_no_dropout_maxk_32.cpp} (83%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_batched_infer_bp16_no_causalmask_no_attnbias_maxk_64.cpp => fmha_batched_infer_bp16_has_causalmask_no_attnbias_no_dropout_maxk_64.cpp} (83%)
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_no_causalmask_has_attnbias_has_dropout_maxk_128.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_no_causalmask_has_attnbias_has_dropout_maxk_256.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_no_causalmask_has_attnbias_has_dropout_maxk_32.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_no_causalmask_has_attnbias_has_dropout_maxk_64.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_no_causalmask_has_attnbias_no_dropout_maxk_128.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_no_causalmask_has_attnbias_no_dropout_maxk_256.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_no_causalmask_has_attnbias_no_dropout_maxk_32.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_no_causalmask_has_attnbias_no_dropout_maxk_64.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_no_causalmask_no_attnbias_has_dropout_maxk_128.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_no_causalmask_no_attnbias_has_dropout_maxk_256.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_no_causalmask_no_attnbias_has_dropout_maxk_32.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_no_causalmask_no_attnbias_has_dropout_maxk_64.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_no_causalmask_no_attnbias_no_dropout_maxk_128.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_no_causalmask_no_attnbias_no_dropout_maxk_256.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_no_causalmask_no_attnbias_no_dropout_maxk_32.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_no_causalmask_no_attnbias_no_dropout_maxk_64.cpp
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_batched_infer_fp16_has_causalmask_has_attnbias_maxk_128.cpp => fmha_batched_infer_fp16_has_causalmask_has_attnbias_has_dropout_maxk_128.cpp} (83%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_batched_infer_fp16_has_causalmask_has_attnbias_maxk_256.cpp => fmha_batched_infer_fp16_has_causalmask_has_attnbias_has_dropout_maxk_256.cpp} (83%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_batched_infer_fp16_has_causalmask_has_attnbias_maxk_32.cpp => fmha_batched_infer_fp16_has_causalmask_has_attnbias_has_dropout_maxk_32.cpp} (83%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_batched_infer_fp16_has_causalmask_has_attnbias_maxk_64.cpp => fmha_batched_infer_fp16_has_causalmask_has_attnbias_has_dropout_maxk_64.cpp} (83%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_batched_infer_fp16_has_causalmask_no_attnbias_maxk_128.cpp => fmha_batched_infer_fp16_has_causalmask_has_attnbias_no_dropout_maxk_128.cpp} (83%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_batched_infer_fp16_no_causalmask_has_attnbias_maxk_256.cpp => fmha_batched_infer_fp16_has_causalmask_has_attnbias_no_dropout_maxk_256.cpp} (83%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_batched_infer_fp16_no_causalmask_has_attnbias_maxk_32.cpp => fmha_batched_infer_fp16_has_causalmask_has_attnbias_no_dropout_maxk_32.cpp} (83%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_batched_infer_fp16_no_causalmask_has_attnbias_maxk_64.cpp => fmha_batched_infer_fp16_has_causalmask_has_attnbias_no_dropout_maxk_64.cpp} (83%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_batched_infer_fp16_no_causalmask_has_attnbias_maxk_128.cpp => fmha_batched_infer_fp16_has_causalmask_no_attnbias_has_dropout_maxk_128.cpp} (83%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_batched_infer_fp16_has_causalmask_no_attnbias_maxk_256.cpp => fmha_batched_infer_fp16_has_causalmask_no_attnbias_has_dropout_maxk_256.cpp} (83%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_batched_infer_fp16_has_causalmask_no_attnbias_maxk_32.cpp => fmha_batched_infer_fp16_has_causalmask_no_attnbias_has_dropout_maxk_32.cpp} (83%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_batched_infer_fp16_has_causalmask_no_attnbias_maxk_64.cpp => fmha_batched_infer_fp16_has_causalmask_no_attnbias_has_dropout_maxk_64.cpp} (83%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_batched_infer_fp16_no_causalmask_no_attnbias_maxk_128.cpp => fmha_batched_infer_fp16_has_causalmask_no_attnbias_no_dropout_maxk_128.cpp} (83%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_batched_infer_fp16_no_causalmask_no_attnbias_maxk_256.cpp => fmha_batched_infer_fp16_has_causalmask_no_attnbias_no_dropout_maxk_256.cpp} (83%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_batched_infer_fp16_no_causalmask_no_attnbias_maxk_32.cpp => fmha_batched_infer_fp16_has_causalmask_no_attnbias_no_dropout_maxk_32.cpp} (83%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_batched_infer_fp16_no_causalmask_no_attnbias_maxk_64.cpp => fmha_batched_infer_fp16_has_causalmask_no_attnbias_no_dropout_maxk_64.cpp} (83%)
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_no_causalmask_has_attnbias_has_dropout_maxk_128.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_no_causalmask_has_attnbias_has_dropout_maxk_256.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_no_causalmask_has_attnbias_has_dropout_maxk_32.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_no_causalmask_has_attnbias_has_dropout_maxk_64.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_no_causalmask_has_attnbias_no_dropout_maxk_128.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_no_causalmask_has_attnbias_no_dropout_maxk_256.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_no_causalmask_has_attnbias_no_dropout_maxk_32.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_no_causalmask_has_attnbias_no_dropout_maxk_64.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_no_causalmask_no_attnbias_has_dropout_maxk_128.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_no_causalmask_no_attnbias_has_dropout_maxk_256.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_no_causalmask_no_attnbias_has_dropout_maxk_32.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_no_causalmask_no_attnbias_has_dropout_maxk_64.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_no_causalmask_no_attnbias_no_dropout_maxk_128.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_no_causalmask_no_attnbias_no_dropout_maxk_256.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_no_causalmask_no_attnbias_no_dropout_maxk_32.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_no_causalmask_no_attnbias_no_dropout_maxk_64.cpp
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_grouped_infer_bp16_has_causalmask_has_attnbias_maxk_128.cpp => fmha_grouped_infer_bp16_has_causalmask_has_attnbias_has_dropout_maxk_128.cpp} (83%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_grouped_infer_bp16_has_causalmask_has_attnbias_maxk_256.cpp => fmha_grouped_infer_bp16_has_causalmask_has_attnbias_has_dropout_maxk_256.cpp} (83%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_grouped_infer_bp16_has_causalmask_has_attnbias_maxk_32.cpp => fmha_grouped_infer_bp16_has_causalmask_has_attnbias_has_dropout_maxk_32.cpp} (83%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_grouped_infer_bp16_has_causalmask_has_attnbias_maxk_64.cpp => fmha_grouped_infer_bp16_has_causalmask_has_attnbias_has_dropout_maxk_64.cpp} (83%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_grouped_infer_bp16_has_causalmask_no_attnbias_maxk_128.cpp => fmha_grouped_infer_bp16_has_causalmask_has_attnbias_no_dropout_maxk_128.cpp} (83%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_grouped_infer_bp16_no_causalmask_has_attnbias_maxk_256.cpp => fmha_grouped_infer_bp16_has_causalmask_has_attnbias_no_dropout_maxk_256.cpp} (83%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_grouped_infer_bp16_no_causalmask_has_attnbias_maxk_32.cpp => fmha_grouped_infer_bp16_has_causalmask_has_attnbias_no_dropout_maxk_32.cpp} (83%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_grouped_infer_bp16_has_causalmask_no_attnbias_maxk_64.cpp => fmha_grouped_infer_bp16_has_causalmask_has_attnbias_no_dropout_maxk_64.cpp} (83%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_grouped_infer_bp16_no_causalmask_has_attnbias_maxk_128.cpp => fmha_grouped_infer_bp16_has_causalmask_no_attnbias_has_dropout_maxk_128.cpp} (83%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_grouped_infer_bp16_has_causalmask_no_attnbias_maxk_256.cpp => fmha_grouped_infer_bp16_has_causalmask_no_attnbias_has_dropout_maxk_256.cpp} (83%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_grouped_infer_bp16_has_causalmask_no_attnbias_maxk_32.cpp => fmha_grouped_infer_bp16_has_causalmask_no_attnbias_has_dropout_maxk_32.cpp} (83%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_grouped_infer_bp16_no_causalmask_has_attnbias_maxk_64.cpp => fmha_grouped_infer_bp16_has_causalmask_no_attnbias_has_dropout_maxk_64.cpp} (83%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_grouped_infer_bp16_no_causalmask_no_attnbias_maxk_128.cpp => fmha_grouped_infer_bp16_has_causalmask_no_attnbias_no_dropout_maxk_128.cpp} (83%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_grouped_infer_bp16_no_causalmask_no_attnbias_maxk_256.cpp => fmha_grouped_infer_bp16_has_causalmask_no_attnbias_no_dropout_maxk_256.cpp} (83%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_grouped_infer_bp16_no_causalmask_no_attnbias_maxk_32.cpp => fmha_grouped_infer_bp16_has_causalmask_no_attnbias_no_dropout_maxk_32.cpp} (83%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_grouped_infer_bp16_no_causalmask_no_attnbias_maxk_64.cpp => fmha_grouped_infer_bp16_has_causalmask_no_attnbias_no_dropout_maxk_64.cpp} (83%)
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_no_causalmask_has_attnbias_has_dropout_maxk_128.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_no_causalmask_has_attnbias_has_dropout_maxk_256.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_no_causalmask_has_attnbias_has_dropout_maxk_32.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_no_causalmask_has_attnbias_has_dropout_maxk_64.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_no_causalmask_has_attnbias_no_dropout_maxk_128.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_no_causalmask_has_attnbias_no_dropout_maxk_256.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_no_causalmask_has_attnbias_no_dropout_maxk_32.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_no_causalmask_has_attnbias_no_dropout_maxk_64.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_no_causalmask_no_attnbias_has_dropout_maxk_128.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_no_causalmask_no_attnbias_has_dropout_maxk_256.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_no_causalmask_no_attnbias_has_dropout_maxk_32.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_no_causalmask_no_attnbias_has_dropout_maxk_64.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_no_causalmask_no_attnbias_no_dropout_maxk_128.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_no_causalmask_no_attnbias_no_dropout_maxk_256.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_no_causalmask_no_attnbias_no_dropout_maxk_32.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_no_causalmask_no_attnbias_no_dropout_maxk_64.cpp
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_grouped_infer_fp16_has_causalmask_has_attnbias_maxk_128.cpp => fmha_grouped_infer_fp16_has_causalmask_has_attnbias_has_dropout_maxk_128.cpp} (83%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_grouped_infer_fp16_has_causalmask_has_attnbias_maxk_256.cpp => fmha_grouped_infer_fp16_has_causalmask_has_attnbias_has_dropout_maxk_256.cpp} (83%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_grouped_infer_fp16_has_causalmask_has_attnbias_maxk_32.cpp => fmha_grouped_infer_fp16_has_causalmask_has_attnbias_has_dropout_maxk_32.cpp} (83%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_grouped_infer_fp16_has_causalmask_has_attnbias_maxk_64.cpp => fmha_grouped_infer_fp16_has_causalmask_has_attnbias_has_dropout_maxk_64.cpp} (83%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_grouped_infer_fp16_has_causalmask_no_attnbias_maxk_128.cpp => fmha_grouped_infer_fp16_has_causalmask_has_attnbias_no_dropout_maxk_128.cpp} (83%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_grouped_infer_fp16_no_causalmask_has_attnbias_maxk_256.cpp => fmha_grouped_infer_fp16_has_causalmask_has_attnbias_no_dropout_maxk_256.cpp} (83%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_grouped_infer_fp16_no_causalmask_has_attnbias_maxk_32.cpp => fmha_grouped_infer_fp16_has_causalmask_has_attnbias_no_dropout_maxk_32.cpp} (83%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_grouped_infer_fp16_no_causalmask_has_attnbias_maxk_64.cpp => fmha_grouped_infer_fp16_has_causalmask_has_attnbias_no_dropout_maxk_64.cpp} (83%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_grouped_infer_fp16_no_causalmask_has_attnbias_maxk_128.cpp => fmha_grouped_infer_fp16_has_causalmask_no_attnbias_has_dropout_maxk_128.cpp} (83%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_grouped_infer_fp16_has_causalmask_no_attnbias_maxk_256.cpp => fmha_grouped_infer_fp16_has_causalmask_no_attnbias_has_dropout_maxk_256.cpp} (83%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_grouped_infer_fp16_has_causalmask_no_attnbias_maxk_32.cpp => fmha_grouped_infer_fp16_has_causalmask_no_attnbias_has_dropout_maxk_32.cpp} (83%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_grouped_infer_fp16_has_causalmask_no_attnbias_maxk_64.cpp => fmha_grouped_infer_fp16_has_causalmask_no_attnbias_has_dropout_maxk_64.cpp} (83%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_grouped_infer_fp16_no_causalmask_no_attnbias_maxk_128.cpp => fmha_grouped_infer_fp16_has_causalmask_no_attnbias_no_dropout_maxk_128.cpp} (83%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_grouped_infer_fp16_no_causalmask_no_attnbias_maxk_256.cpp => fmha_grouped_infer_fp16_has_causalmask_no_attnbias_no_dropout_maxk_256.cpp} (83%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_grouped_infer_fp16_no_causalmask_no_attnbias_maxk_32.cpp => fmha_grouped_infer_fp16_has_causalmask_no_attnbias_no_dropout_maxk_32.cpp} (83%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_grouped_infer_fp16_no_causalmask_no_attnbias_maxk_64.cpp => fmha_grouped_infer_fp16_has_causalmask_no_attnbias_no_dropout_maxk_64.cpp} (83%)
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_no_causalmask_has_attnbias_has_dropout_maxk_128.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_no_causalmask_has_attnbias_has_dropout_maxk_256.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_no_causalmask_has_attnbias_has_dropout_maxk_32.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_no_causalmask_has_attnbias_has_dropout_maxk_64.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_no_causalmask_has_attnbias_no_dropout_maxk_128.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_no_causalmask_has_attnbias_no_dropout_maxk_256.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_no_causalmask_has_attnbias_no_dropout_maxk_32.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_no_causalmask_has_attnbias_no_dropout_maxk_64.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_no_causalmask_no_attnbias_has_dropout_maxk_128.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_no_causalmask_no_attnbias_has_dropout_maxk_256.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_no_causalmask_no_attnbias_has_dropout_maxk_32.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_no_causalmask_no_attnbias_has_dropout_maxk_64.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_no_causalmask_no_attnbias_no_dropout_maxk_128.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_no_causalmask_no_attnbias_no_dropout_maxk_256.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_no_causalmask_no_attnbias_no_dropout_maxk_32.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_no_causalmask_no_attnbias_no_dropout_maxk_64.cpp

diff --git a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_infer.h b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_infer.h
index 2b43cb677..f67d266c1 100644
--- a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_infer.h
+++ b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_infer.h
@@ -33,8 +33,9 @@ template <
     typename ScalarType,
     bool kHasCausalMask,
     bool kHasBias,
+    bool kHasDropout,
     ck::index_t MaxK>
-struct batched_infer_causalmask_bias_dispatch {
+struct batched_infer_causalmask_bias_dropout_dispatch {
   template <typename FmhaTraits, typename FmhaMask>
   using FmhaPipelineProblemTemp =
       ck::tile_program::block::BlockFmhaPipelineProblem<
@@ -59,7 +60,6 @@ struct batched_infer_causalmask_bias_dispatch {
 
     BOOL_SWITCH(has_local_attention, USE_LOCAL_ATTENTION, [&] {
       constexpr bool has_masking = kHasCausalMask || USE_LOCAL_ATTENTION;
-      const bool has_dropout = (param.dropout_prob > 0.0f);
 
       using FmhaMask =
           ck::tile_program::block::SimplifiedGenericAttentionMask<has_masking>;
@@ -82,9 +82,7 @@ struct batched_infer_causalmask_bias_dispatch {
           ((param.K % 8 == 0) && (param.Kv % 8 == 0) && (MaxK <= 128));
 
       if (!use_async_pipeline) {
-        BOOL_SWITCH_4(
-            has_dropout,
-            kHasDropout,
+        BOOL_SWITCH_3(
             pad_seqlen_q,
             kPadSeqLenQ,
             pad_seqlen_k,
@@ -124,7 +122,7 @@ struct batched_infer_causalmask_bias_dispatch {
               RunWithKernel<FmhaKernel>(param, stream);
             });
       } else {
-        BOOL_SWITCH_2(has_dropout, kHasDropout, pad_seqlen_k, kPadSeqLenK, [&] {
+        BOOL_SWITCH(pad_seqlen_k, kPadSeqLenK, [&] {
           using FmhaTraits = ck::tile_program::TileFmhaTraits<
               true, // kPadSeqLenQ,
               kPadSeqLenK,
@@ -228,13 +226,15 @@ template <
     typename ScalarType,
     bool kHasCausalMask,
     bool kHasBias,
+    bool kHasDropout,
     ck::index_t MaxK>
-void run_batched_infer_causalmask_bias_dispatch(
+void run_batched_infer_causalmask_bias_dropout_dispatch(
     BatchedForwardParams& param,
     hipStream_t stream) {
-  batched_infer_causalmask_bias_dispatch<
+  batched_infer_causalmask_bias_dropout_dispatch<
       ScalarType,
       kHasCausalMask,
       kHasBias,
+      kHasDropout,
       MaxK>::Run(param, stream);
 };
diff --git a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_infer_bp16.cpp b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_infer_bp16.cpp
index 9a14373ad..cf7bacbe4 100644
--- a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_infer_bp16.cpp
+++ b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_infer_bp16.cpp
@@ -12,57 +12,96 @@
 #include "ck_tiled_fmha_batched_infer.h"
 
 // clang-format off
-extern template void run_batched_infer_causalmask_bias_dispatch<ck::bhalf_t, false, true, 32>(
+extern template void run_batched_infer_causalmask_bias_dropout_dispatch<ck::bhalf_t, false, true, true, 32>(
     BatchedForwardParams& param, hipStream_t stream);
-extern template void run_batched_infer_causalmask_bias_dispatch<ck::bhalf_t, false, false, 32>(
+extern template void run_batched_infer_causalmask_bias_dropout_dispatch<ck::bhalf_t, false, false, true, 32>(
     BatchedForwardParams& param, hipStream_t stream);
-extern template void run_batched_infer_causalmask_bias_dispatch<ck::bhalf_t, true, true, 32>(
+extern template void run_batched_infer_causalmask_bias_dropout_dispatch<ck::bhalf_t, true, true, true, 32>(
     BatchedForwardParams& param, hipStream_t stream);
-extern template void run_batched_infer_causalmask_bias_dispatch<ck::bhalf_t, true, false, 32>(
+extern template void run_batched_infer_causalmask_bias_dropout_dispatch<ck::bhalf_t, true, false, true, 32>(
     BatchedForwardParams& param, hipStream_t stream);
 
-extern template void run_batched_infer_causalmask_bias_dispatch<ck::bhalf_t, false, true, 64>(
+extern template void run_batched_infer_causalmask_bias_dropout_dispatch<ck::bhalf_t, false, true, false, 32>(
     BatchedForwardParams& param, hipStream_t stream);
-extern template void run_batched_infer_causalmask_bias_dispatch<ck::bhalf_t, false, false, 64>(
+extern template void run_batched_infer_causalmask_bias_dropout_dispatch<ck::bhalf_t, false, false, false, 32>(
     BatchedForwardParams& param, hipStream_t stream);
-extern template void run_batched_infer_causalmask_bias_dispatch<ck::bhalf_t, true, true, 64>(
+extern template void run_batched_infer_causalmask_bias_dropout_dispatch<ck::bhalf_t, true, true, false, 32>(
     BatchedForwardParams& param, hipStream_t stream);
-extern template void run_batched_infer_causalmask_bias_dispatch<ck::bhalf_t, true, false, 64>(
+extern template void run_batched_infer_causalmask_bias_dropout_dispatch<ck::bhalf_t, true, false, false, 32>(
     BatchedForwardParams& param, hipStream_t stream);
 
-extern template void run_batched_infer_causalmask_bias_dispatch<ck::bhalf_t, false, true, 128>(
+extern template void run_batched_infer_causalmask_bias_dropout_dispatch<ck::bhalf_t, false, true, true, 64>(
     BatchedForwardParams& param, hipStream_t stream);
-extern template void run_batched_infer_causalmask_bias_dispatch<ck::bhalf_t, false, false, 128>(
+extern template void run_batched_infer_causalmask_bias_dropout_dispatch<ck::bhalf_t, false, false, true, 64>(
     BatchedForwardParams& param, hipStream_t stream);
-extern template void run_batched_infer_causalmask_bias_dispatch<ck::bhalf_t, true, true, 128>(
+extern template void run_batched_infer_causalmask_bias_dropout_dispatch<ck::bhalf_t, true, true, true, 64>(
     BatchedForwardParams& param, hipStream_t stream);
-extern template void run_batched_infer_causalmask_bias_dispatch<ck::bhalf_t, true, false, 128>(
+extern template void run_batched_infer_causalmask_bias_dropout_dispatch<ck::bhalf_t, true, false, true, 64>(
     BatchedForwardParams& param, hipStream_t stream);
 
-extern template void run_batched_infer_causalmask_bias_dispatch<ck::bhalf_t, false, true, 256>(
+extern template void run_batched_infer_causalmask_bias_dropout_dispatch<ck::bhalf_t, false, true, false, 64>(
     BatchedForwardParams& param, hipStream_t stream);
-extern template void run_batched_infer_causalmask_bias_dispatch<ck::bhalf_t, false, false, 256>(
+extern template void run_batched_infer_causalmask_bias_dropout_dispatch<ck::bhalf_t, false, false, false, 64>(
     BatchedForwardParams& param, hipStream_t stream);
-extern template void run_batched_infer_causalmask_bias_dispatch<ck::bhalf_t, true, true, 256>(
+extern template void run_batched_infer_causalmask_bias_dropout_dispatch<ck::bhalf_t, true, true, false, 64>(
     BatchedForwardParams& param, hipStream_t stream);
-extern template void run_batched_infer_causalmask_bias_dispatch<ck::bhalf_t, true, false, 256>(
+extern template void run_batched_infer_causalmask_bias_dropout_dispatch<ck::bhalf_t, true, false, false, 64>(
+    BatchedForwardParams& param, hipStream_t stream);
+
+extern template void run_batched_infer_causalmask_bias_dropout_dispatch<ck::bhalf_t, false, true, true, 128>(
+    BatchedForwardParams& param, hipStream_t stream);
+extern template void run_batched_infer_causalmask_bias_dropout_dispatch<ck::bhalf_t, false, false, true, 128>(
+    BatchedForwardParams& param, hipStream_t stream);
+extern template void run_batched_infer_causalmask_bias_dropout_dispatch<ck::bhalf_t, true, true, true, 128>(
+    BatchedForwardParams& param, hipStream_t stream);
+extern template void run_batched_infer_causalmask_bias_dropout_dispatch<ck::bhalf_t, true, false, true, 128>(
+    BatchedForwardParams& param, hipStream_t stream);
+
+extern template void run_batched_infer_causalmask_bias_dropout_dispatch<ck::bhalf_t, false, true, false, 128>(
+    BatchedForwardParams& param, hipStream_t stream);
+extern template void run_batched_infer_causalmask_bias_dropout_dispatch<ck::bhalf_t, false, false, false, 128>(
+    BatchedForwardParams& param, hipStream_t stream);
+extern template void run_batched_infer_causalmask_bias_dropout_dispatch<ck::bhalf_t, true, true, false, 128>(
+    BatchedForwardParams& param, hipStream_t stream);
+extern template void run_batched_infer_causalmask_bias_dropout_dispatch<ck::bhalf_t, true, false, false, 128>(
+    BatchedForwardParams& param, hipStream_t stream);
+
+extern template void run_batched_infer_causalmask_bias_dropout_dispatch<ck::bhalf_t, false, true, true, 256>(
+    BatchedForwardParams& param, hipStream_t stream);
+extern template void run_batched_infer_causalmask_bias_dropout_dispatch<ck::bhalf_t, false, false, true, 256>(
+    BatchedForwardParams& param, hipStream_t stream);
+extern template void run_batched_infer_causalmask_bias_dropout_dispatch<ck::bhalf_t, true, true, true, 256>(
+    BatchedForwardParams& param, hipStream_t stream);
+extern template void run_batched_infer_causalmask_bias_dropout_dispatch<ck::bhalf_t, true, false, true, 256>(
+    BatchedForwardParams& param, hipStream_t stream);
+
+extern template void run_batched_infer_causalmask_bias_dropout_dispatch<ck::bhalf_t, false, true, false, 256>(
+    BatchedForwardParams& param, hipStream_t stream);
+extern template void run_batched_infer_causalmask_bias_dropout_dispatch<ck::bhalf_t, false, false, false, 256>(
+    BatchedForwardParams& param, hipStream_t stream);
+extern template void run_batched_infer_causalmask_bias_dropout_dispatch<ck::bhalf_t, true, true, false, 256>(
+    BatchedForwardParams& param, hipStream_t stream);
+extern template void run_batched_infer_causalmask_bias_dropout_dispatch<ck::bhalf_t, true, false, false, 256>(
     BatchedForwardParams& param, hipStream_t stream);
 // clang-format on
 
 void batched_infer_bp16(BatchedForwardParams& param, hipStream_t stream) {
-  BOOL_SWITCH(param.has_attn_bias, kHasBias, [&] {
+  const bool has_dropout = (param.dropout_prob > 0.0f);
+  BOOL_SWITCH_2(param.has_attn_bias, kHasBias, has_dropout, kHasDropout, [&] {
     FMHA_FWD_HEADDIM_SWITCH(param.K, param.Kv, MaxK, [&] {
       if (param.custom_mask_type == 0)
-        run_batched_infer_causalmask_bias_dispatch<
+        run_batched_infer_causalmask_bias_dropout_dispatch<
             ck::bhalf_t,
             false,
             kHasBias,
+            kHasDropout,
             MaxK>(param, stream);
       else if (param.custom_mask_type == 1 || param.custom_mask_type == 2)
-        run_batched_infer_causalmask_bias_dispatch<
+        run_batched_infer_causalmask_bias_dropout_dispatch<
             ck::bhalf_t,
             true,
             kHasBias,
+            kHasDropout,
             MaxK>(param, stream);
       else
         throw std::runtime_error("Invalid custom_mask_type value");
diff --git a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_infer_fp16.cpp b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_infer_fp16.cpp
index d2f8e7bfe..533b86109 100644
--- a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_infer_fp16.cpp
+++ b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_infer_fp16.cpp
@@ -12,57 +12,96 @@
 #include "ck_tiled_fmha_batched_infer.h"
 
 // clang-format off
-extern template void run_batched_infer_causalmask_bias_dispatch<ck::half_t, false, true, 32>(
+extern template void run_batched_infer_causalmask_bias_dropout_dispatch<ck::half_t, false, true, true, 32>(
     BatchedForwardParams& param, hipStream_t stream);
-extern template void run_batched_infer_causalmask_bias_dispatch<ck::half_t, false, false, 32>(
+extern template void run_batched_infer_causalmask_bias_dropout_dispatch<ck::half_t, false, false, true, 32>(
     BatchedForwardParams& param, hipStream_t stream);
-extern template void run_batched_infer_causalmask_bias_dispatch<ck::half_t, true, true, 32>(
+extern template void run_batched_infer_causalmask_bias_dropout_dispatch<ck::half_t, true, true, true, 32>(
     BatchedForwardParams& param, hipStream_t stream);
-extern template void run_batched_infer_causalmask_bias_dispatch<ck::half_t, true, false, 32>(
+extern template void run_batched_infer_causalmask_bias_dropout_dispatch<ck::half_t, true, false, true, 32>(
     BatchedForwardParams& param, hipStream_t stream);
 
-extern template void run_batched_infer_causalmask_bias_dispatch<ck::half_t, false, true, 64>(
+extern template void run_batched_infer_causalmask_bias_dropout_dispatch<ck::half_t, false, true, false, 32>(
     BatchedForwardParams& param, hipStream_t stream);
-extern template void run_batched_infer_causalmask_bias_dispatch<ck::half_t, false, false, 64>(
+extern template void run_batched_infer_causalmask_bias_dropout_dispatch<ck::half_t, false, false, false, 32>(
     BatchedForwardParams& param, hipStream_t stream);
-extern template void run_batched_infer_causalmask_bias_dispatch<ck::half_t, true, true, 64>(
+extern template void run_batched_infer_causalmask_bias_dropout_dispatch<ck::half_t, true, true, false, 32>(
     BatchedForwardParams& param, hipStream_t stream);
-extern template void run_batched_infer_causalmask_bias_dispatch<ck::half_t, true, false, 64>(
+extern template void run_batched_infer_causalmask_bias_dropout_dispatch<ck::half_t, true, false, false, 32>(
     BatchedForwardParams& param, hipStream_t stream);
 
-extern template void run_batched_infer_causalmask_bias_dispatch<ck::half_t, false, true, 128>(
+extern template void run_batched_infer_causalmask_bias_dropout_dispatch<ck::half_t, false, true, true, 64>(
     BatchedForwardParams& param, hipStream_t stream);
-extern template void run_batched_infer_causalmask_bias_dispatch<ck::half_t, false, false, 128>(
+extern template void run_batched_infer_causalmask_bias_dropout_dispatch<ck::half_t, false, false, true, 64>(
     BatchedForwardParams& param, hipStream_t stream);
-extern template void run_batched_infer_causalmask_bias_dispatch<ck::half_t, true, true, 128>(
+extern template void run_batched_infer_causalmask_bias_dropout_dispatch<ck::half_t, true, true, true, 64>(
     BatchedForwardParams& param, hipStream_t stream);
-extern template void run_batched_infer_causalmask_bias_dispatch<ck::half_t, true, false, 128>(
+extern template void run_batched_infer_causalmask_bias_dropout_dispatch<ck::half_t, true, false, true, 64>(
     BatchedForwardParams& param, hipStream_t stream);
 
-extern template void run_batched_infer_causalmask_bias_dispatch<ck::half_t, false, true, 256>(
+extern template void run_batched_infer_causalmask_bias_dropout_dispatch<ck::half_t, false, true, false, 64>(
     BatchedForwardParams& param, hipStream_t stream);
-extern template void run_batched_infer_causalmask_bias_dispatch<ck::half_t, false, false, 256>(
+extern template void run_batched_infer_causalmask_bias_dropout_dispatch<ck::half_t, false, false, false, 64>(
     BatchedForwardParams& param, hipStream_t stream);
-extern template void run_batched_infer_causalmask_bias_dispatch<ck::half_t, true, true, 256>(
+extern template void run_batched_infer_causalmask_bias_dropout_dispatch<ck::half_t, true, true, false, 64>(
     BatchedForwardParams& param, hipStream_t stream);
-extern template void run_batched_infer_causalmask_bias_dispatch<ck::half_t, true, false, 256>(
+extern template void run_batched_infer_causalmask_bias_dropout_dispatch<ck::half_t, true, false, false, 64>(
+    BatchedForwardParams& param, hipStream_t stream);
+
+extern template void run_batched_infer_causalmask_bias_dropout_dispatch<ck::half_t, false, true, true, 128>(
+    BatchedForwardParams& param, hipStream_t stream);
+extern template void run_batched_infer_causalmask_bias_dropout_dispatch<ck::half_t, false, false, true, 128>(
+    BatchedForwardParams& param, hipStream_t stream);
+extern template void run_batched_infer_causalmask_bias_dropout_dispatch<ck::half_t, true, true, true, 128>(
+    BatchedForwardParams& param, hipStream_t stream);
+extern template void run_batched_infer_causalmask_bias_dropout_dispatch<ck::half_t, true, false, true, 128>(
+    BatchedForwardParams& param, hipStream_t stream);
+
+extern template void run_batched_infer_causalmask_bias_dropout_dispatch<ck::half_t, false, true, false, 128>(
+    BatchedForwardParams& param, hipStream_t stream);
+extern template void run_batched_infer_causalmask_bias_dropout_dispatch<ck::half_t, false, false, false, 128>(
+    BatchedForwardParams& param, hipStream_t stream);
+extern template void run_batched_infer_causalmask_bias_dropout_dispatch<ck::half_t, true, true, false, 128>(
+    BatchedForwardParams& param, hipStream_t stream);
+extern template void run_batched_infer_causalmask_bias_dropout_dispatch<ck::half_t, true, false, false, 128>(
+    BatchedForwardParams& param, hipStream_t stream);
+
+extern template void run_batched_infer_causalmask_bias_dropout_dispatch<ck::half_t, false, true, true, 256>(
+    BatchedForwardParams& param, hipStream_t stream);
+extern template void run_batched_infer_causalmask_bias_dropout_dispatch<ck::half_t, false, false, true, 256>(
+    BatchedForwardParams& param, hipStream_t stream);
+extern template void run_batched_infer_causalmask_bias_dropout_dispatch<ck::half_t, true, true, true, 256>(
+    BatchedForwardParams& param, hipStream_t stream);
+extern template void run_batched_infer_causalmask_bias_dropout_dispatch<ck::half_t, true, false, true, 256>(
+    BatchedForwardParams& param, hipStream_t stream);
+
+extern template void run_batched_infer_causalmask_bias_dropout_dispatch<ck::half_t, false, true, false, 256>(
+    BatchedForwardParams& param, hipStream_t stream);
+extern template void run_batched_infer_causalmask_bias_dropout_dispatch<ck::half_t, false, false, false, 256>(
+    BatchedForwardParams& param, hipStream_t stream);
+extern template void run_batched_infer_causalmask_bias_dropout_dispatch<ck::half_t, true, true, false, 256>(
+    BatchedForwardParams& param, hipStream_t stream);
+extern template void run_batched_infer_causalmask_bias_dropout_dispatch<ck::half_t, true, false, false, 256>(
     BatchedForwardParams& param, hipStream_t stream);
 // clang-format on
 
 void batched_infer_fp16(BatchedForwardParams& param, hipStream_t stream) {
-  BOOL_SWITCH(param.has_attn_bias, kHasBias, [&] {
+  const bool has_dropout = (param.dropout_prob > 0.0f);
+  BOOL_SWITCH_2(param.has_attn_bias, kHasBias, has_dropout, kHasDropout, [&] {
     FMHA_FWD_HEADDIM_SWITCH(param.K, param.Kv, MaxK, [&] {
       if (param.custom_mask_type == 0)
-        run_batched_infer_causalmask_bias_dispatch<
+        run_batched_infer_causalmask_bias_dropout_dispatch<
             ck::half_t,
             false,
             kHasBias,
+            kHasDropout,
             MaxK>(param, stream);
       else if (param.custom_mask_type == 1 || param.custom_mask_type == 2)
-        run_batched_infer_causalmask_bias_dispatch<
+        run_batched_infer_causalmask_bias_dropout_dispatch<
             ck::half_t,
             true,
             kHasBias,
+            kHasDropout,
             MaxK>(param, stream);
       else
         throw std::runtime_error("Invalid custom_mask_type value");
diff --git a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_infer.h b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_infer.h
index af5d9588b..2a1c02b4e 100644
--- a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_infer.h
+++ b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_infer.h
@@ -24,16 +24,17 @@
 #include "ck_tiled_fmha_params.h"
 #include "ck_tiled_headdim_switch.h"
 
-#include "fmha_fwd_kernel.hpp"
 #include "fmha_fwd_epilogue.hpp"
+#include "fmha_fwd_kernel.hpp"
 #include "fmha_fwd_tile_partitioner.hpp"
 
 template <
     typename ScalarType,
     bool kHasCausalMask,
     bool kHasBias,
+    bool kHasDropout,
     ck::index_t MaxK>
-struct grouped_infer_causalmask_bias_dispatch {
+struct grouped_infer_causalmask_bias_dropout_dispatch {
   template <typename FmhaTraits, typename FmhaMask>
   using FmhaPipelineProblemTemp =
       ck::tile_program::block::BlockFmhaPipelineProblem<
@@ -58,7 +59,6 @@ struct grouped_infer_causalmask_bias_dispatch {
 
     BOOL_SWITCH(has_local_attention, USE_LOCAL_ATTENTION, [&] {
       constexpr bool has_masking = kHasCausalMask || USE_LOCAL_ATTENTION;
-      const bool has_dropout = (param.dropout_prob > 0.0f);
 
       using FmhaMask =
           ck::tile_program::block::SimplifiedGenericAttentionMask<has_masking>;
@@ -74,14 +74,8 @@ struct grouped_infer_causalmask_bias_dispatch {
       bool pad_headdim_q = !(param.K % FmhaShape::kK0BlockLength == 0);
       bool pad_headdim_v = !(param.Kv % FmhaShape::kN1 == 0);
 
-      BOOL_SWITCH_3(
-          has_dropout,
-          kHasDropout,
-          pad_headdim_q,
-          kPadHeadDimQ,
-          pad_headdim_v,
-          kPadHeadDimV,
-          [&] {
+      BOOL_SWITCH_2(
+          pad_headdim_q, kPadHeadDimQ, pad_headdim_v, kPadHeadDimV, [&] {
             using FmhaTraits = ck::tile_program::TileFmhaTraits<
                 kPadSeqLenQ,
                 kPadSeqLenK,
@@ -179,13 +173,15 @@ template <
     typename ScalarType,
     bool kHasCausalMask,
     bool kHasBias,
+    bool kHasDropout,
     ck::index_t MaxK>
-void run_grouped_infer_causalmask_bias_dispatch(
+void run_grouped_infer_causalmask_bias_dropout_dispatch(
     GroupedForwardParams& param,
     hipStream_t stream) {
-  grouped_infer_causalmask_bias_dispatch<
+  grouped_infer_causalmask_bias_dropout_dispatch<
       ScalarType,
       kHasCausalMask,
       kHasBias,
+      kHasDropout,
       MaxK>::Run(param, stream);
 };
diff --git a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_infer_bp16.cpp b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_infer_bp16.cpp
index c76c6e6f8..80ef8a396 100644
--- a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_infer_bp16.cpp
+++ b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_infer_bp16.cpp
@@ -12,57 +12,96 @@
 #include "ck_tiled_fmha_grouped_infer.h"
 
 // clang-format off
-extern template void run_grouped_infer_causalmask_bias_dispatch<ck::bhalf_t, false, true, 32>(
+extern template void run_grouped_infer_causalmask_bias_dropout_dispatch<ck::bhalf_t, false, true, true, 32>(
     GroupedForwardParams& param, hipStream_t stream);
-extern template void run_grouped_infer_causalmask_bias_dispatch<ck::bhalf_t, false, false, 32>(
+extern template void run_grouped_infer_causalmask_bias_dropout_dispatch<ck::bhalf_t, false, false, true, 32>(
     GroupedForwardParams& param, hipStream_t stream);
-extern template void run_grouped_infer_causalmask_bias_dispatch<ck::bhalf_t, true, true, 32>(
+extern template void run_grouped_infer_causalmask_bias_dropout_dispatch<ck::bhalf_t, true, true, true, 32>(
     GroupedForwardParams& param, hipStream_t stream);
-extern template void run_grouped_infer_causalmask_bias_dispatch<ck::bhalf_t, true, false, 32>(
+extern template void run_grouped_infer_causalmask_bias_dropout_dispatch<ck::bhalf_t, true, false, true, 32>(
     GroupedForwardParams& param, hipStream_t stream);
 
-extern template void run_grouped_infer_causalmask_bias_dispatch<ck::bhalf_t, false, true, 64>(
+extern template void run_grouped_infer_causalmask_bias_dropout_dispatch<ck::bhalf_t, false, true, false, 32>(
     GroupedForwardParams& param, hipStream_t stream);
-extern template void run_grouped_infer_causalmask_bias_dispatch<ck::bhalf_t, false, false, 64>(
+extern template void run_grouped_infer_causalmask_bias_dropout_dispatch<ck::bhalf_t, false, false, false, 32>(
     GroupedForwardParams& param, hipStream_t stream);
-extern template void run_grouped_infer_causalmask_bias_dispatch<ck::bhalf_t, true, true, 64>(
+extern template void run_grouped_infer_causalmask_bias_dropout_dispatch<ck::bhalf_t, true, true, false, 32>(
     GroupedForwardParams& param, hipStream_t stream);
-extern template void run_grouped_infer_causalmask_bias_dispatch<ck::bhalf_t, true, false, 64>(
+extern template void run_grouped_infer_causalmask_bias_dropout_dispatch<ck::bhalf_t, true, false, false, 32>(
     GroupedForwardParams& param, hipStream_t stream);
 
-extern template void run_grouped_infer_causalmask_bias_dispatch<ck::bhalf_t, false, true, 128>(
+extern template void run_grouped_infer_causalmask_bias_dropout_dispatch<ck::bhalf_t, false, true, true, 64>(
     GroupedForwardParams& param, hipStream_t stream);
-extern template void run_grouped_infer_causalmask_bias_dispatch<ck::bhalf_t, false, false, 128>(
+extern template void run_grouped_infer_causalmask_bias_dropout_dispatch<ck::bhalf_t, false, false, true, 64>(
     GroupedForwardParams& param, hipStream_t stream);
-extern template void run_grouped_infer_causalmask_bias_dispatch<ck::bhalf_t, true, true, 128>(
+extern template void run_grouped_infer_causalmask_bias_dropout_dispatch<ck::bhalf_t, true, true, true, 64>(
     GroupedForwardParams& param, hipStream_t stream);
-extern template void run_grouped_infer_causalmask_bias_dispatch<ck::bhalf_t, true, false, 128>(
+extern template void run_grouped_infer_causalmask_bias_dropout_dispatch<ck::bhalf_t, true, false, true, 64>(
     GroupedForwardParams& param, hipStream_t stream);
 
-extern template void run_grouped_infer_causalmask_bias_dispatch<ck::bhalf_t, false, true, 256>(
+extern template void run_grouped_infer_causalmask_bias_dropout_dispatch<ck::bhalf_t, false, true, false, 64>(
     GroupedForwardParams& param, hipStream_t stream);
-extern template void run_grouped_infer_causalmask_bias_dispatch<ck::bhalf_t, false, false, 256>(
+extern template void run_grouped_infer_causalmask_bias_dropout_dispatch<ck::bhalf_t, false, false, false, 64>(
     GroupedForwardParams& param, hipStream_t stream);
-extern template void run_grouped_infer_causalmask_bias_dispatch<ck::bhalf_t, true, true, 256>(
+extern template void run_grouped_infer_causalmask_bias_dropout_dispatch<ck::bhalf_t, true, true, false, 64>(
     GroupedForwardParams& param, hipStream_t stream);
-extern template void run_grouped_infer_causalmask_bias_dispatch<ck::bhalf_t, true, false, 256>(
+extern template void run_grouped_infer_causalmask_bias_dropout_dispatch<ck::bhalf_t, true, false, false, 64>(
+    GroupedForwardParams& param, hipStream_t stream);
+
+extern template void run_grouped_infer_causalmask_bias_dropout_dispatch<ck::bhalf_t, false, true, true, 128>(
+    GroupedForwardParams& param, hipStream_t stream);
+extern template void run_grouped_infer_causalmask_bias_dropout_dispatch<ck::bhalf_t, false, false, true, 128>(
+    GroupedForwardParams& param, hipStream_t stream);
+extern template void run_grouped_infer_causalmask_bias_dropout_dispatch<ck::bhalf_t, true, true, true, 128>(
+    GroupedForwardParams& param, hipStream_t stream);
+extern template void run_grouped_infer_causalmask_bias_dropout_dispatch<ck::bhalf_t, true, false, true, 128>(
+    GroupedForwardParams& param, hipStream_t stream);
+
+extern template void run_grouped_infer_causalmask_bias_dropout_dispatch<ck::bhalf_t, false, true, false, 128>(
+    GroupedForwardParams& param, hipStream_t stream);
+extern template void run_grouped_infer_causalmask_bias_dropout_dispatch<ck::bhalf_t, false, false, false, 128>(
+    GroupedForwardParams& param, hipStream_t stream);
+extern template void run_grouped_infer_causalmask_bias_dropout_dispatch<ck::bhalf_t, true, true, false, 128>(
+    GroupedForwardParams& param, hipStream_t stream);
+extern template void run_grouped_infer_causalmask_bias_dropout_dispatch<ck::bhalf_t, true, false, false, 128>(
+    GroupedForwardParams& param, hipStream_t stream);
+
+extern template void run_grouped_infer_causalmask_bias_dropout_dispatch<ck::bhalf_t, false, true, true, 256>(
+    GroupedForwardParams& param, hipStream_t stream);
+extern template void run_grouped_infer_causalmask_bias_dropout_dispatch<ck::bhalf_t, false, false, true, 256>(
+    GroupedForwardParams& param, hipStream_t stream);
+extern template void run_grouped_infer_causalmask_bias_dropout_dispatch<ck::bhalf_t, true, true, true, 256>(
+    GroupedForwardParams& param, hipStream_t stream);
+extern template void run_grouped_infer_causalmask_bias_dropout_dispatch<ck::bhalf_t, true, false, true, 256>(
+    GroupedForwardParams& param, hipStream_t stream);
+
+extern template void run_grouped_infer_causalmask_bias_dropout_dispatch<ck::bhalf_t, false, true, false, 256>(
+    GroupedForwardParams& param, hipStream_t stream);
+extern template void run_grouped_infer_causalmask_bias_dropout_dispatch<ck::bhalf_t, false, false, false, 256>(
+    GroupedForwardParams& param, hipStream_t stream);
+extern template void run_grouped_infer_causalmask_bias_dropout_dispatch<ck::bhalf_t, true, true, false, 256>(
+    GroupedForwardParams& param, hipStream_t stream);
+extern template void run_grouped_infer_causalmask_bias_dropout_dispatch<ck::bhalf_t, true, false, false, 256>(
     GroupedForwardParams& param, hipStream_t stream);
 // clang-format on
 
 void grouped_infer_bp16(GroupedForwardParams& param, hipStream_t stream) {
-  BOOL_SWITCH(param.has_attn_bias, kHasBias, [&] {
+  const bool has_dropout = (param.dropout_prob > 0.0f);
+  BOOL_SWITCH_2(param.has_attn_bias, kHasBias, has_dropout, kHasDropout, [&] {
     FMHA_FWD_HEADDIM_SWITCH(param.K, param.Kv, MaxK, [&] {
       if (param.custom_mask_type == 0)
-        run_grouped_infer_causalmask_bias_dispatch<
+        run_grouped_infer_causalmask_bias_dropout_dispatch<
             ck::bhalf_t,
             false,
             kHasBias,
+            kHasDropout,
             MaxK>(param, stream);
       else if (param.custom_mask_type == 1 || param.custom_mask_type == 2)
-        run_grouped_infer_causalmask_bias_dispatch<
+        run_grouped_infer_causalmask_bias_dropout_dispatch<
             ck::bhalf_t,
             true,
             kHasBias,
+            kHasDropout,
             MaxK>(param, stream);
       else
         throw std::runtime_error("Invalid custom_mask_type value");
diff --git a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_infer_fp16.cpp b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_infer_fp16.cpp
index 4e4a1c101..73103a0e8 100644
--- a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_infer_fp16.cpp
+++ b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_infer_fp16.cpp
@@ -12,57 +12,96 @@
 #include "ck_tiled_fmha_grouped_infer.h"
 
 // clang-format off
-extern template void run_grouped_infer_causalmask_bias_dispatch<ck::half_t, false, true, 32>(
+extern template void run_grouped_infer_causalmask_bias_dropout_dispatch<ck::half_t, false, true, true, 32>(
     GroupedForwardParams& param, hipStream_t stream);
-extern template void run_grouped_infer_causalmask_bias_dispatch<ck::half_t, false, false, 32>(
+extern template void run_grouped_infer_causalmask_bias_dropout_dispatch<ck::half_t, false, false, true, 32>(
     GroupedForwardParams& param, hipStream_t stream);
-extern template void run_grouped_infer_causalmask_bias_dispatch<ck::half_t, true, true, 32>(
+extern template void run_grouped_infer_causalmask_bias_dropout_dispatch<ck::half_t, true, true, true, 32>(
     GroupedForwardParams& param, hipStream_t stream);
-extern template void run_grouped_infer_causalmask_bias_dispatch<ck::half_t, true, false, 32>(
+extern template void run_grouped_infer_causalmask_bias_dropout_dispatch<ck::half_t, true, false, true, 32>(
     GroupedForwardParams& param, hipStream_t stream);
 
-extern template void run_grouped_infer_causalmask_bias_dispatch<ck::half_t, false, true, 64>(
+extern template void run_grouped_infer_causalmask_bias_dropout_dispatch<ck::half_t, false, true, false, 32>(
     GroupedForwardParams& param, hipStream_t stream);
-extern template void run_grouped_infer_causalmask_bias_dispatch<ck::half_t, false, false, 64>(
+extern template void run_grouped_infer_causalmask_bias_dropout_dispatch<ck::half_t, false, false, false, 32>(
     GroupedForwardParams& param, hipStream_t stream);
-extern template void run_grouped_infer_causalmask_bias_dispatch<ck::half_t, true, true, 64>(
+extern template void run_grouped_infer_causalmask_bias_dropout_dispatch<ck::half_t, true, true, false, 32>(
     GroupedForwardParams& param, hipStream_t stream);
-extern template void run_grouped_infer_causalmask_bias_dispatch<ck::half_t, true, false, 64>(
+extern template void run_grouped_infer_causalmask_bias_dropout_dispatch<ck::half_t, true, false, false, 32>(
     GroupedForwardParams& param, hipStream_t stream);
 
-extern template void run_grouped_infer_causalmask_bias_dispatch<ck::half_t, false, true, 128>(
+extern template void run_grouped_infer_causalmask_bias_dropout_dispatch<ck::half_t, false, true, true, 64>(
     GroupedForwardParams& param, hipStream_t stream);
-extern template void run_grouped_infer_causalmask_bias_dispatch<ck::half_t, false, false, 128>(
+extern template void run_grouped_infer_causalmask_bias_dropout_dispatch<ck::half_t, false, false, true, 64>(
     GroupedForwardParams& param, hipStream_t stream);
-extern template void run_grouped_infer_causalmask_bias_dispatch<ck::half_t, true, true, 128>(
+extern template void run_grouped_infer_causalmask_bias_dropout_dispatch<ck::half_t, true, true, true, 64>(
     GroupedForwardParams& param, hipStream_t stream);
-extern template void run_grouped_infer_causalmask_bias_dispatch<ck::half_t, true, false, 128>(
+extern template void run_grouped_infer_causalmask_bias_dropout_dispatch<ck::half_t, true, false, true, 64>(
     GroupedForwardParams& param, hipStream_t stream);
 
-extern template void run_grouped_infer_causalmask_bias_dispatch<ck::half_t, false, true, 256>(
+extern template void run_grouped_infer_causalmask_bias_dropout_dispatch<ck::half_t, false, true, false, 64>(
     GroupedForwardParams& param, hipStream_t stream);
-extern template void run_grouped_infer_causalmask_bias_dispatch<ck::half_t, false, false, 256>(
+extern template void run_grouped_infer_causalmask_bias_dropout_dispatch<ck::half_t, false, false, false, 64>(
     GroupedForwardParams& param, hipStream_t stream);
-extern template void run_grouped_infer_causalmask_bias_dispatch<ck::half_t, true, true, 256>(
+extern template void run_grouped_infer_causalmask_bias_dropout_dispatch<ck::half_t, true, true, false, 64>(
     GroupedForwardParams& param, hipStream_t stream);
-extern template void run_grouped_infer_causalmask_bias_dispatch<ck::half_t, true, false, 256>(
+extern template void run_grouped_infer_causalmask_bias_dropout_dispatch<ck::half_t, true, false, false, 64>(
+    GroupedForwardParams& param, hipStream_t stream);
+
+extern template void run_grouped_infer_causalmask_bias_dropout_dispatch<ck::half_t, false, true, true, 128>(
+    GroupedForwardParams& param, hipStream_t stream);
+extern template void run_grouped_infer_causalmask_bias_dropout_dispatch<ck::half_t, false, false, true, 128>(
+    GroupedForwardParams& param, hipStream_t stream);
+extern template void run_grouped_infer_causalmask_bias_dropout_dispatch<ck::half_t, true, true, true, 128>(
+    GroupedForwardParams& param, hipStream_t stream);
+extern template void run_grouped_infer_causalmask_bias_dropout_dispatch<ck::half_t, true, false, true, 128>(
+    GroupedForwardParams& param, hipStream_t stream);
+
+extern template void run_grouped_infer_causalmask_bias_dropout_dispatch<ck::half_t, false, true, false, 128>(
+    GroupedForwardParams& param, hipStream_t stream);
+extern template void run_grouped_infer_causalmask_bias_dropout_dispatch<ck::half_t, false, false, false, 128>(
+    GroupedForwardParams& param, hipStream_t stream);
+extern template void run_grouped_infer_causalmask_bias_dropout_dispatch<ck::half_t, true, true, false, 128>(
+    GroupedForwardParams& param, hipStream_t stream);
+extern template void run_grouped_infer_causalmask_bias_dropout_dispatch<ck::half_t, true, false, false, 128>(
+    GroupedForwardParams& param, hipStream_t stream);
+
+extern template void run_grouped_infer_causalmask_bias_dropout_dispatch<ck::half_t, false, true, true, 256>(
+    GroupedForwardParams& param, hipStream_t stream);
+extern template void run_grouped_infer_causalmask_bias_dropout_dispatch<ck::half_t, false, false, true, 256>(
+    GroupedForwardParams& param, hipStream_t stream);
+extern template void run_grouped_infer_causalmask_bias_dropout_dispatch<ck::half_t, true, true, true, 256>(
+    GroupedForwardParams& param, hipStream_t stream);
+extern template void run_grouped_infer_causalmask_bias_dropout_dispatch<ck::half_t, true, false, true, 256>(
+    GroupedForwardParams& param, hipStream_t stream);
+
+extern template void run_grouped_infer_causalmask_bias_dropout_dispatch<ck::half_t, false, true, false, 256>(
+    GroupedForwardParams& param, hipStream_t stream);
+extern template void run_grouped_infer_causalmask_bias_dropout_dispatch<ck::half_t, false, false, false, 256>(
+    GroupedForwardParams& param, hipStream_t stream);
+extern template void run_grouped_infer_causalmask_bias_dropout_dispatch<ck::half_t, true, true, false, 256>(
+    GroupedForwardParams& param, hipStream_t stream);
+extern template void run_grouped_infer_causalmask_bias_dropout_dispatch<ck::half_t, true, false, false, 256>(
     GroupedForwardParams& param, hipStream_t stream);
 // clang-format on
 
 void grouped_infer_fp16(GroupedForwardParams& param, hipStream_t stream) {
-  BOOL_SWITCH(param.has_attn_bias, kHasBias, [&] {
+  const bool has_dropout = (param.dropout_prob > 0.0f);
+  BOOL_SWITCH_2(param.has_attn_bias, kHasBias, has_dropout, kHasDropout, [&] {
     FMHA_FWD_HEADDIM_SWITCH(param.K, param.Kv, MaxK, [&] {
       if (param.custom_mask_type == 0)
-        run_grouped_infer_causalmask_bias_dispatch<
+        run_grouped_infer_causalmask_bias_dropout_dispatch<
             ck::half_t,
             false,
             kHasBias,
+            kHasDropout,
             MaxK>(param, stream);
       else if (param.custom_mask_type == 1 || param.custom_mask_type == 2)
-        run_grouped_infer_causalmask_bias_dispatch<
+        run_grouped_infer_causalmask_bias_dropout_dispatch<
             ck::half_t,
             true,
             kHasBias,
+            kHasDropout,
             MaxK>(param, stream);
       else
         throw std::runtime_error("Invalid custom_mask_type value");
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_has_causalmask_has_attnbias_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_has_causalmask_has_attnbias_has_dropout_maxk_128.cpp
similarity index 83%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_has_causalmask_has_attnbias_maxk_128.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_has_causalmask_has_attnbias_has_dropout_maxk_128.cpp
index e5fab05c4..936789b59 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_has_causalmask_has_attnbias_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_has_causalmask_has_attnbias_has_dropout_maxk_128.cpp
@@ -8,8 +8,9 @@
 
 #include "ck_tiled_fmha_batched_infer.h"
 
-template void run_batched_infer_causalmask_bias_dispatch<
+template void run_batched_infer_causalmask_bias_dropout_dispatch<
     ck::bhalf_t,
     true,
     true,
+    true,
     128>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_has_causalmask_has_attnbias_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_has_causalmask_has_attnbias_has_dropout_maxk_256.cpp
similarity index 83%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_has_causalmask_has_attnbias_maxk_256.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_has_causalmask_has_attnbias_has_dropout_maxk_256.cpp
index a3c8f6bca..26454ef59 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_has_causalmask_has_attnbias_maxk_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_has_causalmask_has_attnbias_has_dropout_maxk_256.cpp
@@ -8,8 +8,9 @@
 
 #include "ck_tiled_fmha_batched_infer.h"
 
-template void run_batched_infer_causalmask_bias_dispatch<
+template void run_batched_infer_causalmask_bias_dropout_dispatch<
     ck::bhalf_t,
     true,
     true,
+    true,
     256>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_has_causalmask_has_attnbias_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_has_causalmask_has_attnbias_has_dropout_maxk_32.cpp
similarity index 83%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_has_causalmask_has_attnbias_maxk_32.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_has_causalmask_has_attnbias_has_dropout_maxk_32.cpp
index 3fc855dbb..97272b032 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_has_causalmask_has_attnbias_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_has_causalmask_has_attnbias_has_dropout_maxk_32.cpp
@@ -8,8 +8,9 @@
 
 #include "ck_tiled_fmha_batched_infer.h"
 
-template void run_batched_infer_causalmask_bias_dispatch<
+template void run_batched_infer_causalmask_bias_dropout_dispatch<
     ck::bhalf_t,
     true,
     true,
+    true,
     32>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_has_causalmask_has_attnbias_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_has_causalmask_has_attnbias_has_dropout_maxk_64.cpp
similarity index 83%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_has_causalmask_has_attnbias_maxk_64.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_has_causalmask_has_attnbias_has_dropout_maxk_64.cpp
index 5573d58b7..913afceaf 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_has_causalmask_has_attnbias_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_has_causalmask_has_attnbias_has_dropout_maxk_64.cpp
@@ -8,8 +8,9 @@
 
 #include "ck_tiled_fmha_batched_infer.h"
 
-template void run_batched_infer_causalmask_bias_dispatch<
+template void run_batched_infer_causalmask_bias_dropout_dispatch<
     ck::bhalf_t,
     true,
     true,
+    true,
     64>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_has_causalmask_no_attnbias_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_has_causalmask_has_attnbias_no_dropout_maxk_128.cpp
similarity index 83%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_has_causalmask_no_attnbias_maxk_128.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_has_causalmask_has_attnbias_no_dropout_maxk_128.cpp
index 87f5c89ff..d3d4f0823 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_has_causalmask_no_attnbias_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_has_causalmask_has_attnbias_no_dropout_maxk_128.cpp
@@ -8,8 +8,9 @@
 
 #include "ck_tiled_fmha_batched_infer.h"
 
-template void run_batched_infer_causalmask_bias_dispatch<
+template void run_batched_infer_causalmask_bias_dropout_dispatch<
     ck::bhalf_t,
     true,
+    true,
     false,
     128>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_no_causalmask_has_attnbias_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_has_causalmask_has_attnbias_no_dropout_maxk_256.cpp
similarity index 83%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_no_causalmask_has_attnbias_maxk_256.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_has_causalmask_has_attnbias_no_dropout_maxk_256.cpp
index a17c6fabb..a11984f7a 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_no_causalmask_has_attnbias_maxk_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_has_causalmask_has_attnbias_no_dropout_maxk_256.cpp
@@ -8,8 +8,9 @@
 
 #include "ck_tiled_fmha_batched_infer.h"
 
-template void run_batched_infer_causalmask_bias_dispatch<
+template void run_batched_infer_causalmask_bias_dropout_dispatch<
     ck::bhalf_t,
-    false,
     true,
+    true,
+    false,
     256>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_no_causalmask_has_attnbias_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_has_causalmask_has_attnbias_no_dropout_maxk_32.cpp
similarity index 83%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_no_causalmask_has_attnbias_maxk_32.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_has_causalmask_has_attnbias_no_dropout_maxk_32.cpp
index d4021ed8d..1712a317d 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_no_causalmask_has_attnbias_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_has_causalmask_has_attnbias_no_dropout_maxk_32.cpp
@@ -8,8 +8,9 @@
 
 #include "ck_tiled_fmha_batched_infer.h"
 
-template void run_batched_infer_causalmask_bias_dispatch<
+template void run_batched_infer_causalmask_bias_dropout_dispatch<
     ck::bhalf_t,
-    false,
     true,
+    true,
+    false,
     32>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_has_causalmask_no_attnbias_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_has_causalmask_has_attnbias_no_dropout_maxk_64.cpp
similarity index 83%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_has_causalmask_no_attnbias_maxk_64.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_has_causalmask_has_attnbias_no_dropout_maxk_64.cpp
index e05151545..632fb0794 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_has_causalmask_no_attnbias_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_has_causalmask_has_attnbias_no_dropout_maxk_64.cpp
@@ -8,8 +8,9 @@
 
 #include "ck_tiled_fmha_batched_infer.h"
 
-template void run_batched_infer_causalmask_bias_dispatch<
+template void run_batched_infer_causalmask_bias_dropout_dispatch<
     ck::bhalf_t,
     true,
+    true,
     false,
     64>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_no_causalmask_has_attnbias_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_has_causalmask_no_attnbias_has_dropout_maxk_128.cpp
similarity index 83%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_no_causalmask_has_attnbias_maxk_128.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_has_causalmask_no_attnbias_has_dropout_maxk_128.cpp
index 2f7ff1124..b8a1fde66 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_no_causalmask_has_attnbias_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_has_causalmask_no_attnbias_has_dropout_maxk_128.cpp
@@ -8,8 +8,9 @@
 
 #include "ck_tiled_fmha_batched_infer.h"
 
-template void run_batched_infer_causalmask_bias_dispatch<
+template void run_batched_infer_causalmask_bias_dropout_dispatch<
     ck::bhalf_t,
+    true,
     false,
     true,
     128>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_has_causalmask_no_attnbias_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_has_causalmask_no_attnbias_has_dropout_maxk_256.cpp
similarity index 83%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_has_causalmask_no_attnbias_maxk_256.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_has_causalmask_no_attnbias_has_dropout_maxk_256.cpp
index 3935893ca..76b569cff 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_has_causalmask_no_attnbias_maxk_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_has_causalmask_no_attnbias_has_dropout_maxk_256.cpp
@@ -8,8 +8,9 @@
 
 #include "ck_tiled_fmha_batched_infer.h"
 
-template void run_batched_infer_causalmask_bias_dispatch<
+template void run_batched_infer_causalmask_bias_dropout_dispatch<
     ck::bhalf_t,
     true,
     false,
+    true,
     256>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_has_causalmask_no_attnbias_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_has_causalmask_no_attnbias_has_dropout_maxk_32.cpp
similarity index 83%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_has_causalmask_no_attnbias_maxk_32.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_has_causalmask_no_attnbias_has_dropout_maxk_32.cpp
index b4a4a9fa7..ace85cec2 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_has_causalmask_no_attnbias_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_has_causalmask_no_attnbias_has_dropout_maxk_32.cpp
@@ -8,8 +8,9 @@
 
 #include "ck_tiled_fmha_batched_infer.h"
 
-template void run_batched_infer_causalmask_bias_dispatch<
+template void run_batched_infer_causalmask_bias_dropout_dispatch<
     ck::bhalf_t,
     true,
     false,
+    true,
     32>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_no_causalmask_has_attnbias_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_has_causalmask_no_attnbias_has_dropout_maxk_64.cpp
similarity index 83%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_no_causalmask_has_attnbias_maxk_64.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_has_causalmask_no_attnbias_has_dropout_maxk_64.cpp
index 035923269..3f1df08f6 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_no_causalmask_has_attnbias_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_has_causalmask_no_attnbias_has_dropout_maxk_64.cpp
@@ -8,8 +8,9 @@
 
 #include "ck_tiled_fmha_batched_infer.h"
 
-template void run_batched_infer_causalmask_bias_dispatch<
+template void run_batched_infer_causalmask_bias_dropout_dispatch<
     ck::bhalf_t,
+    true,
     false,
     true,
     64>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_no_causalmask_no_attnbias_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_has_causalmask_no_attnbias_no_dropout_maxk_128.cpp
similarity index 83%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_no_causalmask_no_attnbias_maxk_128.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_has_causalmask_no_attnbias_no_dropout_maxk_128.cpp
index 9251bccc5..eafa8238e 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_no_causalmask_no_attnbias_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_has_causalmask_no_attnbias_no_dropout_maxk_128.cpp
@@ -8,8 +8,9 @@
 
 #include "ck_tiled_fmha_batched_infer.h"
 
-template void run_batched_infer_causalmask_bias_dispatch<
+template void run_batched_infer_causalmask_bias_dropout_dispatch<
     ck::bhalf_t,
+    true,
     false,
     false,
     128>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_no_causalmask_no_attnbias_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_has_causalmask_no_attnbias_no_dropout_maxk_256.cpp
similarity index 83%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_no_causalmask_no_attnbias_maxk_256.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_has_causalmask_no_attnbias_no_dropout_maxk_256.cpp
index e113097a6..5528f22dd 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_no_causalmask_no_attnbias_maxk_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_has_causalmask_no_attnbias_no_dropout_maxk_256.cpp
@@ -8,8 +8,9 @@
 
 #include "ck_tiled_fmha_batched_infer.h"
 
-template void run_batched_infer_causalmask_bias_dispatch<
+template void run_batched_infer_causalmask_bias_dropout_dispatch<
     ck::bhalf_t,
+    true,
     false,
     false,
     256>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_no_causalmask_no_attnbias_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_has_causalmask_no_attnbias_no_dropout_maxk_32.cpp
similarity index 83%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_no_causalmask_no_attnbias_maxk_32.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_has_causalmask_no_attnbias_no_dropout_maxk_32.cpp
index 0241586ac..ceaa26f4d 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_no_causalmask_no_attnbias_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_has_causalmask_no_attnbias_no_dropout_maxk_32.cpp
@@ -8,8 +8,9 @@
 
 #include "ck_tiled_fmha_batched_infer.h"
 
-template void run_batched_infer_causalmask_bias_dispatch<
+template void run_batched_infer_causalmask_bias_dropout_dispatch<
     ck::bhalf_t,
+    true,
     false,
     false,
     32>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_no_causalmask_no_attnbias_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_has_causalmask_no_attnbias_no_dropout_maxk_64.cpp
similarity index 83%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_no_causalmask_no_attnbias_maxk_64.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_has_causalmask_no_attnbias_no_dropout_maxk_64.cpp
index 290d6b145..e87f2672b 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_no_causalmask_no_attnbias_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_has_causalmask_no_attnbias_no_dropout_maxk_64.cpp
@@ -8,8 +8,9 @@
 
 #include "ck_tiled_fmha_batched_infer.h"
 
-template void run_batched_infer_causalmask_bias_dispatch<
+template void run_batched_infer_causalmask_bias_dropout_dispatch<
     ck::bhalf_t,
+    true,
     false,
     false,
     64>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_no_causalmask_has_attnbias_has_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_no_causalmask_has_attnbias_has_dropout_maxk_128.cpp
new file mode 100644
index 000000000..6b547e34e
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_no_causalmask_has_attnbias_has_dropout_maxk_128.cpp
@@ -0,0 +1,16 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_batched_infer.h"
+
+template void run_batched_infer_causalmask_bias_dropout_dispatch<
+    ck::bhalf_t,
+    false,
+    true,
+    true,
+    128>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_no_causalmask_has_attnbias_has_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_no_causalmask_has_attnbias_has_dropout_maxk_256.cpp
new file mode 100644
index 000000000..152c34e56
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_no_causalmask_has_attnbias_has_dropout_maxk_256.cpp
@@ -0,0 +1,16 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_batched_infer.h"
+
+template void run_batched_infer_causalmask_bias_dropout_dispatch<
+    ck::bhalf_t,
+    false,
+    true,
+    true,
+    256>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_no_causalmask_has_attnbias_has_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_no_causalmask_has_attnbias_has_dropout_maxk_32.cpp
new file mode 100644
index 000000000..2db0507bd
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_no_causalmask_has_attnbias_has_dropout_maxk_32.cpp
@@ -0,0 +1,16 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_batched_infer.h"
+
+template void run_batched_infer_causalmask_bias_dropout_dispatch<
+    ck::bhalf_t,
+    false,
+    true,
+    true,
+    32>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_no_causalmask_has_attnbias_has_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_no_causalmask_has_attnbias_has_dropout_maxk_64.cpp
new file mode 100644
index 000000000..f9b0d1519
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_no_causalmask_has_attnbias_has_dropout_maxk_64.cpp
@@ -0,0 +1,16 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_batched_infer.h"
+
+template void run_batched_infer_causalmask_bias_dropout_dispatch<
+    ck::bhalf_t,
+    false,
+    true,
+    true,
+    64>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_no_causalmask_has_attnbias_no_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_no_causalmask_has_attnbias_no_dropout_maxk_128.cpp
new file mode 100644
index 000000000..5a19fe469
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_no_causalmask_has_attnbias_no_dropout_maxk_128.cpp
@@ -0,0 +1,16 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_batched_infer.h"
+
+template void run_batched_infer_causalmask_bias_dropout_dispatch<
+    ck::bhalf_t,
+    false,
+    true,
+    false,
+    128>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_no_causalmask_has_attnbias_no_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_no_causalmask_has_attnbias_no_dropout_maxk_256.cpp
new file mode 100644
index 000000000..0d9edb15d
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_no_causalmask_has_attnbias_no_dropout_maxk_256.cpp
@@ -0,0 +1,16 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_batched_infer.h"
+
+template void run_batched_infer_causalmask_bias_dropout_dispatch<
+    ck::bhalf_t,
+    false,
+    true,
+    false,
+    256>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_no_causalmask_has_attnbias_no_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_no_causalmask_has_attnbias_no_dropout_maxk_32.cpp
new file mode 100644
index 000000000..25928ff52
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_no_causalmask_has_attnbias_no_dropout_maxk_32.cpp
@@ -0,0 +1,16 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_batched_infer.h"
+
+template void run_batched_infer_causalmask_bias_dropout_dispatch<
+    ck::bhalf_t,
+    false,
+    true,
+    false,
+    32>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_no_causalmask_has_attnbias_no_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_no_causalmask_has_attnbias_no_dropout_maxk_64.cpp
new file mode 100644
index 000000000..823e9e1d1
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_no_causalmask_has_attnbias_no_dropout_maxk_64.cpp
@@ -0,0 +1,16 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_batched_infer.h"
+
+template void run_batched_infer_causalmask_bias_dropout_dispatch<
+    ck::bhalf_t,
+    false,
+    true,
+    false,
+    64>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_no_causalmask_no_attnbias_has_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_no_causalmask_no_attnbias_has_dropout_maxk_128.cpp
new file mode 100644
index 000000000..109a6e914
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_no_causalmask_no_attnbias_has_dropout_maxk_128.cpp
@@ -0,0 +1,16 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_batched_infer.h"
+
+template void run_batched_infer_causalmask_bias_dropout_dispatch<
+    ck::bhalf_t,
+    false,
+    false,
+    true,
+    128>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_no_causalmask_no_attnbias_has_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_no_causalmask_no_attnbias_has_dropout_maxk_256.cpp
new file mode 100644
index 000000000..b278bde42
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_no_causalmask_no_attnbias_has_dropout_maxk_256.cpp
@@ -0,0 +1,16 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_batched_infer.h"
+
+template void run_batched_infer_causalmask_bias_dropout_dispatch<
+    ck::bhalf_t,
+    false,
+    false,
+    true,
+    256>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_no_causalmask_no_attnbias_has_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_no_causalmask_no_attnbias_has_dropout_maxk_32.cpp
new file mode 100644
index 000000000..23f5e10f7
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_no_causalmask_no_attnbias_has_dropout_maxk_32.cpp
@@ -0,0 +1,16 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_batched_infer.h"
+
+template void run_batched_infer_causalmask_bias_dropout_dispatch<
+    ck::bhalf_t,
+    false,
+    false,
+    true,
+    32>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_no_causalmask_no_attnbias_has_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_no_causalmask_no_attnbias_has_dropout_maxk_64.cpp
new file mode 100644
index 000000000..7e62dfe1f
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_no_causalmask_no_attnbias_has_dropout_maxk_64.cpp
@@ -0,0 +1,16 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_batched_infer.h"
+
+template void run_batched_infer_causalmask_bias_dropout_dispatch<
+    ck::bhalf_t,
+    false,
+    false,
+    true,
+    64>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_no_causalmask_no_attnbias_no_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_no_causalmask_no_attnbias_no_dropout_maxk_128.cpp
new file mode 100644
index 000000000..6fda3ae54
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_no_causalmask_no_attnbias_no_dropout_maxk_128.cpp
@@ -0,0 +1,16 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_batched_infer.h"
+
+template void run_batched_infer_causalmask_bias_dropout_dispatch<
+    ck::bhalf_t,
+    false,
+    false,
+    false,
+    128>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_no_causalmask_no_attnbias_no_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_no_causalmask_no_attnbias_no_dropout_maxk_256.cpp
new file mode 100644
index 000000000..fcc5a2bd8
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_no_causalmask_no_attnbias_no_dropout_maxk_256.cpp
@@ -0,0 +1,16 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_batched_infer.h"
+
+template void run_batched_infer_causalmask_bias_dropout_dispatch<
+    ck::bhalf_t,
+    false,
+    false,
+    false,
+    256>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_no_causalmask_no_attnbias_no_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_no_causalmask_no_attnbias_no_dropout_maxk_32.cpp
new file mode 100644
index 000000000..cd7c4681b
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_no_causalmask_no_attnbias_no_dropout_maxk_32.cpp
@@ -0,0 +1,16 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_batched_infer.h"
+
+template void run_batched_infer_causalmask_bias_dropout_dispatch<
+    ck::bhalf_t,
+    false,
+    false,
+    false,
+    32>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_no_causalmask_no_attnbias_no_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_no_causalmask_no_attnbias_no_dropout_maxk_64.cpp
new file mode 100644
index 000000000..a2510ef7d
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_no_causalmask_no_attnbias_no_dropout_maxk_64.cpp
@@ -0,0 +1,16 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_batched_infer.h"
+
+template void run_batched_infer_causalmask_bias_dropout_dispatch<
+    ck::bhalf_t,
+    false,
+    false,
+    false,
+    64>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_has_causalmask_has_attnbias_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_has_causalmask_has_attnbias_has_dropout_maxk_128.cpp
similarity index 83%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_has_causalmask_has_attnbias_maxk_128.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_has_causalmask_has_attnbias_has_dropout_maxk_128.cpp
index 5aba53e37..91fa9cfb8 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_has_causalmask_has_attnbias_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_has_causalmask_has_attnbias_has_dropout_maxk_128.cpp
@@ -8,8 +8,9 @@
 
 #include "ck_tiled_fmha_batched_infer.h"
 
-template void run_batched_infer_causalmask_bias_dispatch<
+template void run_batched_infer_causalmask_bias_dropout_dispatch<
     ck::half_t,
     true,
     true,
+    true,
     128>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_has_causalmask_has_attnbias_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_has_causalmask_has_attnbias_has_dropout_maxk_256.cpp
similarity index 83%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_has_causalmask_has_attnbias_maxk_256.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_has_causalmask_has_attnbias_has_dropout_maxk_256.cpp
index 0d653b4e5..a8db3c21e 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_has_causalmask_has_attnbias_maxk_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_has_causalmask_has_attnbias_has_dropout_maxk_256.cpp
@@ -8,8 +8,9 @@
 
 #include "ck_tiled_fmha_batched_infer.h"
 
-template void run_batched_infer_causalmask_bias_dispatch<
+template void run_batched_infer_causalmask_bias_dropout_dispatch<
     ck::half_t,
     true,
     true,
+    true,
     256>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_has_causalmask_has_attnbias_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_has_causalmask_has_attnbias_has_dropout_maxk_32.cpp
similarity index 83%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_has_causalmask_has_attnbias_maxk_32.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_has_causalmask_has_attnbias_has_dropout_maxk_32.cpp
index 657708501..cf70efd4e 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_has_causalmask_has_attnbias_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_has_causalmask_has_attnbias_has_dropout_maxk_32.cpp
@@ -8,8 +8,9 @@
 
 #include "ck_tiled_fmha_batched_infer.h"
 
-template void run_batched_infer_causalmask_bias_dispatch<
+template void run_batched_infer_causalmask_bias_dropout_dispatch<
     ck::half_t,
     true,
     true,
+    true,
     32>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_has_causalmask_has_attnbias_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_has_causalmask_has_attnbias_has_dropout_maxk_64.cpp
similarity index 83%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_has_causalmask_has_attnbias_maxk_64.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_has_causalmask_has_attnbias_has_dropout_maxk_64.cpp
index 666488a9f..2699d7a96 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_has_causalmask_has_attnbias_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_has_causalmask_has_attnbias_has_dropout_maxk_64.cpp
@@ -8,8 +8,9 @@
 
 #include "ck_tiled_fmha_batched_infer.h"
 
-template void run_batched_infer_causalmask_bias_dispatch<
+template void run_batched_infer_causalmask_bias_dropout_dispatch<
     ck::half_t,
     true,
     true,
+    true,
     64>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_has_causalmask_no_attnbias_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_has_causalmask_has_attnbias_no_dropout_maxk_128.cpp
similarity index 83%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_has_causalmask_no_attnbias_maxk_128.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_has_causalmask_has_attnbias_no_dropout_maxk_128.cpp
index 47d1f4e51..98cdea404 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_has_causalmask_no_attnbias_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_has_causalmask_has_attnbias_no_dropout_maxk_128.cpp
@@ -8,8 +8,9 @@
 
 #include "ck_tiled_fmha_batched_infer.h"
 
-template void run_batched_infer_causalmask_bias_dispatch<
+template void run_batched_infer_causalmask_bias_dropout_dispatch<
     ck::half_t,
     true,
+    true,
     false,
     128>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_no_causalmask_has_attnbias_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_has_causalmask_has_attnbias_no_dropout_maxk_256.cpp
similarity index 83%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_no_causalmask_has_attnbias_maxk_256.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_has_causalmask_has_attnbias_no_dropout_maxk_256.cpp
index 14d14ce8c..10444d7d8 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_no_causalmask_has_attnbias_maxk_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_has_causalmask_has_attnbias_no_dropout_maxk_256.cpp
@@ -8,8 +8,9 @@
 
 #include "ck_tiled_fmha_batched_infer.h"
 
-template void run_batched_infer_causalmask_bias_dispatch<
+template void run_batched_infer_causalmask_bias_dropout_dispatch<
     ck::half_t,
-    false,
     true,
+    true,
+    false,
     256>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_no_causalmask_has_attnbias_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_has_causalmask_has_attnbias_no_dropout_maxk_32.cpp
similarity index 83%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_no_causalmask_has_attnbias_maxk_32.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_has_causalmask_has_attnbias_no_dropout_maxk_32.cpp
index 85eed5de9..d70389373 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_no_causalmask_has_attnbias_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_has_causalmask_has_attnbias_no_dropout_maxk_32.cpp
@@ -8,8 +8,9 @@
 
 #include "ck_tiled_fmha_batched_infer.h"
 
-template void run_batched_infer_causalmask_bias_dispatch<
+template void run_batched_infer_causalmask_bias_dropout_dispatch<
     ck::half_t,
-    false,
     true,
+    true,
+    false,
     32>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_no_causalmask_has_attnbias_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_has_causalmask_has_attnbias_no_dropout_maxk_64.cpp
similarity index 83%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_no_causalmask_has_attnbias_maxk_64.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_has_causalmask_has_attnbias_no_dropout_maxk_64.cpp
index 00de9f3ee..a6d22c666 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_no_causalmask_has_attnbias_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_has_causalmask_has_attnbias_no_dropout_maxk_64.cpp
@@ -8,8 +8,9 @@
 
 #include "ck_tiled_fmha_batched_infer.h"
 
-template void run_batched_infer_causalmask_bias_dispatch<
+template void run_batched_infer_causalmask_bias_dropout_dispatch<
     ck::half_t,
-    false,
     true,
+    true,
+    false,
     64>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_no_causalmask_has_attnbias_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_has_causalmask_no_attnbias_has_dropout_maxk_128.cpp
similarity index 83%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_no_causalmask_has_attnbias_maxk_128.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_has_causalmask_no_attnbias_has_dropout_maxk_128.cpp
index 21d46d793..6ba251a1a 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_no_causalmask_has_attnbias_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_has_causalmask_no_attnbias_has_dropout_maxk_128.cpp
@@ -8,8 +8,9 @@
 
 #include "ck_tiled_fmha_batched_infer.h"
 
-template void run_batched_infer_causalmask_bias_dispatch<
+template void run_batched_infer_causalmask_bias_dropout_dispatch<
     ck::half_t,
+    true,
     false,
     true,
     128>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_has_causalmask_no_attnbias_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_has_causalmask_no_attnbias_has_dropout_maxk_256.cpp
similarity index 83%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_has_causalmask_no_attnbias_maxk_256.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_has_causalmask_no_attnbias_has_dropout_maxk_256.cpp
index 2c1779293..8da1f1e38 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_has_causalmask_no_attnbias_maxk_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_has_causalmask_no_attnbias_has_dropout_maxk_256.cpp
@@ -8,8 +8,9 @@
 
 #include "ck_tiled_fmha_batched_infer.h"
 
-template void run_batched_infer_causalmask_bias_dispatch<
+template void run_batched_infer_causalmask_bias_dropout_dispatch<
     ck::half_t,
     true,
     false,
+    true,
     256>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_has_causalmask_no_attnbias_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_has_causalmask_no_attnbias_has_dropout_maxk_32.cpp
similarity index 83%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_has_causalmask_no_attnbias_maxk_32.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_has_causalmask_no_attnbias_has_dropout_maxk_32.cpp
index 90138a271..bb22a42a0 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_has_causalmask_no_attnbias_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_has_causalmask_no_attnbias_has_dropout_maxk_32.cpp
@@ -8,8 +8,9 @@
 
 #include "ck_tiled_fmha_batched_infer.h"
 
-template void run_batched_infer_causalmask_bias_dispatch<
+template void run_batched_infer_causalmask_bias_dropout_dispatch<
     ck::half_t,
     true,
     false,
+    true,
     32>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_has_causalmask_no_attnbias_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_has_causalmask_no_attnbias_has_dropout_maxk_64.cpp
similarity index 83%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_has_causalmask_no_attnbias_maxk_64.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_has_causalmask_no_attnbias_has_dropout_maxk_64.cpp
index 10396a224..ff98dd555 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_has_causalmask_no_attnbias_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_has_causalmask_no_attnbias_has_dropout_maxk_64.cpp
@@ -8,8 +8,9 @@
 
 #include "ck_tiled_fmha_batched_infer.h"
 
-template void run_batched_infer_causalmask_bias_dispatch<
+template void run_batched_infer_causalmask_bias_dropout_dispatch<
     ck::half_t,
     true,
     false,
+    true,
     64>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_no_causalmask_no_attnbias_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_has_causalmask_no_attnbias_no_dropout_maxk_128.cpp
similarity index 83%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_no_causalmask_no_attnbias_maxk_128.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_has_causalmask_no_attnbias_no_dropout_maxk_128.cpp
index 42edd42a5..b310ad71f 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_no_causalmask_no_attnbias_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_has_causalmask_no_attnbias_no_dropout_maxk_128.cpp
@@ -8,8 +8,9 @@
 
 #include "ck_tiled_fmha_batched_infer.h"
 
-template void run_batched_infer_causalmask_bias_dispatch<
+template void run_batched_infer_causalmask_bias_dropout_dispatch<
     ck::half_t,
+    true,
     false,
     false,
     128>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_no_causalmask_no_attnbias_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_has_causalmask_no_attnbias_no_dropout_maxk_256.cpp
similarity index 83%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_no_causalmask_no_attnbias_maxk_256.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_has_causalmask_no_attnbias_no_dropout_maxk_256.cpp
index 078a28eb7..4e0ab2c07 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_no_causalmask_no_attnbias_maxk_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_has_causalmask_no_attnbias_no_dropout_maxk_256.cpp
@@ -8,8 +8,9 @@
 
 #include "ck_tiled_fmha_batched_infer.h"
 
-template void run_batched_infer_causalmask_bias_dispatch<
+template void run_batched_infer_causalmask_bias_dropout_dispatch<
     ck::half_t,
+    true,
     false,
     false,
     256>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_no_causalmask_no_attnbias_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_has_causalmask_no_attnbias_no_dropout_maxk_32.cpp
similarity index 83%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_no_causalmask_no_attnbias_maxk_32.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_has_causalmask_no_attnbias_no_dropout_maxk_32.cpp
index f3791d766..4e3d7c989 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_no_causalmask_no_attnbias_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_has_causalmask_no_attnbias_no_dropout_maxk_32.cpp
@@ -8,8 +8,9 @@
 
 #include "ck_tiled_fmha_batched_infer.h"
 
-template void run_batched_infer_causalmask_bias_dispatch<
+template void run_batched_infer_causalmask_bias_dropout_dispatch<
     ck::half_t,
+    true,
     false,
     false,
     32>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_no_causalmask_no_attnbias_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_has_causalmask_no_attnbias_no_dropout_maxk_64.cpp
similarity index 83%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_no_causalmask_no_attnbias_maxk_64.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_has_causalmask_no_attnbias_no_dropout_maxk_64.cpp
index 23b8796ce..e619bcb8d 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_no_causalmask_no_attnbias_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_has_causalmask_no_attnbias_no_dropout_maxk_64.cpp
@@ -8,8 +8,9 @@
 
 #include "ck_tiled_fmha_batched_infer.h"
 
-template void run_batched_infer_causalmask_bias_dispatch<
+template void run_batched_infer_causalmask_bias_dropout_dispatch<
     ck::half_t,
+    true,
     false,
     false,
     64>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_no_causalmask_has_attnbias_has_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_no_causalmask_has_attnbias_has_dropout_maxk_128.cpp
new file mode 100644
index 000000000..2d60996b8
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_no_causalmask_has_attnbias_has_dropout_maxk_128.cpp
@@ -0,0 +1,16 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_batched_infer.h"
+
+template void run_batched_infer_causalmask_bias_dropout_dispatch<
+    ck::half_t,
+    false,
+    true,
+    true,
+    128>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_no_causalmask_has_attnbias_has_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_no_causalmask_has_attnbias_has_dropout_maxk_256.cpp
new file mode 100644
index 000000000..3a39fb4ae
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_no_causalmask_has_attnbias_has_dropout_maxk_256.cpp
@@ -0,0 +1,16 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_batched_infer.h"
+
+template void run_batched_infer_causalmask_bias_dropout_dispatch<
+    ck::half_t,
+    false,
+    true,
+    true,
+    256>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_no_causalmask_has_attnbias_has_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_no_causalmask_has_attnbias_has_dropout_maxk_32.cpp
new file mode 100644
index 000000000..1951d311c
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_no_causalmask_has_attnbias_has_dropout_maxk_32.cpp
@@ -0,0 +1,16 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_batched_infer.h"
+
+template void run_batched_infer_causalmask_bias_dropout_dispatch<
+    ck::half_t,
+    false,
+    true,
+    true,
+    32>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_no_causalmask_has_attnbias_has_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_no_causalmask_has_attnbias_has_dropout_maxk_64.cpp
new file mode 100644
index 000000000..4557fe7aa
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_no_causalmask_has_attnbias_has_dropout_maxk_64.cpp
@@ -0,0 +1,16 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_batched_infer.h"
+
+template void run_batched_infer_causalmask_bias_dropout_dispatch<
+    ck::half_t,
+    false,
+    true,
+    true,
+    64>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_no_causalmask_has_attnbias_no_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_no_causalmask_has_attnbias_no_dropout_maxk_128.cpp
new file mode 100644
index 000000000..ae7739be4
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_no_causalmask_has_attnbias_no_dropout_maxk_128.cpp
@@ -0,0 +1,16 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_batched_infer.h"
+
+template void run_batched_infer_causalmask_bias_dropout_dispatch<
+    ck::half_t,
+    false,
+    true,
+    false,
+    128>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_no_causalmask_has_attnbias_no_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_no_causalmask_has_attnbias_no_dropout_maxk_256.cpp
new file mode 100644
index 000000000..3594e81fd
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_no_causalmask_has_attnbias_no_dropout_maxk_256.cpp
@@ -0,0 +1,16 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_batched_infer.h"
+
+template void run_batched_infer_causalmask_bias_dropout_dispatch<
+    ck::half_t,
+    false,
+    true,
+    false,
+    256>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_no_causalmask_has_attnbias_no_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_no_causalmask_has_attnbias_no_dropout_maxk_32.cpp
new file mode 100644
index 000000000..e4fb8dbad
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_no_causalmask_has_attnbias_no_dropout_maxk_32.cpp
@@ -0,0 +1,16 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_batched_infer.h"
+
+template void run_batched_infer_causalmask_bias_dropout_dispatch<
+    ck::half_t,
+    false,
+    true,
+    false,
+    32>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_no_causalmask_has_attnbias_no_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_no_causalmask_has_attnbias_no_dropout_maxk_64.cpp
new file mode 100644
index 000000000..a15494b0f
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_no_causalmask_has_attnbias_no_dropout_maxk_64.cpp
@@ -0,0 +1,16 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_batched_infer.h"
+
+template void run_batched_infer_causalmask_bias_dropout_dispatch<
+    ck::half_t,
+    false,
+    true,
+    false,
+    64>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_no_causalmask_no_attnbias_has_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_no_causalmask_no_attnbias_has_dropout_maxk_128.cpp
new file mode 100644
index 000000000..81607aa68
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_no_causalmask_no_attnbias_has_dropout_maxk_128.cpp
@@ -0,0 +1,16 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_batched_infer.h"
+
+template void run_batched_infer_causalmask_bias_dropout_dispatch<
+    ck::half_t,
+    false,
+    false,
+    true,
+    128>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_no_causalmask_no_attnbias_has_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_no_causalmask_no_attnbias_has_dropout_maxk_256.cpp
new file mode 100644
index 000000000..86e5b5a66
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_no_causalmask_no_attnbias_has_dropout_maxk_256.cpp
@@ -0,0 +1,16 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_batched_infer.h"
+
+template void run_batched_infer_causalmask_bias_dropout_dispatch<
+    ck::half_t,
+    false,
+    false,
+    true,
+    256>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_no_causalmask_no_attnbias_has_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_no_causalmask_no_attnbias_has_dropout_maxk_32.cpp
new file mode 100644
index 000000000..07d487f6e
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_no_causalmask_no_attnbias_has_dropout_maxk_32.cpp
@@ -0,0 +1,16 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_batched_infer.h"
+
+template void run_batched_infer_causalmask_bias_dropout_dispatch<
+    ck::half_t,
+    false,
+    false,
+    true,
+    32>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_no_causalmask_no_attnbias_has_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_no_causalmask_no_attnbias_has_dropout_maxk_64.cpp
new file mode 100644
index 000000000..83043e1c5
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_no_causalmask_no_attnbias_has_dropout_maxk_64.cpp
@@ -0,0 +1,16 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_batched_infer.h"
+
+template void run_batched_infer_causalmask_bias_dropout_dispatch<
+    ck::half_t,
+    false,
+    false,
+    true,
+    64>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_no_causalmask_no_attnbias_no_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_no_causalmask_no_attnbias_no_dropout_maxk_128.cpp
new file mode 100644
index 000000000..f6ffe4963
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_no_causalmask_no_attnbias_no_dropout_maxk_128.cpp
@@ -0,0 +1,16 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_batched_infer.h"
+
+template void run_batched_infer_causalmask_bias_dropout_dispatch<
+    ck::half_t,
+    false,
+    false,
+    false,
+    128>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_no_causalmask_no_attnbias_no_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_no_causalmask_no_attnbias_no_dropout_maxk_256.cpp
new file mode 100644
index 000000000..3b57b10ce
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_no_causalmask_no_attnbias_no_dropout_maxk_256.cpp
@@ -0,0 +1,16 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_batched_infer.h"
+
+template void run_batched_infer_causalmask_bias_dropout_dispatch<
+    ck::half_t,
+    false,
+    false,
+    false,
+    256>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_no_causalmask_no_attnbias_no_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_no_causalmask_no_attnbias_no_dropout_maxk_32.cpp
new file mode 100644
index 000000000..00872610f
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_no_causalmask_no_attnbias_no_dropout_maxk_32.cpp
@@ -0,0 +1,16 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_batched_infer.h"
+
+template void run_batched_infer_causalmask_bias_dropout_dispatch<
+    ck::half_t,
+    false,
+    false,
+    false,
+    32>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_no_causalmask_no_attnbias_no_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_no_causalmask_no_attnbias_no_dropout_maxk_64.cpp
new file mode 100644
index 000000000..0d69fcda0
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_no_causalmask_no_attnbias_no_dropout_maxk_64.cpp
@@ -0,0 +1,16 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_batched_infer.h"
+
+template void run_batched_infer_causalmask_bias_dropout_dispatch<
+    ck::half_t,
+    false,
+    false,
+    false,
+    64>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_has_causalmask_has_attnbias_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_has_causalmask_has_attnbias_has_dropout_maxk_128.cpp
similarity index 83%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_has_causalmask_has_attnbias_maxk_128.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_has_causalmask_has_attnbias_has_dropout_maxk_128.cpp
index dacb2b5ff..32a098714 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_has_causalmask_has_attnbias_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_has_causalmask_has_attnbias_has_dropout_maxk_128.cpp
@@ -8,8 +8,9 @@
 
 #include "ck_tiled_fmha_grouped_infer.h"
 
-template void run_grouped_infer_causalmask_bias_dispatch<
+template void run_grouped_infer_causalmask_bias_dropout_dispatch<
     ck::bhalf_t,
     true,
     true,
+    true,
     128>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_has_causalmask_has_attnbias_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_has_causalmask_has_attnbias_has_dropout_maxk_256.cpp
similarity index 83%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_has_causalmask_has_attnbias_maxk_256.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_has_causalmask_has_attnbias_has_dropout_maxk_256.cpp
index 49faae0f1..b67cc8ca6 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_has_causalmask_has_attnbias_maxk_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_has_causalmask_has_attnbias_has_dropout_maxk_256.cpp
@@ -8,8 +8,9 @@
 
 #include "ck_tiled_fmha_grouped_infer.h"
 
-template void run_grouped_infer_causalmask_bias_dispatch<
+template void run_grouped_infer_causalmask_bias_dropout_dispatch<
     ck::bhalf_t,
     true,
     true,
+    true,
     256>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_has_causalmask_has_attnbias_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_has_causalmask_has_attnbias_has_dropout_maxk_32.cpp
similarity index 83%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_has_causalmask_has_attnbias_maxk_32.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_has_causalmask_has_attnbias_has_dropout_maxk_32.cpp
index 79f83bbd6..77ecf2f4a 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_has_causalmask_has_attnbias_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_has_causalmask_has_attnbias_has_dropout_maxk_32.cpp
@@ -8,8 +8,9 @@
 
 #include "ck_tiled_fmha_grouped_infer.h"
 
-template void run_grouped_infer_causalmask_bias_dispatch<
+template void run_grouped_infer_causalmask_bias_dropout_dispatch<
     ck::bhalf_t,
     true,
     true,
+    true,
     32>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_has_causalmask_has_attnbias_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_has_causalmask_has_attnbias_has_dropout_maxk_64.cpp
similarity index 83%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_has_causalmask_has_attnbias_maxk_64.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_has_causalmask_has_attnbias_has_dropout_maxk_64.cpp
index 965428a41..efae07d30 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_has_causalmask_has_attnbias_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_has_causalmask_has_attnbias_has_dropout_maxk_64.cpp
@@ -8,8 +8,9 @@
 
 #include "ck_tiled_fmha_grouped_infer.h"
 
-template void run_grouped_infer_causalmask_bias_dispatch<
+template void run_grouped_infer_causalmask_bias_dropout_dispatch<
     ck::bhalf_t,
     true,
     true,
+    true,
     64>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_has_causalmask_no_attnbias_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_has_causalmask_has_attnbias_no_dropout_maxk_128.cpp
similarity index 83%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_has_causalmask_no_attnbias_maxk_128.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_has_causalmask_has_attnbias_no_dropout_maxk_128.cpp
index ffd8ac153..b8221e500 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_has_causalmask_no_attnbias_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_has_causalmask_has_attnbias_no_dropout_maxk_128.cpp
@@ -8,8 +8,9 @@
 
 #include "ck_tiled_fmha_grouped_infer.h"
 
-template void run_grouped_infer_causalmask_bias_dispatch<
+template void run_grouped_infer_causalmask_bias_dropout_dispatch<
     ck::bhalf_t,
     true,
+    true,
     false,
     128>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_no_causalmask_has_attnbias_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_has_causalmask_has_attnbias_no_dropout_maxk_256.cpp
similarity index 83%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_no_causalmask_has_attnbias_maxk_256.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_has_causalmask_has_attnbias_no_dropout_maxk_256.cpp
index 4a4298a24..8f5458f9a 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_no_causalmask_has_attnbias_maxk_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_has_causalmask_has_attnbias_no_dropout_maxk_256.cpp
@@ -8,8 +8,9 @@
 
 #include "ck_tiled_fmha_grouped_infer.h"
 
-template void run_grouped_infer_causalmask_bias_dispatch<
+template void run_grouped_infer_causalmask_bias_dropout_dispatch<
     ck::bhalf_t,
-    false,
     true,
+    true,
+    false,
     256>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_no_causalmask_has_attnbias_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_has_causalmask_has_attnbias_no_dropout_maxk_32.cpp
similarity index 83%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_no_causalmask_has_attnbias_maxk_32.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_has_causalmask_has_attnbias_no_dropout_maxk_32.cpp
index 2584fcf0b..d64878a93 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_no_causalmask_has_attnbias_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_has_causalmask_has_attnbias_no_dropout_maxk_32.cpp
@@ -8,8 +8,9 @@
 
 #include "ck_tiled_fmha_grouped_infer.h"
 
-template void run_grouped_infer_causalmask_bias_dispatch<
+template void run_grouped_infer_causalmask_bias_dropout_dispatch<
     ck::bhalf_t,
-    false,
     true,
+    true,
+    false,
     32>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_has_causalmask_no_attnbias_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_has_causalmask_has_attnbias_no_dropout_maxk_64.cpp
similarity index 83%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_has_causalmask_no_attnbias_maxk_64.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_has_causalmask_has_attnbias_no_dropout_maxk_64.cpp
index 5bf323e37..078c81ca0 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_has_causalmask_no_attnbias_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_has_causalmask_has_attnbias_no_dropout_maxk_64.cpp
@@ -8,8 +8,9 @@
 
 #include "ck_tiled_fmha_grouped_infer.h"
 
-template void run_grouped_infer_causalmask_bias_dispatch<
+template void run_grouped_infer_causalmask_bias_dropout_dispatch<
     ck::bhalf_t,
     true,
+    true,
     false,
     64>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_no_causalmask_has_attnbias_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_has_causalmask_no_attnbias_has_dropout_maxk_128.cpp
similarity index 83%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_no_causalmask_has_attnbias_maxk_128.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_has_causalmask_no_attnbias_has_dropout_maxk_128.cpp
index bfb0a8aab..13205e8c4 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_no_causalmask_has_attnbias_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_has_causalmask_no_attnbias_has_dropout_maxk_128.cpp
@@ -8,8 +8,9 @@
 
 #include "ck_tiled_fmha_grouped_infer.h"
 
-template void run_grouped_infer_causalmask_bias_dispatch<
+template void run_grouped_infer_causalmask_bias_dropout_dispatch<
     ck::bhalf_t,
+    true,
     false,
     true,
     128>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_has_causalmask_no_attnbias_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_has_causalmask_no_attnbias_has_dropout_maxk_256.cpp
similarity index 83%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_has_causalmask_no_attnbias_maxk_256.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_has_causalmask_no_attnbias_has_dropout_maxk_256.cpp
index 46495e5f0..e399bfbce 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_has_causalmask_no_attnbias_maxk_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_has_causalmask_no_attnbias_has_dropout_maxk_256.cpp
@@ -8,8 +8,9 @@
 
 #include "ck_tiled_fmha_grouped_infer.h"
 
-template void run_grouped_infer_causalmask_bias_dispatch<
+template void run_grouped_infer_causalmask_bias_dropout_dispatch<
     ck::bhalf_t,
     true,
     false,
+    true,
     256>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_has_causalmask_no_attnbias_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_has_causalmask_no_attnbias_has_dropout_maxk_32.cpp
similarity index 83%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_has_causalmask_no_attnbias_maxk_32.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_has_causalmask_no_attnbias_has_dropout_maxk_32.cpp
index c52e17f7b..9c3081f7a 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_has_causalmask_no_attnbias_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_has_causalmask_no_attnbias_has_dropout_maxk_32.cpp
@@ -8,8 +8,9 @@
 
 #include "ck_tiled_fmha_grouped_infer.h"
 
-template void run_grouped_infer_causalmask_bias_dispatch<
+template void run_grouped_infer_causalmask_bias_dropout_dispatch<
     ck::bhalf_t,
     true,
     false,
+    true,
     32>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_no_causalmask_has_attnbias_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_has_causalmask_no_attnbias_has_dropout_maxk_64.cpp
similarity index 83%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_no_causalmask_has_attnbias_maxk_64.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_has_causalmask_no_attnbias_has_dropout_maxk_64.cpp
index a8197825f..60e847191 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_no_causalmask_has_attnbias_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_has_causalmask_no_attnbias_has_dropout_maxk_64.cpp
@@ -8,8 +8,9 @@
 
 #include "ck_tiled_fmha_grouped_infer.h"
 
-template void run_grouped_infer_causalmask_bias_dispatch<
+template void run_grouped_infer_causalmask_bias_dropout_dispatch<
     ck::bhalf_t,
+    true,
     false,
     true,
     64>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_no_causalmask_no_attnbias_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_has_causalmask_no_attnbias_no_dropout_maxk_128.cpp
similarity index 83%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_no_causalmask_no_attnbias_maxk_128.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_has_causalmask_no_attnbias_no_dropout_maxk_128.cpp
index d409b257b..f030cbb00 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_no_causalmask_no_attnbias_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_has_causalmask_no_attnbias_no_dropout_maxk_128.cpp
@@ -8,8 +8,9 @@
 
 #include "ck_tiled_fmha_grouped_infer.h"
 
-template void run_grouped_infer_causalmask_bias_dispatch<
+template void run_grouped_infer_causalmask_bias_dropout_dispatch<
     ck::bhalf_t,
+    true,
     false,
     false,
     128>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_no_causalmask_no_attnbias_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_has_causalmask_no_attnbias_no_dropout_maxk_256.cpp
similarity index 83%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_no_causalmask_no_attnbias_maxk_256.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_has_causalmask_no_attnbias_no_dropout_maxk_256.cpp
index 8022f3e25..efc5b625a 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_no_causalmask_no_attnbias_maxk_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_has_causalmask_no_attnbias_no_dropout_maxk_256.cpp
@@ -8,8 +8,9 @@
 
 #include "ck_tiled_fmha_grouped_infer.h"
 
-template void run_grouped_infer_causalmask_bias_dispatch<
+template void run_grouped_infer_causalmask_bias_dropout_dispatch<
     ck::bhalf_t,
+    true,
     false,
     false,
     256>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_no_causalmask_no_attnbias_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_has_causalmask_no_attnbias_no_dropout_maxk_32.cpp
similarity index 83%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_no_causalmask_no_attnbias_maxk_32.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_has_causalmask_no_attnbias_no_dropout_maxk_32.cpp
index a8ab2616c..0b7037cec 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_no_causalmask_no_attnbias_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_has_causalmask_no_attnbias_no_dropout_maxk_32.cpp
@@ -8,8 +8,9 @@
 
 #include "ck_tiled_fmha_grouped_infer.h"
 
-template void run_grouped_infer_causalmask_bias_dispatch<
+template void run_grouped_infer_causalmask_bias_dropout_dispatch<
     ck::bhalf_t,
+    true,
     false,
     false,
     32>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_no_causalmask_no_attnbias_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_has_causalmask_no_attnbias_no_dropout_maxk_64.cpp
similarity index 83%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_no_causalmask_no_attnbias_maxk_64.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_has_causalmask_no_attnbias_no_dropout_maxk_64.cpp
index d0c3b76d6..7301fdb10 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_no_causalmask_no_attnbias_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_has_causalmask_no_attnbias_no_dropout_maxk_64.cpp
@@ -8,8 +8,9 @@
 
 #include "ck_tiled_fmha_grouped_infer.h"
 
-template void run_grouped_infer_causalmask_bias_dispatch<
+template void run_grouped_infer_causalmask_bias_dropout_dispatch<
     ck::bhalf_t,
+    true,
     false,
     false,
     64>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_no_causalmask_has_attnbias_has_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_no_causalmask_has_attnbias_has_dropout_maxk_128.cpp
new file mode 100644
index 000000000..5b000a628
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_no_causalmask_has_attnbias_has_dropout_maxk_128.cpp
@@ -0,0 +1,16 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_grouped_infer.h"
+
+template void run_grouped_infer_causalmask_bias_dropout_dispatch<
+    ck::bhalf_t,
+    false,
+    true,
+    true,
+    128>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_no_causalmask_has_attnbias_has_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_no_causalmask_has_attnbias_has_dropout_maxk_256.cpp
new file mode 100644
index 000000000..47c79b1af
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_no_causalmask_has_attnbias_has_dropout_maxk_256.cpp
@@ -0,0 +1,16 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_grouped_infer.h"
+
+template void run_grouped_infer_causalmask_bias_dropout_dispatch<
+    ck::bhalf_t,
+    false,
+    true,
+    true,
+    256>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_no_causalmask_has_attnbias_has_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_no_causalmask_has_attnbias_has_dropout_maxk_32.cpp
new file mode 100644
index 000000000..463a621af
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_no_causalmask_has_attnbias_has_dropout_maxk_32.cpp
@@ -0,0 +1,16 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_grouped_infer.h"
+
+template void run_grouped_infer_causalmask_bias_dropout_dispatch<
+    ck::bhalf_t,
+    false,
+    true,
+    true,
+    32>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_no_causalmask_has_attnbias_has_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_no_causalmask_has_attnbias_has_dropout_maxk_64.cpp
new file mode 100644
index 000000000..f53906c82
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_no_causalmask_has_attnbias_has_dropout_maxk_64.cpp
@@ -0,0 +1,16 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_grouped_infer.h"
+
+template void run_grouped_infer_causalmask_bias_dropout_dispatch<
+    ck::bhalf_t,
+    false,
+    true,
+    true,
+    64>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_no_causalmask_has_attnbias_no_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_no_causalmask_has_attnbias_no_dropout_maxk_128.cpp
new file mode 100644
index 000000000..e25c9ece7
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_no_causalmask_has_attnbias_no_dropout_maxk_128.cpp
@@ -0,0 +1,16 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_grouped_infer.h"
+
+template void run_grouped_infer_causalmask_bias_dropout_dispatch<
+    ck::bhalf_t,
+    false,
+    true,
+    false,
+    128>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_no_causalmask_has_attnbias_no_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_no_causalmask_has_attnbias_no_dropout_maxk_256.cpp
new file mode 100644
index 000000000..093395947
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_no_causalmask_has_attnbias_no_dropout_maxk_256.cpp
@@ -0,0 +1,16 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_grouped_infer.h"
+
+template void run_grouped_infer_causalmask_bias_dropout_dispatch<
+    ck::bhalf_t,
+    false,
+    true,
+    false,
+    256>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_no_causalmask_has_attnbias_no_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_no_causalmask_has_attnbias_no_dropout_maxk_32.cpp
new file mode 100644
index 000000000..3724a2886
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_no_causalmask_has_attnbias_no_dropout_maxk_32.cpp
@@ -0,0 +1,16 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_grouped_infer.h"
+
+template void run_grouped_infer_causalmask_bias_dropout_dispatch<
+    ck::bhalf_t,
+    false,
+    true,
+    false,
+    32>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_no_causalmask_has_attnbias_no_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_no_causalmask_has_attnbias_no_dropout_maxk_64.cpp
new file mode 100644
index 000000000..a96ab0ce5
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_no_causalmask_has_attnbias_no_dropout_maxk_64.cpp
@@ -0,0 +1,16 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_grouped_infer.h"
+
+template void run_grouped_infer_causalmask_bias_dropout_dispatch<
+    ck::bhalf_t,
+    false,
+    true,
+    false,
+    64>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_no_causalmask_no_attnbias_has_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_no_causalmask_no_attnbias_has_dropout_maxk_128.cpp
new file mode 100644
index 000000000..f18bf1e8f
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_no_causalmask_no_attnbias_has_dropout_maxk_128.cpp
@@ -0,0 +1,16 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_grouped_infer.h"
+
+template void run_grouped_infer_causalmask_bias_dropout_dispatch<
+    ck::bhalf_t,
+    false,
+    false,
+    true,
+    128>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_no_causalmask_no_attnbias_has_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_no_causalmask_no_attnbias_has_dropout_maxk_256.cpp
new file mode 100644
index 000000000..cd0336e0d
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_no_causalmask_no_attnbias_has_dropout_maxk_256.cpp
@@ -0,0 +1,16 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_grouped_infer.h"
+
+template void run_grouped_infer_causalmask_bias_dropout_dispatch<
+    ck::bhalf_t,
+    false,
+    false,
+    true,
+    256>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_no_causalmask_no_attnbias_has_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_no_causalmask_no_attnbias_has_dropout_maxk_32.cpp
new file mode 100644
index 000000000..baf202b49
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_no_causalmask_no_attnbias_has_dropout_maxk_32.cpp
@@ -0,0 +1,16 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_grouped_infer.h"
+
+template void run_grouped_infer_causalmask_bias_dropout_dispatch<
+    ck::bhalf_t,
+    false,
+    false,
+    true,
+    32>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_no_causalmask_no_attnbias_has_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_no_causalmask_no_attnbias_has_dropout_maxk_64.cpp
new file mode 100644
index 000000000..65c0c923d
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_no_causalmask_no_attnbias_has_dropout_maxk_64.cpp
@@ -0,0 +1,16 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_grouped_infer.h"
+
+template void run_grouped_infer_causalmask_bias_dropout_dispatch<
+    ck::bhalf_t,
+    false,
+    false,
+    true,
+    64>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_no_causalmask_no_attnbias_no_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_no_causalmask_no_attnbias_no_dropout_maxk_128.cpp
new file mode 100644
index 000000000..c9c1b385b
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_no_causalmask_no_attnbias_no_dropout_maxk_128.cpp
@@ -0,0 +1,16 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_grouped_infer.h"
+
+template void run_grouped_infer_causalmask_bias_dropout_dispatch<
+    ck::bhalf_t,
+    false,
+    false,
+    false,
+    128>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_no_causalmask_no_attnbias_no_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_no_causalmask_no_attnbias_no_dropout_maxk_256.cpp
new file mode 100644
index 000000000..4a5e084d9
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_no_causalmask_no_attnbias_no_dropout_maxk_256.cpp
@@ -0,0 +1,16 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_grouped_infer.h"
+
+template void run_grouped_infer_causalmask_bias_dropout_dispatch<
+    ck::bhalf_t,
+    false,
+    false,
+    false,
+    256>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_no_causalmask_no_attnbias_no_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_no_causalmask_no_attnbias_no_dropout_maxk_32.cpp
new file mode 100644
index 000000000..ae7440bf9
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_no_causalmask_no_attnbias_no_dropout_maxk_32.cpp
@@ -0,0 +1,16 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_grouped_infer.h"
+
+template void run_grouped_infer_causalmask_bias_dropout_dispatch<
+    ck::bhalf_t,
+    false,
+    false,
+    false,
+    32>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_no_causalmask_no_attnbias_no_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_no_causalmask_no_attnbias_no_dropout_maxk_64.cpp
new file mode 100644
index 000000000..5f6048cbb
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_no_causalmask_no_attnbias_no_dropout_maxk_64.cpp
@@ -0,0 +1,16 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_grouped_infer.h"
+
+template void run_grouped_infer_causalmask_bias_dropout_dispatch<
+    ck::bhalf_t,
+    false,
+    false,
+    false,
+    64>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_has_causalmask_has_attnbias_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_has_causalmask_has_attnbias_has_dropout_maxk_128.cpp
similarity index 83%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_has_causalmask_has_attnbias_maxk_128.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_has_causalmask_has_attnbias_has_dropout_maxk_128.cpp
index 3e0acc63a..0ea9c2176 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_has_causalmask_has_attnbias_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_has_causalmask_has_attnbias_has_dropout_maxk_128.cpp
@@ -8,8 +8,9 @@
 
 #include "ck_tiled_fmha_grouped_infer.h"
 
-template void run_grouped_infer_causalmask_bias_dispatch<
+template void run_grouped_infer_causalmask_bias_dropout_dispatch<
     ck::half_t,
     true,
     true,
+    true,
     128>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_has_causalmask_has_attnbias_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_has_causalmask_has_attnbias_has_dropout_maxk_256.cpp
similarity index 83%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_has_causalmask_has_attnbias_maxk_256.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_has_causalmask_has_attnbias_has_dropout_maxk_256.cpp
index f17c72caa..bc668d784 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_has_causalmask_has_attnbias_maxk_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_has_causalmask_has_attnbias_has_dropout_maxk_256.cpp
@@ -8,8 +8,9 @@
 
 #include "ck_tiled_fmha_grouped_infer.h"
 
-template void run_grouped_infer_causalmask_bias_dispatch<
+template void run_grouped_infer_causalmask_bias_dropout_dispatch<
     ck::half_t,
     true,
     true,
+    true,
     256>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_has_causalmask_has_attnbias_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_has_causalmask_has_attnbias_has_dropout_maxk_32.cpp
similarity index 83%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_has_causalmask_has_attnbias_maxk_32.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_has_causalmask_has_attnbias_has_dropout_maxk_32.cpp
index be812c79f..f2375b0a7 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_has_causalmask_has_attnbias_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_has_causalmask_has_attnbias_has_dropout_maxk_32.cpp
@@ -8,8 +8,9 @@
 
 #include "ck_tiled_fmha_grouped_infer.h"
 
-template void run_grouped_infer_causalmask_bias_dispatch<
+template void run_grouped_infer_causalmask_bias_dropout_dispatch<
     ck::half_t,
     true,
     true,
+    true,
     32>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_has_causalmask_has_attnbias_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_has_causalmask_has_attnbias_has_dropout_maxk_64.cpp
similarity index 83%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_has_causalmask_has_attnbias_maxk_64.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_has_causalmask_has_attnbias_has_dropout_maxk_64.cpp
index 360180c84..66de4bf3d 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_has_causalmask_has_attnbias_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_has_causalmask_has_attnbias_has_dropout_maxk_64.cpp
@@ -8,8 +8,9 @@
 
 #include "ck_tiled_fmha_grouped_infer.h"
 
-template void run_grouped_infer_causalmask_bias_dispatch<
+template void run_grouped_infer_causalmask_bias_dropout_dispatch<
     ck::half_t,
     true,
     true,
+    true,
     64>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_has_causalmask_no_attnbias_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_has_causalmask_has_attnbias_no_dropout_maxk_128.cpp
similarity index 83%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_has_causalmask_no_attnbias_maxk_128.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_has_causalmask_has_attnbias_no_dropout_maxk_128.cpp
index ea0f83842..dce9620da 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_has_causalmask_no_attnbias_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_has_causalmask_has_attnbias_no_dropout_maxk_128.cpp
@@ -8,8 +8,9 @@
 
 #include "ck_tiled_fmha_grouped_infer.h"
 
-template void run_grouped_infer_causalmask_bias_dispatch<
+template void run_grouped_infer_causalmask_bias_dropout_dispatch<
     ck::half_t,
     true,
+    true,
     false,
     128>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_no_causalmask_has_attnbias_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_has_causalmask_has_attnbias_no_dropout_maxk_256.cpp
similarity index 83%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_no_causalmask_has_attnbias_maxk_256.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_has_causalmask_has_attnbias_no_dropout_maxk_256.cpp
index 6819de0a3..eaa255d2a 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_no_causalmask_has_attnbias_maxk_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_has_causalmask_has_attnbias_no_dropout_maxk_256.cpp
@@ -8,8 +8,9 @@
 
 #include "ck_tiled_fmha_grouped_infer.h"
 
-template void run_grouped_infer_causalmask_bias_dispatch<
+template void run_grouped_infer_causalmask_bias_dropout_dispatch<
     ck::half_t,
-    false,
     true,
+    true,
+    false,
     256>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_no_causalmask_has_attnbias_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_has_causalmask_has_attnbias_no_dropout_maxk_32.cpp
similarity index 83%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_no_causalmask_has_attnbias_maxk_32.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_has_causalmask_has_attnbias_no_dropout_maxk_32.cpp
index 3ab3cc5c2..1c1cee370 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_no_causalmask_has_attnbias_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_has_causalmask_has_attnbias_no_dropout_maxk_32.cpp
@@ -8,8 +8,9 @@
 
 #include "ck_tiled_fmha_grouped_infer.h"
 
-template void run_grouped_infer_causalmask_bias_dispatch<
+template void run_grouped_infer_causalmask_bias_dropout_dispatch<
     ck::half_t,
-    false,
     true,
+    true,
+    false,
     32>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_no_causalmask_has_attnbias_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_has_causalmask_has_attnbias_no_dropout_maxk_64.cpp
similarity index 83%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_no_causalmask_has_attnbias_maxk_64.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_has_causalmask_has_attnbias_no_dropout_maxk_64.cpp
index 7470f5b12..53434b15a 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_no_causalmask_has_attnbias_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_has_causalmask_has_attnbias_no_dropout_maxk_64.cpp
@@ -8,8 +8,9 @@
 
 #include "ck_tiled_fmha_grouped_infer.h"
 
-template void run_grouped_infer_causalmask_bias_dispatch<
+template void run_grouped_infer_causalmask_bias_dropout_dispatch<
     ck::half_t,
-    false,
     true,
+    true,
+    false,
     64>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_no_causalmask_has_attnbias_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_has_causalmask_no_attnbias_has_dropout_maxk_128.cpp
similarity index 83%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_no_causalmask_has_attnbias_maxk_128.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_has_causalmask_no_attnbias_has_dropout_maxk_128.cpp
index 238ef6acb..5a2c266d6 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_no_causalmask_has_attnbias_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_has_causalmask_no_attnbias_has_dropout_maxk_128.cpp
@@ -8,8 +8,9 @@
 
 #include "ck_tiled_fmha_grouped_infer.h"
 
-template void run_grouped_infer_causalmask_bias_dispatch<
+template void run_grouped_infer_causalmask_bias_dropout_dispatch<
     ck::half_t,
+    true,
     false,
     true,
     128>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_has_causalmask_no_attnbias_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_has_causalmask_no_attnbias_has_dropout_maxk_256.cpp
similarity index 83%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_has_causalmask_no_attnbias_maxk_256.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_has_causalmask_no_attnbias_has_dropout_maxk_256.cpp
index 8647f8273..e8f0b6908 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_has_causalmask_no_attnbias_maxk_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_has_causalmask_no_attnbias_has_dropout_maxk_256.cpp
@@ -8,8 +8,9 @@
 
 #include "ck_tiled_fmha_grouped_infer.h"
 
-template void run_grouped_infer_causalmask_bias_dispatch<
+template void run_grouped_infer_causalmask_bias_dropout_dispatch<
     ck::half_t,
     true,
     false,
+    true,
     256>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_has_causalmask_no_attnbias_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_has_causalmask_no_attnbias_has_dropout_maxk_32.cpp
similarity index 83%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_has_causalmask_no_attnbias_maxk_32.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_has_causalmask_no_attnbias_has_dropout_maxk_32.cpp
index 28a808522..b316aa818 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_has_causalmask_no_attnbias_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_has_causalmask_no_attnbias_has_dropout_maxk_32.cpp
@@ -8,8 +8,9 @@
 
 #include "ck_tiled_fmha_grouped_infer.h"
 
-template void run_grouped_infer_causalmask_bias_dispatch<
+template void run_grouped_infer_causalmask_bias_dropout_dispatch<
     ck::half_t,
     true,
     false,
+    true,
     32>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_has_causalmask_no_attnbias_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_has_causalmask_no_attnbias_has_dropout_maxk_64.cpp
similarity index 83%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_has_causalmask_no_attnbias_maxk_64.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_has_causalmask_no_attnbias_has_dropout_maxk_64.cpp
index 888f0f8e7..3cc34095b 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_has_causalmask_no_attnbias_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_has_causalmask_no_attnbias_has_dropout_maxk_64.cpp
@@ -8,8 +8,9 @@
 
 #include "ck_tiled_fmha_grouped_infer.h"
 
-template void run_grouped_infer_causalmask_bias_dispatch<
+template void run_grouped_infer_causalmask_bias_dropout_dispatch<
     ck::half_t,
     true,
     false,
+    true,
     64>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_no_causalmask_no_attnbias_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_has_causalmask_no_attnbias_no_dropout_maxk_128.cpp
similarity index 83%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_no_causalmask_no_attnbias_maxk_128.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_has_causalmask_no_attnbias_no_dropout_maxk_128.cpp
index 7226ed616..069aa9ed6 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_no_causalmask_no_attnbias_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_has_causalmask_no_attnbias_no_dropout_maxk_128.cpp
@@ -8,8 +8,9 @@
 
 #include "ck_tiled_fmha_grouped_infer.h"
 
-template void run_grouped_infer_causalmask_bias_dispatch<
+template void run_grouped_infer_causalmask_bias_dropout_dispatch<
     ck::half_t,
+    true,
     false,
     false,
     128>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_no_causalmask_no_attnbias_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_has_causalmask_no_attnbias_no_dropout_maxk_256.cpp
similarity index 83%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_no_causalmask_no_attnbias_maxk_256.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_has_causalmask_no_attnbias_no_dropout_maxk_256.cpp
index c8ec9fcdd..d09b9b0c0 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_no_causalmask_no_attnbias_maxk_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_has_causalmask_no_attnbias_no_dropout_maxk_256.cpp
@@ -8,8 +8,9 @@
 
 #include "ck_tiled_fmha_grouped_infer.h"
 
-template void run_grouped_infer_causalmask_bias_dispatch<
+template void run_grouped_infer_causalmask_bias_dropout_dispatch<
     ck::half_t,
+    true,
     false,
     false,
     256>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_no_causalmask_no_attnbias_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_has_causalmask_no_attnbias_no_dropout_maxk_32.cpp
similarity index 83%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_no_causalmask_no_attnbias_maxk_32.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_has_causalmask_no_attnbias_no_dropout_maxk_32.cpp
index 80e7378e9..64d6034b4 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_no_causalmask_no_attnbias_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_has_causalmask_no_attnbias_no_dropout_maxk_32.cpp
@@ -8,8 +8,9 @@
 
 #include "ck_tiled_fmha_grouped_infer.h"
 
-template void run_grouped_infer_causalmask_bias_dispatch<
+template void run_grouped_infer_causalmask_bias_dropout_dispatch<
     ck::half_t,
+    true,
     false,
     false,
     32>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_no_causalmask_no_attnbias_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_has_causalmask_no_attnbias_no_dropout_maxk_64.cpp
similarity index 83%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_no_causalmask_no_attnbias_maxk_64.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_has_causalmask_no_attnbias_no_dropout_maxk_64.cpp
index 826b22356..fac8e1cfa 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_no_causalmask_no_attnbias_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_has_causalmask_no_attnbias_no_dropout_maxk_64.cpp
@@ -8,8 +8,9 @@
 
 #include "ck_tiled_fmha_grouped_infer.h"
 
-template void run_grouped_infer_causalmask_bias_dispatch<
+template void run_grouped_infer_causalmask_bias_dropout_dispatch<
     ck::half_t,
+    true,
     false,
     false,
     64>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_no_causalmask_has_attnbias_has_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_no_causalmask_has_attnbias_has_dropout_maxk_128.cpp
new file mode 100644
index 000000000..886537fad
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_no_causalmask_has_attnbias_has_dropout_maxk_128.cpp
@@ -0,0 +1,16 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_grouped_infer.h"
+
+template void run_grouped_infer_causalmask_bias_dropout_dispatch<
+    ck::half_t,
+    false,
+    true,
+    true,
+    128>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_no_causalmask_has_attnbias_has_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_no_causalmask_has_attnbias_has_dropout_maxk_256.cpp
new file mode 100644
index 000000000..3d72a5909
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_no_causalmask_has_attnbias_has_dropout_maxk_256.cpp
@@ -0,0 +1,16 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_grouped_infer.h"
+
+template void run_grouped_infer_causalmask_bias_dropout_dispatch<
+    ck::half_t,
+    false,
+    true,
+    true,
+    256>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_no_causalmask_has_attnbias_has_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_no_causalmask_has_attnbias_has_dropout_maxk_32.cpp
new file mode 100644
index 000000000..822dabadd
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_no_causalmask_has_attnbias_has_dropout_maxk_32.cpp
@@ -0,0 +1,16 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_grouped_infer.h"
+
+template void run_grouped_infer_causalmask_bias_dropout_dispatch<
+    ck::half_t,
+    false,
+    true,
+    true,
+    32>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_no_causalmask_has_attnbias_has_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_no_causalmask_has_attnbias_has_dropout_maxk_64.cpp
new file mode 100644
index 000000000..8ad64cd69
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_no_causalmask_has_attnbias_has_dropout_maxk_64.cpp
@@ -0,0 +1,16 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_grouped_infer.h"
+
+template void run_grouped_infer_causalmask_bias_dropout_dispatch<
+    ck::half_t,
+    false,
+    true,
+    true,
+    64>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_no_causalmask_has_attnbias_no_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_no_causalmask_has_attnbias_no_dropout_maxk_128.cpp
new file mode 100644
index 000000000..1c9c324f6
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_no_causalmask_has_attnbias_no_dropout_maxk_128.cpp
@@ -0,0 +1,16 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_grouped_infer.h"
+
+template void run_grouped_infer_causalmask_bias_dropout_dispatch<
+    ck::half_t,
+    false,
+    true,
+    false,
+    128>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_no_causalmask_has_attnbias_no_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_no_causalmask_has_attnbias_no_dropout_maxk_256.cpp
new file mode 100644
index 000000000..e08afd8c0
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_no_causalmask_has_attnbias_no_dropout_maxk_256.cpp
@@ -0,0 +1,16 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_grouped_infer.h"
+
+template void run_grouped_infer_causalmask_bias_dropout_dispatch<
+    ck::half_t,
+    false,
+    true,
+    false,
+    256>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_no_causalmask_has_attnbias_no_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_no_causalmask_has_attnbias_no_dropout_maxk_32.cpp
new file mode 100644
index 000000000..3289a3109
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_no_causalmask_has_attnbias_no_dropout_maxk_32.cpp
@@ -0,0 +1,16 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_grouped_infer.h"
+
+template void run_grouped_infer_causalmask_bias_dropout_dispatch<
+    ck::half_t,
+    false,
+    true,
+    false,
+    32>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_no_causalmask_has_attnbias_no_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_no_causalmask_has_attnbias_no_dropout_maxk_64.cpp
new file mode 100644
index 000000000..1c6cd7d3e
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_no_causalmask_has_attnbias_no_dropout_maxk_64.cpp
@@ -0,0 +1,16 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_grouped_infer.h"
+
+template void run_grouped_infer_causalmask_bias_dropout_dispatch<
+    ck::half_t,
+    false,
+    true,
+    false,
+    64>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_no_causalmask_no_attnbias_has_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_no_causalmask_no_attnbias_has_dropout_maxk_128.cpp
new file mode 100644
index 000000000..fbf764fc5
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_no_causalmask_no_attnbias_has_dropout_maxk_128.cpp
@@ -0,0 +1,16 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_grouped_infer.h"
+
+template void run_grouped_infer_causalmask_bias_dropout_dispatch<
+    ck::half_t,
+    false,
+    false,
+    true,
+    128>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_no_causalmask_no_attnbias_has_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_no_causalmask_no_attnbias_has_dropout_maxk_256.cpp
new file mode 100644
index 000000000..5fed583d5
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_no_causalmask_no_attnbias_has_dropout_maxk_256.cpp
@@ -0,0 +1,16 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_grouped_infer.h"
+
+template void run_grouped_infer_causalmask_bias_dropout_dispatch<
+    ck::half_t,
+    false,
+    false,
+    true,
+    256>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_no_causalmask_no_attnbias_has_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_no_causalmask_no_attnbias_has_dropout_maxk_32.cpp
new file mode 100644
index 000000000..1825795eb
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_no_causalmask_no_attnbias_has_dropout_maxk_32.cpp
@@ -0,0 +1,16 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_grouped_infer.h"
+
+template void run_grouped_infer_causalmask_bias_dropout_dispatch<
+    ck::half_t,
+    false,
+    false,
+    true,
+    32>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_no_causalmask_no_attnbias_has_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_no_causalmask_no_attnbias_has_dropout_maxk_64.cpp
new file mode 100644
index 000000000..45b21a50c
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_no_causalmask_no_attnbias_has_dropout_maxk_64.cpp
@@ -0,0 +1,16 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_grouped_infer.h"
+
+template void run_grouped_infer_causalmask_bias_dropout_dispatch<
+    ck::half_t,
+    false,
+    false,
+    true,
+    64>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_no_causalmask_no_attnbias_no_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_no_causalmask_no_attnbias_no_dropout_maxk_128.cpp
new file mode 100644
index 000000000..e6a42bcc4
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_no_causalmask_no_attnbias_no_dropout_maxk_128.cpp
@@ -0,0 +1,16 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_grouped_infer.h"
+
+template void run_grouped_infer_causalmask_bias_dropout_dispatch<
+    ck::half_t,
+    false,
+    false,
+    false,
+    128>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_no_causalmask_no_attnbias_no_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_no_causalmask_no_attnbias_no_dropout_maxk_256.cpp
new file mode 100644
index 000000000..592ad3232
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_no_causalmask_no_attnbias_no_dropout_maxk_256.cpp
@@ -0,0 +1,16 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_grouped_infer.h"
+
+template void run_grouped_infer_causalmask_bias_dropout_dispatch<
+    ck::half_t,
+    false,
+    false,
+    false,
+    256>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_no_causalmask_no_attnbias_no_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_no_causalmask_no_attnbias_no_dropout_maxk_32.cpp
new file mode 100644
index 000000000..af45ae222
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_no_causalmask_no_attnbias_no_dropout_maxk_32.cpp
@@ -0,0 +1,16 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_grouped_infer.h"
+
+template void run_grouped_infer_causalmask_bias_dropout_dispatch<
+    ck::half_t,
+    false,
+    false,
+    false,
+    32>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_no_causalmask_no_attnbias_no_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_no_causalmask_no_attnbias_no_dropout_maxk_64.cpp
new file mode 100644
index 000000000..03b28b79d
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_no_causalmask_no_attnbias_no_dropout_maxk_64.cpp
@@ -0,0 +1,16 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_grouped_infer.h"
+
+template void run_grouped_infer_causalmask_bias_dropout_dispatch<
+    ck::half_t,
+    false,
+    false,
+    false,
+    64>(GroupedForwardParams& param, hipStream_t stream);

From bdd716c6ab5b373be23acea2c86f4603acda7b79 Mon Sep 17 00:00:00 2001
From: Qianfeng Zhang <Qianfeng.Zhang@amd.com>
Date: Tue, 23 Apr 2024 08:15:31 +0000
Subject: [PATCH 532/641] Re-organize cpp instances for calling fmha forward
 kernel

---
 .../hip_fmha/ck_tiled_fmha_batched_forward.h  | 18 ++---
 .../ck_tiled_fmha_batched_forward_bp16.cpp    | 77 ++++++++++++++-----
 .../ck_tiled_fmha_batched_forward_fp16.cpp    | 77 ++++++++++++++-----
 .../hip_fmha/ck_tiled_fmha_grouped_forward.h  | 22 +++---
 .../ck_tiled_fmha_grouped_forward_bp16.cpp    | 77 ++++++++++++++-----
 .../ck_tiled_fmha_grouped_forward_fp16.cpp    | 77 ++++++++++++++-----
 ...ask_has_attnbias_has_dropout_maxk_128.cpp} |  3 +-
 ...ask_has_attnbias_has_dropout_maxk_256.cpp} |  3 +-
 ...mask_has_attnbias_has_dropout_maxk_32.cpp} |  3 +-
 ...mask_has_attnbias_has_dropout_maxk_64.cpp} |  3 +-
 ...mask_has_attnbias_no_dropout_maxk_128.cpp} |  3 +-
 ...mask_has_attnbias_no_dropout_maxk_256.cpp} |  5 +-
 ...lmask_has_attnbias_no_dropout_maxk_32.cpp} |  5 +-
 ...lmask_has_attnbias_no_dropout_maxk_64.cpp} |  3 +-
 ...mask_no_attnbias_has_dropout_maxk_128.cpp} |  3 +-
 ...mask_no_attnbias_has_dropout_maxk_256.cpp} |  3 +-
 ...lmask_no_attnbias_has_dropout_maxk_32.cpp} |  3 +-
 ...lmask_no_attnbias_has_dropout_maxk_64.cpp} |  3 +-
 ...lmask_no_attnbias_no_dropout_maxk_128.cpp} |  3 +-
 ...lmask_no_attnbias_no_dropout_maxk_256.cpp} |  3 +-
 ...almask_no_attnbias_no_dropout_maxk_32.cpp} |  3 +-
 ...almask_no_attnbias_no_dropout_maxk_64.cpp} |  3 +-
 ...mask_has_attnbias_has_dropout_maxk_128.cpp | 16 ++++
 ...mask_has_attnbias_has_dropout_maxk_256.cpp | 16 ++++
 ...lmask_has_attnbias_has_dropout_maxk_32.cpp | 16 ++++
 ...lmask_has_attnbias_has_dropout_maxk_64.cpp | 16 ++++
 ...lmask_has_attnbias_no_dropout_maxk_128.cpp | 16 ++++
 ...lmask_has_attnbias_no_dropout_maxk_256.cpp | 16 ++++
 ...almask_has_attnbias_no_dropout_maxk_32.cpp | 16 ++++
 ...almask_has_attnbias_no_dropout_maxk_64.cpp | 16 ++++
 ...lmask_no_attnbias_has_dropout_maxk_128.cpp | 16 ++++
 ...lmask_no_attnbias_has_dropout_maxk_256.cpp | 16 ++++
 ...almask_no_attnbias_has_dropout_maxk_32.cpp | 16 ++++
 ...almask_no_attnbias_has_dropout_maxk_64.cpp | 16 ++++
 ...almask_no_attnbias_no_dropout_maxk_128.cpp | 16 ++++
 ...almask_no_attnbias_no_dropout_maxk_256.cpp | 16 ++++
 ...salmask_no_attnbias_no_dropout_maxk_32.cpp | 16 ++++
 ...salmask_no_attnbias_no_dropout_maxk_64.cpp | 16 ++++
 ...ask_has_attnbias_has_dropout_maxk_128.cpp} |  3 +-
 ...ask_has_attnbias_has_dropout_maxk_256.cpp} |  3 +-
 ...mask_has_attnbias_has_dropout_maxk_32.cpp} |  3 +-
 ...mask_has_attnbias_has_dropout_maxk_64.cpp} |  3 +-
 ...mask_has_attnbias_no_dropout_maxk_128.cpp} |  3 +-
 ...mask_has_attnbias_no_dropout_maxk_256.cpp} |  5 +-
 ...lmask_has_attnbias_no_dropout_maxk_32.cpp} |  5 +-
 ...lmask_has_attnbias_no_dropout_maxk_64.cpp} |  5 +-
 ...mask_no_attnbias_has_dropout_maxk_128.cpp} |  3 +-
 ...mask_no_attnbias_has_dropout_maxk_256.cpp} |  3 +-
 ...lmask_no_attnbias_has_dropout_maxk_32.cpp} |  3 +-
 ...lmask_no_attnbias_has_dropout_maxk_64.cpp} |  3 +-
 ...lmask_no_attnbias_no_dropout_maxk_128.cpp} |  3 +-
 ...lmask_no_attnbias_no_dropout_maxk_256.cpp} |  3 +-
 ...almask_no_attnbias_no_dropout_maxk_32.cpp} |  3 +-
 ...almask_no_attnbias_no_dropout_maxk_64.cpp} |  3 +-
 ...mask_has_attnbias_has_dropout_maxk_128.cpp | 16 ++++
 ...mask_has_attnbias_has_dropout_maxk_256.cpp | 16 ++++
 ...lmask_has_attnbias_has_dropout_maxk_32.cpp | 16 ++++
 ...lmask_has_attnbias_has_dropout_maxk_64.cpp | 16 ++++
 ...lmask_has_attnbias_no_dropout_maxk_128.cpp | 16 ++++
 ...lmask_has_attnbias_no_dropout_maxk_256.cpp | 16 ++++
 ...almask_has_attnbias_no_dropout_maxk_32.cpp | 16 ++++
 ...almask_has_attnbias_no_dropout_maxk_64.cpp | 16 ++++
 ...lmask_no_attnbias_has_dropout_maxk_128.cpp | 16 ++++
 ...lmask_no_attnbias_has_dropout_maxk_256.cpp | 16 ++++
 ...almask_no_attnbias_has_dropout_maxk_32.cpp | 16 ++++
 ...almask_no_attnbias_has_dropout_maxk_64.cpp | 16 ++++
 ...almask_no_attnbias_no_dropout_maxk_128.cpp | 16 ++++
 ...almask_no_attnbias_no_dropout_maxk_256.cpp | 16 ++++
 ...salmask_no_attnbias_no_dropout_maxk_32.cpp | 16 ++++
 ...salmask_no_attnbias_no_dropout_maxk_64.cpp | 16 ++++
 ...ask_has_attnbias_has_dropout_maxk_128.cpp} |  3 +-
 ...ask_has_attnbias_has_dropout_maxk_256.cpp} |  3 +-
 ...mask_has_attnbias_has_dropout_maxk_32.cpp} |  3 +-
 ...mask_has_attnbias_has_dropout_maxk_64.cpp} |  3 +-
 ...mask_has_attnbias_no_dropout_maxk_128.cpp} |  3 +-
 ...mask_has_attnbias_no_dropout_maxk_256.cpp} |  5 +-
 ...lmask_has_attnbias_no_dropout_maxk_32.cpp} |  5 +-
 ...lmask_has_attnbias_no_dropout_maxk_64.cpp} |  3 +-
 ...mask_no_attnbias_has_dropout_maxk_128.cpp} |  3 +-
 ...mask_no_attnbias_has_dropout_maxk_256.cpp} |  3 +-
 ...lmask_no_attnbias_has_dropout_maxk_32.cpp} |  3 +-
 ...lmask_no_attnbias_has_dropout_maxk_64.cpp} |  3 +-
 ...lmask_no_attnbias_no_dropout_maxk_128.cpp} |  3 +-
 ...lmask_no_attnbias_no_dropout_maxk_256.cpp} |  3 +-
 ...almask_no_attnbias_no_dropout_maxk_32.cpp} |  3 +-
 ...almask_no_attnbias_no_dropout_maxk_64.cpp} |  3 +-
 ...mask_has_attnbias_has_dropout_maxk_128.cpp | 16 ++++
 ...mask_has_attnbias_has_dropout_maxk_256.cpp | 16 ++++
 ...lmask_has_attnbias_has_dropout_maxk_32.cpp | 16 ++++
 ...lmask_has_attnbias_has_dropout_maxk_64.cpp | 16 ++++
 ...lmask_has_attnbias_no_dropout_maxk_128.cpp | 16 ++++
 ...lmask_has_attnbias_no_dropout_maxk_256.cpp | 16 ++++
 ...almask_has_attnbias_no_dropout_maxk_32.cpp | 16 ++++
 ...almask_has_attnbias_no_dropout_maxk_64.cpp | 16 ++++
 ...lmask_no_attnbias_has_dropout_maxk_128.cpp | 16 ++++
 ...lmask_no_attnbias_has_dropout_maxk_256.cpp | 16 ++++
 ...almask_no_attnbias_has_dropout_maxk_32.cpp | 16 ++++
 ...almask_no_attnbias_has_dropout_maxk_64.cpp | 16 ++++
 ...almask_no_attnbias_no_dropout_maxk_128.cpp | 16 ++++
 ...almask_no_attnbias_no_dropout_maxk_256.cpp | 16 ++++
 ...salmask_no_attnbias_no_dropout_maxk_32.cpp | 16 ++++
 ...salmask_no_attnbias_no_dropout_maxk_64.cpp | 16 ++++
 ...ask_has_attnbias_has_dropout_maxk_128.cpp} |  3 +-
 ...ask_has_attnbias_has_dropout_maxk_256.cpp} |  3 +-
 ...mask_has_attnbias_has_dropout_maxk_32.cpp} |  3 +-
 ...mask_has_attnbias_has_dropout_maxk_64.cpp} |  3 +-
 ...mask_has_attnbias_no_dropout_maxk_128.cpp} |  3 +-
 ...mask_has_attnbias_no_dropout_maxk_256.cpp} |  5 +-
 ...lmask_has_attnbias_no_dropout_maxk_32.cpp} |  5 +-
 ...lmask_has_attnbias_no_dropout_maxk_64.cpp} |  5 +-
 ...mask_no_attnbias_has_dropout_maxk_128.cpp} |  3 +-
 ...mask_no_attnbias_has_dropout_maxk_256.cpp} |  3 +-
 ...lmask_no_attnbias_has_dropout_maxk_32.cpp} |  3 +-
 ...lmask_no_attnbias_has_dropout_maxk_64.cpp} |  3 +-
 ...lmask_no_attnbias_no_dropout_maxk_128.cpp} |  3 +-
 ...lmask_no_attnbias_no_dropout_maxk_256.cpp} |  3 +-
 ...almask_no_attnbias_no_dropout_maxk_32.cpp} |  3 +-
 ...almask_no_attnbias_no_dropout_maxk_64.cpp} |  3 +-
 ...mask_has_attnbias_has_dropout_maxk_128.cpp | 16 ++++
 ...mask_has_attnbias_has_dropout_maxk_256.cpp | 16 ++++
 ...lmask_has_attnbias_has_dropout_maxk_32.cpp | 16 ++++
 ...lmask_has_attnbias_has_dropout_maxk_64.cpp | 16 ++++
 ...lmask_has_attnbias_no_dropout_maxk_128.cpp | 16 ++++
 ...lmask_has_attnbias_no_dropout_maxk_256.cpp | 16 ++++
 ...almask_has_attnbias_no_dropout_maxk_32.cpp | 16 ++++
 ...almask_has_attnbias_no_dropout_maxk_64.cpp | 16 ++++
 ...lmask_no_attnbias_has_dropout_maxk_128.cpp | 16 ++++
 ...lmask_no_attnbias_has_dropout_maxk_256.cpp | 16 ++++
 ...almask_no_attnbias_has_dropout_maxk_32.cpp | 16 ++++
 ...almask_no_attnbias_has_dropout_maxk_64.cpp | 16 ++++
 ...almask_no_attnbias_no_dropout_maxk_128.cpp | 16 ++++
 ...almask_no_attnbias_no_dropout_maxk_256.cpp | 16 ++++
 ...salmask_no_attnbias_no_dropout_maxk_32.cpp | 16 ++++
 ...salmask_no_attnbias_no_dropout_maxk_64.cpp | 16 ++++
 134 files changed, 1412 insertions(+), 172 deletions(-)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_batched_forward_bp16_has_causalmask_has_attnbias_maxk_128.cpp => fmha_batched_forward_bp16_has_causalmask_has_attnbias_has_dropout_maxk_128.cpp} (82%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_batched_forward_bp16_has_causalmask_has_attnbias_maxk_256.cpp => fmha_batched_forward_bp16_has_causalmask_has_attnbias_has_dropout_maxk_256.cpp} (82%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_batched_forward_bp16_has_causalmask_has_attnbias_maxk_32.cpp => fmha_batched_forward_bp16_has_causalmask_has_attnbias_has_dropout_maxk_32.cpp} (82%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_batched_forward_bp16_has_causalmask_has_attnbias_maxk_64.cpp => fmha_batched_forward_bp16_has_causalmask_has_attnbias_has_dropout_maxk_64.cpp} (82%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_batched_forward_bp16_has_causalmask_no_attnbias_maxk_128.cpp => fmha_batched_forward_bp16_has_causalmask_has_attnbias_no_dropout_maxk_128.cpp} (82%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_batched_forward_bp16_no_causalmask_has_attnbias_maxk_256.cpp => fmha_batched_forward_bp16_has_causalmask_has_attnbias_no_dropout_maxk_256.cpp} (82%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_batched_forward_bp16_no_causalmask_has_attnbias_maxk_32.cpp => fmha_batched_forward_bp16_has_causalmask_has_attnbias_no_dropout_maxk_32.cpp} (82%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_batched_forward_bp16_has_causalmask_no_attnbias_maxk_64.cpp => fmha_batched_forward_bp16_has_causalmask_has_attnbias_no_dropout_maxk_64.cpp} (82%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_batched_forward_bp16_no_causalmask_has_attnbias_maxk_128.cpp => fmha_batched_forward_bp16_has_causalmask_no_attnbias_has_dropout_maxk_128.cpp} (82%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_batched_forward_bp16_has_causalmask_no_attnbias_maxk_256.cpp => fmha_batched_forward_bp16_has_causalmask_no_attnbias_has_dropout_maxk_256.cpp} (82%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_batched_forward_bp16_has_causalmask_no_attnbias_maxk_32.cpp => fmha_batched_forward_bp16_has_causalmask_no_attnbias_has_dropout_maxk_32.cpp} (82%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_batched_forward_bp16_no_causalmask_has_attnbias_maxk_64.cpp => fmha_batched_forward_bp16_has_causalmask_no_attnbias_has_dropout_maxk_64.cpp} (82%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_batched_forward_bp16_no_causalmask_no_attnbias_maxk_128.cpp => fmha_batched_forward_bp16_has_causalmask_no_attnbias_no_dropout_maxk_128.cpp} (82%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_batched_forward_bp16_no_causalmask_no_attnbias_maxk_256.cpp => fmha_batched_forward_bp16_has_causalmask_no_attnbias_no_dropout_maxk_256.cpp} (82%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_batched_forward_bp16_no_causalmask_no_attnbias_maxk_32.cpp => fmha_batched_forward_bp16_has_causalmask_no_attnbias_no_dropout_maxk_32.cpp} (82%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_batched_forward_bp16_no_causalmask_no_attnbias_maxk_64.cpp => fmha_batched_forward_bp16_has_causalmask_no_attnbias_no_dropout_maxk_64.cpp} (82%)
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_no_causalmask_has_attnbias_has_dropout_maxk_128.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_no_causalmask_has_attnbias_has_dropout_maxk_256.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_no_causalmask_has_attnbias_has_dropout_maxk_32.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_no_causalmask_has_attnbias_has_dropout_maxk_64.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_no_causalmask_has_attnbias_no_dropout_maxk_128.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_no_causalmask_has_attnbias_no_dropout_maxk_256.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_no_causalmask_has_attnbias_no_dropout_maxk_32.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_no_causalmask_has_attnbias_no_dropout_maxk_64.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_no_causalmask_no_attnbias_has_dropout_maxk_128.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_no_causalmask_no_attnbias_has_dropout_maxk_256.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_no_causalmask_no_attnbias_has_dropout_maxk_32.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_no_causalmask_no_attnbias_has_dropout_maxk_64.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_no_causalmask_no_attnbias_no_dropout_maxk_128.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_no_causalmask_no_attnbias_no_dropout_maxk_256.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_no_causalmask_no_attnbias_no_dropout_maxk_32.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_no_causalmask_no_attnbias_no_dropout_maxk_64.cpp
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_batched_forward_fp16_has_causalmask_has_attnbias_maxk_128.cpp => fmha_batched_forward_fp16_has_causalmask_has_attnbias_has_dropout_maxk_128.cpp} (82%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_batched_forward_fp16_has_causalmask_has_attnbias_maxk_256.cpp => fmha_batched_forward_fp16_has_causalmask_has_attnbias_has_dropout_maxk_256.cpp} (82%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_batched_forward_fp16_has_causalmask_has_attnbias_maxk_32.cpp => fmha_batched_forward_fp16_has_causalmask_has_attnbias_has_dropout_maxk_32.cpp} (82%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_batched_forward_fp16_has_causalmask_has_attnbias_maxk_64.cpp => fmha_batched_forward_fp16_has_causalmask_has_attnbias_has_dropout_maxk_64.cpp} (82%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_batched_forward_fp16_has_causalmask_no_attnbias_maxk_128.cpp => fmha_batched_forward_fp16_has_causalmask_has_attnbias_no_dropout_maxk_128.cpp} (82%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_batched_forward_fp16_no_causalmask_has_attnbias_maxk_256.cpp => fmha_batched_forward_fp16_has_causalmask_has_attnbias_no_dropout_maxk_256.cpp} (82%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_batched_forward_fp16_no_causalmask_has_attnbias_maxk_32.cpp => fmha_batched_forward_fp16_has_causalmask_has_attnbias_no_dropout_maxk_32.cpp} (82%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_batched_forward_fp16_no_causalmask_has_attnbias_maxk_64.cpp => fmha_batched_forward_fp16_has_causalmask_has_attnbias_no_dropout_maxk_64.cpp} (82%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_batched_forward_fp16_no_causalmask_has_attnbias_maxk_128.cpp => fmha_batched_forward_fp16_has_causalmask_no_attnbias_has_dropout_maxk_128.cpp} (82%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_batched_forward_fp16_has_causalmask_no_attnbias_maxk_256.cpp => fmha_batched_forward_fp16_has_causalmask_no_attnbias_has_dropout_maxk_256.cpp} (82%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_batched_forward_fp16_has_causalmask_no_attnbias_maxk_32.cpp => fmha_batched_forward_fp16_has_causalmask_no_attnbias_has_dropout_maxk_32.cpp} (82%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_batched_forward_fp16_has_causalmask_no_attnbias_maxk_64.cpp => fmha_batched_forward_fp16_has_causalmask_no_attnbias_has_dropout_maxk_64.cpp} (82%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_batched_forward_fp16_no_causalmask_no_attnbias_maxk_128.cpp => fmha_batched_forward_fp16_has_causalmask_no_attnbias_no_dropout_maxk_128.cpp} (82%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_batched_forward_fp16_no_causalmask_no_attnbias_maxk_256.cpp => fmha_batched_forward_fp16_has_causalmask_no_attnbias_no_dropout_maxk_256.cpp} (82%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_batched_forward_fp16_no_causalmask_no_attnbias_maxk_32.cpp => fmha_batched_forward_fp16_has_causalmask_no_attnbias_no_dropout_maxk_32.cpp} (82%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_batched_forward_fp16_no_causalmask_no_attnbias_maxk_64.cpp => fmha_batched_forward_fp16_has_causalmask_no_attnbias_no_dropout_maxk_64.cpp} (82%)
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_no_causalmask_has_attnbias_has_dropout_maxk_128.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_no_causalmask_has_attnbias_has_dropout_maxk_256.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_no_causalmask_has_attnbias_has_dropout_maxk_32.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_no_causalmask_has_attnbias_has_dropout_maxk_64.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_no_causalmask_has_attnbias_no_dropout_maxk_128.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_no_causalmask_has_attnbias_no_dropout_maxk_256.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_no_causalmask_has_attnbias_no_dropout_maxk_32.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_no_causalmask_has_attnbias_no_dropout_maxk_64.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_no_causalmask_no_attnbias_has_dropout_maxk_128.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_no_causalmask_no_attnbias_has_dropout_maxk_256.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_no_causalmask_no_attnbias_has_dropout_maxk_32.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_no_causalmask_no_attnbias_has_dropout_maxk_64.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_no_causalmask_no_attnbias_no_dropout_maxk_128.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_no_causalmask_no_attnbias_no_dropout_maxk_256.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_no_causalmask_no_attnbias_no_dropout_maxk_32.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_no_causalmask_no_attnbias_no_dropout_maxk_64.cpp
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_grouped_forward_bp16_has_causalmask_has_attnbias_maxk_128.cpp => fmha_grouped_forward_bp16_has_causalmask_has_attnbias_has_dropout_maxk_128.cpp} (82%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_grouped_forward_bp16_has_causalmask_has_attnbias_maxk_256.cpp => fmha_grouped_forward_bp16_has_causalmask_has_attnbias_has_dropout_maxk_256.cpp} (82%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_grouped_forward_bp16_has_causalmask_has_attnbias_maxk_32.cpp => fmha_grouped_forward_bp16_has_causalmask_has_attnbias_has_dropout_maxk_32.cpp} (82%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_grouped_forward_bp16_has_causalmask_has_attnbias_maxk_64.cpp => fmha_grouped_forward_bp16_has_causalmask_has_attnbias_has_dropout_maxk_64.cpp} (82%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_grouped_forward_bp16_has_causalmask_no_attnbias_maxk_128.cpp => fmha_grouped_forward_bp16_has_causalmask_has_attnbias_no_dropout_maxk_128.cpp} (82%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_grouped_forward_bp16_no_causalmask_has_attnbias_maxk_256.cpp => fmha_grouped_forward_bp16_has_causalmask_has_attnbias_no_dropout_maxk_256.cpp} (82%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_grouped_forward_bp16_no_causalmask_has_attnbias_maxk_32.cpp => fmha_grouped_forward_bp16_has_causalmask_has_attnbias_no_dropout_maxk_32.cpp} (82%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_grouped_forward_bp16_has_causalmask_no_attnbias_maxk_64.cpp => fmha_grouped_forward_bp16_has_causalmask_has_attnbias_no_dropout_maxk_64.cpp} (82%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_grouped_forward_bp16_no_causalmask_has_attnbias_maxk_128.cpp => fmha_grouped_forward_bp16_has_causalmask_no_attnbias_has_dropout_maxk_128.cpp} (82%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_grouped_forward_bp16_has_causalmask_no_attnbias_maxk_256.cpp => fmha_grouped_forward_bp16_has_causalmask_no_attnbias_has_dropout_maxk_256.cpp} (82%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_grouped_forward_bp16_has_causalmask_no_attnbias_maxk_32.cpp => fmha_grouped_forward_bp16_has_causalmask_no_attnbias_has_dropout_maxk_32.cpp} (82%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_grouped_forward_bp16_no_causalmask_has_attnbias_maxk_64.cpp => fmha_grouped_forward_bp16_has_causalmask_no_attnbias_has_dropout_maxk_64.cpp} (82%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_grouped_forward_bp16_no_causalmask_no_attnbias_maxk_128.cpp => fmha_grouped_forward_bp16_has_causalmask_no_attnbias_no_dropout_maxk_128.cpp} (82%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_grouped_forward_bp16_no_causalmask_no_attnbias_maxk_256.cpp => fmha_grouped_forward_bp16_has_causalmask_no_attnbias_no_dropout_maxk_256.cpp} (82%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_grouped_forward_bp16_no_causalmask_no_attnbias_maxk_32.cpp => fmha_grouped_forward_bp16_has_causalmask_no_attnbias_no_dropout_maxk_32.cpp} (82%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_grouped_forward_bp16_no_causalmask_no_attnbias_maxk_64.cpp => fmha_grouped_forward_bp16_has_causalmask_no_attnbias_no_dropout_maxk_64.cpp} (82%)
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_no_causalmask_has_attnbias_has_dropout_maxk_128.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_no_causalmask_has_attnbias_has_dropout_maxk_256.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_no_causalmask_has_attnbias_has_dropout_maxk_32.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_no_causalmask_has_attnbias_has_dropout_maxk_64.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_no_causalmask_has_attnbias_no_dropout_maxk_128.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_no_causalmask_has_attnbias_no_dropout_maxk_256.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_no_causalmask_has_attnbias_no_dropout_maxk_32.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_no_causalmask_has_attnbias_no_dropout_maxk_64.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_no_causalmask_no_attnbias_has_dropout_maxk_128.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_no_causalmask_no_attnbias_has_dropout_maxk_256.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_no_causalmask_no_attnbias_has_dropout_maxk_32.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_no_causalmask_no_attnbias_has_dropout_maxk_64.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_no_causalmask_no_attnbias_no_dropout_maxk_128.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_no_causalmask_no_attnbias_no_dropout_maxk_256.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_no_causalmask_no_attnbias_no_dropout_maxk_32.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_no_causalmask_no_attnbias_no_dropout_maxk_64.cpp
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_grouped_forward_fp16_has_causalmask_has_attnbias_maxk_128.cpp => fmha_grouped_forward_fp16_has_causalmask_has_attnbias_has_dropout_maxk_128.cpp} (82%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_grouped_forward_fp16_has_causalmask_has_attnbias_maxk_256.cpp => fmha_grouped_forward_fp16_has_causalmask_has_attnbias_has_dropout_maxk_256.cpp} (82%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_grouped_forward_fp16_has_causalmask_has_attnbias_maxk_32.cpp => fmha_grouped_forward_fp16_has_causalmask_has_attnbias_has_dropout_maxk_32.cpp} (82%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_grouped_forward_fp16_has_causalmask_has_attnbias_maxk_64.cpp => fmha_grouped_forward_fp16_has_causalmask_has_attnbias_has_dropout_maxk_64.cpp} (82%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_grouped_forward_fp16_has_causalmask_no_attnbias_maxk_128.cpp => fmha_grouped_forward_fp16_has_causalmask_has_attnbias_no_dropout_maxk_128.cpp} (82%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_grouped_forward_fp16_no_causalmask_has_attnbias_maxk_256.cpp => fmha_grouped_forward_fp16_has_causalmask_has_attnbias_no_dropout_maxk_256.cpp} (82%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_grouped_forward_fp16_no_causalmask_has_attnbias_maxk_32.cpp => fmha_grouped_forward_fp16_has_causalmask_has_attnbias_no_dropout_maxk_32.cpp} (82%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_grouped_forward_fp16_no_causalmask_has_attnbias_maxk_64.cpp => fmha_grouped_forward_fp16_has_causalmask_has_attnbias_no_dropout_maxk_64.cpp} (82%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_grouped_forward_fp16_no_causalmask_has_attnbias_maxk_128.cpp => fmha_grouped_forward_fp16_has_causalmask_no_attnbias_has_dropout_maxk_128.cpp} (82%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_grouped_forward_fp16_has_causalmask_no_attnbias_maxk_256.cpp => fmha_grouped_forward_fp16_has_causalmask_no_attnbias_has_dropout_maxk_256.cpp} (82%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_grouped_forward_fp16_has_causalmask_no_attnbias_maxk_32.cpp => fmha_grouped_forward_fp16_has_causalmask_no_attnbias_has_dropout_maxk_32.cpp} (82%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_grouped_forward_fp16_has_causalmask_no_attnbias_maxk_64.cpp => fmha_grouped_forward_fp16_has_causalmask_no_attnbias_has_dropout_maxk_64.cpp} (82%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_grouped_forward_fp16_no_causalmask_no_attnbias_maxk_128.cpp => fmha_grouped_forward_fp16_has_causalmask_no_attnbias_no_dropout_maxk_128.cpp} (82%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_grouped_forward_fp16_no_causalmask_no_attnbias_maxk_256.cpp => fmha_grouped_forward_fp16_has_causalmask_no_attnbias_no_dropout_maxk_256.cpp} (82%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_grouped_forward_fp16_no_causalmask_no_attnbias_maxk_32.cpp => fmha_grouped_forward_fp16_has_causalmask_no_attnbias_no_dropout_maxk_32.cpp} (82%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_grouped_forward_fp16_no_causalmask_no_attnbias_maxk_64.cpp => fmha_grouped_forward_fp16_has_causalmask_no_attnbias_no_dropout_maxk_64.cpp} (82%)
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_no_causalmask_has_attnbias_has_dropout_maxk_128.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_no_causalmask_has_attnbias_has_dropout_maxk_256.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_no_causalmask_has_attnbias_has_dropout_maxk_32.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_no_causalmask_has_attnbias_has_dropout_maxk_64.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_no_causalmask_has_attnbias_no_dropout_maxk_128.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_no_causalmask_has_attnbias_no_dropout_maxk_256.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_no_causalmask_has_attnbias_no_dropout_maxk_32.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_no_causalmask_has_attnbias_no_dropout_maxk_64.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_no_causalmask_no_attnbias_has_dropout_maxk_128.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_no_causalmask_no_attnbias_has_dropout_maxk_256.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_no_causalmask_no_attnbias_has_dropout_maxk_32.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_no_causalmask_no_attnbias_has_dropout_maxk_64.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_no_causalmask_no_attnbias_no_dropout_maxk_128.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_no_causalmask_no_attnbias_no_dropout_maxk_256.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_no_causalmask_no_attnbias_no_dropout_maxk_32.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_no_causalmask_no_attnbias_no_dropout_maxk_64.cpp

diff --git a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_forward.h b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_forward.h
index de7631449..a0151b979 100644
--- a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_forward.h
+++ b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_forward.h
@@ -24,16 +24,17 @@
 #include "ck_tiled_fmha_fwd_setting.h"
 #include "ck_tiled_fmha_params.h"
 
-#include "fmha_fwd_kernel.hpp"
 #include "fmha_fwd_epilogue.hpp"
+#include "fmha_fwd_kernel.hpp"
 #include "fmha_fwd_tile_partitioner.hpp"
 
 template <
     typename ScalarType,
     bool kHasCausalMask,
     bool kHasBias,
+    bool kHasDropout,
     ck::index_t MaxK>
-struct batched_forward_causalmask_bias_dispatch {
+struct batched_forward_causalmask_bias_dropout_dispatch {
   template <typename FmhaTraits, typename FmhaMask>
   using FmhaPipelineProblemTemp =
       ck::tile_program::block::BlockFmhaPipelineProblem<
@@ -58,7 +59,6 @@ struct batched_forward_causalmask_bias_dispatch {
 
     BOOL_SWITCH(has_local_attention, USE_LOCAL_ATTENTION, [&] {
       constexpr bool has_masking = kHasCausalMask || USE_LOCAL_ATTENTION;
-      const bool has_dropout = (param.dropout_prob > 0.0f);
 
       using FmhaMask =
           ck::tile_program::block::SimplifiedGenericAttentionMask<has_masking>;
@@ -82,9 +82,7 @@ struct batched_forward_causalmask_bias_dispatch {
           ((param.K % 8 == 0) && (param.Kv % 8 == 0) && (MaxK <= 128));
 
       /*      if (!use_async_pipeline) { */
-      BOOL_SWITCH_4(
-          has_dropout,
-          kHasDropout,
+      BOOL_SWITCH_3(
           pad_seqlen_q,
           kPadSeqLenQ,
           pad_seqlen_k,
@@ -125,7 +123,7 @@ struct batched_forward_causalmask_bias_dispatch {
           });
       /*
             } else {
-              BOOL_SWITCH_2(has_dropout, kHasDropout, pad_seqlen_k, kPadSeqLenK,
+              BOOL_SWITCH(pad_seqlen_k, kPadSeqLenK,
          [&] { using FmhaFwdTraits_ = ck::tile_program::TileFmhaTraits< true, //
          kPadSeqLenQ, kPadSeqLenK, true, // kPadHeadDimQ true, // kPadHeadDimV
                     kHasBias,
@@ -229,13 +227,15 @@ template <
     typename ScalarType,
     bool kHasCausalMask,
     bool kHasBias,
+    bool kHasDropout,
     ck::index_t MaxK>
-void run_batched_forward_causalmask_bias_dispatch(
+void run_batched_forward_causalmask_bias_dropout_dispatch(
     BatchedForwardParams& param,
     hipStream_t stream) {
-  batched_forward_causalmask_bias_dispatch<
+  batched_forward_causalmask_bias_dropout_dispatch<
       ScalarType,
       kHasCausalMask,
       kHasBias,
+      kHasDropout,
       MaxK>::Run(param, stream);
 };
diff --git a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_forward_bp16.cpp b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_forward_bp16.cpp
index 6dad19459..80ba53eb4 100644
--- a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_forward_bp16.cpp
+++ b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_forward_bp16.cpp
@@ -13,57 +13,96 @@
 #include "ck_tiled_headdim_switch.h"
 
 // clang-format off
-extern template void run_batched_forward_causalmask_bias_dispatch<ck::bhalf_t, false, true, 32>(
+extern template void run_batched_forward_causalmask_bias_dropout_dispatch<ck::bhalf_t, false, true, true, 32>(
     BatchedForwardParams& param, hipStream_t stream);
-extern template void run_batched_forward_causalmask_bias_dispatch<ck::bhalf_t, false, false, 32>(
+extern template void run_batched_forward_causalmask_bias_dropout_dispatch<ck::bhalf_t, false, false, true, 32>(
     BatchedForwardParams& param, hipStream_t stream);
-extern template void run_batched_forward_causalmask_bias_dispatch<ck::bhalf_t, true, true, 32>(
+extern template void run_batched_forward_causalmask_bias_dropout_dispatch<ck::bhalf_t, true, true, true, 32>(
     BatchedForwardParams& param, hipStream_t stream);
-extern template void run_batched_forward_causalmask_bias_dispatch<ck::bhalf_t, true, false, 32>(
+extern template void run_batched_forward_causalmask_bias_dropout_dispatch<ck::bhalf_t, true, false, true, 32>(
     BatchedForwardParams& param, hipStream_t stream);
 
-extern template void run_batched_forward_causalmask_bias_dispatch<ck::bhalf_t, false, true, 64>(
+extern template void run_batched_forward_causalmask_bias_dropout_dispatch<ck::bhalf_t, false, true, false, 32>(
     BatchedForwardParams& param, hipStream_t stream);
-extern template void run_batched_forward_causalmask_bias_dispatch<ck::bhalf_t, false, false, 64>(
+extern template void run_batched_forward_causalmask_bias_dropout_dispatch<ck::bhalf_t, false, false, false, 32>(
     BatchedForwardParams& param, hipStream_t stream);
-extern template void run_batched_forward_causalmask_bias_dispatch<ck::bhalf_t, true, true, 64>(
+extern template void run_batched_forward_causalmask_bias_dropout_dispatch<ck::bhalf_t, true, true, false, 32>(
     BatchedForwardParams& param, hipStream_t stream);
-extern template void run_batched_forward_causalmask_bias_dispatch<ck::bhalf_t, true, false, 64>(
+extern template void run_batched_forward_causalmask_bias_dropout_dispatch<ck::bhalf_t, true, false, false, 32>(
     BatchedForwardParams& param, hipStream_t stream);
 
-extern template void run_batched_forward_causalmask_bias_dispatch<ck::bhalf_t, false, true, 128>(
+extern template void run_batched_forward_causalmask_bias_dropout_dispatch<ck::bhalf_t, false, true, true, 64>(
     BatchedForwardParams& param, hipStream_t stream);
-extern template void run_batched_forward_causalmask_bias_dispatch<ck::bhalf_t, false, false, 128>(
+extern template void run_batched_forward_causalmask_bias_dropout_dispatch<ck::bhalf_t, false, false, true, 64>(
     BatchedForwardParams& param, hipStream_t stream);
-extern template void run_batched_forward_causalmask_bias_dispatch<ck::bhalf_t, true, true, 128>(
+extern template void run_batched_forward_causalmask_bias_dropout_dispatch<ck::bhalf_t, true, true, true, 64>(
     BatchedForwardParams& param, hipStream_t stream);
-extern template void run_batched_forward_causalmask_bias_dispatch<ck::bhalf_t, true, false, 128>(
+extern template void run_batched_forward_causalmask_bias_dropout_dispatch<ck::bhalf_t, true, false, true, 64>(
     BatchedForwardParams& param, hipStream_t stream);
 
-extern template void run_batched_forward_causalmask_bias_dispatch<ck::bhalf_t, false, true, 256>(
+extern template void run_batched_forward_causalmask_bias_dropout_dispatch<ck::bhalf_t, false, true, false, 64>(
     BatchedForwardParams& param, hipStream_t stream);
-extern template void run_batched_forward_causalmask_bias_dispatch<ck::bhalf_t, false, false, 256>(
+extern template void run_batched_forward_causalmask_bias_dropout_dispatch<ck::bhalf_t, false, false, false, 64>(
     BatchedForwardParams& param, hipStream_t stream);
-extern template void run_batched_forward_causalmask_bias_dispatch<ck::bhalf_t, true, true, 256>(
+extern template void run_batched_forward_causalmask_bias_dropout_dispatch<ck::bhalf_t, true, true, false, 64>(
     BatchedForwardParams& param, hipStream_t stream);
-extern template void run_batched_forward_causalmask_bias_dispatch<ck::bhalf_t, true, false, 256>(
+extern template void run_batched_forward_causalmask_bias_dropout_dispatch<ck::bhalf_t, true, false, false, 64>(
+    BatchedForwardParams& param, hipStream_t stream);
+
+extern template void run_batched_forward_causalmask_bias_dropout_dispatch<ck::bhalf_t, false, true, true, 128>(
+    BatchedForwardParams& param, hipStream_t stream);
+extern template void run_batched_forward_causalmask_bias_dropout_dispatch<ck::bhalf_t, false, false, true, 128>(
+    BatchedForwardParams& param, hipStream_t stream);
+extern template void run_batched_forward_causalmask_bias_dropout_dispatch<ck::bhalf_t, true, true, true, 128>(
+    BatchedForwardParams& param, hipStream_t stream);
+extern template void run_batched_forward_causalmask_bias_dropout_dispatch<ck::bhalf_t, true, false, true, 128>(
+    BatchedForwardParams& param, hipStream_t stream);
+
+extern template void run_batched_forward_causalmask_bias_dropout_dispatch<ck::bhalf_t, false, true, false, 128>(
+    BatchedForwardParams& param, hipStream_t stream);
+extern template void run_batched_forward_causalmask_bias_dropout_dispatch<ck::bhalf_t, false, false, false, 128>(
+    BatchedForwardParams& param, hipStream_t stream);
+extern template void run_batched_forward_causalmask_bias_dropout_dispatch<ck::bhalf_t, true, true, false, 128>(
+    BatchedForwardParams& param, hipStream_t stream);
+extern template void run_batched_forward_causalmask_bias_dropout_dispatch<ck::bhalf_t, true, false, false, 128>(
+    BatchedForwardParams& param, hipStream_t stream);
+
+extern template void run_batched_forward_causalmask_bias_dropout_dispatch<ck::bhalf_t, false, true, true, 256>(
+    BatchedForwardParams& param, hipStream_t stream);
+extern template void run_batched_forward_causalmask_bias_dropout_dispatch<ck::bhalf_t, false, false, true, 256>(
+    BatchedForwardParams& param, hipStream_t stream);
+extern template void run_batched_forward_causalmask_bias_dropout_dispatch<ck::bhalf_t, true, true, true, 256>(
+    BatchedForwardParams& param, hipStream_t stream);
+extern template void run_batched_forward_causalmask_bias_dropout_dispatch<ck::bhalf_t, true, false, true, 256>(
+    BatchedForwardParams& param, hipStream_t stream);
+
+extern template void run_batched_forward_causalmask_bias_dropout_dispatch<ck::bhalf_t, false, true, false, 256>(
+    BatchedForwardParams& param, hipStream_t stream);
+extern template void run_batched_forward_causalmask_bias_dropout_dispatch<ck::bhalf_t, false, false, false, 256>(
+    BatchedForwardParams& param, hipStream_t stream);
+extern template void run_batched_forward_causalmask_bias_dropout_dispatch<ck::bhalf_t, true, true, false, 256>(
+    BatchedForwardParams& param, hipStream_t stream);
+extern template void run_batched_forward_causalmask_bias_dropout_dispatch<ck::bhalf_t, true, false, false, 256>(
     BatchedForwardParams& param, hipStream_t stream);
 // clang-format on
 
 void batched_forward_bp16(BatchedForwardParams& param, hipStream_t stream) {
-  BOOL_SWITCH(param.has_attn_bias, kHasBias, [&] {
+  const bool has_dropout = (param.dropout_prob > 0.0f);
+  BOOL_SWITCH_2(param.has_attn_bias, kHasBias, has_dropout, kHasDropout, [&] {
     FMHA_FWD_HEADDIM_SWITCH(param.K, param.Kv, MaxK, [&] {
       if (param.custom_mask_type == 0)
-        run_batched_forward_causalmask_bias_dispatch<
+        run_batched_forward_causalmask_bias_dropout_dispatch<
             ck::bhalf_t,
             false,
             kHasBias,
+            kHasDropout,
             MaxK>(param, stream);
       else if (param.custom_mask_type == 1 || param.custom_mask_type == 2)
-        run_batched_forward_causalmask_bias_dispatch<
+        run_batched_forward_causalmask_bias_dropout_dispatch<
             ck::bhalf_t,
             true,
             kHasBias,
+            kHasDropout,
             MaxK>(param, stream);
       else
         throw std::runtime_error("Invalid custom_mask_type value");
diff --git a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_forward_fp16.cpp b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_forward_fp16.cpp
index 73cd2e7fe..450a70de2 100644
--- a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_forward_fp16.cpp
+++ b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_forward_fp16.cpp
@@ -13,57 +13,96 @@
 #include "ck_tiled_headdim_switch.h"
 
 // clang-format off
-extern template void run_batched_forward_causalmask_bias_dispatch<ck::half_t, false, true, 32>(
+extern template void run_batched_forward_causalmask_bias_dropout_dispatch<ck::half_t, false, true, true, 32>(
     BatchedForwardParams& param, hipStream_t stream);
-extern template void run_batched_forward_causalmask_bias_dispatch<ck::half_t, false, false, 32>(
+extern template void run_batched_forward_causalmask_bias_dropout_dispatch<ck::half_t, false, false, true, 32>(
     BatchedForwardParams& param, hipStream_t stream);
-extern template void run_batched_forward_causalmask_bias_dispatch<ck::half_t, true, true, 32>(
+extern template void run_batched_forward_causalmask_bias_dropout_dispatch<ck::half_t, true, true, true, 32>(
     BatchedForwardParams& param, hipStream_t stream);
-extern template void run_batched_forward_causalmask_bias_dispatch<ck::half_t, true, false, 32>(
+extern template void run_batched_forward_causalmask_bias_dropout_dispatch<ck::half_t, true, false, true, 32>(
     BatchedForwardParams& param, hipStream_t stream);
 
-extern template void run_batched_forward_causalmask_bias_dispatch<ck::half_t, false, true, 64>(
+extern template void run_batched_forward_causalmask_bias_dropout_dispatch<ck::half_t, false, true, false, 32>(
     BatchedForwardParams& param, hipStream_t stream);
-extern template void run_batched_forward_causalmask_bias_dispatch<ck::half_t, false, false, 64>(
+extern template void run_batched_forward_causalmask_bias_dropout_dispatch<ck::half_t, false, false, false, 32>(
     BatchedForwardParams& param, hipStream_t stream);
-extern template void run_batched_forward_causalmask_bias_dispatch<ck::half_t, true, true, 64>(
+extern template void run_batched_forward_causalmask_bias_dropout_dispatch<ck::half_t, true, true, false, 32>(
     BatchedForwardParams& param, hipStream_t stream);
-extern template void run_batched_forward_causalmask_bias_dispatch<ck::half_t, true, false, 64>(
+extern template void run_batched_forward_causalmask_bias_dropout_dispatch<ck::half_t, true, false, false, 32>(
     BatchedForwardParams& param, hipStream_t stream);
 
-extern template void run_batched_forward_causalmask_bias_dispatch<ck::half_t, false, true, 128>(
+extern template void run_batched_forward_causalmask_bias_dropout_dispatch<ck::half_t, false, true, true, 64>(
     BatchedForwardParams& param, hipStream_t stream);
-extern template void run_batched_forward_causalmask_bias_dispatch<ck::half_t, false, false, 128>(
+extern template void run_batched_forward_causalmask_bias_dropout_dispatch<ck::half_t, false, false, true, 64>(
     BatchedForwardParams& param, hipStream_t stream);
-extern template void run_batched_forward_causalmask_bias_dispatch<ck::half_t, true, true, 128>(
+extern template void run_batched_forward_causalmask_bias_dropout_dispatch<ck::half_t, true, true, true, 64>(
     BatchedForwardParams& param, hipStream_t stream);
-extern template void run_batched_forward_causalmask_bias_dispatch<ck::half_t, true, false, 128>(
+extern template void run_batched_forward_causalmask_bias_dropout_dispatch<ck::half_t, true, false, true, 64>(
     BatchedForwardParams& param, hipStream_t stream);
 
-extern template void run_batched_forward_causalmask_bias_dispatch<ck::half_t, false, true, 256>(
+extern template void run_batched_forward_causalmask_bias_dropout_dispatch<ck::half_t, false, true, false, 64>(
     BatchedForwardParams& param, hipStream_t stream);
-extern template void run_batched_forward_causalmask_bias_dispatch<ck::half_t, false, false, 256>(
+extern template void run_batched_forward_causalmask_bias_dropout_dispatch<ck::half_t, false, false, false, 64>(
     BatchedForwardParams& param, hipStream_t stream);
-extern template void run_batched_forward_causalmask_bias_dispatch<ck::half_t, true, true, 256>(
+extern template void run_batched_forward_causalmask_bias_dropout_dispatch<ck::half_t, true, true, false, 64>(
     BatchedForwardParams& param, hipStream_t stream);
-extern template void run_batched_forward_causalmask_bias_dispatch<ck::half_t, true, false, 256>(
+extern template void run_batched_forward_causalmask_bias_dropout_dispatch<ck::half_t, true, false, false, 64>(
+    BatchedForwardParams& param, hipStream_t stream);
+
+extern template void run_batched_forward_causalmask_bias_dropout_dispatch<ck::half_t, false, true, true, 128>(
+    BatchedForwardParams& param, hipStream_t stream);
+extern template void run_batched_forward_causalmask_bias_dropout_dispatch<ck::half_t, false, false, true, 128>(
+    BatchedForwardParams& param, hipStream_t stream);
+extern template void run_batched_forward_causalmask_bias_dropout_dispatch<ck::half_t, true, true, true, 128>(
+    BatchedForwardParams& param, hipStream_t stream);
+extern template void run_batched_forward_causalmask_bias_dropout_dispatch<ck::half_t, true, false, true, 128>(
+    BatchedForwardParams& param, hipStream_t stream);
+
+extern template void run_batched_forward_causalmask_bias_dropout_dispatch<ck::half_t, false, true, false, 128>(
+    BatchedForwardParams& param, hipStream_t stream);
+extern template void run_batched_forward_causalmask_bias_dropout_dispatch<ck::half_t, false, false, false, 128>(
+    BatchedForwardParams& param, hipStream_t stream);
+extern template void run_batched_forward_causalmask_bias_dropout_dispatch<ck::half_t, true, true, false, 128>(
+    BatchedForwardParams& param, hipStream_t stream);
+extern template void run_batched_forward_causalmask_bias_dropout_dispatch<ck::half_t, true, false, false, 128>(
+    BatchedForwardParams& param, hipStream_t stream);
+
+extern template void run_batched_forward_causalmask_bias_dropout_dispatch<ck::half_t, false, true, true, 256>(
+    BatchedForwardParams& param, hipStream_t stream);
+extern template void run_batched_forward_causalmask_bias_dropout_dispatch<ck::half_t, false, false, true, 256>(
+    BatchedForwardParams& param, hipStream_t stream);
+extern template void run_batched_forward_causalmask_bias_dropout_dispatch<ck::half_t, true, true, true, 256>(
+    BatchedForwardParams& param, hipStream_t stream);
+extern template void run_batched_forward_causalmask_bias_dropout_dispatch<ck::half_t, true, false, true, 256>(
+    BatchedForwardParams& param, hipStream_t stream);
+
+extern template void run_batched_forward_causalmask_bias_dropout_dispatch<ck::half_t, false, true, false, 256>(
+    BatchedForwardParams& param, hipStream_t stream);
+extern template void run_batched_forward_causalmask_bias_dropout_dispatch<ck::half_t, false, false, false, 256>(
+    BatchedForwardParams& param, hipStream_t stream);
+extern template void run_batched_forward_causalmask_bias_dropout_dispatch<ck::half_t, true, true, false, 256>(
+    BatchedForwardParams& param, hipStream_t stream);
+extern template void run_batched_forward_causalmask_bias_dropout_dispatch<ck::half_t, true, false, false, 256>(
     BatchedForwardParams& param, hipStream_t stream);
 // clang-format on
 
 void batched_forward_fp16(BatchedForwardParams& param, hipStream_t stream) {
-  BOOL_SWITCH(param.has_attn_bias, kHasBias, [&] {
+  const bool has_dropout = (param.dropout_prob > 0.0f);
+  BOOL_SWITCH_2(param.has_attn_bias, kHasBias, has_dropout, kHasDropout, [&] {
     FMHA_FWD_HEADDIM_SWITCH(param.K, param.Kv, MaxK, [&] {
       if (param.custom_mask_type == 0)
-        run_batched_forward_causalmask_bias_dispatch<
+        run_batched_forward_causalmask_bias_dropout_dispatch<
             ck::half_t,
             false,
             kHasBias,
+            kHasDropout,
             MaxK>(param, stream);
       else if (param.custom_mask_type == 1 || param.custom_mask_type == 2)
-        run_batched_forward_causalmask_bias_dispatch<
+        run_batched_forward_causalmask_bias_dropout_dispatch<
             ck::half_t,
             true,
             kHasBias,
+            kHasDropout,
             MaxK>(param, stream);
       else
         throw std::runtime_error("Invalid custom_mask_type value");
diff --git a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_forward.h b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_forward.h
index c50f50e7c..0b348bd0e 100644
--- a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_forward.h
+++ b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_forward.h
@@ -23,16 +23,17 @@
 #include "ck_tiled_fmha_fwd_setting.h"
 #include "ck_tiled_fmha_params.h"
 
-#include "fmha_fwd_kernel.hpp"
 #include "fmha_fwd_epilogue.hpp"
+#include "fmha_fwd_kernel.hpp"
 #include "fmha_fwd_tile_partitioner.hpp"
 
 template <
     typename ScalarType,
     bool kHasCausalMask,
     bool kHasBias,
+    bool kHasDropout,
     ck::index_t MaxK>
-struct grouped_forward_causalmask_bias_dispatch {
+struct grouped_forward_causalmask_bias_dropout_dispatch {
   template <typename FmhaTraits, typename FmhaMask>
   using FmhaPipelineProblemTemp =
       ck::tile_program::block::BlockFmhaPipelineProblem<
@@ -57,7 +58,6 @@ struct grouped_forward_causalmask_bias_dispatch {
 
     BOOL_SWITCH(has_local_attention, USE_LOCAL_ATTENTION, [&] {
       constexpr bool has_masking = kHasCausalMask || USE_LOCAL_ATTENTION;
-      const bool has_dropout = (param.dropout_prob > 0.0f);
 
       using FmhaMask =
           ck::tile_program::block::SimplifiedGenericAttentionMask<has_masking>;
@@ -74,14 +74,8 @@ struct grouped_forward_causalmask_bias_dispatch {
           !(param.K % FmhaFwdShape_::kK0BlockLength == 0);
       const bool pad_headdim_v = !(param.Kv % FmhaFwdShape_::kN1 == 0);
 
-      BOOL_SWITCH_3(
-          has_dropout,
-          kHasDropout,
-          pad_headdim_q,
-          kPadHeadDimQ,
-          pad_headdim_v,
-          kPadHeadDimV,
-          [&] {
+      BOOL_SWITCH_2(
+          pad_headdim_q, kPadHeadDimQ, pad_headdim_v, kPadHeadDimV, [&] {
             using FmhaFwdTraits_ = ck::tile_program::TileFmhaTraits<
                 kPadSeqLenQ,
                 kPadSeqLenK,
@@ -181,13 +175,15 @@ template <
     typename ScalarType,
     bool kHasCausalMask,
     bool kHasBias,
+    bool kHasDropout,
     ck::index_t MaxK>
-void run_grouped_forward_causalmask_bias_dispatch(
+void run_grouped_forward_causalmask_bias_dropout_dispatch(
     GroupedForwardParams& param,
     hipStream_t stream) {
-  grouped_forward_causalmask_bias_dispatch<
+  grouped_forward_causalmask_bias_dropout_dispatch<
       ScalarType,
       kHasCausalMask,
       kHasBias,
+      kHasDropout,
       MaxK>::Run(param, stream);
 };
diff --git a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_forward_bp16.cpp b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_forward_bp16.cpp
index 50e3bac62..f9d768c8c 100644
--- a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_forward_bp16.cpp
+++ b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_forward_bp16.cpp
@@ -13,57 +13,96 @@
 #include "ck_tiled_headdim_switch.h"
 
 // clang-format off
-extern template void run_grouped_forward_causalmask_bias_dispatch<ck::bhalf_t, false, true, 32>(
+extern template void run_grouped_forward_causalmask_bias_dropout_dispatch<ck::bhalf_t, false, true, true, 32>(
     GroupedForwardParams& param, hipStream_t stream);
-extern template void run_grouped_forward_causalmask_bias_dispatch<ck::bhalf_t, false, false, 32>(
+extern template void run_grouped_forward_causalmask_bias_dropout_dispatch<ck::bhalf_t, false, false, true, 32>(
     GroupedForwardParams& param, hipStream_t stream);
-extern template void run_grouped_forward_causalmask_bias_dispatch<ck::bhalf_t, true, true, 32>(
+extern template void run_grouped_forward_causalmask_bias_dropout_dispatch<ck::bhalf_t, true, true, true, 32>(
     GroupedForwardParams& param, hipStream_t stream);
-extern template void run_grouped_forward_causalmask_bias_dispatch<ck::bhalf_t, true, false, 32>(
+extern template void run_grouped_forward_causalmask_bias_dropout_dispatch<ck::bhalf_t, true, false, true, 32>(
     GroupedForwardParams& param, hipStream_t stream);
 
-extern template void run_grouped_forward_causalmask_bias_dispatch<ck::bhalf_t, false, true, 64>(
+extern template void run_grouped_forward_causalmask_bias_dropout_dispatch<ck::bhalf_t, false, true, false, 32>(
     GroupedForwardParams& param, hipStream_t stream);
-extern template void run_grouped_forward_causalmask_bias_dispatch<ck::bhalf_t, false, false, 64>(
+extern template void run_grouped_forward_causalmask_bias_dropout_dispatch<ck::bhalf_t, false, false, false, 32>(
     GroupedForwardParams& param, hipStream_t stream);
-extern template void run_grouped_forward_causalmask_bias_dispatch<ck::bhalf_t, true, true, 64>(
+extern template void run_grouped_forward_causalmask_bias_dropout_dispatch<ck::bhalf_t, true, true, false, 32>(
     GroupedForwardParams& param, hipStream_t stream);
-extern template void run_grouped_forward_causalmask_bias_dispatch<ck::bhalf_t, true, false, 64>(
+extern template void run_grouped_forward_causalmask_bias_dropout_dispatch<ck::bhalf_t, true, false, false, 32>(
     GroupedForwardParams& param, hipStream_t stream);
 
-extern template void run_grouped_forward_causalmask_bias_dispatch<ck::bhalf_t, false, true, 128>(
+extern template void run_grouped_forward_causalmask_bias_dropout_dispatch<ck::bhalf_t, false, true, true, 64>(
     GroupedForwardParams& param, hipStream_t stream);
-extern template void run_grouped_forward_causalmask_bias_dispatch<ck::bhalf_t, false, false, 128>(
+extern template void run_grouped_forward_causalmask_bias_dropout_dispatch<ck::bhalf_t, false, false, true, 64>(
     GroupedForwardParams& param, hipStream_t stream);
-extern template void run_grouped_forward_causalmask_bias_dispatch<ck::bhalf_t, true, true, 128>(
+extern template void run_grouped_forward_causalmask_bias_dropout_dispatch<ck::bhalf_t, true, true, true, 64>(
     GroupedForwardParams& param, hipStream_t stream);
-extern template void run_grouped_forward_causalmask_bias_dispatch<ck::bhalf_t, true, false, 128>(
+extern template void run_grouped_forward_causalmask_bias_dropout_dispatch<ck::bhalf_t, true, false, true, 64>(
     GroupedForwardParams& param, hipStream_t stream);
 
-extern template void run_grouped_forward_causalmask_bias_dispatch<ck::bhalf_t, false, true, 256>(
+extern template void run_grouped_forward_causalmask_bias_dropout_dispatch<ck::bhalf_t, false, true, false, 64>(
     GroupedForwardParams& param, hipStream_t stream);
-extern template void run_grouped_forward_causalmask_bias_dispatch<ck::bhalf_t, false, false, 256>(
+extern template void run_grouped_forward_causalmask_bias_dropout_dispatch<ck::bhalf_t, false, false, false, 64>(
     GroupedForwardParams& param, hipStream_t stream);
-extern template void run_grouped_forward_causalmask_bias_dispatch<ck::bhalf_t, true, true, 256>(
+extern template void run_grouped_forward_causalmask_bias_dropout_dispatch<ck::bhalf_t, true, true, false, 64>(
     GroupedForwardParams& param, hipStream_t stream);
-extern template void run_grouped_forward_causalmask_bias_dispatch<ck::bhalf_t, true, false, 256>(
+extern template void run_grouped_forward_causalmask_bias_dropout_dispatch<ck::bhalf_t, true, false, false, 64>(
+    GroupedForwardParams& param, hipStream_t stream);
+
+extern template void run_grouped_forward_causalmask_bias_dropout_dispatch<ck::bhalf_t, false, true, true, 128>(
+    GroupedForwardParams& param, hipStream_t stream);
+extern template void run_grouped_forward_causalmask_bias_dropout_dispatch<ck::bhalf_t, false, false, true, 128>(
+    GroupedForwardParams& param, hipStream_t stream);
+extern template void run_grouped_forward_causalmask_bias_dropout_dispatch<ck::bhalf_t, true, true, true, 128>(
+    GroupedForwardParams& param, hipStream_t stream);
+extern template void run_grouped_forward_causalmask_bias_dropout_dispatch<ck::bhalf_t, true, false, true, 128>(
+    GroupedForwardParams& param, hipStream_t stream);
+
+extern template void run_grouped_forward_causalmask_bias_dropout_dispatch<ck::bhalf_t, false, true, false, 128>(
+    GroupedForwardParams& param, hipStream_t stream);
+extern template void run_grouped_forward_causalmask_bias_dropout_dispatch<ck::bhalf_t, false, false, false, 128>(
+    GroupedForwardParams& param, hipStream_t stream);
+extern template void run_grouped_forward_causalmask_bias_dropout_dispatch<ck::bhalf_t, true, true, false, 128>(
+    GroupedForwardParams& param, hipStream_t stream);
+extern template void run_grouped_forward_causalmask_bias_dropout_dispatch<ck::bhalf_t, true, false, false, 128>(
+    GroupedForwardParams& param, hipStream_t stream);
+
+extern template void run_grouped_forward_causalmask_bias_dropout_dispatch<ck::bhalf_t, false, true, true, 256>(
+    GroupedForwardParams& param, hipStream_t stream);
+extern template void run_grouped_forward_causalmask_bias_dropout_dispatch<ck::bhalf_t, false, false, true, 256>(
+    GroupedForwardParams& param, hipStream_t stream);
+extern template void run_grouped_forward_causalmask_bias_dropout_dispatch<ck::bhalf_t, true, true, true, 256>(
+    GroupedForwardParams& param, hipStream_t stream);
+extern template void run_grouped_forward_causalmask_bias_dropout_dispatch<ck::bhalf_t, true, false, true, 256>(
+    GroupedForwardParams& param, hipStream_t stream);
+
+extern template void run_grouped_forward_causalmask_bias_dropout_dispatch<ck::bhalf_t, false, true, false, 256>(
+    GroupedForwardParams& param, hipStream_t stream);
+extern template void run_grouped_forward_causalmask_bias_dropout_dispatch<ck::bhalf_t, false, false, false, 256>(
+    GroupedForwardParams& param, hipStream_t stream);
+extern template void run_grouped_forward_causalmask_bias_dropout_dispatch<ck::bhalf_t, true, true, false, 256>(
+    GroupedForwardParams& param, hipStream_t stream);
+extern template void run_grouped_forward_causalmask_bias_dropout_dispatch<ck::bhalf_t, true, false, false, 256>(
     GroupedForwardParams& param, hipStream_t stream);
 // clang-format on
 
 void grouped_forward_bp16(GroupedForwardParams& param, hipStream_t stream) {
-  BOOL_SWITCH(param.has_attn_bias, kHasBias, [&] {
+  const bool has_dropout = (param.dropout_prob > 0.0f);
+  BOOL_SWITCH_2(param.has_attn_bias, kHasBias, has_dropout, kHasDropout, [&] {
     FMHA_FWD_HEADDIM_SWITCH(param.K, param.Kv, MaxK, [&] {
       if (param.custom_mask_type == 0)
-        run_grouped_forward_causalmask_bias_dispatch<
+        run_grouped_forward_causalmask_bias_dropout_dispatch<
             ck::bhalf_t,
             false,
             kHasBias,
+            kHasDropout,
             MaxK>(param, stream);
       else if (param.custom_mask_type == 1 || param.custom_mask_type == 2)
-        run_grouped_forward_causalmask_bias_dispatch<
+        run_grouped_forward_causalmask_bias_dropout_dispatch<
             ck::bhalf_t,
             true,
             kHasBias,
+            kHasDropout,
             MaxK>(param, stream);
       else
         throw std::runtime_error("Invalid custom_mask_type value");
diff --git a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_forward_fp16.cpp b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_forward_fp16.cpp
index f566a6d2c..abeba91f6 100644
--- a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_forward_fp16.cpp
+++ b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_forward_fp16.cpp
@@ -13,57 +13,96 @@
 #include "ck_tiled_headdim_switch.h"
 
 // clang-format off
-extern template void run_grouped_forward_causalmask_bias_dispatch<ck::half_t, false, true, 32>(
+extern template void run_grouped_forward_causalmask_bias_dropout_dispatch<ck::half_t, false, true, true, 32>(
     GroupedForwardParams& param, hipStream_t stream);
-extern template void run_grouped_forward_causalmask_bias_dispatch<ck::half_t, false, false, 32>(
+extern template void run_grouped_forward_causalmask_bias_dropout_dispatch<ck::half_t, false, false, true, 32>(
     GroupedForwardParams& param, hipStream_t stream);
-extern template void run_grouped_forward_causalmask_bias_dispatch<ck::half_t, true, true, 32>(
+extern template void run_grouped_forward_causalmask_bias_dropout_dispatch<ck::half_t, true, true, true, 32>(
     GroupedForwardParams& param, hipStream_t stream);
-extern template void run_grouped_forward_causalmask_bias_dispatch<ck::half_t, true, false, 32>(
+extern template void run_grouped_forward_causalmask_bias_dropout_dispatch<ck::half_t, true, false, true, 32>(
     GroupedForwardParams& param, hipStream_t stream);
 
-extern template void run_grouped_forward_causalmask_bias_dispatch<ck::half_t, false, true, 64>(
+extern template void run_grouped_forward_causalmask_bias_dropout_dispatch<ck::half_t, false, true, false, 32>(
     GroupedForwardParams& param, hipStream_t stream);
-extern template void run_grouped_forward_causalmask_bias_dispatch<ck::half_t, false, false, 64>(
+extern template void run_grouped_forward_causalmask_bias_dropout_dispatch<ck::half_t, false, false, false, 32>(
     GroupedForwardParams& param, hipStream_t stream);
-extern template void run_grouped_forward_causalmask_bias_dispatch<ck::half_t, true, true, 64>(
+extern template void run_grouped_forward_causalmask_bias_dropout_dispatch<ck::half_t, true, true, false, 32>(
     GroupedForwardParams& param, hipStream_t stream);
-extern template void run_grouped_forward_causalmask_bias_dispatch<ck::half_t, true, false, 64>(
+extern template void run_grouped_forward_causalmask_bias_dropout_dispatch<ck::half_t, true, false, false, 32>(
     GroupedForwardParams& param, hipStream_t stream);
 
-extern template void run_grouped_forward_causalmask_bias_dispatch<ck::half_t, false, true, 128>(
+extern template void run_grouped_forward_causalmask_bias_dropout_dispatch<ck::half_t, false, true, true, 64>(
     GroupedForwardParams& param, hipStream_t stream);
-extern template void run_grouped_forward_causalmask_bias_dispatch<ck::half_t, false, false, 128>(
+extern template void run_grouped_forward_causalmask_bias_dropout_dispatch<ck::half_t, false, false, true, 64>(
     GroupedForwardParams& param, hipStream_t stream);
-extern template void run_grouped_forward_causalmask_bias_dispatch<ck::half_t, true, true, 128>(
+extern template void run_grouped_forward_causalmask_bias_dropout_dispatch<ck::half_t, true, true, true, 64>(
     GroupedForwardParams& param, hipStream_t stream);
-extern template void run_grouped_forward_causalmask_bias_dispatch<ck::half_t, true, false, 128>(
+extern template void run_grouped_forward_causalmask_bias_dropout_dispatch<ck::half_t, true, false, true, 64>(
     GroupedForwardParams& param, hipStream_t stream);
 
-extern template void run_grouped_forward_causalmask_bias_dispatch<ck::half_t, false, true, 256>(
+extern template void run_grouped_forward_causalmask_bias_dropout_dispatch<ck::half_t, false, true, false, 64>(
     GroupedForwardParams& param, hipStream_t stream);
-extern template void run_grouped_forward_causalmask_bias_dispatch<ck::half_t, false, false, 256>(
+extern template void run_grouped_forward_causalmask_bias_dropout_dispatch<ck::half_t, false, false, false, 64>(
     GroupedForwardParams& param, hipStream_t stream);
-extern template void run_grouped_forward_causalmask_bias_dispatch<ck::half_t, true, true, 256>(
+extern template void run_grouped_forward_causalmask_bias_dropout_dispatch<ck::half_t, true, true, false, 64>(
     GroupedForwardParams& param, hipStream_t stream);
-extern template void run_grouped_forward_causalmask_bias_dispatch<ck::half_t, true, false, 256>(
+extern template void run_grouped_forward_causalmask_bias_dropout_dispatch<ck::half_t, true, false, false, 64>(
+    GroupedForwardParams& param, hipStream_t stream);
+
+extern template void run_grouped_forward_causalmask_bias_dropout_dispatch<ck::half_t, false, true, true, 128>(
+    GroupedForwardParams& param, hipStream_t stream);
+extern template void run_grouped_forward_causalmask_bias_dropout_dispatch<ck::half_t, false, false, true, 128>(
+    GroupedForwardParams& param, hipStream_t stream);
+extern template void run_grouped_forward_causalmask_bias_dropout_dispatch<ck::half_t, true, true, true, 128>(
+    GroupedForwardParams& param, hipStream_t stream);
+extern template void run_grouped_forward_causalmask_bias_dropout_dispatch<ck::half_t, true, false, true, 128>(
+    GroupedForwardParams& param, hipStream_t stream);
+
+extern template void run_grouped_forward_causalmask_bias_dropout_dispatch<ck::half_t, false, true, false, 128>(
+    GroupedForwardParams& param, hipStream_t stream);
+extern template void run_grouped_forward_causalmask_bias_dropout_dispatch<ck::half_t, false, false, false, 128>(
+    GroupedForwardParams& param, hipStream_t stream);
+extern template void run_grouped_forward_causalmask_bias_dropout_dispatch<ck::half_t, true, true, false, 128>(
+    GroupedForwardParams& param, hipStream_t stream);
+extern template void run_grouped_forward_causalmask_bias_dropout_dispatch<ck::half_t, true, false, false, 128>(
+    GroupedForwardParams& param, hipStream_t stream);
+
+extern template void run_grouped_forward_causalmask_bias_dropout_dispatch<ck::half_t, false, true, true, 256>(
+    GroupedForwardParams& param, hipStream_t stream);
+extern template void run_grouped_forward_causalmask_bias_dropout_dispatch<ck::half_t, false, false, true, 256>(
+    GroupedForwardParams& param, hipStream_t stream);
+extern template void run_grouped_forward_causalmask_bias_dropout_dispatch<ck::half_t, true, true, true, 256>(
+    GroupedForwardParams& param, hipStream_t stream);
+extern template void run_grouped_forward_causalmask_bias_dropout_dispatch<ck::half_t, true, false, true, 256>(
+    GroupedForwardParams& param, hipStream_t stream);
+
+extern template void run_grouped_forward_causalmask_bias_dropout_dispatch<ck::half_t, false, true, false, 256>(
+    GroupedForwardParams& param, hipStream_t stream);
+extern template void run_grouped_forward_causalmask_bias_dropout_dispatch<ck::half_t, false, false, false, 256>(
+    GroupedForwardParams& param, hipStream_t stream);
+extern template void run_grouped_forward_causalmask_bias_dropout_dispatch<ck::half_t, true, true, false, 256>(
+    GroupedForwardParams& param, hipStream_t stream);
+extern template void run_grouped_forward_causalmask_bias_dropout_dispatch<ck::half_t, true, false, false, 256>(
     GroupedForwardParams& param, hipStream_t stream);
 // clang-format on
 
 void grouped_forward_fp16(GroupedForwardParams& param, hipStream_t stream) {
-  BOOL_SWITCH(param.has_attn_bias, kHasBias, [&] {
+  const bool has_dropout = (param.dropout_prob > 0.0f);
+  BOOL_SWITCH_2(param.has_attn_bias, kHasBias, has_dropout, kHasDropout, [&] {
     FMHA_FWD_HEADDIM_SWITCH(param.K, param.Kv, MaxK, [&] {
       if (param.custom_mask_type == 0)
-        run_grouped_forward_causalmask_bias_dispatch<
+        run_grouped_forward_causalmask_bias_dropout_dispatch<
             ck::half_t,
             false,
             kHasBias,
+            kHasDropout,
             MaxK>(param, stream);
       else if (param.custom_mask_type == 1 || param.custom_mask_type == 2)
-        run_grouped_forward_causalmask_bias_dispatch<
+        run_grouped_forward_causalmask_bias_dropout_dispatch<
             ck::half_t,
             true,
             kHasBias,
+            kHasDropout,
             MaxK>(param, stream);
       else
         throw std::runtime_error("Invalid custom_mask_type value");
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_has_causalmask_has_attnbias_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_has_causalmask_has_attnbias_has_dropout_maxk_128.cpp
similarity index 82%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_has_causalmask_has_attnbias_maxk_128.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_has_causalmask_has_attnbias_has_dropout_maxk_128.cpp
index 0e2eced3c..dbf8459d2 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_has_causalmask_has_attnbias_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_has_causalmask_has_attnbias_has_dropout_maxk_128.cpp
@@ -8,8 +8,9 @@
 
 #include "ck_tiled_fmha_batched_forward.h"
 
-template void run_batched_forward_causalmask_bias_dispatch<
+template void run_batched_forward_causalmask_bias_dropout_dispatch<
     ck::bhalf_t,
     true,
     true,
+    true,
     128>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_has_causalmask_has_attnbias_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_has_causalmask_has_attnbias_has_dropout_maxk_256.cpp
similarity index 82%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_has_causalmask_has_attnbias_maxk_256.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_has_causalmask_has_attnbias_has_dropout_maxk_256.cpp
index 0ee352e55..0bc2865fc 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_has_causalmask_has_attnbias_maxk_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_has_causalmask_has_attnbias_has_dropout_maxk_256.cpp
@@ -8,8 +8,9 @@
 
 #include "ck_tiled_fmha_batched_forward.h"
 
-template void run_batched_forward_causalmask_bias_dispatch<
+template void run_batched_forward_causalmask_bias_dropout_dispatch<
     ck::bhalf_t,
     true,
     true,
+    true,
     256>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_has_causalmask_has_attnbias_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_has_causalmask_has_attnbias_has_dropout_maxk_32.cpp
similarity index 82%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_has_causalmask_has_attnbias_maxk_32.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_has_causalmask_has_attnbias_has_dropout_maxk_32.cpp
index 3ce3f2fd3..9390f08a4 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_has_causalmask_has_attnbias_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_has_causalmask_has_attnbias_has_dropout_maxk_32.cpp
@@ -8,8 +8,9 @@
 
 #include "ck_tiled_fmha_batched_forward.h"
 
-template void run_batched_forward_causalmask_bias_dispatch<
+template void run_batched_forward_causalmask_bias_dropout_dispatch<
     ck::bhalf_t,
     true,
     true,
+    true,
     32>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_has_causalmask_has_attnbias_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_has_causalmask_has_attnbias_has_dropout_maxk_64.cpp
similarity index 82%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_has_causalmask_has_attnbias_maxk_64.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_has_causalmask_has_attnbias_has_dropout_maxk_64.cpp
index 11674e05d..dea796009 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_has_causalmask_has_attnbias_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_has_causalmask_has_attnbias_has_dropout_maxk_64.cpp
@@ -8,8 +8,9 @@
 
 #include "ck_tiled_fmha_batched_forward.h"
 
-template void run_batched_forward_causalmask_bias_dispatch<
+template void run_batched_forward_causalmask_bias_dropout_dispatch<
     ck::bhalf_t,
     true,
     true,
+    true,
     64>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_has_causalmask_no_attnbias_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_has_causalmask_has_attnbias_no_dropout_maxk_128.cpp
similarity index 82%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_has_causalmask_no_attnbias_maxk_128.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_has_causalmask_has_attnbias_no_dropout_maxk_128.cpp
index 51996f5f8..18ace4cc5 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_has_causalmask_no_attnbias_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_has_causalmask_has_attnbias_no_dropout_maxk_128.cpp
@@ -8,8 +8,9 @@
 
 #include "ck_tiled_fmha_batched_forward.h"
 
-template void run_batched_forward_causalmask_bias_dispatch<
+template void run_batched_forward_causalmask_bias_dropout_dispatch<
     ck::bhalf_t,
     true,
+    true,
     false,
     128>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_no_causalmask_has_attnbias_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_has_causalmask_has_attnbias_no_dropout_maxk_256.cpp
similarity index 82%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_no_causalmask_has_attnbias_maxk_256.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_has_causalmask_has_attnbias_no_dropout_maxk_256.cpp
index 2edf0c9b7..1dc1c67ed 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_no_causalmask_has_attnbias_maxk_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_has_causalmask_has_attnbias_no_dropout_maxk_256.cpp
@@ -8,8 +8,9 @@
 
 #include "ck_tiled_fmha_batched_forward.h"
 
-template void run_batched_forward_causalmask_bias_dispatch<
+template void run_batched_forward_causalmask_bias_dropout_dispatch<
     ck::bhalf_t,
-    false,
     true,
+    true,
+    false,
     256>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_no_causalmask_has_attnbias_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_has_causalmask_has_attnbias_no_dropout_maxk_32.cpp
similarity index 82%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_no_causalmask_has_attnbias_maxk_32.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_has_causalmask_has_attnbias_no_dropout_maxk_32.cpp
index 00e19f71d..16f51cf1a 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_no_causalmask_has_attnbias_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_has_causalmask_has_attnbias_no_dropout_maxk_32.cpp
@@ -8,8 +8,9 @@
 
 #include "ck_tiled_fmha_batched_forward.h"
 
-template void run_batched_forward_causalmask_bias_dispatch<
+template void run_batched_forward_causalmask_bias_dropout_dispatch<
     ck::bhalf_t,
-    false,
     true,
+    true,
+    false,
     32>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_has_causalmask_no_attnbias_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_has_causalmask_has_attnbias_no_dropout_maxk_64.cpp
similarity index 82%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_has_causalmask_no_attnbias_maxk_64.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_has_causalmask_has_attnbias_no_dropout_maxk_64.cpp
index 235c706f3..95731a02e 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_has_causalmask_no_attnbias_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_has_causalmask_has_attnbias_no_dropout_maxk_64.cpp
@@ -8,8 +8,9 @@
 
 #include "ck_tiled_fmha_batched_forward.h"
 
-template void run_batched_forward_causalmask_bias_dispatch<
+template void run_batched_forward_causalmask_bias_dropout_dispatch<
     ck::bhalf_t,
     true,
+    true,
     false,
     64>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_no_causalmask_has_attnbias_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_has_causalmask_no_attnbias_has_dropout_maxk_128.cpp
similarity index 82%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_no_causalmask_has_attnbias_maxk_128.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_has_causalmask_no_attnbias_has_dropout_maxk_128.cpp
index 99f9d3dc4..3c274c3d6 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_no_causalmask_has_attnbias_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_has_causalmask_no_attnbias_has_dropout_maxk_128.cpp
@@ -8,8 +8,9 @@
 
 #include "ck_tiled_fmha_batched_forward.h"
 
-template void run_batched_forward_causalmask_bias_dispatch<
+template void run_batched_forward_causalmask_bias_dropout_dispatch<
     ck::bhalf_t,
+    true,
     false,
     true,
     128>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_has_causalmask_no_attnbias_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_has_causalmask_no_attnbias_has_dropout_maxk_256.cpp
similarity index 82%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_has_causalmask_no_attnbias_maxk_256.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_has_causalmask_no_attnbias_has_dropout_maxk_256.cpp
index 078bfe33f..0c4156faf 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_has_causalmask_no_attnbias_maxk_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_has_causalmask_no_attnbias_has_dropout_maxk_256.cpp
@@ -8,8 +8,9 @@
 
 #include "ck_tiled_fmha_batched_forward.h"
 
-template void run_batched_forward_causalmask_bias_dispatch<
+template void run_batched_forward_causalmask_bias_dropout_dispatch<
     ck::bhalf_t,
     true,
     false,
+    true,
     256>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_has_causalmask_no_attnbias_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_has_causalmask_no_attnbias_has_dropout_maxk_32.cpp
similarity index 82%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_has_causalmask_no_attnbias_maxk_32.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_has_causalmask_no_attnbias_has_dropout_maxk_32.cpp
index c6c070287..dfd127839 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_has_causalmask_no_attnbias_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_has_causalmask_no_attnbias_has_dropout_maxk_32.cpp
@@ -8,8 +8,9 @@
 
 #include "ck_tiled_fmha_batched_forward.h"
 
-template void run_batched_forward_causalmask_bias_dispatch<
+template void run_batched_forward_causalmask_bias_dropout_dispatch<
     ck::bhalf_t,
     true,
     false,
+    true,
     32>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_no_causalmask_has_attnbias_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_has_causalmask_no_attnbias_has_dropout_maxk_64.cpp
similarity index 82%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_no_causalmask_has_attnbias_maxk_64.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_has_causalmask_no_attnbias_has_dropout_maxk_64.cpp
index 529837a27..3b52555be 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_no_causalmask_has_attnbias_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_has_causalmask_no_attnbias_has_dropout_maxk_64.cpp
@@ -8,8 +8,9 @@
 
 #include "ck_tiled_fmha_batched_forward.h"
 
-template void run_batched_forward_causalmask_bias_dispatch<
+template void run_batched_forward_causalmask_bias_dropout_dispatch<
     ck::bhalf_t,
+    true,
     false,
     true,
     64>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_no_causalmask_no_attnbias_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_has_causalmask_no_attnbias_no_dropout_maxk_128.cpp
similarity index 82%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_no_causalmask_no_attnbias_maxk_128.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_has_causalmask_no_attnbias_no_dropout_maxk_128.cpp
index a7aec2f1a..657a99865 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_no_causalmask_no_attnbias_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_has_causalmask_no_attnbias_no_dropout_maxk_128.cpp
@@ -8,8 +8,9 @@
 
 #include "ck_tiled_fmha_batched_forward.h"
 
-template void run_batched_forward_causalmask_bias_dispatch<
+template void run_batched_forward_causalmask_bias_dropout_dispatch<
     ck::bhalf_t,
+    true,
     false,
     false,
     128>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_no_causalmask_no_attnbias_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_has_causalmask_no_attnbias_no_dropout_maxk_256.cpp
similarity index 82%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_no_causalmask_no_attnbias_maxk_256.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_has_causalmask_no_attnbias_no_dropout_maxk_256.cpp
index d99707cf3..263d46e27 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_no_causalmask_no_attnbias_maxk_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_has_causalmask_no_attnbias_no_dropout_maxk_256.cpp
@@ -8,8 +8,9 @@
 
 #include "ck_tiled_fmha_batched_forward.h"
 
-template void run_batched_forward_causalmask_bias_dispatch<
+template void run_batched_forward_causalmask_bias_dropout_dispatch<
     ck::bhalf_t,
+    true,
     false,
     false,
     256>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_no_causalmask_no_attnbias_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_has_causalmask_no_attnbias_no_dropout_maxk_32.cpp
similarity index 82%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_no_causalmask_no_attnbias_maxk_32.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_has_causalmask_no_attnbias_no_dropout_maxk_32.cpp
index f723ed872..775c6c1b1 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_no_causalmask_no_attnbias_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_has_causalmask_no_attnbias_no_dropout_maxk_32.cpp
@@ -8,8 +8,9 @@
 
 #include "ck_tiled_fmha_batched_forward.h"
 
-template void run_batched_forward_causalmask_bias_dispatch<
+template void run_batched_forward_causalmask_bias_dropout_dispatch<
     ck::bhalf_t,
+    true,
     false,
     false,
     32>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_no_causalmask_no_attnbias_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_has_causalmask_no_attnbias_no_dropout_maxk_64.cpp
similarity index 82%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_no_causalmask_no_attnbias_maxk_64.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_has_causalmask_no_attnbias_no_dropout_maxk_64.cpp
index 5d0095c4f..4a6a7ee89 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_no_causalmask_no_attnbias_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_has_causalmask_no_attnbias_no_dropout_maxk_64.cpp
@@ -8,8 +8,9 @@
 
 #include "ck_tiled_fmha_batched_forward.h"
 
-template void run_batched_forward_causalmask_bias_dispatch<
+template void run_batched_forward_causalmask_bias_dropout_dispatch<
     ck::bhalf_t,
+    true,
     false,
     false,
     64>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_no_causalmask_has_attnbias_has_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_no_causalmask_has_attnbias_has_dropout_maxk_128.cpp
new file mode 100644
index 000000000..c2a2db586
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_no_causalmask_has_attnbias_has_dropout_maxk_128.cpp
@@ -0,0 +1,16 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_batched_forward.h"
+
+template void run_batched_forward_causalmask_bias_dropout_dispatch<
+    ck::bhalf_t,
+    false,
+    true,
+    true,
+    128>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_no_causalmask_has_attnbias_has_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_no_causalmask_has_attnbias_has_dropout_maxk_256.cpp
new file mode 100644
index 000000000..bc20e97bd
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_no_causalmask_has_attnbias_has_dropout_maxk_256.cpp
@@ -0,0 +1,16 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_batched_forward.h"
+
+template void run_batched_forward_causalmask_bias_dropout_dispatch<
+    ck::bhalf_t,
+    false,
+    true,
+    true,
+    256>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_no_causalmask_has_attnbias_has_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_no_causalmask_has_attnbias_has_dropout_maxk_32.cpp
new file mode 100644
index 000000000..d6709f88e
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_no_causalmask_has_attnbias_has_dropout_maxk_32.cpp
@@ -0,0 +1,16 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_batched_forward.h"
+
+template void run_batched_forward_causalmask_bias_dropout_dispatch<
+    ck::bhalf_t,
+    false,
+    true,
+    true,
+    32>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_no_causalmask_has_attnbias_has_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_no_causalmask_has_attnbias_has_dropout_maxk_64.cpp
new file mode 100644
index 000000000..95eb46660
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_no_causalmask_has_attnbias_has_dropout_maxk_64.cpp
@@ -0,0 +1,16 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_batched_forward.h"
+
+template void run_batched_forward_causalmask_bias_dropout_dispatch<
+    ck::bhalf_t,
+    false,
+    true,
+    true,
+    64>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_no_causalmask_has_attnbias_no_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_no_causalmask_has_attnbias_no_dropout_maxk_128.cpp
new file mode 100644
index 000000000..a4ca78d9e
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_no_causalmask_has_attnbias_no_dropout_maxk_128.cpp
@@ -0,0 +1,16 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_batched_forward.h"
+
+template void run_batched_forward_causalmask_bias_dropout_dispatch<
+    ck::bhalf_t,
+    false,
+    true,
+    false,
+    128>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_no_causalmask_has_attnbias_no_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_no_causalmask_has_attnbias_no_dropout_maxk_256.cpp
new file mode 100644
index 000000000..e515cfbb5
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_no_causalmask_has_attnbias_no_dropout_maxk_256.cpp
@@ -0,0 +1,16 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_batched_forward.h"
+
+template void run_batched_forward_causalmask_bias_dropout_dispatch<
+    ck::bhalf_t,
+    false,
+    true,
+    false,
+    256>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_no_causalmask_has_attnbias_no_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_no_causalmask_has_attnbias_no_dropout_maxk_32.cpp
new file mode 100644
index 000000000..7f573e21e
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_no_causalmask_has_attnbias_no_dropout_maxk_32.cpp
@@ -0,0 +1,16 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_batched_forward.h"
+
+template void run_batched_forward_causalmask_bias_dropout_dispatch<
+    ck::bhalf_t,
+    false,
+    true,
+    false,
+    32>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_no_causalmask_has_attnbias_no_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_no_causalmask_has_attnbias_no_dropout_maxk_64.cpp
new file mode 100644
index 000000000..6980a4141
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_no_causalmask_has_attnbias_no_dropout_maxk_64.cpp
@@ -0,0 +1,16 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_batched_forward.h"
+
+template void run_batched_forward_causalmask_bias_dropout_dispatch<
+    ck::bhalf_t,
+    false,
+    true,
+    false,
+    64>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_no_causalmask_no_attnbias_has_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_no_causalmask_no_attnbias_has_dropout_maxk_128.cpp
new file mode 100644
index 000000000..a6784236f
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_no_causalmask_no_attnbias_has_dropout_maxk_128.cpp
@@ -0,0 +1,16 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_batched_forward.h"
+
+template void run_batched_forward_causalmask_bias_dropout_dispatch<
+    ck::bhalf_t,
+    false,
+    false,
+    true,
+    128>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_no_causalmask_no_attnbias_has_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_no_causalmask_no_attnbias_has_dropout_maxk_256.cpp
new file mode 100644
index 000000000..df6c6c72d
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_no_causalmask_no_attnbias_has_dropout_maxk_256.cpp
@@ -0,0 +1,16 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_batched_forward.h"
+
+template void run_batched_forward_causalmask_bias_dropout_dispatch<
+    ck::bhalf_t,
+    false,
+    false,
+    true,
+    256>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_no_causalmask_no_attnbias_has_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_no_causalmask_no_attnbias_has_dropout_maxk_32.cpp
new file mode 100644
index 000000000..394728af1
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_no_causalmask_no_attnbias_has_dropout_maxk_32.cpp
@@ -0,0 +1,16 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_batched_forward.h"
+
+template void run_batched_forward_causalmask_bias_dropout_dispatch<
+    ck::bhalf_t,
+    false,
+    false,
+    true,
+    32>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_no_causalmask_no_attnbias_has_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_no_causalmask_no_attnbias_has_dropout_maxk_64.cpp
new file mode 100644
index 000000000..b2ef9186f
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_no_causalmask_no_attnbias_has_dropout_maxk_64.cpp
@@ -0,0 +1,16 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_batched_forward.h"
+
+template void run_batched_forward_causalmask_bias_dropout_dispatch<
+    ck::bhalf_t,
+    false,
+    false,
+    true,
+    64>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_no_causalmask_no_attnbias_no_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_no_causalmask_no_attnbias_no_dropout_maxk_128.cpp
new file mode 100644
index 000000000..4abe212c7
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_no_causalmask_no_attnbias_no_dropout_maxk_128.cpp
@@ -0,0 +1,16 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_batched_forward.h"
+
+template void run_batched_forward_causalmask_bias_dropout_dispatch<
+    ck::bhalf_t,
+    false,
+    false,
+    false,
+    128>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_no_causalmask_no_attnbias_no_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_no_causalmask_no_attnbias_no_dropout_maxk_256.cpp
new file mode 100644
index 000000000..bab70f814
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_no_causalmask_no_attnbias_no_dropout_maxk_256.cpp
@@ -0,0 +1,16 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_batched_forward.h"
+
+template void run_batched_forward_causalmask_bias_dropout_dispatch<
+    ck::bhalf_t,
+    false,
+    false,
+    false,
+    256>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_no_causalmask_no_attnbias_no_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_no_causalmask_no_attnbias_no_dropout_maxk_32.cpp
new file mode 100644
index 000000000..8b8cc0a16
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_no_causalmask_no_attnbias_no_dropout_maxk_32.cpp
@@ -0,0 +1,16 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_batched_forward.h"
+
+template void run_batched_forward_causalmask_bias_dropout_dispatch<
+    ck::bhalf_t,
+    false,
+    false,
+    false,
+    32>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_no_causalmask_no_attnbias_no_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_no_causalmask_no_attnbias_no_dropout_maxk_64.cpp
new file mode 100644
index 000000000..c2f4badc4
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_no_causalmask_no_attnbias_no_dropout_maxk_64.cpp
@@ -0,0 +1,16 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_batched_forward.h"
+
+template void run_batched_forward_causalmask_bias_dropout_dispatch<
+    ck::bhalf_t,
+    false,
+    false,
+    false,
+    64>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_has_causalmask_has_attnbias_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_has_causalmask_has_attnbias_has_dropout_maxk_128.cpp
similarity index 82%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_has_causalmask_has_attnbias_maxk_128.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_has_causalmask_has_attnbias_has_dropout_maxk_128.cpp
index c8b985564..249c4f425 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_has_causalmask_has_attnbias_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_has_causalmask_has_attnbias_has_dropout_maxk_128.cpp
@@ -8,8 +8,9 @@
 
 #include "ck_tiled_fmha_batched_forward.h"
 
-template void run_batched_forward_causalmask_bias_dispatch<
+template void run_batched_forward_causalmask_bias_dropout_dispatch<
     ck::half_t,
     true,
     true,
+    true,
     128>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_has_causalmask_has_attnbias_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_has_causalmask_has_attnbias_has_dropout_maxk_256.cpp
similarity index 82%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_has_causalmask_has_attnbias_maxk_256.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_has_causalmask_has_attnbias_has_dropout_maxk_256.cpp
index e0beb8f59..33ea7c25a 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_has_causalmask_has_attnbias_maxk_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_has_causalmask_has_attnbias_has_dropout_maxk_256.cpp
@@ -8,8 +8,9 @@
 
 #include "ck_tiled_fmha_batched_forward.h"
 
-template void run_batched_forward_causalmask_bias_dispatch<
+template void run_batched_forward_causalmask_bias_dropout_dispatch<
     ck::half_t,
     true,
     true,
+    true,
     256>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_has_causalmask_has_attnbias_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_has_causalmask_has_attnbias_has_dropout_maxk_32.cpp
similarity index 82%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_has_causalmask_has_attnbias_maxk_32.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_has_causalmask_has_attnbias_has_dropout_maxk_32.cpp
index a58be730c..fcc6ac153 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_has_causalmask_has_attnbias_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_has_causalmask_has_attnbias_has_dropout_maxk_32.cpp
@@ -8,8 +8,9 @@
 
 #include "ck_tiled_fmha_batched_forward.h"
 
-template void run_batched_forward_causalmask_bias_dispatch<
+template void run_batched_forward_causalmask_bias_dropout_dispatch<
     ck::half_t,
     true,
     true,
+    true,
     32>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_has_causalmask_has_attnbias_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_has_causalmask_has_attnbias_has_dropout_maxk_64.cpp
similarity index 82%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_has_causalmask_has_attnbias_maxk_64.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_has_causalmask_has_attnbias_has_dropout_maxk_64.cpp
index 5ef660d35..f7547b577 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_has_causalmask_has_attnbias_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_has_causalmask_has_attnbias_has_dropout_maxk_64.cpp
@@ -8,8 +8,9 @@
 
 #include "ck_tiled_fmha_batched_forward.h"
 
-template void run_batched_forward_causalmask_bias_dispatch<
+template void run_batched_forward_causalmask_bias_dropout_dispatch<
     ck::half_t,
     true,
     true,
+    true,
     64>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_has_causalmask_no_attnbias_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_has_causalmask_has_attnbias_no_dropout_maxk_128.cpp
similarity index 82%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_has_causalmask_no_attnbias_maxk_128.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_has_causalmask_has_attnbias_no_dropout_maxk_128.cpp
index c12bcafdb..dd28c7c87 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_has_causalmask_no_attnbias_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_has_causalmask_has_attnbias_no_dropout_maxk_128.cpp
@@ -8,8 +8,9 @@
 
 #include "ck_tiled_fmha_batched_forward.h"
 
-template void run_batched_forward_causalmask_bias_dispatch<
+template void run_batched_forward_causalmask_bias_dropout_dispatch<
     ck::half_t,
     true,
+    true,
     false,
     128>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_no_causalmask_has_attnbias_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_has_causalmask_has_attnbias_no_dropout_maxk_256.cpp
similarity index 82%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_no_causalmask_has_attnbias_maxk_256.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_has_causalmask_has_attnbias_no_dropout_maxk_256.cpp
index bb1126829..808d4e710 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_no_causalmask_has_attnbias_maxk_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_has_causalmask_has_attnbias_no_dropout_maxk_256.cpp
@@ -8,8 +8,9 @@
 
 #include "ck_tiled_fmha_batched_forward.h"
 
-template void run_batched_forward_causalmask_bias_dispatch<
+template void run_batched_forward_causalmask_bias_dropout_dispatch<
     ck::half_t,
-    false,
     true,
+    true,
+    false,
     256>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_no_causalmask_has_attnbias_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_has_causalmask_has_attnbias_no_dropout_maxk_32.cpp
similarity index 82%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_no_causalmask_has_attnbias_maxk_32.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_has_causalmask_has_attnbias_no_dropout_maxk_32.cpp
index 6911476cb..72c6714a5 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_no_causalmask_has_attnbias_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_has_causalmask_has_attnbias_no_dropout_maxk_32.cpp
@@ -8,8 +8,9 @@
 
 #include "ck_tiled_fmha_batched_forward.h"
 
-template void run_batched_forward_causalmask_bias_dispatch<
+template void run_batched_forward_causalmask_bias_dropout_dispatch<
     ck::half_t,
-    false,
     true,
+    true,
+    false,
     32>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_no_causalmask_has_attnbias_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_has_causalmask_has_attnbias_no_dropout_maxk_64.cpp
similarity index 82%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_no_causalmask_has_attnbias_maxk_64.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_has_causalmask_has_attnbias_no_dropout_maxk_64.cpp
index f9aaf8a71..f0c6d5967 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_no_causalmask_has_attnbias_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_has_causalmask_has_attnbias_no_dropout_maxk_64.cpp
@@ -8,8 +8,9 @@
 
 #include "ck_tiled_fmha_batched_forward.h"
 
-template void run_batched_forward_causalmask_bias_dispatch<
+template void run_batched_forward_causalmask_bias_dropout_dispatch<
     ck::half_t,
-    false,
     true,
+    true,
+    false,
     64>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_no_causalmask_has_attnbias_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_has_causalmask_no_attnbias_has_dropout_maxk_128.cpp
similarity index 82%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_no_causalmask_has_attnbias_maxk_128.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_has_causalmask_no_attnbias_has_dropout_maxk_128.cpp
index 3c6aa04c5..5f0d70239 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_no_causalmask_has_attnbias_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_has_causalmask_no_attnbias_has_dropout_maxk_128.cpp
@@ -8,8 +8,9 @@
 
 #include "ck_tiled_fmha_batched_forward.h"
 
-template void run_batched_forward_causalmask_bias_dispatch<
+template void run_batched_forward_causalmask_bias_dropout_dispatch<
     ck::half_t,
+    true,
     false,
     true,
     128>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_has_causalmask_no_attnbias_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_has_causalmask_no_attnbias_has_dropout_maxk_256.cpp
similarity index 82%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_has_causalmask_no_attnbias_maxk_256.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_has_causalmask_no_attnbias_has_dropout_maxk_256.cpp
index 00aacf534..0ac3953bc 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_has_causalmask_no_attnbias_maxk_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_has_causalmask_no_attnbias_has_dropout_maxk_256.cpp
@@ -8,8 +8,9 @@
 
 #include "ck_tiled_fmha_batched_forward.h"
 
-template void run_batched_forward_causalmask_bias_dispatch<
+template void run_batched_forward_causalmask_bias_dropout_dispatch<
     ck::half_t,
     true,
     false,
+    true,
     256>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_has_causalmask_no_attnbias_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_has_causalmask_no_attnbias_has_dropout_maxk_32.cpp
similarity index 82%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_has_causalmask_no_attnbias_maxk_32.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_has_causalmask_no_attnbias_has_dropout_maxk_32.cpp
index 9e2963e42..22586dc95 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_has_causalmask_no_attnbias_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_has_causalmask_no_attnbias_has_dropout_maxk_32.cpp
@@ -8,8 +8,9 @@
 
 #include "ck_tiled_fmha_batched_forward.h"
 
-template void run_batched_forward_causalmask_bias_dispatch<
+template void run_batched_forward_causalmask_bias_dropout_dispatch<
     ck::half_t,
     true,
     false,
+    true,
     32>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_has_causalmask_no_attnbias_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_has_causalmask_no_attnbias_has_dropout_maxk_64.cpp
similarity index 82%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_has_causalmask_no_attnbias_maxk_64.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_has_causalmask_no_attnbias_has_dropout_maxk_64.cpp
index 93972071b..8ea49cdfd 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_has_causalmask_no_attnbias_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_has_causalmask_no_attnbias_has_dropout_maxk_64.cpp
@@ -8,8 +8,9 @@
 
 #include "ck_tiled_fmha_batched_forward.h"
 
-template void run_batched_forward_causalmask_bias_dispatch<
+template void run_batched_forward_causalmask_bias_dropout_dispatch<
     ck::half_t,
     true,
     false,
+    true,
     64>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_no_causalmask_no_attnbias_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_has_causalmask_no_attnbias_no_dropout_maxk_128.cpp
similarity index 82%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_no_causalmask_no_attnbias_maxk_128.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_has_causalmask_no_attnbias_no_dropout_maxk_128.cpp
index c1d701e6e..505d4d048 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_no_causalmask_no_attnbias_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_has_causalmask_no_attnbias_no_dropout_maxk_128.cpp
@@ -8,8 +8,9 @@
 
 #include "ck_tiled_fmha_batched_forward.h"
 
-template void run_batched_forward_causalmask_bias_dispatch<
+template void run_batched_forward_causalmask_bias_dropout_dispatch<
     ck::half_t,
+    true,
     false,
     false,
     128>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_no_causalmask_no_attnbias_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_has_causalmask_no_attnbias_no_dropout_maxk_256.cpp
similarity index 82%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_no_causalmask_no_attnbias_maxk_256.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_has_causalmask_no_attnbias_no_dropout_maxk_256.cpp
index 01435a301..a438cca43 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_no_causalmask_no_attnbias_maxk_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_has_causalmask_no_attnbias_no_dropout_maxk_256.cpp
@@ -8,8 +8,9 @@
 
 #include "ck_tiled_fmha_batched_forward.h"
 
-template void run_batched_forward_causalmask_bias_dispatch<
+template void run_batched_forward_causalmask_bias_dropout_dispatch<
     ck::half_t,
+    true,
     false,
     false,
     256>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_no_causalmask_no_attnbias_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_has_causalmask_no_attnbias_no_dropout_maxk_32.cpp
similarity index 82%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_no_causalmask_no_attnbias_maxk_32.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_has_causalmask_no_attnbias_no_dropout_maxk_32.cpp
index e499377ba..96fd2bbb2 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_no_causalmask_no_attnbias_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_has_causalmask_no_attnbias_no_dropout_maxk_32.cpp
@@ -8,8 +8,9 @@
 
 #include "ck_tiled_fmha_batched_forward.h"
 
-template void run_batched_forward_causalmask_bias_dispatch<
+template void run_batched_forward_causalmask_bias_dropout_dispatch<
     ck::half_t,
+    true,
     false,
     false,
     32>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_no_causalmask_no_attnbias_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_has_causalmask_no_attnbias_no_dropout_maxk_64.cpp
similarity index 82%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_no_causalmask_no_attnbias_maxk_64.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_has_causalmask_no_attnbias_no_dropout_maxk_64.cpp
index 8cf6fe551..4a5105996 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_no_causalmask_no_attnbias_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_has_causalmask_no_attnbias_no_dropout_maxk_64.cpp
@@ -8,8 +8,9 @@
 
 #include "ck_tiled_fmha_batched_forward.h"
 
-template void run_batched_forward_causalmask_bias_dispatch<
+template void run_batched_forward_causalmask_bias_dropout_dispatch<
     ck::half_t,
+    true,
     false,
     false,
     64>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_no_causalmask_has_attnbias_has_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_no_causalmask_has_attnbias_has_dropout_maxk_128.cpp
new file mode 100644
index 000000000..ca332b921
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_no_causalmask_has_attnbias_has_dropout_maxk_128.cpp
@@ -0,0 +1,16 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_batched_forward.h"
+
+template void run_batched_forward_causalmask_bias_dropout_dispatch<
+    ck::half_t,
+    false,
+    true,
+    true,
+    128>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_no_causalmask_has_attnbias_has_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_no_causalmask_has_attnbias_has_dropout_maxk_256.cpp
new file mode 100644
index 000000000..2791fc6ff
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_no_causalmask_has_attnbias_has_dropout_maxk_256.cpp
@@ -0,0 +1,16 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_batched_forward.h"
+
+template void run_batched_forward_causalmask_bias_dropout_dispatch<
+    ck::half_t,
+    false,
+    true,
+    true,
+    256>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_no_causalmask_has_attnbias_has_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_no_causalmask_has_attnbias_has_dropout_maxk_32.cpp
new file mode 100644
index 000000000..f40ba4ec3
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_no_causalmask_has_attnbias_has_dropout_maxk_32.cpp
@@ -0,0 +1,16 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_batched_forward.h"
+
+template void run_batched_forward_causalmask_bias_dropout_dispatch<
+    ck::half_t,
+    false,
+    true,
+    true,
+    32>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_no_causalmask_has_attnbias_has_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_no_causalmask_has_attnbias_has_dropout_maxk_64.cpp
new file mode 100644
index 000000000..03a78009e
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_no_causalmask_has_attnbias_has_dropout_maxk_64.cpp
@@ -0,0 +1,16 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_batched_forward.h"
+
+template void run_batched_forward_causalmask_bias_dropout_dispatch<
+    ck::half_t,
+    false,
+    true,
+    true,
+    64>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_no_causalmask_has_attnbias_no_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_no_causalmask_has_attnbias_no_dropout_maxk_128.cpp
new file mode 100644
index 000000000..bd319545a
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_no_causalmask_has_attnbias_no_dropout_maxk_128.cpp
@@ -0,0 +1,16 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_batched_forward.h"
+
+template void run_batched_forward_causalmask_bias_dropout_dispatch<
+    ck::half_t,
+    false,
+    true,
+    false,
+    128>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_no_causalmask_has_attnbias_no_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_no_causalmask_has_attnbias_no_dropout_maxk_256.cpp
new file mode 100644
index 000000000..97f7fbd46
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_no_causalmask_has_attnbias_no_dropout_maxk_256.cpp
@@ -0,0 +1,16 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_batched_forward.h"
+
+template void run_batched_forward_causalmask_bias_dropout_dispatch<
+    ck::half_t,
+    false,
+    true,
+    false,
+    256>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_no_causalmask_has_attnbias_no_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_no_causalmask_has_attnbias_no_dropout_maxk_32.cpp
new file mode 100644
index 000000000..5edd0cd40
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_no_causalmask_has_attnbias_no_dropout_maxk_32.cpp
@@ -0,0 +1,16 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_batched_forward.h"
+
+template void run_batched_forward_causalmask_bias_dropout_dispatch<
+    ck::half_t,
+    false,
+    true,
+    false,
+    32>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_no_causalmask_has_attnbias_no_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_no_causalmask_has_attnbias_no_dropout_maxk_64.cpp
new file mode 100644
index 000000000..4e0f85734
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_no_causalmask_has_attnbias_no_dropout_maxk_64.cpp
@@ -0,0 +1,16 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_batched_forward.h"
+
+template void run_batched_forward_causalmask_bias_dropout_dispatch<
+    ck::half_t,
+    false,
+    true,
+    false,
+    64>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_no_causalmask_no_attnbias_has_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_no_causalmask_no_attnbias_has_dropout_maxk_128.cpp
new file mode 100644
index 000000000..da15841a3
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_no_causalmask_no_attnbias_has_dropout_maxk_128.cpp
@@ -0,0 +1,16 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_batched_forward.h"
+
+template void run_batched_forward_causalmask_bias_dropout_dispatch<
+    ck::half_t,
+    false,
+    false,
+    true,
+    128>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_no_causalmask_no_attnbias_has_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_no_causalmask_no_attnbias_has_dropout_maxk_256.cpp
new file mode 100644
index 000000000..f2ba8c911
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_no_causalmask_no_attnbias_has_dropout_maxk_256.cpp
@@ -0,0 +1,16 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_batched_forward.h"
+
+template void run_batched_forward_causalmask_bias_dropout_dispatch<
+    ck::half_t,
+    false,
+    false,
+    true,
+    256>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_no_causalmask_no_attnbias_has_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_no_causalmask_no_attnbias_has_dropout_maxk_32.cpp
new file mode 100644
index 000000000..93ef1d810
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_no_causalmask_no_attnbias_has_dropout_maxk_32.cpp
@@ -0,0 +1,16 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_batched_forward.h"
+
+template void run_batched_forward_causalmask_bias_dropout_dispatch<
+    ck::half_t,
+    false,
+    false,
+    true,
+    32>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_no_causalmask_no_attnbias_has_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_no_causalmask_no_attnbias_has_dropout_maxk_64.cpp
new file mode 100644
index 000000000..ab6382b62
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_no_causalmask_no_attnbias_has_dropout_maxk_64.cpp
@@ -0,0 +1,16 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_batched_forward.h"
+
+template void run_batched_forward_causalmask_bias_dropout_dispatch<
+    ck::half_t,
+    false,
+    false,
+    true,
+    64>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_no_causalmask_no_attnbias_no_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_no_causalmask_no_attnbias_no_dropout_maxk_128.cpp
new file mode 100644
index 000000000..84deea900
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_no_causalmask_no_attnbias_no_dropout_maxk_128.cpp
@@ -0,0 +1,16 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_batched_forward.h"
+
+template void run_batched_forward_causalmask_bias_dropout_dispatch<
+    ck::half_t,
+    false,
+    false,
+    false,
+    128>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_no_causalmask_no_attnbias_no_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_no_causalmask_no_attnbias_no_dropout_maxk_256.cpp
new file mode 100644
index 000000000..cf24162f4
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_no_causalmask_no_attnbias_no_dropout_maxk_256.cpp
@@ -0,0 +1,16 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_batched_forward.h"
+
+template void run_batched_forward_causalmask_bias_dropout_dispatch<
+    ck::half_t,
+    false,
+    false,
+    false,
+    256>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_no_causalmask_no_attnbias_no_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_no_causalmask_no_attnbias_no_dropout_maxk_32.cpp
new file mode 100644
index 000000000..392151f6d
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_no_causalmask_no_attnbias_no_dropout_maxk_32.cpp
@@ -0,0 +1,16 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_batched_forward.h"
+
+template void run_batched_forward_causalmask_bias_dropout_dispatch<
+    ck::half_t,
+    false,
+    false,
+    false,
+    32>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_no_causalmask_no_attnbias_no_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_no_causalmask_no_attnbias_no_dropout_maxk_64.cpp
new file mode 100644
index 000000000..2960c998b
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_no_causalmask_no_attnbias_no_dropout_maxk_64.cpp
@@ -0,0 +1,16 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_batched_forward.h"
+
+template void run_batched_forward_causalmask_bias_dropout_dispatch<
+    ck::half_t,
+    false,
+    false,
+    false,
+    64>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_has_causalmask_has_attnbias_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_has_causalmask_has_attnbias_has_dropout_maxk_128.cpp
similarity index 82%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_has_causalmask_has_attnbias_maxk_128.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_has_causalmask_has_attnbias_has_dropout_maxk_128.cpp
index 6b74ac612..e801c3f93 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_has_causalmask_has_attnbias_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_has_causalmask_has_attnbias_has_dropout_maxk_128.cpp
@@ -8,8 +8,9 @@
 
 #include "ck_tiled_fmha_grouped_forward.h"
 
-template void run_grouped_forward_causalmask_bias_dispatch<
+template void run_grouped_forward_causalmask_bias_dropout_dispatch<
     ck::bhalf_t,
     true,
     true,
+    true,
     128>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_has_causalmask_has_attnbias_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_has_causalmask_has_attnbias_has_dropout_maxk_256.cpp
similarity index 82%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_has_causalmask_has_attnbias_maxk_256.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_has_causalmask_has_attnbias_has_dropout_maxk_256.cpp
index d973d299b..da3f9451c 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_has_causalmask_has_attnbias_maxk_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_has_causalmask_has_attnbias_has_dropout_maxk_256.cpp
@@ -8,8 +8,9 @@
 
 #include "ck_tiled_fmha_grouped_forward.h"
 
-template void run_grouped_forward_causalmask_bias_dispatch<
+template void run_grouped_forward_causalmask_bias_dropout_dispatch<
     ck::bhalf_t,
     true,
     true,
+    true,
     256>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_has_causalmask_has_attnbias_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_has_causalmask_has_attnbias_has_dropout_maxk_32.cpp
similarity index 82%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_has_causalmask_has_attnbias_maxk_32.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_has_causalmask_has_attnbias_has_dropout_maxk_32.cpp
index 3ff1b2901..097cc7bf6 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_has_causalmask_has_attnbias_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_has_causalmask_has_attnbias_has_dropout_maxk_32.cpp
@@ -8,8 +8,9 @@
 
 #include "ck_tiled_fmha_grouped_forward.h"
 
-template void run_grouped_forward_causalmask_bias_dispatch<
+template void run_grouped_forward_causalmask_bias_dropout_dispatch<
     ck::bhalf_t,
     true,
     true,
+    true,
     32>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_has_causalmask_has_attnbias_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_has_causalmask_has_attnbias_has_dropout_maxk_64.cpp
similarity index 82%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_has_causalmask_has_attnbias_maxk_64.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_has_causalmask_has_attnbias_has_dropout_maxk_64.cpp
index 1347d1da8..26f0cb5ec 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_has_causalmask_has_attnbias_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_has_causalmask_has_attnbias_has_dropout_maxk_64.cpp
@@ -8,8 +8,9 @@
 
 #include "ck_tiled_fmha_grouped_forward.h"
 
-template void run_grouped_forward_causalmask_bias_dispatch<
+template void run_grouped_forward_causalmask_bias_dropout_dispatch<
     ck::bhalf_t,
     true,
     true,
+    true,
     64>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_has_causalmask_no_attnbias_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_has_causalmask_has_attnbias_no_dropout_maxk_128.cpp
similarity index 82%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_has_causalmask_no_attnbias_maxk_128.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_has_causalmask_has_attnbias_no_dropout_maxk_128.cpp
index b4320968e..48887ba1b 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_has_causalmask_no_attnbias_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_has_causalmask_has_attnbias_no_dropout_maxk_128.cpp
@@ -8,8 +8,9 @@
 
 #include "ck_tiled_fmha_grouped_forward.h"
 
-template void run_grouped_forward_causalmask_bias_dispatch<
+template void run_grouped_forward_causalmask_bias_dropout_dispatch<
     ck::bhalf_t,
     true,
+    true,
     false,
     128>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_no_causalmask_has_attnbias_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_has_causalmask_has_attnbias_no_dropout_maxk_256.cpp
similarity index 82%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_no_causalmask_has_attnbias_maxk_256.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_has_causalmask_has_attnbias_no_dropout_maxk_256.cpp
index cdb13030e..8b49d8374 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_no_causalmask_has_attnbias_maxk_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_has_causalmask_has_attnbias_no_dropout_maxk_256.cpp
@@ -8,8 +8,9 @@
 
 #include "ck_tiled_fmha_grouped_forward.h"
 
-template void run_grouped_forward_causalmask_bias_dispatch<
+template void run_grouped_forward_causalmask_bias_dropout_dispatch<
     ck::bhalf_t,
-    false,
     true,
+    true,
+    false,
     256>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_no_causalmask_has_attnbias_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_has_causalmask_has_attnbias_no_dropout_maxk_32.cpp
similarity index 82%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_no_causalmask_has_attnbias_maxk_32.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_has_causalmask_has_attnbias_no_dropout_maxk_32.cpp
index 344307c4c..49402375a 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_no_causalmask_has_attnbias_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_has_causalmask_has_attnbias_no_dropout_maxk_32.cpp
@@ -8,8 +8,9 @@
 
 #include "ck_tiled_fmha_grouped_forward.h"
 
-template void run_grouped_forward_causalmask_bias_dispatch<
+template void run_grouped_forward_causalmask_bias_dropout_dispatch<
     ck::bhalf_t,
-    false,
     true,
+    true,
+    false,
     32>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_has_causalmask_no_attnbias_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_has_causalmask_has_attnbias_no_dropout_maxk_64.cpp
similarity index 82%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_has_causalmask_no_attnbias_maxk_64.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_has_causalmask_has_attnbias_no_dropout_maxk_64.cpp
index 0da1dbf1a..a402d9805 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_has_causalmask_no_attnbias_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_has_causalmask_has_attnbias_no_dropout_maxk_64.cpp
@@ -8,8 +8,9 @@
 
 #include "ck_tiled_fmha_grouped_forward.h"
 
-template void run_grouped_forward_causalmask_bias_dispatch<
+template void run_grouped_forward_causalmask_bias_dropout_dispatch<
     ck::bhalf_t,
     true,
+    true,
     false,
     64>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_no_causalmask_has_attnbias_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_has_causalmask_no_attnbias_has_dropout_maxk_128.cpp
similarity index 82%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_no_causalmask_has_attnbias_maxk_128.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_has_causalmask_no_attnbias_has_dropout_maxk_128.cpp
index 5a078f4ad..d5f2785d7 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_no_causalmask_has_attnbias_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_has_causalmask_no_attnbias_has_dropout_maxk_128.cpp
@@ -8,8 +8,9 @@
 
 #include "ck_tiled_fmha_grouped_forward.h"
 
-template void run_grouped_forward_causalmask_bias_dispatch<
+template void run_grouped_forward_causalmask_bias_dropout_dispatch<
     ck::bhalf_t,
+    true,
     false,
     true,
     128>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_has_causalmask_no_attnbias_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_has_causalmask_no_attnbias_has_dropout_maxk_256.cpp
similarity index 82%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_has_causalmask_no_attnbias_maxk_256.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_has_causalmask_no_attnbias_has_dropout_maxk_256.cpp
index 7654c11cd..9a7c28fb5 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_has_causalmask_no_attnbias_maxk_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_has_causalmask_no_attnbias_has_dropout_maxk_256.cpp
@@ -8,8 +8,9 @@
 
 #include "ck_tiled_fmha_grouped_forward.h"
 
-template void run_grouped_forward_causalmask_bias_dispatch<
+template void run_grouped_forward_causalmask_bias_dropout_dispatch<
     ck::bhalf_t,
     true,
     false,
+    true,
     256>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_has_causalmask_no_attnbias_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_has_causalmask_no_attnbias_has_dropout_maxk_32.cpp
similarity index 82%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_has_causalmask_no_attnbias_maxk_32.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_has_causalmask_no_attnbias_has_dropout_maxk_32.cpp
index dd8ee2879..e8e1a889f 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_has_causalmask_no_attnbias_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_has_causalmask_no_attnbias_has_dropout_maxk_32.cpp
@@ -8,8 +8,9 @@
 
 #include "ck_tiled_fmha_grouped_forward.h"
 
-template void run_grouped_forward_causalmask_bias_dispatch<
+template void run_grouped_forward_causalmask_bias_dropout_dispatch<
     ck::bhalf_t,
     true,
     false,
+    true,
     32>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_no_causalmask_has_attnbias_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_has_causalmask_no_attnbias_has_dropout_maxk_64.cpp
similarity index 82%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_no_causalmask_has_attnbias_maxk_64.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_has_causalmask_no_attnbias_has_dropout_maxk_64.cpp
index a8604cd7c..cf0245833 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_no_causalmask_has_attnbias_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_has_causalmask_no_attnbias_has_dropout_maxk_64.cpp
@@ -8,8 +8,9 @@
 
 #include "ck_tiled_fmha_grouped_forward.h"
 
-template void run_grouped_forward_causalmask_bias_dispatch<
+template void run_grouped_forward_causalmask_bias_dropout_dispatch<
     ck::bhalf_t,
+    true,
     false,
     true,
     64>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_no_causalmask_no_attnbias_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_has_causalmask_no_attnbias_no_dropout_maxk_128.cpp
similarity index 82%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_no_causalmask_no_attnbias_maxk_128.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_has_causalmask_no_attnbias_no_dropout_maxk_128.cpp
index d9e339266..ba58b2a3a 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_no_causalmask_no_attnbias_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_has_causalmask_no_attnbias_no_dropout_maxk_128.cpp
@@ -8,8 +8,9 @@
 
 #include "ck_tiled_fmha_grouped_forward.h"
 
-template void run_grouped_forward_causalmask_bias_dispatch<
+template void run_grouped_forward_causalmask_bias_dropout_dispatch<
     ck::bhalf_t,
+    true,
     false,
     false,
     128>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_no_causalmask_no_attnbias_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_has_causalmask_no_attnbias_no_dropout_maxk_256.cpp
similarity index 82%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_no_causalmask_no_attnbias_maxk_256.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_has_causalmask_no_attnbias_no_dropout_maxk_256.cpp
index 339c05c01..3f472877d 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_no_causalmask_no_attnbias_maxk_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_has_causalmask_no_attnbias_no_dropout_maxk_256.cpp
@@ -8,8 +8,9 @@
 
 #include "ck_tiled_fmha_grouped_forward.h"
 
-template void run_grouped_forward_causalmask_bias_dispatch<
+template void run_grouped_forward_causalmask_bias_dropout_dispatch<
     ck::bhalf_t,
+    true,
     false,
     false,
     256>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_no_causalmask_no_attnbias_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_has_causalmask_no_attnbias_no_dropout_maxk_32.cpp
similarity index 82%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_no_causalmask_no_attnbias_maxk_32.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_has_causalmask_no_attnbias_no_dropout_maxk_32.cpp
index 9c600d90e..533d97a53 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_no_causalmask_no_attnbias_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_has_causalmask_no_attnbias_no_dropout_maxk_32.cpp
@@ -8,8 +8,9 @@
 
 #include "ck_tiled_fmha_grouped_forward.h"
 
-template void run_grouped_forward_causalmask_bias_dispatch<
+template void run_grouped_forward_causalmask_bias_dropout_dispatch<
     ck::bhalf_t,
+    true,
     false,
     false,
     32>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_no_causalmask_no_attnbias_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_has_causalmask_no_attnbias_no_dropout_maxk_64.cpp
similarity index 82%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_no_causalmask_no_attnbias_maxk_64.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_has_causalmask_no_attnbias_no_dropout_maxk_64.cpp
index c2ecf7d9d..48672f2e0 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_no_causalmask_no_attnbias_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_has_causalmask_no_attnbias_no_dropout_maxk_64.cpp
@@ -8,8 +8,9 @@
 
 #include "ck_tiled_fmha_grouped_forward.h"
 
-template void run_grouped_forward_causalmask_bias_dispatch<
+template void run_grouped_forward_causalmask_bias_dropout_dispatch<
     ck::bhalf_t,
+    true,
     false,
     false,
     64>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_no_causalmask_has_attnbias_has_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_no_causalmask_has_attnbias_has_dropout_maxk_128.cpp
new file mode 100644
index 000000000..ec2af1f10
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_no_causalmask_has_attnbias_has_dropout_maxk_128.cpp
@@ -0,0 +1,16 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_grouped_forward.h"
+
+template void run_grouped_forward_causalmask_bias_dropout_dispatch<
+    ck::bhalf_t,
+    false,
+    true,
+    true,
+    128>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_no_causalmask_has_attnbias_has_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_no_causalmask_has_attnbias_has_dropout_maxk_256.cpp
new file mode 100644
index 000000000..44f5e1e41
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_no_causalmask_has_attnbias_has_dropout_maxk_256.cpp
@@ -0,0 +1,16 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_grouped_forward.h"
+
+template void run_grouped_forward_causalmask_bias_dropout_dispatch<
+    ck::bhalf_t,
+    false,
+    true,
+    true,
+    256>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_no_causalmask_has_attnbias_has_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_no_causalmask_has_attnbias_has_dropout_maxk_32.cpp
new file mode 100644
index 000000000..498e15bcd
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_no_causalmask_has_attnbias_has_dropout_maxk_32.cpp
@@ -0,0 +1,16 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_grouped_forward.h"
+
+template void run_grouped_forward_causalmask_bias_dropout_dispatch<
+    ck::bhalf_t,
+    false,
+    true,
+    true,
+    32>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_no_causalmask_has_attnbias_has_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_no_causalmask_has_attnbias_has_dropout_maxk_64.cpp
new file mode 100644
index 000000000..e08bd87d2
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_no_causalmask_has_attnbias_has_dropout_maxk_64.cpp
@@ -0,0 +1,16 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_grouped_forward.h"
+
+template void run_grouped_forward_causalmask_bias_dropout_dispatch<
+    ck::bhalf_t,
+    false,
+    true,
+    true,
+    64>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_no_causalmask_has_attnbias_no_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_no_causalmask_has_attnbias_no_dropout_maxk_128.cpp
new file mode 100644
index 000000000..ccf7b1e1f
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_no_causalmask_has_attnbias_no_dropout_maxk_128.cpp
@@ -0,0 +1,16 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_grouped_forward.h"
+
+template void run_grouped_forward_causalmask_bias_dropout_dispatch<
+    ck::bhalf_t,
+    false,
+    true,
+    false,
+    128>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_no_causalmask_has_attnbias_no_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_no_causalmask_has_attnbias_no_dropout_maxk_256.cpp
new file mode 100644
index 000000000..1c0dee6a3
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_no_causalmask_has_attnbias_no_dropout_maxk_256.cpp
@@ -0,0 +1,16 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_grouped_forward.h"
+
+template void run_grouped_forward_causalmask_bias_dropout_dispatch<
+    ck::bhalf_t,
+    false,
+    true,
+    false,
+    256>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_no_causalmask_has_attnbias_no_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_no_causalmask_has_attnbias_no_dropout_maxk_32.cpp
new file mode 100644
index 000000000..d7fdf6789
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_no_causalmask_has_attnbias_no_dropout_maxk_32.cpp
@@ -0,0 +1,16 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_grouped_forward.h"
+
+template void run_grouped_forward_causalmask_bias_dropout_dispatch<
+    ck::bhalf_t,
+    false,
+    true,
+    false,
+    32>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_no_causalmask_has_attnbias_no_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_no_causalmask_has_attnbias_no_dropout_maxk_64.cpp
new file mode 100644
index 000000000..b91e4a3ea
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_no_causalmask_has_attnbias_no_dropout_maxk_64.cpp
@@ -0,0 +1,16 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_grouped_forward.h"
+
+template void run_grouped_forward_causalmask_bias_dropout_dispatch<
+    ck::bhalf_t,
+    false,
+    true,
+    false,
+    64>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_no_causalmask_no_attnbias_has_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_no_causalmask_no_attnbias_has_dropout_maxk_128.cpp
new file mode 100644
index 000000000..4a208cf12
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_no_causalmask_no_attnbias_has_dropout_maxk_128.cpp
@@ -0,0 +1,16 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_grouped_forward.h"
+
+template void run_grouped_forward_causalmask_bias_dropout_dispatch<
+    ck::bhalf_t,
+    false,
+    false,
+    true,
+    128>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_no_causalmask_no_attnbias_has_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_no_causalmask_no_attnbias_has_dropout_maxk_256.cpp
new file mode 100644
index 000000000..07b92f6fb
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_no_causalmask_no_attnbias_has_dropout_maxk_256.cpp
@@ -0,0 +1,16 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_grouped_forward.h"
+
+template void run_grouped_forward_causalmask_bias_dropout_dispatch<
+    ck::bhalf_t,
+    false,
+    false,
+    true,
+    256>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_no_causalmask_no_attnbias_has_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_no_causalmask_no_attnbias_has_dropout_maxk_32.cpp
new file mode 100644
index 000000000..d561c4e08
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_no_causalmask_no_attnbias_has_dropout_maxk_32.cpp
@@ -0,0 +1,16 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_grouped_forward.h"
+
+template void run_grouped_forward_causalmask_bias_dropout_dispatch<
+    ck::bhalf_t,
+    false,
+    false,
+    true,
+    32>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_no_causalmask_no_attnbias_has_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_no_causalmask_no_attnbias_has_dropout_maxk_64.cpp
new file mode 100644
index 000000000..21a57dfca
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_no_causalmask_no_attnbias_has_dropout_maxk_64.cpp
@@ -0,0 +1,16 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_grouped_forward.h"
+
+template void run_grouped_forward_causalmask_bias_dropout_dispatch<
+    ck::bhalf_t,
+    false,
+    false,
+    true,
+    64>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_no_causalmask_no_attnbias_no_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_no_causalmask_no_attnbias_no_dropout_maxk_128.cpp
new file mode 100644
index 000000000..7088d0d9d
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_no_causalmask_no_attnbias_no_dropout_maxk_128.cpp
@@ -0,0 +1,16 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_grouped_forward.h"
+
+template void run_grouped_forward_causalmask_bias_dropout_dispatch<
+    ck::bhalf_t,
+    false,
+    false,
+    false,
+    128>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_no_causalmask_no_attnbias_no_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_no_causalmask_no_attnbias_no_dropout_maxk_256.cpp
new file mode 100644
index 000000000..f4cc5ac8f
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_no_causalmask_no_attnbias_no_dropout_maxk_256.cpp
@@ -0,0 +1,16 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_grouped_forward.h"
+
+template void run_grouped_forward_causalmask_bias_dropout_dispatch<
+    ck::bhalf_t,
+    false,
+    false,
+    false,
+    256>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_no_causalmask_no_attnbias_no_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_no_causalmask_no_attnbias_no_dropout_maxk_32.cpp
new file mode 100644
index 000000000..2f8b750df
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_no_causalmask_no_attnbias_no_dropout_maxk_32.cpp
@@ -0,0 +1,16 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_grouped_forward.h"
+
+template void run_grouped_forward_causalmask_bias_dropout_dispatch<
+    ck::bhalf_t,
+    false,
+    false,
+    false,
+    32>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_no_causalmask_no_attnbias_no_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_no_causalmask_no_attnbias_no_dropout_maxk_64.cpp
new file mode 100644
index 000000000..ac9d81f95
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_no_causalmask_no_attnbias_no_dropout_maxk_64.cpp
@@ -0,0 +1,16 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_grouped_forward.h"
+
+template void run_grouped_forward_causalmask_bias_dropout_dispatch<
+    ck::bhalf_t,
+    false,
+    false,
+    false,
+    64>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_has_causalmask_has_attnbias_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_has_causalmask_has_attnbias_has_dropout_maxk_128.cpp
similarity index 82%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_has_causalmask_has_attnbias_maxk_128.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_has_causalmask_has_attnbias_has_dropout_maxk_128.cpp
index 6f1a866bd..c9b178a76 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_has_causalmask_has_attnbias_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_has_causalmask_has_attnbias_has_dropout_maxk_128.cpp
@@ -8,8 +8,9 @@
 
 #include "ck_tiled_fmha_grouped_forward.h"
 
-template void run_grouped_forward_causalmask_bias_dispatch<
+template void run_grouped_forward_causalmask_bias_dropout_dispatch<
     ck::half_t,
     true,
     true,
+    true,
     128>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_has_causalmask_has_attnbias_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_has_causalmask_has_attnbias_has_dropout_maxk_256.cpp
similarity index 82%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_has_causalmask_has_attnbias_maxk_256.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_has_causalmask_has_attnbias_has_dropout_maxk_256.cpp
index 60a5f9444..82533dfa9 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_has_causalmask_has_attnbias_maxk_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_has_causalmask_has_attnbias_has_dropout_maxk_256.cpp
@@ -8,8 +8,9 @@
 
 #include "ck_tiled_fmha_grouped_forward.h"
 
-template void run_grouped_forward_causalmask_bias_dispatch<
+template void run_grouped_forward_causalmask_bias_dropout_dispatch<
     ck::half_t,
     true,
     true,
+    true,
     256>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_has_causalmask_has_attnbias_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_has_causalmask_has_attnbias_has_dropout_maxk_32.cpp
similarity index 82%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_has_causalmask_has_attnbias_maxk_32.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_has_causalmask_has_attnbias_has_dropout_maxk_32.cpp
index c549aec61..090d3465d 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_has_causalmask_has_attnbias_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_has_causalmask_has_attnbias_has_dropout_maxk_32.cpp
@@ -8,8 +8,9 @@
 
 #include "ck_tiled_fmha_grouped_forward.h"
 
-template void run_grouped_forward_causalmask_bias_dispatch<
+template void run_grouped_forward_causalmask_bias_dropout_dispatch<
     ck::half_t,
     true,
     true,
+    true,
     32>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_has_causalmask_has_attnbias_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_has_causalmask_has_attnbias_has_dropout_maxk_64.cpp
similarity index 82%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_has_causalmask_has_attnbias_maxk_64.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_has_causalmask_has_attnbias_has_dropout_maxk_64.cpp
index 8198f3beb..99bf4bee6 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_has_causalmask_has_attnbias_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_has_causalmask_has_attnbias_has_dropout_maxk_64.cpp
@@ -8,8 +8,9 @@
 
 #include "ck_tiled_fmha_grouped_forward.h"
 
-template void run_grouped_forward_causalmask_bias_dispatch<
+template void run_grouped_forward_causalmask_bias_dropout_dispatch<
     ck::half_t,
     true,
     true,
+    true,
     64>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_has_causalmask_no_attnbias_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_has_causalmask_has_attnbias_no_dropout_maxk_128.cpp
similarity index 82%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_has_causalmask_no_attnbias_maxk_128.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_has_causalmask_has_attnbias_no_dropout_maxk_128.cpp
index d5fa2c40c..2290c9410 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_has_causalmask_no_attnbias_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_has_causalmask_has_attnbias_no_dropout_maxk_128.cpp
@@ -8,8 +8,9 @@
 
 #include "ck_tiled_fmha_grouped_forward.h"
 
-template void run_grouped_forward_causalmask_bias_dispatch<
+template void run_grouped_forward_causalmask_bias_dropout_dispatch<
     ck::half_t,
     true,
+    true,
     false,
     128>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_no_causalmask_has_attnbias_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_has_causalmask_has_attnbias_no_dropout_maxk_256.cpp
similarity index 82%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_no_causalmask_has_attnbias_maxk_256.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_has_causalmask_has_attnbias_no_dropout_maxk_256.cpp
index 59d0142be..a685ec502 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_no_causalmask_has_attnbias_maxk_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_has_causalmask_has_attnbias_no_dropout_maxk_256.cpp
@@ -8,8 +8,9 @@
 
 #include "ck_tiled_fmha_grouped_forward.h"
 
-template void run_grouped_forward_causalmask_bias_dispatch<
+template void run_grouped_forward_causalmask_bias_dropout_dispatch<
     ck::half_t,
-    false,
     true,
+    true,
+    false,
     256>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_no_causalmask_has_attnbias_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_has_causalmask_has_attnbias_no_dropout_maxk_32.cpp
similarity index 82%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_no_causalmask_has_attnbias_maxk_32.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_has_causalmask_has_attnbias_no_dropout_maxk_32.cpp
index 503b4c245..22e90a4cc 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_no_causalmask_has_attnbias_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_has_causalmask_has_attnbias_no_dropout_maxk_32.cpp
@@ -8,8 +8,9 @@
 
 #include "ck_tiled_fmha_grouped_forward.h"
 
-template void run_grouped_forward_causalmask_bias_dispatch<
+template void run_grouped_forward_causalmask_bias_dropout_dispatch<
     ck::half_t,
-    false,
     true,
+    true,
+    false,
     32>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_no_causalmask_has_attnbias_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_has_causalmask_has_attnbias_no_dropout_maxk_64.cpp
similarity index 82%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_no_causalmask_has_attnbias_maxk_64.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_has_causalmask_has_attnbias_no_dropout_maxk_64.cpp
index f63f6b44d..b44e85089 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_no_causalmask_has_attnbias_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_has_causalmask_has_attnbias_no_dropout_maxk_64.cpp
@@ -8,8 +8,9 @@
 
 #include "ck_tiled_fmha_grouped_forward.h"
 
-template void run_grouped_forward_causalmask_bias_dispatch<
+template void run_grouped_forward_causalmask_bias_dropout_dispatch<
     ck::half_t,
-    false,
     true,
+    true,
+    false,
     64>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_no_causalmask_has_attnbias_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_has_causalmask_no_attnbias_has_dropout_maxk_128.cpp
similarity index 82%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_no_causalmask_has_attnbias_maxk_128.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_has_causalmask_no_attnbias_has_dropout_maxk_128.cpp
index 3bbfba9ba..c9742c970 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_no_causalmask_has_attnbias_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_has_causalmask_no_attnbias_has_dropout_maxk_128.cpp
@@ -8,8 +8,9 @@
 
 #include "ck_tiled_fmha_grouped_forward.h"
 
-template void run_grouped_forward_causalmask_bias_dispatch<
+template void run_grouped_forward_causalmask_bias_dropout_dispatch<
     ck::half_t,
+    true,
     false,
     true,
     128>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_has_causalmask_no_attnbias_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_has_causalmask_no_attnbias_has_dropout_maxk_256.cpp
similarity index 82%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_has_causalmask_no_attnbias_maxk_256.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_has_causalmask_no_attnbias_has_dropout_maxk_256.cpp
index ecb005898..dab84d1f5 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_has_causalmask_no_attnbias_maxk_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_has_causalmask_no_attnbias_has_dropout_maxk_256.cpp
@@ -8,8 +8,9 @@
 
 #include "ck_tiled_fmha_grouped_forward.h"
 
-template void run_grouped_forward_causalmask_bias_dispatch<
+template void run_grouped_forward_causalmask_bias_dropout_dispatch<
     ck::half_t,
     true,
     false,
+    true,
     256>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_has_causalmask_no_attnbias_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_has_causalmask_no_attnbias_has_dropout_maxk_32.cpp
similarity index 82%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_has_causalmask_no_attnbias_maxk_32.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_has_causalmask_no_attnbias_has_dropout_maxk_32.cpp
index 53ce1e962..109bf6cdc 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_has_causalmask_no_attnbias_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_has_causalmask_no_attnbias_has_dropout_maxk_32.cpp
@@ -8,8 +8,9 @@
 
 #include "ck_tiled_fmha_grouped_forward.h"
 
-template void run_grouped_forward_causalmask_bias_dispatch<
+template void run_grouped_forward_causalmask_bias_dropout_dispatch<
     ck::half_t,
     true,
     false,
+    true,
     32>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_has_causalmask_no_attnbias_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_has_causalmask_no_attnbias_has_dropout_maxk_64.cpp
similarity index 82%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_has_causalmask_no_attnbias_maxk_64.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_has_causalmask_no_attnbias_has_dropout_maxk_64.cpp
index 80a645aa8..79a9ecc5e 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_has_causalmask_no_attnbias_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_has_causalmask_no_attnbias_has_dropout_maxk_64.cpp
@@ -8,8 +8,9 @@
 
 #include "ck_tiled_fmha_grouped_forward.h"
 
-template void run_grouped_forward_causalmask_bias_dispatch<
+template void run_grouped_forward_causalmask_bias_dropout_dispatch<
     ck::half_t,
     true,
     false,
+    true,
     64>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_no_causalmask_no_attnbias_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_has_causalmask_no_attnbias_no_dropout_maxk_128.cpp
similarity index 82%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_no_causalmask_no_attnbias_maxk_128.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_has_causalmask_no_attnbias_no_dropout_maxk_128.cpp
index dd27d65c0..c6d8e12e2 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_no_causalmask_no_attnbias_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_has_causalmask_no_attnbias_no_dropout_maxk_128.cpp
@@ -8,8 +8,9 @@
 
 #include "ck_tiled_fmha_grouped_forward.h"
 
-template void run_grouped_forward_causalmask_bias_dispatch<
+template void run_grouped_forward_causalmask_bias_dropout_dispatch<
     ck::half_t,
+    true,
     false,
     false,
     128>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_no_causalmask_no_attnbias_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_has_causalmask_no_attnbias_no_dropout_maxk_256.cpp
similarity index 82%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_no_causalmask_no_attnbias_maxk_256.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_has_causalmask_no_attnbias_no_dropout_maxk_256.cpp
index a945f4190..cdd4a6b4f 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_no_causalmask_no_attnbias_maxk_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_has_causalmask_no_attnbias_no_dropout_maxk_256.cpp
@@ -8,8 +8,9 @@
 
 #include "ck_tiled_fmha_grouped_forward.h"
 
-template void run_grouped_forward_causalmask_bias_dispatch<
+template void run_grouped_forward_causalmask_bias_dropout_dispatch<
     ck::half_t,
+    true,
     false,
     false,
     256>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_no_causalmask_no_attnbias_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_has_causalmask_no_attnbias_no_dropout_maxk_32.cpp
similarity index 82%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_no_causalmask_no_attnbias_maxk_32.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_has_causalmask_no_attnbias_no_dropout_maxk_32.cpp
index 03c98bd6f..7e1478866 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_no_causalmask_no_attnbias_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_has_causalmask_no_attnbias_no_dropout_maxk_32.cpp
@@ -8,8 +8,9 @@
 
 #include "ck_tiled_fmha_grouped_forward.h"
 
-template void run_grouped_forward_causalmask_bias_dispatch<
+template void run_grouped_forward_causalmask_bias_dropout_dispatch<
     ck::half_t,
+    true,
     false,
     false,
     32>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_no_causalmask_no_attnbias_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_has_causalmask_no_attnbias_no_dropout_maxk_64.cpp
similarity index 82%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_no_causalmask_no_attnbias_maxk_64.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_has_causalmask_no_attnbias_no_dropout_maxk_64.cpp
index 451004e3b..a98daba6c 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_no_causalmask_no_attnbias_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_has_causalmask_no_attnbias_no_dropout_maxk_64.cpp
@@ -8,8 +8,9 @@
 
 #include "ck_tiled_fmha_grouped_forward.h"
 
-template void run_grouped_forward_causalmask_bias_dispatch<
+template void run_grouped_forward_causalmask_bias_dropout_dispatch<
     ck::half_t,
+    true,
     false,
     false,
     64>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_no_causalmask_has_attnbias_has_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_no_causalmask_has_attnbias_has_dropout_maxk_128.cpp
new file mode 100644
index 000000000..5fe2e08fc
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_no_causalmask_has_attnbias_has_dropout_maxk_128.cpp
@@ -0,0 +1,16 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_grouped_forward.h"
+
+template void run_grouped_forward_causalmask_bias_dropout_dispatch<
+    ck::half_t,
+    false,
+    true,
+    true,
+    128>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_no_causalmask_has_attnbias_has_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_no_causalmask_has_attnbias_has_dropout_maxk_256.cpp
new file mode 100644
index 000000000..f645e1473
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_no_causalmask_has_attnbias_has_dropout_maxk_256.cpp
@@ -0,0 +1,16 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_grouped_forward.h"
+
+template void run_grouped_forward_causalmask_bias_dropout_dispatch<
+    ck::half_t,
+    false,
+    true,
+    true,
+    256>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_no_causalmask_has_attnbias_has_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_no_causalmask_has_attnbias_has_dropout_maxk_32.cpp
new file mode 100644
index 000000000..686f65bca
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_no_causalmask_has_attnbias_has_dropout_maxk_32.cpp
@@ -0,0 +1,16 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_grouped_forward.h"
+
+template void run_grouped_forward_causalmask_bias_dropout_dispatch<
+    ck::half_t,
+    false,
+    true,
+    true,
+    32>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_no_causalmask_has_attnbias_has_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_no_causalmask_has_attnbias_has_dropout_maxk_64.cpp
new file mode 100644
index 000000000..f7aa2630b
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_no_causalmask_has_attnbias_has_dropout_maxk_64.cpp
@@ -0,0 +1,16 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_grouped_forward.h"
+
+template void run_grouped_forward_causalmask_bias_dropout_dispatch<
+    ck::half_t,
+    false,
+    true,
+    true,
+    64>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_no_causalmask_has_attnbias_no_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_no_causalmask_has_attnbias_no_dropout_maxk_128.cpp
new file mode 100644
index 000000000..6b851c95d
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_no_causalmask_has_attnbias_no_dropout_maxk_128.cpp
@@ -0,0 +1,16 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_grouped_forward.h"
+
+template void run_grouped_forward_causalmask_bias_dropout_dispatch<
+    ck::half_t,
+    false,
+    true,
+    false,
+    128>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_no_causalmask_has_attnbias_no_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_no_causalmask_has_attnbias_no_dropout_maxk_256.cpp
new file mode 100644
index 000000000..83b4ca32e
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_no_causalmask_has_attnbias_no_dropout_maxk_256.cpp
@@ -0,0 +1,16 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_grouped_forward.h"
+
+template void run_grouped_forward_causalmask_bias_dropout_dispatch<
+    ck::half_t,
+    false,
+    true,
+    false,
+    256>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_no_causalmask_has_attnbias_no_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_no_causalmask_has_attnbias_no_dropout_maxk_32.cpp
new file mode 100644
index 000000000..35472c1e8
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_no_causalmask_has_attnbias_no_dropout_maxk_32.cpp
@@ -0,0 +1,16 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_grouped_forward.h"
+
+template void run_grouped_forward_causalmask_bias_dropout_dispatch<
+    ck::half_t,
+    false,
+    true,
+    false,
+    32>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_no_causalmask_has_attnbias_no_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_no_causalmask_has_attnbias_no_dropout_maxk_64.cpp
new file mode 100644
index 000000000..c4f645028
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_no_causalmask_has_attnbias_no_dropout_maxk_64.cpp
@@ -0,0 +1,16 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_grouped_forward.h"
+
+template void run_grouped_forward_causalmask_bias_dropout_dispatch<
+    ck::half_t,
+    false,
+    true,
+    false,
+    64>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_no_causalmask_no_attnbias_has_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_no_causalmask_no_attnbias_has_dropout_maxk_128.cpp
new file mode 100644
index 000000000..72022fb98
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_no_causalmask_no_attnbias_has_dropout_maxk_128.cpp
@@ -0,0 +1,16 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_grouped_forward.h"
+
+template void run_grouped_forward_causalmask_bias_dropout_dispatch<
+    ck::half_t,
+    false,
+    false,
+    true,
+    128>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_no_causalmask_no_attnbias_has_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_no_causalmask_no_attnbias_has_dropout_maxk_256.cpp
new file mode 100644
index 000000000..48d249424
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_no_causalmask_no_attnbias_has_dropout_maxk_256.cpp
@@ -0,0 +1,16 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_grouped_forward.h"
+
+template void run_grouped_forward_causalmask_bias_dropout_dispatch<
+    ck::half_t,
+    false,
+    false,
+    true,
+    256>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_no_causalmask_no_attnbias_has_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_no_causalmask_no_attnbias_has_dropout_maxk_32.cpp
new file mode 100644
index 000000000..0207a2691
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_no_causalmask_no_attnbias_has_dropout_maxk_32.cpp
@@ -0,0 +1,16 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_grouped_forward.h"
+
+template void run_grouped_forward_causalmask_bias_dropout_dispatch<
+    ck::half_t,
+    false,
+    false,
+    true,
+    32>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_no_causalmask_no_attnbias_has_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_no_causalmask_no_attnbias_has_dropout_maxk_64.cpp
new file mode 100644
index 000000000..8cdf11645
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_no_causalmask_no_attnbias_has_dropout_maxk_64.cpp
@@ -0,0 +1,16 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_grouped_forward.h"
+
+template void run_grouped_forward_causalmask_bias_dropout_dispatch<
+    ck::half_t,
+    false,
+    false,
+    true,
+    64>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_no_causalmask_no_attnbias_no_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_no_causalmask_no_attnbias_no_dropout_maxk_128.cpp
new file mode 100644
index 000000000..137412fd9
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_no_causalmask_no_attnbias_no_dropout_maxk_128.cpp
@@ -0,0 +1,16 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_grouped_forward.h"
+
+template void run_grouped_forward_causalmask_bias_dropout_dispatch<
+    ck::half_t,
+    false,
+    false,
+    false,
+    128>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_no_causalmask_no_attnbias_no_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_no_causalmask_no_attnbias_no_dropout_maxk_256.cpp
new file mode 100644
index 000000000..a1fccefe0
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_no_causalmask_no_attnbias_no_dropout_maxk_256.cpp
@@ -0,0 +1,16 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_grouped_forward.h"
+
+template void run_grouped_forward_causalmask_bias_dropout_dispatch<
+    ck::half_t,
+    false,
+    false,
+    false,
+    256>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_no_causalmask_no_attnbias_no_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_no_causalmask_no_attnbias_no_dropout_maxk_32.cpp
new file mode 100644
index 000000000..273593b9d
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_no_causalmask_no_attnbias_no_dropout_maxk_32.cpp
@@ -0,0 +1,16 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_grouped_forward.h"
+
+template void run_grouped_forward_causalmask_bias_dropout_dispatch<
+    ck::half_t,
+    false,
+    false,
+    false,
+    32>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_no_causalmask_no_attnbias_no_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_no_causalmask_no_attnbias_no_dropout_maxk_64.cpp
new file mode 100644
index 000000000..8b638fa32
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_no_causalmask_no_attnbias_no_dropout_maxk_64.cpp
@@ -0,0 +1,16 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_grouped_forward.h"
+
+template void run_grouped_forward_causalmask_bias_dropout_dispatch<
+    ck::half_t,
+    false,
+    false,
+    false,
+    64>(GroupedForwardParams& param, hipStream_t stream);

From 44d4592dd85366f4db95b052000decce838b7e89 Mon Sep 17 00:00:00 2001
From: Qianfeng Zhang <Qianfeng.Zhang@amd.com>
Date: Tue, 23 Apr 2024 09:32:34 +0000
Subject: [PATCH 533/641] Re-organize cpp instances for calling fmha backward
 kernel

---
 .../hip_fmha/ck_tiled_fmha_batched_backward.h |  12 ++-
 .../ck_tiled_fmha_batched_backward_bp16.cpp   |  92 ++++++++++++----
 .../ck_tiled_fmha_batched_backward_fp16.cpp   |  92 ++++++++++++----
 .../hip_fmha/ck_tiled_fmha_grouped_backward.h |  11 +-
 .../ck_tiled_fmha_grouped_backward_bp16.cpp   | 100 +++++++++++++-----
 .../ck_tiled_fmha_grouped_backward_fp16.cpp   |  92 ++++++++++++----
 ...ias_has_biasgrad_has_dropout_maxk_128.cpp} |   3 +-
 ...bias_has_biasgrad_has_dropout_maxk_32.cpp} |   3 +-
 ...bias_has_biasgrad_has_dropout_maxk_64.cpp} |   3 +-
 ...bias_has_biasgrad_no_dropout_maxk_128.cpp} |   5 +-
 ...nbias_has_biasgrad_no_dropout_maxk_32.cpp} |   3 +-
 ...nbias_has_biasgrad_no_dropout_maxk_64.cpp} |   3 +-
 ...bias_no_biasgrad_has_dropout_maxk_128.cpp} |   3 +-
 ...nbias_no_biasgrad_has_dropout_maxk_32.cpp} |   5 +-
 ...nbias_no_biasgrad_has_dropout_maxk_64.cpp} |   5 +-
 ...nbias_no_biasgrad_no_dropout_maxk_128.cpp} |   5 +-
 ...tnbias_no_biasgrad_no_dropout_maxk_32.cpp} |   5 +-
 ...tnbias_no_biasgrad_no_dropout_maxk_64.cpp} |   3 +-
 ...mask_no_attnbias_has_dropout_maxk_128.cpp} |   3 +-
 ...lmask_no_attnbias_has_dropout_maxk_32.cpp} |   3 +-
 ...lmask_no_attnbias_has_dropout_maxk_64.cpp} |   5 +-
 ...lmask_no_attnbias_no_dropout_maxk_128.cpp} |   3 +-
 ...almask_no_attnbias_no_dropout_maxk_32.cpp} |   3 +-
 ...almask_no_attnbias_no_dropout_maxk_64.cpp} |   3 +-
 ...bias_has_biasgrad_has_dropout_maxk_128.cpp |  17 +++
 ...nbias_has_biasgrad_has_dropout_maxk_32.cpp |  17 +++
 ...nbias_has_biasgrad_has_dropout_maxk_64.cpp |  17 +++
 ...nbias_has_biasgrad_no_dropout_maxk_128.cpp |  17 +++
 ...tnbias_has_biasgrad_no_dropout_maxk_32.cpp |  17 +++
 ...tnbias_has_biasgrad_no_dropout_maxk_64.cpp |  17 +++
 ...nbias_no_biasgrad_has_dropout_maxk_128.cpp |  17 +++
 ...tnbias_no_biasgrad_has_dropout_maxk_32.cpp |  17 +++
 ...tnbias_no_biasgrad_has_dropout_maxk_64.cpp |  17 +++
 ...tnbias_no_biasgrad_no_dropout_maxk_128.cpp |  17 +++
 ...ttnbias_no_biasgrad_no_dropout_maxk_32.cpp |  17 +++
 ...ttnbias_no_biasgrad_no_dropout_maxk_64.cpp |  17 +++
 ...lmask_no_attnbias_has_dropout_maxk_128.cpp |  17 +++
 ...almask_no_attnbias_has_dropout_maxk_32.cpp |  17 +++
 ...almask_no_attnbias_has_dropout_maxk_64.cpp |  17 +++
 ...almask_no_attnbias_no_dropout_maxk_128.cpp |  17 +++
 ...salmask_no_attnbias_no_dropout_maxk_32.cpp |  17 +++
 ...salmask_no_attnbias_no_dropout_maxk_64.cpp |  17 +++
 ...ias_has_biasgrad_has_dropout_maxk_128.cpp} |   3 +-
 ...bias_has_biasgrad_has_dropout_maxk_32.cpp} |   3 +-
 ...bias_has_biasgrad_has_dropout_maxk_64.cpp} |   3 +-
 ...bias_has_biasgrad_no_dropout_maxk_128.cpp} |   3 +-
 ...nbias_has_biasgrad_no_dropout_maxk_32.cpp} |   3 +-
 ...nbias_has_biasgrad_no_dropout_maxk_64.cpp} |   3 +-
 ...bias_no_biasgrad_has_dropout_maxk_128.cpp} |   5 +-
 ...nbias_no_biasgrad_has_dropout_maxk_32.cpp} |   5 +-
 ...nbias_no_biasgrad_has_dropout_maxk_64.cpp} |   5 +-
 ...nbias_no_biasgrad_no_dropout_maxk_128.cpp} |   3 +-
 ...tnbias_no_biasgrad_no_dropout_maxk_32.cpp} |   5 +-
 ...tnbias_no_biasgrad_no_dropout_maxk_64.cpp} |   3 +-
 ...mask_no_attnbias_has_dropout_maxk_128.cpp} |   5 +-
 ...lmask_no_attnbias_has_dropout_maxk_32.cpp} |   3 +-
 ...lmask_no_attnbias_has_dropout_maxk_64.cpp} |   5 +-
 ...lmask_no_attnbias_no_dropout_maxk_128.cpp} |   3 +-
 ...almask_no_attnbias_no_dropout_maxk_32.cpp} |   3 +-
 ...almask_no_attnbias_no_dropout_maxk_64.cpp} |   3 +-
 ...bias_has_biasgrad_has_dropout_maxk_128.cpp |  17 +++
 ...nbias_has_biasgrad_has_dropout_maxk_32.cpp |  17 +++
 ...nbias_has_biasgrad_has_dropout_maxk_64.cpp |  17 +++
 ...nbias_has_biasgrad_no_dropout_maxk_128.cpp |  17 +++
 ...tnbias_has_biasgrad_no_dropout_maxk_32.cpp |  17 +++
 ...tnbias_has_biasgrad_no_dropout_maxk_64.cpp |  17 +++
 ...nbias_no_biasgrad_has_dropout_maxk_128.cpp |  17 +++
 ...tnbias_no_biasgrad_has_dropout_maxk_32.cpp |  17 +++
 ...tnbias_no_biasgrad_has_dropout_maxk_64.cpp |  17 +++
 ...tnbias_no_biasgrad_no_dropout_maxk_128.cpp |  17 +++
 ...ttnbias_no_biasgrad_no_dropout_maxk_32.cpp |  17 +++
 ...ttnbias_no_biasgrad_no_dropout_maxk_64.cpp |  17 +++
 ...lmask_no_attnbias_has_dropout_maxk_128.cpp |  17 +++
 ...almask_no_attnbias_has_dropout_maxk_32.cpp |  17 +++
 ...almask_no_attnbias_has_dropout_maxk_64.cpp |  17 +++
 ...almask_no_attnbias_no_dropout_maxk_128.cpp |  17 +++
 ...salmask_no_attnbias_no_dropout_maxk_32.cpp |  17 +++
 ...salmask_no_attnbias_no_dropout_maxk_64.cpp |  17 +++
 ...ias_has_biasgrad_has_dropout_maxk_128.cpp} |   3 +-
 ...bias_has_biasgrad_has_dropout_maxk_32.cpp} |   3 +-
 ...bias_has_biasgrad_has_dropout_maxk_64.cpp} |   3 +-
 ...bias_has_biasgrad_no_dropout_maxk_128.cpp} |   5 +-
 ...nbias_has_biasgrad_no_dropout_maxk_32.cpp} |   3 +-
 ...nbias_has_biasgrad_no_dropout_maxk_64.cpp} |   3 +-
 ...bias_no_biasgrad_has_dropout_maxk_128.cpp} |   3 +-
 ...nbias_no_biasgrad_has_dropout_maxk_32.cpp} |   5 +-
 ...nbias_no_biasgrad_has_dropout_maxk_64.cpp} |   5 +-
 ...nbias_no_biasgrad_no_dropout_maxk_128.cpp} |   5 +-
 ...tnbias_no_biasgrad_no_dropout_maxk_32.cpp} |   5 +-
 ...tnbias_no_biasgrad_no_dropout_maxk_64.cpp} |   3 +-
 ...mask_no_attnbias_has_dropout_maxk_128.cpp} |   3 +-
 ...lmask_no_attnbias_has_dropout_maxk_32.cpp} |   3 +-
 ...lmask_no_attnbias_has_dropout_maxk_64.cpp} |   5 +-
 ...lmask_no_attnbias_no_dropout_maxk_128.cpp} |   3 +-
 ...almask_no_attnbias_no_dropout_maxk_32.cpp} |   3 +-
 ...almask_no_attnbias_no_dropout_maxk_64.cpp} |   3 +-
 ...bias_has_biasgrad_has_dropout_maxk_128.cpp |  17 +++
 ...nbias_has_biasgrad_has_dropout_maxk_32.cpp |  17 +++
 ...nbias_has_biasgrad_has_dropout_maxk_64.cpp |  17 +++
 ...nbias_has_biasgrad_no_dropout_maxk_128.cpp |  17 +++
 ...tnbias_has_biasgrad_no_dropout_maxk_32.cpp |  17 +++
 ...tnbias_has_biasgrad_no_dropout_maxk_64.cpp |  17 +++
 ...nbias_no_biasgrad_has_dropout_maxk_128.cpp |  17 +++
 ...tnbias_no_biasgrad_has_dropout_maxk_32.cpp |  17 +++
 ...tnbias_no_biasgrad_has_dropout_maxk_64.cpp |  17 +++
 ...tnbias_no_biasgrad_no_dropout_maxk_128.cpp |  17 +++
 ...ttnbias_no_biasgrad_no_dropout_maxk_32.cpp |  17 +++
 ...ttnbias_no_biasgrad_no_dropout_maxk_64.cpp |  17 +++
 ...lmask_no_attnbias_has_dropout_maxk_128.cpp |  17 +++
 ...almask_no_attnbias_has_dropout_maxk_32.cpp |  17 +++
 ...almask_no_attnbias_has_dropout_maxk_64.cpp |  17 +++
 ...almask_no_attnbias_no_dropout_maxk_128.cpp |  17 +++
 ...salmask_no_attnbias_no_dropout_maxk_32.cpp |  17 +++
 ...salmask_no_attnbias_no_dropout_maxk_64.cpp |  17 +++
 ...ias_has_biasgrad_has_dropout_maxk_128.cpp} |   3 +-
 ...bias_has_biasgrad_has_dropout_maxk_32.cpp} |   3 +-
 ...bias_has_biasgrad_has_dropout_maxk_64.cpp} |   3 +-
 ...bias_has_biasgrad_no_dropout_maxk_128.cpp} |   3 +-
 ...nbias_has_biasgrad_no_dropout_maxk_32.cpp} |   3 +-
 ...nbias_has_biasgrad_no_dropout_maxk_64.cpp} |   3 +-
 ...bias_no_biasgrad_has_dropout_maxk_128.cpp} |   5 +-
 ...nbias_no_biasgrad_has_dropout_maxk_32.cpp} |   5 +-
 ...nbias_no_biasgrad_has_dropout_maxk_64.cpp} |   5 +-
 ...nbias_no_biasgrad_no_dropout_maxk_128.cpp} |   3 +-
 ...tnbias_no_biasgrad_no_dropout_maxk_32.cpp} |   5 +-
 ...tnbias_no_biasgrad_no_dropout_maxk_64.cpp} |   3 +-
 ...mask_no_attnbias_has_dropout_maxk_128.cpp} |   5 +-
 ...lmask_no_attnbias_has_dropout_maxk_32.cpp} |   3 +-
 ...lmask_no_attnbias_has_dropout_maxk_64.cpp} |   5 +-
 ...lmask_no_attnbias_no_dropout_maxk_128.cpp} |   3 +-
 ...almask_no_attnbias_no_dropout_maxk_32.cpp} |   3 +-
 ...almask_no_attnbias_no_dropout_maxk_64.cpp} |   3 +-
 ...bias_has_biasgrad_has_dropout_maxk_128.cpp |  17 +++
 ...nbias_has_biasgrad_has_dropout_maxk_32.cpp |  17 +++
 ...nbias_has_biasgrad_has_dropout_maxk_64.cpp |  17 +++
 ...nbias_has_biasgrad_no_dropout_maxk_128.cpp |  17 +++
 ...tnbias_has_biasgrad_no_dropout_maxk_32.cpp |  17 +++
 ...tnbias_has_biasgrad_no_dropout_maxk_64.cpp |  17 +++
 ...nbias_no_biasgrad_has_dropout_maxk_128.cpp |  17 +++
 ...tnbias_no_biasgrad_has_dropout_maxk_32.cpp |  17 +++
 ...tnbias_no_biasgrad_has_dropout_maxk_64.cpp |  17 +++
 ...tnbias_no_biasgrad_no_dropout_maxk_128.cpp |  17 +++
 ...ttnbias_no_biasgrad_no_dropout_maxk_32.cpp |  17 +++
 ...ttnbias_no_biasgrad_no_dropout_maxk_64.cpp |  17 +++
 ...lmask_no_attnbias_has_dropout_maxk_128.cpp |  17 +++
 ...almask_no_attnbias_has_dropout_maxk_32.cpp |  17 +++
 ...almask_no_attnbias_has_dropout_maxk_64.cpp |  17 +++
 ...almask_no_attnbias_no_dropout_maxk_128.cpp |  17 +++
 ...salmask_no_attnbias_no_dropout_maxk_32.cpp |  17 +++
 ...salmask_no_attnbias_no_dropout_maxk_64.cpp |  17 +++
 150 files changed, 1688 insertions(+), 199 deletions(-)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_batched_backward_bp16_has_causalmask_has_attnbias_has_biasgrad_maxk_128.cpp => fmha_batched_backward_bp16_has_causalmask_has_attnbias_has_biasgrad_has_dropout_maxk_128.cpp} (83%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_batched_backward_bp16_has_causalmask_has_attnbias_has_biasgrad_maxk_32.cpp => fmha_batched_backward_bp16_has_causalmask_has_attnbias_has_biasgrad_has_dropout_maxk_32.cpp} (83%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_batched_backward_bp16_has_causalmask_has_attnbias_has_biasgrad_maxk_64.cpp => fmha_batched_backward_bp16_has_causalmask_has_attnbias_has_biasgrad_has_dropout_maxk_64.cpp} (83%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_batched_backward_bp16_no_causalmask_has_attnbias_has_biasgrad_maxk_128.cpp => fmha_batched_backward_bp16_has_causalmask_has_attnbias_has_biasgrad_no_dropout_maxk_128.cpp} (83%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_batched_backward_bp16_has_causalmask_has_attnbias_no_biasgrad_maxk_32.cpp => fmha_batched_backward_bp16_has_causalmask_has_attnbias_has_biasgrad_no_dropout_maxk_32.cpp} (83%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_batched_backward_bp16_has_causalmask_has_attnbias_no_biasgrad_maxk_64.cpp => fmha_batched_backward_bp16_has_causalmask_has_attnbias_has_biasgrad_no_dropout_maxk_64.cpp} (83%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_batched_backward_bp16_has_causalmask_has_attnbias_no_biasgrad_maxk_128.cpp => fmha_batched_backward_bp16_has_causalmask_has_attnbias_no_biasgrad_has_dropout_maxk_128.cpp} (83%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_batched_backward_bp16_no_causalmask_has_attnbias_has_biasgrad_maxk_32.cpp => fmha_batched_backward_bp16_has_causalmask_has_attnbias_no_biasgrad_has_dropout_maxk_32.cpp} (83%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_batched_backward_bp16_no_causalmask_has_attnbias_has_biasgrad_maxk_64.cpp => fmha_batched_backward_bp16_has_causalmask_has_attnbias_no_biasgrad_has_dropout_maxk_64.cpp} (83%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_batched_backward_bp16_no_causalmask_has_attnbias_no_biasgrad_maxk_128.cpp => fmha_batched_backward_bp16_has_causalmask_has_attnbias_no_biasgrad_no_dropout_maxk_128.cpp} (83%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_batched_backward_bp16_no_causalmask_has_attnbias_no_biasgrad_maxk_32.cpp => fmha_batched_backward_bp16_has_causalmask_has_attnbias_no_biasgrad_no_dropout_maxk_32.cpp} (83%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_batched_backward_bp16_has_causalmask_no_attnbias_maxk_64.cpp => fmha_batched_backward_bp16_has_causalmask_has_attnbias_no_biasgrad_no_dropout_maxk_64.cpp} (83%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_batched_backward_bp16_has_causalmask_no_attnbias_maxk_128.cpp => fmha_batched_backward_bp16_has_causalmask_no_attnbias_has_dropout_maxk_128.cpp} (83%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_batched_backward_bp16_has_causalmask_no_attnbias_maxk_32.cpp => fmha_batched_backward_bp16_has_causalmask_no_attnbias_has_dropout_maxk_32.cpp} (83%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_batched_backward_bp16_no_causalmask_has_attnbias_no_biasgrad_maxk_64.cpp => fmha_batched_backward_bp16_has_causalmask_no_attnbias_has_dropout_maxk_64.cpp} (83%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_batched_backward_bp16_no_causalmask_no_attnbias_maxk_128.cpp => fmha_batched_backward_bp16_has_causalmask_no_attnbias_no_dropout_maxk_128.cpp} (83%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_batched_backward_bp16_no_causalmask_no_attnbias_maxk_32.cpp => fmha_batched_backward_bp16_has_causalmask_no_attnbias_no_dropout_maxk_32.cpp} (83%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_batched_backward_bp16_no_causalmask_no_attnbias_maxk_64.cpp => fmha_batched_backward_bp16_has_causalmask_no_attnbias_no_dropout_maxk_64.cpp} (83%)
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_no_causalmask_has_attnbias_has_biasgrad_has_dropout_maxk_128.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_no_causalmask_has_attnbias_has_biasgrad_has_dropout_maxk_32.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_no_causalmask_has_attnbias_has_biasgrad_has_dropout_maxk_64.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_no_causalmask_has_attnbias_has_biasgrad_no_dropout_maxk_128.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_no_causalmask_has_attnbias_has_biasgrad_no_dropout_maxk_32.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_no_causalmask_has_attnbias_has_biasgrad_no_dropout_maxk_64.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_no_causalmask_has_attnbias_no_biasgrad_has_dropout_maxk_128.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_no_causalmask_has_attnbias_no_biasgrad_has_dropout_maxk_32.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_no_causalmask_has_attnbias_no_biasgrad_has_dropout_maxk_64.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_no_causalmask_has_attnbias_no_biasgrad_no_dropout_maxk_128.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_no_causalmask_has_attnbias_no_biasgrad_no_dropout_maxk_32.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_no_causalmask_has_attnbias_no_biasgrad_no_dropout_maxk_64.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_no_causalmask_no_attnbias_has_dropout_maxk_128.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_no_causalmask_no_attnbias_has_dropout_maxk_32.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_no_causalmask_no_attnbias_has_dropout_maxk_64.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_no_causalmask_no_attnbias_no_dropout_maxk_128.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_no_causalmask_no_attnbias_no_dropout_maxk_32.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_no_causalmask_no_attnbias_no_dropout_maxk_64.cpp
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_batched_backward_fp16_has_causalmask_has_attnbias_has_biasgrad_maxk_128.cpp => fmha_batched_backward_fp16_has_causalmask_has_attnbias_has_biasgrad_has_dropout_maxk_128.cpp} (83%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_batched_backward_fp16_has_causalmask_has_attnbias_has_biasgrad_maxk_32.cpp => fmha_batched_backward_fp16_has_causalmask_has_attnbias_has_biasgrad_has_dropout_maxk_32.cpp} (83%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_batched_backward_fp16_has_causalmask_has_attnbias_has_biasgrad_maxk_64.cpp => fmha_batched_backward_fp16_has_causalmask_has_attnbias_has_biasgrad_has_dropout_maxk_64.cpp} (83%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_batched_backward_fp16_has_causalmask_has_attnbias_no_biasgrad_maxk_128.cpp => fmha_batched_backward_fp16_has_causalmask_has_attnbias_has_biasgrad_no_dropout_maxk_128.cpp} (83%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_batched_backward_fp16_has_causalmask_has_attnbias_no_biasgrad_maxk_32.cpp => fmha_batched_backward_fp16_has_causalmask_has_attnbias_has_biasgrad_no_dropout_maxk_32.cpp} (83%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_batched_backward_fp16_has_causalmask_has_attnbias_no_biasgrad_maxk_64.cpp => fmha_batched_backward_fp16_has_causalmask_has_attnbias_has_biasgrad_no_dropout_maxk_64.cpp} (83%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_batched_backward_fp16_no_causalmask_has_attnbias_has_biasgrad_maxk_128.cpp => fmha_batched_backward_fp16_has_causalmask_has_attnbias_no_biasgrad_has_dropout_maxk_128.cpp} (83%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_batched_backward_fp16_no_causalmask_has_attnbias_has_biasgrad_maxk_32.cpp => fmha_batched_backward_fp16_has_causalmask_has_attnbias_no_biasgrad_has_dropout_maxk_32.cpp} (83%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_batched_backward_fp16_no_causalmask_has_attnbias_has_biasgrad_maxk_64.cpp => fmha_batched_backward_fp16_has_causalmask_has_attnbias_no_biasgrad_has_dropout_maxk_64.cpp} (83%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_batched_backward_fp16_has_causalmask_no_attnbias_maxk_128.cpp => fmha_batched_backward_fp16_has_causalmask_has_attnbias_no_biasgrad_no_dropout_maxk_128.cpp} (83%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_batched_backward_fp16_no_causalmask_has_attnbias_no_biasgrad_maxk_32.cpp => fmha_batched_backward_fp16_has_causalmask_has_attnbias_no_biasgrad_no_dropout_maxk_32.cpp} (83%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_batched_backward_fp16_has_causalmask_no_attnbias_maxk_64.cpp => fmha_batched_backward_fp16_has_causalmask_has_attnbias_no_biasgrad_no_dropout_maxk_64.cpp} (83%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_batched_backward_fp16_no_causalmask_has_attnbias_no_biasgrad_maxk_128.cpp => fmha_batched_backward_fp16_has_causalmask_no_attnbias_has_dropout_maxk_128.cpp} (83%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_batched_backward_fp16_has_causalmask_no_attnbias_maxk_32.cpp => fmha_batched_backward_fp16_has_causalmask_no_attnbias_has_dropout_maxk_32.cpp} (83%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_batched_backward_fp16_no_causalmask_has_attnbias_no_biasgrad_maxk_64.cpp => fmha_batched_backward_fp16_has_causalmask_no_attnbias_has_dropout_maxk_64.cpp} (83%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_batched_backward_fp16_no_causalmask_no_attnbias_maxk_128.cpp => fmha_batched_backward_fp16_has_causalmask_no_attnbias_no_dropout_maxk_128.cpp} (83%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_batched_backward_fp16_no_causalmask_no_attnbias_maxk_32.cpp => fmha_batched_backward_fp16_has_causalmask_no_attnbias_no_dropout_maxk_32.cpp} (83%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_batched_backward_fp16_no_causalmask_no_attnbias_maxk_64.cpp => fmha_batched_backward_fp16_has_causalmask_no_attnbias_no_dropout_maxk_64.cpp} (83%)
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_has_attnbias_has_biasgrad_has_dropout_maxk_128.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_has_attnbias_has_biasgrad_has_dropout_maxk_32.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_has_attnbias_has_biasgrad_has_dropout_maxk_64.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_has_attnbias_has_biasgrad_no_dropout_maxk_128.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_has_attnbias_has_biasgrad_no_dropout_maxk_32.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_has_attnbias_has_biasgrad_no_dropout_maxk_64.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_has_attnbias_no_biasgrad_has_dropout_maxk_128.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_has_attnbias_no_biasgrad_has_dropout_maxk_32.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_has_attnbias_no_biasgrad_has_dropout_maxk_64.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_has_attnbias_no_biasgrad_no_dropout_maxk_128.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_has_attnbias_no_biasgrad_no_dropout_maxk_32.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_has_attnbias_no_biasgrad_no_dropout_maxk_64.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_no_attnbias_has_dropout_maxk_128.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_no_attnbias_has_dropout_maxk_32.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_no_attnbias_has_dropout_maxk_64.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_no_attnbias_no_dropout_maxk_128.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_no_attnbias_no_dropout_maxk_32.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_no_attnbias_no_dropout_maxk_64.cpp
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_grouped_backward_bp16_has_causalmask_has_attnbias_has_biasgrad_maxk_128.cpp => fmha_grouped_backward_bp16_has_causalmask_has_attnbias_has_biasgrad_has_dropout_maxk_128.cpp} (83%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_grouped_backward_bp16_has_causalmask_has_attnbias_has_biasgrad_maxk_32.cpp => fmha_grouped_backward_bp16_has_causalmask_has_attnbias_has_biasgrad_has_dropout_maxk_32.cpp} (83%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_grouped_backward_bp16_has_causalmask_has_attnbias_has_biasgrad_maxk_64.cpp => fmha_grouped_backward_bp16_has_causalmask_has_attnbias_has_biasgrad_has_dropout_maxk_64.cpp} (83%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_grouped_backward_bp16_no_causalmask_has_attnbias_has_biasgrad_maxk_128.cpp => fmha_grouped_backward_bp16_has_causalmask_has_attnbias_has_biasgrad_no_dropout_maxk_128.cpp} (83%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_grouped_backward_bp16_has_causalmask_has_attnbias_no_biasgrad_maxk_32.cpp => fmha_grouped_backward_bp16_has_causalmask_has_attnbias_has_biasgrad_no_dropout_maxk_32.cpp} (83%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_grouped_backward_bp16_has_causalmask_has_attnbias_no_biasgrad_maxk_64.cpp => fmha_grouped_backward_bp16_has_causalmask_has_attnbias_has_biasgrad_no_dropout_maxk_64.cpp} (83%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_grouped_backward_bp16_has_causalmask_has_attnbias_no_biasgrad_maxk_128.cpp => fmha_grouped_backward_bp16_has_causalmask_has_attnbias_no_biasgrad_has_dropout_maxk_128.cpp} (83%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_grouped_backward_bp16_no_causalmask_has_attnbias_has_biasgrad_maxk_32.cpp => fmha_grouped_backward_bp16_has_causalmask_has_attnbias_no_biasgrad_has_dropout_maxk_32.cpp} (83%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_grouped_backward_bp16_no_causalmask_has_attnbias_has_biasgrad_maxk_64.cpp => fmha_grouped_backward_bp16_has_causalmask_has_attnbias_no_biasgrad_has_dropout_maxk_64.cpp} (83%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_grouped_backward_bp16_no_causalmask_has_attnbias_no_biasgrad_maxk_128.cpp => fmha_grouped_backward_bp16_has_causalmask_has_attnbias_no_biasgrad_no_dropout_maxk_128.cpp} (83%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_grouped_backward_bp16_no_causalmask_has_attnbias_no_biasgrad_maxk_32.cpp => fmha_grouped_backward_bp16_has_causalmask_has_attnbias_no_biasgrad_no_dropout_maxk_32.cpp} (83%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_grouped_backward_bp16_has_causalmask_no_attnbias_maxk_64.cpp => fmha_grouped_backward_bp16_has_causalmask_has_attnbias_no_biasgrad_no_dropout_maxk_64.cpp} (83%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_grouped_backward_bp16_has_causalmask_no_attnbias_maxk_128.cpp => fmha_grouped_backward_bp16_has_causalmask_no_attnbias_has_dropout_maxk_128.cpp} (83%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_grouped_backward_bp16_has_causalmask_no_attnbias_maxk_32.cpp => fmha_grouped_backward_bp16_has_causalmask_no_attnbias_has_dropout_maxk_32.cpp} (83%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_grouped_backward_bp16_no_causalmask_has_attnbias_no_biasgrad_maxk_64.cpp => fmha_grouped_backward_bp16_has_causalmask_no_attnbias_has_dropout_maxk_64.cpp} (83%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_grouped_backward_bp16_no_causalmask_no_attnbias_maxk_128.cpp => fmha_grouped_backward_bp16_has_causalmask_no_attnbias_no_dropout_maxk_128.cpp} (83%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_grouped_backward_bp16_no_causalmask_no_attnbias_maxk_32.cpp => fmha_grouped_backward_bp16_has_causalmask_no_attnbias_no_dropout_maxk_32.cpp} (83%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_grouped_backward_bp16_no_causalmask_no_attnbias_maxk_64.cpp => fmha_grouped_backward_bp16_has_causalmask_no_attnbias_no_dropout_maxk_64.cpp} (83%)
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_no_causalmask_has_attnbias_has_biasgrad_has_dropout_maxk_128.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_no_causalmask_has_attnbias_has_biasgrad_has_dropout_maxk_32.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_no_causalmask_has_attnbias_has_biasgrad_has_dropout_maxk_64.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_no_causalmask_has_attnbias_has_biasgrad_no_dropout_maxk_128.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_no_causalmask_has_attnbias_has_biasgrad_no_dropout_maxk_32.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_no_causalmask_has_attnbias_has_biasgrad_no_dropout_maxk_64.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_no_causalmask_has_attnbias_no_biasgrad_has_dropout_maxk_128.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_no_causalmask_has_attnbias_no_biasgrad_has_dropout_maxk_32.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_no_causalmask_has_attnbias_no_biasgrad_has_dropout_maxk_64.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_no_causalmask_has_attnbias_no_biasgrad_no_dropout_maxk_128.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_no_causalmask_has_attnbias_no_biasgrad_no_dropout_maxk_32.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_no_causalmask_has_attnbias_no_biasgrad_no_dropout_maxk_64.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_no_causalmask_no_attnbias_has_dropout_maxk_128.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_no_causalmask_no_attnbias_has_dropout_maxk_32.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_no_causalmask_no_attnbias_has_dropout_maxk_64.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_no_causalmask_no_attnbias_no_dropout_maxk_128.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_no_causalmask_no_attnbias_no_dropout_maxk_32.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_no_causalmask_no_attnbias_no_dropout_maxk_64.cpp
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_grouped_backward_fp16_has_causalmask_has_attnbias_has_biasgrad_maxk_128.cpp => fmha_grouped_backward_fp16_has_causalmask_has_attnbias_has_biasgrad_has_dropout_maxk_128.cpp} (83%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_grouped_backward_fp16_has_causalmask_has_attnbias_has_biasgrad_maxk_32.cpp => fmha_grouped_backward_fp16_has_causalmask_has_attnbias_has_biasgrad_has_dropout_maxk_32.cpp} (83%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_grouped_backward_fp16_has_causalmask_has_attnbias_has_biasgrad_maxk_64.cpp => fmha_grouped_backward_fp16_has_causalmask_has_attnbias_has_biasgrad_has_dropout_maxk_64.cpp} (83%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_grouped_backward_fp16_has_causalmask_has_attnbias_no_biasgrad_maxk_128.cpp => fmha_grouped_backward_fp16_has_causalmask_has_attnbias_has_biasgrad_no_dropout_maxk_128.cpp} (83%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_grouped_backward_fp16_has_causalmask_has_attnbias_no_biasgrad_maxk_32.cpp => fmha_grouped_backward_fp16_has_causalmask_has_attnbias_has_biasgrad_no_dropout_maxk_32.cpp} (83%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_grouped_backward_fp16_has_causalmask_has_attnbias_no_biasgrad_maxk_64.cpp => fmha_grouped_backward_fp16_has_causalmask_has_attnbias_has_biasgrad_no_dropout_maxk_64.cpp} (83%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_grouped_backward_fp16_no_causalmask_has_attnbias_has_biasgrad_maxk_128.cpp => fmha_grouped_backward_fp16_has_causalmask_has_attnbias_no_biasgrad_has_dropout_maxk_128.cpp} (83%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_grouped_backward_fp16_no_causalmask_has_attnbias_has_biasgrad_maxk_32.cpp => fmha_grouped_backward_fp16_has_causalmask_has_attnbias_no_biasgrad_has_dropout_maxk_32.cpp} (83%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_grouped_backward_fp16_no_causalmask_has_attnbias_has_biasgrad_maxk_64.cpp => fmha_grouped_backward_fp16_has_causalmask_has_attnbias_no_biasgrad_has_dropout_maxk_64.cpp} (83%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_grouped_backward_fp16_has_causalmask_no_attnbias_maxk_128.cpp => fmha_grouped_backward_fp16_has_causalmask_has_attnbias_no_biasgrad_no_dropout_maxk_128.cpp} (83%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_grouped_backward_fp16_no_causalmask_has_attnbias_no_biasgrad_maxk_32.cpp => fmha_grouped_backward_fp16_has_causalmask_has_attnbias_no_biasgrad_no_dropout_maxk_32.cpp} (83%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_grouped_backward_fp16_has_causalmask_no_attnbias_maxk_64.cpp => fmha_grouped_backward_fp16_has_causalmask_has_attnbias_no_biasgrad_no_dropout_maxk_64.cpp} (83%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_grouped_backward_fp16_no_causalmask_has_attnbias_no_biasgrad_maxk_128.cpp => fmha_grouped_backward_fp16_has_causalmask_no_attnbias_has_dropout_maxk_128.cpp} (83%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_grouped_backward_fp16_has_causalmask_no_attnbias_maxk_32.cpp => fmha_grouped_backward_fp16_has_causalmask_no_attnbias_has_dropout_maxk_32.cpp} (83%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_grouped_backward_fp16_no_causalmask_has_attnbias_no_biasgrad_maxk_64.cpp => fmha_grouped_backward_fp16_has_causalmask_no_attnbias_has_dropout_maxk_64.cpp} (83%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_grouped_backward_fp16_no_causalmask_no_attnbias_maxk_128.cpp => fmha_grouped_backward_fp16_has_causalmask_no_attnbias_no_dropout_maxk_128.cpp} (83%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_grouped_backward_fp16_no_causalmask_no_attnbias_maxk_32.cpp => fmha_grouped_backward_fp16_has_causalmask_no_attnbias_no_dropout_maxk_32.cpp} (83%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_grouped_backward_fp16_no_causalmask_no_attnbias_maxk_64.cpp => fmha_grouped_backward_fp16_has_causalmask_no_attnbias_no_dropout_maxk_64.cpp} (83%)
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_has_attnbias_has_biasgrad_has_dropout_maxk_128.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_has_attnbias_has_biasgrad_has_dropout_maxk_32.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_has_attnbias_has_biasgrad_has_dropout_maxk_64.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_has_attnbias_has_biasgrad_no_dropout_maxk_128.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_has_attnbias_has_biasgrad_no_dropout_maxk_32.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_has_attnbias_has_biasgrad_no_dropout_maxk_64.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_has_attnbias_no_biasgrad_has_dropout_maxk_128.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_has_attnbias_no_biasgrad_has_dropout_maxk_32.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_has_attnbias_no_biasgrad_has_dropout_maxk_64.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_has_attnbias_no_biasgrad_no_dropout_maxk_128.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_has_attnbias_no_biasgrad_no_dropout_maxk_32.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_has_attnbias_no_biasgrad_no_dropout_maxk_64.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_no_attnbias_has_dropout_maxk_128.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_no_attnbias_has_dropout_maxk_32.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_no_attnbias_has_dropout_maxk_64.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_no_attnbias_no_dropout_maxk_128.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_no_attnbias_no_dropout_maxk_32.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_no_attnbias_no_dropout_maxk_64.cpp

diff --git a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_backward.h b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_backward.h
index 904cd930e..28cddb133 100644
--- a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_backward.h
+++ b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_backward.h
@@ -33,8 +33,9 @@ template <
     bool kHasCausalMask,
     bool kHasBias,
     bool kHasBiasGrad,
+    bool kHasDropout,
     ck::index_t MaxK>
-struct batched_backward_causalmask_bias_dispatch {
+struct batched_backward_causalmask_bias_dropout_dispatch {
   using FmhaBwdEpilogue_ = FmhaBwdEpilogue<FmhaBwdEpilogueProblem<
       typename FmhaBwdTypeConfig<ScalarType>::AccDataType,
       typename FmhaBwdTypeConfig<ScalarType>::KGradDataType,
@@ -111,7 +112,6 @@ struct batched_backward_causalmask_bias_dispatch {
       BOOL_SWITCH(has_local_attention, USE_LOCAL_ATTENTION, [&] {
         constexpr ck::index_t occupancy = 1;
         constexpr bool has_masking = kHasCausalMask || USE_LOCAL_ATTENTION;
-        const bool has_dropout = (param.dropout_prob > 0.0f);
 
         using FmhaMask =
             ck::tile_program::block::SimplifiedGenericAttentionMask<
@@ -130,7 +130,7 @@ struct batched_backward_causalmask_bias_dispatch {
         // to determine whether to do padding saving some compiling time
         const bool pad_headdim = (pad_headdim_q || pad_headdim_v);
 
-        BOOL_SWITCH_2(has_dropout, kHasDropout, pad_headdim, kPadHeadDim, [&] {
+        BOOL_SWITCH(pad_headdim, kPadHeadDim, [&] {
           using FmhaBwdTraits_ = ck::tile_program::TileFmhaTraits<
               kPadSeqLenQ,
               kPadSeqLenK,
@@ -283,14 +283,16 @@ template <
     bool kHasCausalMask,
     bool kHasBias,
     bool kHasBiasGrad,
+    bool kHasDropout,
     ck::index_t MaxK>
-void run_batched_backward_causalmask_bias_dispatch(
+void run_batched_backward_causalmask_bias_dropout_dispatch(
     BatchedBackwardParams& param,
     hipStream_t stream) {
-  batched_backward_causalmask_bias_dispatch<
+  batched_backward_causalmask_bias_dropout_dispatch<
       ScalarType,
       kHasCausalMask,
       kHasBias,
       kHasBiasGrad,
+      kHasDropout,
       MaxK>::Run(param, stream);
 };
diff --git a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_backward_bp16.cpp b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_backward_bp16.cpp
index db2b56742..87f4ad107 100644
--- a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_backward_bp16.cpp
+++ b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_backward_bp16.cpp
@@ -13,64 +13,112 @@
 #include "ck_tiled_headdim_switch.h"
 
 // clang-format off
-extern template void run_batched_backward_causalmask_bias_dispatch<ck::bhalf_t, false, true, true, 32>(
+extern template void run_batched_backward_causalmask_bias_dropout_dispatch<ck::bhalf_t, false, true, true, true, 32>(
     BatchedBackwardParams& param, hipStream_t stream);
-extern template void run_batched_backward_causalmask_bias_dispatch<ck::bhalf_t, false, true, false, 32>(
+extern template void run_batched_backward_causalmask_bias_dropout_dispatch<ck::bhalf_t, false, true, false, true, 32>(
     BatchedBackwardParams& param, hipStream_t stream);
-extern template void run_batched_backward_causalmask_bias_dispatch<ck::bhalf_t, false, false, false, 32>(
+extern template void run_batched_backward_causalmask_bias_dropout_dispatch<ck::bhalf_t, false, false, false, true, 32>(
     BatchedBackwardParams& param, hipStream_t stream);
-extern template void run_batched_backward_causalmask_bias_dispatch<ck::bhalf_t, true, true, true, 32>(
+extern template void run_batched_backward_causalmask_bias_dropout_dispatch<ck::bhalf_t, true, true, true, true, 32>(
     BatchedBackwardParams& param, hipStream_t stream);
-extern template void run_batched_backward_causalmask_bias_dispatch<ck::bhalf_t, true, true, false, 32>(
+extern template void run_batched_backward_causalmask_bias_dropout_dispatch<ck::bhalf_t, true, true, false, true, 32>(
     BatchedBackwardParams& param, hipStream_t stream);
-extern template void run_batched_backward_causalmask_bias_dispatch<ck::bhalf_t, true, false, false, 32>(
+extern template void run_batched_backward_causalmask_bias_dropout_dispatch<ck::bhalf_t, true, false, false, true, 32>(
     BatchedBackwardParams& param, hipStream_t stream);
 
-extern template void run_batched_backward_causalmask_bias_dispatch<ck::bhalf_t, false, true, true, 64>(
+extern template void run_batched_backward_causalmask_bias_dropout_dispatch<ck::bhalf_t, false, true, true, false, 32>(
     BatchedBackwardParams& param, hipStream_t stream);
-extern template void run_batched_backward_causalmask_bias_dispatch<ck::bhalf_t, false, true, false, 64>(
+extern template void run_batched_backward_causalmask_bias_dropout_dispatch<ck::bhalf_t, false, true, false, false, 32>(
     BatchedBackwardParams& param, hipStream_t stream);
-extern template void run_batched_backward_causalmask_bias_dispatch<ck::bhalf_t, false, false, false, 64>(
+extern template void run_batched_backward_causalmask_bias_dropout_dispatch<ck::bhalf_t, false, false, false, false, 32>(
     BatchedBackwardParams& param, hipStream_t stream);
-extern template void run_batched_backward_causalmask_bias_dispatch<ck::bhalf_t, true, true, true, 64>(
+extern template void run_batched_backward_causalmask_bias_dropout_dispatch<ck::bhalf_t, true, true, true, false, 32>(
     BatchedBackwardParams& param, hipStream_t stream);
-extern template void run_batched_backward_causalmask_bias_dispatch<ck::bhalf_t, true, true, false, 64>(
+extern template void run_batched_backward_causalmask_bias_dropout_dispatch<ck::bhalf_t, true, true, false, false, 32>(
     BatchedBackwardParams& param, hipStream_t stream);
-extern template void run_batched_backward_causalmask_bias_dispatch<ck::bhalf_t, true, false, false, 64>(
+extern template void run_batched_backward_causalmask_bias_dropout_dispatch<ck::bhalf_t, true, false, false, false, 32>(
     BatchedBackwardParams& param, hipStream_t stream);
 
-extern template void run_batched_backward_causalmask_bias_dispatch<ck::bhalf_t, false, true, true, 128>(
+extern template void run_batched_backward_causalmask_bias_dropout_dispatch<ck::bhalf_t, false, true, true, true, 64>(
     BatchedBackwardParams& param, hipStream_t stream);
-extern template void run_batched_backward_causalmask_bias_dispatch<ck::bhalf_t, false, true, false, 128>(
+extern template void run_batched_backward_causalmask_bias_dropout_dispatch<ck::bhalf_t, false, true, false, true, 64>(
     BatchedBackwardParams& param, hipStream_t stream);
-extern template void run_batched_backward_causalmask_bias_dispatch<ck::bhalf_t, false, false, false, 128>(
+extern template void run_batched_backward_causalmask_bias_dropout_dispatch<ck::bhalf_t, false, false, false, true, 64>(
     BatchedBackwardParams& param, hipStream_t stream);
-extern template void run_batched_backward_causalmask_bias_dispatch<ck::bhalf_t, true, true, true, 128>(
+extern template void run_batched_backward_causalmask_bias_dropout_dispatch<ck::bhalf_t, true, true, true, true, 64>(
     BatchedBackwardParams& param, hipStream_t stream);
-extern template void run_batched_backward_causalmask_bias_dispatch<ck::bhalf_t, true, true, false, 128>(
+extern template void run_batched_backward_causalmask_bias_dropout_dispatch<ck::bhalf_t, true, true, false, true, 64>(
     BatchedBackwardParams& param, hipStream_t stream);
-extern template void run_batched_backward_causalmask_bias_dispatch<ck::bhalf_t, true, false, false, 128>(
+extern template void run_batched_backward_causalmask_bias_dropout_dispatch<ck::bhalf_t, true, false, false, true, 64>(
+    BatchedBackwardParams& param, hipStream_t stream);
+
+extern template void run_batched_backward_causalmask_bias_dropout_dispatch<ck::bhalf_t, false, true, true, false, 64>(
+    BatchedBackwardParams& param, hipStream_t stream);
+extern template void run_batched_backward_causalmask_bias_dropout_dispatch<ck::bhalf_t, false, true, false, false, 64>(
+    BatchedBackwardParams& param, hipStream_t stream);
+extern template void run_batched_backward_causalmask_bias_dropout_dispatch<ck::bhalf_t, false, false, false, false, 64>(
+    BatchedBackwardParams& param, hipStream_t stream);
+extern template void run_batched_backward_causalmask_bias_dropout_dispatch<ck::bhalf_t, true, true, true, false, 64>(
+    BatchedBackwardParams& param, hipStream_t stream);
+extern template void run_batched_backward_causalmask_bias_dropout_dispatch<ck::bhalf_t, true, true, false, false, 64>(
+    BatchedBackwardParams& param, hipStream_t stream);
+extern template void run_batched_backward_causalmask_bias_dropout_dispatch<ck::bhalf_t, true, false, false, false, 64>(
+    BatchedBackwardParams& param, hipStream_t stream);
+
+extern template void run_batched_backward_causalmask_bias_dropout_dispatch<ck::bhalf_t, false, true, true, true, 128>(
+    BatchedBackwardParams& param, hipStream_t stream);
+extern template void run_batched_backward_causalmask_bias_dropout_dispatch<ck::bhalf_t, false, true, false, true, 128>(
+    BatchedBackwardParams& param, hipStream_t stream);
+extern template void run_batched_backward_causalmask_bias_dropout_dispatch<ck::bhalf_t, false, false, false, true, 128>(
+    BatchedBackwardParams& param, hipStream_t stream);
+extern template void run_batched_backward_causalmask_bias_dropout_dispatch<ck::bhalf_t, true, true, true, true, 128>(
+    BatchedBackwardParams& param, hipStream_t stream);
+extern template void run_batched_backward_causalmask_bias_dropout_dispatch<ck::bhalf_t, true, true, false, true, 128>(
+    BatchedBackwardParams& param, hipStream_t stream);
+extern template void run_batched_backward_causalmask_bias_dropout_dispatch<ck::bhalf_t, true, false, false, true, 128>(
+    BatchedBackwardParams& param, hipStream_t stream);
+
+extern template void run_batched_backward_causalmask_bias_dropout_dispatch<ck::bhalf_t, false, true, true, false, 128>(
+    BatchedBackwardParams& param, hipStream_t stream);
+extern template void run_batched_backward_causalmask_bias_dropout_dispatch<ck::bhalf_t, false, true, false, false, 128>(
+    BatchedBackwardParams& param, hipStream_t stream);
+extern template void run_batched_backward_causalmask_bias_dropout_dispatch<ck::bhalf_t, false, false, false, false, 128>(
+    BatchedBackwardParams& param, hipStream_t stream);
+extern template void run_batched_backward_causalmask_bias_dropout_dispatch<ck::bhalf_t, true, true, true, false, 128>(
+    BatchedBackwardParams& param, hipStream_t stream);
+extern template void run_batched_backward_causalmask_bias_dropout_dispatch<ck::bhalf_t, true, true, false, false, 128>(
+    BatchedBackwardParams& param, hipStream_t stream);
+extern template void run_batched_backward_causalmask_bias_dropout_dispatch<ck::bhalf_t, true, false, false, false, 128>(
     BatchedBackwardParams& param, hipStream_t stream);
 // clang-format on
 
 void batched_backward_bp16(BatchedBackwardParams& param, hipStream_t stream) {
-  BOOL_SWITCH_2(
-      param.has_attn_bias, kHasBias, param.bias_has_grad, kHasBiasGrad, [&] {
+  const bool has_dropout = (param.dropout_prob > 0.0f);
+  BOOL_SWITCH_3(
+      param.has_attn_bias,
+      kHasBias,
+      param.bias_has_grad,
+      kHasBiasGrad,
+      has_dropout,
+      kHasDropout,
+      [&] {
         if constexpr (kHasBias || !kHasBiasGrad) {
           FMHA_BWD_HEADDIM_SWITCH(param.K, param.Kv, MaxK, [&] {
             if (param.custom_mask_type == 0)
-              run_batched_backward_causalmask_bias_dispatch<
+              run_batched_backward_causalmask_bias_dropout_dispatch<
                   ck::bhalf_t,
                   false,
                   kHasBias,
                   kHasBiasGrad,
+                  kHasDropout,
                   MaxK>(param, stream);
             else if (param.custom_mask_type == 1 || param.custom_mask_type == 2)
-              run_batched_backward_causalmask_bias_dispatch<
+              run_batched_backward_causalmask_bias_dropout_dispatch<
                   ck::bhalf_t,
                   true,
                   kHasBias,
                   kHasBiasGrad,
+                  kHasDropout,
                   MaxK>(param, stream);
             else
               throw std::runtime_error("Invalid custom_mask_type value");
diff --git a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_backward_fp16.cpp b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_backward_fp16.cpp
index 462309435..ed39b5a89 100644
--- a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_backward_fp16.cpp
+++ b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_backward_fp16.cpp
@@ -13,64 +13,112 @@
 #include "ck_tiled_headdim_switch.h"
 
 // clang-format off
-extern template void run_batched_backward_causalmask_bias_dispatch<ck::half_t, false, true, true, 32>(
+extern template void run_batched_backward_causalmask_bias_dropout_dispatch<ck::half_t, false, true, true, true, 32>(
     BatchedBackwardParams& param, hipStream_t stream);
-extern template void run_batched_backward_causalmask_bias_dispatch<ck::half_t, false, true, false, 32>(
+extern template void run_batched_backward_causalmask_bias_dropout_dispatch<ck::half_t, false, true, false, true, 32>(
     BatchedBackwardParams& param, hipStream_t stream);
-extern template void run_batched_backward_causalmask_bias_dispatch<ck::half_t, false, false, false, 32>(
+extern template void run_batched_backward_causalmask_bias_dropout_dispatch<ck::half_t, false, false, false, true, 32>(
     BatchedBackwardParams& param, hipStream_t stream);
-extern template void run_batched_backward_causalmask_bias_dispatch<ck::half_t, true, true, true, 32>(
+extern template void run_batched_backward_causalmask_bias_dropout_dispatch<ck::half_t, true, true, true, true, 32>(
     BatchedBackwardParams& param, hipStream_t stream);
-extern template void run_batched_backward_causalmask_bias_dispatch<ck::half_t, true, true, false, 32>(
+extern template void run_batched_backward_causalmask_bias_dropout_dispatch<ck::half_t, true, true, false, true, 32>(
     BatchedBackwardParams& param, hipStream_t stream);
-extern template void run_batched_backward_causalmask_bias_dispatch<ck::half_t, true, false, false, 32>(
+extern template void run_batched_backward_causalmask_bias_dropout_dispatch<ck::half_t, true, false, false, true, 32>(
     BatchedBackwardParams& param, hipStream_t stream);
 
-extern template void run_batched_backward_causalmask_bias_dispatch<ck::half_t, false, true, true, 64>(
+extern template void run_batched_backward_causalmask_bias_dropout_dispatch<ck::half_t, false, true, true, false, 32>(
     BatchedBackwardParams& param, hipStream_t stream);
-extern template void run_batched_backward_causalmask_bias_dispatch<ck::half_t, false, true, false, 64>(
+extern template void run_batched_backward_causalmask_bias_dropout_dispatch<ck::half_t, false, true, false, false, 32>(
     BatchedBackwardParams& param, hipStream_t stream);
-extern template void run_batched_backward_causalmask_bias_dispatch<ck::half_t, false, false, false, 64>(
+extern template void run_batched_backward_causalmask_bias_dropout_dispatch<ck::half_t, false, false, false, false, 32>(
     BatchedBackwardParams& param, hipStream_t stream);
-extern template void run_batched_backward_causalmask_bias_dispatch<ck::half_t, true, true, true, 64>(
+extern template void run_batched_backward_causalmask_bias_dropout_dispatch<ck::half_t, true, true, true, false, 32>(
     BatchedBackwardParams& param, hipStream_t stream);
-extern template void run_batched_backward_causalmask_bias_dispatch<ck::half_t, true, true, false, 64>(
+extern template void run_batched_backward_causalmask_bias_dropout_dispatch<ck::half_t, true, true, false, false, 32>(
     BatchedBackwardParams& param, hipStream_t stream);
-extern template void run_batched_backward_causalmask_bias_dispatch<ck::half_t, true, false, false, 64>(
+extern template void run_batched_backward_causalmask_bias_dropout_dispatch<ck::half_t, true, false, false, false, 32>(
     BatchedBackwardParams& param, hipStream_t stream);
 
-extern template void run_batched_backward_causalmask_bias_dispatch<ck::half_t, false, true, true, 128>(
+extern template void run_batched_backward_causalmask_bias_dropout_dispatch<ck::half_t, false, true, true, true, 64>(
     BatchedBackwardParams& param, hipStream_t stream);
-extern template void run_batched_backward_causalmask_bias_dispatch<ck::half_t, false, true, false, 128>(
+extern template void run_batched_backward_causalmask_bias_dropout_dispatch<ck::half_t, false, true, false, true, 64>(
     BatchedBackwardParams& param, hipStream_t stream);
-extern template void run_batched_backward_causalmask_bias_dispatch<ck::half_t, false, false, false, 128>(
+extern template void run_batched_backward_causalmask_bias_dropout_dispatch<ck::half_t, false, false, false, true, 64>(
     BatchedBackwardParams& param, hipStream_t stream);
-extern template void run_batched_backward_causalmask_bias_dispatch<ck::half_t, true, true, true, 128>(
+extern template void run_batched_backward_causalmask_bias_dropout_dispatch<ck::half_t, true, true, true, true, 64>(
     BatchedBackwardParams& param, hipStream_t stream);
-extern template void run_batched_backward_causalmask_bias_dispatch<ck::half_t, true, true, false, 128>(
+extern template void run_batched_backward_causalmask_bias_dropout_dispatch<ck::half_t, true, true, false, true, 64>(
     BatchedBackwardParams& param, hipStream_t stream);
-extern template void run_batched_backward_causalmask_bias_dispatch<ck::half_t, true, false, false, 128>(
+extern template void run_batched_backward_causalmask_bias_dropout_dispatch<ck::half_t, true, false, false, true, 64>(
+    BatchedBackwardParams& param, hipStream_t stream);
+
+extern template void run_batched_backward_causalmask_bias_dropout_dispatch<ck::half_t, false, true, true, false, 64>(
+    BatchedBackwardParams& param, hipStream_t stream);
+extern template void run_batched_backward_causalmask_bias_dropout_dispatch<ck::half_t, false, true, false, false, 64>(
+    BatchedBackwardParams& param, hipStream_t stream);
+extern template void run_batched_backward_causalmask_bias_dropout_dispatch<ck::half_t, false, false, false, false, 64>(
+    BatchedBackwardParams& param, hipStream_t stream);
+extern template void run_batched_backward_causalmask_bias_dropout_dispatch<ck::half_t, true, true, true, false, 64>(
+    BatchedBackwardParams& param, hipStream_t stream);
+extern template void run_batched_backward_causalmask_bias_dropout_dispatch<ck::half_t, true, true, false, false, 64>(
+    BatchedBackwardParams& param, hipStream_t stream);
+extern template void run_batched_backward_causalmask_bias_dropout_dispatch<ck::half_t, true, false, false, false, 64>(
+    BatchedBackwardParams& param, hipStream_t stream);
+
+extern template void run_batched_backward_causalmask_bias_dropout_dispatch<ck::half_t, false, true, true, true, 128>(
+    BatchedBackwardParams& param, hipStream_t stream);
+extern template void run_batched_backward_causalmask_bias_dropout_dispatch<ck::half_t, false, true, false, true, 128>(
+    BatchedBackwardParams& param, hipStream_t stream);
+extern template void run_batched_backward_causalmask_bias_dropout_dispatch<ck::half_t, false, false, false, true, 128>(
+    BatchedBackwardParams& param, hipStream_t stream);
+extern template void run_batched_backward_causalmask_bias_dropout_dispatch<ck::half_t, true, true, true, true, 128>(
+    BatchedBackwardParams& param, hipStream_t stream);
+extern template void run_batched_backward_causalmask_bias_dropout_dispatch<ck::half_t, true, true, false, true, 128>(
+    BatchedBackwardParams& param, hipStream_t stream);
+extern template void run_batched_backward_causalmask_bias_dropout_dispatch<ck::half_t, true, false, false, true, 128>(
+    BatchedBackwardParams& param, hipStream_t stream);
+
+extern template void run_batched_backward_causalmask_bias_dropout_dispatch<ck::half_t, false, true, true, false, 128>(
+    BatchedBackwardParams& param, hipStream_t stream);
+extern template void run_batched_backward_causalmask_bias_dropout_dispatch<ck::half_t, false, true, false, false, 128>(
+    BatchedBackwardParams& param, hipStream_t stream);
+extern template void run_batched_backward_causalmask_bias_dropout_dispatch<ck::half_t, false, false, false, false, 128>(
+    BatchedBackwardParams& param, hipStream_t stream);
+extern template void run_batched_backward_causalmask_bias_dropout_dispatch<ck::half_t, true, true, true, false, 128>(
+    BatchedBackwardParams& param, hipStream_t stream);
+extern template void run_batched_backward_causalmask_bias_dropout_dispatch<ck::half_t, true, true, false, false, 128>(
+    BatchedBackwardParams& param, hipStream_t stream);
+extern template void run_batched_backward_causalmask_bias_dropout_dispatch<ck::half_t, true, false, false, false, 128>(
     BatchedBackwardParams& param, hipStream_t stream);
 // clang-format on
 
 void batched_backward_fp16(BatchedBackwardParams& param, hipStream_t stream) {
-  BOOL_SWITCH_2(
-      param.has_attn_bias, kHasBias, param.bias_has_grad, kHasBiasGrad, [&] {
+  const bool has_dropout = (param.dropout_prob > 0.0f);
+  BOOL_SWITCH_3(
+      param.has_attn_bias,
+      kHasBias,
+      param.bias_has_grad,
+      kHasBiasGrad,
+      has_dropout,
+      kHasDropout,
+      [&] {
         if constexpr (kHasBias || !kHasBiasGrad) {
           FMHA_BWD_HEADDIM_SWITCH(param.K, param.Kv, MaxK, [&] {
             if (param.custom_mask_type == 0)
-              run_batched_backward_causalmask_bias_dispatch<
+              run_batched_backward_causalmask_bias_dropout_dispatch<
                   ck::half_t,
                   false,
                   kHasBias,
                   kHasBiasGrad,
+                  kHasDropout,
                   MaxK>(param, stream);
             else if (param.custom_mask_type == 1 || param.custom_mask_type == 2)
-              run_batched_backward_causalmask_bias_dispatch<
+              run_batched_backward_causalmask_bias_dropout_dispatch<
                   ck::half_t,
                   true,
                   kHasBias,
                   kHasBiasGrad,
+                  kHasDropout,
                   MaxK>(param, stream);
             else
               throw std::runtime_error("Invalid custom_mask_type value");
diff --git a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_backward.h b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_backward.h
index c61cf11bc..45d3859a6 100644
--- a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_backward.h
+++ b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_backward.h
@@ -33,8 +33,9 @@ template <
     bool kHasCausalMask,
     bool kHasBias,
     bool kHasBiasGrad,
+    bool kHasDropout,
     ck::index_t MaxK>
-struct grouped_backward_causalmask_bias_dispatch {
+struct grouped_backward_causalmask_bias_dropout_dispatch {
   using FmhaBwdEpilogue_ = FmhaBwdEpilogue<FmhaBwdEpilogueProblem<
       typename FmhaBwdTypeConfig<ScalarType>::AccDataType,
       typename FmhaBwdTypeConfig<ScalarType>::KGradDataType,
@@ -128,7 +129,7 @@ struct grouped_backward_causalmask_bias_dispatch {
         // to determine whether to do padding saving some compiling time
         const bool pad_headdim = (pad_headdim_q || pad_headdim_v);
 
-        BOOL_SWITCH_2(has_dropout, kHasDropout, pad_headdim, kPadHeadDim, [&] {
+        BOOL_SWITCH(pad_headdim, kPadHeadDim, [&] {
           using FmhaBwdTraits_ = ck::tile_program::TileFmhaTraits<
               kPadSeqLenQ,
               kPadSeqLenK,
@@ -270,14 +271,16 @@ template <
     bool kHasCausalMask,
     bool kHasBias,
     bool kHasBiasGrad,
+    bool kHasDropout,
     ck::index_t MaxK>
-void run_grouped_backward_causalmask_bias_dispatch(
+void run_grouped_backward_causalmask_bias_dropout_dispatch(
     GroupedBackwardParams& param,
     hipStream_t stream) {
-  grouped_backward_causalmask_bias_dispatch<
+  grouped_backward_causalmask_bias_dropout_dispatch<
       ScalarType,
       kHasCausalMask,
       kHasBias,
       kHasBiasGrad,
+      kHasDropout,
       MaxK>::Run(param, stream);
 };
diff --git a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_backward_bp16.cpp b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_backward_bp16.cpp
index f0164e470..6db554405 100644
--- a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_backward_bp16.cpp
+++ b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_backward_bp16.cpp
@@ -13,68 +13,112 @@
 #include "ck_tiled_headdim_switch.h"
 
 // clang-format off
-extern template void run_grouped_backward_causalmask_bias_dispatch<ck::bhalf_t, false, true, true, 32>(
+extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<ck::bhalf_t, false, true, true, true, 32>(
     GroupedBackwardParams& param, hipStream_t stream);
-extern template void run_grouped_backward_causalmask_bias_dispatch<ck::bhalf_t, false, true, false, 32>(
+extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<ck::bhalf_t, false, true, false, true, 32>(
     GroupedBackwardParams& param, hipStream_t stream);
-extern template void run_grouped_backward_causalmask_bias_dispatch<ck::bhalf_t, false, false, false, 32>(
+extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<ck::bhalf_t, false, false, false, true, 32>(
     GroupedBackwardParams& param, hipStream_t stream);
-extern template void run_grouped_backward_causalmask_bias_dispatch<ck::bhalf_t, true, true, true, 32>(
+extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<ck::bhalf_t, true, true, true, true, 32>(
     GroupedBackwardParams& param, hipStream_t stream);
-extern template void run_grouped_backward_causalmask_bias_dispatch<ck::bhalf_t, true, true, false, 32>(
+extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<ck::bhalf_t, true, true, false, true, 32>(
     GroupedBackwardParams& param, hipStream_t stream);
-extern template void run_grouped_backward_causalmask_bias_dispatch<ck::bhalf_t, true, false, false, 32>(
+extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<ck::bhalf_t, true, false, false, true, 32>(
     GroupedBackwardParams& param, hipStream_t stream);
 
-extern template void run_grouped_backward_causalmask_bias_dispatch<ck::bhalf_t, false, true, true, 64>(
+extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<ck::bhalf_t, false, true, true, false, 32>(
     GroupedBackwardParams& param, hipStream_t stream);
-extern template void run_grouped_backward_causalmask_bias_dispatch<ck::bhalf_t, false, true, false, 64>(
+extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<ck::bhalf_t, false, true, false, false, 32>(
     GroupedBackwardParams& param, hipStream_t stream);
-extern template void run_grouped_backward_causalmask_bias_dispatch<ck::bhalf_t, false, false, false, 64>(
+extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<ck::bhalf_t, false, false, false, false, 32>(
     GroupedBackwardParams& param, hipStream_t stream);
-extern template void run_grouped_backward_causalmask_bias_dispatch<ck::bhalf_t, true, true, true, 64>(
+extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<ck::bhalf_t, true, true, true, false, 32>(
     GroupedBackwardParams& param, hipStream_t stream);
-extern template void run_grouped_backward_causalmask_bias_dispatch<ck::bhalf_t, true, true, false, 64>(
+extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<ck::bhalf_t, true, true, false, false, 32>(
     GroupedBackwardParams& param, hipStream_t stream);
-extern template void run_grouped_backward_causalmask_bias_dispatch<ck::bhalf_t, true, false, false, 64>(
+extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<ck::bhalf_t, true, false, false, false, 32>(
     GroupedBackwardParams& param, hipStream_t stream);
 
-extern template void run_grouped_backward_causalmask_bias_dispatch<ck::bhalf_t, false, true, true, 128>(
+extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<ck::bhalf_t, false, true, true, true, 64>(
     GroupedBackwardParams& param, hipStream_t stream);
-extern template void run_grouped_backward_causalmask_bias_dispatch<ck::bhalf_t, false, true, false, 128>(
+extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<ck::bhalf_t, false, true, false, true, 64>(
     GroupedBackwardParams& param, hipStream_t stream);
-extern template void run_grouped_backward_causalmask_bias_dispatch<ck::bhalf_t, false, false, false, 128>(
+extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<ck::bhalf_t, false, false, false, true, 64>(
     GroupedBackwardParams& param, hipStream_t stream);
-extern template void run_grouped_backward_causalmask_bias_dispatch<ck::bhalf_t, true, true, true, 128>(
+extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<ck::bhalf_t, true, true, true, true, 64>(
     GroupedBackwardParams& param, hipStream_t stream);
-extern template void run_grouped_backward_causalmask_bias_dispatch<ck::bhalf_t, true, true, false, 128>(
+extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<ck::bhalf_t, true, true, false, true, 64>(
     GroupedBackwardParams& param, hipStream_t stream);
-extern template void run_grouped_backward_causalmask_bias_dispatch<ck::bhalf_t, true, false, false, 128>(
+extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<ck::bhalf_t, true, false, false, true, 64>(
+    GroupedBackwardParams& param, hipStream_t stream);
+
+extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<ck::bhalf_t, false, true, true, false, 64>(
+    GroupedBackwardParams& param, hipStream_t stream);
+extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<ck::bhalf_t, false, true, false, false, 64>(
+    GroupedBackwardParams& param, hipStream_t stream);
+extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<ck::bhalf_t, false, false, false, false, 64>(
+    GroupedBackwardParams& param, hipStream_t stream);
+extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<ck::bhalf_t, true, true, true, false, 64>(
+    GroupedBackwardParams& param, hipStream_t stream);
+extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<ck::bhalf_t, true, true, false, false, 64>(
+    GroupedBackwardParams& param, hipStream_t stream);
+extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<ck::bhalf_t, true, false, false, false, 64>(
+    GroupedBackwardParams& param, hipStream_t stream);
+
+extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<ck::bhalf_t, false, true, true, true, 128>(
+    GroupedBackwardParams& param, hipStream_t stream);
+extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<ck::bhalf_t, false, true, false, true, 128>(
+    GroupedBackwardParams& param, hipStream_t stream);
+extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<ck::bhalf_t, false, false, false, true, 128>(
+    GroupedBackwardParams& param, hipStream_t stream);
+extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<ck::bhalf_t, true, true, true, true, 128>(
+    GroupedBackwardParams& param, hipStream_t stream);
+extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<ck::bhalf_t, true, true, false, true, 128>(
+    GroupedBackwardParams& param, hipStream_t stream);
+extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<ck::bhalf_t, true, false, false, true, 128>(
+    GroupedBackwardParams& param, hipStream_t stream);
+
+extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<ck::bhalf_t, false, true, true, false, 128>(
+    GroupedBackwardParams& param, hipStream_t stream);
+extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<ck::bhalf_t, false, true, false, false, 128>(
+    GroupedBackwardParams& param, hipStream_t stream);
+extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<ck::bhalf_t, false, false, false, false, 128>(
+    GroupedBackwardParams& param, hipStream_t stream);
+extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<ck::bhalf_t, true, true, true, false, 128>(
+    GroupedBackwardParams& param, hipStream_t stream);
+extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<ck::bhalf_t, true, true, false, false, 128>(
+    GroupedBackwardParams& param, hipStream_t stream);
+extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<ck::bhalf_t, true, false, false, false, 128>(
     GroupedBackwardParams& param, hipStream_t stream);
 // clang-format on
 
 void grouped_backward_bp16(GroupedBackwardParams& param, hipStream_t stream) {
-  BOOL_SWITCH_2(
+  const bool has_dropout = (param.dropout_prob > 0.0f);
+  BOOL_SWITCH_3(
       param.has_attn_bias,
-      HAS_ATTN_BIAS,
+      kHasBias,
       param.bias_has_grad,
-      HAS_BIAS_GRAD,
+      kHasBiasGrad,
+      has_dropout,
+      kHasDropout,
       [&] {
-        if constexpr (HAS_ATTN_BIAS || !HAS_BIAS_GRAD) {
+        if constexpr (kHasBias || !kHasBiasGrad) {
           FMHA_BWD_HEADDIM_SWITCH(param.K, param.Kv, MaxK, [&] {
             if (param.custom_mask_type == 0)
-              run_grouped_backward_causalmask_bias_dispatch<
+              run_grouped_backward_causalmask_bias_dropout_dispatch<
                   ck::bhalf_t,
                   false,
-                  HAS_ATTN_BIAS,
-                  HAS_BIAS_GRAD,
+                  kHasBias,
+                  kHasBiasGrad,
+                  kHasDropout,
                   MaxK>(param, stream);
             else if (param.custom_mask_type == 1 || param.custom_mask_type == 2)
-              run_grouped_backward_causalmask_bias_dispatch<
+              run_grouped_backward_causalmask_bias_dropout_dispatch<
                   ck::bhalf_t,
                   true,
-                  HAS_ATTN_BIAS,
-                  HAS_BIAS_GRAD,
+                  kHasBias,
+                  kHasBiasGrad,
+                  kHasDropout,
                   MaxK>(param, stream);
             else
               throw std::runtime_error("Invalid custom_mask_type value");
diff --git a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_backward_fp16.cpp b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_backward_fp16.cpp
index 7703b742c..3dfc6f7f1 100644
--- a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_backward_fp16.cpp
+++ b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_backward_fp16.cpp
@@ -13,64 +13,112 @@
 #include "ck_tiled_headdim_switch.h"
 
 // clang-format off
-extern template void run_grouped_backward_causalmask_bias_dispatch<ck::half_t, false, true, true, 32>(
+extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<ck::half_t, false, true, true, true, 32>(
     GroupedBackwardParams& param, hipStream_t stream);
-extern template void run_grouped_backward_causalmask_bias_dispatch<ck::half_t, false, true, false, 32>(
+extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<ck::half_t, false, true, false, true, 32>(
     GroupedBackwardParams& param, hipStream_t stream);
-extern template void run_grouped_backward_causalmask_bias_dispatch<ck::half_t, false, false, false, 32>(
+extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<ck::half_t, false, false, false, true, 32>(
     GroupedBackwardParams& param, hipStream_t stream);
-extern template void run_grouped_backward_causalmask_bias_dispatch<ck::half_t, true, true, true, 32>(
+extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<ck::half_t, true, true, true, true, 32>(
     GroupedBackwardParams& param, hipStream_t stream);
-extern template void run_grouped_backward_causalmask_bias_dispatch<ck::half_t, true, true, false, 32>(
+extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<ck::half_t, true, true, false, true, 32>(
     GroupedBackwardParams& param, hipStream_t stream);
-extern template void run_grouped_backward_causalmask_bias_dispatch<ck::half_t, true, false, false, 32>(
+extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<ck::half_t, true, false, false, true, 32>(
     GroupedBackwardParams& param, hipStream_t stream);
 
-extern template void run_grouped_backward_causalmask_bias_dispatch<ck::half_t, false, true, true, 64>(
+extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<ck::half_t, false, true, true, false, 32>(
     GroupedBackwardParams& param, hipStream_t stream);
-extern template void run_grouped_backward_causalmask_bias_dispatch<ck::half_t, false, true, false, 64>(
+extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<ck::half_t, false, true, false, false, 32>(
     GroupedBackwardParams& param, hipStream_t stream);
-extern template void run_grouped_backward_causalmask_bias_dispatch<ck::half_t, false, false, false, 64>(
+extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<ck::half_t, false, false, false, false, 32>(
     GroupedBackwardParams& param, hipStream_t stream);
-extern template void run_grouped_backward_causalmask_bias_dispatch<ck::half_t, true, true, true, 64>(
+extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<ck::half_t, true, true, true, false, 32>(
     GroupedBackwardParams& param, hipStream_t stream);
-extern template void run_grouped_backward_causalmask_bias_dispatch<ck::half_t, true, true, false, 64>(
+extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<ck::half_t, true, true, false, false, 32>(
     GroupedBackwardParams& param, hipStream_t stream);
-extern template void run_grouped_backward_causalmask_bias_dispatch<ck::half_t, true, false, false, 64>(
+extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<ck::half_t, true, false, false, false, 32>(
     GroupedBackwardParams& param, hipStream_t stream);
 
-extern template void run_grouped_backward_causalmask_bias_dispatch<ck::half_t, false, true, true, 128>(
+extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<ck::half_t, false, true, true, true, 64>(
     GroupedBackwardParams& param, hipStream_t stream);
-extern template void run_grouped_backward_causalmask_bias_dispatch<ck::half_t, false, true, false, 128>(
+extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<ck::half_t, false, true, false, true, 64>(
     GroupedBackwardParams& param, hipStream_t stream);
-extern template void run_grouped_backward_causalmask_bias_dispatch<ck::half_t, false, false, false, 128>(
+extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<ck::half_t, false, false, false, true, 64>(
     GroupedBackwardParams& param, hipStream_t stream);
-extern template void run_grouped_backward_causalmask_bias_dispatch<ck::half_t, true, true, true, 128>(
+extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<ck::half_t, true, true, true, true, 64>(
     GroupedBackwardParams& param, hipStream_t stream);
-extern template void run_grouped_backward_causalmask_bias_dispatch<ck::half_t, true, true, false, 128>(
+extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<ck::half_t, true, true, false, true, 64>(
     GroupedBackwardParams& param, hipStream_t stream);
-extern template void run_grouped_backward_causalmask_bias_dispatch<ck::half_t, true, false, false, 128>(
+extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<ck::half_t, true, false, false, true, 64>(
+    GroupedBackwardParams& param, hipStream_t stream);
+
+extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<ck::half_t, false, true, true, false, 64>(
+    GroupedBackwardParams& param, hipStream_t stream);
+extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<ck::half_t, false, true, false, false, 64>(
+    GroupedBackwardParams& param, hipStream_t stream);
+extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<ck::half_t, false, false, false, false, 64>(
+    GroupedBackwardParams& param, hipStream_t stream);
+extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<ck::half_t, true, true, true, false, 64>(
+    GroupedBackwardParams& param, hipStream_t stream);
+extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<ck::half_t, true, true, false, false, 64>(
+    GroupedBackwardParams& param, hipStream_t stream);
+extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<ck::half_t, true, false, false, false, 64>(
+    GroupedBackwardParams& param, hipStream_t stream);
+
+extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<ck::half_t, false, true, true, true, 128>(
+    GroupedBackwardParams& param, hipStream_t stream);
+extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<ck::half_t, false, true, false, true, 128>(
+    GroupedBackwardParams& param, hipStream_t stream);
+extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<ck::half_t, false, false, false, true, 128>(
+    GroupedBackwardParams& param, hipStream_t stream);
+extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<ck::half_t, true, true, true, true, 128>(
+    GroupedBackwardParams& param, hipStream_t stream);
+extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<ck::half_t, true, true, false, true, 128>(
+    GroupedBackwardParams& param, hipStream_t stream);
+extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<ck::half_t, true, false, false, true, 128>(
+    GroupedBackwardParams& param, hipStream_t stream);
+
+extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<ck::half_t, false, true, true, false, 128>(
+    GroupedBackwardParams& param, hipStream_t stream);
+extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<ck::half_t, false, true, false, false, 128>(
+    GroupedBackwardParams& param, hipStream_t stream);
+extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<ck::half_t, false, false, false, false, 128>(
+    GroupedBackwardParams& param, hipStream_t stream);
+extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<ck::half_t, true, true, true, false, 128>(
+    GroupedBackwardParams& param, hipStream_t stream);
+extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<ck::half_t, true, true, false, false, 128>(
+    GroupedBackwardParams& param, hipStream_t stream);
+extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<ck::half_t, true, false, false, false, 128>(
     GroupedBackwardParams& param, hipStream_t stream);
 // clang-format on
 
 void grouped_backward_fp16(GroupedBackwardParams& param, hipStream_t stream) {
-  BOOL_SWITCH_2(
-      param.has_attn_bias, kHasBias, param.bias_has_grad, kHasBiasGrad, [&] {
+  const bool has_dropout = (param.dropout_prob > 0.0f);
+  BOOL_SWITCH_3(
+      param.has_attn_bias,
+      kHasBias,
+      param.bias_has_grad,
+      kHasBiasGrad,
+      has_dropout,
+      kHasDropout,
+      [&] {
         if constexpr (kHasBias || !kHasBiasGrad) {
           FMHA_BWD_HEADDIM_SWITCH(param.K, param.Kv, MaxK, [&] {
             if (param.custom_mask_type == 0)
-              run_grouped_backward_causalmask_bias_dispatch<
+              run_grouped_backward_causalmask_bias_dropout_dispatch<
                   ck::half_t,
                   false,
                   kHasBias,
                   kHasBiasGrad,
+                  kHasDropout,
                   MaxK>(param, stream);
             else if (param.custom_mask_type == 1 || param.custom_mask_type == 2)
-              run_grouped_backward_causalmask_bias_dispatch<
+              run_grouped_backward_causalmask_bias_dropout_dispatch<
                   ck::half_t,
                   true,
                   kHasBias,
                   kHasBiasGrad,
+                  kHasDropout,
                   MaxK>(param, stream);
             else
               throw std::runtime_error("Invalid custom_mask_type value");
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_has_causalmask_has_attnbias_has_biasgrad_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_has_causalmask_has_attnbias_has_biasgrad_has_dropout_maxk_128.cpp
similarity index 83%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_has_causalmask_has_attnbias_has_biasgrad_maxk_128.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_has_causalmask_has_attnbias_has_biasgrad_has_dropout_maxk_128.cpp
index f6bf4bd6f..53ab69fc2 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_has_causalmask_has_attnbias_has_biasgrad_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_has_causalmask_has_attnbias_has_biasgrad_has_dropout_maxk_128.cpp
@@ -8,9 +8,10 @@
 
 #include "ck_tiled_fmha_batched_backward.h"
 
-template void run_batched_backward_causalmask_bias_dispatch<
+template void run_batched_backward_causalmask_bias_dropout_dispatch<
     ck::bhalf_t,
     true,
     true,
     true,
+    true,
     128>(BatchedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_has_causalmask_has_attnbias_has_biasgrad_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_has_causalmask_has_attnbias_has_biasgrad_has_dropout_maxk_32.cpp
similarity index 83%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_has_causalmask_has_attnbias_has_biasgrad_maxk_32.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_has_causalmask_has_attnbias_has_biasgrad_has_dropout_maxk_32.cpp
index 0514bf28a..17e2eef9a 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_has_causalmask_has_attnbias_has_biasgrad_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_has_causalmask_has_attnbias_has_biasgrad_has_dropout_maxk_32.cpp
@@ -8,9 +8,10 @@
 
 #include "ck_tiled_fmha_batched_backward.h"
 
-template void run_batched_backward_causalmask_bias_dispatch<
+template void run_batched_backward_causalmask_bias_dropout_dispatch<
     ck::bhalf_t,
     true,
     true,
     true,
+    true,
     32>(BatchedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_has_causalmask_has_attnbias_has_biasgrad_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_has_causalmask_has_attnbias_has_biasgrad_has_dropout_maxk_64.cpp
similarity index 83%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_has_causalmask_has_attnbias_has_biasgrad_maxk_64.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_has_causalmask_has_attnbias_has_biasgrad_has_dropout_maxk_64.cpp
index ee19b37de..e5903a262 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_has_causalmask_has_attnbias_has_biasgrad_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_has_causalmask_has_attnbias_has_biasgrad_has_dropout_maxk_64.cpp
@@ -8,9 +8,10 @@
 
 #include "ck_tiled_fmha_batched_backward.h"
 
-template void run_batched_backward_causalmask_bias_dispatch<
+template void run_batched_backward_causalmask_bias_dropout_dispatch<
     ck::bhalf_t,
     true,
     true,
     true,
+    true,
     64>(BatchedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_no_causalmask_has_attnbias_has_biasgrad_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_has_causalmask_has_attnbias_has_biasgrad_no_dropout_maxk_128.cpp
similarity index 83%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_no_causalmask_has_attnbias_has_biasgrad_maxk_128.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_has_causalmask_has_attnbias_has_biasgrad_no_dropout_maxk_128.cpp
index 484d96a41..3d93e9168 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_no_causalmask_has_attnbias_has_biasgrad_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_has_causalmask_has_attnbias_has_biasgrad_no_dropout_maxk_128.cpp
@@ -8,9 +8,10 @@
 
 #include "ck_tiled_fmha_batched_backward.h"
 
-template void run_batched_backward_causalmask_bias_dispatch<
+template void run_batched_backward_causalmask_bias_dropout_dispatch<
     ck::bhalf_t,
-    false,
     true,
     true,
+    true,
+    false,
     128>(BatchedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_has_causalmask_has_attnbias_no_biasgrad_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_has_causalmask_has_attnbias_has_biasgrad_no_dropout_maxk_32.cpp
similarity index 83%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_has_causalmask_has_attnbias_no_biasgrad_maxk_32.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_has_causalmask_has_attnbias_has_biasgrad_no_dropout_maxk_32.cpp
index 75966fb73..7c827865f 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_has_causalmask_has_attnbias_no_biasgrad_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_has_causalmask_has_attnbias_has_biasgrad_no_dropout_maxk_32.cpp
@@ -8,9 +8,10 @@
 
 #include "ck_tiled_fmha_batched_backward.h"
 
-template void run_batched_backward_causalmask_bias_dispatch<
+template void run_batched_backward_causalmask_bias_dropout_dispatch<
     ck::bhalf_t,
     true,
     true,
+    true,
     false,
     32>(BatchedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_has_causalmask_has_attnbias_no_biasgrad_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_has_causalmask_has_attnbias_has_biasgrad_no_dropout_maxk_64.cpp
similarity index 83%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_has_causalmask_has_attnbias_no_biasgrad_maxk_64.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_has_causalmask_has_attnbias_has_biasgrad_no_dropout_maxk_64.cpp
index 07dc496fd..34e32791e 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_has_causalmask_has_attnbias_no_biasgrad_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_has_causalmask_has_attnbias_has_biasgrad_no_dropout_maxk_64.cpp
@@ -8,9 +8,10 @@
 
 #include "ck_tiled_fmha_batched_backward.h"
 
-template void run_batched_backward_causalmask_bias_dispatch<
+template void run_batched_backward_causalmask_bias_dropout_dispatch<
     ck::bhalf_t,
     true,
     true,
+    true,
     false,
     64>(BatchedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_has_causalmask_has_attnbias_no_biasgrad_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_has_causalmask_has_attnbias_no_biasgrad_has_dropout_maxk_128.cpp
similarity index 83%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_has_causalmask_has_attnbias_no_biasgrad_maxk_128.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_has_causalmask_has_attnbias_no_biasgrad_has_dropout_maxk_128.cpp
index 8ab4f4229..0f2ad6e78 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_has_causalmask_has_attnbias_no_biasgrad_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_has_causalmask_has_attnbias_no_biasgrad_has_dropout_maxk_128.cpp
@@ -8,9 +8,10 @@
 
 #include "ck_tiled_fmha_batched_backward.h"
 
-template void run_batched_backward_causalmask_bias_dispatch<
+template void run_batched_backward_causalmask_bias_dropout_dispatch<
     ck::bhalf_t,
     true,
     true,
     false,
+    true,
     128>(BatchedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_no_causalmask_has_attnbias_has_biasgrad_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_has_causalmask_has_attnbias_no_biasgrad_has_dropout_maxk_32.cpp
similarity index 83%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_no_causalmask_has_attnbias_has_biasgrad_maxk_32.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_has_causalmask_has_attnbias_no_biasgrad_has_dropout_maxk_32.cpp
index 8f22808ad..746539438 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_no_causalmask_has_attnbias_has_biasgrad_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_has_causalmask_has_attnbias_no_biasgrad_has_dropout_maxk_32.cpp
@@ -8,9 +8,10 @@
 
 #include "ck_tiled_fmha_batched_backward.h"
 
-template void run_batched_backward_causalmask_bias_dispatch<
+template void run_batched_backward_causalmask_bias_dropout_dispatch<
     ck::bhalf_t,
-    false,
     true,
     true,
+    false,
+    true,
     32>(BatchedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_no_causalmask_has_attnbias_has_biasgrad_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_has_causalmask_has_attnbias_no_biasgrad_has_dropout_maxk_64.cpp
similarity index 83%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_no_causalmask_has_attnbias_has_biasgrad_maxk_64.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_has_causalmask_has_attnbias_no_biasgrad_has_dropout_maxk_64.cpp
index e173fd0cb..46de1be23 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_no_causalmask_has_attnbias_has_biasgrad_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_has_causalmask_has_attnbias_no_biasgrad_has_dropout_maxk_64.cpp
@@ -8,9 +8,10 @@
 
 #include "ck_tiled_fmha_batched_backward.h"
 
-template void run_batched_backward_causalmask_bias_dispatch<
+template void run_batched_backward_causalmask_bias_dropout_dispatch<
     ck::bhalf_t,
-    false,
     true,
     true,
+    false,
+    true,
     64>(BatchedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_no_causalmask_has_attnbias_no_biasgrad_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_has_causalmask_has_attnbias_no_biasgrad_no_dropout_maxk_128.cpp
similarity index 83%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_no_causalmask_has_attnbias_no_biasgrad_maxk_128.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_has_causalmask_has_attnbias_no_biasgrad_no_dropout_maxk_128.cpp
index 395d187a7..fea36c72b 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_no_causalmask_has_attnbias_no_biasgrad_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_has_causalmask_has_attnbias_no_biasgrad_no_dropout_maxk_128.cpp
@@ -8,9 +8,10 @@
 
 #include "ck_tiled_fmha_batched_backward.h"
 
-template void run_batched_backward_causalmask_bias_dispatch<
+template void run_batched_backward_causalmask_bias_dropout_dispatch<
     ck::bhalf_t,
-    false,
     true,
+    true,
+    false,
     false,
     128>(BatchedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_no_causalmask_has_attnbias_no_biasgrad_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_has_causalmask_has_attnbias_no_biasgrad_no_dropout_maxk_32.cpp
similarity index 83%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_no_causalmask_has_attnbias_no_biasgrad_maxk_32.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_has_causalmask_has_attnbias_no_biasgrad_no_dropout_maxk_32.cpp
index 89a5c0624..f570c926e 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_no_causalmask_has_attnbias_no_biasgrad_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_has_causalmask_has_attnbias_no_biasgrad_no_dropout_maxk_32.cpp
@@ -8,9 +8,10 @@
 
 #include "ck_tiled_fmha_batched_backward.h"
 
-template void run_batched_backward_causalmask_bias_dispatch<
+template void run_batched_backward_causalmask_bias_dropout_dispatch<
     ck::bhalf_t,
-    false,
     true,
+    true,
+    false,
     false,
     32>(BatchedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_has_causalmask_no_attnbias_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_has_causalmask_has_attnbias_no_biasgrad_no_dropout_maxk_64.cpp
similarity index 83%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_has_causalmask_no_attnbias_maxk_64.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_has_causalmask_has_attnbias_no_biasgrad_no_dropout_maxk_64.cpp
index 3d9272061..463aa81de 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_has_causalmask_no_attnbias_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_has_causalmask_has_attnbias_no_biasgrad_no_dropout_maxk_64.cpp
@@ -8,9 +8,10 @@
 
 #include "ck_tiled_fmha_batched_backward.h"
 
-template void run_batched_backward_causalmask_bias_dispatch<
+template void run_batched_backward_causalmask_bias_dropout_dispatch<
     ck::bhalf_t,
     true,
+    true,
     false,
     false,
     64>(BatchedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_has_causalmask_no_attnbias_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_has_causalmask_no_attnbias_has_dropout_maxk_128.cpp
similarity index 83%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_has_causalmask_no_attnbias_maxk_128.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_has_causalmask_no_attnbias_has_dropout_maxk_128.cpp
index 736256e63..6186abdf8 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_has_causalmask_no_attnbias_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_has_causalmask_no_attnbias_has_dropout_maxk_128.cpp
@@ -8,9 +8,10 @@
 
 #include "ck_tiled_fmha_batched_backward.h"
 
-template void run_batched_backward_causalmask_bias_dispatch<
+template void run_batched_backward_causalmask_bias_dropout_dispatch<
     ck::bhalf_t,
     true,
     false,
     false,
+    true,
     128>(BatchedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_has_causalmask_no_attnbias_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_has_causalmask_no_attnbias_has_dropout_maxk_32.cpp
similarity index 83%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_has_causalmask_no_attnbias_maxk_32.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_has_causalmask_no_attnbias_has_dropout_maxk_32.cpp
index c44a2f99e..175fbaf4d 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_has_causalmask_no_attnbias_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_has_causalmask_no_attnbias_has_dropout_maxk_32.cpp
@@ -8,9 +8,10 @@
 
 #include "ck_tiled_fmha_batched_backward.h"
 
-template void run_batched_backward_causalmask_bias_dispatch<
+template void run_batched_backward_causalmask_bias_dropout_dispatch<
     ck::bhalf_t,
     true,
     false,
     false,
+    true,
     32>(BatchedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_no_causalmask_has_attnbias_no_biasgrad_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_has_causalmask_no_attnbias_has_dropout_maxk_64.cpp
similarity index 83%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_no_causalmask_has_attnbias_no_biasgrad_maxk_64.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_has_causalmask_no_attnbias_has_dropout_maxk_64.cpp
index 09f17fb59..c8e379d59 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_no_causalmask_has_attnbias_no_biasgrad_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_has_causalmask_no_attnbias_has_dropout_maxk_64.cpp
@@ -8,9 +8,10 @@
 
 #include "ck_tiled_fmha_batched_backward.h"
 
-template void run_batched_backward_causalmask_bias_dispatch<
+template void run_batched_backward_causalmask_bias_dropout_dispatch<
     ck::bhalf_t,
-    false,
     true,
     false,
+    false,
+    true,
     64>(BatchedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_no_causalmask_no_attnbias_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_has_causalmask_no_attnbias_no_dropout_maxk_128.cpp
similarity index 83%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_no_causalmask_no_attnbias_maxk_128.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_has_causalmask_no_attnbias_no_dropout_maxk_128.cpp
index 11023b667..2a535ec0c 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_no_causalmask_no_attnbias_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_has_causalmask_no_attnbias_no_dropout_maxk_128.cpp
@@ -8,8 +8,9 @@
 
 #include "ck_tiled_fmha_batched_backward.h"
 
-template void run_batched_backward_causalmask_bias_dispatch<
+template void run_batched_backward_causalmask_bias_dropout_dispatch<
     ck::bhalf_t,
+    true,
     false,
     false,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_no_causalmask_no_attnbias_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_has_causalmask_no_attnbias_no_dropout_maxk_32.cpp
similarity index 83%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_no_causalmask_no_attnbias_maxk_32.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_has_causalmask_no_attnbias_no_dropout_maxk_32.cpp
index 1ca23fead..74e6105e7 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_no_causalmask_no_attnbias_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_has_causalmask_no_attnbias_no_dropout_maxk_32.cpp
@@ -8,8 +8,9 @@
 
 #include "ck_tiled_fmha_batched_backward.h"
 
-template void run_batched_backward_causalmask_bias_dispatch<
+template void run_batched_backward_causalmask_bias_dropout_dispatch<
     ck::bhalf_t,
+    true,
     false,
     false,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_no_causalmask_no_attnbias_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_has_causalmask_no_attnbias_no_dropout_maxk_64.cpp
similarity index 83%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_no_causalmask_no_attnbias_maxk_64.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_has_causalmask_no_attnbias_no_dropout_maxk_64.cpp
index f71dedaaf..fa3b403a3 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_no_causalmask_no_attnbias_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_has_causalmask_no_attnbias_no_dropout_maxk_64.cpp
@@ -8,8 +8,9 @@
 
 #include "ck_tiled_fmha_batched_backward.h"
 
-template void run_batched_backward_causalmask_bias_dispatch<
+template void run_batched_backward_causalmask_bias_dropout_dispatch<
     ck::bhalf_t,
+    true,
     false,
     false,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_no_causalmask_has_attnbias_has_biasgrad_has_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_no_causalmask_has_attnbias_has_biasgrad_has_dropout_maxk_128.cpp
new file mode 100644
index 000000000..6d1a95675
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_no_causalmask_has_attnbias_has_biasgrad_has_dropout_maxk_128.cpp
@@ -0,0 +1,17 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_batched_backward.h"
+
+template void run_batched_backward_causalmask_bias_dropout_dispatch<
+    ck::bhalf_t,
+    false,
+    true,
+    true,
+    true,
+    128>(BatchedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_no_causalmask_has_attnbias_has_biasgrad_has_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_no_causalmask_has_attnbias_has_biasgrad_has_dropout_maxk_32.cpp
new file mode 100644
index 000000000..2c227abf2
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_no_causalmask_has_attnbias_has_biasgrad_has_dropout_maxk_32.cpp
@@ -0,0 +1,17 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_batched_backward.h"
+
+template void run_batched_backward_causalmask_bias_dropout_dispatch<
+    ck::bhalf_t,
+    false,
+    true,
+    true,
+    true,
+    32>(BatchedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_no_causalmask_has_attnbias_has_biasgrad_has_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_no_causalmask_has_attnbias_has_biasgrad_has_dropout_maxk_64.cpp
new file mode 100644
index 000000000..7375b1aca
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_no_causalmask_has_attnbias_has_biasgrad_has_dropout_maxk_64.cpp
@@ -0,0 +1,17 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_batched_backward.h"
+
+template void run_batched_backward_causalmask_bias_dropout_dispatch<
+    ck::bhalf_t,
+    false,
+    true,
+    true,
+    true,
+    64>(BatchedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_no_causalmask_has_attnbias_has_biasgrad_no_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_no_causalmask_has_attnbias_has_biasgrad_no_dropout_maxk_128.cpp
new file mode 100644
index 000000000..d987a2516
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_no_causalmask_has_attnbias_has_biasgrad_no_dropout_maxk_128.cpp
@@ -0,0 +1,17 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_batched_backward.h"
+
+template void run_batched_backward_causalmask_bias_dropout_dispatch<
+    ck::bhalf_t,
+    false,
+    true,
+    true,
+    false,
+    128>(BatchedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_no_causalmask_has_attnbias_has_biasgrad_no_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_no_causalmask_has_attnbias_has_biasgrad_no_dropout_maxk_32.cpp
new file mode 100644
index 000000000..9cf279c5b
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_no_causalmask_has_attnbias_has_biasgrad_no_dropout_maxk_32.cpp
@@ -0,0 +1,17 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_batched_backward.h"
+
+template void run_batched_backward_causalmask_bias_dropout_dispatch<
+    ck::bhalf_t,
+    false,
+    true,
+    true,
+    false,
+    32>(BatchedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_no_causalmask_has_attnbias_has_biasgrad_no_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_no_causalmask_has_attnbias_has_biasgrad_no_dropout_maxk_64.cpp
new file mode 100644
index 000000000..62f5b6e56
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_no_causalmask_has_attnbias_has_biasgrad_no_dropout_maxk_64.cpp
@@ -0,0 +1,17 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_batched_backward.h"
+
+template void run_batched_backward_causalmask_bias_dropout_dispatch<
+    ck::bhalf_t,
+    false,
+    true,
+    true,
+    false,
+    64>(BatchedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_no_causalmask_has_attnbias_no_biasgrad_has_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_no_causalmask_has_attnbias_no_biasgrad_has_dropout_maxk_128.cpp
new file mode 100644
index 000000000..afe52ab8b
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_no_causalmask_has_attnbias_no_biasgrad_has_dropout_maxk_128.cpp
@@ -0,0 +1,17 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_batched_backward.h"
+
+template void run_batched_backward_causalmask_bias_dropout_dispatch<
+    ck::bhalf_t,
+    false,
+    true,
+    false,
+    true,
+    128>(BatchedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_no_causalmask_has_attnbias_no_biasgrad_has_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_no_causalmask_has_attnbias_no_biasgrad_has_dropout_maxk_32.cpp
new file mode 100644
index 000000000..5619a5029
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_no_causalmask_has_attnbias_no_biasgrad_has_dropout_maxk_32.cpp
@@ -0,0 +1,17 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_batched_backward.h"
+
+template void run_batched_backward_causalmask_bias_dropout_dispatch<
+    ck::bhalf_t,
+    false,
+    true,
+    false,
+    true,
+    32>(BatchedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_no_causalmask_has_attnbias_no_biasgrad_has_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_no_causalmask_has_attnbias_no_biasgrad_has_dropout_maxk_64.cpp
new file mode 100644
index 000000000..6b04d766a
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_no_causalmask_has_attnbias_no_biasgrad_has_dropout_maxk_64.cpp
@@ -0,0 +1,17 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_batched_backward.h"
+
+template void run_batched_backward_causalmask_bias_dropout_dispatch<
+    ck::bhalf_t,
+    false,
+    true,
+    false,
+    true,
+    64>(BatchedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_no_causalmask_has_attnbias_no_biasgrad_no_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_no_causalmask_has_attnbias_no_biasgrad_no_dropout_maxk_128.cpp
new file mode 100644
index 000000000..693ac4f26
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_no_causalmask_has_attnbias_no_biasgrad_no_dropout_maxk_128.cpp
@@ -0,0 +1,17 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_batched_backward.h"
+
+template void run_batched_backward_causalmask_bias_dropout_dispatch<
+    ck::bhalf_t,
+    false,
+    true,
+    false,
+    false,
+    128>(BatchedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_no_causalmask_has_attnbias_no_biasgrad_no_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_no_causalmask_has_attnbias_no_biasgrad_no_dropout_maxk_32.cpp
new file mode 100644
index 000000000..aa754420b
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_no_causalmask_has_attnbias_no_biasgrad_no_dropout_maxk_32.cpp
@@ -0,0 +1,17 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_batched_backward.h"
+
+template void run_batched_backward_causalmask_bias_dropout_dispatch<
+    ck::bhalf_t,
+    false,
+    true,
+    false,
+    false,
+    32>(BatchedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_no_causalmask_has_attnbias_no_biasgrad_no_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_no_causalmask_has_attnbias_no_biasgrad_no_dropout_maxk_64.cpp
new file mode 100644
index 000000000..04badab08
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_no_causalmask_has_attnbias_no_biasgrad_no_dropout_maxk_64.cpp
@@ -0,0 +1,17 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_batched_backward.h"
+
+template void run_batched_backward_causalmask_bias_dropout_dispatch<
+    ck::bhalf_t,
+    false,
+    true,
+    false,
+    false,
+    64>(BatchedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_no_causalmask_no_attnbias_has_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_no_causalmask_no_attnbias_has_dropout_maxk_128.cpp
new file mode 100644
index 000000000..366e6a68e
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_no_causalmask_no_attnbias_has_dropout_maxk_128.cpp
@@ -0,0 +1,17 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_batched_backward.h"
+
+template void run_batched_backward_causalmask_bias_dropout_dispatch<
+    ck::bhalf_t,
+    false,
+    false,
+    false,
+    true,
+    128>(BatchedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_no_causalmask_no_attnbias_has_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_no_causalmask_no_attnbias_has_dropout_maxk_32.cpp
new file mode 100644
index 000000000..0f0c58743
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_no_causalmask_no_attnbias_has_dropout_maxk_32.cpp
@@ -0,0 +1,17 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_batched_backward.h"
+
+template void run_batched_backward_causalmask_bias_dropout_dispatch<
+    ck::bhalf_t,
+    false,
+    false,
+    false,
+    true,
+    32>(BatchedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_no_causalmask_no_attnbias_has_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_no_causalmask_no_attnbias_has_dropout_maxk_64.cpp
new file mode 100644
index 000000000..2a8279443
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_no_causalmask_no_attnbias_has_dropout_maxk_64.cpp
@@ -0,0 +1,17 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_batched_backward.h"
+
+template void run_batched_backward_causalmask_bias_dropout_dispatch<
+    ck::bhalf_t,
+    false,
+    false,
+    false,
+    true,
+    64>(BatchedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_no_causalmask_no_attnbias_no_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_no_causalmask_no_attnbias_no_dropout_maxk_128.cpp
new file mode 100644
index 000000000..c943f2ea3
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_no_causalmask_no_attnbias_no_dropout_maxk_128.cpp
@@ -0,0 +1,17 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_batched_backward.h"
+
+template void run_batched_backward_causalmask_bias_dropout_dispatch<
+    ck::bhalf_t,
+    false,
+    false,
+    false,
+    false,
+    128>(BatchedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_no_causalmask_no_attnbias_no_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_no_causalmask_no_attnbias_no_dropout_maxk_32.cpp
new file mode 100644
index 000000000..6cfe5c349
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_no_causalmask_no_attnbias_no_dropout_maxk_32.cpp
@@ -0,0 +1,17 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_batched_backward.h"
+
+template void run_batched_backward_causalmask_bias_dropout_dispatch<
+    ck::bhalf_t,
+    false,
+    false,
+    false,
+    false,
+    32>(BatchedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_no_causalmask_no_attnbias_no_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_no_causalmask_no_attnbias_no_dropout_maxk_64.cpp
new file mode 100644
index 000000000..4c2d55d06
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_no_causalmask_no_attnbias_no_dropout_maxk_64.cpp
@@ -0,0 +1,17 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_batched_backward.h"
+
+template void run_batched_backward_causalmask_bias_dropout_dispatch<
+    ck::bhalf_t,
+    false,
+    false,
+    false,
+    false,
+    64>(BatchedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_has_attnbias_has_biasgrad_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_has_attnbias_has_biasgrad_has_dropout_maxk_128.cpp
similarity index 83%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_has_attnbias_has_biasgrad_maxk_128.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_has_attnbias_has_biasgrad_has_dropout_maxk_128.cpp
index cb146d6c5..c7c2bf020 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_has_attnbias_has_biasgrad_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_has_attnbias_has_biasgrad_has_dropout_maxk_128.cpp
@@ -8,9 +8,10 @@
 
 #include "ck_tiled_fmha_batched_backward.h"
 
-template void run_batched_backward_causalmask_bias_dispatch<
+template void run_batched_backward_causalmask_bias_dropout_dispatch<
     ck::half_t,
     true,
     true,
     true,
+    true,
     128>(BatchedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_has_attnbias_has_biasgrad_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_has_attnbias_has_biasgrad_has_dropout_maxk_32.cpp
similarity index 83%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_has_attnbias_has_biasgrad_maxk_32.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_has_attnbias_has_biasgrad_has_dropout_maxk_32.cpp
index 32b7d5373..970c63e14 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_has_attnbias_has_biasgrad_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_has_attnbias_has_biasgrad_has_dropout_maxk_32.cpp
@@ -8,9 +8,10 @@
 
 #include "ck_tiled_fmha_batched_backward.h"
 
-template void run_batched_backward_causalmask_bias_dispatch<
+template void run_batched_backward_causalmask_bias_dropout_dispatch<
     ck::half_t,
     true,
     true,
     true,
+    true,
     32>(BatchedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_has_attnbias_has_biasgrad_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_has_attnbias_has_biasgrad_has_dropout_maxk_64.cpp
similarity index 83%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_has_attnbias_has_biasgrad_maxk_64.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_has_attnbias_has_biasgrad_has_dropout_maxk_64.cpp
index 42e57c6a8..cbde5ad7f 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_has_attnbias_has_biasgrad_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_has_attnbias_has_biasgrad_has_dropout_maxk_64.cpp
@@ -8,9 +8,10 @@
 
 #include "ck_tiled_fmha_batched_backward.h"
 
-template void run_batched_backward_causalmask_bias_dispatch<
+template void run_batched_backward_causalmask_bias_dropout_dispatch<
     ck::half_t,
     true,
     true,
     true,
+    true,
     64>(BatchedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_has_attnbias_no_biasgrad_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_has_attnbias_has_biasgrad_no_dropout_maxk_128.cpp
similarity index 83%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_has_attnbias_no_biasgrad_maxk_128.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_has_attnbias_has_biasgrad_no_dropout_maxk_128.cpp
index 442263f0c..b382ff62f 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_has_attnbias_no_biasgrad_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_has_attnbias_has_biasgrad_no_dropout_maxk_128.cpp
@@ -8,9 +8,10 @@
 
 #include "ck_tiled_fmha_batched_backward.h"
 
-template void run_batched_backward_causalmask_bias_dispatch<
+template void run_batched_backward_causalmask_bias_dropout_dispatch<
     ck::half_t,
     true,
     true,
+    true,
     false,
     128>(BatchedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_has_attnbias_no_biasgrad_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_has_attnbias_has_biasgrad_no_dropout_maxk_32.cpp
similarity index 83%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_has_attnbias_no_biasgrad_maxk_32.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_has_attnbias_has_biasgrad_no_dropout_maxk_32.cpp
index 9d20c01c5..d7b02b3c2 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_has_attnbias_no_biasgrad_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_has_attnbias_has_biasgrad_no_dropout_maxk_32.cpp
@@ -8,9 +8,10 @@
 
 #include "ck_tiled_fmha_batched_backward.h"
 
-template void run_batched_backward_causalmask_bias_dispatch<
+template void run_batched_backward_causalmask_bias_dropout_dispatch<
     ck::half_t,
     true,
     true,
+    true,
     false,
     32>(BatchedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_has_attnbias_no_biasgrad_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_has_attnbias_has_biasgrad_no_dropout_maxk_64.cpp
similarity index 83%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_has_attnbias_no_biasgrad_maxk_64.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_has_attnbias_has_biasgrad_no_dropout_maxk_64.cpp
index 95d62e3da..490fe4261 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_has_attnbias_no_biasgrad_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_has_attnbias_has_biasgrad_no_dropout_maxk_64.cpp
@@ -8,9 +8,10 @@
 
 #include "ck_tiled_fmha_batched_backward.h"
 
-template void run_batched_backward_causalmask_bias_dispatch<
+template void run_batched_backward_causalmask_bias_dropout_dispatch<
     ck::half_t,
     true,
     true,
+    true,
     false,
     64>(BatchedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_has_attnbias_has_biasgrad_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_has_attnbias_no_biasgrad_has_dropout_maxk_128.cpp
similarity index 83%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_has_attnbias_has_biasgrad_maxk_128.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_has_attnbias_no_biasgrad_has_dropout_maxk_128.cpp
index 94477c6a6..9b50b4648 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_has_attnbias_has_biasgrad_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_has_attnbias_no_biasgrad_has_dropout_maxk_128.cpp
@@ -8,9 +8,10 @@
 
 #include "ck_tiled_fmha_batched_backward.h"
 
-template void run_batched_backward_causalmask_bias_dispatch<
+template void run_batched_backward_causalmask_bias_dropout_dispatch<
     ck::half_t,
-    false,
     true,
     true,
+    false,
+    true,
     128>(BatchedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_has_attnbias_has_biasgrad_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_has_attnbias_no_biasgrad_has_dropout_maxk_32.cpp
similarity index 83%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_has_attnbias_has_biasgrad_maxk_32.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_has_attnbias_no_biasgrad_has_dropout_maxk_32.cpp
index 2dc072271..acce3f824 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_has_attnbias_has_biasgrad_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_has_attnbias_no_biasgrad_has_dropout_maxk_32.cpp
@@ -8,9 +8,10 @@
 
 #include "ck_tiled_fmha_batched_backward.h"
 
-template void run_batched_backward_causalmask_bias_dispatch<
+template void run_batched_backward_causalmask_bias_dropout_dispatch<
     ck::half_t,
-    false,
     true,
     true,
+    false,
+    true,
     32>(BatchedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_has_attnbias_has_biasgrad_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_has_attnbias_no_biasgrad_has_dropout_maxk_64.cpp
similarity index 83%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_has_attnbias_has_biasgrad_maxk_64.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_has_attnbias_no_biasgrad_has_dropout_maxk_64.cpp
index abb6f7933..bf3c4e2bb 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_has_attnbias_has_biasgrad_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_has_attnbias_no_biasgrad_has_dropout_maxk_64.cpp
@@ -8,9 +8,10 @@
 
 #include "ck_tiled_fmha_batched_backward.h"
 
-template void run_batched_backward_causalmask_bias_dispatch<
+template void run_batched_backward_causalmask_bias_dropout_dispatch<
     ck::half_t,
-    false,
     true,
     true,
+    false,
+    true,
     64>(BatchedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_no_attnbias_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_has_attnbias_no_biasgrad_no_dropout_maxk_128.cpp
similarity index 83%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_no_attnbias_maxk_128.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_has_attnbias_no_biasgrad_no_dropout_maxk_128.cpp
index 074f41cdb..1dc265944 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_no_attnbias_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_has_attnbias_no_biasgrad_no_dropout_maxk_128.cpp
@@ -8,9 +8,10 @@
 
 #include "ck_tiled_fmha_batched_backward.h"
 
-template void run_batched_backward_causalmask_bias_dispatch<
+template void run_batched_backward_causalmask_bias_dropout_dispatch<
     ck::half_t,
     true,
+    true,
     false,
     false,
     128>(BatchedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_has_attnbias_no_biasgrad_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_has_attnbias_no_biasgrad_no_dropout_maxk_32.cpp
similarity index 83%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_has_attnbias_no_biasgrad_maxk_32.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_has_attnbias_no_biasgrad_no_dropout_maxk_32.cpp
index 77395133b..d6c19a81b 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_has_attnbias_no_biasgrad_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_has_attnbias_no_biasgrad_no_dropout_maxk_32.cpp
@@ -8,9 +8,10 @@
 
 #include "ck_tiled_fmha_batched_backward.h"
 
-template void run_batched_backward_causalmask_bias_dispatch<
+template void run_batched_backward_causalmask_bias_dropout_dispatch<
     ck::half_t,
-    false,
     true,
+    true,
+    false,
     false,
     32>(BatchedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_no_attnbias_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_has_attnbias_no_biasgrad_no_dropout_maxk_64.cpp
similarity index 83%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_no_attnbias_maxk_64.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_has_attnbias_no_biasgrad_no_dropout_maxk_64.cpp
index 50687e28b..290b1c60d 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_no_attnbias_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_has_attnbias_no_biasgrad_no_dropout_maxk_64.cpp
@@ -8,9 +8,10 @@
 
 #include "ck_tiled_fmha_batched_backward.h"
 
-template void run_batched_backward_causalmask_bias_dispatch<
+template void run_batched_backward_causalmask_bias_dropout_dispatch<
     ck::half_t,
     true,
+    true,
     false,
     false,
     64>(BatchedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_has_attnbias_no_biasgrad_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_no_attnbias_has_dropout_maxk_128.cpp
similarity index 83%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_has_attnbias_no_biasgrad_maxk_128.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_no_attnbias_has_dropout_maxk_128.cpp
index 3f2b9ddec..f97b3829d 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_has_attnbias_no_biasgrad_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_no_attnbias_has_dropout_maxk_128.cpp
@@ -8,9 +8,10 @@
 
 #include "ck_tiled_fmha_batched_backward.h"
 
-template void run_batched_backward_causalmask_bias_dispatch<
+template void run_batched_backward_causalmask_bias_dropout_dispatch<
     ck::half_t,
-    false,
     true,
     false,
+    false,
+    true,
     128>(BatchedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_no_attnbias_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_no_attnbias_has_dropout_maxk_32.cpp
similarity index 83%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_no_attnbias_maxk_32.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_no_attnbias_has_dropout_maxk_32.cpp
index cea3242f4..42a2945dd 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_no_attnbias_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_no_attnbias_has_dropout_maxk_32.cpp
@@ -8,9 +8,10 @@
 
 #include "ck_tiled_fmha_batched_backward.h"
 
-template void run_batched_backward_causalmask_bias_dispatch<
+template void run_batched_backward_causalmask_bias_dropout_dispatch<
     ck::half_t,
     true,
     false,
     false,
+    true,
     32>(BatchedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_has_attnbias_no_biasgrad_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_no_attnbias_has_dropout_maxk_64.cpp
similarity index 83%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_has_attnbias_no_biasgrad_maxk_64.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_no_attnbias_has_dropout_maxk_64.cpp
index 1bb5433e9..dd60fbab5 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_has_attnbias_no_biasgrad_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_no_attnbias_has_dropout_maxk_64.cpp
@@ -8,9 +8,10 @@
 
 #include "ck_tiled_fmha_batched_backward.h"
 
-template void run_batched_backward_causalmask_bias_dispatch<
+template void run_batched_backward_causalmask_bias_dropout_dispatch<
     ck::half_t,
-    false,
     true,
     false,
+    false,
+    true,
     64>(BatchedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_no_attnbias_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_no_attnbias_no_dropout_maxk_128.cpp
similarity index 83%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_no_attnbias_maxk_128.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_no_attnbias_no_dropout_maxk_128.cpp
index f0e4e22af..dc07dbddf 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_no_attnbias_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_no_attnbias_no_dropout_maxk_128.cpp
@@ -8,8 +8,9 @@
 
 #include "ck_tiled_fmha_batched_backward.h"
 
-template void run_batched_backward_causalmask_bias_dispatch<
+template void run_batched_backward_causalmask_bias_dropout_dispatch<
     ck::half_t,
+    true,
     false,
     false,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_no_attnbias_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_no_attnbias_no_dropout_maxk_32.cpp
similarity index 83%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_no_attnbias_maxk_32.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_no_attnbias_no_dropout_maxk_32.cpp
index fc49a0182..0800dd7ca 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_no_attnbias_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_no_attnbias_no_dropout_maxk_32.cpp
@@ -8,8 +8,9 @@
 
 #include "ck_tiled_fmha_batched_backward.h"
 
-template void run_batched_backward_causalmask_bias_dispatch<
+template void run_batched_backward_causalmask_bias_dropout_dispatch<
     ck::half_t,
+    true,
     false,
     false,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_no_attnbias_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_no_attnbias_no_dropout_maxk_64.cpp
similarity index 83%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_no_attnbias_maxk_64.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_no_attnbias_no_dropout_maxk_64.cpp
index 8deba9920..d0ea35d54 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_no_attnbias_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_no_attnbias_no_dropout_maxk_64.cpp
@@ -8,8 +8,9 @@
 
 #include "ck_tiled_fmha_batched_backward.h"
 
-template void run_batched_backward_causalmask_bias_dispatch<
+template void run_batched_backward_causalmask_bias_dropout_dispatch<
     ck::half_t,
+    true,
     false,
     false,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_has_attnbias_has_biasgrad_has_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_has_attnbias_has_biasgrad_has_dropout_maxk_128.cpp
new file mode 100644
index 000000000..54b193591
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_has_attnbias_has_biasgrad_has_dropout_maxk_128.cpp
@@ -0,0 +1,17 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_batched_backward.h"
+
+template void run_batched_backward_causalmask_bias_dropout_dispatch<
+    ck::half_t,
+    false,
+    true,
+    true,
+    true,
+    128>(BatchedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_has_attnbias_has_biasgrad_has_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_has_attnbias_has_biasgrad_has_dropout_maxk_32.cpp
new file mode 100644
index 000000000..acc06d663
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_has_attnbias_has_biasgrad_has_dropout_maxk_32.cpp
@@ -0,0 +1,17 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_batched_backward.h"
+
+template void run_batched_backward_causalmask_bias_dropout_dispatch<
+    ck::half_t,
+    false,
+    true,
+    true,
+    true,
+    32>(BatchedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_has_attnbias_has_biasgrad_has_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_has_attnbias_has_biasgrad_has_dropout_maxk_64.cpp
new file mode 100644
index 000000000..349ef3190
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_has_attnbias_has_biasgrad_has_dropout_maxk_64.cpp
@@ -0,0 +1,17 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_batched_backward.h"
+
+template void run_batched_backward_causalmask_bias_dropout_dispatch<
+    ck::half_t,
+    false,
+    true,
+    true,
+    true,
+    64>(BatchedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_has_attnbias_has_biasgrad_no_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_has_attnbias_has_biasgrad_no_dropout_maxk_128.cpp
new file mode 100644
index 000000000..4fc4e8bbd
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_has_attnbias_has_biasgrad_no_dropout_maxk_128.cpp
@@ -0,0 +1,17 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_batched_backward.h"
+
+template void run_batched_backward_causalmask_bias_dropout_dispatch<
+    ck::half_t,
+    false,
+    true,
+    true,
+    false,
+    128>(BatchedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_has_attnbias_has_biasgrad_no_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_has_attnbias_has_biasgrad_no_dropout_maxk_32.cpp
new file mode 100644
index 000000000..82ec79aca
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_has_attnbias_has_biasgrad_no_dropout_maxk_32.cpp
@@ -0,0 +1,17 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_batched_backward.h"
+
+template void run_batched_backward_causalmask_bias_dropout_dispatch<
+    ck::half_t,
+    false,
+    true,
+    true,
+    false,
+    32>(BatchedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_has_attnbias_has_biasgrad_no_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_has_attnbias_has_biasgrad_no_dropout_maxk_64.cpp
new file mode 100644
index 000000000..2d9fb867a
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_has_attnbias_has_biasgrad_no_dropout_maxk_64.cpp
@@ -0,0 +1,17 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_batched_backward.h"
+
+template void run_batched_backward_causalmask_bias_dropout_dispatch<
+    ck::half_t,
+    false,
+    true,
+    true,
+    false,
+    64>(BatchedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_has_attnbias_no_biasgrad_has_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_has_attnbias_no_biasgrad_has_dropout_maxk_128.cpp
new file mode 100644
index 000000000..878d2b968
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_has_attnbias_no_biasgrad_has_dropout_maxk_128.cpp
@@ -0,0 +1,17 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_batched_backward.h"
+
+template void run_batched_backward_causalmask_bias_dropout_dispatch<
+    ck::half_t,
+    false,
+    true,
+    false,
+    true,
+    128>(BatchedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_has_attnbias_no_biasgrad_has_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_has_attnbias_no_biasgrad_has_dropout_maxk_32.cpp
new file mode 100644
index 000000000..5dea3b92d
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_has_attnbias_no_biasgrad_has_dropout_maxk_32.cpp
@@ -0,0 +1,17 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_batched_backward.h"
+
+template void run_batched_backward_causalmask_bias_dropout_dispatch<
+    ck::half_t,
+    false,
+    true,
+    false,
+    true,
+    32>(BatchedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_has_attnbias_no_biasgrad_has_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_has_attnbias_no_biasgrad_has_dropout_maxk_64.cpp
new file mode 100644
index 000000000..614dc4af5
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_has_attnbias_no_biasgrad_has_dropout_maxk_64.cpp
@@ -0,0 +1,17 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_batched_backward.h"
+
+template void run_batched_backward_causalmask_bias_dropout_dispatch<
+    ck::half_t,
+    false,
+    true,
+    false,
+    true,
+    64>(BatchedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_has_attnbias_no_biasgrad_no_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_has_attnbias_no_biasgrad_no_dropout_maxk_128.cpp
new file mode 100644
index 000000000..fae40a708
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_has_attnbias_no_biasgrad_no_dropout_maxk_128.cpp
@@ -0,0 +1,17 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_batched_backward.h"
+
+template void run_batched_backward_causalmask_bias_dropout_dispatch<
+    ck::half_t,
+    false,
+    true,
+    false,
+    false,
+    128>(BatchedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_has_attnbias_no_biasgrad_no_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_has_attnbias_no_biasgrad_no_dropout_maxk_32.cpp
new file mode 100644
index 000000000..1bee92536
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_has_attnbias_no_biasgrad_no_dropout_maxk_32.cpp
@@ -0,0 +1,17 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_batched_backward.h"
+
+template void run_batched_backward_causalmask_bias_dropout_dispatch<
+    ck::half_t,
+    false,
+    true,
+    false,
+    false,
+    32>(BatchedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_has_attnbias_no_biasgrad_no_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_has_attnbias_no_biasgrad_no_dropout_maxk_64.cpp
new file mode 100644
index 000000000..fe583539d
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_has_attnbias_no_biasgrad_no_dropout_maxk_64.cpp
@@ -0,0 +1,17 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_batched_backward.h"
+
+template void run_batched_backward_causalmask_bias_dropout_dispatch<
+    ck::half_t,
+    false,
+    true,
+    false,
+    false,
+    64>(BatchedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_no_attnbias_has_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_no_attnbias_has_dropout_maxk_128.cpp
new file mode 100644
index 000000000..0da1f95e4
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_no_attnbias_has_dropout_maxk_128.cpp
@@ -0,0 +1,17 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_batched_backward.h"
+
+template void run_batched_backward_causalmask_bias_dropout_dispatch<
+    ck::half_t,
+    false,
+    false,
+    false,
+    true,
+    128>(BatchedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_no_attnbias_has_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_no_attnbias_has_dropout_maxk_32.cpp
new file mode 100644
index 000000000..01c850509
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_no_attnbias_has_dropout_maxk_32.cpp
@@ -0,0 +1,17 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_batched_backward.h"
+
+template void run_batched_backward_causalmask_bias_dropout_dispatch<
+    ck::half_t,
+    false,
+    false,
+    false,
+    true,
+    32>(BatchedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_no_attnbias_has_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_no_attnbias_has_dropout_maxk_64.cpp
new file mode 100644
index 000000000..b85f2ac56
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_no_attnbias_has_dropout_maxk_64.cpp
@@ -0,0 +1,17 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_batched_backward.h"
+
+template void run_batched_backward_causalmask_bias_dropout_dispatch<
+    ck::half_t,
+    false,
+    false,
+    false,
+    true,
+    64>(BatchedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_no_attnbias_no_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_no_attnbias_no_dropout_maxk_128.cpp
new file mode 100644
index 000000000..dd77dc88c
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_no_attnbias_no_dropout_maxk_128.cpp
@@ -0,0 +1,17 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_batched_backward.h"
+
+template void run_batched_backward_causalmask_bias_dropout_dispatch<
+    ck::half_t,
+    false,
+    false,
+    false,
+    false,
+    128>(BatchedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_no_attnbias_no_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_no_attnbias_no_dropout_maxk_32.cpp
new file mode 100644
index 000000000..30fc3c1dd
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_no_attnbias_no_dropout_maxk_32.cpp
@@ -0,0 +1,17 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_batched_backward.h"
+
+template void run_batched_backward_causalmask_bias_dropout_dispatch<
+    ck::half_t,
+    false,
+    false,
+    false,
+    false,
+    32>(BatchedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_no_attnbias_no_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_no_attnbias_no_dropout_maxk_64.cpp
new file mode 100644
index 000000000..e6184baf5
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_no_attnbias_no_dropout_maxk_64.cpp
@@ -0,0 +1,17 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_batched_backward.h"
+
+template void run_batched_backward_causalmask_bias_dropout_dispatch<
+    ck::half_t,
+    false,
+    false,
+    false,
+    false,
+    64>(BatchedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_has_causalmask_has_attnbias_has_biasgrad_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_has_causalmask_has_attnbias_has_biasgrad_has_dropout_maxk_128.cpp
similarity index 83%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_has_causalmask_has_attnbias_has_biasgrad_maxk_128.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_has_causalmask_has_attnbias_has_biasgrad_has_dropout_maxk_128.cpp
index 06974cabb..529a8931c 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_has_causalmask_has_attnbias_has_biasgrad_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_has_causalmask_has_attnbias_has_biasgrad_has_dropout_maxk_128.cpp
@@ -8,9 +8,10 @@
 
 #include "ck_tiled_fmha_grouped_backward.h"
 
-template void run_grouped_backward_causalmask_bias_dispatch<
+template void run_grouped_backward_causalmask_bias_dropout_dispatch<
     ck::bhalf_t,
     true,
     true,
     true,
+    true,
     128>(GroupedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_has_causalmask_has_attnbias_has_biasgrad_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_has_causalmask_has_attnbias_has_biasgrad_has_dropout_maxk_32.cpp
similarity index 83%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_has_causalmask_has_attnbias_has_biasgrad_maxk_32.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_has_causalmask_has_attnbias_has_biasgrad_has_dropout_maxk_32.cpp
index 7bc1dafae..eca64f382 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_has_causalmask_has_attnbias_has_biasgrad_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_has_causalmask_has_attnbias_has_biasgrad_has_dropout_maxk_32.cpp
@@ -8,9 +8,10 @@
 
 #include "ck_tiled_fmha_grouped_backward.h"
 
-template void run_grouped_backward_causalmask_bias_dispatch<
+template void run_grouped_backward_causalmask_bias_dropout_dispatch<
     ck::bhalf_t,
     true,
     true,
     true,
+    true,
     32>(GroupedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_has_causalmask_has_attnbias_has_biasgrad_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_has_causalmask_has_attnbias_has_biasgrad_has_dropout_maxk_64.cpp
similarity index 83%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_has_causalmask_has_attnbias_has_biasgrad_maxk_64.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_has_causalmask_has_attnbias_has_biasgrad_has_dropout_maxk_64.cpp
index e08c2d2a0..03de22668 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_has_causalmask_has_attnbias_has_biasgrad_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_has_causalmask_has_attnbias_has_biasgrad_has_dropout_maxk_64.cpp
@@ -8,9 +8,10 @@
 
 #include "ck_tiled_fmha_grouped_backward.h"
 
-template void run_grouped_backward_causalmask_bias_dispatch<
+template void run_grouped_backward_causalmask_bias_dropout_dispatch<
     ck::bhalf_t,
     true,
     true,
     true,
+    true,
     64>(GroupedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_no_causalmask_has_attnbias_has_biasgrad_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_has_causalmask_has_attnbias_has_biasgrad_no_dropout_maxk_128.cpp
similarity index 83%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_no_causalmask_has_attnbias_has_biasgrad_maxk_128.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_has_causalmask_has_attnbias_has_biasgrad_no_dropout_maxk_128.cpp
index fcd672d9e..be2d54836 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_no_causalmask_has_attnbias_has_biasgrad_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_has_causalmask_has_attnbias_has_biasgrad_no_dropout_maxk_128.cpp
@@ -8,9 +8,10 @@
 
 #include "ck_tiled_fmha_grouped_backward.h"
 
-template void run_grouped_backward_causalmask_bias_dispatch<
+template void run_grouped_backward_causalmask_bias_dropout_dispatch<
     ck::bhalf_t,
-    false,
     true,
     true,
+    true,
+    false,
     128>(GroupedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_has_causalmask_has_attnbias_no_biasgrad_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_has_causalmask_has_attnbias_has_biasgrad_no_dropout_maxk_32.cpp
similarity index 83%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_has_causalmask_has_attnbias_no_biasgrad_maxk_32.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_has_causalmask_has_attnbias_has_biasgrad_no_dropout_maxk_32.cpp
index 8bdecd02e..eac3e148d 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_has_causalmask_has_attnbias_no_biasgrad_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_has_causalmask_has_attnbias_has_biasgrad_no_dropout_maxk_32.cpp
@@ -8,9 +8,10 @@
 
 #include "ck_tiled_fmha_grouped_backward.h"
 
-template void run_grouped_backward_causalmask_bias_dispatch<
+template void run_grouped_backward_causalmask_bias_dropout_dispatch<
     ck::bhalf_t,
     true,
     true,
+    true,
     false,
     32>(GroupedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_has_causalmask_has_attnbias_no_biasgrad_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_has_causalmask_has_attnbias_has_biasgrad_no_dropout_maxk_64.cpp
similarity index 83%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_has_causalmask_has_attnbias_no_biasgrad_maxk_64.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_has_causalmask_has_attnbias_has_biasgrad_no_dropout_maxk_64.cpp
index b68b7f0f1..bd0ce8e79 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_has_causalmask_has_attnbias_no_biasgrad_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_has_causalmask_has_attnbias_has_biasgrad_no_dropout_maxk_64.cpp
@@ -8,9 +8,10 @@
 
 #include "ck_tiled_fmha_grouped_backward.h"
 
-template void run_grouped_backward_causalmask_bias_dispatch<
+template void run_grouped_backward_causalmask_bias_dropout_dispatch<
     ck::bhalf_t,
     true,
     true,
+    true,
     false,
     64>(GroupedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_has_causalmask_has_attnbias_no_biasgrad_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_has_causalmask_has_attnbias_no_biasgrad_has_dropout_maxk_128.cpp
similarity index 83%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_has_causalmask_has_attnbias_no_biasgrad_maxk_128.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_has_causalmask_has_attnbias_no_biasgrad_has_dropout_maxk_128.cpp
index 7f745b005..3b24da32a 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_has_causalmask_has_attnbias_no_biasgrad_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_has_causalmask_has_attnbias_no_biasgrad_has_dropout_maxk_128.cpp
@@ -8,9 +8,10 @@
 
 #include "ck_tiled_fmha_grouped_backward.h"
 
-template void run_grouped_backward_causalmask_bias_dispatch<
+template void run_grouped_backward_causalmask_bias_dropout_dispatch<
     ck::bhalf_t,
     true,
     true,
     false,
+    true,
     128>(GroupedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_no_causalmask_has_attnbias_has_biasgrad_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_has_causalmask_has_attnbias_no_biasgrad_has_dropout_maxk_32.cpp
similarity index 83%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_no_causalmask_has_attnbias_has_biasgrad_maxk_32.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_has_causalmask_has_attnbias_no_biasgrad_has_dropout_maxk_32.cpp
index 18151b2ce..ec9f2db83 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_no_causalmask_has_attnbias_has_biasgrad_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_has_causalmask_has_attnbias_no_biasgrad_has_dropout_maxk_32.cpp
@@ -8,9 +8,10 @@
 
 #include "ck_tiled_fmha_grouped_backward.h"
 
-template void run_grouped_backward_causalmask_bias_dispatch<
+template void run_grouped_backward_causalmask_bias_dropout_dispatch<
     ck::bhalf_t,
-    false,
     true,
     true,
+    false,
+    true,
     32>(GroupedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_no_causalmask_has_attnbias_has_biasgrad_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_has_causalmask_has_attnbias_no_biasgrad_has_dropout_maxk_64.cpp
similarity index 83%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_no_causalmask_has_attnbias_has_biasgrad_maxk_64.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_has_causalmask_has_attnbias_no_biasgrad_has_dropout_maxk_64.cpp
index f7f164720..1d0d05754 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_no_causalmask_has_attnbias_has_biasgrad_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_has_causalmask_has_attnbias_no_biasgrad_has_dropout_maxk_64.cpp
@@ -8,9 +8,10 @@
 
 #include "ck_tiled_fmha_grouped_backward.h"
 
-template void run_grouped_backward_causalmask_bias_dispatch<
+template void run_grouped_backward_causalmask_bias_dropout_dispatch<
     ck::bhalf_t,
-    false,
     true,
     true,
+    false,
+    true,
     64>(GroupedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_no_causalmask_has_attnbias_no_biasgrad_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_has_causalmask_has_attnbias_no_biasgrad_no_dropout_maxk_128.cpp
similarity index 83%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_no_causalmask_has_attnbias_no_biasgrad_maxk_128.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_has_causalmask_has_attnbias_no_biasgrad_no_dropout_maxk_128.cpp
index 4c81b91d1..7028cb7dc 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_no_causalmask_has_attnbias_no_biasgrad_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_has_causalmask_has_attnbias_no_biasgrad_no_dropout_maxk_128.cpp
@@ -8,9 +8,10 @@
 
 #include "ck_tiled_fmha_grouped_backward.h"
 
-template void run_grouped_backward_causalmask_bias_dispatch<
+template void run_grouped_backward_causalmask_bias_dropout_dispatch<
     ck::bhalf_t,
-    false,
     true,
+    true,
+    false,
     false,
     128>(GroupedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_no_causalmask_has_attnbias_no_biasgrad_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_has_causalmask_has_attnbias_no_biasgrad_no_dropout_maxk_32.cpp
similarity index 83%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_no_causalmask_has_attnbias_no_biasgrad_maxk_32.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_has_causalmask_has_attnbias_no_biasgrad_no_dropout_maxk_32.cpp
index 4ea3986a5..0a15c5dad 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_no_causalmask_has_attnbias_no_biasgrad_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_has_causalmask_has_attnbias_no_biasgrad_no_dropout_maxk_32.cpp
@@ -8,9 +8,10 @@
 
 #include "ck_tiled_fmha_grouped_backward.h"
 
-template void run_grouped_backward_causalmask_bias_dispatch<
+template void run_grouped_backward_causalmask_bias_dropout_dispatch<
     ck::bhalf_t,
-    false,
     true,
+    true,
+    false,
     false,
     32>(GroupedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_has_causalmask_no_attnbias_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_has_causalmask_has_attnbias_no_biasgrad_no_dropout_maxk_64.cpp
similarity index 83%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_has_causalmask_no_attnbias_maxk_64.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_has_causalmask_has_attnbias_no_biasgrad_no_dropout_maxk_64.cpp
index 42f675373..01d422c00 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_has_causalmask_no_attnbias_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_has_causalmask_has_attnbias_no_biasgrad_no_dropout_maxk_64.cpp
@@ -8,9 +8,10 @@
 
 #include "ck_tiled_fmha_grouped_backward.h"
 
-template void run_grouped_backward_causalmask_bias_dispatch<
+template void run_grouped_backward_causalmask_bias_dropout_dispatch<
     ck::bhalf_t,
     true,
+    true,
     false,
     false,
     64>(GroupedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_has_causalmask_no_attnbias_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_has_causalmask_no_attnbias_has_dropout_maxk_128.cpp
similarity index 83%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_has_causalmask_no_attnbias_maxk_128.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_has_causalmask_no_attnbias_has_dropout_maxk_128.cpp
index bd728f967..4f39ed253 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_has_causalmask_no_attnbias_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_has_causalmask_no_attnbias_has_dropout_maxk_128.cpp
@@ -8,9 +8,10 @@
 
 #include "ck_tiled_fmha_grouped_backward.h"
 
-template void run_grouped_backward_causalmask_bias_dispatch<
+template void run_grouped_backward_causalmask_bias_dropout_dispatch<
     ck::bhalf_t,
     true,
     false,
     false,
+    true,
     128>(GroupedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_has_causalmask_no_attnbias_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_has_causalmask_no_attnbias_has_dropout_maxk_32.cpp
similarity index 83%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_has_causalmask_no_attnbias_maxk_32.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_has_causalmask_no_attnbias_has_dropout_maxk_32.cpp
index 1daa01062..0f586cdc4 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_has_causalmask_no_attnbias_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_has_causalmask_no_attnbias_has_dropout_maxk_32.cpp
@@ -8,9 +8,10 @@
 
 #include "ck_tiled_fmha_grouped_backward.h"
 
-template void run_grouped_backward_causalmask_bias_dispatch<
+template void run_grouped_backward_causalmask_bias_dropout_dispatch<
     ck::bhalf_t,
     true,
     false,
     false,
+    true,
     32>(GroupedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_no_causalmask_has_attnbias_no_biasgrad_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_has_causalmask_no_attnbias_has_dropout_maxk_64.cpp
similarity index 83%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_no_causalmask_has_attnbias_no_biasgrad_maxk_64.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_has_causalmask_no_attnbias_has_dropout_maxk_64.cpp
index 67caf36b2..88ac4b243 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_no_causalmask_has_attnbias_no_biasgrad_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_has_causalmask_no_attnbias_has_dropout_maxk_64.cpp
@@ -8,9 +8,10 @@
 
 #include "ck_tiled_fmha_grouped_backward.h"
 
-template void run_grouped_backward_causalmask_bias_dispatch<
+template void run_grouped_backward_causalmask_bias_dropout_dispatch<
     ck::bhalf_t,
-    false,
     true,
     false,
+    false,
+    true,
     64>(GroupedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_no_causalmask_no_attnbias_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_has_causalmask_no_attnbias_no_dropout_maxk_128.cpp
similarity index 83%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_no_causalmask_no_attnbias_maxk_128.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_has_causalmask_no_attnbias_no_dropout_maxk_128.cpp
index 44e53a806..d1d05d05a 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_no_causalmask_no_attnbias_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_has_causalmask_no_attnbias_no_dropout_maxk_128.cpp
@@ -8,8 +8,9 @@
 
 #include "ck_tiled_fmha_grouped_backward.h"
 
-template void run_grouped_backward_causalmask_bias_dispatch<
+template void run_grouped_backward_causalmask_bias_dropout_dispatch<
     ck::bhalf_t,
+    true,
     false,
     false,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_no_causalmask_no_attnbias_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_has_causalmask_no_attnbias_no_dropout_maxk_32.cpp
similarity index 83%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_no_causalmask_no_attnbias_maxk_32.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_has_causalmask_no_attnbias_no_dropout_maxk_32.cpp
index 9034115fe..8721df90e 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_no_causalmask_no_attnbias_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_has_causalmask_no_attnbias_no_dropout_maxk_32.cpp
@@ -8,8 +8,9 @@
 
 #include "ck_tiled_fmha_grouped_backward.h"
 
-template void run_grouped_backward_causalmask_bias_dispatch<
+template void run_grouped_backward_causalmask_bias_dropout_dispatch<
     ck::bhalf_t,
+    true,
     false,
     false,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_no_causalmask_no_attnbias_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_has_causalmask_no_attnbias_no_dropout_maxk_64.cpp
similarity index 83%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_no_causalmask_no_attnbias_maxk_64.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_has_causalmask_no_attnbias_no_dropout_maxk_64.cpp
index 25e2ba32a..08646ecca 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_no_causalmask_no_attnbias_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_has_causalmask_no_attnbias_no_dropout_maxk_64.cpp
@@ -8,8 +8,9 @@
 
 #include "ck_tiled_fmha_grouped_backward.h"
 
-template void run_grouped_backward_causalmask_bias_dispatch<
+template void run_grouped_backward_causalmask_bias_dropout_dispatch<
     ck::bhalf_t,
+    true,
     false,
     false,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_no_causalmask_has_attnbias_has_biasgrad_has_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_no_causalmask_has_attnbias_has_biasgrad_has_dropout_maxk_128.cpp
new file mode 100644
index 000000000..9cf7db73f
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_no_causalmask_has_attnbias_has_biasgrad_has_dropout_maxk_128.cpp
@@ -0,0 +1,17 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_grouped_backward.h"
+
+template void run_grouped_backward_causalmask_bias_dropout_dispatch<
+    ck::bhalf_t,
+    false,
+    true,
+    true,
+    true,
+    128>(GroupedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_no_causalmask_has_attnbias_has_biasgrad_has_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_no_causalmask_has_attnbias_has_biasgrad_has_dropout_maxk_32.cpp
new file mode 100644
index 000000000..a8d69e619
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_no_causalmask_has_attnbias_has_biasgrad_has_dropout_maxk_32.cpp
@@ -0,0 +1,17 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_grouped_backward.h"
+
+template void run_grouped_backward_causalmask_bias_dropout_dispatch<
+    ck::bhalf_t,
+    false,
+    true,
+    true,
+    true,
+    32>(GroupedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_no_causalmask_has_attnbias_has_biasgrad_has_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_no_causalmask_has_attnbias_has_biasgrad_has_dropout_maxk_64.cpp
new file mode 100644
index 000000000..4391d4d7d
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_no_causalmask_has_attnbias_has_biasgrad_has_dropout_maxk_64.cpp
@@ -0,0 +1,17 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_grouped_backward.h"
+
+template void run_grouped_backward_causalmask_bias_dropout_dispatch<
+    ck::bhalf_t,
+    false,
+    true,
+    true,
+    true,
+    64>(GroupedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_no_causalmask_has_attnbias_has_biasgrad_no_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_no_causalmask_has_attnbias_has_biasgrad_no_dropout_maxk_128.cpp
new file mode 100644
index 000000000..5343b0c3a
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_no_causalmask_has_attnbias_has_biasgrad_no_dropout_maxk_128.cpp
@@ -0,0 +1,17 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_grouped_backward.h"
+
+template void run_grouped_backward_causalmask_bias_dropout_dispatch<
+    ck::bhalf_t,
+    false,
+    true,
+    true,
+    false,
+    128>(GroupedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_no_causalmask_has_attnbias_has_biasgrad_no_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_no_causalmask_has_attnbias_has_biasgrad_no_dropout_maxk_32.cpp
new file mode 100644
index 000000000..a67bb299d
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_no_causalmask_has_attnbias_has_biasgrad_no_dropout_maxk_32.cpp
@@ -0,0 +1,17 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_grouped_backward.h"
+
+template void run_grouped_backward_causalmask_bias_dropout_dispatch<
+    ck::bhalf_t,
+    false,
+    true,
+    true,
+    false,
+    32>(GroupedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_no_causalmask_has_attnbias_has_biasgrad_no_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_no_causalmask_has_attnbias_has_biasgrad_no_dropout_maxk_64.cpp
new file mode 100644
index 000000000..4a3d28b51
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_no_causalmask_has_attnbias_has_biasgrad_no_dropout_maxk_64.cpp
@@ -0,0 +1,17 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_grouped_backward.h"
+
+template void run_grouped_backward_causalmask_bias_dropout_dispatch<
+    ck::bhalf_t,
+    false,
+    true,
+    true,
+    false,
+    64>(GroupedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_no_causalmask_has_attnbias_no_biasgrad_has_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_no_causalmask_has_attnbias_no_biasgrad_has_dropout_maxk_128.cpp
new file mode 100644
index 000000000..148314356
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_no_causalmask_has_attnbias_no_biasgrad_has_dropout_maxk_128.cpp
@@ -0,0 +1,17 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_grouped_backward.h"
+
+template void run_grouped_backward_causalmask_bias_dropout_dispatch<
+    ck::bhalf_t,
+    false,
+    true,
+    false,
+    true,
+    128>(GroupedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_no_causalmask_has_attnbias_no_biasgrad_has_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_no_causalmask_has_attnbias_no_biasgrad_has_dropout_maxk_32.cpp
new file mode 100644
index 000000000..305697e7b
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_no_causalmask_has_attnbias_no_biasgrad_has_dropout_maxk_32.cpp
@@ -0,0 +1,17 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_grouped_backward.h"
+
+template void run_grouped_backward_causalmask_bias_dropout_dispatch<
+    ck::bhalf_t,
+    false,
+    true,
+    false,
+    true,
+    32>(GroupedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_no_causalmask_has_attnbias_no_biasgrad_has_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_no_causalmask_has_attnbias_no_biasgrad_has_dropout_maxk_64.cpp
new file mode 100644
index 000000000..ad7cdd703
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_no_causalmask_has_attnbias_no_biasgrad_has_dropout_maxk_64.cpp
@@ -0,0 +1,17 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_grouped_backward.h"
+
+template void run_grouped_backward_causalmask_bias_dropout_dispatch<
+    ck::bhalf_t,
+    false,
+    true,
+    false,
+    true,
+    64>(GroupedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_no_causalmask_has_attnbias_no_biasgrad_no_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_no_causalmask_has_attnbias_no_biasgrad_no_dropout_maxk_128.cpp
new file mode 100644
index 000000000..fff043eb0
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_no_causalmask_has_attnbias_no_biasgrad_no_dropout_maxk_128.cpp
@@ -0,0 +1,17 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_grouped_backward.h"
+
+template void run_grouped_backward_causalmask_bias_dropout_dispatch<
+    ck::bhalf_t,
+    false,
+    true,
+    false,
+    false,
+    128>(GroupedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_no_causalmask_has_attnbias_no_biasgrad_no_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_no_causalmask_has_attnbias_no_biasgrad_no_dropout_maxk_32.cpp
new file mode 100644
index 000000000..b15836d17
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_no_causalmask_has_attnbias_no_biasgrad_no_dropout_maxk_32.cpp
@@ -0,0 +1,17 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_grouped_backward.h"
+
+template void run_grouped_backward_causalmask_bias_dropout_dispatch<
+    ck::bhalf_t,
+    false,
+    true,
+    false,
+    false,
+    32>(GroupedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_no_causalmask_has_attnbias_no_biasgrad_no_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_no_causalmask_has_attnbias_no_biasgrad_no_dropout_maxk_64.cpp
new file mode 100644
index 000000000..e671f3ca2
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_no_causalmask_has_attnbias_no_biasgrad_no_dropout_maxk_64.cpp
@@ -0,0 +1,17 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_grouped_backward.h"
+
+template void run_grouped_backward_causalmask_bias_dropout_dispatch<
+    ck::bhalf_t,
+    false,
+    true,
+    false,
+    false,
+    64>(GroupedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_no_causalmask_no_attnbias_has_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_no_causalmask_no_attnbias_has_dropout_maxk_128.cpp
new file mode 100644
index 000000000..e9f870c4c
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_no_causalmask_no_attnbias_has_dropout_maxk_128.cpp
@@ -0,0 +1,17 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_grouped_backward.h"
+
+template void run_grouped_backward_causalmask_bias_dropout_dispatch<
+    ck::bhalf_t,
+    false,
+    false,
+    false,
+    true,
+    128>(GroupedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_no_causalmask_no_attnbias_has_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_no_causalmask_no_attnbias_has_dropout_maxk_32.cpp
new file mode 100644
index 000000000..66fc7c9b3
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_no_causalmask_no_attnbias_has_dropout_maxk_32.cpp
@@ -0,0 +1,17 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_grouped_backward.h"
+
+template void run_grouped_backward_causalmask_bias_dropout_dispatch<
+    ck::bhalf_t,
+    false,
+    false,
+    false,
+    true,
+    32>(GroupedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_no_causalmask_no_attnbias_has_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_no_causalmask_no_attnbias_has_dropout_maxk_64.cpp
new file mode 100644
index 000000000..5001ac06e
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_no_causalmask_no_attnbias_has_dropout_maxk_64.cpp
@@ -0,0 +1,17 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_grouped_backward.h"
+
+template void run_grouped_backward_causalmask_bias_dropout_dispatch<
+    ck::bhalf_t,
+    false,
+    false,
+    false,
+    true,
+    64>(GroupedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_no_causalmask_no_attnbias_no_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_no_causalmask_no_attnbias_no_dropout_maxk_128.cpp
new file mode 100644
index 000000000..98836e82a
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_no_causalmask_no_attnbias_no_dropout_maxk_128.cpp
@@ -0,0 +1,17 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_grouped_backward.h"
+
+template void run_grouped_backward_causalmask_bias_dropout_dispatch<
+    ck::bhalf_t,
+    false,
+    false,
+    false,
+    false,
+    128>(GroupedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_no_causalmask_no_attnbias_no_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_no_causalmask_no_attnbias_no_dropout_maxk_32.cpp
new file mode 100644
index 000000000..696e14ca3
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_no_causalmask_no_attnbias_no_dropout_maxk_32.cpp
@@ -0,0 +1,17 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_grouped_backward.h"
+
+template void run_grouped_backward_causalmask_bias_dropout_dispatch<
+    ck::bhalf_t,
+    false,
+    false,
+    false,
+    false,
+    32>(GroupedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_no_causalmask_no_attnbias_no_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_no_causalmask_no_attnbias_no_dropout_maxk_64.cpp
new file mode 100644
index 000000000..1e1226c57
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_no_causalmask_no_attnbias_no_dropout_maxk_64.cpp
@@ -0,0 +1,17 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_grouped_backward.h"
+
+template void run_grouped_backward_causalmask_bias_dropout_dispatch<
+    ck::bhalf_t,
+    false,
+    false,
+    false,
+    false,
+    64>(GroupedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_has_attnbias_has_biasgrad_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_has_attnbias_has_biasgrad_has_dropout_maxk_128.cpp
similarity index 83%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_has_attnbias_has_biasgrad_maxk_128.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_has_attnbias_has_biasgrad_has_dropout_maxk_128.cpp
index fb50648f4..9b7520411 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_has_attnbias_has_biasgrad_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_has_attnbias_has_biasgrad_has_dropout_maxk_128.cpp
@@ -8,9 +8,10 @@
 
 #include "ck_tiled_fmha_grouped_backward.h"
 
-template void run_grouped_backward_causalmask_bias_dispatch<
+template void run_grouped_backward_causalmask_bias_dropout_dispatch<
     ck::half_t,
     true,
     true,
     true,
+    true,
     128>(GroupedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_has_attnbias_has_biasgrad_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_has_attnbias_has_biasgrad_has_dropout_maxk_32.cpp
similarity index 83%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_has_attnbias_has_biasgrad_maxk_32.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_has_attnbias_has_biasgrad_has_dropout_maxk_32.cpp
index a3e58ba19..40c3e2566 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_has_attnbias_has_biasgrad_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_has_attnbias_has_biasgrad_has_dropout_maxk_32.cpp
@@ -8,9 +8,10 @@
 
 #include "ck_tiled_fmha_grouped_backward.h"
 
-template void run_grouped_backward_causalmask_bias_dispatch<
+template void run_grouped_backward_causalmask_bias_dropout_dispatch<
     ck::half_t,
     true,
     true,
     true,
+    true,
     32>(GroupedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_has_attnbias_has_biasgrad_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_has_attnbias_has_biasgrad_has_dropout_maxk_64.cpp
similarity index 83%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_has_attnbias_has_biasgrad_maxk_64.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_has_attnbias_has_biasgrad_has_dropout_maxk_64.cpp
index 445f59fb5..4c1939000 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_has_attnbias_has_biasgrad_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_has_attnbias_has_biasgrad_has_dropout_maxk_64.cpp
@@ -8,9 +8,10 @@
 
 #include "ck_tiled_fmha_grouped_backward.h"
 
-template void run_grouped_backward_causalmask_bias_dispatch<
+template void run_grouped_backward_causalmask_bias_dropout_dispatch<
     ck::half_t,
     true,
     true,
     true,
+    true,
     64>(GroupedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_has_attnbias_no_biasgrad_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_has_attnbias_has_biasgrad_no_dropout_maxk_128.cpp
similarity index 83%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_has_attnbias_no_biasgrad_maxk_128.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_has_attnbias_has_biasgrad_no_dropout_maxk_128.cpp
index 0e6209988..c259e3b89 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_has_attnbias_no_biasgrad_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_has_attnbias_has_biasgrad_no_dropout_maxk_128.cpp
@@ -8,9 +8,10 @@
 
 #include "ck_tiled_fmha_grouped_backward.h"
 
-template void run_grouped_backward_causalmask_bias_dispatch<
+template void run_grouped_backward_causalmask_bias_dropout_dispatch<
     ck::half_t,
     true,
     true,
+    true,
     false,
     128>(GroupedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_has_attnbias_no_biasgrad_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_has_attnbias_has_biasgrad_no_dropout_maxk_32.cpp
similarity index 83%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_has_attnbias_no_biasgrad_maxk_32.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_has_attnbias_has_biasgrad_no_dropout_maxk_32.cpp
index 01d441c5f..8e6d377fc 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_has_attnbias_no_biasgrad_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_has_attnbias_has_biasgrad_no_dropout_maxk_32.cpp
@@ -8,9 +8,10 @@
 
 #include "ck_tiled_fmha_grouped_backward.h"
 
-template void run_grouped_backward_causalmask_bias_dispatch<
+template void run_grouped_backward_causalmask_bias_dropout_dispatch<
     ck::half_t,
     true,
     true,
+    true,
     false,
     32>(GroupedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_has_attnbias_no_biasgrad_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_has_attnbias_has_biasgrad_no_dropout_maxk_64.cpp
similarity index 83%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_has_attnbias_no_biasgrad_maxk_64.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_has_attnbias_has_biasgrad_no_dropout_maxk_64.cpp
index c332b580a..c5ec3f4fb 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_has_attnbias_no_biasgrad_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_has_attnbias_has_biasgrad_no_dropout_maxk_64.cpp
@@ -8,9 +8,10 @@
 
 #include "ck_tiled_fmha_grouped_backward.h"
 
-template void run_grouped_backward_causalmask_bias_dispatch<
+template void run_grouped_backward_causalmask_bias_dropout_dispatch<
     ck::half_t,
     true,
     true,
+    true,
     false,
     64>(GroupedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_has_attnbias_has_biasgrad_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_has_attnbias_no_biasgrad_has_dropout_maxk_128.cpp
similarity index 83%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_has_attnbias_has_biasgrad_maxk_128.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_has_attnbias_no_biasgrad_has_dropout_maxk_128.cpp
index bbfe4fc48..bfc021bc9 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_has_attnbias_has_biasgrad_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_has_attnbias_no_biasgrad_has_dropout_maxk_128.cpp
@@ -8,9 +8,10 @@
 
 #include "ck_tiled_fmha_grouped_backward.h"
 
-template void run_grouped_backward_causalmask_bias_dispatch<
+template void run_grouped_backward_causalmask_bias_dropout_dispatch<
     ck::half_t,
-    false,
     true,
     true,
+    false,
+    true,
     128>(GroupedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_has_attnbias_has_biasgrad_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_has_attnbias_no_biasgrad_has_dropout_maxk_32.cpp
similarity index 83%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_has_attnbias_has_biasgrad_maxk_32.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_has_attnbias_no_biasgrad_has_dropout_maxk_32.cpp
index b0eea03c4..76d4ae719 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_has_attnbias_has_biasgrad_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_has_attnbias_no_biasgrad_has_dropout_maxk_32.cpp
@@ -8,9 +8,10 @@
 
 #include "ck_tiled_fmha_grouped_backward.h"
 
-template void run_grouped_backward_causalmask_bias_dispatch<
+template void run_grouped_backward_causalmask_bias_dropout_dispatch<
     ck::half_t,
-    false,
     true,
     true,
+    false,
+    true,
     32>(GroupedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_has_attnbias_has_biasgrad_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_has_attnbias_no_biasgrad_has_dropout_maxk_64.cpp
similarity index 83%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_has_attnbias_has_biasgrad_maxk_64.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_has_attnbias_no_biasgrad_has_dropout_maxk_64.cpp
index 035e4c43e..a3b402dfa 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_has_attnbias_has_biasgrad_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_has_attnbias_no_biasgrad_has_dropout_maxk_64.cpp
@@ -8,9 +8,10 @@
 
 #include "ck_tiled_fmha_grouped_backward.h"
 
-template void run_grouped_backward_causalmask_bias_dispatch<
+template void run_grouped_backward_causalmask_bias_dropout_dispatch<
     ck::half_t,
-    false,
     true,
     true,
+    false,
+    true,
     64>(GroupedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_no_attnbias_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_has_attnbias_no_biasgrad_no_dropout_maxk_128.cpp
similarity index 83%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_no_attnbias_maxk_128.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_has_attnbias_no_biasgrad_no_dropout_maxk_128.cpp
index 1b61d184a..9b04b655a 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_no_attnbias_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_has_attnbias_no_biasgrad_no_dropout_maxk_128.cpp
@@ -8,9 +8,10 @@
 
 #include "ck_tiled_fmha_grouped_backward.h"
 
-template void run_grouped_backward_causalmask_bias_dispatch<
+template void run_grouped_backward_causalmask_bias_dropout_dispatch<
     ck::half_t,
     true,
+    true,
     false,
     false,
     128>(GroupedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_has_attnbias_no_biasgrad_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_has_attnbias_no_biasgrad_no_dropout_maxk_32.cpp
similarity index 83%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_has_attnbias_no_biasgrad_maxk_32.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_has_attnbias_no_biasgrad_no_dropout_maxk_32.cpp
index a6c364146..b58450208 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_has_attnbias_no_biasgrad_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_has_attnbias_no_biasgrad_no_dropout_maxk_32.cpp
@@ -8,9 +8,10 @@
 
 #include "ck_tiled_fmha_grouped_backward.h"
 
-template void run_grouped_backward_causalmask_bias_dispatch<
+template void run_grouped_backward_causalmask_bias_dropout_dispatch<
     ck::half_t,
-    false,
     true,
+    true,
+    false,
     false,
     32>(GroupedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_no_attnbias_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_has_attnbias_no_biasgrad_no_dropout_maxk_64.cpp
similarity index 83%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_no_attnbias_maxk_64.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_has_attnbias_no_biasgrad_no_dropout_maxk_64.cpp
index 4664327ca..b77d5ceaf 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_no_attnbias_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_has_attnbias_no_biasgrad_no_dropout_maxk_64.cpp
@@ -8,9 +8,10 @@
 
 #include "ck_tiled_fmha_grouped_backward.h"
 
-template void run_grouped_backward_causalmask_bias_dispatch<
+template void run_grouped_backward_causalmask_bias_dropout_dispatch<
     ck::half_t,
     true,
+    true,
     false,
     false,
     64>(GroupedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_has_attnbias_no_biasgrad_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_no_attnbias_has_dropout_maxk_128.cpp
similarity index 83%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_has_attnbias_no_biasgrad_maxk_128.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_no_attnbias_has_dropout_maxk_128.cpp
index f4a38dab8..b4a55a585 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_has_attnbias_no_biasgrad_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_no_attnbias_has_dropout_maxk_128.cpp
@@ -8,9 +8,10 @@
 
 #include "ck_tiled_fmha_grouped_backward.h"
 
-template void run_grouped_backward_causalmask_bias_dispatch<
+template void run_grouped_backward_causalmask_bias_dropout_dispatch<
     ck::half_t,
-    false,
     true,
     false,
+    false,
+    true,
     128>(GroupedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_no_attnbias_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_no_attnbias_has_dropout_maxk_32.cpp
similarity index 83%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_no_attnbias_maxk_32.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_no_attnbias_has_dropout_maxk_32.cpp
index d8ddfbb5d..7d2ed485a 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_no_attnbias_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_no_attnbias_has_dropout_maxk_32.cpp
@@ -8,9 +8,10 @@
 
 #include "ck_tiled_fmha_grouped_backward.h"
 
-template void run_grouped_backward_causalmask_bias_dispatch<
+template void run_grouped_backward_causalmask_bias_dropout_dispatch<
     ck::half_t,
     true,
     false,
     false,
+    true,
     32>(GroupedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_has_attnbias_no_biasgrad_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_no_attnbias_has_dropout_maxk_64.cpp
similarity index 83%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_has_attnbias_no_biasgrad_maxk_64.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_no_attnbias_has_dropout_maxk_64.cpp
index f45d7495f..8ff66d0b0 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_has_attnbias_no_biasgrad_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_no_attnbias_has_dropout_maxk_64.cpp
@@ -8,9 +8,10 @@
 
 #include "ck_tiled_fmha_grouped_backward.h"
 
-template void run_grouped_backward_causalmask_bias_dispatch<
+template void run_grouped_backward_causalmask_bias_dropout_dispatch<
     ck::half_t,
-    false,
     true,
     false,
+    false,
+    true,
     64>(GroupedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_no_attnbias_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_no_attnbias_no_dropout_maxk_128.cpp
similarity index 83%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_no_attnbias_maxk_128.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_no_attnbias_no_dropout_maxk_128.cpp
index 440c1b41a..ba4dee3e8 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_no_attnbias_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_no_attnbias_no_dropout_maxk_128.cpp
@@ -8,8 +8,9 @@
 
 #include "ck_tiled_fmha_grouped_backward.h"
 
-template void run_grouped_backward_causalmask_bias_dispatch<
+template void run_grouped_backward_causalmask_bias_dropout_dispatch<
     ck::half_t,
+    true,
     false,
     false,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_no_attnbias_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_no_attnbias_no_dropout_maxk_32.cpp
similarity index 83%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_no_attnbias_maxk_32.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_no_attnbias_no_dropout_maxk_32.cpp
index cc2945436..9f968835e 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_no_attnbias_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_no_attnbias_no_dropout_maxk_32.cpp
@@ -8,8 +8,9 @@
 
 #include "ck_tiled_fmha_grouped_backward.h"
 
-template void run_grouped_backward_causalmask_bias_dispatch<
+template void run_grouped_backward_causalmask_bias_dropout_dispatch<
     ck::half_t,
+    true,
     false,
     false,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_no_attnbias_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_no_attnbias_no_dropout_maxk_64.cpp
similarity index 83%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_no_attnbias_maxk_64.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_no_attnbias_no_dropout_maxk_64.cpp
index 00b2f08d6..bea50e4e6 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_no_attnbias_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_no_attnbias_no_dropout_maxk_64.cpp
@@ -8,8 +8,9 @@
 
 #include "ck_tiled_fmha_grouped_backward.h"
 
-template void run_grouped_backward_causalmask_bias_dispatch<
+template void run_grouped_backward_causalmask_bias_dropout_dispatch<
     ck::half_t,
+    true,
     false,
     false,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_has_attnbias_has_biasgrad_has_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_has_attnbias_has_biasgrad_has_dropout_maxk_128.cpp
new file mode 100644
index 000000000..ee30cdf9f
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_has_attnbias_has_biasgrad_has_dropout_maxk_128.cpp
@@ -0,0 +1,17 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_grouped_backward.h"
+
+template void run_grouped_backward_causalmask_bias_dropout_dispatch<
+    ck::half_t,
+    false,
+    true,
+    true,
+    true,
+    128>(GroupedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_has_attnbias_has_biasgrad_has_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_has_attnbias_has_biasgrad_has_dropout_maxk_32.cpp
new file mode 100644
index 000000000..68996ba94
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_has_attnbias_has_biasgrad_has_dropout_maxk_32.cpp
@@ -0,0 +1,17 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_grouped_backward.h"
+
+template void run_grouped_backward_causalmask_bias_dropout_dispatch<
+    ck::half_t,
+    false,
+    true,
+    true,
+    true,
+    32>(GroupedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_has_attnbias_has_biasgrad_has_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_has_attnbias_has_biasgrad_has_dropout_maxk_64.cpp
new file mode 100644
index 000000000..90e924410
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_has_attnbias_has_biasgrad_has_dropout_maxk_64.cpp
@@ -0,0 +1,17 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_grouped_backward.h"
+
+template void run_grouped_backward_causalmask_bias_dropout_dispatch<
+    ck::half_t,
+    false,
+    true,
+    true,
+    true,
+    64>(GroupedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_has_attnbias_has_biasgrad_no_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_has_attnbias_has_biasgrad_no_dropout_maxk_128.cpp
new file mode 100644
index 000000000..dca1cfdae
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_has_attnbias_has_biasgrad_no_dropout_maxk_128.cpp
@@ -0,0 +1,17 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_grouped_backward.h"
+
+template void run_grouped_backward_causalmask_bias_dropout_dispatch<
+    ck::half_t,
+    false,
+    true,
+    true,
+    false,
+    128>(GroupedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_has_attnbias_has_biasgrad_no_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_has_attnbias_has_biasgrad_no_dropout_maxk_32.cpp
new file mode 100644
index 000000000..0da0b4fd4
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_has_attnbias_has_biasgrad_no_dropout_maxk_32.cpp
@@ -0,0 +1,17 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_grouped_backward.h"
+
+template void run_grouped_backward_causalmask_bias_dropout_dispatch<
+    ck::half_t,
+    false,
+    true,
+    true,
+    false,
+    32>(GroupedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_has_attnbias_has_biasgrad_no_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_has_attnbias_has_biasgrad_no_dropout_maxk_64.cpp
new file mode 100644
index 000000000..5fb6beace
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_has_attnbias_has_biasgrad_no_dropout_maxk_64.cpp
@@ -0,0 +1,17 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_grouped_backward.h"
+
+template void run_grouped_backward_causalmask_bias_dropout_dispatch<
+    ck::half_t,
+    false,
+    true,
+    true,
+    false,
+    64>(GroupedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_has_attnbias_no_biasgrad_has_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_has_attnbias_no_biasgrad_has_dropout_maxk_128.cpp
new file mode 100644
index 000000000..84478d932
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_has_attnbias_no_biasgrad_has_dropout_maxk_128.cpp
@@ -0,0 +1,17 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_grouped_backward.h"
+
+template void run_grouped_backward_causalmask_bias_dropout_dispatch<
+    ck::half_t,
+    false,
+    true,
+    false,
+    true,
+    128>(GroupedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_has_attnbias_no_biasgrad_has_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_has_attnbias_no_biasgrad_has_dropout_maxk_32.cpp
new file mode 100644
index 000000000..574a1271b
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_has_attnbias_no_biasgrad_has_dropout_maxk_32.cpp
@@ -0,0 +1,17 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_grouped_backward.h"
+
+template void run_grouped_backward_causalmask_bias_dropout_dispatch<
+    ck::half_t,
+    false,
+    true,
+    false,
+    true,
+    32>(GroupedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_has_attnbias_no_biasgrad_has_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_has_attnbias_no_biasgrad_has_dropout_maxk_64.cpp
new file mode 100644
index 000000000..534684ec4
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_has_attnbias_no_biasgrad_has_dropout_maxk_64.cpp
@@ -0,0 +1,17 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_grouped_backward.h"
+
+template void run_grouped_backward_causalmask_bias_dropout_dispatch<
+    ck::half_t,
+    false,
+    true,
+    false,
+    true,
+    64>(GroupedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_has_attnbias_no_biasgrad_no_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_has_attnbias_no_biasgrad_no_dropout_maxk_128.cpp
new file mode 100644
index 000000000..a70c75ccf
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_has_attnbias_no_biasgrad_no_dropout_maxk_128.cpp
@@ -0,0 +1,17 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_grouped_backward.h"
+
+template void run_grouped_backward_causalmask_bias_dropout_dispatch<
+    ck::half_t,
+    false,
+    true,
+    false,
+    false,
+    128>(GroupedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_has_attnbias_no_biasgrad_no_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_has_attnbias_no_biasgrad_no_dropout_maxk_32.cpp
new file mode 100644
index 000000000..62437cb36
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_has_attnbias_no_biasgrad_no_dropout_maxk_32.cpp
@@ -0,0 +1,17 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_grouped_backward.h"
+
+template void run_grouped_backward_causalmask_bias_dropout_dispatch<
+    ck::half_t,
+    false,
+    true,
+    false,
+    false,
+    32>(GroupedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_has_attnbias_no_biasgrad_no_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_has_attnbias_no_biasgrad_no_dropout_maxk_64.cpp
new file mode 100644
index 000000000..d91b9c648
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_has_attnbias_no_biasgrad_no_dropout_maxk_64.cpp
@@ -0,0 +1,17 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_grouped_backward.h"
+
+template void run_grouped_backward_causalmask_bias_dropout_dispatch<
+    ck::half_t,
+    false,
+    true,
+    false,
+    false,
+    64>(GroupedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_no_attnbias_has_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_no_attnbias_has_dropout_maxk_128.cpp
new file mode 100644
index 000000000..cc82da7ef
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_no_attnbias_has_dropout_maxk_128.cpp
@@ -0,0 +1,17 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_grouped_backward.h"
+
+template void run_grouped_backward_causalmask_bias_dropout_dispatch<
+    ck::half_t,
+    false,
+    false,
+    false,
+    true,
+    128>(GroupedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_no_attnbias_has_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_no_attnbias_has_dropout_maxk_32.cpp
new file mode 100644
index 000000000..7a389f87d
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_no_attnbias_has_dropout_maxk_32.cpp
@@ -0,0 +1,17 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_grouped_backward.h"
+
+template void run_grouped_backward_causalmask_bias_dropout_dispatch<
+    ck::half_t,
+    false,
+    false,
+    false,
+    true,
+    32>(GroupedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_no_attnbias_has_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_no_attnbias_has_dropout_maxk_64.cpp
new file mode 100644
index 000000000..2bac6d9f8
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_no_attnbias_has_dropout_maxk_64.cpp
@@ -0,0 +1,17 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_grouped_backward.h"
+
+template void run_grouped_backward_causalmask_bias_dropout_dispatch<
+    ck::half_t,
+    false,
+    false,
+    false,
+    true,
+    64>(GroupedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_no_attnbias_no_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_no_attnbias_no_dropout_maxk_128.cpp
new file mode 100644
index 000000000..cff4bd138
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_no_attnbias_no_dropout_maxk_128.cpp
@@ -0,0 +1,17 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_grouped_backward.h"
+
+template void run_grouped_backward_causalmask_bias_dropout_dispatch<
+    ck::half_t,
+    false,
+    false,
+    false,
+    false,
+    128>(GroupedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_no_attnbias_no_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_no_attnbias_no_dropout_maxk_32.cpp
new file mode 100644
index 000000000..1173b7292
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_no_attnbias_no_dropout_maxk_32.cpp
@@ -0,0 +1,17 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_grouped_backward.h"
+
+template void run_grouped_backward_causalmask_bias_dropout_dispatch<
+    ck::half_t,
+    false,
+    false,
+    false,
+    false,
+    32>(GroupedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_no_attnbias_no_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_no_attnbias_no_dropout_maxk_64.cpp
new file mode 100644
index 000000000..8159058ba
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_no_attnbias_no_dropout_maxk_64.cpp
@@ -0,0 +1,17 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <ck/ck.hpp>
+
+#include "ck_tiled_fmha_grouped_backward.h"
+
+template void run_grouped_backward_causalmask_bias_dropout_dispatch<
+    ck::half_t,
+    false,
+    false,
+    false,
+    false,
+    64>(GroupedBackwardParams& param, hipStream_t stream);

From 51ca91bba0d56f3f2cb31d48159f664104a93a82 Mon Sep 17 00:00:00 2001
From: Qianfeng Zhang <Qianfeng.Zhang@amd.com>
Date: Tue, 23 Apr 2024 11:02:19 +0000
Subject: [PATCH 534/641] Position the composable_kernel_tiled to
 ck_tile/opt_padding_fa_train branch

---
 .gitmodules                         | 2 +-
 third_party/composable_kernel_tiled | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/.gitmodules b/.gitmodules
index 325ca5fbf..e2435dd05 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -7,4 +7,4 @@
 [submodule "third_party/composable_kernel_tiled"]
 	path = third_party/composable_kernel_tiled
 	url = https://github.com/ROCm/composable_kernel-internal.git
-	branch = ck_tile/opt_padding_fa_train_pr2
+	branch = ck_tile/opt_padding_fa_train
diff --git a/third_party/composable_kernel_tiled b/third_party/composable_kernel_tiled
index f949afaea..6c886a030 160000
--- a/third_party/composable_kernel_tiled
+++ b/third_party/composable_kernel_tiled
@@ -1 +1 @@
-Subproject commit f949afaea4abfc426676b7b9cb7e931664f9b5e8
+Subproject commit 6c886a030d1763660f8c519ee28990c3cc3067ae

From 16936839ffa0e4a246153364e276692beca5945e Mon Sep 17 00:00:00 2001
From: Qianfeng Zhang <Qianfeng.Zhang@amd.com>
Date: Tue, 23 Apr 2024 15:06:38 +0000
Subject: [PATCH 535/641] Update to synchronize with the latest commits in
 ck_tile/opt_padding_fa_train

---
 third_party/composable_kernel_tiled           |  2 +-
 .../hip_fmha/ck_tiled_fmha_batched_backward.h | 31 ++++-----
 .../hip_fmha/ck_tiled_fmha_bwd_setting.h      | 69 +++++++++++++------
 .../hip_fmha/ck_tiled_fmha_grouped_backward.h | 31 ++++-----
 4 files changed, 79 insertions(+), 54 deletions(-)

diff --git a/third_party/composable_kernel_tiled b/third_party/composable_kernel_tiled
index 6c886a030..7192a46c6 160000
--- a/third_party/composable_kernel_tiled
+++ b/third_party/composable_kernel_tiled
@@ -1 +1 @@
-Subproject commit 6c886a030d1763660f8c519ee28990c3cc3067ae
+Subproject commit 7192a46c65056b34d436bb74045db36f47aac05c
diff --git a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_backward.h b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_backward.h
index 28cddb133..4c979ecc2 100644
--- a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_backward.h
+++ b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_backward.h
@@ -15,7 +15,6 @@
 
 #include <ck/tile_program/block_tile/block_masking.hpp>
 #include <ck/tile_program/block_tile_pipeline/block_fmha_bwd_dot_do_o.hpp>
-#include <ck/tile_program/block_tile_pipeline/block_fmha_bwd_pipeline_dispatcher.hpp>
 #include <ck/tile_program/block_tile_pipeline/block_fmha_bwd_pipeline_problem.hpp>
 #include <ck/tile_program/tile/tile_fmha_shape.hpp>
 #include <ck/tile_program/tile/tile_fmha_traits.hpp>
@@ -41,8 +40,6 @@ struct batched_backward_causalmask_bias_dropout_dispatch {
       typename FmhaBwdTypeConfig<ScalarType>::KGradDataType,
       typename FmhaBwdTypeConfig<ScalarType>::VGradDataType>>;
 
-  using FmhaBwdLoadStrategy_ = typename FmhaBwdLoadStrategy<MaxK>::type;
-
   template <typename FmhaTraits, typename FmhaMask>
   using FmhaBwdPipelineProblemTemp =
       ck::tile_program::block::BlockFmhaBwdPipelineProblem<
@@ -145,17 +142,19 @@ struct batched_backward_causalmask_bias_dropout_dispatch {
           using FmhaBwdPipelineProblem =
               FmhaBwdPipelineProblemTemp<FmhaBwdTraits_, FmhaMask>;
 
-          using FmhaBwdPipeline_ =
-              typename ck::tile_program::block::BlockFmhaBwdPipelineDispatcher<
-                  FmhaBwdLoadStrategy_,
-                  FmhaBwdPipelineProblem>::BlockPipeline;
+          constexpr auto FmhaBwdPipelineEnum_ =
+              FmhaBwdPipelineEnumSelector<MaxK>::value;
+
+          using FmhaBwdPipeline_ = typename FmhaBwdPipelineMaker<
+              FmhaBwdPipelineEnum_,
+              FmhaBwdPipelineProblem>::pipeline;
 
-          using FmhaBwdQKVGradKernel_ = FmhaBwdQKVGradKernel<
+          using FmhaBwdDQDKDVKernel_ = FmhaBwdDQDKDVKernel<
               FmhaBwdTilePartitioner_,
               FmhaBwdPipeline_,
               FmhaBwdEpilogue_>;
 
-          RunWithBwdQKVGradKernel<FmhaBwdQKVGradKernel_>(param, stream);
+          RunWithBwdDQDKDVKernel<FmhaBwdDQDKDVKernel_>(param, stream);
         });
       });
     };
@@ -197,12 +196,12 @@ struct batched_backward_causalmask_bias_dropout_dispatch {
         kargs);
   }
 
-  template <typename FmhaBwdQKVGradKernel>
-  static void RunWithBwdQKVGradKernel(
+  template <typename FmhaBwdDQDKDVKernel>
+  static void RunWithBwdDQDKDVKernel(
       BatchedBackwardParams& param,
       hipStream_t stream) {
     const auto kargs = [&] {
-      return FmhaBwdQKVGradKernel::MakeKargs(
+      return FmhaBwdDQDKDVKernel::MakeKargs(
           param.q_ptr,
           param.k_ptr,
           param.v_ptr,
@@ -264,13 +263,13 @@ struct batched_backward_causalmask_bias_dropout_dispatch {
           {param.philox_seed, param.philox_offset});
     }();
 
-    dim3 kGridSize = FmhaBwdQKVGradKernel::GridSize(param.B, param.Hq, param.N);
-    constexpr dim3 kBlockSize = FmhaBwdQKVGradKernel::BlockSize();
-    constexpr ck::index_t kBlockPerCu = FmhaBwdQKVGradKernel::kBlockPerCu;
+    dim3 kGridSize = FmhaBwdDQDKDVKernel::GridSize(param.B, param.Hq, param.N);
+    constexpr dim3 kBlockSize = FmhaBwdDQDKDVKernel::BlockSize();
+    constexpr ck::index_t kBlockPerCu = FmhaBwdDQDKDVKernel::kBlockPerCu;
 
     (void)launch_kernel<kBlockSize.x, kBlockPerCu>(
         StreamConfig{stream, false},
-        FmhaBwdQKVGradKernel{},
+        FmhaBwdDQDKDVKernel{},
         kGridSize,
         kBlockSize,
         0,
diff --git a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_bwd_setting.h b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_bwd_setting.h
index 1d004dc8a..08cb7ba2b 100644
--- a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_bwd_setting.h
+++ b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_bwd_setting.h
@@ -6,6 +6,10 @@
  */
 #pragma once
 
+#include <ck/tile_program/block_tile_pipeline/block_fmha_bwd_dq_dk_dv_pipeline_ks_kts_vr.hpp>
+#include <ck/tile_program/block_tile_pipeline/block_fmha_bwd_dq_dk_dv_pipeline_ks_vr.hpp>
+#include <ck/tile_program/block_tile_pipeline/block_fmha_bwd_dq_dk_dv_pipeline_qs_ks_vr_dos.hpp>
+#include <ck/tile_program/block_tile_pipeline/block_fmha_bwd_pipeline_enum.hpp>
 #include <ck/tile_program/tile/tile_fmha_shape.hpp>
 
 template <typename DataType>
@@ -49,24 +53,6 @@ struct FmhaBwdTypeConfig<ck::bhalf_t> {
   using BiasGradDataType = ck::bhalf_t;
 };
 
-template <ck::index_t MaxK>
-struct FmhaBwdLoadStrategy;
-
-template <>
-struct FmhaBwdLoadStrategy<32> {
-  using type = ck::Sequence<true, false, true, false, true, true, false>;
-};
-
-template <>
-struct FmhaBwdLoadStrategy<64> {
-  using type = ck::Sequence<false, false, true, true, true, false, false>;
-};
-
-template <>
-struct FmhaBwdLoadStrategy<128> {
-  using type = ck::Sequence<false, false, true, false, true, false, false>;
-};
-
 template <ck::index_t MaxK>
 struct FmhaBwdBlockTile;
 
@@ -96,7 +82,6 @@ struct FmhaBwdShape;
 template <>
 struct FmhaBwdShape<32> : ck::tile_program::TileFmhaBwdShape<
                               typename FmhaBwdBlockTile<32>::type,
-                              typename FmhaBwdLoadStrategy<32>::type,
                               FmhaBwdBlockWarps0,
                               FmhaBwdWarpTile,
                               FmhaBwdBlockWarps1,
@@ -111,7 +96,6 @@ struct FmhaBwdShape<32> : ck::tile_program::TileFmhaBwdShape<
 template <>
 struct FmhaBwdShape<64> : ck::tile_program::TileFmhaBwdShape<
                               typename FmhaBwdBlockTile<64>::type,
-                              typename FmhaBwdLoadStrategy<64>::type,
                               FmhaBwdBlockWarps0,
                               FmhaBwdWarpTile,
                               FmhaBwdBlockWarps1,
@@ -126,7 +110,6 @@ struct FmhaBwdShape<64> : ck::tile_program::TileFmhaBwdShape<
 template <>
 struct FmhaBwdShape<128> : ck::tile_program::TileFmhaBwdShape<
                                typename FmhaBwdBlockTile<128>::type,
-                               typename FmhaBwdLoadStrategy<128>::type,
                                FmhaBwdBlockWarps0,
                                FmhaBwdWarpTile,
                                FmhaBwdBlockWarps1,
@@ -137,3 +120,47 @@ struct FmhaBwdShape<128> : ck::tile_program::TileFmhaBwdShape<
                                FmhaBwdWarpTile,
                                FmhaBwdBlockWarps2,
                                FmhaBwdWarpTile> {};
+
+template <ck::index_t MaxK>
+struct FmhaBwdPipelineEnumSelector;
+
+template <>
+struct FmhaBwdPipelineEnumSelector<32> {
+  static constexpr ck::BlockFmhaBwdPipelineEnum value =
+      ck::BlockFmhaBwdPipelineEnum::QSKSVROGradS;
+};
+
+template <>
+struct FmhaBwdPipelineEnumSelector<64> {
+  static constexpr ck::BlockFmhaBwdPipelineEnum value =
+      ck::BlockFmhaBwdPipelineEnum::KSKTSVR;
+};
+
+template <>
+struct FmhaBwdPipelineEnumSelector<128> {
+  static constexpr ck::BlockFmhaBwdPipelineEnum value =
+      ck::BlockFmhaBwdPipelineEnum::KSVR;
+};
+
+template <ck::BlockFmhaBwdPipelineEnum value, typename problem>
+struct FmhaBwdPipelineMaker;
+
+template <typename problem>
+struct FmhaBwdPipelineMaker<
+    ck::BlockFmhaBwdPipelineEnum::QSKSVROGradS,
+    problem> {
+  using pipeline =
+      ck::tile_program::block::BlockFmhaBwdDQDKDVPipelineQSKSVROGradS<problem>;
+};
+
+template <typename problem>
+struct FmhaBwdPipelineMaker<ck::BlockFmhaBwdPipelineEnum::KSKTSVR, problem> {
+  using pipeline =
+      ck::tile_program::block::BlockFmhaBwdDQDKDVPipelineKSKTSVR<problem>;
+};
+
+template <typename problem>
+struct FmhaBwdPipelineMaker<ck::BlockFmhaBwdPipelineEnum::KSVR, problem> {
+  using pipeline =
+      ck::tile_program::block::BlockFmhaBwdDQDKDVPipelineKSVR<problem>;
+};
diff --git a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_backward.h b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_backward.h
index 45d3859a6..881f07b52 100644
--- a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_backward.h
+++ b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_backward.h
@@ -15,7 +15,6 @@
 
 #include <ck/tile_program/block_tile/block_masking.hpp>
 #include <ck/tile_program/block_tile_pipeline/block_fmha_bwd_dot_do_o.hpp>
-#include <ck/tile_program/block_tile_pipeline/block_fmha_bwd_pipeline_dispatcher.hpp>
 #include <ck/tile_program/block_tile_pipeline/block_fmha_bwd_pipeline_problem.hpp>
 #include <ck/tile_program/tile/tile_fmha_shape.hpp>
 #include <ck/tile_program/tile/tile_fmha_traits.hpp>
@@ -41,8 +40,6 @@ struct grouped_backward_causalmask_bias_dropout_dispatch {
       typename FmhaBwdTypeConfig<ScalarType>::KGradDataType,
       typename FmhaBwdTypeConfig<ScalarType>::VGradDataType>>;
 
-  using FmhaBwdLoadStrategy_ = typename FmhaBwdLoadStrategy<MaxK>::type;
-
   template <typename FmhaTraits, typename FmhaMask>
   using FmhaBwdPipelineProblemTemp =
       ck::tile_program::block::BlockFmhaBwdPipelineProblem<
@@ -144,17 +141,19 @@ struct grouped_backward_causalmask_bias_dropout_dispatch {
           using FmhaBwdPipelineProblem =
               FmhaBwdPipelineProblemTemp<FmhaBwdTraits_, FmhaMask>;
 
-          using FmhaBwdPipeline_ =
-              typename ck::tile_program::block::BlockFmhaBwdPipelineDispatcher<
-                  FmhaBwdLoadStrategy_,
-                  FmhaBwdPipelineProblem>::BlockPipeline;
+          constexpr auto FmhaBwdPipelineEnum_ =
+              FmhaBwdPipelineEnumSelector<MaxK>::value;
+
+          using FmhaBwdPipeline_ = typename FmhaBwdPipelineMaker<
+              FmhaBwdPipelineEnum_,
+              FmhaBwdPipelineProblem>::pipeline;
 
-          using FmhaBwdQKVGradKernel_ = FmhaBwdQKVGradKernel<
+          using FmhaBwdDQDKDVKernel_ = FmhaBwdDQDKDVKernel<
               FmhaBwdTilePartitioner_,
               FmhaBwdPipeline_,
               FmhaBwdEpilogue_>;
 
-          RunWithBwdQKVGradKernel<FmhaBwdQKVGradKernel_>(param, stream);
+          RunWithBwdDQDKDVKernel<FmhaBwdDQDKDVKernel_>(param, stream);
         });
       });
     };
@@ -194,12 +193,12 @@ struct grouped_backward_causalmask_bias_dropout_dispatch {
         kargs);
   }
 
-  template <typename FmhaBwdQKVGradKernel>
-  static void RunWithBwdQKVGradKernel(
+  template <typename FmhaBwdDQDKDVKernel>
+  static void RunWithBwdDQDKDVKernel(
       GroupedBackwardParams& param,
       hipStream_t stream) {
     const auto kargs = [&] {
-      return FmhaBwdQKVGradKernel::MakeKargs(
+      return FmhaBwdDQDKDVKernel::MakeKargs(
           param.q_ptr,
           param.k_ptr,
           param.v_ptr,
@@ -251,14 +250,14 @@ struct grouped_backward_causalmask_bias_dropout_dispatch {
           {param.philox_seed, param.philox_offset});
     }();
 
-    dim3 kGridSize = FmhaBwdQKVGradKernel::GridSize(
+    dim3 kGridSize = FmhaBwdDQDKDVKernel::GridSize(
         param.num_batches, param.Hq, param.max_seqlen_k);
-    constexpr dim3 kBlockSize = FmhaBwdQKVGradKernel::BlockSize();
-    constexpr ck::index_t kBlockPerCu = FmhaBwdQKVGradKernel::kBlockPerCu;
+    constexpr dim3 kBlockSize = FmhaBwdDQDKDVKernel::BlockSize();
+    constexpr ck::index_t kBlockPerCu = FmhaBwdDQDKDVKernel::kBlockPerCu;
 
     (void)launch_kernel<kBlockSize.x, kBlockPerCu>(
         StreamConfig{stream, false},
-        FmhaBwdQKVGradKernel{},
+        FmhaBwdDQDKDVKernel{},
         kGridSize,
         kBlockSize,
         0,

From b7aa908348e6e453a0c713ec518cd9647047441d Mon Sep 17 00:00:00 2001
From: carlushuang <carlus.huang@amd.com>
Date: Fri, 26 Apr 2024 05:44:41 +0000
Subject: [PATCH 536/641] update submodule to public

---
 .gitmodules | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.gitmodules b/.gitmodules
index e2435dd05..e761e7598 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -6,5 +6,5 @@
 	url = https://github.com/Dao-AILab/flash-attention.git
 [submodule "third_party/composable_kernel_tiled"]
 	path = third_party/composable_kernel_tiled
-	url = https://github.com/ROCm/composable_kernel-internal.git
+	url = https://github.com/ROCm/composable_kernel.git
 	branch = ck_tile/opt_padding_fa_train

From b4fa26da052397a37ff4b4542a01438906467ca4 Mon Sep 17 00:00:00 2001
From: Qianfeng Zhang <Qianfeng.Zhang@amd.com>
Date: Mon, 6 May 2024 08:52:02 +0000
Subject: [PATCH 537/641] Update to the criteria for padding seqlen_k in
 batched infer/forward

---
 .../csrc/attention/hip_fmha/ck_tiled_fmha_batched_forward.h    | 3 ++-
 xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_infer.h | 3 ++-
 2 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_forward.h b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_forward.h
index a0151b979..501f0c675 100644
--- a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_forward.h
+++ b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_forward.h
@@ -69,7 +69,8 @@ struct batched_forward_causalmask_bias_dropout_dispatch {
           (MaxK == 64) ? 3 : ((MaxK == 256) ? 1 : 2);
 
       const bool pad_seqlen_q = !(param.M % FmhaFwdShape_::kM0 == 0);
-      const bool pad_seqlen_k = !(param.N % FmhaFwdShape_::kN0 == 0);
+      const bool pad_seqlen_k =
+          (param.N == 0) || !(param.N % FmhaFwdShape_::kN0 == 0);
       const bool pad_headdim_q =
           !(param.K % FmhaFwdShape_::kK0BlockLength == 0);
       const bool pad_headdim_v = !(param.Kv % FmhaFwdShape_::kN1 == 0);
diff --git a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_infer.h b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_infer.h
index f67d266c1..acd967f14 100644
--- a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_infer.h
+++ b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_infer.h
@@ -70,7 +70,8 @@ struct batched_infer_causalmask_bias_dropout_dispatch {
           (MaxK == 64) ? 3 : ((MaxK == 256) ? 1 : 2);
 
       const bool pad_seqlen_q = !(param.M % FmhaShape::kM0 == 0);
-      const bool pad_seqlen_k = !(param.N % FmhaShape::kN0 == 0);
+      const bool pad_seqlen_k =
+          (param.N == 0) || !(param.N % FmhaShape::kN0 == 0);
       const bool pad_headdim_v = !(param.Kv % FmhaShape::kN1 == 0);
       const bool pad_headdim_q = !(param.K % FmhaShape::kK0BlockLength == 0);
 

From ee7950f5708f3237c7fcea46d22551cc11b4d946 Mon Sep 17 00:00:00 2001
From: Qianfeng Zhang <Qianfeng.Zhang@amd.com>
Date: Mon, 6 May 2024 18:05:51 +0000
Subject: [PATCH 538/641] Keep latest track of ck-tile commits

---
 third_party/composable_kernel_tiled | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/third_party/composable_kernel_tiled b/third_party/composable_kernel_tiled
index 7192a46c6..d1da1e311 160000
--- a/third_party/composable_kernel_tiled
+++ b/third_party/composable_kernel_tiled
@@ -1 +1 @@
-Subproject commit 7192a46c65056b34d436bb74045db36f47aac05c
+Subproject commit d1da1e311891243948c51ea6b58861ceadfd4000

From 74dfdfec159ec55f6f226836342914dee52afadc Mon Sep 17 00:00:00 2001
From: Qianfeng Zhang <Qianfeng.Zhang@amd.com>
Date: Wed, 8 May 2024 08:43:45 +0000
Subject: [PATCH 539/641] Tiny fixing to the decoder including

---
 xformers/csrc/attention/hip_fmha/ck_attention_forward_decoder.h | 2 +-
 .../attention/hip_fmha/ck_attention_forward_decoder_splitk.h    | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/xformers/csrc/attention/hip_fmha/ck_attention_forward_decoder.h b/xformers/csrc/attention/hip_fmha/ck_attention_forward_decoder.h
index 57d54eda2..cc6cdebbc 100644
--- a/xformers/csrc/attention/hip_fmha/ck_attention_forward_decoder.h
+++ b/xformers/csrc/attention/hip_fmha/ck_attention_forward_decoder.h
@@ -6,7 +6,7 @@
  */
 #pragma once
 
-#include <ck/host_utility/kernel_launch_hip.hpp>
+#include <ck/host_utility/kernel_launch.hpp>
 #include <ck/stream_config.hpp>
 #include <ck/tensor_operation/gpu/device/device_base.hpp>
 #include <ck/utility/data_type.hpp>
diff --git a/xformers/csrc/attention/hip_fmha/ck_attention_forward_decoder_splitk.h b/xformers/csrc/attention/hip_fmha/ck_attention_forward_decoder_splitk.h
index 3efe1385c..6d18846e7 100644
--- a/xformers/csrc/attention/hip_fmha/ck_attention_forward_decoder_splitk.h
+++ b/xformers/csrc/attention/hip_fmha/ck_attention_forward_decoder_splitk.h
@@ -1,6 +1,6 @@
 #pragma once
 
-#include <ck/host_utility/kernel_launch_hip.hpp>
+#include <ck/host_utility/kernel_launch.hpp>
 #include <ck/stream_config.hpp>
 #include <ck/tensor_operation/gpu/device/device_base.hpp>
 #include <ck/utility/data_type.hpp>

From 410757e79eb2904e5c1d8b90e8d1a6a21190d930 Mon Sep 17 00:00:00 2001
From: Qianfeng Zhang <Qianfeng.Zhang@amd.com>
Date: Thu, 9 May 2024 08:34:06 +0000
Subject: [PATCH 540/641] Position the ck-tiled to ck_tile/opt_padding branch

---
 .gitmodules                         | 2 +-
 third_party/composable_kernel_tiled | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/.gitmodules b/.gitmodules
index e761e7598..f9d0b3979 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -7,4 +7,4 @@
 [submodule "third_party/composable_kernel_tiled"]
 	path = third_party/composable_kernel_tiled
 	url = https://github.com/ROCm/composable_kernel.git
-	branch = ck_tile/opt_padding_fa_train
+	branch = ck_tile/opt_padding
diff --git a/third_party/composable_kernel_tiled b/third_party/composable_kernel_tiled
index d1da1e311..dca9abd86 160000
--- a/third_party/composable_kernel_tiled
+++ b/third_party/composable_kernel_tiled
@@ -1 +1 @@
-Subproject commit d1da1e311891243948c51ea6b58861ceadfd4000
+Subproject commit dca9abd86e6c601792f9ce704b6b2c18de081cb1

From 92924d4e8b60b5b19ec5a9e37ca3888db703f0b5 Mon Sep 17 00:00:00 2001
From: Qianfeng Zhang <Qianfeng.Zhang@amd.com>
Date: Sat, 11 May 2024 14:14:40 +0000
Subject: [PATCH 541/641] Enable some attn_bias types which were previously
 disabled by old-ck in ck.py

---
 xformers/ops/fmha/ck.py | 8 +++-----
 1 file changed, 3 insertions(+), 5 deletions(-)

diff --git a/xformers/ops/fmha/ck.py b/xformers/ops/fmha/ck.py
index acc06f438..9a2330f49 100644
--- a/xformers/ops/fmha/ck.py
+++ b/xformers/ops/fmha/ck.py
@@ -365,16 +365,14 @@ class BwOp(AttentionBwOpBase):
         type(None),
         torch.Tensor,
         LowerTriangularMask,
-        # LowerTriangularFromBottomRightMask,
-        # TODO: Still some infs/nans in the BW pass for
-        # local + causal
-        # LowerTriangularFromBottomRightLocalAttentionMask,
+        LowerTriangularFromBottomRightMask,
+        LowerTriangularFromBottomRightLocalAttentionMask,
         # TODO: Fix handling of gradient through the fMHA autograd function
         # LowerTriangularMaskWithTensorBias,
         BlockDiagonalMask,
         BlockDiagonalCausalMask,
         attn_bias.BlockDiagonalCausalFromBottomRightMask,
-        # attn_bias.BlockDiagonalCausalLocalAttentionMask,
+        attn_bias.BlockDiagonalCausalLocalAttentionMask,
     }
     SUPPORTS_ATTN_BIAS_GRAD = True
     SUPPORTS_DROPOUT = FwOp.SUPPORTS_DROPOUT

From 23f64bd0ae6e06296d570f08d1d52bf1bed2ad56 Mon Sep 17 00:00:00 2001
From: Qianfeng Zhang <Qianfeng.Zhang@amd.com>
Date: Tue, 14 May 2024 15:07:45 +0000
Subject: [PATCH 542/641] Add script generate_instances.py which helps to
 generate instances

---
 .../attention/hip_fmha/generate_instances.py  | 192 ++++++++++++++++++
 ...ias_has_biasgrad_has_dropout_maxk_128.cpp} |   5 +-
 ...bias_has_biasgrad_has_dropout_maxk_32.cpp} |   5 +-
 ...bias_has_biasgrad_has_dropout_maxk_64.cpp} |   5 +-
 ...bias_has_biasgrad_no_dropout_maxk_128.cpp} |   7 +-
 ..._bias_has_biasgrad_no_dropout_maxk_32.cpp} |   5 +-
 ..._bias_has_biasgrad_no_dropout_maxk_64.cpp} |   7 +-
 ...bias_no_biasgrad_has_dropout_maxk_128.cpp} |   5 +-
 ..._bias_no_biasgrad_has_dropout_maxk_32.cpp} |   7 +-
 ..._bias_no_biasgrad_has_dropout_maxk_64.cpp} |   7 +-
 ..._bias_no_biasgrad_no_dropout_maxk_128.cpp} |   7 +-
 ...s_bias_no_biasgrad_no_dropout_maxk_32.cpp} |   9 +-
 ...s_bias_no_biasgrad_no_dropout_maxk_64.cpp} |   9 +-
 ...bias_no_biasgrad_has_dropout_maxk_128.cpp} |   7 +-
 ..._bias_no_biasgrad_has_dropout_maxk_32.cpp} |   5 +-
 ..._bias_no_biasgrad_has_dropout_maxk_64.cpp} |   9 +-
 ..._bias_no_biasgrad_no_dropout_maxk_128.cpp} |   7 +-
 ...o_bias_no_biasgrad_no_dropout_maxk_32.cpp} |   5 +-
 ...o_bias_no_biasgrad_no_dropout_maxk_64.cpp} |   5 +-
 ...ias_has_biasgrad_has_dropout_maxk_128.cpp} |   7 +-
 ...bias_has_biasgrad_has_dropout_maxk_32.cpp} |   7 +-
 ...bias_has_biasgrad_has_dropout_maxk_64.cpp} |   5 +-
 ...bias_has_biasgrad_no_dropout_maxk_128.cpp} |   7 +-
 ..._bias_has_biasgrad_no_dropout_maxk_32.cpp} |   5 +-
 ..._bias_has_biasgrad_no_dropout_maxk_64.cpp} |   9 +-
 ...bias_no_biasgrad_has_dropout_maxk_128.cpp} |   7 +-
 ..._bias_no_biasgrad_has_dropout_maxk_32.cpp} |   9 +-
 ..._bias_no_biasgrad_has_dropout_maxk_64.cpp} |   9 +-
 ..._bias_no_biasgrad_no_dropout_maxk_128.cpp} |   7 +-
 ...s_bias_no_biasgrad_no_dropout_maxk_32.cpp} |   5 +-
 ...s_bias_no_biasgrad_no_dropout_maxk_64.cpp} |   5 +-
 ...bias_no_biasgrad_has_dropout_maxk_128.cpp} |   5 +-
 ..._bias_no_biasgrad_has_dropout_maxk_32.cpp} |   5 +-
 ..._bias_no_biasgrad_has_dropout_maxk_64.cpp} |   5 +-
 ..._bias_no_biasgrad_no_dropout_maxk_128.cpp} |   5 +-
 ...o_bias_no_biasgrad_no_dropout_maxk_32.cpp} |   5 +-
 ...o_bias_no_biasgrad_no_dropout_maxk_64.cpp} |   5 +-
 ...ias_has_biasgrad_has_dropout_maxk_128.cpp} |   5 +-
 ...bias_has_biasgrad_has_dropout_maxk_32.cpp} |   5 +-
 ...bias_has_biasgrad_has_dropout_maxk_64.cpp} |   5 +-
 ...bias_has_biasgrad_no_dropout_maxk_128.cpp} |   5 +-
 ..._bias_has_biasgrad_no_dropout_maxk_32.cpp} |   5 +-
 ..._bias_has_biasgrad_no_dropout_maxk_64.cpp} |   5 +-
 ...bias_no_biasgrad_has_dropout_maxk_128.cpp} |   5 +-
 ..._bias_no_biasgrad_has_dropout_maxk_32.cpp} |   5 +-
 ..._bias_no_biasgrad_has_dropout_maxk_64.cpp} |   5 +-
 ..._bias_no_biasgrad_no_dropout_maxk_128.cpp} |   9 +-
 ...s_bias_no_biasgrad_no_dropout_maxk_32.cpp} |   9 +-
 ...s_bias_no_biasgrad_no_dropout_maxk_64.cpp} |   9 +-
 ...bias_no_biasgrad_has_dropout_maxk_128.cpp} |   9 +-
 ..._bias_no_biasgrad_has_dropout_maxk_32.cpp} |   9 +-
 ..._bias_no_biasgrad_has_dropout_maxk_64.cpp} |   9 +-
 ..._bias_no_biasgrad_no_dropout_maxk_128.cpp} |   7 +-
 ...o_bias_no_biasgrad_no_dropout_maxk_32.cpp} |   7 +-
 ...o_bias_no_biasgrad_no_dropout_maxk_64.cpp} |   7 +-
 ...ias_has_biasgrad_has_dropout_maxk_128.cpp} |   5 +-
 ...bias_has_biasgrad_has_dropout_maxk_32.cpp} |   5 +-
 ...bias_has_biasgrad_has_dropout_maxk_64.cpp} |   5 +-
 ...bias_has_biasgrad_no_dropout_maxk_128.cpp} |   7 +-
 ..._bias_has_biasgrad_no_dropout_maxk_32.cpp} |   7 +-
 ..._bias_has_biasgrad_no_dropout_maxk_64.cpp} |   7 +-
 ...bias_no_biasgrad_has_dropout_maxk_128.cpp} |   7 +-
 ..._bias_no_biasgrad_has_dropout_maxk_32.cpp} |   7 +-
 ..._bias_no_biasgrad_has_dropout_maxk_64.cpp} |   7 +-
 ..._bias_no_biasgrad_no_dropout_maxk_128.cpp} |   7 +-
 ...s_bias_no_biasgrad_no_dropout_maxk_32.cpp} |   7 +-
 ...s_bias_no_biasgrad_no_dropout_maxk_64.cpp} |   7 +-
 ...bias_no_biasgrad_has_dropout_maxk_128.cpp} |   5 +-
 ..._bias_no_biasgrad_has_dropout_maxk_32.cpp} |   5 +-
 ..._bias_no_biasgrad_has_dropout_maxk_64.cpp} |   5 +-
 ..._bias_no_biasgrad_no_dropout_maxk_128.cpp} |   5 +-
 ...o_bias_no_biasgrad_no_dropout_maxk_32.cpp} |   5 +-
 ...o_bias_no_biasgrad_no_dropout_maxk_64.cpp} |   5 +-
 ...salmask_has_bias_has_dropout_maxk_128.cpp} |   5 +-
 ...salmask_has_bias_has_dropout_maxk_256.cpp} |   5 +-
 ...usalmask_has_bias_has_dropout_maxk_32.cpp} |   5 +-
 ...usalmask_has_bias_has_dropout_maxk_64.cpp} |   5 +-
 ...usalmask_has_bias_no_dropout_maxk_128.cpp} |   7 +-
 ...usalmask_has_bias_no_dropout_maxk_256.cpp} |   7 +-
 ...ausalmask_has_bias_no_dropout_maxk_32.cpp} |   7 +-
 ...ausalmask_has_bias_no_dropout_maxk_64.cpp} |   5 +-
 ...usalmask_no_bias_has_dropout_maxk_128.cpp} |   7 +-
 ...usalmask_no_bias_has_dropout_maxk_256.cpp} |   7 +-
 ...ausalmask_no_bias_has_dropout_maxk_32.cpp} |   7 +-
 ...ausalmask_no_bias_has_dropout_maxk_64.cpp} |   5 +-
 ...ausalmask_no_bias_no_dropout_maxk_128.cpp} |   7 +-
 ...ausalmask_no_bias_no_dropout_maxk_256.cpp} |   7 +-
 ...causalmask_no_bias_no_dropout_maxk_32.cpp} |   7 +-
 ...causalmask_no_bias_no_dropout_maxk_64.cpp} |   7 +-
 ...salmask_has_bias_has_dropout_maxk_128.cpp} |   7 +-
 ...salmask_has_bias_has_dropout_maxk_256.cpp} |   7 +-
 ...usalmask_has_bias_has_dropout_maxk_32.cpp} |   7 +-
 ...usalmask_has_bias_has_dropout_maxk_64.cpp} |   5 +-
 ...usalmask_has_bias_no_dropout_maxk_128.cpp} |   7 +-
 ...usalmask_has_bias_no_dropout_maxk_256.cpp} |   7 +-
 ...ausalmask_has_bias_no_dropout_maxk_32.cpp} |   7 +-
 ...ausalmask_has_bias_no_dropout_maxk_64.cpp} |   7 +-
 ...usalmask_no_bias_has_dropout_maxk_128.cpp} |   7 +-
 ...usalmask_no_bias_has_dropout_maxk_256.cpp} |   7 +-
 ...ausalmask_no_bias_has_dropout_maxk_32.cpp} |   5 +-
 ...ausalmask_no_bias_has_dropout_maxk_64.cpp} |   7 +-
 ...ausalmask_no_bias_no_dropout_maxk_128.cpp} |   5 +-
 ...ausalmask_no_bias_no_dropout_maxk_256.cpp} |   5 +-
 ...causalmask_no_bias_no_dropout_maxk_32.cpp} |   5 +-
 ...causalmask_no_bias_no_dropout_maxk_64.cpp} |   5 +-
 ...salmask_has_bias_has_dropout_maxk_128.cpp} |   5 +-
 ...salmask_has_bias_has_dropout_maxk_256.cpp} |   5 +-
 ...usalmask_has_bias_has_dropout_maxk_32.cpp} |   5 +-
 ...usalmask_has_bias_has_dropout_maxk_64.cpp} |   5 +-
 ...usalmask_has_bias_no_dropout_maxk_128.cpp} |   5 +-
 ...usalmask_has_bias_no_dropout_maxk_256.cpp} |   5 +-
 ...ausalmask_has_bias_no_dropout_maxk_32.cpp} |   5 +-
 ...ausalmask_has_bias_no_dropout_maxk_64.cpp} |   5 +-
 ...usalmask_no_bias_has_dropout_maxk_128.cpp} |   5 +-
 ...usalmask_no_bias_has_dropout_maxk_256.cpp} |   5 +-
 ...ausalmask_no_bias_has_dropout_maxk_32.cpp} |   7 +-
 ...ausalmask_no_bias_has_dropout_maxk_64.cpp} |   5 +-
 ...ausalmask_no_bias_no_dropout_maxk_128.cpp} |   7 +-
 ...ausalmask_no_bias_no_dropout_maxk_256.cpp} |   7 +-
 ...causalmask_no_bias_no_dropout_maxk_32.cpp} |   7 +-
 ...causalmask_no_bias_no_dropout_maxk_64.cpp} |   7 +-
 ...salmask_has_bias_has_dropout_maxk_128.cpp} |   5 +-
 ...salmask_has_bias_has_dropout_maxk_256.cpp} |   5 +-
 ...usalmask_has_bias_has_dropout_maxk_32.cpp} |   7 +-
 ...usalmask_has_bias_has_dropout_maxk_64.cpp} |   5 +-
 ...usalmask_has_bias_no_dropout_maxk_128.cpp} |   7 +-
 ...usalmask_has_bias_no_dropout_maxk_256.cpp} |   7 +-
 ...ausalmask_has_bias_no_dropout_maxk_32.cpp} |   7 +-
 ...ausalmask_has_bias_no_dropout_maxk_64.cpp} |   7 +-
 ...usalmask_no_bias_has_dropout_maxk_128.cpp} |   5 +-
 ...usalmask_no_bias_has_dropout_maxk_256.cpp} |   5 +-
 ...ausalmask_no_bias_has_dropout_maxk_32.cpp} |   5 +-
 ...ausalmask_no_bias_has_dropout_maxk_64.cpp} |   5 +-
 ...ausalmask_no_bias_no_dropout_maxk_128.cpp} |   5 +-
 ...ausalmask_no_bias_no_dropout_maxk_256.cpp} |   5 +-
 ...causalmask_no_bias_no_dropout_maxk_32.cpp} |   5 +-
 ...causalmask_no_bias_no_dropout_maxk_64.cpp} |   5 +-
 ...salmask_has_bias_has_dropout_maxk_128.cpp} |   5 +-
 ...salmask_has_bias_has_dropout_maxk_256.cpp} |   5 +-
 ...usalmask_has_bias_has_dropout_maxk_32.cpp} |   5 +-
 ...usalmask_has_bias_has_dropout_maxk_64.cpp} |   5 +-
 ...usalmask_has_bias_no_dropout_maxk_128.cpp} |   5 +-
 ...usalmask_has_bias_no_dropout_maxk_256.cpp} |   7 +-
 ...ausalmask_has_bias_no_dropout_maxk_32.cpp} |   7 +-
 ...ausalmask_has_bias_no_dropout_maxk_64.cpp} |   5 +-
 ...usalmask_no_bias_has_dropout_maxk_128.cpp} |   5 +-
 ...usalmask_no_bias_has_dropout_maxk_256.cpp} |   5 +-
 ...ausalmask_no_bias_has_dropout_maxk_32.cpp} |   7 +-
 ...ausalmask_no_bias_has_dropout_maxk_64.cpp} |   5 +-
 ...ausalmask_no_bias_no_dropout_maxk_128.cpp} |   7 +-
 ...ausalmask_no_bias_no_dropout_maxk_256.cpp} |   7 +-
 ...causalmask_no_bias_no_dropout_maxk_32.cpp} |   7 +-
 ...causalmask_no_bias_no_dropout_maxk_64.cpp} |   7 +-
 ...salmask_has_bias_has_dropout_maxk_128.cpp} |   5 +-
 ...salmask_has_bias_has_dropout_maxk_256.cpp} |   7 +-
 ...usalmask_has_bias_has_dropout_maxk_32.cpp} |   7 +-
 ...usalmask_has_bias_has_dropout_maxk_64.cpp} |   5 +-
 ...usalmask_has_bias_no_dropout_maxk_128.cpp} |   7 +-
 ...usalmask_has_bias_no_dropout_maxk_256.cpp} |   7 +-
 ...ausalmask_has_bias_no_dropout_maxk_32.cpp} |   7 +-
 ...ausalmask_has_bias_no_dropout_maxk_64.cpp} |   7 +-
 ...usalmask_no_bias_has_dropout_maxk_128.cpp} |   7 +-
 ...usalmask_no_bias_has_dropout_maxk_256.cpp} |   7 +-
 ...ausalmask_no_bias_has_dropout_maxk_32.cpp} |   7 +-
 ...ausalmask_no_bias_has_dropout_maxk_64.cpp} |   7 +-
 ...ausalmask_no_bias_no_dropout_maxk_128.cpp} |   5 +-
 ...ausalmask_no_bias_no_dropout_maxk_256.cpp} |   5 +-
 ...causalmask_no_bias_no_dropout_maxk_32.cpp} |   5 +-
 ...causalmask_no_bias_no_dropout_maxk_64.cpp} |   5 +-
 ...salmask_has_bias_has_dropout_maxk_128.cpp} |   5 +-
 ...salmask_has_bias_has_dropout_maxk_256.cpp} |   5 +-
 ...usalmask_has_bias_has_dropout_maxk_32.cpp} |   5 +-
 ...usalmask_has_bias_has_dropout_maxk_64.cpp} |   5 +-
 ...usalmask_has_bias_no_dropout_maxk_128.cpp} |   5 +-
 ...usalmask_has_bias_no_dropout_maxk_256.cpp} |   5 +-
 ...ausalmask_has_bias_no_dropout_maxk_32.cpp} |   5 +-
 ...ausalmask_has_bias_no_dropout_maxk_64.cpp} |   5 +-
 ...usalmask_no_bias_has_dropout_maxk_128.cpp} |   5 +-
 ...usalmask_no_bias_has_dropout_maxk_256.cpp} |   5 +-
 ...ausalmask_no_bias_has_dropout_maxk_32.cpp} |   5 +-
 ...ausalmask_no_bias_has_dropout_maxk_64.cpp} |   5 +-
 ...ausalmask_no_bias_no_dropout_maxk_128.cpp} |   7 +-
 ...ausalmask_no_bias_no_dropout_maxk_256.cpp} |   7 +-
 ...causalmask_no_bias_no_dropout_maxk_32.cpp} |   7 +-
 ...causalmask_no_bias_no_dropout_maxk_64.cpp} |   7 +-
 ...salmask_has_bias_has_dropout_maxk_128.cpp} |   5 +-
 ...salmask_has_bias_has_dropout_maxk_256.cpp} |   5 +-
 ...usalmask_has_bias_has_dropout_maxk_32.cpp} |   5 +-
 ...usalmask_has_bias_has_dropout_maxk_64.cpp} |   5 +-
 ...usalmask_has_bias_no_dropout_maxk_128.cpp} |   7 +-
 ...usalmask_has_bias_no_dropout_maxk_256.cpp} |   7 +-
 ...ausalmask_has_bias_no_dropout_maxk_32.cpp} |   7 +-
 ...ausalmask_has_bias_no_dropout_maxk_64.cpp} |   7 +-
 ...usalmask_no_bias_has_dropout_maxk_128.cpp} |   5 +-
 ...usalmask_no_bias_has_dropout_maxk_256.cpp} |   5 +-
 ...ausalmask_no_bias_has_dropout_maxk_32.cpp} |   5 +-
 ...ausalmask_no_bias_has_dropout_maxk_64.cpp} |   5 +-
 ...ausalmask_no_bias_no_dropout_maxk_128.cpp} |   5 +-
 ...ausalmask_no_bias_no_dropout_maxk_256.cpp} |   5 +-
 ...causalmask_no_bias_no_dropout_maxk_32.cpp} |   5 +-
 ...causalmask_no_bias_no_dropout_maxk_64.cpp} |   5 +-
 ...ias_has_biasgrad_has_dropout_maxk_128.cpp} |   5 +-
 ...bias_has_biasgrad_has_dropout_maxk_32.cpp} |   5 +-
 ...bias_has_biasgrad_has_dropout_maxk_64.cpp} |   5 +-
 ...bias_has_biasgrad_no_dropout_maxk_128.cpp} |   7 +-
 ..._bias_has_biasgrad_no_dropout_maxk_32.cpp} |   5 +-
 ..._bias_has_biasgrad_no_dropout_maxk_64.cpp} |   7 +-
 ...bias_no_biasgrad_has_dropout_maxk_128.cpp} |   5 +-
 ..._bias_no_biasgrad_has_dropout_maxk_32.cpp} |   7 +-
 ..._bias_no_biasgrad_has_dropout_maxk_64.cpp} |   7 +-
 ..._bias_no_biasgrad_no_dropout_maxk_128.cpp} |   7 +-
 ...s_bias_no_biasgrad_no_dropout_maxk_32.cpp} |   9 +-
 ...s_bias_no_biasgrad_no_dropout_maxk_64.cpp} |   9 +-
 ...bias_no_biasgrad_has_dropout_maxk_128.cpp} |   7 +-
 ..._bias_no_biasgrad_has_dropout_maxk_32.cpp} |   5 +-
 ..._bias_no_biasgrad_has_dropout_maxk_64.cpp} |   9 +-
 ..._bias_no_biasgrad_no_dropout_maxk_128.cpp} |   7 +-
 ...o_bias_no_biasgrad_no_dropout_maxk_32.cpp} |   5 +-
 ...o_bias_no_biasgrad_no_dropout_maxk_64.cpp} |   5 +-
 ...ias_has_biasgrad_has_dropout_maxk_128.cpp} |   7 +-
 ...bias_has_biasgrad_has_dropout_maxk_32.cpp} |   7 +-
 ...bias_has_biasgrad_has_dropout_maxk_64.cpp} |   5 +-
 ...bias_has_biasgrad_no_dropout_maxk_128.cpp} |   7 +-
 ..._bias_has_biasgrad_no_dropout_maxk_32.cpp} |   5 +-
 ..._bias_has_biasgrad_no_dropout_maxk_64.cpp} |   9 +-
 ...bias_no_biasgrad_has_dropout_maxk_128.cpp} |   7 +-
 ..._bias_no_biasgrad_has_dropout_maxk_32.cpp} |   9 +-
 ..._bias_no_biasgrad_has_dropout_maxk_64.cpp} |   9 +-
 ..._bias_no_biasgrad_no_dropout_maxk_128.cpp} |   7 +-
 ...s_bias_no_biasgrad_no_dropout_maxk_32.cpp} |   5 +-
 ...s_bias_no_biasgrad_no_dropout_maxk_64.cpp} |   5 +-
 ...bias_no_biasgrad_has_dropout_maxk_128.cpp} |   5 +-
 ..._bias_no_biasgrad_has_dropout_maxk_32.cpp} |   5 +-
 ..._bias_no_biasgrad_has_dropout_maxk_64.cpp} |   5 +-
 ..._bias_no_biasgrad_no_dropout_maxk_128.cpp} |   5 +-
 ...o_bias_no_biasgrad_no_dropout_maxk_32.cpp} |   5 +-
 ...o_bias_no_biasgrad_no_dropout_maxk_64.cpp} |   5 +-
 ...ias_has_biasgrad_has_dropout_maxk_128.cpp} |   5 +-
 ...bias_has_biasgrad_has_dropout_maxk_32.cpp} |   5 +-
 ...bias_has_biasgrad_has_dropout_maxk_64.cpp} |   5 +-
 ...bias_has_biasgrad_no_dropout_maxk_128.cpp} |   5 +-
 ..._bias_has_biasgrad_no_dropout_maxk_32.cpp} |   5 +-
 ..._bias_has_biasgrad_no_dropout_maxk_64.cpp} |   5 +-
 ...bias_no_biasgrad_has_dropout_maxk_128.cpp} |   5 +-
 ..._bias_no_biasgrad_has_dropout_maxk_32.cpp} |   5 +-
 ..._bias_no_biasgrad_has_dropout_maxk_64.cpp} |   5 +-
 ..._bias_no_biasgrad_no_dropout_maxk_128.cpp} |   9 +-
 ...s_bias_no_biasgrad_no_dropout_maxk_32.cpp} |   9 +-
 ...s_bias_no_biasgrad_no_dropout_maxk_64.cpp} |   9 +-
 ...bias_no_biasgrad_has_dropout_maxk_128.cpp} |   9 +-
 ..._bias_no_biasgrad_has_dropout_maxk_32.cpp} |   9 +-
 ..._bias_no_biasgrad_has_dropout_maxk_64.cpp} |   9 +-
 ..._bias_no_biasgrad_no_dropout_maxk_128.cpp} |   7 +-
 ...o_bias_no_biasgrad_no_dropout_maxk_32.cpp} |   7 +-
 ...o_bias_no_biasgrad_no_dropout_maxk_64.cpp} |   7 +-
 ...ias_has_biasgrad_has_dropout_maxk_128.cpp} |   5 +-
 ...bias_has_biasgrad_has_dropout_maxk_32.cpp} |   5 +-
 ...bias_has_biasgrad_has_dropout_maxk_64.cpp} |   5 +-
 ...bias_has_biasgrad_no_dropout_maxk_128.cpp} |   7 +-
 ..._bias_has_biasgrad_no_dropout_maxk_32.cpp} |   7 +-
 ..._bias_has_biasgrad_no_dropout_maxk_64.cpp} |   7 +-
 ...bias_no_biasgrad_has_dropout_maxk_128.cpp} |   7 +-
 ..._bias_no_biasgrad_has_dropout_maxk_32.cpp} |   7 +-
 ..._bias_no_biasgrad_has_dropout_maxk_64.cpp} |   7 +-
 ..._bias_no_biasgrad_no_dropout_maxk_128.cpp} |   7 +-
 ...s_bias_no_biasgrad_no_dropout_maxk_32.cpp} |   7 +-
 ...s_bias_no_biasgrad_no_dropout_maxk_64.cpp} |   7 +-
 ...bias_no_biasgrad_has_dropout_maxk_128.cpp} |   5 +-
 ..._bias_no_biasgrad_has_dropout_maxk_32.cpp} |   5 +-
 ..._bias_no_biasgrad_has_dropout_maxk_64.cpp} |   5 +-
 ..._bias_no_biasgrad_no_dropout_maxk_128.cpp} |   5 +-
 ...o_bias_no_biasgrad_no_dropout_maxk_32.cpp} |   5 +-
 ...o_bias_no_biasgrad_no_dropout_maxk_64.cpp} |   5 +-
 ...salmask_has_bias_has_dropout_maxk_128.cpp} |   5 +-
 ...salmask_has_bias_has_dropout_maxk_256.cpp} |   5 +-
 ...usalmask_has_bias_has_dropout_maxk_32.cpp} |   5 +-
 ...usalmask_has_bias_has_dropout_maxk_64.cpp} |   5 +-
 ...usalmask_has_bias_no_dropout_maxk_128.cpp} |   7 +-
 ...usalmask_has_bias_no_dropout_maxk_256.cpp} |   7 +-
 ...ausalmask_has_bias_no_dropout_maxk_32.cpp} |   7 +-
 ...ausalmask_has_bias_no_dropout_maxk_64.cpp} |   5 +-
 ...usalmask_no_bias_has_dropout_maxk_128.cpp} |   7 +-
 ...usalmask_no_bias_has_dropout_maxk_256.cpp} |   7 +-
 ...ausalmask_no_bias_has_dropout_maxk_32.cpp} |   7 +-
 ...ausalmask_no_bias_has_dropout_maxk_64.cpp} |   5 +-
 ...ausalmask_no_bias_no_dropout_maxk_128.cpp} |   7 +-
 ...ausalmask_no_bias_no_dropout_maxk_256.cpp} |   7 +-
 ...causalmask_no_bias_no_dropout_maxk_32.cpp} |   7 +-
 ...causalmask_no_bias_no_dropout_maxk_64.cpp} |   7 +-
 ...salmask_has_bias_has_dropout_maxk_128.cpp} |   7 +-
 ...salmask_has_bias_has_dropout_maxk_256.cpp} |   7 +-
 ...usalmask_has_bias_has_dropout_maxk_32.cpp} |   7 +-
 ...usalmask_has_bias_has_dropout_maxk_64.cpp} |   5 +-
 ...usalmask_has_bias_no_dropout_maxk_128.cpp} |   7 +-
 ...usalmask_has_bias_no_dropout_maxk_256.cpp} |   7 +-
 ...ausalmask_has_bias_no_dropout_maxk_32.cpp} |   7 +-
 ...ausalmask_has_bias_no_dropout_maxk_64.cpp} |   7 +-
 ...usalmask_no_bias_has_dropout_maxk_128.cpp} |   7 +-
 ...usalmask_no_bias_has_dropout_maxk_256.cpp} |   7 +-
 ...ausalmask_no_bias_has_dropout_maxk_32.cpp} |   5 +-
 ...ausalmask_no_bias_has_dropout_maxk_64.cpp} |   7 +-
 ...ausalmask_no_bias_no_dropout_maxk_128.cpp} |   5 +-
 ...ausalmask_no_bias_no_dropout_maxk_256.cpp} |   5 +-
 ...causalmask_no_bias_no_dropout_maxk_32.cpp} |   5 +-
 ...causalmask_no_bias_no_dropout_maxk_64.cpp} |   5 +-
 ...salmask_has_bias_has_dropout_maxk_128.cpp} |   5 +-
 ...salmask_has_bias_has_dropout_maxk_256.cpp} |   5 +-
 ...usalmask_has_bias_has_dropout_maxk_32.cpp} |   5 +-
 ...usalmask_has_bias_has_dropout_maxk_64.cpp} |   5 +-
 ...usalmask_has_bias_no_dropout_maxk_128.cpp} |   5 +-
 ...usalmask_has_bias_no_dropout_maxk_256.cpp} |   5 +-
 ...ausalmask_has_bias_no_dropout_maxk_32.cpp} |   5 +-
 ...ausalmask_has_bias_no_dropout_maxk_64.cpp} |   5 +-
 ...usalmask_no_bias_has_dropout_maxk_128.cpp} |   5 +-
 ...usalmask_no_bias_has_dropout_maxk_256.cpp} |   5 +-
 ...ausalmask_no_bias_has_dropout_maxk_32.cpp} |   7 +-
 ...ausalmask_no_bias_has_dropout_maxk_64.cpp} |   5 +-
 ...ausalmask_no_bias_no_dropout_maxk_128.cpp} |   7 +-
 ...ausalmask_no_bias_no_dropout_maxk_256.cpp} |   7 +-
 ...causalmask_no_bias_no_dropout_maxk_32.cpp} |   7 +-
 ...causalmask_no_bias_no_dropout_maxk_64.cpp} |   7 +-
 ...salmask_has_bias_has_dropout_maxk_128.cpp} |   5 +-
 ...salmask_has_bias_has_dropout_maxk_256.cpp} |   5 +-
 ...usalmask_has_bias_has_dropout_maxk_32.cpp} |   7 +-
 ...usalmask_has_bias_has_dropout_maxk_64.cpp} |   5 +-
 ...usalmask_has_bias_no_dropout_maxk_128.cpp} |   7 +-
 ...usalmask_has_bias_no_dropout_maxk_256.cpp} |   7 +-
 ...ausalmask_has_bias_no_dropout_maxk_32.cpp} |   7 +-
 ...ausalmask_has_bias_no_dropout_maxk_64.cpp} |   7 +-
 ...usalmask_no_bias_has_dropout_maxk_128.cpp} |   5 +-
 ...usalmask_no_bias_has_dropout_maxk_256.cpp} |   5 +-
 ...ausalmask_no_bias_has_dropout_maxk_32.cpp} |   5 +-
 ...ausalmask_no_bias_has_dropout_maxk_64.cpp} |   5 +-
 ...ausalmask_no_bias_no_dropout_maxk_128.cpp} |   5 +-
 ...ausalmask_no_bias_no_dropout_maxk_256.cpp} |   5 +-
 ...causalmask_no_bias_no_dropout_maxk_32.cpp} |   5 +-
 ...causalmask_no_bias_no_dropout_maxk_64.cpp} |   5 +-
 ...salmask_has_bias_has_dropout_maxk_128.cpp} |   5 +-
 ...salmask_has_bias_has_dropout_maxk_256.cpp} |   5 +-
 ...usalmask_has_bias_has_dropout_maxk_32.cpp} |   5 +-
 ...usalmask_has_bias_has_dropout_maxk_64.cpp} |   5 +-
 ...usalmask_has_bias_no_dropout_maxk_128.cpp} |   5 +-
 ...usalmask_has_bias_no_dropout_maxk_256.cpp} |   7 +-
 ...ausalmask_has_bias_no_dropout_maxk_32.cpp} |   7 +-
 ...ausalmask_has_bias_no_dropout_maxk_64.cpp} |   5 +-
 ...usalmask_no_bias_has_dropout_maxk_128.cpp} |   5 +-
 ...usalmask_no_bias_has_dropout_maxk_256.cpp} |   5 +-
 ...ausalmask_no_bias_has_dropout_maxk_32.cpp} |   7 +-
 ...ausalmask_no_bias_has_dropout_maxk_64.cpp} |   5 +-
 ...ausalmask_no_bias_no_dropout_maxk_128.cpp} |   7 +-
 ...ausalmask_no_bias_no_dropout_maxk_256.cpp} |   7 +-
 ...causalmask_no_bias_no_dropout_maxk_32.cpp} |   7 +-
 ...causalmask_no_bias_no_dropout_maxk_64.cpp} |   7 +-
 ...salmask_has_bias_has_dropout_maxk_128.cpp} |   5 +-
 ...salmask_has_bias_has_dropout_maxk_256.cpp} |   7 +-
 ...usalmask_has_bias_has_dropout_maxk_32.cpp} |   7 +-
 ...usalmask_has_bias_has_dropout_maxk_64.cpp} |   5 +-
 ...usalmask_has_bias_no_dropout_maxk_128.cpp} |   7 +-
 ...usalmask_has_bias_no_dropout_maxk_256.cpp} |   7 +-
 ...ausalmask_has_bias_no_dropout_maxk_32.cpp} |   7 +-
 ...ausalmask_has_bias_no_dropout_maxk_64.cpp} |   7 +-
 ...usalmask_no_bias_has_dropout_maxk_128.cpp} |   7 +-
 ...usalmask_no_bias_has_dropout_maxk_256.cpp} |   7 +-
 ...ausalmask_no_bias_has_dropout_maxk_32.cpp} |   7 +-
 ...ausalmask_no_bias_has_dropout_maxk_64.cpp} |   7 +-
 ...ausalmask_no_bias_no_dropout_maxk_128.cpp} |   5 +-
 ...ausalmask_no_bias_no_dropout_maxk_256.cpp} |   5 +-
 ...causalmask_no_bias_no_dropout_maxk_32.cpp} |   5 +-
 ...causalmask_no_bias_no_dropout_maxk_64.cpp} |   5 +-
 ...salmask_has_bias_has_dropout_maxk_128.cpp} |   5 +-
 ...salmask_has_bias_has_dropout_maxk_256.cpp} |   5 +-
 ...usalmask_has_bias_has_dropout_maxk_32.cpp} |   5 +-
 ...usalmask_has_bias_has_dropout_maxk_64.cpp} |   5 +-
 ...usalmask_has_bias_no_dropout_maxk_128.cpp} |   5 +-
 ...usalmask_has_bias_no_dropout_maxk_256.cpp} |   5 +-
 ...ausalmask_has_bias_no_dropout_maxk_32.cpp} |   5 +-
 ...ausalmask_has_bias_no_dropout_maxk_64.cpp} |   5 +-
 ...usalmask_no_bias_has_dropout_maxk_128.cpp} |   5 +-
 ...usalmask_no_bias_has_dropout_maxk_256.cpp} |   5 +-
 ...ausalmask_no_bias_has_dropout_maxk_32.cpp} |   5 +-
 ...ausalmask_no_bias_has_dropout_maxk_64.cpp} |   5 +-
 ...ausalmask_no_bias_no_dropout_maxk_128.cpp} |   7 +-
 ...ausalmask_no_bias_no_dropout_maxk_256.cpp} |   7 +-
 ...causalmask_no_bias_no_dropout_maxk_32.cpp} |   7 +-
 ...causalmask_no_bias_no_dropout_maxk_64.cpp} |   7 +-
 ...salmask_has_bias_has_dropout_maxk_128.cpp} |   5 +-
 ...salmask_has_bias_has_dropout_maxk_256.cpp} |   5 +-
 ...usalmask_has_bias_has_dropout_maxk_32.cpp} |   5 +-
 ...usalmask_has_bias_has_dropout_maxk_64.cpp} |   5 +-
 ...usalmask_has_bias_no_dropout_maxk_128.cpp} |   7 +-
 ...usalmask_has_bias_no_dropout_maxk_256.cpp} |   7 +-
 ...ausalmask_has_bias_no_dropout_maxk_32.cpp} |   7 +-
 ...ausalmask_has_bias_no_dropout_maxk_64.cpp} |   7 +-
 ...usalmask_no_bias_has_dropout_maxk_128.cpp} |   5 +-
 ...usalmask_no_bias_has_dropout_maxk_256.cpp} |   5 +-
 ...ausalmask_no_bias_has_dropout_maxk_32.cpp} |   5 +-
 ...ausalmask_no_bias_has_dropout_maxk_64.cpp} |   5 +-
 ...ausalmask_no_bias_no_dropout_maxk_128.cpp} |   5 +-
 ...ausalmask_no_bias_no_dropout_maxk_256.cpp} |   5 +-
 ...causalmask_no_bias_no_dropout_maxk_32.cpp} |   5 +-
 ...causalmask_no_bias_no_dropout_maxk_64.cpp} |   5 +-
 401 files changed, 1998 insertions(+), 606 deletions(-)
 create mode 100644 xformers/csrc/attention/hip_fmha/generate_instances.py
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_batched_backward_bp16_has_causalmask_has_attnbias_has_biasgrad_has_dropout_maxk_128.cpp => fmha_batched_backward_bp16_has_causalmask_has_bias_has_biasgrad_has_dropout_maxk_128.cpp} (88%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_batched_backward_bp16_has_causalmask_has_attnbias_has_biasgrad_has_dropout_maxk_32.cpp => fmha_batched_backward_bp16_has_causalmask_has_bias_has_biasgrad_has_dropout_maxk_32.cpp} (88%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_batched_backward_bp16_has_causalmask_has_attnbias_has_biasgrad_has_dropout_maxk_64.cpp => fmha_batched_backward_bp16_has_causalmask_has_bias_has_biasgrad_has_dropout_maxk_64.cpp} (88%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_batched_backward_bp16_no_causalmask_has_attnbias_has_biasgrad_has_dropout_maxk_128.cpp => fmha_batched_backward_bp16_has_causalmask_has_bias_has_biasgrad_no_dropout_maxk_128.cpp} (88%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_batched_backward_bp16_has_causalmask_has_attnbias_has_biasgrad_no_dropout_maxk_32.cpp => fmha_batched_backward_bp16_has_causalmask_has_bias_has_biasgrad_no_dropout_maxk_32.cpp} (88%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_batched_backward_bp16_has_causalmask_has_attnbias_no_biasgrad_has_dropout_maxk_64.cpp => fmha_batched_backward_bp16_has_causalmask_has_bias_has_biasgrad_no_dropout_maxk_64.cpp} (88%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_batched_backward_bp16_has_causalmask_has_attnbias_no_biasgrad_has_dropout_maxk_128.cpp => fmha_batched_backward_bp16_has_causalmask_has_bias_no_biasgrad_has_dropout_maxk_128.cpp} (88%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_batched_backward_bp16_no_causalmask_has_attnbias_has_biasgrad_has_dropout_maxk_32.cpp => fmha_batched_backward_bp16_has_causalmask_has_bias_no_biasgrad_has_dropout_maxk_32.cpp} (88%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_batched_backward_bp16_has_causalmask_has_attnbias_has_biasgrad_no_dropout_maxk_64.cpp => fmha_batched_backward_bp16_has_causalmask_has_bias_no_biasgrad_has_dropout_maxk_64.cpp} (88%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_batched_backward_bp16_has_causalmask_no_attnbias_has_dropout_maxk_128.cpp => fmha_batched_backward_bp16_has_causalmask_has_bias_no_biasgrad_no_dropout_maxk_128.cpp} (89%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_batched_backward_bp16_no_causalmask_has_attnbias_no_biasgrad_has_dropout_maxk_32.cpp => fmha_batched_backward_bp16_has_causalmask_has_bias_no_biasgrad_no_dropout_maxk_32.cpp} (88%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_batched_backward_bp16_no_causalmask_has_attnbias_no_biasgrad_has_dropout_maxk_64.cpp => fmha_batched_backward_bp16_has_causalmask_has_bias_no_biasgrad_no_dropout_maxk_64.cpp} (88%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_batched_backward_bp16_no_causalmask_has_attnbias_no_biasgrad_has_dropout_maxk_128.cpp => fmha_batched_backward_bp16_has_causalmask_no_bias_no_biasgrad_has_dropout_maxk_128.cpp} (89%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_batched_backward_bp16_has_causalmask_no_attnbias_has_dropout_maxk_32.cpp => fmha_batched_backward_bp16_has_causalmask_no_bias_no_biasgrad_has_dropout_maxk_32.cpp} (88%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_batched_backward_bp16_no_causalmask_has_attnbias_has_biasgrad_no_dropout_maxk_64.cpp => fmha_batched_backward_bp16_has_causalmask_no_bias_no_biasgrad_has_dropout_maxk_64.cpp} (88%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_batched_backward_bp16_no_causalmask_has_attnbias_no_biasgrad_no_dropout_maxk_128.cpp => fmha_batched_backward_bp16_has_causalmask_no_bias_no_biasgrad_no_dropout_maxk_128.cpp} (89%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_batched_backward_bp16_has_causalmask_no_attnbias_no_dropout_maxk_32.cpp => fmha_batched_backward_bp16_has_causalmask_no_bias_no_biasgrad_no_dropout_maxk_32.cpp} (89%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_batched_backward_bp16_has_causalmask_no_attnbias_no_dropout_maxk_64.cpp => fmha_batched_backward_bp16_has_causalmask_no_bias_no_biasgrad_no_dropout_maxk_64.cpp} (89%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_batched_backward_bp16_has_causalmask_has_attnbias_has_biasgrad_no_dropout_maxk_128.cpp => fmha_batched_backward_bp16_no_causalmask_has_bias_has_biasgrad_has_dropout_maxk_128.cpp} (88%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_batched_backward_bp16_has_causalmask_has_attnbias_no_biasgrad_has_dropout_maxk_32.cpp => fmha_batched_backward_bp16_no_causalmask_has_bias_has_biasgrad_has_dropout_maxk_32.cpp} (88%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_batched_backward_bp16_no_causalmask_has_attnbias_has_biasgrad_has_dropout_maxk_64.cpp => fmha_batched_backward_bp16_no_causalmask_has_bias_has_biasgrad_has_dropout_maxk_64.cpp} (88%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_batched_backward_bp16_has_causalmask_has_attnbias_no_biasgrad_no_dropout_maxk_128.cpp => fmha_batched_backward_bp16_no_causalmask_has_bias_has_biasgrad_no_dropout_maxk_128.cpp} (89%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_batched_backward_bp16_no_causalmask_has_attnbias_has_biasgrad_no_dropout_maxk_32.cpp => fmha_batched_backward_bp16_no_causalmask_has_bias_has_biasgrad_no_dropout_maxk_32.cpp} (88%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_batched_backward_bp16_has_causalmask_no_attnbias_has_dropout_maxk_64.cpp => fmha_batched_backward_bp16_no_causalmask_has_bias_has_biasgrad_no_dropout_maxk_64.cpp} (88%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_batched_backward_bp16_no_causalmask_has_attnbias_has_biasgrad_no_dropout_maxk_128.cpp => fmha_batched_backward_bp16_no_causalmask_has_bias_no_biasgrad_has_dropout_maxk_128.cpp} (89%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_batched_backward_bp16_has_causalmask_has_attnbias_no_biasgrad_no_dropout_maxk_32.cpp => fmha_batched_backward_bp16_no_causalmask_has_bias_no_biasgrad_has_dropout_maxk_32.cpp} (88%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_batched_backward_bp16_has_causalmask_has_attnbias_no_biasgrad_no_dropout_maxk_64.cpp => fmha_batched_backward_bp16_no_causalmask_has_bias_no_biasgrad_has_dropout_maxk_64.cpp} (88%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_batched_backward_bp16_has_causalmask_no_attnbias_no_dropout_maxk_128.cpp => fmha_batched_backward_bp16_no_causalmask_has_bias_no_biasgrad_no_dropout_maxk_128.cpp} (89%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_batched_backward_bp16_no_causalmask_has_attnbias_no_biasgrad_no_dropout_maxk_32.cpp => fmha_batched_backward_bp16_no_causalmask_has_bias_no_biasgrad_no_dropout_maxk_32.cpp} (89%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_batched_backward_bp16_no_causalmask_has_attnbias_no_biasgrad_no_dropout_maxk_64.cpp => fmha_batched_backward_bp16_no_causalmask_has_bias_no_biasgrad_no_dropout_maxk_64.cpp} (89%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_batched_backward_bp16_no_causalmask_no_attnbias_has_dropout_maxk_128.cpp => fmha_batched_backward_bp16_no_causalmask_no_bias_no_biasgrad_has_dropout_maxk_128.cpp} (89%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_batched_backward_bp16_no_causalmask_no_attnbias_has_dropout_maxk_32.cpp => fmha_batched_backward_bp16_no_causalmask_no_bias_no_biasgrad_has_dropout_maxk_32.cpp} (89%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_batched_backward_bp16_no_causalmask_no_attnbias_has_dropout_maxk_64.cpp => fmha_batched_backward_bp16_no_causalmask_no_bias_no_biasgrad_has_dropout_maxk_64.cpp} (89%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_batched_backward_bp16_no_causalmask_no_attnbias_no_dropout_maxk_128.cpp => fmha_batched_backward_bp16_no_causalmask_no_bias_no_biasgrad_no_dropout_maxk_128.cpp} (89%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_batched_backward_bp16_no_causalmask_no_attnbias_no_dropout_maxk_32.cpp => fmha_batched_backward_bp16_no_causalmask_no_bias_no_biasgrad_no_dropout_maxk_32.cpp} (89%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_batched_backward_bp16_no_causalmask_no_attnbias_no_dropout_maxk_64.cpp => fmha_batched_backward_bp16_no_causalmask_no_bias_no_biasgrad_no_dropout_maxk_64.cpp} (89%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_batched_backward_fp16_has_causalmask_has_attnbias_has_biasgrad_has_dropout_maxk_128.cpp => fmha_batched_backward_fp16_has_causalmask_has_bias_has_biasgrad_has_dropout_maxk_128.cpp} (88%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_batched_backward_fp16_has_causalmask_has_attnbias_has_biasgrad_has_dropout_maxk_32.cpp => fmha_batched_backward_fp16_has_causalmask_has_bias_has_biasgrad_has_dropout_maxk_32.cpp} (88%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_batched_backward_fp16_has_causalmask_has_attnbias_has_biasgrad_has_dropout_maxk_64.cpp => fmha_batched_backward_fp16_has_causalmask_has_bias_has_biasgrad_has_dropout_maxk_64.cpp} (88%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_batched_backward_fp16_has_causalmask_has_attnbias_has_biasgrad_no_dropout_maxk_128.cpp => fmha_batched_backward_fp16_has_causalmask_has_bias_has_biasgrad_no_dropout_maxk_128.cpp} (88%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_batched_backward_fp16_has_causalmask_has_attnbias_has_biasgrad_no_dropout_maxk_32.cpp => fmha_batched_backward_fp16_has_causalmask_has_bias_has_biasgrad_no_dropout_maxk_32.cpp} (88%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_batched_backward_fp16_has_causalmask_has_attnbias_has_biasgrad_no_dropout_maxk_64.cpp => fmha_batched_backward_fp16_has_causalmask_has_bias_has_biasgrad_no_dropout_maxk_64.cpp} (88%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_batched_backward_fp16_has_causalmask_has_attnbias_no_biasgrad_has_dropout_maxk_128.cpp => fmha_batched_backward_fp16_has_causalmask_has_bias_no_biasgrad_has_dropout_maxk_128.cpp} (88%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_batched_backward_fp16_has_causalmask_has_attnbias_no_biasgrad_has_dropout_maxk_32.cpp => fmha_batched_backward_fp16_has_causalmask_has_bias_no_biasgrad_has_dropout_maxk_32.cpp} (88%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_batched_backward_fp16_has_causalmask_has_attnbias_no_biasgrad_has_dropout_maxk_64.cpp => fmha_batched_backward_fp16_has_causalmask_has_bias_no_biasgrad_has_dropout_maxk_64.cpp} (88%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_batched_backward_fp16_no_causalmask_has_attnbias_no_biasgrad_has_dropout_maxk_128.cpp => fmha_batched_backward_fp16_has_causalmask_has_bias_no_biasgrad_no_dropout_maxk_128.cpp} (88%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_batched_backward_fp16_no_causalmask_has_attnbias_no_biasgrad_has_dropout_maxk_32.cpp => fmha_batched_backward_fp16_has_causalmask_has_bias_no_biasgrad_no_dropout_maxk_32.cpp} (88%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_batched_backward_fp16_no_causalmask_has_attnbias_no_biasgrad_has_dropout_maxk_64.cpp => fmha_batched_backward_fp16_has_causalmask_has_bias_no_biasgrad_no_dropout_maxk_64.cpp} (88%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_batched_backward_fp16_no_causalmask_has_attnbias_has_biasgrad_no_dropout_maxk_128.cpp => fmha_batched_backward_fp16_has_causalmask_no_bias_no_biasgrad_has_dropout_maxk_128.cpp} (88%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_batched_backward_fp16_no_causalmask_has_attnbias_has_biasgrad_no_dropout_maxk_32.cpp => fmha_batched_backward_fp16_has_causalmask_no_bias_no_biasgrad_has_dropout_maxk_32.cpp} (88%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_batched_backward_fp16_no_causalmask_has_attnbias_has_biasgrad_no_dropout_maxk_64.cpp => fmha_batched_backward_fp16_has_causalmask_no_bias_no_biasgrad_has_dropout_maxk_64.cpp} (88%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_batched_backward_fp16_no_causalmask_has_attnbias_no_biasgrad_no_dropout_maxk_128.cpp => fmha_batched_backward_fp16_has_causalmask_no_bias_no_biasgrad_no_dropout_maxk_128.cpp} (89%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_batched_backward_fp16_no_causalmask_has_attnbias_no_biasgrad_no_dropout_maxk_32.cpp => fmha_batched_backward_fp16_has_causalmask_no_bias_no_biasgrad_no_dropout_maxk_32.cpp} (88%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_batched_backward_fp16_no_causalmask_has_attnbias_no_biasgrad_no_dropout_maxk_64.cpp => fmha_batched_backward_fp16_has_causalmask_no_bias_no_biasgrad_no_dropout_maxk_64.cpp} (88%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_batched_backward_fp16_no_causalmask_has_attnbias_has_biasgrad_has_dropout_maxk_128.cpp => fmha_batched_backward_fp16_no_causalmask_has_bias_has_biasgrad_has_dropout_maxk_128.cpp} (88%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_batched_backward_fp16_no_causalmask_has_attnbias_has_biasgrad_has_dropout_maxk_32.cpp => fmha_batched_backward_fp16_no_causalmask_has_bias_has_biasgrad_has_dropout_maxk_32.cpp} (88%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_batched_backward_fp16_no_causalmask_has_attnbias_has_biasgrad_has_dropout_maxk_64.cpp => fmha_batched_backward_fp16_no_causalmask_has_bias_has_biasgrad_has_dropout_maxk_64.cpp} (88%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_batched_backward_fp16_has_causalmask_has_attnbias_no_biasgrad_no_dropout_maxk_128.cpp => fmha_batched_backward_fp16_no_causalmask_has_bias_has_biasgrad_no_dropout_maxk_128.cpp} (88%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_batched_backward_fp16_has_causalmask_has_attnbias_no_biasgrad_no_dropout_maxk_32.cpp => fmha_batched_backward_fp16_no_causalmask_has_bias_has_biasgrad_no_dropout_maxk_32.cpp} (88%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_batched_backward_fp16_has_causalmask_has_attnbias_no_biasgrad_no_dropout_maxk_64.cpp => fmha_batched_backward_fp16_no_causalmask_has_bias_has_biasgrad_no_dropout_maxk_64.cpp} (88%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_batched_backward_fp16_has_causalmask_no_attnbias_has_dropout_maxk_128.cpp => fmha_batched_backward_fp16_no_causalmask_has_bias_no_biasgrad_has_dropout_maxk_128.cpp} (88%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_batched_backward_fp16_has_causalmask_no_attnbias_has_dropout_maxk_32.cpp => fmha_batched_backward_fp16_no_causalmask_has_bias_no_biasgrad_has_dropout_maxk_32.cpp} (88%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_batched_backward_fp16_has_causalmask_no_attnbias_has_dropout_maxk_64.cpp => fmha_batched_backward_fp16_no_causalmask_has_bias_no_biasgrad_has_dropout_maxk_64.cpp} (88%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_batched_backward_fp16_has_causalmask_no_attnbias_no_dropout_maxk_128.cpp => fmha_batched_backward_fp16_no_causalmask_has_bias_no_biasgrad_no_dropout_maxk_128.cpp} (89%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_batched_backward_fp16_has_causalmask_no_attnbias_no_dropout_maxk_32.cpp => fmha_batched_backward_fp16_no_causalmask_has_bias_no_biasgrad_no_dropout_maxk_32.cpp} (88%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_batched_backward_fp16_has_causalmask_no_attnbias_no_dropout_maxk_64.cpp => fmha_batched_backward_fp16_no_causalmask_has_bias_no_biasgrad_no_dropout_maxk_64.cpp} (88%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_batched_backward_fp16_no_causalmask_no_attnbias_has_dropout_maxk_128.cpp => fmha_batched_backward_fp16_no_causalmask_no_bias_no_biasgrad_has_dropout_maxk_128.cpp} (89%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_batched_backward_fp16_no_causalmask_no_attnbias_has_dropout_maxk_32.cpp => fmha_batched_backward_fp16_no_causalmask_no_bias_no_biasgrad_has_dropout_maxk_32.cpp} (88%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_batched_backward_fp16_no_causalmask_no_attnbias_has_dropout_maxk_64.cpp => fmha_batched_backward_fp16_no_causalmask_no_bias_no_biasgrad_has_dropout_maxk_64.cpp} (88%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_batched_backward_fp16_no_causalmask_no_attnbias_no_dropout_maxk_128.cpp => fmha_batched_backward_fp16_no_causalmask_no_bias_no_biasgrad_no_dropout_maxk_128.cpp} (89%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_batched_backward_fp16_no_causalmask_no_attnbias_no_dropout_maxk_32.cpp => fmha_batched_backward_fp16_no_causalmask_no_bias_no_biasgrad_no_dropout_maxk_32.cpp} (89%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_batched_backward_fp16_no_causalmask_no_attnbias_no_dropout_maxk_64.cpp => fmha_batched_backward_fp16_no_causalmask_no_bias_no_biasgrad_no_dropout_maxk_64.cpp} (89%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_batched_forward_bp16_has_causalmask_has_attnbias_has_dropout_maxk_128.cpp => fmha_batched_forward_bp16_has_causalmask_has_bias_has_dropout_maxk_128.cpp} (88%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_batched_forward_bp16_has_causalmask_has_attnbias_has_dropout_maxk_256.cpp => fmha_batched_forward_bp16_has_causalmask_has_bias_has_dropout_maxk_256.cpp} (88%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_batched_forward_bp16_has_causalmask_has_attnbias_has_dropout_maxk_32.cpp => fmha_batched_forward_bp16_has_causalmask_has_bias_has_dropout_maxk_32.cpp} (88%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_batched_forward_bp16_has_causalmask_has_attnbias_has_dropout_maxk_64.cpp => fmha_batched_forward_bp16_has_causalmask_has_bias_has_dropout_maxk_64.cpp} (88%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_batched_forward_bp16_no_causalmask_has_attnbias_has_dropout_maxk_128.cpp => fmha_batched_forward_bp16_has_causalmask_has_bias_no_dropout_maxk_128.cpp} (88%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_batched_forward_bp16_has_causalmask_no_attnbias_has_dropout_maxk_256.cpp => fmha_batched_forward_bp16_has_causalmask_has_bias_no_dropout_maxk_256.cpp} (88%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_batched_forward_bp16_has_causalmask_no_attnbias_has_dropout_maxk_32.cpp => fmha_batched_forward_bp16_has_causalmask_has_bias_no_dropout_maxk_32.cpp} (88%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_batched_forward_bp16_has_causalmask_has_attnbias_no_dropout_maxk_64.cpp => fmha_batched_forward_bp16_has_causalmask_has_bias_no_dropout_maxk_64.cpp} (88%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_batched_forward_bp16_has_causalmask_has_attnbias_no_dropout_maxk_128.cpp => fmha_batched_forward_bp16_has_causalmask_no_bias_has_dropout_maxk_128.cpp} (88%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_batched_forward_bp16_no_causalmask_has_attnbias_has_dropout_maxk_256.cpp => fmha_batched_forward_bp16_has_causalmask_no_bias_has_dropout_maxk_256.cpp} (88%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_batched_forward_bp16_no_causalmask_has_attnbias_has_dropout_maxk_32.cpp => fmha_batched_forward_bp16_has_causalmask_no_bias_has_dropout_maxk_32.cpp} (88%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_batched_forward_bp16_has_causalmask_no_attnbias_has_dropout_maxk_64.cpp => fmha_batched_forward_bp16_has_causalmask_no_bias_has_dropout_maxk_64.cpp} (88%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_batched_forward_bp16_no_causalmask_has_attnbias_no_dropout_maxk_128.cpp => fmha_batched_forward_bp16_has_causalmask_no_bias_no_dropout_maxk_128.cpp} (88%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_batched_forward_bp16_no_causalmask_has_attnbias_no_dropout_maxk_256.cpp => fmha_batched_forward_bp16_has_causalmask_no_bias_no_dropout_maxk_256.cpp} (88%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_batched_forward_bp16_no_causalmask_has_attnbias_no_dropout_maxk_32.cpp => fmha_batched_forward_bp16_has_causalmask_no_bias_no_dropout_maxk_32.cpp} (88%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_batched_forward_bp16_no_causalmask_has_attnbias_no_dropout_maxk_64.cpp => fmha_batched_forward_bp16_has_causalmask_no_bias_no_dropout_maxk_64.cpp} (88%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_batched_forward_bp16_has_causalmask_no_attnbias_has_dropout_maxk_128.cpp => fmha_batched_forward_bp16_no_causalmask_has_bias_has_dropout_maxk_128.cpp} (88%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_batched_forward_bp16_has_causalmask_has_attnbias_no_dropout_maxk_256.cpp => fmha_batched_forward_bp16_no_causalmask_has_bias_has_dropout_maxk_256.cpp} (88%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_batched_forward_bp16_has_causalmask_has_attnbias_no_dropout_maxk_32.cpp => fmha_batched_forward_bp16_no_causalmask_has_bias_has_dropout_maxk_32.cpp} (88%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_batched_forward_bp16_no_causalmask_has_attnbias_has_dropout_maxk_64.cpp => fmha_batched_forward_bp16_no_causalmask_has_bias_has_dropout_maxk_64.cpp} (88%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_batched_forward_bp16_no_causalmask_no_attnbias_has_dropout_maxk_128.cpp => fmha_batched_forward_bp16_no_causalmask_has_bias_no_dropout_maxk_128.cpp} (88%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_batched_forward_bp16_no_causalmask_no_attnbias_has_dropout_maxk_256.cpp => fmha_batched_forward_bp16_no_causalmask_has_bias_no_dropout_maxk_256.cpp} (88%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_batched_forward_bp16_has_causalmask_no_attnbias_no_dropout_maxk_32.cpp => fmha_batched_forward_bp16_no_causalmask_has_bias_no_dropout_maxk_32.cpp} (88%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_batched_forward_bp16_no_causalmask_no_attnbias_has_dropout_maxk_64.cpp => fmha_batched_forward_bp16_no_causalmask_has_bias_no_dropout_maxk_64.cpp} (88%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_batched_forward_bp16_has_causalmask_no_attnbias_no_dropout_maxk_128.cpp => fmha_batched_forward_bp16_no_causalmask_no_bias_has_dropout_maxk_128.cpp} (88%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_batched_forward_bp16_has_causalmask_no_attnbias_no_dropout_maxk_256.cpp => fmha_batched_forward_bp16_no_causalmask_no_bias_has_dropout_maxk_256.cpp} (88%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_batched_forward_bp16_no_causalmask_no_attnbias_has_dropout_maxk_32.cpp => fmha_batched_forward_bp16_no_causalmask_no_bias_has_dropout_maxk_32.cpp} (88%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_batched_forward_bp16_has_causalmask_no_attnbias_no_dropout_maxk_64.cpp => fmha_batched_forward_bp16_no_causalmask_no_bias_has_dropout_maxk_64.cpp} (88%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_batched_forward_bp16_no_causalmask_no_attnbias_no_dropout_maxk_128.cpp => fmha_batched_forward_bp16_no_causalmask_no_bias_no_dropout_maxk_128.cpp} (88%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_batched_forward_bp16_no_causalmask_no_attnbias_no_dropout_maxk_256.cpp => fmha_batched_forward_bp16_no_causalmask_no_bias_no_dropout_maxk_256.cpp} (88%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_batched_forward_bp16_no_causalmask_no_attnbias_no_dropout_maxk_32.cpp => fmha_batched_forward_bp16_no_causalmask_no_bias_no_dropout_maxk_32.cpp} (88%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_batched_forward_bp16_no_causalmask_no_attnbias_no_dropout_maxk_64.cpp => fmha_batched_forward_bp16_no_causalmask_no_bias_no_dropout_maxk_64.cpp} (88%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_batched_forward_fp16_has_causalmask_has_attnbias_has_dropout_maxk_128.cpp => fmha_batched_forward_fp16_has_causalmask_has_bias_has_dropout_maxk_128.cpp} (88%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_batched_forward_fp16_has_causalmask_has_attnbias_has_dropout_maxk_256.cpp => fmha_batched_forward_fp16_has_causalmask_has_bias_has_dropout_maxk_256.cpp} (88%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_batched_forward_fp16_has_causalmask_has_attnbias_has_dropout_maxk_32.cpp => fmha_batched_forward_fp16_has_causalmask_has_bias_has_dropout_maxk_32.cpp} (88%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_batched_forward_fp16_has_causalmask_has_attnbias_has_dropout_maxk_64.cpp => fmha_batched_forward_fp16_has_causalmask_has_bias_has_dropout_maxk_64.cpp} (88%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_batched_forward_fp16_has_causalmask_has_attnbias_no_dropout_maxk_128.cpp => fmha_batched_forward_fp16_has_causalmask_has_bias_no_dropout_maxk_128.cpp} (88%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_batched_forward_fp16_has_causalmask_has_attnbias_no_dropout_maxk_256.cpp => fmha_batched_forward_fp16_has_causalmask_has_bias_no_dropout_maxk_256.cpp} (88%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_batched_forward_fp16_has_causalmask_has_attnbias_no_dropout_maxk_32.cpp => fmha_batched_forward_fp16_has_causalmask_has_bias_no_dropout_maxk_32.cpp} (88%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_batched_forward_fp16_has_causalmask_has_attnbias_no_dropout_maxk_64.cpp => fmha_batched_forward_fp16_has_causalmask_has_bias_no_dropout_maxk_64.cpp} (88%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_batched_forward_fp16_has_causalmask_no_attnbias_has_dropout_maxk_128.cpp => fmha_batched_forward_fp16_has_causalmask_no_bias_has_dropout_maxk_128.cpp} (88%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_batched_forward_fp16_has_causalmask_no_attnbias_has_dropout_maxk_256.cpp => fmha_batched_forward_fp16_has_causalmask_no_bias_has_dropout_maxk_256.cpp} (88%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_batched_forward_fp16_no_causalmask_has_attnbias_has_dropout_maxk_32.cpp => fmha_batched_forward_fp16_has_causalmask_no_bias_has_dropout_maxk_32.cpp} (88%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_batched_forward_fp16_has_causalmask_no_attnbias_has_dropout_maxk_64.cpp => fmha_batched_forward_fp16_has_causalmask_no_bias_has_dropout_maxk_64.cpp} (88%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_batched_forward_fp16_no_causalmask_has_attnbias_no_dropout_maxk_128.cpp => fmha_batched_forward_fp16_has_causalmask_no_bias_no_dropout_maxk_128.cpp} (88%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_batched_forward_fp16_no_causalmask_has_attnbias_no_dropout_maxk_256.cpp => fmha_batched_forward_fp16_has_causalmask_no_bias_no_dropout_maxk_256.cpp} (88%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_batched_forward_fp16_no_causalmask_has_attnbias_no_dropout_maxk_32.cpp => fmha_batched_forward_fp16_has_causalmask_no_bias_no_dropout_maxk_32.cpp} (88%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_batched_forward_fp16_no_causalmask_has_attnbias_no_dropout_maxk_64.cpp => fmha_batched_forward_fp16_has_causalmask_no_bias_no_dropout_maxk_64.cpp} (88%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_batched_forward_fp16_no_causalmask_has_attnbias_has_dropout_maxk_128.cpp => fmha_batched_forward_fp16_no_causalmask_has_bias_has_dropout_maxk_128.cpp} (88%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_batched_forward_fp16_no_causalmask_has_attnbias_has_dropout_maxk_256.cpp => fmha_batched_forward_fp16_no_causalmask_has_bias_has_dropout_maxk_256.cpp} (88%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_batched_forward_fp16_has_causalmask_no_attnbias_has_dropout_maxk_32.cpp => fmha_batched_forward_fp16_no_causalmask_has_bias_has_dropout_maxk_32.cpp} (88%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_batched_forward_fp16_no_causalmask_has_attnbias_has_dropout_maxk_64.cpp => fmha_batched_forward_fp16_no_causalmask_has_bias_has_dropout_maxk_64.cpp} (88%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_batched_forward_fp16_has_causalmask_no_attnbias_no_dropout_maxk_128.cpp => fmha_batched_forward_fp16_no_causalmask_has_bias_no_dropout_maxk_128.cpp} (88%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_batched_forward_fp16_has_causalmask_no_attnbias_no_dropout_maxk_256.cpp => fmha_batched_forward_fp16_no_causalmask_has_bias_no_dropout_maxk_256.cpp} (88%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_batched_forward_fp16_has_causalmask_no_attnbias_no_dropout_maxk_32.cpp => fmha_batched_forward_fp16_no_causalmask_has_bias_no_dropout_maxk_32.cpp} (88%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_batched_forward_fp16_has_causalmask_no_attnbias_no_dropout_maxk_64.cpp => fmha_batched_forward_fp16_no_causalmask_has_bias_no_dropout_maxk_64.cpp} (88%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_batched_forward_fp16_no_causalmask_no_attnbias_has_dropout_maxk_128.cpp => fmha_batched_forward_fp16_no_causalmask_no_bias_has_dropout_maxk_128.cpp} (88%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_batched_forward_fp16_no_causalmask_no_attnbias_has_dropout_maxk_256.cpp => fmha_batched_forward_fp16_no_causalmask_no_bias_has_dropout_maxk_256.cpp} (88%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_batched_forward_fp16_no_causalmask_no_attnbias_has_dropout_maxk_32.cpp => fmha_batched_forward_fp16_no_causalmask_no_bias_has_dropout_maxk_32.cpp} (88%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_batched_forward_fp16_no_causalmask_no_attnbias_has_dropout_maxk_64.cpp => fmha_batched_forward_fp16_no_causalmask_no_bias_has_dropout_maxk_64.cpp} (88%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_batched_forward_fp16_no_causalmask_no_attnbias_no_dropout_maxk_128.cpp => fmha_batched_forward_fp16_no_causalmask_no_bias_no_dropout_maxk_128.cpp} (88%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_batched_forward_fp16_no_causalmask_no_attnbias_no_dropout_maxk_256.cpp => fmha_batched_forward_fp16_no_causalmask_no_bias_no_dropout_maxk_256.cpp} (88%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_batched_forward_fp16_no_causalmask_no_attnbias_no_dropout_maxk_32.cpp => fmha_batched_forward_fp16_no_causalmask_no_bias_no_dropout_maxk_32.cpp} (88%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_batched_forward_fp16_no_causalmask_no_attnbias_no_dropout_maxk_64.cpp => fmha_batched_forward_fp16_no_causalmask_no_bias_no_dropout_maxk_64.cpp} (88%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_batched_infer_bp16_has_causalmask_has_attnbias_has_dropout_maxk_128.cpp => fmha_batched_infer_bp16_has_causalmask_has_bias_has_dropout_maxk_128.cpp} (88%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_batched_infer_bp16_has_causalmask_has_attnbias_has_dropout_maxk_256.cpp => fmha_batched_infer_bp16_has_causalmask_has_bias_has_dropout_maxk_256.cpp} (88%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_batched_infer_bp16_has_causalmask_has_attnbias_has_dropout_maxk_32.cpp => fmha_batched_infer_bp16_has_causalmask_has_bias_has_dropout_maxk_32.cpp} (88%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_batched_infer_bp16_has_causalmask_has_attnbias_has_dropout_maxk_64.cpp => fmha_batched_infer_bp16_has_causalmask_has_bias_has_dropout_maxk_64.cpp} (88%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_batched_infer_bp16_has_causalmask_has_attnbias_no_dropout_maxk_128.cpp => fmha_batched_infer_bp16_has_causalmask_has_bias_no_dropout_maxk_128.cpp} (88%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_batched_infer_bp16_no_causalmask_has_attnbias_has_dropout_maxk_256.cpp => fmha_batched_infer_bp16_has_causalmask_has_bias_no_dropout_maxk_256.cpp} (88%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_batched_infer_bp16_has_causalmask_no_attnbias_has_dropout_maxk_32.cpp => fmha_batched_infer_bp16_has_causalmask_has_bias_no_dropout_maxk_32.cpp} (88%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_batched_infer_bp16_has_causalmask_has_attnbias_no_dropout_maxk_64.cpp => fmha_batched_infer_bp16_has_causalmask_has_bias_no_dropout_maxk_64.cpp} (88%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_batched_infer_bp16_has_causalmask_no_attnbias_has_dropout_maxk_128.cpp => fmha_batched_infer_bp16_has_causalmask_no_bias_has_dropout_maxk_128.cpp} (88%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_batched_infer_bp16_has_causalmask_no_attnbias_has_dropout_maxk_256.cpp => fmha_batched_infer_bp16_has_causalmask_no_bias_has_dropout_maxk_256.cpp} (88%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_batched_infer_bp16_no_causalmask_has_attnbias_has_dropout_maxk_32.cpp => fmha_batched_infer_bp16_has_causalmask_no_bias_has_dropout_maxk_32.cpp} (88%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_batched_infer_bp16_has_causalmask_no_attnbias_has_dropout_maxk_64.cpp => fmha_batched_infer_bp16_has_causalmask_no_bias_has_dropout_maxk_64.cpp} (88%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_batched_infer_bp16_no_causalmask_has_attnbias_no_dropout_maxk_128.cpp => fmha_batched_infer_bp16_has_causalmask_no_bias_no_dropout_maxk_128.cpp} (88%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_batched_infer_bp16_no_causalmask_has_attnbias_no_dropout_maxk_256.cpp => fmha_batched_infer_bp16_has_causalmask_no_bias_no_dropout_maxk_256.cpp} (88%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_batched_infer_bp16_no_causalmask_has_attnbias_no_dropout_maxk_32.cpp => fmha_batched_infer_bp16_has_causalmask_no_bias_no_dropout_maxk_32.cpp} (88%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_batched_infer_bp16_no_causalmask_has_attnbias_no_dropout_maxk_64.cpp => fmha_batched_infer_bp16_has_causalmask_no_bias_no_dropout_maxk_64.cpp} (88%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_batched_infer_bp16_no_causalmask_has_attnbias_has_dropout_maxk_128.cpp => fmha_batched_infer_bp16_no_causalmask_has_bias_has_dropout_maxk_128.cpp} (88%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_batched_infer_bp16_has_causalmask_has_attnbias_no_dropout_maxk_256.cpp => fmha_batched_infer_bp16_no_causalmask_has_bias_has_dropout_maxk_256.cpp} (88%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_batched_infer_bp16_has_causalmask_has_attnbias_no_dropout_maxk_32.cpp => fmha_batched_infer_bp16_no_causalmask_has_bias_has_dropout_maxk_32.cpp} (88%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_batched_infer_bp16_no_causalmask_has_attnbias_has_dropout_maxk_64.cpp => fmha_batched_infer_bp16_no_causalmask_has_bias_has_dropout_maxk_64.cpp} (88%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_batched_infer_bp16_no_causalmask_no_attnbias_has_dropout_maxk_128.cpp => fmha_batched_infer_bp16_no_causalmask_has_bias_no_dropout_maxk_128.cpp} (88%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_batched_infer_bp16_no_causalmask_no_attnbias_has_dropout_maxk_256.cpp => fmha_batched_infer_bp16_no_causalmask_has_bias_no_dropout_maxk_256.cpp} (88%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_batched_infer_bp16_no_causalmask_no_attnbias_has_dropout_maxk_32.cpp => fmha_batched_infer_bp16_no_causalmask_has_bias_no_dropout_maxk_32.cpp} (88%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_batched_infer_bp16_no_causalmask_no_attnbias_has_dropout_maxk_64.cpp => fmha_batched_infer_bp16_no_causalmask_has_bias_no_dropout_maxk_64.cpp} (88%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_batched_infer_bp16_has_causalmask_no_attnbias_no_dropout_maxk_128.cpp => fmha_batched_infer_bp16_no_causalmask_no_bias_has_dropout_maxk_128.cpp} (88%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_batched_infer_bp16_has_causalmask_no_attnbias_no_dropout_maxk_256.cpp => fmha_batched_infer_bp16_no_causalmask_no_bias_has_dropout_maxk_256.cpp} (88%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_batched_infer_bp16_has_causalmask_no_attnbias_no_dropout_maxk_32.cpp => fmha_batched_infer_bp16_no_causalmask_no_bias_has_dropout_maxk_32.cpp} (88%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_batched_infer_bp16_has_causalmask_no_attnbias_no_dropout_maxk_64.cpp => fmha_batched_infer_bp16_no_causalmask_no_bias_has_dropout_maxk_64.cpp} (88%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_batched_infer_bp16_no_causalmask_no_attnbias_no_dropout_maxk_128.cpp => fmha_batched_infer_bp16_no_causalmask_no_bias_no_dropout_maxk_128.cpp} (88%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_batched_infer_bp16_no_causalmask_no_attnbias_no_dropout_maxk_256.cpp => fmha_batched_infer_bp16_no_causalmask_no_bias_no_dropout_maxk_256.cpp} (88%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_batched_infer_bp16_no_causalmask_no_attnbias_no_dropout_maxk_32.cpp => fmha_batched_infer_bp16_no_causalmask_no_bias_no_dropout_maxk_32.cpp} (88%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_batched_infer_bp16_no_causalmask_no_attnbias_no_dropout_maxk_64.cpp => fmha_batched_infer_bp16_no_causalmask_no_bias_no_dropout_maxk_64.cpp} (88%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_batched_infer_fp16_has_causalmask_has_attnbias_has_dropout_maxk_128.cpp => fmha_batched_infer_fp16_has_causalmask_has_bias_has_dropout_maxk_128.cpp} (88%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_batched_infer_fp16_has_causalmask_has_attnbias_has_dropout_maxk_256.cpp => fmha_batched_infer_fp16_has_causalmask_has_bias_has_dropout_maxk_256.cpp} (88%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_batched_infer_fp16_has_causalmask_has_attnbias_has_dropout_maxk_32.cpp => fmha_batched_infer_fp16_has_causalmask_has_bias_has_dropout_maxk_32.cpp} (88%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_batched_infer_fp16_has_causalmask_has_attnbias_has_dropout_maxk_64.cpp => fmha_batched_infer_fp16_has_causalmask_has_bias_has_dropout_maxk_64.cpp} (88%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_batched_infer_fp16_has_causalmask_has_attnbias_no_dropout_maxk_128.cpp => fmha_batched_infer_fp16_has_causalmask_has_bias_no_dropout_maxk_128.cpp} (88%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_batched_infer_fp16_has_causalmask_has_attnbias_no_dropout_maxk_256.cpp => fmha_batched_infer_fp16_has_causalmask_has_bias_no_dropout_maxk_256.cpp} (88%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_batched_infer_fp16_has_causalmask_has_attnbias_no_dropout_maxk_32.cpp => fmha_batched_infer_fp16_has_causalmask_has_bias_no_dropout_maxk_32.cpp} (88%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_batched_infer_fp16_has_causalmask_has_attnbias_no_dropout_maxk_64.cpp => fmha_batched_infer_fp16_has_causalmask_has_bias_no_dropout_maxk_64.cpp} (88%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_batched_infer_fp16_has_causalmask_no_attnbias_has_dropout_maxk_128.cpp => fmha_batched_infer_fp16_has_causalmask_no_bias_has_dropout_maxk_128.cpp} (88%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_batched_infer_fp16_has_causalmask_no_attnbias_has_dropout_maxk_256.cpp => fmha_batched_infer_fp16_has_causalmask_no_bias_has_dropout_maxk_256.cpp} (88%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_batched_infer_fp16_has_causalmask_no_attnbias_has_dropout_maxk_32.cpp => fmha_batched_infer_fp16_has_causalmask_no_bias_has_dropout_maxk_32.cpp} (88%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_batched_infer_fp16_has_causalmask_no_attnbias_has_dropout_maxk_64.cpp => fmha_batched_infer_fp16_has_causalmask_no_bias_has_dropout_maxk_64.cpp} (88%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_batched_infer_fp16_no_causalmask_has_attnbias_no_dropout_maxk_128.cpp => fmha_batched_infer_fp16_has_causalmask_no_bias_no_dropout_maxk_128.cpp} (88%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_batched_infer_fp16_no_causalmask_has_attnbias_no_dropout_maxk_256.cpp => fmha_batched_infer_fp16_has_causalmask_no_bias_no_dropout_maxk_256.cpp} (88%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_batched_infer_fp16_no_causalmask_has_attnbias_no_dropout_maxk_32.cpp => fmha_batched_infer_fp16_has_causalmask_no_bias_no_dropout_maxk_32.cpp} (88%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_batched_infer_fp16_no_causalmask_has_attnbias_no_dropout_maxk_64.cpp => fmha_batched_infer_fp16_has_causalmask_no_bias_no_dropout_maxk_64.cpp} (88%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_batched_infer_fp16_no_causalmask_has_attnbias_has_dropout_maxk_128.cpp => fmha_batched_infer_fp16_no_causalmask_has_bias_has_dropout_maxk_128.cpp} (88%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_batched_infer_fp16_no_causalmask_has_attnbias_has_dropout_maxk_256.cpp => fmha_batched_infer_fp16_no_causalmask_has_bias_has_dropout_maxk_256.cpp} (88%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_batched_infer_fp16_no_causalmask_has_attnbias_has_dropout_maxk_32.cpp => fmha_batched_infer_fp16_no_causalmask_has_bias_has_dropout_maxk_32.cpp} (88%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_batched_infer_fp16_no_causalmask_has_attnbias_has_dropout_maxk_64.cpp => fmha_batched_infer_fp16_no_causalmask_has_bias_has_dropout_maxk_64.cpp} (88%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_batched_infer_fp16_has_causalmask_no_attnbias_no_dropout_maxk_128.cpp => fmha_batched_infer_fp16_no_causalmask_has_bias_no_dropout_maxk_128.cpp} (88%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_batched_infer_fp16_has_causalmask_no_attnbias_no_dropout_maxk_256.cpp => fmha_batched_infer_fp16_no_causalmask_has_bias_no_dropout_maxk_256.cpp} (88%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_batched_infer_fp16_has_causalmask_no_attnbias_no_dropout_maxk_32.cpp => fmha_batched_infer_fp16_no_causalmask_has_bias_no_dropout_maxk_32.cpp} (88%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_batched_infer_fp16_has_causalmask_no_attnbias_no_dropout_maxk_64.cpp => fmha_batched_infer_fp16_no_causalmask_has_bias_no_dropout_maxk_64.cpp} (88%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_batched_infer_fp16_no_causalmask_no_attnbias_has_dropout_maxk_128.cpp => fmha_batched_infer_fp16_no_causalmask_no_bias_has_dropout_maxk_128.cpp} (88%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_batched_infer_fp16_no_causalmask_no_attnbias_has_dropout_maxk_256.cpp => fmha_batched_infer_fp16_no_causalmask_no_bias_has_dropout_maxk_256.cpp} (88%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_batched_infer_fp16_no_causalmask_no_attnbias_has_dropout_maxk_32.cpp => fmha_batched_infer_fp16_no_causalmask_no_bias_has_dropout_maxk_32.cpp} (88%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_batched_infer_fp16_no_causalmask_no_attnbias_has_dropout_maxk_64.cpp => fmha_batched_infer_fp16_no_causalmask_no_bias_has_dropout_maxk_64.cpp} (88%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_batched_infer_fp16_no_causalmask_no_attnbias_no_dropout_maxk_128.cpp => fmha_batched_infer_fp16_no_causalmask_no_bias_no_dropout_maxk_128.cpp} (88%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_batched_infer_fp16_no_causalmask_no_attnbias_no_dropout_maxk_256.cpp => fmha_batched_infer_fp16_no_causalmask_no_bias_no_dropout_maxk_256.cpp} (88%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_batched_infer_fp16_no_causalmask_no_attnbias_no_dropout_maxk_32.cpp => fmha_batched_infer_fp16_no_causalmask_no_bias_no_dropout_maxk_32.cpp} (88%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_batched_infer_fp16_no_causalmask_no_attnbias_no_dropout_maxk_64.cpp => fmha_batched_infer_fp16_no_causalmask_no_bias_no_dropout_maxk_64.cpp} (88%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_grouped_backward_bp16_has_causalmask_has_attnbias_has_biasgrad_has_dropout_maxk_128.cpp => fmha_grouped_backward_bp16_has_causalmask_has_bias_has_biasgrad_has_dropout_maxk_128.cpp} (88%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_grouped_backward_bp16_has_causalmask_has_attnbias_has_biasgrad_has_dropout_maxk_32.cpp => fmha_grouped_backward_bp16_has_causalmask_has_bias_has_biasgrad_has_dropout_maxk_32.cpp} (88%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_grouped_backward_bp16_has_causalmask_has_attnbias_has_biasgrad_has_dropout_maxk_64.cpp => fmha_grouped_backward_bp16_has_causalmask_has_bias_has_biasgrad_has_dropout_maxk_64.cpp} (88%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_grouped_backward_bp16_no_causalmask_has_attnbias_has_biasgrad_has_dropout_maxk_128.cpp => fmha_grouped_backward_bp16_has_causalmask_has_bias_has_biasgrad_no_dropout_maxk_128.cpp} (88%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_grouped_backward_bp16_has_causalmask_has_attnbias_has_biasgrad_no_dropout_maxk_32.cpp => fmha_grouped_backward_bp16_has_causalmask_has_bias_has_biasgrad_no_dropout_maxk_32.cpp} (88%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_grouped_backward_bp16_has_causalmask_has_attnbias_no_biasgrad_has_dropout_maxk_64.cpp => fmha_grouped_backward_bp16_has_causalmask_has_bias_has_biasgrad_no_dropout_maxk_64.cpp} (88%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_grouped_backward_bp16_has_causalmask_has_attnbias_no_biasgrad_has_dropout_maxk_128.cpp => fmha_grouped_backward_bp16_has_causalmask_has_bias_no_biasgrad_has_dropout_maxk_128.cpp} (88%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_grouped_backward_bp16_no_causalmask_has_attnbias_has_biasgrad_has_dropout_maxk_32.cpp => fmha_grouped_backward_bp16_has_causalmask_has_bias_no_biasgrad_has_dropout_maxk_32.cpp} (88%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_grouped_backward_bp16_has_causalmask_has_attnbias_has_biasgrad_no_dropout_maxk_64.cpp => fmha_grouped_backward_bp16_has_causalmask_has_bias_no_biasgrad_has_dropout_maxk_64.cpp} (88%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_grouped_backward_bp16_has_causalmask_no_attnbias_has_dropout_maxk_128.cpp => fmha_grouped_backward_bp16_has_causalmask_has_bias_no_biasgrad_no_dropout_maxk_128.cpp} (89%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_grouped_backward_bp16_no_causalmask_has_attnbias_no_biasgrad_has_dropout_maxk_32.cpp => fmha_grouped_backward_bp16_has_causalmask_has_bias_no_biasgrad_no_dropout_maxk_32.cpp} (88%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_grouped_backward_bp16_no_causalmask_has_attnbias_no_biasgrad_has_dropout_maxk_64.cpp => fmha_grouped_backward_bp16_has_causalmask_has_bias_no_biasgrad_no_dropout_maxk_64.cpp} (88%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_grouped_backward_bp16_no_causalmask_has_attnbias_no_biasgrad_has_dropout_maxk_128.cpp => fmha_grouped_backward_bp16_has_causalmask_no_bias_no_biasgrad_has_dropout_maxk_128.cpp} (89%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_grouped_backward_bp16_has_causalmask_no_attnbias_has_dropout_maxk_32.cpp => fmha_grouped_backward_bp16_has_causalmask_no_bias_no_biasgrad_has_dropout_maxk_32.cpp} (88%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_grouped_backward_bp16_no_causalmask_has_attnbias_has_biasgrad_no_dropout_maxk_64.cpp => fmha_grouped_backward_bp16_has_causalmask_no_bias_no_biasgrad_has_dropout_maxk_64.cpp} (88%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_grouped_backward_bp16_no_causalmask_has_attnbias_no_biasgrad_no_dropout_maxk_128.cpp => fmha_grouped_backward_bp16_has_causalmask_no_bias_no_biasgrad_no_dropout_maxk_128.cpp} (89%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_grouped_backward_bp16_has_causalmask_no_attnbias_no_dropout_maxk_32.cpp => fmha_grouped_backward_bp16_has_causalmask_no_bias_no_biasgrad_no_dropout_maxk_32.cpp} (89%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_grouped_backward_bp16_has_causalmask_no_attnbias_no_dropout_maxk_64.cpp => fmha_grouped_backward_bp16_has_causalmask_no_bias_no_biasgrad_no_dropout_maxk_64.cpp} (89%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_grouped_backward_bp16_has_causalmask_has_attnbias_has_biasgrad_no_dropout_maxk_128.cpp => fmha_grouped_backward_bp16_no_causalmask_has_bias_has_biasgrad_has_dropout_maxk_128.cpp} (88%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_grouped_backward_bp16_has_causalmask_has_attnbias_no_biasgrad_has_dropout_maxk_32.cpp => fmha_grouped_backward_bp16_no_causalmask_has_bias_has_biasgrad_has_dropout_maxk_32.cpp} (88%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_grouped_backward_bp16_no_causalmask_has_attnbias_has_biasgrad_has_dropout_maxk_64.cpp => fmha_grouped_backward_bp16_no_causalmask_has_bias_has_biasgrad_has_dropout_maxk_64.cpp} (88%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_grouped_backward_bp16_has_causalmask_has_attnbias_no_biasgrad_no_dropout_maxk_128.cpp => fmha_grouped_backward_bp16_no_causalmask_has_bias_has_biasgrad_no_dropout_maxk_128.cpp} (89%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_grouped_backward_bp16_no_causalmask_has_attnbias_has_biasgrad_no_dropout_maxk_32.cpp => fmha_grouped_backward_bp16_no_causalmask_has_bias_has_biasgrad_no_dropout_maxk_32.cpp} (88%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_grouped_backward_bp16_has_causalmask_no_attnbias_has_dropout_maxk_64.cpp => fmha_grouped_backward_bp16_no_causalmask_has_bias_has_biasgrad_no_dropout_maxk_64.cpp} (88%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_grouped_backward_bp16_no_causalmask_has_attnbias_has_biasgrad_no_dropout_maxk_128.cpp => fmha_grouped_backward_bp16_no_causalmask_has_bias_no_biasgrad_has_dropout_maxk_128.cpp} (89%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_grouped_backward_bp16_has_causalmask_has_attnbias_no_biasgrad_no_dropout_maxk_32.cpp => fmha_grouped_backward_bp16_no_causalmask_has_bias_no_biasgrad_has_dropout_maxk_32.cpp} (88%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_grouped_backward_bp16_has_causalmask_has_attnbias_no_biasgrad_no_dropout_maxk_64.cpp => fmha_grouped_backward_bp16_no_causalmask_has_bias_no_biasgrad_has_dropout_maxk_64.cpp} (88%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_grouped_backward_bp16_has_causalmask_no_attnbias_no_dropout_maxk_128.cpp => fmha_grouped_backward_bp16_no_causalmask_has_bias_no_biasgrad_no_dropout_maxk_128.cpp} (89%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_grouped_backward_bp16_no_causalmask_has_attnbias_no_biasgrad_no_dropout_maxk_32.cpp => fmha_grouped_backward_bp16_no_causalmask_has_bias_no_biasgrad_no_dropout_maxk_32.cpp} (89%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_grouped_backward_bp16_no_causalmask_has_attnbias_no_biasgrad_no_dropout_maxk_64.cpp => fmha_grouped_backward_bp16_no_causalmask_has_bias_no_biasgrad_no_dropout_maxk_64.cpp} (89%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_grouped_backward_bp16_no_causalmask_no_attnbias_has_dropout_maxk_128.cpp => fmha_grouped_backward_bp16_no_causalmask_no_bias_no_biasgrad_has_dropout_maxk_128.cpp} (89%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_grouped_backward_bp16_no_causalmask_no_attnbias_has_dropout_maxk_32.cpp => fmha_grouped_backward_bp16_no_causalmask_no_bias_no_biasgrad_has_dropout_maxk_32.cpp} (89%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_grouped_backward_bp16_no_causalmask_no_attnbias_has_dropout_maxk_64.cpp => fmha_grouped_backward_bp16_no_causalmask_no_bias_no_biasgrad_has_dropout_maxk_64.cpp} (89%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_grouped_backward_bp16_no_causalmask_no_attnbias_no_dropout_maxk_128.cpp => fmha_grouped_backward_bp16_no_causalmask_no_bias_no_biasgrad_no_dropout_maxk_128.cpp} (89%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_grouped_backward_bp16_no_causalmask_no_attnbias_no_dropout_maxk_32.cpp => fmha_grouped_backward_bp16_no_causalmask_no_bias_no_biasgrad_no_dropout_maxk_32.cpp} (89%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_grouped_backward_bp16_no_causalmask_no_attnbias_no_dropout_maxk_64.cpp => fmha_grouped_backward_bp16_no_causalmask_no_bias_no_biasgrad_no_dropout_maxk_64.cpp} (89%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_grouped_backward_fp16_has_causalmask_has_attnbias_has_biasgrad_has_dropout_maxk_128.cpp => fmha_grouped_backward_fp16_has_causalmask_has_bias_has_biasgrad_has_dropout_maxk_128.cpp} (88%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_grouped_backward_fp16_has_causalmask_has_attnbias_has_biasgrad_has_dropout_maxk_32.cpp => fmha_grouped_backward_fp16_has_causalmask_has_bias_has_biasgrad_has_dropout_maxk_32.cpp} (88%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_grouped_backward_fp16_has_causalmask_has_attnbias_has_biasgrad_has_dropout_maxk_64.cpp => fmha_grouped_backward_fp16_has_causalmask_has_bias_has_biasgrad_has_dropout_maxk_64.cpp} (88%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_grouped_backward_fp16_has_causalmask_has_attnbias_has_biasgrad_no_dropout_maxk_128.cpp => fmha_grouped_backward_fp16_has_causalmask_has_bias_has_biasgrad_no_dropout_maxk_128.cpp} (88%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_grouped_backward_fp16_has_causalmask_has_attnbias_has_biasgrad_no_dropout_maxk_32.cpp => fmha_grouped_backward_fp16_has_causalmask_has_bias_has_biasgrad_no_dropout_maxk_32.cpp} (88%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_grouped_backward_fp16_has_causalmask_has_attnbias_has_biasgrad_no_dropout_maxk_64.cpp => fmha_grouped_backward_fp16_has_causalmask_has_bias_has_biasgrad_no_dropout_maxk_64.cpp} (88%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_grouped_backward_fp16_has_causalmask_has_attnbias_no_biasgrad_has_dropout_maxk_128.cpp => fmha_grouped_backward_fp16_has_causalmask_has_bias_no_biasgrad_has_dropout_maxk_128.cpp} (88%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_grouped_backward_fp16_has_causalmask_has_attnbias_no_biasgrad_has_dropout_maxk_32.cpp => fmha_grouped_backward_fp16_has_causalmask_has_bias_no_biasgrad_has_dropout_maxk_32.cpp} (88%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_grouped_backward_fp16_has_causalmask_has_attnbias_no_biasgrad_has_dropout_maxk_64.cpp => fmha_grouped_backward_fp16_has_causalmask_has_bias_no_biasgrad_has_dropout_maxk_64.cpp} (88%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_grouped_backward_fp16_no_causalmask_has_attnbias_no_biasgrad_has_dropout_maxk_128.cpp => fmha_grouped_backward_fp16_has_causalmask_has_bias_no_biasgrad_no_dropout_maxk_128.cpp} (88%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_grouped_backward_fp16_no_causalmask_has_attnbias_no_biasgrad_has_dropout_maxk_32.cpp => fmha_grouped_backward_fp16_has_causalmask_has_bias_no_biasgrad_no_dropout_maxk_32.cpp} (88%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_grouped_backward_fp16_no_causalmask_has_attnbias_no_biasgrad_has_dropout_maxk_64.cpp => fmha_grouped_backward_fp16_has_causalmask_has_bias_no_biasgrad_no_dropout_maxk_64.cpp} (88%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_grouped_backward_fp16_no_causalmask_has_attnbias_has_biasgrad_no_dropout_maxk_128.cpp => fmha_grouped_backward_fp16_has_causalmask_no_bias_no_biasgrad_has_dropout_maxk_128.cpp} (88%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_grouped_backward_fp16_no_causalmask_has_attnbias_has_biasgrad_no_dropout_maxk_32.cpp => fmha_grouped_backward_fp16_has_causalmask_no_bias_no_biasgrad_has_dropout_maxk_32.cpp} (88%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_grouped_backward_fp16_no_causalmask_has_attnbias_has_biasgrad_no_dropout_maxk_64.cpp => fmha_grouped_backward_fp16_has_causalmask_no_bias_no_biasgrad_has_dropout_maxk_64.cpp} (88%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_grouped_backward_fp16_no_causalmask_has_attnbias_no_biasgrad_no_dropout_maxk_128.cpp => fmha_grouped_backward_fp16_has_causalmask_no_bias_no_biasgrad_no_dropout_maxk_128.cpp} (89%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_grouped_backward_fp16_no_causalmask_has_attnbias_no_biasgrad_no_dropout_maxk_32.cpp => fmha_grouped_backward_fp16_has_causalmask_no_bias_no_biasgrad_no_dropout_maxk_32.cpp} (88%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_grouped_backward_fp16_no_causalmask_has_attnbias_no_biasgrad_no_dropout_maxk_64.cpp => fmha_grouped_backward_fp16_has_causalmask_no_bias_no_biasgrad_no_dropout_maxk_64.cpp} (88%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_grouped_backward_fp16_no_causalmask_has_attnbias_has_biasgrad_has_dropout_maxk_128.cpp => fmha_grouped_backward_fp16_no_causalmask_has_bias_has_biasgrad_has_dropout_maxk_128.cpp} (88%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_grouped_backward_fp16_no_causalmask_has_attnbias_has_biasgrad_has_dropout_maxk_32.cpp => fmha_grouped_backward_fp16_no_causalmask_has_bias_has_biasgrad_has_dropout_maxk_32.cpp} (88%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_grouped_backward_fp16_no_causalmask_has_attnbias_has_biasgrad_has_dropout_maxk_64.cpp => fmha_grouped_backward_fp16_no_causalmask_has_bias_has_biasgrad_has_dropout_maxk_64.cpp} (88%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_grouped_backward_fp16_has_causalmask_has_attnbias_no_biasgrad_no_dropout_maxk_128.cpp => fmha_grouped_backward_fp16_no_causalmask_has_bias_has_biasgrad_no_dropout_maxk_128.cpp} (88%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_grouped_backward_fp16_has_causalmask_has_attnbias_no_biasgrad_no_dropout_maxk_32.cpp => fmha_grouped_backward_fp16_no_causalmask_has_bias_has_biasgrad_no_dropout_maxk_32.cpp} (88%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_grouped_backward_fp16_has_causalmask_has_attnbias_no_biasgrad_no_dropout_maxk_64.cpp => fmha_grouped_backward_fp16_no_causalmask_has_bias_has_biasgrad_no_dropout_maxk_64.cpp} (88%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_grouped_backward_fp16_has_causalmask_no_attnbias_has_dropout_maxk_128.cpp => fmha_grouped_backward_fp16_no_causalmask_has_bias_no_biasgrad_has_dropout_maxk_128.cpp} (88%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_grouped_backward_fp16_has_causalmask_no_attnbias_has_dropout_maxk_32.cpp => fmha_grouped_backward_fp16_no_causalmask_has_bias_no_biasgrad_has_dropout_maxk_32.cpp} (88%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_grouped_backward_fp16_has_causalmask_no_attnbias_has_dropout_maxk_64.cpp => fmha_grouped_backward_fp16_no_causalmask_has_bias_no_biasgrad_has_dropout_maxk_64.cpp} (88%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_grouped_backward_fp16_has_causalmask_no_attnbias_no_dropout_maxk_128.cpp => fmha_grouped_backward_fp16_no_causalmask_has_bias_no_biasgrad_no_dropout_maxk_128.cpp} (89%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_grouped_backward_fp16_has_causalmask_no_attnbias_no_dropout_maxk_32.cpp => fmha_grouped_backward_fp16_no_causalmask_has_bias_no_biasgrad_no_dropout_maxk_32.cpp} (88%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_grouped_backward_fp16_has_causalmask_no_attnbias_no_dropout_maxk_64.cpp => fmha_grouped_backward_fp16_no_causalmask_has_bias_no_biasgrad_no_dropout_maxk_64.cpp} (88%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_grouped_backward_fp16_no_causalmask_no_attnbias_has_dropout_maxk_128.cpp => fmha_grouped_backward_fp16_no_causalmask_no_bias_no_biasgrad_has_dropout_maxk_128.cpp} (89%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_grouped_backward_fp16_no_causalmask_no_attnbias_has_dropout_maxk_32.cpp => fmha_grouped_backward_fp16_no_causalmask_no_bias_no_biasgrad_has_dropout_maxk_32.cpp} (88%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_grouped_backward_fp16_no_causalmask_no_attnbias_has_dropout_maxk_64.cpp => fmha_grouped_backward_fp16_no_causalmask_no_bias_no_biasgrad_has_dropout_maxk_64.cpp} (88%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_grouped_backward_fp16_no_causalmask_no_attnbias_no_dropout_maxk_128.cpp => fmha_grouped_backward_fp16_no_causalmask_no_bias_no_biasgrad_no_dropout_maxk_128.cpp} (89%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_grouped_backward_fp16_no_causalmask_no_attnbias_no_dropout_maxk_32.cpp => fmha_grouped_backward_fp16_no_causalmask_no_bias_no_biasgrad_no_dropout_maxk_32.cpp} (89%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_grouped_backward_fp16_no_causalmask_no_attnbias_no_dropout_maxk_64.cpp => fmha_grouped_backward_fp16_no_causalmask_no_bias_no_biasgrad_no_dropout_maxk_64.cpp} (89%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_grouped_forward_bp16_has_causalmask_has_attnbias_has_dropout_maxk_128.cpp => fmha_grouped_forward_bp16_has_causalmask_has_bias_has_dropout_maxk_128.cpp} (88%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_grouped_forward_bp16_has_causalmask_has_attnbias_has_dropout_maxk_256.cpp => fmha_grouped_forward_bp16_has_causalmask_has_bias_has_dropout_maxk_256.cpp} (88%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_grouped_forward_bp16_has_causalmask_has_attnbias_has_dropout_maxk_32.cpp => fmha_grouped_forward_bp16_has_causalmask_has_bias_has_dropout_maxk_32.cpp} (88%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_grouped_forward_bp16_has_causalmask_has_attnbias_has_dropout_maxk_64.cpp => fmha_grouped_forward_bp16_has_causalmask_has_bias_has_dropout_maxk_64.cpp} (88%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_grouped_forward_bp16_no_causalmask_has_attnbias_has_dropout_maxk_128.cpp => fmha_grouped_forward_bp16_has_causalmask_has_bias_no_dropout_maxk_128.cpp} (88%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_grouped_forward_bp16_has_causalmask_no_attnbias_has_dropout_maxk_256.cpp => fmha_grouped_forward_bp16_has_causalmask_has_bias_no_dropout_maxk_256.cpp} (88%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_grouped_forward_bp16_has_causalmask_no_attnbias_has_dropout_maxk_32.cpp => fmha_grouped_forward_bp16_has_causalmask_has_bias_no_dropout_maxk_32.cpp} (88%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_grouped_forward_bp16_has_causalmask_has_attnbias_no_dropout_maxk_64.cpp => fmha_grouped_forward_bp16_has_causalmask_has_bias_no_dropout_maxk_64.cpp} (88%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_grouped_forward_bp16_has_causalmask_has_attnbias_no_dropout_maxk_128.cpp => fmha_grouped_forward_bp16_has_causalmask_no_bias_has_dropout_maxk_128.cpp} (88%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_grouped_forward_bp16_no_causalmask_has_attnbias_has_dropout_maxk_256.cpp => fmha_grouped_forward_bp16_has_causalmask_no_bias_has_dropout_maxk_256.cpp} (88%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_grouped_forward_bp16_no_causalmask_has_attnbias_has_dropout_maxk_32.cpp => fmha_grouped_forward_bp16_has_causalmask_no_bias_has_dropout_maxk_32.cpp} (88%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_grouped_forward_bp16_has_causalmask_no_attnbias_has_dropout_maxk_64.cpp => fmha_grouped_forward_bp16_has_causalmask_no_bias_has_dropout_maxk_64.cpp} (88%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_grouped_forward_bp16_no_causalmask_has_attnbias_no_dropout_maxk_128.cpp => fmha_grouped_forward_bp16_has_causalmask_no_bias_no_dropout_maxk_128.cpp} (88%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_grouped_forward_bp16_no_causalmask_has_attnbias_no_dropout_maxk_256.cpp => fmha_grouped_forward_bp16_has_causalmask_no_bias_no_dropout_maxk_256.cpp} (88%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_grouped_forward_bp16_no_causalmask_has_attnbias_no_dropout_maxk_32.cpp => fmha_grouped_forward_bp16_has_causalmask_no_bias_no_dropout_maxk_32.cpp} (88%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_grouped_forward_bp16_no_causalmask_has_attnbias_no_dropout_maxk_64.cpp => fmha_grouped_forward_bp16_has_causalmask_no_bias_no_dropout_maxk_64.cpp} (88%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_grouped_forward_bp16_has_causalmask_no_attnbias_has_dropout_maxk_128.cpp => fmha_grouped_forward_bp16_no_causalmask_has_bias_has_dropout_maxk_128.cpp} (88%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_grouped_forward_bp16_has_causalmask_has_attnbias_no_dropout_maxk_256.cpp => fmha_grouped_forward_bp16_no_causalmask_has_bias_has_dropout_maxk_256.cpp} (88%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_grouped_forward_bp16_has_causalmask_has_attnbias_no_dropout_maxk_32.cpp => fmha_grouped_forward_bp16_no_causalmask_has_bias_has_dropout_maxk_32.cpp} (88%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_grouped_forward_bp16_no_causalmask_has_attnbias_has_dropout_maxk_64.cpp => fmha_grouped_forward_bp16_no_causalmask_has_bias_has_dropout_maxk_64.cpp} (88%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_grouped_forward_bp16_no_causalmask_no_attnbias_has_dropout_maxk_128.cpp => fmha_grouped_forward_bp16_no_causalmask_has_bias_no_dropout_maxk_128.cpp} (88%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_grouped_forward_bp16_no_causalmask_no_attnbias_has_dropout_maxk_256.cpp => fmha_grouped_forward_bp16_no_causalmask_has_bias_no_dropout_maxk_256.cpp} (88%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_grouped_forward_bp16_has_causalmask_no_attnbias_no_dropout_maxk_32.cpp => fmha_grouped_forward_bp16_no_causalmask_has_bias_no_dropout_maxk_32.cpp} (88%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_grouped_forward_bp16_no_causalmask_no_attnbias_has_dropout_maxk_64.cpp => fmha_grouped_forward_bp16_no_causalmask_has_bias_no_dropout_maxk_64.cpp} (88%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_grouped_forward_bp16_has_causalmask_no_attnbias_no_dropout_maxk_128.cpp => fmha_grouped_forward_bp16_no_causalmask_no_bias_has_dropout_maxk_128.cpp} (88%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_grouped_forward_bp16_has_causalmask_no_attnbias_no_dropout_maxk_256.cpp => fmha_grouped_forward_bp16_no_causalmask_no_bias_has_dropout_maxk_256.cpp} (88%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_grouped_forward_bp16_no_causalmask_no_attnbias_has_dropout_maxk_32.cpp => fmha_grouped_forward_bp16_no_causalmask_no_bias_has_dropout_maxk_32.cpp} (88%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_grouped_forward_bp16_has_causalmask_no_attnbias_no_dropout_maxk_64.cpp => fmha_grouped_forward_bp16_no_causalmask_no_bias_has_dropout_maxk_64.cpp} (88%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_grouped_forward_bp16_no_causalmask_no_attnbias_no_dropout_maxk_128.cpp => fmha_grouped_forward_bp16_no_causalmask_no_bias_no_dropout_maxk_128.cpp} (88%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_grouped_forward_bp16_no_causalmask_no_attnbias_no_dropout_maxk_256.cpp => fmha_grouped_forward_bp16_no_causalmask_no_bias_no_dropout_maxk_256.cpp} (88%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_grouped_forward_bp16_no_causalmask_no_attnbias_no_dropout_maxk_32.cpp => fmha_grouped_forward_bp16_no_causalmask_no_bias_no_dropout_maxk_32.cpp} (88%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_grouped_forward_bp16_no_causalmask_no_attnbias_no_dropout_maxk_64.cpp => fmha_grouped_forward_bp16_no_causalmask_no_bias_no_dropout_maxk_64.cpp} (88%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_grouped_forward_fp16_has_causalmask_has_attnbias_has_dropout_maxk_128.cpp => fmha_grouped_forward_fp16_has_causalmask_has_bias_has_dropout_maxk_128.cpp} (88%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_grouped_forward_fp16_has_causalmask_has_attnbias_has_dropout_maxk_256.cpp => fmha_grouped_forward_fp16_has_causalmask_has_bias_has_dropout_maxk_256.cpp} (88%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_grouped_forward_fp16_has_causalmask_has_attnbias_has_dropout_maxk_32.cpp => fmha_grouped_forward_fp16_has_causalmask_has_bias_has_dropout_maxk_32.cpp} (88%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_grouped_forward_fp16_has_causalmask_has_attnbias_has_dropout_maxk_64.cpp => fmha_grouped_forward_fp16_has_causalmask_has_bias_has_dropout_maxk_64.cpp} (88%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_grouped_forward_fp16_has_causalmask_has_attnbias_no_dropout_maxk_128.cpp => fmha_grouped_forward_fp16_has_causalmask_has_bias_no_dropout_maxk_128.cpp} (88%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_grouped_forward_fp16_has_causalmask_has_attnbias_no_dropout_maxk_256.cpp => fmha_grouped_forward_fp16_has_causalmask_has_bias_no_dropout_maxk_256.cpp} (88%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_grouped_forward_fp16_has_causalmask_has_attnbias_no_dropout_maxk_32.cpp => fmha_grouped_forward_fp16_has_causalmask_has_bias_no_dropout_maxk_32.cpp} (88%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_grouped_forward_fp16_has_causalmask_has_attnbias_no_dropout_maxk_64.cpp => fmha_grouped_forward_fp16_has_causalmask_has_bias_no_dropout_maxk_64.cpp} (88%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_grouped_forward_fp16_has_causalmask_no_attnbias_has_dropout_maxk_128.cpp => fmha_grouped_forward_fp16_has_causalmask_no_bias_has_dropout_maxk_128.cpp} (88%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_grouped_forward_fp16_has_causalmask_no_attnbias_has_dropout_maxk_256.cpp => fmha_grouped_forward_fp16_has_causalmask_no_bias_has_dropout_maxk_256.cpp} (88%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_grouped_forward_fp16_no_causalmask_has_attnbias_has_dropout_maxk_32.cpp => fmha_grouped_forward_fp16_has_causalmask_no_bias_has_dropout_maxk_32.cpp} (88%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_grouped_forward_fp16_has_causalmask_no_attnbias_has_dropout_maxk_64.cpp => fmha_grouped_forward_fp16_has_causalmask_no_bias_has_dropout_maxk_64.cpp} (88%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_grouped_forward_fp16_no_causalmask_has_attnbias_no_dropout_maxk_128.cpp => fmha_grouped_forward_fp16_has_causalmask_no_bias_no_dropout_maxk_128.cpp} (88%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_grouped_forward_fp16_no_causalmask_has_attnbias_no_dropout_maxk_256.cpp => fmha_grouped_forward_fp16_has_causalmask_no_bias_no_dropout_maxk_256.cpp} (88%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_grouped_forward_fp16_no_causalmask_has_attnbias_no_dropout_maxk_32.cpp => fmha_grouped_forward_fp16_has_causalmask_no_bias_no_dropout_maxk_32.cpp} (88%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_grouped_forward_fp16_no_causalmask_has_attnbias_no_dropout_maxk_64.cpp => fmha_grouped_forward_fp16_has_causalmask_no_bias_no_dropout_maxk_64.cpp} (88%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_grouped_forward_fp16_no_causalmask_has_attnbias_has_dropout_maxk_128.cpp => fmha_grouped_forward_fp16_no_causalmask_has_bias_has_dropout_maxk_128.cpp} (88%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_grouped_forward_fp16_no_causalmask_has_attnbias_has_dropout_maxk_256.cpp => fmha_grouped_forward_fp16_no_causalmask_has_bias_has_dropout_maxk_256.cpp} (88%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_grouped_forward_fp16_has_causalmask_no_attnbias_has_dropout_maxk_32.cpp => fmha_grouped_forward_fp16_no_causalmask_has_bias_has_dropout_maxk_32.cpp} (88%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_grouped_forward_fp16_no_causalmask_has_attnbias_has_dropout_maxk_64.cpp => fmha_grouped_forward_fp16_no_causalmask_has_bias_has_dropout_maxk_64.cpp} (88%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_grouped_forward_fp16_has_causalmask_no_attnbias_no_dropout_maxk_128.cpp => fmha_grouped_forward_fp16_no_causalmask_has_bias_no_dropout_maxk_128.cpp} (88%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_grouped_forward_fp16_has_causalmask_no_attnbias_no_dropout_maxk_256.cpp => fmha_grouped_forward_fp16_no_causalmask_has_bias_no_dropout_maxk_256.cpp} (88%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_grouped_forward_fp16_has_causalmask_no_attnbias_no_dropout_maxk_32.cpp => fmha_grouped_forward_fp16_no_causalmask_has_bias_no_dropout_maxk_32.cpp} (88%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_grouped_forward_fp16_has_causalmask_no_attnbias_no_dropout_maxk_64.cpp => fmha_grouped_forward_fp16_no_causalmask_has_bias_no_dropout_maxk_64.cpp} (88%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_grouped_forward_fp16_no_causalmask_no_attnbias_has_dropout_maxk_128.cpp => fmha_grouped_forward_fp16_no_causalmask_no_bias_has_dropout_maxk_128.cpp} (88%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_grouped_forward_fp16_no_causalmask_no_attnbias_has_dropout_maxk_256.cpp => fmha_grouped_forward_fp16_no_causalmask_no_bias_has_dropout_maxk_256.cpp} (88%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_grouped_forward_fp16_no_causalmask_no_attnbias_has_dropout_maxk_32.cpp => fmha_grouped_forward_fp16_no_causalmask_no_bias_has_dropout_maxk_32.cpp} (88%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_grouped_forward_fp16_no_causalmask_no_attnbias_has_dropout_maxk_64.cpp => fmha_grouped_forward_fp16_no_causalmask_no_bias_has_dropout_maxk_64.cpp} (88%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_grouped_forward_fp16_no_causalmask_no_attnbias_no_dropout_maxk_128.cpp => fmha_grouped_forward_fp16_no_causalmask_no_bias_no_dropout_maxk_128.cpp} (88%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_grouped_forward_fp16_no_causalmask_no_attnbias_no_dropout_maxk_256.cpp => fmha_grouped_forward_fp16_no_causalmask_no_bias_no_dropout_maxk_256.cpp} (88%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_grouped_forward_fp16_no_causalmask_no_attnbias_no_dropout_maxk_32.cpp => fmha_grouped_forward_fp16_no_causalmask_no_bias_no_dropout_maxk_32.cpp} (88%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_grouped_forward_fp16_no_causalmask_no_attnbias_no_dropout_maxk_64.cpp => fmha_grouped_forward_fp16_no_causalmask_no_bias_no_dropout_maxk_64.cpp} (88%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_grouped_infer_bp16_has_causalmask_has_attnbias_has_dropout_maxk_128.cpp => fmha_grouped_infer_bp16_has_causalmask_has_bias_has_dropout_maxk_128.cpp} (88%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_grouped_infer_bp16_has_causalmask_has_attnbias_has_dropout_maxk_256.cpp => fmha_grouped_infer_bp16_has_causalmask_has_bias_has_dropout_maxk_256.cpp} (88%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_grouped_infer_bp16_has_causalmask_has_attnbias_has_dropout_maxk_32.cpp => fmha_grouped_infer_bp16_has_causalmask_has_bias_has_dropout_maxk_32.cpp} (88%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_grouped_infer_bp16_has_causalmask_has_attnbias_has_dropout_maxk_64.cpp => fmha_grouped_infer_bp16_has_causalmask_has_bias_has_dropout_maxk_64.cpp} (88%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_grouped_infer_bp16_has_causalmask_has_attnbias_no_dropout_maxk_128.cpp => fmha_grouped_infer_bp16_has_causalmask_has_bias_no_dropout_maxk_128.cpp} (88%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_grouped_infer_bp16_no_causalmask_has_attnbias_has_dropout_maxk_256.cpp => fmha_grouped_infer_bp16_has_causalmask_has_bias_no_dropout_maxk_256.cpp} (88%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_grouped_infer_bp16_has_causalmask_no_attnbias_has_dropout_maxk_32.cpp => fmha_grouped_infer_bp16_has_causalmask_has_bias_no_dropout_maxk_32.cpp} (88%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_grouped_infer_bp16_has_causalmask_has_attnbias_no_dropout_maxk_64.cpp => fmha_grouped_infer_bp16_has_causalmask_has_bias_no_dropout_maxk_64.cpp} (88%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_grouped_infer_bp16_has_causalmask_no_attnbias_has_dropout_maxk_128.cpp => fmha_grouped_infer_bp16_has_causalmask_no_bias_has_dropout_maxk_128.cpp} (88%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_grouped_infer_bp16_has_causalmask_no_attnbias_has_dropout_maxk_256.cpp => fmha_grouped_infer_bp16_has_causalmask_no_bias_has_dropout_maxk_256.cpp} (88%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_grouped_infer_bp16_no_causalmask_has_attnbias_has_dropout_maxk_32.cpp => fmha_grouped_infer_bp16_has_causalmask_no_bias_has_dropout_maxk_32.cpp} (88%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_grouped_infer_bp16_has_causalmask_no_attnbias_has_dropout_maxk_64.cpp => fmha_grouped_infer_bp16_has_causalmask_no_bias_has_dropout_maxk_64.cpp} (88%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_grouped_infer_bp16_no_causalmask_has_attnbias_no_dropout_maxk_128.cpp => fmha_grouped_infer_bp16_has_causalmask_no_bias_no_dropout_maxk_128.cpp} (88%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_grouped_infer_bp16_no_causalmask_has_attnbias_no_dropout_maxk_256.cpp => fmha_grouped_infer_bp16_has_causalmask_no_bias_no_dropout_maxk_256.cpp} (88%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_grouped_infer_bp16_no_causalmask_has_attnbias_no_dropout_maxk_32.cpp => fmha_grouped_infer_bp16_has_causalmask_no_bias_no_dropout_maxk_32.cpp} (88%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_grouped_infer_bp16_no_causalmask_has_attnbias_no_dropout_maxk_64.cpp => fmha_grouped_infer_bp16_has_causalmask_no_bias_no_dropout_maxk_64.cpp} (88%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_grouped_infer_bp16_no_causalmask_has_attnbias_has_dropout_maxk_128.cpp => fmha_grouped_infer_bp16_no_causalmask_has_bias_has_dropout_maxk_128.cpp} (88%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_grouped_infer_bp16_has_causalmask_has_attnbias_no_dropout_maxk_256.cpp => fmha_grouped_infer_bp16_no_causalmask_has_bias_has_dropout_maxk_256.cpp} (88%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_grouped_infer_bp16_has_causalmask_has_attnbias_no_dropout_maxk_32.cpp => fmha_grouped_infer_bp16_no_causalmask_has_bias_has_dropout_maxk_32.cpp} (88%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_grouped_infer_bp16_no_causalmask_has_attnbias_has_dropout_maxk_64.cpp => fmha_grouped_infer_bp16_no_causalmask_has_bias_has_dropout_maxk_64.cpp} (88%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_grouped_infer_bp16_no_causalmask_no_attnbias_has_dropout_maxk_128.cpp => fmha_grouped_infer_bp16_no_causalmask_has_bias_no_dropout_maxk_128.cpp} (88%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_grouped_infer_bp16_no_causalmask_no_attnbias_has_dropout_maxk_256.cpp => fmha_grouped_infer_bp16_no_causalmask_has_bias_no_dropout_maxk_256.cpp} (88%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_grouped_infer_bp16_no_causalmask_no_attnbias_has_dropout_maxk_32.cpp => fmha_grouped_infer_bp16_no_causalmask_has_bias_no_dropout_maxk_32.cpp} (88%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_grouped_infer_bp16_no_causalmask_no_attnbias_has_dropout_maxk_64.cpp => fmha_grouped_infer_bp16_no_causalmask_has_bias_no_dropout_maxk_64.cpp} (88%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_grouped_infer_bp16_has_causalmask_no_attnbias_no_dropout_maxk_128.cpp => fmha_grouped_infer_bp16_no_causalmask_no_bias_has_dropout_maxk_128.cpp} (88%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_grouped_infer_bp16_has_causalmask_no_attnbias_no_dropout_maxk_256.cpp => fmha_grouped_infer_bp16_no_causalmask_no_bias_has_dropout_maxk_256.cpp} (88%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_grouped_infer_bp16_has_causalmask_no_attnbias_no_dropout_maxk_32.cpp => fmha_grouped_infer_bp16_no_causalmask_no_bias_has_dropout_maxk_32.cpp} (88%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_grouped_infer_bp16_has_causalmask_no_attnbias_no_dropout_maxk_64.cpp => fmha_grouped_infer_bp16_no_causalmask_no_bias_has_dropout_maxk_64.cpp} (88%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_grouped_infer_bp16_no_causalmask_no_attnbias_no_dropout_maxk_128.cpp => fmha_grouped_infer_bp16_no_causalmask_no_bias_no_dropout_maxk_128.cpp} (88%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_grouped_infer_bp16_no_causalmask_no_attnbias_no_dropout_maxk_256.cpp => fmha_grouped_infer_bp16_no_causalmask_no_bias_no_dropout_maxk_256.cpp} (88%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_grouped_infer_bp16_no_causalmask_no_attnbias_no_dropout_maxk_32.cpp => fmha_grouped_infer_bp16_no_causalmask_no_bias_no_dropout_maxk_32.cpp} (88%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_grouped_infer_bp16_no_causalmask_no_attnbias_no_dropout_maxk_64.cpp => fmha_grouped_infer_bp16_no_causalmask_no_bias_no_dropout_maxk_64.cpp} (88%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_grouped_infer_fp16_has_causalmask_has_attnbias_has_dropout_maxk_128.cpp => fmha_grouped_infer_fp16_has_causalmask_has_bias_has_dropout_maxk_128.cpp} (88%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_grouped_infer_fp16_has_causalmask_has_attnbias_has_dropout_maxk_256.cpp => fmha_grouped_infer_fp16_has_causalmask_has_bias_has_dropout_maxk_256.cpp} (88%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_grouped_infer_fp16_has_causalmask_has_attnbias_has_dropout_maxk_32.cpp => fmha_grouped_infer_fp16_has_causalmask_has_bias_has_dropout_maxk_32.cpp} (88%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_grouped_infer_fp16_has_causalmask_has_attnbias_has_dropout_maxk_64.cpp => fmha_grouped_infer_fp16_has_causalmask_has_bias_has_dropout_maxk_64.cpp} (88%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_grouped_infer_fp16_has_causalmask_has_attnbias_no_dropout_maxk_128.cpp => fmha_grouped_infer_fp16_has_causalmask_has_bias_no_dropout_maxk_128.cpp} (88%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_grouped_infer_fp16_has_causalmask_has_attnbias_no_dropout_maxk_256.cpp => fmha_grouped_infer_fp16_has_causalmask_has_bias_no_dropout_maxk_256.cpp} (88%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_grouped_infer_fp16_has_causalmask_has_attnbias_no_dropout_maxk_32.cpp => fmha_grouped_infer_fp16_has_causalmask_has_bias_no_dropout_maxk_32.cpp} (88%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_grouped_infer_fp16_has_causalmask_has_attnbias_no_dropout_maxk_64.cpp => fmha_grouped_infer_fp16_has_causalmask_has_bias_no_dropout_maxk_64.cpp} (88%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_grouped_infer_fp16_has_causalmask_no_attnbias_has_dropout_maxk_128.cpp => fmha_grouped_infer_fp16_has_causalmask_no_bias_has_dropout_maxk_128.cpp} (88%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_grouped_infer_fp16_has_causalmask_no_attnbias_has_dropout_maxk_256.cpp => fmha_grouped_infer_fp16_has_causalmask_no_bias_has_dropout_maxk_256.cpp} (88%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_grouped_infer_fp16_has_causalmask_no_attnbias_has_dropout_maxk_32.cpp => fmha_grouped_infer_fp16_has_causalmask_no_bias_has_dropout_maxk_32.cpp} (88%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_grouped_infer_fp16_has_causalmask_no_attnbias_has_dropout_maxk_64.cpp => fmha_grouped_infer_fp16_has_causalmask_no_bias_has_dropout_maxk_64.cpp} (88%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_grouped_infer_fp16_no_causalmask_has_attnbias_no_dropout_maxk_128.cpp => fmha_grouped_infer_fp16_has_causalmask_no_bias_no_dropout_maxk_128.cpp} (88%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_grouped_infer_fp16_no_causalmask_has_attnbias_no_dropout_maxk_256.cpp => fmha_grouped_infer_fp16_has_causalmask_no_bias_no_dropout_maxk_256.cpp} (88%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_grouped_infer_fp16_no_causalmask_has_attnbias_no_dropout_maxk_32.cpp => fmha_grouped_infer_fp16_has_causalmask_no_bias_no_dropout_maxk_32.cpp} (88%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_grouped_infer_fp16_no_causalmask_has_attnbias_no_dropout_maxk_64.cpp => fmha_grouped_infer_fp16_has_causalmask_no_bias_no_dropout_maxk_64.cpp} (88%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_grouped_infer_fp16_no_causalmask_has_attnbias_has_dropout_maxk_128.cpp => fmha_grouped_infer_fp16_no_causalmask_has_bias_has_dropout_maxk_128.cpp} (88%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_grouped_infer_fp16_no_causalmask_has_attnbias_has_dropout_maxk_256.cpp => fmha_grouped_infer_fp16_no_causalmask_has_bias_has_dropout_maxk_256.cpp} (88%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_grouped_infer_fp16_no_causalmask_has_attnbias_has_dropout_maxk_32.cpp => fmha_grouped_infer_fp16_no_causalmask_has_bias_has_dropout_maxk_32.cpp} (88%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_grouped_infer_fp16_no_causalmask_has_attnbias_has_dropout_maxk_64.cpp => fmha_grouped_infer_fp16_no_causalmask_has_bias_has_dropout_maxk_64.cpp} (88%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_grouped_infer_fp16_has_causalmask_no_attnbias_no_dropout_maxk_128.cpp => fmha_grouped_infer_fp16_no_causalmask_has_bias_no_dropout_maxk_128.cpp} (88%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_grouped_infer_fp16_has_causalmask_no_attnbias_no_dropout_maxk_256.cpp => fmha_grouped_infer_fp16_no_causalmask_has_bias_no_dropout_maxk_256.cpp} (88%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_grouped_infer_fp16_has_causalmask_no_attnbias_no_dropout_maxk_32.cpp => fmha_grouped_infer_fp16_no_causalmask_has_bias_no_dropout_maxk_32.cpp} (88%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_grouped_infer_fp16_has_causalmask_no_attnbias_no_dropout_maxk_64.cpp => fmha_grouped_infer_fp16_no_causalmask_has_bias_no_dropout_maxk_64.cpp} (88%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_grouped_infer_fp16_no_causalmask_no_attnbias_has_dropout_maxk_128.cpp => fmha_grouped_infer_fp16_no_causalmask_no_bias_has_dropout_maxk_128.cpp} (88%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_grouped_infer_fp16_no_causalmask_no_attnbias_has_dropout_maxk_256.cpp => fmha_grouped_infer_fp16_no_causalmask_no_bias_has_dropout_maxk_256.cpp} (88%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_grouped_infer_fp16_no_causalmask_no_attnbias_has_dropout_maxk_32.cpp => fmha_grouped_infer_fp16_no_causalmask_no_bias_has_dropout_maxk_32.cpp} (88%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_grouped_infer_fp16_no_causalmask_no_attnbias_has_dropout_maxk_64.cpp => fmha_grouped_infer_fp16_no_causalmask_no_bias_has_dropout_maxk_64.cpp} (88%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_grouped_infer_fp16_no_causalmask_no_attnbias_no_dropout_maxk_128.cpp => fmha_grouped_infer_fp16_no_causalmask_no_bias_no_dropout_maxk_128.cpp} (88%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_grouped_infer_fp16_no_causalmask_no_attnbias_no_dropout_maxk_256.cpp => fmha_grouped_infer_fp16_no_causalmask_no_bias_no_dropout_maxk_256.cpp} (88%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_grouped_infer_fp16_no_causalmask_no_attnbias_no_dropout_maxk_32.cpp => fmha_grouped_infer_fp16_no_causalmask_no_bias_no_dropout_maxk_32.cpp} (88%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_grouped_infer_fp16_no_causalmask_no_attnbias_no_dropout_maxk_64.cpp => fmha_grouped_infer_fp16_no_causalmask_no_bias_no_dropout_maxk_64.cpp} (88%)

diff --git a/xformers/csrc/attention/hip_fmha/generate_instances.py b/xformers/csrc/attention/hip_fmha/generate_instances.py
new file mode 100644
index 000000000..f835ad82f
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/generate_instances.py
@@ -0,0 +1,192 @@
+# Copyright (c) 2023-2024, Advanced Micro Devices, Inc. All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+#
+
+import os
+from pathlib import Path
+
+FMHA_INSTANCE_HEADER = """
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
+ */
+"""
+
+FMHA_INFER_INSTANCE_TEMPLATE="""
+#include <ck/ck.hpp>
+#include \"ck_tiled_fmha_{mode}_infer.h\"
+
+template void run_{mode}_infer_causalmask_bias_dropout_dispatch<
+    {dtype},
+    {has_causalmask},
+    {has_bias},
+    {has_dropout},
+    {max_k}>({cap_mode}ForwardParams& param, hipStream_t stream);
+"""
+
+FMHA_INFER_INSTANCE_FNAME="fmha_{mode}_infer_{dtype_str}_{has_or_no_causalmask_str}_{has_or_no_bias_str}_{has_or_no_dropout_str}_{max_k_str}.cpp"
+
+FMHA_FORWARD_INSTANCE_TEMPLATE="""
+#include <ck/ck.hpp>
+#include \"ck_tiled_fmha_{mode}_forward.h\"
+
+template void run_{mode}_forward_causalmask_bias_dropout_dispatch<
+    {dtype},
+    {has_causalmask},
+    {has_bias},
+    {has_dropout},
+    {max_k}>({cap_mode}ForwardParams& param, hipStream_t stream);
+"""
+
+FMHA_FORWARD_INSTANCE_FNAME="fmha_{mode}_forward_{dtype_str}_{has_or_no_causalmask_str}_{has_or_no_bias_str}_{has_or_no_dropout_str}_{max_k_str}.cpp"
+
+FMHA_BACKWARD_INSTANCE_TEMPLATE="""
+#include <ck/ck.hpp>
+#include \"ck_tiled_fmha_{mode}_backward.h\"
+
+template void run_{mode}_backward_causalmask_bias_dropout_dispatch<
+    {dtype},
+    {has_causalmask},
+    {has_bias},
+    {has_bias_grad},
+    {has_dropout},
+    {max_k}>({cap_mode}BackwardParams& param, hipStream_t stream);
+"""
+
+FMHA_BACKWARD_INSTANCE_FNAME="fmha_{mode}_backward_{dtype_str}_{has_or_no_causalmask_str}_{has_or_no_bias_str}_{has_or_no_biasgrad_str}_{has_or_no_dropout_str}_{max_k_str}.cpp"
+
+BOOL_MAP = {
+    True : "true",
+    False : "false"
+}
+
+BOOL_MAP_CAUSALMASK = {
+    True : "has_causalmask",
+    False : "no_causalmask",
+}
+
+BOOL_MAP_BIAS = {
+    True :  "has_bias",
+    False : "no_bias", 
+}
+
+BOOL_MAP_BIASGRAD = {
+    True : "has_biasgrad", 
+    False : "no_biasgrad",
+}
+
+BOOL_MAP_DROPOUT = {
+    True : "has_dropout", 
+    False : "no_dropout",
+}
+
+INT_MAP_MAX_K = {
+    32 : "maxk_32",
+    64 : "maxk_64",
+    128 : "maxk_128",
+    256 : "maxk_256",
+}
+
+TYPE_CTYPE_MAP = {
+   "fp16" : "ck::half_t", 
+   "bp16" : "ck::bhalf_t",
+}
+
+MODE_NAME_MAP = {
+    "batched" : "Batched",
+    "grouped" : "Grouped", 
+}
+
+def create_infer_instances(instance_dir: Path) -> None:
+     for mode in ["batched", "grouped"]: 
+         for dtype in ["fp16", "bp16"]: 
+             for has_causalmask in [True, False]: 
+                 for has_bias in [True, False]:
+                     for has_dropout in [True, False]:
+                         for max_k in [32, 64, 128, 256]: 
+                              fname = FMHA_INFER_INSTANCE_FNAME.format(
+                                      mode=mode,
+                                      dtype_str=dtype, 
+                                      has_or_no_causalmask_str=BOOL_MAP_CAUSALMASK[has_causalmask],
+                                      has_or_no_bias_str=BOOL_MAP_BIAS[has_bias],
+                                      has_or_no_dropout_str=BOOL_MAP_DROPOUT[has_dropout],
+                                      max_k_str=INT_MAP_MAX_K[max_k],
+                                      )
+                              infer_instance = FMHA_INFER_INSTANCE_TEMPLATE.format(
+                                      mode=mode,
+                                      dtype=TYPE_CTYPE_MAP[dtype], 
+                                      has_causalmask=BOOL_MAP[has_causalmask],
+                                      has_bias=BOOL_MAP[has_bias],
+                                      has_dropout=BOOL_MAP[has_dropout],
+                                      max_k=max_k,
+                                      cap_mode=MODE_NAME_MAP[mode], 
+                                      )
+                              (instance_dir / fname).write_text(FMHA_INSTANCE_HEADER + infer_instance) 
+
+def create_forward_instances(instance_dir: Path) -> None:
+     for mode in ["batched", "grouped"]:
+         for dtype in ["fp16", "bp16"]:
+             for has_causalmask in [True, False]:
+                 for has_bias in [True, False]:
+                     for has_dropout in [True, False]:
+                         for max_k in [32, 64, 128, 256]:
+                              fname = FMHA_FORWARD_INSTANCE_FNAME.format(
+                                      mode=mode,
+                                      dtype_str=dtype,
+                                      has_or_no_causalmask_str=BOOL_MAP_CAUSALMASK[has_causalmask],
+                                      has_or_no_bias_str=BOOL_MAP_BIAS[has_bias],
+                                      has_or_no_dropout_str=BOOL_MAP_DROPOUT[has_dropout],
+                                      max_k_str=INT_MAP_MAX_K[max_k],
+                                      )
+                              infer_instance = FMHA_FORWARD_INSTANCE_TEMPLATE.format(
+                                      mode=mode,
+                                      dtype=TYPE_CTYPE_MAP[dtype],
+                                      has_causalmask=BOOL_MAP[has_causalmask],
+                                      has_bias=BOOL_MAP[has_bias],
+                                      has_dropout=BOOL_MAP[has_dropout],
+                                      max_k=max_k,
+                                      cap_mode=MODE_NAME_MAP[mode],
+                                      )
+                              (instance_dir / fname).write_text(FMHA_INSTANCE_HEADER + infer_instance)
+
+def create_backward_instances(instance_dir: Path) -> None:
+     for mode in ["batched", "grouped"]:
+         for dtype in ["fp16", "bp16"]:
+             for has_causalmask in [True, False]:
+                 for has_bias, has_bias_grad in [[True, False], [True, True], [False, False]]:
+                     for has_dropout in [True, False]:
+                         for max_k in [32, 64, 128]:
+                              fname = FMHA_BACKWARD_INSTANCE_FNAME.format(
+                                      mode=mode,
+                                      dtype_str=dtype,
+                                      has_or_no_causalmask_str=BOOL_MAP_CAUSALMASK[has_causalmask],
+                                      has_or_no_bias_str=BOOL_MAP_BIAS[has_bias],
+                                      has_or_no_biasgrad_str=BOOL_MAP_BIASGRAD[has_bias_grad],
+                                      has_or_no_dropout_str=BOOL_MAP_DROPOUT[has_dropout],
+                                      max_k_str=INT_MAP_MAX_K[max_k],
+                                      )
+                              infer_instance = FMHA_BACKWARD_INSTANCE_TEMPLATE.format(
+                                      mode=mode,
+                                      dtype=TYPE_CTYPE_MAP[dtype],
+                                      has_causalmask=BOOL_MAP[has_causalmask],
+                                      has_bias=BOOL_MAP[has_bias],
+                                      has_bias_grad=BOOL_MAP[has_bias_grad],
+                                      has_dropout=BOOL_MAP[has_dropout],
+                                      max_k=max_k,
+                                      cap_mode=MODE_NAME_MAP[mode],
+                                      )
+                              (instance_dir / fname).write_text(FMHA_INSTANCE_HEADER + infer_instance)
+
+if __name__ == "__main__":
+    this_dir = os.path.dirname(__file__)
+    output_dir = Path(this_dir) / "instances"
+    output_dir.mkdir(parents=True, exist_ok=True)
+    create_infer_instances(output_dir)
+    create_forward_instances(output_dir)
+    create_backward_instances(output_dir)
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_has_causalmask_has_attnbias_has_biasgrad_has_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_has_causalmask_has_bias_has_biasgrad_has_dropout_maxk_128.cpp
similarity index 88%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_has_causalmask_has_attnbias_has_biasgrad_has_dropout_maxk_128.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_has_causalmask_has_bias_has_biasgrad_has_dropout_maxk_128.cpp
index 53ab69fc2..f47ea8913 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_has_causalmask_has_attnbias_has_biasgrad_has_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_has_causalmask_has_bias_has_biasgrad_has_dropout_maxk_128.cpp
@@ -1,11 +1,14 @@
+
 /*
  * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
  */
-#include <ck/ck.hpp>
 
+#include <ck/ck.hpp>
 #include "ck_tiled_fmha_batched_backward.h"
 
 template void run_batched_backward_causalmask_bias_dropout_dispatch<
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_has_causalmask_has_attnbias_has_biasgrad_has_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_has_causalmask_has_bias_has_biasgrad_has_dropout_maxk_32.cpp
similarity index 88%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_has_causalmask_has_attnbias_has_biasgrad_has_dropout_maxk_32.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_has_causalmask_has_bias_has_biasgrad_has_dropout_maxk_32.cpp
index 17e2eef9a..80872bc87 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_has_causalmask_has_attnbias_has_biasgrad_has_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_has_causalmask_has_bias_has_biasgrad_has_dropout_maxk_32.cpp
@@ -1,11 +1,14 @@
+
 /*
  * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
  */
-#include <ck/ck.hpp>
 
+#include <ck/ck.hpp>
 #include "ck_tiled_fmha_batched_backward.h"
 
 template void run_batched_backward_causalmask_bias_dropout_dispatch<
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_has_causalmask_has_attnbias_has_biasgrad_has_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_has_causalmask_has_bias_has_biasgrad_has_dropout_maxk_64.cpp
similarity index 88%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_has_causalmask_has_attnbias_has_biasgrad_has_dropout_maxk_64.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_has_causalmask_has_bias_has_biasgrad_has_dropout_maxk_64.cpp
index e5903a262..1b7eb3fa1 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_has_causalmask_has_attnbias_has_biasgrad_has_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_has_causalmask_has_bias_has_biasgrad_has_dropout_maxk_64.cpp
@@ -1,11 +1,14 @@
+
 /*
  * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
  */
-#include <ck/ck.hpp>
 
+#include <ck/ck.hpp>
 #include "ck_tiled_fmha_batched_backward.h"
 
 template void run_batched_backward_causalmask_bias_dropout_dispatch<
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_no_causalmask_has_attnbias_has_biasgrad_has_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_has_causalmask_has_bias_has_biasgrad_no_dropout_maxk_128.cpp
similarity index 88%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_no_causalmask_has_attnbias_has_biasgrad_has_dropout_maxk_128.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_has_causalmask_has_bias_has_biasgrad_no_dropout_maxk_128.cpp
index 6d1a95675..fbcbc8673 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_no_causalmask_has_attnbias_has_biasgrad_has_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_has_causalmask_has_bias_has_biasgrad_no_dropout_maxk_128.cpp
@@ -1,17 +1,20 @@
+
 /*
  * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
  */
-#include <ck/ck.hpp>
 
+#include <ck/ck.hpp>
 #include "ck_tiled_fmha_batched_backward.h"
 
 template void run_batched_backward_causalmask_bias_dropout_dispatch<
     ck::bhalf_t,
-    false,
     true,
     true,
     true,
+    false,
     128>(BatchedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_has_causalmask_has_attnbias_has_biasgrad_no_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_has_causalmask_has_bias_has_biasgrad_no_dropout_maxk_32.cpp
similarity index 88%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_has_causalmask_has_attnbias_has_biasgrad_no_dropout_maxk_32.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_has_causalmask_has_bias_has_biasgrad_no_dropout_maxk_32.cpp
index 7c827865f..b7183ced4 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_has_causalmask_has_attnbias_has_biasgrad_no_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_has_causalmask_has_bias_has_biasgrad_no_dropout_maxk_32.cpp
@@ -1,11 +1,14 @@
+
 /*
  * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
  */
-#include <ck/ck.hpp>
 
+#include <ck/ck.hpp>
 #include "ck_tiled_fmha_batched_backward.h"
 
 template void run_batched_backward_causalmask_bias_dropout_dispatch<
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_has_causalmask_has_attnbias_no_biasgrad_has_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_has_causalmask_has_bias_has_biasgrad_no_dropout_maxk_64.cpp
similarity index 88%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_has_causalmask_has_attnbias_no_biasgrad_has_dropout_maxk_64.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_has_causalmask_has_bias_has_biasgrad_no_dropout_maxk_64.cpp
index 46de1be23..0a5135581 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_has_causalmask_has_attnbias_no_biasgrad_has_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_has_causalmask_has_bias_has_biasgrad_no_dropout_maxk_64.cpp
@@ -1,17 +1,20 @@
+
 /*
  * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
  */
-#include <ck/ck.hpp>
 
+#include <ck/ck.hpp>
 #include "ck_tiled_fmha_batched_backward.h"
 
 template void run_batched_backward_causalmask_bias_dropout_dispatch<
     ck::bhalf_t,
     true,
     true,
-    false,
     true,
+    false,
     64>(BatchedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_has_causalmask_has_attnbias_no_biasgrad_has_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_has_causalmask_has_bias_no_biasgrad_has_dropout_maxk_128.cpp
similarity index 88%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_has_causalmask_has_attnbias_no_biasgrad_has_dropout_maxk_128.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_has_causalmask_has_bias_no_biasgrad_has_dropout_maxk_128.cpp
index 0f2ad6e78..70d77321e 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_has_causalmask_has_attnbias_no_biasgrad_has_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_has_causalmask_has_bias_no_biasgrad_has_dropout_maxk_128.cpp
@@ -1,11 +1,14 @@
+
 /*
  * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
  */
-#include <ck/ck.hpp>
 
+#include <ck/ck.hpp>
 #include "ck_tiled_fmha_batched_backward.h"
 
 template void run_batched_backward_causalmask_bias_dropout_dispatch<
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_no_causalmask_has_attnbias_has_biasgrad_has_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_has_causalmask_has_bias_no_biasgrad_has_dropout_maxk_32.cpp
similarity index 88%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_no_causalmask_has_attnbias_has_biasgrad_has_dropout_maxk_32.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_has_causalmask_has_bias_no_biasgrad_has_dropout_maxk_32.cpp
index 2c227abf2..946da70a2 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_no_causalmask_has_attnbias_has_biasgrad_has_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_has_causalmask_has_bias_no_biasgrad_has_dropout_maxk_32.cpp
@@ -1,17 +1,20 @@
+
 /*
  * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
  */
-#include <ck/ck.hpp>
 
+#include <ck/ck.hpp>
 #include "ck_tiled_fmha_batched_backward.h"
 
 template void run_batched_backward_causalmask_bias_dropout_dispatch<
     ck::bhalf_t,
-    false,
     true,
     true,
+    false,
     true,
     32>(BatchedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_has_causalmask_has_attnbias_has_biasgrad_no_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_has_causalmask_has_bias_no_biasgrad_has_dropout_maxk_64.cpp
similarity index 88%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_has_causalmask_has_attnbias_has_biasgrad_no_dropout_maxk_64.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_has_causalmask_has_bias_no_biasgrad_has_dropout_maxk_64.cpp
index 34e32791e..a10d6a1bc 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_has_causalmask_has_attnbias_has_biasgrad_no_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_has_causalmask_has_bias_no_biasgrad_has_dropout_maxk_64.cpp
@@ -1,17 +1,20 @@
+
 /*
  * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
  */
-#include <ck/ck.hpp>
 
+#include <ck/ck.hpp>
 #include "ck_tiled_fmha_batched_backward.h"
 
 template void run_batched_backward_causalmask_bias_dropout_dispatch<
     ck::bhalf_t,
     true,
     true,
-    true,
     false,
+    true,
     64>(BatchedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_has_causalmask_no_attnbias_has_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_has_causalmask_has_bias_no_biasgrad_no_dropout_maxk_128.cpp
similarity index 89%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_has_causalmask_no_attnbias_has_dropout_maxk_128.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_has_causalmask_has_bias_no_biasgrad_no_dropout_maxk_128.cpp
index 6186abdf8..74a45b99b 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_has_causalmask_no_attnbias_has_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_has_causalmask_has_bias_no_biasgrad_no_dropout_maxk_128.cpp
@@ -1,17 +1,20 @@
+
 /*
  * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
  */
-#include <ck/ck.hpp>
 
+#include <ck/ck.hpp>
 #include "ck_tiled_fmha_batched_backward.h"
 
 template void run_batched_backward_causalmask_bias_dropout_dispatch<
     ck::bhalf_t,
     true,
+    true,
     false,
     false,
-    true,
     128>(BatchedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_no_causalmask_has_attnbias_no_biasgrad_has_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_has_causalmask_has_bias_no_biasgrad_no_dropout_maxk_32.cpp
similarity index 88%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_no_causalmask_has_attnbias_no_biasgrad_has_dropout_maxk_32.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_has_causalmask_has_bias_no_biasgrad_no_dropout_maxk_32.cpp
index 5619a5029..002b30ee5 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_no_causalmask_has_attnbias_no_biasgrad_has_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_has_causalmask_has_bias_no_biasgrad_no_dropout_maxk_32.cpp
@@ -1,17 +1,20 @@
+
 /*
  * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
  */
-#include <ck/ck.hpp>
 
+#include <ck/ck.hpp>
 #include "ck_tiled_fmha_batched_backward.h"
 
 template void run_batched_backward_causalmask_bias_dropout_dispatch<
     ck::bhalf_t,
-    false,
     true,
-    false,
     true,
+    false,
+    false,
     32>(BatchedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_no_causalmask_has_attnbias_no_biasgrad_has_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_has_causalmask_has_bias_no_biasgrad_no_dropout_maxk_64.cpp
similarity index 88%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_no_causalmask_has_attnbias_no_biasgrad_has_dropout_maxk_64.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_has_causalmask_has_bias_no_biasgrad_no_dropout_maxk_64.cpp
index 6b04d766a..0c4b5c1b6 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_no_causalmask_has_attnbias_no_biasgrad_has_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_has_causalmask_has_bias_no_biasgrad_no_dropout_maxk_64.cpp
@@ -1,17 +1,20 @@
+
 /*
  * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
  */
-#include <ck/ck.hpp>
 
+#include <ck/ck.hpp>
 #include "ck_tiled_fmha_batched_backward.h"
 
 template void run_batched_backward_causalmask_bias_dropout_dispatch<
     ck::bhalf_t,
-    false,
     true,
-    false,
     true,
+    false,
+    false,
     64>(BatchedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_no_causalmask_has_attnbias_no_biasgrad_has_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_has_causalmask_no_bias_no_biasgrad_has_dropout_maxk_128.cpp
similarity index 89%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_no_causalmask_has_attnbias_no_biasgrad_has_dropout_maxk_128.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_has_causalmask_no_bias_no_biasgrad_has_dropout_maxk_128.cpp
index afe52ab8b..b3a40e957 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_no_causalmask_has_attnbias_no_biasgrad_has_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_has_causalmask_no_bias_no_biasgrad_has_dropout_maxk_128.cpp
@@ -1,17 +1,20 @@
+
 /*
  * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
  */
-#include <ck/ck.hpp>
 
+#include <ck/ck.hpp>
 #include "ck_tiled_fmha_batched_backward.h"
 
 template void run_batched_backward_causalmask_bias_dropout_dispatch<
     ck::bhalf_t,
-    false,
     true,
     false,
+    false,
     true,
     128>(BatchedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_has_causalmask_no_attnbias_has_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_has_causalmask_no_bias_no_biasgrad_has_dropout_maxk_32.cpp
similarity index 88%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_has_causalmask_no_attnbias_has_dropout_maxk_32.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_has_causalmask_no_bias_no_biasgrad_has_dropout_maxk_32.cpp
index 175fbaf4d..25b8ae47d 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_has_causalmask_no_attnbias_has_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_has_causalmask_no_bias_no_biasgrad_has_dropout_maxk_32.cpp
@@ -1,11 +1,14 @@
+
 /*
  * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
  */
-#include <ck/ck.hpp>
 
+#include <ck/ck.hpp>
 #include "ck_tiled_fmha_batched_backward.h"
 
 template void run_batched_backward_causalmask_bias_dropout_dispatch<
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_no_causalmask_has_attnbias_has_biasgrad_no_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_has_causalmask_no_bias_no_biasgrad_has_dropout_maxk_64.cpp
similarity index 88%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_no_causalmask_has_attnbias_has_biasgrad_no_dropout_maxk_64.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_has_causalmask_no_bias_no_biasgrad_has_dropout_maxk_64.cpp
index 62f5b6e56..ac8b00115 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_no_causalmask_has_attnbias_has_biasgrad_no_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_has_causalmask_no_bias_no_biasgrad_has_dropout_maxk_64.cpp
@@ -1,17 +1,20 @@
+
 /*
  * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
  */
-#include <ck/ck.hpp>
 
+#include <ck/ck.hpp>
 #include "ck_tiled_fmha_batched_backward.h"
 
 template void run_batched_backward_causalmask_bias_dropout_dispatch<
     ck::bhalf_t,
-    false,
-    true,
     true,
     false,
+    false,
+    true,
     64>(BatchedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_no_causalmask_has_attnbias_no_biasgrad_no_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_has_causalmask_no_bias_no_biasgrad_no_dropout_maxk_128.cpp
similarity index 89%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_no_causalmask_has_attnbias_no_biasgrad_no_dropout_maxk_128.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_has_causalmask_no_bias_no_biasgrad_no_dropout_maxk_128.cpp
index 693ac4f26..f4ab60aed 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_no_causalmask_has_attnbias_no_biasgrad_no_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_has_causalmask_no_bias_no_biasgrad_no_dropout_maxk_128.cpp
@@ -1,17 +1,20 @@
+
 /*
  * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
  */
-#include <ck/ck.hpp>
 
+#include <ck/ck.hpp>
 #include "ck_tiled_fmha_batched_backward.h"
 
 template void run_batched_backward_causalmask_bias_dropout_dispatch<
     ck::bhalf_t,
-    false,
     true,
     false,
     false,
+    false,
     128>(BatchedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_has_causalmask_no_attnbias_no_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_has_causalmask_no_bias_no_biasgrad_no_dropout_maxk_32.cpp
similarity index 89%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_has_causalmask_no_attnbias_no_dropout_maxk_32.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_has_causalmask_no_bias_no_biasgrad_no_dropout_maxk_32.cpp
index 74e6105e7..40a92b384 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_has_causalmask_no_attnbias_no_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_has_causalmask_no_bias_no_biasgrad_no_dropout_maxk_32.cpp
@@ -1,11 +1,14 @@
+
 /*
  * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
  */
-#include <ck/ck.hpp>
 
+#include <ck/ck.hpp>
 #include "ck_tiled_fmha_batched_backward.h"
 
 template void run_batched_backward_causalmask_bias_dropout_dispatch<
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_has_causalmask_no_attnbias_no_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_has_causalmask_no_bias_no_biasgrad_no_dropout_maxk_64.cpp
similarity index 89%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_has_causalmask_no_attnbias_no_dropout_maxk_64.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_has_causalmask_no_bias_no_biasgrad_no_dropout_maxk_64.cpp
index fa3b403a3..aac83e1bb 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_has_causalmask_no_attnbias_no_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_has_causalmask_no_bias_no_biasgrad_no_dropout_maxk_64.cpp
@@ -1,11 +1,14 @@
+
 /*
  * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
  */
-#include <ck/ck.hpp>
 
+#include <ck/ck.hpp>
 #include "ck_tiled_fmha_batched_backward.h"
 
 template void run_batched_backward_causalmask_bias_dropout_dispatch<
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_has_causalmask_has_attnbias_has_biasgrad_no_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_no_causalmask_has_bias_has_biasgrad_has_dropout_maxk_128.cpp
similarity index 88%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_has_causalmask_has_attnbias_has_biasgrad_no_dropout_maxk_128.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_no_causalmask_has_bias_has_biasgrad_has_dropout_maxk_128.cpp
index 3d93e9168..752e5a535 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_has_causalmask_has_attnbias_has_biasgrad_no_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_no_causalmask_has_bias_has_biasgrad_has_dropout_maxk_128.cpp
@@ -1,17 +1,20 @@
+
 /*
  * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
  */
-#include <ck/ck.hpp>
 
+#include <ck/ck.hpp>
 #include "ck_tiled_fmha_batched_backward.h"
 
 template void run_batched_backward_causalmask_bias_dropout_dispatch<
     ck::bhalf_t,
+    false,
     true,
     true,
     true,
-    false,
     128>(BatchedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_has_causalmask_has_attnbias_no_biasgrad_has_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_no_causalmask_has_bias_has_biasgrad_has_dropout_maxk_32.cpp
similarity index 88%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_has_causalmask_has_attnbias_no_biasgrad_has_dropout_maxk_32.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_no_causalmask_has_bias_has_biasgrad_has_dropout_maxk_32.cpp
index 746539438..2296da150 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_has_causalmask_has_attnbias_no_biasgrad_has_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_no_causalmask_has_bias_has_biasgrad_has_dropout_maxk_32.cpp
@@ -1,17 +1,20 @@
+
 /*
  * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
  */
-#include <ck/ck.hpp>
 
+#include <ck/ck.hpp>
 #include "ck_tiled_fmha_batched_backward.h"
 
 template void run_batched_backward_causalmask_bias_dropout_dispatch<
     ck::bhalf_t,
+    false,
     true,
     true,
-    false,
     true,
     32>(BatchedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_no_causalmask_has_attnbias_has_biasgrad_has_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_no_causalmask_has_bias_has_biasgrad_has_dropout_maxk_64.cpp
similarity index 88%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_no_causalmask_has_attnbias_has_biasgrad_has_dropout_maxk_64.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_no_causalmask_has_bias_has_biasgrad_has_dropout_maxk_64.cpp
index 7375b1aca..68876d1ee 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_no_causalmask_has_attnbias_has_biasgrad_has_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_no_causalmask_has_bias_has_biasgrad_has_dropout_maxk_64.cpp
@@ -1,11 +1,14 @@
+
 /*
  * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
  */
-#include <ck/ck.hpp>
 
+#include <ck/ck.hpp>
 #include "ck_tiled_fmha_batched_backward.h"
 
 template void run_batched_backward_causalmask_bias_dropout_dispatch<
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_has_causalmask_has_attnbias_no_biasgrad_no_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_no_causalmask_has_bias_has_biasgrad_no_dropout_maxk_128.cpp
similarity index 89%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_has_causalmask_has_attnbias_no_biasgrad_no_dropout_maxk_128.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_no_causalmask_has_bias_has_biasgrad_no_dropout_maxk_128.cpp
index fea36c72b..dcb2b0696 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_has_causalmask_has_attnbias_no_biasgrad_no_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_no_causalmask_has_bias_has_biasgrad_no_dropout_maxk_128.cpp
@@ -1,17 +1,20 @@
+
 /*
  * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
  */
-#include <ck/ck.hpp>
 
+#include <ck/ck.hpp>
 #include "ck_tiled_fmha_batched_backward.h"
 
 template void run_batched_backward_causalmask_bias_dropout_dispatch<
     ck::bhalf_t,
+    false,
     true,
     true,
     false,
-    false,
     128>(BatchedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_no_causalmask_has_attnbias_has_biasgrad_no_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_no_causalmask_has_bias_has_biasgrad_no_dropout_maxk_32.cpp
similarity index 88%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_no_causalmask_has_attnbias_has_biasgrad_no_dropout_maxk_32.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_no_causalmask_has_bias_has_biasgrad_no_dropout_maxk_32.cpp
index 9cf279c5b..1c7f28a08 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_no_causalmask_has_attnbias_has_biasgrad_no_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_no_causalmask_has_bias_has_biasgrad_no_dropout_maxk_32.cpp
@@ -1,11 +1,14 @@
+
 /*
  * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
  */
-#include <ck/ck.hpp>
 
+#include <ck/ck.hpp>
 #include "ck_tiled_fmha_batched_backward.h"
 
 template void run_batched_backward_causalmask_bias_dropout_dispatch<
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_has_causalmask_no_attnbias_has_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_no_causalmask_has_bias_has_biasgrad_no_dropout_maxk_64.cpp
similarity index 88%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_has_causalmask_no_attnbias_has_dropout_maxk_64.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_no_causalmask_has_bias_has_biasgrad_no_dropout_maxk_64.cpp
index c8e379d59..5100ac96b 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_has_causalmask_no_attnbias_has_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_no_causalmask_has_bias_has_biasgrad_no_dropout_maxk_64.cpp
@@ -1,17 +1,20 @@
+
 /*
  * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
  */
-#include <ck/ck.hpp>
 
+#include <ck/ck.hpp>
 #include "ck_tiled_fmha_batched_backward.h"
 
 template void run_batched_backward_causalmask_bias_dropout_dispatch<
     ck::bhalf_t,
-    true,
-    false,
     false,
     true,
+    true,
+    false,
     64>(BatchedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_no_causalmask_has_attnbias_has_biasgrad_no_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_no_causalmask_has_bias_no_biasgrad_has_dropout_maxk_128.cpp
similarity index 89%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_no_causalmask_has_attnbias_has_biasgrad_no_dropout_maxk_128.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_no_causalmask_has_bias_no_biasgrad_has_dropout_maxk_128.cpp
index d987a2516..489bdd9a5 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_no_causalmask_has_attnbias_has_biasgrad_no_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_no_causalmask_has_bias_no_biasgrad_has_dropout_maxk_128.cpp
@@ -1,17 +1,20 @@
+
 /*
  * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
  */
-#include <ck/ck.hpp>
 
+#include <ck/ck.hpp>
 #include "ck_tiled_fmha_batched_backward.h"
 
 template void run_batched_backward_causalmask_bias_dropout_dispatch<
     ck::bhalf_t,
     false,
     true,
-    true,
     false,
+    true,
     128>(BatchedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_has_causalmask_has_attnbias_no_biasgrad_no_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_no_causalmask_has_bias_no_biasgrad_has_dropout_maxk_32.cpp
similarity index 88%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_has_causalmask_has_attnbias_no_biasgrad_no_dropout_maxk_32.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_no_causalmask_has_bias_no_biasgrad_has_dropout_maxk_32.cpp
index f570c926e..27ab35a1b 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_has_causalmask_has_attnbias_no_biasgrad_no_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_no_causalmask_has_bias_no_biasgrad_has_dropout_maxk_32.cpp
@@ -1,17 +1,20 @@
+
 /*
  * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
  */
-#include <ck/ck.hpp>
 
+#include <ck/ck.hpp>
 #include "ck_tiled_fmha_batched_backward.h"
 
 template void run_batched_backward_causalmask_bias_dropout_dispatch<
     ck::bhalf_t,
-    true,
-    true,
     false,
+    true,
     false,
+    true,
     32>(BatchedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_has_causalmask_has_attnbias_no_biasgrad_no_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_no_causalmask_has_bias_no_biasgrad_has_dropout_maxk_64.cpp
similarity index 88%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_has_causalmask_has_attnbias_no_biasgrad_no_dropout_maxk_64.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_no_causalmask_has_bias_no_biasgrad_has_dropout_maxk_64.cpp
index 463aa81de..d2508d993 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_has_causalmask_has_attnbias_no_biasgrad_no_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_no_causalmask_has_bias_no_biasgrad_has_dropout_maxk_64.cpp
@@ -1,17 +1,20 @@
+
 /*
  * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
  */
-#include <ck/ck.hpp>
 
+#include <ck/ck.hpp>
 #include "ck_tiled_fmha_batched_backward.h"
 
 template void run_batched_backward_causalmask_bias_dropout_dispatch<
     ck::bhalf_t,
-    true,
-    true,
     false,
+    true,
     false,
+    true,
     64>(BatchedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_has_causalmask_no_attnbias_no_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_no_causalmask_has_bias_no_biasgrad_no_dropout_maxk_128.cpp
similarity index 89%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_has_causalmask_no_attnbias_no_dropout_maxk_128.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_no_causalmask_has_bias_no_biasgrad_no_dropout_maxk_128.cpp
index 2a535ec0c..795744d65 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_has_causalmask_no_attnbias_no_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_no_causalmask_has_bias_no_biasgrad_no_dropout_maxk_128.cpp
@@ -1,17 +1,20 @@
+
 /*
  * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
  */
-#include <ck/ck.hpp>
 
+#include <ck/ck.hpp>
 #include "ck_tiled_fmha_batched_backward.h"
 
 template void run_batched_backward_causalmask_bias_dropout_dispatch<
     ck::bhalf_t,
-    true,
     false,
+    true,
     false,
     false,
     128>(BatchedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_no_causalmask_has_attnbias_no_biasgrad_no_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_no_causalmask_has_bias_no_biasgrad_no_dropout_maxk_32.cpp
similarity index 89%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_no_causalmask_has_attnbias_no_biasgrad_no_dropout_maxk_32.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_no_causalmask_has_bias_no_biasgrad_no_dropout_maxk_32.cpp
index aa754420b..7a45b95db 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_no_causalmask_has_attnbias_no_biasgrad_no_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_no_causalmask_has_bias_no_biasgrad_no_dropout_maxk_32.cpp
@@ -1,11 +1,14 @@
+
 /*
  * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
  */
-#include <ck/ck.hpp>
 
+#include <ck/ck.hpp>
 #include "ck_tiled_fmha_batched_backward.h"
 
 template void run_batched_backward_causalmask_bias_dropout_dispatch<
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_no_causalmask_has_attnbias_no_biasgrad_no_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_no_causalmask_has_bias_no_biasgrad_no_dropout_maxk_64.cpp
similarity index 89%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_no_causalmask_has_attnbias_no_biasgrad_no_dropout_maxk_64.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_no_causalmask_has_bias_no_biasgrad_no_dropout_maxk_64.cpp
index 04badab08..f98cac80b 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_no_causalmask_has_attnbias_no_biasgrad_no_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_no_causalmask_has_bias_no_biasgrad_no_dropout_maxk_64.cpp
@@ -1,11 +1,14 @@
+
 /*
  * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
  */
-#include <ck/ck.hpp>
 
+#include <ck/ck.hpp>
 #include "ck_tiled_fmha_batched_backward.h"
 
 template void run_batched_backward_causalmask_bias_dropout_dispatch<
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_no_causalmask_no_attnbias_has_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_no_causalmask_no_bias_no_biasgrad_has_dropout_maxk_128.cpp
similarity index 89%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_no_causalmask_no_attnbias_has_dropout_maxk_128.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_no_causalmask_no_bias_no_biasgrad_has_dropout_maxk_128.cpp
index 366e6a68e..5d626588b 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_no_causalmask_no_attnbias_has_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_no_causalmask_no_bias_no_biasgrad_has_dropout_maxk_128.cpp
@@ -1,11 +1,14 @@
+
 /*
  * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
  */
-#include <ck/ck.hpp>
 
+#include <ck/ck.hpp>
 #include "ck_tiled_fmha_batched_backward.h"
 
 template void run_batched_backward_causalmask_bias_dropout_dispatch<
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_no_causalmask_no_attnbias_has_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_no_causalmask_no_bias_no_biasgrad_has_dropout_maxk_32.cpp
similarity index 89%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_no_causalmask_no_attnbias_has_dropout_maxk_32.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_no_causalmask_no_bias_no_biasgrad_has_dropout_maxk_32.cpp
index 0f0c58743..babf14605 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_no_causalmask_no_attnbias_has_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_no_causalmask_no_bias_no_biasgrad_has_dropout_maxk_32.cpp
@@ -1,11 +1,14 @@
+
 /*
  * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
  */
-#include <ck/ck.hpp>
 
+#include <ck/ck.hpp>
 #include "ck_tiled_fmha_batched_backward.h"
 
 template void run_batched_backward_causalmask_bias_dropout_dispatch<
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_no_causalmask_no_attnbias_has_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_no_causalmask_no_bias_no_biasgrad_has_dropout_maxk_64.cpp
similarity index 89%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_no_causalmask_no_attnbias_has_dropout_maxk_64.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_no_causalmask_no_bias_no_biasgrad_has_dropout_maxk_64.cpp
index 2a8279443..47eed928b 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_no_causalmask_no_attnbias_has_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_no_causalmask_no_bias_no_biasgrad_has_dropout_maxk_64.cpp
@@ -1,11 +1,14 @@
+
 /*
  * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
  */
-#include <ck/ck.hpp>
 
+#include <ck/ck.hpp>
 #include "ck_tiled_fmha_batched_backward.h"
 
 template void run_batched_backward_causalmask_bias_dropout_dispatch<
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_no_causalmask_no_attnbias_no_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_no_causalmask_no_bias_no_biasgrad_no_dropout_maxk_128.cpp
similarity index 89%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_no_causalmask_no_attnbias_no_dropout_maxk_128.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_no_causalmask_no_bias_no_biasgrad_no_dropout_maxk_128.cpp
index c943f2ea3..de13cdfa0 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_no_causalmask_no_attnbias_no_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_no_causalmask_no_bias_no_biasgrad_no_dropout_maxk_128.cpp
@@ -1,11 +1,14 @@
+
 /*
  * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
  */
-#include <ck/ck.hpp>
 
+#include <ck/ck.hpp>
 #include "ck_tiled_fmha_batched_backward.h"
 
 template void run_batched_backward_causalmask_bias_dropout_dispatch<
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_no_causalmask_no_attnbias_no_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_no_causalmask_no_bias_no_biasgrad_no_dropout_maxk_32.cpp
similarity index 89%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_no_causalmask_no_attnbias_no_dropout_maxk_32.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_no_causalmask_no_bias_no_biasgrad_no_dropout_maxk_32.cpp
index 6cfe5c349..ffaf66bdf 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_no_causalmask_no_attnbias_no_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_no_causalmask_no_bias_no_biasgrad_no_dropout_maxk_32.cpp
@@ -1,11 +1,14 @@
+
 /*
  * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
  */
-#include <ck/ck.hpp>
 
+#include <ck/ck.hpp>
 #include "ck_tiled_fmha_batched_backward.h"
 
 template void run_batched_backward_causalmask_bias_dropout_dispatch<
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_no_causalmask_no_attnbias_no_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_no_causalmask_no_bias_no_biasgrad_no_dropout_maxk_64.cpp
similarity index 89%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_no_causalmask_no_attnbias_no_dropout_maxk_64.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_no_causalmask_no_bias_no_biasgrad_no_dropout_maxk_64.cpp
index 4c2d55d06..53446d60e 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_no_causalmask_no_attnbias_no_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_no_causalmask_no_bias_no_biasgrad_no_dropout_maxk_64.cpp
@@ -1,11 +1,14 @@
+
 /*
  * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
  */
-#include <ck/ck.hpp>
 
+#include <ck/ck.hpp>
 #include "ck_tiled_fmha_batched_backward.h"
 
 template void run_batched_backward_causalmask_bias_dropout_dispatch<
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_has_attnbias_has_biasgrad_has_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_has_bias_has_biasgrad_has_dropout_maxk_128.cpp
similarity index 88%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_has_attnbias_has_biasgrad_has_dropout_maxk_128.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_has_bias_has_biasgrad_has_dropout_maxk_128.cpp
index c7c2bf020..78e737557 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_has_attnbias_has_biasgrad_has_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_has_bias_has_biasgrad_has_dropout_maxk_128.cpp
@@ -1,11 +1,14 @@
+
 /*
  * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
  */
-#include <ck/ck.hpp>
 
+#include <ck/ck.hpp>
 #include "ck_tiled_fmha_batched_backward.h"
 
 template void run_batched_backward_causalmask_bias_dropout_dispatch<
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_has_attnbias_has_biasgrad_has_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_has_bias_has_biasgrad_has_dropout_maxk_32.cpp
similarity index 88%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_has_attnbias_has_biasgrad_has_dropout_maxk_32.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_has_bias_has_biasgrad_has_dropout_maxk_32.cpp
index 970c63e14..6253cb013 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_has_attnbias_has_biasgrad_has_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_has_bias_has_biasgrad_has_dropout_maxk_32.cpp
@@ -1,11 +1,14 @@
+
 /*
  * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
  */
-#include <ck/ck.hpp>
 
+#include <ck/ck.hpp>
 #include "ck_tiled_fmha_batched_backward.h"
 
 template void run_batched_backward_causalmask_bias_dropout_dispatch<
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_has_attnbias_has_biasgrad_has_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_has_bias_has_biasgrad_has_dropout_maxk_64.cpp
similarity index 88%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_has_attnbias_has_biasgrad_has_dropout_maxk_64.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_has_bias_has_biasgrad_has_dropout_maxk_64.cpp
index cbde5ad7f..0d4a36823 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_has_attnbias_has_biasgrad_has_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_has_bias_has_biasgrad_has_dropout_maxk_64.cpp
@@ -1,11 +1,14 @@
+
 /*
  * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
  */
-#include <ck/ck.hpp>
 
+#include <ck/ck.hpp>
 #include "ck_tiled_fmha_batched_backward.h"
 
 template void run_batched_backward_causalmask_bias_dropout_dispatch<
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_has_attnbias_has_biasgrad_no_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_has_bias_has_biasgrad_no_dropout_maxk_128.cpp
similarity index 88%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_has_attnbias_has_biasgrad_no_dropout_maxk_128.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_has_bias_has_biasgrad_no_dropout_maxk_128.cpp
index b382ff62f..0075f69c4 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_has_attnbias_has_biasgrad_no_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_has_bias_has_biasgrad_no_dropout_maxk_128.cpp
@@ -1,11 +1,14 @@
+
 /*
  * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
  */
-#include <ck/ck.hpp>
 
+#include <ck/ck.hpp>
 #include "ck_tiled_fmha_batched_backward.h"
 
 template void run_batched_backward_causalmask_bias_dropout_dispatch<
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_has_attnbias_has_biasgrad_no_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_has_bias_has_biasgrad_no_dropout_maxk_32.cpp
similarity index 88%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_has_attnbias_has_biasgrad_no_dropout_maxk_32.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_has_bias_has_biasgrad_no_dropout_maxk_32.cpp
index d7b02b3c2..7988f3f3a 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_has_attnbias_has_biasgrad_no_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_has_bias_has_biasgrad_no_dropout_maxk_32.cpp
@@ -1,11 +1,14 @@
+
 /*
  * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
  */
-#include <ck/ck.hpp>
 
+#include <ck/ck.hpp>
 #include "ck_tiled_fmha_batched_backward.h"
 
 template void run_batched_backward_causalmask_bias_dropout_dispatch<
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_has_attnbias_has_biasgrad_no_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_has_bias_has_biasgrad_no_dropout_maxk_64.cpp
similarity index 88%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_has_attnbias_has_biasgrad_no_dropout_maxk_64.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_has_bias_has_biasgrad_no_dropout_maxk_64.cpp
index 490fe4261..a87360605 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_has_attnbias_has_biasgrad_no_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_has_bias_has_biasgrad_no_dropout_maxk_64.cpp
@@ -1,11 +1,14 @@
+
 /*
  * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
  */
-#include <ck/ck.hpp>
 
+#include <ck/ck.hpp>
 #include "ck_tiled_fmha_batched_backward.h"
 
 template void run_batched_backward_causalmask_bias_dropout_dispatch<
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_has_attnbias_no_biasgrad_has_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_has_bias_no_biasgrad_has_dropout_maxk_128.cpp
similarity index 88%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_has_attnbias_no_biasgrad_has_dropout_maxk_128.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_has_bias_no_biasgrad_has_dropout_maxk_128.cpp
index 9b50b4648..2dd378e56 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_has_attnbias_no_biasgrad_has_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_has_bias_no_biasgrad_has_dropout_maxk_128.cpp
@@ -1,11 +1,14 @@
+
 /*
  * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
  */
-#include <ck/ck.hpp>
 
+#include <ck/ck.hpp>
 #include "ck_tiled_fmha_batched_backward.h"
 
 template void run_batched_backward_causalmask_bias_dropout_dispatch<
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_has_attnbias_no_biasgrad_has_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_has_bias_no_biasgrad_has_dropout_maxk_32.cpp
similarity index 88%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_has_attnbias_no_biasgrad_has_dropout_maxk_32.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_has_bias_no_biasgrad_has_dropout_maxk_32.cpp
index acce3f824..5882f0f74 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_has_attnbias_no_biasgrad_has_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_has_bias_no_biasgrad_has_dropout_maxk_32.cpp
@@ -1,11 +1,14 @@
+
 /*
  * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
  */
-#include <ck/ck.hpp>
 
+#include <ck/ck.hpp>
 #include "ck_tiled_fmha_batched_backward.h"
 
 template void run_batched_backward_causalmask_bias_dropout_dispatch<
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_has_attnbias_no_biasgrad_has_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_has_bias_no_biasgrad_has_dropout_maxk_64.cpp
similarity index 88%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_has_attnbias_no_biasgrad_has_dropout_maxk_64.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_has_bias_no_biasgrad_has_dropout_maxk_64.cpp
index bf3c4e2bb..4e8f74579 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_has_attnbias_no_biasgrad_has_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_has_bias_no_biasgrad_has_dropout_maxk_64.cpp
@@ -1,11 +1,14 @@
+
 /*
  * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
  */
-#include <ck/ck.hpp>
 
+#include <ck/ck.hpp>
 #include "ck_tiled_fmha_batched_backward.h"
 
 template void run_batched_backward_causalmask_bias_dropout_dispatch<
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_has_attnbias_no_biasgrad_has_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_has_bias_no_biasgrad_no_dropout_maxk_128.cpp
similarity index 88%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_has_attnbias_no_biasgrad_has_dropout_maxk_128.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_has_bias_no_biasgrad_no_dropout_maxk_128.cpp
index 878d2b968..56f4ef231 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_has_attnbias_no_biasgrad_has_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_has_bias_no_biasgrad_no_dropout_maxk_128.cpp
@@ -1,17 +1,20 @@
+
 /*
  * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
  */
-#include <ck/ck.hpp>
 
+#include <ck/ck.hpp>
 #include "ck_tiled_fmha_batched_backward.h"
 
 template void run_batched_backward_causalmask_bias_dropout_dispatch<
     ck::half_t,
-    false,
     true,
-    false,
     true,
+    false,
+    false,
     128>(BatchedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_has_attnbias_no_biasgrad_has_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_has_bias_no_biasgrad_no_dropout_maxk_32.cpp
similarity index 88%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_has_attnbias_no_biasgrad_has_dropout_maxk_32.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_has_bias_no_biasgrad_no_dropout_maxk_32.cpp
index 5dea3b92d..3fe231753 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_has_attnbias_no_biasgrad_has_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_has_bias_no_biasgrad_no_dropout_maxk_32.cpp
@@ -1,17 +1,20 @@
+
 /*
  * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
  */
-#include <ck/ck.hpp>
 
+#include <ck/ck.hpp>
 #include "ck_tiled_fmha_batched_backward.h"
 
 template void run_batched_backward_causalmask_bias_dropout_dispatch<
     ck::half_t,
-    false,
     true,
-    false,
     true,
+    false,
+    false,
     32>(BatchedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_has_attnbias_no_biasgrad_has_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_has_bias_no_biasgrad_no_dropout_maxk_64.cpp
similarity index 88%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_has_attnbias_no_biasgrad_has_dropout_maxk_64.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_has_bias_no_biasgrad_no_dropout_maxk_64.cpp
index 614dc4af5..ea591609a 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_has_attnbias_no_biasgrad_has_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_has_bias_no_biasgrad_no_dropout_maxk_64.cpp
@@ -1,17 +1,20 @@
+
 /*
  * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
  */
-#include <ck/ck.hpp>
 
+#include <ck/ck.hpp>
 #include "ck_tiled_fmha_batched_backward.h"
 
 template void run_batched_backward_causalmask_bias_dropout_dispatch<
     ck::half_t,
-    false,
     true,
-    false,
     true,
+    false,
+    false,
     64>(BatchedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_has_attnbias_has_biasgrad_no_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_no_bias_no_biasgrad_has_dropout_maxk_128.cpp
similarity index 88%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_has_attnbias_has_biasgrad_no_dropout_maxk_128.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_no_bias_no_biasgrad_has_dropout_maxk_128.cpp
index 4fc4e8bbd..465e3974e 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_has_attnbias_has_biasgrad_no_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_no_bias_no_biasgrad_has_dropout_maxk_128.cpp
@@ -1,17 +1,20 @@
+
 /*
  * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
  */
-#include <ck/ck.hpp>
 
+#include <ck/ck.hpp>
 #include "ck_tiled_fmha_batched_backward.h"
 
 template void run_batched_backward_causalmask_bias_dropout_dispatch<
     ck::half_t,
-    false,
-    true,
     true,
     false,
+    false,
+    true,
     128>(BatchedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_has_attnbias_has_biasgrad_no_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_no_bias_no_biasgrad_has_dropout_maxk_32.cpp
similarity index 88%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_has_attnbias_has_biasgrad_no_dropout_maxk_32.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_no_bias_no_biasgrad_has_dropout_maxk_32.cpp
index 82ec79aca..cf441573a 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_has_attnbias_has_biasgrad_no_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_no_bias_no_biasgrad_has_dropout_maxk_32.cpp
@@ -1,17 +1,20 @@
+
 /*
  * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
  */
-#include <ck/ck.hpp>
 
+#include <ck/ck.hpp>
 #include "ck_tiled_fmha_batched_backward.h"
 
 template void run_batched_backward_causalmask_bias_dropout_dispatch<
     ck::half_t,
-    false,
-    true,
     true,
     false,
+    false,
+    true,
     32>(BatchedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_has_attnbias_has_biasgrad_no_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_no_bias_no_biasgrad_has_dropout_maxk_64.cpp
similarity index 88%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_has_attnbias_has_biasgrad_no_dropout_maxk_64.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_no_bias_no_biasgrad_has_dropout_maxk_64.cpp
index 2d9fb867a..5bca9b8ae 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_has_attnbias_has_biasgrad_no_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_no_bias_no_biasgrad_has_dropout_maxk_64.cpp
@@ -1,17 +1,20 @@
+
 /*
  * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
  */
-#include <ck/ck.hpp>
 
+#include <ck/ck.hpp>
 #include "ck_tiled_fmha_batched_backward.h"
 
 template void run_batched_backward_causalmask_bias_dropout_dispatch<
     ck::half_t,
-    false,
-    true,
     true,
     false,
+    false,
+    true,
     64>(BatchedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_has_attnbias_no_biasgrad_no_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_no_bias_no_biasgrad_no_dropout_maxk_128.cpp
similarity index 89%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_has_attnbias_no_biasgrad_no_dropout_maxk_128.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_no_bias_no_biasgrad_no_dropout_maxk_128.cpp
index fae40a708..6312622ff 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_has_attnbias_no_biasgrad_no_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_no_bias_no_biasgrad_no_dropout_maxk_128.cpp
@@ -1,17 +1,20 @@
+
 /*
  * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
  */
-#include <ck/ck.hpp>
 
+#include <ck/ck.hpp>
 #include "ck_tiled_fmha_batched_backward.h"
 
 template void run_batched_backward_causalmask_bias_dropout_dispatch<
     ck::half_t,
-    false,
     true,
     false,
     false,
+    false,
     128>(BatchedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_has_attnbias_no_biasgrad_no_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_no_bias_no_biasgrad_no_dropout_maxk_32.cpp
similarity index 88%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_has_attnbias_no_biasgrad_no_dropout_maxk_32.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_no_bias_no_biasgrad_no_dropout_maxk_32.cpp
index 1bee92536..dc425e9db 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_has_attnbias_no_biasgrad_no_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_no_bias_no_biasgrad_no_dropout_maxk_32.cpp
@@ -1,17 +1,20 @@
+
 /*
  * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
  */
-#include <ck/ck.hpp>
 
+#include <ck/ck.hpp>
 #include "ck_tiled_fmha_batched_backward.h"
 
 template void run_batched_backward_causalmask_bias_dropout_dispatch<
     ck::half_t,
-    false,
     true,
     false,
     false,
+    false,
     32>(BatchedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_has_attnbias_no_biasgrad_no_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_no_bias_no_biasgrad_no_dropout_maxk_64.cpp
similarity index 88%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_has_attnbias_no_biasgrad_no_dropout_maxk_64.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_no_bias_no_biasgrad_no_dropout_maxk_64.cpp
index fe583539d..3fbea87ee 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_has_attnbias_no_biasgrad_no_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_no_bias_no_biasgrad_no_dropout_maxk_64.cpp
@@ -1,17 +1,20 @@
+
 /*
  * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
  */
-#include <ck/ck.hpp>
 
+#include <ck/ck.hpp>
 #include "ck_tiled_fmha_batched_backward.h"
 
 template void run_batched_backward_causalmask_bias_dropout_dispatch<
     ck::half_t,
-    false,
     true,
     false,
     false,
+    false,
     64>(BatchedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_has_attnbias_has_biasgrad_has_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_has_bias_has_biasgrad_has_dropout_maxk_128.cpp
similarity index 88%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_has_attnbias_has_biasgrad_has_dropout_maxk_128.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_has_bias_has_biasgrad_has_dropout_maxk_128.cpp
index 54b193591..ce9e7d257 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_has_attnbias_has_biasgrad_has_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_has_bias_has_biasgrad_has_dropout_maxk_128.cpp
@@ -1,11 +1,14 @@
+
 /*
  * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
  */
-#include <ck/ck.hpp>
 
+#include <ck/ck.hpp>
 #include "ck_tiled_fmha_batched_backward.h"
 
 template void run_batched_backward_causalmask_bias_dropout_dispatch<
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_has_attnbias_has_biasgrad_has_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_has_bias_has_biasgrad_has_dropout_maxk_32.cpp
similarity index 88%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_has_attnbias_has_biasgrad_has_dropout_maxk_32.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_has_bias_has_biasgrad_has_dropout_maxk_32.cpp
index acc06d663..f93820dbb 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_has_attnbias_has_biasgrad_has_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_has_bias_has_biasgrad_has_dropout_maxk_32.cpp
@@ -1,11 +1,14 @@
+
 /*
  * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
  */
-#include <ck/ck.hpp>
 
+#include <ck/ck.hpp>
 #include "ck_tiled_fmha_batched_backward.h"
 
 template void run_batched_backward_causalmask_bias_dropout_dispatch<
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_has_attnbias_has_biasgrad_has_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_has_bias_has_biasgrad_has_dropout_maxk_64.cpp
similarity index 88%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_has_attnbias_has_biasgrad_has_dropout_maxk_64.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_has_bias_has_biasgrad_has_dropout_maxk_64.cpp
index 349ef3190..07dabfa5f 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_has_attnbias_has_biasgrad_has_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_has_bias_has_biasgrad_has_dropout_maxk_64.cpp
@@ -1,11 +1,14 @@
+
 /*
  * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
  */
-#include <ck/ck.hpp>
 
+#include <ck/ck.hpp>
 #include "ck_tiled_fmha_batched_backward.h"
 
 template void run_batched_backward_causalmask_bias_dropout_dispatch<
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_has_attnbias_no_biasgrad_no_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_has_bias_has_biasgrad_no_dropout_maxk_128.cpp
similarity index 88%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_has_attnbias_no_biasgrad_no_dropout_maxk_128.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_has_bias_has_biasgrad_no_dropout_maxk_128.cpp
index 1dc265944..852b0339d 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_has_attnbias_no_biasgrad_no_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_has_bias_has_biasgrad_no_dropout_maxk_128.cpp
@@ -1,17 +1,20 @@
+
 /*
  * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
  */
-#include <ck/ck.hpp>
 
+#include <ck/ck.hpp>
 #include "ck_tiled_fmha_batched_backward.h"
 
 template void run_batched_backward_causalmask_bias_dropout_dispatch<
     ck::half_t,
+    false,
     true,
     true,
     false,
-    false,
     128>(BatchedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_has_attnbias_no_biasgrad_no_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_has_bias_has_biasgrad_no_dropout_maxk_32.cpp
similarity index 88%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_has_attnbias_no_biasgrad_no_dropout_maxk_32.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_has_bias_has_biasgrad_no_dropout_maxk_32.cpp
index d6c19a81b..4874e14aa 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_has_attnbias_no_biasgrad_no_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_has_bias_has_biasgrad_no_dropout_maxk_32.cpp
@@ -1,17 +1,20 @@
+
 /*
  * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
  */
-#include <ck/ck.hpp>
 
+#include <ck/ck.hpp>
 #include "ck_tiled_fmha_batched_backward.h"
 
 template void run_batched_backward_causalmask_bias_dropout_dispatch<
     ck::half_t,
+    false,
     true,
     true,
     false,
-    false,
     32>(BatchedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_has_attnbias_no_biasgrad_no_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_has_bias_has_biasgrad_no_dropout_maxk_64.cpp
similarity index 88%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_has_attnbias_no_biasgrad_no_dropout_maxk_64.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_has_bias_has_biasgrad_no_dropout_maxk_64.cpp
index 290b1c60d..0036596a5 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_has_attnbias_no_biasgrad_no_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_has_bias_has_biasgrad_no_dropout_maxk_64.cpp
@@ -1,17 +1,20 @@
+
 /*
  * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
  */
-#include <ck/ck.hpp>
 
+#include <ck/ck.hpp>
 #include "ck_tiled_fmha_batched_backward.h"
 
 template void run_batched_backward_causalmask_bias_dropout_dispatch<
     ck::half_t,
+    false,
     true,
     true,
     false,
-    false,
     64>(BatchedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_no_attnbias_has_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_has_bias_no_biasgrad_has_dropout_maxk_128.cpp
similarity index 88%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_no_attnbias_has_dropout_maxk_128.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_has_bias_no_biasgrad_has_dropout_maxk_128.cpp
index f97b3829d..eea9ea776 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_no_attnbias_has_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_has_bias_no_biasgrad_has_dropout_maxk_128.cpp
@@ -1,17 +1,20 @@
+
 /*
  * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
  */
-#include <ck/ck.hpp>
 
+#include <ck/ck.hpp>
 #include "ck_tiled_fmha_batched_backward.h"
 
 template void run_batched_backward_causalmask_bias_dropout_dispatch<
     ck::half_t,
-    true,
     false,
+    true,
     false,
     true,
     128>(BatchedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_no_attnbias_has_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_has_bias_no_biasgrad_has_dropout_maxk_32.cpp
similarity index 88%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_no_attnbias_has_dropout_maxk_32.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_has_bias_no_biasgrad_has_dropout_maxk_32.cpp
index 42a2945dd..070ddddd6 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_no_attnbias_has_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_has_bias_no_biasgrad_has_dropout_maxk_32.cpp
@@ -1,17 +1,20 @@
+
 /*
  * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
  */
-#include <ck/ck.hpp>
 
+#include <ck/ck.hpp>
 #include "ck_tiled_fmha_batched_backward.h"
 
 template void run_batched_backward_causalmask_bias_dropout_dispatch<
     ck::half_t,
-    true,
     false,
+    true,
     false,
     true,
     32>(BatchedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_no_attnbias_has_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_has_bias_no_biasgrad_has_dropout_maxk_64.cpp
similarity index 88%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_no_attnbias_has_dropout_maxk_64.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_has_bias_no_biasgrad_has_dropout_maxk_64.cpp
index dd60fbab5..ad72c8f1a 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_no_attnbias_has_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_has_bias_no_biasgrad_has_dropout_maxk_64.cpp
@@ -1,17 +1,20 @@
+
 /*
  * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
  */
-#include <ck/ck.hpp>
 
+#include <ck/ck.hpp>
 #include "ck_tiled_fmha_batched_backward.h"
 
 template void run_batched_backward_causalmask_bias_dropout_dispatch<
     ck::half_t,
-    true,
     false,
+    true,
     false,
     true,
     64>(BatchedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_no_attnbias_no_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_has_bias_no_biasgrad_no_dropout_maxk_128.cpp
similarity index 89%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_no_attnbias_no_dropout_maxk_128.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_has_bias_no_biasgrad_no_dropout_maxk_128.cpp
index dc07dbddf..99a3acd4f 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_no_attnbias_no_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_has_bias_no_biasgrad_no_dropout_maxk_128.cpp
@@ -1,17 +1,20 @@
+
 /*
  * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
  */
-#include <ck/ck.hpp>
 
+#include <ck/ck.hpp>
 #include "ck_tiled_fmha_batched_backward.h"
 
 template void run_batched_backward_causalmask_bias_dropout_dispatch<
     ck::half_t,
-    true,
     false,
+    true,
     false,
     false,
     128>(BatchedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_no_attnbias_no_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_has_bias_no_biasgrad_no_dropout_maxk_32.cpp
similarity index 88%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_no_attnbias_no_dropout_maxk_32.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_has_bias_no_biasgrad_no_dropout_maxk_32.cpp
index 0800dd7ca..89e517e75 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_no_attnbias_no_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_has_bias_no_biasgrad_no_dropout_maxk_32.cpp
@@ -1,17 +1,20 @@
+
 /*
  * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
  */
-#include <ck/ck.hpp>
 
+#include <ck/ck.hpp>
 #include "ck_tiled_fmha_batched_backward.h"
 
 template void run_batched_backward_causalmask_bias_dropout_dispatch<
     ck::half_t,
-    true,
     false,
+    true,
     false,
     false,
     32>(BatchedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_no_attnbias_no_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_has_bias_no_biasgrad_no_dropout_maxk_64.cpp
similarity index 88%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_no_attnbias_no_dropout_maxk_64.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_has_bias_no_biasgrad_no_dropout_maxk_64.cpp
index d0ea35d54..9120025dd 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_no_attnbias_no_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_has_bias_no_biasgrad_no_dropout_maxk_64.cpp
@@ -1,17 +1,20 @@
+
 /*
  * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
  */
-#include <ck/ck.hpp>
 
+#include <ck/ck.hpp>
 #include "ck_tiled_fmha_batched_backward.h"
 
 template void run_batched_backward_causalmask_bias_dropout_dispatch<
     ck::half_t,
-    true,
     false,
+    true,
     false,
     false,
     64>(BatchedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_no_attnbias_has_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_no_bias_no_biasgrad_has_dropout_maxk_128.cpp
similarity index 89%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_no_attnbias_has_dropout_maxk_128.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_no_bias_no_biasgrad_has_dropout_maxk_128.cpp
index 0da1f95e4..419a240bd 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_no_attnbias_has_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_no_bias_no_biasgrad_has_dropout_maxk_128.cpp
@@ -1,11 +1,14 @@
+
 /*
  * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
  */
-#include <ck/ck.hpp>
 
+#include <ck/ck.hpp>
 #include "ck_tiled_fmha_batched_backward.h"
 
 template void run_batched_backward_causalmask_bias_dropout_dispatch<
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_no_attnbias_has_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_no_bias_no_biasgrad_has_dropout_maxk_32.cpp
similarity index 88%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_no_attnbias_has_dropout_maxk_32.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_no_bias_no_biasgrad_has_dropout_maxk_32.cpp
index 01c850509..d9d4eaba9 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_no_attnbias_has_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_no_bias_no_biasgrad_has_dropout_maxk_32.cpp
@@ -1,11 +1,14 @@
+
 /*
  * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
  */
-#include <ck/ck.hpp>
 
+#include <ck/ck.hpp>
 #include "ck_tiled_fmha_batched_backward.h"
 
 template void run_batched_backward_causalmask_bias_dropout_dispatch<
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_no_attnbias_has_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_no_bias_no_biasgrad_has_dropout_maxk_64.cpp
similarity index 88%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_no_attnbias_has_dropout_maxk_64.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_no_bias_no_biasgrad_has_dropout_maxk_64.cpp
index b85f2ac56..a1bcfbd2b 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_no_attnbias_has_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_no_bias_no_biasgrad_has_dropout_maxk_64.cpp
@@ -1,11 +1,14 @@
+
 /*
  * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
  */
-#include <ck/ck.hpp>
 
+#include <ck/ck.hpp>
 #include "ck_tiled_fmha_batched_backward.h"
 
 template void run_batched_backward_causalmask_bias_dropout_dispatch<
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_no_attnbias_no_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_no_bias_no_biasgrad_no_dropout_maxk_128.cpp
similarity index 89%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_no_attnbias_no_dropout_maxk_128.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_no_bias_no_biasgrad_no_dropout_maxk_128.cpp
index dd77dc88c..d86f207d9 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_no_attnbias_no_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_no_bias_no_biasgrad_no_dropout_maxk_128.cpp
@@ -1,11 +1,14 @@
+
 /*
  * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
  */
-#include <ck/ck.hpp>
 
+#include <ck/ck.hpp>
 #include "ck_tiled_fmha_batched_backward.h"
 
 template void run_batched_backward_causalmask_bias_dropout_dispatch<
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_no_attnbias_no_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_no_bias_no_biasgrad_no_dropout_maxk_32.cpp
similarity index 89%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_no_attnbias_no_dropout_maxk_32.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_no_bias_no_biasgrad_no_dropout_maxk_32.cpp
index 30fc3c1dd..2fa1e6493 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_no_attnbias_no_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_no_bias_no_biasgrad_no_dropout_maxk_32.cpp
@@ -1,11 +1,14 @@
+
 /*
  * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
  */
-#include <ck/ck.hpp>
 
+#include <ck/ck.hpp>
 #include "ck_tiled_fmha_batched_backward.h"
 
 template void run_batched_backward_causalmask_bias_dropout_dispatch<
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_no_attnbias_no_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_no_bias_no_biasgrad_no_dropout_maxk_64.cpp
similarity index 89%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_no_attnbias_no_dropout_maxk_64.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_no_bias_no_biasgrad_no_dropout_maxk_64.cpp
index e6184baf5..2b9e3daef 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_no_attnbias_no_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_no_bias_no_biasgrad_no_dropout_maxk_64.cpp
@@ -1,11 +1,14 @@
+
 /*
  * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
  */
-#include <ck/ck.hpp>
 
+#include <ck/ck.hpp>
 #include "ck_tiled_fmha_batched_backward.h"
 
 template void run_batched_backward_causalmask_bias_dropout_dispatch<
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_has_causalmask_has_attnbias_has_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_has_causalmask_has_bias_has_dropout_maxk_128.cpp
similarity index 88%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_has_causalmask_has_attnbias_has_dropout_maxk_128.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_has_causalmask_has_bias_has_dropout_maxk_128.cpp
index dbf8459d2..2237719c1 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_has_causalmask_has_attnbias_has_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_has_causalmask_has_bias_has_dropout_maxk_128.cpp
@@ -1,11 +1,14 @@
+
 /*
  * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
  */
-#include <ck/ck.hpp>
 
+#include <ck/ck.hpp>
 #include "ck_tiled_fmha_batched_forward.h"
 
 template void run_batched_forward_causalmask_bias_dropout_dispatch<
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_has_causalmask_has_attnbias_has_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_has_causalmask_has_bias_has_dropout_maxk_256.cpp
similarity index 88%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_has_causalmask_has_attnbias_has_dropout_maxk_256.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_has_causalmask_has_bias_has_dropout_maxk_256.cpp
index 0bc2865fc..24b717342 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_has_causalmask_has_attnbias_has_dropout_maxk_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_has_causalmask_has_bias_has_dropout_maxk_256.cpp
@@ -1,11 +1,14 @@
+
 /*
  * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
  */
-#include <ck/ck.hpp>
 
+#include <ck/ck.hpp>
 #include "ck_tiled_fmha_batched_forward.h"
 
 template void run_batched_forward_causalmask_bias_dropout_dispatch<
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_has_causalmask_has_attnbias_has_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_has_causalmask_has_bias_has_dropout_maxk_32.cpp
similarity index 88%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_has_causalmask_has_attnbias_has_dropout_maxk_32.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_has_causalmask_has_bias_has_dropout_maxk_32.cpp
index 9390f08a4..d9333c0dc 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_has_causalmask_has_attnbias_has_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_has_causalmask_has_bias_has_dropout_maxk_32.cpp
@@ -1,11 +1,14 @@
+
 /*
  * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
  */
-#include <ck/ck.hpp>
 
+#include <ck/ck.hpp>
 #include "ck_tiled_fmha_batched_forward.h"
 
 template void run_batched_forward_causalmask_bias_dropout_dispatch<
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_has_causalmask_has_attnbias_has_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_has_causalmask_has_bias_has_dropout_maxk_64.cpp
similarity index 88%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_has_causalmask_has_attnbias_has_dropout_maxk_64.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_has_causalmask_has_bias_has_dropout_maxk_64.cpp
index dea796009..2fbb4d47c 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_has_causalmask_has_attnbias_has_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_has_causalmask_has_bias_has_dropout_maxk_64.cpp
@@ -1,11 +1,14 @@
+
 /*
  * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
  */
-#include <ck/ck.hpp>
 
+#include <ck/ck.hpp>
 #include "ck_tiled_fmha_batched_forward.h"
 
 template void run_batched_forward_causalmask_bias_dropout_dispatch<
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_no_causalmask_has_attnbias_has_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_has_causalmask_has_bias_no_dropout_maxk_128.cpp
similarity index 88%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_no_causalmask_has_attnbias_has_dropout_maxk_128.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_has_causalmask_has_bias_no_dropout_maxk_128.cpp
index c2a2db586..5b609eb20 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_no_causalmask_has_attnbias_has_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_has_causalmask_has_bias_no_dropout_maxk_128.cpp
@@ -1,16 +1,19 @@
+
 /*
  * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
  */
-#include <ck/ck.hpp>
 
+#include <ck/ck.hpp>
 #include "ck_tiled_fmha_batched_forward.h"
 
 template void run_batched_forward_causalmask_bias_dropout_dispatch<
     ck::bhalf_t,
-    false,
     true,
     true,
+    false,
     128>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_has_causalmask_no_attnbias_has_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_has_causalmask_has_bias_no_dropout_maxk_256.cpp
similarity index 88%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_has_causalmask_no_attnbias_has_dropout_maxk_256.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_has_causalmask_has_bias_no_dropout_maxk_256.cpp
index 0c4156faf..6d08b4bb7 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_has_causalmask_no_attnbias_has_dropout_maxk_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_has_causalmask_has_bias_no_dropout_maxk_256.cpp
@@ -1,16 +1,19 @@
+
 /*
  * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
  */
-#include <ck/ck.hpp>
 
+#include <ck/ck.hpp>
 #include "ck_tiled_fmha_batched_forward.h"
 
 template void run_batched_forward_causalmask_bias_dropout_dispatch<
     ck::bhalf_t,
     true,
-    false,
     true,
+    false,
     256>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_has_causalmask_no_attnbias_has_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_has_causalmask_has_bias_no_dropout_maxk_32.cpp
similarity index 88%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_has_causalmask_no_attnbias_has_dropout_maxk_32.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_has_causalmask_has_bias_no_dropout_maxk_32.cpp
index dfd127839..6daa3edac 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_has_causalmask_no_attnbias_has_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_has_causalmask_has_bias_no_dropout_maxk_32.cpp
@@ -1,16 +1,19 @@
+
 /*
  * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
  */
-#include <ck/ck.hpp>
 
+#include <ck/ck.hpp>
 #include "ck_tiled_fmha_batched_forward.h"
 
 template void run_batched_forward_causalmask_bias_dropout_dispatch<
     ck::bhalf_t,
     true,
-    false,
     true,
+    false,
     32>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_has_causalmask_has_attnbias_no_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_has_causalmask_has_bias_no_dropout_maxk_64.cpp
similarity index 88%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_has_causalmask_has_attnbias_no_dropout_maxk_64.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_has_causalmask_has_bias_no_dropout_maxk_64.cpp
index 95731a02e..728b653c6 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_has_causalmask_has_attnbias_no_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_has_causalmask_has_bias_no_dropout_maxk_64.cpp
@@ -1,11 +1,14 @@
+
 /*
  * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
  */
-#include <ck/ck.hpp>
 
+#include <ck/ck.hpp>
 #include "ck_tiled_fmha_batched_forward.h"
 
 template void run_batched_forward_causalmask_bias_dropout_dispatch<
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_has_causalmask_has_attnbias_no_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_has_causalmask_no_bias_has_dropout_maxk_128.cpp
similarity index 88%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_has_causalmask_has_attnbias_no_dropout_maxk_128.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_has_causalmask_no_bias_has_dropout_maxk_128.cpp
index 18ace4cc5..6af1255c3 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_has_causalmask_has_attnbias_no_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_has_causalmask_no_bias_has_dropout_maxk_128.cpp
@@ -1,16 +1,19 @@
+
 /*
  * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
  */
-#include <ck/ck.hpp>
 
+#include <ck/ck.hpp>
 #include "ck_tiled_fmha_batched_forward.h"
 
 template void run_batched_forward_causalmask_bias_dropout_dispatch<
     ck::bhalf_t,
     true,
-    true,
     false,
+    true,
     128>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_no_causalmask_has_attnbias_has_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_has_causalmask_no_bias_has_dropout_maxk_256.cpp
similarity index 88%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_no_causalmask_has_attnbias_has_dropout_maxk_256.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_has_causalmask_no_bias_has_dropout_maxk_256.cpp
index bc20e97bd..66c4450b6 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_no_causalmask_has_attnbias_has_dropout_maxk_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_has_causalmask_no_bias_has_dropout_maxk_256.cpp
@@ -1,16 +1,19 @@
+
 /*
  * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
  */
-#include <ck/ck.hpp>
 
+#include <ck/ck.hpp>
 #include "ck_tiled_fmha_batched_forward.h"
 
 template void run_batched_forward_causalmask_bias_dropout_dispatch<
     ck::bhalf_t,
-    false,
     true,
+    false,
     true,
     256>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_no_causalmask_has_attnbias_has_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_has_causalmask_no_bias_has_dropout_maxk_32.cpp
similarity index 88%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_no_causalmask_has_attnbias_has_dropout_maxk_32.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_has_causalmask_no_bias_has_dropout_maxk_32.cpp
index d6709f88e..8d6bc812f 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_no_causalmask_has_attnbias_has_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_has_causalmask_no_bias_has_dropout_maxk_32.cpp
@@ -1,16 +1,19 @@
+
 /*
  * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
  */
-#include <ck/ck.hpp>
 
+#include <ck/ck.hpp>
 #include "ck_tiled_fmha_batched_forward.h"
 
 template void run_batched_forward_causalmask_bias_dropout_dispatch<
     ck::bhalf_t,
-    false,
     true,
+    false,
     true,
     32>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_has_causalmask_no_attnbias_has_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_has_causalmask_no_bias_has_dropout_maxk_64.cpp
similarity index 88%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_has_causalmask_no_attnbias_has_dropout_maxk_64.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_has_causalmask_no_bias_has_dropout_maxk_64.cpp
index 3b52555be..cd43accf2 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_has_causalmask_no_attnbias_has_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_has_causalmask_no_bias_has_dropout_maxk_64.cpp
@@ -1,11 +1,14 @@
+
 /*
  * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
  */
-#include <ck/ck.hpp>
 
+#include <ck/ck.hpp>
 #include "ck_tiled_fmha_batched_forward.h"
 
 template void run_batched_forward_causalmask_bias_dropout_dispatch<
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_no_causalmask_has_attnbias_no_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_has_causalmask_no_bias_no_dropout_maxk_128.cpp
similarity index 88%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_no_causalmask_has_attnbias_no_dropout_maxk_128.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_has_causalmask_no_bias_no_dropout_maxk_128.cpp
index a4ca78d9e..8d3003cde 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_no_causalmask_has_attnbias_no_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_has_causalmask_no_bias_no_dropout_maxk_128.cpp
@@ -1,16 +1,19 @@
+
 /*
  * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
  */
-#include <ck/ck.hpp>
 
+#include <ck/ck.hpp>
 #include "ck_tiled_fmha_batched_forward.h"
 
 template void run_batched_forward_causalmask_bias_dropout_dispatch<
     ck::bhalf_t,
-    false,
     true,
     false,
+    false,
     128>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_no_causalmask_has_attnbias_no_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_has_causalmask_no_bias_no_dropout_maxk_256.cpp
similarity index 88%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_no_causalmask_has_attnbias_no_dropout_maxk_256.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_has_causalmask_no_bias_no_dropout_maxk_256.cpp
index e515cfbb5..f28877eeb 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_no_causalmask_has_attnbias_no_dropout_maxk_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_has_causalmask_no_bias_no_dropout_maxk_256.cpp
@@ -1,16 +1,19 @@
+
 /*
  * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
  */
-#include <ck/ck.hpp>
 
+#include <ck/ck.hpp>
 #include "ck_tiled_fmha_batched_forward.h"
 
 template void run_batched_forward_causalmask_bias_dropout_dispatch<
     ck::bhalf_t,
-    false,
     true,
     false,
+    false,
     256>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_no_causalmask_has_attnbias_no_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_has_causalmask_no_bias_no_dropout_maxk_32.cpp
similarity index 88%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_no_causalmask_has_attnbias_no_dropout_maxk_32.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_has_causalmask_no_bias_no_dropout_maxk_32.cpp
index 7f573e21e..49108e76d 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_no_causalmask_has_attnbias_no_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_has_causalmask_no_bias_no_dropout_maxk_32.cpp
@@ -1,16 +1,19 @@
+
 /*
  * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
  */
-#include <ck/ck.hpp>
 
+#include <ck/ck.hpp>
 #include "ck_tiled_fmha_batched_forward.h"
 
 template void run_batched_forward_causalmask_bias_dropout_dispatch<
     ck::bhalf_t,
-    false,
     true,
     false,
+    false,
     32>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_no_causalmask_has_attnbias_no_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_has_causalmask_no_bias_no_dropout_maxk_64.cpp
similarity index 88%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_no_causalmask_has_attnbias_no_dropout_maxk_64.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_has_causalmask_no_bias_no_dropout_maxk_64.cpp
index 6980a4141..ffc65eed8 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_no_causalmask_has_attnbias_no_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_has_causalmask_no_bias_no_dropout_maxk_64.cpp
@@ -1,16 +1,19 @@
+
 /*
  * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
  */
-#include <ck/ck.hpp>
 
+#include <ck/ck.hpp>
 #include "ck_tiled_fmha_batched_forward.h"
 
 template void run_batched_forward_causalmask_bias_dropout_dispatch<
     ck::bhalf_t,
-    false,
     true,
     false,
+    false,
     64>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_has_causalmask_no_attnbias_has_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_no_causalmask_has_bias_has_dropout_maxk_128.cpp
similarity index 88%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_has_causalmask_no_attnbias_has_dropout_maxk_128.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_no_causalmask_has_bias_has_dropout_maxk_128.cpp
index 3c274c3d6..1d79adfb8 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_has_causalmask_no_attnbias_has_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_no_causalmask_has_bias_has_dropout_maxk_128.cpp
@@ -1,16 +1,19 @@
+
 /*
  * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
  */
-#include <ck/ck.hpp>
 
+#include <ck/ck.hpp>
 #include "ck_tiled_fmha_batched_forward.h"
 
 template void run_batched_forward_causalmask_bias_dropout_dispatch<
     ck::bhalf_t,
-    true,
     false,
     true,
+    true,
     128>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_has_causalmask_has_attnbias_no_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_no_causalmask_has_bias_has_dropout_maxk_256.cpp
similarity index 88%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_has_causalmask_has_attnbias_no_dropout_maxk_256.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_no_causalmask_has_bias_has_dropout_maxk_256.cpp
index 1dc1c67ed..6fe3e9c9a 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_has_causalmask_has_attnbias_no_dropout_maxk_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_no_causalmask_has_bias_has_dropout_maxk_256.cpp
@@ -1,16 +1,19 @@
+
 /*
  * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
  */
-#include <ck/ck.hpp>
 
+#include <ck/ck.hpp>
 #include "ck_tiled_fmha_batched_forward.h"
 
 template void run_batched_forward_causalmask_bias_dropout_dispatch<
     ck::bhalf_t,
+    false,
     true,
     true,
-    false,
     256>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_has_causalmask_has_attnbias_no_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_no_causalmask_has_bias_has_dropout_maxk_32.cpp
similarity index 88%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_has_causalmask_has_attnbias_no_dropout_maxk_32.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_no_causalmask_has_bias_has_dropout_maxk_32.cpp
index 16f51cf1a..90d4de433 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_has_causalmask_has_attnbias_no_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_no_causalmask_has_bias_has_dropout_maxk_32.cpp
@@ -1,16 +1,19 @@
+
 /*
  * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
  */
-#include <ck/ck.hpp>
 
+#include <ck/ck.hpp>
 #include "ck_tiled_fmha_batched_forward.h"
 
 template void run_batched_forward_causalmask_bias_dropout_dispatch<
     ck::bhalf_t,
+    false,
     true,
     true,
-    false,
     32>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_no_causalmask_has_attnbias_has_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_no_causalmask_has_bias_has_dropout_maxk_64.cpp
similarity index 88%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_no_causalmask_has_attnbias_has_dropout_maxk_64.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_no_causalmask_has_bias_has_dropout_maxk_64.cpp
index 95eb46660..2e654d8a1 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_no_causalmask_has_attnbias_has_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_no_causalmask_has_bias_has_dropout_maxk_64.cpp
@@ -1,11 +1,14 @@
+
 /*
  * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
  */
-#include <ck/ck.hpp>
 
+#include <ck/ck.hpp>
 #include "ck_tiled_fmha_batched_forward.h"
 
 template void run_batched_forward_causalmask_bias_dropout_dispatch<
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_no_causalmask_no_attnbias_has_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_no_causalmask_has_bias_no_dropout_maxk_128.cpp
similarity index 88%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_no_causalmask_no_attnbias_has_dropout_maxk_128.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_no_causalmask_has_bias_no_dropout_maxk_128.cpp
index a6784236f..1c620930e 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_no_causalmask_no_attnbias_has_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_no_causalmask_has_bias_no_dropout_maxk_128.cpp
@@ -1,16 +1,19 @@
+
 /*
  * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
  */
-#include <ck/ck.hpp>
 
+#include <ck/ck.hpp>
 #include "ck_tiled_fmha_batched_forward.h"
 
 template void run_batched_forward_causalmask_bias_dropout_dispatch<
     ck::bhalf_t,
     false,
-    false,
     true,
+    false,
     128>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_no_causalmask_no_attnbias_has_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_no_causalmask_has_bias_no_dropout_maxk_256.cpp
similarity index 88%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_no_causalmask_no_attnbias_has_dropout_maxk_256.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_no_causalmask_has_bias_no_dropout_maxk_256.cpp
index df6c6c72d..5dd149303 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_no_causalmask_no_attnbias_has_dropout_maxk_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_no_causalmask_has_bias_no_dropout_maxk_256.cpp
@@ -1,16 +1,19 @@
+
 /*
  * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
  */
-#include <ck/ck.hpp>
 
+#include <ck/ck.hpp>
 #include "ck_tiled_fmha_batched_forward.h"
 
 template void run_batched_forward_causalmask_bias_dropout_dispatch<
     ck::bhalf_t,
     false,
-    false,
     true,
+    false,
     256>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_has_causalmask_no_attnbias_no_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_no_causalmask_has_bias_no_dropout_maxk_32.cpp
similarity index 88%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_has_causalmask_no_attnbias_no_dropout_maxk_32.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_no_causalmask_has_bias_no_dropout_maxk_32.cpp
index 775c6c1b1..32c7ea50f 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_has_causalmask_no_attnbias_no_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_no_causalmask_has_bias_no_dropout_maxk_32.cpp
@@ -1,16 +1,19 @@
+
 /*
  * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
  */
-#include <ck/ck.hpp>
 
+#include <ck/ck.hpp>
 #include "ck_tiled_fmha_batched_forward.h"
 
 template void run_batched_forward_causalmask_bias_dropout_dispatch<
     ck::bhalf_t,
-    true,
     false,
+    true,
     false,
     32>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_no_causalmask_no_attnbias_has_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_no_causalmask_has_bias_no_dropout_maxk_64.cpp
similarity index 88%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_no_causalmask_no_attnbias_has_dropout_maxk_64.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_no_causalmask_has_bias_no_dropout_maxk_64.cpp
index b2ef9186f..8f41bf550 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_no_causalmask_no_attnbias_has_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_no_causalmask_has_bias_no_dropout_maxk_64.cpp
@@ -1,16 +1,19 @@
+
 /*
  * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
  */
-#include <ck/ck.hpp>
 
+#include <ck/ck.hpp>
 #include "ck_tiled_fmha_batched_forward.h"
 
 template void run_batched_forward_causalmask_bias_dropout_dispatch<
     ck::bhalf_t,
     false,
-    false,
     true,
+    false,
     64>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_has_causalmask_no_attnbias_no_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_no_causalmask_no_bias_has_dropout_maxk_128.cpp
similarity index 88%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_has_causalmask_no_attnbias_no_dropout_maxk_128.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_no_causalmask_no_bias_has_dropout_maxk_128.cpp
index 657a99865..063359755 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_has_causalmask_no_attnbias_no_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_no_causalmask_no_bias_has_dropout_maxk_128.cpp
@@ -1,16 +1,19 @@
+
 /*
  * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
  */
-#include <ck/ck.hpp>
 
+#include <ck/ck.hpp>
 #include "ck_tiled_fmha_batched_forward.h"
 
 template void run_batched_forward_causalmask_bias_dropout_dispatch<
     ck::bhalf_t,
-    true,
     false,
     false,
+    true,
     128>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_has_causalmask_no_attnbias_no_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_no_causalmask_no_bias_has_dropout_maxk_256.cpp
similarity index 88%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_has_causalmask_no_attnbias_no_dropout_maxk_256.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_no_causalmask_no_bias_has_dropout_maxk_256.cpp
index 263d46e27..2a3207554 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_has_causalmask_no_attnbias_no_dropout_maxk_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_no_causalmask_no_bias_has_dropout_maxk_256.cpp
@@ -1,16 +1,19 @@
+
 /*
  * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
  */
-#include <ck/ck.hpp>
 
+#include <ck/ck.hpp>
 #include "ck_tiled_fmha_batched_forward.h"
 
 template void run_batched_forward_causalmask_bias_dropout_dispatch<
     ck::bhalf_t,
-    true,
     false,
     false,
+    true,
     256>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_no_causalmask_no_attnbias_has_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_no_causalmask_no_bias_has_dropout_maxk_32.cpp
similarity index 88%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_no_causalmask_no_attnbias_has_dropout_maxk_32.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_no_causalmask_no_bias_has_dropout_maxk_32.cpp
index 394728af1..3da70de62 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_no_causalmask_no_attnbias_has_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_no_causalmask_no_bias_has_dropout_maxk_32.cpp
@@ -1,11 +1,14 @@
+
 /*
  * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
  */
-#include <ck/ck.hpp>
 
+#include <ck/ck.hpp>
 #include "ck_tiled_fmha_batched_forward.h"
 
 template void run_batched_forward_causalmask_bias_dropout_dispatch<
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_has_causalmask_no_attnbias_no_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_no_causalmask_no_bias_has_dropout_maxk_64.cpp
similarity index 88%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_has_causalmask_no_attnbias_no_dropout_maxk_64.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_no_causalmask_no_bias_has_dropout_maxk_64.cpp
index 4a6a7ee89..4e19f3be9 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_has_causalmask_no_attnbias_no_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_no_causalmask_no_bias_has_dropout_maxk_64.cpp
@@ -1,16 +1,19 @@
+
 /*
  * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
  */
-#include <ck/ck.hpp>
 
+#include <ck/ck.hpp>
 #include "ck_tiled_fmha_batched_forward.h"
 
 template void run_batched_forward_causalmask_bias_dropout_dispatch<
     ck::bhalf_t,
-    true,
     false,
     false,
+    true,
     64>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_no_causalmask_no_attnbias_no_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_no_causalmask_no_bias_no_dropout_maxk_128.cpp
similarity index 88%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_no_causalmask_no_attnbias_no_dropout_maxk_128.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_no_causalmask_no_bias_no_dropout_maxk_128.cpp
index 4abe212c7..4a4f30052 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_no_causalmask_no_attnbias_no_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_no_causalmask_no_bias_no_dropout_maxk_128.cpp
@@ -1,11 +1,14 @@
+
 /*
  * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
  */
-#include <ck/ck.hpp>
 
+#include <ck/ck.hpp>
 #include "ck_tiled_fmha_batched_forward.h"
 
 template void run_batched_forward_causalmask_bias_dropout_dispatch<
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_no_causalmask_no_attnbias_no_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_no_causalmask_no_bias_no_dropout_maxk_256.cpp
similarity index 88%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_no_causalmask_no_attnbias_no_dropout_maxk_256.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_no_causalmask_no_bias_no_dropout_maxk_256.cpp
index bab70f814..436b9099f 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_no_causalmask_no_attnbias_no_dropout_maxk_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_no_causalmask_no_bias_no_dropout_maxk_256.cpp
@@ -1,11 +1,14 @@
+
 /*
  * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
  */
-#include <ck/ck.hpp>
 
+#include <ck/ck.hpp>
 #include "ck_tiled_fmha_batched_forward.h"
 
 template void run_batched_forward_causalmask_bias_dropout_dispatch<
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_no_causalmask_no_attnbias_no_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_no_causalmask_no_bias_no_dropout_maxk_32.cpp
similarity index 88%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_no_causalmask_no_attnbias_no_dropout_maxk_32.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_no_causalmask_no_bias_no_dropout_maxk_32.cpp
index 8b8cc0a16..5ab62c09b 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_no_causalmask_no_attnbias_no_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_no_causalmask_no_bias_no_dropout_maxk_32.cpp
@@ -1,11 +1,14 @@
+
 /*
  * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
  */
-#include <ck/ck.hpp>
 
+#include <ck/ck.hpp>
 #include "ck_tiled_fmha_batched_forward.h"
 
 template void run_batched_forward_causalmask_bias_dropout_dispatch<
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_no_causalmask_no_attnbias_no_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_no_causalmask_no_bias_no_dropout_maxk_64.cpp
similarity index 88%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_no_causalmask_no_attnbias_no_dropout_maxk_64.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_no_causalmask_no_bias_no_dropout_maxk_64.cpp
index c2f4badc4..f1c11f424 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_no_causalmask_no_attnbias_no_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_no_causalmask_no_bias_no_dropout_maxk_64.cpp
@@ -1,11 +1,14 @@
+
 /*
  * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
  */
-#include <ck/ck.hpp>
 
+#include <ck/ck.hpp>
 #include "ck_tiled_fmha_batched_forward.h"
 
 template void run_batched_forward_causalmask_bias_dropout_dispatch<
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_has_causalmask_has_attnbias_has_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_has_causalmask_has_bias_has_dropout_maxk_128.cpp
similarity index 88%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_has_causalmask_has_attnbias_has_dropout_maxk_128.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_has_causalmask_has_bias_has_dropout_maxk_128.cpp
index 249c4f425..db8135481 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_has_causalmask_has_attnbias_has_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_has_causalmask_has_bias_has_dropout_maxk_128.cpp
@@ -1,11 +1,14 @@
+
 /*
  * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
  */
-#include <ck/ck.hpp>
 
+#include <ck/ck.hpp>
 #include "ck_tiled_fmha_batched_forward.h"
 
 template void run_batched_forward_causalmask_bias_dropout_dispatch<
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_has_causalmask_has_attnbias_has_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_has_causalmask_has_bias_has_dropout_maxk_256.cpp
similarity index 88%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_has_causalmask_has_attnbias_has_dropout_maxk_256.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_has_causalmask_has_bias_has_dropout_maxk_256.cpp
index 33ea7c25a..814b9d8ea 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_has_causalmask_has_attnbias_has_dropout_maxk_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_has_causalmask_has_bias_has_dropout_maxk_256.cpp
@@ -1,11 +1,14 @@
+
 /*
  * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
  */
-#include <ck/ck.hpp>
 
+#include <ck/ck.hpp>
 #include "ck_tiled_fmha_batched_forward.h"
 
 template void run_batched_forward_causalmask_bias_dropout_dispatch<
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_has_causalmask_has_attnbias_has_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_has_causalmask_has_bias_has_dropout_maxk_32.cpp
similarity index 88%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_has_causalmask_has_attnbias_has_dropout_maxk_32.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_has_causalmask_has_bias_has_dropout_maxk_32.cpp
index fcc6ac153..6576c4e2d 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_has_causalmask_has_attnbias_has_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_has_causalmask_has_bias_has_dropout_maxk_32.cpp
@@ -1,11 +1,14 @@
+
 /*
  * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
  */
-#include <ck/ck.hpp>
 
+#include <ck/ck.hpp>
 #include "ck_tiled_fmha_batched_forward.h"
 
 template void run_batched_forward_causalmask_bias_dropout_dispatch<
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_has_causalmask_has_attnbias_has_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_has_causalmask_has_bias_has_dropout_maxk_64.cpp
similarity index 88%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_has_causalmask_has_attnbias_has_dropout_maxk_64.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_has_causalmask_has_bias_has_dropout_maxk_64.cpp
index f7547b577..4bf477d19 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_has_causalmask_has_attnbias_has_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_has_causalmask_has_bias_has_dropout_maxk_64.cpp
@@ -1,11 +1,14 @@
+
 /*
  * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
  */
-#include <ck/ck.hpp>
 
+#include <ck/ck.hpp>
 #include "ck_tiled_fmha_batched_forward.h"
 
 template void run_batched_forward_causalmask_bias_dropout_dispatch<
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_has_causalmask_has_attnbias_no_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_has_causalmask_has_bias_no_dropout_maxk_128.cpp
similarity index 88%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_has_causalmask_has_attnbias_no_dropout_maxk_128.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_has_causalmask_has_bias_no_dropout_maxk_128.cpp
index dd28c7c87..310a03420 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_has_causalmask_has_attnbias_no_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_has_causalmask_has_bias_no_dropout_maxk_128.cpp
@@ -1,11 +1,14 @@
+
 /*
  * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
  */
-#include <ck/ck.hpp>
 
+#include <ck/ck.hpp>
 #include "ck_tiled_fmha_batched_forward.h"
 
 template void run_batched_forward_causalmask_bias_dropout_dispatch<
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_has_causalmask_has_attnbias_no_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_has_causalmask_has_bias_no_dropout_maxk_256.cpp
similarity index 88%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_has_causalmask_has_attnbias_no_dropout_maxk_256.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_has_causalmask_has_bias_no_dropout_maxk_256.cpp
index 808d4e710..fda6ea614 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_has_causalmask_has_attnbias_no_dropout_maxk_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_has_causalmask_has_bias_no_dropout_maxk_256.cpp
@@ -1,11 +1,14 @@
+
 /*
  * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
  */
-#include <ck/ck.hpp>
 
+#include <ck/ck.hpp>
 #include "ck_tiled_fmha_batched_forward.h"
 
 template void run_batched_forward_causalmask_bias_dropout_dispatch<
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_has_causalmask_has_attnbias_no_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_has_causalmask_has_bias_no_dropout_maxk_32.cpp
similarity index 88%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_has_causalmask_has_attnbias_no_dropout_maxk_32.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_has_causalmask_has_bias_no_dropout_maxk_32.cpp
index 72c6714a5..121d264a3 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_has_causalmask_has_attnbias_no_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_has_causalmask_has_bias_no_dropout_maxk_32.cpp
@@ -1,11 +1,14 @@
+
 /*
  * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
  */
-#include <ck/ck.hpp>
 
+#include <ck/ck.hpp>
 #include "ck_tiled_fmha_batched_forward.h"
 
 template void run_batched_forward_causalmask_bias_dropout_dispatch<
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_has_causalmask_has_attnbias_no_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_has_causalmask_has_bias_no_dropout_maxk_64.cpp
similarity index 88%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_has_causalmask_has_attnbias_no_dropout_maxk_64.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_has_causalmask_has_bias_no_dropout_maxk_64.cpp
index f0c6d5967..ca98bf25a 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_has_causalmask_has_attnbias_no_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_has_causalmask_has_bias_no_dropout_maxk_64.cpp
@@ -1,11 +1,14 @@
+
 /*
  * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
  */
-#include <ck/ck.hpp>
 
+#include <ck/ck.hpp>
 #include "ck_tiled_fmha_batched_forward.h"
 
 template void run_batched_forward_causalmask_bias_dropout_dispatch<
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_has_causalmask_no_attnbias_has_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_has_causalmask_no_bias_has_dropout_maxk_128.cpp
similarity index 88%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_has_causalmask_no_attnbias_has_dropout_maxk_128.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_has_causalmask_no_bias_has_dropout_maxk_128.cpp
index 5f0d70239..a4881489d 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_has_causalmask_no_attnbias_has_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_has_causalmask_no_bias_has_dropout_maxk_128.cpp
@@ -1,11 +1,14 @@
+
 /*
  * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
  */
-#include <ck/ck.hpp>
 
+#include <ck/ck.hpp>
 #include "ck_tiled_fmha_batched_forward.h"
 
 template void run_batched_forward_causalmask_bias_dropout_dispatch<
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_has_causalmask_no_attnbias_has_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_has_causalmask_no_bias_has_dropout_maxk_256.cpp
similarity index 88%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_has_causalmask_no_attnbias_has_dropout_maxk_256.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_has_causalmask_no_bias_has_dropout_maxk_256.cpp
index 0ac3953bc..7a8d21150 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_has_causalmask_no_attnbias_has_dropout_maxk_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_has_causalmask_no_bias_has_dropout_maxk_256.cpp
@@ -1,11 +1,14 @@
+
 /*
  * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
  */
-#include <ck/ck.hpp>
 
+#include <ck/ck.hpp>
 #include "ck_tiled_fmha_batched_forward.h"
 
 template void run_batched_forward_causalmask_bias_dropout_dispatch<
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_no_causalmask_has_attnbias_has_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_has_causalmask_no_bias_has_dropout_maxk_32.cpp
similarity index 88%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_no_causalmask_has_attnbias_has_dropout_maxk_32.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_has_causalmask_no_bias_has_dropout_maxk_32.cpp
index f40ba4ec3..2d8c78b9e 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_no_causalmask_has_attnbias_has_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_has_causalmask_no_bias_has_dropout_maxk_32.cpp
@@ -1,16 +1,19 @@
+
 /*
  * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
  */
-#include <ck/ck.hpp>
 
+#include <ck/ck.hpp>
 #include "ck_tiled_fmha_batched_forward.h"
 
 template void run_batched_forward_causalmask_bias_dropout_dispatch<
     ck::half_t,
-    false,
     true,
+    false,
     true,
     32>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_has_causalmask_no_attnbias_has_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_has_causalmask_no_bias_has_dropout_maxk_64.cpp
similarity index 88%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_has_causalmask_no_attnbias_has_dropout_maxk_64.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_has_causalmask_no_bias_has_dropout_maxk_64.cpp
index 8ea49cdfd..db9d24e33 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_has_causalmask_no_attnbias_has_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_has_causalmask_no_bias_has_dropout_maxk_64.cpp
@@ -1,11 +1,14 @@
+
 /*
  * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
  */
-#include <ck/ck.hpp>
 
+#include <ck/ck.hpp>
 #include "ck_tiled_fmha_batched_forward.h"
 
 template void run_batched_forward_causalmask_bias_dropout_dispatch<
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_no_causalmask_has_attnbias_no_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_has_causalmask_no_bias_no_dropout_maxk_128.cpp
similarity index 88%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_no_causalmask_has_attnbias_no_dropout_maxk_128.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_has_causalmask_no_bias_no_dropout_maxk_128.cpp
index bd319545a..e917e4574 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_no_causalmask_has_attnbias_no_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_has_causalmask_no_bias_no_dropout_maxk_128.cpp
@@ -1,16 +1,19 @@
+
 /*
  * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
  */
-#include <ck/ck.hpp>
 
+#include <ck/ck.hpp>
 #include "ck_tiled_fmha_batched_forward.h"
 
 template void run_batched_forward_causalmask_bias_dropout_dispatch<
     ck::half_t,
-    false,
     true,
     false,
+    false,
     128>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_no_causalmask_has_attnbias_no_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_has_causalmask_no_bias_no_dropout_maxk_256.cpp
similarity index 88%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_no_causalmask_has_attnbias_no_dropout_maxk_256.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_has_causalmask_no_bias_no_dropout_maxk_256.cpp
index 97f7fbd46..170647a65 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_no_causalmask_has_attnbias_no_dropout_maxk_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_has_causalmask_no_bias_no_dropout_maxk_256.cpp
@@ -1,16 +1,19 @@
+
 /*
  * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
  */
-#include <ck/ck.hpp>
 
+#include <ck/ck.hpp>
 #include "ck_tiled_fmha_batched_forward.h"
 
 template void run_batched_forward_causalmask_bias_dropout_dispatch<
     ck::half_t,
-    false,
     true,
     false,
+    false,
     256>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_no_causalmask_has_attnbias_no_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_has_causalmask_no_bias_no_dropout_maxk_32.cpp
similarity index 88%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_no_causalmask_has_attnbias_no_dropout_maxk_32.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_has_causalmask_no_bias_no_dropout_maxk_32.cpp
index 5edd0cd40..acdb267fd 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_no_causalmask_has_attnbias_no_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_has_causalmask_no_bias_no_dropout_maxk_32.cpp
@@ -1,16 +1,19 @@
+
 /*
  * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
  */
-#include <ck/ck.hpp>
 
+#include <ck/ck.hpp>
 #include "ck_tiled_fmha_batched_forward.h"
 
 template void run_batched_forward_causalmask_bias_dropout_dispatch<
     ck::half_t,
-    false,
     true,
     false,
+    false,
     32>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_no_causalmask_has_attnbias_no_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_has_causalmask_no_bias_no_dropout_maxk_64.cpp
similarity index 88%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_no_causalmask_has_attnbias_no_dropout_maxk_64.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_has_causalmask_no_bias_no_dropout_maxk_64.cpp
index 4e0f85734..14c01441b 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_no_causalmask_has_attnbias_no_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_has_causalmask_no_bias_no_dropout_maxk_64.cpp
@@ -1,16 +1,19 @@
+
 /*
  * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
  */
-#include <ck/ck.hpp>
 
+#include <ck/ck.hpp>
 #include "ck_tiled_fmha_batched_forward.h"
 
 template void run_batched_forward_causalmask_bias_dropout_dispatch<
     ck::half_t,
-    false,
     true,
     false,
+    false,
     64>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_no_causalmask_has_attnbias_has_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_no_causalmask_has_bias_has_dropout_maxk_128.cpp
similarity index 88%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_no_causalmask_has_attnbias_has_dropout_maxk_128.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_no_causalmask_has_bias_has_dropout_maxk_128.cpp
index ca332b921..c87a853a4 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_no_causalmask_has_attnbias_has_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_no_causalmask_has_bias_has_dropout_maxk_128.cpp
@@ -1,11 +1,14 @@
+
 /*
  * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
  */
-#include <ck/ck.hpp>
 
+#include <ck/ck.hpp>
 #include "ck_tiled_fmha_batched_forward.h"
 
 template void run_batched_forward_causalmask_bias_dropout_dispatch<
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_no_causalmask_has_attnbias_has_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_no_causalmask_has_bias_has_dropout_maxk_256.cpp
similarity index 88%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_no_causalmask_has_attnbias_has_dropout_maxk_256.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_no_causalmask_has_bias_has_dropout_maxk_256.cpp
index 2791fc6ff..62d6f3f14 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_no_causalmask_has_attnbias_has_dropout_maxk_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_no_causalmask_has_bias_has_dropout_maxk_256.cpp
@@ -1,11 +1,14 @@
+
 /*
  * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
  */
-#include <ck/ck.hpp>
 
+#include <ck/ck.hpp>
 #include "ck_tiled_fmha_batched_forward.h"
 
 template void run_batched_forward_causalmask_bias_dropout_dispatch<
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_has_causalmask_no_attnbias_has_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_no_causalmask_has_bias_has_dropout_maxk_32.cpp
similarity index 88%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_has_causalmask_no_attnbias_has_dropout_maxk_32.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_no_causalmask_has_bias_has_dropout_maxk_32.cpp
index 22586dc95..73dc87fc1 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_has_causalmask_no_attnbias_has_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_no_causalmask_has_bias_has_dropout_maxk_32.cpp
@@ -1,16 +1,19 @@
+
 /*
  * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
  */
-#include <ck/ck.hpp>
 
+#include <ck/ck.hpp>
 #include "ck_tiled_fmha_batched_forward.h"
 
 template void run_batched_forward_causalmask_bias_dropout_dispatch<
     ck::half_t,
-    true,
     false,
     true,
+    true,
     32>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_no_causalmask_has_attnbias_has_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_no_causalmask_has_bias_has_dropout_maxk_64.cpp
similarity index 88%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_no_causalmask_has_attnbias_has_dropout_maxk_64.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_no_causalmask_has_bias_has_dropout_maxk_64.cpp
index 03a78009e..dacb7ed77 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_no_causalmask_has_attnbias_has_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_no_causalmask_has_bias_has_dropout_maxk_64.cpp
@@ -1,11 +1,14 @@
+
 /*
  * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
  */
-#include <ck/ck.hpp>
 
+#include <ck/ck.hpp>
 #include "ck_tiled_fmha_batched_forward.h"
 
 template void run_batched_forward_causalmask_bias_dropout_dispatch<
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_has_causalmask_no_attnbias_no_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_no_causalmask_has_bias_no_dropout_maxk_128.cpp
similarity index 88%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_has_causalmask_no_attnbias_no_dropout_maxk_128.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_no_causalmask_has_bias_no_dropout_maxk_128.cpp
index 505d4d048..f535ef4f6 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_has_causalmask_no_attnbias_no_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_no_causalmask_has_bias_no_dropout_maxk_128.cpp
@@ -1,16 +1,19 @@
+
 /*
  * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
  */
-#include <ck/ck.hpp>
 
+#include <ck/ck.hpp>
 #include "ck_tiled_fmha_batched_forward.h"
 
 template void run_batched_forward_causalmask_bias_dropout_dispatch<
     ck::half_t,
-    true,
     false,
+    true,
     false,
     128>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_has_causalmask_no_attnbias_no_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_no_causalmask_has_bias_no_dropout_maxk_256.cpp
similarity index 88%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_has_causalmask_no_attnbias_no_dropout_maxk_256.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_no_causalmask_has_bias_no_dropout_maxk_256.cpp
index a438cca43..de1bbe73f 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_has_causalmask_no_attnbias_no_dropout_maxk_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_no_causalmask_has_bias_no_dropout_maxk_256.cpp
@@ -1,16 +1,19 @@
+
 /*
  * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
  */
-#include <ck/ck.hpp>
 
+#include <ck/ck.hpp>
 #include "ck_tiled_fmha_batched_forward.h"
 
 template void run_batched_forward_causalmask_bias_dropout_dispatch<
     ck::half_t,
-    true,
     false,
+    true,
     false,
     256>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_has_causalmask_no_attnbias_no_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_no_causalmask_has_bias_no_dropout_maxk_32.cpp
similarity index 88%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_has_causalmask_no_attnbias_no_dropout_maxk_32.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_no_causalmask_has_bias_no_dropout_maxk_32.cpp
index 96fd2bbb2..ad9d39793 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_has_causalmask_no_attnbias_no_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_no_causalmask_has_bias_no_dropout_maxk_32.cpp
@@ -1,16 +1,19 @@
+
 /*
  * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
  */
-#include <ck/ck.hpp>
 
+#include <ck/ck.hpp>
 #include "ck_tiled_fmha_batched_forward.h"
 
 template void run_batched_forward_causalmask_bias_dropout_dispatch<
     ck::half_t,
-    true,
     false,
+    true,
     false,
     32>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_has_causalmask_no_attnbias_no_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_no_causalmask_has_bias_no_dropout_maxk_64.cpp
similarity index 88%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_has_causalmask_no_attnbias_no_dropout_maxk_64.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_no_causalmask_has_bias_no_dropout_maxk_64.cpp
index 4a5105996..5f040fa03 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_has_causalmask_no_attnbias_no_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_no_causalmask_has_bias_no_dropout_maxk_64.cpp
@@ -1,16 +1,19 @@
+
 /*
  * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
  */
-#include <ck/ck.hpp>
 
+#include <ck/ck.hpp>
 #include "ck_tiled_fmha_batched_forward.h"
 
 template void run_batched_forward_causalmask_bias_dropout_dispatch<
     ck::half_t,
-    true,
     false,
+    true,
     false,
     64>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_no_causalmask_no_attnbias_has_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_no_causalmask_no_bias_has_dropout_maxk_128.cpp
similarity index 88%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_no_causalmask_no_attnbias_has_dropout_maxk_128.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_no_causalmask_no_bias_has_dropout_maxk_128.cpp
index da15841a3..c6171c350 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_no_causalmask_no_attnbias_has_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_no_causalmask_no_bias_has_dropout_maxk_128.cpp
@@ -1,11 +1,14 @@
+
 /*
  * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
  */
-#include <ck/ck.hpp>
 
+#include <ck/ck.hpp>
 #include "ck_tiled_fmha_batched_forward.h"
 
 template void run_batched_forward_causalmask_bias_dropout_dispatch<
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_no_causalmask_no_attnbias_has_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_no_causalmask_no_bias_has_dropout_maxk_256.cpp
similarity index 88%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_no_causalmask_no_attnbias_has_dropout_maxk_256.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_no_causalmask_no_bias_has_dropout_maxk_256.cpp
index f2ba8c911..5518daba3 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_no_causalmask_no_attnbias_has_dropout_maxk_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_no_causalmask_no_bias_has_dropout_maxk_256.cpp
@@ -1,11 +1,14 @@
+
 /*
  * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
  */
-#include <ck/ck.hpp>
 
+#include <ck/ck.hpp>
 #include "ck_tiled_fmha_batched_forward.h"
 
 template void run_batched_forward_causalmask_bias_dropout_dispatch<
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_no_causalmask_no_attnbias_has_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_no_causalmask_no_bias_has_dropout_maxk_32.cpp
similarity index 88%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_no_causalmask_no_attnbias_has_dropout_maxk_32.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_no_causalmask_no_bias_has_dropout_maxk_32.cpp
index 93ef1d810..0607c2325 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_no_causalmask_no_attnbias_has_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_no_causalmask_no_bias_has_dropout_maxk_32.cpp
@@ -1,11 +1,14 @@
+
 /*
  * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
  */
-#include <ck/ck.hpp>
 
+#include <ck/ck.hpp>
 #include "ck_tiled_fmha_batched_forward.h"
 
 template void run_batched_forward_causalmask_bias_dropout_dispatch<
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_no_causalmask_no_attnbias_has_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_no_causalmask_no_bias_has_dropout_maxk_64.cpp
similarity index 88%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_no_causalmask_no_attnbias_has_dropout_maxk_64.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_no_causalmask_no_bias_has_dropout_maxk_64.cpp
index ab6382b62..e0e156802 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_no_causalmask_no_attnbias_has_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_no_causalmask_no_bias_has_dropout_maxk_64.cpp
@@ -1,11 +1,14 @@
+
 /*
  * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
  */
-#include <ck/ck.hpp>
 
+#include <ck/ck.hpp>
 #include "ck_tiled_fmha_batched_forward.h"
 
 template void run_batched_forward_causalmask_bias_dropout_dispatch<
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_no_causalmask_no_attnbias_no_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_no_causalmask_no_bias_no_dropout_maxk_128.cpp
similarity index 88%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_no_causalmask_no_attnbias_no_dropout_maxk_128.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_no_causalmask_no_bias_no_dropout_maxk_128.cpp
index 84deea900..22082a993 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_no_causalmask_no_attnbias_no_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_no_causalmask_no_bias_no_dropout_maxk_128.cpp
@@ -1,11 +1,14 @@
+
 /*
  * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
  */
-#include <ck/ck.hpp>
 
+#include <ck/ck.hpp>
 #include "ck_tiled_fmha_batched_forward.h"
 
 template void run_batched_forward_causalmask_bias_dropout_dispatch<
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_no_causalmask_no_attnbias_no_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_no_causalmask_no_bias_no_dropout_maxk_256.cpp
similarity index 88%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_no_causalmask_no_attnbias_no_dropout_maxk_256.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_no_causalmask_no_bias_no_dropout_maxk_256.cpp
index cf24162f4..e52ed1a52 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_no_causalmask_no_attnbias_no_dropout_maxk_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_no_causalmask_no_bias_no_dropout_maxk_256.cpp
@@ -1,11 +1,14 @@
+
 /*
  * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
  */
-#include <ck/ck.hpp>
 
+#include <ck/ck.hpp>
 #include "ck_tiled_fmha_batched_forward.h"
 
 template void run_batched_forward_causalmask_bias_dropout_dispatch<
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_no_causalmask_no_attnbias_no_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_no_causalmask_no_bias_no_dropout_maxk_32.cpp
similarity index 88%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_no_causalmask_no_attnbias_no_dropout_maxk_32.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_no_causalmask_no_bias_no_dropout_maxk_32.cpp
index 392151f6d..37bee2973 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_no_causalmask_no_attnbias_no_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_no_causalmask_no_bias_no_dropout_maxk_32.cpp
@@ -1,11 +1,14 @@
+
 /*
  * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
  */
-#include <ck/ck.hpp>
 
+#include <ck/ck.hpp>
 #include "ck_tiled_fmha_batched_forward.h"
 
 template void run_batched_forward_causalmask_bias_dropout_dispatch<
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_no_causalmask_no_attnbias_no_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_no_causalmask_no_bias_no_dropout_maxk_64.cpp
similarity index 88%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_no_causalmask_no_attnbias_no_dropout_maxk_64.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_no_causalmask_no_bias_no_dropout_maxk_64.cpp
index 2960c998b..3deec3078 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_no_causalmask_no_attnbias_no_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_no_causalmask_no_bias_no_dropout_maxk_64.cpp
@@ -1,11 +1,14 @@
+
 /*
  * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
  */
-#include <ck/ck.hpp>
 
+#include <ck/ck.hpp>
 #include "ck_tiled_fmha_batched_forward.h"
 
 template void run_batched_forward_causalmask_bias_dropout_dispatch<
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_has_causalmask_has_attnbias_has_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_has_causalmask_has_bias_has_dropout_maxk_128.cpp
similarity index 88%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_has_causalmask_has_attnbias_has_dropout_maxk_128.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_has_causalmask_has_bias_has_dropout_maxk_128.cpp
index 936789b59..8923f4008 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_has_causalmask_has_attnbias_has_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_has_causalmask_has_bias_has_dropout_maxk_128.cpp
@@ -1,11 +1,14 @@
+
 /*
  * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
  */
-#include <ck/ck.hpp>
 
+#include <ck/ck.hpp>
 #include "ck_tiled_fmha_batched_infer.h"
 
 template void run_batched_infer_causalmask_bias_dropout_dispatch<
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_has_causalmask_has_attnbias_has_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_has_causalmask_has_bias_has_dropout_maxk_256.cpp
similarity index 88%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_has_causalmask_has_attnbias_has_dropout_maxk_256.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_has_causalmask_has_bias_has_dropout_maxk_256.cpp
index 26454ef59..c21f4dcdd 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_has_causalmask_has_attnbias_has_dropout_maxk_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_has_causalmask_has_bias_has_dropout_maxk_256.cpp
@@ -1,11 +1,14 @@
+
 /*
  * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
  */
-#include <ck/ck.hpp>
 
+#include <ck/ck.hpp>
 #include "ck_tiled_fmha_batched_infer.h"
 
 template void run_batched_infer_causalmask_bias_dropout_dispatch<
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_has_causalmask_has_attnbias_has_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_has_causalmask_has_bias_has_dropout_maxk_32.cpp
similarity index 88%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_has_causalmask_has_attnbias_has_dropout_maxk_32.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_has_causalmask_has_bias_has_dropout_maxk_32.cpp
index 97272b032..40483eab7 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_has_causalmask_has_attnbias_has_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_has_causalmask_has_bias_has_dropout_maxk_32.cpp
@@ -1,11 +1,14 @@
+
 /*
  * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
  */
-#include <ck/ck.hpp>
 
+#include <ck/ck.hpp>
 #include "ck_tiled_fmha_batched_infer.h"
 
 template void run_batched_infer_causalmask_bias_dropout_dispatch<
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_has_causalmask_has_attnbias_has_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_has_causalmask_has_bias_has_dropout_maxk_64.cpp
similarity index 88%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_has_causalmask_has_attnbias_has_dropout_maxk_64.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_has_causalmask_has_bias_has_dropout_maxk_64.cpp
index 913afceaf..319648375 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_has_causalmask_has_attnbias_has_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_has_causalmask_has_bias_has_dropout_maxk_64.cpp
@@ -1,11 +1,14 @@
+
 /*
  * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
  */
-#include <ck/ck.hpp>
 
+#include <ck/ck.hpp>
 #include "ck_tiled_fmha_batched_infer.h"
 
 template void run_batched_infer_causalmask_bias_dropout_dispatch<
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_has_causalmask_has_attnbias_no_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_has_causalmask_has_bias_no_dropout_maxk_128.cpp
similarity index 88%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_has_causalmask_has_attnbias_no_dropout_maxk_128.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_has_causalmask_has_bias_no_dropout_maxk_128.cpp
index d3d4f0823..b0928ecfc 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_has_causalmask_has_attnbias_no_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_has_causalmask_has_bias_no_dropout_maxk_128.cpp
@@ -1,11 +1,14 @@
+
 /*
  * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
  */
-#include <ck/ck.hpp>
 
+#include <ck/ck.hpp>
 #include "ck_tiled_fmha_batched_infer.h"
 
 template void run_batched_infer_causalmask_bias_dropout_dispatch<
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_no_causalmask_has_attnbias_has_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_has_causalmask_has_bias_no_dropout_maxk_256.cpp
similarity index 88%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_no_causalmask_has_attnbias_has_dropout_maxk_256.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_has_causalmask_has_bias_no_dropout_maxk_256.cpp
index 152c34e56..990cc05ce 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_no_causalmask_has_attnbias_has_dropout_maxk_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_has_causalmask_has_bias_no_dropout_maxk_256.cpp
@@ -1,16 +1,19 @@
+
 /*
  * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
  */
-#include <ck/ck.hpp>
 
+#include <ck/ck.hpp>
 #include "ck_tiled_fmha_batched_infer.h"
 
 template void run_batched_infer_causalmask_bias_dropout_dispatch<
     ck::bhalf_t,
-    false,
     true,
     true,
+    false,
     256>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_has_causalmask_no_attnbias_has_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_has_causalmask_has_bias_no_dropout_maxk_32.cpp
similarity index 88%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_has_causalmask_no_attnbias_has_dropout_maxk_32.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_has_causalmask_has_bias_no_dropout_maxk_32.cpp
index ace85cec2..f15d45e69 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_has_causalmask_no_attnbias_has_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_has_causalmask_has_bias_no_dropout_maxk_32.cpp
@@ -1,16 +1,19 @@
+
 /*
  * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
  */
-#include <ck/ck.hpp>
 
+#include <ck/ck.hpp>
 #include "ck_tiled_fmha_batched_infer.h"
 
 template void run_batched_infer_causalmask_bias_dropout_dispatch<
     ck::bhalf_t,
     true,
-    false,
     true,
+    false,
     32>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_has_causalmask_has_attnbias_no_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_has_causalmask_has_bias_no_dropout_maxk_64.cpp
similarity index 88%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_has_causalmask_has_attnbias_no_dropout_maxk_64.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_has_causalmask_has_bias_no_dropout_maxk_64.cpp
index 632fb0794..640f9fe2d 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_has_causalmask_has_attnbias_no_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_has_causalmask_has_bias_no_dropout_maxk_64.cpp
@@ -1,11 +1,14 @@
+
 /*
  * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
  */
-#include <ck/ck.hpp>
 
+#include <ck/ck.hpp>
 #include "ck_tiled_fmha_batched_infer.h"
 
 template void run_batched_infer_causalmask_bias_dropout_dispatch<
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_has_causalmask_no_attnbias_has_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_has_causalmask_no_bias_has_dropout_maxk_128.cpp
similarity index 88%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_has_causalmask_no_attnbias_has_dropout_maxk_128.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_has_causalmask_no_bias_has_dropout_maxk_128.cpp
index b8a1fde66..9597383c9 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_has_causalmask_no_attnbias_has_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_has_causalmask_no_bias_has_dropout_maxk_128.cpp
@@ -1,11 +1,14 @@
+
 /*
  * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
  */
-#include <ck/ck.hpp>
 
+#include <ck/ck.hpp>
 #include "ck_tiled_fmha_batched_infer.h"
 
 template void run_batched_infer_causalmask_bias_dropout_dispatch<
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_has_causalmask_no_attnbias_has_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_has_causalmask_no_bias_has_dropout_maxk_256.cpp
similarity index 88%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_has_causalmask_no_attnbias_has_dropout_maxk_256.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_has_causalmask_no_bias_has_dropout_maxk_256.cpp
index 76b569cff..fe8993be4 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_has_causalmask_no_attnbias_has_dropout_maxk_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_has_causalmask_no_bias_has_dropout_maxk_256.cpp
@@ -1,11 +1,14 @@
+
 /*
  * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
  */
-#include <ck/ck.hpp>
 
+#include <ck/ck.hpp>
 #include "ck_tiled_fmha_batched_infer.h"
 
 template void run_batched_infer_causalmask_bias_dropout_dispatch<
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_no_causalmask_has_attnbias_has_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_has_causalmask_no_bias_has_dropout_maxk_32.cpp
similarity index 88%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_no_causalmask_has_attnbias_has_dropout_maxk_32.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_has_causalmask_no_bias_has_dropout_maxk_32.cpp
index 2db0507bd..164c45405 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_no_causalmask_has_attnbias_has_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_has_causalmask_no_bias_has_dropout_maxk_32.cpp
@@ -1,16 +1,19 @@
+
 /*
  * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
  */
-#include <ck/ck.hpp>
 
+#include <ck/ck.hpp>
 #include "ck_tiled_fmha_batched_infer.h"
 
 template void run_batched_infer_causalmask_bias_dropout_dispatch<
     ck::bhalf_t,
-    false,
     true,
+    false,
     true,
     32>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_has_causalmask_no_attnbias_has_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_has_causalmask_no_bias_has_dropout_maxk_64.cpp
similarity index 88%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_has_causalmask_no_attnbias_has_dropout_maxk_64.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_has_causalmask_no_bias_has_dropout_maxk_64.cpp
index 3f1df08f6..7f7f9af7d 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_has_causalmask_no_attnbias_has_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_has_causalmask_no_bias_has_dropout_maxk_64.cpp
@@ -1,11 +1,14 @@
+
 /*
  * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
  */
-#include <ck/ck.hpp>
 
+#include <ck/ck.hpp>
 #include "ck_tiled_fmha_batched_infer.h"
 
 template void run_batched_infer_causalmask_bias_dropout_dispatch<
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_no_causalmask_has_attnbias_no_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_has_causalmask_no_bias_no_dropout_maxk_128.cpp
similarity index 88%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_no_causalmask_has_attnbias_no_dropout_maxk_128.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_has_causalmask_no_bias_no_dropout_maxk_128.cpp
index 5a19fe469..a73c01e2e 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_no_causalmask_has_attnbias_no_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_has_causalmask_no_bias_no_dropout_maxk_128.cpp
@@ -1,16 +1,19 @@
+
 /*
  * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
  */
-#include <ck/ck.hpp>
 
+#include <ck/ck.hpp>
 #include "ck_tiled_fmha_batched_infer.h"
 
 template void run_batched_infer_causalmask_bias_dropout_dispatch<
     ck::bhalf_t,
-    false,
     true,
     false,
+    false,
     128>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_no_causalmask_has_attnbias_no_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_has_causalmask_no_bias_no_dropout_maxk_256.cpp
similarity index 88%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_no_causalmask_has_attnbias_no_dropout_maxk_256.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_has_causalmask_no_bias_no_dropout_maxk_256.cpp
index 0d9edb15d..e7234ebc2 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_no_causalmask_has_attnbias_no_dropout_maxk_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_has_causalmask_no_bias_no_dropout_maxk_256.cpp
@@ -1,16 +1,19 @@
+
 /*
  * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
  */
-#include <ck/ck.hpp>
 
+#include <ck/ck.hpp>
 #include "ck_tiled_fmha_batched_infer.h"
 
 template void run_batched_infer_causalmask_bias_dropout_dispatch<
     ck::bhalf_t,
-    false,
     true,
     false,
+    false,
     256>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_no_causalmask_has_attnbias_no_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_has_causalmask_no_bias_no_dropout_maxk_32.cpp
similarity index 88%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_no_causalmask_has_attnbias_no_dropout_maxk_32.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_has_causalmask_no_bias_no_dropout_maxk_32.cpp
index 25928ff52..64dbc7049 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_no_causalmask_has_attnbias_no_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_has_causalmask_no_bias_no_dropout_maxk_32.cpp
@@ -1,16 +1,19 @@
+
 /*
  * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
  */
-#include <ck/ck.hpp>
 
+#include <ck/ck.hpp>
 #include "ck_tiled_fmha_batched_infer.h"
 
 template void run_batched_infer_causalmask_bias_dropout_dispatch<
     ck::bhalf_t,
-    false,
     true,
     false,
+    false,
     32>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_no_causalmask_has_attnbias_no_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_has_causalmask_no_bias_no_dropout_maxk_64.cpp
similarity index 88%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_no_causalmask_has_attnbias_no_dropout_maxk_64.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_has_causalmask_no_bias_no_dropout_maxk_64.cpp
index 823e9e1d1..5a609eaf0 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_no_causalmask_has_attnbias_no_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_has_causalmask_no_bias_no_dropout_maxk_64.cpp
@@ -1,16 +1,19 @@
+
 /*
  * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
  */
-#include <ck/ck.hpp>
 
+#include <ck/ck.hpp>
 #include "ck_tiled_fmha_batched_infer.h"
 
 template void run_batched_infer_causalmask_bias_dropout_dispatch<
     ck::bhalf_t,
-    false,
     true,
     false,
+    false,
     64>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_no_causalmask_has_attnbias_has_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_no_causalmask_has_bias_has_dropout_maxk_128.cpp
similarity index 88%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_no_causalmask_has_attnbias_has_dropout_maxk_128.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_no_causalmask_has_bias_has_dropout_maxk_128.cpp
index 6b547e34e..c101ff149 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_no_causalmask_has_attnbias_has_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_no_causalmask_has_bias_has_dropout_maxk_128.cpp
@@ -1,11 +1,14 @@
+
 /*
  * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
  */
-#include <ck/ck.hpp>
 
+#include <ck/ck.hpp>
 #include "ck_tiled_fmha_batched_infer.h"
 
 template void run_batched_infer_causalmask_bias_dropout_dispatch<
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_has_causalmask_has_attnbias_no_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_no_causalmask_has_bias_has_dropout_maxk_256.cpp
similarity index 88%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_has_causalmask_has_attnbias_no_dropout_maxk_256.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_no_causalmask_has_bias_has_dropout_maxk_256.cpp
index a11984f7a..98f6d6723 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_has_causalmask_has_attnbias_no_dropout_maxk_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_no_causalmask_has_bias_has_dropout_maxk_256.cpp
@@ -1,16 +1,19 @@
+
 /*
  * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
  */
-#include <ck/ck.hpp>
 
+#include <ck/ck.hpp>
 #include "ck_tiled_fmha_batched_infer.h"
 
 template void run_batched_infer_causalmask_bias_dropout_dispatch<
     ck::bhalf_t,
+    false,
     true,
     true,
-    false,
     256>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_has_causalmask_has_attnbias_no_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_no_causalmask_has_bias_has_dropout_maxk_32.cpp
similarity index 88%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_has_causalmask_has_attnbias_no_dropout_maxk_32.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_no_causalmask_has_bias_has_dropout_maxk_32.cpp
index 1712a317d..627f4ea61 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_has_causalmask_has_attnbias_no_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_no_causalmask_has_bias_has_dropout_maxk_32.cpp
@@ -1,16 +1,19 @@
+
 /*
  * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
  */
-#include <ck/ck.hpp>
 
+#include <ck/ck.hpp>
 #include "ck_tiled_fmha_batched_infer.h"
 
 template void run_batched_infer_causalmask_bias_dropout_dispatch<
     ck::bhalf_t,
+    false,
     true,
     true,
-    false,
     32>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_no_causalmask_has_attnbias_has_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_no_causalmask_has_bias_has_dropout_maxk_64.cpp
similarity index 88%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_no_causalmask_has_attnbias_has_dropout_maxk_64.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_no_causalmask_has_bias_has_dropout_maxk_64.cpp
index f9b0d1519..c7263bc26 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_no_causalmask_has_attnbias_has_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_no_causalmask_has_bias_has_dropout_maxk_64.cpp
@@ -1,11 +1,14 @@
+
 /*
  * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
  */
-#include <ck/ck.hpp>
 
+#include <ck/ck.hpp>
 #include "ck_tiled_fmha_batched_infer.h"
 
 template void run_batched_infer_causalmask_bias_dropout_dispatch<
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_no_causalmask_no_attnbias_has_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_no_causalmask_has_bias_no_dropout_maxk_128.cpp
similarity index 88%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_no_causalmask_no_attnbias_has_dropout_maxk_128.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_no_causalmask_has_bias_no_dropout_maxk_128.cpp
index 109a6e914..fabe89504 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_no_causalmask_no_attnbias_has_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_no_causalmask_has_bias_no_dropout_maxk_128.cpp
@@ -1,16 +1,19 @@
+
 /*
  * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
  */
-#include <ck/ck.hpp>
 
+#include <ck/ck.hpp>
 #include "ck_tiled_fmha_batched_infer.h"
 
 template void run_batched_infer_causalmask_bias_dropout_dispatch<
     ck::bhalf_t,
     false,
-    false,
     true,
+    false,
     128>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_no_causalmask_no_attnbias_has_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_no_causalmask_has_bias_no_dropout_maxk_256.cpp
similarity index 88%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_no_causalmask_no_attnbias_has_dropout_maxk_256.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_no_causalmask_has_bias_no_dropout_maxk_256.cpp
index b278bde42..ca31525f0 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_no_causalmask_no_attnbias_has_dropout_maxk_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_no_causalmask_has_bias_no_dropout_maxk_256.cpp
@@ -1,16 +1,19 @@
+
 /*
  * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
  */
-#include <ck/ck.hpp>
 
+#include <ck/ck.hpp>
 #include "ck_tiled_fmha_batched_infer.h"
 
 template void run_batched_infer_causalmask_bias_dropout_dispatch<
     ck::bhalf_t,
     false,
-    false,
     true,
+    false,
     256>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_no_causalmask_no_attnbias_has_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_no_causalmask_has_bias_no_dropout_maxk_32.cpp
similarity index 88%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_no_causalmask_no_attnbias_has_dropout_maxk_32.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_no_causalmask_has_bias_no_dropout_maxk_32.cpp
index 23f5e10f7..59474b191 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_no_causalmask_no_attnbias_has_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_no_causalmask_has_bias_no_dropout_maxk_32.cpp
@@ -1,16 +1,19 @@
+
 /*
  * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
  */
-#include <ck/ck.hpp>
 
+#include <ck/ck.hpp>
 #include "ck_tiled_fmha_batched_infer.h"
 
 template void run_batched_infer_causalmask_bias_dropout_dispatch<
     ck::bhalf_t,
     false,
-    false,
     true,
+    false,
     32>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_no_causalmask_no_attnbias_has_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_no_causalmask_has_bias_no_dropout_maxk_64.cpp
similarity index 88%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_no_causalmask_no_attnbias_has_dropout_maxk_64.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_no_causalmask_has_bias_no_dropout_maxk_64.cpp
index 7e62dfe1f..802214815 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_no_causalmask_no_attnbias_has_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_no_causalmask_has_bias_no_dropout_maxk_64.cpp
@@ -1,16 +1,19 @@
+
 /*
  * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
  */
-#include <ck/ck.hpp>
 
+#include <ck/ck.hpp>
 #include "ck_tiled_fmha_batched_infer.h"
 
 template void run_batched_infer_causalmask_bias_dropout_dispatch<
     ck::bhalf_t,
     false,
-    false,
     true,
+    false,
     64>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_has_causalmask_no_attnbias_no_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_no_causalmask_no_bias_has_dropout_maxk_128.cpp
similarity index 88%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_has_causalmask_no_attnbias_no_dropout_maxk_128.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_no_causalmask_no_bias_has_dropout_maxk_128.cpp
index eafa8238e..9bc056102 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_has_causalmask_no_attnbias_no_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_no_causalmask_no_bias_has_dropout_maxk_128.cpp
@@ -1,16 +1,19 @@
+
 /*
  * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
  */
-#include <ck/ck.hpp>
 
+#include <ck/ck.hpp>
 #include "ck_tiled_fmha_batched_infer.h"
 
 template void run_batched_infer_causalmask_bias_dropout_dispatch<
     ck::bhalf_t,
-    true,
     false,
     false,
+    true,
     128>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_has_causalmask_no_attnbias_no_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_no_causalmask_no_bias_has_dropout_maxk_256.cpp
similarity index 88%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_has_causalmask_no_attnbias_no_dropout_maxk_256.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_no_causalmask_no_bias_has_dropout_maxk_256.cpp
index 5528f22dd..001805e8a 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_has_causalmask_no_attnbias_no_dropout_maxk_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_no_causalmask_no_bias_has_dropout_maxk_256.cpp
@@ -1,16 +1,19 @@
+
 /*
  * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
  */
-#include <ck/ck.hpp>
 
+#include <ck/ck.hpp>
 #include "ck_tiled_fmha_batched_infer.h"
 
 template void run_batched_infer_causalmask_bias_dropout_dispatch<
     ck::bhalf_t,
-    true,
     false,
     false,
+    true,
     256>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_has_causalmask_no_attnbias_no_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_no_causalmask_no_bias_has_dropout_maxk_32.cpp
similarity index 88%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_has_causalmask_no_attnbias_no_dropout_maxk_32.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_no_causalmask_no_bias_has_dropout_maxk_32.cpp
index ceaa26f4d..3384be9d3 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_has_causalmask_no_attnbias_no_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_no_causalmask_no_bias_has_dropout_maxk_32.cpp
@@ -1,16 +1,19 @@
+
 /*
  * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
  */
-#include <ck/ck.hpp>
 
+#include <ck/ck.hpp>
 #include "ck_tiled_fmha_batched_infer.h"
 
 template void run_batched_infer_causalmask_bias_dropout_dispatch<
     ck::bhalf_t,
-    true,
     false,
     false,
+    true,
     32>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_has_causalmask_no_attnbias_no_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_no_causalmask_no_bias_has_dropout_maxk_64.cpp
similarity index 88%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_has_causalmask_no_attnbias_no_dropout_maxk_64.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_no_causalmask_no_bias_has_dropout_maxk_64.cpp
index e87f2672b..be5ece1fd 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_has_causalmask_no_attnbias_no_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_no_causalmask_no_bias_has_dropout_maxk_64.cpp
@@ -1,16 +1,19 @@
+
 /*
  * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
  */
-#include <ck/ck.hpp>
 
+#include <ck/ck.hpp>
 #include "ck_tiled_fmha_batched_infer.h"
 
 template void run_batched_infer_causalmask_bias_dropout_dispatch<
     ck::bhalf_t,
-    true,
     false,
     false,
+    true,
     64>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_no_causalmask_no_attnbias_no_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_no_causalmask_no_bias_no_dropout_maxk_128.cpp
similarity index 88%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_no_causalmask_no_attnbias_no_dropout_maxk_128.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_no_causalmask_no_bias_no_dropout_maxk_128.cpp
index 6fda3ae54..ccf7cb80b 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_no_causalmask_no_attnbias_no_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_no_causalmask_no_bias_no_dropout_maxk_128.cpp
@@ -1,11 +1,14 @@
+
 /*
  * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
  */
-#include <ck/ck.hpp>
 
+#include <ck/ck.hpp>
 #include "ck_tiled_fmha_batched_infer.h"
 
 template void run_batched_infer_causalmask_bias_dropout_dispatch<
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_no_causalmask_no_attnbias_no_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_no_causalmask_no_bias_no_dropout_maxk_256.cpp
similarity index 88%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_no_causalmask_no_attnbias_no_dropout_maxk_256.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_no_causalmask_no_bias_no_dropout_maxk_256.cpp
index fcc5a2bd8..4d13af6bc 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_no_causalmask_no_attnbias_no_dropout_maxk_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_no_causalmask_no_bias_no_dropout_maxk_256.cpp
@@ -1,11 +1,14 @@
+
 /*
  * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
  */
-#include <ck/ck.hpp>
 
+#include <ck/ck.hpp>
 #include "ck_tiled_fmha_batched_infer.h"
 
 template void run_batched_infer_causalmask_bias_dropout_dispatch<
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_no_causalmask_no_attnbias_no_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_no_causalmask_no_bias_no_dropout_maxk_32.cpp
similarity index 88%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_no_causalmask_no_attnbias_no_dropout_maxk_32.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_no_causalmask_no_bias_no_dropout_maxk_32.cpp
index cd7c4681b..2b8202b53 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_no_causalmask_no_attnbias_no_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_no_causalmask_no_bias_no_dropout_maxk_32.cpp
@@ -1,11 +1,14 @@
+
 /*
  * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
  */
-#include <ck/ck.hpp>
 
+#include <ck/ck.hpp>
 #include "ck_tiled_fmha_batched_infer.h"
 
 template void run_batched_infer_causalmask_bias_dropout_dispatch<
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_no_causalmask_no_attnbias_no_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_no_causalmask_no_bias_no_dropout_maxk_64.cpp
similarity index 88%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_no_causalmask_no_attnbias_no_dropout_maxk_64.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_no_causalmask_no_bias_no_dropout_maxk_64.cpp
index a2510ef7d..38fe474db 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_no_causalmask_no_attnbias_no_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_no_causalmask_no_bias_no_dropout_maxk_64.cpp
@@ -1,11 +1,14 @@
+
 /*
  * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
  */
-#include <ck/ck.hpp>
 
+#include <ck/ck.hpp>
 #include "ck_tiled_fmha_batched_infer.h"
 
 template void run_batched_infer_causalmask_bias_dropout_dispatch<
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_has_causalmask_has_attnbias_has_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_has_causalmask_has_bias_has_dropout_maxk_128.cpp
similarity index 88%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_has_causalmask_has_attnbias_has_dropout_maxk_128.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_has_causalmask_has_bias_has_dropout_maxk_128.cpp
index 91fa9cfb8..3a03e2ed1 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_has_causalmask_has_attnbias_has_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_has_causalmask_has_bias_has_dropout_maxk_128.cpp
@@ -1,11 +1,14 @@
+
 /*
  * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
  */
-#include <ck/ck.hpp>
 
+#include <ck/ck.hpp>
 #include "ck_tiled_fmha_batched_infer.h"
 
 template void run_batched_infer_causalmask_bias_dropout_dispatch<
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_has_causalmask_has_attnbias_has_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_has_causalmask_has_bias_has_dropout_maxk_256.cpp
similarity index 88%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_has_causalmask_has_attnbias_has_dropout_maxk_256.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_has_causalmask_has_bias_has_dropout_maxk_256.cpp
index a8db3c21e..74cf62de8 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_has_causalmask_has_attnbias_has_dropout_maxk_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_has_causalmask_has_bias_has_dropout_maxk_256.cpp
@@ -1,11 +1,14 @@
+
 /*
  * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
  */
-#include <ck/ck.hpp>
 
+#include <ck/ck.hpp>
 #include "ck_tiled_fmha_batched_infer.h"
 
 template void run_batched_infer_causalmask_bias_dropout_dispatch<
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_has_causalmask_has_attnbias_has_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_has_causalmask_has_bias_has_dropout_maxk_32.cpp
similarity index 88%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_has_causalmask_has_attnbias_has_dropout_maxk_32.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_has_causalmask_has_bias_has_dropout_maxk_32.cpp
index cf70efd4e..3d17dc729 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_has_causalmask_has_attnbias_has_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_has_causalmask_has_bias_has_dropout_maxk_32.cpp
@@ -1,11 +1,14 @@
+
 /*
  * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
  */
-#include <ck/ck.hpp>
 
+#include <ck/ck.hpp>
 #include "ck_tiled_fmha_batched_infer.h"
 
 template void run_batched_infer_causalmask_bias_dropout_dispatch<
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_has_causalmask_has_attnbias_has_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_has_causalmask_has_bias_has_dropout_maxk_64.cpp
similarity index 88%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_has_causalmask_has_attnbias_has_dropout_maxk_64.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_has_causalmask_has_bias_has_dropout_maxk_64.cpp
index 2699d7a96..49ef6a3ed 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_has_causalmask_has_attnbias_has_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_has_causalmask_has_bias_has_dropout_maxk_64.cpp
@@ -1,11 +1,14 @@
+
 /*
  * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
  */
-#include <ck/ck.hpp>
 
+#include <ck/ck.hpp>
 #include "ck_tiled_fmha_batched_infer.h"
 
 template void run_batched_infer_causalmask_bias_dropout_dispatch<
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_has_causalmask_has_attnbias_no_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_has_causalmask_has_bias_no_dropout_maxk_128.cpp
similarity index 88%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_has_causalmask_has_attnbias_no_dropout_maxk_128.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_has_causalmask_has_bias_no_dropout_maxk_128.cpp
index 98cdea404..6e9e3b2ab 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_has_causalmask_has_attnbias_no_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_has_causalmask_has_bias_no_dropout_maxk_128.cpp
@@ -1,11 +1,14 @@
+
 /*
  * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
  */
-#include <ck/ck.hpp>
 
+#include <ck/ck.hpp>
 #include "ck_tiled_fmha_batched_infer.h"
 
 template void run_batched_infer_causalmask_bias_dropout_dispatch<
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_has_causalmask_has_attnbias_no_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_has_causalmask_has_bias_no_dropout_maxk_256.cpp
similarity index 88%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_has_causalmask_has_attnbias_no_dropout_maxk_256.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_has_causalmask_has_bias_no_dropout_maxk_256.cpp
index 10444d7d8..1980128a2 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_has_causalmask_has_attnbias_no_dropout_maxk_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_has_causalmask_has_bias_no_dropout_maxk_256.cpp
@@ -1,11 +1,14 @@
+
 /*
  * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
  */
-#include <ck/ck.hpp>
 
+#include <ck/ck.hpp>
 #include "ck_tiled_fmha_batched_infer.h"
 
 template void run_batched_infer_causalmask_bias_dropout_dispatch<
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_has_causalmask_has_attnbias_no_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_has_causalmask_has_bias_no_dropout_maxk_32.cpp
similarity index 88%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_has_causalmask_has_attnbias_no_dropout_maxk_32.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_has_causalmask_has_bias_no_dropout_maxk_32.cpp
index d70389373..cefda7208 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_has_causalmask_has_attnbias_no_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_has_causalmask_has_bias_no_dropout_maxk_32.cpp
@@ -1,11 +1,14 @@
+
 /*
  * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
  */
-#include <ck/ck.hpp>
 
+#include <ck/ck.hpp>
 #include "ck_tiled_fmha_batched_infer.h"
 
 template void run_batched_infer_causalmask_bias_dropout_dispatch<
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_has_causalmask_has_attnbias_no_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_has_causalmask_has_bias_no_dropout_maxk_64.cpp
similarity index 88%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_has_causalmask_has_attnbias_no_dropout_maxk_64.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_has_causalmask_has_bias_no_dropout_maxk_64.cpp
index a6d22c666..718293285 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_has_causalmask_has_attnbias_no_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_has_causalmask_has_bias_no_dropout_maxk_64.cpp
@@ -1,11 +1,14 @@
+
 /*
  * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
  */
-#include <ck/ck.hpp>
 
+#include <ck/ck.hpp>
 #include "ck_tiled_fmha_batched_infer.h"
 
 template void run_batched_infer_causalmask_bias_dropout_dispatch<
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_has_causalmask_no_attnbias_has_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_has_causalmask_no_bias_has_dropout_maxk_128.cpp
similarity index 88%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_has_causalmask_no_attnbias_has_dropout_maxk_128.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_has_causalmask_no_bias_has_dropout_maxk_128.cpp
index 6ba251a1a..f45e10da9 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_has_causalmask_no_attnbias_has_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_has_causalmask_no_bias_has_dropout_maxk_128.cpp
@@ -1,11 +1,14 @@
+
 /*
  * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
  */
-#include <ck/ck.hpp>
 
+#include <ck/ck.hpp>
 #include "ck_tiled_fmha_batched_infer.h"
 
 template void run_batched_infer_causalmask_bias_dropout_dispatch<
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_has_causalmask_no_attnbias_has_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_has_causalmask_no_bias_has_dropout_maxk_256.cpp
similarity index 88%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_has_causalmask_no_attnbias_has_dropout_maxk_256.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_has_causalmask_no_bias_has_dropout_maxk_256.cpp
index 8da1f1e38..8c8d08f52 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_has_causalmask_no_attnbias_has_dropout_maxk_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_has_causalmask_no_bias_has_dropout_maxk_256.cpp
@@ -1,11 +1,14 @@
+
 /*
  * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
  */
-#include <ck/ck.hpp>
 
+#include <ck/ck.hpp>
 #include "ck_tiled_fmha_batched_infer.h"
 
 template void run_batched_infer_causalmask_bias_dropout_dispatch<
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_has_causalmask_no_attnbias_has_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_has_causalmask_no_bias_has_dropout_maxk_32.cpp
similarity index 88%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_has_causalmask_no_attnbias_has_dropout_maxk_32.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_has_causalmask_no_bias_has_dropout_maxk_32.cpp
index bb22a42a0..59ac4bc28 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_has_causalmask_no_attnbias_has_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_has_causalmask_no_bias_has_dropout_maxk_32.cpp
@@ -1,11 +1,14 @@
+
 /*
  * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
  */
-#include <ck/ck.hpp>
 
+#include <ck/ck.hpp>
 #include "ck_tiled_fmha_batched_infer.h"
 
 template void run_batched_infer_causalmask_bias_dropout_dispatch<
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_has_causalmask_no_attnbias_has_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_has_causalmask_no_bias_has_dropout_maxk_64.cpp
similarity index 88%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_has_causalmask_no_attnbias_has_dropout_maxk_64.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_has_causalmask_no_bias_has_dropout_maxk_64.cpp
index ff98dd555..edff64b7b 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_has_causalmask_no_attnbias_has_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_has_causalmask_no_bias_has_dropout_maxk_64.cpp
@@ -1,11 +1,14 @@
+
 /*
  * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
  */
-#include <ck/ck.hpp>
 
+#include <ck/ck.hpp>
 #include "ck_tiled_fmha_batched_infer.h"
 
 template void run_batched_infer_causalmask_bias_dropout_dispatch<
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_no_causalmask_has_attnbias_no_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_has_causalmask_no_bias_no_dropout_maxk_128.cpp
similarity index 88%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_no_causalmask_has_attnbias_no_dropout_maxk_128.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_has_causalmask_no_bias_no_dropout_maxk_128.cpp
index ae7739be4..b27270cc4 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_no_causalmask_has_attnbias_no_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_has_causalmask_no_bias_no_dropout_maxk_128.cpp
@@ -1,16 +1,19 @@
+
 /*
  * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
  */
-#include <ck/ck.hpp>
 
+#include <ck/ck.hpp>
 #include "ck_tiled_fmha_batched_infer.h"
 
 template void run_batched_infer_causalmask_bias_dropout_dispatch<
     ck::half_t,
-    false,
     true,
     false,
+    false,
     128>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_no_causalmask_has_attnbias_no_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_has_causalmask_no_bias_no_dropout_maxk_256.cpp
similarity index 88%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_no_causalmask_has_attnbias_no_dropout_maxk_256.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_has_causalmask_no_bias_no_dropout_maxk_256.cpp
index 3594e81fd..34a7b746f 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_no_causalmask_has_attnbias_no_dropout_maxk_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_has_causalmask_no_bias_no_dropout_maxk_256.cpp
@@ -1,16 +1,19 @@
+
 /*
  * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
  */
-#include <ck/ck.hpp>
 
+#include <ck/ck.hpp>
 #include "ck_tiled_fmha_batched_infer.h"
 
 template void run_batched_infer_causalmask_bias_dropout_dispatch<
     ck::half_t,
-    false,
     true,
     false,
+    false,
     256>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_no_causalmask_has_attnbias_no_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_has_causalmask_no_bias_no_dropout_maxk_32.cpp
similarity index 88%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_no_causalmask_has_attnbias_no_dropout_maxk_32.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_has_causalmask_no_bias_no_dropout_maxk_32.cpp
index e4fb8dbad..c8d2c42e1 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_no_causalmask_has_attnbias_no_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_has_causalmask_no_bias_no_dropout_maxk_32.cpp
@@ -1,16 +1,19 @@
+
 /*
  * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
  */
-#include <ck/ck.hpp>
 
+#include <ck/ck.hpp>
 #include "ck_tiled_fmha_batched_infer.h"
 
 template void run_batched_infer_causalmask_bias_dropout_dispatch<
     ck::half_t,
-    false,
     true,
     false,
+    false,
     32>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_no_causalmask_has_attnbias_no_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_has_causalmask_no_bias_no_dropout_maxk_64.cpp
similarity index 88%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_no_causalmask_has_attnbias_no_dropout_maxk_64.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_has_causalmask_no_bias_no_dropout_maxk_64.cpp
index a15494b0f..747ad6cf2 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_no_causalmask_has_attnbias_no_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_has_causalmask_no_bias_no_dropout_maxk_64.cpp
@@ -1,16 +1,19 @@
+
 /*
  * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
  */
-#include <ck/ck.hpp>
 
+#include <ck/ck.hpp>
 #include "ck_tiled_fmha_batched_infer.h"
 
 template void run_batched_infer_causalmask_bias_dropout_dispatch<
     ck::half_t,
-    false,
     true,
     false,
+    false,
     64>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_no_causalmask_has_attnbias_has_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_no_causalmask_has_bias_has_dropout_maxk_128.cpp
similarity index 88%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_no_causalmask_has_attnbias_has_dropout_maxk_128.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_no_causalmask_has_bias_has_dropout_maxk_128.cpp
index 2d60996b8..83cdbd0e3 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_no_causalmask_has_attnbias_has_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_no_causalmask_has_bias_has_dropout_maxk_128.cpp
@@ -1,11 +1,14 @@
+
 /*
  * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
  */
-#include <ck/ck.hpp>
 
+#include <ck/ck.hpp>
 #include "ck_tiled_fmha_batched_infer.h"
 
 template void run_batched_infer_causalmask_bias_dropout_dispatch<
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_no_causalmask_has_attnbias_has_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_no_causalmask_has_bias_has_dropout_maxk_256.cpp
similarity index 88%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_no_causalmask_has_attnbias_has_dropout_maxk_256.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_no_causalmask_has_bias_has_dropout_maxk_256.cpp
index 3a39fb4ae..e72ef8963 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_no_causalmask_has_attnbias_has_dropout_maxk_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_no_causalmask_has_bias_has_dropout_maxk_256.cpp
@@ -1,11 +1,14 @@
+
 /*
  * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
  */
-#include <ck/ck.hpp>
 
+#include <ck/ck.hpp>
 #include "ck_tiled_fmha_batched_infer.h"
 
 template void run_batched_infer_causalmask_bias_dropout_dispatch<
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_no_causalmask_has_attnbias_has_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_no_causalmask_has_bias_has_dropout_maxk_32.cpp
similarity index 88%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_no_causalmask_has_attnbias_has_dropout_maxk_32.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_no_causalmask_has_bias_has_dropout_maxk_32.cpp
index 1951d311c..1269c0e74 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_no_causalmask_has_attnbias_has_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_no_causalmask_has_bias_has_dropout_maxk_32.cpp
@@ -1,11 +1,14 @@
+
 /*
  * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
  */
-#include <ck/ck.hpp>
 
+#include <ck/ck.hpp>
 #include "ck_tiled_fmha_batched_infer.h"
 
 template void run_batched_infer_causalmask_bias_dropout_dispatch<
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_no_causalmask_has_attnbias_has_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_no_causalmask_has_bias_has_dropout_maxk_64.cpp
similarity index 88%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_no_causalmask_has_attnbias_has_dropout_maxk_64.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_no_causalmask_has_bias_has_dropout_maxk_64.cpp
index 4557fe7aa..55a152e43 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_no_causalmask_has_attnbias_has_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_no_causalmask_has_bias_has_dropout_maxk_64.cpp
@@ -1,11 +1,14 @@
+
 /*
  * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
  */
-#include <ck/ck.hpp>
 
+#include <ck/ck.hpp>
 #include "ck_tiled_fmha_batched_infer.h"
 
 template void run_batched_infer_causalmask_bias_dropout_dispatch<
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_has_causalmask_no_attnbias_no_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_no_causalmask_has_bias_no_dropout_maxk_128.cpp
similarity index 88%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_has_causalmask_no_attnbias_no_dropout_maxk_128.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_no_causalmask_has_bias_no_dropout_maxk_128.cpp
index b310ad71f..a348774eb 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_has_causalmask_no_attnbias_no_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_no_causalmask_has_bias_no_dropout_maxk_128.cpp
@@ -1,16 +1,19 @@
+
 /*
  * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
  */
-#include <ck/ck.hpp>
 
+#include <ck/ck.hpp>
 #include "ck_tiled_fmha_batched_infer.h"
 
 template void run_batched_infer_causalmask_bias_dropout_dispatch<
     ck::half_t,
-    true,
     false,
+    true,
     false,
     128>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_has_causalmask_no_attnbias_no_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_no_causalmask_has_bias_no_dropout_maxk_256.cpp
similarity index 88%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_has_causalmask_no_attnbias_no_dropout_maxk_256.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_no_causalmask_has_bias_no_dropout_maxk_256.cpp
index 4e0ab2c07..95a57bb7d 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_has_causalmask_no_attnbias_no_dropout_maxk_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_no_causalmask_has_bias_no_dropout_maxk_256.cpp
@@ -1,16 +1,19 @@
+
 /*
  * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
  */
-#include <ck/ck.hpp>
 
+#include <ck/ck.hpp>
 #include "ck_tiled_fmha_batched_infer.h"
 
 template void run_batched_infer_causalmask_bias_dropout_dispatch<
     ck::half_t,
-    true,
     false,
+    true,
     false,
     256>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_has_causalmask_no_attnbias_no_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_no_causalmask_has_bias_no_dropout_maxk_32.cpp
similarity index 88%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_has_causalmask_no_attnbias_no_dropout_maxk_32.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_no_causalmask_has_bias_no_dropout_maxk_32.cpp
index 4e3d7c989..5573f81b1 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_has_causalmask_no_attnbias_no_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_no_causalmask_has_bias_no_dropout_maxk_32.cpp
@@ -1,16 +1,19 @@
+
 /*
  * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
  */
-#include <ck/ck.hpp>
 
+#include <ck/ck.hpp>
 #include "ck_tiled_fmha_batched_infer.h"
 
 template void run_batched_infer_causalmask_bias_dropout_dispatch<
     ck::half_t,
-    true,
     false,
+    true,
     false,
     32>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_has_causalmask_no_attnbias_no_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_no_causalmask_has_bias_no_dropout_maxk_64.cpp
similarity index 88%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_has_causalmask_no_attnbias_no_dropout_maxk_64.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_no_causalmask_has_bias_no_dropout_maxk_64.cpp
index e619bcb8d..c8eaea6a6 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_has_causalmask_no_attnbias_no_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_no_causalmask_has_bias_no_dropout_maxk_64.cpp
@@ -1,16 +1,19 @@
+
 /*
  * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
  */
-#include <ck/ck.hpp>
 
+#include <ck/ck.hpp>
 #include "ck_tiled_fmha_batched_infer.h"
 
 template void run_batched_infer_causalmask_bias_dropout_dispatch<
     ck::half_t,
-    true,
     false,
+    true,
     false,
     64>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_no_causalmask_no_attnbias_has_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_no_causalmask_no_bias_has_dropout_maxk_128.cpp
similarity index 88%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_no_causalmask_no_attnbias_has_dropout_maxk_128.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_no_causalmask_no_bias_has_dropout_maxk_128.cpp
index 81607aa68..347120778 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_no_causalmask_no_attnbias_has_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_no_causalmask_no_bias_has_dropout_maxk_128.cpp
@@ -1,11 +1,14 @@
+
 /*
  * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
  */
-#include <ck/ck.hpp>
 
+#include <ck/ck.hpp>
 #include "ck_tiled_fmha_batched_infer.h"
 
 template void run_batched_infer_causalmask_bias_dropout_dispatch<
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_no_causalmask_no_attnbias_has_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_no_causalmask_no_bias_has_dropout_maxk_256.cpp
similarity index 88%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_no_causalmask_no_attnbias_has_dropout_maxk_256.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_no_causalmask_no_bias_has_dropout_maxk_256.cpp
index 86e5b5a66..b3542bbf9 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_no_causalmask_no_attnbias_has_dropout_maxk_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_no_causalmask_no_bias_has_dropout_maxk_256.cpp
@@ -1,11 +1,14 @@
+
 /*
  * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
  */
-#include <ck/ck.hpp>
 
+#include <ck/ck.hpp>
 #include "ck_tiled_fmha_batched_infer.h"
 
 template void run_batched_infer_causalmask_bias_dropout_dispatch<
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_no_causalmask_no_attnbias_has_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_no_causalmask_no_bias_has_dropout_maxk_32.cpp
similarity index 88%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_no_causalmask_no_attnbias_has_dropout_maxk_32.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_no_causalmask_no_bias_has_dropout_maxk_32.cpp
index 07d487f6e..829f61029 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_no_causalmask_no_attnbias_has_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_no_causalmask_no_bias_has_dropout_maxk_32.cpp
@@ -1,11 +1,14 @@
+
 /*
  * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
  */
-#include <ck/ck.hpp>
 
+#include <ck/ck.hpp>
 #include "ck_tiled_fmha_batched_infer.h"
 
 template void run_batched_infer_causalmask_bias_dropout_dispatch<
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_no_causalmask_no_attnbias_has_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_no_causalmask_no_bias_has_dropout_maxk_64.cpp
similarity index 88%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_no_causalmask_no_attnbias_has_dropout_maxk_64.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_no_causalmask_no_bias_has_dropout_maxk_64.cpp
index 83043e1c5..a5c71f3a2 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_no_causalmask_no_attnbias_has_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_no_causalmask_no_bias_has_dropout_maxk_64.cpp
@@ -1,11 +1,14 @@
+
 /*
  * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
  */
-#include <ck/ck.hpp>
 
+#include <ck/ck.hpp>
 #include "ck_tiled_fmha_batched_infer.h"
 
 template void run_batched_infer_causalmask_bias_dropout_dispatch<
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_no_causalmask_no_attnbias_no_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_no_causalmask_no_bias_no_dropout_maxk_128.cpp
similarity index 88%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_no_causalmask_no_attnbias_no_dropout_maxk_128.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_no_causalmask_no_bias_no_dropout_maxk_128.cpp
index f6ffe4963..51dd2f78f 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_no_causalmask_no_attnbias_no_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_no_causalmask_no_bias_no_dropout_maxk_128.cpp
@@ -1,11 +1,14 @@
+
 /*
  * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
  */
-#include <ck/ck.hpp>
 
+#include <ck/ck.hpp>
 #include "ck_tiled_fmha_batched_infer.h"
 
 template void run_batched_infer_causalmask_bias_dropout_dispatch<
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_no_causalmask_no_attnbias_no_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_no_causalmask_no_bias_no_dropout_maxk_256.cpp
similarity index 88%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_no_causalmask_no_attnbias_no_dropout_maxk_256.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_no_causalmask_no_bias_no_dropout_maxk_256.cpp
index 3b57b10ce..51c34e651 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_no_causalmask_no_attnbias_no_dropout_maxk_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_no_causalmask_no_bias_no_dropout_maxk_256.cpp
@@ -1,11 +1,14 @@
+
 /*
  * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
  */
-#include <ck/ck.hpp>
 
+#include <ck/ck.hpp>
 #include "ck_tiled_fmha_batched_infer.h"
 
 template void run_batched_infer_causalmask_bias_dropout_dispatch<
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_no_causalmask_no_attnbias_no_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_no_causalmask_no_bias_no_dropout_maxk_32.cpp
similarity index 88%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_no_causalmask_no_attnbias_no_dropout_maxk_32.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_no_causalmask_no_bias_no_dropout_maxk_32.cpp
index 00872610f..700f9acfd 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_no_causalmask_no_attnbias_no_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_no_causalmask_no_bias_no_dropout_maxk_32.cpp
@@ -1,11 +1,14 @@
+
 /*
  * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
  */
-#include <ck/ck.hpp>
 
+#include <ck/ck.hpp>
 #include "ck_tiled_fmha_batched_infer.h"
 
 template void run_batched_infer_causalmask_bias_dropout_dispatch<
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_no_causalmask_no_attnbias_no_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_no_causalmask_no_bias_no_dropout_maxk_64.cpp
similarity index 88%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_no_causalmask_no_attnbias_no_dropout_maxk_64.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_no_causalmask_no_bias_no_dropout_maxk_64.cpp
index 0d69fcda0..4d43ed9b5 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_no_causalmask_no_attnbias_no_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_no_causalmask_no_bias_no_dropout_maxk_64.cpp
@@ -1,11 +1,14 @@
+
 /*
  * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
  */
-#include <ck/ck.hpp>
 
+#include <ck/ck.hpp>
 #include "ck_tiled_fmha_batched_infer.h"
 
 template void run_batched_infer_causalmask_bias_dropout_dispatch<
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_has_causalmask_has_attnbias_has_biasgrad_has_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_has_causalmask_has_bias_has_biasgrad_has_dropout_maxk_128.cpp
similarity index 88%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_has_causalmask_has_attnbias_has_biasgrad_has_dropout_maxk_128.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_has_causalmask_has_bias_has_biasgrad_has_dropout_maxk_128.cpp
index 529a8931c..f6d0af717 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_has_causalmask_has_attnbias_has_biasgrad_has_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_has_causalmask_has_bias_has_biasgrad_has_dropout_maxk_128.cpp
@@ -1,11 +1,14 @@
+
 /*
  * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
  */
-#include <ck/ck.hpp>
 
+#include <ck/ck.hpp>
 #include "ck_tiled_fmha_grouped_backward.h"
 
 template void run_grouped_backward_causalmask_bias_dropout_dispatch<
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_has_causalmask_has_attnbias_has_biasgrad_has_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_has_causalmask_has_bias_has_biasgrad_has_dropout_maxk_32.cpp
similarity index 88%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_has_causalmask_has_attnbias_has_biasgrad_has_dropout_maxk_32.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_has_causalmask_has_bias_has_biasgrad_has_dropout_maxk_32.cpp
index eca64f382..a73f1e9e9 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_has_causalmask_has_attnbias_has_biasgrad_has_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_has_causalmask_has_bias_has_biasgrad_has_dropout_maxk_32.cpp
@@ -1,11 +1,14 @@
+
 /*
  * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
  */
-#include <ck/ck.hpp>
 
+#include <ck/ck.hpp>
 #include "ck_tiled_fmha_grouped_backward.h"
 
 template void run_grouped_backward_causalmask_bias_dropout_dispatch<
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_has_causalmask_has_attnbias_has_biasgrad_has_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_has_causalmask_has_bias_has_biasgrad_has_dropout_maxk_64.cpp
similarity index 88%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_has_causalmask_has_attnbias_has_biasgrad_has_dropout_maxk_64.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_has_causalmask_has_bias_has_biasgrad_has_dropout_maxk_64.cpp
index 03de22668..2e186f3ba 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_has_causalmask_has_attnbias_has_biasgrad_has_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_has_causalmask_has_bias_has_biasgrad_has_dropout_maxk_64.cpp
@@ -1,11 +1,14 @@
+
 /*
  * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
  */
-#include <ck/ck.hpp>
 
+#include <ck/ck.hpp>
 #include "ck_tiled_fmha_grouped_backward.h"
 
 template void run_grouped_backward_causalmask_bias_dropout_dispatch<
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_no_causalmask_has_attnbias_has_biasgrad_has_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_has_causalmask_has_bias_has_biasgrad_no_dropout_maxk_128.cpp
similarity index 88%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_no_causalmask_has_attnbias_has_biasgrad_has_dropout_maxk_128.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_has_causalmask_has_bias_has_biasgrad_no_dropout_maxk_128.cpp
index 9cf7db73f..5ebed8c73 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_no_causalmask_has_attnbias_has_biasgrad_has_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_has_causalmask_has_bias_has_biasgrad_no_dropout_maxk_128.cpp
@@ -1,17 +1,20 @@
+
 /*
  * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
  */
-#include <ck/ck.hpp>
 
+#include <ck/ck.hpp>
 #include "ck_tiled_fmha_grouped_backward.h"
 
 template void run_grouped_backward_causalmask_bias_dropout_dispatch<
     ck::bhalf_t,
-    false,
     true,
     true,
     true,
+    false,
     128>(GroupedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_has_causalmask_has_attnbias_has_biasgrad_no_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_has_causalmask_has_bias_has_biasgrad_no_dropout_maxk_32.cpp
similarity index 88%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_has_causalmask_has_attnbias_has_biasgrad_no_dropout_maxk_32.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_has_causalmask_has_bias_has_biasgrad_no_dropout_maxk_32.cpp
index eac3e148d..9e278d05d 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_has_causalmask_has_attnbias_has_biasgrad_no_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_has_causalmask_has_bias_has_biasgrad_no_dropout_maxk_32.cpp
@@ -1,11 +1,14 @@
+
 /*
  * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
  */
-#include <ck/ck.hpp>
 
+#include <ck/ck.hpp>
 #include "ck_tiled_fmha_grouped_backward.h"
 
 template void run_grouped_backward_causalmask_bias_dropout_dispatch<
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_has_causalmask_has_attnbias_no_biasgrad_has_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_has_causalmask_has_bias_has_biasgrad_no_dropout_maxk_64.cpp
similarity index 88%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_has_causalmask_has_attnbias_no_biasgrad_has_dropout_maxk_64.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_has_causalmask_has_bias_has_biasgrad_no_dropout_maxk_64.cpp
index 1d0d05754..452f5ac0c 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_has_causalmask_has_attnbias_no_biasgrad_has_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_has_causalmask_has_bias_has_biasgrad_no_dropout_maxk_64.cpp
@@ -1,17 +1,20 @@
+
 /*
  * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
  */
-#include <ck/ck.hpp>
 
+#include <ck/ck.hpp>
 #include "ck_tiled_fmha_grouped_backward.h"
 
 template void run_grouped_backward_causalmask_bias_dropout_dispatch<
     ck::bhalf_t,
     true,
     true,
-    false,
     true,
+    false,
     64>(GroupedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_has_causalmask_has_attnbias_no_biasgrad_has_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_has_causalmask_has_bias_no_biasgrad_has_dropout_maxk_128.cpp
similarity index 88%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_has_causalmask_has_attnbias_no_biasgrad_has_dropout_maxk_128.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_has_causalmask_has_bias_no_biasgrad_has_dropout_maxk_128.cpp
index 3b24da32a..120ced112 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_has_causalmask_has_attnbias_no_biasgrad_has_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_has_causalmask_has_bias_no_biasgrad_has_dropout_maxk_128.cpp
@@ -1,11 +1,14 @@
+
 /*
  * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
  */
-#include <ck/ck.hpp>
 
+#include <ck/ck.hpp>
 #include "ck_tiled_fmha_grouped_backward.h"
 
 template void run_grouped_backward_causalmask_bias_dropout_dispatch<
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_no_causalmask_has_attnbias_has_biasgrad_has_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_has_causalmask_has_bias_no_biasgrad_has_dropout_maxk_32.cpp
similarity index 88%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_no_causalmask_has_attnbias_has_biasgrad_has_dropout_maxk_32.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_has_causalmask_has_bias_no_biasgrad_has_dropout_maxk_32.cpp
index a8d69e619..cbdac868f 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_no_causalmask_has_attnbias_has_biasgrad_has_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_has_causalmask_has_bias_no_biasgrad_has_dropout_maxk_32.cpp
@@ -1,17 +1,20 @@
+
 /*
  * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
  */
-#include <ck/ck.hpp>
 
+#include <ck/ck.hpp>
 #include "ck_tiled_fmha_grouped_backward.h"
 
 template void run_grouped_backward_causalmask_bias_dropout_dispatch<
     ck::bhalf_t,
-    false,
     true,
     true,
+    false,
     true,
     32>(GroupedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_has_causalmask_has_attnbias_has_biasgrad_no_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_has_causalmask_has_bias_no_biasgrad_has_dropout_maxk_64.cpp
similarity index 88%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_has_causalmask_has_attnbias_has_biasgrad_no_dropout_maxk_64.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_has_causalmask_has_bias_no_biasgrad_has_dropout_maxk_64.cpp
index bd0ce8e79..95cd67300 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_has_causalmask_has_attnbias_has_biasgrad_no_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_has_causalmask_has_bias_no_biasgrad_has_dropout_maxk_64.cpp
@@ -1,17 +1,20 @@
+
 /*
  * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
  */
-#include <ck/ck.hpp>
 
+#include <ck/ck.hpp>
 #include "ck_tiled_fmha_grouped_backward.h"
 
 template void run_grouped_backward_causalmask_bias_dropout_dispatch<
     ck::bhalf_t,
     true,
     true,
-    true,
     false,
+    true,
     64>(GroupedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_has_causalmask_no_attnbias_has_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_has_causalmask_has_bias_no_biasgrad_no_dropout_maxk_128.cpp
similarity index 89%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_has_causalmask_no_attnbias_has_dropout_maxk_128.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_has_causalmask_has_bias_no_biasgrad_no_dropout_maxk_128.cpp
index 4f39ed253..8da955f15 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_has_causalmask_no_attnbias_has_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_has_causalmask_has_bias_no_biasgrad_no_dropout_maxk_128.cpp
@@ -1,17 +1,20 @@
+
 /*
  * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
  */
-#include <ck/ck.hpp>
 
+#include <ck/ck.hpp>
 #include "ck_tiled_fmha_grouped_backward.h"
 
 template void run_grouped_backward_causalmask_bias_dropout_dispatch<
     ck::bhalf_t,
     true,
+    true,
     false,
     false,
-    true,
     128>(GroupedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_no_causalmask_has_attnbias_no_biasgrad_has_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_has_causalmask_has_bias_no_biasgrad_no_dropout_maxk_32.cpp
similarity index 88%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_no_causalmask_has_attnbias_no_biasgrad_has_dropout_maxk_32.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_has_causalmask_has_bias_no_biasgrad_no_dropout_maxk_32.cpp
index 305697e7b..c77696023 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_no_causalmask_has_attnbias_no_biasgrad_has_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_has_causalmask_has_bias_no_biasgrad_no_dropout_maxk_32.cpp
@@ -1,17 +1,20 @@
+
 /*
  * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
  */
-#include <ck/ck.hpp>
 
+#include <ck/ck.hpp>
 #include "ck_tiled_fmha_grouped_backward.h"
 
 template void run_grouped_backward_causalmask_bias_dropout_dispatch<
     ck::bhalf_t,
-    false,
     true,
-    false,
     true,
+    false,
+    false,
     32>(GroupedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_no_causalmask_has_attnbias_no_biasgrad_has_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_has_causalmask_has_bias_no_biasgrad_no_dropout_maxk_64.cpp
similarity index 88%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_no_causalmask_has_attnbias_no_biasgrad_has_dropout_maxk_64.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_has_causalmask_has_bias_no_biasgrad_no_dropout_maxk_64.cpp
index ad7cdd703..4527adc28 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_no_causalmask_has_attnbias_no_biasgrad_has_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_has_causalmask_has_bias_no_biasgrad_no_dropout_maxk_64.cpp
@@ -1,17 +1,20 @@
+
 /*
  * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
  */
-#include <ck/ck.hpp>
 
+#include <ck/ck.hpp>
 #include "ck_tiled_fmha_grouped_backward.h"
 
 template void run_grouped_backward_causalmask_bias_dropout_dispatch<
     ck::bhalf_t,
-    false,
     true,
-    false,
     true,
+    false,
+    false,
     64>(GroupedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_no_causalmask_has_attnbias_no_biasgrad_has_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_has_causalmask_no_bias_no_biasgrad_has_dropout_maxk_128.cpp
similarity index 89%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_no_causalmask_has_attnbias_no_biasgrad_has_dropout_maxk_128.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_has_causalmask_no_bias_no_biasgrad_has_dropout_maxk_128.cpp
index 148314356..3e125e542 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_no_causalmask_has_attnbias_no_biasgrad_has_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_has_causalmask_no_bias_no_biasgrad_has_dropout_maxk_128.cpp
@@ -1,17 +1,20 @@
+
 /*
  * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
  */
-#include <ck/ck.hpp>
 
+#include <ck/ck.hpp>
 #include "ck_tiled_fmha_grouped_backward.h"
 
 template void run_grouped_backward_causalmask_bias_dropout_dispatch<
     ck::bhalf_t,
-    false,
     true,
     false,
+    false,
     true,
     128>(GroupedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_has_causalmask_no_attnbias_has_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_has_causalmask_no_bias_no_biasgrad_has_dropout_maxk_32.cpp
similarity index 88%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_has_causalmask_no_attnbias_has_dropout_maxk_32.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_has_causalmask_no_bias_no_biasgrad_has_dropout_maxk_32.cpp
index 0f586cdc4..4323e2902 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_has_causalmask_no_attnbias_has_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_has_causalmask_no_bias_no_biasgrad_has_dropout_maxk_32.cpp
@@ -1,11 +1,14 @@
+
 /*
  * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
  */
-#include <ck/ck.hpp>
 
+#include <ck/ck.hpp>
 #include "ck_tiled_fmha_grouped_backward.h"
 
 template void run_grouped_backward_causalmask_bias_dropout_dispatch<
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_no_causalmask_has_attnbias_has_biasgrad_no_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_has_causalmask_no_bias_no_biasgrad_has_dropout_maxk_64.cpp
similarity index 88%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_no_causalmask_has_attnbias_has_biasgrad_no_dropout_maxk_64.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_has_causalmask_no_bias_no_biasgrad_has_dropout_maxk_64.cpp
index 4a3d28b51..eb4713c43 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_no_causalmask_has_attnbias_has_biasgrad_no_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_has_causalmask_no_bias_no_biasgrad_has_dropout_maxk_64.cpp
@@ -1,17 +1,20 @@
+
 /*
  * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
  */
-#include <ck/ck.hpp>
 
+#include <ck/ck.hpp>
 #include "ck_tiled_fmha_grouped_backward.h"
 
 template void run_grouped_backward_causalmask_bias_dropout_dispatch<
     ck::bhalf_t,
-    false,
-    true,
     true,
     false,
+    false,
+    true,
     64>(GroupedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_no_causalmask_has_attnbias_no_biasgrad_no_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_has_causalmask_no_bias_no_biasgrad_no_dropout_maxk_128.cpp
similarity index 89%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_no_causalmask_has_attnbias_no_biasgrad_no_dropout_maxk_128.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_has_causalmask_no_bias_no_biasgrad_no_dropout_maxk_128.cpp
index fff043eb0..35041c002 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_no_causalmask_has_attnbias_no_biasgrad_no_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_has_causalmask_no_bias_no_biasgrad_no_dropout_maxk_128.cpp
@@ -1,17 +1,20 @@
+
 /*
  * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
  */
-#include <ck/ck.hpp>
 
+#include <ck/ck.hpp>
 #include "ck_tiled_fmha_grouped_backward.h"
 
 template void run_grouped_backward_causalmask_bias_dropout_dispatch<
     ck::bhalf_t,
-    false,
     true,
     false,
     false,
+    false,
     128>(GroupedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_has_causalmask_no_attnbias_no_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_has_causalmask_no_bias_no_biasgrad_no_dropout_maxk_32.cpp
similarity index 89%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_has_causalmask_no_attnbias_no_dropout_maxk_32.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_has_causalmask_no_bias_no_biasgrad_no_dropout_maxk_32.cpp
index 8721df90e..a4fe43dd5 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_has_causalmask_no_attnbias_no_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_has_causalmask_no_bias_no_biasgrad_no_dropout_maxk_32.cpp
@@ -1,11 +1,14 @@
+
 /*
  * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
  */
-#include <ck/ck.hpp>
 
+#include <ck/ck.hpp>
 #include "ck_tiled_fmha_grouped_backward.h"
 
 template void run_grouped_backward_causalmask_bias_dropout_dispatch<
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_has_causalmask_no_attnbias_no_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_has_causalmask_no_bias_no_biasgrad_no_dropout_maxk_64.cpp
similarity index 89%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_has_causalmask_no_attnbias_no_dropout_maxk_64.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_has_causalmask_no_bias_no_biasgrad_no_dropout_maxk_64.cpp
index 08646ecca..d875a8cb9 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_has_causalmask_no_attnbias_no_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_has_causalmask_no_bias_no_biasgrad_no_dropout_maxk_64.cpp
@@ -1,11 +1,14 @@
+
 /*
  * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
  */
-#include <ck/ck.hpp>
 
+#include <ck/ck.hpp>
 #include "ck_tiled_fmha_grouped_backward.h"
 
 template void run_grouped_backward_causalmask_bias_dropout_dispatch<
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_has_causalmask_has_attnbias_has_biasgrad_no_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_no_causalmask_has_bias_has_biasgrad_has_dropout_maxk_128.cpp
similarity index 88%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_has_causalmask_has_attnbias_has_biasgrad_no_dropout_maxk_128.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_no_causalmask_has_bias_has_biasgrad_has_dropout_maxk_128.cpp
index be2d54836..307acb781 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_has_causalmask_has_attnbias_has_biasgrad_no_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_no_causalmask_has_bias_has_biasgrad_has_dropout_maxk_128.cpp
@@ -1,17 +1,20 @@
+
 /*
  * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
  */
-#include <ck/ck.hpp>
 
+#include <ck/ck.hpp>
 #include "ck_tiled_fmha_grouped_backward.h"
 
 template void run_grouped_backward_causalmask_bias_dropout_dispatch<
     ck::bhalf_t,
+    false,
     true,
     true,
     true,
-    false,
     128>(GroupedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_has_causalmask_has_attnbias_no_biasgrad_has_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_no_causalmask_has_bias_has_biasgrad_has_dropout_maxk_32.cpp
similarity index 88%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_has_causalmask_has_attnbias_no_biasgrad_has_dropout_maxk_32.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_no_causalmask_has_bias_has_biasgrad_has_dropout_maxk_32.cpp
index ec9f2db83..875c36554 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_has_causalmask_has_attnbias_no_biasgrad_has_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_no_causalmask_has_bias_has_biasgrad_has_dropout_maxk_32.cpp
@@ -1,17 +1,20 @@
+
 /*
  * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
  */
-#include <ck/ck.hpp>
 
+#include <ck/ck.hpp>
 #include "ck_tiled_fmha_grouped_backward.h"
 
 template void run_grouped_backward_causalmask_bias_dropout_dispatch<
     ck::bhalf_t,
+    false,
     true,
     true,
-    false,
     true,
     32>(GroupedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_no_causalmask_has_attnbias_has_biasgrad_has_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_no_causalmask_has_bias_has_biasgrad_has_dropout_maxk_64.cpp
similarity index 88%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_no_causalmask_has_attnbias_has_biasgrad_has_dropout_maxk_64.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_no_causalmask_has_bias_has_biasgrad_has_dropout_maxk_64.cpp
index 4391d4d7d..d5e242fec 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_no_causalmask_has_attnbias_has_biasgrad_has_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_no_causalmask_has_bias_has_biasgrad_has_dropout_maxk_64.cpp
@@ -1,11 +1,14 @@
+
 /*
  * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
  */
-#include <ck/ck.hpp>
 
+#include <ck/ck.hpp>
 #include "ck_tiled_fmha_grouped_backward.h"
 
 template void run_grouped_backward_causalmask_bias_dropout_dispatch<
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_has_causalmask_has_attnbias_no_biasgrad_no_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_no_causalmask_has_bias_has_biasgrad_no_dropout_maxk_128.cpp
similarity index 89%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_has_causalmask_has_attnbias_no_biasgrad_no_dropout_maxk_128.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_no_causalmask_has_bias_has_biasgrad_no_dropout_maxk_128.cpp
index 7028cb7dc..fc0636bb7 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_has_causalmask_has_attnbias_no_biasgrad_no_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_no_causalmask_has_bias_has_biasgrad_no_dropout_maxk_128.cpp
@@ -1,17 +1,20 @@
+
 /*
  * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
  */
-#include <ck/ck.hpp>
 
+#include <ck/ck.hpp>
 #include "ck_tiled_fmha_grouped_backward.h"
 
 template void run_grouped_backward_causalmask_bias_dropout_dispatch<
     ck::bhalf_t,
+    false,
     true,
     true,
     false,
-    false,
     128>(GroupedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_no_causalmask_has_attnbias_has_biasgrad_no_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_no_causalmask_has_bias_has_biasgrad_no_dropout_maxk_32.cpp
similarity index 88%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_no_causalmask_has_attnbias_has_biasgrad_no_dropout_maxk_32.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_no_causalmask_has_bias_has_biasgrad_no_dropout_maxk_32.cpp
index a67bb299d..adaee823c 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_no_causalmask_has_attnbias_has_biasgrad_no_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_no_causalmask_has_bias_has_biasgrad_no_dropout_maxk_32.cpp
@@ -1,11 +1,14 @@
+
 /*
  * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
  */
-#include <ck/ck.hpp>
 
+#include <ck/ck.hpp>
 #include "ck_tiled_fmha_grouped_backward.h"
 
 template void run_grouped_backward_causalmask_bias_dropout_dispatch<
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_has_causalmask_no_attnbias_has_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_no_causalmask_has_bias_has_biasgrad_no_dropout_maxk_64.cpp
similarity index 88%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_has_causalmask_no_attnbias_has_dropout_maxk_64.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_no_causalmask_has_bias_has_biasgrad_no_dropout_maxk_64.cpp
index 88ac4b243..1228d91c3 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_has_causalmask_no_attnbias_has_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_no_causalmask_has_bias_has_biasgrad_no_dropout_maxk_64.cpp
@@ -1,17 +1,20 @@
+
 /*
  * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
  */
-#include <ck/ck.hpp>
 
+#include <ck/ck.hpp>
 #include "ck_tiled_fmha_grouped_backward.h"
 
 template void run_grouped_backward_causalmask_bias_dropout_dispatch<
     ck::bhalf_t,
-    true,
-    false,
     false,
     true,
+    true,
+    false,
     64>(GroupedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_no_causalmask_has_attnbias_has_biasgrad_no_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_no_causalmask_has_bias_no_biasgrad_has_dropout_maxk_128.cpp
similarity index 89%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_no_causalmask_has_attnbias_has_biasgrad_no_dropout_maxk_128.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_no_causalmask_has_bias_no_biasgrad_has_dropout_maxk_128.cpp
index 5343b0c3a..42be3cb81 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_no_causalmask_has_attnbias_has_biasgrad_no_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_no_causalmask_has_bias_no_biasgrad_has_dropout_maxk_128.cpp
@@ -1,17 +1,20 @@
+
 /*
  * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
  */
-#include <ck/ck.hpp>
 
+#include <ck/ck.hpp>
 #include "ck_tiled_fmha_grouped_backward.h"
 
 template void run_grouped_backward_causalmask_bias_dropout_dispatch<
     ck::bhalf_t,
     false,
     true,
-    true,
     false,
+    true,
     128>(GroupedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_has_causalmask_has_attnbias_no_biasgrad_no_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_no_causalmask_has_bias_no_biasgrad_has_dropout_maxk_32.cpp
similarity index 88%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_has_causalmask_has_attnbias_no_biasgrad_no_dropout_maxk_32.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_no_causalmask_has_bias_no_biasgrad_has_dropout_maxk_32.cpp
index 0a15c5dad..7cf70379f 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_has_causalmask_has_attnbias_no_biasgrad_no_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_no_causalmask_has_bias_no_biasgrad_has_dropout_maxk_32.cpp
@@ -1,17 +1,20 @@
+
 /*
  * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
  */
-#include <ck/ck.hpp>
 
+#include <ck/ck.hpp>
 #include "ck_tiled_fmha_grouped_backward.h"
 
 template void run_grouped_backward_causalmask_bias_dropout_dispatch<
     ck::bhalf_t,
-    true,
-    true,
     false,
+    true,
     false,
+    true,
     32>(GroupedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_has_causalmask_has_attnbias_no_biasgrad_no_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_no_causalmask_has_bias_no_biasgrad_has_dropout_maxk_64.cpp
similarity index 88%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_has_causalmask_has_attnbias_no_biasgrad_no_dropout_maxk_64.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_no_causalmask_has_bias_no_biasgrad_has_dropout_maxk_64.cpp
index 01d422c00..d47bb845b 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_has_causalmask_has_attnbias_no_biasgrad_no_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_no_causalmask_has_bias_no_biasgrad_has_dropout_maxk_64.cpp
@@ -1,17 +1,20 @@
+
 /*
  * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
  */
-#include <ck/ck.hpp>
 
+#include <ck/ck.hpp>
 #include "ck_tiled_fmha_grouped_backward.h"
 
 template void run_grouped_backward_causalmask_bias_dropout_dispatch<
     ck::bhalf_t,
-    true,
-    true,
     false,
+    true,
     false,
+    true,
     64>(GroupedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_has_causalmask_no_attnbias_no_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_no_causalmask_has_bias_no_biasgrad_no_dropout_maxk_128.cpp
similarity index 89%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_has_causalmask_no_attnbias_no_dropout_maxk_128.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_no_causalmask_has_bias_no_biasgrad_no_dropout_maxk_128.cpp
index d1d05d05a..87da66276 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_has_causalmask_no_attnbias_no_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_no_causalmask_has_bias_no_biasgrad_no_dropout_maxk_128.cpp
@@ -1,17 +1,20 @@
+
 /*
  * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
  */
-#include <ck/ck.hpp>
 
+#include <ck/ck.hpp>
 #include "ck_tiled_fmha_grouped_backward.h"
 
 template void run_grouped_backward_causalmask_bias_dropout_dispatch<
     ck::bhalf_t,
-    true,
     false,
+    true,
     false,
     false,
     128>(GroupedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_no_causalmask_has_attnbias_no_biasgrad_no_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_no_causalmask_has_bias_no_biasgrad_no_dropout_maxk_32.cpp
similarity index 89%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_no_causalmask_has_attnbias_no_biasgrad_no_dropout_maxk_32.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_no_causalmask_has_bias_no_biasgrad_no_dropout_maxk_32.cpp
index b15836d17..1a67c23b7 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_no_causalmask_has_attnbias_no_biasgrad_no_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_no_causalmask_has_bias_no_biasgrad_no_dropout_maxk_32.cpp
@@ -1,11 +1,14 @@
+
 /*
  * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
  */
-#include <ck/ck.hpp>
 
+#include <ck/ck.hpp>
 #include "ck_tiled_fmha_grouped_backward.h"
 
 template void run_grouped_backward_causalmask_bias_dropout_dispatch<
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_no_causalmask_has_attnbias_no_biasgrad_no_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_no_causalmask_has_bias_no_biasgrad_no_dropout_maxk_64.cpp
similarity index 89%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_no_causalmask_has_attnbias_no_biasgrad_no_dropout_maxk_64.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_no_causalmask_has_bias_no_biasgrad_no_dropout_maxk_64.cpp
index e671f3ca2..bd7697091 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_no_causalmask_has_attnbias_no_biasgrad_no_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_no_causalmask_has_bias_no_biasgrad_no_dropout_maxk_64.cpp
@@ -1,11 +1,14 @@
+
 /*
  * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
  */
-#include <ck/ck.hpp>
 
+#include <ck/ck.hpp>
 #include "ck_tiled_fmha_grouped_backward.h"
 
 template void run_grouped_backward_causalmask_bias_dropout_dispatch<
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_no_causalmask_no_attnbias_has_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_no_causalmask_no_bias_no_biasgrad_has_dropout_maxk_128.cpp
similarity index 89%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_no_causalmask_no_attnbias_has_dropout_maxk_128.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_no_causalmask_no_bias_no_biasgrad_has_dropout_maxk_128.cpp
index e9f870c4c..115f80da5 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_no_causalmask_no_attnbias_has_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_no_causalmask_no_bias_no_biasgrad_has_dropout_maxk_128.cpp
@@ -1,11 +1,14 @@
+
 /*
  * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
  */
-#include <ck/ck.hpp>
 
+#include <ck/ck.hpp>
 #include "ck_tiled_fmha_grouped_backward.h"
 
 template void run_grouped_backward_causalmask_bias_dropout_dispatch<
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_no_causalmask_no_attnbias_has_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_no_causalmask_no_bias_no_biasgrad_has_dropout_maxk_32.cpp
similarity index 89%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_no_causalmask_no_attnbias_has_dropout_maxk_32.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_no_causalmask_no_bias_no_biasgrad_has_dropout_maxk_32.cpp
index 66fc7c9b3..31ee39fb2 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_no_causalmask_no_attnbias_has_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_no_causalmask_no_bias_no_biasgrad_has_dropout_maxk_32.cpp
@@ -1,11 +1,14 @@
+
 /*
  * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
  */
-#include <ck/ck.hpp>
 
+#include <ck/ck.hpp>
 #include "ck_tiled_fmha_grouped_backward.h"
 
 template void run_grouped_backward_causalmask_bias_dropout_dispatch<
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_no_causalmask_no_attnbias_has_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_no_causalmask_no_bias_no_biasgrad_has_dropout_maxk_64.cpp
similarity index 89%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_no_causalmask_no_attnbias_has_dropout_maxk_64.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_no_causalmask_no_bias_no_biasgrad_has_dropout_maxk_64.cpp
index 5001ac06e..258db9fce 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_no_causalmask_no_attnbias_has_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_no_causalmask_no_bias_no_biasgrad_has_dropout_maxk_64.cpp
@@ -1,11 +1,14 @@
+
 /*
  * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
  */
-#include <ck/ck.hpp>
 
+#include <ck/ck.hpp>
 #include "ck_tiled_fmha_grouped_backward.h"
 
 template void run_grouped_backward_causalmask_bias_dropout_dispatch<
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_no_causalmask_no_attnbias_no_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_no_causalmask_no_bias_no_biasgrad_no_dropout_maxk_128.cpp
similarity index 89%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_no_causalmask_no_attnbias_no_dropout_maxk_128.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_no_causalmask_no_bias_no_biasgrad_no_dropout_maxk_128.cpp
index 98836e82a..b848cecf7 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_no_causalmask_no_attnbias_no_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_no_causalmask_no_bias_no_biasgrad_no_dropout_maxk_128.cpp
@@ -1,11 +1,14 @@
+
 /*
  * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
  */
-#include <ck/ck.hpp>
 
+#include <ck/ck.hpp>
 #include "ck_tiled_fmha_grouped_backward.h"
 
 template void run_grouped_backward_causalmask_bias_dropout_dispatch<
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_no_causalmask_no_attnbias_no_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_no_causalmask_no_bias_no_biasgrad_no_dropout_maxk_32.cpp
similarity index 89%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_no_causalmask_no_attnbias_no_dropout_maxk_32.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_no_causalmask_no_bias_no_biasgrad_no_dropout_maxk_32.cpp
index 696e14ca3..89da82e0f 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_no_causalmask_no_attnbias_no_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_no_causalmask_no_bias_no_biasgrad_no_dropout_maxk_32.cpp
@@ -1,11 +1,14 @@
+
 /*
  * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
  */
-#include <ck/ck.hpp>
 
+#include <ck/ck.hpp>
 #include "ck_tiled_fmha_grouped_backward.h"
 
 template void run_grouped_backward_causalmask_bias_dropout_dispatch<
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_no_causalmask_no_attnbias_no_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_no_causalmask_no_bias_no_biasgrad_no_dropout_maxk_64.cpp
similarity index 89%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_no_causalmask_no_attnbias_no_dropout_maxk_64.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_no_causalmask_no_bias_no_biasgrad_no_dropout_maxk_64.cpp
index 1e1226c57..41d42b992 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_no_causalmask_no_attnbias_no_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_no_causalmask_no_bias_no_biasgrad_no_dropout_maxk_64.cpp
@@ -1,11 +1,14 @@
+
 /*
  * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
  */
-#include <ck/ck.hpp>
 
+#include <ck/ck.hpp>
 #include "ck_tiled_fmha_grouped_backward.h"
 
 template void run_grouped_backward_causalmask_bias_dropout_dispatch<
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_has_attnbias_has_biasgrad_has_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_has_bias_has_biasgrad_has_dropout_maxk_128.cpp
similarity index 88%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_has_attnbias_has_biasgrad_has_dropout_maxk_128.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_has_bias_has_biasgrad_has_dropout_maxk_128.cpp
index 9b7520411..cde7b8f08 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_has_attnbias_has_biasgrad_has_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_has_bias_has_biasgrad_has_dropout_maxk_128.cpp
@@ -1,11 +1,14 @@
+
 /*
  * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
  */
-#include <ck/ck.hpp>
 
+#include <ck/ck.hpp>
 #include "ck_tiled_fmha_grouped_backward.h"
 
 template void run_grouped_backward_causalmask_bias_dropout_dispatch<
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_has_attnbias_has_biasgrad_has_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_has_bias_has_biasgrad_has_dropout_maxk_32.cpp
similarity index 88%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_has_attnbias_has_biasgrad_has_dropout_maxk_32.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_has_bias_has_biasgrad_has_dropout_maxk_32.cpp
index 40c3e2566..c2298cb86 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_has_attnbias_has_biasgrad_has_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_has_bias_has_biasgrad_has_dropout_maxk_32.cpp
@@ -1,11 +1,14 @@
+
 /*
  * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
  */
-#include <ck/ck.hpp>
 
+#include <ck/ck.hpp>
 #include "ck_tiled_fmha_grouped_backward.h"
 
 template void run_grouped_backward_causalmask_bias_dropout_dispatch<
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_has_attnbias_has_biasgrad_has_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_has_bias_has_biasgrad_has_dropout_maxk_64.cpp
similarity index 88%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_has_attnbias_has_biasgrad_has_dropout_maxk_64.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_has_bias_has_biasgrad_has_dropout_maxk_64.cpp
index 4c1939000..8342afa37 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_has_attnbias_has_biasgrad_has_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_has_bias_has_biasgrad_has_dropout_maxk_64.cpp
@@ -1,11 +1,14 @@
+
 /*
  * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
  */
-#include <ck/ck.hpp>
 
+#include <ck/ck.hpp>
 #include "ck_tiled_fmha_grouped_backward.h"
 
 template void run_grouped_backward_causalmask_bias_dropout_dispatch<
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_has_attnbias_has_biasgrad_no_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_has_bias_has_biasgrad_no_dropout_maxk_128.cpp
similarity index 88%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_has_attnbias_has_biasgrad_no_dropout_maxk_128.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_has_bias_has_biasgrad_no_dropout_maxk_128.cpp
index c259e3b89..834b1d625 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_has_attnbias_has_biasgrad_no_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_has_bias_has_biasgrad_no_dropout_maxk_128.cpp
@@ -1,11 +1,14 @@
+
 /*
  * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
  */
-#include <ck/ck.hpp>
 
+#include <ck/ck.hpp>
 #include "ck_tiled_fmha_grouped_backward.h"
 
 template void run_grouped_backward_causalmask_bias_dropout_dispatch<
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_has_attnbias_has_biasgrad_no_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_has_bias_has_biasgrad_no_dropout_maxk_32.cpp
similarity index 88%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_has_attnbias_has_biasgrad_no_dropout_maxk_32.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_has_bias_has_biasgrad_no_dropout_maxk_32.cpp
index 8e6d377fc..0656ea175 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_has_attnbias_has_biasgrad_no_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_has_bias_has_biasgrad_no_dropout_maxk_32.cpp
@@ -1,11 +1,14 @@
+
 /*
  * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
  */
-#include <ck/ck.hpp>
 
+#include <ck/ck.hpp>
 #include "ck_tiled_fmha_grouped_backward.h"
 
 template void run_grouped_backward_causalmask_bias_dropout_dispatch<
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_has_attnbias_has_biasgrad_no_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_has_bias_has_biasgrad_no_dropout_maxk_64.cpp
similarity index 88%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_has_attnbias_has_biasgrad_no_dropout_maxk_64.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_has_bias_has_biasgrad_no_dropout_maxk_64.cpp
index c5ec3f4fb..6bb731da4 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_has_attnbias_has_biasgrad_no_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_has_bias_has_biasgrad_no_dropout_maxk_64.cpp
@@ -1,11 +1,14 @@
+
 /*
  * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
  */
-#include <ck/ck.hpp>
 
+#include <ck/ck.hpp>
 #include "ck_tiled_fmha_grouped_backward.h"
 
 template void run_grouped_backward_causalmask_bias_dropout_dispatch<
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_has_attnbias_no_biasgrad_has_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_has_bias_no_biasgrad_has_dropout_maxk_128.cpp
similarity index 88%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_has_attnbias_no_biasgrad_has_dropout_maxk_128.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_has_bias_no_biasgrad_has_dropout_maxk_128.cpp
index bfc021bc9..fb458f74c 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_has_attnbias_no_biasgrad_has_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_has_bias_no_biasgrad_has_dropout_maxk_128.cpp
@@ -1,11 +1,14 @@
+
 /*
  * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
  */
-#include <ck/ck.hpp>
 
+#include <ck/ck.hpp>
 #include "ck_tiled_fmha_grouped_backward.h"
 
 template void run_grouped_backward_causalmask_bias_dropout_dispatch<
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_has_attnbias_no_biasgrad_has_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_has_bias_no_biasgrad_has_dropout_maxk_32.cpp
similarity index 88%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_has_attnbias_no_biasgrad_has_dropout_maxk_32.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_has_bias_no_biasgrad_has_dropout_maxk_32.cpp
index 76d4ae719..9536035d6 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_has_attnbias_no_biasgrad_has_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_has_bias_no_biasgrad_has_dropout_maxk_32.cpp
@@ -1,11 +1,14 @@
+
 /*
  * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
  */
-#include <ck/ck.hpp>
 
+#include <ck/ck.hpp>
 #include "ck_tiled_fmha_grouped_backward.h"
 
 template void run_grouped_backward_causalmask_bias_dropout_dispatch<
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_has_attnbias_no_biasgrad_has_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_has_bias_no_biasgrad_has_dropout_maxk_64.cpp
similarity index 88%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_has_attnbias_no_biasgrad_has_dropout_maxk_64.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_has_bias_no_biasgrad_has_dropout_maxk_64.cpp
index a3b402dfa..666ae6242 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_has_attnbias_no_biasgrad_has_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_has_bias_no_biasgrad_has_dropout_maxk_64.cpp
@@ -1,11 +1,14 @@
+
 /*
  * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
  */
-#include <ck/ck.hpp>
 
+#include <ck/ck.hpp>
 #include "ck_tiled_fmha_grouped_backward.h"
 
 template void run_grouped_backward_causalmask_bias_dropout_dispatch<
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_has_attnbias_no_biasgrad_has_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_has_bias_no_biasgrad_no_dropout_maxk_128.cpp
similarity index 88%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_has_attnbias_no_biasgrad_has_dropout_maxk_128.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_has_bias_no_biasgrad_no_dropout_maxk_128.cpp
index 84478d932..d24d3d0f9 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_has_attnbias_no_biasgrad_has_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_has_bias_no_biasgrad_no_dropout_maxk_128.cpp
@@ -1,17 +1,20 @@
+
 /*
  * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
  */
-#include <ck/ck.hpp>
 
+#include <ck/ck.hpp>
 #include "ck_tiled_fmha_grouped_backward.h"
 
 template void run_grouped_backward_causalmask_bias_dropout_dispatch<
     ck::half_t,
-    false,
     true,
-    false,
     true,
+    false,
+    false,
     128>(GroupedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_has_attnbias_no_biasgrad_has_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_has_bias_no_biasgrad_no_dropout_maxk_32.cpp
similarity index 88%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_has_attnbias_no_biasgrad_has_dropout_maxk_32.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_has_bias_no_biasgrad_no_dropout_maxk_32.cpp
index 574a1271b..82740f8dd 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_has_attnbias_no_biasgrad_has_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_has_bias_no_biasgrad_no_dropout_maxk_32.cpp
@@ -1,17 +1,20 @@
+
 /*
  * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
  */
-#include <ck/ck.hpp>
 
+#include <ck/ck.hpp>
 #include "ck_tiled_fmha_grouped_backward.h"
 
 template void run_grouped_backward_causalmask_bias_dropout_dispatch<
     ck::half_t,
-    false,
     true,
-    false,
     true,
+    false,
+    false,
     32>(GroupedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_has_attnbias_no_biasgrad_has_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_has_bias_no_biasgrad_no_dropout_maxk_64.cpp
similarity index 88%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_has_attnbias_no_biasgrad_has_dropout_maxk_64.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_has_bias_no_biasgrad_no_dropout_maxk_64.cpp
index 534684ec4..7cfa9ecab 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_has_attnbias_no_biasgrad_has_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_has_bias_no_biasgrad_no_dropout_maxk_64.cpp
@@ -1,17 +1,20 @@
+
 /*
  * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
  */
-#include <ck/ck.hpp>
 
+#include <ck/ck.hpp>
 #include "ck_tiled_fmha_grouped_backward.h"
 
 template void run_grouped_backward_causalmask_bias_dropout_dispatch<
     ck::half_t,
-    false,
     true,
-    false,
     true,
+    false,
+    false,
     64>(GroupedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_has_attnbias_has_biasgrad_no_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_no_bias_no_biasgrad_has_dropout_maxk_128.cpp
similarity index 88%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_has_attnbias_has_biasgrad_no_dropout_maxk_128.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_no_bias_no_biasgrad_has_dropout_maxk_128.cpp
index dca1cfdae..0f12efbed 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_has_attnbias_has_biasgrad_no_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_no_bias_no_biasgrad_has_dropout_maxk_128.cpp
@@ -1,17 +1,20 @@
+
 /*
  * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
  */
-#include <ck/ck.hpp>
 
+#include <ck/ck.hpp>
 #include "ck_tiled_fmha_grouped_backward.h"
 
 template void run_grouped_backward_causalmask_bias_dropout_dispatch<
     ck::half_t,
-    false,
-    true,
     true,
     false,
+    false,
+    true,
     128>(GroupedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_has_attnbias_has_biasgrad_no_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_no_bias_no_biasgrad_has_dropout_maxk_32.cpp
similarity index 88%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_has_attnbias_has_biasgrad_no_dropout_maxk_32.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_no_bias_no_biasgrad_has_dropout_maxk_32.cpp
index 0da0b4fd4..88d34ede5 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_has_attnbias_has_biasgrad_no_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_no_bias_no_biasgrad_has_dropout_maxk_32.cpp
@@ -1,17 +1,20 @@
+
 /*
  * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
  */
-#include <ck/ck.hpp>
 
+#include <ck/ck.hpp>
 #include "ck_tiled_fmha_grouped_backward.h"
 
 template void run_grouped_backward_causalmask_bias_dropout_dispatch<
     ck::half_t,
-    false,
-    true,
     true,
     false,
+    false,
+    true,
     32>(GroupedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_has_attnbias_has_biasgrad_no_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_no_bias_no_biasgrad_has_dropout_maxk_64.cpp
similarity index 88%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_has_attnbias_has_biasgrad_no_dropout_maxk_64.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_no_bias_no_biasgrad_has_dropout_maxk_64.cpp
index 5fb6beace..ed0c9af4d 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_has_attnbias_has_biasgrad_no_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_no_bias_no_biasgrad_has_dropout_maxk_64.cpp
@@ -1,17 +1,20 @@
+
 /*
  * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
  */
-#include <ck/ck.hpp>
 
+#include <ck/ck.hpp>
 #include "ck_tiled_fmha_grouped_backward.h"
 
 template void run_grouped_backward_causalmask_bias_dropout_dispatch<
     ck::half_t,
-    false,
-    true,
     true,
     false,
+    false,
+    true,
     64>(GroupedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_has_attnbias_no_biasgrad_no_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_no_bias_no_biasgrad_no_dropout_maxk_128.cpp
similarity index 89%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_has_attnbias_no_biasgrad_no_dropout_maxk_128.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_no_bias_no_biasgrad_no_dropout_maxk_128.cpp
index a70c75ccf..597c93939 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_has_attnbias_no_biasgrad_no_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_no_bias_no_biasgrad_no_dropout_maxk_128.cpp
@@ -1,17 +1,20 @@
+
 /*
  * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
  */
-#include <ck/ck.hpp>
 
+#include <ck/ck.hpp>
 #include "ck_tiled_fmha_grouped_backward.h"
 
 template void run_grouped_backward_causalmask_bias_dropout_dispatch<
     ck::half_t,
-    false,
     true,
     false,
     false,
+    false,
     128>(GroupedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_has_attnbias_no_biasgrad_no_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_no_bias_no_biasgrad_no_dropout_maxk_32.cpp
similarity index 88%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_has_attnbias_no_biasgrad_no_dropout_maxk_32.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_no_bias_no_biasgrad_no_dropout_maxk_32.cpp
index 62437cb36..0fe702a09 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_has_attnbias_no_biasgrad_no_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_no_bias_no_biasgrad_no_dropout_maxk_32.cpp
@@ -1,17 +1,20 @@
+
 /*
  * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
  */
-#include <ck/ck.hpp>
 
+#include <ck/ck.hpp>
 #include "ck_tiled_fmha_grouped_backward.h"
 
 template void run_grouped_backward_causalmask_bias_dropout_dispatch<
     ck::half_t,
-    false,
     true,
     false,
     false,
+    false,
     32>(GroupedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_has_attnbias_no_biasgrad_no_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_no_bias_no_biasgrad_no_dropout_maxk_64.cpp
similarity index 88%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_has_attnbias_no_biasgrad_no_dropout_maxk_64.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_no_bias_no_biasgrad_no_dropout_maxk_64.cpp
index d91b9c648..e5ab9b62c 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_has_attnbias_no_biasgrad_no_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_no_bias_no_biasgrad_no_dropout_maxk_64.cpp
@@ -1,17 +1,20 @@
+
 /*
  * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
  */
-#include <ck/ck.hpp>
 
+#include <ck/ck.hpp>
 #include "ck_tiled_fmha_grouped_backward.h"
 
 template void run_grouped_backward_causalmask_bias_dropout_dispatch<
     ck::half_t,
-    false,
     true,
     false,
     false,
+    false,
     64>(GroupedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_has_attnbias_has_biasgrad_has_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_has_bias_has_biasgrad_has_dropout_maxk_128.cpp
similarity index 88%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_has_attnbias_has_biasgrad_has_dropout_maxk_128.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_has_bias_has_biasgrad_has_dropout_maxk_128.cpp
index ee30cdf9f..582dd07ae 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_has_attnbias_has_biasgrad_has_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_has_bias_has_biasgrad_has_dropout_maxk_128.cpp
@@ -1,11 +1,14 @@
+
 /*
  * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
  */
-#include <ck/ck.hpp>
 
+#include <ck/ck.hpp>
 #include "ck_tiled_fmha_grouped_backward.h"
 
 template void run_grouped_backward_causalmask_bias_dropout_dispatch<
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_has_attnbias_has_biasgrad_has_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_has_bias_has_biasgrad_has_dropout_maxk_32.cpp
similarity index 88%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_has_attnbias_has_biasgrad_has_dropout_maxk_32.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_has_bias_has_biasgrad_has_dropout_maxk_32.cpp
index 68996ba94..4cf3d362e 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_has_attnbias_has_biasgrad_has_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_has_bias_has_biasgrad_has_dropout_maxk_32.cpp
@@ -1,11 +1,14 @@
+
 /*
  * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
  */
-#include <ck/ck.hpp>
 
+#include <ck/ck.hpp>
 #include "ck_tiled_fmha_grouped_backward.h"
 
 template void run_grouped_backward_causalmask_bias_dropout_dispatch<
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_has_attnbias_has_biasgrad_has_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_has_bias_has_biasgrad_has_dropout_maxk_64.cpp
similarity index 88%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_has_attnbias_has_biasgrad_has_dropout_maxk_64.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_has_bias_has_biasgrad_has_dropout_maxk_64.cpp
index 90e924410..3c0e08ef5 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_has_attnbias_has_biasgrad_has_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_has_bias_has_biasgrad_has_dropout_maxk_64.cpp
@@ -1,11 +1,14 @@
+
 /*
  * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
  */
-#include <ck/ck.hpp>
 
+#include <ck/ck.hpp>
 #include "ck_tiled_fmha_grouped_backward.h"
 
 template void run_grouped_backward_causalmask_bias_dropout_dispatch<
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_has_attnbias_no_biasgrad_no_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_has_bias_has_biasgrad_no_dropout_maxk_128.cpp
similarity index 88%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_has_attnbias_no_biasgrad_no_dropout_maxk_128.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_has_bias_has_biasgrad_no_dropout_maxk_128.cpp
index 9b04b655a..be449dddb 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_has_attnbias_no_biasgrad_no_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_has_bias_has_biasgrad_no_dropout_maxk_128.cpp
@@ -1,17 +1,20 @@
+
 /*
  * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
  */
-#include <ck/ck.hpp>
 
+#include <ck/ck.hpp>
 #include "ck_tiled_fmha_grouped_backward.h"
 
 template void run_grouped_backward_causalmask_bias_dropout_dispatch<
     ck::half_t,
+    false,
     true,
     true,
     false,
-    false,
     128>(GroupedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_has_attnbias_no_biasgrad_no_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_has_bias_has_biasgrad_no_dropout_maxk_32.cpp
similarity index 88%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_has_attnbias_no_biasgrad_no_dropout_maxk_32.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_has_bias_has_biasgrad_no_dropout_maxk_32.cpp
index b58450208..8e56f25d3 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_has_attnbias_no_biasgrad_no_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_has_bias_has_biasgrad_no_dropout_maxk_32.cpp
@@ -1,17 +1,20 @@
+
 /*
  * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
  */
-#include <ck/ck.hpp>
 
+#include <ck/ck.hpp>
 #include "ck_tiled_fmha_grouped_backward.h"
 
 template void run_grouped_backward_causalmask_bias_dropout_dispatch<
     ck::half_t,
+    false,
     true,
     true,
     false,
-    false,
     32>(GroupedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_has_attnbias_no_biasgrad_no_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_has_bias_has_biasgrad_no_dropout_maxk_64.cpp
similarity index 88%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_has_attnbias_no_biasgrad_no_dropout_maxk_64.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_has_bias_has_biasgrad_no_dropout_maxk_64.cpp
index b77d5ceaf..c4ed120c0 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_has_attnbias_no_biasgrad_no_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_has_bias_has_biasgrad_no_dropout_maxk_64.cpp
@@ -1,17 +1,20 @@
+
 /*
  * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
  */
-#include <ck/ck.hpp>
 
+#include <ck/ck.hpp>
 #include "ck_tiled_fmha_grouped_backward.h"
 
 template void run_grouped_backward_causalmask_bias_dropout_dispatch<
     ck::half_t,
+    false,
     true,
     true,
     false,
-    false,
     64>(GroupedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_no_attnbias_has_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_has_bias_no_biasgrad_has_dropout_maxk_128.cpp
similarity index 88%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_no_attnbias_has_dropout_maxk_128.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_has_bias_no_biasgrad_has_dropout_maxk_128.cpp
index b4a55a585..05ccb961b 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_no_attnbias_has_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_has_bias_no_biasgrad_has_dropout_maxk_128.cpp
@@ -1,17 +1,20 @@
+
 /*
  * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
  */
-#include <ck/ck.hpp>
 
+#include <ck/ck.hpp>
 #include "ck_tiled_fmha_grouped_backward.h"
 
 template void run_grouped_backward_causalmask_bias_dropout_dispatch<
     ck::half_t,
-    true,
     false,
+    true,
     false,
     true,
     128>(GroupedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_no_attnbias_has_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_has_bias_no_biasgrad_has_dropout_maxk_32.cpp
similarity index 88%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_no_attnbias_has_dropout_maxk_32.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_has_bias_no_biasgrad_has_dropout_maxk_32.cpp
index 7d2ed485a..ab7a421fc 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_no_attnbias_has_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_has_bias_no_biasgrad_has_dropout_maxk_32.cpp
@@ -1,17 +1,20 @@
+
 /*
  * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
  */
-#include <ck/ck.hpp>
 
+#include <ck/ck.hpp>
 #include "ck_tiled_fmha_grouped_backward.h"
 
 template void run_grouped_backward_causalmask_bias_dropout_dispatch<
     ck::half_t,
-    true,
     false,
+    true,
     false,
     true,
     32>(GroupedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_no_attnbias_has_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_has_bias_no_biasgrad_has_dropout_maxk_64.cpp
similarity index 88%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_no_attnbias_has_dropout_maxk_64.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_has_bias_no_biasgrad_has_dropout_maxk_64.cpp
index 8ff66d0b0..810225ab7 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_no_attnbias_has_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_has_bias_no_biasgrad_has_dropout_maxk_64.cpp
@@ -1,17 +1,20 @@
+
 /*
  * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
  */
-#include <ck/ck.hpp>
 
+#include <ck/ck.hpp>
 #include "ck_tiled_fmha_grouped_backward.h"
 
 template void run_grouped_backward_causalmask_bias_dropout_dispatch<
     ck::half_t,
-    true,
     false,
+    true,
     false,
     true,
     64>(GroupedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_no_attnbias_no_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_has_bias_no_biasgrad_no_dropout_maxk_128.cpp
similarity index 89%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_no_attnbias_no_dropout_maxk_128.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_has_bias_no_biasgrad_no_dropout_maxk_128.cpp
index ba4dee3e8..2f5ad17f5 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_no_attnbias_no_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_has_bias_no_biasgrad_no_dropout_maxk_128.cpp
@@ -1,17 +1,20 @@
+
 /*
  * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
  */
-#include <ck/ck.hpp>
 
+#include <ck/ck.hpp>
 #include "ck_tiled_fmha_grouped_backward.h"
 
 template void run_grouped_backward_causalmask_bias_dropout_dispatch<
     ck::half_t,
-    true,
     false,
+    true,
     false,
     false,
     128>(GroupedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_no_attnbias_no_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_has_bias_no_biasgrad_no_dropout_maxk_32.cpp
similarity index 88%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_no_attnbias_no_dropout_maxk_32.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_has_bias_no_biasgrad_no_dropout_maxk_32.cpp
index 9f968835e..590b22987 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_no_attnbias_no_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_has_bias_no_biasgrad_no_dropout_maxk_32.cpp
@@ -1,17 +1,20 @@
+
 /*
  * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
  */
-#include <ck/ck.hpp>
 
+#include <ck/ck.hpp>
 #include "ck_tiled_fmha_grouped_backward.h"
 
 template void run_grouped_backward_causalmask_bias_dropout_dispatch<
     ck::half_t,
-    true,
     false,
+    true,
     false,
     false,
     32>(GroupedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_no_attnbias_no_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_has_bias_no_biasgrad_no_dropout_maxk_64.cpp
similarity index 88%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_no_attnbias_no_dropout_maxk_64.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_has_bias_no_biasgrad_no_dropout_maxk_64.cpp
index bea50e4e6..07d372940 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_no_attnbias_no_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_has_bias_no_biasgrad_no_dropout_maxk_64.cpp
@@ -1,17 +1,20 @@
+
 /*
  * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
  */
-#include <ck/ck.hpp>
 
+#include <ck/ck.hpp>
 #include "ck_tiled_fmha_grouped_backward.h"
 
 template void run_grouped_backward_causalmask_bias_dropout_dispatch<
     ck::half_t,
-    true,
     false,
+    true,
     false,
     false,
     64>(GroupedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_no_attnbias_has_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_no_bias_no_biasgrad_has_dropout_maxk_128.cpp
similarity index 89%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_no_attnbias_has_dropout_maxk_128.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_no_bias_no_biasgrad_has_dropout_maxk_128.cpp
index cc82da7ef..c65c96f5d 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_no_attnbias_has_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_no_bias_no_biasgrad_has_dropout_maxk_128.cpp
@@ -1,11 +1,14 @@
+
 /*
  * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
  */
-#include <ck/ck.hpp>
 
+#include <ck/ck.hpp>
 #include "ck_tiled_fmha_grouped_backward.h"
 
 template void run_grouped_backward_causalmask_bias_dropout_dispatch<
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_no_attnbias_has_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_no_bias_no_biasgrad_has_dropout_maxk_32.cpp
similarity index 88%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_no_attnbias_has_dropout_maxk_32.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_no_bias_no_biasgrad_has_dropout_maxk_32.cpp
index 7a389f87d..e4aa0ac8a 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_no_attnbias_has_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_no_bias_no_biasgrad_has_dropout_maxk_32.cpp
@@ -1,11 +1,14 @@
+
 /*
  * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
  */
-#include <ck/ck.hpp>
 
+#include <ck/ck.hpp>
 #include "ck_tiled_fmha_grouped_backward.h"
 
 template void run_grouped_backward_causalmask_bias_dropout_dispatch<
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_no_attnbias_has_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_no_bias_no_biasgrad_has_dropout_maxk_64.cpp
similarity index 88%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_no_attnbias_has_dropout_maxk_64.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_no_bias_no_biasgrad_has_dropout_maxk_64.cpp
index 2bac6d9f8..63d619d8d 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_no_attnbias_has_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_no_bias_no_biasgrad_has_dropout_maxk_64.cpp
@@ -1,11 +1,14 @@
+
 /*
  * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
  */
-#include <ck/ck.hpp>
 
+#include <ck/ck.hpp>
 #include "ck_tiled_fmha_grouped_backward.h"
 
 template void run_grouped_backward_causalmask_bias_dropout_dispatch<
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_no_attnbias_no_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_no_bias_no_biasgrad_no_dropout_maxk_128.cpp
similarity index 89%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_no_attnbias_no_dropout_maxk_128.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_no_bias_no_biasgrad_no_dropout_maxk_128.cpp
index cff4bd138..905448129 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_no_attnbias_no_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_no_bias_no_biasgrad_no_dropout_maxk_128.cpp
@@ -1,11 +1,14 @@
+
 /*
  * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
  */
-#include <ck/ck.hpp>
 
+#include <ck/ck.hpp>
 #include "ck_tiled_fmha_grouped_backward.h"
 
 template void run_grouped_backward_causalmask_bias_dropout_dispatch<
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_no_attnbias_no_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_no_bias_no_biasgrad_no_dropout_maxk_32.cpp
similarity index 89%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_no_attnbias_no_dropout_maxk_32.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_no_bias_no_biasgrad_no_dropout_maxk_32.cpp
index 1173b7292..a5c107a93 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_no_attnbias_no_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_no_bias_no_biasgrad_no_dropout_maxk_32.cpp
@@ -1,11 +1,14 @@
+
 /*
  * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
  */
-#include <ck/ck.hpp>
 
+#include <ck/ck.hpp>
 #include "ck_tiled_fmha_grouped_backward.h"
 
 template void run_grouped_backward_causalmask_bias_dropout_dispatch<
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_no_attnbias_no_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_no_bias_no_biasgrad_no_dropout_maxk_64.cpp
similarity index 89%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_no_attnbias_no_dropout_maxk_64.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_no_bias_no_biasgrad_no_dropout_maxk_64.cpp
index 8159058ba..a9245471c 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_no_attnbias_no_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_no_bias_no_biasgrad_no_dropout_maxk_64.cpp
@@ -1,11 +1,14 @@
+
 /*
  * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
  */
-#include <ck/ck.hpp>
 
+#include <ck/ck.hpp>
 #include "ck_tiled_fmha_grouped_backward.h"
 
 template void run_grouped_backward_causalmask_bias_dropout_dispatch<
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_has_causalmask_has_attnbias_has_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_has_causalmask_has_bias_has_dropout_maxk_128.cpp
similarity index 88%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_has_causalmask_has_attnbias_has_dropout_maxk_128.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_has_causalmask_has_bias_has_dropout_maxk_128.cpp
index e801c3f93..780d6bc5d 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_has_causalmask_has_attnbias_has_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_has_causalmask_has_bias_has_dropout_maxk_128.cpp
@@ -1,11 +1,14 @@
+
 /*
  * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
  */
-#include <ck/ck.hpp>
 
+#include <ck/ck.hpp>
 #include "ck_tiled_fmha_grouped_forward.h"
 
 template void run_grouped_forward_causalmask_bias_dropout_dispatch<
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_has_causalmask_has_attnbias_has_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_has_causalmask_has_bias_has_dropout_maxk_256.cpp
similarity index 88%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_has_causalmask_has_attnbias_has_dropout_maxk_256.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_has_causalmask_has_bias_has_dropout_maxk_256.cpp
index da3f9451c..597de4543 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_has_causalmask_has_attnbias_has_dropout_maxk_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_has_causalmask_has_bias_has_dropout_maxk_256.cpp
@@ -1,11 +1,14 @@
+
 /*
  * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
  */
-#include <ck/ck.hpp>
 
+#include <ck/ck.hpp>
 #include "ck_tiled_fmha_grouped_forward.h"
 
 template void run_grouped_forward_causalmask_bias_dropout_dispatch<
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_has_causalmask_has_attnbias_has_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_has_causalmask_has_bias_has_dropout_maxk_32.cpp
similarity index 88%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_has_causalmask_has_attnbias_has_dropout_maxk_32.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_has_causalmask_has_bias_has_dropout_maxk_32.cpp
index 097cc7bf6..5608da950 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_has_causalmask_has_attnbias_has_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_has_causalmask_has_bias_has_dropout_maxk_32.cpp
@@ -1,11 +1,14 @@
+
 /*
  * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
  */
-#include <ck/ck.hpp>
 
+#include <ck/ck.hpp>
 #include "ck_tiled_fmha_grouped_forward.h"
 
 template void run_grouped_forward_causalmask_bias_dropout_dispatch<
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_has_causalmask_has_attnbias_has_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_has_causalmask_has_bias_has_dropout_maxk_64.cpp
similarity index 88%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_has_causalmask_has_attnbias_has_dropout_maxk_64.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_has_causalmask_has_bias_has_dropout_maxk_64.cpp
index 26f0cb5ec..e67cfe516 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_has_causalmask_has_attnbias_has_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_has_causalmask_has_bias_has_dropout_maxk_64.cpp
@@ -1,11 +1,14 @@
+
 /*
  * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
  */
-#include <ck/ck.hpp>
 
+#include <ck/ck.hpp>
 #include "ck_tiled_fmha_grouped_forward.h"
 
 template void run_grouped_forward_causalmask_bias_dropout_dispatch<
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_no_causalmask_has_attnbias_has_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_has_causalmask_has_bias_no_dropout_maxk_128.cpp
similarity index 88%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_no_causalmask_has_attnbias_has_dropout_maxk_128.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_has_causalmask_has_bias_no_dropout_maxk_128.cpp
index ec2af1f10..70657a16c 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_no_causalmask_has_attnbias_has_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_has_causalmask_has_bias_no_dropout_maxk_128.cpp
@@ -1,16 +1,19 @@
+
 /*
  * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
  */
-#include <ck/ck.hpp>
 
+#include <ck/ck.hpp>
 #include "ck_tiled_fmha_grouped_forward.h"
 
 template void run_grouped_forward_causalmask_bias_dropout_dispatch<
     ck::bhalf_t,
-    false,
     true,
     true,
+    false,
     128>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_has_causalmask_no_attnbias_has_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_has_causalmask_has_bias_no_dropout_maxk_256.cpp
similarity index 88%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_has_causalmask_no_attnbias_has_dropout_maxk_256.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_has_causalmask_has_bias_no_dropout_maxk_256.cpp
index 9a7c28fb5..e62a0cdfc 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_has_causalmask_no_attnbias_has_dropout_maxk_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_has_causalmask_has_bias_no_dropout_maxk_256.cpp
@@ -1,16 +1,19 @@
+
 /*
  * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
  */
-#include <ck/ck.hpp>
 
+#include <ck/ck.hpp>
 #include "ck_tiled_fmha_grouped_forward.h"
 
 template void run_grouped_forward_causalmask_bias_dropout_dispatch<
     ck::bhalf_t,
     true,
-    false,
     true,
+    false,
     256>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_has_causalmask_no_attnbias_has_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_has_causalmask_has_bias_no_dropout_maxk_32.cpp
similarity index 88%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_has_causalmask_no_attnbias_has_dropout_maxk_32.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_has_causalmask_has_bias_no_dropout_maxk_32.cpp
index e8e1a889f..1378e8bbe 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_has_causalmask_no_attnbias_has_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_has_causalmask_has_bias_no_dropout_maxk_32.cpp
@@ -1,16 +1,19 @@
+
 /*
  * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
  */
-#include <ck/ck.hpp>
 
+#include <ck/ck.hpp>
 #include "ck_tiled_fmha_grouped_forward.h"
 
 template void run_grouped_forward_causalmask_bias_dropout_dispatch<
     ck::bhalf_t,
     true,
-    false,
     true,
+    false,
     32>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_has_causalmask_has_attnbias_no_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_has_causalmask_has_bias_no_dropout_maxk_64.cpp
similarity index 88%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_has_causalmask_has_attnbias_no_dropout_maxk_64.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_has_causalmask_has_bias_no_dropout_maxk_64.cpp
index a402d9805..2532a0074 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_has_causalmask_has_attnbias_no_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_has_causalmask_has_bias_no_dropout_maxk_64.cpp
@@ -1,11 +1,14 @@
+
 /*
  * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
  */
-#include <ck/ck.hpp>
 
+#include <ck/ck.hpp>
 #include "ck_tiled_fmha_grouped_forward.h"
 
 template void run_grouped_forward_causalmask_bias_dropout_dispatch<
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_has_causalmask_has_attnbias_no_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_has_causalmask_no_bias_has_dropout_maxk_128.cpp
similarity index 88%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_has_causalmask_has_attnbias_no_dropout_maxk_128.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_has_causalmask_no_bias_has_dropout_maxk_128.cpp
index 48887ba1b..f404b2974 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_has_causalmask_has_attnbias_no_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_has_causalmask_no_bias_has_dropout_maxk_128.cpp
@@ -1,16 +1,19 @@
+
 /*
  * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
  */
-#include <ck/ck.hpp>
 
+#include <ck/ck.hpp>
 #include "ck_tiled_fmha_grouped_forward.h"
 
 template void run_grouped_forward_causalmask_bias_dropout_dispatch<
     ck::bhalf_t,
     true,
-    true,
     false,
+    true,
     128>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_no_causalmask_has_attnbias_has_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_has_causalmask_no_bias_has_dropout_maxk_256.cpp
similarity index 88%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_no_causalmask_has_attnbias_has_dropout_maxk_256.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_has_causalmask_no_bias_has_dropout_maxk_256.cpp
index 44f5e1e41..c027178b7 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_no_causalmask_has_attnbias_has_dropout_maxk_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_has_causalmask_no_bias_has_dropout_maxk_256.cpp
@@ -1,16 +1,19 @@
+
 /*
  * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
  */
-#include <ck/ck.hpp>
 
+#include <ck/ck.hpp>
 #include "ck_tiled_fmha_grouped_forward.h"
 
 template void run_grouped_forward_causalmask_bias_dropout_dispatch<
     ck::bhalf_t,
-    false,
     true,
+    false,
     true,
     256>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_no_causalmask_has_attnbias_has_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_has_causalmask_no_bias_has_dropout_maxk_32.cpp
similarity index 88%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_no_causalmask_has_attnbias_has_dropout_maxk_32.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_has_causalmask_no_bias_has_dropout_maxk_32.cpp
index 498e15bcd..0f0174653 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_no_causalmask_has_attnbias_has_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_has_causalmask_no_bias_has_dropout_maxk_32.cpp
@@ -1,16 +1,19 @@
+
 /*
  * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
  */
-#include <ck/ck.hpp>
 
+#include <ck/ck.hpp>
 #include "ck_tiled_fmha_grouped_forward.h"
 
 template void run_grouped_forward_causalmask_bias_dropout_dispatch<
     ck::bhalf_t,
-    false,
     true,
+    false,
     true,
     32>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_has_causalmask_no_attnbias_has_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_has_causalmask_no_bias_has_dropout_maxk_64.cpp
similarity index 88%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_has_causalmask_no_attnbias_has_dropout_maxk_64.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_has_causalmask_no_bias_has_dropout_maxk_64.cpp
index cf0245833..1ce86be18 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_has_causalmask_no_attnbias_has_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_has_causalmask_no_bias_has_dropout_maxk_64.cpp
@@ -1,11 +1,14 @@
+
 /*
  * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
  */
-#include <ck/ck.hpp>
 
+#include <ck/ck.hpp>
 #include "ck_tiled_fmha_grouped_forward.h"
 
 template void run_grouped_forward_causalmask_bias_dropout_dispatch<
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_no_causalmask_has_attnbias_no_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_has_causalmask_no_bias_no_dropout_maxk_128.cpp
similarity index 88%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_no_causalmask_has_attnbias_no_dropout_maxk_128.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_has_causalmask_no_bias_no_dropout_maxk_128.cpp
index ccf7b1e1f..6ef0db716 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_no_causalmask_has_attnbias_no_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_has_causalmask_no_bias_no_dropout_maxk_128.cpp
@@ -1,16 +1,19 @@
+
 /*
  * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
  */
-#include <ck/ck.hpp>
 
+#include <ck/ck.hpp>
 #include "ck_tiled_fmha_grouped_forward.h"
 
 template void run_grouped_forward_causalmask_bias_dropout_dispatch<
     ck::bhalf_t,
-    false,
     true,
     false,
+    false,
     128>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_no_causalmask_has_attnbias_no_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_has_causalmask_no_bias_no_dropout_maxk_256.cpp
similarity index 88%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_no_causalmask_has_attnbias_no_dropout_maxk_256.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_has_causalmask_no_bias_no_dropout_maxk_256.cpp
index 1c0dee6a3..1da195796 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_no_causalmask_has_attnbias_no_dropout_maxk_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_has_causalmask_no_bias_no_dropout_maxk_256.cpp
@@ -1,16 +1,19 @@
+
 /*
  * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
  */
-#include <ck/ck.hpp>
 
+#include <ck/ck.hpp>
 #include "ck_tiled_fmha_grouped_forward.h"
 
 template void run_grouped_forward_causalmask_bias_dropout_dispatch<
     ck::bhalf_t,
-    false,
     true,
     false,
+    false,
     256>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_no_causalmask_has_attnbias_no_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_has_causalmask_no_bias_no_dropout_maxk_32.cpp
similarity index 88%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_no_causalmask_has_attnbias_no_dropout_maxk_32.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_has_causalmask_no_bias_no_dropout_maxk_32.cpp
index d7fdf6789..5cd3ef7d9 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_no_causalmask_has_attnbias_no_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_has_causalmask_no_bias_no_dropout_maxk_32.cpp
@@ -1,16 +1,19 @@
+
 /*
  * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
  */
-#include <ck/ck.hpp>
 
+#include <ck/ck.hpp>
 #include "ck_tiled_fmha_grouped_forward.h"
 
 template void run_grouped_forward_causalmask_bias_dropout_dispatch<
     ck::bhalf_t,
-    false,
     true,
     false,
+    false,
     32>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_no_causalmask_has_attnbias_no_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_has_causalmask_no_bias_no_dropout_maxk_64.cpp
similarity index 88%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_no_causalmask_has_attnbias_no_dropout_maxk_64.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_has_causalmask_no_bias_no_dropout_maxk_64.cpp
index b91e4a3ea..13cae6aea 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_no_causalmask_has_attnbias_no_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_has_causalmask_no_bias_no_dropout_maxk_64.cpp
@@ -1,16 +1,19 @@
+
 /*
  * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
  */
-#include <ck/ck.hpp>
 
+#include <ck/ck.hpp>
 #include "ck_tiled_fmha_grouped_forward.h"
 
 template void run_grouped_forward_causalmask_bias_dropout_dispatch<
     ck::bhalf_t,
-    false,
     true,
     false,
+    false,
     64>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_has_causalmask_no_attnbias_has_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_no_causalmask_has_bias_has_dropout_maxk_128.cpp
similarity index 88%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_has_causalmask_no_attnbias_has_dropout_maxk_128.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_no_causalmask_has_bias_has_dropout_maxk_128.cpp
index d5f2785d7..809a3597b 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_has_causalmask_no_attnbias_has_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_no_causalmask_has_bias_has_dropout_maxk_128.cpp
@@ -1,16 +1,19 @@
+
 /*
  * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
  */
-#include <ck/ck.hpp>
 
+#include <ck/ck.hpp>
 #include "ck_tiled_fmha_grouped_forward.h"
 
 template void run_grouped_forward_causalmask_bias_dropout_dispatch<
     ck::bhalf_t,
-    true,
     false,
     true,
+    true,
     128>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_has_causalmask_has_attnbias_no_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_no_causalmask_has_bias_has_dropout_maxk_256.cpp
similarity index 88%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_has_causalmask_has_attnbias_no_dropout_maxk_256.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_no_causalmask_has_bias_has_dropout_maxk_256.cpp
index 8b49d8374..ecfe07e63 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_has_causalmask_has_attnbias_no_dropout_maxk_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_no_causalmask_has_bias_has_dropout_maxk_256.cpp
@@ -1,16 +1,19 @@
+
 /*
  * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
  */
-#include <ck/ck.hpp>
 
+#include <ck/ck.hpp>
 #include "ck_tiled_fmha_grouped_forward.h"
 
 template void run_grouped_forward_causalmask_bias_dropout_dispatch<
     ck::bhalf_t,
+    false,
     true,
     true,
-    false,
     256>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_has_causalmask_has_attnbias_no_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_no_causalmask_has_bias_has_dropout_maxk_32.cpp
similarity index 88%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_has_causalmask_has_attnbias_no_dropout_maxk_32.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_no_causalmask_has_bias_has_dropout_maxk_32.cpp
index 49402375a..4a1b10da6 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_has_causalmask_has_attnbias_no_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_no_causalmask_has_bias_has_dropout_maxk_32.cpp
@@ -1,16 +1,19 @@
+
 /*
  * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
  */
-#include <ck/ck.hpp>
 
+#include <ck/ck.hpp>
 #include "ck_tiled_fmha_grouped_forward.h"
 
 template void run_grouped_forward_causalmask_bias_dropout_dispatch<
     ck::bhalf_t,
+    false,
     true,
     true,
-    false,
     32>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_no_causalmask_has_attnbias_has_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_no_causalmask_has_bias_has_dropout_maxk_64.cpp
similarity index 88%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_no_causalmask_has_attnbias_has_dropout_maxk_64.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_no_causalmask_has_bias_has_dropout_maxk_64.cpp
index e08bd87d2..301590433 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_no_causalmask_has_attnbias_has_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_no_causalmask_has_bias_has_dropout_maxk_64.cpp
@@ -1,11 +1,14 @@
+
 /*
  * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
  */
-#include <ck/ck.hpp>
 
+#include <ck/ck.hpp>
 #include "ck_tiled_fmha_grouped_forward.h"
 
 template void run_grouped_forward_causalmask_bias_dropout_dispatch<
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_no_causalmask_no_attnbias_has_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_no_causalmask_has_bias_no_dropout_maxk_128.cpp
similarity index 88%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_no_causalmask_no_attnbias_has_dropout_maxk_128.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_no_causalmask_has_bias_no_dropout_maxk_128.cpp
index 4a208cf12..6a65e56bb 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_no_causalmask_no_attnbias_has_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_no_causalmask_has_bias_no_dropout_maxk_128.cpp
@@ -1,16 +1,19 @@
+
 /*
  * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
  */
-#include <ck/ck.hpp>
 
+#include <ck/ck.hpp>
 #include "ck_tiled_fmha_grouped_forward.h"
 
 template void run_grouped_forward_causalmask_bias_dropout_dispatch<
     ck::bhalf_t,
     false,
-    false,
     true,
+    false,
     128>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_no_causalmask_no_attnbias_has_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_no_causalmask_has_bias_no_dropout_maxk_256.cpp
similarity index 88%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_no_causalmask_no_attnbias_has_dropout_maxk_256.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_no_causalmask_has_bias_no_dropout_maxk_256.cpp
index 07b92f6fb..95fc499b1 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_no_causalmask_no_attnbias_has_dropout_maxk_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_no_causalmask_has_bias_no_dropout_maxk_256.cpp
@@ -1,16 +1,19 @@
+
 /*
  * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
  */
-#include <ck/ck.hpp>
 
+#include <ck/ck.hpp>
 #include "ck_tiled_fmha_grouped_forward.h"
 
 template void run_grouped_forward_causalmask_bias_dropout_dispatch<
     ck::bhalf_t,
     false,
-    false,
     true,
+    false,
     256>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_has_causalmask_no_attnbias_no_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_no_causalmask_has_bias_no_dropout_maxk_32.cpp
similarity index 88%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_has_causalmask_no_attnbias_no_dropout_maxk_32.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_no_causalmask_has_bias_no_dropout_maxk_32.cpp
index 533d97a53..e898330a9 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_has_causalmask_no_attnbias_no_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_no_causalmask_has_bias_no_dropout_maxk_32.cpp
@@ -1,16 +1,19 @@
+
 /*
  * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
  */
-#include <ck/ck.hpp>
 
+#include <ck/ck.hpp>
 #include "ck_tiled_fmha_grouped_forward.h"
 
 template void run_grouped_forward_causalmask_bias_dropout_dispatch<
     ck::bhalf_t,
-    true,
     false,
+    true,
     false,
     32>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_no_causalmask_no_attnbias_has_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_no_causalmask_has_bias_no_dropout_maxk_64.cpp
similarity index 88%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_no_causalmask_no_attnbias_has_dropout_maxk_64.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_no_causalmask_has_bias_no_dropout_maxk_64.cpp
index 21a57dfca..f6ebe8228 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_no_causalmask_no_attnbias_has_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_no_causalmask_has_bias_no_dropout_maxk_64.cpp
@@ -1,16 +1,19 @@
+
 /*
  * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
  */
-#include <ck/ck.hpp>
 
+#include <ck/ck.hpp>
 #include "ck_tiled_fmha_grouped_forward.h"
 
 template void run_grouped_forward_causalmask_bias_dropout_dispatch<
     ck::bhalf_t,
     false,
-    false,
     true,
+    false,
     64>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_has_causalmask_no_attnbias_no_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_no_causalmask_no_bias_has_dropout_maxk_128.cpp
similarity index 88%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_has_causalmask_no_attnbias_no_dropout_maxk_128.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_no_causalmask_no_bias_has_dropout_maxk_128.cpp
index ba58b2a3a..cf15fa390 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_has_causalmask_no_attnbias_no_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_no_causalmask_no_bias_has_dropout_maxk_128.cpp
@@ -1,16 +1,19 @@
+
 /*
  * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
  */
-#include <ck/ck.hpp>
 
+#include <ck/ck.hpp>
 #include "ck_tiled_fmha_grouped_forward.h"
 
 template void run_grouped_forward_causalmask_bias_dropout_dispatch<
     ck::bhalf_t,
-    true,
     false,
     false,
+    true,
     128>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_has_causalmask_no_attnbias_no_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_no_causalmask_no_bias_has_dropout_maxk_256.cpp
similarity index 88%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_has_causalmask_no_attnbias_no_dropout_maxk_256.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_no_causalmask_no_bias_has_dropout_maxk_256.cpp
index 3f472877d..5677ead04 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_has_causalmask_no_attnbias_no_dropout_maxk_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_no_causalmask_no_bias_has_dropout_maxk_256.cpp
@@ -1,16 +1,19 @@
+
 /*
  * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
  */
-#include <ck/ck.hpp>
 
+#include <ck/ck.hpp>
 #include "ck_tiled_fmha_grouped_forward.h"
 
 template void run_grouped_forward_causalmask_bias_dropout_dispatch<
     ck::bhalf_t,
-    true,
     false,
     false,
+    true,
     256>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_no_causalmask_no_attnbias_has_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_no_causalmask_no_bias_has_dropout_maxk_32.cpp
similarity index 88%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_no_causalmask_no_attnbias_has_dropout_maxk_32.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_no_causalmask_no_bias_has_dropout_maxk_32.cpp
index d561c4e08..53c4b4f84 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_no_causalmask_no_attnbias_has_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_no_causalmask_no_bias_has_dropout_maxk_32.cpp
@@ -1,11 +1,14 @@
+
 /*
  * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
  */
-#include <ck/ck.hpp>
 
+#include <ck/ck.hpp>
 #include "ck_tiled_fmha_grouped_forward.h"
 
 template void run_grouped_forward_causalmask_bias_dropout_dispatch<
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_has_causalmask_no_attnbias_no_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_no_causalmask_no_bias_has_dropout_maxk_64.cpp
similarity index 88%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_has_causalmask_no_attnbias_no_dropout_maxk_64.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_no_causalmask_no_bias_has_dropout_maxk_64.cpp
index 48672f2e0..70f34bc04 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_has_causalmask_no_attnbias_no_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_no_causalmask_no_bias_has_dropout_maxk_64.cpp
@@ -1,16 +1,19 @@
+
 /*
  * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
  */
-#include <ck/ck.hpp>
 
+#include <ck/ck.hpp>
 #include "ck_tiled_fmha_grouped_forward.h"
 
 template void run_grouped_forward_causalmask_bias_dropout_dispatch<
     ck::bhalf_t,
-    true,
     false,
     false,
+    true,
     64>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_no_causalmask_no_attnbias_no_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_no_causalmask_no_bias_no_dropout_maxk_128.cpp
similarity index 88%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_no_causalmask_no_attnbias_no_dropout_maxk_128.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_no_causalmask_no_bias_no_dropout_maxk_128.cpp
index 7088d0d9d..c74bdd1da 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_no_causalmask_no_attnbias_no_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_no_causalmask_no_bias_no_dropout_maxk_128.cpp
@@ -1,11 +1,14 @@
+
 /*
  * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
  */
-#include <ck/ck.hpp>
 
+#include <ck/ck.hpp>
 #include "ck_tiled_fmha_grouped_forward.h"
 
 template void run_grouped_forward_causalmask_bias_dropout_dispatch<
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_no_causalmask_no_attnbias_no_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_no_causalmask_no_bias_no_dropout_maxk_256.cpp
similarity index 88%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_no_causalmask_no_attnbias_no_dropout_maxk_256.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_no_causalmask_no_bias_no_dropout_maxk_256.cpp
index f4cc5ac8f..79ad692ce 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_no_causalmask_no_attnbias_no_dropout_maxk_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_no_causalmask_no_bias_no_dropout_maxk_256.cpp
@@ -1,11 +1,14 @@
+
 /*
  * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
  */
-#include <ck/ck.hpp>
 
+#include <ck/ck.hpp>
 #include "ck_tiled_fmha_grouped_forward.h"
 
 template void run_grouped_forward_causalmask_bias_dropout_dispatch<
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_no_causalmask_no_attnbias_no_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_no_causalmask_no_bias_no_dropout_maxk_32.cpp
similarity index 88%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_no_causalmask_no_attnbias_no_dropout_maxk_32.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_no_causalmask_no_bias_no_dropout_maxk_32.cpp
index 2f8b750df..c44fe5e4e 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_no_causalmask_no_attnbias_no_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_no_causalmask_no_bias_no_dropout_maxk_32.cpp
@@ -1,11 +1,14 @@
+
 /*
  * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
  */
-#include <ck/ck.hpp>
 
+#include <ck/ck.hpp>
 #include "ck_tiled_fmha_grouped_forward.h"
 
 template void run_grouped_forward_causalmask_bias_dropout_dispatch<
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_no_causalmask_no_attnbias_no_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_no_causalmask_no_bias_no_dropout_maxk_64.cpp
similarity index 88%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_no_causalmask_no_attnbias_no_dropout_maxk_64.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_no_causalmask_no_bias_no_dropout_maxk_64.cpp
index ac9d81f95..151d072b2 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_no_causalmask_no_attnbias_no_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_no_causalmask_no_bias_no_dropout_maxk_64.cpp
@@ -1,11 +1,14 @@
+
 /*
  * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
  */
-#include <ck/ck.hpp>
 
+#include <ck/ck.hpp>
 #include "ck_tiled_fmha_grouped_forward.h"
 
 template void run_grouped_forward_causalmask_bias_dropout_dispatch<
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_has_causalmask_has_attnbias_has_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_has_causalmask_has_bias_has_dropout_maxk_128.cpp
similarity index 88%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_has_causalmask_has_attnbias_has_dropout_maxk_128.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_has_causalmask_has_bias_has_dropout_maxk_128.cpp
index c9b178a76..3cbe18117 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_has_causalmask_has_attnbias_has_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_has_causalmask_has_bias_has_dropout_maxk_128.cpp
@@ -1,11 +1,14 @@
+
 /*
  * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
  */
-#include <ck/ck.hpp>
 
+#include <ck/ck.hpp>
 #include "ck_tiled_fmha_grouped_forward.h"
 
 template void run_grouped_forward_causalmask_bias_dropout_dispatch<
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_has_causalmask_has_attnbias_has_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_has_causalmask_has_bias_has_dropout_maxk_256.cpp
similarity index 88%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_has_causalmask_has_attnbias_has_dropout_maxk_256.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_has_causalmask_has_bias_has_dropout_maxk_256.cpp
index 82533dfa9..65fd33d2d 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_has_causalmask_has_attnbias_has_dropout_maxk_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_has_causalmask_has_bias_has_dropout_maxk_256.cpp
@@ -1,11 +1,14 @@
+
 /*
  * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
  */
-#include <ck/ck.hpp>
 
+#include <ck/ck.hpp>
 #include "ck_tiled_fmha_grouped_forward.h"
 
 template void run_grouped_forward_causalmask_bias_dropout_dispatch<
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_has_causalmask_has_attnbias_has_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_has_causalmask_has_bias_has_dropout_maxk_32.cpp
similarity index 88%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_has_causalmask_has_attnbias_has_dropout_maxk_32.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_has_causalmask_has_bias_has_dropout_maxk_32.cpp
index 090d3465d..cb9498401 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_has_causalmask_has_attnbias_has_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_has_causalmask_has_bias_has_dropout_maxk_32.cpp
@@ -1,11 +1,14 @@
+
 /*
  * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
  */
-#include <ck/ck.hpp>
 
+#include <ck/ck.hpp>
 #include "ck_tiled_fmha_grouped_forward.h"
 
 template void run_grouped_forward_causalmask_bias_dropout_dispatch<
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_has_causalmask_has_attnbias_has_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_has_causalmask_has_bias_has_dropout_maxk_64.cpp
similarity index 88%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_has_causalmask_has_attnbias_has_dropout_maxk_64.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_has_causalmask_has_bias_has_dropout_maxk_64.cpp
index 99bf4bee6..7ddd09ca5 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_has_causalmask_has_attnbias_has_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_has_causalmask_has_bias_has_dropout_maxk_64.cpp
@@ -1,11 +1,14 @@
+
 /*
  * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
  */
-#include <ck/ck.hpp>
 
+#include <ck/ck.hpp>
 #include "ck_tiled_fmha_grouped_forward.h"
 
 template void run_grouped_forward_causalmask_bias_dropout_dispatch<
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_has_causalmask_has_attnbias_no_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_has_causalmask_has_bias_no_dropout_maxk_128.cpp
similarity index 88%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_has_causalmask_has_attnbias_no_dropout_maxk_128.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_has_causalmask_has_bias_no_dropout_maxk_128.cpp
index 2290c9410..1c5e308f6 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_has_causalmask_has_attnbias_no_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_has_causalmask_has_bias_no_dropout_maxk_128.cpp
@@ -1,11 +1,14 @@
+
 /*
  * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
  */
-#include <ck/ck.hpp>
 
+#include <ck/ck.hpp>
 #include "ck_tiled_fmha_grouped_forward.h"
 
 template void run_grouped_forward_causalmask_bias_dropout_dispatch<
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_has_causalmask_has_attnbias_no_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_has_causalmask_has_bias_no_dropout_maxk_256.cpp
similarity index 88%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_has_causalmask_has_attnbias_no_dropout_maxk_256.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_has_causalmask_has_bias_no_dropout_maxk_256.cpp
index a685ec502..1a674ad11 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_has_causalmask_has_attnbias_no_dropout_maxk_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_has_causalmask_has_bias_no_dropout_maxk_256.cpp
@@ -1,11 +1,14 @@
+
 /*
  * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
  */
-#include <ck/ck.hpp>
 
+#include <ck/ck.hpp>
 #include "ck_tiled_fmha_grouped_forward.h"
 
 template void run_grouped_forward_causalmask_bias_dropout_dispatch<
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_has_causalmask_has_attnbias_no_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_has_causalmask_has_bias_no_dropout_maxk_32.cpp
similarity index 88%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_has_causalmask_has_attnbias_no_dropout_maxk_32.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_has_causalmask_has_bias_no_dropout_maxk_32.cpp
index 22e90a4cc..60d724d37 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_has_causalmask_has_attnbias_no_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_has_causalmask_has_bias_no_dropout_maxk_32.cpp
@@ -1,11 +1,14 @@
+
 /*
  * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
  */
-#include <ck/ck.hpp>
 
+#include <ck/ck.hpp>
 #include "ck_tiled_fmha_grouped_forward.h"
 
 template void run_grouped_forward_causalmask_bias_dropout_dispatch<
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_has_causalmask_has_attnbias_no_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_has_causalmask_has_bias_no_dropout_maxk_64.cpp
similarity index 88%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_has_causalmask_has_attnbias_no_dropout_maxk_64.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_has_causalmask_has_bias_no_dropout_maxk_64.cpp
index b44e85089..9c1268211 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_has_causalmask_has_attnbias_no_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_has_causalmask_has_bias_no_dropout_maxk_64.cpp
@@ -1,11 +1,14 @@
+
 /*
  * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
  */
-#include <ck/ck.hpp>
 
+#include <ck/ck.hpp>
 #include "ck_tiled_fmha_grouped_forward.h"
 
 template void run_grouped_forward_causalmask_bias_dropout_dispatch<
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_has_causalmask_no_attnbias_has_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_has_causalmask_no_bias_has_dropout_maxk_128.cpp
similarity index 88%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_has_causalmask_no_attnbias_has_dropout_maxk_128.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_has_causalmask_no_bias_has_dropout_maxk_128.cpp
index c9742c970..0972c088b 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_has_causalmask_no_attnbias_has_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_has_causalmask_no_bias_has_dropout_maxk_128.cpp
@@ -1,11 +1,14 @@
+
 /*
  * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
  */
-#include <ck/ck.hpp>
 
+#include <ck/ck.hpp>
 #include "ck_tiled_fmha_grouped_forward.h"
 
 template void run_grouped_forward_causalmask_bias_dropout_dispatch<
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_has_causalmask_no_attnbias_has_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_has_causalmask_no_bias_has_dropout_maxk_256.cpp
similarity index 88%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_has_causalmask_no_attnbias_has_dropout_maxk_256.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_has_causalmask_no_bias_has_dropout_maxk_256.cpp
index dab84d1f5..c7bee6428 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_has_causalmask_no_attnbias_has_dropout_maxk_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_has_causalmask_no_bias_has_dropout_maxk_256.cpp
@@ -1,11 +1,14 @@
+
 /*
  * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
  */
-#include <ck/ck.hpp>
 
+#include <ck/ck.hpp>
 #include "ck_tiled_fmha_grouped_forward.h"
 
 template void run_grouped_forward_causalmask_bias_dropout_dispatch<
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_no_causalmask_has_attnbias_has_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_has_causalmask_no_bias_has_dropout_maxk_32.cpp
similarity index 88%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_no_causalmask_has_attnbias_has_dropout_maxk_32.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_has_causalmask_no_bias_has_dropout_maxk_32.cpp
index 686f65bca..0dfdb53bc 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_no_causalmask_has_attnbias_has_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_has_causalmask_no_bias_has_dropout_maxk_32.cpp
@@ -1,16 +1,19 @@
+
 /*
  * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
  */
-#include <ck/ck.hpp>
 
+#include <ck/ck.hpp>
 #include "ck_tiled_fmha_grouped_forward.h"
 
 template void run_grouped_forward_causalmask_bias_dropout_dispatch<
     ck::half_t,
-    false,
     true,
+    false,
     true,
     32>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_has_causalmask_no_attnbias_has_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_has_causalmask_no_bias_has_dropout_maxk_64.cpp
similarity index 88%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_has_causalmask_no_attnbias_has_dropout_maxk_64.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_has_causalmask_no_bias_has_dropout_maxk_64.cpp
index 79a9ecc5e..bb1cf0032 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_has_causalmask_no_attnbias_has_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_has_causalmask_no_bias_has_dropout_maxk_64.cpp
@@ -1,11 +1,14 @@
+
 /*
  * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
  */
-#include <ck/ck.hpp>
 
+#include <ck/ck.hpp>
 #include "ck_tiled_fmha_grouped_forward.h"
 
 template void run_grouped_forward_causalmask_bias_dropout_dispatch<
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_no_causalmask_has_attnbias_no_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_has_causalmask_no_bias_no_dropout_maxk_128.cpp
similarity index 88%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_no_causalmask_has_attnbias_no_dropout_maxk_128.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_has_causalmask_no_bias_no_dropout_maxk_128.cpp
index 6b851c95d..c9d7245e9 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_no_causalmask_has_attnbias_no_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_has_causalmask_no_bias_no_dropout_maxk_128.cpp
@@ -1,16 +1,19 @@
+
 /*
  * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
  */
-#include <ck/ck.hpp>
 
+#include <ck/ck.hpp>
 #include "ck_tiled_fmha_grouped_forward.h"
 
 template void run_grouped_forward_causalmask_bias_dropout_dispatch<
     ck::half_t,
-    false,
     true,
     false,
+    false,
     128>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_no_causalmask_has_attnbias_no_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_has_causalmask_no_bias_no_dropout_maxk_256.cpp
similarity index 88%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_no_causalmask_has_attnbias_no_dropout_maxk_256.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_has_causalmask_no_bias_no_dropout_maxk_256.cpp
index 83b4ca32e..13cf18b74 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_no_causalmask_has_attnbias_no_dropout_maxk_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_has_causalmask_no_bias_no_dropout_maxk_256.cpp
@@ -1,16 +1,19 @@
+
 /*
  * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
  */
-#include <ck/ck.hpp>
 
+#include <ck/ck.hpp>
 #include "ck_tiled_fmha_grouped_forward.h"
 
 template void run_grouped_forward_causalmask_bias_dropout_dispatch<
     ck::half_t,
-    false,
     true,
     false,
+    false,
     256>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_no_causalmask_has_attnbias_no_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_has_causalmask_no_bias_no_dropout_maxk_32.cpp
similarity index 88%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_no_causalmask_has_attnbias_no_dropout_maxk_32.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_has_causalmask_no_bias_no_dropout_maxk_32.cpp
index 35472c1e8..1d10b1934 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_no_causalmask_has_attnbias_no_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_has_causalmask_no_bias_no_dropout_maxk_32.cpp
@@ -1,16 +1,19 @@
+
 /*
  * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
  */
-#include <ck/ck.hpp>
 
+#include <ck/ck.hpp>
 #include "ck_tiled_fmha_grouped_forward.h"
 
 template void run_grouped_forward_causalmask_bias_dropout_dispatch<
     ck::half_t,
-    false,
     true,
     false,
+    false,
     32>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_no_causalmask_has_attnbias_no_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_has_causalmask_no_bias_no_dropout_maxk_64.cpp
similarity index 88%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_no_causalmask_has_attnbias_no_dropout_maxk_64.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_has_causalmask_no_bias_no_dropout_maxk_64.cpp
index c4f645028..239cfdcb7 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_no_causalmask_has_attnbias_no_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_has_causalmask_no_bias_no_dropout_maxk_64.cpp
@@ -1,16 +1,19 @@
+
 /*
  * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
  */
-#include <ck/ck.hpp>
 
+#include <ck/ck.hpp>
 #include "ck_tiled_fmha_grouped_forward.h"
 
 template void run_grouped_forward_causalmask_bias_dropout_dispatch<
     ck::half_t,
-    false,
     true,
     false,
+    false,
     64>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_no_causalmask_has_attnbias_has_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_no_causalmask_has_bias_has_dropout_maxk_128.cpp
similarity index 88%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_no_causalmask_has_attnbias_has_dropout_maxk_128.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_no_causalmask_has_bias_has_dropout_maxk_128.cpp
index 5fe2e08fc..0417713d5 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_no_causalmask_has_attnbias_has_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_no_causalmask_has_bias_has_dropout_maxk_128.cpp
@@ -1,11 +1,14 @@
+
 /*
  * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
  */
-#include <ck/ck.hpp>
 
+#include <ck/ck.hpp>
 #include "ck_tiled_fmha_grouped_forward.h"
 
 template void run_grouped_forward_causalmask_bias_dropout_dispatch<
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_no_causalmask_has_attnbias_has_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_no_causalmask_has_bias_has_dropout_maxk_256.cpp
similarity index 88%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_no_causalmask_has_attnbias_has_dropout_maxk_256.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_no_causalmask_has_bias_has_dropout_maxk_256.cpp
index f645e1473..917fee0d4 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_no_causalmask_has_attnbias_has_dropout_maxk_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_no_causalmask_has_bias_has_dropout_maxk_256.cpp
@@ -1,11 +1,14 @@
+
 /*
  * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
  */
-#include <ck/ck.hpp>
 
+#include <ck/ck.hpp>
 #include "ck_tiled_fmha_grouped_forward.h"
 
 template void run_grouped_forward_causalmask_bias_dropout_dispatch<
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_has_causalmask_no_attnbias_has_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_no_causalmask_has_bias_has_dropout_maxk_32.cpp
similarity index 88%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_has_causalmask_no_attnbias_has_dropout_maxk_32.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_no_causalmask_has_bias_has_dropout_maxk_32.cpp
index 109bf6cdc..45c72d311 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_has_causalmask_no_attnbias_has_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_no_causalmask_has_bias_has_dropout_maxk_32.cpp
@@ -1,16 +1,19 @@
+
 /*
  * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
  */
-#include <ck/ck.hpp>
 
+#include <ck/ck.hpp>
 #include "ck_tiled_fmha_grouped_forward.h"
 
 template void run_grouped_forward_causalmask_bias_dropout_dispatch<
     ck::half_t,
-    true,
     false,
     true,
+    true,
     32>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_no_causalmask_has_attnbias_has_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_no_causalmask_has_bias_has_dropout_maxk_64.cpp
similarity index 88%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_no_causalmask_has_attnbias_has_dropout_maxk_64.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_no_causalmask_has_bias_has_dropout_maxk_64.cpp
index f7aa2630b..11ef78e80 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_no_causalmask_has_attnbias_has_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_no_causalmask_has_bias_has_dropout_maxk_64.cpp
@@ -1,11 +1,14 @@
+
 /*
  * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
  */
-#include <ck/ck.hpp>
 
+#include <ck/ck.hpp>
 #include "ck_tiled_fmha_grouped_forward.h"
 
 template void run_grouped_forward_causalmask_bias_dropout_dispatch<
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_has_causalmask_no_attnbias_no_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_no_causalmask_has_bias_no_dropout_maxk_128.cpp
similarity index 88%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_has_causalmask_no_attnbias_no_dropout_maxk_128.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_no_causalmask_has_bias_no_dropout_maxk_128.cpp
index c6d8e12e2..9d258a09e 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_has_causalmask_no_attnbias_no_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_no_causalmask_has_bias_no_dropout_maxk_128.cpp
@@ -1,16 +1,19 @@
+
 /*
  * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
  */
-#include <ck/ck.hpp>
 
+#include <ck/ck.hpp>
 #include "ck_tiled_fmha_grouped_forward.h"
 
 template void run_grouped_forward_causalmask_bias_dropout_dispatch<
     ck::half_t,
-    true,
     false,
+    true,
     false,
     128>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_has_causalmask_no_attnbias_no_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_no_causalmask_has_bias_no_dropout_maxk_256.cpp
similarity index 88%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_has_causalmask_no_attnbias_no_dropout_maxk_256.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_no_causalmask_has_bias_no_dropout_maxk_256.cpp
index cdd4a6b4f..63c04b163 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_has_causalmask_no_attnbias_no_dropout_maxk_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_no_causalmask_has_bias_no_dropout_maxk_256.cpp
@@ -1,16 +1,19 @@
+
 /*
  * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
  */
-#include <ck/ck.hpp>
 
+#include <ck/ck.hpp>
 #include "ck_tiled_fmha_grouped_forward.h"
 
 template void run_grouped_forward_causalmask_bias_dropout_dispatch<
     ck::half_t,
-    true,
     false,
+    true,
     false,
     256>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_has_causalmask_no_attnbias_no_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_no_causalmask_has_bias_no_dropout_maxk_32.cpp
similarity index 88%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_has_causalmask_no_attnbias_no_dropout_maxk_32.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_no_causalmask_has_bias_no_dropout_maxk_32.cpp
index 7e1478866..38c0fdfb7 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_has_causalmask_no_attnbias_no_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_no_causalmask_has_bias_no_dropout_maxk_32.cpp
@@ -1,16 +1,19 @@
+
 /*
  * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
  */
-#include <ck/ck.hpp>
 
+#include <ck/ck.hpp>
 #include "ck_tiled_fmha_grouped_forward.h"
 
 template void run_grouped_forward_causalmask_bias_dropout_dispatch<
     ck::half_t,
-    true,
     false,
+    true,
     false,
     32>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_has_causalmask_no_attnbias_no_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_no_causalmask_has_bias_no_dropout_maxk_64.cpp
similarity index 88%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_has_causalmask_no_attnbias_no_dropout_maxk_64.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_no_causalmask_has_bias_no_dropout_maxk_64.cpp
index a98daba6c..7620830c3 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_has_causalmask_no_attnbias_no_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_no_causalmask_has_bias_no_dropout_maxk_64.cpp
@@ -1,16 +1,19 @@
+
 /*
  * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
  */
-#include <ck/ck.hpp>
 
+#include <ck/ck.hpp>
 #include "ck_tiled_fmha_grouped_forward.h"
 
 template void run_grouped_forward_causalmask_bias_dropout_dispatch<
     ck::half_t,
-    true,
     false,
+    true,
     false,
     64>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_no_causalmask_no_attnbias_has_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_no_causalmask_no_bias_has_dropout_maxk_128.cpp
similarity index 88%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_no_causalmask_no_attnbias_has_dropout_maxk_128.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_no_causalmask_no_bias_has_dropout_maxk_128.cpp
index 72022fb98..ca03aa0a8 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_no_causalmask_no_attnbias_has_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_no_causalmask_no_bias_has_dropout_maxk_128.cpp
@@ -1,11 +1,14 @@
+
 /*
  * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
  */
-#include <ck/ck.hpp>
 
+#include <ck/ck.hpp>
 #include "ck_tiled_fmha_grouped_forward.h"
 
 template void run_grouped_forward_causalmask_bias_dropout_dispatch<
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_no_causalmask_no_attnbias_has_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_no_causalmask_no_bias_has_dropout_maxk_256.cpp
similarity index 88%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_no_causalmask_no_attnbias_has_dropout_maxk_256.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_no_causalmask_no_bias_has_dropout_maxk_256.cpp
index 48d249424..0f8d631d1 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_no_causalmask_no_attnbias_has_dropout_maxk_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_no_causalmask_no_bias_has_dropout_maxk_256.cpp
@@ -1,11 +1,14 @@
+
 /*
  * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
  */
-#include <ck/ck.hpp>
 
+#include <ck/ck.hpp>
 #include "ck_tiled_fmha_grouped_forward.h"
 
 template void run_grouped_forward_causalmask_bias_dropout_dispatch<
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_no_causalmask_no_attnbias_has_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_no_causalmask_no_bias_has_dropout_maxk_32.cpp
similarity index 88%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_no_causalmask_no_attnbias_has_dropout_maxk_32.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_no_causalmask_no_bias_has_dropout_maxk_32.cpp
index 0207a2691..9aca2c81e 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_no_causalmask_no_attnbias_has_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_no_causalmask_no_bias_has_dropout_maxk_32.cpp
@@ -1,11 +1,14 @@
+
 /*
  * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
  */
-#include <ck/ck.hpp>
 
+#include <ck/ck.hpp>
 #include "ck_tiled_fmha_grouped_forward.h"
 
 template void run_grouped_forward_causalmask_bias_dropout_dispatch<
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_no_causalmask_no_attnbias_has_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_no_causalmask_no_bias_has_dropout_maxk_64.cpp
similarity index 88%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_no_causalmask_no_attnbias_has_dropout_maxk_64.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_no_causalmask_no_bias_has_dropout_maxk_64.cpp
index 8cdf11645..f61fe5eeb 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_no_causalmask_no_attnbias_has_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_no_causalmask_no_bias_has_dropout_maxk_64.cpp
@@ -1,11 +1,14 @@
+
 /*
  * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
  */
-#include <ck/ck.hpp>
 
+#include <ck/ck.hpp>
 #include "ck_tiled_fmha_grouped_forward.h"
 
 template void run_grouped_forward_causalmask_bias_dropout_dispatch<
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_no_causalmask_no_attnbias_no_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_no_causalmask_no_bias_no_dropout_maxk_128.cpp
similarity index 88%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_no_causalmask_no_attnbias_no_dropout_maxk_128.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_no_causalmask_no_bias_no_dropout_maxk_128.cpp
index 137412fd9..a6523f6fd 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_no_causalmask_no_attnbias_no_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_no_causalmask_no_bias_no_dropout_maxk_128.cpp
@@ -1,11 +1,14 @@
+
 /*
  * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
  */
-#include <ck/ck.hpp>
 
+#include <ck/ck.hpp>
 #include "ck_tiled_fmha_grouped_forward.h"
 
 template void run_grouped_forward_causalmask_bias_dropout_dispatch<
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_no_causalmask_no_attnbias_no_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_no_causalmask_no_bias_no_dropout_maxk_256.cpp
similarity index 88%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_no_causalmask_no_attnbias_no_dropout_maxk_256.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_no_causalmask_no_bias_no_dropout_maxk_256.cpp
index a1fccefe0..c45de9a85 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_no_causalmask_no_attnbias_no_dropout_maxk_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_no_causalmask_no_bias_no_dropout_maxk_256.cpp
@@ -1,11 +1,14 @@
+
 /*
  * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
  */
-#include <ck/ck.hpp>
 
+#include <ck/ck.hpp>
 #include "ck_tiled_fmha_grouped_forward.h"
 
 template void run_grouped_forward_causalmask_bias_dropout_dispatch<
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_no_causalmask_no_attnbias_no_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_no_causalmask_no_bias_no_dropout_maxk_32.cpp
similarity index 88%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_no_causalmask_no_attnbias_no_dropout_maxk_32.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_no_causalmask_no_bias_no_dropout_maxk_32.cpp
index 273593b9d..aa482cddc 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_no_causalmask_no_attnbias_no_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_no_causalmask_no_bias_no_dropout_maxk_32.cpp
@@ -1,11 +1,14 @@
+
 /*
  * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
  */
-#include <ck/ck.hpp>
 
+#include <ck/ck.hpp>
 #include "ck_tiled_fmha_grouped_forward.h"
 
 template void run_grouped_forward_causalmask_bias_dropout_dispatch<
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_no_causalmask_no_attnbias_no_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_no_causalmask_no_bias_no_dropout_maxk_64.cpp
similarity index 88%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_no_causalmask_no_attnbias_no_dropout_maxk_64.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_no_causalmask_no_bias_no_dropout_maxk_64.cpp
index 8b638fa32..32c319a50 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_no_causalmask_no_attnbias_no_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_no_causalmask_no_bias_no_dropout_maxk_64.cpp
@@ -1,11 +1,14 @@
+
 /*
  * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
  */
-#include <ck/ck.hpp>
 
+#include <ck/ck.hpp>
 #include "ck_tiled_fmha_grouped_forward.h"
 
 template void run_grouped_forward_causalmask_bias_dropout_dispatch<
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_has_causalmask_has_attnbias_has_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_has_causalmask_has_bias_has_dropout_maxk_128.cpp
similarity index 88%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_has_causalmask_has_attnbias_has_dropout_maxk_128.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_has_causalmask_has_bias_has_dropout_maxk_128.cpp
index 32a098714..018cb72be 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_has_causalmask_has_attnbias_has_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_has_causalmask_has_bias_has_dropout_maxk_128.cpp
@@ -1,11 +1,14 @@
+
 /*
  * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
  */
-#include <ck/ck.hpp>
 
+#include <ck/ck.hpp>
 #include "ck_tiled_fmha_grouped_infer.h"
 
 template void run_grouped_infer_causalmask_bias_dropout_dispatch<
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_has_causalmask_has_attnbias_has_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_has_causalmask_has_bias_has_dropout_maxk_256.cpp
similarity index 88%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_has_causalmask_has_attnbias_has_dropout_maxk_256.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_has_causalmask_has_bias_has_dropout_maxk_256.cpp
index b67cc8ca6..faabed60a 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_has_causalmask_has_attnbias_has_dropout_maxk_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_has_causalmask_has_bias_has_dropout_maxk_256.cpp
@@ -1,11 +1,14 @@
+
 /*
  * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
  */
-#include <ck/ck.hpp>
 
+#include <ck/ck.hpp>
 #include "ck_tiled_fmha_grouped_infer.h"
 
 template void run_grouped_infer_causalmask_bias_dropout_dispatch<
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_has_causalmask_has_attnbias_has_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_has_causalmask_has_bias_has_dropout_maxk_32.cpp
similarity index 88%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_has_causalmask_has_attnbias_has_dropout_maxk_32.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_has_causalmask_has_bias_has_dropout_maxk_32.cpp
index 77ecf2f4a..c920dff22 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_has_causalmask_has_attnbias_has_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_has_causalmask_has_bias_has_dropout_maxk_32.cpp
@@ -1,11 +1,14 @@
+
 /*
  * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
  */
-#include <ck/ck.hpp>
 
+#include <ck/ck.hpp>
 #include "ck_tiled_fmha_grouped_infer.h"
 
 template void run_grouped_infer_causalmask_bias_dropout_dispatch<
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_has_causalmask_has_attnbias_has_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_has_causalmask_has_bias_has_dropout_maxk_64.cpp
similarity index 88%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_has_causalmask_has_attnbias_has_dropout_maxk_64.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_has_causalmask_has_bias_has_dropout_maxk_64.cpp
index efae07d30..4e8d812c8 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_has_causalmask_has_attnbias_has_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_has_causalmask_has_bias_has_dropout_maxk_64.cpp
@@ -1,11 +1,14 @@
+
 /*
  * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
  */
-#include <ck/ck.hpp>
 
+#include <ck/ck.hpp>
 #include "ck_tiled_fmha_grouped_infer.h"
 
 template void run_grouped_infer_causalmask_bias_dropout_dispatch<
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_has_causalmask_has_attnbias_no_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_has_causalmask_has_bias_no_dropout_maxk_128.cpp
similarity index 88%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_has_causalmask_has_attnbias_no_dropout_maxk_128.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_has_causalmask_has_bias_no_dropout_maxk_128.cpp
index b8221e500..06e096f9d 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_has_causalmask_has_attnbias_no_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_has_causalmask_has_bias_no_dropout_maxk_128.cpp
@@ -1,11 +1,14 @@
+
 /*
  * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
  */
-#include <ck/ck.hpp>
 
+#include <ck/ck.hpp>
 #include "ck_tiled_fmha_grouped_infer.h"
 
 template void run_grouped_infer_causalmask_bias_dropout_dispatch<
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_no_causalmask_has_attnbias_has_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_has_causalmask_has_bias_no_dropout_maxk_256.cpp
similarity index 88%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_no_causalmask_has_attnbias_has_dropout_maxk_256.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_has_causalmask_has_bias_no_dropout_maxk_256.cpp
index 47c79b1af..bdee87bc7 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_no_causalmask_has_attnbias_has_dropout_maxk_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_has_causalmask_has_bias_no_dropout_maxk_256.cpp
@@ -1,16 +1,19 @@
+
 /*
  * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
  */
-#include <ck/ck.hpp>
 
+#include <ck/ck.hpp>
 #include "ck_tiled_fmha_grouped_infer.h"
 
 template void run_grouped_infer_causalmask_bias_dropout_dispatch<
     ck::bhalf_t,
-    false,
     true,
     true,
+    false,
     256>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_has_causalmask_no_attnbias_has_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_has_causalmask_has_bias_no_dropout_maxk_32.cpp
similarity index 88%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_has_causalmask_no_attnbias_has_dropout_maxk_32.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_has_causalmask_has_bias_no_dropout_maxk_32.cpp
index 9c3081f7a..489521a75 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_has_causalmask_no_attnbias_has_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_has_causalmask_has_bias_no_dropout_maxk_32.cpp
@@ -1,16 +1,19 @@
+
 /*
  * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
  */
-#include <ck/ck.hpp>
 
+#include <ck/ck.hpp>
 #include "ck_tiled_fmha_grouped_infer.h"
 
 template void run_grouped_infer_causalmask_bias_dropout_dispatch<
     ck::bhalf_t,
     true,
-    false,
     true,
+    false,
     32>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_has_causalmask_has_attnbias_no_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_has_causalmask_has_bias_no_dropout_maxk_64.cpp
similarity index 88%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_has_causalmask_has_attnbias_no_dropout_maxk_64.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_has_causalmask_has_bias_no_dropout_maxk_64.cpp
index 078c81ca0..93211cdd1 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_has_causalmask_has_attnbias_no_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_has_causalmask_has_bias_no_dropout_maxk_64.cpp
@@ -1,11 +1,14 @@
+
 /*
  * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
  */
-#include <ck/ck.hpp>
 
+#include <ck/ck.hpp>
 #include "ck_tiled_fmha_grouped_infer.h"
 
 template void run_grouped_infer_causalmask_bias_dropout_dispatch<
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_has_causalmask_no_attnbias_has_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_has_causalmask_no_bias_has_dropout_maxk_128.cpp
similarity index 88%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_has_causalmask_no_attnbias_has_dropout_maxk_128.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_has_causalmask_no_bias_has_dropout_maxk_128.cpp
index 13205e8c4..e3a658748 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_has_causalmask_no_attnbias_has_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_has_causalmask_no_bias_has_dropout_maxk_128.cpp
@@ -1,11 +1,14 @@
+
 /*
  * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
  */
-#include <ck/ck.hpp>
 
+#include <ck/ck.hpp>
 #include "ck_tiled_fmha_grouped_infer.h"
 
 template void run_grouped_infer_causalmask_bias_dropout_dispatch<
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_has_causalmask_no_attnbias_has_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_has_causalmask_no_bias_has_dropout_maxk_256.cpp
similarity index 88%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_has_causalmask_no_attnbias_has_dropout_maxk_256.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_has_causalmask_no_bias_has_dropout_maxk_256.cpp
index e399bfbce..3fa6d85bd 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_has_causalmask_no_attnbias_has_dropout_maxk_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_has_causalmask_no_bias_has_dropout_maxk_256.cpp
@@ -1,11 +1,14 @@
+
 /*
  * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
  */
-#include <ck/ck.hpp>
 
+#include <ck/ck.hpp>
 #include "ck_tiled_fmha_grouped_infer.h"
 
 template void run_grouped_infer_causalmask_bias_dropout_dispatch<
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_no_causalmask_has_attnbias_has_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_has_causalmask_no_bias_has_dropout_maxk_32.cpp
similarity index 88%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_no_causalmask_has_attnbias_has_dropout_maxk_32.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_has_causalmask_no_bias_has_dropout_maxk_32.cpp
index 463a621af..3b5614f0e 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_no_causalmask_has_attnbias_has_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_has_causalmask_no_bias_has_dropout_maxk_32.cpp
@@ -1,16 +1,19 @@
+
 /*
  * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
  */
-#include <ck/ck.hpp>
 
+#include <ck/ck.hpp>
 #include "ck_tiled_fmha_grouped_infer.h"
 
 template void run_grouped_infer_causalmask_bias_dropout_dispatch<
     ck::bhalf_t,
-    false,
     true,
+    false,
     true,
     32>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_has_causalmask_no_attnbias_has_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_has_causalmask_no_bias_has_dropout_maxk_64.cpp
similarity index 88%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_has_causalmask_no_attnbias_has_dropout_maxk_64.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_has_causalmask_no_bias_has_dropout_maxk_64.cpp
index 60e847191..b33221834 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_has_causalmask_no_attnbias_has_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_has_causalmask_no_bias_has_dropout_maxk_64.cpp
@@ -1,11 +1,14 @@
+
 /*
  * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
  */
-#include <ck/ck.hpp>
 
+#include <ck/ck.hpp>
 #include "ck_tiled_fmha_grouped_infer.h"
 
 template void run_grouped_infer_causalmask_bias_dropout_dispatch<
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_no_causalmask_has_attnbias_no_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_has_causalmask_no_bias_no_dropout_maxk_128.cpp
similarity index 88%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_no_causalmask_has_attnbias_no_dropout_maxk_128.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_has_causalmask_no_bias_no_dropout_maxk_128.cpp
index e25c9ece7..0af311aa8 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_no_causalmask_has_attnbias_no_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_has_causalmask_no_bias_no_dropout_maxk_128.cpp
@@ -1,16 +1,19 @@
+
 /*
  * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
  */
-#include <ck/ck.hpp>
 
+#include <ck/ck.hpp>
 #include "ck_tiled_fmha_grouped_infer.h"
 
 template void run_grouped_infer_causalmask_bias_dropout_dispatch<
     ck::bhalf_t,
-    false,
     true,
     false,
+    false,
     128>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_no_causalmask_has_attnbias_no_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_has_causalmask_no_bias_no_dropout_maxk_256.cpp
similarity index 88%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_no_causalmask_has_attnbias_no_dropout_maxk_256.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_has_causalmask_no_bias_no_dropout_maxk_256.cpp
index 093395947..d68e89d55 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_no_causalmask_has_attnbias_no_dropout_maxk_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_has_causalmask_no_bias_no_dropout_maxk_256.cpp
@@ -1,16 +1,19 @@
+
 /*
  * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
  */
-#include <ck/ck.hpp>
 
+#include <ck/ck.hpp>
 #include "ck_tiled_fmha_grouped_infer.h"
 
 template void run_grouped_infer_causalmask_bias_dropout_dispatch<
     ck::bhalf_t,
-    false,
     true,
     false,
+    false,
     256>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_no_causalmask_has_attnbias_no_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_has_causalmask_no_bias_no_dropout_maxk_32.cpp
similarity index 88%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_no_causalmask_has_attnbias_no_dropout_maxk_32.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_has_causalmask_no_bias_no_dropout_maxk_32.cpp
index 3724a2886..ea765be5e 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_no_causalmask_has_attnbias_no_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_has_causalmask_no_bias_no_dropout_maxk_32.cpp
@@ -1,16 +1,19 @@
+
 /*
  * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
  */
-#include <ck/ck.hpp>
 
+#include <ck/ck.hpp>
 #include "ck_tiled_fmha_grouped_infer.h"
 
 template void run_grouped_infer_causalmask_bias_dropout_dispatch<
     ck::bhalf_t,
-    false,
     true,
     false,
+    false,
     32>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_no_causalmask_has_attnbias_no_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_has_causalmask_no_bias_no_dropout_maxk_64.cpp
similarity index 88%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_no_causalmask_has_attnbias_no_dropout_maxk_64.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_has_causalmask_no_bias_no_dropout_maxk_64.cpp
index a96ab0ce5..ee1dbceea 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_no_causalmask_has_attnbias_no_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_has_causalmask_no_bias_no_dropout_maxk_64.cpp
@@ -1,16 +1,19 @@
+
 /*
  * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
  */
-#include <ck/ck.hpp>
 
+#include <ck/ck.hpp>
 #include "ck_tiled_fmha_grouped_infer.h"
 
 template void run_grouped_infer_causalmask_bias_dropout_dispatch<
     ck::bhalf_t,
-    false,
     true,
     false,
+    false,
     64>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_no_causalmask_has_attnbias_has_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_no_causalmask_has_bias_has_dropout_maxk_128.cpp
similarity index 88%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_no_causalmask_has_attnbias_has_dropout_maxk_128.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_no_causalmask_has_bias_has_dropout_maxk_128.cpp
index 5b000a628..055c3ddf6 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_no_causalmask_has_attnbias_has_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_no_causalmask_has_bias_has_dropout_maxk_128.cpp
@@ -1,11 +1,14 @@
+
 /*
  * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
  */
-#include <ck/ck.hpp>
 
+#include <ck/ck.hpp>
 #include "ck_tiled_fmha_grouped_infer.h"
 
 template void run_grouped_infer_causalmask_bias_dropout_dispatch<
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_has_causalmask_has_attnbias_no_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_no_causalmask_has_bias_has_dropout_maxk_256.cpp
similarity index 88%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_has_causalmask_has_attnbias_no_dropout_maxk_256.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_no_causalmask_has_bias_has_dropout_maxk_256.cpp
index 8f5458f9a..f2611fd2c 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_has_causalmask_has_attnbias_no_dropout_maxk_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_no_causalmask_has_bias_has_dropout_maxk_256.cpp
@@ -1,16 +1,19 @@
+
 /*
  * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
  */
-#include <ck/ck.hpp>
 
+#include <ck/ck.hpp>
 #include "ck_tiled_fmha_grouped_infer.h"
 
 template void run_grouped_infer_causalmask_bias_dropout_dispatch<
     ck::bhalf_t,
+    false,
     true,
     true,
-    false,
     256>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_has_causalmask_has_attnbias_no_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_no_causalmask_has_bias_has_dropout_maxk_32.cpp
similarity index 88%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_has_causalmask_has_attnbias_no_dropout_maxk_32.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_no_causalmask_has_bias_has_dropout_maxk_32.cpp
index d64878a93..4909cfa45 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_has_causalmask_has_attnbias_no_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_no_causalmask_has_bias_has_dropout_maxk_32.cpp
@@ -1,16 +1,19 @@
+
 /*
  * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
  */
-#include <ck/ck.hpp>
 
+#include <ck/ck.hpp>
 #include "ck_tiled_fmha_grouped_infer.h"
 
 template void run_grouped_infer_causalmask_bias_dropout_dispatch<
     ck::bhalf_t,
+    false,
     true,
     true,
-    false,
     32>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_no_causalmask_has_attnbias_has_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_no_causalmask_has_bias_has_dropout_maxk_64.cpp
similarity index 88%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_no_causalmask_has_attnbias_has_dropout_maxk_64.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_no_causalmask_has_bias_has_dropout_maxk_64.cpp
index f53906c82..4705a9d4e 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_no_causalmask_has_attnbias_has_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_no_causalmask_has_bias_has_dropout_maxk_64.cpp
@@ -1,11 +1,14 @@
+
 /*
  * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
  */
-#include <ck/ck.hpp>
 
+#include <ck/ck.hpp>
 #include "ck_tiled_fmha_grouped_infer.h"
 
 template void run_grouped_infer_causalmask_bias_dropout_dispatch<
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_no_causalmask_no_attnbias_has_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_no_causalmask_has_bias_no_dropout_maxk_128.cpp
similarity index 88%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_no_causalmask_no_attnbias_has_dropout_maxk_128.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_no_causalmask_has_bias_no_dropout_maxk_128.cpp
index f18bf1e8f..ad7ce669e 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_no_causalmask_no_attnbias_has_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_no_causalmask_has_bias_no_dropout_maxk_128.cpp
@@ -1,16 +1,19 @@
+
 /*
  * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
  */
-#include <ck/ck.hpp>
 
+#include <ck/ck.hpp>
 #include "ck_tiled_fmha_grouped_infer.h"
 
 template void run_grouped_infer_causalmask_bias_dropout_dispatch<
     ck::bhalf_t,
     false,
-    false,
     true,
+    false,
     128>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_no_causalmask_no_attnbias_has_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_no_causalmask_has_bias_no_dropout_maxk_256.cpp
similarity index 88%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_no_causalmask_no_attnbias_has_dropout_maxk_256.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_no_causalmask_has_bias_no_dropout_maxk_256.cpp
index cd0336e0d..83e19ecfc 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_no_causalmask_no_attnbias_has_dropout_maxk_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_no_causalmask_has_bias_no_dropout_maxk_256.cpp
@@ -1,16 +1,19 @@
+
 /*
  * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
  */
-#include <ck/ck.hpp>
 
+#include <ck/ck.hpp>
 #include "ck_tiled_fmha_grouped_infer.h"
 
 template void run_grouped_infer_causalmask_bias_dropout_dispatch<
     ck::bhalf_t,
     false,
-    false,
     true,
+    false,
     256>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_no_causalmask_no_attnbias_has_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_no_causalmask_has_bias_no_dropout_maxk_32.cpp
similarity index 88%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_no_causalmask_no_attnbias_has_dropout_maxk_32.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_no_causalmask_has_bias_no_dropout_maxk_32.cpp
index baf202b49..a1c40a7f2 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_no_causalmask_no_attnbias_has_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_no_causalmask_has_bias_no_dropout_maxk_32.cpp
@@ -1,16 +1,19 @@
+
 /*
  * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
  */
-#include <ck/ck.hpp>
 
+#include <ck/ck.hpp>
 #include "ck_tiled_fmha_grouped_infer.h"
 
 template void run_grouped_infer_causalmask_bias_dropout_dispatch<
     ck::bhalf_t,
     false,
-    false,
     true,
+    false,
     32>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_no_causalmask_no_attnbias_has_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_no_causalmask_has_bias_no_dropout_maxk_64.cpp
similarity index 88%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_no_causalmask_no_attnbias_has_dropout_maxk_64.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_no_causalmask_has_bias_no_dropout_maxk_64.cpp
index 65c0c923d..37b634b55 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_no_causalmask_no_attnbias_has_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_no_causalmask_has_bias_no_dropout_maxk_64.cpp
@@ -1,16 +1,19 @@
+
 /*
  * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
  */
-#include <ck/ck.hpp>
 
+#include <ck/ck.hpp>
 #include "ck_tiled_fmha_grouped_infer.h"
 
 template void run_grouped_infer_causalmask_bias_dropout_dispatch<
     ck::bhalf_t,
     false,
-    false,
     true,
+    false,
     64>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_has_causalmask_no_attnbias_no_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_no_causalmask_no_bias_has_dropout_maxk_128.cpp
similarity index 88%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_has_causalmask_no_attnbias_no_dropout_maxk_128.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_no_causalmask_no_bias_has_dropout_maxk_128.cpp
index f030cbb00..85f34fba8 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_has_causalmask_no_attnbias_no_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_no_causalmask_no_bias_has_dropout_maxk_128.cpp
@@ -1,16 +1,19 @@
+
 /*
  * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
  */
-#include <ck/ck.hpp>
 
+#include <ck/ck.hpp>
 #include "ck_tiled_fmha_grouped_infer.h"
 
 template void run_grouped_infer_causalmask_bias_dropout_dispatch<
     ck::bhalf_t,
-    true,
     false,
     false,
+    true,
     128>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_has_causalmask_no_attnbias_no_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_no_causalmask_no_bias_has_dropout_maxk_256.cpp
similarity index 88%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_has_causalmask_no_attnbias_no_dropout_maxk_256.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_no_causalmask_no_bias_has_dropout_maxk_256.cpp
index efc5b625a..69835203f 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_has_causalmask_no_attnbias_no_dropout_maxk_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_no_causalmask_no_bias_has_dropout_maxk_256.cpp
@@ -1,16 +1,19 @@
+
 /*
  * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
  */
-#include <ck/ck.hpp>
 
+#include <ck/ck.hpp>
 #include "ck_tiled_fmha_grouped_infer.h"
 
 template void run_grouped_infer_causalmask_bias_dropout_dispatch<
     ck::bhalf_t,
-    true,
     false,
     false,
+    true,
     256>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_has_causalmask_no_attnbias_no_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_no_causalmask_no_bias_has_dropout_maxk_32.cpp
similarity index 88%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_has_causalmask_no_attnbias_no_dropout_maxk_32.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_no_causalmask_no_bias_has_dropout_maxk_32.cpp
index 0b7037cec..7fa077699 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_has_causalmask_no_attnbias_no_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_no_causalmask_no_bias_has_dropout_maxk_32.cpp
@@ -1,16 +1,19 @@
+
 /*
  * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
  */
-#include <ck/ck.hpp>
 
+#include <ck/ck.hpp>
 #include "ck_tiled_fmha_grouped_infer.h"
 
 template void run_grouped_infer_causalmask_bias_dropout_dispatch<
     ck::bhalf_t,
-    true,
     false,
     false,
+    true,
     32>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_has_causalmask_no_attnbias_no_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_no_causalmask_no_bias_has_dropout_maxk_64.cpp
similarity index 88%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_has_causalmask_no_attnbias_no_dropout_maxk_64.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_no_causalmask_no_bias_has_dropout_maxk_64.cpp
index 7301fdb10..dc34c1a04 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_has_causalmask_no_attnbias_no_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_no_causalmask_no_bias_has_dropout_maxk_64.cpp
@@ -1,16 +1,19 @@
+
 /*
  * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
  */
-#include <ck/ck.hpp>
 
+#include <ck/ck.hpp>
 #include "ck_tiled_fmha_grouped_infer.h"
 
 template void run_grouped_infer_causalmask_bias_dropout_dispatch<
     ck::bhalf_t,
-    true,
     false,
     false,
+    true,
     64>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_no_causalmask_no_attnbias_no_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_no_causalmask_no_bias_no_dropout_maxk_128.cpp
similarity index 88%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_no_causalmask_no_attnbias_no_dropout_maxk_128.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_no_causalmask_no_bias_no_dropout_maxk_128.cpp
index c9c1b385b..5d75d9437 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_no_causalmask_no_attnbias_no_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_no_causalmask_no_bias_no_dropout_maxk_128.cpp
@@ -1,11 +1,14 @@
+
 /*
  * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
  */
-#include <ck/ck.hpp>
 
+#include <ck/ck.hpp>
 #include "ck_tiled_fmha_grouped_infer.h"
 
 template void run_grouped_infer_causalmask_bias_dropout_dispatch<
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_no_causalmask_no_attnbias_no_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_no_causalmask_no_bias_no_dropout_maxk_256.cpp
similarity index 88%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_no_causalmask_no_attnbias_no_dropout_maxk_256.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_no_causalmask_no_bias_no_dropout_maxk_256.cpp
index 4a5e084d9..9af2dd0ac 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_no_causalmask_no_attnbias_no_dropout_maxk_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_no_causalmask_no_bias_no_dropout_maxk_256.cpp
@@ -1,11 +1,14 @@
+
 /*
  * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
  */
-#include <ck/ck.hpp>
 
+#include <ck/ck.hpp>
 #include "ck_tiled_fmha_grouped_infer.h"
 
 template void run_grouped_infer_causalmask_bias_dropout_dispatch<
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_no_causalmask_no_attnbias_no_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_no_causalmask_no_bias_no_dropout_maxk_32.cpp
similarity index 88%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_no_causalmask_no_attnbias_no_dropout_maxk_32.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_no_causalmask_no_bias_no_dropout_maxk_32.cpp
index ae7440bf9..92bc89ea5 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_no_causalmask_no_attnbias_no_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_no_causalmask_no_bias_no_dropout_maxk_32.cpp
@@ -1,11 +1,14 @@
+
 /*
  * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
  */
-#include <ck/ck.hpp>
 
+#include <ck/ck.hpp>
 #include "ck_tiled_fmha_grouped_infer.h"
 
 template void run_grouped_infer_causalmask_bias_dropout_dispatch<
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_no_causalmask_no_attnbias_no_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_no_causalmask_no_bias_no_dropout_maxk_64.cpp
similarity index 88%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_no_causalmask_no_attnbias_no_dropout_maxk_64.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_no_causalmask_no_bias_no_dropout_maxk_64.cpp
index 5f6048cbb..a2b3fd2a3 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_no_causalmask_no_attnbias_no_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_no_causalmask_no_bias_no_dropout_maxk_64.cpp
@@ -1,11 +1,14 @@
+
 /*
  * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
  */
-#include <ck/ck.hpp>
 
+#include <ck/ck.hpp>
 #include "ck_tiled_fmha_grouped_infer.h"
 
 template void run_grouped_infer_causalmask_bias_dropout_dispatch<
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_has_causalmask_has_attnbias_has_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_has_causalmask_has_bias_has_dropout_maxk_128.cpp
similarity index 88%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_has_causalmask_has_attnbias_has_dropout_maxk_128.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_has_causalmask_has_bias_has_dropout_maxk_128.cpp
index 0ea9c2176..916786bff 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_has_causalmask_has_attnbias_has_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_has_causalmask_has_bias_has_dropout_maxk_128.cpp
@@ -1,11 +1,14 @@
+
 /*
  * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
  */
-#include <ck/ck.hpp>
 
+#include <ck/ck.hpp>
 #include "ck_tiled_fmha_grouped_infer.h"
 
 template void run_grouped_infer_causalmask_bias_dropout_dispatch<
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_has_causalmask_has_attnbias_has_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_has_causalmask_has_bias_has_dropout_maxk_256.cpp
similarity index 88%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_has_causalmask_has_attnbias_has_dropout_maxk_256.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_has_causalmask_has_bias_has_dropout_maxk_256.cpp
index bc668d784..dac24a533 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_has_causalmask_has_attnbias_has_dropout_maxk_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_has_causalmask_has_bias_has_dropout_maxk_256.cpp
@@ -1,11 +1,14 @@
+
 /*
  * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
  */
-#include <ck/ck.hpp>
 
+#include <ck/ck.hpp>
 #include "ck_tiled_fmha_grouped_infer.h"
 
 template void run_grouped_infer_causalmask_bias_dropout_dispatch<
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_has_causalmask_has_attnbias_has_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_has_causalmask_has_bias_has_dropout_maxk_32.cpp
similarity index 88%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_has_causalmask_has_attnbias_has_dropout_maxk_32.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_has_causalmask_has_bias_has_dropout_maxk_32.cpp
index f2375b0a7..c99321f42 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_has_causalmask_has_attnbias_has_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_has_causalmask_has_bias_has_dropout_maxk_32.cpp
@@ -1,11 +1,14 @@
+
 /*
  * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
  */
-#include <ck/ck.hpp>
 
+#include <ck/ck.hpp>
 #include "ck_tiled_fmha_grouped_infer.h"
 
 template void run_grouped_infer_causalmask_bias_dropout_dispatch<
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_has_causalmask_has_attnbias_has_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_has_causalmask_has_bias_has_dropout_maxk_64.cpp
similarity index 88%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_has_causalmask_has_attnbias_has_dropout_maxk_64.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_has_causalmask_has_bias_has_dropout_maxk_64.cpp
index 66de4bf3d..306b2de2a 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_has_causalmask_has_attnbias_has_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_has_causalmask_has_bias_has_dropout_maxk_64.cpp
@@ -1,11 +1,14 @@
+
 /*
  * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
  */
-#include <ck/ck.hpp>
 
+#include <ck/ck.hpp>
 #include "ck_tiled_fmha_grouped_infer.h"
 
 template void run_grouped_infer_causalmask_bias_dropout_dispatch<
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_has_causalmask_has_attnbias_no_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_has_causalmask_has_bias_no_dropout_maxk_128.cpp
similarity index 88%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_has_causalmask_has_attnbias_no_dropout_maxk_128.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_has_causalmask_has_bias_no_dropout_maxk_128.cpp
index dce9620da..5a8431fe5 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_has_causalmask_has_attnbias_no_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_has_causalmask_has_bias_no_dropout_maxk_128.cpp
@@ -1,11 +1,14 @@
+
 /*
  * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
  */
-#include <ck/ck.hpp>
 
+#include <ck/ck.hpp>
 #include "ck_tiled_fmha_grouped_infer.h"
 
 template void run_grouped_infer_causalmask_bias_dropout_dispatch<
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_has_causalmask_has_attnbias_no_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_has_causalmask_has_bias_no_dropout_maxk_256.cpp
similarity index 88%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_has_causalmask_has_attnbias_no_dropout_maxk_256.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_has_causalmask_has_bias_no_dropout_maxk_256.cpp
index eaa255d2a..29d76c352 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_has_causalmask_has_attnbias_no_dropout_maxk_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_has_causalmask_has_bias_no_dropout_maxk_256.cpp
@@ -1,11 +1,14 @@
+
 /*
  * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
  */
-#include <ck/ck.hpp>
 
+#include <ck/ck.hpp>
 #include "ck_tiled_fmha_grouped_infer.h"
 
 template void run_grouped_infer_causalmask_bias_dropout_dispatch<
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_has_causalmask_has_attnbias_no_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_has_causalmask_has_bias_no_dropout_maxk_32.cpp
similarity index 88%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_has_causalmask_has_attnbias_no_dropout_maxk_32.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_has_causalmask_has_bias_no_dropout_maxk_32.cpp
index 1c1cee370..9475e9edd 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_has_causalmask_has_attnbias_no_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_has_causalmask_has_bias_no_dropout_maxk_32.cpp
@@ -1,11 +1,14 @@
+
 /*
  * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
  */
-#include <ck/ck.hpp>
 
+#include <ck/ck.hpp>
 #include "ck_tiled_fmha_grouped_infer.h"
 
 template void run_grouped_infer_causalmask_bias_dropout_dispatch<
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_has_causalmask_has_attnbias_no_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_has_causalmask_has_bias_no_dropout_maxk_64.cpp
similarity index 88%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_has_causalmask_has_attnbias_no_dropout_maxk_64.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_has_causalmask_has_bias_no_dropout_maxk_64.cpp
index 53434b15a..adb2f5ad1 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_has_causalmask_has_attnbias_no_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_has_causalmask_has_bias_no_dropout_maxk_64.cpp
@@ -1,11 +1,14 @@
+
 /*
  * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
  */
-#include <ck/ck.hpp>
 
+#include <ck/ck.hpp>
 #include "ck_tiled_fmha_grouped_infer.h"
 
 template void run_grouped_infer_causalmask_bias_dropout_dispatch<
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_has_causalmask_no_attnbias_has_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_has_causalmask_no_bias_has_dropout_maxk_128.cpp
similarity index 88%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_has_causalmask_no_attnbias_has_dropout_maxk_128.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_has_causalmask_no_bias_has_dropout_maxk_128.cpp
index 5a2c266d6..524a21c34 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_has_causalmask_no_attnbias_has_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_has_causalmask_no_bias_has_dropout_maxk_128.cpp
@@ -1,11 +1,14 @@
+
 /*
  * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
  */
-#include <ck/ck.hpp>
 
+#include <ck/ck.hpp>
 #include "ck_tiled_fmha_grouped_infer.h"
 
 template void run_grouped_infer_causalmask_bias_dropout_dispatch<
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_has_causalmask_no_attnbias_has_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_has_causalmask_no_bias_has_dropout_maxk_256.cpp
similarity index 88%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_has_causalmask_no_attnbias_has_dropout_maxk_256.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_has_causalmask_no_bias_has_dropout_maxk_256.cpp
index e8f0b6908..12eb1d0e5 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_has_causalmask_no_attnbias_has_dropout_maxk_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_has_causalmask_no_bias_has_dropout_maxk_256.cpp
@@ -1,11 +1,14 @@
+
 /*
  * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
  */
-#include <ck/ck.hpp>
 
+#include <ck/ck.hpp>
 #include "ck_tiled_fmha_grouped_infer.h"
 
 template void run_grouped_infer_causalmask_bias_dropout_dispatch<
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_has_causalmask_no_attnbias_has_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_has_causalmask_no_bias_has_dropout_maxk_32.cpp
similarity index 88%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_has_causalmask_no_attnbias_has_dropout_maxk_32.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_has_causalmask_no_bias_has_dropout_maxk_32.cpp
index b316aa818..26f6190d8 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_has_causalmask_no_attnbias_has_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_has_causalmask_no_bias_has_dropout_maxk_32.cpp
@@ -1,11 +1,14 @@
+
 /*
  * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
  */
-#include <ck/ck.hpp>
 
+#include <ck/ck.hpp>
 #include "ck_tiled_fmha_grouped_infer.h"
 
 template void run_grouped_infer_causalmask_bias_dropout_dispatch<
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_has_causalmask_no_attnbias_has_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_has_causalmask_no_bias_has_dropout_maxk_64.cpp
similarity index 88%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_has_causalmask_no_attnbias_has_dropout_maxk_64.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_has_causalmask_no_bias_has_dropout_maxk_64.cpp
index 3cc34095b..111473c7e 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_has_causalmask_no_attnbias_has_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_has_causalmask_no_bias_has_dropout_maxk_64.cpp
@@ -1,11 +1,14 @@
+
 /*
  * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
  */
-#include <ck/ck.hpp>
 
+#include <ck/ck.hpp>
 #include "ck_tiled_fmha_grouped_infer.h"
 
 template void run_grouped_infer_causalmask_bias_dropout_dispatch<
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_no_causalmask_has_attnbias_no_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_has_causalmask_no_bias_no_dropout_maxk_128.cpp
similarity index 88%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_no_causalmask_has_attnbias_no_dropout_maxk_128.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_has_causalmask_no_bias_no_dropout_maxk_128.cpp
index 1c9c324f6..9adb10a8c 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_no_causalmask_has_attnbias_no_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_has_causalmask_no_bias_no_dropout_maxk_128.cpp
@@ -1,16 +1,19 @@
+
 /*
  * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
  */
-#include <ck/ck.hpp>
 
+#include <ck/ck.hpp>
 #include "ck_tiled_fmha_grouped_infer.h"
 
 template void run_grouped_infer_causalmask_bias_dropout_dispatch<
     ck::half_t,
-    false,
     true,
     false,
+    false,
     128>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_no_causalmask_has_attnbias_no_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_has_causalmask_no_bias_no_dropout_maxk_256.cpp
similarity index 88%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_no_causalmask_has_attnbias_no_dropout_maxk_256.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_has_causalmask_no_bias_no_dropout_maxk_256.cpp
index e08afd8c0..6b7f35fa4 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_no_causalmask_has_attnbias_no_dropout_maxk_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_has_causalmask_no_bias_no_dropout_maxk_256.cpp
@@ -1,16 +1,19 @@
+
 /*
  * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
  */
-#include <ck/ck.hpp>
 
+#include <ck/ck.hpp>
 #include "ck_tiled_fmha_grouped_infer.h"
 
 template void run_grouped_infer_causalmask_bias_dropout_dispatch<
     ck::half_t,
-    false,
     true,
     false,
+    false,
     256>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_no_causalmask_has_attnbias_no_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_has_causalmask_no_bias_no_dropout_maxk_32.cpp
similarity index 88%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_no_causalmask_has_attnbias_no_dropout_maxk_32.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_has_causalmask_no_bias_no_dropout_maxk_32.cpp
index 3289a3109..e89cffda5 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_no_causalmask_has_attnbias_no_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_has_causalmask_no_bias_no_dropout_maxk_32.cpp
@@ -1,16 +1,19 @@
+
 /*
  * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
  */
-#include <ck/ck.hpp>
 
+#include <ck/ck.hpp>
 #include "ck_tiled_fmha_grouped_infer.h"
 
 template void run_grouped_infer_causalmask_bias_dropout_dispatch<
     ck::half_t,
-    false,
     true,
     false,
+    false,
     32>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_no_causalmask_has_attnbias_no_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_has_causalmask_no_bias_no_dropout_maxk_64.cpp
similarity index 88%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_no_causalmask_has_attnbias_no_dropout_maxk_64.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_has_causalmask_no_bias_no_dropout_maxk_64.cpp
index 1c6cd7d3e..7b4552d93 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_no_causalmask_has_attnbias_no_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_has_causalmask_no_bias_no_dropout_maxk_64.cpp
@@ -1,16 +1,19 @@
+
 /*
  * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
  */
-#include <ck/ck.hpp>
 
+#include <ck/ck.hpp>
 #include "ck_tiled_fmha_grouped_infer.h"
 
 template void run_grouped_infer_causalmask_bias_dropout_dispatch<
     ck::half_t,
-    false,
     true,
     false,
+    false,
     64>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_no_causalmask_has_attnbias_has_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_no_causalmask_has_bias_has_dropout_maxk_128.cpp
similarity index 88%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_no_causalmask_has_attnbias_has_dropout_maxk_128.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_no_causalmask_has_bias_has_dropout_maxk_128.cpp
index 886537fad..734b7e5a0 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_no_causalmask_has_attnbias_has_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_no_causalmask_has_bias_has_dropout_maxk_128.cpp
@@ -1,11 +1,14 @@
+
 /*
  * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
  */
-#include <ck/ck.hpp>
 
+#include <ck/ck.hpp>
 #include "ck_tiled_fmha_grouped_infer.h"
 
 template void run_grouped_infer_causalmask_bias_dropout_dispatch<
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_no_causalmask_has_attnbias_has_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_no_causalmask_has_bias_has_dropout_maxk_256.cpp
similarity index 88%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_no_causalmask_has_attnbias_has_dropout_maxk_256.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_no_causalmask_has_bias_has_dropout_maxk_256.cpp
index 3d72a5909..2644e4796 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_no_causalmask_has_attnbias_has_dropout_maxk_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_no_causalmask_has_bias_has_dropout_maxk_256.cpp
@@ -1,11 +1,14 @@
+
 /*
  * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
  */
-#include <ck/ck.hpp>
 
+#include <ck/ck.hpp>
 #include "ck_tiled_fmha_grouped_infer.h"
 
 template void run_grouped_infer_causalmask_bias_dropout_dispatch<
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_no_causalmask_has_attnbias_has_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_no_causalmask_has_bias_has_dropout_maxk_32.cpp
similarity index 88%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_no_causalmask_has_attnbias_has_dropout_maxk_32.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_no_causalmask_has_bias_has_dropout_maxk_32.cpp
index 822dabadd..cba7af09d 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_no_causalmask_has_attnbias_has_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_no_causalmask_has_bias_has_dropout_maxk_32.cpp
@@ -1,11 +1,14 @@
+
 /*
  * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
  */
-#include <ck/ck.hpp>
 
+#include <ck/ck.hpp>
 #include "ck_tiled_fmha_grouped_infer.h"
 
 template void run_grouped_infer_causalmask_bias_dropout_dispatch<
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_no_causalmask_has_attnbias_has_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_no_causalmask_has_bias_has_dropout_maxk_64.cpp
similarity index 88%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_no_causalmask_has_attnbias_has_dropout_maxk_64.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_no_causalmask_has_bias_has_dropout_maxk_64.cpp
index 8ad64cd69..1755388bb 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_no_causalmask_has_attnbias_has_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_no_causalmask_has_bias_has_dropout_maxk_64.cpp
@@ -1,11 +1,14 @@
+
 /*
  * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
  */
-#include <ck/ck.hpp>
 
+#include <ck/ck.hpp>
 #include "ck_tiled_fmha_grouped_infer.h"
 
 template void run_grouped_infer_causalmask_bias_dropout_dispatch<
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_has_causalmask_no_attnbias_no_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_no_causalmask_has_bias_no_dropout_maxk_128.cpp
similarity index 88%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_has_causalmask_no_attnbias_no_dropout_maxk_128.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_no_causalmask_has_bias_no_dropout_maxk_128.cpp
index 069aa9ed6..24074346e 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_has_causalmask_no_attnbias_no_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_no_causalmask_has_bias_no_dropout_maxk_128.cpp
@@ -1,16 +1,19 @@
+
 /*
  * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
  */
-#include <ck/ck.hpp>
 
+#include <ck/ck.hpp>
 #include "ck_tiled_fmha_grouped_infer.h"
 
 template void run_grouped_infer_causalmask_bias_dropout_dispatch<
     ck::half_t,
-    true,
     false,
+    true,
     false,
     128>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_has_causalmask_no_attnbias_no_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_no_causalmask_has_bias_no_dropout_maxk_256.cpp
similarity index 88%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_has_causalmask_no_attnbias_no_dropout_maxk_256.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_no_causalmask_has_bias_no_dropout_maxk_256.cpp
index d09b9b0c0..609ee02ec 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_has_causalmask_no_attnbias_no_dropout_maxk_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_no_causalmask_has_bias_no_dropout_maxk_256.cpp
@@ -1,16 +1,19 @@
+
 /*
  * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
  */
-#include <ck/ck.hpp>
 
+#include <ck/ck.hpp>
 #include "ck_tiled_fmha_grouped_infer.h"
 
 template void run_grouped_infer_causalmask_bias_dropout_dispatch<
     ck::half_t,
-    true,
     false,
+    true,
     false,
     256>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_has_causalmask_no_attnbias_no_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_no_causalmask_has_bias_no_dropout_maxk_32.cpp
similarity index 88%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_has_causalmask_no_attnbias_no_dropout_maxk_32.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_no_causalmask_has_bias_no_dropout_maxk_32.cpp
index 64d6034b4..56debfe4d 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_has_causalmask_no_attnbias_no_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_no_causalmask_has_bias_no_dropout_maxk_32.cpp
@@ -1,16 +1,19 @@
+
 /*
  * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
  */
-#include <ck/ck.hpp>
 
+#include <ck/ck.hpp>
 #include "ck_tiled_fmha_grouped_infer.h"
 
 template void run_grouped_infer_causalmask_bias_dropout_dispatch<
     ck::half_t,
-    true,
     false,
+    true,
     false,
     32>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_has_causalmask_no_attnbias_no_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_no_causalmask_has_bias_no_dropout_maxk_64.cpp
similarity index 88%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_has_causalmask_no_attnbias_no_dropout_maxk_64.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_no_causalmask_has_bias_no_dropout_maxk_64.cpp
index fac8e1cfa..454733419 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_has_causalmask_no_attnbias_no_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_no_causalmask_has_bias_no_dropout_maxk_64.cpp
@@ -1,16 +1,19 @@
+
 /*
  * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
  */
-#include <ck/ck.hpp>
 
+#include <ck/ck.hpp>
 #include "ck_tiled_fmha_grouped_infer.h"
 
 template void run_grouped_infer_causalmask_bias_dropout_dispatch<
     ck::half_t,
-    true,
     false,
+    true,
     false,
     64>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_no_causalmask_no_attnbias_has_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_no_causalmask_no_bias_has_dropout_maxk_128.cpp
similarity index 88%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_no_causalmask_no_attnbias_has_dropout_maxk_128.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_no_causalmask_no_bias_has_dropout_maxk_128.cpp
index fbf764fc5..de325b10c 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_no_causalmask_no_attnbias_has_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_no_causalmask_no_bias_has_dropout_maxk_128.cpp
@@ -1,11 +1,14 @@
+
 /*
  * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
  */
-#include <ck/ck.hpp>
 
+#include <ck/ck.hpp>
 #include "ck_tiled_fmha_grouped_infer.h"
 
 template void run_grouped_infer_causalmask_bias_dropout_dispatch<
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_no_causalmask_no_attnbias_has_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_no_causalmask_no_bias_has_dropout_maxk_256.cpp
similarity index 88%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_no_causalmask_no_attnbias_has_dropout_maxk_256.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_no_causalmask_no_bias_has_dropout_maxk_256.cpp
index 5fed583d5..40754cdd3 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_no_causalmask_no_attnbias_has_dropout_maxk_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_no_causalmask_no_bias_has_dropout_maxk_256.cpp
@@ -1,11 +1,14 @@
+
 /*
  * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
  */
-#include <ck/ck.hpp>
 
+#include <ck/ck.hpp>
 #include "ck_tiled_fmha_grouped_infer.h"
 
 template void run_grouped_infer_causalmask_bias_dropout_dispatch<
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_no_causalmask_no_attnbias_has_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_no_causalmask_no_bias_has_dropout_maxk_32.cpp
similarity index 88%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_no_causalmask_no_attnbias_has_dropout_maxk_32.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_no_causalmask_no_bias_has_dropout_maxk_32.cpp
index 1825795eb..9e27756bf 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_no_causalmask_no_attnbias_has_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_no_causalmask_no_bias_has_dropout_maxk_32.cpp
@@ -1,11 +1,14 @@
+
 /*
  * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
  */
-#include <ck/ck.hpp>
 
+#include <ck/ck.hpp>
 #include "ck_tiled_fmha_grouped_infer.h"
 
 template void run_grouped_infer_causalmask_bias_dropout_dispatch<
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_no_causalmask_no_attnbias_has_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_no_causalmask_no_bias_has_dropout_maxk_64.cpp
similarity index 88%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_no_causalmask_no_attnbias_has_dropout_maxk_64.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_no_causalmask_no_bias_has_dropout_maxk_64.cpp
index 45b21a50c..4000c08c5 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_no_causalmask_no_attnbias_has_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_no_causalmask_no_bias_has_dropout_maxk_64.cpp
@@ -1,11 +1,14 @@
+
 /*
  * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
  */
-#include <ck/ck.hpp>
 
+#include <ck/ck.hpp>
 #include "ck_tiled_fmha_grouped_infer.h"
 
 template void run_grouped_infer_causalmask_bias_dropout_dispatch<
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_no_causalmask_no_attnbias_no_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_no_causalmask_no_bias_no_dropout_maxk_128.cpp
similarity index 88%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_no_causalmask_no_attnbias_no_dropout_maxk_128.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_no_causalmask_no_bias_no_dropout_maxk_128.cpp
index e6a42bcc4..089d46191 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_no_causalmask_no_attnbias_no_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_no_causalmask_no_bias_no_dropout_maxk_128.cpp
@@ -1,11 +1,14 @@
+
 /*
  * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
  */
-#include <ck/ck.hpp>
 
+#include <ck/ck.hpp>
 #include "ck_tiled_fmha_grouped_infer.h"
 
 template void run_grouped_infer_causalmask_bias_dropout_dispatch<
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_no_causalmask_no_attnbias_no_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_no_causalmask_no_bias_no_dropout_maxk_256.cpp
similarity index 88%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_no_causalmask_no_attnbias_no_dropout_maxk_256.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_no_causalmask_no_bias_no_dropout_maxk_256.cpp
index 592ad3232..6a6e96ff8 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_no_causalmask_no_attnbias_no_dropout_maxk_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_no_causalmask_no_bias_no_dropout_maxk_256.cpp
@@ -1,11 +1,14 @@
+
 /*
  * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
  */
-#include <ck/ck.hpp>
 
+#include <ck/ck.hpp>
 #include "ck_tiled_fmha_grouped_infer.h"
 
 template void run_grouped_infer_causalmask_bias_dropout_dispatch<
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_no_causalmask_no_attnbias_no_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_no_causalmask_no_bias_no_dropout_maxk_32.cpp
similarity index 88%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_no_causalmask_no_attnbias_no_dropout_maxk_32.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_no_causalmask_no_bias_no_dropout_maxk_32.cpp
index af45ae222..fb8604451 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_no_causalmask_no_attnbias_no_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_no_causalmask_no_bias_no_dropout_maxk_32.cpp
@@ -1,11 +1,14 @@
+
 /*
  * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
  */
-#include <ck/ck.hpp>
 
+#include <ck/ck.hpp>
 #include "ck_tiled_fmha_grouped_infer.h"
 
 template void run_grouped_infer_causalmask_bias_dropout_dispatch<
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_no_causalmask_no_attnbias_no_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_no_causalmask_no_bias_no_dropout_maxk_64.cpp
similarity index 88%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_no_causalmask_no_attnbias_no_dropout_maxk_64.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_no_causalmask_no_bias_no_dropout_maxk_64.cpp
index 03b28b79d..6a1ae5649 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_no_causalmask_no_attnbias_no_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_no_causalmask_no_bias_no_dropout_maxk_64.cpp
@@ -1,11 +1,14 @@
+
 /*
  * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
  */
-#include <ck/ck.hpp>
 
+#include <ck/ck.hpp>
 #include "ck_tiled_fmha_grouped_infer.h"
 
 template void run_grouped_infer_causalmask_bias_dropout_dispatch<

From d94b2c1d8251b29e16fd61bceb9a0a6deab4be8c Mon Sep 17 00:00:00 2001
From: Xiaodong Wang <xw285@cornell.edu>
Date: Wed, 15 May 2024 00:27:18 -0700
Subject: [PATCH 543/641] Simplify logic for seqstart_q/k

https://github.com/ROCm/xformers/commit/566d26ff8009bf27535fa0798763fd1fdb271087 has put the seqstart_k/q on device. So simplify the logic here.

The upstream xformers don't have this optmization and is copying the seqstart_q/k every iterations. We'd like this change to get in and then merge to upstream.
---
 .../attention_forward_generic_ck_tiled.cpp    | 42 +++----------------
 1 file changed, 6 insertions(+), 36 deletions(-)

diff --git a/xformers/csrc/attention/hip_fmha/attention_forward_generic_ck_tiled.cpp b/xformers/csrc/attention/hip_fmha/attention_forward_generic_ck_tiled.cpp
index de1e65dc2..b78da0d4b 100644
--- a/xformers/csrc/attention/hip_fmha/attention_forward_generic_ck_tiled.cpp
+++ b/xformers/csrc/attention/hip_fmha/attention_forward_generic_ck_tiled.cpp
@@ -95,6 +95,8 @@ efficient_attention_forward_ck(
     TORCH_CHECK(seqstart_q->size(0) == seqstart_k->size(0));
     TORCH_CHECK(query.size(0) == 1, "cu_seqlen only supports batch_size=1");
     TORCH_CHECK(max_seqlen_q_.has_value());
+    CHECK_NOSPARSE_CONTIGUOUS_CUDA((*seqstart_q));
+    CHECK_NOSPARSE_CONTIGUOUS_CUDA((*seqstart_k));
   };
 
   // last dim is contiguous, device is kCUDA
@@ -290,48 +292,16 @@ efficient_attention_forward_ck(
     at::Tensor dev_seqstart_k;
     at::Tensor dev_seqlen_k;
 
-    if (seqstart_q->is_cpu()) {
-      dev_seqstart_q = at::empty({p.num_batches + 1}, opts.dtype(at::kInt));
-      p.seqstart_q_dev_ptr = dev_seqstart_q.data_ptr();
-      HIP_CALL_CHECK(hipMemcpyAsync(
-          p.seqstart_q_dev_ptr,
-          seqstart_q->data_ptr(),
-          (p.num_batches + 1) * sizeof(int),
-          hipMemcpyHostToDevice,
-          stream));
-    } else
-      p.seqstart_q_dev_ptr = seqstart_q->data_ptr();
-
-    if (seqstart_k->is_cpu()) {
-      dev_seqstart_k = at::empty({p.num_batches + 1}, opts.dtype(at::kInt));
-
-      p.seqstart_k_dev_ptr = dev_seqstart_k.data_ptr();
-      HIP_CALL_CHECK(hipMemcpyAsync(
-          p.seqstart_k_dev_ptr,
-          seqstart_k->data_ptr(),
-          (p.num_batches + 1) * sizeof(int),
-          hipMemcpyHostToDevice,
-          stream));
-    } else
-      p.seqstart_k_dev_ptr = seqstart_k->data_ptr();
+    p.seqstart_q_dev_ptr = seqstart_q->data_ptr();
+    p.seqstart_k_dev_ptr = seqstart_k->data_ptr();
 
     if (seqlen_k.has_value()) {
       TORCH_CHECK(seqlen_k->scalar_type() == at::ScalarType::Int);
       TORCH_CHECK(seqlen_k->dim() == 1);
       TORCH_CHECK(seqlen_k->size(0) == p.num_batches)
+      CHECK_NOSPARSE_CONTIGUOUS_CUDA((*seqlen_k));
 
-      if (seqlen_k->is_cpu()) {
-        dev_seqlen_k = at::empty({p.num_batches}, opts.dtype(at::kInt));
-
-        p.seqlen_k_dev_ptr = dev_seqlen_k.data_ptr();
-        HIP_CALL_CHECK(hipMemcpyAsync(
-            p.seqlen_k_dev_ptr,
-            seqlen_k->data_ptr(),
-            p.num_batches * sizeof(int),
-            hipMemcpyHostToDevice,
-            stream));
-      } else
-        p.seqlen_k_dev_ptr = seqlen_k->data_ptr();
+      p.seqlen_k_dev_ptr = seqlen_k->data_ptr();
     } else
       p.seqlen_k_dev_ptr = nullptr;
 

From 2486b568f701c1f4e3371edcad18bf2cde6c5307 Mon Sep 17 00:00:00 2001
From: Qianfeng Zhang <Qianfeng.Zhang@amd.com>
Date: Wed, 15 May 2024 14:50:00 +0000
Subject: [PATCH 544/641] Add Async pipeline to grouped mode inference path

---
 .../hip_fmha/ck_tiled_fmha_grouped_infer.h    | 100 ++++++++++++------
 1 file changed, 68 insertions(+), 32 deletions(-)

diff --git a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_infer.h b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_infer.h
index 2a1c02b4e..901fff588 100644
--- a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_infer.h
+++ b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_infer.h
@@ -16,6 +16,7 @@
 #include <ck/tile_program/block_tile/block_masking.hpp>
 #include <ck/tile_program/block_tile_pipeline/block_fmha_pipeline_problem.hpp>
 #include <ck/tile_program/block_tile_pipeline/block_fmha_pipeline_qr_ks_vs.hpp>
+#include <ck/tile_program/block_tile_pipeline/block_fmha_pipeline_qr_ks_vs_async.hpp>
 #include <ck/tile_program/tile/tile_fmha_shape.hpp>
 #include <ck/tile_program/tile/tile_fmha_traits.hpp>
 
@@ -73,38 +74,73 @@ struct grouped_infer_causalmask_bias_dropout_dispatch {
 
       bool pad_headdim_q = !(param.K % FmhaShape::kK0BlockLength == 0);
       bool pad_headdim_v = !(param.Kv % FmhaShape::kN1 == 0);
-
-      BOOL_SWITCH_2(
-          pad_headdim_q, kPadHeadDimQ, pad_headdim_v, kPadHeadDimV, [&] {
-            using FmhaTraits = ck::tile_program::TileFmhaTraits<
-                kPadSeqLenQ,
-                kPadSeqLenK,
-                kPadHeadDimQ,
-                kPadHeadDimV,
-                kHasBias,
-                false, // kHasBiasGrad place-holder
-                false, // kStoreLSE
-                kHasDropout,
-                occupancy>;
-
-            using FmhaPipelineProblem =
-                FmhaPipelineProblemTemp<FmhaTraits, FmhaMask>;
-
-            using FmhaPipeline =
-                ck::tile_program::block::BlockFmhaPipelineQRKSVS<
-                    FmhaPipelineProblem>;
-
-            using FmhaEpilogue = FmhaFwdEpilogue<FmhaFwdEpilogueProblem<
-                typename FmhaFwdTypeConfig<ScalarType>::OaccDataType,
-                typename FmhaFwdTypeConfig<ScalarType>::ODataType,
-                kPadSeqLenQ,
-                kPadHeadDimV>>;
-
-            using FmhaKernel =
-                FmhaFwdKernel<FmhaTilePartitioner, FmhaPipeline, FmhaEpilogue>;
-
-            RunWithKernel<FmhaKernel>(param, stream);
-          });
+      const bool use_async_pipeline =
+          ((param.K % 8 == 0) && (param.Kv % 8 == 0) && (MaxK <= 128));
+
+      if (!use_async_pipeline) {
+        BOOL_SWITCH_2(
+            pad_headdim_q, kPadHeadDimQ, pad_headdim_v, kPadHeadDimV, [&] {
+              using FmhaTraits = ck::tile_program::TileFmhaTraits<
+                  kPadSeqLenQ,
+                  kPadSeqLenK,
+                  kPadHeadDimQ,
+                  kPadHeadDimV,
+                  kHasBias,
+                  false, // kHasBiasGrad place-holder
+                  false, // kStoreLSE
+                  kHasDropout,
+                  occupancy>;
+
+              using FmhaPipelineProblem =
+                  FmhaPipelineProblemTemp<FmhaTraits, FmhaMask>;
+
+              using FmhaPipeline =
+                  ck::tile_program::block::BlockFmhaPipelineQRKSVS<
+                      FmhaPipelineProblem>;
+
+              using FmhaEpilogue = FmhaFwdEpilogue<FmhaFwdEpilogueProblem<
+                  typename FmhaFwdTypeConfig<ScalarType>::OaccDataType,
+                  typename FmhaFwdTypeConfig<ScalarType>::ODataType,
+                  kPadSeqLenQ,
+                  kPadHeadDimV>>;
+
+              using FmhaKernel = FmhaFwdKernel<
+                  FmhaTilePartitioner,
+                  FmhaPipeline,
+                  FmhaEpilogue>;
+
+              RunWithKernel<FmhaKernel>(param, stream);
+            });
+      } else {
+        using FmhaTraits = ck::tile_program::TileFmhaTraits<
+            kPadSeqLenQ,
+            kPadSeqLenK,
+            true,
+            true,
+            kHasBias,
+            false, // kHasBiasGrad place-holder
+            false, // kStoreLSE
+            kHasDropout,
+            occupancy>;
+
+        using FmhaPipelineProblem =
+            FmhaPipelineProblemTemp<FmhaTraits, FmhaMask>;
+
+        using FmhaPipeline =
+            ck::tile_program::block::BlockFmhaPipelineQRKSVSAsync<
+                FmhaPipelineProblem>;
+
+        using FmhaEpilogue = FmhaFwdEpilogue<FmhaFwdEpilogueProblem<
+            typename FmhaFwdTypeConfig<ScalarType>::OaccDataType,
+            typename FmhaFwdTypeConfig<ScalarType>::ODataType,
+            kPadSeqLenQ,
+            true>>;
+
+        using FmhaKernel =
+            FmhaFwdKernel<FmhaTilePartitioner, FmhaPipeline, FmhaEpilogue>;
+
+        RunWithKernel<FmhaKernel>(param, stream);
+      }
     });
   };
 

From 18b43c930502d65b623d7a03457952050362b5cb Mon Sep 17 00:00:00 2001
From: Qianfeng Zhang <Qianfeng.Zhang@amd.com>
Date: Wed, 15 May 2024 15:10:50 +0000
Subject: [PATCH 545/641] Use explict true for
 kPadSeqLenQ/kPadHeadDimQ/kPadHeadDimV templates for the Async pipeline

---
 .../csrc/attention/hip_fmha/ck_tiled_fmha_grouped_infer.h | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_infer.h b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_infer.h
index 901fff588..e26937576 100644
--- a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_infer.h
+++ b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_infer.h
@@ -113,10 +113,10 @@ struct grouped_infer_causalmask_bias_dropout_dispatch {
             });
       } else {
         using FmhaTraits = ck::tile_program::TileFmhaTraits<
-            kPadSeqLenQ,
+            true, // kPadSeqLenQ,
             kPadSeqLenK,
-            true,
-            true,
+            true, // kPadHeadDimQ,
+            true, // kPadHeadDimV,
             kHasBias,
             false, // kHasBiasGrad place-holder
             false, // kStoreLSE
@@ -133,7 +133,7 @@ struct grouped_infer_causalmask_bias_dropout_dispatch {
         using FmhaEpilogue = FmhaFwdEpilogue<FmhaFwdEpilogueProblem<
             typename FmhaFwdTypeConfig<ScalarType>::OaccDataType,
             typename FmhaFwdTypeConfig<ScalarType>::ODataType,
-            kPadSeqLenQ,
+            true,
             true>>;
 
         using FmhaKernel =

From 14f7abe0d100a87ea58f790e3fae6aeb8c2c39df Mon Sep 17 00:00:00 2001
From: Qianfeng Zhang <Qianfeng.Zhang@amd.com>
Date: Tue, 21 May 2024 14:30:53 +0000
Subject: [PATCH 546/641] Synchronize to the update of composable_kernel_tiled
 for better performance

---
 third_party/composable_kernel_tiled | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/third_party/composable_kernel_tiled b/third_party/composable_kernel_tiled
index dca9abd86..b79327f6e 160000
--- a/third_party/composable_kernel_tiled
+++ b/third_party/composable_kernel_tiled
@@ -1 +1 @@
-Subproject commit dca9abd86e6c601792f9ce704b6b2c18de081cb1
+Subproject commit b79327f6eead6c71bb7f85954516198a2b7b6a6f

From ee4aa871b31641691c8e7cd4ed42ea2a108d558a Mon Sep 17 00:00:00 2001
From: Max Podkorytov <4273004+tenpercent@users.noreply.github.com>
Date: Thu, 23 May 2024 11:25:38 -0700
Subject: [PATCH 547/641] Update rocm_ci.yml - clean up dangling images after
 ci run

---
 .github/workflows/rocm_ci.yml | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/.github/workflows/rocm_ci.yml b/.github/workflows/rocm_ci.yml
index fc6946a9c..904234505 100644
--- a/.github/workflows/rocm_ci.yml
+++ b/.github/workflows/rocm_ci.yml
@@ -81,4 +81,11 @@ jobs:
     - name: Process test results
       run: |
         echo "Processing test results TBD"
-      
+
+  clean:
+    runs-on: self-hosted
+    needs: [build]
+    steps:
+      - name: Remove dangling Docker images
+        run: |
+          docker images -q -f dangling=true | xargs --no-run-if-empty docker rmi

From b0b5547a594bb0f1c652a98e6e7889bf3573bea1 Mon Sep 17 00:00:00 2001
From: Xiaodong Wang <xw285@cornell.edu>
Date: Sat, 25 May 2024 13:52:30 -0700
Subject: [PATCH 548/641] Avoid unused-const-variable warning

Our compiler will error on unused-const-variable warning. So just fix this
---
 .../csrc/attention/hip_fmha/attention_forward_decoder.cpp     | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/xformers/csrc/attention/hip_fmha/attention_forward_decoder.cpp b/xformers/csrc/attention/hip_fmha/attention_forward_decoder.cpp
index 6fe0137b0..567a7bb5f 100644
--- a/xformers/csrc/attention/hip_fmha/attention_forward_decoder.cpp
+++ b/xformers/csrc/attention/hip_fmha/attention_forward_decoder.cpp
@@ -57,7 +57,7 @@ template <
     int32_t ThreadsPerWavefront,
     int32_t WavefrontsPerBlock,
     int32_t KV_M_MAX = 8192,
-    int32_t K_MAX = 256>
+    int32_t K_MAX = K_MAX>
 at::Tensor& efficient_attention_forward_decoder_ck_out_impl(
     const at::Tensor& XQ, // [B, 1, G, H, D]
     const at::Tensor& cache_K, // [B, KV_M_MAX, G, H or 1, D]
@@ -330,4 +330,4 @@ int main(int argc, char** argv) {
   return 0;
 }
 
-#endif // MAIN
\ No newline at end of file
+#endif // MAIN

From dfc196d6162ccf9918ed4b599fd978699915d7e4 Mon Sep 17 00:00:00 2001
From: Qianfeng Zhang <Qianfeng.Zhang@amd.com>
Date: Wed, 29 May 2024 14:34:55 +0000
Subject: [PATCH 549/641] Tiny change in the BlockTile/Shape setting
 overriddings

---
 .../hip_fmha/ck_tiled_fmha_bwd_setting.h      | 42 +++++++++++--------
 .../hip_fmha/ck_tiled_fmha_fwd_setting.h      | 25 +++++++----
 2 files changed, 40 insertions(+), 27 deletions(-)

diff --git a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_bwd_setting.h b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_bwd_setting.h
index 08cb7ba2b..910b25f8f 100644
--- a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_bwd_setting.h
+++ b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_bwd_setting.h
@@ -59,21 +59,27 @@ struct FmhaBwdBlockTile;
 template <>
 struct FmhaBwdBlockTile<32> {
   using type = ck::Sequence<128, 128, 32, 32, 32, 32, 32, 32, 32>;
+  using gemm02_warps = ck::Sequence<1, 4, 1>; // default for gemm0/gemm2
+  using gemm13_warps = ck::Sequence<4, 1, 1>; // default for gemm1/gemm3
+  using gemm4_warps = ck::Sequence<4, 1, 1>; // default for gemm4
 };
 
 template <>
 struct FmhaBwdBlockTile<64> {
   using type = ck::Sequence<64, 128, 32, 32, 32, 32, 32, 64, 64>;
+  using gemm02_warps = ck::Sequence<1, 4, 1>; // default for gemm0/gemm2
+  using gemm13_warps = ck::Sequence<4, 1, 1>; // default for gemm1/gemm3
+  using gemm4_warps = ck::Sequence<2, 2, 1>; // default for gemm4
 };
 
 template <>
 struct FmhaBwdBlockTile<128> {
   using type = ck::Sequence<64, 128, 32, 32, 32, 32, 32, 128, 128>;
+  using gemm02_warps = ck::Sequence<1, 4, 1>; // default for gemm0/gemm2
+  using gemm13_warps = ck::Sequence<4, 1, 1>; // default for gemm1/gemm3
+  using gemm4_warps = ck::Sequence<2, 2, 1>; // default for gemm4
 };
 
-using FmhaBwdBlockWarps0 = ck::Sequence<1, 4, 1>; // default for gemm0/gemm2
-using FmhaBwdBlockWarps1 = ck::Sequence<4, 1, 1>; // default for gemm1/gemm3
-using FmhaBwdBlockWarps2 = ck::Sequence<2, 2, 1>; // default for gemm4
 using FmhaBwdWarpTile = ck::Sequence<32, 32, 16>;
 
 template <ck::index_t MaxK>
@@ -82,43 +88,43 @@ struct FmhaBwdShape;
 template <>
 struct FmhaBwdShape<32> : ck::tile_program::TileFmhaBwdShape<
                               typename FmhaBwdBlockTile<32>::type,
-                              FmhaBwdBlockWarps0,
+                              typename FmhaBwdBlockTile<32>::gemm02_warps,
                               FmhaBwdWarpTile,
-                              FmhaBwdBlockWarps1,
+                              typename FmhaBwdBlockTile<32>::gemm13_warps,
                               FmhaBwdWarpTile,
-                              FmhaBwdBlockWarps0,
+                              typename FmhaBwdBlockTile<32>::gemm02_warps,
                               FmhaBwdWarpTile,
-                              FmhaBwdBlockWarps1,
+                              typename FmhaBwdBlockTile<32>::gemm13_warps,
                               FmhaBwdWarpTile,
-                              ck::Sequence<4, 1, 1>,
+                              typename FmhaBwdBlockTile<32>::gemm4_warps,
                               FmhaBwdWarpTile> {};
 
 template <>
 struct FmhaBwdShape<64> : ck::tile_program::TileFmhaBwdShape<
                               typename FmhaBwdBlockTile<64>::type,
-                              FmhaBwdBlockWarps0,
+                              typename FmhaBwdBlockTile<64>::gemm02_warps,
                               FmhaBwdWarpTile,
-                              FmhaBwdBlockWarps1,
+                              typename FmhaBwdBlockTile<64>::gemm13_warps,
                               FmhaBwdWarpTile,
-                              FmhaBwdBlockWarps0,
+                              typename FmhaBwdBlockTile<64>::gemm02_warps,
                               FmhaBwdWarpTile,
-                              FmhaBwdBlockWarps1,
+                              typename FmhaBwdBlockTile<64>::gemm13_warps,
                               FmhaBwdWarpTile,
-                              FmhaBwdBlockWarps2,
+                              typename FmhaBwdBlockTile<64>::gemm4_warps,
                               FmhaBwdWarpTile> {};
 
 template <>
 struct FmhaBwdShape<128> : ck::tile_program::TileFmhaBwdShape<
                                typename FmhaBwdBlockTile<128>::type,
-                               FmhaBwdBlockWarps0,
+                               typename FmhaBwdBlockTile<128>::gemm02_warps,
                                FmhaBwdWarpTile,
-                               FmhaBwdBlockWarps1,
+                               typename FmhaBwdBlockTile<128>::gemm13_warps,
                                FmhaBwdWarpTile,
-                               FmhaBwdBlockWarps0,
+                               typename FmhaBwdBlockTile<128>::gemm02_warps,
                                FmhaBwdWarpTile,
-                               FmhaBwdBlockWarps1,
+                               typename FmhaBwdBlockTile<128>::gemm13_warps,
                                FmhaBwdWarpTile,
-                               FmhaBwdBlockWarps2,
+                               typename FmhaBwdBlockTile<128>::gemm4_warps,
                                FmhaBwdWarpTile> {};
 
 template <ck::index_t MaxK>
diff --git a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_fwd_setting.h b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_fwd_setting.h
index 3810bd3d0..364226ebe 100644
--- a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_fwd_setting.h
+++ b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_fwd_setting.h
@@ -49,24 +49,31 @@ struct FmhaFwdBlockTile;
 template <>
 struct FmhaFwdBlockTile<32> {
   using type = ck::Sequence<128, 64, 16, 32, 32, 32>;
+  using gemm0_warps = ck::Sequence<2, 1, 1>;
+  using gemm1_warps = ck::Sequence<2, 1, 1>;
 };
 
 template <>
 struct FmhaFwdBlockTile<64> {
   using type = ck::Sequence<128, 64, 32, 64, 32, 64>;
+  using gemm0_warps = ck::Sequence<4, 1, 1>;
+  using gemm1_warps = ck::Sequence<4, 1, 1>;
 };
 
 template <>
 struct FmhaFwdBlockTile<128> {
   using type = ck::Sequence<128, 128, 32, 128, 32, 128>;
+  using gemm0_warps = ck::Sequence<4, 1, 1>;
+  using gemm1_warps = ck::Sequence<4, 1, 1>;
 };
 
 template <>
 struct FmhaFwdBlockTile<256> {
   using type = ck::Sequence<128, 128, 32, 256, 32, 256>;
+  using gemm0_warps = ck::Sequence<4, 1, 1>;
+  using gemm1_warps = ck::Sequence<4, 1, 1>;
 };
 
-using FmhaFwdBlockWarps = ck::Sequence<4, 1, 1>;
 using FmhaFwdWarpTile = ck::Sequence<32, 32, 16>;
 
 static constexpr bool IsVLayoutRowMajor = true;
@@ -77,35 +84,35 @@ struct FmhaFwdShape;
 template <>
 struct FmhaFwdShape<32> : ck::tile_program::TileFmhaShape<
                               typename FmhaFwdBlockTile<32>::type,
-                              ck::Sequence<2, 1, 1>,
+                              typename FmhaFwdBlockTile<32>::gemm0_warps,
                               FmhaFwdWarpTile,
-                              ck::Sequence<2, 1, 1>,
+                              typename FmhaFwdBlockTile<32>::gemm1_warps,
                               FmhaFwdWarpTile,
                               IsVLayoutRowMajor> {};
 
 template <>
 struct FmhaFwdShape<64> : ck::tile_program::TileFmhaShape<
                               typename FmhaFwdBlockTile<64>::type,
-                              FmhaFwdBlockWarps,
+                              typename FmhaFwdBlockTile<64>::gemm0_warps,
                               FmhaFwdWarpTile,
-                              FmhaFwdBlockWarps,
+                              typename FmhaFwdBlockTile<64>::gemm1_warps,
                               FmhaFwdWarpTile,
                               IsVLayoutRowMajor> {};
 
 template <>
 struct FmhaFwdShape<128> : ck::tile_program::TileFmhaShape<
                                typename FmhaFwdBlockTile<128>::type,
-                               FmhaFwdBlockWarps,
+                               typename FmhaFwdBlockTile<128>::gemm0_warps,
                                FmhaFwdWarpTile,
-                               FmhaFwdBlockWarps,
+                               typename FmhaFwdBlockTile<128>::gemm1_warps,
                                FmhaFwdWarpTile,
                                IsVLayoutRowMajor> {};
 
 template <>
 struct FmhaFwdShape<256> : ck::tile_program::TileFmhaShape<
                                typename FmhaFwdBlockTile<256>::type,
-                               FmhaFwdBlockWarps,
+                               typename FmhaFwdBlockTile<256>::gemm0_warps,
                                FmhaFwdWarpTile,
-                               FmhaFwdBlockWarps,
+                               typename FmhaFwdBlockTile<256>::gemm1_warps,
                                FmhaFwdWarpTile,
                                IsVLayoutRowMajor> {};

From f50861a58381bf74af761d922ed77c175cb830bd Mon Sep 17 00:00:00 2001
From: Qianfeng Zhang <Qianfeng.Zhang@amd.com>
Date: Wed, 12 Jun 2024 21:54:23 +0000
Subject: [PATCH 550/641] try to align fmha C++ extension to the ck_tile in ck
 develop branch

---
 .gitmodules                                   |   2 +-
 setup.py                                      |  11 +-
 third_party/composable_kernel_tiled           |   2 +-
 .../attention_backward_generic_ck_tiled.cpp   |   8 +-
 .../hip_fmha/attention_ck_rand_uniform.cpp    |  22 +-
 .../attention_forward_generic_ck_tiled.cpp    |  16 +-
 .../hip_fmha/ck_attention_forward_decoder.h   |   5 +-
 .../ck_attention_forward_decoder_splitk.h     |   5 +-
 .../hip_fmha/ck_attention_inner_product.h     | 351 +++++++++++++++++
 .../hip_fmha/ck_attention_math_ext.h          |  29 ++
 .../csrc/attention/hip_fmha/ck_fmha_util.h    |  48 ---
 .../attention/hip_fmha/ck_tiled_bool_switch.h |  69 +++-
 .../hip_fmha/ck_tiled_fmha_batched_backward.h | 157 ++++----
 ...> ck_tiled_fmha_batched_backward_bf16.cpp} |  81 ++--
 .../ck_tiled_fmha_batched_backward_fp16.cpp   |  79 ++--
 .../hip_fmha/ck_tiled_fmha_batched_forward.h  | 145 +++----
 ...=> ck_tiled_fmha_batched_forward_bf16.cpp} |  73 ++--
 .../ck_tiled_fmha_batched_forward_fp16.cpp    |  71 ++--
 .../hip_fmha/ck_tiled_fmha_batched_infer.h    | 130 +++----
 ...p => ck_tiled_fmha_batched_infer_bf16.cpp} |  73 ++--
 .../ck_tiled_fmha_batched_infer_fp16.cpp      |  71 ++--
 .../hip_fmha/ck_tiled_fmha_bwd_setting.h      | 124 +++---
 .../hip_fmha/ck_tiled_fmha_fwd_setting.h      |  69 ++--
 .../hip_fmha/ck_tiled_fmha_grouped_backward.h | 157 ++++----
 ...> ck_tiled_fmha_grouped_backward_bf16.cpp} |  81 ++--
 .../ck_tiled_fmha_grouped_backward_fp16.cpp   |  79 ++--
 .../hip_fmha/ck_tiled_fmha_grouped_forward.h  | 109 +++---
 ...=> ck_tiled_fmha_grouped_forward_bf16.cpp} |  73 ++--
 .../ck_tiled_fmha_grouped_forward_fp16.cpp    |  71 ++--
 .../hip_fmha/ck_tiled_fmha_grouped_infer.h    | 130 +++----
 ...p => ck_tiled_fmha_grouped_infer_bf16.cpp} |  73 ++--
 .../ck_tiled_fmha_grouped_infer_fp16.cpp      |  71 ++--
 .../hip_fmha/ck_tiled_headdim_switch.h        |  15 +-
 .../hip_fmha/ck_tiled_rand_uniform_kernel.h   | 354 ++++++++++++++++++
 .../attention/hip_fmha/generate_instances.py  |  26 +-
 ...ias_has_biasgrad_has_dropout_maxk_128.cpp} |   6 +-
 ...bias_has_biasgrad_has_dropout_maxk_32.cpp} |   6 +-
 ...bias_has_biasgrad_has_dropout_maxk_64.cpp} |   6 +-
 ...bias_has_biasgrad_no_dropout_maxk_128.cpp} |   8 +-
 ..._bias_has_biasgrad_no_dropout_maxk_32.cpp} |   6 +-
 ..._bias_has_biasgrad_no_dropout_maxk_64.cpp} |   8 +-
 ...bias_no_biasgrad_has_dropout_maxk_128.cpp} |   6 +-
 ..._bias_no_biasgrad_has_dropout_maxk_32.cpp} |   8 +-
 ..._bias_no_biasgrad_has_dropout_maxk_64.cpp} |   8 +-
 ..._bias_no_biasgrad_no_dropout_maxk_128.cpp} |   8 +-
 ...s_bias_no_biasgrad_no_dropout_maxk_32.cpp} |  10 +-
 ...s_bias_no_biasgrad_no_dropout_maxk_64.cpp} |  10 +-
 ...bias_no_biasgrad_has_dropout_maxk_128.cpp} |   8 +-
 ..._bias_no_biasgrad_has_dropout_maxk_32.cpp} |   6 +-
 ..._bias_no_biasgrad_has_dropout_maxk_64.cpp} |  10 +-
 ..._bias_no_biasgrad_no_dropout_maxk_128.cpp} |   8 +-
 ...o_bias_no_biasgrad_no_dropout_maxk_32.cpp} |   6 +-
 ...o_bias_no_biasgrad_no_dropout_maxk_64.cpp} |   6 +-
 ...ias_has_biasgrad_has_dropout_maxk_128.cpp} |   8 +-
 ...bias_has_biasgrad_has_dropout_maxk_32.cpp} |   8 +-
 ...bias_has_biasgrad_has_dropout_maxk_64.cpp} |   6 +-
 ...bias_has_biasgrad_no_dropout_maxk_128.cpp} |   8 +-
 ..._bias_has_biasgrad_no_dropout_maxk_32.cpp} |   6 +-
 ..._bias_has_biasgrad_no_dropout_maxk_64.cpp} |  10 +-
 ...bias_no_biasgrad_has_dropout_maxk_128.cpp} |   8 +-
 ..._bias_no_biasgrad_has_dropout_maxk_32.cpp} |  10 +-
 ..._bias_no_biasgrad_has_dropout_maxk_64.cpp} |  10 +-
 ..._bias_no_biasgrad_no_dropout_maxk_128.cpp} |   8 +-
 ...s_bias_no_biasgrad_no_dropout_maxk_32.cpp} |   6 +-
 ...s_bias_no_biasgrad_no_dropout_maxk_64.cpp} |   6 +-
 ...bias_no_biasgrad_has_dropout_maxk_128.cpp} |   6 +-
 ..._bias_no_biasgrad_has_dropout_maxk_32.cpp} |   6 +-
 ..._bias_no_biasgrad_has_dropout_maxk_64.cpp} |   6 +-
 ..._bias_no_biasgrad_no_dropout_maxk_128.cpp} |   6 +-
 ...o_bias_no_biasgrad_no_dropout_maxk_32.cpp} |   6 +-
 ...o_bias_no_biasgrad_no_dropout_maxk_64.cpp} |   6 +-
 ...bias_has_biasgrad_has_dropout_maxk_128.cpp |   6 +-
 ..._bias_has_biasgrad_has_dropout_maxk_32.cpp |   6 +-
 ..._bias_has_biasgrad_has_dropout_maxk_64.cpp |   6 +-
 ..._bias_has_biasgrad_no_dropout_maxk_128.cpp |   6 +-
 ...s_bias_has_biasgrad_no_dropout_maxk_32.cpp |   6 +-
 ...s_bias_has_biasgrad_no_dropout_maxk_64.cpp |   6 +-
 ..._bias_no_biasgrad_has_dropout_maxk_128.cpp |   6 +-
 ...s_bias_no_biasgrad_has_dropout_maxk_32.cpp |   6 +-
 ...s_bias_no_biasgrad_has_dropout_maxk_64.cpp |   6 +-
 ...s_bias_no_biasgrad_no_dropout_maxk_128.cpp |   6 +-
 ...as_bias_no_biasgrad_no_dropout_maxk_32.cpp |   6 +-
 ...as_bias_no_biasgrad_no_dropout_maxk_64.cpp |   6 +-
 ..._bias_no_biasgrad_has_dropout_maxk_128.cpp |   6 +-
 ...o_bias_no_biasgrad_has_dropout_maxk_32.cpp |   6 +-
 ...o_bias_no_biasgrad_has_dropout_maxk_64.cpp |   6 +-
 ...o_bias_no_biasgrad_no_dropout_maxk_128.cpp |   6 +-
 ...no_bias_no_biasgrad_no_dropout_maxk_32.cpp |   6 +-
 ...no_bias_no_biasgrad_no_dropout_maxk_64.cpp |   6 +-
 ...bias_has_biasgrad_has_dropout_maxk_128.cpp |   6 +-
 ..._bias_has_biasgrad_has_dropout_maxk_32.cpp |   6 +-
 ..._bias_has_biasgrad_has_dropout_maxk_64.cpp |   6 +-
 ..._bias_has_biasgrad_no_dropout_maxk_128.cpp |   6 +-
 ...s_bias_has_biasgrad_no_dropout_maxk_32.cpp |   6 +-
 ...s_bias_has_biasgrad_no_dropout_maxk_64.cpp |   6 +-
 ..._bias_no_biasgrad_has_dropout_maxk_128.cpp |   6 +-
 ...s_bias_no_biasgrad_has_dropout_maxk_32.cpp |   6 +-
 ...s_bias_no_biasgrad_has_dropout_maxk_64.cpp |   6 +-
 ...s_bias_no_biasgrad_no_dropout_maxk_128.cpp |   6 +-
 ...as_bias_no_biasgrad_no_dropout_maxk_32.cpp |   6 +-
 ...as_bias_no_biasgrad_no_dropout_maxk_64.cpp |   6 +-
 ..._bias_no_biasgrad_has_dropout_maxk_128.cpp |   6 +-
 ...o_bias_no_biasgrad_has_dropout_maxk_32.cpp |   6 +-
 ...o_bias_no_biasgrad_has_dropout_maxk_64.cpp |   6 +-
 ...o_bias_no_biasgrad_no_dropout_maxk_128.cpp |   6 +-
 ...no_bias_no_biasgrad_no_dropout_maxk_32.cpp |   6 +-
 ...no_bias_no_biasgrad_no_dropout_maxk_64.cpp |   6 +-
 ...salmask_has_bias_has_dropout_maxk_128.cpp} |   6 +-
 ...salmask_has_bias_has_dropout_maxk_256.cpp} |   6 +-
 ...usalmask_has_bias_has_dropout_maxk_32.cpp} |   6 +-
 ...usalmask_has_bias_has_dropout_maxk_64.cpp} |   6 +-
 ...usalmask_has_bias_no_dropout_maxk_128.cpp} |   8 +-
 ...usalmask_has_bias_no_dropout_maxk_256.cpp} |   8 +-
 ...ausalmask_has_bias_no_dropout_maxk_32.cpp} |   8 +-
 ...ausalmask_has_bias_no_dropout_maxk_64.cpp} |   6 +-
 ...usalmask_no_bias_has_dropout_maxk_128.cpp} |   8 +-
 ...usalmask_no_bias_has_dropout_maxk_256.cpp} |   8 +-
 ...ausalmask_no_bias_has_dropout_maxk_32.cpp} |   8 +-
 ...ausalmask_no_bias_has_dropout_maxk_64.cpp} |   6 +-
 ...ausalmask_no_bias_no_dropout_maxk_128.cpp} |   8 +-
 ...ausalmask_no_bias_no_dropout_maxk_256.cpp} |   8 +-
 ...causalmask_no_bias_no_dropout_maxk_32.cpp} |   8 +-
 ...causalmask_no_bias_no_dropout_maxk_64.cpp} |   8 +-
 ...salmask_has_bias_has_dropout_maxk_128.cpp} |   8 +-
 ...salmask_has_bias_has_dropout_maxk_256.cpp} |   8 +-
 ...usalmask_has_bias_has_dropout_maxk_32.cpp} |   8 +-
 ...usalmask_has_bias_has_dropout_maxk_64.cpp} |   6 +-
 ...usalmask_has_bias_no_dropout_maxk_128.cpp} |   8 +-
 ...usalmask_has_bias_no_dropout_maxk_256.cpp} |   8 +-
 ...ausalmask_has_bias_no_dropout_maxk_32.cpp} |   8 +-
 ...ausalmask_has_bias_no_dropout_maxk_64.cpp} |   8 +-
 ...usalmask_no_bias_has_dropout_maxk_128.cpp} |   8 +-
 ...usalmask_no_bias_has_dropout_maxk_256.cpp} |   8 +-
 ...ausalmask_no_bias_has_dropout_maxk_32.cpp} |   6 +-
 ...ausalmask_no_bias_has_dropout_maxk_64.cpp} |   8 +-
 ...ausalmask_no_bias_no_dropout_maxk_128.cpp} |   6 +-
 ...ausalmask_no_bias_no_dropout_maxk_256.cpp} |   6 +-
 ...causalmask_no_bias_no_dropout_maxk_32.cpp} |   6 +-
 ...causalmask_no_bias_no_dropout_maxk_64.cpp} |   6 +-
 ...usalmask_has_bias_has_dropout_maxk_128.cpp |   6 +-
 ...usalmask_has_bias_has_dropout_maxk_256.cpp |   6 +-
 ...ausalmask_has_bias_has_dropout_maxk_32.cpp |   6 +-
 ...ausalmask_has_bias_has_dropout_maxk_64.cpp |   6 +-
 ...ausalmask_has_bias_no_dropout_maxk_128.cpp |   6 +-
 ...ausalmask_has_bias_no_dropout_maxk_256.cpp |   6 +-
 ...causalmask_has_bias_no_dropout_maxk_32.cpp |   6 +-
 ...causalmask_has_bias_no_dropout_maxk_64.cpp |   6 +-
 ...ausalmask_no_bias_has_dropout_maxk_128.cpp |   6 +-
 ...ausalmask_no_bias_has_dropout_maxk_256.cpp |   6 +-
 ...causalmask_no_bias_has_dropout_maxk_32.cpp |   6 +-
 ...causalmask_no_bias_has_dropout_maxk_64.cpp |   6 +-
 ...causalmask_no_bias_no_dropout_maxk_128.cpp |   6 +-
 ...causalmask_no_bias_no_dropout_maxk_256.cpp |   6 +-
 ..._causalmask_no_bias_no_dropout_maxk_32.cpp |   6 +-
 ..._causalmask_no_bias_no_dropout_maxk_64.cpp |   6 +-
 ...usalmask_has_bias_has_dropout_maxk_128.cpp |   6 +-
 ...usalmask_has_bias_has_dropout_maxk_256.cpp |   6 +-
 ...ausalmask_has_bias_has_dropout_maxk_32.cpp |   6 +-
 ...ausalmask_has_bias_has_dropout_maxk_64.cpp |   6 +-
 ...ausalmask_has_bias_no_dropout_maxk_128.cpp |   6 +-
 ...ausalmask_has_bias_no_dropout_maxk_256.cpp |   6 +-
 ...causalmask_has_bias_no_dropout_maxk_32.cpp |   6 +-
 ...causalmask_has_bias_no_dropout_maxk_64.cpp |   6 +-
 ...ausalmask_no_bias_has_dropout_maxk_128.cpp |   6 +-
 ...ausalmask_no_bias_has_dropout_maxk_256.cpp |   6 +-
 ...causalmask_no_bias_has_dropout_maxk_32.cpp |   6 +-
 ...causalmask_no_bias_has_dropout_maxk_64.cpp |   6 +-
 ...causalmask_no_bias_no_dropout_maxk_128.cpp |   6 +-
 ...causalmask_no_bias_no_dropout_maxk_256.cpp |   6 +-
 ..._causalmask_no_bias_no_dropout_maxk_32.cpp |   6 +-
 ..._causalmask_no_bias_no_dropout_maxk_64.cpp |   6 +-
 ...salmask_has_bias_has_dropout_maxk_128.cpp} |   6 +-
 ...salmask_has_bias_has_dropout_maxk_256.cpp} |   6 +-
 ...usalmask_has_bias_has_dropout_maxk_32.cpp} |   6 +-
 ...usalmask_has_bias_has_dropout_maxk_64.cpp} |   6 +-
 ...usalmask_has_bias_no_dropout_maxk_128.cpp} |   6 +-
 ...usalmask_has_bias_no_dropout_maxk_256.cpp} |   8 +-
 ...ausalmask_has_bias_no_dropout_maxk_32.cpp} |   8 +-
 ...ausalmask_has_bias_no_dropout_maxk_64.cpp} |   6 +-
 ...usalmask_no_bias_has_dropout_maxk_128.cpp} |   6 +-
 ...usalmask_no_bias_has_dropout_maxk_256.cpp} |   6 +-
 ...ausalmask_no_bias_has_dropout_maxk_32.cpp} |   8 +-
 ...ausalmask_no_bias_has_dropout_maxk_64.cpp} |   6 +-
 ...ausalmask_no_bias_no_dropout_maxk_128.cpp} |   8 +-
 ...ausalmask_no_bias_no_dropout_maxk_256.cpp} |   8 +-
 ...causalmask_no_bias_no_dropout_maxk_32.cpp} |   8 +-
 ...causalmask_no_bias_no_dropout_maxk_64.cpp} |   8 +-
 ...salmask_has_bias_has_dropout_maxk_128.cpp} |   6 +-
 ...salmask_has_bias_has_dropout_maxk_256.cpp} |   8 +-
 ...usalmask_has_bias_has_dropout_maxk_32.cpp} |   8 +-
 ...usalmask_has_bias_has_dropout_maxk_64.cpp} |   6 +-
 ...usalmask_has_bias_no_dropout_maxk_128.cpp} |   8 +-
 ...usalmask_has_bias_no_dropout_maxk_256.cpp} |   8 +-
 ...ausalmask_has_bias_no_dropout_maxk_32.cpp} |   8 +-
 ...ausalmask_has_bias_no_dropout_maxk_64.cpp} |   8 +-
 ...usalmask_no_bias_has_dropout_maxk_128.cpp} |   8 +-
 ...usalmask_no_bias_has_dropout_maxk_256.cpp} |   8 +-
 ...ausalmask_no_bias_has_dropout_maxk_32.cpp} |   8 +-
 ...ausalmask_no_bias_has_dropout_maxk_64.cpp} |   8 +-
 ...ausalmask_no_bias_no_dropout_maxk_128.cpp} |   6 +-
 ...ausalmask_no_bias_no_dropout_maxk_256.cpp} |   6 +-
 ...causalmask_no_bias_no_dropout_maxk_32.cpp} |   6 +-
 ...causalmask_no_bias_no_dropout_maxk_64.cpp} |   6 +-
 ...usalmask_has_bias_has_dropout_maxk_128.cpp |   6 +-
 ...usalmask_has_bias_has_dropout_maxk_256.cpp |   6 +-
 ...ausalmask_has_bias_has_dropout_maxk_32.cpp |   6 +-
 ...ausalmask_has_bias_has_dropout_maxk_64.cpp |   6 +-
 ...ausalmask_has_bias_no_dropout_maxk_128.cpp |   6 +-
 ...ausalmask_has_bias_no_dropout_maxk_256.cpp |   6 +-
 ...causalmask_has_bias_no_dropout_maxk_32.cpp |   6 +-
 ...causalmask_has_bias_no_dropout_maxk_64.cpp |   6 +-
 ...ausalmask_no_bias_has_dropout_maxk_128.cpp |   6 +-
 ...ausalmask_no_bias_has_dropout_maxk_256.cpp |   6 +-
 ...causalmask_no_bias_has_dropout_maxk_32.cpp |   6 +-
 ...causalmask_no_bias_has_dropout_maxk_64.cpp |   6 +-
 ...causalmask_no_bias_no_dropout_maxk_128.cpp |   6 +-
 ...causalmask_no_bias_no_dropout_maxk_256.cpp |   6 +-
 ..._causalmask_no_bias_no_dropout_maxk_32.cpp |   6 +-
 ..._causalmask_no_bias_no_dropout_maxk_64.cpp |   6 +-
 ...usalmask_has_bias_has_dropout_maxk_128.cpp |   6 +-
 ...usalmask_has_bias_has_dropout_maxk_256.cpp |   6 +-
 ...ausalmask_has_bias_has_dropout_maxk_32.cpp |   6 +-
 ...ausalmask_has_bias_has_dropout_maxk_64.cpp |   6 +-
 ...ausalmask_has_bias_no_dropout_maxk_128.cpp |   6 +-
 ...ausalmask_has_bias_no_dropout_maxk_256.cpp |   6 +-
 ...causalmask_has_bias_no_dropout_maxk_32.cpp |   6 +-
 ...causalmask_has_bias_no_dropout_maxk_64.cpp |   6 +-
 ...ausalmask_no_bias_has_dropout_maxk_128.cpp |   6 +-
 ...ausalmask_no_bias_has_dropout_maxk_256.cpp |   6 +-
 ...causalmask_no_bias_has_dropout_maxk_32.cpp |   6 +-
 ...causalmask_no_bias_has_dropout_maxk_64.cpp |   6 +-
 ...causalmask_no_bias_no_dropout_maxk_128.cpp |   6 +-
 ...causalmask_no_bias_no_dropout_maxk_256.cpp |   6 +-
 ..._causalmask_no_bias_no_dropout_maxk_32.cpp |   6 +-
 ..._causalmask_no_bias_no_dropout_maxk_64.cpp |   6 +-
 ...ias_has_biasgrad_has_dropout_maxk_128.cpp} |   6 +-
 ...bias_has_biasgrad_has_dropout_maxk_32.cpp} |   6 +-
 ...bias_has_biasgrad_has_dropout_maxk_64.cpp} |   6 +-
 ...bias_has_biasgrad_no_dropout_maxk_128.cpp} |   8 +-
 ..._bias_has_biasgrad_no_dropout_maxk_32.cpp} |   6 +-
 ..._bias_has_biasgrad_no_dropout_maxk_64.cpp} |   8 +-
 ...bias_no_biasgrad_has_dropout_maxk_128.cpp} |   6 +-
 ..._bias_no_biasgrad_has_dropout_maxk_32.cpp} |   8 +-
 ..._bias_no_biasgrad_has_dropout_maxk_64.cpp} |   8 +-
 ..._bias_no_biasgrad_no_dropout_maxk_128.cpp} |   8 +-
 ...s_bias_no_biasgrad_no_dropout_maxk_32.cpp} |  10 +-
 ...s_bias_no_biasgrad_no_dropout_maxk_64.cpp} |  10 +-
 ...bias_no_biasgrad_has_dropout_maxk_128.cpp} |   8 +-
 ..._bias_no_biasgrad_has_dropout_maxk_32.cpp} |   6 +-
 ..._bias_no_biasgrad_has_dropout_maxk_64.cpp} |  10 +-
 ..._bias_no_biasgrad_no_dropout_maxk_128.cpp} |   8 +-
 ...o_bias_no_biasgrad_no_dropout_maxk_32.cpp} |   6 +-
 ...o_bias_no_biasgrad_no_dropout_maxk_64.cpp} |   6 +-
 ...ias_has_biasgrad_has_dropout_maxk_128.cpp} |   8 +-
 ...bias_has_biasgrad_has_dropout_maxk_32.cpp} |   8 +-
 ...bias_has_biasgrad_has_dropout_maxk_64.cpp} |   6 +-
 ...bias_has_biasgrad_no_dropout_maxk_128.cpp} |   8 +-
 ..._bias_has_biasgrad_no_dropout_maxk_32.cpp} |   6 +-
 ..._bias_has_biasgrad_no_dropout_maxk_64.cpp} |  10 +-
 ...bias_no_biasgrad_has_dropout_maxk_128.cpp} |   8 +-
 ..._bias_no_biasgrad_has_dropout_maxk_32.cpp} |  10 +-
 ..._bias_no_biasgrad_has_dropout_maxk_64.cpp} |  10 +-
 ..._bias_no_biasgrad_no_dropout_maxk_128.cpp} |   8 +-
 ...s_bias_no_biasgrad_no_dropout_maxk_32.cpp} |   6 +-
 ...s_bias_no_biasgrad_no_dropout_maxk_64.cpp} |   6 +-
 ...bias_no_biasgrad_has_dropout_maxk_128.cpp} |   6 +-
 ..._bias_no_biasgrad_has_dropout_maxk_32.cpp} |   6 +-
 ..._bias_no_biasgrad_has_dropout_maxk_64.cpp} |   6 +-
 ..._bias_no_biasgrad_no_dropout_maxk_128.cpp} |   6 +-
 ...o_bias_no_biasgrad_no_dropout_maxk_32.cpp} |   6 +-
 ...o_bias_no_biasgrad_no_dropout_maxk_64.cpp} |   6 +-
 ...bias_has_biasgrad_has_dropout_maxk_128.cpp |   6 +-
 ..._bias_has_biasgrad_has_dropout_maxk_32.cpp |   6 +-
 ..._bias_has_biasgrad_has_dropout_maxk_64.cpp |   6 +-
 ..._bias_has_biasgrad_no_dropout_maxk_128.cpp |   6 +-
 ...s_bias_has_biasgrad_no_dropout_maxk_32.cpp |   6 +-
 ...s_bias_has_biasgrad_no_dropout_maxk_64.cpp |   6 +-
 ..._bias_no_biasgrad_has_dropout_maxk_128.cpp |   6 +-
 ...s_bias_no_biasgrad_has_dropout_maxk_32.cpp |   6 +-
 ...s_bias_no_biasgrad_has_dropout_maxk_64.cpp |   6 +-
 ...s_bias_no_biasgrad_no_dropout_maxk_128.cpp |   6 +-
 ...as_bias_no_biasgrad_no_dropout_maxk_32.cpp |   6 +-
 ...as_bias_no_biasgrad_no_dropout_maxk_64.cpp |   6 +-
 ..._bias_no_biasgrad_has_dropout_maxk_128.cpp |   6 +-
 ...o_bias_no_biasgrad_has_dropout_maxk_32.cpp |   6 +-
 ...o_bias_no_biasgrad_has_dropout_maxk_64.cpp |   6 +-
 ...o_bias_no_biasgrad_no_dropout_maxk_128.cpp |   6 +-
 ...no_bias_no_biasgrad_no_dropout_maxk_32.cpp |   6 +-
 ...no_bias_no_biasgrad_no_dropout_maxk_64.cpp |   6 +-
 ...bias_has_biasgrad_has_dropout_maxk_128.cpp |   6 +-
 ..._bias_has_biasgrad_has_dropout_maxk_32.cpp |   6 +-
 ..._bias_has_biasgrad_has_dropout_maxk_64.cpp |   6 +-
 ..._bias_has_biasgrad_no_dropout_maxk_128.cpp |   6 +-
 ...s_bias_has_biasgrad_no_dropout_maxk_32.cpp |   6 +-
 ...s_bias_has_biasgrad_no_dropout_maxk_64.cpp |   6 +-
 ..._bias_no_biasgrad_has_dropout_maxk_128.cpp |   6 +-
 ...s_bias_no_biasgrad_has_dropout_maxk_32.cpp |   6 +-
 ...s_bias_no_biasgrad_has_dropout_maxk_64.cpp |   6 +-
 ...s_bias_no_biasgrad_no_dropout_maxk_128.cpp |   6 +-
 ...as_bias_no_biasgrad_no_dropout_maxk_32.cpp |   6 +-
 ...as_bias_no_biasgrad_no_dropout_maxk_64.cpp |   6 +-
 ..._bias_no_biasgrad_has_dropout_maxk_128.cpp |   6 +-
 ...o_bias_no_biasgrad_has_dropout_maxk_32.cpp |   6 +-
 ...o_bias_no_biasgrad_has_dropout_maxk_64.cpp |   6 +-
 ...o_bias_no_biasgrad_no_dropout_maxk_128.cpp |   6 +-
 ...no_bias_no_biasgrad_no_dropout_maxk_32.cpp |   6 +-
 ...no_bias_no_biasgrad_no_dropout_maxk_64.cpp |   6 +-
 ...salmask_has_bias_has_dropout_maxk_128.cpp} |   6 +-
 ...salmask_has_bias_has_dropout_maxk_256.cpp} |   6 +-
 ...usalmask_has_bias_has_dropout_maxk_32.cpp} |   6 +-
 ...usalmask_has_bias_has_dropout_maxk_64.cpp} |   6 +-
 ...usalmask_has_bias_no_dropout_maxk_128.cpp} |   8 +-
 ...usalmask_has_bias_no_dropout_maxk_256.cpp} |   8 +-
 ...ausalmask_has_bias_no_dropout_maxk_32.cpp} |   8 +-
 ...ausalmask_has_bias_no_dropout_maxk_64.cpp} |   6 +-
 ...usalmask_no_bias_has_dropout_maxk_128.cpp} |   8 +-
 ...usalmask_no_bias_has_dropout_maxk_256.cpp} |   8 +-
 ...ausalmask_no_bias_has_dropout_maxk_32.cpp} |   8 +-
 ...ausalmask_no_bias_has_dropout_maxk_64.cpp} |   6 +-
 ...ausalmask_no_bias_no_dropout_maxk_128.cpp} |   8 +-
 ...ausalmask_no_bias_no_dropout_maxk_256.cpp} |   8 +-
 ...causalmask_no_bias_no_dropout_maxk_32.cpp} |   8 +-
 ...causalmask_no_bias_no_dropout_maxk_64.cpp} |   8 +-
 ...salmask_has_bias_has_dropout_maxk_128.cpp} |   8 +-
 ...salmask_has_bias_has_dropout_maxk_256.cpp} |   8 +-
 ...usalmask_has_bias_has_dropout_maxk_32.cpp} |   8 +-
 ...usalmask_has_bias_has_dropout_maxk_64.cpp} |   6 +-
 ...usalmask_has_bias_no_dropout_maxk_128.cpp} |   8 +-
 ...usalmask_has_bias_no_dropout_maxk_256.cpp} |   8 +-
 ...ausalmask_has_bias_no_dropout_maxk_32.cpp} |   8 +-
 ...ausalmask_has_bias_no_dropout_maxk_64.cpp} |   8 +-
 ...usalmask_no_bias_has_dropout_maxk_128.cpp} |   8 +-
 ...usalmask_no_bias_has_dropout_maxk_256.cpp} |   8 +-
 ...ausalmask_no_bias_has_dropout_maxk_32.cpp} |   6 +-
 ...ausalmask_no_bias_has_dropout_maxk_64.cpp} |   8 +-
 ...ausalmask_no_bias_no_dropout_maxk_128.cpp} |   6 +-
 ...ausalmask_no_bias_no_dropout_maxk_256.cpp} |   6 +-
 ...causalmask_no_bias_no_dropout_maxk_32.cpp} |   6 +-
 ...causalmask_no_bias_no_dropout_maxk_64.cpp} |   6 +-
 ...usalmask_has_bias_has_dropout_maxk_128.cpp |   6 +-
 ...usalmask_has_bias_has_dropout_maxk_256.cpp |   6 +-
 ...ausalmask_has_bias_has_dropout_maxk_32.cpp |   6 +-
 ...ausalmask_has_bias_has_dropout_maxk_64.cpp |   6 +-
 ...ausalmask_has_bias_no_dropout_maxk_128.cpp |   6 +-
 ...ausalmask_has_bias_no_dropout_maxk_256.cpp |   6 +-
 ...causalmask_has_bias_no_dropout_maxk_32.cpp |   6 +-
 ...causalmask_has_bias_no_dropout_maxk_64.cpp |   6 +-
 ...ausalmask_no_bias_has_dropout_maxk_128.cpp |   6 +-
 ...ausalmask_no_bias_has_dropout_maxk_256.cpp |   6 +-
 ...causalmask_no_bias_has_dropout_maxk_32.cpp |   6 +-
 ...causalmask_no_bias_has_dropout_maxk_64.cpp |   6 +-
 ...causalmask_no_bias_no_dropout_maxk_128.cpp |   6 +-
 ...causalmask_no_bias_no_dropout_maxk_256.cpp |   6 +-
 ..._causalmask_no_bias_no_dropout_maxk_32.cpp |   6 +-
 ..._causalmask_no_bias_no_dropout_maxk_64.cpp |   6 +-
 ...usalmask_has_bias_has_dropout_maxk_128.cpp |   6 +-
 ...usalmask_has_bias_has_dropout_maxk_256.cpp |   6 +-
 ...ausalmask_has_bias_has_dropout_maxk_32.cpp |   6 +-
 ...ausalmask_has_bias_has_dropout_maxk_64.cpp |   6 +-
 ...ausalmask_has_bias_no_dropout_maxk_128.cpp |   6 +-
 ...ausalmask_has_bias_no_dropout_maxk_256.cpp |   6 +-
 ...causalmask_has_bias_no_dropout_maxk_32.cpp |   6 +-
 ...causalmask_has_bias_no_dropout_maxk_64.cpp |   6 +-
 ...ausalmask_no_bias_has_dropout_maxk_128.cpp |   6 +-
 ...ausalmask_no_bias_has_dropout_maxk_256.cpp |   6 +-
 ...causalmask_no_bias_has_dropout_maxk_32.cpp |   6 +-
 ...causalmask_no_bias_has_dropout_maxk_64.cpp |   6 +-
 ...causalmask_no_bias_no_dropout_maxk_128.cpp |   6 +-
 ...causalmask_no_bias_no_dropout_maxk_256.cpp |   6 +-
 ..._causalmask_no_bias_no_dropout_maxk_32.cpp |   6 +-
 ..._causalmask_no_bias_no_dropout_maxk_64.cpp |   6 +-
 ...salmask_has_bias_has_dropout_maxk_128.cpp} |   6 +-
 ...salmask_has_bias_has_dropout_maxk_256.cpp} |   6 +-
 ...usalmask_has_bias_has_dropout_maxk_32.cpp} |   6 +-
 ...usalmask_has_bias_has_dropout_maxk_64.cpp} |   6 +-
 ...usalmask_has_bias_no_dropout_maxk_128.cpp} |   6 +-
 ...usalmask_has_bias_no_dropout_maxk_256.cpp} |   8 +-
 ...ausalmask_has_bias_no_dropout_maxk_32.cpp} |   8 +-
 ...ausalmask_has_bias_no_dropout_maxk_64.cpp} |   6 +-
 ...usalmask_no_bias_has_dropout_maxk_128.cpp} |   6 +-
 ...usalmask_no_bias_has_dropout_maxk_256.cpp} |   6 +-
 ...ausalmask_no_bias_has_dropout_maxk_32.cpp} |   8 +-
 ...ausalmask_no_bias_has_dropout_maxk_64.cpp} |   6 +-
 ...ausalmask_no_bias_no_dropout_maxk_128.cpp} |   8 +-
 ...ausalmask_no_bias_no_dropout_maxk_256.cpp} |   8 +-
 ...causalmask_no_bias_no_dropout_maxk_32.cpp} |   8 +-
 ...causalmask_no_bias_no_dropout_maxk_64.cpp} |   8 +-
 ...salmask_has_bias_has_dropout_maxk_128.cpp} |   6 +-
 ...salmask_has_bias_has_dropout_maxk_256.cpp} |   8 +-
 ...usalmask_has_bias_has_dropout_maxk_32.cpp} |   8 +-
 ...usalmask_has_bias_has_dropout_maxk_64.cpp} |   6 +-
 ...usalmask_has_bias_no_dropout_maxk_128.cpp} |   8 +-
 ...usalmask_has_bias_no_dropout_maxk_256.cpp} |   8 +-
 ...ausalmask_has_bias_no_dropout_maxk_32.cpp} |   8 +-
 ...ausalmask_has_bias_no_dropout_maxk_64.cpp} |   8 +-
 ...usalmask_no_bias_has_dropout_maxk_128.cpp} |   8 +-
 ...usalmask_no_bias_has_dropout_maxk_256.cpp} |   8 +-
 ...ausalmask_no_bias_has_dropout_maxk_32.cpp} |   8 +-
 ...ausalmask_no_bias_has_dropout_maxk_64.cpp} |   8 +-
 ...ausalmask_no_bias_no_dropout_maxk_128.cpp} |   6 +-
 ...ausalmask_no_bias_no_dropout_maxk_256.cpp} |   6 +-
 ...causalmask_no_bias_no_dropout_maxk_32.cpp} |   6 +-
 ...causalmask_no_bias_no_dropout_maxk_64.cpp} |   6 +-
 ...usalmask_has_bias_has_dropout_maxk_128.cpp |   6 +-
 ...usalmask_has_bias_has_dropout_maxk_256.cpp |   6 +-
 ...ausalmask_has_bias_has_dropout_maxk_32.cpp |   6 +-
 ...ausalmask_has_bias_has_dropout_maxk_64.cpp |   6 +-
 ...ausalmask_has_bias_no_dropout_maxk_128.cpp |   6 +-
 ...ausalmask_has_bias_no_dropout_maxk_256.cpp |   6 +-
 ...causalmask_has_bias_no_dropout_maxk_32.cpp |   6 +-
 ...causalmask_has_bias_no_dropout_maxk_64.cpp |   6 +-
 ...ausalmask_no_bias_has_dropout_maxk_128.cpp |   6 +-
 ...ausalmask_no_bias_has_dropout_maxk_256.cpp |   6 +-
 ...causalmask_no_bias_has_dropout_maxk_32.cpp |   6 +-
 ...causalmask_no_bias_has_dropout_maxk_64.cpp |   6 +-
 ...causalmask_no_bias_no_dropout_maxk_128.cpp |   6 +-
 ...causalmask_no_bias_no_dropout_maxk_256.cpp |   6 +-
 ..._causalmask_no_bias_no_dropout_maxk_32.cpp |   6 +-
 ..._causalmask_no_bias_no_dropout_maxk_64.cpp |   6 +-
 ...usalmask_has_bias_has_dropout_maxk_128.cpp |   6 +-
 ...usalmask_has_bias_has_dropout_maxk_256.cpp |   6 +-
 ...ausalmask_has_bias_has_dropout_maxk_32.cpp |   6 +-
 ...ausalmask_has_bias_has_dropout_maxk_64.cpp |   6 +-
 ...ausalmask_has_bias_no_dropout_maxk_128.cpp |   6 +-
 ...ausalmask_has_bias_no_dropout_maxk_256.cpp |   6 +-
 ...causalmask_has_bias_no_dropout_maxk_32.cpp |   6 +-
 ...causalmask_has_bias_no_dropout_maxk_64.cpp |   6 +-
 ...ausalmask_no_bias_has_dropout_maxk_128.cpp |   6 +-
 ...ausalmask_no_bias_has_dropout_maxk_256.cpp |   6 +-
 ...causalmask_no_bias_has_dropout_maxk_32.cpp |   6 +-
 ...causalmask_no_bias_has_dropout_maxk_64.cpp |   6 +-
 ...causalmask_no_bias_no_dropout_maxk_128.cpp |   6 +-
 ...causalmask_no_bias_no_dropout_maxk_256.cpp |   6 +-
 ..._causalmask_no_bias_no_dropout_maxk_32.cpp |   6 +-
 ..._causalmask_no_bias_no_dropout_maxk_64.cpp |   6 +-
 435 files changed, 3086 insertions(+), 2438 deletions(-)
 create mode 100644 xformers/csrc/attention/hip_fmha/ck_attention_inner_product.h
 create mode 100644 xformers/csrc/attention/hip_fmha/ck_attention_math_ext.h
 rename xformers/csrc/attention/hip_fmha/{ck_tiled_fmha_batched_backward_bp16.cpp => ck_tiled_fmha_batched_backward_bf16.cpp} (71%)
 rename xformers/csrc/attention/hip_fmha/{ck_tiled_fmha_batched_forward_bp16.cpp => ck_tiled_fmha_batched_forward_bf16.cpp} (73%)
 rename xformers/csrc/attention/hip_fmha/{ck_tiled_fmha_batched_infer_bp16.cpp => ck_tiled_fmha_batched_infer_bf16.cpp} (73%)
 rename xformers/csrc/attention/hip_fmha/{ck_tiled_fmha_grouped_backward_bp16.cpp => ck_tiled_fmha_grouped_backward_bf16.cpp} (71%)
 rename xformers/csrc/attention/hip_fmha/{ck_tiled_fmha_grouped_forward_bp16.cpp => ck_tiled_fmha_grouped_forward_bf16.cpp} (73%)
 rename xformers/csrc/attention/hip_fmha/{ck_tiled_fmha_grouped_infer_bp16.cpp => ck_tiled_fmha_grouped_infer_bf16.cpp} (73%)
 create mode 100644 xformers/csrc/attention/hip_fmha/ck_tiled_rand_uniform_kernel.h
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_batched_backward_bp16_has_causalmask_has_bias_has_biasgrad_has_dropout_maxk_128.cpp => fmha_batched_backward_bf16_has_causalmask_has_bias_has_biasgrad_has_dropout_maxk_128.cpp} (75%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_batched_backward_bp16_has_causalmask_has_bias_has_biasgrad_has_dropout_maxk_32.cpp => fmha_batched_backward_bf16_has_causalmask_has_bias_has_biasgrad_has_dropout_maxk_32.cpp} (75%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_batched_backward_bp16_has_causalmask_has_bias_has_biasgrad_has_dropout_maxk_64.cpp => fmha_batched_backward_bf16_has_causalmask_has_bias_has_biasgrad_has_dropout_maxk_64.cpp} (75%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_batched_backward_bp16_no_causalmask_has_bias_has_biasgrad_has_dropout_maxk_128.cpp => fmha_batched_backward_bf16_has_causalmask_has_bias_has_biasgrad_no_dropout_maxk_128.cpp} (75%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_batched_backward_bp16_has_causalmask_has_bias_has_biasgrad_no_dropout_maxk_32.cpp => fmha_batched_backward_bf16_has_causalmask_has_bias_has_biasgrad_no_dropout_maxk_32.cpp} (75%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_batched_backward_bp16_has_causalmask_has_bias_no_biasgrad_has_dropout_maxk_64.cpp => fmha_batched_backward_bf16_has_causalmask_has_bias_has_biasgrad_no_dropout_maxk_64.cpp} (75%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_batched_backward_bp16_has_causalmask_has_bias_no_biasgrad_has_dropout_maxk_128.cpp => fmha_batched_backward_bf16_has_causalmask_has_bias_no_biasgrad_has_dropout_maxk_128.cpp} (75%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_batched_backward_bp16_no_causalmask_has_bias_has_biasgrad_has_dropout_maxk_32.cpp => fmha_batched_backward_bf16_has_causalmask_has_bias_no_biasgrad_has_dropout_maxk_32.cpp} (75%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_batched_backward_bp16_has_causalmask_has_bias_has_biasgrad_no_dropout_maxk_64.cpp => fmha_batched_backward_bf16_has_causalmask_has_bias_no_biasgrad_has_dropout_maxk_64.cpp} (75%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_batched_backward_bp16_has_causalmask_no_bias_no_biasgrad_has_dropout_maxk_128.cpp => fmha_batched_backward_bf16_has_causalmask_has_bias_no_biasgrad_no_dropout_maxk_128.cpp} (75%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_batched_backward_bp16_no_causalmask_has_bias_no_biasgrad_has_dropout_maxk_32.cpp => fmha_batched_backward_bf16_has_causalmask_has_bias_no_biasgrad_no_dropout_maxk_32.cpp} (75%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_batched_backward_bp16_no_causalmask_has_bias_no_biasgrad_has_dropout_maxk_64.cpp => fmha_batched_backward_bf16_has_causalmask_has_bias_no_biasgrad_no_dropout_maxk_64.cpp} (75%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_batched_backward_bp16_no_causalmask_has_bias_no_biasgrad_has_dropout_maxk_128.cpp => fmha_batched_backward_bf16_has_causalmask_no_bias_no_biasgrad_has_dropout_maxk_128.cpp} (75%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_batched_backward_bp16_has_causalmask_no_bias_no_biasgrad_has_dropout_maxk_32.cpp => fmha_batched_backward_bf16_has_causalmask_no_bias_no_biasgrad_has_dropout_maxk_32.cpp} (75%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_batched_backward_bp16_no_causalmask_has_bias_has_biasgrad_no_dropout_maxk_64.cpp => fmha_batched_backward_bf16_has_causalmask_no_bias_no_biasgrad_has_dropout_maxk_64.cpp} (75%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_batched_backward_bp16_no_causalmask_has_bias_no_biasgrad_no_dropout_maxk_128.cpp => fmha_batched_backward_bf16_has_causalmask_no_bias_no_biasgrad_no_dropout_maxk_128.cpp} (75%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_batched_backward_bp16_has_causalmask_no_bias_no_biasgrad_no_dropout_maxk_32.cpp => fmha_batched_backward_bf16_has_causalmask_no_bias_no_biasgrad_no_dropout_maxk_32.cpp} (75%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_batched_backward_bp16_has_causalmask_no_bias_no_biasgrad_no_dropout_maxk_64.cpp => fmha_batched_backward_bf16_has_causalmask_no_bias_no_biasgrad_no_dropout_maxk_64.cpp} (75%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_batched_backward_bp16_has_causalmask_has_bias_has_biasgrad_no_dropout_maxk_128.cpp => fmha_batched_backward_bf16_no_causalmask_has_bias_has_biasgrad_has_dropout_maxk_128.cpp} (75%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_batched_backward_bp16_has_causalmask_has_bias_no_biasgrad_has_dropout_maxk_32.cpp => fmha_batched_backward_bf16_no_causalmask_has_bias_has_biasgrad_has_dropout_maxk_32.cpp} (75%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_batched_backward_bp16_no_causalmask_has_bias_has_biasgrad_has_dropout_maxk_64.cpp => fmha_batched_backward_bf16_no_causalmask_has_bias_has_biasgrad_has_dropout_maxk_64.cpp} (75%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_batched_backward_bp16_has_causalmask_has_bias_no_biasgrad_no_dropout_maxk_128.cpp => fmha_batched_backward_bf16_no_causalmask_has_bias_has_biasgrad_no_dropout_maxk_128.cpp} (75%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_batched_backward_bp16_no_causalmask_has_bias_has_biasgrad_no_dropout_maxk_32.cpp => fmha_batched_backward_bf16_no_causalmask_has_bias_has_biasgrad_no_dropout_maxk_32.cpp} (75%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_batched_backward_bp16_has_causalmask_no_bias_no_biasgrad_has_dropout_maxk_64.cpp => fmha_batched_backward_bf16_no_causalmask_has_bias_has_biasgrad_no_dropout_maxk_64.cpp} (75%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_batched_backward_bp16_no_causalmask_has_bias_has_biasgrad_no_dropout_maxk_128.cpp => fmha_batched_backward_bf16_no_causalmask_has_bias_no_biasgrad_has_dropout_maxk_128.cpp} (75%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_batched_backward_bp16_has_causalmask_has_bias_no_biasgrad_no_dropout_maxk_32.cpp => fmha_batched_backward_bf16_no_causalmask_has_bias_no_biasgrad_has_dropout_maxk_32.cpp} (75%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_batched_backward_bp16_has_causalmask_has_bias_no_biasgrad_no_dropout_maxk_64.cpp => fmha_batched_backward_bf16_no_causalmask_has_bias_no_biasgrad_has_dropout_maxk_64.cpp} (75%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_batched_backward_bp16_has_causalmask_no_bias_no_biasgrad_no_dropout_maxk_128.cpp => fmha_batched_backward_bf16_no_causalmask_has_bias_no_biasgrad_no_dropout_maxk_128.cpp} (75%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_batched_backward_bp16_no_causalmask_has_bias_no_biasgrad_no_dropout_maxk_32.cpp => fmha_batched_backward_bf16_no_causalmask_has_bias_no_biasgrad_no_dropout_maxk_32.cpp} (75%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_batched_backward_bp16_no_causalmask_has_bias_no_biasgrad_no_dropout_maxk_64.cpp => fmha_batched_backward_bf16_no_causalmask_has_bias_no_biasgrad_no_dropout_maxk_64.cpp} (75%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_batched_backward_bp16_no_causalmask_no_bias_no_biasgrad_has_dropout_maxk_128.cpp => fmha_batched_backward_bf16_no_causalmask_no_bias_no_biasgrad_has_dropout_maxk_128.cpp} (75%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_batched_backward_bp16_no_causalmask_no_bias_no_biasgrad_has_dropout_maxk_32.cpp => fmha_batched_backward_bf16_no_causalmask_no_bias_no_biasgrad_has_dropout_maxk_32.cpp} (75%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_batched_backward_bp16_no_causalmask_no_bias_no_biasgrad_has_dropout_maxk_64.cpp => fmha_batched_backward_bf16_no_causalmask_no_bias_no_biasgrad_has_dropout_maxk_64.cpp} (75%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_batched_backward_bp16_no_causalmask_no_bias_no_biasgrad_no_dropout_maxk_128.cpp => fmha_batched_backward_bf16_no_causalmask_no_bias_no_biasgrad_no_dropout_maxk_128.cpp} (75%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_batched_backward_bp16_no_causalmask_no_bias_no_biasgrad_no_dropout_maxk_32.cpp => fmha_batched_backward_bf16_no_causalmask_no_bias_no_biasgrad_no_dropout_maxk_32.cpp} (75%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_batched_backward_bp16_no_causalmask_no_bias_no_biasgrad_no_dropout_maxk_64.cpp => fmha_batched_backward_bf16_no_causalmask_no_bias_no_biasgrad_no_dropout_maxk_64.cpp} (75%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_batched_forward_bp16_has_causalmask_has_bias_has_dropout_maxk_128.cpp => fmha_batched_forward_bf16_has_causalmask_has_bias_has_dropout_maxk_128.cpp} (74%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_batched_forward_bp16_has_causalmask_has_bias_has_dropout_maxk_256.cpp => fmha_batched_forward_bf16_has_causalmask_has_bias_has_dropout_maxk_256.cpp} (74%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_batched_forward_bp16_has_causalmask_has_bias_has_dropout_maxk_32.cpp => fmha_batched_forward_bf16_has_causalmask_has_bias_has_dropout_maxk_32.cpp} (74%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_batched_forward_bp16_has_causalmask_has_bias_has_dropout_maxk_64.cpp => fmha_batched_forward_bf16_has_causalmask_has_bias_has_dropout_maxk_64.cpp} (74%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_batched_forward_bp16_no_causalmask_has_bias_has_dropout_maxk_128.cpp => fmha_batched_forward_bf16_has_causalmask_has_bias_no_dropout_maxk_128.cpp} (74%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_batched_forward_bp16_has_causalmask_no_bias_has_dropout_maxk_256.cpp => fmha_batched_forward_bf16_has_causalmask_has_bias_no_dropout_maxk_256.cpp} (74%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_batched_forward_bp16_has_causalmask_no_bias_has_dropout_maxk_32.cpp => fmha_batched_forward_bf16_has_causalmask_has_bias_no_dropout_maxk_32.cpp} (74%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_batched_forward_bp16_has_causalmask_has_bias_no_dropout_maxk_64.cpp => fmha_batched_forward_bf16_has_causalmask_has_bias_no_dropout_maxk_64.cpp} (74%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_batched_forward_bp16_has_causalmask_has_bias_no_dropout_maxk_128.cpp => fmha_batched_forward_bf16_has_causalmask_no_bias_has_dropout_maxk_128.cpp} (74%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_batched_forward_bp16_no_causalmask_has_bias_has_dropout_maxk_256.cpp => fmha_batched_forward_bf16_has_causalmask_no_bias_has_dropout_maxk_256.cpp} (74%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_batched_forward_bp16_no_causalmask_has_bias_has_dropout_maxk_32.cpp => fmha_batched_forward_bf16_has_causalmask_no_bias_has_dropout_maxk_32.cpp} (74%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_batched_forward_bp16_has_causalmask_no_bias_has_dropout_maxk_64.cpp => fmha_batched_forward_bf16_has_causalmask_no_bias_has_dropout_maxk_64.cpp} (74%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_batched_forward_bp16_no_causalmask_has_bias_no_dropout_maxk_128.cpp => fmha_batched_forward_bf16_has_causalmask_no_bias_no_dropout_maxk_128.cpp} (74%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_batched_forward_bp16_no_causalmask_has_bias_no_dropout_maxk_256.cpp => fmha_batched_forward_bf16_has_causalmask_no_bias_no_dropout_maxk_256.cpp} (74%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_batched_forward_bp16_no_causalmask_has_bias_no_dropout_maxk_32.cpp => fmha_batched_forward_bf16_has_causalmask_no_bias_no_dropout_maxk_32.cpp} (74%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_batched_forward_bp16_no_causalmask_has_bias_no_dropout_maxk_64.cpp => fmha_batched_forward_bf16_has_causalmask_no_bias_no_dropout_maxk_64.cpp} (74%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_batched_forward_bp16_has_causalmask_no_bias_has_dropout_maxk_128.cpp => fmha_batched_forward_bf16_no_causalmask_has_bias_has_dropout_maxk_128.cpp} (74%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_batched_forward_bp16_has_causalmask_has_bias_no_dropout_maxk_256.cpp => fmha_batched_forward_bf16_no_causalmask_has_bias_has_dropout_maxk_256.cpp} (74%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_batched_forward_bp16_has_causalmask_has_bias_no_dropout_maxk_32.cpp => fmha_batched_forward_bf16_no_causalmask_has_bias_has_dropout_maxk_32.cpp} (74%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_batched_forward_bp16_no_causalmask_has_bias_has_dropout_maxk_64.cpp => fmha_batched_forward_bf16_no_causalmask_has_bias_has_dropout_maxk_64.cpp} (74%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_batched_forward_bp16_no_causalmask_no_bias_has_dropout_maxk_128.cpp => fmha_batched_forward_bf16_no_causalmask_has_bias_no_dropout_maxk_128.cpp} (74%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_batched_forward_bp16_no_causalmask_no_bias_has_dropout_maxk_256.cpp => fmha_batched_forward_bf16_no_causalmask_has_bias_no_dropout_maxk_256.cpp} (74%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_batched_forward_bp16_has_causalmask_no_bias_no_dropout_maxk_32.cpp => fmha_batched_forward_bf16_no_causalmask_has_bias_no_dropout_maxk_32.cpp} (74%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_batched_forward_bp16_no_causalmask_no_bias_has_dropout_maxk_64.cpp => fmha_batched_forward_bf16_no_causalmask_has_bias_no_dropout_maxk_64.cpp} (74%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_batched_forward_bp16_has_causalmask_no_bias_no_dropout_maxk_128.cpp => fmha_batched_forward_bf16_no_causalmask_no_bias_has_dropout_maxk_128.cpp} (74%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_batched_forward_bp16_has_causalmask_no_bias_no_dropout_maxk_256.cpp => fmha_batched_forward_bf16_no_causalmask_no_bias_has_dropout_maxk_256.cpp} (74%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_batched_forward_bp16_no_causalmask_no_bias_has_dropout_maxk_32.cpp => fmha_batched_forward_bf16_no_causalmask_no_bias_has_dropout_maxk_32.cpp} (74%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_batched_forward_bp16_has_causalmask_no_bias_no_dropout_maxk_64.cpp => fmha_batched_forward_bf16_no_causalmask_no_bias_has_dropout_maxk_64.cpp} (74%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_batched_forward_bp16_no_causalmask_no_bias_no_dropout_maxk_128.cpp => fmha_batched_forward_bf16_no_causalmask_no_bias_no_dropout_maxk_128.cpp} (74%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_batched_forward_bp16_no_causalmask_no_bias_no_dropout_maxk_256.cpp => fmha_batched_forward_bf16_no_causalmask_no_bias_no_dropout_maxk_256.cpp} (74%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_batched_forward_bp16_no_causalmask_no_bias_no_dropout_maxk_32.cpp => fmha_batched_forward_bf16_no_causalmask_no_bias_no_dropout_maxk_32.cpp} (74%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_batched_forward_bp16_no_causalmask_no_bias_no_dropout_maxk_64.cpp => fmha_batched_forward_bf16_no_causalmask_no_bias_no_dropout_maxk_64.cpp} (74%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_batched_infer_bp16_has_causalmask_has_bias_has_dropout_maxk_128.cpp => fmha_batched_infer_bf16_has_causalmask_has_bias_has_dropout_maxk_128.cpp} (74%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_batched_infer_bp16_has_causalmask_has_bias_has_dropout_maxk_256.cpp => fmha_batched_infer_bf16_has_causalmask_has_bias_has_dropout_maxk_256.cpp} (74%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_batched_infer_bp16_has_causalmask_has_bias_has_dropout_maxk_32.cpp => fmha_batched_infer_bf16_has_causalmask_has_bias_has_dropout_maxk_32.cpp} (74%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_batched_infer_bp16_has_causalmask_has_bias_has_dropout_maxk_64.cpp => fmha_batched_infer_bf16_has_causalmask_has_bias_has_dropout_maxk_64.cpp} (74%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_batched_infer_bp16_has_causalmask_has_bias_no_dropout_maxk_128.cpp => fmha_batched_infer_bf16_has_causalmask_has_bias_no_dropout_maxk_128.cpp} (74%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_batched_infer_bp16_no_causalmask_has_bias_has_dropout_maxk_256.cpp => fmha_batched_infer_bf16_has_causalmask_has_bias_no_dropout_maxk_256.cpp} (74%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_batched_infer_bp16_has_causalmask_no_bias_has_dropout_maxk_32.cpp => fmha_batched_infer_bf16_has_causalmask_has_bias_no_dropout_maxk_32.cpp} (74%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_batched_infer_bp16_has_causalmask_has_bias_no_dropout_maxk_64.cpp => fmha_batched_infer_bf16_has_causalmask_has_bias_no_dropout_maxk_64.cpp} (74%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_batched_infer_bp16_has_causalmask_no_bias_has_dropout_maxk_128.cpp => fmha_batched_infer_bf16_has_causalmask_no_bias_has_dropout_maxk_128.cpp} (74%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_batched_infer_bp16_has_causalmask_no_bias_has_dropout_maxk_256.cpp => fmha_batched_infer_bf16_has_causalmask_no_bias_has_dropout_maxk_256.cpp} (74%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_batched_infer_bp16_no_causalmask_has_bias_has_dropout_maxk_32.cpp => fmha_batched_infer_bf16_has_causalmask_no_bias_has_dropout_maxk_32.cpp} (74%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_batched_infer_bp16_has_causalmask_no_bias_has_dropout_maxk_64.cpp => fmha_batched_infer_bf16_has_causalmask_no_bias_has_dropout_maxk_64.cpp} (74%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_batched_infer_bp16_no_causalmask_has_bias_no_dropout_maxk_128.cpp => fmha_batched_infer_bf16_has_causalmask_no_bias_no_dropout_maxk_128.cpp} (74%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_batched_infer_bp16_no_causalmask_has_bias_no_dropout_maxk_256.cpp => fmha_batched_infer_bf16_has_causalmask_no_bias_no_dropout_maxk_256.cpp} (74%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_batched_infer_bp16_no_causalmask_has_bias_no_dropout_maxk_32.cpp => fmha_batched_infer_bf16_has_causalmask_no_bias_no_dropout_maxk_32.cpp} (74%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_batched_infer_bp16_no_causalmask_has_bias_no_dropout_maxk_64.cpp => fmha_batched_infer_bf16_has_causalmask_no_bias_no_dropout_maxk_64.cpp} (74%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_batched_infer_bp16_no_causalmask_has_bias_has_dropout_maxk_128.cpp => fmha_batched_infer_bf16_no_causalmask_has_bias_has_dropout_maxk_128.cpp} (74%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_batched_infer_bp16_has_causalmask_has_bias_no_dropout_maxk_256.cpp => fmha_batched_infer_bf16_no_causalmask_has_bias_has_dropout_maxk_256.cpp} (74%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_batched_infer_bp16_has_causalmask_has_bias_no_dropout_maxk_32.cpp => fmha_batched_infer_bf16_no_causalmask_has_bias_has_dropout_maxk_32.cpp} (74%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_batched_infer_bp16_no_causalmask_has_bias_has_dropout_maxk_64.cpp => fmha_batched_infer_bf16_no_causalmask_has_bias_has_dropout_maxk_64.cpp} (74%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_batched_infer_bp16_no_causalmask_no_bias_has_dropout_maxk_128.cpp => fmha_batched_infer_bf16_no_causalmask_has_bias_no_dropout_maxk_128.cpp} (74%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_batched_infer_bp16_no_causalmask_no_bias_has_dropout_maxk_256.cpp => fmha_batched_infer_bf16_no_causalmask_has_bias_no_dropout_maxk_256.cpp} (74%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_batched_infer_bp16_no_causalmask_no_bias_has_dropout_maxk_32.cpp => fmha_batched_infer_bf16_no_causalmask_has_bias_no_dropout_maxk_32.cpp} (74%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_batched_infer_bp16_no_causalmask_no_bias_has_dropout_maxk_64.cpp => fmha_batched_infer_bf16_no_causalmask_has_bias_no_dropout_maxk_64.cpp} (74%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_batched_infer_bp16_has_causalmask_no_bias_no_dropout_maxk_128.cpp => fmha_batched_infer_bf16_no_causalmask_no_bias_has_dropout_maxk_128.cpp} (74%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_batched_infer_bp16_has_causalmask_no_bias_no_dropout_maxk_256.cpp => fmha_batched_infer_bf16_no_causalmask_no_bias_has_dropout_maxk_256.cpp} (74%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_batched_infer_bp16_has_causalmask_no_bias_no_dropout_maxk_32.cpp => fmha_batched_infer_bf16_no_causalmask_no_bias_has_dropout_maxk_32.cpp} (74%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_batched_infer_bp16_has_causalmask_no_bias_no_dropout_maxk_64.cpp => fmha_batched_infer_bf16_no_causalmask_no_bias_has_dropout_maxk_64.cpp} (74%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_batched_infer_bp16_no_causalmask_no_bias_no_dropout_maxk_128.cpp => fmha_batched_infer_bf16_no_causalmask_no_bias_no_dropout_maxk_128.cpp} (74%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_batched_infer_bp16_no_causalmask_no_bias_no_dropout_maxk_256.cpp => fmha_batched_infer_bf16_no_causalmask_no_bias_no_dropout_maxk_256.cpp} (74%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_batched_infer_bp16_no_causalmask_no_bias_no_dropout_maxk_32.cpp => fmha_batched_infer_bf16_no_causalmask_no_bias_no_dropout_maxk_32.cpp} (74%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_batched_infer_bp16_no_causalmask_no_bias_no_dropout_maxk_64.cpp => fmha_batched_infer_bf16_no_causalmask_no_bias_no_dropout_maxk_64.cpp} (74%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_grouped_backward_bp16_has_causalmask_has_bias_has_biasgrad_has_dropout_maxk_128.cpp => fmha_grouped_backward_bf16_has_causalmask_has_bias_has_biasgrad_has_dropout_maxk_128.cpp} (75%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_grouped_backward_bp16_has_causalmask_has_bias_has_biasgrad_has_dropout_maxk_32.cpp => fmha_grouped_backward_bf16_has_causalmask_has_bias_has_biasgrad_has_dropout_maxk_32.cpp} (75%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_grouped_backward_bp16_has_causalmask_has_bias_has_biasgrad_has_dropout_maxk_64.cpp => fmha_grouped_backward_bf16_has_causalmask_has_bias_has_biasgrad_has_dropout_maxk_64.cpp} (75%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_grouped_backward_bp16_no_causalmask_has_bias_has_biasgrad_has_dropout_maxk_128.cpp => fmha_grouped_backward_bf16_has_causalmask_has_bias_has_biasgrad_no_dropout_maxk_128.cpp} (75%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_grouped_backward_bp16_has_causalmask_has_bias_has_biasgrad_no_dropout_maxk_32.cpp => fmha_grouped_backward_bf16_has_causalmask_has_bias_has_biasgrad_no_dropout_maxk_32.cpp} (75%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_grouped_backward_bp16_has_causalmask_has_bias_no_biasgrad_has_dropout_maxk_64.cpp => fmha_grouped_backward_bf16_has_causalmask_has_bias_has_biasgrad_no_dropout_maxk_64.cpp} (75%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_grouped_backward_bp16_has_causalmask_has_bias_no_biasgrad_has_dropout_maxk_128.cpp => fmha_grouped_backward_bf16_has_causalmask_has_bias_no_biasgrad_has_dropout_maxk_128.cpp} (75%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_grouped_backward_bp16_no_causalmask_has_bias_has_biasgrad_has_dropout_maxk_32.cpp => fmha_grouped_backward_bf16_has_causalmask_has_bias_no_biasgrad_has_dropout_maxk_32.cpp} (75%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_grouped_backward_bp16_has_causalmask_has_bias_has_biasgrad_no_dropout_maxk_64.cpp => fmha_grouped_backward_bf16_has_causalmask_has_bias_no_biasgrad_has_dropout_maxk_64.cpp} (75%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_grouped_backward_bp16_has_causalmask_no_bias_no_biasgrad_has_dropout_maxk_128.cpp => fmha_grouped_backward_bf16_has_causalmask_has_bias_no_biasgrad_no_dropout_maxk_128.cpp} (75%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_grouped_backward_bp16_no_causalmask_has_bias_no_biasgrad_has_dropout_maxk_32.cpp => fmha_grouped_backward_bf16_has_causalmask_has_bias_no_biasgrad_no_dropout_maxk_32.cpp} (75%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_grouped_backward_bp16_no_causalmask_has_bias_no_biasgrad_has_dropout_maxk_64.cpp => fmha_grouped_backward_bf16_has_causalmask_has_bias_no_biasgrad_no_dropout_maxk_64.cpp} (75%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_grouped_backward_bp16_no_causalmask_has_bias_no_biasgrad_has_dropout_maxk_128.cpp => fmha_grouped_backward_bf16_has_causalmask_no_bias_no_biasgrad_has_dropout_maxk_128.cpp} (75%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_grouped_backward_bp16_has_causalmask_no_bias_no_biasgrad_has_dropout_maxk_32.cpp => fmha_grouped_backward_bf16_has_causalmask_no_bias_no_biasgrad_has_dropout_maxk_32.cpp} (75%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_grouped_backward_bp16_no_causalmask_has_bias_has_biasgrad_no_dropout_maxk_64.cpp => fmha_grouped_backward_bf16_has_causalmask_no_bias_no_biasgrad_has_dropout_maxk_64.cpp} (75%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_grouped_backward_bp16_no_causalmask_has_bias_no_biasgrad_no_dropout_maxk_128.cpp => fmha_grouped_backward_bf16_has_causalmask_no_bias_no_biasgrad_no_dropout_maxk_128.cpp} (75%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_grouped_backward_bp16_has_causalmask_no_bias_no_biasgrad_no_dropout_maxk_32.cpp => fmha_grouped_backward_bf16_has_causalmask_no_bias_no_biasgrad_no_dropout_maxk_32.cpp} (75%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_grouped_backward_bp16_has_causalmask_no_bias_no_biasgrad_no_dropout_maxk_64.cpp => fmha_grouped_backward_bf16_has_causalmask_no_bias_no_biasgrad_no_dropout_maxk_64.cpp} (75%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_grouped_backward_bp16_has_causalmask_has_bias_has_biasgrad_no_dropout_maxk_128.cpp => fmha_grouped_backward_bf16_no_causalmask_has_bias_has_biasgrad_has_dropout_maxk_128.cpp} (75%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_grouped_backward_bp16_has_causalmask_has_bias_no_biasgrad_has_dropout_maxk_32.cpp => fmha_grouped_backward_bf16_no_causalmask_has_bias_has_biasgrad_has_dropout_maxk_32.cpp} (75%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_grouped_backward_bp16_no_causalmask_has_bias_has_biasgrad_has_dropout_maxk_64.cpp => fmha_grouped_backward_bf16_no_causalmask_has_bias_has_biasgrad_has_dropout_maxk_64.cpp} (75%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_grouped_backward_bp16_has_causalmask_has_bias_no_biasgrad_no_dropout_maxk_128.cpp => fmha_grouped_backward_bf16_no_causalmask_has_bias_has_biasgrad_no_dropout_maxk_128.cpp} (75%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_grouped_backward_bp16_no_causalmask_has_bias_has_biasgrad_no_dropout_maxk_32.cpp => fmha_grouped_backward_bf16_no_causalmask_has_bias_has_biasgrad_no_dropout_maxk_32.cpp} (75%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_grouped_backward_bp16_has_causalmask_no_bias_no_biasgrad_has_dropout_maxk_64.cpp => fmha_grouped_backward_bf16_no_causalmask_has_bias_has_biasgrad_no_dropout_maxk_64.cpp} (75%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_grouped_backward_bp16_no_causalmask_has_bias_has_biasgrad_no_dropout_maxk_128.cpp => fmha_grouped_backward_bf16_no_causalmask_has_bias_no_biasgrad_has_dropout_maxk_128.cpp} (75%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_grouped_backward_bp16_has_causalmask_has_bias_no_biasgrad_no_dropout_maxk_32.cpp => fmha_grouped_backward_bf16_no_causalmask_has_bias_no_biasgrad_has_dropout_maxk_32.cpp} (75%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_grouped_backward_bp16_has_causalmask_has_bias_no_biasgrad_no_dropout_maxk_64.cpp => fmha_grouped_backward_bf16_no_causalmask_has_bias_no_biasgrad_has_dropout_maxk_64.cpp} (75%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_grouped_backward_bp16_has_causalmask_no_bias_no_biasgrad_no_dropout_maxk_128.cpp => fmha_grouped_backward_bf16_no_causalmask_has_bias_no_biasgrad_no_dropout_maxk_128.cpp} (75%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_grouped_backward_bp16_no_causalmask_has_bias_no_biasgrad_no_dropout_maxk_32.cpp => fmha_grouped_backward_bf16_no_causalmask_has_bias_no_biasgrad_no_dropout_maxk_32.cpp} (75%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_grouped_backward_bp16_no_causalmask_has_bias_no_biasgrad_no_dropout_maxk_64.cpp => fmha_grouped_backward_bf16_no_causalmask_has_bias_no_biasgrad_no_dropout_maxk_64.cpp} (75%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_grouped_backward_bp16_no_causalmask_no_bias_no_biasgrad_has_dropout_maxk_128.cpp => fmha_grouped_backward_bf16_no_causalmask_no_bias_no_biasgrad_has_dropout_maxk_128.cpp} (75%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_grouped_backward_bp16_no_causalmask_no_bias_no_biasgrad_has_dropout_maxk_32.cpp => fmha_grouped_backward_bf16_no_causalmask_no_bias_no_biasgrad_has_dropout_maxk_32.cpp} (75%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_grouped_backward_bp16_no_causalmask_no_bias_no_biasgrad_has_dropout_maxk_64.cpp => fmha_grouped_backward_bf16_no_causalmask_no_bias_no_biasgrad_has_dropout_maxk_64.cpp} (75%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_grouped_backward_bp16_no_causalmask_no_bias_no_biasgrad_no_dropout_maxk_128.cpp => fmha_grouped_backward_bf16_no_causalmask_no_bias_no_biasgrad_no_dropout_maxk_128.cpp} (75%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_grouped_backward_bp16_no_causalmask_no_bias_no_biasgrad_no_dropout_maxk_32.cpp => fmha_grouped_backward_bf16_no_causalmask_no_bias_no_biasgrad_no_dropout_maxk_32.cpp} (75%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_grouped_backward_bp16_no_causalmask_no_bias_no_biasgrad_no_dropout_maxk_64.cpp => fmha_grouped_backward_bf16_no_causalmask_no_bias_no_biasgrad_no_dropout_maxk_64.cpp} (75%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_grouped_forward_bp16_has_causalmask_has_bias_has_dropout_maxk_128.cpp => fmha_grouped_forward_bf16_has_causalmask_has_bias_has_dropout_maxk_128.cpp} (74%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_grouped_forward_bp16_has_causalmask_has_bias_has_dropout_maxk_256.cpp => fmha_grouped_forward_bf16_has_causalmask_has_bias_has_dropout_maxk_256.cpp} (74%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_grouped_forward_bp16_has_causalmask_has_bias_has_dropout_maxk_32.cpp => fmha_grouped_forward_bf16_has_causalmask_has_bias_has_dropout_maxk_32.cpp} (74%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_grouped_forward_bp16_has_causalmask_has_bias_has_dropout_maxk_64.cpp => fmha_grouped_forward_bf16_has_causalmask_has_bias_has_dropout_maxk_64.cpp} (74%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_grouped_forward_bp16_no_causalmask_has_bias_has_dropout_maxk_128.cpp => fmha_grouped_forward_bf16_has_causalmask_has_bias_no_dropout_maxk_128.cpp} (74%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_grouped_forward_bp16_has_causalmask_no_bias_has_dropout_maxk_256.cpp => fmha_grouped_forward_bf16_has_causalmask_has_bias_no_dropout_maxk_256.cpp} (74%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_grouped_forward_bp16_has_causalmask_no_bias_has_dropout_maxk_32.cpp => fmha_grouped_forward_bf16_has_causalmask_has_bias_no_dropout_maxk_32.cpp} (74%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_grouped_forward_bp16_has_causalmask_has_bias_no_dropout_maxk_64.cpp => fmha_grouped_forward_bf16_has_causalmask_has_bias_no_dropout_maxk_64.cpp} (74%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_grouped_forward_bp16_has_causalmask_has_bias_no_dropout_maxk_128.cpp => fmha_grouped_forward_bf16_has_causalmask_no_bias_has_dropout_maxk_128.cpp} (74%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_grouped_forward_bp16_no_causalmask_has_bias_has_dropout_maxk_256.cpp => fmha_grouped_forward_bf16_has_causalmask_no_bias_has_dropout_maxk_256.cpp} (74%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_grouped_forward_bp16_no_causalmask_has_bias_has_dropout_maxk_32.cpp => fmha_grouped_forward_bf16_has_causalmask_no_bias_has_dropout_maxk_32.cpp} (74%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_grouped_forward_bp16_has_causalmask_no_bias_has_dropout_maxk_64.cpp => fmha_grouped_forward_bf16_has_causalmask_no_bias_has_dropout_maxk_64.cpp} (74%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_grouped_forward_bp16_no_causalmask_has_bias_no_dropout_maxk_128.cpp => fmha_grouped_forward_bf16_has_causalmask_no_bias_no_dropout_maxk_128.cpp} (74%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_grouped_forward_bp16_no_causalmask_has_bias_no_dropout_maxk_256.cpp => fmha_grouped_forward_bf16_has_causalmask_no_bias_no_dropout_maxk_256.cpp} (74%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_grouped_forward_bp16_no_causalmask_has_bias_no_dropout_maxk_32.cpp => fmha_grouped_forward_bf16_has_causalmask_no_bias_no_dropout_maxk_32.cpp} (74%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_grouped_forward_bp16_no_causalmask_has_bias_no_dropout_maxk_64.cpp => fmha_grouped_forward_bf16_has_causalmask_no_bias_no_dropout_maxk_64.cpp} (74%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_grouped_forward_bp16_has_causalmask_no_bias_has_dropout_maxk_128.cpp => fmha_grouped_forward_bf16_no_causalmask_has_bias_has_dropout_maxk_128.cpp} (74%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_grouped_forward_bp16_has_causalmask_has_bias_no_dropout_maxk_256.cpp => fmha_grouped_forward_bf16_no_causalmask_has_bias_has_dropout_maxk_256.cpp} (74%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_grouped_forward_bp16_has_causalmask_has_bias_no_dropout_maxk_32.cpp => fmha_grouped_forward_bf16_no_causalmask_has_bias_has_dropout_maxk_32.cpp} (74%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_grouped_forward_bp16_no_causalmask_has_bias_has_dropout_maxk_64.cpp => fmha_grouped_forward_bf16_no_causalmask_has_bias_has_dropout_maxk_64.cpp} (74%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_grouped_forward_bp16_no_causalmask_no_bias_has_dropout_maxk_128.cpp => fmha_grouped_forward_bf16_no_causalmask_has_bias_no_dropout_maxk_128.cpp} (74%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_grouped_forward_bp16_no_causalmask_no_bias_has_dropout_maxk_256.cpp => fmha_grouped_forward_bf16_no_causalmask_has_bias_no_dropout_maxk_256.cpp} (74%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_grouped_forward_bp16_has_causalmask_no_bias_no_dropout_maxk_32.cpp => fmha_grouped_forward_bf16_no_causalmask_has_bias_no_dropout_maxk_32.cpp} (74%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_grouped_forward_bp16_no_causalmask_no_bias_has_dropout_maxk_64.cpp => fmha_grouped_forward_bf16_no_causalmask_has_bias_no_dropout_maxk_64.cpp} (74%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_grouped_forward_bp16_has_causalmask_no_bias_no_dropout_maxk_128.cpp => fmha_grouped_forward_bf16_no_causalmask_no_bias_has_dropout_maxk_128.cpp} (74%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_grouped_forward_bp16_has_causalmask_no_bias_no_dropout_maxk_256.cpp => fmha_grouped_forward_bf16_no_causalmask_no_bias_has_dropout_maxk_256.cpp} (74%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_grouped_forward_bp16_no_causalmask_no_bias_has_dropout_maxk_32.cpp => fmha_grouped_forward_bf16_no_causalmask_no_bias_has_dropout_maxk_32.cpp} (74%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_grouped_forward_bp16_has_causalmask_no_bias_no_dropout_maxk_64.cpp => fmha_grouped_forward_bf16_no_causalmask_no_bias_has_dropout_maxk_64.cpp} (74%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_grouped_forward_bp16_no_causalmask_no_bias_no_dropout_maxk_128.cpp => fmha_grouped_forward_bf16_no_causalmask_no_bias_no_dropout_maxk_128.cpp} (74%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_grouped_forward_bp16_no_causalmask_no_bias_no_dropout_maxk_256.cpp => fmha_grouped_forward_bf16_no_causalmask_no_bias_no_dropout_maxk_256.cpp} (74%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_grouped_forward_bp16_no_causalmask_no_bias_no_dropout_maxk_32.cpp => fmha_grouped_forward_bf16_no_causalmask_no_bias_no_dropout_maxk_32.cpp} (74%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_grouped_forward_bp16_no_causalmask_no_bias_no_dropout_maxk_64.cpp => fmha_grouped_forward_bf16_no_causalmask_no_bias_no_dropout_maxk_64.cpp} (74%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_grouped_infer_bp16_has_causalmask_has_bias_has_dropout_maxk_128.cpp => fmha_grouped_infer_bf16_has_causalmask_has_bias_has_dropout_maxk_128.cpp} (74%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_grouped_infer_bp16_has_causalmask_has_bias_has_dropout_maxk_256.cpp => fmha_grouped_infer_bf16_has_causalmask_has_bias_has_dropout_maxk_256.cpp} (74%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_grouped_infer_bp16_has_causalmask_has_bias_has_dropout_maxk_32.cpp => fmha_grouped_infer_bf16_has_causalmask_has_bias_has_dropout_maxk_32.cpp} (74%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_grouped_infer_bp16_has_causalmask_has_bias_has_dropout_maxk_64.cpp => fmha_grouped_infer_bf16_has_causalmask_has_bias_has_dropout_maxk_64.cpp} (74%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_grouped_infer_bp16_has_causalmask_has_bias_no_dropout_maxk_128.cpp => fmha_grouped_infer_bf16_has_causalmask_has_bias_no_dropout_maxk_128.cpp} (74%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_grouped_infer_bp16_no_causalmask_has_bias_has_dropout_maxk_256.cpp => fmha_grouped_infer_bf16_has_causalmask_has_bias_no_dropout_maxk_256.cpp} (74%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_grouped_infer_bp16_has_causalmask_no_bias_has_dropout_maxk_32.cpp => fmha_grouped_infer_bf16_has_causalmask_has_bias_no_dropout_maxk_32.cpp} (74%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_grouped_infer_bp16_has_causalmask_has_bias_no_dropout_maxk_64.cpp => fmha_grouped_infer_bf16_has_causalmask_has_bias_no_dropout_maxk_64.cpp} (74%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_grouped_infer_bp16_has_causalmask_no_bias_has_dropout_maxk_128.cpp => fmha_grouped_infer_bf16_has_causalmask_no_bias_has_dropout_maxk_128.cpp} (74%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_grouped_infer_bp16_has_causalmask_no_bias_has_dropout_maxk_256.cpp => fmha_grouped_infer_bf16_has_causalmask_no_bias_has_dropout_maxk_256.cpp} (74%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_grouped_infer_bp16_no_causalmask_has_bias_has_dropout_maxk_32.cpp => fmha_grouped_infer_bf16_has_causalmask_no_bias_has_dropout_maxk_32.cpp} (74%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_grouped_infer_bp16_has_causalmask_no_bias_has_dropout_maxk_64.cpp => fmha_grouped_infer_bf16_has_causalmask_no_bias_has_dropout_maxk_64.cpp} (74%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_grouped_infer_bp16_no_causalmask_has_bias_no_dropout_maxk_128.cpp => fmha_grouped_infer_bf16_has_causalmask_no_bias_no_dropout_maxk_128.cpp} (74%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_grouped_infer_bp16_no_causalmask_has_bias_no_dropout_maxk_256.cpp => fmha_grouped_infer_bf16_has_causalmask_no_bias_no_dropout_maxk_256.cpp} (74%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_grouped_infer_bp16_no_causalmask_has_bias_no_dropout_maxk_32.cpp => fmha_grouped_infer_bf16_has_causalmask_no_bias_no_dropout_maxk_32.cpp} (74%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_grouped_infer_bp16_no_causalmask_has_bias_no_dropout_maxk_64.cpp => fmha_grouped_infer_bf16_has_causalmask_no_bias_no_dropout_maxk_64.cpp} (74%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_grouped_infer_bp16_no_causalmask_has_bias_has_dropout_maxk_128.cpp => fmha_grouped_infer_bf16_no_causalmask_has_bias_has_dropout_maxk_128.cpp} (74%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_grouped_infer_bp16_has_causalmask_has_bias_no_dropout_maxk_256.cpp => fmha_grouped_infer_bf16_no_causalmask_has_bias_has_dropout_maxk_256.cpp} (74%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_grouped_infer_bp16_has_causalmask_has_bias_no_dropout_maxk_32.cpp => fmha_grouped_infer_bf16_no_causalmask_has_bias_has_dropout_maxk_32.cpp} (74%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_grouped_infer_bp16_no_causalmask_has_bias_has_dropout_maxk_64.cpp => fmha_grouped_infer_bf16_no_causalmask_has_bias_has_dropout_maxk_64.cpp} (74%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_grouped_infer_bp16_no_causalmask_no_bias_has_dropout_maxk_128.cpp => fmha_grouped_infer_bf16_no_causalmask_has_bias_no_dropout_maxk_128.cpp} (74%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_grouped_infer_bp16_no_causalmask_no_bias_has_dropout_maxk_256.cpp => fmha_grouped_infer_bf16_no_causalmask_has_bias_no_dropout_maxk_256.cpp} (74%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_grouped_infer_bp16_no_causalmask_no_bias_has_dropout_maxk_32.cpp => fmha_grouped_infer_bf16_no_causalmask_has_bias_no_dropout_maxk_32.cpp} (74%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_grouped_infer_bp16_no_causalmask_no_bias_has_dropout_maxk_64.cpp => fmha_grouped_infer_bf16_no_causalmask_has_bias_no_dropout_maxk_64.cpp} (74%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_grouped_infer_bp16_has_causalmask_no_bias_no_dropout_maxk_128.cpp => fmha_grouped_infer_bf16_no_causalmask_no_bias_has_dropout_maxk_128.cpp} (74%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_grouped_infer_bp16_has_causalmask_no_bias_no_dropout_maxk_256.cpp => fmha_grouped_infer_bf16_no_causalmask_no_bias_has_dropout_maxk_256.cpp} (74%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_grouped_infer_bp16_has_causalmask_no_bias_no_dropout_maxk_32.cpp => fmha_grouped_infer_bf16_no_causalmask_no_bias_has_dropout_maxk_32.cpp} (74%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_grouped_infer_bp16_has_causalmask_no_bias_no_dropout_maxk_64.cpp => fmha_grouped_infer_bf16_no_causalmask_no_bias_has_dropout_maxk_64.cpp} (74%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_grouped_infer_bp16_no_causalmask_no_bias_no_dropout_maxk_128.cpp => fmha_grouped_infer_bf16_no_causalmask_no_bias_no_dropout_maxk_128.cpp} (74%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_grouped_infer_bp16_no_causalmask_no_bias_no_dropout_maxk_256.cpp => fmha_grouped_infer_bf16_no_causalmask_no_bias_no_dropout_maxk_256.cpp} (74%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_grouped_infer_bp16_no_causalmask_no_bias_no_dropout_maxk_32.cpp => fmha_grouped_infer_bf16_no_causalmask_no_bias_no_dropout_maxk_32.cpp} (74%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_grouped_infer_bp16_no_causalmask_no_bias_no_dropout_maxk_64.cpp => fmha_grouped_infer_bf16_no_causalmask_no_bias_no_dropout_maxk_64.cpp} (74%)

diff --git a/.gitmodules b/.gitmodules
index f9d0b3979..6e56bcb9c 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -7,4 +7,4 @@
 [submodule "third_party/composable_kernel_tiled"]
 	path = third_party/composable_kernel_tiled
 	url = https://github.com/ROCm/composable_kernel.git
-	branch = ck_tile/opt_padding
+	branch = develop-xformers-test 
diff --git a/setup.py b/setup.py
index 9053e6dd2..07661243e 100644
--- a/setup.py
+++ b/setup.py
@@ -351,15 +351,6 @@ def get_extensions():
             Path(this_dir) / "xformers" / "csrc" / "attention" / "hip_fmha"
         ]
 
-        include_dirs += [
-            Path(this_dir)
-            / "third_party"
-            / "composable_kernel_tiled"
-            / "example"
-            / "91_tile_program"
-            / "fmha"
-        ]
-
         include_dirs += [
             Path(this_dir) / "third_party" / "composable_kernel_tiled" / "include"
         ]
@@ -377,7 +368,7 @@ def get_extensions():
                 "-U__CUDA_NO_HALF_CONVERSIONS__",
                 "-DCK_FMHA_FWD_FAST_EXP2=1",
                 "-fgpu-flush-denormals-to-zero",
-                "-Rpass-analysis=kernel-resource-usage",
+                ##"-Rpass-analysis=kernel-resource-usage",
             ]
             + generator_flag
             + cc_flag,
diff --git a/third_party/composable_kernel_tiled b/third_party/composable_kernel_tiled
index b79327f6e..ed3a957f1 160000
--- a/third_party/composable_kernel_tiled
+++ b/third_party/composable_kernel_tiled
@@ -1 +1 @@
-Subproject commit b79327f6eead6c71bb7f85954516198a2b7b6a6f
+Subproject commit ed3a957f1c49b6ac280e52d96dcceac920e582d9
diff --git a/xformers/csrc/attention/hip_fmha/attention_backward_generic_ck_tiled.cpp b/xformers/csrc/attention/hip_fmha/attention_backward_generic_ck_tiled.cpp
index 2fe1150dc..c9494060b 100644
--- a/xformers/csrc/attention/hip_fmha/attention_backward_generic_ck_tiled.cpp
+++ b/xformers/csrc/attention/hip_fmha/attention_backward_generic_ck_tiled.cpp
@@ -20,13 +20,13 @@
 extern void batched_backward_fp16(
     BatchedBackwardParams& param,
     hipStream_t stream);
-extern void batched_backward_bp16(
+extern void batched_backward_bf16(
     BatchedBackwardParams& param,
     hipStream_t stream);
 extern void grouped_backward_fp16(
     GroupedBackwardParams& param,
     hipStream_t stream);
-extern void grouped_backward_bp16(
+extern void grouped_backward_bf16(
     GroupedBackwardParams& param,
     hipStream_t stream);
 
@@ -492,7 +492,7 @@ efficient_attention_backward_ck(
     if (inDataType == at::ScalarType::Half) {
       batched_backward_fp16(batched_backward_params, stream);
     } else if (inDataType == at::ScalarType::BFloat16) {
-      batched_backward_bp16(batched_backward_params, stream);
+      batched_backward_bf16(batched_backward_params, stream);
     } else
       throw std::runtime_error("input data-type is not supported");
   } else { // input is grouped
@@ -503,7 +503,7 @@ efficient_attention_backward_ck(
     if (inDataType == at::ScalarType::Half) {
       grouped_backward_fp16(grouped_backward_params, stream);
     } else if (inDataType == at::ScalarType::BFloat16) {
-      grouped_backward_bp16(grouped_backward_params, stream);
+      grouped_backward_bf16(grouped_backward_params, stream);
     } else
       throw std::runtime_error("input data-type is not supported");
   }
diff --git a/xformers/csrc/attention/hip_fmha/attention_ck_rand_uniform.cpp b/xformers/csrc/attention/hip_fmha/attention_ck_rand_uniform.cpp
index b3e241844..94a7250a6 100644
--- a/xformers/csrc/attention/hip_fmha/attention_ck_rand_uniform.cpp
+++ b/xformers/csrc/attention/hip_fmha/attention_ck_rand_uniform.cpp
@@ -13,10 +13,10 @@
 #include <torch/types.h>
 #include <ATen/cuda/CUDAGraphsUtils.cuh>
 
-#include <ck/ck.hpp>
-#include <ck/host_utility/kernel_launch.hpp>
+#include <ck_tile/core.hpp>
+#include <ck_tile/host/kernel_launch.hpp>
 
-#include "fmha_rand_uniform_kernel.hpp"
+#include "ck_tiled_rand_uniform_kernel.h"
 
 namespace {
 
@@ -76,15 +76,13 @@ at::Tensor rand_uniform_int(
 
     dim3 kGridSize = FmhaRandUniformKernel_::GridSize(B, num_heads, M, N);
     constexpr dim3 kBlockSize = FmhaRandUniformKernel_::BlockSize();
-    constexpr ck::index_t kBlockPerCu = FmhaRandUniformKernel_::kBlockPerCu;
-
-    (void)launch_kernel<kBlockSize.x, kBlockPerCu>(
-        StreamConfig{stream, false},
-        FmhaRandUniformKernel_{},
-        kGridSize,
-        kBlockSize,
-        0,
-        kargs);
+    constexpr ck_tile::index_t kBlockPerCu =
+        FmhaRandUniformKernel_::kBlockPerCu;
+
+    (void)ck_tile::launch_kernel(
+        ck_tile::stream_config{stream, false},
+        ck_tile::make_kernel<kBlockSize.x, kBlockPerCu>(
+            FmhaRandUniformKernel_{}, kGridSize, kBlockSize, 0, kargs));
   }
 
   (void)hipStreamSynchronize(stream);
diff --git a/xformers/csrc/attention/hip_fmha/attention_forward_generic_ck_tiled.cpp b/xformers/csrc/attention/hip_fmha/attention_forward_generic_ck_tiled.cpp
index b78da0d4b..fb29c7d21 100644
--- a/xformers/csrc/attention/hip_fmha/attention_forward_generic_ck_tiled.cpp
+++ b/xformers/csrc/attention/hip_fmha/attention_forward_generic_ck_tiled.cpp
@@ -24,20 +24,20 @@
 extern void batched_forward_fp16(
     BatchedForwardParams& param,
     hipStream_t stream);
-extern void batched_forward_bp16(
+extern void batched_forward_bf16(
     BatchedForwardParams& param,
     hipStream_t stream);
 extern void grouped_forward_fp16(
     GroupedForwardParams& param,
     hipStream_t stream);
-extern void grouped_forward_bp16(
+extern void grouped_forward_bf16(
     GroupedForwardParams& param,
     hipStream_t stream);
 
 extern void batched_infer_fp16(BatchedForwardParams& param, hipStream_t stream);
-extern void batched_infer_bp16(BatchedForwardParams& param, hipStream_t stream);
+extern void batched_infer_bf16(BatchedForwardParams& param, hipStream_t stream);
 extern void grouped_infer_fp16(GroupedForwardParams& param, hipStream_t stream);
-extern void grouped_infer_bp16(GroupedForwardParams& param, hipStream_t stream);
+extern void grouped_infer_bf16(GroupedForwardParams& param, hipStream_t stream);
 
 namespace {
 
@@ -342,14 +342,14 @@ efficient_attention_forward_ck(
       if (inDataType == at::ScalarType::Half) {
         batched_infer_fp16(batched_forward_params, stream);
       } else if (inDataType == at::ScalarType::BFloat16) {
-        batched_infer_bp16(batched_forward_params, stream);
+        batched_infer_bf16(batched_forward_params, stream);
       } else
         throw std::runtime_error("input data-type is not supported!");
     } else {
       if (inDataType == at::ScalarType::Half) {
         batched_forward_fp16(batched_forward_params, stream);
       } else if (inDataType == at::ScalarType::BFloat16) {
-        batched_forward_bp16(batched_forward_params, stream);
+        batched_forward_bf16(batched_forward_params, stream);
       } else
         throw std::runtime_error("input data-type is not supported!");
     };
@@ -362,14 +362,14 @@ efficient_attention_forward_ck(
       if (inDataType == at::ScalarType::Half) {
         grouped_infer_fp16(grouped_forward_params, stream);
       } else if (inDataType == at::ScalarType::BFloat16) {
-        grouped_infer_bp16(grouped_forward_params, stream);
+        grouped_infer_bf16(grouped_forward_params, stream);
       } else
         throw std::runtime_error("input data-type is not supported!");
     } else {
       if (inDataType == at::ScalarType::Half) {
         grouped_forward_fp16(grouped_forward_params, stream);
       } else if (inDataType == at::ScalarType::BFloat16) {
-        grouped_forward_bp16(grouped_forward_params, stream);
+        grouped_forward_bf16(grouped_forward_params, stream);
       } else
         throw std::runtime_error("input data-type is not supported!");
     };
diff --git a/xformers/csrc/attention/hip_fmha/ck_attention_forward_decoder.h b/xformers/csrc/attention/hip_fmha/ck_attention_forward_decoder.h
index cc6cdebbc..741eda2ef 100644
--- a/xformers/csrc/attention/hip_fmha/ck_attention_forward_decoder.h
+++ b/xformers/csrc/attention/hip_fmha/ck_attention_forward_decoder.h
@@ -10,9 +10,10 @@
 #include <ck/stream_config.hpp>
 #include <ck/tensor_operation/gpu/device/device_base.hpp>
 #include <ck/utility/data_type.hpp>
-#include <ck/utility/inner_product.hpp>
 #include <ck/utility/math.hpp>
-#include <ck/utility/math_v2.hpp>
+
+#include "ck_attention_inner_product.h"
+#include "ck_attention_math_ext.h"
 
 namespace {
 
diff --git a/xformers/csrc/attention/hip_fmha/ck_attention_forward_decoder_splitk.h b/xformers/csrc/attention/hip_fmha/ck_attention_forward_decoder_splitk.h
index 6d18846e7..bb45f3796 100644
--- a/xformers/csrc/attention/hip_fmha/ck_attention_forward_decoder_splitk.h
+++ b/xformers/csrc/attention/hip_fmha/ck_attention_forward_decoder_splitk.h
@@ -4,9 +4,10 @@
 #include <ck/stream_config.hpp>
 #include <ck/tensor_operation/gpu/device/device_base.hpp>
 #include <ck/utility/data_type.hpp>
-#include <ck/utility/inner_product.hpp>
 #include <ck/utility/math.hpp>
-#include <ck/utility/math_v2.hpp>
+
+#include "ck_attention_inner_product.h"
+#include "ck_attention_math_ext.h"
 
 namespace {
 
diff --git a/xformers/csrc/attention/hip_fmha/ck_attention_inner_product.h b/xformers/csrc/attention/hip_fmha/ck_attention_inner_product.h
new file mode 100644
index 000000000..ec97bfdd0
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/ck_attention_inner_product.h
@@ -0,0 +1,351 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#pragma once
+
+#include <ck/ck.hpp>
+#include <ck/utility/data_type.hpp>
+#include <ck/utility/type_convert.hpp>
+
+namespace ck {
+
+template <typename TA, typename TB, typename TC>
+__device__ void inner_product(const TA& a, const TB& b, TC& c);
+
+template <>
+__device__ void inner_product<float, float, float>(
+    const float& a,
+    const float& b,
+    float& c) {
+#if CK_USE_AMD_V_MAC_INLINE_ASM && defined(CK_USE_AMD_V_MAC_F32)
+  asm volatile(
+      "\n \
+            v_mac_f32 %0, %1, %2 \n \
+            "
+      : "=v"(c)
+      : "v"(a), "v"(b), "0"(c));
+#elif CK_USE_AMD_V_MAC_INLINE_ASM && defined(CK_USE_AMD_V_FMAC_F32)
+  asm volatile(
+      "\n \
+            v_fmac_f32 %0, %1, %2 \n \
+            "
+      : "=v"(c)
+      : "v"(a), "v"(b), "0"(c));
+#else
+  c += a * b;
+#endif
+}
+
+template <>
+__device__ void inner_product<float2_t, float2_t, float>(
+    const float2_t& a,
+    const float2_t& b,
+    float& c) {
+  constexpr auto I0 = Number<0>{};
+  constexpr auto I1 = Number<1>{};
+
+  inner_product(
+      vector_type<float, 2>{a}.AsType<float>()[I0],
+      vector_type<float, 2>{b}.AsType<float>()[I0],
+      c);
+
+  inner_product(
+      vector_type<float, 2>{a}.AsType<float>()[I1],
+      vector_type<float, 2>{b}.AsType<float>()[I1],
+      c);
+}
+
+template <>
+__device__ void inner_product<float4_t, float4_t, float>(
+    const float4_t& a,
+    const float4_t& b,
+    float& c) {
+  constexpr auto I0 = Number<0>{};
+  constexpr auto I1 = Number<1>{};
+  constexpr auto I2 = Number<2>{};
+  constexpr auto I3 = Number<3>{};
+
+  inner_product(
+      vector_type<float, 4>{a}.AsType<float>()[I0],
+      vector_type<float, 4>{b}.AsType<float>()[I0],
+      c);
+
+  inner_product(
+      vector_type<float, 4>{a}.AsType<float>()[I1],
+      vector_type<float, 4>{b}.AsType<float>()[I1],
+      c);
+
+  inner_product(
+      vector_type<float, 4>{a}.AsType<float>()[I2],
+      vector_type<float, 4>{b}.AsType<float>()[I2],
+      c);
+
+  inner_product(
+      vector_type<float, 4>{a}.AsType<float>()[I3],
+      vector_type<float, 4>{b}.AsType<float>()[I3],
+      c);
+}
+
+template <>
+__device__ void inner_product<bhalf_t, bhalf_t, float>(
+    const bhalf_t& a,
+    const bhalf_t& b,
+    float& c) {
+  inner_product(type_convert<float>(a), type_convert<float>(b), c);
+}
+
+template <>
+__device__ void inner_product<half_t, half_t, float>(
+    const half_t& a,
+    const half_t& b,
+    float& c) {
+  inner_product(type_convert<float>(a), type_convert<float>(b), c);
+}
+
+template <>
+__device__ void inner_product<half2_t, half2_t, float>(
+    const half2_t& a,
+    const half2_t& b,
+    float& c) {
+#if defined(CK_USE_AMD_V_DOT2_F32_F16)
+#if CK_USE_AMD_V_DOT_INLINE_ASM
+  // Use 3 x s_nop to avoid hazard (mi200 cdna2 isa page 47
+  // https://www.amd.com/system/files/TechDocs/instinct-mi200-cdna2-instruction-set-architecture.pdf
+  // ) s_nop with parameter 2 is equal to 3 x s_nop
+  asm volatile(
+      "\n \
+            v_dot2_f32_f16 %0, %1, %2, %0\n \
+            s_nop 2 \n \
+            "
+      : "=v"(c)
+      : "v"(a), "v"(b), "0"(c));
+#else
+  c = __builtin_amdgcn_fdot2(a, b, c, false);
+#endif
+#else
+  const vector_type<half_t, 2> a_vector{a};
+  const vector_type<half_t, 2> b_vector{b};
+
+  static_for<0, 2, 1>{}([&](auto i) {
+    c += type_convert<float>(a_vector.AsType<half_t>()[i]) *
+        type_convert<float>(b_vector.AsType<half_t>()[i]);
+  });
+#endif
+}
+
+template <>
+__device__ void inner_product<half4_t, half4_t, float>(
+    const half4_t& a,
+    const half4_t& b,
+    float& c) {
+  constexpr auto I0 = Number<0>{};
+  constexpr auto I1 = Number<1>{};
+
+  inner_product(
+      vector_type<half_t, 4>{a}.AsType<half2_t>()[I0],
+      vector_type<half_t, 4>{b}.AsType<half2_t>()[I0],
+      c);
+
+  inner_product(
+      vector_type<half_t, 4>{a}.AsType<half2_t>()[I1],
+      vector_type<half_t, 4>{b}.AsType<half2_t>()[I1],
+      c);
+}
+
+template <>
+__device__ void inner_product<half8_t, half8_t, float>(
+    const half8_t& a,
+    const half8_t& b,
+    float& c) {
+  constexpr auto I0 = Number<0>{};
+  constexpr auto I1 = Number<1>{};
+  constexpr auto I2 = Number<2>{};
+  constexpr auto I3 = Number<3>{};
+
+  inner_product(
+      vector_type<half_t, 8>{a}.AsType<half2_t>()[I0],
+      vector_type<half_t, 8>{b}.AsType<half2_t>()[I0],
+      c);
+
+  inner_product(
+      vector_type<half_t, 8>{a}.AsType<half2_t>()[I1],
+      vector_type<half_t, 8>{b}.AsType<half2_t>()[I1],
+      c);
+
+  inner_product(
+      vector_type<half_t, 8>{a}.AsType<half2_t>()[I2],
+      vector_type<half_t, 8>{b}.AsType<half2_t>()[I2],
+      c);
+
+  inner_product(
+      vector_type<half_t, 8>{a}.AsType<half2_t>()[I3],
+      vector_type<half_t, 8>{b}.AsType<half2_t>()[I3],
+      c);
+}
+
+template <>
+__device__ void inner_product<bhalf2_t, bhalf2_t, float>(
+    const bhalf2_t& a,
+    const bhalf2_t& b,
+    float& c) {
+  constexpr auto I0 = Number<0>{};
+  constexpr auto I1 = Number<1>{};
+
+  inner_product(
+      vector_type<bhalf_t, 2>{a}.AsType<bhalf_t>()[I0],
+      vector_type<bhalf_t, 2>{b}.AsType<bhalf_t>()[I0],
+      c);
+
+  inner_product(
+      vector_type<bhalf_t, 2>{a}.AsType<bhalf_t>()[I1],
+      vector_type<bhalf_t, 2>{b}.AsType<bhalf_t>()[I1],
+      c);
+}
+
+template <>
+__device__ void inner_product<bhalf4_t, bhalf4_t, float>(
+    const bhalf4_t& a,
+    const bhalf4_t& b,
+    float& c) {
+  constexpr auto I0 = Number<0>{};
+  constexpr auto I1 = Number<1>{};
+  constexpr auto I2 = Number<2>{};
+  constexpr auto I3 = Number<3>{};
+
+  inner_product(
+      vector_type<bhalf_t, 4>{a}.AsType<bhalf_t>()[I0],
+      vector_type<bhalf_t, 4>{b}.AsType<bhalf_t>()[I0],
+      c);
+
+  inner_product(
+      vector_type<bhalf_t, 4>{a}.AsType<bhalf_t>()[I1],
+      vector_type<bhalf_t, 4>{b}.AsType<bhalf_t>()[I1],
+      c);
+
+  inner_product(
+      vector_type<bhalf_t, 4>{a}.AsType<bhalf_t>()[I2],
+      vector_type<bhalf_t, 4>{b}.AsType<bhalf_t>()[I2],
+      c);
+
+  inner_product(
+      vector_type<bhalf_t, 4>{a}.AsType<bhalf_t>()[I3],
+      vector_type<bhalf_t, 4>{b}.AsType<bhalf_t>()[I3],
+      c);
+}
+
+template <>
+__device__ void inner_product<int8_t, int8_t, int32_t>(
+    const int8_t& a,
+    const int8_t& b,
+    int32_t& c) {
+  c += type_convert<int32_t>(a) * type_convert<int32_t>(b);
+}
+
+template <>
+__device__ void inner_product<int8x2_t, int8x2_t, int32_t>(
+    const int8x2_t& a,
+    const int8x2_t& b,
+    int32_t& c) {
+  constexpr auto I0 = Number<0>{};
+  constexpr auto I1 = Number<1>{};
+
+  inner_product(
+      vector_type<int8_t, 2>{a}.AsType<int8_t>()[I0],
+      vector_type<int8_t, 2>{b}.AsType<int8_t>()[I0],
+      c);
+
+  inner_product(
+      vector_type<int8_t, 2>{a}.AsType<int8_t>()[I1],
+      vector_type<int8_t, 2>{b}.AsType<int8_t>()[I1],
+      c);
+}
+
+template <>
+__device__ void inner_product<int8x4_t, int8x4_t, int32_t>(
+    const int8x4_t& a,
+    const int8x4_t& b,
+    int32_t& c) {
+#if defined(CK_USE_AMD_V_DOT4_I32_I8)
+#if CK_USE_AMD_V_DOT_INLINE_ASM
+  // Use 3 x s_nop to avoid hazard (mi200 cdna2 isa page 47
+  // https://www.amd.com/system/files/TechDocs/instinct-mi200-cdna2-instruction-set-architecture.pdf
+  // ) s_nop with parameter 2 is equal to 3 x s_nop
+  asm volatile(
+      "\n \
+            v_dot4_i32_i8 %0, %1, %2, %0\n \
+            s_nop 2 \n \
+            "
+      : "=v"(c)
+      : "v"(bit_cast<int32_t>(a)), "v"(bit_cast<int32_t>(b)), "0"(c));
+#else
+  c = __builtin_amdgcn_sdot4(
+      bit_cast<int32_t>(a), bit_cast<int32_t>(b), c, false);
+#endif
+#elif defined(CK_USE_AMD_V_DOT4_I32_I8_GFX11)
+  c = __builtin_amdgcn_sudot4(
+      true, bit_cast<int32_t>(a), true, bit_cast<int32_t>(b), c, false);
+#else
+  const vector_type<int8_t, 4> a_vector{a};
+  const vector_type<int8_t, 4> b_vector{b};
+
+  static_for<0, 4, 1>{}([&](auto i) {
+    c += type_convert<int32_t>(a_vector.AsType<int8_t>()[i]) *
+        type_convert<int32_t>(b_vector.AsType<int8_t>()[i]);
+  });
+#endif
+}
+
+template <>
+__device__ void inner_product<int8x8_t, int8x8_t, int32_t>(
+    const int8x8_t& a,
+    const int8x8_t& b,
+    int32_t& c) {
+  constexpr auto I0 = Number<0>{};
+  constexpr auto I1 = Number<1>{};
+
+  inner_product(
+      vector_type<int8_t, 8>{a}.AsType<int8x4_t>()[I0],
+      vector_type<int8_t, 8>{b}.AsType<int8x4_t>()[I0],
+      c);
+
+  inner_product(
+      vector_type<int8_t, 8>{a}.AsType<int8x4_t>()[I1],
+      vector_type<int8_t, 8>{b}.AsType<int8x4_t>()[I1],
+      c);
+}
+
+template <>
+__device__ void inner_product<int8x16_t, int8x16_t, int32_t>(
+    const int8x16_t& a,
+    const int8x16_t& b,
+    int32_t& c) {
+  constexpr auto I0 = Number<0>{};
+  constexpr auto I1 = Number<1>{};
+  constexpr auto I2 = Number<2>{};
+  constexpr auto I3 = Number<3>{};
+
+  inner_product(
+      vector_type<int8_t, 16>{a}.AsType<int8x4_t>()[I0],
+      vector_type<int8_t, 16>{b}.AsType<int8x4_t>()[I0],
+      c);
+
+  inner_product(
+      vector_type<int8_t, 16>{a}.AsType<int8x4_t>()[I1],
+      vector_type<int8_t, 16>{b}.AsType<int8x4_t>()[I1],
+      c);
+
+  inner_product(
+      vector_type<int8_t, 16>{a}.AsType<int8x4_t>()[I2],
+      vector_type<int8_t, 16>{b}.AsType<int8x4_t>()[I2],
+      c);
+
+  inner_product(
+      vector_type<int8_t, 16>{a}.AsType<int8x4_t>()[I3],
+      vector_type<int8_t, 16>{b}.AsType<int8x4_t>()[I3],
+      c);
+}
+
+} // namespace ck
diff --git a/xformers/csrc/attention/hip_fmha/ck_attention_math_ext.h b/xformers/csrc/attention/hip_fmha/ck_attention_math_ext.h
new file mode 100644
index 000000000..2695a127f
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/ck_attention_math_ext.h
@@ -0,0 +1,29 @@
+/*
+ * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#pragma once
+
+#include <ck/utility/data_type.hpp>
+#include <ck/utility/type_convert.hpp>
+
+namespace ck {
+namespace math {
+template <typename T>
+inline __device__ T exp(T x) {
+  return ck::type_convert<T>(__expf(ck::type_convert<float>(x)));
+};
+
+template <>
+inline __device__ float exp<float>(float x) {
+  return __expf(x);
+};
+
+template <>
+inline __device__ double exp<double>(double x) {
+  return exp(x);
+};
+} // namespace math
+} // namespace ck
diff --git a/xformers/csrc/attention/hip_fmha/ck_fmha_util.h b/xformers/csrc/attention/hip_fmha/ck_fmha_util.h
index a6ea50d78..b782f96ee 100644
--- a/xformers/csrc/attention/hip_fmha/ck_fmha_util.h
+++ b/xformers/csrc/attention/hip_fmha/ck_fmha_util.h
@@ -13,10 +13,6 @@
 
 #include <torch/torch.h>
 
-#include <ck/ck.hpp>
-#include <ck/utility/data_type.hpp>
-#include <ck/utility/sequence.hpp>
-
 #define XFORMERS_CHECK(COND, ERR)          \
   if (!(COND)) {                           \
     std::ostringstream ostr;               \
@@ -24,50 +20,6 @@
     throw std::runtime_error(ostr.str());  \
   }
 
-#define DISPATCH_TYPES(InDataType, func)                                 \
-  {                                                                      \
-    if (InDataType == at::ScalarType::Half) {                            \
-      using scalar_t = ck::half_t;                                       \
-      func();                                                            \
-    } else if (InDataType == at::ScalarType::BFloat16) {                 \
-      using scalar_t = ck::bhalf_t;                                      \
-      func();                                                            \
-    } else {                                                             \
-      XFORMERS_CHECK(                                                    \
-          false, "Only half & bf16 input type supported at the moment"); \
-    }                                                                    \
-  }
-
-template <typename scalar_t>
-struct CkToAtenDtype;
-
-template <>
-struct CkToAtenDtype<ck::half_t> {
-  using scalar_t = ck::half_t;
-
-  static constexpr __host__ at::ScalarType atScalarType() {
-    return at::ScalarType::Half;
-  }
-};
-
-template <>
-struct CkToAtenDtype<ck::bhalf_t> {
-  using scalar_t = ck::bhalf_t;
-
-  static constexpr __host__ at::ScalarType atScalarType() {
-    return at::ScalarType::BFloat16;
-  }
-};
-
-template <>
-struct CkToAtenDtype<float> {
-  using scalar_t = float;
-
-  static constexpr __host__ at::ScalarType atScalarType() {
-    return at::ScalarType::Float;
-  }
-};
-
 #define CHECK_NOSPARSE_CONTIGUOUS_CUDA(TENSOR)                            \
   XFORMERS_CHECK(TENSOR.is_cuda(), #TENSOR " must be a CUDA tensor");     \
   XFORMERS_CHECK(!TENSOR.is_sparse(), #TENSOR " must be a dense tensor"); \
diff --git a/xformers/csrc/attention/hip_fmha/ck_tiled_bool_switch.h b/xformers/csrc/attention/hip_fmha/ck_tiled_bool_switch.h
index c07559a3c..a2bf752d8 100644
--- a/xformers/csrc/attention/hip_fmha/ck_tiled_bool_switch.h
+++ b/xformers/csrc/attention/hip_fmha/ck_tiled_bool_switch.h
@@ -4,6 +4,73 @@
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
  */
+
 #pragma once
 
-#include <ck/utility/static_switch.hpp>
+#define BOOL_SWITCH(COND1, CONST_NAME1, ...) \
+  [&] {                                      \
+    if (COND1) {                             \
+      constexpr bool CONST_NAME1 = true;     \
+      __VA_ARGS__();                         \
+    } else {                                 \
+      constexpr bool CONST_NAME1 = false;    \
+      __VA_ARGS__();                         \
+    }                                        \
+  }()
+
+#define BOOL_SWITCH_2(COND1, CONST_NAME1, COND2, CONST_NAME2, ...) \
+  [&] {                                                            \
+    if (COND1) {                                                   \
+      constexpr bool CONST_NAME1 = true;                           \
+      BOOL_SWITCH(COND2, CONST_NAME2, ##__VA_ARGS__);              \
+    } else {                                                       \
+      constexpr bool CONST_NAME1 = false;                          \
+      BOOL_SWITCH(COND2, CONST_NAME2, ##__VA_ARGS__);              \
+    }                                                              \
+  }()
+
+#define BOOL_SWITCH_3(                                                      \
+    COND1, CONST_NAME1, COND2, CONST_NAME2, COND3, CONST_NAME3, ...)        \
+  [&] {                                                                     \
+    if (COND1) {                                                            \
+      constexpr bool CONST_NAME1 = true;                                    \
+      BOOL_SWITCH_2(COND2, CONST_NAME2, COND3, CONST_NAME3, ##__VA_ARGS__); \
+    } else {                                                                \
+      constexpr bool CONST_NAME1 = false;                                   \
+      BOOL_SWITCH_2(COND2, CONST_NAME2, COND3, CONST_NAME3, ##__VA_ARGS__); \
+    }                                                                       \
+  }()
+
+#define BOOL_SWITCH_4(                    \
+    COND1,                                \
+    CONST_NAME1,                          \
+    COND2,                                \
+    CONST_NAME2,                          \
+    COND3,                                \
+    CONST_NAME3,                          \
+    COND4,                                \
+    CONST_NAME4,                          \
+    ...)                                  \
+  [&] {                                   \
+    if (COND1) {                          \
+      constexpr bool CONST_NAME1 = true;  \
+      BOOL_SWITCH_3(                      \
+          COND2,                          \
+          CONST_NAME2,                    \
+          COND3,                          \
+          CONST_NAME3,                    \
+          COND4,                          \
+          CONST_NAME4,                    \
+          ##__VA_ARGS__);                 \
+    } else {                              \
+      constexpr bool CONST_NAME1 = false; \
+      BOOL_SWITCH_3(                      \
+          COND2,                          \
+          CONST_NAME2,                    \
+          COND3,                          \
+          CONST_NAME3,                    \
+          COND4,                          \
+          CONST_NAME4,                    \
+          ##__VA_ARGS__);                 \
+    }                                     \
+  }()
diff --git a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_backward.h b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_backward.h
index 4c979ecc2..4a535aa5a 100644
--- a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_backward.h
+++ b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_backward.h
@@ -6,66 +6,48 @@
  */
 #pragma once
 
-#include <ck/host_utility/device_prop.hpp>
-#include <ck/host_utility/kernel_launch.hpp>
-#include <ck/tensor/tensor_view.hpp>
-#include <ck/tensor_description/cluster_descriptor.hpp>
-#include <ck/tensor_description/tensor_descriptor_helper.hpp>
-#include <ck/utility/common_header.hpp>
-
-#include <ck/tile_program/block_tile/block_masking.hpp>
-#include <ck/tile_program/block_tile_pipeline/block_fmha_bwd_dot_do_o.hpp>
-#include <ck/tile_program/block_tile_pipeline/block_fmha_bwd_pipeline_problem.hpp>
-#include <ck/tile_program/tile/tile_fmha_shape.hpp>
-#include <ck/tile_program/tile/tile_fmha_traits.hpp>
+#include <ck_tile/core/numeric/integer.hpp>
+#include <ck_tile/host.hpp>
+#include <ck_tile/ops/epilogue.hpp>
+#include <ck_tile/ops/fmha.hpp>
 
 #include "ck_tiled_bool_switch.h"
 #include "ck_tiled_fmha_bwd_setting.h"
 #include "ck_tiled_fmha_params.h"
 
-#include "fmha_bwd_epilogue.hpp"
-#include "fmha_bwd_kernel.hpp"
-#include "fmha_bwd_tile_partitioner.hpp"
-
 template <
     typename ScalarType,
     bool kHasCausalMask,
     bool kHasBias,
     bool kHasBiasGrad,
     bool kHasDropout,
-    ck::index_t MaxK>
+    ck_tile::index_t MaxK>
 struct batched_backward_causalmask_bias_dropout_dispatch {
-  using FmhaBwdEpilogue_ = FmhaBwdEpilogue<FmhaBwdEpilogueProblem<
+  template <typename FmhaTraits, typename FmhaMask>
+  using FmhaBwdPipelineProblemTemp = ck_tile::BlockFmhaBwdPipelineProblem<
+      typename FmhaBwdTypeConfig<ScalarType>::QDataType,
+      typename FmhaBwdTypeConfig<ScalarType>::KDataType,
+      typename FmhaBwdTypeConfig<ScalarType>::VDataType,
+      typename FmhaBwdTypeConfig<ScalarType>::GemmDataType,
+      typename FmhaBwdTypeConfig<ScalarType>::LSEDataType,
       typename FmhaBwdTypeConfig<ScalarType>::AccDataType,
+      typename FmhaBwdTypeConfig<ScalarType>::DDataType,
+      typename FmhaBwdTypeConfig<ScalarType>::BiasDataType,
+      typename FmhaBwdTypeConfig<ScalarType>::RandValOutputDataType,
+      typename FmhaBwdTypeConfig<ScalarType>::ODataType,
+      typename FmhaBwdTypeConfig<ScalarType>::OGradDataType,
+      typename FmhaBwdTypeConfig<ScalarType>::QGradDataType,
       typename FmhaBwdTypeConfig<ScalarType>::KGradDataType,
-      typename FmhaBwdTypeConfig<ScalarType>::VGradDataType>>;
-
-  template <typename FmhaTraits, typename FmhaMask>
-  using FmhaBwdPipelineProblemTemp =
-      ck::tile_program::block::BlockFmhaBwdPipelineProblem<
-          typename FmhaBwdTypeConfig<ScalarType>::QDataType,
-          typename FmhaBwdTypeConfig<ScalarType>::KDataType,
-          typename FmhaBwdTypeConfig<ScalarType>::VDataType,
-          typename FmhaBwdTypeConfig<ScalarType>::GemmDataType,
-          typename FmhaBwdTypeConfig<ScalarType>::LSEDataType,
-          typename FmhaBwdTypeConfig<ScalarType>::AccDataType,
-          typename FmhaBwdTypeConfig<ScalarType>::DDataType,
-          typename FmhaBwdTypeConfig<ScalarType>::BiasDataType,
-          typename FmhaBwdTypeConfig<ScalarType>::RandValOutputDataType,
-          typename FmhaBwdTypeConfig<ScalarType>::ODataType,
-          typename FmhaBwdTypeConfig<ScalarType>::OGradDataType,
-          typename FmhaBwdTypeConfig<ScalarType>::QGradDataType,
-          typename FmhaBwdTypeConfig<ScalarType>::KGradDataType,
-          typename FmhaBwdTypeConfig<ScalarType>::VGradDataType,
-          typename FmhaBwdTypeConfig<ScalarType>::BiasGradDataType,
-          FmhaBwdShape<MaxK>,
-          false, // kIsGroupMode
-          FmhaMask,
-          FmhaTraits>;
+      typename FmhaBwdTypeConfig<ScalarType>::VGradDataType,
+      typename FmhaBwdTypeConfig<ScalarType>::BiasGradDataType,
+      FmhaBwdShape<MaxK>,
+      false, // kIsGroupMode
+      FmhaMask,
+      FmhaTraits>;
 
   static void Run(BatchedBackwardParams& param, hipStream_t stream) {
     {
-      constexpr ck::index_t kBlockSize = 256;
+      constexpr ck_tile::index_t kBlockSize = 256;
 
       const bool pad_seqlen_q = !(param.M % kBlockSize == 0);
       const bool pad_headdim_v =
@@ -73,16 +55,15 @@ struct batched_backward_causalmask_bias_dropout_dispatch {
 
       BOOL_SWITCH_2(
           pad_seqlen_q, kPadSeqLenQ, pad_headdim_v, kPadHeadDimV, [&] {
-            constexpr ck::index_t occupancy = 2;
+            constexpr ck_tile::index_t occupancy = 2;
 
-            using FmhaOGradDotOTraits_ =
-                ck::tile_program::TileFmhaBwdOGradDotOTraits<
-                    kPadSeqLenQ,
-                    kPadHeadDimV,
-                    occupancy>;
+            using FmhaOGradDotOTraits_ = ck_tile::TileFmhaBwdOGradDotOTraits<
+                kPadSeqLenQ,
+                kPadHeadDimV,
+                occupancy>;
 
             using FmhaBwdOGradDotOPipelineProblem =
-                ck::tile_program::block::BlockFmhaBwdOGradDotOPipelineProblem<
+                ck_tile::BlockFmhaBwdOGradDotOPipelineProblem<
                     typename FmhaBwdTypeConfig<ScalarType>::ODataType,
                     typename FmhaBwdTypeConfig<ScalarType>::OGradDataType,
                     typename FmhaBwdTypeConfig<ScalarType>::DDataType,
@@ -92,11 +73,11 @@ struct batched_backward_causalmask_bias_dropout_dispatch {
                     FmhaOGradDotOTraits_>;
 
             using FmhaBwdOGradDotOPipeline =
-                typename ck::tile_program::block::BlockFmhaBwdOGradDotO<
+                typename ck_tile::BlockFmhaBwdOGradDotO<
                     FmhaBwdOGradDotOPipelineProblem>;
 
-            using FmhaBwdOGradDotOKernel_ = FmhaBwdOGradDotOKernel<
-                FmhaBwdOGradDotOTilePartitioner<kBlockSize>,
+            using FmhaBwdOGradDotOKernel_ = ck_tile::FmhaBwdOGradDotOKernel<
+                ck_tile::FmhaBwdOGradDotOTilePartitioner<kBlockSize>,
                 FmhaBwdOGradDotOPipeline>;
 
             RunWithBwdOGradDotOKernel<FmhaBwdOGradDotOKernel_>(param, stream);
@@ -107,15 +88,18 @@ struct batched_backward_causalmask_bias_dropout_dispatch {
       const bool has_local_attention = (param.window_size > 0) ? true : false;
 
       BOOL_SWITCH(has_local_attention, USE_LOCAL_ATTENTION, [&] {
-        constexpr ck::index_t occupancy = 1;
+        constexpr ck_tile::index_t occupancy = 1;
         constexpr bool has_masking = kHasCausalMask || USE_LOCAL_ATTENTION;
 
-        using FmhaMask =
-            ck::tile_program::block::SimplifiedGenericAttentionMask<
-                has_masking>;
+        using FmhaMask = ck_tile::SimplifiedGenericAttentionMask<has_masking>;
 
         using FmhaBwdShape_ = FmhaBwdShape<MaxK>;
-        using FmhaBwdTilePartitioner_ = FmhaBwdTilePartitioner<FmhaBwdShape_>;
+        using FmhaBwdTilePartitioner_ =
+            ck_tile::FmhaBwdTilePartitioner<FmhaBwdShape_>;
+
+        constexpr auto kBiasEnum = kHasBias
+            ? ck_tile::BlockAttentionBiasEnum::ELEMENTWISE_BIAS
+            : ck_tile::BlockAttentionBiasEnum::NO_BIAS;
 
         constexpr bool kPadSeqLenQ = true;
         constexpr bool kPadSeqLenK = true;
@@ -128,15 +112,16 @@ struct batched_backward_causalmask_bias_dropout_dispatch {
         const bool pad_headdim = (pad_headdim_q || pad_headdim_v);
 
         BOOL_SWITCH(pad_headdim, kPadHeadDim, [&] {
-          using FmhaBwdTraits_ = ck::tile_program::TileFmhaTraits<
+          using FmhaBwdTraits_ = ck_tile::TileFmhaTraits<
               kPadSeqLenQ,
               kPadSeqLenK,
               kPadHeadDim, // kPadHeadDimQ,
               kPadHeadDim, // kPadHeadDimV,
-              kHasBias,
+              kBiasEnum,
               kHasBiasGrad,
               false, // kStoreLSE
               kHasDropout,
+              false, // kDoFp8StaticQuant place-holder
               occupancy>;
 
           using FmhaBwdPipelineProblem =
@@ -149,10 +134,25 @@ struct batched_backward_causalmask_bias_dropout_dispatch {
               FmhaBwdPipelineEnum_,
               FmhaBwdPipelineProblem>::pipeline;
 
-          using FmhaBwdDQDKDVKernel_ = FmhaBwdDQDKDVKernel<
+          using FmhaBwdKGradEpilogue_ =
+              ck_tile::Default2DEpilogue<ck_tile::Default2DEpilogueProblem<
+                  typename FmhaBwdTypeConfig<ScalarType>::AccDataType,
+                  typename FmhaBwdTypeConfig<ScalarType>::KGradDataType,
+                  kPadSeqLenK,
+                  kPadHeadDim>>;
+
+          using FmhaBwdVGradEpilogue_ =
+              ck_tile::Default2DEpilogue<ck_tile::Default2DEpilogueProblem<
+                  typename FmhaBwdTypeConfig<ScalarType>::AccDataType,
+                  typename FmhaBwdTypeConfig<ScalarType>::VGradDataType,
+                  kPadSeqLenK,
+                  kPadHeadDim>>;
+
+          using FmhaBwdDQDKDVKernel_ = ck_tile::FmhaBwdDQDKDVKernel<
               FmhaBwdTilePartitioner_,
               FmhaBwdPipeline_,
-              FmhaBwdEpilogue_>;
+              FmhaBwdKGradEpilogue_,
+              FmhaBwdVGradEpilogue_>;
 
           RunWithBwdDQDKDVKernel<FmhaBwdDQDKDVKernel_>(param, stream);
         });
@@ -185,15 +185,13 @@ struct batched_backward_causalmask_bias_dropout_dispatch {
     dim3 kGridSize =
         FmhaBwdOGradDotOKernel::GridSize(param.B, param.Hq, param.M);
     constexpr dim3 kBlockSize = FmhaBwdOGradDotOKernel::BlockSize();
-    constexpr ck::index_t kBlockPerCu = FmhaBwdOGradDotOKernel::kBlockPerCu;
-
-    (void)launch_kernel<kBlockSize.x, kBlockPerCu>(
-        StreamConfig{stream, false},
-        FmhaBwdOGradDotOKernel{},
-        kGridSize,
-        kBlockSize,
-        0,
-        kargs);
+    constexpr ck_tile::index_t kBlockPerCu =
+        FmhaBwdOGradDotOKernel::kBlockPerCu;
+
+    (void)ck_tile::launch_kernel(
+        ck_tile::stream_config{stream, false},
+        ck_tile::make_kernel<kBlockSize.x, kBlockPerCu>(
+            FmhaBwdOGradDotOKernel{}, kGridSize, kBlockSize, 0, kargs));
   }
 
   template <typename FmhaBwdDQDKDVKernel>
@@ -265,15 +263,12 @@ struct batched_backward_causalmask_bias_dropout_dispatch {
 
     dim3 kGridSize = FmhaBwdDQDKDVKernel::GridSize(param.B, param.Hq, param.N);
     constexpr dim3 kBlockSize = FmhaBwdDQDKDVKernel::BlockSize();
-    constexpr ck::index_t kBlockPerCu = FmhaBwdDQDKDVKernel::kBlockPerCu;
-
-    (void)launch_kernel<kBlockSize.x, kBlockPerCu>(
-        StreamConfig{stream, false},
-        FmhaBwdDQDKDVKernel{},
-        kGridSize,
-        kBlockSize,
-        0,
-        kargs);
+    constexpr ck_tile::index_t kBlockPerCu = FmhaBwdDQDKDVKernel::kBlockPerCu;
+
+    (void)ck_tile::launch_kernel(
+        ck_tile::stream_config{stream, false},
+        ck_tile::make_kernel<kBlockSize.x, kBlockPerCu>(
+            FmhaBwdDQDKDVKernel{}, kGridSize, kBlockSize, 0, kargs));
   }
 };
 
@@ -283,7 +278,7 @@ template <
     bool kHasBias,
     bool kHasBiasGrad,
     bool kHasDropout,
-    ck::index_t MaxK>
+    ck_tile::index_t MaxK>
 void run_batched_backward_causalmask_bias_dropout_dispatch(
     BatchedBackwardParams& param,
     hipStream_t stream) {
diff --git a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_backward_bp16.cpp b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_backward_bf16.cpp
similarity index 71%
rename from xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_backward_bp16.cpp
rename to xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_backward_bf16.cpp
index 87f4ad107..a9e17ee73 100644
--- a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_backward_bp16.cpp
+++ b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_backward_bf16.cpp
@@ -4,8 +4,7 @@
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
  */
-#include <ck/ck.hpp>
-#include <ck/utility/data_type.hpp>
+#include <ck_tile/core.hpp>
 #include <stdexcept>
 
 #include "ck_tiled_bool_switch.h"
@@ -13,86 +12,86 @@
 #include "ck_tiled_headdim_switch.h"
 
 // clang-format off
-extern template void run_batched_backward_causalmask_bias_dropout_dispatch<ck::bhalf_t, false, true, true, true, 32>(
+extern template void run_batched_backward_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, false, true, true, true, 32>(
     BatchedBackwardParams& param, hipStream_t stream);
-extern template void run_batched_backward_causalmask_bias_dropout_dispatch<ck::bhalf_t, false, true, false, true, 32>(
+extern template void run_batched_backward_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, false, true, false, true, 32>(
     BatchedBackwardParams& param, hipStream_t stream);
-extern template void run_batched_backward_causalmask_bias_dropout_dispatch<ck::bhalf_t, false, false, false, true, 32>(
+extern template void run_batched_backward_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, false, false, false, true, 32>(
     BatchedBackwardParams& param, hipStream_t stream);
-extern template void run_batched_backward_causalmask_bias_dropout_dispatch<ck::bhalf_t, true, true, true, true, 32>(
+extern template void run_batched_backward_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, true, true, true, true, 32>(
     BatchedBackwardParams& param, hipStream_t stream);
-extern template void run_batched_backward_causalmask_bias_dropout_dispatch<ck::bhalf_t, true, true, false, true, 32>(
+extern template void run_batched_backward_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, true, true, false, true, 32>(
     BatchedBackwardParams& param, hipStream_t stream);
-extern template void run_batched_backward_causalmask_bias_dropout_dispatch<ck::bhalf_t, true, false, false, true, 32>(
+extern template void run_batched_backward_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, true, false, false, true, 32>(
     BatchedBackwardParams& param, hipStream_t stream);
 
-extern template void run_batched_backward_causalmask_bias_dropout_dispatch<ck::bhalf_t, false, true, true, false, 32>(
+extern template void run_batched_backward_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, false, true, true, false, 32>(
     BatchedBackwardParams& param, hipStream_t stream);
-extern template void run_batched_backward_causalmask_bias_dropout_dispatch<ck::bhalf_t, false, true, false, false, 32>(
+extern template void run_batched_backward_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, false, true, false, false, 32>(
     BatchedBackwardParams& param, hipStream_t stream);
-extern template void run_batched_backward_causalmask_bias_dropout_dispatch<ck::bhalf_t, false, false, false, false, 32>(
+extern template void run_batched_backward_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, false, false, false, false, 32>(
     BatchedBackwardParams& param, hipStream_t stream);
-extern template void run_batched_backward_causalmask_bias_dropout_dispatch<ck::bhalf_t, true, true, true, false, 32>(
+extern template void run_batched_backward_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, true, true, true, false, 32>(
     BatchedBackwardParams& param, hipStream_t stream);
-extern template void run_batched_backward_causalmask_bias_dropout_dispatch<ck::bhalf_t, true, true, false, false, 32>(
+extern template void run_batched_backward_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, true, true, false, false, 32>(
     BatchedBackwardParams& param, hipStream_t stream);
-extern template void run_batched_backward_causalmask_bias_dropout_dispatch<ck::bhalf_t, true, false, false, false, 32>(
+extern template void run_batched_backward_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, true, false, false, false, 32>(
     BatchedBackwardParams& param, hipStream_t stream);
 
-extern template void run_batched_backward_causalmask_bias_dropout_dispatch<ck::bhalf_t, false, true, true, true, 64>(
+extern template void run_batched_backward_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, false, true, true, true, 64>(
     BatchedBackwardParams& param, hipStream_t stream);
-extern template void run_batched_backward_causalmask_bias_dropout_dispatch<ck::bhalf_t, false, true, false, true, 64>(
+extern template void run_batched_backward_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, false, true, false, true, 64>(
     BatchedBackwardParams& param, hipStream_t stream);
-extern template void run_batched_backward_causalmask_bias_dropout_dispatch<ck::bhalf_t, false, false, false, true, 64>(
+extern template void run_batched_backward_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, false, false, false, true, 64>(
     BatchedBackwardParams& param, hipStream_t stream);
-extern template void run_batched_backward_causalmask_bias_dropout_dispatch<ck::bhalf_t, true, true, true, true, 64>(
+extern template void run_batched_backward_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, true, true, true, true, 64>(
     BatchedBackwardParams& param, hipStream_t stream);
-extern template void run_batched_backward_causalmask_bias_dropout_dispatch<ck::bhalf_t, true, true, false, true, 64>(
+extern template void run_batched_backward_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, true, true, false, true, 64>(
     BatchedBackwardParams& param, hipStream_t stream);
-extern template void run_batched_backward_causalmask_bias_dropout_dispatch<ck::bhalf_t, true, false, false, true, 64>(
+extern template void run_batched_backward_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, true, false, false, true, 64>(
     BatchedBackwardParams& param, hipStream_t stream);
 
-extern template void run_batched_backward_causalmask_bias_dropout_dispatch<ck::bhalf_t, false, true, true, false, 64>(
+extern template void run_batched_backward_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, false, true, true, false, 64>(
     BatchedBackwardParams& param, hipStream_t stream);
-extern template void run_batched_backward_causalmask_bias_dropout_dispatch<ck::bhalf_t, false, true, false, false, 64>(
+extern template void run_batched_backward_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, false, true, false, false, 64>(
     BatchedBackwardParams& param, hipStream_t stream);
-extern template void run_batched_backward_causalmask_bias_dropout_dispatch<ck::bhalf_t, false, false, false, false, 64>(
+extern template void run_batched_backward_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, false, false, false, false, 64>(
     BatchedBackwardParams& param, hipStream_t stream);
-extern template void run_batched_backward_causalmask_bias_dropout_dispatch<ck::bhalf_t, true, true, true, false, 64>(
+extern template void run_batched_backward_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, true, true, true, false, 64>(
     BatchedBackwardParams& param, hipStream_t stream);
-extern template void run_batched_backward_causalmask_bias_dropout_dispatch<ck::bhalf_t, true, true, false, false, 64>(
+extern template void run_batched_backward_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, true, true, false, false, 64>(
     BatchedBackwardParams& param, hipStream_t stream);
-extern template void run_batched_backward_causalmask_bias_dropout_dispatch<ck::bhalf_t, true, false, false, false, 64>(
+extern template void run_batched_backward_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, true, false, false, false, 64>(
     BatchedBackwardParams& param, hipStream_t stream);
 
-extern template void run_batched_backward_causalmask_bias_dropout_dispatch<ck::bhalf_t, false, true, true, true, 128>(
+extern template void run_batched_backward_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, false, true, true, true, 128>(
     BatchedBackwardParams& param, hipStream_t stream);
-extern template void run_batched_backward_causalmask_bias_dropout_dispatch<ck::bhalf_t, false, true, false, true, 128>(
+extern template void run_batched_backward_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, false, true, false, true, 128>(
     BatchedBackwardParams& param, hipStream_t stream);
-extern template void run_batched_backward_causalmask_bias_dropout_dispatch<ck::bhalf_t, false, false, false, true, 128>(
+extern template void run_batched_backward_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, false, false, false, true, 128>(
     BatchedBackwardParams& param, hipStream_t stream);
-extern template void run_batched_backward_causalmask_bias_dropout_dispatch<ck::bhalf_t, true, true, true, true, 128>(
+extern template void run_batched_backward_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, true, true, true, true, 128>(
     BatchedBackwardParams& param, hipStream_t stream);
-extern template void run_batched_backward_causalmask_bias_dropout_dispatch<ck::bhalf_t, true, true, false, true, 128>(
+extern template void run_batched_backward_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, true, true, false, true, 128>(
     BatchedBackwardParams& param, hipStream_t stream);
-extern template void run_batched_backward_causalmask_bias_dropout_dispatch<ck::bhalf_t, true, false, false, true, 128>(
+extern template void run_batched_backward_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, true, false, false, true, 128>(
     BatchedBackwardParams& param, hipStream_t stream);
 
-extern template void run_batched_backward_causalmask_bias_dropout_dispatch<ck::bhalf_t, false, true, true, false, 128>(
+extern template void run_batched_backward_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, false, true, true, false, 128>(
     BatchedBackwardParams& param, hipStream_t stream);
-extern template void run_batched_backward_causalmask_bias_dropout_dispatch<ck::bhalf_t, false, true, false, false, 128>(
+extern template void run_batched_backward_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, false, true, false, false, 128>(
     BatchedBackwardParams& param, hipStream_t stream);
-extern template void run_batched_backward_causalmask_bias_dropout_dispatch<ck::bhalf_t, false, false, false, false, 128>(
+extern template void run_batched_backward_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, false, false, false, false, 128>(
     BatchedBackwardParams& param, hipStream_t stream);
-extern template void run_batched_backward_causalmask_bias_dropout_dispatch<ck::bhalf_t, true, true, true, false, 128>(
+extern template void run_batched_backward_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, true, true, true, false, 128>(
     BatchedBackwardParams& param, hipStream_t stream);
-extern template void run_batched_backward_causalmask_bias_dropout_dispatch<ck::bhalf_t, true, true, false, false, 128>(
+extern template void run_batched_backward_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, true, true, false, false, 128>(
     BatchedBackwardParams& param, hipStream_t stream);
-extern template void run_batched_backward_causalmask_bias_dropout_dispatch<ck::bhalf_t, true, false, false, false, 128>(
+extern template void run_batched_backward_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, true, false, false, false, 128>(
     BatchedBackwardParams& param, hipStream_t stream);
 // clang-format on
 
-void batched_backward_bp16(BatchedBackwardParams& param, hipStream_t stream) {
+void batched_backward_bf16(BatchedBackwardParams& param, hipStream_t stream) {
   const bool has_dropout = (param.dropout_prob > 0.0f);
   BOOL_SWITCH_3(
       param.has_attn_bias,
@@ -106,7 +105,7 @@ void batched_backward_bp16(BatchedBackwardParams& param, hipStream_t stream) {
           FMHA_BWD_HEADDIM_SWITCH(param.K, param.Kv, MaxK, [&] {
             if (param.custom_mask_type == 0)
               run_batched_backward_causalmask_bias_dropout_dispatch<
-                  ck::bhalf_t,
+                  ck_tile::bf16_t,
                   false,
                   kHasBias,
                   kHasBiasGrad,
@@ -114,7 +113,7 @@ void batched_backward_bp16(BatchedBackwardParams& param, hipStream_t stream) {
                   MaxK>(param, stream);
             else if (param.custom_mask_type == 1 || param.custom_mask_type == 2)
               run_batched_backward_causalmask_bias_dropout_dispatch<
-                  ck::bhalf_t,
+                  ck_tile::bf16_t,
                   true,
                   kHasBias,
                   kHasBiasGrad,
diff --git a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_backward_fp16.cpp b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_backward_fp16.cpp
index ed39b5a89..17c4aa9d3 100644
--- a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_backward_fp16.cpp
+++ b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_backward_fp16.cpp
@@ -4,8 +4,7 @@
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
  */
-#include <ck/ck.hpp>
-#include <ck/utility/data_type.hpp>
+#include <ck_tile/core.hpp>
 #include <stdexcept>
 
 #include "ck_tiled_bool_switch.h"
@@ -13,82 +12,82 @@
 #include "ck_tiled_headdim_switch.h"
 
 // clang-format off
-extern template void run_batched_backward_causalmask_bias_dropout_dispatch<ck::half_t, false, true, true, true, 32>(
+extern template void run_batched_backward_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, false, true, true, true, 32>(
     BatchedBackwardParams& param, hipStream_t stream);
-extern template void run_batched_backward_causalmask_bias_dropout_dispatch<ck::half_t, false, true, false, true, 32>(
+extern template void run_batched_backward_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, false, true, false, true, 32>(
     BatchedBackwardParams& param, hipStream_t stream);
-extern template void run_batched_backward_causalmask_bias_dropout_dispatch<ck::half_t, false, false, false, true, 32>(
+extern template void run_batched_backward_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, false, false, false, true, 32>(
     BatchedBackwardParams& param, hipStream_t stream);
-extern template void run_batched_backward_causalmask_bias_dropout_dispatch<ck::half_t, true, true, true, true, 32>(
+extern template void run_batched_backward_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, true, true, true, true, 32>(
     BatchedBackwardParams& param, hipStream_t stream);
-extern template void run_batched_backward_causalmask_bias_dropout_dispatch<ck::half_t, true, true, false, true, 32>(
+extern template void run_batched_backward_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, true, true, false, true, 32>(
     BatchedBackwardParams& param, hipStream_t stream);
-extern template void run_batched_backward_causalmask_bias_dropout_dispatch<ck::half_t, true, false, false, true, 32>(
+extern template void run_batched_backward_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, true, false, false, true, 32>(
     BatchedBackwardParams& param, hipStream_t stream);
 
-extern template void run_batched_backward_causalmask_bias_dropout_dispatch<ck::half_t, false, true, true, false, 32>(
+extern template void run_batched_backward_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, false, true, true, false, 32>(
     BatchedBackwardParams& param, hipStream_t stream);
-extern template void run_batched_backward_causalmask_bias_dropout_dispatch<ck::half_t, false, true, false, false, 32>(
+extern template void run_batched_backward_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, false, true, false, false, 32>(
     BatchedBackwardParams& param, hipStream_t stream);
-extern template void run_batched_backward_causalmask_bias_dropout_dispatch<ck::half_t, false, false, false, false, 32>(
+extern template void run_batched_backward_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, false, false, false, false, 32>(
     BatchedBackwardParams& param, hipStream_t stream);
-extern template void run_batched_backward_causalmask_bias_dropout_dispatch<ck::half_t, true, true, true, false, 32>(
+extern template void run_batched_backward_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, true, true, true, false, 32>(
     BatchedBackwardParams& param, hipStream_t stream);
-extern template void run_batched_backward_causalmask_bias_dropout_dispatch<ck::half_t, true, true, false, false, 32>(
+extern template void run_batched_backward_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, true, true, false, false, 32>(
     BatchedBackwardParams& param, hipStream_t stream);
-extern template void run_batched_backward_causalmask_bias_dropout_dispatch<ck::half_t, true, false, false, false, 32>(
+extern template void run_batched_backward_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, true, false, false, false, 32>(
     BatchedBackwardParams& param, hipStream_t stream);
 
-extern template void run_batched_backward_causalmask_bias_dropout_dispatch<ck::half_t, false, true, true, true, 64>(
+extern template void run_batched_backward_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, false, true, true, true, 64>(
     BatchedBackwardParams& param, hipStream_t stream);
-extern template void run_batched_backward_causalmask_bias_dropout_dispatch<ck::half_t, false, true, false, true, 64>(
+extern template void run_batched_backward_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, false, true, false, true, 64>(
     BatchedBackwardParams& param, hipStream_t stream);
-extern template void run_batched_backward_causalmask_bias_dropout_dispatch<ck::half_t, false, false, false, true, 64>(
+extern template void run_batched_backward_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, false, false, false, true, 64>(
     BatchedBackwardParams& param, hipStream_t stream);
-extern template void run_batched_backward_causalmask_bias_dropout_dispatch<ck::half_t, true, true, true, true, 64>(
+extern template void run_batched_backward_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, true, true, true, true, 64>(
     BatchedBackwardParams& param, hipStream_t stream);
-extern template void run_batched_backward_causalmask_bias_dropout_dispatch<ck::half_t, true, true, false, true, 64>(
+extern template void run_batched_backward_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, true, true, false, true, 64>(
     BatchedBackwardParams& param, hipStream_t stream);
-extern template void run_batched_backward_causalmask_bias_dropout_dispatch<ck::half_t, true, false, false, true, 64>(
+extern template void run_batched_backward_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, true, false, false, true, 64>(
     BatchedBackwardParams& param, hipStream_t stream);
 
-extern template void run_batched_backward_causalmask_bias_dropout_dispatch<ck::half_t, false, true, true, false, 64>(
+extern template void run_batched_backward_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, false, true, true, false, 64>(
     BatchedBackwardParams& param, hipStream_t stream);
-extern template void run_batched_backward_causalmask_bias_dropout_dispatch<ck::half_t, false, true, false, false, 64>(
+extern template void run_batched_backward_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, false, true, false, false, 64>(
     BatchedBackwardParams& param, hipStream_t stream);
-extern template void run_batched_backward_causalmask_bias_dropout_dispatch<ck::half_t, false, false, false, false, 64>(
+extern template void run_batched_backward_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, false, false, false, false, 64>(
     BatchedBackwardParams& param, hipStream_t stream);
-extern template void run_batched_backward_causalmask_bias_dropout_dispatch<ck::half_t, true, true, true, false, 64>(
+extern template void run_batched_backward_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, true, true, true, false, 64>(
     BatchedBackwardParams& param, hipStream_t stream);
-extern template void run_batched_backward_causalmask_bias_dropout_dispatch<ck::half_t, true, true, false, false, 64>(
+extern template void run_batched_backward_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, true, true, false, false, 64>(
     BatchedBackwardParams& param, hipStream_t stream);
-extern template void run_batched_backward_causalmask_bias_dropout_dispatch<ck::half_t, true, false, false, false, 64>(
+extern template void run_batched_backward_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, true, false, false, false, 64>(
     BatchedBackwardParams& param, hipStream_t stream);
 
-extern template void run_batched_backward_causalmask_bias_dropout_dispatch<ck::half_t, false, true, true, true, 128>(
+extern template void run_batched_backward_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, false, true, true, true, 128>(
     BatchedBackwardParams& param, hipStream_t stream);
-extern template void run_batched_backward_causalmask_bias_dropout_dispatch<ck::half_t, false, true, false, true, 128>(
+extern template void run_batched_backward_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, false, true, false, true, 128>(
     BatchedBackwardParams& param, hipStream_t stream);
-extern template void run_batched_backward_causalmask_bias_dropout_dispatch<ck::half_t, false, false, false, true, 128>(
+extern template void run_batched_backward_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, false, false, false, true, 128>(
     BatchedBackwardParams& param, hipStream_t stream);
-extern template void run_batched_backward_causalmask_bias_dropout_dispatch<ck::half_t, true, true, true, true, 128>(
+extern template void run_batched_backward_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, true, true, true, true, 128>(
     BatchedBackwardParams& param, hipStream_t stream);
-extern template void run_batched_backward_causalmask_bias_dropout_dispatch<ck::half_t, true, true, false, true, 128>(
+extern template void run_batched_backward_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, true, true, false, true, 128>(
     BatchedBackwardParams& param, hipStream_t stream);
-extern template void run_batched_backward_causalmask_bias_dropout_dispatch<ck::half_t, true, false, false, true, 128>(
+extern template void run_batched_backward_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, true, false, false, true, 128>(
     BatchedBackwardParams& param, hipStream_t stream);
 
-extern template void run_batched_backward_causalmask_bias_dropout_dispatch<ck::half_t, false, true, true, false, 128>(
+extern template void run_batched_backward_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, false, true, true, false, 128>(
     BatchedBackwardParams& param, hipStream_t stream);
-extern template void run_batched_backward_causalmask_bias_dropout_dispatch<ck::half_t, false, true, false, false, 128>(
+extern template void run_batched_backward_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, false, true, false, false, 128>(
     BatchedBackwardParams& param, hipStream_t stream);
-extern template void run_batched_backward_causalmask_bias_dropout_dispatch<ck::half_t, false, false, false, false, 128>(
+extern template void run_batched_backward_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, false, false, false, false, 128>(
     BatchedBackwardParams& param, hipStream_t stream);
-extern template void run_batched_backward_causalmask_bias_dropout_dispatch<ck::half_t, true, true, true, false, 128>(
+extern template void run_batched_backward_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, true, true, true, false, 128>(
     BatchedBackwardParams& param, hipStream_t stream);
-extern template void run_batched_backward_causalmask_bias_dropout_dispatch<ck::half_t, true, true, false, false, 128>(
+extern template void run_batched_backward_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, true, true, false, false, 128>(
     BatchedBackwardParams& param, hipStream_t stream);
-extern template void run_batched_backward_causalmask_bias_dropout_dispatch<ck::half_t, true, false, false, false, 128>(
+extern template void run_batched_backward_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, true, false, false, false, 128>(
     BatchedBackwardParams& param, hipStream_t stream);
 // clang-format on
 
@@ -106,7 +105,7 @@ void batched_backward_fp16(BatchedBackwardParams& param, hipStream_t stream) {
           FMHA_BWD_HEADDIM_SWITCH(param.K, param.Kv, MaxK, [&] {
             if (param.custom_mask_type == 0)
               run_batched_backward_causalmask_bias_dropout_dispatch<
-                  ck::half_t,
+                  ck_tile::fp16_t,
                   false,
                   kHasBias,
                   kHasBiasGrad,
@@ -114,7 +113,7 @@ void batched_backward_fp16(BatchedBackwardParams& param, hipStream_t stream) {
                   MaxK>(param, stream);
             else if (param.custom_mask_type == 1 || param.custom_mask_type == 2)
               run_batched_backward_causalmask_bias_dropout_dispatch<
-                  ck::half_t,
+                  ck_tile::fp16_t,
                   true,
                   kHasBias,
                   kHasBiasGrad,
diff --git a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_forward.h b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_forward.h
index 501f0c675..20c1b2c3e 100644
--- a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_forward.h
+++ b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_forward.h
@@ -6,53 +6,39 @@
  */
 #pragma once
 
-#include <ck/host_utility/device_prop.hpp>
-#include <ck/host_utility/kernel_launch.hpp>
-#include <ck/tensor/tensor_view.hpp>
-#include <ck/tensor_description/cluster_descriptor.hpp>
-#include <ck/tensor_description/tensor_descriptor_helper.hpp>
-#include <ck/utility/common_header.hpp>
-
-#include <ck/tile_program/block_tile/block_masking.hpp>
-#include <ck/tile_program/block_tile_pipeline/block_fmha_pipeline_problem.hpp>
-#include <ck/tile_program/block_tile_pipeline/block_fmha_pipeline_qr_ks_vs.hpp>
-#include <ck/tile_program/block_tile_pipeline/block_fmha_pipeline_qr_ks_vs_async.hpp>
-#include <ck/tile_program/tile/tile_fmha_shape.hpp>
-#include <ck/tile_program/tile/tile_fmha_traits.hpp>
+#include <ck_tile/core/numeric/integer.hpp>
+#include <ck_tile/host.hpp>
+#include <ck_tile/ops/epilogue.hpp>
+#include <ck_tile/ops/fmha.hpp>
 
 #include "ck_tiled_bool_switch.h"
 #include "ck_tiled_fmha_fwd_setting.h"
 #include "ck_tiled_fmha_params.h"
 
-#include "fmha_fwd_epilogue.hpp"
-#include "fmha_fwd_kernel.hpp"
-#include "fmha_fwd_tile_partitioner.hpp"
-
 template <
     typename ScalarType,
     bool kHasCausalMask,
     bool kHasBias,
     bool kHasDropout,
-    ck::index_t MaxK>
+    ck_tile::index_t MaxK>
 struct batched_forward_causalmask_bias_dropout_dispatch {
   template <typename FmhaTraits, typename FmhaMask>
-  using FmhaPipelineProblemTemp =
-      ck::tile_program::block::BlockFmhaPipelineProblem<
-          typename FmhaFwdTypeConfig<ScalarType>::QDataType,
-          typename FmhaFwdTypeConfig<ScalarType>::KDataType,
-          typename FmhaFwdTypeConfig<ScalarType>::VDataType,
-          typename FmhaFwdTypeConfig<ScalarType>::SaccDataType,
-          typename FmhaFwdTypeConfig<ScalarType>::SMPLComputeDataType,
-          typename FmhaFwdTypeConfig<ScalarType>::BiasDataType,
-          typename FmhaFwdTypeConfig<ScalarType>::RandValOutputDataType,
-          typename FmhaFwdTypeConfig<ScalarType>::LSEDataType,
-          typename FmhaFwdTypeConfig<ScalarType>::PDataType,
-          typename FmhaFwdTypeConfig<ScalarType>::OaccDataType,
-          typename FmhaFwdTypeConfig<ScalarType>::ODataType,
-          FmhaFwdShape<MaxK>,
-          false, // kIsGroupMode
-          FmhaMask,
-          FmhaTraits>;
+  using FmhaPipelineProblemTemp = ck_tile::BlockFmhaPipelineProblem<
+      typename FmhaFwdTypeConfig<ScalarType>::QDataType,
+      typename FmhaFwdTypeConfig<ScalarType>::KDataType,
+      typename FmhaFwdTypeConfig<ScalarType>::VDataType,
+      typename FmhaFwdTypeConfig<ScalarType>::SaccDataType,
+      typename FmhaFwdTypeConfig<ScalarType>::SMPLComputeDataType,
+      typename FmhaFwdTypeConfig<ScalarType>::BiasDataType,
+      typename FmhaFwdTypeConfig<ScalarType>::RandValOutputDataType,
+      typename FmhaFwdTypeConfig<ScalarType>::LSEDataType,
+      typename FmhaFwdTypeConfig<ScalarType>::PDataType,
+      typename FmhaFwdTypeConfig<ScalarType>::OaccDataType,
+      typename FmhaFwdTypeConfig<ScalarType>::ODataType,
+      FmhaFwdShape<MaxK>,
+      false, // kIsGroupMode
+      FmhaMask,
+      FmhaTraits>;
 
   static void Run(BatchedForwardParams& param, hipStream_t stream) {
     const bool has_local_attention = (param.window_size > 0) ? true : false;
@@ -60,14 +46,18 @@ struct batched_forward_causalmask_bias_dropout_dispatch {
     BOOL_SWITCH(has_local_attention, USE_LOCAL_ATTENTION, [&] {
       constexpr bool has_masking = kHasCausalMask || USE_LOCAL_ATTENTION;
 
-      using FmhaMask =
-          ck::tile_program::block::SimplifiedGenericAttentionMask<has_masking>;
+      using FmhaMask = ck_tile::SimplifiedGenericAttentionMask<has_masking>;
 
       using FmhaFwdShape_ = FmhaFwdShape<MaxK>;
-      using FmhaFwdTilePartitioner_ = FmhaFwdTilePartitioner<FmhaFwdShape_>;
-      constexpr ck::index_t occupancy =
+      using FmhaFwdTilePartitioner_ =
+          ck_tile::FmhaFwdTilePartitioner<FmhaFwdShape_>;
+      constexpr ck_tile::index_t occupancy =
           (MaxK == 64) ? 3 : ((MaxK == 256) ? 1 : 2);
 
+      constexpr auto kBiasEnum = kHasBias
+          ? ck_tile::BlockAttentionBiasEnum::ELEMENTWISE_BIAS
+          : ck_tile::BlockAttentionBiasEnum::NO_BIAS;
+
       const bool pad_seqlen_q = !(param.M % FmhaFwdShape_::kM0 == 0);
       const bool pad_seqlen_k =
           (param.N == 0) || !(param.N % FmhaFwdShape_::kN0 == 0);
@@ -82,7 +72,6 @@ struct batched_forward_causalmask_bias_dropout_dispatch {
       const bool use_async_pipeline =
           ((param.K % 8 == 0) && (param.Kv % 8 == 0) && (MaxK <= 128));
 
-      /*      if (!use_async_pipeline) { */
       BOOL_SWITCH_3(
           pad_seqlen_q,
           kPadSeqLenQ,
@@ -91,69 +80,38 @@ struct batched_forward_causalmask_bias_dropout_dispatch {
           pad_headdim,
           kPadHeadDim,
           [&] {
-            using FmhaFwdTraits_ = ck::tile_program::TileFmhaTraits<
+            using FmhaFwdTraits_ = ck_tile::TileFmhaTraits<
                 kPadSeqLenQ,
                 kPadSeqLenK,
                 kPadHeadDim, // kPadHeadDimQ
                 kPadHeadDim, // kPadHeadDimV
-                kHasBias,
+                kBiasEnum,
                 false, // kHasBiasGrad place-holder
                 true, // kStoreLSE
                 kHasDropout,
+                false, // kDoFp8StaticQuant place-holder
                 occupancy>;
 
             using FmhaPipelineProblem =
                 FmhaPipelineProblemTemp<FmhaFwdTraits_, FmhaMask>;
 
             using FmhaFwdPipeline_ =
-                ck::tile_program::block::BlockFmhaPipelineQRKSVS<
-                    FmhaPipelineProblem>;
+                ck_tile::BlockFmhaPipelineQRKSVS<FmhaPipelineProblem>;
 
-            using FmhaFwdEpilogue_ = FmhaFwdEpilogue<FmhaFwdEpilogueProblem<
-                typename FmhaFwdTypeConfig<ScalarType>::OaccDataType,
-                typename FmhaFwdTypeConfig<ScalarType>::ODataType,
-                kPadSeqLenQ,
-                kPadHeadDim>>;
+            using FmhaFwdEpilogue_ =
+                ck_tile::Default2DEpilogue<ck_tile::Default2DEpilogueProblem<
+                    typename FmhaFwdTypeConfig<ScalarType>::OaccDataType,
+                    typename FmhaFwdTypeConfig<ScalarType>::ODataType,
+                    kPadSeqLenQ,
+                    kPadHeadDim>>;
 
-            using FmhaFwdKernel_ = FmhaFwdKernel<
+            using FmhaFwdKernel_ = ck_tile::FmhaFwdKernel<
                 FmhaFwdTilePartitioner_,
                 FmhaFwdPipeline_,
                 FmhaFwdEpilogue_>;
 
             RunWithKernel<FmhaFwdKernel_>(param, stream);
           });
-      /*
-            } else {
-              BOOL_SWITCH(pad_seqlen_k, kPadSeqLenK,
-         [&] { using FmhaFwdTraits_ = ck::tile_program::TileFmhaTraits< true, //
-         kPadSeqLenQ, kPadSeqLenK, true, // kPadHeadDimQ true, // kPadHeadDimV
-                    kHasBias,
-                    true, // kStoreLSE
-                    kHasDropout,
-                    occupancy>;
-
-                using FmhaPipelineProblem =
-                    FmhaPipelineProblemTemp<FmhaFwdTraits_, FmhaMask>;
-
-                using FmhaFwdPipeline_ =
-                    ck::tile_program::block::BlockFmhaPipelineQRKSVSAsync<
-                        FmhaPipelineProblem>;
-
-                using FmhaFwdEpilogue_ = FmhaFwdEpilogue<FmhaFwdEpilogueProblem<
-                    typename FmhaFwdTypeConfig<ScalarType>::OaccDataType,
-                    typename FmhaFwdTypeConfig<ScalarType>::ODataType,
-                    true,
-                    true>>;
-
-                using FmhaFwdKernel_ = FmhaFwdKernel<
-                    FmhaFwdTilePartitioner_,
-                    FmhaFwdPipeline_,
-                    FmhaFwdEpilogue_>;
-
-                RunWithKernel<FmhaFwdKernel_>(param, stream);
-              });
-            };
-      */
     });
   };
 
@@ -175,6 +133,8 @@ struct batched_forward_causalmask_bias_dropout_dispatch {
           param.Hq, // nhead_q
           param.Hq / param.Hkv, // nhead_ratio_qk
           param.scale,
+          1.0f, // scale_p
+          1.0f, // scale_o
           param.q_strides[1], // q, k, v, bias, randval, out tensor seq-dim
                               // stride
           param.k_strides[1],
@@ -187,7 +147,7 @@ struct batched_forward_causalmask_bias_dropout_dispatch {
           param.k_strides[2],
           param.v_strides[2],
           param.attn_bias_strides[1],
-          0, // nhead_randval
+          0, // nhead_randva
           param.lse_strides[1], // nhead_stride_lse
           param.out_strides[2],
           param.q_strides[0], // q, k, v, bias, randval, lse, out tensor
@@ -202,8 +162,6 @@ struct batched_forward_causalmask_bias_dropout_dispatch {
                                   : -1, // window_left_size
           (param.custom_mask_type == 0) ? -1 : 0, // window_right_size
           param.custom_mask_type,
-          1.0f, // descale_qk, not used
-          1.0f, // descale_sv, not used
           param.dropout_prob, // dropout ratio
           false, // is_store_randval
           {param.philox_seed, param.philox_offset});
@@ -212,15 +170,12 @@ struct batched_forward_causalmask_bias_dropout_dispatch {
     dim3 kGridSize =
         FmhaFwdKernel::GridSize(param.B, param.Hq, param.M, param.Kv);
     constexpr dim3 kBlockSize = FmhaFwdKernel::BlockSize();
-    constexpr ck::index_t kBlockPerCu = FmhaFwdKernel::kBlockPerCu;
-
-    (void)launch_kernel<kBlockSize.x, kBlockPerCu>(
-        StreamConfig{stream, false},
-        FmhaFwdKernel{},
-        kGridSize,
-        kBlockSize,
-        0,
-        kargs);
+    constexpr ck_tile::index_t kBlockPerCu = FmhaFwdKernel::kBlockPerCu;
+
+    (void)ck_tile::launch_kernel(
+        ck_tile::stream_config{stream, false},
+        ck_tile::make_kernel<kBlockSize.x, kBlockPerCu>(
+            FmhaFwdKernel{}, kGridSize, kBlockSize, 0, kargs));
   };
 };
 
@@ -229,7 +184,7 @@ template <
     bool kHasCausalMask,
     bool kHasBias,
     bool kHasDropout,
-    ck::index_t MaxK>
+    ck_tile::index_t MaxK>
 void run_batched_forward_causalmask_bias_dropout_dispatch(
     BatchedForwardParams& param,
     hipStream_t stream) {
diff --git a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_forward_bp16.cpp b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_forward_bf16.cpp
similarity index 73%
rename from xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_forward_bp16.cpp
rename to xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_forward_bf16.cpp
index 80ba53eb4..e27552d3e 100644
--- a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_forward_bp16.cpp
+++ b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_forward_bf16.cpp
@@ -4,8 +4,7 @@
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
  */
-#include <ck/ck.hpp>
-#include <ck/utility/data_type.hpp>
+#include <ck_tile/core.hpp>
 #include <stdexcept>
 
 #include "ck_tiled_bool_switch.h"
@@ -13,93 +12,93 @@
 #include "ck_tiled_headdim_switch.h"
 
 // clang-format off
-extern template void run_batched_forward_causalmask_bias_dropout_dispatch<ck::bhalf_t, false, true, true, 32>(
+extern template void run_batched_forward_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, false, true, true, 32>(
     BatchedForwardParams& param, hipStream_t stream);
-extern template void run_batched_forward_causalmask_bias_dropout_dispatch<ck::bhalf_t, false, false, true, 32>(
+extern template void run_batched_forward_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, false, false, true, 32>(
     BatchedForwardParams& param, hipStream_t stream);
-extern template void run_batched_forward_causalmask_bias_dropout_dispatch<ck::bhalf_t, true, true, true, 32>(
+extern template void run_batched_forward_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, true, true, true, 32>(
     BatchedForwardParams& param, hipStream_t stream);
-extern template void run_batched_forward_causalmask_bias_dropout_dispatch<ck::bhalf_t, true, false, true, 32>(
+extern template void run_batched_forward_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, true, false, true, 32>(
     BatchedForwardParams& param, hipStream_t stream);
 
-extern template void run_batched_forward_causalmask_bias_dropout_dispatch<ck::bhalf_t, false, true, false, 32>(
+extern template void run_batched_forward_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, false, true, false, 32>(
     BatchedForwardParams& param, hipStream_t stream);
-extern template void run_batched_forward_causalmask_bias_dropout_dispatch<ck::bhalf_t, false, false, false, 32>(
+extern template void run_batched_forward_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, false, false, false, 32>(
     BatchedForwardParams& param, hipStream_t stream);
-extern template void run_batched_forward_causalmask_bias_dropout_dispatch<ck::bhalf_t, true, true, false, 32>(
+extern template void run_batched_forward_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, true, true, false, 32>(
     BatchedForwardParams& param, hipStream_t stream);
-extern template void run_batched_forward_causalmask_bias_dropout_dispatch<ck::bhalf_t, true, false, false, 32>(
+extern template void run_batched_forward_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, true, false, false, 32>(
     BatchedForwardParams& param, hipStream_t stream);
 
-extern template void run_batched_forward_causalmask_bias_dropout_dispatch<ck::bhalf_t, false, true, true, 64>(
+extern template void run_batched_forward_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, false, true, true, 64>(
     BatchedForwardParams& param, hipStream_t stream);
-extern template void run_batched_forward_causalmask_bias_dropout_dispatch<ck::bhalf_t, false, false, true, 64>(
+extern template void run_batched_forward_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, false, false, true, 64>(
     BatchedForwardParams& param, hipStream_t stream);
-extern template void run_batched_forward_causalmask_bias_dropout_dispatch<ck::bhalf_t, true, true, true, 64>(
+extern template void run_batched_forward_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, true, true, true, 64>(
     BatchedForwardParams& param, hipStream_t stream);
-extern template void run_batched_forward_causalmask_bias_dropout_dispatch<ck::bhalf_t, true, false, true, 64>(
+extern template void run_batched_forward_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, true, false, true, 64>(
     BatchedForwardParams& param, hipStream_t stream);
 
-extern template void run_batched_forward_causalmask_bias_dropout_dispatch<ck::bhalf_t, false, true, false, 64>(
+extern template void run_batched_forward_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, false, true, false, 64>(
     BatchedForwardParams& param, hipStream_t stream);
-extern template void run_batched_forward_causalmask_bias_dropout_dispatch<ck::bhalf_t, false, false, false, 64>(
+extern template void run_batched_forward_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, false, false, false, 64>(
     BatchedForwardParams& param, hipStream_t stream);
-extern template void run_batched_forward_causalmask_bias_dropout_dispatch<ck::bhalf_t, true, true, false, 64>(
+extern template void run_batched_forward_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, true, true, false, 64>(
     BatchedForwardParams& param, hipStream_t stream);
-extern template void run_batched_forward_causalmask_bias_dropout_dispatch<ck::bhalf_t, true, false, false, 64>(
+extern template void run_batched_forward_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, true, false, false, 64>(
     BatchedForwardParams& param, hipStream_t stream);
 
-extern template void run_batched_forward_causalmask_bias_dropout_dispatch<ck::bhalf_t, false, true, true, 128>(
+extern template void run_batched_forward_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, false, true, true, 128>(
     BatchedForwardParams& param, hipStream_t stream);
-extern template void run_batched_forward_causalmask_bias_dropout_dispatch<ck::bhalf_t, false, false, true, 128>(
+extern template void run_batched_forward_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, false, false, true, 128>(
     BatchedForwardParams& param, hipStream_t stream);
-extern template void run_batched_forward_causalmask_bias_dropout_dispatch<ck::bhalf_t, true, true, true, 128>(
+extern template void run_batched_forward_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, true, true, true, 128>(
     BatchedForwardParams& param, hipStream_t stream);
-extern template void run_batched_forward_causalmask_bias_dropout_dispatch<ck::bhalf_t, true, false, true, 128>(
+extern template void run_batched_forward_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, true, false, true, 128>(
     BatchedForwardParams& param, hipStream_t stream);
 
-extern template void run_batched_forward_causalmask_bias_dropout_dispatch<ck::bhalf_t, false, true, false, 128>(
+extern template void run_batched_forward_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, false, true, false, 128>(
     BatchedForwardParams& param, hipStream_t stream);
-extern template void run_batched_forward_causalmask_bias_dropout_dispatch<ck::bhalf_t, false, false, false, 128>(
+extern template void run_batched_forward_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, false, false, false, 128>(
     BatchedForwardParams& param, hipStream_t stream);
-extern template void run_batched_forward_causalmask_bias_dropout_dispatch<ck::bhalf_t, true, true, false, 128>(
+extern template void run_batched_forward_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, true, true, false, 128>(
     BatchedForwardParams& param, hipStream_t stream);
-extern template void run_batched_forward_causalmask_bias_dropout_dispatch<ck::bhalf_t, true, false, false, 128>(
+extern template void run_batched_forward_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, true, false, false, 128>(
     BatchedForwardParams& param, hipStream_t stream);
 
-extern template void run_batched_forward_causalmask_bias_dropout_dispatch<ck::bhalf_t, false, true, true, 256>(
+extern template void run_batched_forward_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, false, true, true, 256>(
     BatchedForwardParams& param, hipStream_t stream);
-extern template void run_batched_forward_causalmask_bias_dropout_dispatch<ck::bhalf_t, false, false, true, 256>(
+extern template void run_batched_forward_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, false, false, true, 256>(
     BatchedForwardParams& param, hipStream_t stream);
-extern template void run_batched_forward_causalmask_bias_dropout_dispatch<ck::bhalf_t, true, true, true, 256>(
+extern template void run_batched_forward_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, true, true, true, 256>(
     BatchedForwardParams& param, hipStream_t stream);
-extern template void run_batched_forward_causalmask_bias_dropout_dispatch<ck::bhalf_t, true, false, true, 256>(
+extern template void run_batched_forward_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, true, false, true, 256>(
     BatchedForwardParams& param, hipStream_t stream);
 
-extern template void run_batched_forward_causalmask_bias_dropout_dispatch<ck::bhalf_t, false, true, false, 256>(
+extern template void run_batched_forward_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, false, true, false, 256>(
     BatchedForwardParams& param, hipStream_t stream);
-extern template void run_batched_forward_causalmask_bias_dropout_dispatch<ck::bhalf_t, false, false, false, 256>(
+extern template void run_batched_forward_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, false, false, false, 256>(
     BatchedForwardParams& param, hipStream_t stream);
-extern template void run_batched_forward_causalmask_bias_dropout_dispatch<ck::bhalf_t, true, true, false, 256>(
+extern template void run_batched_forward_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, true, true, false, 256>(
     BatchedForwardParams& param, hipStream_t stream);
-extern template void run_batched_forward_causalmask_bias_dropout_dispatch<ck::bhalf_t, true, false, false, 256>(
+extern template void run_batched_forward_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, true, false, false, 256>(
     BatchedForwardParams& param, hipStream_t stream);
 // clang-format on
 
-void batched_forward_bp16(BatchedForwardParams& param, hipStream_t stream) {
+void batched_forward_bf16(BatchedForwardParams& param, hipStream_t stream) {
   const bool has_dropout = (param.dropout_prob > 0.0f);
   BOOL_SWITCH_2(param.has_attn_bias, kHasBias, has_dropout, kHasDropout, [&] {
     FMHA_FWD_HEADDIM_SWITCH(param.K, param.Kv, MaxK, [&] {
       if (param.custom_mask_type == 0)
         run_batched_forward_causalmask_bias_dropout_dispatch<
-            ck::bhalf_t,
+            ck_tile::bf16_t,
             false,
             kHasBias,
             kHasDropout,
             MaxK>(param, stream);
       else if (param.custom_mask_type == 1 || param.custom_mask_type == 2)
         run_batched_forward_causalmask_bias_dropout_dispatch<
-            ck::bhalf_t,
+            ck_tile::bf16_t,
             true,
             kHasBias,
             kHasDropout,
diff --git a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_forward_fp16.cpp b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_forward_fp16.cpp
index 450a70de2..a65f6a2a2 100644
--- a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_forward_fp16.cpp
+++ b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_forward_fp16.cpp
@@ -4,8 +4,7 @@
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
  */
-#include <ck/ck.hpp>
-#include <ck/utility/data_type.hpp>
+#include <ck_tile/core.hpp>
 #include <stdexcept>
 
 #include "ck_tiled_bool_switch.h"
@@ -13,76 +12,76 @@
 #include "ck_tiled_headdim_switch.h"
 
 // clang-format off
-extern template void run_batched_forward_causalmask_bias_dropout_dispatch<ck::half_t, false, true, true, 32>(
+extern template void run_batched_forward_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, false, true, true, 32>(
     BatchedForwardParams& param, hipStream_t stream);
-extern template void run_batched_forward_causalmask_bias_dropout_dispatch<ck::half_t, false, false, true, 32>(
+extern template void run_batched_forward_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, false, false, true, 32>(
     BatchedForwardParams& param, hipStream_t stream);
-extern template void run_batched_forward_causalmask_bias_dropout_dispatch<ck::half_t, true, true, true, 32>(
+extern template void run_batched_forward_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, true, true, true, 32>(
     BatchedForwardParams& param, hipStream_t stream);
-extern template void run_batched_forward_causalmask_bias_dropout_dispatch<ck::half_t, true, false, true, 32>(
+extern template void run_batched_forward_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, true, false, true, 32>(
     BatchedForwardParams& param, hipStream_t stream);
 
-extern template void run_batched_forward_causalmask_bias_dropout_dispatch<ck::half_t, false, true, false, 32>(
+extern template void run_batched_forward_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, false, true, false, 32>(
     BatchedForwardParams& param, hipStream_t stream);
-extern template void run_batched_forward_causalmask_bias_dropout_dispatch<ck::half_t, false, false, false, 32>(
+extern template void run_batched_forward_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, false, false, false, 32>(
     BatchedForwardParams& param, hipStream_t stream);
-extern template void run_batched_forward_causalmask_bias_dropout_dispatch<ck::half_t, true, true, false, 32>(
+extern template void run_batched_forward_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, true, true, false, 32>(
     BatchedForwardParams& param, hipStream_t stream);
-extern template void run_batched_forward_causalmask_bias_dropout_dispatch<ck::half_t, true, false, false, 32>(
+extern template void run_batched_forward_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, true, false, false, 32>(
     BatchedForwardParams& param, hipStream_t stream);
 
-extern template void run_batched_forward_causalmask_bias_dropout_dispatch<ck::half_t, false, true, true, 64>(
+extern template void run_batched_forward_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, false, true, true, 64>(
     BatchedForwardParams& param, hipStream_t stream);
-extern template void run_batched_forward_causalmask_bias_dropout_dispatch<ck::half_t, false, false, true, 64>(
+extern template void run_batched_forward_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, false, false, true, 64>(
     BatchedForwardParams& param, hipStream_t stream);
-extern template void run_batched_forward_causalmask_bias_dropout_dispatch<ck::half_t, true, true, true, 64>(
+extern template void run_batched_forward_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, true, true, true, 64>(
     BatchedForwardParams& param, hipStream_t stream);
-extern template void run_batched_forward_causalmask_bias_dropout_dispatch<ck::half_t, true, false, true, 64>(
+extern template void run_batched_forward_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, true, false, true, 64>(
     BatchedForwardParams& param, hipStream_t stream);
 
-extern template void run_batched_forward_causalmask_bias_dropout_dispatch<ck::half_t, false, true, false, 64>(
+extern template void run_batched_forward_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, false, true, false, 64>(
     BatchedForwardParams& param, hipStream_t stream);
-extern template void run_batched_forward_causalmask_bias_dropout_dispatch<ck::half_t, false, false, false, 64>(
+extern template void run_batched_forward_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, false, false, false, 64>(
     BatchedForwardParams& param, hipStream_t stream);
-extern template void run_batched_forward_causalmask_bias_dropout_dispatch<ck::half_t, true, true, false, 64>(
+extern template void run_batched_forward_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, true, true, false, 64>(
     BatchedForwardParams& param, hipStream_t stream);
-extern template void run_batched_forward_causalmask_bias_dropout_dispatch<ck::half_t, true, false, false, 64>(
+extern template void run_batched_forward_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, true, false, false, 64>(
     BatchedForwardParams& param, hipStream_t stream);
 
-extern template void run_batched_forward_causalmask_bias_dropout_dispatch<ck::half_t, false, true, true, 128>(
+extern template void run_batched_forward_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, false, true, true, 128>(
     BatchedForwardParams& param, hipStream_t stream);
-extern template void run_batched_forward_causalmask_bias_dropout_dispatch<ck::half_t, false, false, true, 128>(
+extern template void run_batched_forward_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, false, false, true, 128>(
     BatchedForwardParams& param, hipStream_t stream);
-extern template void run_batched_forward_causalmask_bias_dropout_dispatch<ck::half_t, true, true, true, 128>(
+extern template void run_batched_forward_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, true, true, true, 128>(
     BatchedForwardParams& param, hipStream_t stream);
-extern template void run_batched_forward_causalmask_bias_dropout_dispatch<ck::half_t, true, false, true, 128>(
+extern template void run_batched_forward_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, true, false, true, 128>(
     BatchedForwardParams& param, hipStream_t stream);
 
-extern template void run_batched_forward_causalmask_bias_dropout_dispatch<ck::half_t, false, true, false, 128>(
+extern template void run_batched_forward_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, false, true, false, 128>(
     BatchedForwardParams& param, hipStream_t stream);
-extern template void run_batched_forward_causalmask_bias_dropout_dispatch<ck::half_t, false, false, false, 128>(
+extern template void run_batched_forward_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, false, false, false, 128>(
     BatchedForwardParams& param, hipStream_t stream);
-extern template void run_batched_forward_causalmask_bias_dropout_dispatch<ck::half_t, true, true, false, 128>(
+extern template void run_batched_forward_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, true, true, false, 128>(
     BatchedForwardParams& param, hipStream_t stream);
-extern template void run_batched_forward_causalmask_bias_dropout_dispatch<ck::half_t, true, false, false, 128>(
+extern template void run_batched_forward_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, true, false, false, 128>(
     BatchedForwardParams& param, hipStream_t stream);
 
-extern template void run_batched_forward_causalmask_bias_dropout_dispatch<ck::half_t, false, true, true, 256>(
+extern template void run_batched_forward_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, false, true, true, 256>(
     BatchedForwardParams& param, hipStream_t stream);
-extern template void run_batched_forward_causalmask_bias_dropout_dispatch<ck::half_t, false, false, true, 256>(
+extern template void run_batched_forward_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, false, false, true, 256>(
     BatchedForwardParams& param, hipStream_t stream);
-extern template void run_batched_forward_causalmask_bias_dropout_dispatch<ck::half_t, true, true, true, 256>(
+extern template void run_batched_forward_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, true, true, true, 256>(
     BatchedForwardParams& param, hipStream_t stream);
-extern template void run_batched_forward_causalmask_bias_dropout_dispatch<ck::half_t, true, false, true, 256>(
+extern template void run_batched_forward_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, true, false, true, 256>(
     BatchedForwardParams& param, hipStream_t stream);
 
-extern template void run_batched_forward_causalmask_bias_dropout_dispatch<ck::half_t, false, true, false, 256>(
+extern template void run_batched_forward_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, false, true, false, 256>(
     BatchedForwardParams& param, hipStream_t stream);
-extern template void run_batched_forward_causalmask_bias_dropout_dispatch<ck::half_t, false, false, false, 256>(
+extern template void run_batched_forward_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, false, false, false, 256>(
     BatchedForwardParams& param, hipStream_t stream);
-extern template void run_batched_forward_causalmask_bias_dropout_dispatch<ck::half_t, true, true, false, 256>(
+extern template void run_batched_forward_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, true, true, false, 256>(
     BatchedForwardParams& param, hipStream_t stream);
-extern template void run_batched_forward_causalmask_bias_dropout_dispatch<ck::half_t, true, false, false, 256>(
+extern template void run_batched_forward_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, true, false, false, 256>(
     BatchedForwardParams& param, hipStream_t stream);
 // clang-format on
 
@@ -92,14 +91,14 @@ void batched_forward_fp16(BatchedForwardParams& param, hipStream_t stream) {
     FMHA_FWD_HEADDIM_SWITCH(param.K, param.Kv, MaxK, [&] {
       if (param.custom_mask_type == 0)
         run_batched_forward_causalmask_bias_dropout_dispatch<
-            ck::half_t,
+            ck_tile::fp16_t,
             false,
             kHasBias,
             kHasDropout,
             MaxK>(param, stream);
       else if (param.custom_mask_type == 1 || param.custom_mask_type == 2)
         run_batched_forward_causalmask_bias_dropout_dispatch<
-            ck::half_t,
+            ck_tile::fp16_t,
             true,
             kHasBias,
             kHasDropout,
diff --git a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_infer.h b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_infer.h
index acd967f14..05d654dc3 100644
--- a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_infer.h
+++ b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_infer.h
@@ -6,54 +6,40 @@
  */
 #pragma once
 
-#include <ck/host_utility/device_prop.hpp>
-#include <ck/host_utility/kernel_launch.hpp>
-#include <ck/tensor/tensor_view.hpp>
-#include <ck/tensor_description/cluster_descriptor.hpp>
-#include <ck/tensor_description/tensor_descriptor_helper.hpp>
-#include <ck/utility/common_header.hpp>
-
-#include <ck/tile_program/block_tile/block_masking.hpp>
-#include <ck/tile_program/block_tile_pipeline/block_fmha_pipeline_problem.hpp>
-#include <ck/tile_program/block_tile_pipeline/block_fmha_pipeline_qr_ks_vs.hpp>
-#include <ck/tile_program/block_tile_pipeline/block_fmha_pipeline_qr_ks_vs_async.hpp>
-#include <ck/tile_program/tile/tile_fmha_shape.hpp>
-#include <ck/tile_program/tile/tile_fmha_traits.hpp>
+#include <ck_tile/core/numeric/integer.hpp>
+#include <ck_tile/host.hpp>
+#include <ck_tile/ops/epilogue.hpp>
+#include <ck_tile/ops/fmha.hpp>
 
 #include "ck_tiled_bool_switch.h"
 #include "ck_tiled_fmha_fwd_setting.h"
 #include "ck_tiled_fmha_params.h"
 #include "ck_tiled_headdim_switch.h"
 
-#include "fmha_fwd_epilogue.hpp"
-#include "fmha_fwd_kernel.hpp"
-#include "fmha_fwd_tile_partitioner.hpp"
-
 template <
     typename ScalarType,
     bool kHasCausalMask,
     bool kHasBias,
     bool kHasDropout,
-    ck::index_t MaxK>
+    ck_tile::index_t MaxK>
 struct batched_infer_causalmask_bias_dropout_dispatch {
   template <typename FmhaTraits, typename FmhaMask>
-  using FmhaPipelineProblemTemp =
-      ck::tile_program::block::BlockFmhaPipelineProblem<
-          typename FmhaFwdTypeConfig<ScalarType>::QDataType,
-          typename FmhaFwdTypeConfig<ScalarType>::KDataType,
-          typename FmhaFwdTypeConfig<ScalarType>::VDataType,
-          typename FmhaFwdTypeConfig<ScalarType>::SaccDataType,
-          typename FmhaFwdTypeConfig<ScalarType>::SMPLComputeDataType,
-          typename FmhaFwdTypeConfig<ScalarType>::BiasDataType,
-          typename FmhaFwdTypeConfig<ScalarType>::RandValOutputDataType,
-          typename FmhaFwdTypeConfig<ScalarType>::LSEDataType,
-          typename FmhaFwdTypeConfig<ScalarType>::PDataType,
-          typename FmhaFwdTypeConfig<ScalarType>::OaccDataType,
-          typename FmhaFwdTypeConfig<ScalarType>::ODataType,
-          FmhaFwdShape<MaxK>,
-          false, // kIsGroupMode
-          FmhaMask,
-          FmhaTraits>;
+  using FmhaPipelineProblemTemp = ck_tile::BlockFmhaPipelineProblem<
+      typename FmhaFwdTypeConfig<ScalarType>::QDataType,
+      typename FmhaFwdTypeConfig<ScalarType>::KDataType,
+      typename FmhaFwdTypeConfig<ScalarType>::VDataType,
+      typename FmhaFwdTypeConfig<ScalarType>::SaccDataType,
+      typename FmhaFwdTypeConfig<ScalarType>::SMPLComputeDataType,
+      typename FmhaFwdTypeConfig<ScalarType>::BiasDataType,
+      typename FmhaFwdTypeConfig<ScalarType>::RandValOutputDataType,
+      typename FmhaFwdTypeConfig<ScalarType>::LSEDataType,
+      typename FmhaFwdTypeConfig<ScalarType>::PDataType,
+      typename FmhaFwdTypeConfig<ScalarType>::OaccDataType,
+      typename FmhaFwdTypeConfig<ScalarType>::ODataType,
+      FmhaFwdShape<MaxK>,
+      false, // kIsGroupMode
+      FmhaMask,
+      FmhaTraits>;
 
   static void Run(BatchedForwardParams& param, hipStream_t stream) {
     const bool has_local_attention = (param.window_size > 0) ? true : false;
@@ -61,14 +47,17 @@ struct batched_infer_causalmask_bias_dropout_dispatch {
     BOOL_SWITCH(has_local_attention, USE_LOCAL_ATTENTION, [&] {
       constexpr bool has_masking = kHasCausalMask || USE_LOCAL_ATTENTION;
 
-      using FmhaMask =
-          ck::tile_program::block::SimplifiedGenericAttentionMask<has_masking>;
+      using FmhaMask = ck_tile::SimplifiedGenericAttentionMask<has_masking>;
 
       using FmhaShape = FmhaFwdShape<MaxK>;
-      using FmhaTilePartitioner = FmhaFwdTilePartitioner<FmhaShape>;
-      constexpr ck::index_t occupancy =
+      using FmhaTilePartitioner = ck_tile::FmhaFwdTilePartitioner<FmhaShape>;
+      constexpr ck_tile::index_t occupancy =
           (MaxK == 64) ? 3 : ((MaxK == 256) ? 1 : 2);
 
+      constexpr auto kBiasEnum = kHasBias
+          ? ck_tile::BlockAttentionBiasEnum::ELEMENTWISE_BIAS
+          : ck_tile::BlockAttentionBiasEnum::NO_BIAS;
+
       const bool pad_seqlen_q = !(param.M % FmhaShape::kM0 == 0);
       const bool pad_seqlen_k =
           (param.N == 0) || !(param.N % FmhaShape::kN0 == 0);
@@ -91,31 +80,32 @@ struct batched_infer_causalmask_bias_dropout_dispatch {
             pad_headdim,
             kPadHeadDim,
             [&] {
-              using FmhaTraits = ck::tile_program::TileFmhaTraits<
+              using FmhaTraits = ck_tile::TileFmhaTraits<
                   kPadSeqLenQ,
                   kPadSeqLenK,
                   kPadHeadDim, // kPadHeadDimQ,
                   kPadHeadDim, // kPadHeadDimV,
-                  kHasBias,
+                  kBiasEnum,
                   false, // kHasBiasGrad place-holder
                   false, // kStoreLSE
                   kHasDropout,
+                  false, // kDoFp8StaticQuant place-holder
                   occupancy>;
 
               using FmhaPipelineProblem =
                   FmhaPipelineProblemTemp<FmhaTraits, FmhaMask>;
 
               using FmhaPipeline =
-                  ck::tile_program::block::BlockFmhaPipelineQRKSVS<
-                      FmhaPipelineProblem>;
+                  ck_tile::BlockFmhaPipelineQRKSVS<FmhaPipelineProblem>;
 
-              using FmhaEpilogue = FmhaFwdEpilogue<FmhaFwdEpilogueProblem<
-                  typename FmhaFwdTypeConfig<ScalarType>::OaccDataType,
-                  typename FmhaFwdTypeConfig<ScalarType>::ODataType,
-                  kPadSeqLenQ,
-                  kPadHeadDim>>;
+              using FmhaEpilogue =
+                  ck_tile::Default2DEpilogue<ck_tile::Default2DEpilogueProblem<
+                      typename FmhaFwdTypeConfig<ScalarType>::OaccDataType,
+                      typename FmhaFwdTypeConfig<ScalarType>::ODataType,
+                      kPadSeqLenQ,
+                      kPadHeadDim>>;
 
-              using FmhaKernel = FmhaFwdKernel<
+              using FmhaKernel = ck_tile::FmhaFwdKernel<
                   FmhaTilePartitioner,
                   FmhaPipeline,
                   FmhaEpilogue>;
@@ -124,31 +114,32 @@ struct batched_infer_causalmask_bias_dropout_dispatch {
             });
       } else {
         BOOL_SWITCH(pad_seqlen_k, kPadSeqLenK, [&] {
-          using FmhaTraits = ck::tile_program::TileFmhaTraits<
+          using FmhaTraits = ck_tile::TileFmhaTraits<
               true, // kPadSeqLenQ,
               kPadSeqLenK,
               true, // kPadHeadDimQ,
               true, // kPadHeadDimV,
-              kHasBias,
+              kBiasEnum,
               false, // kHasBiasGrad place-holder
               false, // kStoreLSE
               kHasDropout,
+              false, // kDoFp8StaticQuant place-holder
               occupancy>;
 
           using FmhaPipelineProblem =
               FmhaPipelineProblemTemp<FmhaTraits, FmhaMask>;
 
           using FmhaPipeline =
-              ck::tile_program::block::BlockFmhaPipelineQRKSVSAsync<
-                  FmhaPipelineProblem>;
+              ck_tile::BlockFmhaPipelineQRKSVSAsync<FmhaPipelineProblem>;
 
-          using FmhaEpilogue = FmhaFwdEpilogue<FmhaFwdEpilogueProblem<
-              typename FmhaFwdTypeConfig<ScalarType>::OaccDataType,
-              typename FmhaFwdTypeConfig<ScalarType>::ODataType,
-              true,
-              true>>;
+          using FmhaEpilogue =
+              ck_tile::Default2DEpilogue<ck_tile::Default2DEpilogueProblem<
+                  typename FmhaFwdTypeConfig<ScalarType>::OaccDataType,
+                  typename FmhaFwdTypeConfig<ScalarType>::ODataType,
+                  true,
+                  true>>;
 
-          using FmhaKernel =
+          using FmhaKernel = ck_tile::
               FmhaFwdKernel<FmhaTilePartitioner, FmhaPipeline, FmhaEpilogue>;
 
           RunWithKernel<FmhaKernel>(param, stream);
@@ -175,6 +166,8 @@ struct batched_infer_causalmask_bias_dropout_dispatch {
           param.Hq, // nhead_q
           param.Hq / param.Hkv, // nhead_ratio_qk
           param.scale,
+          1.0f, // scale_p
+          1.0f, // scale_o
           param.q_strides[1], // q, k, v, bias, randval, out tensor seq-dim
                               // stride
           param.k_strides[1],
@@ -202,8 +195,6 @@ struct batched_infer_causalmask_bias_dropout_dispatch {
                                   : -1, // window_left_size
           (param.custom_mask_type == 0) ? -1 : 0, // window_right_size
           param.custom_mask_type,
-          1.0f, // descale_qk, not used
-          1.0f, // descale_sv, not used
           param.dropout_prob, // dropout ratio
           false, // is_store_randval
           {param.philox_seed, param.philox_offset});
@@ -211,15 +202,12 @@ struct batched_infer_causalmask_bias_dropout_dispatch {
 
     dim3 kGridSize = FmhaKernel::GridSize(param.B, param.Hq, param.M, param.Kv);
     constexpr dim3 kBlockSize = FmhaKernel::BlockSize();
-    constexpr ck::index_t kBlockPerCu = FmhaKernel::kBlockPerCu;
-
-    (void)launch_kernel<kBlockSize.x, kBlockPerCu>(
-        StreamConfig{stream, false},
-        FmhaKernel{},
-        kGridSize,
-        kBlockSize,
-        0,
-        kargs);
+    constexpr ck_tile::index_t kBlockPerCu = FmhaKernel::kBlockPerCu;
+
+    (void)ck_tile::launch_kernel(
+        ck_tile::stream_config{stream, false},
+        ck_tile::make_kernel<kBlockSize.x, kBlockPerCu>(
+            FmhaKernel{}, kGridSize, kBlockSize, 0, kargs));
   };
 };
 
@@ -228,7 +216,7 @@ template <
     bool kHasCausalMask,
     bool kHasBias,
     bool kHasDropout,
-    ck::index_t MaxK>
+    ck_tile::index_t MaxK>
 void run_batched_infer_causalmask_bias_dropout_dispatch(
     BatchedForwardParams& param,
     hipStream_t stream) {
diff --git a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_infer_bp16.cpp b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_infer_bf16.cpp
similarity index 73%
rename from xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_infer_bp16.cpp
rename to xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_infer_bf16.cpp
index cf7bacbe4..b362a780f 100644
--- a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_infer_bp16.cpp
+++ b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_infer_bf16.cpp
@@ -4,101 +4,100 @@
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
  */
-#include <ck/ck.hpp>
-#include <ck/utility/data_type.hpp>
+#include <ck_tile/core.hpp>
 #include <stdexcept>
 
 #include "ck_tiled_bool_switch.h"
 #include "ck_tiled_fmha_batched_infer.h"
 
 // clang-format off
-extern template void run_batched_infer_causalmask_bias_dropout_dispatch<ck::bhalf_t, false, true, true, 32>(
+extern template void run_batched_infer_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, false, true, true, 32>(
     BatchedForwardParams& param, hipStream_t stream);
-extern template void run_batched_infer_causalmask_bias_dropout_dispatch<ck::bhalf_t, false, false, true, 32>(
+extern template void run_batched_infer_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, false, false, true, 32>(
     BatchedForwardParams& param, hipStream_t stream);
-extern template void run_batched_infer_causalmask_bias_dropout_dispatch<ck::bhalf_t, true, true, true, 32>(
+extern template void run_batched_infer_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, true, true, true, 32>(
     BatchedForwardParams& param, hipStream_t stream);
-extern template void run_batched_infer_causalmask_bias_dropout_dispatch<ck::bhalf_t, true, false, true, 32>(
+extern template void run_batched_infer_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, true, false, true, 32>(
     BatchedForwardParams& param, hipStream_t stream);
 
-extern template void run_batched_infer_causalmask_bias_dropout_dispatch<ck::bhalf_t, false, true, false, 32>(
+extern template void run_batched_infer_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, false, true, false, 32>(
     BatchedForwardParams& param, hipStream_t stream);
-extern template void run_batched_infer_causalmask_bias_dropout_dispatch<ck::bhalf_t, false, false, false, 32>(
+extern template void run_batched_infer_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, false, false, false, 32>(
     BatchedForwardParams& param, hipStream_t stream);
-extern template void run_batched_infer_causalmask_bias_dropout_dispatch<ck::bhalf_t, true, true, false, 32>(
+extern template void run_batched_infer_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, true, true, false, 32>(
     BatchedForwardParams& param, hipStream_t stream);
-extern template void run_batched_infer_causalmask_bias_dropout_dispatch<ck::bhalf_t, true, false, false, 32>(
+extern template void run_batched_infer_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, true, false, false, 32>(
     BatchedForwardParams& param, hipStream_t stream);
 
-extern template void run_batched_infer_causalmask_bias_dropout_dispatch<ck::bhalf_t, false, true, true, 64>(
+extern template void run_batched_infer_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, false, true, true, 64>(
     BatchedForwardParams& param, hipStream_t stream);
-extern template void run_batched_infer_causalmask_bias_dropout_dispatch<ck::bhalf_t, false, false, true, 64>(
+extern template void run_batched_infer_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, false, false, true, 64>(
     BatchedForwardParams& param, hipStream_t stream);
-extern template void run_batched_infer_causalmask_bias_dropout_dispatch<ck::bhalf_t, true, true, true, 64>(
+extern template void run_batched_infer_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, true, true, true, 64>(
     BatchedForwardParams& param, hipStream_t stream);
-extern template void run_batched_infer_causalmask_bias_dropout_dispatch<ck::bhalf_t, true, false, true, 64>(
+extern template void run_batched_infer_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, true, false, true, 64>(
     BatchedForwardParams& param, hipStream_t stream);
 
-extern template void run_batched_infer_causalmask_bias_dropout_dispatch<ck::bhalf_t, false, true, false, 64>(
+extern template void run_batched_infer_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, false, true, false, 64>(
     BatchedForwardParams& param, hipStream_t stream);
-extern template void run_batched_infer_causalmask_bias_dropout_dispatch<ck::bhalf_t, false, false, false, 64>(
+extern template void run_batched_infer_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, false, false, false, 64>(
     BatchedForwardParams& param, hipStream_t stream);
-extern template void run_batched_infer_causalmask_bias_dropout_dispatch<ck::bhalf_t, true, true, false, 64>(
+extern template void run_batched_infer_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, true, true, false, 64>(
     BatchedForwardParams& param, hipStream_t stream);
-extern template void run_batched_infer_causalmask_bias_dropout_dispatch<ck::bhalf_t, true, false, false, 64>(
+extern template void run_batched_infer_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, true, false, false, 64>(
     BatchedForwardParams& param, hipStream_t stream);
 
-extern template void run_batched_infer_causalmask_bias_dropout_dispatch<ck::bhalf_t, false, true, true, 128>(
+extern template void run_batched_infer_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, false, true, true, 128>(
     BatchedForwardParams& param, hipStream_t stream);
-extern template void run_batched_infer_causalmask_bias_dropout_dispatch<ck::bhalf_t, false, false, true, 128>(
+extern template void run_batched_infer_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, false, false, true, 128>(
     BatchedForwardParams& param, hipStream_t stream);
-extern template void run_batched_infer_causalmask_bias_dropout_dispatch<ck::bhalf_t, true, true, true, 128>(
+extern template void run_batched_infer_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, true, true, true, 128>(
     BatchedForwardParams& param, hipStream_t stream);
-extern template void run_batched_infer_causalmask_bias_dropout_dispatch<ck::bhalf_t, true, false, true, 128>(
+extern template void run_batched_infer_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, true, false, true, 128>(
     BatchedForwardParams& param, hipStream_t stream);
 
-extern template void run_batched_infer_causalmask_bias_dropout_dispatch<ck::bhalf_t, false, true, false, 128>(
+extern template void run_batched_infer_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, false, true, false, 128>(
     BatchedForwardParams& param, hipStream_t stream);
-extern template void run_batched_infer_causalmask_bias_dropout_dispatch<ck::bhalf_t, false, false, false, 128>(
+extern template void run_batched_infer_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, false, false, false, 128>(
     BatchedForwardParams& param, hipStream_t stream);
-extern template void run_batched_infer_causalmask_bias_dropout_dispatch<ck::bhalf_t, true, true, false, 128>(
+extern template void run_batched_infer_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, true, true, false, 128>(
     BatchedForwardParams& param, hipStream_t stream);
-extern template void run_batched_infer_causalmask_bias_dropout_dispatch<ck::bhalf_t, true, false, false, 128>(
+extern template void run_batched_infer_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, true, false, false, 128>(
     BatchedForwardParams& param, hipStream_t stream);
 
-extern template void run_batched_infer_causalmask_bias_dropout_dispatch<ck::bhalf_t, false, true, true, 256>(
+extern template void run_batched_infer_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, false, true, true, 256>(
     BatchedForwardParams& param, hipStream_t stream);
-extern template void run_batched_infer_causalmask_bias_dropout_dispatch<ck::bhalf_t, false, false, true, 256>(
+extern template void run_batched_infer_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, false, false, true, 256>(
     BatchedForwardParams& param, hipStream_t stream);
-extern template void run_batched_infer_causalmask_bias_dropout_dispatch<ck::bhalf_t, true, true, true, 256>(
+extern template void run_batched_infer_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, true, true, true, 256>(
     BatchedForwardParams& param, hipStream_t stream);
-extern template void run_batched_infer_causalmask_bias_dropout_dispatch<ck::bhalf_t, true, false, true, 256>(
+extern template void run_batched_infer_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, true, false, true, 256>(
     BatchedForwardParams& param, hipStream_t stream);
 
-extern template void run_batched_infer_causalmask_bias_dropout_dispatch<ck::bhalf_t, false, true, false, 256>(
+extern template void run_batched_infer_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, false, true, false, 256>(
     BatchedForwardParams& param, hipStream_t stream);
-extern template void run_batched_infer_causalmask_bias_dropout_dispatch<ck::bhalf_t, false, false, false, 256>(
+extern template void run_batched_infer_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, false, false, false, 256>(
     BatchedForwardParams& param, hipStream_t stream);
-extern template void run_batched_infer_causalmask_bias_dropout_dispatch<ck::bhalf_t, true, true, false, 256>(
+extern template void run_batched_infer_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, true, true, false, 256>(
     BatchedForwardParams& param, hipStream_t stream);
-extern template void run_batched_infer_causalmask_bias_dropout_dispatch<ck::bhalf_t, true, false, false, 256>(
+extern template void run_batched_infer_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, true, false, false, 256>(
     BatchedForwardParams& param, hipStream_t stream);
 // clang-format on
 
-void batched_infer_bp16(BatchedForwardParams& param, hipStream_t stream) {
+void batched_infer_bf16(BatchedForwardParams& param, hipStream_t stream) {
   const bool has_dropout = (param.dropout_prob > 0.0f);
   BOOL_SWITCH_2(param.has_attn_bias, kHasBias, has_dropout, kHasDropout, [&] {
     FMHA_FWD_HEADDIM_SWITCH(param.K, param.Kv, MaxK, [&] {
       if (param.custom_mask_type == 0)
         run_batched_infer_causalmask_bias_dropout_dispatch<
-            ck::bhalf_t,
+            ck_tile::bf16_t,
             false,
             kHasBias,
             kHasDropout,
             MaxK>(param, stream);
       else if (param.custom_mask_type == 1 || param.custom_mask_type == 2)
         run_batched_infer_causalmask_bias_dropout_dispatch<
-            ck::bhalf_t,
+            ck_tile::bf16_t,
             true,
             kHasBias,
             kHasDropout,
diff --git a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_infer_fp16.cpp b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_infer_fp16.cpp
index 533b86109..e55003c60 100644
--- a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_infer_fp16.cpp
+++ b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_infer_fp16.cpp
@@ -4,84 +4,83 @@
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
  */
-#include <ck/ck.hpp>
-#include <ck/utility/data_type.hpp>
+#include <ck_tile/core.hpp>
 #include <stdexcept>
 
 #include "ck_tiled_bool_switch.h"
 #include "ck_tiled_fmha_batched_infer.h"
 
 // clang-format off
-extern template void run_batched_infer_causalmask_bias_dropout_dispatch<ck::half_t, false, true, true, 32>(
+extern template void run_batched_infer_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, false, true, true, 32>(
     BatchedForwardParams& param, hipStream_t stream);
-extern template void run_batched_infer_causalmask_bias_dropout_dispatch<ck::half_t, false, false, true, 32>(
+extern template void run_batched_infer_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, false, false, true, 32>(
     BatchedForwardParams& param, hipStream_t stream);
-extern template void run_batched_infer_causalmask_bias_dropout_dispatch<ck::half_t, true, true, true, 32>(
+extern template void run_batched_infer_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, true, true, true, 32>(
     BatchedForwardParams& param, hipStream_t stream);
-extern template void run_batched_infer_causalmask_bias_dropout_dispatch<ck::half_t, true, false, true, 32>(
+extern template void run_batched_infer_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, true, false, true, 32>(
     BatchedForwardParams& param, hipStream_t stream);
 
-extern template void run_batched_infer_causalmask_bias_dropout_dispatch<ck::half_t, false, true, false, 32>(
+extern template void run_batched_infer_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, false, true, false, 32>(
     BatchedForwardParams& param, hipStream_t stream);
-extern template void run_batched_infer_causalmask_bias_dropout_dispatch<ck::half_t, false, false, false, 32>(
+extern template void run_batched_infer_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, false, false, false, 32>(
     BatchedForwardParams& param, hipStream_t stream);
-extern template void run_batched_infer_causalmask_bias_dropout_dispatch<ck::half_t, true, true, false, 32>(
+extern template void run_batched_infer_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, true, true, false, 32>(
     BatchedForwardParams& param, hipStream_t stream);
-extern template void run_batched_infer_causalmask_bias_dropout_dispatch<ck::half_t, true, false, false, 32>(
+extern template void run_batched_infer_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, true, false, false, 32>(
     BatchedForwardParams& param, hipStream_t stream);
 
-extern template void run_batched_infer_causalmask_bias_dropout_dispatch<ck::half_t, false, true, true, 64>(
+extern template void run_batched_infer_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, false, true, true, 64>(
     BatchedForwardParams& param, hipStream_t stream);
-extern template void run_batched_infer_causalmask_bias_dropout_dispatch<ck::half_t, false, false, true, 64>(
+extern template void run_batched_infer_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, false, false, true, 64>(
     BatchedForwardParams& param, hipStream_t stream);
-extern template void run_batched_infer_causalmask_bias_dropout_dispatch<ck::half_t, true, true, true, 64>(
+extern template void run_batched_infer_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, true, true, true, 64>(
     BatchedForwardParams& param, hipStream_t stream);
-extern template void run_batched_infer_causalmask_bias_dropout_dispatch<ck::half_t, true, false, true, 64>(
+extern template void run_batched_infer_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, true, false, true, 64>(
     BatchedForwardParams& param, hipStream_t stream);
 
-extern template void run_batched_infer_causalmask_bias_dropout_dispatch<ck::half_t, false, true, false, 64>(
+extern template void run_batched_infer_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, false, true, false, 64>(
     BatchedForwardParams& param, hipStream_t stream);
-extern template void run_batched_infer_causalmask_bias_dropout_dispatch<ck::half_t, false, false, false, 64>(
+extern template void run_batched_infer_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, false, false, false, 64>(
     BatchedForwardParams& param, hipStream_t stream);
-extern template void run_batched_infer_causalmask_bias_dropout_dispatch<ck::half_t, true, true, false, 64>(
+extern template void run_batched_infer_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, true, true, false, 64>(
     BatchedForwardParams& param, hipStream_t stream);
-extern template void run_batched_infer_causalmask_bias_dropout_dispatch<ck::half_t, true, false, false, 64>(
+extern template void run_batched_infer_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, true, false, false, 64>(
     BatchedForwardParams& param, hipStream_t stream);
 
-extern template void run_batched_infer_causalmask_bias_dropout_dispatch<ck::half_t, false, true, true, 128>(
+extern template void run_batched_infer_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, false, true, true, 128>(
     BatchedForwardParams& param, hipStream_t stream);
-extern template void run_batched_infer_causalmask_bias_dropout_dispatch<ck::half_t, false, false, true, 128>(
+extern template void run_batched_infer_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, false, false, true, 128>(
     BatchedForwardParams& param, hipStream_t stream);
-extern template void run_batched_infer_causalmask_bias_dropout_dispatch<ck::half_t, true, true, true, 128>(
+extern template void run_batched_infer_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, true, true, true, 128>(
     BatchedForwardParams& param, hipStream_t stream);
-extern template void run_batched_infer_causalmask_bias_dropout_dispatch<ck::half_t, true, false, true, 128>(
+extern template void run_batched_infer_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, true, false, true, 128>(
     BatchedForwardParams& param, hipStream_t stream);
 
-extern template void run_batched_infer_causalmask_bias_dropout_dispatch<ck::half_t, false, true, false, 128>(
+extern template void run_batched_infer_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, false, true, false, 128>(
     BatchedForwardParams& param, hipStream_t stream);
-extern template void run_batched_infer_causalmask_bias_dropout_dispatch<ck::half_t, false, false, false, 128>(
+extern template void run_batched_infer_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, false, false, false, 128>(
     BatchedForwardParams& param, hipStream_t stream);
-extern template void run_batched_infer_causalmask_bias_dropout_dispatch<ck::half_t, true, true, false, 128>(
+extern template void run_batched_infer_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, true, true, false, 128>(
     BatchedForwardParams& param, hipStream_t stream);
-extern template void run_batched_infer_causalmask_bias_dropout_dispatch<ck::half_t, true, false, false, 128>(
+extern template void run_batched_infer_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, true, false, false, 128>(
     BatchedForwardParams& param, hipStream_t stream);
 
-extern template void run_batched_infer_causalmask_bias_dropout_dispatch<ck::half_t, false, true, true, 256>(
+extern template void run_batched_infer_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, false, true, true, 256>(
     BatchedForwardParams& param, hipStream_t stream);
-extern template void run_batched_infer_causalmask_bias_dropout_dispatch<ck::half_t, false, false, true, 256>(
+extern template void run_batched_infer_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, false, false, true, 256>(
     BatchedForwardParams& param, hipStream_t stream);
-extern template void run_batched_infer_causalmask_bias_dropout_dispatch<ck::half_t, true, true, true, 256>(
+extern template void run_batched_infer_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, true, true, true, 256>(
     BatchedForwardParams& param, hipStream_t stream);
-extern template void run_batched_infer_causalmask_bias_dropout_dispatch<ck::half_t, true, false, true, 256>(
+extern template void run_batched_infer_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, true, false, true, 256>(
     BatchedForwardParams& param, hipStream_t stream);
 
-extern template void run_batched_infer_causalmask_bias_dropout_dispatch<ck::half_t, false, true, false, 256>(
+extern template void run_batched_infer_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, false, true, false, 256>(
     BatchedForwardParams& param, hipStream_t stream);
-extern template void run_batched_infer_causalmask_bias_dropout_dispatch<ck::half_t, false, false, false, 256>(
+extern template void run_batched_infer_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, false, false, false, 256>(
     BatchedForwardParams& param, hipStream_t stream);
-extern template void run_batched_infer_causalmask_bias_dropout_dispatch<ck::half_t, true, true, false, 256>(
+extern template void run_batched_infer_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, true, true, false, 256>(
     BatchedForwardParams& param, hipStream_t stream);
-extern template void run_batched_infer_causalmask_bias_dropout_dispatch<ck::half_t, true, false, false, 256>(
+extern template void run_batched_infer_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, true, false, false, 256>(
     BatchedForwardParams& param, hipStream_t stream);
 // clang-format on
 
@@ -91,14 +90,14 @@ void batched_infer_fp16(BatchedForwardParams& param, hipStream_t stream) {
     FMHA_FWD_HEADDIM_SWITCH(param.K, param.Kv, MaxK, [&] {
       if (param.custom_mask_type == 0)
         run_batched_infer_causalmask_bias_dropout_dispatch<
-            ck::half_t,
+            ck_tile::fp16_t,
             false,
             kHasBias,
             kHasDropout,
             MaxK>(param, stream);
       else if (param.custom_mask_type == 1 || param.custom_mask_type == 2)
         run_batched_infer_causalmask_bias_dropout_dispatch<
-            ck::half_t,
+            ck_tile::fp16_t,
             true,
             kHasBias,
             kHasDropout,
diff --git a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_bwd_setting.h b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_bwd_setting.h
index 910b25f8f..4ef24248a 100644
--- a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_bwd_setting.h
+++ b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_bwd_setting.h
@@ -6,87 +6,84 @@
  */
 #pragma once
 
-#include <ck/tile_program/block_tile_pipeline/block_fmha_bwd_dq_dk_dv_pipeline_ks_kts_vr.hpp>
-#include <ck/tile_program/block_tile_pipeline/block_fmha_bwd_dq_dk_dv_pipeline_ks_vr.hpp>
-#include <ck/tile_program/block_tile_pipeline/block_fmha_bwd_dq_dk_dv_pipeline_qs_ks_vr_dos.hpp>
-#include <ck/tile_program/block_tile_pipeline/block_fmha_bwd_pipeline_enum.hpp>
-#include <ck/tile_program/tile/tile_fmha_shape.hpp>
+#include <ck_tile/core.hpp>
+#include <ck_tile/ops/fmha.hpp>
 
 template <typename DataType>
 struct FmhaBwdTypeConfig;
 
 template <>
-struct FmhaBwdTypeConfig<ck::half_t> {
-  using QDataType = ck::half_t;
-  using KDataType = ck::half_t;
-  using VDataType = ck::half_t;
-  using GemmDataType = ck::half_t;
-  using BiasDataType = ck::half_t;
+struct FmhaBwdTypeConfig<ck_tile::fp16_t> {
+  using QDataType = ck_tile::fp16_t;
+  using KDataType = ck_tile::fp16_t;
+  using VDataType = ck_tile::fp16_t;
+  using GemmDataType = ck_tile::fp16_t;
+  using BiasDataType = ck_tile::fp16_t;
   using RandValOutputDataType = unsigned short;
   using LSEDataType = float;
   using AccDataType = float; // data type for gemm accumulation
   using DDataType = float;
-  using ODataType = ck::half_t;
-  using OGradDataType = ck::half_t;
-  using QGradDataType = ck::half_t;
-  using KGradDataType = ck::half_t;
-  using VGradDataType = ck::half_t;
-  using BiasGradDataType = ck::half_t;
+  using ODataType = ck_tile::fp16_t;
+  using OGradDataType = ck_tile::fp16_t;
+  using QGradDataType = ck_tile::fp16_t;
+  using KGradDataType = ck_tile::fp16_t;
+  using VGradDataType = ck_tile::fp16_t;
+  using BiasGradDataType = ck_tile::fp16_t;
 };
 
 template <>
-struct FmhaBwdTypeConfig<ck::bhalf_t> {
-  using QDataType = ck::bhalf_t;
-  using KDataType = ck::bhalf_t;
-  using VDataType = ck::bhalf_t;
-  using GemmDataType = ck::bhalf_t;
-  using BiasDataType = ck::bhalf_t;
+struct FmhaBwdTypeConfig<ck_tile::bf16_t> {
+  using QDataType = ck_tile::bf16_t;
+  using KDataType = ck_tile::bf16_t;
+  using VDataType = ck_tile::bf16_t;
+  using GemmDataType = ck_tile::bf16_t;
+  using BiasDataType = ck_tile::bf16_t;
   using RandValOutputDataType = unsigned short;
   using LSEDataType = float;
   using AccDataType = float; // data type for gemm accumulation
   using DDataType = float;
-  using ODataType = ck::bhalf_t;
-  using OGradDataType = ck::bhalf_t;
-  using QGradDataType = ck::bhalf_t;
-  using KGradDataType = ck::bhalf_t;
-  using VGradDataType = ck::bhalf_t;
-  using BiasGradDataType = ck::bhalf_t;
+  using ODataType = ck_tile::bf16_t;
+  using OGradDataType = ck_tile::bf16_t;
+  using QGradDataType = ck_tile::bf16_t;
+  using KGradDataType = ck_tile::bf16_t;
+  using VGradDataType = ck_tile::bf16_t;
+  using BiasGradDataType = ck_tile::bf16_t;
 };
 
-template <ck::index_t MaxK>
+template <ck_tile::index_t MaxK>
 struct FmhaBwdBlockTile;
 
 template <>
 struct FmhaBwdBlockTile<32> {
-  using type = ck::Sequence<128, 128, 32, 32, 32, 32, 32, 32, 32>;
-  using gemm02_warps = ck::Sequence<1, 4, 1>; // default for gemm0/gemm2
-  using gemm13_warps = ck::Sequence<4, 1, 1>; // default for gemm1/gemm3
-  using gemm4_warps = ck::Sequence<4, 1, 1>; // default for gemm4
+  using type = ck_tile::sequence<128, 128, 32, 32, 32, 32, 32, 32, 32>;
+  using gemm02_warps = ck_tile::sequence<1, 4, 1>; // default for gemm0/gemm2
+  using gemm13_warps = ck_tile::sequence<4, 1, 1>; // default for gemm1/gemm3
+  using gemm4_warps = ck_tile::sequence<4, 1, 1>; // default for gemm4
 };
 
 template <>
 struct FmhaBwdBlockTile<64> {
-  using type = ck::Sequence<64, 128, 32, 32, 32, 32, 32, 64, 64>;
-  using gemm02_warps = ck::Sequence<1, 4, 1>; // default for gemm0/gemm2
-  using gemm13_warps = ck::Sequence<4, 1, 1>; // default for gemm1/gemm3
-  using gemm4_warps = ck::Sequence<2, 2, 1>; // default for gemm4
+  using type = ck_tile::sequence<64, 128, 32, 32, 32, 32, 32, 64, 64>;
+  using gemm02_warps = ck_tile::sequence<1, 4, 1>; // default for gemm0/gemm2
+  using gemm13_warps = ck_tile::sequence<4, 1, 1>; // default for gemm1/gemm3
+  using gemm4_warps = ck_tile::sequence<2, 2, 1>; // default for gemm4
 };
 
 template <>
 struct FmhaBwdBlockTile<128> {
-  using type = ck::Sequence<64, 128, 32, 32, 32, 32, 32, 128, 128>;
-  using gemm02_warps = ck::Sequence<1, 4, 1>; // default for gemm0/gemm2
-  using gemm13_warps = ck::Sequence<4, 1, 1>; // default for gemm1/gemm3
-  using gemm4_warps = ck::Sequence<2, 2, 1>; // default for gemm4
+  using type = ck_tile::sequence<64, 128, 32, 32, 32, 32, 32, 128, 128>;
+  using gemm02_warps = ck_tile::sequence<1, 4, 1>; // default for gemm0/gemm2
+  using gemm13_warps = ck_tile::sequence<4, 1, 1>; // default for gemm1/gemm3
+  using gemm4_warps = ck_tile::sequence<2, 2, 1>; // default for gemm4
 };
 
-using FmhaBwdWarpTile = ck::Sequence<32, 32, 16>;
+using FmhaBwdWarpTile = ck_tile::sequence<32, 32, 16>;
 
-template <ck::index_t MaxK>
+template <ck_tile::index_t MaxK>
 struct FmhaBwdShape;
 
 template <>
-struct FmhaBwdShape<32> : ck::tile_program::TileFmhaBwdShape<
+struct FmhaBwdShape<32> : ck_tile::TileFmhaBwdShape<
                               typename FmhaBwdBlockTile<32>::type,
                               typename FmhaBwdBlockTile<32>::gemm02_warps,
                               FmhaBwdWarpTile,
@@ -100,7 +97,7 @@ struct FmhaBwdShape<32> : ck::tile_program::TileFmhaBwdShape<
                               FmhaBwdWarpTile> {};
 
 template <>
-struct FmhaBwdShape<64> : ck::tile_program::TileFmhaBwdShape<
+struct FmhaBwdShape<64> : ck_tile::TileFmhaBwdShape<
                               typename FmhaBwdBlockTile<64>::type,
                               typename FmhaBwdBlockTile<64>::gemm02_warps,
                               FmhaBwdWarpTile,
@@ -114,7 +111,7 @@ struct FmhaBwdShape<64> : ck::tile_program::TileFmhaBwdShape<
                               FmhaBwdWarpTile> {};
 
 template <>
-struct FmhaBwdShape<128> : ck::tile_program::TileFmhaBwdShape<
+struct FmhaBwdShape<128> : ck_tile::TileFmhaBwdShape<
                                typename FmhaBwdBlockTile<128>::type,
                                typename FmhaBwdBlockTile<128>::gemm02_warps,
                                FmhaBwdWarpTile,
@@ -127,46 +124,45 @@ struct FmhaBwdShape<128> : ck::tile_program::TileFmhaBwdShape<
                                typename FmhaBwdBlockTile<128>::gemm4_warps,
                                FmhaBwdWarpTile> {};
 
-template <ck::index_t MaxK>
+template <ck_tile::index_t MaxK>
 struct FmhaBwdPipelineEnumSelector;
 
 template <>
 struct FmhaBwdPipelineEnumSelector<32> {
-  static constexpr ck::BlockFmhaBwdPipelineEnum value =
-      ck::BlockFmhaBwdPipelineEnum::QSKSVROGradS;
+  static constexpr ck_tile::BlockFmhaBwdPipelineEnum value =
+      ck_tile::BlockFmhaBwdPipelineEnum::QSKSVROGradS;
 };
 
 template <>
 struct FmhaBwdPipelineEnumSelector<64> {
-  static constexpr ck::BlockFmhaBwdPipelineEnum value =
-      ck::BlockFmhaBwdPipelineEnum::KSKTSVR;
+  static constexpr ck_tile::BlockFmhaBwdPipelineEnum value =
+      ck_tile::BlockFmhaBwdPipelineEnum::KSKTSVR;
 };
 
 template <>
 struct FmhaBwdPipelineEnumSelector<128> {
-  static constexpr ck::BlockFmhaBwdPipelineEnum value =
-      ck::BlockFmhaBwdPipelineEnum::KSVR;
+  static constexpr ck_tile::BlockFmhaBwdPipelineEnum value =
+      ck_tile::BlockFmhaBwdPipelineEnum::KSVR;
 };
 
-template <ck::BlockFmhaBwdPipelineEnum value, typename problem>
+template <ck_tile::BlockFmhaBwdPipelineEnum value, typename problem>
 struct FmhaBwdPipelineMaker;
 
 template <typename problem>
 struct FmhaBwdPipelineMaker<
-    ck::BlockFmhaBwdPipelineEnum::QSKSVROGradS,
+    ck_tile::BlockFmhaBwdPipelineEnum::QSKSVROGradS,
     problem> {
-  using pipeline =
-      ck::tile_program::block::BlockFmhaBwdDQDKDVPipelineQSKSVROGradS<problem>;
+  using pipeline = ck_tile::BlockFmhaBwdDQDKDVPipelineQSKSVROGradS<problem>;
 };
 
 template <typename problem>
-struct FmhaBwdPipelineMaker<ck::BlockFmhaBwdPipelineEnum::KSKTSVR, problem> {
-  using pipeline =
-      ck::tile_program::block::BlockFmhaBwdDQDKDVPipelineKSKTSVR<problem>;
+struct FmhaBwdPipelineMaker<
+    ck_tile::BlockFmhaBwdPipelineEnum::KSKTSVR,
+    problem> {
+  using pipeline = ck_tile::BlockFmhaBwdDQDKDVPipelineKSKTSVR<problem>;
 };
 
 template <typename problem>
-struct FmhaBwdPipelineMaker<ck::BlockFmhaBwdPipelineEnum::KSVR, problem> {
-  using pipeline =
-      ck::tile_program::block::BlockFmhaBwdDQDKDVPipelineKSVR<problem>;
+struct FmhaBwdPipelineMaker<ck_tile::BlockFmhaBwdPipelineEnum::KSVR, problem> {
+  using pipeline = ck_tile::BlockFmhaBwdDQDKDVPipelineKSVR<problem>;
 };
diff --git a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_fwd_setting.h b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_fwd_setting.h
index 364226ebe..662703b7e 100644
--- a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_fwd_setting.h
+++ b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_fwd_setting.h
@@ -6,83 +6,84 @@
  */
 #pragma once
 
-#include <ck/tile_program/tile/tile_fmha_shape.hpp>
+#include <ck_tile/core.hpp>
+#include <ck_tile/ops/fmha.hpp>
 
 template <typename DataType>
 struct FmhaFwdTypeConfig;
 
 template <>
-struct FmhaFwdTypeConfig<ck::half_t> {
-  using QDataType = ck::half_t;
-  using KDataType = ck::half_t;
-  using VDataType = ck::half_t;
-  using BiasDataType = ck::half_t;
+struct FmhaFwdTypeConfig<ck_tile::fp16_t> {
+  using QDataType = ck_tile::fp16_t;
+  using KDataType = ck_tile::fp16_t;
+  using VDataType = ck_tile::fp16_t;
+  using BiasDataType = ck_tile::fp16_t;
   using RandValOutputDataType = unsigned short;
   using LSEDataType =
       float; // data type for lse(logsumexp L_j = max_j + log(l_j))
   using SaccDataType = float; // data type for first gemm accumulation
   using SMPLComputeDataType = float; // data type for reduction, softmax
-  using PDataType = ck::half_t; // data type for A matrix of second gemm
+  using PDataType = ck_tile::fp16_t; // data type for A matrix of second gemm
   using OaccDataType = float; // data type for second gemm accumulation
-  using ODataType = ck::half_t;
+  using ODataType = ck_tile::fp16_t;
 };
 
 template <>
-struct FmhaFwdTypeConfig<ck::bhalf_t> {
-  using QDataType = ck::bhalf_t;
-  using KDataType = ck::bhalf_t;
-  using VDataType = ck::bhalf_t;
-  using BiasDataType = ck::bhalf_t;
+struct FmhaFwdTypeConfig<ck_tile::bf16_t> {
+  using QDataType = ck_tile::bf16_t;
+  using KDataType = ck_tile::bf16_t;
+  using VDataType = ck_tile::bf16_t;
+  using BiasDataType = ck_tile::bf16_t;
   using RandValOutputDataType = unsigned short;
   using LSEDataType =
       float; // data type for lse(logsumexp L_j = max_j + log(l_j))
   using SaccDataType = float; // data type for first gemm accumulation
   using SMPLComputeDataType = float; // data type for reduction, softmax
-  using PDataType = ck::bhalf_t; // data type for A matrix of second gemm
+  using PDataType = ck_tile::bf16_t; // data type for A matrix of second gemm
   using OaccDataType = float; // data type for second gemm accumulation
-  using ODataType = ck::bhalf_t;
+  using ODataType = ck_tile::bf16_t;
 };
 
-template <ck::index_t MaxK>
+template <ck_tile::index_t MaxK>
 struct FmhaFwdBlockTile;
 
 template <>
 struct FmhaFwdBlockTile<32> {
-  using type = ck::Sequence<128, 64, 16, 32, 32, 32>;
-  using gemm0_warps = ck::Sequence<2, 1, 1>;
-  using gemm1_warps = ck::Sequence<2, 1, 1>;
+  using type = ck_tile::sequence<128, 64, 16, 32, 32, 32>;
+  using gemm0_warps = ck_tile::sequence<2, 1, 1>;
+  using gemm1_warps = ck_tile::sequence<2, 1, 1>;
 };
 
 template <>
 struct FmhaFwdBlockTile<64> {
-  using type = ck::Sequence<128, 64, 32, 64, 32, 64>;
-  using gemm0_warps = ck::Sequence<4, 1, 1>;
-  using gemm1_warps = ck::Sequence<4, 1, 1>;
+  using type = ck_tile::sequence<128, 64, 32, 64, 32, 64>;
+  using gemm0_warps = ck_tile::sequence<4, 1, 1>;
+  using gemm1_warps = ck_tile::sequence<4, 1, 1>;
 };
 
 template <>
 struct FmhaFwdBlockTile<128> {
-  using type = ck::Sequence<128, 128, 32, 128, 32, 128>;
-  using gemm0_warps = ck::Sequence<4, 1, 1>;
-  using gemm1_warps = ck::Sequence<4, 1, 1>;
+  using type = ck_tile::sequence<128, 128, 32, 128, 32, 128>;
+  using gemm0_warps = ck_tile::sequence<4, 1, 1>;
+  using gemm1_warps = ck_tile::sequence<4, 1, 1>;
 };
 
 template <>
 struct FmhaFwdBlockTile<256> {
-  using type = ck::Sequence<128, 128, 32, 256, 32, 256>;
-  using gemm0_warps = ck::Sequence<4, 1, 1>;
-  using gemm1_warps = ck::Sequence<4, 1, 1>;
+  using type = ck_tile::sequence<128, 128, 32, 256, 32, 256>;
+  using gemm0_warps = ck_tile::sequence<4, 1, 1>;
+  using gemm1_warps = ck_tile::sequence<4, 1, 1>;
 };
 
-using FmhaFwdWarpTile = ck::Sequence<32, 32, 16>;
+using FmhaFwdWarpTile = ck_tile::sequence<32, 32, 16>;
 
 static constexpr bool IsVLayoutRowMajor = true;
 
-template <ck::index_t MaxK>
+template <ck_tile::index_t MaxK>
 struct FmhaFwdShape;
 
 template <>
-struct FmhaFwdShape<32> : ck::tile_program::TileFmhaShape<
+struct FmhaFwdShape<32> : ck_tile::TileFmhaShape<
                               typename FmhaFwdBlockTile<32>::type,
                               typename FmhaFwdBlockTile<32>::gemm0_warps,
                               FmhaFwdWarpTile,
@@ -91,7 +92,7 @@ struct FmhaFwdShape<32> : ck::tile_program::TileFmhaShape<
                               IsVLayoutRowMajor> {};
 
 template <>
-struct FmhaFwdShape<64> : ck::tile_program::TileFmhaShape<
+struct FmhaFwdShape<64> : ck_tile::TileFmhaShape<
                               typename FmhaFwdBlockTile<64>::type,
                               typename FmhaFwdBlockTile<64>::gemm0_warps,
                               FmhaFwdWarpTile,
@@ -100,7 +101,7 @@ struct FmhaFwdShape<64> : ck::tile_program::TileFmhaShape<
                               IsVLayoutRowMajor> {};
 
 template <>
-struct FmhaFwdShape<128> : ck::tile_program::TileFmhaShape<
+struct FmhaFwdShape<128> : ck_tile::TileFmhaShape<
                                typename FmhaFwdBlockTile<128>::type,
                                typename FmhaFwdBlockTile<128>::gemm0_warps,
                                FmhaFwdWarpTile,
@@ -109,7 +110,7 @@ struct FmhaFwdShape<128> : ck::tile_program::TileFmhaShape<
                                IsVLayoutRowMajor> {};
 
 template <>
-struct FmhaFwdShape<256> : ck::tile_program::TileFmhaShape<
+struct FmhaFwdShape<256> : ck_tile::TileFmhaShape<
                                typename FmhaFwdBlockTile<256>::type,
                                typename FmhaFwdBlockTile<256>::gemm0_warps,
                                FmhaFwdWarpTile,
diff --git a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_backward.h b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_backward.h
index 881f07b52..b5038fdfe 100644
--- a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_backward.h
+++ b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_backward.h
@@ -6,81 +6,62 @@
  */
 #pragma once
 
-#include <ck/host_utility/device_prop.hpp>
-#include <ck/host_utility/kernel_launch.hpp>
-#include <ck/tensor/tensor_view.hpp>
-#include <ck/tensor_description/cluster_descriptor.hpp>
-#include <ck/tensor_description/tensor_descriptor_helper.hpp>
-#include <ck/utility/common_header.hpp>
-
-#include <ck/tile_program/block_tile/block_masking.hpp>
-#include <ck/tile_program/block_tile_pipeline/block_fmha_bwd_dot_do_o.hpp>
-#include <ck/tile_program/block_tile_pipeline/block_fmha_bwd_pipeline_problem.hpp>
-#include <ck/tile_program/tile/tile_fmha_shape.hpp>
-#include <ck/tile_program/tile/tile_fmha_traits.hpp>
+#include <ck_tile/core/numeric/integer.hpp>
+#include <ck_tile/host.hpp>
+#include <ck_tile/ops/epilogue.hpp>
+#include <ck_tile/ops/fmha.hpp>
 
 #include "ck_tiled_bool_switch.h"
 #include "ck_tiled_fmha_bwd_setting.h"
 #include "ck_tiled_fmha_params.h"
 
-#include "fmha_bwd_epilogue.hpp"
-#include "fmha_bwd_kernel.hpp"
-#include "fmha_bwd_tile_partitioner.hpp"
-
 template <
     typename ScalarType,
     bool kHasCausalMask,
     bool kHasBias,
     bool kHasBiasGrad,
     bool kHasDropout,
-    ck::index_t MaxK>
+    ck_tile::index_t MaxK>
 struct grouped_backward_causalmask_bias_dropout_dispatch {
-  using FmhaBwdEpilogue_ = FmhaBwdEpilogue<FmhaBwdEpilogueProblem<
+  template <typename FmhaTraits, typename FmhaMask>
+  using FmhaBwdPipelineProblemTemp = ck_tile::BlockFmhaBwdPipelineProblem<
+      typename FmhaBwdTypeConfig<ScalarType>::QDataType,
+      typename FmhaBwdTypeConfig<ScalarType>::KDataType,
+      typename FmhaBwdTypeConfig<ScalarType>::VDataType,
+      typename FmhaBwdTypeConfig<ScalarType>::GemmDataType,
+      typename FmhaBwdTypeConfig<ScalarType>::LSEDataType,
       typename FmhaBwdTypeConfig<ScalarType>::AccDataType,
+      typename FmhaBwdTypeConfig<ScalarType>::DDataType,
+      typename FmhaBwdTypeConfig<ScalarType>::BiasDataType,
+      typename FmhaBwdTypeConfig<ScalarType>::RandValOutputDataType,
+      typename FmhaBwdTypeConfig<ScalarType>::ODataType,
+      typename FmhaBwdTypeConfig<ScalarType>::OGradDataType,
+      typename FmhaBwdTypeConfig<ScalarType>::QGradDataType,
       typename FmhaBwdTypeConfig<ScalarType>::KGradDataType,
-      typename FmhaBwdTypeConfig<ScalarType>::VGradDataType>>;
-
-  template <typename FmhaTraits, typename FmhaMask>
-  using FmhaBwdPipelineProblemTemp =
-      ck::tile_program::block::BlockFmhaBwdPipelineProblem<
-          typename FmhaBwdTypeConfig<ScalarType>::QDataType,
-          typename FmhaBwdTypeConfig<ScalarType>::KDataType,
-          typename FmhaBwdTypeConfig<ScalarType>::VDataType,
-          typename FmhaBwdTypeConfig<ScalarType>::GemmDataType,
-          typename FmhaBwdTypeConfig<ScalarType>::LSEDataType,
-          typename FmhaBwdTypeConfig<ScalarType>::AccDataType,
-          typename FmhaBwdTypeConfig<ScalarType>::DDataType,
-          typename FmhaBwdTypeConfig<ScalarType>::BiasDataType,
-          typename FmhaBwdTypeConfig<ScalarType>::RandValOutputDataType,
-          typename FmhaBwdTypeConfig<ScalarType>::ODataType,
-          typename FmhaBwdTypeConfig<ScalarType>::OGradDataType,
-          typename FmhaBwdTypeConfig<ScalarType>::QGradDataType,
-          typename FmhaBwdTypeConfig<ScalarType>::KGradDataType,
-          typename FmhaBwdTypeConfig<ScalarType>::VGradDataType,
-          typename FmhaBwdTypeConfig<ScalarType>::BiasGradDataType,
-          FmhaBwdShape<MaxK>,
-          true, // kIsGroupMode
-          FmhaMask,
-          FmhaTraits>;
+      typename FmhaBwdTypeConfig<ScalarType>::VGradDataType,
+      typename FmhaBwdTypeConfig<ScalarType>::BiasGradDataType,
+      FmhaBwdShape<MaxK>,
+      true, // kIsGroupMode
+      FmhaMask,
+      FmhaTraits>;
 
   static void Run(GroupedBackwardParams& param, hipStream_t stream) {
     {
-      constexpr ck::index_t kBlockSize = 256;
+      constexpr ck_tile::index_t kBlockSize = 256;
       bool pad_seqlen_q = !(param.M % kBlockSize == 0);
       bool pad_headdim_v = !(param.Kv % FmhaBwdShape<MaxK>::kVHeaddim == 0);
 
       BOOL_SWITCH_2(
           pad_seqlen_q, kPadSeqLenQ, pad_headdim_v, kPadHeadDimV, [&] {
-            constexpr ck::index_t occupancy = 2;
+            constexpr ck_tile::index_t occupancy = 2;
 
-            using FmhaOGradDotOTraits_ =
-                ck::tile_program::TileFmhaBwdOGradDotOTraits<
-                    kPadSeqLenQ,
-                    kPadHeadDimV,
-                    occupancy>;
+            using FmhaOGradDotOTraits_ = ck_tile::TileFmhaBwdOGradDotOTraits<
+                kPadSeqLenQ,
+                kPadHeadDimV,
+                occupancy>;
 
             using FmhaBwdOGradDotOPipelineProblem =
-                ck::tile_program::block::BlockFmhaBwdOGradDotOPipelineProblem<
+                ck_tile::BlockFmhaBwdOGradDotOPipelineProblem<
                     typename FmhaBwdTypeConfig<ScalarType>::ODataType,
                     typename FmhaBwdTypeConfig<ScalarType>::OGradDataType,
                     typename FmhaBwdTypeConfig<ScalarType>::DDataType,
@@ -90,11 +71,11 @@ struct grouped_backward_causalmask_bias_dropout_dispatch {
                     FmhaOGradDotOTraits_>;
 
             using FmhaBwdOGradDotOPipeline_ =
-                typename ck::tile_program::block::BlockFmhaBwdOGradDotO<
+                typename ck_tile::BlockFmhaBwdOGradDotO<
                     FmhaBwdOGradDotOPipelineProblem>;
 
-            using FmhaBwdOGradDotOKernel_ = FmhaBwdOGradDotOKernel<
-                FmhaBwdOGradDotOTilePartitioner<kBlockSize>,
+            using FmhaBwdOGradDotOKernel_ = ck_tile::FmhaBwdOGradDotOKernel<
+                ck_tile::FmhaBwdOGradDotOTilePartitioner<kBlockSize>,
                 FmhaBwdOGradDotOPipeline_>;
 
             RunWithBwdOGradDotOKernel<FmhaBwdOGradDotOKernel_>(param, stream);
@@ -105,16 +86,19 @@ struct grouped_backward_causalmask_bias_dropout_dispatch {
       const bool has_local_attention = (param.window_size > 0) ? true : false;
 
       BOOL_SWITCH(has_local_attention, USE_LOCAL_ATTENTION, [&] {
-        constexpr ck::index_t occupancy = 1;
+        constexpr ck_tile::index_t occupancy = 1;
         constexpr bool has_masking = kHasCausalMask || USE_LOCAL_ATTENTION;
         const bool has_dropout = (param.dropout_prob > 0.0f);
 
-        using FmhaMask =
-            ck::tile_program::block::SimplifiedGenericAttentionMask<
-                has_masking>;
+        using FmhaMask = ck_tile::SimplifiedGenericAttentionMask<has_masking>;
 
         using FmhaBwdShape_ = FmhaBwdShape<MaxK>;
-        using FmhaBwdTilePartitioner_ = FmhaBwdTilePartitioner<FmhaBwdShape_>;
+        using FmhaBwdTilePartitioner_ =
+            ck_tile::FmhaBwdTilePartitioner<FmhaBwdShape_>;
+
+        constexpr auto kBiasEnum = kHasBias
+            ? ck_tile::BlockAttentionBiasEnum::ELEMENTWISE_BIAS
+            : ck_tile::BlockAttentionBiasEnum::NO_BIAS;
 
         constexpr bool kPadSeqLenQ = true;
         constexpr bool kPadSeqLenK = true;
@@ -127,15 +111,16 @@ struct grouped_backward_causalmask_bias_dropout_dispatch {
         const bool pad_headdim = (pad_headdim_q || pad_headdim_v);
 
         BOOL_SWITCH(pad_headdim, kPadHeadDim, [&] {
-          using FmhaBwdTraits_ = ck::tile_program::TileFmhaTraits<
+          using FmhaBwdTraits_ = ck_tile::TileFmhaTraits<
               kPadSeqLenQ,
               kPadSeqLenK,
               kPadHeadDim, // kPadHeadDimQ,
               kPadHeadDim, // kPadHeadDimV,
-              kHasBias,
+              kBiasEnum,
               kHasBiasGrad,
               false, // kStoreLSE
               kHasDropout,
+              false, // kDoFp8StaticQuant place-holder
               occupancy>;
 
           using FmhaBwdPipelineProblem =
@@ -148,10 +133,25 @@ struct grouped_backward_causalmask_bias_dropout_dispatch {
               FmhaBwdPipelineEnum_,
               FmhaBwdPipelineProblem>::pipeline;
 
-          using FmhaBwdDQDKDVKernel_ = FmhaBwdDQDKDVKernel<
+          using FmhaBwdKGradEpilogue_ =
+              ck_tile::Default2DEpilogue<ck_tile::Default2DEpilogueProblem<
+                  typename FmhaBwdTypeConfig<ScalarType>::AccDataType,
+                  typename FmhaBwdTypeConfig<ScalarType>::KGradDataType,
+                  kPadSeqLenK,
+                  kPadHeadDim>>;
+
+          using FmhaBwdVGradEpilogue_ =
+              ck_tile::Default2DEpilogue<ck_tile::Default2DEpilogueProblem<
+                  typename FmhaBwdTypeConfig<ScalarType>::AccDataType,
+                  typename FmhaBwdTypeConfig<ScalarType>::VGradDataType,
+                  kPadSeqLenK,
+                  kPadHeadDim>>;
+
+          using FmhaBwdDQDKDVKernel_ = ck_tile::FmhaBwdDQDKDVKernel<
               FmhaBwdTilePartitioner_,
               FmhaBwdPipeline_,
-              FmhaBwdEpilogue_>;
+              FmhaBwdKGradEpilogue_,
+              FmhaBwdVGradEpilogue_>;
 
           RunWithBwdDQDKDVKernel<FmhaBwdDQDKDVKernel_>(param, stream);
         });
@@ -182,15 +182,13 @@ struct grouped_backward_causalmask_bias_dropout_dispatch {
     dim3 kGridSize = FmhaBwdOGradDotOKernel::GridSize(
         param.num_batches, param.Hq, param.max_seqlen_q);
     constexpr dim3 kBlockSize = FmhaBwdOGradDotOKernel::BlockSize();
-    constexpr ck::index_t kBlockPerCu = FmhaBwdOGradDotOKernel::kBlockPerCu;
-
-    (void)launch_kernel<kBlockSize.x, kBlockPerCu>(
-        StreamConfig{stream, false},
-        FmhaBwdOGradDotOKernel{},
-        kGridSize,
-        kBlockSize,
-        0,
-        kargs);
+    constexpr ck_tile::index_t kBlockPerCu =
+        FmhaBwdOGradDotOKernel::kBlockPerCu;
+
+    (void)ck_tile::launch_kernel(
+        ck_tile::stream_config{stream, false},
+        ck_tile::make_kernel<kBlockSize.x, kBlockPerCu>(
+            FmhaBwdOGradDotOKernel{}, kGridSize, kBlockSize, 0, kargs));
   }
 
   template <typename FmhaBwdDQDKDVKernel>
@@ -253,15 +251,12 @@ struct grouped_backward_causalmask_bias_dropout_dispatch {
     dim3 kGridSize = FmhaBwdDQDKDVKernel::GridSize(
         param.num_batches, param.Hq, param.max_seqlen_k);
     constexpr dim3 kBlockSize = FmhaBwdDQDKDVKernel::BlockSize();
-    constexpr ck::index_t kBlockPerCu = FmhaBwdDQDKDVKernel::kBlockPerCu;
-
-    (void)launch_kernel<kBlockSize.x, kBlockPerCu>(
-        StreamConfig{stream, false},
-        FmhaBwdDQDKDVKernel{},
-        kGridSize,
-        kBlockSize,
-        0,
-        kargs);
+    constexpr ck_tile::index_t kBlockPerCu = FmhaBwdDQDKDVKernel::kBlockPerCu;
+
+    (void)ck_tile::launch_kernel(
+        ck_tile::stream_config{stream, false},
+        ck_tile::make_kernel<kBlockSize.x, kBlockPerCu>(
+            FmhaBwdDQDKDVKernel{}, kGridSize, kBlockSize, 0, kargs));
   }
 };
 
@@ -271,7 +266,7 @@ template <
     bool kHasBias,
     bool kHasBiasGrad,
     bool kHasDropout,
-    ck::index_t MaxK>
+    ck_tile::index_t MaxK>
 void run_grouped_backward_causalmask_bias_dropout_dispatch(
     GroupedBackwardParams& param,
     hipStream_t stream) {
diff --git a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_backward_bp16.cpp b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_backward_bf16.cpp
similarity index 71%
rename from xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_backward_bp16.cpp
rename to xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_backward_bf16.cpp
index 6db554405..5d08a4d72 100644
--- a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_backward_bp16.cpp
+++ b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_backward_bf16.cpp
@@ -4,8 +4,7 @@
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
  */
-#include <ck/ck.hpp>
-#include <ck/utility/data_type.hpp>
+#include <ck_tile/core.hpp>
 #include <stdexcept>
 
 #include "ck_tiled_bool_switch.h"
@@ -13,86 +12,86 @@
 #include "ck_tiled_headdim_switch.h"
 
 // clang-format off
-extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<ck::bhalf_t, false, true, true, true, 32>(
+extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, false, true, true, true, 32>(
     GroupedBackwardParams& param, hipStream_t stream);
-extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<ck::bhalf_t, false, true, false, true, 32>(
+extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, false, true, false, true, 32>(
     GroupedBackwardParams& param, hipStream_t stream);
-extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<ck::bhalf_t, false, false, false, true, 32>(
+extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, false, false, false, true, 32>(
     GroupedBackwardParams& param, hipStream_t stream);
-extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<ck::bhalf_t, true, true, true, true, 32>(
+extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, true, true, true, true, 32>(
     GroupedBackwardParams& param, hipStream_t stream);
-extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<ck::bhalf_t, true, true, false, true, 32>(
+extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, true, true, false, true, 32>(
     GroupedBackwardParams& param, hipStream_t stream);
-extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<ck::bhalf_t, true, false, false, true, 32>(
+extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, true, false, false, true, 32>(
     GroupedBackwardParams& param, hipStream_t stream);
 
-extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<ck::bhalf_t, false, true, true, false, 32>(
+extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, false, true, true, false, 32>(
     GroupedBackwardParams& param, hipStream_t stream);
-extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<ck::bhalf_t, false, true, false, false, 32>(
+extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, false, true, false, false, 32>(
     GroupedBackwardParams& param, hipStream_t stream);
-extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<ck::bhalf_t, false, false, false, false, 32>(
+extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, false, false, false, false, 32>(
     GroupedBackwardParams& param, hipStream_t stream);
-extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<ck::bhalf_t, true, true, true, false, 32>(
+extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, true, true, true, false, 32>(
     GroupedBackwardParams& param, hipStream_t stream);
-extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<ck::bhalf_t, true, true, false, false, 32>(
+extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, true, true, false, false, 32>(
     GroupedBackwardParams& param, hipStream_t stream);
-extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<ck::bhalf_t, true, false, false, false, 32>(
+extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, true, false, false, false, 32>(
     GroupedBackwardParams& param, hipStream_t stream);
 
-extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<ck::bhalf_t, false, true, true, true, 64>(
+extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, false, true, true, true, 64>(
     GroupedBackwardParams& param, hipStream_t stream);
-extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<ck::bhalf_t, false, true, false, true, 64>(
+extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, false, true, false, true, 64>(
     GroupedBackwardParams& param, hipStream_t stream);
-extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<ck::bhalf_t, false, false, false, true, 64>(
+extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, false, false, false, true, 64>(
     GroupedBackwardParams& param, hipStream_t stream);
-extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<ck::bhalf_t, true, true, true, true, 64>(
+extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, true, true, true, true, 64>(
     GroupedBackwardParams& param, hipStream_t stream);
-extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<ck::bhalf_t, true, true, false, true, 64>(
+extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, true, true, false, true, 64>(
     GroupedBackwardParams& param, hipStream_t stream);
-extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<ck::bhalf_t, true, false, false, true, 64>(
+extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, true, false, false, true, 64>(
     GroupedBackwardParams& param, hipStream_t stream);
 
-extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<ck::bhalf_t, false, true, true, false, 64>(
+extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, false, true, true, false, 64>(
     GroupedBackwardParams& param, hipStream_t stream);
-extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<ck::bhalf_t, false, true, false, false, 64>(
+extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, false, true, false, false, 64>(
     GroupedBackwardParams& param, hipStream_t stream);
-extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<ck::bhalf_t, false, false, false, false, 64>(
+extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, false, false, false, false, 64>(
     GroupedBackwardParams& param, hipStream_t stream);
-extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<ck::bhalf_t, true, true, true, false, 64>(
+extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, true, true, true, false, 64>(
     GroupedBackwardParams& param, hipStream_t stream);
-extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<ck::bhalf_t, true, true, false, false, 64>(
+extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, true, true, false, false, 64>(
     GroupedBackwardParams& param, hipStream_t stream);
-extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<ck::bhalf_t, true, false, false, false, 64>(
+extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, true, false, false, false, 64>(
     GroupedBackwardParams& param, hipStream_t stream);
 
-extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<ck::bhalf_t, false, true, true, true, 128>(
+extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, false, true, true, true, 128>(
     GroupedBackwardParams& param, hipStream_t stream);
-extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<ck::bhalf_t, false, true, false, true, 128>(
+extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, false, true, false, true, 128>(
     GroupedBackwardParams& param, hipStream_t stream);
-extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<ck::bhalf_t, false, false, false, true, 128>(
+extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, false, false, false, true, 128>(
     GroupedBackwardParams& param, hipStream_t stream);
-extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<ck::bhalf_t, true, true, true, true, 128>(
+extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, true, true, true, true, 128>(
     GroupedBackwardParams& param, hipStream_t stream);
-extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<ck::bhalf_t, true, true, false, true, 128>(
+extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, true, true, false, true, 128>(
     GroupedBackwardParams& param, hipStream_t stream);
-extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<ck::bhalf_t, true, false, false, true, 128>(
+extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, true, false, false, true, 128>(
     GroupedBackwardParams& param, hipStream_t stream);
 
-extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<ck::bhalf_t, false, true, true, false, 128>(
+extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, false, true, true, false, 128>(
     GroupedBackwardParams& param, hipStream_t stream);
-extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<ck::bhalf_t, false, true, false, false, 128>(
+extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, false, true, false, false, 128>(
     GroupedBackwardParams& param, hipStream_t stream);
-extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<ck::bhalf_t, false, false, false, false, 128>(
+extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, false, false, false, false, 128>(
     GroupedBackwardParams& param, hipStream_t stream);
-extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<ck::bhalf_t, true, true, true, false, 128>(
+extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, true, true, true, false, 128>(
     GroupedBackwardParams& param, hipStream_t stream);
-extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<ck::bhalf_t, true, true, false, false, 128>(
+extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, true, true, false, false, 128>(
     GroupedBackwardParams& param, hipStream_t stream);
-extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<ck::bhalf_t, true, false, false, false, 128>(
+extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, true, false, false, false, 128>(
     GroupedBackwardParams& param, hipStream_t stream);
 // clang-format on
 
-void grouped_backward_bp16(GroupedBackwardParams& param, hipStream_t stream) {
+void grouped_backward_bf16(GroupedBackwardParams& param, hipStream_t stream) {
   const bool has_dropout = (param.dropout_prob > 0.0f);
   BOOL_SWITCH_3(
       param.has_attn_bias,
@@ -106,7 +105,7 @@ void grouped_backward_bp16(GroupedBackwardParams& param, hipStream_t stream) {
           FMHA_BWD_HEADDIM_SWITCH(param.K, param.Kv, MaxK, [&] {
             if (param.custom_mask_type == 0)
               run_grouped_backward_causalmask_bias_dropout_dispatch<
-                  ck::bhalf_t,
+                  ck_tile::bf16_t,
                   false,
                   kHasBias,
                   kHasBiasGrad,
@@ -114,7 +113,7 @@ void grouped_backward_bp16(GroupedBackwardParams& param, hipStream_t stream) {
                   MaxK>(param, stream);
             else if (param.custom_mask_type == 1 || param.custom_mask_type == 2)
               run_grouped_backward_causalmask_bias_dropout_dispatch<
-                  ck::bhalf_t,
+                  ck_tile::bf16_t,
                   true,
                   kHasBias,
                   kHasBiasGrad,
diff --git a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_backward_fp16.cpp b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_backward_fp16.cpp
index 3dfc6f7f1..266cd0ad1 100644
--- a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_backward_fp16.cpp
+++ b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_backward_fp16.cpp
@@ -4,8 +4,7 @@
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
  */
-#include <ck/ck.hpp>
-#include <ck/utility/data_type.hpp>
+#include <ck_tile/core.hpp>
 #include <stdexcept>
 
 #include "ck_tiled_bool_switch.h"
@@ -13,82 +12,82 @@
 #include "ck_tiled_headdim_switch.h"
 
 // clang-format off
-extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<ck::half_t, false, true, true, true, 32>(
+extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, false, true, true, true, 32>(
     GroupedBackwardParams& param, hipStream_t stream);
-extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<ck::half_t, false, true, false, true, 32>(
+extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, false, true, false, true, 32>(
     GroupedBackwardParams& param, hipStream_t stream);
-extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<ck::half_t, false, false, false, true, 32>(
+extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, false, false, false, true, 32>(
     GroupedBackwardParams& param, hipStream_t stream);
-extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<ck::half_t, true, true, true, true, 32>(
+extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, true, true, true, true, 32>(
     GroupedBackwardParams& param, hipStream_t stream);
-extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<ck::half_t, true, true, false, true, 32>(
+extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, true, true, false, true, 32>(
     GroupedBackwardParams& param, hipStream_t stream);
-extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<ck::half_t, true, false, false, true, 32>(
+extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, true, false, false, true, 32>(
     GroupedBackwardParams& param, hipStream_t stream);
 
-extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<ck::half_t, false, true, true, false, 32>(
+extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, false, true, true, false, 32>(
     GroupedBackwardParams& param, hipStream_t stream);
-extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<ck::half_t, false, true, false, false, 32>(
+extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, false, true, false, false, 32>(
     GroupedBackwardParams& param, hipStream_t stream);
-extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<ck::half_t, false, false, false, false, 32>(
+extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, false, false, false, false, 32>(
     GroupedBackwardParams& param, hipStream_t stream);
-extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<ck::half_t, true, true, true, false, 32>(
+extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, true, true, true, false, 32>(
     GroupedBackwardParams& param, hipStream_t stream);
-extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<ck::half_t, true, true, false, false, 32>(
+extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, true, true, false, false, 32>(
     GroupedBackwardParams& param, hipStream_t stream);
-extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<ck::half_t, true, false, false, false, 32>(
+extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, true, false, false, false, 32>(
     GroupedBackwardParams& param, hipStream_t stream);
 
-extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<ck::half_t, false, true, true, true, 64>(
+extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, false, true, true, true, 64>(
     GroupedBackwardParams& param, hipStream_t stream);
-extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<ck::half_t, false, true, false, true, 64>(
+extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, false, true, false, true, 64>(
     GroupedBackwardParams& param, hipStream_t stream);
-extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<ck::half_t, false, false, false, true, 64>(
+extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, false, false, false, true, 64>(
     GroupedBackwardParams& param, hipStream_t stream);
-extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<ck::half_t, true, true, true, true, 64>(
+extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, true, true, true, true, 64>(
     GroupedBackwardParams& param, hipStream_t stream);
-extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<ck::half_t, true, true, false, true, 64>(
+extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, true, true, false, true, 64>(
     GroupedBackwardParams& param, hipStream_t stream);
-extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<ck::half_t, true, false, false, true, 64>(
+extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, true, false, false, true, 64>(
     GroupedBackwardParams& param, hipStream_t stream);
 
-extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<ck::half_t, false, true, true, false, 64>(
+extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, false, true, true, false, 64>(
     GroupedBackwardParams& param, hipStream_t stream);
-extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<ck::half_t, false, true, false, false, 64>(
+extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, false, true, false, false, 64>(
     GroupedBackwardParams& param, hipStream_t stream);
-extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<ck::half_t, false, false, false, false, 64>(
+extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, false, false, false, false, 64>(
     GroupedBackwardParams& param, hipStream_t stream);
-extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<ck::half_t, true, true, true, false, 64>(
+extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, true, true, true, false, 64>(
     GroupedBackwardParams& param, hipStream_t stream);
-extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<ck::half_t, true, true, false, false, 64>(
+extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, true, true, false, false, 64>(
     GroupedBackwardParams& param, hipStream_t stream);
-extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<ck::half_t, true, false, false, false, 64>(
+extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, true, false, false, false, 64>(
     GroupedBackwardParams& param, hipStream_t stream);
 
-extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<ck::half_t, false, true, true, true, 128>(
+extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, false, true, true, true, 128>(
     GroupedBackwardParams& param, hipStream_t stream);
-extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<ck::half_t, false, true, false, true, 128>(
+extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, false, true, false, true, 128>(
     GroupedBackwardParams& param, hipStream_t stream);
-extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<ck::half_t, false, false, false, true, 128>(
+extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, false, false, false, true, 128>(
     GroupedBackwardParams& param, hipStream_t stream);
-extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<ck::half_t, true, true, true, true, 128>(
+extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, true, true, true, true, 128>(
     GroupedBackwardParams& param, hipStream_t stream);
-extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<ck::half_t, true, true, false, true, 128>(
+extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, true, true, false, true, 128>(
     GroupedBackwardParams& param, hipStream_t stream);
-extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<ck::half_t, true, false, false, true, 128>(
+extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, true, false, false, true, 128>(
     GroupedBackwardParams& param, hipStream_t stream);
 
-extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<ck::half_t, false, true, true, false, 128>(
+extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, false, true, true, false, 128>(
     GroupedBackwardParams& param, hipStream_t stream);
-extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<ck::half_t, false, true, false, false, 128>(
+extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, false, true, false, false, 128>(
     GroupedBackwardParams& param, hipStream_t stream);
-extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<ck::half_t, false, false, false, false, 128>(
+extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, false, false, false, false, 128>(
     GroupedBackwardParams& param, hipStream_t stream);
-extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<ck::half_t, true, true, true, false, 128>(
+extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, true, true, true, false, 128>(
     GroupedBackwardParams& param, hipStream_t stream);
-extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<ck::half_t, true, true, false, false, 128>(
+extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, true, true, false, false, 128>(
     GroupedBackwardParams& param, hipStream_t stream);
-extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<ck::half_t, true, false, false, false, 128>(
+extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, true, false, false, false, 128>(
     GroupedBackwardParams& param, hipStream_t stream);
 // clang-format on
 
@@ -106,7 +105,7 @@ void grouped_backward_fp16(GroupedBackwardParams& param, hipStream_t stream) {
           FMHA_BWD_HEADDIM_SWITCH(param.K, param.Kv, MaxK, [&] {
             if (param.custom_mask_type == 0)
               run_grouped_backward_causalmask_bias_dropout_dispatch<
-                  ck::half_t,
+                  ck_tile::fp16_t,
                   false,
                   kHasBias,
                   kHasBiasGrad,
@@ -114,7 +113,7 @@ void grouped_backward_fp16(GroupedBackwardParams& param, hipStream_t stream) {
                   MaxK>(param, stream);
             else if (param.custom_mask_type == 1 || param.custom_mask_type == 2)
               run_grouped_backward_causalmask_bias_dropout_dispatch<
-                  ck::half_t,
+                  ck_tile::fp16_t,
                   true,
                   kHasBias,
                   kHasBiasGrad,
diff --git a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_forward.h b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_forward.h
index 0b348bd0e..55609fd9f 100644
--- a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_forward.h
+++ b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_forward.h
@@ -6,52 +6,39 @@
  */
 #pragma once
 
-#include <ck/host_utility/device_prop.hpp>
-#include <ck/host_utility/kernel_launch.hpp>
-#include <ck/tensor/tensor_view.hpp>
-#include <ck/tensor_description/cluster_descriptor.hpp>
-#include <ck/tensor_description/tensor_descriptor_helper.hpp>
-#include <ck/utility/common_header.hpp>
-
-#include <ck/tile_program/block_tile/block_masking.hpp>
-#include <ck/tile_program/block_tile_pipeline/block_fmha_pipeline_problem.hpp>
-#include <ck/tile_program/block_tile_pipeline/block_fmha_pipeline_qr_ks_vs.hpp>
-#include <ck/tile_program/tile/tile_fmha_shape.hpp>
-#include <ck/tile_program/tile/tile_fmha_traits.hpp>
+#include <ck_tile/core/numeric/integer.hpp>
+#include <ck_tile/host.hpp>
+#include <ck_tile/ops/epilogue.hpp>
+#include <ck_tile/ops/fmha.hpp>
 
 #include "ck_tiled_bool_switch.h"
 #include "ck_tiled_fmha_fwd_setting.h"
 #include "ck_tiled_fmha_params.h"
 
-#include "fmha_fwd_epilogue.hpp"
-#include "fmha_fwd_kernel.hpp"
-#include "fmha_fwd_tile_partitioner.hpp"
-
 template <
     typename ScalarType,
     bool kHasCausalMask,
     bool kHasBias,
     bool kHasDropout,
-    ck::index_t MaxK>
+    ck_tile::index_t MaxK>
 struct grouped_forward_causalmask_bias_dropout_dispatch {
   template <typename FmhaTraits, typename FmhaMask>
-  using FmhaPipelineProblemTemp =
-      ck::tile_program::block::BlockFmhaPipelineProblem<
-          typename FmhaFwdTypeConfig<ScalarType>::QDataType,
-          typename FmhaFwdTypeConfig<ScalarType>::KDataType,
-          typename FmhaFwdTypeConfig<ScalarType>::VDataType,
-          typename FmhaFwdTypeConfig<ScalarType>::SaccDataType,
-          typename FmhaFwdTypeConfig<ScalarType>::SMPLComputeDataType,
-          typename FmhaFwdTypeConfig<ScalarType>::BiasDataType,
-          typename FmhaFwdTypeConfig<ScalarType>::RandValOutputDataType,
-          typename FmhaFwdTypeConfig<ScalarType>::LSEDataType,
-          typename FmhaFwdTypeConfig<ScalarType>::PDataType,
-          typename FmhaFwdTypeConfig<ScalarType>::OaccDataType,
-          typename FmhaFwdTypeConfig<ScalarType>::ODataType,
-          FmhaFwdShape<MaxK>,
-          true, // kIsGroupMode
-          FmhaMask,
-          FmhaTraits>;
+  using FmhaPipelineProblemTemp = ck_tile::BlockFmhaPipelineProblem<
+      typename FmhaFwdTypeConfig<ScalarType>::QDataType,
+      typename FmhaFwdTypeConfig<ScalarType>::KDataType,
+      typename FmhaFwdTypeConfig<ScalarType>::VDataType,
+      typename FmhaFwdTypeConfig<ScalarType>::SaccDataType,
+      typename FmhaFwdTypeConfig<ScalarType>::SMPLComputeDataType,
+      typename FmhaFwdTypeConfig<ScalarType>::BiasDataType,
+      typename FmhaFwdTypeConfig<ScalarType>::RandValOutputDataType,
+      typename FmhaFwdTypeConfig<ScalarType>::LSEDataType,
+      typename FmhaFwdTypeConfig<ScalarType>::PDataType,
+      typename FmhaFwdTypeConfig<ScalarType>::OaccDataType,
+      typename FmhaFwdTypeConfig<ScalarType>::ODataType,
+      FmhaFwdShape<MaxK>,
+      true, // kIsGroupMode
+      FmhaMask,
+      FmhaTraits>;
 
   static void Run(GroupedForwardParams& param, hipStream_t stream) {
     const bool has_local_attention = (param.window_size > 0) ? true : false;
@@ -59,14 +46,18 @@ struct grouped_forward_causalmask_bias_dropout_dispatch {
     BOOL_SWITCH(has_local_attention, USE_LOCAL_ATTENTION, [&] {
       constexpr bool has_masking = kHasCausalMask || USE_LOCAL_ATTENTION;
 
-      using FmhaMask =
-          ck::tile_program::block::SimplifiedGenericAttentionMask<has_masking>;
+      using FmhaMask = ck_tile::SimplifiedGenericAttentionMask<has_masking>;
 
       using FmhaFwdShape_ = FmhaFwdShape<MaxK>;
-      using FmhaFwdTilePartitioner_ = FmhaFwdTilePartitioner<FmhaFwdShape_>;
-      constexpr ck::index_t occupancy =
+      using FmhaFwdTilePartitioner_ =
+          ck_tile::FmhaFwdTilePartitioner<FmhaFwdShape_>;
+      constexpr ck_tile::index_t occupancy =
           (MaxK == 64) ? 3 : (MaxK == 256) ? 1 : 2;
 
+      constexpr auto kBiasEnum = kHasBias
+          ? ck_tile::BlockAttentionBiasEnum::ELEMENTWISE_BIAS
+          : ck_tile::BlockAttentionBiasEnum::NO_BIAS;
+
       constexpr bool kPadSeqLenQ = true;
       constexpr bool kPadSeqLenK = true;
 
@@ -76,31 +67,32 @@ struct grouped_forward_causalmask_bias_dropout_dispatch {
 
       BOOL_SWITCH_2(
           pad_headdim_q, kPadHeadDimQ, pad_headdim_v, kPadHeadDimV, [&] {
-            using FmhaFwdTraits_ = ck::tile_program::TileFmhaTraits<
+            using FmhaFwdTraits_ = ck_tile::TileFmhaTraits<
                 kPadSeqLenQ,
                 kPadSeqLenK,
                 kPadHeadDimQ,
                 kPadHeadDimV,
-                kHasBias,
+                kBiasEnum,
                 false, // kHasBiasGrad place-holder
                 true, // kStoreLSE
                 kHasDropout,
+                false, // kDoFp8StaticQuant place-holder
                 occupancy>;
 
             using FmhaPipelineProblem =
                 FmhaPipelineProblemTemp<FmhaFwdTraits_, FmhaMask>;
 
             using FmhaFwdPipeline_ =
-                ck::tile_program::block::BlockFmhaPipelineQRKSVS<
-                    FmhaPipelineProblem>;
+                ck_tile::BlockFmhaPipelineQRKSVS<FmhaPipelineProblem>;
 
-            using FmhaFwdEpilogue_ = FmhaFwdEpilogue<FmhaFwdEpilogueProblem<
-                typename FmhaFwdTypeConfig<ScalarType>::OaccDataType,
-                typename FmhaFwdTypeConfig<ScalarType>::ODataType,
-                kPadSeqLenQ,
-                kPadHeadDimV>>;
+            using FmhaFwdEpilogue_ =
+                ck_tile::Default2DEpilogue<ck_tile::Default2DEpilogueProblem<
+                    typename FmhaFwdTypeConfig<ScalarType>::OaccDataType,
+                    typename FmhaFwdTypeConfig<ScalarType>::ODataType,
+                    kPadSeqLenQ,
+                    kPadHeadDimV>>;
 
-            using FmhaFwdKernel_ = FmhaFwdKernel<
+            using FmhaFwdKernel_ = ck_tile::FmhaFwdKernel<
                 FmhaFwdTilePartitioner_,
                 FmhaFwdPipeline_,
                 FmhaFwdEpilogue_>;
@@ -129,6 +121,8 @@ struct grouped_forward_causalmask_bias_dropout_dispatch {
           param.Hq, // nhead_q
           param.Hq / param.Hkv, // nhead_ratio_qk
           param.scale,
+          1.0f, // scale_p
+          1.0f, // scale_o
           param.q_strides[0], // q, k, v, bias, randval, out tensor seq-dim
                               // stride
           param.k_strides[0],
@@ -149,8 +143,6 @@ struct grouped_forward_causalmask_bias_dropout_dispatch {
                                   : -1, // window_left_size
           (param.custom_mask_type == 0) ? -1 : 0, // window_right_size
           param.custom_mask_type,
-          1.0f, // descale_qk, not used
-          1.0f, // descale_sv, not used
           param.dropout_prob,
           false, // is_store_randval
           {param.philox_seed, param.philox_offset});
@@ -159,15 +151,12 @@ struct grouped_forward_causalmask_bias_dropout_dispatch {
     dim3 kGridSize = FmhaFwdKernel::GridSize(
         param.num_batches, param.Hq, param.max_seqlen_q, param.Kv);
     constexpr dim3 kBlockSize = FmhaFwdKernel::BlockSize();
-    constexpr ck::index_t kBlockPerCu = FmhaFwdKernel::kBlockPerCu;
-
-    (void)launch_kernel<kBlockSize.x, kBlockPerCu>(
-        StreamConfig{stream, false},
-        FmhaFwdKernel{},
-        kGridSize,
-        kBlockSize,
-        0,
-        kargs);
+    constexpr ck_tile::index_t kBlockPerCu = FmhaFwdKernel::kBlockPerCu;
+
+    (void)ck_tile::launch_kernel(
+        ck_tile::stream_config{stream, false},
+        ck_tile::make_kernel<kBlockSize.x, kBlockPerCu>(
+            FmhaFwdKernel{}, kGridSize, kBlockSize, 0, kargs));
   };
 };
 
@@ -176,7 +165,7 @@ template <
     bool kHasCausalMask,
     bool kHasBias,
     bool kHasDropout,
-    ck::index_t MaxK>
+    ck_tile::index_t MaxK>
 void run_grouped_forward_causalmask_bias_dropout_dispatch(
     GroupedForwardParams& param,
     hipStream_t stream) {
diff --git a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_forward_bp16.cpp b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_forward_bf16.cpp
similarity index 73%
rename from xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_forward_bp16.cpp
rename to xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_forward_bf16.cpp
index f9d768c8c..e04af2e8a 100644
--- a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_forward_bp16.cpp
+++ b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_forward_bf16.cpp
@@ -4,8 +4,7 @@
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
  */
-#include <ck/ck.hpp>
-#include <ck/utility/data_type.hpp>
+#include <ck_tile/core.hpp>
 #include <stdexcept>
 
 #include "ck_tiled_bool_switch.h"
@@ -13,93 +12,93 @@
 #include "ck_tiled_headdim_switch.h"
 
 // clang-format off
-extern template void run_grouped_forward_causalmask_bias_dropout_dispatch<ck::bhalf_t, false, true, true, 32>(
+extern template void run_grouped_forward_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, false, true, true, 32>(
     GroupedForwardParams& param, hipStream_t stream);
-extern template void run_grouped_forward_causalmask_bias_dropout_dispatch<ck::bhalf_t, false, false, true, 32>(
+extern template void run_grouped_forward_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, false, false, true, 32>(
     GroupedForwardParams& param, hipStream_t stream);
-extern template void run_grouped_forward_causalmask_bias_dropout_dispatch<ck::bhalf_t, true, true, true, 32>(
+extern template void run_grouped_forward_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, true, true, true, 32>(
     GroupedForwardParams& param, hipStream_t stream);
-extern template void run_grouped_forward_causalmask_bias_dropout_dispatch<ck::bhalf_t, true, false, true, 32>(
+extern template void run_grouped_forward_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, true, false, true, 32>(
     GroupedForwardParams& param, hipStream_t stream);
 
-extern template void run_grouped_forward_causalmask_bias_dropout_dispatch<ck::bhalf_t, false, true, false, 32>(
+extern template void run_grouped_forward_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, false, true, false, 32>(
     GroupedForwardParams& param, hipStream_t stream);
-extern template void run_grouped_forward_causalmask_bias_dropout_dispatch<ck::bhalf_t, false, false, false, 32>(
+extern template void run_grouped_forward_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, false, false, false, 32>(
     GroupedForwardParams& param, hipStream_t stream);
-extern template void run_grouped_forward_causalmask_bias_dropout_dispatch<ck::bhalf_t, true, true, false, 32>(
+extern template void run_grouped_forward_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, true, true, false, 32>(
     GroupedForwardParams& param, hipStream_t stream);
-extern template void run_grouped_forward_causalmask_bias_dropout_dispatch<ck::bhalf_t, true, false, false, 32>(
+extern template void run_grouped_forward_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, true, false, false, 32>(
     GroupedForwardParams& param, hipStream_t stream);
 
-extern template void run_grouped_forward_causalmask_bias_dropout_dispatch<ck::bhalf_t, false, true, true, 64>(
+extern template void run_grouped_forward_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, false, true, true, 64>(
     GroupedForwardParams& param, hipStream_t stream);
-extern template void run_grouped_forward_causalmask_bias_dropout_dispatch<ck::bhalf_t, false, false, true, 64>(
+extern template void run_grouped_forward_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, false, false, true, 64>(
     GroupedForwardParams& param, hipStream_t stream);
-extern template void run_grouped_forward_causalmask_bias_dropout_dispatch<ck::bhalf_t, true, true, true, 64>(
+extern template void run_grouped_forward_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, true, true, true, 64>(
     GroupedForwardParams& param, hipStream_t stream);
-extern template void run_grouped_forward_causalmask_bias_dropout_dispatch<ck::bhalf_t, true, false, true, 64>(
+extern template void run_grouped_forward_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, true, false, true, 64>(
     GroupedForwardParams& param, hipStream_t stream);
 
-extern template void run_grouped_forward_causalmask_bias_dropout_dispatch<ck::bhalf_t, false, true, false, 64>(
+extern template void run_grouped_forward_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, false, true, false, 64>(
     GroupedForwardParams& param, hipStream_t stream);
-extern template void run_grouped_forward_causalmask_bias_dropout_dispatch<ck::bhalf_t, false, false, false, 64>(
+extern template void run_grouped_forward_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, false, false, false, 64>(
     GroupedForwardParams& param, hipStream_t stream);
-extern template void run_grouped_forward_causalmask_bias_dropout_dispatch<ck::bhalf_t, true, true, false, 64>(
+extern template void run_grouped_forward_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, true, true, false, 64>(
     GroupedForwardParams& param, hipStream_t stream);
-extern template void run_grouped_forward_causalmask_bias_dropout_dispatch<ck::bhalf_t, true, false, false, 64>(
+extern template void run_grouped_forward_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, true, false, false, 64>(
     GroupedForwardParams& param, hipStream_t stream);
 
-extern template void run_grouped_forward_causalmask_bias_dropout_dispatch<ck::bhalf_t, false, true, true, 128>(
+extern template void run_grouped_forward_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, false, true, true, 128>(
     GroupedForwardParams& param, hipStream_t stream);
-extern template void run_grouped_forward_causalmask_bias_dropout_dispatch<ck::bhalf_t, false, false, true, 128>(
+extern template void run_grouped_forward_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, false, false, true, 128>(
     GroupedForwardParams& param, hipStream_t stream);
-extern template void run_grouped_forward_causalmask_bias_dropout_dispatch<ck::bhalf_t, true, true, true, 128>(
+extern template void run_grouped_forward_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, true, true, true, 128>(
     GroupedForwardParams& param, hipStream_t stream);
-extern template void run_grouped_forward_causalmask_bias_dropout_dispatch<ck::bhalf_t, true, false, true, 128>(
+extern template void run_grouped_forward_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, true, false, true, 128>(
     GroupedForwardParams& param, hipStream_t stream);
 
-extern template void run_grouped_forward_causalmask_bias_dropout_dispatch<ck::bhalf_t, false, true, false, 128>(
+extern template void run_grouped_forward_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, false, true, false, 128>(
     GroupedForwardParams& param, hipStream_t stream);
-extern template void run_grouped_forward_causalmask_bias_dropout_dispatch<ck::bhalf_t, false, false, false, 128>(
+extern template void run_grouped_forward_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, false, false, false, 128>(
     GroupedForwardParams& param, hipStream_t stream);
-extern template void run_grouped_forward_causalmask_bias_dropout_dispatch<ck::bhalf_t, true, true, false, 128>(
+extern template void run_grouped_forward_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, true, true, false, 128>(
     GroupedForwardParams& param, hipStream_t stream);
-extern template void run_grouped_forward_causalmask_bias_dropout_dispatch<ck::bhalf_t, true, false, false, 128>(
+extern template void run_grouped_forward_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, true, false, false, 128>(
     GroupedForwardParams& param, hipStream_t stream);
 
-extern template void run_grouped_forward_causalmask_bias_dropout_dispatch<ck::bhalf_t, false, true, true, 256>(
+extern template void run_grouped_forward_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, false, true, true, 256>(
     GroupedForwardParams& param, hipStream_t stream);
-extern template void run_grouped_forward_causalmask_bias_dropout_dispatch<ck::bhalf_t, false, false, true, 256>(
+extern template void run_grouped_forward_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, false, false, true, 256>(
     GroupedForwardParams& param, hipStream_t stream);
-extern template void run_grouped_forward_causalmask_bias_dropout_dispatch<ck::bhalf_t, true, true, true, 256>(
+extern template void run_grouped_forward_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, true, true, true, 256>(
     GroupedForwardParams& param, hipStream_t stream);
-extern template void run_grouped_forward_causalmask_bias_dropout_dispatch<ck::bhalf_t, true, false, true, 256>(
+extern template void run_grouped_forward_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, true, false, true, 256>(
     GroupedForwardParams& param, hipStream_t stream);
 
-extern template void run_grouped_forward_causalmask_bias_dropout_dispatch<ck::bhalf_t, false, true, false, 256>(
+extern template void run_grouped_forward_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, false, true, false, 256>(
     GroupedForwardParams& param, hipStream_t stream);
-extern template void run_grouped_forward_causalmask_bias_dropout_dispatch<ck::bhalf_t, false, false, false, 256>(
+extern template void run_grouped_forward_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, false, false, false, 256>(
     GroupedForwardParams& param, hipStream_t stream);
-extern template void run_grouped_forward_causalmask_bias_dropout_dispatch<ck::bhalf_t, true, true, false, 256>(
+extern template void run_grouped_forward_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, true, true, false, 256>(
     GroupedForwardParams& param, hipStream_t stream);
-extern template void run_grouped_forward_causalmask_bias_dropout_dispatch<ck::bhalf_t, true, false, false, 256>(
+extern template void run_grouped_forward_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, true, false, false, 256>(
     GroupedForwardParams& param, hipStream_t stream);
 // clang-format on
 
-void grouped_forward_bp16(GroupedForwardParams& param, hipStream_t stream) {
+void grouped_forward_bf16(GroupedForwardParams& param, hipStream_t stream) {
   const bool has_dropout = (param.dropout_prob > 0.0f);
   BOOL_SWITCH_2(param.has_attn_bias, kHasBias, has_dropout, kHasDropout, [&] {
     FMHA_FWD_HEADDIM_SWITCH(param.K, param.Kv, MaxK, [&] {
       if (param.custom_mask_type == 0)
         run_grouped_forward_causalmask_bias_dropout_dispatch<
-            ck::bhalf_t,
+            ck_tile::bf16_t,
             false,
             kHasBias,
             kHasDropout,
             MaxK>(param, stream);
       else if (param.custom_mask_type == 1 || param.custom_mask_type == 2)
         run_grouped_forward_causalmask_bias_dropout_dispatch<
-            ck::bhalf_t,
+            ck_tile::bf16_t,
             true,
             kHasBias,
             kHasDropout,
diff --git a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_forward_fp16.cpp b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_forward_fp16.cpp
index abeba91f6..13276415e 100644
--- a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_forward_fp16.cpp
+++ b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_forward_fp16.cpp
@@ -4,8 +4,7 @@
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
  */
-#include <ck/ck.hpp>
-#include <ck/utility/data_type.hpp>
+#include <ck_tile/core.hpp>
 #include <stdexcept>
 
 #include "ck_tiled_bool_switch.h"
@@ -13,76 +12,76 @@
 #include "ck_tiled_headdim_switch.h"
 
 // clang-format off
-extern template void run_grouped_forward_causalmask_bias_dropout_dispatch<ck::half_t, false, true, true, 32>(
+extern template void run_grouped_forward_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, false, true, true, 32>(
     GroupedForwardParams& param, hipStream_t stream);
-extern template void run_grouped_forward_causalmask_bias_dropout_dispatch<ck::half_t, false, false, true, 32>(
+extern template void run_grouped_forward_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, false, false, true, 32>(
     GroupedForwardParams& param, hipStream_t stream);
-extern template void run_grouped_forward_causalmask_bias_dropout_dispatch<ck::half_t, true, true, true, 32>(
+extern template void run_grouped_forward_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, true, true, true, 32>(
     GroupedForwardParams& param, hipStream_t stream);
-extern template void run_grouped_forward_causalmask_bias_dropout_dispatch<ck::half_t, true, false, true, 32>(
+extern template void run_grouped_forward_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, true, false, true, 32>(
     GroupedForwardParams& param, hipStream_t stream);
 
-extern template void run_grouped_forward_causalmask_bias_dropout_dispatch<ck::half_t, false, true, false, 32>(
+extern template void run_grouped_forward_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, false, true, false, 32>(
     GroupedForwardParams& param, hipStream_t stream);
-extern template void run_grouped_forward_causalmask_bias_dropout_dispatch<ck::half_t, false, false, false, 32>(
+extern template void run_grouped_forward_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, false, false, false, 32>(
     GroupedForwardParams& param, hipStream_t stream);
-extern template void run_grouped_forward_causalmask_bias_dropout_dispatch<ck::half_t, true, true, false, 32>(
+extern template void run_grouped_forward_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, true, true, false, 32>(
     GroupedForwardParams& param, hipStream_t stream);
-extern template void run_grouped_forward_causalmask_bias_dropout_dispatch<ck::half_t, true, false, false, 32>(
+extern template void run_grouped_forward_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, true, false, false, 32>(
     GroupedForwardParams& param, hipStream_t stream);
 
-extern template void run_grouped_forward_causalmask_bias_dropout_dispatch<ck::half_t, false, true, true, 64>(
+extern template void run_grouped_forward_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, false, true, true, 64>(
     GroupedForwardParams& param, hipStream_t stream);
-extern template void run_grouped_forward_causalmask_bias_dropout_dispatch<ck::half_t, false, false, true, 64>(
+extern template void run_grouped_forward_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, false, false, true, 64>(
     GroupedForwardParams& param, hipStream_t stream);
-extern template void run_grouped_forward_causalmask_bias_dropout_dispatch<ck::half_t, true, true, true, 64>(
+extern template void run_grouped_forward_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, true, true, true, 64>(
     GroupedForwardParams& param, hipStream_t stream);
-extern template void run_grouped_forward_causalmask_bias_dropout_dispatch<ck::half_t, true, false, true, 64>(
+extern template void run_grouped_forward_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, true, false, true, 64>(
     GroupedForwardParams& param, hipStream_t stream);
 
-extern template void run_grouped_forward_causalmask_bias_dropout_dispatch<ck::half_t, false, true, false, 64>(
+extern template void run_grouped_forward_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, false, true, false, 64>(
     GroupedForwardParams& param, hipStream_t stream);
-extern template void run_grouped_forward_causalmask_bias_dropout_dispatch<ck::half_t, false, false, false, 64>(
+extern template void run_grouped_forward_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, false, false, false, 64>(
     GroupedForwardParams& param, hipStream_t stream);
-extern template void run_grouped_forward_causalmask_bias_dropout_dispatch<ck::half_t, true, true, false, 64>(
+extern template void run_grouped_forward_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, true, true, false, 64>(
     GroupedForwardParams& param, hipStream_t stream);
-extern template void run_grouped_forward_causalmask_bias_dropout_dispatch<ck::half_t, true, false, false, 64>(
+extern template void run_grouped_forward_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, true, false, false, 64>(
     GroupedForwardParams& param, hipStream_t stream);
 
-extern template void run_grouped_forward_causalmask_bias_dropout_dispatch<ck::half_t, false, true, true, 128>(
+extern template void run_grouped_forward_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, false, true, true, 128>(
     GroupedForwardParams& param, hipStream_t stream);
-extern template void run_grouped_forward_causalmask_bias_dropout_dispatch<ck::half_t, false, false, true, 128>(
+extern template void run_grouped_forward_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, false, false, true, 128>(
     GroupedForwardParams& param, hipStream_t stream);
-extern template void run_grouped_forward_causalmask_bias_dropout_dispatch<ck::half_t, true, true, true, 128>(
+extern template void run_grouped_forward_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, true, true, true, 128>(
     GroupedForwardParams& param, hipStream_t stream);
-extern template void run_grouped_forward_causalmask_bias_dropout_dispatch<ck::half_t, true, false, true, 128>(
+extern template void run_grouped_forward_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, true, false, true, 128>(
     GroupedForwardParams& param, hipStream_t stream);
 
-extern template void run_grouped_forward_causalmask_bias_dropout_dispatch<ck::half_t, false, true, false, 128>(
+extern template void run_grouped_forward_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, false, true, false, 128>(
     GroupedForwardParams& param, hipStream_t stream);
-extern template void run_grouped_forward_causalmask_bias_dropout_dispatch<ck::half_t, false, false, false, 128>(
+extern template void run_grouped_forward_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, false, false, false, 128>(
     GroupedForwardParams& param, hipStream_t stream);
-extern template void run_grouped_forward_causalmask_bias_dropout_dispatch<ck::half_t, true, true, false, 128>(
+extern template void run_grouped_forward_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, true, true, false, 128>(
     GroupedForwardParams& param, hipStream_t stream);
-extern template void run_grouped_forward_causalmask_bias_dropout_dispatch<ck::half_t, true, false, false, 128>(
+extern template void run_grouped_forward_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, true, false, false, 128>(
     GroupedForwardParams& param, hipStream_t stream);
 
-extern template void run_grouped_forward_causalmask_bias_dropout_dispatch<ck::half_t, false, true, true, 256>(
+extern template void run_grouped_forward_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, false, true, true, 256>(
     GroupedForwardParams& param, hipStream_t stream);
-extern template void run_grouped_forward_causalmask_bias_dropout_dispatch<ck::half_t, false, false, true, 256>(
+extern template void run_grouped_forward_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, false, false, true, 256>(
     GroupedForwardParams& param, hipStream_t stream);
-extern template void run_grouped_forward_causalmask_bias_dropout_dispatch<ck::half_t, true, true, true, 256>(
+extern template void run_grouped_forward_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, true, true, true, 256>(
     GroupedForwardParams& param, hipStream_t stream);
-extern template void run_grouped_forward_causalmask_bias_dropout_dispatch<ck::half_t, true, false, true, 256>(
+extern template void run_grouped_forward_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, true, false, true, 256>(
     GroupedForwardParams& param, hipStream_t stream);
 
-extern template void run_grouped_forward_causalmask_bias_dropout_dispatch<ck::half_t, false, true, false, 256>(
+extern template void run_grouped_forward_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, false, true, false, 256>(
     GroupedForwardParams& param, hipStream_t stream);
-extern template void run_grouped_forward_causalmask_bias_dropout_dispatch<ck::half_t, false, false, false, 256>(
+extern template void run_grouped_forward_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, false, false, false, 256>(
     GroupedForwardParams& param, hipStream_t stream);
-extern template void run_grouped_forward_causalmask_bias_dropout_dispatch<ck::half_t, true, true, false, 256>(
+extern template void run_grouped_forward_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, true, true, false, 256>(
     GroupedForwardParams& param, hipStream_t stream);
-extern template void run_grouped_forward_causalmask_bias_dropout_dispatch<ck::half_t, true, false, false, 256>(
+extern template void run_grouped_forward_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, true, false, false, 256>(
     GroupedForwardParams& param, hipStream_t stream);
 // clang-format on
 
@@ -92,14 +91,14 @@ void grouped_forward_fp16(GroupedForwardParams& param, hipStream_t stream) {
     FMHA_FWD_HEADDIM_SWITCH(param.K, param.Kv, MaxK, [&] {
       if (param.custom_mask_type == 0)
         run_grouped_forward_causalmask_bias_dropout_dispatch<
-            ck::half_t,
+            ck_tile::fp16_t,
             false,
             kHasBias,
             kHasDropout,
             MaxK>(param, stream);
       else if (param.custom_mask_type == 1 || param.custom_mask_type == 2)
         run_grouped_forward_causalmask_bias_dropout_dispatch<
-            ck::half_t,
+            ck_tile::fp16_t,
             true,
             kHasBias,
             kHasDropout,
diff --git a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_infer.h b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_infer.h
index e26937576..f66eeb436 100644
--- a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_infer.h
+++ b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_infer.h
@@ -6,54 +6,40 @@
  */
 #pragma once
 
-#include <ck/host_utility/device_prop.hpp>
-#include <ck/host_utility/kernel_launch.hpp>
-#include <ck/tensor/tensor_view.hpp>
-#include <ck/tensor_description/cluster_descriptor.hpp>
-#include <ck/tensor_description/tensor_descriptor_helper.hpp>
-#include <ck/utility/common_header.hpp>
-
-#include <ck/tile_program/block_tile/block_masking.hpp>
-#include <ck/tile_program/block_tile_pipeline/block_fmha_pipeline_problem.hpp>
-#include <ck/tile_program/block_tile_pipeline/block_fmha_pipeline_qr_ks_vs.hpp>
-#include <ck/tile_program/block_tile_pipeline/block_fmha_pipeline_qr_ks_vs_async.hpp>
-#include <ck/tile_program/tile/tile_fmha_shape.hpp>
-#include <ck/tile_program/tile/tile_fmha_traits.hpp>
+#include <ck_tile/core/numeric/integer.hpp>
+#include <ck_tile/host.hpp>
+#include <ck_tile/ops/epilogue.hpp>
+#include <ck_tile/ops/fmha.hpp>
 
 #include "ck_tiled_bool_switch.h"
 #include "ck_tiled_fmha_fwd_setting.h"
 #include "ck_tiled_fmha_params.h"
 #include "ck_tiled_headdim_switch.h"
 
-#include "fmha_fwd_epilogue.hpp"
-#include "fmha_fwd_kernel.hpp"
-#include "fmha_fwd_tile_partitioner.hpp"
-
 template <
     typename ScalarType,
     bool kHasCausalMask,
     bool kHasBias,
     bool kHasDropout,
-    ck::index_t MaxK>
+    ck_tile::index_t MaxK>
 struct grouped_infer_causalmask_bias_dropout_dispatch {
   template <typename FmhaTraits, typename FmhaMask>
-  using FmhaPipelineProblemTemp =
-      ck::tile_program::block::BlockFmhaPipelineProblem<
-          typename FmhaFwdTypeConfig<ScalarType>::QDataType,
-          typename FmhaFwdTypeConfig<ScalarType>::KDataType,
-          typename FmhaFwdTypeConfig<ScalarType>::VDataType,
-          typename FmhaFwdTypeConfig<ScalarType>::SaccDataType,
-          typename FmhaFwdTypeConfig<ScalarType>::SMPLComputeDataType,
-          typename FmhaFwdTypeConfig<ScalarType>::BiasDataType,
-          typename FmhaFwdTypeConfig<ScalarType>::RandValOutputDataType,
-          typename FmhaFwdTypeConfig<ScalarType>::LSEDataType,
-          typename FmhaFwdTypeConfig<ScalarType>::PDataType,
-          typename FmhaFwdTypeConfig<ScalarType>::OaccDataType,
-          typename FmhaFwdTypeConfig<ScalarType>::ODataType,
-          FmhaFwdShape<MaxK>,
-          true, // kIsGroupMode
-          FmhaMask,
-          FmhaTraits>;
+  using FmhaPipelineProblemTemp = ck_tile::BlockFmhaPipelineProblem<
+      typename FmhaFwdTypeConfig<ScalarType>::QDataType,
+      typename FmhaFwdTypeConfig<ScalarType>::KDataType,
+      typename FmhaFwdTypeConfig<ScalarType>::VDataType,
+      typename FmhaFwdTypeConfig<ScalarType>::SaccDataType,
+      typename FmhaFwdTypeConfig<ScalarType>::SMPLComputeDataType,
+      typename FmhaFwdTypeConfig<ScalarType>::BiasDataType,
+      typename FmhaFwdTypeConfig<ScalarType>::RandValOutputDataType,
+      typename FmhaFwdTypeConfig<ScalarType>::LSEDataType,
+      typename FmhaFwdTypeConfig<ScalarType>::PDataType,
+      typename FmhaFwdTypeConfig<ScalarType>::OaccDataType,
+      typename FmhaFwdTypeConfig<ScalarType>::ODataType,
+      FmhaFwdShape<MaxK>,
+      true, // kIsGroupMode
+      FmhaMask,
+      FmhaTraits>;
 
   static void Run(GroupedForwardParams& param, hipStream_t stream) {
     const bool has_local_attention = (param.window_size > 0) ? true : false;
@@ -61,14 +47,17 @@ struct grouped_infer_causalmask_bias_dropout_dispatch {
     BOOL_SWITCH(has_local_attention, USE_LOCAL_ATTENTION, [&] {
       constexpr bool has_masking = kHasCausalMask || USE_LOCAL_ATTENTION;
 
-      using FmhaMask =
-          ck::tile_program::block::SimplifiedGenericAttentionMask<has_masking>;
+      using FmhaMask = ck_tile::SimplifiedGenericAttentionMask<has_masking>;
 
       using FmhaShape = FmhaFwdShape<MaxK>;
-      using FmhaTilePartitioner = FmhaFwdTilePartitioner<FmhaShape>;
-      constexpr ck::index_t occupancy =
+      using FmhaTilePartitioner = ck_tile::FmhaFwdTilePartitioner<FmhaShape>;
+      constexpr ck_tile::index_t occupancy =
           (MaxK == 64) ? 3 : ((MaxK == 256) ? 1 : 2);
 
+      constexpr auto kBiasEnum = kHasBias
+          ? ck_tile::BlockAttentionBiasEnum::ELEMENTWISE_BIAS
+          : ck_tile::BlockAttentionBiasEnum::NO_BIAS;
+
       constexpr bool kPadSeqLenQ = true;
       constexpr bool kPadSeqLenK = true;
 
@@ -80,31 +69,32 @@ struct grouped_infer_causalmask_bias_dropout_dispatch {
       if (!use_async_pipeline) {
         BOOL_SWITCH_2(
             pad_headdim_q, kPadHeadDimQ, pad_headdim_v, kPadHeadDimV, [&] {
-              using FmhaTraits = ck::tile_program::TileFmhaTraits<
+              using FmhaTraits = ck_tile::TileFmhaTraits<
                   kPadSeqLenQ,
                   kPadSeqLenK,
                   kPadHeadDimQ,
                   kPadHeadDimV,
-                  kHasBias,
+                  kBiasEnum,
                   false, // kHasBiasGrad place-holder
                   false, // kStoreLSE
                   kHasDropout,
+                  false, // kDoFp8StaticQuant place-holder
                   occupancy>;
 
               using FmhaPipelineProblem =
                   FmhaPipelineProblemTemp<FmhaTraits, FmhaMask>;
 
               using FmhaPipeline =
-                  ck::tile_program::block::BlockFmhaPipelineQRKSVS<
-                      FmhaPipelineProblem>;
+                  ck_tile::BlockFmhaPipelineQRKSVS<FmhaPipelineProblem>;
 
-              using FmhaEpilogue = FmhaFwdEpilogue<FmhaFwdEpilogueProblem<
-                  typename FmhaFwdTypeConfig<ScalarType>::OaccDataType,
-                  typename FmhaFwdTypeConfig<ScalarType>::ODataType,
-                  kPadSeqLenQ,
-                  kPadHeadDimV>>;
+              using FmhaEpilogue =
+                  ck_tile::Default2DEpilogue<ck_tile::Default2DEpilogueProblem<
+                      typename FmhaFwdTypeConfig<ScalarType>::OaccDataType,
+                      typename FmhaFwdTypeConfig<ScalarType>::ODataType,
+                      kPadSeqLenQ,
+                      kPadHeadDimV>>;
 
-              using FmhaKernel = FmhaFwdKernel<
+              using FmhaKernel = ck_tile::FmhaFwdKernel<
                   FmhaTilePartitioner,
                   FmhaPipeline,
                   FmhaEpilogue>;
@@ -112,31 +102,32 @@ struct grouped_infer_causalmask_bias_dropout_dispatch {
               RunWithKernel<FmhaKernel>(param, stream);
             });
       } else {
-        using FmhaTraits = ck::tile_program::TileFmhaTraits<
+        using FmhaTraits = ck_tile::TileFmhaTraits<
             true, // kPadSeqLenQ,
             kPadSeqLenK,
             true, // kPadHeadDimQ,
             true, // kPadHeadDimV,
-            kHasBias,
+            kBiasEnum,
             false, // kHasBiasGrad place-holder
             false, // kStoreLSE
             kHasDropout,
+            false, // kDoFp8StaticQuant place-holder
             occupancy>;
 
         using FmhaPipelineProblem =
             FmhaPipelineProblemTemp<FmhaTraits, FmhaMask>;
 
         using FmhaPipeline =
-            ck::tile_program::block::BlockFmhaPipelineQRKSVSAsync<
-                FmhaPipelineProblem>;
+            ck_tile::BlockFmhaPipelineQRKSVSAsync<FmhaPipelineProblem>;
 
-        using FmhaEpilogue = FmhaFwdEpilogue<FmhaFwdEpilogueProblem<
-            typename FmhaFwdTypeConfig<ScalarType>::OaccDataType,
-            typename FmhaFwdTypeConfig<ScalarType>::ODataType,
-            true,
-            true>>;
+        using FmhaEpilogue =
+            ck_tile::Default2DEpilogue<ck_tile::Default2DEpilogueProblem<
+                typename FmhaFwdTypeConfig<ScalarType>::OaccDataType,
+                typename FmhaFwdTypeConfig<ScalarType>::ODataType,
+                true,
+                true>>;
 
-        using FmhaKernel =
+        using FmhaKernel = ck_tile::
             FmhaFwdKernel<FmhaTilePartitioner, FmhaPipeline, FmhaEpilogue>;
 
         RunWithKernel<FmhaKernel>(param, stream);
@@ -163,6 +154,8 @@ struct grouped_infer_causalmask_bias_dropout_dispatch {
           param.Hq, // nhead_q
           param.Hq / param.Hkv, // nhead_ratio_qk
           param.scale,
+          1.0f, // scale_p
+          1.0f, // scale_o
           param.q_strides[0], // q, k, v, bias, randval, out tensor seq-dim
                               // stride
           param.k_strides[0],
@@ -183,8 +176,6 @@ struct grouped_infer_causalmask_bias_dropout_dispatch {
                                   : -1, // window_left_size
           (param.custom_mask_type == 0) ? -1 : 0, // window_right_size
           param.custom_mask_type,
-          1.0f, // descale_qk, not used
-          1.0f, // descale_sv, not used
           param.dropout_prob,
           false, // is_store_randval
           {param.philox_seed, param.philox_offset});
@@ -193,15 +184,12 @@ struct grouped_infer_causalmask_bias_dropout_dispatch {
     dim3 kGridSize = FmhaKernel::GridSize(
         param.num_batches, param.Hq, param.max_seqlen_q, param.Kv);
     constexpr dim3 kBlockSize = FmhaKernel::BlockSize();
-    constexpr ck::index_t kBlockPerCu = FmhaKernel::kBlockPerCu;
-
-    (void)launch_kernel<kBlockSize.x, kBlockPerCu>(
-        StreamConfig{stream, false},
-        FmhaKernel{},
-        kGridSize,
-        kBlockSize,
-        0,
-        kargs);
+    constexpr ck_tile::index_t kBlockPerCu = FmhaKernel::kBlockPerCu;
+
+    (void)ck_tile::launch_kernel(
+        ck_tile::stream_config{stream, false},
+        ck_tile::make_kernel<kBlockSize.x, kBlockPerCu>(
+            FmhaKernel{}, kGridSize, kBlockSize, 0, kargs));
   };
 };
 
@@ -210,7 +198,7 @@ template <
     bool kHasCausalMask,
     bool kHasBias,
     bool kHasDropout,
-    ck::index_t MaxK>
+    ck_tile::index_t MaxK>
 void run_grouped_infer_causalmask_bias_dropout_dispatch(
     GroupedForwardParams& param,
     hipStream_t stream) {
diff --git a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_infer_bp16.cpp b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_infer_bf16.cpp
similarity index 73%
rename from xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_infer_bp16.cpp
rename to xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_infer_bf16.cpp
index 80ef8a396..5b0fb5b37 100644
--- a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_infer_bp16.cpp
+++ b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_infer_bf16.cpp
@@ -4,101 +4,100 @@
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
  */
-#include <ck/ck.hpp>
-#include <ck/utility/data_type.hpp>
+#include <ck_tile/core.hpp>
 #include <stdexcept>
 
 #include "ck_tiled_bool_switch.h"
 #include "ck_tiled_fmha_grouped_infer.h"
 
 // clang-format off
-extern template void run_grouped_infer_causalmask_bias_dropout_dispatch<ck::bhalf_t, false, true, true, 32>(
+extern template void run_grouped_infer_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, false, true, true, 32>(
     GroupedForwardParams& param, hipStream_t stream);
-extern template void run_grouped_infer_causalmask_bias_dropout_dispatch<ck::bhalf_t, false, false, true, 32>(
+extern template void run_grouped_infer_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, false, false, true, 32>(
     GroupedForwardParams& param, hipStream_t stream);
-extern template void run_grouped_infer_causalmask_bias_dropout_dispatch<ck::bhalf_t, true, true, true, 32>(
+extern template void run_grouped_infer_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, true, true, true, 32>(
     GroupedForwardParams& param, hipStream_t stream);
-extern template void run_grouped_infer_causalmask_bias_dropout_dispatch<ck::bhalf_t, true, false, true, 32>(
+extern template void run_grouped_infer_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, true, false, true, 32>(
     GroupedForwardParams& param, hipStream_t stream);
 
-extern template void run_grouped_infer_causalmask_bias_dropout_dispatch<ck::bhalf_t, false, true, false, 32>(
+extern template void run_grouped_infer_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, false, true, false, 32>(
     GroupedForwardParams& param, hipStream_t stream);
-extern template void run_grouped_infer_causalmask_bias_dropout_dispatch<ck::bhalf_t, false, false, false, 32>(
+extern template void run_grouped_infer_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, false, false, false, 32>(
     GroupedForwardParams& param, hipStream_t stream);
-extern template void run_grouped_infer_causalmask_bias_dropout_dispatch<ck::bhalf_t, true, true, false, 32>(
+extern template void run_grouped_infer_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, true, true, false, 32>(
     GroupedForwardParams& param, hipStream_t stream);
-extern template void run_grouped_infer_causalmask_bias_dropout_dispatch<ck::bhalf_t, true, false, false, 32>(
+extern template void run_grouped_infer_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, true, false, false, 32>(
     GroupedForwardParams& param, hipStream_t stream);
 
-extern template void run_grouped_infer_causalmask_bias_dropout_dispatch<ck::bhalf_t, false, true, true, 64>(
+extern template void run_grouped_infer_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, false, true, true, 64>(
     GroupedForwardParams& param, hipStream_t stream);
-extern template void run_grouped_infer_causalmask_bias_dropout_dispatch<ck::bhalf_t, false, false, true, 64>(
+extern template void run_grouped_infer_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, false, false, true, 64>(
     GroupedForwardParams& param, hipStream_t stream);
-extern template void run_grouped_infer_causalmask_bias_dropout_dispatch<ck::bhalf_t, true, true, true, 64>(
+extern template void run_grouped_infer_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, true, true, true, 64>(
     GroupedForwardParams& param, hipStream_t stream);
-extern template void run_grouped_infer_causalmask_bias_dropout_dispatch<ck::bhalf_t, true, false, true, 64>(
+extern template void run_grouped_infer_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, true, false, true, 64>(
     GroupedForwardParams& param, hipStream_t stream);
 
-extern template void run_grouped_infer_causalmask_bias_dropout_dispatch<ck::bhalf_t, false, true, false, 64>(
+extern template void run_grouped_infer_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, false, true, false, 64>(
     GroupedForwardParams& param, hipStream_t stream);
-extern template void run_grouped_infer_causalmask_bias_dropout_dispatch<ck::bhalf_t, false, false, false, 64>(
+extern template void run_grouped_infer_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, false, false, false, 64>(
     GroupedForwardParams& param, hipStream_t stream);
-extern template void run_grouped_infer_causalmask_bias_dropout_dispatch<ck::bhalf_t, true, true, false, 64>(
+extern template void run_grouped_infer_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, true, true, false, 64>(
     GroupedForwardParams& param, hipStream_t stream);
-extern template void run_grouped_infer_causalmask_bias_dropout_dispatch<ck::bhalf_t, true, false, false, 64>(
+extern template void run_grouped_infer_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, true, false, false, 64>(
     GroupedForwardParams& param, hipStream_t stream);
 
-extern template void run_grouped_infer_causalmask_bias_dropout_dispatch<ck::bhalf_t, false, true, true, 128>(
+extern template void run_grouped_infer_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, false, true, true, 128>(
     GroupedForwardParams& param, hipStream_t stream);
-extern template void run_grouped_infer_causalmask_bias_dropout_dispatch<ck::bhalf_t, false, false, true, 128>(
+extern template void run_grouped_infer_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, false, false, true, 128>(
     GroupedForwardParams& param, hipStream_t stream);
-extern template void run_grouped_infer_causalmask_bias_dropout_dispatch<ck::bhalf_t, true, true, true, 128>(
+extern template void run_grouped_infer_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, true, true, true, 128>(
     GroupedForwardParams& param, hipStream_t stream);
-extern template void run_grouped_infer_causalmask_bias_dropout_dispatch<ck::bhalf_t, true, false, true, 128>(
+extern template void run_grouped_infer_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, true, false, true, 128>(
     GroupedForwardParams& param, hipStream_t stream);
 
-extern template void run_grouped_infer_causalmask_bias_dropout_dispatch<ck::bhalf_t, false, true, false, 128>(
+extern template void run_grouped_infer_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, false, true, false, 128>(
     GroupedForwardParams& param, hipStream_t stream);
-extern template void run_grouped_infer_causalmask_bias_dropout_dispatch<ck::bhalf_t, false, false, false, 128>(
+extern template void run_grouped_infer_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, false, false, false, 128>(
     GroupedForwardParams& param, hipStream_t stream);
-extern template void run_grouped_infer_causalmask_bias_dropout_dispatch<ck::bhalf_t, true, true, false, 128>(
+extern template void run_grouped_infer_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, true, true, false, 128>(
     GroupedForwardParams& param, hipStream_t stream);
-extern template void run_grouped_infer_causalmask_bias_dropout_dispatch<ck::bhalf_t, true, false, false, 128>(
+extern template void run_grouped_infer_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, true, false, false, 128>(
     GroupedForwardParams& param, hipStream_t stream);
 
-extern template void run_grouped_infer_causalmask_bias_dropout_dispatch<ck::bhalf_t, false, true, true, 256>(
+extern template void run_grouped_infer_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, false, true, true, 256>(
     GroupedForwardParams& param, hipStream_t stream);
-extern template void run_grouped_infer_causalmask_bias_dropout_dispatch<ck::bhalf_t, false, false, true, 256>(
+extern template void run_grouped_infer_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, false, false, true, 256>(
     GroupedForwardParams& param, hipStream_t stream);
-extern template void run_grouped_infer_causalmask_bias_dropout_dispatch<ck::bhalf_t, true, true, true, 256>(
+extern template void run_grouped_infer_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, true, true, true, 256>(
     GroupedForwardParams& param, hipStream_t stream);
-extern template void run_grouped_infer_causalmask_bias_dropout_dispatch<ck::bhalf_t, true, false, true, 256>(
+extern template void run_grouped_infer_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, true, false, true, 256>(
     GroupedForwardParams& param, hipStream_t stream);
 
-extern template void run_grouped_infer_causalmask_bias_dropout_dispatch<ck::bhalf_t, false, true, false, 256>(
+extern template void run_grouped_infer_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, false, true, false, 256>(
     GroupedForwardParams& param, hipStream_t stream);
-extern template void run_grouped_infer_causalmask_bias_dropout_dispatch<ck::bhalf_t, false, false, false, 256>(
+extern template void run_grouped_infer_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, false, false, false, 256>(
     GroupedForwardParams& param, hipStream_t stream);
-extern template void run_grouped_infer_causalmask_bias_dropout_dispatch<ck::bhalf_t, true, true, false, 256>(
+extern template void run_grouped_infer_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, true, true, false, 256>(
     GroupedForwardParams& param, hipStream_t stream);
-extern template void run_grouped_infer_causalmask_bias_dropout_dispatch<ck::bhalf_t, true, false, false, 256>(
+extern template void run_grouped_infer_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, true, false, false, 256>(
     GroupedForwardParams& param, hipStream_t stream);
 // clang-format on
 
-void grouped_infer_bp16(GroupedForwardParams& param, hipStream_t stream) {
+void grouped_infer_bf16(GroupedForwardParams& param, hipStream_t stream) {
   const bool has_dropout = (param.dropout_prob > 0.0f);
   BOOL_SWITCH_2(param.has_attn_bias, kHasBias, has_dropout, kHasDropout, [&] {
     FMHA_FWD_HEADDIM_SWITCH(param.K, param.Kv, MaxK, [&] {
       if (param.custom_mask_type == 0)
         run_grouped_infer_causalmask_bias_dropout_dispatch<
-            ck::bhalf_t,
+            ck_tile::bf16_t,
             false,
             kHasBias,
             kHasDropout,
             MaxK>(param, stream);
       else if (param.custom_mask_type == 1 || param.custom_mask_type == 2)
         run_grouped_infer_causalmask_bias_dropout_dispatch<
-            ck::bhalf_t,
+            ck_tile::bf16_t,
             true,
             kHasBias,
             kHasDropout,
diff --git a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_infer_fp16.cpp b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_infer_fp16.cpp
index 73103a0e8..fa0a407f1 100644
--- a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_infer_fp16.cpp
+++ b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_infer_fp16.cpp
@@ -4,84 +4,83 @@
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
  */
-#include <ck/ck.hpp>
-#include <ck/utility/data_type.hpp>
+#include <ck_tile/core.hpp>
 #include <stdexcept>
 
 #include "ck_tiled_bool_switch.h"
 #include "ck_tiled_fmha_grouped_infer.h"
 
 // clang-format off
-extern template void run_grouped_infer_causalmask_bias_dropout_dispatch<ck::half_t, false, true, true, 32>(
+extern template void run_grouped_infer_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, false, true, true, 32>(
     GroupedForwardParams& param, hipStream_t stream);
-extern template void run_grouped_infer_causalmask_bias_dropout_dispatch<ck::half_t, false, false, true, 32>(
+extern template void run_grouped_infer_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, false, false, true, 32>(
     GroupedForwardParams& param, hipStream_t stream);
-extern template void run_grouped_infer_causalmask_bias_dropout_dispatch<ck::half_t, true, true, true, 32>(
+extern template void run_grouped_infer_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, true, true, true, 32>(
     GroupedForwardParams& param, hipStream_t stream);
-extern template void run_grouped_infer_causalmask_bias_dropout_dispatch<ck::half_t, true, false, true, 32>(
+extern template void run_grouped_infer_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, true, false, true, 32>(
     GroupedForwardParams& param, hipStream_t stream);
 
-extern template void run_grouped_infer_causalmask_bias_dropout_dispatch<ck::half_t, false, true, false, 32>(
+extern template void run_grouped_infer_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, false, true, false, 32>(
     GroupedForwardParams& param, hipStream_t stream);
-extern template void run_grouped_infer_causalmask_bias_dropout_dispatch<ck::half_t, false, false, false, 32>(
+extern template void run_grouped_infer_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, false, false, false, 32>(
     GroupedForwardParams& param, hipStream_t stream);
-extern template void run_grouped_infer_causalmask_bias_dropout_dispatch<ck::half_t, true, true, false, 32>(
+extern template void run_grouped_infer_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, true, true, false, 32>(
     GroupedForwardParams& param, hipStream_t stream);
-extern template void run_grouped_infer_causalmask_bias_dropout_dispatch<ck::half_t, true, false, false, 32>(
+extern template void run_grouped_infer_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, true, false, false, 32>(
     GroupedForwardParams& param, hipStream_t stream);
 
-extern template void run_grouped_infer_causalmask_bias_dropout_dispatch<ck::half_t, false, true, true, 64>(
+extern template void run_grouped_infer_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, false, true, true, 64>(
     GroupedForwardParams& param, hipStream_t stream);
-extern template void run_grouped_infer_causalmask_bias_dropout_dispatch<ck::half_t, false, false, true, 64>(
+extern template void run_grouped_infer_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, false, false, true, 64>(
     GroupedForwardParams& param, hipStream_t stream);
-extern template void run_grouped_infer_causalmask_bias_dropout_dispatch<ck::half_t, true, true, true, 64>(
+extern template void run_grouped_infer_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, true, true, true, 64>(
     GroupedForwardParams& param, hipStream_t stream);
-extern template void run_grouped_infer_causalmask_bias_dropout_dispatch<ck::half_t, true, false, true, 64>(
+extern template void run_grouped_infer_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, true, false, true, 64>(
     GroupedForwardParams& param, hipStream_t stream);
 
-extern template void run_grouped_infer_causalmask_bias_dropout_dispatch<ck::half_t, false, true, false, 64>(
+extern template void run_grouped_infer_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, false, true, false, 64>(
     GroupedForwardParams& param, hipStream_t stream);
-extern template void run_grouped_infer_causalmask_bias_dropout_dispatch<ck::half_t, false, false, false, 64>(
+extern template void run_grouped_infer_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, false, false, false, 64>(
     GroupedForwardParams& param, hipStream_t stream);
-extern template void run_grouped_infer_causalmask_bias_dropout_dispatch<ck::half_t, true, true, false, 64>(
+extern template void run_grouped_infer_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, true, true, false, 64>(
     GroupedForwardParams& param, hipStream_t stream);
-extern template void run_grouped_infer_causalmask_bias_dropout_dispatch<ck::half_t, true, false, false, 64>(
+extern template void run_grouped_infer_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, true, false, false, 64>(
     GroupedForwardParams& param, hipStream_t stream);
 
-extern template void run_grouped_infer_causalmask_bias_dropout_dispatch<ck::half_t, false, true, true, 128>(
+extern template void run_grouped_infer_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, false, true, true, 128>(
     GroupedForwardParams& param, hipStream_t stream);
-extern template void run_grouped_infer_causalmask_bias_dropout_dispatch<ck::half_t, false, false, true, 128>(
+extern template void run_grouped_infer_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, false, false, true, 128>(
     GroupedForwardParams& param, hipStream_t stream);
-extern template void run_grouped_infer_causalmask_bias_dropout_dispatch<ck::half_t, true, true, true, 128>(
+extern template void run_grouped_infer_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, true, true, true, 128>(
     GroupedForwardParams& param, hipStream_t stream);
-extern template void run_grouped_infer_causalmask_bias_dropout_dispatch<ck::half_t, true, false, true, 128>(
+extern template void run_grouped_infer_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, true, false, true, 128>(
     GroupedForwardParams& param, hipStream_t stream);
 
-extern template void run_grouped_infer_causalmask_bias_dropout_dispatch<ck::half_t, false, true, false, 128>(
+extern template void run_grouped_infer_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, false, true, false, 128>(
     GroupedForwardParams& param, hipStream_t stream);
-extern template void run_grouped_infer_causalmask_bias_dropout_dispatch<ck::half_t, false, false, false, 128>(
+extern template void run_grouped_infer_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, false, false, false, 128>(
     GroupedForwardParams& param, hipStream_t stream);
-extern template void run_grouped_infer_causalmask_bias_dropout_dispatch<ck::half_t, true, true, false, 128>(
+extern template void run_grouped_infer_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, true, true, false, 128>(
     GroupedForwardParams& param, hipStream_t stream);
-extern template void run_grouped_infer_causalmask_bias_dropout_dispatch<ck::half_t, true, false, false, 128>(
+extern template void run_grouped_infer_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, true, false, false, 128>(
     GroupedForwardParams& param, hipStream_t stream);
 
-extern template void run_grouped_infer_causalmask_bias_dropout_dispatch<ck::half_t, false, true, true, 256>(
+extern template void run_grouped_infer_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, false, true, true, 256>(
     GroupedForwardParams& param, hipStream_t stream);
-extern template void run_grouped_infer_causalmask_bias_dropout_dispatch<ck::half_t, false, false, true, 256>(
+extern template void run_grouped_infer_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, false, false, true, 256>(
     GroupedForwardParams& param, hipStream_t stream);
-extern template void run_grouped_infer_causalmask_bias_dropout_dispatch<ck::half_t, true, true, true, 256>(
+extern template void run_grouped_infer_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, true, true, true, 256>(
     GroupedForwardParams& param, hipStream_t stream);
-extern template void run_grouped_infer_causalmask_bias_dropout_dispatch<ck::half_t, true, false, true, 256>(
+extern template void run_grouped_infer_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, true, false, true, 256>(
     GroupedForwardParams& param, hipStream_t stream);
 
-extern template void run_grouped_infer_causalmask_bias_dropout_dispatch<ck::half_t, false, true, false, 256>(
+extern template void run_grouped_infer_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, false, true, false, 256>(
     GroupedForwardParams& param, hipStream_t stream);
-extern template void run_grouped_infer_causalmask_bias_dropout_dispatch<ck::half_t, false, false, false, 256>(
+extern template void run_grouped_infer_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, false, false, false, 256>(
     GroupedForwardParams& param, hipStream_t stream);
-extern template void run_grouped_infer_causalmask_bias_dropout_dispatch<ck::half_t, true, true, false, 256>(
+extern template void run_grouped_infer_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, true, true, false, 256>(
     GroupedForwardParams& param, hipStream_t stream);
-extern template void run_grouped_infer_causalmask_bias_dropout_dispatch<ck::half_t, true, false, false, 256>(
+extern template void run_grouped_infer_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, true, false, false, 256>(
     GroupedForwardParams& param, hipStream_t stream);
 // clang-format on
 
@@ -91,14 +90,14 @@ void grouped_infer_fp16(GroupedForwardParams& param, hipStream_t stream) {
     FMHA_FWD_HEADDIM_SWITCH(param.K, param.Kv, MaxK, [&] {
       if (param.custom_mask_type == 0)
         run_grouped_infer_causalmask_bias_dropout_dispatch<
-            ck::half_t,
+            ck_tile::fp16_t,
             false,
             kHasBias,
             kHasDropout,
             MaxK>(param, stream);
       else if (param.custom_mask_type == 1 || param.custom_mask_type == 2)
         run_grouped_infer_causalmask_bias_dropout_dispatch<
-            ck::half_t,
+            ck_tile::fp16_t,
             true,
             kHasBias,
             kHasDropout,
diff --git a/xformers/csrc/attention/hip_fmha/ck_tiled_headdim_switch.h b/xformers/csrc/attention/hip_fmha/ck_tiled_headdim_switch.h
index ccc8ae0ca..18814324b 100644
--- a/xformers/csrc/attention/hip_fmha/ck_tiled_headdim_switch.h
+++ b/xformers/csrc/attention/hip_fmha/ck_tiled_headdim_switch.h
@@ -6,21 +6,22 @@
  */
 #pragma once
 
+#include <ck_tile/core.hpp>
 #include <stdexcept>
 
 #define FMHA_FWD_HEADDIM_SWITCH(HEAD_DIM1, HEAD_DIM2, CONST_NAME, ...) \
   [&] {                                                                \
     if (HEAD_DIM1 <= 32 && HEAD_DIM2 <= 32) {                          \
-      constexpr ck::index_t CONST_NAME = 32;                           \
+      constexpr ck_tile::index_t CONST_NAME = 32;                      \
       __VA_ARGS__();                                                   \
     } else if (HEAD_DIM1 <= 64 && HEAD_DIM2 <= 64) {                   \
-      constexpr ck::index_t CONST_NAME = 64;                           \
+      constexpr ck_tile::index_t CONST_NAME = 64;                      \
       __VA_ARGS__();                                                   \
     } else if (HEAD_DIM1 <= 128 && HEAD_DIM2 <= 128) {                 \
-      constexpr ck::index_t CONST_NAME = 128;                          \
+      constexpr ck_tile::index_t CONST_NAME = 128;                     \
       __VA_ARGS__();                                                   \
     } else if (HEAD_DIM1 <= 256 && HEAD_DIM2 <= 256) {                 \
-      constexpr ck::index_t CONST_NAME = 256;                          \
+      constexpr ck_tile::index_t CONST_NAME = 256;                     \
       __VA_ARGS__();                                                   \
     } else {                                                           \
       throw std::runtime_error("Head-dim sizes not supported!");       \
@@ -30,13 +31,13 @@
 #define FMHA_BWD_HEADDIM_SWITCH(HEAD_DIM1, HEAD_DIM2, CONST_NAME, ...) \
   [&] {                                                                \
     if (HEAD_DIM1 <= 32 && HEAD_DIM2 <= 32) {                          \
-      constexpr ck::index_t CONST_NAME = 32;                           \
+      constexpr ck_tile::index_t CONST_NAME = 32;                      \
       __VA_ARGS__();                                                   \
     } else if (HEAD_DIM1 <= 64 && HEAD_DIM2 <= 64) {                   \
-      constexpr ck::index_t CONST_NAME = 64;                           \
+      constexpr ck_tile::index_t CONST_NAME = 64;                      \
       __VA_ARGS__();                                                   \
     } else if (HEAD_DIM1 <= 128 && HEAD_DIM2 <= 128) {                 \
-      constexpr ck::index_t CONST_NAME = 128;                          \
+      constexpr ck_tile::index_t CONST_NAME = 128;                     \
       __VA_ARGS__();                                                   \
     } else {                                                           \
       throw std::runtime_error("Head-dim sizes not supported!");       \
diff --git a/xformers/csrc/attention/hip_fmha/ck_tiled_rand_uniform_kernel.h b/xformers/csrc/attention/hip_fmha/ck_tiled_rand_uniform_kernel.h
new file mode 100644
index 000000000..e930e0b82
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/ck_tiled_rand_uniform_kernel.h
@@ -0,0 +1,354 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include <ck_tile/core.hpp>
+#include <ck_tile/ops/fmha.hpp>
+#include <ck_tile/ops/gemm/block/block_gemm_areg_bsmem_creg_v2.hpp>
+#include <ck_tile/ops/gemm/pipeline/block_gemm_pipeline_problem.hpp>
+
+template <
+    ck_tile::index_t MPerBlockTile,
+    ck_tile::index_t NPerBlockTile,
+    ck_tile::index_t KPerBlockTile,
+    typename RandValOutputDataType,
+    bool kIsGroupMode>
+struct FmhaRandUniformKernel {
+  static constexpr ck_tile::index_t kBlockSize = 256;
+  static constexpr ck_tile::index_t kBlockPerCu = 1;
+
+  __device__ static constexpr auto GetBlockGemm() {
+    using namespace ck_tile;
+
+    using BlockGemmProblem_ = ck_tile::BlockGemmPipelineProblem<
+        ck_tile::fp16_t,
+        ck_tile::fp16_t,
+        float,
+        kBlockSize,
+        ck_tile::TileGemmShape<MPerBlockTile, NPerBlockTile, KPerBlockTile>>;
+
+    // using the default policy, which use M32xN32xK8 warp_tile
+    return ck_tile::BlockGemmARegBSmemCRegV2<BlockGemmProblem_>{};
+  };
+
+  using BlockGemm = decltype(GetBlockGemm());
+
+  static constexpr bool kPadSeqLenQ = true;
+  static constexpr bool kPadSeqLenK = true;
+
+  using BlockGemmShape =
+      ck_tile::remove_cvref_t<typename BlockGemm::BlockGemmShape>;
+  static constexpr ck_tile::index_t kMPerBlock = BlockGemmShape::kM;
+  static constexpr ck_tile::index_t kNPerBlock = BlockGemmShape::kN;
+
+  // kargs use aggregate initializer, so no constructor will provided
+  // use inheritance to minimize karg size
+  // user need to use MakeKargs() function to create kargs.
+  struct FmhaRandUniformCommonKargs {
+    void* rand_val_ptr;
+
+    ck_tile::index_t seqlen_q;
+    ck_tile::index_t seqlen_k;
+
+    ck_tile::index_t num_heads;
+    ck_tile::index_t num_batches;
+
+    ck_tile::index_t stride_seqlen_q;
+    ck_tile::index_t stride_seqlen_k;
+
+    ck_tile::index_t stride_nhead;
+
+    uint64_t seed = 1;
+    uint64_t offset = 0;
+  };
+
+  struct FmhaRandUniformBatchModeKargs : FmhaRandUniformCommonKargs {
+    ck_tile::index_t stride_batch;
+  };
+
+  struct FmhaRandUniformGroupModeKargs : FmhaRandUniformCommonKargs {
+    const int32_t* seqstart_q_ptr;
+    const int32_t* seqstart_k_ptr;
+    const int32_t* seqlen_k_ptr;
+  };
+
+  using Kargs = std::conditional_t<
+      kIsGroupMode,
+      FmhaRandUniformGroupModeKargs,
+      FmhaRandUniformBatchModeKargs>;
+
+  template <bool Cond = !kIsGroupMode>
+  __host__ static constexpr std::enable_if_t<Cond, Kargs> MakeKargs(
+      void* rand_val_ptr,
+      ck_tile::index_t seqlen_q,
+      ck_tile::index_t seqlen_k,
+      ck_tile::index_t num_heads,
+      ck_tile::index_t num_batches,
+      ck_tile::index_t stride_seqlen_q,
+      ck_tile::index_t stride_seqlen_k,
+      ck_tile::index_t stride_nhead,
+      ck_tile::index_t stride_batch,
+      std::tuple<uint64_t, uint64_t> drop_seed_offset) {
+    Kargs kargs{
+        {rand_val_ptr,
+         seqlen_q,
+         seqlen_k,
+         num_heads,
+         num_batches,
+         stride_seqlen_q,
+         stride_seqlen_k,
+         stride_nhead,
+         std::get<0>(drop_seed_offset),
+         std::get<1>(drop_seed_offset)},
+        stride_batch};
+
+    return kargs;
+  }
+
+  template <bool Cond = kIsGroupMode>
+  __host__ static constexpr std::enable_if_t<Cond, Kargs> MakeKargs(
+      void* rand_val_ptr,
+      ck_tile::index_t seqlen_q,
+      ck_tile::index_t seqlen_k,
+      ck_tile::index_t num_heads,
+      ck_tile::index_t num_batches,
+      ck_tile::index_t stride_seqlen_q,
+      ck_tile::index_t stride_seqlen_k,
+      ck_tile::index_t stride_nhead,
+      const void* seqstart_q_ptr,
+      const void* seqstart_k_ptr,
+      const void* seqlen_k_ptr,
+      std::tuple<uint64_t, uint64_t> drop_seed_offset) {
+    Kargs kargs{
+        {rand_val_ptr,
+         seqlen_q,
+         seqlen_k,
+         num_heads,
+         num_batches,
+         stride_seqlen_q,
+         stride_seqlen_k,
+         stride_nhead,
+         std::get<0>(drop_seed_offset),
+         std::get<1>(drop_seed_offset)},
+        reinterpret_cast<const int32_t*>(seqstart_q_ptr),
+        reinterpret_cast<const int32_t*>(seqstart_k_ptr),
+        reinterpret_cast<const int32_t*>(seqlen_k_ptr)};
+
+    return kargs;
+  }
+
+  __host__ static constexpr auto GridSize(
+      ck_tile::index_t batch_size_,
+      ck_tile::index_t nhead_,
+      ck_tile::index_t seqlen_q_,
+      ck_tile::index_t seqlen_k_) {
+    (void)seqlen_k_; // not used at present
+
+    // at present, seqlen_k is not splitted by thread-groups
+    return dim3(
+        ck_tile::integer_divide_ceil(seqlen_q_, kMPerBlock),
+        nhead_,
+        batch_size_);
+  }
+
+  __device__ static constexpr auto GetTileIndex(
+      ck_tile::index_t seqlen_q_,
+      ck_tile::index_t seqlen_k_) {
+    (void)seqlen_q_; // not used at present
+    (void)seqlen_k_; // not used at present
+
+    const ck_tile::index_t i_block = blockIdx.x;
+    const ck_tile::index_t i_nhead = blockIdx.y;
+    const ck_tile::index_t i_batch = blockIdx.z;
+
+    return ck_tile::make_tuple(i_block, i_nhead, i_batch);
+  }
+
+  __host__ static constexpr auto BlockSize() {
+    return dim3(kBlockSize);
+  }
+
+  __device__ static constexpr ck_tile::index_t GetSmemSize() {
+    return ck_tile::BlockDropout::MakeRandValLdsBlockDescriptor<BlockGemm>()
+        .get_element_space_size();
+  }
+
+  template <typename RandValDramBlockWindowTmp>
+  __device__ void main_loop(
+      const Kargs& kargs,
+      const ck_tile::philox& ph,
+      void* randval_smem_ptr,
+      RandValDramBlockWindowTmp& randval_dram_block_window_tmp) const {
+    using namespace ck_tile;
+
+    auto randval_dram_window = BlockDropout::MakeRandvalDramWindow<BlockGemm>(
+        randval_dram_block_window_tmp, 0);
+
+    const auto num_total_loop =
+        ck_tile::integer_divide_ceil(kargs.seqlen_k, kNPerBlock);
+    index_t i_total_loops = 0;
+
+    do {
+      constexpr auto config = BlockGemm::Policy::template GetWarpGemmMWarpNWarp<
+          typename BlockGemm::Problem>();
+      using WG = remove_cvref_t<decltype(config.template at<0>())>;
+      constexpr index_t MWarp = config.template at<1>();
+      constexpr index_t NWarp = config.template at<2>();
+      constexpr index_t kMPerStep = MWarp * WG::kM;
+      constexpr index_t kNPerStep = NWarp * WG::kN;
+
+      // randval tile in LDS
+      auto randval_lds = make_tensor_view<address_space_enum::lds>(
+          reinterpret_cast<uint8_t*>(randval_smem_ptr),
+          BlockDropout::MakeRandValLdsBlockDescriptor<BlockGemm>());
+
+      auto randval_lds_window = make_tile_window(
+          randval_lds,
+          BlockDropout::MakeRandValLdsBlockDescriptor<BlockGemm>()
+              .get_lengths(),
+          {0, 0});
+
+      // register distribute
+      auto randval_dist_generated = make_static_distributed_tensor<uint8_t>(
+          BlockDropout::MakeRandValTileDistribution<BlockGemm>());
+
+      static_assert(randval_dist_generated.kThreadElementSpaceSize == 16);
+
+      auto randval_lds_read_window = make_tile_window(
+          randval_lds_window.get_bottom_tensor_view(),
+          randval_lds_window.get_window_lengths(),
+          randval_lds_window.get_window_origin(),
+          BlockDropout::MakeRandValLdsShuffleTileDistribution<BlockGemm>());
+
+      const int start_m0_idx =
+          randval_dram_window.get_window_origin().at(number<0>{});
+      const int start_n0_idx = i_total_loops * kNPerBlock;
+
+      static_for<0, kMPerBlock / kMPerStep, 1>{}([&](auto i_m0) {
+        static_for<0, kNPerBlock / kNPerStep, 1>{}([&](auto i_n0) {
+          const auto [block_row_start, block_col_start] = [&]() {
+            if constexpr (MWarp > 1) {
+              int block_row_start_ =
+                  (start_m0_idx / WG::kM) + (i_m0 * MWarp) + get_warp_id();
+              int block_col_start_ = start_n0_idx / WG::kN + i_n0;
+              return make_tuple(block_row_start_, block_col_start_);
+            } else {
+              int block_row_start_ = (start_m0_idx / WG::kM) + i_m0;
+              int block_col_start_ =
+                  (start_n0_idx / WG::kN) + (i_n0 * NWarp) + get_warp_id();
+              return make_tuple(block_row_start_, block_col_start_);
+            };
+          }();
+
+          uint2 rowcol = make_uint2(block_row_start, block_col_start);
+
+          // generate random number
+          uint8_t random_uint8_t[16];
+          ph.get_random_16x8(
+              random_uint8_t, reinterpret_cast<unsigned long long&>(rowcol));
+
+          constexpr auto randval_dist_generated_spans =
+              decltype(randval_dist_generated)::get_distributed_spans();
+          int i_random_idx = 0;
+          sweep_tile_span(
+              randval_dist_generated_spans[number<0>{}], [&](auto idx0) {
+                sweep_tile_span(
+                    randval_dist_generated_spans[number<1>{}], [&](auto idx1) {
+                      constexpr auto i_j_idx = make_tuple(idx0, idx1);
+                      randval_dist_generated(i_j_idx) =
+                          random_uint8_t[i_random_idx++];
+                    });
+              });
+          // save to LDS
+          store_tile(randval_lds_window, randval_dist_generated);
+          block_sync_lds();
+          // read from LDS to register
+          auto randval = load_tile(randval_lds_read_window);
+          // save to Global
+          const auto randval_store = cast_tile<RandValOutputDataType>(randval);
+          store_tile(randval_dram_window, randval_store);
+          move_tile_window(randval_dram_window, {0, kNPerStep});
+        });
+        move_tile_window(randval_dram_window, {kMPerStep, -kNPerBlock});
+      });
+
+      move_tile_window(randval_dram_window, {-kMPerBlock, kNPerBlock});
+
+    } while (++i_total_loops < num_total_loop);
+  }
+
+  __device__ void operator()(Kargs kargs) const {
+    using namespace ck_tile;
+
+    // allocate LDS
+    __shared__ char smem_ptr[GetSmemSize()];
+
+    // divide problem
+    const auto [i_tile_m, i_nhead, i_batch] =
+        GetTileIndex(kargs.seqlen_q, kargs.seqlen_k);
+
+    const index_t i_m0 = __builtin_amdgcn_readfirstlane(i_tile_m * kMPerBlock);
+
+    long_index_t batch_offset_randval = 0;
+
+    if constexpr (kIsGroupMode) {
+      // get starting offset for each batch
+      const long_index_t query_start = kargs.seqstart_q_ptr[i_batch];
+
+      batch_offset_randval = query_start * kargs.stride_seqlen_q;
+
+      // get real # queries & # keys under group mode
+      const auto adjusted_seqstart_q_ptr = kargs.seqstart_q_ptr + i_batch;
+      kargs.seqlen_q = adjusted_seqstart_q_ptr[1] - adjusted_seqstart_q_ptr[0];
+
+      if (kargs.seqlen_q <= i_m0) {
+        return;
+      }
+
+      if (kargs.seqlen_k_ptr != nullptr) {
+        kargs.seqlen_k = kargs.seqlen_k_ptr[i_batch];
+      } else {
+        const auto adjusted_seqstart_k_ptr = kargs.seqstart_k_ptr + i_batch;
+        kargs.seqlen_k =
+            adjusted_seqstart_k_ptr[1] - adjusted_seqstart_k_ptr[0];
+      }
+    } else {
+      batch_offset_randval =
+          static_cast<long_index_t>(i_batch) * kargs.stride_batch;
+    }
+
+    constexpr auto randval_dram_window_lengths =
+        make_tuple(number<kMPerBlock>{}, number<kNPerBlock>{});
+
+    RandValOutputDataType* rand_val_ptr =
+        reinterpret_cast<RandValOutputDataType*>(kargs.rand_val_ptr) +
+        static_cast<long_index_t>(i_nhead) * kargs.stride_nhead +
+        batch_offset_randval;
+
+    const auto randval_dram = [&]() {
+      const auto randval_dram_naive =
+          make_naive_tensor_view<address_space_enum::global>(
+              rand_val_ptr,
+              make_tuple(kargs.seqlen_q, kargs.seqlen_k),
+              make_tuple(kargs.stride_seqlen_q, kargs.stride_seqlen_k),
+              number<1>{},
+              number<1>{});
+
+      return pad_tensor_view(
+          randval_dram_naive,
+          randval_dram_window_lengths,
+          ck_tile::sequence<kPadSeqLenQ, kPadSeqLenK>{});
+    }();
+
+    auto randval_dram_block_window_tmp =
+        make_tile_window(randval_dram, randval_dram_window_lengths, {i_m0, 0});
+
+    ck_tile::philox ph(
+        kargs.seed,
+        kargs.offset + (i_batch * kargs.num_heads + i_nhead) * get_warp_size() +
+            get_lane_id());
+
+    main_loop(kargs, ph, smem_ptr, randval_dram_block_window_tmp);
+  }
+};
diff --git a/xformers/csrc/attention/hip_fmha/generate_instances.py b/xformers/csrc/attention/hip_fmha/generate_instances.py
index f835ad82f..9640752fa 100644
--- a/xformers/csrc/attention/hip_fmha/generate_instances.py
+++ b/xformers/csrc/attention/hip_fmha/generate_instances.py
@@ -9,7 +9,7 @@
 
 FMHA_INSTANCE_HEADER = """
 /*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -19,7 +19,7 @@
 """
 
 FMHA_INFER_INSTANCE_TEMPLATE="""
-#include <ck/ck.hpp>
+#include <ck_tile/core/numeric/{dtype_file}.hpp>
 #include \"ck_tiled_fmha_{mode}_infer.h\"
 
 template void run_{mode}_infer_causalmask_bias_dropout_dispatch<
@@ -33,7 +33,7 @@
 FMHA_INFER_INSTANCE_FNAME="fmha_{mode}_infer_{dtype_str}_{has_or_no_causalmask_str}_{has_or_no_bias_str}_{has_or_no_dropout_str}_{max_k_str}.cpp"
 
 FMHA_FORWARD_INSTANCE_TEMPLATE="""
-#include <ck/ck.hpp>
+#include <ck_tile/core/numeric/{dtype_file}.hpp>
 #include \"ck_tiled_fmha_{mode}_forward.h\"
 
 template void run_{mode}_forward_causalmask_bias_dropout_dispatch<
@@ -47,7 +47,7 @@
 FMHA_FORWARD_INSTANCE_FNAME="fmha_{mode}_forward_{dtype_str}_{has_or_no_causalmask_str}_{has_or_no_bias_str}_{has_or_no_dropout_str}_{max_k_str}.cpp"
 
 FMHA_BACKWARD_INSTANCE_TEMPLATE="""
-#include <ck/ck.hpp>
+#include <ck_tile/core/numeric/{dtype_file}.hpp>
 #include \"ck_tiled_fmha_{mode}_backward.h\"
 
 template void run_{mode}_backward_causalmask_bias_dropout_dispatch<
@@ -94,8 +94,13 @@
 }
 
 TYPE_CTYPE_MAP = {
-   "fp16" : "ck::half_t", 
-   "bp16" : "ck::bhalf_t",
+   "fp16" : "ck_tile::fp16_t", 
+   "bf16" : "ck_tile::bf16_t",
+}
+
+TYPE_FNAME_MAP = {
+   "fp16" : "half", 
+   "bf16" : "bfloat16",
 }
 
 MODE_NAME_MAP = {
@@ -105,7 +110,7 @@
 
 def create_infer_instances(instance_dir: Path) -> None:
      for mode in ["batched", "grouped"]: 
-         for dtype in ["fp16", "bp16"]: 
+         for dtype in ["fp16", "bf16"]: 
              for has_causalmask in [True, False]: 
                  for has_bias in [True, False]:
                      for has_dropout in [True, False]:
@@ -120,6 +125,7 @@ def create_infer_instances(instance_dir: Path) -> None:
                                       )
                               infer_instance = FMHA_INFER_INSTANCE_TEMPLATE.format(
                                       mode=mode,
+                                      dtype_file=TYPE_FNAME_MAP[dtype],
                                       dtype=TYPE_CTYPE_MAP[dtype], 
                                       has_causalmask=BOOL_MAP[has_causalmask],
                                       has_bias=BOOL_MAP[has_bias],
@@ -131,7 +137,7 @@ def create_infer_instances(instance_dir: Path) -> None:
 
 def create_forward_instances(instance_dir: Path) -> None:
      for mode in ["batched", "grouped"]:
-         for dtype in ["fp16", "bp16"]:
+         for dtype in ["fp16", "bf16"]:
              for has_causalmask in [True, False]:
                  for has_bias in [True, False]:
                      for has_dropout in [True, False]:
@@ -146,6 +152,7 @@ def create_forward_instances(instance_dir: Path) -> None:
                                       )
                               infer_instance = FMHA_FORWARD_INSTANCE_TEMPLATE.format(
                                       mode=mode,
+                                      dtype_file=TYPE_FNAME_MAP[dtype],
                                       dtype=TYPE_CTYPE_MAP[dtype],
                                       has_causalmask=BOOL_MAP[has_causalmask],
                                       has_bias=BOOL_MAP[has_bias],
@@ -157,7 +164,7 @@ def create_forward_instances(instance_dir: Path) -> None:
 
 def create_backward_instances(instance_dir: Path) -> None:
      for mode in ["batched", "grouped"]:
-         for dtype in ["fp16", "bp16"]:
+         for dtype in ["fp16", "bf16"]:
              for has_causalmask in [True, False]:
                  for has_bias, has_bias_grad in [[True, False], [True, True], [False, False]]:
                      for has_dropout in [True, False]:
@@ -173,6 +180,7 @@ def create_backward_instances(instance_dir: Path) -> None:
                                       )
                               infer_instance = FMHA_BACKWARD_INSTANCE_TEMPLATE.format(
                                       mode=mode,
+                                      dtype_file=TYPE_FNAME_MAP[dtype],
                                       dtype=TYPE_CTYPE_MAP[dtype],
                                       has_causalmask=BOOL_MAP[has_causalmask],
                                       has_bias=BOOL_MAP[has_bias],
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_has_causalmask_has_bias_has_biasgrad_has_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_has_causalmask_has_bias_has_biasgrad_has_dropout_maxk_128.cpp
similarity index 75%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_has_causalmask_has_bias_has_biasgrad_has_dropout_maxk_128.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_has_causalmask_has_bias_has_biasgrad_has_dropout_maxk_128.cpp
index f47ea8913..97f209cb6 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_has_causalmask_has_bias_has_biasgrad_has_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_has_causalmask_has_bias_has_biasgrad_has_dropout_maxk_128.cpp
@@ -1,6 +1,6 @@
 
 /*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -8,11 +8,11 @@
  * The file is automatically generated, don't modify!
  */
 
-#include <ck/ck.hpp>
+#include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_batched_backward.h"
 
 template void run_batched_backward_causalmask_bias_dropout_dispatch<
-    ck::bhalf_t,
+    ck_tile::bf16_t,
     true,
     true,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_has_causalmask_has_bias_has_biasgrad_has_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_has_causalmask_has_bias_has_biasgrad_has_dropout_maxk_32.cpp
similarity index 75%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_has_causalmask_has_bias_has_biasgrad_has_dropout_maxk_32.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_has_causalmask_has_bias_has_biasgrad_has_dropout_maxk_32.cpp
index 80872bc87..5c0e89e21 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_has_causalmask_has_bias_has_biasgrad_has_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_has_causalmask_has_bias_has_biasgrad_has_dropout_maxk_32.cpp
@@ -1,6 +1,6 @@
 
 /*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -8,11 +8,11 @@
  * The file is automatically generated, don't modify!
  */
 
-#include <ck/ck.hpp>
+#include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_batched_backward.h"
 
 template void run_batched_backward_causalmask_bias_dropout_dispatch<
-    ck::bhalf_t,
+    ck_tile::bf16_t,
     true,
     true,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_has_causalmask_has_bias_has_biasgrad_has_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_has_causalmask_has_bias_has_biasgrad_has_dropout_maxk_64.cpp
similarity index 75%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_has_causalmask_has_bias_has_biasgrad_has_dropout_maxk_64.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_has_causalmask_has_bias_has_biasgrad_has_dropout_maxk_64.cpp
index 1b7eb3fa1..5e3392493 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_has_causalmask_has_bias_has_biasgrad_has_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_has_causalmask_has_bias_has_biasgrad_has_dropout_maxk_64.cpp
@@ -1,6 +1,6 @@
 
 /*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -8,11 +8,11 @@
  * The file is automatically generated, don't modify!
  */
 
-#include <ck/ck.hpp>
+#include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_batched_backward.h"
 
 template void run_batched_backward_causalmask_bias_dropout_dispatch<
-    ck::bhalf_t,
+    ck_tile::bf16_t,
     true,
     true,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_no_causalmask_has_bias_has_biasgrad_has_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_has_causalmask_has_bias_has_biasgrad_no_dropout_maxk_128.cpp
similarity index 75%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_no_causalmask_has_bias_has_biasgrad_has_dropout_maxk_128.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_has_causalmask_has_bias_has_biasgrad_no_dropout_maxk_128.cpp
index 752e5a535..ae9158e21 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_no_causalmask_has_bias_has_biasgrad_has_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_has_causalmask_has_bias_has_biasgrad_no_dropout_maxk_128.cpp
@@ -1,6 +1,6 @@
 
 /*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -8,13 +8,13 @@
  * The file is automatically generated, don't modify!
  */
 
-#include <ck/ck.hpp>
+#include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_batched_backward.h"
 
 template void run_batched_backward_causalmask_bias_dropout_dispatch<
-    ck::bhalf_t,
-    false,
+    ck_tile::bf16_t,
     true,
     true,
     true,
+    false,
     128>(BatchedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_has_causalmask_has_bias_has_biasgrad_no_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_has_causalmask_has_bias_has_biasgrad_no_dropout_maxk_32.cpp
similarity index 75%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_has_causalmask_has_bias_has_biasgrad_no_dropout_maxk_32.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_has_causalmask_has_bias_has_biasgrad_no_dropout_maxk_32.cpp
index b7183ced4..dfc929276 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_has_causalmask_has_bias_has_biasgrad_no_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_has_causalmask_has_bias_has_biasgrad_no_dropout_maxk_32.cpp
@@ -1,6 +1,6 @@
 
 /*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -8,11 +8,11 @@
  * The file is automatically generated, don't modify!
  */
 
-#include <ck/ck.hpp>
+#include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_batched_backward.h"
 
 template void run_batched_backward_causalmask_bias_dropout_dispatch<
-    ck::bhalf_t,
+    ck_tile::bf16_t,
     true,
     true,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_has_causalmask_has_bias_no_biasgrad_has_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_has_causalmask_has_bias_has_biasgrad_no_dropout_maxk_64.cpp
similarity index 75%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_has_causalmask_has_bias_no_biasgrad_has_dropout_maxk_64.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_has_causalmask_has_bias_has_biasgrad_no_dropout_maxk_64.cpp
index a10d6a1bc..a915f8aa5 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_has_causalmask_has_bias_no_biasgrad_has_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_has_causalmask_has_bias_has_biasgrad_no_dropout_maxk_64.cpp
@@ -1,6 +1,6 @@
 
 /*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -8,13 +8,13 @@
  * The file is automatically generated, don't modify!
  */
 
-#include <ck/ck.hpp>
+#include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_batched_backward.h"
 
 template void run_batched_backward_causalmask_bias_dropout_dispatch<
-    ck::bhalf_t,
+    ck_tile::bf16_t,
     true,
     true,
-    false,
     true,
+    false,
     64>(BatchedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_has_causalmask_has_bias_no_biasgrad_has_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_has_causalmask_has_bias_no_biasgrad_has_dropout_maxk_128.cpp
similarity index 75%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_has_causalmask_has_bias_no_biasgrad_has_dropout_maxk_128.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_has_causalmask_has_bias_no_biasgrad_has_dropout_maxk_128.cpp
index 70d77321e..7e17c9298 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_has_causalmask_has_bias_no_biasgrad_has_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_has_causalmask_has_bias_no_biasgrad_has_dropout_maxk_128.cpp
@@ -1,6 +1,6 @@
 
 /*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -8,11 +8,11 @@
  * The file is automatically generated, don't modify!
  */
 
-#include <ck/ck.hpp>
+#include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_batched_backward.h"
 
 template void run_batched_backward_causalmask_bias_dropout_dispatch<
-    ck::bhalf_t,
+    ck_tile::bf16_t,
     true,
     true,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_no_causalmask_has_bias_has_biasgrad_has_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_has_causalmask_has_bias_no_biasgrad_has_dropout_maxk_32.cpp
similarity index 75%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_no_causalmask_has_bias_has_biasgrad_has_dropout_maxk_32.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_has_causalmask_has_bias_no_biasgrad_has_dropout_maxk_32.cpp
index 2296da150..8d980af34 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_no_causalmask_has_bias_has_biasgrad_has_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_has_causalmask_has_bias_no_biasgrad_has_dropout_maxk_32.cpp
@@ -1,6 +1,6 @@
 
 /*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -8,13 +8,13 @@
  * The file is automatically generated, don't modify!
  */
 
-#include <ck/ck.hpp>
+#include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_batched_backward.h"
 
 template void run_batched_backward_causalmask_bias_dropout_dispatch<
-    ck::bhalf_t,
-    false,
+    ck_tile::bf16_t,
     true,
     true,
+    false,
     true,
     32>(BatchedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_has_causalmask_has_bias_has_biasgrad_no_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_has_causalmask_has_bias_no_biasgrad_has_dropout_maxk_64.cpp
similarity index 75%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_has_causalmask_has_bias_has_biasgrad_no_dropout_maxk_64.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_has_causalmask_has_bias_no_biasgrad_has_dropout_maxk_64.cpp
index 0a5135581..be31aa59b 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_has_causalmask_has_bias_has_biasgrad_no_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_has_causalmask_has_bias_no_biasgrad_has_dropout_maxk_64.cpp
@@ -1,6 +1,6 @@
 
 /*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -8,13 +8,13 @@
  * The file is automatically generated, don't modify!
  */
 
-#include <ck/ck.hpp>
+#include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_batched_backward.h"
 
 template void run_batched_backward_causalmask_bias_dropout_dispatch<
-    ck::bhalf_t,
-    true,
+    ck_tile::bf16_t,
     true,
     true,
     false,
+    true,
     64>(BatchedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_has_causalmask_no_bias_no_biasgrad_has_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_has_causalmask_has_bias_no_biasgrad_no_dropout_maxk_128.cpp
similarity index 75%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_has_causalmask_no_bias_no_biasgrad_has_dropout_maxk_128.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_has_causalmask_has_bias_no_biasgrad_no_dropout_maxk_128.cpp
index b3a40e957..7ea9cb0a9 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_has_causalmask_no_bias_no_biasgrad_has_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_has_causalmask_has_bias_no_biasgrad_no_dropout_maxk_128.cpp
@@ -1,6 +1,6 @@
 
 /*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -8,13 +8,13 @@
  * The file is automatically generated, don't modify!
  */
 
-#include <ck/ck.hpp>
+#include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_batched_backward.h"
 
 template void run_batched_backward_causalmask_bias_dropout_dispatch<
-    ck::bhalf_t,
+    ck_tile::bf16_t,
+    true,
     true,
     false,
     false,
-    true,
     128>(BatchedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_no_causalmask_has_bias_no_biasgrad_has_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_has_causalmask_has_bias_no_biasgrad_no_dropout_maxk_32.cpp
similarity index 75%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_no_causalmask_has_bias_no_biasgrad_has_dropout_maxk_32.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_has_causalmask_has_bias_no_biasgrad_no_dropout_maxk_32.cpp
index 27ab35a1b..a2a9dd4d6 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_no_causalmask_has_bias_no_biasgrad_has_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_has_causalmask_has_bias_no_biasgrad_no_dropout_maxk_32.cpp
@@ -1,6 +1,6 @@
 
 /*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -8,13 +8,13 @@
  * The file is automatically generated, don't modify!
  */
 
-#include <ck/ck.hpp>
+#include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_batched_backward.h"
 
 template void run_batched_backward_causalmask_bias_dropout_dispatch<
-    ck::bhalf_t,
-    false,
+    ck_tile::bf16_t,
     true,
-    false,
     true,
+    false,
+    false,
     32>(BatchedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_no_causalmask_has_bias_no_biasgrad_has_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_has_causalmask_has_bias_no_biasgrad_no_dropout_maxk_64.cpp
similarity index 75%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_no_causalmask_has_bias_no_biasgrad_has_dropout_maxk_64.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_has_causalmask_has_bias_no_biasgrad_no_dropout_maxk_64.cpp
index d2508d993..594a62ff5 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_no_causalmask_has_bias_no_biasgrad_has_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_has_causalmask_has_bias_no_biasgrad_no_dropout_maxk_64.cpp
@@ -1,6 +1,6 @@
 
 /*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -8,13 +8,13 @@
  * The file is automatically generated, don't modify!
  */
 
-#include <ck/ck.hpp>
+#include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_batched_backward.h"
 
 template void run_batched_backward_causalmask_bias_dropout_dispatch<
-    ck::bhalf_t,
-    false,
+    ck_tile::bf16_t,
     true,
-    false,
     true,
+    false,
+    false,
     64>(BatchedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_no_causalmask_has_bias_no_biasgrad_has_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_has_causalmask_no_bias_no_biasgrad_has_dropout_maxk_128.cpp
similarity index 75%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_no_causalmask_has_bias_no_biasgrad_has_dropout_maxk_128.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_has_causalmask_no_bias_no_biasgrad_has_dropout_maxk_128.cpp
index 489bdd9a5..0307f9ab2 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_no_causalmask_has_bias_no_biasgrad_has_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_has_causalmask_no_bias_no_biasgrad_has_dropout_maxk_128.cpp
@@ -1,6 +1,6 @@
 
 /*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -8,13 +8,13 @@
  * The file is automatically generated, don't modify!
  */
 
-#include <ck/ck.hpp>
+#include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_batched_backward.h"
 
 template void run_batched_backward_causalmask_bias_dropout_dispatch<
-    ck::bhalf_t,
-    false,
+    ck_tile::bf16_t,
     true,
     false,
+    false,
     true,
     128>(BatchedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_has_causalmask_no_bias_no_biasgrad_has_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_has_causalmask_no_bias_no_biasgrad_has_dropout_maxk_32.cpp
similarity index 75%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_has_causalmask_no_bias_no_biasgrad_has_dropout_maxk_32.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_has_causalmask_no_bias_no_biasgrad_has_dropout_maxk_32.cpp
index 25b8ae47d..5a7cd479a 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_has_causalmask_no_bias_no_biasgrad_has_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_has_causalmask_no_bias_no_biasgrad_has_dropout_maxk_32.cpp
@@ -1,6 +1,6 @@
 
 /*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -8,11 +8,11 @@
  * The file is automatically generated, don't modify!
  */
 
-#include <ck/ck.hpp>
+#include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_batched_backward.h"
 
 template void run_batched_backward_causalmask_bias_dropout_dispatch<
-    ck::bhalf_t,
+    ck_tile::bf16_t,
     true,
     false,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_no_causalmask_has_bias_has_biasgrad_no_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_has_causalmask_no_bias_no_biasgrad_has_dropout_maxk_64.cpp
similarity index 75%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_no_causalmask_has_bias_has_biasgrad_no_dropout_maxk_64.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_has_causalmask_no_bias_no_biasgrad_has_dropout_maxk_64.cpp
index 5100ac96b..e1280f6d2 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_no_causalmask_has_bias_has_biasgrad_no_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_has_causalmask_no_bias_no_biasgrad_has_dropout_maxk_64.cpp
@@ -1,6 +1,6 @@
 
 /*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -8,13 +8,13 @@
  * The file is automatically generated, don't modify!
  */
 
-#include <ck/ck.hpp>
+#include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_batched_backward.h"
 
 template void run_batched_backward_causalmask_bias_dropout_dispatch<
-    ck::bhalf_t,
-    false,
-    true,
+    ck_tile::bf16_t,
     true,
     false,
+    false,
+    true,
     64>(BatchedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_no_causalmask_has_bias_no_biasgrad_no_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_has_causalmask_no_bias_no_biasgrad_no_dropout_maxk_128.cpp
similarity index 75%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_no_causalmask_has_bias_no_biasgrad_no_dropout_maxk_128.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_has_causalmask_no_bias_no_biasgrad_no_dropout_maxk_128.cpp
index 795744d65..04a107af4 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_no_causalmask_has_bias_no_biasgrad_no_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_has_causalmask_no_bias_no_biasgrad_no_dropout_maxk_128.cpp
@@ -1,6 +1,6 @@
 
 /*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -8,13 +8,13 @@
  * The file is automatically generated, don't modify!
  */
 
-#include <ck/ck.hpp>
+#include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_batched_backward.h"
 
 template void run_batched_backward_causalmask_bias_dropout_dispatch<
-    ck::bhalf_t,
-    false,
+    ck_tile::bf16_t,
     true,
     false,
     false,
+    false,
     128>(BatchedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_has_causalmask_no_bias_no_biasgrad_no_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_has_causalmask_no_bias_no_biasgrad_no_dropout_maxk_32.cpp
similarity index 75%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_has_causalmask_no_bias_no_biasgrad_no_dropout_maxk_32.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_has_causalmask_no_bias_no_biasgrad_no_dropout_maxk_32.cpp
index 40a92b384..0a41a2f27 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_has_causalmask_no_bias_no_biasgrad_no_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_has_causalmask_no_bias_no_biasgrad_no_dropout_maxk_32.cpp
@@ -1,6 +1,6 @@
 
 /*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -8,11 +8,11 @@
  * The file is automatically generated, don't modify!
  */
 
-#include <ck/ck.hpp>
+#include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_batched_backward.h"
 
 template void run_batched_backward_causalmask_bias_dropout_dispatch<
-    ck::bhalf_t,
+    ck_tile::bf16_t,
     true,
     false,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_has_causalmask_no_bias_no_biasgrad_no_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_has_causalmask_no_bias_no_biasgrad_no_dropout_maxk_64.cpp
similarity index 75%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_has_causalmask_no_bias_no_biasgrad_no_dropout_maxk_64.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_has_causalmask_no_bias_no_biasgrad_no_dropout_maxk_64.cpp
index aac83e1bb..49d6b9641 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_has_causalmask_no_bias_no_biasgrad_no_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_has_causalmask_no_bias_no_biasgrad_no_dropout_maxk_64.cpp
@@ -1,6 +1,6 @@
 
 /*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -8,11 +8,11 @@
  * The file is automatically generated, don't modify!
  */
 
-#include <ck/ck.hpp>
+#include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_batched_backward.h"
 
 template void run_batched_backward_causalmask_bias_dropout_dispatch<
-    ck::bhalf_t,
+    ck_tile::bf16_t,
     true,
     false,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_has_causalmask_has_bias_has_biasgrad_no_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_no_causalmask_has_bias_has_biasgrad_has_dropout_maxk_128.cpp
similarity index 75%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_has_causalmask_has_bias_has_biasgrad_no_dropout_maxk_128.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_no_causalmask_has_bias_has_biasgrad_has_dropout_maxk_128.cpp
index fbcbc8673..f5ce7c5bb 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_has_causalmask_has_bias_has_biasgrad_no_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_no_causalmask_has_bias_has_biasgrad_has_dropout_maxk_128.cpp
@@ -1,6 +1,6 @@
 
 /*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -8,13 +8,13 @@
  * The file is automatically generated, don't modify!
  */
 
-#include <ck/ck.hpp>
+#include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_batched_backward.h"
 
 template void run_batched_backward_causalmask_bias_dropout_dispatch<
-    ck::bhalf_t,
+    ck_tile::bf16_t,
+    false,
     true,
     true,
     true,
-    false,
     128>(BatchedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_has_causalmask_has_bias_no_biasgrad_has_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_no_causalmask_has_bias_has_biasgrad_has_dropout_maxk_32.cpp
similarity index 75%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_has_causalmask_has_bias_no_biasgrad_has_dropout_maxk_32.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_no_causalmask_has_bias_has_biasgrad_has_dropout_maxk_32.cpp
index 946da70a2..41ff265c7 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_has_causalmask_has_bias_no_biasgrad_has_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_no_causalmask_has_bias_has_biasgrad_has_dropout_maxk_32.cpp
@@ -1,6 +1,6 @@
 
 /*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -8,13 +8,13 @@
  * The file is automatically generated, don't modify!
  */
 
-#include <ck/ck.hpp>
+#include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_batched_backward.h"
 
 template void run_batched_backward_causalmask_bias_dropout_dispatch<
-    ck::bhalf_t,
+    ck_tile::bf16_t,
+    false,
     true,
     true,
-    false,
     true,
     32>(BatchedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_no_causalmask_has_bias_has_biasgrad_has_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_no_causalmask_has_bias_has_biasgrad_has_dropout_maxk_64.cpp
similarity index 75%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_no_causalmask_has_bias_has_biasgrad_has_dropout_maxk_64.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_no_causalmask_has_bias_has_biasgrad_has_dropout_maxk_64.cpp
index 68876d1ee..f6b776650 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_no_causalmask_has_bias_has_biasgrad_has_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_no_causalmask_has_bias_has_biasgrad_has_dropout_maxk_64.cpp
@@ -1,6 +1,6 @@
 
 /*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -8,11 +8,11 @@
  * The file is automatically generated, don't modify!
  */
 
-#include <ck/ck.hpp>
+#include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_batched_backward.h"
 
 template void run_batched_backward_causalmask_bias_dropout_dispatch<
-    ck::bhalf_t,
+    ck_tile::bf16_t,
     false,
     true,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_has_causalmask_has_bias_no_biasgrad_no_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_no_causalmask_has_bias_has_biasgrad_no_dropout_maxk_128.cpp
similarity index 75%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_has_causalmask_has_bias_no_biasgrad_no_dropout_maxk_128.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_no_causalmask_has_bias_has_biasgrad_no_dropout_maxk_128.cpp
index 74a45b99b..7f4013aaf 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_has_causalmask_has_bias_no_biasgrad_no_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_no_causalmask_has_bias_has_biasgrad_no_dropout_maxk_128.cpp
@@ -1,6 +1,6 @@
 
 /*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -8,13 +8,13 @@
  * The file is automatically generated, don't modify!
  */
 
-#include <ck/ck.hpp>
+#include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_batched_backward.h"
 
 template void run_batched_backward_causalmask_bias_dropout_dispatch<
-    ck::bhalf_t,
+    ck_tile::bf16_t,
+    false,
     true,
     true,
     false,
-    false,
     128>(BatchedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_no_causalmask_has_bias_has_biasgrad_no_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_no_causalmask_has_bias_has_biasgrad_no_dropout_maxk_32.cpp
similarity index 75%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_no_causalmask_has_bias_has_biasgrad_no_dropout_maxk_32.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_no_causalmask_has_bias_has_biasgrad_no_dropout_maxk_32.cpp
index 1c7f28a08..5241a1b1f 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_no_causalmask_has_bias_has_biasgrad_no_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_no_causalmask_has_bias_has_biasgrad_no_dropout_maxk_32.cpp
@@ -1,6 +1,6 @@
 
 /*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -8,11 +8,11 @@
  * The file is automatically generated, don't modify!
  */
 
-#include <ck/ck.hpp>
+#include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_batched_backward.h"
 
 template void run_batched_backward_causalmask_bias_dropout_dispatch<
-    ck::bhalf_t,
+    ck_tile::bf16_t,
     false,
     true,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_has_causalmask_no_bias_no_biasgrad_has_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_no_causalmask_has_bias_has_biasgrad_no_dropout_maxk_64.cpp
similarity index 75%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_has_causalmask_no_bias_no_biasgrad_has_dropout_maxk_64.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_no_causalmask_has_bias_has_biasgrad_no_dropout_maxk_64.cpp
index ac8b00115..f5ee944eb 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_has_causalmask_no_bias_no_biasgrad_has_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_no_causalmask_has_bias_has_biasgrad_no_dropout_maxk_64.cpp
@@ -1,6 +1,6 @@
 
 /*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -8,13 +8,13 @@
  * The file is automatically generated, don't modify!
  */
 
-#include <ck/ck.hpp>
+#include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_batched_backward.h"
 
 template void run_batched_backward_causalmask_bias_dropout_dispatch<
-    ck::bhalf_t,
-    true,
-    false,
+    ck_tile::bf16_t,
     false,
     true,
+    true,
+    false,
     64>(BatchedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_no_causalmask_has_bias_has_biasgrad_no_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_no_causalmask_has_bias_no_biasgrad_has_dropout_maxk_128.cpp
similarity index 75%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_no_causalmask_has_bias_has_biasgrad_no_dropout_maxk_128.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_no_causalmask_has_bias_no_biasgrad_has_dropout_maxk_128.cpp
index dcb2b0696..8ab3f930c 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_no_causalmask_has_bias_has_biasgrad_no_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_no_causalmask_has_bias_no_biasgrad_has_dropout_maxk_128.cpp
@@ -1,6 +1,6 @@
 
 /*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -8,13 +8,13 @@
  * The file is automatically generated, don't modify!
  */
 
-#include <ck/ck.hpp>
+#include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_batched_backward.h"
 
 template void run_batched_backward_causalmask_bias_dropout_dispatch<
-    ck::bhalf_t,
+    ck_tile::bf16_t,
     false,
     true,
-    true,
     false,
+    true,
     128>(BatchedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_has_causalmask_has_bias_no_biasgrad_no_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_no_causalmask_has_bias_no_biasgrad_has_dropout_maxk_32.cpp
similarity index 75%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_has_causalmask_has_bias_no_biasgrad_no_dropout_maxk_32.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_no_causalmask_has_bias_no_biasgrad_has_dropout_maxk_32.cpp
index 002b30ee5..c757b7d35 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_has_causalmask_has_bias_no_biasgrad_no_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_no_causalmask_has_bias_no_biasgrad_has_dropout_maxk_32.cpp
@@ -1,6 +1,6 @@
 
 /*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -8,13 +8,13 @@
  * The file is automatically generated, don't modify!
  */
 
-#include <ck/ck.hpp>
+#include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_batched_backward.h"
 
 template void run_batched_backward_causalmask_bias_dropout_dispatch<
-    ck::bhalf_t,
-    true,
-    true,
+    ck_tile::bf16_t,
     false,
+    true,
     false,
+    true,
     32>(BatchedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_has_causalmask_has_bias_no_biasgrad_no_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_no_causalmask_has_bias_no_biasgrad_has_dropout_maxk_64.cpp
similarity index 75%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_has_causalmask_has_bias_no_biasgrad_no_dropout_maxk_64.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_no_causalmask_has_bias_no_biasgrad_has_dropout_maxk_64.cpp
index 0c4b5c1b6..4b3d9f256 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_has_causalmask_has_bias_no_biasgrad_no_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_no_causalmask_has_bias_no_biasgrad_has_dropout_maxk_64.cpp
@@ -1,6 +1,6 @@
 
 /*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -8,13 +8,13 @@
  * The file is automatically generated, don't modify!
  */
 
-#include <ck/ck.hpp>
+#include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_batched_backward.h"
 
 template void run_batched_backward_causalmask_bias_dropout_dispatch<
-    ck::bhalf_t,
-    true,
-    true,
+    ck_tile::bf16_t,
     false,
+    true,
     false,
+    true,
     64>(BatchedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_has_causalmask_no_bias_no_biasgrad_no_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_no_causalmask_has_bias_no_biasgrad_no_dropout_maxk_128.cpp
similarity index 75%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_has_causalmask_no_bias_no_biasgrad_no_dropout_maxk_128.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_no_causalmask_has_bias_no_biasgrad_no_dropout_maxk_128.cpp
index f4ab60aed..03455ee6e 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_has_causalmask_no_bias_no_biasgrad_no_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_no_causalmask_has_bias_no_biasgrad_no_dropout_maxk_128.cpp
@@ -1,6 +1,6 @@
 
 /*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -8,13 +8,13 @@
  * The file is automatically generated, don't modify!
  */
 
-#include <ck/ck.hpp>
+#include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_batched_backward.h"
 
 template void run_batched_backward_causalmask_bias_dropout_dispatch<
-    ck::bhalf_t,
-    true,
+    ck_tile::bf16_t,
     false,
+    true,
     false,
     false,
     128>(BatchedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_no_causalmask_has_bias_no_biasgrad_no_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_no_causalmask_has_bias_no_biasgrad_no_dropout_maxk_32.cpp
similarity index 75%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_no_causalmask_has_bias_no_biasgrad_no_dropout_maxk_32.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_no_causalmask_has_bias_no_biasgrad_no_dropout_maxk_32.cpp
index 7a45b95db..48a501539 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_no_causalmask_has_bias_no_biasgrad_no_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_no_causalmask_has_bias_no_biasgrad_no_dropout_maxk_32.cpp
@@ -1,6 +1,6 @@
 
 /*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -8,11 +8,11 @@
  * The file is automatically generated, don't modify!
  */
 
-#include <ck/ck.hpp>
+#include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_batched_backward.h"
 
 template void run_batched_backward_causalmask_bias_dropout_dispatch<
-    ck::bhalf_t,
+    ck_tile::bf16_t,
     false,
     true,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_no_causalmask_has_bias_no_biasgrad_no_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_no_causalmask_has_bias_no_biasgrad_no_dropout_maxk_64.cpp
similarity index 75%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_no_causalmask_has_bias_no_biasgrad_no_dropout_maxk_64.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_no_causalmask_has_bias_no_biasgrad_no_dropout_maxk_64.cpp
index f98cac80b..d73c780a6 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_no_causalmask_has_bias_no_biasgrad_no_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_no_causalmask_has_bias_no_biasgrad_no_dropout_maxk_64.cpp
@@ -1,6 +1,6 @@
 
 /*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -8,11 +8,11 @@
  * The file is automatically generated, don't modify!
  */
 
-#include <ck/ck.hpp>
+#include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_batched_backward.h"
 
 template void run_batched_backward_causalmask_bias_dropout_dispatch<
-    ck::bhalf_t,
+    ck_tile::bf16_t,
     false,
     true,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_no_causalmask_no_bias_no_biasgrad_has_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_no_causalmask_no_bias_no_biasgrad_has_dropout_maxk_128.cpp
similarity index 75%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_no_causalmask_no_bias_no_biasgrad_has_dropout_maxk_128.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_no_causalmask_no_bias_no_biasgrad_has_dropout_maxk_128.cpp
index 5d626588b..c0636a905 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_no_causalmask_no_bias_no_biasgrad_has_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_no_causalmask_no_bias_no_biasgrad_has_dropout_maxk_128.cpp
@@ -1,6 +1,6 @@
 
 /*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -8,11 +8,11 @@
  * The file is automatically generated, don't modify!
  */
 
-#include <ck/ck.hpp>
+#include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_batched_backward.h"
 
 template void run_batched_backward_causalmask_bias_dropout_dispatch<
-    ck::bhalf_t,
+    ck_tile::bf16_t,
     false,
     false,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_no_causalmask_no_bias_no_biasgrad_has_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_no_causalmask_no_bias_no_biasgrad_has_dropout_maxk_32.cpp
similarity index 75%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_no_causalmask_no_bias_no_biasgrad_has_dropout_maxk_32.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_no_causalmask_no_bias_no_biasgrad_has_dropout_maxk_32.cpp
index babf14605..3da3474df 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_no_causalmask_no_bias_no_biasgrad_has_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_no_causalmask_no_bias_no_biasgrad_has_dropout_maxk_32.cpp
@@ -1,6 +1,6 @@
 
 /*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -8,11 +8,11 @@
  * The file is automatically generated, don't modify!
  */
 
-#include <ck/ck.hpp>
+#include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_batched_backward.h"
 
 template void run_batched_backward_causalmask_bias_dropout_dispatch<
-    ck::bhalf_t,
+    ck_tile::bf16_t,
     false,
     false,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_no_causalmask_no_bias_no_biasgrad_has_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_no_causalmask_no_bias_no_biasgrad_has_dropout_maxk_64.cpp
similarity index 75%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_no_causalmask_no_bias_no_biasgrad_has_dropout_maxk_64.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_no_causalmask_no_bias_no_biasgrad_has_dropout_maxk_64.cpp
index 47eed928b..6ed11608d 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_no_causalmask_no_bias_no_biasgrad_has_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_no_causalmask_no_bias_no_biasgrad_has_dropout_maxk_64.cpp
@@ -1,6 +1,6 @@
 
 /*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -8,11 +8,11 @@
  * The file is automatically generated, don't modify!
  */
 
-#include <ck/ck.hpp>
+#include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_batched_backward.h"
 
 template void run_batched_backward_causalmask_bias_dropout_dispatch<
-    ck::bhalf_t,
+    ck_tile::bf16_t,
     false,
     false,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_no_causalmask_no_bias_no_biasgrad_no_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_no_causalmask_no_bias_no_biasgrad_no_dropout_maxk_128.cpp
similarity index 75%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_no_causalmask_no_bias_no_biasgrad_no_dropout_maxk_128.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_no_causalmask_no_bias_no_biasgrad_no_dropout_maxk_128.cpp
index de13cdfa0..3cca920f5 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_no_causalmask_no_bias_no_biasgrad_no_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_no_causalmask_no_bias_no_biasgrad_no_dropout_maxk_128.cpp
@@ -1,6 +1,6 @@
 
 /*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -8,11 +8,11 @@
  * The file is automatically generated, don't modify!
  */
 
-#include <ck/ck.hpp>
+#include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_batched_backward.h"
 
 template void run_batched_backward_causalmask_bias_dropout_dispatch<
-    ck::bhalf_t,
+    ck_tile::bf16_t,
     false,
     false,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_no_causalmask_no_bias_no_biasgrad_no_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_no_causalmask_no_bias_no_biasgrad_no_dropout_maxk_32.cpp
similarity index 75%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_no_causalmask_no_bias_no_biasgrad_no_dropout_maxk_32.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_no_causalmask_no_bias_no_biasgrad_no_dropout_maxk_32.cpp
index ffaf66bdf..6383d494e 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_no_causalmask_no_bias_no_biasgrad_no_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_no_causalmask_no_bias_no_biasgrad_no_dropout_maxk_32.cpp
@@ -1,6 +1,6 @@
 
 /*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -8,11 +8,11 @@
  * The file is automatically generated, don't modify!
  */
 
-#include <ck/ck.hpp>
+#include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_batched_backward.h"
 
 template void run_batched_backward_causalmask_bias_dropout_dispatch<
-    ck::bhalf_t,
+    ck_tile::bf16_t,
     false,
     false,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_no_causalmask_no_bias_no_biasgrad_no_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_no_causalmask_no_bias_no_biasgrad_no_dropout_maxk_64.cpp
similarity index 75%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_no_causalmask_no_bias_no_biasgrad_no_dropout_maxk_64.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_no_causalmask_no_bias_no_biasgrad_no_dropout_maxk_64.cpp
index 53446d60e..585dc69f3 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bp16_no_causalmask_no_bias_no_biasgrad_no_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_no_causalmask_no_bias_no_biasgrad_no_dropout_maxk_64.cpp
@@ -1,6 +1,6 @@
 
 /*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -8,11 +8,11 @@
  * The file is automatically generated, don't modify!
  */
 
-#include <ck/ck.hpp>
+#include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_batched_backward.h"
 
 template void run_batched_backward_causalmask_bias_dropout_dispatch<
-    ck::bhalf_t,
+    ck_tile::bf16_t,
     false,
     false,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_has_bias_has_biasgrad_has_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_has_bias_has_biasgrad_has_dropout_maxk_128.cpp
index 78e737557..6ca73178d 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_has_bias_has_biasgrad_has_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_has_bias_has_biasgrad_has_dropout_maxk_128.cpp
@@ -1,6 +1,6 @@
 
 /*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -8,11 +8,11 @@
  * The file is automatically generated, don't modify!
  */
 
-#include <ck/ck.hpp>
+#include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_batched_backward.h"
 
 template void run_batched_backward_causalmask_bias_dropout_dispatch<
-    ck::half_t,
+    ck_tile::fp16_t,
     true,
     true,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_has_bias_has_biasgrad_has_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_has_bias_has_biasgrad_has_dropout_maxk_32.cpp
index 6253cb013..95218766e 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_has_bias_has_biasgrad_has_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_has_bias_has_biasgrad_has_dropout_maxk_32.cpp
@@ -1,6 +1,6 @@
 
 /*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -8,11 +8,11 @@
  * The file is automatically generated, don't modify!
  */
 
-#include <ck/ck.hpp>
+#include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_batched_backward.h"
 
 template void run_batched_backward_causalmask_bias_dropout_dispatch<
-    ck::half_t,
+    ck_tile::fp16_t,
     true,
     true,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_has_bias_has_biasgrad_has_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_has_bias_has_biasgrad_has_dropout_maxk_64.cpp
index 0d4a36823..bf092ff96 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_has_bias_has_biasgrad_has_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_has_bias_has_biasgrad_has_dropout_maxk_64.cpp
@@ -1,6 +1,6 @@
 
 /*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -8,11 +8,11 @@
  * The file is automatically generated, don't modify!
  */
 
-#include <ck/ck.hpp>
+#include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_batched_backward.h"
 
 template void run_batched_backward_causalmask_bias_dropout_dispatch<
-    ck::half_t,
+    ck_tile::fp16_t,
     true,
     true,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_has_bias_has_biasgrad_no_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_has_bias_has_biasgrad_no_dropout_maxk_128.cpp
index 0075f69c4..394bbbe28 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_has_bias_has_biasgrad_no_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_has_bias_has_biasgrad_no_dropout_maxk_128.cpp
@@ -1,6 +1,6 @@
 
 /*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -8,11 +8,11 @@
  * The file is automatically generated, don't modify!
  */
 
-#include <ck/ck.hpp>
+#include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_batched_backward.h"
 
 template void run_batched_backward_causalmask_bias_dropout_dispatch<
-    ck::half_t,
+    ck_tile::fp16_t,
     true,
     true,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_has_bias_has_biasgrad_no_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_has_bias_has_biasgrad_no_dropout_maxk_32.cpp
index 7988f3f3a..ea3884557 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_has_bias_has_biasgrad_no_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_has_bias_has_biasgrad_no_dropout_maxk_32.cpp
@@ -1,6 +1,6 @@
 
 /*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -8,11 +8,11 @@
  * The file is automatically generated, don't modify!
  */
 
-#include <ck/ck.hpp>
+#include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_batched_backward.h"
 
 template void run_batched_backward_causalmask_bias_dropout_dispatch<
-    ck::half_t,
+    ck_tile::fp16_t,
     true,
     true,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_has_bias_has_biasgrad_no_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_has_bias_has_biasgrad_no_dropout_maxk_64.cpp
index a87360605..4596bfd7f 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_has_bias_has_biasgrad_no_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_has_bias_has_biasgrad_no_dropout_maxk_64.cpp
@@ -1,6 +1,6 @@
 
 /*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -8,11 +8,11 @@
  * The file is automatically generated, don't modify!
  */
 
-#include <ck/ck.hpp>
+#include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_batched_backward.h"
 
 template void run_batched_backward_causalmask_bias_dropout_dispatch<
-    ck::half_t,
+    ck_tile::fp16_t,
     true,
     true,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_has_bias_no_biasgrad_has_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_has_bias_no_biasgrad_has_dropout_maxk_128.cpp
index 2dd378e56..e1d72bc58 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_has_bias_no_biasgrad_has_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_has_bias_no_biasgrad_has_dropout_maxk_128.cpp
@@ -1,6 +1,6 @@
 
 /*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -8,11 +8,11 @@
  * The file is automatically generated, don't modify!
  */
 
-#include <ck/ck.hpp>
+#include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_batched_backward.h"
 
 template void run_batched_backward_causalmask_bias_dropout_dispatch<
-    ck::half_t,
+    ck_tile::fp16_t,
     true,
     true,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_has_bias_no_biasgrad_has_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_has_bias_no_biasgrad_has_dropout_maxk_32.cpp
index 5882f0f74..96f62e9ac 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_has_bias_no_biasgrad_has_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_has_bias_no_biasgrad_has_dropout_maxk_32.cpp
@@ -1,6 +1,6 @@
 
 /*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -8,11 +8,11 @@
  * The file is automatically generated, don't modify!
  */
 
-#include <ck/ck.hpp>
+#include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_batched_backward.h"
 
 template void run_batched_backward_causalmask_bias_dropout_dispatch<
-    ck::half_t,
+    ck_tile::fp16_t,
     true,
     true,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_has_bias_no_biasgrad_has_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_has_bias_no_biasgrad_has_dropout_maxk_64.cpp
index 4e8f74579..dd72c62f2 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_has_bias_no_biasgrad_has_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_has_bias_no_biasgrad_has_dropout_maxk_64.cpp
@@ -1,6 +1,6 @@
 
 /*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -8,11 +8,11 @@
  * The file is automatically generated, don't modify!
  */
 
-#include <ck/ck.hpp>
+#include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_batched_backward.h"
 
 template void run_batched_backward_causalmask_bias_dropout_dispatch<
-    ck::half_t,
+    ck_tile::fp16_t,
     true,
     true,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_has_bias_no_biasgrad_no_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_has_bias_no_biasgrad_no_dropout_maxk_128.cpp
index 56f4ef231..a0d7a83d9 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_has_bias_no_biasgrad_no_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_has_bias_no_biasgrad_no_dropout_maxk_128.cpp
@@ -1,6 +1,6 @@
 
 /*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -8,11 +8,11 @@
  * The file is automatically generated, don't modify!
  */
 
-#include <ck/ck.hpp>
+#include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_batched_backward.h"
 
 template void run_batched_backward_causalmask_bias_dropout_dispatch<
-    ck::half_t,
+    ck_tile::fp16_t,
     true,
     true,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_has_bias_no_biasgrad_no_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_has_bias_no_biasgrad_no_dropout_maxk_32.cpp
index 3fe231753..e2d01f97e 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_has_bias_no_biasgrad_no_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_has_bias_no_biasgrad_no_dropout_maxk_32.cpp
@@ -1,6 +1,6 @@
 
 /*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -8,11 +8,11 @@
  * The file is automatically generated, don't modify!
  */
 
-#include <ck/ck.hpp>
+#include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_batched_backward.h"
 
 template void run_batched_backward_causalmask_bias_dropout_dispatch<
-    ck::half_t,
+    ck_tile::fp16_t,
     true,
     true,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_has_bias_no_biasgrad_no_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_has_bias_no_biasgrad_no_dropout_maxk_64.cpp
index ea591609a..d5378b3f3 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_has_bias_no_biasgrad_no_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_has_bias_no_biasgrad_no_dropout_maxk_64.cpp
@@ -1,6 +1,6 @@
 
 /*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -8,11 +8,11 @@
  * The file is automatically generated, don't modify!
  */
 
-#include <ck/ck.hpp>
+#include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_batched_backward.h"
 
 template void run_batched_backward_causalmask_bias_dropout_dispatch<
-    ck::half_t,
+    ck_tile::fp16_t,
     true,
     true,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_no_bias_no_biasgrad_has_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_no_bias_no_biasgrad_has_dropout_maxk_128.cpp
index 465e3974e..02c8c9bc5 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_no_bias_no_biasgrad_has_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_no_bias_no_biasgrad_has_dropout_maxk_128.cpp
@@ -1,6 +1,6 @@
 
 /*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -8,11 +8,11 @@
  * The file is automatically generated, don't modify!
  */
 
-#include <ck/ck.hpp>
+#include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_batched_backward.h"
 
 template void run_batched_backward_causalmask_bias_dropout_dispatch<
-    ck::half_t,
+    ck_tile::fp16_t,
     true,
     false,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_no_bias_no_biasgrad_has_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_no_bias_no_biasgrad_has_dropout_maxk_32.cpp
index cf441573a..8057c759e 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_no_bias_no_biasgrad_has_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_no_bias_no_biasgrad_has_dropout_maxk_32.cpp
@@ -1,6 +1,6 @@
 
 /*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -8,11 +8,11 @@
  * The file is automatically generated, don't modify!
  */
 
-#include <ck/ck.hpp>
+#include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_batched_backward.h"
 
 template void run_batched_backward_causalmask_bias_dropout_dispatch<
-    ck::half_t,
+    ck_tile::fp16_t,
     true,
     false,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_no_bias_no_biasgrad_has_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_no_bias_no_biasgrad_has_dropout_maxk_64.cpp
index 5bca9b8ae..af6091b25 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_no_bias_no_biasgrad_has_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_no_bias_no_biasgrad_has_dropout_maxk_64.cpp
@@ -1,6 +1,6 @@
 
 /*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -8,11 +8,11 @@
  * The file is automatically generated, don't modify!
  */
 
-#include <ck/ck.hpp>
+#include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_batched_backward.h"
 
 template void run_batched_backward_causalmask_bias_dropout_dispatch<
-    ck::half_t,
+    ck_tile::fp16_t,
     true,
     false,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_no_bias_no_biasgrad_no_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_no_bias_no_biasgrad_no_dropout_maxk_128.cpp
index 6312622ff..3fc748ff2 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_no_bias_no_biasgrad_no_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_no_bias_no_biasgrad_no_dropout_maxk_128.cpp
@@ -1,6 +1,6 @@
 
 /*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -8,11 +8,11 @@
  * The file is automatically generated, don't modify!
  */
 
-#include <ck/ck.hpp>
+#include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_batched_backward.h"
 
 template void run_batched_backward_causalmask_bias_dropout_dispatch<
-    ck::half_t,
+    ck_tile::fp16_t,
     true,
     false,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_no_bias_no_biasgrad_no_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_no_bias_no_biasgrad_no_dropout_maxk_32.cpp
index dc425e9db..b9b6aacfe 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_no_bias_no_biasgrad_no_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_no_bias_no_biasgrad_no_dropout_maxk_32.cpp
@@ -1,6 +1,6 @@
 
 /*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -8,11 +8,11 @@
  * The file is automatically generated, don't modify!
  */
 
-#include <ck/ck.hpp>
+#include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_batched_backward.h"
 
 template void run_batched_backward_causalmask_bias_dropout_dispatch<
-    ck::half_t,
+    ck_tile::fp16_t,
     true,
     false,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_no_bias_no_biasgrad_no_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_no_bias_no_biasgrad_no_dropout_maxk_64.cpp
index 3fbea87ee..8b667d2f7 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_no_bias_no_biasgrad_no_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_no_bias_no_biasgrad_no_dropout_maxk_64.cpp
@@ -1,6 +1,6 @@
 
 /*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -8,11 +8,11 @@
  * The file is automatically generated, don't modify!
  */
 
-#include <ck/ck.hpp>
+#include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_batched_backward.h"
 
 template void run_batched_backward_causalmask_bias_dropout_dispatch<
-    ck::half_t,
+    ck_tile::fp16_t,
     true,
     false,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_has_bias_has_biasgrad_has_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_has_bias_has_biasgrad_has_dropout_maxk_128.cpp
index ce9e7d257..df1e6c3c0 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_has_bias_has_biasgrad_has_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_has_bias_has_biasgrad_has_dropout_maxk_128.cpp
@@ -1,6 +1,6 @@
 
 /*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -8,11 +8,11 @@
  * The file is automatically generated, don't modify!
  */
 
-#include <ck/ck.hpp>
+#include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_batched_backward.h"
 
 template void run_batched_backward_causalmask_bias_dropout_dispatch<
-    ck::half_t,
+    ck_tile::fp16_t,
     false,
     true,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_has_bias_has_biasgrad_has_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_has_bias_has_biasgrad_has_dropout_maxk_32.cpp
index f93820dbb..f415d9464 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_has_bias_has_biasgrad_has_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_has_bias_has_biasgrad_has_dropout_maxk_32.cpp
@@ -1,6 +1,6 @@
 
 /*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -8,11 +8,11 @@
  * The file is automatically generated, don't modify!
  */
 
-#include <ck/ck.hpp>
+#include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_batched_backward.h"
 
 template void run_batched_backward_causalmask_bias_dropout_dispatch<
-    ck::half_t,
+    ck_tile::fp16_t,
     false,
     true,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_has_bias_has_biasgrad_has_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_has_bias_has_biasgrad_has_dropout_maxk_64.cpp
index 07dabfa5f..ff8d33f21 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_has_bias_has_biasgrad_has_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_has_bias_has_biasgrad_has_dropout_maxk_64.cpp
@@ -1,6 +1,6 @@
 
 /*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -8,11 +8,11 @@
  * The file is automatically generated, don't modify!
  */
 
-#include <ck/ck.hpp>
+#include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_batched_backward.h"
 
 template void run_batched_backward_causalmask_bias_dropout_dispatch<
-    ck::half_t,
+    ck_tile::fp16_t,
     false,
     true,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_has_bias_has_biasgrad_no_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_has_bias_has_biasgrad_no_dropout_maxk_128.cpp
index 852b0339d..41da7ab90 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_has_bias_has_biasgrad_no_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_has_bias_has_biasgrad_no_dropout_maxk_128.cpp
@@ -1,6 +1,6 @@
 
 /*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -8,11 +8,11 @@
  * The file is automatically generated, don't modify!
  */
 
-#include <ck/ck.hpp>
+#include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_batched_backward.h"
 
 template void run_batched_backward_causalmask_bias_dropout_dispatch<
-    ck::half_t,
+    ck_tile::fp16_t,
     false,
     true,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_has_bias_has_biasgrad_no_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_has_bias_has_biasgrad_no_dropout_maxk_32.cpp
index 4874e14aa..340fb65ee 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_has_bias_has_biasgrad_no_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_has_bias_has_biasgrad_no_dropout_maxk_32.cpp
@@ -1,6 +1,6 @@
 
 /*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -8,11 +8,11 @@
  * The file is automatically generated, don't modify!
  */
 
-#include <ck/ck.hpp>
+#include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_batched_backward.h"
 
 template void run_batched_backward_causalmask_bias_dropout_dispatch<
-    ck::half_t,
+    ck_tile::fp16_t,
     false,
     true,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_has_bias_has_biasgrad_no_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_has_bias_has_biasgrad_no_dropout_maxk_64.cpp
index 0036596a5..be7f2144d 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_has_bias_has_biasgrad_no_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_has_bias_has_biasgrad_no_dropout_maxk_64.cpp
@@ -1,6 +1,6 @@
 
 /*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -8,11 +8,11 @@
  * The file is automatically generated, don't modify!
  */
 
-#include <ck/ck.hpp>
+#include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_batched_backward.h"
 
 template void run_batched_backward_causalmask_bias_dropout_dispatch<
-    ck::half_t,
+    ck_tile::fp16_t,
     false,
     true,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_has_bias_no_biasgrad_has_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_has_bias_no_biasgrad_has_dropout_maxk_128.cpp
index eea9ea776..0932fbb12 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_has_bias_no_biasgrad_has_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_has_bias_no_biasgrad_has_dropout_maxk_128.cpp
@@ -1,6 +1,6 @@
 
 /*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -8,11 +8,11 @@
  * The file is automatically generated, don't modify!
  */
 
-#include <ck/ck.hpp>
+#include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_batched_backward.h"
 
 template void run_batched_backward_causalmask_bias_dropout_dispatch<
-    ck::half_t,
+    ck_tile::fp16_t,
     false,
     true,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_has_bias_no_biasgrad_has_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_has_bias_no_biasgrad_has_dropout_maxk_32.cpp
index 070ddddd6..eaafd9949 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_has_bias_no_biasgrad_has_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_has_bias_no_biasgrad_has_dropout_maxk_32.cpp
@@ -1,6 +1,6 @@
 
 /*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -8,11 +8,11 @@
  * The file is automatically generated, don't modify!
  */
 
-#include <ck/ck.hpp>
+#include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_batched_backward.h"
 
 template void run_batched_backward_causalmask_bias_dropout_dispatch<
-    ck::half_t,
+    ck_tile::fp16_t,
     false,
     true,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_has_bias_no_biasgrad_has_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_has_bias_no_biasgrad_has_dropout_maxk_64.cpp
index ad72c8f1a..02cf83aba 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_has_bias_no_biasgrad_has_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_has_bias_no_biasgrad_has_dropout_maxk_64.cpp
@@ -1,6 +1,6 @@
 
 /*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -8,11 +8,11 @@
  * The file is automatically generated, don't modify!
  */
 
-#include <ck/ck.hpp>
+#include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_batched_backward.h"
 
 template void run_batched_backward_causalmask_bias_dropout_dispatch<
-    ck::half_t,
+    ck_tile::fp16_t,
     false,
     true,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_has_bias_no_biasgrad_no_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_has_bias_no_biasgrad_no_dropout_maxk_128.cpp
index 99a3acd4f..51bd8bedb 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_has_bias_no_biasgrad_no_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_has_bias_no_biasgrad_no_dropout_maxk_128.cpp
@@ -1,6 +1,6 @@
 
 /*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -8,11 +8,11 @@
  * The file is automatically generated, don't modify!
  */
 
-#include <ck/ck.hpp>
+#include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_batched_backward.h"
 
 template void run_batched_backward_causalmask_bias_dropout_dispatch<
-    ck::half_t,
+    ck_tile::fp16_t,
     false,
     true,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_has_bias_no_biasgrad_no_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_has_bias_no_biasgrad_no_dropout_maxk_32.cpp
index 89e517e75..7f999c203 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_has_bias_no_biasgrad_no_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_has_bias_no_biasgrad_no_dropout_maxk_32.cpp
@@ -1,6 +1,6 @@
 
 /*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -8,11 +8,11 @@
  * The file is automatically generated, don't modify!
  */
 
-#include <ck/ck.hpp>
+#include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_batched_backward.h"
 
 template void run_batched_backward_causalmask_bias_dropout_dispatch<
-    ck::half_t,
+    ck_tile::fp16_t,
     false,
     true,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_has_bias_no_biasgrad_no_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_has_bias_no_biasgrad_no_dropout_maxk_64.cpp
index 9120025dd..3ad410861 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_has_bias_no_biasgrad_no_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_has_bias_no_biasgrad_no_dropout_maxk_64.cpp
@@ -1,6 +1,6 @@
 
 /*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -8,11 +8,11 @@
  * The file is automatically generated, don't modify!
  */
 
-#include <ck/ck.hpp>
+#include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_batched_backward.h"
 
 template void run_batched_backward_causalmask_bias_dropout_dispatch<
-    ck::half_t,
+    ck_tile::fp16_t,
     false,
     true,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_no_bias_no_biasgrad_has_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_no_bias_no_biasgrad_has_dropout_maxk_128.cpp
index 419a240bd..90572aabf 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_no_bias_no_biasgrad_has_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_no_bias_no_biasgrad_has_dropout_maxk_128.cpp
@@ -1,6 +1,6 @@
 
 /*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -8,11 +8,11 @@
  * The file is automatically generated, don't modify!
  */
 
-#include <ck/ck.hpp>
+#include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_batched_backward.h"
 
 template void run_batched_backward_causalmask_bias_dropout_dispatch<
-    ck::half_t,
+    ck_tile::fp16_t,
     false,
     false,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_no_bias_no_biasgrad_has_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_no_bias_no_biasgrad_has_dropout_maxk_32.cpp
index d9d4eaba9..9c0000820 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_no_bias_no_biasgrad_has_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_no_bias_no_biasgrad_has_dropout_maxk_32.cpp
@@ -1,6 +1,6 @@
 
 /*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -8,11 +8,11 @@
  * The file is automatically generated, don't modify!
  */
 
-#include <ck/ck.hpp>
+#include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_batched_backward.h"
 
 template void run_batched_backward_causalmask_bias_dropout_dispatch<
-    ck::half_t,
+    ck_tile::fp16_t,
     false,
     false,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_no_bias_no_biasgrad_has_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_no_bias_no_biasgrad_has_dropout_maxk_64.cpp
index a1bcfbd2b..13902640d 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_no_bias_no_biasgrad_has_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_no_bias_no_biasgrad_has_dropout_maxk_64.cpp
@@ -1,6 +1,6 @@
 
 /*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -8,11 +8,11 @@
  * The file is automatically generated, don't modify!
  */
 
-#include <ck/ck.hpp>
+#include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_batched_backward.h"
 
 template void run_batched_backward_causalmask_bias_dropout_dispatch<
-    ck::half_t,
+    ck_tile::fp16_t,
     false,
     false,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_no_bias_no_biasgrad_no_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_no_bias_no_biasgrad_no_dropout_maxk_128.cpp
index d86f207d9..82849155e 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_no_bias_no_biasgrad_no_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_no_bias_no_biasgrad_no_dropout_maxk_128.cpp
@@ -1,6 +1,6 @@
 
 /*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -8,11 +8,11 @@
  * The file is automatically generated, don't modify!
  */
 
-#include <ck/ck.hpp>
+#include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_batched_backward.h"
 
 template void run_batched_backward_causalmask_bias_dropout_dispatch<
-    ck::half_t,
+    ck_tile::fp16_t,
     false,
     false,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_no_bias_no_biasgrad_no_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_no_bias_no_biasgrad_no_dropout_maxk_32.cpp
index 2fa1e6493..81636cea6 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_no_bias_no_biasgrad_no_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_no_bias_no_biasgrad_no_dropout_maxk_32.cpp
@@ -1,6 +1,6 @@
 
 /*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -8,11 +8,11 @@
  * The file is automatically generated, don't modify!
  */
 
-#include <ck/ck.hpp>
+#include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_batched_backward.h"
 
 template void run_batched_backward_causalmask_bias_dropout_dispatch<
-    ck::half_t,
+    ck_tile::fp16_t,
     false,
     false,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_no_bias_no_biasgrad_no_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_no_bias_no_biasgrad_no_dropout_maxk_64.cpp
index 2b9e3daef..97775f0e2 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_no_bias_no_biasgrad_no_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_no_bias_no_biasgrad_no_dropout_maxk_64.cpp
@@ -1,6 +1,6 @@
 
 /*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -8,11 +8,11 @@
  * The file is automatically generated, don't modify!
  */
 
-#include <ck/ck.hpp>
+#include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_batched_backward.h"
 
 template void run_batched_backward_causalmask_bias_dropout_dispatch<
-    ck::half_t,
+    ck_tile::fp16_t,
     false,
     false,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_has_causalmask_has_bias_has_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bf16_has_causalmask_has_bias_has_dropout_maxk_128.cpp
similarity index 74%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_has_causalmask_has_bias_has_dropout_maxk_128.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bf16_has_causalmask_has_bias_has_dropout_maxk_128.cpp
index 2237719c1..5a639ee11 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_has_causalmask_has_bias_has_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bf16_has_causalmask_has_bias_has_dropout_maxk_128.cpp
@@ -1,6 +1,6 @@
 
 /*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -8,11 +8,11 @@
  * The file is automatically generated, don't modify!
  */
 
-#include <ck/ck.hpp>
+#include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_batched_forward.h"
 
 template void run_batched_forward_causalmask_bias_dropout_dispatch<
-    ck::bhalf_t,
+    ck_tile::bf16_t,
     true,
     true,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_has_causalmask_has_bias_has_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bf16_has_causalmask_has_bias_has_dropout_maxk_256.cpp
similarity index 74%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_has_causalmask_has_bias_has_dropout_maxk_256.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bf16_has_causalmask_has_bias_has_dropout_maxk_256.cpp
index 24b717342..29cf57025 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_has_causalmask_has_bias_has_dropout_maxk_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bf16_has_causalmask_has_bias_has_dropout_maxk_256.cpp
@@ -1,6 +1,6 @@
 
 /*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -8,11 +8,11 @@
  * The file is automatically generated, don't modify!
  */
 
-#include <ck/ck.hpp>
+#include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_batched_forward.h"
 
 template void run_batched_forward_causalmask_bias_dropout_dispatch<
-    ck::bhalf_t,
+    ck_tile::bf16_t,
     true,
     true,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_has_causalmask_has_bias_has_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bf16_has_causalmask_has_bias_has_dropout_maxk_32.cpp
similarity index 74%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_has_causalmask_has_bias_has_dropout_maxk_32.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bf16_has_causalmask_has_bias_has_dropout_maxk_32.cpp
index d9333c0dc..c60d415d4 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_has_causalmask_has_bias_has_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bf16_has_causalmask_has_bias_has_dropout_maxk_32.cpp
@@ -1,6 +1,6 @@
 
 /*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -8,11 +8,11 @@
  * The file is automatically generated, don't modify!
  */
 
-#include <ck/ck.hpp>
+#include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_batched_forward.h"
 
 template void run_batched_forward_causalmask_bias_dropout_dispatch<
-    ck::bhalf_t,
+    ck_tile::bf16_t,
     true,
     true,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_has_causalmask_has_bias_has_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bf16_has_causalmask_has_bias_has_dropout_maxk_64.cpp
similarity index 74%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_has_causalmask_has_bias_has_dropout_maxk_64.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bf16_has_causalmask_has_bias_has_dropout_maxk_64.cpp
index 2fbb4d47c..f6291e2db 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_has_causalmask_has_bias_has_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bf16_has_causalmask_has_bias_has_dropout_maxk_64.cpp
@@ -1,6 +1,6 @@
 
 /*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -8,11 +8,11 @@
  * The file is automatically generated, don't modify!
  */
 
-#include <ck/ck.hpp>
+#include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_batched_forward.h"
 
 template void run_batched_forward_causalmask_bias_dropout_dispatch<
-    ck::bhalf_t,
+    ck_tile::bf16_t,
     true,
     true,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_no_causalmask_has_bias_has_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bf16_has_causalmask_has_bias_no_dropout_maxk_128.cpp
similarity index 74%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_no_causalmask_has_bias_has_dropout_maxk_128.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bf16_has_causalmask_has_bias_no_dropout_maxk_128.cpp
index 1d79adfb8..caec04c71 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_no_causalmask_has_bias_has_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bf16_has_causalmask_has_bias_no_dropout_maxk_128.cpp
@@ -1,6 +1,6 @@
 
 /*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -8,12 +8,12 @@
  * The file is automatically generated, don't modify!
  */
 
-#include <ck/ck.hpp>
+#include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_batched_forward.h"
 
 template void run_batched_forward_causalmask_bias_dropout_dispatch<
-    ck::bhalf_t,
-    false,
+    ck_tile::bf16_t,
     true,
     true,
+    false,
     128>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_has_causalmask_no_bias_has_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bf16_has_causalmask_has_bias_no_dropout_maxk_256.cpp
similarity index 74%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_has_causalmask_no_bias_has_dropout_maxk_256.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bf16_has_causalmask_has_bias_no_dropout_maxk_256.cpp
index 66c4450b6..ae29f02a3 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_has_causalmask_no_bias_has_dropout_maxk_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bf16_has_causalmask_has_bias_no_dropout_maxk_256.cpp
@@ -1,6 +1,6 @@
 
 /*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -8,12 +8,12 @@
  * The file is automatically generated, don't modify!
  */
 
-#include <ck/ck.hpp>
+#include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_batched_forward.h"
 
 template void run_batched_forward_causalmask_bias_dropout_dispatch<
-    ck::bhalf_t,
+    ck_tile::bf16_t,
     true,
-    false,
     true,
+    false,
     256>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_has_causalmask_no_bias_has_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bf16_has_causalmask_has_bias_no_dropout_maxk_32.cpp
similarity index 74%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_has_causalmask_no_bias_has_dropout_maxk_32.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bf16_has_causalmask_has_bias_no_dropout_maxk_32.cpp
index 8d6bc812f..71eda93e9 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_has_causalmask_no_bias_has_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bf16_has_causalmask_has_bias_no_dropout_maxk_32.cpp
@@ -1,6 +1,6 @@
 
 /*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -8,12 +8,12 @@
  * The file is automatically generated, don't modify!
  */
 
-#include <ck/ck.hpp>
+#include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_batched_forward.h"
 
 template void run_batched_forward_causalmask_bias_dropout_dispatch<
-    ck::bhalf_t,
+    ck_tile::bf16_t,
     true,
-    false,
     true,
+    false,
     32>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_has_causalmask_has_bias_no_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bf16_has_causalmask_has_bias_no_dropout_maxk_64.cpp
similarity index 74%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_has_causalmask_has_bias_no_dropout_maxk_64.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bf16_has_causalmask_has_bias_no_dropout_maxk_64.cpp
index 728b653c6..aa31f0f84 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_has_causalmask_has_bias_no_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bf16_has_causalmask_has_bias_no_dropout_maxk_64.cpp
@@ -1,6 +1,6 @@
 
 /*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -8,11 +8,11 @@
  * The file is automatically generated, don't modify!
  */
 
-#include <ck/ck.hpp>
+#include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_batched_forward.h"
 
 template void run_batched_forward_causalmask_bias_dropout_dispatch<
-    ck::bhalf_t,
+    ck_tile::bf16_t,
     true,
     true,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_has_causalmask_has_bias_no_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bf16_has_causalmask_no_bias_has_dropout_maxk_128.cpp
similarity index 74%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_has_causalmask_has_bias_no_dropout_maxk_128.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bf16_has_causalmask_no_bias_has_dropout_maxk_128.cpp
index 5b609eb20..551c4eb67 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_has_causalmask_has_bias_no_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bf16_has_causalmask_no_bias_has_dropout_maxk_128.cpp
@@ -1,6 +1,6 @@
 
 /*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -8,12 +8,12 @@
  * The file is automatically generated, don't modify!
  */
 
-#include <ck/ck.hpp>
+#include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_batched_forward.h"
 
 template void run_batched_forward_causalmask_bias_dropout_dispatch<
-    ck::bhalf_t,
-    true,
+    ck_tile::bf16_t,
     true,
     false,
+    true,
     128>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_no_causalmask_has_bias_has_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bf16_has_causalmask_no_bias_has_dropout_maxk_256.cpp
similarity index 74%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_no_causalmask_has_bias_has_dropout_maxk_256.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bf16_has_causalmask_no_bias_has_dropout_maxk_256.cpp
index 6fe3e9c9a..1d6e78baf 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_no_causalmask_has_bias_has_dropout_maxk_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bf16_has_causalmask_no_bias_has_dropout_maxk_256.cpp
@@ -1,6 +1,6 @@
 
 /*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -8,12 +8,12 @@
  * The file is automatically generated, don't modify!
  */
 
-#include <ck/ck.hpp>
+#include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_batched_forward.h"
 
 template void run_batched_forward_causalmask_bias_dropout_dispatch<
-    ck::bhalf_t,
-    false,
+    ck_tile::bf16_t,
     true,
+    false,
     true,
     256>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_no_causalmask_has_bias_has_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bf16_has_causalmask_no_bias_has_dropout_maxk_32.cpp
similarity index 74%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_no_causalmask_has_bias_has_dropout_maxk_32.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bf16_has_causalmask_no_bias_has_dropout_maxk_32.cpp
index 90d4de433..278f6d358 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_no_causalmask_has_bias_has_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bf16_has_causalmask_no_bias_has_dropout_maxk_32.cpp
@@ -1,6 +1,6 @@
 
 /*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -8,12 +8,12 @@
  * The file is automatically generated, don't modify!
  */
 
-#include <ck/ck.hpp>
+#include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_batched_forward.h"
 
 template void run_batched_forward_causalmask_bias_dropout_dispatch<
-    ck::bhalf_t,
-    false,
+    ck_tile::bf16_t,
     true,
+    false,
     true,
     32>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_has_causalmask_no_bias_has_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bf16_has_causalmask_no_bias_has_dropout_maxk_64.cpp
similarity index 74%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_has_causalmask_no_bias_has_dropout_maxk_64.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bf16_has_causalmask_no_bias_has_dropout_maxk_64.cpp
index cd43accf2..18e12c0a4 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_has_causalmask_no_bias_has_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bf16_has_causalmask_no_bias_has_dropout_maxk_64.cpp
@@ -1,6 +1,6 @@
 
 /*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -8,11 +8,11 @@
  * The file is automatically generated, don't modify!
  */
 
-#include <ck/ck.hpp>
+#include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_batched_forward.h"
 
 template void run_batched_forward_causalmask_bias_dropout_dispatch<
-    ck::bhalf_t,
+    ck_tile::bf16_t,
     true,
     false,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_no_causalmask_has_bias_no_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bf16_has_causalmask_no_bias_no_dropout_maxk_128.cpp
similarity index 74%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_no_causalmask_has_bias_no_dropout_maxk_128.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bf16_has_causalmask_no_bias_no_dropout_maxk_128.cpp
index 1c620930e..d393e26c3 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_no_causalmask_has_bias_no_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bf16_has_causalmask_no_bias_no_dropout_maxk_128.cpp
@@ -1,6 +1,6 @@
 
 /*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -8,12 +8,12 @@
  * The file is automatically generated, don't modify!
  */
 
-#include <ck/ck.hpp>
+#include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_batched_forward.h"
 
 template void run_batched_forward_causalmask_bias_dropout_dispatch<
-    ck::bhalf_t,
-    false,
+    ck_tile::bf16_t,
     true,
     false,
+    false,
     128>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_no_causalmask_has_bias_no_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bf16_has_causalmask_no_bias_no_dropout_maxk_256.cpp
similarity index 74%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_no_causalmask_has_bias_no_dropout_maxk_256.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bf16_has_causalmask_no_bias_no_dropout_maxk_256.cpp
index 5dd149303..e5e99ede0 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_no_causalmask_has_bias_no_dropout_maxk_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bf16_has_causalmask_no_bias_no_dropout_maxk_256.cpp
@@ -1,6 +1,6 @@
 
 /*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -8,12 +8,12 @@
  * The file is automatically generated, don't modify!
  */
 
-#include <ck/ck.hpp>
+#include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_batched_forward.h"
 
 template void run_batched_forward_causalmask_bias_dropout_dispatch<
-    ck::bhalf_t,
-    false,
+    ck_tile::bf16_t,
     true,
     false,
+    false,
     256>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_no_causalmask_has_bias_no_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bf16_has_causalmask_no_bias_no_dropout_maxk_32.cpp
similarity index 74%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_no_causalmask_has_bias_no_dropout_maxk_32.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bf16_has_causalmask_no_bias_no_dropout_maxk_32.cpp
index 32c7ea50f..672b58be1 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_no_causalmask_has_bias_no_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bf16_has_causalmask_no_bias_no_dropout_maxk_32.cpp
@@ -1,6 +1,6 @@
 
 /*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -8,12 +8,12 @@
  * The file is automatically generated, don't modify!
  */
 
-#include <ck/ck.hpp>
+#include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_batched_forward.h"
 
 template void run_batched_forward_causalmask_bias_dropout_dispatch<
-    ck::bhalf_t,
-    false,
+    ck_tile::bf16_t,
     true,
     false,
+    false,
     32>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_no_causalmask_has_bias_no_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bf16_has_causalmask_no_bias_no_dropout_maxk_64.cpp
similarity index 74%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_no_causalmask_has_bias_no_dropout_maxk_64.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bf16_has_causalmask_no_bias_no_dropout_maxk_64.cpp
index 8f41bf550..ed42d7c0b 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_no_causalmask_has_bias_no_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bf16_has_causalmask_no_bias_no_dropout_maxk_64.cpp
@@ -1,6 +1,6 @@
 
 /*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -8,12 +8,12 @@
  * The file is automatically generated, don't modify!
  */
 
-#include <ck/ck.hpp>
+#include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_batched_forward.h"
 
 template void run_batched_forward_causalmask_bias_dropout_dispatch<
-    ck::bhalf_t,
-    false,
+    ck_tile::bf16_t,
     true,
     false,
+    false,
     64>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_has_causalmask_no_bias_has_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bf16_no_causalmask_has_bias_has_dropout_maxk_128.cpp
similarity index 74%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_has_causalmask_no_bias_has_dropout_maxk_128.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bf16_no_causalmask_has_bias_has_dropout_maxk_128.cpp
index 6af1255c3..7e71f6b27 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_has_causalmask_no_bias_has_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bf16_no_causalmask_has_bias_has_dropout_maxk_128.cpp
@@ -1,6 +1,6 @@
 
 /*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -8,12 +8,12 @@
  * The file is automatically generated, don't modify!
  */
 
-#include <ck/ck.hpp>
+#include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_batched_forward.h"
 
 template void run_batched_forward_causalmask_bias_dropout_dispatch<
-    ck::bhalf_t,
-    true,
+    ck_tile::bf16_t,
     false,
     true,
+    true,
     128>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_has_causalmask_has_bias_no_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bf16_no_causalmask_has_bias_has_dropout_maxk_256.cpp
similarity index 74%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_has_causalmask_has_bias_no_dropout_maxk_256.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bf16_no_causalmask_has_bias_has_dropout_maxk_256.cpp
index 6d08b4bb7..5f0af8c18 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_has_causalmask_has_bias_no_dropout_maxk_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bf16_no_causalmask_has_bias_has_dropout_maxk_256.cpp
@@ -1,6 +1,6 @@
 
 /*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -8,12 +8,12 @@
  * The file is automatically generated, don't modify!
  */
 
-#include <ck/ck.hpp>
+#include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_batched_forward.h"
 
 template void run_batched_forward_causalmask_bias_dropout_dispatch<
-    ck::bhalf_t,
+    ck_tile::bf16_t,
+    false,
     true,
     true,
-    false,
     256>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_has_causalmask_has_bias_no_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bf16_no_causalmask_has_bias_has_dropout_maxk_32.cpp
similarity index 74%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_has_causalmask_has_bias_no_dropout_maxk_32.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bf16_no_causalmask_has_bias_has_dropout_maxk_32.cpp
index 6daa3edac..3aac80d51 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_has_causalmask_has_bias_no_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bf16_no_causalmask_has_bias_has_dropout_maxk_32.cpp
@@ -1,6 +1,6 @@
 
 /*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -8,12 +8,12 @@
  * The file is automatically generated, don't modify!
  */
 
-#include <ck/ck.hpp>
+#include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_batched_forward.h"
 
 template void run_batched_forward_causalmask_bias_dropout_dispatch<
-    ck::bhalf_t,
+    ck_tile::bf16_t,
+    false,
     true,
     true,
-    false,
     32>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_no_causalmask_has_bias_has_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bf16_no_causalmask_has_bias_has_dropout_maxk_64.cpp
similarity index 74%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_no_causalmask_has_bias_has_dropout_maxk_64.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bf16_no_causalmask_has_bias_has_dropout_maxk_64.cpp
index 2e654d8a1..8018e467f 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_no_causalmask_has_bias_has_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bf16_no_causalmask_has_bias_has_dropout_maxk_64.cpp
@@ -1,6 +1,6 @@
 
 /*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -8,11 +8,11 @@
  * The file is automatically generated, don't modify!
  */
 
-#include <ck/ck.hpp>
+#include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_batched_forward.h"
 
 template void run_batched_forward_causalmask_bias_dropout_dispatch<
-    ck::bhalf_t,
+    ck_tile::bf16_t,
     false,
     true,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_no_causalmask_no_bias_has_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bf16_no_causalmask_has_bias_no_dropout_maxk_128.cpp
similarity index 74%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_no_causalmask_no_bias_has_dropout_maxk_128.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bf16_no_causalmask_has_bias_no_dropout_maxk_128.cpp
index 063359755..0266d3a36 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_no_causalmask_no_bias_has_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bf16_no_causalmask_has_bias_no_dropout_maxk_128.cpp
@@ -1,6 +1,6 @@
 
 /*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -8,12 +8,12 @@
  * The file is automatically generated, don't modify!
  */
 
-#include <ck/ck.hpp>
+#include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_batched_forward.h"
 
 template void run_batched_forward_causalmask_bias_dropout_dispatch<
-    ck::bhalf_t,
-    false,
+    ck_tile::bf16_t,
     false,
     true,
+    false,
     128>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_no_causalmask_no_bias_has_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bf16_no_causalmask_has_bias_no_dropout_maxk_256.cpp
similarity index 74%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_no_causalmask_no_bias_has_dropout_maxk_256.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bf16_no_causalmask_has_bias_no_dropout_maxk_256.cpp
index 2a3207554..d327faf63 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_no_causalmask_no_bias_has_dropout_maxk_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bf16_no_causalmask_has_bias_no_dropout_maxk_256.cpp
@@ -1,6 +1,6 @@
 
 /*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -8,12 +8,12 @@
  * The file is automatically generated, don't modify!
  */
 
-#include <ck/ck.hpp>
+#include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_batched_forward.h"
 
 template void run_batched_forward_causalmask_bias_dropout_dispatch<
-    ck::bhalf_t,
-    false,
+    ck_tile::bf16_t,
     false,
     true,
+    false,
     256>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_has_causalmask_no_bias_no_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bf16_no_causalmask_has_bias_no_dropout_maxk_32.cpp
similarity index 74%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_has_causalmask_no_bias_no_dropout_maxk_32.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bf16_no_causalmask_has_bias_no_dropout_maxk_32.cpp
index 49108e76d..af2c6e8de 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_has_causalmask_no_bias_no_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bf16_no_causalmask_has_bias_no_dropout_maxk_32.cpp
@@ -1,6 +1,6 @@
 
 /*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -8,12 +8,12 @@
  * The file is automatically generated, don't modify!
  */
 
-#include <ck/ck.hpp>
+#include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_batched_forward.h"
 
 template void run_batched_forward_causalmask_bias_dropout_dispatch<
-    ck::bhalf_t,
-    true,
+    ck_tile::bf16_t,
     false,
+    true,
     false,
     32>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_no_causalmask_no_bias_has_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bf16_no_causalmask_has_bias_no_dropout_maxk_64.cpp
similarity index 74%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_no_causalmask_no_bias_has_dropout_maxk_64.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bf16_no_causalmask_has_bias_no_dropout_maxk_64.cpp
index 4e19f3be9..722dc77bb 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_no_causalmask_no_bias_has_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bf16_no_causalmask_has_bias_no_dropout_maxk_64.cpp
@@ -1,6 +1,6 @@
 
 /*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -8,12 +8,12 @@
  * The file is automatically generated, don't modify!
  */
 
-#include <ck/ck.hpp>
+#include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_batched_forward.h"
 
 template void run_batched_forward_causalmask_bias_dropout_dispatch<
-    ck::bhalf_t,
-    false,
+    ck_tile::bf16_t,
     false,
     true,
+    false,
     64>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_has_causalmask_no_bias_no_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bf16_no_causalmask_no_bias_has_dropout_maxk_128.cpp
similarity index 74%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_has_causalmask_no_bias_no_dropout_maxk_128.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bf16_no_causalmask_no_bias_has_dropout_maxk_128.cpp
index 8d3003cde..9ab840b67 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_has_causalmask_no_bias_no_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bf16_no_causalmask_no_bias_has_dropout_maxk_128.cpp
@@ -1,6 +1,6 @@
 
 /*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -8,12 +8,12 @@
  * The file is automatically generated, don't modify!
  */
 
-#include <ck/ck.hpp>
+#include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_batched_forward.h"
 
 template void run_batched_forward_causalmask_bias_dropout_dispatch<
-    ck::bhalf_t,
-    true,
+    ck_tile::bf16_t,
     false,
     false,
+    true,
     128>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_has_causalmask_no_bias_no_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bf16_no_causalmask_no_bias_has_dropout_maxk_256.cpp
similarity index 74%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_has_causalmask_no_bias_no_dropout_maxk_256.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bf16_no_causalmask_no_bias_has_dropout_maxk_256.cpp
index f28877eeb..6b6c4b6a1 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_has_causalmask_no_bias_no_dropout_maxk_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bf16_no_causalmask_no_bias_has_dropout_maxk_256.cpp
@@ -1,6 +1,6 @@
 
 /*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -8,12 +8,12 @@
  * The file is automatically generated, don't modify!
  */
 
-#include <ck/ck.hpp>
+#include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_batched_forward.h"
 
 template void run_batched_forward_causalmask_bias_dropout_dispatch<
-    ck::bhalf_t,
-    true,
+    ck_tile::bf16_t,
     false,
     false,
+    true,
     256>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_no_causalmask_no_bias_has_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bf16_no_causalmask_no_bias_has_dropout_maxk_32.cpp
similarity index 74%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_no_causalmask_no_bias_has_dropout_maxk_32.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bf16_no_causalmask_no_bias_has_dropout_maxk_32.cpp
index 3da70de62..afd3bcfc3 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_no_causalmask_no_bias_has_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bf16_no_causalmask_no_bias_has_dropout_maxk_32.cpp
@@ -1,6 +1,6 @@
 
 /*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -8,11 +8,11 @@
  * The file is automatically generated, don't modify!
  */
 
-#include <ck/ck.hpp>
+#include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_batched_forward.h"
 
 template void run_batched_forward_causalmask_bias_dropout_dispatch<
-    ck::bhalf_t,
+    ck_tile::bf16_t,
     false,
     false,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_has_causalmask_no_bias_no_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bf16_no_causalmask_no_bias_has_dropout_maxk_64.cpp
similarity index 74%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_has_causalmask_no_bias_no_dropout_maxk_64.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bf16_no_causalmask_no_bias_has_dropout_maxk_64.cpp
index ffc65eed8..a349964c0 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_has_causalmask_no_bias_no_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bf16_no_causalmask_no_bias_has_dropout_maxk_64.cpp
@@ -1,6 +1,6 @@
 
 /*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -8,12 +8,12 @@
  * The file is automatically generated, don't modify!
  */
 
-#include <ck/ck.hpp>
+#include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_batched_forward.h"
 
 template void run_batched_forward_causalmask_bias_dropout_dispatch<
-    ck::bhalf_t,
-    true,
+    ck_tile::bf16_t,
     false,
     false,
+    true,
     64>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_no_causalmask_no_bias_no_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bf16_no_causalmask_no_bias_no_dropout_maxk_128.cpp
similarity index 74%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_no_causalmask_no_bias_no_dropout_maxk_128.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bf16_no_causalmask_no_bias_no_dropout_maxk_128.cpp
index 4a4f30052..03eb236cc 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_no_causalmask_no_bias_no_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bf16_no_causalmask_no_bias_no_dropout_maxk_128.cpp
@@ -1,6 +1,6 @@
 
 /*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -8,11 +8,11 @@
  * The file is automatically generated, don't modify!
  */
 
-#include <ck/ck.hpp>
+#include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_batched_forward.h"
 
 template void run_batched_forward_causalmask_bias_dropout_dispatch<
-    ck::bhalf_t,
+    ck_tile::bf16_t,
     false,
     false,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_no_causalmask_no_bias_no_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bf16_no_causalmask_no_bias_no_dropout_maxk_256.cpp
similarity index 74%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_no_causalmask_no_bias_no_dropout_maxk_256.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bf16_no_causalmask_no_bias_no_dropout_maxk_256.cpp
index 436b9099f..19dc010e4 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_no_causalmask_no_bias_no_dropout_maxk_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bf16_no_causalmask_no_bias_no_dropout_maxk_256.cpp
@@ -1,6 +1,6 @@
 
 /*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -8,11 +8,11 @@
  * The file is automatically generated, don't modify!
  */
 
-#include <ck/ck.hpp>
+#include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_batched_forward.h"
 
 template void run_batched_forward_causalmask_bias_dropout_dispatch<
-    ck::bhalf_t,
+    ck_tile::bf16_t,
     false,
     false,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_no_causalmask_no_bias_no_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bf16_no_causalmask_no_bias_no_dropout_maxk_32.cpp
similarity index 74%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_no_causalmask_no_bias_no_dropout_maxk_32.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bf16_no_causalmask_no_bias_no_dropout_maxk_32.cpp
index 5ab62c09b..14272770f 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_no_causalmask_no_bias_no_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bf16_no_causalmask_no_bias_no_dropout_maxk_32.cpp
@@ -1,6 +1,6 @@
 
 /*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -8,11 +8,11 @@
  * The file is automatically generated, don't modify!
  */
 
-#include <ck/ck.hpp>
+#include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_batched_forward.h"
 
 template void run_batched_forward_causalmask_bias_dropout_dispatch<
-    ck::bhalf_t,
+    ck_tile::bf16_t,
     false,
     false,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_no_causalmask_no_bias_no_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bf16_no_causalmask_no_bias_no_dropout_maxk_64.cpp
similarity index 74%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_no_causalmask_no_bias_no_dropout_maxk_64.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bf16_no_causalmask_no_bias_no_dropout_maxk_64.cpp
index f1c11f424..bf7aefc53 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bp16_no_causalmask_no_bias_no_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bf16_no_causalmask_no_bias_no_dropout_maxk_64.cpp
@@ -1,6 +1,6 @@
 
 /*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -8,11 +8,11 @@
  * The file is automatically generated, don't modify!
  */
 
-#include <ck/ck.hpp>
+#include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_batched_forward.h"
 
 template void run_batched_forward_causalmask_bias_dropout_dispatch<
-    ck::bhalf_t,
+    ck_tile::bf16_t,
     false,
     false,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_has_causalmask_has_bias_has_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_has_causalmask_has_bias_has_dropout_maxk_128.cpp
index db8135481..6e2e94259 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_has_causalmask_has_bias_has_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_has_causalmask_has_bias_has_dropout_maxk_128.cpp
@@ -1,6 +1,6 @@
 
 /*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -8,11 +8,11 @@
  * The file is automatically generated, don't modify!
  */
 
-#include <ck/ck.hpp>
+#include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_batched_forward.h"
 
 template void run_batched_forward_causalmask_bias_dropout_dispatch<
-    ck::half_t,
+    ck_tile::fp16_t,
     true,
     true,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_has_causalmask_has_bias_has_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_has_causalmask_has_bias_has_dropout_maxk_256.cpp
index 814b9d8ea..e08bb00a1 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_has_causalmask_has_bias_has_dropout_maxk_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_has_causalmask_has_bias_has_dropout_maxk_256.cpp
@@ -1,6 +1,6 @@
 
 /*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -8,11 +8,11 @@
  * The file is automatically generated, don't modify!
  */
 
-#include <ck/ck.hpp>
+#include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_batched_forward.h"
 
 template void run_batched_forward_causalmask_bias_dropout_dispatch<
-    ck::half_t,
+    ck_tile::fp16_t,
     true,
     true,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_has_causalmask_has_bias_has_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_has_causalmask_has_bias_has_dropout_maxk_32.cpp
index 6576c4e2d..96de7b864 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_has_causalmask_has_bias_has_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_has_causalmask_has_bias_has_dropout_maxk_32.cpp
@@ -1,6 +1,6 @@
 
 /*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -8,11 +8,11 @@
  * The file is automatically generated, don't modify!
  */
 
-#include <ck/ck.hpp>
+#include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_batched_forward.h"
 
 template void run_batched_forward_causalmask_bias_dropout_dispatch<
-    ck::half_t,
+    ck_tile::fp16_t,
     true,
     true,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_has_causalmask_has_bias_has_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_has_causalmask_has_bias_has_dropout_maxk_64.cpp
index 4bf477d19..f82f2b471 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_has_causalmask_has_bias_has_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_has_causalmask_has_bias_has_dropout_maxk_64.cpp
@@ -1,6 +1,6 @@
 
 /*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -8,11 +8,11 @@
  * The file is automatically generated, don't modify!
  */
 
-#include <ck/ck.hpp>
+#include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_batched_forward.h"
 
 template void run_batched_forward_causalmask_bias_dropout_dispatch<
-    ck::half_t,
+    ck_tile::fp16_t,
     true,
     true,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_has_causalmask_has_bias_no_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_has_causalmask_has_bias_no_dropout_maxk_128.cpp
index 310a03420..60eda29ce 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_has_causalmask_has_bias_no_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_has_causalmask_has_bias_no_dropout_maxk_128.cpp
@@ -1,6 +1,6 @@
 
 /*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -8,11 +8,11 @@
  * The file is automatically generated, don't modify!
  */
 
-#include <ck/ck.hpp>
+#include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_batched_forward.h"
 
 template void run_batched_forward_causalmask_bias_dropout_dispatch<
-    ck::half_t,
+    ck_tile::fp16_t,
     true,
     true,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_has_causalmask_has_bias_no_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_has_causalmask_has_bias_no_dropout_maxk_256.cpp
index fda6ea614..9cb7c591b 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_has_causalmask_has_bias_no_dropout_maxk_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_has_causalmask_has_bias_no_dropout_maxk_256.cpp
@@ -1,6 +1,6 @@
 
 /*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -8,11 +8,11 @@
  * The file is automatically generated, don't modify!
  */
 
-#include <ck/ck.hpp>
+#include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_batched_forward.h"
 
 template void run_batched_forward_causalmask_bias_dropout_dispatch<
-    ck::half_t,
+    ck_tile::fp16_t,
     true,
     true,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_has_causalmask_has_bias_no_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_has_causalmask_has_bias_no_dropout_maxk_32.cpp
index 121d264a3..effc47a63 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_has_causalmask_has_bias_no_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_has_causalmask_has_bias_no_dropout_maxk_32.cpp
@@ -1,6 +1,6 @@
 
 /*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -8,11 +8,11 @@
  * The file is automatically generated, don't modify!
  */
 
-#include <ck/ck.hpp>
+#include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_batched_forward.h"
 
 template void run_batched_forward_causalmask_bias_dropout_dispatch<
-    ck::half_t,
+    ck_tile::fp16_t,
     true,
     true,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_has_causalmask_has_bias_no_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_has_causalmask_has_bias_no_dropout_maxk_64.cpp
index ca98bf25a..477ec5f36 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_has_causalmask_has_bias_no_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_has_causalmask_has_bias_no_dropout_maxk_64.cpp
@@ -1,6 +1,6 @@
 
 /*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -8,11 +8,11 @@
  * The file is automatically generated, don't modify!
  */
 
-#include <ck/ck.hpp>
+#include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_batched_forward.h"
 
 template void run_batched_forward_causalmask_bias_dropout_dispatch<
-    ck::half_t,
+    ck_tile::fp16_t,
     true,
     true,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_has_causalmask_no_bias_has_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_has_causalmask_no_bias_has_dropout_maxk_128.cpp
index a4881489d..b75a4f46f 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_has_causalmask_no_bias_has_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_has_causalmask_no_bias_has_dropout_maxk_128.cpp
@@ -1,6 +1,6 @@
 
 /*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -8,11 +8,11 @@
  * The file is automatically generated, don't modify!
  */
 
-#include <ck/ck.hpp>
+#include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_batched_forward.h"
 
 template void run_batched_forward_causalmask_bias_dropout_dispatch<
-    ck::half_t,
+    ck_tile::fp16_t,
     true,
     false,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_has_causalmask_no_bias_has_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_has_causalmask_no_bias_has_dropout_maxk_256.cpp
index 7a8d21150..322d9c2e2 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_has_causalmask_no_bias_has_dropout_maxk_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_has_causalmask_no_bias_has_dropout_maxk_256.cpp
@@ -1,6 +1,6 @@
 
 /*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -8,11 +8,11 @@
  * The file is automatically generated, don't modify!
  */
 
-#include <ck/ck.hpp>
+#include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_batched_forward.h"
 
 template void run_batched_forward_causalmask_bias_dropout_dispatch<
-    ck::half_t,
+    ck_tile::fp16_t,
     true,
     false,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_has_causalmask_no_bias_has_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_has_causalmask_no_bias_has_dropout_maxk_32.cpp
index 2d8c78b9e..77fb6a604 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_has_causalmask_no_bias_has_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_has_causalmask_no_bias_has_dropout_maxk_32.cpp
@@ -1,6 +1,6 @@
 
 /*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -8,11 +8,11 @@
  * The file is automatically generated, don't modify!
  */
 
-#include <ck/ck.hpp>
+#include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_batched_forward.h"
 
 template void run_batched_forward_causalmask_bias_dropout_dispatch<
-    ck::half_t,
+    ck_tile::fp16_t,
     true,
     false,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_has_causalmask_no_bias_has_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_has_causalmask_no_bias_has_dropout_maxk_64.cpp
index db9d24e33..57214e6f3 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_has_causalmask_no_bias_has_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_has_causalmask_no_bias_has_dropout_maxk_64.cpp
@@ -1,6 +1,6 @@
 
 /*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -8,11 +8,11 @@
  * The file is automatically generated, don't modify!
  */
 
-#include <ck/ck.hpp>
+#include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_batched_forward.h"
 
 template void run_batched_forward_causalmask_bias_dropout_dispatch<
-    ck::half_t,
+    ck_tile::fp16_t,
     true,
     false,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_has_causalmask_no_bias_no_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_has_causalmask_no_bias_no_dropout_maxk_128.cpp
index e917e4574..3b4f1be34 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_has_causalmask_no_bias_no_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_has_causalmask_no_bias_no_dropout_maxk_128.cpp
@@ -1,6 +1,6 @@
 
 /*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -8,11 +8,11 @@
  * The file is automatically generated, don't modify!
  */
 
-#include <ck/ck.hpp>
+#include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_batched_forward.h"
 
 template void run_batched_forward_causalmask_bias_dropout_dispatch<
-    ck::half_t,
+    ck_tile::fp16_t,
     true,
     false,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_has_causalmask_no_bias_no_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_has_causalmask_no_bias_no_dropout_maxk_256.cpp
index 170647a65..afc858efb 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_has_causalmask_no_bias_no_dropout_maxk_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_has_causalmask_no_bias_no_dropout_maxk_256.cpp
@@ -1,6 +1,6 @@
 
 /*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -8,11 +8,11 @@
  * The file is automatically generated, don't modify!
  */
 
-#include <ck/ck.hpp>
+#include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_batched_forward.h"
 
 template void run_batched_forward_causalmask_bias_dropout_dispatch<
-    ck::half_t,
+    ck_tile::fp16_t,
     true,
     false,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_has_causalmask_no_bias_no_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_has_causalmask_no_bias_no_dropout_maxk_32.cpp
index acdb267fd..bdf207633 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_has_causalmask_no_bias_no_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_has_causalmask_no_bias_no_dropout_maxk_32.cpp
@@ -1,6 +1,6 @@
 
 /*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -8,11 +8,11 @@
  * The file is automatically generated, don't modify!
  */
 
-#include <ck/ck.hpp>
+#include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_batched_forward.h"
 
 template void run_batched_forward_causalmask_bias_dropout_dispatch<
-    ck::half_t,
+    ck_tile::fp16_t,
     true,
     false,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_has_causalmask_no_bias_no_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_has_causalmask_no_bias_no_dropout_maxk_64.cpp
index 14c01441b..ea656db19 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_has_causalmask_no_bias_no_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_has_causalmask_no_bias_no_dropout_maxk_64.cpp
@@ -1,6 +1,6 @@
 
 /*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -8,11 +8,11 @@
  * The file is automatically generated, don't modify!
  */
 
-#include <ck/ck.hpp>
+#include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_batched_forward.h"
 
 template void run_batched_forward_causalmask_bias_dropout_dispatch<
-    ck::half_t,
+    ck_tile::fp16_t,
     true,
     false,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_no_causalmask_has_bias_has_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_no_causalmask_has_bias_has_dropout_maxk_128.cpp
index c87a853a4..5d65d7ae7 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_no_causalmask_has_bias_has_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_no_causalmask_has_bias_has_dropout_maxk_128.cpp
@@ -1,6 +1,6 @@
 
 /*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -8,11 +8,11 @@
  * The file is automatically generated, don't modify!
  */
 
-#include <ck/ck.hpp>
+#include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_batched_forward.h"
 
 template void run_batched_forward_causalmask_bias_dropout_dispatch<
-    ck::half_t,
+    ck_tile::fp16_t,
     false,
     true,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_no_causalmask_has_bias_has_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_no_causalmask_has_bias_has_dropout_maxk_256.cpp
index 62d6f3f14..709138805 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_no_causalmask_has_bias_has_dropout_maxk_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_no_causalmask_has_bias_has_dropout_maxk_256.cpp
@@ -1,6 +1,6 @@
 
 /*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -8,11 +8,11 @@
  * The file is automatically generated, don't modify!
  */
 
-#include <ck/ck.hpp>
+#include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_batched_forward.h"
 
 template void run_batched_forward_causalmask_bias_dropout_dispatch<
-    ck::half_t,
+    ck_tile::fp16_t,
     false,
     true,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_no_causalmask_has_bias_has_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_no_causalmask_has_bias_has_dropout_maxk_32.cpp
index 73dc87fc1..c50e52c86 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_no_causalmask_has_bias_has_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_no_causalmask_has_bias_has_dropout_maxk_32.cpp
@@ -1,6 +1,6 @@
 
 /*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -8,11 +8,11 @@
  * The file is automatically generated, don't modify!
  */
 
-#include <ck/ck.hpp>
+#include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_batched_forward.h"
 
 template void run_batched_forward_causalmask_bias_dropout_dispatch<
-    ck::half_t,
+    ck_tile::fp16_t,
     false,
     true,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_no_causalmask_has_bias_has_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_no_causalmask_has_bias_has_dropout_maxk_64.cpp
index dacb7ed77..1808842fc 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_no_causalmask_has_bias_has_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_no_causalmask_has_bias_has_dropout_maxk_64.cpp
@@ -1,6 +1,6 @@
 
 /*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -8,11 +8,11 @@
  * The file is automatically generated, don't modify!
  */
 
-#include <ck/ck.hpp>
+#include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_batched_forward.h"
 
 template void run_batched_forward_causalmask_bias_dropout_dispatch<
-    ck::half_t,
+    ck_tile::fp16_t,
     false,
     true,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_no_causalmask_has_bias_no_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_no_causalmask_has_bias_no_dropout_maxk_128.cpp
index f535ef4f6..367c420a4 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_no_causalmask_has_bias_no_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_no_causalmask_has_bias_no_dropout_maxk_128.cpp
@@ -1,6 +1,6 @@
 
 /*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -8,11 +8,11 @@
  * The file is automatically generated, don't modify!
  */
 
-#include <ck/ck.hpp>
+#include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_batched_forward.h"
 
 template void run_batched_forward_causalmask_bias_dropout_dispatch<
-    ck::half_t,
+    ck_tile::fp16_t,
     false,
     true,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_no_causalmask_has_bias_no_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_no_causalmask_has_bias_no_dropout_maxk_256.cpp
index de1bbe73f..8f213bfef 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_no_causalmask_has_bias_no_dropout_maxk_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_no_causalmask_has_bias_no_dropout_maxk_256.cpp
@@ -1,6 +1,6 @@
 
 /*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -8,11 +8,11 @@
  * The file is automatically generated, don't modify!
  */
 
-#include <ck/ck.hpp>
+#include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_batched_forward.h"
 
 template void run_batched_forward_causalmask_bias_dropout_dispatch<
-    ck::half_t,
+    ck_tile::fp16_t,
     false,
     true,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_no_causalmask_has_bias_no_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_no_causalmask_has_bias_no_dropout_maxk_32.cpp
index ad9d39793..fd5da6b77 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_no_causalmask_has_bias_no_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_no_causalmask_has_bias_no_dropout_maxk_32.cpp
@@ -1,6 +1,6 @@
 
 /*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -8,11 +8,11 @@
  * The file is automatically generated, don't modify!
  */
 
-#include <ck/ck.hpp>
+#include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_batched_forward.h"
 
 template void run_batched_forward_causalmask_bias_dropout_dispatch<
-    ck::half_t,
+    ck_tile::fp16_t,
     false,
     true,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_no_causalmask_has_bias_no_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_no_causalmask_has_bias_no_dropout_maxk_64.cpp
index 5f040fa03..70e0723bb 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_no_causalmask_has_bias_no_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_no_causalmask_has_bias_no_dropout_maxk_64.cpp
@@ -1,6 +1,6 @@
 
 /*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -8,11 +8,11 @@
  * The file is automatically generated, don't modify!
  */
 
-#include <ck/ck.hpp>
+#include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_batched_forward.h"
 
 template void run_batched_forward_causalmask_bias_dropout_dispatch<
-    ck::half_t,
+    ck_tile::fp16_t,
     false,
     true,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_no_causalmask_no_bias_has_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_no_causalmask_no_bias_has_dropout_maxk_128.cpp
index c6171c350..4f8e39ac1 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_no_causalmask_no_bias_has_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_no_causalmask_no_bias_has_dropout_maxk_128.cpp
@@ -1,6 +1,6 @@
 
 /*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -8,11 +8,11 @@
  * The file is automatically generated, don't modify!
  */
 
-#include <ck/ck.hpp>
+#include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_batched_forward.h"
 
 template void run_batched_forward_causalmask_bias_dropout_dispatch<
-    ck::half_t,
+    ck_tile::fp16_t,
     false,
     false,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_no_causalmask_no_bias_has_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_no_causalmask_no_bias_has_dropout_maxk_256.cpp
index 5518daba3..3d3be36e9 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_no_causalmask_no_bias_has_dropout_maxk_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_no_causalmask_no_bias_has_dropout_maxk_256.cpp
@@ -1,6 +1,6 @@
 
 /*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -8,11 +8,11 @@
  * The file is automatically generated, don't modify!
  */
 
-#include <ck/ck.hpp>
+#include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_batched_forward.h"
 
 template void run_batched_forward_causalmask_bias_dropout_dispatch<
-    ck::half_t,
+    ck_tile::fp16_t,
     false,
     false,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_no_causalmask_no_bias_has_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_no_causalmask_no_bias_has_dropout_maxk_32.cpp
index 0607c2325..21aae8f7c 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_no_causalmask_no_bias_has_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_no_causalmask_no_bias_has_dropout_maxk_32.cpp
@@ -1,6 +1,6 @@
 
 /*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -8,11 +8,11 @@
  * The file is automatically generated, don't modify!
  */
 
-#include <ck/ck.hpp>
+#include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_batched_forward.h"
 
 template void run_batched_forward_causalmask_bias_dropout_dispatch<
-    ck::half_t,
+    ck_tile::fp16_t,
     false,
     false,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_no_causalmask_no_bias_has_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_no_causalmask_no_bias_has_dropout_maxk_64.cpp
index e0e156802..514a01a39 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_no_causalmask_no_bias_has_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_no_causalmask_no_bias_has_dropout_maxk_64.cpp
@@ -1,6 +1,6 @@
 
 /*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -8,11 +8,11 @@
  * The file is automatically generated, don't modify!
  */
 
-#include <ck/ck.hpp>
+#include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_batched_forward.h"
 
 template void run_batched_forward_causalmask_bias_dropout_dispatch<
-    ck::half_t,
+    ck_tile::fp16_t,
     false,
     false,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_no_causalmask_no_bias_no_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_no_causalmask_no_bias_no_dropout_maxk_128.cpp
index 22082a993..c67d1c653 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_no_causalmask_no_bias_no_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_no_causalmask_no_bias_no_dropout_maxk_128.cpp
@@ -1,6 +1,6 @@
 
 /*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -8,11 +8,11 @@
  * The file is automatically generated, don't modify!
  */
 
-#include <ck/ck.hpp>
+#include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_batched_forward.h"
 
 template void run_batched_forward_causalmask_bias_dropout_dispatch<
-    ck::half_t,
+    ck_tile::fp16_t,
     false,
     false,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_no_causalmask_no_bias_no_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_no_causalmask_no_bias_no_dropout_maxk_256.cpp
index e52ed1a52..810036325 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_no_causalmask_no_bias_no_dropout_maxk_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_no_causalmask_no_bias_no_dropout_maxk_256.cpp
@@ -1,6 +1,6 @@
 
 /*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -8,11 +8,11 @@
  * The file is automatically generated, don't modify!
  */
 
-#include <ck/ck.hpp>
+#include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_batched_forward.h"
 
 template void run_batched_forward_causalmask_bias_dropout_dispatch<
-    ck::half_t,
+    ck_tile::fp16_t,
     false,
     false,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_no_causalmask_no_bias_no_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_no_causalmask_no_bias_no_dropout_maxk_32.cpp
index 37bee2973..7dda46c89 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_no_causalmask_no_bias_no_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_no_causalmask_no_bias_no_dropout_maxk_32.cpp
@@ -1,6 +1,6 @@
 
 /*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -8,11 +8,11 @@
  * The file is automatically generated, don't modify!
  */
 
-#include <ck/ck.hpp>
+#include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_batched_forward.h"
 
 template void run_batched_forward_causalmask_bias_dropout_dispatch<
-    ck::half_t,
+    ck_tile::fp16_t,
     false,
     false,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_no_causalmask_no_bias_no_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_no_causalmask_no_bias_no_dropout_maxk_64.cpp
index 3deec3078..2392b9498 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_no_causalmask_no_bias_no_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_no_causalmask_no_bias_no_dropout_maxk_64.cpp
@@ -1,6 +1,6 @@
 
 /*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -8,11 +8,11 @@
  * The file is automatically generated, don't modify!
  */
 
-#include <ck/ck.hpp>
+#include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_batched_forward.h"
 
 template void run_batched_forward_causalmask_bias_dropout_dispatch<
-    ck::half_t,
+    ck_tile::fp16_t,
     false,
     false,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_has_causalmask_has_bias_has_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bf16_has_causalmask_has_bias_has_dropout_maxk_128.cpp
similarity index 74%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_has_causalmask_has_bias_has_dropout_maxk_128.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bf16_has_causalmask_has_bias_has_dropout_maxk_128.cpp
index 8923f4008..74743b024 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_has_causalmask_has_bias_has_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bf16_has_causalmask_has_bias_has_dropout_maxk_128.cpp
@@ -1,6 +1,6 @@
 
 /*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -8,11 +8,11 @@
  * The file is automatically generated, don't modify!
  */
 
-#include <ck/ck.hpp>
+#include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_batched_infer.h"
 
 template void run_batched_infer_causalmask_bias_dropout_dispatch<
-    ck::bhalf_t,
+    ck_tile::bf16_t,
     true,
     true,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_has_causalmask_has_bias_has_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bf16_has_causalmask_has_bias_has_dropout_maxk_256.cpp
similarity index 74%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_has_causalmask_has_bias_has_dropout_maxk_256.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bf16_has_causalmask_has_bias_has_dropout_maxk_256.cpp
index c21f4dcdd..20290bab8 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_has_causalmask_has_bias_has_dropout_maxk_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bf16_has_causalmask_has_bias_has_dropout_maxk_256.cpp
@@ -1,6 +1,6 @@
 
 /*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -8,11 +8,11 @@
  * The file is automatically generated, don't modify!
  */
 
-#include <ck/ck.hpp>
+#include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_batched_infer.h"
 
 template void run_batched_infer_causalmask_bias_dropout_dispatch<
-    ck::bhalf_t,
+    ck_tile::bf16_t,
     true,
     true,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_has_causalmask_has_bias_has_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bf16_has_causalmask_has_bias_has_dropout_maxk_32.cpp
similarity index 74%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_has_causalmask_has_bias_has_dropout_maxk_32.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bf16_has_causalmask_has_bias_has_dropout_maxk_32.cpp
index 40483eab7..ab3225bd4 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_has_causalmask_has_bias_has_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bf16_has_causalmask_has_bias_has_dropout_maxk_32.cpp
@@ -1,6 +1,6 @@
 
 /*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -8,11 +8,11 @@
  * The file is automatically generated, don't modify!
  */
 
-#include <ck/ck.hpp>
+#include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_batched_infer.h"
 
 template void run_batched_infer_causalmask_bias_dropout_dispatch<
-    ck::bhalf_t,
+    ck_tile::bf16_t,
     true,
     true,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_has_causalmask_has_bias_has_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bf16_has_causalmask_has_bias_has_dropout_maxk_64.cpp
similarity index 74%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_has_causalmask_has_bias_has_dropout_maxk_64.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bf16_has_causalmask_has_bias_has_dropout_maxk_64.cpp
index 319648375..310442726 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_has_causalmask_has_bias_has_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bf16_has_causalmask_has_bias_has_dropout_maxk_64.cpp
@@ -1,6 +1,6 @@
 
 /*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -8,11 +8,11 @@
  * The file is automatically generated, don't modify!
  */
 
-#include <ck/ck.hpp>
+#include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_batched_infer.h"
 
 template void run_batched_infer_causalmask_bias_dropout_dispatch<
-    ck::bhalf_t,
+    ck_tile::bf16_t,
     true,
     true,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_has_causalmask_has_bias_no_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bf16_has_causalmask_has_bias_no_dropout_maxk_128.cpp
similarity index 74%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_has_causalmask_has_bias_no_dropout_maxk_128.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bf16_has_causalmask_has_bias_no_dropout_maxk_128.cpp
index b0928ecfc..af36d315e 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_has_causalmask_has_bias_no_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bf16_has_causalmask_has_bias_no_dropout_maxk_128.cpp
@@ -1,6 +1,6 @@
 
 /*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -8,11 +8,11 @@
  * The file is automatically generated, don't modify!
  */
 
-#include <ck/ck.hpp>
+#include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_batched_infer.h"
 
 template void run_batched_infer_causalmask_bias_dropout_dispatch<
-    ck::bhalf_t,
+    ck_tile::bf16_t,
     true,
     true,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_no_causalmask_has_bias_has_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bf16_has_causalmask_has_bias_no_dropout_maxk_256.cpp
similarity index 74%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_no_causalmask_has_bias_has_dropout_maxk_256.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bf16_has_causalmask_has_bias_no_dropout_maxk_256.cpp
index 98f6d6723..b25e1be08 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_no_causalmask_has_bias_has_dropout_maxk_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bf16_has_causalmask_has_bias_no_dropout_maxk_256.cpp
@@ -1,6 +1,6 @@
 
 /*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -8,12 +8,12 @@
  * The file is automatically generated, don't modify!
  */
 
-#include <ck/ck.hpp>
+#include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_batched_infer.h"
 
 template void run_batched_infer_causalmask_bias_dropout_dispatch<
-    ck::bhalf_t,
-    false,
+    ck_tile::bf16_t,
     true,
     true,
+    false,
     256>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_has_causalmask_no_bias_has_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bf16_has_causalmask_has_bias_no_dropout_maxk_32.cpp
similarity index 74%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_has_causalmask_no_bias_has_dropout_maxk_32.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bf16_has_causalmask_has_bias_no_dropout_maxk_32.cpp
index 164c45405..5e660a8ea 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_has_causalmask_no_bias_has_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bf16_has_causalmask_has_bias_no_dropout_maxk_32.cpp
@@ -1,6 +1,6 @@
 
 /*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -8,12 +8,12 @@
  * The file is automatically generated, don't modify!
  */
 
-#include <ck/ck.hpp>
+#include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_batched_infer.h"
 
 template void run_batched_infer_causalmask_bias_dropout_dispatch<
-    ck::bhalf_t,
+    ck_tile::bf16_t,
     true,
-    false,
     true,
+    false,
     32>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_has_causalmask_has_bias_no_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bf16_has_causalmask_has_bias_no_dropout_maxk_64.cpp
similarity index 74%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_has_causalmask_has_bias_no_dropout_maxk_64.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bf16_has_causalmask_has_bias_no_dropout_maxk_64.cpp
index 640f9fe2d..39153d92f 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_has_causalmask_has_bias_no_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bf16_has_causalmask_has_bias_no_dropout_maxk_64.cpp
@@ -1,6 +1,6 @@
 
 /*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -8,11 +8,11 @@
  * The file is automatically generated, don't modify!
  */
 
-#include <ck/ck.hpp>
+#include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_batched_infer.h"
 
 template void run_batched_infer_causalmask_bias_dropout_dispatch<
-    ck::bhalf_t,
+    ck_tile::bf16_t,
     true,
     true,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_has_causalmask_no_bias_has_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bf16_has_causalmask_no_bias_has_dropout_maxk_128.cpp
similarity index 74%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_has_causalmask_no_bias_has_dropout_maxk_128.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bf16_has_causalmask_no_bias_has_dropout_maxk_128.cpp
index 9597383c9..bf3c3f21a 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_has_causalmask_no_bias_has_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bf16_has_causalmask_no_bias_has_dropout_maxk_128.cpp
@@ -1,6 +1,6 @@
 
 /*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -8,11 +8,11 @@
  * The file is automatically generated, don't modify!
  */
 
-#include <ck/ck.hpp>
+#include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_batched_infer.h"
 
 template void run_batched_infer_causalmask_bias_dropout_dispatch<
-    ck::bhalf_t,
+    ck_tile::bf16_t,
     true,
     false,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_has_causalmask_no_bias_has_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bf16_has_causalmask_no_bias_has_dropout_maxk_256.cpp
similarity index 74%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_has_causalmask_no_bias_has_dropout_maxk_256.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bf16_has_causalmask_no_bias_has_dropout_maxk_256.cpp
index fe8993be4..e9c1c0551 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_has_causalmask_no_bias_has_dropout_maxk_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bf16_has_causalmask_no_bias_has_dropout_maxk_256.cpp
@@ -1,6 +1,6 @@
 
 /*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -8,11 +8,11 @@
  * The file is automatically generated, don't modify!
  */
 
-#include <ck/ck.hpp>
+#include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_batched_infer.h"
 
 template void run_batched_infer_causalmask_bias_dropout_dispatch<
-    ck::bhalf_t,
+    ck_tile::bf16_t,
     true,
     false,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_no_causalmask_has_bias_has_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bf16_has_causalmask_no_bias_has_dropout_maxk_32.cpp
similarity index 74%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_no_causalmask_has_bias_has_dropout_maxk_32.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bf16_has_causalmask_no_bias_has_dropout_maxk_32.cpp
index 627f4ea61..e35a1e7a5 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_no_causalmask_has_bias_has_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bf16_has_causalmask_no_bias_has_dropout_maxk_32.cpp
@@ -1,6 +1,6 @@
 
 /*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -8,12 +8,12 @@
  * The file is automatically generated, don't modify!
  */
 
-#include <ck/ck.hpp>
+#include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_batched_infer.h"
 
 template void run_batched_infer_causalmask_bias_dropout_dispatch<
-    ck::bhalf_t,
-    false,
+    ck_tile::bf16_t,
     true,
+    false,
     true,
     32>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_has_causalmask_no_bias_has_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bf16_has_causalmask_no_bias_has_dropout_maxk_64.cpp
similarity index 74%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_has_causalmask_no_bias_has_dropout_maxk_64.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bf16_has_causalmask_no_bias_has_dropout_maxk_64.cpp
index 7f7f9af7d..577972843 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_has_causalmask_no_bias_has_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bf16_has_causalmask_no_bias_has_dropout_maxk_64.cpp
@@ -1,6 +1,6 @@
 
 /*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -8,11 +8,11 @@
  * The file is automatically generated, don't modify!
  */
 
-#include <ck/ck.hpp>
+#include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_batched_infer.h"
 
 template void run_batched_infer_causalmask_bias_dropout_dispatch<
-    ck::bhalf_t,
+    ck_tile::bf16_t,
     true,
     false,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_no_causalmask_has_bias_no_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bf16_has_causalmask_no_bias_no_dropout_maxk_128.cpp
similarity index 74%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_no_causalmask_has_bias_no_dropout_maxk_128.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bf16_has_causalmask_no_bias_no_dropout_maxk_128.cpp
index fabe89504..bb48b49d2 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_no_causalmask_has_bias_no_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bf16_has_causalmask_no_bias_no_dropout_maxk_128.cpp
@@ -1,6 +1,6 @@
 
 /*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -8,12 +8,12 @@
  * The file is automatically generated, don't modify!
  */
 
-#include <ck/ck.hpp>
+#include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_batched_infer.h"
 
 template void run_batched_infer_causalmask_bias_dropout_dispatch<
-    ck::bhalf_t,
-    false,
+    ck_tile::bf16_t,
     true,
     false,
+    false,
     128>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_no_causalmask_has_bias_no_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bf16_has_causalmask_no_bias_no_dropout_maxk_256.cpp
similarity index 74%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_no_causalmask_has_bias_no_dropout_maxk_256.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bf16_has_causalmask_no_bias_no_dropout_maxk_256.cpp
index ca31525f0..d13429529 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_no_causalmask_has_bias_no_dropout_maxk_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bf16_has_causalmask_no_bias_no_dropout_maxk_256.cpp
@@ -1,6 +1,6 @@
 
 /*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -8,12 +8,12 @@
  * The file is automatically generated, don't modify!
  */
 
-#include <ck/ck.hpp>
+#include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_batched_infer.h"
 
 template void run_batched_infer_causalmask_bias_dropout_dispatch<
-    ck::bhalf_t,
-    false,
+    ck_tile::bf16_t,
     true,
     false,
+    false,
     256>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_no_causalmask_has_bias_no_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bf16_has_causalmask_no_bias_no_dropout_maxk_32.cpp
similarity index 74%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_no_causalmask_has_bias_no_dropout_maxk_32.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bf16_has_causalmask_no_bias_no_dropout_maxk_32.cpp
index 59474b191..5d44df43a 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_no_causalmask_has_bias_no_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bf16_has_causalmask_no_bias_no_dropout_maxk_32.cpp
@@ -1,6 +1,6 @@
 
 /*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -8,12 +8,12 @@
  * The file is automatically generated, don't modify!
  */
 
-#include <ck/ck.hpp>
+#include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_batched_infer.h"
 
 template void run_batched_infer_causalmask_bias_dropout_dispatch<
-    ck::bhalf_t,
-    false,
+    ck_tile::bf16_t,
     true,
     false,
+    false,
     32>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_no_causalmask_has_bias_no_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bf16_has_causalmask_no_bias_no_dropout_maxk_64.cpp
similarity index 74%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_no_causalmask_has_bias_no_dropout_maxk_64.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bf16_has_causalmask_no_bias_no_dropout_maxk_64.cpp
index 802214815..aadd0fcca 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_no_causalmask_has_bias_no_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bf16_has_causalmask_no_bias_no_dropout_maxk_64.cpp
@@ -1,6 +1,6 @@
 
 /*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -8,12 +8,12 @@
  * The file is automatically generated, don't modify!
  */
 
-#include <ck/ck.hpp>
+#include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_batched_infer.h"
 
 template void run_batched_infer_causalmask_bias_dropout_dispatch<
-    ck::bhalf_t,
-    false,
+    ck_tile::bf16_t,
     true,
     false,
+    false,
     64>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_no_causalmask_has_bias_has_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bf16_no_causalmask_has_bias_has_dropout_maxk_128.cpp
similarity index 74%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_no_causalmask_has_bias_has_dropout_maxk_128.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bf16_no_causalmask_has_bias_has_dropout_maxk_128.cpp
index c101ff149..034275f69 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_no_causalmask_has_bias_has_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bf16_no_causalmask_has_bias_has_dropout_maxk_128.cpp
@@ -1,6 +1,6 @@
 
 /*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -8,11 +8,11 @@
  * The file is automatically generated, don't modify!
  */
 
-#include <ck/ck.hpp>
+#include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_batched_infer.h"
 
 template void run_batched_infer_causalmask_bias_dropout_dispatch<
-    ck::bhalf_t,
+    ck_tile::bf16_t,
     false,
     true,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_has_causalmask_has_bias_no_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bf16_no_causalmask_has_bias_has_dropout_maxk_256.cpp
similarity index 74%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_has_causalmask_has_bias_no_dropout_maxk_256.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bf16_no_causalmask_has_bias_has_dropout_maxk_256.cpp
index 990cc05ce..c922b00c0 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_has_causalmask_has_bias_no_dropout_maxk_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bf16_no_causalmask_has_bias_has_dropout_maxk_256.cpp
@@ -1,6 +1,6 @@
 
 /*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -8,12 +8,12 @@
  * The file is automatically generated, don't modify!
  */
 
-#include <ck/ck.hpp>
+#include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_batched_infer.h"
 
 template void run_batched_infer_causalmask_bias_dropout_dispatch<
-    ck::bhalf_t,
+    ck_tile::bf16_t,
+    false,
     true,
     true,
-    false,
     256>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_has_causalmask_has_bias_no_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bf16_no_causalmask_has_bias_has_dropout_maxk_32.cpp
similarity index 74%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_has_causalmask_has_bias_no_dropout_maxk_32.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bf16_no_causalmask_has_bias_has_dropout_maxk_32.cpp
index f15d45e69..8edd6fed5 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_has_causalmask_has_bias_no_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bf16_no_causalmask_has_bias_has_dropout_maxk_32.cpp
@@ -1,6 +1,6 @@
 
 /*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -8,12 +8,12 @@
  * The file is automatically generated, don't modify!
  */
 
-#include <ck/ck.hpp>
+#include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_batched_infer.h"
 
 template void run_batched_infer_causalmask_bias_dropout_dispatch<
-    ck::bhalf_t,
+    ck_tile::bf16_t,
+    false,
     true,
     true,
-    false,
     32>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_no_causalmask_has_bias_has_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bf16_no_causalmask_has_bias_has_dropout_maxk_64.cpp
similarity index 74%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_no_causalmask_has_bias_has_dropout_maxk_64.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bf16_no_causalmask_has_bias_has_dropout_maxk_64.cpp
index c7263bc26..e2d8ba101 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_no_causalmask_has_bias_has_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bf16_no_causalmask_has_bias_has_dropout_maxk_64.cpp
@@ -1,6 +1,6 @@
 
 /*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -8,11 +8,11 @@
  * The file is automatically generated, don't modify!
  */
 
-#include <ck/ck.hpp>
+#include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_batched_infer.h"
 
 template void run_batched_infer_causalmask_bias_dropout_dispatch<
-    ck::bhalf_t,
+    ck_tile::bf16_t,
     false,
     true,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_no_causalmask_no_bias_has_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bf16_no_causalmask_has_bias_no_dropout_maxk_128.cpp
similarity index 74%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_no_causalmask_no_bias_has_dropout_maxk_128.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bf16_no_causalmask_has_bias_no_dropout_maxk_128.cpp
index 9bc056102..9e9adf31d 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_no_causalmask_no_bias_has_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bf16_no_causalmask_has_bias_no_dropout_maxk_128.cpp
@@ -1,6 +1,6 @@
 
 /*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -8,12 +8,12 @@
  * The file is automatically generated, don't modify!
  */
 
-#include <ck/ck.hpp>
+#include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_batched_infer.h"
 
 template void run_batched_infer_causalmask_bias_dropout_dispatch<
-    ck::bhalf_t,
-    false,
+    ck_tile::bf16_t,
     false,
     true,
+    false,
     128>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_no_causalmask_no_bias_has_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bf16_no_causalmask_has_bias_no_dropout_maxk_256.cpp
similarity index 74%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_no_causalmask_no_bias_has_dropout_maxk_256.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bf16_no_causalmask_has_bias_no_dropout_maxk_256.cpp
index 001805e8a..306829eaf 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_no_causalmask_no_bias_has_dropout_maxk_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bf16_no_causalmask_has_bias_no_dropout_maxk_256.cpp
@@ -1,6 +1,6 @@
 
 /*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -8,12 +8,12 @@
  * The file is automatically generated, don't modify!
  */
 
-#include <ck/ck.hpp>
+#include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_batched_infer.h"
 
 template void run_batched_infer_causalmask_bias_dropout_dispatch<
-    ck::bhalf_t,
-    false,
+    ck_tile::bf16_t,
     false,
     true,
+    false,
     256>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_no_causalmask_no_bias_has_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bf16_no_causalmask_has_bias_no_dropout_maxk_32.cpp
similarity index 74%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_no_causalmask_no_bias_has_dropout_maxk_32.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bf16_no_causalmask_has_bias_no_dropout_maxk_32.cpp
index 3384be9d3..8bfc62104 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_no_causalmask_no_bias_has_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bf16_no_causalmask_has_bias_no_dropout_maxk_32.cpp
@@ -1,6 +1,6 @@
 
 /*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -8,12 +8,12 @@
  * The file is automatically generated, don't modify!
  */
 
-#include <ck/ck.hpp>
+#include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_batched_infer.h"
 
 template void run_batched_infer_causalmask_bias_dropout_dispatch<
-    ck::bhalf_t,
-    false,
+    ck_tile::bf16_t,
     false,
     true,
+    false,
     32>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_no_causalmask_no_bias_has_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bf16_no_causalmask_has_bias_no_dropout_maxk_64.cpp
similarity index 74%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_no_causalmask_no_bias_has_dropout_maxk_64.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bf16_no_causalmask_has_bias_no_dropout_maxk_64.cpp
index be5ece1fd..fe81acab4 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_no_causalmask_no_bias_has_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bf16_no_causalmask_has_bias_no_dropout_maxk_64.cpp
@@ -1,6 +1,6 @@
 
 /*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -8,12 +8,12 @@
  * The file is automatically generated, don't modify!
  */
 
-#include <ck/ck.hpp>
+#include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_batched_infer.h"
 
 template void run_batched_infer_causalmask_bias_dropout_dispatch<
-    ck::bhalf_t,
-    false,
+    ck_tile::bf16_t,
     false,
     true,
+    false,
     64>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_has_causalmask_no_bias_no_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bf16_no_causalmask_no_bias_has_dropout_maxk_128.cpp
similarity index 74%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_has_causalmask_no_bias_no_dropout_maxk_128.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bf16_no_causalmask_no_bias_has_dropout_maxk_128.cpp
index a73c01e2e..bcf5b783f 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_has_causalmask_no_bias_no_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bf16_no_causalmask_no_bias_has_dropout_maxk_128.cpp
@@ -1,6 +1,6 @@
 
 /*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -8,12 +8,12 @@
  * The file is automatically generated, don't modify!
  */
 
-#include <ck/ck.hpp>
+#include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_batched_infer.h"
 
 template void run_batched_infer_causalmask_bias_dropout_dispatch<
-    ck::bhalf_t,
-    true,
+    ck_tile::bf16_t,
     false,
     false,
+    true,
     128>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_has_causalmask_no_bias_no_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bf16_no_causalmask_no_bias_has_dropout_maxk_256.cpp
similarity index 74%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_has_causalmask_no_bias_no_dropout_maxk_256.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bf16_no_causalmask_no_bias_has_dropout_maxk_256.cpp
index e7234ebc2..ba5a41450 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_has_causalmask_no_bias_no_dropout_maxk_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bf16_no_causalmask_no_bias_has_dropout_maxk_256.cpp
@@ -1,6 +1,6 @@
 
 /*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -8,12 +8,12 @@
  * The file is automatically generated, don't modify!
  */
 
-#include <ck/ck.hpp>
+#include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_batched_infer.h"
 
 template void run_batched_infer_causalmask_bias_dropout_dispatch<
-    ck::bhalf_t,
-    true,
+    ck_tile::bf16_t,
     false,
     false,
+    true,
     256>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_has_causalmask_no_bias_no_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bf16_no_causalmask_no_bias_has_dropout_maxk_32.cpp
similarity index 74%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_has_causalmask_no_bias_no_dropout_maxk_32.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bf16_no_causalmask_no_bias_has_dropout_maxk_32.cpp
index 64dbc7049..9cac1c3af 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_has_causalmask_no_bias_no_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bf16_no_causalmask_no_bias_has_dropout_maxk_32.cpp
@@ -1,6 +1,6 @@
 
 /*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -8,12 +8,12 @@
  * The file is automatically generated, don't modify!
  */
 
-#include <ck/ck.hpp>
+#include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_batched_infer.h"
 
 template void run_batched_infer_causalmask_bias_dropout_dispatch<
-    ck::bhalf_t,
-    true,
+    ck_tile::bf16_t,
     false,
     false,
+    true,
     32>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_has_causalmask_no_bias_no_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bf16_no_causalmask_no_bias_has_dropout_maxk_64.cpp
similarity index 74%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_has_causalmask_no_bias_no_dropout_maxk_64.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bf16_no_causalmask_no_bias_has_dropout_maxk_64.cpp
index 5a609eaf0..e31ed4362 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_has_causalmask_no_bias_no_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bf16_no_causalmask_no_bias_has_dropout_maxk_64.cpp
@@ -1,6 +1,6 @@
 
 /*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -8,12 +8,12 @@
  * The file is automatically generated, don't modify!
  */
 
-#include <ck/ck.hpp>
+#include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_batched_infer.h"
 
 template void run_batched_infer_causalmask_bias_dropout_dispatch<
-    ck::bhalf_t,
-    true,
+    ck_tile::bf16_t,
     false,
     false,
+    true,
     64>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_no_causalmask_no_bias_no_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bf16_no_causalmask_no_bias_no_dropout_maxk_128.cpp
similarity index 74%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_no_causalmask_no_bias_no_dropout_maxk_128.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bf16_no_causalmask_no_bias_no_dropout_maxk_128.cpp
index ccf7cb80b..9f52f52be 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_no_causalmask_no_bias_no_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bf16_no_causalmask_no_bias_no_dropout_maxk_128.cpp
@@ -1,6 +1,6 @@
 
 /*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -8,11 +8,11 @@
  * The file is automatically generated, don't modify!
  */
 
-#include <ck/ck.hpp>
+#include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_batched_infer.h"
 
 template void run_batched_infer_causalmask_bias_dropout_dispatch<
-    ck::bhalf_t,
+    ck_tile::bf16_t,
     false,
     false,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_no_causalmask_no_bias_no_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bf16_no_causalmask_no_bias_no_dropout_maxk_256.cpp
similarity index 74%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_no_causalmask_no_bias_no_dropout_maxk_256.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bf16_no_causalmask_no_bias_no_dropout_maxk_256.cpp
index 4d13af6bc..9ba93c82c 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_no_causalmask_no_bias_no_dropout_maxk_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bf16_no_causalmask_no_bias_no_dropout_maxk_256.cpp
@@ -1,6 +1,6 @@
 
 /*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -8,11 +8,11 @@
  * The file is automatically generated, don't modify!
  */
 
-#include <ck/ck.hpp>
+#include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_batched_infer.h"
 
 template void run_batched_infer_causalmask_bias_dropout_dispatch<
-    ck::bhalf_t,
+    ck_tile::bf16_t,
     false,
     false,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_no_causalmask_no_bias_no_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bf16_no_causalmask_no_bias_no_dropout_maxk_32.cpp
similarity index 74%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_no_causalmask_no_bias_no_dropout_maxk_32.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bf16_no_causalmask_no_bias_no_dropout_maxk_32.cpp
index 2b8202b53..fec45193d 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_no_causalmask_no_bias_no_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bf16_no_causalmask_no_bias_no_dropout_maxk_32.cpp
@@ -1,6 +1,6 @@
 
 /*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -8,11 +8,11 @@
  * The file is automatically generated, don't modify!
  */
 
-#include <ck/ck.hpp>
+#include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_batched_infer.h"
 
 template void run_batched_infer_causalmask_bias_dropout_dispatch<
-    ck::bhalf_t,
+    ck_tile::bf16_t,
     false,
     false,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_no_causalmask_no_bias_no_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bf16_no_causalmask_no_bias_no_dropout_maxk_64.cpp
similarity index 74%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_no_causalmask_no_bias_no_dropout_maxk_64.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bf16_no_causalmask_no_bias_no_dropout_maxk_64.cpp
index 38fe474db..571f8ad48 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bp16_no_causalmask_no_bias_no_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bf16_no_causalmask_no_bias_no_dropout_maxk_64.cpp
@@ -1,6 +1,6 @@
 
 /*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -8,11 +8,11 @@
  * The file is automatically generated, don't modify!
  */
 
-#include <ck/ck.hpp>
+#include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_batched_infer.h"
 
 template void run_batched_infer_causalmask_bias_dropout_dispatch<
-    ck::bhalf_t,
+    ck_tile::bf16_t,
     false,
     false,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_has_causalmask_has_bias_has_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_has_causalmask_has_bias_has_dropout_maxk_128.cpp
index 3a03e2ed1..76447cfef 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_has_causalmask_has_bias_has_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_has_causalmask_has_bias_has_dropout_maxk_128.cpp
@@ -1,6 +1,6 @@
 
 /*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -8,11 +8,11 @@
  * The file is automatically generated, don't modify!
  */
 
-#include <ck/ck.hpp>
+#include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_batched_infer.h"
 
 template void run_batched_infer_causalmask_bias_dropout_dispatch<
-    ck::half_t,
+    ck_tile::fp16_t,
     true,
     true,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_has_causalmask_has_bias_has_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_has_causalmask_has_bias_has_dropout_maxk_256.cpp
index 74cf62de8..94e2e0dfc 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_has_causalmask_has_bias_has_dropout_maxk_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_has_causalmask_has_bias_has_dropout_maxk_256.cpp
@@ -1,6 +1,6 @@
 
 /*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -8,11 +8,11 @@
  * The file is automatically generated, don't modify!
  */
 
-#include <ck/ck.hpp>
+#include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_batched_infer.h"
 
 template void run_batched_infer_causalmask_bias_dropout_dispatch<
-    ck::half_t,
+    ck_tile::fp16_t,
     true,
     true,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_has_causalmask_has_bias_has_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_has_causalmask_has_bias_has_dropout_maxk_32.cpp
index 3d17dc729..432d955b7 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_has_causalmask_has_bias_has_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_has_causalmask_has_bias_has_dropout_maxk_32.cpp
@@ -1,6 +1,6 @@
 
 /*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -8,11 +8,11 @@
  * The file is automatically generated, don't modify!
  */
 
-#include <ck/ck.hpp>
+#include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_batched_infer.h"
 
 template void run_batched_infer_causalmask_bias_dropout_dispatch<
-    ck::half_t,
+    ck_tile::fp16_t,
     true,
     true,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_has_causalmask_has_bias_has_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_has_causalmask_has_bias_has_dropout_maxk_64.cpp
index 49ef6a3ed..173d18aaf 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_has_causalmask_has_bias_has_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_has_causalmask_has_bias_has_dropout_maxk_64.cpp
@@ -1,6 +1,6 @@
 
 /*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -8,11 +8,11 @@
  * The file is automatically generated, don't modify!
  */
 
-#include <ck/ck.hpp>
+#include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_batched_infer.h"
 
 template void run_batched_infer_causalmask_bias_dropout_dispatch<
-    ck::half_t,
+    ck_tile::fp16_t,
     true,
     true,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_has_causalmask_has_bias_no_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_has_causalmask_has_bias_no_dropout_maxk_128.cpp
index 6e9e3b2ab..7661a50d3 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_has_causalmask_has_bias_no_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_has_causalmask_has_bias_no_dropout_maxk_128.cpp
@@ -1,6 +1,6 @@
 
 /*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -8,11 +8,11 @@
  * The file is automatically generated, don't modify!
  */
 
-#include <ck/ck.hpp>
+#include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_batched_infer.h"
 
 template void run_batched_infer_causalmask_bias_dropout_dispatch<
-    ck::half_t,
+    ck_tile::fp16_t,
     true,
     true,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_has_causalmask_has_bias_no_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_has_causalmask_has_bias_no_dropout_maxk_256.cpp
index 1980128a2..b3e43957f 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_has_causalmask_has_bias_no_dropout_maxk_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_has_causalmask_has_bias_no_dropout_maxk_256.cpp
@@ -1,6 +1,6 @@
 
 /*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -8,11 +8,11 @@
  * The file is automatically generated, don't modify!
  */
 
-#include <ck/ck.hpp>
+#include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_batched_infer.h"
 
 template void run_batched_infer_causalmask_bias_dropout_dispatch<
-    ck::half_t,
+    ck_tile::fp16_t,
     true,
     true,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_has_causalmask_has_bias_no_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_has_causalmask_has_bias_no_dropout_maxk_32.cpp
index cefda7208..f54aa9ef4 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_has_causalmask_has_bias_no_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_has_causalmask_has_bias_no_dropout_maxk_32.cpp
@@ -1,6 +1,6 @@
 
 /*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -8,11 +8,11 @@
  * The file is automatically generated, don't modify!
  */
 
-#include <ck/ck.hpp>
+#include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_batched_infer.h"
 
 template void run_batched_infer_causalmask_bias_dropout_dispatch<
-    ck::half_t,
+    ck_tile::fp16_t,
     true,
     true,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_has_causalmask_has_bias_no_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_has_causalmask_has_bias_no_dropout_maxk_64.cpp
index 718293285..17f4018c3 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_has_causalmask_has_bias_no_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_has_causalmask_has_bias_no_dropout_maxk_64.cpp
@@ -1,6 +1,6 @@
 
 /*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -8,11 +8,11 @@
  * The file is automatically generated, don't modify!
  */
 
-#include <ck/ck.hpp>
+#include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_batched_infer.h"
 
 template void run_batched_infer_causalmask_bias_dropout_dispatch<
-    ck::half_t,
+    ck_tile::fp16_t,
     true,
     true,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_has_causalmask_no_bias_has_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_has_causalmask_no_bias_has_dropout_maxk_128.cpp
index f45e10da9..d5ea02d7c 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_has_causalmask_no_bias_has_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_has_causalmask_no_bias_has_dropout_maxk_128.cpp
@@ -1,6 +1,6 @@
 
 /*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -8,11 +8,11 @@
  * The file is automatically generated, don't modify!
  */
 
-#include <ck/ck.hpp>
+#include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_batched_infer.h"
 
 template void run_batched_infer_causalmask_bias_dropout_dispatch<
-    ck::half_t,
+    ck_tile::fp16_t,
     true,
     false,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_has_causalmask_no_bias_has_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_has_causalmask_no_bias_has_dropout_maxk_256.cpp
index 8c8d08f52..2e4a6769e 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_has_causalmask_no_bias_has_dropout_maxk_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_has_causalmask_no_bias_has_dropout_maxk_256.cpp
@@ -1,6 +1,6 @@
 
 /*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -8,11 +8,11 @@
  * The file is automatically generated, don't modify!
  */
 
-#include <ck/ck.hpp>
+#include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_batched_infer.h"
 
 template void run_batched_infer_causalmask_bias_dropout_dispatch<
-    ck::half_t,
+    ck_tile::fp16_t,
     true,
     false,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_has_causalmask_no_bias_has_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_has_causalmask_no_bias_has_dropout_maxk_32.cpp
index 59ac4bc28..6caae1a75 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_has_causalmask_no_bias_has_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_has_causalmask_no_bias_has_dropout_maxk_32.cpp
@@ -1,6 +1,6 @@
 
 /*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -8,11 +8,11 @@
  * The file is automatically generated, don't modify!
  */
 
-#include <ck/ck.hpp>
+#include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_batched_infer.h"
 
 template void run_batched_infer_causalmask_bias_dropout_dispatch<
-    ck::half_t,
+    ck_tile::fp16_t,
     true,
     false,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_has_causalmask_no_bias_has_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_has_causalmask_no_bias_has_dropout_maxk_64.cpp
index edff64b7b..c01f1105b 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_has_causalmask_no_bias_has_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_has_causalmask_no_bias_has_dropout_maxk_64.cpp
@@ -1,6 +1,6 @@
 
 /*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -8,11 +8,11 @@
  * The file is automatically generated, don't modify!
  */
 
-#include <ck/ck.hpp>
+#include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_batched_infer.h"
 
 template void run_batched_infer_causalmask_bias_dropout_dispatch<
-    ck::half_t,
+    ck_tile::fp16_t,
     true,
     false,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_has_causalmask_no_bias_no_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_has_causalmask_no_bias_no_dropout_maxk_128.cpp
index b27270cc4..4e146ec41 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_has_causalmask_no_bias_no_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_has_causalmask_no_bias_no_dropout_maxk_128.cpp
@@ -1,6 +1,6 @@
 
 /*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -8,11 +8,11 @@
  * The file is automatically generated, don't modify!
  */
 
-#include <ck/ck.hpp>
+#include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_batched_infer.h"
 
 template void run_batched_infer_causalmask_bias_dropout_dispatch<
-    ck::half_t,
+    ck_tile::fp16_t,
     true,
     false,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_has_causalmask_no_bias_no_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_has_causalmask_no_bias_no_dropout_maxk_256.cpp
index 34a7b746f..e5bc54c2c 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_has_causalmask_no_bias_no_dropout_maxk_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_has_causalmask_no_bias_no_dropout_maxk_256.cpp
@@ -1,6 +1,6 @@
 
 /*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -8,11 +8,11 @@
  * The file is automatically generated, don't modify!
  */
 
-#include <ck/ck.hpp>
+#include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_batched_infer.h"
 
 template void run_batched_infer_causalmask_bias_dropout_dispatch<
-    ck::half_t,
+    ck_tile::fp16_t,
     true,
     false,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_has_causalmask_no_bias_no_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_has_causalmask_no_bias_no_dropout_maxk_32.cpp
index c8d2c42e1..ac3f5d082 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_has_causalmask_no_bias_no_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_has_causalmask_no_bias_no_dropout_maxk_32.cpp
@@ -1,6 +1,6 @@
 
 /*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -8,11 +8,11 @@
  * The file is automatically generated, don't modify!
  */
 
-#include <ck/ck.hpp>
+#include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_batched_infer.h"
 
 template void run_batched_infer_causalmask_bias_dropout_dispatch<
-    ck::half_t,
+    ck_tile::fp16_t,
     true,
     false,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_has_causalmask_no_bias_no_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_has_causalmask_no_bias_no_dropout_maxk_64.cpp
index 747ad6cf2..3f39b0323 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_has_causalmask_no_bias_no_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_has_causalmask_no_bias_no_dropout_maxk_64.cpp
@@ -1,6 +1,6 @@
 
 /*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -8,11 +8,11 @@
  * The file is automatically generated, don't modify!
  */
 
-#include <ck/ck.hpp>
+#include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_batched_infer.h"
 
 template void run_batched_infer_causalmask_bias_dropout_dispatch<
-    ck::half_t,
+    ck_tile::fp16_t,
     true,
     false,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_no_causalmask_has_bias_has_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_no_causalmask_has_bias_has_dropout_maxk_128.cpp
index 83cdbd0e3..7440bc503 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_no_causalmask_has_bias_has_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_no_causalmask_has_bias_has_dropout_maxk_128.cpp
@@ -1,6 +1,6 @@
 
 /*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -8,11 +8,11 @@
  * The file is automatically generated, don't modify!
  */
 
-#include <ck/ck.hpp>
+#include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_batched_infer.h"
 
 template void run_batched_infer_causalmask_bias_dropout_dispatch<
-    ck::half_t,
+    ck_tile::fp16_t,
     false,
     true,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_no_causalmask_has_bias_has_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_no_causalmask_has_bias_has_dropout_maxk_256.cpp
index e72ef8963..efaf98472 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_no_causalmask_has_bias_has_dropout_maxk_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_no_causalmask_has_bias_has_dropout_maxk_256.cpp
@@ -1,6 +1,6 @@
 
 /*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -8,11 +8,11 @@
  * The file is automatically generated, don't modify!
  */
 
-#include <ck/ck.hpp>
+#include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_batched_infer.h"
 
 template void run_batched_infer_causalmask_bias_dropout_dispatch<
-    ck::half_t,
+    ck_tile::fp16_t,
     false,
     true,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_no_causalmask_has_bias_has_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_no_causalmask_has_bias_has_dropout_maxk_32.cpp
index 1269c0e74..0820075e5 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_no_causalmask_has_bias_has_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_no_causalmask_has_bias_has_dropout_maxk_32.cpp
@@ -1,6 +1,6 @@
 
 /*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -8,11 +8,11 @@
  * The file is automatically generated, don't modify!
  */
 
-#include <ck/ck.hpp>
+#include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_batched_infer.h"
 
 template void run_batched_infer_causalmask_bias_dropout_dispatch<
-    ck::half_t,
+    ck_tile::fp16_t,
     false,
     true,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_no_causalmask_has_bias_has_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_no_causalmask_has_bias_has_dropout_maxk_64.cpp
index 55a152e43..89dace195 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_no_causalmask_has_bias_has_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_no_causalmask_has_bias_has_dropout_maxk_64.cpp
@@ -1,6 +1,6 @@
 
 /*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -8,11 +8,11 @@
  * The file is automatically generated, don't modify!
  */
 
-#include <ck/ck.hpp>
+#include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_batched_infer.h"
 
 template void run_batched_infer_causalmask_bias_dropout_dispatch<
-    ck::half_t,
+    ck_tile::fp16_t,
     false,
     true,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_no_causalmask_has_bias_no_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_no_causalmask_has_bias_no_dropout_maxk_128.cpp
index a348774eb..95f57c099 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_no_causalmask_has_bias_no_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_no_causalmask_has_bias_no_dropout_maxk_128.cpp
@@ -1,6 +1,6 @@
 
 /*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -8,11 +8,11 @@
  * The file is automatically generated, don't modify!
  */
 
-#include <ck/ck.hpp>
+#include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_batched_infer.h"
 
 template void run_batched_infer_causalmask_bias_dropout_dispatch<
-    ck::half_t,
+    ck_tile::fp16_t,
     false,
     true,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_no_causalmask_has_bias_no_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_no_causalmask_has_bias_no_dropout_maxk_256.cpp
index 95a57bb7d..c8ac55329 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_no_causalmask_has_bias_no_dropout_maxk_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_no_causalmask_has_bias_no_dropout_maxk_256.cpp
@@ -1,6 +1,6 @@
 
 /*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -8,11 +8,11 @@
  * The file is automatically generated, don't modify!
  */
 
-#include <ck/ck.hpp>
+#include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_batched_infer.h"
 
 template void run_batched_infer_causalmask_bias_dropout_dispatch<
-    ck::half_t,
+    ck_tile::fp16_t,
     false,
     true,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_no_causalmask_has_bias_no_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_no_causalmask_has_bias_no_dropout_maxk_32.cpp
index 5573f81b1..10a261f3d 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_no_causalmask_has_bias_no_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_no_causalmask_has_bias_no_dropout_maxk_32.cpp
@@ -1,6 +1,6 @@
 
 /*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -8,11 +8,11 @@
  * The file is automatically generated, don't modify!
  */
 
-#include <ck/ck.hpp>
+#include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_batched_infer.h"
 
 template void run_batched_infer_causalmask_bias_dropout_dispatch<
-    ck::half_t,
+    ck_tile::fp16_t,
     false,
     true,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_no_causalmask_has_bias_no_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_no_causalmask_has_bias_no_dropout_maxk_64.cpp
index c8eaea6a6..721145717 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_no_causalmask_has_bias_no_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_no_causalmask_has_bias_no_dropout_maxk_64.cpp
@@ -1,6 +1,6 @@
 
 /*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -8,11 +8,11 @@
  * The file is automatically generated, don't modify!
  */
 
-#include <ck/ck.hpp>
+#include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_batched_infer.h"
 
 template void run_batched_infer_causalmask_bias_dropout_dispatch<
-    ck::half_t,
+    ck_tile::fp16_t,
     false,
     true,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_no_causalmask_no_bias_has_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_no_causalmask_no_bias_has_dropout_maxk_128.cpp
index 347120778..be3100082 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_no_causalmask_no_bias_has_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_no_causalmask_no_bias_has_dropout_maxk_128.cpp
@@ -1,6 +1,6 @@
 
 /*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -8,11 +8,11 @@
  * The file is automatically generated, don't modify!
  */
 
-#include <ck/ck.hpp>
+#include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_batched_infer.h"
 
 template void run_batched_infer_causalmask_bias_dropout_dispatch<
-    ck::half_t,
+    ck_tile::fp16_t,
     false,
     false,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_no_causalmask_no_bias_has_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_no_causalmask_no_bias_has_dropout_maxk_256.cpp
index b3542bbf9..7c70e53b9 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_no_causalmask_no_bias_has_dropout_maxk_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_no_causalmask_no_bias_has_dropout_maxk_256.cpp
@@ -1,6 +1,6 @@
 
 /*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -8,11 +8,11 @@
  * The file is automatically generated, don't modify!
  */
 
-#include <ck/ck.hpp>
+#include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_batched_infer.h"
 
 template void run_batched_infer_causalmask_bias_dropout_dispatch<
-    ck::half_t,
+    ck_tile::fp16_t,
     false,
     false,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_no_causalmask_no_bias_has_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_no_causalmask_no_bias_has_dropout_maxk_32.cpp
index 829f61029..75f733259 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_no_causalmask_no_bias_has_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_no_causalmask_no_bias_has_dropout_maxk_32.cpp
@@ -1,6 +1,6 @@
 
 /*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -8,11 +8,11 @@
  * The file is automatically generated, don't modify!
  */
 
-#include <ck/ck.hpp>
+#include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_batched_infer.h"
 
 template void run_batched_infer_causalmask_bias_dropout_dispatch<
-    ck::half_t,
+    ck_tile::fp16_t,
     false,
     false,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_no_causalmask_no_bias_has_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_no_causalmask_no_bias_has_dropout_maxk_64.cpp
index a5c71f3a2..50507e69c 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_no_causalmask_no_bias_has_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_no_causalmask_no_bias_has_dropout_maxk_64.cpp
@@ -1,6 +1,6 @@
 
 /*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -8,11 +8,11 @@
  * The file is automatically generated, don't modify!
  */
 
-#include <ck/ck.hpp>
+#include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_batched_infer.h"
 
 template void run_batched_infer_causalmask_bias_dropout_dispatch<
-    ck::half_t,
+    ck_tile::fp16_t,
     false,
     false,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_no_causalmask_no_bias_no_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_no_causalmask_no_bias_no_dropout_maxk_128.cpp
index 51dd2f78f..931040548 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_no_causalmask_no_bias_no_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_no_causalmask_no_bias_no_dropout_maxk_128.cpp
@@ -1,6 +1,6 @@
 
 /*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -8,11 +8,11 @@
  * The file is automatically generated, don't modify!
  */
 
-#include <ck/ck.hpp>
+#include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_batched_infer.h"
 
 template void run_batched_infer_causalmask_bias_dropout_dispatch<
-    ck::half_t,
+    ck_tile::fp16_t,
     false,
     false,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_no_causalmask_no_bias_no_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_no_causalmask_no_bias_no_dropout_maxk_256.cpp
index 51c34e651..a1a08d4d5 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_no_causalmask_no_bias_no_dropout_maxk_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_no_causalmask_no_bias_no_dropout_maxk_256.cpp
@@ -1,6 +1,6 @@
 
 /*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -8,11 +8,11 @@
  * The file is automatically generated, don't modify!
  */
 
-#include <ck/ck.hpp>
+#include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_batched_infer.h"
 
 template void run_batched_infer_causalmask_bias_dropout_dispatch<
-    ck::half_t,
+    ck_tile::fp16_t,
     false,
     false,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_no_causalmask_no_bias_no_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_no_causalmask_no_bias_no_dropout_maxk_32.cpp
index 700f9acfd..200706066 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_no_causalmask_no_bias_no_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_no_causalmask_no_bias_no_dropout_maxk_32.cpp
@@ -1,6 +1,6 @@
 
 /*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -8,11 +8,11 @@
  * The file is automatically generated, don't modify!
  */
 
-#include <ck/ck.hpp>
+#include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_batched_infer.h"
 
 template void run_batched_infer_causalmask_bias_dropout_dispatch<
-    ck::half_t,
+    ck_tile::fp16_t,
     false,
     false,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_no_causalmask_no_bias_no_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_no_causalmask_no_bias_no_dropout_maxk_64.cpp
index 4d43ed9b5..9db040363 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_no_causalmask_no_bias_no_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_no_causalmask_no_bias_no_dropout_maxk_64.cpp
@@ -1,6 +1,6 @@
 
 /*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -8,11 +8,11 @@
  * The file is automatically generated, don't modify!
  */
 
-#include <ck/ck.hpp>
+#include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_batched_infer.h"
 
 template void run_batched_infer_causalmask_bias_dropout_dispatch<
-    ck::half_t,
+    ck_tile::fp16_t,
     false,
     false,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_has_causalmask_has_bias_has_biasgrad_has_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_has_causalmask_has_bias_has_biasgrad_has_dropout_maxk_128.cpp
similarity index 75%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_has_causalmask_has_bias_has_biasgrad_has_dropout_maxk_128.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_has_causalmask_has_bias_has_biasgrad_has_dropout_maxk_128.cpp
index f6d0af717..72fec2837 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_has_causalmask_has_bias_has_biasgrad_has_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_has_causalmask_has_bias_has_biasgrad_has_dropout_maxk_128.cpp
@@ -1,6 +1,6 @@
 
 /*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -8,11 +8,11 @@
  * The file is automatically generated, don't modify!
  */
 
-#include <ck/ck.hpp>
+#include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_grouped_backward.h"
 
 template void run_grouped_backward_causalmask_bias_dropout_dispatch<
-    ck::bhalf_t,
+    ck_tile::bf16_t,
     true,
     true,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_has_causalmask_has_bias_has_biasgrad_has_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_has_causalmask_has_bias_has_biasgrad_has_dropout_maxk_32.cpp
similarity index 75%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_has_causalmask_has_bias_has_biasgrad_has_dropout_maxk_32.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_has_causalmask_has_bias_has_biasgrad_has_dropout_maxk_32.cpp
index a73f1e9e9..5b3551d3b 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_has_causalmask_has_bias_has_biasgrad_has_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_has_causalmask_has_bias_has_biasgrad_has_dropout_maxk_32.cpp
@@ -1,6 +1,6 @@
 
 /*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -8,11 +8,11 @@
  * The file is automatically generated, don't modify!
  */
 
-#include <ck/ck.hpp>
+#include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_grouped_backward.h"
 
 template void run_grouped_backward_causalmask_bias_dropout_dispatch<
-    ck::bhalf_t,
+    ck_tile::bf16_t,
     true,
     true,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_has_causalmask_has_bias_has_biasgrad_has_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_has_causalmask_has_bias_has_biasgrad_has_dropout_maxk_64.cpp
similarity index 75%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_has_causalmask_has_bias_has_biasgrad_has_dropout_maxk_64.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_has_causalmask_has_bias_has_biasgrad_has_dropout_maxk_64.cpp
index 2e186f3ba..c9ca1a559 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_has_causalmask_has_bias_has_biasgrad_has_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_has_causalmask_has_bias_has_biasgrad_has_dropout_maxk_64.cpp
@@ -1,6 +1,6 @@
 
 /*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -8,11 +8,11 @@
  * The file is automatically generated, don't modify!
  */
 
-#include <ck/ck.hpp>
+#include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_grouped_backward.h"
 
 template void run_grouped_backward_causalmask_bias_dropout_dispatch<
-    ck::bhalf_t,
+    ck_tile::bf16_t,
     true,
     true,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_no_causalmask_has_bias_has_biasgrad_has_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_has_causalmask_has_bias_has_biasgrad_no_dropout_maxk_128.cpp
similarity index 75%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_no_causalmask_has_bias_has_biasgrad_has_dropout_maxk_128.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_has_causalmask_has_bias_has_biasgrad_no_dropout_maxk_128.cpp
index 307acb781..09daabcfa 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_no_causalmask_has_bias_has_biasgrad_has_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_has_causalmask_has_bias_has_biasgrad_no_dropout_maxk_128.cpp
@@ -1,6 +1,6 @@
 
 /*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -8,13 +8,13 @@
  * The file is automatically generated, don't modify!
  */
 
-#include <ck/ck.hpp>
+#include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_grouped_backward.h"
 
 template void run_grouped_backward_causalmask_bias_dropout_dispatch<
-    ck::bhalf_t,
-    false,
+    ck_tile::bf16_t,
     true,
     true,
     true,
+    false,
     128>(GroupedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_has_causalmask_has_bias_has_biasgrad_no_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_has_causalmask_has_bias_has_biasgrad_no_dropout_maxk_32.cpp
similarity index 75%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_has_causalmask_has_bias_has_biasgrad_no_dropout_maxk_32.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_has_causalmask_has_bias_has_biasgrad_no_dropout_maxk_32.cpp
index 9e278d05d..0bc605677 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_has_causalmask_has_bias_has_biasgrad_no_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_has_causalmask_has_bias_has_biasgrad_no_dropout_maxk_32.cpp
@@ -1,6 +1,6 @@
 
 /*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -8,11 +8,11 @@
  * The file is automatically generated, don't modify!
  */
 
-#include <ck/ck.hpp>
+#include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_grouped_backward.h"
 
 template void run_grouped_backward_causalmask_bias_dropout_dispatch<
-    ck::bhalf_t,
+    ck_tile::bf16_t,
     true,
     true,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_has_causalmask_has_bias_no_biasgrad_has_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_has_causalmask_has_bias_has_biasgrad_no_dropout_maxk_64.cpp
similarity index 75%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_has_causalmask_has_bias_no_biasgrad_has_dropout_maxk_64.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_has_causalmask_has_bias_has_biasgrad_no_dropout_maxk_64.cpp
index 95cd67300..489610171 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_has_causalmask_has_bias_no_biasgrad_has_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_has_causalmask_has_bias_has_biasgrad_no_dropout_maxk_64.cpp
@@ -1,6 +1,6 @@
 
 /*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -8,13 +8,13 @@
  * The file is automatically generated, don't modify!
  */
 
-#include <ck/ck.hpp>
+#include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_grouped_backward.h"
 
 template void run_grouped_backward_causalmask_bias_dropout_dispatch<
-    ck::bhalf_t,
+    ck_tile::bf16_t,
     true,
     true,
-    false,
     true,
+    false,
     64>(GroupedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_has_causalmask_has_bias_no_biasgrad_has_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_has_causalmask_has_bias_no_biasgrad_has_dropout_maxk_128.cpp
similarity index 75%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_has_causalmask_has_bias_no_biasgrad_has_dropout_maxk_128.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_has_causalmask_has_bias_no_biasgrad_has_dropout_maxk_128.cpp
index 120ced112..3e9ba0cba 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_has_causalmask_has_bias_no_biasgrad_has_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_has_causalmask_has_bias_no_biasgrad_has_dropout_maxk_128.cpp
@@ -1,6 +1,6 @@
 
 /*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -8,11 +8,11 @@
  * The file is automatically generated, don't modify!
  */
 
-#include <ck/ck.hpp>
+#include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_grouped_backward.h"
 
 template void run_grouped_backward_causalmask_bias_dropout_dispatch<
-    ck::bhalf_t,
+    ck_tile::bf16_t,
     true,
     true,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_no_causalmask_has_bias_has_biasgrad_has_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_has_causalmask_has_bias_no_biasgrad_has_dropout_maxk_32.cpp
similarity index 75%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_no_causalmask_has_bias_has_biasgrad_has_dropout_maxk_32.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_has_causalmask_has_bias_no_biasgrad_has_dropout_maxk_32.cpp
index 875c36554..3e13c1b17 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_no_causalmask_has_bias_has_biasgrad_has_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_has_causalmask_has_bias_no_biasgrad_has_dropout_maxk_32.cpp
@@ -1,6 +1,6 @@
 
 /*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -8,13 +8,13 @@
  * The file is automatically generated, don't modify!
  */
 
-#include <ck/ck.hpp>
+#include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_grouped_backward.h"
 
 template void run_grouped_backward_causalmask_bias_dropout_dispatch<
-    ck::bhalf_t,
-    false,
+    ck_tile::bf16_t,
     true,
     true,
+    false,
     true,
     32>(GroupedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_has_causalmask_has_bias_has_biasgrad_no_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_has_causalmask_has_bias_no_biasgrad_has_dropout_maxk_64.cpp
similarity index 75%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_has_causalmask_has_bias_has_biasgrad_no_dropout_maxk_64.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_has_causalmask_has_bias_no_biasgrad_has_dropout_maxk_64.cpp
index 452f5ac0c..b5023fdc8 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_has_causalmask_has_bias_has_biasgrad_no_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_has_causalmask_has_bias_no_biasgrad_has_dropout_maxk_64.cpp
@@ -1,6 +1,6 @@
 
 /*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -8,13 +8,13 @@
  * The file is automatically generated, don't modify!
  */
 
-#include <ck/ck.hpp>
+#include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_grouped_backward.h"
 
 template void run_grouped_backward_causalmask_bias_dropout_dispatch<
-    ck::bhalf_t,
-    true,
+    ck_tile::bf16_t,
     true,
     true,
     false,
+    true,
     64>(GroupedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_has_causalmask_no_bias_no_biasgrad_has_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_has_causalmask_has_bias_no_biasgrad_no_dropout_maxk_128.cpp
similarity index 75%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_has_causalmask_no_bias_no_biasgrad_has_dropout_maxk_128.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_has_causalmask_has_bias_no_biasgrad_no_dropout_maxk_128.cpp
index 3e125e542..7c3a7a165 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_has_causalmask_no_bias_no_biasgrad_has_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_has_causalmask_has_bias_no_biasgrad_no_dropout_maxk_128.cpp
@@ -1,6 +1,6 @@
 
 /*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -8,13 +8,13 @@
  * The file is automatically generated, don't modify!
  */
 
-#include <ck/ck.hpp>
+#include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_grouped_backward.h"
 
 template void run_grouped_backward_causalmask_bias_dropout_dispatch<
-    ck::bhalf_t,
+    ck_tile::bf16_t,
+    true,
     true,
     false,
     false,
-    true,
     128>(GroupedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_no_causalmask_has_bias_no_biasgrad_has_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_has_causalmask_has_bias_no_biasgrad_no_dropout_maxk_32.cpp
similarity index 75%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_no_causalmask_has_bias_no_biasgrad_has_dropout_maxk_32.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_has_causalmask_has_bias_no_biasgrad_no_dropout_maxk_32.cpp
index 7cf70379f..73cd48382 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_no_causalmask_has_bias_no_biasgrad_has_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_has_causalmask_has_bias_no_biasgrad_no_dropout_maxk_32.cpp
@@ -1,6 +1,6 @@
 
 /*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -8,13 +8,13 @@
  * The file is automatically generated, don't modify!
  */
 
-#include <ck/ck.hpp>
+#include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_grouped_backward.h"
 
 template void run_grouped_backward_causalmask_bias_dropout_dispatch<
-    ck::bhalf_t,
-    false,
+    ck_tile::bf16_t,
     true,
-    false,
     true,
+    false,
+    false,
     32>(GroupedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_no_causalmask_has_bias_no_biasgrad_has_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_has_causalmask_has_bias_no_biasgrad_no_dropout_maxk_64.cpp
similarity index 75%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_no_causalmask_has_bias_no_biasgrad_has_dropout_maxk_64.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_has_causalmask_has_bias_no_biasgrad_no_dropout_maxk_64.cpp
index d47bb845b..f9163241f 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_no_causalmask_has_bias_no_biasgrad_has_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_has_causalmask_has_bias_no_biasgrad_no_dropout_maxk_64.cpp
@@ -1,6 +1,6 @@
 
 /*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -8,13 +8,13 @@
  * The file is automatically generated, don't modify!
  */
 
-#include <ck/ck.hpp>
+#include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_grouped_backward.h"
 
 template void run_grouped_backward_causalmask_bias_dropout_dispatch<
-    ck::bhalf_t,
-    false,
+    ck_tile::bf16_t,
     true,
-    false,
     true,
+    false,
+    false,
     64>(GroupedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_no_causalmask_has_bias_no_biasgrad_has_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_has_causalmask_no_bias_no_biasgrad_has_dropout_maxk_128.cpp
similarity index 75%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_no_causalmask_has_bias_no_biasgrad_has_dropout_maxk_128.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_has_causalmask_no_bias_no_biasgrad_has_dropout_maxk_128.cpp
index 42be3cb81..55fa67c3d 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_no_causalmask_has_bias_no_biasgrad_has_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_has_causalmask_no_bias_no_biasgrad_has_dropout_maxk_128.cpp
@@ -1,6 +1,6 @@
 
 /*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -8,13 +8,13 @@
  * The file is automatically generated, don't modify!
  */
 
-#include <ck/ck.hpp>
+#include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_grouped_backward.h"
 
 template void run_grouped_backward_causalmask_bias_dropout_dispatch<
-    ck::bhalf_t,
-    false,
+    ck_tile::bf16_t,
     true,
     false,
+    false,
     true,
     128>(GroupedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_has_causalmask_no_bias_no_biasgrad_has_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_has_causalmask_no_bias_no_biasgrad_has_dropout_maxk_32.cpp
similarity index 75%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_has_causalmask_no_bias_no_biasgrad_has_dropout_maxk_32.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_has_causalmask_no_bias_no_biasgrad_has_dropout_maxk_32.cpp
index 4323e2902..3549f1148 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_has_causalmask_no_bias_no_biasgrad_has_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_has_causalmask_no_bias_no_biasgrad_has_dropout_maxk_32.cpp
@@ -1,6 +1,6 @@
 
 /*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -8,11 +8,11 @@
  * The file is automatically generated, don't modify!
  */
 
-#include <ck/ck.hpp>
+#include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_grouped_backward.h"
 
 template void run_grouped_backward_causalmask_bias_dropout_dispatch<
-    ck::bhalf_t,
+    ck_tile::bf16_t,
     true,
     false,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_no_causalmask_has_bias_has_biasgrad_no_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_has_causalmask_no_bias_no_biasgrad_has_dropout_maxk_64.cpp
similarity index 75%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_no_causalmask_has_bias_has_biasgrad_no_dropout_maxk_64.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_has_causalmask_no_bias_no_biasgrad_has_dropout_maxk_64.cpp
index 1228d91c3..e8735e590 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_no_causalmask_has_bias_has_biasgrad_no_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_has_causalmask_no_bias_no_biasgrad_has_dropout_maxk_64.cpp
@@ -1,6 +1,6 @@
 
 /*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -8,13 +8,13 @@
  * The file is automatically generated, don't modify!
  */
 
-#include <ck/ck.hpp>
+#include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_grouped_backward.h"
 
 template void run_grouped_backward_causalmask_bias_dropout_dispatch<
-    ck::bhalf_t,
-    false,
-    true,
+    ck_tile::bf16_t,
     true,
     false,
+    false,
+    true,
     64>(GroupedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_no_causalmask_has_bias_no_biasgrad_no_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_has_causalmask_no_bias_no_biasgrad_no_dropout_maxk_128.cpp
similarity index 75%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_no_causalmask_has_bias_no_biasgrad_no_dropout_maxk_128.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_has_causalmask_no_bias_no_biasgrad_no_dropout_maxk_128.cpp
index 87da66276..43586d91c 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_no_causalmask_has_bias_no_biasgrad_no_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_has_causalmask_no_bias_no_biasgrad_no_dropout_maxk_128.cpp
@@ -1,6 +1,6 @@
 
 /*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -8,13 +8,13 @@
  * The file is automatically generated, don't modify!
  */
 
-#include <ck/ck.hpp>
+#include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_grouped_backward.h"
 
 template void run_grouped_backward_causalmask_bias_dropout_dispatch<
-    ck::bhalf_t,
-    false,
+    ck_tile::bf16_t,
     true,
     false,
     false,
+    false,
     128>(GroupedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_has_causalmask_no_bias_no_biasgrad_no_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_has_causalmask_no_bias_no_biasgrad_no_dropout_maxk_32.cpp
similarity index 75%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_has_causalmask_no_bias_no_biasgrad_no_dropout_maxk_32.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_has_causalmask_no_bias_no_biasgrad_no_dropout_maxk_32.cpp
index a4fe43dd5..6e6e44a15 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_has_causalmask_no_bias_no_biasgrad_no_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_has_causalmask_no_bias_no_biasgrad_no_dropout_maxk_32.cpp
@@ -1,6 +1,6 @@
 
 /*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -8,11 +8,11 @@
  * The file is automatically generated, don't modify!
  */
 
-#include <ck/ck.hpp>
+#include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_grouped_backward.h"
 
 template void run_grouped_backward_causalmask_bias_dropout_dispatch<
-    ck::bhalf_t,
+    ck_tile::bf16_t,
     true,
     false,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_has_causalmask_no_bias_no_biasgrad_no_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_has_causalmask_no_bias_no_biasgrad_no_dropout_maxk_64.cpp
similarity index 75%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_has_causalmask_no_bias_no_biasgrad_no_dropout_maxk_64.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_has_causalmask_no_bias_no_biasgrad_no_dropout_maxk_64.cpp
index d875a8cb9..16c69fc8f 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_has_causalmask_no_bias_no_biasgrad_no_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_has_causalmask_no_bias_no_biasgrad_no_dropout_maxk_64.cpp
@@ -1,6 +1,6 @@
 
 /*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -8,11 +8,11 @@
  * The file is automatically generated, don't modify!
  */
 
-#include <ck/ck.hpp>
+#include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_grouped_backward.h"
 
 template void run_grouped_backward_causalmask_bias_dropout_dispatch<
-    ck::bhalf_t,
+    ck_tile::bf16_t,
     true,
     false,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_has_causalmask_has_bias_has_biasgrad_no_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_no_causalmask_has_bias_has_biasgrad_has_dropout_maxk_128.cpp
similarity index 75%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_has_causalmask_has_bias_has_biasgrad_no_dropout_maxk_128.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_no_causalmask_has_bias_has_biasgrad_has_dropout_maxk_128.cpp
index 5ebed8c73..c590ef5a4 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_has_causalmask_has_bias_has_biasgrad_no_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_no_causalmask_has_bias_has_biasgrad_has_dropout_maxk_128.cpp
@@ -1,6 +1,6 @@
 
 /*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -8,13 +8,13 @@
  * The file is automatically generated, don't modify!
  */
 
-#include <ck/ck.hpp>
+#include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_grouped_backward.h"
 
 template void run_grouped_backward_causalmask_bias_dropout_dispatch<
-    ck::bhalf_t,
+    ck_tile::bf16_t,
+    false,
     true,
     true,
     true,
-    false,
     128>(GroupedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_has_causalmask_has_bias_no_biasgrad_has_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_no_causalmask_has_bias_has_biasgrad_has_dropout_maxk_32.cpp
similarity index 75%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_has_causalmask_has_bias_no_biasgrad_has_dropout_maxk_32.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_no_causalmask_has_bias_has_biasgrad_has_dropout_maxk_32.cpp
index cbdac868f..6e283c09f 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_has_causalmask_has_bias_no_biasgrad_has_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_no_causalmask_has_bias_has_biasgrad_has_dropout_maxk_32.cpp
@@ -1,6 +1,6 @@
 
 /*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -8,13 +8,13 @@
  * The file is automatically generated, don't modify!
  */
 
-#include <ck/ck.hpp>
+#include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_grouped_backward.h"
 
 template void run_grouped_backward_causalmask_bias_dropout_dispatch<
-    ck::bhalf_t,
+    ck_tile::bf16_t,
+    false,
     true,
     true,
-    false,
     true,
     32>(GroupedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_no_causalmask_has_bias_has_biasgrad_has_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_no_causalmask_has_bias_has_biasgrad_has_dropout_maxk_64.cpp
similarity index 75%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_no_causalmask_has_bias_has_biasgrad_has_dropout_maxk_64.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_no_causalmask_has_bias_has_biasgrad_has_dropout_maxk_64.cpp
index d5e242fec..6d3aebee2 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_no_causalmask_has_bias_has_biasgrad_has_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_no_causalmask_has_bias_has_biasgrad_has_dropout_maxk_64.cpp
@@ -1,6 +1,6 @@
 
 /*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -8,11 +8,11 @@
  * The file is automatically generated, don't modify!
  */
 
-#include <ck/ck.hpp>
+#include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_grouped_backward.h"
 
 template void run_grouped_backward_causalmask_bias_dropout_dispatch<
-    ck::bhalf_t,
+    ck_tile::bf16_t,
     false,
     true,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_has_causalmask_has_bias_no_biasgrad_no_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_no_causalmask_has_bias_has_biasgrad_no_dropout_maxk_128.cpp
similarity index 75%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_has_causalmask_has_bias_no_biasgrad_no_dropout_maxk_128.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_no_causalmask_has_bias_has_biasgrad_no_dropout_maxk_128.cpp
index 8da955f15..62da5b2b3 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_has_causalmask_has_bias_no_biasgrad_no_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_no_causalmask_has_bias_has_biasgrad_no_dropout_maxk_128.cpp
@@ -1,6 +1,6 @@
 
 /*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -8,13 +8,13 @@
  * The file is automatically generated, don't modify!
  */
 
-#include <ck/ck.hpp>
+#include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_grouped_backward.h"
 
 template void run_grouped_backward_causalmask_bias_dropout_dispatch<
-    ck::bhalf_t,
+    ck_tile::bf16_t,
+    false,
     true,
     true,
     false,
-    false,
     128>(GroupedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_no_causalmask_has_bias_has_biasgrad_no_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_no_causalmask_has_bias_has_biasgrad_no_dropout_maxk_32.cpp
similarity index 75%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_no_causalmask_has_bias_has_biasgrad_no_dropout_maxk_32.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_no_causalmask_has_bias_has_biasgrad_no_dropout_maxk_32.cpp
index adaee823c..28184d919 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_no_causalmask_has_bias_has_biasgrad_no_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_no_causalmask_has_bias_has_biasgrad_no_dropout_maxk_32.cpp
@@ -1,6 +1,6 @@
 
 /*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -8,11 +8,11 @@
  * The file is automatically generated, don't modify!
  */
 
-#include <ck/ck.hpp>
+#include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_grouped_backward.h"
 
 template void run_grouped_backward_causalmask_bias_dropout_dispatch<
-    ck::bhalf_t,
+    ck_tile::bf16_t,
     false,
     true,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_has_causalmask_no_bias_no_biasgrad_has_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_no_causalmask_has_bias_has_biasgrad_no_dropout_maxk_64.cpp
similarity index 75%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_has_causalmask_no_bias_no_biasgrad_has_dropout_maxk_64.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_no_causalmask_has_bias_has_biasgrad_no_dropout_maxk_64.cpp
index eb4713c43..a1cdf5607 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_has_causalmask_no_bias_no_biasgrad_has_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_no_causalmask_has_bias_has_biasgrad_no_dropout_maxk_64.cpp
@@ -1,6 +1,6 @@
 
 /*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -8,13 +8,13 @@
  * The file is automatically generated, don't modify!
  */
 
-#include <ck/ck.hpp>
+#include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_grouped_backward.h"
 
 template void run_grouped_backward_causalmask_bias_dropout_dispatch<
-    ck::bhalf_t,
-    true,
-    false,
+    ck_tile::bf16_t,
     false,
     true,
+    true,
+    false,
     64>(GroupedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_no_causalmask_has_bias_has_biasgrad_no_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_no_causalmask_has_bias_no_biasgrad_has_dropout_maxk_128.cpp
similarity index 75%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_no_causalmask_has_bias_has_biasgrad_no_dropout_maxk_128.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_no_causalmask_has_bias_no_biasgrad_has_dropout_maxk_128.cpp
index fc0636bb7..36a047ac7 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_no_causalmask_has_bias_has_biasgrad_no_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_no_causalmask_has_bias_no_biasgrad_has_dropout_maxk_128.cpp
@@ -1,6 +1,6 @@
 
 /*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -8,13 +8,13 @@
  * The file is automatically generated, don't modify!
  */
 
-#include <ck/ck.hpp>
+#include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_grouped_backward.h"
 
 template void run_grouped_backward_causalmask_bias_dropout_dispatch<
-    ck::bhalf_t,
+    ck_tile::bf16_t,
     false,
     true,
-    true,
     false,
+    true,
     128>(GroupedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_has_causalmask_has_bias_no_biasgrad_no_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_no_causalmask_has_bias_no_biasgrad_has_dropout_maxk_32.cpp
similarity index 75%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_has_causalmask_has_bias_no_biasgrad_no_dropout_maxk_32.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_no_causalmask_has_bias_no_biasgrad_has_dropout_maxk_32.cpp
index c77696023..3930123b2 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_has_causalmask_has_bias_no_biasgrad_no_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_no_causalmask_has_bias_no_biasgrad_has_dropout_maxk_32.cpp
@@ -1,6 +1,6 @@
 
 /*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -8,13 +8,13 @@
  * The file is automatically generated, don't modify!
  */
 
-#include <ck/ck.hpp>
+#include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_grouped_backward.h"
 
 template void run_grouped_backward_causalmask_bias_dropout_dispatch<
-    ck::bhalf_t,
-    true,
-    true,
+    ck_tile::bf16_t,
     false,
+    true,
     false,
+    true,
     32>(GroupedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_has_causalmask_has_bias_no_biasgrad_no_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_no_causalmask_has_bias_no_biasgrad_has_dropout_maxk_64.cpp
similarity index 75%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_has_causalmask_has_bias_no_biasgrad_no_dropout_maxk_64.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_no_causalmask_has_bias_no_biasgrad_has_dropout_maxk_64.cpp
index 4527adc28..60bd6d5c7 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_has_causalmask_has_bias_no_biasgrad_no_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_no_causalmask_has_bias_no_biasgrad_has_dropout_maxk_64.cpp
@@ -1,6 +1,6 @@
 
 /*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -8,13 +8,13 @@
  * The file is automatically generated, don't modify!
  */
 
-#include <ck/ck.hpp>
+#include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_grouped_backward.h"
 
 template void run_grouped_backward_causalmask_bias_dropout_dispatch<
-    ck::bhalf_t,
-    true,
-    true,
+    ck_tile::bf16_t,
     false,
+    true,
     false,
+    true,
     64>(GroupedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_has_causalmask_no_bias_no_biasgrad_no_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_no_causalmask_has_bias_no_biasgrad_no_dropout_maxk_128.cpp
similarity index 75%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_has_causalmask_no_bias_no_biasgrad_no_dropout_maxk_128.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_no_causalmask_has_bias_no_biasgrad_no_dropout_maxk_128.cpp
index 35041c002..549983dc4 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_has_causalmask_no_bias_no_biasgrad_no_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_no_causalmask_has_bias_no_biasgrad_no_dropout_maxk_128.cpp
@@ -1,6 +1,6 @@
 
 /*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -8,13 +8,13 @@
  * The file is automatically generated, don't modify!
  */
 
-#include <ck/ck.hpp>
+#include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_grouped_backward.h"
 
 template void run_grouped_backward_causalmask_bias_dropout_dispatch<
-    ck::bhalf_t,
-    true,
+    ck_tile::bf16_t,
     false,
+    true,
     false,
     false,
     128>(GroupedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_no_causalmask_has_bias_no_biasgrad_no_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_no_causalmask_has_bias_no_biasgrad_no_dropout_maxk_32.cpp
similarity index 75%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_no_causalmask_has_bias_no_biasgrad_no_dropout_maxk_32.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_no_causalmask_has_bias_no_biasgrad_no_dropout_maxk_32.cpp
index 1a67c23b7..8c32f736f 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_no_causalmask_has_bias_no_biasgrad_no_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_no_causalmask_has_bias_no_biasgrad_no_dropout_maxk_32.cpp
@@ -1,6 +1,6 @@
 
 /*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -8,11 +8,11 @@
  * The file is automatically generated, don't modify!
  */
 
-#include <ck/ck.hpp>
+#include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_grouped_backward.h"
 
 template void run_grouped_backward_causalmask_bias_dropout_dispatch<
-    ck::bhalf_t,
+    ck_tile::bf16_t,
     false,
     true,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_no_causalmask_has_bias_no_biasgrad_no_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_no_causalmask_has_bias_no_biasgrad_no_dropout_maxk_64.cpp
similarity index 75%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_no_causalmask_has_bias_no_biasgrad_no_dropout_maxk_64.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_no_causalmask_has_bias_no_biasgrad_no_dropout_maxk_64.cpp
index bd7697091..e4a8919eb 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_no_causalmask_has_bias_no_biasgrad_no_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_no_causalmask_has_bias_no_biasgrad_no_dropout_maxk_64.cpp
@@ -1,6 +1,6 @@
 
 /*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -8,11 +8,11 @@
  * The file is automatically generated, don't modify!
  */
 
-#include <ck/ck.hpp>
+#include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_grouped_backward.h"
 
 template void run_grouped_backward_causalmask_bias_dropout_dispatch<
-    ck::bhalf_t,
+    ck_tile::bf16_t,
     false,
     true,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_no_causalmask_no_bias_no_biasgrad_has_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_no_causalmask_no_bias_no_biasgrad_has_dropout_maxk_128.cpp
similarity index 75%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_no_causalmask_no_bias_no_biasgrad_has_dropout_maxk_128.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_no_causalmask_no_bias_no_biasgrad_has_dropout_maxk_128.cpp
index 115f80da5..d88c4a1e0 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_no_causalmask_no_bias_no_biasgrad_has_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_no_causalmask_no_bias_no_biasgrad_has_dropout_maxk_128.cpp
@@ -1,6 +1,6 @@
 
 /*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -8,11 +8,11 @@
  * The file is automatically generated, don't modify!
  */
 
-#include <ck/ck.hpp>
+#include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_grouped_backward.h"
 
 template void run_grouped_backward_causalmask_bias_dropout_dispatch<
-    ck::bhalf_t,
+    ck_tile::bf16_t,
     false,
     false,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_no_causalmask_no_bias_no_biasgrad_has_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_no_causalmask_no_bias_no_biasgrad_has_dropout_maxk_32.cpp
similarity index 75%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_no_causalmask_no_bias_no_biasgrad_has_dropout_maxk_32.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_no_causalmask_no_bias_no_biasgrad_has_dropout_maxk_32.cpp
index 31ee39fb2..8aeb02787 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_no_causalmask_no_bias_no_biasgrad_has_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_no_causalmask_no_bias_no_biasgrad_has_dropout_maxk_32.cpp
@@ -1,6 +1,6 @@
 
 /*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -8,11 +8,11 @@
  * The file is automatically generated, don't modify!
  */
 
-#include <ck/ck.hpp>
+#include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_grouped_backward.h"
 
 template void run_grouped_backward_causalmask_bias_dropout_dispatch<
-    ck::bhalf_t,
+    ck_tile::bf16_t,
     false,
     false,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_no_causalmask_no_bias_no_biasgrad_has_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_no_causalmask_no_bias_no_biasgrad_has_dropout_maxk_64.cpp
similarity index 75%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_no_causalmask_no_bias_no_biasgrad_has_dropout_maxk_64.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_no_causalmask_no_bias_no_biasgrad_has_dropout_maxk_64.cpp
index 258db9fce..a41d5eace 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_no_causalmask_no_bias_no_biasgrad_has_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_no_causalmask_no_bias_no_biasgrad_has_dropout_maxk_64.cpp
@@ -1,6 +1,6 @@
 
 /*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -8,11 +8,11 @@
  * The file is automatically generated, don't modify!
  */
 
-#include <ck/ck.hpp>
+#include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_grouped_backward.h"
 
 template void run_grouped_backward_causalmask_bias_dropout_dispatch<
-    ck::bhalf_t,
+    ck_tile::bf16_t,
     false,
     false,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_no_causalmask_no_bias_no_biasgrad_no_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_no_causalmask_no_bias_no_biasgrad_no_dropout_maxk_128.cpp
similarity index 75%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_no_causalmask_no_bias_no_biasgrad_no_dropout_maxk_128.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_no_causalmask_no_bias_no_biasgrad_no_dropout_maxk_128.cpp
index b848cecf7..324e1f0d0 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_no_causalmask_no_bias_no_biasgrad_no_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_no_causalmask_no_bias_no_biasgrad_no_dropout_maxk_128.cpp
@@ -1,6 +1,6 @@
 
 /*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -8,11 +8,11 @@
  * The file is automatically generated, don't modify!
  */
 
-#include <ck/ck.hpp>
+#include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_grouped_backward.h"
 
 template void run_grouped_backward_causalmask_bias_dropout_dispatch<
-    ck::bhalf_t,
+    ck_tile::bf16_t,
     false,
     false,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_no_causalmask_no_bias_no_biasgrad_no_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_no_causalmask_no_bias_no_biasgrad_no_dropout_maxk_32.cpp
similarity index 75%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_no_causalmask_no_bias_no_biasgrad_no_dropout_maxk_32.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_no_causalmask_no_bias_no_biasgrad_no_dropout_maxk_32.cpp
index 89da82e0f..630e0f72c 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_no_causalmask_no_bias_no_biasgrad_no_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_no_causalmask_no_bias_no_biasgrad_no_dropout_maxk_32.cpp
@@ -1,6 +1,6 @@
 
 /*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -8,11 +8,11 @@
  * The file is automatically generated, don't modify!
  */
 
-#include <ck/ck.hpp>
+#include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_grouped_backward.h"
 
 template void run_grouped_backward_causalmask_bias_dropout_dispatch<
-    ck::bhalf_t,
+    ck_tile::bf16_t,
     false,
     false,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_no_causalmask_no_bias_no_biasgrad_no_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_no_causalmask_no_bias_no_biasgrad_no_dropout_maxk_64.cpp
similarity index 75%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_no_causalmask_no_bias_no_biasgrad_no_dropout_maxk_64.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_no_causalmask_no_bias_no_biasgrad_no_dropout_maxk_64.cpp
index 41d42b992..b2b7066df 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bp16_no_causalmask_no_bias_no_biasgrad_no_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_no_causalmask_no_bias_no_biasgrad_no_dropout_maxk_64.cpp
@@ -1,6 +1,6 @@
 
 /*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -8,11 +8,11 @@
  * The file is automatically generated, don't modify!
  */
 
-#include <ck/ck.hpp>
+#include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_grouped_backward.h"
 
 template void run_grouped_backward_causalmask_bias_dropout_dispatch<
-    ck::bhalf_t,
+    ck_tile::bf16_t,
     false,
     false,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_has_bias_has_biasgrad_has_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_has_bias_has_biasgrad_has_dropout_maxk_128.cpp
index cde7b8f08..9f7544038 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_has_bias_has_biasgrad_has_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_has_bias_has_biasgrad_has_dropout_maxk_128.cpp
@@ -1,6 +1,6 @@
 
 /*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -8,11 +8,11 @@
  * The file is automatically generated, don't modify!
  */
 
-#include <ck/ck.hpp>
+#include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_grouped_backward.h"
 
 template void run_grouped_backward_causalmask_bias_dropout_dispatch<
-    ck::half_t,
+    ck_tile::fp16_t,
     true,
     true,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_has_bias_has_biasgrad_has_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_has_bias_has_biasgrad_has_dropout_maxk_32.cpp
index c2298cb86..ab6c752ab 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_has_bias_has_biasgrad_has_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_has_bias_has_biasgrad_has_dropout_maxk_32.cpp
@@ -1,6 +1,6 @@
 
 /*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -8,11 +8,11 @@
  * The file is automatically generated, don't modify!
  */
 
-#include <ck/ck.hpp>
+#include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_grouped_backward.h"
 
 template void run_grouped_backward_causalmask_bias_dropout_dispatch<
-    ck::half_t,
+    ck_tile::fp16_t,
     true,
     true,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_has_bias_has_biasgrad_has_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_has_bias_has_biasgrad_has_dropout_maxk_64.cpp
index 8342afa37..988114605 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_has_bias_has_biasgrad_has_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_has_bias_has_biasgrad_has_dropout_maxk_64.cpp
@@ -1,6 +1,6 @@
 
 /*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -8,11 +8,11 @@
  * The file is automatically generated, don't modify!
  */
 
-#include <ck/ck.hpp>
+#include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_grouped_backward.h"
 
 template void run_grouped_backward_causalmask_bias_dropout_dispatch<
-    ck::half_t,
+    ck_tile::fp16_t,
     true,
     true,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_has_bias_has_biasgrad_no_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_has_bias_has_biasgrad_no_dropout_maxk_128.cpp
index 834b1d625..539311424 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_has_bias_has_biasgrad_no_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_has_bias_has_biasgrad_no_dropout_maxk_128.cpp
@@ -1,6 +1,6 @@
 
 /*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -8,11 +8,11 @@
  * The file is automatically generated, don't modify!
  */
 
-#include <ck/ck.hpp>
+#include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_grouped_backward.h"
 
 template void run_grouped_backward_causalmask_bias_dropout_dispatch<
-    ck::half_t,
+    ck_tile::fp16_t,
     true,
     true,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_has_bias_has_biasgrad_no_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_has_bias_has_biasgrad_no_dropout_maxk_32.cpp
index 0656ea175..34dd66471 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_has_bias_has_biasgrad_no_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_has_bias_has_biasgrad_no_dropout_maxk_32.cpp
@@ -1,6 +1,6 @@
 
 /*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -8,11 +8,11 @@
  * The file is automatically generated, don't modify!
  */
 
-#include <ck/ck.hpp>
+#include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_grouped_backward.h"
 
 template void run_grouped_backward_causalmask_bias_dropout_dispatch<
-    ck::half_t,
+    ck_tile::fp16_t,
     true,
     true,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_has_bias_has_biasgrad_no_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_has_bias_has_biasgrad_no_dropout_maxk_64.cpp
index 6bb731da4..88305d7de 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_has_bias_has_biasgrad_no_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_has_bias_has_biasgrad_no_dropout_maxk_64.cpp
@@ -1,6 +1,6 @@
 
 /*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -8,11 +8,11 @@
  * The file is automatically generated, don't modify!
  */
 
-#include <ck/ck.hpp>
+#include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_grouped_backward.h"
 
 template void run_grouped_backward_causalmask_bias_dropout_dispatch<
-    ck::half_t,
+    ck_tile::fp16_t,
     true,
     true,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_has_bias_no_biasgrad_has_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_has_bias_no_biasgrad_has_dropout_maxk_128.cpp
index fb458f74c..4ff2f792b 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_has_bias_no_biasgrad_has_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_has_bias_no_biasgrad_has_dropout_maxk_128.cpp
@@ -1,6 +1,6 @@
 
 /*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -8,11 +8,11 @@
  * The file is automatically generated, don't modify!
  */
 
-#include <ck/ck.hpp>
+#include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_grouped_backward.h"
 
 template void run_grouped_backward_causalmask_bias_dropout_dispatch<
-    ck::half_t,
+    ck_tile::fp16_t,
     true,
     true,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_has_bias_no_biasgrad_has_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_has_bias_no_biasgrad_has_dropout_maxk_32.cpp
index 9536035d6..9534a7f50 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_has_bias_no_biasgrad_has_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_has_bias_no_biasgrad_has_dropout_maxk_32.cpp
@@ -1,6 +1,6 @@
 
 /*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -8,11 +8,11 @@
  * The file is automatically generated, don't modify!
  */
 
-#include <ck/ck.hpp>
+#include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_grouped_backward.h"
 
 template void run_grouped_backward_causalmask_bias_dropout_dispatch<
-    ck::half_t,
+    ck_tile::fp16_t,
     true,
     true,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_has_bias_no_biasgrad_has_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_has_bias_no_biasgrad_has_dropout_maxk_64.cpp
index 666ae6242..906dcd51b 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_has_bias_no_biasgrad_has_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_has_bias_no_biasgrad_has_dropout_maxk_64.cpp
@@ -1,6 +1,6 @@
 
 /*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -8,11 +8,11 @@
  * The file is automatically generated, don't modify!
  */
 
-#include <ck/ck.hpp>
+#include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_grouped_backward.h"
 
 template void run_grouped_backward_causalmask_bias_dropout_dispatch<
-    ck::half_t,
+    ck_tile::fp16_t,
     true,
     true,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_has_bias_no_biasgrad_no_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_has_bias_no_biasgrad_no_dropout_maxk_128.cpp
index d24d3d0f9..926aadb7f 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_has_bias_no_biasgrad_no_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_has_bias_no_biasgrad_no_dropout_maxk_128.cpp
@@ -1,6 +1,6 @@
 
 /*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -8,11 +8,11 @@
  * The file is automatically generated, don't modify!
  */
 
-#include <ck/ck.hpp>
+#include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_grouped_backward.h"
 
 template void run_grouped_backward_causalmask_bias_dropout_dispatch<
-    ck::half_t,
+    ck_tile::fp16_t,
     true,
     true,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_has_bias_no_biasgrad_no_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_has_bias_no_biasgrad_no_dropout_maxk_32.cpp
index 82740f8dd..5c29ff3c0 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_has_bias_no_biasgrad_no_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_has_bias_no_biasgrad_no_dropout_maxk_32.cpp
@@ -1,6 +1,6 @@
 
 /*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -8,11 +8,11 @@
  * The file is automatically generated, don't modify!
  */
 
-#include <ck/ck.hpp>
+#include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_grouped_backward.h"
 
 template void run_grouped_backward_causalmask_bias_dropout_dispatch<
-    ck::half_t,
+    ck_tile::fp16_t,
     true,
     true,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_has_bias_no_biasgrad_no_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_has_bias_no_biasgrad_no_dropout_maxk_64.cpp
index 7cfa9ecab..75684001a 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_has_bias_no_biasgrad_no_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_has_bias_no_biasgrad_no_dropout_maxk_64.cpp
@@ -1,6 +1,6 @@
 
 /*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -8,11 +8,11 @@
  * The file is automatically generated, don't modify!
  */
 
-#include <ck/ck.hpp>
+#include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_grouped_backward.h"
 
 template void run_grouped_backward_causalmask_bias_dropout_dispatch<
-    ck::half_t,
+    ck_tile::fp16_t,
     true,
     true,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_no_bias_no_biasgrad_has_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_no_bias_no_biasgrad_has_dropout_maxk_128.cpp
index 0f12efbed..13e995979 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_no_bias_no_biasgrad_has_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_no_bias_no_biasgrad_has_dropout_maxk_128.cpp
@@ -1,6 +1,6 @@
 
 /*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -8,11 +8,11 @@
  * The file is automatically generated, don't modify!
  */
 
-#include <ck/ck.hpp>
+#include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_grouped_backward.h"
 
 template void run_grouped_backward_causalmask_bias_dropout_dispatch<
-    ck::half_t,
+    ck_tile::fp16_t,
     true,
     false,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_no_bias_no_biasgrad_has_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_no_bias_no_biasgrad_has_dropout_maxk_32.cpp
index 88d34ede5..d41ee2d19 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_no_bias_no_biasgrad_has_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_no_bias_no_biasgrad_has_dropout_maxk_32.cpp
@@ -1,6 +1,6 @@
 
 /*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -8,11 +8,11 @@
  * The file is automatically generated, don't modify!
  */
 
-#include <ck/ck.hpp>
+#include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_grouped_backward.h"
 
 template void run_grouped_backward_causalmask_bias_dropout_dispatch<
-    ck::half_t,
+    ck_tile::fp16_t,
     true,
     false,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_no_bias_no_biasgrad_has_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_no_bias_no_biasgrad_has_dropout_maxk_64.cpp
index ed0c9af4d..702a3bf4f 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_no_bias_no_biasgrad_has_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_no_bias_no_biasgrad_has_dropout_maxk_64.cpp
@@ -1,6 +1,6 @@
 
 /*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -8,11 +8,11 @@
  * The file is automatically generated, don't modify!
  */
 
-#include <ck/ck.hpp>
+#include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_grouped_backward.h"
 
 template void run_grouped_backward_causalmask_bias_dropout_dispatch<
-    ck::half_t,
+    ck_tile::fp16_t,
     true,
     false,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_no_bias_no_biasgrad_no_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_no_bias_no_biasgrad_no_dropout_maxk_128.cpp
index 597c93939..b450ef78d 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_no_bias_no_biasgrad_no_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_no_bias_no_biasgrad_no_dropout_maxk_128.cpp
@@ -1,6 +1,6 @@
 
 /*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -8,11 +8,11 @@
  * The file is automatically generated, don't modify!
  */
 
-#include <ck/ck.hpp>
+#include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_grouped_backward.h"
 
 template void run_grouped_backward_causalmask_bias_dropout_dispatch<
-    ck::half_t,
+    ck_tile::fp16_t,
     true,
     false,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_no_bias_no_biasgrad_no_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_no_bias_no_biasgrad_no_dropout_maxk_32.cpp
index 0fe702a09..be18be183 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_no_bias_no_biasgrad_no_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_no_bias_no_biasgrad_no_dropout_maxk_32.cpp
@@ -1,6 +1,6 @@
 
 /*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -8,11 +8,11 @@
  * The file is automatically generated, don't modify!
  */
 
-#include <ck/ck.hpp>
+#include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_grouped_backward.h"
 
 template void run_grouped_backward_causalmask_bias_dropout_dispatch<
-    ck::half_t,
+    ck_tile::fp16_t,
     true,
     false,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_no_bias_no_biasgrad_no_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_no_bias_no_biasgrad_no_dropout_maxk_64.cpp
index e5ab9b62c..b93c05261 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_no_bias_no_biasgrad_no_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_no_bias_no_biasgrad_no_dropout_maxk_64.cpp
@@ -1,6 +1,6 @@
 
 /*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -8,11 +8,11 @@
  * The file is automatically generated, don't modify!
  */
 
-#include <ck/ck.hpp>
+#include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_grouped_backward.h"
 
 template void run_grouped_backward_causalmask_bias_dropout_dispatch<
-    ck::half_t,
+    ck_tile::fp16_t,
     true,
     false,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_has_bias_has_biasgrad_has_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_has_bias_has_biasgrad_has_dropout_maxk_128.cpp
index 582dd07ae..fc26a3025 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_has_bias_has_biasgrad_has_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_has_bias_has_biasgrad_has_dropout_maxk_128.cpp
@@ -1,6 +1,6 @@
 
 /*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -8,11 +8,11 @@
  * The file is automatically generated, don't modify!
  */
 
-#include <ck/ck.hpp>
+#include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_grouped_backward.h"
 
 template void run_grouped_backward_causalmask_bias_dropout_dispatch<
-    ck::half_t,
+    ck_tile::fp16_t,
     false,
     true,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_has_bias_has_biasgrad_has_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_has_bias_has_biasgrad_has_dropout_maxk_32.cpp
index 4cf3d362e..841cc31e5 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_has_bias_has_biasgrad_has_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_has_bias_has_biasgrad_has_dropout_maxk_32.cpp
@@ -1,6 +1,6 @@
 
 /*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -8,11 +8,11 @@
  * The file is automatically generated, don't modify!
  */
 
-#include <ck/ck.hpp>
+#include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_grouped_backward.h"
 
 template void run_grouped_backward_causalmask_bias_dropout_dispatch<
-    ck::half_t,
+    ck_tile::fp16_t,
     false,
     true,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_has_bias_has_biasgrad_has_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_has_bias_has_biasgrad_has_dropout_maxk_64.cpp
index 3c0e08ef5..f2865241c 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_has_bias_has_biasgrad_has_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_has_bias_has_biasgrad_has_dropout_maxk_64.cpp
@@ -1,6 +1,6 @@
 
 /*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -8,11 +8,11 @@
  * The file is automatically generated, don't modify!
  */
 
-#include <ck/ck.hpp>
+#include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_grouped_backward.h"
 
 template void run_grouped_backward_causalmask_bias_dropout_dispatch<
-    ck::half_t,
+    ck_tile::fp16_t,
     false,
     true,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_has_bias_has_biasgrad_no_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_has_bias_has_biasgrad_no_dropout_maxk_128.cpp
index be449dddb..35edebe38 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_has_bias_has_biasgrad_no_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_has_bias_has_biasgrad_no_dropout_maxk_128.cpp
@@ -1,6 +1,6 @@
 
 /*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -8,11 +8,11 @@
  * The file is automatically generated, don't modify!
  */
 
-#include <ck/ck.hpp>
+#include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_grouped_backward.h"
 
 template void run_grouped_backward_causalmask_bias_dropout_dispatch<
-    ck::half_t,
+    ck_tile::fp16_t,
     false,
     true,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_has_bias_has_biasgrad_no_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_has_bias_has_biasgrad_no_dropout_maxk_32.cpp
index 8e56f25d3..8e0d32d5a 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_has_bias_has_biasgrad_no_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_has_bias_has_biasgrad_no_dropout_maxk_32.cpp
@@ -1,6 +1,6 @@
 
 /*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -8,11 +8,11 @@
  * The file is automatically generated, don't modify!
  */
 
-#include <ck/ck.hpp>
+#include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_grouped_backward.h"
 
 template void run_grouped_backward_causalmask_bias_dropout_dispatch<
-    ck::half_t,
+    ck_tile::fp16_t,
     false,
     true,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_has_bias_has_biasgrad_no_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_has_bias_has_biasgrad_no_dropout_maxk_64.cpp
index c4ed120c0..573ec892b 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_has_bias_has_biasgrad_no_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_has_bias_has_biasgrad_no_dropout_maxk_64.cpp
@@ -1,6 +1,6 @@
 
 /*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -8,11 +8,11 @@
  * The file is automatically generated, don't modify!
  */
 
-#include <ck/ck.hpp>
+#include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_grouped_backward.h"
 
 template void run_grouped_backward_causalmask_bias_dropout_dispatch<
-    ck::half_t,
+    ck_tile::fp16_t,
     false,
     true,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_has_bias_no_biasgrad_has_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_has_bias_no_biasgrad_has_dropout_maxk_128.cpp
index 05ccb961b..33f9cace9 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_has_bias_no_biasgrad_has_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_has_bias_no_biasgrad_has_dropout_maxk_128.cpp
@@ -1,6 +1,6 @@
 
 /*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -8,11 +8,11 @@
  * The file is automatically generated, don't modify!
  */
 
-#include <ck/ck.hpp>
+#include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_grouped_backward.h"
 
 template void run_grouped_backward_causalmask_bias_dropout_dispatch<
-    ck::half_t,
+    ck_tile::fp16_t,
     false,
     true,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_has_bias_no_biasgrad_has_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_has_bias_no_biasgrad_has_dropout_maxk_32.cpp
index ab7a421fc..683918a99 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_has_bias_no_biasgrad_has_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_has_bias_no_biasgrad_has_dropout_maxk_32.cpp
@@ -1,6 +1,6 @@
 
 /*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -8,11 +8,11 @@
  * The file is automatically generated, don't modify!
  */
 
-#include <ck/ck.hpp>
+#include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_grouped_backward.h"
 
 template void run_grouped_backward_causalmask_bias_dropout_dispatch<
-    ck::half_t,
+    ck_tile::fp16_t,
     false,
     true,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_has_bias_no_biasgrad_has_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_has_bias_no_biasgrad_has_dropout_maxk_64.cpp
index 810225ab7..e0c419d2f 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_has_bias_no_biasgrad_has_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_has_bias_no_biasgrad_has_dropout_maxk_64.cpp
@@ -1,6 +1,6 @@
 
 /*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -8,11 +8,11 @@
  * The file is automatically generated, don't modify!
  */
 
-#include <ck/ck.hpp>
+#include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_grouped_backward.h"
 
 template void run_grouped_backward_causalmask_bias_dropout_dispatch<
-    ck::half_t,
+    ck_tile::fp16_t,
     false,
     true,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_has_bias_no_biasgrad_no_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_has_bias_no_biasgrad_no_dropout_maxk_128.cpp
index 2f5ad17f5..52e41c45d 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_has_bias_no_biasgrad_no_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_has_bias_no_biasgrad_no_dropout_maxk_128.cpp
@@ -1,6 +1,6 @@
 
 /*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -8,11 +8,11 @@
  * The file is automatically generated, don't modify!
  */
 
-#include <ck/ck.hpp>
+#include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_grouped_backward.h"
 
 template void run_grouped_backward_causalmask_bias_dropout_dispatch<
-    ck::half_t,
+    ck_tile::fp16_t,
     false,
     true,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_has_bias_no_biasgrad_no_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_has_bias_no_biasgrad_no_dropout_maxk_32.cpp
index 590b22987..acdf13265 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_has_bias_no_biasgrad_no_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_has_bias_no_biasgrad_no_dropout_maxk_32.cpp
@@ -1,6 +1,6 @@
 
 /*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -8,11 +8,11 @@
  * The file is automatically generated, don't modify!
  */
 
-#include <ck/ck.hpp>
+#include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_grouped_backward.h"
 
 template void run_grouped_backward_causalmask_bias_dropout_dispatch<
-    ck::half_t,
+    ck_tile::fp16_t,
     false,
     true,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_has_bias_no_biasgrad_no_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_has_bias_no_biasgrad_no_dropout_maxk_64.cpp
index 07d372940..6729d5917 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_has_bias_no_biasgrad_no_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_has_bias_no_biasgrad_no_dropout_maxk_64.cpp
@@ -1,6 +1,6 @@
 
 /*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -8,11 +8,11 @@
  * The file is automatically generated, don't modify!
  */
 
-#include <ck/ck.hpp>
+#include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_grouped_backward.h"
 
 template void run_grouped_backward_causalmask_bias_dropout_dispatch<
-    ck::half_t,
+    ck_tile::fp16_t,
     false,
     true,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_no_bias_no_biasgrad_has_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_no_bias_no_biasgrad_has_dropout_maxk_128.cpp
index c65c96f5d..072115903 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_no_bias_no_biasgrad_has_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_no_bias_no_biasgrad_has_dropout_maxk_128.cpp
@@ -1,6 +1,6 @@
 
 /*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -8,11 +8,11 @@
  * The file is automatically generated, don't modify!
  */
 
-#include <ck/ck.hpp>
+#include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_grouped_backward.h"
 
 template void run_grouped_backward_causalmask_bias_dropout_dispatch<
-    ck::half_t,
+    ck_tile::fp16_t,
     false,
     false,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_no_bias_no_biasgrad_has_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_no_bias_no_biasgrad_has_dropout_maxk_32.cpp
index e4aa0ac8a..64ff3db39 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_no_bias_no_biasgrad_has_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_no_bias_no_biasgrad_has_dropout_maxk_32.cpp
@@ -1,6 +1,6 @@
 
 /*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -8,11 +8,11 @@
  * The file is automatically generated, don't modify!
  */
 
-#include <ck/ck.hpp>
+#include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_grouped_backward.h"
 
 template void run_grouped_backward_causalmask_bias_dropout_dispatch<
-    ck::half_t,
+    ck_tile::fp16_t,
     false,
     false,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_no_bias_no_biasgrad_has_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_no_bias_no_biasgrad_has_dropout_maxk_64.cpp
index 63d619d8d..f3acd7e17 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_no_bias_no_biasgrad_has_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_no_bias_no_biasgrad_has_dropout_maxk_64.cpp
@@ -1,6 +1,6 @@
 
 /*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -8,11 +8,11 @@
  * The file is automatically generated, don't modify!
  */
 
-#include <ck/ck.hpp>
+#include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_grouped_backward.h"
 
 template void run_grouped_backward_causalmask_bias_dropout_dispatch<
-    ck::half_t,
+    ck_tile::fp16_t,
     false,
     false,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_no_bias_no_biasgrad_no_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_no_bias_no_biasgrad_no_dropout_maxk_128.cpp
index 905448129..d78c56731 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_no_bias_no_biasgrad_no_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_no_bias_no_biasgrad_no_dropout_maxk_128.cpp
@@ -1,6 +1,6 @@
 
 /*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -8,11 +8,11 @@
  * The file is automatically generated, don't modify!
  */
 
-#include <ck/ck.hpp>
+#include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_grouped_backward.h"
 
 template void run_grouped_backward_causalmask_bias_dropout_dispatch<
-    ck::half_t,
+    ck_tile::fp16_t,
     false,
     false,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_no_bias_no_biasgrad_no_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_no_bias_no_biasgrad_no_dropout_maxk_32.cpp
index a5c107a93..06dc769b9 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_no_bias_no_biasgrad_no_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_no_bias_no_biasgrad_no_dropout_maxk_32.cpp
@@ -1,6 +1,6 @@
 
 /*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -8,11 +8,11 @@
  * The file is automatically generated, don't modify!
  */
 
-#include <ck/ck.hpp>
+#include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_grouped_backward.h"
 
 template void run_grouped_backward_causalmask_bias_dropout_dispatch<
-    ck::half_t,
+    ck_tile::fp16_t,
     false,
     false,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_no_bias_no_biasgrad_no_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_no_bias_no_biasgrad_no_dropout_maxk_64.cpp
index a9245471c..63928f3a2 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_no_bias_no_biasgrad_no_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_no_bias_no_biasgrad_no_dropout_maxk_64.cpp
@@ -1,6 +1,6 @@
 
 /*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -8,11 +8,11 @@
  * The file is automatically generated, don't modify!
  */
 
-#include <ck/ck.hpp>
+#include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_grouped_backward.h"
 
 template void run_grouped_backward_causalmask_bias_dropout_dispatch<
-    ck::half_t,
+    ck_tile::fp16_t,
     false,
     false,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_has_causalmask_has_bias_has_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bf16_has_causalmask_has_bias_has_dropout_maxk_128.cpp
similarity index 74%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_has_causalmask_has_bias_has_dropout_maxk_128.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bf16_has_causalmask_has_bias_has_dropout_maxk_128.cpp
index 780d6bc5d..55e21c75a 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_has_causalmask_has_bias_has_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bf16_has_causalmask_has_bias_has_dropout_maxk_128.cpp
@@ -1,6 +1,6 @@
 
 /*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -8,11 +8,11 @@
  * The file is automatically generated, don't modify!
  */
 
-#include <ck/ck.hpp>
+#include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_grouped_forward.h"
 
 template void run_grouped_forward_causalmask_bias_dropout_dispatch<
-    ck::bhalf_t,
+    ck_tile::bf16_t,
     true,
     true,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_has_causalmask_has_bias_has_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bf16_has_causalmask_has_bias_has_dropout_maxk_256.cpp
similarity index 74%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_has_causalmask_has_bias_has_dropout_maxk_256.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bf16_has_causalmask_has_bias_has_dropout_maxk_256.cpp
index 597de4543..7c1c89f54 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_has_causalmask_has_bias_has_dropout_maxk_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bf16_has_causalmask_has_bias_has_dropout_maxk_256.cpp
@@ -1,6 +1,6 @@
 
 /*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -8,11 +8,11 @@
  * The file is automatically generated, don't modify!
  */
 
-#include <ck/ck.hpp>
+#include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_grouped_forward.h"
 
 template void run_grouped_forward_causalmask_bias_dropout_dispatch<
-    ck::bhalf_t,
+    ck_tile::bf16_t,
     true,
     true,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_has_causalmask_has_bias_has_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bf16_has_causalmask_has_bias_has_dropout_maxk_32.cpp
similarity index 74%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_has_causalmask_has_bias_has_dropout_maxk_32.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bf16_has_causalmask_has_bias_has_dropout_maxk_32.cpp
index 5608da950..9453c7d2c 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_has_causalmask_has_bias_has_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bf16_has_causalmask_has_bias_has_dropout_maxk_32.cpp
@@ -1,6 +1,6 @@
 
 /*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -8,11 +8,11 @@
  * The file is automatically generated, don't modify!
  */
 
-#include <ck/ck.hpp>
+#include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_grouped_forward.h"
 
 template void run_grouped_forward_causalmask_bias_dropout_dispatch<
-    ck::bhalf_t,
+    ck_tile::bf16_t,
     true,
     true,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_has_causalmask_has_bias_has_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bf16_has_causalmask_has_bias_has_dropout_maxk_64.cpp
similarity index 74%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_has_causalmask_has_bias_has_dropout_maxk_64.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bf16_has_causalmask_has_bias_has_dropout_maxk_64.cpp
index e67cfe516..888c865cd 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_has_causalmask_has_bias_has_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bf16_has_causalmask_has_bias_has_dropout_maxk_64.cpp
@@ -1,6 +1,6 @@
 
 /*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -8,11 +8,11 @@
  * The file is automatically generated, don't modify!
  */
 
-#include <ck/ck.hpp>
+#include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_grouped_forward.h"
 
 template void run_grouped_forward_causalmask_bias_dropout_dispatch<
-    ck::bhalf_t,
+    ck_tile::bf16_t,
     true,
     true,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_no_causalmask_has_bias_has_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bf16_has_causalmask_has_bias_no_dropout_maxk_128.cpp
similarity index 74%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_no_causalmask_has_bias_has_dropout_maxk_128.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bf16_has_causalmask_has_bias_no_dropout_maxk_128.cpp
index 809a3597b..1e1231370 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_no_causalmask_has_bias_has_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bf16_has_causalmask_has_bias_no_dropout_maxk_128.cpp
@@ -1,6 +1,6 @@
 
 /*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -8,12 +8,12 @@
  * The file is automatically generated, don't modify!
  */
 
-#include <ck/ck.hpp>
+#include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_grouped_forward.h"
 
 template void run_grouped_forward_causalmask_bias_dropout_dispatch<
-    ck::bhalf_t,
-    false,
+    ck_tile::bf16_t,
     true,
     true,
+    false,
     128>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_has_causalmask_no_bias_has_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bf16_has_causalmask_has_bias_no_dropout_maxk_256.cpp
similarity index 74%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_has_causalmask_no_bias_has_dropout_maxk_256.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bf16_has_causalmask_has_bias_no_dropout_maxk_256.cpp
index c027178b7..03625b779 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_has_causalmask_no_bias_has_dropout_maxk_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bf16_has_causalmask_has_bias_no_dropout_maxk_256.cpp
@@ -1,6 +1,6 @@
 
 /*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -8,12 +8,12 @@
  * The file is automatically generated, don't modify!
  */
 
-#include <ck/ck.hpp>
+#include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_grouped_forward.h"
 
 template void run_grouped_forward_causalmask_bias_dropout_dispatch<
-    ck::bhalf_t,
+    ck_tile::bf16_t,
     true,
-    false,
     true,
+    false,
     256>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_has_causalmask_no_bias_has_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bf16_has_causalmask_has_bias_no_dropout_maxk_32.cpp
similarity index 74%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_has_causalmask_no_bias_has_dropout_maxk_32.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bf16_has_causalmask_has_bias_no_dropout_maxk_32.cpp
index 0f0174653..b99a04d7a 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_has_causalmask_no_bias_has_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bf16_has_causalmask_has_bias_no_dropout_maxk_32.cpp
@@ -1,6 +1,6 @@
 
 /*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -8,12 +8,12 @@
  * The file is automatically generated, don't modify!
  */
 
-#include <ck/ck.hpp>
+#include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_grouped_forward.h"
 
 template void run_grouped_forward_causalmask_bias_dropout_dispatch<
-    ck::bhalf_t,
+    ck_tile::bf16_t,
     true,
-    false,
     true,
+    false,
     32>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_has_causalmask_has_bias_no_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bf16_has_causalmask_has_bias_no_dropout_maxk_64.cpp
similarity index 74%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_has_causalmask_has_bias_no_dropout_maxk_64.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bf16_has_causalmask_has_bias_no_dropout_maxk_64.cpp
index 2532a0074..12c1b6a90 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_has_causalmask_has_bias_no_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bf16_has_causalmask_has_bias_no_dropout_maxk_64.cpp
@@ -1,6 +1,6 @@
 
 /*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -8,11 +8,11 @@
  * The file is automatically generated, don't modify!
  */
 
-#include <ck/ck.hpp>
+#include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_grouped_forward.h"
 
 template void run_grouped_forward_causalmask_bias_dropout_dispatch<
-    ck::bhalf_t,
+    ck_tile::bf16_t,
     true,
     true,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_has_causalmask_has_bias_no_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bf16_has_causalmask_no_bias_has_dropout_maxk_128.cpp
similarity index 74%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_has_causalmask_has_bias_no_dropout_maxk_128.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bf16_has_causalmask_no_bias_has_dropout_maxk_128.cpp
index 70657a16c..42a6cea30 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_has_causalmask_has_bias_no_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bf16_has_causalmask_no_bias_has_dropout_maxk_128.cpp
@@ -1,6 +1,6 @@
 
 /*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -8,12 +8,12 @@
  * The file is automatically generated, don't modify!
  */
 
-#include <ck/ck.hpp>
+#include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_grouped_forward.h"
 
 template void run_grouped_forward_causalmask_bias_dropout_dispatch<
-    ck::bhalf_t,
-    true,
+    ck_tile::bf16_t,
     true,
     false,
+    true,
     128>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_no_causalmask_has_bias_has_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bf16_has_causalmask_no_bias_has_dropout_maxk_256.cpp
similarity index 74%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_no_causalmask_has_bias_has_dropout_maxk_256.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bf16_has_causalmask_no_bias_has_dropout_maxk_256.cpp
index ecfe07e63..81d679689 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_no_causalmask_has_bias_has_dropout_maxk_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bf16_has_causalmask_no_bias_has_dropout_maxk_256.cpp
@@ -1,6 +1,6 @@
 
 /*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -8,12 +8,12 @@
  * The file is automatically generated, don't modify!
  */
 
-#include <ck/ck.hpp>
+#include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_grouped_forward.h"
 
 template void run_grouped_forward_causalmask_bias_dropout_dispatch<
-    ck::bhalf_t,
-    false,
+    ck_tile::bf16_t,
     true,
+    false,
     true,
     256>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_no_causalmask_has_bias_has_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bf16_has_causalmask_no_bias_has_dropout_maxk_32.cpp
similarity index 74%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_no_causalmask_has_bias_has_dropout_maxk_32.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bf16_has_causalmask_no_bias_has_dropout_maxk_32.cpp
index 4a1b10da6..e614abdaa 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_no_causalmask_has_bias_has_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bf16_has_causalmask_no_bias_has_dropout_maxk_32.cpp
@@ -1,6 +1,6 @@
 
 /*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -8,12 +8,12 @@
  * The file is automatically generated, don't modify!
  */
 
-#include <ck/ck.hpp>
+#include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_grouped_forward.h"
 
 template void run_grouped_forward_causalmask_bias_dropout_dispatch<
-    ck::bhalf_t,
-    false,
+    ck_tile::bf16_t,
     true,
+    false,
     true,
     32>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_has_causalmask_no_bias_has_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bf16_has_causalmask_no_bias_has_dropout_maxk_64.cpp
similarity index 74%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_has_causalmask_no_bias_has_dropout_maxk_64.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bf16_has_causalmask_no_bias_has_dropout_maxk_64.cpp
index 1ce86be18..339f99255 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_has_causalmask_no_bias_has_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bf16_has_causalmask_no_bias_has_dropout_maxk_64.cpp
@@ -1,6 +1,6 @@
 
 /*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -8,11 +8,11 @@
  * The file is automatically generated, don't modify!
  */
 
-#include <ck/ck.hpp>
+#include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_grouped_forward.h"
 
 template void run_grouped_forward_causalmask_bias_dropout_dispatch<
-    ck::bhalf_t,
+    ck_tile::bf16_t,
     true,
     false,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_no_causalmask_has_bias_no_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bf16_has_causalmask_no_bias_no_dropout_maxk_128.cpp
similarity index 74%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_no_causalmask_has_bias_no_dropout_maxk_128.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bf16_has_causalmask_no_bias_no_dropout_maxk_128.cpp
index 6a65e56bb..64b61826f 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_no_causalmask_has_bias_no_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bf16_has_causalmask_no_bias_no_dropout_maxk_128.cpp
@@ -1,6 +1,6 @@
 
 /*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -8,12 +8,12 @@
  * The file is automatically generated, don't modify!
  */
 
-#include <ck/ck.hpp>
+#include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_grouped_forward.h"
 
 template void run_grouped_forward_causalmask_bias_dropout_dispatch<
-    ck::bhalf_t,
-    false,
+    ck_tile::bf16_t,
     true,
     false,
+    false,
     128>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_no_causalmask_has_bias_no_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bf16_has_causalmask_no_bias_no_dropout_maxk_256.cpp
similarity index 74%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_no_causalmask_has_bias_no_dropout_maxk_256.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bf16_has_causalmask_no_bias_no_dropout_maxk_256.cpp
index 95fc499b1..4983a4ac1 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_no_causalmask_has_bias_no_dropout_maxk_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bf16_has_causalmask_no_bias_no_dropout_maxk_256.cpp
@@ -1,6 +1,6 @@
 
 /*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -8,12 +8,12 @@
  * The file is automatically generated, don't modify!
  */
 
-#include <ck/ck.hpp>
+#include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_grouped_forward.h"
 
 template void run_grouped_forward_causalmask_bias_dropout_dispatch<
-    ck::bhalf_t,
-    false,
+    ck_tile::bf16_t,
     true,
     false,
+    false,
     256>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_no_causalmask_has_bias_no_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bf16_has_causalmask_no_bias_no_dropout_maxk_32.cpp
similarity index 74%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_no_causalmask_has_bias_no_dropout_maxk_32.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bf16_has_causalmask_no_bias_no_dropout_maxk_32.cpp
index e898330a9..fa7649dea 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_no_causalmask_has_bias_no_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bf16_has_causalmask_no_bias_no_dropout_maxk_32.cpp
@@ -1,6 +1,6 @@
 
 /*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -8,12 +8,12 @@
  * The file is automatically generated, don't modify!
  */
 
-#include <ck/ck.hpp>
+#include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_grouped_forward.h"
 
 template void run_grouped_forward_causalmask_bias_dropout_dispatch<
-    ck::bhalf_t,
-    false,
+    ck_tile::bf16_t,
     true,
     false,
+    false,
     32>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_no_causalmask_has_bias_no_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bf16_has_causalmask_no_bias_no_dropout_maxk_64.cpp
similarity index 74%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_no_causalmask_has_bias_no_dropout_maxk_64.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bf16_has_causalmask_no_bias_no_dropout_maxk_64.cpp
index f6ebe8228..3a24474ba 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_no_causalmask_has_bias_no_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bf16_has_causalmask_no_bias_no_dropout_maxk_64.cpp
@@ -1,6 +1,6 @@
 
 /*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -8,12 +8,12 @@
  * The file is automatically generated, don't modify!
  */
 
-#include <ck/ck.hpp>
+#include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_grouped_forward.h"
 
 template void run_grouped_forward_causalmask_bias_dropout_dispatch<
-    ck::bhalf_t,
-    false,
+    ck_tile::bf16_t,
     true,
     false,
+    false,
     64>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_has_causalmask_no_bias_has_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bf16_no_causalmask_has_bias_has_dropout_maxk_128.cpp
similarity index 74%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_has_causalmask_no_bias_has_dropout_maxk_128.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bf16_no_causalmask_has_bias_has_dropout_maxk_128.cpp
index f404b2974..57e895ae9 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_has_causalmask_no_bias_has_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bf16_no_causalmask_has_bias_has_dropout_maxk_128.cpp
@@ -1,6 +1,6 @@
 
 /*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -8,12 +8,12 @@
  * The file is automatically generated, don't modify!
  */
 
-#include <ck/ck.hpp>
+#include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_grouped_forward.h"
 
 template void run_grouped_forward_causalmask_bias_dropout_dispatch<
-    ck::bhalf_t,
-    true,
+    ck_tile::bf16_t,
     false,
     true,
+    true,
     128>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_has_causalmask_has_bias_no_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bf16_no_causalmask_has_bias_has_dropout_maxk_256.cpp
similarity index 74%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_has_causalmask_has_bias_no_dropout_maxk_256.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bf16_no_causalmask_has_bias_has_dropout_maxk_256.cpp
index e62a0cdfc..b975fa34c 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_has_causalmask_has_bias_no_dropout_maxk_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bf16_no_causalmask_has_bias_has_dropout_maxk_256.cpp
@@ -1,6 +1,6 @@
 
 /*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -8,12 +8,12 @@
  * The file is automatically generated, don't modify!
  */
 
-#include <ck/ck.hpp>
+#include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_grouped_forward.h"
 
 template void run_grouped_forward_causalmask_bias_dropout_dispatch<
-    ck::bhalf_t,
+    ck_tile::bf16_t,
+    false,
     true,
     true,
-    false,
     256>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_has_causalmask_has_bias_no_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bf16_no_causalmask_has_bias_has_dropout_maxk_32.cpp
similarity index 74%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_has_causalmask_has_bias_no_dropout_maxk_32.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bf16_no_causalmask_has_bias_has_dropout_maxk_32.cpp
index 1378e8bbe..3be314a73 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_has_causalmask_has_bias_no_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bf16_no_causalmask_has_bias_has_dropout_maxk_32.cpp
@@ -1,6 +1,6 @@
 
 /*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -8,12 +8,12 @@
  * The file is automatically generated, don't modify!
  */
 
-#include <ck/ck.hpp>
+#include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_grouped_forward.h"
 
 template void run_grouped_forward_causalmask_bias_dropout_dispatch<
-    ck::bhalf_t,
+    ck_tile::bf16_t,
+    false,
     true,
     true,
-    false,
     32>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_no_causalmask_has_bias_has_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bf16_no_causalmask_has_bias_has_dropout_maxk_64.cpp
similarity index 74%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_no_causalmask_has_bias_has_dropout_maxk_64.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bf16_no_causalmask_has_bias_has_dropout_maxk_64.cpp
index 301590433..733debc01 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_no_causalmask_has_bias_has_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bf16_no_causalmask_has_bias_has_dropout_maxk_64.cpp
@@ -1,6 +1,6 @@
 
 /*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -8,11 +8,11 @@
  * The file is automatically generated, don't modify!
  */
 
-#include <ck/ck.hpp>
+#include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_grouped_forward.h"
 
 template void run_grouped_forward_causalmask_bias_dropout_dispatch<
-    ck::bhalf_t,
+    ck_tile::bf16_t,
     false,
     true,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_no_causalmask_no_bias_has_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bf16_no_causalmask_has_bias_no_dropout_maxk_128.cpp
similarity index 74%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_no_causalmask_no_bias_has_dropout_maxk_128.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bf16_no_causalmask_has_bias_no_dropout_maxk_128.cpp
index cf15fa390..b762d178c 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_no_causalmask_no_bias_has_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bf16_no_causalmask_has_bias_no_dropout_maxk_128.cpp
@@ -1,6 +1,6 @@
 
 /*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -8,12 +8,12 @@
  * The file is automatically generated, don't modify!
  */
 
-#include <ck/ck.hpp>
+#include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_grouped_forward.h"
 
 template void run_grouped_forward_causalmask_bias_dropout_dispatch<
-    ck::bhalf_t,
-    false,
+    ck_tile::bf16_t,
     false,
     true,
+    false,
     128>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_no_causalmask_no_bias_has_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bf16_no_causalmask_has_bias_no_dropout_maxk_256.cpp
similarity index 74%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_no_causalmask_no_bias_has_dropout_maxk_256.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bf16_no_causalmask_has_bias_no_dropout_maxk_256.cpp
index 5677ead04..7d8648a26 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_no_causalmask_no_bias_has_dropout_maxk_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bf16_no_causalmask_has_bias_no_dropout_maxk_256.cpp
@@ -1,6 +1,6 @@
 
 /*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -8,12 +8,12 @@
  * The file is automatically generated, don't modify!
  */
 
-#include <ck/ck.hpp>
+#include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_grouped_forward.h"
 
 template void run_grouped_forward_causalmask_bias_dropout_dispatch<
-    ck::bhalf_t,
-    false,
+    ck_tile::bf16_t,
     false,
     true,
+    false,
     256>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_has_causalmask_no_bias_no_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bf16_no_causalmask_has_bias_no_dropout_maxk_32.cpp
similarity index 74%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_has_causalmask_no_bias_no_dropout_maxk_32.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bf16_no_causalmask_has_bias_no_dropout_maxk_32.cpp
index 5cd3ef7d9..28a21d93f 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_has_causalmask_no_bias_no_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bf16_no_causalmask_has_bias_no_dropout_maxk_32.cpp
@@ -1,6 +1,6 @@
 
 /*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -8,12 +8,12 @@
  * The file is automatically generated, don't modify!
  */
 
-#include <ck/ck.hpp>
+#include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_grouped_forward.h"
 
 template void run_grouped_forward_causalmask_bias_dropout_dispatch<
-    ck::bhalf_t,
-    true,
+    ck_tile::bf16_t,
     false,
+    true,
     false,
     32>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_no_causalmask_no_bias_has_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bf16_no_causalmask_has_bias_no_dropout_maxk_64.cpp
similarity index 74%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_no_causalmask_no_bias_has_dropout_maxk_64.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bf16_no_causalmask_has_bias_no_dropout_maxk_64.cpp
index 70f34bc04..2fe0721c6 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_no_causalmask_no_bias_has_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bf16_no_causalmask_has_bias_no_dropout_maxk_64.cpp
@@ -1,6 +1,6 @@
 
 /*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -8,12 +8,12 @@
  * The file is automatically generated, don't modify!
  */
 
-#include <ck/ck.hpp>
+#include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_grouped_forward.h"
 
 template void run_grouped_forward_causalmask_bias_dropout_dispatch<
-    ck::bhalf_t,
-    false,
+    ck_tile::bf16_t,
     false,
     true,
+    false,
     64>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_has_causalmask_no_bias_no_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bf16_no_causalmask_no_bias_has_dropout_maxk_128.cpp
similarity index 74%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_has_causalmask_no_bias_no_dropout_maxk_128.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bf16_no_causalmask_no_bias_has_dropout_maxk_128.cpp
index 6ef0db716..159489e9d 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_has_causalmask_no_bias_no_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bf16_no_causalmask_no_bias_has_dropout_maxk_128.cpp
@@ -1,6 +1,6 @@
 
 /*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -8,12 +8,12 @@
  * The file is automatically generated, don't modify!
  */
 
-#include <ck/ck.hpp>
+#include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_grouped_forward.h"
 
 template void run_grouped_forward_causalmask_bias_dropout_dispatch<
-    ck::bhalf_t,
-    true,
+    ck_tile::bf16_t,
     false,
     false,
+    true,
     128>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_has_causalmask_no_bias_no_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bf16_no_causalmask_no_bias_has_dropout_maxk_256.cpp
similarity index 74%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_has_causalmask_no_bias_no_dropout_maxk_256.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bf16_no_causalmask_no_bias_has_dropout_maxk_256.cpp
index 1da195796..507aabe2d 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_has_causalmask_no_bias_no_dropout_maxk_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bf16_no_causalmask_no_bias_has_dropout_maxk_256.cpp
@@ -1,6 +1,6 @@
 
 /*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -8,12 +8,12 @@
  * The file is automatically generated, don't modify!
  */
 
-#include <ck/ck.hpp>
+#include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_grouped_forward.h"
 
 template void run_grouped_forward_causalmask_bias_dropout_dispatch<
-    ck::bhalf_t,
-    true,
+    ck_tile::bf16_t,
     false,
     false,
+    true,
     256>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_no_causalmask_no_bias_has_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bf16_no_causalmask_no_bias_has_dropout_maxk_32.cpp
similarity index 74%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_no_causalmask_no_bias_has_dropout_maxk_32.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bf16_no_causalmask_no_bias_has_dropout_maxk_32.cpp
index 53c4b4f84..db7d8ed17 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_no_causalmask_no_bias_has_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bf16_no_causalmask_no_bias_has_dropout_maxk_32.cpp
@@ -1,6 +1,6 @@
 
 /*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -8,11 +8,11 @@
  * The file is automatically generated, don't modify!
  */
 
-#include <ck/ck.hpp>
+#include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_grouped_forward.h"
 
 template void run_grouped_forward_causalmask_bias_dropout_dispatch<
-    ck::bhalf_t,
+    ck_tile::bf16_t,
     false,
     false,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_has_causalmask_no_bias_no_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bf16_no_causalmask_no_bias_has_dropout_maxk_64.cpp
similarity index 74%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_has_causalmask_no_bias_no_dropout_maxk_64.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bf16_no_causalmask_no_bias_has_dropout_maxk_64.cpp
index 13cae6aea..c95898882 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_has_causalmask_no_bias_no_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bf16_no_causalmask_no_bias_has_dropout_maxk_64.cpp
@@ -1,6 +1,6 @@
 
 /*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -8,12 +8,12 @@
  * The file is automatically generated, don't modify!
  */
 
-#include <ck/ck.hpp>
+#include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_grouped_forward.h"
 
 template void run_grouped_forward_causalmask_bias_dropout_dispatch<
-    ck::bhalf_t,
-    true,
+    ck_tile::bf16_t,
     false,
     false,
+    true,
     64>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_no_causalmask_no_bias_no_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bf16_no_causalmask_no_bias_no_dropout_maxk_128.cpp
similarity index 74%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_no_causalmask_no_bias_no_dropout_maxk_128.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bf16_no_causalmask_no_bias_no_dropout_maxk_128.cpp
index c74bdd1da..4c5395bed 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_no_causalmask_no_bias_no_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bf16_no_causalmask_no_bias_no_dropout_maxk_128.cpp
@@ -1,6 +1,6 @@
 
 /*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -8,11 +8,11 @@
  * The file is automatically generated, don't modify!
  */
 
-#include <ck/ck.hpp>
+#include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_grouped_forward.h"
 
 template void run_grouped_forward_causalmask_bias_dropout_dispatch<
-    ck::bhalf_t,
+    ck_tile::bf16_t,
     false,
     false,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_no_causalmask_no_bias_no_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bf16_no_causalmask_no_bias_no_dropout_maxk_256.cpp
similarity index 74%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_no_causalmask_no_bias_no_dropout_maxk_256.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bf16_no_causalmask_no_bias_no_dropout_maxk_256.cpp
index 79ad692ce..487acd8fa 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_no_causalmask_no_bias_no_dropout_maxk_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bf16_no_causalmask_no_bias_no_dropout_maxk_256.cpp
@@ -1,6 +1,6 @@
 
 /*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -8,11 +8,11 @@
  * The file is automatically generated, don't modify!
  */
 
-#include <ck/ck.hpp>
+#include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_grouped_forward.h"
 
 template void run_grouped_forward_causalmask_bias_dropout_dispatch<
-    ck::bhalf_t,
+    ck_tile::bf16_t,
     false,
     false,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_no_causalmask_no_bias_no_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bf16_no_causalmask_no_bias_no_dropout_maxk_32.cpp
similarity index 74%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_no_causalmask_no_bias_no_dropout_maxk_32.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bf16_no_causalmask_no_bias_no_dropout_maxk_32.cpp
index c44fe5e4e..913d55757 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_no_causalmask_no_bias_no_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bf16_no_causalmask_no_bias_no_dropout_maxk_32.cpp
@@ -1,6 +1,6 @@
 
 /*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -8,11 +8,11 @@
  * The file is automatically generated, don't modify!
  */
 
-#include <ck/ck.hpp>
+#include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_grouped_forward.h"
 
 template void run_grouped_forward_causalmask_bias_dropout_dispatch<
-    ck::bhalf_t,
+    ck_tile::bf16_t,
     false,
     false,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_no_causalmask_no_bias_no_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bf16_no_causalmask_no_bias_no_dropout_maxk_64.cpp
similarity index 74%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_no_causalmask_no_bias_no_dropout_maxk_64.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bf16_no_causalmask_no_bias_no_dropout_maxk_64.cpp
index 151d072b2..137da7aaf 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bp16_no_causalmask_no_bias_no_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bf16_no_causalmask_no_bias_no_dropout_maxk_64.cpp
@@ -1,6 +1,6 @@
 
 /*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -8,11 +8,11 @@
  * The file is automatically generated, don't modify!
  */
 
-#include <ck/ck.hpp>
+#include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_grouped_forward.h"
 
 template void run_grouped_forward_causalmask_bias_dropout_dispatch<
-    ck::bhalf_t,
+    ck_tile::bf16_t,
     false,
     false,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_has_causalmask_has_bias_has_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_has_causalmask_has_bias_has_dropout_maxk_128.cpp
index 3cbe18117..68a75552a 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_has_causalmask_has_bias_has_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_has_causalmask_has_bias_has_dropout_maxk_128.cpp
@@ -1,6 +1,6 @@
 
 /*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -8,11 +8,11 @@
  * The file is automatically generated, don't modify!
  */
 
-#include <ck/ck.hpp>
+#include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_grouped_forward.h"
 
 template void run_grouped_forward_causalmask_bias_dropout_dispatch<
-    ck::half_t,
+    ck_tile::fp16_t,
     true,
     true,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_has_causalmask_has_bias_has_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_has_causalmask_has_bias_has_dropout_maxk_256.cpp
index 65fd33d2d..0603f0d1c 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_has_causalmask_has_bias_has_dropout_maxk_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_has_causalmask_has_bias_has_dropout_maxk_256.cpp
@@ -1,6 +1,6 @@
 
 /*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -8,11 +8,11 @@
  * The file is automatically generated, don't modify!
  */
 
-#include <ck/ck.hpp>
+#include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_grouped_forward.h"
 
 template void run_grouped_forward_causalmask_bias_dropout_dispatch<
-    ck::half_t,
+    ck_tile::fp16_t,
     true,
     true,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_has_causalmask_has_bias_has_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_has_causalmask_has_bias_has_dropout_maxk_32.cpp
index cb9498401..2ba93fcc1 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_has_causalmask_has_bias_has_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_has_causalmask_has_bias_has_dropout_maxk_32.cpp
@@ -1,6 +1,6 @@
 
 /*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -8,11 +8,11 @@
  * The file is automatically generated, don't modify!
  */
 
-#include <ck/ck.hpp>
+#include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_grouped_forward.h"
 
 template void run_grouped_forward_causalmask_bias_dropout_dispatch<
-    ck::half_t,
+    ck_tile::fp16_t,
     true,
     true,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_has_causalmask_has_bias_has_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_has_causalmask_has_bias_has_dropout_maxk_64.cpp
index 7ddd09ca5..4f95470a5 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_has_causalmask_has_bias_has_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_has_causalmask_has_bias_has_dropout_maxk_64.cpp
@@ -1,6 +1,6 @@
 
 /*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -8,11 +8,11 @@
  * The file is automatically generated, don't modify!
  */
 
-#include <ck/ck.hpp>
+#include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_grouped_forward.h"
 
 template void run_grouped_forward_causalmask_bias_dropout_dispatch<
-    ck::half_t,
+    ck_tile::fp16_t,
     true,
     true,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_has_causalmask_has_bias_no_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_has_causalmask_has_bias_no_dropout_maxk_128.cpp
index 1c5e308f6..c12483acf 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_has_causalmask_has_bias_no_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_has_causalmask_has_bias_no_dropout_maxk_128.cpp
@@ -1,6 +1,6 @@
 
 /*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -8,11 +8,11 @@
  * The file is automatically generated, don't modify!
  */
 
-#include <ck/ck.hpp>
+#include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_grouped_forward.h"
 
 template void run_grouped_forward_causalmask_bias_dropout_dispatch<
-    ck::half_t,
+    ck_tile::fp16_t,
     true,
     true,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_has_causalmask_has_bias_no_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_has_causalmask_has_bias_no_dropout_maxk_256.cpp
index 1a674ad11..d2bb3b0f2 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_has_causalmask_has_bias_no_dropout_maxk_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_has_causalmask_has_bias_no_dropout_maxk_256.cpp
@@ -1,6 +1,6 @@
 
 /*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -8,11 +8,11 @@
  * The file is automatically generated, don't modify!
  */
 
-#include <ck/ck.hpp>
+#include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_grouped_forward.h"
 
 template void run_grouped_forward_causalmask_bias_dropout_dispatch<
-    ck::half_t,
+    ck_tile::fp16_t,
     true,
     true,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_has_causalmask_has_bias_no_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_has_causalmask_has_bias_no_dropout_maxk_32.cpp
index 60d724d37..76752b2e6 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_has_causalmask_has_bias_no_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_has_causalmask_has_bias_no_dropout_maxk_32.cpp
@@ -1,6 +1,6 @@
 
 /*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -8,11 +8,11 @@
  * The file is automatically generated, don't modify!
  */
 
-#include <ck/ck.hpp>
+#include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_grouped_forward.h"
 
 template void run_grouped_forward_causalmask_bias_dropout_dispatch<
-    ck::half_t,
+    ck_tile::fp16_t,
     true,
     true,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_has_causalmask_has_bias_no_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_has_causalmask_has_bias_no_dropout_maxk_64.cpp
index 9c1268211..2658965bc 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_has_causalmask_has_bias_no_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_has_causalmask_has_bias_no_dropout_maxk_64.cpp
@@ -1,6 +1,6 @@
 
 /*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -8,11 +8,11 @@
  * The file is automatically generated, don't modify!
  */
 
-#include <ck/ck.hpp>
+#include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_grouped_forward.h"
 
 template void run_grouped_forward_causalmask_bias_dropout_dispatch<
-    ck::half_t,
+    ck_tile::fp16_t,
     true,
     true,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_has_causalmask_no_bias_has_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_has_causalmask_no_bias_has_dropout_maxk_128.cpp
index 0972c088b..3715f9e40 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_has_causalmask_no_bias_has_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_has_causalmask_no_bias_has_dropout_maxk_128.cpp
@@ -1,6 +1,6 @@
 
 /*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -8,11 +8,11 @@
  * The file is automatically generated, don't modify!
  */
 
-#include <ck/ck.hpp>
+#include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_grouped_forward.h"
 
 template void run_grouped_forward_causalmask_bias_dropout_dispatch<
-    ck::half_t,
+    ck_tile::fp16_t,
     true,
     false,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_has_causalmask_no_bias_has_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_has_causalmask_no_bias_has_dropout_maxk_256.cpp
index c7bee6428..df210e2b1 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_has_causalmask_no_bias_has_dropout_maxk_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_has_causalmask_no_bias_has_dropout_maxk_256.cpp
@@ -1,6 +1,6 @@
 
 /*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -8,11 +8,11 @@
  * The file is automatically generated, don't modify!
  */
 
-#include <ck/ck.hpp>
+#include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_grouped_forward.h"
 
 template void run_grouped_forward_causalmask_bias_dropout_dispatch<
-    ck::half_t,
+    ck_tile::fp16_t,
     true,
     false,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_has_causalmask_no_bias_has_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_has_causalmask_no_bias_has_dropout_maxk_32.cpp
index 0dfdb53bc..0acee7775 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_has_causalmask_no_bias_has_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_has_causalmask_no_bias_has_dropout_maxk_32.cpp
@@ -1,6 +1,6 @@
 
 /*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -8,11 +8,11 @@
  * The file is automatically generated, don't modify!
  */
 
-#include <ck/ck.hpp>
+#include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_grouped_forward.h"
 
 template void run_grouped_forward_causalmask_bias_dropout_dispatch<
-    ck::half_t,
+    ck_tile::fp16_t,
     true,
     false,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_has_causalmask_no_bias_has_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_has_causalmask_no_bias_has_dropout_maxk_64.cpp
index bb1cf0032..91e6d0778 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_has_causalmask_no_bias_has_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_has_causalmask_no_bias_has_dropout_maxk_64.cpp
@@ -1,6 +1,6 @@
 
 /*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -8,11 +8,11 @@
  * The file is automatically generated, don't modify!
  */
 
-#include <ck/ck.hpp>
+#include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_grouped_forward.h"
 
 template void run_grouped_forward_causalmask_bias_dropout_dispatch<
-    ck::half_t,
+    ck_tile::fp16_t,
     true,
     false,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_has_causalmask_no_bias_no_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_has_causalmask_no_bias_no_dropout_maxk_128.cpp
index c9d7245e9..4c2b6ca25 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_has_causalmask_no_bias_no_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_has_causalmask_no_bias_no_dropout_maxk_128.cpp
@@ -1,6 +1,6 @@
 
 /*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -8,11 +8,11 @@
  * The file is automatically generated, don't modify!
  */
 
-#include <ck/ck.hpp>
+#include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_grouped_forward.h"
 
 template void run_grouped_forward_causalmask_bias_dropout_dispatch<
-    ck::half_t,
+    ck_tile::fp16_t,
     true,
     false,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_has_causalmask_no_bias_no_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_has_causalmask_no_bias_no_dropout_maxk_256.cpp
index 13cf18b74..5a2df731e 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_has_causalmask_no_bias_no_dropout_maxk_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_has_causalmask_no_bias_no_dropout_maxk_256.cpp
@@ -1,6 +1,6 @@
 
 /*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -8,11 +8,11 @@
  * The file is automatically generated, don't modify!
  */
 
-#include <ck/ck.hpp>
+#include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_grouped_forward.h"
 
 template void run_grouped_forward_causalmask_bias_dropout_dispatch<
-    ck::half_t,
+    ck_tile::fp16_t,
     true,
     false,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_has_causalmask_no_bias_no_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_has_causalmask_no_bias_no_dropout_maxk_32.cpp
index 1d10b1934..2492c47ea 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_has_causalmask_no_bias_no_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_has_causalmask_no_bias_no_dropout_maxk_32.cpp
@@ -1,6 +1,6 @@
 
 /*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -8,11 +8,11 @@
  * The file is automatically generated, don't modify!
  */
 
-#include <ck/ck.hpp>
+#include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_grouped_forward.h"
 
 template void run_grouped_forward_causalmask_bias_dropout_dispatch<
-    ck::half_t,
+    ck_tile::fp16_t,
     true,
     false,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_has_causalmask_no_bias_no_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_has_causalmask_no_bias_no_dropout_maxk_64.cpp
index 239cfdcb7..7cd86ff79 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_has_causalmask_no_bias_no_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_has_causalmask_no_bias_no_dropout_maxk_64.cpp
@@ -1,6 +1,6 @@
 
 /*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -8,11 +8,11 @@
  * The file is automatically generated, don't modify!
  */
 
-#include <ck/ck.hpp>
+#include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_grouped_forward.h"
 
 template void run_grouped_forward_causalmask_bias_dropout_dispatch<
-    ck::half_t,
+    ck_tile::fp16_t,
     true,
     false,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_no_causalmask_has_bias_has_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_no_causalmask_has_bias_has_dropout_maxk_128.cpp
index 0417713d5..892446459 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_no_causalmask_has_bias_has_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_no_causalmask_has_bias_has_dropout_maxk_128.cpp
@@ -1,6 +1,6 @@
 
 /*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -8,11 +8,11 @@
  * The file is automatically generated, don't modify!
  */
 
-#include <ck/ck.hpp>
+#include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_grouped_forward.h"
 
 template void run_grouped_forward_causalmask_bias_dropout_dispatch<
-    ck::half_t,
+    ck_tile::fp16_t,
     false,
     true,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_no_causalmask_has_bias_has_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_no_causalmask_has_bias_has_dropout_maxk_256.cpp
index 917fee0d4..e6914af9d 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_no_causalmask_has_bias_has_dropout_maxk_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_no_causalmask_has_bias_has_dropout_maxk_256.cpp
@@ -1,6 +1,6 @@
 
 /*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -8,11 +8,11 @@
  * The file is automatically generated, don't modify!
  */
 
-#include <ck/ck.hpp>
+#include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_grouped_forward.h"
 
 template void run_grouped_forward_causalmask_bias_dropout_dispatch<
-    ck::half_t,
+    ck_tile::fp16_t,
     false,
     true,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_no_causalmask_has_bias_has_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_no_causalmask_has_bias_has_dropout_maxk_32.cpp
index 45c72d311..3acb390fe 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_no_causalmask_has_bias_has_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_no_causalmask_has_bias_has_dropout_maxk_32.cpp
@@ -1,6 +1,6 @@
 
 /*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -8,11 +8,11 @@
  * The file is automatically generated, don't modify!
  */
 
-#include <ck/ck.hpp>
+#include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_grouped_forward.h"
 
 template void run_grouped_forward_causalmask_bias_dropout_dispatch<
-    ck::half_t,
+    ck_tile::fp16_t,
     false,
     true,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_no_causalmask_has_bias_has_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_no_causalmask_has_bias_has_dropout_maxk_64.cpp
index 11ef78e80..b395d5671 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_no_causalmask_has_bias_has_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_no_causalmask_has_bias_has_dropout_maxk_64.cpp
@@ -1,6 +1,6 @@
 
 /*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -8,11 +8,11 @@
  * The file is automatically generated, don't modify!
  */
 
-#include <ck/ck.hpp>
+#include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_grouped_forward.h"
 
 template void run_grouped_forward_causalmask_bias_dropout_dispatch<
-    ck::half_t,
+    ck_tile::fp16_t,
     false,
     true,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_no_causalmask_has_bias_no_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_no_causalmask_has_bias_no_dropout_maxk_128.cpp
index 9d258a09e..a65035381 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_no_causalmask_has_bias_no_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_no_causalmask_has_bias_no_dropout_maxk_128.cpp
@@ -1,6 +1,6 @@
 
 /*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -8,11 +8,11 @@
  * The file is automatically generated, don't modify!
  */
 
-#include <ck/ck.hpp>
+#include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_grouped_forward.h"
 
 template void run_grouped_forward_causalmask_bias_dropout_dispatch<
-    ck::half_t,
+    ck_tile::fp16_t,
     false,
     true,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_no_causalmask_has_bias_no_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_no_causalmask_has_bias_no_dropout_maxk_256.cpp
index 63c04b163..547fef8b1 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_no_causalmask_has_bias_no_dropout_maxk_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_no_causalmask_has_bias_no_dropout_maxk_256.cpp
@@ -1,6 +1,6 @@
 
 /*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -8,11 +8,11 @@
  * The file is automatically generated, don't modify!
  */
 
-#include <ck/ck.hpp>
+#include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_grouped_forward.h"
 
 template void run_grouped_forward_causalmask_bias_dropout_dispatch<
-    ck::half_t,
+    ck_tile::fp16_t,
     false,
     true,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_no_causalmask_has_bias_no_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_no_causalmask_has_bias_no_dropout_maxk_32.cpp
index 38c0fdfb7..8ec916502 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_no_causalmask_has_bias_no_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_no_causalmask_has_bias_no_dropout_maxk_32.cpp
@@ -1,6 +1,6 @@
 
 /*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -8,11 +8,11 @@
  * The file is automatically generated, don't modify!
  */
 
-#include <ck/ck.hpp>
+#include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_grouped_forward.h"
 
 template void run_grouped_forward_causalmask_bias_dropout_dispatch<
-    ck::half_t,
+    ck_tile::fp16_t,
     false,
     true,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_no_causalmask_has_bias_no_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_no_causalmask_has_bias_no_dropout_maxk_64.cpp
index 7620830c3..1f3195d6e 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_no_causalmask_has_bias_no_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_no_causalmask_has_bias_no_dropout_maxk_64.cpp
@@ -1,6 +1,6 @@
 
 /*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -8,11 +8,11 @@
  * The file is automatically generated, don't modify!
  */
 
-#include <ck/ck.hpp>
+#include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_grouped_forward.h"
 
 template void run_grouped_forward_causalmask_bias_dropout_dispatch<
-    ck::half_t,
+    ck_tile::fp16_t,
     false,
     true,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_no_causalmask_no_bias_has_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_no_causalmask_no_bias_has_dropout_maxk_128.cpp
index ca03aa0a8..1498a7d09 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_no_causalmask_no_bias_has_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_no_causalmask_no_bias_has_dropout_maxk_128.cpp
@@ -1,6 +1,6 @@
 
 /*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -8,11 +8,11 @@
  * The file is automatically generated, don't modify!
  */
 
-#include <ck/ck.hpp>
+#include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_grouped_forward.h"
 
 template void run_grouped_forward_causalmask_bias_dropout_dispatch<
-    ck::half_t,
+    ck_tile::fp16_t,
     false,
     false,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_no_causalmask_no_bias_has_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_no_causalmask_no_bias_has_dropout_maxk_256.cpp
index 0f8d631d1..858d55e00 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_no_causalmask_no_bias_has_dropout_maxk_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_no_causalmask_no_bias_has_dropout_maxk_256.cpp
@@ -1,6 +1,6 @@
 
 /*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -8,11 +8,11 @@
  * The file is automatically generated, don't modify!
  */
 
-#include <ck/ck.hpp>
+#include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_grouped_forward.h"
 
 template void run_grouped_forward_causalmask_bias_dropout_dispatch<
-    ck::half_t,
+    ck_tile::fp16_t,
     false,
     false,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_no_causalmask_no_bias_has_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_no_causalmask_no_bias_has_dropout_maxk_32.cpp
index 9aca2c81e..72b4db4f8 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_no_causalmask_no_bias_has_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_no_causalmask_no_bias_has_dropout_maxk_32.cpp
@@ -1,6 +1,6 @@
 
 /*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -8,11 +8,11 @@
  * The file is automatically generated, don't modify!
  */
 
-#include <ck/ck.hpp>
+#include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_grouped_forward.h"
 
 template void run_grouped_forward_causalmask_bias_dropout_dispatch<
-    ck::half_t,
+    ck_tile::fp16_t,
     false,
     false,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_no_causalmask_no_bias_has_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_no_causalmask_no_bias_has_dropout_maxk_64.cpp
index f61fe5eeb..237cbc71c 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_no_causalmask_no_bias_has_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_no_causalmask_no_bias_has_dropout_maxk_64.cpp
@@ -1,6 +1,6 @@
 
 /*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -8,11 +8,11 @@
  * The file is automatically generated, don't modify!
  */
 
-#include <ck/ck.hpp>
+#include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_grouped_forward.h"
 
 template void run_grouped_forward_causalmask_bias_dropout_dispatch<
-    ck::half_t,
+    ck_tile::fp16_t,
     false,
     false,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_no_causalmask_no_bias_no_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_no_causalmask_no_bias_no_dropout_maxk_128.cpp
index a6523f6fd..a40d4a3a3 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_no_causalmask_no_bias_no_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_no_causalmask_no_bias_no_dropout_maxk_128.cpp
@@ -1,6 +1,6 @@
 
 /*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -8,11 +8,11 @@
  * The file is automatically generated, don't modify!
  */
 
-#include <ck/ck.hpp>
+#include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_grouped_forward.h"
 
 template void run_grouped_forward_causalmask_bias_dropout_dispatch<
-    ck::half_t,
+    ck_tile::fp16_t,
     false,
     false,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_no_causalmask_no_bias_no_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_no_causalmask_no_bias_no_dropout_maxk_256.cpp
index c45de9a85..9fb5462a0 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_no_causalmask_no_bias_no_dropout_maxk_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_no_causalmask_no_bias_no_dropout_maxk_256.cpp
@@ -1,6 +1,6 @@
 
 /*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -8,11 +8,11 @@
  * The file is automatically generated, don't modify!
  */
 
-#include <ck/ck.hpp>
+#include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_grouped_forward.h"
 
 template void run_grouped_forward_causalmask_bias_dropout_dispatch<
-    ck::half_t,
+    ck_tile::fp16_t,
     false,
     false,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_no_causalmask_no_bias_no_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_no_causalmask_no_bias_no_dropout_maxk_32.cpp
index aa482cddc..832ee6f82 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_no_causalmask_no_bias_no_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_no_causalmask_no_bias_no_dropout_maxk_32.cpp
@@ -1,6 +1,6 @@
 
 /*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -8,11 +8,11 @@
  * The file is automatically generated, don't modify!
  */
 
-#include <ck/ck.hpp>
+#include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_grouped_forward.h"
 
 template void run_grouped_forward_causalmask_bias_dropout_dispatch<
-    ck::half_t,
+    ck_tile::fp16_t,
     false,
     false,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_no_causalmask_no_bias_no_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_no_causalmask_no_bias_no_dropout_maxk_64.cpp
index 32c319a50..beaaaf75a 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_no_causalmask_no_bias_no_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_no_causalmask_no_bias_no_dropout_maxk_64.cpp
@@ -1,6 +1,6 @@
 
 /*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -8,11 +8,11 @@
  * The file is automatically generated, don't modify!
  */
 
-#include <ck/ck.hpp>
+#include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_grouped_forward.h"
 
 template void run_grouped_forward_causalmask_bias_dropout_dispatch<
-    ck::half_t,
+    ck_tile::fp16_t,
     false,
     false,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_has_causalmask_has_bias_has_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bf16_has_causalmask_has_bias_has_dropout_maxk_128.cpp
similarity index 74%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_has_causalmask_has_bias_has_dropout_maxk_128.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bf16_has_causalmask_has_bias_has_dropout_maxk_128.cpp
index 018cb72be..23927f896 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_has_causalmask_has_bias_has_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bf16_has_causalmask_has_bias_has_dropout_maxk_128.cpp
@@ -1,6 +1,6 @@
 
 /*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -8,11 +8,11 @@
  * The file is automatically generated, don't modify!
  */
 
-#include <ck/ck.hpp>
+#include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_grouped_infer.h"
 
 template void run_grouped_infer_causalmask_bias_dropout_dispatch<
-    ck::bhalf_t,
+    ck_tile::bf16_t,
     true,
     true,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_has_causalmask_has_bias_has_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bf16_has_causalmask_has_bias_has_dropout_maxk_256.cpp
similarity index 74%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_has_causalmask_has_bias_has_dropout_maxk_256.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bf16_has_causalmask_has_bias_has_dropout_maxk_256.cpp
index faabed60a..7e0495247 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_has_causalmask_has_bias_has_dropout_maxk_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bf16_has_causalmask_has_bias_has_dropout_maxk_256.cpp
@@ -1,6 +1,6 @@
 
 /*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -8,11 +8,11 @@
  * The file is automatically generated, don't modify!
  */
 
-#include <ck/ck.hpp>
+#include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_grouped_infer.h"
 
 template void run_grouped_infer_causalmask_bias_dropout_dispatch<
-    ck::bhalf_t,
+    ck_tile::bf16_t,
     true,
     true,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_has_causalmask_has_bias_has_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bf16_has_causalmask_has_bias_has_dropout_maxk_32.cpp
similarity index 74%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_has_causalmask_has_bias_has_dropout_maxk_32.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bf16_has_causalmask_has_bias_has_dropout_maxk_32.cpp
index c920dff22..59224bc65 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_has_causalmask_has_bias_has_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bf16_has_causalmask_has_bias_has_dropout_maxk_32.cpp
@@ -1,6 +1,6 @@
 
 /*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -8,11 +8,11 @@
  * The file is automatically generated, don't modify!
  */
 
-#include <ck/ck.hpp>
+#include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_grouped_infer.h"
 
 template void run_grouped_infer_causalmask_bias_dropout_dispatch<
-    ck::bhalf_t,
+    ck_tile::bf16_t,
     true,
     true,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_has_causalmask_has_bias_has_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bf16_has_causalmask_has_bias_has_dropout_maxk_64.cpp
similarity index 74%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_has_causalmask_has_bias_has_dropout_maxk_64.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bf16_has_causalmask_has_bias_has_dropout_maxk_64.cpp
index 4e8d812c8..2917ab5d0 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_has_causalmask_has_bias_has_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bf16_has_causalmask_has_bias_has_dropout_maxk_64.cpp
@@ -1,6 +1,6 @@
 
 /*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -8,11 +8,11 @@
  * The file is automatically generated, don't modify!
  */
 
-#include <ck/ck.hpp>
+#include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_grouped_infer.h"
 
 template void run_grouped_infer_causalmask_bias_dropout_dispatch<
-    ck::bhalf_t,
+    ck_tile::bf16_t,
     true,
     true,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_has_causalmask_has_bias_no_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bf16_has_causalmask_has_bias_no_dropout_maxk_128.cpp
similarity index 74%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_has_causalmask_has_bias_no_dropout_maxk_128.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bf16_has_causalmask_has_bias_no_dropout_maxk_128.cpp
index 06e096f9d..ea651303e 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_has_causalmask_has_bias_no_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bf16_has_causalmask_has_bias_no_dropout_maxk_128.cpp
@@ -1,6 +1,6 @@
 
 /*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -8,11 +8,11 @@
  * The file is automatically generated, don't modify!
  */
 
-#include <ck/ck.hpp>
+#include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_grouped_infer.h"
 
 template void run_grouped_infer_causalmask_bias_dropout_dispatch<
-    ck::bhalf_t,
+    ck_tile::bf16_t,
     true,
     true,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_no_causalmask_has_bias_has_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bf16_has_causalmask_has_bias_no_dropout_maxk_256.cpp
similarity index 74%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_no_causalmask_has_bias_has_dropout_maxk_256.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bf16_has_causalmask_has_bias_no_dropout_maxk_256.cpp
index f2611fd2c..f1b6c2762 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_no_causalmask_has_bias_has_dropout_maxk_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bf16_has_causalmask_has_bias_no_dropout_maxk_256.cpp
@@ -1,6 +1,6 @@
 
 /*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -8,12 +8,12 @@
  * The file is automatically generated, don't modify!
  */
 
-#include <ck/ck.hpp>
+#include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_grouped_infer.h"
 
 template void run_grouped_infer_causalmask_bias_dropout_dispatch<
-    ck::bhalf_t,
-    false,
+    ck_tile::bf16_t,
     true,
     true,
+    false,
     256>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_has_causalmask_no_bias_has_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bf16_has_causalmask_has_bias_no_dropout_maxk_32.cpp
similarity index 74%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_has_causalmask_no_bias_has_dropout_maxk_32.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bf16_has_causalmask_has_bias_no_dropout_maxk_32.cpp
index 3b5614f0e..631b007f7 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_has_causalmask_no_bias_has_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bf16_has_causalmask_has_bias_no_dropout_maxk_32.cpp
@@ -1,6 +1,6 @@
 
 /*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -8,12 +8,12 @@
  * The file is automatically generated, don't modify!
  */
 
-#include <ck/ck.hpp>
+#include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_grouped_infer.h"
 
 template void run_grouped_infer_causalmask_bias_dropout_dispatch<
-    ck::bhalf_t,
+    ck_tile::bf16_t,
     true,
-    false,
     true,
+    false,
     32>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_has_causalmask_has_bias_no_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bf16_has_causalmask_has_bias_no_dropout_maxk_64.cpp
similarity index 74%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_has_causalmask_has_bias_no_dropout_maxk_64.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bf16_has_causalmask_has_bias_no_dropout_maxk_64.cpp
index 93211cdd1..6bf62e163 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_has_causalmask_has_bias_no_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bf16_has_causalmask_has_bias_no_dropout_maxk_64.cpp
@@ -1,6 +1,6 @@
 
 /*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -8,11 +8,11 @@
  * The file is automatically generated, don't modify!
  */
 
-#include <ck/ck.hpp>
+#include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_grouped_infer.h"
 
 template void run_grouped_infer_causalmask_bias_dropout_dispatch<
-    ck::bhalf_t,
+    ck_tile::bf16_t,
     true,
     true,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_has_causalmask_no_bias_has_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bf16_has_causalmask_no_bias_has_dropout_maxk_128.cpp
similarity index 74%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_has_causalmask_no_bias_has_dropout_maxk_128.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bf16_has_causalmask_no_bias_has_dropout_maxk_128.cpp
index e3a658748..e9d80dcba 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_has_causalmask_no_bias_has_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bf16_has_causalmask_no_bias_has_dropout_maxk_128.cpp
@@ -1,6 +1,6 @@
 
 /*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -8,11 +8,11 @@
  * The file is automatically generated, don't modify!
  */
 
-#include <ck/ck.hpp>
+#include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_grouped_infer.h"
 
 template void run_grouped_infer_causalmask_bias_dropout_dispatch<
-    ck::bhalf_t,
+    ck_tile::bf16_t,
     true,
     false,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_has_causalmask_no_bias_has_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bf16_has_causalmask_no_bias_has_dropout_maxk_256.cpp
similarity index 74%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_has_causalmask_no_bias_has_dropout_maxk_256.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bf16_has_causalmask_no_bias_has_dropout_maxk_256.cpp
index 3fa6d85bd..629111cc2 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_has_causalmask_no_bias_has_dropout_maxk_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bf16_has_causalmask_no_bias_has_dropout_maxk_256.cpp
@@ -1,6 +1,6 @@
 
 /*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -8,11 +8,11 @@
  * The file is automatically generated, don't modify!
  */
 
-#include <ck/ck.hpp>
+#include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_grouped_infer.h"
 
 template void run_grouped_infer_causalmask_bias_dropout_dispatch<
-    ck::bhalf_t,
+    ck_tile::bf16_t,
     true,
     false,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_no_causalmask_has_bias_has_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bf16_has_causalmask_no_bias_has_dropout_maxk_32.cpp
similarity index 74%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_no_causalmask_has_bias_has_dropout_maxk_32.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bf16_has_causalmask_no_bias_has_dropout_maxk_32.cpp
index 4909cfa45..03a582a51 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_no_causalmask_has_bias_has_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bf16_has_causalmask_no_bias_has_dropout_maxk_32.cpp
@@ -1,6 +1,6 @@
 
 /*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -8,12 +8,12 @@
  * The file is automatically generated, don't modify!
  */
 
-#include <ck/ck.hpp>
+#include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_grouped_infer.h"
 
 template void run_grouped_infer_causalmask_bias_dropout_dispatch<
-    ck::bhalf_t,
-    false,
+    ck_tile::bf16_t,
     true,
+    false,
     true,
     32>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_has_causalmask_no_bias_has_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bf16_has_causalmask_no_bias_has_dropout_maxk_64.cpp
similarity index 74%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_has_causalmask_no_bias_has_dropout_maxk_64.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bf16_has_causalmask_no_bias_has_dropout_maxk_64.cpp
index b33221834..8866842c5 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_has_causalmask_no_bias_has_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bf16_has_causalmask_no_bias_has_dropout_maxk_64.cpp
@@ -1,6 +1,6 @@
 
 /*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -8,11 +8,11 @@
  * The file is automatically generated, don't modify!
  */
 
-#include <ck/ck.hpp>
+#include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_grouped_infer.h"
 
 template void run_grouped_infer_causalmask_bias_dropout_dispatch<
-    ck::bhalf_t,
+    ck_tile::bf16_t,
     true,
     false,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_no_causalmask_has_bias_no_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bf16_has_causalmask_no_bias_no_dropout_maxk_128.cpp
similarity index 74%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_no_causalmask_has_bias_no_dropout_maxk_128.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bf16_has_causalmask_no_bias_no_dropout_maxk_128.cpp
index ad7ce669e..0fc722d97 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_no_causalmask_has_bias_no_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bf16_has_causalmask_no_bias_no_dropout_maxk_128.cpp
@@ -1,6 +1,6 @@
 
 /*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -8,12 +8,12 @@
  * The file is automatically generated, don't modify!
  */
 
-#include <ck/ck.hpp>
+#include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_grouped_infer.h"
 
 template void run_grouped_infer_causalmask_bias_dropout_dispatch<
-    ck::bhalf_t,
-    false,
+    ck_tile::bf16_t,
     true,
     false,
+    false,
     128>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_no_causalmask_has_bias_no_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bf16_has_causalmask_no_bias_no_dropout_maxk_256.cpp
similarity index 74%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_no_causalmask_has_bias_no_dropout_maxk_256.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bf16_has_causalmask_no_bias_no_dropout_maxk_256.cpp
index 83e19ecfc..d7654bcdb 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_no_causalmask_has_bias_no_dropout_maxk_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bf16_has_causalmask_no_bias_no_dropout_maxk_256.cpp
@@ -1,6 +1,6 @@
 
 /*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -8,12 +8,12 @@
  * The file is automatically generated, don't modify!
  */
 
-#include <ck/ck.hpp>
+#include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_grouped_infer.h"
 
 template void run_grouped_infer_causalmask_bias_dropout_dispatch<
-    ck::bhalf_t,
-    false,
+    ck_tile::bf16_t,
     true,
     false,
+    false,
     256>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_no_causalmask_has_bias_no_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bf16_has_causalmask_no_bias_no_dropout_maxk_32.cpp
similarity index 74%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_no_causalmask_has_bias_no_dropout_maxk_32.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bf16_has_causalmask_no_bias_no_dropout_maxk_32.cpp
index a1c40a7f2..aa8b341c5 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_no_causalmask_has_bias_no_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bf16_has_causalmask_no_bias_no_dropout_maxk_32.cpp
@@ -1,6 +1,6 @@
 
 /*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -8,12 +8,12 @@
  * The file is automatically generated, don't modify!
  */
 
-#include <ck/ck.hpp>
+#include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_grouped_infer.h"
 
 template void run_grouped_infer_causalmask_bias_dropout_dispatch<
-    ck::bhalf_t,
-    false,
+    ck_tile::bf16_t,
     true,
     false,
+    false,
     32>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_no_causalmask_has_bias_no_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bf16_has_causalmask_no_bias_no_dropout_maxk_64.cpp
similarity index 74%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_no_causalmask_has_bias_no_dropout_maxk_64.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bf16_has_causalmask_no_bias_no_dropout_maxk_64.cpp
index 37b634b55..14d6da36b 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_no_causalmask_has_bias_no_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bf16_has_causalmask_no_bias_no_dropout_maxk_64.cpp
@@ -1,6 +1,6 @@
 
 /*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -8,12 +8,12 @@
  * The file is automatically generated, don't modify!
  */
 
-#include <ck/ck.hpp>
+#include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_grouped_infer.h"
 
 template void run_grouped_infer_causalmask_bias_dropout_dispatch<
-    ck::bhalf_t,
-    false,
+    ck_tile::bf16_t,
     true,
     false,
+    false,
     64>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_no_causalmask_has_bias_has_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bf16_no_causalmask_has_bias_has_dropout_maxk_128.cpp
similarity index 74%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_no_causalmask_has_bias_has_dropout_maxk_128.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bf16_no_causalmask_has_bias_has_dropout_maxk_128.cpp
index 055c3ddf6..2f4a65c57 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_no_causalmask_has_bias_has_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bf16_no_causalmask_has_bias_has_dropout_maxk_128.cpp
@@ -1,6 +1,6 @@
 
 /*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -8,11 +8,11 @@
  * The file is automatically generated, don't modify!
  */
 
-#include <ck/ck.hpp>
+#include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_grouped_infer.h"
 
 template void run_grouped_infer_causalmask_bias_dropout_dispatch<
-    ck::bhalf_t,
+    ck_tile::bf16_t,
     false,
     true,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_has_causalmask_has_bias_no_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bf16_no_causalmask_has_bias_has_dropout_maxk_256.cpp
similarity index 74%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_has_causalmask_has_bias_no_dropout_maxk_256.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bf16_no_causalmask_has_bias_has_dropout_maxk_256.cpp
index bdee87bc7..f7f7bde51 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_has_causalmask_has_bias_no_dropout_maxk_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bf16_no_causalmask_has_bias_has_dropout_maxk_256.cpp
@@ -1,6 +1,6 @@
 
 /*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -8,12 +8,12 @@
  * The file is automatically generated, don't modify!
  */
 
-#include <ck/ck.hpp>
+#include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_grouped_infer.h"
 
 template void run_grouped_infer_causalmask_bias_dropout_dispatch<
-    ck::bhalf_t,
+    ck_tile::bf16_t,
+    false,
     true,
     true,
-    false,
     256>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_has_causalmask_has_bias_no_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bf16_no_causalmask_has_bias_has_dropout_maxk_32.cpp
similarity index 74%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_has_causalmask_has_bias_no_dropout_maxk_32.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bf16_no_causalmask_has_bias_has_dropout_maxk_32.cpp
index 489521a75..3833d791c 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_has_causalmask_has_bias_no_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bf16_no_causalmask_has_bias_has_dropout_maxk_32.cpp
@@ -1,6 +1,6 @@
 
 /*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -8,12 +8,12 @@
  * The file is automatically generated, don't modify!
  */
 
-#include <ck/ck.hpp>
+#include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_grouped_infer.h"
 
 template void run_grouped_infer_causalmask_bias_dropout_dispatch<
-    ck::bhalf_t,
+    ck_tile::bf16_t,
+    false,
     true,
     true,
-    false,
     32>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_no_causalmask_has_bias_has_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bf16_no_causalmask_has_bias_has_dropout_maxk_64.cpp
similarity index 74%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_no_causalmask_has_bias_has_dropout_maxk_64.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bf16_no_causalmask_has_bias_has_dropout_maxk_64.cpp
index 4705a9d4e..b2c7d4be1 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_no_causalmask_has_bias_has_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bf16_no_causalmask_has_bias_has_dropout_maxk_64.cpp
@@ -1,6 +1,6 @@
 
 /*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -8,11 +8,11 @@
  * The file is automatically generated, don't modify!
  */
 
-#include <ck/ck.hpp>
+#include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_grouped_infer.h"
 
 template void run_grouped_infer_causalmask_bias_dropout_dispatch<
-    ck::bhalf_t,
+    ck_tile::bf16_t,
     false,
     true,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_no_causalmask_no_bias_has_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bf16_no_causalmask_has_bias_no_dropout_maxk_128.cpp
similarity index 74%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_no_causalmask_no_bias_has_dropout_maxk_128.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bf16_no_causalmask_has_bias_no_dropout_maxk_128.cpp
index 85f34fba8..ab22cec47 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_no_causalmask_no_bias_has_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bf16_no_causalmask_has_bias_no_dropout_maxk_128.cpp
@@ -1,6 +1,6 @@
 
 /*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -8,12 +8,12 @@
  * The file is automatically generated, don't modify!
  */
 
-#include <ck/ck.hpp>
+#include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_grouped_infer.h"
 
 template void run_grouped_infer_causalmask_bias_dropout_dispatch<
-    ck::bhalf_t,
-    false,
+    ck_tile::bf16_t,
     false,
     true,
+    false,
     128>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_no_causalmask_no_bias_has_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bf16_no_causalmask_has_bias_no_dropout_maxk_256.cpp
similarity index 74%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_no_causalmask_no_bias_has_dropout_maxk_256.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bf16_no_causalmask_has_bias_no_dropout_maxk_256.cpp
index 69835203f..198837822 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_no_causalmask_no_bias_has_dropout_maxk_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bf16_no_causalmask_has_bias_no_dropout_maxk_256.cpp
@@ -1,6 +1,6 @@
 
 /*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -8,12 +8,12 @@
  * The file is automatically generated, don't modify!
  */
 
-#include <ck/ck.hpp>
+#include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_grouped_infer.h"
 
 template void run_grouped_infer_causalmask_bias_dropout_dispatch<
-    ck::bhalf_t,
-    false,
+    ck_tile::bf16_t,
     false,
     true,
+    false,
     256>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_no_causalmask_no_bias_has_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bf16_no_causalmask_has_bias_no_dropout_maxk_32.cpp
similarity index 74%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_no_causalmask_no_bias_has_dropout_maxk_32.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bf16_no_causalmask_has_bias_no_dropout_maxk_32.cpp
index 7fa077699..45d86f18a 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_no_causalmask_no_bias_has_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bf16_no_causalmask_has_bias_no_dropout_maxk_32.cpp
@@ -1,6 +1,6 @@
 
 /*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -8,12 +8,12 @@
  * The file is automatically generated, don't modify!
  */
 
-#include <ck/ck.hpp>
+#include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_grouped_infer.h"
 
 template void run_grouped_infer_causalmask_bias_dropout_dispatch<
-    ck::bhalf_t,
-    false,
+    ck_tile::bf16_t,
     false,
     true,
+    false,
     32>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_no_causalmask_no_bias_has_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bf16_no_causalmask_has_bias_no_dropout_maxk_64.cpp
similarity index 74%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_no_causalmask_no_bias_has_dropout_maxk_64.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bf16_no_causalmask_has_bias_no_dropout_maxk_64.cpp
index dc34c1a04..be4cceb0c 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_no_causalmask_no_bias_has_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bf16_no_causalmask_has_bias_no_dropout_maxk_64.cpp
@@ -1,6 +1,6 @@
 
 /*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -8,12 +8,12 @@
  * The file is automatically generated, don't modify!
  */
 
-#include <ck/ck.hpp>
+#include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_grouped_infer.h"
 
 template void run_grouped_infer_causalmask_bias_dropout_dispatch<
-    ck::bhalf_t,
-    false,
+    ck_tile::bf16_t,
     false,
     true,
+    false,
     64>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_has_causalmask_no_bias_no_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bf16_no_causalmask_no_bias_has_dropout_maxk_128.cpp
similarity index 74%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_has_causalmask_no_bias_no_dropout_maxk_128.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bf16_no_causalmask_no_bias_has_dropout_maxk_128.cpp
index 0af311aa8..af14ace8f 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_has_causalmask_no_bias_no_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bf16_no_causalmask_no_bias_has_dropout_maxk_128.cpp
@@ -1,6 +1,6 @@
 
 /*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -8,12 +8,12 @@
  * The file is automatically generated, don't modify!
  */
 
-#include <ck/ck.hpp>
+#include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_grouped_infer.h"
 
 template void run_grouped_infer_causalmask_bias_dropout_dispatch<
-    ck::bhalf_t,
-    true,
+    ck_tile::bf16_t,
     false,
     false,
+    true,
     128>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_has_causalmask_no_bias_no_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bf16_no_causalmask_no_bias_has_dropout_maxk_256.cpp
similarity index 74%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_has_causalmask_no_bias_no_dropout_maxk_256.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bf16_no_causalmask_no_bias_has_dropout_maxk_256.cpp
index d68e89d55..00fbb2563 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_has_causalmask_no_bias_no_dropout_maxk_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bf16_no_causalmask_no_bias_has_dropout_maxk_256.cpp
@@ -1,6 +1,6 @@
 
 /*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -8,12 +8,12 @@
  * The file is automatically generated, don't modify!
  */
 
-#include <ck/ck.hpp>
+#include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_grouped_infer.h"
 
 template void run_grouped_infer_causalmask_bias_dropout_dispatch<
-    ck::bhalf_t,
-    true,
+    ck_tile::bf16_t,
     false,
     false,
+    true,
     256>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_has_causalmask_no_bias_no_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bf16_no_causalmask_no_bias_has_dropout_maxk_32.cpp
similarity index 74%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_has_causalmask_no_bias_no_dropout_maxk_32.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bf16_no_causalmask_no_bias_has_dropout_maxk_32.cpp
index ea765be5e..e7c4b053e 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_has_causalmask_no_bias_no_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bf16_no_causalmask_no_bias_has_dropout_maxk_32.cpp
@@ -1,6 +1,6 @@
 
 /*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -8,12 +8,12 @@
  * The file is automatically generated, don't modify!
  */
 
-#include <ck/ck.hpp>
+#include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_grouped_infer.h"
 
 template void run_grouped_infer_causalmask_bias_dropout_dispatch<
-    ck::bhalf_t,
-    true,
+    ck_tile::bf16_t,
     false,
     false,
+    true,
     32>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_has_causalmask_no_bias_no_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bf16_no_causalmask_no_bias_has_dropout_maxk_64.cpp
similarity index 74%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_has_causalmask_no_bias_no_dropout_maxk_64.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bf16_no_causalmask_no_bias_has_dropout_maxk_64.cpp
index ee1dbceea..c9d263f8f 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_has_causalmask_no_bias_no_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bf16_no_causalmask_no_bias_has_dropout_maxk_64.cpp
@@ -1,6 +1,6 @@
 
 /*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -8,12 +8,12 @@
  * The file is automatically generated, don't modify!
  */
 
-#include <ck/ck.hpp>
+#include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_grouped_infer.h"
 
 template void run_grouped_infer_causalmask_bias_dropout_dispatch<
-    ck::bhalf_t,
-    true,
+    ck_tile::bf16_t,
     false,
     false,
+    true,
     64>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_no_causalmask_no_bias_no_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bf16_no_causalmask_no_bias_no_dropout_maxk_128.cpp
similarity index 74%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_no_causalmask_no_bias_no_dropout_maxk_128.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bf16_no_causalmask_no_bias_no_dropout_maxk_128.cpp
index 5d75d9437..da5ce48b5 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_no_causalmask_no_bias_no_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bf16_no_causalmask_no_bias_no_dropout_maxk_128.cpp
@@ -1,6 +1,6 @@
 
 /*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -8,11 +8,11 @@
  * The file is automatically generated, don't modify!
  */
 
-#include <ck/ck.hpp>
+#include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_grouped_infer.h"
 
 template void run_grouped_infer_causalmask_bias_dropout_dispatch<
-    ck::bhalf_t,
+    ck_tile::bf16_t,
     false,
     false,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_no_causalmask_no_bias_no_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bf16_no_causalmask_no_bias_no_dropout_maxk_256.cpp
similarity index 74%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_no_causalmask_no_bias_no_dropout_maxk_256.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bf16_no_causalmask_no_bias_no_dropout_maxk_256.cpp
index 9af2dd0ac..4cac3c509 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_no_causalmask_no_bias_no_dropout_maxk_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bf16_no_causalmask_no_bias_no_dropout_maxk_256.cpp
@@ -1,6 +1,6 @@
 
 /*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -8,11 +8,11 @@
  * The file is automatically generated, don't modify!
  */
 
-#include <ck/ck.hpp>
+#include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_grouped_infer.h"
 
 template void run_grouped_infer_causalmask_bias_dropout_dispatch<
-    ck::bhalf_t,
+    ck_tile::bf16_t,
     false,
     false,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_no_causalmask_no_bias_no_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bf16_no_causalmask_no_bias_no_dropout_maxk_32.cpp
similarity index 74%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_no_causalmask_no_bias_no_dropout_maxk_32.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bf16_no_causalmask_no_bias_no_dropout_maxk_32.cpp
index 92bc89ea5..eacbac287 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_no_causalmask_no_bias_no_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bf16_no_causalmask_no_bias_no_dropout_maxk_32.cpp
@@ -1,6 +1,6 @@
 
 /*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -8,11 +8,11 @@
  * The file is automatically generated, don't modify!
  */
 
-#include <ck/ck.hpp>
+#include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_grouped_infer.h"
 
 template void run_grouped_infer_causalmask_bias_dropout_dispatch<
-    ck::bhalf_t,
+    ck_tile::bf16_t,
     false,
     false,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_no_causalmask_no_bias_no_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bf16_no_causalmask_no_bias_no_dropout_maxk_64.cpp
similarity index 74%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_no_causalmask_no_bias_no_dropout_maxk_64.cpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bf16_no_causalmask_no_bias_no_dropout_maxk_64.cpp
index a2b3fd2a3..e33f52717 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bp16_no_causalmask_no_bias_no_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bf16_no_causalmask_no_bias_no_dropout_maxk_64.cpp
@@ -1,6 +1,6 @@
 
 /*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -8,11 +8,11 @@
  * The file is automatically generated, don't modify!
  */
 
-#include <ck/ck.hpp>
+#include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_grouped_infer.h"
 
 template void run_grouped_infer_causalmask_bias_dropout_dispatch<
-    ck::bhalf_t,
+    ck_tile::bf16_t,
     false,
     false,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_has_causalmask_has_bias_has_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_has_causalmask_has_bias_has_dropout_maxk_128.cpp
index 916786bff..c604204d2 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_has_causalmask_has_bias_has_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_has_causalmask_has_bias_has_dropout_maxk_128.cpp
@@ -1,6 +1,6 @@
 
 /*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -8,11 +8,11 @@
  * The file is automatically generated, don't modify!
  */
 
-#include <ck/ck.hpp>
+#include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_grouped_infer.h"
 
 template void run_grouped_infer_causalmask_bias_dropout_dispatch<
-    ck::half_t,
+    ck_tile::fp16_t,
     true,
     true,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_has_causalmask_has_bias_has_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_has_causalmask_has_bias_has_dropout_maxk_256.cpp
index dac24a533..f4623e664 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_has_causalmask_has_bias_has_dropout_maxk_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_has_causalmask_has_bias_has_dropout_maxk_256.cpp
@@ -1,6 +1,6 @@
 
 /*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -8,11 +8,11 @@
  * The file is automatically generated, don't modify!
  */
 
-#include <ck/ck.hpp>
+#include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_grouped_infer.h"
 
 template void run_grouped_infer_causalmask_bias_dropout_dispatch<
-    ck::half_t,
+    ck_tile::fp16_t,
     true,
     true,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_has_causalmask_has_bias_has_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_has_causalmask_has_bias_has_dropout_maxk_32.cpp
index c99321f42..cb44bd3e6 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_has_causalmask_has_bias_has_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_has_causalmask_has_bias_has_dropout_maxk_32.cpp
@@ -1,6 +1,6 @@
 
 /*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -8,11 +8,11 @@
  * The file is automatically generated, don't modify!
  */
 
-#include <ck/ck.hpp>
+#include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_grouped_infer.h"
 
 template void run_grouped_infer_causalmask_bias_dropout_dispatch<
-    ck::half_t,
+    ck_tile::fp16_t,
     true,
     true,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_has_causalmask_has_bias_has_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_has_causalmask_has_bias_has_dropout_maxk_64.cpp
index 306b2de2a..0f0e5290d 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_has_causalmask_has_bias_has_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_has_causalmask_has_bias_has_dropout_maxk_64.cpp
@@ -1,6 +1,6 @@
 
 /*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -8,11 +8,11 @@
  * The file is automatically generated, don't modify!
  */
 
-#include <ck/ck.hpp>
+#include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_grouped_infer.h"
 
 template void run_grouped_infer_causalmask_bias_dropout_dispatch<
-    ck::half_t,
+    ck_tile::fp16_t,
     true,
     true,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_has_causalmask_has_bias_no_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_has_causalmask_has_bias_no_dropout_maxk_128.cpp
index 5a8431fe5..9b486ea34 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_has_causalmask_has_bias_no_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_has_causalmask_has_bias_no_dropout_maxk_128.cpp
@@ -1,6 +1,6 @@
 
 /*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -8,11 +8,11 @@
  * The file is automatically generated, don't modify!
  */
 
-#include <ck/ck.hpp>
+#include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_grouped_infer.h"
 
 template void run_grouped_infer_causalmask_bias_dropout_dispatch<
-    ck::half_t,
+    ck_tile::fp16_t,
     true,
     true,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_has_causalmask_has_bias_no_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_has_causalmask_has_bias_no_dropout_maxk_256.cpp
index 29d76c352..2154e1485 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_has_causalmask_has_bias_no_dropout_maxk_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_has_causalmask_has_bias_no_dropout_maxk_256.cpp
@@ -1,6 +1,6 @@
 
 /*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -8,11 +8,11 @@
  * The file is automatically generated, don't modify!
  */
 
-#include <ck/ck.hpp>
+#include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_grouped_infer.h"
 
 template void run_grouped_infer_causalmask_bias_dropout_dispatch<
-    ck::half_t,
+    ck_tile::fp16_t,
     true,
     true,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_has_causalmask_has_bias_no_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_has_causalmask_has_bias_no_dropout_maxk_32.cpp
index 9475e9edd..4d526353a 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_has_causalmask_has_bias_no_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_has_causalmask_has_bias_no_dropout_maxk_32.cpp
@@ -1,6 +1,6 @@
 
 /*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -8,11 +8,11 @@
  * The file is automatically generated, don't modify!
  */
 
-#include <ck/ck.hpp>
+#include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_grouped_infer.h"
 
 template void run_grouped_infer_causalmask_bias_dropout_dispatch<
-    ck::half_t,
+    ck_tile::fp16_t,
     true,
     true,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_has_causalmask_has_bias_no_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_has_causalmask_has_bias_no_dropout_maxk_64.cpp
index adb2f5ad1..bc14f586d 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_has_causalmask_has_bias_no_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_has_causalmask_has_bias_no_dropout_maxk_64.cpp
@@ -1,6 +1,6 @@
 
 /*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -8,11 +8,11 @@
  * The file is automatically generated, don't modify!
  */
 
-#include <ck/ck.hpp>
+#include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_grouped_infer.h"
 
 template void run_grouped_infer_causalmask_bias_dropout_dispatch<
-    ck::half_t,
+    ck_tile::fp16_t,
     true,
     true,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_has_causalmask_no_bias_has_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_has_causalmask_no_bias_has_dropout_maxk_128.cpp
index 524a21c34..98567089a 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_has_causalmask_no_bias_has_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_has_causalmask_no_bias_has_dropout_maxk_128.cpp
@@ -1,6 +1,6 @@
 
 /*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -8,11 +8,11 @@
  * The file is automatically generated, don't modify!
  */
 
-#include <ck/ck.hpp>
+#include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_grouped_infer.h"
 
 template void run_grouped_infer_causalmask_bias_dropout_dispatch<
-    ck::half_t,
+    ck_tile::fp16_t,
     true,
     false,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_has_causalmask_no_bias_has_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_has_causalmask_no_bias_has_dropout_maxk_256.cpp
index 12eb1d0e5..26211bc69 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_has_causalmask_no_bias_has_dropout_maxk_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_has_causalmask_no_bias_has_dropout_maxk_256.cpp
@@ -1,6 +1,6 @@
 
 /*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -8,11 +8,11 @@
  * The file is automatically generated, don't modify!
  */
 
-#include <ck/ck.hpp>
+#include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_grouped_infer.h"
 
 template void run_grouped_infer_causalmask_bias_dropout_dispatch<
-    ck::half_t,
+    ck_tile::fp16_t,
     true,
     false,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_has_causalmask_no_bias_has_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_has_causalmask_no_bias_has_dropout_maxk_32.cpp
index 26f6190d8..72722bcf8 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_has_causalmask_no_bias_has_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_has_causalmask_no_bias_has_dropout_maxk_32.cpp
@@ -1,6 +1,6 @@
 
 /*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -8,11 +8,11 @@
  * The file is automatically generated, don't modify!
  */
 
-#include <ck/ck.hpp>
+#include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_grouped_infer.h"
 
 template void run_grouped_infer_causalmask_bias_dropout_dispatch<
-    ck::half_t,
+    ck_tile::fp16_t,
     true,
     false,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_has_causalmask_no_bias_has_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_has_causalmask_no_bias_has_dropout_maxk_64.cpp
index 111473c7e..c706a640c 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_has_causalmask_no_bias_has_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_has_causalmask_no_bias_has_dropout_maxk_64.cpp
@@ -1,6 +1,6 @@
 
 /*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -8,11 +8,11 @@
  * The file is automatically generated, don't modify!
  */
 
-#include <ck/ck.hpp>
+#include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_grouped_infer.h"
 
 template void run_grouped_infer_causalmask_bias_dropout_dispatch<
-    ck::half_t,
+    ck_tile::fp16_t,
     true,
     false,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_has_causalmask_no_bias_no_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_has_causalmask_no_bias_no_dropout_maxk_128.cpp
index 9adb10a8c..58107a965 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_has_causalmask_no_bias_no_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_has_causalmask_no_bias_no_dropout_maxk_128.cpp
@@ -1,6 +1,6 @@
 
 /*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -8,11 +8,11 @@
  * The file is automatically generated, don't modify!
  */
 
-#include <ck/ck.hpp>
+#include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_grouped_infer.h"
 
 template void run_grouped_infer_causalmask_bias_dropout_dispatch<
-    ck::half_t,
+    ck_tile::fp16_t,
     true,
     false,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_has_causalmask_no_bias_no_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_has_causalmask_no_bias_no_dropout_maxk_256.cpp
index 6b7f35fa4..2b2c794f5 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_has_causalmask_no_bias_no_dropout_maxk_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_has_causalmask_no_bias_no_dropout_maxk_256.cpp
@@ -1,6 +1,6 @@
 
 /*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -8,11 +8,11 @@
  * The file is automatically generated, don't modify!
  */
 
-#include <ck/ck.hpp>
+#include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_grouped_infer.h"
 
 template void run_grouped_infer_causalmask_bias_dropout_dispatch<
-    ck::half_t,
+    ck_tile::fp16_t,
     true,
     false,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_has_causalmask_no_bias_no_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_has_causalmask_no_bias_no_dropout_maxk_32.cpp
index e89cffda5..e8e3110f9 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_has_causalmask_no_bias_no_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_has_causalmask_no_bias_no_dropout_maxk_32.cpp
@@ -1,6 +1,6 @@
 
 /*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -8,11 +8,11 @@
  * The file is automatically generated, don't modify!
  */
 
-#include <ck/ck.hpp>
+#include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_grouped_infer.h"
 
 template void run_grouped_infer_causalmask_bias_dropout_dispatch<
-    ck::half_t,
+    ck_tile::fp16_t,
     true,
     false,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_has_causalmask_no_bias_no_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_has_causalmask_no_bias_no_dropout_maxk_64.cpp
index 7b4552d93..c50ad6f4e 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_has_causalmask_no_bias_no_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_has_causalmask_no_bias_no_dropout_maxk_64.cpp
@@ -1,6 +1,6 @@
 
 /*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -8,11 +8,11 @@
  * The file is automatically generated, don't modify!
  */
 
-#include <ck/ck.hpp>
+#include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_grouped_infer.h"
 
 template void run_grouped_infer_causalmask_bias_dropout_dispatch<
-    ck::half_t,
+    ck_tile::fp16_t,
     true,
     false,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_no_causalmask_has_bias_has_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_no_causalmask_has_bias_has_dropout_maxk_128.cpp
index 734b7e5a0..60e20d744 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_no_causalmask_has_bias_has_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_no_causalmask_has_bias_has_dropout_maxk_128.cpp
@@ -1,6 +1,6 @@
 
 /*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -8,11 +8,11 @@
  * The file is automatically generated, don't modify!
  */
 
-#include <ck/ck.hpp>
+#include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_grouped_infer.h"
 
 template void run_grouped_infer_causalmask_bias_dropout_dispatch<
-    ck::half_t,
+    ck_tile::fp16_t,
     false,
     true,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_no_causalmask_has_bias_has_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_no_causalmask_has_bias_has_dropout_maxk_256.cpp
index 2644e4796..e4eeebfcb 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_no_causalmask_has_bias_has_dropout_maxk_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_no_causalmask_has_bias_has_dropout_maxk_256.cpp
@@ -1,6 +1,6 @@
 
 /*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -8,11 +8,11 @@
  * The file is automatically generated, don't modify!
  */
 
-#include <ck/ck.hpp>
+#include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_grouped_infer.h"
 
 template void run_grouped_infer_causalmask_bias_dropout_dispatch<
-    ck::half_t,
+    ck_tile::fp16_t,
     false,
     true,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_no_causalmask_has_bias_has_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_no_causalmask_has_bias_has_dropout_maxk_32.cpp
index cba7af09d..4b54aa562 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_no_causalmask_has_bias_has_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_no_causalmask_has_bias_has_dropout_maxk_32.cpp
@@ -1,6 +1,6 @@
 
 /*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -8,11 +8,11 @@
  * The file is automatically generated, don't modify!
  */
 
-#include <ck/ck.hpp>
+#include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_grouped_infer.h"
 
 template void run_grouped_infer_causalmask_bias_dropout_dispatch<
-    ck::half_t,
+    ck_tile::fp16_t,
     false,
     true,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_no_causalmask_has_bias_has_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_no_causalmask_has_bias_has_dropout_maxk_64.cpp
index 1755388bb..66e02cd50 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_no_causalmask_has_bias_has_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_no_causalmask_has_bias_has_dropout_maxk_64.cpp
@@ -1,6 +1,6 @@
 
 /*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -8,11 +8,11 @@
  * The file is automatically generated, don't modify!
  */
 
-#include <ck/ck.hpp>
+#include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_grouped_infer.h"
 
 template void run_grouped_infer_causalmask_bias_dropout_dispatch<
-    ck::half_t,
+    ck_tile::fp16_t,
     false,
     true,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_no_causalmask_has_bias_no_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_no_causalmask_has_bias_no_dropout_maxk_128.cpp
index 24074346e..1c42f4206 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_no_causalmask_has_bias_no_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_no_causalmask_has_bias_no_dropout_maxk_128.cpp
@@ -1,6 +1,6 @@
 
 /*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -8,11 +8,11 @@
  * The file is automatically generated, don't modify!
  */
 
-#include <ck/ck.hpp>
+#include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_grouped_infer.h"
 
 template void run_grouped_infer_causalmask_bias_dropout_dispatch<
-    ck::half_t,
+    ck_tile::fp16_t,
     false,
     true,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_no_causalmask_has_bias_no_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_no_causalmask_has_bias_no_dropout_maxk_256.cpp
index 609ee02ec..46b4bd288 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_no_causalmask_has_bias_no_dropout_maxk_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_no_causalmask_has_bias_no_dropout_maxk_256.cpp
@@ -1,6 +1,6 @@
 
 /*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -8,11 +8,11 @@
  * The file is automatically generated, don't modify!
  */
 
-#include <ck/ck.hpp>
+#include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_grouped_infer.h"
 
 template void run_grouped_infer_causalmask_bias_dropout_dispatch<
-    ck::half_t,
+    ck_tile::fp16_t,
     false,
     true,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_no_causalmask_has_bias_no_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_no_causalmask_has_bias_no_dropout_maxk_32.cpp
index 56debfe4d..2ec8996f4 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_no_causalmask_has_bias_no_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_no_causalmask_has_bias_no_dropout_maxk_32.cpp
@@ -1,6 +1,6 @@
 
 /*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -8,11 +8,11 @@
  * The file is automatically generated, don't modify!
  */
 
-#include <ck/ck.hpp>
+#include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_grouped_infer.h"
 
 template void run_grouped_infer_causalmask_bias_dropout_dispatch<
-    ck::half_t,
+    ck_tile::fp16_t,
     false,
     true,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_no_causalmask_has_bias_no_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_no_causalmask_has_bias_no_dropout_maxk_64.cpp
index 454733419..5e2a114a7 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_no_causalmask_has_bias_no_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_no_causalmask_has_bias_no_dropout_maxk_64.cpp
@@ -1,6 +1,6 @@
 
 /*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -8,11 +8,11 @@
  * The file is automatically generated, don't modify!
  */
 
-#include <ck/ck.hpp>
+#include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_grouped_infer.h"
 
 template void run_grouped_infer_causalmask_bias_dropout_dispatch<
-    ck::half_t,
+    ck_tile::fp16_t,
     false,
     true,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_no_causalmask_no_bias_has_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_no_causalmask_no_bias_has_dropout_maxk_128.cpp
index de325b10c..88ad1f8dd 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_no_causalmask_no_bias_has_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_no_causalmask_no_bias_has_dropout_maxk_128.cpp
@@ -1,6 +1,6 @@
 
 /*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -8,11 +8,11 @@
  * The file is automatically generated, don't modify!
  */
 
-#include <ck/ck.hpp>
+#include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_grouped_infer.h"
 
 template void run_grouped_infer_causalmask_bias_dropout_dispatch<
-    ck::half_t,
+    ck_tile::fp16_t,
     false,
     false,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_no_causalmask_no_bias_has_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_no_causalmask_no_bias_has_dropout_maxk_256.cpp
index 40754cdd3..c536e0970 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_no_causalmask_no_bias_has_dropout_maxk_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_no_causalmask_no_bias_has_dropout_maxk_256.cpp
@@ -1,6 +1,6 @@
 
 /*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -8,11 +8,11 @@
  * The file is automatically generated, don't modify!
  */
 
-#include <ck/ck.hpp>
+#include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_grouped_infer.h"
 
 template void run_grouped_infer_causalmask_bias_dropout_dispatch<
-    ck::half_t,
+    ck_tile::fp16_t,
     false,
     false,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_no_causalmask_no_bias_has_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_no_causalmask_no_bias_has_dropout_maxk_32.cpp
index 9e27756bf..0c927196b 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_no_causalmask_no_bias_has_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_no_causalmask_no_bias_has_dropout_maxk_32.cpp
@@ -1,6 +1,6 @@
 
 /*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -8,11 +8,11 @@
  * The file is automatically generated, don't modify!
  */
 
-#include <ck/ck.hpp>
+#include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_grouped_infer.h"
 
 template void run_grouped_infer_causalmask_bias_dropout_dispatch<
-    ck::half_t,
+    ck_tile::fp16_t,
     false,
     false,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_no_causalmask_no_bias_has_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_no_causalmask_no_bias_has_dropout_maxk_64.cpp
index 4000c08c5..e84f94f35 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_no_causalmask_no_bias_has_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_no_causalmask_no_bias_has_dropout_maxk_64.cpp
@@ -1,6 +1,6 @@
 
 /*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -8,11 +8,11 @@
  * The file is automatically generated, don't modify!
  */
 
-#include <ck/ck.hpp>
+#include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_grouped_infer.h"
 
 template void run_grouped_infer_causalmask_bias_dropout_dispatch<
-    ck::half_t,
+    ck_tile::fp16_t,
     false,
     false,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_no_causalmask_no_bias_no_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_no_causalmask_no_bias_no_dropout_maxk_128.cpp
index 089d46191..94db8d5d9 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_no_causalmask_no_bias_no_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_no_causalmask_no_bias_no_dropout_maxk_128.cpp
@@ -1,6 +1,6 @@
 
 /*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -8,11 +8,11 @@
  * The file is automatically generated, don't modify!
  */
 
-#include <ck/ck.hpp>
+#include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_grouped_infer.h"
 
 template void run_grouped_infer_causalmask_bias_dropout_dispatch<
-    ck::half_t,
+    ck_tile::fp16_t,
     false,
     false,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_no_causalmask_no_bias_no_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_no_causalmask_no_bias_no_dropout_maxk_256.cpp
index 6a6e96ff8..61abbbf36 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_no_causalmask_no_bias_no_dropout_maxk_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_no_causalmask_no_bias_no_dropout_maxk_256.cpp
@@ -1,6 +1,6 @@
 
 /*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -8,11 +8,11 @@
  * The file is automatically generated, don't modify!
  */
 
-#include <ck/ck.hpp>
+#include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_grouped_infer.h"
 
 template void run_grouped_infer_causalmask_bias_dropout_dispatch<
-    ck::half_t,
+    ck_tile::fp16_t,
     false,
     false,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_no_causalmask_no_bias_no_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_no_causalmask_no_bias_no_dropout_maxk_32.cpp
index fb8604451..2a7b8f256 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_no_causalmask_no_bias_no_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_no_causalmask_no_bias_no_dropout_maxk_32.cpp
@@ -1,6 +1,6 @@
 
 /*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -8,11 +8,11 @@
  * The file is automatically generated, don't modify!
  */
 
-#include <ck/ck.hpp>
+#include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_grouped_infer.h"
 
 template void run_grouped_infer_causalmask_bias_dropout_dispatch<
-    ck::half_t,
+    ck_tile::fp16_t,
     false,
     false,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_no_causalmask_no_bias_no_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_no_causalmask_no_bias_no_dropout_maxk_64.cpp
index 6a1ae5649..d5b1bd180 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_no_causalmask_no_bias_no_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_no_causalmask_no_bias_no_dropout_maxk_64.cpp
@@ -1,6 +1,6 @@
 
 /*
- * Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -8,11 +8,11 @@
  * The file is automatically generated, don't modify!
  */
 
-#include <ck/ck.hpp>
+#include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_grouped_infer.h"
 
 template void run_grouped_infer_causalmask_bias_dropout_dispatch<
-    ck::half_t,
+    ck_tile::fp16_t,
     false,
     false,
     false,

From 76fb48524219d47190aefb8f814095e25a24a4a8 Mon Sep 17 00:00:00 2001
From: Qianfeng Zhang <Qianfeng.Zhang@amd.com>
Date: Thu, 13 Jun 2024 16:24:35 +0000
Subject: [PATCH 551/641] Synchronize composable_kernel_tiled to latest ck
 develop

---
 .gitmodules                         | 2 +-
 third_party/composable_kernel_tiled | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/.gitmodules b/.gitmodules
index 6e56bcb9c..b642ad5b9 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -7,4 +7,4 @@
 [submodule "third_party/composable_kernel_tiled"]
 	path = third_party/composable_kernel_tiled
 	url = https://github.com/ROCm/composable_kernel.git
-	branch = develop-xformers-test 
+	branch = develop 
diff --git a/third_party/composable_kernel_tiled b/third_party/composable_kernel_tiled
index ed3a957f1..37a347e38 160000
--- a/third_party/composable_kernel_tiled
+++ b/third_party/composable_kernel_tiled
@@ -1 +1 @@
-Subproject commit ed3a957f1c49b6ac280e52d96dcceac920e582d9
+Subproject commit 37a347e3807198400d6ee1c8401f7c2cbb1d426e

From 1f3add7f6b4cf6d7faf2111ca1870df2dd85775a Mon Sep 17 00:00:00 2001
From: Qianfeng Zhang <Qianfeng.Zhang@amd.com>
Date: Fri, 14 Jun 2024 09:39:16 +0000
Subject: [PATCH 552/641] Use FmhaFwdTilePartitioner_HBS only with seqlen_k
 padded cases

---
 .../hip_fmha/ck_tiled_fmha_grouped_forward.h  | 29 ++++++++----
 .../hip_fmha/ck_tiled_fmha_grouped_infer.h    | 47 ++++++++++++++-----
 2 files changed, 57 insertions(+), 19 deletions(-)

diff --git a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_forward.h b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_forward.h
index 55609fd9f..802d2faea 100644
--- a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_forward.h
+++ b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_forward.h
@@ -49,8 +49,7 @@ struct grouped_forward_causalmask_bias_dropout_dispatch {
       using FmhaMask = ck_tile::SimplifiedGenericAttentionMask<has_masking>;
 
       using FmhaFwdShape_ = FmhaFwdShape<MaxK>;
-      using FmhaFwdTilePartitioner_ =
-          ck_tile::FmhaFwdTilePartitioner<FmhaFwdShape_>;
+
       constexpr ck_tile::index_t occupancy =
           (MaxK == 64) ? 3 : (MaxK == 256) ? 1 : 2;
 
@@ -92,12 +91,26 @@ struct grouped_forward_causalmask_bias_dropout_dispatch {
                     kPadSeqLenQ,
                     kPadHeadDimV>>;
 
-            using FmhaFwdKernel_ = ck_tile::FmhaFwdKernel<
-                FmhaFwdTilePartitioner_,
-                FmhaFwdPipeline_,
-                FmhaFwdEpilogue_>;
-
-            RunWithKernel<FmhaFwdKernel_>(param, stream);
+            if (param.seqlen_k_dev_ptr !=
+                nullptr) { // seqlen_k of batches are padded
+              using FmhaTilePartitioner =
+                  ck_tile::FmhaFwdTilePartitioner_HBS<FmhaFwdShape_>;
+              using FmhaFwdKernel_ = ck_tile::FmhaFwdKernel<
+                  FmhaTilePartitioner,
+                  FmhaFwdPipeline_,
+                  FmhaFwdEpilogue_>;
+
+              RunWithKernel<FmhaFwdKernel_>(param, stream);
+            } else {
+              using FmhaTilePartitioner =
+                  ck_tile::FmhaFwdTilePartitioner_SHB<FmhaFwdShape_>;
+              using FmhaFwdKernel_ = ck_tile::FmhaFwdKernel<
+                  FmhaTilePartitioner,
+                  FmhaFwdPipeline_,
+                  FmhaFwdEpilogue_>;
+
+              RunWithKernel<FmhaFwdKernel_>(param, stream);
+            }
           });
     });
   };
diff --git a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_infer.h b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_infer.h
index f66eeb436..5197a6cb1 100644
--- a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_infer.h
+++ b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_infer.h
@@ -50,7 +50,6 @@ struct grouped_infer_causalmask_bias_dropout_dispatch {
       using FmhaMask = ck_tile::SimplifiedGenericAttentionMask<has_masking>;
 
       using FmhaShape = FmhaFwdShape<MaxK>;
-      using FmhaTilePartitioner = ck_tile::FmhaFwdTilePartitioner<FmhaShape>;
       constexpr ck_tile::index_t occupancy =
           (MaxK == 64) ? 3 : ((MaxK == 256) ? 1 : 2);
 
@@ -94,12 +93,26 @@ struct grouped_infer_causalmask_bias_dropout_dispatch {
                       kPadSeqLenQ,
                       kPadHeadDimV>>;
 
-              using FmhaKernel = ck_tile::FmhaFwdKernel<
-                  FmhaTilePartitioner,
-                  FmhaPipeline,
-                  FmhaEpilogue>;
-
-              RunWithKernel<FmhaKernel>(param, stream);
+              if (param.seqlen_k_dev_ptr !=
+                  nullptr) { // seqlen_k of batches are padded
+                using FmhaTilePartitioner =
+                    ck_tile::FmhaFwdTilePartitioner_HBS<FmhaShape>;
+                using FmhaKernel = ck_tile::FmhaFwdKernel<
+                    FmhaTilePartitioner,
+                    FmhaPipeline,
+                    FmhaEpilogue>;
+
+                RunWithKernel<FmhaKernel>(param, stream);
+              } else {
+                using FmhaTilePartitioner =
+                    ck_tile::FmhaFwdTilePartitioner_SHB<FmhaShape>;
+                using FmhaKernel = ck_tile::FmhaFwdKernel<
+                    FmhaTilePartitioner,
+                    FmhaPipeline,
+                    FmhaEpilogue>;
+
+                RunWithKernel<FmhaKernel>(param, stream);
+              }
             });
       } else {
         using FmhaTraits = ck_tile::TileFmhaTraits<
@@ -127,10 +140,22 @@ struct grouped_infer_causalmask_bias_dropout_dispatch {
                 true,
                 true>>;
 
-        using FmhaKernel = ck_tile::
-            FmhaFwdKernel<FmhaTilePartitioner, FmhaPipeline, FmhaEpilogue>;
-
-        RunWithKernel<FmhaKernel>(param, stream);
+        if (param.seqlen_k_dev_ptr !=
+            nullptr) { // seqlen_k of batches are padded
+          using FmhaTilePartitioner =
+              ck_tile::FmhaFwdTilePartitioner_HBS<FmhaShape>;
+          using FmhaKernel = ck_tile::
+              FmhaFwdKernel<FmhaTilePartitioner, FmhaPipeline, FmhaEpilogue>;
+
+          RunWithKernel<FmhaKernel>(param, stream);
+        } else {
+          using FmhaTilePartitioner =
+              ck_tile::FmhaFwdTilePartitioner_SHB<FmhaShape>;
+          using FmhaKernel = ck_tile::
+              FmhaFwdKernel<FmhaTilePartitioner, FmhaPipeline, FmhaEpilogue>;
+
+          RunWithKernel<FmhaKernel>(param, stream);
+        }
       }
     });
   };

From 9df93e5ff8faa816b643326ab32f84add384e0f3 Mon Sep 17 00:00:00 2001
From: Qianfeng Zhang <Qianfeng.Zhang@amd.com>
Date: Mon, 17 Jun 2024 19:06:20 +0000
Subject: [PATCH 553/641] Tiny fix/change to make
 test_forward/test_backward/test_dropout/test_dropout_backward_ck pass

---
 setup.py                        | 2 +-
 tests/test_mem_eff_attention.py | 8 ++++++--
 xformers/ops/fmha/ck.py         | 9 +++++----
 3 files changed, 12 insertions(+), 7 deletions(-)

diff --git a/setup.py b/setup.py
index 327e1f7df..74d5b9cd7 100644
--- a/setup.py
+++ b/setup.py
@@ -434,7 +434,7 @@ def get_extensions():
                 "-DCK_FMHA_FWD_FAST_EXP2=1",
                 "-fgpu-flush-denormals-to-zero",
                 "-Werror",
-                "-Woverloaded-virtual",
+                ##"-Woverloaded-virtual",
             ]
             + generator_flag
             + cc_flag,
diff --git a/tests/test_mem_eff_attention.py b/tests/test_mem_eff_attention.py
index acfec797d..16a4b361c 100644
--- a/tests/test_mem_eff_attention.py
+++ b/tests/test_mem_eff_attention.py
@@ -967,7 +967,7 @@ def test_backward(
     )
 
     if op_bw == fmha.ck.BwOp:
-        op_fwd = fmha.ck.FwOp
+        op_fw = fmha.ck.FwOp
         if dtype == torch.bfloat16:
             pytest.skip("CK Fmha backward for bfloat16 currently is not very accurate for some cases!")
         if grad_out_contiguous == False:
@@ -1170,7 +1170,11 @@ def test_dropout(op, q_len, kv_len, batch_size, k_len, p, seed, attn_bias):
     torch.manual_seed(seed)
     mask = _get_drop_mask(op, batch_size, q_len, kv_len, p, device)
     ref = ref_attention_for_test(query, key, value, attn_bias, mask, p)
-    assert_allclose(out, ref, atol=2e-4), f"{(out - ref).abs().max()}"
+
+    if dtype is torch.float:
+        assert_allclose(out, ref, atol=2e-4), f"{(out - ref).abs().max()}"
+    else:
+        assert_allclose(out.float(), ref, atol=2.2e-2), f"{(out - ref).abs().max()}"
 
     num_trials = 1000
     p_val_tol = 1e-6
diff --git a/xformers/ops/fmha/ck.py b/xformers/ops/fmha/ck.py
index 5046b7fc4..79780e093 100644
--- a/xformers/ops/fmha/ck.py
+++ b/xformers/ops/fmha/ck.py
@@ -15,6 +15,7 @@
 from . import attn_bias
 from .attn_bias import (
     AttentionBias,
+    AttentionBiasSubTensor,
     BlockDiagonalCausalLocalAttentionFromBottomRightMask,
     BlockDiagonalCausalLocalAttentionMask,
     BlockDiagonalCausalMask,
@@ -65,13 +66,13 @@ def _get_seqlen_info(
 def _get_tensor_bias(
     attn_bias: Optional[Union[torch.Tensor, AttentionBias]]
 ) -> Optional[torch.Tensor]:
-    if isinstance(attn_bias, torch.Tensor):
+    if isinstance(attn_bias, AttentionBiasSubTensor):
+        if isinstance(attn_bias, LowerTriangularMaskWithTensorBias):
+            return attn_bias._subtensor
+    elif isinstance(attn_bias, torch.Tensor):
         return attn_bias
-    elif isinstance(attn_bias, LowerTriangularMaskWithTensorBias):
-        return attn_bias._subtensor
     return None
 
-
 def _check_bias_alignment(
     reasons: List[str], attn_bias: Optional[Union[torch.Tensor, AttentionBias]]
 ) -> None:

From d6ccfa1a63a70f9ff0800d40e168cc9596121051 Mon Sep 17 00:00:00 2001
From: Qianfeng Zhang <Qianfeng.Zhang@amd.com>
Date: Mon, 17 Jun 2024 20:17:27 +0000
Subject: [PATCH 554/641] Fix compiling issue with regard to Invoker
 definitions in forward_decoder/forward_decoder_split operators

---
 .../hip_fmha/attention_forward_decoder.cpp    |  4 +-
 .../hip_fmha/attention_forward_splitk.cpp     |  4 +-
 .../hip_fmha/ck_attention_forward_decoder.h   | 56 ++++++-----
 .../ck_attention_forward_decoder_splitk.h     | 98 ++++++++++---------
 4 files changed, 83 insertions(+), 79 deletions(-)

diff --git a/xformers/csrc/attention/hip_fmha/attention_forward_decoder.cpp b/xformers/csrc/attention/hip_fmha/attention_forward_decoder.cpp
index 6fe0137b0..41a78f01d 100644
--- a/xformers/csrc/attention/hip_fmha/attention_forward_decoder.cpp
+++ b/xformers/csrc/attention/hip_fmha/attention_forward_decoder.cpp
@@ -149,7 +149,7 @@ at::Tensor& efficient_attention_forward_decoder_ck_out_impl(
             lds_bytes);
 
         auto invoker = device_op_t::Invoker{};
-        (void)invoker.Run(arg, {stream});
+        (void)invoker.Run(&arg, {stream});
       });
 
   return O;
@@ -330,4 +330,4 @@ int main(int argc, char** argv) {
   return 0;
 }
 
-#endif // MAIN
\ No newline at end of file
+#endif // MAIN
diff --git a/xformers/csrc/attention/hip_fmha/attention_forward_splitk.cpp b/xformers/csrc/attention/hip_fmha/attention_forward_splitk.cpp
index 0c2740063..bf4d3d793 100644
--- a/xformers/csrc/attention/hip_fmha/attention_forward_splitk.cpp
+++ b/xformers/csrc/attention/hip_fmha/attention_forward_splitk.cpp
@@ -167,7 +167,7 @@ at::Tensor& efficient_attention_forward_decoder_splitk_ck_out_impl(
             lds_bytes);
 
         auto invoker = device_op_t::Invoker{};
-        (void)invoker.Run(arg, {stream});
+        (void)invoker.Run(&arg, {stream});
       });
 
   return O;
@@ -1181,4 +1181,4 @@ int main(int argc, char** argv) {
 #endif // MAIN
 
 #undef AT_DISPATCH_CASE_3
-#undef AT_DISPATCH_SWITCH_3
\ No newline at end of file
+#undef AT_DISPATCH_SWITCH_3
diff --git a/xformers/csrc/attention/hip_fmha/ck_attention_forward_decoder.h b/xformers/csrc/attention/hip_fmha/ck_attention_forward_decoder.h
index 741eda2ef..fcd45dd5f 100644
--- a/xformers/csrc/attention/hip_fmha/ck_attention_forward_decoder.h
+++ b/xformers/csrc/attention/hip_fmha/ck_attention_forward_decoder.h
@@ -434,14 +434,16 @@ struct FMHADecoderSeqlen1DeviceOp : public BaseOperator {
   struct Invoker : public BaseInvoker {
     using Argument = DeviceOp::Argument;
     float Run(
-        const Argument& arg,
+        const BaseArgument* argp_,
         const StreamConfig& stream_config = StreamConfig{}) {
-      auto threads_per_wavefront = arg.block_dim.x;
+      const Argument* argp = dynamic_cast<const Argument*>(argp_);
+
+      auto threads_per_wavefront = argp->block_dim.x;
 
       auto Q_size_k_alignment_necessary = 0;
 
       for (auto vec_size : {4, 2, 1}) {
-        if (arg.Q_size_k <= vec_size * threads_per_wavefront) {
+        if (argp->Q_size_k <= vec_size * threads_per_wavefront) {
           Q_size_k_alignment_necessary = vec_size;
         }
       }
@@ -450,7 +452,7 @@ struct FMHADecoderSeqlen1DeviceOp : public BaseOperator {
         throw std::runtime_error("Unsupported Q_size_k");
       }
 
-      if (arg.Q_size_k % Q_size_k_alignment_necessary) {
+      if (argp->Q_size_k % Q_size_k_alignment_necessary) {
         throw std::runtime_error("Unsupported alignment for Q_size_k");
       }
 
@@ -465,29 +467,29 @@ struct FMHADecoderSeqlen1DeviceOp : public BaseOperator {
                             scalar_t,
                             1>
                       : nullptr,
-          arg.grid_dim,
-          arg.block_dim,
-          arg.lds_bytes,
-          arg.XQ,
-          arg.cache_K,
-          arg.cache_V,
-          arg.O,
-          arg.seq_kv_lens,
-          arg.XQ_stride_b,
-          arg.XQ_stride_m,
-          arg.XQ_stride_g,
-          arg.XQ_stride_h,
-          arg.K_stride_b,
-          arg.K_stride_m,
-          arg.K_stride_g,
-          arg.K_stride_h,
-          arg.Q_size_m,
-          arg.Q_size_g,
-          arg.Q_size_h,
-          arg.Q_size_k,
-          arg.K_size_m,
-          arg.multiquery,
-          arg.qk_scale);
+          argp->grid_dim,
+          argp->block_dim,
+          argp->lds_bytes,
+          argp->XQ,
+          argp->cache_K,
+          argp->cache_V,
+          argp->O,
+          argp->seq_kv_lens,
+          argp->XQ_stride_b,
+          argp->XQ_stride_m,
+          argp->XQ_stride_g,
+          argp->XQ_stride_h,
+          argp->K_stride_b,
+          argp->K_stride_m,
+          argp->K_stride_g,
+          argp->K_stride_h,
+          argp->Q_size_m,
+          argp->Q_size_g,
+          argp->Q_size_h,
+          argp->Q_size_k,
+          argp->K_size_m,
+          argp->multiquery,
+          argp->qk_scale);
     }
   };
 };
diff --git a/xformers/csrc/attention/hip_fmha/ck_attention_forward_decoder_splitk.h b/xformers/csrc/attention/hip_fmha/ck_attention_forward_decoder_splitk.h
index bb45f3796..df329b20f 100644
--- a/xformers/csrc/attention/hip_fmha/ck_attention_forward_decoder_splitk.h
+++ b/xformers/csrc/attention/hip_fmha/ck_attention_forward_decoder_splitk.h
@@ -593,13 +593,15 @@ struct FMHADecoderSplitKDeviceOp : public BaseOperator {
   struct Invoker : public BaseInvoker {
     using Argument = DeviceOp::Argument;
     float Run(
-        const Argument& arg,
+        const BaseArgument* argp_,
         const StreamConfig& stream_config = StreamConfig{}) {
-      auto threads_per_wavefront = arg.block_dim.x;
+      const Argument* argp = dynamic_cast<const Argument*>(argp_);
+
+      auto threads_per_wavefront = argp->block_dim.x;
       auto Q_size_k_alignment_necessary = 0;
 
       for (auto vec_size : {4, 2, 1}) {
-        if (arg.Q_size_k <= vec_size * threads_per_wavefront) {
+        if (argp->Q_size_k <= vec_size * threads_per_wavefront) {
           Q_size_k_alignment_necessary = vec_size;
         }
       }
@@ -608,7 +610,7 @@ struct FMHADecoderSplitKDeviceOp : public BaseOperator {
         throw std::runtime_error("Unsupported Q_size_k");
       }
 
-      if (arg.Q_size_k % Q_size_k_alignment_necessary) {
+      if (argp->Q_size_k % Q_size_k_alignment_necessary) {
         throw std::runtime_error("Unsupported alignment for Q_size_k");
       }
 
@@ -639,36 +641,36 @@ struct FMHADecoderSplitKDeviceOp : public BaseOperator {
                             KV_M_MAX,
                             compute_t>
                       : nullptr,
-          arg.grid_dim,
-          arg.block_dim,
-          arg.lds_bytes,
-          arg.XQ,
-          arg.cache_K,
-          arg.cache_V,
-          arg.split_O,
-          arg.split_max,
-          arg.split_sumexp,
-          arg.seq_kv_lens,
-          arg.XQ_stride_b,
-          arg.XQ_stride_m,
-          arg.XQ_stride_g,
-          arg.XQ_stride_h,
-          arg.K_stride_b,
-          arg.K_stride_m,
-          arg.K_stride_g,
-          arg.K_stride_h,
-          arg.O_stride_split,
-          arg.Q_size_m,
-          arg.Q_size_g,
-          arg.Q_size_h,
-          arg.Q_size_k,
-          arg.K_size_m,
-          arg.multiquery,
-          arg.qk_scale,
-          arg.split_k);
-
-      const dim3 reduce_gridsize = {arg.grid_dim.x};
-      const dim3 reduce_blocksize = {arg.block_dim.x};
+          argp->grid_dim,
+          argp->block_dim,
+          argp->lds_bytes,
+          argp->XQ,
+          argp->cache_K,
+          argp->cache_V,
+          argp->split_O,
+          argp->split_max,
+          argp->split_sumexp,
+          argp->seq_kv_lens,
+          argp->XQ_stride_b,
+          argp->XQ_stride_m,
+          argp->XQ_stride_g,
+          argp->XQ_stride_h,
+          argp->K_stride_b,
+          argp->K_stride_m,
+          argp->K_stride_g,
+          argp->K_stride_h,
+          argp->O_stride_split,
+          argp->Q_size_m,
+          argp->Q_size_g,
+          argp->Q_size_h,
+          argp->Q_size_k,
+          argp->K_size_m,
+          argp->multiquery,
+          argp->qk_scale,
+          argp->split_k);
+
+      const dim3 reduce_gridsize = {argp->grid_dim.x};
+      const dim3 reduce_blocksize = {argp->block_dim.x};
       constexpr int32_t reduce_lds_bytes = 0;
       float reduce_result = launch_and_time_kernel(
           stream_config,
@@ -688,20 +690,20 @@ struct FMHADecoderSplitKDeviceOp : public BaseOperator {
           reduce_gridsize,
           reduce_blocksize,
           reduce_lds_bytes,
-          arg.split_O,
-          arg.split_max,
-          arg.split_sumexp,
-          arg.O,
-          arg.Q_size_m,
-          arg.Q_size_g,
-          arg.Q_size_h,
-          arg.Q_size_k,
-          arg.O_stride_split,
-          arg.XQ_stride_b,
-          arg.XQ_stride_m,
-          arg.XQ_stride_g,
-          arg.XQ_stride_h,
-          arg.split_k);
+          argp->split_O,
+          argp->split_max,
+          argp->split_sumexp,
+          argp->O,
+          argp->Q_size_m,
+          argp->Q_size_g,
+          argp->Q_size_h,
+          argp->Q_size_k,
+          argp->O_stride_split,
+          argp->XQ_stride_b,
+          argp->XQ_stride_m,
+          argp->XQ_stride_g,
+          argp->XQ_stride_h,
+          argp->split_k);
       return split_attention_result + reduce_result;
     }
   };

From a7c74756c8da4495e41ae9155ab1c909fa78f653 Mon Sep 17 00:00:00 2001
From: Qianfeng Zhang <Qianfeng.Zhang@amd.com>
Date: Tue, 18 Jun 2024 09:51:17 +0000
Subject: [PATCH 555/641] Keep using -Woverloaded-virtual

---
 setup.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/setup.py b/setup.py
index 74d5b9cd7..327e1f7df 100644
--- a/setup.py
+++ b/setup.py
@@ -434,7 +434,7 @@ def get_extensions():
                 "-DCK_FMHA_FWD_FAST_EXP2=1",
                 "-fgpu-flush-denormals-to-zero",
                 "-Werror",
-                ##"-Woverloaded-virtual",
+                "-Woverloaded-virtual",
             ]
             + generator_flag
             + cc_flag,

From b157b490f72b2328f02b4c57353f4543b1d8279b Mon Sep 17 00:00:00 2001
From: Qianfeng Zhang <Qianfeng.Zhang@amd.com>
Date: Tue, 18 Jun 2024 10:23:02 +0000
Subject: [PATCH 556/641] Fix clang-format for headers and cpp files

---
 .../hip_fmha/attention_forward_decoder.cpp    |  6 +--
 .../hip_fmha/attention_forward_splitk.cpp     | 54 +++++++++----------
 .../hip_fmha/ck_attention_forward_decoder.h   | 10 ++--
 .../ck_attention_forward_decoder_splitk.h     | 48 ++++++++---------
 .../hip_fmha/ck_tiled_fmha_grouped_forward.h  |  5 +-
 5 files changed, 61 insertions(+), 62 deletions(-)

diff --git a/xformers/csrc/attention/hip_fmha/attention_forward_decoder.cpp b/xformers/csrc/attention/hip_fmha/attention_forward_decoder.cpp
index 41a78f01d..0cabf3f95 100644
--- a/xformers/csrc/attention/hip_fmha/attention_forward_decoder.cpp
+++ b/xformers/csrc/attention/hip_fmha/attention_forward_decoder.cpp
@@ -270,9 +270,9 @@ int main(int argc, char** argv) {
     const int32_t n_heads = std::stoi(args[3]);
     const int32_t n_groups = 1;
     const int32_t multiquery = (args[4] == "mq");
-    const auto dtype = (args[5] == "f32")
-        ? torch::kFloat32
-        : (args[5] == "f16") ? torch::kFloat16 : torch::kBFloat16;
+    const auto dtype = (args[5] == "f32") ? torch::kFloat32
+        : (args[5] == "f16")              ? torch::kFloat16
+                                          : torch::kBFloat16;
     const int32_t n_wavefronts_per_block = std::stoi(args[6]);
 
     const int32_t dim_per_head = 4 * kThreadsPerWavefront;
diff --git a/xformers/csrc/attention/hip_fmha/attention_forward_splitk.cpp b/xformers/csrc/attention/hip_fmha/attention_forward_splitk.cpp
index bf4d3d793..fd70436a3 100644
--- a/xformers/csrc/attention/hip_fmha/attention_forward_splitk.cpp
+++ b/xformers/csrc/attention/hip_fmha/attention_forward_splitk.cpp
@@ -555,22 +555,22 @@ struct FMHADecoderSplitAttentionDeviceOp : public BaseOperator {
                     kMaxKVSequenceLength,
                     compute_t>
               : Q_size_k_alignment_necessary == 2
-                  ? efficient_attention_forward_decoder_splitk_ck_kernel<
-                        scalar_t,
-                        2,
-                        kLoopUnroll,
-                        kLoopUnrollTail,
-                        kMaxKVSequenceLength,
-                        compute_t>
-                  : Q_size_k_alignment_necessary == 1
-                      ? efficient_attention_forward_decoder_splitk_ck_kernel<
-                            scalar_t,
-                            1,
-                            kLoopUnroll,
-                            kLoopUnrollTail,
-                            kMaxKVSequenceLength,
-                            compute_t>
-                      : nullptr,
+              ? efficient_attention_forward_decoder_splitk_ck_kernel<
+                    scalar_t,
+                    2,
+                    kLoopUnroll,
+                    kLoopUnrollTail,
+                    kMaxKVSequenceLength,
+                    compute_t>
+              : Q_size_k_alignment_necessary == 1
+              ? efficient_attention_forward_decoder_splitk_ck_kernel<
+                    scalar_t,
+                    1,
+                    kLoopUnroll,
+                    kLoopUnrollTail,
+                    kMaxKVSequenceLength,
+                    compute_t>
+              : nullptr,
           arg.grid_dim,
           arg.block_dim,
           arg.lds_bytes,
@@ -728,14 +728,14 @@ struct FMHADecoderSplitReduceDeviceOp : public BaseOperator {
                     scalar_t,
                     4>
               : O_size_k_alignment_necessary == 2
-                  ? efficient_attention_forward_decoder_splitk_reduce_ck_kernel<
-                        scalar_t,
-                        2>
-                  : O_size_k_alignment_necessary == 1
-                      ? efficient_attention_forward_decoder_splitk_reduce_ck_kernel<
-                            scalar_t,
-                            1>
-                      : nullptr,
+              ? efficient_attention_forward_decoder_splitk_reduce_ck_kernel<
+                    scalar_t,
+                    2>
+              : O_size_k_alignment_necessary == 1
+              ? efficient_attention_forward_decoder_splitk_reduce_ck_kernel<
+                    scalar_t,
+                    1>
+              : nullptr,
           reduce_gridsize,
           reduce_blocksize,
           reduce_lds_bytes,
@@ -1114,9 +1114,9 @@ int main(int argc, char** argv) {
     const int32_t batch_size = std::stoi(args[1]);
     const int32_t nq_heads = std::stoi(args[2]);
     const int32_t nkv_heads = std::stoi(args[3]);
-    const auto dtype = (args[4] == "f32")
-        ? torch::kFloat32
-        : (args[4] == "f16") ? torch::kFloat16 : torch::kBFloat16;
+    const auto dtype = (args[4] == "f32") ? torch::kFloat32
+        : (args[4] == "f16")              ? torch::kFloat16
+                                          : torch::kBFloat16;
     const int32_t n_wavefronts_per_block = std::stoi(args[5]);
 
     auto [Q, K, V, seq] =
diff --git a/xformers/csrc/attention/hip_fmha/ck_attention_forward_decoder.h b/xformers/csrc/attention/hip_fmha/ck_attention_forward_decoder.h
index fcd45dd5f..c455f235a 100644
--- a/xformers/csrc/attention/hip_fmha/ck_attention_forward_decoder.h
+++ b/xformers/csrc/attention/hip_fmha/ck_attention_forward_decoder.h
@@ -461,12 +461,10 @@ struct FMHADecoderSeqlen1DeviceOp : public BaseOperator {
           Q_size_k_alignment_necessary == 4
               ? efficient_attention_forward_decoder_ck_kernel<scalar_t, 4>
               : Q_size_k_alignment_necessary == 2
-                  ? efficient_attention_forward_decoder_ck_kernel<scalar_t, 2>
-                  : Q_size_k_alignment_necessary == 1
-                      ? efficient_attention_forward_decoder_ck_kernel<
-                            scalar_t,
-                            1>
-                      : nullptr,
+              ? efficient_attention_forward_decoder_ck_kernel<scalar_t, 2>
+              : Q_size_k_alignment_necessary == 1
+              ? efficient_attention_forward_decoder_ck_kernel<scalar_t, 1>
+              : nullptr,
           argp->grid_dim,
           argp->block_dim,
           argp->lds_bytes,
diff --git a/xformers/csrc/attention/hip_fmha/ck_attention_forward_decoder_splitk.h b/xformers/csrc/attention/hip_fmha/ck_attention_forward_decoder_splitk.h
index df329b20f..e4d575a58 100644
--- a/xformers/csrc/attention/hip_fmha/ck_attention_forward_decoder_splitk.h
+++ b/xformers/csrc/attention/hip_fmha/ck_attention_forward_decoder_splitk.h
@@ -625,22 +625,22 @@ struct FMHADecoderSplitKDeviceOp : public BaseOperator {
                     KV_M_MAX,
                     compute_t>
               : Q_size_k_alignment_necessary == 2
-                  ? efficient_attention_forward_decoder_splitk_ck_kernel<
-                        scalar_t,
-                        /* vec_size */ 2,
-                        n_loop_unroll,
-                        n_loop_unroll_tail,
-                        KV_M_MAX,
-                        compute_t>
-                  : Q_size_k_alignment_necessary == 1
-                      ? efficient_attention_forward_decoder_splitk_ck_kernel<
-                            scalar_t,
-                            /* vec_size */ 1,
-                            n_loop_unroll,
-                            n_loop_unroll_tail,
-                            KV_M_MAX,
-                            compute_t>
-                      : nullptr,
+              ? efficient_attention_forward_decoder_splitk_ck_kernel<
+                    scalar_t,
+                    /* vec_size */ 2,
+                    n_loop_unroll,
+                    n_loop_unroll_tail,
+                    KV_M_MAX,
+                    compute_t>
+              : Q_size_k_alignment_necessary == 1
+              ? efficient_attention_forward_decoder_splitk_ck_kernel<
+                    scalar_t,
+                    /* vec_size */ 1,
+                    n_loop_unroll,
+                    n_loop_unroll_tail,
+                    KV_M_MAX,
+                    compute_t>
+              : nullptr,
           argp->grid_dim,
           argp->block_dim,
           argp->lds_bytes,
@@ -679,14 +679,14 @@ struct FMHADecoderSplitKDeviceOp : public BaseOperator {
                     scalar_t,
                     4>
               : Q_size_k_alignment_necessary == 2
-                  ? efficient_attention_forward_decoder_splitk_reduce_ck_kernel<
-                        scalar_t,
-                        2>
-                  : Q_size_k_alignment_necessary == 1
-                      ? efficient_attention_forward_decoder_splitk_reduce_ck_kernel<
-                            scalar_t,
-                            1>
-                      : nullptr,
+              ? efficient_attention_forward_decoder_splitk_reduce_ck_kernel<
+                    scalar_t,
+                    2>
+              : Q_size_k_alignment_necessary == 1
+              ? efficient_attention_forward_decoder_splitk_reduce_ck_kernel<
+                    scalar_t,
+                    1>
+              : nullptr,
           reduce_gridsize,
           reduce_blocksize,
           reduce_lds_bytes,
diff --git a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_forward.h b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_forward.h
index 802d2faea..2fa305e0a 100644
--- a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_forward.h
+++ b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_forward.h
@@ -50,8 +50,9 @@ struct grouped_forward_causalmask_bias_dropout_dispatch {
 
       using FmhaFwdShape_ = FmhaFwdShape<MaxK>;
 
-      constexpr ck_tile::index_t occupancy =
-          (MaxK == 64) ? 3 : (MaxK == 256) ? 1 : 2;
+      constexpr ck_tile::index_t occupancy = (MaxK == 64) ? 3
+          : (MaxK == 256)                                 ? 1
+                                                          : 2;
 
       constexpr auto kBiasEnum = kHasBias
           ? ck_tile::BlockAttentionBiasEnum::ELEMENTWISE_BIAS

From b2fb213edc59453df09d3318083c6f6e353ea5c0 Mon Sep 17 00:00:00 2001
From: Qianfeng Zhang <Qianfeng.Zhang@amd.com>
Date: Tue, 18 Jun 2024 03:09:26 +0000
Subject: [PATCH 557/641] Fix format in python scripts

---
 tests/test_mem_eff_attention.py               |  11 +-
 .../attention/hip_fmha/generate_instances.py  | 191 +++++++++---------
 xformers/ops/fmha/ck.py                       |   1 +
 xformers/ops/fmha/dispatch.py                 |   4 +-
 4 files changed, 109 insertions(+), 98 deletions(-)

diff --git a/tests/test_mem_eff_attention.py b/tests/test_mem_eff_attention.py
index 16a4b361c..0bb112a6e 100644
--- a/tests/test_mem_eff_attention.py
+++ b/tests/test_mem_eff_attention.py
@@ -266,6 +266,7 @@ def _generate_op_device_dtype_biasT_B_Mq_Mkv_H_K_Kv(
     **_generate_op_device_dtype_biasT_B_Mq_Mkv_H_K_Kv(ALL_BW_OPS, max_shapes_per_op=1),
 )
 
+
 def ref_attention_splitk_bmhk(
     q, k, v, attn_bias, scale=None, split_k=None, dtype=None
 ) -> torch.Tensor:
@@ -970,7 +971,7 @@ def test_backward(
         op_fw = fmha.ck.FwOp
         if dtype == torch.bfloat16:
             pytest.skip("CK Fmha backward for bfloat16 currently is not very accurate for some cases!")
-        if grad_out_contiguous == False:
+        if grad_out_contiguous is False:
             pytest.skip("CK Fmha does not support contiguous layout for grad_out!")
         if k % 2 != 0:
             pytest.skip("CK Fmha currently requires the headdim size of query input be an even value!")
@@ -1142,9 +1143,9 @@ def test_dropout(op, q_len, kv_len, batch_size, k_len, p, seed, attn_bias):
     device = "cuda"
     scale = 3
 
-    dtype=torch.float
+    dtype = torch.float
     if torch.version.hip and op == fmha.ck.FwOp:
-        dtype=torch.float16
+        dtype = torch.float16
 
     query = torch.randn((batch_size, q_len, k_len), device=device, dtype=dtype) * scale
     key = torch.randn((batch_size, kv_len, k_len), device=device, dtype=dtype) * scale
@@ -1294,7 +1295,8 @@ def test_dropout_backward_cutlass(dt, q_len, kv_len, batch_size, k, p):
         dtype={"f16": torch.float16, "bf16": torch.bfloat16, "f32": torch.float32}[dt],
     )
 
-cuda_only
+
+@cuda_only
 @pytest.mark.parametrize("p", [0.000001, 0.3, 0.7])
 @pytest.mark.parametrize("k", [16, 64, 128])
 @pytest.mark.parametrize("batch_size", [1, 2])
@@ -1312,6 +1314,7 @@ def test_dropout_backward_ck(dt, q_len, kv_len, batch_size, k, p):
         dtype={"f16": torch.float16, "bf16": torch.bfloat16, "f32": torch.float32}[dt],
     )
 
+
 @cuda_only
 @disable_tf32
 @disable_on_rocm
diff --git a/xformers/csrc/attention/hip_fmha/generate_instances.py b/xformers/csrc/attention/hip_fmha/generate_instances.py
index 9640752fa..de304bf7c 100644
--- a/xformers/csrc/attention/hip_fmha/generate_instances.py
+++ b/xformers/csrc/attention/hip_fmha/generate_instances.py
@@ -18,7 +18,7 @@
  */
 """
 
-FMHA_INFER_INSTANCE_TEMPLATE="""
+FMHA_INFER_INSTANCE_TEMPLATE = """
 #include <ck_tile/core/numeric/{dtype_file}.hpp>
 #include \"ck_tiled_fmha_{mode}_infer.h\"
 
@@ -30,9 +30,10 @@
     {max_k}>({cap_mode}ForwardParams& param, hipStream_t stream);
 """
 
-FMHA_INFER_INSTANCE_FNAME="fmha_{mode}_infer_{dtype_str}_{has_or_no_causalmask_str}_{has_or_no_bias_str}_{has_or_no_dropout_str}_{max_k_str}.cpp"
+FMHA_INFER_INSTANCE_FNAME = "fmha_{mode}_infer_{dtype_str}_{has_or_no_causalmask_str}_"\
+                            "{has_or_no_bias_str}_{has_or_no_dropout_str}_{max_k_str}.cpp"
 
-FMHA_FORWARD_INSTANCE_TEMPLATE="""
+FMHA_FORWARD_INSTANCE_TEMPLATE = """
 #include <ck_tile/core/numeric/{dtype_file}.hpp>
 #include \"ck_tiled_fmha_{mode}_forward.h\"
 
@@ -44,9 +45,10 @@
     {max_k}>({cap_mode}ForwardParams& param, hipStream_t stream);
 """
 
-FMHA_FORWARD_INSTANCE_FNAME="fmha_{mode}_forward_{dtype_str}_{has_or_no_causalmask_str}_{has_or_no_bias_str}_{has_or_no_dropout_str}_{max_k_str}.cpp"
+FMHA_FORWARD_INSTANCE_FNAME = "fmha_{mode}_forward_{dtype_str}_{has_or_no_causalmask_str}_"\
+                              "{has_or_no_bias_str}_{has_or_no_dropout_str}_{max_k_str}.cpp"
 
-FMHA_BACKWARD_INSTANCE_TEMPLATE="""
+FMHA_BACKWARD_INSTANCE_TEMPLATE = """
 #include <ck_tile/core/numeric/{dtype_file}.hpp>
 #include \"ck_tiled_fmha_{mode}_backward.h\"
 
@@ -59,7 +61,8 @@
     {max_k}>({cap_mode}BackwardParams& param, hipStream_t stream);
 """
 
-FMHA_BACKWARD_INSTANCE_FNAME="fmha_{mode}_backward_{dtype_str}_{has_or_no_causalmask_str}_{has_or_no_bias_str}_{has_or_no_biasgrad_str}_{has_or_no_dropout_str}_{max_k_str}.cpp"
+FMHA_BACKWARD_INSTANCE_FNAME = "fmha_{mode}_backward_{dtype_str}_{has_or_no_causalmask_str}_"\
+                               "{has_or_no_bias_str}_{has_or_no_biasgrad_str}_{has_or_no_dropout_str}_{max_k_str}.cpp"
 
 BOOL_MAP = {
     True : "true",
@@ -72,17 +75,17 @@
 }
 
 BOOL_MAP_BIAS = {
-    True :  "has_bias",
-    False : "no_bias", 
+    True : "has_bias",
+    False : "no_bias",
 }
 
 BOOL_MAP_BIASGRAD = {
-    True : "has_biasgrad", 
+    True : "has_biasgrad",
     False : "no_biasgrad",
 }
 
 BOOL_MAP_DROPOUT = {
-    True : "has_dropout", 
+    True : "has_dropout",
     False : "no_dropout",
 }
 
@@ -94,102 +97,106 @@
 }
 
 TYPE_CTYPE_MAP = {
-   "fp16" : "ck_tile::fp16_t", 
-   "bf16" : "ck_tile::bf16_t",
+    "fp16" : "ck_tile::fp16_t",
+    "bf16" : "ck_tile::bf16_t",
 }
 
 TYPE_FNAME_MAP = {
-   "fp16" : "half", 
-   "bf16" : "bfloat16",
+    "fp16" : "half",
+    "bf16" : "bfloat16",
 }
 
 MODE_NAME_MAP = {
     "batched" : "Batched",
-    "grouped" : "Grouped", 
+    "grouped" : "Grouped",
 }
 
+
 def create_infer_instances(instance_dir: Path) -> None:
-     for mode in ["batched", "grouped"]: 
-         for dtype in ["fp16", "bf16"]: 
-             for has_causalmask in [True, False]: 
-                 for has_bias in [True, False]:
-                     for has_dropout in [True, False]:
-                         for max_k in [32, 64, 128, 256]: 
-                              fname = FMHA_INFER_INSTANCE_FNAME.format(
-                                      mode=mode,
-                                      dtype_str=dtype, 
-                                      has_or_no_causalmask_str=BOOL_MAP_CAUSALMASK[has_causalmask],
-                                      has_or_no_bias_str=BOOL_MAP_BIAS[has_bias],
-                                      has_or_no_dropout_str=BOOL_MAP_DROPOUT[has_dropout],
-                                      max_k_str=INT_MAP_MAX_K[max_k],
-                                      )
-                              infer_instance = FMHA_INFER_INSTANCE_TEMPLATE.format(
-                                      mode=mode,
-                                      dtype_file=TYPE_FNAME_MAP[dtype],
-                                      dtype=TYPE_CTYPE_MAP[dtype], 
-                                      has_causalmask=BOOL_MAP[has_causalmask],
-                                      has_bias=BOOL_MAP[has_bias],
-                                      has_dropout=BOOL_MAP[has_dropout],
-                                      max_k=max_k,
-                                      cap_mode=MODE_NAME_MAP[mode], 
-                                      )
-                              (instance_dir / fname).write_text(FMHA_INSTANCE_HEADER + infer_instance) 
+    for mode in ["batched", "grouped"]:
+        for dtype in ["fp16", "bf16"]:
+            for has_causalmask in [True, False]:
+                for has_bias in [True, False]:
+                    for has_dropout in [True, False]:
+                        for max_k in [32, 64, 128, 256]:
+                            fname = FMHA_INFER_INSTANCE_FNAME.format(
+                                mode=mode,
+                                dtype_str=dtype,
+                                has_or_no_causalmask_str=BOOL_MAP_CAUSALMASK[has_causalmask],
+                                has_or_no_bias_str=BOOL_MAP_BIAS[has_bias],
+                                has_or_no_dropout_str=BOOL_MAP_DROPOUT[has_dropout],
+                                max_k_str=INT_MAP_MAX_K[max_k],
+                            )
+                            infer_instance = FMHA_INFER_INSTANCE_TEMPLATE.format(
+                                mode=mode,
+                                dtype_file=TYPE_FNAME_MAP[dtype],
+                                dtype=TYPE_CTYPE_MAP[dtype],
+                                has_causalmask=BOOL_MAP[has_causalmask],
+                                has_bias=BOOL_MAP[has_bias],
+                                has_dropout=BOOL_MAP[has_dropout],
+                                max_k=max_k,
+                                cap_mode=MODE_NAME_MAP[mode],
+                            )
+                            (instance_dir / fname).write_text(FMHA_INSTANCE_HEADER + infer_instance)
+
 
 def create_forward_instances(instance_dir: Path) -> None:
-     for mode in ["batched", "grouped"]:
-         for dtype in ["fp16", "bf16"]:
-             for has_causalmask in [True, False]:
-                 for has_bias in [True, False]:
-                     for has_dropout in [True, False]:
-                         for max_k in [32, 64, 128, 256]:
-                              fname = FMHA_FORWARD_INSTANCE_FNAME.format(
-                                      mode=mode,
-                                      dtype_str=dtype,
-                                      has_or_no_causalmask_str=BOOL_MAP_CAUSALMASK[has_causalmask],
-                                      has_or_no_bias_str=BOOL_MAP_BIAS[has_bias],
-                                      has_or_no_dropout_str=BOOL_MAP_DROPOUT[has_dropout],
-                                      max_k_str=INT_MAP_MAX_K[max_k],
-                                      )
-                              infer_instance = FMHA_FORWARD_INSTANCE_TEMPLATE.format(
-                                      mode=mode,
-                                      dtype_file=TYPE_FNAME_MAP[dtype],
-                                      dtype=TYPE_CTYPE_MAP[dtype],
-                                      has_causalmask=BOOL_MAP[has_causalmask],
-                                      has_bias=BOOL_MAP[has_bias],
-                                      has_dropout=BOOL_MAP[has_dropout],
-                                      max_k=max_k,
-                                      cap_mode=MODE_NAME_MAP[mode],
-                                      )
-                              (instance_dir / fname).write_text(FMHA_INSTANCE_HEADER + infer_instance)
+    for mode in ["batched", "grouped"]:
+        for dtype in ["fp16", "bf16"]:
+            for has_causalmask in [True, False]:
+                for has_bias in [True, False]:
+                    for has_dropout in [True, False]:
+                        for max_k in [32, 64, 128, 256]:
+                            fname = FMHA_FORWARD_INSTANCE_FNAME.format(
+                                mode=mode,
+                                dtype_str=dtype,
+                                has_or_no_causalmask_str=BOOL_MAP_CAUSALMASK[has_causalmask],
+                                has_or_no_bias_str=BOOL_MAP_BIAS[has_bias],
+                                has_or_no_dropout_str=BOOL_MAP_DROPOUT[has_dropout],
+                                max_k_str=INT_MAP_MAX_K[max_k],
+                            )
+                            infer_instance = FMHA_FORWARD_INSTANCE_TEMPLATE.format(
+                                mode=mode,
+                                dtype_file=TYPE_FNAME_MAP[dtype],
+                                dtype=TYPE_CTYPE_MAP[dtype],
+                                has_causalmask=BOOL_MAP[has_causalmask],
+                                has_bias=BOOL_MAP[has_bias],
+                                has_dropout=BOOL_MAP[has_dropout],
+                                max_k=max_k,
+                                cap_mode=MODE_NAME_MAP[mode],
+                            )
+                            (instance_dir / fname).write_text(FMHA_INSTANCE_HEADER + infer_instance)
+
 
 def create_backward_instances(instance_dir: Path) -> None:
-     for mode in ["batched", "grouped"]:
-         for dtype in ["fp16", "bf16"]:
-             for has_causalmask in [True, False]:
-                 for has_bias, has_bias_grad in [[True, False], [True, True], [False, False]]:
-                     for has_dropout in [True, False]:
-                         for max_k in [32, 64, 128]:
-                              fname = FMHA_BACKWARD_INSTANCE_FNAME.format(
-                                      mode=mode,
-                                      dtype_str=dtype,
-                                      has_or_no_causalmask_str=BOOL_MAP_CAUSALMASK[has_causalmask],
-                                      has_or_no_bias_str=BOOL_MAP_BIAS[has_bias],
-                                      has_or_no_biasgrad_str=BOOL_MAP_BIASGRAD[has_bias_grad],
-                                      has_or_no_dropout_str=BOOL_MAP_DROPOUT[has_dropout],
-                                      max_k_str=INT_MAP_MAX_K[max_k],
-                                      )
-                              infer_instance = FMHA_BACKWARD_INSTANCE_TEMPLATE.format(
-                                      mode=mode,
-                                      dtype_file=TYPE_FNAME_MAP[dtype],
-                                      dtype=TYPE_CTYPE_MAP[dtype],
-                                      has_causalmask=BOOL_MAP[has_causalmask],
-                                      has_bias=BOOL_MAP[has_bias],
-                                      has_bias_grad=BOOL_MAP[has_bias_grad],
-                                      has_dropout=BOOL_MAP[has_dropout],
-                                      max_k=max_k,
-                                      cap_mode=MODE_NAME_MAP[mode],
-                                      )
-                              (instance_dir / fname).write_text(FMHA_INSTANCE_HEADER + infer_instance)
+    for mode in ["batched", "grouped"]:
+        for dtype in ["fp16", "bf16"]:
+            for has_causalmask in [True, False]:
+                for has_bias, has_bias_grad in [[True, False], [True, True], [False, False]]:
+                    for has_dropout in [True, False]:
+                        for max_k in [32, 64, 128]:
+                            fname = FMHA_BACKWARD_INSTANCE_FNAME.format(
+                                mode=mode,
+                                dtype_str=dtype,
+                                has_or_no_causalmask_str=BOOL_MAP_CAUSALMASK[has_causalmask],
+                                has_or_no_bias_str=BOOL_MAP_BIAS[has_bias],
+                                has_or_no_biasgrad_str=BOOL_MAP_BIASGRAD[has_bias_grad],
+                                has_or_no_dropout_str=BOOL_MAP_DROPOUT[has_dropout],
+                                max_k_str=INT_MAP_MAX_K[max_k],
+                            )
+                            infer_instance = FMHA_BACKWARD_INSTANCE_TEMPLATE.format(
+                                mode=mode,
+                                dtype_file=TYPE_FNAME_MAP[dtype],
+                                dtype=TYPE_CTYPE_MAP[dtype],
+                                has_causalmask=BOOL_MAP[has_causalmask],
+                                has_bias=BOOL_MAP[has_bias],
+                                has_bias_grad=BOOL_MAP[has_bias_grad],
+                                has_dropout=BOOL_MAP[has_dropout],
+                                max_k=max_k,
+                                cap_mode=MODE_NAME_MAP[mode],
+                            )
+                            (instance_dir / fname).write_text(FMHA_INSTANCE_HEADER + infer_instance)
+
 
 if __name__ == "__main__":
     this_dir = os.path.dirname(__file__)
diff --git a/xformers/ops/fmha/ck.py b/xformers/ops/fmha/ck.py
index 79780e093..5d94ff5a2 100644
--- a/xformers/ops/fmha/ck.py
+++ b/xformers/ops/fmha/ck.py
@@ -73,6 +73,7 @@ def _get_tensor_bias(
         return attn_bias
     return None
 
+
 def _check_bias_alignment(
     reasons: List[str], attn_bias: Optional[Union[torch.Tensor, AttentionBias]]
 ) -> None:
diff --git a/xformers/ops/fmha/dispatch.py b/xformers/ops/fmha/dispatch.py
index 8c5f6967e..f10bdb819 100644
--- a/xformers/ops/fmha/dispatch.py
+++ b/xformers/ops/fmha/dispatch.py
@@ -139,8 +139,8 @@ def _dispatch_bw(inp: Inputs) -> Type[AttentionBwOpBase]:
         ]
     else:
         priority_list_ops = [
-                ck.BwOp,
-            ]
+            ck.BwOp,
+        ]
 
     if torch.version.cuda and _is_cutlassB_faster_than_flash(inp):
         priority_list_ops.remove(cutlass.BwOp)

From fdf8b8ef3096b6a85f5a38759deddb7b55a7d0d7 Mon Sep 17 00:00:00 2001
From: Qianfeng Zhang <Qianfeng.Zhang@amd.com>
Date: Tue, 18 Jun 2024 17:50:00 +0000
Subject: [PATCH 558/641] Add noqa: C801 for generate_instances.py

---
 xformers/csrc/attention/hip_fmha/generate_instances.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/xformers/csrc/attention/hip_fmha/generate_instances.py b/xformers/csrc/attention/hip_fmha/generate_instances.py
index de304bf7c..4abd46ec5 100644
--- a/xformers/csrc/attention/hip_fmha/generate_instances.py
+++ b/xformers/csrc/attention/hip_fmha/generate_instances.py
@@ -1,3 +1,4 @@
+# noqa: C801
 # Copyright (c) 2023-2024, Advanced Micro Devices, Inc. All rights reserved.
 #
 # This source code is licensed under the BSD-style license found in the

From 633a16103020aaf014f5fe98c3f87f00ba0b18be Mon Sep 17 00:00:00 2001
From: Qianfeng Zhang <Qianfeng.Zhang@amd.com>
Date: Wed, 19 Jun 2024 08:42:55 +0000
Subject: [PATCH 559/641] Align dispatch_bw with main branch

---
 xformers/ops/fmha/dispatch.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/xformers/ops/fmha/dispatch.py b/xformers/ops/fmha/dispatch.py
index f10bdb819..dfa769b1b 100644
--- a/xformers/ops/fmha/dispatch.py
+++ b/xformers/ops/fmha/dispatch.py
@@ -127,7 +127,7 @@ def _is_cutlassB_faster_than_flash(inp: Inputs) -> bool:
     return False
 
 
-def _dispatch_bw(inp: Inputs) -> Type[AttentionBwOpBase]:
+def _dispatch_bw(inp: Inputs, is_unpadded_lse: bool = False) -> Type[AttentionBwOpBase]:
     if torch.version.cuda:
         priority_list_ops: List[Type[AttentionBwOpBase]] = [
             flash.BwOp,
@@ -142,6 +142,8 @@ def _dispatch_bw(inp: Inputs) -> Type[AttentionBwOpBase]:
             ck.BwOp,
         ]
 
+    if is_unpadded_lse:
+        priority_list_ops = [op for op in priority_list_ops if op.SUPPORTS_UNPADDED_LSE]
     if torch.version.cuda and _is_cutlassB_faster_than_flash(inp):
         priority_list_ops.remove(cutlass.BwOp)
         priority_list_ops.insert(0, cutlass.BwOp)

From 00cf683aabdb7cb81196fec0af044dc6eb769860 Mon Sep 17 00:00:00 2001
From: Qianfeng Zhang <Qianfeng.Zhang@amd.com>
Date: Wed, 19 Jun 2024 22:12:25 +0000
Subject: [PATCH 560/641] Align ops/fmha/common.py with main branch

---
 xformers/ops/fmha/common.py | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/xformers/ops/fmha/common.py b/xformers/ops/fmha/common.py
index cbcb3c447..734c44d01 100644
--- a/xformers/ops/fmha/common.py
+++ b/xformers/ops/fmha/common.py
@@ -192,13 +192,11 @@ def validate_inputs(self) -> None:
                 and self.value.shape == (B, Mkv, Kv)
             )
         H = self.query.shape[-2]
-        Hkv = self.key.shape[-2]
         if self.query.ndim == 4:  # BMHK
             valid_shapes = (
                 self.query.shape == (B, Mq, H, K)
-                and self.key.shape == (B, Mkv, Hkv, key_embed_dim)
-                and self.value.shape == (B, Mkv, Hkv, Kv)
-                and H % Hkv == 0
+                and self.key.shape == (B, Mkv, H, key_embed_dim)
+                and self.value.shape == (B, Mkv, H, Kv)
             )
         G = self.query.shape[2]
         if self.query.ndim == 5:  # BMNHK

From 252844dd514ace1c96f669fb8303ef35b9d79b26 Mon Sep 17 00:00:00 2001
From: Qianfeng Zhang <Qianfeng.Zhang@amd.com>
Date: Thu, 20 Jun 2024 14:53:57 +0000
Subject: [PATCH 561/641] Synchronize the thirty-party/composable_kernel_tiled
 to latest ck_tile commits for better performance

---
 third_party/composable_kernel_tiled | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/third_party/composable_kernel_tiled b/third_party/composable_kernel_tiled
index 37a347e38..e3f44659c 160000
--- a/third_party/composable_kernel_tiled
+++ b/third_party/composable_kernel_tiled
@@ -1 +1 @@
-Subproject commit 37a347e3807198400d6ee1c8401f7c2cbb1d426e
+Subproject commit e3f44659cf77df8c3de15eb14baffd58be6ac550

From 610909edef7c73f7ef6a19adfaaa7164bb6ce728 Mon Sep 17 00:00:00 2001
From: Qianfeng Zhang <Qianfeng.Zhang@amd.com>
Date: Thu, 20 Jun 2024 14:56:11 +0000
Subject: [PATCH 562/641] Relax the atol for test_forward and test_dropout due
 to the using of packed fp16_2_fp32 conversion in ck_tile

---
 tests/test_mem_eff_attention.py | 2 +-
 xformers/ops/fmha/ck.py         | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/tests/test_mem_eff_attention.py b/tests/test_mem_eff_attention.py
index 0bb112a6e..b2bd691ac 100644
--- a/tests/test_mem_eff_attention.py
+++ b/tests/test_mem_eff_attention.py
@@ -1175,7 +1175,7 @@ def test_dropout(op, q_len, kv_len, batch_size, k_len, p, seed, attn_bias):
     if dtype is torch.float:
         assert_allclose(out, ref, atol=2e-4), f"{(out - ref).abs().max()}"
     else:
-        assert_allclose(out.float(), ref, atol=2.2e-2), f"{(out - ref).abs().max()}"
+        assert_allclose(out.float(), ref, atol=2.8e-2), f"{(out - ref).abs().max()}"
 
     num_trials = 1000
     p_val_tol = 1e-6
diff --git a/xformers/ops/fmha/ck.py b/xformers/ops/fmha/ck.py
index 5d94ff5a2..39a089553 100644
--- a/xformers/ops/fmha/ck.py
+++ b/xformers/ops/fmha/ck.py
@@ -183,7 +183,7 @@ class FwOp(AttentionFwOpBase):
 
     ERROR_ATOL: Mapping[torch.dtype, float] = {
         torch.float: 3e-4,
-        torch.half: 4e-3,
+        torch.half: 6e-3,
         torch.bfloat16: 2.8e-2,
     }
     ERROR_RTOL: Mapping[torch.dtype, float] = {

From 10bf99c85c0d8936af47cefdc58307fae2603493 Mon Sep 17 00:00:00 2001
From: Max Podkorytov <4273004+tenpercent@users.noreply.github.com>
Date: Mon, 1 Jul 2024 09:43:20 -0700
Subject: [PATCH 563/641] Generate html report for tests run with rocm_ci.yml

---
 .github/workflows/rocm_ci.yml | 10 +++-------
 1 file changed, 3 insertions(+), 7 deletions(-)

diff --git a/.github/workflows/rocm_ci.yml b/.github/workflows/rocm_ci.yml
index 904234505..06de7d970 100644
--- a/.github/workflows/rocm_ci.yml
+++ b/.github/workflows/rocm_ci.yml
@@ -63,24 +63,20 @@ jobs:
         pip3 install --upgrade pip
         pip3 uninstall -y xformers
         MAX_JOBS=$MAX_JOBS pip3 install -e ./_xformers --verbose
-        pip3 install scipy==1.10
+        pip3 install scipy==1.10 pytest-html
 
         python3 -c "import torch; print(f'PyTorch version {torch.__version__}')"
         python3 -m xformers.info
 
     - name: Run python tests
       run: |
-        pytest -rpfs ./_xformers/tests/test_mem_eff_attention.py | tee test_mem_eff_attention.log
+        pytest --html=test_mem_eff_attention.html --self-contained-html -rpfs ./_xformers/tests/test_mem_eff_attention.py 
     
     - name: Archive logs
       uses: actions/upload-artifact@v4
       with:
         name: test results
-        path: test_mem_eff_attention.log
-
-    - name: Process test results
-      run: |
-        echo "Processing test results TBD"
+        path: test_mem_eff_attention.html
 
   clean:
     runs-on: self-hosted

From 16bb10b0a9359aa3ab82410343aa0f4424aa8e6b Mon Sep 17 00:00:00 2001
From: Max Podkorytov <4273004+tenpercent@users.noreply.github.com>
Date: Mon, 1 Jul 2024 10:31:20 -0700
Subject: [PATCH 564/641] archive test results when tests have failed

---
 .github/workflows/rocm_ci.yml | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/.github/workflows/rocm_ci.yml b/.github/workflows/rocm_ci.yml
index 06de7d970..2bee8b788 100644
--- a/.github/workflows/rocm_ci.yml
+++ b/.github/workflows/rocm_ci.yml
@@ -73,6 +73,7 @@ jobs:
         pytest --html=test_mem_eff_attention.html --self-contained-html -rpfs ./_xformers/tests/test_mem_eff_attention.py 
     
     - name: Archive logs
+      if: '!cancelled()'
       uses: actions/upload-artifact@v4
       with:
         name: test results
@@ -83,5 +84,6 @@ jobs:
     needs: [build]
     steps:
       - name: Remove dangling Docker images
+        if: 'always()'
         run: |
           docker images -q -f dangling=true | xargs --no-run-if-empty docker rmi

From 29c782bc5c3f6fd41e8c4ef35abb5bacc0357efb Mon Sep 17 00:00:00 2001
From: Max Podkorytov <4273004+tenpercent@users.noreply.github.com>
Date: Mon, 1 Jul 2024 11:19:13 -0700
Subject: [PATCH 565/641] Always clean up dangling docker images in rocm_ci

---
 .github/workflows/rocm_ci.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/rocm_ci.yml b/.github/workflows/rocm_ci.yml
index 2bee8b788..c840a1708 100644
--- a/.github/workflows/rocm_ci.yml
+++ b/.github/workflows/rocm_ci.yml
@@ -81,9 +81,9 @@ jobs:
 
   clean:
     runs-on: self-hosted
+    if: ${{ always() }}
     needs: [build]
     steps:
       - name: Remove dangling Docker images
-        if: 'always()'
         run: |
           docker images -q -f dangling=true | xargs --no-run-if-empty docker rmi

From 782d5a316ccc2bdcf57ed9bb301e692128e5521a Mon Sep 17 00:00:00 2001
From: Max Podkorytov <4273004+tenpercent@users.noreply.github.com>
Date: Mon, 1 Jul 2024 17:44:47 -0700
Subject: [PATCH 566/641] Bump python to 3.11 in rocm_ci.yml

---
 .github/workflows/rocm_ci.yml | 24 ++++++++++++++----------
 1 file changed, 14 insertions(+), 10 deletions(-)

diff --git a/.github/workflows/rocm_ci.yml b/.github/workflows/rocm_ci.yml
index c840a1708..d8128b370 100644
--- a/.github/workflows/rocm_ci.yml
+++ b/.github/workflows/rocm_ci.yml
@@ -55,18 +55,22 @@ jobs:
         hipcc --version
         rocm-smi
         rocminfo | grep "gfx"
-
-        python3 -VV
-
-    - name: Build XFormers
+        
+    - name: Setup build env
       run: |
-        pip3 install --upgrade pip
-        pip3 uninstall -y xformers
-        MAX_JOBS=$MAX_JOBS pip3 install -e ./_xformers --verbose
-        pip3 install scipy==1.10 pytest-html
+        conda create -n xformers python=3.11
+        conda activate xformers
+        python -VV
+        
+        python -m pip install -U torch --index-url=https://download.pytorch.org/whl/nightly/rocm6.1 
+        python -c "import torch; print(f'PyTorch version {torch.__version__}')"
+        
+        python -m pip install ninja scipy pytest pytest-html
 
-        python3 -c "import torch; print(f'PyTorch version {torch.__version__}')"
-        python3 -m xformers.info
+    - name: Build xformers
+      run: |
+        MAX_JOBS=$MAX_JOBS python setup.py install
+        python -m xformers.info
 
     - name: Run python tests
       run: |

From bd8ca1b4590fe54b71c58cc806e9b7abcf9a3839 Mon Sep 17 00:00:00 2001
From: Max Podkorytov <4273004+tenpercent@users.noreply.github.com>
Date: Mon, 1 Jul 2024 17:48:01 -0700
Subject: [PATCH 567/641] Disable flash attention tests rocm_ci.yml

Since the op is broken; tbd either make the op work, or disable it on ROCm
---
 .github/workflows/rocm_ci.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/rocm_ci.yml b/.github/workflows/rocm_ci.yml
index d8128b370..91b87d4ca 100644
--- a/.github/workflows/rocm_ci.yml
+++ b/.github/workflows/rocm_ci.yml
@@ -74,7 +74,7 @@ jobs:
 
     - name: Run python tests
       run: |
-        pytest --html=test_mem_eff_attention.html --self-contained-html -rpfs ./_xformers/tests/test_mem_eff_attention.py 
+        pytest --html=test_mem_eff_attention.html --self-contained-html -rpfs ./_xformers/tests/test_mem_eff_attention.py -k "not flshatt"
     
     - name: Archive logs
       if: '!cancelled()'

From 77beb1978b8cfc1bb45f13535a009c239e742970 Mon Sep 17 00:00:00 2001
From: Max Podkorytov <4273004+tenpercent@users.noreply.github.com>
Date: Mon, 1 Jul 2024 17:52:13 -0700
Subject: [PATCH 568/641] Try to fix rocm_ci.yml

Init must be called before activation
---
 .github/workflows/rocm_ci.yml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.github/workflows/rocm_ci.yml b/.github/workflows/rocm_ci.yml
index 91b87d4ca..8ad8c47b0 100644
--- a/.github/workflows/rocm_ci.yml
+++ b/.github/workflows/rocm_ci.yml
@@ -59,6 +59,7 @@ jobs:
     - name: Setup build env
       run: |
         conda create -n xformers python=3.11
+        conda init
         conda activate xformers
         python -VV
         

From b0ae70734df8f668822cddd367cec63bf311a457 Mon Sep 17 00:00:00 2001
From: Max Podkorytov <4273004+tenpercent@users.noreply.github.com>
Date: Mon, 1 Jul 2024 18:09:06 -0700
Subject: [PATCH 569/641] try to fix rocm_ci.yml flow by overriding PATH

---
 .github/workflows/rocm_ci.yml | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/.github/workflows/rocm_ci.yml b/.github/workflows/rocm_ci.yml
index 8ad8c47b0..1954b0be2 100644
--- a/.github/workflows/rocm_ci.yml
+++ b/.github/workflows/rocm_ci.yml
@@ -59,8 +59,7 @@ jobs:
     - name: Setup build env
       run: |
         conda create -n xformers python=3.11
-        conda init
-        conda activate xformers
+        export PATH=/opt/conda/envs/xformers/bin:$PATH
         python -VV
         
         python -m pip install -U torch --index-url=https://download.pytorch.org/whl/nightly/rocm6.1 

From d2eeaf097195eb563152998fc4425e251393a108 Mon Sep 17 00:00:00 2001
From: Max Podkorytov <4273004+tenpercent@users.noreply.github.com>
Date: Mon, 1 Jul 2024 18:53:03 -0700
Subject: [PATCH 570/641] Fix setup.py path in rocm_ci.yml

---
 .github/workflows/rocm_ci.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/rocm_ci.yml b/.github/workflows/rocm_ci.yml
index 1954b0be2..935b5b76a 100644
--- a/.github/workflows/rocm_ci.yml
+++ b/.github/workflows/rocm_ci.yml
@@ -69,7 +69,7 @@ jobs:
 
     - name: Build xformers
       run: |
-        MAX_JOBS=$MAX_JOBS python setup.py install
+        MAX_JOBS=$MAX_JOBS python ./_xformers/setup.py install
         python -m xformers.info
 
     - name: Run python tests

From a62c93ef7a39841285df4f18a3e904f6d16f65f4 Mon Sep 17 00:00:00 2001
From: Max Podkorytov <4273004+tenpercent@users.noreply.github.com>
Date: Mon, 1 Jul 2024 18:57:07 -0700
Subject: [PATCH 571/641] cd to xformers dir before running install in
 rocm_ci.yml

---
 .github/workflows/rocm_ci.yml | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/rocm_ci.yml b/.github/workflows/rocm_ci.yml
index 935b5b76a..0e2bf28d7 100644
--- a/.github/workflows/rocm_ci.yml
+++ b/.github/workflows/rocm_ci.yml
@@ -69,12 +69,13 @@ jobs:
 
     - name: Build xformers
       run: |
-        MAX_JOBS=$MAX_JOBS python ./_xformers/setup.py install
+        cd _xformers
+        MAX_JOBS=$MAX_JOBS python setup.py install
         python -m xformers.info
 
     - name: Run python tests
       run: |
-        pytest --html=test_mem_eff_attention.html --self-contained-html -rpfs ./_xformers/tests/test_mem_eff_attention.py -k "not flshatt"
+        pytest --html=test_mem_eff_attention.html --self-contained-html -rpfs ./tests/test_mem_eff_attention.py -k "not flshatt"
     
     - name: Archive logs
       if: '!cancelled()'

From d3ae25f2d6080cc7008c8318f96bc5834951dde1 Mon Sep 17 00:00:00 2001
From: Max Podkorytov <4273004+tenpercent@users.noreply.github.com>
Date: Mon, 1 Jul 2024 19:24:24 -0700
Subject: [PATCH 572/641] Use pip to install xformers in rocm_ci.yml

---
 .github/workflows/rocm_ci.yml | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/.github/workflows/rocm_ci.yml b/.github/workflows/rocm_ci.yml
index 0e2bf28d7..6ca6a890e 100644
--- a/.github/workflows/rocm_ci.yml
+++ b/.github/workflows/rocm_ci.yml
@@ -69,13 +69,12 @@ jobs:
 
     - name: Build xformers
       run: |
-        cd _xformers
-        MAX_JOBS=$MAX_JOBS python setup.py install
+        pip install ./_xformers --verbose
         python -m xformers.info
 
     - name: Run python tests
       run: |
-        pytest --html=test_mem_eff_attention.html --self-contained-html -rpfs ./tests/test_mem_eff_attention.py -k "not flshatt"
+        pytest --html=test_mem_eff_attention.html --self-contained-html -rpfs ./_xformers/tests/test_mem_eff_attention.py -k "not flshatt"
     
     - name: Archive logs
       if: '!cancelled()'

From d4e6abc9e53b4af5fb40750e111e9e2e624fa7b1 Mon Sep 17 00:00:00 2001
From: Max Podkorytov <4273004+tenpercent@users.noreply.github.com>
Date: Mon, 1 Jul 2024 19:51:27 -0700
Subject: [PATCH 573/641] Possibly fix python version resolution in rocm_ci.yml

---
 .github/workflows/rocm_ci.yml | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/rocm_ci.yml b/.github/workflows/rocm_ci.yml
index 6ca6a890e..ef1336ce3 100644
--- a/.github/workflows/rocm_ci.yml
+++ b/.github/workflows/rocm_ci.yml
@@ -69,12 +69,16 @@ jobs:
 
     - name: Build xformers
       run: |
-        pip install ./_xformers --verbose
+        export PATH=/opt/conda/envs/xformers/bin:$PATH
+        export MAX_JOBS=64
+        echo PATH = $PATH
+        python -VV
+        python -m pip install ./_xformers --verbose
         python -m xformers.info
 
     - name: Run python tests
       run: |
-        pytest --html=test_mem_eff_attention.html --self-contained-html -rpfs ./_xformers/tests/test_mem_eff_attention.py -k "not flshatt"
+        python -m pytest --html=test_mem_eff_attention.html --self-contained-html -rpfs ./_xformers/tests/test_mem_eff_attention.py -k "not flshatt"
     
     - name: Archive logs
       if: '!cancelled()'

From 490b63d0870a0f9e9416d766980906989da43693 Mon Sep 17 00:00:00 2001
From: Max Podkorytov <4273004+tenpercent@users.noreply.github.com>
Date: Mon, 1 Jul 2024 20:14:24 -0700
Subject: [PATCH 574/641] Set the correct path for pytest in rocm_ci.yml

---
 .github/workflows/rocm_ci.yml | 10 ++++------
 1 file changed, 4 insertions(+), 6 deletions(-)

diff --git a/.github/workflows/rocm_ci.yml b/.github/workflows/rocm_ci.yml
index ef1336ce3..8b16640fb 100644
--- a/.github/workflows/rocm_ci.yml
+++ b/.github/workflows/rocm_ci.yml
@@ -49,9 +49,6 @@ jobs:
         export ROCM_PATH=/opt/rocm
         echo ROCM_PATH         = $ROCM_PATH
 
-        export MAX_JOBS=64
-        echo MAX_JOBS          = $MAX_JOBS
-
         hipcc --version
         rocm-smi
         rocminfo | grep "gfx"
@@ -70,14 +67,15 @@ jobs:
     - name: Build xformers
       run: |
         export PATH=/opt/conda/envs/xformers/bin:$PATH
-        export MAX_JOBS=64
-        echo PATH = $PATH
-        python -VV
+        export MAX_JOBS=144
+        
         python -m pip install ./_xformers --verbose
         python -m xformers.info
 
     - name: Run python tests
       run: |
+        export PATH=/opt/conda/envs/xformers/bin:$PATH
+        
         python -m pytest --html=test_mem_eff_attention.html --self-contained-html -rpfs ./_xformers/tests/test_mem_eff_attention.py -k "not flshatt"
     
     - name: Archive logs

From addd2f2a85788975645245ead874c417a6ec36c2 Mon Sep 17 00:00:00 2001
From: Max Podkorytov <4273004+tenpercent@users.noreply.github.com>
Date: Tue, 2 Jul 2024 23:48:56 +0000
Subject: [PATCH 575/641] remove test_reference_splitk as it was moved to a
 different file during the first upstream

remove test_mqa_forward from develop, as the test fails in develop and doesn't run upstream

remove reference attention splitk from the test file; it exists in test_splitk_reference

sync test_mem_eff_attention with upstream
---
 tests/test_mem_eff_attention.py | 406 +++++---------------------------
 1 file changed, 54 insertions(+), 352 deletions(-)

diff --git a/tests/test_mem_eff_attention.py b/tests/test_mem_eff_attention.py
index b2bd691ac..dce31201e 100644
--- a/tests/test_mem_eff_attention.py
+++ b/tests/test_mem_eff_attention.py
@@ -7,7 +7,7 @@
 import math
 import random
 from functools import partial
-from typing import List, Optional, Sequence, Tuple, Type, TypeVar, Union
+from typing import Any, List, Optional, Sequence, Tuple, Type, TypeVar, Union
 
 import pytest
 import torch
@@ -267,185 +267,6 @@ def _generate_op_device_dtype_biasT_B_Mq_Mkv_H_K_Kv(
 )
 
 
-def ref_attention_splitk_bmhk(
-    q, k, v, attn_bias, scale=None, split_k=None, dtype=None
-) -> torch.Tensor:
-    assert q.ndim == 4
-
-    def T(t):
-        return t.permute((0, 2, 1, 3)).reshape(
-            [t.shape[0] * t.shape[2], t.shape[1], t.shape[3]]
-        )
-
-    if isinstance(attn_bias, xformers.ops.AttentionBias):
-        attn_bias = attn_bias.materialize(
-            (q.shape[0], q.shape[2], q.shape[1], k.shape[1]),
-            device=q.device,
-            dtype=torch.float32,
-        ).reshape([q.shape[0] * q.shape[2], q.shape[1], k.shape[1]])
-    out = ref_attention_splitk(
-        T(q), T(k), T(v), attn_bias, scale=scale, split_k=split_k, dtype=dtype
-    )
-    out = out.reshape([q.shape[0], q.shape[2], q.shape[1], v.shape[3]])
-    return out.permute((0, 2, 1, 3))
-
-
-def ref_attention_splitk(
-    q, k, v, attn_bias, scale=None, split_k=2, dtype=None
-) -> torch.Tensor:
-    if q.ndim == 5:
-
-        def attn_bias_group(group: int):
-            if isinstance(attn_bias, torch.Tensor):
-                return attn_bias[:, group]
-            if isinstance(attn_bias, fmha.attn_bias.LowerTriangularMaskWithTensorBias):
-                return fmha.attn_bias.LowerTriangularMaskWithTensorBias(
-                    attn_bias._bias[:, group]
-                )
-            return attn_bias
-
-        return torch.stack(
-            [
-                ref_attention_splitk_bmhk(
-                    q[:, :, g],
-                    k[:, :, g],
-                    v[:, :, g],
-                    attn_bias=attn_bias_group(g),
-                    split_k=split_k,
-                    dtype=dtype,
-                )
-                for g in range(q.shape[2])
-            ],
-            dim=2,
-        )
-
-    if q.ndim == 4:
-        return ref_attention_splitk_bmhk(
-            q, k, v, attn_bias=attn_bias, split_k=split_k, dtype=dtype
-        )
-    assert q.ndim == 3
-    if dtype is None:
-        dtype = torch.float32
-    q = q.to(dtype=dtype)
-    k = k.to(dtype=dtype)
-    v = v.to(dtype=dtype)
-
-    if scale is None:
-        scale = q.shape[-1] ** -0.5
-    assert not q.isnan().any()
-    q = q * scale
-    assert not q.isnan().any()
-
-    if attn_bias is not None:
-        if isinstance(attn_bias, xformers.ops.AttentionBias):
-            # Always create in B,H,Mq,Mk format
-            attn_bias_tensor = attn_bias.materialize(
-                (q.shape[0], 1, q.shape[1], k.shape[1]),
-                device=q.device,
-                dtype=torch.float32,
-            )
-        else:
-            attn_bias_tensor = attn_bias
-        if attn_bias_tensor.ndim == 4:
-            assert q.shape[0] == attn_bias_tensor.shape[0] * attn_bias_tensor.shape[1]
-            attn_bias_tensor = attn_bias_tensor.reshape(
-                [-1, *attn_bias_tensor.shape[2:]]
-            )
-
-    split_size = k.size(-2) // split_k
-    split_config = {"dim": -2, "split_size_or_sections": split_size}
-    k_split = torch.split(k, **split_config)
-    v_split = torch.split(v, **split_config)
-    attn_bias_split = torch.split(
-        attn_bias_tensor, dim=-1, split_size_or_sections=split_size
-    )
-
-    def compute_attention_split(q_whole, k_slice, v_slice, attn_bias_slice):
-        p_slice = q_whole @ k_slice.transpose(-2, -1)
-        p_slice += attn_bias_slice
-        row_max = torch.max(p_slice, dim=-1, keepdim=True).values
-        p_slice_scaled = p_slice - row_max
-        p_slice_scaled[p_slice_scaled.isnan()] = float("-inf")
-        s = torch.exp(p_slice_scaled)
-        row_sumexp = torch.sum(s, dim=-1, keepdim=True)
-        attn_slice = s @ v_slice
-        return {
-            "attn_slice": attn_slice,
-            "row_max": row_max,
-            "row_sumexp": row_sumexp,
-        }
-
-    splits = list(zip(k_split, v_split, attn_bias_split))
-
-    slices = list(map(lambda s: compute_attention_split(q, s[0], s[1], s[2]), splits))
-    out = torch.zeros_like(q)
-
-    # reduce out over split-k slices
-
-    global_max = torch.zeros_like(slices[0]["row_max"]).fill_(float("-inf"))
-    global_sumexp = torch.zeros_like(slices[0]["row_sumexp"])
-
-    for s in slices:
-        local_out = s["attn_slice"]
-        local_max = s["row_max"]
-        local_sumexp = s["row_sumexp"]
-
-        log_alpha = -torch.abs(local_max - global_max)
-        alpha = torch.exp(log_alpha)
-        alpha.nan_to_num_(1.0)
-
-        pick_new = local_max < global_max
-        new_coef = torch.where(pick_new, alpha, 1.0)
-        curr_coef = torch.where(pick_new, 1.0, alpha)
-
-        out = out * curr_coef + local_out * new_coef
-        global_sumexp = global_sumexp * curr_coef + local_sumexp * new_coef
-        global_max = torch.max(local_max, global_max)
-    out /= global_sumexp
-    return out
-
-
-# this interface assumes the tensor is in BMHK, but q and k/v might have different number of heads
-def ref_attention_mqa(q, k, v, attn_bias=None, drop_mask=None, p=0.0, scale=None):
-    assert q.ndim == 4
-
-    B, M, Hq, K = q.shape
-    _, N, Hkv, Kv = v.shape
-    nhead_ratio_qk = Hq // Hkv
-
-    def attn_bias_head(head: int):
-        if isinstance(attn_bias, torch.Tensor):
-            assert attn_bias.ndim == 4
-            _, H, _, _ = attn_bias.shape
-            assert H == Hq
-            bias_bghmn = attn_bias.reshape(B, Hkv, nhead_ratio_qk, M, N)
-            return bias_bghmn[:, :, head]
-        if isinstance(attn_bias, fmha.attn_bias.LowerTriangularMaskWithTensorBias):
-            assert attn_bias._bias.ndim == 4
-            _, H, _, _ = attn_bias._bias.shape
-            assert H == Hq
-            bias_bghmn = attn_bias._bias.reshape(B, Hkv, nhead_ratio_qk, M, N)
-            return fmha.attn_bias.LowerTriangularMaskWithTensorBias(
-                bias_bghmn[:, :, head]
-            )
-        return attn_bias
-
-    q_bmghk = q.reshape((B, M, Hkv, nhead_ratio_qk, K))
-
-    return torch.stack(
-        [
-            ref_attention_bmhk(
-                q_bmghk[:, :, :, h],
-                k,
-                v,
-                attn_bias=attn_bias_head(h),
-            )
-            for h in range(q_bmghk.shape[3])
-        ],
-        dim=3,
-    ).reshape((B, M, Hq, Kv))
-
-
 def _rand_partition(r: random.Random, total: int, n: int) -> List[int]:
     # returns list of n nonnegative integers summing to total
     idx = {0, total}
@@ -468,7 +289,7 @@ def get_bias_grad(attn_bias, clear: bool = False) -> Optional[torch.Tensor]:
 
 
 def create_tensors(
-    op: Type[AttentionOpBase],
+    op: Optional[Type[AttentionOpBase]],
     device,
     dtype,
     attn_bias_type,
@@ -482,7 +303,7 @@ def create_tensors(
     attn_bias_requires_grad: bool = False,
     fmt: str = "BMK",
     g: int = 1,
-):
+) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, Any]:
     torch.manual_seed(B * q_len + kv_len * k + kv)
 
     mask_is_bottom_right = attn_bias_type is not None and issubclass(
@@ -508,7 +329,7 @@ def create_tensors(
         ),
     ):
         page_size_choices = [256, 512]
-        if issubclass(op, fmha.triton_splitk.FwOp):
+        if op is not None and issubclass(op, fmha.triton_splitk.FwOp):
             # TODO: enable small pages for flash attention when that's implemented
             page_size_choices.extend([64, 128])
         page_size = random.choice(page_size_choices)
@@ -573,12 +394,13 @@ def create_tensors(
             ]
 
     inputs = fmha.Inputs(query=query, key=key, value=value, attn_bias=attn_bias)
-    reasons = op.not_supported_reasons(inputs)
-    if reasons:
-        err_msg = f"{op.NAME}: unsupported ({'/'.join(reasons)})"
-        # Ensure we free memory to avoid OOMs
-        del query, key, value, attn_bias, inputs
-        pytest.skip(err_msg)
+    if op is not None:
+        reasons = op.not_supported_reasons(inputs)
+        if reasons:
+            err_msg = f"{op.NAME}: unsupported ({'/'.join(reasons)})"
+            # Ensure we free memory to avoid OOMs
+            del query, key, value, attn_bias, inputs
+            pytest.skip(err_msg)
     return query, key, value, attn_bias
 
 
@@ -699,92 +521,6 @@ def test_forward(opFW_device_dtype_biasT_B_Mq_Mkv_H_K_Kv, packed, fmt, **kwargs)
     )
 
 
-@rocm_only
-@pytest.mark.parametrize("hdim_k,hdim_v", [(64, 64), (128, 128)])
-@pytest.mark.parametrize("nhead_q,nhead_kv", [(8, 1), (8, 2), (12, 4), (4, 4)])
-@pytest.mark.parametrize("seqlen_q,seqlen_kv", [(100, 128), (128, 100), (200, 1000)])
-@pytest.mark.parametrize("batches", [100, 64, 1])
-@pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16])
-@pytest.mark.parametrize(
-    "attn_bias_type", [type(None), torch.Tensor, fmha.attn_bias.LowerTriangularMask]
-)
-@pytest.mark.parametrize("op", [fmha.ck.FwOp])
-def test_mqa_forward(
-    op,
-    attn_bias_type,
-    dtype,
-    batches: int,
-    seqlen_kv: int,
-    seqlen_q: int,
-    nhead_kv: int,
-    nhead_q: int,
-    hdim_v: int,
-    hdim_k: int,
-):
-    B = batches
-    M = seqlen_q
-    N = seqlen_kv
-    Hq = nhead_q
-    Hkv = nhead_kv
-    K = hdim_k
-    Kv = hdim_v
-    nhead_ratio_qk = Hq // Hkv
-
-    device = torch.device("cuda")
-
-    torch.manual_seed(B * M + N * K + Hq * Hkv + Kv)
-
-    scale = 3
-    query = torch.randn((B, M, Hq, K), device=device, dtype=dtype).mul_(scale)
-    key = torch.randn((B, N, Hkv, K), device=device, dtype=dtype).mul_(scale)
-    value = torch.randn((B, N, Hkv, Kv), device=device, dtype=dtype).mul_(scale)
-
-    attn_bias = None
-    if attn_bias_type is not None:
-        attn_bias = create_attn_bias(
-            attn_bias_type,
-            batch_size=B,
-            num_heads=Hq,
-            num_heads_groups=nhead_ratio_qk,
-            q_len=M,
-            kv_len=N,
-            dtype=dtype,
-            device=device,
-            requires_grad=False,
-            fmt="BMHK",
-            op=op,
-        )
-
-    inputs = fmha.Inputs(query=query, key=key, value=value, attn_bias=attn_bias)
-    reasons = op.not_supported_reasons(inputs)
-    if reasons:
-        err_msg = f"{op.NAME}: unsupported ({'/'.join(reasons)})"
-        # Ensure we free memory to avoid OOMs
-        del query, key, value, attn_bias, inputs
-        assert False, err_msg
-
-    out = xformers.ops.memory_efficient_attention_forward(
-        query, key, value, attn_bias, op=op
-    )
-    assert not out.isnan().any(), ("Output has NaNs", attn_bias)
-    out2 = xformers.ops.memory_efficient_attention_forward(
-        query, key, value, attn_bias, op=op
-    )
-    assert torch.allclose(out, out2, atol=0.0, rtol=0.0), (
-        "Non-deterministic behavior",
-        attn_bias,
-    )
-
-    ref = ref_attention_mqa(query, key, value, attn_bias)
-    assert out.shape == ref.shape, out.shape
-    assert_allclose(
-        out.float(),
-        ref,
-        atol=op.ERROR_ATOL[dtype],
-        rtol=op.ERROR_RTOL.get(dtype, 1e-5),
-    )
-
-
 @cuda_only
 @pytest.mark.parametrize("k_len", [5, 6, 32])
 @pytest.mark.parametrize("batch_size", [1, 4])
@@ -970,11 +706,15 @@ def test_backward(
     if op_bw == fmha.ck.BwOp:
         op_fw = fmha.ck.FwOp
         if dtype == torch.bfloat16:
-            pytest.skip("CK Fmha backward for bfloat16 currently is not very accurate for some cases!")
+            pytest.skip(
+                "CK Fmha backward for bfloat16 currently is not very accurate for some cases!"
+            )
         if grad_out_contiguous is False:
             pytest.skip("CK Fmha does not support contiguous layout for grad_out!")
         if k % 2 != 0:
-            pytest.skip("CK Fmha currently requires the headdim size of query input be an even value!")
+            pytest.skip(
+                "CK Fmha currently requires the headdim size of query input be an even value!"
+            )
 
     qkv = None
 
@@ -1906,11 +1646,11 @@ def _test_to_copy(attn_bias: torch.Tensor) -> None:
         assert attn_bias_fp16.device.type == "cpu", f"{attn_bias_fp16.device}"
         assert attn_bias_fp16.dtype == torch.float16, f"{attn_bias_fp16.dtype}"
 
-    attn_bias = fmha.attn_bias.LowerTriangularMask()
+    attn_bias = fmha.attn_bias.LowerTriangularMask().to("cpu")
     _test_to_copy(attn_bias)
 
     tensor_bias = torch.tensor([[1.0, 2.0, 3.0], [3.0, 4.0, 5.0]])
-    attn_bias = fmha.attn_bias.LowerTriangularMaskWithTensorBias(tensor_bias)
+    attn_bias = fmha.attn_bias.LowerTriangularMaskWithTensorBias(tensor_bias).to("cpu")
     _test_to_copy(attn_bias)
 
 
@@ -1922,66 +1662,6 @@ def _kv_heads_label(kv_heads: Optional[int]) -> str:
     return f"gqa{kv_heads}"
 
 
-@pytest.mark.parametrize("dtype", ["f32"])
-@pytest.mark.parametrize("kv_heads", [None, 1, 2], ids=_kv_heads_label)
-@pytest.mark.parametrize("n_heads", [16])
-@pytest.mark.parametrize("padding, bsz", [(32, 8), (4096, 1)])
-@pytest.mark.parametrize("split_k", [1, 2, 4])
-@pytest.mark.parametrize("device", ["cpu"])
-def test_splitk_reference(
-    kv_heads: int,
-    n_heads: int,
-    padding: int,
-    bsz: int,
-    dtype: str,
-    device: str,
-    split_k: int,
-):
-    dtype_ = {"f16": torch.float16, "bf16": torch.bfloat16, "f32": torch.float32}[dtype]
-    torch.manual_seed(1)
-    d = 256
-    num_queries = 1
-    if kv_heads is not None and kv_heads > 1:
-        k_shape: Tuple[int, ...] = (1, bsz * padding, kv_heads, n_heads, d)
-        q_shape: Tuple[int, ...] = (
-            1,
-            bsz * num_queries,
-            kv_heads,
-            n_heads,
-            d,
-        )
-    else:
-        k_shape = (1, bsz * padding, n_heads, d)
-        q_shape = (1, bsz * num_queries, n_heads, d)
-
-    k = torch.rand(k_shape, dtype=dtype_, device=device)
-    k_seqlen = torch.randint(1, padding + 1, (bsz,)).tolist()
-    v = torch.rand_like(k)
-    q = torch.rand(q_shape, dtype=dtype_, device=device)
-    causal_diagonal = torch.tensor(  # TODO: make unnecessary
-        [i - 1 for i in k_seqlen], dtype=torch.int32, device=device
-    )
-
-    if kv_heads is not None:
-        k = k[..., :1, :].expand(k_shape)
-        v = v[..., :1, :].expand(k_shape)
-
-    attn_bias = fmha.attn_bias.BlockDiagonalCausalWithOffsetPaddedKeysMask.from_seqlens(
-        q_seqlen=[1] * bsz,
-        kv_seqlen=k_seqlen,
-        causal_diagonal=causal_diagonal,
-        kv_padding=padding,
-    )
-    ref_out = ref_attention(q, k, v, attn_bias)
-    splitk_out = ref_attention_splitk(q, k, v, attn_bias, None, split_k=split_k)
-    assert_allclose(
-        ref_out,
-        splitk_out,
-        atol=fmha.ck.FwOp.ERROR_ATOL[dtype_],
-        rtol=fmha.ck.FwOp.ERROR_RTOL[dtype_],
-    )
-
-
 @sm70_or_better_only
 @pytest.mark.parametrize(
     "op",
@@ -3735,26 +3415,45 @@ def _merge_attentions_ref(attn_split, lse_split):
 
 @sm80_or_better_only
 @skip_if_rocm  # rocm doesn't support backward yet
-@pytest.mark.parametrize("bias_t", [None, fmha.attn_bias.LowerTriangularMask])
+@pytest.mark.parametrize(
+    "bias_t",
+    [None, fmha.attn_bias.LowerTriangularMask, fmha.attn_bias.BlockDiagonalMask],
+)
 @pytest.mark.parametrize("create_bias_inside_compiled", [False, True])
-@pytest.mark.parametrize("op", [None, (fmha.flash.FwOp, fmha.flash.BwOp)])
+@pytest.mark.parametrize(
+    "op",
+    [None, (fmha.flash.FwOp, fmha.flash.BwOp), (fmha.cutlass.FwOp, fmha.flash.BwOp)],
+)
 def test_memeff_compile(bias_t, create_bias_inside_compiled: bool, op) -> None:
     torch.manual_seed(0)
-    dtype = torch.float16
+    torch._dynamo.reset_code_caches()  # avoids hitting recompilation limit
     B, M, H, K = 1, 256, 2, 64
-    q, k, v = [
-        3 * torch.randn([B, M, H, K], device="cuda", dtype=dtype) for _ in range(3)
-    ]
+    q, k, v, bias = create_tensors(
+        op if op is None else op[0],
+        "cuda",
+        torch.float16,
+        bias_t,
+        B,
+        M,
+        M,
+        H,
+        K,
+        K,
+        fmt="BMHK",
+    )
     grad = torch.randn_like(q)
-    bias = None
-    if not create_bias_inside_compiled and bias_t is not None:
-        bias = bias_t()
+    if create_bias_inside_compiled:
+        bias = None
+        if bias_t not in [None, fmha.attn_bias.LowerTriangularMask]:
+            pytest.skip("Can't create this mask inside compile")
+    if bias is not None:
+        bias.to(q.device)
     q.requires_grad_(True)
     k.requires_grad_(True)
     v.requires_grad_(True)
 
     def fmha_fn(q, k, v, bias):
-        if bias is None and bias_t is not None:
+        if create_bias_inside_compiled and bias_t is not None:
             bias = bias_t()
         return fmha.memory_efficient_attention(q, k, v, attn_bias=bias, op=op)
 
@@ -3773,10 +3472,13 @@ def fmha_fn(q, k, v, bias):
         out,
         out_ref,
         "out",
-        atol=fmha.flash.FwOp.ERROR_ATOL[dtype],
-        rtol=fmha.flash.FwOp.ERROR_RTOL[dtype],
+        atol=fmha.flash.FwOp.ERROR_ATOL[q.dtype],
+        rtol=fmha.flash.FwOp.ERROR_RTOL[q.dtype],
+    )
+    atol, rtol = (
+        fmha.flash.BwOp.ERROR_ATOL[q.dtype],
+        fmha.flash.BwOp.ERROR_RTOL[q.dtype],
     )
-    atol, rtol = fmha.flash.BwOp.ERROR_ATOL[dtype], fmha.flash.BwOp.ERROR_RTOL[dtype]
     assert_allclose(q.grad, dq_ref, "dq", atol=atol, rtol=rtol)
     assert_allclose(k.grad, dk_ref, "dk", atol=atol, rtol=rtol)
     assert_allclose(v.grad, dv_ref, "dv", atol=atol, rtol=rtol)

From 33810ffb790a7ccc636be90b1400fbec9928affd Mon Sep 17 00:00:00 2001
From: Max Podkorytov <4273004+tenpercent@users.noreply.github.com>
Date: Wed, 3 Jul 2024 22:18:58 +0000
Subject: [PATCH 576/641] make sure ck operators have a name to be visible in
 the dispatcher

---
 xformers/ops/fmha/ck.py         | 6 +++---
 xformers/ops/fmha/ck_decoder.py | 4 ++--
 xformers/ops/fmha/ck_splitk.py  | 4 ++--
 3 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/xformers/ops/fmha/ck.py b/xformers/ops/fmha/ck.py
index 39a089553..39989038e 100644
--- a/xformers/ops/fmha/ck.py
+++ b/xformers/ops/fmha/ck.py
@@ -11,7 +11,7 @@
 
 import torch
 
-from ..common import get_xformers_operator, register_operator
+from ..common import get_operator, register_operator
 from . import attn_bias
 from .attn_bias import (
     AttentionBias,
@@ -155,7 +155,7 @@ def _custom_mask_type(bias: Optional[Union[torch.Tensor, AttentionBias]]) -> int
 class FwOp(AttentionFwOpBase):
     """xFormers' MHA kernel based on Composable Kernel."""
 
-    OPERATOR = get_xformers_operator("efficient_attention_forward_ck")
+    OPERATOR = get_operator("xformers", "efficient_attention_forward_ck")
     SUPPORTED_DEVICES: Set[str] = {"cuda"}
     SUPPORTED_DTYPES: Set[torch.dtype] = {torch.half, torch.bfloat16}
     SUPPORTED_MAX_K = 256
@@ -357,7 +357,7 @@ def operator_flop(
 class BwOp(AttentionBwOpBase):
     __doc__ = FwOp.__doc__
 
-    OPERATOR = get_xformers_operator("efficient_attention_backward_ck")
+    OPERATOR = get_operator("xformers", "efficient_attention_backward_ck")
     SUPPORTED_DEVICES = FwOp.SUPPORTED_DEVICES
     SUPPORTED_DTYPES = FwOp.SUPPORTED_DTYPES
     SUPPORTED_MAX_K = 128
diff --git a/xformers/ops/fmha/ck_decoder.py b/xformers/ops/fmha/ck_decoder.py
index b75c420fd..a5c820bfc 100644
--- a/xformers/ops/fmha/ck_decoder.py
+++ b/xformers/ops/fmha/ck_decoder.py
@@ -7,7 +7,7 @@
 
 import torch
 
-from ..common import get_xformers_operator, register_operator
+from ..common import get_operator, register_operator
 from .attn_bias import BlockDiagonalCausalWithOffsetPaddedKeysMask
 from .common import AttentionFwOpBase, Context, Inputs
 
@@ -19,7 +19,7 @@ class FwOp(AttentionFwOpBase):
     Tested to work on MI250x.
     """
 
-    OPERATOR = get_xformers_operator("efficient_attention_forward_decoder_ck")
+    OPERATOR = get_operator("xformers", "efficient_attention_forward_decoder_ck")
     SUPPORTED_DEVICES: Set[str] = {"cuda"}
     SUPPORTED_DTYPES: Set[torch.dtype] = {torch.half, torch.bfloat16, torch.float}
     SUPPORTED_MAX_K: int = 256
diff --git a/xformers/ops/fmha/ck_splitk.py b/xformers/ops/fmha/ck_splitk.py
index 6996da6c2..4c7af0794 100644
--- a/xformers/ops/fmha/ck_splitk.py
+++ b/xformers/ops/fmha/ck_splitk.py
@@ -7,7 +7,7 @@
 
 import torch
 
-from xformers.ops.common import get_xformers_operator, register_operator
+from xformers.ops.common import get_operator, register_operator
 from xformers.ops.fmha.attn_bias import BlockDiagonalCausalWithOffsetPaddedKeysMask
 from xformers.ops.fmha.common import (
     AttentionFwOpBase,
@@ -20,7 +20,7 @@
 @register_operator
 class FwOp(AttentionFwOpBase):
 
-    OPERATOR = get_xformers_operator("efficient_attention_forward_decoder_splitk_ck")
+    OPERATOR = get_operator("xformers", "efficient_attention_forward_decoder_splitk_ck")
     SUPPORTED_DEVICES = {"cuda"}
     SUPPORTED_DTYPES = {
         torch.half,

From f3faa1a4b5343867304ae94e585bfcecdb4831ef Mon Sep 17 00:00:00 2001
From: Max Podkorytov <4273004+tenpercent@users.noreply.github.com>
Date: Mon, 8 Jul 2024 19:25:33 +0000
Subject: [PATCH 577/641] fix sm version checks to happen only on CUDA, not
 ROCm

---
 tests/test_mem_eff_attention.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/tests/test_mem_eff_attention.py b/tests/test_mem_eff_attention.py
index dce31201e..0affd0db8 100644
--- a/tests/test_mem_eff_attention.py
+++ b/tests/test_mem_eff_attention.py
@@ -37,13 +37,13 @@
 if torch.cuda.is_available():
     compute_capability = torch.cuda.get_device_capability("cuda")
 sm70_or_better_only = pytest.mark.skipif(
-    compute_capability < (7, 0), reason="requires sm70+"
+    torch.version.cuda and compute_capability < (7, 0), reason="requires sm70+"
 )
 sm75_or_better_only = pytest.mark.skipif(
-    compute_capability < (7, 5), reason="requires sm75+"
+    torch.version.cuda and compute_capability < (7, 5), reason="requires sm75+"
 )
 sm80_or_better_only = pytest.mark.skipif(
-    compute_capability < (8, 0), reason="requires sm80+"
+    torch.version.cuda and compute_capability < (8, 0), reason="requires sm80+"
 )
 skip_if_rocm = pytest.mark.skipif(
     torch.version.hip is not None, reason="not supported on ROCm"

From 04e948188c17b744c8f68de29425161dc0d2b25d Mon Sep 17 00:00:00 2001
From: Max Podkorytov <4273004+tenpercent@users.noreply.github.com>
Date: Mon, 8 Jul 2024 19:32:07 +0000
Subject: [PATCH 578/641] (2/n) fix sm version checks to happen only on CUDA,
 not ROCm

---
 tests/test_mem_eff_attention.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/test_mem_eff_attention.py b/tests/test_mem_eff_attention.py
index 0affd0db8..7f511bfac 100644
--- a/tests/test_mem_eff_attention.py
+++ b/tests/test_mem_eff_attention.py
@@ -1687,7 +1687,7 @@ def test_decoder(
     # kv_heads = 1: multiquery
     # kv_heads = None: neither MQA nor GQA
     # kv_heads > 1: BMGHK
-    if dtype == "bf16" and compute_capability < (8, 0):
+    if dtype == "bf16" and torch.version.cuda and compute_capability < (8, 0):
         raise pytest.skip("BF16 is only supported on SM80+")
     import triton
 

From bd49f48e4d04cc0f584d9e0f38638761beb6cc73 Mon Sep 17 00:00:00 2001
From: Qianfeng <qianfeng.zhang@amd.com>
Date: Mon, 15 Jul 2024 02:39:58 +0800
Subject: [PATCH 579/641] Remove _check_large_shapes checking in fmha/ck.py
 (#1067)

---
 xformers/ops/fmha/ck.py | 17 -----------------
 1 file changed, 17 deletions(-)

diff --git a/xformers/ops/fmha/ck.py b/xformers/ops/fmha/ck.py
index 39989038e..be061cf5a 100644
--- a/xformers/ops/fmha/ck.py
+++ b/xformers/ops/fmha/ck.py
@@ -102,21 +102,6 @@ def _check_bias_alignment(
             )
 
 
-def _check_large_shapes(reasons: List[str], inp: Inputs) -> None:
-    """CK kernel throws "Memory access fault by GPU node-2" when B * T >= 2**20, might be some index overflow.
-    To reproduce, remove this function and run benchmark_mem_eff_attention with ParlAI model shape (256, 4096, 16, 64).
-    This needs further debugging, for now let's not support such shapes.
-    """
-    b_t_limit = 1024**2
-    q_too_large = inp.query.shape[0] * inp.query.shape[1] >= b_t_limit
-    k_too_large = inp.key.shape[0] * inp.key.shape[1] >= b_t_limit
-    v_too_large = inp.value.shape[0] * inp.value.shape[1] >= b_t_limit
-    if q_too_large or k_too_large or v_too_large:
-        reasons.append(
-            "Input is too large: product of first two dimensions of q/k/v must be < 2**20"
-        )
-
-
 class _CustomMaskType(int, Enum):
     """
     (Matches CustomMaskType in C++.)
@@ -325,7 +310,6 @@ def not_supported_reasons(cls, d: Inputs) -> List[str]:
         check_lastdim_alignment_stride1(reasons, "query", d.query, matmul_alignment_mn)
         check_lastdim_alignment_stride1(reasons, "value", d.value, matmul_alignment_mn)
         _check_bias_alignment(reasons, d.attn_bias)
-        _check_large_shapes(reasons, d)
         return reasons
 
     @classmethod
@@ -416,7 +400,6 @@ def not_supported_reasons(cls, d: Inputs) -> List[str]:
                     f"(shape: {tuple(attn_bias_tensor.shape)}"
                     f"/ expected: {expected_bias_shape})"
                 )
-        _check_large_shapes(reasons, d)
 
         return reasons
 

From 0d1d1bef2f79d9605d7160445e511d57b5dcba80 Mon Sep 17 00:00:00 2001
From: Max Podkorytov <4273004+tenpercent@users.noreply.github.com>
Date: Wed, 17 Jul 2024 21:33:17 -0400
Subject: [PATCH 580/641] make xformers install editable to fix cpp extensions
 detection

---
 .github/workflows/rocm_ci.yml | 15 ++++++++++++++-
 1 file changed, 14 insertions(+), 1 deletion(-)

diff --git a/.github/workflows/rocm_ci.yml b/.github/workflows/rocm_ci.yml
index 8b16640fb..d498bea53 100644
--- a/.github/workflows/rocm_ci.yml
+++ b/.github/workflows/rocm_ci.yml
@@ -64,12 +64,18 @@ jobs:
         
         python -m pip install ninja scipy pytest pytest-html
 
+    - name: Pre-build clean
+      run: |
+        cd _xformers
+        git clean -ffdx
+        cd ..
+
     - name: Build xformers
       run: |
         export PATH=/opt/conda/envs/xformers/bin:$PATH
         export MAX_JOBS=144
         
-        python -m pip install ./_xformers --verbose
+        python -m pip install -e ./_xformers --verbose
         python -m xformers.info
 
     - name: Run python tests
@@ -85,6 +91,13 @@ jobs:
         name: test results
         path: test_mem_eff_attention.html
 
+    - name: Post-build clean
+      if: '!cancelled()'
+      run: |
+        cd _xformers
+        git clean -ffdx
+        cd ..
+
   clean:
     runs-on: self-hosted
     if: ${{ always() }}

From 9390d6a80f570c891377d5f3c43464fc314849ed Mon Sep 17 00:00:00 2001
From: Qianfeng Zhang <Qianfeng.Zhang@amd.com>
Date: Tue, 23 Jul 2024 09:13:27 +0000
Subject: [PATCH 581/641] Update to using the improved fmha-bwd (compiling
 passed)

---
 .../attention_backward_generic_ck_tiled.cpp   |  25 +++-
 .../hip_fmha/ck_tiled_fmha_batched_backward.h | 102 ++++++++++++--
 .../hip_fmha/ck_tiled_fmha_batched_forward.h  |   6 +-
 .../hip_fmha/ck_tiled_fmha_batched_infer.h    |   7 +-
 .../hip_fmha/ck_tiled_fmha_bwd_setting.h      | 124 ++++++++++--------
 .../hip_fmha/ck_tiled_fmha_fwd_setting.h      |  17 +++
 .../hip_fmha/ck_tiled_fmha_grouped_backward.h | 101 ++++++++++++--
 .../hip_fmha/ck_tiled_fmha_grouped_forward.h  |   6 +-
 .../hip_fmha/ck_tiled_fmha_grouped_infer.h    |   7 +-
 .../attention/hip_fmha/ck_tiled_fmha_params.h |   4 +
 .../hip_fmha/ck_tiled_rand_uniform_kernel.h   |  15 ++-
 11 files changed, 313 insertions(+), 101 deletions(-)

diff --git a/xformers/csrc/attention/hip_fmha/attention_backward_generic_ck_tiled.cpp b/xformers/csrc/attention/hip_fmha/attention_backward_generic_ck_tiled.cpp
index c9494060b..e02a21588 100644
--- a/xformers/csrc/attention/hip_fmha/attention_backward_generic_ck_tiled.cpp
+++ b/xformers/csrc/attention/hip_fmha/attention_backward_generic_ck_tiled.cpp
@@ -143,7 +143,6 @@ efficient_attention_backward_ck(
     grad_q = chunk.select(2, 0);
     grad_k = chunk.select(2, 1);
     grad_v = chunk.select(2, 2);
-    grad_q.fill_(0);
   } else if (
       key.size(3) == value.size(3) &&
       key.storage().is_alias_of(value.storage())) {
@@ -157,14 +156,22 @@ efficient_attention_backward_ck(
     grad_v = chunk.select(2, 1);
 
     grad_q = at::empty_strided(query.sizes(), query.strides(), query.options());
-    grad_q.fill_(0);
   } else {
     grad_q = at::empty_strided(query.sizes(), query.strides(), query.options());
     grad_k = at::empty_strided(key.sizes(), key.strides(), key.options());
     grad_v = at::empty_strided(value.sizes(), value.strides(), value.options());
-    grad_q.fill_(0);
   }
 
+  at::Tensor grad_q_f32;
+
+  if (query.scalar_type() == at::ScalarType::BFloat16 ||
+      query.scalar_type() == at::ScalarType::Half) {
+    grad_q_f32 = at::empty_like(grad_q);
+    grad_q_f32.fill_(0);
+  } else {
+    grad_q.fill_(0);
+  };
+
   // CK-FlashAttn requires q/k/v to have same shapes with dQ/dK/dV respectively
   TORCH_CHECK(query.sizes() == grad_q.sizes());
   TORCH_CHECK(query.strides() == grad_q.strides());
@@ -229,6 +236,12 @@ efficient_attention_backward_ck(
     p.grad_k_ptr = is_mqa_gqa ? tmp_grad_k.data_ptr() : grad_k.data_ptr();
     p.grad_v_ptr = is_mqa_gqa ? tmp_grad_v.data_ptr() : grad_v.data_ptr();
 
+    if (query.scalar_type() == at::ScalarType::BFloat16 ||
+        query.scalar_type() == at::ScalarType::Half)
+      p.grad_q_f32_ptr = grad_q_f32.data_ptr();
+    else
+      p.grad_q_f32_ptr = nullptr;
+
     p.q_strides = {
         static_cast<int>(query.stride(0)),
         static_cast<int>(query.stride(1)),
@@ -480,6 +493,12 @@ efficient_attention_backward_ck(
     p.grad_k_ptr = is_mqa_gqa ? tmp_grad_k.data_ptr() : grad_k.data_ptr();
     p.grad_v_ptr = is_mqa_gqa ? tmp_grad_v.data_ptr() : grad_v.data_ptr();
     p.grad_bias_ptr = bias_requires_grad ? grad_bias.data_ptr() : nullptr;
+
+    if (query.scalar_type() == at::ScalarType::BFloat16 ||
+        query.scalar_type() == at::ScalarType::Half)
+      p.grad_q_f32_ptr = grad_q_f32.data_ptr();
+    else
+      p.grad_q_f32_ptr = nullptr;
   };
 
   auto inDataType = query.scalar_type();
diff --git a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_backward.h b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_backward.h
index 4a535aa5a..ed1fd8aaa 100644
--- a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_backward.h
+++ b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_backward.h
@@ -23,6 +23,9 @@ template <
     bool kHasDropout,
     ck_tile::index_t MaxK>
 struct batched_backward_causalmask_bias_dropout_dispatch {
+  using FmhaBlockDropout =
+      typename FmhaBwdBlockDropoutMaker<kHasDropout, MaxK>::dropout;
+
   template <typename FmhaTraits, typename FmhaMask>
   using FmhaBwdPipelineProblemTemp = ck_tile::BlockFmhaBwdPipelineProblem<
       typename FmhaBwdTypeConfig<ScalarType>::QDataType,
@@ -42,12 +45,18 @@ struct batched_backward_causalmask_bias_dropout_dispatch {
       typename FmhaBwdTypeConfig<ScalarType>::BiasGradDataType,
       FmhaBwdShape<MaxK>,
       false, // kIsGroupMode
+      false, // kIsDeterministic
       FmhaMask,
+      FmhaBlockDropout,
       FmhaTraits>;
 
+  static constexpr bool NeedConvertGradQ = !std::is_same<
+      ScalarType,
+      typename FmhaBwdTypeConfig<ScalarType>::QGradDataType>::value;
+
   static void Run(BatchedBackwardParams& param, hipStream_t stream) {
     {
-      constexpr ck_tile::index_t kBlockSize = 256;
+      constexpr ck_tile::index_t kBlockSize = 64;
 
       const bool pad_seqlen_q = !(param.M % kBlockSize == 0);
       const bool pad_headdim_v =
@@ -76,9 +85,8 @@ struct batched_backward_causalmask_bias_dropout_dispatch {
                 typename ck_tile::BlockFmhaBwdOGradDotO<
                     FmhaBwdOGradDotOPipelineProblem>;
 
-            using FmhaBwdOGradDotOKernel_ = ck_tile::FmhaBwdOGradDotOKernel<
-                ck_tile::FmhaBwdOGradDotOTilePartitioner<kBlockSize>,
-                FmhaBwdOGradDotOPipeline>;
+            using FmhaBwdOGradDotOKernel_ =
+                ck_tile::FmhaBwdOGradDotOKernel<FmhaBwdOGradDotOPipeline>;
 
             RunWithBwdOGradDotOKernel<FmhaBwdOGradDotOKernel_>(param, stream);
           });
@@ -93,10 +101,6 @@ struct batched_backward_causalmask_bias_dropout_dispatch {
 
         using FmhaMask = ck_tile::SimplifiedGenericAttentionMask<has_masking>;
 
-        using FmhaBwdShape_ = FmhaBwdShape<MaxK>;
-        using FmhaBwdTilePartitioner_ =
-            ck_tile::FmhaBwdTilePartitioner<FmhaBwdShape_>;
-
         constexpr auto kBiasEnum = kHasBias
             ? ck_tile::BlockAttentionBiasEnum::ELEMENTWISE_BIAS
             : ck_tile::BlockAttentionBiasEnum::NO_BIAS;
@@ -104,8 +108,10 @@ struct batched_backward_causalmask_bias_dropout_dispatch {
         constexpr bool kPadSeqLenQ = true;
         constexpr bool kPadSeqLenK = true;
 
-        const bool pad_headdim_q = !(param.K % FmhaBwdShape_::kQKHeaddim == 0);
-        const bool pad_headdim_v = !(param.Kv % FmhaBwdShape_::kVHeaddim == 0);
+        const bool pad_headdim_q =
+            !(param.K % FmhaBwdShape<MaxK>::kQKHeaddim == 0);
+        const bool pad_headdim_v =
+            !(param.Kv % FmhaBwdShape<MaxK>::kVHeaddim == 0);
 
         // usually headdim_q and headdim_v are same, consider them together
         // to determine whether to do padding saving some compiling time
@@ -120,7 +126,6 @@ struct batched_backward_causalmask_bias_dropout_dispatch {
               kBiasEnum,
               kHasBiasGrad,
               false, // kStoreLSE
-              kHasDropout,
               false, // kDoFp8StaticQuant place-holder
               occupancy>;
 
@@ -149,7 +154,6 @@ struct batched_backward_causalmask_bias_dropout_dispatch {
                   kPadHeadDim>>;
 
           using FmhaBwdDQDKDVKernel_ = ck_tile::FmhaBwdDQDKDVKernel<
-              FmhaBwdTilePartitioner_,
               FmhaBwdPipeline_,
               FmhaBwdKGradEpilogue_,
               FmhaBwdVGradEpilogue_>;
@@ -158,6 +162,47 @@ struct batched_backward_causalmask_bias_dropout_dispatch {
         });
       });
     };
+
+    if constexpr (NeedConvertGradQ) {
+      constexpr ck_tile::index_t kBlockSize = 256;
+
+      const bool pad_seqlen_q = !(param.M % kBlockSize == 0);
+      const bool pad_headdim_q =
+          !(param.K % FmhaBwdShape<MaxK>::kQKHeaddim == 0);
+
+      BOOL_SWITCH_2(
+          pad_seqlen_q, kPadSeqLenQ, pad_headdim_q, kPadHeadDimQ, [&] {
+            constexpr ck_tile::index_t occupancy = 2;
+
+            using FmhaBwdConvertQGradTraits_ =
+                ck_tile::TileFmhaBwdConvertQGradTraits<
+                    kPadSeqLenQ,
+                    kPadHeadDimQ,
+                    occupancy>;
+
+            using FmhaBwdConvertQGradPipelineProblem =
+                ck_tile::BlockFmhaBwdConvertQGradPipelineProblem<
+                    typename FmhaBwdTypeConfig<ScalarType>::AccDataType,
+                    typename FmhaBwdTypeConfig<ScalarType>::QGradDataType,
+                    kBlockSize,
+                    FmhaBwdShape<MaxK>::kM0,
+                    FmhaBwdShape<MaxK>::kN0,
+                    FmhaBwdShape<MaxK>::kQKHeaddim,
+                    false, // kIsGroupMode
+                    false, // kIsDeterministic
+                    FmhaBwdConvertQGradTraits_>;
+
+            using FmhaBwdConvertQGradPipeline =
+                typename ck_tile::BlockFmhaBwdConvertQGrad<
+                    FmhaBwdConvertQGradPipelineProblem>;
+
+            using FmhaBwdConvertQGradKernel_ =
+                ck_tile::FmhaBwdConvertQGradKernel<FmhaBwdConvertQGradPipeline>;
+
+            RunWithBwdConvertQGradKernel<FmhaBwdConvertQGradKernel_>(
+                param, stream);
+          });
+    };
   }
 
   template <typename FmhaBwdOGradDotOKernel>
@@ -208,10 +253,10 @@ struct batched_backward_causalmask_bias_dropout_dispatch {
           param.grad_out_ptr,
           param.dot_out_ptr,
           nullptr, // rand_val_ptr
-          param.grad_q_ptr,
           param.grad_k_ptr,
           param.grad_v_ptr,
           param.grad_bias_ptr,
+          NeedConvertGradQ ? param.grad_q_f32_ptr : param.grad_q_ptr,
           param.M, // seqlen_q
           param.N, // seqlen_k
           param.K,
@@ -252,12 +297,12 @@ struct batched_backward_causalmask_bias_dropout_dispatch {
           param.grad_v_strides[0],
           param.attn_bias_strides[0], // assume grad_bias has same strides as
                                       // bias
+          0, // split_stride_dq_acc
           (param.window_size > 0) ? param.window_size - 1
                                   : -1, // window_left_size
           (param.custom_mask_type == 0) ? -1 : 0, // window_right_size
           param.custom_mask_type,
           param.dropout_prob, // dropout ratio
-          false, // is_store_randval
           {param.philox_seed, param.philox_offset});
     }();
 
@@ -270,6 +315,35 @@ struct batched_backward_causalmask_bias_dropout_dispatch {
         ck_tile::make_kernel<kBlockSize.x, kBlockPerCu>(
             FmhaBwdDQDKDVKernel{}, kGridSize, kBlockSize, 0, kargs));
   }
+
+  template <typename FmhaBwdConvertQGradKernel>
+  static void RunWithBwdConvertQGradKernel(
+      BatchedBackwardParams& param,
+      hipStream_t stream) {
+    const auto kargs = [&] {
+      return FmhaBwdConvertQGradKernel::MakeKargs(
+          param.grad_q_f32_ptr,
+          param.grad_q_ptr,
+          param.M, // seqlen_q
+          param.N, // seqlen_k
+          param.K, // headdim of q/k
+          param.q_strides[1],
+          param.q_strides[2],
+          param.q_strides[0],
+          0);
+    }();
+
+    dim3 kGridSize =
+        FmhaBwdConvertQGradKernel::GridSize(param.B, param.Hq, param.M);
+    constexpr dim3 kBlockSize = FmhaBwdConvertQGradKernel::BlockSize();
+    constexpr ck_tile::index_t kBlockPerCu =
+        FmhaBwdConvertQGradKernel::kBlockPerCu;
+
+    (void)ck_tile::launch_kernel(
+        ck_tile::stream_config{stream, false},
+        ck_tile::make_kernel<kBlockSize.x, kBlockPerCu>(
+            FmhaBwdConvertQGradKernel{}, kGridSize, kBlockSize, 0, kargs));
+  }
 };
 
 template <
diff --git a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_forward.h b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_forward.h
index 20c1b2c3e..1b1a42b5f 100644
--- a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_forward.h
+++ b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_forward.h
@@ -22,6 +22,9 @@ template <
     bool kHasDropout,
     ck_tile::index_t MaxK>
 struct batched_forward_causalmask_bias_dropout_dispatch {
+  using FmhaBlockDropout =
+      typename FmhaFwdBlockDropoutMaker<kHasDropout, MaxK>::dropout;
+
   template <typename FmhaTraits, typename FmhaMask>
   using FmhaPipelineProblemTemp = ck_tile::BlockFmhaPipelineProblem<
       typename FmhaFwdTypeConfig<ScalarType>::QDataType,
@@ -38,6 +41,7 @@ struct batched_forward_causalmask_bias_dropout_dispatch {
       FmhaFwdShape<MaxK>,
       false, // kIsGroupMode
       FmhaMask,
+      FmhaBlockDropout,
       FmhaTraits>;
 
   static void Run(BatchedForwardParams& param, hipStream_t stream) {
@@ -88,7 +92,6 @@ struct batched_forward_causalmask_bias_dropout_dispatch {
                 kBiasEnum,
                 false, // kHasBiasGrad place-holder
                 true, // kStoreLSE
-                kHasDropout,
                 false, // kDoFp8StaticQuant place-holder
                 occupancy>;
 
@@ -163,7 +166,6 @@ struct batched_forward_causalmask_bias_dropout_dispatch {
           (param.custom_mask_type == 0) ? -1 : 0, // window_right_size
           param.custom_mask_type,
           param.dropout_prob, // dropout ratio
-          false, // is_store_randval
           {param.philox_seed, param.philox_offset});
     }();
 
diff --git a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_infer.h b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_infer.h
index 05d654dc3..1501c4cf6 100644
--- a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_infer.h
+++ b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_infer.h
@@ -23,6 +23,9 @@ template <
     bool kHasDropout,
     ck_tile::index_t MaxK>
 struct batched_infer_causalmask_bias_dropout_dispatch {
+  using FmhaBlockDropout =
+      typename FmhaFwdBlockDropoutMaker<kHasDropout, MaxK>::dropout;
+
   template <typename FmhaTraits, typename FmhaMask>
   using FmhaPipelineProblemTemp = ck_tile::BlockFmhaPipelineProblem<
       typename FmhaFwdTypeConfig<ScalarType>::QDataType,
@@ -39,6 +42,7 @@ struct batched_infer_causalmask_bias_dropout_dispatch {
       FmhaFwdShape<MaxK>,
       false, // kIsGroupMode
       FmhaMask,
+      FmhaBlockDropout,
       FmhaTraits>;
 
   static void Run(BatchedForwardParams& param, hipStream_t stream) {
@@ -88,7 +92,6 @@ struct batched_infer_causalmask_bias_dropout_dispatch {
                   kBiasEnum,
                   false, // kHasBiasGrad place-holder
                   false, // kStoreLSE
-                  kHasDropout,
                   false, // kDoFp8StaticQuant place-holder
                   occupancy>;
 
@@ -122,7 +125,6 @@ struct batched_infer_causalmask_bias_dropout_dispatch {
               kBiasEnum,
               false, // kHasBiasGrad place-holder
               false, // kStoreLSE
-              kHasDropout,
               false, // kDoFp8StaticQuant place-holder
               occupancy>;
 
@@ -196,7 +198,6 @@ struct batched_infer_causalmask_bias_dropout_dispatch {
           (param.custom_mask_type == 0) ? -1 : 0, // window_right_size
           param.custom_mask_type,
           param.dropout_prob, // dropout ratio
-          false, // is_store_randval
           {param.philox_seed, param.philox_offset});
     }();
 
diff --git a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_bwd_setting.h b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_bwd_setting.h
index 4ef24248a..9cd3c0e45 100644
--- a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_bwd_setting.h
+++ b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_bwd_setting.h
@@ -8,6 +8,7 @@
 
 #include <ck_tile/core.hpp>
 #include <ck_tile/ops/fmha.hpp>
+#include <ck_tile/ops/fmha/block/block_dropout.hpp>
 
 template <typename DataType>
 struct FmhaBwdTypeConfig;
@@ -25,7 +26,7 @@ struct FmhaBwdTypeConfig<ck_tile::fp16_t> {
   using DDataType = float;
   using ODataType = ck_tile::fp16_t;
   using OGradDataType = ck_tile::fp16_t;
-  using QGradDataType = ck_tile::fp16_t;
+  using QGradDataType = float;
   using KGradDataType = ck_tile::fp16_t;
   using VGradDataType = ck_tile::fp16_t;
   using BiasGradDataType = ck_tile::fp16_t;
@@ -44,7 +45,7 @@ struct FmhaBwdTypeConfig<ck_tile::bf16_t> {
   using DDataType = float;
   using ODataType = ck_tile::bf16_t;
   using OGradDataType = ck_tile::bf16_t;
-  using QGradDataType = ck_tile::bf16_t;
+  using QGradDataType = float;
   using KGradDataType = ck_tile::bf16_t;
   using VGradDataType = ck_tile::bf16_t;
   using BiasGradDataType = ck_tile::bf16_t;
@@ -55,15 +56,15 @@ struct FmhaBwdBlockTile;
 
 template <>
 struct FmhaBwdBlockTile<32> {
-  using type = ck_tile::sequence<128, 128, 32, 32, 32, 32, 32, 32, 32>;
-  using gemm02_warps = ck_tile::sequence<1, 4, 1>; // default for gemm0/gemm2
-  using gemm13_warps = ck_tile::sequence<4, 1, 1>; // default for gemm1/gemm3
-  using gemm4_warps = ck_tile::sequence<4, 1, 1>; // default for gemm4
+  using tile_lengths = ck_tile::sequence<64, 64, 32, 64, 32, 64, 64, 32, 32>;
+  using gemm02_warps = ck_tile::sequence<1, 2, 1>; // default for gemm0/gemm2
+  using gemm13_warps = ck_tile::sequence<2, 1, 1>; // default for gemm1/gemm3
+  using gemm4_warps = ck_tile::sequence<2, 1, 1>; // default for gemm4
 };
 
 template <>
 struct FmhaBwdBlockTile<64> {
-  using type = ck_tile::sequence<64, 128, 32, 32, 32, 32, 32, 64, 64>;
+  using tile_lengths = ck_tile::sequence<64, 128, 64, 64, 64, 64, 64, 64, 64>;
   using gemm02_warps = ck_tile::sequence<1, 4, 1>; // default for gemm0/gemm2
   using gemm13_warps = ck_tile::sequence<4, 1, 1>; // default for gemm1/gemm3
   using gemm4_warps = ck_tile::sequence<2, 2, 1>; // default for gemm4
@@ -71,78 +72,89 @@ struct FmhaBwdBlockTile<64> {
 
 template <>
 struct FmhaBwdBlockTile<128> {
-  using type = ck_tile::sequence<64, 128, 32, 32, 32, 32, 32, 128, 128>;
+  using tile_lengths =
+      ck_tile::sequence<32, 128, 128, 32, 128, 32, 32, 128, 128>;
   using gemm02_warps = ck_tile::sequence<1, 4, 1>; // default for gemm0/gemm2
   using gemm13_warps = ck_tile::sequence<4, 1, 1>; // default for gemm1/gemm3
-  using gemm4_warps = ck_tile::sequence<2, 2, 1>; // default for gemm4
+  using gemm4_warps = ck_tile::sequence<1, 4, 1>; // default for gemm4
 };
 
-using FmhaBwdWarpTile = ck_tile::sequence<32, 32, 16>;
+template <>
+struct FmhaBwdBlockTile<256> {
+  using tile_lengths =
+      ck_tile::sequence<16, 64, 256, 16, 256, 16, 32, 256, 256>;
+  using gemm02_warps = ck_tile::sequence<1, 4, 1>; // default for gemm0/gemm2
+  using gemm13_warps = ck_tile::sequence<4, 1, 1>; // default for gemm1/gemm3
+  using gemm4_warps = ck_tile::sequence<1, 4, 1>; // default for gemm4
+};
+
+using FmhaBwdWarpTile1 = ck_tile::sequence<32, 32, 16>;
+using FmhaBwdWarpTile2 = ck_tile::sequence<16, 16, 32>;
+using FmhaBwdWarpTile3 = ck_tile::sequence<16, 16, 16>;
 
 template <ck_tile::index_t MaxK>
 struct FmhaBwdShape;
 
 template <>
 struct FmhaBwdShape<32> : ck_tile::TileFmhaBwdShape<
-                              typename FmhaBwdBlockTile<32>::type,
+                              typename FmhaBwdBlockTile<32>::tile_lengths,
                               typename FmhaBwdBlockTile<32>::gemm02_warps,
-                              FmhaBwdWarpTile,
+                              FmhaBwdWarpTile1,
                               typename FmhaBwdBlockTile<32>::gemm13_warps,
-                              FmhaBwdWarpTile,
+                              FmhaBwdWarpTile1,
                               typename FmhaBwdBlockTile<32>::gemm02_warps,
-                              FmhaBwdWarpTile,
+                              FmhaBwdWarpTile1,
                               typename FmhaBwdBlockTile<32>::gemm13_warps,
-                              FmhaBwdWarpTile,
+                              FmhaBwdWarpTile1,
                               typename FmhaBwdBlockTile<32>::gemm4_warps,
-                              FmhaBwdWarpTile> {};
+                              FmhaBwdWarpTile1> {};
 
 template <>
 struct FmhaBwdShape<64> : ck_tile::TileFmhaBwdShape<
-                              typename FmhaBwdBlockTile<64>::type,
+                              typename FmhaBwdBlockTile<64>::tile_lengths,
                               typename FmhaBwdBlockTile<64>::gemm02_warps,
-                              FmhaBwdWarpTile,
+                              FmhaBwdWarpTile1,
                               typename FmhaBwdBlockTile<64>::gemm13_warps,
-                              FmhaBwdWarpTile,
+                              FmhaBwdWarpTile1,
                               typename FmhaBwdBlockTile<64>::gemm02_warps,
-                              FmhaBwdWarpTile,
+                              FmhaBwdWarpTile1,
                               typename FmhaBwdBlockTile<64>::gemm13_warps,
-                              FmhaBwdWarpTile,
+                              FmhaBwdWarpTile1,
                               typename FmhaBwdBlockTile<64>::gemm4_warps,
-                              FmhaBwdWarpTile> {};
+                              FmhaBwdWarpTile1> {};
 
 template <>
 struct FmhaBwdShape<128> : ck_tile::TileFmhaBwdShape<
-                               typename FmhaBwdBlockTile<128>::type,
+                               typename FmhaBwdBlockTile<128>::tile_lengths,
                                typename FmhaBwdBlockTile<128>::gemm02_warps,
-                               FmhaBwdWarpTile,
+                               FmhaBwdWarpTile1,
                                typename FmhaBwdBlockTile<128>::gemm13_warps,
-                               FmhaBwdWarpTile,
+                               FmhaBwdWarpTile1,
                                typename FmhaBwdBlockTile<128>::gemm02_warps,
-                               FmhaBwdWarpTile,
+                               FmhaBwdWarpTile1,
                                typename FmhaBwdBlockTile<128>::gemm13_warps,
-                               FmhaBwdWarpTile,
+                               FmhaBwdWarpTile1,
                                typename FmhaBwdBlockTile<128>::gemm4_warps,
-                               FmhaBwdWarpTile> {};
-
-template <ck_tile::index_t MaxK>
-struct FmhaBwdPipelineEnumSelector;
+                               FmhaBwdWarpTile1> {};
 
 template <>
-struct FmhaBwdPipelineEnumSelector<32> {
-  static constexpr ck_tile::BlockFmhaBwdPipelineEnum value =
-      ck_tile::BlockFmhaBwdPipelineEnum::QSKSVROGradS;
-};
+struct FmhaBwdShape<256> : ck_tile::TileFmhaBwdShape<
+                               typename FmhaBwdBlockTile<256>::tile_lengths,
+                               typename FmhaBwdBlockTile<256>::gemm02_warps,
+                               FmhaBwdWarpTile2,
+                               typename FmhaBwdBlockTile<256>::gemm13_warps,
+                               FmhaBwdWarpTile3,
+                               typename FmhaBwdBlockTile<256>::gemm02_warps,
+                               FmhaBwdWarpTile2,
+                               typename FmhaBwdBlockTile<256>::gemm13_warps,
+                               FmhaBwdWarpTile3,
+                               typename FmhaBwdBlockTile<256>::gemm4_warps,
+                               FmhaBwdWarpTile2> {};
 
-template <>
-struct FmhaBwdPipelineEnumSelector<64> {
-  static constexpr ck_tile::BlockFmhaBwdPipelineEnum value =
-      ck_tile::BlockFmhaBwdPipelineEnum::KSKTSVR;
-};
-
-template <>
-struct FmhaBwdPipelineEnumSelector<128> {
+template <ck_tile::index_t MaxK>
+struct FmhaBwdPipelineEnumSelector {
   static constexpr ck_tile::BlockFmhaBwdPipelineEnum value =
-      ck_tile::BlockFmhaBwdPipelineEnum::KSVR;
+      ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR;
 };
 
 template <ck_tile::BlockFmhaBwdPipelineEnum value, typename problem>
@@ -150,19 +162,23 @@ struct FmhaBwdPipelineMaker;
 
 template <typename problem>
 struct FmhaBwdPipelineMaker<
-    ck_tile::BlockFmhaBwdPipelineEnum::QSKSVROGradS,
+    ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR,
     problem> {
-  using pipeline = ck_tile::BlockFmhaBwdDQDKDVPipelineQSKSVROGradS<problem>;
+  using pipeline = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR<problem>;
 };
 
-template <typename problem>
-struct FmhaBwdPipelineMaker<
-    ck_tile::BlockFmhaBwdPipelineEnum::KSKTSVR,
-    problem> {
-  using pipeline = ck_tile::BlockFmhaBwdDQDKDVPipelineKSKTSVR<problem>;
+template <bool kHasDropout, ck_tile::index_t MaxK>
+struct FmhaBwdBlockDropoutMaker;
+
+template <ck_tile::index_t MaxK>
+struct FmhaBwdBlockDropoutMaker<false, MaxK> {
+  using dropout = ck_tile::BlockDropout<false, true, false>;
 };
 
-template <typename problem>
-struct FmhaBwdPipelineMaker<ck_tile::BlockFmhaBwdPipelineEnum::KSVR, problem> {
-  using pipeline = ck_tile::BlockFmhaBwdDQDKDVPipelineKSVR<problem>;
+template <ck_tile::index_t MaxK>
+struct FmhaBwdBlockDropoutMaker<true, MaxK> {
+  using FmhaBwdShapeType = FmhaBwdShape<MaxK>;
+  static constexpr bool IsWG32 =
+      (FmhaBwdShapeType::Gemm0WarpTile::at(ck_tile::number<0>{}) == 32);
+  using dropout = ck_tile::BlockDropout<true, IsWG32, false>;
 };
diff --git a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_fwd_setting.h b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_fwd_setting.h
index 662703b7e..4f3a18e26 100644
--- a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_fwd_setting.h
+++ b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_fwd_setting.h
@@ -8,6 +8,7 @@
 
 #include <ck_tile/core.hpp>
 #include <ck_tile/ops/fmha.hpp>
+#include <ck_tile/ops/fmha/block/block_dropout.hpp>
 
 template <typename DataType>
 struct FmhaFwdTypeConfig;
@@ -117,3 +118,19 @@ struct FmhaFwdShape<256> : ck_tile::TileFmhaShape<
                                typename FmhaFwdBlockTile<256>::gemm1_warps,
                                FmhaFwdWarpTile,
                                IsVLayoutRowMajor> {};
+
+template <bool kHasDropout, ck_tile::index_t MaxK>
+struct FmhaFwdBlockDropoutMaker;
+
+template <ck_tile::index_t MaxK>
+struct FmhaFwdBlockDropoutMaker<false, MaxK> {
+  using dropout = ck_tile::BlockDropout<false, true, false>;
+};
+
+template <ck_tile::index_t MaxK>
+struct FmhaFwdBlockDropoutMaker<true, MaxK> {
+  using FmhaFwdShapeType = FmhaFwdShape<MaxK>;
+  static constexpr bool IsWG32 =
+      (FmhaFwdShapeType::Gemm0WarpTile::at(ck_tile::number<0>{}) == 32);
+  using dropout = ck_tile::BlockDropout<true, IsWG32, false>;
+};
diff --git a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_backward.h b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_backward.h
index b5038fdfe..3e8fb35b8 100644
--- a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_backward.h
+++ b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_backward.h
@@ -23,6 +23,9 @@ template <
     bool kHasDropout,
     ck_tile::index_t MaxK>
 struct grouped_backward_causalmask_bias_dropout_dispatch {
+  using FmhaBlockDropout =
+      typename FmhaBwdBlockDropoutMaker<kHasDropout, MaxK>::dropout;
+
   template <typename FmhaTraits, typename FmhaMask>
   using FmhaBwdPipelineProblemTemp = ck_tile::BlockFmhaBwdPipelineProblem<
       typename FmhaBwdTypeConfig<ScalarType>::QDataType,
@@ -42,12 +45,18 @@ struct grouped_backward_causalmask_bias_dropout_dispatch {
       typename FmhaBwdTypeConfig<ScalarType>::BiasGradDataType,
       FmhaBwdShape<MaxK>,
       true, // kIsGroupMode
+      false, // non-deterministic
       FmhaMask,
+      FmhaBlockDropout,
       FmhaTraits>;
 
+  static constexpr bool NeedConvertGradQ = !std::is_same<
+      ScalarType,
+      typename FmhaBwdTypeConfig<ScalarType>::QGradDataType>::value;
+
   static void Run(GroupedBackwardParams& param, hipStream_t stream) {
     {
-      constexpr ck_tile::index_t kBlockSize = 256;
+      constexpr ck_tile::index_t kBlockSize = 64;
       bool pad_seqlen_q = !(param.M % kBlockSize == 0);
       bool pad_headdim_v = !(param.Kv % FmhaBwdShape<MaxK>::kVHeaddim == 0);
 
@@ -74,9 +83,8 @@ struct grouped_backward_causalmask_bias_dropout_dispatch {
                 typename ck_tile::BlockFmhaBwdOGradDotO<
                     FmhaBwdOGradDotOPipelineProblem>;
 
-            using FmhaBwdOGradDotOKernel_ = ck_tile::FmhaBwdOGradDotOKernel<
-                ck_tile::FmhaBwdOGradDotOTilePartitioner<kBlockSize>,
-                FmhaBwdOGradDotOPipeline_>;
+            using FmhaBwdOGradDotOKernel_ =
+                ck_tile::FmhaBwdOGradDotOKernel<FmhaBwdOGradDotOPipeline_>;
 
             RunWithBwdOGradDotOKernel<FmhaBwdOGradDotOKernel_>(param, stream);
           });
@@ -92,10 +100,6 @@ struct grouped_backward_causalmask_bias_dropout_dispatch {
 
         using FmhaMask = ck_tile::SimplifiedGenericAttentionMask<has_masking>;
 
-        using FmhaBwdShape_ = FmhaBwdShape<MaxK>;
-        using FmhaBwdTilePartitioner_ =
-            ck_tile::FmhaBwdTilePartitioner<FmhaBwdShape_>;
-
         constexpr auto kBiasEnum = kHasBias
             ? ck_tile::BlockAttentionBiasEnum::ELEMENTWISE_BIAS
             : ck_tile::BlockAttentionBiasEnum::NO_BIAS;
@@ -103,8 +107,10 @@ struct grouped_backward_causalmask_bias_dropout_dispatch {
         constexpr bool kPadSeqLenQ = true;
         constexpr bool kPadSeqLenK = true;
 
-        const bool pad_headdim_q = !(param.K % FmhaBwdShape_::kQKHeaddim == 0);
-        const bool pad_headdim_v = !(param.Kv % FmhaBwdShape_::kVHeaddim == 0);
+        const bool pad_headdim_q =
+            !(param.K % FmhaBwdShape<MaxK>::kQKHeaddim == 0);
+        const bool pad_headdim_v =
+            !(param.Kv % FmhaBwdShape<MaxK>::kVHeaddim == 0);
 
         // usually headdim_q and headdim_v are same, consider them together
         // to determine whether to do padding saving some compiling time
@@ -119,7 +125,6 @@ struct grouped_backward_causalmask_bias_dropout_dispatch {
               kBiasEnum,
               kHasBiasGrad,
               false, // kStoreLSE
-              kHasDropout,
               false, // kDoFp8StaticQuant place-holder
               occupancy>;
 
@@ -148,7 +153,6 @@ struct grouped_backward_causalmask_bias_dropout_dispatch {
                   kPadHeadDim>>;
 
           using FmhaBwdDQDKDVKernel_ = ck_tile::FmhaBwdDQDKDVKernel<
-              FmhaBwdTilePartitioner_,
               FmhaBwdPipeline_,
               FmhaBwdKGradEpilogue_,
               FmhaBwdVGradEpilogue_>;
@@ -157,6 +161,47 @@ struct grouped_backward_causalmask_bias_dropout_dispatch {
         });
       });
     };
+
+    if constexpr (NeedConvertGradQ) {
+      constexpr ck_tile::index_t kBlockSize = 256;
+
+      const bool pad_seqlen_q = true;
+      const bool pad_headdim_q =
+          !(param.K % FmhaBwdShape<MaxK>::kQKHeaddim == 0);
+
+      BOOL_SWITCH_2(
+          pad_seqlen_q, kPadSeqLenQ, pad_headdim_q, kPadHeadDimQ, [&] {
+            constexpr ck_tile::index_t occupancy = 2;
+
+            using FmhaBwdConvertQGradTraits_ =
+                ck_tile::TileFmhaBwdConvertQGradTraits<
+                    kPadSeqLenQ,
+                    kPadHeadDimQ,
+                    occupancy>;
+
+            using FmhaBwdConvertQGradPipelineProblem =
+                ck_tile::BlockFmhaBwdConvertQGradPipelineProblem<
+                    typename FmhaBwdTypeConfig<ScalarType>::AccDataType,
+                    typename FmhaBwdTypeConfig<ScalarType>::QGradDataType,
+                    kBlockSize,
+                    64, // kM0
+                    1, // kN0, no use
+                    FmhaBwdShape<MaxK>::kQKHeaddim,
+                    true, // kIsGroupMode
+                    false, // kIsDeterministic
+                    FmhaBwdConvertQGradTraits_>;
+
+            using FmhaBwdConvertQGradPipeline =
+                typename ck_tile::BlockFmhaBwdConvertQGrad<
+                    FmhaBwdConvertQGradPipelineProblem>;
+
+            using FmhaBwdConvertQGradKernel_ =
+                ck_tile::FmhaBwdConvertQGradKernel<FmhaBwdConvertQGradPipeline>;
+
+            RunWithBwdConvertQGradKernel<FmhaBwdConvertQGradKernel_>(
+                param, stream);
+          });
+    };
   }
 
   template <typename FmhaBwdOGradDotOKernel>
@@ -205,10 +250,10 @@ struct grouped_backward_causalmask_bias_dropout_dispatch {
           param.grad_out_ptr,
           param.dot_out_ptr,
           nullptr, // randval_ptr
-          param.grad_q_ptr,
           param.grad_k_ptr,
           param.grad_v_ptr,
           param.grad_bias_ptr,
+          NeedConvertGradQ ? param.grad_q_f32_ptr : param.grad_q_ptr,
           param.seqstart_q_dev_ptr,
           param.seqstart_k_dev_ptr,
           param.seqlen_k_dev_ptr,
@@ -239,12 +284,12 @@ struct grouped_backward_causalmask_bias_dropout_dispatch {
           param.attn_bias_strides[0], // assume grad_bias has same strides as
                                       // bias
           param.lsed_strides[0], // batch_stride_lse
+          0, // split_stride_dq_acc
           (param.window_size > 0) ? param.window_size - 1
                                   : -1, // window_left_size
           (param.custom_mask_type == 0) ? -1 : 0, // window_right_size
           param.custom_mask_type,
           param.dropout_prob, // dropout ratio
-          false, // is_store_randval
           {param.philox_seed, param.philox_offset});
     }();
 
@@ -258,6 +303,34 @@ struct grouped_backward_causalmask_bias_dropout_dispatch {
         ck_tile::make_kernel<kBlockSize.x, kBlockPerCu>(
             FmhaBwdDQDKDVKernel{}, kGridSize, kBlockSize, 0, kargs));
   }
+
+  template <typename FmhaBwdConvertQGradKernel>
+  static void RunWithBwdConvertQGradKernel(
+      GroupedBackwardParams& param,
+      hipStream_t stream) {
+    const auto kargs = [&] {
+      return FmhaBwdConvertQGradKernel::MakeKargs(
+          param.grad_q_f32_ptr,
+          param.grad_q_ptr,
+          param.seqstart_q_dev_ptr,
+          param.seqstart_k_dev_ptr,
+          param.K, // headdim of q/k
+          param.q_strides[1],
+          param.q_strides[2],
+          0);
+    }();
+
+    dim3 kGridSize = FmhaBwdConvertQGradKernel::GridSize(
+        param.num_batches, param.Hq, param.max_seqlen_q);
+    constexpr dim3 kBlockSize = FmhaBwdConvertQGradKernel::BlockSize();
+    constexpr ck_tile::index_t kBlockPerCu =
+        FmhaBwdConvertQGradKernel::kBlockPerCu;
+
+    (void)ck_tile::launch_kernel(
+        ck_tile::stream_config{stream, false},
+        ck_tile::make_kernel<kBlockSize.x, kBlockPerCu>(
+            FmhaBwdConvertQGradKernel{}, kGridSize, kBlockSize, 0, kargs));
+  }
 };
 
 template <
diff --git a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_forward.h b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_forward.h
index 2fa305e0a..8f0bf95b9 100644
--- a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_forward.h
+++ b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_forward.h
@@ -22,6 +22,9 @@ template <
     bool kHasDropout,
     ck_tile::index_t MaxK>
 struct grouped_forward_causalmask_bias_dropout_dispatch {
+  using FmhaBlockDropout =
+      typename FmhaFwdBlockDropoutMaker<kHasDropout, MaxK>::dropout;
+
   template <typename FmhaTraits, typename FmhaMask>
   using FmhaPipelineProblemTemp = ck_tile::BlockFmhaPipelineProblem<
       typename FmhaFwdTypeConfig<ScalarType>::QDataType,
@@ -38,6 +41,7 @@ struct grouped_forward_causalmask_bias_dropout_dispatch {
       FmhaFwdShape<MaxK>,
       true, // kIsGroupMode
       FmhaMask,
+      FmhaBlockDropout,
       FmhaTraits>;
 
   static void Run(GroupedForwardParams& param, hipStream_t stream) {
@@ -75,7 +79,6 @@ struct grouped_forward_causalmask_bias_dropout_dispatch {
                 kBiasEnum,
                 false, // kHasBiasGrad place-holder
                 true, // kStoreLSE
-                kHasDropout,
                 false, // kDoFp8StaticQuant place-holder
                 occupancy>;
 
@@ -158,7 +161,6 @@ struct grouped_forward_causalmask_bias_dropout_dispatch {
           (param.custom_mask_type == 0) ? -1 : 0, // window_right_size
           param.custom_mask_type,
           param.dropout_prob,
-          false, // is_store_randval
           {param.philox_seed, param.philox_offset});
     }();
 
diff --git a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_infer.h b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_infer.h
index 5197a6cb1..0946bdece 100644
--- a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_infer.h
+++ b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_infer.h
@@ -23,6 +23,9 @@ template <
     bool kHasDropout,
     ck_tile::index_t MaxK>
 struct grouped_infer_causalmask_bias_dropout_dispatch {
+  using FmhaBlockDropout =
+      typename FmhaFwdBlockDropoutMaker<kHasDropout, MaxK>::dropout;
+
   template <typename FmhaTraits, typename FmhaMask>
   using FmhaPipelineProblemTemp = ck_tile::BlockFmhaPipelineProblem<
       typename FmhaFwdTypeConfig<ScalarType>::QDataType,
@@ -39,6 +42,7 @@ struct grouped_infer_causalmask_bias_dropout_dispatch {
       FmhaFwdShape<MaxK>,
       true, // kIsGroupMode
       FmhaMask,
+      FmhaBlockDropout,
       FmhaTraits>;
 
   static void Run(GroupedForwardParams& param, hipStream_t stream) {
@@ -76,7 +80,6 @@ struct grouped_infer_causalmask_bias_dropout_dispatch {
                   kBiasEnum,
                   false, // kHasBiasGrad place-holder
                   false, // kStoreLSE
-                  kHasDropout,
                   false, // kDoFp8StaticQuant place-holder
                   occupancy>;
 
@@ -123,7 +126,6 @@ struct grouped_infer_causalmask_bias_dropout_dispatch {
             kBiasEnum,
             false, // kHasBiasGrad place-holder
             false, // kStoreLSE
-            kHasDropout,
             false, // kDoFp8StaticQuant place-holder
             occupancy>;
 
@@ -202,7 +204,6 @@ struct grouped_infer_causalmask_bias_dropout_dispatch {
           (param.custom_mask_type == 0) ? -1 : 0, // window_right_size
           param.custom_mask_type,
           param.dropout_prob,
-          false, // is_store_randval
           {param.philox_seed, param.philox_offset});
     }();
 
diff --git a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_params.h b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_params.h
index e97db1e86..4b40730e9 100644
--- a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_params.h
+++ b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_params.h
@@ -150,6 +150,8 @@ struct BatchedBackwardParams {
   void* grad_v_ptr;
   void* grad_bias_ptr;
 
+  void* grad_q_f32_ptr;
+
   float dropout_prob;
   int64_t philox_seed;
   int64_t philox_offset;
@@ -211,6 +213,8 @@ struct GroupedBackwardParams {
   void* grad_v_ptr;
   void* grad_bias_ptr;
 
+  void* grad_q_f32_ptr;
+
   float dropout_prob;
   int64_t philox_seed;
   int64_t philox_offset;
diff --git a/xformers/csrc/attention/hip_fmha/ck_tiled_rand_uniform_kernel.h b/xformers/csrc/attention/hip_fmha/ck_tiled_rand_uniform_kernel.h
index e930e0b82..4bcb8dd05 100644
--- a/xformers/csrc/attention/hip_fmha/ck_tiled_rand_uniform_kernel.h
+++ b/xformers/csrc/attention/hip_fmha/ck_tiled_rand_uniform_kernel.h
@@ -5,6 +5,7 @@
 
 #include <ck_tile/core.hpp>
 #include <ck_tile/ops/fmha.hpp>
+#include <ck_tile/ops/fmha/block/block_dropout.hpp>
 #include <ck_tile/ops/gemm/block/block_gemm_areg_bsmem_creg_v2.hpp>
 #include <ck_tile/ops/gemm/pipeline/block_gemm_pipeline_problem.hpp>
 
@@ -34,6 +35,8 @@ struct FmhaRandUniformKernel {
 
   using BlockGemm = decltype(GetBlockGemm());
 
+  using MyBlockDropout = ck_tile::BlockDropout<true, true, false>;
+
   static constexpr bool kPadSeqLenQ = true;
   static constexpr bool kPadSeqLenK = true;
 
@@ -170,7 +173,7 @@ struct FmhaRandUniformKernel {
   }
 
   __device__ static constexpr ck_tile::index_t GetSmemSize() {
-    return ck_tile::BlockDropout::MakeRandValLdsBlockDescriptor<BlockGemm>()
+    return MyBlockDropout::MakeRandValLdsBlockDescriptor<BlockGemm>()
         .get_element_space_size();
   }
 
@@ -182,7 +185,7 @@ struct FmhaRandUniformKernel {
       RandValDramBlockWindowTmp& randval_dram_block_window_tmp) const {
     using namespace ck_tile;
 
-    auto randval_dram_window = BlockDropout::MakeRandvalDramWindow<BlockGemm>(
+    auto randval_dram_window = MyBlockDropout::MakeRandvalDramWindow<BlockGemm>(
         randval_dram_block_window_tmp, 0);
 
     const auto num_total_loop =
@@ -201,17 +204,17 @@ struct FmhaRandUniformKernel {
       // randval tile in LDS
       auto randval_lds = make_tensor_view<address_space_enum::lds>(
           reinterpret_cast<uint8_t*>(randval_smem_ptr),
-          BlockDropout::MakeRandValLdsBlockDescriptor<BlockGemm>());
+          MyBlockDropout::MakeRandValLdsBlockDescriptor<BlockGemm>());
 
       auto randval_lds_window = make_tile_window(
           randval_lds,
-          BlockDropout::MakeRandValLdsBlockDescriptor<BlockGemm>()
+          MyBlockDropout::MakeRandValLdsBlockDescriptor<BlockGemm>()
               .get_lengths(),
           {0, 0});
 
       // register distribute
       auto randval_dist_generated = make_static_distributed_tensor<uint8_t>(
-          BlockDropout::MakeRandValTileDistribution<BlockGemm>());
+          MyBlockDropout::MakeRandValTileDistribution<BlockGemm>());
 
       static_assert(randval_dist_generated.kThreadElementSpaceSize == 16);
 
@@ -219,7 +222,7 @@ struct FmhaRandUniformKernel {
           randval_lds_window.get_bottom_tensor_view(),
           randval_lds_window.get_window_lengths(),
           randval_lds_window.get_window_origin(),
-          BlockDropout::MakeRandValLdsShuffleTileDistribution<BlockGemm>());
+          MyBlockDropout::MakeRandValLdsShuffleTileDistribution<BlockGemm>());
 
       const int start_m0_idx =
           randval_dram_window.get_window_origin().at(number<0>{});

From 22fce7e7fe7a82c856e6763ccc59e41f72dcf1e1 Mon Sep 17 00:00:00 2001
From: Qianfeng Zhang <Qianfeng.Zhang@amd.com>
Date: Tue, 23 Jul 2024 21:17:13 +0000
Subject: [PATCH 582/641] Update to get 80% of the test_backward and
 test_dropout_backward_ck cases passed

---
 tests/test_mem_eff_attention.py                   | 12 ++++--------
 .../attention_backward_generic_ck_tiled.cpp       | 15 ++++++++++-----
 .../hip_fmha/ck_tiled_fmha_batched_backward.h     |  2 +-
 .../hip_fmha/ck_tiled_fmha_bwd_setting.h          |  4 ++--
 .../hip_fmha/ck_tiled_fmha_grouped_backward.h     |  2 +-
 .../attention/hip_fmha/ck_tiled_headdim_switch.h  |  3 +++
 xformers/ops/fmha/ck.py                           |  3 ++-
 7 files changed, 23 insertions(+), 18 deletions(-)

diff --git a/tests/test_mem_eff_attention.py b/tests/test_mem_eff_attention.py
index 7f511bfac..d42d4cc22 100644
--- a/tests/test_mem_eff_attention.py
+++ b/tests/test_mem_eff_attention.py
@@ -705,16 +705,12 @@ def test_backward(
 
     if op_bw == fmha.ck.BwOp:
         op_fw = fmha.ck.FwOp
-        if dtype == torch.bfloat16:
-            pytest.skip(
-                "CK Fmha backward for bfloat16 currently is not very accurate for some cases!"
-            )
+        ##if dtype == torch.bfloat16:
+        ##    pytest.skip(
+        ##        "CK Fmha backward for bfloat16 currently is not very accurate for some cases!"
+        ##    )
         if grad_out_contiguous is False:
             pytest.skip("CK Fmha does not support contiguous layout for grad_out!")
-        if k % 2 != 0:
-            pytest.skip(
-                "CK Fmha currently requires the headdim size of query input be an even value!"
-            )
 
     qkv = None
 
diff --git a/xformers/csrc/attention/hip_fmha/attention_backward_generic_ck_tiled.cpp b/xformers/csrc/attention/hip_fmha/attention_backward_generic_ck_tiled.cpp
index e02a21588..ce7711f50 100644
--- a/xformers/csrc/attention/hip_fmha/attention_backward_generic_ck_tiled.cpp
+++ b/xformers/csrc/attention/hip_fmha/attention_backward_generic_ck_tiled.cpp
@@ -122,10 +122,6 @@ efficient_attention_backward_ck(
   int64_t K = query.size(3);
   int64_t Kv = value.size(3);
 
-  if (K % 2 != 0)
-    throw std::runtime_error(
-        "Currently CK Fmha requires the headdim of query/key be an even value!");
-
   auto opts = query.options();
 
   at::Tensor grad_q, grad_k, grad_v, grad_bias;
@@ -166,7 +162,8 @@ efficient_attention_backward_ck(
 
   if (query.scalar_type() == at::ScalarType::BFloat16 ||
       query.scalar_type() == at::ScalarType::Half) {
-    grad_q_f32 = at::empty_like(grad_q);
+    grad_q_f32 = at::empty_strided(
+        grad_q.sizes(), grad_q.strides(), opts.dtype(at::kFloat));
     grad_q_f32.fill_(0);
   } else {
     grad_q.fill_(0);
@@ -534,6 +531,14 @@ efficient_attention_backward_ck(
     grad_v = tmp_grad_v_view.sum(3);
   }
 
+  /*
+    if (inDataType == at::ScalarType::Half)
+      grad_q = grad_q_f32.to(torch::kFloat16);
+
+    if (inDataType == at::ScalarType::BFloat16)
+      grad_q = grad_q_f32.to(torch::kBFloat16);
+  */
+
   return std::make_tuple(grad_q, grad_k, grad_v, grad_bias);
 }
 
diff --git a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_backward.h b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_backward.h
index ed1fd8aaa..a4ac28eb5 100644
--- a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_backward.h
+++ b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_backward.h
@@ -51,7 +51,7 @@ struct batched_backward_causalmask_bias_dropout_dispatch {
       FmhaTraits>;
 
   static constexpr bool NeedConvertGradQ = !std::is_same<
-      ScalarType,
+      typename FmhaBwdTypeConfig<ScalarType>::AccDataType,
       typename FmhaBwdTypeConfig<ScalarType>::QGradDataType>::value;
 
   static void Run(BatchedBackwardParams& param, hipStream_t stream) {
diff --git a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_bwd_setting.h b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_bwd_setting.h
index 9cd3c0e45..9aa4b8f0d 100644
--- a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_bwd_setting.h
+++ b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_bwd_setting.h
@@ -26,7 +26,7 @@ struct FmhaBwdTypeConfig<ck_tile::fp16_t> {
   using DDataType = float;
   using ODataType = ck_tile::fp16_t;
   using OGradDataType = ck_tile::fp16_t;
-  using QGradDataType = float;
+  using QGradDataType = ck_tile::fp16_t;
   using KGradDataType = ck_tile::fp16_t;
   using VGradDataType = ck_tile::fp16_t;
   using BiasGradDataType = ck_tile::fp16_t;
@@ -45,7 +45,7 @@ struct FmhaBwdTypeConfig<ck_tile::bf16_t> {
   using DDataType = float;
   using ODataType = ck_tile::bf16_t;
   using OGradDataType = ck_tile::bf16_t;
-  using QGradDataType = float;
+  using QGradDataType = ck_tile::bf16_t;
   using KGradDataType = ck_tile::bf16_t;
   using VGradDataType = ck_tile::bf16_t;
   using BiasGradDataType = ck_tile::bf16_t;
diff --git a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_backward.h b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_backward.h
index 3e8fb35b8..3b6fa7581 100644
--- a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_backward.h
+++ b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_backward.h
@@ -51,7 +51,7 @@ struct grouped_backward_causalmask_bias_dropout_dispatch {
       FmhaTraits>;
 
   static constexpr bool NeedConvertGradQ = !std::is_same<
-      ScalarType,
+      typename FmhaBwdTypeConfig<ScalarType>::AccDataType,
       typename FmhaBwdTypeConfig<ScalarType>::QGradDataType>::value;
 
   static void Run(GroupedBackwardParams& param, hipStream_t stream) {
diff --git a/xformers/csrc/attention/hip_fmha/ck_tiled_headdim_switch.h b/xformers/csrc/attention/hip_fmha/ck_tiled_headdim_switch.h
index 18814324b..3e435a646 100644
--- a/xformers/csrc/attention/hip_fmha/ck_tiled_headdim_switch.h
+++ b/xformers/csrc/attention/hip_fmha/ck_tiled_headdim_switch.h
@@ -39,6 +39,9 @@
     } else if (HEAD_DIM1 <= 128 && HEAD_DIM2 <= 128) {                 \
       constexpr ck_tile::index_t CONST_NAME = 128;                     \
       __VA_ARGS__();                                                   \
+    } else if (HEAD_DIM1 <= 256 && HEAD_DIM2 <= 256) {                 \
+      constexpr ck_tile::index_t CONST_NAME = 256;                     \
+      __VA_ARGS__();                                                   \
     } else {                                                           \
       throw std::runtime_error("Head-dim sizes not supported!");       \
     }                                                                  \
diff --git a/xformers/ops/fmha/ck.py b/xformers/ops/fmha/ck.py
index be061cf5a..2de81623c 100644
--- a/xformers/ops/fmha/ck.py
+++ b/xformers/ops/fmha/ck.py
@@ -344,7 +344,7 @@ class BwOp(AttentionBwOpBase):
     OPERATOR = get_operator("xformers", "efficient_attention_backward_ck")
     SUPPORTED_DEVICES = FwOp.SUPPORTED_DEVICES
     SUPPORTED_DTYPES = FwOp.SUPPORTED_DTYPES
-    SUPPORTED_MAX_K = 128
+    SUPPORTED_MAX_K = 256 
     SUPPORTED_ATTN_BIAS_TYPES: Iterable[Any] = (
         type(None),
         torch.Tensor,
@@ -368,6 +368,7 @@ class BwOp(AttentionBwOpBase):
         32,  # 64x64 kernel
         64,
         128,  # 64x128/128x128 kernel
+        256, 
     ]
 
     @classmethod

From 463a47550bf1d312bbc269941911047f7154893d Mon Sep 17 00:00:00 2001
From: Qianfeng Zhang <Qianfeng.Zhang@amd.com>
Date: Thu, 25 Jul 2024 10:18:05 +0000
Subject: [PATCH 583/641] Replace the using of ConvertGradQ by using torch
 tensor type converting

---
 .../attention_backward_generic_ck_tiled.cpp   | 10 +--
 .../hip_fmha/ck_tiled_fmha_batched_backward.h | 83 ++++++++++---------
 .../hip_fmha/ck_tiled_fmha_grouped_backward.h | 82 +++++++++---------
 3 files changed, 88 insertions(+), 87 deletions(-)

diff --git a/xformers/csrc/attention/hip_fmha/attention_backward_generic_ck_tiled.cpp b/xformers/csrc/attention/hip_fmha/attention_backward_generic_ck_tiled.cpp
index ce7711f50..671540dcb 100644
--- a/xformers/csrc/attention/hip_fmha/attention_backward_generic_ck_tiled.cpp
+++ b/xformers/csrc/attention/hip_fmha/attention_backward_generic_ck_tiled.cpp
@@ -531,13 +531,11 @@ efficient_attention_backward_ck(
     grad_v = tmp_grad_v_view.sum(3);
   }
 
-  /*
-    if (inDataType == at::ScalarType::Half)
-      grad_q = grad_q_f32.to(torch::kFloat16);
+  if (inDataType == at::ScalarType::Half)
+    grad_q = grad_q_f32.to(torch::kFloat16);
 
-    if (inDataType == at::ScalarType::BFloat16)
-      grad_q = grad_q_f32.to(torch::kBFloat16);
-  */
+  if (inDataType == at::ScalarType::BFloat16)
+    grad_q = grad_q_f32.to(torch::kBFloat16);
 
   return std::make_tuple(grad_q, grad_k, grad_v, grad_bias);
 }
diff --git a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_backward.h b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_backward.h
index a4ac28eb5..98afe782b 100644
--- a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_backward.h
+++ b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_backward.h
@@ -162,47 +162,48 @@ struct batched_backward_causalmask_bias_dropout_dispatch {
         });
       });
     };
-
-    if constexpr (NeedConvertGradQ) {
-      constexpr ck_tile::index_t kBlockSize = 256;
-
-      const bool pad_seqlen_q = !(param.M % kBlockSize == 0);
-      const bool pad_headdim_q =
-          !(param.K % FmhaBwdShape<MaxK>::kQKHeaddim == 0);
-
-      BOOL_SWITCH_2(
-          pad_seqlen_q, kPadSeqLenQ, pad_headdim_q, kPadHeadDimQ, [&] {
-            constexpr ck_tile::index_t occupancy = 2;
-
-            using FmhaBwdConvertQGradTraits_ =
-                ck_tile::TileFmhaBwdConvertQGradTraits<
-                    kPadSeqLenQ,
-                    kPadHeadDimQ,
-                    occupancy>;
-
-            using FmhaBwdConvertQGradPipelineProblem =
-                ck_tile::BlockFmhaBwdConvertQGradPipelineProblem<
-                    typename FmhaBwdTypeConfig<ScalarType>::AccDataType,
-                    typename FmhaBwdTypeConfig<ScalarType>::QGradDataType,
-                    kBlockSize,
-                    FmhaBwdShape<MaxK>::kM0,
-                    FmhaBwdShape<MaxK>::kN0,
-                    FmhaBwdShape<MaxK>::kQKHeaddim,
-                    false, // kIsGroupMode
-                    false, // kIsDeterministic
-                    FmhaBwdConvertQGradTraits_>;
-
-            using FmhaBwdConvertQGradPipeline =
-                typename ck_tile::BlockFmhaBwdConvertQGrad<
-                    FmhaBwdConvertQGradPipelineProblem>;
-
-            using FmhaBwdConvertQGradKernel_ =
-                ck_tile::FmhaBwdConvertQGradKernel<FmhaBwdConvertQGradPipeline>;
-
-            RunWithBwdConvertQGradKernel<FmhaBwdConvertQGradKernel_>(
-                param, stream);
-          });
-    };
+    /*
+        if constexpr (NeedConvertGradQ) {
+          constexpr ck_tile::index_t kBlockSize = 256;
+
+          const bool pad_seqlen_q = !(param.M % kBlockSize == 0);
+          const bool pad_headdim_q =
+              !(param.K % FmhaBwdShape<MaxK>::kQKHeaddim == 0);
+
+          BOOL_SWITCH_2(
+              pad_seqlen_q, kPadSeqLenQ, pad_headdim_q, kPadHeadDimQ, [&] {
+                constexpr ck_tile::index_t occupancy = 2;
+
+                using FmhaBwdConvertQGradTraits_ =
+                    ck_tile::TileFmhaBwdConvertQGradTraits<
+                        kPadSeqLenQ,
+                        kPadHeadDimQ,
+                        occupancy>;
+
+                using FmhaBwdConvertQGradPipelineProblem =
+                    ck_tile::BlockFmhaBwdConvertQGradPipelineProblem<
+                        typename FmhaBwdTypeConfig<ScalarType>::AccDataType,
+                        typename FmhaBwdTypeConfig<ScalarType>::QGradDataType,
+                        kBlockSize,
+                        FmhaBwdShape<MaxK>::kM0,
+                        FmhaBwdShape<MaxK>::kN0,
+                        FmhaBwdShape<MaxK>::kQKHeaddim,
+                        false, // kIsGroupMode
+                        false, // kIsDeterministic
+                        FmhaBwdConvertQGradTraits_>;
+
+                using FmhaBwdConvertQGradPipeline =
+                    typename ck_tile::BlockFmhaBwdConvertQGrad<
+                        FmhaBwdConvertQGradPipelineProblem>;
+
+                using FmhaBwdConvertQGradKernel_ =
+                    ck_tile::FmhaBwdConvertQGradKernel<FmhaBwdConvertQGradPipeline>;
+
+                RunWithBwdConvertQGradKernel<FmhaBwdConvertQGradKernel_>(
+                    param, stream);
+              });
+        };
+    */
   }
 
   template <typename FmhaBwdOGradDotOKernel>
diff --git a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_backward.h b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_backward.h
index 3b6fa7581..76c5eb66f 100644
--- a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_backward.h
+++ b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_backward.h
@@ -162,46 +162,48 @@ struct grouped_backward_causalmask_bias_dropout_dispatch {
       });
     };
 
-    if constexpr (NeedConvertGradQ) {
-      constexpr ck_tile::index_t kBlockSize = 256;
-
-      const bool pad_seqlen_q = true;
-      const bool pad_headdim_q =
-          !(param.K % FmhaBwdShape<MaxK>::kQKHeaddim == 0);
-
-      BOOL_SWITCH_2(
-          pad_seqlen_q, kPadSeqLenQ, pad_headdim_q, kPadHeadDimQ, [&] {
-            constexpr ck_tile::index_t occupancy = 2;
-
-            using FmhaBwdConvertQGradTraits_ =
-                ck_tile::TileFmhaBwdConvertQGradTraits<
-                    kPadSeqLenQ,
-                    kPadHeadDimQ,
-                    occupancy>;
-
-            using FmhaBwdConvertQGradPipelineProblem =
-                ck_tile::BlockFmhaBwdConvertQGradPipelineProblem<
-                    typename FmhaBwdTypeConfig<ScalarType>::AccDataType,
-                    typename FmhaBwdTypeConfig<ScalarType>::QGradDataType,
-                    kBlockSize,
-                    64, // kM0
-                    1, // kN0, no use
-                    FmhaBwdShape<MaxK>::kQKHeaddim,
-                    true, // kIsGroupMode
-                    false, // kIsDeterministic
-                    FmhaBwdConvertQGradTraits_>;
-
-            using FmhaBwdConvertQGradPipeline =
-                typename ck_tile::BlockFmhaBwdConvertQGrad<
-                    FmhaBwdConvertQGradPipelineProblem>;
-
-            using FmhaBwdConvertQGradKernel_ =
-                ck_tile::FmhaBwdConvertQGradKernel<FmhaBwdConvertQGradPipeline>;
-
-            RunWithBwdConvertQGradKernel<FmhaBwdConvertQGradKernel_>(
-                param, stream);
-          });
-    };
+    /*
+        if constexpr (NeedConvertGradQ) {
+          constexpr ck_tile::index_t kBlockSize = 256;
+
+          const bool pad_seqlen_q = true;
+          const bool pad_headdim_q =
+              !(param.K % FmhaBwdShape<MaxK>::kQKHeaddim == 0);
+
+          BOOL_SWITCH_2(
+              pad_seqlen_q, kPadSeqLenQ, pad_headdim_q, kPadHeadDimQ, [&] {
+                constexpr ck_tile::index_t occupancy = 2;
+
+                using FmhaBwdConvertQGradTraits_ =
+                    ck_tile::TileFmhaBwdConvertQGradTraits<
+                        kPadSeqLenQ,
+                        kPadHeadDimQ,
+                        occupancy>;
+
+                using FmhaBwdConvertQGradPipelineProblem =
+                    ck_tile::BlockFmhaBwdConvertQGradPipelineProblem<
+                        typename FmhaBwdTypeConfig<ScalarType>::AccDataType,
+                        typename FmhaBwdTypeConfig<ScalarType>::QGradDataType,
+                        kBlockSize,
+                        64, // kM0
+                        1, // kN0, no use
+                        FmhaBwdShape<MaxK>::kQKHeaddim,
+                        true, // kIsGroupMode
+                        false, // kIsDeterministic
+                        FmhaBwdConvertQGradTraits_>;
+
+                using FmhaBwdConvertQGradPipeline =
+                    typename ck_tile::BlockFmhaBwdConvertQGrad<
+                        FmhaBwdConvertQGradPipelineProblem>;
+
+                using FmhaBwdConvertQGradKernel_ =
+                    ck_tile::FmhaBwdConvertQGradKernel<FmhaBwdConvertQGradPipeline>;
+
+                RunWithBwdConvertQGradKernel<FmhaBwdConvertQGradKernel_>(
+                    param, stream);
+              });
+        };
+    */
   }
 
   template <typename FmhaBwdOGradDotOKernel>

From 3427a6f1f3bacb33aabcaaf48965aff873867ea9 Mon Sep 17 00:00:00 2001
From: Qianfeng Zhang <Qianfeng.Zhang@amd.com>
Date: Thu, 25 Jul 2024 10:19:20 +0000
Subject: [PATCH 584/641] Change the tile settings for MaxK=32

---
 .../hip_fmha/ck_tiled_fmha_bwd_setting.h       | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_bwd_setting.h b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_bwd_setting.h
index 9aa4b8f0d..d5d15c05d 100644
--- a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_bwd_setting.h
+++ b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_bwd_setting.h
@@ -56,10 +56,10 @@ struct FmhaBwdBlockTile;
 
 template <>
 struct FmhaBwdBlockTile<32> {
-  using tile_lengths = ck_tile::sequence<64, 64, 32, 64, 32, 64, 64, 32, 32>;
-  using gemm02_warps = ck_tile::sequence<1, 2, 1>; // default for gemm0/gemm2
-  using gemm13_warps = ck_tile::sequence<2, 1, 1>; // default for gemm1/gemm3
-  using gemm4_warps = ck_tile::sequence<2, 1, 1>; // default for gemm4
+  using tile_lengths = ck_tile::sequence<32, 128, 32, 32, 32, 32, 64, 32, 32>;
+  using gemm02_warps = ck_tile::sequence<1, 4, 1>; // default for gemm0/gemm2
+  using gemm13_warps = ck_tile::sequence<4, 1, 1>; // default for gemm1/gemm3
+  using gemm4_warps = ck_tile::sequence<2, 2, 1>; // default for gemm4
 };
 
 template <>
@@ -99,15 +99,15 @@ template <>
 struct FmhaBwdShape<32> : ck_tile::TileFmhaBwdShape<
                               typename FmhaBwdBlockTile<32>::tile_lengths,
                               typename FmhaBwdBlockTile<32>::gemm02_warps,
-                              FmhaBwdWarpTile1,
+                              FmhaBwdWarpTile2,
                               typename FmhaBwdBlockTile<32>::gemm13_warps,
-                              FmhaBwdWarpTile1,
+                              FmhaBwdWarpTile3,
                               typename FmhaBwdBlockTile<32>::gemm02_warps,
-                              FmhaBwdWarpTile1,
+                              FmhaBwdWarpTile2,
                               typename FmhaBwdBlockTile<32>::gemm13_warps,
-                              FmhaBwdWarpTile1,
+                              FmhaBwdWarpTile3,
                               typename FmhaBwdBlockTile<32>::gemm4_warps,
-                              FmhaBwdWarpTile1> {};
+                              FmhaBwdWarpTile2> {};
 
 template <>
 struct FmhaBwdShape<64> : ck_tile::TileFmhaBwdShape<

From fbc7c507e89deca1377787947d59949e3d3e3559 Mon Sep 17 00:00:00 2001
From: Qianfeng Zhang <Qianfeng.Zhang@amd.com>
Date: Fri, 26 Jul 2024 04:09:32 +0000
Subject: [PATCH 585/641] Fix padding setting bug in grouped_backward

---
 .../hip_fmha/ck_tiled_fmha_grouped_backward.h | 56 +++++++++----------
 1 file changed, 27 insertions(+), 29 deletions(-)

diff --git a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_backward.h b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_backward.h
index 76c5eb66f..ccf9e6370 100644
--- a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_backward.h
+++ b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_backward.h
@@ -57,37 +57,35 @@ struct grouped_backward_causalmask_bias_dropout_dispatch {
   static void Run(GroupedBackwardParams& param, hipStream_t stream) {
     {
       constexpr ck_tile::index_t kBlockSize = 64;
-      bool pad_seqlen_q = !(param.M % kBlockSize == 0);
       bool pad_headdim_v = !(param.Kv % FmhaBwdShape<MaxK>::kVHeaddim == 0);
 
-      BOOL_SWITCH_2(
-          pad_seqlen_q, kPadSeqLenQ, pad_headdim_v, kPadHeadDimV, [&] {
-            constexpr ck_tile::index_t occupancy = 2;
-
-            using FmhaOGradDotOTraits_ = ck_tile::TileFmhaBwdOGradDotOTraits<
-                kPadSeqLenQ,
-                kPadHeadDimV,
-                occupancy>;
-
-            using FmhaBwdOGradDotOPipelineProblem =
-                ck_tile::BlockFmhaBwdOGradDotOPipelineProblem<
-                    typename FmhaBwdTypeConfig<ScalarType>::ODataType,
-                    typename FmhaBwdTypeConfig<ScalarType>::OGradDataType,
-                    typename FmhaBwdTypeConfig<ScalarType>::DDataType,
-                    kBlockSize,
-                    FmhaBwdShape<MaxK>::kVHeaddim,
-                    true, // kIsGroupMode
-                    FmhaOGradDotOTraits_>;
-
-            using FmhaBwdOGradDotOPipeline_ =
-                typename ck_tile::BlockFmhaBwdOGradDotO<
-                    FmhaBwdOGradDotOPipelineProblem>;
-
-            using FmhaBwdOGradDotOKernel_ =
-                ck_tile::FmhaBwdOGradDotOKernel<FmhaBwdOGradDotOPipeline_>;
-
-            RunWithBwdOGradDotOKernel<FmhaBwdOGradDotOKernel_>(param, stream);
-          });
+      constexpr bool kPadSeqLenQ = true;
+
+      BOOL_SWITCH(pad_headdim_v, kPadHeadDimV, [&] {
+        constexpr ck_tile::index_t occupancy = 2;
+
+        using FmhaOGradDotOTraits_ = ck_tile::
+            TileFmhaBwdOGradDotOTraits<kPadSeqLenQ, kPadHeadDimV, occupancy>;
+
+        using FmhaBwdOGradDotOPipelineProblem =
+            ck_tile::BlockFmhaBwdOGradDotOPipelineProblem<
+                typename FmhaBwdTypeConfig<ScalarType>::ODataType,
+                typename FmhaBwdTypeConfig<ScalarType>::OGradDataType,
+                typename FmhaBwdTypeConfig<ScalarType>::DDataType,
+                kBlockSize,
+                FmhaBwdShape<MaxK>::kVHeaddim,
+                true, // kIsGroupMode
+                FmhaOGradDotOTraits_>;
+
+        using FmhaBwdOGradDotOPipeline_ =
+            typename ck_tile::BlockFmhaBwdOGradDotO<
+                FmhaBwdOGradDotOPipelineProblem>;
+
+        using FmhaBwdOGradDotOKernel_ =
+            ck_tile::FmhaBwdOGradDotOKernel<FmhaBwdOGradDotOPipeline_>;
+
+        RunWithBwdOGradDotOKernel<FmhaBwdOGradDotOKernel_>(param, stream);
+      });
     };
 
     {

From 6e08666c488964026efe002662a426adb87ba6a3 Mon Sep 17 00:00:00 2001
From: Qianfeng Zhang <Qianfeng.Zhang@amd.com>
Date: Fri, 26 Jul 2024 11:33:44 +0000
Subject: [PATCH 586/641] Change -DCK_FMHA_FWD_FAST_EXP2=1 to
 -DCK_TILE_FMHA_FWD_FAST_EXP2=1

---
 setup.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/setup.py b/setup.py
index 327e1f7df..45fe80824 100644
--- a/setup.py
+++ b/setup.py
@@ -431,7 +431,7 @@ def get_extensions():
                 f"--offload-arch={os.getenv('HIP_ARCHITECTURES', 'native')}",
                 "-U__CUDA_NO_HALF_OPERATORS__",
                 "-U__CUDA_NO_HALF_CONVERSIONS__",
-                "-DCK_FMHA_FWD_FAST_EXP2=1",
+                "-DCK_TILE_FMHA_FWD_FAST_EXP2=1",
                 "-fgpu-flush-denormals-to-zero",
                 "-Werror",
                 "-Woverloaded-virtual",

From 94ab5999f9c6e2f2f275989c7bfeeab4b210a5ef Mon Sep 17 00:00:00 2001
From: Qianfeng Zhang <Qianfeng.Zhang@amd.com>
Date: Fri, 26 Jul 2024 11:35:46 +0000
Subject: [PATCH 587/641] Point the composable_kernel_tiled submodule to
 ck_tile/fa_bwd_opt branch

---
 .gitmodules                         | 2 +-
 third_party/composable_kernel_tiled | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/.gitmodules b/.gitmodules
index b642ad5b9..18adab4b0 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -7,4 +7,4 @@
 [submodule "third_party/composable_kernel_tiled"]
 	path = third_party/composable_kernel_tiled
 	url = https://github.com/ROCm/composable_kernel.git
-	branch = develop 
+	branch = ck_tile/fa_bwd_opt
diff --git a/third_party/composable_kernel_tiled b/third_party/composable_kernel_tiled
index e3f44659c..99ed2c1ae 160000
--- a/third_party/composable_kernel_tiled
+++ b/third_party/composable_kernel_tiled
@@ -1 +1 @@
-Subproject commit e3f44659cf77df8c3de15eb14baffd58be6ac550
+Subproject commit 99ed2c1ae326a68cec5597bb9ecea11aaaabe80b

From 830697c93fdadf4b6fdd2a83114bc3c2403422a7 Mon Sep 17 00:00:00 2001
From: Qianfeng Zhang <Qianfeng.Zhang@amd.com>
Date: Sat, 27 Jul 2024 11:37:05 +0000
Subject: [PATCH 588/641] Disable flshattF and flshattB on ROCM

---
 xformers/ops/fmha/flash.py | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/xformers/ops/fmha/flash.py b/xformers/ops/fmha/flash.py
index 49e708dc2..14a8335ec 100644
--- a/xformers/ops/fmha/flash.py
+++ b/xformers/ops/fmha/flash.py
@@ -607,7 +607,10 @@ class FwOp(AttentionFwOpBase):
         implementation.
     """
 
-    OPERATOR = get_operator("xformers_flash", "flash_fwd")
+    if torch.version.hip:
+        OPERATOR = None
+    else:
+        OPERATOR = get_operator("xformers_flash", "flash_fwd")
     SUPPORTED_DEVICES: Set[str] = {"cuda"}
     CUDA_MINIMUM_COMPUTE_CAPABILITY = (8, 0)
     SUPPORTED_DTYPES: Set[torch.dtype] = {torch.half, torch.bfloat16}
@@ -809,7 +812,10 @@ def operator_flop(
 class BwOp(AttentionBwOpBase):
     __doc__ = FwOp.__doc__
 
-    OPERATOR = get_operator("xformers_flash", "flash_bwd")
+    if torch.version.hip:
+        OPERATOR = None
+    else:
+        OPERATOR = get_operator("xformers_flash", "flash_bwd")
     SUPPORTED_DEVICES = FwOp.SUPPORTED_DEVICES
     CUDA_MINIMUM_COMPUTE_CAPABILITY = FwOp.CUDA_MINIMUM_COMPUTE_CAPABILITY
     SUPPORTED_DTYPES = FwOp.SUPPORTED_DTYPES

From afd7e022b5a81a90cd6ea169dfc97c14074d23c6 Mon Sep 17 00:00:00 2001
From: Qianfeng Zhang <Qianfeng.Zhang@amd.com>
Date: Sat, 27 Jul 2024 05:46:02 +0000
Subject: [PATCH 589/641] Add -mllvm and -enable-post-misched=0 compiling
 options for ROCM on setup.py

---
 setup.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/setup.py b/setup.py
index 45fe80824..54a261f66 100644
--- a/setup.py
+++ b/setup.py
@@ -435,6 +435,8 @@ def get_extensions():
                 "-fgpu-flush-denormals-to-zero",
                 "-Werror",
                 "-Woverloaded-virtual",
+                "-mllvm",
+                "-enable-post-misched=0"
             ]
             + generator_flag
             + cc_flag,

From e67de4119cfe6cf275aaad3a4543e12e6cd0ae00 Mon Sep 17 00:00:00 2001
From: Qianfeng Zhang <Qianfeng.Zhang@amd.com>
Date: Sat, 27 Jul 2024 11:37:05 +0000
Subject: [PATCH 590/641] Disable flshattF and flshattB on ROCM

---
 xformers/ops/fmha/flash.py | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/xformers/ops/fmha/flash.py b/xformers/ops/fmha/flash.py
index 49e708dc2..14a8335ec 100644
--- a/xformers/ops/fmha/flash.py
+++ b/xformers/ops/fmha/flash.py
@@ -607,7 +607,10 @@ class FwOp(AttentionFwOpBase):
         implementation.
     """
 
-    OPERATOR = get_operator("xformers_flash", "flash_fwd")
+    if torch.version.hip:
+        OPERATOR = None
+    else:
+        OPERATOR = get_operator("xformers_flash", "flash_fwd")
     SUPPORTED_DEVICES: Set[str] = {"cuda"}
     CUDA_MINIMUM_COMPUTE_CAPABILITY = (8, 0)
     SUPPORTED_DTYPES: Set[torch.dtype] = {torch.half, torch.bfloat16}
@@ -809,7 +812,10 @@ def operator_flop(
 class BwOp(AttentionBwOpBase):
     __doc__ = FwOp.__doc__
 
-    OPERATOR = get_operator("xformers_flash", "flash_bwd")
+    if torch.version.hip:
+        OPERATOR = None
+    else:
+        OPERATOR = get_operator("xformers_flash", "flash_bwd")
     SUPPORTED_DEVICES = FwOp.SUPPORTED_DEVICES
     CUDA_MINIMUM_COMPUTE_CAPABILITY = FwOp.CUDA_MINIMUM_COMPUTE_CAPABILITY
     SUPPORTED_DTYPES = FwOp.SUPPORTED_DTYPES

From d72c2b31273f045598c41171d9824dac0b5f59e5 Mon Sep 17 00:00:00 2001
From: Qianfeng Zhang <Qianfeng.Zhang@amd.com>
Date: Sun, 28 Jul 2024 12:12:45 +0000
Subject: [PATCH 591/641] Update to support separate grad_q_f32_strides do to
 the API change in the fmd_bwd_kernel

---
 .../attention_backward_generic_ck_tiled.cpp   | 24 +++++++++++++++----
 .../hip_fmha/ck_tiled_fmha_batched_backward.h |  9 ++++---
 .../hip_fmha/ck_tiled_fmha_bwd_setting.h      |  4 ++--
 .../hip_fmha/ck_tiled_fmha_grouped_backward.h | 12 ++++++----
 .../attention/hip_fmha/ck_tiled_fmha_params.h |  6 +++++
 5 files changed, 42 insertions(+), 13 deletions(-)

diff --git a/xformers/csrc/attention/hip_fmha/attention_backward_generic_ck_tiled.cpp b/xformers/csrc/attention/hip_fmha/attention_backward_generic_ck_tiled.cpp
index 671540dcb..11aa4fd05 100644
--- a/xformers/csrc/attention/hip_fmha/attention_backward_generic_ck_tiled.cpp
+++ b/xformers/csrc/attention/hip_fmha/attention_backward_generic_ck_tiled.cpp
@@ -159,9 +159,11 @@ efficient_attention_backward_ck(
   }
 
   at::Tensor grad_q_f32;
+  const bool use_grad_q_f32 =
+      (query.scalar_type() == at::ScalarType::BFloat16 ||
+       query.scalar_type() == at::ScalarType::Half);
 
-  if (query.scalar_type() == at::ScalarType::BFloat16 ||
-      query.scalar_type() == at::ScalarType::Half) {
+  if (use_grad_q_f32) {
     grad_q_f32 = at::empty_strided(
         grad_q.sizes(), grad_q.strides(), opts.dtype(at::kFloat));
     grad_q_f32.fill_(0);
@@ -233,8 +235,7 @@ efficient_attention_backward_ck(
     p.grad_k_ptr = is_mqa_gqa ? tmp_grad_k.data_ptr() : grad_k.data_ptr();
     p.grad_v_ptr = is_mqa_gqa ? tmp_grad_v.data_ptr() : grad_v.data_ptr();
 
-    if (query.scalar_type() == at::ScalarType::BFloat16 ||
-        query.scalar_type() == at::ScalarType::Half)
+    if (use_grad_q_f32)
       p.grad_q_f32_ptr = grad_q_f32.data_ptr();
     else
       p.grad_q_f32_ptr = nullptr;
@@ -270,6 +271,14 @@ efficient_attention_backward_ck(
         static_cast<int>(logsumexp.stride(1)),
         static_cast<int>(logsumexp.stride(2))};
 
+    if (use_grad_q_f32) {
+      p.grad_q_f32_strides = {
+          static_cast<int>(grad_q_f32.stride(0)),
+          static_cast<int>(grad_q_f32.stride(1)),
+          static_cast<int>(grad_q_f32.stride(2)),
+          static_cast<int>(grad_q_f32.stride(3))};
+    }
+
     if (is_mqa_gqa) {
       p.grad_k_strides = {
           static_cast<int>(tmp_grad_k.stride(0)),
@@ -380,6 +389,13 @@ efficient_attention_backward_ck(
         static_cast<int>(logsumexp.stride(1)),
         static_cast<int>(logsumexp.stride(2))};
 
+    if (use_grad_q_f32) {
+      p.grad_q_f32_strides = {
+          static_cast<int>(grad_q_f32.stride(1)),
+          static_cast<int>(grad_q_f32.stride(2)),
+          static_cast<int>(grad_q_f32.stride(3))};
+    }
+
     if (is_mqa_gqa) {
       p.grad_k_strides = {
           static_cast<int>(tmp_grad_k.stride(1)),
diff --git a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_backward.h b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_backward.h
index 98afe782b..319ba6d5c 100644
--- a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_backward.h
+++ b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_backward.h
@@ -265,18 +265,19 @@ struct batched_backward_causalmask_bias_dropout_dispatch {
           param.Hq,
           param.Hq / param.Hkv,
           param.scale,
-          param.q_strides[1], // q, k, v, bias, do, dk, dv, dbias seq-dim
-                              // stride
+          param.q_strides[1], // q, k, v, bias, do, dq_f32, dk, dv, dbias
+                              // seq-dim stride
           param.k_strides[1],
           param.v_strides[1],
           param.attn_bias_strides[2],
           0, // stride_randval
           param.grad_out_strides[1],
+          NeedConvertGradQ ? param.grad_q_f32_strides[1] : param.q_strides[1],
           param.grad_k_strides[1],
           param.grad_v_strides[1],
           param.attn_bias_strides[2], // assume grad_bias has same strides as
                                       // bias
-          param.q_strides[2], // q, k, v, bias, do, lse/dot, dbias
+          param.q_strides[2], // q, k, v, bias, do, lse/dot, dq_f32, dbias
                               // nhead-dim strides
           param.k_strides[2],
           param.v_strides[2],
@@ -284,6 +285,7 @@ struct batched_backward_causalmask_bias_dropout_dispatch {
           0, // nhead_stride_randval
           param.grad_out_strides[2],
           param.lsed_strides[1],
+          NeedConvertGradQ ? param.grad_q_f32_strides[2] : param.q_strides[2],
           param.attn_bias_strides[1], // assume grad_bias has same strides as
                                       // bias
           param.q_strides[0], // q, k, v, bias, do, lse/dot, dk, dv, dbias,
@@ -294,6 +296,7 @@ struct batched_backward_causalmask_bias_dropout_dispatch {
           0, // batch_stride_randval
           param.grad_out_strides[0],
           param.lsed_strides[0], // lse/dot is in BHM contiguous layout
+          NeedConvertGradQ ? param.grad_q_f32_strides[0] : param.q_strides[0],
           param.grad_k_strides[0],
           param.grad_v_strides[0],
           param.attn_bias_strides[0], // assume grad_bias has same strides as
diff --git a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_bwd_setting.h b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_bwd_setting.h
index d5d15c05d..239e09f22 100644
--- a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_bwd_setting.h
+++ b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_bwd_setting.h
@@ -172,7 +172,7 @@ struct FmhaBwdBlockDropoutMaker;
 
 template <ck_tile::index_t MaxK>
 struct FmhaBwdBlockDropoutMaker<false, MaxK> {
-  using dropout = ck_tile::BlockDropout<false, true, false>;
+  using dropout = ck_tile::BlockDropoutBwd<false, true, false>;
 };
 
 template <ck_tile::index_t MaxK>
@@ -180,5 +180,5 @@ struct FmhaBwdBlockDropoutMaker<true, MaxK> {
   using FmhaBwdShapeType = FmhaBwdShape<MaxK>;
   static constexpr bool IsWG32 =
       (FmhaBwdShapeType::Gemm0WarpTile::at(ck_tile::number<0>{}) == 32);
-  using dropout = ck_tile::BlockDropout<true, IsWG32, false>;
+  using dropout = ck_tile::BlockDropoutBwd<true, IsWG32, false>;
 };
diff --git a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_backward.h b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_backward.h
index ccf9e6370..e8f30b75e 100644
--- a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_backward.h
+++ b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_backward.h
@@ -262,24 +262,28 @@ struct grouped_backward_causalmask_bias_dropout_dispatch {
           param.Hq,
           param.Hq / param.Hkv,
           param.scale,
-          param.q_strides[0], // q, k, v, bias, do, dk, dv, dbias seq-dim
-                              // stride
+          param.q_strides[0], // q, k, v, bias, do, dq_f32, dk, dv, dbias
+                              // seq-dim stride
           param.k_strides[0],
           param.v_strides[0],
           param.attn_bias_strides[1],
           0, // stride_randval
           param.grad_out_strides[0],
+          NeedConvertGradQ ? param.grad_q_f32_strides[0]
+                           : param.grad_q_f32_strides[0],
           param.grad_k_strides[0],
           param.grad_v_strides[0],
           param.attn_bias_strides[1], // assume grad_bias has same strides as
-                                      // bias
-          param.q_strides[1], // q, k, v, bias, do, lse/dot, dbias
+                                      // bias.
+          param.q_strides[1], // q, k, v, bias, do, lse/dot, dq_f32, dbias
                               // nhead-dim strides
           param.k_strides[1],
           param.v_strides[1],
           param.attn_bias_strides[0],
           0, // nhead_stride_randval
           param.grad_out_strides[1],
+          NeedConvertGradQ ? param.grad_q_f32_strides[1]
+                           : param.grad_q_f32_strides[1],
           param.lsed_strides[1], // assume lse/dot is in BHM contiguous layout
           param.attn_bias_strides[0], // assume grad_bias has same strides as
                                       // bias
diff --git a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_params.h b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_params.h
index 4b40730e9..3d94060dd 100644
--- a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_params.h
+++ b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_params.h
@@ -132,6 +132,9 @@ struct BatchedBackwardParams {
   std::array<int, 4> grad_k_strides;
   std::array<int, 4> grad_v_strides;
 
+  // assume grad_q has same strides as q, but grad_q_f32 can be different
+  std::array<int, 4> grad_q_f32_strides;
+
   // BHM mode strides, completely contiguous
   std::array<int, 3> lsed_strides;
 
@@ -195,6 +198,9 @@ struct GroupedBackwardParams {
   std::array<int, 3> grad_k_strides;
   std::array<int, 3> grad_v_strides;
 
+  // assume grad_q has same strides as q, but grad_q_f32 can be different
+  std::array<int, 3> grad_q_f32_strides;
+
   // BHM mode strides, completely contiguous
   std::array<int, 3> lsed_strides;
 

From 5ddff31fda44fd8bd6e3885392ba3b6ca2d2e6de Mon Sep 17 00:00:00 2001
From: Qianfeng Zhang <Qianfeng.Zhang@amd.com>
Date: Sun, 28 Jul 2024 12:55:46 +0000
Subject: [PATCH 592/641] Use old method for setting BlockDropout due to the
 revert in fmha_fwd_kernel

---
 third_party/composable_kernel_tiled              |  2 +-
 .../hip_fmha/ck_tiled_fmha_batched_forward.h     |  6 ++----
 .../hip_fmha/ck_tiled_fmha_batched_infer.h       |  7 +++----
 .../hip_fmha/ck_tiled_fmha_fwd_setting.h         | 16 ----------------
 .../hip_fmha/ck_tiled_fmha_grouped_forward.h     |  6 ++----
 .../hip_fmha/ck_tiled_fmha_grouped_infer.h       |  7 +++----
 .../hip_fmha/ck_tiled_rand_uniform_kernel.h      |  2 +-
 7 files changed, 12 insertions(+), 34 deletions(-)

diff --git a/third_party/composable_kernel_tiled b/third_party/composable_kernel_tiled
index 99ed2c1ae..ad3e94bba 160000
--- a/third_party/composable_kernel_tiled
+++ b/third_party/composable_kernel_tiled
@@ -1 +1 @@
-Subproject commit 99ed2c1ae326a68cec5597bb9ecea11aaaabe80b
+Subproject commit ad3e94bbaa000e206c1048b0da8e58ce5224b645
diff --git a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_forward.h b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_forward.h
index 1b1a42b5f..20c1b2c3e 100644
--- a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_forward.h
+++ b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_forward.h
@@ -22,9 +22,6 @@ template <
     bool kHasDropout,
     ck_tile::index_t MaxK>
 struct batched_forward_causalmask_bias_dropout_dispatch {
-  using FmhaBlockDropout =
-      typename FmhaFwdBlockDropoutMaker<kHasDropout, MaxK>::dropout;
-
   template <typename FmhaTraits, typename FmhaMask>
   using FmhaPipelineProblemTemp = ck_tile::BlockFmhaPipelineProblem<
       typename FmhaFwdTypeConfig<ScalarType>::QDataType,
@@ -41,7 +38,6 @@ struct batched_forward_causalmask_bias_dropout_dispatch {
       FmhaFwdShape<MaxK>,
       false, // kIsGroupMode
       FmhaMask,
-      FmhaBlockDropout,
       FmhaTraits>;
 
   static void Run(BatchedForwardParams& param, hipStream_t stream) {
@@ -92,6 +88,7 @@ struct batched_forward_causalmask_bias_dropout_dispatch {
                 kBiasEnum,
                 false, // kHasBiasGrad place-holder
                 true, // kStoreLSE
+                kHasDropout,
                 false, // kDoFp8StaticQuant place-holder
                 occupancy>;
 
@@ -166,6 +163,7 @@ struct batched_forward_causalmask_bias_dropout_dispatch {
           (param.custom_mask_type == 0) ? -1 : 0, // window_right_size
           param.custom_mask_type,
           param.dropout_prob, // dropout ratio
+          false, // is_store_randval
           {param.philox_seed, param.philox_offset});
     }();
 
diff --git a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_infer.h b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_infer.h
index 1501c4cf6..05d654dc3 100644
--- a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_infer.h
+++ b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_infer.h
@@ -23,9 +23,6 @@ template <
     bool kHasDropout,
     ck_tile::index_t MaxK>
 struct batched_infer_causalmask_bias_dropout_dispatch {
-  using FmhaBlockDropout =
-      typename FmhaFwdBlockDropoutMaker<kHasDropout, MaxK>::dropout;
-
   template <typename FmhaTraits, typename FmhaMask>
   using FmhaPipelineProblemTemp = ck_tile::BlockFmhaPipelineProblem<
       typename FmhaFwdTypeConfig<ScalarType>::QDataType,
@@ -42,7 +39,6 @@ struct batched_infer_causalmask_bias_dropout_dispatch {
       FmhaFwdShape<MaxK>,
       false, // kIsGroupMode
       FmhaMask,
-      FmhaBlockDropout,
       FmhaTraits>;
 
   static void Run(BatchedForwardParams& param, hipStream_t stream) {
@@ -92,6 +88,7 @@ struct batched_infer_causalmask_bias_dropout_dispatch {
                   kBiasEnum,
                   false, // kHasBiasGrad place-holder
                   false, // kStoreLSE
+                  kHasDropout,
                   false, // kDoFp8StaticQuant place-holder
                   occupancy>;
 
@@ -125,6 +122,7 @@ struct batched_infer_causalmask_bias_dropout_dispatch {
               kBiasEnum,
               false, // kHasBiasGrad place-holder
               false, // kStoreLSE
+              kHasDropout,
               false, // kDoFp8StaticQuant place-holder
               occupancy>;
 
@@ -198,6 +196,7 @@ struct batched_infer_causalmask_bias_dropout_dispatch {
           (param.custom_mask_type == 0) ? -1 : 0, // window_right_size
           param.custom_mask_type,
           param.dropout_prob, // dropout ratio
+          false, // is_store_randval
           {param.philox_seed, param.philox_offset});
     }();
 
diff --git a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_fwd_setting.h b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_fwd_setting.h
index 4f3a18e26..ddd91a686 100644
--- a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_fwd_setting.h
+++ b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_fwd_setting.h
@@ -118,19 +118,3 @@ struct FmhaFwdShape<256> : ck_tile::TileFmhaShape<
                                typename FmhaFwdBlockTile<256>::gemm1_warps,
                                FmhaFwdWarpTile,
                                IsVLayoutRowMajor> {};
-
-template <bool kHasDropout, ck_tile::index_t MaxK>
-struct FmhaFwdBlockDropoutMaker;
-
-template <ck_tile::index_t MaxK>
-struct FmhaFwdBlockDropoutMaker<false, MaxK> {
-  using dropout = ck_tile::BlockDropout<false, true, false>;
-};
-
-template <ck_tile::index_t MaxK>
-struct FmhaFwdBlockDropoutMaker<true, MaxK> {
-  using FmhaFwdShapeType = FmhaFwdShape<MaxK>;
-  static constexpr bool IsWG32 =
-      (FmhaFwdShapeType::Gemm0WarpTile::at(ck_tile::number<0>{}) == 32);
-  using dropout = ck_tile::BlockDropout<true, IsWG32, false>;
-};
diff --git a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_forward.h b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_forward.h
index 8f0bf95b9..2fa305e0a 100644
--- a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_forward.h
+++ b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_forward.h
@@ -22,9 +22,6 @@ template <
     bool kHasDropout,
     ck_tile::index_t MaxK>
 struct grouped_forward_causalmask_bias_dropout_dispatch {
-  using FmhaBlockDropout =
-      typename FmhaFwdBlockDropoutMaker<kHasDropout, MaxK>::dropout;
-
   template <typename FmhaTraits, typename FmhaMask>
   using FmhaPipelineProblemTemp = ck_tile::BlockFmhaPipelineProblem<
       typename FmhaFwdTypeConfig<ScalarType>::QDataType,
@@ -41,7 +38,6 @@ struct grouped_forward_causalmask_bias_dropout_dispatch {
       FmhaFwdShape<MaxK>,
       true, // kIsGroupMode
       FmhaMask,
-      FmhaBlockDropout,
       FmhaTraits>;
 
   static void Run(GroupedForwardParams& param, hipStream_t stream) {
@@ -79,6 +75,7 @@ struct grouped_forward_causalmask_bias_dropout_dispatch {
                 kBiasEnum,
                 false, // kHasBiasGrad place-holder
                 true, // kStoreLSE
+                kHasDropout,
                 false, // kDoFp8StaticQuant place-holder
                 occupancy>;
 
@@ -161,6 +158,7 @@ struct grouped_forward_causalmask_bias_dropout_dispatch {
           (param.custom_mask_type == 0) ? -1 : 0, // window_right_size
           param.custom_mask_type,
           param.dropout_prob,
+          false, // is_store_randval
           {param.philox_seed, param.philox_offset});
     }();
 
diff --git a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_infer.h b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_infer.h
index 0946bdece..5197a6cb1 100644
--- a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_infer.h
+++ b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_infer.h
@@ -23,9 +23,6 @@ template <
     bool kHasDropout,
     ck_tile::index_t MaxK>
 struct grouped_infer_causalmask_bias_dropout_dispatch {
-  using FmhaBlockDropout =
-      typename FmhaFwdBlockDropoutMaker<kHasDropout, MaxK>::dropout;
-
   template <typename FmhaTraits, typename FmhaMask>
   using FmhaPipelineProblemTemp = ck_tile::BlockFmhaPipelineProblem<
       typename FmhaFwdTypeConfig<ScalarType>::QDataType,
@@ -42,7 +39,6 @@ struct grouped_infer_causalmask_bias_dropout_dispatch {
       FmhaFwdShape<MaxK>,
       true, // kIsGroupMode
       FmhaMask,
-      FmhaBlockDropout,
       FmhaTraits>;
 
   static void Run(GroupedForwardParams& param, hipStream_t stream) {
@@ -80,6 +76,7 @@ struct grouped_infer_causalmask_bias_dropout_dispatch {
                   kBiasEnum,
                   false, // kHasBiasGrad place-holder
                   false, // kStoreLSE
+                  kHasDropout,
                   false, // kDoFp8StaticQuant place-holder
                   occupancy>;
 
@@ -126,6 +123,7 @@ struct grouped_infer_causalmask_bias_dropout_dispatch {
             kBiasEnum,
             false, // kHasBiasGrad place-holder
             false, // kStoreLSE
+            kHasDropout,
             false, // kDoFp8StaticQuant place-holder
             occupancy>;
 
@@ -204,6 +202,7 @@ struct grouped_infer_causalmask_bias_dropout_dispatch {
           (param.custom_mask_type == 0) ? -1 : 0, // window_right_size
           param.custom_mask_type,
           param.dropout_prob,
+          false, // is_store_randval
           {param.philox_seed, param.philox_offset});
     }();
 
diff --git a/xformers/csrc/attention/hip_fmha/ck_tiled_rand_uniform_kernel.h b/xformers/csrc/attention/hip_fmha/ck_tiled_rand_uniform_kernel.h
index 4bcb8dd05..715d5e4bd 100644
--- a/xformers/csrc/attention/hip_fmha/ck_tiled_rand_uniform_kernel.h
+++ b/xformers/csrc/attention/hip_fmha/ck_tiled_rand_uniform_kernel.h
@@ -35,7 +35,7 @@ struct FmhaRandUniformKernel {
 
   using BlockGemm = decltype(GetBlockGemm());
 
-  using MyBlockDropout = ck_tile::BlockDropout<true, true, false>;
+  using MyBlockDropout = ck_tile::BlockDropout;
 
   static constexpr bool kPadSeqLenQ = true;
   static constexpr bool kPadSeqLenK = true;

From cf2b6224222528f5b0fc9c932ecd2e224260bee8 Mon Sep 17 00:00:00 2001
From: Qianfeng Zhang <Qianfeng.Zhang@amd.com>
Date: Sun, 28 Jul 2024 13:10:02 +0000
Subject: [PATCH 593/641] Tiny fix in grouped_backward

---
 .../attention/hip_fmha/ck_tiled_fmha_grouped_backward.h     | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_backward.h b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_backward.h
index e8f30b75e..586f9e2d0 100644
--- a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_backward.h
+++ b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_backward.h
@@ -269,8 +269,7 @@ struct grouped_backward_causalmask_bias_dropout_dispatch {
           param.attn_bias_strides[1],
           0, // stride_randval
           param.grad_out_strides[0],
-          NeedConvertGradQ ? param.grad_q_f32_strides[0]
-                           : param.grad_q_f32_strides[0],
+          NeedConvertGradQ ? param.grad_q_f32_strides[0] : param.q_strides[0],
           param.grad_k_strides[0],
           param.grad_v_strides[0],
           param.attn_bias_strides[1], // assume grad_bias has same strides as
@@ -282,8 +281,7 @@ struct grouped_backward_causalmask_bias_dropout_dispatch {
           param.attn_bias_strides[0],
           0, // nhead_stride_randval
           param.grad_out_strides[1],
-          NeedConvertGradQ ? param.grad_q_f32_strides[1]
-                           : param.grad_q_f32_strides[1],
+          NeedConvertGradQ ? param.grad_q_f32_strides[1] : param.q_strides[1],
           param.lsed_strides[1], // assume lse/dot is in BHM contiguous layout
           param.attn_bias_strides[0], // assume grad_bias has same strides as
                                       // bias

From 112aaedd93988da0663a8fb4e8282047dd6612e7 Mon Sep 17 00:00:00 2001
From: Qianfeng Zhang <Qianfeng.Zhang@amd.com>
Date: Sun, 28 Jul 2024 13:50:30 +0000
Subject: [PATCH 594/641] Use packed tensor allocation for grad_q_f32

---
 .../attention/hip_fmha/attention_backward_generic_ck_tiled.cpp | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/xformers/csrc/attention/hip_fmha/attention_backward_generic_ck_tiled.cpp b/xformers/csrc/attention/hip_fmha/attention_backward_generic_ck_tiled.cpp
index 11aa4fd05..d47982602 100644
--- a/xformers/csrc/attention/hip_fmha/attention_backward_generic_ck_tiled.cpp
+++ b/xformers/csrc/attention/hip_fmha/attention_backward_generic_ck_tiled.cpp
@@ -164,8 +164,7 @@ efficient_attention_backward_ck(
        query.scalar_type() == at::ScalarType::Half);
 
   if (use_grad_q_f32) {
-    grad_q_f32 = at::empty_strided(
-        grad_q.sizes(), grad_q.strides(), opts.dtype(at::kFloat));
+    grad_q_f32 = at::empty(grad_q.sizes(), opts.dtype(at::kFloat));
     grad_q_f32.fill_(0);
   } else {
     grad_q.fill_(0);

From dd83c62b711ffa3c5499781f8c173a7d20b2b30f Mon Sep 17 00:00:00 2001
From: Qianfeng Zhang <Qianfeng.Zhang@amd.com>
Date: Sun, 28 Jul 2024 15:05:36 +0000
Subject: [PATCH 595/641] Update to the ConvertGradQ kernel calling

---
 .../csrc/attention/hip_fmha/ck_tiled_fmha_batched_backward.h   | 3 +++
 .../csrc/attention/hip_fmha/ck_tiled_fmha_grouped_backward.h   | 2 ++
 2 files changed, 5 insertions(+)

diff --git a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_backward.h b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_backward.h
index 319ba6d5c..c36a18571 100644
--- a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_backward.h
+++ b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_backward.h
@@ -332,8 +332,11 @@ struct batched_backward_causalmask_bias_dropout_dispatch {
           param.N, // seqlen_k
           param.K, // headdim of q/k
           param.q_strides[1],
+          param.grad_q_f32_strides[1],
           param.q_strides[2],
+          param.grad_q_f32_strides[2],
           param.q_strides[0],
+          param.grad_q_f32_strides[0],
           0);
     }();
 
diff --git a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_backward.h b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_backward.h
index 586f9e2d0..319a9c276 100644
--- a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_backward.h
+++ b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_backward.h
@@ -318,7 +318,9 @@ struct grouped_backward_causalmask_bias_dropout_dispatch {
           param.seqstart_k_dev_ptr,
           param.K, // headdim of q/k
           param.q_strides[1],
+          param.grad_q_f32_strides[1],
           param.q_strides[2],
+          param.grad_q_f32_strides[2],
           0);
     }();
 

From 3e9b99d48346e2d4c0cef3ab0f8388d7a0cb1e6e Mon Sep 17 00:00:00 2001
From: Qianfeng Zhang <Qianfeng.Zhang@amd.com>
Date: Sun, 28 Jul 2024 16:06:30 +0000
Subject: [PATCH 596/641] Tiny update

---
 .../attention/hip_fmha/attention_backward_generic_ck_tiled.cpp | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/xformers/csrc/attention/hip_fmha/attention_backward_generic_ck_tiled.cpp b/xformers/csrc/attention/hip_fmha/attention_backward_generic_ck_tiled.cpp
index d47982602..e9b53ce81 100644
--- a/xformers/csrc/attention/hip_fmha/attention_backward_generic_ck_tiled.cpp
+++ b/xformers/csrc/attention/hip_fmha/attention_backward_generic_ck_tiled.cpp
@@ -506,8 +506,7 @@ efficient_attention_backward_ck(
     p.grad_v_ptr = is_mqa_gqa ? tmp_grad_v.data_ptr() : grad_v.data_ptr();
     p.grad_bias_ptr = bias_requires_grad ? grad_bias.data_ptr() : nullptr;
 
-    if (query.scalar_type() == at::ScalarType::BFloat16 ||
-        query.scalar_type() == at::ScalarType::Half)
+    if (use_grad_q_f32)
       p.grad_q_f32_ptr = grad_q_f32.data_ptr();
     else
       p.grad_q_f32_ptr = nullptr;

From 019448e5996c749cccb21f9bd4ec31668e47c221 Mon Sep 17 00:00:00 2001
From: Qianfeng Zhang <Qianfeng.Zhang@amd.com>
Date: Mon, 29 Jul 2024 15:20:48 +0000
Subject: [PATCH 597/641] Fix the parameter location in grouped_backward

---
 .../csrc/attention/hip_fmha/ck_tiled_fmha_grouped_backward.h    | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_backward.h b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_backward.h
index 319a9c276..39ea20bb8 100644
--- a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_backward.h
+++ b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_backward.h
@@ -281,8 +281,8 @@ struct grouped_backward_causalmask_bias_dropout_dispatch {
           param.attn_bias_strides[0],
           0, // nhead_stride_randval
           param.grad_out_strides[1],
-          NeedConvertGradQ ? param.grad_q_f32_strides[1] : param.q_strides[1],
           param.lsed_strides[1], // assume lse/dot is in BHM contiguous layout
+          NeedConvertGradQ ? param.grad_q_f32_strides[1] : param.q_strides[1],
           param.attn_bias_strides[0], // assume grad_bias has same strides as
                                       // bias
           param.lsed_strides[0], // batch_stride_lse

From c55966a64e4f32e51b3b22db496a8ec615f38526 Mon Sep 17 00:00:00 2001
From: Qianfeng Zhang <Qianfeng.Zhang@amd.com>
Date: Mon, 5 Aug 2024 07:15:51 +0000
Subject: [PATCH 598/641] Adjust headdim128 tile shapes for better performance

---
 .../attention/hip_fmha/ck_tiled_fmha_bwd_setting.h   | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_bwd_setting.h b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_bwd_setting.h
index 239e09f22..9858c5062 100644
--- a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_bwd_setting.h
+++ b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_bwd_setting.h
@@ -73,7 +73,7 @@ struct FmhaBwdBlockTile<64> {
 template <>
 struct FmhaBwdBlockTile<128> {
   using tile_lengths =
-      ck_tile::sequence<32, 128, 128, 32, 128, 32, 32, 128, 128>;
+      ck_tile::sequence<16, 128, 128, 16, 128, 16, 32, 128, 128>;
   using gemm02_warps = ck_tile::sequence<1, 4, 1>; // default for gemm0/gemm2
   using gemm13_warps = ck_tile::sequence<4, 1, 1>; // default for gemm1/gemm3
   using gemm4_warps = ck_tile::sequence<1, 4, 1>; // default for gemm4
@@ -127,15 +127,15 @@ template <>
 struct FmhaBwdShape<128> : ck_tile::TileFmhaBwdShape<
                                typename FmhaBwdBlockTile<128>::tile_lengths,
                                typename FmhaBwdBlockTile<128>::gemm02_warps,
-                               FmhaBwdWarpTile1,
+                               FmhaBwdWarpTile2,
                                typename FmhaBwdBlockTile<128>::gemm13_warps,
-                               FmhaBwdWarpTile1,
+                               FmhaBwdWarpTile3,
                                typename FmhaBwdBlockTile<128>::gemm02_warps,
-                               FmhaBwdWarpTile1,
+                               FmhaBwdWarpTile2,
                                typename FmhaBwdBlockTile<128>::gemm13_warps,
-                               FmhaBwdWarpTile1,
+                               FmhaBwdWarpTile3,
                                typename FmhaBwdBlockTile<128>::gemm4_warps,
-                               FmhaBwdWarpTile1> {};
+                               FmhaBwdWarpTile2> {};
 
 template <>
 struct FmhaBwdShape<256> : ck_tile::TileFmhaBwdShape<

From e22829ab19dda93d2d87504f607b5173831c8990 Mon Sep 17 00:00:00 2001
From: Qianfeng Zhang <Qianfeng.Zhang@amd.com>
Date: Mon, 5 Aug 2024 07:56:50 +0000
Subject: [PATCH 599/641] Update backward kernel calling due to adding of
 nhead_stride_dk/nhead_stride_dv parameters

---
 third_party/composable_kernel_tiled                         | 2 +-
 .../attention/hip_fmha/ck_tiled_fmha_batched_backward.h     | 6 ++++--
 .../attention/hip_fmha/ck_tiled_fmha_grouped_backward.h     | 6 ++++--
 3 files changed, 9 insertions(+), 5 deletions(-)

diff --git a/third_party/composable_kernel_tiled b/third_party/composable_kernel_tiled
index ad3e94bba..5d2a5a113 160000
--- a/third_party/composable_kernel_tiled
+++ b/third_party/composable_kernel_tiled
@@ -1 +1 @@
-Subproject commit ad3e94bbaa000e206c1048b0da8e58ce5224b645
+Subproject commit 5d2a5a1131ab8c8a340010f32c8a8f2c3c5566d8
diff --git a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_backward.h b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_backward.h
index c36a18571..6725a4760 100644
--- a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_backward.h
+++ b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_backward.h
@@ -277,8 +277,8 @@ struct batched_backward_causalmask_bias_dropout_dispatch {
           param.grad_v_strides[1],
           param.attn_bias_strides[2], // assume grad_bias has same strides as
                                       // bias
-          param.q_strides[2], // q, k, v, bias, do, lse/dot, dq_f32, dbias
-                              // nhead-dim strides
+          param.q_strides[2], // q, k, v, bias, do, lse/dot, dq_f32, dk, dv,
+                              // dbias nhead-dim strides
           param.k_strides[2],
           param.v_strides[2],
           param.attn_bias_strides[1],
@@ -286,6 +286,8 @@ struct batched_backward_causalmask_bias_dropout_dispatch {
           param.grad_out_strides[2],
           param.lsed_strides[1],
           NeedConvertGradQ ? param.grad_q_f32_strides[2] : param.q_strides[2],
+          param.grad_k_strides[2],
+          param.grad_v_strides[2],
           param.attn_bias_strides[1], // assume grad_bias has same strides as
                                       // bias
           param.q_strides[0], // q, k, v, bias, do, lse/dot, dk, dv, dbias,
diff --git a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_backward.h b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_backward.h
index 39ea20bb8..5617880cd 100644
--- a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_backward.h
+++ b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_backward.h
@@ -274,8 +274,8 @@ struct grouped_backward_causalmask_bias_dropout_dispatch {
           param.grad_v_strides[0],
           param.attn_bias_strides[1], // assume grad_bias has same strides as
                                       // bias.
-          param.q_strides[1], // q, k, v, bias, do, lse/dot, dq_f32, dbias
-                              // nhead-dim strides
+          param.q_strides[1], // q, k, v, bias, do, lse/dot, dq_f32, dk, dv,
+                              // dbias nhead-dim strides
           param.k_strides[1],
           param.v_strides[1],
           param.attn_bias_strides[0],
@@ -283,6 +283,8 @@ struct grouped_backward_causalmask_bias_dropout_dispatch {
           param.grad_out_strides[1],
           param.lsed_strides[1], // assume lse/dot is in BHM contiguous layout
           NeedConvertGradQ ? param.grad_q_f32_strides[1] : param.q_strides[1],
+          param.grad_k_strides[1],
+          param.grad_v_strides[1],
           param.attn_bias_strides[0], // assume grad_bias has same strides as
                                       // bias
           param.lsed_strides[0], // batch_stride_lse

From cae1b77de3b578051d2ba1bfe44094b39df3c95d Mon Sep 17 00:00:00 2001
From: Qianfeng Zhang <Qianfeng.Zhang@amd.com>
Date: Mon, 5 Aug 2024 10:08:28 +0000
Subject: [PATCH 600/641] Synchronize with CK to use separate pipeline for
 kPadHeadDim true of false situtation

---
 third_party/composable_kernel_tiled               |  2 +-
 .../hip_fmha/ck_tiled_fmha_batched_backward.h     |  3 ++-
 .../hip_fmha/ck_tiled_fmha_bwd_setting.h          | 15 ++++++++++++++-
 .../hip_fmha/ck_tiled_fmha_grouped_backward.h     |  3 ++-
 4 files changed, 19 insertions(+), 4 deletions(-)

diff --git a/third_party/composable_kernel_tiled b/third_party/composable_kernel_tiled
index 5d2a5a113..25db13392 160000
--- a/third_party/composable_kernel_tiled
+++ b/third_party/composable_kernel_tiled
@@ -1 +1 @@
-Subproject commit 5d2a5a1131ab8c8a340010f32c8a8f2c3c5566d8
+Subproject commit 25db1339265fa020d457e13d8440786d647fcc23
diff --git a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_backward.h b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_backward.h
index 6725a4760..4fb5f7086 100644
--- a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_backward.h
+++ b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_backward.h
@@ -133,7 +133,8 @@ struct batched_backward_causalmask_bias_dropout_dispatch {
               FmhaBwdPipelineProblemTemp<FmhaBwdTraits_, FmhaMask>;
 
           constexpr auto FmhaBwdPipelineEnum_ =
-              FmhaBwdPipelineEnumSelector<MaxK>::value;
+              FmhaBwdPipelineEnumSelector<MaxK, kPadHeadDim, kPadHeadDim>::
+                  value;
 
           using FmhaBwdPipeline_ = typename FmhaBwdPipelineMaker<
               FmhaBwdPipelineEnum_,
diff --git a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_bwd_setting.h b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_bwd_setting.h
index 9858c5062..64f16dbb5 100644
--- a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_bwd_setting.h
+++ b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_bwd_setting.h
@@ -151,12 +151,18 @@ struct FmhaBwdShape<256> : ck_tile::TileFmhaBwdShape<
                                typename FmhaBwdBlockTile<256>::gemm4_warps,
                                FmhaBwdWarpTile2> {};
 
-template <ck_tile::index_t MaxK>
+template <ck_tile::index_t MaxK, bool kPadHeadDimQK, bool kPadHeadDimV>
 struct FmhaBwdPipelineEnumSelector {
   static constexpr ck_tile::BlockFmhaBwdPipelineEnum value =
       ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR;
 };
 
+template <ck_tile::index_t MaxK>
+struct FmhaBwdPipelineEnumSelector<MaxK, false, false> {
+  static constexpr ck_tile::BlockFmhaBwdPipelineEnum value =
+      ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP;
+};
+
 template <ck_tile::BlockFmhaBwdPipelineEnum value, typename problem>
 struct FmhaBwdPipelineMaker;
 
@@ -167,6 +173,13 @@ struct FmhaBwdPipelineMaker<
   using pipeline = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR<problem>;
 };
 
+template <typename problem>
+struct FmhaBwdPipelineMaker<
+    ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP,
+    problem> {
+  using pipeline = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP<problem>;
+};
+
 template <bool kHasDropout, ck_tile::index_t MaxK>
 struct FmhaBwdBlockDropoutMaker;
 
diff --git a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_backward.h b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_backward.h
index 5617880cd..599bfac68 100644
--- a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_backward.h
+++ b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_backward.h
@@ -130,7 +130,8 @@ struct grouped_backward_causalmask_bias_dropout_dispatch {
               FmhaBwdPipelineProblemTemp<FmhaBwdTraits_, FmhaMask>;
 
           constexpr auto FmhaBwdPipelineEnum_ =
-              FmhaBwdPipelineEnumSelector<MaxK>::value;
+              FmhaBwdPipelineEnumSelector<MaxK, kPadHeadDim, kPadHeadDim>::
+                  value;
 
           using FmhaBwdPipeline_ = typename FmhaBwdPipelineMaker<
               FmhaBwdPipelineEnum_,

From e564f5e1a16f293e99553753b497af32994a0594 Mon Sep 17 00:00:00 2001
From: Qianfeng Zhang <Qianfeng.Zhang@amd.com>
Date: Tue, 6 Aug 2024 10:19:08 +0000
Subject: [PATCH 601/641] Use convertDQ kernel

---
 .../attention_backward_generic_ck_tiled.cpp   | 10 ++-
 .../hip_fmha/ck_tiled_fmha_batched_backward.h | 82 +++++++++---------
 .../hip_fmha/ck_tiled_fmha_grouped_backward.h | 86 +++++++++----------
 3 files changed, 88 insertions(+), 90 deletions(-)

diff --git a/xformers/csrc/attention/hip_fmha/attention_backward_generic_ck_tiled.cpp b/xformers/csrc/attention/hip_fmha/attention_backward_generic_ck_tiled.cpp
index e9b53ce81..0e8401959 100644
--- a/xformers/csrc/attention/hip_fmha/attention_backward_generic_ck_tiled.cpp
+++ b/xformers/csrc/attention/hip_fmha/attention_backward_generic_ck_tiled.cpp
@@ -545,11 +545,13 @@ efficient_attention_backward_ck(
     grad_v = tmp_grad_v_view.sum(3);
   }
 
-  if (inDataType == at::ScalarType::Half)
-    grad_q = grad_q_f32.to(torch::kFloat16);
+  /*
+    if (inDataType == at::ScalarType::Half)
+      grad_q = grad_q_f32.to(torch::kFloat16);
 
-  if (inDataType == at::ScalarType::BFloat16)
-    grad_q = grad_q_f32.to(torch::kBFloat16);
+    if (inDataType == at::ScalarType::BFloat16)
+      grad_q = grad_q_f32.to(torch::kBFloat16);
+  */
 
   return std::make_tuple(grad_q, grad_k, grad_v, grad_bias);
 }
diff --git a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_backward.h b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_backward.h
index 4fb5f7086..502ab4e9e 100644
--- a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_backward.h
+++ b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_backward.h
@@ -163,48 +163,46 @@ struct batched_backward_causalmask_bias_dropout_dispatch {
         });
       });
     };
-    /*
-        if constexpr (NeedConvertGradQ) {
-          constexpr ck_tile::index_t kBlockSize = 256;
-
-          const bool pad_seqlen_q = !(param.M % kBlockSize == 0);
-          const bool pad_headdim_q =
-              !(param.K % FmhaBwdShape<MaxK>::kQKHeaddim == 0);
-
-          BOOL_SWITCH_2(
-              pad_seqlen_q, kPadSeqLenQ, pad_headdim_q, kPadHeadDimQ, [&] {
-                constexpr ck_tile::index_t occupancy = 2;
-
-                using FmhaBwdConvertQGradTraits_ =
-                    ck_tile::TileFmhaBwdConvertQGradTraits<
-                        kPadSeqLenQ,
-                        kPadHeadDimQ,
-                        occupancy>;
-
-                using FmhaBwdConvertQGradPipelineProblem =
-                    ck_tile::BlockFmhaBwdConvertQGradPipelineProblem<
-                        typename FmhaBwdTypeConfig<ScalarType>::AccDataType,
-                        typename FmhaBwdTypeConfig<ScalarType>::QGradDataType,
-                        kBlockSize,
-                        FmhaBwdShape<MaxK>::kM0,
-                        FmhaBwdShape<MaxK>::kN0,
-                        FmhaBwdShape<MaxK>::kQKHeaddim,
-                        false, // kIsGroupMode
-                        false, // kIsDeterministic
-                        FmhaBwdConvertQGradTraits_>;
-
-                using FmhaBwdConvertQGradPipeline =
-                    typename ck_tile::BlockFmhaBwdConvertQGrad<
-                        FmhaBwdConvertQGradPipelineProblem>;
-
-                using FmhaBwdConvertQGradKernel_ =
-                    ck_tile::FmhaBwdConvertQGradKernel<FmhaBwdConvertQGradPipeline>;
-
-                RunWithBwdConvertQGradKernel<FmhaBwdConvertQGradKernel_>(
-                    param, stream);
-              });
-        };
-    */
+    if constexpr (NeedConvertGradQ) {
+      constexpr ck_tile::index_t kBlockSize = 256;
+
+      const bool pad_seqlen_q = !(param.M % kBlockSize == 0);
+      const bool pad_headdim_q =
+          !(param.K % FmhaBwdShape<MaxK>::kQKHeaddim == 0);
+
+      BOOL_SWITCH_2(
+          pad_seqlen_q, kPadSeqLenQ, pad_headdim_q, kPadHeadDimQ, [&] {
+            constexpr ck_tile::index_t occupancy = 2;
+
+            using FmhaBwdConvertQGradTraits_ =
+                ck_tile::TileFmhaBwdConvertQGradTraits<
+                    kPadSeqLenQ,
+                    kPadHeadDimQ,
+                    occupancy>;
+
+            using FmhaBwdConvertQGradPipelineProblem =
+                ck_tile::BlockFmhaBwdConvertQGradPipelineProblem<
+                    typename FmhaBwdTypeConfig<ScalarType>::AccDataType,
+                    typename FmhaBwdTypeConfig<ScalarType>::QGradDataType,
+                    kBlockSize,
+                    FmhaBwdShape<MaxK>::kM0,
+                    FmhaBwdShape<MaxK>::kN0,
+                    FmhaBwdShape<MaxK>::kQKHeaddim,
+                    false, // kIsGroupMode
+                    false, // kIsDeterministic
+                    FmhaBwdConvertQGradTraits_>;
+
+            using FmhaBwdConvertQGradPipeline =
+                typename ck_tile::BlockFmhaBwdConvertQGrad<
+                    FmhaBwdConvertQGradPipelineProblem>;
+
+            using FmhaBwdConvertQGradKernel_ =
+                ck_tile::FmhaBwdConvertQGradKernel<FmhaBwdConvertQGradPipeline>;
+
+            RunWithBwdConvertQGradKernel<FmhaBwdConvertQGradKernel_>(
+                param, stream);
+          });
+    };
   }
 
   template <typename FmhaBwdOGradDotOKernel>
diff --git a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_backward.h b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_backward.h
index 599bfac68..8b0cd4dad 100644
--- a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_backward.h
+++ b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_backward.h
@@ -161,48 +161,46 @@ struct grouped_backward_causalmask_bias_dropout_dispatch {
       });
     };
 
-    /*
-        if constexpr (NeedConvertGradQ) {
-          constexpr ck_tile::index_t kBlockSize = 256;
-
-          const bool pad_seqlen_q = true;
-          const bool pad_headdim_q =
-              !(param.K % FmhaBwdShape<MaxK>::kQKHeaddim == 0);
-
-          BOOL_SWITCH_2(
-              pad_seqlen_q, kPadSeqLenQ, pad_headdim_q, kPadHeadDimQ, [&] {
-                constexpr ck_tile::index_t occupancy = 2;
-
-                using FmhaBwdConvertQGradTraits_ =
-                    ck_tile::TileFmhaBwdConvertQGradTraits<
-                        kPadSeqLenQ,
-                        kPadHeadDimQ,
-                        occupancy>;
-
-                using FmhaBwdConvertQGradPipelineProblem =
-                    ck_tile::BlockFmhaBwdConvertQGradPipelineProblem<
-                        typename FmhaBwdTypeConfig<ScalarType>::AccDataType,
-                        typename FmhaBwdTypeConfig<ScalarType>::QGradDataType,
-                        kBlockSize,
-                        64, // kM0
-                        1, // kN0, no use
-                        FmhaBwdShape<MaxK>::kQKHeaddim,
-                        true, // kIsGroupMode
-                        false, // kIsDeterministic
-                        FmhaBwdConvertQGradTraits_>;
-
-                using FmhaBwdConvertQGradPipeline =
-                    typename ck_tile::BlockFmhaBwdConvertQGrad<
-                        FmhaBwdConvertQGradPipelineProblem>;
-
-                using FmhaBwdConvertQGradKernel_ =
-                    ck_tile::FmhaBwdConvertQGradKernel<FmhaBwdConvertQGradPipeline>;
-
-                RunWithBwdConvertQGradKernel<FmhaBwdConvertQGradKernel_>(
-                    param, stream);
-              });
-        };
-    */
+    if constexpr (NeedConvertGradQ) {
+      constexpr ck_tile::index_t kBlockSize = 128;
+
+      const bool pad_seqlen_q = true;
+      const bool pad_headdim_q =
+          !(param.K % FmhaBwdShape<MaxK>::kQKHeaddim == 0);
+
+      BOOL_SWITCH_2(
+          pad_seqlen_q, kPadSeqLenQ, pad_headdim_q, kPadHeadDimQ, [&] {
+            constexpr ck_tile::index_t occupancy = 2;
+
+            using FmhaBwdConvertQGradTraits_ =
+                ck_tile::TileFmhaBwdConvertQGradTraits<
+                    kPadSeqLenQ,
+                    kPadHeadDimQ,
+                    occupancy>;
+
+            using FmhaBwdConvertQGradPipelineProblem =
+                ck_tile::BlockFmhaBwdConvertQGradPipelineProblem<
+                    typename FmhaBwdTypeConfig<ScalarType>::AccDataType,
+                    typename FmhaBwdTypeConfig<ScalarType>::QGradDataType,
+                    kBlockSize,
+                    64, // kM0
+                    1, // kN0, no use
+                    FmhaBwdShape<MaxK>::kQKHeaddim,
+                    true, // kIsGroupMode
+                    false, // kIsDeterministic
+                    FmhaBwdConvertQGradTraits_>;
+
+            using FmhaBwdConvertQGradPipeline =
+                typename ck_tile::BlockFmhaBwdConvertQGrad<
+                    FmhaBwdConvertQGradPipelineProblem>;
+
+            using FmhaBwdConvertQGradKernel_ =
+                ck_tile::FmhaBwdConvertQGradKernel<FmhaBwdConvertQGradPipeline>;
+
+            RunWithBwdConvertQGradKernel<FmhaBwdConvertQGradKernel_>(
+                param, stream);
+          });
+    };
   }
 
   template <typename FmhaBwdOGradDotOKernel>
@@ -320,10 +318,10 @@ struct grouped_backward_causalmask_bias_dropout_dispatch {
           param.seqstart_q_dev_ptr,
           param.seqstart_k_dev_ptr,
           param.K, // headdim of q/k
+          param.q_strides[0],
+          param.grad_q_f32_strides[0],
           param.q_strides[1],
           param.grad_q_f32_strides[1],
-          param.q_strides[2],
-          param.grad_q_f32_strides[2],
           0);
     }();
 

From b0437654803a36021e43c8399495a3061b0045be Mon Sep 17 00:00:00 2001
From: Qianfeng Zhang <Qianfeng.Zhang@amd.com>
Date: Wed, 7 Aug 2024 09:29:48 +0000
Subject: [PATCH 602/641] Update to use unpadded lse layout

---
 third_party/composable_kernel_tiled            |  2 +-
 .../attention_backward_generic_ck_tiled.cpp    | 11 +++++------
 .../attention_forward_generic_ck_tiled.cpp     | 10 +++-------
 .../hip_fmha/ck_tiled_fmha_grouped_backward.h  |  6 ++----
 .../hip_fmha/ck_tiled_fmha_grouped_forward.h   |  3 +--
 .../hip_fmha/ck_tiled_fmha_grouped_infer.h     |  1 -
 .../attention/hip_fmha/ck_tiled_fmha_params.h  | 18 ++++++++++--------
 xformers/ops/fmha/ck.py                        |  1 +
 8 files changed, 23 insertions(+), 29 deletions(-)

diff --git a/third_party/composable_kernel_tiled b/third_party/composable_kernel_tiled
index 25db13392..e6c489df4 160000
--- a/third_party/composable_kernel_tiled
+++ b/third_party/composable_kernel_tiled
@@ -1 +1 @@
-Subproject commit 25db1339265fa020d457e13d8440786d647fcc23
+Subproject commit e6c489df4980e676af15010a9c26f1aaee270ef8
diff --git a/xformers/csrc/attention/hip_fmha/attention_backward_generic_ck_tiled.cpp b/xformers/csrc/attention/hip_fmha/attention_backward_generic_ck_tiled.cpp
index 0e8401959..700adeba5 100644
--- a/xformers/csrc/attention/hip_fmha/attention_backward_generic_ck_tiled.cpp
+++ b/xformers/csrc/attention/hip_fmha/attention_backward_generic_ck_tiled.cpp
@@ -216,7 +216,7 @@ efficient_attention_backward_ck(
 
     TORCH_CHECK(p.B == logsumexp.size(0));
     TORCH_CHECK(p.Hq == logsumexp.size(1));
-    TORCH_CHECK(p.M <= logsumexp.size(2));
+    TORCH_CHECK(p.M == logsumexp.size(2));
 
     if (scale.has_value()) {
       p.scale = float(*scale);
@@ -353,9 +353,9 @@ efficient_attention_backward_ck(
     p.max_seqlen_q = *max_seqlen_q_;
     p.max_seqlen_k = *max_seqlen_k_;
 
-    TORCH_CHECK(p.num_batches == logsumexp.size(0));
-    TORCH_CHECK(p.Hq == logsumexp.size(1));
-    TORCH_CHECK(p.max_seqlen_q <= logsumexp.size(2));
+    // unpadded lse layout required
+    TORCH_CHECK(p.Hq == logsumexp.size(0));
+    TORCH_CHECK(p.M == logsumexp.size(1));
 
     if (scale.has_value())
       p.scale = float(*scale);
@@ -385,8 +385,7 @@ efficient_attention_backward_ck(
 
     p.lsed_strides = {
         static_cast<int>(logsumexp.stride(0)),
-        static_cast<int>(logsumexp.stride(1)),
-        static_cast<int>(logsumexp.stride(2))};
+        static_cast<int>(logsumexp.stride(1))};
 
     if (use_grad_q_f32) {
       p.grad_q_f32_strides = {
diff --git a/xformers/csrc/attention/hip_fmha/attention_forward_generic_ck_tiled.cpp b/xformers/csrc/attention/hip_fmha/attention_forward_generic_ck_tiled.cpp
index fb29c7d21..fa6e0127a 100644
--- a/xformers/csrc/attention/hip_fmha/attention_forward_generic_ck_tiled.cpp
+++ b/xformers/csrc/attention/hip_fmha/attention_forward_generic_ck_tiled.cpp
@@ -316,18 +316,14 @@ efficient_attention_forward_ck(
       p.dropout_prob = 0.0f;
 
     if (p.compute_logsumexp) {
-      // align the access of logsumexp by each thread-group in cache-line size
-      int aligned_seqlen_q = (p.max_seqlen_q + 15) / 16 * 16;
-      logsumexp = at::empty(
-          {p.num_batches, Hq, aligned_seqlen_q}, opts.dtype(at::kFloat));
+      logsumexp = at::empty({Hq, M}, opts.dtype(at::kFloat));
       p.logsumexp_ptr = logsumexp.data_ptr();
       p.lse_strides = {
           static_cast<int>(logsumexp.stride(0)),
-          static_cast<int>(logsumexp.stride(1)),
-          static_cast<int>(logsumexp.stride(2))};
+          static_cast<int>(logsumexp.stride(1))};
     } else {
       p.logsumexp_ptr = nullptr;
-      p.lse_strides = {0, 0, 0};
+      p.lse_strides = {0, 0};
     }
   };
 
diff --git a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_backward.h b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_backward.h
index 8b0cd4dad..5ca27a0c5 100644
--- a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_backward.h
+++ b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_backward.h
@@ -219,8 +219,7 @@ struct grouped_backward_causalmask_bias_dropout_dispatch {
           param.out_strides[0], // stride_o
           param.grad_out_strides[1], // nhead_stride_do
           param.out_strides[1], // nhead_stride_o
-          param.lsed_strides[1],
-          param.lsed_strides[0]); // batch_stride_d
+          param.lsed_strides[0]); // nhead_stride_d
     }();
 
     dim3 kGridSize = FmhaBwdOGradDotOKernel::GridSize(
@@ -280,13 +279,12 @@ struct grouped_backward_causalmask_bias_dropout_dispatch {
           param.attn_bias_strides[0],
           0, // nhead_stride_randval
           param.grad_out_strides[1],
-          param.lsed_strides[1], // assume lse/dot is in BHM contiguous layout
+          param.lsed_strides[0], // assume lse/dot is in HM contiguous layout
           NeedConvertGradQ ? param.grad_q_f32_strides[1] : param.q_strides[1],
           param.grad_k_strides[1],
           param.grad_v_strides[1],
           param.attn_bias_strides[0], // assume grad_bias has same strides as
                                       // bias
-          param.lsed_strides[0], // batch_stride_lse
           0, // split_stride_dq_acc
           (param.window_size > 0) ? param.window_size - 1
                                   : -1, // window_left_size
diff --git a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_forward.h b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_forward.h
index 2fa305e0a..519a5ea89 100644
--- a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_forward.h
+++ b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_forward.h
@@ -150,9 +150,8 @@ struct grouped_forward_causalmask_bias_dropout_dispatch {
           param.v_strides[1],
           param.attn_bias_strides[1],
           0, // nhead_stride_randval
-          param.lse_strides[1],
+          param.lse_strides[0],
           param.out_strides[1],
-          param.lse_strides[0], // batch_stride_lse
           (param.window_size > 0) ? param.window_size - 1
                                   : -1, // window_left_size
           (param.custom_mask_type == 0) ? -1 : 0, // window_right_size
diff --git a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_infer.h b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_infer.h
index 5197a6cb1..d4a6c9dbd 100644
--- a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_infer.h
+++ b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_infer.h
@@ -196,7 +196,6 @@ struct grouped_infer_causalmask_bias_dropout_dispatch {
           0, // nhead_stride_randval
           0, // nhead_stride_lse
           param.out_strides[1],
-          0, // batch_stride_lse
           (param.window_size > 0) ? param.window_size - 1
                                   : -1, // window_left_size
           (param.custom_mask_type == 0) ? -1 : 0, // window_right_size
diff --git a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_params.h b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_params.h
index 3d94060dd..ce86f6df4 100644
--- a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_params.h
+++ b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_params.h
@@ -28,9 +28,6 @@ struct BatchedInferParams {
   std::array<int, 4> out_strides;
   std::array<int, 4> attn_bias_strides; // 4d tensor_view [B, H, M, N]
 
-  // BHM mode strides, completely contiguous
-  std::array<int, 3> lse_strides;
-
   const void* q_ptr;
   const void* k_ptr;
   const void* v_ptr;
@@ -49,6 +46,9 @@ struct BatchedForwardParams : public BatchedInferParams {
   int64_t philox_seed;
   int64_t philox_offset;
 
+  // BHM mode strides, completely contiguous
+  std::array<int, 3> lse_strides;
+
   // completely contiguous
   void* logsumexp_ptr;
 };
@@ -80,9 +80,6 @@ struct GroupedInferParams {
   // 4d tensor view [B, H, M, N]
   std::array<int, 4> attn_bias_strides;
 
-  // BHM mode strides, completely contiguous
-  std::array<int, 3> lse_strides;
-
   const void* q_ptr;
   const void* k_ptr;
   const void* v_ptr;
@@ -102,6 +99,10 @@ struct GroupedForwardParams : public GroupedInferParams {
   int64_t philox_seed;
   int64_t philox_offset;
 
+  // HM mode strides, completely contiguous, unpadded layout where M is
+  // concatten total seqlen_q for all batches
+  std::array<int, 2> lse_strides;
+
   // completely contiguous
   void* logsumexp_ptr;
 };
@@ -201,8 +202,9 @@ struct GroupedBackwardParams {
   // assume grad_q has same strides as q, but grad_q_f32 can be different
   std::array<int, 3> grad_q_f32_strides;
 
-  // BHM mode strides, completely contiguous
-  std::array<int, 3> lsed_strides;
+  // HM mode strides, completely contiguous, unpadded layout where M is
+  // concatten total seqlen_q for all batches
+  std::array<int, 2> lsed_strides;
 
   const void* q_ptr;
   const void* k_ptr;
diff --git a/xformers/ops/fmha/ck.py b/xformers/ops/fmha/ck.py
index 2de81623c..365ff76eb 100644
--- a/xformers/ops/fmha/ck.py
+++ b/xformers/ops/fmha/ck.py
@@ -362,6 +362,7 @@ class BwOp(AttentionBwOpBase):
     SUPPORTS_DROPOUT = FwOp.SUPPORTS_DROPOUT
     SUPPORTS_CUSTOM_SCALE = FwOp.SUPPORTS_CUSTOM_SCALE
     SUPPORTS_DIFFERENT_VALUE_EMBED = FwOp.SUPPORTS_DIFFERENT_VALUE_EMBED
+    SUPPORTS_UNPADDED_LSE = True
     NAME = "ckB"
 
     _TEST_K: List[int] = [

From c9e7595a11e03aebc7c1805fc05c97fd58771b79 Mon Sep 17 00:00:00 2001
From: Qianfeng Zhang <Qianfeng.Zhang@amd.com>
Date: Wed, 7 Aug 2024 16:35:30 +0000
Subject: [PATCH 603/641] Add explicit headdim256 instances for fmha backward

---
 third_party/composable_kernel_tiled           |  2 +-
 .../ck_tiled_fmha_batched_backward_bf16.cpp   | 13 ++++++++++++
 .../ck_tiled_fmha_batched_backward_fp16.cpp   | 13 ++++++++++++
 .../ck_tiled_fmha_grouped_backward_bf16.cpp   | 13 ++++++++++++
 .../ck_tiled_fmha_grouped_backward_fp16.cpp   | 13 ++++++++++++
 .../attention/hip_fmha/generate_instances.py  |  2 +-
 ...bias_has_biasgrad_has_dropout_maxk_256.cpp | 20 +++++++++++++++++++
 ..._bias_has_biasgrad_no_dropout_maxk_256.cpp | 20 +++++++++++++++++++
 ..._bias_no_biasgrad_has_dropout_maxk_256.cpp | 20 +++++++++++++++++++
 ...s_bias_no_biasgrad_no_dropout_maxk_256.cpp | 20 +++++++++++++++++++
 ..._bias_no_biasgrad_has_dropout_maxk_256.cpp | 20 +++++++++++++++++++
 ...o_bias_no_biasgrad_no_dropout_maxk_256.cpp | 20 +++++++++++++++++++
 ...bias_has_biasgrad_has_dropout_maxk_256.cpp | 20 +++++++++++++++++++
 ..._bias_has_biasgrad_no_dropout_maxk_256.cpp | 20 +++++++++++++++++++
 ..._bias_no_biasgrad_has_dropout_maxk_256.cpp | 20 +++++++++++++++++++
 ...s_bias_no_biasgrad_no_dropout_maxk_256.cpp | 20 +++++++++++++++++++
 ..._bias_no_biasgrad_has_dropout_maxk_256.cpp | 20 +++++++++++++++++++
 ...o_bias_no_biasgrad_no_dropout_maxk_256.cpp | 20 +++++++++++++++++++
 ...bias_has_biasgrad_has_dropout_maxk_256.cpp | 20 +++++++++++++++++++
 ..._bias_has_biasgrad_no_dropout_maxk_256.cpp | 20 +++++++++++++++++++
 ..._bias_no_biasgrad_has_dropout_maxk_256.cpp | 20 +++++++++++++++++++
 ...s_bias_no_biasgrad_no_dropout_maxk_256.cpp | 20 +++++++++++++++++++
 ..._bias_no_biasgrad_has_dropout_maxk_256.cpp | 20 +++++++++++++++++++
 ...o_bias_no_biasgrad_no_dropout_maxk_256.cpp | 20 +++++++++++++++++++
 ...bias_has_biasgrad_has_dropout_maxk_256.cpp | 20 +++++++++++++++++++
 ..._bias_has_biasgrad_no_dropout_maxk_256.cpp | 20 +++++++++++++++++++
 ..._bias_no_biasgrad_has_dropout_maxk_256.cpp | 20 +++++++++++++++++++
 ...s_bias_no_biasgrad_no_dropout_maxk_256.cpp | 20 +++++++++++++++++++
 ..._bias_no_biasgrad_has_dropout_maxk_256.cpp | 20 +++++++++++++++++++
 ...o_bias_no_biasgrad_no_dropout_maxk_256.cpp | 20 +++++++++++++++++++
 ...bias_has_biasgrad_has_dropout_maxk_256.cpp | 20 +++++++++++++++++++
 ..._bias_has_biasgrad_no_dropout_maxk_256.cpp | 20 +++++++++++++++++++
 ..._bias_no_biasgrad_has_dropout_maxk_256.cpp | 20 +++++++++++++++++++
 ...s_bias_no_biasgrad_no_dropout_maxk_256.cpp | 20 +++++++++++++++++++
 ..._bias_no_biasgrad_has_dropout_maxk_256.cpp | 20 +++++++++++++++++++
 ...o_bias_no_biasgrad_no_dropout_maxk_256.cpp | 20 +++++++++++++++++++
 ...bias_has_biasgrad_has_dropout_maxk_256.cpp | 20 +++++++++++++++++++
 ..._bias_has_biasgrad_no_dropout_maxk_256.cpp | 20 +++++++++++++++++++
 ..._bias_no_biasgrad_has_dropout_maxk_256.cpp | 20 +++++++++++++++++++
 ...s_bias_no_biasgrad_no_dropout_maxk_256.cpp | 20 +++++++++++++++++++
 ..._bias_no_biasgrad_has_dropout_maxk_256.cpp | 20 +++++++++++++++++++
 ...o_bias_no_biasgrad_no_dropout_maxk_256.cpp | 20 +++++++++++++++++++
 ...bias_has_biasgrad_has_dropout_maxk_256.cpp | 20 +++++++++++++++++++
 ..._bias_has_biasgrad_no_dropout_maxk_256.cpp | 20 +++++++++++++++++++
 ..._bias_no_biasgrad_has_dropout_maxk_256.cpp | 20 +++++++++++++++++++
 ...s_bias_no_biasgrad_no_dropout_maxk_256.cpp | 20 +++++++++++++++++++
 ..._bias_no_biasgrad_has_dropout_maxk_256.cpp | 20 +++++++++++++++++++
 ...o_bias_no_biasgrad_no_dropout_maxk_256.cpp | 20 +++++++++++++++++++
 ...bias_has_biasgrad_has_dropout_maxk_256.cpp | 20 +++++++++++++++++++
 ..._bias_has_biasgrad_no_dropout_maxk_256.cpp | 20 +++++++++++++++++++
 ..._bias_no_biasgrad_has_dropout_maxk_256.cpp | 20 +++++++++++++++++++
 ...s_bias_no_biasgrad_no_dropout_maxk_256.cpp | 20 +++++++++++++++++++
 ..._bias_no_biasgrad_has_dropout_maxk_256.cpp | 20 +++++++++++++++++++
 ...o_bias_no_biasgrad_no_dropout_maxk_256.cpp | 20 +++++++++++++++++++
 54 files changed, 1014 insertions(+), 2 deletions(-)
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_has_causalmask_has_bias_has_biasgrad_has_dropout_maxk_256.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_has_causalmask_has_bias_has_biasgrad_no_dropout_maxk_256.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_has_causalmask_has_bias_no_biasgrad_has_dropout_maxk_256.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_has_causalmask_has_bias_no_biasgrad_no_dropout_maxk_256.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_has_causalmask_no_bias_no_biasgrad_has_dropout_maxk_256.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_has_causalmask_no_bias_no_biasgrad_no_dropout_maxk_256.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_no_causalmask_has_bias_has_biasgrad_has_dropout_maxk_256.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_no_causalmask_has_bias_has_biasgrad_no_dropout_maxk_256.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_no_causalmask_has_bias_no_biasgrad_has_dropout_maxk_256.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_no_causalmask_has_bias_no_biasgrad_no_dropout_maxk_256.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_no_causalmask_no_bias_no_biasgrad_has_dropout_maxk_256.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_no_causalmask_no_bias_no_biasgrad_no_dropout_maxk_256.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_has_bias_has_biasgrad_has_dropout_maxk_256.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_has_bias_has_biasgrad_no_dropout_maxk_256.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_has_bias_no_biasgrad_has_dropout_maxk_256.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_has_bias_no_biasgrad_no_dropout_maxk_256.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_no_bias_no_biasgrad_has_dropout_maxk_256.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_no_bias_no_biasgrad_no_dropout_maxk_256.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_has_bias_has_biasgrad_has_dropout_maxk_256.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_has_bias_has_biasgrad_no_dropout_maxk_256.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_has_bias_no_biasgrad_has_dropout_maxk_256.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_has_bias_no_biasgrad_no_dropout_maxk_256.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_no_bias_no_biasgrad_has_dropout_maxk_256.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_no_bias_no_biasgrad_no_dropout_maxk_256.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_has_causalmask_has_bias_has_biasgrad_has_dropout_maxk_256.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_has_causalmask_has_bias_has_biasgrad_no_dropout_maxk_256.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_has_causalmask_has_bias_no_biasgrad_has_dropout_maxk_256.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_has_causalmask_has_bias_no_biasgrad_no_dropout_maxk_256.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_has_causalmask_no_bias_no_biasgrad_has_dropout_maxk_256.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_has_causalmask_no_bias_no_biasgrad_no_dropout_maxk_256.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_no_causalmask_has_bias_has_biasgrad_has_dropout_maxk_256.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_no_causalmask_has_bias_has_biasgrad_no_dropout_maxk_256.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_no_causalmask_has_bias_no_biasgrad_has_dropout_maxk_256.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_no_causalmask_has_bias_no_biasgrad_no_dropout_maxk_256.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_no_causalmask_no_bias_no_biasgrad_has_dropout_maxk_256.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_no_causalmask_no_bias_no_biasgrad_no_dropout_maxk_256.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_has_bias_has_biasgrad_has_dropout_maxk_256.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_has_bias_has_biasgrad_no_dropout_maxk_256.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_has_bias_no_biasgrad_has_dropout_maxk_256.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_has_bias_no_biasgrad_no_dropout_maxk_256.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_no_bias_no_biasgrad_has_dropout_maxk_256.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_no_bias_no_biasgrad_no_dropout_maxk_256.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_has_bias_has_biasgrad_has_dropout_maxk_256.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_has_bias_has_biasgrad_no_dropout_maxk_256.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_has_bias_no_biasgrad_has_dropout_maxk_256.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_has_bias_no_biasgrad_no_dropout_maxk_256.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_no_bias_no_biasgrad_has_dropout_maxk_256.cpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_no_bias_no_biasgrad_no_dropout_maxk_256.cpp

diff --git a/third_party/composable_kernel_tiled b/third_party/composable_kernel_tiled
index e6c489df4..0178da6f5 160000
--- a/third_party/composable_kernel_tiled
+++ b/third_party/composable_kernel_tiled
@@ -1 +1 @@
-Subproject commit e6c489df4980e676af15010a9c26f1aaee270ef8
+Subproject commit 0178da6f5071171df3362bb9d419b4da0feb3765
diff --git a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_backward_bf16.cpp b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_backward_bf16.cpp
index a9e17ee73..1215498e9 100644
--- a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_backward_bf16.cpp
+++ b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_backward_bf16.cpp
@@ -89,6 +89,19 @@ extern template void run_batched_backward_causalmask_bias_dropout_dispatch<ck_ti
     BatchedBackwardParams& param, hipStream_t stream);
 extern template void run_batched_backward_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, true, false, false, false, 128>(
     BatchedBackwardParams& param, hipStream_t stream);
+
+extern template void run_batched_backward_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, false, true, true, false, 256>(
+    BatchedBackwardParams& param, hipStream_t stream);
+extern template void run_batched_backward_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, false, true, false, false, 256>(
+    BatchedBackwardParams& param, hipStream_t stream);
+extern template void run_batched_backward_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, false, false, false, false, 256>(
+    BatchedBackwardParams& param, hipStream_t stream);
+extern template void run_batched_backward_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, true, true, true, false, 256>(
+    BatchedBackwardParams& param, hipStream_t stream);
+extern template void run_batched_backward_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, true, true, false, false, 256>(
+    BatchedBackwardParams& param, hipStream_t stream);
+extern template void run_batched_backward_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, true, false, false, false, 256>(
+    BatchedBackwardParams& param, hipStream_t stream);
 // clang-format on
 
 void batched_backward_bf16(BatchedBackwardParams& param, hipStream_t stream) {
diff --git a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_backward_fp16.cpp b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_backward_fp16.cpp
index 17c4aa9d3..e1f442c2f 100644
--- a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_backward_fp16.cpp
+++ b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_backward_fp16.cpp
@@ -89,6 +89,19 @@ extern template void run_batched_backward_causalmask_bias_dropout_dispatch<ck_ti
     BatchedBackwardParams& param, hipStream_t stream);
 extern template void run_batched_backward_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, true, false, false, false, 128>(
     BatchedBackwardParams& param, hipStream_t stream);
+
+extern template void run_batched_backward_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, false, true, true, false, 256>(
+    BatchedBackwardParams& param, hipStream_t stream);
+extern template void run_batched_backward_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, false, true, false, false, 256>(
+    BatchedBackwardParams& param, hipStream_t stream);
+extern template void run_batched_backward_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, false, false, false, false, 256>(
+    BatchedBackwardParams& param, hipStream_t stream);
+extern template void run_batched_backward_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, true, true, true, false, 256>(
+    BatchedBackwardParams& param, hipStream_t stream);
+extern template void run_batched_backward_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, true, true, false, false, 256>(
+    BatchedBackwardParams& param, hipStream_t stream);
+extern template void run_batched_backward_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, true, false, false, false, 256>(
+    BatchedBackwardParams& param, hipStream_t stream);
 // clang-format on
 
 void batched_backward_fp16(BatchedBackwardParams& param, hipStream_t stream) {
diff --git a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_backward_bf16.cpp b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_backward_bf16.cpp
index 5d08a4d72..2f04ca0b2 100644
--- a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_backward_bf16.cpp
+++ b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_backward_bf16.cpp
@@ -89,6 +89,19 @@ extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<ck_ti
     GroupedBackwardParams& param, hipStream_t stream);
 extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, true, false, false, false, 128>(
     GroupedBackwardParams& param, hipStream_t stream);
+
+extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, false, true, true, false, 256>(
+    GroupedBackwardParams& param, hipStream_t stream);
+extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, false, true, false, false, 256>(
+    GroupedBackwardParams& param, hipStream_t stream);
+extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, false, false, false, false, 256>(
+    GroupedBackwardParams& param, hipStream_t stream);
+extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, true, true, true, false, 256>(
+    GroupedBackwardParams& param, hipStream_t stream);
+extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, true, true, false, false, 256>(
+    GroupedBackwardParams& param, hipStream_t stream);
+extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, true, false, false, false, 256>(
+    GroupedBackwardParams& param, hipStream_t stream);
 // clang-format on
 
 void grouped_backward_bf16(GroupedBackwardParams& param, hipStream_t stream) {
diff --git a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_backward_fp16.cpp b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_backward_fp16.cpp
index 266cd0ad1..8d97bc180 100644
--- a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_backward_fp16.cpp
+++ b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_backward_fp16.cpp
@@ -89,6 +89,19 @@ extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<ck_ti
     GroupedBackwardParams& param, hipStream_t stream);
 extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, true, false, false, false, 128>(
     GroupedBackwardParams& param, hipStream_t stream);
+
+extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, false, true, true, false, 256>(
+    GroupedBackwardParams& param, hipStream_t stream);
+extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, false, true, false, false, 256>(
+    GroupedBackwardParams& param, hipStream_t stream);
+extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, false, false, false, false, 256>(
+    GroupedBackwardParams& param, hipStream_t stream);
+extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, true, true, true, false, 256>(
+    GroupedBackwardParams& param, hipStream_t stream);
+extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, true, true, false, false, 256>(
+    GroupedBackwardParams& param, hipStream_t stream);
+extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, true, false, false, false, 256>(
+    GroupedBackwardParams& param, hipStream_t stream);
 // clang-format on
 
 void grouped_backward_fp16(GroupedBackwardParams& param, hipStream_t stream) {
diff --git a/xformers/csrc/attention/hip_fmha/generate_instances.py b/xformers/csrc/attention/hip_fmha/generate_instances.py
index 4abd46ec5..1a5033f97 100644
--- a/xformers/csrc/attention/hip_fmha/generate_instances.py
+++ b/xformers/csrc/attention/hip_fmha/generate_instances.py
@@ -175,7 +175,7 @@ def create_backward_instances(instance_dir: Path) -> None:
             for has_causalmask in [True, False]:
                 for has_bias, has_bias_grad in [[True, False], [True, True], [False, False]]:
                     for has_dropout in [True, False]:
-                        for max_k in [32, 64, 128]:
+                        for max_k in [32, 64, 128, 256]:
                             fname = FMHA_BACKWARD_INSTANCE_FNAME.format(
                                 mode=mode,
                                 dtype_str=dtype,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_has_causalmask_has_bias_has_biasgrad_has_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_has_causalmask_has_bias_has_biasgrad_has_dropout_maxk_256.cpp
new file mode 100644
index 000000000..a92e5f8cb
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_has_causalmask_has_bias_has_biasgrad_has_dropout_maxk_256.cpp
@@ -0,0 +1,20 @@
+
+/*
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
+ */
+
+#include <ck_tile/core/numeric/bfloat16.hpp>
+#include "ck_tiled_fmha_batched_backward.h"
+
+template void run_batched_backward_causalmask_bias_dropout_dispatch<
+    ck_tile::bf16_t,
+    true,
+    true,
+    true,
+    true,
+    256>(BatchedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_has_causalmask_has_bias_has_biasgrad_no_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_has_causalmask_has_bias_has_biasgrad_no_dropout_maxk_256.cpp
new file mode 100644
index 000000000..0928f59bb
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_has_causalmask_has_bias_has_biasgrad_no_dropout_maxk_256.cpp
@@ -0,0 +1,20 @@
+
+/*
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
+ */
+
+#include <ck_tile/core/numeric/bfloat16.hpp>
+#include "ck_tiled_fmha_batched_backward.h"
+
+template void run_batched_backward_causalmask_bias_dropout_dispatch<
+    ck_tile::bf16_t,
+    true,
+    true,
+    true,
+    false,
+    256>(BatchedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_has_causalmask_has_bias_no_biasgrad_has_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_has_causalmask_has_bias_no_biasgrad_has_dropout_maxk_256.cpp
new file mode 100644
index 000000000..670b672e9
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_has_causalmask_has_bias_no_biasgrad_has_dropout_maxk_256.cpp
@@ -0,0 +1,20 @@
+
+/*
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
+ */
+
+#include <ck_tile/core/numeric/bfloat16.hpp>
+#include "ck_tiled_fmha_batched_backward.h"
+
+template void run_batched_backward_causalmask_bias_dropout_dispatch<
+    ck_tile::bf16_t,
+    true,
+    true,
+    false,
+    true,
+    256>(BatchedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_has_causalmask_has_bias_no_biasgrad_no_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_has_causalmask_has_bias_no_biasgrad_no_dropout_maxk_256.cpp
new file mode 100644
index 000000000..b3e989122
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_has_causalmask_has_bias_no_biasgrad_no_dropout_maxk_256.cpp
@@ -0,0 +1,20 @@
+
+/*
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
+ */
+
+#include <ck_tile/core/numeric/bfloat16.hpp>
+#include "ck_tiled_fmha_batched_backward.h"
+
+template void run_batched_backward_causalmask_bias_dropout_dispatch<
+    ck_tile::bf16_t,
+    true,
+    true,
+    false,
+    false,
+    256>(BatchedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_has_causalmask_no_bias_no_biasgrad_has_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_has_causalmask_no_bias_no_biasgrad_has_dropout_maxk_256.cpp
new file mode 100644
index 000000000..a0bf3f96a
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_has_causalmask_no_bias_no_biasgrad_has_dropout_maxk_256.cpp
@@ -0,0 +1,20 @@
+
+/*
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
+ */
+
+#include <ck_tile/core/numeric/bfloat16.hpp>
+#include "ck_tiled_fmha_batched_backward.h"
+
+template void run_batched_backward_causalmask_bias_dropout_dispatch<
+    ck_tile::bf16_t,
+    true,
+    false,
+    false,
+    true,
+    256>(BatchedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_has_causalmask_no_bias_no_biasgrad_no_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_has_causalmask_no_bias_no_biasgrad_no_dropout_maxk_256.cpp
new file mode 100644
index 000000000..53e5698cb
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_has_causalmask_no_bias_no_biasgrad_no_dropout_maxk_256.cpp
@@ -0,0 +1,20 @@
+
+/*
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
+ */
+
+#include <ck_tile/core/numeric/bfloat16.hpp>
+#include "ck_tiled_fmha_batched_backward.h"
+
+template void run_batched_backward_causalmask_bias_dropout_dispatch<
+    ck_tile::bf16_t,
+    true,
+    false,
+    false,
+    false,
+    256>(BatchedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_no_causalmask_has_bias_has_biasgrad_has_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_no_causalmask_has_bias_has_biasgrad_has_dropout_maxk_256.cpp
new file mode 100644
index 000000000..94e149aac
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_no_causalmask_has_bias_has_biasgrad_has_dropout_maxk_256.cpp
@@ -0,0 +1,20 @@
+
+/*
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
+ */
+
+#include <ck_tile/core/numeric/bfloat16.hpp>
+#include "ck_tiled_fmha_batched_backward.h"
+
+template void run_batched_backward_causalmask_bias_dropout_dispatch<
+    ck_tile::bf16_t,
+    false,
+    true,
+    true,
+    true,
+    256>(BatchedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_no_causalmask_has_bias_has_biasgrad_no_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_no_causalmask_has_bias_has_biasgrad_no_dropout_maxk_256.cpp
new file mode 100644
index 000000000..7260605ba
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_no_causalmask_has_bias_has_biasgrad_no_dropout_maxk_256.cpp
@@ -0,0 +1,20 @@
+
+/*
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
+ */
+
+#include <ck_tile/core/numeric/bfloat16.hpp>
+#include "ck_tiled_fmha_batched_backward.h"
+
+template void run_batched_backward_causalmask_bias_dropout_dispatch<
+    ck_tile::bf16_t,
+    false,
+    true,
+    true,
+    false,
+    256>(BatchedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_no_causalmask_has_bias_no_biasgrad_has_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_no_causalmask_has_bias_no_biasgrad_has_dropout_maxk_256.cpp
new file mode 100644
index 000000000..dd83434ae
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_no_causalmask_has_bias_no_biasgrad_has_dropout_maxk_256.cpp
@@ -0,0 +1,20 @@
+
+/*
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
+ */
+
+#include <ck_tile/core/numeric/bfloat16.hpp>
+#include "ck_tiled_fmha_batched_backward.h"
+
+template void run_batched_backward_causalmask_bias_dropout_dispatch<
+    ck_tile::bf16_t,
+    false,
+    true,
+    false,
+    true,
+    256>(BatchedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_no_causalmask_has_bias_no_biasgrad_no_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_no_causalmask_has_bias_no_biasgrad_no_dropout_maxk_256.cpp
new file mode 100644
index 000000000..ab068fd9f
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_no_causalmask_has_bias_no_biasgrad_no_dropout_maxk_256.cpp
@@ -0,0 +1,20 @@
+
+/*
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
+ */
+
+#include <ck_tile/core/numeric/bfloat16.hpp>
+#include "ck_tiled_fmha_batched_backward.h"
+
+template void run_batched_backward_causalmask_bias_dropout_dispatch<
+    ck_tile::bf16_t,
+    false,
+    true,
+    false,
+    false,
+    256>(BatchedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_no_causalmask_no_bias_no_biasgrad_has_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_no_causalmask_no_bias_no_biasgrad_has_dropout_maxk_256.cpp
new file mode 100644
index 000000000..f8709cb22
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_no_causalmask_no_bias_no_biasgrad_has_dropout_maxk_256.cpp
@@ -0,0 +1,20 @@
+
+/*
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
+ */
+
+#include <ck_tile/core/numeric/bfloat16.hpp>
+#include "ck_tiled_fmha_batched_backward.h"
+
+template void run_batched_backward_causalmask_bias_dropout_dispatch<
+    ck_tile::bf16_t,
+    false,
+    false,
+    false,
+    true,
+    256>(BatchedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_no_causalmask_no_bias_no_biasgrad_no_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_no_causalmask_no_bias_no_biasgrad_no_dropout_maxk_256.cpp
new file mode 100644
index 000000000..b5293053e
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_no_causalmask_no_bias_no_biasgrad_no_dropout_maxk_256.cpp
@@ -0,0 +1,20 @@
+
+/*
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
+ */
+
+#include <ck_tile/core/numeric/bfloat16.hpp>
+#include "ck_tiled_fmha_batched_backward.h"
+
+template void run_batched_backward_causalmask_bias_dropout_dispatch<
+    ck_tile::bf16_t,
+    false,
+    false,
+    false,
+    false,
+    256>(BatchedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_has_bias_has_biasgrad_has_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_has_bias_has_biasgrad_has_dropout_maxk_256.cpp
new file mode 100644
index 000000000..9ff4c121f
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_has_bias_has_biasgrad_has_dropout_maxk_256.cpp
@@ -0,0 +1,20 @@
+
+/*
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
+ */
+
+#include <ck_tile/core/numeric/half.hpp>
+#include "ck_tiled_fmha_batched_backward.h"
+
+template void run_batched_backward_causalmask_bias_dropout_dispatch<
+    ck_tile::fp16_t,
+    true,
+    true,
+    true,
+    true,
+    256>(BatchedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_has_bias_has_biasgrad_no_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_has_bias_has_biasgrad_no_dropout_maxk_256.cpp
new file mode 100644
index 000000000..e85335338
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_has_bias_has_biasgrad_no_dropout_maxk_256.cpp
@@ -0,0 +1,20 @@
+
+/*
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
+ */
+
+#include <ck_tile/core/numeric/half.hpp>
+#include "ck_tiled_fmha_batched_backward.h"
+
+template void run_batched_backward_causalmask_bias_dropout_dispatch<
+    ck_tile::fp16_t,
+    true,
+    true,
+    true,
+    false,
+    256>(BatchedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_has_bias_no_biasgrad_has_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_has_bias_no_biasgrad_has_dropout_maxk_256.cpp
new file mode 100644
index 000000000..743c74a29
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_has_bias_no_biasgrad_has_dropout_maxk_256.cpp
@@ -0,0 +1,20 @@
+
+/*
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
+ */
+
+#include <ck_tile/core/numeric/half.hpp>
+#include "ck_tiled_fmha_batched_backward.h"
+
+template void run_batched_backward_causalmask_bias_dropout_dispatch<
+    ck_tile::fp16_t,
+    true,
+    true,
+    false,
+    true,
+    256>(BatchedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_has_bias_no_biasgrad_no_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_has_bias_no_biasgrad_no_dropout_maxk_256.cpp
new file mode 100644
index 000000000..84f934fff
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_has_bias_no_biasgrad_no_dropout_maxk_256.cpp
@@ -0,0 +1,20 @@
+
+/*
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
+ */
+
+#include <ck_tile/core/numeric/half.hpp>
+#include "ck_tiled_fmha_batched_backward.h"
+
+template void run_batched_backward_causalmask_bias_dropout_dispatch<
+    ck_tile::fp16_t,
+    true,
+    true,
+    false,
+    false,
+    256>(BatchedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_no_bias_no_biasgrad_has_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_no_bias_no_biasgrad_has_dropout_maxk_256.cpp
new file mode 100644
index 000000000..3683f701f
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_no_bias_no_biasgrad_has_dropout_maxk_256.cpp
@@ -0,0 +1,20 @@
+
+/*
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
+ */
+
+#include <ck_tile/core/numeric/half.hpp>
+#include "ck_tiled_fmha_batched_backward.h"
+
+template void run_batched_backward_causalmask_bias_dropout_dispatch<
+    ck_tile::fp16_t,
+    true,
+    false,
+    false,
+    true,
+    256>(BatchedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_no_bias_no_biasgrad_no_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_no_bias_no_biasgrad_no_dropout_maxk_256.cpp
new file mode 100644
index 000000000..a48129acd
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_no_bias_no_biasgrad_no_dropout_maxk_256.cpp
@@ -0,0 +1,20 @@
+
+/*
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
+ */
+
+#include <ck_tile/core/numeric/half.hpp>
+#include "ck_tiled_fmha_batched_backward.h"
+
+template void run_batched_backward_causalmask_bias_dropout_dispatch<
+    ck_tile::fp16_t,
+    true,
+    false,
+    false,
+    false,
+    256>(BatchedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_has_bias_has_biasgrad_has_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_has_bias_has_biasgrad_has_dropout_maxk_256.cpp
new file mode 100644
index 000000000..96e8fe198
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_has_bias_has_biasgrad_has_dropout_maxk_256.cpp
@@ -0,0 +1,20 @@
+
+/*
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
+ */
+
+#include <ck_tile/core/numeric/half.hpp>
+#include "ck_tiled_fmha_batched_backward.h"
+
+template void run_batched_backward_causalmask_bias_dropout_dispatch<
+    ck_tile::fp16_t,
+    false,
+    true,
+    true,
+    true,
+    256>(BatchedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_has_bias_has_biasgrad_no_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_has_bias_has_biasgrad_no_dropout_maxk_256.cpp
new file mode 100644
index 000000000..380446dcc
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_has_bias_has_biasgrad_no_dropout_maxk_256.cpp
@@ -0,0 +1,20 @@
+
+/*
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
+ */
+
+#include <ck_tile/core/numeric/half.hpp>
+#include "ck_tiled_fmha_batched_backward.h"
+
+template void run_batched_backward_causalmask_bias_dropout_dispatch<
+    ck_tile::fp16_t,
+    false,
+    true,
+    true,
+    false,
+    256>(BatchedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_has_bias_no_biasgrad_has_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_has_bias_no_biasgrad_has_dropout_maxk_256.cpp
new file mode 100644
index 000000000..2d8fb4f54
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_has_bias_no_biasgrad_has_dropout_maxk_256.cpp
@@ -0,0 +1,20 @@
+
+/*
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
+ */
+
+#include <ck_tile/core/numeric/half.hpp>
+#include "ck_tiled_fmha_batched_backward.h"
+
+template void run_batched_backward_causalmask_bias_dropout_dispatch<
+    ck_tile::fp16_t,
+    false,
+    true,
+    false,
+    true,
+    256>(BatchedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_has_bias_no_biasgrad_no_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_has_bias_no_biasgrad_no_dropout_maxk_256.cpp
new file mode 100644
index 000000000..934253c9c
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_has_bias_no_biasgrad_no_dropout_maxk_256.cpp
@@ -0,0 +1,20 @@
+
+/*
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
+ */
+
+#include <ck_tile/core/numeric/half.hpp>
+#include "ck_tiled_fmha_batched_backward.h"
+
+template void run_batched_backward_causalmask_bias_dropout_dispatch<
+    ck_tile::fp16_t,
+    false,
+    true,
+    false,
+    false,
+    256>(BatchedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_no_bias_no_biasgrad_has_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_no_bias_no_biasgrad_has_dropout_maxk_256.cpp
new file mode 100644
index 000000000..de8a13a8a
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_no_bias_no_biasgrad_has_dropout_maxk_256.cpp
@@ -0,0 +1,20 @@
+
+/*
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
+ */
+
+#include <ck_tile/core/numeric/half.hpp>
+#include "ck_tiled_fmha_batched_backward.h"
+
+template void run_batched_backward_causalmask_bias_dropout_dispatch<
+    ck_tile::fp16_t,
+    false,
+    false,
+    false,
+    true,
+    256>(BatchedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_no_bias_no_biasgrad_no_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_no_bias_no_biasgrad_no_dropout_maxk_256.cpp
new file mode 100644
index 000000000..3d7b4d235
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_no_bias_no_biasgrad_no_dropout_maxk_256.cpp
@@ -0,0 +1,20 @@
+
+/*
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
+ */
+
+#include <ck_tile/core/numeric/half.hpp>
+#include "ck_tiled_fmha_batched_backward.h"
+
+template void run_batched_backward_causalmask_bias_dropout_dispatch<
+    ck_tile::fp16_t,
+    false,
+    false,
+    false,
+    false,
+    256>(BatchedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_has_causalmask_has_bias_has_biasgrad_has_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_has_causalmask_has_bias_has_biasgrad_has_dropout_maxk_256.cpp
new file mode 100644
index 000000000..bf91a8aae
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_has_causalmask_has_bias_has_biasgrad_has_dropout_maxk_256.cpp
@@ -0,0 +1,20 @@
+
+/*
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
+ */
+
+#include <ck_tile/core/numeric/bfloat16.hpp>
+#include "ck_tiled_fmha_grouped_backward.h"
+
+template void run_grouped_backward_causalmask_bias_dropout_dispatch<
+    ck_tile::bf16_t,
+    true,
+    true,
+    true,
+    true,
+    256>(GroupedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_has_causalmask_has_bias_has_biasgrad_no_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_has_causalmask_has_bias_has_biasgrad_no_dropout_maxk_256.cpp
new file mode 100644
index 000000000..bf8cc800d
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_has_causalmask_has_bias_has_biasgrad_no_dropout_maxk_256.cpp
@@ -0,0 +1,20 @@
+
+/*
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
+ */
+
+#include <ck_tile/core/numeric/bfloat16.hpp>
+#include "ck_tiled_fmha_grouped_backward.h"
+
+template void run_grouped_backward_causalmask_bias_dropout_dispatch<
+    ck_tile::bf16_t,
+    true,
+    true,
+    true,
+    false,
+    256>(GroupedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_has_causalmask_has_bias_no_biasgrad_has_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_has_causalmask_has_bias_no_biasgrad_has_dropout_maxk_256.cpp
new file mode 100644
index 000000000..613249768
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_has_causalmask_has_bias_no_biasgrad_has_dropout_maxk_256.cpp
@@ -0,0 +1,20 @@
+
+/*
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
+ */
+
+#include <ck_tile/core/numeric/bfloat16.hpp>
+#include "ck_tiled_fmha_grouped_backward.h"
+
+template void run_grouped_backward_causalmask_bias_dropout_dispatch<
+    ck_tile::bf16_t,
+    true,
+    true,
+    false,
+    true,
+    256>(GroupedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_has_causalmask_has_bias_no_biasgrad_no_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_has_causalmask_has_bias_no_biasgrad_no_dropout_maxk_256.cpp
new file mode 100644
index 000000000..a7987553b
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_has_causalmask_has_bias_no_biasgrad_no_dropout_maxk_256.cpp
@@ -0,0 +1,20 @@
+
+/*
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
+ */
+
+#include <ck_tile/core/numeric/bfloat16.hpp>
+#include "ck_tiled_fmha_grouped_backward.h"
+
+template void run_grouped_backward_causalmask_bias_dropout_dispatch<
+    ck_tile::bf16_t,
+    true,
+    true,
+    false,
+    false,
+    256>(GroupedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_has_causalmask_no_bias_no_biasgrad_has_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_has_causalmask_no_bias_no_biasgrad_has_dropout_maxk_256.cpp
new file mode 100644
index 000000000..f71a97734
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_has_causalmask_no_bias_no_biasgrad_has_dropout_maxk_256.cpp
@@ -0,0 +1,20 @@
+
+/*
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
+ */
+
+#include <ck_tile/core/numeric/bfloat16.hpp>
+#include "ck_tiled_fmha_grouped_backward.h"
+
+template void run_grouped_backward_causalmask_bias_dropout_dispatch<
+    ck_tile::bf16_t,
+    true,
+    false,
+    false,
+    true,
+    256>(GroupedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_has_causalmask_no_bias_no_biasgrad_no_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_has_causalmask_no_bias_no_biasgrad_no_dropout_maxk_256.cpp
new file mode 100644
index 000000000..8986817c2
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_has_causalmask_no_bias_no_biasgrad_no_dropout_maxk_256.cpp
@@ -0,0 +1,20 @@
+
+/*
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
+ */
+
+#include <ck_tile/core/numeric/bfloat16.hpp>
+#include "ck_tiled_fmha_grouped_backward.h"
+
+template void run_grouped_backward_causalmask_bias_dropout_dispatch<
+    ck_tile::bf16_t,
+    true,
+    false,
+    false,
+    false,
+    256>(GroupedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_no_causalmask_has_bias_has_biasgrad_has_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_no_causalmask_has_bias_has_biasgrad_has_dropout_maxk_256.cpp
new file mode 100644
index 000000000..677b48f17
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_no_causalmask_has_bias_has_biasgrad_has_dropout_maxk_256.cpp
@@ -0,0 +1,20 @@
+
+/*
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
+ */
+
+#include <ck_tile/core/numeric/bfloat16.hpp>
+#include "ck_tiled_fmha_grouped_backward.h"
+
+template void run_grouped_backward_causalmask_bias_dropout_dispatch<
+    ck_tile::bf16_t,
+    false,
+    true,
+    true,
+    true,
+    256>(GroupedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_no_causalmask_has_bias_has_biasgrad_no_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_no_causalmask_has_bias_has_biasgrad_no_dropout_maxk_256.cpp
new file mode 100644
index 000000000..4031048c9
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_no_causalmask_has_bias_has_biasgrad_no_dropout_maxk_256.cpp
@@ -0,0 +1,20 @@
+
+/*
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
+ */
+
+#include <ck_tile/core/numeric/bfloat16.hpp>
+#include "ck_tiled_fmha_grouped_backward.h"
+
+template void run_grouped_backward_causalmask_bias_dropout_dispatch<
+    ck_tile::bf16_t,
+    false,
+    true,
+    true,
+    false,
+    256>(GroupedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_no_causalmask_has_bias_no_biasgrad_has_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_no_causalmask_has_bias_no_biasgrad_has_dropout_maxk_256.cpp
new file mode 100644
index 000000000..9287971e7
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_no_causalmask_has_bias_no_biasgrad_has_dropout_maxk_256.cpp
@@ -0,0 +1,20 @@
+
+/*
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
+ */
+
+#include <ck_tile/core/numeric/bfloat16.hpp>
+#include "ck_tiled_fmha_grouped_backward.h"
+
+template void run_grouped_backward_causalmask_bias_dropout_dispatch<
+    ck_tile::bf16_t,
+    false,
+    true,
+    false,
+    true,
+    256>(GroupedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_no_causalmask_has_bias_no_biasgrad_no_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_no_causalmask_has_bias_no_biasgrad_no_dropout_maxk_256.cpp
new file mode 100644
index 000000000..918db4a7d
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_no_causalmask_has_bias_no_biasgrad_no_dropout_maxk_256.cpp
@@ -0,0 +1,20 @@
+
+/*
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
+ */
+
+#include <ck_tile/core/numeric/bfloat16.hpp>
+#include "ck_tiled_fmha_grouped_backward.h"
+
+template void run_grouped_backward_causalmask_bias_dropout_dispatch<
+    ck_tile::bf16_t,
+    false,
+    true,
+    false,
+    false,
+    256>(GroupedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_no_causalmask_no_bias_no_biasgrad_has_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_no_causalmask_no_bias_no_biasgrad_has_dropout_maxk_256.cpp
new file mode 100644
index 000000000..06f4dfdee
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_no_causalmask_no_bias_no_biasgrad_has_dropout_maxk_256.cpp
@@ -0,0 +1,20 @@
+
+/*
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
+ */
+
+#include <ck_tile/core/numeric/bfloat16.hpp>
+#include "ck_tiled_fmha_grouped_backward.h"
+
+template void run_grouped_backward_causalmask_bias_dropout_dispatch<
+    ck_tile::bf16_t,
+    false,
+    false,
+    false,
+    true,
+    256>(GroupedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_no_causalmask_no_bias_no_biasgrad_no_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_no_causalmask_no_bias_no_biasgrad_no_dropout_maxk_256.cpp
new file mode 100644
index 000000000..046695fa2
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_no_causalmask_no_bias_no_biasgrad_no_dropout_maxk_256.cpp
@@ -0,0 +1,20 @@
+
+/*
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
+ */
+
+#include <ck_tile/core/numeric/bfloat16.hpp>
+#include "ck_tiled_fmha_grouped_backward.h"
+
+template void run_grouped_backward_causalmask_bias_dropout_dispatch<
+    ck_tile::bf16_t,
+    false,
+    false,
+    false,
+    false,
+    256>(GroupedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_has_bias_has_biasgrad_has_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_has_bias_has_biasgrad_has_dropout_maxk_256.cpp
new file mode 100644
index 000000000..1955fc406
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_has_bias_has_biasgrad_has_dropout_maxk_256.cpp
@@ -0,0 +1,20 @@
+
+/*
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
+ */
+
+#include <ck_tile/core/numeric/half.hpp>
+#include "ck_tiled_fmha_grouped_backward.h"
+
+template void run_grouped_backward_causalmask_bias_dropout_dispatch<
+    ck_tile::fp16_t,
+    true,
+    true,
+    true,
+    true,
+    256>(GroupedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_has_bias_has_biasgrad_no_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_has_bias_has_biasgrad_no_dropout_maxk_256.cpp
new file mode 100644
index 000000000..9958105c9
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_has_bias_has_biasgrad_no_dropout_maxk_256.cpp
@@ -0,0 +1,20 @@
+
+/*
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
+ */
+
+#include <ck_tile/core/numeric/half.hpp>
+#include "ck_tiled_fmha_grouped_backward.h"
+
+template void run_grouped_backward_causalmask_bias_dropout_dispatch<
+    ck_tile::fp16_t,
+    true,
+    true,
+    true,
+    false,
+    256>(GroupedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_has_bias_no_biasgrad_has_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_has_bias_no_biasgrad_has_dropout_maxk_256.cpp
new file mode 100644
index 000000000..e45e7a153
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_has_bias_no_biasgrad_has_dropout_maxk_256.cpp
@@ -0,0 +1,20 @@
+
+/*
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
+ */
+
+#include <ck_tile/core/numeric/half.hpp>
+#include "ck_tiled_fmha_grouped_backward.h"
+
+template void run_grouped_backward_causalmask_bias_dropout_dispatch<
+    ck_tile::fp16_t,
+    true,
+    true,
+    false,
+    true,
+    256>(GroupedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_has_bias_no_biasgrad_no_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_has_bias_no_biasgrad_no_dropout_maxk_256.cpp
new file mode 100644
index 000000000..4f8264bbf
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_has_bias_no_biasgrad_no_dropout_maxk_256.cpp
@@ -0,0 +1,20 @@
+
+/*
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
+ */
+
+#include <ck_tile/core/numeric/half.hpp>
+#include "ck_tiled_fmha_grouped_backward.h"
+
+template void run_grouped_backward_causalmask_bias_dropout_dispatch<
+    ck_tile::fp16_t,
+    true,
+    true,
+    false,
+    false,
+    256>(GroupedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_no_bias_no_biasgrad_has_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_no_bias_no_biasgrad_has_dropout_maxk_256.cpp
new file mode 100644
index 000000000..7a642504b
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_no_bias_no_biasgrad_has_dropout_maxk_256.cpp
@@ -0,0 +1,20 @@
+
+/*
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
+ */
+
+#include <ck_tile/core/numeric/half.hpp>
+#include "ck_tiled_fmha_grouped_backward.h"
+
+template void run_grouped_backward_causalmask_bias_dropout_dispatch<
+    ck_tile::fp16_t,
+    true,
+    false,
+    false,
+    true,
+    256>(GroupedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_no_bias_no_biasgrad_no_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_no_bias_no_biasgrad_no_dropout_maxk_256.cpp
new file mode 100644
index 000000000..f77bf801b
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_no_bias_no_biasgrad_no_dropout_maxk_256.cpp
@@ -0,0 +1,20 @@
+
+/*
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
+ */
+
+#include <ck_tile/core/numeric/half.hpp>
+#include "ck_tiled_fmha_grouped_backward.h"
+
+template void run_grouped_backward_causalmask_bias_dropout_dispatch<
+    ck_tile::fp16_t,
+    true,
+    false,
+    false,
+    false,
+    256>(GroupedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_has_bias_has_biasgrad_has_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_has_bias_has_biasgrad_has_dropout_maxk_256.cpp
new file mode 100644
index 000000000..b9eb3e927
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_has_bias_has_biasgrad_has_dropout_maxk_256.cpp
@@ -0,0 +1,20 @@
+
+/*
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
+ */
+
+#include <ck_tile/core/numeric/half.hpp>
+#include "ck_tiled_fmha_grouped_backward.h"
+
+template void run_grouped_backward_causalmask_bias_dropout_dispatch<
+    ck_tile::fp16_t,
+    false,
+    true,
+    true,
+    true,
+    256>(GroupedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_has_bias_has_biasgrad_no_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_has_bias_has_biasgrad_no_dropout_maxk_256.cpp
new file mode 100644
index 000000000..5620850df
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_has_bias_has_biasgrad_no_dropout_maxk_256.cpp
@@ -0,0 +1,20 @@
+
+/*
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
+ */
+
+#include <ck_tile/core/numeric/half.hpp>
+#include "ck_tiled_fmha_grouped_backward.h"
+
+template void run_grouped_backward_causalmask_bias_dropout_dispatch<
+    ck_tile::fp16_t,
+    false,
+    true,
+    true,
+    false,
+    256>(GroupedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_has_bias_no_biasgrad_has_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_has_bias_no_biasgrad_has_dropout_maxk_256.cpp
new file mode 100644
index 000000000..d21c1beeb
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_has_bias_no_biasgrad_has_dropout_maxk_256.cpp
@@ -0,0 +1,20 @@
+
+/*
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
+ */
+
+#include <ck_tile/core/numeric/half.hpp>
+#include "ck_tiled_fmha_grouped_backward.h"
+
+template void run_grouped_backward_causalmask_bias_dropout_dispatch<
+    ck_tile::fp16_t,
+    false,
+    true,
+    false,
+    true,
+    256>(GroupedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_has_bias_no_biasgrad_no_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_has_bias_no_biasgrad_no_dropout_maxk_256.cpp
new file mode 100644
index 000000000..577345d8e
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_has_bias_no_biasgrad_no_dropout_maxk_256.cpp
@@ -0,0 +1,20 @@
+
+/*
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
+ */
+
+#include <ck_tile/core/numeric/half.hpp>
+#include "ck_tiled_fmha_grouped_backward.h"
+
+template void run_grouped_backward_causalmask_bias_dropout_dispatch<
+    ck_tile::fp16_t,
+    false,
+    true,
+    false,
+    false,
+    256>(GroupedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_no_bias_no_biasgrad_has_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_no_bias_no_biasgrad_has_dropout_maxk_256.cpp
new file mode 100644
index 000000000..267270591
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_no_bias_no_biasgrad_has_dropout_maxk_256.cpp
@@ -0,0 +1,20 @@
+
+/*
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
+ */
+
+#include <ck_tile/core/numeric/half.hpp>
+#include "ck_tiled_fmha_grouped_backward.h"
+
+template void run_grouped_backward_causalmask_bias_dropout_dispatch<
+    ck_tile::fp16_t,
+    false,
+    false,
+    false,
+    true,
+    256>(GroupedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_no_bias_no_biasgrad_no_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_no_bias_no_biasgrad_no_dropout_maxk_256.cpp
new file mode 100644
index 000000000..e2f0e69e2
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_no_bias_no_biasgrad_no_dropout_maxk_256.cpp
@@ -0,0 +1,20 @@
+
+/*
+  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
+ */
+
+#include <ck_tile/core/numeric/half.hpp>
+#include "ck_tiled_fmha_grouped_backward.h"
+
+template void run_grouped_backward_causalmask_bias_dropout_dispatch<
+    ck_tile::fp16_t,
+    false,
+    false,
+    false,
+    false,
+    256>(GroupedBackwardParams& param, hipStream_t stream);

From 4a7b7dc97babe923a2710a849e3bd5b76fee03b5 Mon Sep 17 00:00:00 2001
From: Qianfeng Zhang <Qianfeng.Zhang@amd.com>
Date: Wed, 7 Aug 2024 16:55:10 +0000
Subject: [PATCH 604/641] Add leaked headdim256 instance references

---
 .../ck_tiled_fmha_batched_backward_bf16.cpp         | 13 +++++++++++++
 .../ck_tiled_fmha_batched_backward_fp16.cpp         | 13 +++++++++++++
 .../ck_tiled_fmha_grouped_backward_bf16.cpp         | 13 +++++++++++++
 .../ck_tiled_fmha_grouped_backward_fp16.cpp         | 13 +++++++++++++
 4 files changed, 52 insertions(+)

diff --git a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_backward_bf16.cpp b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_backward_bf16.cpp
index 1215498e9..fdec15de2 100644
--- a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_backward_bf16.cpp
+++ b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_backward_bf16.cpp
@@ -90,6 +90,19 @@ extern template void run_batched_backward_causalmask_bias_dropout_dispatch<ck_ti
 extern template void run_batched_backward_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, true, false, false, false, 128>(
     BatchedBackwardParams& param, hipStream_t stream);
 
+extern template void run_batched_backward_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, false, true, true, true, 256>(
+    BatchedBackwardParams& param, hipStream_t stream);
+extern template void run_batched_backward_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, false, true, false, true, 256>(
+    BatchedBackwardParams& param, hipStream_t stream);
+extern template void run_batched_backward_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, false, false, false, true, 256>(
+    BatchedBackwardParams& param, hipStream_t stream);
+extern template void run_batched_backward_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, true, true, true, true, 256>(
+    BatchedBackwardParams& param, hipStream_t stream);
+extern template void run_batched_backward_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, true, true, false, true, 256>(
+    BatchedBackwardParams& param, hipStream_t stream);
+extern template void run_batched_backward_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, true, false, false, true, 256>(
+    BatchedBackwardParams& param, hipStream_t stream);
+
 extern template void run_batched_backward_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, false, true, true, false, 256>(
     BatchedBackwardParams& param, hipStream_t stream);
 extern template void run_batched_backward_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, false, true, false, false, 256>(
diff --git a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_backward_fp16.cpp b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_backward_fp16.cpp
index e1f442c2f..e795eb9d3 100644
--- a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_backward_fp16.cpp
+++ b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_backward_fp16.cpp
@@ -90,6 +90,19 @@ extern template void run_batched_backward_causalmask_bias_dropout_dispatch<ck_ti
 extern template void run_batched_backward_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, true, false, false, false, 128>(
     BatchedBackwardParams& param, hipStream_t stream);
 
+extern template void run_batched_backward_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, false, true, true, true, 256>(
+    BatchedBackwardParams& param, hipStream_t stream);
+extern template void run_batched_backward_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, false, true, false, true, 256>(
+    BatchedBackwardParams& param, hipStream_t stream);
+extern template void run_batched_backward_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, false, false, false, true, 256>(
+    BatchedBackwardParams& param, hipStream_t stream);
+extern template void run_batched_backward_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, true, true, true, true, 256>(
+    BatchedBackwardParams& param, hipStream_t stream);
+extern template void run_batched_backward_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, true, true, false, true, 256>(
+    BatchedBackwardParams& param, hipStream_t stream);
+extern template void run_batched_backward_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, true, false, false, true, 256>(
+    BatchedBackwardParams& param, hipStream_t stream);
+
 extern template void run_batched_backward_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, false, true, true, false, 256>(
     BatchedBackwardParams& param, hipStream_t stream);
 extern template void run_batched_backward_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, false, true, false, false, 256>(
diff --git a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_backward_bf16.cpp b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_backward_bf16.cpp
index 2f04ca0b2..4250bba47 100644
--- a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_backward_bf16.cpp
+++ b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_backward_bf16.cpp
@@ -90,6 +90,19 @@ extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<ck_ti
 extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, true, false, false, false, 128>(
     GroupedBackwardParams& param, hipStream_t stream);
 
+extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, false, true, true, true, 256>(
+    GroupedBackwardParams& param, hipStream_t stream);
+extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, false, true, false, true, 256>(
+    GroupedBackwardParams& param, hipStream_t stream);
+extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, false, false, false, true, 256>(
+    GroupedBackwardParams& param, hipStream_t stream);
+extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, true, true, true, true, 256>(
+    GroupedBackwardParams& param, hipStream_t stream);
+extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, true, true, false, true, 256>(
+    GroupedBackwardParams& param, hipStream_t stream);
+extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, true, false, false, true, 256>(
+    GroupedBackwardParams& param, hipStream_t stream);
+
 extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, false, true, true, false, 256>(
     GroupedBackwardParams& param, hipStream_t stream);
 extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, false, true, false, false, 256>(
diff --git a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_backward_fp16.cpp b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_backward_fp16.cpp
index 8d97bc180..baca24387 100644
--- a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_backward_fp16.cpp
+++ b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_backward_fp16.cpp
@@ -90,6 +90,19 @@ extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<ck_ti
 extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, true, false, false, false, 128>(
     GroupedBackwardParams& param, hipStream_t stream);
 
+extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, false, true, true, true, 256>(
+    GroupedBackwardParams& param, hipStream_t stream);
+extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, false, true, false, true, 256>(
+    GroupedBackwardParams& param, hipStream_t stream);
+extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, false, false, false, true, 256>(
+    GroupedBackwardParams& param, hipStream_t stream);
+extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, true, true, true, true, 256>(
+    GroupedBackwardParams& param, hipStream_t stream);
+extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, true, true, false, true, 256>(
+    GroupedBackwardParams& param, hipStream_t stream);
+extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, true, false, false, true, 256>(
+    GroupedBackwardParams& param, hipStream_t stream);
+
 extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, false, true, true, false, 256>(
     GroupedBackwardParams& param, hipStream_t stream);
 extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, false, true, false, false, 256>(

From 1ad9cbeeaa277980ecd312c534bbdd8e0e545af3 Mon Sep 17 00:00:00 2001
From: Qianfeng Zhang <Qianfeng.Zhang@amd.com>
Date: Wed, 7 Aug 2024 18:03:11 +0000
Subject: [PATCH 605/641] Change to generate.py and the re-generate the
 instance files using it

---
 .../attention/hip_fmha/generate_instances.py  | 48 +++++++++++++------
 ...bias_has_biasgrad_has_dropout_maxk_128.cpp |  3 +-
 ...bias_has_biasgrad_has_dropout_maxk_256.cpp |  3 +-
 ..._bias_has_biasgrad_has_dropout_maxk_32.cpp |  3 +-
 ..._bias_has_biasgrad_has_dropout_maxk_64.cpp |  3 +-
 ..._bias_has_biasgrad_no_dropout_maxk_128.cpp |  3 +-
 ..._bias_has_biasgrad_no_dropout_maxk_256.cpp |  3 +-
 ...s_bias_has_biasgrad_no_dropout_maxk_32.cpp |  3 +-
 ...s_bias_has_biasgrad_no_dropout_maxk_64.cpp |  3 +-
 ..._bias_no_biasgrad_has_dropout_maxk_128.cpp |  3 +-
 ..._bias_no_biasgrad_has_dropout_maxk_256.cpp |  3 +-
 ...s_bias_no_biasgrad_has_dropout_maxk_32.cpp |  3 +-
 ...s_bias_no_biasgrad_has_dropout_maxk_64.cpp |  3 +-
 ...s_bias_no_biasgrad_no_dropout_maxk_128.cpp |  3 +-
 ...s_bias_no_biasgrad_no_dropout_maxk_256.cpp |  3 +-
 ...as_bias_no_biasgrad_no_dropout_maxk_32.cpp |  3 +-
 ...as_bias_no_biasgrad_no_dropout_maxk_64.cpp |  3 +-
 ..._bias_no_biasgrad_has_dropout_maxk_128.cpp |  3 +-
 ..._bias_no_biasgrad_has_dropout_maxk_256.cpp |  3 +-
 ...o_bias_no_biasgrad_has_dropout_maxk_32.cpp |  3 +-
 ...o_bias_no_biasgrad_has_dropout_maxk_64.cpp |  3 +-
 ...o_bias_no_biasgrad_no_dropout_maxk_128.cpp |  3 +-
 ...o_bias_no_biasgrad_no_dropout_maxk_256.cpp |  3 +-
 ...no_bias_no_biasgrad_no_dropout_maxk_32.cpp |  3 +-
 ...no_bias_no_biasgrad_no_dropout_maxk_64.cpp |  3 +-
 ...bias_has_biasgrad_has_dropout_maxk_128.cpp |  3 +-
 ...bias_has_biasgrad_has_dropout_maxk_256.cpp |  3 +-
 ..._bias_has_biasgrad_has_dropout_maxk_32.cpp |  3 +-
 ..._bias_has_biasgrad_has_dropout_maxk_64.cpp |  3 +-
 ..._bias_has_biasgrad_no_dropout_maxk_128.cpp |  3 +-
 ..._bias_has_biasgrad_no_dropout_maxk_256.cpp |  3 +-
 ...s_bias_has_biasgrad_no_dropout_maxk_32.cpp |  3 +-
 ...s_bias_has_biasgrad_no_dropout_maxk_64.cpp |  3 +-
 ..._bias_no_biasgrad_has_dropout_maxk_128.cpp |  3 +-
 ..._bias_no_biasgrad_has_dropout_maxk_256.cpp |  3 +-
 ...s_bias_no_biasgrad_has_dropout_maxk_32.cpp |  3 +-
 ...s_bias_no_biasgrad_has_dropout_maxk_64.cpp |  3 +-
 ...s_bias_no_biasgrad_no_dropout_maxk_128.cpp |  3 +-
 ...s_bias_no_biasgrad_no_dropout_maxk_256.cpp |  3 +-
 ...as_bias_no_biasgrad_no_dropout_maxk_32.cpp |  3 +-
 ...as_bias_no_biasgrad_no_dropout_maxk_64.cpp |  3 +-
 ..._bias_no_biasgrad_has_dropout_maxk_128.cpp |  3 +-
 ..._bias_no_biasgrad_has_dropout_maxk_256.cpp |  3 +-
 ...o_bias_no_biasgrad_has_dropout_maxk_32.cpp |  3 +-
 ...o_bias_no_biasgrad_has_dropout_maxk_64.cpp |  3 +-
 ...o_bias_no_biasgrad_no_dropout_maxk_128.cpp |  3 +-
 ...o_bias_no_biasgrad_no_dropout_maxk_256.cpp |  3 +-
 ...no_bias_no_biasgrad_no_dropout_maxk_32.cpp |  3 +-
 ...no_bias_no_biasgrad_no_dropout_maxk_64.cpp |  3 +-
 ...bias_has_biasgrad_has_dropout_maxk_128.cpp |  3 +-
 ...bias_has_biasgrad_has_dropout_maxk_256.cpp |  3 +-
 ..._bias_has_biasgrad_has_dropout_maxk_32.cpp |  3 +-
 ..._bias_has_biasgrad_has_dropout_maxk_64.cpp |  3 +-
 ..._bias_has_biasgrad_no_dropout_maxk_128.cpp |  3 +-
 ..._bias_has_biasgrad_no_dropout_maxk_256.cpp |  3 +-
 ...s_bias_has_biasgrad_no_dropout_maxk_32.cpp |  3 +-
 ...s_bias_has_biasgrad_no_dropout_maxk_64.cpp |  3 +-
 ..._bias_no_biasgrad_has_dropout_maxk_128.cpp |  3 +-
 ..._bias_no_biasgrad_has_dropout_maxk_256.cpp |  3 +-
 ...s_bias_no_biasgrad_has_dropout_maxk_32.cpp |  3 +-
 ...s_bias_no_biasgrad_has_dropout_maxk_64.cpp |  3 +-
 ...s_bias_no_biasgrad_no_dropout_maxk_128.cpp |  3 +-
 ...s_bias_no_biasgrad_no_dropout_maxk_256.cpp |  3 +-
 ...as_bias_no_biasgrad_no_dropout_maxk_32.cpp |  3 +-
 ...as_bias_no_biasgrad_no_dropout_maxk_64.cpp |  3 +-
 ..._bias_no_biasgrad_has_dropout_maxk_128.cpp |  3 +-
 ..._bias_no_biasgrad_has_dropout_maxk_256.cpp |  3 +-
 ...o_bias_no_biasgrad_has_dropout_maxk_32.cpp |  3 +-
 ...o_bias_no_biasgrad_has_dropout_maxk_64.cpp |  3 +-
 ...o_bias_no_biasgrad_no_dropout_maxk_128.cpp |  3 +-
 ...o_bias_no_biasgrad_no_dropout_maxk_256.cpp |  3 +-
 ...no_bias_no_biasgrad_no_dropout_maxk_32.cpp |  3 +-
 ...no_bias_no_biasgrad_no_dropout_maxk_64.cpp |  3 +-
 ...bias_has_biasgrad_has_dropout_maxk_128.cpp |  3 +-
 ...bias_has_biasgrad_has_dropout_maxk_256.cpp |  3 +-
 ..._bias_has_biasgrad_has_dropout_maxk_32.cpp |  3 +-
 ..._bias_has_biasgrad_has_dropout_maxk_64.cpp |  3 +-
 ..._bias_has_biasgrad_no_dropout_maxk_128.cpp |  3 +-
 ..._bias_has_biasgrad_no_dropout_maxk_256.cpp |  3 +-
 ...s_bias_has_biasgrad_no_dropout_maxk_32.cpp |  3 +-
 ...s_bias_has_biasgrad_no_dropout_maxk_64.cpp |  3 +-
 ..._bias_no_biasgrad_has_dropout_maxk_128.cpp |  3 +-
 ..._bias_no_biasgrad_has_dropout_maxk_256.cpp |  3 +-
 ...s_bias_no_biasgrad_has_dropout_maxk_32.cpp |  3 +-
 ...s_bias_no_biasgrad_has_dropout_maxk_64.cpp |  3 +-
 ...s_bias_no_biasgrad_no_dropout_maxk_128.cpp |  3 +-
 ...s_bias_no_biasgrad_no_dropout_maxk_256.cpp |  3 +-
 ...as_bias_no_biasgrad_no_dropout_maxk_32.cpp |  3 +-
 ...as_bias_no_biasgrad_no_dropout_maxk_64.cpp |  3 +-
 ..._bias_no_biasgrad_has_dropout_maxk_128.cpp |  3 +-
 ..._bias_no_biasgrad_has_dropout_maxk_256.cpp |  3 +-
 ...o_bias_no_biasgrad_has_dropout_maxk_32.cpp |  3 +-
 ...o_bias_no_biasgrad_has_dropout_maxk_64.cpp |  3 +-
 ...o_bias_no_biasgrad_no_dropout_maxk_128.cpp |  3 +-
 ...o_bias_no_biasgrad_no_dropout_maxk_256.cpp |  3 +-
 ...no_bias_no_biasgrad_no_dropout_maxk_32.cpp |  3 +-
 ...no_bias_no_biasgrad_no_dropout_maxk_64.cpp |  3 +-
 ...usalmask_has_bias_has_dropout_maxk_128.cpp |  3 +-
 ...usalmask_has_bias_has_dropout_maxk_256.cpp |  3 +-
 ...ausalmask_has_bias_has_dropout_maxk_32.cpp |  3 +-
 ...ausalmask_has_bias_has_dropout_maxk_64.cpp |  3 +-
 ...ausalmask_has_bias_no_dropout_maxk_128.cpp |  3 +-
 ...ausalmask_has_bias_no_dropout_maxk_256.cpp |  3 +-
 ...causalmask_has_bias_no_dropout_maxk_32.cpp |  3 +-
 ...causalmask_has_bias_no_dropout_maxk_64.cpp |  3 +-
 ...ausalmask_no_bias_has_dropout_maxk_128.cpp |  3 +-
 ...ausalmask_no_bias_has_dropout_maxk_256.cpp |  3 +-
 ...causalmask_no_bias_has_dropout_maxk_32.cpp |  3 +-
 ...causalmask_no_bias_has_dropout_maxk_64.cpp |  3 +-
 ...causalmask_no_bias_no_dropout_maxk_128.cpp |  3 +-
 ...causalmask_no_bias_no_dropout_maxk_256.cpp |  3 +-
 ..._causalmask_no_bias_no_dropout_maxk_32.cpp |  3 +-
 ..._causalmask_no_bias_no_dropout_maxk_64.cpp |  3 +-
 ...usalmask_has_bias_has_dropout_maxk_128.cpp |  3 +-
 ...usalmask_has_bias_has_dropout_maxk_256.cpp |  3 +-
 ...ausalmask_has_bias_has_dropout_maxk_32.cpp |  3 +-
 ...ausalmask_has_bias_has_dropout_maxk_64.cpp |  3 +-
 ...ausalmask_has_bias_no_dropout_maxk_128.cpp |  3 +-
 ...ausalmask_has_bias_no_dropout_maxk_256.cpp |  3 +-
 ...causalmask_has_bias_no_dropout_maxk_32.cpp |  3 +-
 ...causalmask_has_bias_no_dropout_maxk_64.cpp |  3 +-
 ...ausalmask_no_bias_has_dropout_maxk_128.cpp |  3 +-
 ...ausalmask_no_bias_has_dropout_maxk_256.cpp |  3 +-
 ...causalmask_no_bias_has_dropout_maxk_32.cpp |  3 +-
 ...causalmask_no_bias_has_dropout_maxk_64.cpp |  3 +-
 ...causalmask_no_bias_no_dropout_maxk_128.cpp |  3 +-
 ...causalmask_no_bias_no_dropout_maxk_256.cpp |  3 +-
 ..._causalmask_no_bias_no_dropout_maxk_32.cpp |  3 +-
 ..._causalmask_no_bias_no_dropout_maxk_64.cpp |  3 +-
 ...usalmask_has_bias_has_dropout_maxk_128.cpp |  3 +-
 ...usalmask_has_bias_has_dropout_maxk_256.cpp |  3 +-
 ...ausalmask_has_bias_has_dropout_maxk_32.cpp |  3 +-
 ...ausalmask_has_bias_has_dropout_maxk_64.cpp |  3 +-
 ...ausalmask_has_bias_no_dropout_maxk_128.cpp |  3 +-
 ...ausalmask_has_bias_no_dropout_maxk_256.cpp |  3 +-
 ...causalmask_has_bias_no_dropout_maxk_32.cpp |  3 +-
 ...causalmask_has_bias_no_dropout_maxk_64.cpp |  3 +-
 ...ausalmask_no_bias_has_dropout_maxk_128.cpp |  3 +-
 ...ausalmask_no_bias_has_dropout_maxk_256.cpp |  3 +-
 ...causalmask_no_bias_has_dropout_maxk_32.cpp |  3 +-
 ...causalmask_no_bias_has_dropout_maxk_64.cpp |  3 +-
 ...causalmask_no_bias_no_dropout_maxk_128.cpp |  3 +-
 ...causalmask_no_bias_no_dropout_maxk_256.cpp |  3 +-
 ..._causalmask_no_bias_no_dropout_maxk_32.cpp |  3 +-
 ..._causalmask_no_bias_no_dropout_maxk_64.cpp |  3 +-
 ...usalmask_has_bias_has_dropout_maxk_128.cpp |  3 +-
 ...usalmask_has_bias_has_dropout_maxk_256.cpp |  3 +-
 ...ausalmask_has_bias_has_dropout_maxk_32.cpp |  3 +-
 ...ausalmask_has_bias_has_dropout_maxk_64.cpp |  3 +-
 ...ausalmask_has_bias_no_dropout_maxk_128.cpp |  3 +-
 ...ausalmask_has_bias_no_dropout_maxk_256.cpp |  3 +-
 ...causalmask_has_bias_no_dropout_maxk_32.cpp |  3 +-
 ...causalmask_has_bias_no_dropout_maxk_64.cpp |  3 +-
 ...ausalmask_no_bias_has_dropout_maxk_128.cpp |  3 +-
 ...ausalmask_no_bias_has_dropout_maxk_256.cpp |  3 +-
 ...causalmask_no_bias_has_dropout_maxk_32.cpp |  3 +-
 ...causalmask_no_bias_has_dropout_maxk_64.cpp |  3 +-
 ...causalmask_no_bias_no_dropout_maxk_128.cpp |  3 +-
 ...causalmask_no_bias_no_dropout_maxk_256.cpp |  3 +-
 ..._causalmask_no_bias_no_dropout_maxk_32.cpp |  3 +-
 ..._causalmask_no_bias_no_dropout_maxk_64.cpp |  3 +-
 ...usalmask_has_bias_has_dropout_maxk_128.cpp |  3 +-
 ...usalmask_has_bias_has_dropout_maxk_256.cpp |  3 +-
 ...ausalmask_has_bias_has_dropout_maxk_32.cpp |  3 +-
 ...ausalmask_has_bias_has_dropout_maxk_64.cpp |  3 +-
 ...ausalmask_has_bias_no_dropout_maxk_128.cpp |  3 +-
 ...ausalmask_has_bias_no_dropout_maxk_256.cpp |  3 +-
 ...causalmask_has_bias_no_dropout_maxk_32.cpp |  3 +-
 ...causalmask_has_bias_no_dropout_maxk_64.cpp |  3 +-
 ...ausalmask_no_bias_has_dropout_maxk_128.cpp |  3 +-
 ...ausalmask_no_bias_has_dropout_maxk_256.cpp |  3 +-
 ...causalmask_no_bias_has_dropout_maxk_32.cpp |  3 +-
 ...causalmask_no_bias_has_dropout_maxk_64.cpp |  3 +-
 ...causalmask_no_bias_no_dropout_maxk_128.cpp |  3 +-
 ...causalmask_no_bias_no_dropout_maxk_256.cpp |  3 +-
 ..._causalmask_no_bias_no_dropout_maxk_32.cpp |  3 +-
 ..._causalmask_no_bias_no_dropout_maxk_64.cpp |  3 +-
 ...usalmask_has_bias_has_dropout_maxk_128.cpp |  3 +-
 ...usalmask_has_bias_has_dropout_maxk_256.cpp |  3 +-
 ...ausalmask_has_bias_has_dropout_maxk_32.cpp |  3 +-
 ...ausalmask_has_bias_has_dropout_maxk_64.cpp |  3 +-
 ...ausalmask_has_bias_no_dropout_maxk_128.cpp |  3 +-
 ...ausalmask_has_bias_no_dropout_maxk_256.cpp |  3 +-
 ...causalmask_has_bias_no_dropout_maxk_32.cpp |  3 +-
 ...causalmask_has_bias_no_dropout_maxk_64.cpp |  3 +-
 ...ausalmask_no_bias_has_dropout_maxk_128.cpp |  3 +-
 ...ausalmask_no_bias_has_dropout_maxk_256.cpp |  3 +-
 ...causalmask_no_bias_has_dropout_maxk_32.cpp |  3 +-
 ...causalmask_no_bias_has_dropout_maxk_64.cpp |  3 +-
 ...causalmask_no_bias_no_dropout_maxk_128.cpp |  3 +-
 ...causalmask_no_bias_no_dropout_maxk_256.cpp |  3 +-
 ..._causalmask_no_bias_no_dropout_maxk_32.cpp |  3 +-
 ..._causalmask_no_bias_no_dropout_maxk_64.cpp |  3 +-
 ...usalmask_has_bias_has_dropout_maxk_128.cpp |  3 +-
 ...usalmask_has_bias_has_dropout_maxk_256.cpp |  3 +-
 ...ausalmask_has_bias_has_dropout_maxk_32.cpp |  3 +-
 ...ausalmask_has_bias_has_dropout_maxk_64.cpp |  3 +-
 ...ausalmask_has_bias_no_dropout_maxk_128.cpp |  3 +-
 ...ausalmask_has_bias_no_dropout_maxk_256.cpp |  3 +-
 ...causalmask_has_bias_no_dropout_maxk_32.cpp |  3 +-
 ...causalmask_has_bias_no_dropout_maxk_64.cpp |  3 +-
 ...ausalmask_no_bias_has_dropout_maxk_128.cpp |  3 +-
 ...ausalmask_no_bias_has_dropout_maxk_256.cpp |  3 +-
 ...causalmask_no_bias_has_dropout_maxk_32.cpp |  3 +-
 ...causalmask_no_bias_has_dropout_maxk_64.cpp |  3 +-
 ...causalmask_no_bias_no_dropout_maxk_128.cpp |  3 +-
 ...causalmask_no_bias_no_dropout_maxk_256.cpp |  3 +-
 ..._causalmask_no_bias_no_dropout_maxk_32.cpp |  3 +-
 ..._causalmask_no_bias_no_dropout_maxk_64.cpp |  3 +-
 ...usalmask_has_bias_has_dropout_maxk_128.cpp |  3 +-
 ...usalmask_has_bias_has_dropout_maxk_256.cpp |  3 +-
 ...ausalmask_has_bias_has_dropout_maxk_32.cpp |  3 +-
 ...ausalmask_has_bias_has_dropout_maxk_64.cpp |  3 +-
 ...ausalmask_has_bias_no_dropout_maxk_128.cpp |  3 +-
 ...ausalmask_has_bias_no_dropout_maxk_256.cpp |  3 +-
 ...causalmask_has_bias_no_dropout_maxk_32.cpp |  3 +-
 ...causalmask_has_bias_no_dropout_maxk_64.cpp |  3 +-
 ...ausalmask_no_bias_has_dropout_maxk_128.cpp |  3 +-
 ...ausalmask_no_bias_has_dropout_maxk_256.cpp |  3 +-
 ...causalmask_no_bias_has_dropout_maxk_32.cpp |  3 +-
 ...causalmask_no_bias_has_dropout_maxk_64.cpp |  3 +-
 ...causalmask_no_bias_no_dropout_maxk_128.cpp |  3 +-
 ...causalmask_no_bias_no_dropout_maxk_256.cpp |  3 +-
 ..._causalmask_no_bias_no_dropout_maxk_32.cpp |  3 +-
 ..._causalmask_no_bias_no_dropout_maxk_64.cpp |  3 +-
 ...bias_has_biasgrad_has_dropout_maxk_128.cpp |  3 +-
 ...bias_has_biasgrad_has_dropout_maxk_256.cpp |  3 +-
 ..._bias_has_biasgrad_has_dropout_maxk_32.cpp |  3 +-
 ..._bias_has_biasgrad_has_dropout_maxk_64.cpp |  3 +-
 ..._bias_has_biasgrad_no_dropout_maxk_128.cpp |  3 +-
 ..._bias_has_biasgrad_no_dropout_maxk_256.cpp |  3 +-
 ...s_bias_has_biasgrad_no_dropout_maxk_32.cpp |  3 +-
 ...s_bias_has_biasgrad_no_dropout_maxk_64.cpp |  3 +-
 ..._bias_no_biasgrad_has_dropout_maxk_128.cpp |  3 +-
 ..._bias_no_biasgrad_has_dropout_maxk_256.cpp |  3 +-
 ...s_bias_no_biasgrad_has_dropout_maxk_32.cpp |  3 +-
 ...s_bias_no_biasgrad_has_dropout_maxk_64.cpp |  3 +-
 ...s_bias_no_biasgrad_no_dropout_maxk_128.cpp |  3 +-
 ...s_bias_no_biasgrad_no_dropout_maxk_256.cpp |  3 +-
 ...as_bias_no_biasgrad_no_dropout_maxk_32.cpp |  3 +-
 ...as_bias_no_biasgrad_no_dropout_maxk_64.cpp |  3 +-
 ..._bias_no_biasgrad_has_dropout_maxk_128.cpp |  3 +-
 ..._bias_no_biasgrad_has_dropout_maxk_256.cpp |  3 +-
 ...o_bias_no_biasgrad_has_dropout_maxk_32.cpp |  3 +-
 ...o_bias_no_biasgrad_has_dropout_maxk_64.cpp |  3 +-
 ...o_bias_no_biasgrad_no_dropout_maxk_128.cpp |  3 +-
 ...o_bias_no_biasgrad_no_dropout_maxk_256.cpp |  3 +-
 ...no_bias_no_biasgrad_no_dropout_maxk_32.cpp |  3 +-
 ...no_bias_no_biasgrad_no_dropout_maxk_64.cpp |  3 +-
 ...bias_has_biasgrad_has_dropout_maxk_128.cpp |  3 +-
 ...bias_has_biasgrad_has_dropout_maxk_256.cpp |  3 +-
 ..._bias_has_biasgrad_has_dropout_maxk_32.cpp |  3 +-
 ..._bias_has_biasgrad_has_dropout_maxk_64.cpp |  3 +-
 ..._bias_has_biasgrad_no_dropout_maxk_128.cpp |  3 +-
 ..._bias_has_biasgrad_no_dropout_maxk_256.cpp |  3 +-
 ...s_bias_has_biasgrad_no_dropout_maxk_32.cpp |  3 +-
 ...s_bias_has_biasgrad_no_dropout_maxk_64.cpp |  3 +-
 ..._bias_no_biasgrad_has_dropout_maxk_128.cpp |  3 +-
 ..._bias_no_biasgrad_has_dropout_maxk_256.cpp |  3 +-
 ...s_bias_no_biasgrad_has_dropout_maxk_32.cpp |  3 +-
 ...s_bias_no_biasgrad_has_dropout_maxk_64.cpp |  3 +-
 ...s_bias_no_biasgrad_no_dropout_maxk_128.cpp |  3 +-
 ...s_bias_no_biasgrad_no_dropout_maxk_256.cpp |  3 +-
 ...as_bias_no_biasgrad_no_dropout_maxk_32.cpp |  3 +-
 ...as_bias_no_biasgrad_no_dropout_maxk_64.cpp |  3 +-
 ..._bias_no_biasgrad_has_dropout_maxk_128.cpp |  3 +-
 ..._bias_no_biasgrad_has_dropout_maxk_256.cpp |  3 +-
 ...o_bias_no_biasgrad_has_dropout_maxk_32.cpp |  3 +-
 ...o_bias_no_biasgrad_has_dropout_maxk_64.cpp |  3 +-
 ...o_bias_no_biasgrad_no_dropout_maxk_128.cpp |  3 +-
 ...o_bias_no_biasgrad_no_dropout_maxk_256.cpp |  3 +-
 ...no_bias_no_biasgrad_no_dropout_maxk_32.cpp |  3 +-
 ...no_bias_no_biasgrad_no_dropout_maxk_64.cpp |  3 +-
 ...bias_has_biasgrad_has_dropout_maxk_128.cpp |  3 +-
 ...bias_has_biasgrad_has_dropout_maxk_256.cpp |  3 +-
 ..._bias_has_biasgrad_has_dropout_maxk_32.cpp |  3 +-
 ..._bias_has_biasgrad_has_dropout_maxk_64.cpp |  3 +-
 ..._bias_has_biasgrad_no_dropout_maxk_128.cpp |  3 +-
 ..._bias_has_biasgrad_no_dropout_maxk_256.cpp |  3 +-
 ...s_bias_has_biasgrad_no_dropout_maxk_32.cpp |  3 +-
 ...s_bias_has_biasgrad_no_dropout_maxk_64.cpp |  3 +-
 ..._bias_no_biasgrad_has_dropout_maxk_128.cpp |  3 +-
 ..._bias_no_biasgrad_has_dropout_maxk_256.cpp |  3 +-
 ...s_bias_no_biasgrad_has_dropout_maxk_32.cpp |  3 +-
 ...s_bias_no_biasgrad_has_dropout_maxk_64.cpp |  3 +-
 ...s_bias_no_biasgrad_no_dropout_maxk_128.cpp |  3 +-
 ...s_bias_no_biasgrad_no_dropout_maxk_256.cpp |  3 +-
 ...as_bias_no_biasgrad_no_dropout_maxk_32.cpp |  3 +-
 ...as_bias_no_biasgrad_no_dropout_maxk_64.cpp |  3 +-
 ..._bias_no_biasgrad_has_dropout_maxk_128.cpp |  3 +-
 ..._bias_no_biasgrad_has_dropout_maxk_256.cpp |  3 +-
 ...o_bias_no_biasgrad_has_dropout_maxk_32.cpp |  3 +-
 ...o_bias_no_biasgrad_has_dropout_maxk_64.cpp |  3 +-
 ...o_bias_no_biasgrad_no_dropout_maxk_128.cpp |  3 +-
 ...o_bias_no_biasgrad_no_dropout_maxk_256.cpp |  3 +-
 ...no_bias_no_biasgrad_no_dropout_maxk_32.cpp |  3 +-
 ...no_bias_no_biasgrad_no_dropout_maxk_64.cpp |  3 +-
 ...bias_has_biasgrad_has_dropout_maxk_128.cpp |  3 +-
 ...bias_has_biasgrad_has_dropout_maxk_256.cpp |  3 +-
 ..._bias_has_biasgrad_has_dropout_maxk_32.cpp |  3 +-
 ..._bias_has_biasgrad_has_dropout_maxk_64.cpp |  3 +-
 ..._bias_has_biasgrad_no_dropout_maxk_128.cpp |  3 +-
 ..._bias_has_biasgrad_no_dropout_maxk_256.cpp |  3 +-
 ...s_bias_has_biasgrad_no_dropout_maxk_32.cpp |  3 +-
 ...s_bias_has_biasgrad_no_dropout_maxk_64.cpp |  3 +-
 ..._bias_no_biasgrad_has_dropout_maxk_128.cpp |  3 +-
 ..._bias_no_biasgrad_has_dropout_maxk_256.cpp |  3 +-
 ...s_bias_no_biasgrad_has_dropout_maxk_32.cpp |  3 +-
 ...s_bias_no_biasgrad_has_dropout_maxk_64.cpp |  3 +-
 ...s_bias_no_biasgrad_no_dropout_maxk_128.cpp |  3 +-
 ...s_bias_no_biasgrad_no_dropout_maxk_256.cpp |  3 +-
 ...as_bias_no_biasgrad_no_dropout_maxk_32.cpp |  3 +-
 ...as_bias_no_biasgrad_no_dropout_maxk_64.cpp |  3 +-
 ..._bias_no_biasgrad_has_dropout_maxk_128.cpp |  3 +-
 ..._bias_no_biasgrad_has_dropout_maxk_256.cpp |  3 +-
 ...o_bias_no_biasgrad_has_dropout_maxk_32.cpp |  3 +-
 ...o_bias_no_biasgrad_has_dropout_maxk_64.cpp |  3 +-
 ...o_bias_no_biasgrad_no_dropout_maxk_128.cpp |  3 +-
 ...o_bias_no_biasgrad_no_dropout_maxk_256.cpp |  3 +-
 ...no_bias_no_biasgrad_no_dropout_maxk_32.cpp |  3 +-
 ...no_bias_no_biasgrad_no_dropout_maxk_64.cpp |  3 +-
 ...usalmask_has_bias_has_dropout_maxk_128.cpp |  3 +-
 ...usalmask_has_bias_has_dropout_maxk_256.cpp |  3 +-
 ...ausalmask_has_bias_has_dropout_maxk_32.cpp |  3 +-
 ...ausalmask_has_bias_has_dropout_maxk_64.cpp |  3 +-
 ...ausalmask_has_bias_no_dropout_maxk_128.cpp |  3 +-
 ...ausalmask_has_bias_no_dropout_maxk_256.cpp |  3 +-
 ...causalmask_has_bias_no_dropout_maxk_32.cpp |  3 +-
 ...causalmask_has_bias_no_dropout_maxk_64.cpp |  3 +-
 ...ausalmask_no_bias_has_dropout_maxk_128.cpp |  3 +-
 ...ausalmask_no_bias_has_dropout_maxk_256.cpp |  3 +-
 ...causalmask_no_bias_has_dropout_maxk_32.cpp |  3 +-
 ...causalmask_no_bias_has_dropout_maxk_64.cpp |  3 +-
 ...causalmask_no_bias_no_dropout_maxk_128.cpp |  3 +-
 ...causalmask_no_bias_no_dropout_maxk_256.cpp |  3 +-
 ..._causalmask_no_bias_no_dropout_maxk_32.cpp |  3 +-
 ..._causalmask_no_bias_no_dropout_maxk_64.cpp |  3 +-
 ...usalmask_has_bias_has_dropout_maxk_128.cpp |  3 +-
 ...usalmask_has_bias_has_dropout_maxk_256.cpp |  3 +-
 ...ausalmask_has_bias_has_dropout_maxk_32.cpp |  3 +-
 ...ausalmask_has_bias_has_dropout_maxk_64.cpp |  3 +-
 ...ausalmask_has_bias_no_dropout_maxk_128.cpp |  3 +-
 ...ausalmask_has_bias_no_dropout_maxk_256.cpp |  3 +-
 ...causalmask_has_bias_no_dropout_maxk_32.cpp |  3 +-
 ...causalmask_has_bias_no_dropout_maxk_64.cpp |  3 +-
 ...ausalmask_no_bias_has_dropout_maxk_128.cpp |  3 +-
 ...ausalmask_no_bias_has_dropout_maxk_256.cpp |  3 +-
 ...causalmask_no_bias_has_dropout_maxk_32.cpp |  3 +-
 ...causalmask_no_bias_has_dropout_maxk_64.cpp |  3 +-
 ...causalmask_no_bias_no_dropout_maxk_128.cpp |  3 +-
 ...causalmask_no_bias_no_dropout_maxk_256.cpp |  3 +-
 ..._causalmask_no_bias_no_dropout_maxk_32.cpp |  3 +-
 ..._causalmask_no_bias_no_dropout_maxk_64.cpp |  3 +-
 ...usalmask_has_bias_has_dropout_maxk_128.cpp |  3 +-
 ...usalmask_has_bias_has_dropout_maxk_256.cpp |  3 +-
 ...ausalmask_has_bias_has_dropout_maxk_32.cpp |  3 +-
 ...ausalmask_has_bias_has_dropout_maxk_64.cpp |  3 +-
 ...ausalmask_has_bias_no_dropout_maxk_128.cpp |  3 +-
 ...ausalmask_has_bias_no_dropout_maxk_256.cpp |  3 +-
 ...causalmask_has_bias_no_dropout_maxk_32.cpp |  3 +-
 ...causalmask_has_bias_no_dropout_maxk_64.cpp |  3 +-
 ...ausalmask_no_bias_has_dropout_maxk_128.cpp |  3 +-
 ...ausalmask_no_bias_has_dropout_maxk_256.cpp |  3 +-
 ...causalmask_no_bias_has_dropout_maxk_32.cpp |  3 +-
 ...causalmask_no_bias_has_dropout_maxk_64.cpp |  3 +-
 ...causalmask_no_bias_no_dropout_maxk_128.cpp |  3 +-
 ...causalmask_no_bias_no_dropout_maxk_256.cpp |  3 +-
 ..._causalmask_no_bias_no_dropout_maxk_32.cpp |  3 +-
 ..._causalmask_no_bias_no_dropout_maxk_64.cpp |  3 +-
 ...usalmask_has_bias_has_dropout_maxk_128.cpp |  3 +-
 ...usalmask_has_bias_has_dropout_maxk_256.cpp |  3 +-
 ...ausalmask_has_bias_has_dropout_maxk_32.cpp |  3 +-
 ...ausalmask_has_bias_has_dropout_maxk_64.cpp |  3 +-
 ...ausalmask_has_bias_no_dropout_maxk_128.cpp |  3 +-
 ...ausalmask_has_bias_no_dropout_maxk_256.cpp |  3 +-
 ...causalmask_has_bias_no_dropout_maxk_32.cpp |  3 +-
 ...causalmask_has_bias_no_dropout_maxk_64.cpp |  3 +-
 ...ausalmask_no_bias_has_dropout_maxk_128.cpp |  3 +-
 ...ausalmask_no_bias_has_dropout_maxk_256.cpp |  3 +-
 ...causalmask_no_bias_has_dropout_maxk_32.cpp |  3 +-
 ...causalmask_no_bias_has_dropout_maxk_64.cpp |  3 +-
 ...causalmask_no_bias_no_dropout_maxk_128.cpp |  3 +-
 ...causalmask_no_bias_no_dropout_maxk_256.cpp |  3 +-
 ..._causalmask_no_bias_no_dropout_maxk_32.cpp |  3 +-
 ..._causalmask_no_bias_no_dropout_maxk_64.cpp |  3 +-
 ...usalmask_has_bias_has_dropout_maxk_128.cpp |  3 +-
 ...usalmask_has_bias_has_dropout_maxk_256.cpp |  3 +-
 ...ausalmask_has_bias_has_dropout_maxk_32.cpp |  3 +-
 ...ausalmask_has_bias_has_dropout_maxk_64.cpp |  3 +-
 ...ausalmask_has_bias_no_dropout_maxk_128.cpp |  3 +-
 ...ausalmask_has_bias_no_dropout_maxk_256.cpp |  3 +-
 ...causalmask_has_bias_no_dropout_maxk_32.cpp |  3 +-
 ...causalmask_has_bias_no_dropout_maxk_64.cpp |  3 +-
 ...ausalmask_no_bias_has_dropout_maxk_128.cpp |  3 +-
 ...ausalmask_no_bias_has_dropout_maxk_256.cpp |  3 +-
 ...causalmask_no_bias_has_dropout_maxk_32.cpp |  3 +-
 ...causalmask_no_bias_has_dropout_maxk_64.cpp |  3 +-
 ...causalmask_no_bias_no_dropout_maxk_128.cpp |  3 +-
 ...causalmask_no_bias_no_dropout_maxk_256.cpp |  3 +-
 ..._causalmask_no_bias_no_dropout_maxk_32.cpp |  3 +-
 ..._causalmask_no_bias_no_dropout_maxk_64.cpp |  3 +-
 ...usalmask_has_bias_has_dropout_maxk_128.cpp |  3 +-
 ...usalmask_has_bias_has_dropout_maxk_256.cpp |  3 +-
 ...ausalmask_has_bias_has_dropout_maxk_32.cpp |  3 +-
 ...ausalmask_has_bias_has_dropout_maxk_64.cpp |  3 +-
 ...ausalmask_has_bias_no_dropout_maxk_128.cpp |  3 +-
 ...ausalmask_has_bias_no_dropout_maxk_256.cpp |  3 +-
 ...causalmask_has_bias_no_dropout_maxk_32.cpp |  3 +-
 ...causalmask_has_bias_no_dropout_maxk_64.cpp |  3 +-
 ...ausalmask_no_bias_has_dropout_maxk_128.cpp |  3 +-
 ...ausalmask_no_bias_has_dropout_maxk_256.cpp |  3 +-
 ...causalmask_no_bias_has_dropout_maxk_32.cpp |  3 +-
 ...causalmask_no_bias_has_dropout_maxk_64.cpp |  3 +-
 ...causalmask_no_bias_no_dropout_maxk_128.cpp |  3 +-
 ...causalmask_no_bias_no_dropout_maxk_256.cpp |  3 +-
 ..._causalmask_no_bias_no_dropout_maxk_32.cpp |  3 +-
 ..._causalmask_no_bias_no_dropout_maxk_64.cpp |  3 +-
 ...usalmask_has_bias_has_dropout_maxk_128.cpp |  3 +-
 ...usalmask_has_bias_has_dropout_maxk_256.cpp |  3 +-
 ...ausalmask_has_bias_has_dropout_maxk_32.cpp |  3 +-
 ...ausalmask_has_bias_has_dropout_maxk_64.cpp |  3 +-
 ...ausalmask_has_bias_no_dropout_maxk_128.cpp |  3 +-
 ...ausalmask_has_bias_no_dropout_maxk_256.cpp |  3 +-
 ...causalmask_has_bias_no_dropout_maxk_32.cpp |  3 +-
 ...causalmask_has_bias_no_dropout_maxk_64.cpp |  3 +-
 ...ausalmask_no_bias_has_dropout_maxk_128.cpp |  3 +-
 ...ausalmask_no_bias_has_dropout_maxk_256.cpp |  3 +-
 ...causalmask_no_bias_has_dropout_maxk_32.cpp |  3 +-
 ...causalmask_no_bias_has_dropout_maxk_64.cpp |  3 +-
 ...causalmask_no_bias_no_dropout_maxk_128.cpp |  3 +-
 ...causalmask_no_bias_no_dropout_maxk_256.cpp |  3 +-
 ..._causalmask_no_bias_no_dropout_maxk_32.cpp |  3 +-
 ..._causalmask_no_bias_no_dropout_maxk_64.cpp |  3 +-
 ...usalmask_has_bias_has_dropout_maxk_128.cpp |  3 +-
 ...usalmask_has_bias_has_dropout_maxk_256.cpp |  3 +-
 ...ausalmask_has_bias_has_dropout_maxk_32.cpp |  3 +-
 ...ausalmask_has_bias_has_dropout_maxk_64.cpp |  3 +-
 ...ausalmask_has_bias_no_dropout_maxk_128.cpp |  3 +-
 ...ausalmask_has_bias_no_dropout_maxk_256.cpp |  3 +-
 ...causalmask_has_bias_no_dropout_maxk_32.cpp |  3 +-
 ...causalmask_has_bias_no_dropout_maxk_64.cpp |  3 +-
 ...ausalmask_no_bias_has_dropout_maxk_128.cpp |  3 +-
 ...ausalmask_no_bias_has_dropout_maxk_256.cpp |  3 +-
 ...causalmask_no_bias_has_dropout_maxk_32.cpp |  3 +-
 ...causalmask_no_bias_has_dropout_maxk_64.cpp |  3 +-
 ...causalmask_no_bias_no_dropout_maxk_128.cpp |  3 +-
 ...causalmask_no_bias_no_dropout_maxk_256.cpp |  3 +-
 ..._causalmask_no_bias_no_dropout_maxk_32.cpp |  3 +-
 ..._causalmask_no_bias_no_dropout_maxk_64.cpp |  3 +-
 449 files changed, 930 insertions(+), 462 deletions(-)

diff --git a/xformers/csrc/attention/hip_fmha/generate_instances.py b/xformers/csrc/attention/hip_fmha/generate_instances.py
index 1a5033f97..0975520ef 100644
--- a/xformers/csrc/attention/hip_fmha/generate_instances.py
+++ b/xformers/csrc/attention/hip_fmha/generate_instances.py
@@ -8,9 +8,9 @@
 import os
 from pathlib import Path
 
-FMHA_INSTANCE_HEADER = """
+FMHA_COPYRIGHT_HEADER = """
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -19,11 +19,13 @@
  */
 """
 
-FMHA_INFER_INSTANCE_TEMPLATE = """
+FMHA_INFER_INSTANCE_TEMPLATE_INC = """
 #include <ck_tile/core/numeric/{dtype_file}.hpp>
 #include \"ck_tiled_fmha_{mode}_infer.h\"
+"""
 
-template void run_{mode}_infer_causalmask_bias_dropout_dispatch<
+FMHA_INFER_INSTANCE_TEMPLATE = """
+{extern}template void run_{mode}_infer_causalmask_bias_dropout_dispatch<
     {dtype},
     {has_causalmask},
     {has_bias},
@@ -34,11 +36,13 @@
 FMHA_INFER_INSTANCE_FNAME = "fmha_{mode}_infer_{dtype_str}_{has_or_no_causalmask_str}_"\
                             "{has_or_no_bias_str}_{has_or_no_dropout_str}_{max_k_str}.cpp"
 
-FMHA_FORWARD_INSTANCE_TEMPLATE = """
+FMHA_FORWARD_INSTANCE_TEMPLATE_INC = """
 #include <ck_tile/core/numeric/{dtype_file}.hpp>
 #include \"ck_tiled_fmha_{mode}_forward.h\"
+"""
 
-template void run_{mode}_forward_causalmask_bias_dropout_dispatch<
+FMHA_FORWARD_INSTANCE_TEMPLATE = """
+{extern}template void run_{mode}_forward_causalmask_bias_dropout_dispatch<
     {dtype},
     {has_causalmask},
     {has_bias},
@@ -49,11 +53,13 @@
 FMHA_FORWARD_INSTANCE_FNAME = "fmha_{mode}_forward_{dtype_str}_{has_or_no_causalmask_str}_"\
                               "{has_or_no_bias_str}_{has_or_no_dropout_str}_{max_k_str}.cpp"
 
-FMHA_BACKWARD_INSTANCE_TEMPLATE = """
+FMHA_BACKWARD_INSTANCE_TEMPLATE_INC = """
 #include <ck_tile/core/numeric/{dtype_file}.hpp>
 #include \"ck_tiled_fmha_{mode}_backward.h\"
+"""
 
-template void run_{mode}_backward_causalmask_bias_dropout_dispatch<
+FMHA_BACKWARD_INSTANCE_TEMPLATE = """
+{extern}template void run_{mode}_backward_causalmask_bias_dropout_dispatch<
     {dtype},
     {has_causalmask},
     {has_bias},
@@ -65,6 +71,8 @@
 FMHA_BACKWARD_INSTANCE_FNAME = "fmha_{mode}_backward_{dtype_str}_{has_or_no_causalmask_str}_"\
                                "{has_or_no_bias_str}_{has_or_no_biasgrad_str}_{has_or_no_dropout_str}_{max_k_str}.cpp"
 
+FMHA_INSTANCE_REF_FNAME = "fmha_{mode}_{function}_{dtype}.hpp"
+
 BOOL_MAP = {
     True : "true",
     False : "false"
@@ -128,9 +136,13 @@ def create_infer_instances(instance_dir: Path) -> None:
                                 has_or_no_dropout_str=BOOL_MAP_DROPOUT[has_dropout],
                                 max_k_str=INT_MAP_MAX_K[max_k],
                             )
-                            infer_instance = FMHA_INFER_INSTANCE_TEMPLATE.format(
+                            infer_instance_inc = FMHA_INFER_INSTANCE_TEMPLATE_INC.format(
                                 mode=mode,
                                 dtype_file=TYPE_FNAME_MAP[dtype],
+                            )
+                            infer_instance = FMHA_INFER_INSTANCE_TEMPLATE.format(
+                                extern="",
+                                mode=mode,
                                 dtype=TYPE_CTYPE_MAP[dtype],
                                 has_causalmask=BOOL_MAP[has_causalmask],
                                 has_bias=BOOL_MAP[has_bias],
@@ -138,7 +150,7 @@ def create_infer_instances(instance_dir: Path) -> None:
                                 max_k=max_k,
                                 cap_mode=MODE_NAME_MAP[mode],
                             )
-                            (instance_dir / fname).write_text(FMHA_INSTANCE_HEADER + infer_instance)
+                            (instance_dir / fname).write_text(FMHA_COPYRIGHT_HEADER + infer_instance_inc + "\n" + infer_instance)
 
 
 def create_forward_instances(instance_dir: Path) -> None:
@@ -156,9 +168,13 @@ def create_forward_instances(instance_dir: Path) -> None:
                                 has_or_no_dropout_str=BOOL_MAP_DROPOUT[has_dropout],
                                 max_k_str=INT_MAP_MAX_K[max_k],
                             )
-                            infer_instance = FMHA_FORWARD_INSTANCE_TEMPLATE.format(
+                            forward_instance_inc = FMHA_FORWARD_INSTANCE_TEMPLATE_INC.format(
                                 mode=mode,
                                 dtype_file=TYPE_FNAME_MAP[dtype],
+                            ) 
+                            forward_instance = FMHA_FORWARD_INSTANCE_TEMPLATE.format(
+                                extern="",
+                                mode=mode,
                                 dtype=TYPE_CTYPE_MAP[dtype],
                                 has_causalmask=BOOL_MAP[has_causalmask],
                                 has_bias=BOOL_MAP[has_bias],
@@ -166,7 +182,7 @@ def create_forward_instances(instance_dir: Path) -> None:
                                 max_k=max_k,
                                 cap_mode=MODE_NAME_MAP[mode],
                             )
-                            (instance_dir / fname).write_text(FMHA_INSTANCE_HEADER + infer_instance)
+                            (instance_dir / fname).write_text(FMHA_COPYRIGHT_HEADER + forward_instance_inc + "\n" + forward_instance)
 
 
 def create_backward_instances(instance_dir: Path) -> None:
@@ -185,9 +201,13 @@ def create_backward_instances(instance_dir: Path) -> None:
                                 has_or_no_dropout_str=BOOL_MAP_DROPOUT[has_dropout],
                                 max_k_str=INT_MAP_MAX_K[max_k],
                             )
-                            infer_instance = FMHA_BACKWARD_INSTANCE_TEMPLATE.format(
+                            backward_instance_inc = FMHA_BACKWARD_INSTANCE_TEMPLATE_INC.format(
                                 mode=mode,
                                 dtype_file=TYPE_FNAME_MAP[dtype],
+                            )
+                            backward_instance = FMHA_BACKWARD_INSTANCE_TEMPLATE.format(
+                                extern="",
+                                mode=mode,
                                 dtype=TYPE_CTYPE_MAP[dtype],
                                 has_causalmask=BOOL_MAP[has_causalmask],
                                 has_bias=BOOL_MAP[has_bias],
@@ -196,7 +216,7 @@ def create_backward_instances(instance_dir: Path) -> None:
                                 max_k=max_k,
                                 cap_mode=MODE_NAME_MAP[mode],
                             )
-                            (instance_dir / fname).write_text(FMHA_INSTANCE_HEADER + infer_instance)
+                            (instance_dir / fname).write_text(FMHA_COPYRIGHT_HEADER + backward_instance_inc + "\n" + backward_instance)
 
 
 if __name__ == "__main__":
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_has_causalmask_has_bias_has_biasgrad_has_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_has_causalmask_has_bias_has_biasgrad_has_dropout_maxk_128.cpp
index 97f209cb6..39232e65d 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_has_causalmask_has_bias_has_biasgrad_has_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_has_causalmask_has_bias_has_biasgrad_has_dropout_maxk_128.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_batched_backward.h"
 
+
 template void run_batched_backward_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_has_causalmask_has_bias_has_biasgrad_has_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_has_causalmask_has_bias_has_biasgrad_has_dropout_maxk_256.cpp
index a92e5f8cb..76157bf99 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_has_causalmask_has_bias_has_biasgrad_has_dropout_maxk_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_has_causalmask_has_bias_has_biasgrad_has_dropout_maxk_256.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_batched_backward.h"
 
+
 template void run_batched_backward_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_has_causalmask_has_bias_has_biasgrad_has_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_has_causalmask_has_bias_has_biasgrad_has_dropout_maxk_32.cpp
index 5c0e89e21..4b774cf68 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_has_causalmask_has_bias_has_biasgrad_has_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_has_causalmask_has_bias_has_biasgrad_has_dropout_maxk_32.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_batched_backward.h"
 
+
 template void run_batched_backward_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_has_causalmask_has_bias_has_biasgrad_has_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_has_causalmask_has_bias_has_biasgrad_has_dropout_maxk_64.cpp
index 5e3392493..c8ba202be 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_has_causalmask_has_bias_has_biasgrad_has_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_has_causalmask_has_bias_has_biasgrad_has_dropout_maxk_64.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_batched_backward.h"
 
+
 template void run_batched_backward_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_has_causalmask_has_bias_has_biasgrad_no_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_has_causalmask_has_bias_has_biasgrad_no_dropout_maxk_128.cpp
index ae9158e21..6742fb592 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_has_causalmask_has_bias_has_biasgrad_no_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_has_causalmask_has_bias_has_biasgrad_no_dropout_maxk_128.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_batched_backward.h"
 
+
 template void run_batched_backward_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_has_causalmask_has_bias_has_biasgrad_no_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_has_causalmask_has_bias_has_biasgrad_no_dropout_maxk_256.cpp
index 0928f59bb..b0615cb13 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_has_causalmask_has_bias_has_biasgrad_no_dropout_maxk_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_has_causalmask_has_bias_has_biasgrad_no_dropout_maxk_256.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_batched_backward.h"
 
+
 template void run_batched_backward_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_has_causalmask_has_bias_has_biasgrad_no_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_has_causalmask_has_bias_has_biasgrad_no_dropout_maxk_32.cpp
index dfc929276..dc1dfba3e 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_has_causalmask_has_bias_has_biasgrad_no_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_has_causalmask_has_bias_has_biasgrad_no_dropout_maxk_32.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_batched_backward.h"
 
+
 template void run_batched_backward_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_has_causalmask_has_bias_has_biasgrad_no_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_has_causalmask_has_bias_has_biasgrad_no_dropout_maxk_64.cpp
index a915f8aa5..85560dae3 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_has_causalmask_has_bias_has_biasgrad_no_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_has_causalmask_has_bias_has_biasgrad_no_dropout_maxk_64.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_batched_backward.h"
 
+
 template void run_batched_backward_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_has_causalmask_has_bias_no_biasgrad_has_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_has_causalmask_has_bias_no_biasgrad_has_dropout_maxk_128.cpp
index 7e17c9298..45ee4fd6d 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_has_causalmask_has_bias_no_biasgrad_has_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_has_causalmask_has_bias_no_biasgrad_has_dropout_maxk_128.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_batched_backward.h"
 
+
 template void run_batched_backward_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_has_causalmask_has_bias_no_biasgrad_has_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_has_causalmask_has_bias_no_biasgrad_has_dropout_maxk_256.cpp
index 670b672e9..cc4febe21 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_has_causalmask_has_bias_no_biasgrad_has_dropout_maxk_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_has_causalmask_has_bias_no_biasgrad_has_dropout_maxk_256.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_batched_backward.h"
 
+
 template void run_batched_backward_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_has_causalmask_has_bias_no_biasgrad_has_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_has_causalmask_has_bias_no_biasgrad_has_dropout_maxk_32.cpp
index 8d980af34..77f5824dd 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_has_causalmask_has_bias_no_biasgrad_has_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_has_causalmask_has_bias_no_biasgrad_has_dropout_maxk_32.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_batched_backward.h"
 
+
 template void run_batched_backward_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_has_causalmask_has_bias_no_biasgrad_has_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_has_causalmask_has_bias_no_biasgrad_has_dropout_maxk_64.cpp
index be31aa59b..0943e233c 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_has_causalmask_has_bias_no_biasgrad_has_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_has_causalmask_has_bias_no_biasgrad_has_dropout_maxk_64.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_batched_backward.h"
 
+
 template void run_batched_backward_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_has_causalmask_has_bias_no_biasgrad_no_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_has_causalmask_has_bias_no_biasgrad_no_dropout_maxk_128.cpp
index 7ea9cb0a9..59206114f 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_has_causalmask_has_bias_no_biasgrad_no_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_has_causalmask_has_bias_no_biasgrad_no_dropout_maxk_128.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_batched_backward.h"
 
+
 template void run_batched_backward_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_has_causalmask_has_bias_no_biasgrad_no_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_has_causalmask_has_bias_no_biasgrad_no_dropout_maxk_256.cpp
index b3e989122..1170edbe5 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_has_causalmask_has_bias_no_biasgrad_no_dropout_maxk_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_has_causalmask_has_bias_no_biasgrad_no_dropout_maxk_256.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_batched_backward.h"
 
+
 template void run_batched_backward_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_has_causalmask_has_bias_no_biasgrad_no_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_has_causalmask_has_bias_no_biasgrad_no_dropout_maxk_32.cpp
index a2a9dd4d6..fa0ad59b7 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_has_causalmask_has_bias_no_biasgrad_no_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_has_causalmask_has_bias_no_biasgrad_no_dropout_maxk_32.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_batched_backward.h"
 
+
 template void run_batched_backward_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_has_causalmask_has_bias_no_biasgrad_no_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_has_causalmask_has_bias_no_biasgrad_no_dropout_maxk_64.cpp
index 594a62ff5..4a14da080 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_has_causalmask_has_bias_no_biasgrad_no_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_has_causalmask_has_bias_no_biasgrad_no_dropout_maxk_64.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_batched_backward.h"
 
+
 template void run_batched_backward_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_has_causalmask_no_bias_no_biasgrad_has_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_has_causalmask_no_bias_no_biasgrad_has_dropout_maxk_128.cpp
index 0307f9ab2..5c5af08af 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_has_causalmask_no_bias_no_biasgrad_has_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_has_causalmask_no_bias_no_biasgrad_has_dropout_maxk_128.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_batched_backward.h"
 
+
 template void run_batched_backward_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_has_causalmask_no_bias_no_biasgrad_has_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_has_causalmask_no_bias_no_biasgrad_has_dropout_maxk_256.cpp
index a0bf3f96a..1edf2b647 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_has_causalmask_no_bias_no_biasgrad_has_dropout_maxk_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_has_causalmask_no_bias_no_biasgrad_has_dropout_maxk_256.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_batched_backward.h"
 
+
 template void run_batched_backward_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_has_causalmask_no_bias_no_biasgrad_has_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_has_causalmask_no_bias_no_biasgrad_has_dropout_maxk_32.cpp
index 5a7cd479a..c13203a0c 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_has_causalmask_no_bias_no_biasgrad_has_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_has_causalmask_no_bias_no_biasgrad_has_dropout_maxk_32.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_batched_backward.h"
 
+
 template void run_batched_backward_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_has_causalmask_no_bias_no_biasgrad_has_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_has_causalmask_no_bias_no_biasgrad_has_dropout_maxk_64.cpp
index e1280f6d2..edf535c0b 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_has_causalmask_no_bias_no_biasgrad_has_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_has_causalmask_no_bias_no_biasgrad_has_dropout_maxk_64.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_batched_backward.h"
 
+
 template void run_batched_backward_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_has_causalmask_no_bias_no_biasgrad_no_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_has_causalmask_no_bias_no_biasgrad_no_dropout_maxk_128.cpp
index 04a107af4..b3a8f1a3b 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_has_causalmask_no_bias_no_biasgrad_no_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_has_causalmask_no_bias_no_biasgrad_no_dropout_maxk_128.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_batched_backward.h"
 
+
 template void run_batched_backward_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_has_causalmask_no_bias_no_biasgrad_no_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_has_causalmask_no_bias_no_biasgrad_no_dropout_maxk_256.cpp
index 53e5698cb..d0475fb79 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_has_causalmask_no_bias_no_biasgrad_no_dropout_maxk_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_has_causalmask_no_bias_no_biasgrad_no_dropout_maxk_256.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_batched_backward.h"
 
+
 template void run_batched_backward_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_has_causalmask_no_bias_no_biasgrad_no_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_has_causalmask_no_bias_no_biasgrad_no_dropout_maxk_32.cpp
index 0a41a2f27..6d0f48867 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_has_causalmask_no_bias_no_biasgrad_no_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_has_causalmask_no_bias_no_biasgrad_no_dropout_maxk_32.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_batched_backward.h"
 
+
 template void run_batched_backward_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_has_causalmask_no_bias_no_biasgrad_no_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_has_causalmask_no_bias_no_biasgrad_no_dropout_maxk_64.cpp
index 49d6b9641..4d60a8589 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_has_causalmask_no_bias_no_biasgrad_no_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_has_causalmask_no_bias_no_biasgrad_no_dropout_maxk_64.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_batched_backward.h"
 
+
 template void run_batched_backward_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_no_causalmask_has_bias_has_biasgrad_has_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_no_causalmask_has_bias_has_biasgrad_has_dropout_maxk_128.cpp
index f5ce7c5bb..0100f090f 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_no_causalmask_has_bias_has_biasgrad_has_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_no_causalmask_has_bias_has_biasgrad_has_dropout_maxk_128.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_batched_backward.h"
 
+
 template void run_batched_backward_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_no_causalmask_has_bias_has_biasgrad_has_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_no_causalmask_has_bias_has_biasgrad_has_dropout_maxk_256.cpp
index 94e149aac..1f3bb92cb 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_no_causalmask_has_bias_has_biasgrad_has_dropout_maxk_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_no_causalmask_has_bias_has_biasgrad_has_dropout_maxk_256.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_batched_backward.h"
 
+
 template void run_batched_backward_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_no_causalmask_has_bias_has_biasgrad_has_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_no_causalmask_has_bias_has_biasgrad_has_dropout_maxk_32.cpp
index 41ff265c7..04db3afad 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_no_causalmask_has_bias_has_biasgrad_has_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_no_causalmask_has_bias_has_biasgrad_has_dropout_maxk_32.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_batched_backward.h"
 
+
 template void run_batched_backward_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_no_causalmask_has_bias_has_biasgrad_has_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_no_causalmask_has_bias_has_biasgrad_has_dropout_maxk_64.cpp
index f6b776650..e18a4bd4a 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_no_causalmask_has_bias_has_biasgrad_has_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_no_causalmask_has_bias_has_biasgrad_has_dropout_maxk_64.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_batched_backward.h"
 
+
 template void run_batched_backward_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_no_causalmask_has_bias_has_biasgrad_no_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_no_causalmask_has_bias_has_biasgrad_no_dropout_maxk_128.cpp
index 7f4013aaf..5df78e1ec 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_no_causalmask_has_bias_has_biasgrad_no_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_no_causalmask_has_bias_has_biasgrad_no_dropout_maxk_128.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_batched_backward.h"
 
+
 template void run_batched_backward_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_no_causalmask_has_bias_has_biasgrad_no_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_no_causalmask_has_bias_has_biasgrad_no_dropout_maxk_256.cpp
index 7260605ba..323d799b5 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_no_causalmask_has_bias_has_biasgrad_no_dropout_maxk_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_no_causalmask_has_bias_has_biasgrad_no_dropout_maxk_256.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_batched_backward.h"
 
+
 template void run_batched_backward_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_no_causalmask_has_bias_has_biasgrad_no_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_no_causalmask_has_bias_has_biasgrad_no_dropout_maxk_32.cpp
index 5241a1b1f..82b8af2ac 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_no_causalmask_has_bias_has_biasgrad_no_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_no_causalmask_has_bias_has_biasgrad_no_dropout_maxk_32.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_batched_backward.h"
 
+
 template void run_batched_backward_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_no_causalmask_has_bias_has_biasgrad_no_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_no_causalmask_has_bias_has_biasgrad_no_dropout_maxk_64.cpp
index f5ee944eb..573826492 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_no_causalmask_has_bias_has_biasgrad_no_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_no_causalmask_has_bias_has_biasgrad_no_dropout_maxk_64.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_batched_backward.h"
 
+
 template void run_batched_backward_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_no_causalmask_has_bias_no_biasgrad_has_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_no_causalmask_has_bias_no_biasgrad_has_dropout_maxk_128.cpp
index 8ab3f930c..3ba12bc99 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_no_causalmask_has_bias_no_biasgrad_has_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_no_causalmask_has_bias_no_biasgrad_has_dropout_maxk_128.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_batched_backward.h"
 
+
 template void run_batched_backward_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_no_causalmask_has_bias_no_biasgrad_has_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_no_causalmask_has_bias_no_biasgrad_has_dropout_maxk_256.cpp
index dd83434ae..5d0025622 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_no_causalmask_has_bias_no_biasgrad_has_dropout_maxk_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_no_causalmask_has_bias_no_biasgrad_has_dropout_maxk_256.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_batched_backward.h"
 
+
 template void run_batched_backward_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_no_causalmask_has_bias_no_biasgrad_has_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_no_causalmask_has_bias_no_biasgrad_has_dropout_maxk_32.cpp
index c757b7d35..17ed22594 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_no_causalmask_has_bias_no_biasgrad_has_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_no_causalmask_has_bias_no_biasgrad_has_dropout_maxk_32.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_batched_backward.h"
 
+
 template void run_batched_backward_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_no_causalmask_has_bias_no_biasgrad_has_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_no_causalmask_has_bias_no_biasgrad_has_dropout_maxk_64.cpp
index 4b3d9f256..fd4ba2dfd 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_no_causalmask_has_bias_no_biasgrad_has_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_no_causalmask_has_bias_no_biasgrad_has_dropout_maxk_64.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_batched_backward.h"
 
+
 template void run_batched_backward_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_no_causalmask_has_bias_no_biasgrad_no_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_no_causalmask_has_bias_no_biasgrad_no_dropout_maxk_128.cpp
index 03455ee6e..4cb221876 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_no_causalmask_has_bias_no_biasgrad_no_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_no_causalmask_has_bias_no_biasgrad_no_dropout_maxk_128.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_batched_backward.h"
 
+
 template void run_batched_backward_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_no_causalmask_has_bias_no_biasgrad_no_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_no_causalmask_has_bias_no_biasgrad_no_dropout_maxk_256.cpp
index ab068fd9f..00091e827 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_no_causalmask_has_bias_no_biasgrad_no_dropout_maxk_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_no_causalmask_has_bias_no_biasgrad_no_dropout_maxk_256.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_batched_backward.h"
 
+
 template void run_batched_backward_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_no_causalmask_has_bias_no_biasgrad_no_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_no_causalmask_has_bias_no_biasgrad_no_dropout_maxk_32.cpp
index 48a501539..24eb9cf98 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_no_causalmask_has_bias_no_biasgrad_no_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_no_causalmask_has_bias_no_biasgrad_no_dropout_maxk_32.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_batched_backward.h"
 
+
 template void run_batched_backward_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_no_causalmask_has_bias_no_biasgrad_no_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_no_causalmask_has_bias_no_biasgrad_no_dropout_maxk_64.cpp
index d73c780a6..77008bcf5 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_no_causalmask_has_bias_no_biasgrad_no_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_no_causalmask_has_bias_no_biasgrad_no_dropout_maxk_64.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_batched_backward.h"
 
+
 template void run_batched_backward_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_no_causalmask_no_bias_no_biasgrad_has_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_no_causalmask_no_bias_no_biasgrad_has_dropout_maxk_128.cpp
index c0636a905..16c697a85 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_no_causalmask_no_bias_no_biasgrad_has_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_no_causalmask_no_bias_no_biasgrad_has_dropout_maxk_128.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_batched_backward.h"
 
+
 template void run_batched_backward_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_no_causalmask_no_bias_no_biasgrad_has_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_no_causalmask_no_bias_no_biasgrad_has_dropout_maxk_256.cpp
index f8709cb22..9ee060f32 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_no_causalmask_no_bias_no_biasgrad_has_dropout_maxk_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_no_causalmask_no_bias_no_biasgrad_has_dropout_maxk_256.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_batched_backward.h"
 
+
 template void run_batched_backward_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_no_causalmask_no_bias_no_biasgrad_has_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_no_causalmask_no_bias_no_biasgrad_has_dropout_maxk_32.cpp
index 3da3474df..16628b31b 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_no_causalmask_no_bias_no_biasgrad_has_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_no_causalmask_no_bias_no_biasgrad_has_dropout_maxk_32.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_batched_backward.h"
 
+
 template void run_batched_backward_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_no_causalmask_no_bias_no_biasgrad_has_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_no_causalmask_no_bias_no_biasgrad_has_dropout_maxk_64.cpp
index 6ed11608d..0c47e21db 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_no_causalmask_no_bias_no_biasgrad_has_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_no_causalmask_no_bias_no_biasgrad_has_dropout_maxk_64.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_batched_backward.h"
 
+
 template void run_batched_backward_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_no_causalmask_no_bias_no_biasgrad_no_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_no_causalmask_no_bias_no_biasgrad_no_dropout_maxk_128.cpp
index 3cca920f5..65b0a11e8 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_no_causalmask_no_bias_no_biasgrad_no_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_no_causalmask_no_bias_no_biasgrad_no_dropout_maxk_128.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_batched_backward.h"
 
+
 template void run_batched_backward_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_no_causalmask_no_bias_no_biasgrad_no_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_no_causalmask_no_bias_no_biasgrad_no_dropout_maxk_256.cpp
index b5293053e..7e1d1835d 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_no_causalmask_no_bias_no_biasgrad_no_dropout_maxk_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_no_causalmask_no_bias_no_biasgrad_no_dropout_maxk_256.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_batched_backward.h"
 
+
 template void run_batched_backward_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_no_causalmask_no_bias_no_biasgrad_no_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_no_causalmask_no_bias_no_biasgrad_no_dropout_maxk_32.cpp
index 6383d494e..52c1f82bf 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_no_causalmask_no_bias_no_biasgrad_no_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_no_causalmask_no_bias_no_biasgrad_no_dropout_maxk_32.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_batched_backward.h"
 
+
 template void run_batched_backward_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_no_causalmask_no_bias_no_biasgrad_no_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_no_causalmask_no_bias_no_biasgrad_no_dropout_maxk_64.cpp
index 585dc69f3..3ae27d64c 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_no_causalmask_no_bias_no_biasgrad_no_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_no_causalmask_no_bias_no_biasgrad_no_dropout_maxk_64.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_batched_backward.h"
 
+
 template void run_batched_backward_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_has_bias_has_biasgrad_has_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_has_bias_has_biasgrad_has_dropout_maxk_128.cpp
index 6ca73178d..6bda7dca0 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_has_bias_has_biasgrad_has_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_has_bias_has_biasgrad_has_dropout_maxk_128.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_batched_backward.h"
 
+
 template void run_batched_backward_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_has_bias_has_biasgrad_has_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_has_bias_has_biasgrad_has_dropout_maxk_256.cpp
index 9ff4c121f..62bb4da51 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_has_bias_has_biasgrad_has_dropout_maxk_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_has_bias_has_biasgrad_has_dropout_maxk_256.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_batched_backward.h"
 
+
 template void run_batched_backward_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_has_bias_has_biasgrad_has_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_has_bias_has_biasgrad_has_dropout_maxk_32.cpp
index 95218766e..2c6f31641 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_has_bias_has_biasgrad_has_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_has_bias_has_biasgrad_has_dropout_maxk_32.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_batched_backward.h"
 
+
 template void run_batched_backward_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_has_bias_has_biasgrad_has_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_has_bias_has_biasgrad_has_dropout_maxk_64.cpp
index bf092ff96..85e8c719f 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_has_bias_has_biasgrad_has_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_has_bias_has_biasgrad_has_dropout_maxk_64.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_batched_backward.h"
 
+
 template void run_batched_backward_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_has_bias_has_biasgrad_no_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_has_bias_has_biasgrad_no_dropout_maxk_128.cpp
index 394bbbe28..dbfc26d1e 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_has_bias_has_biasgrad_no_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_has_bias_has_biasgrad_no_dropout_maxk_128.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_batched_backward.h"
 
+
 template void run_batched_backward_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_has_bias_has_biasgrad_no_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_has_bias_has_biasgrad_no_dropout_maxk_256.cpp
index e85335338..c18a7439a 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_has_bias_has_biasgrad_no_dropout_maxk_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_has_bias_has_biasgrad_no_dropout_maxk_256.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_batched_backward.h"
 
+
 template void run_batched_backward_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_has_bias_has_biasgrad_no_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_has_bias_has_biasgrad_no_dropout_maxk_32.cpp
index ea3884557..b989377a5 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_has_bias_has_biasgrad_no_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_has_bias_has_biasgrad_no_dropout_maxk_32.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_batched_backward.h"
 
+
 template void run_batched_backward_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_has_bias_has_biasgrad_no_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_has_bias_has_biasgrad_no_dropout_maxk_64.cpp
index 4596bfd7f..0c0fe40d9 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_has_bias_has_biasgrad_no_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_has_bias_has_biasgrad_no_dropout_maxk_64.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_batched_backward.h"
 
+
 template void run_batched_backward_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_has_bias_no_biasgrad_has_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_has_bias_no_biasgrad_has_dropout_maxk_128.cpp
index e1d72bc58..537e9e0fa 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_has_bias_no_biasgrad_has_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_has_bias_no_biasgrad_has_dropout_maxk_128.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_batched_backward.h"
 
+
 template void run_batched_backward_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_has_bias_no_biasgrad_has_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_has_bias_no_biasgrad_has_dropout_maxk_256.cpp
index 743c74a29..dece0aa4e 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_has_bias_no_biasgrad_has_dropout_maxk_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_has_bias_no_biasgrad_has_dropout_maxk_256.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_batched_backward.h"
 
+
 template void run_batched_backward_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_has_bias_no_biasgrad_has_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_has_bias_no_biasgrad_has_dropout_maxk_32.cpp
index 96f62e9ac..79f162f27 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_has_bias_no_biasgrad_has_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_has_bias_no_biasgrad_has_dropout_maxk_32.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_batched_backward.h"
 
+
 template void run_batched_backward_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_has_bias_no_biasgrad_has_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_has_bias_no_biasgrad_has_dropout_maxk_64.cpp
index dd72c62f2..d9c163f84 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_has_bias_no_biasgrad_has_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_has_bias_no_biasgrad_has_dropout_maxk_64.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_batched_backward.h"
 
+
 template void run_batched_backward_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_has_bias_no_biasgrad_no_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_has_bias_no_biasgrad_no_dropout_maxk_128.cpp
index a0d7a83d9..37f622753 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_has_bias_no_biasgrad_no_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_has_bias_no_biasgrad_no_dropout_maxk_128.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_batched_backward.h"
 
+
 template void run_batched_backward_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_has_bias_no_biasgrad_no_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_has_bias_no_biasgrad_no_dropout_maxk_256.cpp
index 84f934fff..1e312cf7f 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_has_bias_no_biasgrad_no_dropout_maxk_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_has_bias_no_biasgrad_no_dropout_maxk_256.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_batched_backward.h"
 
+
 template void run_batched_backward_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_has_bias_no_biasgrad_no_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_has_bias_no_biasgrad_no_dropout_maxk_32.cpp
index e2d01f97e..03cb14d16 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_has_bias_no_biasgrad_no_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_has_bias_no_biasgrad_no_dropout_maxk_32.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_batched_backward.h"
 
+
 template void run_batched_backward_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_has_bias_no_biasgrad_no_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_has_bias_no_biasgrad_no_dropout_maxk_64.cpp
index d5378b3f3..fdd5cc6c5 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_has_bias_no_biasgrad_no_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_has_bias_no_biasgrad_no_dropout_maxk_64.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_batched_backward.h"
 
+
 template void run_batched_backward_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_no_bias_no_biasgrad_has_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_no_bias_no_biasgrad_has_dropout_maxk_128.cpp
index 02c8c9bc5..ffa0b948e 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_no_bias_no_biasgrad_has_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_no_bias_no_biasgrad_has_dropout_maxk_128.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_batched_backward.h"
 
+
 template void run_batched_backward_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_no_bias_no_biasgrad_has_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_no_bias_no_biasgrad_has_dropout_maxk_256.cpp
index 3683f701f..e77bb21e9 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_no_bias_no_biasgrad_has_dropout_maxk_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_no_bias_no_biasgrad_has_dropout_maxk_256.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_batched_backward.h"
 
+
 template void run_batched_backward_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_no_bias_no_biasgrad_has_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_no_bias_no_biasgrad_has_dropout_maxk_32.cpp
index 8057c759e..c0f9ee654 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_no_bias_no_biasgrad_has_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_no_bias_no_biasgrad_has_dropout_maxk_32.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_batched_backward.h"
 
+
 template void run_batched_backward_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_no_bias_no_biasgrad_has_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_no_bias_no_biasgrad_has_dropout_maxk_64.cpp
index af6091b25..082436890 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_no_bias_no_biasgrad_has_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_no_bias_no_biasgrad_has_dropout_maxk_64.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_batched_backward.h"
 
+
 template void run_batched_backward_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_no_bias_no_biasgrad_no_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_no_bias_no_biasgrad_no_dropout_maxk_128.cpp
index 3fc748ff2..478e39315 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_no_bias_no_biasgrad_no_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_no_bias_no_biasgrad_no_dropout_maxk_128.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_batched_backward.h"
 
+
 template void run_batched_backward_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_no_bias_no_biasgrad_no_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_no_bias_no_biasgrad_no_dropout_maxk_256.cpp
index a48129acd..3c6658897 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_no_bias_no_biasgrad_no_dropout_maxk_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_no_bias_no_biasgrad_no_dropout_maxk_256.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_batched_backward.h"
 
+
 template void run_batched_backward_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_no_bias_no_biasgrad_no_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_no_bias_no_biasgrad_no_dropout_maxk_32.cpp
index b9b6aacfe..58cb8d427 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_no_bias_no_biasgrad_no_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_no_bias_no_biasgrad_no_dropout_maxk_32.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_batched_backward.h"
 
+
 template void run_batched_backward_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_no_bias_no_biasgrad_no_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_no_bias_no_biasgrad_no_dropout_maxk_64.cpp
index 8b667d2f7..04a808a3a 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_no_bias_no_biasgrad_no_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_no_bias_no_biasgrad_no_dropout_maxk_64.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_batched_backward.h"
 
+
 template void run_batched_backward_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_has_bias_has_biasgrad_has_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_has_bias_has_biasgrad_has_dropout_maxk_128.cpp
index df1e6c3c0..6291955c3 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_has_bias_has_biasgrad_has_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_has_bias_has_biasgrad_has_dropout_maxk_128.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_batched_backward.h"
 
+
 template void run_batched_backward_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_has_bias_has_biasgrad_has_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_has_bias_has_biasgrad_has_dropout_maxk_256.cpp
index 96e8fe198..3a445cab9 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_has_bias_has_biasgrad_has_dropout_maxk_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_has_bias_has_biasgrad_has_dropout_maxk_256.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_batched_backward.h"
 
+
 template void run_batched_backward_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_has_bias_has_biasgrad_has_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_has_bias_has_biasgrad_has_dropout_maxk_32.cpp
index f415d9464..05a23fe81 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_has_bias_has_biasgrad_has_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_has_bias_has_biasgrad_has_dropout_maxk_32.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_batched_backward.h"
 
+
 template void run_batched_backward_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_has_bias_has_biasgrad_has_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_has_bias_has_biasgrad_has_dropout_maxk_64.cpp
index ff8d33f21..eed061f45 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_has_bias_has_biasgrad_has_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_has_bias_has_biasgrad_has_dropout_maxk_64.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_batched_backward.h"
 
+
 template void run_batched_backward_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_has_bias_has_biasgrad_no_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_has_bias_has_biasgrad_no_dropout_maxk_128.cpp
index 41da7ab90..04da2d7f9 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_has_bias_has_biasgrad_no_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_has_bias_has_biasgrad_no_dropout_maxk_128.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_batched_backward.h"
 
+
 template void run_batched_backward_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_has_bias_has_biasgrad_no_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_has_bias_has_biasgrad_no_dropout_maxk_256.cpp
index 380446dcc..0971c2582 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_has_bias_has_biasgrad_no_dropout_maxk_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_has_bias_has_biasgrad_no_dropout_maxk_256.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_batched_backward.h"
 
+
 template void run_batched_backward_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_has_bias_has_biasgrad_no_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_has_bias_has_biasgrad_no_dropout_maxk_32.cpp
index 340fb65ee..60ef436f6 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_has_bias_has_biasgrad_no_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_has_bias_has_biasgrad_no_dropout_maxk_32.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_batched_backward.h"
 
+
 template void run_batched_backward_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_has_bias_has_biasgrad_no_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_has_bias_has_biasgrad_no_dropout_maxk_64.cpp
index be7f2144d..568c619f9 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_has_bias_has_biasgrad_no_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_has_bias_has_biasgrad_no_dropout_maxk_64.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_batched_backward.h"
 
+
 template void run_batched_backward_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_has_bias_no_biasgrad_has_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_has_bias_no_biasgrad_has_dropout_maxk_128.cpp
index 0932fbb12..19e27101a 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_has_bias_no_biasgrad_has_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_has_bias_no_biasgrad_has_dropout_maxk_128.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_batched_backward.h"
 
+
 template void run_batched_backward_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_has_bias_no_biasgrad_has_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_has_bias_no_biasgrad_has_dropout_maxk_256.cpp
index 2d8fb4f54..c13031bcd 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_has_bias_no_biasgrad_has_dropout_maxk_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_has_bias_no_biasgrad_has_dropout_maxk_256.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_batched_backward.h"
 
+
 template void run_batched_backward_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_has_bias_no_biasgrad_has_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_has_bias_no_biasgrad_has_dropout_maxk_32.cpp
index eaafd9949..c9716e3a1 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_has_bias_no_biasgrad_has_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_has_bias_no_biasgrad_has_dropout_maxk_32.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_batched_backward.h"
 
+
 template void run_batched_backward_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_has_bias_no_biasgrad_has_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_has_bias_no_biasgrad_has_dropout_maxk_64.cpp
index 02cf83aba..fb4b25492 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_has_bias_no_biasgrad_has_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_has_bias_no_biasgrad_has_dropout_maxk_64.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_batched_backward.h"
 
+
 template void run_batched_backward_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_has_bias_no_biasgrad_no_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_has_bias_no_biasgrad_no_dropout_maxk_128.cpp
index 51bd8bedb..045baff41 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_has_bias_no_biasgrad_no_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_has_bias_no_biasgrad_no_dropout_maxk_128.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_batched_backward.h"
 
+
 template void run_batched_backward_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_has_bias_no_biasgrad_no_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_has_bias_no_biasgrad_no_dropout_maxk_256.cpp
index 934253c9c..5a9b9b630 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_has_bias_no_biasgrad_no_dropout_maxk_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_has_bias_no_biasgrad_no_dropout_maxk_256.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_batched_backward.h"
 
+
 template void run_batched_backward_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_has_bias_no_biasgrad_no_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_has_bias_no_biasgrad_no_dropout_maxk_32.cpp
index 7f999c203..6e7b5e211 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_has_bias_no_biasgrad_no_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_has_bias_no_biasgrad_no_dropout_maxk_32.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_batched_backward.h"
 
+
 template void run_batched_backward_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_has_bias_no_biasgrad_no_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_has_bias_no_biasgrad_no_dropout_maxk_64.cpp
index 3ad410861..68ccee8e7 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_has_bias_no_biasgrad_no_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_has_bias_no_biasgrad_no_dropout_maxk_64.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_batched_backward.h"
 
+
 template void run_batched_backward_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_no_bias_no_biasgrad_has_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_no_bias_no_biasgrad_has_dropout_maxk_128.cpp
index 90572aabf..d3dbae9d5 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_no_bias_no_biasgrad_has_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_no_bias_no_biasgrad_has_dropout_maxk_128.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_batched_backward.h"
 
+
 template void run_batched_backward_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_no_bias_no_biasgrad_has_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_no_bias_no_biasgrad_has_dropout_maxk_256.cpp
index de8a13a8a..8762b721b 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_no_bias_no_biasgrad_has_dropout_maxk_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_no_bias_no_biasgrad_has_dropout_maxk_256.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_batched_backward.h"
 
+
 template void run_batched_backward_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_no_bias_no_biasgrad_has_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_no_bias_no_biasgrad_has_dropout_maxk_32.cpp
index 9c0000820..b85e7c5a5 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_no_bias_no_biasgrad_has_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_no_bias_no_biasgrad_has_dropout_maxk_32.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_batched_backward.h"
 
+
 template void run_batched_backward_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_no_bias_no_biasgrad_has_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_no_bias_no_biasgrad_has_dropout_maxk_64.cpp
index 13902640d..d691bcaec 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_no_bias_no_biasgrad_has_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_no_bias_no_biasgrad_has_dropout_maxk_64.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_batched_backward.h"
 
+
 template void run_batched_backward_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_no_bias_no_biasgrad_no_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_no_bias_no_biasgrad_no_dropout_maxk_128.cpp
index 82849155e..408729a17 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_no_bias_no_biasgrad_no_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_no_bias_no_biasgrad_no_dropout_maxk_128.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_batched_backward.h"
 
+
 template void run_batched_backward_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_no_bias_no_biasgrad_no_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_no_bias_no_biasgrad_no_dropout_maxk_256.cpp
index 3d7b4d235..50d564927 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_no_bias_no_biasgrad_no_dropout_maxk_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_no_bias_no_biasgrad_no_dropout_maxk_256.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_batched_backward.h"
 
+
 template void run_batched_backward_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_no_bias_no_biasgrad_no_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_no_bias_no_biasgrad_no_dropout_maxk_32.cpp
index 81636cea6..5855ede14 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_no_bias_no_biasgrad_no_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_no_bias_no_biasgrad_no_dropout_maxk_32.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_batched_backward.h"
 
+
 template void run_batched_backward_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_no_bias_no_biasgrad_no_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_no_bias_no_biasgrad_no_dropout_maxk_64.cpp
index 97775f0e2..b329eeca0 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_no_bias_no_biasgrad_no_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_no_bias_no_biasgrad_no_dropout_maxk_64.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_batched_backward.h"
 
+
 template void run_batched_backward_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bf16_has_causalmask_has_bias_has_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bf16_has_causalmask_has_bias_has_dropout_maxk_128.cpp
index 5a639ee11..bd85a5fdc 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bf16_has_causalmask_has_bias_has_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bf16_has_causalmask_has_bias_has_dropout_maxk_128.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_batched_forward.h"
 
+
 template void run_batched_forward_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bf16_has_causalmask_has_bias_has_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bf16_has_causalmask_has_bias_has_dropout_maxk_256.cpp
index 29cf57025..2529a096f 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bf16_has_causalmask_has_bias_has_dropout_maxk_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bf16_has_causalmask_has_bias_has_dropout_maxk_256.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_batched_forward.h"
 
+
 template void run_batched_forward_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bf16_has_causalmask_has_bias_has_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bf16_has_causalmask_has_bias_has_dropout_maxk_32.cpp
index c60d415d4..3bde17cb3 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bf16_has_causalmask_has_bias_has_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bf16_has_causalmask_has_bias_has_dropout_maxk_32.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_batched_forward.h"
 
+
 template void run_batched_forward_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bf16_has_causalmask_has_bias_has_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bf16_has_causalmask_has_bias_has_dropout_maxk_64.cpp
index f6291e2db..50ff42476 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bf16_has_causalmask_has_bias_has_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bf16_has_causalmask_has_bias_has_dropout_maxk_64.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_batched_forward.h"
 
+
 template void run_batched_forward_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bf16_has_causalmask_has_bias_no_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bf16_has_causalmask_has_bias_no_dropout_maxk_128.cpp
index caec04c71..44cd6d4d9 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bf16_has_causalmask_has_bias_no_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bf16_has_causalmask_has_bias_no_dropout_maxk_128.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_batched_forward.h"
 
+
 template void run_batched_forward_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bf16_has_causalmask_has_bias_no_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bf16_has_causalmask_has_bias_no_dropout_maxk_256.cpp
index ae29f02a3..04934417a 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bf16_has_causalmask_has_bias_no_dropout_maxk_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bf16_has_causalmask_has_bias_no_dropout_maxk_256.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_batched_forward.h"
 
+
 template void run_batched_forward_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bf16_has_causalmask_has_bias_no_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bf16_has_causalmask_has_bias_no_dropout_maxk_32.cpp
index 71eda93e9..29d774316 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bf16_has_causalmask_has_bias_no_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bf16_has_causalmask_has_bias_no_dropout_maxk_32.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_batched_forward.h"
 
+
 template void run_batched_forward_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bf16_has_causalmask_has_bias_no_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bf16_has_causalmask_has_bias_no_dropout_maxk_64.cpp
index aa31f0f84..f7a6fed93 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bf16_has_causalmask_has_bias_no_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bf16_has_causalmask_has_bias_no_dropout_maxk_64.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_batched_forward.h"
 
+
 template void run_batched_forward_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bf16_has_causalmask_no_bias_has_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bf16_has_causalmask_no_bias_has_dropout_maxk_128.cpp
index 551c4eb67..73e6a902a 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bf16_has_causalmask_no_bias_has_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bf16_has_causalmask_no_bias_has_dropout_maxk_128.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_batched_forward.h"
 
+
 template void run_batched_forward_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bf16_has_causalmask_no_bias_has_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bf16_has_causalmask_no_bias_has_dropout_maxk_256.cpp
index 1d6e78baf..f199398c5 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bf16_has_causalmask_no_bias_has_dropout_maxk_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bf16_has_causalmask_no_bias_has_dropout_maxk_256.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_batched_forward.h"
 
+
 template void run_batched_forward_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bf16_has_causalmask_no_bias_has_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bf16_has_causalmask_no_bias_has_dropout_maxk_32.cpp
index 278f6d358..bfac0e729 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bf16_has_causalmask_no_bias_has_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bf16_has_causalmask_no_bias_has_dropout_maxk_32.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_batched_forward.h"
 
+
 template void run_batched_forward_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bf16_has_causalmask_no_bias_has_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bf16_has_causalmask_no_bias_has_dropout_maxk_64.cpp
index 18e12c0a4..bdbb9f67a 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bf16_has_causalmask_no_bias_has_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bf16_has_causalmask_no_bias_has_dropout_maxk_64.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_batched_forward.h"
 
+
 template void run_batched_forward_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bf16_has_causalmask_no_bias_no_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bf16_has_causalmask_no_bias_no_dropout_maxk_128.cpp
index d393e26c3..a02390265 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bf16_has_causalmask_no_bias_no_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bf16_has_causalmask_no_bias_no_dropout_maxk_128.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_batched_forward.h"
 
+
 template void run_batched_forward_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bf16_has_causalmask_no_bias_no_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bf16_has_causalmask_no_bias_no_dropout_maxk_256.cpp
index e5e99ede0..6cf0c876c 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bf16_has_causalmask_no_bias_no_dropout_maxk_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bf16_has_causalmask_no_bias_no_dropout_maxk_256.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_batched_forward.h"
 
+
 template void run_batched_forward_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bf16_has_causalmask_no_bias_no_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bf16_has_causalmask_no_bias_no_dropout_maxk_32.cpp
index 672b58be1..a4e1acd3b 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bf16_has_causalmask_no_bias_no_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bf16_has_causalmask_no_bias_no_dropout_maxk_32.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_batched_forward.h"
 
+
 template void run_batched_forward_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bf16_has_causalmask_no_bias_no_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bf16_has_causalmask_no_bias_no_dropout_maxk_64.cpp
index ed42d7c0b..42c97b8cc 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bf16_has_causalmask_no_bias_no_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bf16_has_causalmask_no_bias_no_dropout_maxk_64.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_batched_forward.h"
 
+
 template void run_batched_forward_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bf16_no_causalmask_has_bias_has_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bf16_no_causalmask_has_bias_has_dropout_maxk_128.cpp
index 7e71f6b27..dd1b22159 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bf16_no_causalmask_has_bias_has_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bf16_no_causalmask_has_bias_has_dropout_maxk_128.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_batched_forward.h"
 
+
 template void run_batched_forward_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bf16_no_causalmask_has_bias_has_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bf16_no_causalmask_has_bias_has_dropout_maxk_256.cpp
index 5f0af8c18..c5cc1590f 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bf16_no_causalmask_has_bias_has_dropout_maxk_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bf16_no_causalmask_has_bias_has_dropout_maxk_256.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_batched_forward.h"
 
+
 template void run_batched_forward_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bf16_no_causalmask_has_bias_has_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bf16_no_causalmask_has_bias_has_dropout_maxk_32.cpp
index 3aac80d51..ee0cc1d99 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bf16_no_causalmask_has_bias_has_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bf16_no_causalmask_has_bias_has_dropout_maxk_32.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_batched_forward.h"
 
+
 template void run_batched_forward_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bf16_no_causalmask_has_bias_has_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bf16_no_causalmask_has_bias_has_dropout_maxk_64.cpp
index 8018e467f..14142b105 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bf16_no_causalmask_has_bias_has_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bf16_no_causalmask_has_bias_has_dropout_maxk_64.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_batched_forward.h"
 
+
 template void run_batched_forward_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bf16_no_causalmask_has_bias_no_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bf16_no_causalmask_has_bias_no_dropout_maxk_128.cpp
index 0266d3a36..275fd42c1 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bf16_no_causalmask_has_bias_no_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bf16_no_causalmask_has_bias_no_dropout_maxk_128.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_batched_forward.h"
 
+
 template void run_batched_forward_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bf16_no_causalmask_has_bias_no_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bf16_no_causalmask_has_bias_no_dropout_maxk_256.cpp
index d327faf63..5fd214297 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bf16_no_causalmask_has_bias_no_dropout_maxk_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bf16_no_causalmask_has_bias_no_dropout_maxk_256.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_batched_forward.h"
 
+
 template void run_batched_forward_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bf16_no_causalmask_has_bias_no_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bf16_no_causalmask_has_bias_no_dropout_maxk_32.cpp
index af2c6e8de..4decc0120 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bf16_no_causalmask_has_bias_no_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bf16_no_causalmask_has_bias_no_dropout_maxk_32.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_batched_forward.h"
 
+
 template void run_batched_forward_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bf16_no_causalmask_has_bias_no_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bf16_no_causalmask_has_bias_no_dropout_maxk_64.cpp
index 722dc77bb..3fd53bff1 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bf16_no_causalmask_has_bias_no_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bf16_no_causalmask_has_bias_no_dropout_maxk_64.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_batched_forward.h"
 
+
 template void run_batched_forward_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bf16_no_causalmask_no_bias_has_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bf16_no_causalmask_no_bias_has_dropout_maxk_128.cpp
index 9ab840b67..1b2c2d743 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bf16_no_causalmask_no_bias_has_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bf16_no_causalmask_no_bias_has_dropout_maxk_128.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_batched_forward.h"
 
+
 template void run_batched_forward_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bf16_no_causalmask_no_bias_has_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bf16_no_causalmask_no_bias_has_dropout_maxk_256.cpp
index 6b6c4b6a1..4f27dd5af 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bf16_no_causalmask_no_bias_has_dropout_maxk_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bf16_no_causalmask_no_bias_has_dropout_maxk_256.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_batched_forward.h"
 
+
 template void run_batched_forward_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bf16_no_causalmask_no_bias_has_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bf16_no_causalmask_no_bias_has_dropout_maxk_32.cpp
index afd3bcfc3..b6e8741bc 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bf16_no_causalmask_no_bias_has_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bf16_no_causalmask_no_bias_has_dropout_maxk_32.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_batched_forward.h"
 
+
 template void run_batched_forward_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bf16_no_causalmask_no_bias_has_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bf16_no_causalmask_no_bias_has_dropout_maxk_64.cpp
index a349964c0..3ab275d8b 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bf16_no_causalmask_no_bias_has_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bf16_no_causalmask_no_bias_has_dropout_maxk_64.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_batched_forward.h"
 
+
 template void run_batched_forward_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bf16_no_causalmask_no_bias_no_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bf16_no_causalmask_no_bias_no_dropout_maxk_128.cpp
index 03eb236cc..84a92844c 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bf16_no_causalmask_no_bias_no_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bf16_no_causalmask_no_bias_no_dropout_maxk_128.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_batched_forward.h"
 
+
 template void run_batched_forward_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bf16_no_causalmask_no_bias_no_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bf16_no_causalmask_no_bias_no_dropout_maxk_256.cpp
index 19dc010e4..d381d7190 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bf16_no_causalmask_no_bias_no_dropout_maxk_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bf16_no_causalmask_no_bias_no_dropout_maxk_256.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_batched_forward.h"
 
+
 template void run_batched_forward_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bf16_no_causalmask_no_bias_no_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bf16_no_causalmask_no_bias_no_dropout_maxk_32.cpp
index 14272770f..37d55967f 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bf16_no_causalmask_no_bias_no_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bf16_no_causalmask_no_bias_no_dropout_maxk_32.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_batched_forward.h"
 
+
 template void run_batched_forward_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bf16_no_causalmask_no_bias_no_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bf16_no_causalmask_no_bias_no_dropout_maxk_64.cpp
index bf7aefc53..afc8a232f 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bf16_no_causalmask_no_bias_no_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bf16_no_causalmask_no_bias_no_dropout_maxk_64.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_batched_forward.h"
 
+
 template void run_batched_forward_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_has_causalmask_has_bias_has_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_has_causalmask_has_bias_has_dropout_maxk_128.cpp
index 6e2e94259..faef825e7 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_has_causalmask_has_bias_has_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_has_causalmask_has_bias_has_dropout_maxk_128.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_batched_forward.h"
 
+
 template void run_batched_forward_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_has_causalmask_has_bias_has_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_has_causalmask_has_bias_has_dropout_maxk_256.cpp
index e08bb00a1..846e10f69 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_has_causalmask_has_bias_has_dropout_maxk_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_has_causalmask_has_bias_has_dropout_maxk_256.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_batched_forward.h"
 
+
 template void run_batched_forward_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_has_causalmask_has_bias_has_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_has_causalmask_has_bias_has_dropout_maxk_32.cpp
index 96de7b864..6b5be61df 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_has_causalmask_has_bias_has_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_has_causalmask_has_bias_has_dropout_maxk_32.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_batched_forward.h"
 
+
 template void run_batched_forward_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_has_causalmask_has_bias_has_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_has_causalmask_has_bias_has_dropout_maxk_64.cpp
index f82f2b471..84a34acd4 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_has_causalmask_has_bias_has_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_has_causalmask_has_bias_has_dropout_maxk_64.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_batched_forward.h"
 
+
 template void run_batched_forward_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_has_causalmask_has_bias_no_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_has_causalmask_has_bias_no_dropout_maxk_128.cpp
index 60eda29ce..4ed15b231 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_has_causalmask_has_bias_no_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_has_causalmask_has_bias_no_dropout_maxk_128.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_batched_forward.h"
 
+
 template void run_batched_forward_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_has_causalmask_has_bias_no_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_has_causalmask_has_bias_no_dropout_maxk_256.cpp
index 9cb7c591b..378ccb400 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_has_causalmask_has_bias_no_dropout_maxk_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_has_causalmask_has_bias_no_dropout_maxk_256.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_batched_forward.h"
 
+
 template void run_batched_forward_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_has_causalmask_has_bias_no_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_has_causalmask_has_bias_no_dropout_maxk_32.cpp
index effc47a63..5b99bd861 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_has_causalmask_has_bias_no_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_has_causalmask_has_bias_no_dropout_maxk_32.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_batched_forward.h"
 
+
 template void run_batched_forward_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_has_causalmask_has_bias_no_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_has_causalmask_has_bias_no_dropout_maxk_64.cpp
index 477ec5f36..a43b7f87e 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_has_causalmask_has_bias_no_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_has_causalmask_has_bias_no_dropout_maxk_64.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_batched_forward.h"
 
+
 template void run_batched_forward_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_has_causalmask_no_bias_has_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_has_causalmask_no_bias_has_dropout_maxk_128.cpp
index b75a4f46f..50627005f 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_has_causalmask_no_bias_has_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_has_causalmask_no_bias_has_dropout_maxk_128.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_batched_forward.h"
 
+
 template void run_batched_forward_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_has_causalmask_no_bias_has_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_has_causalmask_no_bias_has_dropout_maxk_256.cpp
index 322d9c2e2..b98232fda 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_has_causalmask_no_bias_has_dropout_maxk_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_has_causalmask_no_bias_has_dropout_maxk_256.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_batched_forward.h"
 
+
 template void run_batched_forward_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_has_causalmask_no_bias_has_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_has_causalmask_no_bias_has_dropout_maxk_32.cpp
index 77fb6a604..b594cf6e4 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_has_causalmask_no_bias_has_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_has_causalmask_no_bias_has_dropout_maxk_32.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_batched_forward.h"
 
+
 template void run_batched_forward_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_has_causalmask_no_bias_has_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_has_causalmask_no_bias_has_dropout_maxk_64.cpp
index 57214e6f3..f18fba3bc 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_has_causalmask_no_bias_has_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_has_causalmask_no_bias_has_dropout_maxk_64.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_batched_forward.h"
 
+
 template void run_batched_forward_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_has_causalmask_no_bias_no_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_has_causalmask_no_bias_no_dropout_maxk_128.cpp
index 3b4f1be34..5ba04db66 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_has_causalmask_no_bias_no_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_has_causalmask_no_bias_no_dropout_maxk_128.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_batched_forward.h"
 
+
 template void run_batched_forward_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_has_causalmask_no_bias_no_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_has_causalmask_no_bias_no_dropout_maxk_256.cpp
index afc858efb..6828d19a0 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_has_causalmask_no_bias_no_dropout_maxk_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_has_causalmask_no_bias_no_dropout_maxk_256.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_batched_forward.h"
 
+
 template void run_batched_forward_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_has_causalmask_no_bias_no_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_has_causalmask_no_bias_no_dropout_maxk_32.cpp
index bdf207633..e75c9823d 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_has_causalmask_no_bias_no_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_has_causalmask_no_bias_no_dropout_maxk_32.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_batched_forward.h"
 
+
 template void run_batched_forward_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_has_causalmask_no_bias_no_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_has_causalmask_no_bias_no_dropout_maxk_64.cpp
index ea656db19..49cce8e9d 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_has_causalmask_no_bias_no_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_has_causalmask_no_bias_no_dropout_maxk_64.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_batched_forward.h"
 
+
 template void run_batched_forward_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_no_causalmask_has_bias_has_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_no_causalmask_has_bias_has_dropout_maxk_128.cpp
index 5d65d7ae7..cccd03ce4 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_no_causalmask_has_bias_has_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_no_causalmask_has_bias_has_dropout_maxk_128.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_batched_forward.h"
 
+
 template void run_batched_forward_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_no_causalmask_has_bias_has_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_no_causalmask_has_bias_has_dropout_maxk_256.cpp
index 709138805..73fff51b8 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_no_causalmask_has_bias_has_dropout_maxk_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_no_causalmask_has_bias_has_dropout_maxk_256.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_batched_forward.h"
 
+
 template void run_batched_forward_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_no_causalmask_has_bias_has_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_no_causalmask_has_bias_has_dropout_maxk_32.cpp
index c50e52c86..d8ab68fa4 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_no_causalmask_has_bias_has_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_no_causalmask_has_bias_has_dropout_maxk_32.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_batched_forward.h"
 
+
 template void run_batched_forward_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_no_causalmask_has_bias_has_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_no_causalmask_has_bias_has_dropout_maxk_64.cpp
index 1808842fc..807f27935 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_no_causalmask_has_bias_has_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_no_causalmask_has_bias_has_dropout_maxk_64.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_batched_forward.h"
 
+
 template void run_batched_forward_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_no_causalmask_has_bias_no_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_no_causalmask_has_bias_no_dropout_maxk_128.cpp
index 367c420a4..5695adc9c 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_no_causalmask_has_bias_no_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_no_causalmask_has_bias_no_dropout_maxk_128.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_batched_forward.h"
 
+
 template void run_batched_forward_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_no_causalmask_has_bias_no_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_no_causalmask_has_bias_no_dropout_maxk_256.cpp
index 8f213bfef..fb68f8181 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_no_causalmask_has_bias_no_dropout_maxk_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_no_causalmask_has_bias_no_dropout_maxk_256.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_batched_forward.h"
 
+
 template void run_batched_forward_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_no_causalmask_has_bias_no_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_no_causalmask_has_bias_no_dropout_maxk_32.cpp
index fd5da6b77..ba89bc3ee 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_no_causalmask_has_bias_no_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_no_causalmask_has_bias_no_dropout_maxk_32.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_batched_forward.h"
 
+
 template void run_batched_forward_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_no_causalmask_has_bias_no_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_no_causalmask_has_bias_no_dropout_maxk_64.cpp
index 70e0723bb..3e3f6ec50 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_no_causalmask_has_bias_no_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_no_causalmask_has_bias_no_dropout_maxk_64.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_batched_forward.h"
 
+
 template void run_batched_forward_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_no_causalmask_no_bias_has_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_no_causalmask_no_bias_has_dropout_maxk_128.cpp
index 4f8e39ac1..10871d7ce 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_no_causalmask_no_bias_has_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_no_causalmask_no_bias_has_dropout_maxk_128.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_batched_forward.h"
 
+
 template void run_batched_forward_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_no_causalmask_no_bias_has_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_no_causalmask_no_bias_has_dropout_maxk_256.cpp
index 3d3be36e9..56e2dce4b 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_no_causalmask_no_bias_has_dropout_maxk_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_no_causalmask_no_bias_has_dropout_maxk_256.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_batched_forward.h"
 
+
 template void run_batched_forward_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_no_causalmask_no_bias_has_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_no_causalmask_no_bias_has_dropout_maxk_32.cpp
index 21aae8f7c..b37f432d3 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_no_causalmask_no_bias_has_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_no_causalmask_no_bias_has_dropout_maxk_32.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_batched_forward.h"
 
+
 template void run_batched_forward_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_no_causalmask_no_bias_has_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_no_causalmask_no_bias_has_dropout_maxk_64.cpp
index 514a01a39..81962fc30 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_no_causalmask_no_bias_has_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_no_causalmask_no_bias_has_dropout_maxk_64.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_batched_forward.h"
 
+
 template void run_batched_forward_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_no_causalmask_no_bias_no_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_no_causalmask_no_bias_no_dropout_maxk_128.cpp
index c67d1c653..56e6306f2 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_no_causalmask_no_bias_no_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_no_causalmask_no_bias_no_dropout_maxk_128.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_batched_forward.h"
 
+
 template void run_batched_forward_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_no_causalmask_no_bias_no_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_no_causalmask_no_bias_no_dropout_maxk_256.cpp
index 810036325..11bcea176 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_no_causalmask_no_bias_no_dropout_maxk_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_no_causalmask_no_bias_no_dropout_maxk_256.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_batched_forward.h"
 
+
 template void run_batched_forward_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_no_causalmask_no_bias_no_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_no_causalmask_no_bias_no_dropout_maxk_32.cpp
index 7dda46c89..660e70185 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_no_causalmask_no_bias_no_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_no_causalmask_no_bias_no_dropout_maxk_32.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_batched_forward.h"
 
+
 template void run_batched_forward_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_no_causalmask_no_bias_no_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_no_causalmask_no_bias_no_dropout_maxk_64.cpp
index 2392b9498..69596971e 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_no_causalmask_no_bias_no_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_no_causalmask_no_bias_no_dropout_maxk_64.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_batched_forward.h"
 
+
 template void run_batched_forward_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bf16_has_causalmask_has_bias_has_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bf16_has_causalmask_has_bias_has_dropout_maxk_128.cpp
index 74743b024..ebca11eb3 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bf16_has_causalmask_has_bias_has_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bf16_has_causalmask_has_bias_has_dropout_maxk_128.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_batched_infer.h"
 
+
 template void run_batched_infer_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bf16_has_causalmask_has_bias_has_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bf16_has_causalmask_has_bias_has_dropout_maxk_256.cpp
index 20290bab8..5601af4e0 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bf16_has_causalmask_has_bias_has_dropout_maxk_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bf16_has_causalmask_has_bias_has_dropout_maxk_256.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_batched_infer.h"
 
+
 template void run_batched_infer_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bf16_has_causalmask_has_bias_has_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bf16_has_causalmask_has_bias_has_dropout_maxk_32.cpp
index ab3225bd4..daa20d691 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bf16_has_causalmask_has_bias_has_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bf16_has_causalmask_has_bias_has_dropout_maxk_32.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_batched_infer.h"
 
+
 template void run_batched_infer_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bf16_has_causalmask_has_bias_has_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bf16_has_causalmask_has_bias_has_dropout_maxk_64.cpp
index 310442726..0f5bbf5dc 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bf16_has_causalmask_has_bias_has_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bf16_has_causalmask_has_bias_has_dropout_maxk_64.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_batched_infer.h"
 
+
 template void run_batched_infer_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bf16_has_causalmask_has_bias_no_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bf16_has_causalmask_has_bias_no_dropout_maxk_128.cpp
index af36d315e..884dffccd 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bf16_has_causalmask_has_bias_no_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bf16_has_causalmask_has_bias_no_dropout_maxk_128.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_batched_infer.h"
 
+
 template void run_batched_infer_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bf16_has_causalmask_has_bias_no_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bf16_has_causalmask_has_bias_no_dropout_maxk_256.cpp
index b25e1be08..05d0edb57 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bf16_has_causalmask_has_bias_no_dropout_maxk_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bf16_has_causalmask_has_bias_no_dropout_maxk_256.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_batched_infer.h"
 
+
 template void run_batched_infer_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bf16_has_causalmask_has_bias_no_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bf16_has_causalmask_has_bias_no_dropout_maxk_32.cpp
index 5e660a8ea..40ee28738 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bf16_has_causalmask_has_bias_no_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bf16_has_causalmask_has_bias_no_dropout_maxk_32.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_batched_infer.h"
 
+
 template void run_batched_infer_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bf16_has_causalmask_has_bias_no_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bf16_has_causalmask_has_bias_no_dropout_maxk_64.cpp
index 39153d92f..9ad0b9fab 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bf16_has_causalmask_has_bias_no_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bf16_has_causalmask_has_bias_no_dropout_maxk_64.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_batched_infer.h"
 
+
 template void run_batched_infer_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bf16_has_causalmask_no_bias_has_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bf16_has_causalmask_no_bias_has_dropout_maxk_128.cpp
index bf3c3f21a..a4e20b1cd 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bf16_has_causalmask_no_bias_has_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bf16_has_causalmask_no_bias_has_dropout_maxk_128.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_batched_infer.h"
 
+
 template void run_batched_infer_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bf16_has_causalmask_no_bias_has_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bf16_has_causalmask_no_bias_has_dropout_maxk_256.cpp
index e9c1c0551..2132bab64 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bf16_has_causalmask_no_bias_has_dropout_maxk_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bf16_has_causalmask_no_bias_has_dropout_maxk_256.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_batched_infer.h"
 
+
 template void run_batched_infer_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bf16_has_causalmask_no_bias_has_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bf16_has_causalmask_no_bias_has_dropout_maxk_32.cpp
index e35a1e7a5..7933827a8 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bf16_has_causalmask_no_bias_has_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bf16_has_causalmask_no_bias_has_dropout_maxk_32.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_batched_infer.h"
 
+
 template void run_batched_infer_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bf16_has_causalmask_no_bias_has_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bf16_has_causalmask_no_bias_has_dropout_maxk_64.cpp
index 577972843..2bf8f82a1 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bf16_has_causalmask_no_bias_has_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bf16_has_causalmask_no_bias_has_dropout_maxk_64.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_batched_infer.h"
 
+
 template void run_batched_infer_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bf16_has_causalmask_no_bias_no_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bf16_has_causalmask_no_bias_no_dropout_maxk_128.cpp
index bb48b49d2..2fbbf6236 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bf16_has_causalmask_no_bias_no_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bf16_has_causalmask_no_bias_no_dropout_maxk_128.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_batched_infer.h"
 
+
 template void run_batched_infer_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bf16_has_causalmask_no_bias_no_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bf16_has_causalmask_no_bias_no_dropout_maxk_256.cpp
index d13429529..d1180dd33 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bf16_has_causalmask_no_bias_no_dropout_maxk_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bf16_has_causalmask_no_bias_no_dropout_maxk_256.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_batched_infer.h"
 
+
 template void run_batched_infer_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bf16_has_causalmask_no_bias_no_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bf16_has_causalmask_no_bias_no_dropout_maxk_32.cpp
index 5d44df43a..2c56e4e56 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bf16_has_causalmask_no_bias_no_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bf16_has_causalmask_no_bias_no_dropout_maxk_32.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_batched_infer.h"
 
+
 template void run_batched_infer_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bf16_has_causalmask_no_bias_no_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bf16_has_causalmask_no_bias_no_dropout_maxk_64.cpp
index aadd0fcca..e079e0748 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bf16_has_causalmask_no_bias_no_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bf16_has_causalmask_no_bias_no_dropout_maxk_64.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_batched_infer.h"
 
+
 template void run_batched_infer_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bf16_no_causalmask_has_bias_has_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bf16_no_causalmask_has_bias_has_dropout_maxk_128.cpp
index 034275f69..0d9d667e1 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bf16_no_causalmask_has_bias_has_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bf16_no_causalmask_has_bias_has_dropout_maxk_128.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_batched_infer.h"
 
+
 template void run_batched_infer_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bf16_no_causalmask_has_bias_has_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bf16_no_causalmask_has_bias_has_dropout_maxk_256.cpp
index c922b00c0..2e0b100ee 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bf16_no_causalmask_has_bias_has_dropout_maxk_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bf16_no_causalmask_has_bias_has_dropout_maxk_256.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_batched_infer.h"
 
+
 template void run_batched_infer_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bf16_no_causalmask_has_bias_has_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bf16_no_causalmask_has_bias_has_dropout_maxk_32.cpp
index 8edd6fed5..b2712fce6 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bf16_no_causalmask_has_bias_has_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bf16_no_causalmask_has_bias_has_dropout_maxk_32.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_batched_infer.h"
 
+
 template void run_batched_infer_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bf16_no_causalmask_has_bias_has_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bf16_no_causalmask_has_bias_has_dropout_maxk_64.cpp
index e2d8ba101..19321447a 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bf16_no_causalmask_has_bias_has_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bf16_no_causalmask_has_bias_has_dropout_maxk_64.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_batched_infer.h"
 
+
 template void run_batched_infer_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bf16_no_causalmask_has_bias_no_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bf16_no_causalmask_has_bias_no_dropout_maxk_128.cpp
index 9e9adf31d..8d33e6d0a 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bf16_no_causalmask_has_bias_no_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bf16_no_causalmask_has_bias_no_dropout_maxk_128.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_batched_infer.h"
 
+
 template void run_batched_infer_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bf16_no_causalmask_has_bias_no_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bf16_no_causalmask_has_bias_no_dropout_maxk_256.cpp
index 306829eaf..1a77a9ed2 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bf16_no_causalmask_has_bias_no_dropout_maxk_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bf16_no_causalmask_has_bias_no_dropout_maxk_256.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_batched_infer.h"
 
+
 template void run_batched_infer_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bf16_no_causalmask_has_bias_no_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bf16_no_causalmask_has_bias_no_dropout_maxk_32.cpp
index 8bfc62104..14f62535a 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bf16_no_causalmask_has_bias_no_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bf16_no_causalmask_has_bias_no_dropout_maxk_32.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_batched_infer.h"
 
+
 template void run_batched_infer_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bf16_no_causalmask_has_bias_no_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bf16_no_causalmask_has_bias_no_dropout_maxk_64.cpp
index fe81acab4..ed8caf20d 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bf16_no_causalmask_has_bias_no_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bf16_no_causalmask_has_bias_no_dropout_maxk_64.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_batched_infer.h"
 
+
 template void run_batched_infer_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bf16_no_causalmask_no_bias_has_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bf16_no_causalmask_no_bias_has_dropout_maxk_128.cpp
index bcf5b783f..a3b553aaa 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bf16_no_causalmask_no_bias_has_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bf16_no_causalmask_no_bias_has_dropout_maxk_128.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_batched_infer.h"
 
+
 template void run_batched_infer_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bf16_no_causalmask_no_bias_has_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bf16_no_causalmask_no_bias_has_dropout_maxk_256.cpp
index ba5a41450..c645172e7 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bf16_no_causalmask_no_bias_has_dropout_maxk_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bf16_no_causalmask_no_bias_has_dropout_maxk_256.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_batched_infer.h"
 
+
 template void run_batched_infer_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bf16_no_causalmask_no_bias_has_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bf16_no_causalmask_no_bias_has_dropout_maxk_32.cpp
index 9cac1c3af..a92504458 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bf16_no_causalmask_no_bias_has_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bf16_no_causalmask_no_bias_has_dropout_maxk_32.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_batched_infer.h"
 
+
 template void run_batched_infer_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bf16_no_causalmask_no_bias_has_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bf16_no_causalmask_no_bias_has_dropout_maxk_64.cpp
index e31ed4362..a6d9ec1ee 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bf16_no_causalmask_no_bias_has_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bf16_no_causalmask_no_bias_has_dropout_maxk_64.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_batched_infer.h"
 
+
 template void run_batched_infer_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bf16_no_causalmask_no_bias_no_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bf16_no_causalmask_no_bias_no_dropout_maxk_128.cpp
index 9f52f52be..2d3f4711c 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bf16_no_causalmask_no_bias_no_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bf16_no_causalmask_no_bias_no_dropout_maxk_128.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_batched_infer.h"
 
+
 template void run_batched_infer_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bf16_no_causalmask_no_bias_no_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bf16_no_causalmask_no_bias_no_dropout_maxk_256.cpp
index 9ba93c82c..4e87793d6 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bf16_no_causalmask_no_bias_no_dropout_maxk_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bf16_no_causalmask_no_bias_no_dropout_maxk_256.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_batched_infer.h"
 
+
 template void run_batched_infer_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bf16_no_causalmask_no_bias_no_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bf16_no_causalmask_no_bias_no_dropout_maxk_32.cpp
index fec45193d..b627025e5 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bf16_no_causalmask_no_bias_no_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bf16_no_causalmask_no_bias_no_dropout_maxk_32.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_batched_infer.h"
 
+
 template void run_batched_infer_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bf16_no_causalmask_no_bias_no_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bf16_no_causalmask_no_bias_no_dropout_maxk_64.cpp
index 571f8ad48..ff2957c10 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bf16_no_causalmask_no_bias_no_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bf16_no_causalmask_no_bias_no_dropout_maxk_64.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_batched_infer.h"
 
+
 template void run_batched_infer_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_has_causalmask_has_bias_has_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_has_causalmask_has_bias_has_dropout_maxk_128.cpp
index 76447cfef..c5cf71b09 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_has_causalmask_has_bias_has_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_has_causalmask_has_bias_has_dropout_maxk_128.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_batched_infer.h"
 
+
 template void run_batched_infer_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_has_causalmask_has_bias_has_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_has_causalmask_has_bias_has_dropout_maxk_256.cpp
index 94e2e0dfc..3cda93ebd 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_has_causalmask_has_bias_has_dropout_maxk_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_has_causalmask_has_bias_has_dropout_maxk_256.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_batched_infer.h"
 
+
 template void run_batched_infer_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_has_causalmask_has_bias_has_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_has_causalmask_has_bias_has_dropout_maxk_32.cpp
index 432d955b7..d99c733b4 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_has_causalmask_has_bias_has_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_has_causalmask_has_bias_has_dropout_maxk_32.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_batched_infer.h"
 
+
 template void run_batched_infer_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_has_causalmask_has_bias_has_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_has_causalmask_has_bias_has_dropout_maxk_64.cpp
index 173d18aaf..e0e604f1c 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_has_causalmask_has_bias_has_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_has_causalmask_has_bias_has_dropout_maxk_64.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_batched_infer.h"
 
+
 template void run_batched_infer_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_has_causalmask_has_bias_no_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_has_causalmask_has_bias_no_dropout_maxk_128.cpp
index 7661a50d3..9148a2624 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_has_causalmask_has_bias_no_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_has_causalmask_has_bias_no_dropout_maxk_128.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_batched_infer.h"
 
+
 template void run_batched_infer_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_has_causalmask_has_bias_no_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_has_causalmask_has_bias_no_dropout_maxk_256.cpp
index b3e43957f..45d96f13d 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_has_causalmask_has_bias_no_dropout_maxk_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_has_causalmask_has_bias_no_dropout_maxk_256.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_batched_infer.h"
 
+
 template void run_batched_infer_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_has_causalmask_has_bias_no_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_has_causalmask_has_bias_no_dropout_maxk_32.cpp
index f54aa9ef4..a0096a6d7 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_has_causalmask_has_bias_no_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_has_causalmask_has_bias_no_dropout_maxk_32.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_batched_infer.h"
 
+
 template void run_batched_infer_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_has_causalmask_has_bias_no_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_has_causalmask_has_bias_no_dropout_maxk_64.cpp
index 17f4018c3..a16e08a30 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_has_causalmask_has_bias_no_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_has_causalmask_has_bias_no_dropout_maxk_64.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_batched_infer.h"
 
+
 template void run_batched_infer_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_has_causalmask_no_bias_has_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_has_causalmask_no_bias_has_dropout_maxk_128.cpp
index d5ea02d7c..5adffd056 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_has_causalmask_no_bias_has_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_has_causalmask_no_bias_has_dropout_maxk_128.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_batched_infer.h"
 
+
 template void run_batched_infer_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_has_causalmask_no_bias_has_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_has_causalmask_no_bias_has_dropout_maxk_256.cpp
index 2e4a6769e..7004a13a6 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_has_causalmask_no_bias_has_dropout_maxk_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_has_causalmask_no_bias_has_dropout_maxk_256.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_batched_infer.h"
 
+
 template void run_batched_infer_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_has_causalmask_no_bias_has_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_has_causalmask_no_bias_has_dropout_maxk_32.cpp
index 6caae1a75..f8cad2c3a 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_has_causalmask_no_bias_has_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_has_causalmask_no_bias_has_dropout_maxk_32.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_batched_infer.h"
 
+
 template void run_batched_infer_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_has_causalmask_no_bias_has_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_has_causalmask_no_bias_has_dropout_maxk_64.cpp
index c01f1105b..1270dd2ea 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_has_causalmask_no_bias_has_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_has_causalmask_no_bias_has_dropout_maxk_64.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_batched_infer.h"
 
+
 template void run_batched_infer_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_has_causalmask_no_bias_no_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_has_causalmask_no_bias_no_dropout_maxk_128.cpp
index 4e146ec41..647c50792 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_has_causalmask_no_bias_no_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_has_causalmask_no_bias_no_dropout_maxk_128.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_batched_infer.h"
 
+
 template void run_batched_infer_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_has_causalmask_no_bias_no_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_has_causalmask_no_bias_no_dropout_maxk_256.cpp
index e5bc54c2c..a85a5360d 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_has_causalmask_no_bias_no_dropout_maxk_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_has_causalmask_no_bias_no_dropout_maxk_256.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_batched_infer.h"
 
+
 template void run_batched_infer_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_has_causalmask_no_bias_no_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_has_causalmask_no_bias_no_dropout_maxk_32.cpp
index ac3f5d082..3c12b1e8a 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_has_causalmask_no_bias_no_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_has_causalmask_no_bias_no_dropout_maxk_32.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_batched_infer.h"
 
+
 template void run_batched_infer_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_has_causalmask_no_bias_no_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_has_causalmask_no_bias_no_dropout_maxk_64.cpp
index 3f39b0323..fa214ebf7 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_has_causalmask_no_bias_no_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_has_causalmask_no_bias_no_dropout_maxk_64.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_batched_infer.h"
 
+
 template void run_batched_infer_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_no_causalmask_has_bias_has_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_no_causalmask_has_bias_has_dropout_maxk_128.cpp
index 7440bc503..3d12babd5 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_no_causalmask_has_bias_has_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_no_causalmask_has_bias_has_dropout_maxk_128.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_batched_infer.h"
 
+
 template void run_batched_infer_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_no_causalmask_has_bias_has_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_no_causalmask_has_bias_has_dropout_maxk_256.cpp
index efaf98472..5231f0d2e 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_no_causalmask_has_bias_has_dropout_maxk_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_no_causalmask_has_bias_has_dropout_maxk_256.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_batched_infer.h"
 
+
 template void run_batched_infer_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_no_causalmask_has_bias_has_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_no_causalmask_has_bias_has_dropout_maxk_32.cpp
index 0820075e5..97c433883 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_no_causalmask_has_bias_has_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_no_causalmask_has_bias_has_dropout_maxk_32.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_batched_infer.h"
 
+
 template void run_batched_infer_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_no_causalmask_has_bias_has_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_no_causalmask_has_bias_has_dropout_maxk_64.cpp
index 89dace195..b744f412b 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_no_causalmask_has_bias_has_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_no_causalmask_has_bias_has_dropout_maxk_64.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_batched_infer.h"
 
+
 template void run_batched_infer_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_no_causalmask_has_bias_no_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_no_causalmask_has_bias_no_dropout_maxk_128.cpp
index 95f57c099..e9701e2db 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_no_causalmask_has_bias_no_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_no_causalmask_has_bias_no_dropout_maxk_128.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_batched_infer.h"
 
+
 template void run_batched_infer_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_no_causalmask_has_bias_no_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_no_causalmask_has_bias_no_dropout_maxk_256.cpp
index c8ac55329..075610634 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_no_causalmask_has_bias_no_dropout_maxk_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_no_causalmask_has_bias_no_dropout_maxk_256.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_batched_infer.h"
 
+
 template void run_batched_infer_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_no_causalmask_has_bias_no_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_no_causalmask_has_bias_no_dropout_maxk_32.cpp
index 10a261f3d..ab5423bfd 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_no_causalmask_has_bias_no_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_no_causalmask_has_bias_no_dropout_maxk_32.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_batched_infer.h"
 
+
 template void run_batched_infer_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_no_causalmask_has_bias_no_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_no_causalmask_has_bias_no_dropout_maxk_64.cpp
index 721145717..6a08c4772 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_no_causalmask_has_bias_no_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_no_causalmask_has_bias_no_dropout_maxk_64.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_batched_infer.h"
 
+
 template void run_batched_infer_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_no_causalmask_no_bias_has_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_no_causalmask_no_bias_has_dropout_maxk_128.cpp
index be3100082..44a3a6a76 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_no_causalmask_no_bias_has_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_no_causalmask_no_bias_has_dropout_maxk_128.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_batched_infer.h"
 
+
 template void run_batched_infer_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_no_causalmask_no_bias_has_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_no_causalmask_no_bias_has_dropout_maxk_256.cpp
index 7c70e53b9..8444c310e 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_no_causalmask_no_bias_has_dropout_maxk_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_no_causalmask_no_bias_has_dropout_maxk_256.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_batched_infer.h"
 
+
 template void run_batched_infer_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_no_causalmask_no_bias_has_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_no_causalmask_no_bias_has_dropout_maxk_32.cpp
index 75f733259..3cb04e9d3 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_no_causalmask_no_bias_has_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_no_causalmask_no_bias_has_dropout_maxk_32.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_batched_infer.h"
 
+
 template void run_batched_infer_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_no_causalmask_no_bias_has_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_no_causalmask_no_bias_has_dropout_maxk_64.cpp
index 50507e69c..ea7862776 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_no_causalmask_no_bias_has_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_no_causalmask_no_bias_has_dropout_maxk_64.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_batched_infer.h"
 
+
 template void run_batched_infer_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_no_causalmask_no_bias_no_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_no_causalmask_no_bias_no_dropout_maxk_128.cpp
index 931040548..809acb6e9 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_no_causalmask_no_bias_no_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_no_causalmask_no_bias_no_dropout_maxk_128.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_batched_infer.h"
 
+
 template void run_batched_infer_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_no_causalmask_no_bias_no_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_no_causalmask_no_bias_no_dropout_maxk_256.cpp
index a1a08d4d5..59c1812b0 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_no_causalmask_no_bias_no_dropout_maxk_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_no_causalmask_no_bias_no_dropout_maxk_256.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_batched_infer.h"
 
+
 template void run_batched_infer_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_no_causalmask_no_bias_no_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_no_causalmask_no_bias_no_dropout_maxk_32.cpp
index 200706066..23c34e385 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_no_causalmask_no_bias_no_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_no_causalmask_no_bias_no_dropout_maxk_32.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_batched_infer.h"
 
+
 template void run_batched_infer_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_no_causalmask_no_bias_no_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_no_causalmask_no_bias_no_dropout_maxk_64.cpp
index 9db040363..3f5085b29 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_no_causalmask_no_bias_no_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_no_causalmask_no_bias_no_dropout_maxk_64.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_batched_infer.h"
 
+
 template void run_batched_infer_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_has_causalmask_has_bias_has_biasgrad_has_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_has_causalmask_has_bias_has_biasgrad_has_dropout_maxk_128.cpp
index 72fec2837..da52b4524 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_has_causalmask_has_bias_has_biasgrad_has_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_has_causalmask_has_bias_has_biasgrad_has_dropout_maxk_128.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_grouped_backward.h"
 
+
 template void run_grouped_backward_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_has_causalmask_has_bias_has_biasgrad_has_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_has_causalmask_has_bias_has_biasgrad_has_dropout_maxk_256.cpp
index bf91a8aae..1e61eb1e1 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_has_causalmask_has_bias_has_biasgrad_has_dropout_maxk_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_has_causalmask_has_bias_has_biasgrad_has_dropout_maxk_256.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_grouped_backward.h"
 
+
 template void run_grouped_backward_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_has_causalmask_has_bias_has_biasgrad_has_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_has_causalmask_has_bias_has_biasgrad_has_dropout_maxk_32.cpp
index 5b3551d3b..136309d34 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_has_causalmask_has_bias_has_biasgrad_has_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_has_causalmask_has_bias_has_biasgrad_has_dropout_maxk_32.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_grouped_backward.h"
 
+
 template void run_grouped_backward_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_has_causalmask_has_bias_has_biasgrad_has_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_has_causalmask_has_bias_has_biasgrad_has_dropout_maxk_64.cpp
index c9ca1a559..06c6d3252 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_has_causalmask_has_bias_has_biasgrad_has_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_has_causalmask_has_bias_has_biasgrad_has_dropout_maxk_64.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_grouped_backward.h"
 
+
 template void run_grouped_backward_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_has_causalmask_has_bias_has_biasgrad_no_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_has_causalmask_has_bias_has_biasgrad_no_dropout_maxk_128.cpp
index 09daabcfa..10edbf6c0 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_has_causalmask_has_bias_has_biasgrad_no_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_has_causalmask_has_bias_has_biasgrad_no_dropout_maxk_128.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_grouped_backward.h"
 
+
 template void run_grouped_backward_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_has_causalmask_has_bias_has_biasgrad_no_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_has_causalmask_has_bias_has_biasgrad_no_dropout_maxk_256.cpp
index bf8cc800d..0c8ebfca6 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_has_causalmask_has_bias_has_biasgrad_no_dropout_maxk_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_has_causalmask_has_bias_has_biasgrad_no_dropout_maxk_256.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_grouped_backward.h"
 
+
 template void run_grouped_backward_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_has_causalmask_has_bias_has_biasgrad_no_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_has_causalmask_has_bias_has_biasgrad_no_dropout_maxk_32.cpp
index 0bc605677..d43472c5c 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_has_causalmask_has_bias_has_biasgrad_no_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_has_causalmask_has_bias_has_biasgrad_no_dropout_maxk_32.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_grouped_backward.h"
 
+
 template void run_grouped_backward_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_has_causalmask_has_bias_has_biasgrad_no_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_has_causalmask_has_bias_has_biasgrad_no_dropout_maxk_64.cpp
index 489610171..2002eecd6 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_has_causalmask_has_bias_has_biasgrad_no_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_has_causalmask_has_bias_has_biasgrad_no_dropout_maxk_64.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_grouped_backward.h"
 
+
 template void run_grouped_backward_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_has_causalmask_has_bias_no_biasgrad_has_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_has_causalmask_has_bias_no_biasgrad_has_dropout_maxk_128.cpp
index 3e9ba0cba..ae5874ec2 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_has_causalmask_has_bias_no_biasgrad_has_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_has_causalmask_has_bias_no_biasgrad_has_dropout_maxk_128.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_grouped_backward.h"
 
+
 template void run_grouped_backward_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_has_causalmask_has_bias_no_biasgrad_has_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_has_causalmask_has_bias_no_biasgrad_has_dropout_maxk_256.cpp
index 613249768..8436316d1 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_has_causalmask_has_bias_no_biasgrad_has_dropout_maxk_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_has_causalmask_has_bias_no_biasgrad_has_dropout_maxk_256.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_grouped_backward.h"
 
+
 template void run_grouped_backward_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_has_causalmask_has_bias_no_biasgrad_has_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_has_causalmask_has_bias_no_biasgrad_has_dropout_maxk_32.cpp
index 3e13c1b17..fc0a04b31 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_has_causalmask_has_bias_no_biasgrad_has_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_has_causalmask_has_bias_no_biasgrad_has_dropout_maxk_32.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_grouped_backward.h"
 
+
 template void run_grouped_backward_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_has_causalmask_has_bias_no_biasgrad_has_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_has_causalmask_has_bias_no_biasgrad_has_dropout_maxk_64.cpp
index b5023fdc8..f94f947a7 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_has_causalmask_has_bias_no_biasgrad_has_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_has_causalmask_has_bias_no_biasgrad_has_dropout_maxk_64.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_grouped_backward.h"
 
+
 template void run_grouped_backward_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_has_causalmask_has_bias_no_biasgrad_no_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_has_causalmask_has_bias_no_biasgrad_no_dropout_maxk_128.cpp
index 7c3a7a165..875c8acfd 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_has_causalmask_has_bias_no_biasgrad_no_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_has_causalmask_has_bias_no_biasgrad_no_dropout_maxk_128.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_grouped_backward.h"
 
+
 template void run_grouped_backward_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_has_causalmask_has_bias_no_biasgrad_no_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_has_causalmask_has_bias_no_biasgrad_no_dropout_maxk_256.cpp
index a7987553b..ec424034e 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_has_causalmask_has_bias_no_biasgrad_no_dropout_maxk_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_has_causalmask_has_bias_no_biasgrad_no_dropout_maxk_256.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_grouped_backward.h"
 
+
 template void run_grouped_backward_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_has_causalmask_has_bias_no_biasgrad_no_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_has_causalmask_has_bias_no_biasgrad_no_dropout_maxk_32.cpp
index 73cd48382..75c82d385 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_has_causalmask_has_bias_no_biasgrad_no_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_has_causalmask_has_bias_no_biasgrad_no_dropout_maxk_32.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_grouped_backward.h"
 
+
 template void run_grouped_backward_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_has_causalmask_has_bias_no_biasgrad_no_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_has_causalmask_has_bias_no_biasgrad_no_dropout_maxk_64.cpp
index f9163241f..1ac2b6c68 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_has_causalmask_has_bias_no_biasgrad_no_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_has_causalmask_has_bias_no_biasgrad_no_dropout_maxk_64.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_grouped_backward.h"
 
+
 template void run_grouped_backward_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_has_causalmask_no_bias_no_biasgrad_has_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_has_causalmask_no_bias_no_biasgrad_has_dropout_maxk_128.cpp
index 55fa67c3d..4d99c381d 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_has_causalmask_no_bias_no_biasgrad_has_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_has_causalmask_no_bias_no_biasgrad_has_dropout_maxk_128.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_grouped_backward.h"
 
+
 template void run_grouped_backward_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_has_causalmask_no_bias_no_biasgrad_has_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_has_causalmask_no_bias_no_biasgrad_has_dropout_maxk_256.cpp
index f71a97734..b39de523c 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_has_causalmask_no_bias_no_biasgrad_has_dropout_maxk_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_has_causalmask_no_bias_no_biasgrad_has_dropout_maxk_256.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_grouped_backward.h"
 
+
 template void run_grouped_backward_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_has_causalmask_no_bias_no_biasgrad_has_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_has_causalmask_no_bias_no_biasgrad_has_dropout_maxk_32.cpp
index 3549f1148..57bfe1e9b 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_has_causalmask_no_bias_no_biasgrad_has_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_has_causalmask_no_bias_no_biasgrad_has_dropout_maxk_32.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_grouped_backward.h"
 
+
 template void run_grouped_backward_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_has_causalmask_no_bias_no_biasgrad_has_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_has_causalmask_no_bias_no_biasgrad_has_dropout_maxk_64.cpp
index e8735e590..671cf1f5e 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_has_causalmask_no_bias_no_biasgrad_has_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_has_causalmask_no_bias_no_biasgrad_has_dropout_maxk_64.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_grouped_backward.h"
 
+
 template void run_grouped_backward_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_has_causalmask_no_bias_no_biasgrad_no_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_has_causalmask_no_bias_no_biasgrad_no_dropout_maxk_128.cpp
index 43586d91c..8d8044832 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_has_causalmask_no_bias_no_biasgrad_no_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_has_causalmask_no_bias_no_biasgrad_no_dropout_maxk_128.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_grouped_backward.h"
 
+
 template void run_grouped_backward_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_has_causalmask_no_bias_no_biasgrad_no_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_has_causalmask_no_bias_no_biasgrad_no_dropout_maxk_256.cpp
index 8986817c2..646e3dc93 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_has_causalmask_no_bias_no_biasgrad_no_dropout_maxk_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_has_causalmask_no_bias_no_biasgrad_no_dropout_maxk_256.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_grouped_backward.h"
 
+
 template void run_grouped_backward_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_has_causalmask_no_bias_no_biasgrad_no_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_has_causalmask_no_bias_no_biasgrad_no_dropout_maxk_32.cpp
index 6e6e44a15..e3be7a247 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_has_causalmask_no_bias_no_biasgrad_no_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_has_causalmask_no_bias_no_biasgrad_no_dropout_maxk_32.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_grouped_backward.h"
 
+
 template void run_grouped_backward_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_has_causalmask_no_bias_no_biasgrad_no_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_has_causalmask_no_bias_no_biasgrad_no_dropout_maxk_64.cpp
index 16c69fc8f..aace93798 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_has_causalmask_no_bias_no_biasgrad_no_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_has_causalmask_no_bias_no_biasgrad_no_dropout_maxk_64.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_grouped_backward.h"
 
+
 template void run_grouped_backward_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_no_causalmask_has_bias_has_biasgrad_has_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_no_causalmask_has_bias_has_biasgrad_has_dropout_maxk_128.cpp
index c590ef5a4..22e1faa7b 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_no_causalmask_has_bias_has_biasgrad_has_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_no_causalmask_has_bias_has_biasgrad_has_dropout_maxk_128.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_grouped_backward.h"
 
+
 template void run_grouped_backward_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_no_causalmask_has_bias_has_biasgrad_has_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_no_causalmask_has_bias_has_biasgrad_has_dropout_maxk_256.cpp
index 677b48f17..6f43a6f29 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_no_causalmask_has_bias_has_biasgrad_has_dropout_maxk_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_no_causalmask_has_bias_has_biasgrad_has_dropout_maxk_256.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_grouped_backward.h"
 
+
 template void run_grouped_backward_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_no_causalmask_has_bias_has_biasgrad_has_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_no_causalmask_has_bias_has_biasgrad_has_dropout_maxk_32.cpp
index 6e283c09f..00b6b1fe2 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_no_causalmask_has_bias_has_biasgrad_has_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_no_causalmask_has_bias_has_biasgrad_has_dropout_maxk_32.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_grouped_backward.h"
 
+
 template void run_grouped_backward_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_no_causalmask_has_bias_has_biasgrad_has_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_no_causalmask_has_bias_has_biasgrad_has_dropout_maxk_64.cpp
index 6d3aebee2..8f635c6a9 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_no_causalmask_has_bias_has_biasgrad_has_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_no_causalmask_has_bias_has_biasgrad_has_dropout_maxk_64.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_grouped_backward.h"
 
+
 template void run_grouped_backward_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_no_causalmask_has_bias_has_biasgrad_no_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_no_causalmask_has_bias_has_biasgrad_no_dropout_maxk_128.cpp
index 62da5b2b3..6ce4770a8 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_no_causalmask_has_bias_has_biasgrad_no_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_no_causalmask_has_bias_has_biasgrad_no_dropout_maxk_128.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_grouped_backward.h"
 
+
 template void run_grouped_backward_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_no_causalmask_has_bias_has_biasgrad_no_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_no_causalmask_has_bias_has_biasgrad_no_dropout_maxk_256.cpp
index 4031048c9..b19238d3e 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_no_causalmask_has_bias_has_biasgrad_no_dropout_maxk_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_no_causalmask_has_bias_has_biasgrad_no_dropout_maxk_256.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_grouped_backward.h"
 
+
 template void run_grouped_backward_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_no_causalmask_has_bias_has_biasgrad_no_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_no_causalmask_has_bias_has_biasgrad_no_dropout_maxk_32.cpp
index 28184d919..cfc040870 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_no_causalmask_has_bias_has_biasgrad_no_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_no_causalmask_has_bias_has_biasgrad_no_dropout_maxk_32.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_grouped_backward.h"
 
+
 template void run_grouped_backward_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_no_causalmask_has_bias_has_biasgrad_no_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_no_causalmask_has_bias_has_biasgrad_no_dropout_maxk_64.cpp
index a1cdf5607..57280c0f3 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_no_causalmask_has_bias_has_biasgrad_no_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_no_causalmask_has_bias_has_biasgrad_no_dropout_maxk_64.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_grouped_backward.h"
 
+
 template void run_grouped_backward_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_no_causalmask_has_bias_no_biasgrad_has_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_no_causalmask_has_bias_no_biasgrad_has_dropout_maxk_128.cpp
index 36a047ac7..38106aded 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_no_causalmask_has_bias_no_biasgrad_has_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_no_causalmask_has_bias_no_biasgrad_has_dropout_maxk_128.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_grouped_backward.h"
 
+
 template void run_grouped_backward_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_no_causalmask_has_bias_no_biasgrad_has_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_no_causalmask_has_bias_no_biasgrad_has_dropout_maxk_256.cpp
index 9287971e7..a98415a60 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_no_causalmask_has_bias_no_biasgrad_has_dropout_maxk_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_no_causalmask_has_bias_no_biasgrad_has_dropout_maxk_256.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_grouped_backward.h"
 
+
 template void run_grouped_backward_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_no_causalmask_has_bias_no_biasgrad_has_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_no_causalmask_has_bias_no_biasgrad_has_dropout_maxk_32.cpp
index 3930123b2..142824508 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_no_causalmask_has_bias_no_biasgrad_has_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_no_causalmask_has_bias_no_biasgrad_has_dropout_maxk_32.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_grouped_backward.h"
 
+
 template void run_grouped_backward_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_no_causalmask_has_bias_no_biasgrad_has_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_no_causalmask_has_bias_no_biasgrad_has_dropout_maxk_64.cpp
index 60bd6d5c7..6d9ce7550 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_no_causalmask_has_bias_no_biasgrad_has_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_no_causalmask_has_bias_no_biasgrad_has_dropout_maxk_64.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_grouped_backward.h"
 
+
 template void run_grouped_backward_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_no_causalmask_has_bias_no_biasgrad_no_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_no_causalmask_has_bias_no_biasgrad_no_dropout_maxk_128.cpp
index 549983dc4..9f4f7944b 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_no_causalmask_has_bias_no_biasgrad_no_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_no_causalmask_has_bias_no_biasgrad_no_dropout_maxk_128.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_grouped_backward.h"
 
+
 template void run_grouped_backward_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_no_causalmask_has_bias_no_biasgrad_no_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_no_causalmask_has_bias_no_biasgrad_no_dropout_maxk_256.cpp
index 918db4a7d..05a9e830c 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_no_causalmask_has_bias_no_biasgrad_no_dropout_maxk_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_no_causalmask_has_bias_no_biasgrad_no_dropout_maxk_256.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_grouped_backward.h"
 
+
 template void run_grouped_backward_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_no_causalmask_has_bias_no_biasgrad_no_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_no_causalmask_has_bias_no_biasgrad_no_dropout_maxk_32.cpp
index 8c32f736f..469f7ee4a 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_no_causalmask_has_bias_no_biasgrad_no_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_no_causalmask_has_bias_no_biasgrad_no_dropout_maxk_32.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_grouped_backward.h"
 
+
 template void run_grouped_backward_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_no_causalmask_has_bias_no_biasgrad_no_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_no_causalmask_has_bias_no_biasgrad_no_dropout_maxk_64.cpp
index e4a8919eb..bc76b94c5 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_no_causalmask_has_bias_no_biasgrad_no_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_no_causalmask_has_bias_no_biasgrad_no_dropout_maxk_64.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_grouped_backward.h"
 
+
 template void run_grouped_backward_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_no_causalmask_no_bias_no_biasgrad_has_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_no_causalmask_no_bias_no_biasgrad_has_dropout_maxk_128.cpp
index d88c4a1e0..a504db1c5 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_no_causalmask_no_bias_no_biasgrad_has_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_no_causalmask_no_bias_no_biasgrad_has_dropout_maxk_128.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_grouped_backward.h"
 
+
 template void run_grouped_backward_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_no_causalmask_no_bias_no_biasgrad_has_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_no_causalmask_no_bias_no_biasgrad_has_dropout_maxk_256.cpp
index 06f4dfdee..8a5e31b51 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_no_causalmask_no_bias_no_biasgrad_has_dropout_maxk_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_no_causalmask_no_bias_no_biasgrad_has_dropout_maxk_256.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_grouped_backward.h"
 
+
 template void run_grouped_backward_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_no_causalmask_no_bias_no_biasgrad_has_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_no_causalmask_no_bias_no_biasgrad_has_dropout_maxk_32.cpp
index 8aeb02787..f5c628c18 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_no_causalmask_no_bias_no_biasgrad_has_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_no_causalmask_no_bias_no_biasgrad_has_dropout_maxk_32.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_grouped_backward.h"
 
+
 template void run_grouped_backward_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_no_causalmask_no_bias_no_biasgrad_has_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_no_causalmask_no_bias_no_biasgrad_has_dropout_maxk_64.cpp
index a41d5eace..2bc167aa7 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_no_causalmask_no_bias_no_biasgrad_has_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_no_causalmask_no_bias_no_biasgrad_has_dropout_maxk_64.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_grouped_backward.h"
 
+
 template void run_grouped_backward_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_no_causalmask_no_bias_no_biasgrad_no_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_no_causalmask_no_bias_no_biasgrad_no_dropout_maxk_128.cpp
index 324e1f0d0..b06c9143e 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_no_causalmask_no_bias_no_biasgrad_no_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_no_causalmask_no_bias_no_biasgrad_no_dropout_maxk_128.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_grouped_backward.h"
 
+
 template void run_grouped_backward_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_no_causalmask_no_bias_no_biasgrad_no_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_no_causalmask_no_bias_no_biasgrad_no_dropout_maxk_256.cpp
index 046695fa2..a03c7b019 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_no_causalmask_no_bias_no_biasgrad_no_dropout_maxk_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_no_causalmask_no_bias_no_biasgrad_no_dropout_maxk_256.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_grouped_backward.h"
 
+
 template void run_grouped_backward_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_no_causalmask_no_bias_no_biasgrad_no_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_no_causalmask_no_bias_no_biasgrad_no_dropout_maxk_32.cpp
index 630e0f72c..542c82ac2 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_no_causalmask_no_bias_no_biasgrad_no_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_no_causalmask_no_bias_no_biasgrad_no_dropout_maxk_32.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_grouped_backward.h"
 
+
 template void run_grouped_backward_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_no_causalmask_no_bias_no_biasgrad_no_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_no_causalmask_no_bias_no_biasgrad_no_dropout_maxk_64.cpp
index b2b7066df..02c6caf0c 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_no_causalmask_no_bias_no_biasgrad_no_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_no_causalmask_no_bias_no_biasgrad_no_dropout_maxk_64.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_grouped_backward.h"
 
+
 template void run_grouped_backward_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_has_bias_has_biasgrad_has_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_has_bias_has_biasgrad_has_dropout_maxk_128.cpp
index 9f7544038..647dfed39 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_has_bias_has_biasgrad_has_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_has_bias_has_biasgrad_has_dropout_maxk_128.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_grouped_backward.h"
 
+
 template void run_grouped_backward_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_has_bias_has_biasgrad_has_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_has_bias_has_biasgrad_has_dropout_maxk_256.cpp
index 1955fc406..8408f10e5 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_has_bias_has_biasgrad_has_dropout_maxk_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_has_bias_has_biasgrad_has_dropout_maxk_256.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_grouped_backward.h"
 
+
 template void run_grouped_backward_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_has_bias_has_biasgrad_has_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_has_bias_has_biasgrad_has_dropout_maxk_32.cpp
index ab6c752ab..6f6baa130 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_has_bias_has_biasgrad_has_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_has_bias_has_biasgrad_has_dropout_maxk_32.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_grouped_backward.h"
 
+
 template void run_grouped_backward_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_has_bias_has_biasgrad_has_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_has_bias_has_biasgrad_has_dropout_maxk_64.cpp
index 988114605..fba9304bf 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_has_bias_has_biasgrad_has_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_has_bias_has_biasgrad_has_dropout_maxk_64.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_grouped_backward.h"
 
+
 template void run_grouped_backward_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_has_bias_has_biasgrad_no_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_has_bias_has_biasgrad_no_dropout_maxk_128.cpp
index 539311424..c319c597a 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_has_bias_has_biasgrad_no_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_has_bias_has_biasgrad_no_dropout_maxk_128.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_grouped_backward.h"
 
+
 template void run_grouped_backward_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_has_bias_has_biasgrad_no_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_has_bias_has_biasgrad_no_dropout_maxk_256.cpp
index 9958105c9..e3740d923 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_has_bias_has_biasgrad_no_dropout_maxk_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_has_bias_has_biasgrad_no_dropout_maxk_256.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_grouped_backward.h"
 
+
 template void run_grouped_backward_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_has_bias_has_biasgrad_no_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_has_bias_has_biasgrad_no_dropout_maxk_32.cpp
index 34dd66471..e630b82b3 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_has_bias_has_biasgrad_no_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_has_bias_has_biasgrad_no_dropout_maxk_32.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_grouped_backward.h"
 
+
 template void run_grouped_backward_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_has_bias_has_biasgrad_no_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_has_bias_has_biasgrad_no_dropout_maxk_64.cpp
index 88305d7de..adaf82000 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_has_bias_has_biasgrad_no_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_has_bias_has_biasgrad_no_dropout_maxk_64.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_grouped_backward.h"
 
+
 template void run_grouped_backward_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_has_bias_no_biasgrad_has_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_has_bias_no_biasgrad_has_dropout_maxk_128.cpp
index 4ff2f792b..ac94963de 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_has_bias_no_biasgrad_has_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_has_bias_no_biasgrad_has_dropout_maxk_128.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_grouped_backward.h"
 
+
 template void run_grouped_backward_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_has_bias_no_biasgrad_has_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_has_bias_no_biasgrad_has_dropout_maxk_256.cpp
index e45e7a153..39d892476 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_has_bias_no_biasgrad_has_dropout_maxk_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_has_bias_no_biasgrad_has_dropout_maxk_256.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_grouped_backward.h"
 
+
 template void run_grouped_backward_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_has_bias_no_biasgrad_has_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_has_bias_no_biasgrad_has_dropout_maxk_32.cpp
index 9534a7f50..508db91ec 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_has_bias_no_biasgrad_has_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_has_bias_no_biasgrad_has_dropout_maxk_32.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_grouped_backward.h"
 
+
 template void run_grouped_backward_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_has_bias_no_biasgrad_has_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_has_bias_no_biasgrad_has_dropout_maxk_64.cpp
index 906dcd51b..b83f716fd 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_has_bias_no_biasgrad_has_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_has_bias_no_biasgrad_has_dropout_maxk_64.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_grouped_backward.h"
 
+
 template void run_grouped_backward_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_has_bias_no_biasgrad_no_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_has_bias_no_biasgrad_no_dropout_maxk_128.cpp
index 926aadb7f..864c54707 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_has_bias_no_biasgrad_no_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_has_bias_no_biasgrad_no_dropout_maxk_128.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_grouped_backward.h"
 
+
 template void run_grouped_backward_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_has_bias_no_biasgrad_no_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_has_bias_no_biasgrad_no_dropout_maxk_256.cpp
index 4f8264bbf..b3c02ddb1 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_has_bias_no_biasgrad_no_dropout_maxk_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_has_bias_no_biasgrad_no_dropout_maxk_256.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_grouped_backward.h"
 
+
 template void run_grouped_backward_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_has_bias_no_biasgrad_no_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_has_bias_no_biasgrad_no_dropout_maxk_32.cpp
index 5c29ff3c0..dd433cf6b 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_has_bias_no_biasgrad_no_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_has_bias_no_biasgrad_no_dropout_maxk_32.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_grouped_backward.h"
 
+
 template void run_grouped_backward_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_has_bias_no_biasgrad_no_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_has_bias_no_biasgrad_no_dropout_maxk_64.cpp
index 75684001a..2b8bbd000 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_has_bias_no_biasgrad_no_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_has_bias_no_biasgrad_no_dropout_maxk_64.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_grouped_backward.h"
 
+
 template void run_grouped_backward_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_no_bias_no_biasgrad_has_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_no_bias_no_biasgrad_has_dropout_maxk_128.cpp
index 13e995979..c23499359 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_no_bias_no_biasgrad_has_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_no_bias_no_biasgrad_has_dropout_maxk_128.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_grouped_backward.h"
 
+
 template void run_grouped_backward_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_no_bias_no_biasgrad_has_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_no_bias_no_biasgrad_has_dropout_maxk_256.cpp
index 7a642504b..3e7281c9f 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_no_bias_no_biasgrad_has_dropout_maxk_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_no_bias_no_biasgrad_has_dropout_maxk_256.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_grouped_backward.h"
 
+
 template void run_grouped_backward_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_no_bias_no_biasgrad_has_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_no_bias_no_biasgrad_has_dropout_maxk_32.cpp
index d41ee2d19..f2bcef822 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_no_bias_no_biasgrad_has_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_no_bias_no_biasgrad_has_dropout_maxk_32.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_grouped_backward.h"
 
+
 template void run_grouped_backward_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_no_bias_no_biasgrad_has_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_no_bias_no_biasgrad_has_dropout_maxk_64.cpp
index 702a3bf4f..2c17644ac 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_no_bias_no_biasgrad_has_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_no_bias_no_biasgrad_has_dropout_maxk_64.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_grouped_backward.h"
 
+
 template void run_grouped_backward_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_no_bias_no_biasgrad_no_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_no_bias_no_biasgrad_no_dropout_maxk_128.cpp
index b450ef78d..fa7b75bad 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_no_bias_no_biasgrad_no_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_no_bias_no_biasgrad_no_dropout_maxk_128.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_grouped_backward.h"
 
+
 template void run_grouped_backward_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_no_bias_no_biasgrad_no_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_no_bias_no_biasgrad_no_dropout_maxk_256.cpp
index f77bf801b..8b8d3e18c 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_no_bias_no_biasgrad_no_dropout_maxk_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_no_bias_no_biasgrad_no_dropout_maxk_256.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_grouped_backward.h"
 
+
 template void run_grouped_backward_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_no_bias_no_biasgrad_no_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_no_bias_no_biasgrad_no_dropout_maxk_32.cpp
index be18be183..e4f6da1fd 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_no_bias_no_biasgrad_no_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_no_bias_no_biasgrad_no_dropout_maxk_32.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_grouped_backward.h"
 
+
 template void run_grouped_backward_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_no_bias_no_biasgrad_no_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_no_bias_no_biasgrad_no_dropout_maxk_64.cpp
index b93c05261..03ce989bd 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_no_bias_no_biasgrad_no_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_no_bias_no_biasgrad_no_dropout_maxk_64.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_grouped_backward.h"
 
+
 template void run_grouped_backward_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_has_bias_has_biasgrad_has_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_has_bias_has_biasgrad_has_dropout_maxk_128.cpp
index fc26a3025..dc4d9bce7 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_has_bias_has_biasgrad_has_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_has_bias_has_biasgrad_has_dropout_maxk_128.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_grouped_backward.h"
 
+
 template void run_grouped_backward_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_has_bias_has_biasgrad_has_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_has_bias_has_biasgrad_has_dropout_maxk_256.cpp
index b9eb3e927..3197e15f4 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_has_bias_has_biasgrad_has_dropout_maxk_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_has_bias_has_biasgrad_has_dropout_maxk_256.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_grouped_backward.h"
 
+
 template void run_grouped_backward_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_has_bias_has_biasgrad_has_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_has_bias_has_biasgrad_has_dropout_maxk_32.cpp
index 841cc31e5..7707a22ba 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_has_bias_has_biasgrad_has_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_has_bias_has_biasgrad_has_dropout_maxk_32.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_grouped_backward.h"
 
+
 template void run_grouped_backward_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_has_bias_has_biasgrad_has_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_has_bias_has_biasgrad_has_dropout_maxk_64.cpp
index f2865241c..ec91dbaff 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_has_bias_has_biasgrad_has_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_has_bias_has_biasgrad_has_dropout_maxk_64.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_grouped_backward.h"
 
+
 template void run_grouped_backward_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_has_bias_has_biasgrad_no_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_has_bias_has_biasgrad_no_dropout_maxk_128.cpp
index 35edebe38..3d57e18f1 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_has_bias_has_biasgrad_no_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_has_bias_has_biasgrad_no_dropout_maxk_128.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_grouped_backward.h"
 
+
 template void run_grouped_backward_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_has_bias_has_biasgrad_no_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_has_bias_has_biasgrad_no_dropout_maxk_256.cpp
index 5620850df..c851179fe 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_has_bias_has_biasgrad_no_dropout_maxk_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_has_bias_has_biasgrad_no_dropout_maxk_256.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_grouped_backward.h"
 
+
 template void run_grouped_backward_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_has_bias_has_biasgrad_no_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_has_bias_has_biasgrad_no_dropout_maxk_32.cpp
index 8e0d32d5a..3e0b2cefa 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_has_bias_has_biasgrad_no_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_has_bias_has_biasgrad_no_dropout_maxk_32.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_grouped_backward.h"
 
+
 template void run_grouped_backward_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_has_bias_has_biasgrad_no_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_has_bias_has_biasgrad_no_dropout_maxk_64.cpp
index 573ec892b..6630c3d74 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_has_bias_has_biasgrad_no_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_has_bias_has_biasgrad_no_dropout_maxk_64.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_grouped_backward.h"
 
+
 template void run_grouped_backward_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_has_bias_no_biasgrad_has_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_has_bias_no_biasgrad_has_dropout_maxk_128.cpp
index 33f9cace9..18683ea06 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_has_bias_no_biasgrad_has_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_has_bias_no_biasgrad_has_dropout_maxk_128.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_grouped_backward.h"
 
+
 template void run_grouped_backward_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_has_bias_no_biasgrad_has_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_has_bias_no_biasgrad_has_dropout_maxk_256.cpp
index d21c1beeb..cf38ccdd0 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_has_bias_no_biasgrad_has_dropout_maxk_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_has_bias_no_biasgrad_has_dropout_maxk_256.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_grouped_backward.h"
 
+
 template void run_grouped_backward_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_has_bias_no_biasgrad_has_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_has_bias_no_biasgrad_has_dropout_maxk_32.cpp
index 683918a99..67e7fc14c 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_has_bias_no_biasgrad_has_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_has_bias_no_biasgrad_has_dropout_maxk_32.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_grouped_backward.h"
 
+
 template void run_grouped_backward_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_has_bias_no_biasgrad_has_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_has_bias_no_biasgrad_has_dropout_maxk_64.cpp
index e0c419d2f..e4cb050b1 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_has_bias_no_biasgrad_has_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_has_bias_no_biasgrad_has_dropout_maxk_64.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_grouped_backward.h"
 
+
 template void run_grouped_backward_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_has_bias_no_biasgrad_no_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_has_bias_no_biasgrad_no_dropout_maxk_128.cpp
index 52e41c45d..a6f62c5ec 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_has_bias_no_biasgrad_no_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_has_bias_no_biasgrad_no_dropout_maxk_128.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_grouped_backward.h"
 
+
 template void run_grouped_backward_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_has_bias_no_biasgrad_no_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_has_bias_no_biasgrad_no_dropout_maxk_256.cpp
index 577345d8e..faf27d95b 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_has_bias_no_biasgrad_no_dropout_maxk_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_has_bias_no_biasgrad_no_dropout_maxk_256.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_grouped_backward.h"
 
+
 template void run_grouped_backward_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_has_bias_no_biasgrad_no_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_has_bias_no_biasgrad_no_dropout_maxk_32.cpp
index acdf13265..e7552bea0 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_has_bias_no_biasgrad_no_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_has_bias_no_biasgrad_no_dropout_maxk_32.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_grouped_backward.h"
 
+
 template void run_grouped_backward_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_has_bias_no_biasgrad_no_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_has_bias_no_biasgrad_no_dropout_maxk_64.cpp
index 6729d5917..43e0658b3 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_has_bias_no_biasgrad_no_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_has_bias_no_biasgrad_no_dropout_maxk_64.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_grouped_backward.h"
 
+
 template void run_grouped_backward_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_no_bias_no_biasgrad_has_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_no_bias_no_biasgrad_has_dropout_maxk_128.cpp
index 072115903..ff26b66be 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_no_bias_no_biasgrad_has_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_no_bias_no_biasgrad_has_dropout_maxk_128.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_grouped_backward.h"
 
+
 template void run_grouped_backward_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_no_bias_no_biasgrad_has_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_no_bias_no_biasgrad_has_dropout_maxk_256.cpp
index 267270591..76a5236c1 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_no_bias_no_biasgrad_has_dropout_maxk_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_no_bias_no_biasgrad_has_dropout_maxk_256.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_grouped_backward.h"
 
+
 template void run_grouped_backward_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_no_bias_no_biasgrad_has_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_no_bias_no_biasgrad_has_dropout_maxk_32.cpp
index 64ff3db39..cbb0cdf16 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_no_bias_no_biasgrad_has_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_no_bias_no_biasgrad_has_dropout_maxk_32.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_grouped_backward.h"
 
+
 template void run_grouped_backward_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_no_bias_no_biasgrad_has_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_no_bias_no_biasgrad_has_dropout_maxk_64.cpp
index f3acd7e17..7277f375a 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_no_bias_no_biasgrad_has_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_no_bias_no_biasgrad_has_dropout_maxk_64.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_grouped_backward.h"
 
+
 template void run_grouped_backward_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_no_bias_no_biasgrad_no_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_no_bias_no_biasgrad_no_dropout_maxk_128.cpp
index d78c56731..e1b1d55d6 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_no_bias_no_biasgrad_no_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_no_bias_no_biasgrad_no_dropout_maxk_128.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_grouped_backward.h"
 
+
 template void run_grouped_backward_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_no_bias_no_biasgrad_no_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_no_bias_no_biasgrad_no_dropout_maxk_256.cpp
index e2f0e69e2..bff058814 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_no_bias_no_biasgrad_no_dropout_maxk_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_no_bias_no_biasgrad_no_dropout_maxk_256.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_grouped_backward.h"
 
+
 template void run_grouped_backward_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_no_bias_no_biasgrad_no_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_no_bias_no_biasgrad_no_dropout_maxk_32.cpp
index 06dc769b9..9d0eb19ae 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_no_bias_no_biasgrad_no_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_no_bias_no_biasgrad_no_dropout_maxk_32.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_grouped_backward.h"
 
+
 template void run_grouped_backward_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_no_bias_no_biasgrad_no_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_no_bias_no_biasgrad_no_dropout_maxk_64.cpp
index 63928f3a2..80e3e5d31 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_no_bias_no_biasgrad_no_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_no_bias_no_biasgrad_no_dropout_maxk_64.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_grouped_backward.h"
 
+
 template void run_grouped_backward_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bf16_has_causalmask_has_bias_has_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bf16_has_causalmask_has_bias_has_dropout_maxk_128.cpp
index 55e21c75a..8d3f1699a 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bf16_has_causalmask_has_bias_has_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bf16_has_causalmask_has_bias_has_dropout_maxk_128.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_grouped_forward.h"
 
+
 template void run_grouped_forward_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bf16_has_causalmask_has_bias_has_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bf16_has_causalmask_has_bias_has_dropout_maxk_256.cpp
index 7c1c89f54..872b8feb9 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bf16_has_causalmask_has_bias_has_dropout_maxk_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bf16_has_causalmask_has_bias_has_dropout_maxk_256.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_grouped_forward.h"
 
+
 template void run_grouped_forward_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bf16_has_causalmask_has_bias_has_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bf16_has_causalmask_has_bias_has_dropout_maxk_32.cpp
index 9453c7d2c..e7e556194 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bf16_has_causalmask_has_bias_has_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bf16_has_causalmask_has_bias_has_dropout_maxk_32.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_grouped_forward.h"
 
+
 template void run_grouped_forward_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bf16_has_causalmask_has_bias_has_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bf16_has_causalmask_has_bias_has_dropout_maxk_64.cpp
index 888c865cd..fad634dd7 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bf16_has_causalmask_has_bias_has_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bf16_has_causalmask_has_bias_has_dropout_maxk_64.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_grouped_forward.h"
 
+
 template void run_grouped_forward_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bf16_has_causalmask_has_bias_no_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bf16_has_causalmask_has_bias_no_dropout_maxk_128.cpp
index 1e1231370..1cee53160 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bf16_has_causalmask_has_bias_no_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bf16_has_causalmask_has_bias_no_dropout_maxk_128.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_grouped_forward.h"
 
+
 template void run_grouped_forward_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bf16_has_causalmask_has_bias_no_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bf16_has_causalmask_has_bias_no_dropout_maxk_256.cpp
index 03625b779..b11085627 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bf16_has_causalmask_has_bias_no_dropout_maxk_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bf16_has_causalmask_has_bias_no_dropout_maxk_256.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_grouped_forward.h"
 
+
 template void run_grouped_forward_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bf16_has_causalmask_has_bias_no_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bf16_has_causalmask_has_bias_no_dropout_maxk_32.cpp
index b99a04d7a..78f288862 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bf16_has_causalmask_has_bias_no_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bf16_has_causalmask_has_bias_no_dropout_maxk_32.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_grouped_forward.h"
 
+
 template void run_grouped_forward_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bf16_has_causalmask_has_bias_no_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bf16_has_causalmask_has_bias_no_dropout_maxk_64.cpp
index 12c1b6a90..14a9250aa 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bf16_has_causalmask_has_bias_no_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bf16_has_causalmask_has_bias_no_dropout_maxk_64.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_grouped_forward.h"
 
+
 template void run_grouped_forward_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bf16_has_causalmask_no_bias_has_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bf16_has_causalmask_no_bias_has_dropout_maxk_128.cpp
index 42a6cea30..ea0d4e867 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bf16_has_causalmask_no_bias_has_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bf16_has_causalmask_no_bias_has_dropout_maxk_128.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_grouped_forward.h"
 
+
 template void run_grouped_forward_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bf16_has_causalmask_no_bias_has_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bf16_has_causalmask_no_bias_has_dropout_maxk_256.cpp
index 81d679689..3eae57ea0 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bf16_has_causalmask_no_bias_has_dropout_maxk_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bf16_has_causalmask_no_bias_has_dropout_maxk_256.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_grouped_forward.h"
 
+
 template void run_grouped_forward_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bf16_has_causalmask_no_bias_has_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bf16_has_causalmask_no_bias_has_dropout_maxk_32.cpp
index e614abdaa..de9de2f4b 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bf16_has_causalmask_no_bias_has_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bf16_has_causalmask_no_bias_has_dropout_maxk_32.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_grouped_forward.h"
 
+
 template void run_grouped_forward_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bf16_has_causalmask_no_bias_has_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bf16_has_causalmask_no_bias_has_dropout_maxk_64.cpp
index 339f99255..f0309768d 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bf16_has_causalmask_no_bias_has_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bf16_has_causalmask_no_bias_has_dropout_maxk_64.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_grouped_forward.h"
 
+
 template void run_grouped_forward_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bf16_has_causalmask_no_bias_no_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bf16_has_causalmask_no_bias_no_dropout_maxk_128.cpp
index 64b61826f..716e34fe4 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bf16_has_causalmask_no_bias_no_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bf16_has_causalmask_no_bias_no_dropout_maxk_128.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_grouped_forward.h"
 
+
 template void run_grouped_forward_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bf16_has_causalmask_no_bias_no_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bf16_has_causalmask_no_bias_no_dropout_maxk_256.cpp
index 4983a4ac1..f4982d3b6 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bf16_has_causalmask_no_bias_no_dropout_maxk_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bf16_has_causalmask_no_bias_no_dropout_maxk_256.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_grouped_forward.h"
 
+
 template void run_grouped_forward_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bf16_has_causalmask_no_bias_no_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bf16_has_causalmask_no_bias_no_dropout_maxk_32.cpp
index fa7649dea..f8bb2bf07 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bf16_has_causalmask_no_bias_no_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bf16_has_causalmask_no_bias_no_dropout_maxk_32.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_grouped_forward.h"
 
+
 template void run_grouped_forward_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bf16_has_causalmask_no_bias_no_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bf16_has_causalmask_no_bias_no_dropout_maxk_64.cpp
index 3a24474ba..ba9874ee7 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bf16_has_causalmask_no_bias_no_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bf16_has_causalmask_no_bias_no_dropout_maxk_64.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_grouped_forward.h"
 
+
 template void run_grouped_forward_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bf16_no_causalmask_has_bias_has_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bf16_no_causalmask_has_bias_has_dropout_maxk_128.cpp
index 57e895ae9..0f9de6935 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bf16_no_causalmask_has_bias_has_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bf16_no_causalmask_has_bias_has_dropout_maxk_128.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_grouped_forward.h"
 
+
 template void run_grouped_forward_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bf16_no_causalmask_has_bias_has_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bf16_no_causalmask_has_bias_has_dropout_maxk_256.cpp
index b975fa34c..74ac7d90c 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bf16_no_causalmask_has_bias_has_dropout_maxk_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bf16_no_causalmask_has_bias_has_dropout_maxk_256.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_grouped_forward.h"
 
+
 template void run_grouped_forward_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bf16_no_causalmask_has_bias_has_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bf16_no_causalmask_has_bias_has_dropout_maxk_32.cpp
index 3be314a73..dfd68d087 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bf16_no_causalmask_has_bias_has_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bf16_no_causalmask_has_bias_has_dropout_maxk_32.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_grouped_forward.h"
 
+
 template void run_grouped_forward_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bf16_no_causalmask_has_bias_has_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bf16_no_causalmask_has_bias_has_dropout_maxk_64.cpp
index 733debc01..0d83cb462 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bf16_no_causalmask_has_bias_has_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bf16_no_causalmask_has_bias_has_dropout_maxk_64.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_grouped_forward.h"
 
+
 template void run_grouped_forward_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bf16_no_causalmask_has_bias_no_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bf16_no_causalmask_has_bias_no_dropout_maxk_128.cpp
index b762d178c..008d2e68f 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bf16_no_causalmask_has_bias_no_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bf16_no_causalmask_has_bias_no_dropout_maxk_128.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_grouped_forward.h"
 
+
 template void run_grouped_forward_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bf16_no_causalmask_has_bias_no_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bf16_no_causalmask_has_bias_no_dropout_maxk_256.cpp
index 7d8648a26..254abd1fa 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bf16_no_causalmask_has_bias_no_dropout_maxk_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bf16_no_causalmask_has_bias_no_dropout_maxk_256.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_grouped_forward.h"
 
+
 template void run_grouped_forward_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bf16_no_causalmask_has_bias_no_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bf16_no_causalmask_has_bias_no_dropout_maxk_32.cpp
index 28a21d93f..38b336e01 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bf16_no_causalmask_has_bias_no_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bf16_no_causalmask_has_bias_no_dropout_maxk_32.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_grouped_forward.h"
 
+
 template void run_grouped_forward_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bf16_no_causalmask_has_bias_no_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bf16_no_causalmask_has_bias_no_dropout_maxk_64.cpp
index 2fe0721c6..efc6e40de 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bf16_no_causalmask_has_bias_no_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bf16_no_causalmask_has_bias_no_dropout_maxk_64.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_grouped_forward.h"
 
+
 template void run_grouped_forward_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bf16_no_causalmask_no_bias_has_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bf16_no_causalmask_no_bias_has_dropout_maxk_128.cpp
index 159489e9d..49924fbf2 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bf16_no_causalmask_no_bias_has_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bf16_no_causalmask_no_bias_has_dropout_maxk_128.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_grouped_forward.h"
 
+
 template void run_grouped_forward_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bf16_no_causalmask_no_bias_has_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bf16_no_causalmask_no_bias_has_dropout_maxk_256.cpp
index 507aabe2d..ef83ee445 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bf16_no_causalmask_no_bias_has_dropout_maxk_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bf16_no_causalmask_no_bias_has_dropout_maxk_256.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_grouped_forward.h"
 
+
 template void run_grouped_forward_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bf16_no_causalmask_no_bias_has_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bf16_no_causalmask_no_bias_has_dropout_maxk_32.cpp
index db7d8ed17..535f3877a 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bf16_no_causalmask_no_bias_has_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bf16_no_causalmask_no_bias_has_dropout_maxk_32.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_grouped_forward.h"
 
+
 template void run_grouped_forward_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bf16_no_causalmask_no_bias_has_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bf16_no_causalmask_no_bias_has_dropout_maxk_64.cpp
index c95898882..a89bc6bb4 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bf16_no_causalmask_no_bias_has_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bf16_no_causalmask_no_bias_has_dropout_maxk_64.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_grouped_forward.h"
 
+
 template void run_grouped_forward_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bf16_no_causalmask_no_bias_no_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bf16_no_causalmask_no_bias_no_dropout_maxk_128.cpp
index 4c5395bed..1276d65a6 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bf16_no_causalmask_no_bias_no_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bf16_no_causalmask_no_bias_no_dropout_maxk_128.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_grouped_forward.h"
 
+
 template void run_grouped_forward_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bf16_no_causalmask_no_bias_no_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bf16_no_causalmask_no_bias_no_dropout_maxk_256.cpp
index 487acd8fa..4a36334e4 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bf16_no_causalmask_no_bias_no_dropout_maxk_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bf16_no_causalmask_no_bias_no_dropout_maxk_256.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_grouped_forward.h"
 
+
 template void run_grouped_forward_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bf16_no_causalmask_no_bias_no_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bf16_no_causalmask_no_bias_no_dropout_maxk_32.cpp
index 913d55757..3505c9a97 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bf16_no_causalmask_no_bias_no_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bf16_no_causalmask_no_bias_no_dropout_maxk_32.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_grouped_forward.h"
 
+
 template void run_grouped_forward_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bf16_no_causalmask_no_bias_no_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bf16_no_causalmask_no_bias_no_dropout_maxk_64.cpp
index 137da7aaf..169fec04a 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bf16_no_causalmask_no_bias_no_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bf16_no_causalmask_no_bias_no_dropout_maxk_64.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_grouped_forward.h"
 
+
 template void run_grouped_forward_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_has_causalmask_has_bias_has_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_has_causalmask_has_bias_has_dropout_maxk_128.cpp
index 68a75552a..ce25186d3 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_has_causalmask_has_bias_has_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_has_causalmask_has_bias_has_dropout_maxk_128.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_grouped_forward.h"
 
+
 template void run_grouped_forward_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_has_causalmask_has_bias_has_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_has_causalmask_has_bias_has_dropout_maxk_256.cpp
index 0603f0d1c..f9633bbfd 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_has_causalmask_has_bias_has_dropout_maxk_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_has_causalmask_has_bias_has_dropout_maxk_256.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_grouped_forward.h"
 
+
 template void run_grouped_forward_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_has_causalmask_has_bias_has_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_has_causalmask_has_bias_has_dropout_maxk_32.cpp
index 2ba93fcc1..e5292f882 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_has_causalmask_has_bias_has_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_has_causalmask_has_bias_has_dropout_maxk_32.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_grouped_forward.h"
 
+
 template void run_grouped_forward_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_has_causalmask_has_bias_has_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_has_causalmask_has_bias_has_dropout_maxk_64.cpp
index 4f95470a5..aa89d62e8 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_has_causalmask_has_bias_has_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_has_causalmask_has_bias_has_dropout_maxk_64.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_grouped_forward.h"
 
+
 template void run_grouped_forward_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_has_causalmask_has_bias_no_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_has_causalmask_has_bias_no_dropout_maxk_128.cpp
index c12483acf..c34d945e0 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_has_causalmask_has_bias_no_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_has_causalmask_has_bias_no_dropout_maxk_128.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_grouped_forward.h"
 
+
 template void run_grouped_forward_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_has_causalmask_has_bias_no_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_has_causalmask_has_bias_no_dropout_maxk_256.cpp
index d2bb3b0f2..67690c1e5 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_has_causalmask_has_bias_no_dropout_maxk_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_has_causalmask_has_bias_no_dropout_maxk_256.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_grouped_forward.h"
 
+
 template void run_grouped_forward_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_has_causalmask_has_bias_no_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_has_causalmask_has_bias_no_dropout_maxk_32.cpp
index 76752b2e6..d332e50ea 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_has_causalmask_has_bias_no_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_has_causalmask_has_bias_no_dropout_maxk_32.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_grouped_forward.h"
 
+
 template void run_grouped_forward_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_has_causalmask_has_bias_no_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_has_causalmask_has_bias_no_dropout_maxk_64.cpp
index 2658965bc..6c9735dd1 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_has_causalmask_has_bias_no_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_has_causalmask_has_bias_no_dropout_maxk_64.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_grouped_forward.h"
 
+
 template void run_grouped_forward_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_has_causalmask_no_bias_has_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_has_causalmask_no_bias_has_dropout_maxk_128.cpp
index 3715f9e40..9b0e515e5 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_has_causalmask_no_bias_has_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_has_causalmask_no_bias_has_dropout_maxk_128.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_grouped_forward.h"
 
+
 template void run_grouped_forward_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_has_causalmask_no_bias_has_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_has_causalmask_no_bias_has_dropout_maxk_256.cpp
index df210e2b1..8a6aac9d4 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_has_causalmask_no_bias_has_dropout_maxk_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_has_causalmask_no_bias_has_dropout_maxk_256.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_grouped_forward.h"
 
+
 template void run_grouped_forward_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_has_causalmask_no_bias_has_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_has_causalmask_no_bias_has_dropout_maxk_32.cpp
index 0acee7775..91d7974f7 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_has_causalmask_no_bias_has_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_has_causalmask_no_bias_has_dropout_maxk_32.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_grouped_forward.h"
 
+
 template void run_grouped_forward_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_has_causalmask_no_bias_has_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_has_causalmask_no_bias_has_dropout_maxk_64.cpp
index 91e6d0778..ac69a855e 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_has_causalmask_no_bias_has_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_has_causalmask_no_bias_has_dropout_maxk_64.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_grouped_forward.h"
 
+
 template void run_grouped_forward_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_has_causalmask_no_bias_no_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_has_causalmask_no_bias_no_dropout_maxk_128.cpp
index 4c2b6ca25..938d8a2ef 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_has_causalmask_no_bias_no_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_has_causalmask_no_bias_no_dropout_maxk_128.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_grouped_forward.h"
 
+
 template void run_grouped_forward_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_has_causalmask_no_bias_no_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_has_causalmask_no_bias_no_dropout_maxk_256.cpp
index 5a2df731e..9f3432708 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_has_causalmask_no_bias_no_dropout_maxk_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_has_causalmask_no_bias_no_dropout_maxk_256.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_grouped_forward.h"
 
+
 template void run_grouped_forward_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_has_causalmask_no_bias_no_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_has_causalmask_no_bias_no_dropout_maxk_32.cpp
index 2492c47ea..1f5470478 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_has_causalmask_no_bias_no_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_has_causalmask_no_bias_no_dropout_maxk_32.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_grouped_forward.h"
 
+
 template void run_grouped_forward_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_has_causalmask_no_bias_no_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_has_causalmask_no_bias_no_dropout_maxk_64.cpp
index 7cd86ff79..8f30d330e 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_has_causalmask_no_bias_no_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_has_causalmask_no_bias_no_dropout_maxk_64.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_grouped_forward.h"
 
+
 template void run_grouped_forward_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_no_causalmask_has_bias_has_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_no_causalmask_has_bias_has_dropout_maxk_128.cpp
index 892446459..65fc8ffe9 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_no_causalmask_has_bias_has_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_no_causalmask_has_bias_has_dropout_maxk_128.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_grouped_forward.h"
 
+
 template void run_grouped_forward_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_no_causalmask_has_bias_has_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_no_causalmask_has_bias_has_dropout_maxk_256.cpp
index e6914af9d..35b9221d9 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_no_causalmask_has_bias_has_dropout_maxk_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_no_causalmask_has_bias_has_dropout_maxk_256.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_grouped_forward.h"
 
+
 template void run_grouped_forward_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_no_causalmask_has_bias_has_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_no_causalmask_has_bias_has_dropout_maxk_32.cpp
index 3acb390fe..9c598402e 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_no_causalmask_has_bias_has_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_no_causalmask_has_bias_has_dropout_maxk_32.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_grouped_forward.h"
 
+
 template void run_grouped_forward_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_no_causalmask_has_bias_has_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_no_causalmask_has_bias_has_dropout_maxk_64.cpp
index b395d5671..08ae9091b 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_no_causalmask_has_bias_has_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_no_causalmask_has_bias_has_dropout_maxk_64.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_grouped_forward.h"
 
+
 template void run_grouped_forward_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_no_causalmask_has_bias_no_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_no_causalmask_has_bias_no_dropout_maxk_128.cpp
index a65035381..6c295a8f9 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_no_causalmask_has_bias_no_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_no_causalmask_has_bias_no_dropout_maxk_128.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_grouped_forward.h"
 
+
 template void run_grouped_forward_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_no_causalmask_has_bias_no_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_no_causalmask_has_bias_no_dropout_maxk_256.cpp
index 547fef8b1..f1345945f 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_no_causalmask_has_bias_no_dropout_maxk_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_no_causalmask_has_bias_no_dropout_maxk_256.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_grouped_forward.h"
 
+
 template void run_grouped_forward_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_no_causalmask_has_bias_no_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_no_causalmask_has_bias_no_dropout_maxk_32.cpp
index 8ec916502..6c212f9ad 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_no_causalmask_has_bias_no_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_no_causalmask_has_bias_no_dropout_maxk_32.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_grouped_forward.h"
 
+
 template void run_grouped_forward_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_no_causalmask_has_bias_no_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_no_causalmask_has_bias_no_dropout_maxk_64.cpp
index 1f3195d6e..d934dad1b 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_no_causalmask_has_bias_no_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_no_causalmask_has_bias_no_dropout_maxk_64.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_grouped_forward.h"
 
+
 template void run_grouped_forward_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_no_causalmask_no_bias_has_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_no_causalmask_no_bias_has_dropout_maxk_128.cpp
index 1498a7d09..36e76fb54 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_no_causalmask_no_bias_has_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_no_causalmask_no_bias_has_dropout_maxk_128.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_grouped_forward.h"
 
+
 template void run_grouped_forward_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_no_causalmask_no_bias_has_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_no_causalmask_no_bias_has_dropout_maxk_256.cpp
index 858d55e00..56ada7742 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_no_causalmask_no_bias_has_dropout_maxk_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_no_causalmask_no_bias_has_dropout_maxk_256.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_grouped_forward.h"
 
+
 template void run_grouped_forward_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_no_causalmask_no_bias_has_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_no_causalmask_no_bias_has_dropout_maxk_32.cpp
index 72b4db4f8..fdc02134c 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_no_causalmask_no_bias_has_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_no_causalmask_no_bias_has_dropout_maxk_32.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_grouped_forward.h"
 
+
 template void run_grouped_forward_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_no_causalmask_no_bias_has_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_no_causalmask_no_bias_has_dropout_maxk_64.cpp
index 237cbc71c..c38442bc5 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_no_causalmask_no_bias_has_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_no_causalmask_no_bias_has_dropout_maxk_64.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_grouped_forward.h"
 
+
 template void run_grouped_forward_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_no_causalmask_no_bias_no_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_no_causalmask_no_bias_no_dropout_maxk_128.cpp
index a40d4a3a3..c31359a77 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_no_causalmask_no_bias_no_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_no_causalmask_no_bias_no_dropout_maxk_128.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_grouped_forward.h"
 
+
 template void run_grouped_forward_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_no_causalmask_no_bias_no_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_no_causalmask_no_bias_no_dropout_maxk_256.cpp
index 9fb5462a0..b57f76adb 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_no_causalmask_no_bias_no_dropout_maxk_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_no_causalmask_no_bias_no_dropout_maxk_256.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_grouped_forward.h"
 
+
 template void run_grouped_forward_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_no_causalmask_no_bias_no_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_no_causalmask_no_bias_no_dropout_maxk_32.cpp
index 832ee6f82..377af2368 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_no_causalmask_no_bias_no_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_no_causalmask_no_bias_no_dropout_maxk_32.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_grouped_forward.h"
 
+
 template void run_grouped_forward_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_no_causalmask_no_bias_no_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_no_causalmask_no_bias_no_dropout_maxk_64.cpp
index beaaaf75a..ab938eea3 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_no_causalmask_no_bias_no_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_no_causalmask_no_bias_no_dropout_maxk_64.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_grouped_forward.h"
 
+
 template void run_grouped_forward_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bf16_has_causalmask_has_bias_has_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bf16_has_causalmask_has_bias_has_dropout_maxk_128.cpp
index 23927f896..04f8ae899 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bf16_has_causalmask_has_bias_has_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bf16_has_causalmask_has_bias_has_dropout_maxk_128.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_grouped_infer.h"
 
+
 template void run_grouped_infer_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bf16_has_causalmask_has_bias_has_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bf16_has_causalmask_has_bias_has_dropout_maxk_256.cpp
index 7e0495247..3655443b7 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bf16_has_causalmask_has_bias_has_dropout_maxk_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bf16_has_causalmask_has_bias_has_dropout_maxk_256.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_grouped_infer.h"
 
+
 template void run_grouped_infer_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bf16_has_causalmask_has_bias_has_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bf16_has_causalmask_has_bias_has_dropout_maxk_32.cpp
index 59224bc65..6a2a642e7 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bf16_has_causalmask_has_bias_has_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bf16_has_causalmask_has_bias_has_dropout_maxk_32.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_grouped_infer.h"
 
+
 template void run_grouped_infer_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bf16_has_causalmask_has_bias_has_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bf16_has_causalmask_has_bias_has_dropout_maxk_64.cpp
index 2917ab5d0..5974a2212 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bf16_has_causalmask_has_bias_has_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bf16_has_causalmask_has_bias_has_dropout_maxk_64.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_grouped_infer.h"
 
+
 template void run_grouped_infer_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bf16_has_causalmask_has_bias_no_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bf16_has_causalmask_has_bias_no_dropout_maxk_128.cpp
index ea651303e..c84e495cf 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bf16_has_causalmask_has_bias_no_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bf16_has_causalmask_has_bias_no_dropout_maxk_128.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_grouped_infer.h"
 
+
 template void run_grouped_infer_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bf16_has_causalmask_has_bias_no_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bf16_has_causalmask_has_bias_no_dropout_maxk_256.cpp
index f1b6c2762..ff6371c15 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bf16_has_causalmask_has_bias_no_dropout_maxk_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bf16_has_causalmask_has_bias_no_dropout_maxk_256.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_grouped_infer.h"
 
+
 template void run_grouped_infer_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bf16_has_causalmask_has_bias_no_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bf16_has_causalmask_has_bias_no_dropout_maxk_32.cpp
index 631b007f7..0cdc2d375 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bf16_has_causalmask_has_bias_no_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bf16_has_causalmask_has_bias_no_dropout_maxk_32.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_grouped_infer.h"
 
+
 template void run_grouped_infer_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bf16_has_causalmask_has_bias_no_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bf16_has_causalmask_has_bias_no_dropout_maxk_64.cpp
index 6bf62e163..0517654c3 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bf16_has_causalmask_has_bias_no_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bf16_has_causalmask_has_bias_no_dropout_maxk_64.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_grouped_infer.h"
 
+
 template void run_grouped_infer_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bf16_has_causalmask_no_bias_has_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bf16_has_causalmask_no_bias_has_dropout_maxk_128.cpp
index e9d80dcba..5dc1e3bab 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bf16_has_causalmask_no_bias_has_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bf16_has_causalmask_no_bias_has_dropout_maxk_128.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_grouped_infer.h"
 
+
 template void run_grouped_infer_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bf16_has_causalmask_no_bias_has_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bf16_has_causalmask_no_bias_has_dropout_maxk_256.cpp
index 629111cc2..66eaffcd5 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bf16_has_causalmask_no_bias_has_dropout_maxk_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bf16_has_causalmask_no_bias_has_dropout_maxk_256.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_grouped_infer.h"
 
+
 template void run_grouped_infer_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bf16_has_causalmask_no_bias_has_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bf16_has_causalmask_no_bias_has_dropout_maxk_32.cpp
index 03a582a51..92af05353 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bf16_has_causalmask_no_bias_has_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bf16_has_causalmask_no_bias_has_dropout_maxk_32.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_grouped_infer.h"
 
+
 template void run_grouped_infer_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bf16_has_causalmask_no_bias_has_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bf16_has_causalmask_no_bias_has_dropout_maxk_64.cpp
index 8866842c5..2e385804d 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bf16_has_causalmask_no_bias_has_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bf16_has_causalmask_no_bias_has_dropout_maxk_64.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_grouped_infer.h"
 
+
 template void run_grouped_infer_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bf16_has_causalmask_no_bias_no_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bf16_has_causalmask_no_bias_no_dropout_maxk_128.cpp
index 0fc722d97..98a64ebd6 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bf16_has_causalmask_no_bias_no_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bf16_has_causalmask_no_bias_no_dropout_maxk_128.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_grouped_infer.h"
 
+
 template void run_grouped_infer_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bf16_has_causalmask_no_bias_no_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bf16_has_causalmask_no_bias_no_dropout_maxk_256.cpp
index d7654bcdb..427f2b4b6 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bf16_has_causalmask_no_bias_no_dropout_maxk_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bf16_has_causalmask_no_bias_no_dropout_maxk_256.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_grouped_infer.h"
 
+
 template void run_grouped_infer_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bf16_has_causalmask_no_bias_no_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bf16_has_causalmask_no_bias_no_dropout_maxk_32.cpp
index aa8b341c5..74a0ad136 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bf16_has_causalmask_no_bias_no_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bf16_has_causalmask_no_bias_no_dropout_maxk_32.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_grouped_infer.h"
 
+
 template void run_grouped_infer_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bf16_has_causalmask_no_bias_no_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bf16_has_causalmask_no_bias_no_dropout_maxk_64.cpp
index 14d6da36b..b9b2f4c8e 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bf16_has_causalmask_no_bias_no_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bf16_has_causalmask_no_bias_no_dropout_maxk_64.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_grouped_infer.h"
 
+
 template void run_grouped_infer_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bf16_no_causalmask_has_bias_has_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bf16_no_causalmask_has_bias_has_dropout_maxk_128.cpp
index 2f4a65c57..f04438d2a 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bf16_no_causalmask_has_bias_has_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bf16_no_causalmask_has_bias_has_dropout_maxk_128.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_grouped_infer.h"
 
+
 template void run_grouped_infer_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bf16_no_causalmask_has_bias_has_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bf16_no_causalmask_has_bias_has_dropout_maxk_256.cpp
index f7f7bde51..62edc1a2e 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bf16_no_causalmask_has_bias_has_dropout_maxk_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bf16_no_causalmask_has_bias_has_dropout_maxk_256.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_grouped_infer.h"
 
+
 template void run_grouped_infer_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bf16_no_causalmask_has_bias_has_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bf16_no_causalmask_has_bias_has_dropout_maxk_32.cpp
index 3833d791c..34d6468ce 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bf16_no_causalmask_has_bias_has_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bf16_no_causalmask_has_bias_has_dropout_maxk_32.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_grouped_infer.h"
 
+
 template void run_grouped_infer_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bf16_no_causalmask_has_bias_has_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bf16_no_causalmask_has_bias_has_dropout_maxk_64.cpp
index b2c7d4be1..c023c19de 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bf16_no_causalmask_has_bias_has_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bf16_no_causalmask_has_bias_has_dropout_maxk_64.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_grouped_infer.h"
 
+
 template void run_grouped_infer_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bf16_no_causalmask_has_bias_no_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bf16_no_causalmask_has_bias_no_dropout_maxk_128.cpp
index ab22cec47..dc133776e 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bf16_no_causalmask_has_bias_no_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bf16_no_causalmask_has_bias_no_dropout_maxk_128.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_grouped_infer.h"
 
+
 template void run_grouped_infer_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bf16_no_causalmask_has_bias_no_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bf16_no_causalmask_has_bias_no_dropout_maxk_256.cpp
index 198837822..4a1db9bf0 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bf16_no_causalmask_has_bias_no_dropout_maxk_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bf16_no_causalmask_has_bias_no_dropout_maxk_256.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_grouped_infer.h"
 
+
 template void run_grouped_infer_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bf16_no_causalmask_has_bias_no_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bf16_no_causalmask_has_bias_no_dropout_maxk_32.cpp
index 45d86f18a..9a8ace4a0 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bf16_no_causalmask_has_bias_no_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bf16_no_causalmask_has_bias_no_dropout_maxk_32.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_grouped_infer.h"
 
+
 template void run_grouped_infer_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bf16_no_causalmask_has_bias_no_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bf16_no_causalmask_has_bias_no_dropout_maxk_64.cpp
index be4cceb0c..e12cd3fff 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bf16_no_causalmask_has_bias_no_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bf16_no_causalmask_has_bias_no_dropout_maxk_64.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_grouped_infer.h"
 
+
 template void run_grouped_infer_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bf16_no_causalmask_no_bias_has_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bf16_no_causalmask_no_bias_has_dropout_maxk_128.cpp
index af14ace8f..171ed578a 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bf16_no_causalmask_no_bias_has_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bf16_no_causalmask_no_bias_has_dropout_maxk_128.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_grouped_infer.h"
 
+
 template void run_grouped_infer_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bf16_no_causalmask_no_bias_has_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bf16_no_causalmask_no_bias_has_dropout_maxk_256.cpp
index 00fbb2563..b442ee2da 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bf16_no_causalmask_no_bias_has_dropout_maxk_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bf16_no_causalmask_no_bias_has_dropout_maxk_256.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_grouped_infer.h"
 
+
 template void run_grouped_infer_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bf16_no_causalmask_no_bias_has_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bf16_no_causalmask_no_bias_has_dropout_maxk_32.cpp
index e7c4b053e..9fb4d0631 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bf16_no_causalmask_no_bias_has_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bf16_no_causalmask_no_bias_has_dropout_maxk_32.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_grouped_infer.h"
 
+
 template void run_grouped_infer_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bf16_no_causalmask_no_bias_has_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bf16_no_causalmask_no_bias_has_dropout_maxk_64.cpp
index c9d263f8f..71ee24859 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bf16_no_causalmask_no_bias_has_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bf16_no_causalmask_no_bias_has_dropout_maxk_64.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_grouped_infer.h"
 
+
 template void run_grouped_infer_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bf16_no_causalmask_no_bias_no_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bf16_no_causalmask_no_bias_no_dropout_maxk_128.cpp
index da5ce48b5..6f4707b35 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bf16_no_causalmask_no_bias_no_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bf16_no_causalmask_no_bias_no_dropout_maxk_128.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_grouped_infer.h"
 
+
 template void run_grouped_infer_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bf16_no_causalmask_no_bias_no_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bf16_no_causalmask_no_bias_no_dropout_maxk_256.cpp
index 4cac3c509..02bdcc483 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bf16_no_causalmask_no_bias_no_dropout_maxk_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bf16_no_causalmask_no_bias_no_dropout_maxk_256.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_grouped_infer.h"
 
+
 template void run_grouped_infer_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bf16_no_causalmask_no_bias_no_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bf16_no_causalmask_no_bias_no_dropout_maxk_32.cpp
index eacbac287..c8f566446 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bf16_no_causalmask_no_bias_no_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bf16_no_causalmask_no_bias_no_dropout_maxk_32.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_grouped_infer.h"
 
+
 template void run_grouped_infer_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bf16_no_causalmask_no_bias_no_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bf16_no_causalmask_no_bias_no_dropout_maxk_64.cpp
index e33f52717..b55f1e153 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bf16_no_causalmask_no_bias_no_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bf16_no_causalmask_no_bias_no_dropout_maxk_64.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_grouped_infer.h"
 
+
 template void run_grouped_infer_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_has_causalmask_has_bias_has_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_has_causalmask_has_bias_has_dropout_maxk_128.cpp
index c604204d2..f911866e5 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_has_causalmask_has_bias_has_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_has_causalmask_has_bias_has_dropout_maxk_128.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_grouped_infer.h"
 
+
 template void run_grouped_infer_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_has_causalmask_has_bias_has_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_has_causalmask_has_bias_has_dropout_maxk_256.cpp
index f4623e664..887a47967 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_has_causalmask_has_bias_has_dropout_maxk_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_has_causalmask_has_bias_has_dropout_maxk_256.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_grouped_infer.h"
 
+
 template void run_grouped_infer_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_has_causalmask_has_bias_has_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_has_causalmask_has_bias_has_dropout_maxk_32.cpp
index cb44bd3e6..3b3d764be 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_has_causalmask_has_bias_has_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_has_causalmask_has_bias_has_dropout_maxk_32.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_grouped_infer.h"
 
+
 template void run_grouped_infer_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_has_causalmask_has_bias_has_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_has_causalmask_has_bias_has_dropout_maxk_64.cpp
index 0f0e5290d..dd2ea0a10 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_has_causalmask_has_bias_has_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_has_causalmask_has_bias_has_dropout_maxk_64.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_grouped_infer.h"
 
+
 template void run_grouped_infer_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_has_causalmask_has_bias_no_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_has_causalmask_has_bias_no_dropout_maxk_128.cpp
index 9b486ea34..a86b9a983 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_has_causalmask_has_bias_no_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_has_causalmask_has_bias_no_dropout_maxk_128.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_grouped_infer.h"
 
+
 template void run_grouped_infer_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_has_causalmask_has_bias_no_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_has_causalmask_has_bias_no_dropout_maxk_256.cpp
index 2154e1485..931d97d47 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_has_causalmask_has_bias_no_dropout_maxk_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_has_causalmask_has_bias_no_dropout_maxk_256.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_grouped_infer.h"
 
+
 template void run_grouped_infer_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_has_causalmask_has_bias_no_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_has_causalmask_has_bias_no_dropout_maxk_32.cpp
index 4d526353a..d7b05ee2e 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_has_causalmask_has_bias_no_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_has_causalmask_has_bias_no_dropout_maxk_32.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_grouped_infer.h"
 
+
 template void run_grouped_infer_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_has_causalmask_has_bias_no_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_has_causalmask_has_bias_no_dropout_maxk_64.cpp
index bc14f586d..ff4b486a0 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_has_causalmask_has_bias_no_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_has_causalmask_has_bias_no_dropout_maxk_64.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_grouped_infer.h"
 
+
 template void run_grouped_infer_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_has_causalmask_no_bias_has_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_has_causalmask_no_bias_has_dropout_maxk_128.cpp
index 98567089a..e614c7365 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_has_causalmask_no_bias_has_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_has_causalmask_no_bias_has_dropout_maxk_128.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_grouped_infer.h"
 
+
 template void run_grouped_infer_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_has_causalmask_no_bias_has_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_has_causalmask_no_bias_has_dropout_maxk_256.cpp
index 26211bc69..187935111 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_has_causalmask_no_bias_has_dropout_maxk_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_has_causalmask_no_bias_has_dropout_maxk_256.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_grouped_infer.h"
 
+
 template void run_grouped_infer_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_has_causalmask_no_bias_has_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_has_causalmask_no_bias_has_dropout_maxk_32.cpp
index 72722bcf8..1d2f32df2 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_has_causalmask_no_bias_has_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_has_causalmask_no_bias_has_dropout_maxk_32.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_grouped_infer.h"
 
+
 template void run_grouped_infer_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_has_causalmask_no_bias_has_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_has_causalmask_no_bias_has_dropout_maxk_64.cpp
index c706a640c..fc33014b3 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_has_causalmask_no_bias_has_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_has_causalmask_no_bias_has_dropout_maxk_64.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_grouped_infer.h"
 
+
 template void run_grouped_infer_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_has_causalmask_no_bias_no_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_has_causalmask_no_bias_no_dropout_maxk_128.cpp
index 58107a965..84a2d66ae 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_has_causalmask_no_bias_no_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_has_causalmask_no_bias_no_dropout_maxk_128.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_grouped_infer.h"
 
+
 template void run_grouped_infer_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_has_causalmask_no_bias_no_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_has_causalmask_no_bias_no_dropout_maxk_256.cpp
index 2b2c794f5..c5ef23857 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_has_causalmask_no_bias_no_dropout_maxk_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_has_causalmask_no_bias_no_dropout_maxk_256.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_grouped_infer.h"
 
+
 template void run_grouped_infer_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_has_causalmask_no_bias_no_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_has_causalmask_no_bias_no_dropout_maxk_32.cpp
index e8e3110f9..d5d35804a 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_has_causalmask_no_bias_no_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_has_causalmask_no_bias_no_dropout_maxk_32.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_grouped_infer.h"
 
+
 template void run_grouped_infer_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_has_causalmask_no_bias_no_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_has_causalmask_no_bias_no_dropout_maxk_64.cpp
index c50ad6f4e..31407a74f 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_has_causalmask_no_bias_no_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_has_causalmask_no_bias_no_dropout_maxk_64.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_grouped_infer.h"
 
+
 template void run_grouped_infer_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_no_causalmask_has_bias_has_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_no_causalmask_has_bias_has_dropout_maxk_128.cpp
index 60e20d744..1537f93de 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_no_causalmask_has_bias_has_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_no_causalmask_has_bias_has_dropout_maxk_128.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_grouped_infer.h"
 
+
 template void run_grouped_infer_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_no_causalmask_has_bias_has_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_no_causalmask_has_bias_has_dropout_maxk_256.cpp
index e4eeebfcb..b3904f851 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_no_causalmask_has_bias_has_dropout_maxk_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_no_causalmask_has_bias_has_dropout_maxk_256.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_grouped_infer.h"
 
+
 template void run_grouped_infer_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_no_causalmask_has_bias_has_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_no_causalmask_has_bias_has_dropout_maxk_32.cpp
index 4b54aa562..bdd98997f 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_no_causalmask_has_bias_has_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_no_causalmask_has_bias_has_dropout_maxk_32.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_grouped_infer.h"
 
+
 template void run_grouped_infer_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_no_causalmask_has_bias_has_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_no_causalmask_has_bias_has_dropout_maxk_64.cpp
index 66e02cd50..698d72e95 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_no_causalmask_has_bias_has_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_no_causalmask_has_bias_has_dropout_maxk_64.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_grouped_infer.h"
 
+
 template void run_grouped_infer_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_no_causalmask_has_bias_no_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_no_causalmask_has_bias_no_dropout_maxk_128.cpp
index 1c42f4206..ad78bc332 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_no_causalmask_has_bias_no_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_no_causalmask_has_bias_no_dropout_maxk_128.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_grouped_infer.h"
 
+
 template void run_grouped_infer_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_no_causalmask_has_bias_no_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_no_causalmask_has_bias_no_dropout_maxk_256.cpp
index 46b4bd288..55b72d8fb 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_no_causalmask_has_bias_no_dropout_maxk_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_no_causalmask_has_bias_no_dropout_maxk_256.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_grouped_infer.h"
 
+
 template void run_grouped_infer_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_no_causalmask_has_bias_no_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_no_causalmask_has_bias_no_dropout_maxk_32.cpp
index 2ec8996f4..e5d2cb44b 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_no_causalmask_has_bias_no_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_no_causalmask_has_bias_no_dropout_maxk_32.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_grouped_infer.h"
 
+
 template void run_grouped_infer_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_no_causalmask_has_bias_no_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_no_causalmask_has_bias_no_dropout_maxk_64.cpp
index 5e2a114a7..ee7d81328 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_no_causalmask_has_bias_no_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_no_causalmask_has_bias_no_dropout_maxk_64.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_grouped_infer.h"
 
+
 template void run_grouped_infer_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_no_causalmask_no_bias_has_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_no_causalmask_no_bias_has_dropout_maxk_128.cpp
index 88ad1f8dd..68bcf15e3 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_no_causalmask_no_bias_has_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_no_causalmask_no_bias_has_dropout_maxk_128.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_grouped_infer.h"
 
+
 template void run_grouped_infer_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_no_causalmask_no_bias_has_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_no_causalmask_no_bias_has_dropout_maxk_256.cpp
index c536e0970..80021085e 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_no_causalmask_no_bias_has_dropout_maxk_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_no_causalmask_no_bias_has_dropout_maxk_256.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_grouped_infer.h"
 
+
 template void run_grouped_infer_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_no_causalmask_no_bias_has_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_no_causalmask_no_bias_has_dropout_maxk_32.cpp
index 0c927196b..14d942165 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_no_causalmask_no_bias_has_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_no_causalmask_no_bias_has_dropout_maxk_32.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_grouped_infer.h"
 
+
 template void run_grouped_infer_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_no_causalmask_no_bias_has_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_no_causalmask_no_bias_has_dropout_maxk_64.cpp
index e84f94f35..39ce50cda 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_no_causalmask_no_bias_has_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_no_causalmask_no_bias_has_dropout_maxk_64.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_grouped_infer.h"
 
+
 template void run_grouped_infer_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_no_causalmask_no_bias_no_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_no_causalmask_no_bias_no_dropout_maxk_128.cpp
index 94db8d5d9..6ba0e0550 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_no_causalmask_no_bias_no_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_no_causalmask_no_bias_no_dropout_maxk_128.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_grouped_infer.h"
 
+
 template void run_grouped_infer_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_no_causalmask_no_bias_no_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_no_causalmask_no_bias_no_dropout_maxk_256.cpp
index 61abbbf36..6d2e6831f 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_no_causalmask_no_bias_no_dropout_maxk_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_no_causalmask_no_bias_no_dropout_maxk_256.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_grouped_infer.h"
 
+
 template void run_grouped_infer_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_no_causalmask_no_bias_no_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_no_causalmask_no_bias_no_dropout_maxk_32.cpp
index 2a7b8f256..ffcf316fd 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_no_causalmask_no_bias_no_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_no_causalmask_no_bias_no_dropout_maxk_32.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_grouped_infer.h"
 
+
 template void run_grouped_infer_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_no_causalmask_no_bias_no_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_no_causalmask_no_bias_no_dropout_maxk_64.cpp
index d5b1bd180..e50bbb87f 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_no_causalmask_no_bias_no_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_no_causalmask_no_bias_no_dropout_maxk_64.cpp
@@ -1,6 +1,6 @@
 
 /*
-  Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -11,6 +11,7 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_grouped_infer.h"
 
+
 template void run_grouped_infer_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     false,

From 7db2aa43112b04a61ea827a316a9896f35e24050 Mon Sep 17 00:00:00 2001
From: Qianfeng Zhang <Qianfeng.Zhang@amd.com>
Date: Wed, 7 Aug 2024 18:57:21 +0000
Subject: [PATCH 606/641] Change to generate.py to generate instances refences
 and uses the generated reference headers

---
 .../ck_tiled_fmha_batched_backward_bf16.cpp   | 106 +----
 .../ck_tiled_fmha_batched_backward_fp16.cpp   | 106 +----
 .../ck_tiled_fmha_batched_forward_bf16.cpp    |  74 +---
 .../ck_tiled_fmha_batched_forward_fp16.cpp    |  74 +---
 .../ck_tiled_fmha_batched_infer_bf16.cpp      |  74 +---
 .../ck_tiled_fmha_batched_infer_fp16.cpp      |  74 +---
 .../ck_tiled_fmha_grouped_backward_bf16.cpp   | 106 +----
 .../ck_tiled_fmha_grouped_backward_fp16.cpp   | 106 +----
 .../ck_tiled_fmha_grouped_forward_bf16.cpp    |  74 +---
 .../ck_tiled_fmha_grouped_forward_fp16.cpp    |  74 +---
 .../ck_tiled_fmha_grouped_infer_bf16.cpp      |  74 +---
 .../ck_tiled_fmha_grouped_infer_fp16.cpp      |  74 +---
 .../attention/hip_fmha/generate_instances.py  | 102 ++++-
 ...ha_batched_backward_bf16_instances_ref.hpp | 396 ++++++++++++++++++
 ...ha_batched_backward_fp16_instances_ref.hpp | 396 ++++++++++++++++++
 ...mha_batched_forward_bf16_instances_ref.hpp | 236 +++++++++++
 ...mha_batched_forward_fp16_instances_ref.hpp | 236 +++++++++++
 .../fmha_batched_infer_bf16_instances_ref.hpp | 236 +++++++++++
 .../fmha_batched_infer_fp16_instances_ref.hpp | 236 +++++++++++
 ...ha_grouped_backward_bf16_instances_ref.hpp | 396 ++++++++++++++++++
 ...ha_grouped_backward_fp16_instances_ref.hpp | 396 ++++++++++++++++++
 ...mha_grouped_forward_bf16_instances_ref.hpp | 236 +++++++++++
 ...mha_grouped_forward_fp16_instances_ref.hpp | 236 +++++++++++
 .../fmha_grouped_infer_bf16_instances_ref.hpp | 236 +++++++++++
 .../fmha_grouped_infer_fp16_instances_ref.hpp | 236 +++++++++++
 25 files changed, 3585 insertions(+), 1005 deletions(-)
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_instances_ref.hpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_instances_ref.hpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bf16_instances_ref.hpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_instances_ref.hpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bf16_instances_ref.hpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_instances_ref.hpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_instances_ref.hpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_instances_ref.hpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bf16_instances_ref.hpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_instances_ref.hpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bf16_instances_ref.hpp
 create mode 100644 xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_instances_ref.hpp

diff --git a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_backward_bf16.cpp b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_backward_bf16.cpp
index fdec15de2..5352b9924 100644
--- a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_backward_bf16.cpp
+++ b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_backward_bf16.cpp
@@ -11,111 +11,7 @@
 #include "ck_tiled_fmha_batched_backward.h"
 #include "ck_tiled_headdim_switch.h"
 
-// clang-format off
-extern template void run_batched_backward_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, false, true, true, true, 32>(
-    BatchedBackwardParams& param, hipStream_t stream);
-extern template void run_batched_backward_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, false, true, false, true, 32>(
-    BatchedBackwardParams& param, hipStream_t stream);
-extern template void run_batched_backward_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, false, false, false, true, 32>(
-    BatchedBackwardParams& param, hipStream_t stream);
-extern template void run_batched_backward_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, true, true, true, true, 32>(
-    BatchedBackwardParams& param, hipStream_t stream);
-extern template void run_batched_backward_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, true, true, false, true, 32>(
-    BatchedBackwardParams& param, hipStream_t stream);
-extern template void run_batched_backward_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, true, false, false, true, 32>(
-    BatchedBackwardParams& param, hipStream_t stream);
-
-extern template void run_batched_backward_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, false, true, true, false, 32>(
-    BatchedBackwardParams& param, hipStream_t stream);
-extern template void run_batched_backward_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, false, true, false, false, 32>(
-    BatchedBackwardParams& param, hipStream_t stream);
-extern template void run_batched_backward_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, false, false, false, false, 32>(
-    BatchedBackwardParams& param, hipStream_t stream);
-extern template void run_batched_backward_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, true, true, true, false, 32>(
-    BatchedBackwardParams& param, hipStream_t stream);
-extern template void run_batched_backward_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, true, true, false, false, 32>(
-    BatchedBackwardParams& param, hipStream_t stream);
-extern template void run_batched_backward_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, true, false, false, false, 32>(
-    BatchedBackwardParams& param, hipStream_t stream);
-
-extern template void run_batched_backward_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, false, true, true, true, 64>(
-    BatchedBackwardParams& param, hipStream_t stream);
-extern template void run_batched_backward_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, false, true, false, true, 64>(
-    BatchedBackwardParams& param, hipStream_t stream);
-extern template void run_batched_backward_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, false, false, false, true, 64>(
-    BatchedBackwardParams& param, hipStream_t stream);
-extern template void run_batched_backward_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, true, true, true, true, 64>(
-    BatchedBackwardParams& param, hipStream_t stream);
-extern template void run_batched_backward_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, true, true, false, true, 64>(
-    BatchedBackwardParams& param, hipStream_t stream);
-extern template void run_batched_backward_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, true, false, false, true, 64>(
-    BatchedBackwardParams& param, hipStream_t stream);
-
-extern template void run_batched_backward_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, false, true, true, false, 64>(
-    BatchedBackwardParams& param, hipStream_t stream);
-extern template void run_batched_backward_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, false, true, false, false, 64>(
-    BatchedBackwardParams& param, hipStream_t stream);
-extern template void run_batched_backward_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, false, false, false, false, 64>(
-    BatchedBackwardParams& param, hipStream_t stream);
-extern template void run_batched_backward_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, true, true, true, false, 64>(
-    BatchedBackwardParams& param, hipStream_t stream);
-extern template void run_batched_backward_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, true, true, false, false, 64>(
-    BatchedBackwardParams& param, hipStream_t stream);
-extern template void run_batched_backward_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, true, false, false, false, 64>(
-    BatchedBackwardParams& param, hipStream_t stream);
-
-extern template void run_batched_backward_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, false, true, true, true, 128>(
-    BatchedBackwardParams& param, hipStream_t stream);
-extern template void run_batched_backward_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, false, true, false, true, 128>(
-    BatchedBackwardParams& param, hipStream_t stream);
-extern template void run_batched_backward_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, false, false, false, true, 128>(
-    BatchedBackwardParams& param, hipStream_t stream);
-extern template void run_batched_backward_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, true, true, true, true, 128>(
-    BatchedBackwardParams& param, hipStream_t stream);
-extern template void run_batched_backward_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, true, true, false, true, 128>(
-    BatchedBackwardParams& param, hipStream_t stream);
-extern template void run_batched_backward_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, true, false, false, true, 128>(
-    BatchedBackwardParams& param, hipStream_t stream);
-
-extern template void run_batched_backward_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, false, true, true, false, 128>(
-    BatchedBackwardParams& param, hipStream_t stream);
-extern template void run_batched_backward_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, false, true, false, false, 128>(
-    BatchedBackwardParams& param, hipStream_t stream);
-extern template void run_batched_backward_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, false, false, false, false, 128>(
-    BatchedBackwardParams& param, hipStream_t stream);
-extern template void run_batched_backward_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, true, true, true, false, 128>(
-    BatchedBackwardParams& param, hipStream_t stream);
-extern template void run_batched_backward_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, true, true, false, false, 128>(
-    BatchedBackwardParams& param, hipStream_t stream);
-extern template void run_batched_backward_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, true, false, false, false, 128>(
-    BatchedBackwardParams& param, hipStream_t stream);
-
-extern template void run_batched_backward_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, false, true, true, true, 256>(
-    BatchedBackwardParams& param, hipStream_t stream);
-extern template void run_batched_backward_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, false, true, false, true, 256>(
-    BatchedBackwardParams& param, hipStream_t stream);
-extern template void run_batched_backward_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, false, false, false, true, 256>(
-    BatchedBackwardParams& param, hipStream_t stream);
-extern template void run_batched_backward_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, true, true, true, true, 256>(
-    BatchedBackwardParams& param, hipStream_t stream);
-extern template void run_batched_backward_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, true, true, false, true, 256>(
-    BatchedBackwardParams& param, hipStream_t stream);
-extern template void run_batched_backward_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, true, false, false, true, 256>(
-    BatchedBackwardParams& param, hipStream_t stream);
-
-extern template void run_batched_backward_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, false, true, true, false, 256>(
-    BatchedBackwardParams& param, hipStream_t stream);
-extern template void run_batched_backward_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, false, true, false, false, 256>(
-    BatchedBackwardParams& param, hipStream_t stream);
-extern template void run_batched_backward_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, false, false, false, false, 256>(
-    BatchedBackwardParams& param, hipStream_t stream);
-extern template void run_batched_backward_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, true, true, true, false, 256>(
-    BatchedBackwardParams& param, hipStream_t stream);
-extern template void run_batched_backward_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, true, true, false, false, 256>(
-    BatchedBackwardParams& param, hipStream_t stream);
-extern template void run_batched_backward_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, true, false, false, false, 256>(
-    BatchedBackwardParams& param, hipStream_t stream);
-// clang-format on
+#include "instances/fmha_batched_backward_bf16_instances_ref.hpp"
 
 void batched_backward_bf16(BatchedBackwardParams& param, hipStream_t stream) {
   const bool has_dropout = (param.dropout_prob > 0.0f);
diff --git a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_backward_fp16.cpp b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_backward_fp16.cpp
index e795eb9d3..a226bd5cc 100644
--- a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_backward_fp16.cpp
+++ b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_backward_fp16.cpp
@@ -11,111 +11,7 @@
 #include "ck_tiled_fmha_batched_backward.h"
 #include "ck_tiled_headdim_switch.h"
 
-// clang-format off
-extern template void run_batched_backward_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, false, true, true, true, 32>(
-    BatchedBackwardParams& param, hipStream_t stream);
-extern template void run_batched_backward_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, false, true, false, true, 32>(
-    BatchedBackwardParams& param, hipStream_t stream);
-extern template void run_batched_backward_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, false, false, false, true, 32>(
-    BatchedBackwardParams& param, hipStream_t stream);
-extern template void run_batched_backward_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, true, true, true, true, 32>(
-    BatchedBackwardParams& param, hipStream_t stream);
-extern template void run_batched_backward_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, true, true, false, true, 32>(
-    BatchedBackwardParams& param, hipStream_t stream);
-extern template void run_batched_backward_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, true, false, false, true, 32>(
-    BatchedBackwardParams& param, hipStream_t stream);
-
-extern template void run_batched_backward_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, false, true, true, false, 32>(
-    BatchedBackwardParams& param, hipStream_t stream);
-extern template void run_batched_backward_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, false, true, false, false, 32>(
-    BatchedBackwardParams& param, hipStream_t stream);
-extern template void run_batched_backward_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, false, false, false, false, 32>(
-    BatchedBackwardParams& param, hipStream_t stream);
-extern template void run_batched_backward_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, true, true, true, false, 32>(
-    BatchedBackwardParams& param, hipStream_t stream);
-extern template void run_batched_backward_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, true, true, false, false, 32>(
-    BatchedBackwardParams& param, hipStream_t stream);
-extern template void run_batched_backward_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, true, false, false, false, 32>(
-    BatchedBackwardParams& param, hipStream_t stream);
-
-extern template void run_batched_backward_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, false, true, true, true, 64>(
-    BatchedBackwardParams& param, hipStream_t stream);
-extern template void run_batched_backward_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, false, true, false, true, 64>(
-    BatchedBackwardParams& param, hipStream_t stream);
-extern template void run_batched_backward_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, false, false, false, true, 64>(
-    BatchedBackwardParams& param, hipStream_t stream);
-extern template void run_batched_backward_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, true, true, true, true, 64>(
-    BatchedBackwardParams& param, hipStream_t stream);
-extern template void run_batched_backward_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, true, true, false, true, 64>(
-    BatchedBackwardParams& param, hipStream_t stream);
-extern template void run_batched_backward_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, true, false, false, true, 64>(
-    BatchedBackwardParams& param, hipStream_t stream);
-
-extern template void run_batched_backward_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, false, true, true, false, 64>(
-    BatchedBackwardParams& param, hipStream_t stream);
-extern template void run_batched_backward_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, false, true, false, false, 64>(
-    BatchedBackwardParams& param, hipStream_t stream);
-extern template void run_batched_backward_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, false, false, false, false, 64>(
-    BatchedBackwardParams& param, hipStream_t stream);
-extern template void run_batched_backward_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, true, true, true, false, 64>(
-    BatchedBackwardParams& param, hipStream_t stream);
-extern template void run_batched_backward_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, true, true, false, false, 64>(
-    BatchedBackwardParams& param, hipStream_t stream);
-extern template void run_batched_backward_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, true, false, false, false, 64>(
-    BatchedBackwardParams& param, hipStream_t stream);
-
-extern template void run_batched_backward_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, false, true, true, true, 128>(
-    BatchedBackwardParams& param, hipStream_t stream);
-extern template void run_batched_backward_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, false, true, false, true, 128>(
-    BatchedBackwardParams& param, hipStream_t stream);
-extern template void run_batched_backward_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, false, false, false, true, 128>(
-    BatchedBackwardParams& param, hipStream_t stream);
-extern template void run_batched_backward_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, true, true, true, true, 128>(
-    BatchedBackwardParams& param, hipStream_t stream);
-extern template void run_batched_backward_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, true, true, false, true, 128>(
-    BatchedBackwardParams& param, hipStream_t stream);
-extern template void run_batched_backward_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, true, false, false, true, 128>(
-    BatchedBackwardParams& param, hipStream_t stream);
-
-extern template void run_batched_backward_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, false, true, true, false, 128>(
-    BatchedBackwardParams& param, hipStream_t stream);
-extern template void run_batched_backward_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, false, true, false, false, 128>(
-    BatchedBackwardParams& param, hipStream_t stream);
-extern template void run_batched_backward_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, false, false, false, false, 128>(
-    BatchedBackwardParams& param, hipStream_t stream);
-extern template void run_batched_backward_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, true, true, true, false, 128>(
-    BatchedBackwardParams& param, hipStream_t stream);
-extern template void run_batched_backward_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, true, true, false, false, 128>(
-    BatchedBackwardParams& param, hipStream_t stream);
-extern template void run_batched_backward_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, true, false, false, false, 128>(
-    BatchedBackwardParams& param, hipStream_t stream);
-
-extern template void run_batched_backward_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, false, true, true, true, 256>(
-    BatchedBackwardParams& param, hipStream_t stream);
-extern template void run_batched_backward_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, false, true, false, true, 256>(
-    BatchedBackwardParams& param, hipStream_t stream);
-extern template void run_batched_backward_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, false, false, false, true, 256>(
-    BatchedBackwardParams& param, hipStream_t stream);
-extern template void run_batched_backward_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, true, true, true, true, 256>(
-    BatchedBackwardParams& param, hipStream_t stream);
-extern template void run_batched_backward_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, true, true, false, true, 256>(
-    BatchedBackwardParams& param, hipStream_t stream);
-extern template void run_batched_backward_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, true, false, false, true, 256>(
-    BatchedBackwardParams& param, hipStream_t stream);
-
-extern template void run_batched_backward_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, false, true, true, false, 256>(
-    BatchedBackwardParams& param, hipStream_t stream);
-extern template void run_batched_backward_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, false, true, false, false, 256>(
-    BatchedBackwardParams& param, hipStream_t stream);
-extern template void run_batched_backward_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, false, false, false, false, 256>(
-    BatchedBackwardParams& param, hipStream_t stream);
-extern template void run_batched_backward_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, true, true, true, false, 256>(
-    BatchedBackwardParams& param, hipStream_t stream);
-extern template void run_batched_backward_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, true, true, false, false, 256>(
-    BatchedBackwardParams& param, hipStream_t stream);
-extern template void run_batched_backward_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, true, false, false, false, 256>(
-    BatchedBackwardParams& param, hipStream_t stream);
-// clang-format on
+#include "instances/fmha_batched_backward_fp16_instances_ref.hpp"
 
 void batched_backward_fp16(BatchedBackwardParams& param, hipStream_t stream) {
   const bool has_dropout = (param.dropout_prob > 0.0f);
diff --git a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_forward_bf16.cpp b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_forward_bf16.cpp
index e27552d3e..0dc988cd9 100644
--- a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_forward_bf16.cpp
+++ b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_forward_bf16.cpp
@@ -11,79 +11,7 @@
 #include "ck_tiled_fmha_batched_forward.h"
 #include "ck_tiled_headdim_switch.h"
 
-// clang-format off
-extern template void run_batched_forward_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, false, true, true, 32>(
-    BatchedForwardParams& param, hipStream_t stream);
-extern template void run_batched_forward_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, false, false, true, 32>(
-    BatchedForwardParams& param, hipStream_t stream);
-extern template void run_batched_forward_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, true, true, true, 32>(
-    BatchedForwardParams& param, hipStream_t stream);
-extern template void run_batched_forward_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, true, false, true, 32>(
-    BatchedForwardParams& param, hipStream_t stream);
-
-extern template void run_batched_forward_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, false, true, false, 32>(
-    BatchedForwardParams& param, hipStream_t stream);
-extern template void run_batched_forward_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, false, false, false, 32>(
-    BatchedForwardParams& param, hipStream_t stream);
-extern template void run_batched_forward_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, true, true, false, 32>(
-    BatchedForwardParams& param, hipStream_t stream);
-extern template void run_batched_forward_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, true, false, false, 32>(
-    BatchedForwardParams& param, hipStream_t stream);
-
-extern template void run_batched_forward_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, false, true, true, 64>(
-    BatchedForwardParams& param, hipStream_t stream);
-extern template void run_batched_forward_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, false, false, true, 64>(
-    BatchedForwardParams& param, hipStream_t stream);
-extern template void run_batched_forward_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, true, true, true, 64>(
-    BatchedForwardParams& param, hipStream_t stream);
-extern template void run_batched_forward_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, true, false, true, 64>(
-    BatchedForwardParams& param, hipStream_t stream);
-
-extern template void run_batched_forward_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, false, true, false, 64>(
-    BatchedForwardParams& param, hipStream_t stream);
-extern template void run_batched_forward_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, false, false, false, 64>(
-    BatchedForwardParams& param, hipStream_t stream);
-extern template void run_batched_forward_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, true, true, false, 64>(
-    BatchedForwardParams& param, hipStream_t stream);
-extern template void run_batched_forward_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, true, false, false, 64>(
-    BatchedForwardParams& param, hipStream_t stream);
-
-extern template void run_batched_forward_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, false, true, true, 128>(
-    BatchedForwardParams& param, hipStream_t stream);
-extern template void run_batched_forward_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, false, false, true, 128>(
-    BatchedForwardParams& param, hipStream_t stream);
-extern template void run_batched_forward_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, true, true, true, 128>(
-    BatchedForwardParams& param, hipStream_t stream);
-extern template void run_batched_forward_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, true, false, true, 128>(
-    BatchedForwardParams& param, hipStream_t stream);
-
-extern template void run_batched_forward_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, false, true, false, 128>(
-    BatchedForwardParams& param, hipStream_t stream);
-extern template void run_batched_forward_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, false, false, false, 128>(
-    BatchedForwardParams& param, hipStream_t stream);
-extern template void run_batched_forward_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, true, true, false, 128>(
-    BatchedForwardParams& param, hipStream_t stream);
-extern template void run_batched_forward_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, true, false, false, 128>(
-    BatchedForwardParams& param, hipStream_t stream);
-
-extern template void run_batched_forward_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, false, true, true, 256>(
-    BatchedForwardParams& param, hipStream_t stream);
-extern template void run_batched_forward_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, false, false, true, 256>(
-    BatchedForwardParams& param, hipStream_t stream);
-extern template void run_batched_forward_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, true, true, true, 256>(
-    BatchedForwardParams& param, hipStream_t stream);
-extern template void run_batched_forward_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, true, false, true, 256>(
-    BatchedForwardParams& param, hipStream_t stream);
-
-extern template void run_batched_forward_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, false, true, false, 256>(
-    BatchedForwardParams& param, hipStream_t stream);
-extern template void run_batched_forward_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, false, false, false, 256>(
-    BatchedForwardParams& param, hipStream_t stream);
-extern template void run_batched_forward_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, true, true, false, 256>(
-    BatchedForwardParams& param, hipStream_t stream);
-extern template void run_batched_forward_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, true, false, false, 256>(
-    BatchedForwardParams& param, hipStream_t stream);
-// clang-format on
+#include "instances/fmha_batched_forward_bf16_instances_ref.hpp"
 
 void batched_forward_bf16(BatchedForwardParams& param, hipStream_t stream) {
   const bool has_dropout = (param.dropout_prob > 0.0f);
diff --git a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_forward_fp16.cpp b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_forward_fp16.cpp
index a65f6a2a2..74ad4b74b 100644
--- a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_forward_fp16.cpp
+++ b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_forward_fp16.cpp
@@ -11,79 +11,7 @@
 #include "ck_tiled_fmha_batched_forward.h"
 #include "ck_tiled_headdim_switch.h"
 
-// clang-format off
-extern template void run_batched_forward_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, false, true, true, 32>(
-    BatchedForwardParams& param, hipStream_t stream);
-extern template void run_batched_forward_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, false, false, true, 32>(
-    BatchedForwardParams& param, hipStream_t stream);
-extern template void run_batched_forward_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, true, true, true, 32>(
-    BatchedForwardParams& param, hipStream_t stream);
-extern template void run_batched_forward_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, true, false, true, 32>(
-    BatchedForwardParams& param, hipStream_t stream);
-
-extern template void run_batched_forward_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, false, true, false, 32>(
-    BatchedForwardParams& param, hipStream_t stream);
-extern template void run_batched_forward_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, false, false, false, 32>(
-    BatchedForwardParams& param, hipStream_t stream);
-extern template void run_batched_forward_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, true, true, false, 32>(
-    BatchedForwardParams& param, hipStream_t stream);
-extern template void run_batched_forward_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, true, false, false, 32>(
-    BatchedForwardParams& param, hipStream_t stream);
-
-extern template void run_batched_forward_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, false, true, true, 64>(
-    BatchedForwardParams& param, hipStream_t stream);
-extern template void run_batched_forward_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, false, false, true, 64>(
-    BatchedForwardParams& param, hipStream_t stream);
-extern template void run_batched_forward_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, true, true, true, 64>(
-    BatchedForwardParams& param, hipStream_t stream);
-extern template void run_batched_forward_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, true, false, true, 64>(
-    BatchedForwardParams& param, hipStream_t stream);
-
-extern template void run_batched_forward_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, false, true, false, 64>(
-    BatchedForwardParams& param, hipStream_t stream);
-extern template void run_batched_forward_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, false, false, false, 64>(
-    BatchedForwardParams& param, hipStream_t stream);
-extern template void run_batched_forward_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, true, true, false, 64>(
-    BatchedForwardParams& param, hipStream_t stream);
-extern template void run_batched_forward_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, true, false, false, 64>(
-    BatchedForwardParams& param, hipStream_t stream);
-
-extern template void run_batched_forward_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, false, true, true, 128>(
-    BatchedForwardParams& param, hipStream_t stream);
-extern template void run_batched_forward_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, false, false, true, 128>(
-    BatchedForwardParams& param, hipStream_t stream);
-extern template void run_batched_forward_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, true, true, true, 128>(
-    BatchedForwardParams& param, hipStream_t stream);
-extern template void run_batched_forward_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, true, false, true, 128>(
-    BatchedForwardParams& param, hipStream_t stream);
-
-extern template void run_batched_forward_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, false, true, false, 128>(
-    BatchedForwardParams& param, hipStream_t stream);
-extern template void run_batched_forward_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, false, false, false, 128>(
-    BatchedForwardParams& param, hipStream_t stream);
-extern template void run_batched_forward_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, true, true, false, 128>(
-    BatchedForwardParams& param, hipStream_t stream);
-extern template void run_batched_forward_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, true, false, false, 128>(
-    BatchedForwardParams& param, hipStream_t stream);
-
-extern template void run_batched_forward_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, false, true, true, 256>(
-    BatchedForwardParams& param, hipStream_t stream);
-extern template void run_batched_forward_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, false, false, true, 256>(
-    BatchedForwardParams& param, hipStream_t stream);
-extern template void run_batched_forward_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, true, true, true, 256>(
-    BatchedForwardParams& param, hipStream_t stream);
-extern template void run_batched_forward_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, true, false, true, 256>(
-    BatchedForwardParams& param, hipStream_t stream);
-
-extern template void run_batched_forward_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, false, true, false, 256>(
-    BatchedForwardParams& param, hipStream_t stream);
-extern template void run_batched_forward_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, false, false, false, 256>(
-    BatchedForwardParams& param, hipStream_t stream);
-extern template void run_batched_forward_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, true, true, false, 256>(
-    BatchedForwardParams& param, hipStream_t stream);
-extern template void run_batched_forward_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, true, false, false, 256>(
-    BatchedForwardParams& param, hipStream_t stream);
-// clang-format on
+#include "instances/fmha_batched_forward_fp16_instances_ref.hpp"
 
 void batched_forward_fp16(BatchedForwardParams& param, hipStream_t stream) {
   const bool has_dropout = (param.dropout_prob > 0.0f);
diff --git a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_infer_bf16.cpp b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_infer_bf16.cpp
index b362a780f..1a0123196 100644
--- a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_infer_bf16.cpp
+++ b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_infer_bf16.cpp
@@ -10,79 +10,7 @@
 #include "ck_tiled_bool_switch.h"
 #include "ck_tiled_fmha_batched_infer.h"
 
-// clang-format off
-extern template void run_batched_infer_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, false, true, true, 32>(
-    BatchedForwardParams& param, hipStream_t stream);
-extern template void run_batched_infer_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, false, false, true, 32>(
-    BatchedForwardParams& param, hipStream_t stream);
-extern template void run_batched_infer_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, true, true, true, 32>(
-    BatchedForwardParams& param, hipStream_t stream);
-extern template void run_batched_infer_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, true, false, true, 32>(
-    BatchedForwardParams& param, hipStream_t stream);
-
-extern template void run_batched_infer_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, false, true, false, 32>(
-    BatchedForwardParams& param, hipStream_t stream);
-extern template void run_batched_infer_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, false, false, false, 32>(
-    BatchedForwardParams& param, hipStream_t stream);
-extern template void run_batched_infer_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, true, true, false, 32>(
-    BatchedForwardParams& param, hipStream_t stream);
-extern template void run_batched_infer_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, true, false, false, 32>(
-    BatchedForwardParams& param, hipStream_t stream);
-
-extern template void run_batched_infer_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, false, true, true, 64>(
-    BatchedForwardParams& param, hipStream_t stream);
-extern template void run_batched_infer_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, false, false, true, 64>(
-    BatchedForwardParams& param, hipStream_t stream);
-extern template void run_batched_infer_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, true, true, true, 64>(
-    BatchedForwardParams& param, hipStream_t stream);
-extern template void run_batched_infer_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, true, false, true, 64>(
-    BatchedForwardParams& param, hipStream_t stream);
-
-extern template void run_batched_infer_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, false, true, false, 64>(
-    BatchedForwardParams& param, hipStream_t stream);
-extern template void run_batched_infer_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, false, false, false, 64>(
-    BatchedForwardParams& param, hipStream_t stream);
-extern template void run_batched_infer_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, true, true, false, 64>(
-    BatchedForwardParams& param, hipStream_t stream);
-extern template void run_batched_infer_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, true, false, false, 64>(
-    BatchedForwardParams& param, hipStream_t stream);
-
-extern template void run_batched_infer_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, false, true, true, 128>(
-    BatchedForwardParams& param, hipStream_t stream);
-extern template void run_batched_infer_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, false, false, true, 128>(
-    BatchedForwardParams& param, hipStream_t stream);
-extern template void run_batched_infer_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, true, true, true, 128>(
-    BatchedForwardParams& param, hipStream_t stream);
-extern template void run_batched_infer_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, true, false, true, 128>(
-    BatchedForwardParams& param, hipStream_t stream);
-
-extern template void run_batched_infer_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, false, true, false, 128>(
-    BatchedForwardParams& param, hipStream_t stream);
-extern template void run_batched_infer_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, false, false, false, 128>(
-    BatchedForwardParams& param, hipStream_t stream);
-extern template void run_batched_infer_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, true, true, false, 128>(
-    BatchedForwardParams& param, hipStream_t stream);
-extern template void run_batched_infer_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, true, false, false, 128>(
-    BatchedForwardParams& param, hipStream_t stream);
-
-extern template void run_batched_infer_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, false, true, true, 256>(
-    BatchedForwardParams& param, hipStream_t stream);
-extern template void run_batched_infer_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, false, false, true, 256>(
-    BatchedForwardParams& param, hipStream_t stream);
-extern template void run_batched_infer_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, true, true, true, 256>(
-    BatchedForwardParams& param, hipStream_t stream);
-extern template void run_batched_infer_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, true, false, true, 256>(
-    BatchedForwardParams& param, hipStream_t stream);
-
-extern template void run_batched_infer_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, false, true, false, 256>(
-    BatchedForwardParams& param, hipStream_t stream);
-extern template void run_batched_infer_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, false, false, false, 256>(
-    BatchedForwardParams& param, hipStream_t stream);
-extern template void run_batched_infer_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, true, true, false, 256>(
-    BatchedForwardParams& param, hipStream_t stream);
-extern template void run_batched_infer_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, true, false, false, 256>(
-    BatchedForwardParams& param, hipStream_t stream);
-// clang-format on
+#include "instances/fmha_batched_infer_bf16_instances_ref.hpp"
 
 void batched_infer_bf16(BatchedForwardParams& param, hipStream_t stream) {
   const bool has_dropout = (param.dropout_prob > 0.0f);
diff --git a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_infer_fp16.cpp b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_infer_fp16.cpp
index e55003c60..c21a9ad57 100644
--- a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_infer_fp16.cpp
+++ b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_infer_fp16.cpp
@@ -10,79 +10,7 @@
 #include "ck_tiled_bool_switch.h"
 #include "ck_tiled_fmha_batched_infer.h"
 
-// clang-format off
-extern template void run_batched_infer_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, false, true, true, 32>(
-    BatchedForwardParams& param, hipStream_t stream);
-extern template void run_batched_infer_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, false, false, true, 32>(
-    BatchedForwardParams& param, hipStream_t stream);
-extern template void run_batched_infer_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, true, true, true, 32>(
-    BatchedForwardParams& param, hipStream_t stream);
-extern template void run_batched_infer_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, true, false, true, 32>(
-    BatchedForwardParams& param, hipStream_t stream);
-
-extern template void run_batched_infer_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, false, true, false, 32>(
-    BatchedForwardParams& param, hipStream_t stream);
-extern template void run_batched_infer_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, false, false, false, 32>(
-    BatchedForwardParams& param, hipStream_t stream);
-extern template void run_batched_infer_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, true, true, false, 32>(
-    BatchedForwardParams& param, hipStream_t stream);
-extern template void run_batched_infer_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, true, false, false, 32>(
-    BatchedForwardParams& param, hipStream_t stream);
-
-extern template void run_batched_infer_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, false, true, true, 64>(
-    BatchedForwardParams& param, hipStream_t stream);
-extern template void run_batched_infer_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, false, false, true, 64>(
-    BatchedForwardParams& param, hipStream_t stream);
-extern template void run_batched_infer_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, true, true, true, 64>(
-    BatchedForwardParams& param, hipStream_t stream);
-extern template void run_batched_infer_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, true, false, true, 64>(
-    BatchedForwardParams& param, hipStream_t stream);
-
-extern template void run_batched_infer_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, false, true, false, 64>(
-    BatchedForwardParams& param, hipStream_t stream);
-extern template void run_batched_infer_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, false, false, false, 64>(
-    BatchedForwardParams& param, hipStream_t stream);
-extern template void run_batched_infer_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, true, true, false, 64>(
-    BatchedForwardParams& param, hipStream_t stream);
-extern template void run_batched_infer_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, true, false, false, 64>(
-    BatchedForwardParams& param, hipStream_t stream);
-
-extern template void run_batched_infer_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, false, true, true, 128>(
-    BatchedForwardParams& param, hipStream_t stream);
-extern template void run_batched_infer_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, false, false, true, 128>(
-    BatchedForwardParams& param, hipStream_t stream);
-extern template void run_batched_infer_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, true, true, true, 128>(
-    BatchedForwardParams& param, hipStream_t stream);
-extern template void run_batched_infer_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, true, false, true, 128>(
-    BatchedForwardParams& param, hipStream_t stream);
-
-extern template void run_batched_infer_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, false, true, false, 128>(
-    BatchedForwardParams& param, hipStream_t stream);
-extern template void run_batched_infer_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, false, false, false, 128>(
-    BatchedForwardParams& param, hipStream_t stream);
-extern template void run_batched_infer_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, true, true, false, 128>(
-    BatchedForwardParams& param, hipStream_t stream);
-extern template void run_batched_infer_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, true, false, false, 128>(
-    BatchedForwardParams& param, hipStream_t stream);
-
-extern template void run_batched_infer_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, false, true, true, 256>(
-    BatchedForwardParams& param, hipStream_t stream);
-extern template void run_batched_infer_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, false, false, true, 256>(
-    BatchedForwardParams& param, hipStream_t stream);
-extern template void run_batched_infer_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, true, true, true, 256>(
-    BatchedForwardParams& param, hipStream_t stream);
-extern template void run_batched_infer_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, true, false, true, 256>(
-    BatchedForwardParams& param, hipStream_t stream);
-
-extern template void run_batched_infer_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, false, true, false, 256>(
-    BatchedForwardParams& param, hipStream_t stream);
-extern template void run_batched_infer_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, false, false, false, 256>(
-    BatchedForwardParams& param, hipStream_t stream);
-extern template void run_batched_infer_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, true, true, false, 256>(
-    BatchedForwardParams& param, hipStream_t stream);
-extern template void run_batched_infer_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, true, false, false, 256>(
-    BatchedForwardParams& param, hipStream_t stream);
-// clang-format on
+#include "instances/fmha_batched_infer_fp16_instances_ref.hpp"
 
 void batched_infer_fp16(BatchedForwardParams& param, hipStream_t stream) {
   const bool has_dropout = (param.dropout_prob > 0.0f);
diff --git a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_backward_bf16.cpp b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_backward_bf16.cpp
index 4250bba47..51dd8a507 100644
--- a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_backward_bf16.cpp
+++ b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_backward_bf16.cpp
@@ -11,111 +11,7 @@
 #include "ck_tiled_fmha_grouped_backward.h"
 #include "ck_tiled_headdim_switch.h"
 
-// clang-format off
-extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, false, true, true, true, 32>(
-    GroupedBackwardParams& param, hipStream_t stream);
-extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, false, true, false, true, 32>(
-    GroupedBackwardParams& param, hipStream_t stream);
-extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, false, false, false, true, 32>(
-    GroupedBackwardParams& param, hipStream_t stream);
-extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, true, true, true, true, 32>(
-    GroupedBackwardParams& param, hipStream_t stream);
-extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, true, true, false, true, 32>(
-    GroupedBackwardParams& param, hipStream_t stream);
-extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, true, false, false, true, 32>(
-    GroupedBackwardParams& param, hipStream_t stream);
-
-extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, false, true, true, false, 32>(
-    GroupedBackwardParams& param, hipStream_t stream);
-extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, false, true, false, false, 32>(
-    GroupedBackwardParams& param, hipStream_t stream);
-extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, false, false, false, false, 32>(
-    GroupedBackwardParams& param, hipStream_t stream);
-extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, true, true, true, false, 32>(
-    GroupedBackwardParams& param, hipStream_t stream);
-extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, true, true, false, false, 32>(
-    GroupedBackwardParams& param, hipStream_t stream);
-extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, true, false, false, false, 32>(
-    GroupedBackwardParams& param, hipStream_t stream);
-
-extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, false, true, true, true, 64>(
-    GroupedBackwardParams& param, hipStream_t stream);
-extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, false, true, false, true, 64>(
-    GroupedBackwardParams& param, hipStream_t stream);
-extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, false, false, false, true, 64>(
-    GroupedBackwardParams& param, hipStream_t stream);
-extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, true, true, true, true, 64>(
-    GroupedBackwardParams& param, hipStream_t stream);
-extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, true, true, false, true, 64>(
-    GroupedBackwardParams& param, hipStream_t stream);
-extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, true, false, false, true, 64>(
-    GroupedBackwardParams& param, hipStream_t stream);
-
-extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, false, true, true, false, 64>(
-    GroupedBackwardParams& param, hipStream_t stream);
-extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, false, true, false, false, 64>(
-    GroupedBackwardParams& param, hipStream_t stream);
-extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, false, false, false, false, 64>(
-    GroupedBackwardParams& param, hipStream_t stream);
-extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, true, true, true, false, 64>(
-    GroupedBackwardParams& param, hipStream_t stream);
-extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, true, true, false, false, 64>(
-    GroupedBackwardParams& param, hipStream_t stream);
-extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, true, false, false, false, 64>(
-    GroupedBackwardParams& param, hipStream_t stream);
-
-extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, false, true, true, true, 128>(
-    GroupedBackwardParams& param, hipStream_t stream);
-extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, false, true, false, true, 128>(
-    GroupedBackwardParams& param, hipStream_t stream);
-extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, false, false, false, true, 128>(
-    GroupedBackwardParams& param, hipStream_t stream);
-extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, true, true, true, true, 128>(
-    GroupedBackwardParams& param, hipStream_t stream);
-extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, true, true, false, true, 128>(
-    GroupedBackwardParams& param, hipStream_t stream);
-extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, true, false, false, true, 128>(
-    GroupedBackwardParams& param, hipStream_t stream);
-
-extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, false, true, true, false, 128>(
-    GroupedBackwardParams& param, hipStream_t stream);
-extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, false, true, false, false, 128>(
-    GroupedBackwardParams& param, hipStream_t stream);
-extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, false, false, false, false, 128>(
-    GroupedBackwardParams& param, hipStream_t stream);
-extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, true, true, true, false, 128>(
-    GroupedBackwardParams& param, hipStream_t stream);
-extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, true, true, false, false, 128>(
-    GroupedBackwardParams& param, hipStream_t stream);
-extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, true, false, false, false, 128>(
-    GroupedBackwardParams& param, hipStream_t stream);
-
-extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, false, true, true, true, 256>(
-    GroupedBackwardParams& param, hipStream_t stream);
-extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, false, true, false, true, 256>(
-    GroupedBackwardParams& param, hipStream_t stream);
-extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, false, false, false, true, 256>(
-    GroupedBackwardParams& param, hipStream_t stream);
-extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, true, true, true, true, 256>(
-    GroupedBackwardParams& param, hipStream_t stream);
-extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, true, true, false, true, 256>(
-    GroupedBackwardParams& param, hipStream_t stream);
-extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, true, false, false, true, 256>(
-    GroupedBackwardParams& param, hipStream_t stream);
-
-extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, false, true, true, false, 256>(
-    GroupedBackwardParams& param, hipStream_t stream);
-extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, false, true, false, false, 256>(
-    GroupedBackwardParams& param, hipStream_t stream);
-extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, false, false, false, false, 256>(
-    GroupedBackwardParams& param, hipStream_t stream);
-extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, true, true, true, false, 256>(
-    GroupedBackwardParams& param, hipStream_t stream);
-extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, true, true, false, false, 256>(
-    GroupedBackwardParams& param, hipStream_t stream);
-extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, true, false, false, false, 256>(
-    GroupedBackwardParams& param, hipStream_t stream);
-// clang-format on
+#include "instances/fmha_grouped_backward_bf16_instances_ref.hpp"
 
 void grouped_backward_bf16(GroupedBackwardParams& param, hipStream_t stream) {
   const bool has_dropout = (param.dropout_prob > 0.0f);
diff --git a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_backward_fp16.cpp b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_backward_fp16.cpp
index baca24387..6fa6f1be9 100644
--- a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_backward_fp16.cpp
+++ b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_backward_fp16.cpp
@@ -11,111 +11,7 @@
 #include "ck_tiled_fmha_grouped_backward.h"
 #include "ck_tiled_headdim_switch.h"
 
-// clang-format off
-extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, false, true, true, true, 32>(
-    GroupedBackwardParams& param, hipStream_t stream);
-extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, false, true, false, true, 32>(
-    GroupedBackwardParams& param, hipStream_t stream);
-extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, false, false, false, true, 32>(
-    GroupedBackwardParams& param, hipStream_t stream);
-extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, true, true, true, true, 32>(
-    GroupedBackwardParams& param, hipStream_t stream);
-extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, true, true, false, true, 32>(
-    GroupedBackwardParams& param, hipStream_t stream);
-extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, true, false, false, true, 32>(
-    GroupedBackwardParams& param, hipStream_t stream);
-
-extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, false, true, true, false, 32>(
-    GroupedBackwardParams& param, hipStream_t stream);
-extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, false, true, false, false, 32>(
-    GroupedBackwardParams& param, hipStream_t stream);
-extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, false, false, false, false, 32>(
-    GroupedBackwardParams& param, hipStream_t stream);
-extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, true, true, true, false, 32>(
-    GroupedBackwardParams& param, hipStream_t stream);
-extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, true, true, false, false, 32>(
-    GroupedBackwardParams& param, hipStream_t stream);
-extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, true, false, false, false, 32>(
-    GroupedBackwardParams& param, hipStream_t stream);
-
-extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, false, true, true, true, 64>(
-    GroupedBackwardParams& param, hipStream_t stream);
-extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, false, true, false, true, 64>(
-    GroupedBackwardParams& param, hipStream_t stream);
-extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, false, false, false, true, 64>(
-    GroupedBackwardParams& param, hipStream_t stream);
-extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, true, true, true, true, 64>(
-    GroupedBackwardParams& param, hipStream_t stream);
-extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, true, true, false, true, 64>(
-    GroupedBackwardParams& param, hipStream_t stream);
-extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, true, false, false, true, 64>(
-    GroupedBackwardParams& param, hipStream_t stream);
-
-extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, false, true, true, false, 64>(
-    GroupedBackwardParams& param, hipStream_t stream);
-extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, false, true, false, false, 64>(
-    GroupedBackwardParams& param, hipStream_t stream);
-extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, false, false, false, false, 64>(
-    GroupedBackwardParams& param, hipStream_t stream);
-extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, true, true, true, false, 64>(
-    GroupedBackwardParams& param, hipStream_t stream);
-extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, true, true, false, false, 64>(
-    GroupedBackwardParams& param, hipStream_t stream);
-extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, true, false, false, false, 64>(
-    GroupedBackwardParams& param, hipStream_t stream);
-
-extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, false, true, true, true, 128>(
-    GroupedBackwardParams& param, hipStream_t stream);
-extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, false, true, false, true, 128>(
-    GroupedBackwardParams& param, hipStream_t stream);
-extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, false, false, false, true, 128>(
-    GroupedBackwardParams& param, hipStream_t stream);
-extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, true, true, true, true, 128>(
-    GroupedBackwardParams& param, hipStream_t stream);
-extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, true, true, false, true, 128>(
-    GroupedBackwardParams& param, hipStream_t stream);
-extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, true, false, false, true, 128>(
-    GroupedBackwardParams& param, hipStream_t stream);
-
-extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, false, true, true, false, 128>(
-    GroupedBackwardParams& param, hipStream_t stream);
-extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, false, true, false, false, 128>(
-    GroupedBackwardParams& param, hipStream_t stream);
-extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, false, false, false, false, 128>(
-    GroupedBackwardParams& param, hipStream_t stream);
-extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, true, true, true, false, 128>(
-    GroupedBackwardParams& param, hipStream_t stream);
-extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, true, true, false, false, 128>(
-    GroupedBackwardParams& param, hipStream_t stream);
-extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, true, false, false, false, 128>(
-    GroupedBackwardParams& param, hipStream_t stream);
-
-extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, false, true, true, true, 256>(
-    GroupedBackwardParams& param, hipStream_t stream);
-extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, false, true, false, true, 256>(
-    GroupedBackwardParams& param, hipStream_t stream);
-extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, false, false, false, true, 256>(
-    GroupedBackwardParams& param, hipStream_t stream);
-extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, true, true, true, true, 256>(
-    GroupedBackwardParams& param, hipStream_t stream);
-extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, true, true, false, true, 256>(
-    GroupedBackwardParams& param, hipStream_t stream);
-extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, true, false, false, true, 256>(
-    GroupedBackwardParams& param, hipStream_t stream);
-
-extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, false, true, true, false, 256>(
-    GroupedBackwardParams& param, hipStream_t stream);
-extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, false, true, false, false, 256>(
-    GroupedBackwardParams& param, hipStream_t stream);
-extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, false, false, false, false, 256>(
-    GroupedBackwardParams& param, hipStream_t stream);
-extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, true, true, true, false, 256>(
-    GroupedBackwardParams& param, hipStream_t stream);
-extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, true, true, false, false, 256>(
-    GroupedBackwardParams& param, hipStream_t stream);
-extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, true, false, false, false, 256>(
-    GroupedBackwardParams& param, hipStream_t stream);
-// clang-format on
+#include "instances/fmha_grouped_backward_fp16_instances_ref.hpp"
 
 void grouped_backward_fp16(GroupedBackwardParams& param, hipStream_t stream) {
   const bool has_dropout = (param.dropout_prob > 0.0f);
diff --git a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_forward_bf16.cpp b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_forward_bf16.cpp
index e04af2e8a..ff14095fa 100644
--- a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_forward_bf16.cpp
+++ b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_forward_bf16.cpp
@@ -11,79 +11,7 @@
 #include "ck_tiled_fmha_grouped_forward.h"
 #include "ck_tiled_headdim_switch.h"
 
-// clang-format off
-extern template void run_grouped_forward_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, false, true, true, 32>(
-    GroupedForwardParams& param, hipStream_t stream);
-extern template void run_grouped_forward_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, false, false, true, 32>(
-    GroupedForwardParams& param, hipStream_t stream);
-extern template void run_grouped_forward_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, true, true, true, 32>(
-    GroupedForwardParams& param, hipStream_t stream);
-extern template void run_grouped_forward_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, true, false, true, 32>(
-    GroupedForwardParams& param, hipStream_t stream);
-
-extern template void run_grouped_forward_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, false, true, false, 32>(
-    GroupedForwardParams& param, hipStream_t stream);
-extern template void run_grouped_forward_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, false, false, false, 32>(
-    GroupedForwardParams& param, hipStream_t stream);
-extern template void run_grouped_forward_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, true, true, false, 32>(
-    GroupedForwardParams& param, hipStream_t stream);
-extern template void run_grouped_forward_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, true, false, false, 32>(
-    GroupedForwardParams& param, hipStream_t stream);
-
-extern template void run_grouped_forward_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, false, true, true, 64>(
-    GroupedForwardParams& param, hipStream_t stream);
-extern template void run_grouped_forward_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, false, false, true, 64>(
-    GroupedForwardParams& param, hipStream_t stream);
-extern template void run_grouped_forward_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, true, true, true, 64>(
-    GroupedForwardParams& param, hipStream_t stream);
-extern template void run_grouped_forward_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, true, false, true, 64>(
-    GroupedForwardParams& param, hipStream_t stream);
-
-extern template void run_grouped_forward_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, false, true, false, 64>(
-    GroupedForwardParams& param, hipStream_t stream);
-extern template void run_grouped_forward_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, false, false, false, 64>(
-    GroupedForwardParams& param, hipStream_t stream);
-extern template void run_grouped_forward_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, true, true, false, 64>(
-    GroupedForwardParams& param, hipStream_t stream);
-extern template void run_grouped_forward_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, true, false, false, 64>(
-    GroupedForwardParams& param, hipStream_t stream);
-
-extern template void run_grouped_forward_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, false, true, true, 128>(
-    GroupedForwardParams& param, hipStream_t stream);
-extern template void run_grouped_forward_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, false, false, true, 128>(
-    GroupedForwardParams& param, hipStream_t stream);
-extern template void run_grouped_forward_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, true, true, true, 128>(
-    GroupedForwardParams& param, hipStream_t stream);
-extern template void run_grouped_forward_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, true, false, true, 128>(
-    GroupedForwardParams& param, hipStream_t stream);
-
-extern template void run_grouped_forward_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, false, true, false, 128>(
-    GroupedForwardParams& param, hipStream_t stream);
-extern template void run_grouped_forward_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, false, false, false, 128>(
-    GroupedForwardParams& param, hipStream_t stream);
-extern template void run_grouped_forward_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, true, true, false, 128>(
-    GroupedForwardParams& param, hipStream_t stream);
-extern template void run_grouped_forward_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, true, false, false, 128>(
-    GroupedForwardParams& param, hipStream_t stream);
-
-extern template void run_grouped_forward_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, false, true, true, 256>(
-    GroupedForwardParams& param, hipStream_t stream);
-extern template void run_grouped_forward_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, false, false, true, 256>(
-    GroupedForwardParams& param, hipStream_t stream);
-extern template void run_grouped_forward_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, true, true, true, 256>(
-    GroupedForwardParams& param, hipStream_t stream);
-extern template void run_grouped_forward_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, true, false, true, 256>(
-    GroupedForwardParams& param, hipStream_t stream);
-
-extern template void run_grouped_forward_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, false, true, false, 256>(
-    GroupedForwardParams& param, hipStream_t stream);
-extern template void run_grouped_forward_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, false, false, false, 256>(
-    GroupedForwardParams& param, hipStream_t stream);
-extern template void run_grouped_forward_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, true, true, false, 256>(
-    GroupedForwardParams& param, hipStream_t stream);
-extern template void run_grouped_forward_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, true, false, false, 256>(
-    GroupedForwardParams& param, hipStream_t stream);
-// clang-format on
+#include "instances/fmha_grouped_forward_bf16_instances_ref.hpp"
 
 void grouped_forward_bf16(GroupedForwardParams& param, hipStream_t stream) {
   const bool has_dropout = (param.dropout_prob > 0.0f);
diff --git a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_forward_fp16.cpp b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_forward_fp16.cpp
index 13276415e..1ac4c195b 100644
--- a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_forward_fp16.cpp
+++ b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_forward_fp16.cpp
@@ -11,79 +11,7 @@
 #include "ck_tiled_fmha_grouped_forward.h"
 #include "ck_tiled_headdim_switch.h"
 
-// clang-format off
-extern template void run_grouped_forward_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, false, true, true, 32>(
-    GroupedForwardParams& param, hipStream_t stream);
-extern template void run_grouped_forward_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, false, false, true, 32>(
-    GroupedForwardParams& param, hipStream_t stream);
-extern template void run_grouped_forward_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, true, true, true, 32>(
-    GroupedForwardParams& param, hipStream_t stream);
-extern template void run_grouped_forward_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, true, false, true, 32>(
-    GroupedForwardParams& param, hipStream_t stream);
-
-extern template void run_grouped_forward_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, false, true, false, 32>(
-    GroupedForwardParams& param, hipStream_t stream);
-extern template void run_grouped_forward_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, false, false, false, 32>(
-    GroupedForwardParams& param, hipStream_t stream);
-extern template void run_grouped_forward_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, true, true, false, 32>(
-    GroupedForwardParams& param, hipStream_t stream);
-extern template void run_grouped_forward_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, true, false, false, 32>(
-    GroupedForwardParams& param, hipStream_t stream);
-
-extern template void run_grouped_forward_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, false, true, true, 64>(
-    GroupedForwardParams& param, hipStream_t stream);
-extern template void run_grouped_forward_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, false, false, true, 64>(
-    GroupedForwardParams& param, hipStream_t stream);
-extern template void run_grouped_forward_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, true, true, true, 64>(
-    GroupedForwardParams& param, hipStream_t stream);
-extern template void run_grouped_forward_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, true, false, true, 64>(
-    GroupedForwardParams& param, hipStream_t stream);
-
-extern template void run_grouped_forward_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, false, true, false, 64>(
-    GroupedForwardParams& param, hipStream_t stream);
-extern template void run_grouped_forward_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, false, false, false, 64>(
-    GroupedForwardParams& param, hipStream_t stream);
-extern template void run_grouped_forward_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, true, true, false, 64>(
-    GroupedForwardParams& param, hipStream_t stream);
-extern template void run_grouped_forward_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, true, false, false, 64>(
-    GroupedForwardParams& param, hipStream_t stream);
-
-extern template void run_grouped_forward_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, false, true, true, 128>(
-    GroupedForwardParams& param, hipStream_t stream);
-extern template void run_grouped_forward_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, false, false, true, 128>(
-    GroupedForwardParams& param, hipStream_t stream);
-extern template void run_grouped_forward_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, true, true, true, 128>(
-    GroupedForwardParams& param, hipStream_t stream);
-extern template void run_grouped_forward_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, true, false, true, 128>(
-    GroupedForwardParams& param, hipStream_t stream);
-
-extern template void run_grouped_forward_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, false, true, false, 128>(
-    GroupedForwardParams& param, hipStream_t stream);
-extern template void run_grouped_forward_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, false, false, false, 128>(
-    GroupedForwardParams& param, hipStream_t stream);
-extern template void run_grouped_forward_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, true, true, false, 128>(
-    GroupedForwardParams& param, hipStream_t stream);
-extern template void run_grouped_forward_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, true, false, false, 128>(
-    GroupedForwardParams& param, hipStream_t stream);
-
-extern template void run_grouped_forward_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, false, true, true, 256>(
-    GroupedForwardParams& param, hipStream_t stream);
-extern template void run_grouped_forward_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, false, false, true, 256>(
-    GroupedForwardParams& param, hipStream_t stream);
-extern template void run_grouped_forward_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, true, true, true, 256>(
-    GroupedForwardParams& param, hipStream_t stream);
-extern template void run_grouped_forward_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, true, false, true, 256>(
-    GroupedForwardParams& param, hipStream_t stream);
-
-extern template void run_grouped_forward_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, false, true, false, 256>(
-    GroupedForwardParams& param, hipStream_t stream);
-extern template void run_grouped_forward_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, false, false, false, 256>(
-    GroupedForwardParams& param, hipStream_t stream);
-extern template void run_grouped_forward_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, true, true, false, 256>(
-    GroupedForwardParams& param, hipStream_t stream);
-extern template void run_grouped_forward_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, true, false, false, 256>(
-    GroupedForwardParams& param, hipStream_t stream);
-// clang-format on
+#include "instances/fmha_grouped_forward_fp16_instances_ref.hpp"
 
 void grouped_forward_fp16(GroupedForwardParams& param, hipStream_t stream) {
   const bool has_dropout = (param.dropout_prob > 0.0f);
diff --git a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_infer_bf16.cpp b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_infer_bf16.cpp
index 5b0fb5b37..f780f7de1 100644
--- a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_infer_bf16.cpp
+++ b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_infer_bf16.cpp
@@ -10,79 +10,7 @@
 #include "ck_tiled_bool_switch.h"
 #include "ck_tiled_fmha_grouped_infer.h"
 
-// clang-format off
-extern template void run_grouped_infer_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, false, true, true, 32>(
-    GroupedForwardParams& param, hipStream_t stream);
-extern template void run_grouped_infer_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, false, false, true, 32>(
-    GroupedForwardParams& param, hipStream_t stream);
-extern template void run_grouped_infer_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, true, true, true, 32>(
-    GroupedForwardParams& param, hipStream_t stream);
-extern template void run_grouped_infer_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, true, false, true, 32>(
-    GroupedForwardParams& param, hipStream_t stream);
-
-extern template void run_grouped_infer_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, false, true, false, 32>(
-    GroupedForwardParams& param, hipStream_t stream);
-extern template void run_grouped_infer_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, false, false, false, 32>(
-    GroupedForwardParams& param, hipStream_t stream);
-extern template void run_grouped_infer_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, true, true, false, 32>(
-    GroupedForwardParams& param, hipStream_t stream);
-extern template void run_grouped_infer_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, true, false, false, 32>(
-    GroupedForwardParams& param, hipStream_t stream);
-
-extern template void run_grouped_infer_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, false, true, true, 64>(
-    GroupedForwardParams& param, hipStream_t stream);
-extern template void run_grouped_infer_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, false, false, true, 64>(
-    GroupedForwardParams& param, hipStream_t stream);
-extern template void run_grouped_infer_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, true, true, true, 64>(
-    GroupedForwardParams& param, hipStream_t stream);
-extern template void run_grouped_infer_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, true, false, true, 64>(
-    GroupedForwardParams& param, hipStream_t stream);
-
-extern template void run_grouped_infer_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, false, true, false, 64>(
-    GroupedForwardParams& param, hipStream_t stream);
-extern template void run_grouped_infer_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, false, false, false, 64>(
-    GroupedForwardParams& param, hipStream_t stream);
-extern template void run_grouped_infer_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, true, true, false, 64>(
-    GroupedForwardParams& param, hipStream_t stream);
-extern template void run_grouped_infer_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, true, false, false, 64>(
-    GroupedForwardParams& param, hipStream_t stream);
-
-extern template void run_grouped_infer_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, false, true, true, 128>(
-    GroupedForwardParams& param, hipStream_t stream);
-extern template void run_grouped_infer_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, false, false, true, 128>(
-    GroupedForwardParams& param, hipStream_t stream);
-extern template void run_grouped_infer_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, true, true, true, 128>(
-    GroupedForwardParams& param, hipStream_t stream);
-extern template void run_grouped_infer_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, true, false, true, 128>(
-    GroupedForwardParams& param, hipStream_t stream);
-
-extern template void run_grouped_infer_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, false, true, false, 128>(
-    GroupedForwardParams& param, hipStream_t stream);
-extern template void run_grouped_infer_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, false, false, false, 128>(
-    GroupedForwardParams& param, hipStream_t stream);
-extern template void run_grouped_infer_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, true, true, false, 128>(
-    GroupedForwardParams& param, hipStream_t stream);
-extern template void run_grouped_infer_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, true, false, false, 128>(
-    GroupedForwardParams& param, hipStream_t stream);
-
-extern template void run_grouped_infer_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, false, true, true, 256>(
-    GroupedForwardParams& param, hipStream_t stream);
-extern template void run_grouped_infer_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, false, false, true, 256>(
-    GroupedForwardParams& param, hipStream_t stream);
-extern template void run_grouped_infer_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, true, true, true, 256>(
-    GroupedForwardParams& param, hipStream_t stream);
-extern template void run_grouped_infer_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, true, false, true, 256>(
-    GroupedForwardParams& param, hipStream_t stream);
-
-extern template void run_grouped_infer_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, false, true, false, 256>(
-    GroupedForwardParams& param, hipStream_t stream);
-extern template void run_grouped_infer_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, false, false, false, 256>(
-    GroupedForwardParams& param, hipStream_t stream);
-extern template void run_grouped_infer_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, true, true, false, 256>(
-    GroupedForwardParams& param, hipStream_t stream);
-extern template void run_grouped_infer_causalmask_bias_dropout_dispatch<ck_tile::bf16_t, true, false, false, 256>(
-    GroupedForwardParams& param, hipStream_t stream);
-// clang-format on
+#include "instances/fmha_grouped_infer_bf16_instances_ref.hpp"
 
 void grouped_infer_bf16(GroupedForwardParams& param, hipStream_t stream) {
   const bool has_dropout = (param.dropout_prob > 0.0f);
diff --git a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_infer_fp16.cpp b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_infer_fp16.cpp
index fa0a407f1..e538029c5 100644
--- a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_infer_fp16.cpp
+++ b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_infer_fp16.cpp
@@ -10,79 +10,7 @@
 #include "ck_tiled_bool_switch.h"
 #include "ck_tiled_fmha_grouped_infer.h"
 
-// clang-format off
-extern template void run_grouped_infer_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, false, true, true, 32>(
-    GroupedForwardParams& param, hipStream_t stream);
-extern template void run_grouped_infer_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, false, false, true, 32>(
-    GroupedForwardParams& param, hipStream_t stream);
-extern template void run_grouped_infer_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, true, true, true, 32>(
-    GroupedForwardParams& param, hipStream_t stream);
-extern template void run_grouped_infer_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, true, false, true, 32>(
-    GroupedForwardParams& param, hipStream_t stream);
-
-extern template void run_grouped_infer_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, false, true, false, 32>(
-    GroupedForwardParams& param, hipStream_t stream);
-extern template void run_grouped_infer_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, false, false, false, 32>(
-    GroupedForwardParams& param, hipStream_t stream);
-extern template void run_grouped_infer_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, true, true, false, 32>(
-    GroupedForwardParams& param, hipStream_t stream);
-extern template void run_grouped_infer_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, true, false, false, 32>(
-    GroupedForwardParams& param, hipStream_t stream);
-
-extern template void run_grouped_infer_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, false, true, true, 64>(
-    GroupedForwardParams& param, hipStream_t stream);
-extern template void run_grouped_infer_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, false, false, true, 64>(
-    GroupedForwardParams& param, hipStream_t stream);
-extern template void run_grouped_infer_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, true, true, true, 64>(
-    GroupedForwardParams& param, hipStream_t stream);
-extern template void run_grouped_infer_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, true, false, true, 64>(
-    GroupedForwardParams& param, hipStream_t stream);
-
-extern template void run_grouped_infer_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, false, true, false, 64>(
-    GroupedForwardParams& param, hipStream_t stream);
-extern template void run_grouped_infer_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, false, false, false, 64>(
-    GroupedForwardParams& param, hipStream_t stream);
-extern template void run_grouped_infer_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, true, true, false, 64>(
-    GroupedForwardParams& param, hipStream_t stream);
-extern template void run_grouped_infer_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, true, false, false, 64>(
-    GroupedForwardParams& param, hipStream_t stream);
-
-extern template void run_grouped_infer_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, false, true, true, 128>(
-    GroupedForwardParams& param, hipStream_t stream);
-extern template void run_grouped_infer_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, false, false, true, 128>(
-    GroupedForwardParams& param, hipStream_t stream);
-extern template void run_grouped_infer_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, true, true, true, 128>(
-    GroupedForwardParams& param, hipStream_t stream);
-extern template void run_grouped_infer_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, true, false, true, 128>(
-    GroupedForwardParams& param, hipStream_t stream);
-
-extern template void run_grouped_infer_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, false, true, false, 128>(
-    GroupedForwardParams& param, hipStream_t stream);
-extern template void run_grouped_infer_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, false, false, false, 128>(
-    GroupedForwardParams& param, hipStream_t stream);
-extern template void run_grouped_infer_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, true, true, false, 128>(
-    GroupedForwardParams& param, hipStream_t stream);
-extern template void run_grouped_infer_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, true, false, false, 128>(
-    GroupedForwardParams& param, hipStream_t stream);
-
-extern template void run_grouped_infer_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, false, true, true, 256>(
-    GroupedForwardParams& param, hipStream_t stream);
-extern template void run_grouped_infer_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, false, false, true, 256>(
-    GroupedForwardParams& param, hipStream_t stream);
-extern template void run_grouped_infer_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, true, true, true, 256>(
-    GroupedForwardParams& param, hipStream_t stream);
-extern template void run_grouped_infer_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, true, false, true, 256>(
-    GroupedForwardParams& param, hipStream_t stream);
-
-extern template void run_grouped_infer_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, false, true, false, 256>(
-    GroupedForwardParams& param, hipStream_t stream);
-extern template void run_grouped_infer_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, false, false, false, 256>(
-    GroupedForwardParams& param, hipStream_t stream);
-extern template void run_grouped_infer_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, true, true, false, 256>(
-    GroupedForwardParams& param, hipStream_t stream);
-extern template void run_grouped_infer_causalmask_bias_dropout_dispatch<ck_tile::fp16_t, true, false, false, 256>(
-    GroupedForwardParams& param, hipStream_t stream);
-// clang-format on
+#include "instances/fmha_grouped_infer_fp16_instances_ref.hpp"
 
 void grouped_infer_fp16(GroupedForwardParams& param, hipStream_t stream) {
   const bool has_dropout = (param.dropout_prob > 0.0f);
diff --git a/xformers/csrc/attention/hip_fmha/generate_instances.py b/xformers/csrc/attention/hip_fmha/generate_instances.py
index 0975520ef..2fb6891b4 100644
--- a/xformers/csrc/attention/hip_fmha/generate_instances.py
+++ b/xformers/csrc/attention/hip_fmha/generate_instances.py
@@ -71,7 +71,7 @@
 FMHA_BACKWARD_INSTANCE_FNAME = "fmha_{mode}_backward_{dtype_str}_{has_or_no_causalmask_str}_"\
                                "{has_or_no_bias_str}_{has_or_no_biasgrad_str}_{has_or_no_dropout_str}_{max_k_str}.cpp"
 
-FMHA_INSTANCE_REF_FNAME = "fmha_{mode}_{function}_{dtype}.hpp"
+FMHA_INSTANCE_REF_FNAME = "instances/fmha_{mode}_{function}_{dtype}_instances_ref.hpp"
 
 BOOL_MAP = {
     True : "true",
@@ -153,6 +153,38 @@ def create_infer_instances(instance_dir: Path) -> None:
                             (instance_dir / fname).write_text(FMHA_COPYRIGHT_HEADER + infer_instance_inc + "\n" + infer_instance)
 
 
+def create_infer_instances_ref(instance_dir: Path) -> None:
+    for mode in ["batched", "grouped"]:
+        for dtype in ["fp16", "bf16"]:
+            ref_fname = FMHA_INSTANCE_REF_FNAME.format(
+                mode=mode,
+                function="infer",
+                dtype=dtype,
+            )
+            infer_instance_inc = FMHA_INFER_INSTANCE_TEMPLATE_INC.format(
+                mode=mode,
+                dtype_file=TYPE_FNAME_MAP[dtype],
+            )
+            with open(ref_fname, 'a') as file:
+                file.write(FMHA_COPYRIGHT_HEADER)
+                file.write(infer_instance_inc)
+                for max_k in [32, 64, 128, 256]:
+                    for has_bias in [True, False]:
+                        for has_dropout in [True, False]:
+                            for has_causalmask in [True, False]:
+                                infer_instance = FMHA_INFER_INSTANCE_TEMPLATE.format(
+                                    extern="extern ",
+                                    mode=mode,
+                                    dtype=TYPE_CTYPE_MAP[dtype],
+                                    has_causalmask=BOOL_MAP[has_causalmask],
+                                    has_bias=BOOL_MAP[has_bias],
+                                    has_dropout=BOOL_MAP[has_dropout],
+                                    max_k=max_k,
+                                    cap_mode=MODE_NAME_MAP[mode],
+                                )
+                                file.write(infer_instance)
+
+
 def create_forward_instances(instance_dir: Path) -> None:
     for mode in ["batched", "grouped"]:
         for dtype in ["fp16", "bf16"]:
@@ -185,6 +217,38 @@ def create_forward_instances(instance_dir: Path) -> None:
                             (instance_dir / fname).write_text(FMHA_COPYRIGHT_HEADER + forward_instance_inc + "\n" + forward_instance)
 
 
+def create_forward_instances_ref(instance_dir: Path) -> None:
+    for mode in ["batched", "grouped"]:
+        for dtype in ["fp16", "bf16"]:
+            ref_fname = FMHA_INSTANCE_REF_FNAME.format(
+                mode=mode,
+                function="forward",
+                dtype=dtype,
+            )
+            forward_instance_inc = FMHA_FORWARD_INSTANCE_TEMPLATE_INC.format(
+                mode=mode,
+                dtype_file=TYPE_FNAME_MAP[dtype],
+            )
+            with open(ref_fname, 'a') as file:
+                file.write(FMHA_COPYRIGHT_HEADER)
+                file.write(forward_instance_inc)
+                for max_k in [32, 64, 128, 256]:
+                    for has_bias in [True, False]:
+                        for has_dropout in [True, False]:
+                            for has_causalmask in [True, False]:
+                                forward_instance = FMHA_FORWARD_INSTANCE_TEMPLATE.format(
+                                    extern="extern ",
+                                    mode=mode,
+                                    dtype=TYPE_CTYPE_MAP[dtype],
+                                    has_causalmask=BOOL_MAP[has_causalmask],
+                                    has_bias=BOOL_MAP[has_bias],
+                                    has_dropout=BOOL_MAP[has_dropout],
+                                    max_k=max_k,
+                                    cap_mode=MODE_NAME_MAP[mode],
+                                )
+                                file.write(forward_instance)
+
+
 def create_backward_instances(instance_dir: Path) -> None:
     for mode in ["batched", "grouped"]:
         for dtype in ["fp16", "bf16"]:
@@ -219,10 +283,46 @@ def create_backward_instances(instance_dir: Path) -> None:
                             (instance_dir / fname).write_text(FMHA_COPYRIGHT_HEADER + backward_instance_inc + "\n" + backward_instance)
 
 
+def create_backward_instances_ref(instance_dir: Path) -> None:
+    for mode in ["batched", "grouped"]:
+        for dtype in ["fp16", "bf16"]:
+            ref_fname = FMHA_INSTANCE_REF_FNAME.format(
+                mode=mode,
+                function="backward",
+                dtype=dtype,
+            )
+            backward_instance_inc = FMHA_BACKWARD_INSTANCE_TEMPLATE_INC.format(
+                mode=mode,
+                dtype_file=TYPE_FNAME_MAP[dtype],
+            )
+            with open(ref_fname, 'a') as file:
+                file.write(FMHA_COPYRIGHT_HEADER)
+                file.write(backward_instance_inc)
+                for max_k in [32, 64, 128, 256]:
+                    for has_bias, has_bias_grad in [[True, False], [True, True], [False, False]]:
+                        for has_dropout in [True, False]:
+                            for has_causalmask in [True, False]:
+                                backward_instance = FMHA_BACKWARD_INSTANCE_TEMPLATE.format(
+                                    extern="extern ",
+                                    mode=mode,
+                                    dtype=TYPE_CTYPE_MAP[dtype],
+                                    has_causalmask=BOOL_MAP[has_causalmask],
+                                    has_bias=BOOL_MAP[has_bias],
+                                    has_bias_grad=BOOL_MAP[has_bias_grad],
+                                    has_dropout=BOOL_MAP[has_dropout],
+                                    max_k=max_k,
+                                    cap_mode=MODE_NAME_MAP[mode],
+                                )
+                                file.write(backward_instance)
+
+
 if __name__ == "__main__":
     this_dir = os.path.dirname(__file__)
     output_dir = Path(this_dir) / "instances"
     output_dir.mkdir(parents=True, exist_ok=True)
     create_infer_instances(output_dir)
+    create_infer_instances_ref(output_dir)
     create_forward_instances(output_dir)
+    create_forward_instances_ref(output_dir)
     create_backward_instances(output_dir)
+    create_backward_instances_ref(output_dir)
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_instances_ref.hpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_instances_ref.hpp
new file mode 100644
index 000000000..06f82124a
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_instances_ref.hpp
@@ -0,0 +1,396 @@
+
+/*
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
+ */
+
+#include <ck_tile/core/numeric/bfloat16.hpp>
+#include "ck_tiled_fmha_batched_backward.h"
+
+extern template void run_batched_backward_causalmask_bias_dropout_dispatch<
+    ck_tile::bf16_t,
+    true,
+    true,
+    false,
+    true,
+    32>(BatchedBackwardParams& param, hipStream_t stream);
+
+extern template void run_batched_backward_causalmask_bias_dropout_dispatch<
+    ck_tile::bf16_t,
+    false,
+    true,
+    false,
+    true,
+    32>(BatchedBackwardParams& param, hipStream_t stream);
+
+extern template void run_batched_backward_causalmask_bias_dropout_dispatch<
+    ck_tile::bf16_t,
+    true,
+    true,
+    false,
+    false,
+    32>(BatchedBackwardParams& param, hipStream_t stream);
+
+extern template void run_batched_backward_causalmask_bias_dropout_dispatch<
+    ck_tile::bf16_t,
+    false,
+    true,
+    false,
+    false,
+    32>(BatchedBackwardParams& param, hipStream_t stream);
+
+extern template void run_batched_backward_causalmask_bias_dropout_dispatch<
+    ck_tile::bf16_t,
+    true,
+    true,
+    true,
+    true,
+    32>(BatchedBackwardParams& param, hipStream_t stream);
+
+extern template void run_batched_backward_causalmask_bias_dropout_dispatch<
+    ck_tile::bf16_t,
+    false,
+    true,
+    true,
+    true,
+    32>(BatchedBackwardParams& param, hipStream_t stream);
+
+extern template void run_batched_backward_causalmask_bias_dropout_dispatch<
+    ck_tile::bf16_t,
+    true,
+    true,
+    true,
+    false,
+    32>(BatchedBackwardParams& param, hipStream_t stream);
+
+extern template void run_batched_backward_causalmask_bias_dropout_dispatch<
+    ck_tile::bf16_t,
+    false,
+    true,
+    true,
+    false,
+    32>(BatchedBackwardParams& param, hipStream_t stream);
+
+extern template void run_batched_backward_causalmask_bias_dropout_dispatch<
+    ck_tile::bf16_t,
+    true,
+    false,
+    false,
+    true,
+    32>(BatchedBackwardParams& param, hipStream_t stream);
+
+extern template void run_batched_backward_causalmask_bias_dropout_dispatch<
+    ck_tile::bf16_t,
+    false,
+    false,
+    false,
+    true,
+    32>(BatchedBackwardParams& param, hipStream_t stream);
+
+extern template void run_batched_backward_causalmask_bias_dropout_dispatch<
+    ck_tile::bf16_t,
+    true,
+    false,
+    false,
+    false,
+    32>(BatchedBackwardParams& param, hipStream_t stream);
+
+extern template void run_batched_backward_causalmask_bias_dropout_dispatch<
+    ck_tile::bf16_t,
+    false,
+    false,
+    false,
+    false,
+    32>(BatchedBackwardParams& param, hipStream_t stream);
+
+extern template void run_batched_backward_causalmask_bias_dropout_dispatch<
+    ck_tile::bf16_t,
+    true,
+    true,
+    false,
+    true,
+    64>(BatchedBackwardParams& param, hipStream_t stream);
+
+extern template void run_batched_backward_causalmask_bias_dropout_dispatch<
+    ck_tile::bf16_t,
+    false,
+    true,
+    false,
+    true,
+    64>(BatchedBackwardParams& param, hipStream_t stream);
+
+extern template void run_batched_backward_causalmask_bias_dropout_dispatch<
+    ck_tile::bf16_t,
+    true,
+    true,
+    false,
+    false,
+    64>(BatchedBackwardParams& param, hipStream_t stream);
+
+extern template void run_batched_backward_causalmask_bias_dropout_dispatch<
+    ck_tile::bf16_t,
+    false,
+    true,
+    false,
+    false,
+    64>(BatchedBackwardParams& param, hipStream_t stream);
+
+extern template void run_batched_backward_causalmask_bias_dropout_dispatch<
+    ck_tile::bf16_t,
+    true,
+    true,
+    true,
+    true,
+    64>(BatchedBackwardParams& param, hipStream_t stream);
+
+extern template void run_batched_backward_causalmask_bias_dropout_dispatch<
+    ck_tile::bf16_t,
+    false,
+    true,
+    true,
+    true,
+    64>(BatchedBackwardParams& param, hipStream_t stream);
+
+extern template void run_batched_backward_causalmask_bias_dropout_dispatch<
+    ck_tile::bf16_t,
+    true,
+    true,
+    true,
+    false,
+    64>(BatchedBackwardParams& param, hipStream_t stream);
+
+extern template void run_batched_backward_causalmask_bias_dropout_dispatch<
+    ck_tile::bf16_t,
+    false,
+    true,
+    true,
+    false,
+    64>(BatchedBackwardParams& param, hipStream_t stream);
+
+extern template void run_batched_backward_causalmask_bias_dropout_dispatch<
+    ck_tile::bf16_t,
+    true,
+    false,
+    false,
+    true,
+    64>(BatchedBackwardParams& param, hipStream_t stream);
+
+extern template void run_batched_backward_causalmask_bias_dropout_dispatch<
+    ck_tile::bf16_t,
+    false,
+    false,
+    false,
+    true,
+    64>(BatchedBackwardParams& param, hipStream_t stream);
+
+extern template void run_batched_backward_causalmask_bias_dropout_dispatch<
+    ck_tile::bf16_t,
+    true,
+    false,
+    false,
+    false,
+    64>(BatchedBackwardParams& param, hipStream_t stream);
+
+extern template void run_batched_backward_causalmask_bias_dropout_dispatch<
+    ck_tile::bf16_t,
+    false,
+    false,
+    false,
+    false,
+    64>(BatchedBackwardParams& param, hipStream_t stream);
+
+extern template void run_batched_backward_causalmask_bias_dropout_dispatch<
+    ck_tile::bf16_t,
+    true,
+    true,
+    false,
+    true,
+    128>(BatchedBackwardParams& param, hipStream_t stream);
+
+extern template void run_batched_backward_causalmask_bias_dropout_dispatch<
+    ck_tile::bf16_t,
+    false,
+    true,
+    false,
+    true,
+    128>(BatchedBackwardParams& param, hipStream_t stream);
+
+extern template void run_batched_backward_causalmask_bias_dropout_dispatch<
+    ck_tile::bf16_t,
+    true,
+    true,
+    false,
+    false,
+    128>(BatchedBackwardParams& param, hipStream_t stream);
+
+extern template void run_batched_backward_causalmask_bias_dropout_dispatch<
+    ck_tile::bf16_t,
+    false,
+    true,
+    false,
+    false,
+    128>(BatchedBackwardParams& param, hipStream_t stream);
+
+extern template void run_batched_backward_causalmask_bias_dropout_dispatch<
+    ck_tile::bf16_t,
+    true,
+    true,
+    true,
+    true,
+    128>(BatchedBackwardParams& param, hipStream_t stream);
+
+extern template void run_batched_backward_causalmask_bias_dropout_dispatch<
+    ck_tile::bf16_t,
+    false,
+    true,
+    true,
+    true,
+    128>(BatchedBackwardParams& param, hipStream_t stream);
+
+extern template void run_batched_backward_causalmask_bias_dropout_dispatch<
+    ck_tile::bf16_t,
+    true,
+    true,
+    true,
+    false,
+    128>(BatchedBackwardParams& param, hipStream_t stream);
+
+extern template void run_batched_backward_causalmask_bias_dropout_dispatch<
+    ck_tile::bf16_t,
+    false,
+    true,
+    true,
+    false,
+    128>(BatchedBackwardParams& param, hipStream_t stream);
+
+extern template void run_batched_backward_causalmask_bias_dropout_dispatch<
+    ck_tile::bf16_t,
+    true,
+    false,
+    false,
+    true,
+    128>(BatchedBackwardParams& param, hipStream_t stream);
+
+extern template void run_batched_backward_causalmask_bias_dropout_dispatch<
+    ck_tile::bf16_t,
+    false,
+    false,
+    false,
+    true,
+    128>(BatchedBackwardParams& param, hipStream_t stream);
+
+extern template void run_batched_backward_causalmask_bias_dropout_dispatch<
+    ck_tile::bf16_t,
+    true,
+    false,
+    false,
+    false,
+    128>(BatchedBackwardParams& param, hipStream_t stream);
+
+extern template void run_batched_backward_causalmask_bias_dropout_dispatch<
+    ck_tile::bf16_t,
+    false,
+    false,
+    false,
+    false,
+    128>(BatchedBackwardParams& param, hipStream_t stream);
+
+extern template void run_batched_backward_causalmask_bias_dropout_dispatch<
+    ck_tile::bf16_t,
+    true,
+    true,
+    false,
+    true,
+    256>(BatchedBackwardParams& param, hipStream_t stream);
+
+extern template void run_batched_backward_causalmask_bias_dropout_dispatch<
+    ck_tile::bf16_t,
+    false,
+    true,
+    false,
+    true,
+    256>(BatchedBackwardParams& param, hipStream_t stream);
+
+extern template void run_batched_backward_causalmask_bias_dropout_dispatch<
+    ck_tile::bf16_t,
+    true,
+    true,
+    false,
+    false,
+    256>(BatchedBackwardParams& param, hipStream_t stream);
+
+extern template void run_batched_backward_causalmask_bias_dropout_dispatch<
+    ck_tile::bf16_t,
+    false,
+    true,
+    false,
+    false,
+    256>(BatchedBackwardParams& param, hipStream_t stream);
+
+extern template void run_batched_backward_causalmask_bias_dropout_dispatch<
+    ck_tile::bf16_t,
+    true,
+    true,
+    true,
+    true,
+    256>(BatchedBackwardParams& param, hipStream_t stream);
+
+extern template void run_batched_backward_causalmask_bias_dropout_dispatch<
+    ck_tile::bf16_t,
+    false,
+    true,
+    true,
+    true,
+    256>(BatchedBackwardParams& param, hipStream_t stream);
+
+extern template void run_batched_backward_causalmask_bias_dropout_dispatch<
+    ck_tile::bf16_t,
+    true,
+    true,
+    true,
+    false,
+    256>(BatchedBackwardParams& param, hipStream_t stream);
+
+extern template void run_batched_backward_causalmask_bias_dropout_dispatch<
+    ck_tile::bf16_t,
+    false,
+    true,
+    true,
+    false,
+    256>(BatchedBackwardParams& param, hipStream_t stream);
+
+extern template void run_batched_backward_causalmask_bias_dropout_dispatch<
+    ck_tile::bf16_t,
+    true,
+    false,
+    false,
+    true,
+    256>(BatchedBackwardParams& param, hipStream_t stream);
+
+extern template void run_batched_backward_causalmask_bias_dropout_dispatch<
+    ck_tile::bf16_t,
+    false,
+    false,
+    false,
+    true,
+    256>(BatchedBackwardParams& param, hipStream_t stream);
+
+extern template void run_batched_backward_causalmask_bias_dropout_dispatch<
+    ck_tile::bf16_t,
+    true,
+    false,
+    false,
+    false,
+    256>(BatchedBackwardParams& param, hipStream_t stream);
+
+extern template void run_batched_backward_causalmask_bias_dropout_dispatch<
+    ck_tile::bf16_t,
+    false,
+    false,
+    false,
+    false,
+    256>(BatchedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_instances_ref.hpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_instances_ref.hpp
new file mode 100644
index 000000000..d47f8cc1e
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_instances_ref.hpp
@@ -0,0 +1,396 @@
+
+/*
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
+ */
+
+#include <ck_tile/core/numeric/half.hpp>
+#include "ck_tiled_fmha_batched_backward.h"
+
+extern template void run_batched_backward_causalmask_bias_dropout_dispatch<
+    ck_tile::fp16_t,
+    true,
+    true,
+    false,
+    true,
+    32>(BatchedBackwardParams& param, hipStream_t stream);
+
+extern template void run_batched_backward_causalmask_bias_dropout_dispatch<
+    ck_tile::fp16_t,
+    false,
+    true,
+    false,
+    true,
+    32>(BatchedBackwardParams& param, hipStream_t stream);
+
+extern template void run_batched_backward_causalmask_bias_dropout_dispatch<
+    ck_tile::fp16_t,
+    true,
+    true,
+    false,
+    false,
+    32>(BatchedBackwardParams& param, hipStream_t stream);
+
+extern template void run_batched_backward_causalmask_bias_dropout_dispatch<
+    ck_tile::fp16_t,
+    false,
+    true,
+    false,
+    false,
+    32>(BatchedBackwardParams& param, hipStream_t stream);
+
+extern template void run_batched_backward_causalmask_bias_dropout_dispatch<
+    ck_tile::fp16_t,
+    true,
+    true,
+    true,
+    true,
+    32>(BatchedBackwardParams& param, hipStream_t stream);
+
+extern template void run_batched_backward_causalmask_bias_dropout_dispatch<
+    ck_tile::fp16_t,
+    false,
+    true,
+    true,
+    true,
+    32>(BatchedBackwardParams& param, hipStream_t stream);
+
+extern template void run_batched_backward_causalmask_bias_dropout_dispatch<
+    ck_tile::fp16_t,
+    true,
+    true,
+    true,
+    false,
+    32>(BatchedBackwardParams& param, hipStream_t stream);
+
+extern template void run_batched_backward_causalmask_bias_dropout_dispatch<
+    ck_tile::fp16_t,
+    false,
+    true,
+    true,
+    false,
+    32>(BatchedBackwardParams& param, hipStream_t stream);
+
+extern template void run_batched_backward_causalmask_bias_dropout_dispatch<
+    ck_tile::fp16_t,
+    true,
+    false,
+    false,
+    true,
+    32>(BatchedBackwardParams& param, hipStream_t stream);
+
+extern template void run_batched_backward_causalmask_bias_dropout_dispatch<
+    ck_tile::fp16_t,
+    false,
+    false,
+    false,
+    true,
+    32>(BatchedBackwardParams& param, hipStream_t stream);
+
+extern template void run_batched_backward_causalmask_bias_dropout_dispatch<
+    ck_tile::fp16_t,
+    true,
+    false,
+    false,
+    false,
+    32>(BatchedBackwardParams& param, hipStream_t stream);
+
+extern template void run_batched_backward_causalmask_bias_dropout_dispatch<
+    ck_tile::fp16_t,
+    false,
+    false,
+    false,
+    false,
+    32>(BatchedBackwardParams& param, hipStream_t stream);
+
+extern template void run_batched_backward_causalmask_bias_dropout_dispatch<
+    ck_tile::fp16_t,
+    true,
+    true,
+    false,
+    true,
+    64>(BatchedBackwardParams& param, hipStream_t stream);
+
+extern template void run_batched_backward_causalmask_bias_dropout_dispatch<
+    ck_tile::fp16_t,
+    false,
+    true,
+    false,
+    true,
+    64>(BatchedBackwardParams& param, hipStream_t stream);
+
+extern template void run_batched_backward_causalmask_bias_dropout_dispatch<
+    ck_tile::fp16_t,
+    true,
+    true,
+    false,
+    false,
+    64>(BatchedBackwardParams& param, hipStream_t stream);
+
+extern template void run_batched_backward_causalmask_bias_dropout_dispatch<
+    ck_tile::fp16_t,
+    false,
+    true,
+    false,
+    false,
+    64>(BatchedBackwardParams& param, hipStream_t stream);
+
+extern template void run_batched_backward_causalmask_bias_dropout_dispatch<
+    ck_tile::fp16_t,
+    true,
+    true,
+    true,
+    true,
+    64>(BatchedBackwardParams& param, hipStream_t stream);
+
+extern template void run_batched_backward_causalmask_bias_dropout_dispatch<
+    ck_tile::fp16_t,
+    false,
+    true,
+    true,
+    true,
+    64>(BatchedBackwardParams& param, hipStream_t stream);
+
+extern template void run_batched_backward_causalmask_bias_dropout_dispatch<
+    ck_tile::fp16_t,
+    true,
+    true,
+    true,
+    false,
+    64>(BatchedBackwardParams& param, hipStream_t stream);
+
+extern template void run_batched_backward_causalmask_bias_dropout_dispatch<
+    ck_tile::fp16_t,
+    false,
+    true,
+    true,
+    false,
+    64>(BatchedBackwardParams& param, hipStream_t stream);
+
+extern template void run_batched_backward_causalmask_bias_dropout_dispatch<
+    ck_tile::fp16_t,
+    true,
+    false,
+    false,
+    true,
+    64>(BatchedBackwardParams& param, hipStream_t stream);
+
+extern template void run_batched_backward_causalmask_bias_dropout_dispatch<
+    ck_tile::fp16_t,
+    false,
+    false,
+    false,
+    true,
+    64>(BatchedBackwardParams& param, hipStream_t stream);
+
+extern template void run_batched_backward_causalmask_bias_dropout_dispatch<
+    ck_tile::fp16_t,
+    true,
+    false,
+    false,
+    false,
+    64>(BatchedBackwardParams& param, hipStream_t stream);
+
+extern template void run_batched_backward_causalmask_bias_dropout_dispatch<
+    ck_tile::fp16_t,
+    false,
+    false,
+    false,
+    false,
+    64>(BatchedBackwardParams& param, hipStream_t stream);
+
+extern template void run_batched_backward_causalmask_bias_dropout_dispatch<
+    ck_tile::fp16_t,
+    true,
+    true,
+    false,
+    true,
+    128>(BatchedBackwardParams& param, hipStream_t stream);
+
+extern template void run_batched_backward_causalmask_bias_dropout_dispatch<
+    ck_tile::fp16_t,
+    false,
+    true,
+    false,
+    true,
+    128>(BatchedBackwardParams& param, hipStream_t stream);
+
+extern template void run_batched_backward_causalmask_bias_dropout_dispatch<
+    ck_tile::fp16_t,
+    true,
+    true,
+    false,
+    false,
+    128>(BatchedBackwardParams& param, hipStream_t stream);
+
+extern template void run_batched_backward_causalmask_bias_dropout_dispatch<
+    ck_tile::fp16_t,
+    false,
+    true,
+    false,
+    false,
+    128>(BatchedBackwardParams& param, hipStream_t stream);
+
+extern template void run_batched_backward_causalmask_bias_dropout_dispatch<
+    ck_tile::fp16_t,
+    true,
+    true,
+    true,
+    true,
+    128>(BatchedBackwardParams& param, hipStream_t stream);
+
+extern template void run_batched_backward_causalmask_bias_dropout_dispatch<
+    ck_tile::fp16_t,
+    false,
+    true,
+    true,
+    true,
+    128>(BatchedBackwardParams& param, hipStream_t stream);
+
+extern template void run_batched_backward_causalmask_bias_dropout_dispatch<
+    ck_tile::fp16_t,
+    true,
+    true,
+    true,
+    false,
+    128>(BatchedBackwardParams& param, hipStream_t stream);
+
+extern template void run_batched_backward_causalmask_bias_dropout_dispatch<
+    ck_tile::fp16_t,
+    false,
+    true,
+    true,
+    false,
+    128>(BatchedBackwardParams& param, hipStream_t stream);
+
+extern template void run_batched_backward_causalmask_bias_dropout_dispatch<
+    ck_tile::fp16_t,
+    true,
+    false,
+    false,
+    true,
+    128>(BatchedBackwardParams& param, hipStream_t stream);
+
+extern template void run_batched_backward_causalmask_bias_dropout_dispatch<
+    ck_tile::fp16_t,
+    false,
+    false,
+    false,
+    true,
+    128>(BatchedBackwardParams& param, hipStream_t stream);
+
+extern template void run_batched_backward_causalmask_bias_dropout_dispatch<
+    ck_tile::fp16_t,
+    true,
+    false,
+    false,
+    false,
+    128>(BatchedBackwardParams& param, hipStream_t stream);
+
+extern template void run_batched_backward_causalmask_bias_dropout_dispatch<
+    ck_tile::fp16_t,
+    false,
+    false,
+    false,
+    false,
+    128>(BatchedBackwardParams& param, hipStream_t stream);
+
+extern template void run_batched_backward_causalmask_bias_dropout_dispatch<
+    ck_tile::fp16_t,
+    true,
+    true,
+    false,
+    true,
+    256>(BatchedBackwardParams& param, hipStream_t stream);
+
+extern template void run_batched_backward_causalmask_bias_dropout_dispatch<
+    ck_tile::fp16_t,
+    false,
+    true,
+    false,
+    true,
+    256>(BatchedBackwardParams& param, hipStream_t stream);
+
+extern template void run_batched_backward_causalmask_bias_dropout_dispatch<
+    ck_tile::fp16_t,
+    true,
+    true,
+    false,
+    false,
+    256>(BatchedBackwardParams& param, hipStream_t stream);
+
+extern template void run_batched_backward_causalmask_bias_dropout_dispatch<
+    ck_tile::fp16_t,
+    false,
+    true,
+    false,
+    false,
+    256>(BatchedBackwardParams& param, hipStream_t stream);
+
+extern template void run_batched_backward_causalmask_bias_dropout_dispatch<
+    ck_tile::fp16_t,
+    true,
+    true,
+    true,
+    true,
+    256>(BatchedBackwardParams& param, hipStream_t stream);
+
+extern template void run_batched_backward_causalmask_bias_dropout_dispatch<
+    ck_tile::fp16_t,
+    false,
+    true,
+    true,
+    true,
+    256>(BatchedBackwardParams& param, hipStream_t stream);
+
+extern template void run_batched_backward_causalmask_bias_dropout_dispatch<
+    ck_tile::fp16_t,
+    true,
+    true,
+    true,
+    false,
+    256>(BatchedBackwardParams& param, hipStream_t stream);
+
+extern template void run_batched_backward_causalmask_bias_dropout_dispatch<
+    ck_tile::fp16_t,
+    false,
+    true,
+    true,
+    false,
+    256>(BatchedBackwardParams& param, hipStream_t stream);
+
+extern template void run_batched_backward_causalmask_bias_dropout_dispatch<
+    ck_tile::fp16_t,
+    true,
+    false,
+    false,
+    true,
+    256>(BatchedBackwardParams& param, hipStream_t stream);
+
+extern template void run_batched_backward_causalmask_bias_dropout_dispatch<
+    ck_tile::fp16_t,
+    false,
+    false,
+    false,
+    true,
+    256>(BatchedBackwardParams& param, hipStream_t stream);
+
+extern template void run_batched_backward_causalmask_bias_dropout_dispatch<
+    ck_tile::fp16_t,
+    true,
+    false,
+    false,
+    false,
+    256>(BatchedBackwardParams& param, hipStream_t stream);
+
+extern template void run_batched_backward_causalmask_bias_dropout_dispatch<
+    ck_tile::fp16_t,
+    false,
+    false,
+    false,
+    false,
+    256>(BatchedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bf16_instances_ref.hpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bf16_instances_ref.hpp
new file mode 100644
index 000000000..8fab725be
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bf16_instances_ref.hpp
@@ -0,0 +1,236 @@
+
+/*
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
+ */
+
+#include <ck_tile/core/numeric/bfloat16.hpp>
+#include "ck_tiled_fmha_batched_forward.h"
+
+extern template void run_batched_forward_causalmask_bias_dropout_dispatch<
+    ck_tile::bf16_t,
+    true,
+    true,
+    true,
+    32>(BatchedForwardParams& param, hipStream_t stream);
+
+extern template void run_batched_forward_causalmask_bias_dropout_dispatch<
+    ck_tile::bf16_t,
+    false,
+    true,
+    true,
+    32>(BatchedForwardParams& param, hipStream_t stream);
+
+extern template void run_batched_forward_causalmask_bias_dropout_dispatch<
+    ck_tile::bf16_t,
+    true,
+    true,
+    false,
+    32>(BatchedForwardParams& param, hipStream_t stream);
+
+extern template void run_batched_forward_causalmask_bias_dropout_dispatch<
+    ck_tile::bf16_t,
+    false,
+    true,
+    false,
+    32>(BatchedForwardParams& param, hipStream_t stream);
+
+extern template void run_batched_forward_causalmask_bias_dropout_dispatch<
+    ck_tile::bf16_t,
+    true,
+    false,
+    true,
+    32>(BatchedForwardParams& param, hipStream_t stream);
+
+extern template void run_batched_forward_causalmask_bias_dropout_dispatch<
+    ck_tile::bf16_t,
+    false,
+    false,
+    true,
+    32>(BatchedForwardParams& param, hipStream_t stream);
+
+extern template void run_batched_forward_causalmask_bias_dropout_dispatch<
+    ck_tile::bf16_t,
+    true,
+    false,
+    false,
+    32>(BatchedForwardParams& param, hipStream_t stream);
+
+extern template void run_batched_forward_causalmask_bias_dropout_dispatch<
+    ck_tile::bf16_t,
+    false,
+    false,
+    false,
+    32>(BatchedForwardParams& param, hipStream_t stream);
+
+extern template void run_batched_forward_causalmask_bias_dropout_dispatch<
+    ck_tile::bf16_t,
+    true,
+    true,
+    true,
+    64>(BatchedForwardParams& param, hipStream_t stream);
+
+extern template void run_batched_forward_causalmask_bias_dropout_dispatch<
+    ck_tile::bf16_t,
+    false,
+    true,
+    true,
+    64>(BatchedForwardParams& param, hipStream_t stream);
+
+extern template void run_batched_forward_causalmask_bias_dropout_dispatch<
+    ck_tile::bf16_t,
+    true,
+    true,
+    false,
+    64>(BatchedForwardParams& param, hipStream_t stream);
+
+extern template void run_batched_forward_causalmask_bias_dropout_dispatch<
+    ck_tile::bf16_t,
+    false,
+    true,
+    false,
+    64>(BatchedForwardParams& param, hipStream_t stream);
+
+extern template void run_batched_forward_causalmask_bias_dropout_dispatch<
+    ck_tile::bf16_t,
+    true,
+    false,
+    true,
+    64>(BatchedForwardParams& param, hipStream_t stream);
+
+extern template void run_batched_forward_causalmask_bias_dropout_dispatch<
+    ck_tile::bf16_t,
+    false,
+    false,
+    true,
+    64>(BatchedForwardParams& param, hipStream_t stream);
+
+extern template void run_batched_forward_causalmask_bias_dropout_dispatch<
+    ck_tile::bf16_t,
+    true,
+    false,
+    false,
+    64>(BatchedForwardParams& param, hipStream_t stream);
+
+extern template void run_batched_forward_causalmask_bias_dropout_dispatch<
+    ck_tile::bf16_t,
+    false,
+    false,
+    false,
+    64>(BatchedForwardParams& param, hipStream_t stream);
+
+extern template void run_batched_forward_causalmask_bias_dropout_dispatch<
+    ck_tile::bf16_t,
+    true,
+    true,
+    true,
+    128>(BatchedForwardParams& param, hipStream_t stream);
+
+extern template void run_batched_forward_causalmask_bias_dropout_dispatch<
+    ck_tile::bf16_t,
+    false,
+    true,
+    true,
+    128>(BatchedForwardParams& param, hipStream_t stream);
+
+extern template void run_batched_forward_causalmask_bias_dropout_dispatch<
+    ck_tile::bf16_t,
+    true,
+    true,
+    false,
+    128>(BatchedForwardParams& param, hipStream_t stream);
+
+extern template void run_batched_forward_causalmask_bias_dropout_dispatch<
+    ck_tile::bf16_t,
+    false,
+    true,
+    false,
+    128>(BatchedForwardParams& param, hipStream_t stream);
+
+extern template void run_batched_forward_causalmask_bias_dropout_dispatch<
+    ck_tile::bf16_t,
+    true,
+    false,
+    true,
+    128>(BatchedForwardParams& param, hipStream_t stream);
+
+extern template void run_batched_forward_causalmask_bias_dropout_dispatch<
+    ck_tile::bf16_t,
+    false,
+    false,
+    true,
+    128>(BatchedForwardParams& param, hipStream_t stream);
+
+extern template void run_batched_forward_causalmask_bias_dropout_dispatch<
+    ck_tile::bf16_t,
+    true,
+    false,
+    false,
+    128>(BatchedForwardParams& param, hipStream_t stream);
+
+extern template void run_batched_forward_causalmask_bias_dropout_dispatch<
+    ck_tile::bf16_t,
+    false,
+    false,
+    false,
+    128>(BatchedForwardParams& param, hipStream_t stream);
+
+extern template void run_batched_forward_causalmask_bias_dropout_dispatch<
+    ck_tile::bf16_t,
+    true,
+    true,
+    true,
+    256>(BatchedForwardParams& param, hipStream_t stream);
+
+extern template void run_batched_forward_causalmask_bias_dropout_dispatch<
+    ck_tile::bf16_t,
+    false,
+    true,
+    true,
+    256>(BatchedForwardParams& param, hipStream_t stream);
+
+extern template void run_batched_forward_causalmask_bias_dropout_dispatch<
+    ck_tile::bf16_t,
+    true,
+    true,
+    false,
+    256>(BatchedForwardParams& param, hipStream_t stream);
+
+extern template void run_batched_forward_causalmask_bias_dropout_dispatch<
+    ck_tile::bf16_t,
+    false,
+    true,
+    false,
+    256>(BatchedForwardParams& param, hipStream_t stream);
+
+extern template void run_batched_forward_causalmask_bias_dropout_dispatch<
+    ck_tile::bf16_t,
+    true,
+    false,
+    true,
+    256>(BatchedForwardParams& param, hipStream_t stream);
+
+extern template void run_batched_forward_causalmask_bias_dropout_dispatch<
+    ck_tile::bf16_t,
+    false,
+    false,
+    true,
+    256>(BatchedForwardParams& param, hipStream_t stream);
+
+extern template void run_batched_forward_causalmask_bias_dropout_dispatch<
+    ck_tile::bf16_t,
+    true,
+    false,
+    false,
+    256>(BatchedForwardParams& param, hipStream_t stream);
+
+extern template void run_batched_forward_causalmask_bias_dropout_dispatch<
+    ck_tile::bf16_t,
+    false,
+    false,
+    false,
+    256>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_instances_ref.hpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_instances_ref.hpp
new file mode 100644
index 000000000..d69766972
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_instances_ref.hpp
@@ -0,0 +1,236 @@
+
+/*
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
+ */
+
+#include <ck_tile/core/numeric/half.hpp>
+#include "ck_tiled_fmha_batched_forward.h"
+
+extern template void run_batched_forward_causalmask_bias_dropout_dispatch<
+    ck_tile::fp16_t,
+    true,
+    true,
+    true,
+    32>(BatchedForwardParams& param, hipStream_t stream);
+
+extern template void run_batched_forward_causalmask_bias_dropout_dispatch<
+    ck_tile::fp16_t,
+    false,
+    true,
+    true,
+    32>(BatchedForwardParams& param, hipStream_t stream);
+
+extern template void run_batched_forward_causalmask_bias_dropout_dispatch<
+    ck_tile::fp16_t,
+    true,
+    true,
+    false,
+    32>(BatchedForwardParams& param, hipStream_t stream);
+
+extern template void run_batched_forward_causalmask_bias_dropout_dispatch<
+    ck_tile::fp16_t,
+    false,
+    true,
+    false,
+    32>(BatchedForwardParams& param, hipStream_t stream);
+
+extern template void run_batched_forward_causalmask_bias_dropout_dispatch<
+    ck_tile::fp16_t,
+    true,
+    false,
+    true,
+    32>(BatchedForwardParams& param, hipStream_t stream);
+
+extern template void run_batched_forward_causalmask_bias_dropout_dispatch<
+    ck_tile::fp16_t,
+    false,
+    false,
+    true,
+    32>(BatchedForwardParams& param, hipStream_t stream);
+
+extern template void run_batched_forward_causalmask_bias_dropout_dispatch<
+    ck_tile::fp16_t,
+    true,
+    false,
+    false,
+    32>(BatchedForwardParams& param, hipStream_t stream);
+
+extern template void run_batched_forward_causalmask_bias_dropout_dispatch<
+    ck_tile::fp16_t,
+    false,
+    false,
+    false,
+    32>(BatchedForwardParams& param, hipStream_t stream);
+
+extern template void run_batched_forward_causalmask_bias_dropout_dispatch<
+    ck_tile::fp16_t,
+    true,
+    true,
+    true,
+    64>(BatchedForwardParams& param, hipStream_t stream);
+
+extern template void run_batched_forward_causalmask_bias_dropout_dispatch<
+    ck_tile::fp16_t,
+    false,
+    true,
+    true,
+    64>(BatchedForwardParams& param, hipStream_t stream);
+
+extern template void run_batched_forward_causalmask_bias_dropout_dispatch<
+    ck_tile::fp16_t,
+    true,
+    true,
+    false,
+    64>(BatchedForwardParams& param, hipStream_t stream);
+
+extern template void run_batched_forward_causalmask_bias_dropout_dispatch<
+    ck_tile::fp16_t,
+    false,
+    true,
+    false,
+    64>(BatchedForwardParams& param, hipStream_t stream);
+
+extern template void run_batched_forward_causalmask_bias_dropout_dispatch<
+    ck_tile::fp16_t,
+    true,
+    false,
+    true,
+    64>(BatchedForwardParams& param, hipStream_t stream);
+
+extern template void run_batched_forward_causalmask_bias_dropout_dispatch<
+    ck_tile::fp16_t,
+    false,
+    false,
+    true,
+    64>(BatchedForwardParams& param, hipStream_t stream);
+
+extern template void run_batched_forward_causalmask_bias_dropout_dispatch<
+    ck_tile::fp16_t,
+    true,
+    false,
+    false,
+    64>(BatchedForwardParams& param, hipStream_t stream);
+
+extern template void run_batched_forward_causalmask_bias_dropout_dispatch<
+    ck_tile::fp16_t,
+    false,
+    false,
+    false,
+    64>(BatchedForwardParams& param, hipStream_t stream);
+
+extern template void run_batched_forward_causalmask_bias_dropout_dispatch<
+    ck_tile::fp16_t,
+    true,
+    true,
+    true,
+    128>(BatchedForwardParams& param, hipStream_t stream);
+
+extern template void run_batched_forward_causalmask_bias_dropout_dispatch<
+    ck_tile::fp16_t,
+    false,
+    true,
+    true,
+    128>(BatchedForwardParams& param, hipStream_t stream);
+
+extern template void run_batched_forward_causalmask_bias_dropout_dispatch<
+    ck_tile::fp16_t,
+    true,
+    true,
+    false,
+    128>(BatchedForwardParams& param, hipStream_t stream);
+
+extern template void run_batched_forward_causalmask_bias_dropout_dispatch<
+    ck_tile::fp16_t,
+    false,
+    true,
+    false,
+    128>(BatchedForwardParams& param, hipStream_t stream);
+
+extern template void run_batched_forward_causalmask_bias_dropout_dispatch<
+    ck_tile::fp16_t,
+    true,
+    false,
+    true,
+    128>(BatchedForwardParams& param, hipStream_t stream);
+
+extern template void run_batched_forward_causalmask_bias_dropout_dispatch<
+    ck_tile::fp16_t,
+    false,
+    false,
+    true,
+    128>(BatchedForwardParams& param, hipStream_t stream);
+
+extern template void run_batched_forward_causalmask_bias_dropout_dispatch<
+    ck_tile::fp16_t,
+    true,
+    false,
+    false,
+    128>(BatchedForwardParams& param, hipStream_t stream);
+
+extern template void run_batched_forward_causalmask_bias_dropout_dispatch<
+    ck_tile::fp16_t,
+    false,
+    false,
+    false,
+    128>(BatchedForwardParams& param, hipStream_t stream);
+
+extern template void run_batched_forward_causalmask_bias_dropout_dispatch<
+    ck_tile::fp16_t,
+    true,
+    true,
+    true,
+    256>(BatchedForwardParams& param, hipStream_t stream);
+
+extern template void run_batched_forward_causalmask_bias_dropout_dispatch<
+    ck_tile::fp16_t,
+    false,
+    true,
+    true,
+    256>(BatchedForwardParams& param, hipStream_t stream);
+
+extern template void run_batched_forward_causalmask_bias_dropout_dispatch<
+    ck_tile::fp16_t,
+    true,
+    true,
+    false,
+    256>(BatchedForwardParams& param, hipStream_t stream);
+
+extern template void run_batched_forward_causalmask_bias_dropout_dispatch<
+    ck_tile::fp16_t,
+    false,
+    true,
+    false,
+    256>(BatchedForwardParams& param, hipStream_t stream);
+
+extern template void run_batched_forward_causalmask_bias_dropout_dispatch<
+    ck_tile::fp16_t,
+    true,
+    false,
+    true,
+    256>(BatchedForwardParams& param, hipStream_t stream);
+
+extern template void run_batched_forward_causalmask_bias_dropout_dispatch<
+    ck_tile::fp16_t,
+    false,
+    false,
+    true,
+    256>(BatchedForwardParams& param, hipStream_t stream);
+
+extern template void run_batched_forward_causalmask_bias_dropout_dispatch<
+    ck_tile::fp16_t,
+    true,
+    false,
+    false,
+    256>(BatchedForwardParams& param, hipStream_t stream);
+
+extern template void run_batched_forward_causalmask_bias_dropout_dispatch<
+    ck_tile::fp16_t,
+    false,
+    false,
+    false,
+    256>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bf16_instances_ref.hpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bf16_instances_ref.hpp
new file mode 100644
index 000000000..003d76894
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bf16_instances_ref.hpp
@@ -0,0 +1,236 @@
+
+/*
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
+ */
+
+#include <ck_tile/core/numeric/bfloat16.hpp>
+#include "ck_tiled_fmha_batched_infer.h"
+
+extern template void run_batched_infer_causalmask_bias_dropout_dispatch<
+    ck_tile::bf16_t,
+    true,
+    true,
+    true,
+    32>(BatchedForwardParams& param, hipStream_t stream);
+
+extern template void run_batched_infer_causalmask_bias_dropout_dispatch<
+    ck_tile::bf16_t,
+    false,
+    true,
+    true,
+    32>(BatchedForwardParams& param, hipStream_t stream);
+
+extern template void run_batched_infer_causalmask_bias_dropout_dispatch<
+    ck_tile::bf16_t,
+    true,
+    true,
+    false,
+    32>(BatchedForwardParams& param, hipStream_t stream);
+
+extern template void run_batched_infer_causalmask_bias_dropout_dispatch<
+    ck_tile::bf16_t,
+    false,
+    true,
+    false,
+    32>(BatchedForwardParams& param, hipStream_t stream);
+
+extern template void run_batched_infer_causalmask_bias_dropout_dispatch<
+    ck_tile::bf16_t,
+    true,
+    false,
+    true,
+    32>(BatchedForwardParams& param, hipStream_t stream);
+
+extern template void run_batched_infer_causalmask_bias_dropout_dispatch<
+    ck_tile::bf16_t,
+    false,
+    false,
+    true,
+    32>(BatchedForwardParams& param, hipStream_t stream);
+
+extern template void run_batched_infer_causalmask_bias_dropout_dispatch<
+    ck_tile::bf16_t,
+    true,
+    false,
+    false,
+    32>(BatchedForwardParams& param, hipStream_t stream);
+
+extern template void run_batched_infer_causalmask_bias_dropout_dispatch<
+    ck_tile::bf16_t,
+    false,
+    false,
+    false,
+    32>(BatchedForwardParams& param, hipStream_t stream);
+
+extern template void run_batched_infer_causalmask_bias_dropout_dispatch<
+    ck_tile::bf16_t,
+    true,
+    true,
+    true,
+    64>(BatchedForwardParams& param, hipStream_t stream);
+
+extern template void run_batched_infer_causalmask_bias_dropout_dispatch<
+    ck_tile::bf16_t,
+    false,
+    true,
+    true,
+    64>(BatchedForwardParams& param, hipStream_t stream);
+
+extern template void run_batched_infer_causalmask_bias_dropout_dispatch<
+    ck_tile::bf16_t,
+    true,
+    true,
+    false,
+    64>(BatchedForwardParams& param, hipStream_t stream);
+
+extern template void run_batched_infer_causalmask_bias_dropout_dispatch<
+    ck_tile::bf16_t,
+    false,
+    true,
+    false,
+    64>(BatchedForwardParams& param, hipStream_t stream);
+
+extern template void run_batched_infer_causalmask_bias_dropout_dispatch<
+    ck_tile::bf16_t,
+    true,
+    false,
+    true,
+    64>(BatchedForwardParams& param, hipStream_t stream);
+
+extern template void run_batched_infer_causalmask_bias_dropout_dispatch<
+    ck_tile::bf16_t,
+    false,
+    false,
+    true,
+    64>(BatchedForwardParams& param, hipStream_t stream);
+
+extern template void run_batched_infer_causalmask_bias_dropout_dispatch<
+    ck_tile::bf16_t,
+    true,
+    false,
+    false,
+    64>(BatchedForwardParams& param, hipStream_t stream);
+
+extern template void run_batched_infer_causalmask_bias_dropout_dispatch<
+    ck_tile::bf16_t,
+    false,
+    false,
+    false,
+    64>(BatchedForwardParams& param, hipStream_t stream);
+
+extern template void run_batched_infer_causalmask_bias_dropout_dispatch<
+    ck_tile::bf16_t,
+    true,
+    true,
+    true,
+    128>(BatchedForwardParams& param, hipStream_t stream);
+
+extern template void run_batched_infer_causalmask_bias_dropout_dispatch<
+    ck_tile::bf16_t,
+    false,
+    true,
+    true,
+    128>(BatchedForwardParams& param, hipStream_t stream);
+
+extern template void run_batched_infer_causalmask_bias_dropout_dispatch<
+    ck_tile::bf16_t,
+    true,
+    true,
+    false,
+    128>(BatchedForwardParams& param, hipStream_t stream);
+
+extern template void run_batched_infer_causalmask_bias_dropout_dispatch<
+    ck_tile::bf16_t,
+    false,
+    true,
+    false,
+    128>(BatchedForwardParams& param, hipStream_t stream);
+
+extern template void run_batched_infer_causalmask_bias_dropout_dispatch<
+    ck_tile::bf16_t,
+    true,
+    false,
+    true,
+    128>(BatchedForwardParams& param, hipStream_t stream);
+
+extern template void run_batched_infer_causalmask_bias_dropout_dispatch<
+    ck_tile::bf16_t,
+    false,
+    false,
+    true,
+    128>(BatchedForwardParams& param, hipStream_t stream);
+
+extern template void run_batched_infer_causalmask_bias_dropout_dispatch<
+    ck_tile::bf16_t,
+    true,
+    false,
+    false,
+    128>(BatchedForwardParams& param, hipStream_t stream);
+
+extern template void run_batched_infer_causalmask_bias_dropout_dispatch<
+    ck_tile::bf16_t,
+    false,
+    false,
+    false,
+    128>(BatchedForwardParams& param, hipStream_t stream);
+
+extern template void run_batched_infer_causalmask_bias_dropout_dispatch<
+    ck_tile::bf16_t,
+    true,
+    true,
+    true,
+    256>(BatchedForwardParams& param, hipStream_t stream);
+
+extern template void run_batched_infer_causalmask_bias_dropout_dispatch<
+    ck_tile::bf16_t,
+    false,
+    true,
+    true,
+    256>(BatchedForwardParams& param, hipStream_t stream);
+
+extern template void run_batched_infer_causalmask_bias_dropout_dispatch<
+    ck_tile::bf16_t,
+    true,
+    true,
+    false,
+    256>(BatchedForwardParams& param, hipStream_t stream);
+
+extern template void run_batched_infer_causalmask_bias_dropout_dispatch<
+    ck_tile::bf16_t,
+    false,
+    true,
+    false,
+    256>(BatchedForwardParams& param, hipStream_t stream);
+
+extern template void run_batched_infer_causalmask_bias_dropout_dispatch<
+    ck_tile::bf16_t,
+    true,
+    false,
+    true,
+    256>(BatchedForwardParams& param, hipStream_t stream);
+
+extern template void run_batched_infer_causalmask_bias_dropout_dispatch<
+    ck_tile::bf16_t,
+    false,
+    false,
+    true,
+    256>(BatchedForwardParams& param, hipStream_t stream);
+
+extern template void run_batched_infer_causalmask_bias_dropout_dispatch<
+    ck_tile::bf16_t,
+    true,
+    false,
+    false,
+    256>(BatchedForwardParams& param, hipStream_t stream);
+
+extern template void run_batched_infer_causalmask_bias_dropout_dispatch<
+    ck_tile::bf16_t,
+    false,
+    false,
+    false,
+    256>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_instances_ref.hpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_instances_ref.hpp
new file mode 100644
index 000000000..266b3643e
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_instances_ref.hpp
@@ -0,0 +1,236 @@
+
+/*
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
+ */
+
+#include <ck_tile/core/numeric/half.hpp>
+#include "ck_tiled_fmha_batched_infer.h"
+
+extern template void run_batched_infer_causalmask_bias_dropout_dispatch<
+    ck_tile::fp16_t,
+    true,
+    true,
+    true,
+    32>(BatchedForwardParams& param, hipStream_t stream);
+
+extern template void run_batched_infer_causalmask_bias_dropout_dispatch<
+    ck_tile::fp16_t,
+    false,
+    true,
+    true,
+    32>(BatchedForwardParams& param, hipStream_t stream);
+
+extern template void run_batched_infer_causalmask_bias_dropout_dispatch<
+    ck_tile::fp16_t,
+    true,
+    true,
+    false,
+    32>(BatchedForwardParams& param, hipStream_t stream);
+
+extern template void run_batched_infer_causalmask_bias_dropout_dispatch<
+    ck_tile::fp16_t,
+    false,
+    true,
+    false,
+    32>(BatchedForwardParams& param, hipStream_t stream);
+
+extern template void run_batched_infer_causalmask_bias_dropout_dispatch<
+    ck_tile::fp16_t,
+    true,
+    false,
+    true,
+    32>(BatchedForwardParams& param, hipStream_t stream);
+
+extern template void run_batched_infer_causalmask_bias_dropout_dispatch<
+    ck_tile::fp16_t,
+    false,
+    false,
+    true,
+    32>(BatchedForwardParams& param, hipStream_t stream);
+
+extern template void run_batched_infer_causalmask_bias_dropout_dispatch<
+    ck_tile::fp16_t,
+    true,
+    false,
+    false,
+    32>(BatchedForwardParams& param, hipStream_t stream);
+
+extern template void run_batched_infer_causalmask_bias_dropout_dispatch<
+    ck_tile::fp16_t,
+    false,
+    false,
+    false,
+    32>(BatchedForwardParams& param, hipStream_t stream);
+
+extern template void run_batched_infer_causalmask_bias_dropout_dispatch<
+    ck_tile::fp16_t,
+    true,
+    true,
+    true,
+    64>(BatchedForwardParams& param, hipStream_t stream);
+
+extern template void run_batched_infer_causalmask_bias_dropout_dispatch<
+    ck_tile::fp16_t,
+    false,
+    true,
+    true,
+    64>(BatchedForwardParams& param, hipStream_t stream);
+
+extern template void run_batched_infer_causalmask_bias_dropout_dispatch<
+    ck_tile::fp16_t,
+    true,
+    true,
+    false,
+    64>(BatchedForwardParams& param, hipStream_t stream);
+
+extern template void run_batched_infer_causalmask_bias_dropout_dispatch<
+    ck_tile::fp16_t,
+    false,
+    true,
+    false,
+    64>(BatchedForwardParams& param, hipStream_t stream);
+
+extern template void run_batched_infer_causalmask_bias_dropout_dispatch<
+    ck_tile::fp16_t,
+    true,
+    false,
+    true,
+    64>(BatchedForwardParams& param, hipStream_t stream);
+
+extern template void run_batched_infer_causalmask_bias_dropout_dispatch<
+    ck_tile::fp16_t,
+    false,
+    false,
+    true,
+    64>(BatchedForwardParams& param, hipStream_t stream);
+
+extern template void run_batched_infer_causalmask_bias_dropout_dispatch<
+    ck_tile::fp16_t,
+    true,
+    false,
+    false,
+    64>(BatchedForwardParams& param, hipStream_t stream);
+
+extern template void run_batched_infer_causalmask_bias_dropout_dispatch<
+    ck_tile::fp16_t,
+    false,
+    false,
+    false,
+    64>(BatchedForwardParams& param, hipStream_t stream);
+
+extern template void run_batched_infer_causalmask_bias_dropout_dispatch<
+    ck_tile::fp16_t,
+    true,
+    true,
+    true,
+    128>(BatchedForwardParams& param, hipStream_t stream);
+
+extern template void run_batched_infer_causalmask_bias_dropout_dispatch<
+    ck_tile::fp16_t,
+    false,
+    true,
+    true,
+    128>(BatchedForwardParams& param, hipStream_t stream);
+
+extern template void run_batched_infer_causalmask_bias_dropout_dispatch<
+    ck_tile::fp16_t,
+    true,
+    true,
+    false,
+    128>(BatchedForwardParams& param, hipStream_t stream);
+
+extern template void run_batched_infer_causalmask_bias_dropout_dispatch<
+    ck_tile::fp16_t,
+    false,
+    true,
+    false,
+    128>(BatchedForwardParams& param, hipStream_t stream);
+
+extern template void run_batched_infer_causalmask_bias_dropout_dispatch<
+    ck_tile::fp16_t,
+    true,
+    false,
+    true,
+    128>(BatchedForwardParams& param, hipStream_t stream);
+
+extern template void run_batched_infer_causalmask_bias_dropout_dispatch<
+    ck_tile::fp16_t,
+    false,
+    false,
+    true,
+    128>(BatchedForwardParams& param, hipStream_t stream);
+
+extern template void run_batched_infer_causalmask_bias_dropout_dispatch<
+    ck_tile::fp16_t,
+    true,
+    false,
+    false,
+    128>(BatchedForwardParams& param, hipStream_t stream);
+
+extern template void run_batched_infer_causalmask_bias_dropout_dispatch<
+    ck_tile::fp16_t,
+    false,
+    false,
+    false,
+    128>(BatchedForwardParams& param, hipStream_t stream);
+
+extern template void run_batched_infer_causalmask_bias_dropout_dispatch<
+    ck_tile::fp16_t,
+    true,
+    true,
+    true,
+    256>(BatchedForwardParams& param, hipStream_t stream);
+
+extern template void run_batched_infer_causalmask_bias_dropout_dispatch<
+    ck_tile::fp16_t,
+    false,
+    true,
+    true,
+    256>(BatchedForwardParams& param, hipStream_t stream);
+
+extern template void run_batched_infer_causalmask_bias_dropout_dispatch<
+    ck_tile::fp16_t,
+    true,
+    true,
+    false,
+    256>(BatchedForwardParams& param, hipStream_t stream);
+
+extern template void run_batched_infer_causalmask_bias_dropout_dispatch<
+    ck_tile::fp16_t,
+    false,
+    true,
+    false,
+    256>(BatchedForwardParams& param, hipStream_t stream);
+
+extern template void run_batched_infer_causalmask_bias_dropout_dispatch<
+    ck_tile::fp16_t,
+    true,
+    false,
+    true,
+    256>(BatchedForwardParams& param, hipStream_t stream);
+
+extern template void run_batched_infer_causalmask_bias_dropout_dispatch<
+    ck_tile::fp16_t,
+    false,
+    false,
+    true,
+    256>(BatchedForwardParams& param, hipStream_t stream);
+
+extern template void run_batched_infer_causalmask_bias_dropout_dispatch<
+    ck_tile::fp16_t,
+    true,
+    false,
+    false,
+    256>(BatchedForwardParams& param, hipStream_t stream);
+
+extern template void run_batched_infer_causalmask_bias_dropout_dispatch<
+    ck_tile::fp16_t,
+    false,
+    false,
+    false,
+    256>(BatchedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_instances_ref.hpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_instances_ref.hpp
new file mode 100644
index 000000000..870b4dda9
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_instances_ref.hpp
@@ -0,0 +1,396 @@
+
+/*
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
+ */
+
+#include <ck_tile/core/numeric/bfloat16.hpp>
+#include "ck_tiled_fmha_grouped_backward.h"
+
+extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<
+    ck_tile::bf16_t,
+    true,
+    true,
+    false,
+    true,
+    32>(GroupedBackwardParams& param, hipStream_t stream);
+
+extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<
+    ck_tile::bf16_t,
+    false,
+    true,
+    false,
+    true,
+    32>(GroupedBackwardParams& param, hipStream_t stream);
+
+extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<
+    ck_tile::bf16_t,
+    true,
+    true,
+    false,
+    false,
+    32>(GroupedBackwardParams& param, hipStream_t stream);
+
+extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<
+    ck_tile::bf16_t,
+    false,
+    true,
+    false,
+    false,
+    32>(GroupedBackwardParams& param, hipStream_t stream);
+
+extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<
+    ck_tile::bf16_t,
+    true,
+    true,
+    true,
+    true,
+    32>(GroupedBackwardParams& param, hipStream_t stream);
+
+extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<
+    ck_tile::bf16_t,
+    false,
+    true,
+    true,
+    true,
+    32>(GroupedBackwardParams& param, hipStream_t stream);
+
+extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<
+    ck_tile::bf16_t,
+    true,
+    true,
+    true,
+    false,
+    32>(GroupedBackwardParams& param, hipStream_t stream);
+
+extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<
+    ck_tile::bf16_t,
+    false,
+    true,
+    true,
+    false,
+    32>(GroupedBackwardParams& param, hipStream_t stream);
+
+extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<
+    ck_tile::bf16_t,
+    true,
+    false,
+    false,
+    true,
+    32>(GroupedBackwardParams& param, hipStream_t stream);
+
+extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<
+    ck_tile::bf16_t,
+    false,
+    false,
+    false,
+    true,
+    32>(GroupedBackwardParams& param, hipStream_t stream);
+
+extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<
+    ck_tile::bf16_t,
+    true,
+    false,
+    false,
+    false,
+    32>(GroupedBackwardParams& param, hipStream_t stream);
+
+extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<
+    ck_tile::bf16_t,
+    false,
+    false,
+    false,
+    false,
+    32>(GroupedBackwardParams& param, hipStream_t stream);
+
+extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<
+    ck_tile::bf16_t,
+    true,
+    true,
+    false,
+    true,
+    64>(GroupedBackwardParams& param, hipStream_t stream);
+
+extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<
+    ck_tile::bf16_t,
+    false,
+    true,
+    false,
+    true,
+    64>(GroupedBackwardParams& param, hipStream_t stream);
+
+extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<
+    ck_tile::bf16_t,
+    true,
+    true,
+    false,
+    false,
+    64>(GroupedBackwardParams& param, hipStream_t stream);
+
+extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<
+    ck_tile::bf16_t,
+    false,
+    true,
+    false,
+    false,
+    64>(GroupedBackwardParams& param, hipStream_t stream);
+
+extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<
+    ck_tile::bf16_t,
+    true,
+    true,
+    true,
+    true,
+    64>(GroupedBackwardParams& param, hipStream_t stream);
+
+extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<
+    ck_tile::bf16_t,
+    false,
+    true,
+    true,
+    true,
+    64>(GroupedBackwardParams& param, hipStream_t stream);
+
+extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<
+    ck_tile::bf16_t,
+    true,
+    true,
+    true,
+    false,
+    64>(GroupedBackwardParams& param, hipStream_t stream);
+
+extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<
+    ck_tile::bf16_t,
+    false,
+    true,
+    true,
+    false,
+    64>(GroupedBackwardParams& param, hipStream_t stream);
+
+extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<
+    ck_tile::bf16_t,
+    true,
+    false,
+    false,
+    true,
+    64>(GroupedBackwardParams& param, hipStream_t stream);
+
+extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<
+    ck_tile::bf16_t,
+    false,
+    false,
+    false,
+    true,
+    64>(GroupedBackwardParams& param, hipStream_t stream);
+
+extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<
+    ck_tile::bf16_t,
+    true,
+    false,
+    false,
+    false,
+    64>(GroupedBackwardParams& param, hipStream_t stream);
+
+extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<
+    ck_tile::bf16_t,
+    false,
+    false,
+    false,
+    false,
+    64>(GroupedBackwardParams& param, hipStream_t stream);
+
+extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<
+    ck_tile::bf16_t,
+    true,
+    true,
+    false,
+    true,
+    128>(GroupedBackwardParams& param, hipStream_t stream);
+
+extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<
+    ck_tile::bf16_t,
+    false,
+    true,
+    false,
+    true,
+    128>(GroupedBackwardParams& param, hipStream_t stream);
+
+extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<
+    ck_tile::bf16_t,
+    true,
+    true,
+    false,
+    false,
+    128>(GroupedBackwardParams& param, hipStream_t stream);
+
+extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<
+    ck_tile::bf16_t,
+    false,
+    true,
+    false,
+    false,
+    128>(GroupedBackwardParams& param, hipStream_t stream);
+
+extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<
+    ck_tile::bf16_t,
+    true,
+    true,
+    true,
+    true,
+    128>(GroupedBackwardParams& param, hipStream_t stream);
+
+extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<
+    ck_tile::bf16_t,
+    false,
+    true,
+    true,
+    true,
+    128>(GroupedBackwardParams& param, hipStream_t stream);
+
+extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<
+    ck_tile::bf16_t,
+    true,
+    true,
+    true,
+    false,
+    128>(GroupedBackwardParams& param, hipStream_t stream);
+
+extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<
+    ck_tile::bf16_t,
+    false,
+    true,
+    true,
+    false,
+    128>(GroupedBackwardParams& param, hipStream_t stream);
+
+extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<
+    ck_tile::bf16_t,
+    true,
+    false,
+    false,
+    true,
+    128>(GroupedBackwardParams& param, hipStream_t stream);
+
+extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<
+    ck_tile::bf16_t,
+    false,
+    false,
+    false,
+    true,
+    128>(GroupedBackwardParams& param, hipStream_t stream);
+
+extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<
+    ck_tile::bf16_t,
+    true,
+    false,
+    false,
+    false,
+    128>(GroupedBackwardParams& param, hipStream_t stream);
+
+extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<
+    ck_tile::bf16_t,
+    false,
+    false,
+    false,
+    false,
+    128>(GroupedBackwardParams& param, hipStream_t stream);
+
+extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<
+    ck_tile::bf16_t,
+    true,
+    true,
+    false,
+    true,
+    256>(GroupedBackwardParams& param, hipStream_t stream);
+
+extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<
+    ck_tile::bf16_t,
+    false,
+    true,
+    false,
+    true,
+    256>(GroupedBackwardParams& param, hipStream_t stream);
+
+extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<
+    ck_tile::bf16_t,
+    true,
+    true,
+    false,
+    false,
+    256>(GroupedBackwardParams& param, hipStream_t stream);
+
+extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<
+    ck_tile::bf16_t,
+    false,
+    true,
+    false,
+    false,
+    256>(GroupedBackwardParams& param, hipStream_t stream);
+
+extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<
+    ck_tile::bf16_t,
+    true,
+    true,
+    true,
+    true,
+    256>(GroupedBackwardParams& param, hipStream_t stream);
+
+extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<
+    ck_tile::bf16_t,
+    false,
+    true,
+    true,
+    true,
+    256>(GroupedBackwardParams& param, hipStream_t stream);
+
+extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<
+    ck_tile::bf16_t,
+    true,
+    true,
+    true,
+    false,
+    256>(GroupedBackwardParams& param, hipStream_t stream);
+
+extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<
+    ck_tile::bf16_t,
+    false,
+    true,
+    true,
+    false,
+    256>(GroupedBackwardParams& param, hipStream_t stream);
+
+extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<
+    ck_tile::bf16_t,
+    true,
+    false,
+    false,
+    true,
+    256>(GroupedBackwardParams& param, hipStream_t stream);
+
+extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<
+    ck_tile::bf16_t,
+    false,
+    false,
+    false,
+    true,
+    256>(GroupedBackwardParams& param, hipStream_t stream);
+
+extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<
+    ck_tile::bf16_t,
+    true,
+    false,
+    false,
+    false,
+    256>(GroupedBackwardParams& param, hipStream_t stream);
+
+extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<
+    ck_tile::bf16_t,
+    false,
+    false,
+    false,
+    false,
+    256>(GroupedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_instances_ref.hpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_instances_ref.hpp
new file mode 100644
index 000000000..367ca6bcf
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_instances_ref.hpp
@@ -0,0 +1,396 @@
+
+/*
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
+ */
+
+#include <ck_tile/core/numeric/half.hpp>
+#include "ck_tiled_fmha_grouped_backward.h"
+
+extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<
+    ck_tile::fp16_t,
+    true,
+    true,
+    false,
+    true,
+    32>(GroupedBackwardParams& param, hipStream_t stream);
+
+extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<
+    ck_tile::fp16_t,
+    false,
+    true,
+    false,
+    true,
+    32>(GroupedBackwardParams& param, hipStream_t stream);
+
+extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<
+    ck_tile::fp16_t,
+    true,
+    true,
+    false,
+    false,
+    32>(GroupedBackwardParams& param, hipStream_t stream);
+
+extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<
+    ck_tile::fp16_t,
+    false,
+    true,
+    false,
+    false,
+    32>(GroupedBackwardParams& param, hipStream_t stream);
+
+extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<
+    ck_tile::fp16_t,
+    true,
+    true,
+    true,
+    true,
+    32>(GroupedBackwardParams& param, hipStream_t stream);
+
+extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<
+    ck_tile::fp16_t,
+    false,
+    true,
+    true,
+    true,
+    32>(GroupedBackwardParams& param, hipStream_t stream);
+
+extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<
+    ck_tile::fp16_t,
+    true,
+    true,
+    true,
+    false,
+    32>(GroupedBackwardParams& param, hipStream_t stream);
+
+extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<
+    ck_tile::fp16_t,
+    false,
+    true,
+    true,
+    false,
+    32>(GroupedBackwardParams& param, hipStream_t stream);
+
+extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<
+    ck_tile::fp16_t,
+    true,
+    false,
+    false,
+    true,
+    32>(GroupedBackwardParams& param, hipStream_t stream);
+
+extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<
+    ck_tile::fp16_t,
+    false,
+    false,
+    false,
+    true,
+    32>(GroupedBackwardParams& param, hipStream_t stream);
+
+extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<
+    ck_tile::fp16_t,
+    true,
+    false,
+    false,
+    false,
+    32>(GroupedBackwardParams& param, hipStream_t stream);
+
+extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<
+    ck_tile::fp16_t,
+    false,
+    false,
+    false,
+    false,
+    32>(GroupedBackwardParams& param, hipStream_t stream);
+
+extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<
+    ck_tile::fp16_t,
+    true,
+    true,
+    false,
+    true,
+    64>(GroupedBackwardParams& param, hipStream_t stream);
+
+extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<
+    ck_tile::fp16_t,
+    false,
+    true,
+    false,
+    true,
+    64>(GroupedBackwardParams& param, hipStream_t stream);
+
+extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<
+    ck_tile::fp16_t,
+    true,
+    true,
+    false,
+    false,
+    64>(GroupedBackwardParams& param, hipStream_t stream);
+
+extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<
+    ck_tile::fp16_t,
+    false,
+    true,
+    false,
+    false,
+    64>(GroupedBackwardParams& param, hipStream_t stream);
+
+extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<
+    ck_tile::fp16_t,
+    true,
+    true,
+    true,
+    true,
+    64>(GroupedBackwardParams& param, hipStream_t stream);
+
+extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<
+    ck_tile::fp16_t,
+    false,
+    true,
+    true,
+    true,
+    64>(GroupedBackwardParams& param, hipStream_t stream);
+
+extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<
+    ck_tile::fp16_t,
+    true,
+    true,
+    true,
+    false,
+    64>(GroupedBackwardParams& param, hipStream_t stream);
+
+extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<
+    ck_tile::fp16_t,
+    false,
+    true,
+    true,
+    false,
+    64>(GroupedBackwardParams& param, hipStream_t stream);
+
+extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<
+    ck_tile::fp16_t,
+    true,
+    false,
+    false,
+    true,
+    64>(GroupedBackwardParams& param, hipStream_t stream);
+
+extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<
+    ck_tile::fp16_t,
+    false,
+    false,
+    false,
+    true,
+    64>(GroupedBackwardParams& param, hipStream_t stream);
+
+extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<
+    ck_tile::fp16_t,
+    true,
+    false,
+    false,
+    false,
+    64>(GroupedBackwardParams& param, hipStream_t stream);
+
+extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<
+    ck_tile::fp16_t,
+    false,
+    false,
+    false,
+    false,
+    64>(GroupedBackwardParams& param, hipStream_t stream);
+
+extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<
+    ck_tile::fp16_t,
+    true,
+    true,
+    false,
+    true,
+    128>(GroupedBackwardParams& param, hipStream_t stream);
+
+extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<
+    ck_tile::fp16_t,
+    false,
+    true,
+    false,
+    true,
+    128>(GroupedBackwardParams& param, hipStream_t stream);
+
+extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<
+    ck_tile::fp16_t,
+    true,
+    true,
+    false,
+    false,
+    128>(GroupedBackwardParams& param, hipStream_t stream);
+
+extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<
+    ck_tile::fp16_t,
+    false,
+    true,
+    false,
+    false,
+    128>(GroupedBackwardParams& param, hipStream_t stream);
+
+extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<
+    ck_tile::fp16_t,
+    true,
+    true,
+    true,
+    true,
+    128>(GroupedBackwardParams& param, hipStream_t stream);
+
+extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<
+    ck_tile::fp16_t,
+    false,
+    true,
+    true,
+    true,
+    128>(GroupedBackwardParams& param, hipStream_t stream);
+
+extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<
+    ck_tile::fp16_t,
+    true,
+    true,
+    true,
+    false,
+    128>(GroupedBackwardParams& param, hipStream_t stream);
+
+extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<
+    ck_tile::fp16_t,
+    false,
+    true,
+    true,
+    false,
+    128>(GroupedBackwardParams& param, hipStream_t stream);
+
+extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<
+    ck_tile::fp16_t,
+    true,
+    false,
+    false,
+    true,
+    128>(GroupedBackwardParams& param, hipStream_t stream);
+
+extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<
+    ck_tile::fp16_t,
+    false,
+    false,
+    false,
+    true,
+    128>(GroupedBackwardParams& param, hipStream_t stream);
+
+extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<
+    ck_tile::fp16_t,
+    true,
+    false,
+    false,
+    false,
+    128>(GroupedBackwardParams& param, hipStream_t stream);
+
+extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<
+    ck_tile::fp16_t,
+    false,
+    false,
+    false,
+    false,
+    128>(GroupedBackwardParams& param, hipStream_t stream);
+
+extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<
+    ck_tile::fp16_t,
+    true,
+    true,
+    false,
+    true,
+    256>(GroupedBackwardParams& param, hipStream_t stream);
+
+extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<
+    ck_tile::fp16_t,
+    false,
+    true,
+    false,
+    true,
+    256>(GroupedBackwardParams& param, hipStream_t stream);
+
+extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<
+    ck_tile::fp16_t,
+    true,
+    true,
+    false,
+    false,
+    256>(GroupedBackwardParams& param, hipStream_t stream);
+
+extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<
+    ck_tile::fp16_t,
+    false,
+    true,
+    false,
+    false,
+    256>(GroupedBackwardParams& param, hipStream_t stream);
+
+extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<
+    ck_tile::fp16_t,
+    true,
+    true,
+    true,
+    true,
+    256>(GroupedBackwardParams& param, hipStream_t stream);
+
+extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<
+    ck_tile::fp16_t,
+    false,
+    true,
+    true,
+    true,
+    256>(GroupedBackwardParams& param, hipStream_t stream);
+
+extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<
+    ck_tile::fp16_t,
+    true,
+    true,
+    true,
+    false,
+    256>(GroupedBackwardParams& param, hipStream_t stream);
+
+extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<
+    ck_tile::fp16_t,
+    false,
+    true,
+    true,
+    false,
+    256>(GroupedBackwardParams& param, hipStream_t stream);
+
+extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<
+    ck_tile::fp16_t,
+    true,
+    false,
+    false,
+    true,
+    256>(GroupedBackwardParams& param, hipStream_t stream);
+
+extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<
+    ck_tile::fp16_t,
+    false,
+    false,
+    false,
+    true,
+    256>(GroupedBackwardParams& param, hipStream_t stream);
+
+extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<
+    ck_tile::fp16_t,
+    true,
+    false,
+    false,
+    false,
+    256>(GroupedBackwardParams& param, hipStream_t stream);
+
+extern template void run_grouped_backward_causalmask_bias_dropout_dispatch<
+    ck_tile::fp16_t,
+    false,
+    false,
+    false,
+    false,
+    256>(GroupedBackwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bf16_instances_ref.hpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bf16_instances_ref.hpp
new file mode 100644
index 000000000..4b1740f1a
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bf16_instances_ref.hpp
@@ -0,0 +1,236 @@
+
+/*
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
+ */
+
+#include <ck_tile/core/numeric/bfloat16.hpp>
+#include "ck_tiled_fmha_grouped_forward.h"
+
+extern template void run_grouped_forward_causalmask_bias_dropout_dispatch<
+    ck_tile::bf16_t,
+    true,
+    true,
+    true,
+    32>(GroupedForwardParams& param, hipStream_t stream);
+
+extern template void run_grouped_forward_causalmask_bias_dropout_dispatch<
+    ck_tile::bf16_t,
+    false,
+    true,
+    true,
+    32>(GroupedForwardParams& param, hipStream_t stream);
+
+extern template void run_grouped_forward_causalmask_bias_dropout_dispatch<
+    ck_tile::bf16_t,
+    true,
+    true,
+    false,
+    32>(GroupedForwardParams& param, hipStream_t stream);
+
+extern template void run_grouped_forward_causalmask_bias_dropout_dispatch<
+    ck_tile::bf16_t,
+    false,
+    true,
+    false,
+    32>(GroupedForwardParams& param, hipStream_t stream);
+
+extern template void run_grouped_forward_causalmask_bias_dropout_dispatch<
+    ck_tile::bf16_t,
+    true,
+    false,
+    true,
+    32>(GroupedForwardParams& param, hipStream_t stream);
+
+extern template void run_grouped_forward_causalmask_bias_dropout_dispatch<
+    ck_tile::bf16_t,
+    false,
+    false,
+    true,
+    32>(GroupedForwardParams& param, hipStream_t stream);
+
+extern template void run_grouped_forward_causalmask_bias_dropout_dispatch<
+    ck_tile::bf16_t,
+    true,
+    false,
+    false,
+    32>(GroupedForwardParams& param, hipStream_t stream);
+
+extern template void run_grouped_forward_causalmask_bias_dropout_dispatch<
+    ck_tile::bf16_t,
+    false,
+    false,
+    false,
+    32>(GroupedForwardParams& param, hipStream_t stream);
+
+extern template void run_grouped_forward_causalmask_bias_dropout_dispatch<
+    ck_tile::bf16_t,
+    true,
+    true,
+    true,
+    64>(GroupedForwardParams& param, hipStream_t stream);
+
+extern template void run_grouped_forward_causalmask_bias_dropout_dispatch<
+    ck_tile::bf16_t,
+    false,
+    true,
+    true,
+    64>(GroupedForwardParams& param, hipStream_t stream);
+
+extern template void run_grouped_forward_causalmask_bias_dropout_dispatch<
+    ck_tile::bf16_t,
+    true,
+    true,
+    false,
+    64>(GroupedForwardParams& param, hipStream_t stream);
+
+extern template void run_grouped_forward_causalmask_bias_dropout_dispatch<
+    ck_tile::bf16_t,
+    false,
+    true,
+    false,
+    64>(GroupedForwardParams& param, hipStream_t stream);
+
+extern template void run_grouped_forward_causalmask_bias_dropout_dispatch<
+    ck_tile::bf16_t,
+    true,
+    false,
+    true,
+    64>(GroupedForwardParams& param, hipStream_t stream);
+
+extern template void run_grouped_forward_causalmask_bias_dropout_dispatch<
+    ck_tile::bf16_t,
+    false,
+    false,
+    true,
+    64>(GroupedForwardParams& param, hipStream_t stream);
+
+extern template void run_grouped_forward_causalmask_bias_dropout_dispatch<
+    ck_tile::bf16_t,
+    true,
+    false,
+    false,
+    64>(GroupedForwardParams& param, hipStream_t stream);
+
+extern template void run_grouped_forward_causalmask_bias_dropout_dispatch<
+    ck_tile::bf16_t,
+    false,
+    false,
+    false,
+    64>(GroupedForwardParams& param, hipStream_t stream);
+
+extern template void run_grouped_forward_causalmask_bias_dropout_dispatch<
+    ck_tile::bf16_t,
+    true,
+    true,
+    true,
+    128>(GroupedForwardParams& param, hipStream_t stream);
+
+extern template void run_grouped_forward_causalmask_bias_dropout_dispatch<
+    ck_tile::bf16_t,
+    false,
+    true,
+    true,
+    128>(GroupedForwardParams& param, hipStream_t stream);
+
+extern template void run_grouped_forward_causalmask_bias_dropout_dispatch<
+    ck_tile::bf16_t,
+    true,
+    true,
+    false,
+    128>(GroupedForwardParams& param, hipStream_t stream);
+
+extern template void run_grouped_forward_causalmask_bias_dropout_dispatch<
+    ck_tile::bf16_t,
+    false,
+    true,
+    false,
+    128>(GroupedForwardParams& param, hipStream_t stream);
+
+extern template void run_grouped_forward_causalmask_bias_dropout_dispatch<
+    ck_tile::bf16_t,
+    true,
+    false,
+    true,
+    128>(GroupedForwardParams& param, hipStream_t stream);
+
+extern template void run_grouped_forward_causalmask_bias_dropout_dispatch<
+    ck_tile::bf16_t,
+    false,
+    false,
+    true,
+    128>(GroupedForwardParams& param, hipStream_t stream);
+
+extern template void run_grouped_forward_causalmask_bias_dropout_dispatch<
+    ck_tile::bf16_t,
+    true,
+    false,
+    false,
+    128>(GroupedForwardParams& param, hipStream_t stream);
+
+extern template void run_grouped_forward_causalmask_bias_dropout_dispatch<
+    ck_tile::bf16_t,
+    false,
+    false,
+    false,
+    128>(GroupedForwardParams& param, hipStream_t stream);
+
+extern template void run_grouped_forward_causalmask_bias_dropout_dispatch<
+    ck_tile::bf16_t,
+    true,
+    true,
+    true,
+    256>(GroupedForwardParams& param, hipStream_t stream);
+
+extern template void run_grouped_forward_causalmask_bias_dropout_dispatch<
+    ck_tile::bf16_t,
+    false,
+    true,
+    true,
+    256>(GroupedForwardParams& param, hipStream_t stream);
+
+extern template void run_grouped_forward_causalmask_bias_dropout_dispatch<
+    ck_tile::bf16_t,
+    true,
+    true,
+    false,
+    256>(GroupedForwardParams& param, hipStream_t stream);
+
+extern template void run_grouped_forward_causalmask_bias_dropout_dispatch<
+    ck_tile::bf16_t,
+    false,
+    true,
+    false,
+    256>(GroupedForwardParams& param, hipStream_t stream);
+
+extern template void run_grouped_forward_causalmask_bias_dropout_dispatch<
+    ck_tile::bf16_t,
+    true,
+    false,
+    true,
+    256>(GroupedForwardParams& param, hipStream_t stream);
+
+extern template void run_grouped_forward_causalmask_bias_dropout_dispatch<
+    ck_tile::bf16_t,
+    false,
+    false,
+    true,
+    256>(GroupedForwardParams& param, hipStream_t stream);
+
+extern template void run_grouped_forward_causalmask_bias_dropout_dispatch<
+    ck_tile::bf16_t,
+    true,
+    false,
+    false,
+    256>(GroupedForwardParams& param, hipStream_t stream);
+
+extern template void run_grouped_forward_causalmask_bias_dropout_dispatch<
+    ck_tile::bf16_t,
+    false,
+    false,
+    false,
+    256>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_instances_ref.hpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_instances_ref.hpp
new file mode 100644
index 000000000..2ac28a520
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_instances_ref.hpp
@@ -0,0 +1,236 @@
+
+/*
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
+ */
+
+#include <ck_tile/core/numeric/half.hpp>
+#include "ck_tiled_fmha_grouped_forward.h"
+
+extern template void run_grouped_forward_causalmask_bias_dropout_dispatch<
+    ck_tile::fp16_t,
+    true,
+    true,
+    true,
+    32>(GroupedForwardParams& param, hipStream_t stream);
+
+extern template void run_grouped_forward_causalmask_bias_dropout_dispatch<
+    ck_tile::fp16_t,
+    false,
+    true,
+    true,
+    32>(GroupedForwardParams& param, hipStream_t stream);
+
+extern template void run_grouped_forward_causalmask_bias_dropout_dispatch<
+    ck_tile::fp16_t,
+    true,
+    true,
+    false,
+    32>(GroupedForwardParams& param, hipStream_t stream);
+
+extern template void run_grouped_forward_causalmask_bias_dropout_dispatch<
+    ck_tile::fp16_t,
+    false,
+    true,
+    false,
+    32>(GroupedForwardParams& param, hipStream_t stream);
+
+extern template void run_grouped_forward_causalmask_bias_dropout_dispatch<
+    ck_tile::fp16_t,
+    true,
+    false,
+    true,
+    32>(GroupedForwardParams& param, hipStream_t stream);
+
+extern template void run_grouped_forward_causalmask_bias_dropout_dispatch<
+    ck_tile::fp16_t,
+    false,
+    false,
+    true,
+    32>(GroupedForwardParams& param, hipStream_t stream);
+
+extern template void run_grouped_forward_causalmask_bias_dropout_dispatch<
+    ck_tile::fp16_t,
+    true,
+    false,
+    false,
+    32>(GroupedForwardParams& param, hipStream_t stream);
+
+extern template void run_grouped_forward_causalmask_bias_dropout_dispatch<
+    ck_tile::fp16_t,
+    false,
+    false,
+    false,
+    32>(GroupedForwardParams& param, hipStream_t stream);
+
+extern template void run_grouped_forward_causalmask_bias_dropout_dispatch<
+    ck_tile::fp16_t,
+    true,
+    true,
+    true,
+    64>(GroupedForwardParams& param, hipStream_t stream);
+
+extern template void run_grouped_forward_causalmask_bias_dropout_dispatch<
+    ck_tile::fp16_t,
+    false,
+    true,
+    true,
+    64>(GroupedForwardParams& param, hipStream_t stream);
+
+extern template void run_grouped_forward_causalmask_bias_dropout_dispatch<
+    ck_tile::fp16_t,
+    true,
+    true,
+    false,
+    64>(GroupedForwardParams& param, hipStream_t stream);
+
+extern template void run_grouped_forward_causalmask_bias_dropout_dispatch<
+    ck_tile::fp16_t,
+    false,
+    true,
+    false,
+    64>(GroupedForwardParams& param, hipStream_t stream);
+
+extern template void run_grouped_forward_causalmask_bias_dropout_dispatch<
+    ck_tile::fp16_t,
+    true,
+    false,
+    true,
+    64>(GroupedForwardParams& param, hipStream_t stream);
+
+extern template void run_grouped_forward_causalmask_bias_dropout_dispatch<
+    ck_tile::fp16_t,
+    false,
+    false,
+    true,
+    64>(GroupedForwardParams& param, hipStream_t stream);
+
+extern template void run_grouped_forward_causalmask_bias_dropout_dispatch<
+    ck_tile::fp16_t,
+    true,
+    false,
+    false,
+    64>(GroupedForwardParams& param, hipStream_t stream);
+
+extern template void run_grouped_forward_causalmask_bias_dropout_dispatch<
+    ck_tile::fp16_t,
+    false,
+    false,
+    false,
+    64>(GroupedForwardParams& param, hipStream_t stream);
+
+extern template void run_grouped_forward_causalmask_bias_dropout_dispatch<
+    ck_tile::fp16_t,
+    true,
+    true,
+    true,
+    128>(GroupedForwardParams& param, hipStream_t stream);
+
+extern template void run_grouped_forward_causalmask_bias_dropout_dispatch<
+    ck_tile::fp16_t,
+    false,
+    true,
+    true,
+    128>(GroupedForwardParams& param, hipStream_t stream);
+
+extern template void run_grouped_forward_causalmask_bias_dropout_dispatch<
+    ck_tile::fp16_t,
+    true,
+    true,
+    false,
+    128>(GroupedForwardParams& param, hipStream_t stream);
+
+extern template void run_grouped_forward_causalmask_bias_dropout_dispatch<
+    ck_tile::fp16_t,
+    false,
+    true,
+    false,
+    128>(GroupedForwardParams& param, hipStream_t stream);
+
+extern template void run_grouped_forward_causalmask_bias_dropout_dispatch<
+    ck_tile::fp16_t,
+    true,
+    false,
+    true,
+    128>(GroupedForwardParams& param, hipStream_t stream);
+
+extern template void run_grouped_forward_causalmask_bias_dropout_dispatch<
+    ck_tile::fp16_t,
+    false,
+    false,
+    true,
+    128>(GroupedForwardParams& param, hipStream_t stream);
+
+extern template void run_grouped_forward_causalmask_bias_dropout_dispatch<
+    ck_tile::fp16_t,
+    true,
+    false,
+    false,
+    128>(GroupedForwardParams& param, hipStream_t stream);
+
+extern template void run_grouped_forward_causalmask_bias_dropout_dispatch<
+    ck_tile::fp16_t,
+    false,
+    false,
+    false,
+    128>(GroupedForwardParams& param, hipStream_t stream);
+
+extern template void run_grouped_forward_causalmask_bias_dropout_dispatch<
+    ck_tile::fp16_t,
+    true,
+    true,
+    true,
+    256>(GroupedForwardParams& param, hipStream_t stream);
+
+extern template void run_grouped_forward_causalmask_bias_dropout_dispatch<
+    ck_tile::fp16_t,
+    false,
+    true,
+    true,
+    256>(GroupedForwardParams& param, hipStream_t stream);
+
+extern template void run_grouped_forward_causalmask_bias_dropout_dispatch<
+    ck_tile::fp16_t,
+    true,
+    true,
+    false,
+    256>(GroupedForwardParams& param, hipStream_t stream);
+
+extern template void run_grouped_forward_causalmask_bias_dropout_dispatch<
+    ck_tile::fp16_t,
+    false,
+    true,
+    false,
+    256>(GroupedForwardParams& param, hipStream_t stream);
+
+extern template void run_grouped_forward_causalmask_bias_dropout_dispatch<
+    ck_tile::fp16_t,
+    true,
+    false,
+    true,
+    256>(GroupedForwardParams& param, hipStream_t stream);
+
+extern template void run_grouped_forward_causalmask_bias_dropout_dispatch<
+    ck_tile::fp16_t,
+    false,
+    false,
+    true,
+    256>(GroupedForwardParams& param, hipStream_t stream);
+
+extern template void run_grouped_forward_causalmask_bias_dropout_dispatch<
+    ck_tile::fp16_t,
+    true,
+    false,
+    false,
+    256>(GroupedForwardParams& param, hipStream_t stream);
+
+extern template void run_grouped_forward_causalmask_bias_dropout_dispatch<
+    ck_tile::fp16_t,
+    false,
+    false,
+    false,
+    256>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bf16_instances_ref.hpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bf16_instances_ref.hpp
new file mode 100644
index 000000000..aa5c84146
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bf16_instances_ref.hpp
@@ -0,0 +1,236 @@
+
+/*
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
+ */
+
+#include <ck_tile/core/numeric/bfloat16.hpp>
+#include "ck_tiled_fmha_grouped_infer.h"
+
+extern template void run_grouped_infer_causalmask_bias_dropout_dispatch<
+    ck_tile::bf16_t,
+    true,
+    true,
+    true,
+    32>(GroupedForwardParams& param, hipStream_t stream);
+
+extern template void run_grouped_infer_causalmask_bias_dropout_dispatch<
+    ck_tile::bf16_t,
+    false,
+    true,
+    true,
+    32>(GroupedForwardParams& param, hipStream_t stream);
+
+extern template void run_grouped_infer_causalmask_bias_dropout_dispatch<
+    ck_tile::bf16_t,
+    true,
+    true,
+    false,
+    32>(GroupedForwardParams& param, hipStream_t stream);
+
+extern template void run_grouped_infer_causalmask_bias_dropout_dispatch<
+    ck_tile::bf16_t,
+    false,
+    true,
+    false,
+    32>(GroupedForwardParams& param, hipStream_t stream);
+
+extern template void run_grouped_infer_causalmask_bias_dropout_dispatch<
+    ck_tile::bf16_t,
+    true,
+    false,
+    true,
+    32>(GroupedForwardParams& param, hipStream_t stream);
+
+extern template void run_grouped_infer_causalmask_bias_dropout_dispatch<
+    ck_tile::bf16_t,
+    false,
+    false,
+    true,
+    32>(GroupedForwardParams& param, hipStream_t stream);
+
+extern template void run_grouped_infer_causalmask_bias_dropout_dispatch<
+    ck_tile::bf16_t,
+    true,
+    false,
+    false,
+    32>(GroupedForwardParams& param, hipStream_t stream);
+
+extern template void run_grouped_infer_causalmask_bias_dropout_dispatch<
+    ck_tile::bf16_t,
+    false,
+    false,
+    false,
+    32>(GroupedForwardParams& param, hipStream_t stream);
+
+extern template void run_grouped_infer_causalmask_bias_dropout_dispatch<
+    ck_tile::bf16_t,
+    true,
+    true,
+    true,
+    64>(GroupedForwardParams& param, hipStream_t stream);
+
+extern template void run_grouped_infer_causalmask_bias_dropout_dispatch<
+    ck_tile::bf16_t,
+    false,
+    true,
+    true,
+    64>(GroupedForwardParams& param, hipStream_t stream);
+
+extern template void run_grouped_infer_causalmask_bias_dropout_dispatch<
+    ck_tile::bf16_t,
+    true,
+    true,
+    false,
+    64>(GroupedForwardParams& param, hipStream_t stream);
+
+extern template void run_grouped_infer_causalmask_bias_dropout_dispatch<
+    ck_tile::bf16_t,
+    false,
+    true,
+    false,
+    64>(GroupedForwardParams& param, hipStream_t stream);
+
+extern template void run_grouped_infer_causalmask_bias_dropout_dispatch<
+    ck_tile::bf16_t,
+    true,
+    false,
+    true,
+    64>(GroupedForwardParams& param, hipStream_t stream);
+
+extern template void run_grouped_infer_causalmask_bias_dropout_dispatch<
+    ck_tile::bf16_t,
+    false,
+    false,
+    true,
+    64>(GroupedForwardParams& param, hipStream_t stream);
+
+extern template void run_grouped_infer_causalmask_bias_dropout_dispatch<
+    ck_tile::bf16_t,
+    true,
+    false,
+    false,
+    64>(GroupedForwardParams& param, hipStream_t stream);
+
+extern template void run_grouped_infer_causalmask_bias_dropout_dispatch<
+    ck_tile::bf16_t,
+    false,
+    false,
+    false,
+    64>(GroupedForwardParams& param, hipStream_t stream);
+
+extern template void run_grouped_infer_causalmask_bias_dropout_dispatch<
+    ck_tile::bf16_t,
+    true,
+    true,
+    true,
+    128>(GroupedForwardParams& param, hipStream_t stream);
+
+extern template void run_grouped_infer_causalmask_bias_dropout_dispatch<
+    ck_tile::bf16_t,
+    false,
+    true,
+    true,
+    128>(GroupedForwardParams& param, hipStream_t stream);
+
+extern template void run_grouped_infer_causalmask_bias_dropout_dispatch<
+    ck_tile::bf16_t,
+    true,
+    true,
+    false,
+    128>(GroupedForwardParams& param, hipStream_t stream);
+
+extern template void run_grouped_infer_causalmask_bias_dropout_dispatch<
+    ck_tile::bf16_t,
+    false,
+    true,
+    false,
+    128>(GroupedForwardParams& param, hipStream_t stream);
+
+extern template void run_grouped_infer_causalmask_bias_dropout_dispatch<
+    ck_tile::bf16_t,
+    true,
+    false,
+    true,
+    128>(GroupedForwardParams& param, hipStream_t stream);
+
+extern template void run_grouped_infer_causalmask_bias_dropout_dispatch<
+    ck_tile::bf16_t,
+    false,
+    false,
+    true,
+    128>(GroupedForwardParams& param, hipStream_t stream);
+
+extern template void run_grouped_infer_causalmask_bias_dropout_dispatch<
+    ck_tile::bf16_t,
+    true,
+    false,
+    false,
+    128>(GroupedForwardParams& param, hipStream_t stream);
+
+extern template void run_grouped_infer_causalmask_bias_dropout_dispatch<
+    ck_tile::bf16_t,
+    false,
+    false,
+    false,
+    128>(GroupedForwardParams& param, hipStream_t stream);
+
+extern template void run_grouped_infer_causalmask_bias_dropout_dispatch<
+    ck_tile::bf16_t,
+    true,
+    true,
+    true,
+    256>(GroupedForwardParams& param, hipStream_t stream);
+
+extern template void run_grouped_infer_causalmask_bias_dropout_dispatch<
+    ck_tile::bf16_t,
+    false,
+    true,
+    true,
+    256>(GroupedForwardParams& param, hipStream_t stream);
+
+extern template void run_grouped_infer_causalmask_bias_dropout_dispatch<
+    ck_tile::bf16_t,
+    true,
+    true,
+    false,
+    256>(GroupedForwardParams& param, hipStream_t stream);
+
+extern template void run_grouped_infer_causalmask_bias_dropout_dispatch<
+    ck_tile::bf16_t,
+    false,
+    true,
+    false,
+    256>(GroupedForwardParams& param, hipStream_t stream);
+
+extern template void run_grouped_infer_causalmask_bias_dropout_dispatch<
+    ck_tile::bf16_t,
+    true,
+    false,
+    true,
+    256>(GroupedForwardParams& param, hipStream_t stream);
+
+extern template void run_grouped_infer_causalmask_bias_dropout_dispatch<
+    ck_tile::bf16_t,
+    false,
+    false,
+    true,
+    256>(GroupedForwardParams& param, hipStream_t stream);
+
+extern template void run_grouped_infer_causalmask_bias_dropout_dispatch<
+    ck_tile::bf16_t,
+    true,
+    false,
+    false,
+    256>(GroupedForwardParams& param, hipStream_t stream);
+
+extern template void run_grouped_infer_causalmask_bias_dropout_dispatch<
+    ck_tile::bf16_t,
+    false,
+    false,
+    false,
+    256>(GroupedForwardParams& param, hipStream_t stream);
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_instances_ref.hpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_instances_ref.hpp
new file mode 100644
index 000000000..f3a5d8501
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_instances_ref.hpp
@@ -0,0 +1,236 @@
+
+/*
+  Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ *
+ * The file is automatically generated, don't modify!
+ */
+
+#include <ck_tile/core/numeric/half.hpp>
+#include "ck_tiled_fmha_grouped_infer.h"
+
+extern template void run_grouped_infer_causalmask_bias_dropout_dispatch<
+    ck_tile::fp16_t,
+    true,
+    true,
+    true,
+    32>(GroupedForwardParams& param, hipStream_t stream);
+
+extern template void run_grouped_infer_causalmask_bias_dropout_dispatch<
+    ck_tile::fp16_t,
+    false,
+    true,
+    true,
+    32>(GroupedForwardParams& param, hipStream_t stream);
+
+extern template void run_grouped_infer_causalmask_bias_dropout_dispatch<
+    ck_tile::fp16_t,
+    true,
+    true,
+    false,
+    32>(GroupedForwardParams& param, hipStream_t stream);
+
+extern template void run_grouped_infer_causalmask_bias_dropout_dispatch<
+    ck_tile::fp16_t,
+    false,
+    true,
+    false,
+    32>(GroupedForwardParams& param, hipStream_t stream);
+
+extern template void run_grouped_infer_causalmask_bias_dropout_dispatch<
+    ck_tile::fp16_t,
+    true,
+    false,
+    true,
+    32>(GroupedForwardParams& param, hipStream_t stream);
+
+extern template void run_grouped_infer_causalmask_bias_dropout_dispatch<
+    ck_tile::fp16_t,
+    false,
+    false,
+    true,
+    32>(GroupedForwardParams& param, hipStream_t stream);
+
+extern template void run_grouped_infer_causalmask_bias_dropout_dispatch<
+    ck_tile::fp16_t,
+    true,
+    false,
+    false,
+    32>(GroupedForwardParams& param, hipStream_t stream);
+
+extern template void run_grouped_infer_causalmask_bias_dropout_dispatch<
+    ck_tile::fp16_t,
+    false,
+    false,
+    false,
+    32>(GroupedForwardParams& param, hipStream_t stream);
+
+extern template void run_grouped_infer_causalmask_bias_dropout_dispatch<
+    ck_tile::fp16_t,
+    true,
+    true,
+    true,
+    64>(GroupedForwardParams& param, hipStream_t stream);
+
+extern template void run_grouped_infer_causalmask_bias_dropout_dispatch<
+    ck_tile::fp16_t,
+    false,
+    true,
+    true,
+    64>(GroupedForwardParams& param, hipStream_t stream);
+
+extern template void run_grouped_infer_causalmask_bias_dropout_dispatch<
+    ck_tile::fp16_t,
+    true,
+    true,
+    false,
+    64>(GroupedForwardParams& param, hipStream_t stream);
+
+extern template void run_grouped_infer_causalmask_bias_dropout_dispatch<
+    ck_tile::fp16_t,
+    false,
+    true,
+    false,
+    64>(GroupedForwardParams& param, hipStream_t stream);
+
+extern template void run_grouped_infer_causalmask_bias_dropout_dispatch<
+    ck_tile::fp16_t,
+    true,
+    false,
+    true,
+    64>(GroupedForwardParams& param, hipStream_t stream);
+
+extern template void run_grouped_infer_causalmask_bias_dropout_dispatch<
+    ck_tile::fp16_t,
+    false,
+    false,
+    true,
+    64>(GroupedForwardParams& param, hipStream_t stream);
+
+extern template void run_grouped_infer_causalmask_bias_dropout_dispatch<
+    ck_tile::fp16_t,
+    true,
+    false,
+    false,
+    64>(GroupedForwardParams& param, hipStream_t stream);
+
+extern template void run_grouped_infer_causalmask_bias_dropout_dispatch<
+    ck_tile::fp16_t,
+    false,
+    false,
+    false,
+    64>(GroupedForwardParams& param, hipStream_t stream);
+
+extern template void run_grouped_infer_causalmask_bias_dropout_dispatch<
+    ck_tile::fp16_t,
+    true,
+    true,
+    true,
+    128>(GroupedForwardParams& param, hipStream_t stream);
+
+extern template void run_grouped_infer_causalmask_bias_dropout_dispatch<
+    ck_tile::fp16_t,
+    false,
+    true,
+    true,
+    128>(GroupedForwardParams& param, hipStream_t stream);
+
+extern template void run_grouped_infer_causalmask_bias_dropout_dispatch<
+    ck_tile::fp16_t,
+    true,
+    true,
+    false,
+    128>(GroupedForwardParams& param, hipStream_t stream);
+
+extern template void run_grouped_infer_causalmask_bias_dropout_dispatch<
+    ck_tile::fp16_t,
+    false,
+    true,
+    false,
+    128>(GroupedForwardParams& param, hipStream_t stream);
+
+extern template void run_grouped_infer_causalmask_bias_dropout_dispatch<
+    ck_tile::fp16_t,
+    true,
+    false,
+    true,
+    128>(GroupedForwardParams& param, hipStream_t stream);
+
+extern template void run_grouped_infer_causalmask_bias_dropout_dispatch<
+    ck_tile::fp16_t,
+    false,
+    false,
+    true,
+    128>(GroupedForwardParams& param, hipStream_t stream);
+
+extern template void run_grouped_infer_causalmask_bias_dropout_dispatch<
+    ck_tile::fp16_t,
+    true,
+    false,
+    false,
+    128>(GroupedForwardParams& param, hipStream_t stream);
+
+extern template void run_grouped_infer_causalmask_bias_dropout_dispatch<
+    ck_tile::fp16_t,
+    false,
+    false,
+    false,
+    128>(GroupedForwardParams& param, hipStream_t stream);
+
+extern template void run_grouped_infer_causalmask_bias_dropout_dispatch<
+    ck_tile::fp16_t,
+    true,
+    true,
+    true,
+    256>(GroupedForwardParams& param, hipStream_t stream);
+
+extern template void run_grouped_infer_causalmask_bias_dropout_dispatch<
+    ck_tile::fp16_t,
+    false,
+    true,
+    true,
+    256>(GroupedForwardParams& param, hipStream_t stream);
+
+extern template void run_grouped_infer_causalmask_bias_dropout_dispatch<
+    ck_tile::fp16_t,
+    true,
+    true,
+    false,
+    256>(GroupedForwardParams& param, hipStream_t stream);
+
+extern template void run_grouped_infer_causalmask_bias_dropout_dispatch<
+    ck_tile::fp16_t,
+    false,
+    true,
+    false,
+    256>(GroupedForwardParams& param, hipStream_t stream);
+
+extern template void run_grouped_infer_causalmask_bias_dropout_dispatch<
+    ck_tile::fp16_t,
+    true,
+    false,
+    true,
+    256>(GroupedForwardParams& param, hipStream_t stream);
+
+extern template void run_grouped_infer_causalmask_bias_dropout_dispatch<
+    ck_tile::fp16_t,
+    false,
+    false,
+    true,
+    256>(GroupedForwardParams& param, hipStream_t stream);
+
+extern template void run_grouped_infer_causalmask_bias_dropout_dispatch<
+    ck_tile::fp16_t,
+    true,
+    false,
+    false,
+    256>(GroupedForwardParams& param, hipStream_t stream);
+
+extern template void run_grouped_infer_causalmask_bias_dropout_dispatch<
+    ck_tile::fp16_t,
+    false,
+    false,
+    false,
+    256>(GroupedForwardParams& param, hipStream_t stream);

From 73dbf32a4f59751bef5730b4c861b4a5abbdc14f Mon Sep 17 00:00:00 2001
From: Qianfeng Zhang <Qianfeng.Zhang@amd.com>
Date: Thu, 8 Aug 2024 07:14:57 +0000
Subject: [PATCH 607/641] Relax the RTOL of ckFwOp from 4e-4 to 3e-3 due to one
 big result case

---
 xformers/ops/fmha/ck.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/xformers/ops/fmha/ck.py b/xformers/ops/fmha/ck.py
index 365ff76eb..47ad90d2f 100644
--- a/xformers/ops/fmha/ck.py
+++ b/xformers/ops/fmha/ck.py
@@ -173,7 +173,7 @@ class FwOp(AttentionFwOpBase):
     }
     ERROR_RTOL: Mapping[torch.dtype, float] = {
         torch.float: 2e-5,
-        torch.half: 4e-4,
+        torch.half: 3e-3,
         torch.bfloat16: 2e-2,
     }
 

From 0e6d0c3c6c963169139e2ab03b330b67e9a68bd0 Mon Sep 17 00:00:00 2001
From: Qianfeng Zhang <Qianfeng.Zhang@amd.com>
Date: Mon, 12 Aug 2024 15:20:13 +0000
Subject: [PATCH 608/641] Change to use .h rather than .hpp as suffix for
 generated header files

---
 .../attention/hip_fmha/ck_tiled_fmha_batched_backward_bf16.cpp  | 2 +-
 .../attention/hip_fmha/ck_tiled_fmha_batched_backward_fp16.cpp  | 2 +-
 .../attention/hip_fmha/ck_tiled_fmha_batched_forward_bf16.cpp   | 2 +-
 .../attention/hip_fmha/ck_tiled_fmha_batched_forward_fp16.cpp   | 2 +-
 .../attention/hip_fmha/ck_tiled_fmha_batched_infer_bf16.cpp     | 2 +-
 .../attention/hip_fmha/ck_tiled_fmha_batched_infer_fp16.cpp     | 2 +-
 .../attention/hip_fmha/ck_tiled_fmha_grouped_backward_bf16.cpp  | 2 +-
 .../attention/hip_fmha/ck_tiled_fmha_grouped_backward_fp16.cpp  | 2 +-
 .../attention/hip_fmha/ck_tiled_fmha_grouped_forward_bf16.cpp   | 2 +-
 .../attention/hip_fmha/ck_tiled_fmha_grouped_forward_fp16.cpp   | 2 +-
 .../attention/hip_fmha/ck_tiled_fmha_grouped_infer_bf16.cpp     | 2 +-
 .../attention/hip_fmha/ck_tiled_fmha_grouped_infer_fp16.cpp     | 2 +-
 xformers/csrc/attention/hip_fmha/generate_instances.py          | 2 +-
 ...ances_ref.hpp => fmha_batched_backward_bf16_instances_ref.h} | 0
 ...ances_ref.hpp => fmha_batched_backward_fp16_instances_ref.h} | 0
 ...tances_ref.hpp => fmha_batched_forward_bf16_instances_ref.h} | 0
 ...tances_ref.hpp => fmha_batched_forward_fp16_instances_ref.h} | 0
 ...nstances_ref.hpp => fmha_batched_infer_bf16_instances_ref.h} | 0
 ...nstances_ref.hpp => fmha_batched_infer_fp16_instances_ref.h} | 0
 ...ances_ref.hpp => fmha_grouped_backward_bf16_instances_ref.h} | 0
 ...ances_ref.hpp => fmha_grouped_backward_fp16_instances_ref.h} | 0
 ...tances_ref.hpp => fmha_grouped_forward_bf16_instances_ref.h} | 0
 ...tances_ref.hpp => fmha_grouped_forward_fp16_instances_ref.h} | 0
 ...nstances_ref.hpp => fmha_grouped_infer_bf16_instances_ref.h} | 0
 ...nstances_ref.hpp => fmha_grouped_infer_fp16_instances_ref.h} | 0
 25 files changed, 13 insertions(+), 13 deletions(-)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_batched_backward_bf16_instances_ref.hpp => fmha_batched_backward_bf16_instances_ref.h} (100%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_batched_backward_fp16_instances_ref.hpp => fmha_batched_backward_fp16_instances_ref.h} (100%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_batched_forward_bf16_instances_ref.hpp => fmha_batched_forward_bf16_instances_ref.h} (100%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_batched_forward_fp16_instances_ref.hpp => fmha_batched_forward_fp16_instances_ref.h} (100%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_batched_infer_bf16_instances_ref.hpp => fmha_batched_infer_bf16_instances_ref.h} (100%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_batched_infer_fp16_instances_ref.hpp => fmha_batched_infer_fp16_instances_ref.h} (100%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_grouped_backward_bf16_instances_ref.hpp => fmha_grouped_backward_bf16_instances_ref.h} (100%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_grouped_backward_fp16_instances_ref.hpp => fmha_grouped_backward_fp16_instances_ref.h} (100%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_grouped_forward_bf16_instances_ref.hpp => fmha_grouped_forward_bf16_instances_ref.h} (100%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_grouped_forward_fp16_instances_ref.hpp => fmha_grouped_forward_fp16_instances_ref.h} (100%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_grouped_infer_bf16_instances_ref.hpp => fmha_grouped_infer_bf16_instances_ref.h} (100%)
 rename xformers/csrc/attention/hip_fmha/instances/{fmha_grouped_infer_fp16_instances_ref.hpp => fmha_grouped_infer_fp16_instances_ref.h} (100%)

diff --git a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_backward_bf16.cpp b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_backward_bf16.cpp
index 5352b9924..3cf339b83 100644
--- a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_backward_bf16.cpp
+++ b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_backward_bf16.cpp
@@ -11,7 +11,7 @@
 #include "ck_tiled_fmha_batched_backward.h"
 #include "ck_tiled_headdim_switch.h"
 
-#include "instances/fmha_batched_backward_bf16_instances_ref.hpp"
+#include "instances/fmha_batched_backward_bf16_instances_ref.h"
 
 void batched_backward_bf16(BatchedBackwardParams& param, hipStream_t stream) {
   const bool has_dropout = (param.dropout_prob > 0.0f);
diff --git a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_backward_fp16.cpp b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_backward_fp16.cpp
index a226bd5cc..807169ccd 100644
--- a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_backward_fp16.cpp
+++ b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_backward_fp16.cpp
@@ -11,7 +11,7 @@
 #include "ck_tiled_fmha_batched_backward.h"
 #include "ck_tiled_headdim_switch.h"
 
-#include "instances/fmha_batched_backward_fp16_instances_ref.hpp"
+#include "instances/fmha_batched_backward_fp16_instances_ref.h"
 
 void batched_backward_fp16(BatchedBackwardParams& param, hipStream_t stream) {
   const bool has_dropout = (param.dropout_prob > 0.0f);
diff --git a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_forward_bf16.cpp b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_forward_bf16.cpp
index 0dc988cd9..bd2e076e0 100644
--- a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_forward_bf16.cpp
+++ b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_forward_bf16.cpp
@@ -11,7 +11,7 @@
 #include "ck_tiled_fmha_batched_forward.h"
 #include "ck_tiled_headdim_switch.h"
 
-#include "instances/fmha_batched_forward_bf16_instances_ref.hpp"
+#include "instances/fmha_batched_forward_bf16_instances_ref.h"
 
 void batched_forward_bf16(BatchedForwardParams& param, hipStream_t stream) {
   const bool has_dropout = (param.dropout_prob > 0.0f);
diff --git a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_forward_fp16.cpp b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_forward_fp16.cpp
index 74ad4b74b..3c3791bdf 100644
--- a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_forward_fp16.cpp
+++ b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_forward_fp16.cpp
@@ -11,7 +11,7 @@
 #include "ck_tiled_fmha_batched_forward.h"
 #include "ck_tiled_headdim_switch.h"
 
-#include "instances/fmha_batched_forward_fp16_instances_ref.hpp"
+#include "instances/fmha_batched_forward_fp16_instances_ref.h"
 
 void batched_forward_fp16(BatchedForwardParams& param, hipStream_t stream) {
   const bool has_dropout = (param.dropout_prob > 0.0f);
diff --git a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_infer_bf16.cpp b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_infer_bf16.cpp
index 1a0123196..23b04d935 100644
--- a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_infer_bf16.cpp
+++ b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_infer_bf16.cpp
@@ -10,7 +10,7 @@
 #include "ck_tiled_bool_switch.h"
 #include "ck_tiled_fmha_batched_infer.h"
 
-#include "instances/fmha_batched_infer_bf16_instances_ref.hpp"
+#include "instances/fmha_batched_infer_bf16_instances_ref.h"
 
 void batched_infer_bf16(BatchedForwardParams& param, hipStream_t stream) {
   const bool has_dropout = (param.dropout_prob > 0.0f);
diff --git a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_infer_fp16.cpp b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_infer_fp16.cpp
index c21a9ad57..4e1d99e8e 100644
--- a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_infer_fp16.cpp
+++ b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_infer_fp16.cpp
@@ -10,7 +10,7 @@
 #include "ck_tiled_bool_switch.h"
 #include "ck_tiled_fmha_batched_infer.h"
 
-#include "instances/fmha_batched_infer_fp16_instances_ref.hpp"
+#include "instances/fmha_batched_infer_fp16_instances_ref.h"
 
 void batched_infer_fp16(BatchedForwardParams& param, hipStream_t stream) {
   const bool has_dropout = (param.dropout_prob > 0.0f);
diff --git a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_backward_bf16.cpp b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_backward_bf16.cpp
index 51dd8a507..7b77442be 100644
--- a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_backward_bf16.cpp
+++ b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_backward_bf16.cpp
@@ -11,7 +11,7 @@
 #include "ck_tiled_fmha_grouped_backward.h"
 #include "ck_tiled_headdim_switch.h"
 
-#include "instances/fmha_grouped_backward_bf16_instances_ref.hpp"
+#include "instances/fmha_grouped_backward_bf16_instances_ref.h"
 
 void grouped_backward_bf16(GroupedBackwardParams& param, hipStream_t stream) {
   const bool has_dropout = (param.dropout_prob > 0.0f);
diff --git a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_backward_fp16.cpp b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_backward_fp16.cpp
index 6fa6f1be9..be47bbdbb 100644
--- a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_backward_fp16.cpp
+++ b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_backward_fp16.cpp
@@ -11,7 +11,7 @@
 #include "ck_tiled_fmha_grouped_backward.h"
 #include "ck_tiled_headdim_switch.h"
 
-#include "instances/fmha_grouped_backward_fp16_instances_ref.hpp"
+#include "instances/fmha_grouped_backward_fp16_instances_ref.h"
 
 void grouped_backward_fp16(GroupedBackwardParams& param, hipStream_t stream) {
   const bool has_dropout = (param.dropout_prob > 0.0f);
diff --git a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_forward_bf16.cpp b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_forward_bf16.cpp
index ff14095fa..28d75ddc5 100644
--- a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_forward_bf16.cpp
+++ b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_forward_bf16.cpp
@@ -11,7 +11,7 @@
 #include "ck_tiled_fmha_grouped_forward.h"
 #include "ck_tiled_headdim_switch.h"
 
-#include "instances/fmha_grouped_forward_bf16_instances_ref.hpp"
+#include "instances/fmha_grouped_forward_bf16_instances_ref.h"
 
 void grouped_forward_bf16(GroupedForwardParams& param, hipStream_t stream) {
   const bool has_dropout = (param.dropout_prob > 0.0f);
diff --git a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_forward_fp16.cpp b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_forward_fp16.cpp
index 1ac4c195b..31e28bad6 100644
--- a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_forward_fp16.cpp
+++ b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_forward_fp16.cpp
@@ -11,7 +11,7 @@
 #include "ck_tiled_fmha_grouped_forward.h"
 #include "ck_tiled_headdim_switch.h"
 
-#include "instances/fmha_grouped_forward_fp16_instances_ref.hpp"
+#include "instances/fmha_grouped_forward_fp16_instances_ref.h"
 
 void grouped_forward_fp16(GroupedForwardParams& param, hipStream_t stream) {
   const bool has_dropout = (param.dropout_prob > 0.0f);
diff --git a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_infer_bf16.cpp b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_infer_bf16.cpp
index f780f7de1..090227c1d 100644
--- a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_infer_bf16.cpp
+++ b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_infer_bf16.cpp
@@ -10,7 +10,7 @@
 #include "ck_tiled_bool_switch.h"
 #include "ck_tiled_fmha_grouped_infer.h"
 
-#include "instances/fmha_grouped_infer_bf16_instances_ref.hpp"
+#include "instances/fmha_grouped_infer_bf16_instances_ref.h"
 
 void grouped_infer_bf16(GroupedForwardParams& param, hipStream_t stream) {
   const bool has_dropout = (param.dropout_prob > 0.0f);
diff --git a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_infer_fp16.cpp b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_infer_fp16.cpp
index e538029c5..62c774ff5 100644
--- a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_infer_fp16.cpp
+++ b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_infer_fp16.cpp
@@ -10,7 +10,7 @@
 #include "ck_tiled_bool_switch.h"
 #include "ck_tiled_fmha_grouped_infer.h"
 
-#include "instances/fmha_grouped_infer_fp16_instances_ref.hpp"
+#include "instances/fmha_grouped_infer_fp16_instances_ref.h"
 
 void grouped_infer_fp16(GroupedForwardParams& param, hipStream_t stream) {
   const bool has_dropout = (param.dropout_prob > 0.0f);
diff --git a/xformers/csrc/attention/hip_fmha/generate_instances.py b/xformers/csrc/attention/hip_fmha/generate_instances.py
index 2fb6891b4..ff72c17bb 100644
--- a/xformers/csrc/attention/hip_fmha/generate_instances.py
+++ b/xformers/csrc/attention/hip_fmha/generate_instances.py
@@ -71,7 +71,7 @@
 FMHA_BACKWARD_INSTANCE_FNAME = "fmha_{mode}_backward_{dtype_str}_{has_or_no_causalmask_str}_"\
                                "{has_or_no_bias_str}_{has_or_no_biasgrad_str}_{has_or_no_dropout_str}_{max_k_str}.cpp"
 
-FMHA_INSTANCE_REF_FNAME = "instances/fmha_{mode}_{function}_{dtype}_instances_ref.hpp"
+FMHA_INSTANCE_REF_FNAME = "instances/fmha_{mode}_{function}_{dtype}_instances_ref.h"
 
 BOOL_MAP = {
     True : "true",
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_instances_ref.hpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_instances_ref.h
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_instances_ref.hpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_instances_ref.h
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_instances_ref.hpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_instances_ref.h
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_instances_ref.hpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_instances_ref.h
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bf16_instances_ref.hpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bf16_instances_ref.h
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bf16_instances_ref.hpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bf16_instances_ref.h
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_instances_ref.hpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_instances_ref.h
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_instances_ref.hpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_instances_ref.h
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bf16_instances_ref.hpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bf16_instances_ref.h
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bf16_instances_ref.hpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bf16_instances_ref.h
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_instances_ref.hpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_instances_ref.h
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_instances_ref.hpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_instances_ref.h
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_instances_ref.hpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_instances_ref.h
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_instances_ref.hpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_instances_ref.h
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_instances_ref.hpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_instances_ref.h
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_instances_ref.hpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_instances_ref.h
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bf16_instances_ref.hpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bf16_instances_ref.h
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bf16_instances_ref.hpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bf16_instances_ref.h
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_instances_ref.hpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_instances_ref.h
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_instances_ref.hpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_instances_ref.h
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bf16_instances_ref.hpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bf16_instances_ref.h
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bf16_instances_ref.hpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bf16_instances_ref.h
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_instances_ref.hpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_instances_ref.h
similarity index 100%
rename from xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_instances_ref.hpp
rename to xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_instances_ref.h

From 914ccc582124c628784c8907c1cb33c3caa2bba4 Mon Sep 17 00:00:00 2001
From: Qianfeng Zhang <Qianfeng.Zhang@amd.com>
Date: Mon, 12 Aug 2024 15:24:32 +0000
Subject: [PATCH 609/641] Fix in .gitignore

---
 .gitignore | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/.gitignore b/.gitignore
index 8c6455c1b..b37d0b1b5 100644
--- a/.gitignore
+++ b/.gitignore
@@ -67,6 +67,7 @@ xformers/csrc/attention/hip_fmha/*.hip
 xformers/csrc/attention/hip_fmha/*_hip.h
 xformers/csrc/attention/hip_fmha/instances/*.cu
 xformers/csrc/attention/hip_fmha/instances/*.hip
-xformers/csrc/attention/hip_fmha/instances_tiled/*.cu
-xformers/csrc/attention/hip_fmha/instances_tiled/*.hip
+xformers/csrc/attention/hip_fmha/instances/*.cu
+xformers/csrc/attention/hip_fmha/instances/*.hip
+xformers/csrc/attention/hip_fmha/instances/*_hip.h
 

From 8503f87070cbadcd72d0004980d8c6c27f688d9f Mon Sep 17 00:00:00 2001
From: Qianfeng Zhang <Qianfeng.Zhang@amd.com>
Date: Mon, 12 Aug 2024 15:26:55 +0000
Subject: [PATCH 610/641] Update to bwd setting to use only IGLP pipeline

---
 .../csrc/attention/hip_fmha/ck_tiled_fmha_bwd_setting.h     | 6 ------
 1 file changed, 6 deletions(-)

diff --git a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_bwd_setting.h b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_bwd_setting.h
index 64f16dbb5..96125d619 100644
--- a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_bwd_setting.h
+++ b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_bwd_setting.h
@@ -153,12 +153,6 @@ struct FmhaBwdShape<256> : ck_tile::TileFmhaBwdShape<
 
 template <ck_tile::index_t MaxK, bool kPadHeadDimQK, bool kPadHeadDimV>
 struct FmhaBwdPipelineEnumSelector {
-  static constexpr ck_tile::BlockFmhaBwdPipelineEnum value =
-      ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR;
-};
-
-template <ck_tile::index_t MaxK>
-struct FmhaBwdPipelineEnumSelector<MaxK, false, false> {
   static constexpr ck_tile::BlockFmhaBwdPipelineEnum value =
       ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP;
 };

From bfe164d191d8391a00de64d8d2ba8e83c1616f35 Mon Sep 17 00:00:00 2001
From: Qianfeng Zhang <Qianfeng.Zhang@amd.com>
Date: Mon, 12 Aug 2024 15:46:00 +0000
Subject: [PATCH 611/641] Synchronize to latest ck_tile fix and align the
 headdim64 tile shape setting

---
 third_party/composable_kernel_tiled                |  2 +-
 .../attention/hip_fmha/ck_tiled_fmha_bwd_setting.h | 14 +++++++-------
 2 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/third_party/composable_kernel_tiled b/third_party/composable_kernel_tiled
index 0178da6f5..17c97f581 160000
--- a/third_party/composable_kernel_tiled
+++ b/third_party/composable_kernel_tiled
@@ -1 +1 @@
-Subproject commit 0178da6f5071171df3362bb9d419b4da0feb3765
+Subproject commit 17c97f581456dae128b7a6dddd9ec02dacedbd0e
diff --git a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_bwd_setting.h b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_bwd_setting.h
index 96125d619..9e2ba4818 100644
--- a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_bwd_setting.h
+++ b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_bwd_setting.h
@@ -64,10 +64,10 @@ struct FmhaBwdBlockTile<32> {
 
 template <>
 struct FmhaBwdBlockTile<64> {
-  using tile_lengths = ck_tile::sequence<64, 128, 64, 64, 64, 64, 64, 64, 64>;
+  using tile_lengths = ck_tile::sequence<32, 128, 64, 32, 64, 32, 32, 64, 64>;
   using gemm02_warps = ck_tile::sequence<1, 4, 1>; // default for gemm0/gemm2
   using gemm13_warps = ck_tile::sequence<4, 1, 1>; // default for gemm1/gemm3
-  using gemm4_warps = ck_tile::sequence<2, 2, 1>; // default for gemm4
+  using gemm4_warps = ck_tile::sequence<1, 4, 1>; // default for gemm4
 };
 
 template <>
@@ -113,15 +113,15 @@ template <>
 struct FmhaBwdShape<64> : ck_tile::TileFmhaBwdShape<
                               typename FmhaBwdBlockTile<64>::tile_lengths,
                               typename FmhaBwdBlockTile<64>::gemm02_warps,
-                              FmhaBwdWarpTile1,
+                              FmhaBwdWarpTile2,
                               typename FmhaBwdBlockTile<64>::gemm13_warps,
-                              FmhaBwdWarpTile1,
+                              FmhaBwdWarpTile3,
                               typename FmhaBwdBlockTile<64>::gemm02_warps,
-                              FmhaBwdWarpTile1,
+                              FmhaBwdWarpTile2,
                               typename FmhaBwdBlockTile<64>::gemm13_warps,
-                              FmhaBwdWarpTile1,
+                              FmhaBwdWarpTile3,
                               typename FmhaBwdBlockTile<64>::gemm4_warps,
-                              FmhaBwdWarpTile1> {};
+                              FmhaBwdWarpTile2> {};
 
 template <>
 struct FmhaBwdShape<128> : ck_tile::TileFmhaBwdShape<

From f75c3b27ea8d15cc845b3863dbfd386ca686bcdb Mon Sep 17 00:00:00 2001
From: Qianfeng Zhang <Qianfeng.Zhang@amd.com>
Date: Mon, 12 Aug 2024 16:34:51 +0000
Subject: [PATCH 612/641] Reformat the generated instances cpp files

---
 ...has_causalmask_has_bias_has_biasgrad_has_dropout_maxk_128.cpp | 1 -
 ...has_causalmask_has_bias_has_biasgrad_has_dropout_maxk_256.cpp | 1 -
 ..._has_causalmask_has_bias_has_biasgrad_has_dropout_maxk_32.cpp | 1 -
 ..._has_causalmask_has_bias_has_biasgrad_has_dropout_maxk_64.cpp | 1 -
 ..._has_causalmask_has_bias_has_biasgrad_no_dropout_maxk_128.cpp | 1 -
 ..._has_causalmask_has_bias_has_biasgrad_no_dropout_maxk_256.cpp | 1 -
 ...6_has_causalmask_has_bias_has_biasgrad_no_dropout_maxk_32.cpp | 1 -
 ...6_has_causalmask_has_bias_has_biasgrad_no_dropout_maxk_64.cpp | 1 -
 ..._has_causalmask_has_bias_no_biasgrad_has_dropout_maxk_128.cpp | 1 -
 ..._has_causalmask_has_bias_no_biasgrad_has_dropout_maxk_256.cpp | 1 -
 ...6_has_causalmask_has_bias_no_biasgrad_has_dropout_maxk_32.cpp | 1 -
 ...6_has_causalmask_has_bias_no_biasgrad_has_dropout_maxk_64.cpp | 1 -
 ...6_has_causalmask_has_bias_no_biasgrad_no_dropout_maxk_128.cpp | 1 -
 ...6_has_causalmask_has_bias_no_biasgrad_no_dropout_maxk_256.cpp | 1 -
 ...16_has_causalmask_has_bias_no_biasgrad_no_dropout_maxk_32.cpp | 1 -
 ...16_has_causalmask_has_bias_no_biasgrad_no_dropout_maxk_64.cpp | 1 -
 ...6_has_causalmask_no_bias_no_biasgrad_has_dropout_maxk_128.cpp | 1 -
 ...6_has_causalmask_no_bias_no_biasgrad_has_dropout_maxk_256.cpp | 1 -
 ...16_has_causalmask_no_bias_no_biasgrad_has_dropout_maxk_32.cpp | 1 -
 ...16_has_causalmask_no_bias_no_biasgrad_has_dropout_maxk_64.cpp | 1 -
 ...16_has_causalmask_no_bias_no_biasgrad_no_dropout_maxk_128.cpp | 1 -
 ...16_has_causalmask_no_bias_no_biasgrad_no_dropout_maxk_256.cpp | 1 -
 ...f16_has_causalmask_no_bias_no_biasgrad_no_dropout_maxk_32.cpp | 1 -
 ...f16_has_causalmask_no_bias_no_biasgrad_no_dropout_maxk_64.cpp | 1 -
 ..._no_causalmask_has_bias_has_biasgrad_has_dropout_maxk_128.cpp | 1 -
 ..._no_causalmask_has_bias_has_biasgrad_has_dropout_maxk_256.cpp | 1 -
 ...6_no_causalmask_has_bias_has_biasgrad_has_dropout_maxk_32.cpp | 1 -
 ...6_no_causalmask_has_bias_has_biasgrad_has_dropout_maxk_64.cpp | 1 -
 ...6_no_causalmask_has_bias_has_biasgrad_no_dropout_maxk_128.cpp | 1 -
 ...6_no_causalmask_has_bias_has_biasgrad_no_dropout_maxk_256.cpp | 1 -
 ...16_no_causalmask_has_bias_has_biasgrad_no_dropout_maxk_32.cpp | 1 -
 ...16_no_causalmask_has_bias_has_biasgrad_no_dropout_maxk_64.cpp | 1 -
 ...6_no_causalmask_has_bias_no_biasgrad_has_dropout_maxk_128.cpp | 1 -
 ...6_no_causalmask_has_bias_no_biasgrad_has_dropout_maxk_256.cpp | 1 -
 ...16_no_causalmask_has_bias_no_biasgrad_has_dropout_maxk_32.cpp | 1 -
 ...16_no_causalmask_has_bias_no_biasgrad_has_dropout_maxk_64.cpp | 1 -
 ...16_no_causalmask_has_bias_no_biasgrad_no_dropout_maxk_128.cpp | 1 -
 ...16_no_causalmask_has_bias_no_biasgrad_no_dropout_maxk_256.cpp | 1 -
 ...f16_no_causalmask_has_bias_no_biasgrad_no_dropout_maxk_32.cpp | 1 -
 ...f16_no_causalmask_has_bias_no_biasgrad_no_dropout_maxk_64.cpp | 1 -
 ...16_no_causalmask_no_bias_no_biasgrad_has_dropout_maxk_128.cpp | 1 -
 ...16_no_causalmask_no_bias_no_biasgrad_has_dropout_maxk_256.cpp | 1 -
 ...f16_no_causalmask_no_bias_no_biasgrad_has_dropout_maxk_32.cpp | 1 -
 ...f16_no_causalmask_no_bias_no_biasgrad_has_dropout_maxk_64.cpp | 1 -
 ...f16_no_causalmask_no_bias_no_biasgrad_no_dropout_maxk_128.cpp | 1 -
 ...f16_no_causalmask_no_bias_no_biasgrad_no_dropout_maxk_256.cpp | 1 -
 ...bf16_no_causalmask_no_bias_no_biasgrad_no_dropout_maxk_32.cpp | 1 -
 ...bf16_no_causalmask_no_bias_no_biasgrad_no_dropout_maxk_64.cpp | 1 -
 ...has_causalmask_has_bias_has_biasgrad_has_dropout_maxk_128.cpp | 1 -
 ...has_causalmask_has_bias_has_biasgrad_has_dropout_maxk_256.cpp | 1 -
 ..._has_causalmask_has_bias_has_biasgrad_has_dropout_maxk_32.cpp | 1 -
 ..._has_causalmask_has_bias_has_biasgrad_has_dropout_maxk_64.cpp | 1 -
 ..._has_causalmask_has_bias_has_biasgrad_no_dropout_maxk_128.cpp | 1 -
 ..._has_causalmask_has_bias_has_biasgrad_no_dropout_maxk_256.cpp | 1 -
 ...6_has_causalmask_has_bias_has_biasgrad_no_dropout_maxk_32.cpp | 1 -
 ...6_has_causalmask_has_bias_has_biasgrad_no_dropout_maxk_64.cpp | 1 -
 ..._has_causalmask_has_bias_no_biasgrad_has_dropout_maxk_128.cpp | 1 -
 ..._has_causalmask_has_bias_no_biasgrad_has_dropout_maxk_256.cpp | 1 -
 ...6_has_causalmask_has_bias_no_biasgrad_has_dropout_maxk_32.cpp | 1 -
 ...6_has_causalmask_has_bias_no_biasgrad_has_dropout_maxk_64.cpp | 1 -
 ...6_has_causalmask_has_bias_no_biasgrad_no_dropout_maxk_128.cpp | 1 -
 ...6_has_causalmask_has_bias_no_biasgrad_no_dropout_maxk_256.cpp | 1 -
 ...16_has_causalmask_has_bias_no_biasgrad_no_dropout_maxk_32.cpp | 1 -
 ...16_has_causalmask_has_bias_no_biasgrad_no_dropout_maxk_64.cpp | 1 -
 ...6_has_causalmask_no_bias_no_biasgrad_has_dropout_maxk_128.cpp | 1 -
 ...6_has_causalmask_no_bias_no_biasgrad_has_dropout_maxk_256.cpp | 1 -
 ...16_has_causalmask_no_bias_no_biasgrad_has_dropout_maxk_32.cpp | 1 -
 ...16_has_causalmask_no_bias_no_biasgrad_has_dropout_maxk_64.cpp | 1 -
 ...16_has_causalmask_no_bias_no_biasgrad_no_dropout_maxk_128.cpp | 1 -
 ...16_has_causalmask_no_bias_no_biasgrad_no_dropout_maxk_256.cpp | 1 -
 ...p16_has_causalmask_no_bias_no_biasgrad_no_dropout_maxk_32.cpp | 1 -
 ...p16_has_causalmask_no_bias_no_biasgrad_no_dropout_maxk_64.cpp | 1 -
 ..._no_causalmask_has_bias_has_biasgrad_has_dropout_maxk_128.cpp | 1 -
 ..._no_causalmask_has_bias_has_biasgrad_has_dropout_maxk_256.cpp | 1 -
 ...6_no_causalmask_has_bias_has_biasgrad_has_dropout_maxk_32.cpp | 1 -
 ...6_no_causalmask_has_bias_has_biasgrad_has_dropout_maxk_64.cpp | 1 -
 ...6_no_causalmask_has_bias_has_biasgrad_no_dropout_maxk_128.cpp | 1 -
 ...6_no_causalmask_has_bias_has_biasgrad_no_dropout_maxk_256.cpp | 1 -
 ...16_no_causalmask_has_bias_has_biasgrad_no_dropout_maxk_32.cpp | 1 -
 ...16_no_causalmask_has_bias_has_biasgrad_no_dropout_maxk_64.cpp | 1 -
 ...6_no_causalmask_has_bias_no_biasgrad_has_dropout_maxk_128.cpp | 1 -
 ...6_no_causalmask_has_bias_no_biasgrad_has_dropout_maxk_256.cpp | 1 -
 ...16_no_causalmask_has_bias_no_biasgrad_has_dropout_maxk_32.cpp | 1 -
 ...16_no_causalmask_has_bias_no_biasgrad_has_dropout_maxk_64.cpp | 1 -
 ...16_no_causalmask_has_bias_no_biasgrad_no_dropout_maxk_128.cpp | 1 -
 ...16_no_causalmask_has_bias_no_biasgrad_no_dropout_maxk_256.cpp | 1 -
 ...p16_no_causalmask_has_bias_no_biasgrad_no_dropout_maxk_32.cpp | 1 -
 ...p16_no_causalmask_has_bias_no_biasgrad_no_dropout_maxk_64.cpp | 1 -
 ...16_no_causalmask_no_bias_no_biasgrad_has_dropout_maxk_128.cpp | 1 -
 ...16_no_causalmask_no_bias_no_biasgrad_has_dropout_maxk_256.cpp | 1 -
 ...p16_no_causalmask_no_bias_no_biasgrad_has_dropout_maxk_32.cpp | 1 -
 ...p16_no_causalmask_no_bias_no_biasgrad_has_dropout_maxk_64.cpp | 1 -
 ...p16_no_causalmask_no_bias_no_biasgrad_no_dropout_maxk_128.cpp | 1 -
 ...p16_no_causalmask_no_bias_no_biasgrad_no_dropout_maxk_256.cpp | 1 -
 ...fp16_no_causalmask_no_bias_no_biasgrad_no_dropout_maxk_32.cpp | 1 -
 ...fp16_no_causalmask_no_bias_no_biasgrad_no_dropout_maxk_64.cpp | 1 -
 ...forward_bf16_has_causalmask_has_bias_has_dropout_maxk_128.cpp | 1 -
 ...forward_bf16_has_causalmask_has_bias_has_dropout_maxk_256.cpp | 1 -
 ..._forward_bf16_has_causalmask_has_bias_has_dropout_maxk_32.cpp | 1 -
 ..._forward_bf16_has_causalmask_has_bias_has_dropout_maxk_64.cpp | 1 -
 ..._forward_bf16_has_causalmask_has_bias_no_dropout_maxk_128.cpp | 1 -
 ..._forward_bf16_has_causalmask_has_bias_no_dropout_maxk_256.cpp | 1 -
 ...d_forward_bf16_has_causalmask_has_bias_no_dropout_maxk_32.cpp | 1 -
 ...d_forward_bf16_has_causalmask_has_bias_no_dropout_maxk_64.cpp | 1 -
 ..._forward_bf16_has_causalmask_no_bias_has_dropout_maxk_128.cpp | 1 -
 ..._forward_bf16_has_causalmask_no_bias_has_dropout_maxk_256.cpp | 1 -
 ...d_forward_bf16_has_causalmask_no_bias_has_dropout_maxk_32.cpp | 1 -
 ...d_forward_bf16_has_causalmask_no_bias_has_dropout_maxk_64.cpp | 1 -
 ...d_forward_bf16_has_causalmask_no_bias_no_dropout_maxk_128.cpp | 1 -
 ...d_forward_bf16_has_causalmask_no_bias_no_dropout_maxk_256.cpp | 1 -
 ...ed_forward_bf16_has_causalmask_no_bias_no_dropout_maxk_32.cpp | 1 -
 ...ed_forward_bf16_has_causalmask_no_bias_no_dropout_maxk_64.cpp | 1 -
 ..._forward_bf16_no_causalmask_has_bias_has_dropout_maxk_128.cpp | 1 -
 ..._forward_bf16_no_causalmask_has_bias_has_dropout_maxk_256.cpp | 1 -
 ...d_forward_bf16_no_causalmask_has_bias_has_dropout_maxk_32.cpp | 1 -
 ...d_forward_bf16_no_causalmask_has_bias_has_dropout_maxk_64.cpp | 1 -
 ...d_forward_bf16_no_causalmask_has_bias_no_dropout_maxk_128.cpp | 1 -
 ...d_forward_bf16_no_causalmask_has_bias_no_dropout_maxk_256.cpp | 1 -
 ...ed_forward_bf16_no_causalmask_has_bias_no_dropout_maxk_32.cpp | 1 -
 ...ed_forward_bf16_no_causalmask_has_bias_no_dropout_maxk_64.cpp | 1 -
 ...d_forward_bf16_no_causalmask_no_bias_has_dropout_maxk_128.cpp | 1 -
 ...d_forward_bf16_no_causalmask_no_bias_has_dropout_maxk_256.cpp | 1 -
 ...ed_forward_bf16_no_causalmask_no_bias_has_dropout_maxk_32.cpp | 1 -
 ...ed_forward_bf16_no_causalmask_no_bias_has_dropout_maxk_64.cpp | 1 -
 ...ed_forward_bf16_no_causalmask_no_bias_no_dropout_maxk_128.cpp | 1 -
 ...ed_forward_bf16_no_causalmask_no_bias_no_dropout_maxk_256.cpp | 1 -
 ...hed_forward_bf16_no_causalmask_no_bias_no_dropout_maxk_32.cpp | 1 -
 ...hed_forward_bf16_no_causalmask_no_bias_no_dropout_maxk_64.cpp | 1 -
 ...forward_fp16_has_causalmask_has_bias_has_dropout_maxk_128.cpp | 1 -
 ...forward_fp16_has_causalmask_has_bias_has_dropout_maxk_256.cpp | 1 -
 ..._forward_fp16_has_causalmask_has_bias_has_dropout_maxk_32.cpp | 1 -
 ..._forward_fp16_has_causalmask_has_bias_has_dropout_maxk_64.cpp | 1 -
 ..._forward_fp16_has_causalmask_has_bias_no_dropout_maxk_128.cpp | 1 -
 ..._forward_fp16_has_causalmask_has_bias_no_dropout_maxk_256.cpp | 1 -
 ...d_forward_fp16_has_causalmask_has_bias_no_dropout_maxk_32.cpp | 1 -
 ...d_forward_fp16_has_causalmask_has_bias_no_dropout_maxk_64.cpp | 1 -
 ..._forward_fp16_has_causalmask_no_bias_has_dropout_maxk_128.cpp | 1 -
 ..._forward_fp16_has_causalmask_no_bias_has_dropout_maxk_256.cpp | 1 -
 ...d_forward_fp16_has_causalmask_no_bias_has_dropout_maxk_32.cpp | 1 -
 ...d_forward_fp16_has_causalmask_no_bias_has_dropout_maxk_64.cpp | 1 -
 ...d_forward_fp16_has_causalmask_no_bias_no_dropout_maxk_128.cpp | 1 -
 ...d_forward_fp16_has_causalmask_no_bias_no_dropout_maxk_256.cpp | 1 -
 ...ed_forward_fp16_has_causalmask_no_bias_no_dropout_maxk_32.cpp | 1 -
 ...ed_forward_fp16_has_causalmask_no_bias_no_dropout_maxk_64.cpp | 1 -
 ..._forward_fp16_no_causalmask_has_bias_has_dropout_maxk_128.cpp | 1 -
 ..._forward_fp16_no_causalmask_has_bias_has_dropout_maxk_256.cpp | 1 -
 ...d_forward_fp16_no_causalmask_has_bias_has_dropout_maxk_32.cpp | 1 -
 ...d_forward_fp16_no_causalmask_has_bias_has_dropout_maxk_64.cpp | 1 -
 ...d_forward_fp16_no_causalmask_has_bias_no_dropout_maxk_128.cpp | 1 -
 ...d_forward_fp16_no_causalmask_has_bias_no_dropout_maxk_256.cpp | 1 -
 ...ed_forward_fp16_no_causalmask_has_bias_no_dropout_maxk_32.cpp | 1 -
 ...ed_forward_fp16_no_causalmask_has_bias_no_dropout_maxk_64.cpp | 1 -
 ...d_forward_fp16_no_causalmask_no_bias_has_dropout_maxk_128.cpp | 1 -
 ...d_forward_fp16_no_causalmask_no_bias_has_dropout_maxk_256.cpp | 1 -
 ...ed_forward_fp16_no_causalmask_no_bias_has_dropout_maxk_32.cpp | 1 -
 ...ed_forward_fp16_no_causalmask_no_bias_has_dropout_maxk_64.cpp | 1 -
 ...ed_forward_fp16_no_causalmask_no_bias_no_dropout_maxk_128.cpp | 1 -
 ...ed_forward_fp16_no_causalmask_no_bias_no_dropout_maxk_256.cpp | 1 -
 ...hed_forward_fp16_no_causalmask_no_bias_no_dropout_maxk_32.cpp | 1 -
 ...hed_forward_fp16_no_causalmask_no_bias_no_dropout_maxk_64.cpp | 1 -
 ...d_infer_bf16_has_causalmask_has_bias_has_dropout_maxk_128.cpp | 1 -
 ...d_infer_bf16_has_causalmask_has_bias_has_dropout_maxk_256.cpp | 1 -
 ...ed_infer_bf16_has_causalmask_has_bias_has_dropout_maxk_32.cpp | 1 -
 ...ed_infer_bf16_has_causalmask_has_bias_has_dropout_maxk_64.cpp | 1 -
 ...ed_infer_bf16_has_causalmask_has_bias_no_dropout_maxk_128.cpp | 1 -
 ...ed_infer_bf16_has_causalmask_has_bias_no_dropout_maxk_256.cpp | 1 -
 ...hed_infer_bf16_has_causalmask_has_bias_no_dropout_maxk_32.cpp | 1 -
 ...hed_infer_bf16_has_causalmask_has_bias_no_dropout_maxk_64.cpp | 1 -
 ...ed_infer_bf16_has_causalmask_no_bias_has_dropout_maxk_128.cpp | 1 -
 ...ed_infer_bf16_has_causalmask_no_bias_has_dropout_maxk_256.cpp | 1 -
 ...hed_infer_bf16_has_causalmask_no_bias_has_dropout_maxk_32.cpp | 1 -
 ...hed_infer_bf16_has_causalmask_no_bias_has_dropout_maxk_64.cpp | 1 -
 ...hed_infer_bf16_has_causalmask_no_bias_no_dropout_maxk_128.cpp | 1 -
 ...hed_infer_bf16_has_causalmask_no_bias_no_dropout_maxk_256.cpp | 1 -
 ...ched_infer_bf16_has_causalmask_no_bias_no_dropout_maxk_32.cpp | 1 -
 ...ched_infer_bf16_has_causalmask_no_bias_no_dropout_maxk_64.cpp | 1 -
 ...ed_infer_bf16_no_causalmask_has_bias_has_dropout_maxk_128.cpp | 1 -
 ...ed_infer_bf16_no_causalmask_has_bias_has_dropout_maxk_256.cpp | 1 -
 ...hed_infer_bf16_no_causalmask_has_bias_has_dropout_maxk_32.cpp | 1 -
 ...hed_infer_bf16_no_causalmask_has_bias_has_dropout_maxk_64.cpp | 1 -
 ...hed_infer_bf16_no_causalmask_has_bias_no_dropout_maxk_128.cpp | 1 -
 ...hed_infer_bf16_no_causalmask_has_bias_no_dropout_maxk_256.cpp | 1 -
 ...ched_infer_bf16_no_causalmask_has_bias_no_dropout_maxk_32.cpp | 1 -
 ...ched_infer_bf16_no_causalmask_has_bias_no_dropout_maxk_64.cpp | 1 -
 ...hed_infer_bf16_no_causalmask_no_bias_has_dropout_maxk_128.cpp | 1 -
 ...hed_infer_bf16_no_causalmask_no_bias_has_dropout_maxk_256.cpp | 1 -
 ...ched_infer_bf16_no_causalmask_no_bias_has_dropout_maxk_32.cpp | 1 -
 ...ched_infer_bf16_no_causalmask_no_bias_has_dropout_maxk_64.cpp | 1 -
 ...ched_infer_bf16_no_causalmask_no_bias_no_dropout_maxk_128.cpp | 1 -
 ...ched_infer_bf16_no_causalmask_no_bias_no_dropout_maxk_256.cpp | 1 -
 ...tched_infer_bf16_no_causalmask_no_bias_no_dropout_maxk_32.cpp | 1 -
 ...tched_infer_bf16_no_causalmask_no_bias_no_dropout_maxk_64.cpp | 1 -
 ...d_infer_fp16_has_causalmask_has_bias_has_dropout_maxk_128.cpp | 1 -
 ...d_infer_fp16_has_causalmask_has_bias_has_dropout_maxk_256.cpp | 1 -
 ...ed_infer_fp16_has_causalmask_has_bias_has_dropout_maxk_32.cpp | 1 -
 ...ed_infer_fp16_has_causalmask_has_bias_has_dropout_maxk_64.cpp | 1 -
 ...ed_infer_fp16_has_causalmask_has_bias_no_dropout_maxk_128.cpp | 1 -
 ...ed_infer_fp16_has_causalmask_has_bias_no_dropout_maxk_256.cpp | 1 -
 ...hed_infer_fp16_has_causalmask_has_bias_no_dropout_maxk_32.cpp | 1 -
 ...hed_infer_fp16_has_causalmask_has_bias_no_dropout_maxk_64.cpp | 1 -
 ...ed_infer_fp16_has_causalmask_no_bias_has_dropout_maxk_128.cpp | 1 -
 ...ed_infer_fp16_has_causalmask_no_bias_has_dropout_maxk_256.cpp | 1 -
 ...hed_infer_fp16_has_causalmask_no_bias_has_dropout_maxk_32.cpp | 1 -
 ...hed_infer_fp16_has_causalmask_no_bias_has_dropout_maxk_64.cpp | 1 -
 ...hed_infer_fp16_has_causalmask_no_bias_no_dropout_maxk_128.cpp | 1 -
 ...hed_infer_fp16_has_causalmask_no_bias_no_dropout_maxk_256.cpp | 1 -
 ...ched_infer_fp16_has_causalmask_no_bias_no_dropout_maxk_32.cpp | 1 -
 ...ched_infer_fp16_has_causalmask_no_bias_no_dropout_maxk_64.cpp | 1 -
 ...ed_infer_fp16_no_causalmask_has_bias_has_dropout_maxk_128.cpp | 1 -
 ...ed_infer_fp16_no_causalmask_has_bias_has_dropout_maxk_256.cpp | 1 -
 ...hed_infer_fp16_no_causalmask_has_bias_has_dropout_maxk_32.cpp | 1 -
 ...hed_infer_fp16_no_causalmask_has_bias_has_dropout_maxk_64.cpp | 1 -
 ...hed_infer_fp16_no_causalmask_has_bias_no_dropout_maxk_128.cpp | 1 -
 ...hed_infer_fp16_no_causalmask_has_bias_no_dropout_maxk_256.cpp | 1 -
 ...ched_infer_fp16_no_causalmask_has_bias_no_dropout_maxk_32.cpp | 1 -
 ...ched_infer_fp16_no_causalmask_has_bias_no_dropout_maxk_64.cpp | 1 -
 ...hed_infer_fp16_no_causalmask_no_bias_has_dropout_maxk_128.cpp | 1 -
 ...hed_infer_fp16_no_causalmask_no_bias_has_dropout_maxk_256.cpp | 1 -
 ...ched_infer_fp16_no_causalmask_no_bias_has_dropout_maxk_32.cpp | 1 -
 ...ched_infer_fp16_no_causalmask_no_bias_has_dropout_maxk_64.cpp | 1 -
 ...ched_infer_fp16_no_causalmask_no_bias_no_dropout_maxk_128.cpp | 1 -
 ...ched_infer_fp16_no_causalmask_no_bias_no_dropout_maxk_256.cpp | 1 -
 ...tched_infer_fp16_no_causalmask_no_bias_no_dropout_maxk_32.cpp | 1 -
 ...tched_infer_fp16_no_causalmask_no_bias_no_dropout_maxk_64.cpp | 1 -
 ...has_causalmask_has_bias_has_biasgrad_has_dropout_maxk_128.cpp | 1 -
 ...has_causalmask_has_bias_has_biasgrad_has_dropout_maxk_256.cpp | 1 -
 ..._has_causalmask_has_bias_has_biasgrad_has_dropout_maxk_32.cpp | 1 -
 ..._has_causalmask_has_bias_has_biasgrad_has_dropout_maxk_64.cpp | 1 -
 ..._has_causalmask_has_bias_has_biasgrad_no_dropout_maxk_128.cpp | 1 -
 ..._has_causalmask_has_bias_has_biasgrad_no_dropout_maxk_256.cpp | 1 -
 ...6_has_causalmask_has_bias_has_biasgrad_no_dropout_maxk_32.cpp | 1 -
 ...6_has_causalmask_has_bias_has_biasgrad_no_dropout_maxk_64.cpp | 1 -
 ..._has_causalmask_has_bias_no_biasgrad_has_dropout_maxk_128.cpp | 1 -
 ..._has_causalmask_has_bias_no_biasgrad_has_dropout_maxk_256.cpp | 1 -
 ...6_has_causalmask_has_bias_no_biasgrad_has_dropout_maxk_32.cpp | 1 -
 ...6_has_causalmask_has_bias_no_biasgrad_has_dropout_maxk_64.cpp | 1 -
 ...6_has_causalmask_has_bias_no_biasgrad_no_dropout_maxk_128.cpp | 1 -
 ...6_has_causalmask_has_bias_no_biasgrad_no_dropout_maxk_256.cpp | 1 -
 ...16_has_causalmask_has_bias_no_biasgrad_no_dropout_maxk_32.cpp | 1 -
 ...16_has_causalmask_has_bias_no_biasgrad_no_dropout_maxk_64.cpp | 1 -
 ...6_has_causalmask_no_bias_no_biasgrad_has_dropout_maxk_128.cpp | 1 -
 ...6_has_causalmask_no_bias_no_biasgrad_has_dropout_maxk_256.cpp | 1 -
 ...16_has_causalmask_no_bias_no_biasgrad_has_dropout_maxk_32.cpp | 1 -
 ...16_has_causalmask_no_bias_no_biasgrad_has_dropout_maxk_64.cpp | 1 -
 ...16_has_causalmask_no_bias_no_biasgrad_no_dropout_maxk_128.cpp | 1 -
 ...16_has_causalmask_no_bias_no_biasgrad_no_dropout_maxk_256.cpp | 1 -
 ...f16_has_causalmask_no_bias_no_biasgrad_no_dropout_maxk_32.cpp | 1 -
 ...f16_has_causalmask_no_bias_no_biasgrad_no_dropout_maxk_64.cpp | 1 -
 ..._no_causalmask_has_bias_has_biasgrad_has_dropout_maxk_128.cpp | 1 -
 ..._no_causalmask_has_bias_has_biasgrad_has_dropout_maxk_256.cpp | 1 -
 ...6_no_causalmask_has_bias_has_biasgrad_has_dropout_maxk_32.cpp | 1 -
 ...6_no_causalmask_has_bias_has_biasgrad_has_dropout_maxk_64.cpp | 1 -
 ...6_no_causalmask_has_bias_has_biasgrad_no_dropout_maxk_128.cpp | 1 -
 ...6_no_causalmask_has_bias_has_biasgrad_no_dropout_maxk_256.cpp | 1 -
 ...16_no_causalmask_has_bias_has_biasgrad_no_dropout_maxk_32.cpp | 1 -
 ...16_no_causalmask_has_bias_has_biasgrad_no_dropout_maxk_64.cpp | 1 -
 ...6_no_causalmask_has_bias_no_biasgrad_has_dropout_maxk_128.cpp | 1 -
 ...6_no_causalmask_has_bias_no_biasgrad_has_dropout_maxk_256.cpp | 1 -
 ...16_no_causalmask_has_bias_no_biasgrad_has_dropout_maxk_32.cpp | 1 -
 ...16_no_causalmask_has_bias_no_biasgrad_has_dropout_maxk_64.cpp | 1 -
 ...16_no_causalmask_has_bias_no_biasgrad_no_dropout_maxk_128.cpp | 1 -
 ...16_no_causalmask_has_bias_no_biasgrad_no_dropout_maxk_256.cpp | 1 -
 ...f16_no_causalmask_has_bias_no_biasgrad_no_dropout_maxk_32.cpp | 1 -
 ...f16_no_causalmask_has_bias_no_biasgrad_no_dropout_maxk_64.cpp | 1 -
 ...16_no_causalmask_no_bias_no_biasgrad_has_dropout_maxk_128.cpp | 1 -
 ...16_no_causalmask_no_bias_no_biasgrad_has_dropout_maxk_256.cpp | 1 -
 ...f16_no_causalmask_no_bias_no_biasgrad_has_dropout_maxk_32.cpp | 1 -
 ...f16_no_causalmask_no_bias_no_biasgrad_has_dropout_maxk_64.cpp | 1 -
 ...f16_no_causalmask_no_bias_no_biasgrad_no_dropout_maxk_128.cpp | 1 -
 ...f16_no_causalmask_no_bias_no_biasgrad_no_dropout_maxk_256.cpp | 1 -
 ...bf16_no_causalmask_no_bias_no_biasgrad_no_dropout_maxk_32.cpp | 1 -
 ...bf16_no_causalmask_no_bias_no_biasgrad_no_dropout_maxk_64.cpp | 1 -
 ...has_causalmask_has_bias_has_biasgrad_has_dropout_maxk_128.cpp | 1 -
 ...has_causalmask_has_bias_has_biasgrad_has_dropout_maxk_256.cpp | 1 -
 ..._has_causalmask_has_bias_has_biasgrad_has_dropout_maxk_32.cpp | 1 -
 ..._has_causalmask_has_bias_has_biasgrad_has_dropout_maxk_64.cpp | 1 -
 ..._has_causalmask_has_bias_has_biasgrad_no_dropout_maxk_128.cpp | 1 -
 ..._has_causalmask_has_bias_has_biasgrad_no_dropout_maxk_256.cpp | 1 -
 ...6_has_causalmask_has_bias_has_biasgrad_no_dropout_maxk_32.cpp | 1 -
 ...6_has_causalmask_has_bias_has_biasgrad_no_dropout_maxk_64.cpp | 1 -
 ..._has_causalmask_has_bias_no_biasgrad_has_dropout_maxk_128.cpp | 1 -
 ..._has_causalmask_has_bias_no_biasgrad_has_dropout_maxk_256.cpp | 1 -
 ...6_has_causalmask_has_bias_no_biasgrad_has_dropout_maxk_32.cpp | 1 -
 ...6_has_causalmask_has_bias_no_biasgrad_has_dropout_maxk_64.cpp | 1 -
 ...6_has_causalmask_has_bias_no_biasgrad_no_dropout_maxk_128.cpp | 1 -
 ...6_has_causalmask_has_bias_no_biasgrad_no_dropout_maxk_256.cpp | 1 -
 ...16_has_causalmask_has_bias_no_biasgrad_no_dropout_maxk_32.cpp | 1 -
 ...16_has_causalmask_has_bias_no_biasgrad_no_dropout_maxk_64.cpp | 1 -
 ...6_has_causalmask_no_bias_no_biasgrad_has_dropout_maxk_128.cpp | 1 -
 ...6_has_causalmask_no_bias_no_biasgrad_has_dropout_maxk_256.cpp | 1 -
 ...16_has_causalmask_no_bias_no_biasgrad_has_dropout_maxk_32.cpp | 1 -
 ...16_has_causalmask_no_bias_no_biasgrad_has_dropout_maxk_64.cpp | 1 -
 ...16_has_causalmask_no_bias_no_biasgrad_no_dropout_maxk_128.cpp | 1 -
 ...16_has_causalmask_no_bias_no_biasgrad_no_dropout_maxk_256.cpp | 1 -
 ...p16_has_causalmask_no_bias_no_biasgrad_no_dropout_maxk_32.cpp | 1 -
 ...p16_has_causalmask_no_bias_no_biasgrad_no_dropout_maxk_64.cpp | 1 -
 ..._no_causalmask_has_bias_has_biasgrad_has_dropout_maxk_128.cpp | 1 -
 ..._no_causalmask_has_bias_has_biasgrad_has_dropout_maxk_256.cpp | 1 -
 ...6_no_causalmask_has_bias_has_biasgrad_has_dropout_maxk_32.cpp | 1 -
 ...6_no_causalmask_has_bias_has_biasgrad_has_dropout_maxk_64.cpp | 1 -
 ...6_no_causalmask_has_bias_has_biasgrad_no_dropout_maxk_128.cpp | 1 -
 ...6_no_causalmask_has_bias_has_biasgrad_no_dropout_maxk_256.cpp | 1 -
 ...16_no_causalmask_has_bias_has_biasgrad_no_dropout_maxk_32.cpp | 1 -
 ...16_no_causalmask_has_bias_has_biasgrad_no_dropout_maxk_64.cpp | 1 -
 ...6_no_causalmask_has_bias_no_biasgrad_has_dropout_maxk_128.cpp | 1 -
 ...6_no_causalmask_has_bias_no_biasgrad_has_dropout_maxk_256.cpp | 1 -
 ...16_no_causalmask_has_bias_no_biasgrad_has_dropout_maxk_32.cpp | 1 -
 ...16_no_causalmask_has_bias_no_biasgrad_has_dropout_maxk_64.cpp | 1 -
 ...16_no_causalmask_has_bias_no_biasgrad_no_dropout_maxk_128.cpp | 1 -
 ...16_no_causalmask_has_bias_no_biasgrad_no_dropout_maxk_256.cpp | 1 -
 ...p16_no_causalmask_has_bias_no_biasgrad_no_dropout_maxk_32.cpp | 1 -
 ...p16_no_causalmask_has_bias_no_biasgrad_no_dropout_maxk_64.cpp | 1 -
 ...16_no_causalmask_no_bias_no_biasgrad_has_dropout_maxk_128.cpp | 1 -
 ...16_no_causalmask_no_bias_no_biasgrad_has_dropout_maxk_256.cpp | 1 -
 ...p16_no_causalmask_no_bias_no_biasgrad_has_dropout_maxk_32.cpp | 1 -
 ...p16_no_causalmask_no_bias_no_biasgrad_has_dropout_maxk_64.cpp | 1 -
 ...p16_no_causalmask_no_bias_no_biasgrad_no_dropout_maxk_128.cpp | 1 -
 ...p16_no_causalmask_no_bias_no_biasgrad_no_dropout_maxk_256.cpp | 1 -
 ...fp16_no_causalmask_no_bias_no_biasgrad_no_dropout_maxk_32.cpp | 1 -
 ...fp16_no_causalmask_no_bias_no_biasgrad_no_dropout_maxk_64.cpp | 1 -
 ...forward_bf16_has_causalmask_has_bias_has_dropout_maxk_128.cpp | 1 -
 ...forward_bf16_has_causalmask_has_bias_has_dropout_maxk_256.cpp | 1 -
 ..._forward_bf16_has_causalmask_has_bias_has_dropout_maxk_32.cpp | 1 -
 ..._forward_bf16_has_causalmask_has_bias_has_dropout_maxk_64.cpp | 1 -
 ..._forward_bf16_has_causalmask_has_bias_no_dropout_maxk_128.cpp | 1 -
 ..._forward_bf16_has_causalmask_has_bias_no_dropout_maxk_256.cpp | 1 -
 ...d_forward_bf16_has_causalmask_has_bias_no_dropout_maxk_32.cpp | 1 -
 ...d_forward_bf16_has_causalmask_has_bias_no_dropout_maxk_64.cpp | 1 -
 ..._forward_bf16_has_causalmask_no_bias_has_dropout_maxk_128.cpp | 1 -
 ..._forward_bf16_has_causalmask_no_bias_has_dropout_maxk_256.cpp | 1 -
 ...d_forward_bf16_has_causalmask_no_bias_has_dropout_maxk_32.cpp | 1 -
 ...d_forward_bf16_has_causalmask_no_bias_has_dropout_maxk_64.cpp | 1 -
 ...d_forward_bf16_has_causalmask_no_bias_no_dropout_maxk_128.cpp | 1 -
 ...d_forward_bf16_has_causalmask_no_bias_no_dropout_maxk_256.cpp | 1 -
 ...ed_forward_bf16_has_causalmask_no_bias_no_dropout_maxk_32.cpp | 1 -
 ...ed_forward_bf16_has_causalmask_no_bias_no_dropout_maxk_64.cpp | 1 -
 ..._forward_bf16_no_causalmask_has_bias_has_dropout_maxk_128.cpp | 1 -
 ..._forward_bf16_no_causalmask_has_bias_has_dropout_maxk_256.cpp | 1 -
 ...d_forward_bf16_no_causalmask_has_bias_has_dropout_maxk_32.cpp | 1 -
 ...d_forward_bf16_no_causalmask_has_bias_has_dropout_maxk_64.cpp | 1 -
 ...d_forward_bf16_no_causalmask_has_bias_no_dropout_maxk_128.cpp | 1 -
 ...d_forward_bf16_no_causalmask_has_bias_no_dropout_maxk_256.cpp | 1 -
 ...ed_forward_bf16_no_causalmask_has_bias_no_dropout_maxk_32.cpp | 1 -
 ...ed_forward_bf16_no_causalmask_has_bias_no_dropout_maxk_64.cpp | 1 -
 ...d_forward_bf16_no_causalmask_no_bias_has_dropout_maxk_128.cpp | 1 -
 ...d_forward_bf16_no_causalmask_no_bias_has_dropout_maxk_256.cpp | 1 -
 ...ed_forward_bf16_no_causalmask_no_bias_has_dropout_maxk_32.cpp | 1 -
 ...ed_forward_bf16_no_causalmask_no_bias_has_dropout_maxk_64.cpp | 1 -
 ...ed_forward_bf16_no_causalmask_no_bias_no_dropout_maxk_128.cpp | 1 -
 ...ed_forward_bf16_no_causalmask_no_bias_no_dropout_maxk_256.cpp | 1 -
 ...ped_forward_bf16_no_causalmask_no_bias_no_dropout_maxk_32.cpp | 1 -
 ...ped_forward_bf16_no_causalmask_no_bias_no_dropout_maxk_64.cpp | 1 -
 ...forward_fp16_has_causalmask_has_bias_has_dropout_maxk_128.cpp | 1 -
 ...forward_fp16_has_causalmask_has_bias_has_dropout_maxk_256.cpp | 1 -
 ..._forward_fp16_has_causalmask_has_bias_has_dropout_maxk_32.cpp | 1 -
 ..._forward_fp16_has_causalmask_has_bias_has_dropout_maxk_64.cpp | 1 -
 ..._forward_fp16_has_causalmask_has_bias_no_dropout_maxk_128.cpp | 1 -
 ..._forward_fp16_has_causalmask_has_bias_no_dropout_maxk_256.cpp | 1 -
 ...d_forward_fp16_has_causalmask_has_bias_no_dropout_maxk_32.cpp | 1 -
 ...d_forward_fp16_has_causalmask_has_bias_no_dropout_maxk_64.cpp | 1 -
 ..._forward_fp16_has_causalmask_no_bias_has_dropout_maxk_128.cpp | 1 -
 ..._forward_fp16_has_causalmask_no_bias_has_dropout_maxk_256.cpp | 1 -
 ...d_forward_fp16_has_causalmask_no_bias_has_dropout_maxk_32.cpp | 1 -
 ...d_forward_fp16_has_causalmask_no_bias_has_dropout_maxk_64.cpp | 1 -
 ...d_forward_fp16_has_causalmask_no_bias_no_dropout_maxk_128.cpp | 1 -
 ...d_forward_fp16_has_causalmask_no_bias_no_dropout_maxk_256.cpp | 1 -
 ...ed_forward_fp16_has_causalmask_no_bias_no_dropout_maxk_32.cpp | 1 -
 ...ed_forward_fp16_has_causalmask_no_bias_no_dropout_maxk_64.cpp | 1 -
 ..._forward_fp16_no_causalmask_has_bias_has_dropout_maxk_128.cpp | 1 -
 ..._forward_fp16_no_causalmask_has_bias_has_dropout_maxk_256.cpp | 1 -
 ...d_forward_fp16_no_causalmask_has_bias_has_dropout_maxk_32.cpp | 1 -
 ...d_forward_fp16_no_causalmask_has_bias_has_dropout_maxk_64.cpp | 1 -
 ...d_forward_fp16_no_causalmask_has_bias_no_dropout_maxk_128.cpp | 1 -
 ...d_forward_fp16_no_causalmask_has_bias_no_dropout_maxk_256.cpp | 1 -
 ...ed_forward_fp16_no_causalmask_has_bias_no_dropout_maxk_32.cpp | 1 -
 ...ed_forward_fp16_no_causalmask_has_bias_no_dropout_maxk_64.cpp | 1 -
 ...d_forward_fp16_no_causalmask_no_bias_has_dropout_maxk_128.cpp | 1 -
 ...d_forward_fp16_no_causalmask_no_bias_has_dropout_maxk_256.cpp | 1 -
 ...ed_forward_fp16_no_causalmask_no_bias_has_dropout_maxk_32.cpp | 1 -
 ...ed_forward_fp16_no_causalmask_no_bias_has_dropout_maxk_64.cpp | 1 -
 ...ed_forward_fp16_no_causalmask_no_bias_no_dropout_maxk_128.cpp | 1 -
 ...ed_forward_fp16_no_causalmask_no_bias_no_dropout_maxk_256.cpp | 1 -
 ...ped_forward_fp16_no_causalmask_no_bias_no_dropout_maxk_32.cpp | 1 -
 ...ped_forward_fp16_no_causalmask_no_bias_no_dropout_maxk_64.cpp | 1 -
 ...d_infer_bf16_has_causalmask_has_bias_has_dropout_maxk_128.cpp | 1 -
 ...d_infer_bf16_has_causalmask_has_bias_has_dropout_maxk_256.cpp | 1 -
 ...ed_infer_bf16_has_causalmask_has_bias_has_dropout_maxk_32.cpp | 1 -
 ...ed_infer_bf16_has_causalmask_has_bias_has_dropout_maxk_64.cpp | 1 -
 ...ed_infer_bf16_has_causalmask_has_bias_no_dropout_maxk_128.cpp | 1 -
 ...ed_infer_bf16_has_causalmask_has_bias_no_dropout_maxk_256.cpp | 1 -
 ...ped_infer_bf16_has_causalmask_has_bias_no_dropout_maxk_32.cpp | 1 -
 ...ped_infer_bf16_has_causalmask_has_bias_no_dropout_maxk_64.cpp | 1 -
 ...ed_infer_bf16_has_causalmask_no_bias_has_dropout_maxk_128.cpp | 1 -
 ...ed_infer_bf16_has_causalmask_no_bias_has_dropout_maxk_256.cpp | 1 -
 ...ped_infer_bf16_has_causalmask_no_bias_has_dropout_maxk_32.cpp | 1 -
 ...ped_infer_bf16_has_causalmask_no_bias_has_dropout_maxk_64.cpp | 1 -
 ...ped_infer_bf16_has_causalmask_no_bias_no_dropout_maxk_128.cpp | 1 -
 ...ped_infer_bf16_has_causalmask_no_bias_no_dropout_maxk_256.cpp | 1 -
 ...uped_infer_bf16_has_causalmask_no_bias_no_dropout_maxk_32.cpp | 1 -
 ...uped_infer_bf16_has_causalmask_no_bias_no_dropout_maxk_64.cpp | 1 -
 ...ed_infer_bf16_no_causalmask_has_bias_has_dropout_maxk_128.cpp | 1 -
 ...ed_infer_bf16_no_causalmask_has_bias_has_dropout_maxk_256.cpp | 1 -
 ...ped_infer_bf16_no_causalmask_has_bias_has_dropout_maxk_32.cpp | 1 -
 ...ped_infer_bf16_no_causalmask_has_bias_has_dropout_maxk_64.cpp | 1 -
 ...ped_infer_bf16_no_causalmask_has_bias_no_dropout_maxk_128.cpp | 1 -
 ...ped_infer_bf16_no_causalmask_has_bias_no_dropout_maxk_256.cpp | 1 -
 ...uped_infer_bf16_no_causalmask_has_bias_no_dropout_maxk_32.cpp | 1 -
 ...uped_infer_bf16_no_causalmask_has_bias_no_dropout_maxk_64.cpp | 1 -
 ...ped_infer_bf16_no_causalmask_no_bias_has_dropout_maxk_128.cpp | 1 -
 ...ped_infer_bf16_no_causalmask_no_bias_has_dropout_maxk_256.cpp | 1 -
 ...uped_infer_bf16_no_causalmask_no_bias_has_dropout_maxk_32.cpp | 1 -
 ...uped_infer_bf16_no_causalmask_no_bias_has_dropout_maxk_64.cpp | 1 -
 ...uped_infer_bf16_no_causalmask_no_bias_no_dropout_maxk_128.cpp | 1 -
 ...uped_infer_bf16_no_causalmask_no_bias_no_dropout_maxk_256.cpp | 1 -
 ...ouped_infer_bf16_no_causalmask_no_bias_no_dropout_maxk_32.cpp | 1 -
 ...ouped_infer_bf16_no_causalmask_no_bias_no_dropout_maxk_64.cpp | 1 -
 ...d_infer_fp16_has_causalmask_has_bias_has_dropout_maxk_128.cpp | 1 -
 ...d_infer_fp16_has_causalmask_has_bias_has_dropout_maxk_256.cpp | 1 -
 ...ed_infer_fp16_has_causalmask_has_bias_has_dropout_maxk_32.cpp | 1 -
 ...ed_infer_fp16_has_causalmask_has_bias_has_dropout_maxk_64.cpp | 1 -
 ...ed_infer_fp16_has_causalmask_has_bias_no_dropout_maxk_128.cpp | 1 -
 ...ed_infer_fp16_has_causalmask_has_bias_no_dropout_maxk_256.cpp | 1 -
 ...ped_infer_fp16_has_causalmask_has_bias_no_dropout_maxk_32.cpp | 1 -
 ...ped_infer_fp16_has_causalmask_has_bias_no_dropout_maxk_64.cpp | 1 -
 ...ed_infer_fp16_has_causalmask_no_bias_has_dropout_maxk_128.cpp | 1 -
 ...ed_infer_fp16_has_causalmask_no_bias_has_dropout_maxk_256.cpp | 1 -
 ...ped_infer_fp16_has_causalmask_no_bias_has_dropout_maxk_32.cpp | 1 -
 ...ped_infer_fp16_has_causalmask_no_bias_has_dropout_maxk_64.cpp | 1 -
 ...ped_infer_fp16_has_causalmask_no_bias_no_dropout_maxk_128.cpp | 1 -
 ...ped_infer_fp16_has_causalmask_no_bias_no_dropout_maxk_256.cpp | 1 -
 ...uped_infer_fp16_has_causalmask_no_bias_no_dropout_maxk_32.cpp | 1 -
 ...uped_infer_fp16_has_causalmask_no_bias_no_dropout_maxk_64.cpp | 1 -
 ...ed_infer_fp16_no_causalmask_has_bias_has_dropout_maxk_128.cpp | 1 -
 ...ed_infer_fp16_no_causalmask_has_bias_has_dropout_maxk_256.cpp | 1 -
 ...ped_infer_fp16_no_causalmask_has_bias_has_dropout_maxk_32.cpp | 1 -
 ...ped_infer_fp16_no_causalmask_has_bias_has_dropout_maxk_64.cpp | 1 -
 ...ped_infer_fp16_no_causalmask_has_bias_no_dropout_maxk_128.cpp | 1 -
 ...ped_infer_fp16_no_causalmask_has_bias_no_dropout_maxk_256.cpp | 1 -
 ...uped_infer_fp16_no_causalmask_has_bias_no_dropout_maxk_32.cpp | 1 -
 ...uped_infer_fp16_no_causalmask_has_bias_no_dropout_maxk_64.cpp | 1 -
 ...ped_infer_fp16_no_causalmask_no_bias_has_dropout_maxk_128.cpp | 1 -
 ...ped_infer_fp16_no_causalmask_no_bias_has_dropout_maxk_256.cpp | 1 -
 ...uped_infer_fp16_no_causalmask_no_bias_has_dropout_maxk_32.cpp | 1 -
 ...uped_infer_fp16_no_causalmask_no_bias_has_dropout_maxk_64.cpp | 1 -
 ...uped_infer_fp16_no_causalmask_no_bias_no_dropout_maxk_128.cpp | 1 -
 ...uped_infer_fp16_no_causalmask_no_bias_no_dropout_maxk_256.cpp | 1 -
 ...ouped_infer_fp16_no_causalmask_no_bias_no_dropout_maxk_32.cpp | 1 -
 ...ouped_infer_fp16_no_causalmask_no_bias_no_dropout_maxk_64.cpp | 1 -
 448 files changed, 448 deletions(-)

diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_has_causalmask_has_bias_has_biasgrad_has_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_has_causalmask_has_bias_has_biasgrad_has_dropout_maxk_128.cpp
index 39232e65d..b129b0719 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_has_causalmask_has_bias_has_biasgrad_has_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_has_causalmask_has_bias_has_biasgrad_has_dropout_maxk_128.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_batched_backward.h"
 
-
 template void run_batched_backward_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_has_causalmask_has_bias_has_biasgrad_has_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_has_causalmask_has_bias_has_biasgrad_has_dropout_maxk_256.cpp
index 76157bf99..58aaac801 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_has_causalmask_has_bias_has_biasgrad_has_dropout_maxk_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_has_causalmask_has_bias_has_biasgrad_has_dropout_maxk_256.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_batched_backward.h"
 
-
 template void run_batched_backward_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_has_causalmask_has_bias_has_biasgrad_has_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_has_causalmask_has_bias_has_biasgrad_has_dropout_maxk_32.cpp
index 4b774cf68..73360d7dc 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_has_causalmask_has_bias_has_biasgrad_has_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_has_causalmask_has_bias_has_biasgrad_has_dropout_maxk_32.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_batched_backward.h"
 
-
 template void run_batched_backward_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_has_causalmask_has_bias_has_biasgrad_has_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_has_causalmask_has_bias_has_biasgrad_has_dropout_maxk_64.cpp
index c8ba202be..7f99b4819 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_has_causalmask_has_bias_has_biasgrad_has_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_has_causalmask_has_bias_has_biasgrad_has_dropout_maxk_64.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_batched_backward.h"
 
-
 template void run_batched_backward_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_has_causalmask_has_bias_has_biasgrad_no_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_has_causalmask_has_bias_has_biasgrad_no_dropout_maxk_128.cpp
index 6742fb592..b831c919d 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_has_causalmask_has_bias_has_biasgrad_no_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_has_causalmask_has_bias_has_biasgrad_no_dropout_maxk_128.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_batched_backward.h"
 
-
 template void run_batched_backward_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_has_causalmask_has_bias_has_biasgrad_no_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_has_causalmask_has_bias_has_biasgrad_no_dropout_maxk_256.cpp
index b0615cb13..1829f50f2 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_has_causalmask_has_bias_has_biasgrad_no_dropout_maxk_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_has_causalmask_has_bias_has_biasgrad_no_dropout_maxk_256.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_batched_backward.h"
 
-
 template void run_batched_backward_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_has_causalmask_has_bias_has_biasgrad_no_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_has_causalmask_has_bias_has_biasgrad_no_dropout_maxk_32.cpp
index dc1dfba3e..74501e007 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_has_causalmask_has_bias_has_biasgrad_no_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_has_causalmask_has_bias_has_biasgrad_no_dropout_maxk_32.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_batched_backward.h"
 
-
 template void run_batched_backward_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_has_causalmask_has_bias_has_biasgrad_no_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_has_causalmask_has_bias_has_biasgrad_no_dropout_maxk_64.cpp
index 85560dae3..62a1c9d0b 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_has_causalmask_has_bias_has_biasgrad_no_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_has_causalmask_has_bias_has_biasgrad_no_dropout_maxk_64.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_batched_backward.h"
 
-
 template void run_batched_backward_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_has_causalmask_has_bias_no_biasgrad_has_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_has_causalmask_has_bias_no_biasgrad_has_dropout_maxk_128.cpp
index 45ee4fd6d..b5b258196 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_has_causalmask_has_bias_no_biasgrad_has_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_has_causalmask_has_bias_no_biasgrad_has_dropout_maxk_128.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_batched_backward.h"
 
-
 template void run_batched_backward_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_has_causalmask_has_bias_no_biasgrad_has_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_has_causalmask_has_bias_no_biasgrad_has_dropout_maxk_256.cpp
index cc4febe21..070e8b2c0 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_has_causalmask_has_bias_no_biasgrad_has_dropout_maxk_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_has_causalmask_has_bias_no_biasgrad_has_dropout_maxk_256.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_batched_backward.h"
 
-
 template void run_batched_backward_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_has_causalmask_has_bias_no_biasgrad_has_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_has_causalmask_has_bias_no_biasgrad_has_dropout_maxk_32.cpp
index 77f5824dd..504c22609 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_has_causalmask_has_bias_no_biasgrad_has_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_has_causalmask_has_bias_no_biasgrad_has_dropout_maxk_32.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_batched_backward.h"
 
-
 template void run_batched_backward_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_has_causalmask_has_bias_no_biasgrad_has_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_has_causalmask_has_bias_no_biasgrad_has_dropout_maxk_64.cpp
index 0943e233c..573d9bf4b 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_has_causalmask_has_bias_no_biasgrad_has_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_has_causalmask_has_bias_no_biasgrad_has_dropout_maxk_64.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_batched_backward.h"
 
-
 template void run_batched_backward_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_has_causalmask_has_bias_no_biasgrad_no_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_has_causalmask_has_bias_no_biasgrad_no_dropout_maxk_128.cpp
index 59206114f..67bf8995c 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_has_causalmask_has_bias_no_biasgrad_no_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_has_causalmask_has_bias_no_biasgrad_no_dropout_maxk_128.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_batched_backward.h"
 
-
 template void run_batched_backward_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_has_causalmask_has_bias_no_biasgrad_no_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_has_causalmask_has_bias_no_biasgrad_no_dropout_maxk_256.cpp
index 1170edbe5..4bc3b5a83 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_has_causalmask_has_bias_no_biasgrad_no_dropout_maxk_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_has_causalmask_has_bias_no_biasgrad_no_dropout_maxk_256.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_batched_backward.h"
 
-
 template void run_batched_backward_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_has_causalmask_has_bias_no_biasgrad_no_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_has_causalmask_has_bias_no_biasgrad_no_dropout_maxk_32.cpp
index fa0ad59b7..331b79140 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_has_causalmask_has_bias_no_biasgrad_no_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_has_causalmask_has_bias_no_biasgrad_no_dropout_maxk_32.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_batched_backward.h"
 
-
 template void run_batched_backward_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_has_causalmask_has_bias_no_biasgrad_no_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_has_causalmask_has_bias_no_biasgrad_no_dropout_maxk_64.cpp
index 4a14da080..1c3a956d4 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_has_causalmask_has_bias_no_biasgrad_no_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_has_causalmask_has_bias_no_biasgrad_no_dropout_maxk_64.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_batched_backward.h"
 
-
 template void run_batched_backward_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_has_causalmask_no_bias_no_biasgrad_has_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_has_causalmask_no_bias_no_biasgrad_has_dropout_maxk_128.cpp
index 5c5af08af..0d902e120 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_has_causalmask_no_bias_no_biasgrad_has_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_has_causalmask_no_bias_no_biasgrad_has_dropout_maxk_128.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_batched_backward.h"
 
-
 template void run_batched_backward_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_has_causalmask_no_bias_no_biasgrad_has_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_has_causalmask_no_bias_no_biasgrad_has_dropout_maxk_256.cpp
index 1edf2b647..13dfd5a09 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_has_causalmask_no_bias_no_biasgrad_has_dropout_maxk_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_has_causalmask_no_bias_no_biasgrad_has_dropout_maxk_256.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_batched_backward.h"
 
-
 template void run_batched_backward_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_has_causalmask_no_bias_no_biasgrad_has_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_has_causalmask_no_bias_no_biasgrad_has_dropout_maxk_32.cpp
index c13203a0c..e6b8fd85f 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_has_causalmask_no_bias_no_biasgrad_has_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_has_causalmask_no_bias_no_biasgrad_has_dropout_maxk_32.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_batched_backward.h"
 
-
 template void run_batched_backward_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_has_causalmask_no_bias_no_biasgrad_has_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_has_causalmask_no_bias_no_biasgrad_has_dropout_maxk_64.cpp
index edf535c0b..4c2c0672e 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_has_causalmask_no_bias_no_biasgrad_has_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_has_causalmask_no_bias_no_biasgrad_has_dropout_maxk_64.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_batched_backward.h"
 
-
 template void run_batched_backward_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_has_causalmask_no_bias_no_biasgrad_no_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_has_causalmask_no_bias_no_biasgrad_no_dropout_maxk_128.cpp
index b3a8f1a3b..68bac14f2 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_has_causalmask_no_bias_no_biasgrad_no_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_has_causalmask_no_bias_no_biasgrad_no_dropout_maxk_128.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_batched_backward.h"
 
-
 template void run_batched_backward_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_has_causalmask_no_bias_no_biasgrad_no_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_has_causalmask_no_bias_no_biasgrad_no_dropout_maxk_256.cpp
index d0475fb79..2a72588f1 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_has_causalmask_no_bias_no_biasgrad_no_dropout_maxk_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_has_causalmask_no_bias_no_biasgrad_no_dropout_maxk_256.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_batched_backward.h"
 
-
 template void run_batched_backward_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_has_causalmask_no_bias_no_biasgrad_no_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_has_causalmask_no_bias_no_biasgrad_no_dropout_maxk_32.cpp
index 6d0f48867..ea7baeea2 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_has_causalmask_no_bias_no_biasgrad_no_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_has_causalmask_no_bias_no_biasgrad_no_dropout_maxk_32.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_batched_backward.h"
 
-
 template void run_batched_backward_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_has_causalmask_no_bias_no_biasgrad_no_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_has_causalmask_no_bias_no_biasgrad_no_dropout_maxk_64.cpp
index 4d60a8589..202882678 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_has_causalmask_no_bias_no_biasgrad_no_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_has_causalmask_no_bias_no_biasgrad_no_dropout_maxk_64.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_batched_backward.h"
 
-
 template void run_batched_backward_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_no_causalmask_has_bias_has_biasgrad_has_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_no_causalmask_has_bias_has_biasgrad_has_dropout_maxk_128.cpp
index 0100f090f..8689b5389 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_no_causalmask_has_bias_has_biasgrad_has_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_no_causalmask_has_bias_has_biasgrad_has_dropout_maxk_128.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_batched_backward.h"
 
-
 template void run_batched_backward_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_no_causalmask_has_bias_has_biasgrad_has_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_no_causalmask_has_bias_has_biasgrad_has_dropout_maxk_256.cpp
index 1f3bb92cb..fd52bcc4d 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_no_causalmask_has_bias_has_biasgrad_has_dropout_maxk_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_no_causalmask_has_bias_has_biasgrad_has_dropout_maxk_256.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_batched_backward.h"
 
-
 template void run_batched_backward_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_no_causalmask_has_bias_has_biasgrad_has_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_no_causalmask_has_bias_has_biasgrad_has_dropout_maxk_32.cpp
index 04db3afad..2a5977be3 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_no_causalmask_has_bias_has_biasgrad_has_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_no_causalmask_has_bias_has_biasgrad_has_dropout_maxk_32.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_batched_backward.h"
 
-
 template void run_batched_backward_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_no_causalmask_has_bias_has_biasgrad_has_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_no_causalmask_has_bias_has_biasgrad_has_dropout_maxk_64.cpp
index e18a4bd4a..490659b74 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_no_causalmask_has_bias_has_biasgrad_has_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_no_causalmask_has_bias_has_biasgrad_has_dropout_maxk_64.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_batched_backward.h"
 
-
 template void run_batched_backward_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_no_causalmask_has_bias_has_biasgrad_no_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_no_causalmask_has_bias_has_biasgrad_no_dropout_maxk_128.cpp
index 5df78e1ec..f4f3ac89c 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_no_causalmask_has_bias_has_biasgrad_no_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_no_causalmask_has_bias_has_biasgrad_no_dropout_maxk_128.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_batched_backward.h"
 
-
 template void run_batched_backward_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_no_causalmask_has_bias_has_biasgrad_no_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_no_causalmask_has_bias_has_biasgrad_no_dropout_maxk_256.cpp
index 323d799b5..4067c8e5a 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_no_causalmask_has_bias_has_biasgrad_no_dropout_maxk_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_no_causalmask_has_bias_has_biasgrad_no_dropout_maxk_256.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_batched_backward.h"
 
-
 template void run_batched_backward_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_no_causalmask_has_bias_has_biasgrad_no_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_no_causalmask_has_bias_has_biasgrad_no_dropout_maxk_32.cpp
index 82b8af2ac..c3dd3d5fe 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_no_causalmask_has_bias_has_biasgrad_no_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_no_causalmask_has_bias_has_biasgrad_no_dropout_maxk_32.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_batched_backward.h"
 
-
 template void run_batched_backward_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_no_causalmask_has_bias_has_biasgrad_no_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_no_causalmask_has_bias_has_biasgrad_no_dropout_maxk_64.cpp
index 573826492..d8fd52d7a 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_no_causalmask_has_bias_has_biasgrad_no_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_no_causalmask_has_bias_has_biasgrad_no_dropout_maxk_64.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_batched_backward.h"
 
-
 template void run_batched_backward_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_no_causalmask_has_bias_no_biasgrad_has_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_no_causalmask_has_bias_no_biasgrad_has_dropout_maxk_128.cpp
index 3ba12bc99..f9e140aae 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_no_causalmask_has_bias_no_biasgrad_has_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_no_causalmask_has_bias_no_biasgrad_has_dropout_maxk_128.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_batched_backward.h"
 
-
 template void run_batched_backward_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_no_causalmask_has_bias_no_biasgrad_has_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_no_causalmask_has_bias_no_biasgrad_has_dropout_maxk_256.cpp
index 5d0025622..71b1586ac 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_no_causalmask_has_bias_no_biasgrad_has_dropout_maxk_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_no_causalmask_has_bias_no_biasgrad_has_dropout_maxk_256.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_batched_backward.h"
 
-
 template void run_batched_backward_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_no_causalmask_has_bias_no_biasgrad_has_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_no_causalmask_has_bias_no_biasgrad_has_dropout_maxk_32.cpp
index 17ed22594..5688539e8 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_no_causalmask_has_bias_no_biasgrad_has_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_no_causalmask_has_bias_no_biasgrad_has_dropout_maxk_32.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_batched_backward.h"
 
-
 template void run_batched_backward_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_no_causalmask_has_bias_no_biasgrad_has_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_no_causalmask_has_bias_no_biasgrad_has_dropout_maxk_64.cpp
index fd4ba2dfd..a820ad76c 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_no_causalmask_has_bias_no_biasgrad_has_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_no_causalmask_has_bias_no_biasgrad_has_dropout_maxk_64.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_batched_backward.h"
 
-
 template void run_batched_backward_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_no_causalmask_has_bias_no_biasgrad_no_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_no_causalmask_has_bias_no_biasgrad_no_dropout_maxk_128.cpp
index 4cb221876..fbd6b8b48 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_no_causalmask_has_bias_no_biasgrad_no_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_no_causalmask_has_bias_no_biasgrad_no_dropout_maxk_128.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_batched_backward.h"
 
-
 template void run_batched_backward_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_no_causalmask_has_bias_no_biasgrad_no_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_no_causalmask_has_bias_no_biasgrad_no_dropout_maxk_256.cpp
index 00091e827..b64b16b8d 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_no_causalmask_has_bias_no_biasgrad_no_dropout_maxk_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_no_causalmask_has_bias_no_biasgrad_no_dropout_maxk_256.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_batched_backward.h"
 
-
 template void run_batched_backward_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_no_causalmask_has_bias_no_biasgrad_no_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_no_causalmask_has_bias_no_biasgrad_no_dropout_maxk_32.cpp
index 24eb9cf98..db6ee679c 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_no_causalmask_has_bias_no_biasgrad_no_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_no_causalmask_has_bias_no_biasgrad_no_dropout_maxk_32.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_batched_backward.h"
 
-
 template void run_batched_backward_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_no_causalmask_has_bias_no_biasgrad_no_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_no_causalmask_has_bias_no_biasgrad_no_dropout_maxk_64.cpp
index 77008bcf5..e79dd63df 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_no_causalmask_has_bias_no_biasgrad_no_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_no_causalmask_has_bias_no_biasgrad_no_dropout_maxk_64.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_batched_backward.h"
 
-
 template void run_batched_backward_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_no_causalmask_no_bias_no_biasgrad_has_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_no_causalmask_no_bias_no_biasgrad_has_dropout_maxk_128.cpp
index 16c697a85..35a968405 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_no_causalmask_no_bias_no_biasgrad_has_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_no_causalmask_no_bias_no_biasgrad_has_dropout_maxk_128.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_batched_backward.h"
 
-
 template void run_batched_backward_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_no_causalmask_no_bias_no_biasgrad_has_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_no_causalmask_no_bias_no_biasgrad_has_dropout_maxk_256.cpp
index 9ee060f32..14d935611 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_no_causalmask_no_bias_no_biasgrad_has_dropout_maxk_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_no_causalmask_no_bias_no_biasgrad_has_dropout_maxk_256.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_batched_backward.h"
 
-
 template void run_batched_backward_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_no_causalmask_no_bias_no_biasgrad_has_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_no_causalmask_no_bias_no_biasgrad_has_dropout_maxk_32.cpp
index 16628b31b..783c741b6 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_no_causalmask_no_bias_no_biasgrad_has_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_no_causalmask_no_bias_no_biasgrad_has_dropout_maxk_32.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_batched_backward.h"
 
-
 template void run_batched_backward_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_no_causalmask_no_bias_no_biasgrad_has_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_no_causalmask_no_bias_no_biasgrad_has_dropout_maxk_64.cpp
index 0c47e21db..7ddd65d11 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_no_causalmask_no_bias_no_biasgrad_has_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_no_causalmask_no_bias_no_biasgrad_has_dropout_maxk_64.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_batched_backward.h"
 
-
 template void run_batched_backward_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_no_causalmask_no_bias_no_biasgrad_no_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_no_causalmask_no_bias_no_biasgrad_no_dropout_maxk_128.cpp
index 65b0a11e8..69e698344 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_no_causalmask_no_bias_no_biasgrad_no_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_no_causalmask_no_bias_no_biasgrad_no_dropout_maxk_128.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_batched_backward.h"
 
-
 template void run_batched_backward_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_no_causalmask_no_bias_no_biasgrad_no_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_no_causalmask_no_bias_no_biasgrad_no_dropout_maxk_256.cpp
index 7e1d1835d..5fa39c880 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_no_causalmask_no_bias_no_biasgrad_no_dropout_maxk_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_no_causalmask_no_bias_no_biasgrad_no_dropout_maxk_256.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_batched_backward.h"
 
-
 template void run_batched_backward_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_no_causalmask_no_bias_no_biasgrad_no_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_no_causalmask_no_bias_no_biasgrad_no_dropout_maxk_32.cpp
index 52c1f82bf..fed439c70 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_no_causalmask_no_bias_no_biasgrad_no_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_no_causalmask_no_bias_no_biasgrad_no_dropout_maxk_32.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_batched_backward.h"
 
-
 template void run_batched_backward_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_no_causalmask_no_bias_no_biasgrad_no_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_no_causalmask_no_bias_no_biasgrad_no_dropout_maxk_64.cpp
index 3ae27d64c..6a955e982 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_no_causalmask_no_bias_no_biasgrad_no_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_bf16_no_causalmask_no_bias_no_biasgrad_no_dropout_maxk_64.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_batched_backward.h"
 
-
 template void run_batched_backward_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_has_bias_has_biasgrad_has_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_has_bias_has_biasgrad_has_dropout_maxk_128.cpp
index 6bda7dca0..b4df2bf40 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_has_bias_has_biasgrad_has_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_has_bias_has_biasgrad_has_dropout_maxk_128.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_batched_backward.h"
 
-
 template void run_batched_backward_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_has_bias_has_biasgrad_has_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_has_bias_has_biasgrad_has_dropout_maxk_256.cpp
index 62bb4da51..545a77955 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_has_bias_has_biasgrad_has_dropout_maxk_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_has_bias_has_biasgrad_has_dropout_maxk_256.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_batched_backward.h"
 
-
 template void run_batched_backward_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_has_bias_has_biasgrad_has_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_has_bias_has_biasgrad_has_dropout_maxk_32.cpp
index 2c6f31641..1da7bae3a 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_has_bias_has_biasgrad_has_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_has_bias_has_biasgrad_has_dropout_maxk_32.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_batched_backward.h"
 
-
 template void run_batched_backward_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_has_bias_has_biasgrad_has_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_has_bias_has_biasgrad_has_dropout_maxk_64.cpp
index 85e8c719f..4c3cf7ff6 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_has_bias_has_biasgrad_has_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_has_bias_has_biasgrad_has_dropout_maxk_64.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_batched_backward.h"
 
-
 template void run_batched_backward_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_has_bias_has_biasgrad_no_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_has_bias_has_biasgrad_no_dropout_maxk_128.cpp
index dbfc26d1e..1cbafbf70 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_has_bias_has_biasgrad_no_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_has_bias_has_biasgrad_no_dropout_maxk_128.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_batched_backward.h"
 
-
 template void run_batched_backward_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_has_bias_has_biasgrad_no_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_has_bias_has_biasgrad_no_dropout_maxk_256.cpp
index c18a7439a..f1e9009d1 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_has_bias_has_biasgrad_no_dropout_maxk_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_has_bias_has_biasgrad_no_dropout_maxk_256.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_batched_backward.h"
 
-
 template void run_batched_backward_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_has_bias_has_biasgrad_no_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_has_bias_has_biasgrad_no_dropout_maxk_32.cpp
index b989377a5..951196506 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_has_bias_has_biasgrad_no_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_has_bias_has_biasgrad_no_dropout_maxk_32.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_batched_backward.h"
 
-
 template void run_batched_backward_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_has_bias_has_biasgrad_no_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_has_bias_has_biasgrad_no_dropout_maxk_64.cpp
index 0c0fe40d9..75fef6ab4 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_has_bias_has_biasgrad_no_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_has_bias_has_biasgrad_no_dropout_maxk_64.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_batched_backward.h"
 
-
 template void run_batched_backward_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_has_bias_no_biasgrad_has_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_has_bias_no_biasgrad_has_dropout_maxk_128.cpp
index 537e9e0fa..836e9428e 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_has_bias_no_biasgrad_has_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_has_bias_no_biasgrad_has_dropout_maxk_128.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_batched_backward.h"
 
-
 template void run_batched_backward_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_has_bias_no_biasgrad_has_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_has_bias_no_biasgrad_has_dropout_maxk_256.cpp
index dece0aa4e..cf89aa7bd 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_has_bias_no_biasgrad_has_dropout_maxk_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_has_bias_no_biasgrad_has_dropout_maxk_256.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_batched_backward.h"
 
-
 template void run_batched_backward_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_has_bias_no_biasgrad_has_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_has_bias_no_biasgrad_has_dropout_maxk_32.cpp
index 79f162f27..bbc4eea82 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_has_bias_no_biasgrad_has_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_has_bias_no_biasgrad_has_dropout_maxk_32.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_batched_backward.h"
 
-
 template void run_batched_backward_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_has_bias_no_biasgrad_has_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_has_bias_no_biasgrad_has_dropout_maxk_64.cpp
index d9c163f84..2d804bd5d 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_has_bias_no_biasgrad_has_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_has_bias_no_biasgrad_has_dropout_maxk_64.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_batched_backward.h"
 
-
 template void run_batched_backward_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_has_bias_no_biasgrad_no_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_has_bias_no_biasgrad_no_dropout_maxk_128.cpp
index 37f622753..3b85cea79 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_has_bias_no_biasgrad_no_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_has_bias_no_biasgrad_no_dropout_maxk_128.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_batched_backward.h"
 
-
 template void run_batched_backward_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_has_bias_no_biasgrad_no_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_has_bias_no_biasgrad_no_dropout_maxk_256.cpp
index 1e312cf7f..f261d64ba 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_has_bias_no_biasgrad_no_dropout_maxk_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_has_bias_no_biasgrad_no_dropout_maxk_256.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_batched_backward.h"
 
-
 template void run_batched_backward_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_has_bias_no_biasgrad_no_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_has_bias_no_biasgrad_no_dropout_maxk_32.cpp
index 03cb14d16..635f9f1a2 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_has_bias_no_biasgrad_no_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_has_bias_no_biasgrad_no_dropout_maxk_32.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_batched_backward.h"
 
-
 template void run_batched_backward_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_has_bias_no_biasgrad_no_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_has_bias_no_biasgrad_no_dropout_maxk_64.cpp
index fdd5cc6c5..919a01fb9 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_has_bias_no_biasgrad_no_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_has_bias_no_biasgrad_no_dropout_maxk_64.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_batched_backward.h"
 
-
 template void run_batched_backward_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_no_bias_no_biasgrad_has_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_no_bias_no_biasgrad_has_dropout_maxk_128.cpp
index ffa0b948e..bdf72b91a 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_no_bias_no_biasgrad_has_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_no_bias_no_biasgrad_has_dropout_maxk_128.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_batched_backward.h"
 
-
 template void run_batched_backward_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_no_bias_no_biasgrad_has_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_no_bias_no_biasgrad_has_dropout_maxk_256.cpp
index e77bb21e9..2588185d9 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_no_bias_no_biasgrad_has_dropout_maxk_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_no_bias_no_biasgrad_has_dropout_maxk_256.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_batched_backward.h"
 
-
 template void run_batched_backward_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_no_bias_no_biasgrad_has_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_no_bias_no_biasgrad_has_dropout_maxk_32.cpp
index c0f9ee654..087b8e1c8 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_no_bias_no_biasgrad_has_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_no_bias_no_biasgrad_has_dropout_maxk_32.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_batched_backward.h"
 
-
 template void run_batched_backward_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_no_bias_no_biasgrad_has_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_no_bias_no_biasgrad_has_dropout_maxk_64.cpp
index 082436890..d01cb1e37 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_no_bias_no_biasgrad_has_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_no_bias_no_biasgrad_has_dropout_maxk_64.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_batched_backward.h"
 
-
 template void run_batched_backward_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_no_bias_no_biasgrad_no_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_no_bias_no_biasgrad_no_dropout_maxk_128.cpp
index 478e39315..99a2823b4 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_no_bias_no_biasgrad_no_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_no_bias_no_biasgrad_no_dropout_maxk_128.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_batched_backward.h"
 
-
 template void run_batched_backward_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_no_bias_no_biasgrad_no_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_no_bias_no_biasgrad_no_dropout_maxk_256.cpp
index 3c6658897..acceefffb 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_no_bias_no_biasgrad_no_dropout_maxk_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_no_bias_no_biasgrad_no_dropout_maxk_256.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_batched_backward.h"
 
-
 template void run_batched_backward_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_no_bias_no_biasgrad_no_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_no_bias_no_biasgrad_no_dropout_maxk_32.cpp
index 58cb8d427..ac3a2a5fd 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_no_bias_no_biasgrad_no_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_no_bias_no_biasgrad_no_dropout_maxk_32.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_batched_backward.h"
 
-
 template void run_batched_backward_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_no_bias_no_biasgrad_no_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_no_bias_no_biasgrad_no_dropout_maxk_64.cpp
index 04a808a3a..5a281913f 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_no_bias_no_biasgrad_no_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_has_causalmask_no_bias_no_biasgrad_no_dropout_maxk_64.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_batched_backward.h"
 
-
 template void run_batched_backward_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_has_bias_has_biasgrad_has_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_has_bias_has_biasgrad_has_dropout_maxk_128.cpp
index 6291955c3..68ffee4bf 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_has_bias_has_biasgrad_has_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_has_bias_has_biasgrad_has_dropout_maxk_128.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_batched_backward.h"
 
-
 template void run_batched_backward_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_has_bias_has_biasgrad_has_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_has_bias_has_biasgrad_has_dropout_maxk_256.cpp
index 3a445cab9..4d84693d6 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_has_bias_has_biasgrad_has_dropout_maxk_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_has_bias_has_biasgrad_has_dropout_maxk_256.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_batched_backward.h"
 
-
 template void run_batched_backward_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_has_bias_has_biasgrad_has_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_has_bias_has_biasgrad_has_dropout_maxk_32.cpp
index 05a23fe81..8b498600a 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_has_bias_has_biasgrad_has_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_has_bias_has_biasgrad_has_dropout_maxk_32.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_batched_backward.h"
 
-
 template void run_batched_backward_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_has_bias_has_biasgrad_has_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_has_bias_has_biasgrad_has_dropout_maxk_64.cpp
index eed061f45..7ddd6efd8 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_has_bias_has_biasgrad_has_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_has_bias_has_biasgrad_has_dropout_maxk_64.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_batched_backward.h"
 
-
 template void run_batched_backward_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_has_bias_has_biasgrad_no_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_has_bias_has_biasgrad_no_dropout_maxk_128.cpp
index 04da2d7f9..d1bdf1fa5 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_has_bias_has_biasgrad_no_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_has_bias_has_biasgrad_no_dropout_maxk_128.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_batched_backward.h"
 
-
 template void run_batched_backward_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_has_bias_has_biasgrad_no_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_has_bias_has_biasgrad_no_dropout_maxk_256.cpp
index 0971c2582..b8c8eb5b3 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_has_bias_has_biasgrad_no_dropout_maxk_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_has_bias_has_biasgrad_no_dropout_maxk_256.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_batched_backward.h"
 
-
 template void run_batched_backward_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_has_bias_has_biasgrad_no_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_has_bias_has_biasgrad_no_dropout_maxk_32.cpp
index 60ef436f6..60553e405 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_has_bias_has_biasgrad_no_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_has_bias_has_biasgrad_no_dropout_maxk_32.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_batched_backward.h"
 
-
 template void run_batched_backward_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_has_bias_has_biasgrad_no_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_has_bias_has_biasgrad_no_dropout_maxk_64.cpp
index 568c619f9..dafd1d5d2 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_has_bias_has_biasgrad_no_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_has_bias_has_biasgrad_no_dropout_maxk_64.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_batched_backward.h"
 
-
 template void run_batched_backward_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_has_bias_no_biasgrad_has_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_has_bias_no_biasgrad_has_dropout_maxk_128.cpp
index 19e27101a..dd6ef7d00 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_has_bias_no_biasgrad_has_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_has_bias_no_biasgrad_has_dropout_maxk_128.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_batched_backward.h"
 
-
 template void run_batched_backward_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_has_bias_no_biasgrad_has_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_has_bias_no_biasgrad_has_dropout_maxk_256.cpp
index c13031bcd..daee39215 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_has_bias_no_biasgrad_has_dropout_maxk_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_has_bias_no_biasgrad_has_dropout_maxk_256.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_batched_backward.h"
 
-
 template void run_batched_backward_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_has_bias_no_biasgrad_has_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_has_bias_no_biasgrad_has_dropout_maxk_32.cpp
index c9716e3a1..dc1971262 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_has_bias_no_biasgrad_has_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_has_bias_no_biasgrad_has_dropout_maxk_32.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_batched_backward.h"
 
-
 template void run_batched_backward_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_has_bias_no_biasgrad_has_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_has_bias_no_biasgrad_has_dropout_maxk_64.cpp
index fb4b25492..e9c8d75e3 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_has_bias_no_biasgrad_has_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_has_bias_no_biasgrad_has_dropout_maxk_64.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_batched_backward.h"
 
-
 template void run_batched_backward_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_has_bias_no_biasgrad_no_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_has_bias_no_biasgrad_no_dropout_maxk_128.cpp
index 045baff41..bc25646dc 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_has_bias_no_biasgrad_no_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_has_bias_no_biasgrad_no_dropout_maxk_128.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_batched_backward.h"
 
-
 template void run_batched_backward_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_has_bias_no_biasgrad_no_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_has_bias_no_biasgrad_no_dropout_maxk_256.cpp
index 5a9b9b630..a324ea3d1 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_has_bias_no_biasgrad_no_dropout_maxk_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_has_bias_no_biasgrad_no_dropout_maxk_256.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_batched_backward.h"
 
-
 template void run_batched_backward_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_has_bias_no_biasgrad_no_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_has_bias_no_biasgrad_no_dropout_maxk_32.cpp
index 6e7b5e211..8ffe3a4c3 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_has_bias_no_biasgrad_no_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_has_bias_no_biasgrad_no_dropout_maxk_32.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_batched_backward.h"
 
-
 template void run_batched_backward_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_has_bias_no_biasgrad_no_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_has_bias_no_biasgrad_no_dropout_maxk_64.cpp
index 68ccee8e7..0d3ab043e 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_has_bias_no_biasgrad_no_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_has_bias_no_biasgrad_no_dropout_maxk_64.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_batched_backward.h"
 
-
 template void run_batched_backward_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_no_bias_no_biasgrad_has_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_no_bias_no_biasgrad_has_dropout_maxk_128.cpp
index d3dbae9d5..64c0c14fb 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_no_bias_no_biasgrad_has_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_no_bias_no_biasgrad_has_dropout_maxk_128.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_batched_backward.h"
 
-
 template void run_batched_backward_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_no_bias_no_biasgrad_has_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_no_bias_no_biasgrad_has_dropout_maxk_256.cpp
index 8762b721b..2d0e3efaa 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_no_bias_no_biasgrad_has_dropout_maxk_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_no_bias_no_biasgrad_has_dropout_maxk_256.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_batched_backward.h"
 
-
 template void run_batched_backward_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_no_bias_no_biasgrad_has_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_no_bias_no_biasgrad_has_dropout_maxk_32.cpp
index b85e7c5a5..003201abf 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_no_bias_no_biasgrad_has_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_no_bias_no_biasgrad_has_dropout_maxk_32.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_batched_backward.h"
 
-
 template void run_batched_backward_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_no_bias_no_biasgrad_has_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_no_bias_no_biasgrad_has_dropout_maxk_64.cpp
index d691bcaec..a6570b6bf 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_no_bias_no_biasgrad_has_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_no_bias_no_biasgrad_has_dropout_maxk_64.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_batched_backward.h"
 
-
 template void run_batched_backward_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_no_bias_no_biasgrad_no_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_no_bias_no_biasgrad_no_dropout_maxk_128.cpp
index 408729a17..a23a7087d 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_no_bias_no_biasgrad_no_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_no_bias_no_biasgrad_no_dropout_maxk_128.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_batched_backward.h"
 
-
 template void run_batched_backward_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_no_bias_no_biasgrad_no_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_no_bias_no_biasgrad_no_dropout_maxk_256.cpp
index 50d564927..274405d53 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_no_bias_no_biasgrad_no_dropout_maxk_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_no_bias_no_biasgrad_no_dropout_maxk_256.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_batched_backward.h"
 
-
 template void run_batched_backward_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_no_bias_no_biasgrad_no_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_no_bias_no_biasgrad_no_dropout_maxk_32.cpp
index 5855ede14..46a8e8a4d 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_no_bias_no_biasgrad_no_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_no_bias_no_biasgrad_no_dropout_maxk_32.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_batched_backward.h"
 
-
 template void run_batched_backward_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_no_bias_no_biasgrad_no_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_no_bias_no_biasgrad_no_dropout_maxk_64.cpp
index b329eeca0..5bdd29dbd 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_no_bias_no_biasgrad_no_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_backward_fp16_no_causalmask_no_bias_no_biasgrad_no_dropout_maxk_64.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_batched_backward.h"
 
-
 template void run_batched_backward_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bf16_has_causalmask_has_bias_has_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bf16_has_causalmask_has_bias_has_dropout_maxk_128.cpp
index bd85a5fdc..189677f41 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bf16_has_causalmask_has_bias_has_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bf16_has_causalmask_has_bias_has_dropout_maxk_128.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_batched_forward.h"
 
-
 template void run_batched_forward_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bf16_has_causalmask_has_bias_has_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bf16_has_causalmask_has_bias_has_dropout_maxk_256.cpp
index 2529a096f..39881bd0d 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bf16_has_causalmask_has_bias_has_dropout_maxk_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bf16_has_causalmask_has_bias_has_dropout_maxk_256.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_batched_forward.h"
 
-
 template void run_batched_forward_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bf16_has_causalmask_has_bias_has_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bf16_has_causalmask_has_bias_has_dropout_maxk_32.cpp
index 3bde17cb3..a24b8868a 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bf16_has_causalmask_has_bias_has_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bf16_has_causalmask_has_bias_has_dropout_maxk_32.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_batched_forward.h"
 
-
 template void run_batched_forward_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bf16_has_causalmask_has_bias_has_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bf16_has_causalmask_has_bias_has_dropout_maxk_64.cpp
index 50ff42476..849a6633b 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bf16_has_causalmask_has_bias_has_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bf16_has_causalmask_has_bias_has_dropout_maxk_64.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_batched_forward.h"
 
-
 template void run_batched_forward_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bf16_has_causalmask_has_bias_no_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bf16_has_causalmask_has_bias_no_dropout_maxk_128.cpp
index 44cd6d4d9..c49a96edb 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bf16_has_causalmask_has_bias_no_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bf16_has_causalmask_has_bias_no_dropout_maxk_128.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_batched_forward.h"
 
-
 template void run_batched_forward_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bf16_has_causalmask_has_bias_no_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bf16_has_causalmask_has_bias_no_dropout_maxk_256.cpp
index 04934417a..f362ff83b 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bf16_has_causalmask_has_bias_no_dropout_maxk_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bf16_has_causalmask_has_bias_no_dropout_maxk_256.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_batched_forward.h"
 
-
 template void run_batched_forward_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bf16_has_causalmask_has_bias_no_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bf16_has_causalmask_has_bias_no_dropout_maxk_32.cpp
index 29d774316..62205efbd 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bf16_has_causalmask_has_bias_no_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bf16_has_causalmask_has_bias_no_dropout_maxk_32.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_batched_forward.h"
 
-
 template void run_batched_forward_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bf16_has_causalmask_has_bias_no_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bf16_has_causalmask_has_bias_no_dropout_maxk_64.cpp
index f7a6fed93..c485fdfcd 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bf16_has_causalmask_has_bias_no_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bf16_has_causalmask_has_bias_no_dropout_maxk_64.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_batched_forward.h"
 
-
 template void run_batched_forward_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bf16_has_causalmask_no_bias_has_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bf16_has_causalmask_no_bias_has_dropout_maxk_128.cpp
index 73e6a902a..68345b50d 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bf16_has_causalmask_no_bias_has_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bf16_has_causalmask_no_bias_has_dropout_maxk_128.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_batched_forward.h"
 
-
 template void run_batched_forward_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bf16_has_causalmask_no_bias_has_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bf16_has_causalmask_no_bias_has_dropout_maxk_256.cpp
index f199398c5..4e3144c61 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bf16_has_causalmask_no_bias_has_dropout_maxk_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bf16_has_causalmask_no_bias_has_dropout_maxk_256.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_batched_forward.h"
 
-
 template void run_batched_forward_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bf16_has_causalmask_no_bias_has_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bf16_has_causalmask_no_bias_has_dropout_maxk_32.cpp
index bfac0e729..1654eb535 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bf16_has_causalmask_no_bias_has_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bf16_has_causalmask_no_bias_has_dropout_maxk_32.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_batched_forward.h"
 
-
 template void run_batched_forward_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bf16_has_causalmask_no_bias_has_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bf16_has_causalmask_no_bias_has_dropout_maxk_64.cpp
index bdbb9f67a..fef0b43b9 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bf16_has_causalmask_no_bias_has_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bf16_has_causalmask_no_bias_has_dropout_maxk_64.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_batched_forward.h"
 
-
 template void run_batched_forward_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bf16_has_causalmask_no_bias_no_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bf16_has_causalmask_no_bias_no_dropout_maxk_128.cpp
index a02390265..87d8256c2 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bf16_has_causalmask_no_bias_no_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bf16_has_causalmask_no_bias_no_dropout_maxk_128.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_batched_forward.h"
 
-
 template void run_batched_forward_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bf16_has_causalmask_no_bias_no_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bf16_has_causalmask_no_bias_no_dropout_maxk_256.cpp
index 6cf0c876c..521469e26 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bf16_has_causalmask_no_bias_no_dropout_maxk_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bf16_has_causalmask_no_bias_no_dropout_maxk_256.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_batched_forward.h"
 
-
 template void run_batched_forward_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bf16_has_causalmask_no_bias_no_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bf16_has_causalmask_no_bias_no_dropout_maxk_32.cpp
index a4e1acd3b..d2eeed020 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bf16_has_causalmask_no_bias_no_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bf16_has_causalmask_no_bias_no_dropout_maxk_32.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_batched_forward.h"
 
-
 template void run_batched_forward_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bf16_has_causalmask_no_bias_no_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bf16_has_causalmask_no_bias_no_dropout_maxk_64.cpp
index 42c97b8cc..77e509f0c 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bf16_has_causalmask_no_bias_no_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bf16_has_causalmask_no_bias_no_dropout_maxk_64.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_batched_forward.h"
 
-
 template void run_batched_forward_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bf16_no_causalmask_has_bias_has_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bf16_no_causalmask_has_bias_has_dropout_maxk_128.cpp
index dd1b22159..b0898e658 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bf16_no_causalmask_has_bias_has_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bf16_no_causalmask_has_bias_has_dropout_maxk_128.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_batched_forward.h"
 
-
 template void run_batched_forward_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bf16_no_causalmask_has_bias_has_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bf16_no_causalmask_has_bias_has_dropout_maxk_256.cpp
index c5cc1590f..aee8358c1 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bf16_no_causalmask_has_bias_has_dropout_maxk_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bf16_no_causalmask_has_bias_has_dropout_maxk_256.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_batched_forward.h"
 
-
 template void run_batched_forward_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bf16_no_causalmask_has_bias_has_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bf16_no_causalmask_has_bias_has_dropout_maxk_32.cpp
index ee0cc1d99..b949c5557 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bf16_no_causalmask_has_bias_has_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bf16_no_causalmask_has_bias_has_dropout_maxk_32.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_batched_forward.h"
 
-
 template void run_batched_forward_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bf16_no_causalmask_has_bias_has_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bf16_no_causalmask_has_bias_has_dropout_maxk_64.cpp
index 14142b105..3e28448d4 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bf16_no_causalmask_has_bias_has_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bf16_no_causalmask_has_bias_has_dropout_maxk_64.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_batched_forward.h"
 
-
 template void run_batched_forward_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bf16_no_causalmask_has_bias_no_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bf16_no_causalmask_has_bias_no_dropout_maxk_128.cpp
index 275fd42c1..eae1bef14 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bf16_no_causalmask_has_bias_no_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bf16_no_causalmask_has_bias_no_dropout_maxk_128.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_batched_forward.h"
 
-
 template void run_batched_forward_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bf16_no_causalmask_has_bias_no_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bf16_no_causalmask_has_bias_no_dropout_maxk_256.cpp
index 5fd214297..3fea67a9d 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bf16_no_causalmask_has_bias_no_dropout_maxk_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bf16_no_causalmask_has_bias_no_dropout_maxk_256.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_batched_forward.h"
 
-
 template void run_batched_forward_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bf16_no_causalmask_has_bias_no_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bf16_no_causalmask_has_bias_no_dropout_maxk_32.cpp
index 4decc0120..e9e1d8c03 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bf16_no_causalmask_has_bias_no_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bf16_no_causalmask_has_bias_no_dropout_maxk_32.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_batched_forward.h"
 
-
 template void run_batched_forward_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bf16_no_causalmask_has_bias_no_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bf16_no_causalmask_has_bias_no_dropout_maxk_64.cpp
index 3fd53bff1..0b5b5e9ac 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bf16_no_causalmask_has_bias_no_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bf16_no_causalmask_has_bias_no_dropout_maxk_64.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_batched_forward.h"
 
-
 template void run_batched_forward_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bf16_no_causalmask_no_bias_has_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bf16_no_causalmask_no_bias_has_dropout_maxk_128.cpp
index 1b2c2d743..20e880ae3 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bf16_no_causalmask_no_bias_has_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bf16_no_causalmask_no_bias_has_dropout_maxk_128.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_batched_forward.h"
 
-
 template void run_batched_forward_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bf16_no_causalmask_no_bias_has_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bf16_no_causalmask_no_bias_has_dropout_maxk_256.cpp
index 4f27dd5af..2d9e145b8 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bf16_no_causalmask_no_bias_has_dropout_maxk_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bf16_no_causalmask_no_bias_has_dropout_maxk_256.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_batched_forward.h"
 
-
 template void run_batched_forward_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bf16_no_causalmask_no_bias_has_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bf16_no_causalmask_no_bias_has_dropout_maxk_32.cpp
index b6e8741bc..12c05851b 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bf16_no_causalmask_no_bias_has_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bf16_no_causalmask_no_bias_has_dropout_maxk_32.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_batched_forward.h"
 
-
 template void run_batched_forward_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bf16_no_causalmask_no_bias_has_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bf16_no_causalmask_no_bias_has_dropout_maxk_64.cpp
index 3ab275d8b..296c93e84 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bf16_no_causalmask_no_bias_has_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bf16_no_causalmask_no_bias_has_dropout_maxk_64.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_batched_forward.h"
 
-
 template void run_batched_forward_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bf16_no_causalmask_no_bias_no_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bf16_no_causalmask_no_bias_no_dropout_maxk_128.cpp
index 84a92844c..ffcd7f0d8 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bf16_no_causalmask_no_bias_no_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bf16_no_causalmask_no_bias_no_dropout_maxk_128.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_batched_forward.h"
 
-
 template void run_batched_forward_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bf16_no_causalmask_no_bias_no_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bf16_no_causalmask_no_bias_no_dropout_maxk_256.cpp
index d381d7190..a0fbb353f 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bf16_no_causalmask_no_bias_no_dropout_maxk_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bf16_no_causalmask_no_bias_no_dropout_maxk_256.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_batched_forward.h"
 
-
 template void run_batched_forward_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bf16_no_causalmask_no_bias_no_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bf16_no_causalmask_no_bias_no_dropout_maxk_32.cpp
index 37d55967f..729e834bf 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bf16_no_causalmask_no_bias_no_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bf16_no_causalmask_no_bias_no_dropout_maxk_32.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_batched_forward.h"
 
-
 template void run_batched_forward_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bf16_no_causalmask_no_bias_no_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bf16_no_causalmask_no_bias_no_dropout_maxk_64.cpp
index afc8a232f..b2ee36ac2 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bf16_no_causalmask_no_bias_no_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_bf16_no_causalmask_no_bias_no_dropout_maxk_64.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_batched_forward.h"
 
-
 template void run_batched_forward_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_has_causalmask_has_bias_has_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_has_causalmask_has_bias_has_dropout_maxk_128.cpp
index faef825e7..e9c50c43e 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_has_causalmask_has_bias_has_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_has_causalmask_has_bias_has_dropout_maxk_128.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_batched_forward.h"
 
-
 template void run_batched_forward_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_has_causalmask_has_bias_has_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_has_causalmask_has_bias_has_dropout_maxk_256.cpp
index 846e10f69..98ad34421 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_has_causalmask_has_bias_has_dropout_maxk_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_has_causalmask_has_bias_has_dropout_maxk_256.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_batched_forward.h"
 
-
 template void run_batched_forward_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_has_causalmask_has_bias_has_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_has_causalmask_has_bias_has_dropout_maxk_32.cpp
index 6b5be61df..df8cb489a 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_has_causalmask_has_bias_has_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_has_causalmask_has_bias_has_dropout_maxk_32.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_batched_forward.h"
 
-
 template void run_batched_forward_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_has_causalmask_has_bias_has_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_has_causalmask_has_bias_has_dropout_maxk_64.cpp
index 84a34acd4..9ff6b6346 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_has_causalmask_has_bias_has_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_has_causalmask_has_bias_has_dropout_maxk_64.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_batched_forward.h"
 
-
 template void run_batched_forward_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_has_causalmask_has_bias_no_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_has_causalmask_has_bias_no_dropout_maxk_128.cpp
index 4ed15b231..8e5fc2b22 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_has_causalmask_has_bias_no_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_has_causalmask_has_bias_no_dropout_maxk_128.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_batched_forward.h"
 
-
 template void run_batched_forward_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_has_causalmask_has_bias_no_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_has_causalmask_has_bias_no_dropout_maxk_256.cpp
index 378ccb400..8489a8255 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_has_causalmask_has_bias_no_dropout_maxk_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_has_causalmask_has_bias_no_dropout_maxk_256.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_batched_forward.h"
 
-
 template void run_batched_forward_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_has_causalmask_has_bias_no_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_has_causalmask_has_bias_no_dropout_maxk_32.cpp
index 5b99bd861..0ab15f431 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_has_causalmask_has_bias_no_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_has_causalmask_has_bias_no_dropout_maxk_32.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_batched_forward.h"
 
-
 template void run_batched_forward_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_has_causalmask_has_bias_no_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_has_causalmask_has_bias_no_dropout_maxk_64.cpp
index a43b7f87e..89b57dc00 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_has_causalmask_has_bias_no_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_has_causalmask_has_bias_no_dropout_maxk_64.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_batched_forward.h"
 
-
 template void run_batched_forward_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_has_causalmask_no_bias_has_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_has_causalmask_no_bias_has_dropout_maxk_128.cpp
index 50627005f..286ce1f10 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_has_causalmask_no_bias_has_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_has_causalmask_no_bias_has_dropout_maxk_128.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_batched_forward.h"
 
-
 template void run_batched_forward_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_has_causalmask_no_bias_has_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_has_causalmask_no_bias_has_dropout_maxk_256.cpp
index b98232fda..0a32ecd5e 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_has_causalmask_no_bias_has_dropout_maxk_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_has_causalmask_no_bias_has_dropout_maxk_256.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_batched_forward.h"
 
-
 template void run_batched_forward_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_has_causalmask_no_bias_has_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_has_causalmask_no_bias_has_dropout_maxk_32.cpp
index b594cf6e4..5caa44509 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_has_causalmask_no_bias_has_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_has_causalmask_no_bias_has_dropout_maxk_32.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_batched_forward.h"
 
-
 template void run_batched_forward_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_has_causalmask_no_bias_has_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_has_causalmask_no_bias_has_dropout_maxk_64.cpp
index f18fba3bc..7b45b7050 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_has_causalmask_no_bias_has_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_has_causalmask_no_bias_has_dropout_maxk_64.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_batched_forward.h"
 
-
 template void run_batched_forward_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_has_causalmask_no_bias_no_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_has_causalmask_no_bias_no_dropout_maxk_128.cpp
index 5ba04db66..ea683ccd0 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_has_causalmask_no_bias_no_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_has_causalmask_no_bias_no_dropout_maxk_128.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_batched_forward.h"
 
-
 template void run_batched_forward_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_has_causalmask_no_bias_no_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_has_causalmask_no_bias_no_dropout_maxk_256.cpp
index 6828d19a0..c17397faf 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_has_causalmask_no_bias_no_dropout_maxk_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_has_causalmask_no_bias_no_dropout_maxk_256.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_batched_forward.h"
 
-
 template void run_batched_forward_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_has_causalmask_no_bias_no_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_has_causalmask_no_bias_no_dropout_maxk_32.cpp
index e75c9823d..6483bd6da 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_has_causalmask_no_bias_no_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_has_causalmask_no_bias_no_dropout_maxk_32.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_batched_forward.h"
 
-
 template void run_batched_forward_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_has_causalmask_no_bias_no_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_has_causalmask_no_bias_no_dropout_maxk_64.cpp
index 49cce8e9d..607227078 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_has_causalmask_no_bias_no_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_has_causalmask_no_bias_no_dropout_maxk_64.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_batched_forward.h"
 
-
 template void run_batched_forward_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_no_causalmask_has_bias_has_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_no_causalmask_has_bias_has_dropout_maxk_128.cpp
index cccd03ce4..1af052fb6 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_no_causalmask_has_bias_has_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_no_causalmask_has_bias_has_dropout_maxk_128.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_batched_forward.h"
 
-
 template void run_batched_forward_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_no_causalmask_has_bias_has_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_no_causalmask_has_bias_has_dropout_maxk_256.cpp
index 73fff51b8..5616cdc52 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_no_causalmask_has_bias_has_dropout_maxk_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_no_causalmask_has_bias_has_dropout_maxk_256.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_batched_forward.h"
 
-
 template void run_batched_forward_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_no_causalmask_has_bias_has_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_no_causalmask_has_bias_has_dropout_maxk_32.cpp
index d8ab68fa4..8b10f1192 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_no_causalmask_has_bias_has_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_no_causalmask_has_bias_has_dropout_maxk_32.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_batched_forward.h"
 
-
 template void run_batched_forward_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_no_causalmask_has_bias_has_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_no_causalmask_has_bias_has_dropout_maxk_64.cpp
index 807f27935..988a2fe2b 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_no_causalmask_has_bias_has_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_no_causalmask_has_bias_has_dropout_maxk_64.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_batched_forward.h"
 
-
 template void run_batched_forward_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_no_causalmask_has_bias_no_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_no_causalmask_has_bias_no_dropout_maxk_128.cpp
index 5695adc9c..9b5b928f7 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_no_causalmask_has_bias_no_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_no_causalmask_has_bias_no_dropout_maxk_128.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_batched_forward.h"
 
-
 template void run_batched_forward_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_no_causalmask_has_bias_no_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_no_causalmask_has_bias_no_dropout_maxk_256.cpp
index fb68f8181..1b36a0d25 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_no_causalmask_has_bias_no_dropout_maxk_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_no_causalmask_has_bias_no_dropout_maxk_256.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_batched_forward.h"
 
-
 template void run_batched_forward_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_no_causalmask_has_bias_no_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_no_causalmask_has_bias_no_dropout_maxk_32.cpp
index ba89bc3ee..785ecd397 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_no_causalmask_has_bias_no_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_no_causalmask_has_bias_no_dropout_maxk_32.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_batched_forward.h"
 
-
 template void run_batched_forward_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_no_causalmask_has_bias_no_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_no_causalmask_has_bias_no_dropout_maxk_64.cpp
index 3e3f6ec50..82199beb7 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_no_causalmask_has_bias_no_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_no_causalmask_has_bias_no_dropout_maxk_64.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_batched_forward.h"
 
-
 template void run_batched_forward_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_no_causalmask_no_bias_has_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_no_causalmask_no_bias_has_dropout_maxk_128.cpp
index 10871d7ce..e18cda6c9 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_no_causalmask_no_bias_has_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_no_causalmask_no_bias_has_dropout_maxk_128.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_batched_forward.h"
 
-
 template void run_batched_forward_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_no_causalmask_no_bias_has_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_no_causalmask_no_bias_has_dropout_maxk_256.cpp
index 56e2dce4b..ed23610a9 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_no_causalmask_no_bias_has_dropout_maxk_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_no_causalmask_no_bias_has_dropout_maxk_256.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_batched_forward.h"
 
-
 template void run_batched_forward_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_no_causalmask_no_bias_has_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_no_causalmask_no_bias_has_dropout_maxk_32.cpp
index b37f432d3..2e512e089 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_no_causalmask_no_bias_has_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_no_causalmask_no_bias_has_dropout_maxk_32.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_batched_forward.h"
 
-
 template void run_batched_forward_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_no_causalmask_no_bias_has_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_no_causalmask_no_bias_has_dropout_maxk_64.cpp
index 81962fc30..cfd204f04 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_no_causalmask_no_bias_has_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_no_causalmask_no_bias_has_dropout_maxk_64.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_batched_forward.h"
 
-
 template void run_batched_forward_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_no_causalmask_no_bias_no_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_no_causalmask_no_bias_no_dropout_maxk_128.cpp
index 56e6306f2..f161893bd 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_no_causalmask_no_bias_no_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_no_causalmask_no_bias_no_dropout_maxk_128.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_batched_forward.h"
 
-
 template void run_batched_forward_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_no_causalmask_no_bias_no_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_no_causalmask_no_bias_no_dropout_maxk_256.cpp
index 11bcea176..c37fb70c9 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_no_causalmask_no_bias_no_dropout_maxk_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_no_causalmask_no_bias_no_dropout_maxk_256.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_batched_forward.h"
 
-
 template void run_batched_forward_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_no_causalmask_no_bias_no_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_no_causalmask_no_bias_no_dropout_maxk_32.cpp
index 660e70185..f05aca856 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_no_causalmask_no_bias_no_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_no_causalmask_no_bias_no_dropout_maxk_32.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_batched_forward.h"
 
-
 template void run_batched_forward_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_no_causalmask_no_bias_no_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_no_causalmask_no_bias_no_dropout_maxk_64.cpp
index 69596971e..cd0f3d4ff 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_no_causalmask_no_bias_no_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_forward_fp16_no_causalmask_no_bias_no_dropout_maxk_64.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_batched_forward.h"
 
-
 template void run_batched_forward_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bf16_has_causalmask_has_bias_has_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bf16_has_causalmask_has_bias_has_dropout_maxk_128.cpp
index ebca11eb3..ad22843e3 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bf16_has_causalmask_has_bias_has_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bf16_has_causalmask_has_bias_has_dropout_maxk_128.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_batched_infer.h"
 
-
 template void run_batched_infer_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bf16_has_causalmask_has_bias_has_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bf16_has_causalmask_has_bias_has_dropout_maxk_256.cpp
index 5601af4e0..a457b90f3 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bf16_has_causalmask_has_bias_has_dropout_maxk_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bf16_has_causalmask_has_bias_has_dropout_maxk_256.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_batched_infer.h"
 
-
 template void run_batched_infer_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bf16_has_causalmask_has_bias_has_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bf16_has_causalmask_has_bias_has_dropout_maxk_32.cpp
index daa20d691..51d21df17 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bf16_has_causalmask_has_bias_has_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bf16_has_causalmask_has_bias_has_dropout_maxk_32.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_batched_infer.h"
 
-
 template void run_batched_infer_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bf16_has_causalmask_has_bias_has_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bf16_has_causalmask_has_bias_has_dropout_maxk_64.cpp
index 0f5bbf5dc..0c2a21bf6 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bf16_has_causalmask_has_bias_has_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bf16_has_causalmask_has_bias_has_dropout_maxk_64.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_batched_infer.h"
 
-
 template void run_batched_infer_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bf16_has_causalmask_has_bias_no_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bf16_has_causalmask_has_bias_no_dropout_maxk_128.cpp
index 884dffccd..4e33efc72 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bf16_has_causalmask_has_bias_no_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bf16_has_causalmask_has_bias_no_dropout_maxk_128.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_batched_infer.h"
 
-
 template void run_batched_infer_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bf16_has_causalmask_has_bias_no_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bf16_has_causalmask_has_bias_no_dropout_maxk_256.cpp
index 05d0edb57..f3eb7b0ec 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bf16_has_causalmask_has_bias_no_dropout_maxk_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bf16_has_causalmask_has_bias_no_dropout_maxk_256.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_batched_infer.h"
 
-
 template void run_batched_infer_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bf16_has_causalmask_has_bias_no_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bf16_has_causalmask_has_bias_no_dropout_maxk_32.cpp
index 40ee28738..d8db2ebe2 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bf16_has_causalmask_has_bias_no_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bf16_has_causalmask_has_bias_no_dropout_maxk_32.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_batched_infer.h"
 
-
 template void run_batched_infer_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bf16_has_causalmask_has_bias_no_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bf16_has_causalmask_has_bias_no_dropout_maxk_64.cpp
index 9ad0b9fab..72e7fb412 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bf16_has_causalmask_has_bias_no_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bf16_has_causalmask_has_bias_no_dropout_maxk_64.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_batched_infer.h"
 
-
 template void run_batched_infer_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bf16_has_causalmask_no_bias_has_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bf16_has_causalmask_no_bias_has_dropout_maxk_128.cpp
index a4e20b1cd..0b4ed8294 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bf16_has_causalmask_no_bias_has_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bf16_has_causalmask_no_bias_has_dropout_maxk_128.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_batched_infer.h"
 
-
 template void run_batched_infer_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bf16_has_causalmask_no_bias_has_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bf16_has_causalmask_no_bias_has_dropout_maxk_256.cpp
index 2132bab64..2e752c941 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bf16_has_causalmask_no_bias_has_dropout_maxk_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bf16_has_causalmask_no_bias_has_dropout_maxk_256.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_batched_infer.h"
 
-
 template void run_batched_infer_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bf16_has_causalmask_no_bias_has_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bf16_has_causalmask_no_bias_has_dropout_maxk_32.cpp
index 7933827a8..68366ee2f 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bf16_has_causalmask_no_bias_has_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bf16_has_causalmask_no_bias_has_dropout_maxk_32.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_batched_infer.h"
 
-
 template void run_batched_infer_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bf16_has_causalmask_no_bias_has_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bf16_has_causalmask_no_bias_has_dropout_maxk_64.cpp
index 2bf8f82a1..9d0c50e13 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bf16_has_causalmask_no_bias_has_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bf16_has_causalmask_no_bias_has_dropout_maxk_64.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_batched_infer.h"
 
-
 template void run_batched_infer_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bf16_has_causalmask_no_bias_no_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bf16_has_causalmask_no_bias_no_dropout_maxk_128.cpp
index 2fbbf6236..8129cbf85 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bf16_has_causalmask_no_bias_no_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bf16_has_causalmask_no_bias_no_dropout_maxk_128.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_batched_infer.h"
 
-
 template void run_batched_infer_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bf16_has_causalmask_no_bias_no_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bf16_has_causalmask_no_bias_no_dropout_maxk_256.cpp
index d1180dd33..3d6e897a4 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bf16_has_causalmask_no_bias_no_dropout_maxk_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bf16_has_causalmask_no_bias_no_dropout_maxk_256.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_batched_infer.h"
 
-
 template void run_batched_infer_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bf16_has_causalmask_no_bias_no_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bf16_has_causalmask_no_bias_no_dropout_maxk_32.cpp
index 2c56e4e56..c264d95ad 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bf16_has_causalmask_no_bias_no_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bf16_has_causalmask_no_bias_no_dropout_maxk_32.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_batched_infer.h"
 
-
 template void run_batched_infer_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bf16_has_causalmask_no_bias_no_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bf16_has_causalmask_no_bias_no_dropout_maxk_64.cpp
index e079e0748..fb8e9fb0a 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bf16_has_causalmask_no_bias_no_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bf16_has_causalmask_no_bias_no_dropout_maxk_64.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_batched_infer.h"
 
-
 template void run_batched_infer_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bf16_no_causalmask_has_bias_has_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bf16_no_causalmask_has_bias_has_dropout_maxk_128.cpp
index 0d9d667e1..db28d72f4 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bf16_no_causalmask_has_bias_has_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bf16_no_causalmask_has_bias_has_dropout_maxk_128.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_batched_infer.h"
 
-
 template void run_batched_infer_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bf16_no_causalmask_has_bias_has_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bf16_no_causalmask_has_bias_has_dropout_maxk_256.cpp
index 2e0b100ee..228bb5397 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bf16_no_causalmask_has_bias_has_dropout_maxk_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bf16_no_causalmask_has_bias_has_dropout_maxk_256.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_batched_infer.h"
 
-
 template void run_batched_infer_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bf16_no_causalmask_has_bias_has_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bf16_no_causalmask_has_bias_has_dropout_maxk_32.cpp
index b2712fce6..d0152e160 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bf16_no_causalmask_has_bias_has_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bf16_no_causalmask_has_bias_has_dropout_maxk_32.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_batched_infer.h"
 
-
 template void run_batched_infer_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bf16_no_causalmask_has_bias_has_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bf16_no_causalmask_has_bias_has_dropout_maxk_64.cpp
index 19321447a..8cb88dd94 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bf16_no_causalmask_has_bias_has_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bf16_no_causalmask_has_bias_has_dropout_maxk_64.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_batched_infer.h"
 
-
 template void run_batched_infer_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bf16_no_causalmask_has_bias_no_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bf16_no_causalmask_has_bias_no_dropout_maxk_128.cpp
index 8d33e6d0a..25c006c09 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bf16_no_causalmask_has_bias_no_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bf16_no_causalmask_has_bias_no_dropout_maxk_128.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_batched_infer.h"
 
-
 template void run_batched_infer_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bf16_no_causalmask_has_bias_no_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bf16_no_causalmask_has_bias_no_dropout_maxk_256.cpp
index 1a77a9ed2..77ab1fc3e 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bf16_no_causalmask_has_bias_no_dropout_maxk_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bf16_no_causalmask_has_bias_no_dropout_maxk_256.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_batched_infer.h"
 
-
 template void run_batched_infer_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bf16_no_causalmask_has_bias_no_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bf16_no_causalmask_has_bias_no_dropout_maxk_32.cpp
index 14f62535a..15311470c 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bf16_no_causalmask_has_bias_no_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bf16_no_causalmask_has_bias_no_dropout_maxk_32.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_batched_infer.h"
 
-
 template void run_batched_infer_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bf16_no_causalmask_has_bias_no_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bf16_no_causalmask_has_bias_no_dropout_maxk_64.cpp
index ed8caf20d..4c98864b2 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bf16_no_causalmask_has_bias_no_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bf16_no_causalmask_has_bias_no_dropout_maxk_64.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_batched_infer.h"
 
-
 template void run_batched_infer_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bf16_no_causalmask_no_bias_has_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bf16_no_causalmask_no_bias_has_dropout_maxk_128.cpp
index a3b553aaa..d20c61ee1 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bf16_no_causalmask_no_bias_has_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bf16_no_causalmask_no_bias_has_dropout_maxk_128.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_batched_infer.h"
 
-
 template void run_batched_infer_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bf16_no_causalmask_no_bias_has_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bf16_no_causalmask_no_bias_has_dropout_maxk_256.cpp
index c645172e7..0410708e1 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bf16_no_causalmask_no_bias_has_dropout_maxk_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bf16_no_causalmask_no_bias_has_dropout_maxk_256.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_batched_infer.h"
 
-
 template void run_batched_infer_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bf16_no_causalmask_no_bias_has_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bf16_no_causalmask_no_bias_has_dropout_maxk_32.cpp
index a92504458..d837f7b54 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bf16_no_causalmask_no_bias_has_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bf16_no_causalmask_no_bias_has_dropout_maxk_32.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_batched_infer.h"
 
-
 template void run_batched_infer_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bf16_no_causalmask_no_bias_has_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bf16_no_causalmask_no_bias_has_dropout_maxk_64.cpp
index a6d9ec1ee..7462600fb 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bf16_no_causalmask_no_bias_has_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bf16_no_causalmask_no_bias_has_dropout_maxk_64.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_batched_infer.h"
 
-
 template void run_batched_infer_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bf16_no_causalmask_no_bias_no_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bf16_no_causalmask_no_bias_no_dropout_maxk_128.cpp
index 2d3f4711c..65d1fd39a 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bf16_no_causalmask_no_bias_no_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bf16_no_causalmask_no_bias_no_dropout_maxk_128.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_batched_infer.h"
 
-
 template void run_batched_infer_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bf16_no_causalmask_no_bias_no_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bf16_no_causalmask_no_bias_no_dropout_maxk_256.cpp
index 4e87793d6..c0ea4369a 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bf16_no_causalmask_no_bias_no_dropout_maxk_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bf16_no_causalmask_no_bias_no_dropout_maxk_256.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_batched_infer.h"
 
-
 template void run_batched_infer_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bf16_no_causalmask_no_bias_no_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bf16_no_causalmask_no_bias_no_dropout_maxk_32.cpp
index b627025e5..b46f0c0c8 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bf16_no_causalmask_no_bias_no_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bf16_no_causalmask_no_bias_no_dropout_maxk_32.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_batched_infer.h"
 
-
 template void run_batched_infer_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bf16_no_causalmask_no_bias_no_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bf16_no_causalmask_no_bias_no_dropout_maxk_64.cpp
index ff2957c10..8051de4d9 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bf16_no_causalmask_no_bias_no_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_bf16_no_causalmask_no_bias_no_dropout_maxk_64.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_batched_infer.h"
 
-
 template void run_batched_infer_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_has_causalmask_has_bias_has_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_has_causalmask_has_bias_has_dropout_maxk_128.cpp
index c5cf71b09..c1ee8c769 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_has_causalmask_has_bias_has_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_has_causalmask_has_bias_has_dropout_maxk_128.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_batched_infer.h"
 
-
 template void run_batched_infer_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_has_causalmask_has_bias_has_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_has_causalmask_has_bias_has_dropout_maxk_256.cpp
index 3cda93ebd..46a38e82d 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_has_causalmask_has_bias_has_dropout_maxk_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_has_causalmask_has_bias_has_dropout_maxk_256.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_batched_infer.h"
 
-
 template void run_batched_infer_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_has_causalmask_has_bias_has_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_has_causalmask_has_bias_has_dropout_maxk_32.cpp
index d99c733b4..6040d41cd 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_has_causalmask_has_bias_has_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_has_causalmask_has_bias_has_dropout_maxk_32.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_batched_infer.h"
 
-
 template void run_batched_infer_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_has_causalmask_has_bias_has_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_has_causalmask_has_bias_has_dropout_maxk_64.cpp
index e0e604f1c..db5d5d577 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_has_causalmask_has_bias_has_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_has_causalmask_has_bias_has_dropout_maxk_64.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_batched_infer.h"
 
-
 template void run_batched_infer_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_has_causalmask_has_bias_no_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_has_causalmask_has_bias_no_dropout_maxk_128.cpp
index 9148a2624..ccc0a0254 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_has_causalmask_has_bias_no_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_has_causalmask_has_bias_no_dropout_maxk_128.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_batched_infer.h"
 
-
 template void run_batched_infer_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_has_causalmask_has_bias_no_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_has_causalmask_has_bias_no_dropout_maxk_256.cpp
index 45d96f13d..d81ff0d38 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_has_causalmask_has_bias_no_dropout_maxk_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_has_causalmask_has_bias_no_dropout_maxk_256.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_batched_infer.h"
 
-
 template void run_batched_infer_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_has_causalmask_has_bias_no_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_has_causalmask_has_bias_no_dropout_maxk_32.cpp
index a0096a6d7..48b74b2bc 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_has_causalmask_has_bias_no_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_has_causalmask_has_bias_no_dropout_maxk_32.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_batched_infer.h"
 
-
 template void run_batched_infer_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_has_causalmask_has_bias_no_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_has_causalmask_has_bias_no_dropout_maxk_64.cpp
index a16e08a30..fda07f6cd 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_has_causalmask_has_bias_no_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_has_causalmask_has_bias_no_dropout_maxk_64.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_batched_infer.h"
 
-
 template void run_batched_infer_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_has_causalmask_no_bias_has_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_has_causalmask_no_bias_has_dropout_maxk_128.cpp
index 5adffd056..43069dd54 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_has_causalmask_no_bias_has_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_has_causalmask_no_bias_has_dropout_maxk_128.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_batched_infer.h"
 
-
 template void run_batched_infer_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_has_causalmask_no_bias_has_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_has_causalmask_no_bias_has_dropout_maxk_256.cpp
index 7004a13a6..bf8afd424 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_has_causalmask_no_bias_has_dropout_maxk_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_has_causalmask_no_bias_has_dropout_maxk_256.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_batched_infer.h"
 
-
 template void run_batched_infer_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_has_causalmask_no_bias_has_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_has_causalmask_no_bias_has_dropout_maxk_32.cpp
index f8cad2c3a..351f5ea1d 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_has_causalmask_no_bias_has_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_has_causalmask_no_bias_has_dropout_maxk_32.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_batched_infer.h"
 
-
 template void run_batched_infer_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_has_causalmask_no_bias_has_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_has_causalmask_no_bias_has_dropout_maxk_64.cpp
index 1270dd2ea..d06dc1f10 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_has_causalmask_no_bias_has_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_has_causalmask_no_bias_has_dropout_maxk_64.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_batched_infer.h"
 
-
 template void run_batched_infer_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_has_causalmask_no_bias_no_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_has_causalmask_no_bias_no_dropout_maxk_128.cpp
index 647c50792..df91366da 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_has_causalmask_no_bias_no_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_has_causalmask_no_bias_no_dropout_maxk_128.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_batched_infer.h"
 
-
 template void run_batched_infer_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_has_causalmask_no_bias_no_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_has_causalmask_no_bias_no_dropout_maxk_256.cpp
index a85a5360d..4c292918b 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_has_causalmask_no_bias_no_dropout_maxk_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_has_causalmask_no_bias_no_dropout_maxk_256.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_batched_infer.h"
 
-
 template void run_batched_infer_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_has_causalmask_no_bias_no_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_has_causalmask_no_bias_no_dropout_maxk_32.cpp
index 3c12b1e8a..9dc31e3ea 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_has_causalmask_no_bias_no_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_has_causalmask_no_bias_no_dropout_maxk_32.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_batched_infer.h"
 
-
 template void run_batched_infer_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_has_causalmask_no_bias_no_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_has_causalmask_no_bias_no_dropout_maxk_64.cpp
index fa214ebf7..2bbd4f3dd 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_has_causalmask_no_bias_no_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_has_causalmask_no_bias_no_dropout_maxk_64.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_batched_infer.h"
 
-
 template void run_batched_infer_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_no_causalmask_has_bias_has_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_no_causalmask_has_bias_has_dropout_maxk_128.cpp
index 3d12babd5..37f18fd7d 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_no_causalmask_has_bias_has_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_no_causalmask_has_bias_has_dropout_maxk_128.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_batched_infer.h"
 
-
 template void run_batched_infer_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_no_causalmask_has_bias_has_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_no_causalmask_has_bias_has_dropout_maxk_256.cpp
index 5231f0d2e..dd5ec2118 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_no_causalmask_has_bias_has_dropout_maxk_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_no_causalmask_has_bias_has_dropout_maxk_256.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_batched_infer.h"
 
-
 template void run_batched_infer_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_no_causalmask_has_bias_has_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_no_causalmask_has_bias_has_dropout_maxk_32.cpp
index 97c433883..3afe1c2f8 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_no_causalmask_has_bias_has_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_no_causalmask_has_bias_has_dropout_maxk_32.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_batched_infer.h"
 
-
 template void run_batched_infer_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_no_causalmask_has_bias_has_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_no_causalmask_has_bias_has_dropout_maxk_64.cpp
index b744f412b..e9ddc972d 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_no_causalmask_has_bias_has_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_no_causalmask_has_bias_has_dropout_maxk_64.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_batched_infer.h"
 
-
 template void run_batched_infer_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_no_causalmask_has_bias_no_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_no_causalmask_has_bias_no_dropout_maxk_128.cpp
index e9701e2db..609b4981c 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_no_causalmask_has_bias_no_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_no_causalmask_has_bias_no_dropout_maxk_128.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_batched_infer.h"
 
-
 template void run_batched_infer_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_no_causalmask_has_bias_no_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_no_causalmask_has_bias_no_dropout_maxk_256.cpp
index 075610634..5fca4f4ee 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_no_causalmask_has_bias_no_dropout_maxk_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_no_causalmask_has_bias_no_dropout_maxk_256.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_batched_infer.h"
 
-
 template void run_batched_infer_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_no_causalmask_has_bias_no_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_no_causalmask_has_bias_no_dropout_maxk_32.cpp
index ab5423bfd..fe3a2e2bc 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_no_causalmask_has_bias_no_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_no_causalmask_has_bias_no_dropout_maxk_32.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_batched_infer.h"
 
-
 template void run_batched_infer_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_no_causalmask_has_bias_no_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_no_causalmask_has_bias_no_dropout_maxk_64.cpp
index 6a08c4772..d077701b9 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_no_causalmask_has_bias_no_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_no_causalmask_has_bias_no_dropout_maxk_64.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_batched_infer.h"
 
-
 template void run_batched_infer_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_no_causalmask_no_bias_has_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_no_causalmask_no_bias_has_dropout_maxk_128.cpp
index 44a3a6a76..501a83e9a 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_no_causalmask_no_bias_has_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_no_causalmask_no_bias_has_dropout_maxk_128.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_batched_infer.h"
 
-
 template void run_batched_infer_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_no_causalmask_no_bias_has_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_no_causalmask_no_bias_has_dropout_maxk_256.cpp
index 8444c310e..d0b619f60 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_no_causalmask_no_bias_has_dropout_maxk_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_no_causalmask_no_bias_has_dropout_maxk_256.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_batched_infer.h"
 
-
 template void run_batched_infer_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_no_causalmask_no_bias_has_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_no_causalmask_no_bias_has_dropout_maxk_32.cpp
index 3cb04e9d3..af0bc1c85 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_no_causalmask_no_bias_has_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_no_causalmask_no_bias_has_dropout_maxk_32.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_batched_infer.h"
 
-
 template void run_batched_infer_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_no_causalmask_no_bias_has_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_no_causalmask_no_bias_has_dropout_maxk_64.cpp
index ea7862776..578454c52 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_no_causalmask_no_bias_has_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_no_causalmask_no_bias_has_dropout_maxk_64.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_batched_infer.h"
 
-
 template void run_batched_infer_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_no_causalmask_no_bias_no_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_no_causalmask_no_bias_no_dropout_maxk_128.cpp
index 809acb6e9..d20d225cd 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_no_causalmask_no_bias_no_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_no_causalmask_no_bias_no_dropout_maxk_128.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_batched_infer.h"
 
-
 template void run_batched_infer_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_no_causalmask_no_bias_no_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_no_causalmask_no_bias_no_dropout_maxk_256.cpp
index 59c1812b0..ce76fd765 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_no_causalmask_no_bias_no_dropout_maxk_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_no_causalmask_no_bias_no_dropout_maxk_256.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_batched_infer.h"
 
-
 template void run_batched_infer_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_no_causalmask_no_bias_no_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_no_causalmask_no_bias_no_dropout_maxk_32.cpp
index 23c34e385..ca44ac6b0 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_no_causalmask_no_bias_no_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_no_causalmask_no_bias_no_dropout_maxk_32.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_batched_infer.h"
 
-
 template void run_batched_infer_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_no_causalmask_no_bias_no_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_no_causalmask_no_bias_no_dropout_maxk_64.cpp
index 3f5085b29..5d7589a16 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_no_causalmask_no_bias_no_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_batched_infer_fp16_no_causalmask_no_bias_no_dropout_maxk_64.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_batched_infer.h"
 
-
 template void run_batched_infer_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_has_causalmask_has_bias_has_biasgrad_has_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_has_causalmask_has_bias_has_biasgrad_has_dropout_maxk_128.cpp
index da52b4524..c22b793d3 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_has_causalmask_has_bias_has_biasgrad_has_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_has_causalmask_has_bias_has_biasgrad_has_dropout_maxk_128.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_grouped_backward.h"
 
-
 template void run_grouped_backward_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_has_causalmask_has_bias_has_biasgrad_has_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_has_causalmask_has_bias_has_biasgrad_has_dropout_maxk_256.cpp
index 1e61eb1e1..f4b7a307a 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_has_causalmask_has_bias_has_biasgrad_has_dropout_maxk_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_has_causalmask_has_bias_has_biasgrad_has_dropout_maxk_256.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_grouped_backward.h"
 
-
 template void run_grouped_backward_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_has_causalmask_has_bias_has_biasgrad_has_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_has_causalmask_has_bias_has_biasgrad_has_dropout_maxk_32.cpp
index 136309d34..c5b1454c5 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_has_causalmask_has_bias_has_biasgrad_has_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_has_causalmask_has_bias_has_biasgrad_has_dropout_maxk_32.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_grouped_backward.h"
 
-
 template void run_grouped_backward_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_has_causalmask_has_bias_has_biasgrad_has_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_has_causalmask_has_bias_has_biasgrad_has_dropout_maxk_64.cpp
index 06c6d3252..c8c71960d 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_has_causalmask_has_bias_has_biasgrad_has_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_has_causalmask_has_bias_has_biasgrad_has_dropout_maxk_64.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_grouped_backward.h"
 
-
 template void run_grouped_backward_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_has_causalmask_has_bias_has_biasgrad_no_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_has_causalmask_has_bias_has_biasgrad_no_dropout_maxk_128.cpp
index 10edbf6c0..de55b8e88 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_has_causalmask_has_bias_has_biasgrad_no_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_has_causalmask_has_bias_has_biasgrad_no_dropout_maxk_128.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_grouped_backward.h"
 
-
 template void run_grouped_backward_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_has_causalmask_has_bias_has_biasgrad_no_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_has_causalmask_has_bias_has_biasgrad_no_dropout_maxk_256.cpp
index 0c8ebfca6..577c43def 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_has_causalmask_has_bias_has_biasgrad_no_dropout_maxk_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_has_causalmask_has_bias_has_biasgrad_no_dropout_maxk_256.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_grouped_backward.h"
 
-
 template void run_grouped_backward_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_has_causalmask_has_bias_has_biasgrad_no_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_has_causalmask_has_bias_has_biasgrad_no_dropout_maxk_32.cpp
index d43472c5c..9ffa70e78 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_has_causalmask_has_bias_has_biasgrad_no_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_has_causalmask_has_bias_has_biasgrad_no_dropout_maxk_32.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_grouped_backward.h"
 
-
 template void run_grouped_backward_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_has_causalmask_has_bias_has_biasgrad_no_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_has_causalmask_has_bias_has_biasgrad_no_dropout_maxk_64.cpp
index 2002eecd6..71ac1de6f 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_has_causalmask_has_bias_has_biasgrad_no_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_has_causalmask_has_bias_has_biasgrad_no_dropout_maxk_64.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_grouped_backward.h"
 
-
 template void run_grouped_backward_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_has_causalmask_has_bias_no_biasgrad_has_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_has_causalmask_has_bias_no_biasgrad_has_dropout_maxk_128.cpp
index ae5874ec2..f2baaf01d 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_has_causalmask_has_bias_no_biasgrad_has_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_has_causalmask_has_bias_no_biasgrad_has_dropout_maxk_128.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_grouped_backward.h"
 
-
 template void run_grouped_backward_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_has_causalmask_has_bias_no_biasgrad_has_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_has_causalmask_has_bias_no_biasgrad_has_dropout_maxk_256.cpp
index 8436316d1..18d194062 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_has_causalmask_has_bias_no_biasgrad_has_dropout_maxk_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_has_causalmask_has_bias_no_biasgrad_has_dropout_maxk_256.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_grouped_backward.h"
 
-
 template void run_grouped_backward_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_has_causalmask_has_bias_no_biasgrad_has_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_has_causalmask_has_bias_no_biasgrad_has_dropout_maxk_32.cpp
index fc0a04b31..8e87f044d 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_has_causalmask_has_bias_no_biasgrad_has_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_has_causalmask_has_bias_no_biasgrad_has_dropout_maxk_32.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_grouped_backward.h"
 
-
 template void run_grouped_backward_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_has_causalmask_has_bias_no_biasgrad_has_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_has_causalmask_has_bias_no_biasgrad_has_dropout_maxk_64.cpp
index f94f947a7..dbe7c0560 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_has_causalmask_has_bias_no_biasgrad_has_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_has_causalmask_has_bias_no_biasgrad_has_dropout_maxk_64.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_grouped_backward.h"
 
-
 template void run_grouped_backward_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_has_causalmask_has_bias_no_biasgrad_no_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_has_causalmask_has_bias_no_biasgrad_no_dropout_maxk_128.cpp
index 875c8acfd..7a293a973 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_has_causalmask_has_bias_no_biasgrad_no_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_has_causalmask_has_bias_no_biasgrad_no_dropout_maxk_128.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_grouped_backward.h"
 
-
 template void run_grouped_backward_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_has_causalmask_has_bias_no_biasgrad_no_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_has_causalmask_has_bias_no_biasgrad_no_dropout_maxk_256.cpp
index ec424034e..dc5f5c749 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_has_causalmask_has_bias_no_biasgrad_no_dropout_maxk_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_has_causalmask_has_bias_no_biasgrad_no_dropout_maxk_256.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_grouped_backward.h"
 
-
 template void run_grouped_backward_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_has_causalmask_has_bias_no_biasgrad_no_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_has_causalmask_has_bias_no_biasgrad_no_dropout_maxk_32.cpp
index 75c82d385..8b878747f 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_has_causalmask_has_bias_no_biasgrad_no_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_has_causalmask_has_bias_no_biasgrad_no_dropout_maxk_32.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_grouped_backward.h"
 
-
 template void run_grouped_backward_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_has_causalmask_has_bias_no_biasgrad_no_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_has_causalmask_has_bias_no_biasgrad_no_dropout_maxk_64.cpp
index 1ac2b6c68..1871a6cbe 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_has_causalmask_has_bias_no_biasgrad_no_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_has_causalmask_has_bias_no_biasgrad_no_dropout_maxk_64.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_grouped_backward.h"
 
-
 template void run_grouped_backward_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_has_causalmask_no_bias_no_biasgrad_has_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_has_causalmask_no_bias_no_biasgrad_has_dropout_maxk_128.cpp
index 4d99c381d..295e3f403 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_has_causalmask_no_bias_no_biasgrad_has_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_has_causalmask_no_bias_no_biasgrad_has_dropout_maxk_128.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_grouped_backward.h"
 
-
 template void run_grouped_backward_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_has_causalmask_no_bias_no_biasgrad_has_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_has_causalmask_no_bias_no_biasgrad_has_dropout_maxk_256.cpp
index b39de523c..e23b3c60b 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_has_causalmask_no_bias_no_biasgrad_has_dropout_maxk_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_has_causalmask_no_bias_no_biasgrad_has_dropout_maxk_256.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_grouped_backward.h"
 
-
 template void run_grouped_backward_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_has_causalmask_no_bias_no_biasgrad_has_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_has_causalmask_no_bias_no_biasgrad_has_dropout_maxk_32.cpp
index 57bfe1e9b..08af2d667 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_has_causalmask_no_bias_no_biasgrad_has_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_has_causalmask_no_bias_no_biasgrad_has_dropout_maxk_32.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_grouped_backward.h"
 
-
 template void run_grouped_backward_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_has_causalmask_no_bias_no_biasgrad_has_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_has_causalmask_no_bias_no_biasgrad_has_dropout_maxk_64.cpp
index 671cf1f5e..4d2d7e78d 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_has_causalmask_no_bias_no_biasgrad_has_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_has_causalmask_no_bias_no_biasgrad_has_dropout_maxk_64.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_grouped_backward.h"
 
-
 template void run_grouped_backward_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_has_causalmask_no_bias_no_biasgrad_no_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_has_causalmask_no_bias_no_biasgrad_no_dropout_maxk_128.cpp
index 8d8044832..43fc95070 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_has_causalmask_no_bias_no_biasgrad_no_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_has_causalmask_no_bias_no_biasgrad_no_dropout_maxk_128.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_grouped_backward.h"
 
-
 template void run_grouped_backward_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_has_causalmask_no_bias_no_biasgrad_no_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_has_causalmask_no_bias_no_biasgrad_no_dropout_maxk_256.cpp
index 646e3dc93..b85fa82e9 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_has_causalmask_no_bias_no_biasgrad_no_dropout_maxk_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_has_causalmask_no_bias_no_biasgrad_no_dropout_maxk_256.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_grouped_backward.h"
 
-
 template void run_grouped_backward_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_has_causalmask_no_bias_no_biasgrad_no_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_has_causalmask_no_bias_no_biasgrad_no_dropout_maxk_32.cpp
index e3be7a247..86d8d4776 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_has_causalmask_no_bias_no_biasgrad_no_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_has_causalmask_no_bias_no_biasgrad_no_dropout_maxk_32.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_grouped_backward.h"
 
-
 template void run_grouped_backward_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_has_causalmask_no_bias_no_biasgrad_no_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_has_causalmask_no_bias_no_biasgrad_no_dropout_maxk_64.cpp
index aace93798..e8e862d54 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_has_causalmask_no_bias_no_biasgrad_no_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_has_causalmask_no_bias_no_biasgrad_no_dropout_maxk_64.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_grouped_backward.h"
 
-
 template void run_grouped_backward_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_no_causalmask_has_bias_has_biasgrad_has_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_no_causalmask_has_bias_has_biasgrad_has_dropout_maxk_128.cpp
index 22e1faa7b..76a4e7dcb 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_no_causalmask_has_bias_has_biasgrad_has_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_no_causalmask_has_bias_has_biasgrad_has_dropout_maxk_128.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_grouped_backward.h"
 
-
 template void run_grouped_backward_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_no_causalmask_has_bias_has_biasgrad_has_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_no_causalmask_has_bias_has_biasgrad_has_dropout_maxk_256.cpp
index 6f43a6f29..a4b3c633d 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_no_causalmask_has_bias_has_biasgrad_has_dropout_maxk_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_no_causalmask_has_bias_has_biasgrad_has_dropout_maxk_256.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_grouped_backward.h"
 
-
 template void run_grouped_backward_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_no_causalmask_has_bias_has_biasgrad_has_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_no_causalmask_has_bias_has_biasgrad_has_dropout_maxk_32.cpp
index 00b6b1fe2..1ba22ae61 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_no_causalmask_has_bias_has_biasgrad_has_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_no_causalmask_has_bias_has_biasgrad_has_dropout_maxk_32.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_grouped_backward.h"
 
-
 template void run_grouped_backward_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_no_causalmask_has_bias_has_biasgrad_has_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_no_causalmask_has_bias_has_biasgrad_has_dropout_maxk_64.cpp
index 8f635c6a9..07813b2c5 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_no_causalmask_has_bias_has_biasgrad_has_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_no_causalmask_has_bias_has_biasgrad_has_dropout_maxk_64.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_grouped_backward.h"
 
-
 template void run_grouped_backward_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_no_causalmask_has_bias_has_biasgrad_no_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_no_causalmask_has_bias_has_biasgrad_no_dropout_maxk_128.cpp
index 6ce4770a8..42818cfa9 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_no_causalmask_has_bias_has_biasgrad_no_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_no_causalmask_has_bias_has_biasgrad_no_dropout_maxk_128.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_grouped_backward.h"
 
-
 template void run_grouped_backward_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_no_causalmask_has_bias_has_biasgrad_no_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_no_causalmask_has_bias_has_biasgrad_no_dropout_maxk_256.cpp
index b19238d3e..07b019af4 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_no_causalmask_has_bias_has_biasgrad_no_dropout_maxk_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_no_causalmask_has_bias_has_biasgrad_no_dropout_maxk_256.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_grouped_backward.h"
 
-
 template void run_grouped_backward_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_no_causalmask_has_bias_has_biasgrad_no_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_no_causalmask_has_bias_has_biasgrad_no_dropout_maxk_32.cpp
index cfc040870..485b64775 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_no_causalmask_has_bias_has_biasgrad_no_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_no_causalmask_has_bias_has_biasgrad_no_dropout_maxk_32.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_grouped_backward.h"
 
-
 template void run_grouped_backward_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_no_causalmask_has_bias_has_biasgrad_no_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_no_causalmask_has_bias_has_biasgrad_no_dropout_maxk_64.cpp
index 57280c0f3..ac1bccc14 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_no_causalmask_has_bias_has_biasgrad_no_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_no_causalmask_has_bias_has_biasgrad_no_dropout_maxk_64.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_grouped_backward.h"
 
-
 template void run_grouped_backward_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_no_causalmask_has_bias_no_biasgrad_has_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_no_causalmask_has_bias_no_biasgrad_has_dropout_maxk_128.cpp
index 38106aded..65b67988a 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_no_causalmask_has_bias_no_biasgrad_has_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_no_causalmask_has_bias_no_biasgrad_has_dropout_maxk_128.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_grouped_backward.h"
 
-
 template void run_grouped_backward_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_no_causalmask_has_bias_no_biasgrad_has_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_no_causalmask_has_bias_no_biasgrad_has_dropout_maxk_256.cpp
index a98415a60..81616d6af 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_no_causalmask_has_bias_no_biasgrad_has_dropout_maxk_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_no_causalmask_has_bias_no_biasgrad_has_dropout_maxk_256.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_grouped_backward.h"
 
-
 template void run_grouped_backward_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_no_causalmask_has_bias_no_biasgrad_has_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_no_causalmask_has_bias_no_biasgrad_has_dropout_maxk_32.cpp
index 142824508..9fc0a6c62 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_no_causalmask_has_bias_no_biasgrad_has_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_no_causalmask_has_bias_no_biasgrad_has_dropout_maxk_32.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_grouped_backward.h"
 
-
 template void run_grouped_backward_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_no_causalmask_has_bias_no_biasgrad_has_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_no_causalmask_has_bias_no_biasgrad_has_dropout_maxk_64.cpp
index 6d9ce7550..dfbcd25be 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_no_causalmask_has_bias_no_biasgrad_has_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_no_causalmask_has_bias_no_biasgrad_has_dropout_maxk_64.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_grouped_backward.h"
 
-
 template void run_grouped_backward_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_no_causalmask_has_bias_no_biasgrad_no_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_no_causalmask_has_bias_no_biasgrad_no_dropout_maxk_128.cpp
index 9f4f7944b..8650510c3 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_no_causalmask_has_bias_no_biasgrad_no_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_no_causalmask_has_bias_no_biasgrad_no_dropout_maxk_128.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_grouped_backward.h"
 
-
 template void run_grouped_backward_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_no_causalmask_has_bias_no_biasgrad_no_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_no_causalmask_has_bias_no_biasgrad_no_dropout_maxk_256.cpp
index 05a9e830c..261017c52 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_no_causalmask_has_bias_no_biasgrad_no_dropout_maxk_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_no_causalmask_has_bias_no_biasgrad_no_dropout_maxk_256.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_grouped_backward.h"
 
-
 template void run_grouped_backward_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_no_causalmask_has_bias_no_biasgrad_no_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_no_causalmask_has_bias_no_biasgrad_no_dropout_maxk_32.cpp
index 469f7ee4a..842c071d9 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_no_causalmask_has_bias_no_biasgrad_no_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_no_causalmask_has_bias_no_biasgrad_no_dropout_maxk_32.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_grouped_backward.h"
 
-
 template void run_grouped_backward_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_no_causalmask_has_bias_no_biasgrad_no_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_no_causalmask_has_bias_no_biasgrad_no_dropout_maxk_64.cpp
index bc76b94c5..1bf3602e3 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_no_causalmask_has_bias_no_biasgrad_no_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_no_causalmask_has_bias_no_biasgrad_no_dropout_maxk_64.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_grouped_backward.h"
 
-
 template void run_grouped_backward_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_no_causalmask_no_bias_no_biasgrad_has_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_no_causalmask_no_bias_no_biasgrad_has_dropout_maxk_128.cpp
index a504db1c5..302c566e7 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_no_causalmask_no_bias_no_biasgrad_has_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_no_causalmask_no_bias_no_biasgrad_has_dropout_maxk_128.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_grouped_backward.h"
 
-
 template void run_grouped_backward_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_no_causalmask_no_bias_no_biasgrad_has_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_no_causalmask_no_bias_no_biasgrad_has_dropout_maxk_256.cpp
index 8a5e31b51..c3f030c5f 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_no_causalmask_no_bias_no_biasgrad_has_dropout_maxk_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_no_causalmask_no_bias_no_biasgrad_has_dropout_maxk_256.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_grouped_backward.h"
 
-
 template void run_grouped_backward_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_no_causalmask_no_bias_no_biasgrad_has_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_no_causalmask_no_bias_no_biasgrad_has_dropout_maxk_32.cpp
index f5c628c18..070e74116 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_no_causalmask_no_bias_no_biasgrad_has_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_no_causalmask_no_bias_no_biasgrad_has_dropout_maxk_32.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_grouped_backward.h"
 
-
 template void run_grouped_backward_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_no_causalmask_no_bias_no_biasgrad_has_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_no_causalmask_no_bias_no_biasgrad_has_dropout_maxk_64.cpp
index 2bc167aa7..8011c547d 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_no_causalmask_no_bias_no_biasgrad_has_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_no_causalmask_no_bias_no_biasgrad_has_dropout_maxk_64.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_grouped_backward.h"
 
-
 template void run_grouped_backward_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_no_causalmask_no_bias_no_biasgrad_no_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_no_causalmask_no_bias_no_biasgrad_no_dropout_maxk_128.cpp
index b06c9143e..249bf2a54 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_no_causalmask_no_bias_no_biasgrad_no_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_no_causalmask_no_bias_no_biasgrad_no_dropout_maxk_128.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_grouped_backward.h"
 
-
 template void run_grouped_backward_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_no_causalmask_no_bias_no_biasgrad_no_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_no_causalmask_no_bias_no_biasgrad_no_dropout_maxk_256.cpp
index a03c7b019..9fed2aefc 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_no_causalmask_no_bias_no_biasgrad_no_dropout_maxk_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_no_causalmask_no_bias_no_biasgrad_no_dropout_maxk_256.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_grouped_backward.h"
 
-
 template void run_grouped_backward_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_no_causalmask_no_bias_no_biasgrad_no_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_no_causalmask_no_bias_no_biasgrad_no_dropout_maxk_32.cpp
index 542c82ac2..224d5f1bc 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_no_causalmask_no_bias_no_biasgrad_no_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_no_causalmask_no_bias_no_biasgrad_no_dropout_maxk_32.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_grouped_backward.h"
 
-
 template void run_grouped_backward_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_no_causalmask_no_bias_no_biasgrad_no_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_no_causalmask_no_bias_no_biasgrad_no_dropout_maxk_64.cpp
index 02c6caf0c..43fea8dee 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_no_causalmask_no_bias_no_biasgrad_no_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_bf16_no_causalmask_no_bias_no_biasgrad_no_dropout_maxk_64.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_grouped_backward.h"
 
-
 template void run_grouped_backward_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_has_bias_has_biasgrad_has_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_has_bias_has_biasgrad_has_dropout_maxk_128.cpp
index 647dfed39..dc70813fc 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_has_bias_has_biasgrad_has_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_has_bias_has_biasgrad_has_dropout_maxk_128.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_grouped_backward.h"
 
-
 template void run_grouped_backward_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_has_bias_has_biasgrad_has_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_has_bias_has_biasgrad_has_dropout_maxk_256.cpp
index 8408f10e5..10ae8c302 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_has_bias_has_biasgrad_has_dropout_maxk_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_has_bias_has_biasgrad_has_dropout_maxk_256.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_grouped_backward.h"
 
-
 template void run_grouped_backward_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_has_bias_has_biasgrad_has_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_has_bias_has_biasgrad_has_dropout_maxk_32.cpp
index 6f6baa130..4fdbb099c 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_has_bias_has_biasgrad_has_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_has_bias_has_biasgrad_has_dropout_maxk_32.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_grouped_backward.h"
 
-
 template void run_grouped_backward_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_has_bias_has_biasgrad_has_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_has_bias_has_biasgrad_has_dropout_maxk_64.cpp
index fba9304bf..e5d4365a1 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_has_bias_has_biasgrad_has_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_has_bias_has_biasgrad_has_dropout_maxk_64.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_grouped_backward.h"
 
-
 template void run_grouped_backward_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_has_bias_has_biasgrad_no_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_has_bias_has_biasgrad_no_dropout_maxk_128.cpp
index c319c597a..e028d1bee 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_has_bias_has_biasgrad_no_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_has_bias_has_biasgrad_no_dropout_maxk_128.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_grouped_backward.h"
 
-
 template void run_grouped_backward_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_has_bias_has_biasgrad_no_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_has_bias_has_biasgrad_no_dropout_maxk_256.cpp
index e3740d923..3c47d406b 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_has_bias_has_biasgrad_no_dropout_maxk_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_has_bias_has_biasgrad_no_dropout_maxk_256.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_grouped_backward.h"
 
-
 template void run_grouped_backward_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_has_bias_has_biasgrad_no_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_has_bias_has_biasgrad_no_dropout_maxk_32.cpp
index e630b82b3..1651af366 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_has_bias_has_biasgrad_no_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_has_bias_has_biasgrad_no_dropout_maxk_32.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_grouped_backward.h"
 
-
 template void run_grouped_backward_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_has_bias_has_biasgrad_no_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_has_bias_has_biasgrad_no_dropout_maxk_64.cpp
index adaf82000..28fcbfad6 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_has_bias_has_biasgrad_no_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_has_bias_has_biasgrad_no_dropout_maxk_64.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_grouped_backward.h"
 
-
 template void run_grouped_backward_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_has_bias_no_biasgrad_has_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_has_bias_no_biasgrad_has_dropout_maxk_128.cpp
index ac94963de..34b227fad 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_has_bias_no_biasgrad_has_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_has_bias_no_biasgrad_has_dropout_maxk_128.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_grouped_backward.h"
 
-
 template void run_grouped_backward_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_has_bias_no_biasgrad_has_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_has_bias_no_biasgrad_has_dropout_maxk_256.cpp
index 39d892476..ccd459e84 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_has_bias_no_biasgrad_has_dropout_maxk_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_has_bias_no_biasgrad_has_dropout_maxk_256.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_grouped_backward.h"
 
-
 template void run_grouped_backward_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_has_bias_no_biasgrad_has_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_has_bias_no_biasgrad_has_dropout_maxk_32.cpp
index 508db91ec..20033dee2 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_has_bias_no_biasgrad_has_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_has_bias_no_biasgrad_has_dropout_maxk_32.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_grouped_backward.h"
 
-
 template void run_grouped_backward_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_has_bias_no_biasgrad_has_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_has_bias_no_biasgrad_has_dropout_maxk_64.cpp
index b83f716fd..c9dece923 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_has_bias_no_biasgrad_has_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_has_bias_no_biasgrad_has_dropout_maxk_64.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_grouped_backward.h"
 
-
 template void run_grouped_backward_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_has_bias_no_biasgrad_no_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_has_bias_no_biasgrad_no_dropout_maxk_128.cpp
index 864c54707..3b71014f6 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_has_bias_no_biasgrad_no_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_has_bias_no_biasgrad_no_dropout_maxk_128.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_grouped_backward.h"
 
-
 template void run_grouped_backward_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_has_bias_no_biasgrad_no_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_has_bias_no_biasgrad_no_dropout_maxk_256.cpp
index b3c02ddb1..09ac8a84e 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_has_bias_no_biasgrad_no_dropout_maxk_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_has_bias_no_biasgrad_no_dropout_maxk_256.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_grouped_backward.h"
 
-
 template void run_grouped_backward_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_has_bias_no_biasgrad_no_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_has_bias_no_biasgrad_no_dropout_maxk_32.cpp
index dd433cf6b..62df2f2dd 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_has_bias_no_biasgrad_no_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_has_bias_no_biasgrad_no_dropout_maxk_32.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_grouped_backward.h"
 
-
 template void run_grouped_backward_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_has_bias_no_biasgrad_no_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_has_bias_no_biasgrad_no_dropout_maxk_64.cpp
index 2b8bbd000..07514352b 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_has_bias_no_biasgrad_no_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_has_bias_no_biasgrad_no_dropout_maxk_64.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_grouped_backward.h"
 
-
 template void run_grouped_backward_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_no_bias_no_biasgrad_has_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_no_bias_no_biasgrad_has_dropout_maxk_128.cpp
index c23499359..c0d222f05 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_no_bias_no_biasgrad_has_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_no_bias_no_biasgrad_has_dropout_maxk_128.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_grouped_backward.h"
 
-
 template void run_grouped_backward_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_no_bias_no_biasgrad_has_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_no_bias_no_biasgrad_has_dropout_maxk_256.cpp
index 3e7281c9f..8d32e0b35 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_no_bias_no_biasgrad_has_dropout_maxk_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_no_bias_no_biasgrad_has_dropout_maxk_256.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_grouped_backward.h"
 
-
 template void run_grouped_backward_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_no_bias_no_biasgrad_has_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_no_bias_no_biasgrad_has_dropout_maxk_32.cpp
index f2bcef822..fe11f7f00 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_no_bias_no_biasgrad_has_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_no_bias_no_biasgrad_has_dropout_maxk_32.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_grouped_backward.h"
 
-
 template void run_grouped_backward_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_no_bias_no_biasgrad_has_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_no_bias_no_biasgrad_has_dropout_maxk_64.cpp
index 2c17644ac..45ba2ddd3 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_no_bias_no_biasgrad_has_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_no_bias_no_biasgrad_has_dropout_maxk_64.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_grouped_backward.h"
 
-
 template void run_grouped_backward_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_no_bias_no_biasgrad_no_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_no_bias_no_biasgrad_no_dropout_maxk_128.cpp
index fa7b75bad..e8e20cb4d 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_no_bias_no_biasgrad_no_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_no_bias_no_biasgrad_no_dropout_maxk_128.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_grouped_backward.h"
 
-
 template void run_grouped_backward_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_no_bias_no_biasgrad_no_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_no_bias_no_biasgrad_no_dropout_maxk_256.cpp
index 8b8d3e18c..81668563e 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_no_bias_no_biasgrad_no_dropout_maxk_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_no_bias_no_biasgrad_no_dropout_maxk_256.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_grouped_backward.h"
 
-
 template void run_grouped_backward_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_no_bias_no_biasgrad_no_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_no_bias_no_biasgrad_no_dropout_maxk_32.cpp
index e4f6da1fd..1961a1a29 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_no_bias_no_biasgrad_no_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_no_bias_no_biasgrad_no_dropout_maxk_32.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_grouped_backward.h"
 
-
 template void run_grouped_backward_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_no_bias_no_biasgrad_no_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_no_bias_no_biasgrad_no_dropout_maxk_64.cpp
index 03ce989bd..ba07be603 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_no_bias_no_biasgrad_no_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_has_causalmask_no_bias_no_biasgrad_no_dropout_maxk_64.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_grouped_backward.h"
 
-
 template void run_grouped_backward_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_has_bias_has_biasgrad_has_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_has_bias_has_biasgrad_has_dropout_maxk_128.cpp
index dc4d9bce7..15e2f31d8 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_has_bias_has_biasgrad_has_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_has_bias_has_biasgrad_has_dropout_maxk_128.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_grouped_backward.h"
 
-
 template void run_grouped_backward_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_has_bias_has_biasgrad_has_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_has_bias_has_biasgrad_has_dropout_maxk_256.cpp
index 3197e15f4..00effd83c 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_has_bias_has_biasgrad_has_dropout_maxk_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_has_bias_has_biasgrad_has_dropout_maxk_256.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_grouped_backward.h"
 
-
 template void run_grouped_backward_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_has_bias_has_biasgrad_has_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_has_bias_has_biasgrad_has_dropout_maxk_32.cpp
index 7707a22ba..de4030074 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_has_bias_has_biasgrad_has_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_has_bias_has_biasgrad_has_dropout_maxk_32.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_grouped_backward.h"
 
-
 template void run_grouped_backward_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_has_bias_has_biasgrad_has_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_has_bias_has_biasgrad_has_dropout_maxk_64.cpp
index ec91dbaff..756c1dc18 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_has_bias_has_biasgrad_has_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_has_bias_has_biasgrad_has_dropout_maxk_64.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_grouped_backward.h"
 
-
 template void run_grouped_backward_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_has_bias_has_biasgrad_no_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_has_bias_has_biasgrad_no_dropout_maxk_128.cpp
index 3d57e18f1..7c5978f3f 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_has_bias_has_biasgrad_no_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_has_bias_has_biasgrad_no_dropout_maxk_128.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_grouped_backward.h"
 
-
 template void run_grouped_backward_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_has_bias_has_biasgrad_no_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_has_bias_has_biasgrad_no_dropout_maxk_256.cpp
index c851179fe..1dd5dfa0f 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_has_bias_has_biasgrad_no_dropout_maxk_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_has_bias_has_biasgrad_no_dropout_maxk_256.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_grouped_backward.h"
 
-
 template void run_grouped_backward_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_has_bias_has_biasgrad_no_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_has_bias_has_biasgrad_no_dropout_maxk_32.cpp
index 3e0b2cefa..69ebd5833 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_has_bias_has_biasgrad_no_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_has_bias_has_biasgrad_no_dropout_maxk_32.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_grouped_backward.h"
 
-
 template void run_grouped_backward_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_has_bias_has_biasgrad_no_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_has_bias_has_biasgrad_no_dropout_maxk_64.cpp
index 6630c3d74..3218e1606 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_has_bias_has_biasgrad_no_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_has_bias_has_biasgrad_no_dropout_maxk_64.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_grouped_backward.h"
 
-
 template void run_grouped_backward_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_has_bias_no_biasgrad_has_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_has_bias_no_biasgrad_has_dropout_maxk_128.cpp
index 18683ea06..831e8b9ac 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_has_bias_no_biasgrad_has_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_has_bias_no_biasgrad_has_dropout_maxk_128.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_grouped_backward.h"
 
-
 template void run_grouped_backward_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_has_bias_no_biasgrad_has_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_has_bias_no_biasgrad_has_dropout_maxk_256.cpp
index cf38ccdd0..d7aeb937f 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_has_bias_no_biasgrad_has_dropout_maxk_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_has_bias_no_biasgrad_has_dropout_maxk_256.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_grouped_backward.h"
 
-
 template void run_grouped_backward_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_has_bias_no_biasgrad_has_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_has_bias_no_biasgrad_has_dropout_maxk_32.cpp
index 67e7fc14c..2659f809d 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_has_bias_no_biasgrad_has_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_has_bias_no_biasgrad_has_dropout_maxk_32.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_grouped_backward.h"
 
-
 template void run_grouped_backward_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_has_bias_no_biasgrad_has_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_has_bias_no_biasgrad_has_dropout_maxk_64.cpp
index e4cb050b1..466834030 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_has_bias_no_biasgrad_has_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_has_bias_no_biasgrad_has_dropout_maxk_64.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_grouped_backward.h"
 
-
 template void run_grouped_backward_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_has_bias_no_biasgrad_no_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_has_bias_no_biasgrad_no_dropout_maxk_128.cpp
index a6f62c5ec..dc7f41755 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_has_bias_no_biasgrad_no_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_has_bias_no_biasgrad_no_dropout_maxk_128.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_grouped_backward.h"
 
-
 template void run_grouped_backward_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_has_bias_no_biasgrad_no_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_has_bias_no_biasgrad_no_dropout_maxk_256.cpp
index faf27d95b..8d1366511 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_has_bias_no_biasgrad_no_dropout_maxk_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_has_bias_no_biasgrad_no_dropout_maxk_256.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_grouped_backward.h"
 
-
 template void run_grouped_backward_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_has_bias_no_biasgrad_no_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_has_bias_no_biasgrad_no_dropout_maxk_32.cpp
index e7552bea0..07e60021b 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_has_bias_no_biasgrad_no_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_has_bias_no_biasgrad_no_dropout_maxk_32.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_grouped_backward.h"
 
-
 template void run_grouped_backward_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_has_bias_no_biasgrad_no_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_has_bias_no_biasgrad_no_dropout_maxk_64.cpp
index 43e0658b3..d562c0384 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_has_bias_no_biasgrad_no_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_has_bias_no_biasgrad_no_dropout_maxk_64.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_grouped_backward.h"
 
-
 template void run_grouped_backward_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_no_bias_no_biasgrad_has_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_no_bias_no_biasgrad_has_dropout_maxk_128.cpp
index ff26b66be..3b38e48f6 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_no_bias_no_biasgrad_has_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_no_bias_no_biasgrad_has_dropout_maxk_128.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_grouped_backward.h"
 
-
 template void run_grouped_backward_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_no_bias_no_biasgrad_has_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_no_bias_no_biasgrad_has_dropout_maxk_256.cpp
index 76a5236c1..cc9c0e377 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_no_bias_no_biasgrad_has_dropout_maxk_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_no_bias_no_biasgrad_has_dropout_maxk_256.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_grouped_backward.h"
 
-
 template void run_grouped_backward_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_no_bias_no_biasgrad_has_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_no_bias_no_biasgrad_has_dropout_maxk_32.cpp
index cbb0cdf16..7237f3cab 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_no_bias_no_biasgrad_has_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_no_bias_no_biasgrad_has_dropout_maxk_32.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_grouped_backward.h"
 
-
 template void run_grouped_backward_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_no_bias_no_biasgrad_has_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_no_bias_no_biasgrad_has_dropout_maxk_64.cpp
index 7277f375a..7f7b87b46 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_no_bias_no_biasgrad_has_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_no_bias_no_biasgrad_has_dropout_maxk_64.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_grouped_backward.h"
 
-
 template void run_grouped_backward_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_no_bias_no_biasgrad_no_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_no_bias_no_biasgrad_no_dropout_maxk_128.cpp
index e1b1d55d6..fca2defab 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_no_bias_no_biasgrad_no_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_no_bias_no_biasgrad_no_dropout_maxk_128.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_grouped_backward.h"
 
-
 template void run_grouped_backward_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_no_bias_no_biasgrad_no_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_no_bias_no_biasgrad_no_dropout_maxk_256.cpp
index bff058814..247d2933f 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_no_bias_no_biasgrad_no_dropout_maxk_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_no_bias_no_biasgrad_no_dropout_maxk_256.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_grouped_backward.h"
 
-
 template void run_grouped_backward_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_no_bias_no_biasgrad_no_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_no_bias_no_biasgrad_no_dropout_maxk_32.cpp
index 9d0eb19ae..952d91a05 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_no_bias_no_biasgrad_no_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_no_bias_no_biasgrad_no_dropout_maxk_32.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_grouped_backward.h"
 
-
 template void run_grouped_backward_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_no_bias_no_biasgrad_no_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_no_bias_no_biasgrad_no_dropout_maxk_64.cpp
index 80e3e5d31..df612447f 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_no_bias_no_biasgrad_no_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_backward_fp16_no_causalmask_no_bias_no_biasgrad_no_dropout_maxk_64.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_grouped_backward.h"
 
-
 template void run_grouped_backward_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bf16_has_causalmask_has_bias_has_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bf16_has_causalmask_has_bias_has_dropout_maxk_128.cpp
index 8d3f1699a..436b35249 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bf16_has_causalmask_has_bias_has_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bf16_has_causalmask_has_bias_has_dropout_maxk_128.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_grouped_forward.h"
 
-
 template void run_grouped_forward_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bf16_has_causalmask_has_bias_has_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bf16_has_causalmask_has_bias_has_dropout_maxk_256.cpp
index 872b8feb9..673ace243 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bf16_has_causalmask_has_bias_has_dropout_maxk_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bf16_has_causalmask_has_bias_has_dropout_maxk_256.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_grouped_forward.h"
 
-
 template void run_grouped_forward_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bf16_has_causalmask_has_bias_has_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bf16_has_causalmask_has_bias_has_dropout_maxk_32.cpp
index e7e556194..12f2dce03 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bf16_has_causalmask_has_bias_has_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bf16_has_causalmask_has_bias_has_dropout_maxk_32.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_grouped_forward.h"
 
-
 template void run_grouped_forward_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bf16_has_causalmask_has_bias_has_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bf16_has_causalmask_has_bias_has_dropout_maxk_64.cpp
index fad634dd7..b05db1117 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bf16_has_causalmask_has_bias_has_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bf16_has_causalmask_has_bias_has_dropout_maxk_64.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_grouped_forward.h"
 
-
 template void run_grouped_forward_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bf16_has_causalmask_has_bias_no_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bf16_has_causalmask_has_bias_no_dropout_maxk_128.cpp
index 1cee53160..ac8a014bc 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bf16_has_causalmask_has_bias_no_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bf16_has_causalmask_has_bias_no_dropout_maxk_128.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_grouped_forward.h"
 
-
 template void run_grouped_forward_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bf16_has_causalmask_has_bias_no_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bf16_has_causalmask_has_bias_no_dropout_maxk_256.cpp
index b11085627..2bb41cd3b 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bf16_has_causalmask_has_bias_no_dropout_maxk_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bf16_has_causalmask_has_bias_no_dropout_maxk_256.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_grouped_forward.h"
 
-
 template void run_grouped_forward_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bf16_has_causalmask_has_bias_no_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bf16_has_causalmask_has_bias_no_dropout_maxk_32.cpp
index 78f288862..8c17a20b7 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bf16_has_causalmask_has_bias_no_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bf16_has_causalmask_has_bias_no_dropout_maxk_32.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_grouped_forward.h"
 
-
 template void run_grouped_forward_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bf16_has_causalmask_has_bias_no_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bf16_has_causalmask_has_bias_no_dropout_maxk_64.cpp
index 14a9250aa..58357d0f8 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bf16_has_causalmask_has_bias_no_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bf16_has_causalmask_has_bias_no_dropout_maxk_64.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_grouped_forward.h"
 
-
 template void run_grouped_forward_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bf16_has_causalmask_no_bias_has_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bf16_has_causalmask_no_bias_has_dropout_maxk_128.cpp
index ea0d4e867..6b03e2ffd 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bf16_has_causalmask_no_bias_has_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bf16_has_causalmask_no_bias_has_dropout_maxk_128.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_grouped_forward.h"
 
-
 template void run_grouped_forward_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bf16_has_causalmask_no_bias_has_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bf16_has_causalmask_no_bias_has_dropout_maxk_256.cpp
index 3eae57ea0..b98a212b3 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bf16_has_causalmask_no_bias_has_dropout_maxk_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bf16_has_causalmask_no_bias_has_dropout_maxk_256.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_grouped_forward.h"
 
-
 template void run_grouped_forward_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bf16_has_causalmask_no_bias_has_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bf16_has_causalmask_no_bias_has_dropout_maxk_32.cpp
index de9de2f4b..ba57b065d 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bf16_has_causalmask_no_bias_has_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bf16_has_causalmask_no_bias_has_dropout_maxk_32.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_grouped_forward.h"
 
-
 template void run_grouped_forward_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bf16_has_causalmask_no_bias_has_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bf16_has_causalmask_no_bias_has_dropout_maxk_64.cpp
index f0309768d..6b5463311 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bf16_has_causalmask_no_bias_has_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bf16_has_causalmask_no_bias_has_dropout_maxk_64.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_grouped_forward.h"
 
-
 template void run_grouped_forward_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bf16_has_causalmask_no_bias_no_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bf16_has_causalmask_no_bias_no_dropout_maxk_128.cpp
index 716e34fe4..c1b145ccd 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bf16_has_causalmask_no_bias_no_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bf16_has_causalmask_no_bias_no_dropout_maxk_128.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_grouped_forward.h"
 
-
 template void run_grouped_forward_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bf16_has_causalmask_no_bias_no_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bf16_has_causalmask_no_bias_no_dropout_maxk_256.cpp
index f4982d3b6..ea2ee5082 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bf16_has_causalmask_no_bias_no_dropout_maxk_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bf16_has_causalmask_no_bias_no_dropout_maxk_256.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_grouped_forward.h"
 
-
 template void run_grouped_forward_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bf16_has_causalmask_no_bias_no_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bf16_has_causalmask_no_bias_no_dropout_maxk_32.cpp
index f8bb2bf07..2b9b0559f 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bf16_has_causalmask_no_bias_no_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bf16_has_causalmask_no_bias_no_dropout_maxk_32.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_grouped_forward.h"
 
-
 template void run_grouped_forward_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bf16_has_causalmask_no_bias_no_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bf16_has_causalmask_no_bias_no_dropout_maxk_64.cpp
index ba9874ee7..6bad209f7 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bf16_has_causalmask_no_bias_no_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bf16_has_causalmask_no_bias_no_dropout_maxk_64.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_grouped_forward.h"
 
-
 template void run_grouped_forward_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bf16_no_causalmask_has_bias_has_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bf16_no_causalmask_has_bias_has_dropout_maxk_128.cpp
index 0f9de6935..222d1ed50 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bf16_no_causalmask_has_bias_has_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bf16_no_causalmask_has_bias_has_dropout_maxk_128.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_grouped_forward.h"
 
-
 template void run_grouped_forward_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bf16_no_causalmask_has_bias_has_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bf16_no_causalmask_has_bias_has_dropout_maxk_256.cpp
index 74ac7d90c..bcad83e85 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bf16_no_causalmask_has_bias_has_dropout_maxk_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bf16_no_causalmask_has_bias_has_dropout_maxk_256.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_grouped_forward.h"
 
-
 template void run_grouped_forward_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bf16_no_causalmask_has_bias_has_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bf16_no_causalmask_has_bias_has_dropout_maxk_32.cpp
index dfd68d087..249011ee1 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bf16_no_causalmask_has_bias_has_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bf16_no_causalmask_has_bias_has_dropout_maxk_32.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_grouped_forward.h"
 
-
 template void run_grouped_forward_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bf16_no_causalmask_has_bias_has_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bf16_no_causalmask_has_bias_has_dropout_maxk_64.cpp
index 0d83cb462..15ac9062f 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bf16_no_causalmask_has_bias_has_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bf16_no_causalmask_has_bias_has_dropout_maxk_64.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_grouped_forward.h"
 
-
 template void run_grouped_forward_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bf16_no_causalmask_has_bias_no_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bf16_no_causalmask_has_bias_no_dropout_maxk_128.cpp
index 008d2e68f..4b833c8f8 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bf16_no_causalmask_has_bias_no_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bf16_no_causalmask_has_bias_no_dropout_maxk_128.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_grouped_forward.h"
 
-
 template void run_grouped_forward_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bf16_no_causalmask_has_bias_no_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bf16_no_causalmask_has_bias_no_dropout_maxk_256.cpp
index 254abd1fa..3e07c1050 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bf16_no_causalmask_has_bias_no_dropout_maxk_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bf16_no_causalmask_has_bias_no_dropout_maxk_256.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_grouped_forward.h"
 
-
 template void run_grouped_forward_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bf16_no_causalmask_has_bias_no_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bf16_no_causalmask_has_bias_no_dropout_maxk_32.cpp
index 38b336e01..276962324 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bf16_no_causalmask_has_bias_no_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bf16_no_causalmask_has_bias_no_dropout_maxk_32.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_grouped_forward.h"
 
-
 template void run_grouped_forward_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bf16_no_causalmask_has_bias_no_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bf16_no_causalmask_has_bias_no_dropout_maxk_64.cpp
index efc6e40de..f43d7b41c 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bf16_no_causalmask_has_bias_no_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bf16_no_causalmask_has_bias_no_dropout_maxk_64.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_grouped_forward.h"
 
-
 template void run_grouped_forward_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bf16_no_causalmask_no_bias_has_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bf16_no_causalmask_no_bias_has_dropout_maxk_128.cpp
index 49924fbf2..1da0732d8 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bf16_no_causalmask_no_bias_has_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bf16_no_causalmask_no_bias_has_dropout_maxk_128.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_grouped_forward.h"
 
-
 template void run_grouped_forward_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bf16_no_causalmask_no_bias_has_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bf16_no_causalmask_no_bias_has_dropout_maxk_256.cpp
index ef83ee445..4891094bc 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bf16_no_causalmask_no_bias_has_dropout_maxk_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bf16_no_causalmask_no_bias_has_dropout_maxk_256.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_grouped_forward.h"
 
-
 template void run_grouped_forward_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bf16_no_causalmask_no_bias_has_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bf16_no_causalmask_no_bias_has_dropout_maxk_32.cpp
index 535f3877a..d20de70d8 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bf16_no_causalmask_no_bias_has_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bf16_no_causalmask_no_bias_has_dropout_maxk_32.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_grouped_forward.h"
 
-
 template void run_grouped_forward_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bf16_no_causalmask_no_bias_has_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bf16_no_causalmask_no_bias_has_dropout_maxk_64.cpp
index a89bc6bb4..2e552a997 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bf16_no_causalmask_no_bias_has_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bf16_no_causalmask_no_bias_has_dropout_maxk_64.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_grouped_forward.h"
 
-
 template void run_grouped_forward_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bf16_no_causalmask_no_bias_no_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bf16_no_causalmask_no_bias_no_dropout_maxk_128.cpp
index 1276d65a6..85f9097f5 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bf16_no_causalmask_no_bias_no_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bf16_no_causalmask_no_bias_no_dropout_maxk_128.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_grouped_forward.h"
 
-
 template void run_grouped_forward_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bf16_no_causalmask_no_bias_no_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bf16_no_causalmask_no_bias_no_dropout_maxk_256.cpp
index 4a36334e4..456ae223a 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bf16_no_causalmask_no_bias_no_dropout_maxk_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bf16_no_causalmask_no_bias_no_dropout_maxk_256.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_grouped_forward.h"
 
-
 template void run_grouped_forward_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bf16_no_causalmask_no_bias_no_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bf16_no_causalmask_no_bias_no_dropout_maxk_32.cpp
index 3505c9a97..51cbbf71d 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bf16_no_causalmask_no_bias_no_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bf16_no_causalmask_no_bias_no_dropout_maxk_32.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_grouped_forward.h"
 
-
 template void run_grouped_forward_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bf16_no_causalmask_no_bias_no_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bf16_no_causalmask_no_bias_no_dropout_maxk_64.cpp
index 169fec04a..0614b84a2 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bf16_no_causalmask_no_bias_no_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_bf16_no_causalmask_no_bias_no_dropout_maxk_64.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_grouped_forward.h"
 
-
 template void run_grouped_forward_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_has_causalmask_has_bias_has_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_has_causalmask_has_bias_has_dropout_maxk_128.cpp
index ce25186d3..6db568b7c 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_has_causalmask_has_bias_has_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_has_causalmask_has_bias_has_dropout_maxk_128.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_grouped_forward.h"
 
-
 template void run_grouped_forward_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_has_causalmask_has_bias_has_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_has_causalmask_has_bias_has_dropout_maxk_256.cpp
index f9633bbfd..7c14a9f97 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_has_causalmask_has_bias_has_dropout_maxk_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_has_causalmask_has_bias_has_dropout_maxk_256.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_grouped_forward.h"
 
-
 template void run_grouped_forward_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_has_causalmask_has_bias_has_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_has_causalmask_has_bias_has_dropout_maxk_32.cpp
index e5292f882..3ad15a89c 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_has_causalmask_has_bias_has_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_has_causalmask_has_bias_has_dropout_maxk_32.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_grouped_forward.h"
 
-
 template void run_grouped_forward_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_has_causalmask_has_bias_has_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_has_causalmask_has_bias_has_dropout_maxk_64.cpp
index aa89d62e8..a0431622e 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_has_causalmask_has_bias_has_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_has_causalmask_has_bias_has_dropout_maxk_64.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_grouped_forward.h"
 
-
 template void run_grouped_forward_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_has_causalmask_has_bias_no_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_has_causalmask_has_bias_no_dropout_maxk_128.cpp
index c34d945e0..3c5f652c7 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_has_causalmask_has_bias_no_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_has_causalmask_has_bias_no_dropout_maxk_128.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_grouped_forward.h"
 
-
 template void run_grouped_forward_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_has_causalmask_has_bias_no_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_has_causalmask_has_bias_no_dropout_maxk_256.cpp
index 67690c1e5..562298f72 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_has_causalmask_has_bias_no_dropout_maxk_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_has_causalmask_has_bias_no_dropout_maxk_256.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_grouped_forward.h"
 
-
 template void run_grouped_forward_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_has_causalmask_has_bias_no_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_has_causalmask_has_bias_no_dropout_maxk_32.cpp
index d332e50ea..9daf7f6c6 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_has_causalmask_has_bias_no_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_has_causalmask_has_bias_no_dropout_maxk_32.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_grouped_forward.h"
 
-
 template void run_grouped_forward_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_has_causalmask_has_bias_no_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_has_causalmask_has_bias_no_dropout_maxk_64.cpp
index 6c9735dd1..1f3b70c84 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_has_causalmask_has_bias_no_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_has_causalmask_has_bias_no_dropout_maxk_64.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_grouped_forward.h"
 
-
 template void run_grouped_forward_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_has_causalmask_no_bias_has_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_has_causalmask_no_bias_has_dropout_maxk_128.cpp
index 9b0e515e5..1ce708426 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_has_causalmask_no_bias_has_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_has_causalmask_no_bias_has_dropout_maxk_128.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_grouped_forward.h"
 
-
 template void run_grouped_forward_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_has_causalmask_no_bias_has_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_has_causalmask_no_bias_has_dropout_maxk_256.cpp
index 8a6aac9d4..f765d967b 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_has_causalmask_no_bias_has_dropout_maxk_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_has_causalmask_no_bias_has_dropout_maxk_256.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_grouped_forward.h"
 
-
 template void run_grouped_forward_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_has_causalmask_no_bias_has_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_has_causalmask_no_bias_has_dropout_maxk_32.cpp
index 91d7974f7..65a976a9a 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_has_causalmask_no_bias_has_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_has_causalmask_no_bias_has_dropout_maxk_32.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_grouped_forward.h"
 
-
 template void run_grouped_forward_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_has_causalmask_no_bias_has_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_has_causalmask_no_bias_has_dropout_maxk_64.cpp
index ac69a855e..30b56e1b1 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_has_causalmask_no_bias_has_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_has_causalmask_no_bias_has_dropout_maxk_64.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_grouped_forward.h"
 
-
 template void run_grouped_forward_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_has_causalmask_no_bias_no_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_has_causalmask_no_bias_no_dropout_maxk_128.cpp
index 938d8a2ef..22ece8289 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_has_causalmask_no_bias_no_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_has_causalmask_no_bias_no_dropout_maxk_128.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_grouped_forward.h"
 
-
 template void run_grouped_forward_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_has_causalmask_no_bias_no_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_has_causalmask_no_bias_no_dropout_maxk_256.cpp
index 9f3432708..d5a7778e5 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_has_causalmask_no_bias_no_dropout_maxk_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_has_causalmask_no_bias_no_dropout_maxk_256.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_grouped_forward.h"
 
-
 template void run_grouped_forward_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_has_causalmask_no_bias_no_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_has_causalmask_no_bias_no_dropout_maxk_32.cpp
index 1f5470478..bc5553560 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_has_causalmask_no_bias_no_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_has_causalmask_no_bias_no_dropout_maxk_32.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_grouped_forward.h"
 
-
 template void run_grouped_forward_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_has_causalmask_no_bias_no_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_has_causalmask_no_bias_no_dropout_maxk_64.cpp
index 8f30d330e..4b74c49ef 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_has_causalmask_no_bias_no_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_has_causalmask_no_bias_no_dropout_maxk_64.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_grouped_forward.h"
 
-
 template void run_grouped_forward_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_no_causalmask_has_bias_has_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_no_causalmask_has_bias_has_dropout_maxk_128.cpp
index 65fc8ffe9..b0918f683 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_no_causalmask_has_bias_has_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_no_causalmask_has_bias_has_dropout_maxk_128.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_grouped_forward.h"
 
-
 template void run_grouped_forward_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_no_causalmask_has_bias_has_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_no_causalmask_has_bias_has_dropout_maxk_256.cpp
index 35b9221d9..432cdd978 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_no_causalmask_has_bias_has_dropout_maxk_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_no_causalmask_has_bias_has_dropout_maxk_256.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_grouped_forward.h"
 
-
 template void run_grouped_forward_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_no_causalmask_has_bias_has_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_no_causalmask_has_bias_has_dropout_maxk_32.cpp
index 9c598402e..b7f09b7c3 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_no_causalmask_has_bias_has_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_no_causalmask_has_bias_has_dropout_maxk_32.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_grouped_forward.h"
 
-
 template void run_grouped_forward_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_no_causalmask_has_bias_has_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_no_causalmask_has_bias_has_dropout_maxk_64.cpp
index 08ae9091b..8c6ad2498 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_no_causalmask_has_bias_has_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_no_causalmask_has_bias_has_dropout_maxk_64.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_grouped_forward.h"
 
-
 template void run_grouped_forward_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_no_causalmask_has_bias_no_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_no_causalmask_has_bias_no_dropout_maxk_128.cpp
index 6c295a8f9..2b747e5e2 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_no_causalmask_has_bias_no_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_no_causalmask_has_bias_no_dropout_maxk_128.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_grouped_forward.h"
 
-
 template void run_grouped_forward_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_no_causalmask_has_bias_no_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_no_causalmask_has_bias_no_dropout_maxk_256.cpp
index f1345945f..0d7c558cd 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_no_causalmask_has_bias_no_dropout_maxk_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_no_causalmask_has_bias_no_dropout_maxk_256.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_grouped_forward.h"
 
-
 template void run_grouped_forward_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_no_causalmask_has_bias_no_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_no_causalmask_has_bias_no_dropout_maxk_32.cpp
index 6c212f9ad..3efca3798 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_no_causalmask_has_bias_no_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_no_causalmask_has_bias_no_dropout_maxk_32.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_grouped_forward.h"
 
-
 template void run_grouped_forward_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_no_causalmask_has_bias_no_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_no_causalmask_has_bias_no_dropout_maxk_64.cpp
index d934dad1b..dae892ab7 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_no_causalmask_has_bias_no_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_no_causalmask_has_bias_no_dropout_maxk_64.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_grouped_forward.h"
 
-
 template void run_grouped_forward_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_no_causalmask_no_bias_has_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_no_causalmask_no_bias_has_dropout_maxk_128.cpp
index 36e76fb54..d2020485e 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_no_causalmask_no_bias_has_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_no_causalmask_no_bias_has_dropout_maxk_128.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_grouped_forward.h"
 
-
 template void run_grouped_forward_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_no_causalmask_no_bias_has_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_no_causalmask_no_bias_has_dropout_maxk_256.cpp
index 56ada7742..a29929b80 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_no_causalmask_no_bias_has_dropout_maxk_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_no_causalmask_no_bias_has_dropout_maxk_256.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_grouped_forward.h"
 
-
 template void run_grouped_forward_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_no_causalmask_no_bias_has_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_no_causalmask_no_bias_has_dropout_maxk_32.cpp
index fdc02134c..d5f3cdffe 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_no_causalmask_no_bias_has_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_no_causalmask_no_bias_has_dropout_maxk_32.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_grouped_forward.h"
 
-
 template void run_grouped_forward_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_no_causalmask_no_bias_has_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_no_causalmask_no_bias_has_dropout_maxk_64.cpp
index c38442bc5..6a7482d69 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_no_causalmask_no_bias_has_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_no_causalmask_no_bias_has_dropout_maxk_64.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_grouped_forward.h"
 
-
 template void run_grouped_forward_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_no_causalmask_no_bias_no_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_no_causalmask_no_bias_no_dropout_maxk_128.cpp
index c31359a77..fc5604b5e 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_no_causalmask_no_bias_no_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_no_causalmask_no_bias_no_dropout_maxk_128.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_grouped_forward.h"
 
-
 template void run_grouped_forward_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_no_causalmask_no_bias_no_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_no_causalmask_no_bias_no_dropout_maxk_256.cpp
index b57f76adb..f8741ae4f 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_no_causalmask_no_bias_no_dropout_maxk_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_no_causalmask_no_bias_no_dropout_maxk_256.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_grouped_forward.h"
 
-
 template void run_grouped_forward_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_no_causalmask_no_bias_no_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_no_causalmask_no_bias_no_dropout_maxk_32.cpp
index 377af2368..8c4e8581b 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_no_causalmask_no_bias_no_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_no_causalmask_no_bias_no_dropout_maxk_32.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_grouped_forward.h"
 
-
 template void run_grouped_forward_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_no_causalmask_no_bias_no_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_no_causalmask_no_bias_no_dropout_maxk_64.cpp
index ab938eea3..b29ac4d4f 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_no_causalmask_no_bias_no_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_forward_fp16_no_causalmask_no_bias_no_dropout_maxk_64.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_grouped_forward.h"
 
-
 template void run_grouped_forward_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bf16_has_causalmask_has_bias_has_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bf16_has_causalmask_has_bias_has_dropout_maxk_128.cpp
index 04f8ae899..52e1d5d71 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bf16_has_causalmask_has_bias_has_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bf16_has_causalmask_has_bias_has_dropout_maxk_128.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_grouped_infer.h"
 
-
 template void run_grouped_infer_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bf16_has_causalmask_has_bias_has_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bf16_has_causalmask_has_bias_has_dropout_maxk_256.cpp
index 3655443b7..055b769f9 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bf16_has_causalmask_has_bias_has_dropout_maxk_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bf16_has_causalmask_has_bias_has_dropout_maxk_256.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_grouped_infer.h"
 
-
 template void run_grouped_infer_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bf16_has_causalmask_has_bias_has_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bf16_has_causalmask_has_bias_has_dropout_maxk_32.cpp
index 6a2a642e7..9ce3756a6 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bf16_has_causalmask_has_bias_has_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bf16_has_causalmask_has_bias_has_dropout_maxk_32.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_grouped_infer.h"
 
-
 template void run_grouped_infer_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bf16_has_causalmask_has_bias_has_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bf16_has_causalmask_has_bias_has_dropout_maxk_64.cpp
index 5974a2212..46d4e69b7 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bf16_has_causalmask_has_bias_has_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bf16_has_causalmask_has_bias_has_dropout_maxk_64.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_grouped_infer.h"
 
-
 template void run_grouped_infer_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bf16_has_causalmask_has_bias_no_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bf16_has_causalmask_has_bias_no_dropout_maxk_128.cpp
index c84e495cf..5f11a042f 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bf16_has_causalmask_has_bias_no_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bf16_has_causalmask_has_bias_no_dropout_maxk_128.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_grouped_infer.h"
 
-
 template void run_grouped_infer_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bf16_has_causalmask_has_bias_no_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bf16_has_causalmask_has_bias_no_dropout_maxk_256.cpp
index ff6371c15..3134e1c4c 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bf16_has_causalmask_has_bias_no_dropout_maxk_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bf16_has_causalmask_has_bias_no_dropout_maxk_256.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_grouped_infer.h"
 
-
 template void run_grouped_infer_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bf16_has_causalmask_has_bias_no_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bf16_has_causalmask_has_bias_no_dropout_maxk_32.cpp
index 0cdc2d375..f858eccb5 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bf16_has_causalmask_has_bias_no_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bf16_has_causalmask_has_bias_no_dropout_maxk_32.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_grouped_infer.h"
 
-
 template void run_grouped_infer_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bf16_has_causalmask_has_bias_no_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bf16_has_causalmask_has_bias_no_dropout_maxk_64.cpp
index 0517654c3..5da3272f0 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bf16_has_causalmask_has_bias_no_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bf16_has_causalmask_has_bias_no_dropout_maxk_64.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_grouped_infer.h"
 
-
 template void run_grouped_infer_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bf16_has_causalmask_no_bias_has_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bf16_has_causalmask_no_bias_has_dropout_maxk_128.cpp
index 5dc1e3bab..ed632d7ea 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bf16_has_causalmask_no_bias_has_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bf16_has_causalmask_no_bias_has_dropout_maxk_128.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_grouped_infer.h"
 
-
 template void run_grouped_infer_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bf16_has_causalmask_no_bias_has_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bf16_has_causalmask_no_bias_has_dropout_maxk_256.cpp
index 66eaffcd5..d336cc52d 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bf16_has_causalmask_no_bias_has_dropout_maxk_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bf16_has_causalmask_no_bias_has_dropout_maxk_256.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_grouped_infer.h"
 
-
 template void run_grouped_infer_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bf16_has_causalmask_no_bias_has_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bf16_has_causalmask_no_bias_has_dropout_maxk_32.cpp
index 92af05353..7095195dd 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bf16_has_causalmask_no_bias_has_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bf16_has_causalmask_no_bias_has_dropout_maxk_32.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_grouped_infer.h"
 
-
 template void run_grouped_infer_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bf16_has_causalmask_no_bias_has_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bf16_has_causalmask_no_bias_has_dropout_maxk_64.cpp
index 2e385804d..312a64a29 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bf16_has_causalmask_no_bias_has_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bf16_has_causalmask_no_bias_has_dropout_maxk_64.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_grouped_infer.h"
 
-
 template void run_grouped_infer_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bf16_has_causalmask_no_bias_no_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bf16_has_causalmask_no_bias_no_dropout_maxk_128.cpp
index 98a64ebd6..5747867dc 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bf16_has_causalmask_no_bias_no_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bf16_has_causalmask_no_bias_no_dropout_maxk_128.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_grouped_infer.h"
 
-
 template void run_grouped_infer_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bf16_has_causalmask_no_bias_no_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bf16_has_causalmask_no_bias_no_dropout_maxk_256.cpp
index 427f2b4b6..f54dadca5 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bf16_has_causalmask_no_bias_no_dropout_maxk_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bf16_has_causalmask_no_bias_no_dropout_maxk_256.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_grouped_infer.h"
 
-
 template void run_grouped_infer_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bf16_has_causalmask_no_bias_no_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bf16_has_causalmask_no_bias_no_dropout_maxk_32.cpp
index 74a0ad136..a6b637a29 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bf16_has_causalmask_no_bias_no_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bf16_has_causalmask_no_bias_no_dropout_maxk_32.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_grouped_infer.h"
 
-
 template void run_grouped_infer_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bf16_has_causalmask_no_bias_no_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bf16_has_causalmask_no_bias_no_dropout_maxk_64.cpp
index b9b2f4c8e..47abe27d9 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bf16_has_causalmask_no_bias_no_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bf16_has_causalmask_no_bias_no_dropout_maxk_64.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_grouped_infer.h"
 
-
 template void run_grouped_infer_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bf16_no_causalmask_has_bias_has_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bf16_no_causalmask_has_bias_has_dropout_maxk_128.cpp
index f04438d2a..95eb7e0ed 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bf16_no_causalmask_has_bias_has_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bf16_no_causalmask_has_bias_has_dropout_maxk_128.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_grouped_infer.h"
 
-
 template void run_grouped_infer_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bf16_no_causalmask_has_bias_has_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bf16_no_causalmask_has_bias_has_dropout_maxk_256.cpp
index 62edc1a2e..e9c361bd0 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bf16_no_causalmask_has_bias_has_dropout_maxk_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bf16_no_causalmask_has_bias_has_dropout_maxk_256.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_grouped_infer.h"
 
-
 template void run_grouped_infer_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bf16_no_causalmask_has_bias_has_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bf16_no_causalmask_has_bias_has_dropout_maxk_32.cpp
index 34d6468ce..5530bb928 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bf16_no_causalmask_has_bias_has_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bf16_no_causalmask_has_bias_has_dropout_maxk_32.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_grouped_infer.h"
 
-
 template void run_grouped_infer_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bf16_no_causalmask_has_bias_has_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bf16_no_causalmask_has_bias_has_dropout_maxk_64.cpp
index c023c19de..0a5592615 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bf16_no_causalmask_has_bias_has_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bf16_no_causalmask_has_bias_has_dropout_maxk_64.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_grouped_infer.h"
 
-
 template void run_grouped_infer_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bf16_no_causalmask_has_bias_no_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bf16_no_causalmask_has_bias_no_dropout_maxk_128.cpp
index dc133776e..5949924e4 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bf16_no_causalmask_has_bias_no_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bf16_no_causalmask_has_bias_no_dropout_maxk_128.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_grouped_infer.h"
 
-
 template void run_grouped_infer_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bf16_no_causalmask_has_bias_no_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bf16_no_causalmask_has_bias_no_dropout_maxk_256.cpp
index 4a1db9bf0..4ed017906 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bf16_no_causalmask_has_bias_no_dropout_maxk_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bf16_no_causalmask_has_bias_no_dropout_maxk_256.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_grouped_infer.h"
 
-
 template void run_grouped_infer_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bf16_no_causalmask_has_bias_no_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bf16_no_causalmask_has_bias_no_dropout_maxk_32.cpp
index 9a8ace4a0..d5df90946 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bf16_no_causalmask_has_bias_no_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bf16_no_causalmask_has_bias_no_dropout_maxk_32.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_grouped_infer.h"
 
-
 template void run_grouped_infer_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bf16_no_causalmask_has_bias_no_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bf16_no_causalmask_has_bias_no_dropout_maxk_64.cpp
index e12cd3fff..8be8afd5e 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bf16_no_causalmask_has_bias_no_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bf16_no_causalmask_has_bias_no_dropout_maxk_64.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_grouped_infer.h"
 
-
 template void run_grouped_infer_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bf16_no_causalmask_no_bias_has_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bf16_no_causalmask_no_bias_has_dropout_maxk_128.cpp
index 171ed578a..441603639 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bf16_no_causalmask_no_bias_has_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bf16_no_causalmask_no_bias_has_dropout_maxk_128.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_grouped_infer.h"
 
-
 template void run_grouped_infer_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bf16_no_causalmask_no_bias_has_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bf16_no_causalmask_no_bias_has_dropout_maxk_256.cpp
index b442ee2da..39e2f9fed 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bf16_no_causalmask_no_bias_has_dropout_maxk_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bf16_no_causalmask_no_bias_has_dropout_maxk_256.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_grouped_infer.h"
 
-
 template void run_grouped_infer_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bf16_no_causalmask_no_bias_has_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bf16_no_causalmask_no_bias_has_dropout_maxk_32.cpp
index 9fb4d0631..6172df88a 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bf16_no_causalmask_no_bias_has_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bf16_no_causalmask_no_bias_has_dropout_maxk_32.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_grouped_infer.h"
 
-
 template void run_grouped_infer_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bf16_no_causalmask_no_bias_has_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bf16_no_causalmask_no_bias_has_dropout_maxk_64.cpp
index 71ee24859..41681f180 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bf16_no_causalmask_no_bias_has_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bf16_no_causalmask_no_bias_has_dropout_maxk_64.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_grouped_infer.h"
 
-
 template void run_grouped_infer_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bf16_no_causalmask_no_bias_no_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bf16_no_causalmask_no_bias_no_dropout_maxk_128.cpp
index 6f4707b35..98625d142 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bf16_no_causalmask_no_bias_no_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bf16_no_causalmask_no_bias_no_dropout_maxk_128.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_grouped_infer.h"
 
-
 template void run_grouped_infer_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bf16_no_causalmask_no_bias_no_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bf16_no_causalmask_no_bias_no_dropout_maxk_256.cpp
index 02bdcc483..9d3d73288 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bf16_no_causalmask_no_bias_no_dropout_maxk_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bf16_no_causalmask_no_bias_no_dropout_maxk_256.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_grouped_infer.h"
 
-
 template void run_grouped_infer_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bf16_no_causalmask_no_bias_no_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bf16_no_causalmask_no_bias_no_dropout_maxk_32.cpp
index c8f566446..bb537cfe2 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bf16_no_causalmask_no_bias_no_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bf16_no_causalmask_no_bias_no_dropout_maxk_32.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_grouped_infer.h"
 
-
 template void run_grouped_infer_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bf16_no_causalmask_no_bias_no_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bf16_no_causalmask_no_bias_no_dropout_maxk_64.cpp
index b55f1e153..66769f244 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bf16_no_causalmask_no_bias_no_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_bf16_no_causalmask_no_bias_no_dropout_maxk_64.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/bfloat16.hpp>
 #include "ck_tiled_fmha_grouped_infer.h"
 
-
 template void run_grouped_infer_causalmask_bias_dropout_dispatch<
     ck_tile::bf16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_has_causalmask_has_bias_has_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_has_causalmask_has_bias_has_dropout_maxk_128.cpp
index f911866e5..4c35127f9 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_has_causalmask_has_bias_has_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_has_causalmask_has_bias_has_dropout_maxk_128.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_grouped_infer.h"
 
-
 template void run_grouped_infer_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_has_causalmask_has_bias_has_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_has_causalmask_has_bias_has_dropout_maxk_256.cpp
index 887a47967..12a2a6105 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_has_causalmask_has_bias_has_dropout_maxk_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_has_causalmask_has_bias_has_dropout_maxk_256.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_grouped_infer.h"
 
-
 template void run_grouped_infer_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_has_causalmask_has_bias_has_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_has_causalmask_has_bias_has_dropout_maxk_32.cpp
index 3b3d764be..885584ef4 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_has_causalmask_has_bias_has_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_has_causalmask_has_bias_has_dropout_maxk_32.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_grouped_infer.h"
 
-
 template void run_grouped_infer_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_has_causalmask_has_bias_has_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_has_causalmask_has_bias_has_dropout_maxk_64.cpp
index dd2ea0a10..a11af5773 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_has_causalmask_has_bias_has_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_has_causalmask_has_bias_has_dropout_maxk_64.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_grouped_infer.h"
 
-
 template void run_grouped_infer_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_has_causalmask_has_bias_no_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_has_causalmask_has_bias_no_dropout_maxk_128.cpp
index a86b9a983..8d1f0fb7f 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_has_causalmask_has_bias_no_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_has_causalmask_has_bias_no_dropout_maxk_128.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_grouped_infer.h"
 
-
 template void run_grouped_infer_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_has_causalmask_has_bias_no_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_has_causalmask_has_bias_no_dropout_maxk_256.cpp
index 931d97d47..50577f7f9 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_has_causalmask_has_bias_no_dropout_maxk_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_has_causalmask_has_bias_no_dropout_maxk_256.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_grouped_infer.h"
 
-
 template void run_grouped_infer_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_has_causalmask_has_bias_no_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_has_causalmask_has_bias_no_dropout_maxk_32.cpp
index d7b05ee2e..07fcfd2eb 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_has_causalmask_has_bias_no_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_has_causalmask_has_bias_no_dropout_maxk_32.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_grouped_infer.h"
 
-
 template void run_grouped_infer_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_has_causalmask_has_bias_no_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_has_causalmask_has_bias_no_dropout_maxk_64.cpp
index ff4b486a0..dc3690344 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_has_causalmask_has_bias_no_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_has_causalmask_has_bias_no_dropout_maxk_64.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_grouped_infer.h"
 
-
 template void run_grouped_infer_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_has_causalmask_no_bias_has_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_has_causalmask_no_bias_has_dropout_maxk_128.cpp
index e614c7365..b3727732a 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_has_causalmask_no_bias_has_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_has_causalmask_no_bias_has_dropout_maxk_128.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_grouped_infer.h"
 
-
 template void run_grouped_infer_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_has_causalmask_no_bias_has_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_has_causalmask_no_bias_has_dropout_maxk_256.cpp
index 187935111..b8cb89622 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_has_causalmask_no_bias_has_dropout_maxk_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_has_causalmask_no_bias_has_dropout_maxk_256.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_grouped_infer.h"
 
-
 template void run_grouped_infer_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_has_causalmask_no_bias_has_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_has_causalmask_no_bias_has_dropout_maxk_32.cpp
index 1d2f32df2..a4c2cacf1 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_has_causalmask_no_bias_has_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_has_causalmask_no_bias_has_dropout_maxk_32.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_grouped_infer.h"
 
-
 template void run_grouped_infer_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_has_causalmask_no_bias_has_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_has_causalmask_no_bias_has_dropout_maxk_64.cpp
index fc33014b3..2b36d6f33 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_has_causalmask_no_bias_has_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_has_causalmask_no_bias_has_dropout_maxk_64.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_grouped_infer.h"
 
-
 template void run_grouped_infer_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_has_causalmask_no_bias_no_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_has_causalmask_no_bias_no_dropout_maxk_128.cpp
index 84a2d66ae..f3827c240 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_has_causalmask_no_bias_no_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_has_causalmask_no_bias_no_dropout_maxk_128.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_grouped_infer.h"
 
-
 template void run_grouped_infer_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_has_causalmask_no_bias_no_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_has_causalmask_no_bias_no_dropout_maxk_256.cpp
index c5ef23857..6627919bb 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_has_causalmask_no_bias_no_dropout_maxk_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_has_causalmask_no_bias_no_dropout_maxk_256.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_grouped_infer.h"
 
-
 template void run_grouped_infer_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_has_causalmask_no_bias_no_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_has_causalmask_no_bias_no_dropout_maxk_32.cpp
index d5d35804a..793fc5c90 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_has_causalmask_no_bias_no_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_has_causalmask_no_bias_no_dropout_maxk_32.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_grouped_infer.h"
 
-
 template void run_grouped_infer_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_has_causalmask_no_bias_no_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_has_causalmask_no_bias_no_dropout_maxk_64.cpp
index 31407a74f..2d50423e7 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_has_causalmask_no_bias_no_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_has_causalmask_no_bias_no_dropout_maxk_64.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_grouped_infer.h"
 
-
 template void run_grouped_infer_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     true,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_no_causalmask_has_bias_has_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_no_causalmask_has_bias_has_dropout_maxk_128.cpp
index 1537f93de..ffb1b36d6 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_no_causalmask_has_bias_has_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_no_causalmask_has_bias_has_dropout_maxk_128.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_grouped_infer.h"
 
-
 template void run_grouped_infer_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_no_causalmask_has_bias_has_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_no_causalmask_has_bias_has_dropout_maxk_256.cpp
index b3904f851..db5416d92 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_no_causalmask_has_bias_has_dropout_maxk_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_no_causalmask_has_bias_has_dropout_maxk_256.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_grouped_infer.h"
 
-
 template void run_grouped_infer_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_no_causalmask_has_bias_has_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_no_causalmask_has_bias_has_dropout_maxk_32.cpp
index bdd98997f..d5cce31a7 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_no_causalmask_has_bias_has_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_no_causalmask_has_bias_has_dropout_maxk_32.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_grouped_infer.h"
 
-
 template void run_grouped_infer_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_no_causalmask_has_bias_has_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_no_causalmask_has_bias_has_dropout_maxk_64.cpp
index 698d72e95..bb3ad0e57 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_no_causalmask_has_bias_has_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_no_causalmask_has_bias_has_dropout_maxk_64.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_grouped_infer.h"
 
-
 template void run_grouped_infer_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_no_causalmask_has_bias_no_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_no_causalmask_has_bias_no_dropout_maxk_128.cpp
index ad78bc332..2f6366584 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_no_causalmask_has_bias_no_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_no_causalmask_has_bias_no_dropout_maxk_128.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_grouped_infer.h"
 
-
 template void run_grouped_infer_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_no_causalmask_has_bias_no_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_no_causalmask_has_bias_no_dropout_maxk_256.cpp
index 55b72d8fb..aed425ba5 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_no_causalmask_has_bias_no_dropout_maxk_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_no_causalmask_has_bias_no_dropout_maxk_256.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_grouped_infer.h"
 
-
 template void run_grouped_infer_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_no_causalmask_has_bias_no_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_no_causalmask_has_bias_no_dropout_maxk_32.cpp
index e5d2cb44b..c3678b42f 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_no_causalmask_has_bias_no_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_no_causalmask_has_bias_no_dropout_maxk_32.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_grouped_infer.h"
 
-
 template void run_grouped_infer_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_no_causalmask_has_bias_no_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_no_causalmask_has_bias_no_dropout_maxk_64.cpp
index ee7d81328..7481a9b9a 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_no_causalmask_has_bias_no_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_no_causalmask_has_bias_no_dropout_maxk_64.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_grouped_infer.h"
 
-
 template void run_grouped_infer_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_no_causalmask_no_bias_has_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_no_causalmask_no_bias_has_dropout_maxk_128.cpp
index 68bcf15e3..f6282217d 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_no_causalmask_no_bias_has_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_no_causalmask_no_bias_has_dropout_maxk_128.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_grouped_infer.h"
 
-
 template void run_grouped_infer_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_no_causalmask_no_bias_has_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_no_causalmask_no_bias_has_dropout_maxk_256.cpp
index 80021085e..0564af6ec 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_no_causalmask_no_bias_has_dropout_maxk_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_no_causalmask_no_bias_has_dropout_maxk_256.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_grouped_infer.h"
 
-
 template void run_grouped_infer_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_no_causalmask_no_bias_has_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_no_causalmask_no_bias_has_dropout_maxk_32.cpp
index 14d942165..afbe9a21f 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_no_causalmask_no_bias_has_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_no_causalmask_no_bias_has_dropout_maxk_32.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_grouped_infer.h"
 
-
 template void run_grouped_infer_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_no_causalmask_no_bias_has_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_no_causalmask_no_bias_has_dropout_maxk_64.cpp
index 39ce50cda..99e9133dc 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_no_causalmask_no_bias_has_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_no_causalmask_no_bias_has_dropout_maxk_64.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_grouped_infer.h"
 
-
 template void run_grouped_infer_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_no_causalmask_no_bias_no_dropout_maxk_128.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_no_causalmask_no_bias_no_dropout_maxk_128.cpp
index 6ba0e0550..637d40bc1 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_no_causalmask_no_bias_no_dropout_maxk_128.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_no_causalmask_no_bias_no_dropout_maxk_128.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_grouped_infer.h"
 
-
 template void run_grouped_infer_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_no_causalmask_no_bias_no_dropout_maxk_256.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_no_causalmask_no_bias_no_dropout_maxk_256.cpp
index 6d2e6831f..ca8cb1bed 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_no_causalmask_no_bias_no_dropout_maxk_256.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_no_causalmask_no_bias_no_dropout_maxk_256.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_grouped_infer.h"
 
-
 template void run_grouped_infer_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_no_causalmask_no_bias_no_dropout_maxk_32.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_no_causalmask_no_bias_no_dropout_maxk_32.cpp
index ffcf316fd..61f1540ae 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_no_causalmask_no_bias_no_dropout_maxk_32.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_no_causalmask_no_bias_no_dropout_maxk_32.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_grouped_infer.h"
 
-
 template void run_grouped_infer_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     false,
diff --git a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_no_causalmask_no_bias_no_dropout_maxk_64.cpp b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_no_causalmask_no_bias_no_dropout_maxk_64.cpp
index e50bbb87f..cad791039 100644
--- a/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_no_causalmask_no_bias_no_dropout_maxk_64.cpp
+++ b/xformers/csrc/attention/hip_fmha/instances/fmha_grouped_infer_fp16_no_causalmask_no_bias_no_dropout_maxk_64.cpp
@@ -11,7 +11,6 @@
 #include <ck_tile/core/numeric/half.hpp>
 #include "ck_tiled_fmha_grouped_infer.h"
 
-
 template void run_grouped_infer_causalmask_bias_dropout_dispatch<
     ck_tile::fp16_t,
     false,

From bc3db994cfc5a400cf47967ce2f09eb31608a39f Mon Sep 17 00:00:00 2001
From: Qianfeng Zhang <Qianfeng.Zhang@amd.com>
Date: Tue, 13 Aug 2024 17:53:05 +0000
Subject: [PATCH 613/641] Fix to the backward Trait

---
 .../csrc/attention/hip_fmha/ck_tiled_fmha_batched_backward.h     | 1 +
 .../csrc/attention/hip_fmha/ck_tiled_fmha_grouped_backward.h     | 1 +
 2 files changed, 2 insertions(+)

diff --git a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_backward.h b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_backward.h
index 502ab4e9e..8bcb29bee 100644
--- a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_backward.h
+++ b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_backward.h
@@ -126,6 +126,7 @@ struct batched_backward_causalmask_bias_dropout_dispatch {
               kBiasEnum,
               kHasBiasGrad,
               false, // kStoreLSE
+              false, // place-holder for kHasDropout, not used actually
               false, // kDoFp8StaticQuant place-holder
               occupancy>;
 
diff --git a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_backward.h b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_backward.h
index 5ca27a0c5..82d9920f6 100644
--- a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_backward.h
+++ b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_backward.h
@@ -123,6 +123,7 @@ struct grouped_backward_causalmask_bias_dropout_dispatch {
               kBiasEnum,
               kHasBiasGrad,
               false, // kStoreLSE
+              false, // place-holder for kHasDropout, not used actually
               false, // kDoFp8StaticQuant place-holder
               occupancy>;
 

From fa6d8b3a63c9d7e0d1d0183d45be6bba17c36edb Mon Sep 17 00:00:00 2001
From: Qianfeng Zhang <Qianfeng.Zhang@amd.com>
Date: Tue, 13 Aug 2024 18:08:28 +0000
Subject: [PATCH 614/641] Set occupancy to -1 to avoid the compiling warning

---
 .../csrc/attention/hip_fmha/ck_tiled_fmha_batched_backward.h    | 2 +-
 .../csrc/attention/hip_fmha/ck_tiled_fmha_grouped_backward.h    | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_backward.h b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_backward.h
index 8bcb29bee..6804ce6d6 100644
--- a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_backward.h
+++ b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_backward.h
@@ -96,7 +96,7 @@ struct batched_backward_causalmask_bias_dropout_dispatch {
       const bool has_local_attention = (param.window_size > 0) ? true : false;
 
       BOOL_SWITCH(has_local_attention, USE_LOCAL_ATTENTION, [&] {
-        constexpr ck_tile::index_t occupancy = 1;
+        constexpr ck_tile::index_t occupancy = -1;
         constexpr bool has_masking = kHasCausalMask || USE_LOCAL_ATTENTION;
 
         using FmhaMask = ck_tile::SimplifiedGenericAttentionMask<has_masking>;
diff --git a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_backward.h b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_backward.h
index 82d9920f6..d2ba13a31 100644
--- a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_backward.h
+++ b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_backward.h
@@ -92,7 +92,7 @@ struct grouped_backward_causalmask_bias_dropout_dispatch {
       const bool has_local_attention = (param.window_size > 0) ? true : false;
 
       BOOL_SWITCH(has_local_attention, USE_LOCAL_ATTENTION, [&] {
-        constexpr ck_tile::index_t occupancy = 1;
+        constexpr ck_tile::index_t occupancy = -1;
         constexpr bool has_masking = kHasCausalMask || USE_LOCAL_ATTENTION;
         const bool has_dropout = (param.dropout_prob > 0.0f);
 

From c5c7cce9e68881949a7607f3645edf083cf3feca Mon Sep 17 00:00:00 2001
From: Qianfeng Zhang <Qianfeng.Zhang@amd.com>
Date: Tue, 13 Aug 2024 18:57:39 +0000
Subject: [PATCH 615/641] Revert "Set occupancy to -1 to avoid the compiling
 warning"

This reverts commit fa6d8b3a63c9d7e0d1d0183d45be6bba17c36edb.
---
 .../csrc/attention/hip_fmha/ck_tiled_fmha_batched_backward.h    | 2 +-
 .../csrc/attention/hip_fmha/ck_tiled_fmha_grouped_backward.h    | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_backward.h b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_backward.h
index 6804ce6d6..8bcb29bee 100644
--- a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_backward.h
+++ b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_backward.h
@@ -96,7 +96,7 @@ struct batched_backward_causalmask_bias_dropout_dispatch {
       const bool has_local_attention = (param.window_size > 0) ? true : false;
 
       BOOL_SWITCH(has_local_attention, USE_LOCAL_ATTENTION, [&] {
-        constexpr ck_tile::index_t occupancy = -1;
+        constexpr ck_tile::index_t occupancy = 1;
         constexpr bool has_masking = kHasCausalMask || USE_LOCAL_ATTENTION;
 
         using FmhaMask = ck_tile::SimplifiedGenericAttentionMask<has_masking>;
diff --git a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_backward.h b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_backward.h
index d2ba13a31..82d9920f6 100644
--- a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_backward.h
+++ b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_backward.h
@@ -92,7 +92,7 @@ struct grouped_backward_causalmask_bias_dropout_dispatch {
       const bool has_local_attention = (param.window_size > 0) ? true : false;
 
       BOOL_SWITCH(has_local_attention, USE_LOCAL_ATTENTION, [&] {
-        constexpr ck_tile::index_t occupancy = -1;
+        constexpr ck_tile::index_t occupancy = 1;
         constexpr bool has_masking = kHasCausalMask || USE_LOCAL_ATTENTION;
         const bool has_dropout = (param.dropout_prob > 0.0f);
 

From d230433eafebdfe06824ee560475efcd39cec2a0 Mon Sep 17 00:00:00 2001
From: Qianfeng Zhang <Qianfeng.Zhang@amd.com>
Date: Wed, 14 Aug 2024 17:02:32 +0000
Subject: [PATCH 616/641] Add environment variable and compiler definition to
 control the generating of headdim256 instances

---
 setup.py                                      | 10 +++++
 .../hip_fmha/ck_tiled_headdim_switch.h        | 42 +++++++++++++++++++
 2 files changed, 52 insertions(+)

diff --git a/setup.py b/setup.py
index 54a261f66..6520f049d 100644
--- a/setup.py
+++ b/setup.py
@@ -402,6 +402,14 @@ def get_extensions():
             "--ptxas-options=-allow-expensive-optimizations=true",
         ]
     elif torch.cuda.is_available() and torch.version.hip:
+        disable_hd256_hip_fmha = os.getenv("DISABLE_HD256_HIP_FMHA", "0")
+        if disable_hd256_hip_fmha == "1":
+            source_hip_maxk_256 = []
+            for ff in source_hip:
+                if ff.endswith("maxk_256.cpp"):
+                    source_hip_maxk_256 += [ff]
+            source_hip = list(set(source_hip) - set(source_hip_maxk_256))
+
         rename_cpp_cu(source_hip)
         rocm_home = os.getenv("ROCM_PATH")
         hip_version = get_hip_version(rocm_home)
@@ -421,6 +429,8 @@ def get_extensions():
         ]
 
         generator_flag = []
+        if disable_hd256_hip_fmha == "1":
+            generator_flag += ["-DFMHA_SUPPORT_MAX_HEADDIM_128=1"]
 
         cc_flag = ["-DBUILD_PYTHON_PACKAGE"]
         extra_compile_args = {
diff --git a/xformers/csrc/attention/hip_fmha/ck_tiled_headdim_switch.h b/xformers/csrc/attention/hip_fmha/ck_tiled_headdim_switch.h
index 3e435a646..ce99023c9 100644
--- a/xformers/csrc/attention/hip_fmha/ck_tiled_headdim_switch.h
+++ b/xformers/csrc/attention/hip_fmha/ck_tiled_headdim_switch.h
@@ -9,6 +9,46 @@
 #include <ck_tile/core.hpp>
 #include <stdexcept>
 
+#ifndef FMHA_SUPPORT_MAX_HEADDIM_128
+#define FMHA_SUPPORT_MAX_HEADDIM_128 0
+#endif
+
+#if FMHA_SUPPORT_MAX_HEADDIM_128
+
+#define FMHA_FWD_HEADDIM_SWITCH(HEAD_DIM1, HEAD_DIM2, CONST_NAME, ...) \
+  [&] {                                                                \
+    if (HEAD_DIM1 <= 32 && HEAD_DIM2 <= 32) {                          \
+      constexpr ck_tile::index_t CONST_NAME = 32;                      \
+      __VA_ARGS__();                                                   \
+    } else if (HEAD_DIM1 <= 64 && HEAD_DIM2 <= 64) {                   \
+      constexpr ck_tile::index_t CONST_NAME = 64;                      \
+      __VA_ARGS__();                                                   \
+    } else if (HEAD_DIM1 <= 128 && HEAD_DIM2 <= 128) {                 \
+      constexpr ck_tile::index_t CONST_NAME = 128;                     \
+      __VA_ARGS__();                                                   \
+    } else {                                                           \
+      throw std::runtime_error("Head-dim sizes not supported!");       \
+    }                                                                  \
+  }()
+
+#define FMHA_BWD_HEADDIM_SWITCH(HEAD_DIM1, HEAD_DIM2, CONST_NAME, ...) \
+  [&] {                                                                \
+    if (HEAD_DIM1 <= 32 && HEAD_DIM2 <= 32) {                          \
+      constexpr ck_tile::index_t CONST_NAME = 32;                      \
+      __VA_ARGS__();                                                   \
+    } else if (HEAD_DIM1 <= 64 && HEAD_DIM2 <= 64) {                   \
+      constexpr ck_tile::index_t CONST_NAME = 64;                      \
+      __VA_ARGS__();                                                   \
+    } else if (HEAD_DIM1 <= 128 && HEAD_DIM2 <= 128) {                 \
+      constexpr ck_tile::index_t CONST_NAME = 128;                     \
+      __VA_ARGS__();                                                   \
+    } else {                                                           \
+      throw std::runtime_error("Head-dim sizes not supported!");       \
+    }                                                                  \
+  }()
+
+#else
+
 #define FMHA_FWD_HEADDIM_SWITCH(HEAD_DIM1, HEAD_DIM2, CONST_NAME, ...) \
   [&] {                                                                \
     if (HEAD_DIM1 <= 32 && HEAD_DIM2 <= 32) {                          \
@@ -46,3 +86,5 @@
       throw std::runtime_error("Head-dim sizes not supported!");       \
     }                                                                  \
   }()
+
+#endif

From 82a07aeab231c85a7280b8e88482b4d0a2930dcb Mon Sep 17 00:00:00 2001
From: Qianfeng Zhang <Qianfeng.Zhang@amd.com>
Date: Wed, 14 Aug 2024 17:54:11 +0000
Subject: [PATCH 617/641] Add --ignore-hd256 argument to generate_instance.py
 and some update in this script

---
 .../attention/hip_fmha/generate_instances.py  | 62 ++++++++++++-------
 1 file changed, 41 insertions(+), 21 deletions(-)

diff --git a/xformers/csrc/attention/hip_fmha/generate_instances.py b/xformers/csrc/attention/hip_fmha/generate_instances.py
index ff72c17bb..fc27bcc54 100644
--- a/xformers/csrc/attention/hip_fmha/generate_instances.py
+++ b/xformers/csrc/attention/hip_fmha/generate_instances.py
@@ -6,7 +6,9 @@
 #
 
 import os
+import sys
 from pathlib import Path
+from typing import List
 
 FMHA_COPYRIGHT_HEADER = """
 /*
@@ -121,13 +123,13 @@
 }
 
 
-def create_infer_instances(instance_dir: Path) -> None:
+def create_infer_instances(instance_dir: Path, headdims: List) -> None:
     for mode in ["batched", "grouped"]:
         for dtype in ["fp16", "bf16"]:
             for has_causalmask in [True, False]:
                 for has_bias in [True, False]:
                     for has_dropout in [True, False]:
-                        for max_k in [32, 64, 128, 256]:
+                        for max_k in headdims:
                             fname = FMHA_INFER_INSTANCE_FNAME.format(
                                 mode=mode,
                                 dtype_str=dtype,
@@ -150,10 +152,10 @@ def create_infer_instances(instance_dir: Path) -> None:
                                 max_k=max_k,
                                 cap_mode=MODE_NAME_MAP[mode],
                             )
-                            (instance_dir / fname).write_text(FMHA_COPYRIGHT_HEADER + infer_instance_inc + "\n" + infer_instance)
+                            (instance_dir / fname).write_text(FMHA_COPYRIGHT_HEADER + infer_instance_inc + infer_instance)
 
 
-def create_infer_instances_ref(instance_dir: Path) -> None:
+def create_infer_instances_ref(instance_dir: Path, headdims: List) -> None:
     for mode in ["batched", "grouped"]:
         for dtype in ["fp16", "bf16"]:
             ref_fname = FMHA_INSTANCE_REF_FNAME.format(
@@ -168,7 +170,7 @@ def create_infer_instances_ref(instance_dir: Path) -> None:
             with open(ref_fname, 'a') as file:
                 file.write(FMHA_COPYRIGHT_HEADER)
                 file.write(infer_instance_inc)
-                for max_k in [32, 64, 128, 256]:
+                for max_k in headdims:
                     for has_bias in [True, False]:
                         for has_dropout in [True, False]:
                             for has_causalmask in [True, False]:
@@ -185,13 +187,13 @@ def create_infer_instances_ref(instance_dir: Path) -> None:
                                 file.write(infer_instance)
 
 
-def create_forward_instances(instance_dir: Path) -> None:
+def create_forward_instances(instance_dir: Path, headdims: List) -> None:
     for mode in ["batched", "grouped"]:
         for dtype in ["fp16", "bf16"]:
             for has_causalmask in [True, False]:
                 for has_bias in [True, False]:
                     for has_dropout in [True, False]:
-                        for max_k in [32, 64, 128, 256]:
+                        for max_k in headdims:
                             fname = FMHA_FORWARD_INSTANCE_FNAME.format(
                                 mode=mode,
                                 dtype_str=dtype,
@@ -214,10 +216,10 @@ def create_forward_instances(instance_dir: Path) -> None:
                                 max_k=max_k,
                                 cap_mode=MODE_NAME_MAP[mode],
                             )
-                            (instance_dir / fname).write_text(FMHA_COPYRIGHT_HEADER + forward_instance_inc + "\n" + forward_instance)
+                            (instance_dir / fname).write_text(FMHA_COPYRIGHT_HEADER + forward_instance_inc + forward_instance)
 
 
-def create_forward_instances_ref(instance_dir: Path) -> None:
+def create_forward_instances_ref(instance_dir: Path, headdims: List) -> None:
     for mode in ["batched", "grouped"]:
         for dtype in ["fp16", "bf16"]:
             ref_fname = FMHA_INSTANCE_REF_FNAME.format(
@@ -232,7 +234,7 @@ def create_forward_instances_ref(instance_dir: Path) -> None:
             with open(ref_fname, 'a') as file:
                 file.write(FMHA_COPYRIGHT_HEADER)
                 file.write(forward_instance_inc)
-                for max_k in [32, 64, 128, 256]:
+                for max_k in headdims:
                     for has_bias in [True, False]:
                         for has_dropout in [True, False]:
                             for has_causalmask in [True, False]:
@@ -249,13 +251,13 @@ def create_forward_instances_ref(instance_dir: Path) -> None:
                                 file.write(forward_instance)
 
 
-def create_backward_instances(instance_dir: Path) -> None:
+def create_backward_instances(instance_dir: Path, headdims: List) -> None:
     for mode in ["batched", "grouped"]:
         for dtype in ["fp16", "bf16"]:
             for has_causalmask in [True, False]:
                 for has_bias, has_bias_grad in [[True, False], [True, True], [False, False]]:
                     for has_dropout in [True, False]:
-                        for max_k in [32, 64, 128, 256]:
+                        for max_k in headdims:
                             fname = FMHA_BACKWARD_INSTANCE_FNAME.format(
                                 mode=mode,
                                 dtype_str=dtype,
@@ -280,10 +282,10 @@ def create_backward_instances(instance_dir: Path) -> None:
                                 max_k=max_k,
                                 cap_mode=MODE_NAME_MAP[mode],
                             )
-                            (instance_dir / fname).write_text(FMHA_COPYRIGHT_HEADER + backward_instance_inc + "\n" + backward_instance)
+                            (instance_dir / fname).write_text(FMHA_COPYRIGHT_HEADER + backward_instance_inc + backward_instance)
 
 
-def create_backward_instances_ref(instance_dir: Path) -> None:
+def create_backward_instances_ref(instance_dir: Path, headdims: List) -> None:
     for mode in ["batched", "grouped"]:
         for dtype in ["fp16", "bf16"]:
             ref_fname = FMHA_INSTANCE_REF_FNAME.format(
@@ -298,7 +300,7 @@ def create_backward_instances_ref(instance_dir: Path) -> None:
             with open(ref_fname, 'a') as file:
                 file.write(FMHA_COPYRIGHT_HEADER)
                 file.write(backward_instance_inc)
-                for max_k in [32, 64, 128, 256]:
+                for max_k in headdims:
                     for has_bias, has_bias_grad in [[True, False], [True, True], [False, False]]:
                         for has_dropout in [True, False]:
                             for has_causalmask in [True, False]:
@@ -317,12 +319,30 @@ def create_backward_instances_ref(instance_dir: Path) -> None:
 
 
 if __name__ == "__main__":
+    disable_hd256 = False
+
+    for arg in sys.argv:
+        if arg == "--ignore-hd256":
+            disable_hd256 = True
+
+    if disable_hd256:
+        headdims = [32, 64, 128]
+    else:
+        headdims = [32, 64, 128, 256]
+
     this_dir = os.path.dirname(__file__)
     output_dir = Path(this_dir) / "instances"
     output_dir.mkdir(parents=True, exist_ok=True)
-    create_infer_instances(output_dir)
-    create_infer_instances_ref(output_dir)
-    create_forward_instances(output_dir)
-    create_forward_instances_ref(output_dir)
-    create_backward_instances(output_dir)
-    create_backward_instances_ref(output_dir)
+
+    ## remove existing files in the directory
+    files = os.listdir(output_dir)
+    for ff in files:
+        file_path = os.path.join(output_dir, ff)
+        os.remove(file_path)
+
+    create_infer_instances(output_dir, headdims)
+    create_infer_instances_ref(output_dir, headdims)
+    create_forward_instances(output_dir, headdims)
+    create_forward_instances_ref(output_dir, headdims)
+    create_backward_instances(output_dir, headdims)
+    create_backward_instances_ref(output_dir, headdims)

From 38593d606ab1cdf8b58f94ef02e7c1cda86e20d1 Mon Sep 17 00:00:00 2001
From: Qianfeng Zhang <Qianfeng.Zhang@amd.com>
Date: Thu, 15 Aug 2024 09:19:04 +0000
Subject: [PATCH 618/641] Add environment variable
 ENABLE_HIP_FMHA_RTN_BF16_CONVERT to enable using rtn bf16 conversion

---
 setup.py | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/setup.py b/setup.py
index 6520f049d..f648706e2 100644
--- a/setup.py
+++ b/setup.py
@@ -428,11 +428,16 @@ def get_extensions():
             Path(this_dir) / "third_party" / "composable_kernel_tiled" / "include"
         ]
 
+        use_rtn_bf16_convert = os.getenv("ENABLE_HIP_FMHA_RTN_BF16_CONVERT", "0")
+
         generator_flag = []
         if disable_hd256_hip_fmha == "1":
             generator_flag += ["-DFMHA_SUPPORT_MAX_HEADDIM_128=1"]
 
         cc_flag = ["-DBUILD_PYTHON_PACKAGE"]
+        if use_rtn_bf16_convert == "1":
+            cc_flag += ["-DCK_TILE_FLOAT_TO_BFLOAT16_DEFAULT=0"]
+
         extra_compile_args = {
             "cxx": ["-O3", "-std=c++17"] + generator_flag,
             "nvcc": [

From 15dc91180912f895512f5784f9b89df51504243c Mon Sep 17 00:00:00 2001
From: Qianfeng Zhang <Qianfeng.Zhang@amd.com>
Date: Thu, 15 Aug 2024 17:47:20 +0000
Subject: [PATCH 619/641] Remove commented lines in test_mem_eff_attention.py

---
 tests/test_mem_eff_attention.py | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/tests/test_mem_eff_attention.py b/tests/test_mem_eff_attention.py
index d42d4cc22..ed6d6a696 100644
--- a/tests/test_mem_eff_attention.py
+++ b/tests/test_mem_eff_attention.py
@@ -705,10 +705,6 @@ def test_backward(
 
     if op_bw == fmha.ck.BwOp:
         op_fw = fmha.ck.FwOp
-        ##if dtype == torch.bfloat16:
-        ##    pytest.skip(
-        ##        "CK Fmha backward for bfloat16 currently is not very accurate for some cases!"
-        ##    )
         if grad_out_contiguous is False:
             pytest.skip("CK Fmha does not support contiguous layout for grad_out!")
 

From 367274c13ee5930b27b031f0640a66be1ff6d3ba Mon Sep 17 00:00:00 2001
From: Qianfeng Zhang <Qianfeng.Zhang@amd.com>
Date: Thu, 15 Aug 2024 22:42:56 +0000
Subject: [PATCH 620/641] Synchronize to latest ck_tile commit

---
 third_party/composable_kernel_tiled | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/third_party/composable_kernel_tiled b/third_party/composable_kernel_tiled
index 17c97f581..0d79fde5e 160000
--- a/third_party/composable_kernel_tiled
+++ b/third_party/composable_kernel_tiled
@@ -1 +1 @@
-Subproject commit 17c97f581456dae128b7a6dddd9ec02dacedbd0e
+Subproject commit 0d79fde5e2bb4009de31a63ce1f8ec1facf4c1cc

From f7b28c52a9b00aed07819266a4d54b899e92eb3f Mon Sep 17 00:00:00 2001
From: Max Podkorytov <4273004+tenpercent@users.noreply.github.com>
Date: Fri, 16 Aug 2024 19:45:57 +0000
Subject: [PATCH 621/641] apply black

---
 setup.py                                      |   2 +-
 .../attention/hip_fmha/generate_instances.py  | 175 +++++++++++-------
 xformers/ops/fmha/ck.py                       |   4 +-
 3 files changed, 110 insertions(+), 71 deletions(-)

diff --git a/setup.py b/setup.py
index f648706e2..abadb4a17 100644
--- a/setup.py
+++ b/setup.py
@@ -451,7 +451,7 @@ def get_extensions():
                 "-Werror",
                 "-Woverloaded-virtual",
                 "-mllvm",
-                "-enable-post-misched=0"
+                "-enable-post-misched=0",
             ]
             + generator_flag
             + cc_flag,
diff --git a/xformers/csrc/attention/hip_fmha/generate_instances.py b/xformers/csrc/attention/hip_fmha/generate_instances.py
index fc27bcc54..bfbe5f345 100644
--- a/xformers/csrc/attention/hip_fmha/generate_instances.py
+++ b/xformers/csrc/attention/hip_fmha/generate_instances.py
@@ -35,8 +35,10 @@
     {max_k}>({cap_mode}ForwardParams& param, hipStream_t stream);
 """
 
-FMHA_INFER_INSTANCE_FNAME = "fmha_{mode}_infer_{dtype_str}_{has_or_no_causalmask_str}_"\
-                            "{has_or_no_bias_str}_{has_or_no_dropout_str}_{max_k_str}.cpp"
+FMHA_INFER_INSTANCE_FNAME = (
+    "fmha_{mode}_infer_{dtype_str}_{has_or_no_causalmask_str}_"
+    "{has_or_no_bias_str}_{has_or_no_dropout_str}_{max_k_str}.cpp"
+)
 
 FMHA_FORWARD_INSTANCE_TEMPLATE_INC = """
 #include <ck_tile/core/numeric/{dtype_file}.hpp>
@@ -52,8 +54,10 @@
     {max_k}>({cap_mode}ForwardParams& param, hipStream_t stream);
 """
 
-FMHA_FORWARD_INSTANCE_FNAME = "fmha_{mode}_forward_{dtype_str}_{has_or_no_causalmask_str}_"\
-                              "{has_or_no_bias_str}_{has_or_no_dropout_str}_{max_k_str}.cpp"
+FMHA_FORWARD_INSTANCE_FNAME = (
+    "fmha_{mode}_forward_{dtype_str}_{has_or_no_causalmask_str}_"
+    "{has_or_no_bias_str}_{has_or_no_dropout_str}_{max_k_str}.cpp"
+)
 
 FMHA_BACKWARD_INSTANCE_TEMPLATE_INC = """
 #include <ck_tile/core/numeric/{dtype_file}.hpp>
@@ -70,56 +74,55 @@
     {max_k}>({cap_mode}BackwardParams& param, hipStream_t stream);
 """
 
-FMHA_BACKWARD_INSTANCE_FNAME = "fmha_{mode}_backward_{dtype_str}_{has_or_no_causalmask_str}_"\
-                               "{has_or_no_bias_str}_{has_or_no_biasgrad_str}_{has_or_no_dropout_str}_{max_k_str}.cpp"
+FMHA_BACKWARD_INSTANCE_FNAME = (
+    "fmha_{mode}_backward_{dtype_str}_{has_or_no_causalmask_str}_"
+    "{has_or_no_bias_str}_{has_or_no_biasgrad_str}_{has_or_no_dropout_str}_{max_k_str}.cpp"
+)
 
 FMHA_INSTANCE_REF_FNAME = "instances/fmha_{mode}_{function}_{dtype}_instances_ref.h"
 
-BOOL_MAP = {
-    True : "true",
-    False : "false"
-}
+BOOL_MAP = {True: "true", False: "false"}
 
 BOOL_MAP_CAUSALMASK = {
-    True : "has_causalmask",
-    False : "no_causalmask",
+    True: "has_causalmask",
+    False: "no_causalmask",
 }
 
 BOOL_MAP_BIAS = {
-    True : "has_bias",
-    False : "no_bias",
+    True: "has_bias",
+    False: "no_bias",
 }
 
 BOOL_MAP_BIASGRAD = {
-    True : "has_biasgrad",
-    False : "no_biasgrad",
+    True: "has_biasgrad",
+    False: "no_biasgrad",
 }
 
 BOOL_MAP_DROPOUT = {
-    True : "has_dropout",
-    False : "no_dropout",
+    True: "has_dropout",
+    False: "no_dropout",
 }
 
 INT_MAP_MAX_K = {
-    32 : "maxk_32",
-    64 : "maxk_64",
-    128 : "maxk_128",
-    256 : "maxk_256",
+    32: "maxk_32",
+    64: "maxk_64",
+    128: "maxk_128",
+    256: "maxk_256",
 }
 
 TYPE_CTYPE_MAP = {
-    "fp16" : "ck_tile::fp16_t",
-    "bf16" : "ck_tile::bf16_t",
+    "fp16": "ck_tile::fp16_t",
+    "bf16": "ck_tile::bf16_t",
 }
 
 TYPE_FNAME_MAP = {
-    "fp16" : "half",
-    "bf16" : "bfloat16",
+    "fp16": "half",
+    "bf16": "bfloat16",
 }
 
 MODE_NAME_MAP = {
-    "batched" : "Batched",
-    "grouped" : "Grouped",
+    "batched": "Batched",
+    "grouped": "Grouped",
 }
 
 
@@ -133,14 +136,18 @@ def create_infer_instances(instance_dir: Path, headdims: List) -> None:
                             fname = FMHA_INFER_INSTANCE_FNAME.format(
                                 mode=mode,
                                 dtype_str=dtype,
-                                has_or_no_causalmask_str=BOOL_MAP_CAUSALMASK[has_causalmask],
+                                has_or_no_causalmask_str=BOOL_MAP_CAUSALMASK[
+                                    has_causalmask
+                                ],
                                 has_or_no_bias_str=BOOL_MAP_BIAS[has_bias],
                                 has_or_no_dropout_str=BOOL_MAP_DROPOUT[has_dropout],
                                 max_k_str=INT_MAP_MAX_K[max_k],
                             )
-                            infer_instance_inc = FMHA_INFER_INSTANCE_TEMPLATE_INC.format(
-                                mode=mode,
-                                dtype_file=TYPE_FNAME_MAP[dtype],
+                            infer_instance_inc = (
+                                FMHA_INFER_INSTANCE_TEMPLATE_INC.format(
+                                    mode=mode,
+                                    dtype_file=TYPE_FNAME_MAP[dtype],
+                                )
                             )
                             infer_instance = FMHA_INFER_INSTANCE_TEMPLATE.format(
                                 extern="",
@@ -152,7 +159,11 @@ def create_infer_instances(instance_dir: Path, headdims: List) -> None:
                                 max_k=max_k,
                                 cap_mode=MODE_NAME_MAP[mode],
                             )
-                            (instance_dir / fname).write_text(FMHA_COPYRIGHT_HEADER + infer_instance_inc + infer_instance)
+                            (instance_dir / fname).write_text(
+                                FMHA_COPYRIGHT_HEADER
+                                + infer_instance_inc
+                                + infer_instance
+                            )
 
 
 def create_infer_instances_ref(instance_dir: Path, headdims: List) -> None:
@@ -167,7 +178,7 @@ def create_infer_instances_ref(instance_dir: Path, headdims: List) -> None:
                 mode=mode,
                 dtype_file=TYPE_FNAME_MAP[dtype],
             )
-            with open(ref_fname, 'a') as file:
+            with open(ref_fname, "a") as file:
                 file.write(FMHA_COPYRIGHT_HEADER)
                 file.write(infer_instance_inc)
                 for max_k in headdims:
@@ -197,15 +208,19 @@ def create_forward_instances(instance_dir: Path, headdims: List) -> None:
                             fname = FMHA_FORWARD_INSTANCE_FNAME.format(
                                 mode=mode,
                                 dtype_str=dtype,
-                                has_or_no_causalmask_str=BOOL_MAP_CAUSALMASK[has_causalmask],
+                                has_or_no_causalmask_str=BOOL_MAP_CAUSALMASK[
+                                    has_causalmask
+                                ],
                                 has_or_no_bias_str=BOOL_MAP_BIAS[has_bias],
                                 has_or_no_dropout_str=BOOL_MAP_DROPOUT[has_dropout],
                                 max_k_str=INT_MAP_MAX_K[max_k],
                             )
-                            forward_instance_inc = FMHA_FORWARD_INSTANCE_TEMPLATE_INC.format(
-                                mode=mode,
-                                dtype_file=TYPE_FNAME_MAP[dtype],
-                            ) 
+                            forward_instance_inc = (
+                                FMHA_FORWARD_INSTANCE_TEMPLATE_INC.format(
+                                    mode=mode,
+                                    dtype_file=TYPE_FNAME_MAP[dtype],
+                                )
+                            )
                             forward_instance = FMHA_FORWARD_INSTANCE_TEMPLATE.format(
                                 extern="",
                                 mode=mode,
@@ -216,7 +231,11 @@ def create_forward_instances(instance_dir: Path, headdims: List) -> None:
                                 max_k=max_k,
                                 cap_mode=MODE_NAME_MAP[mode],
                             )
-                            (instance_dir / fname).write_text(FMHA_COPYRIGHT_HEADER + forward_instance_inc + forward_instance)
+                            (instance_dir / fname).write_text(
+                                FMHA_COPYRIGHT_HEADER
+                                + forward_instance_inc
+                                + forward_instance
+                            )
 
 
 def create_forward_instances_ref(instance_dir: Path, headdims: List) -> None:
@@ -231,22 +250,24 @@ def create_forward_instances_ref(instance_dir: Path, headdims: List) -> None:
                 mode=mode,
                 dtype_file=TYPE_FNAME_MAP[dtype],
             )
-            with open(ref_fname, 'a') as file:
+            with open(ref_fname, "a") as file:
                 file.write(FMHA_COPYRIGHT_HEADER)
                 file.write(forward_instance_inc)
                 for max_k in headdims:
                     for has_bias in [True, False]:
                         for has_dropout in [True, False]:
                             for has_causalmask in [True, False]:
-                                forward_instance = FMHA_FORWARD_INSTANCE_TEMPLATE.format(
-                                    extern="extern ",
-                                    mode=mode,
-                                    dtype=TYPE_CTYPE_MAP[dtype],
-                                    has_causalmask=BOOL_MAP[has_causalmask],
-                                    has_bias=BOOL_MAP[has_bias],
-                                    has_dropout=BOOL_MAP[has_dropout],
-                                    max_k=max_k,
-                                    cap_mode=MODE_NAME_MAP[mode],
+                                forward_instance = (
+                                    FMHA_FORWARD_INSTANCE_TEMPLATE.format(
+                                        extern="extern ",
+                                        mode=mode,
+                                        dtype=TYPE_CTYPE_MAP[dtype],
+                                        has_causalmask=BOOL_MAP[has_causalmask],
+                                        has_bias=BOOL_MAP[has_bias],
+                                        has_dropout=BOOL_MAP[has_dropout],
+                                        max_k=max_k,
+                                        cap_mode=MODE_NAME_MAP[mode],
+                                    )
                                 )
                                 file.write(forward_instance)
 
@@ -255,21 +276,29 @@ def create_backward_instances(instance_dir: Path, headdims: List) -> None:
     for mode in ["batched", "grouped"]:
         for dtype in ["fp16", "bf16"]:
             for has_causalmask in [True, False]:
-                for has_bias, has_bias_grad in [[True, False], [True, True], [False, False]]:
+                for has_bias, has_bias_grad in [
+                    [True, False],
+                    [True, True],
+                    [False, False],
+                ]:
                     for has_dropout in [True, False]:
                         for max_k in headdims:
                             fname = FMHA_BACKWARD_INSTANCE_FNAME.format(
                                 mode=mode,
                                 dtype_str=dtype,
-                                has_or_no_causalmask_str=BOOL_MAP_CAUSALMASK[has_causalmask],
+                                has_or_no_causalmask_str=BOOL_MAP_CAUSALMASK[
+                                    has_causalmask
+                                ],
                                 has_or_no_bias_str=BOOL_MAP_BIAS[has_bias],
                                 has_or_no_biasgrad_str=BOOL_MAP_BIASGRAD[has_bias_grad],
                                 has_or_no_dropout_str=BOOL_MAP_DROPOUT[has_dropout],
                                 max_k_str=INT_MAP_MAX_K[max_k],
                             )
-                            backward_instance_inc = FMHA_BACKWARD_INSTANCE_TEMPLATE_INC.format(
-                                mode=mode,
-                                dtype_file=TYPE_FNAME_MAP[dtype],
+                            backward_instance_inc = (
+                                FMHA_BACKWARD_INSTANCE_TEMPLATE_INC.format(
+                                    mode=mode,
+                                    dtype_file=TYPE_FNAME_MAP[dtype],
+                                )
                             )
                             backward_instance = FMHA_BACKWARD_INSTANCE_TEMPLATE.format(
                                 extern="",
@@ -282,7 +311,11 @@ def create_backward_instances(instance_dir: Path, headdims: List) -> None:
                                 max_k=max_k,
                                 cap_mode=MODE_NAME_MAP[mode],
                             )
-                            (instance_dir / fname).write_text(FMHA_COPYRIGHT_HEADER + backward_instance_inc + backward_instance)
+                            (instance_dir / fname).write_text(
+                                FMHA_COPYRIGHT_HEADER
+                                + backward_instance_inc
+                                + backward_instance
+                            )
 
 
 def create_backward_instances_ref(instance_dir: Path, headdims: List) -> None:
@@ -297,23 +330,29 @@ def create_backward_instances_ref(instance_dir: Path, headdims: List) -> None:
                 mode=mode,
                 dtype_file=TYPE_FNAME_MAP[dtype],
             )
-            with open(ref_fname, 'a') as file:
+            with open(ref_fname, "a") as file:
                 file.write(FMHA_COPYRIGHT_HEADER)
                 file.write(backward_instance_inc)
                 for max_k in headdims:
-                    for has_bias, has_bias_grad in [[True, False], [True, True], [False, False]]:
+                    for has_bias, has_bias_grad in [
+                        [True, False],
+                        [True, True],
+                        [False, False],
+                    ]:
                         for has_dropout in [True, False]:
                             for has_causalmask in [True, False]:
-                                backward_instance = FMHA_BACKWARD_INSTANCE_TEMPLATE.format(
-                                    extern="extern ",
-                                    mode=mode,
-                                    dtype=TYPE_CTYPE_MAP[dtype],
-                                    has_causalmask=BOOL_MAP[has_causalmask],
-                                    has_bias=BOOL_MAP[has_bias],
-                                    has_bias_grad=BOOL_MAP[has_bias_grad],
-                                    has_dropout=BOOL_MAP[has_dropout],
-                                    max_k=max_k,
-                                    cap_mode=MODE_NAME_MAP[mode],
+                                backward_instance = (
+                                    FMHA_BACKWARD_INSTANCE_TEMPLATE.format(
+                                        extern="extern ",
+                                        mode=mode,
+                                        dtype=TYPE_CTYPE_MAP[dtype],
+                                        has_causalmask=BOOL_MAP[has_causalmask],
+                                        has_bias=BOOL_MAP[has_bias],
+                                        has_bias_grad=BOOL_MAP[has_bias_grad],
+                                        has_dropout=BOOL_MAP[has_dropout],
+                                        max_k=max_k,
+                                        cap_mode=MODE_NAME_MAP[mode],
+                                    )
                                 )
                                 file.write(backward_instance)
 
diff --git a/xformers/ops/fmha/ck.py b/xformers/ops/fmha/ck.py
index 47ad90d2f..889eeb446 100644
--- a/xformers/ops/fmha/ck.py
+++ b/xformers/ops/fmha/ck.py
@@ -344,7 +344,7 @@ class BwOp(AttentionBwOpBase):
     OPERATOR = get_operator("xformers", "efficient_attention_backward_ck")
     SUPPORTED_DEVICES = FwOp.SUPPORTED_DEVICES
     SUPPORTED_DTYPES = FwOp.SUPPORTED_DTYPES
-    SUPPORTED_MAX_K = 256 
+    SUPPORTED_MAX_K = 256
     SUPPORTED_ATTN_BIAS_TYPES: Iterable[Any] = (
         type(None),
         torch.Tensor,
@@ -369,7 +369,7 @@ class BwOp(AttentionBwOpBase):
         32,  # 64x64 kernel
         64,
         128,  # 64x128/128x128 kernel
-        256, 
+        256,
     ]
 
     @classmethod

From fd82f20b6c7a3b2f30856d48575065e45cd10028 Mon Sep 17 00:00:00 2001
From: Max Podkorytov <4273004+tenpercent@users.noreply.github.com>
Date: Fri, 16 Aug 2024 19:50:50 +0000
Subject: [PATCH 622/641] apply flake8

---
 xformers/csrc/attention/hip_fmha/generate_instances.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/xformers/csrc/attention/hip_fmha/generate_instances.py b/xformers/csrc/attention/hip_fmha/generate_instances.py
index bfbe5f345..d9a276350 100644
--- a/xformers/csrc/attention/hip_fmha/generate_instances.py
+++ b/xformers/csrc/attention/hip_fmha/generate_instances.py
@@ -373,7 +373,7 @@ def create_backward_instances_ref(instance_dir: Path, headdims: List) -> None:
     output_dir = Path(this_dir) / "instances"
     output_dir.mkdir(parents=True, exist_ok=True)
 
-    ## remove existing files in the directory
+    # remove existing files in the directory
     files = os.listdir(output_dir)
     for ff in files:
         file_path = os.path.join(output_dir, ff)

From 7d21800f684e4d654cdec49e10ed545d03a598f9 Mon Sep 17 00:00:00 2001
From: Max Podkorytov <4273004+tenpercent@users.noreply.github.com>
Date: Fri, 16 Aug 2024 20:43:02 +0000
Subject: [PATCH 623/641] fix mypy

---
 tests/test_mem_eff_attention.py | 6 +++---
 xformers/attn_bias_utils.py     | 4 ++--
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/tests/test_mem_eff_attention.py b/tests/test_mem_eff_attention.py
index ed6d6a696..ad71241ed 100644
--- a/tests/test_mem_eff_attention.py
+++ b/tests/test_mem_eff_attention.py
@@ -37,13 +37,13 @@
 if torch.cuda.is_available():
     compute_capability = torch.cuda.get_device_capability("cuda")
 sm70_or_better_only = pytest.mark.skipif(
-    torch.version.cuda and compute_capability < (7, 0), reason="requires sm70+"
+    torch.version.cuda is not None and compute_capability < (7, 0), reason="requires sm70+"
 )
 sm75_or_better_only = pytest.mark.skipif(
-    torch.version.cuda and compute_capability < (7, 5), reason="requires sm75+"
+    torch.version.cuda is not None and compute_capability < (7, 5), reason="requires sm75+"
 )
 sm80_or_better_only = pytest.mark.skipif(
-    torch.version.cuda and compute_capability < (8, 0), reason="requires sm80+"
+    torch.version.cuda is not None and compute_capability < (8, 0), reason="requires sm80+"
 )
 skip_if_rocm = pytest.mark.skipif(
     torch.version.hip is not None, reason="not supported on ROCm"
diff --git a/xformers/attn_bias_utils.py b/xformers/attn_bias_utils.py
index 224302c4f..fb8d8207f 100644
--- a/xformers/attn_bias_utils.py
+++ b/xformers/attn_bias_utils.py
@@ -39,7 +39,7 @@ def create_attn_bias(
     dtype,
     requires_grad: bool,
     fmt: str,
-    op: Type[AttentionOpBase],
+    op: Optional[Type[AttentionOpBase]] = None,
     page_size: Optional[int] = None,
 ):
     if bias_type is None or isinstance(None, bias_type):
@@ -59,7 +59,7 @@ def create_attn_bias(
                 * 3
             )
             attn_bias = attn_bias.expand(batch_size, num_heads, q_len, kv_len)
-        elif issubclass(op, fmha.triton_splitk.FwOp):
+        elif op is not None and issubclass(op, fmha.triton_splitk.FwOp):
             attn_bias = (
                 torch.randn(
                     (batch_size, num_heads_groups, num_heads, q_len, kv_len),

From d6b64568739952fd95bf4eb172d6fbbdd53964d1 Mon Sep 17 00:00:00 2001
From: Max Podkorytov <4273004+tenpercent@users.noreply.github.com>
Date: Fri, 16 Aug 2024 21:05:42 +0000
Subject: [PATCH 624/641] revert disable flash operator on rocm

---
 xformers/ops/fmha/flash.py | 10 ++--------
 1 file changed, 2 insertions(+), 8 deletions(-)

diff --git a/xformers/ops/fmha/flash.py b/xformers/ops/fmha/flash.py
index 14a8335ec..49e708dc2 100644
--- a/xformers/ops/fmha/flash.py
+++ b/xformers/ops/fmha/flash.py
@@ -607,10 +607,7 @@ class FwOp(AttentionFwOpBase):
         implementation.
     """
 
-    if torch.version.hip:
-        OPERATOR = None
-    else:
-        OPERATOR = get_operator("xformers_flash", "flash_fwd")
+    OPERATOR = get_operator("xformers_flash", "flash_fwd")
     SUPPORTED_DEVICES: Set[str] = {"cuda"}
     CUDA_MINIMUM_COMPUTE_CAPABILITY = (8, 0)
     SUPPORTED_DTYPES: Set[torch.dtype] = {torch.half, torch.bfloat16}
@@ -812,10 +809,7 @@ def operator_flop(
 class BwOp(AttentionBwOpBase):
     __doc__ = FwOp.__doc__
 
-    if torch.version.hip:
-        OPERATOR = None
-    else:
-        OPERATOR = get_operator("xformers_flash", "flash_bwd")
+    OPERATOR = get_operator("xformers_flash", "flash_bwd")
     SUPPORTED_DEVICES = FwOp.SUPPORTED_DEVICES
     CUDA_MINIMUM_COMPUTE_CAPABILITY = FwOp.CUDA_MINIMUM_COMPUTE_CAPABILITY
     SUPPORTED_DTYPES = FwOp.SUPPORTED_DTYPES

From 87188ea85cd3dc900c431acd2bccd6cc6de6d68d Mon Sep 17 00:00:00 2001
From: Qianfeng Zhang <Qianfeng.Zhang@amd.com>
Date: Fri, 16 Aug 2024 22:42:56 +0000
Subject: [PATCH 625/641] Synchronize to ck_tile latest commit again

---
 third_party/composable_kernel_tiled | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/third_party/composable_kernel_tiled b/third_party/composable_kernel_tiled
index 0d79fde5e..6b533bfc9 160000
--- a/third_party/composable_kernel_tiled
+++ b/third_party/composable_kernel_tiled
@@ -1 +1 @@
-Subproject commit 0d79fde5e2bb4009de31a63ce1f8ec1facf4c1cc
+Subproject commit 6b533bfc907a3deaae7338d923649f2a8410a247

From 5be80a3ac93240d14dcbfd91f200f3bcfb78cc85 Mon Sep 17 00:00:00 2001
From: Qianfeng Zhang <Qianfeng.Zhang@amd.com>
Date: Sat, 17 Aug 2024 09:27:57 +0000
Subject: [PATCH 626/641] Re-position the composable_kernel submodule to the
 develop branch

---
 .gitmodules                         | 2 +-
 third_party/composable_kernel_tiled | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/.gitmodules b/.gitmodules
index 18adab4b0..b642ad5b9 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -7,4 +7,4 @@
 [submodule "third_party/composable_kernel_tiled"]
 	path = third_party/composable_kernel_tiled
 	url = https://github.com/ROCm/composable_kernel.git
-	branch = ck_tile/fa_bwd_opt
+	branch = develop 
diff --git a/third_party/composable_kernel_tiled b/third_party/composable_kernel_tiled
index 6b533bfc9..c8b6b6424 160000
--- a/third_party/composable_kernel_tiled
+++ b/third_party/composable_kernel_tiled
@@ -1 +1 @@
-Subproject commit 6b533bfc907a3deaae7338d923649f2a8410a247
+Subproject commit c8b6b64240e840a7decf76dfaa13c37da5294c4a

From 2a5c14134bc58cb12079f9723b4697ae563cdf4e Mon Sep 17 00:00:00 2001
From: Qianfeng Zhang <Qianfeng.Zhang@amd.com>
Date: Sat, 17 Aug 2024 12:39:18 +0000
Subject: [PATCH 627/641] Avoid the Async pipeline when khasBias is true

---
 .../csrc/attention/hip_fmha/ck_tiled_fmha_batched_infer.h     | 4 ++--
 .../csrc/attention/hip_fmha/ck_tiled_fmha_grouped_infer.h     | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_infer.h b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_infer.h
index 05d654dc3..71f787aa6 100644
--- a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_infer.h
+++ b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_infer.h
@@ -68,8 +68,8 @@ struct batched_infer_causalmask_bias_dropout_dispatch {
       // determine whether to do padding saving some compiling time
       const bool pad_headdim = (pad_headdim_q || pad_headdim_v);
 
-      const bool use_async_pipeline =
-          ((param.K % 8 == 0) && (param.Kv % 8 == 0) && (MaxK <= 128));
+      const bool use_async_pipeline = 
+          (!kHasBias && (param.K % 8 == 0) && (param.Kv % 8 == 0) && (MaxK <= 128));
 
       if (!use_async_pipeline) {
         BOOL_SWITCH_3(
diff --git a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_infer.h b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_infer.h
index d4a6c9dbd..fd8197831 100644
--- a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_infer.h
+++ b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_infer.h
@@ -63,7 +63,7 @@ struct grouped_infer_causalmask_bias_dropout_dispatch {
       bool pad_headdim_q = !(param.K % FmhaShape::kK0BlockLength == 0);
       bool pad_headdim_v = !(param.Kv % FmhaShape::kN1 == 0);
       const bool use_async_pipeline =
-          ((param.K % 8 == 0) && (param.Kv % 8 == 0) && (MaxK <= 128));
+          (!kHasBias && (param.K % 8 == 0) && (param.Kv % 8 == 0) && (MaxK <= 128));
 
       if (!use_async_pipeline) {
         BOOL_SWITCH_2(

From 2874842c06d588ac394b96895359d162bb27b73f Mon Sep 17 00:00:00 2001
From: Qianfeng Zhang <Qianfeng.Zhang@amd.com>
Date: Sat, 17 Aug 2024 14:10:52 +0000
Subject: [PATCH 628/641] clang-format for two files

---
 .../csrc/attention/hip_fmha/ck_tiled_fmha_batched_infer.h    | 5 +++--
 .../csrc/attention/hip_fmha/ck_tiled_fmha_grouped_infer.h    | 3 ++-
 2 files changed, 5 insertions(+), 3 deletions(-)

diff --git a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_infer.h b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_infer.h
index 71f787aa6..36cf1b56e 100644
--- a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_infer.h
+++ b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_batched_infer.h
@@ -68,8 +68,9 @@ struct batched_infer_causalmask_bias_dropout_dispatch {
       // determine whether to do padding saving some compiling time
       const bool pad_headdim = (pad_headdim_q || pad_headdim_v);
 
-      const bool use_async_pipeline = 
-          (!kHasBias && (param.K % 8 == 0) && (param.Kv % 8 == 0) && (MaxK <= 128));
+      const bool use_async_pipeline =
+          (!kHasBias && (param.K % 8 == 0) && (param.Kv % 8 == 0) &&
+           (MaxK <= 128));
 
       if (!use_async_pipeline) {
         BOOL_SWITCH_3(
diff --git a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_infer.h b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_infer.h
index fd8197831..3805108c1 100644
--- a/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_infer.h
+++ b/xformers/csrc/attention/hip_fmha/ck_tiled_fmha_grouped_infer.h
@@ -63,7 +63,8 @@ struct grouped_infer_causalmask_bias_dropout_dispatch {
       bool pad_headdim_q = !(param.K % FmhaShape::kK0BlockLength == 0);
       bool pad_headdim_v = !(param.Kv % FmhaShape::kN1 == 0);
       const bool use_async_pipeline =
-          (!kHasBias && (param.K % 8 == 0) && (param.Kv % 8 == 0) && (MaxK <= 128));
+          (!kHasBias && (param.K % 8 == 0) && (param.Kv % 8 == 0) &&
+           (MaxK <= 128));
 
       if (!use_async_pipeline) {
         BOOL_SWITCH_2(

From 7a91589ced0111a8b15da2610438306981f814e8 Mon Sep 17 00:00:00 2001
From: Qianfeng Zhang <Qianfeng.Zhang@amd.com>
Date: Sat, 17 Aug 2024 15:30:29 +0000
Subject: [PATCH 629/641] Change allocation of grouped mode lse from [H, M] to
 [1, H, M] to match the xformers scripts

---
 .../hip_fmha/attention_backward_generic_ck_tiled.cpp      | 8 ++++----
 .../hip_fmha/attention_forward_generic_ck_tiled.cpp       | 6 +++---
 2 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/xformers/csrc/attention/hip_fmha/attention_backward_generic_ck_tiled.cpp b/xformers/csrc/attention/hip_fmha/attention_backward_generic_ck_tiled.cpp
index 700adeba5..a1c542177 100644
--- a/xformers/csrc/attention/hip_fmha/attention_backward_generic_ck_tiled.cpp
+++ b/xformers/csrc/attention/hip_fmha/attention_backward_generic_ck_tiled.cpp
@@ -354,8 +354,8 @@ efficient_attention_backward_ck(
     p.max_seqlen_k = *max_seqlen_k_;
 
     // unpadded lse layout required
-    TORCH_CHECK(p.Hq == logsumexp.size(0));
-    TORCH_CHECK(p.M == logsumexp.size(1));
+    TORCH_CHECK(p.Hq == logsumexp.size(1));
+    TORCH_CHECK(p.M == logsumexp.size(2));
 
     if (scale.has_value())
       p.scale = float(*scale);
@@ -384,8 +384,8 @@ efficient_attention_backward_ck(
         static_cast<int>(grad_out.stride(3))};
 
     p.lsed_strides = {
-        static_cast<int>(logsumexp.stride(0)),
-        static_cast<int>(logsumexp.stride(1))};
+        static_cast<int>(logsumexp.stride(1)),
+        static_cast<int>(logsumexp.stride(2))};
 
     if (use_grad_q_f32) {
       p.grad_q_f32_strides = {
diff --git a/xformers/csrc/attention/hip_fmha/attention_forward_generic_ck_tiled.cpp b/xformers/csrc/attention/hip_fmha/attention_forward_generic_ck_tiled.cpp
index fa6e0127a..4bbfe71ad 100644
--- a/xformers/csrc/attention/hip_fmha/attention_forward_generic_ck_tiled.cpp
+++ b/xformers/csrc/attention/hip_fmha/attention_forward_generic_ck_tiled.cpp
@@ -316,11 +316,11 @@ efficient_attention_forward_ck(
       p.dropout_prob = 0.0f;
 
     if (p.compute_logsumexp) {
-      logsumexp = at::empty({Hq, M}, opts.dtype(at::kFloat));
+      logsumexp = at::empty({1, Hq, M}, opts.dtype(at::kFloat));
       p.logsumexp_ptr = logsumexp.data_ptr();
       p.lse_strides = {
-          static_cast<int>(logsumexp.stride(0)),
-          static_cast<int>(logsumexp.stride(1))};
+          static_cast<int>(logsumexp.stride(1)),
+          static_cast<int>(logsumexp.stride(2))};
     } else {
       p.logsumexp_ptr = nullptr;
       p.lse_strides = {0, 0};

From 66efb2c8181bbf1e94bdcc33fab2d93f66c49638 Mon Sep 17 00:00:00 2001
From: Qianfeng Zhang <Qianfeng.Zhang@amd.com>
Date: Tue, 20 Aug 2024 08:46:36 +0000
Subject: [PATCH 630/641] Change in generate_instances.py so that this scripts
 can be called from flexible location

---
 .../csrc/attention/hip_fmha/generate_instances.py     | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

diff --git a/xformers/csrc/attention/hip_fmha/generate_instances.py b/xformers/csrc/attention/hip_fmha/generate_instances.py
index d9a276350..53dd8143c 100644
--- a/xformers/csrc/attention/hip_fmha/generate_instances.py
+++ b/xformers/csrc/attention/hip_fmha/generate_instances.py
@@ -79,7 +79,7 @@
     "{has_or_no_bias_str}_{has_or_no_biasgrad_str}_{has_or_no_dropout_str}_{max_k_str}.cpp"
 )
 
-FMHA_INSTANCE_REF_FNAME = "instances/fmha_{mode}_{function}_{dtype}_instances_ref.h"
+FMHA_INSTANCE_REF_FNAME = "fmha_{mode}_{function}_{dtype}_instances_ref.h"
 
 BOOL_MAP = {True: "true", False: "false"}
 
@@ -174,11 +174,12 @@ def create_infer_instances_ref(instance_dir: Path, headdims: List) -> None:
                 function="infer",
                 dtype=dtype,
             )
+            ref_fname_path = instance_dir / ref_fname
             infer_instance_inc = FMHA_INFER_INSTANCE_TEMPLATE_INC.format(
                 mode=mode,
                 dtype_file=TYPE_FNAME_MAP[dtype],
             )
-            with open(ref_fname, "a") as file:
+            with open(ref_fname_path, "a") as file:
                 file.write(FMHA_COPYRIGHT_HEADER)
                 file.write(infer_instance_inc)
                 for max_k in headdims:
@@ -246,11 +247,12 @@ def create_forward_instances_ref(instance_dir: Path, headdims: List) -> None:
                 function="forward",
                 dtype=dtype,
             )
+            ref_fname_path = instance_dir / ref_fname
             forward_instance_inc = FMHA_FORWARD_INSTANCE_TEMPLATE_INC.format(
                 mode=mode,
                 dtype_file=TYPE_FNAME_MAP[dtype],
             )
-            with open(ref_fname, "a") as file:
+            with open(ref_fname_path, "a") as file:
                 file.write(FMHA_COPYRIGHT_HEADER)
                 file.write(forward_instance_inc)
                 for max_k in headdims:
@@ -326,11 +328,12 @@ def create_backward_instances_ref(instance_dir: Path, headdims: List) -> None:
                 function="backward",
                 dtype=dtype,
             )
+            ref_fname_path = instance_dir / ref_fname
             backward_instance_inc = FMHA_BACKWARD_INSTANCE_TEMPLATE_INC.format(
                 mode=mode,
                 dtype_file=TYPE_FNAME_MAP[dtype],
             )
-            with open(ref_fname, "a") as file:
+            with open(ref_fname_path, "a") as file:
                 file.write(FMHA_COPYRIGHT_HEADER)
                 file.write(backward_instance_inc)
                 for max_k in headdims:

From c19b1f536715ef2400f1bd015559a825431f8b04 Mon Sep 17 00:00:00 2001
From: Qianfeng Zhang <Qianfeng.Zhang@amd.com>
Date: Tue, 20 Aug 2024 17:45:27 +0000
Subject: [PATCH 631/641] Add manual for generate_instances.py (.md)

---
 .../attention/hip_fmha/GENERATE_INSTANCES.md  | 35 +++++++++++++++++++
 1 file changed, 35 insertions(+)
 create mode 100644 xformers/csrc/attention/hip_fmha/GENERATE_INSTANCES.md

diff --git a/xformers/csrc/attention/hip_fmha/GENERATE_INSTANCES.md b/xformers/csrc/attention/hip_fmha/GENERATE_INSTANCES.md
new file mode 100644
index 000000000..8642facc2
--- /dev/null
+++ b/xformers/csrc/attention/hip_fmha/GENERATE_INSTANCES.md
@@ -0,0 +1,35 @@
+# generate\_instances.py 
+
+  generate\_instances.py is a simple tool used to generate several hundred of instances (.cpp files) and their references (.h files). 
+  Without generate\_instances.py, manually writing those instances and references will be laborious and easy to get wrong. 
+
+  The instances generated by this scripts are divided into three categories visible from the scripts: 
+
+   * Infer -- which refers to instances for calling inference-only kernels
+   * Forward -- which refers to instances for calling training forward kernels
+   * Backward -- which refers to instances for calling training backward kernels
+
+  generate\_instances.py is to be used by the HIP fmha developers themselves. It is not supposed to be used by the user/xformers developers for 
+  building xformers, since for xformers users, the instances are already well prepared as part of the xformers codes. 
+
+## how to use generate\_instances.py
+
+   * To generate complete instances supported by current implementation
+
+     ```bash
+      #> python xformers/csrc/attention/hip_fmha/generate_instances.py
+     ```
+   
+   * To generate reduced instances (when headdim256 is not required)
+
+     ```bash
+      #> python xformers/csrc/attention/hip_fmha/generate_instances.py --ignore-hd256
+     ```
+   * More options except for `--ignore-hd256` could be added to suppport further customization in generating instances as required
+
+## where the instances files are located
+
+    The instances files (.cpp) and references files (.h) are always located under a folder `instances` that is located under the same directory
+    as generate\_instances.py itself
+
+     

From b450d01abcfa924c3e131d359afa3688f75e892c Mon Sep 17 00:00:00 2001
From: Qianfeng Zhang <Qianfeng.Zhang@amd.com>
Date: Tue, 20 Aug 2024 17:54:43 +0000
Subject: [PATCH 632/641] Modification in GENERATE_INSTANCES.md

---
 xformers/csrc/attention/hip_fmha/GENERATE_INSTANCES.md | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/xformers/csrc/attention/hip_fmha/GENERATE_INSTANCES.md b/xformers/csrc/attention/hip_fmha/GENERATE_INSTANCES.md
index 8642facc2..5f4ed0f90 100644
--- a/xformers/csrc/attention/hip_fmha/GENERATE_INSTANCES.md
+++ b/xformers/csrc/attention/hip_fmha/GENERATE_INSTANCES.md
@@ -1,15 +1,15 @@
 # generate\_instances.py 
 
-  generate\_instances.py is a simple tool used to generate several hundred of instances (.cpp files) and their references (.h files). 
+     generate\_instances.py is a simple tool used to generate several hundred of instances (.cpp files) and their references (.h files). 
   Without generate\_instances.py, manually writing those instances and references will be laborious and easy to get wrong. 
 
-  The instances generated by this scripts are divided into three categories visible from the scripts: 
+     The instances generated by this scripts are divided into three categories visible from the scripts: 
 
    * Infer -- which refers to instances for calling inference-only kernels
    * Forward -- which refers to instances for calling training forward kernels
    * Backward -- which refers to instances for calling training backward kernels
 
-  generate\_instances.py is to be used by the HIP fmha developers themselves. It is not supposed to be used by the user/xformers developers for 
+     generate\_instances.py is to be used by the HIP fmha developers themselves. It is not supposed to be used by the user/xformers developers for 
   building xformers, since for xformers users, the instances are already well prepared as part of the xformers codes. 
 
 ## how to use generate\_instances.py
@@ -29,7 +29,7 @@
 
 ## where the instances files are located
 
-    The instances files (.cpp) and references files (.h) are always located under a folder `instances` that is located under the same directory
-    as generate\_instances.py itself
+   * The instances files (.cpp) and references files (.h) are always located under a folder `instances` that is located under the same directory
+     as generate\_instances.py itself
 
      
From 07dc8e7e67daa44fb3330c5115ca05a25349f76c Mon Sep 17 00:00:00 2001
From: Qianfeng Zhang <Qianfeng.Zhang@amd.com>
Date: Tue, 20 Aug 2024 18:02:11 +0000
Subject: [PATCH 633/641] Fix in GENERATE_INSTANCES.md

---
 .../attention/hip_fmha/GENERATE_INSTANCES.md  | 26 +++++++++----------
 1 file changed, 13 insertions(+), 13 deletions(-)

diff --git a/xformers/csrc/attention/hip_fmha/GENERATE_INSTANCES.md b/xformers/csrc/attention/hip_fmha/GENERATE_INSTANCES.md
index 5f4ed0f90..f4512ffce 100644
--- a/xformers/csrc/attention/hip_fmha/GENERATE_INSTANCES.md
+++ b/xformers/csrc/attention/hip_fmha/GENERATE_INSTANCES.md
@@ -1,35 +1,35 @@
-# generate\_instances.py 
 
-     generate\_instances.py is a simple tool used to generate several hundred of instances (.cpp files) and their references (.h files). 
-  Without generate\_instances.py, manually writing those instances and references will be laborious and easy to get wrong. 
+# generate\_instances.py
+
+     The `generate_instances.py` is a simple tool used to generate several hundred of instances (.cpp files) and their references (.h files). 
+  Without this tool, manually writing those instances and references will be laborious and easy to get wrong. 
 
      The instances generated by this scripts are divided into three categories visible from the scripts: 
 
-   * Infer -- which refers to instances for calling inference-only kernels
-   * Forward -- which refers to instances for calling training forward kernels
-   * Backward -- which refers to instances for calling training backward kernels
+   * Infer, which refers to instances for calling inference-only kernels
+   * Forward, which refers to instances for calling training forward kernels
+   * Backward, which refers to instances for calling training backward kernels
 
-     generate\_instances.py is to be used by the HIP fmha developers themselves. It is not supposed to be used by the user/xformers developers for 
+     The `generate_instances.py` is to be used by the HIP fmha developers themselves. It is not supposed to be used by the xformers users for 
   building xformers, since for xformers users, the instances are already well prepared as part of the xformers codes. 
 
 ## how to use generate\_instances.py
 
    * To generate complete instances supported by current implementation
 
-     ```bash
+     ```
       #> python xformers/csrc/attention/hip_fmha/generate_instances.py
      ```
-   
    * To generate reduced instances (when headdim256 is not required)
 
-     ```bash
+     ``` 
       #> python xformers/csrc/attention/hip_fmha/generate_instances.py --ignore-hd256
      ```
    * More options except for `--ignore-hd256` could be added to suppport further customization in generating instances as required
 
 ## where the instances files are located
-
-   * The instances files (.cpp) and references files (.h) are always located under a folder `instances` that is located under the same directory
-     as generate\_instances.py itself
+   
+     The instances files (.cpp) and references files (.h) are always located under a folder `instances/` that is located under the same directory
+   as `generate_instances.py` itself
 
      
From 72bf6036c585f33d56e65b62edf4b6e668d6b9b8 Mon Sep 17 00:00:00 2001
From: Qianfeng <qianfeng.zhang@amd.com>
Date: Tue, 20 Aug 2024 18:41:50 +0800
Subject: [PATCH 634/641] Update GENERATE_INSTANCES.md

---
 .../attention/hip_fmha/GENERATE_INSTANCES.md  | 30 +++++++++----------
 1 file changed, 14 insertions(+), 16 deletions(-)

diff --git a/xformers/csrc/attention/hip_fmha/GENERATE_INSTANCES.md b/xformers/csrc/attention/hip_fmha/GENERATE_INSTANCES.md
index f4512ffce..829df6646 100644
--- a/xformers/csrc/attention/hip_fmha/GENERATE_INSTANCES.md
+++ b/xformers/csrc/attention/hip_fmha/GENERATE_INSTANCES.md
@@ -1,19 +1,18 @@
 
-# generate\_instances.py
-
-     The `generate_instances.py` is a simple tool used to generate several hundred of instances (.cpp files) and their references (.h files). 
-  Without this tool, manually writing those instances and references will be laborious and easy to get wrong. 
-
-     The instances generated by this scripts are divided into three categories visible from the scripts: 
-
-   * Infer, which refers to instances for calling inference-only kernels
-   * Forward, which refers to instances for calling training forward kernels
-   * Backward, which refers to instances for calling training backward kernels
-
-     The `generate_instances.py` is to be used by the HIP fmha developers themselves. It is not supposed to be used by the xformers users for 
+# Instances generator
+
+  The instances generator is a simple python tool used to generate several hundred of instances (.cpp files) and their references (.h files). 
+  Without this tool, manually writing those instances and references will be very laborious and easy to get wrong. 
+  
+  The instances generated by this scripts are divided into three categories visible from the scripts: 
+   * Infer -- which refers to instances for calling inference-only kernels
+   * Forward -- which refers to instances for calling training forward kernels
+   * Backward -- which refers to instances for calling training backward kernels
+     
+  The instance generator is for being used by the HIP fmha developers themselves. It is not supposed to be used by the xformers users for 
   building xformers, since for xformers users, the instances are already well prepared as part of the xformers codes. 
 
-## how to use generate\_instances.py
+## how to use instance generator
 
    * To generate complete instances supported by current implementation
 
@@ -28,8 +27,7 @@
    * More options except for `--ignore-hd256` could be added to suppport further customization in generating instances as required
 
 ## where the instances files are located
-   
-     The instances files (.cpp) and references files (.h) are always located under a folder `instances/` that is located under the same directory
-   as `generate_instances.py` itself
+   The instances files and references files are always located under a folder `instances/` that is located under the same directory
+   as the file `generate_instances.py` itself
 
      
From e397974ef528ce0fa895ae1e2f2fe57c0e0a43ca Mon Sep 17 00:00:00 2001
From: Qianfeng Zhang <Qianfeng.Zhang@amd.com>
Date: Tue, 20 Aug 2024 19:00:12 +0000
Subject: [PATCH 635/641] clean-up commented codes

---
 .../hip_fmha/attention_backward_generic_ck_tiled.cpp      | 8 --------
 1 file changed, 8 deletions(-)

diff --git a/xformers/csrc/attention/hip_fmha/attention_backward_generic_ck_tiled.cpp b/xformers/csrc/attention/hip_fmha/attention_backward_generic_ck_tiled.cpp
index a1c542177..b470f5990 100644
--- a/xformers/csrc/attention/hip_fmha/attention_backward_generic_ck_tiled.cpp
+++ b/xformers/csrc/attention/hip_fmha/attention_backward_generic_ck_tiled.cpp
@@ -544,14 +544,6 @@ efficient_attention_backward_ck(
     grad_v = tmp_grad_v_view.sum(3);
   }
 
-  /*
-    if (inDataType == at::ScalarType::Half)
-      grad_q = grad_q_f32.to(torch::kFloat16);
-
-    if (inDataType == at::ScalarType::BFloat16)
-      grad_q = grad_q_f32.to(torch::kBFloat16);
-  */
-
   return std::make_tuple(grad_q, grad_k, grad_v, grad_bias);
 }
 

From 7a04357fdccfe0b698b0f36754869e0fec6534dd Mon Sep 17 00:00:00 2001
From: Qianfeng Zhang <Qianfeng.Zhang@amd.com>
Date: Tue, 20 Aug 2024 19:18:03 +0000
Subject: [PATCH 636/641] Revert "Change allocation of grouped mode lse from
 [H, M] to [1, H, M] to match the xformers scripts"

This reverts commit 7a91589ced0111a8b15da2610438306981f814e8.
---
 .../hip_fmha/attention_backward_generic_ck_tiled.cpp      | 8 ++++----
 .../hip_fmha/attention_forward_generic_ck_tiled.cpp       | 6 +++---
 2 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/xformers/csrc/attention/hip_fmha/attention_backward_generic_ck_tiled.cpp b/xformers/csrc/attention/hip_fmha/attention_backward_generic_ck_tiled.cpp
index b470f5990..53df9b20a 100644
--- a/xformers/csrc/attention/hip_fmha/attention_backward_generic_ck_tiled.cpp
+++ b/xformers/csrc/attention/hip_fmha/attention_backward_generic_ck_tiled.cpp
@@ -354,8 +354,8 @@ efficient_attention_backward_ck(
     p.max_seqlen_k = *max_seqlen_k_;
 
     // unpadded lse layout required
-    TORCH_CHECK(p.Hq == logsumexp.size(1));
-    TORCH_CHECK(p.M == logsumexp.size(2));
+    TORCH_CHECK(p.Hq == logsumexp.size(0));
+    TORCH_CHECK(p.M == logsumexp.size(1));
 
     if (scale.has_value())
       p.scale = float(*scale);
@@ -384,8 +384,8 @@ efficient_attention_backward_ck(
         static_cast<int>(grad_out.stride(3))};
 
     p.lsed_strides = {
-        static_cast<int>(logsumexp.stride(1)),
-        static_cast<int>(logsumexp.stride(2))};
+        static_cast<int>(logsumexp.stride(0)),
+        static_cast<int>(logsumexp.stride(1))};
 
     if (use_grad_q_f32) {
       p.grad_q_f32_strides = {
diff --git a/xformers/csrc/attention/hip_fmha/attention_forward_generic_ck_tiled.cpp b/xformers/csrc/attention/hip_fmha/attention_forward_generic_ck_tiled.cpp
index 4bbfe71ad..fa6e0127a 100644
--- a/xformers/csrc/attention/hip_fmha/attention_forward_generic_ck_tiled.cpp
+++ b/xformers/csrc/attention/hip_fmha/attention_forward_generic_ck_tiled.cpp
@@ -316,11 +316,11 @@ efficient_attention_forward_ck(
       p.dropout_prob = 0.0f;
 
     if (p.compute_logsumexp) {
-      logsumexp = at::empty({1, Hq, M}, opts.dtype(at::kFloat));
+      logsumexp = at::empty({Hq, M}, opts.dtype(at::kFloat));
       p.logsumexp_ptr = logsumexp.data_ptr();
       p.lse_strides = {
-          static_cast<int>(logsumexp.stride(1)),
-          static_cast<int>(logsumexp.stride(2))};
+          static_cast<int>(logsumexp.stride(0)),
+          static_cast<int>(logsumexp.stride(1))};
     } else {
       p.logsumexp_ptr = nullptr;
       p.lse_strides = {0, 0};

From 77a2c249d91b654ddf216ed0a72420e6e9e23a66 Mon Sep 17 00:00:00 2001
From: Qianfeng Zhang <Qianfeng.Zhang@amd.com>
Date: Tue, 3 Sep 2024 17:02:03 +0000
Subject: [PATCH 637/641] Synchronize to latest ck develop for using the latest
 RTN bf16 convert

---
 third_party/composable_kernel_tiled | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/third_party/composable_kernel_tiled b/third_party/composable_kernel_tiled
index c8b6b6424..73b67f290 160000
--- a/third_party/composable_kernel_tiled
+++ b/third_party/composable_kernel_tiled
@@ -1 +1 @@
-Subproject commit c8b6b64240e840a7decf76dfaa13c37da5294c4a
+Subproject commit 73b67f290f6602fe0461d48a2c103de460f14084

From 4e51efa4cf65a1d4c8df33a044e986cc19c74f2e Mon Sep 17 00:00:00 2001
From: Qianfeng Zhang <Qianfeng.Zhang@amd.com>
Date: Tue, 3 Sep 2024 17:03:31 +0000
Subject: [PATCH 638/641] Add c++ extension compiling options for better
 performance on ROCM 6.2

---
 setup.py | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/setup.py b/setup.py
index 6b0d8943d..9f8101809 100644
--- a/setup.py
+++ b/setup.py
@@ -453,7 +453,7 @@ def get_extensions():
         cc_flag = ["-DBUILD_PYTHON_PACKAGE"]
         use_rtn_bf16_convert = os.getenv("ENABLE_HIP_FMHA_RTN_BF16_CONVERT", "0")
         if use_rtn_bf16_convert == "1":
-            cc_flag += ["-DCK_TILE_FLOAT_TO_BFLOAT16_DEFAULT=0"]
+            cc_flag += ["-DCK_TILE_FLOAT_TO_BFLOAT16_DEFAULT=3"]
 
         arch_list = os.getenv("HIP_ARCHITECTURES", "native").split()
 
@@ -471,6 +471,12 @@ def get_extensions():
                 "-Woverloaded-virtual",
                 "-mllvm",
                 "-enable-post-misched=0",
+                "-mllvm",
+                "-amdgpu-early-inline-all=true",
+                "-mllvm",
+                "-amdgpu-function-calls=false",
+                "-mllvm",
+                "-greedy-reverse-local-assignment=1"
             ]
             + generator_flag
             + cc_flag,

From 887996a0a42ad77fabeeee30592bcad66d3d2131 Mon Sep 17 00:00:00 2001
From: Qianfeng Zhang <Qianfeng.Zhang@amd.com>
Date: Thu, 5 Sep 2024 07:28:42 +0000
Subject: [PATCH 639/641] Use the same rocm_ci.yml as upstream

---
 .github/workflows/rocm_ci.yml | 83 ++++++++++-------------------------
 1 file changed, 23 insertions(+), 60 deletions(-)

diff --git a/.github/workflows/rocm_ci.yml b/.github/workflows/rocm_ci.yml
index eb5718471..8955e4b07 100644
--- a/.github/workflows/rocm_ci.yml
+++ b/.github/workflows/rocm_ci.yml
@@ -3,28 +3,14 @@ name: rocm-ci
 on: 
   pull_request:
     types: [labeled, synchronize, reopened]
-  workflow_dispatch:
-    inputs:
-      logLevel:
-        description: 'Log level'     
-        required: true
-        default: 'warning'
-  schedule:
-    - cron: "15 1 * * *"
 
 jobs:
   build:
-    runs-on: self-hosted
-    container:
-      image: 'rocm/pytorch-nightly:latest'
-      options: ' --cap-add=SYS_PTRACE --security-opt seccomp=unconfined --device=/dev/kfd --device=/dev/dri --group-add video --ipc=host --shm-size 8G '
+    if: github.repository == 'rocm/xformers'
+    runs-on: rocm
+
     steps:
-    - uses: actions/checkout@v4
-      with:
-        path: '_xformers'
-        submodules: 'recursive'
-        set-safe-directory: true
-        fetch-depth: 0
+    - uses: actions/checkout@v2
     - name: Get CPU info on Ubuntu
       if: contains(runner.os, 'linux')
       run: |
@@ -49,60 +35,37 @@ jobs:
         export ROCM_PATH=/opt/rocm
         echo ROCM_PATH         = $ROCM_PATH
 
+        export MAX_JOBS=64
+        echo MAX_JOBS          = $MAX_JOBS
+
         hipcc --version
         rocm-smi
         rocminfo | grep "gfx"
-        
-    - name: Setup build env
-      run: |
-        conda create -n xformers python=3.11
-        export PATH=/opt/conda/envs/xformers/bin:$PATH
-        python -VV
-        
-        python -m pip install -U torch --index-url=https://download.pytorch.org/whl/nightly/rocm6.1 
-        python -c "import torch; print(f'PyTorch version {torch.__version__}')"
-        
-        python -m pip install ninja scipy pytest pytest-html
 
-    - name: Pre-build clean
+    - name: Build XFormers
       run: |
-        cd _xformers
-        git clean -ffdx
-        cd ..
+        git clone --recursive -b $GIT_BRANCH $GITHUB_REPOSITORY
+        docker run -it --cap-add=SYS_PTRACE --security-opt seccomp=unconfined --device=/dev/kfd --device=/dev/dri --group-add video --ipc=host --shm-size 8G -v $PWD/xformers:/xformers  rocm/pytorch-nightly:latest
 
-    - name: Build xformers
-      run: |
-        export PATH=/opt/conda/envs/xformers/bin:$PATH
-        export MAX_JOBS=144
-        
-        python -m pip install -e ./_xformers --verbose
-        python -m xformers.info
+        pip3 install --upgrade pip
+        pip3 uninstall -y xformers
+        MAX_JOBS=$MAX_JOBS pip3 install -e /xformers --verbose
+        pip3 install scipy==1.10
+
+        python3 -c "import torch; print(torch.__version__)"
+        python3 -m xformers.info
 
     - name: Run python tests
       run: |
-        export PATH=/opt/conda/envs/xformers/bin:$PATH
-        
-        python -m pytest --html=test_mem_eff_attention.html --self-contained-html -rpfs ./_xformers/tests/test_mem_eff_attention.py -k "not flshatt"
+        pytest -rpfs /xformers/tests/test_mem_eff_attention.py | tee test_mem_eff_attention.log
     
     - name: Archive logs
-      if: '!cancelled()'
-      uses: actions/upload-artifact@v4
+      uses: actions/upload-artifact@v3
       with:
         name: test results
-        path: test_mem_eff_attention.html
+        path: test_mem_eff_attention_ck.log
 
-    - name: Post-build clean
-      if: '!cancelled()'
+    - name: Process test results
       run: |
-        cd _xformers
-        git clean -ffdx
-        cd ..
-
-  clean:
-    runs-on: self-hosted
-    if: ${{ always() }}
-    needs: [build]
-    steps:
-      - name: Remove dangling Docker images
-        run: |
-          docker images -q -f dangling=true | xargs --no-run-if-empty docker rmi
+        echo "Processing test results TBD"
+      

From 7c06b55aae609a1183044275c9e2aab9225cadfe Mon Sep 17 00:00:00 2001
From: Qianfeng Zhang <Qianfeng.Zhang@amd.com>
Date: Thu, 5 Sep 2024 07:32:32 +0000
Subject: [PATCH 640/641] Use the same ck.py as upstream

---
 xformers/ops/fmha/ck.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/xformers/ops/fmha/ck.py b/xformers/ops/fmha/ck.py
index 88c5cfa6e..f004d2bff 100644
--- a/xformers/ops/fmha/ck.py
+++ b/xformers/ops/fmha/ck.py
@@ -19,7 +19,9 @@
     BlockDiagonalCausalLocalAttentionFromBottomRightMask,
     BlockDiagonalCausalLocalAttentionMask,
     BlockDiagonalCausalMask,
+    BlockDiagonalCausalWithOffsetGappyKeysMask,
     BlockDiagonalCausalWithOffsetPaddedKeysMask,
+    BlockDiagonalGappyKeysMask,
     BlockDiagonalMask,
     LowerTriangularFromBottomRightLocalAttentionMask,
     LowerTriangularFromBottomRightMask,
@@ -154,7 +156,9 @@ class FwOp(AttentionFwOpBase):
         LowerTriangularMaskWithTensorBias,
         BlockDiagonalMask,
         BlockDiagonalCausalMask,
+        BlockDiagonalCausalWithOffsetGappyKeysMask,
         BlockDiagonalCausalWithOffsetPaddedKeysMask,
+        BlockDiagonalGappyKeysMask,
         attn_bias.BlockDiagonalCausalFromBottomRightMask,
         attn_bias.BlockDiagonalCausalLocalAttentionMask,
         BlockDiagonalCausalLocalAttentionFromBottomRightMask,

From 2efa6cd7550505dcbc835156ad98fba455a98493 Mon Sep 17 00:00:00 2001
From: Qianfeng Zhang <Qianfeng.Zhang@amd.com>
Date: Fri, 6 Sep 2024 10:30:14 +0000
Subject: [PATCH 641/641] Reformat setup.py

---
 setup.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/setup.py b/setup.py
index 9f8101809..9c3314673 100644
--- a/setup.py
+++ b/setup.py
@@ -476,7 +476,7 @@ def get_extensions():
                 "-mllvm",
                 "-amdgpu-function-calls=false",
                 "-mllvm",
-                "-greedy-reverse-local-assignment=1"
+                "-greedy-reverse-local-assignment=1",
             ]
             + generator_flag
             + cc_flag,